首页 > 代码库 > libevent源码分析:epoll后端实现

libevent源码分析:epoll后端实现

epoll后端机制的实现代码在epoll.c文件中。

技术分享
  1 /*  2  * Copyright 2000-2007 Niels Provos <provos@citi.umich.edu>  3  * Copyright 2007-2012 Niels Provos, Nick Mathewson  4  *  5  * Redistribution and use in source and binary forms, with or without  6  * modification, are permitted provided that the following conditions  7  * are met:  8  * 1. Redistributions of source code must retain the above copyright  9  *    notice, this list of conditions and the following disclaimer. 10  * 2. Redistributions in binary form must reproduce the above copyright 11  *    notice, this list of conditions and the following disclaimer in the 12  *    documentation and/or other materials provided with the distribution. 13  * 3. The name of the author may not be used to endorse or promote products 14  *    derived from this software without specific prior written permission. 15  * 16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS‘‘ AND ANY EXPRESS OR 17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26  */ 27 #include "event2/event-config.h" 28 #include "evconfig-private.h" 29  30 #ifdef EVENT__HAVE_EPOLL 31  32 #include <stdint.h> 33 #include <sys/types.h> 34 #include <sys/resource.h> 35 #ifdef EVENT__HAVE_SYS_TIME_H 36 #include <sys/time.h> 37 #endif 38 #include <sys/queue.h> 39 #include <sys/epoll.h> 40 #include <signal.h> 41 #include <limits.h> 42 #include <stdio.h> 43 #include <stdlib.h> 44 #include <string.h> 45 #include <unistd.h> 46 #include <errno.h> 47 #ifdef EVENT__HAVE_FCNTL_H 48 #include <fcntl.h> 49 #endif 50 #ifdef EVENT__HAVE_SYS_TIMERFD_H 51 #include <sys/timerfd.h> 52 #endif 53  54 #include "event-internal.h" 55 #include "evsignal-internal.h" 56 #include "event2/thread.h" 57 #include "evthread-internal.h" 58 #include "log-internal.h" 59 #include "evmap-internal.h" 60 #include "changelist-internal.h" 61 #include "time-internal.h" 62  63 /* Since Linux 2.6.17, epoll is able to report about peer half-closed connection 64    using special EPOLLRDHUP flag on a read event. 65 */ 66 #if !defined(EPOLLRDHUP) 67 #define EPOLLRDHUP 0 68 #define EARLY_CLOSE_IF_HAVE_RDHUP 0 69 #else 70 #define EARLY_CLOSE_IF_HAVE_RDHUP EV_FEATURE_EARLY_CLOSE 71 #endif 72  73 #include "epolltable-internal.h" 74  75 #if defined(EVENT__HAVE_SYS_TIMERFD_H) &&               76     defined(EVENT__HAVE_TIMERFD_CREATE) &&               77     defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) &&  78     defined(TFD_CLOEXEC) 79 /* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available 80    and working.  This means that we can‘t support it on 2.6.25 (where timerfd 81    was introduced) or 2.6.26, since 2.6.27 introduced those flags. 
82  */ 83 #define USING_TIMERFD 84 #endif 85  86 struct epollop { 87     struct epoll_event *events; 88     int nevents; 89     int epfd; 90 #ifdef USING_TIMERFD 91     int timerfd; 92 #endif 93 }; 94  95 static void *epoll_init(struct event_base *); 96 static int epoll_dispatch(struct event_base *, struct timeval *); 97 static void epoll_dealloc(struct event_base *); 98  99 static const struct eventop epollops_changelist = {100     "epoll (with changelist)",101     epoll_init,102     event_changelist_add_,103     event_changelist_del_,104     epoll_dispatch,105     epoll_dealloc,106     1, /* need reinit */107     EV_FEATURE_ET|EV_FEATURE_O1| EARLY_CLOSE_IF_HAVE_RDHUP,108     EVENT_CHANGELIST_FDINFO_SIZE109 };110 111 112 static int epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,113     short old, short events, void *p);114 static int epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,115     short old, short events, void *p);116 117 const struct eventop epollops = {118     "epoll",119     epoll_init,120     epoll_nochangelist_add,121     epoll_nochangelist_del,122     epoll_dispatch,123     epoll_dealloc,124     1, /* need reinit */125     EV_FEATURE_ET|EV_FEATURE_O1|EV_FEATURE_EARLY_CLOSE,126     0127 };128 129 #define INITIAL_NEVENT 32130 #define MAX_NEVENT 4096131 132 /* On Linux kernels at least up to 2.6.24.4, epoll can‘t handle timeout133  * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be134  * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the135  * largest number of msec we can support here is 2147482.  Let‘s136  * round that down by 47 seconds.137  */138 #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000)139 140 static void *141 epoll_init(struct event_base *base)142 {143     int epfd = -1;144     struct epollop *epollop;145 146 #ifdef EVENT__HAVE_EPOLL_CREATE1147     /* First, try the shiny new epoll_create1 interface, if we have it. 
*/148     epfd = epoll_create1(EPOLL_CLOEXEC);149 #endif150     if (epfd == -1) {151         /* Initialize the kernel queue using the old interface.  (The152         size field is ignored   since 2.6.8.) */153         if ((epfd = epoll_create(32000)) == -1) {154             if (errno != ENOSYS)155                 event_warn("epoll_create");156             return (NULL);157         }158         evutil_make_socket_closeonexec(epfd);159     }160 161     if (!(epollop = mm_calloc(1, sizeof(struct epollop)))) {162         close(epfd);163         return (NULL);164     }165 166     epollop->epfd = epfd;167 168     /* Initialize fields */169     epollop->events = mm_calloc(INITIAL_NEVENT, sizeof(struct epoll_event));170     if (epollop->events == NULL) {171         mm_free(epollop);172         close(epfd);173         return (NULL);174     }175     epollop->nevents = INITIAL_NEVENT;176 177     if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||178         ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&179         evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) {180 181         base->evsel = &epollops_changelist;182     }183 184 #ifdef USING_TIMERFD185     /*186       The epoll interface ordinarily gives us one-millisecond precision,187       so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE188       timer.  
But when the user has set the new PRECISE_TIMER flag for an189       event_base, we can try to use timerfd to give them finer granularity.190     */191     if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) &&192         base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) {193         int fd;194         fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);195         if (epollop->timerfd >= 0) {196             struct epoll_event epev;197             memset(&epev, 0, sizeof(epev));198             epev.data.fd = epollop->timerfd;199             epev.events = EPOLLIN;200             if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) {201                 event_warn("epoll_ctl(timerfd)");202                 close(fd);203                 epollop->timerfd = -1;204             }205         } else {206             if (errno != EINVAL && errno != ENOSYS) {207                 /* These errors probably mean that we were208                  * compiled with timerfd/TFD_* support, but209                  * we‘re running on a kernel that lacks those.210                  */211                 event_warn("timerfd_create");212             }213             epollop->timerfd = -1;214         }215     } else {216         epollop->timerfd = -1;217     }218 #endif219 220     evsig_init_(base);221 222     return (epollop);223 }224 225 static const char *226 change_to_string(int change)227 {228     change &= (EV_CHANGE_ADD|EV_CHANGE_DEL);229     if (change == EV_CHANGE_ADD) {230         return "add";231     } else if (change == EV_CHANGE_DEL) {232         return "del";233     } else if (change == 0) {234         return "none";235     } else {236         return "???";237     }238 }239 240 static const char *241 epoll_op_to_string(int op)242 {243     return op == EPOLL_CTL_ADD?"ADD":244         op == EPOLL_CTL_DEL?"DEL":245         op == EPOLL_CTL_MOD?"MOD":246         "???";247 }248 249 #define PRINT_CHANGES(op, events, ch, status)  250     "Epoll 
%s(%d) on fd %d " status ". "       251     "Old events were %d; "                     252     "read change was %d (%s); "                253     "write change was %d (%s); "               254     "close change was %d (%s)",                255     epoll_op_to_string(op),                    256     events,                                    257     ch->fd,                                    258     ch->old_events,                            259     ch->read_change,                           260     change_to_string(ch->read_change),         261     ch->write_change,                          262     change_to_string(ch->write_change),        263     ch->close_change,                          264     change_to_string(ch->close_change)265 266 static int267 epoll_apply_one_change(struct event_base *base,268     struct epollop *epollop,269     const struct event_change *ch)270 {271     struct epoll_event epev;272     int op, events = 0;273     int idx;274 275     idx = EPOLL_OP_TABLE_INDEX(ch);276     op = epoll_op_table[idx].op;277     events = epoll_op_table[idx].events;278 279     if (!events) {280         EVUTIL_ASSERT(op == 0);281         return 0;282     }283 284     if ((ch->read_change|ch->write_change) & EV_CHANGE_ET)285         events |= EPOLLET;286 287     memset(&epev, 0, sizeof(epev));288     epev.data.fd = ch->fd;289     epev.events = events;290     if (epoll_ctl(epollop->epfd, op, ch->fd, &epev) == 0) {291         event_debug((PRINT_CHANGES(op, epev.events, ch, "okay")));292         return 0;293     }294 295     switch (op) {296     case EPOLL_CTL_MOD:297         if (errno == ENOENT) {298             /* If a MOD operation fails with ENOENT, the299              * fd was probably closed and re-opened.  
We300              * should retry the operation as an ADD.301              */302             if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, ch->fd, &epev) == -1) {303                 event_warn("Epoll MOD(%d) on %d retried as ADD; that failed too",304                     (int)epev.events, ch->fd);305                 return -1;306             } else {307                 event_debug(("Epoll MOD(%d) on %d retried as ADD; succeeded.",308                     (int)epev.events,309                     ch->fd));310                 return 0;311             }312         }313         break;314     case EPOLL_CTL_ADD:315         if (errno == EEXIST) {316             /* If an ADD operation fails with EEXIST,317              * either the operation was redundant (as with a318              * precautionary add), or we ran into a fun319              * kernel bug where using dup*() to duplicate the320              * same file into the same fd gives you the same epitem321              * rather than a fresh one.  For the second case,322              * we must retry with MOD. */323             if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {324                 event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",325                     (int)epev.events, ch->fd);326                 return -1;327             } else {328                 event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",329                     (int)epev.events,330                     ch->fd));331                 return 0;332             }333         }334         break;335     case EPOLL_CTL_DEL:336         if (errno == ENOENT || errno == EBADF || errno == EPERM) {337             /* If a delete fails with one of these errors,338              * that‘s fine too: we closed the fd before we339              * got around to calling epoll_dispatch. 
*/340             event_debug(("Epoll DEL(%d) on fd %d gave %s: DEL was unnecessary.",341                 (int)epev.events,342                 ch->fd,343                 strerror(errno)));344             return 0;345         }346         break;347     default:348         break;349     }350 351     event_warn(PRINT_CHANGES(op, epev.events, ch, "failed"));352     return -1;353 }354 355 static int356 epoll_apply_changes(struct event_base *base)357 {358     struct event_changelist *changelist = &base->changelist;359     struct epollop *epollop = base->evbase;360     struct event_change *ch;361 362     int r = 0;363     int i;364 365     for (i = 0; i < changelist->n_changes; ++i) {366         ch = &changelist->changes[i];367         if (epoll_apply_one_change(base, epollop, ch) < 0)368             r = -1;369     }370 371     return (r);372 }373 374 static int375 epoll_nochangelist_add(struct event_base *base, evutil_socket_t fd,376     short old, short events, void *p)377 {378     struct event_change ch;379     ch.fd = fd;380     ch.old_events = old;381     ch.read_change = ch.write_change = ch.close_change = 0;382     if (events & EV_WRITE)383         ch.write_change = EV_CHANGE_ADD |384             (events & EV_ET);385     if (events & EV_READ)386         ch.read_change = EV_CHANGE_ADD |387             (events & EV_ET);388     if (events & EV_CLOSED)389         ch.close_change = EV_CHANGE_ADD |390             (events & EV_ET);391 392     return epoll_apply_one_change(base, base->evbase, &ch);393 }394 395 static int396 epoll_nochangelist_del(struct event_base *base, evutil_socket_t fd,397     short old, short events, void *p)398 {399     struct event_change ch;400     ch.fd = fd;401     ch.old_events = old;402     ch.read_change = ch.write_change = ch.close_change = 0;403     if (events & EV_WRITE)404         ch.write_change = EV_CHANGE_DEL;405     if (events & EV_READ)406         ch.read_change = EV_CHANGE_DEL;407     if (events & EV_CLOSED)408         
ch.close_change = EV_CHANGE_DEL;409 410     return epoll_apply_one_change(base, base->evbase, &ch);411 }412 413 static int414 epoll_dispatch(struct event_base *base, struct timeval *tv)415 {416     struct epollop *epollop = base->evbase;417     struct epoll_event *events = epollop->events;418     int i, res;419     long timeout = -1;420 421 #ifdef USING_TIMERFD422     if (epollop->timerfd >= 0) {423         struct itimerspec is;424         is.it_interval.tv_sec = 0;425         is.it_interval.tv_nsec = 0;426         if (tv == NULL) {427             /* No timeout; disarm the timer. */428             is.it_value.tv_sec = 0;429             is.it_value.tv_nsec = 0;430         } else {431             if (tv->tv_sec == 0 && tv->tv_usec == 0) {432                 /* we need to exit immediately; timerfd can‘t433                  * do that. */434                 timeout = 0;435             }436             is.it_value.tv_sec = tv->tv_sec;437             is.it_value.tv_nsec = tv->tv_usec * 1000;438         }439         /* TODO: we could avoid unnecessary syscalls here by only440            calling timerfd_settime when the top timeout changes, or441            when we‘re called with a different timeval.442         */443         if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) {444             event_warn("timerfd_settime");445         }446     } else447 #endif448     if (tv != NULL) {449         timeout = evutil_tv_to_msec_(tv);450         if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {451             /* Linux kernels can wait forever if the timeout is452              * too big; see comment on MAX_EPOLL_TIMEOUT_MSEC. 
*/453             timeout = MAX_EPOLL_TIMEOUT_MSEC;454         }455     }456 457     epoll_apply_changes(base);458     event_changelist_remove_all_(&base->changelist, base);459 460     EVBASE_RELEASE_LOCK(base, th_base_lock);461 462     res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);463 464     EVBASE_ACQUIRE_LOCK(base, th_base_lock);465 466     if (res == -1) {467         if (errno != EINTR) {468             event_warn("epoll_wait");469             return (-1);470         }471 472         return (0);473     }474 475     event_debug(("%s: epoll_wait reports %d", __func__, res));476     EVUTIL_ASSERT(res <= epollop->nevents);477 478     for (i = 0; i < res; i++) {479         int what = events[i].events;480         short ev = 0;481 #ifdef USING_TIMERFD482         if (events[i].data.fd == epollop->timerfd)483             continue;484 #endif485 486         if (what & (EPOLLHUP|EPOLLERR)) {487             ev = EV_READ | EV_WRITE;488         } else {489             if (what & EPOLLIN)490                 ev |= EV_READ;491             if (what & EPOLLOUT)492                 ev |= EV_WRITE;493             if (what & EPOLLRDHUP)494                 ev |= EV_CLOSED;495         }496 497         if (!ev)498             continue;499 500         evmap_io_active_(base, events[i].data.fd, ev | EV_ET);501     }502 503     if (res == epollop->nevents && epollop->nevents < MAX_NEVENT) {504         /* We used all of the event space this time.  We should505            be ready for more events next time. 
*/506         int new_nevents = epollop->nevents * 2;507         struct epoll_event *new_events;508 509         new_events = mm_realloc(epollop->events,510             new_nevents * sizeof(struct epoll_event));511         if (new_events) {512             epollop->events = new_events;513             epollop->nevents = new_nevents;514         }515     }516 517     return (0);518 }519 520 521 static void522 epoll_dealloc(struct event_base *base)523 {524     struct epollop *epollop = base->evbase;525 526     evsig_dealloc_(base);527     if (epollop->events)528         mm_free(epollop->events);529     if (epollop->epfd >= 0)530         close(epollop->epfd);531 #ifdef USING_TIMERFD532     if (epollop->timerfd >= 0)533         close(epollop->timerfd);534 #endif535 536     memset(epollop, 0, sizeof(struct epollop));537     mm_free(epollop);538 }539 540 #endif /* EVENT__HAVE_EPOLL */
View Code

(1)第117行-127行定义的epollops对应了这篇文章里说的epoll后端机制的定义。

(2)该文件中定义了epoll_init函数用于初始化、epoll_nochangelist_add函数用于添加一个事件、epoll_nochangelist_del函数用于删除一个事件、epoll_dispatch用于事件循环、epoll_dealloc用于释放资源。

1、epoll_init函数

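1)创建epfd:如果定义了EVENT__HAVE_EPOLL_CREATE1,先尝试epoll_create1(EPOLL_CLOEXEC);若不可用或失败,再退回到epoll_create,并调用evutil_make_socket_closeonexec设置close-on-exec。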

2)在堆上分配一个struct epollop结构epollop。

3)把epollop的成员epfd赋值为刚才创建的epfd。

4)初始化成员events,调用mm_calloc函数分配INITIAL_NEVENT个struct epoll_event。

5)初始化成员nevents为INITIAL_NEVENT。

6)如果定义了USING_TIMERFD宏,就初始化成员timerfd。

7)调用evsig_init_函数。

8)返回epollop。

2、epoll_nochangelist_add函数

1)判断read、write、close是否有改变。

2)调用epoll_apply_one_change函数,在该函数中首先调用epoll_ctl修改事件,然后处理各种异常情况,比如:ENOENT、EEXIST等等。

3、epoll_nochangelist_del函数

1)判断read、write、close是否有删除。

2)调用函数epoll_apply_one_change函数。

4、epoll_dispatch函数

1)通过event_base结构的evbase获取epollop指针,然后取出初始化时分配的events数组指针,保存在局部变量events中。

2)获取timeout。

3)先调用epoll_apply_changes应用changelist中挂起的修改并清空changelist,然后释放th_base_lock,调用epoll_wait函数,返回后重新获取th_base_lock。

4)在一个for循环中处理激活事件,在每一次循环中,先把epoll事件转换为libevent定义的事件:出现EPOLLHUP或EPOLLERR时转换为EV_READ|EV_WRITE,否则EPOLLIN->EV_READ,EPOLLOUT->EV_WRITE,EPOLLRDHUP->EV_CLOSED(timerfd自身的事件会被跳过),然后调用evmap_io_active_函数。

5)判断如果用完了所有事件(且nevents还小于MAX_NEVENT,即4096),则为下一次准备更多的事件,扩展为原来的2倍,第一次默认是32。

 

libevent源码分析:epoll后端实现