
* My procrastination has been acting up again... anyway, enough rambling... straight to the hard stuff... →_→ *

For a comparison with the select system call, see the earlier post — select kernel source code analysis

To get familiar with the poll mechanism, see the earlier post — poll mechanism kernel source code analysis

  • epoll_create

This walkthrough uses the Linux 3.0.12 kernel, whose system calls differ somewhat from the 2.4.0 kernel analyzed before... so let's jump straight into SYSCALL_DEFINE1...

//Each monitored event gets its own epitem structure
struct epitem {
	/* RB tree node used to link this structure to the eventpoll RB tree */
	//Every epitem is stored in the red-black tree rooted at eventpoll->rbr
	//rbn is this epitem's node within that tree
	struct rb_node rbn;

	/* List header used to link this structure to the eventpoll ready list */
	//When its event becomes ready, an epitem is linked into eventpoll's rdllist
	//rdllink is the list node used for that ready list
	struct list_head rdllink;

	/*
	 * Works together "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	//The next epitem in eventpoll's single-linked ovflist chain
	struct epitem *next;

	/* The file descriptor information this item refers to */
	//epoll_filefd records the struct file and the file descriptor fd this epitem refers to
	struct epoll_filefd ffd;

	/* Number of active wait queue attached to poll operations */
	//Number of wait queues attached by poll operations
	int nwait;

	/* List containing poll wait queues */
	//List of the poll wait queues this epitem is hooked onto
	struct list_head pwqlist;

	/* The "container" of this item */
	//Records which eventpoll this epitem belongs to
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	//List node linking this epitem into its struct file's item list
	struct list_head fllink;

	/* The structure that describe the interested events and the source fd */
	//The epoll_event this epitem was registered with; it is the argument passed to epoll_ctl
	struct epoll_event event;
};
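The ffd field is what keys an epitem inside the red-black tree. For reference, the helpers that fill and compare it look roughly like this in fs/eventpoll.c of this kernel family (ordering first by the struct file pointer, then by fd) — a sketch, not a line-for-line copy:

//Roughly the helpers that key epitems by the (file, fd) pair
struct epoll_filefd {
	struct file *file;
	int fd;
};

static inline void ep_set_ffd(struct epoll_filefd *ffd,
			      struct file *file, int fd)
{
	//Remember which struct file and which descriptor this epitem watches
	ffd->file = file;
	ffd->fd = fd;
}

static inline int ep_cmp_ffd(struct epoll_filefd *p1,
			     struct epoll_filefd *p2)
{
	//Total order: compare the file pointers first, fall back to the fd
	return (p1->file > p2->file ? +1 :
		(p1->file < p2->file ? -1 : p1->fd - p2->fd));
}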
struct eventpoll {
	/* Protect the access to this structure */
	spinlock_t lock;

	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	//The kernel holds this mutex while handling events, so the epoll operations performed in kernel space are thread-safe
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	//Wait queue holding the process that called sys_epoll_wait()
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	//Wait queue used when the epoll file itself is the target of a poll operation
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	//Every monitored event gets an epitem; when the event becomes ready its epitem is linked into the rdllist doubly linked list
	//struct epitem is defined above
	struct list_head rdllist;

	/* RB tree root used to store monitored fd structs */
	//Every monitored event gets an epitem, and all epitems are stored in this red-black tree
	struct rb_root rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem" that
	 * happened while transferring ready events to userspace w/out
	 * holding ->lock.
	 */
	//Events that become ready while ready events are being transferred to user space have their epitems chained into the ovflist single-linked list
	struct epitem *ovflist;

	/* The user that created the eventpoll descriptor */
	//User information, e.g. resource limits
	struct user_struct *user;
};
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	int error;
	//eventpoll is the central data structure of epoll: every epollfd has a corresponding eventpoll
	//struct eventpoll is defined above
	struct eventpoll *ep = NULL;

	/* Check the EPOLL_* constant for consistency.  */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	//Allocate and initialize an eventpoll structure
	//ep_alloc is defined below
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	//Create the epollfd
	//An epollfd has no real file behind it, so the kernel creates a virtual (anonymous) file and allocates a struct file for it
	//The eventpoll_fops argument is the file_operations, i.e. the operations this file supports
	//file_operations was explained in depth in the earlier poll mechanism source code analysis
	//Briefly: each member of file_operations is a callback function pointer implementing one concrete operation
	//The epollfd file implements three operations: release, poll and llseek
	//eventpoll_fops is defined below
	//The ep argument is the eventpoll for this epollfd; inside anon_inode_getfd the struct file's private_data member is set to ep's address
	//anon_inode_getfd is defined below
	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (error < 0)
		ep_free(ep);
	//Return the value of the epollfd
	return error;
}

SYSCALL_DEFINE1(epoll_create, int, size)
{
	//The size argument is effectively unused; it only has to be positive (a historical leftover)
	if (size <= 0)
		return -EINVAL;
	//sys_epoll_create1 is defined above
	return sys_epoll_create1(0);
}
static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;
	
	//Get the current user's information
	user = get_current_user();
	error = -ENOMEM;
	//Allocate zeroed memory for the eventpoll structure via kzalloc
	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
	if (unlikely(!ep))
		goto free_uid;
	
	spin_lock_init(&ep->lock);
	mutex_init(&ep->mtx);
	//Initialize the wq wait queue in eventpoll
	init_waitqueue_head(&ep->wq);
	//Initialize the poll_wait wait queue in eventpoll
	init_waitqueue_head(&ep->poll_wait);
	//Initialize the doubly linked list that holds the epitems of ready events
	INIT_LIST_HEAD(&ep->rdllist);
	//Initialize the red-black tree that stores the epitems of all monitored events; it starts out empty
	//#define RB_ROOT	(struct rb_root) { NULL, }
	ep->rbr = RB_ROOT;
	//Initialize ovflist, which collects the epitems of events that become ready while ready events are being transferred to user space; initial value is -1L
	//#define EP_UNACTIVE_PTR ((void *) -1L)
	ep->ovflist = EP_UNACTIVE_PTR;
	//Record the user information
	ep->user = user;
	//Hand the new eventpoll back through the output pointer
	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}
//So the anonymous file behind an epollfd implements only three operations
//release frees the eventpoll structure belonging to the epollfd
//ep_eventpoll_release is defined below
//poll reports the readiness of the epoll file itself when it is polled
//we set ep_eventpoll_poll aside for now; it is explained in detail under epoll_wait
//llseek simply returns the anonymous file's current offset
//noop_llseek is defined below
static const struct file_operations eventpoll_fops = {
	.release	= ep_eventpoll_release, 
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};
static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
	//Retrieve the epollfd's eventpoll from the struct file's private_data member
	struct eventpoll *ep = file->private_data;
	//Free the eventpoll structure
	if (ep)
		ep_free(ep);

	return 0;
}
loff_t noop_llseek(struct file *file, loff_t offset, int origin)
{
	//Just return the file's current offset
	return file->f_pos;
}
int anon_inode_getfd(const char *name, const struct file_operations *fops,
		     void *priv, int flags)
{
	int error, fd;
	struct file *file;
	//Allocate an unused file descriptor; this becomes the epollfd
	error = get_unused_fd_flags(flags);
	if (error < 0)
		return error;
	fd = error;
	//Create the anonymous file
	file = anon_inode_getfile(name, fops, priv, flags);
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto err_put_unused_fd;
	}
	//Bind the descriptor to the anonymous file, i.e. set fd[fd] in the files_struct's fdtable to file
	fd_install(fd, file);
	//Return the epollfd
	return fd;

err_put_unused_fd:
	put_unused_fd(fd);
	return error;
}
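Seen from user space, all of the machinery above boils down to a single call. A minimal, hypothetical sketch (nothing is assumed beyond the standard syscall wrappers):

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
	//epoll_create()'s size argument is ignored as long as it is positive;
	//epoll_create1() is the modern spelling and also accepts EPOLL_CLOEXEC
	int epfd = epoll_create1(EPOLL_CLOEXEC);
	if (epfd < 0) {
		perror("epoll_create1");
		return 1;
	}
	printf("epollfd = %d\n", epfd);
	//close() eventually reaches ep_eventpoll_release(), which frees the eventpoll
	close(epfd);
	return 0;
}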
  • epoll_ctl
struct epoll_event {
	__u32 events; //epoll event-type mask
	__u64 data; //user data returned with the event; applications usually store the monitored fd here
} EPOLL_PACKED;
//The epfd argument is the epollfd returned by epoll_create
//The op argument selects the operation on the event; there are three:
//#define EPOLL_CTL_ADD 1  add a new monitored event
//#define EPOLL_CTL_DEL 2  delete a monitored event
//#define EPOLL_CTL_MOD 3  modify a monitored event
//The fd argument is the file descriptor to operate on
//The event argument describes which event types to monitor
//e.g. data readable EPOLLIN, edge-triggered mode EPOLLET, deliver-once EPOLLONESHOT
//struct epoll_event is defined above
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error; 
	int did_lock_epmutex = 0;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	error = -EFAULT;
	//ep_op_has_event is simply "return op != EPOLL_CTL_DEL;", i.e. it checks whether the operation carries an event
	//If so, copy the epoll_event structure from user space into the kernel
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	//Get the struct file of the anonymous file behind the epollfd
	file = fget(epfd);
	if (!file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	//Get the struct file behind the target file descriptor
	tfile = fget(fd);
	if (!tfile)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	//The target file must have file operations and support poll
	if (!tfile->f_op || !tfile->f_op->poll)
		goto error_tgt_fput;

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	//Make sure the target is not the epollfd itself, and that epfd really is an epoll file
	if (file == tfile || !is_file_epoll(file))
		goto error_tgt_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	//Fetch the eventpoll structure from the struct file's private_data
	ep = file->private_data;

	/*
	 * When we insert an epoll file descriptor, inside another epoll file
	 * descriptor, there is the change of creating closed loops, which are
	 * better be handled here, than in more critical paths.
	 *
	 * We hold epmutex across the loop check and the insert in this case, in
	 * order to prevent two separate inserts from racing and each doing the
	 * insert "at the same time" such that ep_loop_check passes on both
	 * before either one does the insert, thereby creating a cycle.
	 */
	//If the target file is itself an epoll file and the operation is an add, nesting one epoll fd inside another could create a closed loop of epoll files, which is better handled here than on more critical paths
	//epmutex is held across both the loop check and the insert, so that two concurrent inserts cannot each pass ep_loop_check() first and then both insert, thereby creating a cycle
	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
		mutex_lock(&epmutex);
		did_lock_epmutex = 1;
		error = -ELOOP;
		if (ep_loop_check(ep, tfile) != 0)
			goto error_tgt_fput;
	}

	mutex_lock_nested(&ep->mtx, 0);

	/*
	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	//epoll does not allow the same fd to be added twice
	//Search the rbr red-black tree in eventpoll, comparing the target's struct file and fd with the epoll_filefd inside each epitem
	//Returns the matching epitem if found, NULL otherwise
	epi = ep_find(ep, tfile, fd);

	error = -EINVAL;
	//Dispatch on the requested operation
	switch (op) {
	//Add a new monitored event
	case EPOLL_CTL_ADD:
		//Only allowed if the event is not registered yet
		if (!epi) {
			//Also monitor POLLERR and POLLHUP, which the kernel always cares about
			epds.events |= POLLERR | POLLHUP;
			//Do the real insertion of the new monitored event
			//ep_insert is defined below
			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		break;
	//Delete a monitored event
	case EPOLL_CTL_DEL:
		//Only allowed if the event is registered
		if (epi)
			//ep_remove is not dissected here...
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	//Modify a monitored event
	case EPOLL_CTL_MOD:
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			//ep_modify is not dissected here...
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (unlikely(did_lock_epmutex))
		mutex_unlock(&epmutex);

	fput(tfile);
error_fput:
	fput(file);
error_return:

	return error;
}
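Seen from user space, the three op values map directly onto the switch above. A small, hypothetical sketch (sockfd stands for any pollable descriptor; as we just saw, the kernel rejects files without an f_op->poll):

#include <sys/epoll.h>

static int watch(int epfd, int sockfd)
{
	struct epoll_event ev;

	//EPOLL_CTL_ADD ends up in ep_insert()
	ev.events = EPOLLIN;
	ev.data.fd = sockfd;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev) < 0)
		return -1;

	//EPOLL_CTL_MOD ends up in ep_modify()
	ev.events = EPOLLIN | EPOLLET;
	if (epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev) < 0)
		return -1;

	//EPOLL_CTL_DEL ends up in ep_remove(); ep_op_has_event() is false
	//for DEL, so the event argument may be NULL here
	return epoll_ctl(epfd, EPOLL_CTL_DEL, sockfd, NULL);
}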
typedef struct poll_table_struct {
	//qproc is the callback that a file's poll() invokes to hook the caller onto the file's wait queues
	poll_queue_proc qproc;
	//key records which events of the monitored file we are interested in
	unsigned long key;
} poll_table;
struct ep_pqueue {
	//poll_table belongs to the poll callback machinery
	//poll_table is defined above
	poll_table pt; 
	//The epitem this queueing operation is for
	struct epitem *epi;
};
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	//Install the poll-machinery callback
	pt->qproc = qproc;
	//Initialize the mask of interesting event types; initially all events are of interest
	pt->key   = ~0UL; /* all events enabled */
}
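For reference, this is roughly what ep_ptable_queue_proc, the qproc installed above, looks like in this kernel family (an abridged sketch from memory; see fs/eventpoll.c for the authoritative version). It allocates an eppoll_entry, points its wait-queue entry at ep_poll_callback, and hooks it onto the wait queue head the target file hands us:

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	//Recover the epitem from the enclosing ep_pqueue
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		//Arrange for ep_poll_callback to run when the file wakes this queue
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead;
		pwq->base = epi;
		//Hook onto the target file's wait queue and remember the hook in pwqlist
		add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		//Report the allocation failure back to ep_insert() through nwait
		epi->nwait = -1;
	}
}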
//The ep argument is the eventpoll corresponding to the epollfd
//The event argument is the epoll_event describing the new monitored event's types
//The tfile argument is the struct file of the new monitored event
//The fd argument is the new monitored event's file descriptor
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;
	
	//Read how many events the current user is already watching
	user_watches = atomic_long_read(&ep->user->epoll_watches);
	//Refuse if the user has reached the maximum number of watches
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	//Allocate an epitem from the slab cache
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follow here ... */
	//Initialize the list nodes
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	//Record which eventpoll this epitem belongs to
	epi->ep = ep;
	//Record the new event's struct file and fd in the epitem's epoll_filefd
	ep_set_ffd(&epi->ffd, tfile, fd);
	//Record which event types this new item wants to monitor
	epi->event = *event;
	//No wait queues attached by poll operations yet
	epi->nwait = 0;
	//Initialize this epitem's successor in eventpoll's ovflist to (void *) -1L
	epi->next = EP_UNACTIVE_PTR;

	/* Initialize the poll table using the queue callback */
	//Record the epitem inside the ep_pqueue
	//epq is an ep_pqueue; struct ep_pqueue is defined above
	epq.epi = epi;
	//Initialize the poll_table with our queue callback
	//init_poll_funcptr is defined above
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	//Call the target file's poll file operation; this runs our queue callback to hook onto the file's wait queues (the poll callback machinery was explained in detail earlier)
	//The return value is the set of events that are already ready
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	error = -ENOMEM;
	//Under memory pressure the wait queue allocation may fail, so we must check for that here
	if (epi->nwait < 0)
		goto error_unregister;

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_lock);
	//Link the epitem onto the f_ep_links list in the monitored file's struct file
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	//Insert the epitem into the rbr red-black tree of the epollfd's eventpoll
	ep_rbtree_insert(ep, epi);

	/* We have to drop the new item inside our item list to keep track of it */
	spin_lock_irqsave(&ep->lock, flags);

	/* If the file is already "ready" we drop it inside the ready list */
	//If the new event is already ready and its epitem is not yet on the ready list
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		//Link the epitem into the ready list
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		//If the wq wait queue in eventpoll is not empty, wake up the process sleeping on it
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		//If the poll_wait wait queue is not empty, bump pwake so it is woken after the lock is dropped
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	atomic_long_inc(&ep->user->epoll_watches);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_unregister:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue. Note that we don't care about the ep->ovflist
	 * list, since that is used/cleaned only inside a section bound by "mtx".
	 * And ep_insert() is called with "mtx" held.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	kmem_cache_free(epi_cache, epi);

	return error;
}
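The other half of the mechanism is ep_poll_callback, the function that ep_ptable_queue_proc wired into the file's wait queue: when the device becomes ready it links the epitem into rdllist (or onto ovflist while a transfer to user space is in flight) and wakes the sleeper in epoll_wait. Roughly, in this kernel family (an abridged sketch from memory):

static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	//Recover the epitem from the eppoll_entry embedding this wait entry
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;

	spin_lock_irqsave(&ep->lock, flags);
	//Ignore wakeups for events this item is not interested in
	if (key && !((unsigned long) key & epi->event.events))
		goto out_unlock;
	//While ep_scan_ready_list() owns rdllist, queue onto ovflist instead
	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
		}
		goto out_unlock;
	}
	//Normal path: link the epitem into the ready list
	if (!ep_is_linked(&epi->rdllink))
		list_add_tail(&epi->rdllink, &ep->rdllist);
	//Wake the process sleeping in epoll_wait(), and any nested pollers
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;
out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);
	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);
	return 1;
}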
  • epoll_wait
//The epfd argument is the epollfd
//The events argument points to an array that receives the ready events
//The maxevents argument is the maximum number of events to return at once
//The timeout argument is how long to block
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	int error;
	struct file *file;
	struct eventpoll *ep;

	/* The maximum number of event must be greater than zero */
	//Validate maxevents
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	//Verify that the user-supplied events buffer is valid and writable
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
		error = -EFAULT;
		goto error_return;
	}

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	//Get the struct file behind the epollfd
	file = fget(epfd);
	if (!file)
		goto error_return;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	//Check that the file really is an epoll file
	if (!is_file_epoll(file))
		goto error_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	//The struct file's private_data member stores the epollfd's eventpoll
	ep = file->private_data;

	/* Time to fish for events ... */
	//ep_poll is defined below
	error = ep_poll(ep, events, maxevents, timeout);

error_fput:
	fput(file);
error_return:

	return error;
}
//The ep argument is the eventpoll corresponding to the epollfd
//The remaining arguments have the same meaning as in epoll_wait
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	unsigned long flags;
	long slack = 0;
	//Wait queue entry for the current process
	wait_queue_t wait;
	ktime_t expires, *to = NULL;
	
	//If the timeout is positive, convert it into the kernel's internal time representation
	if (timeout > 0) {
		struct timespec end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec_to_ktime(end_time);
	}
	//If the timeout is zero, i.e. non-blocking mode, jump straight to check_events
	else if (timeout == 0) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non blocking operation.
		 */
		timed_out = 1;
		spin_lock_irqsave(&ep->lock, flags);
		goto check_events;
	}

fetch_events:
	spin_lock_irqsave(&ep->lock, flags);
	//True when nothing is ready: rdllist is empty and ovflist still holds its initial value EP_UNACTIVE_PTR
	if (!ep_events_available(ep)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be wake up by
		 * ep_poll_callback() when events will become available.
		 */
		//Initialize the wait entry for the current process; current is a macro denoting the current process
		//init_waitqueue_entry is defined below
		init_waitqueue_entry(&wait, current);
		//Add the wait entry to eventpoll's wq wait queue
		__add_wait_queue_exclusive(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			//Mark the current process for an interruptible (light) sleep before doing the checks
			set_current_state(TASK_INTERRUPTIBLE);
			//If events have become available (rdllist non-empty or ovflist no longer EP_UNACTIVE_PTR) or the timeout has expired, stop waiting and break out of the loop
			if (ep_events_available(ep) || timed_out)
				break;
			//If a signal is pending, also stop waiting and break out
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			spin_unlock_irqrestore(&ep->lock, flags);
			//The current process is scheduled out and goes to sleep
			//If an event becomes ready or a signal arrives in the meantime, the poll callback machinery wakes it up
			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
				timed_out = 1;

			spin_lock_irqsave(&ep->lock, flags);
		}
		//We have now broken out of the for loop
		//Remove our wait entry from eventpoll's wq wait queue
		__remove_wait_queue(&ep->wq, &wait);
		//Set the current process back to the running state
		set_current_state(TASK_RUNNING);
	}
check_events:
	/* Is it worth to try to dig for events ? */
	//Check whether events are available now: rdllist non-empty or ovflist no longer EP_UNACTIVE_PTR
	eavail = ep_events_available(ep);

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	//Now try to transfer the ready events to user space
	//If we got zero events and there is still timeout left, jump back to fetch_events and try again
	//ep_send_events is defined below
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
		goto fetch_events;

	return res;
}
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
	q->flags = 0; //Initialize the wait entry's flags to 0
	q->private = p; //Point the wait entry's private member at p, i.e. the current process
	q->func = default_wake_function; //Set the wait entry's func to default_wake_function, the function used to wake the process
}
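ep_poll leans on the little helper ep_events_available(); in this kernel family it is roughly the following one-liner, which is exactly the "rdllist non-empty or ovflist in use" condition mentioned above:

static inline int ep_events_available(struct eventpoll *ep)
{
	//Events are available when the ready list is non-empty, or when ovflist
	//is in use, i.e. no longer holds its idle value EP_UNACTIVE_PTR
	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
}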
//The arguments mean the same as in ep_poll, so they are not repeated here
static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	//Fill in an ep_send_events_data structure, which simply bundles maxevents and events
	struct ep_send_events_data esed;

	esed.maxevents = maxevents;
	esed.events = events;
	//ep_scan_ready_list is defined below
	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
}
//The ep argument is the eventpoll corresponding to the epollfd
//The sproc argument is a function pointer; for this call it is ep_send_events_proc
//The priv argument points to the ep_send_events_data structure
//The depth argument is passed in as 0
static int ep_scan_ready_list(struct eventpoll *ep,
			      int (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv,
			      int depth)
{
	int error, pwake = 0;
	unsigned long flags;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */
	mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback to queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	//At this point every epitem whose event became ready has been linked into eventpoll's rdllist
	//Move all elements of the rdllist ready list onto txlist, leaving rdllist empty
	list_splice_init(&ep->rdllist, &txlist);
	//Set ovflist to NULL so that epitems of events that become ready from now on are not added to rdllist directly
	ep->ovflist = NULL;
	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Now call the callback function.
	 */
	//Now invoke the callback passed in as a parameter, i.e. ep_send_events_proc
	//ep_send_events_proc is defined below
	error = (*sproc)(ep, &txlist, priv);

	spin_lock_irqsave(&ep->lock, flags);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	//While ep_send_events_proc was running, i.e. while data was being handed to user space,
	//any events that became ready had their epitems chained into eventpoll's ovflist
	//Walk the ovflist chain now and process those epitems one by one
	for (nepi = ep->ovflist; (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		//If the epitem is not already on the ready list, tail-insert it into rdllist
		if (!ep_is_linked(&epi->rdllink))
			list_add_tail(&epi->rdllink, &ep->rdllist);
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	//Reset ovflist to EP_UNACTIVE_PTR, i.e. ((void *) -1L)
	ep->ovflist = EP_UNACTIVE_PTR;

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	//ep_send_events_proc may have left some epitems unprocessed on txlist; splice them back into rdllist
	list_splice(&txlist, &ep->rdllist);
	
	//If the rdllist ready list is not empty
	if (!list_empty(&ep->rdllist)) {
		/*
		 * Wake up (if active) both the eventpoll wait list and
		 * the ->poll() wait list (delayed after we release the lock).
		 */
		//If the wq wait queue is not empty
		if (waitqueue_active(&ep->wq))
			//wake up the process sleeping on wq, i.e. the caller of epoll_wait
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}
	spin_unlock_irqrestore(&ep->lock, flags);

	mutex_unlock(&ep->mtx);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return error;
}
//The ep argument is the eventpoll corresponding to the epollfd
//The head argument is txlist
//The priv argument is the ep_send_events_data structure
static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct ep_send_events_data *esed = priv;
	int eventcnt;
	unsigned int revents;
	struct epitem *epi;
	struct epoll_event __user *uevent;

	/*
	 * We can loop without lock because we are passed a task private list.
	 * Items cannot vanish during the loop because ep_scan_ready_list() is
	 * holding "mtx" during this call.
	 */
	//Walk the txlist list
	for (eventcnt = 0, uevent = esed->events;
	     !list_empty(head) && eventcnt < esed->maxevents;) {
		//Take the first node of txlist
		epi = list_first_entry(head, struct epitem, rdllink);
		//Unlink the epitem from txlist
		list_del_init(&epi->rdllink);
		//Re-poll the file to get the epitem's current ready events, masked by the event types it monitors
		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
			epi->event.events;

		/*
		 * If the event mask intersect the caller-requested one,
		 * deliver the event to userspace. Again, ep_scan_ready_list()
		 * is holding "mtx", so no operations coming from userspace
		 * can change the item.
		 */
		//Check once more whether a monitored event is actually ready
		if (revents) {
			//Copy the ready event to user space
			//If the copy fails, put the epitem back on the txlist and bail out
			if (__put_user(revents, &uevent->events) ||
			    __put_user(epi->event.data, &uevent->data)) {
				list_add(&epi->rdllink, head);
				return eventcnt ? eventcnt : -EFAULT;
			}
			eventcnt++;
			uevent++;
			if (epi->event.events & EPOLLONESHOT)
				epi->event.events &= EP_PRIVATE_BITS;
			//If the fd is not in ET mode, the epitem must be linked back into the rdllist ready list; this is the essential difference between LT and ET,
			//so that the next call to epoll_wait() will check the event's availability again
			else if (!(epi->event.events & EPOLLET)) {
				/*
				 * If this file has been added with Level
				 * Trigger mode, we need to insert back inside
				 * the ready list, so that the next call to
				 * epoll_wait() will check again the events
				 * availability. At this point, no one can insert
				 * into ep->rdllist besides us. The epoll_ctl()
				 * callers are locked out by
				 * ep_scan_ready_list() holding "mtx" and the
				 * poll callback will queue them in ep->ovflist.
				 */
				list_add_tail(&epi->rdllink, &ep->rdllist);
			}
		}
	}

	return eventcnt;
}
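To tie the three system calls together, here is a minimal, hypothetical user-space sketch: a level-triggered loop that copies standard input to standard output (error handling trimmed for brevity):

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
	//epoll_create1 builds the eventpoll and its anonymous file
	int epfd = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = STDIN_FILENO };
	struct epoll_event ready[8];

	//epoll_ctl(ADD) allocates an epitem and inserts it into the RB tree
	if (epfd < 0 || epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev) < 0) {
		perror("epoll");
		return 1;
	}

	for (;;) {
		//Sleep in ep_poll() until rdllist is populated, then
		//ep_send_events() copies the ready events into "ready"
		int n = epoll_wait(epfd, ready, 8, -1);
		if (n < 0)
			break;
		for (int i = 0; i < n; i++) {
			char buf[4096];
			ssize_t len = read(ready[i].data.fd, buf, sizeof(buf));
			if (len <= 0)
				goto out;
			write(STDOUT_FILENO, buf, len);
		}
	}
out:
	close(epfd);
	return 0;
}

Because EPOLLET is not set, any input left unread keeps the epitem on rdllist, exactly as ep_send_events_proc showed for level-triggered mode.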
