阅读源码时看到一个结构体的定义不知道在哪时，可以通过这个网站来查找，非常方便。

__user 是一个宏定义，表示用户空间的，内核不能直接使用，需要使用函数 copy_from_user/copy_to_user 进行处理。

0. 进程打开的文件

进程的表示：

// 源码位置：include/linux/sched.h
struct task_struct {
    // ....省略其他属性

    /* 文件系统信息: */
    struct fs_struct        *fs;

    /* 打开的文件信息: */
    struct files_struct     *files;

    // ....省略其他属性
}

进程维护打开的文件的数据结构：

// 源码位置：include/linux/fdtable.h
struct files_struct {
  /*
   * read mostly part
   */
    atomic_t count;
    bool resize_in_progress;
    wait_queue_head_t resize_wait;

    struct fdtable __rcu *fdt;
    struct fdtable fdtab;
  /*
   * written part on a separate cache line in SMP
   */
    spinlock_t file_lock ____cacheline_aligned_in_smp;
    unsigned int next_fd;
    unsigned long close_on_exec_init[1];
    unsigned long open_fds_init[1];
    unsigned long full_fds_bits_init[1];
    struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

struct fdtable {
    // 进程能打开的最大文件数
    unsigned int max_fds;
    struct file __rcu **fd;         /* current fd array */
    unsigned long *close_on_exec;

    // 当前打开的一组文件
    unsigned long *open_fds; 
    unsigned long *full_fds_bits;
    struct rcu_head rcu;
};

static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
    return test_bit(fd, fdt->open_fds);
}

小结：进程打开的文件维护在位图 fdtable.open_fds 里，对应的比特位为 1 表示文件打开，为 0 是关闭。

select 里传递事件也借鉴了这种思想，通过位图来传递，FD 对应的比特位为 1 表示对事件感兴趣或有事件发生。

1. 基本数据结构

// include/uapi/linux/posix_types.h
#define __FD_SETSIZE    1024
typedef struct {
    unsigned long fds_bits[__FD_SETSIZE / (8 * sizeof(long))];
} __kernel_fd_set;

// 源码位置： include/linux/types.h
typedef __kernel_fd_set     fd_set;

// 源码位置： fs/select.c
// 用于传递 select 的输入事件、输出结果，是 fd_set 的扩展版。
typedef struct {
    unsigned long *in, *out, *ex;
    unsigned long *res_in, *res_out, *res_ex;
} fd_set_bits;

小结：从上述定义可以看到，fd_set 就是一个位图，限制了 select 操作就多可以 poll 1024 个文件，如果要支持 poll 更多的文件，需要修改源码、重新编译。

fd_set_bits 里的6个变量是6个指针，指向不同的位图起始地址，见下文里的注释说明。

2. select 主逻辑

select 调用利用 fd_set 这个位图来传递输入的要监听的事件和输出结果。

// 源码位置： fs/select.c

// select 系统调用原型
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
        fd_set __user *, exp, struct timeval __user *, tvp)
{
    struct timespec64 end_time, *to = NULL;
    struct timeval tv;
    int ret;

    if (tvp) {
        if (copy_from_user(&tv, tvp, sizeof(tv)))
            return -EFAULT;

        to = &end_time;
        if (poll_select_set_timeout(to,
                tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
                (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
            return -EINVAL;
    }

    ret = core_sys_select(n, inp, outp, exp, to);
    ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

    return ret;
}

int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
               fd_set __user *exp, struct timespec64 *end_time)
{
    fd_set_bits fds;
    void *bits;
    int ret, max_fds;
    size_t size, alloc_size;
    struct fdtable *fdt;
    /* Allocate small arguments on the stack to save memory and be faster */
    // 在栈上分配的小段参数，用于节省内存和提升速度
    // SELECT_STACK_ALLOC=256
    long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

    ret = -EINVAL;
    if (n < 0)
        goto out_nofds;

    /* max_fds can increase, so grab it once to avoid race */
    rcu_read_lock();
    fdt = files_fdtable(current->files);
    max_fds = fdt->max_fds;
    rcu_read_unlock();

    // poll 的最大 FD 不能超过 进程打开的最大FD
    if (n > max_fds)
        n = max_fds;


    // 每个文件有3种输入、3种输出，因此每个文件需要6个位图来表示事件
    /*
     * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
     * since we used fdset we need to allocate memory in units of
     * long-words. 
     */
    // n 个文件需要的字节数，也是每份位图的大小
    size = FDS_BYTES(n);
    bits = stack_fds;

    // sizeof(stack_fds) / 6 是把栈上分配的内存块划分为 6 份做位图
    if (size > sizeof(stack_fds) / 6 ) {
        /* Not enough space in on-stack array; must use kmalloc */
        // 栈上分配的空间不够，要使用 kmalloc 
        ret = -ENOMEM;
        if (size > (SIZE_MAX / 6))
            goto out_nofds;

        alloc_size = 6 * size;
        bits = kvmalloc(alloc_size, GFP_KERNEL);
        if (!bits)
            goto out_nofds;
    }

    // 把 fds 里的指针指向不同位图的起始地址
    fds.in      = bits;
    fds.out     = bits +   size;
    fds.ex      = bits + 2*size;
    fds.res_in  = bits + 3*size;
    fds.res_out = bits + 4*size;
    fds.res_ex  = bits + 5*size;

    // 把用户空间的事件拷贝到内核空间
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;

    // 清零输出结果
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    ret = do_select(n, &fds, end_time);

    if (ret < 0)
        goto out;
    if (!ret) {
        ret = -ERESTARTNOHAND;
        if (signal_pending(current))
            goto out;
        ret = 0;
    }

    // 通过 __copy_to_user 拷贝结果到用户空间
    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;

out:
    if (bits != stack_fds)
        kvfree(bits);
out_nofds:
    return ret;
}


static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
    ktime_t expire, *to = NULL;
    // 构建一个等待队列，该队列维护着对所有添加到文件的等待队列的节点的指针
    struct poll_wqueues table;

    // 等待节点的数据原型，主要用于传递参数
    poll_table *wait;
    int retval, i, timed_out = 0;
    u64 slack = 0;
    unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
    unsigned long busy_start = 0;

    rcu_read_lock();
    retval = max_select_fd(n, fds);
    rcu_read_unlock();

    if (retval < 0)
        return retval;
    n = retval;

    // 设置 wait._qproc = __pollwait
    poll_initwait(&table);
    wait = &table.pt;
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        wait->_qproc = NULL;
        timed_out = 1;
    }

    if (end_time && !timed_out)
        slack = select_estimate_accuracy(end_time);

    retval = 0;
    for (;;) {
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
        bool can_busy_loop = false;

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        // 分批轮询
        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, mask, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;

            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;           // 
            if (all_bits == 0) {
                // 没有敢兴趣的事件，跳过 BITS_PER_LONG 个文件
                i += BITS_PER_LONG;
                continue;
            }

            // 批次内逐个轮询
            for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {   // bit 左移是为了给正确的文件设置事件结果
                struct fd f;
                if (i >= n)
                    break;
                if (!(bit & all_bits))
                    continue;
                f = fdget(i);
                if (f.file) {
                    // 找到了文件
                    const struct file_operations *f_op;
                    f_op = f.file->f_op;
                    mask = DEFAULT_POLLMASK;
                    if (f_op->poll) {
                        wait_key_set(wait, in, out,
                                 bit, busy_flag);

                        // 调用文件的 poll 函数，最终会调用到  __pollwait 函数
                        // __pollwait 
                        mask = (*f_op->poll)(f.file, wait);
                    }
                    fdput(f);

                    // 下面的 if 语句块内，是已经检测到事件发生了，进程不需要进行等待和唤醒
                    // 把 _qproc 设置为 NULL 是为了避免往后续 poll 未就绪的文件时被加入等待队列
                    // 这样可以避免无效的唤醒
                    if ((mask & POLLIN_SET) && (in & bit)) {
                        res_in |= bit;
                        retval++;
                        wait->_qproc = NULL;
                    }
                    if ((mask & POLLOUT_SET) && (out & bit)) {
                        res_out |= bit;
                        retval++;
                        wait->_qproc = NULL;
                    }
                    if ((mask & POLLEX_SET) && (ex & bit)) {
                        res_ex |= bit;
                        retval++;
                        wait->_qproc = NULL;
                    }

                    // 
                    /* got something, stop busy polling */
                    if (retval) {
                        can_busy_loop = false;
                        busy_flag = 0;

                    /*
                     * only remember a returned
                     * POLL_BUSY_LOOP if we asked for it
                     */
                    } else if (busy_flag & mask)
                        can_busy_loop = true;

                }
            }

            // 小批次轮询完，把结果记录下来
            if (res_in)
                *rinp = res_in;
            if (res_out)
                *routp = res_out;
            if (res_ex)
                *rexp = res_ex;

            //  进入睡眠，等待超时或唤醒
            cond_resched();
        }

        // 所有文件都轮询了一遍，要加入文件等待队列的都已经加了，避免下次轮询重复添加
        wait->_qproc = NULL;

        // 有事件、或超时、或有信号要处理
        if (retval || timed_out || signal_pending(current))
            break;
        if (table.error) {
            retval = table.error;
            break;
        }

        /* only if found POLL_BUSY_LOOP sockets && not out of time */
        if (can_busy_loop && !need_resched()) {
            if (!busy_start) {
                busy_start = busy_loop_current_time();
                continue;
            }
            if (!busy_loop_timeout(busy_start))
                continue;
        }
        busy_flag = 0;

        /*
         * If this is the first loop and we have a timeout
         * given, then we convert to ktime_t and set the to
         * pointer to the expiry value.
         */
        if (end_time && !to) {
            expire = timespec64_to_ktime(*end_time);
            to = &expire;
        }

        // 进程状态设置为 TASK_INTERRUPTIBLE，进入睡眠直到超时
        if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                       to, slack))
            timed_out = 1;
    }

    // 释放等待节点，重点是把等待节点从文件的等待队列删除掉
    poll_freewait(&table);

    return retval;
}

3. 等待与唤醒

3.0 相关数据结构

// 添加到文件等待队列的节点类型
struct poll_table_entry {
    // 要监听的目标文件
    struct file *filp;

    // 要监听的事件
    unsigned long key;

    wait_queue_entry_t wait;

    // 等待队列头
    wait_queue_head_t *wait_address;
};


// 每次执行 select 调用时维护的等待队列
struct poll_wqueues {
    // 调用 poll 操作时用于传递信息的对象
    poll_table pt;

    // inline_entries 空间不够用申请的等待节点列表
    struct poll_table_page *table;

    // 执行 select 的进程信息
    struct task_struct *polling_task;

    // 是否已唤醒
    int triggered;
    int error;

    // 指向 inline_entries 里未使用的节点下标
    int inline_index;

    // 预申请的等待节点
    struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

// 如果 inline_entries 不够用，则以 poll_table_page 链表的形式存起来
struct poll_table_page {
    struct poll_table_page * next;
    struct poll_table_entry * entry;
    struct poll_table_entry entries[0];
};

3.1 等待

static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
    // p->_qproc 对应了上面 wait->_qproc = NULL; 的优化
    if (p && p->_qproc && wait_address)
        p->_qproc(filp, wait_address, p);
}

void poll_initwait(struct poll_wqueues *pwq)
{
    // 设置 poll 等待队列的处理函数为 __pollwait
    init_poll_funcptr(&pwq->pt, __pollwait);
    pwq->polling_task = current;
    pwq->triggered = 0;
    pwq->error = 0;
    pwq->table = NULL;
    pwq->inline_index = 0;
}
EXPORT_SYMBOL(poll_initwait);

static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                poll_table *p)
{
    struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
    // 获取一个等待节点
    struct poll_table_entry *entry = poll_get_entry(pwq);
    if (!entry)
        return;
    entry->filp = get_file(filp);
    entry->wait_address = wait_address;

    // 设置敢兴趣的事件
    entry->key = p->_key;

    // 设置等待节点的回调函数为 pollwake，也即唤醒函数
    init_waitqueue_func_entry(&entry->wait, pollwake);

    // 指向本次 select 操作的 等待队列
    entry->wait.private = pwq;

    // 把等待节点添加到文件的等待队列上
    add_wait_queue(wait_address, &entry->wait);
}

static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
    pt->_qproc = qproc;
    pt->_key   = ~0UL; /* all events enabled */
}

3.2 唤醒

static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_table_entry *entry;

    // 找出等待节点 poll_table_entry
    entry = container_of(wait, struct poll_table_entry, wait);

    // 检测是否发生了敢兴趣的事件
    if (key && !((unsigned long)key & entry->key))
        return 0;

    // 执行唤醒
    return __pollwake(wait, mode, sync, key);
}

// 具体的唤醒逻辑就不展开了，涉及进程调度那些。
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_wqueues *pwq = wait->private;
    DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

    /*
     * Although this function is called under waitqueue lock, LOCK
     * doesn't imply write barrier and the users expect write
     * barrier semantics on wakeup functions.  The following
     * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
     * and is paired with smp_store_mb() in poll_schedule_timeout.
     */
    smp_wmb();
    pwq->triggered = 1;     // 设置为已触发

    /*
     * Perform the default wake up operation using a dummy
     * waitqueue.
     *
     * TODO: This is hacky but there currently is no interface to
     * pass in @sync.  @sync is scheduled to be removed and once
     * that happens, wake_up_process() can be used directly.
     */
    return default_wake_function(&dummy_wait, mode, sync, key);
}

4. 小结

4.0 概要说明

select 调用使用 fd_set 这个位图数据结构来传递输入事件、结果。
select执行时，由于内核不能直接访问用户空间，因此需要用 3 个位图来存放输入的事件，再用3个位图来存放 poll 的结果。fd_set_bits 里的6个指针分别指向这 6 个位图的起始地址。
从 do_select 的逻辑可以看到：
- select 实现首先尝试在栈上分配一块内存来存放上述6个位图，如果存放不下则申请新的内存块来存放。
- 每次 select 调用都尝试对所有的文件分批次调用 poll，在每个批次之间会睡眠。
- 如果还没有文件有感兴趣的事件发生，则在被 poll 的文件的等待队列上加入等待节点。一旦某个文件有事件发生，则后续的文件不管是否有事件发生，都不再加入等待节点。
- 如果前一轮 poll 没有感兴趣的事件发生，进程进入睡眠，等待唤醒或直至超时或有信号要处理；进程再次运行时，仍然需要 poll 所有的文件。
- select 调用结束前，把本次调用添加到文件等待队列上的节点都删除掉。

4.1 不足

最多 poll 1024 个文件的限制。
要 poll 的文件数量很大时，为传递输入的感兴趣的事件和输出结果，在用户空间与内核空间需要拷贝很大的内存。
效率受限于要 poll 的文件数量，数量越多需要的时间就越多。

欢迎关注我的微信公众号: coderbee笔记，可以更及时回复你的讨论。

coderbee笔记

日拱一卒，不期速成，厚积薄发。 wen866595@gmail.com

Linux select/poll/epoll 原理（二）select 实现

0. 进程打开的文件

1. 基本数据结构

2. select 主逻辑

3. 等待与唤醒

3.0 相关数据结构

3.1 等待

3.2 唤醒

4. 小结

4.0 概要说明

4.1 不足

发表回复取消回复

0. 进程打开的文件

1. 基本数据结构

2. select 主逻辑

3. 等待与唤醒

3.0 相关数据结构

3.1 等待

3.2 唤醒

4. 小结

4.0 概要说明

4.1 不足

相关文章：

发表回复 取消回复

发表回复取消回复