首页 > 代码库 > Linux Kernel File IO Syscall Kernel-Source-Code Analysis(undone)

Linux Kernel File IO Syscall Kernel-Source-Code Analysis(undone)

目录

0. 引言
1. open() syscall
2. close() syscall

 

0. 引言

在linux的哲学中,所有的磁盘文件、目录、外设设备、驱动设备全部被抽象为了"文件"这个概念,所以本文提到的"File IO"适用于linux下所有的IO操作。需要明白的是,本文分析的是linux下的IO系统调用对应的内核源代码: linux下每一个系统调用都有对应的内核源代码,而我们在ring3编程时常用的glibc库API只是对系统调用的一个封装,最终还是要通过系统调用实现功能

0x1: SYSCALL_DEFINE宏定义

我们在学习内核源代码的时候经常会遇到一个宏定义: SYSCALL_DEFINE,所有的系统调用的声明都通过它来实现

\linux-2.6.32.63\include\linux\syscalls.h

/*
 * SYSCALL_DEFINE0..SYSCALL_DEFINE6 - open the definition of a system call
 * that takes 0..6 arguments.
 *
 * SYSCALL_DEFINE0(name) expands to the function header
 *     asmlinkage long sys_<name>(void)
 * When CONFIG_FTRACE_SYSCALLS is set it additionally emits the trace
 * enter/exit events and a struct syscall_metadata record (placed in the
 * "__syscalls_metadata" section) in front of that header.
 *
 * SYSCALL_DEFINE1..6 prepend '_' to the name and forward the
 * (type, name, type, name, ...) argument list to SYSCALL_DEFINEx together
 * with the argument count.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
#define SYSCALL_DEFINE0(sname)                                      \
    SYSCALL_TRACE_ENTER_EVENT(_##sname);                            \
    SYSCALL_TRACE_EXIT_EVENT(_##sname);                             \
    static const struct syscall_metadata __used                     \
      __attribute__((__aligned__(4)))                               \
      __attribute__((section("__syscalls_metadata")))               \
      __syscall_meta_##sname = {                                    \
        .name         = "sys_"#sname,                               \
        .nb_args     = 0,                                           \
        .enter_event    = &event_enter__##sname,                    \
        .exit_event    = &event_exit__##sname,                      \
    };                                                              \
    asmlinkage long sys_##sname(void)
#else
    #define SYSCALL_DEFINE0(name)       asmlinkage long sys_##name(void)
#endif

#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

...

/*
 * SYSCALL_DEFINEx(x, sname, ...) - common back end of SYSCALL_DEFINE1..6.
 * @x:     number of syscall arguments
 * @sname: syscall name with a leading underscore (e.g. _open)
 * @...:   flattened (type, name) pairs
 *
 * With CONFIG_FTRACE_SYSCALLS it first emits two string tables
 * (types_<sname>[] and args_<sname>[]) and the SYSCALL_METADATA record,
 * then hands over to __SYSCALL_DEFINEx; otherwise it is a plain alias for
 * __SYSCALL_DEFINEx.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
    #define SYSCALL_DEFINEx(x, sname, ...)                          \
        static const char *types_##sname[] = {                      \
            __SC_STR_TDECL##x(__VA_ARGS__)                          \
        };                                                          \
        static const char *args_##sname[] = {                       \
            __SC_STR_ADECL##x(__VA_ARGS__)                          \
        };                                                          \
        SYSCALL_METADATA(sname, x);                                 \
        __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#else
    #define SYSCALL_DEFINEx(x, sname, ...)                \
        __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#endif

/*
 * __SYSCALL_DEFINEx produces the actual function header.
 *
 * With CONFIG_HAVE_SYSCALL_WRAPPERS it declares sys<name>, defines a
 * SyS<name> wrapper that takes its arguments through __SC_LONG, runs
 * __SC_TEST on them, casts them back with __SC_CAST and calls the static
 * inline SYSC<name>, aliases sys<name> to SyS<name>, and finally opens the
 * definition of SYSC<name> (whose body is supplied by the macro user).
 *
 * Without it, the macro simply expands to
 *     asmlinkage long sys<name>(<typed argument list>)
 */
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
    #define SYSCALL_DEFINE(name) static inline long SYSC_##name
    #define __SYSCALL_DEFINEx(x, name, ...)                    \
    asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));           \
    static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));       \
    asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))            \
    {                                                               \
        __SC_TEST##x(__VA_ARGS__);                                  \
        return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));        \
    }                                                               \
    SYSCALL_ALIAS(sys##name, SyS##name);                            \
    static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))
#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
    #define SYSCALL_DEFINE(name) asmlinkage long sys_##name
    #define __SYSCALL_DEFINEx(x, name, ...) asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */

所以对函数定义

SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)就等于
asmlinkage long sys_socket(int family, int type, int protocol)

Relevant Link:

http://blog.csdn.net/p_panyuch/article/details/5648007

 

1. open() syscall

open()系统调用在kernel中对应的是sys_open()

\linux-2.6.32.63\fs\open.c

/*
 * sys_open - kernel entry point of the open() system call.
 * @filename: user-space pointer to the path name
 * @flags:    open flags; O_LARGEFILE is OR-ed in when force_o_largefile()
 *            is true
 * @mode:     creation mode, passed through unchanged
 *
 * Returns whatever do_sys_open() returns.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
{
    long ret;

    if (force_o_largefile())
    {
        flags |= O_LARGEFILE;
    }

    /* do_sys_open() does the real work; the lookup is relative to AT_FDCWD. */
    ret = do_sys_open(AT_FDCWD, filename, flags, mode);
    /* avoid REGPARM breakage on x86: */
    asmlinkage_protect(3, ret, filename, flags, mode);
    return ret;
}

继续跟进do_sys_open()函数

/*
 * do_sys_open - common implementation behind open().
 * @dfd:      directory fd the lookup is relative to
 * @filename: user-space pointer to the path name
 * @flags:    open flags
 * @mode:     creation mode
 *
 * Returns the new file descriptor on success. On failure it returns the
 * error taken (via PTR_ERR) from getname() or do_filp_open(), or the
 * negative value returned by get_unused_fd_flags().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
{
    /*
     * Obtain a kernel copy of the file name. NOTE(review): getname() is
     * not shown here; it presumably allocates a buffer and copies the
     * string in from user space. It is released with putname() below.
     */
    char *tmp = getname(filename);
    int fd = PTR_ERR(tmp);

    if (!IS_ERR(tmp))
    {
        /* Reserve an unused file descriptor for this process. */
        fd = get_unused_fd_flags(flags);
        if (fd >= 0)
        {
            /* The descriptor is reserved; now actually open the file. */
            struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);
            if (IS_ERR(f))
            {
                /* Open failed: give the reserved descriptor back. */
                put_unused_fd(fd);
                fd = PTR_ERR(f);
            }
            else
            {
                /* Open succeeded: emit the fsnotify open event. */
                fsnotify_open(f->f_path.dentry);
                /* Bind the struct file to the reserved descriptor slot. */
                fd_install(fd, f);
            }
        }
        /* Release the kernel copy of the file name. */
        putname(tmp);
    }
    return fd;
}

继续跟进do_filp_open()函数

/* * Note that the low bits of the passed in "open_flag" * are not the same as in the local variable "flag". See * open_to_namei_flags() for more details. */struct file *do_filp_open(int dfd, const char *pathname, int open_flag, int mode, int acc_mode){    /* 若干变量声明 */    struct file *filp;    struct nameidata nd;    int error;    struct path path;    struct dentry *dir;    int count = 0;    int will_write;    /*改变参数flag的值,具体做法是flag+1*/    int flag = open_to_namei_flags(open_flag);    /*设置访问权限*/    if (!acc_mode)    {        acc_mode = MAY_OPEN | ACC_MODE(flag);    }     /* O_TRUNC implies we need access checks for write permissions */    /* 根据O_TRUNC标志设置写权限 */    if (flag & O_TRUNC)    {        acc_mode |= MAY_WRITE;    }     /* Allow the LSM permission hook to distinguish append access from general write access. */    /* 设置O_APPEND标志 */    if (flag & O_APPEND)    {        acc_mode |= MAY_APPEND;    }     /* The simplest case - just a plain lookup. */    /* 如果不是创建文件 */    if (!(flag & O_CREAT))     {         /*        当内核要访问一个文件的时候,第一步要做的是找到这个文件,而查找文件的过程在vfs里面是由path_lookup或者path_lookup_open函数来完成的        这两个函数将用户传进来的字符串表示的文件路径转换成一个dentry结构,并建立好相应的inode和file结构,将指向file的描述符返回用户        用户随后通过文件描述符,来访问这些数据结构        */        error = path_lookup_open(dfd, pathname, lookup_flags(flag), &nd, flag);        if (error)        {            return ERR_PTR(error);        }         goto ok;    }    /*     * Create - we need to know the parent.     
*/    //path-init为查找作准备工作,path_walk真正上路查找,这两个函数联合起来根据一段路径名找到对应的dentry      error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);    if (error)    {        return ERR_PTR(error);    }     /*    这个函数相当重要,是整个NFS的名字解析函数,其实也是NFS得以构筑的函数    该函数采用一个for循环,对name路径根据目录的层次,一层一层推进,直到终点或失败。在推进的过程中,一步步建立了目录树的dentry和对应的inode    */    error = path_walk(pathname, &nd);    if (error)     {        if (nd.root.mnt)        {            /*减少dentry和vsmount得计数*/            path_put(&nd.root);        }         return ERR_PTR(error);    }    if (unlikely(!audit_dummy_context()))    {        /*保存inode节点信息*/        audit_inode(pathname, nd.path.dentry);    }     /*     * We have the parent and last component. First of all, check     * that we are not asked to creat(2) an obvious directory - that     * will not do.     */    error = -EISDIR;    /*父节点信息*/    if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])    {        goto exit_parent;    }     error = -ENFILE;    /* 返回特定的file结构体指针 */    filp = get_empty_filp();    if (filp == NULL)    {        goto exit_parent;    }     /* 填充nameidata结构 */    nd.intent.open.file = filp;    nd.intent.open.flags = flag;    nd.intent.open.create_mode = mode;    dir = nd.path.dentry;    nd.flags &= ~LOOKUP_PARENT;    nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;    if (flag & O_EXCL)    {        nd.flags |= LOOKUP_EXCL;    }     mutex_lock(&dir->d_inode->i_mutex);    /*从哈希表中查找nd对应的dentry*/    path.dentry = lookup_hash(&nd);    path.mnt = nd.path.mnt;do_last:    error = PTR_ERR(path.dentry);    if (IS_ERR(path.dentry))     {        mutex_unlock(&dir->d_inode->i_mutex);        goto exit;    }    if (IS_ERR(nd.intent.open.file))     {        error = PTR_ERR(nd.intent.open.file);        goto exit_mutex_unlock;    }    /* Negative dentry, just create the file */    /*如果此dentry结构没有对应的inode节点,说明是无效的,应该创建文件节点 */    if (!path.dentry->d_inode)     {        /*         * This write is needed to ensure that a         * ro->rw transition does not occur between         * 
the time when the file is created and when         * a permanent write count is taken through         * the ‘struct file‘ in nameidata_to_filp().        */        /*write权限是必需的*/        error = mnt_want_write(nd.path.mnt);        if (error)        {            goto exit_mutex_unlock;        }         /*按照namei格式的flag open*/        error = __open_namei_create(&nd, &path, flag, mode);        if (error)         {            mnt_drop_write(nd.path.mnt);            goto exit;        }        /*根据nameidata 得到相应的file结构*/        filp = nameidata_to_filp(&nd, open_flag);        if (IS_ERR(filp))        {            ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));        }         /*放弃写权限*/        mnt_drop_write(nd.path.mnt);        if (nd.root.mnt)        {            /*计数减一*/            path_put(&nd.root);        }         return filp;    }    /*     * It already exists.     */    /*要打开的文件已经存在*/    mutex_unlock(&dir->d_inode->i_mutex);    /*保存inode节点*/    audit_inode(pathname, path.dentry);    error = -EEXIST;    /*flag标志检查代码*/    if (flag & O_EXCL)    {        goto exit_dput;    }     if (__follow_mount(&path))    {        error = -ELOOP;        if (flag & O_NOFOLLOW)        {            goto exit_dput;        }     }    error = -ENOENT;    if (!path.dentry->d_inode)    {        goto exit_dput;    }     if (path.dentry->d_inode->i_op->follow_link)    {        goto do_link;    }     /*路径装化为相应的nameidata结构*/    path_to_nameidata(&path, &nd);    error = -EISDIR;    /*如果是文件夹*/    if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))    {        goto exit;    } ok:    /*     * Consider:     * 1. may_open() truncates a file     * 2. a rw->ro mount transition occurs     * 3. nameidata_to_filp() fails due to     *    the ro mount.     * That would be inconsistent, and should     * be avoided. Taking this mnt write here     * ensures that (2) can not occur.     
*/    /*检测是否截断文件标志*/    will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);    if (will_write)     {        /*要截断的话就要获取写权限*/        error = mnt_want_write(nd.path.mnt);        if (error)        {            goto exit;        }     }    //may_open执行权限检测、文件打开和truncate的操作    error = may_open(&nd.path, acc_mode, flag);    if (error)     {        if (will_write)        {            mnt_drop_write(nd.path.mnt);        }         goto exit;    }    filp = nameidata_to_filp(&nd, open_flag);    if (IS_ERR(filp))    {        ima_counts_put(&nd.path, acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));    }            /*     * It is now safe to drop the mnt write     * because the filp has had a write taken     * on its behalf.     */    //安全的放弃写权限    if (will_write)    {        mnt_drop_write(nd.path.mnt);    }     if (nd.root.mnt)    {        path_put(&nd.root);    }     return filp;exit_mutex_unlock:    mutex_unlock(&dir->d_inode->i_mutex);exit_dput:    path_put_conditional(&path, &nd);exit:    if (!IS_ERR(nd.intent.open.file))    {        release_open_intent(&nd);    }        exit_parent:    if (nd.root.mnt)    {        path_put(&nd.root);    }     path_put(&nd.path);    return ERR_PTR(error);do_link://允许遍历连接文件,则手工找到连接文件对应的文件    error = -ELOOP;    if (flag & O_NOFOLLOW)    {        //不允许遍历连接文件,返回错误        goto exit_dput;    }     /*     * This is subtle. Instead of calling do_follow_link() we do the     * thing by hands. The reason is that this way we have zero link_count     * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.     * After that we have the parent and last component, i.e.     * we are in the same situation as after the first path_walk().     * Well, almost - if the last component is normal we get its copy     * stored in nd->last.name and we will have to putname() it when we     * are done. Procfs-like symlinks just set LAST_BIND.     
*/    /* 以下是手工找到链接文件对应的文件dentry结构代码 */    //设置查找LOOKUP_PARENT标志    nd.flags |= LOOKUP_PARENT;    //判断操作是否安全    error = security_inode_follow_link(path.dentry, &nd);    if (error)    {        goto exit_dput;    }     //处理符号链接    error = __do_follow_link(&path, &nd);    if (error)     {        /* Does someone understand code flow here? Or it is only         * me so stupid? Anathema to whoever designed this non-sense         * with "intent.open".         */        release_open_intent(&nd);        if (nd.root.mnt)        {            path_put(&nd.root);        }         return ERR_PTR(error);    }    nd.flags &= ~LOOKUP_PARENT;    //检查最后一段文件或目录名的属性情况    if (nd.last_type == LAST_BIND)    {        goto ok;    }     error = -EISDIR;    if (nd.last_type != LAST_NORM)    {        goto exit;    }     if (nd.last.name[nd.last.len])     {        __putname(nd.last.name);        goto exit;    }    error = -ELOOP;    //出现回环标志: 循环超过32次    if (count++==32)     {        __putname(nd.last.name);        goto exit;    }    dir = nd.path.dentry;    mutex_lock(&dir->d_inode->i_mutex);    //更新路径的挂接点和dentry    path.dentry = lookup_hash(&nd);    path.mnt = nd.path.mnt;    __putname(nd.last.name);    goto do_last;}

总结一下流程

1. open系统调用的入口由SYSCALL_DEFINE3(open, ...)定义,即sys_open()
2. 在open系统调用中,调用do_sys_open函数完成主要功能
3. 在do_sys_open函数中,调用函数do_filp_open完成主要的打开功能
4. 在内核中要打开一个文件,首先应该找到这个文件,而查找文件的过程在vfs里面是由do_path_lookup或者path_lookup_open函数来完成的
    4.1 设置nd->root=根路径(绝对路径)或者当前工作目录(相对路径)
    4.2 这一步做完了后,内核会建立一些数据结构(dentry,inode)来初始化查找的起点
        if(!retval){ retval = path_walk(name,nd);}
    4.3 path_walk会遍历路径的每一节点分量,也就是用"/"分隔开的每一部分,最终找到name指向的文件
        int path_walk(const char *name,struct nameidata *nd)
        {
            return link_path_walk(name,nd);
            //path_walk其实相当于直接调用link_path_walk来完成工作
        }
    4.4 link_path_walk的主要工作是由其内部函数__link_path_walk来完成的
        result = __link_path_walk(name,nd)
    4.5 __link_path_walk,该函数把传进来的字符串name,也就是用户指定的路径,按路径分隔符分解成一系列小的component。比如用户说,我要找"/path/to/dest"这个文件,那么我们的文件系统就会按path、to、dest一个一个来找,直到最后一个分量是文件或者查找完成。它找的时候,会先用path_init初始化过的根路径去找第一个分量,也就是path。然后用path的dentry->d_inode去找to,这样循环到最后一个。注意,内核会缓存找到的路径分量,所以往往只有第一次访问一个路径的时候,才会去访问磁盘,后面的访问会直接从缓存里找,下面会看到,很多与页高速缓存打交道的代码。但不管怎样,第一遍查找总是会访问磁盘的
        static int __link_path_walk(const char *name,struct nameidata *nd){..}
至此,按照每一个component查找完成之后,就会找到相应的文件,然后相应的打开工作就基本完成了

Relevant Link:

http://oss.org.cn/kernel-book/
http://blog.csdn.net/f413933206/article/details/5701913

 

2. close() syscall

close()系统调用对应内核中的函数为: sys_close()

\linux-2.6.32.63\fs\open.c

/*
 * sys_close - kernel entry point of the close() system call.
 * @fd: file descriptor to close
 *
 * Returns -EBADF when fd is out of range or has no file attached.
 * Otherwise returns the result of filp_close(), with the restart codes
 * mapped to -EINTR.
 *
 * Careful here! We test whether the file pointer is NULL before
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
    struct file * filp;
    struct files_struct *files = current->files;
    struct fdtable *fdt;
    int retval;

    spin_lock(&files->file_lock);
    /*
     * Fetch the pointer to the struct fdtable.
     * \linux-2.6.32.63\include\linux\fdtable.h
     * #define files_fdtable(files) (rcu_dereference((files)->fdt))
     */
    fdt = files_fdtable(files);
    if (fd >= fdt->max_fds)
    {
        goto out_unlock;
    }

    /* Fetch the struct file pointer stored in slot fd (not an fd number). */
    filp = fdt->fd[fd];
    if (!filp)
    {
        goto out_unlock;
    }

    /* Clear the slot so the descriptor no longer refers to the file. */
    rcu_assign_pointer(fdt->fd[fd], NULL);
    FD_CLR(fd, fdt->close_on_exec);
    /*
     * __put_unused_fd() recycles the descriptor so that a later open can
     * hand out the same number again:
     *
     * static void __put_unused_fd(struct files_struct *files, unsigned int fd)
     * {
     *     struct fdtable *fdt = files_fdtable(files);
     *     __FD_CLR(fd, fdt->open_fds);
     *     if (fd < files->next_fd)
     *     {
     *         files->next_fd = fd;
     *     }
     * }
     */
    __put_unused_fd(files, fd);
    spin_unlock(&files->file_lock);
    retval = filp_close(filp, files);

    /* can't restart close syscall because file table entry was cleared */
    if (unlikely(retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK))
    {
        retval = -EINTR;
    }

    return retval;

out_unlock:
    spin_unlock(&files->file_lock);
    return -EBADF;
}
EXPORT_SYMBOL(sys_close);

对于sys_close(),我们需要重点跟进2个函数: rcu_assign_pointer(fdt->fd[fd], NULL); 和 retval = filp_close(filp, files);

\linux-2.6.32.63\include\linux\rcupdate.h

/**
 * rcu_assign_pointer - assign (publicize) a pointer to a newly
 * initialized structure that will be dereferenced by RCU read-side
 * critical sections.  Returns the value assigned.
 *
 * Inserts memory barriers on architectures that require them
 * (pretty much all of them other than x86), and also prevents
 * the compiler from reordering the code that initializes the
 * structure after the pointer assignment.  More importantly, this
 * call documents which pointers will be dereferenced by RCU read-side
 * code.
 *
 * The smp_wmb() is skipped only when @v is a compile-time constant
 * equal to NULL.
 */
#define rcu_assign_pointer(p, v) \
    ({ \
        if (!__builtin_constant_p(v) || \
            ((v) != NULL)) \
            smp_wmb(); \
        (p) = (v); \
    })

我们知道,每个进程在kernel中都有一个task_struct与之对应,而通过task_struct可以间接地获得一个fd_array[]数组,表示当前进程已经打开的文件: 数组的下标就是文件描述符,每一个元素是指向对应struct file的指针,只有通过这个fd_array[x]才能获取当前进程打开的文件的struct file*。而rcu_assign_pointer(fdt->fd[fd], NULL)的作用就在于将这个数组的指定元素置空,即断开了这个引用的关系。至于之后内核中的那个struct file对象是否释放,那是内存回收的事,至少现在进程想通过task_struct是无法再引用到之前打开过的文件了,这里面的关系图可以参阅:

http://www.cnblogs.com/LittleHann/p/3865490.html
//搜索: 用一张图表示task_struct、fs_struct、files_struct、fdtable、file的关系

我们继续分析retval = filp_close(filp, files);

\linux-2.6.32.63\fs\open.c

/*
 * filp_close - close one reference to an open file.
 * @filp: the open file
 * @id:   owner handed to ->flush(), dnotify_flush() and
 *        locks_remove_posix(). "id" is the POSIX thread ID; the files
 *        pointer is used for this.
 *
 * Returns 0 when no ->flush() method exists or when the file count is
 * already zero (after logging an error); otherwise the value returned by
 * ->flush().
 */
int filp_close(struct file *filp, fl_owner_t id)
{
    int flush_status;

    /* Nothing to close if nobody holds the file any more. */
    if (file_count(filp) == 0) {
        printk(KERN_ERR "VFS: Close: file count is 0\n");
        return 0;
    }

    /* Give the file a chance to flush, if it provides the hook. */
    if (filp->f_op && filp->f_op->flush)
        flush_status = filp->f_op->flush(filp, id);
    else
        flush_status = 0;

    dnotify_flush(filp, id);
    locks_remove_posix(filp, id);
    /* Drop this reference to the struct file. */
    fput(filp);

    return flush_status;
}

filp_close()依次调用f_op->flush(如果存在)、dnotify_flush()、locks_remove_posix(),最后通过fput()放弃对struct file的引用;只有当该struct file的引用计数降为0时,它占用的内存才会被真正释放。至此,当前进程就不再持有之前打开的这个文件的任何引用了

Relevant Link:

http://blog.csdn.net/ce123_zhouwei/article/details/8459794

 

Copyright (c) 2014 LittleHann All rights reserved

 

Linux Kernel File IO Syscall Kernel-Source-Code Analysis(undone)