首页 > 代码库 > Socket内核调用函数SYSCALL_DEFINE3

Socket内核调用函数SYSCALL_DEFINE3

http://blog.chinaunix.net/uid-20788636-id-4408261.html

前言:

       本系列关于Linux内核Socket的文章均基于Linux-3.14.5版本的内核进行分析;文中的注释和问题说明也参考了网络上的经典分析文章,在此对这些作者的贡献表示感谢!
     转载请标明:http://blog.chinaunix.net/uid-20788636-id-4408261.html

Socket的创建是在用户空间调用socket系统函数完成的,创建一个Socket返回一个文件描述符fd,内核的系统调用接口为SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol),在net/socket.c文件中,下面我们看一下内核中的源码实现。

/*
 * socket(2) system call entry point.
 *
 * Separates the modifier bits (SOCK_CLOEXEC / SOCK_NONBLOCK) from the socket
 * type carried in @type, rejects any other extra bits with -EINVAL, creates
 * the socket with sock_create() and then hands it to sock_map_fd().
 *
 * Returns the result of sock_map_fd() on success or a negative errno. If
 * sock_map_fd() fails, the freshly created socket is released again.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
         struct socket *sock;
         int flags = type & ~SOCK_TYPE_MASK;
         int ret;

         /* Compile-time consistency checks on the SOCK_* constants. */
         BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
         BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
         BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
         BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

         /* Only the two known modifier bits may accompany the type. */
         if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
                   return -EINVAL;
         type &= SOCK_TYPE_MASK;

         /* Where the two constants differ, translate SOCK_NONBLOCK to O_NONBLOCK. */
         if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
                   flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

         /* Create the socket itself (sock_create is analysed below). */
         ret = sock_create(family, type, protocol, &sock);
         if (ret < 0)
                   return ret;

         ret = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
         if (ret < 0)
                   sock_release(sock);

         /* It may be already another descriptor 8) Not kernel problem. */
         return ret;
}

1.1  sock_create函数

sock_create(family, type, protocol, &sock)是一个封装函数,它实际调用的是:

__sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);

对于__sock_create函数的定义如下:

int __sock_create(struct net *net, int family, int type, int protocol,

                             struct socket **res, int kern)

{

         int err;

         struct socket *sock;

         const struct net_proto_family *pf;

         /*

          *      Check protocol is in range 检查协议的范围,现在内核定义的最大范围为41,这里的family指的是AF_INET6,AF_INET协议簇

       #define NPROTO                  AF_MAX  

#define AF_MAX           41     /* For now.. */

          */

         if (family < 0 || family >= NPROTO)

                   return -EAFNOSUPPORT;

         if (type < 0 || type >= SOCK_MAX)//这里的type是socket的类型例如SOCK_STREAM

                   return -EINVAL;

         /* Compatibility.

            This uglymoron is moved from INET layer to here to avoid

            deadlock in module load.

          */

         if (family == PF_INET && type == SOCK_PACKET) {//如果是该类型的socket,对family进行重新的赋值

                   static int warned;//这里自动初始化为0,

                   if (!warned) {

                            warned = 1;

                            printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",

                                   current->comm);

                   }

                   family = PF_PACKET;//赋值为PF_PACKET

         }

 

         err = security_socket_create(family, type, protocol, kern);

         if (err)

                   return err;

 

         /*

          *     Allocate the socket and allow the family to set things up. if

          *     the protocol is 0, the family is instructed to select an appropriate

          *     default.这里调用sock_alloc分配sock,见下面的分析

          */

         sock = sock_alloc();

         if (!sock) {

                   net_warn_ratelimited("socket: no more sockets\n");

                   return -ENFILE;         /* Not exactly a match, but its the

                                        closest posix thing */

         }

 

         sock->type = type;

 

#ifdef CONFIG_MODULES

         /* Attempt to load a protocol module if the find failed.

          *

          * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user

          * requested real, full-featured networking support upon configuration.

          * Otherwise module support will break!

          */

         if (rcu_access_pointer(net_families[family]) == NULL)

                   request_module("net-pf-%d", family);

#endif

 

         rcu_read_lock();

         pf = rcu_dereference(net_families[family]);

         err = -EAFNOSUPPORT;

         if (!pf)

                   goto out_release;

 

         /*

          * We will call the ->create function, that possibly is in a loadable

          * module, so we have to bump that loadable module refcnt first.

          */

         if (!try_module_get(pf->owner))

                   goto out_release;

 

         /* Now protected by module ref count */

         rcu_read_unlock();

/*static const struct net_proto_family inet_family_ops = {

         .family = PF_INET,

         .create = inet_create,

         .owner     = THIS_MODULE,

};这里根据注册的family类型,调用不同的create函数,这里就是调用inet_ctreate*/

         err = pf->create(net, sock, protocol, kern);

         if (err < 0)

                   goto out_module_put;

 

         /*

          * Now to bump the refcnt of the [loadable] module that owns this

          * socket at sock_release time we decrement its refcnt.

          */

         if (!try_module_get(sock->ops->owner))

                   goto out_module_busy;

 

         /*

          * Now that we‘re done with the ->create function, the [loadable]

          * module can have its refcnt decremented

          */

         module_put(pf->owner);

         err = security_socket_post_create(sock, family, type, protocol, kern);

         if (err)

                   goto out_sock_release;

         *res = sock;

 

         return 0;

out_module_busy:

         err = -EAFNOSUPPORT;

out_module_put:

         sock->ops = NULL;

         module_put(pf->owner);

out_sock_release:

         sock_release(sock);

         return err;

 

out_release:

         rcu_read_unlock();

         goto out_sock_release;

}

1.1.1   sock_alloc函数

sock_alloc函数用于分配一个socket结构体,这里涉及inode结构,以及分配完成后返回的地址指针。

static struct socket *sock_alloc(void)

{

         struct inode *inode;

         struct socket *sock;

   /*下面的new_inode_pseudo函数是分配一个新的inode结构体,但在实际分配过程中,分配了一个socket_alloc结构体,返回d的是inode地址,struct socket_alloc {

         struct socket socket;

         struct inode vfs_inode;

};

*/

         inode = new_inode_pseudo(sock_mnt->mnt_sb);//sock_mnt哪里进行初始的,请看下面的分析-----(1)

         if (!inode)

                   return NULL;

         sock = SOCKET_I(inode);//该宏根据返回的inode获取到分配的socket_alloc指针

         kmemcheck_annotate_bitfield(sock, type);

   /*下面是对inode变量进行初始化操作,*/

         inode->i_ino = get_next_ino();

         inode->i_mode = S_IFSOCK | S_IRWXUGO;

         inode->i_uid = current_fsuid();//用户ID,在后面调用bind系统调用时会进行对比

         inode->i_gid = current_fsgid();//组ID

         inode->i_op = &sockfs_inode_ops;

 

         this_cpu_add(sockets_in_use, 1);

         return sock;

}

(1)对于sock_mnt->mnt_sb的赋值和分配过程如下:

在sock_init函数中对socket类型的文件系统进行注册

/*
 * File system type descriptor for "sockfs". sock_init() registers it with
 * register_filesystem() and mounts it with kern_mount().
 */
static struct file_system_type sock_fs_type = {
         .name =             "sockfs",
         .mount =  sockfs_mount,
         .kill_sb =  kill_anon_super,
};

/*
 * sock_init - one-time initialisation of the socket layer
 *
 * Sets up the network sysctl infrastructure, the skbuff cache and the socket
 * inode cache, then registers and mounts the sockfs file system; the mount
 * is stored in sock_mnt.
 *
 * Returns 0 on success or the error from the step that failed.
 */
static int __init sock_init(void)
{
         int err;
         /*
          *      Initialize the network sysctl infrastructure.
          */
         err = net_sysctl_init();
         if (err)
                   goto out;

         /*
          *      Initialize skbuff SLAB cache
          */
         skb_init();

         /*
          *      Initialize the protocols module.
          */

         /* NOTE(review): the return value of init_inodecache() is not checked here. */
         init_inodecache();
         /* Register the sockfs file system type. */
         err = register_filesystem(&sock_fs_type);
         if (err)
                   goto out_fs;
         /* Mount sockfs; sock_alloc() later takes its superblock from sock_mnt. */
         sock_mnt = kern_mount(&sock_fs_type);
         if (IS_ERR(sock_mnt)) {
                   err = PTR_ERR(sock_mnt);
                   goto out_mount;
         }

         /* The real protocol initialization is performed in later initcalls.
          */

#ifdef CONFIG_NETFILTER
         err = netfilter_init();
         /*
          * NOTE(review): this failure path jumps straight to "out", so the
          * mount and file system registration above are not undone - confirm
          * that this is intended.
          */
         if (err)
                   goto out;
#endif

#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
         skb_timestamping_init();
#endif

out:
         return err;

out_mount:
         unregister_filesystem(&sock_fs_type);
out_fs:
         goto out;
}

 

 

(2)new_inode_pseudo函数创建一个inode,并初始化inode的i_state变量和inode->i_sb_list链表,实际的分配函数为alloc_inode函数

struct inode *new_inode_pseudo(struct super_block *sb)

{

         struct inode *inode = alloc_inode(sb);

 

         if (inode) {

                   spin_lock(&inode->i_lock);

                   inode->i_state = 0;

                   spin_unlock(&inode->i_lock);

                   INIT_LIST_HEAD(&inode->i_sb_list);

         }

         return inode;

}

         alloc_inode分配一个inode节点,

static struct inode *alloc_inode(struct super_block *sb)

{

         struct inode *inode;

 

         if (sb->s_op->alloc_inode)

/*如果当前文件系统的超级块,有自己的分配inode的函数,则调用它自己的分配函数,否则从公用的高速缓存中分配一个inode.对于sokcet来说,在socket.c文件中,调用的函数为sock_alloc_inode

static const struct super_operations sockfs_ops = {

         .alloc_inode     = sock_alloc_inode,

         .destroy_inode         = sock_destroy_inode,

         .statfs                = simple_statfs,

};

*/

                   inode = sb->s_op->alloc_inode(sb);

         else

                   inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);

 

         if (!inode)

                   return NULL;

/*对inode结构进行初始化*/

         if (unlikely(inode_init_always(sb, inode))) {

                   if (inode->i_sb->s_op->destroy_inode)

                            inode->i_sb->s_op->destroy_inode(inode);

                   else

                            kmem_cache_free(inode_cachep, inode);

                   return NULL;

         }

 

         return inode;

}

         (3) 下面是sock_alloc_inode函数,在socket.c文件中

/*
 * sock_alloc_inode - sockfs ->alloc_inode hook
 *
 * Allocates a struct socket_alloc (socket plus embedded inode) from
 * sock_inode_cachep and a struct socket_wq with kmalloc(), attaches the wait
 * queue to the socket and resets the socket fields.
 *
 * Returns a pointer to the embedded vfs_inode, or NULL if either allocation
 * fails. @sb is not used in this function.
 */
static struct inode *sock_alloc_inode(struct super_block *sb)
{
         struct socket_alloc *ei;
         struct socket_wq *wq;
         /* Allocate the struct socket_alloc; how the cache is sized is explained after this function. */
         ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
         if (!ei)
                   return NULL;
         wq = kmalloc(sizeof(*wq), GFP_KERNEL);
         if (!wq) {
                   /* Undo the first allocation before bailing out. */
                   kmem_cache_free(sock_inode_cachep, ei);
                   return NULL;
         }
         init_waitqueue_head(&wq->wait);
         wq->fasync_list = NULL;
         RCU_INIT_POINTER(ei->socket.wq, wq);

         /* Start from an unconnected socket with no ops, sock or file attached. */
         ei->socket.state = SS_UNCONNECTED;
         ei->socket.flags = 0;
         ei->socket.ops = NULL;
         ei->socket.sk = NULL;
         ei->socket.file = NULL;

         return &ei->vfs_inode; /* the caller receives the embedded struct inode vfs_inode */
}

  备注说明:在分配函数sock_alloc_inode中调用了ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);这里分配的大小为socket_alloc大小,下面分析一下是如何分配该大小的?

init_inodecache函数(net/socket.c)创建了用于该分配的高速缓存,缓存对象的大小为socket_alloc结构体的大小;但sock_alloc_inode返回的是socket_alloc结构体中的struct inode vfs_inode;变量。init_inodecache函数在sock_init函数中被调用

static int init_inodecache(void)

{

         sock_inode_cachep = kmem_cache_create("sock_inode_cache",

                                                     sizeof(struct socket_alloc),

                                                     0,

                                                     (SLAB_HWCACHE_ALIGN |

                                                      SLAB_RECLAIM_ACCOUNT |

                                                      SLAB_MEM_SPREAD),

                                                     init_once);

         if (sock_inode_cachep == NULL)

                   return -ENOMEM;

         return 0;

}

1.1.2   inet_create函数

         在__sock_create函数中调用的pf->create,这里的函数指针为inet_create,定义在文件net/ipv4/af_inet.c中

static int inet_create(struct net *net, struct socket *sock, int protocol,

                          int kern)

{

         struct sock *sk;

         struct inet_protosw *answer;

         struct inet_sock *inet;

         struct proto *answer_prot;

         unsigned char answer_flags;

         char answer_no_check;

         int try_loading_module = 0;

         int err;

 

         sock->state = SS_UNCONNECTED;

 

         /* Look for the requested type/protocol pair. */

lookup_protocol:

         err = -ESOCKTNOSUPPORT;

         rcu_read_lock();

/*  从inetsw中根据类型、协议查找相应的socket interface也就是 inet_protosw */

         list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

 

                   err = 0;

                   /* Check the non-wild match. */

                   if (protocol == answer->protocol) {

                            if (protocol != IPPROTO_IP)

                                     break;

                   } else {

                            /* Check for the two wild cases. */

                            if (IPPROTO_IP == protocol) {

                                     protocol = answer->protocol;

                                     break;

                            }

                            if (IPPROTO_IP == answer->protocol)

                                     break;

                   }

                   err = -EPROTONOSUPPORT;

         }

/*如果没有找到,尝试加载模块*/

         if (unlikely(err)) {

                   if (try_loading_module < 2) {

                            rcu_read_unlock();

                            /*

                             * Be more specific, e.g. net-pf-2-proto-132-type-1

                             * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)

                             */

                            if (++try_loading_module == 1)

                                     request_module("net-pf-%d-proto-%d-type-%d",

                                                      PF_INET, protocol, sock->type);

                            /*

                             * Fall back to generic, e.g. net-pf-2-proto-132

                             * (net-pf-PF_INET-proto-IPPROTO_SCTP)

                             */

                            else

                                     request_module("net-pf-%d-proto-%d",

                                                      PF_INET, protocol);

                            goto lookup_protocol;

                   } else

                            goto out_rcu_unlock;

         }

 

         err = -EPERM;

         if (sock->type == SOCK_RAW && !kern &&

             !ns_capable(net->user_ns, CAP_NET_RAW))

                   goto out_rcu_unlock;

 

         sock->ops = answer->ops;

         answer_prot = answer->prot;

         answer_no_check = answer->no_check;

         answer_flags = answer->flags;

         rcu_read_unlock();

 

         WARN_ON(answer_prot->slab == NULL);

/* sk_alloc表面上是生成一个sock的结构体,但是实际上对于tcp来说是一个tcp_sock的大小的结构体,这样就可以使用inet_sk(sk);进行强制的类型转换,具体是怎么分配的是tcp_sock大小的,在后续进行分析*/

         err = -ENOBUFS;

         sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);

         if (sk == NULL)

                   goto out;

 

         err = 0;

         sk->sk_no_check = answer_no_check;

         if (INET_PROTOSW_REUSE & answer_flags)

                   sk->sk_reuse = SK_CAN_REUSE;

 

         inet = inet_sk(sk);

         inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

 

         inet->nodefrag = 0;

 

         if (SOCK_RAW == sock->type) {

                   inet->inet_num = protocol;

                   if (IPPROTO_RAW == protocol)

                            inet->hdrincl = 1;

         }

 

         if (net->ipv4.sysctl_ip_no_pmtu_disc)

                   inet->pmtudisc = IP_PMTUDISC_DONT;

         else

                   inet->pmtudisc = IP_PMTUDISC_WANT;

 

         inet->inet_id = 0;

    /*对sk结构体中的变量进行初始化操作,*/

         sock_init_data(sock, sk);------------------(1)

 

         sk->sk_destruct          = inet_sock_destruct;

         sk->sk_protocol           = protocol;

         sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

 

         inet->uc_ttl      = -1;

         inet->mc_loop = 1;

         inet->mc_ttl     = 1;

         inet->mc_all     = 1;

         inet->mc_index        = 0;

         inet->mc_list   = NULL;

         inet->rcv_tos   = 0;

 

         sk_refcnt_debug_inc(sk);

 

         if (inet->inet_num) {

                   /* It assumes that any protocol which allows

                    * the user to assign a number at socket

                    * creation time automatically

                    * shares.

                    */

                   inet->inet_sport = htons(inet->inet_num);

                   /* Add to protocol hash chains. */

                   sk->sk_prot->hash(sk);

         }

 

         if (sk->sk_prot->init) {

                   err = sk->sk_prot->init(sk);//如果是tcp的话,这里就是tcp_v4_init_sock--------(2)

                   if (err)

                            sk_common_release(sk);

         }

out:

         return err;

out_rcu_unlock:

         rcu_read_unlock();

         goto out;

}

 

(1)sock_init_data函数分析

/*
 * sock_init_data - give a freshly allocated struct sock its default state
 * @sock: socket to pair with @sk, or NULL
 * @sk:   sock to initialise
 *
 * Initialises the queues, timer, buffer sizes, callbacks and timeouts of
 * @sk. When @sock is non-NULL the two objects are linked to each other and
 * @sk takes over the socket's type and wait queue; otherwise sk->sk_wq is
 * NULL. The reference count is set to 1 last, after a write barrier.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
         skb_queue_head_init(&sk->sk_receive_queue);
         skb_queue_head_init(&sk->sk_write_queue);
         skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
         skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

         sk->sk_send_head   =       NULL;
         /* Initialise the sock's timer. */
         init_timer(&sk->sk_timer);

         sk->sk_allocation     =       GFP_KERNEL;
         sk->sk_rcvbuf            =       sysctl_rmem_default;
         sk->sk_sndbuf           =       sysctl_wmem_default;
         sk->sk_state             =       TCP_CLOSE; /* initial state; the article notes later system calls test it */
         sk_set_socket(sk, sock); /* i.e. sk->sk_socket = sock: back-pointer from sk to the socket */

         /*
          * Set SOCK_ZAPPED on the new sock.
          * NOTE(review): the article's author describes this as "the socket
          * has been bound to a name" but also says the flag's meaning was
          * unclear to them - verify against the kernel headers.
          */
         sock_set_flag(sk, SOCK_ZAPPED);
         if (sock) {
                   sk->sk_type      =       sock->type;
                   sk->sk_wq        =       sock->wq;
                   sock->sk   =       sk; /* the socket's sk member now points at this sock */
         } else
                   sk->sk_wq        =       NULL;

         spin_lock_init(&sk->sk_dst_lock);
         rwlock_init(&sk->sk_callback_lock);
         lockdep_set_class_and_name(&sk->sk_callback_lock,
                            af_callback_keys + sk->sk_family,
                            af_family_clock_key_strings[sk->sk_family]);

         /* Default callbacks; inet_create() replaces sk_destruct after this call. */
         sk->sk_state_change       =       sock_def_wakeup;
         sk->sk_data_ready  =       sock_def_readable;
         sk->sk_write_space         =       sock_def_write_space;
         sk->sk_error_report         =       sock_def_error_report;
         sk->sk_destruct                 =       sock_def_destruct;

         sk->sk_frag.page     =       NULL;
         sk->sk_frag.offset   =       0;
         sk->sk_peek_off                =       -1;

         sk->sk_peer_pid     =       NULL;
         sk->sk_peer_cred    =       NULL;
         sk->sk_write_pending     =       0;
         sk->sk_rcvlowat                =       1;
         sk->sk_rcvtimeo                =       MAX_SCHEDULE_TIMEOUT;
         sk->sk_sndtimeo               =       MAX_SCHEDULE_TIMEOUT;

         sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
         sk->sk_napi_id                   =       0;
         sk->sk_ll_usec          =       sysctl_net_busy_read;
#endif

         sk->sk_max_pacing_rate = ~0U;
         sk->sk_pacing_rate = ~0U;
         /*
          * Before updating sk_refcnt, we must commit prior changes to memory
          * (Documentation/RCU/rculist_nulls.txt for details)
          */
         smp_wmb();
         atomic_set(&sk->sk_refcnt, 1); /* reference count is set to 1 (not incremented) */
         atomic_set(&sk->sk_drops, 0);
}

(2)static int tcp_v4_init_sock(struct sock *sk)
{
         /*
          * Protocol ->init() hook that inet_create() calls for TCP sockets.
          * Runs the common TCP initialisation and installs the IPv4-specific
          * operation tables. Always returns 0.
          */
         struct inet_connection_sock *icsk = inet_csk(sk); /* per the article, sk was allocated large enough for this conversion to be a plain cast */
         tcp_init_sock(sk); /* initialise the TCP-specific state */
         icsk->icsk_af_ops = &ipv4_specific;
#ifdef CONFIG_TCP_MD5SIG
         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

         return 0;
}

Socket内核调用函数SYSCALL_DEFINE3