天天看點

linux 伺服器、用戶端socket(3)

本篇部落格主要記錄socket建立時的流程,其它socket接口API詳見相關文章(原文連結已遺失)。

1. 應用層建立socket套接字

int socket(int domain, int type, int protocol);      

參數說明:

domain:協定域,又稱協定族(family)。常用的協定族有AF_INET、AF_INET6、AF_LOCAL(或稱AF_UNIX,Unix域Socket)、AF_ROUTE等。協定族決定了socket的位址類型,在通信中必須采用對應的位址,如AF_INET決定了要用ipv4位址(32位的)與端口号(16位的)的組合、AF_UNIX決定了要用一個絕對路徑名作為位址;

type:指定Socket類型。常用的socket類型有SOCK_STREAM、SOCK_DGRAM、SOCK_RAW、SOCK_PACKET、SOCK_SEQPACKET等。流式Socket(SOCK_STREAM)是一種面向連接的Socket,針對于面向連接的TCP服務應用。資料報式Socket(SOCK_DGRAM)是一種無連接的Socket,對應于無連接的UDP服務應用;

protocol:指定協定。常用協定有IPPROTO_TCP、IPPROTO_UDP、IPPROTO_SCTP、IPPROTO_TIPC等,分别對應TCP傳輸協定、UDP傳輸協定、SCTP傳輸協定、TIPC傳輸協定。

2. socket建立時的套接字堆棧資訊

CPU: 0 PID: 472 Comm: init Not tainted 3.10.32 #216
Backtrace: 
[<c0012df8>] (dump_backtrace+0x0/0x10c) from [<c0012f1c>] (show_stack+0x18/0x1c)
 r7:00000000 r6:00000001 r5:00000002 r4:00000000
[<c0012f04>] (show_stack+0x0/0x1c) from [<c03244bc>] (dump_stack+0x20/0x2c)
[<c032449c>] (dump_stack+0x0/0x2c) from [<c029e3f4>] (SyS_socket+0x28/0xcc)
[<c029e3cc>] (SyS_socket+0x0/0xcc) from [<c000f7c0>] (ret_fast_syscall+0x0/0x2c)
 r8:c000f948 r7:00000119 r6:b6fdc33c r5:00000002 r4:ffffffff      

3. socket核心源碼分析

根據第2.點的堆棧資訊分析其對應的源碼如下:

/*
 * socket(2) entry point: validate the flag bits carried in @type, create the
 * socket, then map it to a file descriptor.
 *
 * Returns the new descriptor on success or a negative error code.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
  struct socket *sock; /* the socket being created */
  int sock_flags;
  int err;
  int fd;

  /* Check the SOCK_* constants for consistency.  */
  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
  BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
  BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

  /*
   * Bits of @type outside SOCK_TYPE_MASK are creation flags; only
   * SOCK_CLOEXEC (close-on-exec) and SOCK_NONBLOCK (non-blocking mode)
   * are accepted.
   * Usage example: http://blog.chinaunix.net/uid-24907956-id-3969651.html
   */
  sock_flags = type & ~SOCK_TYPE_MASK;
  if (sock_flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
    return -EINVAL;
  type &= SOCK_TYPE_MASK;

  /* Where the two constants differ, translate SOCK_NONBLOCK to O_NONBLOCK. */
  if (SOCK_NONBLOCK != O_NONBLOCK && (sock_flags & SOCK_NONBLOCK)) {
    sock_flags &= ~SOCK_NONBLOCK;
    sock_flags |= O_NONBLOCK;
  }

  /* Step 1: create the socket. */
  err = sock_create(family, type, protocol, &sock);
  if (err < 0)
    return err;

  /* Step 2: map the socket to a descriptor; release the socket on failure. */
  fd = sock_map_fd(sock, sock_flags & (O_CLOEXEC | O_NONBLOCK));
  if (fd < 0)
    sock_release(sock);

  return fd;
}

在SYSCALL_DEFINE3這個函數内部主要完成兩個工作:

第一,socket建立 sock_create(...);

第二,socket建立之後通過sock_map_fd映射對應的fd并傳回給應用程式。

接下來,逐個分析!

4. 核心socket建立

retval = sock_create(family, type, protocol, &sock); //建立socket      
/*
 * sock_create - create a socket using the calling task's network namespace.
 *
 * Forwards to __sock_create() with current->nsproxy->net_ns as the namespace
 * and 0 for the kern argument, and returns whatever that call returns.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
  struct net *net = current->nsproxy->net_ns;

  return __sock_create(net, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);

其中current->nsproxy->net_ns是目前程序所屬的網絡命名空間。

/*
 * __sock_create - allocate a struct socket and let the protocol family
 * selected by @family initialise it.
 *
 * @net:      namespace pointer handed through to the family's ->create()
 * @family:   protocol family, used as the index into net_families[]
 * @type:     socket type, stored in sock->type
 * @protocol: protocol number handed through to ->create()
 * @res:      receives the new socket on success
 * @kern:     handed through to the security hooks and to ->create()
 *
 * Returns 0 on success. On failure returns -EAFNOSUPPORT, -EINVAL, -ENFILE,
 * or the error reported by a security hook or by ->create(); the socket
 * allocated here is released again on every failure path after sock_alloc().
 */
int __sock_create(struct net *net, int family, int type, int protocol,
       struct socket **res, int kern)
{
  int err;
  struct socket *sock;
  const struct net_proto_family *pf;

  /*
   *      Check protocol is in range
   */
  if (family < 0 || family >= NPROTO) //range-check the protocol family
    return -EAFNOSUPPORT;
  if (type < 0 || type >= SOCK_MAX) //range-check the socket type
    return -EINVAL;

  /* Compatibility.

     This ugly moron is moved from INET layer to here to avoid
     deadlock in module load.
   */
  if (family == PF_INET && type == SOCK_PACKET) {
    static int warned;
    if (!warned) {
      warned = 1;
      printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
             current->comm);
    }
    family = PF_PACKET;
  }

  err = security_socket_create(family, type, protocol, kern); //security (LSM) hook check
  if (err)
    return err;

  /*
   * Allocate the socket and allow the family to set things up. if
   * the protocol is 0, the family is instructed to select an appropriate
   * default.
   */
  sock = sock_alloc(); //allocate a socket
  if (!sock) {
    net_warn_ratelimited("socket: no more sockets\n");
    return -ENFILE;  /* Not exactly a match, but its the
           closest posix thing */
  }

  sock->type = type; //record the socket type

#ifdef CONFIG_MODULES
  /* Attempt to load a protocol module if the find failed.
   *
   * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
   * requested real, full-featured networking support upon configuration.
   * Otherwise module support will break!
   */
  //Index the global net_families[] array by family; if the slot is empty,
  //request the module "net-pf-<family>". (The original note linked to a
  //write-up on protocol family registration; that link is no longer valid.)
  if (rcu_access_pointer(net_families[family]) == NULL)
    request_module("net-pf-%d", family);
#endif

  rcu_read_lock();
  pf = rcu_dereference(net_families[family]); //fetch the protocol family under RCU
  err = -EAFNOSUPPORT;
  if (!pf)
    goto out_release;

  /*
   * We will call the ->create function, that possibly is in a loadable
   * module, so we have to bump that loadable module refcnt first.
   */
  if (!try_module_get(pf->owner))
    goto out_release;

  /* Now protected by module ref count */
  rcu_read_unlock();

  //call the protocol family's create function to set up the socket
  err = pf->create(net, sock, protocol, kern);
  if (err < 0)
    goto out_module_put;

  /*
   * Now to bump the refcnt of the [loadable] module that owns this
   * socket at sock_release time we decrement its refcnt.
   */
  if (!try_module_get(sock->ops->owner))
    goto out_module_busy;

  /*
   * Now that we're done with the ->create function, the [loadable]
   * module can have its refcnt decremented
   */
  module_put(pf->owner);
  err = security_socket_post_create(sock, family, type, protocol, kern);
  if (err)
    goto out_sock_release;
  *res = sock;

  return 0;

out_module_busy:
  err = -EAFNOSUPPORT;
out_module_put:
  sock->ops = NULL;
  module_put(pf->owner);
out_sock_release:
  sock_release(sock);
  return err;

out_release:
  rcu_read_unlock();
  goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);      

在該函數__sock_create内部,主要完成以下幾個工作,

第一,動态配置設定一個socket

/*
 * sock_alloc - obtain a struct socket together with a new pseudo inode.
 *
 * Allocates an inode on sock_mnt's superblock, fills in its number, mode,
 * owner and inode operations, and bumps the per-CPU sockets_in_use counter.
 *
 * Returns the struct socket that SOCKET_I() yields for the inode, or NULL
 * when no inode could be allocated.
 */
static struct socket *sock_alloc(void)
{
  struct inode *ino = new_inode_pseudo(sock_mnt->mnt_sb);
  struct socket *new_sock;

  if (ino == NULL)
    return NULL;

  /* Derive the socket from its inode. */
  new_sock = SOCKET_I(ino);

  /* kmemcheck annotation for the bitfield named "type" in struct socket. */
  kmemcheck_annotate_bitfield(new_sock, type);

  ino->i_ino = get_next_ino();              /* next inode number */
  ino->i_mode = S_IFSOCK | S_IRWXUGO;       /* socket, rwx for user/group/other */
  ino->i_uid = current_fsuid();             /* owner: current fsuid */
  ino->i_gid = current_fsgid();             /* group: current fsgid */
  ino->i_op = &sockfs_inode_ops;            /* inode operations for sockets */

  this_cpu_add(sockets_in_use, 1);

  return new_sock;
}

其中socket節點操作句柄為

/*
 * Inode operations installed on socket inodes by sock_alloc(); only the
 * extended-attribute get and list handlers are provided.
 */
static const struct inode_operations sockfs_inode_ops = {
  .getxattr = sockfs_getxattr,
  .listxattr = sockfs_listxattr,
};      

第二,根據協定族family,在全局協定族數組net_families[*]中查找比對的

//通過family索引,從全局協定族數組net_families[]中查找有效的;
  if (rcu_access_pointer(net_families[family]) == NULL)
    request_module("net-pf-%d", family);
#endif

  rcu_read_lock();
  pf = rcu_dereference(net_families[family]); //rcu的方式擷取協定族
  err = -EAFNOSUPPORT;
  if (!pf)
    goto out_release;      

關于net_families[*]的建立,詳見另外一篇部落格​

第三,通過比對成功的協定族,調用協定族的create函數建立socket

//調用協定族的函數create socket
  err = pf->create(net, sock, protocol, kern);
  if (err < 0)
    goto out_module_put;      

假設現在的協定族類型為PF_INET,那麼pf->create的函數指針指向inet_create

/*
 * Protocol family descriptor for PF_INET: its ->create hook is
 * inet_create(), owned by this module.
 */
static const struct net_proto_family inet_family_ops = {
  .family = PF_INET,
  .create = inet_create,
  .owner  = THIS_MODULE,
};      
/*
 * inet_create - ->create() handler of the PF_INET family: build the
 * struct sock that backs @sock.
 *
 * @net:      namespace pointer passed to sk_alloc() and used for the
 *            CAP_NET_RAW capability check
 * @sock:     socket being set up; sock->type selects the inetsw[] list
 * @protocol: requested protocol; IPPROTO_IP acts as a wildcard and is
 *            replaced by the protocol of the matching inetsw entry
 * @kern:     when non-zero, the CAP_NET_RAW check for SOCK_RAW is skipped
 *
 * Returns 0 on success, otherwise a negative error code: -EINVAL for an
 * out-of-range protocol, -ESOCKTNOSUPPORT / -EPROTONOSUPPORT when no inetsw
 * entry matches, -EPERM for an unprivileged raw socket, -ENOBUFS when the
 * sock cannot be allocated, or the error from the protocol's ->init().
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
           int kern)
{
  struct sock *sk;
  struct inet_protosw *answer;
  struct inet_sock *inet;
  struct proto *answer_prot;
  unsigned char answer_flags;
  char answer_no_check;
  int try_loading_module = 0;
  int err;

  /*
   * Reject protocol numbers outside [0, IPPROTO_MAX). Without this check an
   * arbitrary user-supplied value is later written to sk->sk_protocol and,
   * for SOCK_RAW, to inet->inet_num. This mirrors the upstream fix
   * "net: add validation for the socket syscall protocol argument"
   * (CVE-2015-8543).
   */
  if (protocol < 0 || protocol >= IPPROTO_MAX)
    return -EINVAL;

  if (unlikely(!inet_ehash_secret))
    if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
      build_ehash_secret();

  sock->state = SS_UNCONNECTED; /* a new socket starts out unconnected */

  /* Look for the requested type/protocol pair. */
lookup_protocol:
  err = -ESOCKTNOSUPPORT;
  rcu_read_lock();
  /* Walk the inetsw[] list registered for this socket type. */
  list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

    err = 0;
    /* Check the non-wild match. */
    if (protocol == answer->protocol) { /* exact protocol match */
      if (protocol != IPPROTO_IP) /* and it is not the wildcard */
        break;
    } else {
      /* Check for the two wild cases. */
      if (IPPROTO_IP == protocol) { /* caller asked for the wildcard */
        protocol = answer->protocol; /* adopt the entry's protocol */
        break;
      }
      if (IPPROTO_IP == answer->protocol) /* the entry itself is a wildcard */
        break;
    }
    err = -EPROTONOSUPPORT;
  }

  if (unlikely(err)) {
    if (try_loading_module < 2) {
      rcu_read_unlock();
      /*
       * Be more specific, e.g. net-pf-2-proto-132-type-1
       * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
       */
      if (++try_loading_module == 1)
        request_module("net-pf-%d-proto-%d-type-%d",
                 PF_INET, protocol, sock->type);
      /*
       * Fall back to generic, e.g. net-pf-2-proto-132
       * (net-pf-PF_INET-proto-IPPROTO_SCTP)
       */
      else
        request_module("net-pf-%d-proto-%d",
                 PF_INET, protocol);
      goto lookup_protocol;
    } else
      goto out_rcu_unlock;
  }

  err = -EPERM;
  if (sock->type == SOCK_RAW && !kern &&
      !ns_capable(net->user_ns, CAP_NET_RAW))
    goto out_rcu_unlock;

  /*
   * Copy what is needed from the matched entry before leaving the RCU
   * read-side section. NOTE(review): the original article states that for a
   * TCP socket ops is inet_stream_ops and prot is tcp_prot; the inetsw
   * registrations are not visible here, confirm.
   */
  sock->ops = answer->ops;             /* protocol-specific socket operations */
  answer_prot = answer->prot;          /* protocol descriptor (struct proto) */
  answer_no_check = answer->no_check;  /* copied into sk->sk_no_check below */
  answer_flags = answer->flags;
  rcu_read_unlock();

  WARN_ON(answer_prot->slab == NULL);

  err = -ENOBUFS;
  /* Allocate the struct sock for this socket. */
  sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
  if (sk == NULL)
    goto out;

  err = 0;
  sk->sk_no_check = answer_no_check;
  if (INET_PROTOSW_REUSE & answer_flags)
    sk->sk_reuse = SK_CAN_REUSE;

  /* View the sock as an inet_sock. */
  inet = inet_sk(sk);
  inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

  inet->nodefrag = 0;

  if (SOCK_RAW == sock->type) {
    inet->inet_num = protocol;
    if (IPPROTO_RAW == protocol)
      inet->hdrincl = 1;
  }

  if (ipv4_config.no_pmtu_disc)
    inet->pmtudisc = IP_PMTUDISC_DONT;
  else
    inet->pmtudisc = IP_PMTUDISC_WANT;

  inet->inet_id = 0;

  /* Generic sock initialisation (queues, buffer sizes, callbacks, ...). */
  sock_init_data(sock, sk);

  sk->sk_destruct     = inet_sock_destruct;
  sk->sk_protocol     = protocol;
  sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

  inet->uc_ttl  = -1;
  inet->mc_loop  = 1;
  inet->mc_ttl  = 1;
  inet->mc_all  = 1;
  inet->mc_index  = 0;
  inet->mc_list  = NULL;
  inet->rcv_tos  = 0;

  sk_refcnt_debug_inc(sk);

  if (inet->inet_num) {
    /* It assumes that any protocol which allows
     * the user to assign a number at socket
     * creation time automatically
     * shares.
     */
    inet->inet_sport = htons(inet->inet_num);
    /* Add to protocol hash chains. */
    sk->sk_prot->hash(sk); /* protocol's ->hash hook */
  }

  /* Run the protocol's own initialisation hook, if it has one. */
  if (sk->sk_prot->init) {
    err = sk->sk_prot->init(sk); /* NOTE(review): tcp_v4_init_sock for TCP per the article; confirm */
    if (err)
      sk_common_release(sk);
  }
out:
  return err;
out_rcu_unlock:
  rcu_read_unlock();
  goto out;
}

在inet_create函數内部主要完成以下:

第一,設定socket的狀态為未連接配接

sock->state = SS_UNCONNECTED; //設定socket的狀态為未連接配接      

第二,協定類型的判定

//根據sock->type協定類型,從inetsw[]連結清單頭中擷取一個網絡層協定
  list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

    err = 0;
    /* Check the non-wild match. */
    if (protocol == answer->protocol) { //協定比對
      if (protocol != IPPROTO_IP) //非虛拟協定
        break;
    } else {
      /* Check for the two wild cases. */
      if (IPPROTO_IP == protocol) { //虛拟協定
        protocol = answer->protocol; //将inetsw中的協定強制指派給protocol
        break;
      }
      if (IPPROTO_IP == answer->protocol) //answer->protocol中的協定為虛拟就直接跳出,因為檢索就沒有意義了
        break;
    }
    err = -EPROTONOSUPPORT;
  }      

第三,動态申請一個sock

//動态申請一個網絡層表示的套接字sock
  sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
  if (sk == NULL)
    goto out;      
/*
 * sk_alloc - allocate a zeroed struct sock and bind it to its protocol
 * family, protocol descriptor and namespace.
 *
 * @net:      namespace to attach; a reference is taken with get_net()
 * @family:   protocol family stored in sk->sk_family
 * @priority: allocation flags (__GFP_ZERO is added here)
 * @prot:     protocol descriptor stored in sk_prot and sk_prot_creator
 *
 * Returns the new sock, or NULL when sk_prot_alloc() fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
          struct proto *prot)
{
  struct sock *sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

  if (!sk)
    return NULL;

  sk->sk_family = family;
  /*
   * See comment in struct sock definition to understand
   * why we need sk_prot_creator -acme
   */
  sk->sk_prot_creator = prot;
  sk->sk_prot = prot;
  sock_lock_init(sk);

  /* Take a reference on the namespace and attach it to the sock. */
  sock_net_set(sk, get_net(net));
  atomic_set(&sk->sk_wmem_alloc, 1);

  sock_update_classid(sk);
  sock_update_netprioidx(sk);

  return sk;
}
EXPORT_SYMBOL(sk_alloc);

在該函數内部,net為網絡命名空間,get_net(net)将該命名空間的引用計數加1,然後在函數sock_net_set(...)内部将sk->sk_net指向net命名空間;這裡的net命名空間相當于一個全局變量。最後傳回sk,這樣就将命名空間與sock關聯起來了。

第四,通過sock擷取網絡層inet_sock

//通過sock擷取inet_sock
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;      

第五,sock參數初始化

//sock參數初始化(包括發送、接收、錯誤隊列,以及記憶體空間)
sock_init_data(sock, sk);      
/*
 * sock_init_data - fill in the generic fields of a freshly allocated sock.
 *
 * @sock: owning socket, may be NULL; when set, sk and sock are linked to
 *        each other and sk inherits sock's type and wait queue
 * @sk:   sock to initialise
 *
 * Initialises the queues, timer, buffer sizes, default callbacks and
 * timeouts, and finally sets the reference count to 1 after a write barrier.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
  skb_queue_head_init(&sk->sk_receive_queue); //receive queue
  skb_queue_head_init(&sk->sk_write_queue); //write queue
  skb_queue_head_init(&sk->sk_error_queue); //error queue
#ifdef CONFIG_NET_DMA
  skb_queue_head_init(&sk->sk_async_wait_queue);
#endif

  sk->sk_send_head  =  NULL;

  init_timer(&sk->sk_timer); //initialise the timer

  sk->sk_allocation  =  GFP_KERNEL;
  sk->sk_rcvbuf    =  sysctl_rmem_default; //default receive buffer size; NOTE(review): the article claims 256*256, value not visible here - confirm
  sk->sk_sndbuf    =  sysctl_wmem_default;
  sk->sk_state    =  TCP_CLOSE;
  sk_set_socket(sk, sock);

  sock_set_flag(sk, SOCK_ZAPPED);

  if (sock) {
    sk->sk_type  =  sock->type;
    sk->sk_wq  =  sock->wq;
    sock->sk  =  sk;
  } else
    sk->sk_wq  =  NULL;

  spin_lock_init(&sk->sk_dst_lock);
  rwlock_init(&sk->sk_callback_lock);
  lockdep_set_class_and_name(&sk->sk_callback_lock,
      af_callback_keys + sk->sk_family,
      af_family_clock_key_strings[sk->sk_family]);

  sk->sk_state_change  =  sock_def_wakeup;
  sk->sk_data_ready  =  sock_def_readable;
  sk->sk_write_space  =  sock_def_write_space;
  sk->sk_error_report  =  sock_def_error_report;
  sk->sk_destruct    =  sock_def_destruct;

  sk->sk_frag.page  =  NULL;
  sk->sk_frag.offset  =  0;
  sk->sk_peek_off    =  -1;

  sk->sk_peer_pid  =  NULL;
  sk->sk_peer_cred  =  NULL;
  sk->sk_write_pending  =  0;
  sk->sk_rcvlowat    =  1;
  sk->sk_rcvtimeo    =  MAX_SCHEDULE_TIMEOUT;
  sk->sk_sndtimeo    =  MAX_SCHEDULE_TIMEOUT;

  sk->sk_stamp = ktime_set(-1L, 0);

  sk->sk_pacing_rate = ~0U;
  /*
   * Before updating sk_refcnt, we must commit prior changes to memory
   * (Documentation/RCU/rculist_nulls.txt for details)
   */
  smp_wmb();
  atomic_set(&sk->sk_refcnt, 1);
  atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);      

第六,sock inet_hash初始化(重要!)

inet->inet_sport = htons(inet->inet_num); 
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk); //調用傳輸層協定 inet_hash      
/*
 * inet_hash - insert @sk into the protocol hash tables.
 *
 * Does nothing for a sock in TCP_CLOSE state; otherwise calls
 * __inet_hash() with bottom halves disabled.
 */
void inet_hash(struct sock *sk)
{
  if (sk->sk_state == TCP_CLOSE)
    return;

  local_bh_disable();
  __inet_hash(sk);
  local_bh_enable();
}
/*
 * __inet_hash - hash @sk according to its state.
 *
 * A sock that is not in TCP_LISTEN state is handed to
 * __inet_hash_nolisten(); a listening sock is linked into the
 * listening_hash bucket chosen by inet_sk_listen_hashfn(), under that
 * bucket's spinlock.
 */
static void __inet_hash(struct sock *sk)
{
  struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; //NOTE(review): the article says this is tcp_hashinfo for TCP - not visible here, confirm
  struct inet_listen_hashbucket *ilb;

  /* The socket is not in the listening state */
  if (sk->sk_state != TCP_LISTEN) {
    __inet_hash_nolisten(sk, NULL); /* NOTE(review): described in the article as the established-connection case - confirm */
    return;
  }

  WARN_ON(!sk_unhashed(sk));
  /* Select the listening hash bucket via inet_sk_listen_hashfn(); the article says it is keyed by the listening port - confirm */
  ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

  spin_lock(&ilb->lock);
  /* Link the sock into the bucket's list (ilb->head) and account it as in use */
  __sk_nulls_add_node_rcu(sk, &ilb->head);
  sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
  spin_unlock(&ilb->lock);
}      

第七,sock 傳輸層協定初始化(重要!)

err = sk->sk_prot->init(sk); //調用傳輸層協定   tcp_v4_init_sock      
static int tcp_v4_init_sock(struct sock *sk)
{
  struct inet_connection_sock *icsk = inet_csk(sk);

  tcp_init_sock(sk);

  icsk->icsk_af_ops = &ipv4_specific; //見下,重要

#ifdef CONFIG_TCP_MD5SIG
  tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

  return 0;
}      
/*
 * IPv4 address-family operations for connection-oriented sockets; installed
 * as icsk->icsk_af_ops by tcp_v4_init_sock(). The per-field notes are
 * translated from the original article.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
  .queue_xmit     = ip_queue_xmit, //ip: network-layer protocol interface
  .send_check     = tcp_v4_send_check, //tcp: IPv4 send check at the transport layer
  .rebuild_header     = inet_sk_rebuild_header, //inet sock: rebuild the header
  .sk_rx_dst_set     = inet_sk_rx_dst_set, //inet socket: set the receive destination
  .conn_request     = tcp_v4_conn_request, //tcp ipv4 connection request
  .syn_recv_sock     = tcp_v4_syn_recv_sock, //tcp ipv4 SYN-received socket
  .net_header_len     = sizeof(struct iphdr),
  .setsockopt     = ip_setsockopt, //ip network layer: set socket options
  .getsockopt     = ip_getsockopt, //ip network layer: get socket options
  .addr2sockaddr     = inet_csk_addr2sockaddr,
  .sockaddr_len     = sizeof(struct sockaddr_in),
  .bind_conflict     = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
  .compat_setsockopt = compat_ip_setsockopt, //compat: compatibility variants
  .compat_getsockopt = compat_ip_getsockopt,
#endif
};
EXPORT_SYMBOL(ipv4_specific);      

顧名思義,上面const struct inet_connection_sock_af_ops ipv4_specific結構體内部成員描述了TCP與IP協定層之間的接口,該結構體的内部成員非常重要,後續會在connect()\bind()\recv()\send()詳解!

第八,最終傳回建立的 sock。

5. sock_map_fd檔案描述符映射

retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); //套接字映射一個描述符      
/*
 * sock_map_fd - attach @sock to a new file descriptor.
 *
 * Reserves an unused descriptor, allocates the struct file for the socket
 * and installs it. If the file cannot be allocated the reserved descriptor
 * is given back.
 *
 * Returns the descriptor on success or a negative error code.
 */
static int sock_map_fd(struct socket *sock, int flags)
{
  struct file *sock_file;
  int fd;

  /* Reserve an unused descriptor number first. */
  fd = get_unused_fd_flags(flags);
  if (unlikely(fd < 0))
    return fd;

  /* Allocate the struct file that represents the socket. */
  sock_file = sock_alloc_file(sock, flags, NULL);
  if (unlikely(IS_ERR(sock_file))) {
    put_unused_fd(fd);
    return PTR_ERR(sock_file);
  }

  /* Publish the file under the reserved descriptor. */
  fd_install(fd, sock_file);
  return fd;
}

在sock_map_fd函數内部主要完成以下幾部分:

第一,擷取一個未被使用的檔案描述符fd

int fd = get_unused_fd_flags(flags); //擷取一個未被使用的描述符      
/*
 * get_unused_fd_flags - reserve a descriptor in the current task's file
 * table, searching from 0 up to the RLIMIT_NOFILE limit.
 *
 * Returns whatever __alloc_fd() returns: the descriptor or a negative error.
 */
int get_unused_fd_flags(unsigned flags)
{
  struct files_struct *files = current->files;
  unsigned limit = rlimit(RLIMIT_NOFILE);

  return __alloc_fd(files, 0, limit, flags);
}
EXPORT_SYMBOL(get_unused_fd_flags);
/*
 * __alloc_fd - reserve a free descriptor number in @files.
 *
 * @files: file table to allocate from (locked via files->file_lock here)
 * @start: lowest descriptor number to consider
 * @end:   exclusive upper bound; reaching it yields -EMFILE
 * @flags: O_CLOEXEC selects whether the close-on-exec mark is set or cleared
 *
 * Returns the reserved descriptor, -EMFILE when the bound is reached, or the
 * negative error returned by expand_files(). The descriptor is only marked
 * open here; no struct file is installed.
 */
int __alloc_fd(struct files_struct *files,
         unsigned start, unsigned end, unsigned flags)
{
  unsigned int fd;
  int error;
  struct fdtable *fdt;

  spin_lock(&files->file_lock);
repeat:
  fdt = files_fdtable(files); //look up the struct fdtable belonging to this files_struct
  fd = start;
  if (fd < files->next_fd)
    fd = files->next_fd;

  if (fd < fdt->max_fds)
    fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); //first clear bit in open_fds at or after fd

  /*
   * N.B. For clone tasks sharing a files structure, this test
   * will limit the total number of files that can be opened.
   */
  error = -EMFILE;
  if (fd >= end)
    goto out;

  error = expand_files(files, fd); //NOTE(review): presumably grows the fd table so it can hold fd - confirm; a positive result triggers the retry below
  if (error < 0)
    goto out;

  /*
   * If we needed to expand the fs array we
   * might have blocked - try again.
   */
  if (error)
    goto repeat;

  if (start <= files->next_fd)
    files->next_fd = fd + 1; //advance the search hint past the fd just taken

  __set_open_fd(fd, fdt); //mark fd as open
  if (flags & O_CLOEXEC) //close-on-exec requested: set the mark, otherwise clear it
    __set_close_on_exec(fd, fdt); 
  else
    __clear_close_on_exec(fd, fdt);
  error = fd;
#if 1
  /* Sanity check */
  if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
    printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
    rcu_assign_pointer(fdt->fd[fd], NULL);
  }
#endif

out:
  spin_unlock(&files->file_lock);
  return error;
}      

在__alloc_fd函數内部主要是動态分配一個描述符fd,但裡面涉及的知識其實挺廣的,關于fd的具體分配,詳見:

​​點選打開連結​​

第二,sock動态配置設定一個file結構體

//sock動态配置設定檔案
newfile = sock_alloc_file(sock, flags, NULL);      
/*
 * sock_alloc_file - allocate the struct file that represents @sock.
 *
 * @sock:  socket to wrap
 * @flags: only O_NONBLOCK is carried over into file->f_flags
 * @dname: name for the pseudo dentry; when NULL the name of the sock's
 *         protocol (sk_prot_creator->name) is used if sock->sk is set,
 *         otherwise the name stays empty
 *
 * Returns the new file, ERR_PTR(-ENOMEM) when the dentry cannot be
 * allocated, or the error pointer returned by alloc_file(). No file
 * descriptor is touched here.
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
  struct qstr name = { .name = "" };
  struct path path;
  struct file *file;

  if (dname) { //sock_map_fd() passes NULL for dname
    name.name = dname;
    name.len = strlen(name.name);
  } else if (sock->sk) {
    name.name = sock->sk->sk_prot_creator->name; //e.g. "TCP" according to the article - TODO confirm
    name.len = strlen(name.name);
  }
  //allocate a pseudo dentry on the socket mount's superblock
  path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name); //pseudo: fake, pretend
  if (unlikely(!path.dentry))
    return ERR_PTR(-ENOMEM);
  path.mnt = mntget(sock_mnt);

  d_instantiate(path.dentry, SOCK_INODE(sock));
  SOCK_INODE(sock)->i_fop = &socket_file_ops; //important: the file operations used for sockets

  //allocate the struct file
  file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
      &socket_file_ops);
  if (unlikely(IS_ERR(file))) {
    /* drop dentry, keep inode */
    ihold(path.dentry->d_inode);
    path_put(&path);
    return file;
  }

  sock->file = file;
  file->f_flags = O_RDWR | (flags & O_NONBLOCK);
  file->private_data = sock; //the socket becomes the file's private data; this link ties file and socket together
  return file;
}
EXPORT_SYMBOL(sock_alloc_file);      

在sock_alloc_file該函數内部會動态配置設定一個檔案,如下

//動态配置設定一個file
file = alloc_file(&path, FMODE_READ | FMODE_WRITE,&socket_file_ops);      

其中有一個重要的file結構體綁定,其對應檔案操作句柄,在編寫驅動的時候,這個結構體是最熟悉不過的了

/*
 * File operations installed on socket files and socket inodes by
 * sock_alloc_file().
 */
static const struct file_operations socket_file_ops = {
  .owner =  THIS_MODULE,
  .llseek =  no_llseek,
  .aio_read =  sock_aio_read, //asynchronous read
  .aio_write =  sock_aio_write,
  .poll =    sock_poll,
  .unlocked_ioctl = sock_ioctl, //ioctl entry point for sockets; see its source for details
#ifdef CONFIG_COMPAT
  .compat_ioctl = compat_sock_ioctl,
#endif
  .mmap =    sock_mmap,
  .open =    sock_no_open,  /* special open code to disallow open via /proc */
  .release =  sock_close,
  .fasync =  sock_fasync,
  .sendpage =  sock_sendpage,
  .splice_write = generic_splice_sendpage,
  .splice_read =  sock_splice_read,
};      

最後在sock_alloc_file内部進行綁定,這個是不是很熟悉,哈哈!

file->private_data = sock; //綁定file私有資料為sock,這也是串聯整個socket套接字的關鍵資料!      

第三,最後根據擷取到的fd和file進行安裝

fd_install(fd, newfile); //fd安裝struct file *newfile      
/*
 * fd_install - install @file under descriptor @fd in the current task's
 * file table, by way of __fd_install().
 */
void fd_install(unsigned int fd, struct file *file)
{
  struct files_struct *files = current->files;

  __fd_install(files, fd, file);
}
/*
 * __fd_install - store a file pointer in the descriptor array of @files.
 *
 * Background: many VFS paths drop the files lock between marking a
 * descriptor in the open_fds bitmap and installing the file. In that window
 * a dup2() could race and install a file in the slot first. That is never
 * supposed to happen, so a non-empty slot is treated as fatal (BUG_ON).
 *
 * This is the low-level variant. @files must be current->files or a
 * reference obtained with get_files_struct(current) by whoever passed it in.
 * Normal callers should use fd_install() instead.
 */

void __fd_install(struct files_struct *files, unsigned int fd,
    struct file *file)
{
  struct fdtable *table;

  spin_lock(&files->file_lock);
  table = files_fdtable(files);
  /* The slot must still be empty when the file is published. */
  BUG_ON(table->fd[fd] != NULL);
  rcu_assign_pointer(table->fd[fd], file);
  spin_unlock(&files->file_lock);
}

這個函數的注解很詳細,這裡不再贅述!

6. 總結

本篇部落格主要針對socket套接字建立進行源碼流程分析,通過協定族、協定類型、協定建立socket套接字,然後通過建立成功的套接字去分配一個檔案描述符fd,這裡涉及到兩個重要的結構體:

第一,目前程序任務結構體 struct task_struct,在套接字建立裡面主要涉及到該結構體成員nsproxy(即current->nsproxy->net_ns);

第二, 命名空間struct nsproxy,個人認為,命名空間相當于一個全局變量,在socket裡面建立,然後在bind,recv,send都能通路該命名空間,是以相當于一個全局變量;

最後傳回檔案句柄fd給應用層。