本篇部落格主要記錄socket建立時的流程,其它socket接口API詳見其他相關文章。
1. 應用層建立socket套接字
int socket(int domain, int type, int protocol);
參數說明:
domain:協定域,又稱協定族(family)。常用的協定族有AF_INET、AF_INET6、AF_LOCAL(或稱AF_UNIX,Unix域Socket)、AF_ROUTE等。協定族決定了socket的位址類型,在通信中必須采用對應的位址,如AF_INET決定了要用ipv4位址(32位的)與端口号(16位的)的組合、AF_UNIX決定了要用一個絕對路徑名作為位址;
type:指定Socket類型。常用的socket類型有SOCK_STREAM、SOCK_DGRAM、SOCK_RAW、SOCK_PACKET、SOCK_SEQPACKET等。流式Socket(SOCK_STREAM)是一種面向連接的Socket,對應于面向連接的TCP服務應用。資料報式Socket(SOCK_DGRAM)是一種無連接的Socket,對應于無連接的UDP服務應用;
protocol:指定協定。常用協定有IPPROTO_TCP、IPPROTO_UDP、IPPROTO_SCTP、IPPROTO_TIPC等,分别對應TCP傳輸協定、UDP傳輸協定、SCTP傳輸協定、TIPC傳輸協定。
2. socket建立時的套接字堆棧資訊
CPU: 0 PID: 472 Comm: init Not tainted 3.10.32 #216
Backtrace:
[<c0012df8>] (dump_backtrace+0x0/0x10c) from [<c0012f1c>] (show_stack+0x18/0x1c)
r7:00000000 r6:00000001 r5:00000002 r4:00000000
[<c0012f04>] (show_stack+0x0/0x1c) from [<c03244bc>] (dump_stack+0x20/0x2c)
[<c032449c>] (dump_stack+0x0/0x2c) from [<c029e3f4>] (SyS_socket+0x28/0xcc)
[<c029e3cc>] (SyS_socket+0x0/0xcc) from [<c000f7c0>] (ret_fast_syscall+0x0/0x2c)
r8:c000f948 r7:00000119 r6:b6fdc33c r5:00000002 r4:ffffffff
3. socket核心源碼分析
根據第2.點的堆棧資訊分析其對應的源碼如下:
/*
 * socket(2) system call entry point.
 *
 * Splits the user-supplied @type into the real socket type (low bits,
 * SOCK_TYPE_MASK) and the descriptor flags carried in the remaining bits
 * (SOCK_CLOEXEC / SOCK_NONBLOCK), creates the socket with sock_create()
 * and binds it to a file descriptor with sock_map_fd().
 *
 * Returns the value of sock_map_fd() on success, -EINVAL if unknown flag
 * bits are set in @type, or the negative error from sock_create() /
 * sock_map_fd().
 *
 * Background on SOCK_CLOEXEC / SOCK_NONBLOCK usage:
 * http://blog.chinaunix.net/uid-24907956-id-3969651.html
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int fd_flags;
	int ret;

	/* Check the SOCK_* constants for consistency (compile time only). */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/* Everything outside the type mask must be a known flag. */
	fd_flags = type & ~SOCK_TYPE_MASK;
	if (fd_flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Where the two constants differ, translate to the O_* value. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (fd_flags & SOCK_NONBLOCK)) {
		fd_flags &= ~SOCK_NONBLOCK;
		fd_flags |= O_NONBLOCK;
	}

	ret = sock_create(family, type, protocol, &sock);
	if (ret < 0)
		return ret;

	/*
	 * Attach a file descriptor.  On failure the freshly created socket
	 * has no descriptor referring to it, so release it here.
	 */
	ret = sock_map_fd(sock, fd_flags & (O_CLOEXEC | O_NONBLOCK));
	if (ret < 0)
		sock_release(sock);

	/* It may be already another descriptor 8) Not kernel problem. */
	return ret;
}
在SYSCALL_DEFINE3這個函數内部主要完成兩個工作:
第一,socket建立 sock_create(...);
第二,socket建立之後通過sock_map_fd映射對應的fd并傳回給應用程式。
接下來,逐個分析!
4. 核心socket建立
retval = sock_create(family, type, protocol, &sock); //建立socket
/*
 * sock_create - create a socket in the calling task's network namespace.
 *
 * Thin wrapper around __sock_create(): the namespace is taken from
 * current->nsproxy->net_ns and the trailing "kern" argument is passed as 0.
 * The return value is whatever __sock_create() returns.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	struct net *net = current->nsproxy->net_ns;

	return __sock_create(net, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);
其中current->nsproxy->net_ns是目前程序所屬的網絡命名空間
/*
 * __sock_create - allocate a struct socket and let the protocol family
 * initialise it.
 *
 * @net:      network namespace, handed to the family's ->create() hook
 * @family:   protocol family; used as index into net_families[]
 * @type:     socket type, stored in sock->type
 * @protocol: protocol number, handed to ->create()
 * @res:      receives the new socket on success (untouched on failure)
 * @kern:     passed through to the security hooks and to ->create()
 *
 * Returns 0 on success.  On failure returns -EAFNOSUPPORT (family out of
 * range, not registered, or owning module unavailable), -EINVAL (type out
 * of range), -ENFILE (sock_alloc() failed), or the error reported by the
 * security hooks or by ->create().
 */
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
* Check protocol is in range
*/
if (family < 0 || family >= NPROTO) /* protocol family range check */
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX) /* socket type range check */
return -EINVAL;
/* Compatibility.
This ugly moron is moved from INET layer to here to avoid
deadlock in module load.
*/
if (family == PF_INET && type == SOCK_PACKET) {
static int warned; /* makes the message below print only once */
if (!warned) {
warned = 1;
printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
}
family = PF_PACKET;
}
err = security_socket_create(family, type, protocol, kern); /* security (LSM) check */
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
sock = sock_alloc(); /* allocate the struct socket */
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type; /* record the socket type */
#ifdef CONFIG_MODULES
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
/* If no family is registered at net_families[family], request the */
/* "net-pf-<family>" module before looking the entry up below. */
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
#endif
rcu_read_lock();
pf = rcu_dereference(net_families[family]); /* fetch the family entry under RCU */
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release;
/* Now protected by module ref count */
rcu_read_unlock();
/* Let the protocol family build its part of the socket. */
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
/*
* Now that we're done with the ->create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock;
return 0;
/* Error unwinding: each label falls through to the ones below it. */
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock->ops = NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return err;
/* Taken while still inside the RCU read-side section. */
out_release:
rcu_read_unlock();
goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);
在該函數__sock_create内部,主要完成以下幾個工作,
第一,動态配置設定一個socket
/*
 * sock_alloc - allocate a socket together with its backing inode.
 *
 * A pseudo inode is created on the sock_mnt superblock and the socket is
 * obtained from that inode through SOCKET_I().  The inode is then set up
 * as a socket node owned by the current fsuid/fsgid.
 *
 * Returns the new socket, or NULL if the inode could not be allocated.
 */
static struct socket *sock_alloc(void)
{
	struct inode *ino;
	struct socket *new_sock;

	ino = new_inode_pseudo(sock_mnt->mnt_sb);
	if (ino == NULL)
		return NULL;

	/* Socket and inode are tied together; look the socket up via the inode. */
	new_sock = SOCKET_I(ino);

	/* kmemcheck annotation for the "type" bitfield of struct socket. */
	kmemcheck_annotate_bitfield(new_sock, type);

	ino->i_ino  = get_next_ino();		/* next inode number */
	ino->i_mode = S_IFSOCK | S_IRWXUGO;	/* socket node, rwx for user/group/other */
	ino->i_uid  = current_fsuid();
	ino->i_gid  = current_fsgid();
	ino->i_op   = &sockfs_inode_ops;	/* inode operations table for sockets */

	/* Per-CPU count of sockets in use. */
	this_cpu_add(sockets_in_use, 1);

	return new_sock;
}
其中socket節點操作句柄為
/*
 * Inode operations installed on every socket inode by sock_alloc().
 * Only the extended-attribute handlers are provided.
 */
static const struct inode_operations sockfs_inode_ops = {
	.listxattr	= sockfs_listxattr,
	.getxattr	= sockfs_getxattr,
};
第二,根據協定族family,在全局協定族數組net_families[*]中查找比對的
//通過family索引,從全局協定族數組net_families[]中查找有效的;
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
#endif
rcu_read_lock();
pf = rcu_dereference(net_families[family]); //rcu的方式擷取協定族
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
關于net_families[*]的建立,詳見另外一篇部落格
第三,通過比對成功的協定族,調用協定族的create函數建立socket
//調用協定族的函數create socket
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
假設現在的協定族類型為PF_INET,那麼pf->create的函數指針指向inet_create
/*
 * Protocol family descriptor for PF_INET.  Its ->create hook is
 * inet_create(), which __sock_create() invokes through pf->create().
 */
static const struct net_proto_family inet_family_ops = {
	.owner	= THIS_MODULE,
	.family	= PF_INET,
	.create	= inet_create,
};
/*
 * inet_create - ->create hook of the PF_INET protocol family.
 *
 * @net:      network namespace the new sock is allocated in
 * @sock:     the struct socket being set up; sock->type selects the
 *            inetsw[] list that is searched
 * @protocol: requested protocol; IPPROTO_IP acts as a wildcard and is
 *            replaced by the protocol of the matching inetsw entry
 * @kern:     non-zero skips the CAP_NET_RAW check for SOCK_RAW sockets
 *
 * Looks up the (type, protocol) pair in inetsw[], trying up to two module
 * loads if nothing matches, then allocates and initialises the struct sock.
 *
 * Returns 0 on success or a negative errno: -ESOCKTNOSUPPORT,
 * -EPROTONOSUPPORT, -EPERM, -ENOBUFS, or the error from the protocol's
 * ->init hook.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
int try_loading_module = 0;
int err;
if (unlikely(!inet_ehash_secret))
if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
build_ehash_secret();
sock->state = SS_UNCONNECTED; /* a new socket starts out unconnected */
/* Look for the requested type/protocol pair. */
lookup_protocol:
err = -ESOCKTNOSUPPORT; /* stays this way if the list for this type is empty */
rcu_read_lock();
/* Walk the inetsw[] list for this socket type looking for a protocol match. */
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) { /* exact protocol match */
if (protocol != IPPROTO_IP) /* ...and not the wildcard value */
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) { /* caller asked for the wildcard */
protocol = answer->protocol; /* adopt this entry's protocol */
break;
}
if (IPPROTO_IP == answer->protocol) /* entry itself is a wildcard: accept it */
break;
}
err = -EPROTONOSUPPORT;
}
if (unlikely(err)) { /* no usable entry was found */
if (try_loading_module < 2) {
rcu_read_unlock();
/*
* Be more specific, e.g. net-pf-2-proto-132-type-1
* (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
*/
if (++try_loading_module == 1)
request_module("net-pf-%d-proto-%d-type-%d",
PF_INET, protocol, sock->type);
/*
* Fall back to generic, e.g. net-pf-2-proto-132
* (net-pf-PF_INET-proto-IPPROTO_SCTP)
*/
else
request_module("net-pf-%d-proto-%d",
PF_INET, protocol);
goto lookup_protocol;
} else
goto out_rcu_unlock;
}
err = -EPERM;
if (sock->type == SOCK_RAW && !kern &&
!ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
/* Copy what is needed out of the entry before leaving the RCU section. */
sock->ops = answer->ops; /* socket-level ops (e.g. inet_stream_ops for TCP, per the article) */
answer_prot = answer->prot; /* struct proto (e.g. tcp_prot for TCP, per the article) */
answer_no_check = answer->no_check; /* copied into sk->sk_no_check below */
answer_flags = answer->flags;
rcu_read_unlock();
WARN_ON(answer_prot->slab == NULL);
err = -ENOBUFS;
/* Allocate the struct sock for this socket. */
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
if (sk == NULL)
goto out;
err = 0;
sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
/* View the sock as an inet_sock. */
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
inet->nodefrag = 0;
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
if (ipv4_config.no_pmtu_disc)
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->inet_id = 0;
/* Generic sock initialisation (queues, callbacks, buffer sizes, ...). */
sock_init_data(sock, sk);
/* Override the destructor that sock_init_data() installed. */
sk->sk_destruct = inet_sock_destruct;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
inet->rcv_tos = 0;
sk_refcnt_debug_inc(sk);
if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk); /* protocol's ->hash hook (inet_hash in the article's example) */
}
/* Run the protocol's own initialisation hook, if it has one. */
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk); /* e.g. tcp_v4_init_sock for TCP, per the article */
if (err)
sk_common_release(sk);
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
在inet_create函數内部主要完成以下:
第一,設定socket的狀态為未連接配接
sock->state = SS_UNCONNECTED; //設定socket的狀态為未連接配接
第二,協定類型的判定
//根據sock->type協定類型,從inetsw[]連結清單頭中擷取一個網絡層協定
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) { //協定比對
if (protocol != IPPROTO_IP) //非虛拟協定
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) { //虛拟協定
protocol = answer->protocol; //将inetsw中的協定強制指派給protocol
break;
}
if (IPPROTO_IP == answer->protocol) //answer->protocol中的協定為虛拟就直接跳出,因為檢索就沒有意義了
break;
}
err = -EPROTONOSUPPORT;
}
第三,動态申請一個sock
//動态申請一個網絡層表示的套接字sock
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
if (sk == NULL)
goto out;
/*
 * sk_alloc - allocate and minimally initialise a struct sock.
 *
 * @net:      network namespace the sock is attached to
 * @family:   protocol family stored in sk->sk_family
 * @priority: allocation flags; __GFP_ZERO is always added
 * @prot:     struct proto the sock is allocated from and bound to
 *
 * Returns the new sock, or NULL if sk_prot_alloc() fails.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (!newsk)
		return NULL;

	newsk->sk_family = family;

	/*
	 * See comment in struct sock definition to understand
	 * why we need sk_prot_creator -acme
	 */
	newsk->sk_prot_creator = prot;
	newsk->sk_prot = prot;

	sock_lock_init(newsk);

	/*
	 * Attach the sock to its network namespace; get_net() is applied
	 * to @net before the pointer is stored.
	 */
	sock_net_set(newsk, get_net(net));

	atomic_set(&newsk->sk_wmem_alloc, 1);
	sock_update_classid(newsk);
	sock_update_netprioidx(newsk);

	return newsk;
}
EXPORT_SYMBOL(sk_alloc);
在該函數内部,net為網絡命名空間,get_net(net)将該命名空間的引用計數加1,然後在函數sock_net_set(...)内部将sk->sk_net指向net命名空間,這裡的net命名空間相當于一個全局變量,最後傳回sk,這樣就将命名空間與sock關聯起來了。
第四,通過sock擷取網絡層inet_sock
//通過sock擷取inet_sock
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
第五,sock參數初始化
//sock參數初始化(包括發送、接收、錯誤隊列,以及記憶體空間)
sock_init_data(sock, sk);
/*
 * sock_init_data - generic initialisation of a freshly allocated sock.
 *
 * @sock: owning struct socket, may be NULL (then sk->sk_wq is set to NULL
 *        and sk_type is not touched here)
 * @sk:   the sock to initialise
 *
 * Sets up the queues, timer, default buffer sizes, default callbacks and
 * timeouts, links @sk and @sock to each other, and finally publishes the
 * sock by setting its reference count to 1.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
skb_queue_head_init(&sk->sk_receive_queue); /* receive queue */
skb_queue_head_init(&sk->sk_write_queue); /* write queue */
skb_queue_head_init(&sk->sk_error_queue); /* error queue */
#ifdef CONFIG_NET_DMA
skb_queue_head_init(&sk->sk_async_wait_queue);
#endif
sk->sk_send_head = NULL;
init_timer(&sk->sk_timer); /* initialise the sock timer */
sk->sk_allocation = GFP_KERNEL;
sk->sk_rcvbuf = sysctl_rmem_default; /* default receive buffer size */
sk->sk_sndbuf = sysctl_wmem_default; /* default send buffer size */
sk->sk_state = TCP_CLOSE;
sk_set_socket(sk, sock);
sock_set_flag(sk, SOCK_ZAPPED);
if (sock) {
sk->sk_type = sock->type;
sk->sk_wq = sock->wq;
sock->sk = sk; /* back-link from the socket to its sock */
} else
sk->sk_wq = NULL;
spin_lock_init(&sk->sk_dst_lock);
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
/* Default callbacks; callers may override them afterwards. */
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
sk->sk_write_space = sock_def_write_space;
sk->sk_error_report = sock_def_error_report;
sk->sk_destruct = sock_def_destruct;
sk->sk_frag.page = NULL;
sk->sk_frag.offset = 0;
sk->sk_peek_off = -1;
sk->sk_peer_pid = NULL;
sk->sk_peer_cred = NULL;
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_stamp = ktime_set(-1L, 0);
sk->sk_pacing_rate = ~0U;
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
*/
smp_wmb();
atomic_set(&sk->sk_refcnt, 1);
atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
第六,sock inet_hash初始化(重要!)
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk); //調用傳輸層協定 inet_hash
/*
 * inet_hash - add a sock to the protocol hash tables.
 *
 * A sock whose state is TCP_CLOSE is left alone.  Otherwise __inet_hash()
 * is called with bottom halves disabled.
 */
void inet_hash(struct sock *sk)
{
	if (sk->sk_state == TCP_CLOSE)
		return;

	local_bh_disable();
	__inet_hash(sk);
	local_bh_enable();
}
/*
 * __inet_hash - insert a sock into the hash table matching its state.
 *
 * A sock that is not in TCP_LISTEN state is handed to
 * __inet_hash_nolisten().  A listening sock is added to the head of its
 * bucket in hashinfo->listening_hash under that bucket's lock.
 */
static void __inet_hash(struct sock *sk)
{
	/* Hash tables of the sock's protocol (a pointer, not a call). */
	struct inet_hashinfo *tables = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *bucket;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk, NULL);
		return;
	}

	WARN_ON(!sk_unhashed(sk));

	/* Pick the listening bucket via inet_sk_listen_hashfn(). */
	bucket = &tables->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&bucket->lock);
	/* Link the sock at the head of the bucket's list. */
	__sk_nulls_add_node_rcu(sk, &bucket->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	spin_unlock(&bucket->lock);
}
第七,sock 傳輸層協定初始化(重要!)
err = sk->sk_prot->init(sk); //調用傳輸層協定 tcp_v4_init_sock
/*
 * tcp_v4_init_sock - IPv4-specific part of TCP sock initialisation.
 *
 * Runs the common tcp_init_sock() and then installs the IPv4 address
 * family operations (ipv4_specific) on the connection sock.  With
 * CONFIG_TCP_MD5SIG the IPv4 af_specific table is installed as well.
 *
 * Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *conn = inet_csk(sk);

	tcp_init_sock(sk);

	/* Operations table defined as ipv4_specific. */
	conn->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
/*
 * Address-family operations installed on TCP/IPv4 connection socks by
 * tcp_v4_init_sock().  The entries are grouped below by the prefix of the
 * function they point at.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	/* Sizes of the IPv4 header and of the IPv4 socket address. */
	.net_header_len	   = sizeof(struct iphdr),
	.sockaddr_len	   = sizeof(struct sockaddr_in),

	/* ip_* handlers. */
	.queue_xmit	   = ip_queue_xmit,
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
#ifdef CONFIG_COMPAT
	/* compat variants of the socket-option handlers. */
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif

	/* tcp_v4_* handlers. */
	.send_check	   = tcp_v4_send_check,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,

	/* inet_* handlers. */
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.bind_conflict	   = inet_csk_bind_conflict,
};
EXPORT_SYMBOL(ipv4_specific);
顧名思義,上面const struct inet_connection_sock_af_ops ipv4_specific結構體内部成員描述了TCP與IP協定層之間的接口,該結構體的内部成員非常重要,後續會在connect()\bind()\recv()\send()詳解!
第八,最終傳回建立的 sock。
5. sock_map_fd檔案描述符映射
retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); //套接字映射一個描述符
/*
 * sock_map_fd - bind a socket to a new file descriptor.
 *
 * @sock:  socket to expose
 * @flags: descriptor flags, passed to get_unused_fd_flags() and
 *         sock_alloc_file()
 *
 * Returns the installed descriptor, or a negative errno if either the
 * descriptor or the struct file could not be allocated.  On failure the
 * reserved descriptor is given back; the socket itself is not released
 * here.
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *filp;
	int fd;

	/* Reserve a free descriptor number first. */
	fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0))
		return fd;

	/* Build the struct file that wraps the socket. */
	filp = sock_alloc_file(sock, flags, NULL);
	if (unlikely(IS_ERR(filp))) {
		put_unused_fd(fd);
		return PTR_ERR(filp);
	}

	/* Make the descriptor point at the new file. */
	fd_install(fd, filp);
	return fd;
}
在sock_map_fd函數内部主要完成以下幾部分:
第一,擷取一個未被使用的檔案描述符fd
int fd = get_unused_fd_flags(flags); //擷取一個未被使用的描述符
/*
 * get_unused_fd_flags - reserve a descriptor in the current task's table.
 *
 * Calls __alloc_fd() on current->files, searching from 0 up to the
 * RLIMIT_NOFILE limit, and returns its result.
 */
int get_unused_fd_flags(unsigned flags)
{
	struct files_struct *files = current->files;
	unsigned long limit = rlimit(RLIMIT_NOFILE);

	return __alloc_fd(files, 0, limit, flags);
}
EXPORT_SYMBOL(get_unused_fd_flags);
/*
 * __alloc_fd - reserve a free descriptor number in a files_struct.
 *
 * @files: descriptor table owner
 * @start: lowest descriptor number to consider
 * @end:   upper bound (exclusive); a candidate >= @end fails with -EMFILE
 * @flags: if O_CLOEXEC is set the descriptor is marked close-on-exec,
 *         otherwise that mark is cleared
 *
 * Runs under files->file_lock.  Returns the reserved descriptor number, or
 * a negative errno (-EMFILE, or the error from expand_files()).  The
 * descriptor is only marked as open here; no struct file is installed.
 */
int __alloc_fd(struct files_struct *files,
unsigned start, unsigned end, unsigned flags)
{
unsigned int fd;
int error;
struct fdtable *fdt;
spin_lock(&files->file_lock);
repeat:
fdt = files_fdtable(files); /* current fdtable of this files_struct */
fd = start;
if (fd < files->next_fd)
fd = files->next_fd;
if (fd < fdt->max_fds)
fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); /* first clear bit in open_fds at or after fd */
/*
* N.B. For clone tasks sharing a files structure, this test
* will limit the total number of files that can be opened.
*/
error = -EMFILE;
if (fd >= end)
goto out;
error = expand_files(files, fd); /* <0: error, >0: retry (see comment below), 0: fd usable as is */
if (error < 0)
goto out;
/*
* If we needed to expand the fs array we
* might have blocked - try again.
*/
if (error)
goto repeat;
if (start <= files->next_fd)
files->next_fd = fd + 1; /* next search starts after this fd */
__set_open_fd(fd, fdt); /* mark the descriptor as open */
if (flags & O_CLOEXEC) /* close-on-exec requested */
__set_close_on_exec(fd, fdt);
else
__clear_close_on_exec(fd, fdt);
error = fd;
#if 1
/* Sanity check */
if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
rcu_assign_pointer(fdt->fd[fd], NULL);
}
#endif
out:
spin_unlock(&files->file_lock);
return error;
}
在__alloc_fd函數内部主要是動态配置設定一個描述符fd,但裡面涉及的知識其實挺廣的,關于fd的具體配置設定,詳見:
點選打開連結
第二,sock動态配置設定一個file結構體
//sock動态配置設定檔案
newfile = sock_alloc_file(sock, flags, NULL);
//在目前程序裡,建立一個file struct結構體,并把它映射到fd的空間裡
/*
 * sock_alloc_file - create the struct file that represents a socket.
 *
 * @sock:  socket to wrap
 * @flags: only O_NONBLOCK is carried over into file->f_flags
 * @dname: name for the pseudo dentry; when NULL and sock->sk is set, the
 *         name of the sock's creating protocol is used, otherwise ""
 *
 * Returns the new file, ERR_PTR(-ENOMEM) if the dentry cannot be
 * allocated, or the error pointer returned by alloc_file().
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
	struct qstr name = { .name = "" };
	struct path path;
	struct file *filp;

	/* Choose the dentry name: explicit @dname first, protocol name second. */
	if (dname)
		name.name = dname;
	else if (sock->sk)
		name.name = sock->sk->sk_prot_creator->name;
	if (dname || sock->sk)
		name.len = strlen(name.name);

	/* Pseudo dentry on the socket mount's superblock. */
	path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
	if (unlikely(!path.dentry))
		return ERR_PTR(-ENOMEM);
	path.mnt = mntget(sock_mnt);

	/* Attach the socket's inode and give it the socket file operations. */
	d_instantiate(path.dentry, SOCK_INODE(sock));
	SOCK_INODE(sock)->i_fop = &socket_file_ops;

	filp = alloc_file(&path, FMODE_READ | FMODE_WRITE,
			  &socket_file_ops);
	if (unlikely(IS_ERR(filp))) {
		/* drop dentry, keep inode */
		ihold(path.dentry->d_inode);
		path_put(&path);
		return filp;
	}

	/* Cross-link socket and file; private_data is how file ops find the socket. */
	sock->file = filp;
	filp->f_flags = O_RDWR | (flags & O_NONBLOCK);
	filp->private_data = sock;
	return filp;
}
EXPORT_SYMBOL(sock_alloc_file);
在sock_alloc_file該函數内部會動态配置設定一個檔案,如下
//動态配置設定一個file
file = alloc_file(&path, FMODE_READ | FMODE_WRITE,&socket_file_ops);
其中有一個重要的file結構體綁定,其對應檔案操作句柄,在編寫驅動的時候,這個結構體是最熟悉不過的了
/*
 * File operations for socket files.  sock_alloc_file() installs this
 * table both on the socket inode (i_fop) and on the struct file.
 */
static const struct file_operations socket_file_ops = {
	.owner		= THIS_MODULE,

	/* Lifetime. */
	.open		= sock_no_open,	/* special open code to disallow open via /proc */
	.release	= sock_close,

	/* Data transfer. */
	.aio_read	= sock_aio_read,
	.aio_write	= sock_aio_write,
	.sendpage	= sock_sendpage,
	.splice_read	= sock_splice_read,
	.splice_write	= generic_splice_sendpage,
	.mmap		= sock_mmap,
	.llseek		= no_llseek,

	/* Control and notification. */
	.poll		= sock_poll,
	.fasync		= sock_fasync,
	.unlocked_ioctl	= sock_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= compat_sock_ioctl,
#endif
};
最後在sock_alloc_file内部進行綁定,這個是不是很熟悉,哈哈!
file->private_data = sock; //綁定file私有資料為sock,這也是串聯整個socket套接字的關鍵資料!
第三,最後根據擷取到的fd和file進行安裝
fd_install(fd, newfile); //fd安裝struct file *newfile
/*
 * fd_install - install @file at descriptor @fd in the current task's
 * descriptor table (current->files), via __fd_install().
 */
void fd_install(unsigned int fd, struct file *file)
{
	struct files_struct *files = current->files;

	__fd_install(files, fd, file);
}
/*
* Install a file pointer in the fd array.
*
* The VFS is full of places where we drop the files lock between
* setting the open_fds bitmap and installing the file in the file
* array. At any such point, we are vulnerable to a dup2() race
* installing a file in the array before us. We need to detect this and
* fput() the struct file we are about to overwrite in this case.
*
* It should never happen - if we allow dup2() do it, _really_ bad things
* will follow.
*
* NOTE: __fd_install() variant is really, really low-level; don't
* use it unless you are forced to by truly lousy API shoved down
* your throat. 'files' *MUST* be either current->files or obtained
* by get_files_struct(current) done by whoever had given it to you,
* or really bad things will happen. Normally you want to use
* fd_install() instead.
*/
void __fd_install(struct files_struct *files, unsigned int fd,
		  struct file *file)
{
	struct fdtable *table;

	/* The slot update happens entirely under files->file_lock. */
	spin_lock(&files->file_lock);
	table = files_fdtable(files);

	/* The slot must still be empty; anything else is a fatal bug. */
	BUG_ON(table->fd[fd] != NULL);

	/* Publish the file pointer in the descriptor slot. */
	rcu_assign_pointer(table->fd[fd], file);
	spin_unlock(&files->file_lock);
}
這個函數的注解很詳細,這裡不再贅述!
6. 總結
本篇部落格主要針對socket套接字建立進行源碼流程分析,通過協定族、協定類型、協定建立socket套接字,然後通過建立成功的套接字去配置設定一個檔案描述符fd,這裡涉及到兩個重要的結構體:
第一,目前程序任務結構體 struct task_struct,在套接字建立裡面主要涉及到該結構體成員nsproxy(通過current->nsproxy->net_ns取得網絡命名空間)
第二, 命名空間struct nsproxy,個人認為,命名空間相當于一個全局變量,在socket裡面建立,然後在bind,recv,send都能通路該命名空間,是以相當于一個全局變量;
最後傳回檔案句柄fd給應用層。