天天看點

Linux網絡解讀(6) - TCP之listen,accept

TCP

總算到了TCP了。

TCP可以描述為一個沒有選擇確認、也沒有否定確認的滑動視窗協定。

TCP協定棧的初始化

tcp協定棧初始化的入口是在inet_init中      
/*
 * inet_init - boot-time entry point of the IPv4 stack; the TCP stack is
 * initialized from here.
 *
 * NOTE: this is an abridged excerpt. The declarations of rc and q, the goto
 * targets (out, out_tcp_free_slab, out_udp_free_slab) and the final return
 * statement are not shown.
 */
static int __init inet_init(void)
{
        /* Set up the sock slab for tcp_prot, udp_prot and raw_prot in turn.
         * On failure the error is reported and control jumps to a cleanup
         * label (not shown in this excerpt). */
        rc = sk_alloc_slab(&tcp_prot, "tcp_sock");
        if (rc) {
                sk_alloc_slab_error(&tcp_prot);
                goto out;
        }
        rc = sk_alloc_slab(&udp_prot, "udp_sock");
        if (rc) {
                sk_alloc_slab_error(&udp_prot);
                goto out_tcp_free_slab;
        }
        rc = sk_alloc_slab(&raw_prot, "raw_sock");
        if (rc) {
                sk_alloc_slab_error(&raw_prot);
                goto out_udp_free_slab;
        }
        /* Register the inet socket family; the return value is deliberately ignored. */
          (void)sock_register(&inet_family_ops);
        /* Add the ICMP, UDP and TCP protocol handlers; a failure is only logged. */
        if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
                printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
        if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
                printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
        if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
                printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
        /* Register every entry of inetsw_array. */
        for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
                inet_register_protosw(q);
        arp_init();
        ip_init();
        /* Initialize the TCP protocol stack. */
        tcp_v4_init(&inet_family_ops);
        tcp_init();
        
}      

tcp協定棧的初始化分兩個步驟tcp_v4_init和tcp_init。

tcp_v4_init 初始化了RST的socket;真正做初始化的是tcp_init:

/*
 * tcp_init - one-time TCP setup: creates the bind-bucket cache, allocates and
 * initializes the ehash and bhash tables of tcp_hashinfo, derives default
 * sysctl limits from the size of the bind hash table, and registers tcp_reno
 * as a congestion control algorithm.
 */
void __init tcp_init(void)
{
        /* skb is only used inside sizeof(); it is never dereferenced. */
        struct sk_buff *skb = NULL;
        unsigned long limit;
        int order, i, max_share;
        /* The TCP control block must fit in the skb's cb[] area. */
        if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
                __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
                                           sizeof(skb->cb));
                                           
        /* Initialize the three memory areas of tcp_hashinfo:
         * bind_bucket_cachep, ehash and bhash. */
        tcp_hashinfo.bind_bucket_cachep =
                kmem_cache_create("tcp_bind_bucket",
                                  sizeof(struct inet_bind_bucket), 0,
                                  SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!tcp_hashinfo.bind_bucket_cachep)
                panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
        tcp_hashinfo.ehash =
                alloc_large_system_hash("TCP established",
                                        sizeof(struct inet_ehash_bucket),
                                        thash_entries,
                                        (num_physpages >= 128 * 1024) ?
                                        13 : 15,
                                        HASH_HIGHMEM,
                                        &tcp_hashinfo.ehash_size,
                                        NULL,
                                        0);
        /* ehash_size comes back as a shift count; convert it to HALF the
         * bucket count. The loop below initializes all 2 * ehash_size buckets.
         * NOTE(review): presumably the second half is reserved for TIME_WAIT
         * sockets - confirm against the lookup code. */
        tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
        for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
                rwlock_init(&tcp_hashinfo.ehash[i].lock);
                INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
        }
        tcp_hashinfo.bhash =
                alloc_large_system_hash("TCP bind",
                                        sizeof(struct inet_bind_hashbucket),
                                        tcp_hashinfo.ehash_size,
                                        (num_physpages >= 128 * 1024) ?
                                        13 : 15,
                                        HASH_HIGHMEM,
                                        &tcp_hashinfo.bhash_size,
                                        NULL,
                                        64 * 1024);
        /* bhash_size also comes back as a shift count; convert to a bucket count. */
        tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
        for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
                spin_lock_init(&tcp_hashinfo.bhash[i].lock);
                INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
        }
        /* Find the smallest order such that (1 << order) pages are at least
         * as large as the bind hash table; order drives the defaults below. */
        for (order = 0; ((1 << order) << PAGE_SHIFT) <
                        (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
                        order++)
                ;
        if (order >= 4) {
                /* Set the local port range. */
                sysctl_local_port_range[0] = 32768;
                sysctl_local_port_range[1] = 61000;
                tcp_death_row.sysctl_max_tw_buckets = 180000;
                sysctl_tcp_max_orphans = 4096 << (order - 4);
                sysctl_max_syn_backlog = 1024;
        } else if (order < 3) {
                /* Small table: scale the existing defaults down. */
                sysctl_local_port_range[0] = 1024 * (3 - order);
                tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
                sysctl_tcp_max_orphans >>= (3 - order);
                sysctl_max_syn_backlog = 128;
        }
        /* order == 3 takes neither branch, so the existing values are kept. */
        sysctl_tcp_mem[0] =  768 << order;
        sysctl_tcp_mem[1] = 1024 << order;
        sysctl_tcp_mem[2] = 1536 << order;
        /* Cap the maximum per-socket buffer sizes at min(4 MiB, limit), while
         * never going below 64 KiB for writes and 87380 bytes for reads. */
        limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
        max_share = min(4UL*1024*1024, limit);
        sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
        sysctl_tcp_wmem[1] = 16*1024;
        sysctl_tcp_wmem[2] = max(64*1024, max_share);
        sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
        sysctl_tcp_rmem[1] = 87380;
        sysctl_tcp_rmem[2] = max(87380, max_share);
        /* The established count is printed as ehash_size << 1, i.e. the full table. */
        printk(KERN_INFO "TCP: Hash tables configured "
               "(established %d bind %d)\n",
               tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
        /* Register the congestion control algorithm. */
        tcp_register_congestion_control(&tcp_reno);
}      

tcp_hashinfo 是個重要的不能再重要的全局資料結構:

Linux網絡解讀(6) - TCP之listen,accept

tcp_hashinfo 裡有3張hash表,分别存儲3種類型的sock結構體:

1) ehash 連結建立成功的sock;

2) bhash bind()調用會把對應的sock插入到這張表,以便于檢查端口沖突;

3) listening_hash 監聽套接字

這個資料結構後面還會深入研究。

下面再看一下建立tcp套接字的初始化,這個函數到哪裡找呢?

socket_layer ----> transport_layer 使用inet_protosw結構

transport_layer ----> ip_layer 使用 net_protocol結構

這兩者都在inet_init中初始化的。

而建立一個套接字時會調用到inet_create()函數,再根據AF_INET,SOCK_STREAM找到tcp對應的inet_protosw項,然後調用sk->sk_prot->init(sk),

是以,建立tcp套接字時的初始化是在結構體tcp_prot中的tcp_v4_init_sock函數。

/*
 * tcp_v4_init_sock - initializer run for a newly created TCP socket.
 *
 * Sets up the out-of-order queue, the transmit timers and the prequeue, then
 * fills in the initial values of the TCP, connection-sock and generic sock
 * fields. Bumps the tcp_sockets_allocated counter and always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
        struct tcp_sock *tcp = tcp_sk(sk);
        struct inet_connection_sock *csk = inet_csk(sk);

        /* Queues and timers come first. */
        skb_queue_head_init(&tcp->out_of_order_queue);
        tcp_init_xmit_timers(sk);
        tcp_prequeue_init(tcp);

        /* TCP-level starting values. */
        tcp->mdev = TCP_TIMEOUT_INIT;
        tcp->snd_cwnd = 2;
        tcp->snd_ssthresh = 0x7fffffff;        /* effectively unlimited */
        tcp->snd_cwnd_clamp = ~0;
        tcp->mss_cache = 536;
        tcp->reordering = sysctl_tcp_reordering;

        /* Connection-sock level: initial RTO, congestion-control ops, and
         * icsk_af_ops, which names the network protocol TCP runs on top of
         * (IPv4 here). */
        csk->icsk_rto = TCP_TIMEOUT_INIT;
        csk->icsk_ca_ops = &tcp_init_congestion_ops;
        csk->icsk_af_ops = &ipv4_specific;
        csk->icsk_sync_mss = tcp_sync_mss;

        /* Generic sock level: starts out closed, with the default buffer sizes. */
        sk->sk_state = TCP_CLOSE;
        sk->sk_write_space = sk_stream_write_space;
        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
        sk->sk_sndbuf = sysctl_tcp_wmem[1];
        sk->sk_rcvbuf = sysctl_tcp_rmem[1];

        atomic_inc(&tcp_sockets_allocated);
        return 0;
}

listen

前面說過listen()調用的是sock->ops->listen,而ops在inet_create中被指派為inet_protosw的ops,tcp對應的是inet_stream_ops,是以listen最終調用到inet_listen

/*
 * inet_listen - move a stream socket into the listening state.
 *
 * Returns -EINVAL unless the socket is an unconnected SOCK_STREAM socket
 * whose sk_state is TCP_CLOSE or TCP_LISTEN. A socket that is not yet
 * listening is started through inet_csk_listen_start(); in both cases
 * sk_max_ack_backlog is then set to backlog. The sock lock is held for the
 * whole call.
 */
int inet_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        unsigned char prev_state;
        int err = -EINVAL;

        lock_sock(sk);

        if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
                goto out;

        prev_state = sk->sk_state;
        if (((1 << prev_state) & (TCPF_CLOSE | TCPF_LISTEN)) == 0)
                goto out;

        /* An already-listening socket only has its backlog adjusted. */
        if (prev_state != TCP_LISTEN) {
                err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
                if (err != 0)
                        goto out;
        }

        sk->sk_max_ack_backlog = backlog;
        err = 0;

out:
        release_sock(sk);
        return err;
}

隻有當sock結構體中的sk_state為TCP_CLOSE時,才會進入真正的listen(inet_csk_listen_start);如果已經是TCP_LISTEN,則隻更新backlog。

/*
 * inet_csk_listen_start - do the actual work of entering TCP_LISTEN.
 *
 * Allocates the accept queue with nr_table_entries entries, resets the
 * backlog counters, switches sk_state to TCP_LISTEN and asks the protocol
 * for the port. On success the socket is hashed and 0 is returned. If the
 * port cannot be obtained the state goes back to TCP_CLOSE, the accept
 * queue is destroyed and -EADDRINUSE is returned. A failure of
 * reqsk_queue_alloc() is returned unchanged.
 */
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
        /* inet_csk()/inet_sk() view the same sock through its more specific
         * types. Per the article, the object was allocated by sk_alloc() from
         * inet_create(), sized per protocol. */
        struct inet_connection_sock *csk = inet_csk(sk);
        struct inet_sock *isk = inet_sk(sk);
        int err;

        /* Set up icsk_accept_queue. */
        err = reqsk_queue_alloc(&csk->icsk_accept_queue, nr_table_entries);
        if (err)
                return err;

        sk->sk_max_ack_backlog = 0;
        sk->sk_ack_backlog = 0;
        inet_csk_delack_init(sk);

        /* The state must already be TCP_LISTEN when get_port() runs. */
        sk->sk_state = TCP_LISTEN;
        if (sk->sk_prot->get_port(sk, isk->num) != 0) {
                /* Port unavailable: undo the state change and the queue. */
                sk->sk_state = TCP_CLOSE;
                __reqsk_queue_destroy(&csk->icsk_accept_queue);
                return -EADDRINUSE;
        }

        isk->sport = htons(isk->num);
        sk_dst_reset(sk);

        /* Per the article, for TCP this hash hook is tcp_v4_hash from
         * tcp_prot: a TCP_LISTEN sock goes into tcp_hashinfo's
         * listening_hash, any other state into an inet_ehash_bucket. */
        sk->sk_prot->hash(sk);
        return 0;
}

從上面的代碼可以看到listen的開銷在于:

1) 設定sock的sk_state;

2) 初始化icsk_accept_queue;

3) 把sock插入到tcp_hashinfo->listening_hash.

完成listen之後,sock結構體會同時位于tcp_hashinfo的bhash和listening_hash兩張hash表中。

accept

accept最終調用到sys_accept:
/*
 * sys_accept - system-call entry for accept().
 *
 * Looks up the listening socket for fd, allocates a new socket plus a new
 * fd/file pair for the incoming connection, runs the security hook, calls
 * the protocol's accept operation, optionally copies the peer address to
 * user space, and finally installs the new fd.
 *
 * NOTE: this excerpt is truncated. The goto targets out, out_put and out_fd
 * and the return statement are not shown.
 */
asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen)
{
        struct socket *sock, *newsock;
        struct file *newfile;
        int err, len, newfd, fput_needed;
        char address[MAX_SOCK_ADDR];
        /* Find the listening socket that belongs to fd. */
        sock = sockfd_lookup_light(fd, &err, &fput_needed);
        if (!sock)
                goto out;
        err = -ENFILE;
        /* Allocate a socket for the new connection. */
        if (!(newsock = sock_alloc())) 
                goto out_put;
        /* The new socket gets the listener's type and ops. */
        newsock->type = sock->type;
        newsock->ops = sock->ops;
        __module_get(newsock->ops->owner);
        /* Allocate an fd for the new connection. */
        newfd = sock_alloc_fd(&newfile);
        if (unlikely(newfd < 0)) {
                err = newfd;
                sock_release(newsock);
                goto out_put;
        }
        /* Tie the new file to the new socket. */
        err = sock_attach_fd(newsock, newfile);
        if (err < 0)
                goto out_fd;
        err = security_socket_accept(sock, newsock);
        if (err)
                goto out_fd;
        /* Start the actual accept through sock->ops->accept;
         * per the article, ops is inet_stream_ops for TCP. */
        err = sock->ops->accept(sock, newsock, sock->file->f_flags);
        if (err < 0)
                goto out_fd;
        /* The address is only fetched and copied out when the caller asked for it.
         * NOTE(review): the last getname() argument (2) presumably selects the
         * peer address - confirm. */
        if (upeer_sockaddr) {
                if(newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 2)<0) {
                        err = -ECONNABORTED;
                        goto out_fd;
                }
                err = move_addr_to_user(address, len, upeer_sockaddr, upeer_addrlen);
                if (err < 0)
                        goto out_fd;
        }
        /* Install the new fd; on success err carries newfd. */
        fd_install(newfd, newfile);
        err = newfd;
        security_socket_post_accept(sock, newsock);
}      
Linux網絡解讀(6) - TCP之listen,accept

函數的跳轉在上面這張圖中,重點來看最關鍵的函數inet_csk_accept:

/*
 * inet_csk_accept - take one connection off a listening socket's accept queue.
 *
 * Returns the child sock on success and leaves *err untouched. On failure it
 * returns NULL and stores the error in *err: -EINVAL if sk is not in
 * TCP_LISTEN, -EAGAIN if the queue is empty and the receive timeout is zero,
 * or whatever inet_csk_wait_for_connect() reported. The sock lock is held for
 * the whole call.
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
        struct inet_connection_sock *csk = inet_csk(sk);
        struct sock *child = NULL;
        int rc = -EINVAL;

        lock_sock(sk);

        /* Only a listening socket can accept. */
        if (sk->sk_state != TCP_LISTEN)
                goto out_err;

        /*
         * A non-empty icsk_accept_queue means a connection is ready and can
         * be taken right away; an empty queue means we have to wait for one.
         */
        if (reqsk_queue_empty(&csk->icsk_accept_queue)) {
                long wait = sock_rcvtimeo(sk, flags & O_NONBLOCK);

                /* A zero timeout means: return at once instead of waiting. */
                if (!wait) {
                        rc = -EAGAIN;
                        goto out_err;
                }

                /* Wait until a connection arrives. */
                rc = inet_csk_wait_for_connect(sk, wait);
                if (rc)
                        goto out_err;
        }

        child = reqsk_queue_get_child(&csk->icsk_accept_queue, sk);
        BUG_TRAP(child->sk_state != TCP_SYN_RECV);
out:
        release_sock(sk);
        return child;
out_err:
        *err = rc;
        goto out;
}

繼續閱讀