天天看點

Linux核心-協定棧-從BSD Socket接口層到傳輸層1

版權聲明:本文為部落客原創文章,未經部落客允許不得轉載。 https://blog.csdn.net/feilengcui008/article/details/49530991

本文接上一篇

Linux核心協定棧-初始化流程分析

,在上一篇中主要分析了了Linux核心協定棧涉及到的關鍵初始化函數,在這一篇文章中将分析協定棧的BSD socket和到傳輸層的流程。采取的方式是分析socket相關的主要系統調用。針對不同的系統調用,其到達的協定層深度可能不同,有的基本隻到sock層就夠了,但是有些可能需要會涉及到比如tcp的具體細節和更底層的細節。本文基本追溯到傳輸層的開始,再深入的細節後續文章分析。

1.準備

協定的基本分層:

(A代表socket的某個系統調用)

BSD socket system calls A => proto_ops->A => sock->A => tcp_prot => A

  • BSD socket層和具體協定族某個類型的聯系是通過struct proto_ops,在include/linux/net.h中定義了不同協定族如af_inet,af_unix等的通用操作函數指針的結構體struct proto_ops,具體的定義有各個協定族的某個類型的子子產品自己完成。比如ipv4/af_inet.c中定義的af_inet family的tcp/udp等相應的struct proto_ops。
  • 由于對于每個family的不同類型,其針對socket的某些需求可能不同,是以抽了一層struct sock出來,sock->sk_prot挂接到具體tcp/udp等傳輸層的struct proto上(具體定義在ipv4/tcp_ipv4.c,ipv4/udp.c)
  • 另外,由于内容比較多,這一篇主要分析socket,bind,listen,accept幾個系統調用,下一篇會涉及connect,send,recv等的分析
//不同協定族的通用函數hooks
//比如af_inet相關的定義在ipv4/af_inet.c中
//除了建立socket為系統調用外,基本針對socket層的操作函數都在這裡面
struct proto_ops {
    int     family;
    struct module   *owner;
    int     (*release)   (struct socket *sock);
    int     (*bind)      (struct socket *sock,
                      struct sockaddr *myaddr,
                      int sockaddr_len);
    int     (*connect)   (struct socket *sock,
                      struct sockaddr *vaddr,
                      int sockaddr_len, int flags);
    int     (*socketpair)(struct socket *sock1,
                      struct socket *sock2);
    int     (*accept)    (struct socket *sock,
                      struct socket *newsock, int flags);
    int     (*getname)   (struct socket *sock,
                      struct sockaddr *addr,
                      int *sockaddr_len, int peer);
    unsigned int    (*poll)      (struct file *file, struct socket *sock,
                      struct poll_table_struct *wait);
    int     (*ioctl)     (struct socket *sock, unsigned int cmd,
                      unsigned long arg);
#ifdef CONFIG_COMPAT
    int     (*compat_ioctl) (struct socket *sock, unsigned int cmd,
                      unsigned long arg);
#endif
    int     (*listen)    (struct socket *sock, int len);
    int     (*shutdown)  (struct socket *sock, int flags);
    int     (*setsockopt)(struct socket *sock, int level,
                      int optname, char __user *optval, unsigned int optlen);
/*省略部分*/
};           
//傳輸層的proto 
//作為sock->sk_prot與具體傳輸層的hooks
struct proto {
    void            (*close)(struct sock *sk,
                    long timeout);
    int         (*connect)(struct sock *sk,
                    struct sockaddr *uaddr,
                    int addr_len);
    int         (*disconnect)(struct sock *sk, int flags);

    struct sock *       (*accept)(struct sock *sk, int flags, int *err);

    int         (*ioctl)(struct sock *sk, int cmd,
                     unsigned long arg);
    int         (*init)(struct sock *sk);
    void            (*destroy)(struct sock *sk);
    void            (*shutdown)(struct sock *sk, int how);
    int         (*setsockopt)(struct sock *sk, int level,
                    int optname, char __user *optval,
                    unsigned int optlen);
    int         (*getsockopt)(struct sock *sk, int level,
                    int optname, char __user *optval,
                    int __user *option);
#ifdef CONFIG_COMPAT
    int         (*compat_setsockopt)(struct sock *sk,
                    int level,
                    int optname, char __user *optval,
                    unsigned int optlen);
    int         (*compat_getsockopt)(struct sock *sk,
                    int level,
                    int optname, char __user *optval,
                    int __user *option);
    int         (*compat_ioctl)(struct sock *sk,
                    unsigned int cmd, unsigned long arg);
#endif
    int         (*sendmsg)(struct kiocb *iocb, struct sock *sk,
                       struct msghdr *msg, size_t len);
    int         (*recvmsg)(struct kiocb *iocb, struct sock *sk,
                       struct msghdr *msg,
                       size_t len, int noblock, int flags,
                       int *addr_len);
    int         (*sendpage)(struct sock *sk, struct page *page,
                    int offset, size_t size, int flags);
    int         (*bind)(struct sock *sk,
                    struct sockaddr *uaddr, int addr_len);

    /*省略部分*/
};
           

同時附上其他幾個關鍵結構體:

//bsd socket層
//include/linux/net.h
struct socket {
    socket_state        state;
    kmemcheck_bitfield_begin(type);
    short           type;
    kmemcheck_bitfield_end(type);
    unsigned long       flags;
    struct socket_wq __rcu  *wq;
    struct file     *file;
    struct sock     *sk;
    const struct proto_ops  *ops;
};           
//sock層
struct sock {
 sock_common    __sk_common;
#define sk_node         __sk_common.skc_node
#define sk_nulls_node       __sk_common.skc_nulls_node
#define sk_refcnt       __sk_common.skc_refcnt
#define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping
#define sk_dontcopy_begin   __sk_common.skc_dontcopy_begin
#define sk_dontcopy_end     __sk_common.skc_dontcopy_end
#define sk_hash         __sk_common.skc_hash
#define sk_portpair     __sk_common.skc_portpair
#define sk_num          __sk_common.skc_num
#define sk_dport        __sk_common.skc_dport
#define sk_addrpair     __sk_common.skc_addrpair
#define sk_daddr        __sk_common.skc_daddr
#define sk_rcv_saddr        __sk_common.skc_rcv_saddr
#define sk_family       __sk_common.skc_family
#define sk_state        __sk_common.skc_state
#define sk_reuse        __sk_common.skc_reuse
#define sk_reuseport        __sk_common.skc_reuseport
#define sk_ipv6only     __sk_common.skc_ipv6only
#define sk_bound_dev_if     __sk_common.skc_bound_dev_if
#define sk_bind_node        __sk_common.skc_bind_node
#define sk_prot         __sk_common.skc_prot
#define sk_net          __sk_common.skc_net
#define sk_v6_daddr     __sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr

    unsigned long       sk_flags;
    struct dst_entry    *sk_rx_dst;
    struct dst_entry __rcu  *sk_dst_cache;
    spinlock_t      sk_dst_lock;
    atomic_t        sk_wmem_alloc;
    atomic_t        sk_omem_alloc;
    int         sk_sndbuf;
    struct sk_buff_head sk_write_queue;
    /*省略部分*/
    struct pid      *sk_peer_pid;
    const struct cred   *sk_peer_cred;
    long            sk_rcvtimeo;
    long            sk_sndtimeo;
    void            *sk_protinfo;
    struct timer_list   sk_timer;
    ktime_t         sk_stamp;
    u16         sk_tsflags;
    u32         sk_tskey;
    struct socket       *sk_socket;
    void            *sk_user_data;
    struct page_frag    sk_frag;
    struct sk_buff      *sk_send_head;
    /*省略部分*/
};           

2.開始

主要追溯幾個典型的socket相關的系統調用,如socket,bind,listen,accept等等

  • socket
//建立socket的系統調用
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
    int retval;
    struct socket *sock;
    int flags;

    /* Check the SOCK_* constants for consistency.  */
    BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
    BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
    BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

    flags = type & ~SOCK_TYPE_MASK;
    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
        return -EINVAL;
    type &= SOCK_TYPE_MASK;

    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
        flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

    //配置設定inode,傳回inode中的一個成員作為sock
    retval = sock_create(family, type, protocol, &sock);
    if (retval < 0)
        goto out;

    //找個fd映射sock
    //得到空fd
    //配置設定僞dentry和file,并将socket file的operations與file挂接 
    retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
/*省略部分*/
}           
  • socketpair
//建立socketpair,注意af_inet協定族下沒有pair,af_unix下有
SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
        int __user *, usockvec)
{
    struct socket *sock1, *sock2;
    int fd1, fd2, err;
    struct file *newfile1, *newfile2;
    int flags;

    flags = type & ~SOCK_TYPE_MASK;
    if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
        return -EINVAL;
    type &= SOCK_TYPE_MASK;

    if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
        flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

    //建立socket1 
    err = sock_create(family, type, protocol, &sock1);
    if (err < 0)
        goto out;

    //建立socket2
    err = sock_create(family, type, protocol, &sock2);
    if (err < 0)
        goto out_release_1;

    //調用socket operations的socketpair 
    //關于不同協定層的函數hook,公共結構體是struct proto_ops 
    //對于不同的family,比如af_inet協定族的定義在ipv4/af_inet.c
    //
    //對于af_inet沒有socketpair 
    //對于af_unix有socketpair
    err = sock1->ops->socketpair(sock1, sock2);
    if (err < 0)
        goto out_release_both;

    //後面部分就很類似了,找到空fd,配置設定file,綁定到socket,将file
    安裝到目前程序
    fd1 = get_unused_fd_flags(flags);
    if (unlikely(fd1 < 0)) {
        err = fd1;
        goto out_release_both;
    }

    fd2 = get_unused_fd_flags(flags);
    if (unlikely(fd2 < 0)) {
        err = fd2;
        goto out_put_unused_1;
    }

    newfile1 = sock_alloc_file(sock1, flags, NULL);
    if (unlikely(IS_ERR(newfile1))) {
        err = PTR_ERR(newfile1);
        goto out_put_unused_both;
    }

    newfile2 = sock_alloc_file(sock2, flags, NULL);
    if (IS_ERR(newfile2)) {
        err = PTR_ERR(newfile2);
        goto out_fput_1;
    }

    err = put_user(fd1, &usockvec[0]);
    if (err)
        goto out_fput_both;

    err = put_user(fd2, &usockvec[1]);
    if (err)
        goto out_fput_both;

    audit_fd_pair(fd1, fd2);

    fd_install(fd1, newfile1);
    fd_install(fd2, newfile2);
    /* fd1 and fd2 may be already another descriptors.
     * Not kernel problem.
     */
    return 0;           
  • bind
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
    struct socket *sock;
    struct sockaddr_storage address;
    int err, fput_needed;
    //根據fd查找file,進而查找socket指針sock
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (sock) {
        //把使用者态位址資料移到核心态
        //調用copy_from_user 
        err = move_addr_to_kernel(umyaddr, addrlen, &address);
        if (err >= 0) {
            //security hook
            err = security_socket_bind(sock,
                           (struct sockaddr *)&address,
                           addrlen);
            if (!err)
                //ok, 到具體family定義的proto_ops中的bind 
                //比如對af_inet,主要是設定socket->sock->inet_sock的一些參數,比如接收位址,端口什麼的
                err = sock->ops->bind(sock,
                              (struct sockaddr *)
                              &address, addrlen);
        }
        fput_light(sock->file, fput_needed);
    }
    return err;
}           
  • listen

    listen所做的事情也比較簡單,從系統調用的listen(fd, backlog)到proto_ops 的inet_listen與前面類似,這裡分析下inet_listen中的核心函數inet_csk_listen_start(位于ipv4/inet_connection_sock.c中)。

int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
    //獲得網絡層inte_sock 
    struct inet_sock *inet = inet_sk(sk);
    //管理request connection的結構體  
    struct inet_connection_sock *icsk = inet_csk(sk);
    //配置設定backlog個長度的accpet_queue的結構連接配接請求的隊列
    int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);

    if (rc != 0)
        return rc;

    sk->sk_max_ack_backlog = 0;
    sk->sk_ack_backlog = 0;
    inet_csk_delack_init(sk);

    /* There is race window here: we announce ourselves listening,
     * but this transition is still not validated by get_port().
     * It is OK, because this socket enters to hash table only
     * after validation is complete.
     */
    //切換狀态到listening 
    sk->sk_state = TCP_LISTEN;
    if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
        inet->inet_sport = htons(inet->inet_num);
        //更新dst_entry表
        sk_dst_reset(sk);
        sk->sk_prot->hash(sk);

        return 0;
    }
    sk->sk_state = TCP_CLOSE;
    __reqsk_queue_destroy(&icsk->icsk_accept_queue);
    return -EADDRINUSE;
}           
  • accept

    上面socket, socketpair, bind基本隻涉及到BSD socket, sock層相關的,過程比較簡單,而accept層在sock層和tcp層互動稍微複雜,下面詳細分析

//socket.c
//accept系統調用
SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
        int __user *, upeer_addrlen, int, flags)
{
    /*省略部分*/
    err = -ENFILE;
    //for client socket 
    newsock = sock_alloc();
    if (!newsock)
        goto out_put;

    newsock->type = sock->type;
    newsock->ops = sock->ops;

    /*
     * We don't need try_module_get here, as the listening socket (sock)
     * has the protocol module (sock->ops->owner) held.
     */
    __module_get(newsock->ops->owner);

    //得到目前程序空fd,分給newsock file
    newfd = get_unused_fd_flags(flags);
    if (unlikely(newfd < 0)) {
        err = newfd;
        sock_release(newsock);
        goto out_put;
    }
    //從flab配置設定空file結構
    newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
    if (unlikely(IS_ERR(newfile))) {
        err = PTR_ERR(newfile);
        put_unused_fd(newfd);
        sock_release(newsock);
        goto out_put;
    }

    err = security_socket_accept(sock, newsock);
    if (err)
        goto out_fd;

    //proto_ops中的accept 
    //accept從系統調用到具體協定族的某個type的struct proto_ops的accept如af_inet tcp的的accept,再到sock層的accept,然後sock層的accept實際上對應的是具體傳輸層的struct proto中的accpet,如tcp/udp的struct proto tcp_prot/udp_prot,然後放入newsock 
    err = sock->ops->accept(sock, newsock, sock->file->f_flags);
    if (err < 0)
        goto out_fd;

    if (upeer_sockaddr) {
        if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
                      &len, 2) < 0) {
            err = -ECONNABORTED;
            goto out_fd;
        }
        //拷貝client socket addr storage到userspace
        err = move_addr_to_user(&address,
                    len, upeer_sockaddr, upeer_addrlen);
        if (err < 0)
            goto out_fd;
    }
    fd_install(newfd, newfile);
    err = newfd;
    /*省略部分*/

}           
//ipv4/af_inet.c
//inet family的tcp相關的proto_ops
int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
    struct sock *sk1 = sock->sk;
    int err = -EINVAL;
    //進入(網絡)sock層,accept新sock 
    struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
    if (!sk2)
        goto do_err;

    //鎖住sock,因為需要操作sock内的request_socket請求隊列頭
    wait_queue_head_t等資料
    lock_sock(sk2);
    sock_rps_record_flow(sk2);
    WARN_ON(!((1 << sk2->sk_state) &
          (TCPF_ESTABLISHED | TCPF_SYN_RECV |
          TCPF_CLOSE_WAIT | TCPF_CLOSE)));
    sock_graft(sk2, newsock);
    //設定client socket狀态 
    newsock->state = SS_CONNECTED;
    err = 0;
    release_sock(sk2);
do_err:
    return err;
}           
//ipv4/tcp_ipv4.c
//這裡進入struct proto tcp_prot中的accept
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    //icsk : inet_connection_sock 面向連接配接的用戶端連接配接處理相關的資訊
    //接收隊列
    struct request_sock_queue *queue = &icsk->icsk_accept_queue;
    struct sock *newsk;
    struct request_sock *req;
    int error;
    //lock sock
    lock_sock(sk);
    //如果不是ACCPET狀态轉換過來,出錯
    error = -EINVAL;
    if (sk->sk_state != TCP_LISTEN)
        goto out_err;

    //如果request_sock隊列是空的, 利用等待隊列挂起目前程序到等待隊列,并且将等待隊列放入sock中的請求隊列頭
    if (reqsk_queue_empty(queue)) { 
        //如果非阻塞,0,否則為sk的接收時間
        long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
        error = -EAGAIN;
        if (!timeo)   //如果非阻塞而且接收隊列是空,直接傳回-EAGAIN
            goto out_err;
        //阻塞情況下,等待timeo時間的逾時
        //利用了等待隊列,下面會詳細注解 
        error = inet_csk_wait_for_connect(sk, timeo);
        if (error)
            goto out_err;
    }
    //不是空,移出一個連接配接請求 
    req = reqsk_queue_remove(queue);
    //連接配接請求的sock
    newsk = req->sk;
    //減少backlog 
    sk_acceptq_removed(sk);
    //fastopenq?
    if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
        spin_lock_bh(&queue->fastopenq->lock);
        if (tcp_rsk(req)->listener) {
            /* We are still waiting for the final ACK from 3WHS
             * so can't free req now. Instead, we set req->sk to
             * NULL to signify that the child socket is taken
             * so reqsk_fastopen_remove() will free the req
             * when 3WHS finishes (or is aborted).
             */
            req->sk = NULL;
            req = NULL;
        }
        spin_unlock_bh(&queue->fastopenq->lock);
    }
    //ok,清理,傳回newsk
    /*省略部分*/           
//ipv4/inet_connection_sock.c
//accept連接配接請求的核心函數
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
    struct inet_connection_sock *icsk = inet_csk(sk);
    //定義一個等待隊列wait_queue_t wait 程序是目前程序
    DEFINE_WAIT(wait);
    int err;
    for (;;) {
        //sk_leep(sk) : sock的wait_queue_head_t
        //wait : wait_queue_t
        //這裡将current程序的wait_queue_t加入sk的wait_queue_head_t中,spin鎖定 
        //wait_queue_head_t,設定current狀态,然後spin解鎖時可能重新schedule 
        prepare_to_wait_exclusive(sk_sleep(sk), &wait,
                      TASK_INTERRUPTIBLE);

        //被喚醒,解鎖sock 
        release_sock(sk);
        //如果請求隊列為空,說明timeout了
        if (reqsk_queue_empty(&icsk->icsk_accept_queue))
            //schedule timeout
            timeo = schedule_timeout(timeo);

        //再鎖住進行下次循環,準備再次進入TASK_INTERRUPTIBLE
        lock_sock(sk);
        err = 0;

        //檢查是否有連接配接到達, 如果有,break,喚醒等待隊列 
        if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
            break;
        err = -EINVAL;
        //如果不是listening 狀态轉過來的, 除錯-EINVAL  
        if (sk->sk_state != TCP_LISTEN)
            break;

        //檢查interrupt錯誤
        err = sock_intr_errno(timeo);

        //如果目前程序收到信号了,break 
        if (signal_pending(current))
            break;

        //如果傳入的timeo為0,則回到nonblock的狀态, break 
        err = -EAGAIN;
        if (!timeo)
            break;
    }

    //ok, 有連接配接到達,設定state為running, 喚醒wait queue的第一個程序,移除wait_queue_t和wait_queue_head_t 
    finish_wait(sk_sleep(sk), &wait);
    return err;
}           

繼續閱讀