TCP
總算到了TCP了。
TCP可以描述為一個不帶選擇性確認（SACK）也不帶否定確認（NAK）的滑動視窗協定。
TCP協定棧的初始化
tcp協定棧初始化的入口是在inet_init中
/*
 * inet_init - bring up the AF_INET protocol family (kernel ~2.6 excerpt).
 *
 * NOTE(review): this is an abridged excerpt — the declarations of `rc`
 * and `q`, the error-unwind labels (out, out_tcp_free_slab,
 * out_udp_free_slab) and the final return are not shown here.
 *
 * Steps visible below:
 *   1. create per-protocol slab caches for TCP/UDP/RAW sock allocation;
 *   2. register the AF_INET socket family with the socket layer;
 *   3. attach the L4 receive handlers (ICMP/UDP/TCP) to the IP layer;
 *   4. register each inet_protosw entry (socket-layer -> transport glue);
 *   5. initialize ARP, IP and finally the TCP stack itself.
 */
static int __init inet_init(void)
{
/* Slab cache for TCP sock objects; on failure unwind via `out`. */
rc = sk_alloc_slab(&tcp_prot, "tcp_sock");
if (rc) {
sk_alloc_slab_error(&tcp_prot);
goto out;
}
rc = sk_alloc_slab(&udp_prot, "udp_sock");
if (rc) {
sk_alloc_slab_error(&udp_prot);
goto out_tcp_free_slab;
}
rc = sk_alloc_slab(&raw_prot, "raw_sock");
if (rc) {
sk_alloc_slab_error(&raw_prot);
goto out_udp_free_slab;
}
/* Register the AF_INET family so socket(AF_INET, ...) resolves here. */
(void)sock_register(&inet_family_ops);
/* Hook the transport-layer receive handlers into the IP demux table. */
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
/* Register every (family, type) -> protocol mapping entry. */
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
arp_init();
ip_init();
/* Initialization of the TCP protocol stack proper. */
tcp_v4_init(&inet_family_ops);
tcp_init();
}
tcp協定棧的初始化分兩個步驟tcp_v4_init和tcp_init。
tcp_v4_init 初始化了RST的socket;真正做初始化的是tcp_init:
/*
 * tcp_init - one-time boot-time initialization of the TCP stack.
 *
 * Allocates the three core lookup structures inside the global
 * tcp_hashinfo (bind_bucket_cachep slab, ehash for established
 * connections, bhash for bound ports), then derives sysctl defaults
 * (port range, orphan/tw limits, memory thresholds, send/receive
 * buffer sizes) from the amount of memory consumed by the bind hash,
 * and registers Reno as the default congestion-control algorithm.
 */
void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
unsigned long limit;
int order, i, max_share;
/* Compile-time-ish guard: tcp_skb_cb must fit inside skb->cb[]. */
if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
sizeof(skb->cb));
/* Set up the 3 memory areas in tcp_hashinfo:
 * bind_bucket_cachep, ehash and bhash. */
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!tcp_hashinfo.bind_bucket_cachep)
panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
/* Hash table for established connections; sized from physical RAM. */
tcp_hashinfo.ehash =
alloc_large_system_hash("TCP established",
sizeof(struct inet_ehash_bucket),
thash_entries,
(num_physpages >= 128 * 1024) ?
13 : 15,
HASH_HIGHMEM,
&tcp_hashinfo.ehash_size,
NULL,
0);
/* ehash_size comes back as log2; halve because the second half of
 * the table is used for TIME_WAIT sockets. */
tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
rwlock_init(&tcp_hashinfo.ehash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
}
/* Hash table for bound local ports (bind() conflict detection). */
tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
sizeof(struct inet_bind_hashbucket),
tcp_hashinfo.ehash_size,
(num_physpages >= 128 * 1024) ?
13 : 15,
HASH_HIGHMEM,
&tcp_hashinfo.bhash_size,
NULL,
64 * 1024);
tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
spin_lock_init(&tcp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
}
/* Find the page order needed to hold the bind hash; this order is
 * then used as a rough proxy for "how big is this machine" when
 * choosing the sysctl defaults below. */
for (order = 0; ((1 << order) << PAGE_SHIFT) <
(tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
order++)
;
if (order >= 4) {
/* Large machine: widen the ephemeral port range and raise limits. */
sysctl_local_port_range[0] = 32768;
sysctl_local_port_range[1] = 61000;
tcp_death_row.sysctl_max_tw_buckets = 180000;
sysctl_tcp_max_orphans = 4096 << (order - 4);
sysctl_max_syn_backlog = 1024;
} else if (order < 3) {
/* Small machine: shrink port range, tw-bucket and orphan limits. */
sysctl_local_port_range[0] = 1024 * (3 - order);
tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
sysctl_tcp_max_orphans >>= (3 - order);
sysctl_max_syn_backlog = 128;
}
/* Global TCP memory pressure thresholds (in pages): low / pressure / high. */
sysctl_tcp_mem[0] = 768 << order;
sysctl_tcp_mem[1] = 1024 << order;
sysctl_tcp_mem[2] = 1536 << order;
limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
max_share = min(4UL*1024*1024, limit);
/* Per-socket send/receive buffer defaults: min / default / max. */
sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
sysctl_tcp_wmem[1] = 16*1024;
sysctl_tcp_wmem[2] = max(64*1024, max_share);
sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
sysctl_tcp_rmem[1] = 87380;
sysctl_tcp_rmem[2] = max(87380, max_share);
printk(KERN_INFO "TCP: Hash tables configured "
"(established %d bind %d)\n",
tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);
/* Register the default congestion-control algorithm (Reno). */
tcp_register_congestion_control(&tcp_reno);
}
tcp_hashinfo 是個重要的不能再重要的全局資料結構:
tcp_hashinfo 裡有3張hash表,分别存儲3種類型的sock結構體:
1) ehash 連結建立成功的sock;
2) bhash bind()調用會把對應的sock插入到這張表,以便於檢查端口沖突;
3) listening_hash 監聽套接字
這個資料結構後面還會深入研究。
下面再看一下建立tcp套接字的初始化,這個函數到哪裡找呢?
socket_layer ----> transport_layer 使用inet_protosw結構
transport_layer ----> ip_layer 使用 net_protocol結構
這兩者都在inet_init中初始化的。
而建立一個套接字會調用到inet_create()函數,再根據AF_INET,SOCK_STREAM找到一個tcp的inet_protosw項,然後調用sk->sk_prot->init(sk),
是以,建立tcp套接字時的初始化是在結構體tcp_prot中的tcp_v4_init_sock函數。
/*
 * tcp_v4_init_sock - per-socket initialization for a new TCP/IPv4 socket.
 *
 * Installed as tcp_prot.init and invoked from inet_create() via
 * sk->sk_prot->init(sk). Sets the initial RTO, congestion window,
 * MSS, congestion-control ops and default buffer sizes.
 * Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
skb_queue_head_init(&tp->out_of_order_queue);
/* Set up the retransmit / delayed-ack / keepalive timers. */
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT;
/* Initial congestion window of 2 segments (RFC 2581 era default). */
tp->snd_cwnd = 2;
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
tp->snd_cwnd_clamp = ~0;
/* Conservative initial MSS (RFC 1122 default) until negotiated. */
tp->mss_cache = 536;
tp->reordering = sysctl_tcp_reordering;
/* Congestion-control operations (placeholder until handshake). */
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
sk->sk_state = TCP_CLOSE;
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
/* icsk_af_ops selects which network layer TCP runs on; here IPv4. */
icsk->icsk_af_ops = &ipv4_specific;
icsk->icsk_sync_mss = tcp_sync_mss;
/* Default socket buffer sizes come from the tcp_init() sysctls. */
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
atomic_inc(&tcp_sockets_allocated);
return 0;
}
listen
前面說過listen()調用的是sock->ops->listen,而ops在inet_create中被指派為inet_protosw的ops,tcp對應的是inet_stream_ops,是以listen最終調用到inet_listen
/*
 * inet_listen - listen() entry point for AF_INET stream sockets.
 *
 * Reached via sock->ops->listen (inet_stream_ops). Validates that the
 * socket is an unconnected SOCK_STREAM in CLOSE or LISTEN state, starts
 * listening via inet_csk_listen_start() on the first call, and records
 * the accept-queue backlog limit. Returns 0 or a negative errno.
 */
int inet_listen(struct socket *sock, int backlog)
{
struct sock *sk = sock->sk;
unsigned char old_state;
int err;
lock_sock(sk);
err = -EINVAL;
if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
goto out;
old_state = sk->sk_state;
/* Only CLOSE or LISTEN states may (re-)enter listen. */
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto out;
/* First listen() on this socket: do the real setup. Subsequent
 * calls merely update the backlog below. */
if (old_state != TCP_LISTEN) {
err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
if (err)
goto out;
}
sk->sk_max_ack_backlog = backlog;
err = 0;
out:
release_sock(sk);
return err;
}
只有當sock結構體中的state為TCP_CLOSE時才會進入真正的listen
/*
 * inet_csk_listen_start - move a connection-oriented socket into LISTEN.
 *
 * Allocates the accept queue (including the SYN request hash of
 * nr_table_entries buckets), sets the state to TCP_LISTEN, confirms
 * the local port is still available, and hashes the socket into the
 * global listening table. Returns 0 on success or -EADDRINUSE if the
 * port check fails (state is rolled back to TCP_CLOSE).
 */
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
struct inet_sock *inet = inet_sk(sk);
/* Cast the sock to the TCP-specific inet_connection_sock. The object
 * was allocated in inet_create() via sk_alloc, sized by the objsize
 * field of the matching inet_protosw entry (per-protocol size). */
struct inet_connection_sock *icsk = inet_csk(sk);
/* Initialize icsk_accept_queue — the queue of completed connections
 * waiting to be accept()ed. */
int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
if (rc != 0)
return rc;
sk->sk_max_ack_backlog = 0;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
/* Transition to TCP_LISTEN before the port check so concurrent
 * lookups see a consistent state. */
sk->sk_state = TCP_LISTEN;
if (!sk->sk_prot->get_port(sk, inet->num)) {
inet->sport = htons(inet->num);
sk_dst_reset(sk);
/* This hash callback is tcp_v4_hash in tcp_prot. It inserts the
 * sock into the global tcp_hashinfo according to its state:
 * TCP_LISTEN sockets go into tcp_hashinfo->listening_hash,
 * everything else into the inet_ehash_bucket table. */
sk->sk_prot->hash(sk);
return 0;
}
/* Port grab failed: undo the state change and free the accept queue. */
sk->sk_state = TCP_CLOSE;
__reqsk_queue_destroy(&icsk->icsk_accept_queue);
return -EADDRINUSE;
}
從上面的代碼可以看到listen的開銷在于:
1) 初始化sock_state;
2) 初始化icsk_accept_queue;
3) 把sock插入到tcp_hashinfo->listening_hash.
完成listen之後,會把sock結構體加入到tcp_hashinfo的bind_hash和listening_hash兩張hash表中。
accept
accept最終調用到sys_accept:
/*
 * sys_accept - accept(2) system-call handler.
 *
 * NOTE(review): abridged excerpt — the out / out_put / out_fd error
 * labels and the trailing returns are not shown here.
 *
 * Looks up the listening socket, allocates a fresh socket + fd pair
 * for the incoming connection, delegates the actual accept to the
 * protocol via sock->ops->accept, optionally copies the peer address
 * back to user space, and publishes the new fd.
 */
asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen)
{
struct socket *sock, *newsock;
struct file *newfile;
int err, len, newfd, fput_needed;
char address[MAX_SOCK_ADDR];
/* Find the listening socket that belongs to fd. */
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (!sock)
goto out;
err = -ENFILE;
/* Allocate a socket object for the new connection. */
if (!(newsock = sock_alloc()))
goto out_put;
newsock->type = sock->type;
newsock->ops = sock->ops;
__module_get(newsock->ops->owner);
/* Allocate a file descriptor for the new connection. */
newfd = sock_alloc_fd(&newfile);
if (unlikely(newfd < 0)) {
err = newfd;
sock_release(newsock);
goto out_put;
}
/* Bind the new fd and the new socket together. */
err = sock_attach_fd(newsock, newfile);
if (err < 0)
goto out_fd;
err = security_socket_accept(sock, newsock);
if (err)
goto out_fd;
/* sock->ops->accept performs the protocol-level accept;
 * for TCP, ops is inet_stream_ops (-> inet_accept). */
err = sock->ops->accept(sock, newsock, sock->file->f_flags);
if (err < 0)
goto out_fd;
/* Caller asked for the peer address: fetch and copy it out. */
if (upeer_sockaddr) {
if(newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 2)<0) {
err = -ECONNABORTED;
goto out_fd;
}
err = move_addr_to_user(address, len, upeer_sockaddr, upeer_addrlen);
if (err < 0)
goto out_fd;
}
/* Make the new fd visible to user space only after full setup. */
fd_install(newfd, newfile);
err = newfd;
security_socket_post_accept(sock, newsock);
}
函數的跳轉在上面這張圖中,重點來看最關鍵的函數inet_csk_accept:
/*
 * inet_csk_accept - dequeue (or wait for) an established connection.
 *
 * Core of accept() for connection-oriented inet sockets. If the accept
 * queue already holds a completed connection it is returned at once;
 * otherwise the caller blocks (up to the socket's receive timeout)
 * unless the socket is non-blocking, in which case -EAGAIN is reported.
 * Returns the child sock on success, or NULL with *err set.
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *newsk;
int error;
lock_sock(sk);
error = -EINVAL;
/* accept() is only valid on a listening socket. */
if (sk->sk_state != TCP_LISTEN)
goto out_err;
/* Check whether icsk_accept_queue has pending connections:
 * non-empty — a connection is already established, just take it;
 * empty — no connection yet, block and wait for one. */
if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
error = -EAGAIN;
/* Non-blocking listener: return immediately. */
if (!timeo)
goto out_err;
/* Block until a connection arrives or the timeout expires. */
error = inet_csk_wait_for_connect(sk, timeo);
if (error)
goto out_err;
}
newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
/* Queued children must already be fully established. */
BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
out:
release_sock(sk);
return newsk;
out_err:
newsk = NULL;
*err = error;
goto out;
}