connect
sys_connect調用棧如下:
最終調用inetsw_array的inet_stream_ops中的connect方法。
int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
int err;
long timeo;
lock_sock(sk);
switch (sock->state) {
default:
err = -EINVAL;
goto out;
case SS_CONNECTED:
err = -EISCONN;
goto out;
case SS_CONNECTING:
err = -EALREADY;
break;
case SS_UNCONNECTED:
err = -EISCONN;
if (sk->sk_state != TCP_CLOSE)
goto out;
// tcp_v4_connect
err = sk->sk_prot->connect(sk, uaddr, addr_len);
if (err < 0)
goto out;
// SS_CONNECTING 和 SS_CONNECTED的确别是,在non_blocking中傳回值是 EINPROGRESS
sock->state = SS_CONNECTING;
err = -EINPROGRESS;
break;
}
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (!timeo || !inet_wait_for_connect(sk, timeo))
goto out;
err = sock_intr_errno(timeo);
if (signal_pending(current))
goto out;
}
// 連接配接被RST
if (sk->sk_state == TCP_CLOSE)
goto sock_error;
/* sk->sk_err may be not zero now, if RECVERR was ordered by user
* and error was received after socket entered established state.
* Hence, it is handled normally after connect() return successfully.
*/
sock->state = SS_CONNECTED;
err = 0;
out:
release_sock(sk);
return err;
}
sk->sk_prot->connect指向tcp_v4_connect():
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct rtable *rt;
u32 daddr, nexthop;
int tmp;
int err;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
if (usin->sin_family != AF_INET)
return -EAFNOSUPPORT;
nexthop = daddr = usin->sin_addr.s_addr;
if (inet->opt && inet->opt->srr) {
if (!daddr)
return -EINVAL;
nexthop = inet->opt->faddr;
}
// 查詢路由表
tmp = ip_route_connect(&rt, nexthop, inet->saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
inet->sport, usin->sin_port, sk);
if (tmp < 0)
return tmp;
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
return -ENETUNREACH;
}
if (!inet->opt || !inet->opt->srr)
daddr = rt->rt_dst;
if (!inet->saddr)
inet->saddr = rt->rt_src;
inet->rcv_saddr = inet->saddr;
if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
tp->write_seq = 0;
}
if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
struct inet_peer *peer = rt_get_peer(rt);
/* VJ's idea. We save last timestamp seen from
* the destination in peer table, when entering state TIME-WAIT
* and initialize rx_opt.ts_recent from it, when trying new connection.
*/
if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
tp->rx_opt.ts_recent = peer->tcp_ts;
}
}
// 指定目的位址和端口
inet->dport = usin->sin_port;
inet->daddr = daddr;
inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet->opt)
inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
tp->rx_opt.mss_clamp = 536;
tcp_set_state(sk, TCP_SYN_SENT);
// 給套接字綁定一個源端口,并且hash
// 把sk插入到established表中
err = inet_hash_connect(&tcp_death_row, sk);
if (err)
goto failure;
err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk);
if (err)
goto failure;
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->u.dst);
// 生成ISN
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->saddr,
inet->daddr,
inet->sport,
usin->sin_port);
inet->id = tp->write_seq ^ jiffies;
// 非常重要!
// 構造一個SYN包,然後發送。
err = tcp_connect(sk);
rt = NULL;
if (err)
goto failure;
return 0;
failure:
tcp_set_state(sk, TCP_CLOSE);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->dport = 0;
return err;
}
inet_hash_connect 是個很重要的函數,給sk指定一個源端口,并且插入到established表中,配置設定源端口的過程中還要判斷resue,tw等等。
以後會重點分析,這裡先發connect的流程走完。
int tcp_connect(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
tcp_connect_init(sk);
// 配置設定skbuff
buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
if (unlikely(buff == NULL))
return -ENOBUFS;
skb_reserve(buff, MAX_TCP_HEADER);
// 構造SYN包
TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
TCP_ECN_send_syn(sk, tp, buff);
TCP_SKB_CB(buff)->sacked = 0;
skb_shinfo(buff)->gso_segs = 1;
skb_shinfo(buff)->gso_size = 0;
skb_shinfo(buff)->gso_type = 0;
buff->csum = 0;
tp->snd_nxt = tp->write_seq;
TCP_SKB_CB(buff)->seq = tp->write_seq++;
TCP_SKB_CB(buff)->end_seq = tp->write_seq;
// 設定發送的時間
TCP_SKB_CB(buff)->when = tcp_time_stamp;
tp->retrans_stamp = TCP_SKB_CB(buff)->when;
skb_header_release(buff);
// 把要發送的skbuf插入到sk->sk_write_queue尾部
__skb_queue_tail(&sk->sk_write_queue, buff);
sk_charge_skb(sk, buff);
tp->packets_out += tcp_skb_pcount(buff);
// 發送skbuff
// 發送函數是:icsk->icsk_af_ops = &ipv4_specific;
tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
tp->snd_nxt = tp->write_seq;
tp->pushed_seq = tp->write_seq;
TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
// 定時器的設定!!!
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
Inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
return 0;
}
如何發送一個tcp包
看一下如何發送一個TCP封包tcp_transmit_skb():
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet;
struct tcp_sock *tp;
struct tcp_skb_cb *tcb;
int tcp_header_size;
struct tcphdr *th;
int sysctl_flags;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
if (icsk->icsk_ca_ops->rtt_sample)
__net_timestamp(skb);
if (likely(clone_it)) {
if (unlikely(skb_cloned(skb)))
skb = pskb_copy(skb, gfp_mask);
else
skb = skb_clone(skb, gfp_mask);
if (unlikely(!skb))
return -ENOBUFS;
}
inet = inet_sk(sk);
tp = tcp_sk(sk);
tcb = TCP_SKB_CB(skb);
tcp_header_size = tp->tcp_header_len;
sysctl_flags = 0;
if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
// 如果是SYN包,設定相關options
tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
if(sysctl_tcp_timestamps) {
tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
}
if (sysctl_tcp_window_scaling) {
tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
sysctl_flags |= SYSCTL_FLAG_WSCALE;
}
if (sysctl_tcp_sack) {
sysctl_flags |= SYSCTL_FLAG_SACK;
if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
}
} else if (unlikely(tp->rx_opt.eff_sacks)) {
tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
(tp->rx_opt.eff_sacks *
TCPOLEN_SACK_PERBLOCK));
}
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
// 構造tcp頭部
th = (struct tcphdr *) skb_push(skb, tcp_header_size);
skb->h.th = th;
skb_set_owner_w(skb, sk);
th->source = inet->sport;
th->dest = inet->dport;
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(tp->rcv_nxt);
*(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->flags);
if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
th->window = htons(tp->rcv_wnd);
} else {
th->window = htons(tcp_select_window(sk));
}
th->check = 0;
th->urg_ptr = 0;
if (unlikely(tp->urg_mode &&
between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
th->urg_ptr = htons(tp->snd_up-tcb->seq);
th->urg = 1;
}
if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
// 構造tcp選項
tcp_syn_build_options((__u32 *)(th + 1),
tcp_advertise_mss(sk),
(sysctl_flags & SYSCTL_FLAG_TSTAMPS),
(sysctl_flags & SYSCTL_FLAG_SACK),
(sysctl_flags & SYSCTL_FLAG_WSCALE),
tp->rx_opt.rcv_wscale,
tcb->when,
tp->rx_opt.ts_recent);
} else {
tcp_build_and_update_options((__u32 *)(th + 1),
tp, tcb->when);
TCP_ECN_send(sk, tp, skb, tcp_header_size);
}
icsk->icsk_af_ops->send_check(sk, skb->len, skb);
if (likely(tcb->flags & TCPCB_FLAG_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
if (skb->len != tcp_header_size)
tcp_event_data_sent(tp, skb, sk);
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
TCP_INC_STATS(TCP_MIB_OUTSEGS);
// 調用底層協定的發送函數
// 此處是icsk->icsk_af_ops = &ipv4_specific
// 這是在tcp_v4_init中設定的
err = icsk->icsk_af_ops->queue_xmit(skb, 0);
if (likely(err <= 0))
return err;
tcp_enter_cwr(sk);
return err == NET_XMIT_CN ? 0 : err;
}
把TCP的SYN包交給IP層,
int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
struct ip_options *opt = inet->opt;
struct rtable *rt;
struct iphdr *iph;
rt = (struct rtable *) skb->dst;
if (rt != NULL)
goto packet_routed;
rt = (struct rtable *)__sk_dst_check(sk, 0);
if (rt == NULL) {
u32 daddr;
daddr = inet->daddr;
if(opt && opt->srr)
daddr = opt->faddr;
{
struct flowi fl = { .oif = sk->sk_bound_dev_if,
.nl_u = { .ip4_u =
{ .daddr = daddr,
.saddr = inet->saddr,
.tos = RT_CONN_FLAGS(sk) } },
.proto = sk->sk_protocol,
.uli_u = { .ports =
{ .sport = inet->sport,
.dport = inet->dport } } };
if (ip_route_output_flow(&rt, &fl, sk, 0))
goto no_route;
}
sk_setup_caps(sk, &rt->u.dst);
}
skb->dst = dst_clone(&rt->u.dst);
packet_routed:
if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
*((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
iph->tot_len = htons(skb->len);
if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->u.dst);
iph->protocol = sk->sk_protocol;
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
skb->nh.iph = iph;
if (opt && opt->optlen) {
iph->ihl += opt->optlen >> 2;
ip_options_build(skb, opt, inet->daddr, rt, 0);
}
ip_select_ident_more(iph, &rt->u.dst, sk,
(skb_shinfo(skb)->gso_segs ?: 1) - 1);
ip_send_check(iph);
skb->priority = sk->sk_priority;
// 注意: 這個地方會調用netfilter的機制
// dst_output在路由系統中的mkroute_output中設定為ip_output()
return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
dst_output);
no_route:
IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
skbuf一次穿過各層協定,調用連如下:
dst_output() -> ip_output() -> ip_finish_output2() -> dev_queue_xmit():
q = rcu_dereference(dev->qdisc);
if (q->enqueue) {
spin_lock(&dev->queue_lock);
rc = q->enqueue(skb, q);
qdisc_run(dev);
spin_unlock(&dev->queue_lock);
rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
goto out;
}
dev->hard_start_xmit(skb, dev)
可以看到skbuf被插入到了net_device的qdisc中,最後調用驅動hard_start_xmit。
hard_start_xmit是各個網卡驅動裡實作的回調。
大緻的過程為:在net_device->base_addr上把要發送的skbuf按照驅動的CMD和位元組順序拼接上去:
比如'TX_CMD_PORT:send_cmd|TX_LEN_PORT:skb->len|TX_FRAME_PORT:skb->data'
這樣驅動拿到這塊位元組就知道如何解析,該怎麼發送了。驅動的代碼是不是很簡單。
到這裡可以認為這個SYN包已經發出去了(隻是發送到驅動的buf裡),下面看看tcp_connect在發送出去一個包後,啟動的定時器。
定時器
tcp_connect()
{
// 發送skbuff
// 發送函數是:icsk->icsk_af_ops = &ipv4_specific;
tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
tp->snd_nxt = tp->write_seq;
tp->pushed_seq = tp->write_seq;
TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
// 定時器的設定!!!
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}
tcp 中有4種定時器:
#define ICSK_TIME_RETRANS1/* Retransmit timer */
#define ICSK_TIME_DACK2/* Delayed ack timer */
#define ICSK_TIME_PROBE03/* Zero window probe timer */
#define ICSK_TIME_KEEPOPEN4/* Keepalive timer */
在發送tcp資料中用到的是重傳定時器。
定時器的設定是在套接字初始化過程中進行的:
void tcp_init_xmit_timers(struct sock *sk)
{
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
}
可見,重傳定時器的回調是tcp_write_timer:
static void tcp_write_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
struct inet_connection_sock *icsk = inet_csk(sk);
int event;
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
goto out_unlock;
}
if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
goto out;
if (time_after(icsk->icsk_timeout, jiffies)) {
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
goto out;
}
event = icsk->icsk_pending;
icsk->icsk_pending = 0;
// 根據event類型決定如何處理重傳
switch (event) {
case ICSK_TIME_RETRANS:
// tcp重傳的邏輯,涉及TCP協定細節,後面專題分析
// TODO
tcp_retransmit_timer(sk);
break;
case ICSK_TIME_PROBE0:
tcp_probe_timer(sk);
break;
}
TCP_CHECK_TIMER(sk);
out:
sk_stream_mem_reclaim(sk);
out_unlock:
bh_unlock_sock(sk);
sock_put(sk);
}
到此,connect發送一個SYN包的邏輯已經全部完成,下面是接收TCP包。