send調用
所有和socket相關的調用都是通過sys_socketcall轉發
/*
 * sys_socketcall - single entry point that forwards socket-related calls.
 *
 * Switches on `call` and invokes the matching sys_* handler, storing the
 * handler's result in err.
 *
 * NOTE: abridged listing. The declarations of err, a, a0 and a1, the code
 * that fills them from `args`, the remaining cases (the "..." lines) and
 * the return statement are not shown here.
 */
asmlinkage long sys_socketcall(int call, unsigned long __user *args)
{
switch(call)
{
case SYS_SOCKET:
err = sys_socket(a0,a1,a[2]);
break;
case SYS_SEND:
// a1 is passed on as a user-space buffer pointer
err = sys_send(a0, (void __user *)a1, a[2], a[3]);
break;
case SYS_SENDTO:
// same as SYS_SEND plus a user-space destination address and a[5]
// (presumably the address length - not visible in this listing)
err = sys_sendto(a0,(void __user *)a1, a[2], a[3],
(struct sockaddr __user *)a[4], a[5]);
break;
...
...
}
}
最終調用sock_sendmsg
/*
 * sock_sendmsg - send a message on a socket and wait for the result.
 *
 * Sets up an on-stack kiocb whose private pointer refers to an on-stack
 * sock_iocb, then hands the request to __sock_sendmsg(). If that call
 * reports -EIOCBQUEUED, the result of wait_on_sync_kiocb() is returned
 * instead.
 *
 * Returns whatever __sock_sendmsg() or wait_on_sync_kiocb() returned.
 */
int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
	struct sock_iocb sock_ctx;
	struct kiocb sync_iocb;
	int rc;

	init_sync_kiocb(&sync_iocb, NULL);
	sync_iocb.private = &sock_ctx;

	rc = __sock_sendmsg(&sync_iocb, sock, msg, size);
	if (rc != -EIOCBQUEUED)
		return rc;

	/* The request was queued: block until the kiocb completes. */
	return wait_on_sync_kiocb(&sync_iocb);
}
/*
 * __sock_sendmsg - record the request in the sock_iocb and dispatch it.
 *
 * Stores sock, msg and size (and a NULL scm) in the sock_iocb obtained from
 * the kiocb, runs the security_socket_sendmsg() check, and on success calls
 * the socket's ops->sendmsg handler.
 *
 * Returns the non-zero result of the security check if it fails, otherwise
 * the result of sock->ops->sendmsg().
 */
static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
				 struct msghdr *msg, size_t size)
{
	struct sock_iocb *ctx = kiocb_to_siocb(iocb);
	int denied;

	ctx->sock = sock;
	ctx->scm = NULL;
	ctx->msg = msg;
	ctx->size = size;

	denied = security_socket_sendmsg(sock, msg, size);
	return denied ? denied : sock->ops->sendmsg(iocb, sock, msg, size);
}
會調用具體的socket->ops->sendmsg方法
而建立socket的時候在方法inet_create中根據AF_INET和RAW參數找到的ops是inet_sockraw_ops
/*
 * proto_ops table named inet_sockraw_ops (family PF_INET).
 * Operations with sock_no_* handlers (socketpair, accept, listen, mmap) are
 * presumably unsupported for this socket type - confirm against the
 * sock_no_* implementations.
 */
static struct proto_ops inet_sockraw_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_dgram_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = inet_getname,
.poll = datagram_poll,
.ioctl = inet_ioctl,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
// send path: this is the handler reached through sock->ops->sendmsg
.sendmsg = inet_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
};
是以,此處socket->ops->sendmsg是
.sendmsg = inet_sendmsg,
中的inet_sendmsg方法。
從sys_socketcall()到socket->ops->sendmsg是socket層負責完成的事情,接下來具體的協定來決定如何發送資料。
接着看socket->ops->sendmsg的細節。
/*
 * inet_sendmsg - socket-layer send handler for inet sockets.
 *
 * If the inet socket's num field is zero (presumably: no local port bound
 * yet), inet_autobind() is attempted first; a non-zero result from it
 * aborts the send with -EAGAIN. Otherwise the message is handed to the
 * protocol's own sendmsg via sk->sk_prot.
 */
int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size)
{
	struct sock *sk = sock->sk;

	if (!inet_sk(sk)->num) {
		if (inet_autobind(sk))
			return -EAGAIN;
	}

	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
}
可以看出socket層的發送函數inet_sendmsg負責完成端口的綁定之後,然後就調用具體協定的發送函數sk->sk_prot->sendmsg了,這裡的sk_prot是raw_prot。
對于raw套接字,資料包在transport層僅僅需要做以上這些處理,然後就進入IP層。IP層主要的工作是決定這個資料包該發向何處。
IP層的路由系統
在發送每個封包時,都必須要查詢發送接口。這個過程分為3個步驟:
1) 查詢路由cache;
2) 查詢FIB表;
3) 将最終結果填入路由cache.
路由cache
目的位址的cache表項和路由cache的表項是等價的。
通用的目的位址cache系統如下:
/* One bucket of the route-cache hash table: the head of a chain of rtable entries. */
struct rt_hash_bucket {
struct rtable*chain;
};
這是開鍊的hash表。
rtable的結構體如下
/*
 * Route cache entry. The leading union lets the same object be treated
 * either as a destination cache entry (u.dst) or as a link in a chain of
 * rtable entries (u.rt_next).
 */
struct rtable
{
union
{
struct dst_entry dst; // destination cache entry
struct rtable *rt_next; // next route entry
} u;
struct in_device *idev;
unsigned rt_flags;
unsigned rt_type;
__u32 rt_dst;
__u32 rt_src;
int rt_iif;
__u32 rt_gateway;
// key information used when looking this route up
struct flowi fl;
/* Miscellaneous cached information */
__u32 rt_spec_dst; /* RFC1122 specific destination */
struct inet_peer *peer; /* long-living peer info */
};
注意:rtable的第一個元素u是一個共同體,rtable的第一個元素既可以看作目的cache的指針也可以看作路由表項,如圖所示。
注意:在hash表中比對路由時,key的計算資訊都在flowi結構體中。
/*
 * Flow identifier: the key that distinguishes one traffic flow from another
 * during route lookup.
 * NOTE: abridged listing - it stops before the end of the structure, so the
 * remaining members and the closing brace are not shown here.
 */
struct flowi {
int oif; // output interface
int iif; // input interface
union {
// IPv4 key
struct {
__u32 daddr;
__u32 saddr;
__u32 fwmark; // firewall mark
__u8 tos; // type of service
__u8 scope; // distance to the destination, used to classify routes
} ip4_u;
// IPv6 key
struct {
struct in6_addr daddr;
struct in6_addr saddr;
__u32 flowlabel;
} ip6_u;
// 16-bit address key (presumably DECnet, judging by the dn_ prefix - confirm)
struct {
__u16 daddr;
__u16 saddr;
__u32 fwmark;
__u8 scope;
} dn_u;
} nl_u;
這個結構體區分不同的業務流,i意為identifier。
oif和iif字段:确定output、input接口。iif是輸入接口的索引值,它是從net_device結構裡的ifindex擷取的,這裡的net_device是接收到封包的裝置。
fwmark:防火牆mark,流量shaping。
tos:type of service。
scope:是到目的位址的距離,用來歸類路由。
可以看出:路由的本質是網絡中不同的業務流的辨別,而flowi是核心中表示業務流的結構。
再來看dst_entry,這個是目的位址的cache表項。dst_entry的成員dst_ops,指向管理dst_entry函數,供arp協定調用。
對于IPRoute Cache來說,
/*
 * Table of callbacks and bookkeeping fields used to manage dst_entry
 * objects of one protocol family.
 */
struct dst_ops
{
unsigned short family;
unsigned short protocol;
unsigned gc_thresh;
// callbacks operating on dst_entry objects
int (*gc)(void);
struct dst_entry * (*check)(struct dst_entry *, __u32 cookie);
void (*destroy)(struct dst_entry *);
void (*ifdown)(struct dst_entry *,
struct net_device *dev, int how);
struct dst_entry * (*negative_advice)(struct dst_entry *);
void (*link_failure)(struct sk_buff *);
void (*update_pmtu)(struct dst_entry *dst, u32 mtu);
int (*get_mss)(struct dst_entry *dst, u32 mtu);
// presumably the size of one entry, the live-entry count and the slab
// cache entries come from - TODO confirm against the allocator code
int entry_size;
atomic_t entries;
kmem_cache_t *kmem_cachep;
};
family:AF_INET
protocol:0x800
destroy:ipv4_dst_destroy
在下面的raw_sendmsg中,會調用ip_route_output_flow函數,根據要發送的封包查找目的位址的cache表項。路由cache的查找由__ip_route_output_key完成:
/*
 * __ip_route_output_key - look up an output route in the route cache.
 *
 * @rp:  receives the matching rtable on a cache hit
 * @flp: flow key describing the packet to be sent
 *
 * Hashes the flow key, walks the corresponding chain of rt_hash_table under
 * rcu_read_lock_bh(), and on a match takes a reference on the entry,
 * stores it in *rp and returns 0. On a miss it returns the result of
 * ip_route_output_slow().
 */
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
unsigned hash;
struct rtable *rth;
// compute the hash value from the flow key
hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
rcu_read_lock_bh();
// search the global hash table rt_hash_table
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->u.rt_next)) {
// match on destination, source, output interface and the masked TOS bits;
// only entries whose input interface is 0 are considered
if (rth->fl.fl4_dst == flp->fl4_dst &&
rth->fl.fl4_src == flp->fl4_src &&
rth->fl.iif == 0 &&
rth->fl.oif == flp->oif &&
!((rth->fl.fl4_tos ^ flp->fl4_tos) &
(IPTOS_RT_MASK | RTO_ONLINK))) {
// found: update usage data, take a reference and return this entry
rth->u.dst.lastuse = jiffies;
dst_hold(&rth->u.dst);
rth->u.dst.__use++;
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
*rp = rth;
return 0;
}
RT_CACHE_STAT_INC(out_hlist_search);
}
rcu_read_unlock_bh();
// cache miss: fall back to full route resolution
return ip_route_output_slow(rp, flp);
}
發送細節:raw_sendmsg
/*
 * raw_sendmsg - protocol-level send handler for raw sockets (abridged).
 *
 * Builds a flow key, asks the routing code for a route, then either sends
 * the caller-supplied packet as is (inet->hdrincl set) or appends the data
 * and pushes the pending frames out through the IP layer.
 *
 * NOTE: abridged listing. The declarations and setup of ipc, daddr, saddr,
 * tos, rt and err, all error handling and the return statement are omitted.
 */
static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len)
{
// argument and address checks are skipped in this listing
struct inet_sock *inet = inet_sk(sk);
{
struct flowi fl = { .oif = ipc.oif,
.nl_u = { .ip4_u =
{ .daddr = daddr, // destination address
.saddr = saddr, // source address
.tos = tos } },
// When inet->hdrincl is 0, proto is taken from sk->sk_protocol. For a raw
// ICMP socket (socket(AF_INET, SOCK_RAW, IPPROTO_ICMP)) that is
// IPPROTO_ICMP, whose value is 1 (17 is IPPROTO_UDP).
.proto = inet->hdrincl ? IPPROTO_RAW :
sk->sk_protocol,
};
// hdrincl not set: let raw_probe_proto_opt() inspect the message
// (for ICMP, presumably to copy the type and code into fl - confirm)
if (!inet->hdrincl)
raw_probe_proto_opt(&fl, msg);
// the routing code computes the route from fl and returns it in rt
err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
}
// hdrincl set: the data is passed to raw_send_hdrinc() and sent directly
if (inet->hdrincl)
err = raw_send_hdrinc(sk, msg->msg_iov, len,
rt, msg->msg_flags);
else {
if (!ipc.addr)
ipc.addr = rt->rt_dst;
lock_sock(sk);
// queue the data on the socket; small pieces are merged here
err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
&ipc, rt, msg->msg_flags);
// start sending from the IP layer
// NOTE(review): as listed, this overwrites err from ip_append_data()
// without checking it - the check is presumably among the omitted lines
err = ip_push_pending_frames(sk);
release_sock(sk);
}
}
IP層的發送
/*
 * ip_push_pending_frames - combine the queued buffers of a socket into one
 * IP packet, fill in the IP header and send it (abridged).
 *
 * The first skb on sk->sk_write_queue becomes the packet head; every further
 * queued skb is linked onto its frag_list. The IPv4 header is then written
 * and the packet is passed through the NF_IP_LOCAL_OUT netfilter hook to
 * dst_output.
 *
 * NOTE: abridged listing. The `out` label, the return statement and the
 * code that assigns ttl, df and opt are not shown, so in this listing ttl is
 * never set and the `if (opt)` branch is never taken.
 */
int ip_push_pending_frames(struct sock *sk)
{
struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb;
struct inet_sock *inet = inet_sk(sk);
struct ip_options *opt = NULL;
struct rtable *rt = inet->cork.rt;
struct iphdr *iph;
int df = 0;
__u8 ttl;
int err = 0;
// nothing queued: nothing to send
if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
goto out;
tail_skb = &(skb_shinfo(skb)->frag_list);
// move the data pointer to the IP (network) header
if (skb->data < skb->nh.raw)
__skb_pull(skb, skb->nh.raw - skb->data);
// chain every remaining queued skb onto the head skb's frag_list and
// account its length and truesize on the head
while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
*tail_skb = tmp_skb;
tail_skb = &(tmp_skb->next);
skb->len += tmp_skb->len;
skb->data_len += tmp_skb->len;
skb->truesize += tmp_skb->truesize;
// detach the fragment from the socket; only the head skb stays owned
__sock_put(tmp_skb->sk);
tmp_skb->destructor = NULL;
tmp_skb->sk = NULL;
}
// fill in the IP header
iph = (struct iphdr *)skb->data;
iph->version = 4;
iph->ihl = 5;
if (opt) {
iph->ihl += opt->optlen>>2;
ip_options_build(skb, opt, inet->cork.addr, rt, 0);
}
iph->tos = inet->tos;
iph->tot_len = htons(skb->len);
iph->frag_off = df;
if (!df) {
__ip_select_ident(iph, &rt->u.dst, 0);
} else {
iph->id = htons(inet->id++);
}
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
ip_send_check(iph);
skb->priority = sk->sk_priority;
// attach the routing result to the sk_buff's dst member
skb->dst = dst_clone(&rt->u.dst);
// netfilter: the NF_IP_LOCAL_OUT hook (one of the five hook points)
// is invoked here; dst_output is the function that then sends the sk_buff
err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
skb->dst->dev, dst_output);
}
dst_output在路由系統中的mkroute_output中設定為ip_output():
/*
 * ip_output - count the outgoing request and fragment the packet if needed.
 *
 * A packet longer than the path MTU of its route is handed to
 * ip_fragment(), unless tso_size is set in its shared info; every other
 * packet goes straight to ip_finish_output().
 */
int ip_output(struct sk_buff *skb)
{
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	/* Fits within the path MTU, or tso_size is set: no fragmentation. */
	if (skb->len <= dst_pmtu(skb->dst) || skb_shinfo(skb)->tso_size)
		return ip_finish_output(skb);

	return ip_fragment(skb, ip_finish_output);
}
ip_output會判斷skb->len>dst_pmtu(skb->dst)是否需要分片。然後,調用ip_finish_output發送。
/*
 * ip_finish_output - bind the packet to its output device and run the
 * NF_IP_POST_ROUTING netfilter hook (one of the five hook points), which
 * continues with ip_finish_output2.
 */
int ip_finish_output(struct sk_buff *skb)
{
	struct net_device *out_dev = skb->dst->dev;

	skb->protocol = htons(ETH_P_IP);
	skb->dev = out_dev;

	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, out_dev,
		       ip_finish_output2);
}
進入ip_finish_output2:
這個函數的任務是構造2層封包的MAC位址。并發送這個2層封包。
/*
 * ip_finish_output2 - prepend the link-layer header and hand the packet to
 * the layer below.
 *
 * Ensures the skb has enough headroom for the device's link-layer header,
 * then either copies the cached hardware header (dst->hh) in front of the
 * data and calls hh->hh_output, or falls back to dst->neighbour->output.
 *
 * Returns the result of the output function, -ENOMEM if more headroom could
 * not be allocated, or -EINVAL when neither a cached header nor a neighbour
 * is available (the skb is freed in both error cases).
 */
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
struct hh_cache *hh = dst->hh;
struct net_device *dev = dst->dev;
int hh_len = LL_RESERVED_SPACE(dev);
// not enough headroom for the link-layer header: reallocate
if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
if (skb2 == NULL) {
kfree_skb(skb);
return -ENOMEM;
}
// keep the new skb charged to the same socket before dropping the old one
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
kfree_skb(skb);
skb = skb2;
}
// dst->neighbour->output points to neigh_resolve_output(); it is assigned in
// arp_constructor() when the neighbour is initialised.
// hh->hh_output points to dev_queue_xmit().
if (hh) {
int hh_alen;
// copy the cached hardware header in front of the data under the hh lock
read_lock_bh(&hh->hh_lock);
hh_alen = HH_DATA_ALIGN(hh->hh_len);
memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
read_unlock_bh(&hh->hh_lock);
skb_push(skb, hh->hh_len);
return hh->hh_output(skb);
} else if (dst->neighbour)
return dst->neighbour->output(skb);
if (net_ratelimit())
printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
kfree_skb(skb);
return -EINVAL;
}
從dst中取出hh(hard header cache),緩存了mac的資訊(鄰居的mac,自己的mac,協定号)。
在設定完dev->hard_header後,調用hh->hh_output()或dst->neighbour->output()發送2層封包。
如果沒有找到hh,就說明這個目的IP對應的mac位址 不在本機的鄰居系統裡,需要發送arp查詢封包。
下面看看如何發送arp查詢封包。
先看dst->neighbour->output()在arp_constructor被設定成了neigh_resolve_output:
/*
 * neigh_resolve_output - neighbour output path used when no cached hardware
 * header is available (abridged).
 *
 * Calls neigh_event_send(); when that returns 0 the link-layer header is
 * built with dev->hard_header() under neigh->lock and the packet is queued
 * for transmission via neigh->ops->queue_xmit().
 *
 * NOTE: abridged listing. The `discard` and `out_kfree_skb` labels and the
 * return statement are not shown.
 */
int neigh_resolve_output(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
struct neighbour *neigh;
int rc = 0;
if (!dst || !(neigh = dst->neighbour))
goto discard;
// move the data pointer to the network header
__skb_pull(skb, skb->nh.raw - skb->data);
// 0 from neigh_event_send() means the packet can be sent right away
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
if (dev->hard_header_cache && !dst->hh) {
// the device supports header caching and none exists yet: take the
// write lock, re-check, and initialise the cache before building the header
write_lock_bh(&neigh->lock);
if (!dst->hh)
neigh_hh_init(neigh, dst, dst->ops->protocol);
err = dev->hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
write_unlock_bh(&neigh->lock);
} else {
// no cache to set up: a read lock is enough while neigh->ha is read
read_lock_bh(&neigh->lock);
err = dev->hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
read_unlock_bh(&neigh->lock);
}
if (err >= 0)
rc = neigh->ops->queue_xmit(skb);
else
goto out_kfree_skb;
}
}
這個函數的目标就是發送arp封包, 在neigh_event_send()函數中實作:
/*
 * __neigh_event_send - advance the neighbour state machine for an outgoing
 * packet and, if the address is still unresolved, park the packet.
 *
 * @neigh: neighbour entry the packet is addressed to
 * @skb:   packet waiting for the link-layer address (may be NULL)
 *
 * Returns 0 when the caller may transmit immediately, 1 when the packet was
 * queued on neigh->arp_queue or dropped because resolution is not possible.
 * No request is sent from here: the function only sets the state and arms
 * neigh->timer.
 */
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
int rc;
unsigned long now;
write_lock_bh(&neigh->lock);
rc = 0;
// already connected, delayed or probing: nothing to do, send right away
if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
goto out_unlock_bh;
now = jiffies;
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
// start resolution: mark the entry incomplete and arm the timer,
// holding a reference on behalf of the pending timer
atomic_set(&neigh->probes, neigh->parms->ucast_probes);
neigh->nud_state = NUD_INCOMPLETE;
neigh_hold(neigh);
neigh->timer.expires = now + 1;
add_timer(&neigh->timer);
} else {
// no probes configured: resolution cannot succeed, drop the packet
neigh->nud_state = NUD_FAILED;
write_unlock_bh(&neigh->lock);
if (skb)
kfree_skb(skb);
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
// stale entry: move to NUD_DELAY and arm the timer; rc stays 0
NEIGH_PRINTK2("neigh %p is delayed.\n", neigh);
neigh_hold(neigh);
neigh->nud_state = NUD_DELAY;
neigh->timer.expires = jiffies + neigh->parms->delay_probe_time;
add_timer(&neigh->timer);
}
if (neigh->nud_state == NUD_INCOMPLETE) {
if (skb) {
// queue full: drop the oldest waiting packet to make room
if (skb_queue_len(&neigh->arp_queue) >=
neigh->parms->queue_len) {
struct sk_buff *buff;
buff = neigh->arp_queue.next;
__skb_unlink(buff, &neigh->arp_queue);
kfree_skb(buff);
}
__skb_queue_tail(&neigh->arp_queue, skb);
}
rc = 1;
}
out_unlock_bh:
write_unlock_bh(&neigh->lock);
return rc;
}
發送arp封包并不是我們想的調用相關的發送函數,
而是設定neigh->nud_state為NUD_INCOMPLETE狀态,
同時添加一個timer,把skb挂到neigh->arp_queue隊列中。
在timer的回調裡從skb取出必要資訊,構造arp封包。
典型的 非阻塞操作+回調函數。
neighbour->timer是在neigh_alloc中配置設定的,指向neigh_timer_handler(),隻看和NUD_INCOMPLETE相關的代碼
/*
 * neigh_timer_handler - timer callback of a neighbour entry (abridged).
 *
 * @arg: the struct neighbour pointer, passed as unsigned long
 *
 * Re-arms the timer while the entry is in a timed state and, for entries in
 * NUD_INCOMPLETE or NUD_PROBE, calls neigh->ops->solicit() with the first
 * packet waiting on arp_queue (if any).
 *
 * NOTE: abridged listing - only the code relevant to NUD_INCOMPLETE is
 * shown. The state-transition logic and the unlock on the path that skips
 * the last `if` are omitted; `state` and `notify` are unused in what remains.
 */
static void neigh_timer_handler(unsigned long arg)
{
unsigned long now, next;
struct neighbour *neigh = (struct neighbour *)arg;
unsigned state;
int notify = 0;
write_lock(&neigh->lock);
state = neigh->nud_state;
now = jiffies;
next = now + HZ;
if (neigh->nud_state & NUD_IN_TIMER) {
// still in a timed state: re-arm the timer, at least HZ/2 from now
neigh_hold(neigh);
if (time_before(next, jiffies + HZ/2))
next = jiffies + HZ/2;
neigh->timer.expires = next;
add_timer(&neigh->timer);
}
if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
struct sk_buff *skb = skb_peek(&neigh->arp_queue);
// take an extra reference so the skb stays valid after the lock is dropped
if (skb)
skb_get(skb);
write_unlock(&neigh->lock);
// send the solicitation outside the lock
neigh->ops->solicit(neigh, skb);
atomic_inc(&neigh->probes);
// drop the extra reference taken above
if (skb)
kfree_skb(skb);
}
}
最終在timer的回調中調用neigh->ops->solicit,發送arp,然後就傳回了。
到此,一個封包穿過了transport層,IP層,到了鍊路層,如果有緩存hh,則直接拷貝緩存的mac位址;如果沒有則發送arp查詢封包。
在arp的協助下構造好了2層封包後,就會調用裝置層的發送函數發送這個封包,咱們下節再續。