作者:[email protected] 部落格:linuxfocus.blog.chinaunix.net
今天學習上次剩下的、UDP 發送資料調用路徑中的最後一個函數:ip_fragment。這個函數用于在IP資料包過大時,對其進行分片發送。
/*
 * This IP datagram is too large to be sent in one piece. Break it up into
 * smaller pieces (each of size equal to IP header plus
 * a block of the data of the original IP data part) that will yet fit in a
 * single device frame, and queue such a frame for sending.
 *
 * @skb:    the oversized datagram to fragment.
 * @output: callback invoked once per generated fragment; a non-zero
 *          return value aborts fragmentation and is propagated.
 *
 * Two strategies are used:
 *  - fast path: if the skb already carries a well-formed frag_list, each
 *    list member is turned into a fragment in place (no data copy);
 *  - slow path: otherwise a new skb is allocated per fragment and the
 *    header plus a slice of the payload is copied into it.
 *
 * Returns 0 on success, -EMSGSIZE if the datagram has DF set and
 * skb->local_df is clear, -ENOMEM if a fragment buffer cannot be
 * allocated, or the error returned by @output.
 *
 * NOTE(review): skb handed to @output is presumably consumed by it; this
 * function only frees what it has not passed on -- confirm against callers.
 */
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 * Point into the IP datagram header.
	 */
	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		/* Fragmentation is forbidden: count the failure, notify the
		 * sender with the usable MTU and drop the packet.
		 */
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 * Setup starting values.
	 */

	/* IP header length in bytes (ihl counts 32-bit words). */
	hlen = iph->ihl * 4;
	/* From here on 'mtu' is the route MTU minus the IP header, i.e. the
	 * maximum amount of IP payload one fragment may carry.
	 */
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	/* Flag this skb as having gone through fragmentation. */
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frags(skb)) {
		struct sk_buff *frag, *frag2;
		/* Length of the head skb (the first would-be fragment). */
		int first_len = skb_pagelen(skb);

		/*
		 * Fall back to the slow path when the head skb
		 *  1. carries more payload than fits in one fragment,
		 *  2. has a payload length that is not a multiple of 8,
		 *  3. is itself already a fragment (MF or offset set), or
		 *  4. is cloned.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF | IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		/* Validate every member of the frag_list; a single unsuitable
		 * one forces the slow path.
		 */
		skb_walk_frags(skb, frag) {
			/* Correct geometry: fits the MTU, 8-byte aligned unless
			 * it is the last one, and has headroom for the header.
			 */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		/* Detach the frag_list from the head skb. */
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		/* Shrink the head skb to its own data and turn it into the
		 * first fragment: fix lengths, set MF, recompute the checksum.
		 */
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				/* Current data start is the transport header. */
				skb_reset_transport_header(frag);
				/* Make room for, and install, a copy of the
				 * IP header in front of the payload.
				 */
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				/* Advance by the payload of the skb about to be
				 * sent; the header field is in 8-byte units.
				 */
				offset += skb->len - hlen;
				iph->frag_off = htons(offset >> 3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			/* Send the current fragment. */
			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		/* On error, free the fragments that were not sent. */
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		/* Undo the socket ownership and truesize changes applied to
		 * the fragments visited before the one that failed the check.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 * Fragment the datagram.
	 */

	/* Starting byte offset (the header stores it in 8-byte units). */
	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	/* Remember whether the datagram being split had MF set, i.e. was
	 * itself not the last fragment.
	 */
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 * Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending upto and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/*
		 * Allocate buffer.
		 */
		skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
		if (skb2 == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 * Set up data on packet
		 */
		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 * Charge the memory for the fragment to any owner
		 * it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 * Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 * Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 * Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 * Added AC : If we are fragmenting a fragment that's not the
		 * last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 * Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		/* Header is final: compute its checksum. */
		ip_send_check(iph);

		/* Send this fragment. */
		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	/* All data has been copied out; release the original skb. */
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
前段時間一直沒有大塊的時間,每天下班回家比較累,也就懶得看了。周末又陪老婆打球、休息了一下,也沒看。今天總算把它看完了。