首页 > 代码库 > IP的分片与组装学习笔记
IP的分片与组装学习笔记
IP的分片与组装
当要发送的IP数据报的长度超出了最大传输单元(MTU, Maximum Transmission Unit),且允许分片时,就会进行IP分片。通常,使用UDP协议发送的数据报很容易导致IP分片,而TCP协议是基于流的传输,通常不会产生分片。
IP数据报被分片以后,各分片(fragment)分别组成一个具有IP首部的分组,并各自独立地选择路由,在其分别抵达目的主机后,目的主机的IP层会在传送给传输层之前将接收到的所有分片重装成一个IP数据报。可以这样理解:IP数据报是IP层端到端的传输单元(在分片之前和重组之后),分组是指在IP层和链路层之间传送的数据单元。一个分组可以是一个完整的IP数据报,也可以是IP数据报的一个分片。分片对传输层是透明的。
IP数据报输出时的分片以及输入IP数据报分片的组装涉及以下文件:
net/ipv4/ip_output.c IP数据报的输出
net/ipv4/ip_fragment.c IP数据报分片的组装
系统参数
ipfrag_high_thresh,可用于组装IP数据报的内存上限值,在进行组装IP分片时,如果占用的缓存超过了此参数,会调用ip_evictor()进行垃圾收集
ipfrag_low_thresh,可用于组装IP数据报的内存下限值,在调用ip_evictor()进行垃圾收集时,一旦组装IP分片所占用的缓存下降到此参数,就会中止垃圾收集
ipfrag_time,待组装IP分片在内存中允许保留的时间,默认值为30s
ipfrag_secret_interval,定时重组ipq散列表的时间间隔,默认值为600s
分片
ip_finish_output()在将数据报(既包括本地发送的数据报,也包括转发的数据报)发送出去之前,会对skb缓存区的数据长度进行检测:如果超出了输出设备的MTU值,且又符合其他一些条件,则调用ip_fragment()对数据报进行分片;否则直接调用ip_finish_output2()输出到数据链路层。
static int ip_finish_output(struct sk_buff *skb) { #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm != NULL) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(skb); } #endif if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb)) return ip_fragment(skb, ip_finish_output2); else return ip_finish_output2(skb); }目前有两种分片的处理方式:快速分片和慢速分片。整个分片过程需要完成的工作,除了将三层的有效负载根据MTU分成一个一个片段之外,还需要为每个分片设置IP首部,更新校验和等,在快速分片中,将数据分割成片段已经由传输层完成,三层只需将这些片段组成IP分片;而慢速分片则需要完成全部的工作,即对一个完整的IP数据报的MTU值循环进行分片,直至完成。
/* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus * a block of the data of the original IP data part) that will yet fit in a * single device frame, and queue such a frame for sending. */ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct iphdr *iph; int raw = 0; int ptr; struct net_device *dev; struct sk_buff *skb2; unsigned int mtu, hlen, left, len, ll_rs, pad; int offset; __be16 not_last_frag; struct rtable *rt = skb_rtable(skb); int err = 0; dev = rt->u.dst.dev; /* * Point into the IP datagram header. */ iph = ip_hdr(skb); if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(ip_skb_dst_mtu(skb))); kfree_skb(skb); return -EMSGSIZE; } /* * Setup starting values. */ hlen = iph->ihl * 4; mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing * one, it is not prohibited. In this case fall back to copying. * * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. */ if (skb_has_frags(skb)) { struct sk_buff *frag, *frag2; int first_len = skb_pagelen(skb); if (first_len - hlen > mtu || ((first_len - hlen) & 7) || (iph->frag_off & htons(IP_MF|IP_OFFSET)) || skb_cloned(skb)) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || skb_headroom(frag) < hlen) goto slow_path_clean; /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path_clean; BUG_ON(frag->sk); if (skb->sk) { frag->sk = skb->sk; frag->destructor = sock_wfree; } skb->truesize -= frag->truesize; } /* Everything is OK. Generate! 
*/ err = 0; offset = 0; frag = skb_shinfo(skb)->frag_list; skb_frag_list_init(skb); skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; iph->tot_len = htons(first_len); iph->frag_off = htons(IP_MF); ip_send_check(iph); for (;;) { /* Prepare header of the next frame, * before previous one went down. */ if (frag) { frag->ip_summed = CHECKSUM_NONE; skb_reset_transport_header(frag); __skb_push(frag, hlen); skb_reset_network_header(frag); memcpy(skb_network_header(frag), iph, hlen); iph = ip_hdr(frag); iph->tot_len = htons(frag->len); ip_copy_metadata(frag, skb); if (offset == 0) ip_options_fragment(frag); offset += skb->len - hlen; iph->frag_off = htons(offset>>3); if (frag->next != NULL) iph->frag_off |= htons(IP_MF); /* Ready, complete checksum */ ip_send_check(iph); } err = output(skb); if (!err) IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; skb = frag; frag = skb->next; skb->next = NULL; } if (err == 0) { IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return 0; } while (frag) { skb = frag->next; kfree_skb(frag); frag = skb; } IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; slow_path_clean: skb_walk_frags(skb, frag2) { if (frag2 == frag) break; frag2->sk = NULL; frag2->destructor = NULL; skb->truesize += frag2->truesize; } } slow_path: left = skb->len - hlen; /* Space per frame */ ptr = raw + hlen; /* Where to start from */ /* for bridged IP traffic encapsulated inside f.e. a vlan header, * we need to make room for the encapsulating header */ pad = nf_bridge_pad(skb); ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad); mtu -= pad; /* * Fragment the datagram. */ offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; not_last_frag = iph->frag_off & htons(IP_MF); /* * Keep copying data until we run out. 
*/ while (left > 0) { len = left; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > mtu) len = mtu; /* IF: we are not sending upto and including the packet end then align the next start on an eight byte boundary */ if (len < left) { len &= ~7; } /* * Allocate buffer. */ if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); err = -ENOMEM; goto fail; } /* * Set up data on packet */ ip_copy_metadata(skb2, skb); skb_reserve(skb2, ll_rs); skb_put(skb2, len + hlen); skb_reset_network_header(skb2); skb2->transport_header = skb2->network_header + hlen; /* * Charge the memory for the fragment to any owner * it might possess */ if (skb->sk) skb_set_owner_w(skb2, skb->sk); /* * Copy the packet header into the new buffer. */ skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); /* * Copy a block of the IP datagram. */ if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) BUG(); left -= len; /* * Fill in the new header fields. */ iph = ip_hdr(skb2); iph->frag_off = htons((offset >> 3)); /* ANK: dirty, but effective trick. Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE * on the initial skb, so that all the following fragments * will inherit fixed options. */ if (offset == 0) ip_options_fragment(skb); /* * Added AC : If we are fragmenting a fragment that's not the * last fragment then keep MF on each bit */ if (left > 0 || not_last_frag) iph->frag_off |= htons(IP_MF); ptr += len; offset += len; /* * Put this fragment into the sending queue. */ iph->tot_len = htons(len + hlen); ip_send_check(iph); err = output(skb2); if (err) goto fail; IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); } kfree_skb(skb); IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; }
组装
在接收方,一个由发送方发出的原始IP数据报,其所有分片将被重新组合,然后才能提交到上层协议。每一个将被重组的IP数据报都用一个ipq结构实例来表示,因此先来看看ipq这个非常重要的结构。
为了能高效地组装分片,用于保存分片的数据结构必须能做到以下几点:
1、快速定位属于某一个数据报的一组分组
2、在属于某一个数据报的一组分片中快速插入新的分片
3、有效地判断一个数据报的所有分片是否已经全部接收
4、具有组装超时机制,如果在重组完成之前定时器溢出,则删除该数据报的所有内容
struct inet_frag_queue { struct hlist_node list; struct netns_frags *net; struct list_head lru_list; /* lru list member */ spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ struct sk_buff *fragments; /* list of received fragments */ ktime_t stamp; int len; /* total length of orig datagram */ int meat; __u8 last_in; /* first/last segment arrived? */ #define INET_FRAG_COMPLETE 4 #define INET_FRAG_FIRST_IN 2 #define INET_FRAG_LAST_IN 1 }; /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; u32 user; __be32 saddr; __be32 daddr; __be16 id; u8 protocol; int iif; unsigned int rid; struct inet_peer *peer; };saddr、daddr、id、protocol来自IP首部,用来唯一确定分片来自哪个IP数据报
上送传输层及分片处理调用
ip_local_deliver
ip_defrag
ip_find
inet_frag_find
ip_frag_queue
ip_local_deliver_finish
IP的分片与组装学习笔记