首页 > 代码库 > Linux协议栈优化之Netfilter分类conntrack

Linux协议栈优化之Netfilter分类conntrack

首先,如果你不同意我以下的观点,本文就不必看了:如今内存不值钱,空间换时间很划算。要知道,一万年前的1秒和一万年后的1秒是一样的;你也许觉得人们做的事情会有所不同,但是请记住,永远都会发生“安迪给你的被比尔拿走”(即安迪-比尔定律:硬件提升的性能总会被软件消耗掉)之类的事情,对于你而言,什么都没有改变!
       虽然空间可以被拓展,但是要讲技巧。下面的截图测试的是HTTP服务器的常规性能。我依然使用相对值比较,因为我的电脑压不出真实性能;另外,为了降低网络的影响,我使用了本地环回地址。我的目的是测试conntrack对新建连接数的影响,截图如下:

如果你稍微明白一些conntrack的原理,就会明白它的开销主要集中在两个方面:一个是查找开销,在内存不值钱的年代,可以通过设置很大的hash bucket来缓解;另外一个是分配conn结构体内存的开销。在一台压力很大的设备上,如果短连接特别多,所有的conntrack都会在同一张hash表内,我们所能指望的就是针对五元组计算的hash值足够散列。然而对于一些攻击流量,特别是当攻击者研究了Linux计算conntrack hash值的算法后,他会构造很多五元组不同但hash值一致的数据包,这会使得hash表的冲突链表过长,遍历开销过大。在当前的内核conntrack模块实现中,这是无法避免的,因为所有鸡蛋都在一个篮子里面。现在换一个思路。
       将不同类的数据流分到不同的hash表中如何?也就是说将一张大表拆分成几张小表,或者说如果你真的不在乎内存的话,设置多张大表也行!那么剩下的问题就是将数据包进行分类了。可以有两种实现:
1.在RAW表中做一个模块
根据skb的特征设置其conntrack表索引,值得注意的是,为了避免同一个流分配到不同的hash表,match项必须是计算hash的元素,比如源/目标IP,源/目标端口,协议等。对于IP分片直接通过,不予考虑。
2.在MANGLE表中做一个优先级大于conntrack小于defrag的模块
这个就不多说了,和1一样,就是解决了IP分片的问题。
iptables规则如下:
iptables -t mangle -A PREROUTING -s 1.1.1.0/24 -j CONNMAP 2
以上规则将源IP为1.1.1.0/24段的数据包的conntrack设置在表2中。你也可以区分TCP协议和UDP协议,每一个协议一张表,这样就可以避免恶意UDP流量攻击问题。在加载模块的时候,你需要设置一个参数,即hash表的数量。
       为了快速测试效果,我依然是先将代码写死,设置了3张hash表,基于内核3.2.5,两个patch如下:
C文件patch:

diff -uNr linux-source-3.2/net/ipv4/netfilter/ipt_MASQUERADE.c linux-source-3.2.new/net/ipv4/netfilter/ipt_MASQUERADE.c
--- linux-source-3.2/net/ipv4/netfilter/ipt_MASQUERADE.c        2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv4/netfilter/ipt_MASQUERADE.c    2014-07-11 10:32:57.736666273 +0800
@@ -85,7 +85,7 @@
                  mr->range[0].min, mr->range[0].max });
 
        /* Hand modified range to generic setup. */
-       return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC);
+       return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC, 0);
 }
 
 static int
diff -uNr linux-source-3.2/net/ipv4/netfilter/ipt_NETMAP.c linux-source-3.2.new/net/ipv4/netfilter/ipt_NETMAP.c
--- linux-source-3.2/net/ipv4/netfilter/ipt_NETMAP.c    2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv4/netfilter/ipt_NETMAP.c        2014-07-11 10:35:14.976667434 +0800
@@ -67,7 +67,7 @@
                  mr->range[0].min, mr->range[0].max });
 
        /* Hand modified range to generic setup. */
-       return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
+       return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum), 0);
 }
 
 static struct xt_target netmap_tg_reg __read_mostly = {
diff -uNr linux-source-3.2/net/ipv4/netfilter/ipt_REDIRECT.c linux-source-3.2.new/net/ipv4/netfilter/ipt_REDIRECT.c
--- linux-source-3.2/net/ipv4/netfilter/ipt_REDIRECT.c  2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv4/netfilter/ipt_REDIRECT.c      2014-07-11 10:36:45.760668202 +0800
@@ -82,7 +82,7 @@
                  mr->range[0].min, mr->range[0].max });
 
        /* Hand modified range to generic setup. */
-       return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_DST);
+       return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_DST, 0);
 }
 
 static struct xt_target redirect_tg_reg __read_mostly = {
diff -uNr linux-source-3.2/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c linux-source-3.2.new/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
--- linux-source-3.2/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c     2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 2014-07-11 15:03:50.596093049 +0800
@@ -151,6 +151,12 @@
                                      const struct net_device *out,
                                      int (*okfn)(struct sk_buff *))
 {
+       struct iphdr *hdr = ip_hdr(skb);
+       if (ipv4_is_loopback(hdr->saddr)||ipv4_is_loopback(hdr->daddr)) {
+               skb->ij = 1;
+       } else {
+               skb->ij = 0;
+       }
        return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb);
 }
 
@@ -160,6 +166,12 @@
                                         const struct net_device *out,
                                         int (*okfn)(struct sk_buff *))
 {
+       struct iphdr *hdr = ip_hdr(skb);
+       if (ipv4_is_loopback(hdr->saddr)||ipv4_is_loopback(hdr->daddr)) {
+               skb->ij = 1;
+       } else {
+               skb->ij = 0;
+       }
        /* root is playing with raw sockets. */
        if (skb->len < sizeof(struct iphdr) ||
            ip_hdrlen(skb) < sizeof(struct iphdr))
@@ -254,7 +266,7 @@
 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 {
        const struct inet_sock *inet = inet_sk(sk);
-       const struct nf_conntrack_tuple_hash *h;
+       /*const struct nf_conntrack_tuple_hash *h;*/
        struct nf_conntrack_tuple tuple;
 
        memset(&tuple, 0, sizeof(tuple));
@@ -276,7 +288,7 @@
                         *len, sizeof(struct sockaddr_in));
                return -EINVAL;
        }
-
+/*
        h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
        if (h) {
                struct sockaddr_in sin;
@@ -297,6 +309,7 @@
                else
                        return 0;
        }
+*/
        pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
                 &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
                 &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
diff -uNr linux-source-3.2/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c linux-source-3.2.new/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
--- linux-source-3.2/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c      2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c  2014-07-11 09:11:02.144624681 +0800
@@ -38,7 +38,7 @@
             st->bucket < net->ct.htable_size;
             st->bucket++) {
                n = rcu_dereference(
-                       hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+                       hlist_nulls_first_rcu(&net->ct.hash[0][st->bucket]));
                if (!is_a_nulls(n))
                        return n;
        }
@@ -58,7 +58,7 @@
                                return NULL;
                }
                head = rcu_dereference(
-                       hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+                       hlist_nulls_first_rcu(&net->ct.hash[0][st->bucket]));
        }
        return head;
 }
diff -uNr linux-source-3.2/net/ipv4/netfilter/nf_conntrack_proto_icmp.c linux-source-3.2.new/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
--- linux-source-3.2/net/ipv4/netfilter/nf_conntrack_proto_icmp.c       2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv4/netfilter/nf_conntrack_proto_icmp.c   2014-07-11 09:55:31.704647269 +0800
@@ -148,7 +148,7 @@
 
        *ctinfo = IP_CT_RELATED;
 
-       h = nf_conntrack_find_get(net, zone, &innertuple);
+       h = nf_conntrack_find_get(net, zone, &innertuple, skb->ij);
        if (!h) {
                pr_debug("icmp_error_message: no match\n");
                return -NF_ACCEPT;
diff -uNr linux-source-3.2/net/ipv4/netfilter/nf_nat_core.c linux-source-3.2.new/net/ipv4/netfilter/nf_nat_core.c
--- linux-source-3.2/net/ipv4/netfilter/nf_nat_core.c   2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv4/netfilter/nf_nat_core.c       2014-07-11 10:24:16.544661863 +0800
@@ -74,7 +74,7 @@
        struct nf_conntrack_tuple reply;
 
        nf_ct_invert_tuplepr(&reply, tuple);
-       return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
+       return nf_conntrack_tuple_taken(&reply, ignored_conntrack, 0);
 }
 EXPORT_SYMBOL(nf_nat_used_tuple);
 
@@ -206,7 +206,7 @@
                 const struct nf_conntrack_tuple *orig_tuple,
                 const struct nf_nat_range *range,
                 struct nf_conn *ct,
-                enum nf_nat_manip_type maniptype)
+                enum nf_nat_manip_type maniptype, int ij)
 {
        struct net *net = nf_ct_net(ct);
        const struct nf_nat_protocol *proto;
@@ -268,7 +268,7 @@
 unsigned int
 nf_nat_setup_info(struct nf_conn *ct,
                  const struct nf_nat_range *range,
-                 enum nf_nat_manip_type maniptype)
+                 enum nf_nat_manip_type maniptype, int ij)
 {
        struct net *net = nf_ct_net(ct);
        struct nf_conntrack_tuple curr_tuple, new_tuple;
@@ -296,7 +296,7 @@
        nf_ct_invert_tuplepr(&curr_tuple,
                             &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
-       get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
+       get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype, ij);
 
        if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
                struct nf_conntrack_tuple reply;
@@ -670,7 +670,7 @@
        if (nf_nat_initialized(ct, manip))
                return -EEXIST;
 
-       return nf_nat_setup_info(ct, &range, manip);
+       return nf_nat_setup_info(ct, &range, manip, 0);
 }
 #else
 static int
diff -uNr linux-source-3.2/net/ipv4/netfilter/nf_nat_h323.c linux-source-3.2.new/net/ipv4/netfilter/nf_nat_h323.c
--- linux-source-3.2/net/ipv4/netfilter/nf_nat_h323.c   2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv4/netfilter/nf_nat_h323.c       2014-07-11 10:08:16.712653742 +0800
@@ -411,14 +411,14 @@
        /* Change src to where master sends to */
        range.flags = IP_NAT_RANGE_MAP_IPS;
        range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
-       nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC);
+       nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC, 0);
 
        /* For DST manip, map port here to where it's expected. */
        range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
        range.min = range.max = this->saved_proto;
        range.min_ip = range.max_ip =
            new->master->tuplehash[!this->dir].tuple.src.u3.ip;
-       nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST);
+       nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST, 0);
 }
 
 /****************************************************************************/
@@ -504,13 +504,13 @@
        /* Change src to where master sends to */
        range.flags = IP_NAT_RANGE_MAP_IPS;
        range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.u3.ip;
-       nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC);
+       nf_nat_setup_info(new, &range, IP_NAT_MANIP_SRC, 0);
 
        /* For DST manip, map port here to where it's expected. */
        range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
        range.min = range.max = this->saved_proto;
        range.min_ip = range.max_ip = this->saved_ip;
-       nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST);
+       nf_nat_setup_info(new, &range, IP_NAT_MANIP_DST, 0);
 }
 
 /****************************************************************************/
diff -uNr linux-source-3.2/net/ipv4/netfilter/nf_nat_helper.c linux-source-3.2.new/net/ipv4/netfilter/nf_nat_helper.c
--- linux-source-3.2/net/ipv4/netfilter/nf_nat_helper.c 2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv4/netfilter/nf_nat_helper.c     2014-07-11 10:08:55.008654066 +0800
@@ -439,13 +439,13 @@
        range.flags = IP_NAT_RANGE_MAP_IPS;
        range.min_ip = range.max_ip
                = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
-       nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
+       nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC, 0);
 
        /* For DST manip, map port here to where it's expected. */
        range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
        range.min = range.max = exp->saved_proto;
        range.min_ip = range.max_ip
                = ct->master->tuplehash[!exp->dir].tuple.src.u3.ip;
-       nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
+       nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST, 0);
 }
 EXPORT_SYMBOL(nf_nat_follow_master);
diff -uNr linux-source-3.2/net/ipv4/netfilter/nf_nat_pptp.c linux-source-3.2.new/net/ipv4/netfilter/nf_nat_pptp.c
--- linux-source-3.2/net/ipv4/netfilter/nf_nat_pptp.c   2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv4/netfilter/nf_nat_pptp.c       2014-07-11 10:08:35.424653900 +0800
@@ -95,7 +95,7 @@
                range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
                range.min = range.max = exp->saved_proto;
        }
-       nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
+       nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC, 0);
 
        /* For DST manip, map port here to where it's expected. */
        range.flags = IP_NAT_RANGE_MAP_IPS;
@@ -105,7 +105,7 @@
                range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
                range.min = range.max = exp->saved_proto;
        }
-       nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
+       nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST, 0);
 }
 
 /* outbound packets == from PNS to PAC */
diff -uNr linux-source-3.2/net/ipv4/netfilter/nf_nat_rule.c linux-source-3.2.new/net/ipv4/netfilter/nf_nat_rule.c
--- linux-source-3.2/net/ipv4/netfilter/nf_nat_rule.c   2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv4/netfilter/nf_nat_rule.c       2014-07-11 10:09:31.024654370 +0800
@@ -56,7 +56,7 @@
                            ctinfo == IP_CT_RELATED_REPLY));
        NF_CT_ASSERT(par->out != NULL);
 
-       return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC);
+       return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC, 0);
 }
 
 static unsigned int
@@ -74,7 +74,7 @@
        /* Connection must be valid and new. */
        NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
 
-       return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST);
+       return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST, 0);
 }
 
 static int ipt_snat_checkentry(const struct xt_tgchk_param *par)
@@ -115,7 +115,7 @@
                 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
                 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
 
-       return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
+       return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum), 0);
 }
 
 int nf_nat_rule_find(struct sk_buff *skb,
diff -uNr linux-source-3.2/net/ipv4/netfilter/nf_nat_sip.c linux-source-3.2.new/net/ipv4/netfilter/nf_nat_sip.c
--- linux-source-3.2/net/ipv4/netfilter/nf_nat_sip.c    2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv4/netfilter/nf_nat_sip.c        2014-07-11 10:07:40.744653437 +0800
@@ -259,7 +259,7 @@
        range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
        range.min = range.max = exp->saved_proto;
        range.min_ip = range.max_ip = exp->saved_ip;
-       nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST);
+       nf_nat_setup_info(ct, &range, IP_NAT_MANIP_DST, 0);
 
        /* Change src to where master sends to, but only if the connection
         * actually came from the same source. */
@@ -268,7 +268,7 @@
                range.flags = IP_NAT_RANGE_MAP_IPS;
                range.min_ip = range.max_ip
                        = ct->master->tuplehash[!exp->dir].tuple.dst.u3.ip;
-               nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC);
+               nf_nat_setup_info(ct, &range, IP_NAT_MANIP_SRC, 0);
        }
 }
 
diff -uNr linux-source-3.2/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c linux-source-3.2.new/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
--- linux-source-3.2/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c     2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c 2014-07-11 09:12:04.320625207 +0800
@@ -165,7 +165,7 @@
 
        *ctinfo = IP_CT_RELATED;
 
-       h = nf_conntrack_find_get(net, zone, &intuple);
+       h = nf_conntrack_find_get(net, zone, &intuple, skb->ij);
        if (!h) {
                pr_debug("icmpv6_error: no match\n");
                return -NF_ACCEPT;
diff -uNr linux-source-3.2/net/netfilter/ipvs/ip_vs_nfct.c linux-source-3.2.new/net/netfilter/ipvs/ip_vs_nfct.c
--- linux-source-3.2/net/netfilter/ipvs/ip_vs_nfct.c    2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/netfilter/ipvs/ip_vs_nfct.c        2014-07-11 11:10:42.224685433 +0800
@@ -270,7 +270,7 @@
                __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
 
        h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
-                                 &tuple);
+                                 &tuple, 0);
        if (h) {
                ct = nf_ct_tuplehash_to_ctrack(h);
                /* Show what happens instead of calling nf_ct_kill() */
diff -uNr linux-source-3.2/net/netfilter/nf_conntrack_core.c linux-source-3.2.new/net/netfilter/nf_conntrack_core.c
--- linux-source-3.2/net/netfilter/nf_conntrack_core.c  2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/netfilter/nf_conntrack_core.c      2014-07-11 10:48:19.948674076 +0800
@@ -316,7 +316,7 @@
  */
 static struct nf_conntrack_tuple_hash *
 ____nf_conntrack_find(struct net *net, u16 zone,
-                     const struct nf_conntrack_tuple *tuple, u32 hash)
+                     const struct nf_conntrack_tuple *tuple, u32 hash, int ij)
 {
        struct nf_conntrack_tuple_hash *h;
        struct hlist_nulls_node *n;
@@ -327,7 +327,7 @@
         */
        local_bh_disable();
 begin:
-       hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) {
+       hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[ij][bucket], hnnode) {
                if (nf_ct_tuple_equal(tuple, &h->tuple) &&
                    nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) {
                        NF_CT_STAT_INC(net, found);
@@ -352,24 +352,24 @@
 
 struct nf_conntrack_tuple_hash *
 __nf_conntrack_find(struct net *net, u16 zone,
-                   const struct nf_conntrack_tuple *tuple)
+                   const struct nf_conntrack_tuple *tuple, int ij)
 {
        return ____nf_conntrack_find(net, zone, tuple,
-                                    hash_conntrack_raw(tuple, zone));
+                                    hash_conntrack_raw(tuple, zone), ij);
 }
 EXPORT_SYMBOL_GPL(__nf_conntrack_find);
 
 /* Find a connection corresponding to a tuple. */
 static struct nf_conntrack_tuple_hash *
 __nf_conntrack_find_get(struct net *net, u16 zone,
-                       const struct nf_conntrack_tuple *tuple, u32 hash)
+                       const struct nf_conntrack_tuple *tuple, u32 hash, int ij)
 {
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
 
        rcu_read_lock();
 begin:
-       h = ____nf_conntrack_find(net, zone, tuple, hash);
+       h = ____nf_conntrack_find(net, zone, tuple, hash, ij);
        if (h) {
                ct = nf_ct_tuplehash_to_ctrack(h);
                if (unlikely(nf_ct_is_dying(ct) ||
@@ -390,26 +390,27 @@
 
 struct nf_conntrack_tuple_hash *
 nf_conntrack_find_get(struct net *net, u16 zone,
-                     const struct nf_conntrack_tuple *tuple)
+                     const struct nf_conntrack_tuple *tuple, int ij)
 {
        return __nf_conntrack_find_get(net, zone, tuple,
-                                      hash_conntrack_raw(tuple, zone));
+                                      hash_conntrack_raw(tuple, zone), ij);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
 
 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
                                       unsigned int hash,
-                                      unsigned int repl_hash)
+                                      unsigned int repl_hash,
+                                       int ij)
 {
        struct net *net = nf_ct_net(ct);
 
        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-                          &net->ct.hash[hash]);
+                          &net->ct.hash[ij][hash]);
        hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
-                          &net->ct.hash[repl_hash]);
+                          &net->ct.hash[ij][repl_hash]);
 }
 
-void nf_conntrack_hash_insert(struct nf_conn *ct)
+void nf_conntrack_hash_insert(struct nf_conn *ct, int ij)
 {
        struct net *net = nf_ct_net(ct);
        unsigned int hash, repl_hash;
@@ -419,7 +420,7 @@
        hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
-       __nf_conntrack_hash_insert(ct, hash, repl_hash);
+       __nf_conntrack_hash_insert(ct, hash, repl_hash, ij);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);
 
@@ -479,12 +480,12 @@
        /* See if there's one in the list already, including reverse:
           NAT could have grabbed it without realizing, since we're
           not in the hash.  If there is, we lost race. */
-       hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
+       hlist_nulls_for_each_entry(h, n, &net->ct.hash[skb->ij][hash], hnnode)
                if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
                                      &h->tuple) &&
                    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
                        goto out;
-       hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
+       hlist_nulls_for_each_entry(h, n, &net->ct.hash[skb->ij][repl_hash], hnnode)
                if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
                                      &h->tuple) &&
                    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
@@ -514,7 +515,7 @@
         * guarantee that no other CPU can find the conntrack before the above
         * stores are visible.
         */
-       __nf_conntrack_hash_insert(ct, hash, repl_hash);
+       __nf_conntrack_hash_insert(ct, hash, repl_hash, skb->ij);
        NF_CT_STAT_INC(net, insert);
        spin_unlock_bh(&nf_conntrack_lock);
 
@@ -537,7 +538,7 @@
    for NAT). */
 int
 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
-                        const struct nf_conn *ignored_conntrack)
+                        const struct nf_conn *ignored_conntrack, int ij)
 {
        struct net *net = nf_ct_net(ignored_conntrack);
        struct nf_conntrack_tuple_hash *h;
@@ -550,7 +551,7 @@
         * least once for the stats anyway.
         */
        rcu_read_lock_bh();
-       hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+       hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[ij][hash], hnnode) {
                ct = nf_ct_tuplehash_to_ctrack(h);
                if (ct != ignored_conntrack &&
                    nf_ct_tuple_equal(tuple, &h->tuple) &&
@@ -571,7 +572,7 @@
 
 /* There's a small race here where we may free a just-assured
    connection.  Too bad: we're in trouble anyway. */
-static noinline int early_drop(struct net *net, unsigned int hash)
+static noinline int early_drop(struct net *net, unsigned int hash, int ij)
 {
        /* Use oldest entry, which is roughly LRU */
        struct nf_conntrack_tuple_hash *h;
@@ -582,7 +583,7 @@
 
        rcu_read_lock();
        for (i = 0; i < net->ct.htable_size; i++) {
-               hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
+               hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[ij][hash],
                                         hnnode) {
                        tmp = nf_ct_tuplehash_to_ctrack(h);
                        if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
@@ -636,7 +637,7 @@
 __nf_conntrack_alloc(struct net *net, u16 zone,
                     const struct nf_conntrack_tuple *orig,
                     const struct nf_conntrack_tuple *repl,
-                    gfp_t gfp, u32 hash)
+                    gfp_t gfp, u32 hash, int ij)
 {
        struct nf_conn *ct;
 
@@ -651,7 +652,7 @@
 
        if (nf_conntrack_max &&
            unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
-               if (!early_drop(net, hash_bucket(hash, net))) {
+               if (!early_drop(net, hash_bucket(hash, net), ij)) {
                        atomic_dec(&net->ct.count);
                        if (net_ratelimit())
                                printk(KERN_WARNING
@@ -713,9 +714,9 @@
 struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone,
                                   const struct nf_conntrack_tuple *orig,
                                   const struct nf_conntrack_tuple *repl,
-                                  gfp_t gfp)
+                                  gfp_t gfp, int ij)
 {
-       return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
+       return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0, ij);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
 
@@ -753,7 +754,7 @@
        }
 
        ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
-                                 hash);
+                                 hash, skb->ij);
        if (IS_ERR(ct))
                return (struct nf_conntrack_tuple_hash *)ct;
 
@@ -840,7 +841,7 @@
 
        /* look for tuple match */
        hash = hash_conntrack_raw(&tuple, zone);
-       h = __nf_conntrack_find_get(net, zone, &tuple, hash);
+       h = __nf_conntrack_find_get(net, zone, &tuple, hash, skb->ij);
        if (!h) {
                h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
                                   skb, dataoff, hash);
@@ -1170,7 +1171,7 @@
 
        spin_lock_bh(&nf_conntrack_lock);
        for (; *bucket < net->ct.htable_size; (*bucket)++) {
-               hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+               hlist_nulls_for_each_entry(h, n, &net->ct.hash[0][*bucket], hnnode) {
                        ct = nf_ct_tuplehash_to_ctrack(h);
                        if (iter(ct, data))
                                goto found;
@@ -1297,6 +1298,7 @@
 
 static void nf_conntrack_cleanup_net(struct net *net)
 {
+       int i = 0;
  i_see_dead_people:
        nf_ct_iterate_cleanup(net, kill_all, NULL);
        nf_ct_release_dying_list(net);
@@ -1305,7 +1307,9 @@
                goto i_see_dead_people;
        }
 
-       nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+       for (i = 0; i < 3; i++) {
+               nf_ct_free_hashtable(net->ct.hash[i], net->ct.htable_size);
+       }
        nf_conntrack_ecache_fini(net);
        nf_conntrack_tstamp_fini(net);
        nf_conntrack_acct_fini(net);
@@ -1364,7 +1368,7 @@
 {
        int i, bucket;
        unsigned int hashsize, old_size;
-       struct hlist_nulls_head *hash, *old_hash;
+       struct hlist_nulls_head *hash[3], *old_hash[3];
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
 
@@ -1378,10 +1382,19 @@
        hashsize = simple_strtoul(val, NULL, 0);
        if (!hashsize)
                return -EINVAL;
-
-       hash = nf_ct_alloc_hashtable(&hashsize, 1);
-       if (!hash)
-               return -ENOMEM;
+       {
+               int k = 0;
+               for (k = 0; k < 3; k++) {
+                       hash[k] = nf_ct_alloc_hashtable(&hashsize, 1);
+                       if (!hash[k]) {
+                               int j = 0;
+                               for (j = 0; j < k; j++) {
+                                       //free hash[j];
+                               }
+                               return -ENOMEM;
+                       }
+               }
+       }
 
        /* Lookups in the old hash might happen in parallel, which means we
         * might get false negatives during connection lookup. New connections
@@ -1390,24 +1403,38 @@
         */
        spin_lock_bh(&nf_conntrack_lock);
        for (i = 0; i < init_net.ct.htable_size; i++) {
-               while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
-                       h = hlist_nulls_entry(init_net.ct.hash[i].first,
+               int k = 0;
+               for (k = 0; k < 3; k++) {
+               while (!hlist_nulls_empty(&init_net.ct.hash[k][i])) {
+                       h = hlist_nulls_entry(init_net.ct.hash[k][i].first,
                                        struct nf_conntrack_tuple_hash, hnnode);
                        ct = nf_ct_tuplehash_to_ctrack(h);
                        hlist_nulls_del_rcu(&h->hnnode);
                        bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct),
                                                  hashsize);
-                       hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
+                       hlist_nulls_add_head_rcu(&h->hnnode, &hash[k][bucket]);
+               }
                }
        }
        old_size = init_net.ct.htable_size;
-       old_hash = init_net.ct.hash;
+       old_hash[0] = init_net.ct.hash[0];
+       old_hash[1] = init_net.ct.hash[1];
+       old_hash[2] = init_net.ct.hash[2];
 
        init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
-       init_net.ct.hash = hash;
+       {
+               int k = 0;
+               for (k = 0; k < 3; k++) {
+                       init_net.ct.hash[k] = hash[k];
+               }
+       }
        spin_unlock_bh(&nf_conntrack_lock);
-
-       nf_ct_free_hashtable(old_hash, old_size);
+       {
+               int k = 0;
+               for (k = 0; k < 3; k++) {
+                       nf_ct_free_hashtable(old_hash[k], old_size);
+               }
+       }
        return 0;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
@@ -1494,6 +1521,7 @@
 static int nf_conntrack_init_net(struct net *net)
 {
        int ret;
+       int i = 0;
 
        atomic_set(&net->ct.count, 0);
        INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
@@ -1520,11 +1548,13 @@
        }
 
        net->ct.htable_size = nf_conntrack_htable_size;
-       net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
-       if (!net->ct.hash) {
-               ret = -ENOMEM;
-               printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
-               goto err_hash;
+       for (i = 0; i < 3; i++) {
+               net->ct.hash[i] = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
+               if (!net->ct.hash[i]) {
+                       ret = -ENOMEM;
+                       printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
+                       goto err_hash;
+               }
        }
        ret = nf_conntrack_expect_init(net);
        if (ret < 0)
@@ -1548,7 +1578,9 @@
 err_acct:
        nf_conntrack_expect_fini(net);
 err_expect:
-       nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+       for (i = 0; i < 3; i++) {
+               nf_ct_free_hashtable(net->ct.hash[i], net->ct.htable_size);
+       }
 err_hash:
        kmem_cache_destroy(net->ct.nf_conntrack_cachep);
 err_cache:
diff -uNr linux-source-3.2/net/netfilter/nf_conntrack_helper.c linux-source-3.2.new/net/netfilter/nf_conntrack_helper.c
--- linux-source-3.2/net/netfilter/nf_conntrack_helper.c        2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/netfilter/nf_conntrack_helper.c    2014-07-11 10:57:25.424678691 +0800
@@ -227,7 +227,7 @@
        hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode)
                unhelp(h, me);
        for (i = 0; i < net->ct.htable_size; i++) {
-               hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+               hlist_nulls_for_each_entry(h, nn, &net->ct.hash[0][i], hnnode)
                        unhelp(h, me);
        }
 }
diff -uNr linux-source-3.2/net/netfilter/nf_conntrack_netlink.c linux-source-3.2.new/net/netfilter/nf_conntrack_netlink.c
--- linux-source-3.2/net/netfilter/nf_conntrack_netlink.c       2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/netfilter/nf_conntrack_netlink.c   2014-07-11 11:02:54.168681473 +0800
@@ -692,7 +692,7 @@
        last = (struct nf_conn *)cb->args[1];
        for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
 restart:
-               hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
+               hlist_nulls_for_each_entry(h, n, &net->ct.hash[0][cb->args[0]],
                                         hnnode) {
                        if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
                                continue;
@@ -920,7 +920,7 @@
        if (err < 0)
                return err;
 
-       h = nf_conntrack_find_get(net, zone, &tuple);
+       h = nf_conntrack_find_get(net, zone, &tuple, 0);
        if (!h)
                return -ENOENT;
 
@@ -986,7 +986,7 @@
        if (err < 0)
                return err;
 
-       h = nf_conntrack_find_get(net, zone, &tuple);
+       h = nf_conntrack_find_get(net, zone, &tuple, skb->ij);
        if (!h)
                return -ENOENT;
 
@@ -1336,7 +1336,7 @@
        struct nf_conntrack_helper *helper;
        struct nf_conn_tstamp *tstamp;
 
-       ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC);
+       ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC, 0);
        if (IS_ERR(ct))
                return ERR_PTR(-ENOMEM);
 
@@ -1446,7 +1446,7 @@
                if (err < 0)
                        goto err2;
 
-               master_h = nf_conntrack_find_get(net, zone, &master);
+               master_h = nf_conntrack_find_get(net, zone, &master, 0);
                if (master_h == NULL) {
                        err = -ENOENT;
                        goto err2;
@@ -1460,7 +1460,7 @@
                tstamp->start = ktime_to_ns(ktime_get_real());
 
        add_timer(&ct->timeout);
-       nf_conntrack_hash_insert(ct);
+       nf_conntrack_hash_insert(ct, 0);
        rcu_read_unlock();
 
        return ct;
@@ -1503,9 +1503,9 @@
 
        spin_lock_bh(&nf_conntrack_lock);
        if (cda[CTA_TUPLE_ORIG])
-               h = __nf_conntrack_find(net, zone, &otuple);
+               h = __nf_conntrack_find(net, zone, &otuple, 0);
        else if (cda[CTA_TUPLE_REPLY])
-               h = __nf_conntrack_find(net, zone, &rtuple);
+               h = __nf_conntrack_find(net, zone, &rtuple, 0);
 
        if (h == NULL) {
                err = -ENOENT;
@@ -2020,7 +2020,7 @@
                return err;
 
        /* Look for master conntrack of this expectation */
-       h = nf_conntrack_find_get(net, zone, &master_tuple);
+       h = nf_conntrack_find_get(net, zone, &master_tuple, 0);
        if (!h)
                return -ENOENT;
        ct = nf_ct_tuplehash_to_ctrack(h);
diff -uNr linux-source-3.2/net/netfilter/nf_conntrack_pptp.c linux-source-3.2.new/net/netfilter/nf_conntrack_pptp.c
--- linux-source-3.2/net/netfilter/nf_conntrack_pptp.c  2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/netfilter/nf_conntrack_pptp.c      2014-07-11 11:04:11.824682130 +0800
@@ -148,7 +148,7 @@
        pr_debug("trying to timeout ct or exp for tuple ");
        nf_ct_dump_tuple(t);
 
-       h = nf_conntrack_find_get(net, zone, t);
+       h = nf_conntrack_find_get(net, zone, t, 0);
        if (h)  {
                sibling = nf_ct_tuplehash_to_ctrack(h);
                pr_debug("setting timeout of conntrack %p to 0\n", sibling);
diff -uNr linux-source-3.2/net/netfilter/nf_conntrack_standalone.c linux-source-3.2.new/net/netfilter/nf_conntrack_standalone.c
--- linux-source-3.2/net/netfilter/nf_conntrack_standalone.c    2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/netfilter/nf_conntrack_standalone.c        2014-07-11 10:56:08.276678039 +0800
@@ -59,7 +59,7 @@
        for (st->bucket = 0;
             st->bucket < net->ct.htable_size;
             st->bucket++) {
-               n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+               n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[0][st->bucket]));
                if (!is_a_nulls(n))
                        return n;
        }
@@ -80,7 +80,7 @@
                }
                head = rcu_dereference(
                                hlist_nulls_first_rcu(
-                                       &net->ct.hash[st->bucket]));
+                                       &net->ct.hash[0][st->bucket]));
        }
        return head;
 }
diff -uNr linux-source-3.2/net/netfilter/xt_connlimit.c linux-source-3.2.new/net/netfilter/xt_connlimit.c
--- linux-source-3.2/net/netfilter/xt_connlimit.c       2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/netfilter/xt_connlimit.c   2014-07-11 11:07:55.712684024 +0800
@@ -117,7 +117,7 @@
        /* check the saved connections */
        hlist_for_each_entry_safe(conn, pos, n, hash, node) {
                found    = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE,
-                                                &conn->tuple);
+                                                &conn->tuple, 0);
                found_ct = NULL;
 
                if (found != NULL)
diff -uNr linux-source-3.2/net/netfilter/xt_CT.c linux-source-3.2.new/net/netfilter/xt_CT.c
--- linux-source-3.2/net/netfilter/xt_CT.c      2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/net/netfilter/xt_CT.c  2014-07-11 11:06:38.648683372 +0800
@@ -81,7 +81,7 @@
                goto err1;
 
        memset(&t, 0, sizeof(t));
-       ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL);
+       ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL, 0);
        ret = PTR_ERR(ct);
        if (IS_ERR(ct))
                goto err2;



H文件patch:


diff -uNr linux-source-3.2/include/linux/skbuff.h linux-source-3.2.new/include/linux/skbuff.h
--- linux-source-3.2/include/linux/skbuff.h     2014-02-01 21:18:39.000000000 +0800
+++ linux-source-3.2.new/include/linux/skbuff.h 2014-07-11 07:00:38.696558485 +0800
@@ -474,6 +474,7 @@
        unsigned char           *head,
                                *data;
        unsigned int            truesize;
+       int                     ij;
        atomic_t                users;
 };
 
diff -uNr linux-source-3.2/include/linux/version.h linux-source-3.2.new/include/linux/version.h
--- linux-source-3.2/include/linux/version.h    1970-01-01 08:00:00.000000000 +0800
+++ linux-source-3.2.new/include/linux/version.h        2014-07-11 07:01:03.972558699 +0800
@@ -0,0 +1,2 @@
+#define LINUX_VERSION_CODE 197174
+#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))
diff -uNr linux-source-3.2/include/net/netfilter/nf_conntrack_core.h linux-source-3.2.new/include/net/netfilter/nf_conntrack_core.h
--- linux-source-3.2/include/net/netfilter/nf_conntrack_core.h  2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/include/net/netfilter/nf_conntrack_core.h      2014-07-11 08:59:04.872618612 +0800
@@ -50,7 +50,7 @@
 /* Find a connection corresponding to a tuple. */
 extern struct nf_conntrack_tuple_hash *
 nf_conntrack_find_get(struct net *net, u16 zone,
-                     const struct nf_conntrack_tuple *tuple);
+                     const struct nf_conntrack_tuple *tuple, int ij);
 
 extern int __nf_conntrack_confirm(struct sk_buff *skb);
 
diff -uNr linux-source-3.2/include/net/netfilter/nf_conntrack.h linux-source-3.2.new/include/net/netfilter/nf_conntrack.h
--- linux-source-3.2/include/net/netfilter/nf_conntrack.h       2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/include/net/netfilter/nf_conntrack.h   2014-07-11 09:05:35.412621916 +0800
@@ -176,7 +176,7 @@
    conntrack). */
 extern int
 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
-                        const struct nf_conn *ignored_conntrack);
+                        const struct nf_conn *ignored_conntrack, int ij);
 
 /* Return conntrack_info and tuple hash for given skb. */
 static inline struct nf_conn *
@@ -207,9 +207,9 @@
 
 extern struct nf_conntrack_tuple_hash *
 __nf_conntrack_find(struct net *net, u16 zone,
-                   const struct nf_conntrack_tuple *tuple);
+                   const struct nf_conntrack_tuple *tuple, int ij);
 
-extern void nf_conntrack_hash_insert(struct nf_conn *ct);
+extern void nf_conntrack_hash_insert(struct nf_conn *ct, int ij);
 extern void nf_ct_delete_from_lists(struct nf_conn *ct);
 extern void nf_ct_insert_dying_list(struct nf_conn *ct);
 
@@ -284,7 +284,7 @@
 nf_conntrack_alloc(struct net *net, u16 zone,
                   const struct nf_conntrack_tuple *orig,
                   const struct nf_conntrack_tuple *repl,
-                  gfp_t gfp);
+                  gfp_t gfp, int ij);
 
 static inline int nf_ct_is_template(const struct nf_conn *ct)
 {
diff -uNr linux-source-3.2/include/net/netfilter/nf_nat.h linux-source-3.2.new/include/net/netfilter/nf_nat.h
--- linux-source-3.2/include/net/netfilter/nf_nat.h     2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/include/net/netfilter/nf_nat.h 2014-07-11 10:05:22.568652268 +0800
@@ -53,7 +53,7 @@
 /* Set up the info structure to map into this range. */
 extern unsigned int nf_nat_setup_info(struct nf_conn *ct,
                                      const struct nf_nat_range *range,
-                                     enum nf_nat_manip_type maniptype);
+                                     enum nf_nat_manip_type maniptype, int ij);
 
 /* Is this tuple already taken? (not by us)*/
 extern int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
diff -uNr linux-source-3.2/include/net/netns/conntrack.h linux-source-3.2.new/include/net/netns/conntrack.h
--- linux-source-3.2/include/net/netns/conntrack.h      2014-01-03 12:33:36.000000000 +0800
+++ linux-source-3.2.new/include/net/netns/conntrack.h  2014-07-11 08:56:06.816617105 +0800
@@ -13,7 +13,7 @@
        unsigned int            expect_count;
        unsigned int            htable_size;
        struct kmem_cache       *nf_conntrack_cachep;
-       struct hlist_nulls_head *hash;
+       struct hlist_nulls_head *hash[3];
        struct hlist_head       *expect_hash;
        struct hlist_nulls_head unconfirmed;
        struct hlist_nulls_head dying;



重新编译内核之后,效果十分可观:即使按照下面的脚本插入海量的conntrack条目(按上面的patch,经netlink创建的条目都落在0号hash表中),也丝毫不会影响最大连接数:
# Flood the conntrack table with synthetic entries: for every (i, j) pair,
# insert one UDP, one TCP (ESTABLISHED) and one ICMP entry, each with -t 4000.
# All output of the whole loop, including errors, is discarded.
for ((i = 1; i < 220; i++)); do
    for ((j = 1; j < 254; j++)); do
        conntrack -I conntrack -s 172.129.$j.$i -d $j.$i.192.$j -p udp -t 4000 --sport 245 --dport 2001
        conntrack -I conntrack -s 192.13.$i.$j -d $i.$j.24.19 -p tcp --state ESTABLISHED -t 4000 --sport 67 --dport 505
        conntrack -I conntrack -s 172.129.$j.$i -d $j.$i.192.$j -p icmp -t 4000
    done
done >/dev/null 2>&1

除了拆分出多张hash表之外,另外一个好处在于锁的粒度可以变细:每次操作conntrack的时候,只需要持有与自己相关的那张hash表的锁即可。当然,本次的优化并没有涉及这一点(patch中仍然使用全局的nf_conntrack_lock)。如果和ipset结合起来就更猛了:ipset本身就可以定义一个集合,而该集合则可以用来计算conntrack的hash值,多表版本的conntrack和ipset结合,在效果上不就是多级hash表吗?而且表面上还是用iptables配置的。

Linux协议栈优化之Netfilter分类conntrack