Linux下NAT功能的實現 本文檔的Copyleft歸yfydz所有,使用GPL發布,可以自由拷貝、轉載,轉載時請保持文檔的完整性,嚴禁用于任何商業用途。 msn: yfydz_no1@hotmail.com 來源: http://yfydz.cublog.cn 1. 前言 在2.4/2.6內核的Linux中的防火墻代碼netfilter中支持源NAT(SNAT)和目的NAT (DNAT),基本可以滿足各種類型的NAT需求,本文介紹Linux下的NAT的具體實現過程,所引的內核代碼版本2.4.26,NAT原理部分不在此介紹,有興趣者可先看我的另一篇NAT原理介紹的文章。 2. NAT hook NAT操作也是以netfilter節點形式掛接在相應的處理點上的,DNAT掛接在NF_IP_PRE_ROUTING點上,優先級高于FILTER低于MANGLE,表示在mangle表后處理,但在filter表前處理數據包;SNAT掛接在NF_IP_POST_ROUTING點上,優先級低于FILTER,表示在filter表后面處理數據包。 在net/ipv4/netfilter/ip_nat_standalone.c中: 目的NAT的hook節點: /* Before packet filtering, change destination */ static struct nf_hook_ops ip_nat_in_ops = { { NULL, NULL }, ip_nat_fn, PF_INET, NF_IP_PRE_ROUTING, NF_IP_PRI_NAT_DST }; 源NAT的hook節點: /* After packet filtering, change source */ static struct nf_hook_ops ip_nat_out_ops = { { NULL, NULL }, ip_nat_out, PF_INET, NF_IP_POST_ROUTING, NF_IP_PRI_NAT_SRC}; include/linux/netfilter_ipv4.h enum nf_ip_hook_priorities { NF_IP_PRI_FIRST = INT_MIN, NF_IP_PRI_CONNTRACK = -200, // 連接跟蹤 NF_IP_PRI_MANGLE = -150, // mangle table NF_IP_PRI_NAT_DST = -100, // DNAT NF_IP_PRI_FILTER = 0, // filter table NF_IP_PRI_NAT_SRC = 100, // SNAT NF_IP_PRI_LAST = INT_MAX, }; ip_nat_fn()是NAT hook的主處理函數,ip_nat_out()函數也是在數據合法性檢查后調用ip_nat_fn()函數。 3. NAT處理相關結構 在狀態連接結構struct ip_conntrack中包含了關于NAT的相關結構(include/linux/netfilter_ipv4/ip_conntrack.h): struct ip_conntrack { ...... 
#ifdef CONFIG_IP_NF_NAT_NEEDED struct { struct ip_nat_info info; union ip_conntrack_nat_help help; #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) int masq_index; #endif } nat; #endif /* CONFIG_IP_NF_NAT_NEEDED */ }; 其中比較重要的是struct ip_nat_info結構,而union ip_conntrack_nat_help是各協議NAT時需要特殊處理的結構描述,不過在2.4.26內核中都沒定義,聯合為空。 #define IP_NAT_MAX_MANIPS (2*3) // 此結構描述數據包中要修改部分的信息 struct ip_nat_info_manip { /* The direction. */ u_int8_t direction; /* Which hook the manipulation happens on. */ u_int8_t hooknum; /* The manipulation type. */ u_int8_t maniptype; // 修改類型: SNAT / DNAT // 連接的數據包要修改的信息,包括地址和上層的協議信息 /* Manipulations to occur at each conntrack in this dirn. */ struct ip_conntrack_manip manip; }; /* The structure embedded in the conntrack structure. */ struct ip_nat_info { /* Set to zero when conntrack created: bitmask of maniptypes */ int initialized; // 實際最多用兩位 unsigned int num_manips; /* Manipulations to be done on this conntrack. */ // 每個最多可以記錄6個NAT信息 struct ip_nat_info_manip manips[IP_NAT_MAX_MANIPS]; struct ip_nat_hash bysource, byipsproto; // 按地址和協議的HASH表 /* Helper (NULL if none). */ struct ip_nat_helper *helper; // 多連接協議的NAT時的helper struct ip_nat_seq seq[IP_CT_DIR_MAX]; // 描述兩個方向的序列號變化情況 }; 4. ip_nat_fn()函數 ip_nat_fn()是NAT hook的基本處理函數(net/ipv4/netfilter/ip_nat_standalone.c),目的是建立連接的NAT info信息, 并修改數據包中的相應部分。 static unsigned int ip_nat_fn(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; struct ip_nat_info *info; /* maniptype == SRC for postrouting. 
*/ // 根據hooknum來確定進行哪種方式的NAT,netfilter在hook點是能進行哪種NAT是固定的: // NF_IP_PRE_ROUTING點進行的是DNAT,maniptype=1 // NF_IP_POST_ROUTING點進行的是SNAT,maniptype=0 enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); /* We never see fragments: conntrack defrags on pre-routing and local-out, and ip_nat_out protects post-routing. */ IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET))); (*pskb)->nfcache |= NFC_UNKNOWN; /* If we had a hardware checksum before, it's now invalid */ if ((*pskb)->ip_summed == CHECKSUM_HW) (*pskb)->ip_summed = CHECKSUM_NONE; // 進行NAT的包必須都經過的連接跟蹤處理,如果找不到該包對應的連接,不對其進行NAT處理 // 連接跟蹤優先級最高,是數據包一進入netfilter就要進行處理的 ct = ip_conntrack_get(*pskb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would have dropped it. Hence it's the user's responsibilty to packet filter it out, or implement conntrack/NAT for that protocol. 8) --RR */ if (!ct) { /* Exception: ICMP redirect to new connection (not in hash table yet). We must not let this through, in case we're doing NAT to the same network. */ struct iphdr *iph = (*pskb)->nh.iph; struct icmphdr *hdr = (struct icmphdr *) ((u_int32_t *)iph + iph->ihl); if (iph->protocol == IPPROTO_ICMP && hdr->type == ICMP_REDIRECT) return NF_DROP; return NF_ACCEPT; } switch (ctinfo) { //對于相關連接、相關連接的回復、新連接的包進行NAT信息的構建 case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { return icmp_reply_translation(*pskb, ct, hooknum, CTINFO2DIR(ctinfo)); } /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ case IP_CT_NEW: info = &ct->nat.info; WRITE_LOCK(&ip_nat_lock); /* Seen it before? This can happen for loopback, retrans, or local packets.. 
*/ // 檢查是否已經進行相應方向的初始化,注意初始化可以是兩個方向同時進行的 // 這就是說一個數據包可以同時修改源和目的, 這在服務器和內網在相同網段時會用到, // netfilter已經能自動處理這種情況,根本不需要進行修改,以前我的理解有誤,以為 // 只能修改一個方向的數據 if (!(info->initialized & (1 << maniptype)) #ifndef CONFIG_IP_NF_NAT_LOCAL /* If this session has already been confirmed we must not * touch it again even if there is no mapping set up. * Can only happen on local->local traffic with * CONFIG_IP_NF_NAT_LOCAL disabled. */ && !(ct->status & IPS_CONFIRMED) #endif ) { unsigned int ret; if (ct->master && master_ct(ct)->nat.info.helper && master_ct(ct)->nat.info.helper->expect) { // 多連接協議情況, 如果是子連接, 調用主連接相關的expect函數處理填寫NAT info信息 ret = call_expect(master_ct(ct), pskb, hooknum, ct, info); } else { #ifdef CONFIG_IP_NF_NAT_LOCAL /* LOCAL_IN hook doesn't have a chain! */ if (hooknum == NF_IP_LOCAL_IN) ret = alloc_null_binding(ct, info, hooknum); else #endif // 否則根據NAT規則表查找規則, 執行規則的動作: SNAT或DNAT, 填寫NAT info信息 ret = ip_nat_rule_find(pskb, hooknum, in, out, ct, info); } // 返回值不是接受的話直接返回, 數據包將被丟棄 if (ret != NF_ACCEPT) { WRITE_UNLOCK(&ip_nat_lock); return ret; } } else DEBUGP("Already setup manip %s for ct %p\n", maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", ct); WRITE_UNLOCK(&ip_nat_lock); break; default: // 連接的NAT信息已經填好, 直接使用 /* ESTABLISHED */ IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); info = &ct->nat.info; } IP_NF_ASSERT(info); // 根據NAT info信息對數據包的相應部分進行修改 return do_bindings(ct, ctinfo, info, hooknum, pskb); } 4. do_bindings()函數 do_bindings()是完成具體的NAT操作部分的函數(net/ipv4/netfilter/ip_nat_core.c),修改地址端口等信息,必要時修改數據內容部分信息(這種情況下可能數據包長度會變,序列號/確認號相應會改變,這些都累計進NAT info參數中),并重新計算各種校驗和(TCP/UDP/ICMP校驗和,IP頭校驗和): /* Do packet manipulations according to binding. 
*/
/*
 * do_bindings - apply the NAT manipulations recorded in @info to *@pskb.
 *
 * @ct:      conntrack entry the packet belongs to
 * @ctinfo:  conntrack state of the packet; its direction is derived with
 *           CTINFO2DIR()
 * @info:    NAT info holding the manips[] table and the optional helper
 * @hooknum: netfilter hook the packet is currently traversing
 * @pskb:    in/out packet pointer; replaced by a private copy when the
 *           skb is cloned and has no owning socket
 *
 * Every manips[] entry whose direction and hooknum match this packet is
 * passed to manip_pkt().  If a NAT helper is attached, it is then called
 * for each not-yet-established expectation on ct->sibling_list that
 * exp_for_packet() accepts, or once with a NULL expectation when the helper
 * has IP_NAT_HELPER_F_ALWAYS set and nothing matched.  For TCP packets at
 * POST_ROUTING or LOCAL_IN, ip_nat_seq_adjust() is called afterwards.
 *
 * Returns NF_DROP if the private skb copy cannot be allocated, the helper's
 * verdict when a helper is attached, and NF_ACCEPT otherwise.
 */
unsigned int
do_bindings(struct ip_conntrack *ct,
	    enum ip_conntrack_info ctinfo,
	    struct ip_nat_info *info,
	    unsigned int hooknum,
	    struct sk_buff **pskb)
{
	unsigned int i;
	struct ip_nat_helper *helper;
	/* Packet direction: original or reply. */
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	/* TCP needs the sequence/ack number adjustment done further below. */
	int is_tcp = (*pskb)->nh.iph->protocol == IPPROTO_TCP;

	/* Need nat lock to protect against modification, but neither
	   conntrack (referenced) and helper (deleted with
	   synchronize_bh()) can vanish. */
	READ_LOCK(&ip_nat_lock);
	for (i = 0; i < info->num_manips; i++) {
		/* rawsocket(tcpdump) may have clone of incoming
		   skb: don't disturb it --RR */
		if (skb_cloned(*pskb) && !(*pskb)->sk) {
			struct sk_buff *nskb = skb_copy(*pskb, GFP_ATOMIC);

			if (!nskb) {
				READ_UNLOCK(&ip_nat_lock);
				return NF_DROP;
			}
			kfree_skb(*pskb);
			*pskb = nskb;
		}

		/* Only apply manips recorded for this direction and hook. */
		if (info->manips[i].direction == dir
		    && info->manips[i].hooknum == hooknum) {
			DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
			       *pskb,
			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
			       ? "SRC" : "DST",
			       NIPQUAD(info->manips[i].manip.ip),
			       htons(info->manips[i].manip.u.all));
			/* The actual NAT step; per the article this rewrites
			   the IP header address and the TCP/UDP/... port. */
			manip_pkt((*pskb)->nh.iph->protocol,
				  (*pskb)->nh.iph,
				  (*pskb)->len,
				  &info->manips[i].manip,
				  info->manips[i].maniptype,
				  &(*pskb)->nfcache);
		}
	}
	helper = info->helper;
	READ_UNLOCK(&ip_nat_lock);

	/* A helper is attached: multi-connection protocol handling. */
	if (helper) {
		struct ip_conntrack_expect *exp = NULL;
		struct list_head *cur_item;
		int ret = NF_ACCEPT;
		int helper_called = 0;

		DEBUGP("do_bindings: helper existing for (%p)\n", ct);

		/* Always defragged for helpers */
		IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
			       & htons(IP_MF|IP_OFFSET)));

		/* Have to grab read lock before sibling_list traversal */
		READ_LOCK(&ip_conntrack_lock);
		/* The expectation list of this connection is walked
		   back to front. */
		list_for_each_prev(cur_item, &ct->sibling_list) {
			exp = list_entry(cur_item, struct ip_conntrack_expect,
					 expected_list);

			/* if this expectation is already established, skip */
			if (exp->sibling)
				continue;

			/* Article note (not verifiable from this excerpt):
			   exp_for_packet() is said to always return 1 for
			   UDP/ICMP; only TCP can yield 0. */
			if (exp_for_packet(exp, pskb)) {
				/* FIXME: May be true multiple times in the
				 * case of UDP!! */
				DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
				/* Let the protocol helper rewrite the
				   payload for this expectation. */
				ret = helper->help(ct, exp, info, ctinfo,
						   hooknum, pskb);
				if (ret != NF_ACCEPT) {
					READ_UNLOCK(&ip_conntrack_lock);
					return ret;
				}
				helper_called = 1;
			}
		}
		/* Helper might want to manip the packet even when there is no
		 * matching expectation for this packet */
		if (!helper_called
		    && (helper->flags & IP_NAT_HELPER_F_ALWAYS)) {
			DEBUGP("calling nat helper for packet without expectation\n");
			ret = helper->help(ct, NULL, info, ctinfo,
					   hooknum, pskb);
			if (ret != NF_ACCEPT) {
				READ_UNLOCK(&ip_conntrack_lock);
				return ret;
			}
		}
		READ_UNLOCK(&ip_conntrack_lock);

		/* Adjust sequence number only once per packet
		 * (helper is called at all hooks) */
		if (is_tcp && (hooknum == NF_IP_POST_ROUTING
			       || hooknum == NF_IP_LOCAL_IN)) {
			DEBUGP("ip_nat_core: adjusting sequence number\n");
			/* future: put this in a l4-proto specific function,
			 * and call this function here. */
			ip_nat_seq_adjust(*pskb, ct, ctinfo);
		}

		return ret;

	} else
		return NF_ACCEPT;

	/* not reached */
}
/*
 * Maps the hook where a manipulation is set up to the hook where the
 * reverse manipulation is applied to reply packets.  The LOCAL_OUT and
 * LOCAL_IN pairing exists only when CONFIG_IP_NF_NAT_LOCAL is defined;
 * slots without an initializer are implicitly zero.
 */
static unsigned int opposite_hook[NF_IP_NUMHOOKS] = {
	[NF_IP_PRE_ROUTING]	= NF_IP_POST_ROUTING,
	[NF_IP_POST_ROUTING]	= NF_IP_PRE_ROUTING,
#ifdef CONFIG_IP_NF_NAT_LOCAL
	[NF_IP_LOCAL_OUT]	= NF_IP_LOCAL_IN,
	[NF_IP_LOCAL_IN]	= NF_IP_LOCAL_OUT,
#endif
};
unsigned int ip_nat_setup_info(struct ip_conntrack *conntrack, const struct ip_nat_multi_range *mr, unsigned int hooknum) { struct ip_conntrack_tuple new_tuple, inv_tuple, reply; struct ip_conntrack_tuple orig_tp; struct ip_nat_info *info = &conntrack->nat.info; // 如果info->initialized不為0,表示已經(jīng)初始化過(guò)了 int in_hashes = info->initialized; MUST_BE_WRITE_LOCKED(&ip_nat_lock); IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_OUT); IP_NF_ASSERT(info->num_manips initialized & (1 /* What we've got will look like inverse of reply. Normally this is what is in the conntrack, except for prior manipulations (future optimization: if num_manips == 0, orig_tp = conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ // 根據(jù)連接的回應(yīng)方向的tuple進(jìn)行反轉(zhuǎn)得到原始方向的tuple invert_tuplepr(&orig_tp, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple); #if 0 { unsigned int i; DEBUGP("Hook %u (%s), ", hooknum, HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST"); DUMP_TUPLE(&orig_tp); DEBUGP("Range %p: ", mr); for (i = 0; i rangesize; i++) { DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n", i, (mr->range.flags & IP_NAT_RANGE_MAP_IPS) ? " MAP_IPS" : "", (mr->range.flags & IP_NAT_RANGE_PROTO_SPECIFIED) ? " PROTO_SPECIFIED" : "", (mr->range.flags & IP_NAT_RANGE_FULL) ? " FULL" : "", NIPQUAD(mr->range.min_ip), NIPQUAD(mr->range.max_ip), mr->range.min.all, mr->range.max.all); } } #endif
do { // 找一個(gè)未使用的進(jìn)行了轉(zhuǎn)換后的tuple結(jié)構(gòu)參數(shù),mr是NAT規(guī)則確定的要轉(zhuǎn)換后的 // 地址端口參數(shù), new_tuple保持轉(zhuǎn)換后的連接原始方向的tuple if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack, hooknum)) { DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n", conntrack); return NF_DROP; } #if 0 DEBUGP("Hook %u (%s) %p\n", hooknum, HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST", conntrack); DEBUGP("Original: "); DUMP_TUPLE(&orig_tp); DEBUGP("New: "); DUMP_TUPLE(&new_tuple); #endif /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT): the original (A/B/C/D') and the mangled one (E/F/G/H'). We're only allowed to work with the SRC per-proto part, so we create inverses of both to start, then derive the other fields we need. */ /* Reply connection: simply invert the new tuple (G/H/E/F') */ // 建立連接地址轉(zhuǎn)換后的反向的tuple,這使netfilter能自動(dòng)對(duì)連接的反方向數(shù)據(jù) // 進(jìn)行處理,也就是說(shuō)定義了一條SNAT規(guī)則后,并不需要再定義一條DNAT規(guī)則來(lái)處理 // 返回的數(shù)據(jù),netfilter已經(jīng)自動(dòng)處理了 invert_tuplepr(&reply, &new_tuple); /* Alter conntrack table so it recognizes replies. If fail this race (reply tuple now used), repeat. */ // 修改連接參數(shù)使能正確識(shí)別返回?cái)?shù)據(jù),如果reply已經(jīng)對(duì)應(yīng)一條連接 // ip_conntrack_alter_reply()函數(shù)返回0,表示要繼續(xù)修改轉(zhuǎn)換后的參數(shù)值 } while (!ip_conntrack_alter_reply(conntrack, &reply)); /* FIXME: We can simply used existing conntrack reply tuple here --RR */ /* Create inverse of original: C/D/A/B' */ invert_tuplepr(&inv_tuple, &orig_tp);
/* Has source changed?. */ // 源NAT if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) { /* In this direction, a source manip. */ // 連接正方向是SNAT info->manips[info->num_manips++] = ((struct ip_nat_info_manip) { IP_CT_DIR_ORIGINAL, hooknum, IP_NAT_MANIP_SRC, new_tuple.src }); IP_NF_ASSERT(info->num_manips /* In the reverse direction, a destination manip. */ // 連接反方向是DNAT info->manips[info->num_manips++] = ((struct ip_nat_info_manip) { IP_CT_DIR_REPLY, opposite_hook[hooknum], IP_NAT_MANIP_DST, orig_tp.src }); IP_NF_ASSERT(info->num_manips
/* Has destination changed? */ // 目的NAT if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) { /* In this direction, a destination manip */ // 連接正方向是DNAT info->manips[info->num_manips++] = ((struct ip_nat_info_manip) { IP_CT_DIR_ORIGINAL, hooknum, IP_NAT_MANIP_DST, reply.src }); IP_NF_ASSERT(info->num_manips /* In the reverse direction, a source manip. */ // 連接反方向是SNAT info->manips[info->num_manips++] = ((struct ip_nat_info_manip) { IP_CT_DIR_REPLY, opposite_hook[hooknum], IP_NAT_MANIP_SRC, inv_tuple.src }); IP_NF_ASSERT(info->num_manips
/* If there's a helper, assign it; based on new tuple. */ // 對(duì)于主連接檢查是否有應(yīng)用層協(xié)議的NAT helper結(jié)構(gòu) if (!conntrack->master) info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *, &reply);