aboutsummaryrefslogtreecommitdiffstats
path: root/net/sched
diff options
context:
space:
mode:
authorHerbert Xu <herbert@gondor.apana.org.au>2007-09-27 15:48:05 -0400
committerDavid S. Miller <davem@sunset.davemloft.net>2007-10-10 19:53:11 -0400
commitb4219952356baa162368f2f5dab6421a5dbc5e15 (patch)
tree4d99027b3453853c79a32793c4ebda76c6386169 /net/sched
parentce1234d299f3823ea07019c0f7b7b0bcb81ee7a0 (diff)
[PKT_SCHED]: Add stateless NAT
Stateless NAT is useful in controlled environments where restrictions are placed on through traffic such that we don't need connection tracking to correctly NAT protocol-specific data. In particular, this is of interest when the number of flows or the number of addresses being NATed is large, or if connection tracking information has to be replicated and where it is not practical to do so. Previously we had stateless NAT functionality which was integrated into the IPv4 routing subsystem. This was a great solution as long as the NAT worked on a subnet to subnet basis such that the number of NAT rules was relatively small. The reason is that for SNAT the routing based system had to perform a linear scan through the rules. If the number of rules is large then major renovations would have to take place in the routing subsystem to make this practical. For the time being, the least intrusive way of achieving this is to use the u32 classifier written by Alexey Kuznetsov along with the actions infrastructure implemented by Jamal Hadi Salim. The following patch is an attempt at this problem by creating a new nat action that can be invoked from u32 hash tables which would allow a large number of stateless NAT rules that can be used/updated in constant time. The actual NAT code is mostly based on the previous stateless NAT code written by Alexey. In future we might be able to utilise the protocol NAT code from netfilter to improve support for other protocols. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/sched')
-rw-r--r--net/sched/Kconfig11
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_nat.c322
3 files changed, 334 insertions, 0 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 8a74cac0be8c..92435a882fac 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -447,6 +447,17 @@ config NET_ACT_IPT
447 To compile this code as a module, choose M here: the 447 To compile this code as a module, choose M here: the
448 module will be called ipt. 448 module will be called ipt.
449 449
450config NET_ACT_NAT
451 tristate "Stateless NAT"
452 depends on NET_CLS_ACT
453 select NETFILTER
454 ---help---
455 Say Y here to do stateless NAT on IPv4 packets. You should use
456 netfilter for NAT unless you know what you are doing.
457
458 To compile this code as a module, choose M here: the
459 module will be called nat.
460
450config NET_ACT_PEDIT 461config NET_ACT_PEDIT
451 tristate "Packet Editing" 462 tristate "Packet Editing"
452 depends on NET_CLS_ACT 463 depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index b67c36f65cf2..81ecbe8e7dce 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_NET_ACT_POLICE) += act_police.o
11obj-$(CONFIG_NET_ACT_GACT) += act_gact.o 11obj-$(CONFIG_NET_ACT_GACT) += act_gact.o
12obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o 12obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o
13obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o 13obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o
14obj-$(CONFIG_NET_ACT_NAT) += act_nat.o
14obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o 15obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
15obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o 16obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o
16obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o 17obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
new file mode 100644
index 000000000000..c96273bcaf9c
--- /dev/null
+++ b/net/sched/act_nat.c
@@ -0,0 +1,322 @@
1/*
2 * Stateless NAT actions
3 *
4 * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the Free
8 * Software Foundation; either version 2 of the License, or (at your option)
9 * any later version.
10 */
11
12#include <linux/errno.h>
13#include <linux/init.h>
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/netfilter.h>
17#include <linux/rtnetlink.h>
18#include <linux/skbuff.h>
19#include <linux/slab.h>
20#include <linux/spinlock.h>
21#include <linux/string.h>
22#include <linux/tc_act/tc_nat.h>
23#include <net/act_api.h>
24#include <net/icmp.h>
25#include <net/ip.h>
26#include <net/netlink.h>
27#include <net/tc_act/tc_nat.h>
28#include <net/tcp.h>
29#include <net/udp.h>
30
31
32#define NAT_TAB_MASK 15
33static struct tcf_common *tcf_nat_ht[NAT_TAB_MASK + 1];
34static u32 nat_idx_gen;
35static DEFINE_RWLOCK(nat_lock);
36
37static struct tcf_hashinfo nat_hash_info = {
38 .htab = tcf_nat_ht,
39 .hmask = NAT_TAB_MASK,
40 .lock = &nat_lock,
41};
42
43static int tcf_nat_init(struct rtattr *rta, struct rtattr *est,
44 struct tc_action *a, int ovr, int bind)
45{
46 struct rtattr *tb[TCA_NAT_MAX];
47 struct tc_nat *parm;
48 int ret = 0;
49 struct tcf_nat *p;
50 struct tcf_common *pc;
51
52 if (rta == NULL || rtattr_parse_nested(tb, TCA_NAT_MAX, rta) < 0)
53 return -EINVAL;
54
55 if (tb[TCA_NAT_PARMS - 1] == NULL ||
56 RTA_PAYLOAD(tb[TCA_NAT_PARMS - 1]) < sizeof(*parm))
57 return -EINVAL;
58 parm = RTA_DATA(tb[TCA_NAT_PARMS - 1]);
59
60 pc = tcf_hash_check(parm->index, a, bind, &nat_hash_info);
61 if (!pc) {
62 pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
63 &nat_idx_gen, &nat_hash_info);
64 if (unlikely(!pc))
65 return -ENOMEM;
66 p = to_tcf_nat(pc);
67 ret = ACT_P_CREATED;
68 } else {
69 p = to_tcf_nat(pc);
70 if (!ovr) {
71 tcf_hash_release(pc, bind, &nat_hash_info);
72 return -EEXIST;
73 }
74 }
75
76 spin_lock_bh(&p->tcf_lock);
77 p->old_addr = parm->old_addr;
78 p->new_addr = parm->new_addr;
79 p->mask = parm->mask;
80 p->flags = parm->flags;
81
82 p->tcf_action = parm->action;
83 spin_unlock_bh(&p->tcf_lock);
84
85 if (ret == ACT_P_CREATED)
86 tcf_hash_insert(pc, &nat_hash_info);
87
88 return ret;
89}
90
91static int tcf_nat_cleanup(struct tc_action *a, int bind)
92{
93 struct tcf_nat *p = a->priv;
94
95 return tcf_hash_release(&p->common, bind, &nat_hash_info);
96}
97
98static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
99 struct tcf_result *res)
100{
101 struct tcf_nat *p = a->priv;
102 struct iphdr *iph;
103 __be32 old_addr;
104 __be32 new_addr;
105 __be32 mask;
106 __be32 addr;
107 int egress;
108 int action;
109 int ihl;
110
111 spin_lock(&p->tcf_lock);
112
113 p->tcf_tm.lastuse = jiffies;
114 old_addr = p->old_addr;
115 new_addr = p->new_addr;
116 mask = p->mask;
117 egress = p->flags & TCA_NAT_FLAG_EGRESS;
118 action = p->tcf_action;
119
120 p->tcf_bstats.bytes += skb->len;
121 p->tcf_bstats.packets++;
122
123 spin_unlock(&p->tcf_lock);
124
125 if (unlikely(action == TC_ACT_SHOT))
126 goto drop;
127
128 if (!pskb_may_pull(skb, sizeof(*iph)))
129 goto drop;
130
131 iph = ip_hdr(skb);
132
133 if (egress)
134 addr = iph->saddr;
135 else
136 addr = iph->daddr;
137
138 if (!((old_addr ^ addr) & mask)) {
139 if (skb_cloned(skb) &&
140 !skb_clone_writable(skb, sizeof(*iph)) &&
141 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
142 goto drop;
143
144 new_addr &= mask;
145 new_addr |= addr & ~mask;
146
147 /* Rewrite IP header */
148 iph = ip_hdr(skb);
149 if (egress)
150 iph->saddr = new_addr;
151 else
152 iph->daddr = new_addr;
153
154 nf_csum_replace4(&iph->check, addr, new_addr);
155 }
156
157 ihl = iph->ihl * 4;
158
159 /* It would be nice to share code with stateful NAT. */
160 switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
161 case IPPROTO_TCP:
162 {
163 struct tcphdr *tcph;
164
165 if (!pskb_may_pull(skb, ihl + sizeof(*tcph)) ||
166 (skb_cloned(skb) &&
167 !skb_clone_writable(skb, ihl + sizeof(*tcph)) &&
168 pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
169 goto drop;
170
171 tcph = (void *)(skb_network_header(skb) + ihl);
172 nf_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1);
173 break;
174 }
175 case IPPROTO_UDP:
176 {
177 struct udphdr *udph;
178
179 if (!pskb_may_pull(skb, ihl + sizeof(*udph)) ||
180 (skb_cloned(skb) &&
181 !skb_clone_writable(skb, ihl + sizeof(*udph)) &&
182 pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
183 goto drop;
184
185 udph = (void *)(skb_network_header(skb) + ihl);
186 if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
187 nf_proto_csum_replace4(&udph->check, skb, addr,
188 new_addr, 1);
189 if (!udph->check)
190 udph->check = CSUM_MANGLED_0;
191 }
192 break;
193 }
194 case IPPROTO_ICMP:
195 {
196 struct icmphdr *icmph;
197
198 if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph)))
199 goto drop;
200
201 icmph = (void *)(skb_network_header(skb) + ihl);
202
203 if ((icmph->type != ICMP_DEST_UNREACH) &&
204 (icmph->type != ICMP_TIME_EXCEEDED) &&
205 (icmph->type != ICMP_PARAMETERPROB))
206 break;
207
208 iph = (void *)(icmph + 1);
209 if (egress)
210 addr = iph->daddr;
211 else
212 addr = iph->saddr;
213
214 if ((old_addr ^ addr) & mask)
215 break;
216
217 if (skb_cloned(skb) &&
218 !skb_clone_writable(skb,
219 ihl + sizeof(*icmph) + sizeof(*iph)) &&
220 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
221 goto drop;
222
223 icmph = (void *)(skb_network_header(skb) + ihl);
224 iph = (void *)(icmph + 1);
225
226 new_addr &= mask;
227 new_addr |= addr & ~mask;
228
229 /* XXX Fix up the inner checksums. */
230 if (egress)
231 iph->daddr = new_addr;
232 else
233 iph->saddr = new_addr;
234
235 nf_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr,
236 1);
237 break;
238 }
239 default:
240 break;
241 }
242
243 return action;
244
245drop:
246 spin_lock(&p->tcf_lock);
247 p->tcf_qstats.drops++;
248 spin_unlock(&p->tcf_lock);
249 return TC_ACT_SHOT;
250}
251
252static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
253 int bind, int ref)
254{
255 unsigned char *b = skb_tail_pointer(skb);
256 struct tcf_nat *p = a->priv;
257 struct tc_nat *opt;
258 struct tcf_t t;
259 int s;
260
261 s = sizeof(*opt);
262
263 /* netlink spinlocks held above us - must use ATOMIC */
264 opt = kzalloc(s, GFP_ATOMIC);
265 if (unlikely(!opt))
266 return -ENOBUFS;
267
268 opt->old_addr = p->old_addr;
269 opt->new_addr = p->new_addr;
270 opt->mask = p->mask;
271 opt->flags = p->flags;
272
273 opt->index = p->tcf_index;
274 opt->action = p->tcf_action;
275 opt->refcnt = p->tcf_refcnt - ref;
276 opt->bindcnt = p->tcf_bindcnt - bind;
277
278 RTA_PUT(skb, TCA_NAT_PARMS, s, opt);
279 t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
280 t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
281 t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
282 RTA_PUT(skb, TCA_NAT_TM, sizeof(t), &t);
283
284 kfree(opt);
285
286 return skb->len;
287
288rtattr_failure:
289 nlmsg_trim(skb, b);
290 kfree(opt);
291 return -1;
292}
293
294static struct tc_action_ops act_nat_ops = {
295 .kind = "nat",
296 .hinfo = &nat_hash_info,
297 .type = TCA_ACT_NAT,
298 .capab = TCA_CAP_NONE,
299 .owner = THIS_MODULE,
300 .act = tcf_nat,
301 .dump = tcf_nat_dump,
302 .cleanup = tcf_nat_cleanup,
303 .lookup = tcf_hash_search,
304 .init = tcf_nat_init,
305 .walk = tcf_generic_walker
306};
307
308MODULE_DESCRIPTION("Stateless NAT actions");
309MODULE_LICENSE("GPL");
310
311static int __init nat_init_module(void)
312{
313 return tcf_register_action(&act_nat_ops);
314}
315
316static void __exit nat_cleanup_module(void)
317{
318 tcf_unregister_action(&act_nat_ops);
319}
320
321module_init(nat_init_module);
322module_exit(nat_cleanup_module);