Diffstat (limited to 'net/ipv4')
 79 files changed, 1507 insertions(+), 13655 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 591ea23639ca..691268f3a359 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -630,5 +630,3 @@ config TCP_MD5SIG
 
 	  If unsure, say N.
 
-source "net/ipv4/ipvs/Kconfig"
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ad40ef3f9ebc..80ff87ce43aa 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
 obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
 obj-$(CONFIG_IP_PNP) += ipconfig.o
 obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
-obj-$(CONFIG_IP_VS) += ipvs/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8a3ac1fa71a9..1fbff5fa4241 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -469,7 +469,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	 */
 	err = -EADDRNOTAVAIL;
 	if (!sysctl_ip_nonlocal_bind &&
-	    !inet->freebind &&
+	    !(inet->freebind || inet->transparent) &&
 	    addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
 	    chk_addr_ret != RTN_LOCAL &&
 	    chk_addr_ret != RTN_MULTICAST &&
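With inet->transparent set, bind() now accepts an address that is not configured on the host (tproxy-style interception). A minimal userspace sketch of what this hunk enables; the IP_TRANSPARENT option itself is wired up in the ip_sockglue.c hunk further down, and the address used here is just an example from TEST-NET:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef IP_TRANSPARENT
#define IP_TRANSPARENT 19	/* value from linux/in.h in this series */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1;
	struct sockaddr_in sin;

	/* Privileged: see the CAP_NET_ADMIN check added in do_ip_setsockopt(). */
	if (setsockopt(fd, SOL_IP, IP_TRANSPARENT, &one, sizeof(one)) < 0)
		perror("setsockopt(IP_TRANSPARENT)");

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(8080);
	sin.sin_addr.s_addr = inet_addr("192.0.2.1");	/* not a local address */

	/* Without this hunk (and without IP_FREEBIND or the
	 * ip_nonlocal_bind sysctl) this bind() fails with EADDRNOTAVAIL. */
	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
		perror("bind");
	return 0;
}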
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index f70fac612596..7f9e337e3908 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1234,6 +1234,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	write_lock_bh(&in_dev->mc_list_lock);
 	im->next=in_dev->mc_list;
 	in_dev->mc_list=im;
+	in_dev->mc_count++;
 	write_unlock_bh(&in_dev->mc_list_lock);
 #ifdef CONFIG_IP_MULTICAST
 	igmpv3_del_delrec(in_dev, im->multiaddr);
@@ -1282,6 +1283,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
 		if (--i->users == 0) {
 			write_lock_bh(&in_dev->mc_list_lock);
 			*ip = i->next;
+			in_dev->mc_count--;
 			write_unlock_bh(&in_dev->mc_list_lock);
 			igmp_group_dropped(i);
 
@@ -1330,6 +1332,7 @@ void ip_mc_init_dev(struct in_device *in_dev)
 	setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
 			(unsigned long)in_dev);
 	in_dev->mr_ifc_count = 0;
+	in_dev->mc_count = 0;
 	setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
 			(unsigned long)in_dev);
 	in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
@@ -1369,8 +1372,8 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
 	write_lock_bh(&in_dev->mc_list_lock);
 	while ((i = in_dev->mc_list) != NULL) {
 		in_dev->mc_list = i->next;
+		in_dev->mc_count--;
 		write_unlock_bh(&in_dev->mc_list_lock);
-
 		igmp_group_dropped(i);
 		ip_ma_put(i);
 
@@ -2383,7 +2386,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 
 	if (state->in_dev->mc_list == im) {
 		seq_printf(seq, "%d\t%-10s: %5d %7s\n",
-			   state->dev->ifindex, state->dev->name, state->dev->mc_count, querier);
+			   state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
 	}
 
 	seq_printf(seq,
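The last hunk corrects the Count column of /proc/net/igmp: it now reports the in_device's IP multicast group count maintained by the hunks above, rather than the net_device's link-layer mc_count. A small sketch that reads the column back; the sscanf pattern is a loose assumption about the row layout produced by the seq_printf above:

#include <stdio.h>

int main(void)
{
	char line[256], dev[16], querier[8];
	int idx, count;
	FILE *f = fopen("/proc/net/igmp", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		/* Device rows match the seq_printf() format string above;
		 * the header and per-group rows fail the scan and are skipped. */
		if (sscanf(line, "%d %15s : %d %7s", &idx, dev, &count, querier) == 4)
			printf("%s: %d groups\n", dev, count);
	fclose(f);
	return 0;
}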
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 0c1ae68ee84b..bd1278a2d828 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -30,20 +30,22 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
 #endif
 
 /*
- * This array holds the first and last local port number.
+ * This struct holds the first and last local port number.
  */
-int sysctl_local_port_range[2] = { 32768, 61000 };
-DEFINE_SEQLOCK(sysctl_port_range_lock);
+struct local_ports sysctl_local_ports __read_mostly = {
+	.lock = SEQLOCK_UNLOCKED,
+	.range = { 32768, 61000 },
+};
 
 void inet_get_local_port_range(int *low, int *high)
 {
 	unsigned seq;
 	do {
-		seq = read_seqbegin(&sysctl_port_range_lock);
+		seq = read_seqbegin(&sysctl_local_ports.lock);
 
-		*low = sysctl_local_port_range[0];
-		*high = sysctl_local_port_range[1];
-	} while (read_seqretry(&sysctl_port_range_lock, seq));
+		*low = sysctl_local_ports.range[0];
+		*high = sysctl_local_ports.range[1];
+	} while (read_seqretry(&sysctl_local_ports.lock, seq));
 }
 EXPORT_SYMBOL(inet_get_local_port_range);
 
@@ -335,6 +337,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
 			    .saddr = ireq->loc_addr,
 			    .tos = RT_CONN_FLAGS(sk) } },
 	    .proto = sk->sk_protocol,
+	    .flags = inet_sk_flowi_flags(sk),
 	    .uli_u = { .ports =
 		       { .sport = inet_sk(sk)->sport,
 			 .dport = ireq->rmt_port } } };
@@ -515,6 +518,8 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
 		newicsk->icsk_bind_hash = NULL;
 
 		inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
+		inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port);
+		inet_sk(newsk)->sport = inet_rsk(req)->loc_port;
 		newsk->sk_write_space = sk_stream_write_space;
 
 		newicsk->icsk_retransmits = 0;
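Bundling both bounds under one seqlock means a reader can never observe a torn pair, e.g. a freshly written low bound together with the stale high bound, while the range is rewritten through sysctl. A quick userspace sanity check through the existing proc interface:

#include <stdio.h>

int main(void)
{
	int low, high;
	FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d %d", &low, &high) != 2) {
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("ephemeral port range: %d-%d\n", low, high);
	return 0;
}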
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c10036e7a463..89cb047ab314 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -782,11 +782,15 @@ skip_listen_ht:
 		struct sock *sk;
 		struct hlist_node *node;
 
+		num = 0;
+
+		if (hlist_empty(&head->chain) && hlist_empty(&head->twchain))
+			continue;
+
 		if (i > s_i)
 			s_num = 0;
 
 		read_lock_bh(lock);
-		num = 0;
 		sk_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
 
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 743f011b9a84..1c5fd38f8824 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -126,6 +126,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_reuse = sk->sk_reuse;
 		tw->tw_hash = sk->sk_hash;
 		tw->tw_ipv6only = 0;
+		tw->tw_transparent = inet->transparent;
 		tw->tw_prot = sk->sk_prot_creator;
 		twsk_net_set(tw, hold_net(sock_net(sk)));
 		atomic_set(&tw->tw_refcnt, 1);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2a61158ea722..85c487b8572b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -27,6 +27,7 @@
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
 #include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
 #include <linux/if_ether.h>
 
 #include <net/sock.h>
@@ -41,6 +42,7 @@
 #include <net/xfrm.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/rtnetlink.h>
 
 #ifdef CONFIG_IPV6
 #include <net/ipv6.h>
@@ -117,8 +119,10 @@
    Alexey Kuznetsov.
  */
 
+static struct rtnl_link_ops ipgre_link_ops __read_mostly;
 static int ipgre_tunnel_init(struct net_device *dev);
 static void ipgre_tunnel_setup(struct net_device *dev);
+static int ipgre_tunnel_bind_dev(struct net_device *dev);
 
 /* Fallback tunnel: no source, no destination, no key, no options */
 
@@ -163,38 +167,64 @@ static DEFINE_RWLOCK(ipgre_lock);
 /* Given src, dst and key, find appropriate for input tunnel. */
 
 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
-		__be32 remote, __be32 local, __be32 key)
+					      __be32 remote, __be32 local,
+					      __be32 key, __be16 gre_proto)
 {
 	unsigned h0 = HASH(remote);
 	unsigned h1 = HASH(key);
 	struct ip_tunnel *t;
+	struct ip_tunnel *t2 = NULL;
 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
+		       ARPHRD_ETHER : ARPHRD_IPGRE;
 
 	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
 		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
-			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
-				return t;
+			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
+				if (t->dev->type == dev_type)
+					return t;
+				if (t->dev->type == ARPHRD_IPGRE && !t2)
+					t2 = t;
+			}
 		}
 	}
+
 	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
 		if (remote == t->parms.iph.daddr) {
-			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
-				return t;
+			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
+				if (t->dev->type == dev_type)
+					return t;
+				if (t->dev->type == ARPHRD_IPGRE && !t2)
+					t2 = t;
+			}
 		}
 	}
+
 	for (t = ign->tunnels_l[h1]; t; t = t->next) {
 		if (local == t->parms.iph.saddr ||
 		    (local == t->parms.iph.daddr &&
 		     ipv4_is_multicast(local))) {
-			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
-				return t;
+			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
+				if (t->dev->type == dev_type)
+					return t;
+				if (t->dev->type == ARPHRD_IPGRE && !t2)
+					t2 = t;
+			}
 		}
 	}
+
 	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
-		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
-			return t;
+		if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
+			if (t->dev->type == dev_type)
+				return t;
+			if (t->dev->type == ARPHRD_IPGRE && !t2)
+				t2 = t;
+		}
 	}
 
+	if (t2)
+		return t2;
+
 	if (ign->fb_tunnel_dev->flags&IFF_UP)
 		return netdev_priv(ign->fb_tunnel_dev);
 	return NULL;
@@ -249,25 +279,37 @@ static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
 	}
 }
 
-static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
-		struct ip_tunnel_parm *parms, int create)
+static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
+					   struct ip_tunnel_parm *parms,
+					   int type)
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
 	__be32 key = parms->i_key;
-	struct ip_tunnel *t, **tp, *nt;
+	struct ip_tunnel *t, **tp;
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+
+	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
+		if (local == t->parms.iph.saddr &&
+		    remote == t->parms.iph.daddr &&
+		    key == t->parms.i_key &&
+		    type == t->dev->type)
+			break;
+
+	return t;
+}
+
+static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
+		struct ip_tunnel_parm *parms, int create)
+{
+	struct ip_tunnel *t, *nt;
 	struct net_device *dev;
 	char name[IFNAMSIZ];
 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 
-	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) {
-		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
-			if (key == t->parms.i_key)
-				return t;
-		}
-	}
-	if (!create)
-		return NULL;
+	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
+	if (t || !create)
+		return t;
 
 	if (parms->name[0])
 		strlcpy(name, parms->name, IFNAMSIZ);
@@ -285,9 +327,11 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
 		goto failed_free;
 	}
 
-	dev->init = ipgre_tunnel_init;
 	nt = netdev_priv(dev);
 	nt->parms = *parms;
+	dev->rtnl_link_ops = &ipgre_link_ops;
+
+	dev->mtu = ipgre_tunnel_bind_dev(dev);
 
 	if (register_netdevice(dev) < 0)
 		goto failed_free;
@@ -380,8 +424,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
 
 	read_lock(&ipgre_lock);
 	t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
-				(flags&GRE_KEY) ?
-				*(((__be32*)p) + (grehlen>>2) - 1) : 0);
+				flags & GRE_KEY ?
+				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
+				p[1]);
 	if (t == NULL || t->parms.iph.daddr == 0 ||
 	    ipv4_is_multicast(t->parms.iph.daddr))
 		goto out;
@@ -431,6 +476,8 @@ static int ipgre_rcv(struct sk_buff *skb)
 	u32    seqno = 0;
 	struct ip_tunnel *tunnel;
 	int    offset = 4;
+	__be16 gre_proto;
+	unsigned int len;
 
 	if (!pskb_may_pull(skb, 16))
 		goto drop_nolock;
@@ -470,20 +517,22 @@ static int ipgre_rcv(struct sk_buff *skb)
 		}
 	}
 
+	gre_proto = *(__be16 *)(h + 2);
+
 	read_lock(&ipgre_lock);
 	if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
-					iph->saddr, iph->daddr, key)) != NULL) {
+					  iph->saddr, iph->daddr, key,
+					  gre_proto))) {
 		struct net_device_stats *stats = &tunnel->dev->stats;
 
 		secpath_reset(skb);
 
-		skb->protocol = *(__be16*)(h + 2);
+		skb->protocol = gre_proto;
 		/* WCCP version 1 and 2 protocol decoding.
 		 * - Change protocol to IP
 		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
 		 */
-		if (flags == 0 &&
-		    skb->protocol == htons(ETH_P_WCCP)) {
+		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
 			skb->protocol = htons(ETH_P_IP);
 			if ((*(h + offset) & 0xF0) != 0x40)
 				offset += 4;
@@ -491,7 +540,6 @@ static int ipgre_rcv(struct sk_buff *skb)
 
 		skb->mac_header = skb->network_header;
 		__pskb_pull(skb, offset);
-		skb_reset_network_header(skb);
 		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
 		skb->pkt_type = PACKET_HOST;
 #ifdef CONFIG_NET_IPGRE_BROADCAST
@@ -519,13 +567,32 @@ static int ipgre_rcv(struct sk_buff *skb)
 			}
 			tunnel->i_seqno = seqno + 1;
 		}
+
+		len = skb->len;
+
+		/* Warning: All skb pointers will be invalidated! */
+		if (tunnel->dev->type == ARPHRD_ETHER) {
+			if (!pskb_may_pull(skb, ETH_HLEN)) {
+				stats->rx_length_errors++;
+				stats->rx_errors++;
+				goto drop;
+			}
+
+			iph = ip_hdr(skb);
+			skb->protocol = eth_type_trans(skb, tunnel->dev);
+			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+		}
+
 		stats->rx_packets++;
-		stats->rx_bytes += skb->len;
+		stats->rx_bytes += len;
 		skb->dev = tunnel->dev;
 		dst_release(skb->dst);
 		skb->dst = NULL;
 		nf_reset(skb);
+
+		skb_reset_network_header(skb);
 		ipgre_ecn_decapsulate(iph, skb);
+
 		netif_rx(skb);
 		read_unlock(&ipgre_lock);
 		return(0);
@@ -560,7 +627,10 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto tx_error;
 	}
 
-	if (dev->header_ops) {
+	if (dev->type == ARPHRD_ETHER)
+		IPCB(skb)->flags = 0;
+
+	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
 		gre_hlen = 0;
 		tiph = (struct iphdr*)skb->data;
 	} else {
@@ -637,7 +707,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	df = tiph->frag_off;
 	if (df)
-		mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
+		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
 	else
 		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
 
@@ -703,7 +773,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		old_iph = ip_hdr(skb);
 	}
 
-	skb->transport_header = skb->network_header;
+	skb_reset_transport_header(skb);
 	skb_push(skb, gre_hlen);
 	skb_reset_network_header(skb);
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -736,8 +806,9 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
 	}
 
-	((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
-	((__be16*)(iph+1))[1] = skb->protocol;
+	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
+	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
+				   htons(ETH_P_TEB) : skb->protocol;
 
 	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
 		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
@@ -773,7 +844,7 @@ tx_error:
 	return 0;
 }
 
-static void ipgre_tunnel_bind_dev(struct net_device *dev)
+static int ipgre_tunnel_bind_dev(struct net_device *dev)
 {
 	struct net_device *tdev = NULL;
 	struct ip_tunnel *tunnel;
@@ -785,7 +856,7 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev)
 	tunnel = netdev_priv(dev);
 	iph = &tunnel->parms.iph;
 
-	/* Guess output device to choose reasonable mtu and hard_header_len */
+	/* Guess output device to choose reasonable mtu and needed_headroom */
 
 	if (iph->daddr) {
 		struct flowi fl = { .oif = tunnel->parms.link,
@@ -799,14 +870,16 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev)
 			tdev = rt->u.dst.dev;
 			ip_rt_put(rt);
 		}
-		dev->flags |= IFF_POINTOPOINT;
+
+		if (dev->type != ARPHRD_ETHER)
+			dev->flags |= IFF_POINTOPOINT;
 	}
 
 	if (!tdev && tunnel->parms.link)
 		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
 
 	if (tdev) {
-		hlen = tdev->hard_header_len;
+		hlen = tdev->hard_header_len + tdev->needed_headroom;
 		mtu = tdev->mtu;
 	}
 	dev->iflink = tunnel->parms.link;
@@ -820,10 +893,15 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev)
 		if (tunnel->parms.o_flags&GRE_SEQ)
 			addend += 4;
 	}
-	dev->hard_header_len = hlen + addend;
-	dev->mtu = mtu - addend;
+	dev->needed_headroom = addend + hlen;
+	mtu -= dev->hard_header_len - addend;
+
+	if (mtu < 68)
+		mtu = 68;
+
 	tunnel->hlen = addend;
 
+	return mtu;
 }
 
 static int
@@ -917,7 +995,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 				t->parms.iph.frag_off = p.iph.frag_off;
 				if (t->parms.link != p.link) {
 					t->parms.link = p.link;
-					ipgre_tunnel_bind_dev(dev);
+					dev->mtu = ipgre_tunnel_bind_dev(dev);
 					netdev_state_change(dev);
 				}
 			}
@@ -959,7 +1037,8 @@ done:
 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
+	if (new_mtu < 68 ||
+	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
 		return -EINVAL;
 	dev->mtu = new_mtu;
 	return 0;
@@ -1078,6 +1157,7 @@ static int ipgre_close(struct net_device *dev)
 
 static void ipgre_tunnel_setup(struct net_device *dev)
 {
+	dev->init		= ipgre_tunnel_init;
 	dev->uninit		= ipgre_tunnel_uninit;
 	dev->destructor 	= free_netdev;
 	dev->hard_start_xmit	= ipgre_tunnel_xmit;
@@ -1085,7 +1165,7 @@ static void ipgre_tunnel_setup(struct net_device *dev)
 	dev->change_mtu		= ipgre_tunnel_change_mtu;
 
 	dev->type		= ARPHRD_IPGRE;
-	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
+	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
 	dev->flags		= IFF_NOARP;
 	dev->iflink		= 0;
@@ -1107,8 +1187,6 @@ static int ipgre_tunnel_init(struct net_device *dev)
 	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
 
-	ipgre_tunnel_bind_dev(dev);
-
 	if (iph->daddr) {
 #ifdef CONFIG_NET_IPGRE_BROADCAST
 		if (ipv4_is_multicast(iph->daddr)) {
@@ -1189,6 +1267,7 @@ static int ipgre_init_net(struct net *net)
 
 	ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
 	dev_net_set(ign->fb_tunnel_dev, net);
+	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
 
 	if ((err = register_netdev(ign->fb_tunnel_dev)))
 		goto err_reg_dev;
@@ -1221,6 +1300,298 @@ static struct pernet_operations ipgre_net_ops = {
 	.exit = ipgre_exit_net,
 };
 
+static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	__be16 flags;
+
+	if (!data)
+		return 0;
+
+	flags = 0;
+	if (data[IFLA_GRE_IFLAGS])
+		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+	if (data[IFLA_GRE_OFLAGS])
+		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+	if (flags & (GRE_VERSION|GRE_ROUTING))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	__be32 daddr;
+
+	if (tb[IFLA_ADDRESS]) {
+		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+			return -EINVAL;
+		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+			return -EADDRNOTAVAIL;
+	}
+
+	if (!data)
+		goto out;
+
+	if (data[IFLA_GRE_REMOTE]) {
+		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
+		if (!daddr)
+			return -EINVAL;
+	}
+
+out:
+	return ipgre_tunnel_validate(tb, data);
+}
+
+static void ipgre_netlink_parms(struct nlattr *data[],
+				struct ip_tunnel_parm *parms)
+{
+	memset(parms, 0, sizeof(*parms));
+
+	parms->iph.protocol = IPPROTO_GRE;
+
+	if (!data)
+		return;
+
+	if (data[IFLA_GRE_LINK])
+		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
+
+	if (data[IFLA_GRE_IFLAGS])
+		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
+
+	if (data[IFLA_GRE_OFLAGS])
+		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
+
+	if (data[IFLA_GRE_IKEY])
+		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
+
+	if (data[IFLA_GRE_OKEY])
+		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
+
+	if (data[IFLA_GRE_LOCAL])
+		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
+
+	if (data[IFLA_GRE_REMOTE])
+		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
+
+	if (data[IFLA_GRE_TTL])
+		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
+
+	if (data[IFLA_GRE_TOS])
+		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
+
+	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
+		parms->iph.frag_off = htons(IP_DF);
+}
+
+static int ipgre_tap_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel;
+
+	tunnel = netdev_priv(dev);
+
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+
+	ipgre_tunnel_bind_dev(dev);
+
+	return 0;
+}
+
+static void ipgre_tap_setup(struct net_device *dev)
+{
+
+	ether_setup(dev);
+
+	dev->init		= ipgre_tap_init;
+	dev->uninit		= ipgre_tunnel_uninit;
+	dev->destructor 	= free_netdev;
+	dev->hard_start_xmit	= ipgre_tunnel_xmit;
+	dev->change_mtu		= ipgre_tunnel_change_mtu;
+
+	dev->iflink		= 0;
+	dev->features		|= NETIF_F_NETNS_LOCAL;
+}
+
+static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
+			 struct nlattr *data[])
+{
+	struct ip_tunnel *nt;
+	struct net *net = dev_net(dev);
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	int mtu;
+	int err;
+
+	nt = netdev_priv(dev);
+	ipgre_netlink_parms(data, &nt->parms);
+
+	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
+		return -EEXIST;
+
+	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+		random_ether_addr(dev->dev_addr);
+
+	mtu = ipgre_tunnel_bind_dev(dev);
+	if (!tb[IFLA_MTU])
+		dev->mtu = mtu;
+
+	err = register_netdevice(dev);
+	if (err)
+		goto out;
+
+	dev_hold(dev);
+	ipgre_tunnel_link(ign, nt);
+
+out:
+	return err;
+}
+
+static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
+			    struct nlattr *data[])
+{
+	struct ip_tunnel *t, *nt;
+	struct net *net = dev_net(dev);
+	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	struct ip_tunnel_parm p;
+	int mtu;
+
+	if (dev == ign->fb_tunnel_dev)
+		return -EINVAL;
+
+	nt = netdev_priv(dev);
+	ipgre_netlink_parms(data, &p);
+
+	t = ipgre_tunnel_locate(net, &p, 0);
+
+	if (t) {
+		if (t->dev != dev)
+			return -EEXIST;
+	} else {
+		unsigned nflags = 0;
+
+		t = nt;
+
+		if (ipv4_is_multicast(p.iph.daddr))
+			nflags = IFF_BROADCAST;
+		else if (p.iph.daddr)
+			nflags = IFF_POINTOPOINT;
+
+		if ((dev->flags ^ nflags) &
+		    (IFF_POINTOPOINT | IFF_BROADCAST))
+			return -EINVAL;
+
+		ipgre_tunnel_unlink(ign, t);
+		t->parms.iph.saddr = p.iph.saddr;
+		t->parms.iph.daddr = p.iph.daddr;
+		t->parms.i_key = p.i_key;
+		memcpy(dev->dev_addr, &p.iph.saddr, 4);
+		memcpy(dev->broadcast, &p.iph.daddr, 4);
+		ipgre_tunnel_link(ign, t);
+		netdev_state_change(dev);
+	}
+
+	t->parms.o_key = p.o_key;
+	t->parms.iph.ttl = p.iph.ttl;
+	t->parms.iph.tos = p.iph.tos;
+	t->parms.iph.frag_off = p.iph.frag_off;
+
+	if (t->parms.link != p.link) {
+		t->parms.link = p.link;
+		mtu = ipgre_tunnel_bind_dev(dev);
+		if (!tb[IFLA_MTU])
+			dev->mtu = mtu;
+		netdev_state_change(dev);
+	}
+
+	return 0;
+}
+
+static size_t ipgre_get_size(const struct net_device *dev)
+{
+	return
+		/* IFLA_GRE_LINK */
+		nla_total_size(4) +
+		/* IFLA_GRE_IFLAGS */
+		nla_total_size(2) +
+		/* IFLA_GRE_OFLAGS */
+		nla_total_size(2) +
+		/* IFLA_GRE_IKEY */
+		nla_total_size(4) +
+		/* IFLA_GRE_OKEY */
+		nla_total_size(4) +
+		/* IFLA_GRE_LOCAL */
+		nla_total_size(4) +
+		/* IFLA_GRE_REMOTE */
+		nla_total_size(4) +
+		/* IFLA_GRE_TTL */
+		nla_total_size(1) +
+		/* IFLA_GRE_TOS */
+		nla_total_size(1) +
+		/* IFLA_GRE_PMTUDISC */
+		nla_total_size(1) +
+		0;
+}
+
+static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct ip_tunnel *t = netdev_priv(dev);
+	struct ip_tunnel_parm *p = &t->parms;
+
+	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
+	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
+	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
+	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
+	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
+	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
+	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
+	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
+	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
+	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
+	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
+	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
+	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
+	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
+	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
+	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
+	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
+	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
+};
+
+static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
+	.kind		= "gre",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ipgre_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= ipgre_tunnel_setup,
+	.validate	= ipgre_tunnel_validate,
+	.newlink	= ipgre_newlink,
+	.changelink	= ipgre_changelink,
+	.get_size	= ipgre_get_size,
+	.fill_info	= ipgre_fill_info,
+};
+
+static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
+	.kind		= "gretap",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ipgre_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= ipgre_tap_setup,
+	.validate	= ipgre_tap_validate,
+	.newlink	= ipgre_newlink,
+	.changelink	= ipgre_changelink,
+	.get_size	= ipgre_get_size,
+	.fill_info	= ipgre_fill_info,
+};
+
 /*
  * And now the modules code and kernel interface.
  */
@@ -1238,19 +1609,39 @@ static int __init ipgre_init(void)
 
 	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
 	if (err < 0)
-		inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
+		goto gen_device_failed;
 
+	err = rtnl_link_register(&ipgre_link_ops);
+	if (err < 0)
+		goto rtnl_link_failed;
+
+	err = rtnl_link_register(&ipgre_tap_ops);
+	if (err < 0)
+		goto tap_ops_failed;
+
+out:
 	return err;
+
+tap_ops_failed:
+	rtnl_link_unregister(&ipgre_link_ops);
+rtnl_link_failed:
+	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
+gen_device_failed:
+	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
+	goto out;
 }
 
 static void __exit ipgre_fini(void)
 {
+	rtnl_link_unregister(&ipgre_tap_ops);
+	rtnl_link_unregister(&ipgre_link_ops);
+	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
 	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
 		printk(KERN_INFO "ipgre close: can't remove protocol\n");
-
-	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
 }
 
 module_init(ipgre_init);
 module_exit(ipgre_fini);
 MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("gre");
+MODULE_ALIAS_RTNL_LINK("gretap");
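For orientation when reading the receive and transmit paths above: this is the GRE header layout from RFC 2784/2890, sketched as a struct (the struct and its name are illustrative only, not part of the patch). gre_proto = *(__be16 *)(h + 2) in ipgre_rcv() reads the protocol field; htons(ETH_P_TEB) there marks a bridged Ethernet payload and steers ipgre_tunnel_lookup() toward an ARPHRD_ETHER (gretap) device. The optional words are also why offset starts at 4 and grows by 4 per flagged field.

/* Hypothetical illustration of the on-the-wire GRE header. */
struct gre_hdr_sketch {
	__be16 flags;		/* GRE_CSUM | GRE_KEY | GRE_SEQ + version bits */
	__be16 protocol;	/* htons(ETH_P_TEB) for gretap payloads */
	/* Optional words follow, in this order, when flagged:
	 *	__be16 csum, reserved;	if (flags & GRE_CSUM)
	 *	__be32 key;		if (flags & GRE_KEY)
	 *	__be32 seq;		if (flags & GRE_SEQ)
	 */
};

With the module aliases above, a request to create a "gre" or "gretap" link auto-loads this module; an iproute2 new enough to know the IFLA_GRE_* attributes can then create such devices over rtnetlink.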
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d533a89e08de..d2a8f8bb78a6 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -340,6 +340,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 					    .saddr = inet->saddr,
 					    .tos = RT_CONN_FLAGS(sk) } },
 			    .proto = sk->sk_protocol,
+			    .flags = inet_sk_flowi_flags(sk),
 			    .uli_u = { .ports =
 				       { .sport = inet->sport,
 					 .dport = inet->dport } } };
@@ -1371,7 +1372,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 			   .uli_u = { .ports =
 				      { .sport = tcp_hdr(skb)->dest,
 					.dport = tcp_hdr(skb)->source } },
-			   .proto = sk->sk_protocol };
+			   .proto = sk->sk_protocol,
+			   .flags = ip_reply_arg_flowi_flags(arg) };
 		security_skb_classify_flow(skb, &fl);
 		if (ip_route_output_key(sock_net(sk), &rt, &fl))
 			return;
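Both hunks pass socket-derived flags into the route lookup. The helpers are added outside net/ipv4 in this series (in include/net/inet_sock.h and include/net/ip.h), so their definitions do not appear in this diff; presumably they reduce to something like the following, mapping the transparent state to FLOWI_FLAG_ANYSRC so that output route lookups accept a non-local source address:

/* Presumed shape of the helpers referenced above; the real
 * definitions live in headers outside this net/ipv4 diff. */
static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
{
	return inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0;
}

static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
{
	return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
}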
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 105d92a039b9..465abf0a9869 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -419,7 +419,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			     (1<<IP_TTL) | (1<<IP_HDRINCL) |
 			     (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
 			     (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
-			     (1<<IP_PASSSEC))) ||
+			     (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) ||
 	    optname == IP_MULTICAST_TTL ||
 	    optname == IP_MULTICAST_LOOP) {
 		if (optlen >= sizeof(int)) {
@@ -878,6 +878,16 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		err = xfrm_user_policy(sk, optname, optval, optlen);
 		break;
 
+	case IP_TRANSPARENT:
+		if (!capable(CAP_NET_ADMIN)) {
+			err = -EPERM;
+			break;
+		}
+		if (optlen < 1)
+			goto e_inval;
+		inet->transparent = !!val;
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -1130,6 +1140,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_FREEBIND:
 		val = inet->freebind;
 		break;
+	case IP_TRANSPARENT:
+		val = inet->transparent;
+		break;
 	default:
 		release_sock(sk);
 		return -ENOPROTOOPT;
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig deleted file mode 100644 index 09d0c3f35669..000000000000 --- a/net/ipv4/ipvs/Kconfig +++ /dev/null | |||
@@ -1,224 +0,0 @@ | |||
1 | # | ||
2 | # IP Virtual Server configuration | ||
3 | # | ||
4 | menuconfig IP_VS | ||
5 | tristate "IP virtual server support (EXPERIMENTAL)" | ||
6 | depends on NETFILTER | ||
7 | ---help--- | ||
8 | IP Virtual Server support will let you build a high-performance | ||
9 | virtual server based on cluster of two or more real servers. This | ||
10 | option must be enabled for at least one of the clustered computers | ||
11 | that will take care of intercepting incoming connections to a | ||
12 | single IP address and scheduling them to real servers. | ||
13 | |||
14 | Three request dispatching techniques are implemented, they are | ||
15 | virtual server via NAT, virtual server via tunneling and virtual | ||
16 | server via direct routing. The several scheduling algorithms can | ||
17 | be used to choose which server the connection is directed to, | ||
18 | thus load balancing can be achieved among the servers. For more | ||
19 | information and its administration program, please visit the | ||
20 | following URL: <http://www.linuxvirtualserver.org/>. | ||
21 | |||
22 | If you want to compile it in kernel, say Y. To compile it as a | ||
23 | module, choose M here. If unsure, say N. | ||
24 | |||
25 | if IP_VS | ||
26 | |||
27 | config IP_VS_DEBUG | ||
28 | bool "IP virtual server debugging" | ||
29 | ---help--- | ||
30 | Say Y here if you want to get additional messages useful in | ||
31 | debugging the IP virtual server code. You can change the debug | ||
32 | level in /proc/sys/net/ipv4/vs/debug_level | ||
33 | |||
34 | config IP_VS_TAB_BITS | ||
35 | int "IPVS connection table size (the Nth power of 2)" | ||
36 | default "12" | ||
37 | ---help--- | ||
38 | The IPVS connection hash table uses the chaining scheme to handle | ||
39 | hash collisions. Using a big IPVS connection hash table will greatly | ||
40 | reduce conflicts when there are hundreds of thousands of connections | ||
41 | in the hash table. | ||
42 | |||
43 | Note the table size must be power of 2. The table size will be the | ||
44 | value of 2 to the your input number power. The number to choose is | ||
45 | from 8 to 20, the default number is 12, which means the table size | ||
46 | is 4096. Don't input the number too small, otherwise you will lose | ||
47 | performance on it. You can adapt the table size yourself, according | ||
48 | to your virtual server application. It is good to set the table size | ||
49 | not far less than the number of connections per second multiplying | ||
50 | average lasting time of connection in the table. For example, your | ||
51 | virtual server gets 200 connections per second, the connection lasts | ||
52 | for 200 seconds in average in the connection table, the table size | ||
53 | should be not far less than 200x200, it is good to set the table | ||
54 | size 32768 (2**15). | ||
55 | |||
56 | Another note that each connection occupies 128 bytes effectively and | ||
57 | each hash entry uses 8 bytes, so you can estimate how much memory is | ||
58 | needed for your box. | ||
59 | |||
60 | comment "IPVS transport protocol load balancing support" | ||
61 | |||
62 | config IP_VS_PROTO_TCP | ||
63 | bool "TCP load balancing support" | ||
64 | ---help--- | ||
65 | This option enables support for load balancing TCP transport | ||
66 | protocol. Say Y if unsure. | ||
67 | |||
68 | config IP_VS_PROTO_UDP | ||
69 | bool "UDP load balancing support" | ||
70 | ---help--- | ||
71 | This option enables support for load balancing UDP transport | ||
72 | protocol. Say Y if unsure. | ||
73 | |||
74 | config IP_VS_PROTO_ESP | ||
75 | bool "ESP load balancing support" | ||
76 | ---help--- | ||
77 | This option enables support for load balancing ESP (Encapsulation | ||
78 | Security Payload) transport protocol. Say Y if unsure. | ||
79 | |||
80 | config IP_VS_PROTO_AH | ||
81 | bool "AH load balancing support" | ||
82 | ---help--- | ||
83 | This option enables support for load balancing AH (Authentication | ||
84 | Header) transport protocol. Say Y if unsure. | ||
85 | |||
86 | comment "IPVS scheduler" | ||
87 | |||
88 | config IP_VS_RR | ||
89 | tristate "round-robin scheduling" | ||
90 | ---help--- | ||
91 | The robin-robin scheduling algorithm simply directs network | ||
92 | connections to different real servers in a round-robin manner. | ||
93 | |||
94 | If you want to compile it in kernel, say Y. To compile it as a | ||
95 | module, choose M here. If unsure, say N. | ||
96 | |||
97 | config IP_VS_WRR | ||
98 | tristate "weighted round-robin scheduling" | ||
99 | ---help--- | ||
100 | The weighted robin-robin scheduling algorithm directs network | ||
101 | connections to different real servers based on server weights | ||
102 | in a round-robin manner. Servers with higher weights receive | ||
103 | new connections first than those with less weights, and servers | ||
104 | with higher weights get more connections than those with less | ||
105 | weights and servers with equal weights get equal connections. | ||
106 | |||
107 | If you want to compile it in kernel, say Y. To compile it as a | ||
108 | module, choose M here. If unsure, say N. | ||
109 | |||
110 | config IP_VS_LC | ||
111 | tristate "least-connection scheduling" | ||
112 | ---help--- | ||
113 | The least-connection scheduling algorithm directs network | ||
114 | connections to the server with the least number of active | ||
115 | connections. | ||
116 | |||
117 | If you want to compile it in kernel, say Y. To compile it as a | ||
118 | module, choose M here. If unsure, say N. | ||
119 | |||
120 | config IP_VS_WLC | ||
121 | tristate "weighted least-connection scheduling" | ||
122 | ---help--- | ||
123 | The weighted least-connection scheduling algorithm directs network | ||
124 | connections to the server with the least active connections | ||
125 | normalized by the server weight. | ||
126 | |||
127 | If you want to compile it in kernel, say Y. To compile it as a | ||
128 | module, choose M here. If unsure, say N. | ||
129 | |||
130 | config IP_VS_LBLC | ||
131 | tristate "locality-based least-connection scheduling" | ||
132 | ---help--- | ||
133 | The locality-based least-connection scheduling algorithm is for | ||
134 | destination IP load balancing. It is usually used in cache cluster. | ||
135 | This algorithm usually directs packet destined for an IP address to | ||
136 | its server if the server is alive and under load. If the server is | ||
137 | overloaded (its active connection numbers is larger than its weight) | ||
138 | and there is a server in its half load, then allocate the weighted | ||
139 | least-connection server to this IP address. | ||
140 | |||
141 | If you want to compile it in kernel, say Y. To compile it as a | ||
142 | module, choose M here. If unsure, say N. | ||
143 | |||
144 | config IP_VS_LBLCR | ||
145 | tristate "locality-based least-connection with replication scheduling" | ||
146 | ---help--- | ||
147 | The locality-based least-connection with replication scheduling | ||
148 | algorithm is also for destination IP load balancing. It is | ||
149 | usually used in cache cluster. It differs from the LBLC scheduling | ||
150 | as follows: the load balancer maintains mappings from a target | ||
151 | to a set of server nodes that can serve the target. Requests for | ||
152 | a target are assigned to the least-connection node in the target's | ||
153 | server set. If all the node in the server set are over loaded, | ||
154 | it picks up a least-connection node in the cluster and adds it | ||
155 | in the sever set for the target. If the server set has not been | ||
156 | modified for the specified time, the most loaded node is removed | ||
157 | from the server set, in order to avoid high degree of replication. | ||
158 | |||
159 | If you want to compile it in kernel, say Y. To compile it as a | ||
160 | module, choose M here. If unsure, say N. | ||
161 | |||
162 | config IP_VS_DH | ||
163 | tristate "destination hashing scheduling" | ||
164 | ---help--- | ||
165 | The destination hashing scheduling algorithm assigns network | ||
166 | connections to the servers through looking up a statically assigned | ||
167 | hash table by their destination IP addresses. | ||
168 | |||
169 | If you want to compile it in kernel, say Y. To compile it as a | ||
170 | module, choose M here. If unsure, say N. | ||
171 | |||
172 | config IP_VS_SH | ||
173 | tristate "source hashing scheduling" | ||
174 | ---help--- | ||
175 | The source hashing scheduling algorithm assigns network | ||
176 | connections to the servers through looking up a statically assigned | ||
177 | hash table by their source IP addresses. | ||
178 | |||
179 | If you want to compile it in kernel, say Y. To compile it as a | ||
180 | module, choose M here. If unsure, say N. | ||
181 | |||
182 | config IP_VS_SED | ||
183 | tristate "shortest expected delay scheduling" | ||
184 | ---help--- | ||
185 | The shortest expected delay scheduling algorithm assigns network | ||
186 | connections to the server with the shortest expected delay. The | ||
187 | expected delay that the job will experience is (Ci + 1) / Ui if | ||
188 | sent to the ith server, in which Ci is the number of connections | ||
189 | on the ith server and Ui is the fixed service rate (weight) | ||
190 | of the ith server. | ||
191 | |||
192 | If you want to compile it in kernel, say Y. To compile it as a | ||
193 | module, choose M here. If unsure, say N. | ||
194 | |||
195 | config IP_VS_NQ | ||
196 | tristate "never queue scheduling" | ||
197 | ---help--- | ||
198 | The never queue scheduling algorithm adopts a two-speed model. | ||
199 | When there is an idle server available, the job will be sent to | ||
200 | the idle server, instead of waiting for a fast one. When there | ||
201 | is no idle server available, the job will be sent to the server | ||
202 | that minimize its expected delay (The Shortest Expected Delay | ||
203 | scheduling algorithm). | ||
204 | |||
205 | If you want to compile it in kernel, say Y. To compile it as a | ||
206 | module, choose M here. If unsure, say N. | ||
207 | |||
208 | comment 'IPVS application helper' | ||
209 | |||
210 | config IP_VS_FTP | ||
211 | tristate "FTP protocol helper" | ||
212 | depends on IP_VS_PROTO_TCP | ||
213 | ---help--- | ||
214 | FTP is a protocol that transfers IP address and/or port number in | ||
215 | the payload. In the virtual server via Network Address Translation, | ||
216 | the IP address and port number of real servers cannot be sent to | ||
217 | clients in ftp connections directly, so FTP protocol helper is | ||
218 | required for tracking the connection and mangling it back to that of | ||
219 | virtual service. | ||
220 | |||
221 | If you want to compile it in kernel, say Y. To compile it as a | ||
222 | module, choose M here. If unsure, say N. | ||
223 | |||
224 | endif # IP_VS | ||
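
For illustration: a PASV reply carries the data-connection endpoint as text, e.g. "227 Entering Passive Mode (192,168,0,10,4,1)", so under NAT the helper must swap the real server's address for the virtual one. A hypothetical sketch of building the replacement token follows; the real helper (ip_vs_ftp.c) splices it into the packet with ip_vs_skb_replace(), which appears later in this diff:

    #include <stdint.h>
    #include <stdio.h>

    /* Format the virtual address/port (host byte order) as the
     * "h1,h2,h3,h4,p1,p2" token used inside a PASV reply. */
    int pasv_token(char *buf, size_t len, uint32_t vaddr, uint16_t vport)
    {
        return snprintf(buf, len, "%u,%u,%u,%u,%u,%u",
                        (vaddr >> 24) & 0xff, (vaddr >> 16) & 0xff,
                        (vaddr >> 8) & 0xff, vaddr & 0xff,
                        (vport >> 8) & 0xff, vport & 0xff);
    }
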
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile deleted file mode 100644 index 30e85de9ffff..000000000000 --- a/net/ipv4/ipvs/Makefile +++ /dev/null | |||
@@ -1,34 +0,0 @@ | |||
1 | # | ||
2 | # Makefile for the IPVS modules on top of IPv4. | ||
3 | # | ||
4 | |||
5 | # IPVS transport protocol load balancing support | ||
6 | ip_vs_proto-objs-y := | ||
7 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o | ||
8 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o | ||
9 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o | ||
10 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o | ||
11 | |||
12 | ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ | ||
13 | ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ | ||
14 | ip_vs_est.o ip_vs_proto.o \ | ||
15 | $(ip_vs_proto-objs-y) | ||
16 | |||
17 | |||
18 | # IPVS core | ||
19 | obj-$(CONFIG_IP_VS) += ip_vs.o | ||
20 | |||
21 | # IPVS schedulers | ||
22 | obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o | ||
23 | obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o | ||
24 | obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o | ||
25 | obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o | ||
26 | obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o | ||
27 | obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o | ||
28 | obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o | ||
29 | obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o | ||
30 | obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o | ||
31 | obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o | ||
32 | |||
33 | # IPVS application helpers | ||
34 | obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o | ||
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c deleted file mode 100644 index 201b8ea3020d..000000000000 --- a/net/ipv4/ipvs/ip_vs_app.c +++ /dev/null | |||
@@ -1,622 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_app.c: Application module support for IPVS | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Most of the code here is taken from ip_masq_app.c in kernel 2.2. The | ||
12 | * difference is that the ip_vs_app module handles the reverse direction | ||
13 | * (incoming requests and outgoing responses). | ||
14 | * | ||
15 | * IP_MASQ_APP application masquerading module | ||
16 | * | ||
17 | * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> | ||
18 | * | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/skbuff.h> | ||
24 | #include <linux/in.h> | ||
25 | #include <linux/ip.h> | ||
26 | #include <linux/netfilter.h> | ||
27 | #include <net/net_namespace.h> | ||
28 | #include <net/protocol.h> | ||
29 | #include <net/tcp.h> | ||
30 | #include <asm/system.h> | ||
31 | #include <linux/stat.h> | ||
32 | #include <linux/proc_fs.h> | ||
33 | #include <linux/seq_file.h> | ||
34 | #include <linux/mutex.h> | ||
35 | |||
36 | #include <net/ip_vs.h> | ||
37 | |||
38 | EXPORT_SYMBOL(register_ip_vs_app); | ||
39 | EXPORT_SYMBOL(unregister_ip_vs_app); | ||
40 | EXPORT_SYMBOL(register_ip_vs_app_inc); | ||
41 | |||
42 | /* ipvs application list head */ | ||
43 | static LIST_HEAD(ip_vs_app_list); | ||
44 | static DEFINE_MUTEX(__ip_vs_app_mutex); | ||
45 | |||
46 | |||
47 | /* | ||
48 | * Get an ip_vs_app object | ||
49 | */ | ||
50 | static inline int ip_vs_app_get(struct ip_vs_app *app) | ||
51 | { | ||
52 | return try_module_get(app->module); | ||
53 | } | ||
54 | |||
55 | |||
56 | static inline void ip_vs_app_put(struct ip_vs_app *app) | ||
57 | { | ||
58 | module_put(app->module); | ||
59 | } | ||
60 | |||
61 | |||
62 | /* | ||
63 | * Allocate/initialize app incarnation and register it in proto apps. | ||
64 | */ | ||
65 | static int | ||
66 | ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) | ||
67 | { | ||
68 | struct ip_vs_protocol *pp; | ||
69 | struct ip_vs_app *inc; | ||
70 | int ret; | ||
71 | |||
72 | if (!(pp = ip_vs_proto_get(proto))) | ||
73 | return -EPROTONOSUPPORT; | ||
74 | |||
75 | if (!pp->unregister_app) | ||
76 | return -EOPNOTSUPP; | ||
77 | |||
78 | inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); | ||
79 | if (!inc) | ||
80 | return -ENOMEM; | ||
81 | INIT_LIST_HEAD(&inc->p_list); | ||
82 | INIT_LIST_HEAD(&inc->incs_list); | ||
83 | inc->app = app; | ||
84 | inc->port = htons(port); | ||
85 | atomic_set(&inc->usecnt, 0); | ||
86 | |||
87 | if (app->timeouts) { | ||
88 | inc->timeout_table = | ||
89 | ip_vs_create_timeout_table(app->timeouts, | ||
90 | app->timeouts_size); | ||
91 | if (!inc->timeout_table) { | ||
92 | ret = -ENOMEM; | ||
93 | goto out; | ||
94 | } | ||
95 | } | ||
96 | |||
97 | ret = pp->register_app(inc); | ||
98 | if (ret) | ||
99 | goto out; | ||
100 | |||
101 | list_add(&inc->a_list, &app->incs_list); | ||
102 | IP_VS_DBG(9, "%s application %s:%u registered\n", | ||
103 | pp->name, inc->name, inc->port); | ||
104 | |||
105 | return 0; | ||
106 | |||
107 | out: | ||
108 | kfree(inc->timeout_table); | ||
109 | kfree(inc); | ||
110 | return ret; | ||
111 | } | ||
112 | |||
113 | |||
114 | /* | ||
115 | * Release app incarnation | ||
116 | */ | ||
117 | static void | ||
118 | ip_vs_app_inc_release(struct ip_vs_app *inc) | ||
119 | { | ||
120 | struct ip_vs_protocol *pp; | ||
121 | |||
122 | if (!(pp = ip_vs_proto_get(inc->protocol))) | ||
123 | return; | ||
124 | |||
125 | if (pp->unregister_app) | ||
126 | pp->unregister_app(inc); | ||
127 | |||
128 | IP_VS_DBG(9, "%s App %s:%u unregistered\n", | ||
129 | pp->name, inc->name, inc->port); | ||
130 | |||
131 | list_del(&inc->a_list); | ||
132 | |||
133 | kfree(inc->timeout_table); | ||
134 | kfree(inc); | ||
135 | } | ||
136 | |||
137 | |||
138 | /* | ||
139 | * Get reference to app inc (only called from softirq) | ||
140 | * | ||
141 | */ | ||
142 | int ip_vs_app_inc_get(struct ip_vs_app *inc) | ||
143 | { | ||
144 | int result; | ||
145 | |||
146 | atomic_inc(&inc->usecnt); | ||
147 | if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) | ||
148 | atomic_dec(&inc->usecnt); | ||
149 | return result; | ||
150 | } | ||
151 | |||
152 | |||
153 | /* | ||
154 | * Put the app inc (only called from timer or net softirq) | ||
155 | */ | ||
156 | void ip_vs_app_inc_put(struct ip_vs_app *inc) | ||
157 | { | ||
158 | ip_vs_app_put(inc->app); | ||
159 | atomic_dec(&inc->usecnt); | ||
160 | } | ||
161 | |||
162 | |||
163 | /* | ||
164 | * Register an application incarnation in protocol applications | ||
165 | */ | ||
166 | int | ||
167 | register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) | ||
168 | { | ||
169 | int result; | ||
170 | |||
171 | mutex_lock(&__ip_vs_app_mutex); | ||
172 | |||
173 | result = ip_vs_app_inc_new(app, proto, port); | ||
174 | |||
175 | mutex_unlock(&__ip_vs_app_mutex); | ||
176 | |||
177 | return result; | ||
178 | } | ||
179 | |||
180 | |||
181 | /* | ||
182 | * ip_vs_app registration routine | ||
183 | */ | ||
184 | int register_ip_vs_app(struct ip_vs_app *app) | ||
185 | { | ||
186 | /* increase the module use count */ | ||
187 | ip_vs_use_count_inc(); | ||
188 | |||
189 | mutex_lock(&__ip_vs_app_mutex); | ||
190 | |||
191 | list_add(&app->a_list, &ip_vs_app_list); | ||
192 | |||
193 | mutex_unlock(&__ip_vs_app_mutex); | ||
194 | |||
195 | return 0; | ||
196 | } | ||
197 | |||
198 | |||
199 | /* | ||
200 | * ip_vs_app unregistration routine | ||
201 | * We are sure there are no app incarnations attached to services | ||
202 | */ | ||
203 | void unregister_ip_vs_app(struct ip_vs_app *app) | ||
204 | { | ||
205 | struct ip_vs_app *inc, *nxt; | ||
206 | |||
207 | mutex_lock(&__ip_vs_app_mutex); | ||
208 | |||
209 | list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { | ||
210 | ip_vs_app_inc_release(inc); | ||
211 | } | ||
212 | |||
213 | list_del(&app->a_list); | ||
214 | |||
215 | mutex_unlock(&__ip_vs_app_mutex); | ||
216 | |||
217 | /* decrease the module use count */ | ||
218 | ip_vs_use_count_dec(); | ||
219 | } | ||
220 | |||
221 | |||
222 | /* | ||
223 | * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) | ||
224 | */ | ||
225 | int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) | ||
226 | { | ||
227 | return pp->app_conn_bind(cp); | ||
228 | } | ||
229 | |||
230 | |||
231 | /* | ||
232 | * Unbind cp from application incarnation (called by cp destructor) | ||
233 | */ | ||
234 | void ip_vs_unbind_app(struct ip_vs_conn *cp) | ||
235 | { | ||
236 | struct ip_vs_app *inc = cp->app; | ||
237 | |||
238 | if (!inc) | ||
239 | return; | ||
240 | |||
241 | if (inc->unbind_conn) | ||
242 | inc->unbind_conn(inc, cp); | ||
243 | if (inc->done_conn) | ||
244 | inc->done_conn(inc, cp); | ||
245 | ip_vs_app_inc_put(inc); | ||
246 | cp->app = NULL; | ||
247 | } | ||
248 | |||
249 | |||
250 | /* | ||
251 | * Fixes th->seq based on ip_vs_seq info. | ||
252 | */ | ||
253 | static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) | ||
254 | { | ||
255 | __u32 seq = ntohl(th->seq); | ||
256 | |||
257 | /* | ||
258 | * Adjust seq with delta-offset for all packets after | ||
259 | * the most recent resized pkt seq and with previous_delta offset | ||
260 | * for all packets before most recent resized pkt seq. | ||
261 | */ | ||
262 | if (vseq->delta || vseq->previous_delta) { | ||
263 | if(after(seq, vseq->init_seq)) { | ||
264 | th->seq = htonl(seq + vseq->delta); | ||
265 | IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", | ||
266 | vseq->delta); | ||
267 | } else { | ||
268 | th->seq = htonl(seq + vseq->previous_delta); | ||
269 | IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " | ||
270 | "(%d) to seq\n", vseq->previous_delta); | ||
271 | } | ||
272 | } | ||
273 | } | ||
274 | |||
275 | |||
276 | /* | ||
277 | * Fixes th->ack_seq based on ip_vs_seq info. | ||
278 | */ | ||
279 | static inline void | ||
280 | vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) | ||
281 | { | ||
282 | __u32 ack_seq = ntohl(th->ack_seq); | ||
283 | |||
284 | /* | ||
285 | * Adjust ack_seq with delta-offset for | ||
286 | * the packets AFTER most recent resized pkt has caused a shift | ||
287 | * for packets before most recent resized pkt, use previous_delta | ||
288 | */ | ||
289 | if (vseq->delta || vseq->previous_delta) { | ||
290 | /* ack_seq is the number of the next octet the sender | ||
291 | expects to receive, so compare it with init_seq+delta */ | ||
292 | if(after(ack_seq, vseq->init_seq+vseq->delta)) { | ||
293 | th->ack_seq = htonl(ack_seq - vseq->delta); | ||
294 | IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " | ||
295 | "(%d) from ack_seq\n", vseq->delta); | ||
296 | |||
297 | } else { | ||
298 | th->ack_seq = htonl(ack_seq - vseq->previous_delta); | ||
299 | IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " | ||
300 | "previous_delta (%d) from ack_seq\n", | ||
301 | vseq->previous_delta); | ||
302 | } | ||
303 | } | ||
304 | } | ||
305 | |||
306 | |||
307 | /* | ||
308 | * Updates ip_vs_seq if pkt has been resized | ||
309 | * Assumes already checked proto==IPPROTO_TCP and diff!=0. | ||
310 | */ | ||
311 | static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, | ||
312 | unsigned flag, __u32 seq, int diff) | ||
313 | { | ||
314 | /* spinlock is to keep updating cp->flags atomic */ | ||
315 | spin_lock(&cp->lock); | ||
316 | if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { | ||
317 | vseq->previous_delta = vseq->delta; | ||
318 | vseq->delta += diff; | ||
319 | vseq->init_seq = seq; | ||
320 | cp->flags |= flag; | ||
321 | } | ||
322 | spin_unlock(&cp->lock); | ||
323 | } | ||
324 | |||
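
A standalone illustration of the delta/previous_delta bookkeeping implemented above, with assumed numbers: one segment starting at sequence 1000 grows by 7 bytes, so later sequence numbers shift while earlier retransmits do not:

    #include <assert.h>
    #include <stdint.h>

    struct seq_state { uint32_t init_seq; int delta, previous_delta; };

    /* mirrors vs_fix_seq(): full delta after the resized segment,
     * previous_delta for packets from before it */
    static uint32_t fix_seq(const struct seq_state *v, uint32_t seq)
    {
        if ((int32_t)(seq - v->init_seq) > 0)   /* after(seq, init_seq) */
            return seq + v->delta;
        return seq + v->previous_delta;
    }

    int main(void)
    {
        struct seq_state v = {0};

        /* as in vs_seq_update(): a segment at seq 1000 grew by 7 */
        v.previous_delta = v.delta;
        v.delta += 7;
        v.init_seq = 1000;

        assert(fix_seq(&v, 2000) == 2007);  /* later data shifts */
        assert(fix_seq(&v, 500) == 500);    /* old retransmit doesn't */
        return 0;
    }
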
325 | static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, | ||
326 | struct ip_vs_app *app) | ||
327 | { | ||
328 | int diff; | ||
329 | const unsigned int tcp_offset = ip_hdrlen(skb); | ||
330 | struct tcphdr *th; | ||
331 | __u32 seq; | ||
332 | |||
333 | if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) | ||
334 | return 0; | ||
335 | |||
336 | th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); | ||
337 | |||
338 | /* | ||
339 | * Remember seq number in case this pkt gets resized | ||
340 | */ | ||
341 | seq = ntohl(th->seq); | ||
342 | |||
343 | /* | ||
344 | * Fix seq stuff if flagged as so. | ||
345 | */ | ||
346 | if (cp->flags & IP_VS_CONN_F_OUT_SEQ) | ||
347 | vs_fix_seq(&cp->out_seq, th); | ||
348 | if (cp->flags & IP_VS_CONN_F_IN_SEQ) | ||
349 | vs_fix_ack_seq(&cp->in_seq, th); | ||
350 | |||
351 | /* | ||
352 | * Call private output hook function | ||
353 | */ | ||
354 | if (app->pkt_out == NULL) | ||
355 | return 1; | ||
356 | |||
357 | if (!app->pkt_out(app, cp, skb, &diff)) | ||
358 | return 0; | ||
359 | |||
360 | /* | ||
361 | * Update ip_vs seq stuff if len has changed. | ||
362 | */ | ||
363 | if (diff != 0) | ||
364 | vs_seq_update(cp, &cp->out_seq, | ||
365 | IP_VS_CONN_F_OUT_SEQ, seq, diff); | ||
366 | |||
367 | return 1; | ||
368 | } | ||
369 | |||
370 | /* | ||
371 | * Output pkt hook. Will call bound ip_vs_app specific function | ||
372 | * called by ipvs packet handler, assumes previously checked cp!=NULL | ||
373 | * returns false if it can't handle packet (oom) | ||
374 | */ | ||
375 | int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) | ||
376 | { | ||
377 | struct ip_vs_app *app; | ||
378 | |||
379 | /* | ||
380 | * check if application module is bound to | ||
381 | * this ip_vs_conn. | ||
382 | */ | ||
383 | if ((app = cp->app) == NULL) | ||
384 | return 1; | ||
385 | |||
386 | /* TCP is complicated */ | ||
387 | if (cp->protocol == IPPROTO_TCP) | ||
388 | return app_tcp_pkt_out(cp, skb, app); | ||
389 | |||
390 | /* | ||
391 | * Call private output hook function | ||
392 | */ | ||
393 | if (app->pkt_out == NULL) | ||
394 | return 1; | ||
395 | |||
396 | return app->pkt_out(app, cp, skb, NULL); | ||
397 | } | ||
398 | |||
399 | |||
400 | static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, | ||
401 | struct ip_vs_app *app) | ||
402 | { | ||
403 | int diff; | ||
404 | const unsigned int tcp_offset = ip_hdrlen(skb); | ||
405 | struct tcphdr *th; | ||
406 | __u32 seq; | ||
407 | |||
408 | if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) | ||
409 | return 0; | ||
410 | |||
411 | th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); | ||
412 | |||
413 | /* | ||
414 | * Remember seq number in case this pkt gets resized | ||
415 | */ | ||
416 | seq = ntohl(th->seq); | ||
417 | |||
418 | /* | ||
419 | * Fix seq stuff if flagged as so. | ||
420 | */ | ||
421 | if (cp->flags & IP_VS_CONN_F_IN_SEQ) | ||
422 | vs_fix_seq(&cp->in_seq, th); | ||
423 | if (cp->flags & IP_VS_CONN_F_OUT_SEQ) | ||
424 | vs_fix_ack_seq(&cp->out_seq, th); | ||
425 | |||
426 | /* | ||
427 | * Call private input hook function | ||
428 | */ | ||
429 | if (app->pkt_in == NULL) | ||
430 | return 1; | ||
431 | |||
432 | if (!app->pkt_in(app, cp, skb, &diff)) | ||
433 | return 0; | ||
434 | |||
435 | /* | ||
436 | * Update ip_vs seq stuff if len has changed. | ||
437 | */ | ||
438 | if (diff != 0) | ||
439 | vs_seq_update(cp, &cp->in_seq, | ||
440 | IP_VS_CONN_F_IN_SEQ, seq, diff); | ||
441 | |||
442 | return 1; | ||
443 | } | ||
444 | |||
445 | /* | ||
446 | * Input pkt hook. Will call bound ip_vs_app specific function | ||
447 | * called by ipvs packet handler, assumes previously checked cp!=NULL. | ||
448 | * returns false if can't handle packet (oom). | ||
449 | */ | ||
450 | int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) | ||
451 | { | ||
452 | struct ip_vs_app *app; | ||
453 | |||
454 | /* | ||
455 | * check if application module is bound to | ||
456 | * this ip_vs_conn. | ||
457 | */ | ||
458 | if ((app = cp->app) == NULL) | ||
459 | return 1; | ||
460 | |||
461 | /* TCP is complicated */ | ||
462 | if (cp->protocol == IPPROTO_TCP) | ||
463 | return app_tcp_pkt_in(cp, skb, app); | ||
464 | |||
465 | /* | ||
466 | * Call private input hook function | ||
467 | */ | ||
468 | if (app->pkt_in == NULL) | ||
469 | return 1; | ||
470 | |||
471 | return app->pkt_in(app, cp, skb, NULL); | ||
472 | } | ||
473 | |||
474 | |||
475 | #ifdef CONFIG_PROC_FS | ||
476 | /* | ||
477 | * /proc/net/ip_vs_app entry function | ||
478 | */ | ||
479 | |||
480 | static struct ip_vs_app *ip_vs_app_idx(loff_t pos) | ||
481 | { | ||
482 | struct ip_vs_app *app, *inc; | ||
483 | |||
484 | list_for_each_entry(app, &ip_vs_app_list, a_list) { | ||
485 | list_for_each_entry(inc, &app->incs_list, a_list) { | ||
486 | if (pos-- == 0) | ||
487 | return inc; | ||
488 | } | ||
489 | } | ||
490 | return NULL; | ||
491 | |||
492 | } | ||
493 | |||
494 | static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) | ||
495 | { | ||
496 | mutex_lock(&__ip_vs_app_mutex); | ||
497 | |||
498 | return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; | ||
499 | } | ||
500 | |||
501 | static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
502 | { | ||
503 | struct ip_vs_app *inc, *app; | ||
504 | struct list_head *e; | ||
505 | |||
506 | ++*pos; | ||
507 | if (v == SEQ_START_TOKEN) | ||
508 | return ip_vs_app_idx(0); | ||
509 | |||
510 | inc = v; | ||
511 | app = inc->app; | ||
512 | |||
513 | if ((e = inc->a_list.next) != &app->incs_list) | ||
514 | return list_entry(e, struct ip_vs_app, a_list); | ||
515 | |||
516 | /* go on to next application */ | ||
517 | for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { | ||
518 | app = list_entry(e, struct ip_vs_app, a_list); | ||
519 | list_for_each_entry(inc, &app->incs_list, a_list) { | ||
520 | return inc; | ||
521 | } | ||
522 | } | ||
523 | return NULL; | ||
524 | } | ||
525 | |||
526 | static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) | ||
527 | { | ||
528 | mutex_unlock(&__ip_vs_app_mutex); | ||
529 | } | ||
530 | |||
531 | static int ip_vs_app_seq_show(struct seq_file *seq, void *v) | ||
532 | { | ||
533 | if (v == SEQ_START_TOKEN) | ||
534 | seq_puts(seq, "prot port usecnt name\n"); | ||
535 | else { | ||
536 | const struct ip_vs_app *inc = v; | ||
537 | |||
538 | seq_printf(seq, "%-3s %-7u %-6d %-17s\n", | ||
539 | ip_vs_proto_name(inc->protocol), | ||
540 | ntohs(inc->port), | ||
541 | atomic_read(&inc->usecnt), | ||
542 | inc->name); | ||
543 | } | ||
544 | return 0; | ||
545 | } | ||
546 | |||
547 | static const struct seq_operations ip_vs_app_seq_ops = { | ||
548 | .start = ip_vs_app_seq_start, | ||
549 | .next = ip_vs_app_seq_next, | ||
550 | .stop = ip_vs_app_seq_stop, | ||
551 | .show = ip_vs_app_seq_show, | ||
552 | }; | ||
553 | |||
554 | static int ip_vs_app_open(struct inode *inode, struct file *file) | ||
555 | { | ||
556 | return seq_open(file, &ip_vs_app_seq_ops); | ||
557 | } | ||
558 | |||
559 | static const struct file_operations ip_vs_app_fops = { | ||
560 | .owner = THIS_MODULE, | ||
561 | .open = ip_vs_app_open, | ||
562 | .read = seq_read, | ||
563 | .llseek = seq_lseek, | ||
564 | .release = seq_release, | ||
565 | }; | ||
566 | #endif | ||
567 | |||
568 | |||
569 | /* | ||
570 | * Replace a segment of data with a new segment | ||
571 | */ | ||
572 | int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, | ||
573 | char *o_buf, int o_len, char *n_buf, int n_len) | ||
574 | { | ||
575 | int diff; | ||
576 | int o_offset; | ||
577 | int o_left; | ||
578 | |||
579 | EnterFunction(9); | ||
580 | |||
581 | diff = n_len - o_len; | ||
582 | o_offset = o_buf - (char *)skb->data; | ||
583 | /* The length of the data remaining after o_buf+o_len in the skb */ | ||
584 | o_left = skb->len - (o_offset + o_len); | ||
585 | |||
586 | if (diff <= 0) { | ||
587 | memmove(o_buf + n_len, o_buf + o_len, o_left); | ||
588 | memcpy(o_buf, n_buf, n_len); | ||
589 | skb_trim(skb, skb->len + diff); | ||
590 | } else if (diff <= skb_tailroom(skb)) { | ||
591 | skb_put(skb, diff); | ||
592 | memmove(o_buf + n_len, o_buf + o_len, o_left); | ||
593 | memcpy(o_buf, n_buf, n_len); | ||
594 | } else { | ||
595 | if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) | ||
596 | return -ENOMEM; | ||
597 | skb_put(skb, diff); | ||
598 | memmove(skb->data + o_offset + n_len, | ||
599 | skb->data + o_offset + o_len, o_left); | ||
600 | skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); | ||
601 | } | ||
602 | |||
603 | /* must update the iph total length here */ | ||
604 | ip_hdr(skb)->tot_len = htons(skb->len); | ||
605 | |||
606 | LeaveFunction(9); | ||
607 | return 0; | ||
608 | } | ||
609 | |||
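
A userspace analogue of the three cases above, assuming a plain growable buffer instead of an skb: shrink in place, grow into spare room (the tailroom case), or reallocate when the growth does not fit:

    #include <stdlib.h>
    #include <string.h>

    struct buf { char *data; size_t len, cap; };

    int buf_replace(struct buf *b, size_t off, size_t o_len,
                    const char *n, size_t n_len)
    {
        size_t tail = b->len - (off + o_len);   /* data after old segment */

        if (n_len > o_len && b->len + (n_len - o_len) > b->cap) {
            /* case 3: growth doesn't fit, reallocate first */
            char *p = realloc(b->data, b->len + (n_len - o_len));
            if (!p)
                return -1;              /* like the -ENOMEM path above */
            b->data = p;
            b->cap = b->len + (n_len - o_len);
        }
        /* cases 1 and 2 share the move+copy; only the length differs */
        memmove(b->data + off + n_len, b->data + off + o_len, tail);
        memcpy(b->data + off, n, n_len);
        b->len = b->len - o_len + n_len;        /* shrinks when n_len < o_len */
        return 0;
    }
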
610 | |||
611 | int __init ip_vs_app_init(void) | ||
612 | { | ||
613 | /* we will replace it with proc_net_ipvs_create() soon */ | ||
614 | proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); | ||
615 | return 0; | ||
616 | } | ||
617 | |||
618 | |||
619 | void ip_vs_app_cleanup(void) | ||
620 | { | ||
621 | proc_net_remove(&init_net, "ip_vs_app"); | ||
622 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c deleted file mode 100644 index 44a6872dc245..000000000000 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ /dev/null | |||
@@ -1,1023 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the Netfilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
9 | * Peter Kese <peter.kese@ijs.si> | ||
10 | * Julian Anastasov <ja@ssi.bg> | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License | ||
14 | * as published by the Free Software Foundation; either version | ||
15 | * 2 of the License, or (at your option) any later version. | ||
16 | * | ||
17 | * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, | ||
18 | * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms | ||
19 | * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2. | ||
20 | * | ||
21 | * Changes: | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #include <linux/interrupt.h> | ||
26 | #include <linux/in.h> | ||
27 | #include <linux/net.h> | ||
28 | #include <linux/kernel.h> | ||
29 | #include <linux/module.h> | ||
30 | #include <linux/vmalloc.h> | ||
31 | #include <linux/proc_fs.h> /* for proc_net_* */ | ||
32 | #include <linux/seq_file.h> | ||
33 | #include <linux/jhash.h> | ||
34 | #include <linux/random.h> | ||
35 | |||
36 | #include <net/net_namespace.h> | ||
37 | #include <net/ip_vs.h> | ||
38 | |||
39 | |||
40 | /* | ||
41 | * Connection hash table: for input and output packets lookups of IPVS | ||
42 | */ | ||
43 | static struct list_head *ip_vs_conn_tab; | ||
44 | |||
45 | /* SLAB cache for IPVS connections */ | ||
46 | static struct kmem_cache *ip_vs_conn_cachep __read_mostly; | ||
47 | |||
48 | /* counter for current IPVS connections */ | ||
49 | static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); | ||
50 | |||
51 | /* counter for no client port connections */ | ||
52 | static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); | ||
53 | |||
54 | /* random value for IPVS connection hash */ | ||
55 | static unsigned int ip_vs_conn_rnd; | ||
56 | |||
57 | /* | ||
58 | * Fine locking granularity for big connection hash table | ||
59 | */ | ||
60 | #define CT_LOCKARRAY_BITS 4 | ||
61 | #define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) | ||
62 | #define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) | ||
63 | |||
64 | struct ip_vs_aligned_lock | ||
65 | { | ||
66 | rwlock_t l; | ||
67 | } __attribute__((__aligned__(SMP_CACHE_BYTES))); | ||
68 | |||
69 | /* lock array for conn table */ | ||
70 | static struct ip_vs_aligned_lock | ||
71 | __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; | ||
72 | |||
73 | static inline void ct_read_lock(unsigned key) | ||
74 | { | ||
75 | read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
76 | } | ||
77 | |||
78 | static inline void ct_read_unlock(unsigned key) | ||
79 | { | ||
80 | read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
81 | } | ||
82 | |||
83 | static inline void ct_write_lock(unsigned key) | ||
84 | { | ||
85 | write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
86 | } | ||
87 | |||
88 | static inline void ct_write_unlock(unsigned key) | ||
89 | { | ||
90 | write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
91 | } | ||
92 | |||
93 | static inline void ct_read_lock_bh(unsigned key) | ||
94 | { | ||
95 | read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
96 | } | ||
97 | |||
98 | static inline void ct_read_unlock_bh(unsigned key) | ||
99 | { | ||
100 | read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
101 | } | ||
102 | |||
103 | static inline void ct_write_lock_bh(unsigned key) | ||
104 | { | ||
105 | write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
106 | } | ||
107 | |||
108 | static inline void ct_write_unlock_bh(unsigned key) | ||
109 | { | ||
110 | write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); | ||
111 | } | ||
112 | |||
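
A small illustration of the striping above: with CT_LOCKARRAY_BITS = 4, every bucket whose index has the same low four bits shares one cache-aligned rwlock, so even a large table needs only 16 locks:

    #include <assert.h>

    int main(void)
    {
        unsigned mask = (1u << 4) - 1;      /* CT_LOCKARRAY_MASK */

        /* buckets 0x005, 0x015 and 0x125 all map to lock 5 */
        assert((0x005u & mask) == 5);
        assert((0x015u & mask) == 5);
        assert((0x125u & mask) == 5);
        return 0;
    }
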
113 | |||
114 | /* | ||
115 | * Returns hash value for IPVS connection entry | ||
116 | */ | ||
117 | static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port) | ||
118 | { | ||
119 | return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd) | ||
120 | & IP_VS_CONN_TAB_MASK; | ||
121 | } | ||
122 | |||
123 | |||
124 | /* | ||
125 | * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. | ||
126 | * returns bool success. | ||
127 | */ | ||
128 | static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) | ||
129 | { | ||
130 | unsigned hash; | ||
131 | int ret; | ||
132 | |||
133 | /* Hash by protocol, client address and port */ | ||
134 | hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); | ||
135 | |||
136 | ct_write_lock(hash); | ||
137 | |||
138 | if (!(cp->flags & IP_VS_CONN_F_HASHED)) { | ||
139 | list_add(&cp->c_list, &ip_vs_conn_tab[hash]); | ||
140 | cp->flags |= IP_VS_CONN_F_HASHED; | ||
141 | atomic_inc(&cp->refcnt); | ||
142 | ret = 1; | ||
143 | } else { | ||
144 | IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " | ||
145 | "called from %p\n", __builtin_return_address(0)); | ||
146 | ret = 0; | ||
147 | } | ||
148 | |||
149 | ct_write_unlock(hash); | ||
150 | |||
151 | return ret; | ||
152 | } | ||
153 | |||
154 | |||
155 | /* | ||
156 | * UNhashes ip_vs_conn from ip_vs_conn_tab. | ||
157 | * returns bool success. | ||
158 | */ | ||
159 | static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) | ||
160 | { | ||
161 | unsigned hash; | ||
162 | int ret; | ||
163 | |||
164 | /* unhash it and decrease its reference counter */ | ||
165 | hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); | ||
166 | |||
167 | ct_write_lock(hash); | ||
168 | |||
169 | if (cp->flags & IP_VS_CONN_F_HASHED) { | ||
170 | list_del(&cp->c_list); | ||
171 | cp->flags &= ~IP_VS_CONN_F_HASHED; | ||
172 | atomic_dec(&cp->refcnt); | ||
173 | ret = 1; | ||
174 | } else | ||
175 | ret = 0; | ||
176 | |||
177 | ct_write_unlock(hash); | ||
178 | |||
179 | return ret; | ||
180 | } | ||
181 | |||
182 | |||
183 | /* | ||
184 | * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. | ||
185 | * Called for pkts coming from OUTside-to-INside. | ||
186 | * s_addr, s_port: pkt source address (foreign host) | ||
187 | * d_addr, d_port: pkt dest address (load balancer) | ||
188 | */ | ||
189 | static inline struct ip_vs_conn *__ip_vs_conn_in_get | ||
190 | (int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) | ||
191 | { | ||
192 | unsigned hash; | ||
193 | struct ip_vs_conn *cp; | ||
194 | |||
195 | hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); | ||
196 | |||
197 | ct_read_lock(hash); | ||
198 | |||
199 | list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { | ||
200 | if (s_addr==cp->caddr && s_port==cp->cport && | ||
201 | d_port==cp->vport && d_addr==cp->vaddr && | ||
202 | ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && | ||
203 | protocol==cp->protocol) { | ||
204 | /* HIT */ | ||
205 | atomic_inc(&cp->refcnt); | ||
206 | ct_read_unlock(hash); | ||
207 | return cp; | ||
208 | } | ||
209 | } | ||
210 | |||
211 | ct_read_unlock(hash); | ||
212 | |||
213 | return NULL; | ||
214 | } | ||
215 | |||
216 | struct ip_vs_conn *ip_vs_conn_in_get | ||
217 | (int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) | ||
218 | { | ||
219 | struct ip_vs_conn *cp; | ||
220 | |||
221 | cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); | ||
222 | if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) | ||
223 | cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); | ||
224 | |||
225 | IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", | ||
226 | ip_vs_proto_name(protocol), | ||
227 | NIPQUAD(s_addr), ntohs(s_port), | ||
228 | NIPQUAD(d_addr), ntohs(d_port), | ||
229 | cp?"hit":"not hit"); | ||
230 | |||
231 | return cp; | ||
232 | } | ||
233 | |||
234 | /* Get reference to connection template */ | ||
235 | struct ip_vs_conn *ip_vs_ct_in_get | ||
236 | (int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) | ||
237 | { | ||
238 | unsigned hash; | ||
239 | struct ip_vs_conn *cp; | ||
240 | |||
241 | hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); | ||
242 | |||
243 | ct_read_lock(hash); | ||
244 | |||
245 | list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { | ||
246 | if (s_addr==cp->caddr && s_port==cp->cport && | ||
247 | d_port==cp->vport && d_addr==cp->vaddr && | ||
248 | cp->flags & IP_VS_CONN_F_TEMPLATE && | ||
249 | protocol==cp->protocol) { | ||
250 | /* HIT */ | ||
251 | atomic_inc(&cp->refcnt); | ||
252 | goto out; | ||
253 | } | ||
254 | } | ||
255 | cp = NULL; | ||
256 | |||
257 | out: | ||
258 | ct_read_unlock(hash); | ||
259 | |||
260 | IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", | ||
261 | ip_vs_proto_name(protocol), | ||
262 | NIPQUAD(s_addr), ntohs(s_port), | ||
263 | NIPQUAD(d_addr), ntohs(d_port), | ||
264 | cp?"hit":"not hit"); | ||
265 | |||
266 | return cp; | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. | ||
271 | * Called for pkts coming from inside-to-OUTside. | ||
272 | * s_addr, s_port: pkt source address (inside host) | ||
273 | * d_addr, d_port: pkt dest address (foreign host) | ||
274 | */ | ||
275 | struct ip_vs_conn *ip_vs_conn_out_get | ||
276 | (int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) | ||
277 | { | ||
278 | unsigned hash; | ||
279 | struct ip_vs_conn *cp, *ret=NULL; | ||
280 | |||
281 | /* | ||
282 | * Check for "full" addressed entries | ||
283 | */ | ||
284 | hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); | ||
285 | |||
286 | ct_read_lock(hash); | ||
287 | |||
288 | list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { | ||
289 | if (d_addr == cp->caddr && d_port == cp->cport && | ||
290 | s_port == cp->dport && s_addr == cp->daddr && | ||
291 | protocol == cp->protocol) { | ||
292 | /* HIT */ | ||
293 | atomic_inc(&cp->refcnt); | ||
294 | ret = cp; | ||
295 | break; | ||
296 | } | ||
297 | } | ||
298 | |||
299 | ct_read_unlock(hash); | ||
300 | |||
301 | IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", | ||
302 | ip_vs_proto_name(protocol), | ||
303 | NIPQUAD(s_addr), ntohs(s_port), | ||
304 | NIPQUAD(d_addr), ntohs(d_port), | ||
305 | ret?"hit":"not hit"); | ||
306 | |||
307 | return ret; | ||
308 | } | ||
309 | |||
310 | |||
311 | /* | ||
312 | * Put back the conn and restart its timer with its timeout | ||
313 | */ | ||
314 | void ip_vs_conn_put(struct ip_vs_conn *cp) | ||
315 | { | ||
316 | /* reset it to expire after its timeout */ | ||
317 | mod_timer(&cp->timer, jiffies+cp->timeout); | ||
318 | |||
319 | __ip_vs_conn_put(cp); | ||
320 | } | ||
321 | |||
322 | |||
323 | /* | ||
324 | * Fill a no_client_port connection with a client port number | ||
325 | */ | ||
326 | void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) | ||
327 | { | ||
328 | if (ip_vs_conn_unhash(cp)) { | ||
329 | spin_lock(&cp->lock); | ||
330 | if (cp->flags & IP_VS_CONN_F_NO_CPORT) { | ||
331 | atomic_dec(&ip_vs_conn_no_cport_cnt); | ||
332 | cp->flags &= ~IP_VS_CONN_F_NO_CPORT; | ||
333 | cp->cport = cport; | ||
334 | } | ||
335 | spin_unlock(&cp->lock); | ||
336 | |||
337 | /* hash on new dport */ | ||
338 | ip_vs_conn_hash(cp); | ||
339 | } | ||
340 | } | ||
341 | |||
342 | |||
343 | /* | ||
344 | * Bind a connection entry with the corresponding packet_xmit. | ||
345 | * Called by ip_vs_conn_new. | ||
346 | */ | ||
347 | static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) | ||
348 | { | ||
349 | switch (IP_VS_FWD_METHOD(cp)) { | ||
350 | case IP_VS_CONN_F_MASQ: | ||
351 | cp->packet_xmit = ip_vs_nat_xmit; | ||
352 | break; | ||
353 | |||
354 | case IP_VS_CONN_F_TUNNEL: | ||
355 | cp->packet_xmit = ip_vs_tunnel_xmit; | ||
356 | break; | ||
357 | |||
358 | case IP_VS_CONN_F_DROUTE: | ||
359 | cp->packet_xmit = ip_vs_dr_xmit; | ||
360 | break; | ||
361 | |||
362 | case IP_VS_CONN_F_LOCALNODE: | ||
363 | cp->packet_xmit = ip_vs_null_xmit; | ||
364 | break; | ||
365 | |||
366 | case IP_VS_CONN_F_BYPASS: | ||
367 | cp->packet_xmit = ip_vs_bypass_xmit; | ||
368 | break; | ||
369 | } | ||
370 | } | ||
371 | |||
372 | |||
373 | static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) | ||
374 | { | ||
375 | return atomic_read(&dest->activeconns) | ||
376 | + atomic_read(&dest->inactconns); | ||
377 | } | ||
378 | |||
379 | /* | ||
380 | * Bind a connection entry with a virtual service destination | ||
381 | * Called just after a new connection entry is created. | ||
382 | */ | ||
383 | static inline void | ||
384 | ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) | ||
385 | { | ||
386 | /* if dest is NULL, then return directly */ | ||
387 | if (!dest) | ||
388 | return; | ||
389 | |||
390 | /* Increase the refcnt counter of the dest */ | ||
391 | atomic_inc(&dest->refcnt); | ||
392 | |||
393 | /* Bind with the destination and its corresponding transmitter */ | ||
394 | if ((cp->flags & IP_VS_CONN_F_SYNC) && | ||
395 | (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) | ||
396 | /* if the connection is not a template and is created | ||
397 | * by sync, preserve the activity flag. | ||
398 | */ | ||
399 | cp->flags |= atomic_read(&dest->conn_flags) & | ||
400 | (~IP_VS_CONN_F_INACTIVE); | ||
401 | else | ||
402 | cp->flags |= atomic_read(&dest->conn_flags); | ||
403 | cp->dest = dest; | ||
404 | |||
405 | IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " | ||
406 | "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " | ||
407 | "dest->refcnt:%d\n", | ||
408 | ip_vs_proto_name(cp->protocol), | ||
409 | NIPQUAD(cp->caddr), ntohs(cp->cport), | ||
410 | NIPQUAD(cp->vaddr), ntohs(cp->vport), | ||
411 | NIPQUAD(cp->daddr), ntohs(cp->dport), | ||
412 | ip_vs_fwd_tag(cp), cp->state, | ||
413 | cp->flags, atomic_read(&cp->refcnt), | ||
414 | atomic_read(&dest->refcnt)); | ||
415 | |||
416 | /* Update the connection counters */ | ||
417 | if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { | ||
418 | /* It is a normal connection, so increase the inactive | ||
419 | connection counter because it is in TCP SYNRECV | ||
420 | state (inactive) or another protocol's inactive state */ | ||
421 | if ((cp->flags & IP_VS_CONN_F_SYNC) && | ||
422 | (!(cp->flags & IP_VS_CONN_F_INACTIVE))) | ||
423 | atomic_inc(&dest->activeconns); | ||
424 | else | ||
425 | atomic_inc(&dest->inactconns); | ||
426 | } else { | ||
427 | /* It is a persistent connection/template, so increase | ||
428 | the persistent connection counter */ | ||
429 | atomic_inc(&dest->persistconns); | ||
430 | } | ||
431 | |||
432 | if (dest->u_threshold != 0 && | ||
433 | ip_vs_dest_totalconns(dest) >= dest->u_threshold) | ||
434 | dest->flags |= IP_VS_DEST_F_OVERLOAD; | ||
435 | } | ||
436 | |||
437 | |||
438 | /* | ||
439 | * Check if there is a destination for the connection, if so | ||
440 | * bind the connection to the destination. | ||
441 | */ | ||
442 | struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) | ||
443 | { | ||
444 | struct ip_vs_dest *dest; | ||
445 | |||
446 | if ((cp) && (!cp->dest)) { | ||
447 | dest = ip_vs_find_dest(cp->daddr, cp->dport, | ||
448 | cp->vaddr, cp->vport, cp->protocol); | ||
449 | ip_vs_bind_dest(cp, dest); | ||
450 | return dest; | ||
451 | } else | ||
452 | return NULL; | ||
453 | } | ||
454 | |||
455 | |||
456 | /* | ||
457 | * Unbind a connection entry with its VS destination | ||
458 | * Called by the ip_vs_conn_expire function. | ||
459 | */ | ||
460 | static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) | ||
461 | { | ||
462 | struct ip_vs_dest *dest = cp->dest; | ||
463 | |||
464 | if (!dest) | ||
465 | return; | ||
466 | |||
467 | IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " | ||
468 | "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " | ||
469 | "dest->refcnt:%d\n", | ||
470 | ip_vs_proto_name(cp->protocol), | ||
471 | NIPQUAD(cp->caddr), ntohs(cp->cport), | ||
472 | NIPQUAD(cp->vaddr), ntohs(cp->vport), | ||
473 | NIPQUAD(cp->daddr), ntohs(cp->dport), | ||
474 | ip_vs_fwd_tag(cp), cp->state, | ||
475 | cp->flags, atomic_read(&cp->refcnt), | ||
476 | atomic_read(&dest->refcnt)); | ||
477 | |||
478 | /* Update the connection counters */ | ||
479 | if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { | ||
480 | /* It is a normal connection, so decrease the inactconns | ||
481 | or activeconns counter */ | ||
482 | if (cp->flags & IP_VS_CONN_F_INACTIVE) { | ||
483 | atomic_dec(&dest->inactconns); | ||
484 | } else { | ||
485 | atomic_dec(&dest->activeconns); | ||
486 | } | ||
487 | } else { | ||
488 | /* It is a persistent connection/template, so decrease | ||
489 | the persistent connection counter */ | ||
490 | atomic_dec(&dest->persistconns); | ||
491 | } | ||
492 | |||
493 | if (dest->l_threshold != 0) { | ||
494 | if (ip_vs_dest_totalconns(dest) < dest->l_threshold) | ||
495 | dest->flags &= ~IP_VS_DEST_F_OVERLOAD; | ||
496 | } else if (dest->u_threshold != 0) { | ||
497 | if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) | ||
498 | dest->flags &= ~IP_VS_DEST_F_OVERLOAD; | ||
499 | } else { | ||
500 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
501 | dest->flags &= ~IP_VS_DEST_F_OVERLOAD; | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * Simply decrease the refcnt of the dest, because the | ||
506 | * dest will be either in service's destination list | ||
507 | * or in the trash. | ||
508 | */ | ||
509 | atomic_dec(&dest->refcnt); | ||
510 | } | ||
511 | |||
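
A small numeric illustration of the hysteresis above, assuming u_threshold = 100 and no l_threshold: the overload flag set when the count reaches the upper threshold clears only once the total drops below three quarters of it:

    #include <assert.h>

    int main(void)
    {
        int u_threshold = 100;

        assert(!(80 * 4 < u_threshold * 3)); /* 80 conns: still overloaded */
        assert(74 * 4 < u_threshold * 3);    /* 74 conns: flag clears */
        return 0;
    }
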
512 | |||
513 | /* | ||
514 | * Check whether the destination of a connection template is available. | ||
515 | * If available, return 1, otherwise invalidate this connection | ||
516 | * template and return 0. | ||
517 | */ | ||
518 | int ip_vs_check_template(struct ip_vs_conn *ct) | ||
519 | { | ||
520 | struct ip_vs_dest *dest = ct->dest; | ||
521 | |||
522 | /* | ||
523 | * Checking the dest server status. | ||
524 | */ | ||
525 | if ((dest == NULL) || | ||
526 | !(dest->flags & IP_VS_DEST_F_AVAILABLE) || | ||
527 | (sysctl_ip_vs_expire_quiescent_template && | ||
528 | (atomic_read(&dest->weight) == 0))) { | ||
529 | IP_VS_DBG(9, "check_template: dest not available for " | ||
530 | "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " | ||
531 | "-> d:%u.%u.%u.%u:%d\n", | ||
532 | ip_vs_proto_name(ct->protocol), | ||
533 | NIPQUAD(ct->caddr), ntohs(ct->cport), | ||
534 | NIPQUAD(ct->vaddr), ntohs(ct->vport), | ||
535 | NIPQUAD(ct->daddr), ntohs(ct->dport)); | ||
536 | |||
537 | /* | ||
538 | * Invalidate the connection template | ||
539 | */ | ||
540 | if (ct->vport != htons(0xffff)) { | ||
541 | if (ip_vs_conn_unhash(ct)) { | ||
542 | ct->dport = htons(0xffff); | ||
543 | ct->vport = htons(0xffff); | ||
544 | ct->cport = 0; | ||
545 | ip_vs_conn_hash(ct); | ||
546 | } | ||
547 | } | ||
548 | |||
549 | /* | ||
550 | * Simply decrease the refcnt of the template, | ||
551 | * don't restart its timer. | ||
552 | */ | ||
553 | atomic_dec(&ct->refcnt); | ||
554 | return 0; | ||
555 | } | ||
556 | return 1; | ||
557 | } | ||
558 | |||
559 | static void ip_vs_conn_expire(unsigned long data) | ||
560 | { | ||
561 | struct ip_vs_conn *cp = (struct ip_vs_conn *)data; | ||
562 | |||
563 | cp->timeout = 60*HZ; | ||
564 | |||
565 | /* | ||
566 | * hey, I'm using it | ||
567 | */ | ||
568 | atomic_inc(&cp->refcnt); | ||
569 | |||
570 | /* | ||
571 | * do I control anybody? | ||
572 | */ | ||
573 | if (atomic_read(&cp->n_control)) | ||
574 | goto expire_later; | ||
575 | |||
576 | /* | ||
577 | * unhash it if it is hashed in the conn table | ||
578 | */ | ||
579 | if (!ip_vs_conn_unhash(cp)) | ||
580 | goto expire_later; | ||
581 | |||
582 | /* | ||
583 | * refcnt==1 implies I'm the only referrer | ||
584 | */ | ||
585 | if (likely(atomic_read(&cp->refcnt) == 1)) { | ||
586 | /* delete the timer if it is activated by other users */ | ||
587 | if (timer_pending(&cp->timer)) | ||
588 | del_timer(&cp->timer); | ||
589 | |||
590 | /* does anybody control me? */ | ||
591 | if (cp->control) | ||
592 | ip_vs_control_del(cp); | ||
593 | |||
594 | if (unlikely(cp->app != NULL)) | ||
595 | ip_vs_unbind_app(cp); | ||
596 | ip_vs_unbind_dest(cp); | ||
597 | if (cp->flags & IP_VS_CONN_F_NO_CPORT) | ||
598 | atomic_dec(&ip_vs_conn_no_cport_cnt); | ||
599 | atomic_dec(&ip_vs_conn_count); | ||
600 | |||
601 | kmem_cache_free(ip_vs_conn_cachep, cp); | ||
602 | return; | ||
603 | } | ||
604 | |||
605 | /* hash it back to the table */ | ||
606 | ip_vs_conn_hash(cp); | ||
607 | |||
608 | expire_later: | ||
609 | IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", | ||
610 | atomic_read(&cp->refcnt)-1, | ||
611 | atomic_read(&cp->n_control)); | ||
612 | |||
613 | ip_vs_conn_put(cp); | ||
614 | } | ||
615 | |||
616 | |||
617 | void ip_vs_conn_expire_now(struct ip_vs_conn *cp) | ||
618 | { | ||
619 | if (del_timer(&cp->timer)) | ||
620 | mod_timer(&cp->timer, jiffies); | ||
621 | } | ||
622 | |||
623 | |||
624 | /* | ||
625 | * Create a new connection entry and hash it into the ip_vs_conn_tab | ||
626 | */ | ||
627 | struct ip_vs_conn * | ||
628 | ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport, | ||
629 | __be32 daddr, __be16 dport, unsigned flags, | ||
630 | struct ip_vs_dest *dest) | ||
631 | { | ||
632 | struct ip_vs_conn *cp; | ||
633 | struct ip_vs_protocol *pp = ip_vs_proto_get(proto); | ||
634 | |||
635 | cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); | ||
636 | if (cp == NULL) { | ||
637 | IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); | ||
638 | return NULL; | ||
639 | } | ||
640 | |||
641 | INIT_LIST_HEAD(&cp->c_list); | ||
642 | setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); | ||
643 | cp->protocol = proto; | ||
644 | cp->caddr = caddr; | ||
645 | cp->cport = cport; | ||
646 | cp->vaddr = vaddr; | ||
647 | cp->vport = vport; | ||
648 | cp->daddr = daddr; | ||
649 | cp->dport = dport; | ||
650 | cp->flags = flags; | ||
651 | spin_lock_init(&cp->lock); | ||
652 | |||
653 | /* | ||
654 | * Mark the entry as referenced by the current thread before hashing | ||
655 | * it into the table, so that another thread running | ||
656 | * ip_vs_random_dropentry cannot drop this entry. | ||
657 | */ | ||
658 | atomic_set(&cp->refcnt, 1); | ||
659 | |||
660 | atomic_set(&cp->n_control, 0); | ||
661 | atomic_set(&cp->in_pkts, 0); | ||
662 | |||
663 | atomic_inc(&ip_vs_conn_count); | ||
664 | if (flags & IP_VS_CONN_F_NO_CPORT) | ||
665 | atomic_inc(&ip_vs_conn_no_cport_cnt); | ||
666 | |||
667 | /* Bind the connection with a destination server */ | ||
668 | ip_vs_bind_dest(cp, dest); | ||
669 | |||
670 | /* Set its state and timeout */ | ||
671 | cp->state = 0; | ||
672 | cp->timeout = 3*HZ; | ||
673 | |||
674 | /* Bind its packet transmitter */ | ||
675 | ip_vs_bind_xmit(cp); | ||
676 | |||
677 | if (unlikely(pp && atomic_read(&pp->appcnt))) | ||
678 | ip_vs_bind_app(cp, pp); | ||
679 | |||
680 | /* Hash it in the ip_vs_conn_tab finally */ | ||
681 | ip_vs_conn_hash(cp); | ||
682 | |||
683 | return cp; | ||
684 | } | ||
685 | |||
686 | |||
687 | /* | ||
688 | * /proc/net/ip_vs_conn entries | ||
689 | */ | ||
690 | #ifdef CONFIG_PROC_FS | ||
691 | |||
692 | static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) | ||
693 | { | ||
694 | int idx; | ||
695 | struct ip_vs_conn *cp; | ||
696 | |||
697 | for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { | ||
698 | ct_read_lock_bh(idx); | ||
699 | list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { | ||
700 | if (pos-- == 0) { | ||
701 | seq->private = &ip_vs_conn_tab[idx]; | ||
702 | return cp; | ||
703 | } | ||
704 | } | ||
705 | ct_read_unlock_bh(idx); | ||
706 | } | ||
707 | |||
708 | return NULL; | ||
709 | } | ||
710 | |||
711 | static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) | ||
712 | { | ||
713 | seq->private = NULL; | ||
714 | return *pos ? ip_vs_conn_array(seq, *pos - 1) : SEQ_START_TOKEN; | ||
715 | } | ||
716 | |||
717 | static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
718 | { | ||
719 | struct ip_vs_conn *cp = v; | ||
720 | struct list_head *e, *l = seq->private; | ||
721 | int idx; | ||
722 | |||
723 | ++*pos; | ||
724 | if (v == SEQ_START_TOKEN) | ||
725 | return ip_vs_conn_array(seq, 0); | ||
726 | |||
727 | /* more on same hash chain? */ | ||
728 | if ((e = cp->c_list.next) != l) | ||
729 | return list_entry(e, struct ip_vs_conn, c_list); | ||
730 | |||
731 | idx = l - ip_vs_conn_tab; | ||
732 | ct_read_unlock_bh(idx); | ||
733 | |||
734 | while (++idx < IP_VS_CONN_TAB_SIZE) { | ||
735 | ct_read_lock_bh(idx); | ||
736 | list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { | ||
737 | seq->private = &ip_vs_conn_tab[idx]; | ||
738 | return cp; | ||
739 | } | ||
740 | ct_read_unlock_bh(idx); | ||
741 | } | ||
742 | seq->private = NULL; | ||
743 | return NULL; | ||
744 | } | ||
745 | |||
746 | static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) | ||
747 | { | ||
748 | struct list_head *l = seq->private; | ||
749 | |||
750 | if (l) | ||
751 | ct_read_unlock_bh(l - ip_vs_conn_tab); | ||
752 | } | ||
753 | |||
754 | static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) | ||
755 | { | ||
756 | |||
757 | if (v == SEQ_START_TOKEN) | ||
758 | seq_puts(seq, | ||
759 | "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); | ||
760 | else { | ||
761 | const struct ip_vs_conn *cp = v; | ||
762 | |||
763 | seq_printf(seq, | ||
764 | "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", | ||
765 | ip_vs_proto_name(cp->protocol), | ||
766 | ntohl(cp->caddr), ntohs(cp->cport), | ||
767 | ntohl(cp->vaddr), ntohs(cp->vport), | ||
768 | ntohl(cp->daddr), ntohs(cp->dport), | ||
769 | ip_vs_state_name(cp->protocol, cp->state), | ||
770 | (cp->timer.expires-jiffies)/HZ); | ||
771 | } | ||
772 | return 0; | ||
773 | } | ||
774 | |||
775 | static const struct seq_operations ip_vs_conn_seq_ops = { | ||
776 | .start = ip_vs_conn_seq_start, | ||
777 | .next = ip_vs_conn_seq_next, | ||
778 | .stop = ip_vs_conn_seq_stop, | ||
779 | .show = ip_vs_conn_seq_show, | ||
780 | }; | ||
781 | |||
782 | static int ip_vs_conn_open(struct inode *inode, struct file *file) | ||
783 | { | ||
784 | return seq_open(file, &ip_vs_conn_seq_ops); | ||
785 | } | ||
786 | |||
787 | static const struct file_operations ip_vs_conn_fops = { | ||
788 | .owner = THIS_MODULE, | ||
789 | .open = ip_vs_conn_open, | ||
790 | .read = seq_read, | ||
791 | .llseek = seq_lseek, | ||
792 | .release = seq_release, | ||
793 | }; | ||
794 | |||
795 | static const char *ip_vs_origin_name(unsigned flags) | ||
796 | { | ||
797 | if (flags & IP_VS_CONN_F_SYNC) | ||
798 | return "SYNC"; | ||
799 | else | ||
800 | return "LOCAL"; | ||
801 | } | ||
802 | |||
803 | static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) | ||
804 | { | ||
805 | |||
806 | if (v == SEQ_START_TOKEN) | ||
807 | seq_puts(seq, | ||
808 | "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); | ||
809 | else { | ||
810 | const struct ip_vs_conn *cp = v; | ||
811 | |||
812 | seq_printf(seq, | ||
813 | "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n", | ||
814 | ip_vs_proto_name(cp->protocol), | ||
815 | ntohl(cp->caddr), ntohs(cp->cport), | ||
816 | ntohl(cp->vaddr), ntohs(cp->vport), | ||
817 | ntohl(cp->daddr), ntohs(cp->dport), | ||
818 | ip_vs_state_name(cp->protocol, cp->state), | ||
819 | ip_vs_origin_name(cp->flags), | ||
820 | (cp->timer.expires-jiffies)/HZ); | ||
821 | } | ||
822 | return 0; | ||
823 | } | ||
824 | |||
825 | static const struct seq_operations ip_vs_conn_sync_seq_ops = { | ||
826 | .start = ip_vs_conn_seq_start, | ||
827 | .next = ip_vs_conn_seq_next, | ||
828 | .stop = ip_vs_conn_seq_stop, | ||
829 | .show = ip_vs_conn_sync_seq_show, | ||
830 | }; | ||
831 | |||
832 | static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) | ||
833 | { | ||
834 | return seq_open(file, &ip_vs_conn_sync_seq_ops); | ||
835 | } | ||
836 | |||
837 | static const struct file_operations ip_vs_conn_sync_fops = { | ||
838 | .owner = THIS_MODULE, | ||
839 | .open = ip_vs_conn_sync_open, | ||
840 | .read = seq_read, | ||
841 | .llseek = seq_lseek, | ||
842 | .release = seq_release, | ||
843 | }; | ||
844 | |||
845 | #endif | ||
846 | |||
847 | |||
848 | /* | ||
849 | * Randomly drop connection entries before running out of memory | ||
850 | */ | ||
851 | static inline int todrop_entry(struct ip_vs_conn *cp) | ||
852 | { | ||
853 | /* | ||
854 | * The drop rate array needs tuning for real environments. | ||
855 | * Called from timer bh only => no locking | ||
856 | */ | ||
857 | static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; | ||
858 | static char todrop_counter[9] = {0}; | ||
859 | int i; | ||
860 | |||
861 | /* if the conn entry hasn't lasted for 60 seconds, don't drop it. | ||
862 | This will leave enough time for normal connections to get | ||
863 | through. */ | ||
864 | if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) | ||
865 | return 0; | ||
866 | |||
867 | /* Don't drop the entry if its number of incoming packets is not | ||
868 | in the range [0, 8] */ | ||
869 | i = atomic_read(&cp->in_pkts); | ||
870 | if (i > 8 || i < 0) return 0; | ||
871 | |||
872 | if (!todrop_rate[i]) return 0; | ||
873 | if (--todrop_counter[i] > 0) return 0; | ||
874 | |||
875 | todrop_counter[i] = todrop_rate[i]; | ||
876 | return 1; | ||
877 | } | ||
878 | |||
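
A worked illustration of the rate table above: an entry whose in_pkts count is i gets dropped once per todrop_rate[i] candidates, so barely-used connections are shed most aggressively, while entries outside [0, 8] (or with rate 0) are never dropped:

    #include <assert.h>

    static const char rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
    static char counter[9];

    static int should_drop(int pkts)
    {
        if (pkts < 0 || pkts > 8 || !rate[pkts])
            return 0;
        if (--counter[pkts] > 0)
            return 0;
        counter[pkts] = rate[pkts];
        return 1;
    }

    int main(void)
    {
        int drops = 0;

        for (int i = 0; i < 8; i++)
            drops += should_drop(4);    /* rate 4: one in four */
        assert(drops == 2);
        return 0;
    }
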
879 | /* Called from keventd and must protect itself from softirqs */ | ||
880 | void ip_vs_random_dropentry(void) | ||
881 | { | ||
882 | int idx; | ||
883 | struct ip_vs_conn *cp; | ||
884 | |||
885 | /* | ||
886 | * Randomly scan 1/32 of the whole table every second | ||
887 | */ | ||
888 | for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { | ||
889 | unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; | ||
890 | |||
891 | /* | ||
892 | * Lock is actually needed in this loop. | ||
893 | */ | ||
894 | ct_write_lock_bh(hash); | ||
895 | |||
896 | list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { | ||
897 | if (cp->flags & IP_VS_CONN_F_TEMPLATE) | ||
898 | /* connection template */ | ||
899 | continue; | ||
900 | |||
901 | if (cp->protocol == IPPROTO_TCP) { | ||
902 | switch(cp->state) { | ||
903 | case IP_VS_TCP_S_SYN_RECV: | ||
904 | case IP_VS_TCP_S_SYNACK: | ||
905 | break; | ||
906 | |||
907 | case IP_VS_TCP_S_ESTABLISHED: | ||
908 | if (todrop_entry(cp)) | ||
909 | break; | ||
910 | continue; | ||
911 | |||
912 | default: | ||
913 | continue; | ||
914 | } | ||
915 | } else { | ||
916 | if (!todrop_entry(cp)) | ||
917 | continue; | ||
918 | } | ||
919 | |||
920 | IP_VS_DBG(4, "del connection\n"); | ||
921 | ip_vs_conn_expire_now(cp); | ||
922 | if (cp->control) { | ||
923 | IP_VS_DBG(4, "del conn template\n"); | ||
924 | ip_vs_conn_expire_now(cp->control); | ||
925 | } | ||
926 | } | ||
927 | ct_write_unlock_bh(hash); | ||
928 | } | ||
929 | } | ||
930 | |||
931 | |||
932 | /* | ||
933 | * Flush all the connection entries in the ip_vs_conn_tab | ||
934 | */ | ||
935 | static void ip_vs_conn_flush(void) | ||
936 | { | ||
937 | int idx; | ||
938 | struct ip_vs_conn *cp; | ||
939 | |||
940 | flush_again: | ||
941 | for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { | ||
942 | /* | ||
943 | * Lock is actually needed in this loop. | ||
944 | */ | ||
945 | ct_write_lock_bh(idx); | ||
946 | |||
947 | list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { | ||
948 | |||
949 | IP_VS_DBG(4, "del connection\n"); | ||
950 | ip_vs_conn_expire_now(cp); | ||
951 | if (cp->control) { | ||
952 | IP_VS_DBG(4, "del conn template\n"); | ||
953 | ip_vs_conn_expire_now(cp->control); | ||
954 | } | ||
955 | } | ||
956 | ct_write_unlock_bh(idx); | ||
957 | } | ||
958 | |||
959 | /* the counter may not be zero, because some conn entries may | ||
960 | still be run by a slow timer handler or be unhashed but still referenced */ | ||
961 | if (atomic_read(&ip_vs_conn_count) != 0) { | ||
962 | schedule(); | ||
963 | goto flush_again; | ||
964 | } | ||
965 | } | ||
966 | |||
967 | |||
968 | int __init ip_vs_conn_init(void) | ||
969 | { | ||
970 | int idx; | ||
971 | |||
972 | /* | ||
973 | * Allocate the connection hash table and initialize its list heads | ||
974 | */ | ||
975 | ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); | ||
976 | if (!ip_vs_conn_tab) | ||
977 | return -ENOMEM; | ||
978 | |||
979 | /* Allocate ip_vs_conn slab cache */ | ||
980 | ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", | ||
981 | sizeof(struct ip_vs_conn), 0, | ||
982 | SLAB_HWCACHE_ALIGN, NULL); | ||
983 | if (!ip_vs_conn_cachep) { | ||
984 | vfree(ip_vs_conn_tab); | ||
985 | return -ENOMEM; | ||
986 | } | ||
987 | |||
988 | IP_VS_INFO("Connection hash table configured " | ||
989 | "(size=%d, memory=%ldKbytes)\n", | ||
990 | IP_VS_CONN_TAB_SIZE, | ||
991 | (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); | ||
992 | IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", | ||
993 | sizeof(struct ip_vs_conn)); | ||
994 | |||
995 | for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { | ||
996 | INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); | ||
997 | } | ||
998 | |||
999 | for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { | ||
1000 | rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); | ||
1001 | } | ||
1002 | |||
1003 | proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); | ||
1004 | proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); | ||
1005 | |||
1006 | /* calculate the random value for connection hash */ | ||
1007 | get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); | ||
1008 | |||
1009 | return 0; | ||
1010 | } | ||
1011 | |||
1012 | |||
1013 | void ip_vs_conn_cleanup(void) | ||
1014 | { | ||
1015 | /* flush all the connection entries first */ | ||
1016 | ip_vs_conn_flush(); | ||
1017 | |||
1018 | /* Release the empty cache */ | ||
1019 | kmem_cache_destroy(ip_vs_conn_cachep); | ||
1020 | proc_net_remove(&init_net, "ip_vs_conn"); | ||
1021 | proc_net_remove(&init_net, "ip_vs_conn_sync"); | ||
1022 | vfree(ip_vs_conn_tab); | ||
1023 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c deleted file mode 100644 index a7879eafc3b5..000000000000 --- a/net/ipv4/ipvs/ip_vs_core.c +++ /dev/null | |||
@@ -1,1125 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the Netfilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
9 | * Peter Kese <peter.kese@ijs.si> | ||
10 | * Julian Anastasov <ja@ssi.bg> | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License | ||
14 | * as published by the Free Software Foundation; either version | ||
15 | * 2 of the License, or (at your option) any later version. | ||
16 | * | ||
17 | * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, | ||
18 | * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms | ||
19 | * and others. | ||
20 | * | ||
21 | * Changes: | ||
22 | * Paul `Rusty' Russell properly handle non-linear skbs | ||
23 | * Harald Welte don't use nfcache | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #include <linux/module.h> | ||
28 | #include <linux/kernel.h> | ||
29 | #include <linux/ip.h> | ||
30 | #include <linux/tcp.h> | ||
31 | #include <linux/icmp.h> | ||
32 | |||
33 | #include <net/ip.h> | ||
34 | #include <net/tcp.h> | ||
35 | #include <net/udp.h> | ||
36 | #include <net/icmp.h> /* for icmp_send */ | ||
37 | #include <net/route.h> | ||
38 | |||
39 | #include <linux/netfilter.h> | ||
40 | #include <linux/netfilter_ipv4.h> | ||
41 | |||
42 | #include <net/ip_vs.h> | ||
43 | |||
44 | |||
45 | EXPORT_SYMBOL(register_ip_vs_scheduler); | ||
46 | EXPORT_SYMBOL(unregister_ip_vs_scheduler); | ||
47 | EXPORT_SYMBOL(ip_vs_skb_replace); | ||
48 | EXPORT_SYMBOL(ip_vs_proto_name); | ||
49 | EXPORT_SYMBOL(ip_vs_conn_new); | ||
50 | EXPORT_SYMBOL(ip_vs_conn_in_get); | ||
51 | EXPORT_SYMBOL(ip_vs_conn_out_get); | ||
52 | #ifdef CONFIG_IP_VS_PROTO_TCP | ||
53 | EXPORT_SYMBOL(ip_vs_tcp_conn_listen); | ||
54 | #endif | ||
55 | EXPORT_SYMBOL(ip_vs_conn_put); | ||
56 | #ifdef CONFIG_IP_VS_DEBUG | ||
57 | EXPORT_SYMBOL(ip_vs_get_debug_level); | ||
58 | #endif | ||
59 | |||
60 | |||
61 | /* ID used in ICMP lookups */ | ||
62 | #define icmp_id(icmph) (((icmph)->un).echo.id) | ||
63 | |||
64 | const char *ip_vs_proto_name(unsigned proto) | ||
65 | { | ||
66 | static char buf[20]; | ||
67 | |||
68 | switch (proto) { | ||
69 | case IPPROTO_IP: | ||
70 | return "IP"; | ||
71 | case IPPROTO_UDP: | ||
72 | return "UDP"; | ||
73 | case IPPROTO_TCP: | ||
74 | return "TCP"; | ||
75 | case IPPROTO_ICMP: | ||
76 | return "ICMP"; | ||
77 | default: | ||
78 | sprintf(buf, "IP_%d", proto); | ||
79 | return buf; | ||
80 | } | ||
81 | } | ||
82 | |||
83 | void ip_vs_init_hash_table(struct list_head *table, int rows) | ||
84 | { | ||
85 | while (--rows >= 0) | ||
86 | INIT_LIST_HEAD(&table[rows]); | ||
87 | } | ||
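
ip_vs_init_hash_table() simply turns an array into a set of empty circular list heads, one per bucket. A standalone userspace sketch of the same pattern (list_head and its init helper are local stand-ins here, not the kernel API):

        #include <stdio.h>

        struct list_head { struct list_head *next, *prev; };

        static void init_list_head(struct list_head *h)
        {
                h->next = h->prev = h;          /* an empty list points at itself */
        }

        static void init_hash_table(struct list_head *table, int rows)
        {
                while (--rows >= 0)
                        init_list_head(&table[rows]);
        }

        int main(void)
        {
                struct list_head tab[16];

                init_hash_table(tab, 16);
                printf("bucket 0 empty: %d\n", tab[0].next == &tab[0]);
                return 0;
        }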
88 | |||
89 | static inline void | ||
90 | ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | ||
91 | { | ||
92 | struct ip_vs_dest *dest = cp->dest; | ||
93 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | ||
94 | spin_lock(&dest->stats.lock); | ||
95 | dest->stats.inpkts++; | ||
96 | dest->stats.inbytes += skb->len; | ||
97 | spin_unlock(&dest->stats.lock); | ||
98 | |||
99 | spin_lock(&dest->svc->stats.lock); | ||
100 | dest->svc->stats.inpkts++; | ||
101 | dest->svc->stats.inbytes += skb->len; | ||
102 | spin_unlock(&dest->svc->stats.lock); | ||
103 | |||
104 | spin_lock(&ip_vs_stats.lock); | ||
105 | ip_vs_stats.inpkts++; | ||
106 | ip_vs_stats.inbytes += skb->len; | ||
107 | spin_unlock(&ip_vs_stats.lock); | ||
108 | } | ||
109 | } | ||
110 | |||
111 | |||
112 | static inline void | ||
113 | ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | ||
114 | { | ||
115 | struct ip_vs_dest *dest = cp->dest; | ||
116 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | ||
117 | spin_lock(&dest->stats.lock); | ||
118 | dest->stats.outpkts++; | ||
119 | dest->stats.outbytes += skb->len; | ||
120 | spin_unlock(&dest->stats.lock); | ||
121 | |||
122 | spin_lock(&dest->svc->stats.lock); | ||
123 | dest->svc->stats.outpkts++; | ||
124 | dest->svc->stats.outbytes += skb->len; | ||
125 | spin_unlock(&dest->svc->stats.lock); | ||
126 | |||
127 | spin_lock(&ip_vs_stats.lock); | ||
128 | ip_vs_stats.outpkts++; | ||
129 | ip_vs_stats.outbytes += skb->len; | ||
130 | spin_unlock(&ip_vs_stats.lock); | ||
131 | } | ||
132 | } | ||
133 | |||
134 | |||
135 | static inline void | ||
136 | ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) | ||
137 | { | ||
138 | spin_lock(&cp->dest->stats.lock); | ||
139 | cp->dest->stats.conns++; | ||
140 | spin_unlock(&cp->dest->stats.lock); | ||
141 | |||
142 | spin_lock(&svc->stats.lock); | ||
143 | svc->stats.conns++; | ||
144 | spin_unlock(&svc->stats.lock); | ||
145 | |||
146 | spin_lock(&ip_vs_stats.lock); | ||
147 | ip_vs_stats.conns++; | ||
148 | spin_unlock(&ip_vs_stats.lock); | ||
149 | } | ||
150 | |||
151 | |||
152 | static inline int | ||
153 | ip_vs_set_state(struct ip_vs_conn *cp, int direction, | ||
154 | const struct sk_buff *skb, | ||
155 | struct ip_vs_protocol *pp) | ||
156 | { | ||
157 | if (unlikely(!pp->state_transition)) | ||
158 | return 0; | ||
159 | return pp->state_transition(cp, direction, skb, pp); | ||
160 | } | ||
161 | |||
162 | |||
163 | /* | ||
164 | * IPVS persistent scheduling function | ||
165 | * It creates a connection entry according to its template if exists, | ||
166 | * or selects a server and creates a connection entry plus a template. | ||
167 | * Locking: we are svc user (svc->refcnt), so we hold all dests too | ||
168 | * Protocols supported: TCP, UDP | ||
169 | */ | ||
170 | static struct ip_vs_conn * | ||
171 | ip_vs_sched_persist(struct ip_vs_service *svc, | ||
172 | const struct sk_buff *skb, | ||
173 | __be16 ports[2]) | ||
174 | { | ||
175 | struct ip_vs_conn *cp = NULL; | ||
176 | struct iphdr *iph = ip_hdr(skb); | ||
177 | struct ip_vs_dest *dest; | ||
178 | struct ip_vs_conn *ct; | ||
179 | __be16 dport; /* destination port to forward */ | ||
180 | __be32 snet; /* source network of the client, after masking */ | ||
181 | |||
182 | /* Mask saddr with the netmask to adjust template granularity */ | ||
183 | snet = iph->saddr & svc->netmask; | ||
184 | |||
185 | IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " | ||
186 | "mnet %u.%u.%u.%u\n", | ||
187 | NIPQUAD(iph->saddr), ntohs(ports[0]), | ||
188 | NIPQUAD(iph->daddr), ntohs(ports[1]), | ||
189 | NIPQUAD(snet)); | ||
190 | |||
191 | /* | ||
192 | * FTP is a complicated network protocol that uses a control | ||
193 | * connection and separate data connections. For active FTP, the | ||
194 | * FTP server initiates the data connection to the client, usually | ||
195 | * from source port 20. For passive FTP, the server tells the | ||
196 | * client which port it is passively listening on, and the client | ||
197 | * opens the data connection. In tunneling or direct routing mode, | ||
198 | * the load balancer only sees the client-to-server half of the | ||
199 | * connection, so the port number is unknown to it. So, a conn template like | ||
200 | * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP | ||
201 | * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> | ||
202 | * is created for other persistent services. | ||
203 | */ | ||
204 | if (ports[1] == svc->port) { | ||
205 | /* Check if a template already exists */ | ||
206 | if (svc->port != FTPPORT) | ||
207 | ct = ip_vs_ct_in_get(iph->protocol, snet, 0, | ||
208 | iph->daddr, ports[1]); | ||
209 | else | ||
210 | ct = ip_vs_ct_in_get(iph->protocol, snet, 0, | ||
211 | iph->daddr, 0); | ||
212 | |||
213 | if (!ct || !ip_vs_check_template(ct)) { | ||
214 | /* | ||
215 | * No template found or the dest of the connection | ||
216 | * template is not available. | ||
217 | */ | ||
218 | dest = svc->scheduler->schedule(svc, skb); | ||
219 | if (dest == NULL) { | ||
220 | IP_VS_DBG(1, "p-schedule: no dest found.\n"); | ||
221 | return NULL; | ||
222 | } | ||
223 | |||
224 | /* | ||
225 | * Create a template like <protocol,caddr,0, | ||
226 | * vaddr,vport,daddr,dport> for non-ftp service, | ||
227 | * and <protocol,caddr,0,vaddr,0,daddr,0> | ||
228 | * for ftp service. | ||
229 | */ | ||
230 | if (svc->port != FTPPORT) | ||
231 | ct = ip_vs_conn_new(iph->protocol, | ||
232 | snet, 0, | ||
233 | iph->daddr, | ||
234 | ports[1], | ||
235 | dest->addr, dest->port, | ||
236 | IP_VS_CONN_F_TEMPLATE, | ||
237 | dest); | ||
238 | else | ||
239 | ct = ip_vs_conn_new(iph->protocol, | ||
240 | snet, 0, | ||
241 | iph->daddr, 0, | ||
242 | dest->addr, 0, | ||
243 | IP_VS_CONN_F_TEMPLATE, | ||
244 | dest); | ||
245 | if (ct == NULL) | ||
246 | return NULL; | ||
247 | |||
248 | ct->timeout = svc->timeout; | ||
249 | } else { | ||
250 | /* set destination with the found template */ | ||
251 | dest = ct->dest; | ||
252 | } | ||
253 | dport = dest->port; | ||
254 | } else { | ||
255 | /* | ||
256 | * Note: persistent fwmark-based services and persistent | ||
257 | * port zero service are handled here. | ||
258 | * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> | ||
259 | * port zero template: <protocol,caddr,0,vaddr,0,daddr,0> | ||
260 | */ | ||
261 | if (svc->fwmark) | ||
262 | ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0, | ||
263 | htonl(svc->fwmark), 0); | ||
264 | else | ||
265 | ct = ip_vs_ct_in_get(iph->protocol, snet, 0, | ||
266 | iph->daddr, 0); | ||
267 | |||
268 | if (!ct || !ip_vs_check_template(ct)) { | ||
269 | /* | ||
270 | * If it is not persistent port zero, return NULL, | ||
271 | * otherwise create a connection template. | ||
272 | */ | ||
273 | if (svc->port) | ||
274 | return NULL; | ||
275 | |||
276 | dest = svc->scheduler->schedule(svc, skb); | ||
277 | if (dest == NULL) { | ||
278 | IP_VS_DBG(1, "p-schedule: no dest found.\n"); | ||
279 | return NULL; | ||
280 | } | ||
281 | |||
282 | /* | ||
283 | * Create a template according to the service | ||
284 | */ | ||
285 | if (svc->fwmark) | ||
286 | ct = ip_vs_conn_new(IPPROTO_IP, | ||
287 | snet, 0, | ||
288 | htonl(svc->fwmark), 0, | ||
289 | dest->addr, 0, | ||
290 | IP_VS_CONN_F_TEMPLATE, | ||
291 | dest); | ||
292 | else | ||
293 | ct = ip_vs_conn_new(iph->protocol, | ||
294 | snet, 0, | ||
295 | iph->daddr, 0, | ||
296 | dest->addr, 0, | ||
297 | IP_VS_CONN_F_TEMPLATE, | ||
298 | dest); | ||
299 | if (ct == NULL) | ||
300 | return NULL; | ||
301 | |||
302 | ct->timeout = svc->timeout; | ||
303 | } else { | ||
304 | /* set destination with the found template */ | ||
305 | dest = ct->dest; | ||
306 | } | ||
307 | dport = ports[1]; | ||
308 | } | ||
309 | |||
310 | /* | ||
311 | * Create a new connection according to the template | ||
312 | */ | ||
313 | cp = ip_vs_conn_new(iph->protocol, | ||
314 | iph->saddr, ports[0], | ||
315 | iph->daddr, ports[1], | ||
316 | dest->addr, dport, | ||
317 | 0, | ||
318 | dest); | ||
319 | if (cp == NULL) { | ||
320 | ip_vs_conn_put(ct); | ||
321 | return NULL; | ||
322 | } | ||
323 | |||
324 | /* | ||
325 | * Add its control | ||
326 | */ | ||
327 | ip_vs_control_add(cp, ct); | ||
328 | ip_vs_conn_put(ct); | ||
329 | |||
330 | ip_vs_conn_stats(cp, svc); | ||
331 | return cp; | ||
332 | } | ||
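
The key idea above is that persistence granularity is controlled by masking the client source address with svc->netmask, so every client in the masked subnet maps onto the same template. A small userspace illustration of that masking step (hypothetical addresses; this is not the kernel API):

        #include <stdio.h>
        #include <stdint.h>
        #include <arpa/inet.h>

        int main(void)
        {
                uint32_t saddr   = inet_addr("192.168.1.77");
                uint32_t netmask = inet_addr("255.255.255.0");
                uint32_t snet    = saddr & netmask;     /* template source key */
                struct in_addr a = { .s_addr = snet };

                /* all of 192.168.1.0/24 shares this template key */
                printf("template snet = %s\n", inet_ntoa(a));
                return 0;
        }

With a netmask of 255.255.255.255 each client gets its own template; coarser masks trade scheduling freedom for stickier persistence.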
333 | |||
334 | |||
335 | /* | ||
336 | * IPVS main scheduling function | ||
337 | * It selects a server according to the virtual service, and | ||
338 | * creates a connection entry. | ||
339 | * Protocols supported: TCP, UDP | ||
340 | */ | ||
341 | struct ip_vs_conn * | ||
342 | ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
343 | { | ||
344 | struct ip_vs_conn *cp = NULL; | ||
345 | struct iphdr *iph = ip_hdr(skb); | ||
346 | struct ip_vs_dest *dest; | ||
347 | __be16 _ports[2], *pptr; | ||
348 | |||
349 | pptr = skb_header_pointer(skb, iph->ihl*4, | ||
350 | sizeof(_ports), _ports); | ||
351 | if (pptr == NULL) | ||
352 | return NULL; | ||
353 | |||
354 | /* | ||
355 | * Persistent service | ||
356 | */ | ||
357 | if (svc->flags & IP_VS_SVC_F_PERSISTENT) | ||
358 | return ip_vs_sched_persist(svc, skb, pptr); | ||
359 | |||
360 | /* | ||
361 | * Non-persistent service | ||
362 | */ | ||
363 | if (!svc->fwmark && pptr[1] != svc->port) { | ||
364 | if (!svc->port) | ||
365 | IP_VS_ERR("Schedule: port zero only supported " | ||
366 | "in persistent services, " | ||
367 | "check your ipvs configuration\n"); | ||
368 | return NULL; | ||
369 | } | ||
370 | |||
371 | dest = svc->scheduler->schedule(svc, skb); | ||
372 | if (dest == NULL) { | ||
373 | IP_VS_DBG(1, "Schedule: no dest found.\n"); | ||
374 | return NULL; | ||
375 | } | ||
376 | |||
377 | /* | ||
378 | * Create a connection entry. | ||
379 | */ | ||
380 | cp = ip_vs_conn_new(iph->protocol, | ||
381 | iph->saddr, pptr[0], | ||
382 | iph->daddr, pptr[1], | ||
383 | dest->addr, dest->port?dest->port:pptr[1], | ||
384 | 0, | ||
385 | dest); | ||
386 | if (cp == NULL) | ||
387 | return NULL; | ||
388 | |||
389 | IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " | ||
390 | "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", | ||
391 | ip_vs_fwd_tag(cp), | ||
392 | NIPQUAD(cp->caddr), ntohs(cp->cport), | ||
393 | NIPQUAD(cp->vaddr), ntohs(cp->vport), | ||
394 | NIPQUAD(cp->daddr), ntohs(cp->dport), | ||
395 | cp->flags, atomic_read(&cp->refcnt)); | ||
396 | |||
397 | ip_vs_conn_stats(cp, svc); | ||
398 | return cp; | ||
399 | } | ||
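
For TCP and UDP the two ports sit at the very start of the transport header, which is why ip_vs_schedule() can read both with a single skb_header_pointer() call at offset iph->ihl * 4. A userspace sketch of the same offset arithmetic on a hypothetical raw packet buffer:

        #include <stdio.h>
        #include <stdint.h>
        #include <string.h>
        #include <arpa/inet.h>

        int main(void)
        {
                uint8_t pkt[24] = { 0x45 };             /* version 4, IHL 5 -> 20-byte header */
                uint16_t ports[2] = { htons(12345), htons(80) };

                /* place source/dest ports right after the IP header */
                memcpy(pkt + (pkt[0] & 0x0f) * 4, ports, sizeof(ports));

                unsigned ihl = (pkt[0] & 0x0f) * 4;     /* iph->ihl * 4 */
                uint16_t sport, dport;
                memcpy(&sport, pkt + ihl, 2);
                memcpy(&dport, pkt + ihl + 2, 2);
                printf("sport=%u dport=%u\n", ntohs(sport), ntohs(dport));
                return 0;
        }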
400 | |||
401 | |||
402 | /* | ||
403 | * Pass or drop the packet. | ||
404 | * Called by ip_vs_in, when the virtual service is available but | ||
405 | * no destination is available for a new connection. | ||
406 | */ | ||
407 | int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, | ||
408 | struct ip_vs_protocol *pp) | ||
409 | { | ||
410 | __be16 _ports[2], *pptr; | ||
411 | struct iphdr *iph = ip_hdr(skb); | ||
412 | |||
413 | pptr = skb_header_pointer(skb, iph->ihl*4, | ||
414 | sizeof(_ports), _ports); | ||
415 | if (pptr == NULL) { | ||
416 | ip_vs_service_put(svc); | ||
417 | return NF_DROP; | ||
418 | } | ||
419 | |||
420 | /* If it is a fwmark-based service, the cache_bypass sysctl is | ||
421 | set, and the destination is RTN_UNICAST (and not local), then | ||
422 | create a cache_bypass connection entry */ | ||
423 | if (sysctl_ip_vs_cache_bypass && svc->fwmark | ||
424 | && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) { | ||
425 | int ret, cs; | ||
426 | struct ip_vs_conn *cp; | ||
427 | |||
428 | ip_vs_service_put(svc); | ||
429 | |||
430 | /* create a new connection entry */ | ||
431 | IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); | ||
432 | cp = ip_vs_conn_new(iph->protocol, | ||
433 | iph->saddr, pptr[0], | ||
434 | iph->daddr, pptr[1], | ||
435 | 0, 0, | ||
436 | IP_VS_CONN_F_BYPASS, | ||
437 | NULL); | ||
438 | if (cp == NULL) | ||
439 | return NF_DROP; | ||
440 | |||
441 | /* statistics */ | ||
442 | ip_vs_in_stats(cp, skb); | ||
443 | |||
444 | /* set state */ | ||
445 | cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); | ||
446 | |||
447 | /* transmit the first SYN packet */ | ||
448 | ret = cp->packet_xmit(skb, cp, pp); | ||
449 | /* do not touch skb anymore */ | ||
450 | |||
451 | atomic_inc(&cp->in_pkts); | ||
452 | ip_vs_conn_put(cp); | ||
453 | return ret; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * When a virtual ftp service is present, packets destined | ||
458 | * for other services on the VIP may get here (except services | ||
459 | * listed in the ipvs table); pass such packets on, because it | ||
460 | * is not IPVS's job to decide to drop them. | ||
461 | */ | ||
462 | if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { | ||
463 | ip_vs_service_put(svc); | ||
464 | return NF_ACCEPT; | ||
465 | } | ||
466 | |||
467 | ip_vs_service_put(svc); | ||
468 | |||
469 | /* | ||
470 | * Notify the client that the destination is unreachable, and | ||
471 | * release the socket buffer. | ||
472 | * Since we are at the IP layer, no TCP socket actually | ||
473 | * exists, so a TCP RST cannot be sent; instead, | ||
474 | * ICMP_PORT_UNREACH is sent here for both TCP and UDP. --WZ | ||
475 | */ | ||
476 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); | ||
477 | return NF_DROP; | ||
478 | } | ||
479 | |||
480 | |||
481 | /* | ||
482 | * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING | ||
483 | * chain, and is used for VS/NAT. | ||
484 | * It detects packets for VS/NAT connections and sends the packets | ||
485 | * immediately. This avoids having iptable_nat mangle packets | ||
486 | * that belong to VS/NAT. | ||
487 | */ | ||
488 | static unsigned int ip_vs_post_routing(unsigned int hooknum, | ||
489 | struct sk_buff *skb, | ||
490 | const struct net_device *in, | ||
491 | const struct net_device *out, | ||
492 | int (*okfn)(struct sk_buff *)) | ||
493 | { | ||
494 | if (!skb->ipvs_property) | ||
495 | return NF_ACCEPT; | ||
496 | /* The packet was sent from IPVS, exit this chain */ | ||
497 | return NF_STOP; | ||
498 | } | ||
499 | |||
500 | __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) | ||
501 | { | ||
502 | return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); | ||
503 | } | ||
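
ip_vs_checksum_complete() delegates to skb_checksum()/csum_fold(); the underlying algorithm is the standard Internet checksum: sum 16-bit words with end-around carry, then take the one's complement. A self-contained userspace sketch (big-endian word summing, simplified from the kernel's incremental version):

        #include <stdio.h>
        #include <stdint.h>
        #include <stddef.h>

        static uint16_t inet_csum(const void *data, size_t len)
        {
                const uint8_t *p = data;
                uint32_t sum = 0;

                while (len > 1) {
                        sum += (p[0] << 8) | p[1];      /* 16-bit big-endian word */
                        p += 2;
                        len -= 2;
                }
                if (len)                                /* odd trailing byte */
                        sum += p[0] << 8;
                while (sum >> 16)                       /* fold carries ("csum_fold") */
                        sum = (sum & 0xffff) + (sum >> 16);
                return (uint16_t)~sum;
        }

        int main(void)
        {
                uint8_t buf[] = { 0x08, 0x00, 0x00, 0x00, 0x12, 0x34 }; /* ICMP-ish */
                printf("checksum = 0x%04x\n", inet_csum(buf, sizeof(buf)));
                return 0;
        }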
504 | |||
505 | static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) | ||
506 | { | ||
507 | int err = ip_defrag(skb, user); | ||
508 | |||
509 | if (!err) | ||
510 | ip_send_check(ip_hdr(skb)); | ||
511 | |||
512 | return err; | ||
513 | } | ||
514 | |||
515 | /* | ||
516 | * Packet has been made sufficiently writable in caller | ||
517 | * - inout: 1=in->out, 0=out->in | ||
518 | */ | ||
519 | void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
520 | struct ip_vs_conn *cp, int inout) | ||
521 | { | ||
522 | struct iphdr *iph = ip_hdr(skb); | ||
523 | unsigned int icmp_offset = iph->ihl*4; | ||
524 | struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + | ||
525 | icmp_offset); | ||
526 | struct iphdr *ciph = (struct iphdr *)(icmph + 1); | ||
527 | |||
528 | if (inout) { | ||
529 | iph->saddr = cp->vaddr; | ||
530 | ip_send_check(iph); | ||
531 | ciph->daddr = cp->vaddr; | ||
532 | ip_send_check(ciph); | ||
533 | } else { | ||
534 | iph->daddr = cp->daddr; | ||
535 | ip_send_check(iph); | ||
536 | ciph->saddr = cp->daddr; | ||
537 | ip_send_check(ciph); | ||
538 | } | ||
539 | |||
540 | /* the TCP/UDP port */ | ||
541 | if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { | ||
542 | __be16 *ports = (void *)ciph + ciph->ihl*4; | ||
543 | |||
544 | if (inout) | ||
545 | ports[1] = cp->vport; | ||
546 | else | ||
547 | ports[0] = cp->dport; | ||
548 | } | ||
549 | |||
550 | /* And finally the ICMP checksum */ | ||
551 | icmph->checksum = 0; | ||
552 | icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); | ||
553 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
554 | |||
555 | if (inout) | ||
556 | IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, | ||
557 | "Forwarding altered outgoing ICMP"); | ||
558 | else | ||
559 | IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, | ||
560 | "Forwarding altered incoming ICMP"); | ||
561 | } | ||
562 | |||
563 | /* | ||
564 | * Handle ICMP messages in the inside-to-outside direction (outgoing). | ||
565 | * Find any that might be relevant, check against existing connections, | ||
566 | * forward to the right destination host if relevant. | ||
567 | * Currently handles error types - unreachable, quench, ttl exceeded. | ||
568 | * (Only used in VS/NAT) | ||
569 | */ | ||
570 | static int ip_vs_out_icmp(struct sk_buff *skb, int *related) | ||
571 | { | ||
572 | struct iphdr *iph; | ||
573 | struct icmphdr _icmph, *ic; | ||
574 | struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ | ||
575 | struct ip_vs_conn *cp; | ||
576 | struct ip_vs_protocol *pp; | ||
577 | unsigned int offset, ihl, verdict; | ||
578 | |||
579 | *related = 1; | ||
580 | |||
581 | /* reassemble IP fragments */ | ||
582 | if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { | ||
583 | if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) | ||
584 | return NF_STOLEN; | ||
585 | } | ||
586 | |||
587 | iph = ip_hdr(skb); | ||
588 | offset = ihl = iph->ihl * 4; | ||
589 | ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); | ||
590 | if (ic == NULL) | ||
591 | return NF_DROP; | ||
592 | |||
593 | IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", | ||
594 | ic->type, ntohs(icmp_id(ic)), | ||
595 | NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); | ||
596 | |||
597 | /* | ||
598 | * Work out whether this is for us. | ||
599 | * These checks are ordered so that the cheap ones run first | ||
600 | * to speed up processing; this means that some packets will | ||
601 | * get a long way down this stack before being rejected, but | ||
602 | * that's life. | ||
603 | */ | ||
604 | if ((ic->type != ICMP_DEST_UNREACH) && | ||
605 | (ic->type != ICMP_SOURCE_QUENCH) && | ||
606 | (ic->type != ICMP_TIME_EXCEEDED)) { | ||
607 | *related = 0; | ||
608 | return NF_ACCEPT; | ||
609 | } | ||
610 | |||
611 | /* Now find the contained IP header */ | ||
612 | offset += sizeof(_icmph); | ||
613 | cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); | ||
614 | if (cih == NULL) | ||
615 | return NF_ACCEPT; /* The packet looks wrong, ignore */ | ||
616 | |||
617 | pp = ip_vs_proto_get(cih->protocol); | ||
618 | if (!pp) | ||
619 | return NF_ACCEPT; | ||
620 | |||
621 | /* Is the embedded protocol header present? */ | ||
622 | if (unlikely(cih->frag_off & htons(IP_OFFSET) && | ||
623 | pp->dont_defrag)) | ||
624 | return NF_ACCEPT; | ||
625 | |||
626 | IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); | ||
627 | |||
628 | offset += cih->ihl * 4; | ||
629 | |||
630 | /* The embedded headers contain source and dest in reverse order */ | ||
631 | cp = pp->conn_out_get(skb, pp, cih, offset, 1); | ||
632 | if (!cp) | ||
633 | return NF_ACCEPT; | ||
634 | |||
635 | verdict = NF_DROP; | ||
636 | |||
637 | if (IP_VS_FWD_METHOD(cp) != 0) { | ||
638 | IP_VS_ERR("shouldn't reach here, because the box is on the " | ||
639 | "half connection in the tun/dr module.\n"); | ||
640 | } | ||
641 | |||
642 | /* Ensure the checksum is correct */ | ||
643 | if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { | ||
644 | /* Failed checksum! */ | ||
645 | IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", | ||
646 | NIPQUAD(iph->saddr)); | ||
647 | goto out; | ||
648 | } | ||
649 | |||
650 | if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) | ||
651 | offset += 2 * sizeof(__u16); | ||
652 | if (!skb_make_writable(skb, offset)) | ||
653 | goto out; | ||
654 | |||
655 | ip_vs_nat_icmp(skb, pp, cp, 1); | ||
656 | |||
657 | /* do the statistics and put it back */ | ||
658 | ip_vs_out_stats(cp, skb); | ||
659 | |||
660 | skb->ipvs_property = 1; | ||
661 | verdict = NF_ACCEPT; | ||
662 | |||
663 | out: | ||
664 | __ip_vs_conn_put(cp); | ||
665 | |||
666 | return verdict; | ||
667 | } | ||
668 | |||
669 | static inline int is_tcp_reset(const struct sk_buff *skb) | ||
670 | { | ||
671 | struct tcphdr _tcph, *th; | ||
672 | |||
673 | th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); | ||
674 | if (th == NULL) | ||
675 | return 0; | ||
676 | return th->rst; | ||
677 | } | ||
678 | |||
679 | /* | ||
680 | * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. | ||
681 | * Check if outgoing packet belongs to the established ip_vs_conn, | ||
682 | * rewrite addresses of the packet and send it on its way... | ||
683 | */ | ||
684 | static unsigned int | ||
685 | ip_vs_out(unsigned int hooknum, struct sk_buff *skb, | ||
686 | const struct net_device *in, const struct net_device *out, | ||
687 | int (*okfn)(struct sk_buff *)) | ||
688 | { | ||
689 | struct iphdr *iph; | ||
690 | struct ip_vs_protocol *pp; | ||
691 | struct ip_vs_conn *cp; | ||
692 | int ihl; | ||
693 | |||
694 | EnterFunction(11); | ||
695 | |||
696 | if (skb->ipvs_property) | ||
697 | return NF_ACCEPT; | ||
698 | |||
699 | iph = ip_hdr(skb); | ||
700 | if (unlikely(iph->protocol == IPPROTO_ICMP)) { | ||
701 | int related, verdict = ip_vs_out_icmp(skb, &related); | ||
702 | |||
703 | if (related) | ||
704 | return verdict; | ||
705 | iph = ip_hdr(skb); | ||
706 | } | ||
707 | |||
708 | pp = ip_vs_proto_get(iph->protocol); | ||
709 | if (unlikely(!pp)) | ||
710 | return NF_ACCEPT; | ||
711 | |||
712 | /* reassemble IP fragments */ | ||
713 | if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) && | ||
714 | !pp->dont_defrag)) { | ||
715 | if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) | ||
716 | return NF_STOLEN; | ||
717 | iph = ip_hdr(skb); | ||
718 | } | ||
719 | |||
720 | ihl = iph->ihl << 2; | ||
721 | |||
722 | /* | ||
723 | * Check if the packet belongs to an existing entry | ||
724 | */ | ||
725 | cp = pp->conn_out_get(skb, pp, iph, ihl, 0); | ||
726 | |||
727 | if (unlikely(!cp)) { | ||
728 | if (sysctl_ip_vs_nat_icmp_send && | ||
729 | (pp->protocol == IPPROTO_TCP || | ||
730 | pp->protocol == IPPROTO_UDP)) { | ||
731 | __be16 _ports[2], *pptr; | ||
732 | |||
733 | pptr = skb_header_pointer(skb, ihl, | ||
734 | sizeof(_ports), _ports); | ||
735 | if (pptr == NULL) | ||
736 | return NF_ACCEPT; /* Not for me */ | ||
737 | if (ip_vs_lookup_real_service(iph->protocol, | ||
738 | iph->saddr, pptr[0])) { | ||
739 | /* | ||
740 | * Notify the real server: there is no | ||
741 | * existing entry if it is not RST | ||
742 | * packet or not TCP packet. | ||
743 | */ | ||
744 | if (iph->protocol != IPPROTO_TCP | ||
745 | || !is_tcp_reset(skb)) { | ||
746 | icmp_send(skb,ICMP_DEST_UNREACH, | ||
747 | ICMP_PORT_UNREACH, 0); | ||
748 | return NF_DROP; | ||
749 | } | ||
750 | } | ||
751 | } | ||
752 | IP_VS_DBG_PKT(12, pp, skb, 0, | ||
753 | "packet continues traversal as normal"); | ||
754 | return NF_ACCEPT; | ||
755 | } | ||
756 | |||
757 | IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); | ||
758 | |||
759 | if (!skb_make_writable(skb, ihl)) | ||
760 | goto drop; | ||
761 | |||
762 | /* mangle the packet */ | ||
763 | if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) | ||
764 | goto drop; | ||
765 | ip_hdr(skb)->saddr = cp->vaddr; | ||
766 | ip_send_check(ip_hdr(skb)); | ||
767 | |||
768 | /* For policy routing, packets originating from this | ||
769 | * machine itself may be routed differently to packets | ||
770 | * passing through. We want this packet to be routed as | ||
771 | * if it came from this machine itself. So re-compute | ||
772 | * the routing information. | ||
773 | */ | ||
774 | if (ip_route_me_harder(skb, RTN_LOCAL) != 0) | ||
775 | goto drop; | ||
776 | |||
777 | IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); | ||
778 | |||
779 | ip_vs_out_stats(cp, skb); | ||
780 | ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); | ||
781 | ip_vs_conn_put(cp); | ||
782 | |||
783 | skb->ipvs_property = 1; | ||
784 | |||
785 | LeaveFunction(11); | ||
786 | return NF_ACCEPT; | ||
787 | |||
788 | drop: | ||
789 | ip_vs_conn_put(cp); | ||
790 | kfree_skb(skb); | ||
791 | return NF_STOLEN; | ||
792 | } | ||
793 | |||
794 | |||
795 | /* | ||
796 | * Handle ICMP messages in the outside-to-inside direction (incoming). | ||
797 | * Find any that might be relevant, check against existing connections, | ||
798 | * forward to the right destination host if relevant. | ||
799 | * Currently handles error types - unreachable, quench, ttl exceeded. | ||
800 | */ | ||
801 | static int | ||
802 | ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) | ||
803 | { | ||
804 | struct iphdr *iph; | ||
805 | struct icmphdr _icmph, *ic; | ||
806 | struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ | ||
807 | struct ip_vs_conn *cp; | ||
808 | struct ip_vs_protocol *pp; | ||
809 | unsigned int offset, ihl, verdict; | ||
810 | |||
811 | *related = 1; | ||
812 | |||
813 | /* reassemble IP fragments */ | ||
814 | if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { | ||
815 | if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? | ||
816 | IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) | ||
817 | return NF_STOLEN; | ||
818 | } | ||
819 | |||
820 | iph = ip_hdr(skb); | ||
821 | offset = ihl = iph->ihl * 4; | ||
822 | ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); | ||
823 | if (ic == NULL) | ||
824 | return NF_DROP; | ||
825 | |||
826 | IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", | ||
827 | ic->type, ntohs(icmp_id(ic)), | ||
828 | NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); | ||
829 | |||
830 | /* | ||
831 | * Work out whether this is for us. | ||
832 | * These checks are ordered so that the cheap ones run first | ||
833 | * to speed up processing; this means that some packets will | ||
834 | * get a long way down this stack before being rejected, but | ||
835 | * that's life. | ||
836 | */ | ||
837 | if ((ic->type != ICMP_DEST_UNREACH) && | ||
838 | (ic->type != ICMP_SOURCE_QUENCH) && | ||
839 | (ic->type != ICMP_TIME_EXCEEDED)) { | ||
840 | *related = 0; | ||
841 | return NF_ACCEPT; | ||
842 | } | ||
843 | |||
844 | /* Now find the contained IP header */ | ||
845 | offset += sizeof(_icmph); | ||
846 | cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); | ||
847 | if (cih == NULL) | ||
848 | return NF_ACCEPT; /* The packet looks wrong, ignore */ | ||
849 | |||
850 | pp = ip_vs_proto_get(cih->protocol); | ||
851 | if (!pp) | ||
852 | return NF_ACCEPT; | ||
853 | |||
854 | /* Is the embedded protocol header present? */ | ||
855 | if (unlikely(cih->frag_off & htons(IP_OFFSET) && | ||
856 | pp->dont_defrag)) | ||
857 | return NF_ACCEPT; | ||
858 | |||
859 | IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); | ||
860 | |||
861 | offset += cih->ihl * 4; | ||
862 | |||
863 | /* The embedded headers contain source and dest in reverse order */ | ||
864 | cp = pp->conn_in_get(skb, pp, cih, offset, 1); | ||
865 | if (!cp) | ||
866 | return NF_ACCEPT; | ||
867 | |||
868 | verdict = NF_DROP; | ||
869 | |||
870 | /* Ensure the checksum is correct */ | ||
871 | if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { | ||
872 | /* Failed checksum! */ | ||
873 | IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", | ||
874 | NIPQUAD(iph->saddr)); | ||
875 | goto out; | ||
876 | } | ||
877 | |||
878 | /* do the statistics and put it back */ | ||
879 | ip_vs_in_stats(cp, skb); | ||
880 | if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) | ||
881 | offset += 2 * sizeof(__u16); | ||
882 | verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); | ||
883 | /* do not touch skb anymore */ | ||
884 | |||
885 | out: | ||
886 | __ip_vs_conn_put(cp); | ||
887 | |||
888 | return verdict; | ||
889 | } | ||
890 | |||
891 | /* | ||
892 | * Check if it's for virtual services, look it up, | ||
893 | * and send it on its way... | ||
894 | */ | ||
895 | static unsigned int | ||
896 | ip_vs_in(unsigned int hooknum, struct sk_buff *skb, | ||
897 | const struct net_device *in, const struct net_device *out, | ||
898 | int (*okfn)(struct sk_buff *)) | ||
899 | { | ||
900 | struct iphdr *iph; | ||
901 | struct ip_vs_protocol *pp; | ||
902 | struct ip_vs_conn *cp; | ||
903 | int ret, restart; | ||
904 | int ihl; | ||
905 | |||
906 | /* | ||
907 | * Big tappo: only PACKET_HOST (neither loopback nor mcasts) | ||
908 | * ... it is not clear why the 1st test does not already cover the 2nd (?) | ||
909 | */ | ||
910 | if (unlikely(skb->pkt_type != PACKET_HOST | ||
911 | || skb->dev->flags & IFF_LOOPBACK || skb->sk)) { | ||
912 | IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", | ||
913 | skb->pkt_type, | ||
914 | ip_hdr(skb)->protocol, | ||
915 | NIPQUAD(ip_hdr(skb)->daddr)); | ||
916 | return NF_ACCEPT; | ||
917 | } | ||
918 | |||
919 | iph = ip_hdr(skb); | ||
920 | if (unlikely(iph->protocol == IPPROTO_ICMP)) { | ||
921 | int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); | ||
922 | |||
923 | if (related) | ||
924 | return verdict; | ||
925 | iph = ip_hdr(skb); | ||
926 | } | ||
927 | |||
928 | /* Protocol supported? */ | ||
929 | pp = ip_vs_proto_get(iph->protocol); | ||
930 | if (unlikely(!pp)) | ||
931 | return NF_ACCEPT; | ||
932 | |||
933 | ihl = iph->ihl << 2; | ||
934 | |||
935 | /* | ||
936 | * Check if the packet belongs to an existing connection entry | ||
937 | */ | ||
938 | cp = pp->conn_in_get(skb, pp, iph, ihl, 0); | ||
939 | |||
940 | if (unlikely(!cp)) { | ||
941 | int v; | ||
942 | |||
943 | if (!pp->conn_schedule(skb, pp, &v, &cp)) | ||
944 | return v; | ||
945 | } | ||
946 | |||
947 | if (unlikely(!cp)) { | ||
948 | /* sorry, all this trouble for a no-hit :) */ | ||
949 | IP_VS_DBG_PKT(12, pp, skb, 0, | ||
950 | "packet continues traversal as normal"); | ||
951 | return NF_ACCEPT; | ||
952 | } | ||
953 | |||
954 | IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); | ||
955 | |||
956 | /* Check the server status */ | ||
957 | if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { | ||
958 | /* the destination server is not available */ | ||
959 | |||
960 | if (sysctl_ip_vs_expire_nodest_conn) { | ||
961 | /* try to expire the connection immediately */ | ||
962 | ip_vs_conn_expire_now(cp); | ||
963 | } | ||
964 | /* don't restart its timer, and silently | ||
965 | drop the packet. */ | ||
966 | __ip_vs_conn_put(cp); | ||
967 | return NF_DROP; | ||
968 | } | ||
969 | |||
970 | ip_vs_in_stats(cp, skb); | ||
971 | restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); | ||
972 | if (cp->packet_xmit) | ||
973 | ret = cp->packet_xmit(skb, cp, pp); | ||
974 | /* do not touch skb anymore */ | ||
975 | else { | ||
976 | IP_VS_DBG_RL("warning: packet_xmit is null"); | ||
977 | ret = NF_ACCEPT; | ||
978 | } | ||
979 | |||
980 | /* Increase its packet counter and check whether it needs | ||
981 | * to be synchronized. | ||
982 | * | ||
983 | * Sync the connection if it is about to close, to | ||
984 | * encourage the standby servers to update the connection's timeout. | ||
985 | */ | ||
986 | atomic_inc(&cp->in_pkts); | ||
987 | if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && | ||
988 | (((cp->protocol != IPPROTO_TCP || | ||
989 | cp->state == IP_VS_TCP_S_ESTABLISHED) && | ||
990 | (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] | ||
991 | == sysctl_ip_vs_sync_threshold[0])) || | ||
992 | ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && | ||
993 | ((cp->state == IP_VS_TCP_S_FIN_WAIT) || | ||
994 | (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || | ||
995 | (cp->state == IP_VS_TCP_S_TIME_WAIT))))) | ||
996 | ip_vs_sync_conn(cp); | ||
997 | cp->old_state = cp->state; | ||
998 | |||
999 | ip_vs_conn_put(cp); | ||
1000 | return ret; | ||
1001 | } | ||
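
The long condition above paces master-to-backup synchronization. With the defaults sysctl_ip_vs_sync_threshold = { 3, 50 } (see ip_vs_ctl.c below), a connection that is not TCP, or is TCP in ESTABLISHED, is synced whenever in_pkts % 50 == 3; closing TCP states are synced on each state change instead. A tiny sketch of the modulo pacing:

        #include <stdio.h>

        int main(void)
        {
                int threshold = 3, period = 50;         /* sync_threshold defaults */

                for (int in_pkts = 1; in_pkts <= 120; in_pkts++)
                        if (in_pkts % period == threshold)
                                printf("sync at packet %d\n", in_pkts);
                return 0;
        }

This prints packets 3, 53 and 103: an early sync so the backup learns about the connection quickly, then a low-rate refresh afterwards.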
1002 | |||
1003 | |||
1004 | /* | ||
1005 | * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP | ||
1006 | * related packets destined for 0.0.0.0/0. | ||
1007 | * When fwmark-based virtual service is used, such as transparent | ||
1008 | * cache cluster, TCP packets can be marked and routed to ip_vs_in, | ||
1009 | * but ICMP destined for 0.0.0.0/0 cannot be easily marked and | ||
1010 | * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain | ||
1011 | * and send them to ip_vs_in_icmp. | ||
1012 | */ | ||
1013 | static unsigned int | ||
1014 | ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, | ||
1015 | const struct net_device *in, const struct net_device *out, | ||
1016 | int (*okfn)(struct sk_buff *)) | ||
1017 | { | ||
1018 | int r; | ||
1019 | |||
1020 | if (ip_hdr(skb)->protocol != IPPROTO_ICMP) | ||
1021 | return NF_ACCEPT; | ||
1022 | |||
1023 | return ip_vs_in_icmp(skb, &r, hooknum); | ||
1024 | } | ||
1025 | |||
1026 | |||
1027 | static struct nf_hook_ops ip_vs_ops[] __read_mostly = { | ||
1028 | /* After packet filtering, forward packet through VS/DR, VS/TUN, | ||
1029 | * or VS/NAT(change destination), so that filtering rules can be | ||
1030 | * applied to IPVS. */ | ||
1031 | { | ||
1032 | .hook = ip_vs_in, | ||
1033 | .owner = THIS_MODULE, | ||
1034 | .pf = PF_INET, | ||
1035 | .hooknum = NF_INET_LOCAL_IN, | ||
1036 | .priority = 100, | ||
1037 | }, | ||
1038 | /* After packet filtering, change source only for VS/NAT */ | ||
1039 | { | ||
1040 | .hook = ip_vs_out, | ||
1041 | .owner = THIS_MODULE, | ||
1042 | .pf = PF_INET, | ||
1043 | .hooknum = NF_INET_FORWARD, | ||
1044 | .priority = 100, | ||
1045 | }, | ||
1046 | /* After packet filtering (but before ip_vs_out_icmp), catch icmp | ||
1047 | * destined for 0.0.0.0/0, which is for incoming IPVS connections */ | ||
1048 | { | ||
1049 | .hook = ip_vs_forward_icmp, | ||
1050 | .owner = THIS_MODULE, | ||
1051 | .pf = PF_INET, | ||
1052 | .hooknum = NF_INET_FORWARD, | ||
1053 | .priority = 99, | ||
1054 | }, | ||
1055 | /* Before the netfilter connection tracking, exit from POST_ROUTING */ | ||
1056 | { | ||
1057 | .hook = ip_vs_post_routing, | ||
1058 | .owner = THIS_MODULE, | ||
1059 | .pf = PF_INET, | ||
1060 | .hooknum = NF_INET_POST_ROUTING, | ||
1061 | .priority = NF_IP_PRI_NAT_SRC-1, | ||
1062 | }, | ||
1063 | }; | ||
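
Within a single netfilter chain, hooks fire in ascending priority order, which is what the table above relies on: on FORWARD, ip_vs_forward_icmp (priority 99) runs before ip_vs_out (priority 100), and on POST_ROUTING the IPVS hook runs just ahead of source NAT (NF_IP_PRI_NAT_SRC - 1). A small sketch of that ordering rule, with chain/priority values transcribed from the table (the sort is illustrative, not the kernel's registration code):

        #include <stdio.h>
        #include <stdlib.h>

        struct hook { const char *name; int chain; int prio; };

        static int cmp(const void *a, const void *b)
        {
                const struct hook *x = a, *y = b;
                return x->chain != y->chain ? x->chain - y->chain
                                            : x->prio - y->prio;
        }

        int main(void)
        {
                struct hook h[] = {
                        { "ip_vs_out",          1 /* FORWARD */,      100 },
                        { "ip_vs_forward_icmp", 1 /* FORWARD */,       99 },
                        { "ip_vs_in",           0 /* LOCAL_IN */,     100 },
                        { "ip_vs_post_routing", 2 /* POST_ROUTING */,  99 }, /* NAT_SRC - 1 */
                };
                int n = sizeof(h) / sizeof(h[0]);

                qsort(h, n, sizeof(h[0]), cmp);         /* per-chain firing order */
                for (int i = 0; i < n; i++)
                        printf("%-20s chain=%d prio=%d\n",
                               h[i].name, h[i].chain, h[i].prio);
                return 0;
        }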
1064 | |||
1065 | |||
1066 | /* | ||
1067 | * Initialize IP Virtual Server | ||
1068 | */ | ||
1069 | static int __init ip_vs_init(void) | ||
1070 | { | ||
1071 | int ret; | ||
1072 | |||
1073 | ret = ip_vs_control_init(); | ||
1074 | if (ret < 0) { | ||
1075 | IP_VS_ERR("can't setup control.\n"); | ||
1076 | goto cleanup_nothing; | ||
1077 | } | ||
1078 | |||
1079 | ip_vs_protocol_init(); | ||
1080 | |||
1081 | ret = ip_vs_app_init(); | ||
1082 | if (ret < 0) { | ||
1083 | IP_VS_ERR("can't setup application helper.\n"); | ||
1084 | goto cleanup_protocol; | ||
1085 | } | ||
1086 | |||
1087 | ret = ip_vs_conn_init(); | ||
1088 | if (ret < 0) { | ||
1089 | IP_VS_ERR("can't setup connection table.\n"); | ||
1090 | goto cleanup_app; | ||
1091 | } | ||
1092 | |||
1093 | ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); | ||
1094 | if (ret < 0) { | ||
1095 | IP_VS_ERR("can't register hooks.\n"); | ||
1096 | goto cleanup_conn; | ||
1097 | } | ||
1098 | |||
1099 | IP_VS_INFO("ipvs loaded.\n"); | ||
1100 | return ret; | ||
1101 | |||
1102 | cleanup_conn: | ||
1103 | ip_vs_conn_cleanup(); | ||
1104 | cleanup_app: | ||
1105 | ip_vs_app_cleanup(); | ||
1106 | cleanup_protocol: | ||
1107 | ip_vs_protocol_cleanup(); | ||
1108 | ip_vs_control_cleanup(); | ||
1109 | cleanup_nothing: | ||
1110 | return ret; | ||
1111 | } | ||
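
ip_vs_init() uses the classic goto-based unwind idiom: each failure label undoes exactly the steps that already succeeded, in reverse order, so there is a single exit path per outcome. A minimal sketch of the idiom with hypothetical setup_a/setup_b stand-ins:

        #include <stdio.h>

        static int setup_a(void) { puts("a up");     return 0; }
        static void undo_a(void) { puts("a down"); }
        static int setup_b(void) { puts("b failed"); return -1; /* simulated */ }

        static int init(void)
        {
                int ret = setup_a();
                if (ret < 0)
                        goto cleanup_nothing;

                ret = setup_b();
                if (ret < 0)
                        goto cleanup_a;                 /* undo only what succeeded */

                return 0;

        cleanup_a:
                undo_a();
        cleanup_nothing:
                return ret;
        }

        int main(void) { return init() ? 1 : 0; }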
1112 | |||
1113 | static void __exit ip_vs_cleanup(void) | ||
1114 | { | ||
1115 | nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); | ||
1116 | ip_vs_conn_cleanup(); | ||
1117 | ip_vs_app_cleanup(); | ||
1118 | ip_vs_protocol_cleanup(); | ||
1119 | ip_vs_control_cleanup(); | ||
1120 | IP_VS_INFO("ipvs unloaded.\n"); | ||
1121 | } | ||
1122 | |||
1123 | module_init(ip_vs_init); | ||
1124 | module_exit(ip_vs_cleanup); | ||
1125 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c deleted file mode 100644 index 6379705a8dcb..000000000000 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ /dev/null | |||
@@ -1,2373 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the NetFilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
9 | * Peter Kese <peter.kese@ijs.si> | ||
10 | * Julian Anastasov <ja@ssi.bg> | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License | ||
14 | * as published by the Free Software Foundation; either version | ||
15 | * 2 of the License, or (at your option) any later version. | ||
16 | * | ||
17 | * Changes: | ||
18 | * | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/init.h> | ||
23 | #include <linux/types.h> | ||
24 | #include <linux/capability.h> | ||
25 | #include <linux/fs.h> | ||
26 | #include <linux/sysctl.h> | ||
27 | #include <linux/proc_fs.h> | ||
28 | #include <linux/workqueue.h> | ||
29 | #include <linux/swap.h> | ||
30 | #include <linux/seq_file.h> | ||
31 | |||
32 | #include <linux/netfilter.h> | ||
33 | #include <linux/netfilter_ipv4.h> | ||
34 | #include <linux/mutex.h> | ||
35 | |||
36 | #include <net/net_namespace.h> | ||
37 | #include <net/ip.h> | ||
38 | #include <net/route.h> | ||
39 | #include <net/sock.h> | ||
40 | |||
41 | #include <asm/uaccess.h> | ||
42 | |||
43 | #include <net/ip_vs.h> | ||
44 | |||
45 | /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ | ||
46 | static DEFINE_MUTEX(__ip_vs_mutex); | ||
47 | |||
48 | /* lock for service table */ | ||
49 | static DEFINE_RWLOCK(__ip_vs_svc_lock); | ||
50 | |||
51 | /* lock for table with the real services */ | ||
52 | static DEFINE_RWLOCK(__ip_vs_rs_lock); | ||
53 | |||
54 | /* lock for state and timeout tables */ | ||
55 | static DEFINE_RWLOCK(__ip_vs_securetcp_lock); | ||
56 | |||
57 | /* lock for drop entry handling */ | ||
58 | static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); | ||
59 | |||
60 | /* lock for drop packet handling */ | ||
61 | static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); | ||
62 | |||
63 | /* 1/rate drop and drop-entry variables */ | ||
64 | int ip_vs_drop_rate = 0; | ||
65 | int ip_vs_drop_counter = 0; | ||
66 | static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); | ||
67 | |||
68 | /* number of virtual services */ | ||
69 | static int ip_vs_num_services = 0; | ||
70 | |||
71 | /* sysctl variables */ | ||
72 | static int sysctl_ip_vs_drop_entry = 0; | ||
73 | static int sysctl_ip_vs_drop_packet = 0; | ||
74 | static int sysctl_ip_vs_secure_tcp = 0; | ||
75 | static int sysctl_ip_vs_amemthresh = 1024; | ||
76 | static int sysctl_ip_vs_am_droprate = 10; | ||
77 | int sysctl_ip_vs_cache_bypass = 0; | ||
78 | int sysctl_ip_vs_expire_nodest_conn = 0; | ||
79 | int sysctl_ip_vs_expire_quiescent_template = 0; | ||
80 | int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; | ||
81 | int sysctl_ip_vs_nat_icmp_send = 0; | ||
82 | |||
83 | |||
84 | #ifdef CONFIG_IP_VS_DEBUG | ||
85 | static int sysctl_ip_vs_debug_level = 0; | ||
86 | |||
87 | int ip_vs_get_debug_level(void) | ||
88 | { | ||
89 | return sysctl_ip_vs_debug_level; | ||
90 | } | ||
91 | #endif | ||
92 | |||
93 | /* | ||
94 | * update_defense_level is called from keventd and from sysctl, | ||
95 | * so it needs to protect itself from softirqs | ||
96 | */ | ||
97 | static void update_defense_level(void) | ||
98 | { | ||
99 | struct sysinfo i; | ||
100 | static int old_secure_tcp = 0; | ||
101 | int availmem; | ||
102 | int nomem; | ||
103 | int to_change = -1; | ||
104 | |||
105 | /* we only count free and buffered memory (in pages) */ | ||
106 | si_meminfo(&i); | ||
107 | availmem = i.freeram + i.bufferram; | ||
108 | /* however, in Linux 2.5 i.bufferram is the total page cache | ||
109 | size, so we would need to adjust it */ | ||
110 | /* si_swapinfo(&i); */ | ||
111 | /* availmem = availmem - (i.totalswap - i.freeswap); */ | ||
112 | |||
113 | nomem = (availmem < sysctl_ip_vs_amemthresh); | ||
114 | |||
115 | local_bh_disable(); | ||
116 | |||
117 | /* drop_entry */ | ||
118 | spin_lock(&__ip_vs_dropentry_lock); | ||
119 | switch (sysctl_ip_vs_drop_entry) { | ||
120 | case 0: | ||
121 | atomic_set(&ip_vs_dropentry, 0); | ||
122 | break; | ||
123 | case 1: | ||
124 | if (nomem) { | ||
125 | atomic_set(&ip_vs_dropentry, 1); | ||
126 | sysctl_ip_vs_drop_entry = 2; | ||
127 | } else { | ||
128 | atomic_set(&ip_vs_dropentry, 0); | ||
129 | } | ||
130 | break; | ||
131 | case 2: | ||
132 | if (nomem) { | ||
133 | atomic_set(&ip_vs_dropentry, 1); | ||
134 | } else { | ||
135 | atomic_set(&ip_vs_dropentry, 0); | ||
136 | sysctl_ip_vs_drop_entry = 1; | ||
137 | }; | ||
138 | break; | ||
139 | case 3: | ||
140 | atomic_set(&ip_vs_dropentry, 1); | ||
141 | break; | ||
142 | } | ||
143 | spin_unlock(&__ip_vs_dropentry_lock); | ||
144 | |||
145 | /* drop_packet */ | ||
146 | spin_lock(&__ip_vs_droppacket_lock); | ||
147 | switch (sysctl_ip_vs_drop_packet) { | ||
148 | case 0: | ||
149 | ip_vs_drop_rate = 0; | ||
150 | break; | ||
151 | case 1: | ||
152 | if (nomem) { | ||
153 | ip_vs_drop_rate = ip_vs_drop_counter | ||
154 | = sysctl_ip_vs_amemthresh / | ||
155 | (sysctl_ip_vs_amemthresh-availmem); | ||
156 | sysctl_ip_vs_drop_packet = 2; | ||
157 | } else { | ||
158 | ip_vs_drop_rate = 0; | ||
159 | } | ||
160 | break; | ||
161 | case 2: | ||
162 | if (nomem) { | ||
163 | ip_vs_drop_rate = ip_vs_drop_counter | ||
164 | = sysctl_ip_vs_amemthresh / | ||
165 | (sysctl_ip_vs_amemthresh-availmem); | ||
166 | } else { | ||
167 | ip_vs_drop_rate = 0; | ||
168 | sysctl_ip_vs_drop_packet = 1; | ||
169 | } | ||
170 | break; | ||
171 | case 3: | ||
172 | ip_vs_drop_rate = sysctl_ip_vs_am_droprate; | ||
173 | break; | ||
174 | } | ||
175 | spin_unlock(&__ip_vs_droppacket_lock); | ||
176 | |||
177 | /* secure_tcp */ | ||
178 | write_lock(&__ip_vs_securetcp_lock); | ||
179 | switch (sysctl_ip_vs_secure_tcp) { | ||
180 | case 0: | ||
181 | if (old_secure_tcp >= 2) | ||
182 | to_change = 0; | ||
183 | break; | ||
184 | case 1: | ||
185 | if (nomem) { | ||
186 | if (old_secure_tcp < 2) | ||
187 | to_change = 1; | ||
188 | sysctl_ip_vs_secure_tcp = 2; | ||
189 | } else { | ||
190 | if (old_secure_tcp >= 2) | ||
191 | to_change = 0; | ||
192 | } | ||
193 | break; | ||
194 | case 2: | ||
195 | if (nomem) { | ||
196 | if (old_secure_tcp < 2) | ||
197 | to_change = 1; | ||
198 | } else { | ||
199 | if (old_secure_tcp >= 2) | ||
200 | to_change = 0; | ||
201 | sysctl_ip_vs_secure_tcp = 1; | ||
202 | } | ||
203 | break; | ||
204 | case 3: | ||
205 | if (old_secure_tcp < 2) | ||
206 | to_change = 1; | ||
207 | break; | ||
208 | } | ||
209 | old_secure_tcp = sysctl_ip_vs_secure_tcp; | ||
210 | if (to_change >= 0) | ||
211 | ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); | ||
212 | write_unlock(&__ip_vs_securetcp_lock); | ||
213 | |||
214 | local_bh_enable(); | ||
215 | } | ||
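
For the drop_packet modes above, the rate is amemthresh / (amemthresh - availmem): one out of every `rate` packets is dropped, and the rate shrinks toward 1 (drop everything) as available memory falls. A quick numeric sketch with the default amemthresh of 1024:

        #include <stdio.h>

        int main(void)
        {
                int amemthresh = 1024;                  /* sysctl default, in pages */

                /* drops grow more aggressive as availmem falls */
                for (int availmem = 900; availmem >= 0; availmem -= 300) {
                        int rate = amemthresh / (amemthresh - availmem);
                        printf("availmem=%4d -> drop 1 in %d packets\n",
                               availmem, rate);
                }
                return 0;
        }

This prints 1-in-8 at 900 free pages, 1-in-2 at 600, and 1-in-1 (every packet) once free memory is nearly exhausted.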
216 | |||
217 | |||
218 | /* | ||
219 | * Timer for checking the defense | ||
220 | */ | ||
221 | #define DEFENSE_TIMER_PERIOD 1*HZ | ||
222 | static void defense_work_handler(struct work_struct *work); | ||
223 | static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); | ||
224 | |||
225 | static void defense_work_handler(struct work_struct *work) | ||
226 | { | ||
227 | update_defense_level(); | ||
228 | if (atomic_read(&ip_vs_dropentry)) | ||
229 | ip_vs_random_dropentry(); | ||
230 | |||
231 | schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); | ||
232 | } | ||
233 | |||
234 | int | ||
235 | ip_vs_use_count_inc(void) | ||
236 | { | ||
237 | return try_module_get(THIS_MODULE); | ||
238 | } | ||
239 | |||
240 | void | ||
241 | ip_vs_use_count_dec(void) | ||
242 | { | ||
243 | module_put(THIS_MODULE); | ||
244 | } | ||
245 | |||
246 | |||
247 | /* | ||
248 | * Hash table: for virtual service lookups | ||
249 | */ | ||
250 | #define IP_VS_SVC_TAB_BITS 8 | ||
251 | #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) | ||
252 | #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) | ||
253 | |||
254 | /* the service table hashed by <protocol, addr, port> */ | ||
255 | static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; | ||
256 | /* the service table hashed by fwmark */ | ||
257 | static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; | ||
258 | |||
259 | /* | ||
260 | * Hash table: for real service lookups | ||
261 | */ | ||
262 | #define IP_VS_RTAB_BITS 4 | ||
263 | #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) | ||
264 | #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) | ||
265 | |||
266 | static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; | ||
267 | |||
268 | /* | ||
269 | * Trash for destinations | ||
270 | */ | ||
271 | static LIST_HEAD(ip_vs_dest_trash); | ||
272 | |||
273 | /* | ||
274 | * FTP & NULL virtual service counters | ||
275 | */ | ||
276 | static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); | ||
277 | static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); | ||
278 | |||
279 | |||
280 | /* | ||
281 | * Returns hash value for virtual service | ||
282 | */ | ||
283 | static __inline__ unsigned | ||
284 | ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port) | ||
285 | { | ||
286 | register unsigned porth = ntohs(port); | ||
287 | |||
288 | return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) | ||
289 | & IP_VS_SVC_TAB_MASK; | ||
290 | } | ||
291 | |||
292 | /* | ||
293 | * Returns hash value of fwmark for virtual service lookup | ||
294 | */ | ||
295 | static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) | ||
296 | { | ||
297 | return fwmark & IP_VS_SVC_TAB_MASK; | ||
298 | } | ||
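
Both hash keys above fold their inputs into the 256-bucket (2^IP_VS_SVC_TAB_BITS) range. A userspace sketch of the <protocol, addr, port> variant, with a local stand-in rather than the kernel function:

        #include <stdio.h>
        #include <stdint.h>
        #include <arpa/inet.h>

        #define TAB_BITS 8
        #define TAB_MASK ((1u << TAB_BITS) - 1)

        static unsigned svc_hashkey(unsigned proto, uint32_t addr_be, uint16_t port_be)
        {
                unsigned porth = ntohs(port_be);

                /* XOR-fold protocol, address and port, mask to table size */
                return (proto ^ ntohl(addr_be) ^ (porth >> TAB_BITS) ^ porth)
                        & TAB_MASK;
        }

        int main(void)
        {
                printf("bucket = %u\n",
                       svc_hashkey(6 /* TCP */, inet_addr("10.0.0.1"), htons(80)));
                return 0;
        }

The fwmark variant is simpler still: the mark is already a well-mixed integer, so masking with IP_VS_SVC_TAB_MASK suffices.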
299 | |||
300 | /* | ||
301 | * Hashes a service in the ip_vs_svc_table by <proto,addr,port> | ||
302 | * or in the ip_vs_svc_fwm_table by fwmark. | ||
303 | * Should be called with locked tables. | ||
304 | */ | ||
305 | static int ip_vs_svc_hash(struct ip_vs_service *svc) | ||
306 | { | ||
307 | unsigned hash; | ||
308 | |||
309 | if (svc->flags & IP_VS_SVC_F_HASHED) { | ||
310 | IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " | ||
311 | "called from %p\n", __builtin_return_address(0)); | ||
312 | return 0; | ||
313 | } | ||
314 | |||
315 | if (svc->fwmark == 0) { | ||
316 | /* | ||
317 | * Hash it by <protocol,addr,port> in ip_vs_svc_table | ||
318 | */ | ||
319 | hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); | ||
320 | list_add(&svc->s_list, &ip_vs_svc_table[hash]); | ||
321 | } else { | ||
322 | /* | ||
323 | * Hash it by fwmark in ip_vs_svc_fwm_table | ||
324 | */ | ||
325 | hash = ip_vs_svc_fwm_hashkey(svc->fwmark); | ||
326 | list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); | ||
327 | } | ||
328 | |||
329 | svc->flags |= IP_VS_SVC_F_HASHED; | ||
330 | /* increase its refcnt because it is referenced by the svc table */ | ||
331 | atomic_inc(&svc->refcnt); | ||
332 | return 1; | ||
333 | } | ||
334 | |||
335 | |||
336 | /* | ||
337 | * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. | ||
338 | * Should be called with locked tables. | ||
339 | */ | ||
340 | static int ip_vs_svc_unhash(struct ip_vs_service *svc) | ||
341 | { | ||
342 | if (!(svc->flags & IP_VS_SVC_F_HASHED)) { | ||
343 | IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " | ||
344 | "called from %p\n", __builtin_return_address(0)); | ||
345 | return 0; | ||
346 | } | ||
347 | |||
348 | if (svc->fwmark == 0) { | ||
349 | /* Remove it from the ip_vs_svc_table table */ | ||
350 | list_del(&svc->s_list); | ||
351 | } else { | ||
352 | /* Remove it from the ip_vs_svc_fwm_table table */ | ||
353 | list_del(&svc->f_list); | ||
354 | } | ||
355 | |||
356 | svc->flags &= ~IP_VS_SVC_F_HASHED; | ||
357 | atomic_dec(&svc->refcnt); | ||
358 | return 1; | ||
359 | } | ||
360 | |||
361 | |||
362 | /* | ||
363 | * Get service by {proto,addr,port} in the service table. | ||
364 | */ | ||
365 | static __inline__ struct ip_vs_service * | ||
366 | __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport) | ||
367 | { | ||
368 | unsigned hash; | ||
369 | struct ip_vs_service *svc; | ||
370 | |||
371 | /* Check for "full" addressed entries */ | ||
372 | hash = ip_vs_svc_hashkey(protocol, vaddr, vport); | ||
373 | |||
374 | list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ | ||
375 | if ((svc->addr == vaddr) | ||
376 | && (svc->port == vport) | ||
377 | && (svc->protocol == protocol)) { | ||
378 | /* HIT */ | ||
379 | atomic_inc(&svc->usecnt); | ||
380 | return svc; | ||
381 | } | ||
382 | } | ||
383 | |||
384 | return NULL; | ||
385 | } | ||
386 | |||
387 | |||
388 | /* | ||
389 | * Get service by {fwmark} in the service table. | ||
390 | */ | ||
391 | static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) | ||
392 | { | ||
393 | unsigned hash; | ||
394 | struct ip_vs_service *svc; | ||
395 | |||
396 | /* Check for fwmark addressed entries */ | ||
397 | hash = ip_vs_svc_fwm_hashkey(fwmark); | ||
398 | |||
399 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { | ||
400 | if (svc->fwmark == fwmark) { | ||
401 | /* HIT */ | ||
402 | atomic_inc(&svc->usecnt); | ||
403 | return svc; | ||
404 | } | ||
405 | } | ||
406 | |||
407 | return NULL; | ||
408 | } | ||
409 | |||
410 | struct ip_vs_service * | ||
411 | ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) | ||
412 | { | ||
413 | struct ip_vs_service *svc; | ||
414 | |||
415 | read_lock(&__ip_vs_svc_lock); | ||
416 | |||
417 | /* | ||
418 | * Check the table hashed by fwmark first | ||
419 | */ | ||
420 | if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark))) | ||
421 | goto out; | ||
422 | |||
423 | /* | ||
424 | * Check the table hashed by <protocol,addr,port> | ||
425 | * for "full" addressed entries | ||
426 | */ | ||
427 | svc = __ip_vs_service_get(protocol, vaddr, vport); | ||
428 | |||
429 | if (svc == NULL | ||
430 | && protocol == IPPROTO_TCP | ||
431 | && atomic_read(&ip_vs_ftpsvc_counter) | ||
432 | && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { | ||
433 | /* | ||
434 | * Check if ftp service entry exists, the packet | ||
435 | * might belong to FTP data connections. | ||
436 | */ | ||
437 | svc = __ip_vs_service_get(protocol, vaddr, FTPPORT); | ||
438 | } | ||
439 | |||
440 | if (svc == NULL | ||
441 | && atomic_read(&ip_vs_nullsvc_counter)) { | ||
442 | /* | ||
443 | * Check if the catch-all port (port zero) exists | ||
444 | */ | ||
445 | svc = __ip_vs_service_get(protocol, vaddr, 0); | ||
446 | } | ||
447 | |||
448 | out: | ||
449 | read_unlock(&__ip_vs_svc_lock); | ||
450 | |||
451 | IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", | ||
452 | fwmark, ip_vs_proto_name(protocol), | ||
453 | NIPQUAD(vaddr), ntohs(vport), | ||
454 | svc?"hit":"not hit"); | ||
455 | |||
456 | return svc; | ||
457 | } | ||
458 | |||
459 | |||
460 | static inline void | ||
461 | __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) | ||
462 | { | ||
463 | atomic_inc(&svc->refcnt); | ||
464 | dest->svc = svc; | ||
465 | } | ||
466 | |||
467 | static inline void | ||
468 | __ip_vs_unbind_svc(struct ip_vs_dest *dest) | ||
469 | { | ||
470 | struct ip_vs_service *svc = dest->svc; | ||
471 | |||
472 | dest->svc = NULL; | ||
473 | if (atomic_dec_and_test(&svc->refcnt)) | ||
474 | kfree(svc); | ||
475 | } | ||
476 | |||
477 | |||
478 | /* | ||
479 | * Returns hash value for real service | ||
480 | */ | ||
481 | static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port) | ||
482 | { | ||
483 | register unsigned porth = ntohs(port); | ||
484 | |||
485 | return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) | ||
486 | & IP_VS_RTAB_MASK; | ||
487 | } | ||
488 | |||
489 | /* | ||
490 | * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>. | ||
491 | * should be called with locked tables. | ||
492 | */ | ||
493 | static int ip_vs_rs_hash(struct ip_vs_dest *dest) | ||
494 | { | ||
495 | unsigned hash; | ||
496 | |||
497 | if (!list_empty(&dest->d_list)) { | ||
498 | return 0; | ||
499 | } | ||
500 | |||
501 | /* | ||
502 | * Hash by proto,addr,port, | ||
503 | * which are the parameters of the real service. | ||
504 | */ | ||
505 | hash = ip_vs_rs_hashkey(dest->addr, dest->port); | ||
506 | list_add(&dest->d_list, &ip_vs_rtable[hash]); | ||
507 | |||
508 | return 1; | ||
509 | } | ||
510 | |||
511 | /* | ||
512 | * UNhashes ip_vs_dest from ip_vs_rtable. | ||
513 | * should be called with locked tables. | ||
514 | */ | ||
515 | static int ip_vs_rs_unhash(struct ip_vs_dest *dest) | ||
516 | { | ||
517 | /* | ||
518 | * Remove it from the ip_vs_rtable table. | ||
519 | */ | ||
520 | if (!list_empty(&dest->d_list)) { | ||
521 | list_del(&dest->d_list); | ||
522 | INIT_LIST_HEAD(&dest->d_list); | ||
523 | } | ||
524 | |||
525 | return 1; | ||
526 | } | ||
527 | |||
528 | /* | ||
529 | * Lookup real service by <proto,addr,port> in the real service table. | ||
530 | */ | ||
531 | struct ip_vs_dest * | ||
532 | ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport) | ||
533 | { | ||
534 | unsigned hash; | ||
535 | struct ip_vs_dest *dest; | ||
536 | |||
537 | /* | ||
538 | * Check for "full" addressed entries | ||
539 | * Return the first found entry | ||
540 | */ | ||
541 | hash = ip_vs_rs_hashkey(daddr, dport); | ||
542 | |||
543 | read_lock(&__ip_vs_rs_lock); | ||
544 | list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { | ||
545 | if ((dest->addr == daddr) | ||
546 | && (dest->port == dport) | ||
547 | && ((dest->protocol == protocol) || | ||
548 | dest->vfwmark)) { | ||
549 | /* HIT */ | ||
550 | read_unlock(&__ip_vs_rs_lock); | ||
551 | return dest; | ||
552 | } | ||
553 | } | ||
554 | read_unlock(&__ip_vs_rs_lock); | ||
555 | |||
556 | return NULL; | ||
557 | } | ||
558 | |||
559 | /* | ||
560 | * Lookup destination by {addr,port} in the given service | ||
561 | */ | ||
562 | static struct ip_vs_dest * | ||
563 | ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) | ||
564 | { | ||
565 | struct ip_vs_dest *dest; | ||
566 | |||
567 | /* | ||
568 | * Find the destination for the given service | ||
569 | */ | ||
570 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
571 | if ((dest->addr == daddr) && (dest->port == dport)) { | ||
572 | /* HIT */ | ||
573 | return dest; | ||
574 | } | ||
575 | } | ||
576 | |||
577 | return NULL; | ||
578 | } | ||
579 | |||
580 | /* | ||
581 | * Find destination by {daddr,dport,vaddr,protocol} | ||
582 | * Created to be used in ip_vs_process_message() in | ||
583 | * the backup synchronization daemon. It finds the | ||
584 | * destination to be bound to the received connection | ||
585 | * on the backup. | ||
586 | * | ||
587 | * ip_vs_lookup_real_service() looked promising, but | ||
588 | * does not seem to work as expected. | ||
589 | */ | ||
590 | struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, | ||
591 | __be32 vaddr, __be16 vport, __u16 protocol) | ||
592 | { | ||
593 | struct ip_vs_dest *dest; | ||
594 | struct ip_vs_service *svc; | ||
595 | |||
596 | svc = ip_vs_service_get(0, protocol, vaddr, vport); | ||
597 | if (!svc) | ||
598 | return NULL; | ||
599 | dest = ip_vs_lookup_dest(svc, daddr, dport); | ||
600 | if (dest) | ||
601 | atomic_inc(&dest->refcnt); | ||
602 | ip_vs_service_put(svc); | ||
603 | return dest; | ||
604 | } | ||
605 | |||
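A hedged sketch of the intended call site: when the backup sync daemon rebuilds a connection from a received sync message, it can resolve and pin the real server as below. The sync_conn_params structure and its field names are illustrative, not the actual sync-message layout:

	/* in-kernel sketch; field names are assumptions */
	struct sync_conn_params {
		__u16  protocol;
		__be32 vaddr, daddr;
		__be16 vport, dport;
	};

	static struct ip_vs_dest *bind_backup_conn(const struct sync_conn_params *p)
	{
		struct ip_vs_dest *dest;

		dest = ip_vs_find_dest(p->daddr, p->dport,
				       p->vaddr, p->vport, p->protocol);
		/* on success, the refcnt taken by ip_vs_find_dest() pins
		 * the dest; the caller must drop it when the connection
		 * entry is released */
		return dest;
	}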
606 | /* | ||
607 | * Lookup dest by {svc,addr,port} in the destination trash. | ||
608 | * The destination trash is used to hold the destinations that are removed | ||
609 | * from the service table but are still referenced by some conn entries. | ||
610 | * The reason for the destination trash is that when a dest is temporarily | ||
611 | * down (taken down either by the administrator or by a monitor program), | ||
612 | * it can be picked back from the trash, the remaining connections to it can | ||
613 | * continue, and the counting information of the dest is also useful for | ||
614 | * scheduling. | ||
615 | */ | ||
616 | static struct ip_vs_dest * | ||
617 | ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) | ||
618 | { | ||
619 | struct ip_vs_dest *dest, *nxt; | ||
620 | |||
621 | /* | ||
622 | * Find the destination in trash | ||
623 | */ | ||
624 | list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { | ||
625 | IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " | ||
626 | "dest->refcnt=%d\n", | ||
627 | dest->vfwmark, | ||
628 | NIPQUAD(dest->addr), ntohs(dest->port), | ||
629 | atomic_read(&dest->refcnt)); | ||
630 | if (dest->addr == daddr && | ||
631 | dest->port == dport && | ||
632 | dest->vfwmark == svc->fwmark && | ||
633 | dest->protocol == svc->protocol && | ||
634 | (svc->fwmark || | ||
635 | (dest->vaddr == svc->addr && | ||
636 | dest->vport == svc->port))) { | ||
637 | /* HIT */ | ||
638 | return dest; | ||
639 | } | ||
640 | |||
641 | /* | ||
642 | * Try to purge the destination from trash if not referenced | ||
643 | */ | ||
644 | if (atomic_read(&dest->refcnt) == 1) { | ||
645 | IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u " | ||
646 | "from trash\n", | ||
647 | dest->vfwmark, | ||
648 | NIPQUAD(dest->addr), ntohs(dest->port)); | ||
649 | list_del(&dest->n_list); | ||
650 | ip_vs_dst_reset(dest); | ||
651 | __ip_vs_unbind_svc(dest); | ||
652 | kfree(dest); | ||
653 | } | ||
654 | } | ||
655 | |||
656 | return NULL; | ||
657 | } | ||
658 | |||
659 | |||
660 | /* | ||
661 | * Clean up all the destinations in the trash | ||
662 | * Called by the ip_vs_control_cleanup() | ||
663 | * | ||
664 | * When ip_vs_control_cleanup() is invoked on ipvs module exit, | ||
665 | * the service tables must have been flushed and all the connections | ||
666 | * must have expired, and the refcnt of each destination in the trash | ||
667 | * must be 1, so we simply release them here. | ||
668 | */ | ||
669 | static void ip_vs_trash_cleanup(void) | ||
670 | { | ||
671 | struct ip_vs_dest *dest, *nxt; | ||
672 | |||
673 | list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { | ||
674 | list_del(&dest->n_list); | ||
675 | ip_vs_dst_reset(dest); | ||
676 | __ip_vs_unbind_svc(dest); | ||
677 | kfree(dest); | ||
678 | } | ||
679 | } | ||
680 | |||
681 | |||
682 | static void | ||
683 | ip_vs_zero_stats(struct ip_vs_stats *stats) | ||
684 | { | ||
685 | spin_lock_bh(&stats->lock); | ||
686 | |||
687 | stats->conns = 0; | ||
688 | stats->inpkts = 0; | ||
689 | stats->outpkts = 0; | ||
690 | stats->inbytes = 0; | ||
691 | stats->outbytes = 0; | ||
692 | |||
693 | stats->cps = 0; | ||
694 | stats->inpps = 0; | ||
695 | stats->outpps = 0; | ||
696 | stats->inbps = 0; | ||
697 | stats->outbps = 0; | ||
698 | |||
699 | ip_vs_zero_estimator(stats); | ||
700 | |||
701 | spin_unlock_bh(&stats->lock); | ||
702 | } | ||
703 | |||
704 | /* | ||
705 | * Update a destination in the given service | ||
706 | */ | ||
707 | static void | ||
708 | __ip_vs_update_dest(struct ip_vs_service *svc, | ||
709 | struct ip_vs_dest *dest, struct ip_vs_dest_user *udest) | ||
710 | { | ||
711 | int conn_flags; | ||
712 | |||
713 | /* set the weight and the flags */ | ||
714 | atomic_set(&dest->weight, udest->weight); | ||
715 | conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; | ||
716 | |||
717 | /* check if local node and update the flags */ | ||
718 | if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) { | ||
719 | conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) | ||
720 | | IP_VS_CONN_F_LOCALNODE; | ||
721 | } | ||
722 | |||
723 | /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ | ||
724 | if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { | ||
725 | conn_flags |= IP_VS_CONN_F_NOOUTPUT; | ||
726 | } else { | ||
727 | /* | ||
728 | * Put the real service in ip_vs_rtable if not present. | ||
729 | * For now only for NAT! | ||
730 | */ | ||
731 | write_lock_bh(&__ip_vs_rs_lock); | ||
732 | ip_vs_rs_hash(dest); | ||
733 | write_unlock_bh(&__ip_vs_rs_lock); | ||
734 | } | ||
735 | atomic_set(&dest->conn_flags, conn_flags); | ||
736 | |||
737 | /* bind the service */ | ||
738 | if (!dest->svc) { | ||
739 | __ip_vs_bind_svc(dest, svc); | ||
740 | } else { | ||
741 | if (dest->svc != svc) { | ||
742 | __ip_vs_unbind_svc(dest); | ||
743 | ip_vs_zero_stats(&dest->stats); | ||
744 | __ip_vs_bind_svc(dest, svc); | ||
745 | } | ||
746 | } | ||
747 | |||
748 | /* set the dest status flags */ | ||
749 | dest->flags |= IP_VS_DEST_F_AVAILABLE; | ||
750 | |||
751 | if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) | ||
752 | dest->flags &= ~IP_VS_DEST_F_OVERLOAD; | ||
753 | dest->u_threshold = udest->u_threshold; | ||
754 | dest->l_threshold = udest->l_threshold; | ||
755 | } | ||
756 | |||
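The "!= 0" test above works because masquerading (NAT) is encoded as the all-zero forwarding method within IP_VS_CONN_F_FWD_MASK. A sketch of that encoding; the numeric values mirror ip_vs.h of this era and should be treated as illustrative here:

	#define IP_VS_CONN_F_FWD_MASK   0x0007	/* forwarding method bits */
	#define IP_VS_CONN_F_MASQ       0x0000	/* masquerading (NAT)     */
	#define IP_VS_CONN_F_LOCALNODE  0x0001	/* local node             */
	#define IP_VS_CONN_F_TUNNEL     0x0002	/* tunneling              */
	#define IP_VS_CONN_F_DROUTE     0x0003	/* direct routing         */

	static inline int fwd_method_is_nat(int conn_flags)
	{
		/* NAT is the all-zero method, so the "!= 0" test above
		 * matches every non-NAT forwarding method */
		return (conn_flags & IP_VS_CONN_F_FWD_MASK) == IP_VS_CONN_F_MASQ;
	}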
757 | |||
758 | /* | ||
759 | * Create a destination for the given service | ||
760 | */ | ||
761 | static int | ||
762 | ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, | ||
763 | struct ip_vs_dest **dest_p) | ||
764 | { | ||
765 | struct ip_vs_dest *dest; | ||
766 | unsigned atype; | ||
767 | |||
768 | EnterFunction(2); | ||
769 | |||
770 | atype = inet_addr_type(&init_net, udest->addr); | ||
771 | if (atype != RTN_LOCAL && atype != RTN_UNICAST) | ||
772 | return -EINVAL; | ||
773 | |||
774 | dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); | ||
775 | if (dest == NULL) { | ||
776 | IP_VS_ERR("ip_vs_new_dest: kzalloc failed.\n"); | ||
777 | return -ENOMEM; | ||
778 | } | ||
779 | |||
780 | dest->protocol = svc->protocol; | ||
781 | dest->vaddr = svc->addr; | ||
782 | dest->vport = svc->port; | ||
783 | dest->vfwmark = svc->fwmark; | ||
784 | dest->addr = udest->addr; | ||
785 | dest->port = udest->port; | ||
786 | |||
787 | atomic_set(&dest->activeconns, 0); | ||
788 | atomic_set(&dest->inactconns, 0); | ||
789 | atomic_set(&dest->persistconns, 0); | ||
790 | atomic_set(&dest->refcnt, 0); | ||
791 | |||
792 | INIT_LIST_HEAD(&dest->d_list); | ||
793 | spin_lock_init(&dest->dst_lock); | ||
794 | spin_lock_init(&dest->stats.lock); | ||
795 | __ip_vs_update_dest(svc, dest, udest); | ||
796 | ip_vs_new_estimator(&dest->stats); | ||
797 | |||
798 | *dest_p = dest; | ||
799 | |||
800 | LeaveFunction(2); | ||
801 | return 0; | ||
802 | } | ||
803 | |||
804 | |||
805 | /* | ||
806 | * Add a destination into an existing service | ||
807 | */ | ||
808 | static int | ||
809 | ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) | ||
810 | { | ||
811 | struct ip_vs_dest *dest; | ||
812 | __be32 daddr = udest->addr; | ||
813 | __be16 dport = udest->port; | ||
814 | int ret; | ||
815 | |||
816 | EnterFunction(2); | ||
817 | |||
818 | if (udest->weight < 0) { | ||
819 | IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); | ||
820 | return -ERANGE; | ||
821 | } | ||
822 | |||
823 | if (udest->l_threshold > udest->u_threshold) { | ||
824 | IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " | ||
825 | "upper threshold\n"); | ||
826 | return -ERANGE; | ||
827 | } | ||
828 | |||
829 | /* | ||
830 | * Check if the dest already exists in the list | ||
831 | */ | ||
832 | dest = ip_vs_lookup_dest(svc, daddr, dport); | ||
833 | if (dest != NULL) { | ||
834 | IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); | ||
835 | return -EEXIST; | ||
836 | } | ||
837 | |||
838 | /* | ||
839 | * Check if the dest already exists in the trash and | ||
840 | * is from the same service | ||
841 | */ | ||
842 | dest = ip_vs_trash_get_dest(svc, daddr, dport); | ||
843 | if (dest != NULL) { | ||
844 | IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " | ||
845 | "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", | ||
846 | NIPQUAD(daddr), ntohs(dport), | ||
847 | atomic_read(&dest->refcnt), | ||
848 | dest->vfwmark, | ||
849 | NIPQUAD(dest->vaddr), | ||
850 | ntohs(dest->vport)); | ||
851 | __ip_vs_update_dest(svc, dest, udest); | ||
852 | |||
853 | /* | ||
854 | * Get the destination from the trash | ||
855 | */ | ||
856 | list_del(&dest->n_list); | ||
857 | |||
858 | ip_vs_new_estimator(&dest->stats); | ||
859 | |||
860 | write_lock_bh(&__ip_vs_svc_lock); | ||
861 | |||
862 | /* | ||
863 | * Wait until all other svc users go away. | ||
864 | */ | ||
865 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
866 | |||
867 | list_add(&dest->n_list, &svc->destinations); | ||
868 | svc->num_dests++; | ||
869 | |||
870 | /* call the update_service function of its scheduler */ | ||
871 | svc->scheduler->update_service(svc); | ||
872 | |||
873 | write_unlock_bh(&__ip_vs_svc_lock); | ||
874 | return 0; | ||
875 | } | ||
876 | |||
877 | /* | ||
878 | * Allocate and initialize the dest structure | ||
879 | */ | ||
880 | ret = ip_vs_new_dest(svc, udest, &dest); | ||
881 | if (ret) { | ||
882 | return ret; | ||
883 | } | ||
884 | |||
885 | /* | ||
886 | * Add the dest entry into the list | ||
887 | */ | ||
888 | atomic_inc(&dest->refcnt); | ||
889 | |||
890 | write_lock_bh(&__ip_vs_svc_lock); | ||
891 | |||
892 | /* | ||
893 | * Wait until all other svc users go away. | ||
894 | */ | ||
895 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
896 | |||
897 | list_add(&dest->n_list, &svc->destinations); | ||
898 | svc->num_dests++; | ||
899 | |||
900 | /* call the update_service function of its scheduler */ | ||
901 | svc->scheduler->update_service(svc); | ||
902 | |||
903 | write_unlock_bh(&__ip_vs_svc_lock); | ||
904 | |||
905 | LeaveFunction(2); | ||
906 | |||
907 | return 0; | ||
908 | } | ||
909 | |||
910 | |||
911 | /* | ||
912 | * Edit a destination in the given service | ||
913 | */ | ||
914 | static int | ||
915 | ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) | ||
916 | { | ||
917 | struct ip_vs_dest *dest; | ||
918 | __be32 daddr = udest->addr; | ||
919 | __be16 dport = udest->port; | ||
920 | |||
921 | EnterFunction(2); | ||
922 | |||
923 | if (udest->weight < 0) { | ||
924 | IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); | ||
925 | return -ERANGE; | ||
926 | } | ||
927 | |||
928 | if (udest->l_threshold > udest->u_threshold) { | ||
929 | IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " | ||
930 | "upper threshold\n"); | ||
931 | return -ERANGE; | ||
932 | } | ||
933 | |||
934 | /* | ||
935 | * Lookup the destination list | ||
936 | */ | ||
937 | dest = ip_vs_lookup_dest(svc, daddr, dport); | ||
938 | if (dest == NULL) { | ||
939 | IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); | ||
940 | return -ENOENT; | ||
941 | } | ||
942 | |||
943 | __ip_vs_update_dest(svc, dest, udest); | ||
944 | |||
945 | write_lock_bh(&__ip_vs_svc_lock); | ||
946 | |||
947 | /* Wait until all other svc users go away */ | ||
948 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
949 | |||
950 | /* call the update_service, because server weight may be changed */ | ||
951 | svc->scheduler->update_service(svc); | ||
952 | |||
953 | write_unlock_bh(&__ip_vs_svc_lock); | ||
954 | |||
955 | LeaveFunction(2); | ||
956 | |||
957 | return 0; | ||
958 | } | ||
959 | |||
960 | |||
961 | /* | ||
962 | * Delete a destination (must be already unlinked from the service) | ||
963 | */ | ||
964 | static void __ip_vs_del_dest(struct ip_vs_dest *dest) | ||
965 | { | ||
966 | ip_vs_kill_estimator(&dest->stats); | ||
967 | |||
968 | /* | ||
969 | * Remove it from the d-linked list with the real services. | ||
970 | */ | ||
971 | write_lock_bh(&__ip_vs_rs_lock); | ||
972 | ip_vs_rs_unhash(dest); | ||
973 | write_unlock_bh(&__ip_vs_rs_lock); | ||
974 | |||
975 | /* | ||
976 | * Decrease the refcnt of the dest, and free the dest | ||
977 | * if nobody refers to it (refcnt=0). Otherwise, throw | ||
978 | * the destination into the trash. | ||
979 | */ | ||
980 | if (atomic_dec_and_test(&dest->refcnt)) { | ||
981 | ip_vs_dst_reset(dest); | ||
982 | /* simply decrease svc->refcnt here, let the caller check | ||
983 | and release the service if nobody refers to it. | ||
984 | Only user context can release destination and service, | ||
985 | and only one user context can update virtual service at a | ||
986 | time, so the operation here is OK */ | ||
987 | atomic_dec(&dest->svc->refcnt); | ||
988 | kfree(dest); | ||
989 | } else { | ||
990 | IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " | ||
991 | "dest->refcnt=%d\n", | ||
992 | NIPQUAD(dest->addr), ntohs(dest->port), | ||
993 | atomic_read(&dest->refcnt)); | ||
994 | list_add(&dest->n_list, &ip_vs_dest_trash); | ||
995 | atomic_inc(&dest->refcnt); | ||
996 | } | ||
997 | } | ||
998 | |||
999 | |||
1000 | /* | ||
1001 | * Unlink a destination from the given service | ||
1002 | */ | ||
1003 | static void __ip_vs_unlink_dest(struct ip_vs_service *svc, | ||
1004 | struct ip_vs_dest *dest, | ||
1005 | int svcupd) | ||
1006 | { | ||
1007 | dest->flags &= ~IP_VS_DEST_F_AVAILABLE; | ||
1008 | |||
1009 | /* | ||
1010 | * Remove it from the d-linked destination list. | ||
1011 | */ | ||
1012 | list_del(&dest->n_list); | ||
1013 | svc->num_dests--; | ||
1014 | if (svcupd) { | ||
1015 | /* | ||
1016 | * Call the update_service function of its scheduler | ||
1017 | */ | ||
1018 | svc->scheduler->update_service(svc); | ||
1019 | } | ||
1020 | } | ||
1021 | |||
1022 | |||
1023 | /* | ||
1024 | * Delete a destination server in the given service | ||
1025 | */ | ||
1026 | static int | ||
1027 | ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest) | ||
1028 | { | ||
1029 | struct ip_vs_dest *dest; | ||
1030 | __be32 daddr = udest->addr; | ||
1031 | __be16 dport = udest->port; | ||
1032 | |||
1033 | EnterFunction(2); | ||
1034 | |||
1035 | dest = ip_vs_lookup_dest(svc, daddr, dport); | ||
1036 | if (dest == NULL) { | ||
1037 | IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); | ||
1038 | return -ENOENT; | ||
1039 | } | ||
1040 | |||
1041 | write_lock_bh(&__ip_vs_svc_lock); | ||
1042 | |||
1043 | /* | ||
1044 | * Wait until all other svc users go away. | ||
1045 | */ | ||
1046 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
1047 | |||
1048 | /* | ||
1049 | * Unlink dest from the service | ||
1050 | */ | ||
1051 | __ip_vs_unlink_dest(svc, dest, 1); | ||
1052 | |||
1053 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1054 | |||
1055 | /* | ||
1056 | * Delete the destination | ||
1057 | */ | ||
1058 | __ip_vs_del_dest(dest); | ||
1059 | |||
1060 | LeaveFunction(2); | ||
1061 | |||
1062 | return 0; | ||
1063 | } | ||
1064 | |||
1065 | |||
1066 | /* | ||
1067 | * Add a service into the service hash table | ||
1068 | */ | ||
1069 | static int | ||
1070 | ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) | ||
1071 | { | ||
1072 | int ret = 0; | ||
1073 | struct ip_vs_scheduler *sched = NULL; | ||
1074 | struct ip_vs_service *svc = NULL; | ||
1075 | |||
1076 | /* increase the module use count */ | ||
1077 | ip_vs_use_count_inc(); | ||
1078 | |||
1079 | /* Lookup the scheduler by 'u->sched_name' */ | ||
1080 | sched = ip_vs_scheduler_get(u->sched_name); | ||
1081 | if (sched == NULL) { | ||
1082 | IP_VS_INFO("Scheduler module ip_vs_%s not found\n", | ||
1083 | u->sched_name); | ||
1084 | ret = -ENOENT; | ||
1085 | goto out_mod_dec; | ||
1086 | } | ||
1087 | |||
1088 | svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); | ||
1089 | if (svc == NULL) { | ||
1090 | IP_VS_DBG(1, "ip_vs_add_service: kzalloc failed.\n"); | ||
1091 | ret = -ENOMEM; | ||
1092 | goto out_err; | ||
1093 | } | ||
1094 | |||
1095 | /* I'm the first user of the service */ | ||
1096 | atomic_set(&svc->usecnt, 1); | ||
1097 | atomic_set(&svc->refcnt, 0); | ||
1098 | |||
1099 | svc->protocol = u->protocol; | ||
1100 | svc->addr = u->addr; | ||
1101 | svc->port = u->port; | ||
1102 | svc->fwmark = u->fwmark; | ||
1103 | svc->flags = u->flags; | ||
1104 | svc->timeout = u->timeout * HZ; | ||
1105 | svc->netmask = u->netmask; | ||
1106 | |||
1107 | INIT_LIST_HEAD(&svc->destinations); | ||
1108 | rwlock_init(&svc->sched_lock); | ||
1109 | spin_lock_init(&svc->stats.lock); | ||
1110 | |||
1111 | /* Bind the scheduler */ | ||
1112 | ret = ip_vs_bind_scheduler(svc, sched); | ||
1113 | if (ret) | ||
1114 | goto out_err; | ||
1115 | sched = NULL; | ||
1116 | |||
1117 | /* Update the virtual service counters */ | ||
1118 | if (svc->port == FTPPORT) | ||
1119 | atomic_inc(&ip_vs_ftpsvc_counter); | ||
1120 | else if (svc->port == 0) | ||
1121 | atomic_inc(&ip_vs_nullsvc_counter); | ||
1122 | |||
1123 | ip_vs_new_estimator(&svc->stats); | ||
1124 | ip_vs_num_services++; | ||
1125 | |||
1126 | /* Hash the service into the service table */ | ||
1127 | write_lock_bh(&__ip_vs_svc_lock); | ||
1128 | ip_vs_svc_hash(svc); | ||
1129 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1130 | |||
1131 | *svc_p = svc; | ||
1132 | return 0; | ||
1133 | |||
1134 | out_err: | ||
1135 | if (svc != NULL) { | ||
1136 | if (svc->scheduler) | ||
1137 | ip_vs_unbind_scheduler(svc); | ||
1138 | if (svc->inc) { | ||
1139 | local_bh_disable(); | ||
1140 | ip_vs_app_inc_put(svc->inc); | ||
1141 | local_bh_enable(); | ||
1142 | } | ||
1143 | kfree(svc); | ||
1144 | } | ||
1145 | ip_vs_scheduler_put(sched); | ||
1146 | |||
1147 | out_mod_dec: | ||
1148 | /* decrease the module use count */ | ||
1149 | ip_vs_use_count_dec(); | ||
1150 | |||
1151 | return ret; | ||
1152 | } | ||
1153 | |||
1154 | |||
1155 | /* | ||
1156 | * Edit a service and bind it with a new scheduler | ||
1157 | */ | ||
1158 | static int | ||
1159 | ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u) | ||
1160 | { | ||
1161 | struct ip_vs_scheduler *sched, *old_sched; | ||
1162 | int ret = 0; | ||
1163 | |||
1164 | /* | ||
1165 | * Lookup the scheduler, by 'u->sched_name' | ||
1166 | */ | ||
1167 | sched = ip_vs_scheduler_get(u->sched_name); | ||
1168 | if (sched == NULL) { | ||
1169 | IP_VS_INFO("Scheduler module ip_vs_%s not found\n", | ||
1170 | u->sched_name); | ||
1171 | return -ENOENT; | ||
1172 | } | ||
1173 | old_sched = sched; | ||
1174 | |||
1175 | write_lock_bh(&__ip_vs_svc_lock); | ||
1176 | |||
1177 | /* | ||
1178 | * Wait until all other svc users go away. | ||
1179 | */ | ||
1180 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
1181 | |||
1182 | /* | ||
1183 | * Set the flags and timeout value | ||
1184 | */ | ||
1185 | svc->flags = u->flags | IP_VS_SVC_F_HASHED; | ||
1186 | svc->timeout = u->timeout * HZ; | ||
1187 | svc->netmask = u->netmask; | ||
1188 | |||
1189 | old_sched = svc->scheduler; | ||
1190 | if (sched != old_sched) { | ||
1191 | /* | ||
1192 | * Unbind the old scheduler | ||
1193 | */ | ||
1194 | if ((ret = ip_vs_unbind_scheduler(svc))) { | ||
1195 | old_sched = sched; | ||
1196 | goto out; | ||
1197 | } | ||
1198 | |||
1199 | /* | ||
1200 | * Bind the new scheduler | ||
1201 | */ | ||
1202 | if ((ret = ip_vs_bind_scheduler(svc, sched))) { | ||
1203 | /* | ||
1204 | * If ip_vs_bind_scheduler fails, restore the old | ||
1205 | * scheduler. | ||
1206 | * The main reason for failure is lack of memory. | ||
1207 | * | ||
1208 | * The question is whether the old scheduler can | ||
1209 | * always be restored. TODO: if it cannot be restored | ||
1210 | * at some point, we must delete the service; | ||
1211 | * otherwise the system may crash. | ||
1212 | */ | ||
1213 | ip_vs_bind_scheduler(svc, old_sched); | ||
1214 | old_sched = sched; | ||
1215 | goto out; | ||
1216 | } | ||
1217 | } | ||
1218 | |||
1219 | out: | ||
1220 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1221 | |||
1222 | if (old_sched) | ||
1223 | ip_vs_scheduler_put(old_sched); | ||
1224 | |||
1225 | return ret; | ||
1226 | } | ||
1227 | |||
1228 | |||
1229 | /* | ||
1230 | * Delete a service from the service list | ||
1231 | * - The service must be unlinked, unlocked and not referenced! | ||
1232 | * - We are called under _bh lock | ||
1233 | */ | ||
1234 | static void __ip_vs_del_service(struct ip_vs_service *svc) | ||
1235 | { | ||
1236 | struct ip_vs_dest *dest, *nxt; | ||
1237 | struct ip_vs_scheduler *old_sched; | ||
1238 | |||
1239 | ip_vs_num_services--; | ||
1240 | ip_vs_kill_estimator(&svc->stats); | ||
1241 | |||
1242 | /* Unbind scheduler */ | ||
1243 | old_sched = svc->scheduler; | ||
1244 | ip_vs_unbind_scheduler(svc); | ||
1245 | if (old_sched) | ||
1246 | ip_vs_scheduler_put(old_sched); | ||
1247 | |||
1248 | /* Unbind app inc */ | ||
1249 | if (svc->inc) { | ||
1250 | ip_vs_app_inc_put(svc->inc); | ||
1251 | svc->inc = NULL; | ||
1252 | } | ||
1253 | |||
1254 | /* | ||
1255 | * Unlink the whole destination list | ||
1256 | */ | ||
1257 | list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { | ||
1258 | __ip_vs_unlink_dest(svc, dest, 0); | ||
1259 | __ip_vs_del_dest(dest); | ||
1260 | } | ||
1261 | |||
1262 | /* | ||
1263 | * Update the virtual service counters | ||
1264 | */ | ||
1265 | if (svc->port == FTPPORT) | ||
1266 | atomic_dec(&ip_vs_ftpsvc_counter); | ||
1267 | else if (svc->port == 0) | ||
1268 | atomic_dec(&ip_vs_nullsvc_counter); | ||
1269 | |||
1270 | /* | ||
1271 | * Free the service if nobody refers to it | ||
1272 | */ | ||
1273 | if (atomic_read(&svc->refcnt) == 0) | ||
1274 | kfree(svc); | ||
1275 | |||
1276 | /* decrease the module use count */ | ||
1277 | ip_vs_use_count_dec(); | ||
1278 | } | ||
1279 | |||
1280 | /* | ||
1281 | * Delete a service from the service list | ||
1282 | */ | ||
1283 | static int ip_vs_del_service(struct ip_vs_service *svc) | ||
1284 | { | ||
1285 | if (svc == NULL) | ||
1286 | return -EEXIST; | ||
1287 | |||
1288 | /* | ||
1289 | * Unhash it from the service table | ||
1290 | */ | ||
1291 | write_lock_bh(&__ip_vs_svc_lock); | ||
1292 | |||
1293 | ip_vs_svc_unhash(svc); | ||
1294 | |||
1295 | /* | ||
1296 | * Wait until all the svc users go away. | ||
1297 | */ | ||
1298 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); | ||
1299 | |||
1300 | __ip_vs_del_service(svc); | ||
1301 | |||
1302 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1303 | |||
1304 | return 0; | ||
1305 | } | ||
1306 | |||
1307 | |||
1308 | /* | ||
1309 | * Flush all the virtual services | ||
1310 | */ | ||
1311 | static int ip_vs_flush(void) | ||
1312 | { | ||
1313 | int idx; | ||
1314 | struct ip_vs_service *svc, *nxt; | ||
1315 | |||
1316 | /* | ||
1317 | * Flush the service table hashed by <protocol,addr,port> | ||
1318 | */ | ||
1319 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1320 | list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { | ||
1321 | write_lock_bh(&__ip_vs_svc_lock); | ||
1322 | ip_vs_svc_unhash(svc); | ||
1323 | /* | ||
1324 | * Wait until all the svc users go away. | ||
1325 | */ | ||
1326 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); | ||
1327 | __ip_vs_del_service(svc); | ||
1328 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1329 | } | ||
1330 | } | ||
1331 | |||
1332 | /* | ||
1333 | * Flush the service table hashed by fwmark | ||
1334 | */ | ||
1335 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1336 | list_for_each_entry_safe(svc, nxt, | ||
1337 | &ip_vs_svc_fwm_table[idx], f_list) { | ||
1338 | write_lock_bh(&__ip_vs_svc_lock); | ||
1339 | ip_vs_svc_unhash(svc); | ||
1340 | /* | ||
1341 | * Wait until all the svc users go away. | ||
1342 | */ | ||
1343 | IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); | ||
1344 | __ip_vs_del_service(svc); | ||
1345 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1346 | } | ||
1347 | } | ||
1348 | |||
1349 | return 0; | ||
1350 | } | ||
1351 | |||
1352 | |||
1353 | /* | ||
1354 | * Zero counters in a service or all services | ||
1355 | */ | ||
1356 | static int ip_vs_zero_service(struct ip_vs_service *svc) | ||
1357 | { | ||
1358 | struct ip_vs_dest *dest; | ||
1359 | |||
1360 | write_lock_bh(&__ip_vs_svc_lock); | ||
1361 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
1362 | ip_vs_zero_stats(&dest->stats); | ||
1363 | } | ||
1364 | ip_vs_zero_stats(&svc->stats); | ||
1365 | write_unlock_bh(&__ip_vs_svc_lock); | ||
1366 | return 0; | ||
1367 | } | ||
1368 | |||
1369 | static int ip_vs_zero_all(void) | ||
1370 | { | ||
1371 | int idx; | ||
1372 | struct ip_vs_service *svc; | ||
1373 | |||
1374 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1375 | list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { | ||
1376 | ip_vs_zero_service(svc); | ||
1377 | } | ||
1378 | } | ||
1379 | |||
1380 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1381 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { | ||
1382 | ip_vs_zero_service(svc); | ||
1383 | } | ||
1384 | } | ||
1385 | |||
1386 | ip_vs_zero_stats(&ip_vs_stats); | ||
1387 | return 0; | ||
1388 | } | ||
1389 | |||
1390 | |||
1391 | static int | ||
1392 | proc_do_defense_mode(ctl_table *table, int write, struct file * filp, | ||
1393 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1394 | { | ||
1395 | int *valp = table->data; | ||
1396 | int val = *valp; | ||
1397 | int rc; | ||
1398 | |||
1399 | rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); | ||
1400 | if (write && (*valp != val)) { | ||
1401 | if ((*valp < 0) || (*valp > 3)) { | ||
1402 | /* Restore the correct value */ | ||
1403 | *valp = val; | ||
1404 | } else { | ||
1405 | update_defense_level(); | ||
1406 | } | ||
1407 | } | ||
1408 | return rc; | ||
1409 | } | ||
1410 | |||
1411 | |||
1412 | static int | ||
1413 | proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, | ||
1414 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1415 | { | ||
1416 | int *valp = table->data; | ||
1417 | int val[2]; | ||
1418 | int rc; | ||
1419 | |||
1420 | /* backup the value first */ | ||
1421 | memcpy(val, valp, sizeof(val)); | ||
1422 | |||
1423 | rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); | ||
1424 | if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { | ||
1425 | /* Restore the correct value */ | ||
1426 | memcpy(valp, val, sizeof(val)); | ||
1427 | } | ||
1428 | return rc; | ||
1429 | } | ||
1430 | |||
1431 | |||
1432 | /* | ||
1433 | * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) | ||
1434 | */ | ||
1435 | |||
1436 | static struct ctl_table vs_vars[] = { | ||
1437 | { | ||
1438 | .procname = "amemthresh", | ||
1439 | .data = &sysctl_ip_vs_amemthresh, | ||
1440 | .maxlen = sizeof(int), | ||
1441 | .mode = 0644, | ||
1442 | .proc_handler = &proc_dointvec, | ||
1443 | }, | ||
1444 | #ifdef CONFIG_IP_VS_DEBUG | ||
1445 | { | ||
1446 | .procname = "debug_level", | ||
1447 | .data = &sysctl_ip_vs_debug_level, | ||
1448 | .maxlen = sizeof(int), | ||
1449 | .mode = 0644, | ||
1450 | .proc_handler = &proc_dointvec, | ||
1451 | }, | ||
1452 | #endif | ||
1453 | { | ||
1454 | .procname = "am_droprate", | ||
1455 | .data = &sysctl_ip_vs_am_droprate, | ||
1456 | .maxlen = sizeof(int), | ||
1457 | .mode = 0644, | ||
1458 | .proc_handler = &proc_dointvec, | ||
1459 | }, | ||
1460 | { | ||
1461 | .procname = "drop_entry", | ||
1462 | .data = &sysctl_ip_vs_drop_entry, | ||
1463 | .maxlen = sizeof(int), | ||
1464 | .mode = 0644, | ||
1465 | .proc_handler = &proc_do_defense_mode, | ||
1466 | }, | ||
1467 | { | ||
1468 | .procname = "drop_packet", | ||
1469 | .data = &sysctl_ip_vs_drop_packet, | ||
1470 | .maxlen = sizeof(int), | ||
1471 | .mode = 0644, | ||
1472 | .proc_handler = &proc_do_defense_mode, | ||
1473 | }, | ||
1474 | { | ||
1475 | .procname = "secure_tcp", | ||
1476 | .data = &sysctl_ip_vs_secure_tcp, | ||
1477 | .maxlen = sizeof(int), | ||
1478 | .mode = 0644, | ||
1479 | .proc_handler = &proc_do_defense_mode, | ||
1480 | }, | ||
1481 | #if 0 | ||
1482 | { | ||
1483 | .procname = "timeout_established", | ||
1484 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], | ||
1485 | .maxlen = sizeof(int), | ||
1486 | .mode = 0644, | ||
1487 | .proc_handler = &proc_dointvec_jiffies, | ||
1488 | }, | ||
1489 | { | ||
1490 | .procname = "timeout_synsent", | ||
1491 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], | ||
1492 | .maxlen = sizeof(int), | ||
1493 | .mode = 0644, | ||
1494 | .proc_handler = &proc_dointvec_jiffies, | ||
1495 | }, | ||
1496 | { | ||
1497 | .procname = "timeout_synrecv", | ||
1498 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], | ||
1499 | .maxlen = sizeof(int), | ||
1500 | .mode = 0644, | ||
1501 | .proc_handler = &proc_dointvec_jiffies, | ||
1502 | }, | ||
1503 | { | ||
1504 | .procname = "timeout_finwait", | ||
1505 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], | ||
1506 | .maxlen = sizeof(int), | ||
1507 | .mode = 0644, | ||
1508 | .proc_handler = &proc_dointvec_jiffies, | ||
1509 | }, | ||
1510 | { | ||
1511 | .procname = "timeout_timewait", | ||
1512 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], | ||
1513 | .maxlen = sizeof(int), | ||
1514 | .mode = 0644, | ||
1515 | .proc_handler = &proc_dointvec_jiffies, | ||
1516 | }, | ||
1517 | { | ||
1518 | .procname = "timeout_close", | ||
1519 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], | ||
1520 | .maxlen = sizeof(int), | ||
1521 | .mode = 0644, | ||
1522 | .proc_handler = &proc_dointvec_jiffies, | ||
1523 | }, | ||
1524 | { | ||
1525 | .procname = "timeout_closewait", | ||
1526 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], | ||
1527 | .maxlen = sizeof(int), | ||
1528 | .mode = 0644, | ||
1529 | .proc_handler = &proc_dointvec_jiffies, | ||
1530 | }, | ||
1531 | { | ||
1532 | .procname = "timeout_lastack", | ||
1533 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], | ||
1534 | .maxlen = sizeof(int), | ||
1535 | .mode = 0644, | ||
1536 | .proc_handler = &proc_dointvec_jiffies, | ||
1537 | }, | ||
1538 | { | ||
1539 | .procname = "timeout_listen", | ||
1540 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], | ||
1541 | .maxlen = sizeof(int), | ||
1542 | .mode = 0644, | ||
1543 | .proc_handler = &proc_dointvec_jiffies, | ||
1544 | }, | ||
1545 | { | ||
1546 | .procname = "timeout_synack", | ||
1547 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], | ||
1548 | .maxlen = sizeof(int), | ||
1549 | .mode = 0644, | ||
1550 | .proc_handler = &proc_dointvec_jiffies, | ||
1551 | }, | ||
1552 | { | ||
1553 | .procname = "timeout_udp", | ||
1554 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], | ||
1555 | .maxlen = sizeof(int), | ||
1556 | .mode = 0644, | ||
1557 | .proc_handler = &proc_dointvec_jiffies, | ||
1558 | }, | ||
1559 | { | ||
1560 | .procname = "timeout_icmp", | ||
1561 | .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], | ||
1562 | .maxlen = sizeof(int), | ||
1563 | .mode = 0644, | ||
1564 | .proc_handler = &proc_dointvec_jiffies, | ||
1565 | }, | ||
1566 | #endif | ||
1567 | { | ||
1568 | .procname = "cache_bypass", | ||
1569 | .data = &sysctl_ip_vs_cache_bypass, | ||
1570 | .maxlen = sizeof(int), | ||
1571 | .mode = 0644, | ||
1572 | .proc_handler = &proc_dointvec, | ||
1573 | }, | ||
1574 | { | ||
1575 | .procname = "expire_nodest_conn", | ||
1576 | .data = &sysctl_ip_vs_expire_nodest_conn, | ||
1577 | .maxlen = sizeof(int), | ||
1578 | .mode = 0644, | ||
1579 | .proc_handler = &proc_dointvec, | ||
1580 | }, | ||
1581 | { | ||
1582 | .procname = "expire_quiescent_template", | ||
1583 | .data = &sysctl_ip_vs_expire_quiescent_template, | ||
1584 | .maxlen = sizeof(int), | ||
1585 | .mode = 0644, | ||
1586 | .proc_handler = &proc_dointvec, | ||
1587 | }, | ||
1588 | { | ||
1589 | .procname = "sync_threshold", | ||
1590 | .data = &sysctl_ip_vs_sync_threshold, | ||
1591 | .maxlen = sizeof(sysctl_ip_vs_sync_threshold), | ||
1592 | .mode = 0644, | ||
1593 | .proc_handler = &proc_do_sync_threshold, | ||
1594 | }, | ||
1595 | { | ||
1596 | .procname = "nat_icmp_send", | ||
1597 | .data = &sysctl_ip_vs_nat_icmp_send, | ||
1598 | .maxlen = sizeof(int), | ||
1599 | .mode = 0644, | ||
1600 | .proc_handler = &proc_dointvec, | ||
1601 | }, | ||
1602 | { .ctl_name = 0 } | ||
1603 | }; | ||
1604 | |||
1605 | const struct ctl_path net_vs_ctl_path[] = { | ||
1606 | { .procname = "net", .ctl_name = CTL_NET, }, | ||
1607 | { .procname = "ipv4", .ctl_name = NET_IPV4, }, | ||
1608 | { .procname = "vs", }, | ||
1609 | { } | ||
1610 | }; | ||
1611 | EXPORT_SYMBOL_GPL(net_vs_ctl_path); | ||
1612 | |||
1613 | static struct ctl_table_header * sysctl_header; | ||
1614 | |||
1615 | #ifdef CONFIG_PROC_FS | ||
1616 | |||
1617 | struct ip_vs_iter { | ||
1618 | struct list_head *table; | ||
1619 | int bucket; | ||
1620 | }; | ||
1621 | |||
1622 | /* | ||
1623 | * Write the contents of the VS rule table to a PROCfs file. | ||
1624 | * (It is kept just for backward compatibility) | ||
1625 | */ | ||
1626 | static inline const char *ip_vs_fwd_name(unsigned flags) | ||
1627 | { | ||
1628 | switch (flags & IP_VS_CONN_F_FWD_MASK) { | ||
1629 | case IP_VS_CONN_F_LOCALNODE: | ||
1630 | return "Local"; | ||
1631 | case IP_VS_CONN_F_TUNNEL: | ||
1632 | return "Tunnel"; | ||
1633 | case IP_VS_CONN_F_DROUTE: | ||
1634 | return "Route"; | ||
1635 | default: | ||
1636 | return "Masq"; | ||
1637 | } | ||
1638 | } | ||
1639 | |||
1640 | |||
1641 | /* Get the Nth entry in the two lists */ | ||
1642 | static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) | ||
1643 | { | ||
1644 | struct ip_vs_iter *iter = seq->private; | ||
1645 | int idx; | ||
1646 | struct ip_vs_service *svc; | ||
1647 | |||
1648 | /* look in hash by protocol */ | ||
1649 | for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1650 | list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { | ||
1651 | if (pos-- == 0){ | ||
1652 | iter->table = ip_vs_svc_table; | ||
1653 | iter->bucket = idx; | ||
1654 | return svc; | ||
1655 | } | ||
1656 | } | ||
1657 | } | ||
1658 | |||
1659 | /* keep looking in fwmark */ | ||
1660 | for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
1661 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { | ||
1662 | if (pos-- == 0) { | ||
1663 | iter->table = ip_vs_svc_fwm_table; | ||
1664 | iter->bucket = idx; | ||
1665 | return svc; | ||
1666 | } | ||
1667 | } | ||
1668 | } | ||
1669 | |||
1670 | return NULL; | ||
1671 | } | ||
1672 | |||
1673 | static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) | ||
1674 | { | ||
1675 | |||
1676 | read_lock_bh(&__ip_vs_svc_lock); | ||
1677 | return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; | ||
1678 | } | ||
1679 | |||
1680 | |||
1681 | static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
1682 | { | ||
1683 | struct list_head *e; | ||
1684 | struct ip_vs_iter *iter; | ||
1685 | struct ip_vs_service *svc; | ||
1686 | |||
1687 | ++*pos; | ||
1688 | if (v == SEQ_START_TOKEN) | ||
1689 | return ip_vs_info_array(seq,0); | ||
1690 | |||
1691 | svc = v; | ||
1692 | iter = seq->private; | ||
1693 | |||
1694 | if (iter->table == ip_vs_svc_table) { | ||
1695 | /* next service in table hashed by protocol */ | ||
1696 | if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) | ||
1697 | return list_entry(e, struct ip_vs_service, s_list); | ||
1698 | |||
1699 | |||
1700 | while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { | ||
1701 | list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], | ||
1702 | s_list) { | ||
1703 | return svc; | ||
1704 | } | ||
1705 | } | ||
1706 | |||
1707 | iter->table = ip_vs_svc_fwm_table; | ||
1708 | iter->bucket = -1; | ||
1709 | goto scan_fwmark; | ||
1710 | } | ||
1711 | |||
1712 | /* next service in table hashed by fwmark */ | ||
1713 | if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) | ||
1714 | return list_entry(e, struct ip_vs_service, f_list); | ||
1715 | |||
1716 | scan_fwmark: | ||
1717 | while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { | ||
1718 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], | ||
1719 | f_list) | ||
1720 | return svc; | ||
1721 | } | ||
1722 | |||
1723 | return NULL; | ||
1724 | } | ||
1725 | |||
1726 | static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) | ||
1727 | { | ||
1728 | read_unlock_bh(&__ip_vs_svc_lock); | ||
1729 | } | ||
1730 | |||
1731 | |||
1732 | static int ip_vs_info_seq_show(struct seq_file *seq, void *v) | ||
1733 | { | ||
1734 | if (v == SEQ_START_TOKEN) { | ||
1735 | seq_printf(seq, | ||
1736 | "IP Virtual Server version %d.%d.%d (size=%d)\n", | ||
1737 | NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); | ||
1738 | seq_puts(seq, | ||
1739 | "Prot LocalAddress:Port Scheduler Flags\n"); | ||
1740 | seq_puts(seq, | ||
1741 | " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); | ||
1742 | } else { | ||
1743 | const struct ip_vs_service *svc = v; | ||
1744 | const struct ip_vs_iter *iter = seq->private; | ||
1745 | const struct ip_vs_dest *dest; | ||
1746 | |||
1747 | if (iter->table == ip_vs_svc_table) | ||
1748 | seq_printf(seq, "%s %08X:%04X %s ", | ||
1749 | ip_vs_proto_name(svc->protocol), | ||
1750 | ntohl(svc->addr), | ||
1751 | ntohs(svc->port), | ||
1752 | svc->scheduler->name); | ||
1753 | else | ||
1754 | seq_printf(seq, "FWM %08X %s ", | ||
1755 | svc->fwmark, svc->scheduler->name); | ||
1756 | |||
1757 | if (svc->flags & IP_VS_SVC_F_PERSISTENT) | ||
1758 | seq_printf(seq, "persistent %d %08X\n", | ||
1759 | svc->timeout, | ||
1760 | ntohl(svc->netmask)); | ||
1761 | else | ||
1762 | seq_putc(seq, '\n'); | ||
1763 | |||
1764 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
1765 | seq_printf(seq, | ||
1766 | " -> %08X:%04X %-7s %-6d %-10d %-10d\n", | ||
1767 | ntohl(dest->addr), ntohs(dest->port), | ||
1768 | ip_vs_fwd_name(atomic_read(&dest->conn_flags)), | ||
1769 | atomic_read(&dest->weight), | ||
1770 | atomic_read(&dest->activeconns), | ||
1771 | atomic_read(&dest->inactconns)); | ||
1772 | } | ||
1773 | } | ||
1774 | return 0; | ||
1775 | } | ||
1776 | |||
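Schematically, the output produced by this show function looks as follows; the addresses, counters, and version/size figures are illustrative, and note that addresses and ports are printed in hex:

	IP Virtual Server version 1.2.1 (size=4096)
	Prot LocalAddress:Port Scheduler Flags
	 -> RemoteAddress:Port Forward Weight ActiveConn InActConn
	TCP C0A80001:0050 rr
	 -> 0A000001:0050 Masq    1      0          0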
1777 | static const struct seq_operations ip_vs_info_seq_ops = { | ||
1778 | .start = ip_vs_info_seq_start, | ||
1779 | .next = ip_vs_info_seq_next, | ||
1780 | .stop = ip_vs_info_seq_stop, | ||
1781 | .show = ip_vs_info_seq_show, | ||
1782 | }; | ||
1783 | |||
1784 | static int ip_vs_info_open(struct inode *inode, struct file *file) | ||
1785 | { | ||
1786 | return seq_open_private(file, &ip_vs_info_seq_ops, | ||
1787 | sizeof(struct ip_vs_iter)); | ||
1788 | } | ||
1789 | |||
1790 | static const struct file_operations ip_vs_info_fops = { | ||
1791 | .owner = THIS_MODULE, | ||
1792 | .open = ip_vs_info_open, | ||
1793 | .read = seq_read, | ||
1794 | .llseek = seq_lseek, | ||
1795 | .release = seq_release_private, | ||
1796 | }; | ||
1797 | |||
1798 | #endif | ||
1799 | |||
1800 | struct ip_vs_stats ip_vs_stats = { | ||
1801 | .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock), | ||
1802 | }; | ||
1803 | |||
1804 | #ifdef CONFIG_PROC_FS | ||
1805 | static int ip_vs_stats_show(struct seq_file *seq, void *v) | ||
1806 | { | ||
1807 | |||
1808 | /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ | ||
1809 | seq_puts(seq, | ||
1810 | " Total Incoming Outgoing Incoming Outgoing\n"); | ||
1811 | seq_printf(seq, | ||
1812 | " Conns Packets Packets Bytes Bytes\n"); | ||
1813 | |||
1814 | spin_lock_bh(&ip_vs_stats.lock); | ||
1815 | seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns, | ||
1816 | ip_vs_stats.inpkts, ip_vs_stats.outpkts, | ||
1817 | (unsigned long long) ip_vs_stats.inbytes, | ||
1818 | (unsigned long long) ip_vs_stats.outbytes); | ||
1819 | |||
1820 | /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ | ||
1821 | seq_puts(seq, | ||
1822 | " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); | ||
1823 | seq_printf(seq,"%8X %8X %8X %16X %16X\n", | ||
1824 | ip_vs_stats.cps, | ||
1825 | ip_vs_stats.inpps, | ||
1826 | ip_vs_stats.outpps, | ||
1827 | ip_vs_stats.inbps, | ||
1828 | ip_vs_stats.outbps); | ||
1829 | spin_unlock_bh(&ip_vs_stats.lock); | ||
1830 | |||
1831 | return 0; | ||
1832 | } | ||
1833 | |||
1834 | static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) | ||
1835 | { | ||
1836 | return single_open(file, ip_vs_stats_show, NULL); | ||
1837 | } | ||
1838 | |||
1839 | static const struct file_operations ip_vs_stats_fops = { | ||
1840 | .owner = THIS_MODULE, | ||
1841 | .open = ip_vs_stats_seq_open, | ||
1842 | .read = seq_read, | ||
1843 | .llseek = seq_lseek, | ||
1844 | .release = single_release, | ||
1845 | }; | ||
1846 | |||
1847 | #endif | ||
1848 | |||
1849 | /* | ||
1850 | * Set timeout values for tcp tcpfin udp in the timeout_table. | ||
1851 | */ | ||
1852 | static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) | ||
1853 | { | ||
1854 | IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", | ||
1855 | u->tcp_timeout, | ||
1856 | u->tcp_fin_timeout, | ||
1857 | u->udp_timeout); | ||
1858 | |||
1859 | #ifdef CONFIG_IP_VS_PROTO_TCP | ||
1860 | if (u->tcp_timeout) { | ||
1861 | ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] | ||
1862 | = u->tcp_timeout * HZ; | ||
1863 | } | ||
1864 | |||
1865 | if (u->tcp_fin_timeout) { | ||
1866 | ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] | ||
1867 | = u->tcp_fin_timeout * HZ; | ||
1868 | } | ||
1869 | #endif | ||
1870 | |||
1871 | #ifdef CONFIG_IP_VS_PROTO_UDP | ||
1872 | if (u->udp_timeout) { | ||
1873 | ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] | ||
1874 | = u->udp_timeout * HZ; | ||
1875 | } | ||
1876 | #endif | ||
1877 | return 0; | ||
1878 | } | ||
1879 | |||
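A hedged userspace sketch of how an ipvsadm-style tool might drive this through the sockopt interface defined below, assuming the ip_vs user-space header definitions are available (a zero field leaves that timeout unchanged; CAP_NET_ADMIN is required):

	#include <sys/socket.h>
	#include <netinet/in.h>

	static int set_ipvs_timeouts(void)
	{
		struct ip_vs_timeout_user to = {
			.tcp_timeout     = 900,		/* seconds */
			.tcp_fin_timeout = 120,
			.udp_timeout     = 300,
		};
		int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

		if (fd < 0)
			return -1;
		/* len must equal TIMEOUT_ARG_LEN, see do_ip_vs_set_ctl() */
		return setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_TIMEOUT,
				  &to, sizeof(to));
	}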
1880 | |||
1881 | #define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) | ||
1882 | #define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) | ||
1883 | #define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ | ||
1884 | sizeof(struct ip_vs_dest_user)) | ||
1885 | #define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) | ||
1886 | #define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) | ||
1887 | #define MAX_ARG_LEN SVCDEST_ARG_LEN | ||
1888 | |||
1889 | static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { | ||
1890 | [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, | ||
1891 | [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, | ||
1892 | [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, | ||
1893 | [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, | ||
1894 | [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, | ||
1895 | [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, | ||
1896 | [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, | ||
1897 | [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, | ||
1898 | [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, | ||
1899 | [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, | ||
1900 | [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, | ||
1901 | }; | ||
1902 | |||
1903 | static int | ||
1904 | do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) | ||
1905 | { | ||
1906 | int ret; | ||
1907 | unsigned char arg[MAX_ARG_LEN]; | ||
1908 | struct ip_vs_service_user *usvc; | ||
1909 | struct ip_vs_service *svc; | ||
1910 | struct ip_vs_dest_user *udest; | ||
1911 | |||
1912 | if (!capable(CAP_NET_ADMIN)) | ||
1913 | return -EPERM; | ||
1914 | |||
1915 | if (len != set_arglen[SET_CMDID(cmd)]) { | ||
1916 | IP_VS_ERR("set_ctl: len %u != %u\n", | ||
1917 | len, set_arglen[SET_CMDID(cmd)]); | ||
1918 | return -EINVAL; | ||
1919 | } | ||
1920 | |||
1921 | if (copy_from_user(arg, user, len) != 0) | ||
1922 | return -EFAULT; | ||
1923 | |||
1924 | /* increase the module use count */ | ||
1925 | ip_vs_use_count_inc(); | ||
1926 | |||
1927 | if (mutex_lock_interruptible(&__ip_vs_mutex)) { | ||
1928 | ret = -ERESTARTSYS; | ||
1929 | goto out_dec; | ||
1930 | } | ||
1931 | |||
1932 | if (cmd == IP_VS_SO_SET_FLUSH) { | ||
1933 | /* Flush the virtual service */ | ||
1934 | ret = ip_vs_flush(); | ||
1935 | goto out_unlock; | ||
1936 | } else if (cmd == IP_VS_SO_SET_TIMEOUT) { | ||
1937 | /* Set timeout values for (tcp tcpfin udp) */ | ||
1938 | ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); | ||
1939 | goto out_unlock; | ||
1940 | } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { | ||
1941 | struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; | ||
1942 | ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); | ||
1943 | goto out_unlock; | ||
1944 | } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { | ||
1945 | struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; | ||
1946 | ret = stop_sync_thread(dm->state); | ||
1947 | goto out_unlock; | ||
1948 | } | ||
1949 | |||
1950 | usvc = (struct ip_vs_service_user *)arg; | ||
1951 | udest = (struct ip_vs_dest_user *)(usvc + 1); | ||
1952 | |||
1953 | if (cmd == IP_VS_SO_SET_ZERO) { | ||
1954 | /* if no service address is set, zero counters in all */ | ||
1955 | if (!usvc->fwmark && !usvc->addr && !usvc->port) { | ||
1956 | ret = ip_vs_zero_all(); | ||
1957 | goto out_unlock; | ||
1958 | } | ||
1959 | } | ||
1960 | |||
1961 | /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ | ||
1962 | if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) { | ||
1963 | IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", | ||
1964 | usvc->protocol, NIPQUAD(usvc->addr), | ||
1965 | ntohs(usvc->port), usvc->sched_name); | ||
1966 | ret = -EFAULT; | ||
1967 | goto out_unlock; | ||
1968 | } | ||
1969 | |||
1970 | /* Lookup the exact service by <protocol, addr, port> or fwmark */ | ||
1971 | if (usvc->fwmark == 0) | ||
1972 | svc = __ip_vs_service_get(usvc->protocol, | ||
1973 | usvc->addr, usvc->port); | ||
1974 | else | ||
1975 | svc = __ip_vs_svc_fwm_get(usvc->fwmark); | ||
1976 | |||
1977 | if (cmd != IP_VS_SO_SET_ADD | ||
1978 | && (svc == NULL || svc->protocol != usvc->protocol)) { | ||
1979 | ret = -ESRCH; | ||
1980 | goto out_unlock; | ||
1981 | } | ||
1982 | |||
1983 | switch (cmd) { | ||
1984 | case IP_VS_SO_SET_ADD: | ||
1985 | if (svc != NULL) | ||
1986 | ret = -EEXIST; | ||
1987 | else | ||
1988 | ret = ip_vs_add_service(usvc, &svc); | ||
1989 | break; | ||
1990 | case IP_VS_SO_SET_EDIT: | ||
1991 | ret = ip_vs_edit_service(svc, usvc); | ||
1992 | break; | ||
1993 | case IP_VS_SO_SET_DEL: | ||
1994 | ret = ip_vs_del_service(svc); | ||
1995 | if (!ret) | ||
1996 | goto out_unlock; | ||
1997 | break; | ||
1998 | case IP_VS_SO_SET_ZERO: | ||
1999 | ret = ip_vs_zero_service(svc); | ||
2000 | break; | ||
2001 | case IP_VS_SO_SET_ADDDEST: | ||
2002 | ret = ip_vs_add_dest(svc, udest); | ||
2003 | break; | ||
2004 | case IP_VS_SO_SET_EDITDEST: | ||
2005 | ret = ip_vs_edit_dest(svc, udest); | ||
2006 | break; | ||
2007 | case IP_VS_SO_SET_DELDEST: | ||
2008 | ret = ip_vs_del_dest(svc, udest); | ||
2009 | break; | ||
2010 | default: | ||
2011 | ret = -EINVAL; | ||
2012 | } | ||
2013 | |||
2014 | if (svc) | ||
2015 | ip_vs_service_put(svc); | ||
2016 | |||
2017 | out_unlock: | ||
2018 | mutex_unlock(&__ip_vs_mutex); | ||
2019 | out_dec: | ||
2020 | /* decrease the module use count */ | ||
2021 | ip_vs_use_count_dec(); | ||
2022 | |||
2023 | return ret; | ||
2024 | } | ||
2025 | |||
2026 | |||
2027 | static void | ||
2028 | ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) | ||
2029 | { | ||
2030 | spin_lock_bh(&src->lock); | ||
2031 | memcpy(dst, src, (char*)&src->lock - (char*)src); | ||
2032 | spin_unlock_bh(&src->lock); | ||
2033 | } | ||
2034 | |||
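The memcpy above copies every counter that precedes the lock member, which silently assumes struct ip_vs_stats_user is laid out as that same prefix. A hedged compile-time guard for this layout assumption (not in the original source) could look like:

	static inline void ip_vs_stats_layout_check(void)
	{
		/* the destination must be at least as large as the
		 * prefix copied by ip_vs_copy_stats() */
		BUILD_BUG_ON(sizeof(struct ip_vs_stats_user) <
			     offsetof(struct ip_vs_stats, lock));
	}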
2035 | static void | ||
2036 | ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) | ||
2037 | { | ||
2038 | dst->protocol = src->protocol; | ||
2039 | dst->addr = src->addr; | ||
2040 | dst->port = src->port; | ||
2041 | dst->fwmark = src->fwmark; | ||
2042 | strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); | ||
2043 | dst->flags = src->flags; | ||
2044 | dst->timeout = src->timeout / HZ; | ||
2045 | dst->netmask = src->netmask; | ||
2046 | dst->num_dests = src->num_dests; | ||
2047 | ip_vs_copy_stats(&dst->stats, &src->stats); | ||
2048 | } | ||
2049 | |||
2050 | static inline int | ||
2051 | __ip_vs_get_service_entries(const struct ip_vs_get_services *get, | ||
2052 | struct ip_vs_get_services __user *uptr) | ||
2053 | { | ||
2054 | int idx, count=0; | ||
2055 | struct ip_vs_service *svc; | ||
2056 | struct ip_vs_service_entry entry; | ||
2057 | int ret = 0; | ||
2058 | |||
2059 | for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
2060 | list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { | ||
2061 | if (count >= get->num_services) | ||
2062 | goto out; | ||
2063 | memset(&entry, 0, sizeof(entry)); | ||
2064 | ip_vs_copy_service(&entry, svc); | ||
2065 | if (copy_to_user(&uptr->entrytable[count], | ||
2066 | &entry, sizeof(entry))) { | ||
2067 | ret = -EFAULT; | ||
2068 | goto out; | ||
2069 | } | ||
2070 | count++; | ||
2071 | } | ||
2072 | } | ||
2073 | |||
2074 | for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
2075 | list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { | ||
2076 | if (count >= get->num_services) | ||
2077 | goto out; | ||
2078 | memset(&entry, 0, sizeof(entry)); | ||
2079 | ip_vs_copy_service(&entry, svc); | ||
2080 | if (copy_to_user(&uptr->entrytable[count], | ||
2081 | &entry, sizeof(entry))) { | ||
2082 | ret = -EFAULT; | ||
2083 | goto out; | ||
2084 | } | ||
2085 | count++; | ||
2086 | } | ||
2087 | } | ||
2088 | out: | ||
2089 | return ret; | ||
2090 | } | ||
2091 | |||
2092 | static inline int | ||
2093 | __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, | ||
2094 | struct ip_vs_get_dests __user *uptr) | ||
2095 | { | ||
2096 | struct ip_vs_service *svc; | ||
2097 | int ret = 0; | ||
2098 | |||
2099 | if (get->fwmark) | ||
2100 | svc = __ip_vs_svc_fwm_get(get->fwmark); | ||
2101 | else | ||
2102 | svc = __ip_vs_service_get(get->protocol, | ||
2103 | get->addr, get->port); | ||
2104 | if (svc) { | ||
2105 | int count = 0; | ||
2106 | struct ip_vs_dest *dest; | ||
2107 | struct ip_vs_dest_entry entry; | ||
2108 | |||
2109 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
2110 | if (count >= get->num_dests) | ||
2111 | break; | ||
2112 | |||
2113 | entry.addr = dest->addr; | ||
2114 | entry.port = dest->port; | ||
2115 | entry.conn_flags = atomic_read(&dest->conn_flags); | ||
2116 | entry.weight = atomic_read(&dest->weight); | ||
2117 | entry.u_threshold = dest->u_threshold; | ||
2118 | entry.l_threshold = dest->l_threshold; | ||
2119 | entry.activeconns = atomic_read(&dest->activeconns); | ||
2120 | entry.inactconns = atomic_read(&dest->inactconns); | ||
2121 | entry.persistconns = atomic_read(&dest->persistconns); | ||
2122 | ip_vs_copy_stats(&entry.stats, &dest->stats); | ||
2123 | if (copy_to_user(&uptr->entrytable[count], | ||
2124 | &entry, sizeof(entry))) { | ||
2125 | ret = -EFAULT; | ||
2126 | break; | ||
2127 | } | ||
2128 | count++; | ||
2129 | } | ||
2130 | ip_vs_service_put(svc); | ||
2131 | } else | ||
2132 | ret = -ESRCH; | ||
2133 | return ret; | ||
2134 | } | ||
2135 | |||
2136 | static inline void | ||
2137 | __ip_vs_get_timeouts(struct ip_vs_timeout_user *u) | ||
2138 | { | ||
2139 | #ifdef CONFIG_IP_VS_PROTO_TCP | ||
2140 | u->tcp_timeout = | ||
2141 | ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; | ||
2142 | u->tcp_fin_timeout = | ||
2143 | ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; | ||
2144 | #endif | ||
2145 | #ifdef CONFIG_IP_VS_PROTO_UDP | ||
2146 | u->udp_timeout = | ||
2147 | ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; | ||
2148 | #endif | ||
2149 | } | ||
2150 | |||
2151 | |||
2152 | #define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) | ||
2153 | #define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) | ||
2154 | #define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) | ||
2155 | #define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) | ||
2156 | #define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) | ||
2157 | #define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) | ||
2158 | #define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) | ||
2159 | |||
2160 | static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { | ||
2161 | [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, | ||
2162 | [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, | ||
2163 | [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, | ||
2164 | [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, | ||
2165 | [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, | ||
2166 | [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, | ||
2167 | [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, | ||
2168 | }; | ||
2169 | |||
2170 | static int | ||
2171 | do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) | ||
2172 | { | ||
2173 | unsigned char arg[128]; | ||
2174 | int ret = 0; | ||
2175 | |||
2176 | if (!capable(CAP_NET_ADMIN)) | ||
2177 | return -EPERM; | ||
2178 | |||
2179 | if (*len < get_arglen[GET_CMDID(cmd)]) { | ||
2180 | IP_VS_ERR("get_ctl: len %u < %u\n", | ||
2181 | *len, get_arglen[GET_CMDID(cmd)]); | ||
2182 | return -EINVAL; | ||
2183 | } | ||
2184 | |||
2185 | if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) | ||
2186 | return -EFAULT; | ||
2187 | |||
2188 | if (mutex_lock_interruptible(&__ip_vs_mutex)) | ||
2189 | return -ERESTARTSYS; | ||
2190 | |||
2191 | switch (cmd) { | ||
2192 | case IP_VS_SO_GET_VERSION: | ||
2193 | { | ||
2194 | char buf[64]; | ||
2195 | |||
2196 | sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", | ||
2197 | NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); | ||
2198 | if (copy_to_user(user, buf, strlen(buf)+1) != 0) { | ||
2199 | ret = -EFAULT; | ||
2200 | goto out; | ||
2201 | } | ||
2202 | *len = strlen(buf)+1; | ||
2203 | } | ||
2204 | break; | ||
2205 | |||
2206 | case IP_VS_SO_GET_INFO: | ||
2207 | { | ||
2208 | struct ip_vs_getinfo info; | ||
2209 | info.version = IP_VS_VERSION_CODE; | ||
2210 | info.size = IP_VS_CONN_TAB_SIZE; | ||
2211 | info.num_services = ip_vs_num_services; | ||
2212 | if (copy_to_user(user, &info, sizeof(info)) != 0) | ||
2213 | ret = -EFAULT; | ||
2214 | } | ||
2215 | break; | ||
2216 | |||
2217 | case IP_VS_SO_GET_SERVICES: | ||
2218 | { | ||
2219 | struct ip_vs_get_services *get; | ||
2220 | int size; | ||
2221 | |||
2222 | get = (struct ip_vs_get_services *)arg; | ||
2223 | size = sizeof(*get) + | ||
2224 | sizeof(struct ip_vs_service_entry) * get->num_services; | ||
2225 | if (*len != size) { | ||
2226 | IP_VS_ERR("length: %u != %u\n", *len, size); | ||
2227 | ret = -EINVAL; | ||
2228 | goto out; | ||
2229 | } | ||
2230 | ret = __ip_vs_get_service_entries(get, user); | ||
2231 | } | ||
2232 | break; | ||
2233 | |||
2234 | case IP_VS_SO_GET_SERVICE: | ||
2235 | { | ||
2236 | struct ip_vs_service_entry *entry; | ||
2237 | struct ip_vs_service *svc; | ||
2238 | |||
2239 | entry = (struct ip_vs_service_entry *)arg; | ||
2240 | if (entry->fwmark) | ||
2241 | svc = __ip_vs_svc_fwm_get(entry->fwmark); | ||
2242 | else | ||
2243 | svc = __ip_vs_service_get(entry->protocol, | ||
2244 | entry->addr, entry->port); | ||
2245 | if (svc) { | ||
2246 | ip_vs_copy_service(entry, svc); | ||
2247 | if (copy_to_user(user, entry, sizeof(*entry)) != 0) | ||
2248 | ret = -EFAULT; | ||
2249 | ip_vs_service_put(svc); | ||
2250 | } else | ||
2251 | ret = -ESRCH; | ||
2252 | } | ||
2253 | break; | ||
2254 | |||
2255 | case IP_VS_SO_GET_DESTS: | ||
2256 | { | ||
2257 | struct ip_vs_get_dests *get; | ||
2258 | int size; | ||
2259 | |||
2260 | get = (struct ip_vs_get_dests *)arg; | ||
2261 | size = sizeof(*get) + | ||
2262 | sizeof(struct ip_vs_dest_entry) * get->num_dests; | ||
2263 | if (*len != size) { | ||
2264 | IP_VS_ERR("length: %u != %u\n", *len, size); | ||
2265 | ret = -EINVAL; | ||
2266 | goto out; | ||
2267 | } | ||
2268 | ret = __ip_vs_get_dest_entries(get, user); | ||
2269 | } | ||
2270 | break; | ||
2271 | |||
2272 | case IP_VS_SO_GET_TIMEOUT: | ||
2273 | { | ||
2274 | struct ip_vs_timeout_user t; | ||
2275 | |||
2276 | __ip_vs_get_timeouts(&t); | ||
2277 | if (copy_to_user(user, &t, sizeof(t)) != 0) | ||
2278 | ret = -EFAULT; | ||
2279 | } | ||
2280 | break; | ||
2281 | |||
2282 | case IP_VS_SO_GET_DAEMON: | ||
2283 | { | ||
2284 | struct ip_vs_daemon_user d[2]; | ||
2285 | |||
2286 | memset(&d, 0, sizeof(d)); | ||
2287 | if (ip_vs_sync_state & IP_VS_STATE_MASTER) { | ||
2288 | d[0].state = IP_VS_STATE_MASTER; | ||
2289 | strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); | ||
2290 | d[0].syncid = ip_vs_master_syncid; | ||
2291 | } | ||
2292 | if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { | ||
2293 | d[1].state = IP_VS_STATE_BACKUP; | ||
2294 | strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); | ||
2295 | d[1].syncid = ip_vs_backup_syncid; | ||
2296 | } | ||
2297 | if (copy_to_user(user, &d, sizeof(d)) != 0) | ||
2298 | ret = -EFAULT; | ||
2299 | } | ||
2300 | break; | ||
2301 | |||
2302 | default: | ||
2303 | ret = -EINVAL; | ||
2304 | } | ||
2305 | |||
2306 | out: | ||
2307 | mutex_unlock(&__ip_vs_mutex); | ||
2308 | return ret; | ||
2309 | } | ||
2310 | |||
2311 | |||
2312 | static struct nf_sockopt_ops ip_vs_sockopts = { | ||
2313 | .pf = PF_INET, | ||
2314 | .set_optmin = IP_VS_BASE_CTL, | ||
2315 | .set_optmax = IP_VS_SO_SET_MAX+1, | ||
2316 | .set = do_ip_vs_set_ctl, | ||
2317 | .get_optmin = IP_VS_BASE_CTL, | ||
2318 | .get_optmax = IP_VS_SO_GET_MAX+1, | ||
2319 | .get = do_ip_vs_get_ctl, | ||
2320 | .owner = THIS_MODULE, | ||
2321 | }; | ||
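For context, these nf_sockopt ops are what a management tool reaches through getsockopt()/setsockopt() at the IPPROTO_IP level. A hypothetical minimal sketch of querying the version string served by the IP_VS_SO_GET_VERSION branch above (assumes <linux/ip_vs.h> for the constant; needs CAP_NET_ADMIN, matching the check in do_ip_vs_get_ctl()):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ip_vs.h>

int main(void)
{
	char buf[64];
	socklen_t len = sizeof(buf);
	/* a raw inet socket is the usual way to reach the nf_sockopt hooks */
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (fd < 0)
		return 1;
	if (getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_VERSION, buf, &len) == 0)
		printf("%s\n", buf);	/* "IP Virtual Server version ..." */
	return 0;
}

Note that the variable-length gets (IP_VS_SO_GET_SERVICES, IP_VS_SO_GET_DESTS) insist on an exact *len, so a caller must size the buffer as sizeof(struct) plus num_services (or num_dests) times the entry size, as checked above.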
2322 | |||
2323 | |||
2324 | int __init ip_vs_control_init(void) | ||
2325 | { | ||
2326 | int ret; | ||
2327 | int idx; | ||
2328 | |||
2329 | EnterFunction(2); | ||
2330 | |||
2331 | ret = nf_register_sockopt(&ip_vs_sockopts); | ||
2332 | if (ret) { | ||
2333 | IP_VS_ERR("cannot register sockopt.\n"); | ||
2334 | return ret; | ||
2335 | } | ||
2336 | |||
2337 | proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); | ||
2338 | proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); | ||
2339 | |||
2340 | sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); | ||
2341 | |||
2342 | /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ | ||
2343 | for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | ||
2344 | INIT_LIST_HEAD(&ip_vs_svc_table[idx]); | ||
2345 | INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); | ||
2346 | } | ||
2347 | for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { | ||
2348 | INIT_LIST_HEAD(&ip_vs_rtable[idx]); | ||
2349 | } | ||
2350 | |||
2351 | ip_vs_new_estimator(&ip_vs_stats); | ||
2352 | |||
2353 | /* Hook the defense timer */ | ||
2354 | schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); | ||
2355 | |||
2356 | LeaveFunction(2); | ||
2357 | return 0; | ||
2358 | } | ||
2359 | |||
2360 | |||
2361 | void ip_vs_control_cleanup(void) | ||
2362 | { | ||
2363 | EnterFunction(2); | ||
2364 | ip_vs_trash_cleanup(); | ||
2365 | cancel_rearming_delayed_work(&defense_work); | ||
2366 | cancel_work_sync(&defense_work.work); | ||
2367 | ip_vs_kill_estimator(&ip_vs_stats); | ||
2368 | unregister_sysctl_table(sysctl_header); | ||
2369 | proc_net_remove(&init_net, "ip_vs_stats"); | ||
2370 | proc_net_remove(&init_net, "ip_vs"); | ||
2371 | nf_unregister_sockopt(&ip_vs_sockopts); | ||
2372 | LeaveFunction(2); | ||
2373 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c deleted file mode 100644 index fa66824d264f..000000000000 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ /dev/null | |||
@@ -1,258 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Destination Hashing scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@gnuchina.org> | ||
5 | * | ||
6 | * Inspired by the consistent hashing scheduler patch from | ||
7 | * Thomas Proell <proellt@gmx.de> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version | ||
12 | * 2 of the License, or (at your option) any later version. | ||
13 | * | ||
14 | * Changes: | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | /* | ||
19 | * The dh algorithm selects a server by the hash key of the destination | ||
20 | * IP address. The pseudo code is as follows: | ||
21 | * | ||
22 | * n <- servernode[dest_ip]; | ||
23 | * if (n is dead) OR | ||
24 | * (n is overloaded) OR (n.weight <= 0) then | ||
25 | * return NULL; | ||
26 | * | ||
27 | * return n; | ||
28 | * | ||
29 | * Note that servernode is a 256-bucket hash table that maps the hash | ||
30 | * index derived from the packet destination IP address to the current | ||
31 | * server array. If the dh scheduler is used in a cache cluster, it is | ||
32 | * good to combine it with the cache_bypass feature. When the statically | ||
33 | * assigned server is dead or overloaded, the load balancer can bypass | ||
34 | * the cache server and send requests to the original server directly. | ||
35 | * | ||
36 | */ | ||
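As a quick illustration of the fixed mapping described above, a standalone sketch (not kernel code): ip_vs_dh_assign() below walks the destination list cyclically, so with real servers A, B and C the 256 buckets hold A B C A B C ..., and every flow to a given destination IP is pinned to one cache server until the destination list changes.

#include <stdio.h>

int main(void)
{
	const char *dests[] = { "A", "B", "C" };	/* sample real servers */
	int i;

	/* fill the first 8 of the 256 buckets the way ip_vs_dh_assign() does */
	for (i = 0; i < 8; i++)
		printf("bucket %d -> %s\n", i, dests[i % 3]);
	return 0;
}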
37 | |||
38 | #include <linux/ip.h> | ||
39 | #include <linux/module.h> | ||
40 | #include <linux/kernel.h> | ||
41 | #include <linux/skbuff.h> | ||
42 | |||
43 | #include <net/ip_vs.h> | ||
44 | |||
45 | |||
46 | /* | ||
47 | * IPVS DH bucket | ||
48 | */ | ||
49 | struct ip_vs_dh_bucket { | ||
50 | struct ip_vs_dest *dest; /* real server (cache) */ | ||
51 | }; | ||
52 | |||
53 | /* | ||
54 | * for IPVS DH entry hash table | ||
55 | */ | ||
56 | #ifndef CONFIG_IP_VS_DH_TAB_BITS | ||
57 | #define CONFIG_IP_VS_DH_TAB_BITS 8 | ||
58 | #endif | ||
59 | #define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS | ||
60 | #define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) | ||
61 | #define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) | ||
62 | |||
63 | |||
64 | /* | ||
65 | * Returns hash value for IPVS DH entry | ||
66 | */ | ||
67 | static inline unsigned ip_vs_dh_hashkey(__be32 addr) | ||
68 | { | ||
69 | return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK; | ||
70 | } | ||
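The multiplier 2654435761 is the familiar golden-ratio hashing constant (a prime near 2^32/phi, per Knuth's multiplicative method), so consecutive addresses scatter widely before the mask picks the bucket. A standalone sketch of the same computation, assuming the default 256-bucket table:

#include <stdio.h>
#include <stdint.h>

#define TAB_MASK 0xffu		/* IP_VS_DH_TAB_SIZE - 1 with 8 table bits */

static unsigned dh_hashkey(uint32_t addr)	/* addr in host byte order */
{
	return (addr * 2654435761u) & TAB_MASK;
}

int main(void)
{
	/* consecutive addresses 192.0.2.1 and 192.0.2.2 land 177 buckets apart */
	printf("%u %u\n", dh_hashkey(0xc0000201u), dh_hashkey(0xc0000202u));
	return 0;
}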
71 | |||
72 | |||
73 | /* | ||
74 | * Get ip_vs_dest associated with supplied parameters. | ||
75 | */ | ||
76 | static inline struct ip_vs_dest * | ||
77 | ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr) | ||
78 | { | ||
79 | return (tbl[ip_vs_dh_hashkey(addr)]).dest; | ||
80 | } | ||
81 | |||
82 | |||
83 | /* | ||
84 | * Assign all the hash buckets of the specified table with the service. | ||
85 | */ | ||
86 | static int | ||
87 | ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) | ||
88 | { | ||
89 | int i; | ||
90 | struct ip_vs_dh_bucket *b; | ||
91 | struct list_head *p; | ||
92 | struct ip_vs_dest *dest; | ||
93 | |||
94 | b = tbl; | ||
95 | p = &svc->destinations; | ||
96 | for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { | ||
97 | if (list_empty(p)) { | ||
98 | b->dest = NULL; | ||
99 | } else { | ||
100 | if (p == &svc->destinations) | ||
101 | p = p->next; | ||
102 | |||
103 | dest = list_entry(p, struct ip_vs_dest, n_list); | ||
104 | atomic_inc(&dest->refcnt); | ||
105 | b->dest = dest; | ||
106 | |||
107 | p = p->next; | ||
108 | } | ||
109 | b++; | ||
110 | } | ||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | |||
115 | /* | ||
116 | * Flush all the hash buckets of the specified table. | ||
117 | */ | ||
118 | static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) | ||
119 | { | ||
120 | int i; | ||
121 | struct ip_vs_dh_bucket *b; | ||
122 | |||
123 | b = tbl; | ||
124 | for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { | ||
125 | if (b->dest) { | ||
126 | atomic_dec(&b->dest->refcnt); | ||
127 | b->dest = NULL; | ||
128 | } | ||
129 | b++; | ||
130 | } | ||
131 | } | ||
132 | |||
133 | |||
134 | static int ip_vs_dh_init_svc(struct ip_vs_service *svc) | ||
135 | { | ||
136 | struct ip_vs_dh_bucket *tbl; | ||
137 | |||
138 | /* allocate the DH table for this service */ | ||
139 | tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, | ||
140 | GFP_ATOMIC); | ||
141 | if (tbl == NULL) { | ||
142 | IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n"); | ||
143 | return -ENOMEM; | ||
144 | } | ||
145 | svc->sched_data = tbl; | ||
146 | IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " | ||
147 | "current service\n", | ||
148 | sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); | ||
149 | |||
150 | /* assign the hash buckets with the updated service */ | ||
151 | ip_vs_dh_assign(tbl, svc); | ||
152 | |||
153 | return 0; | ||
154 | } | ||
155 | |||
156 | |||
157 | static int ip_vs_dh_done_svc(struct ip_vs_service *svc) | ||
158 | { | ||
159 | struct ip_vs_dh_bucket *tbl = svc->sched_data; | ||
160 | |||
161 | /* got to clean up hash buckets here */ | ||
162 | ip_vs_dh_flush(tbl); | ||
163 | |||
164 | /* release the table itself */ | ||
165 | kfree(svc->sched_data); | ||
166 | IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", | ||
167 | sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); | ||
168 | |||
169 | return 0; | ||
170 | } | ||
171 | |||
172 | |||
173 | static int ip_vs_dh_update_svc(struct ip_vs_service *svc) | ||
174 | { | ||
175 | struct ip_vs_dh_bucket *tbl = svc->sched_data; | ||
176 | |||
177 | /* got to clean up hash buckets here */ | ||
178 | ip_vs_dh_flush(tbl); | ||
179 | |||
180 | /* assign the hash buckets with the updated service */ | ||
181 | ip_vs_dh_assign(tbl, svc); | ||
182 | |||
183 | return 0; | ||
184 | } | ||
185 | |||
186 | |||
187 | /* | ||
188 | * If the dest flag IP_VS_DEST_F_OVERLOAD is set, | ||
189 | * consider that the server is overloaded here. | ||
190 | */ | ||
191 | static inline int is_overloaded(struct ip_vs_dest *dest) | ||
192 | { | ||
193 | return dest->flags & IP_VS_DEST_F_OVERLOAD; | ||
194 | } | ||
195 | |||
196 | |||
197 | /* | ||
198 | * Destination hashing scheduling | ||
199 | */ | ||
200 | static struct ip_vs_dest * | ||
201 | ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
202 | { | ||
203 | struct ip_vs_dest *dest; | ||
204 | struct ip_vs_dh_bucket *tbl; | ||
205 | struct iphdr *iph = ip_hdr(skb); | ||
206 | |||
207 | IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); | ||
208 | |||
209 | tbl = (struct ip_vs_dh_bucket *)svc->sched_data; | ||
210 | dest = ip_vs_dh_get(tbl, iph->daddr); | ||
211 | if (!dest | ||
212 | || !(dest->flags & IP_VS_DEST_F_AVAILABLE) | ||
213 | || atomic_read(&dest->weight) <= 0 | ||
214 | || is_overloaded(dest)) { | ||
215 | return NULL; | ||
216 | } | ||
217 | |||
218 | IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " | ||
219 | "--> server %u.%u.%u.%u:%d\n", | ||
220 | NIPQUAD(iph->daddr), | ||
221 | NIPQUAD(dest->addr), | ||
222 | ntohs(dest->port)); | ||
223 | |||
224 | return dest; | ||
225 | } | ||
226 | |||
227 | |||
228 | /* | ||
229 | * IPVS DH Scheduler structure | ||
230 | */ | ||
231 | static struct ip_vs_scheduler ip_vs_dh_scheduler = | ||
232 | { | ||
233 | .name = "dh", | ||
234 | .refcnt = ATOMIC_INIT(0), | ||
235 | .module = THIS_MODULE, | ||
236 | .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), | ||
237 | .init_service = ip_vs_dh_init_svc, | ||
238 | .done_service = ip_vs_dh_done_svc, | ||
239 | .update_service = ip_vs_dh_update_svc, | ||
240 | .schedule = ip_vs_dh_schedule, | ||
241 | }; | ||
242 | |||
243 | |||
244 | static int __init ip_vs_dh_init(void) | ||
245 | { | ||
246 | return register_ip_vs_scheduler(&ip_vs_dh_scheduler); | ||
247 | } | ||
248 | |||
249 | |||
250 | static void __exit ip_vs_dh_cleanup(void) | ||
251 | { | ||
252 | unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); | ||
253 | } | ||
254 | |||
255 | |||
256 | module_init(ip_vs_dh_init); | ||
257 | module_exit(ip_vs_dh_cleanup); | ||
258 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c deleted file mode 100644 index 5a20f93bd7f9..000000000000 --- a/net/ipv4/ipvs/ip_vs_est.c +++ /dev/null | |||
@@ -1,162 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_est.c: simple rate estimator for IPVS | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * | ||
13 | */ | ||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/jiffies.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include <linux/types.h> | ||
18 | #include <linux/interrupt.h> | ||
19 | #include <linux/sysctl.h> | ||
20 | #include <linux/list.h> | ||
21 | |||
22 | #include <net/ip_vs.h> | ||
23 | |||
24 | /* | ||
25 | This code estimates the rate over a short interval (such as 8 | ||
26 | seconds) for virtual services and real servers. To measure the rate | ||
27 | over a long interval, it is easy to implement a user-level daemon that | ||
28 | periodically reads those statistical counters and measures the rate. | ||
29 | |||
30 | Currently, the measurement is activated by a slow timer handler. | ||
31 | Hopefully this measurement will not introduce too much load. | ||
32 | |||
33 | We measure rate during the last 8 seconds every 2 seconds: | ||
34 | |||
35 | avgrate = avgrate*(1-W) + rate*W | ||
36 | |||
37 | where W = 2^(-2) | ||
38 | |||
39 | NOTES. | ||
40 | |||
41 | * The stored value for average bps is scaled by 2^5, so that maximal | ||
42 | rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. | ||
43 | |||
44 | * A lot of code is taken from net/sched/estimator.c | ||
45 | */ | ||
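Concretely, the shifts in estimation_timer() below implement this in fixed point: the counter delta over a 2-second tick is turned into a per-second rate scaled by 2^10 (<<10 combined with /2 gives the <<9), avg += (rate - avg)>>2 applies W = 2^-2, and (avg + 0x1FF)>>10 rounds back to whole units. A minimal user-space sketch assuming a steady 100 new connections per 2-second tick (i.e. 50 cps):

#include <stdio.h>

int main(void)
{
	long total = 0, last = 0, avg = 0, rate;	/* avg holds cps << 10 */
	int tick;

	for (tick = 0; tick < 10; tick++) {
		total += 100;			/* counter grows by 100 per 2s */
		rate = (total - last) << 9;	/* (delta / 2s) << 10 */
		last = total;
		avg += (rate - avg) >> 2;	/* avg = avg*(1-W) + rate*W */
		printf("cps = %ld\n", (avg + 0x1FF) >> 10);	/* -> 50 */
	}
	return 0;
}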
46 | |||
47 | |||
48 | static void estimation_timer(unsigned long arg); | ||
49 | |||
50 | static LIST_HEAD(est_list); | ||
51 | static DEFINE_SPINLOCK(est_lock); | ||
52 | static DEFINE_TIMER(est_timer, estimation_timer, 0, 0); | ||
53 | |||
54 | static void estimation_timer(unsigned long arg) | ||
55 | { | ||
56 | struct ip_vs_estimator *e; | ||
57 | struct ip_vs_stats *s; | ||
58 | u32 n_conns; | ||
59 | u32 n_inpkts, n_outpkts; | ||
60 | u64 n_inbytes, n_outbytes; | ||
61 | u32 rate; | ||
62 | |||
63 | spin_lock(&est_lock); | ||
64 | list_for_each_entry(e, &est_list, list) { | ||
65 | s = container_of(e, struct ip_vs_stats, est); | ||
66 | |||
67 | spin_lock(&s->lock); | ||
68 | n_conns = s->conns; | ||
69 | n_inpkts = s->inpkts; | ||
70 | n_outpkts = s->outpkts; | ||
71 | n_inbytes = s->inbytes; | ||
72 | n_outbytes = s->outbytes; | ||
73 | |||
74 | /* scaled by 2^10, but divided by 2 seconds */ | ||
75 | rate = (n_conns - e->last_conns)<<9; | ||
76 | e->last_conns = n_conns; | ||
77 | e->cps += ((long)rate - (long)e->cps)>>2; | ||
78 | s->cps = (e->cps+0x1FF)>>10; | ||
79 | |||
80 | rate = (n_inpkts - e->last_inpkts)<<9; | ||
81 | e->last_inpkts = n_inpkts; | ||
82 | e->inpps += ((long)rate - (long)e->inpps)>>2; | ||
83 | s->inpps = (e->inpps+0x1FF)>>10; | ||
84 | |||
85 | rate = (n_outpkts - e->last_outpkts)<<9; | ||
86 | e->last_outpkts = n_outpkts; | ||
87 | e->outpps += ((long)rate - (long)e->outpps)>>2; | ||
88 | s->outpps = (e->outpps+0x1FF)>>10; | ||
89 | |||
90 | rate = (n_inbytes - e->last_inbytes)<<4; | ||
91 | e->last_inbytes = n_inbytes; | ||
92 | e->inbps += ((long)rate - (long)e->inbps)>>2; | ||
93 | s->inbps = (e->inbps+0xF)>>5; | ||
94 | |||
95 | rate = (n_outbytes - e->last_outbytes)<<4; | ||
96 | e->last_outbytes = n_outbytes; | ||
97 | e->outbps += ((long)rate - (long)e->outbps)>>2; | ||
98 | s->outbps = (e->outbps+0xF)>>5; | ||
99 | spin_unlock(&s->lock); | ||
100 | } | ||
101 | spin_unlock(&est_lock); | ||
102 | mod_timer(&est_timer, jiffies + 2*HZ); | ||
103 | } | ||
104 | |||
105 | void ip_vs_new_estimator(struct ip_vs_stats *stats) | ||
106 | { | ||
107 | struct ip_vs_estimator *est = &stats->est; | ||
108 | |||
109 | INIT_LIST_HEAD(&est->list); | ||
110 | |||
111 | est->last_conns = stats->conns; | ||
112 | est->cps = stats->cps<<10; | ||
113 | |||
114 | est->last_inpkts = stats->inpkts; | ||
115 | est->inpps = stats->inpps<<10; | ||
116 | |||
117 | est->last_outpkts = stats->outpkts; | ||
118 | est->outpps = stats->outpps<<10; | ||
119 | |||
120 | est->last_inbytes = stats->inbytes; | ||
121 | est->inbps = stats->inbps<<5; | ||
122 | |||
123 | est->last_outbytes = stats->outbytes; | ||
124 | est->outbps = stats->outbps<<5; | ||
125 | |||
126 | spin_lock_bh(&est_lock); | ||
127 | if (list_empty(&est_list)) | ||
128 | mod_timer(&est_timer, jiffies + 2 * HZ); | ||
129 | list_add(&est->list, &est_list); | ||
130 | spin_unlock_bh(&est_lock); | ||
131 | } | ||
132 | |||
133 | void ip_vs_kill_estimator(struct ip_vs_stats *stats) | ||
134 | { | ||
135 | struct ip_vs_estimator *est = &stats->est; | ||
136 | |||
137 | spin_lock_bh(&est_lock); | ||
138 | list_del(&est->list); | ||
139 | while (list_empty(&est_list) && try_to_del_timer_sync(&est_timer) < 0) { | ||
140 | spin_unlock_bh(&est_lock); | ||
141 | cpu_relax(); | ||
142 | spin_lock_bh(&est_lock); | ||
143 | } | ||
144 | spin_unlock_bh(&est_lock); | ||
145 | } | ||
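(A note on the loop above: del_timer_sync() cannot simply be called under est_lock, because estimation_timer() itself takes est_lock and the two would deadlock; hence the spin on try_to_del_timer_sync(), dropping the lock between attempts so a concurrently running handler can finish.)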
146 | |||
147 | void ip_vs_zero_estimator(struct ip_vs_stats *stats) | ||
148 | { | ||
149 | struct ip_vs_estimator *est = &stats->est; | ||
150 | |||
151 | /* set the counters to zero; the caller must hold stats->lock */ | ||
152 | est->last_inbytes = 0; | ||
153 | est->last_outbytes = 0; | ||
154 | est->last_conns = 0; | ||
155 | est->last_inpkts = 0; | ||
156 | est->last_outpkts = 0; | ||
157 | est->cps = 0; | ||
158 | est->inpps = 0; | ||
159 | est->outpps = 0; | ||
160 | est->inbps = 0; | ||
161 | est->outbps = 0; | ||
162 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c deleted file mode 100644 index c1c758e4f733..000000000000 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ /dev/null | |||
@@ -1,393 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_ftp.c: IPVS ftp application module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * Changes: | ||
7 | * | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version | ||
12 | * 2 of the License, or (at your option) any later version. | ||
13 | * | ||
14 | * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference | ||
15 | * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. | ||
16 | * | ||
17 | * IP_MASQ_FTP ftp masquerading module | ||
18 | * | ||
19 | * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 | ||
20 | * | ||
21 | * Author: Wouter Gadeyne | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #include <linux/module.h> | ||
26 | #include <linux/moduleparam.h> | ||
27 | #include <linux/kernel.h> | ||
28 | #include <linux/skbuff.h> | ||
29 | #include <linux/in.h> | ||
30 | #include <linux/ip.h> | ||
31 | #include <linux/netfilter.h> | ||
32 | #include <net/protocol.h> | ||
33 | #include <net/tcp.h> | ||
34 | #include <asm/unaligned.h> | ||
35 | |||
36 | #include <net/ip_vs.h> | ||
37 | |||
38 | |||
39 | #define SERVER_STRING "227 Entering Passive Mode (" | ||
40 | #define CLIENT_STRING "PORT " | ||
41 | |||
42 | |||
43 | /* | ||
44 | * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper | ||
45 | * First port is set to the default port. | ||
46 | */ | ||
47 | static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; | ||
48 | module_param_array(ports, ushort, NULL, 0); | ||
49 | MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); | ||
50 | |||
51 | |||
52 | /* Dummy variable */ | ||
53 | static int ip_vs_ftp_pasv; | ||
54 | |||
55 | |||
56 | static int | ||
57 | ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) | ||
58 | { | ||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | |||
63 | static int | ||
64 | ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) | ||
65 | { | ||
66 | return 0; | ||
67 | } | ||
68 | |||
69 | |||
70 | /* | ||
71 | * Get <addr,port> from the string "xxx,xxx,xxx,xxx,ppp,ppp", starting | ||
72 | * with the "pattern" and terminated with the "term" character. | ||
73 | * <addr,port> is in network order. | ||
74 | */ | ||
75 | static int ip_vs_ftp_get_addrport(char *data, char *data_limit, | ||
76 | const char *pattern, size_t plen, char term, | ||
77 | __be32 *addr, __be16 *port, | ||
78 | char **start, char **end) | ||
79 | { | ||
80 | unsigned char p[6]; | ||
81 | int i = 0; | ||
82 | |||
83 | if (data_limit - data < plen) { | ||
84 | /* check if there is partial match */ | ||
85 | if (strnicmp(data, pattern, data_limit - data) == 0) | ||
86 | return -1; | ||
87 | else | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | if (strnicmp(data, pattern, plen) != 0) { | ||
92 | return 0; | ||
93 | } | ||
94 | *start = data + plen; | ||
95 | |||
96 | for (data = *start; *data != term; data++) { | ||
97 | if (data == data_limit) | ||
98 | return -1; | ||
99 | } | ||
100 | *end = data; | ||
101 | |||
102 | memset(p, 0, sizeof(p)); | ||
103 | for (data = *start; data != *end; data++) { | ||
104 | if (*data >= '0' && *data <= '9') { | ||
105 | p[i] = p[i]*10 + *data - '0'; | ||
106 | } else if (*data == ',' && i < 5) { | ||
107 | i++; | ||
108 | } else { | ||
109 | /* unexpected character */ | ||
110 | return -1; | ||
111 | } | ||
112 | } | ||
113 | |||
114 | if (i != 5) | ||
115 | return -1; | ||
116 | |||
117 | *addr = get_unaligned((__be32 *)p); | ||
118 | *port = get_unaligned((__be16 *)(p + 4)); | ||
119 | return 1; | ||
120 | } | ||
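For reference, the six numbers parsed above follow the classic FTP encoding: "h1,h2,h3,h4,p1,p2" denotes the address h1.h2.h3.h4 and the port p1*256 + p2, which is why the raw bytes in p[] can be reassembled directly in network order. A tiny worked example as a standalone sketch:

#include <stdio.h>

int main(void)
{
	/* e.g. a PASV reply of "... (192,168,0,1,4,210)" */
	unsigned p[6] = { 192, 168, 0, 1, 4, 210 };

	printf("%u.%u.%u.%u port %u\n",
	       p[0], p[1], p[2], p[3], p[4] * 256 + p[5]);	/* port 1234 */
	return 0;
}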
121 | |||
122 | |||
123 | /* | ||
124 | * Look at outgoing ftp packets to catch the response to a PASV command | ||
125 | * from the server (inside-to-outside). | ||
126 | * When we see one, we build a connection entry with the client address, | ||
127 | * client port 0 (unknown at the moment), the server address and the | ||
128 | * server port. Mark the current connection entry as a control channel | ||
129 | * of the new entry. All this work is just so that the data connection | ||
130 | * can be scheduled to the right server later. | ||
131 | * | ||
132 | * The outgoing packet should be something like | ||
133 | * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". | ||
134 | * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. | ||
135 | */ | ||
136 | static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, | ||
137 | struct sk_buff *skb, int *diff) | ||
138 | { | ||
139 | struct iphdr *iph; | ||
140 | struct tcphdr *th; | ||
141 | char *data, *data_limit; | ||
142 | char *start, *end; | ||
143 | __be32 from; | ||
144 | __be16 port; | ||
145 | struct ip_vs_conn *n_cp; | ||
146 | char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ | ||
147 | unsigned buf_len; | ||
148 | int ret; | ||
149 | |||
150 | *diff = 0; | ||
151 | |||
152 | /* Only useful for established sessions */ | ||
153 | if (cp->state != IP_VS_TCP_S_ESTABLISHED) | ||
154 | return 1; | ||
155 | |||
156 | /* Linear packets are much easier to deal with. */ | ||
157 | if (!skb_make_writable(skb, skb->len)) | ||
158 | return 0; | ||
159 | |||
160 | if (cp->app_data == &ip_vs_ftp_pasv) { | ||
161 | iph = ip_hdr(skb); | ||
162 | th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); | ||
163 | data = (char *)th + (th->doff << 2); | ||
164 | data_limit = skb_tail_pointer(skb); | ||
165 | |||
166 | if (ip_vs_ftp_get_addrport(data, data_limit, | ||
167 | SERVER_STRING, | ||
168 | sizeof(SERVER_STRING)-1, ')', | ||
169 | &from, &port, | ||
170 | &start, &end) != 1) | ||
171 | return 1; | ||
172 | |||
173 | IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " | ||
174 | "%u.%u.%u.%u:%d detected\n", | ||
175 | NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0); | ||
176 | |||
177 | /* | ||
178 | * Now update or create a connection entry for it | ||
179 | */ | ||
180 | n_cp = ip_vs_conn_out_get(iph->protocol, from, port, | ||
181 | cp->caddr, 0); | ||
182 | if (!n_cp) { | ||
183 | n_cp = ip_vs_conn_new(IPPROTO_TCP, | ||
184 | cp->caddr, 0, | ||
185 | cp->vaddr, port, | ||
186 | from, port, | ||
187 | IP_VS_CONN_F_NO_CPORT, | ||
188 | cp->dest); | ||
189 | if (!n_cp) | ||
190 | return 0; | ||
191 | |||
192 | /* add its controller */ | ||
193 | ip_vs_control_add(n_cp, cp); | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * Replace the old passive address with the new one | ||
198 | */ | ||
199 | from = n_cp->vaddr; | ||
200 | port = n_cp->vport; | ||
201 | sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), | ||
202 | (ntohs(port)>>8)&255, ntohs(port)&255); | ||
203 | buf_len = strlen(buf); | ||
204 | |||
205 | /* | ||
206 | * Calculate required delta-offset to keep TCP happy | ||
207 | */ | ||
208 | *diff = buf_len - (end-start); | ||
209 | |||
210 | if (*diff == 0) { | ||
211 | /* simply replace it with new passive address */ | ||
212 | memcpy(start, buf, buf_len); | ||
213 | ret = 1; | ||
214 | } else { | ||
215 | ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, | ||
216 | end-start, buf, buf_len); | ||
217 | } | ||
218 | |||
219 | cp->app_data = NULL; | ||
220 | ip_vs_tcp_conn_listen(n_cp); | ||
221 | ip_vs_conn_put(n_cp); | ||
222 | return ret; | ||
223 | } | ||
224 | return 1; | ||
225 | } | ||
226 | |||
227 | |||
228 | /* | ||
229 | * Look at incoming ftp packets to catch the PASV/PORT command | ||
230 | * (outside-to-inside). | ||
231 | * | ||
232 | * The incoming packet having the PORT command should be something like | ||
233 | * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". | ||
234 | * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. | ||
235 | * In this case, we create a connection entry using the client address and | ||
236 | * port, so that the active ftp data connection from the server can reach | ||
237 | * the client. | ||
238 | */ | ||
239 | static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, | ||
240 | struct sk_buff *skb, int *diff) | ||
241 | { | ||
242 | struct iphdr *iph; | ||
243 | struct tcphdr *th; | ||
244 | char *data, *data_start, *data_limit; | ||
245 | char *start, *end; | ||
246 | __be32 to; | ||
247 | __be16 port; | ||
248 | struct ip_vs_conn *n_cp; | ||
249 | |||
250 | /* no diff required for incoming packets */ | ||
251 | *diff = 0; | ||
252 | |||
253 | /* Only useful for established sessions */ | ||
254 | if (cp->state != IP_VS_TCP_S_ESTABLISHED) | ||
255 | return 1; | ||
256 | |||
257 | /* Linear packets are much easier to deal with. */ | ||
258 | if (!skb_make_writable(skb, skb->len)) | ||
259 | return 0; | ||
260 | |||
261 | /* | ||
262 | * Detecting whether it is passive | ||
263 | */ | ||
264 | iph = ip_hdr(skb); | ||
265 | th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); | ||
266 | |||
267 | /* Since there may be OPTIONS in the TCP packet and HLEN is the | ||
268 | length of the header in 32-bit multiples, it is accurate to | ||
269 | calculate the data address as th+HLEN*4 */ | ||
270 | data = data_start = (char *)th + (th->doff << 2); | ||
271 | data_limit = skb_tail_pointer(skb); | ||
272 | |||
273 | while (data <= data_limit - 6) { | ||
274 | if (strnicmp(data, "PASV\r\n", 6) == 0) { | ||
275 | /* Passive mode on */ | ||
276 | IP_VS_DBG(7, "got PASV at %td of %td\n", | ||
277 | data - data_start, | ||
278 | data_limit - data_start); | ||
279 | cp->app_data = &ip_vs_ftp_pasv; | ||
280 | return 1; | ||
281 | } | ||
282 | data++; | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * To support a virtual FTP server, the scenario is as follows: | ||
287 | * FTP client ----> Load Balancer ----> FTP server | ||
288 | * First detect the port number in the application data, | ||
289 | * then create a new connection entry for the coming data | ||
290 | * connection. | ||
291 | */ | ||
292 | if (ip_vs_ftp_get_addrport(data_start, data_limit, | ||
293 | CLIENT_STRING, sizeof(CLIENT_STRING)-1, | ||
294 | '\r', &to, &port, | ||
295 | &start, &end) != 1) | ||
296 | return 1; | ||
297 | |||
298 | IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", | ||
299 | NIPQUAD(to), ntohs(port)); | ||
300 | |||
301 | /* Passive mode off */ | ||
302 | cp->app_data = NULL; | ||
303 | |||
304 | /* | ||
305 | * Now update or create a connection entry for it | ||
306 | */ | ||
307 | IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", | ||
308 | ip_vs_proto_name(iph->protocol), | ||
309 | NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0); | ||
310 | |||
311 | n_cp = ip_vs_conn_in_get(iph->protocol, | ||
312 | to, port, | ||
313 | cp->vaddr, htons(ntohs(cp->vport)-1)); | ||
314 | if (!n_cp) { | ||
315 | n_cp = ip_vs_conn_new(IPPROTO_TCP, | ||
316 | to, port, | ||
317 | cp->vaddr, htons(ntohs(cp->vport)-1), | ||
318 | cp->daddr, htons(ntohs(cp->dport)-1), | ||
319 | 0, | ||
320 | cp->dest); | ||
321 | if (!n_cp) | ||
322 | return 0; | ||
323 | |||
324 | /* add its controller */ | ||
325 | ip_vs_control_add(n_cp, cp); | ||
326 | } | ||
327 | |||
328 | /* | ||
329 | * Move tunnel to listen state | ||
330 | */ | ||
331 | ip_vs_tcp_conn_listen(n_cp); | ||
332 | ip_vs_conn_put(n_cp); | ||
333 | |||
334 | return 1; | ||
335 | } | ||
336 | |||
337 | |||
338 | static struct ip_vs_app ip_vs_ftp = { | ||
339 | .name = "ftp", | ||
340 | .type = IP_VS_APP_TYPE_FTP, | ||
341 | .protocol = IPPROTO_TCP, | ||
342 | .module = THIS_MODULE, | ||
343 | .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), | ||
344 | .init_conn = ip_vs_ftp_init_conn, | ||
345 | .done_conn = ip_vs_ftp_done_conn, | ||
346 | .bind_conn = NULL, | ||
347 | .unbind_conn = NULL, | ||
348 | .pkt_out = ip_vs_ftp_out, | ||
349 | .pkt_in = ip_vs_ftp_in, | ||
350 | }; | ||
351 | |||
352 | |||
353 | /* | ||
354 | * ip_vs_ftp initialization | ||
355 | */ | ||
356 | static int __init ip_vs_ftp_init(void) | ||
357 | { | ||
358 | int i, ret; | ||
359 | struct ip_vs_app *app = &ip_vs_ftp; | ||
360 | |||
361 | ret = register_ip_vs_app(app); | ||
362 | if (ret) | ||
363 | return ret; | ||
364 | |||
365 | for (i=0; i<IP_VS_APP_MAX_PORTS; i++) { | ||
366 | if (!ports[i]) | ||
367 | continue; | ||
368 | ret = register_ip_vs_app_inc(app, app->protocol, ports[i]); | ||
369 | if (ret) | ||
370 | break; | ||
371 | IP_VS_INFO("%s: loaded support on port[%d] = %d\n", | ||
372 | app->name, i, ports[i]); | ||
373 | } | ||
374 | |||
375 | if (ret) | ||
376 | unregister_ip_vs_app(app); | ||
377 | |||
378 | return ret; | ||
379 | } | ||
380 | |||
381 | |||
382 | /* | ||
383 | * ip_vs_ftp finish. | ||
384 | */ | ||
385 | static void __exit ip_vs_ftp_exit(void) | ||
386 | { | ||
387 | unregister_ip_vs_app(&ip_vs_ftp); | ||
388 | } | ||
389 | |||
390 | |||
391 | module_init(ip_vs_ftp_init); | ||
392 | module_exit(ip_vs_ftp_exit); | ||
393 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c deleted file mode 100644 index 7a6a319f544a..000000000000 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ /dev/null | |||
@@ -1,571 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Locality-Based Least-Connection scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@gnuchina.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * Martin Hamilton : fixed the terrible locking bugs | ||
13 | * *lock(tbl->lock) ==> *lock(&tbl->lock) | ||
14 | * Wensong Zhang : fixed the uninitialized tbl->lock bug | ||
15 | * Wensong Zhang : added doing full expiration check to | ||
16 | * collect stale entries of 24+ hours when | ||
17 | * no partial expire check in a half hour | ||
18 | * Julian Anastasov : replaced del_timer call with del_timer_sync | ||
19 | * to avoid the possible race between timer | ||
20 | * handler and del_timer thread in SMP | ||
21 | * | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * The lblc algorithm is as follows (pseudo code): | ||
26 | * | ||
27 | * if cachenode[dest_ip] is null then | ||
28 | * n, cachenode[dest_ip] <- {weighted least-conn node}; | ||
29 | * else | ||
30 | * n <- cachenode[dest_ip]; | ||
31 | * if (n is dead) OR | ||
32 | * (n.conns>n.weight AND | ||
33 | * there is a node m with m.conns<m.weight/2) then | ||
34 | * n, cachenode[dest_ip] <- {weighted least-conn node}; | ||
35 | * | ||
36 | * return n; | ||
37 | * | ||
38 | * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing | ||
39 | * me to write this module. | ||
40 | */ | ||
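To make the reassignment condition concrete, a standalone sketch with invented numbers: the cached node n is abandoned only when it is past its weight while some other node m sits under half of its own, so a briefly busy cache server keeps its locality.

#include <stdio.h>

int main(void)
{
	int n_conns = 10, n_weight = 8;	/* cached node past its weight */
	int m_conns = 2,  m_weight = 8;	/* another node well under half */

	if (n_conns > n_weight && m_conns * 2 < m_weight)
		printf("re-pick cachenode[dest_ip] by weighted least-conn\n");
	return 0;
}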
41 | |||
42 | #include <linux/ip.h> | ||
43 | #include <linux/module.h> | ||
44 | #include <linux/kernel.h> | ||
45 | #include <linux/skbuff.h> | ||
46 | #include <linux/jiffies.h> | ||
47 | |||
48 | /* for sysctl */ | ||
49 | #include <linux/fs.h> | ||
50 | #include <linux/sysctl.h> | ||
51 | |||
52 | #include <net/ip_vs.h> | ||
53 | |||
54 | |||
55 | /* | ||
56 | * It is for garbage collection of stale IPVS lblc entries, | ||
57 | * when the table is full. | ||
58 | */ | ||
59 | #define CHECK_EXPIRE_INTERVAL (60*HZ) | ||
60 | #define ENTRY_TIMEOUT (6*60*HZ) | ||
61 | |||
62 | /* | ||
63 | * It is for full expiration check. | ||
64 | * When there is no partial expiration check (garbage collection) | ||
65 | * in a half hour, do a full expiration check to collect stale | ||
66 | * entries that haven't been touched for a day. | ||
67 | */ | ||
68 | #define COUNT_FOR_FULL_EXPIRATION 30 | ||
69 | static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; | ||
70 | |||
71 | |||
72 | /* | ||
73 | * for IPVS lblc entry hash table | ||
74 | */ | ||
75 | #ifndef CONFIG_IP_VS_LBLC_TAB_BITS | ||
76 | #define CONFIG_IP_VS_LBLC_TAB_BITS 10 | ||
77 | #endif | ||
78 | #define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS | ||
79 | #define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) | ||
80 | #define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) | ||
81 | |||
82 | |||
83 | /* | ||
84 | * IPVS lblc entry represents an association between destination | ||
85 | * IP address and its destination server | ||
86 | */ | ||
87 | struct ip_vs_lblc_entry { | ||
88 | struct list_head list; | ||
89 | __be32 addr; /* destination IP address */ | ||
90 | struct ip_vs_dest *dest; /* real server (cache) */ | ||
91 | unsigned long lastuse; /* last used time */ | ||
92 | }; | ||
93 | |||
94 | |||
95 | /* | ||
96 | * IPVS lblc hash table | ||
97 | */ | ||
98 | struct ip_vs_lblc_table { | ||
99 | rwlock_t lock; /* lock for this table */ | ||
100 | struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ | ||
101 | atomic_t entries; /* number of entries */ | ||
102 | int max_size; /* maximum size of entries */ | ||
103 | struct timer_list periodic_timer; /* collect stale entries */ | ||
104 | int rover; /* rover for expire check */ | ||
105 | int counter; /* counter for no expire */ | ||
106 | }; | ||
107 | |||
108 | |||
109 | /* | ||
110 | * IPVS LBLC sysctl table | ||
111 | */ | ||
112 | |||
113 | static ctl_table vs_vars_table[] = { | ||
114 | { | ||
115 | .procname = "lblc_expiration", | ||
116 | .data = &sysctl_ip_vs_lblc_expiration, | ||
117 | .maxlen = sizeof(int), | ||
118 | .mode = 0644, | ||
119 | .proc_handler = &proc_dointvec_jiffies, | ||
120 | }, | ||
121 | { .ctl_name = 0 } | ||
122 | }; | ||
123 | |||
124 | static struct ctl_table_header * sysctl_header; | ||
125 | |||
126 | /* | ||
127 | * new/free an ip_vs_lblc_entry, which is a mapping of a destination | ||
128 | * IP address to a server. | ||
129 | */ | ||
130 | static inline struct ip_vs_lblc_entry * | ||
131 | ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest) | ||
132 | { | ||
133 | struct ip_vs_lblc_entry *en; | ||
134 | |||
135 | en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); | ||
136 | if (en == NULL) { | ||
137 | IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); | ||
138 | return NULL; | ||
139 | } | ||
140 | |||
141 | INIT_LIST_HEAD(&en->list); | ||
142 | en->addr = daddr; | ||
143 | |||
144 | atomic_inc(&dest->refcnt); | ||
145 | en->dest = dest; | ||
146 | |||
147 | return en; | ||
148 | } | ||
149 | |||
150 | |||
151 | static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) | ||
152 | { | ||
153 | list_del(&en->list); | ||
154 | /* | ||
155 | * We don't kfree dest because it is referred to either by its | ||
156 | * service or by the trash dest list. | ||
157 | */ | ||
158 | atomic_dec(&en->dest->refcnt); | ||
159 | kfree(en); | ||
160 | } | ||
161 | |||
162 | |||
163 | /* | ||
164 | * Returns hash value for IPVS LBLC entry | ||
165 | */ | ||
166 | static inline unsigned ip_vs_lblc_hashkey(__be32 addr) | ||
167 | { | ||
168 | return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; | ||
169 | } | ||
170 | |||
171 | |||
172 | /* | ||
173 | * Hash an entry in the ip_vs_lblc_table. | ||
174 | * returns bool success. | ||
175 | */ | ||
176 | static int | ||
177 | ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) | ||
178 | { | ||
179 | unsigned hash; | ||
180 | |||
181 | if (!list_empty(&en->list)) { | ||
182 | IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " | ||
183 | "called from %p\n", __builtin_return_address(0)); | ||
184 | return 0; | ||
185 | } | ||
186 | |||
187 | /* | ||
188 | * Hash by destination IP address | ||
189 | */ | ||
190 | hash = ip_vs_lblc_hashkey(en->addr); | ||
191 | |||
192 | write_lock(&tbl->lock); | ||
193 | list_add(&en->list, &tbl->bucket[hash]); | ||
194 | atomic_inc(&tbl->entries); | ||
195 | write_unlock(&tbl->lock); | ||
196 | |||
197 | return 1; | ||
198 | } | ||
199 | |||
200 | |||
201 | /* | ||
202 | * Get ip_vs_lblc_entry associated with supplied parameters. | ||
203 | */ | ||
204 | static inline struct ip_vs_lblc_entry * | ||
205 | ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) | ||
206 | { | ||
207 | unsigned hash; | ||
208 | struct ip_vs_lblc_entry *en; | ||
209 | |||
210 | hash = ip_vs_lblc_hashkey(addr); | ||
211 | |||
212 | read_lock(&tbl->lock); | ||
213 | |||
214 | list_for_each_entry(en, &tbl->bucket[hash], list) { | ||
215 | if (en->addr == addr) { | ||
216 | /* HIT */ | ||
217 | read_unlock(&tbl->lock); | ||
218 | return en; | ||
219 | } | ||
220 | } | ||
221 | |||
222 | read_unlock(&tbl->lock); | ||
223 | |||
224 | return NULL; | ||
225 | } | ||
226 | |||
227 | |||
228 | /* | ||
229 | * Flush all the entries of the specified table. | ||
230 | */ | ||
231 | static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) | ||
232 | { | ||
233 | int i; | ||
234 | struct ip_vs_lblc_entry *en, *nxt; | ||
235 | |||
236 | for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { | ||
237 | write_lock(&tbl->lock); | ||
238 | list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { | ||
239 | ip_vs_lblc_free(en); | ||
240 | atomic_dec(&tbl->entries); | ||
241 | } | ||
242 | write_unlock(&tbl->lock); | ||
243 | } | ||
244 | } | ||
245 | |||
246 | |||
247 | static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) | ||
248 | { | ||
249 | unsigned long now = jiffies; | ||
250 | int i, j; | ||
251 | struct ip_vs_lblc_entry *en, *nxt; | ||
252 | |||
253 | for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { | ||
254 | j = (j + 1) & IP_VS_LBLC_TAB_MASK; | ||
255 | |||
256 | write_lock(&tbl->lock); | ||
257 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | ||
258 | if (time_before(now, | ||
259 | en->lastuse + sysctl_ip_vs_lblc_expiration)) | ||
260 | continue; | ||
261 | |||
262 | ip_vs_lblc_free(en); | ||
263 | atomic_dec(&tbl->entries); | ||
264 | } | ||
265 | write_unlock(&tbl->lock); | ||
266 | } | ||
267 | tbl->rover = j; | ||
268 | } | ||
269 | |||
270 | |||
271 | /* | ||
272 | * Periodical timer handler for IPVS lblc table | ||
273 | * It is used to collect stale entries when the number of entries | ||
274 | * exceeds the maximum size of the table. | ||
275 | * | ||
276 | * Fixme: we probably need a more complicated algorithm to collect | ||
277 | * entries that have not been used for a long time even | ||
278 | * if the number of entries doesn't exceed the maximum size | ||
279 | * of the table. | ||
280 | * The full expiration check is for this purpose now. | ||
281 | */ | ||
282 | static void ip_vs_lblc_check_expire(unsigned long data) | ||
283 | { | ||
284 | struct ip_vs_lblc_table *tbl; | ||
285 | unsigned long now = jiffies; | ||
286 | int goal; | ||
287 | int i, j; | ||
288 | struct ip_vs_lblc_entry *en, *nxt; | ||
289 | |||
290 | tbl = (struct ip_vs_lblc_table *)data; | ||
291 | |||
292 | if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { | ||
293 | /* do full expiration check */ | ||
294 | ip_vs_lblc_full_check(tbl); | ||
295 | tbl->counter = 1; | ||
296 | goto out; | ||
297 | } | ||
298 | |||
299 | if (atomic_read(&tbl->entries) <= tbl->max_size) { | ||
300 | tbl->counter++; | ||
301 | goto out; | ||
302 | } | ||
303 | |||
304 | goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; | ||
305 | if (goal > tbl->max_size/2) | ||
306 | goal = tbl->max_size/2; | ||
307 | |||
308 | for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { | ||
309 | j = (j + 1) & IP_VS_LBLC_TAB_MASK; | ||
310 | |||
311 | write_lock(&tbl->lock); | ||
312 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | ||
313 | if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) | ||
314 | continue; | ||
315 | |||
316 | ip_vs_lblc_free(en); | ||
317 | atomic_dec(&tbl->entries); | ||
318 | goal--; | ||
319 | } | ||
320 | write_unlock(&tbl->lock); | ||
321 | if (goal <= 0) | ||
322 | break; | ||
323 | } | ||
324 | tbl->rover = j; | ||
325 | |||
326 | out: | ||
327 | mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); | ||
328 | } | ||
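A worked example of the reclaim goal above, as a sketch using this file's defaults (max_size = IP_VS_LBLC_TAB_SIZE*16 = 16384):

#include <stdio.h>

int main(void)
{
	int max_size = 16384;			/* IP_VS_LBLC_TAB_SIZE * 16 */
	int entries = 20480;			/* 4096 over the limit */
	int goal = (entries - max_size) * 4 / 3;	/* 5461 */

	if (goal > max_size / 2)
		goal = max_size / 2;		/* cap at 8192; not hit here */
	printf("try to reclaim %d entries this round\n", goal);
	return 0;
}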
329 | |||
330 | |||
331 | static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) | ||
332 | { | ||
333 | int i; | ||
334 | struct ip_vs_lblc_table *tbl; | ||
335 | |||
336 | /* | ||
337 | * Allocate the ip_vs_lblc_table for this service | ||
338 | */ | ||
339 | tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); | ||
340 | if (tbl == NULL) { | ||
341 | IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); | ||
342 | return -ENOMEM; | ||
343 | } | ||
344 | svc->sched_data = tbl; | ||
345 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " | ||
346 | "current service\n", | ||
347 | sizeof(struct ip_vs_lblc_table)); | ||
348 | |||
349 | /* | ||
350 | * Initialize the hash buckets | ||
351 | */ | ||
352 | for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { | ||
353 | INIT_LIST_HEAD(&tbl->bucket[i]); | ||
354 | } | ||
355 | rwlock_init(&tbl->lock); | ||
356 | tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; | ||
357 | tbl->rover = 0; | ||
358 | tbl->counter = 1; | ||
359 | |||
360 | /* | ||
361 | * Hook periodic timer for garbage collection | ||
362 | */ | ||
363 | setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, | ||
364 | (unsigned long)tbl); | ||
365 | tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; | ||
366 | add_timer(&tbl->periodic_timer); | ||
367 | |||
368 | return 0; | ||
369 | } | ||
370 | |||
371 | |||
372 | static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) | ||
373 | { | ||
374 | struct ip_vs_lblc_table *tbl = svc->sched_data; | ||
375 | |||
376 | /* remove periodic timer */ | ||
377 | del_timer_sync(&tbl->periodic_timer); | ||
378 | |||
379 | /* got to clean up table entries here */ | ||
380 | ip_vs_lblc_flush(tbl); | ||
381 | |||
382 | /* release the table itself */ | ||
383 | kfree(svc->sched_data); | ||
384 | IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", | ||
385 | sizeof(struct ip_vs_lblc_table)); | ||
386 | |||
387 | return 0; | ||
388 | } | ||
389 | |||
390 | |||
391 | static int ip_vs_lblc_update_svc(struct ip_vs_service *svc) | ||
392 | { | ||
393 | return 0; | ||
394 | } | ||
395 | |||
396 | |||
397 | static inline struct ip_vs_dest * | ||
398 | __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) | ||
399 | { | ||
400 | struct ip_vs_dest *dest, *least; | ||
401 | int loh, doh; | ||
402 | |||
403 | /* | ||
404 | * We think the overhead of processing active connections is fifty | ||
405 | * times higher than that of inactive connections on average. (This | ||
406 | * factor of fifty might not be accurate; we will change it later.) We | ||
407 | * use the following formula to estimate the overhead: | ||
408 | * dest->activeconns*50 + dest->inactconns | ||
409 | * and the load: | ||
410 | * (dest overhead) / dest->weight | ||
411 | * | ||
412 | * Remember -- no floats in kernel mode!!! | ||
413 | * The comparison of h1*w2 > h2*w1 is equivalent to that of | ||
414 | * h1/w1 > h2/w2 | ||
415 | * if every weight is larger than zero. | ||
416 | * | ||
417 | * The server with weight=0 is quiesced and will not receive any | ||
418 | * new connection. | ||
419 | */ | ||
420 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
421 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
422 | continue; | ||
423 | if (atomic_read(&dest->weight) > 0) { | ||
424 | least = dest; | ||
425 | loh = atomic_read(&least->activeconns) * 50 | ||
426 | + atomic_read(&least->inactconns); | ||
427 | goto nextstage; | ||
428 | } | ||
429 | } | ||
430 | return NULL; | ||
431 | |||
432 | /* | ||
433 | * Find the destination with the least load. | ||
434 | */ | ||
435 | nextstage: | ||
436 | list_for_each_entry_continue(dest, &svc->destinations, n_list) { | ||
437 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
438 | continue; | ||
439 | |||
440 | doh = atomic_read(&dest->activeconns) * 50 | ||
441 | + atomic_read(&dest->inactconns); | ||
442 | if (loh * atomic_read(&dest->weight) > | ||
443 | doh * atomic_read(&least->weight)) { | ||
444 | least = dest; | ||
445 | loh = doh; | ||
446 | } | ||
447 | } | ||
448 | |||
449 | IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " | ||
450 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
451 | NIPQUAD(least->addr), ntohs(least->port), | ||
452 | atomic_read(&least->activeconns), | ||
453 | atomic_read(&least->refcnt), | ||
454 | atomic_read(&least->weight), loh); | ||
455 | |||
456 | return least; | ||
457 | } | ||
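A quick numeric check of the cross-multiplied comparison above, as a standalone sketch: with overheads h1 = 100, h2 = 30 and weights w1 = 2, w2 = 1, the per-weight loads are 50 and 30, and the integer test agrees without any division:

#include <stdio.h>

int main(void)
{
	int h1 = 100, w1 = 2;	/* overhead = activeconns*50 + inactconns */
	int h2 = 30,  w2 = 1;

	/* h1/w1 > h2/w2  <=>  h1*w2 > h2*w1, valid while weights > 0 */
	if (h1 * w2 > h2 * w1)
		printf("the second server carries the lighter load\n");
	return 0;
}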
458 | |||
459 | |||
460 | /* | ||
461 | * If this destination server is overloaded and there is a less loaded | ||
462 | * server, then return true. | ||
463 | */ | ||
464 | static inline int | ||
465 | is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) | ||
466 | { | ||
467 | if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { | ||
468 | struct ip_vs_dest *d; | ||
469 | |||
470 | list_for_each_entry(d, &svc->destinations, n_list) { | ||
471 | if (atomic_read(&d->activeconns)*2 | ||
472 | < atomic_read(&d->weight)) { | ||
473 | return 1; | ||
474 | } | ||
475 | } | ||
476 | } | ||
477 | return 0; | ||
478 | } | ||
479 | |||
480 | |||
481 | /* | ||
482 | * Locality-Based (weighted) Least-Connection scheduling | ||
483 | */ | ||
484 | static struct ip_vs_dest * | ||
485 | ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
486 | { | ||
487 | struct ip_vs_dest *dest; | ||
488 | struct ip_vs_lblc_table *tbl; | ||
489 | struct ip_vs_lblc_entry *en; | ||
490 | struct iphdr *iph = ip_hdr(skb); | ||
491 | |||
492 | IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); | ||
493 | |||
494 | tbl = (struct ip_vs_lblc_table *)svc->sched_data; | ||
495 | en = ip_vs_lblc_get(tbl, iph->daddr); | ||
496 | if (en == NULL) { | ||
497 | dest = __ip_vs_wlc_schedule(svc, iph); | ||
498 | if (dest == NULL) { | ||
499 | IP_VS_DBG(1, "no destination available\n"); | ||
500 | return NULL; | ||
501 | } | ||
502 | en = ip_vs_lblc_new(iph->daddr, dest); | ||
503 | if (en == NULL) { | ||
504 | return NULL; | ||
505 | } | ||
506 | ip_vs_lblc_hash(tbl, en); | ||
507 | } else { | ||
508 | dest = en->dest; | ||
509 | if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) | ||
510 | || atomic_read(&dest->weight) <= 0 | ||
511 | || is_overloaded(dest, svc)) { | ||
512 | dest = __ip_vs_wlc_schedule(svc, iph); | ||
513 | if (dest == NULL) { | ||
514 | IP_VS_DBG(1, "no destination available\n"); | ||
515 | return NULL; | ||
516 | } | ||
517 | atomic_dec(&en->dest->refcnt); | ||
518 | atomic_inc(&dest->refcnt); | ||
519 | en->dest = dest; | ||
520 | } | ||
521 | } | ||
522 | en->lastuse = jiffies; | ||
523 | |||
524 | IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " | ||
525 | "--> server %u.%u.%u.%u:%d\n", | ||
526 | NIPQUAD(en->addr), | ||
527 | NIPQUAD(dest->addr), | ||
528 | ntohs(dest->port)); | ||
529 | |||
530 | return dest; | ||
531 | } | ||
532 | |||
533 | |||
534 | /* | ||
535 | * IPVS LBLC Scheduler structure | ||
536 | */ | ||
537 | static struct ip_vs_scheduler ip_vs_lblc_scheduler = | ||
538 | { | ||
539 | .name = "lblc", | ||
540 | .refcnt = ATOMIC_INIT(0), | ||
541 | .module = THIS_MODULE, | ||
542 | .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), | ||
543 | .init_service = ip_vs_lblc_init_svc, | ||
544 | .done_service = ip_vs_lblc_done_svc, | ||
545 | .update_service = ip_vs_lblc_update_svc, | ||
546 | .schedule = ip_vs_lblc_schedule, | ||
547 | }; | ||
548 | |||
549 | |||
550 | static int __init ip_vs_lblc_init(void) | ||
551 | { | ||
552 | int ret; | ||
553 | |||
554 | sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); | ||
555 | ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); | ||
556 | if (ret) | ||
557 | unregister_sysctl_table(sysctl_header); | ||
558 | return ret; | ||
559 | } | ||
560 | |||
561 | |||
562 | static void __exit ip_vs_lblc_cleanup(void) | ||
563 | { | ||
564 | unregister_sysctl_table(sysctl_header); | ||
565 | unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); | ||
566 | } | ||
567 | |||
568 | |||
569 | module_init(ip_vs_lblc_init); | ||
570 | module_exit(ip_vs_lblc_cleanup); | ||
571 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c deleted file mode 100644 index c234e73968a6..000000000000 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ /dev/null | |||
@@ -1,760 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Locality-Based Least-Connection with Replication scheduler | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@gnuchina.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * Julian Anastasov : Added the missing (dest->weight>0) | ||
13 | * condition in the ip_vs_dest_set_max. | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | /* | ||
18 | * The lblc/r algorithm is as follows (pseudo code): | ||
19 | * | ||
20 | * if serverSet[dest_ip] is null then | ||
21 | * n, serverSet[dest_ip] <- {weighted least-conn node}; | ||
22 | * else | ||
23 | * n <- {least-conn (alive) node in serverSet[dest_ip]}; | ||
24 | * if (n is null) OR | ||
25 | * (n.conns>n.weight AND | ||
26 | * there is a node m with m.conns<m.weight/2) then | ||
27 | * n <- {weighted least-conn node}; | ||
28 | * add n to serverSet[dest_ip]; | ||
29 | * if |serverSet[dest_ip]| > 1 AND | ||
30 | * now - serverSet[dest_ip].lastMod > T then | ||
31 | * m <- {most conn node in serverSet[dest_ip]}; | ||
32 | * remove m from serverSet[dest_ip]; | ||
33 | * if serverSet[dest_ip] changed then | ||
34 | * serverSet[dest_ip].lastMod <- now; | ||
35 | * | ||
36 | * return n; | ||
37 | * | ||
38 | */ | ||
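The pruning branch near the end can be read with a small sketch: once the replicated server set has more than one member and has not changed for longer than T, the most-loaded member is dropped, shrinking the set back toward a single locality.

#include <stdio.h>

int main(void)
{
	int set_size = 3;			/* |serverSet[dest_ip]| */
	long now = 1000, last_mod = 100, T = 600;	/* invented times */

	if (set_size > 1 && now - last_mod > T)
		printf("remove the most-loaded node from serverSet\n");
	return 0;
}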
39 | |||
40 | #include <linux/ip.h> | ||
41 | #include <linux/module.h> | ||
42 | #include <linux/kernel.h> | ||
43 | #include <linux/skbuff.h> | ||
44 | #include <linux/jiffies.h> | ||
45 | |||
46 | /* for sysctl */ | ||
47 | #include <linux/fs.h> | ||
48 | #include <linux/sysctl.h> | ||
49 | #include <net/net_namespace.h> | ||
50 | |||
51 | #include <net/ip_vs.h> | ||
52 | |||
53 | |||
54 | /* | ||
55 | * It is for garbage collection of stale IPVS lblcr entries, | ||
56 | * when the table is full. | ||
57 | */ | ||
58 | #define CHECK_EXPIRE_INTERVAL (60*HZ) | ||
59 | #define ENTRY_TIMEOUT (6*60*HZ) | ||
60 | |||
61 | /* | ||
62 | * It is for full expiration check. | ||
63 | * When there is no partial expiration check (garbage collection) | ||
64 | * in a half hour, do a full expiration check to collect stale | ||
65 | * entries that haven't been touched for a day. | ||
66 | */ | ||
67 | #define COUNT_FOR_FULL_EXPIRATION 30 | ||
68 | static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; | ||
69 | |||
70 | |||
71 | /* | ||
72 | * for IPVS lblcr entry hash table | ||
73 | */ | ||
74 | #ifndef CONFIG_IP_VS_LBLCR_TAB_BITS | ||
75 | #define CONFIG_IP_VS_LBLCR_TAB_BITS 10 | ||
76 | #endif | ||
77 | #define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS | ||
78 | #define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) | ||
79 | #define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) | ||
80 | |||
81 | |||
82 | /* | ||
83 | * IPVS destination set structure and operations | ||
84 | */ | ||
85 | struct ip_vs_dest_list { | ||
86 | struct ip_vs_dest_list *next; /* list link */ | ||
87 | struct ip_vs_dest *dest; /* destination server */ | ||
88 | }; | ||
89 | |||
90 | struct ip_vs_dest_set { | ||
91 | atomic_t size; /* set size */ | ||
92 | unsigned long lastmod; /* last modified time */ | ||
93 | struct ip_vs_dest_list *list; /* destination list */ | ||
94 | rwlock_t lock; /* lock for this list */ | ||
95 | }; | ||
96 | |||
97 | |||
98 | static struct ip_vs_dest_list * | ||
99 | ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) | ||
100 | { | ||
101 | struct ip_vs_dest_list *e; | ||
102 | |||
103 | for (e=set->list; e!=NULL; e=e->next) { | ||
104 | if (e->dest == dest) | ||
105 | /* already existed */ | ||
106 | return NULL; | ||
107 | } | ||
108 | |||
109 | e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC); | ||
110 | if (e == NULL) { | ||
111 | IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); | ||
112 | return NULL; | ||
113 | } | ||
114 | |||
115 | atomic_inc(&dest->refcnt); | ||
116 | e->dest = dest; | ||
117 | |||
118 | /* link it to the list */ | ||
119 | write_lock(&set->lock); | ||
120 | e->next = set->list; | ||
121 | set->list = e; | ||
122 | atomic_inc(&set->size); | ||
123 | write_unlock(&set->lock); | ||
124 | |||
125 | set->lastmod = jiffies; | ||
126 | return e; | ||
127 | } | ||
128 | |||
129 | static void | ||
130 | ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) | ||
131 | { | ||
132 | struct ip_vs_dest_list *e, **ep; | ||
133 | |||
134 | write_lock(&set->lock); | ||
135 | for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { | ||
136 | if (e->dest == dest) { | ||
137 | /* HIT */ | ||
138 | *ep = e->next; | ||
139 | atomic_dec(&set->size); | ||
140 | set->lastmod = jiffies; | ||
141 | atomic_dec(&e->dest->refcnt); | ||
142 | kfree(e); | ||
143 | break; | ||
144 | } | ||
145 | ep = &e->next; | ||
146 | } | ||
147 | write_unlock(&set->lock); | ||
148 | } | ||
149 | |||
150 | static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) | ||
151 | { | ||
152 | struct ip_vs_dest_list *e, **ep; | ||
153 | |||
154 | write_lock(&set->lock); | ||
155 | for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { | ||
156 | *ep = e->next; | ||
157 | /* | ||
158 | * We don't kfree dest because it is referred to either | ||
159 | * by its service or by the trash dest list. | ||
160 | */ | ||
161 | atomic_dec(&e->dest->refcnt); | ||
162 | kfree(e); | ||
163 | } | ||
164 | write_unlock(&set->lock); | ||
165 | } | ||
166 | |||
167 | /* get weighted least-connection node in the destination set */ | ||
168 | static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) | ||
169 | { | ||
170 | register struct ip_vs_dest_list *e; | ||
171 | struct ip_vs_dest *dest, *least; | ||
172 | int loh, doh; | ||
173 | |||
174 | if (set == NULL) | ||
175 | return NULL; | ||
176 | |||
177 | read_lock(&set->lock); | ||
178 | /* select the first destination server, whose weight > 0 */ | ||
179 | for (e=set->list; e!=NULL; e=e->next) { | ||
180 | least = e->dest; | ||
181 | if (least->flags & IP_VS_DEST_F_OVERLOAD) | ||
182 | continue; | ||
183 | |||
184 | if ((atomic_read(&least->weight) > 0) | ||
185 | && (least->flags & IP_VS_DEST_F_AVAILABLE)) { | ||
186 | loh = atomic_read(&least->activeconns) * 50 | ||
187 | + atomic_read(&least->inactconns); | ||
188 | goto nextstage; | ||
189 | } | ||
190 | } | ||
191 | read_unlock(&set->lock); | ||
192 | return NULL; | ||
193 | |||
194 | /* find the destination with the weighted least load */ | ||
195 | nextstage: | ||
196 | for (e=e->next; e!=NULL; e=e->next) { | ||
197 | dest = e->dest; | ||
198 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
199 | continue; | ||
200 | |||
201 | doh = atomic_read(&dest->activeconns) * 50 | ||
202 | + atomic_read(&dest->inactconns); | ||
203 | if ((loh * atomic_read(&dest->weight) > | ||
204 | doh * atomic_read(&least->weight)) | ||
205 | && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | ||
206 | least = dest; | ||
207 | loh = doh; | ||
208 | } | ||
209 | } | ||
210 | read_unlock(&set->lock); | ||
211 | |||
212 | IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " | ||
213 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
214 | NIPQUAD(least->addr), ntohs(least->port), | ||
215 | atomic_read(&least->activeconns), | ||
216 | atomic_read(&least->refcnt), | ||
217 | atomic_read(&least->weight), loh); | ||
218 | return least; | ||
219 | } | ||
220 | |||
221 | |||
222 | /* get weighted most-connection node in the destination set */ | ||
223 | static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) | ||
224 | { | ||
225 | register struct ip_vs_dest_list *e; | ||
226 | struct ip_vs_dest *dest, *most; | ||
227 | int moh, doh; | ||
228 | |||
229 | if (set == NULL) | ||
230 | return NULL; | ||
231 | |||
232 | read_lock(&set->lock); | ||
233 | /* select the first destination server whose weight > 0 */ | ||
234 | for (e=set->list; e!=NULL; e=e->next) { | ||
235 | most = e->dest; | ||
236 | if (atomic_read(&most->weight) > 0) { | ||
237 | moh = atomic_read(&most->activeconns) * 50 | ||
238 | + atomic_read(&most->inactconns); | ||
239 | goto nextstage; | ||
240 | } | ||
241 | } | ||
242 | read_unlock(&set->lock); | ||
243 | return NULL; | ||
244 | |||
245 | /* find the destination with the weighted most load */ | ||
246 | nextstage: | ||
247 | for (e=e->next; e!=NULL; e=e->next) { | ||
248 | dest = e->dest; | ||
249 | doh = atomic_read(&dest->activeconns) * 50 | ||
250 | + atomic_read(&dest->inactconns); | ||
251 | /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ | ||
252 | if ((moh * atomic_read(&dest->weight) < | ||
253 | doh * atomic_read(&most->weight)) | ||
254 | && (atomic_read(&dest->weight) > 0)) { | ||
255 | most = dest; | ||
256 | moh = doh; | ||
257 | } | ||
258 | } | ||
259 | read_unlock(&set->lock); | ||
260 | |||
261 | IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " | ||
262 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
263 | NIPQUAD(most->addr), ntohs(most->port), | ||
264 | atomic_read(&most->activeconns), | ||
265 | atomic_read(&most->refcnt), | ||
266 | atomic_read(&most->weight), moh); | ||
267 | return most; | ||
268 | } | ||
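
Both selectors above avoid kernel-space floating point by cross-multiplying: loh/lw > doh/dw holds exactly when loh*dw > doh*lw, as long as both weights are positive. A minimal userspace sketch with made-up numbers (not IPVS code):

#include <stdio.h>

/* loh/lw > doh/dw  <=>  loh*dw > doh*lw, valid while lw, dw > 0,
 * so the comparison stays in integer arithmetic. */
static int less_loaded(long loh, long lw, long doh, long dw)
{
	return loh * dw > doh * lw;   /* 1 if dest is less loaded than least */
}

int main(void)
{
	/* least: overhead 100, weight 2 (load 50);
	 * dest:  overhead 120, weight 3 (load 40) -> dest wins */
	printf("%d\n", less_loaded(100, 2, 120, 3));   /* prints 1: 300 > 240 */
	return 0;
}
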
269 | |||
270 | |||
271 | /* | ||
272 | * IPVS lblcr entry represents an association between destination | ||
273 | * IP address and its destination server set | ||
274 | */ | ||
275 | struct ip_vs_lblcr_entry { | ||
276 | struct list_head list; | ||
277 | __be32 addr; /* destination IP address */ | ||
278 | struct ip_vs_dest_set set; /* destination server set */ | ||
279 | unsigned long lastuse; /* last used time */ | ||
280 | }; | ||
281 | |||
282 | |||
283 | /* | ||
284 | * IPVS lblcr hash table | ||
285 | */ | ||
286 | struct ip_vs_lblcr_table { | ||
287 | rwlock_t lock; /* lock for this table */ | ||
288 | struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ | ||
289 | atomic_t entries; /* number of entries */ | ||
290 | int max_size; /* maximum size of entries */ | ||
291 | struct timer_list periodic_timer; /* collect stale entries */ | ||
292 | int rover; /* rover for expire check */ | ||
293 | int counter; /* counter for no expire */ | ||
294 | }; | ||
295 | |||
296 | |||
297 | /* | ||
298 | * IPVS LBLCR sysctl table | ||
299 | */ | ||
300 | |||
301 | static ctl_table vs_vars_table[] = { | ||
302 | { | ||
303 | .procname = "lblcr_expiration", | ||
304 | .data = &sysctl_ip_vs_lblcr_expiration, | ||
305 | .maxlen = sizeof(int), | ||
306 | .mode = 0644, | ||
307 | .proc_handler = &proc_dointvec_jiffies, | ||
308 | }, | ||
309 | { .ctl_name = 0 } | ||
310 | }; | ||
311 | |||
312 | static struct ctl_table_header * sysctl_header; | ||
313 | |||
314 | /* | ||
315 | * new/free an ip_vs_lblcr_entry, which is a mapping of a destination | ||
316 | * IP address to a server. | ||
317 | */ | ||
318 | static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr) | ||
319 | { | ||
320 | struct ip_vs_lblcr_entry *en; | ||
321 | |||
322 | en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC); | ||
323 | if (en == NULL) { | ||
324 | IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); | ||
325 | return NULL; | ||
326 | } | ||
327 | |||
328 | INIT_LIST_HEAD(&en->list); | ||
329 | en->addr = daddr; | ||
330 | |||
331 | /* initialize its dest set */ | ||
332 | atomic_set(&(en->set.size), 0); | ||
333 | en->set.list = NULL; | ||
334 | rwlock_init(&en->set.lock); | ||
335 | |||
336 | return en; | ||
337 | } | ||
338 | |||
339 | |||
340 | static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) | ||
341 | { | ||
342 | list_del(&en->list); | ||
343 | ip_vs_dest_set_eraseall(&en->set); | ||
344 | kfree(en); | ||
345 | } | ||
346 | |||
347 | |||
348 | /* | ||
349 | * Returns hash value for IPVS LBLCR entry | ||
350 | */ | ||
351 | static inline unsigned ip_vs_lblcr_hashkey(__be32 addr) | ||
352 | { | ||
353 | return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; | ||
354 | } | ||
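
The hash key multiplies the host-order address by 2654435761, the golden-ratio constant (2^32/phi). Because the multiplier is odd, multiplication is a bijection modulo the table size, so runs of nearby addresses are scattered across buckets. A standalone sketch; IP_VS_LBLCR_TAB_BITS is a build-time option and 10 is only an assumed value here:

#include <stdio.h>
#include <stdint.h>

#define TAB_BITS 10                   /* assumed; set by Kconfig in IPVS */
#define TAB_MASK ((1u << TAB_BITS) - 1)

/* Knuth multiplicative hash: the odd constant permutes the low bits
 * kept by the mask, spreading consecutive addresses apart. */
static unsigned hashkey(uint32_t host_order_addr)
{
	return (host_order_addr * 2654435761UL) & TAB_MASK;
}

int main(void)
{
	/* consecutive addresses land in widely separated buckets */
	for (uint32_t a = 0xc0a80001; a < 0xc0a80005; a++)
		printf("%#x -> bucket %u\n", (unsigned)a, hashkey(a));
	return 0;
}
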
355 | |||
356 | |||
357 | /* | ||
358 | * Hash an entry in the ip_vs_lblcr_table. | ||
359 | * returns bool success. | ||
360 | */ | ||
361 | static int | ||
362 | ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) | ||
363 | { | ||
364 | unsigned hash; | ||
365 | |||
366 | if (!list_empty(&en->list)) { | ||
367 | IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, " | ||
368 | "called from %p\n", __builtin_return_address(0)); | ||
369 | return 0; | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * Hash by destination IP address | ||
374 | */ | ||
375 | hash = ip_vs_lblcr_hashkey(en->addr); | ||
376 | |||
377 | write_lock(&tbl->lock); | ||
378 | list_add(&en->list, &tbl->bucket[hash]); | ||
379 | atomic_inc(&tbl->entries); | ||
380 | write_unlock(&tbl->lock); | ||
381 | |||
382 | return 1; | ||
383 | } | ||
384 | |||
385 | |||
386 | /* | ||
387 | * Get ip_vs_lblcr_entry associated with supplied parameters. | ||
388 | */ | ||
389 | static inline struct ip_vs_lblcr_entry * | ||
390 | ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) | ||
391 | { | ||
392 | unsigned hash; | ||
393 | struct ip_vs_lblcr_entry *en; | ||
394 | |||
395 | hash = ip_vs_lblcr_hashkey(addr); | ||
396 | |||
397 | read_lock(&tbl->lock); | ||
398 | |||
399 | list_for_each_entry(en, &tbl->bucket[hash], list) { | ||
400 | if (en->addr == addr) { | ||
401 | /* HIT */ | ||
402 | read_unlock(&tbl->lock); | ||
403 | return en; | ||
404 | } | ||
405 | } | ||
406 | |||
407 | read_unlock(&tbl->lock); | ||
408 | |||
409 | return NULL; | ||
410 | } | ||
411 | |||
412 | |||
413 | /* | ||
414 | * Flush all the entries of the specified table. | ||
415 | */ | ||
416 | static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) | ||
417 | { | ||
418 | int i; | ||
419 | struct ip_vs_lblcr_entry *en, *nxt; | ||
420 | |||
421 | for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { | ||
422 | write_lock(&tbl->lock); | ||
423 | list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { | ||
424 | ip_vs_lblcr_free(en); | ||
425 | atomic_dec(&tbl->entries); | ||
426 | } | ||
427 | write_unlock(&tbl->lock); | ||
428 | } | ||
429 | } | ||
430 | |||
431 | |||
432 | static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) | ||
433 | { | ||
434 | unsigned long now = jiffies; | ||
435 | int i, j; | ||
436 | struct ip_vs_lblcr_entry *en, *nxt; | ||
437 | |||
438 | for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { | ||
439 | j = (j + 1) & IP_VS_LBLCR_TAB_MASK; | ||
440 | |||
441 | write_lock(&tbl->lock); | ||
442 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | ||
443 | if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, | ||
444 | now)) | ||
445 | continue; | ||
446 | |||
447 | ip_vs_lblcr_free(en); | ||
448 | atomic_dec(&tbl->entries); | ||
449 | } | ||
450 | write_unlock(&tbl->lock); | ||
451 | } | ||
452 | tbl->rover = j; | ||
453 | } | ||
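
The full check resumes just past tbl->rover and wraps with the mask, so successive passes age every bucket evenly instead of always starting at bucket 0. A toy illustration of the wrap-around walk (sizes assumed):

#include <stdio.h>

#define TAB_SIZE 1024                  /* assumed table size */
#define TAB_MASK (TAB_SIZE - 1)

int main(void)
{
	int rover = 1020;                  /* where the previous pass stopped */
	int i, j;

	/* A pass starts after the rover, wraps past bucket 0, and the
	 * final position is persisted for the next pass. */
	for (i = 0, j = rover; i < 8; i++) {   /* 8 steps shown, not TAB_SIZE */
		j = (j + 1) & TAB_MASK;
		printf("visit bucket %d\n", j);    /* 1021,1022,1023,0,1,2,3,4 */
	}
	rover = j;
	printf("rover saved as %d\n", rover);
	return 0;
}
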
454 | |||
455 | |||
456 | /* | ||
457 | * Periodical timer handler for IPVS lblcr table | ||
458 | * It is used to collect stale entries when the number of entries | ||
459 | * exceeds the maximum size of the table. | ||
460 | * | ||
461 | * Fixme: we probably need a more complicated algorithm to collect | ||
462 | * entries that have not been used for a long time even | ||
463 | * if the number of entries doesn't exceed the maximum size | ||
464 | * of the table. | ||
465 | * The full expiration check is for this purpose now. | ||
466 | */ | ||
467 | static void ip_vs_lblcr_check_expire(unsigned long data) | ||
468 | { | ||
469 | struct ip_vs_lblcr_table *tbl; | ||
470 | unsigned long now = jiffies; | ||
471 | int goal; | ||
472 | int i, j; | ||
473 | struct ip_vs_lblcr_entry *en, *nxt; | ||
474 | |||
475 | tbl = (struct ip_vs_lblcr_table *)data; | ||
476 | |||
477 | if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { | ||
478 | /* do full expiration check */ | ||
479 | ip_vs_lblcr_full_check(tbl); | ||
480 | tbl->counter = 1; | ||
481 | goto out; | ||
482 | } | ||
483 | |||
484 | if (atomic_read(&tbl->entries) <= tbl->max_size) { | ||
485 | tbl->counter++; | ||
486 | goto out; | ||
487 | } | ||
488 | |||
489 | goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; | ||
490 | if (goal > tbl->max_size/2) | ||
491 | goal = tbl->max_size/2; | ||
492 | |||
493 | for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { | ||
494 | j = (j + 1) & IP_VS_LBLCR_TAB_MASK; | ||
495 | |||
496 | write_lock(&tbl->lock); | ||
497 | list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { | ||
498 | if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) | ||
499 | continue; | ||
500 | |||
501 | ip_vs_lblcr_free(en); | ||
502 | atomic_dec(&tbl->entries); | ||
503 | goal--; | ||
504 | } | ||
505 | write_unlock(&tbl->lock); | ||
506 | if (goal <= 0) | ||
507 | break; | ||
508 | } | ||
509 | tbl->rover = j; | ||
510 | |||
511 | out: | ||
512 | mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); | ||
513 | } | ||
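
Worked numbers for the purge goal above, assuming max_size = 16384 (IP_VS_LBLCR_TAB_SIZE*16 with a 1024-bucket table) and 17000 entries: (17000 - 16384) * 4/3 = 821, well under the max_size/2 cap of 8192.

#include <stdio.h>

int main(void)
{
	int entries = 17000, max_size = 16384;   /* assumed figures */
	int goal = (entries - max_size) * 4 / 3;

	/* Aim slightly past the overshoot (x4/3) so the table does not
	 * sit right at the limit, but never purge more than half the
	 * size budget in one timer tick. */
	if (goal > max_size / 2)
		goal = max_size / 2;
	printf("purge goal: %d entries\n", goal);   /* 821 */
	return 0;
}
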
514 | |||
515 | static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) | ||
516 | { | ||
517 | int i; | ||
518 | struct ip_vs_lblcr_table *tbl; | ||
519 | |||
520 | /* | ||
521 | * Allocate the ip_vs_lblcr_table for this service | ||
522 | */ | ||
523 | tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); | ||
524 | if (tbl == NULL) { | ||
525 | IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); | ||
526 | return -ENOMEM; | ||
527 | } | ||
528 | svc->sched_data = tbl; | ||
529 | IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " | ||
530 | "current service\n", | ||
531 | sizeof(struct ip_vs_lblcr_table)); | ||
532 | |||
533 | /* | ||
534 | * Initialize the hash buckets | ||
535 | */ | ||
536 | for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { | ||
537 | INIT_LIST_HEAD(&tbl->bucket[i]); | ||
538 | } | ||
539 | rwlock_init(&tbl->lock); | ||
540 | tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; | ||
541 | tbl->rover = 0; | ||
542 | tbl->counter = 1; | ||
543 | |||
544 | /* | ||
545 | * Hook periodic timer for garbage collection | ||
546 | */ | ||
547 | setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, | ||
548 | (unsigned long)tbl); | ||
549 | tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; | ||
550 | add_timer(&tbl->periodic_timer); | ||
551 | |||
552 | return 0; | ||
553 | } | ||
554 | |||
555 | |||
556 | static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) | ||
557 | { | ||
558 | struct ip_vs_lblcr_table *tbl = svc->sched_data; | ||
559 | |||
560 | /* remove periodic timer */ | ||
561 | del_timer_sync(&tbl->periodic_timer); | ||
562 | |||
563 | /* got to clean up table entries here */ | ||
564 | ip_vs_lblcr_flush(tbl); | ||
565 | |||
566 | /* release the table itself */ | ||
567 | kfree(svc->sched_data); | ||
568 | IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", | ||
569 | sizeof(struct ip_vs_lblcr_table)); | ||
570 | |||
571 | return 0; | ||
572 | } | ||
573 | |||
574 | |||
575 | static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc) | ||
576 | { | ||
577 | return 0; | ||
578 | } | ||
579 | |||
580 | |||
581 | static inline struct ip_vs_dest * | ||
582 | __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) | ||
583 | { | ||
584 | struct ip_vs_dest *dest, *least; | ||
585 | int loh, doh; | ||
586 | |||
587 | /* | ||
588 | * We think the overhead of processing active connections is fifty | ||
589 | * times higher than that of inactive connections on average. (This | ||
590 | * fifty times might not be accurate, we will change it later.) We | ||
591 | * use the following formula to estimate the overhead: | ||
592 | * dest->activeconns*50 + dest->inactconns | ||
593 | * and the load: | ||
594 | * (dest overhead) / dest->weight | ||
595 | * | ||
596 | * Remember -- no floats in kernel mode!!! | ||
597 | * The comparison of h1*w2 > h2*w1 is equivalent to that of | ||
598 | * h1/w1 > h2/w2 | ||
599 | * if every weight is larger than zero. | ||
600 | * | ||
601 | * The server with weight=0 is quiesced and will not receive any | ||
602 | * new connection. | ||
603 | */ | ||
604 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
605 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
606 | continue; | ||
607 | |||
608 | if (atomic_read(&dest->weight) > 0) { | ||
609 | least = dest; | ||
610 | loh = atomic_read(&least->activeconns) * 50 | ||
611 | + atomic_read(&least->inactconns); | ||
612 | goto nextstage; | ||
613 | } | ||
614 | } | ||
615 | return NULL; | ||
616 | |||
617 | /* | ||
618 | * Find the destination with the least load. | ||
619 | */ | ||
620 | nextstage: | ||
621 | list_for_each_entry_continue(dest, &svc->destinations, n_list) { | ||
622 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
623 | continue; | ||
624 | |||
625 | doh = atomic_read(&dest->activeconns) * 50 | ||
626 | + atomic_read(&dest->inactconns); | ||
627 | if (loh * atomic_read(&dest->weight) > | ||
628 | doh * atomic_read(&least->weight)) { | ||
629 | least = dest; | ||
630 | loh = doh; | ||
631 | } | ||
632 | } | ||
633 | |||
634 | IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " | ||
635 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
636 | NIPQUAD(least->addr), ntohs(least->port), | ||
637 | atomic_read(&least->activeconns), | ||
638 | atomic_read(&least->refcnt), | ||
639 | atomic_read(&least->weight), loh); | ||
640 | |||
641 | return least; | ||
642 | } | ||
643 | |||
644 | |||
645 | /* | ||
646 | * If this destination server is overloaded and there is a less loaded | ||
647 | * server, then return true. | ||
648 | */ | ||
649 | static inline int | ||
650 | is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) | ||
651 | { | ||
652 | if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { | ||
653 | struct ip_vs_dest *d; | ||
654 | |||
655 | list_for_each_entry(d, &svc->destinations, n_list) { | ||
656 | if (atomic_read(&d->activeconns)*2 | ||
657 | < atomic_read(&d->weight)) { | ||
658 | return 1; | ||
659 | } | ||
660 | } | ||
661 | } | ||
662 | return 0; | ||
663 | } | ||
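
A numeric restatement of is_overloaded(): the test fires only when the candidate exceeds its weight in active connections while some peer still has clear spare capacity (activeconns*2 < weight). Made-up figures:

#include <stdio.h>

int main(void)
{
	int active = 12, weight = 8;             /* candidate server */
	int peer_active = 1, peer_weight = 8;    /* least busy peer */

	/* Mirrors the condition pair in is_overloaded() above. */
	int overloaded = active > weight && peer_active * 2 < peer_weight;
	printf("overloaded: %d\n", overloaded);  /* 1 */
	return 0;
}
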
664 | |||
665 | |||
666 | /* | ||
667 | * Locality-Based (weighted) Least-Connection with Replication scheduling | ||
668 | */ | ||
669 | static struct ip_vs_dest * | ||
670 | ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
671 | { | ||
672 | struct ip_vs_dest *dest; | ||
673 | struct ip_vs_lblcr_table *tbl; | ||
674 | struct ip_vs_lblcr_entry *en; | ||
675 | struct iphdr *iph = ip_hdr(skb); | ||
676 | |||
677 | IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); | ||
678 | |||
679 | tbl = (struct ip_vs_lblcr_table *)svc->sched_data; | ||
680 | en = ip_vs_lblcr_get(tbl, iph->daddr); | ||
681 | if (en == NULL) { | ||
682 | dest = __ip_vs_wlc_schedule(svc, iph); | ||
683 | if (dest == NULL) { | ||
684 | IP_VS_DBG(1, "no destination available\n"); | ||
685 | return NULL; | ||
686 | } | ||
687 | en = ip_vs_lblcr_new(iph->daddr); | ||
688 | if (en == NULL) { | ||
689 | return NULL; | ||
690 | } | ||
691 | ip_vs_dest_set_insert(&en->set, dest); | ||
692 | ip_vs_lblcr_hash(tbl, en); | ||
693 | } else { | ||
694 | dest = ip_vs_dest_set_min(&en->set); | ||
695 | if (!dest || is_overloaded(dest, svc)) { | ||
696 | dest = __ip_vs_wlc_schedule(svc, iph); | ||
697 | if (dest == NULL) { | ||
698 | IP_VS_DBG(1, "no destination available\n"); | ||
699 | return NULL; | ||
700 | } | ||
701 | ip_vs_dest_set_insert(&en->set, dest); | ||
702 | } | ||
703 | if (atomic_read(&en->set.size) > 1 && | ||
704 | jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { | ||
705 | struct ip_vs_dest *m; | ||
706 | m = ip_vs_dest_set_max(&en->set); | ||
707 | if (m) | ||
708 | ip_vs_dest_set_erase(&en->set, m); | ||
709 | } | ||
710 | } | ||
711 | en->lastuse = jiffies; | ||
712 | |||
713 | IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " | ||
714 | "--> server %u.%u.%u.%u:%d\n", | ||
715 | NIPQUAD(en->addr), | ||
716 | NIPQUAD(dest->addr), | ||
717 | ntohs(dest->port)); | ||
718 | |||
719 | return dest; | ||
720 | } | ||
721 | |||
722 | |||
723 | /* | ||
724 | * IPVS LBLCR Scheduler structure | ||
725 | */ | ||
726 | static struct ip_vs_scheduler ip_vs_lblcr_scheduler = | ||
727 | { | ||
728 | .name = "lblcr", | ||
729 | .refcnt = ATOMIC_INIT(0), | ||
730 | .module = THIS_MODULE, | ||
731 | .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), | ||
732 | .init_service = ip_vs_lblcr_init_svc, | ||
733 | .done_service = ip_vs_lblcr_done_svc, | ||
734 | .update_service = ip_vs_lblcr_update_svc, | ||
735 | .schedule = ip_vs_lblcr_schedule, | ||
736 | }; | ||
737 | |||
738 | |||
739 | static int __init ip_vs_lblcr_init(void) | ||
740 | { | ||
741 | int ret; | ||
742 | |||
743 | sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); | ||
744 | ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); | ||
745 | if (ret) | ||
746 | unregister_sysctl_table(sysctl_header); | ||
747 | return ret; | ||
748 | } | ||
749 | |||
750 | |||
751 | static void __exit ip_vs_lblcr_cleanup(void) | ||
752 | { | ||
753 | unregister_sysctl_table(sysctl_header); | ||
754 | unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); | ||
755 | } | ||
756 | |||
757 | |||
758 | module_init(ip_vs_lblcr_init); | ||
759 | module_exit(ip_vs_lblcr_cleanup); | ||
760 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c deleted file mode 100644 index ebcdbf75ac65..000000000000 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ /dev/null | |||
@@ -1,121 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Least-Connection Scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * Wensong Zhang : added the ip_vs_lc_update_svc | ||
13 | * Wensong Zhang : added any dest with weight=0 is quiesced | ||
14 | * | ||
15 | */ | ||
16 | |||
17 | #include <linux/module.h> | ||
18 | #include <linux/kernel.h> | ||
19 | |||
20 | #include <net/ip_vs.h> | ||
21 | |||
22 | |||
23 | static int ip_vs_lc_init_svc(struct ip_vs_service *svc) | ||
24 | { | ||
25 | return 0; | ||
26 | } | ||
27 | |||
28 | |||
29 | static int ip_vs_lc_done_svc(struct ip_vs_service *svc) | ||
30 | { | ||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | |||
35 | static int ip_vs_lc_update_svc(struct ip_vs_service *svc) | ||
36 | { | ||
37 | return 0; | ||
38 | } | ||
39 | |||
40 | |||
41 | static inline unsigned int | ||
42 | ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) | ||
43 | { | ||
44 | /* | ||
45 | * We think the overhead of processing active connections is 256 | ||
46 | * times higher than that of inactive connections on average. (This | ||
47 | * 256 times might not be accurate; we will change it later.) We | ||
48 | * use the following formula to estimate the overhead now: | ||
49 | * dest->activeconns*256 + dest->inactconns | ||
50 | */ | ||
51 | return (atomic_read(&dest->activeconns) << 8) + | ||
52 | atomic_read(&dest->inactconns); | ||
53 | } | ||
54 | |||
55 | |||
56 | /* | ||
57 | * Least Connection scheduling | ||
58 | */ | ||
59 | static struct ip_vs_dest * | ||
60 | ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
61 | { | ||
62 | struct ip_vs_dest *dest, *least = NULL; | ||
63 | unsigned int loh = 0, doh; | ||
64 | |||
65 | IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); | ||
66 | |||
67 | /* | ||
68 | * Simply select the server with the least number of | ||
69 | * (activeconns<<8) + inactconns, | ||
70 | * excluding servers whose weight is zero. | ||
71 | * If the weight is equal to zero, it means that the server is | ||
72 | * quiesced: the existing connections to the server still get | ||
73 | * served, but no new connection is assigned to the server. | ||
74 | */ | ||
75 | |||
76 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
77 | if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || | ||
78 | atomic_read(&dest->weight) == 0) | ||
79 | continue; | ||
80 | doh = ip_vs_lc_dest_overhead(dest); | ||
81 | if (!least || doh < loh) { | ||
82 | least = dest; | ||
83 | loh = doh; | ||
84 | } | ||
85 | } | ||
86 | |||
87 | if (least) | ||
88 | IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n", | ||
89 | NIPQUAD(least->addr), ntohs(least->port), | ||
90 | atomic_read(&least->activeconns), | ||
91 | atomic_read(&least->inactconns)); | ||
92 | |||
93 | return least; | ||
94 | } | ||
95 | |||
96 | |||
97 | static struct ip_vs_scheduler ip_vs_lc_scheduler = { | ||
98 | .name = "lc", | ||
99 | .refcnt = ATOMIC_INIT(0), | ||
100 | .module = THIS_MODULE, | ||
101 | .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), | ||
102 | .init_service = ip_vs_lc_init_svc, | ||
103 | .done_service = ip_vs_lc_done_svc, | ||
104 | .update_service = ip_vs_lc_update_svc, | ||
105 | .schedule = ip_vs_lc_schedule, | ||
106 | }; | ||
107 | |||
108 | |||
109 | static int __init ip_vs_lc_init(void) | ||
110 | { | ||
111 | return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; | ||
112 | } | ||
113 | |||
114 | static void __exit ip_vs_lc_cleanup(void) | ||
115 | { | ||
116 | unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); | ||
117 | } | ||
118 | |||
119 | module_init(ip_vs_lc_init); | ||
120 | module_exit(ip_vs_lc_cleanup); | ||
121 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c deleted file mode 100644 index 92f3a6770031..000000000000 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ /dev/null | |||
@@ -1,159 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Never Queue scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | * The NQ algorithm adopts a two-speed model. When there is an idle server | ||
17 | * available, the job will be sent to the idle server, instead of waiting | ||
18 | * for a fast one. When there is no idle server available, the job will be | ||
19 | * sent to the server that minimizes its expected delay (The Shortest | ||
20 | * Expected Delay scheduling algorithm). | ||
21 | * | ||
22 | * See the following paper for more information: | ||
23 | * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing | ||
24 | * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, | ||
25 | * pages 986-994, 1988. | ||
26 | * | ||
27 | * Thanks must go to Marko Buuri <marko@buuri.name> for telling me about NQ. | ||
28 | * | ||
29 | * The difference between NQ and SED is that NQ can improve overall | ||
30 | * system utilization. | ||
31 | * | ||
32 | */ | ||
33 | |||
34 | #include <linux/module.h> | ||
35 | #include <linux/kernel.h> | ||
36 | |||
37 | #include <net/ip_vs.h> | ||
38 | |||
39 | |||
40 | static int | ||
41 | ip_vs_nq_init_svc(struct ip_vs_service *svc) | ||
42 | { | ||
43 | return 0; | ||
44 | } | ||
45 | |||
46 | |||
47 | static int | ||
48 | ip_vs_nq_done_svc(struct ip_vs_service *svc) | ||
49 | { | ||
50 | return 0; | ||
51 | } | ||
52 | |||
53 | |||
54 | static int | ||
55 | ip_vs_nq_update_svc(struct ip_vs_service *svc) | ||
56 | { | ||
57 | return 0; | ||
58 | } | ||
59 | |||
60 | |||
61 | static inline unsigned int | ||
62 | ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) | ||
63 | { | ||
64 | /* | ||
65 | * We only use the active connection number in the cost | ||
66 | * calculation here. | ||
67 | */ | ||
68 | return atomic_read(&dest->activeconns) + 1; | ||
69 | } | ||
70 | |||
71 | |||
72 | /* | ||
73 | * Never Queue scheduling | ||
74 | */ | ||
75 | static struct ip_vs_dest * | ||
76 | ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
77 | { | ||
78 | struct ip_vs_dest *dest, *least = NULL; | ||
79 | unsigned int loh = 0, doh; | ||
80 | |||
81 | IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); | ||
82 | |||
83 | /* | ||
84 | * We calculate the load of each dest server as follows: | ||
85 | * (server expected overhead) / dest->weight | ||
86 | * | ||
87 | * Remember -- no floats in kernel mode!!! | ||
88 | * The comparison of h1*w2 > h2*w1 is equivalent to that of | ||
89 | * h1/w1 > h2/w2 | ||
90 | * if every weight is larger than zero. | ||
91 | * | ||
92 | * The server with weight=0 is quiesced and will not receive any | ||
93 | * new connections. | ||
94 | */ | ||
95 | |||
96 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
97 | |||
98 | if (dest->flags & IP_VS_DEST_F_OVERLOAD || | ||
99 | !atomic_read(&dest->weight)) | ||
100 | continue; | ||
101 | |||
102 | doh = ip_vs_nq_dest_overhead(dest); | ||
103 | |||
104 | /* return the server directly if it is idle */ | ||
105 | if (atomic_read(&dest->activeconns) == 0) { | ||
106 | least = dest; | ||
107 | loh = doh; | ||
108 | goto out; | ||
109 | } | ||
110 | |||
111 | if (!least || | ||
112 | (loh * atomic_read(&dest->weight) > | ||
113 | doh * atomic_read(&least->weight))) { | ||
114 | least = dest; | ||
115 | loh = doh; | ||
116 | } | ||
117 | } | ||
118 | |||
119 | if (!least) | ||
120 | return NULL; | ||
121 | |||
122 | out: | ||
123 | IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u " | ||
124 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
125 | NIPQUAD(least->addr), ntohs(least->port), | ||
126 | atomic_read(&least->activeconns), | ||
127 | atomic_read(&least->refcnt), | ||
128 | atomic_read(&least->weight), loh); | ||
129 | |||
130 | return least; | ||
131 | } | ||
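
The loop above implements the two-speed model from the header comment: an idle server short-circuits the scan, otherwise the SED-style weighted comparison picks the minimum-expected-delay server. A self-contained sketch with invented server data:

#include <stdio.h>

struct srv { int active, weight; };

static int nq_pick(struct srv *s, int n)
{
	int best = -1;
	for (int i = 0; i < n; i++) {
		if (s[i].weight == 0)
			continue;                   /* quiesced */
		int doh = s[i].active + 1;      /* expected-delay numerator */
		if (s[i].active == 0)
			return i;                   /* idle: take it immediately */
		/* same cross-multiplied compare as the kernel loop */
		if (best < 0 ||
		    (s[best].active + 1) * s[i].weight > doh * s[best].weight)
			best = i;
	}
	return best;
}

int main(void)
{
	struct srv s[] = { {3, 2}, {0, 1}, {5, 4} };
	printf("picked server %d\n", nq_pick(s, 3));   /* 1: it is idle */
	return 0;
}
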
132 | |||
133 | |||
134 | static struct ip_vs_scheduler ip_vs_nq_scheduler = | ||
135 | { | ||
136 | .name = "nq", | ||
137 | .refcnt = ATOMIC_INIT(0), | ||
138 | .module = THIS_MODULE, | ||
139 | .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), | ||
140 | .init_service = ip_vs_nq_init_svc, | ||
141 | .done_service = ip_vs_nq_done_svc, | ||
142 | .update_service = ip_vs_nq_update_svc, | ||
143 | .schedule = ip_vs_nq_schedule, | ||
144 | }; | ||
145 | |||
146 | |||
147 | static int __init ip_vs_nq_init(void) | ||
148 | { | ||
149 | return register_ip_vs_scheduler(&ip_vs_nq_scheduler); | ||
150 | } | ||
151 | |||
152 | static void __exit ip_vs_nq_cleanup(void) | ||
153 | { | ||
154 | unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); | ||
155 | } | ||
156 | |||
157 | module_init(ip_vs_nq_init); | ||
158 | module_exit(ip_vs_nq_cleanup); | ||
159 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c deleted file mode 100644 index 6099a88fc200..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ /dev/null | |||
@@ -1,233 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_proto.c: transport protocol load balancing support for IPVS | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * Julian Anastasov <ja@ssi.bg> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * Changes: | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/module.h> | ||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/skbuff.h> | ||
19 | #include <linux/in.h> | ||
20 | #include <linux/ip.h> | ||
21 | #include <net/protocol.h> | ||
22 | #include <net/tcp.h> | ||
23 | #include <net/udp.h> | ||
24 | #include <asm/system.h> | ||
25 | #include <linux/stat.h> | ||
26 | #include <linux/proc_fs.h> | ||
27 | |||
28 | #include <net/ip_vs.h> | ||
29 | |||
30 | |||
31 | /* | ||
32 | * IPVS protocols can only be registered/unregistered when the ipvs | ||
33 | * module is loaded/unloaded, so no lock is needed in accessing the | ||
34 | * ipvs protocol table. | ||
35 | */ | ||
36 | |||
37 | #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ | ||
38 | #define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) | ||
39 | |||
40 | static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; | ||
41 | |||
42 | |||
43 | /* | ||
44 | * register an ipvs protocol | ||
45 | */ | ||
46 | static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) | ||
47 | { | ||
48 | unsigned hash = IP_VS_PROTO_HASH(pp->protocol); | ||
49 | |||
50 | pp->next = ip_vs_proto_table[hash]; | ||
51 | ip_vs_proto_table[hash] = pp; | ||
52 | |||
53 | if (pp->init != NULL) | ||
54 | pp->init(pp); | ||
55 | |||
56 | return 0; | ||
57 | } | ||
58 | |||
59 | |||
60 | /* | ||
61 | * unregister an ipvs protocol | ||
62 | */ | ||
63 | static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) | ||
64 | { | ||
65 | struct ip_vs_protocol **pp_p; | ||
66 | unsigned hash = IP_VS_PROTO_HASH(pp->protocol); | ||
67 | |||
68 | pp_p = &ip_vs_proto_table[hash]; | ||
69 | for (; *pp_p; pp_p = &(*pp_p)->next) { | ||
70 | if (*pp_p == pp) { | ||
71 | *pp_p = pp->next; | ||
72 | if (pp->exit != NULL) | ||
73 | pp->exit(pp); | ||
74 | return 0; | ||
75 | } | ||
76 | } | ||
77 | |||
78 | return -ESRCH; | ||
79 | } | ||
80 | |||
81 | |||
82 | /* | ||
83 | * get ip_vs_protocol object by its proto. | ||
84 | */ | ||
85 | struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) | ||
86 | { | ||
87 | struct ip_vs_protocol *pp; | ||
88 | unsigned hash = IP_VS_PROTO_HASH(proto); | ||
89 | |||
90 | for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { | ||
91 | if (pp->protocol == proto) | ||
92 | return pp; | ||
93 | } | ||
94 | |||
95 | return NULL; | ||
96 | } | ||
97 | |||
98 | |||
99 | /* | ||
100 | * Propagate event for state change to all protocols | ||
101 | */ | ||
102 | void ip_vs_protocol_timeout_change(int flags) | ||
103 | { | ||
104 | struct ip_vs_protocol *pp; | ||
105 | int i; | ||
106 | |||
107 | for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { | ||
108 | for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { | ||
109 | if (pp->timeout_change) | ||
110 | pp->timeout_change(pp, flags); | ||
111 | } | ||
112 | } | ||
113 | } | ||
114 | |||
115 | |||
116 | int * | ||
117 | ip_vs_create_timeout_table(int *table, int size) | ||
118 | { | ||
119 | return kmemdup(table, size, GFP_ATOMIC); | ||
120 | } | ||
121 | |||
122 | |||
123 | /* | ||
124 | * Set timeout value for state specified by name | ||
125 | */ | ||
126 | int | ||
127 | ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) | ||
128 | { | ||
129 | int i; | ||
130 | |||
131 | if (!table || !name || !to) | ||
132 | return -EINVAL; | ||
133 | |||
134 | for (i = 0; i < num; i++) { | ||
135 | if (strcmp(names[i], name)) | ||
136 | continue; | ||
137 | table[i] = to * HZ; | ||
138 | return 0; | ||
139 | } | ||
140 | return -ENOENT; | ||
141 | } | ||
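
A userspace restatement of the lookup-and-convert logic above, with an assumed HZ of 100; the kernel version returns -ENOENT where this sketch returns -1:

#include <stdio.h>
#include <string.h>

#define HZ 100   /* assumed tick rate for the sketch */

/* Find the state by name and store the timeout in jiffies. */
static int set_timeout(int *table, int num, const char **names,
                       const char *name, int to)
{
	for (int i = 0; i < num; i++) {
		if (strcmp(names[i], name))
			continue;
		table[i] = to * HZ;
		return 0;
	}
	return -1;
}

int main(void)
{
	const char *names[] = { "NONE", "ESTABLISHED", "SYN_SENT" };
	int table[3] = { 2 * HZ, 15 * 60 * HZ, 2 * 60 * HZ };

	set_timeout(table, 3, names, "ESTABLISHED", 600);
	printf("ESTABLISHED timeout: %d jiffies\n", table[1]);   /* 60000 */
	return 0;
}
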
142 | |||
143 | |||
144 | const char * ip_vs_state_name(__u16 proto, int state) | ||
145 | { | ||
146 | struct ip_vs_protocol *pp = ip_vs_proto_get(proto); | ||
147 | |||
148 | if (pp == NULL || pp->state_name == NULL) | ||
149 | return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; | ||
150 | return pp->state_name(state); | ||
151 | } | ||
152 | |||
153 | |||
154 | void | ||
155 | ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, | ||
156 | const struct sk_buff *skb, | ||
157 | int offset, | ||
158 | const char *msg) | ||
159 | { | ||
160 | char buf[128]; | ||
161 | struct iphdr _iph, *ih; | ||
162 | |||
163 | ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); | ||
164 | if (ih == NULL) | ||
165 | sprintf(buf, "%s TRUNCATED", pp->name); | ||
166 | else if (ih->frag_off & htons(IP_OFFSET)) | ||
167 | sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", | ||
168 | pp->name, NIPQUAD(ih->saddr), | ||
169 | NIPQUAD(ih->daddr)); | ||
170 | else { | ||
171 | __be16 _ports[2], *pptr; | ||
172 | | ||
173 | pptr = skb_header_pointer(skb, offset + ih->ihl*4, | ||
174 | sizeof(_ports), _ports); | ||
175 | if (pptr == NULL) | ||
176 | sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", | ||
177 | pp->name, | ||
178 | NIPQUAD(ih->saddr), | ||
179 | NIPQUAD(ih->daddr)); | ||
180 | else | ||
181 | sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", | ||
182 | pp->name, | ||
183 | NIPQUAD(ih->saddr), | ||
184 | ntohs(pptr[0]), | ||
185 | NIPQUAD(ih->daddr), | ||
186 | ntohs(pptr[1])); | ||
187 | } | ||
188 | |||
189 | printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); | ||
190 | } | ||
191 | |||
192 | |||
193 | int __init ip_vs_protocol_init(void) | ||
194 | { | ||
195 | char protocols[64]; | ||
196 | #define REGISTER_PROTOCOL(p) \ | ||
197 | do { \ | ||
198 | register_ip_vs_protocol(p); \ | ||
199 | strcat(protocols, ", "); \ | ||
200 | strcat(protocols, (p)->name); \ | ||
201 | } while (0) | ||
202 | |||
203 | protocols[0] = '\0'; | ||
204 | protocols[2] = '\0'; | ||
205 | #ifdef CONFIG_IP_VS_PROTO_TCP | ||
206 | REGISTER_PROTOCOL(&ip_vs_protocol_tcp); | ||
207 | #endif | ||
208 | #ifdef CONFIG_IP_VS_PROTO_UDP | ||
209 | REGISTER_PROTOCOL(&ip_vs_protocol_udp); | ||
210 | #endif | ||
211 | #ifdef CONFIG_IP_VS_PROTO_AH | ||
212 | REGISTER_PROTOCOL(&ip_vs_protocol_ah); | ||
213 | #endif | ||
214 | #ifdef CONFIG_IP_VS_PROTO_ESP | ||
215 | REGISTER_PROTOCOL(&ip_vs_protocol_esp); | ||
216 | #endif | ||
217 | IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); | ||
218 | |||
219 | return 0; | ||
220 | } | ||
221 | |||
222 | |||
223 | void ip_vs_protocol_cleanup(void) | ||
224 | { | ||
225 | struct ip_vs_protocol *pp; | ||
226 | int i; | ||
227 | |||
228 | /* unregister all the ipvs protocols */ | ||
229 | for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { | ||
230 | while ((pp = ip_vs_proto_table[i]) != NULL) | ||
231 | unregister_ip_vs_protocol(pp); | ||
232 | } | ||
233 | } | ||
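
The &protocols[2] printed by ip_vs_protocol_init() above skips the leading ", " that REGISTER_PROTOCOL prepends to every name; protocols[2] = '\0' keeps that offset string empty when no protocol is compiled in. A quick demonstration:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char protocols[64];

	protocols[0] = '\0';
	protocols[2] = '\0';   /* empty string at offset 2 if nothing registers */
	strcat(protocols, ", "); strcat(protocols, "TCP");
	strcat(protocols, ", "); strcat(protocols, "UDP");
	printf("Registered protocols (%s)\n", &protocols[2]);   /* TCP, UDP */
	return 0;
}
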
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c deleted file mode 100644 index 73e0ea87c1f5..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ /dev/null | |||
@@ -1,178 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS | ||
3 | * | ||
4 | * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 | ||
5 | * Wensong Zhang <wensong@linuxvirtualserver.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * version 2 as published by the Free Software Foundation; | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/in.h> | ||
14 | #include <linux/ip.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/netfilter.h> | ||
18 | #include <linux/netfilter_ipv4.h> | ||
19 | |||
20 | #include <net/ip_vs.h> | ||
21 | |||
22 | |||
23 | /* TODO: | ||
24 | |||
25 | struct isakmp_hdr { | ||
26 | __u8 icookie[8]; | ||
27 | __u8 rcookie[8]; | ||
28 | __u8 np; | ||
29 | __u8 version; | ||
30 | __u8 xchgtype; | ||
31 | __u8 flags; | ||
32 | __u32 msgid; | ||
33 | __u32 length; | ||
34 | }; | ||
35 | |||
36 | */ | ||
37 | |||
38 | #define PORT_ISAKMP 500 | ||
39 | |||
40 | |||
41 | static struct ip_vs_conn * | ||
42 | ah_conn_in_get(const struct sk_buff *skb, | ||
43 | struct ip_vs_protocol *pp, | ||
44 | const struct iphdr *iph, | ||
45 | unsigned int proto_off, | ||
46 | int inverse) | ||
47 | { | ||
48 | struct ip_vs_conn *cp; | ||
49 | |||
50 | if (likely(!inverse)) { | ||
51 | cp = ip_vs_conn_in_get(IPPROTO_UDP, | ||
52 | iph->saddr, | ||
53 | htons(PORT_ISAKMP), | ||
54 | iph->daddr, | ||
55 | htons(PORT_ISAKMP)); | ||
56 | } else { | ||
57 | cp = ip_vs_conn_in_get(IPPROTO_UDP, | ||
58 | iph->daddr, | ||
59 | htons(PORT_ISAKMP), | ||
60 | iph->saddr, | ||
61 | htons(PORT_ISAKMP)); | ||
62 | } | ||
63 | |||
64 | if (!cp) { | ||
65 | /* | ||
66 | * We are not sure if the packet is from our | ||
67 | * service, so our conn_schedule hook should return NF_ACCEPT | ||
68 | */ | ||
69 | IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " | ||
70 | "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", | ||
71 | inverse ? "ICMP+" : "", | ||
72 | pp->name, | ||
73 | NIPQUAD(iph->saddr), | ||
74 | NIPQUAD(iph->daddr)); | ||
75 | } | ||
76 | |||
77 | return cp; | ||
78 | } | ||
79 | |||
80 | |||
81 | static struct ip_vs_conn * | ||
82 | ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
83 | const struct iphdr *iph, unsigned int proto_off, int inverse) | ||
84 | { | ||
85 | struct ip_vs_conn *cp; | ||
86 | |||
87 | if (likely(!inverse)) { | ||
88 | cp = ip_vs_conn_out_get(IPPROTO_UDP, | ||
89 | iph->saddr, | ||
90 | htons(PORT_ISAKMP), | ||
91 | iph->daddr, | ||
92 | htons(PORT_ISAKMP)); | ||
93 | } else { | ||
94 | cp = ip_vs_conn_out_get(IPPROTO_UDP, | ||
95 | iph->daddr, | ||
96 | htons(PORT_ISAKMP), | ||
97 | iph->saddr, | ||
98 | htons(PORT_ISAKMP)); | ||
99 | } | ||
100 | |||
101 | if (!cp) { | ||
102 | IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " | ||
103 | "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", | ||
104 | inverse ? "ICMP+" : "", | ||
105 | pp->name, | ||
106 | NIPQUAD(iph->saddr), | ||
107 | NIPQUAD(iph->daddr)); | ||
108 | } | ||
109 | |||
110 | return cp; | ||
111 | } | ||
112 | |||
113 | |||
114 | static int | ||
115 | ah_conn_schedule(struct sk_buff *skb, | ||
116 | struct ip_vs_protocol *pp, | ||
117 | int *verdict, struct ip_vs_conn **cpp) | ||
118 | { | ||
119 | /* | ||
120 | * AH is related traffic only; pass the packet to the IP stack. | ||
121 | */ | ||
122 | *verdict = NF_ACCEPT; | ||
123 | return 0; | ||
124 | } | ||
125 | |||
126 | |||
127 | static void | ||
128 | ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, | ||
129 | int offset, const char *msg) | ||
130 | { | ||
131 | char buf[256]; | ||
132 | struct iphdr _iph, *ih; | ||
133 | |||
134 | ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); | ||
135 | if (ih == NULL) | ||
136 | sprintf(buf, "%s TRUNCATED", pp->name); | ||
137 | else | ||
138 | sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", | ||
139 | pp->name, NIPQUAD(ih->saddr), | ||
140 | NIPQUAD(ih->daddr)); | ||
141 | |||
142 | printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); | ||
143 | } | ||
144 | |||
145 | |||
146 | static void ah_init(struct ip_vs_protocol *pp) | ||
147 | { | ||
148 | /* nothing to do now */ | ||
149 | } | ||
150 | |||
151 | |||
152 | static void ah_exit(struct ip_vs_protocol *pp) | ||
153 | { | ||
154 | /* nothing to do now */ | ||
155 | } | ||
156 | |||
157 | |||
158 | struct ip_vs_protocol ip_vs_protocol_ah = { | ||
159 | .name = "AH", | ||
160 | .protocol = IPPROTO_AH, | ||
161 | .num_states = 1, | ||
162 | .dont_defrag = 1, | ||
163 | .init = ah_init, | ||
164 | .exit = ah_exit, | ||
165 | .conn_schedule = ah_conn_schedule, | ||
166 | .conn_in_get = ah_conn_in_get, | ||
167 | .conn_out_get = ah_conn_out_get, | ||
168 | .snat_handler = NULL, | ||
169 | .dnat_handler = NULL, | ||
170 | .csum_check = NULL, | ||
171 | .state_transition = NULL, | ||
172 | .register_app = NULL, | ||
173 | .unregister_app = NULL, | ||
174 | .app_conn_bind = NULL, | ||
175 | .debug_packet = ah_debug_packet, | ||
176 | .timeout_change = NULL, /* ISAKMP */ | ||
177 | .set_state_timeout = NULL, | ||
178 | }; | ||
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c deleted file mode 100644 index 21d70c8ffa54..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ /dev/null | |||
@@ -1,176 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS | ||
3 | * | ||
4 | * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 | ||
5 | * Wensong Zhang <wensong@linuxvirtualserver.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * version 2 as published by the Free Software Foundation; | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/in.h> | ||
14 | #include <linux/ip.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/netfilter.h> | ||
18 | #include <linux/netfilter_ipv4.h> | ||
19 | |||
20 | #include <net/ip_vs.h> | ||
21 | |||
22 | |||
23 | /* TODO: | ||
24 | |||
25 | struct isakmp_hdr { | ||
26 | __u8 icookie[8]; | ||
27 | __u8 rcookie[8]; | ||
28 | __u8 np; | ||
29 | __u8 version; | ||
30 | __u8 xchgtype; | ||
31 | __u8 flags; | ||
32 | __u32 msgid; | ||
33 | __u32 length; | ||
34 | }; | ||
35 | |||
36 | */ | ||
37 | |||
38 | #define PORT_ISAKMP 500 | ||
39 | |||
40 | |||
41 | static struct ip_vs_conn * | ||
42 | esp_conn_in_get(const struct sk_buff *skb, | ||
43 | struct ip_vs_protocol *pp, | ||
44 | const struct iphdr *iph, | ||
45 | unsigned int proto_off, | ||
46 | int inverse) | ||
47 | { | ||
48 | struct ip_vs_conn *cp; | ||
49 | |||
50 | if (likely(!inverse)) { | ||
51 | cp = ip_vs_conn_in_get(IPPROTO_UDP, | ||
52 | iph->saddr, | ||
53 | htons(PORT_ISAKMP), | ||
54 | iph->daddr, | ||
55 | htons(PORT_ISAKMP)); | ||
56 | } else { | ||
57 | cp = ip_vs_conn_in_get(IPPROTO_UDP, | ||
58 | iph->daddr, | ||
59 | htons(PORT_ISAKMP), | ||
60 | iph->saddr, | ||
61 | htons(PORT_ISAKMP)); | ||
62 | } | ||
63 | |||
64 | if (!cp) { | ||
65 | /* | ||
66 | * We are not sure if the packet is from our | ||
67 | * service, so our conn_schedule hook should return NF_ACCEPT | ||
68 | */ | ||
69 | IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " | ||
70 | "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", | ||
71 | inverse ? "ICMP+" : "", | ||
72 | pp->name, | ||
73 | NIPQUAD(iph->saddr), | ||
74 | NIPQUAD(iph->daddr)); | ||
75 | } | ||
76 | |||
77 | return cp; | ||
78 | } | ||
79 | |||
80 | |||
81 | static struct ip_vs_conn * | ||
82 | esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
83 | const struct iphdr *iph, unsigned int proto_off, int inverse) | ||
84 | { | ||
85 | struct ip_vs_conn *cp; | ||
86 | |||
87 | if (likely(!inverse)) { | ||
88 | cp = ip_vs_conn_out_get(IPPROTO_UDP, | ||
89 | iph->saddr, | ||
90 | htons(PORT_ISAKMP), | ||
91 | iph->daddr, | ||
92 | htons(PORT_ISAKMP)); | ||
93 | } else { | ||
94 | cp = ip_vs_conn_out_get(IPPROTO_UDP, | ||
95 | iph->daddr, | ||
96 | htons(PORT_ISAKMP), | ||
97 | iph->saddr, | ||
98 | htons(PORT_ISAKMP)); | ||
99 | } | ||
100 | |||
101 | if (!cp) { | ||
102 | IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " | ||
103 | "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", | ||
104 | inverse ? "ICMP+" : "", | ||
105 | pp->name, | ||
106 | NIPQUAD(iph->saddr), | ||
107 | NIPQUAD(iph->daddr)); | ||
108 | } | ||
109 | |||
110 | return cp; | ||
111 | } | ||
112 | |||
113 | |||
114 | static int | ||
115 | esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
116 | int *verdict, struct ip_vs_conn **cpp) | ||
117 | { | ||
118 | /* | ||
119 | * ESP is related traffic only; pass the packet to the IP stack. | ||
120 | */ | ||
121 | *verdict = NF_ACCEPT; | ||
122 | return 0; | ||
123 | } | ||
124 | |||
125 | |||
126 | static void | ||
127 | esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, | ||
128 | int offset, const char *msg) | ||
129 | { | ||
130 | char buf[256]; | ||
131 | struct iphdr _iph, *ih; | ||
132 | |||
133 | ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); | ||
134 | if (ih == NULL) | ||
135 | sprintf(buf, "%s TRUNCATED", pp->name); | ||
136 | else | ||
137 | sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", | ||
138 | pp->name, NIPQUAD(ih->saddr), | ||
139 | NIPQUAD(ih->daddr)); | ||
140 | |||
141 | printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); | ||
142 | } | ||
143 | |||
144 | |||
145 | static void esp_init(struct ip_vs_protocol *pp) | ||
146 | { | ||
147 | /* nothing to do now */ | ||
148 | } | ||
149 | |||
150 | |||
151 | static void esp_exit(struct ip_vs_protocol *pp) | ||
152 | { | ||
153 | /* nothing to do now */ | ||
154 | } | ||
155 | |||
156 | |||
157 | struct ip_vs_protocol ip_vs_protocol_esp = { | ||
158 | .name = "ESP", | ||
159 | .protocol = IPPROTO_ESP, | ||
160 | .num_states = 1, | ||
161 | .dont_defrag = 1, | ||
162 | .init = esp_init, | ||
163 | .exit = esp_exit, | ||
164 | .conn_schedule = esp_conn_schedule, | ||
165 | .conn_in_get = esp_conn_in_get, | ||
166 | .conn_out_get = esp_conn_out_get, | ||
167 | .snat_handler = NULL, | ||
168 | .dnat_handler = NULL, | ||
169 | .csum_check = NULL, | ||
170 | .state_transition = NULL, | ||
171 | .register_app = NULL, | ||
172 | .unregister_app = NULL, | ||
173 | .app_conn_bind = NULL, | ||
174 | .debug_packet = esp_debug_packet, | ||
175 | .timeout_change = NULL, /* ISAKMP */ | ||
176 | }; | ||
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c deleted file mode 100644 index d0ea467986a0..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ /dev/null | |||
@@ -1,614 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_proto_tcp.c: TCP load balancing support for IPVS | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * Julian Anastasov <ja@ssi.bg> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * Changes: | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/ip.h> | ||
18 | #include <linux/tcp.h> /* for tcphdr */ | ||
19 | #include <net/ip.h> | ||
20 | #include <net/tcp.h> /* for csum_tcpudp_magic */ | ||
21 | #include <linux/netfilter.h> | ||
22 | #include <linux/netfilter_ipv4.h> | ||
23 | |||
24 | #include <net/ip_vs.h> | ||
25 | |||
26 | |||
27 | static struct ip_vs_conn * | ||
28 | tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
29 | const struct iphdr *iph, unsigned int proto_off, int inverse) | ||
30 | { | ||
31 | __be16 _ports[2], *pptr; | ||
32 | |||
33 | pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); | ||
34 | if (pptr == NULL) | ||
35 | return NULL; | ||
36 | |||
37 | if (likely(!inverse)) { | ||
38 | return ip_vs_conn_in_get(iph->protocol, | ||
39 | iph->saddr, pptr[0], | ||
40 | iph->daddr, pptr[1]); | ||
41 | } else { | ||
42 | return ip_vs_conn_in_get(iph->protocol, | ||
43 | iph->daddr, pptr[1], | ||
44 | iph->saddr, pptr[0]); | ||
45 | } | ||
46 | } | ||
47 | |||
48 | static struct ip_vs_conn * | ||
49 | tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
50 | const struct iphdr *iph, unsigned int proto_off, int inverse) | ||
51 | { | ||
52 | __be16 _ports[2], *pptr; | ||
53 | |||
54 | pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); | ||
55 | if (pptr == NULL) | ||
56 | return NULL; | ||
57 | |||
58 | if (likely(!inverse)) { | ||
59 | return ip_vs_conn_out_get(iph->protocol, | ||
60 | iph->saddr, pptr[0], | ||
61 | iph->daddr, pptr[1]); | ||
62 | } else { | ||
63 | return ip_vs_conn_out_get(iph->protocol, | ||
64 | iph->daddr, pptr[1], | ||
65 | iph->saddr, pptr[0]); | ||
66 | } | ||
67 | } | ||
68 | |||
69 | |||
70 | static int | ||
71 | tcp_conn_schedule(struct sk_buff *skb, | ||
72 | struct ip_vs_protocol *pp, | ||
73 | int *verdict, struct ip_vs_conn **cpp) | ||
74 | { | ||
75 | struct ip_vs_service *svc; | ||
76 | struct tcphdr _tcph, *th; | ||
77 | |||
78 | th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); | ||
79 | if (th == NULL) { | ||
80 | *verdict = NF_DROP; | ||
81 | return 0; | ||
82 | } | ||
83 | |||
84 | if (th->syn && | ||
85 | (svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, | ||
86 | ip_hdr(skb)->daddr, th->dest))) { | ||
87 | if (ip_vs_todrop()) { | ||
88 | /* | ||
89 | * It seems that we are very loaded. | ||
90 | * We have to drop this packet :( | ||
91 | */ | ||
92 | ip_vs_service_put(svc); | ||
93 | *verdict = NF_DROP; | ||
94 | return 0; | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * Let the virtual server select a real server for the | ||
99 | * incoming connection, and create a connection entry. | ||
100 | */ | ||
101 | *cpp = ip_vs_schedule(svc, skb); | ||
102 | if (!*cpp) { | ||
103 | *verdict = ip_vs_leave(svc, skb, pp); | ||
104 | return 0; | ||
105 | } | ||
106 | ip_vs_service_put(svc); | ||
107 | } | ||
108 | return 1; | ||
109 | } | ||
110 | |||
111 | |||
112 | static inline void | ||
113 | tcp_fast_csum_update(struct tcphdr *tcph, __be32 oldip, __be32 newip, | ||
114 | __be16 oldport, __be16 newport) | ||
115 | { | ||
116 | tcph->check = | ||
117 | csum_fold(ip_vs_check_diff4(oldip, newip, | ||
118 | ip_vs_check_diff2(oldport, newport, | ||
119 | ~csum_unfold(tcph->check)))); | ||
120 | } | ||
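
tcp_fast_csum_update() is an incremental checksum update in the style of RFC 1624: instead of re-summing the whole segment it folds in the difference between the old and new address/port words, HC' = ~(~HC + ~m + m'). A portable sketch over a single 16-bit word; the ip_vs_check_diff helpers apply the same identity to 32- and 16-bit quantities:

#include <stdio.h>
#include <stdint.h>

/* RFC 1624: HC' = ~(~HC + ~m + m'), with end-around carry folding. */
static uint16_t csum_update(uint16_t check, uint16_t oldw, uint16_t neww)
{
	uint32_t sum = (uint16_t)~check;
	sum += (uint16_t)~oldw;
	sum += neww;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* Rewriting port 8080 -> 80 touches only one 16-bit word. */
	uint16_t check = 0x1c46;   /* arbitrary prior checksum */
	printf("%#06x\n", (unsigned)csum_update(check, 8080, 80));
	return 0;
}
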
121 | |||
122 | |||
123 | static int | ||
124 | tcp_snat_handler(struct sk_buff *skb, | ||
125 | struct ip_vs_protocol *pp, struct ip_vs_conn *cp) | ||
126 | { | ||
127 | struct tcphdr *tcph; | ||
128 | const unsigned int tcphoff = ip_hdrlen(skb); | ||
129 | |||
130 | /* csum_check requires unshared skb */ | ||
131 | if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) | ||
132 | return 0; | ||
133 | |||
134 | if (unlikely(cp->app != NULL)) { | ||
135 | /* Some checks before mangling */ | ||
136 | if (pp->csum_check && !pp->csum_check(skb, pp)) | ||
137 | return 0; | ||
138 | |||
139 | /* Call application helper if needed */ | ||
140 | if (!ip_vs_app_pkt_out(cp, skb)) | ||
141 | return 0; | ||
142 | } | ||
143 | |||
144 | tcph = (void *)ip_hdr(skb) + tcphoff; | ||
145 | tcph->source = cp->vport; | ||
146 | |||
147 | /* Adjust TCP checksums */ | ||
148 | if (!cp->app) { | ||
149 | /* Only port and addr are changed, do fast csum update */ | ||
150 | tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr, | ||
151 | cp->dport, cp->vport); | ||
152 | if (skb->ip_summed == CHECKSUM_COMPLETE) | ||
153 | skb->ip_summed = CHECKSUM_NONE; | ||
154 | } else { | ||
155 | /* full checksum calculation */ | ||
156 | tcph->check = 0; | ||
157 | skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); | ||
158 | tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, | ||
159 | skb->len - tcphoff, | ||
160 | cp->protocol, skb->csum); | ||
161 | IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", | ||
162 | pp->name, tcph->check, | ||
163 | (char*)&(tcph->check) - (char*)tcph); | ||
164 | } | ||
165 | return 1; | ||
166 | } | ||
167 | |||
168 | |||
169 | static int | ||
170 | tcp_dnat_handler(struct sk_buff *skb, | ||
171 | struct ip_vs_protocol *pp, struct ip_vs_conn *cp) | ||
172 | { | ||
173 | struct tcphdr *tcph; | ||
174 | const unsigned int tcphoff = ip_hdrlen(skb); | ||
175 | |||
176 | /* csum_check requires unshared skb */ | ||
177 | if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) | ||
178 | return 0; | ||
179 | |||
180 | if (unlikely(cp->app != NULL)) { | ||
181 | /* Some checks before mangling */ | ||
182 | if (pp->csum_check && !pp->csum_check(skb, pp)) | ||
183 | return 0; | ||
184 | |||
185 | /* | ||
186 | * Attempt ip_vs_app call. | ||
187 | * It will fix ip_vs_conn and iph ack_seq stuff | ||
188 | */ | ||
189 | if (!ip_vs_app_pkt_in(cp, skb)) | ||
190 | return 0; | ||
191 | } | ||
192 | |||
193 | tcph = (void *)ip_hdr(skb) + tcphoff; | ||
194 | tcph->dest = cp->dport; | ||
195 | |||
196 | /* | ||
197 | * Adjust TCP checksums | ||
198 | */ | ||
199 | if (!cp->app) { | ||
200 | /* Only port and addr are changed, do fast csum update */ | ||
201 | tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr, | ||
202 | cp->vport, cp->dport); | ||
203 | if (skb->ip_summed == CHECKSUM_COMPLETE) | ||
204 | skb->ip_summed = CHECKSUM_NONE; | ||
205 | } else { | ||
206 | /* full checksum calculation */ | ||
207 | tcph->check = 0; | ||
208 | skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); | ||
209 | tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, | ||
210 | skb->len - tcphoff, | ||
211 | cp->protocol, skb->csum); | ||
212 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
213 | } | ||
214 | return 1; | ||
215 | } | ||
216 | |||
217 | |||
218 | static int | ||
219 | tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) | ||
220 | { | ||
221 | const unsigned int tcphoff = ip_hdrlen(skb); | ||
222 | |||
223 | switch (skb->ip_summed) { | ||
224 | case CHECKSUM_NONE: | ||
225 | skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); | ||
226 | case CHECKSUM_COMPLETE: | ||
227 | if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, | ||
228 | skb->len - tcphoff, | ||
229 | ip_hdr(skb)->protocol, skb->csum)) { | ||
230 | IP_VS_DBG_RL_PKT(0, pp, skb, 0, | ||
231 | "Failed checksum for"); | ||
232 | return 0; | ||
233 | } | ||
234 | break; | ||
235 | default: | ||
236 | /* No need to checksum. */ | ||
237 | break; | ||
238 | } | ||
239 | |||
240 | return 1; | ||
241 | } | ||
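
Note the deliberate fall-through from CHECKSUM_NONE into CHECKSUM_COMPLETE above: when no hardware sum exists, the code computes skb->csum itself and then verifies it exactly as in the COMPLETE case (later kernels mark the missing break with a fall-through annotation). The pass condition is the usual one's-complement identity: a segment with a correct checksum folds to 0xffff. Shown here with toy data:

#include <stdio.h>
#include <stdint.h>

static uint16_t fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

int main(void)
{
	/* two data words plus the checksum word the sender stored */
	uint32_t sum = 0x1234 + 0xabcd;
	uint16_t check = ~fold(sum);

	/* verification sums everything, checksum field included */
	printf("verifies: %d\n", fold(sum + check) == 0xffff);   /* 1 */
	return 0;
}
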
242 | |||
243 | |||
244 | #define TCP_DIR_INPUT 0 | ||
245 | #define TCP_DIR_OUTPUT 4 | ||
246 | #define TCP_DIR_INPUT_ONLY 8 | ||
247 | |||
248 | static const int tcp_state_off[IP_VS_DIR_LAST] = { | ||
249 | [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, | ||
250 | [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, | ||
251 | [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, | ||
252 | }; | ||
253 | |||
254 | /* | ||
255 | * Timeout table[state] | ||
256 | */ | ||
257 | static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { | ||
258 | [IP_VS_TCP_S_NONE] = 2*HZ, | ||
259 | [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, | ||
260 | [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, | ||
261 | [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, | ||
262 | [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, | ||
263 | [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, | ||
264 | [IP_VS_TCP_S_CLOSE] = 10*HZ, | ||
265 | [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, | ||
266 | [IP_VS_TCP_S_LAST_ACK] = 30*HZ, | ||
267 | [IP_VS_TCP_S_LISTEN] = 2*60*HZ, | ||
268 | [IP_VS_TCP_S_SYNACK] = 120*HZ, | ||
269 | [IP_VS_TCP_S_LAST] = 2*HZ, | ||
270 | }; | ||
271 | |||
272 | static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { | ||
273 | [IP_VS_TCP_S_NONE] = "NONE", | ||
274 | [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", | ||
275 | [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", | ||
276 | [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", | ||
277 | [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", | ||
278 | [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", | ||
279 | [IP_VS_TCP_S_CLOSE] = "CLOSE", | ||
280 | [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", | ||
281 | [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", | ||
282 | [IP_VS_TCP_S_LISTEN] = "LISTEN", | ||
283 | [IP_VS_TCP_S_SYNACK] = "SYNACK", | ||
284 | [IP_VS_TCP_S_LAST] = "BUG!", | ||
285 | }; | ||
286 | |||
287 | #define sNO IP_VS_TCP_S_NONE | ||
288 | #define sES IP_VS_TCP_S_ESTABLISHED | ||
289 | #define sSS IP_VS_TCP_S_SYN_SENT | ||
290 | #define sSR IP_VS_TCP_S_SYN_RECV | ||
291 | #define sFW IP_VS_TCP_S_FIN_WAIT | ||
292 | #define sTW IP_VS_TCP_S_TIME_WAIT | ||
293 | #define sCL IP_VS_TCP_S_CLOSE | ||
294 | #define sCW IP_VS_TCP_S_CLOSE_WAIT | ||
295 | #define sLA IP_VS_TCP_S_LAST_ACK | ||
296 | #define sLI IP_VS_TCP_S_LISTEN | ||
297 | #define sSA IP_VS_TCP_S_SYNACK | ||
298 | |||
299 | struct tcp_states_t { | ||
300 | int next_state[IP_VS_TCP_S_LAST]; | ||
301 | }; | ||
302 | |||
303 | static const char * tcp_state_name(int state) | ||
304 | { | ||
305 | if (state >= IP_VS_TCP_S_LAST) | ||
306 | return "ERR!"; | ||
307 | return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; | ||
308 | } | ||
309 | |||
310 | static struct tcp_states_t tcp_states [] = { | ||
311 | /* INPUT */ | ||
312 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ | ||
313 | /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, | ||
314 | /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, | ||
315 | /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, | ||
316 | /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, | ||
317 | |||
318 | /* OUTPUT */ | ||
319 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ | ||
320 | /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, | ||
321 | /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, | ||
322 | /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, | ||
323 | /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, | ||
324 | |||
325 | /* INPUT-ONLY */ | ||
326 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ | ||
327 | /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, | ||
328 | /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, | ||
329 | /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, | ||
330 | /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, | ||
331 | }; | ||
332 | |||
333 | static struct tcp_states_t tcp_states_dos [] = { | ||
334 | /* INPUT */ | ||
335 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ | ||
336 | /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, | ||
337 | /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, | ||
338 | /*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, | ||
339 | /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, | ||
340 | |||
341 | /* OUTPUT */ | ||
342 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ | ||
343 | /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, | ||
344 | /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, | ||
345 | /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, | ||
346 | /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, | ||
347 | |||
348 | /* INPUT-ONLY */ | ||
349 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ | ||
350 | /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, | ||
351 | /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, | ||
352 | /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, | ||
353 | /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, | ||
354 | }; | ||
355 | |||
356 | static struct tcp_states_t *tcp_state_table = tcp_states; | ||
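
Each table above is three blocks of four rows, indexed by direction offset plus packet event: set_tcp_state() below computes tcp_state_table[state_off + state_idx].next_state[cp->state], where state_idx is 0/1/2/3 for SYN/FIN/ACK/RST (see tcp_state_idx() further down) and state_off is 0, 4 or 8 for INPUT, OUTPUT and INPUT-ONLY. A minimal sketch of that lookup, using only the definitions above:

	/* Sketch only: resolve one transition against the tables above.
	 * E.g. an incoming SYN (event 0, offset TCP_DIR_INPUT) on a
	 * connection in sNO yields sSR (SYN_RECV). */
	static inline int tcp_next_state_example(int state_off, int event,
						 int cur_state)
	{
		return tcp_state_table[state_off + event].next_state[cur_state];
	}
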
357 | |||
358 | |||
359 | static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) | ||
360 | { | ||
361 | int on = (flags & 1); /* secure_tcp */ | ||
362 | |||
363 | /* | ||
364 | ** FIXME: change secure_tcp to independent sysctl var | ||
365 | ** or make it per-service or per-app because it is valid | ||
366 | ** for most if not all applications. Something | ||
367 | ** like "capabilities" (flags) for each object. | ||
368 | */ | ||
369 | tcp_state_table = (on? tcp_states_dos : tcp_states); | ||
370 | } | ||
371 | |||
372 | static int | ||
373 | tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) | ||
374 | { | ||
375 | return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, | ||
376 | tcp_state_name_table, sname, to); | ||
377 | } | ||
378 | |||
379 | static inline int tcp_state_idx(struct tcphdr *th) | ||
380 | { | ||
381 | if (th->rst) | ||
382 | return 3; | ||
383 | if (th->syn) | ||
384 | return 0; | ||
385 | if (th->fin) | ||
386 | return 1; | ||
387 | if (th->ack) | ||
388 | return 2; | ||
389 | return -1; | ||
390 | } | ||
391 | |||
392 | static inline void | ||
393 | set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, | ||
394 | int direction, struct tcphdr *th) | ||
395 | { | ||
396 | int state_idx; | ||
397 | int new_state = IP_VS_TCP_S_CLOSE; | ||
398 | int state_off = tcp_state_off[direction]; | ||
399 | |||
400 | /* | ||
401 | * Update state offset to INPUT_ONLY if necessary | ||
402 | * or clear the NOOUTPUT flag if an output packet is detected | ||
403 | */ | ||
404 | if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { | ||
405 | if (state_off == TCP_DIR_OUTPUT) | ||
406 | cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; | ||
407 | else | ||
408 | state_off = TCP_DIR_INPUT_ONLY; | ||
409 | } | ||
410 | |||
411 | if ((state_idx = tcp_state_idx(th)) < 0) { | ||
412 | IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); | ||
413 | goto tcp_state_out; | ||
414 | } | ||
415 | |||
416 | new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; | ||
417 | |||
418 | tcp_state_out: | ||
419 | if (new_state != cp->state) { | ||
420 | struct ip_vs_dest *dest = cp->dest; | ||
421 | |||
422 | IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" | ||
423 | "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n", | ||
424 | pp->name, | ||
425 | (state_off==TCP_DIR_OUTPUT)?"output ":"input ", | ||
426 | th->syn? 'S' : '.', | ||
427 | th->fin? 'F' : '.', | ||
428 | th->ack? 'A' : '.', | ||
429 | th->rst? 'R' : '.', | ||
430 | NIPQUAD(cp->daddr), ntohs(cp->dport), | ||
431 | NIPQUAD(cp->caddr), ntohs(cp->cport), | ||
432 | tcp_state_name(cp->state), | ||
433 | tcp_state_name(new_state), | ||
434 | atomic_read(&cp->refcnt)); | ||
435 | if (dest) { | ||
436 | if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && | ||
437 | (new_state != IP_VS_TCP_S_ESTABLISHED)) { | ||
438 | atomic_dec(&dest->activeconns); | ||
439 | atomic_inc(&dest->inactconns); | ||
440 | cp->flags |= IP_VS_CONN_F_INACTIVE; | ||
441 | } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && | ||
442 | (new_state == IP_VS_TCP_S_ESTABLISHED)) { | ||
443 | atomic_inc(&dest->activeconns); | ||
444 | atomic_dec(&dest->inactconns); | ||
445 | cp->flags &= ~IP_VS_CONN_F_INACTIVE; | ||
446 | } | ||
447 | } | ||
448 | } | ||
449 | |||
450 | cp->timeout = pp->timeout_table[cp->state = new_state]; | ||
451 | } | ||
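
The accounting block above keeps each real server's counters in step with the state machine: a connection counts as active only while ESTABLISHED, and as inactive in every other state. A minimal sketch of that predicate:

	/* Sketch only: the condition behind the activeconns/inactconns
	 * swapping in set_tcp_state() above. */
	static inline int tcp_conn_counts_as_active(int state)
	{
		return state == IP_VS_TCP_S_ESTABLISHED;
	}
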
452 | |||
453 | |||
454 | /* | ||
455 | * Handle state transitions | ||
456 | */ | ||
457 | static int | ||
458 | tcp_state_transition(struct ip_vs_conn *cp, int direction, | ||
459 | const struct sk_buff *skb, | ||
460 | struct ip_vs_protocol *pp) | ||
461 | { | ||
462 | struct tcphdr _tcph, *th; | ||
463 | |||
464 | th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); | ||
465 | if (th == NULL) | ||
466 | return 0; | ||
467 | |||
468 | spin_lock(&cp->lock); | ||
469 | set_tcp_state(pp, cp, direction, th); | ||
470 | spin_unlock(&cp->lock); | ||
471 | |||
472 | return 1; | ||
473 | } | ||
474 | |||
475 | |||
476 | /* | ||
477 | * Hash table for TCP application incarnations | ||
478 | */ | ||
479 | #define TCP_APP_TAB_BITS 4 | ||
480 | #define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) | ||
481 | #define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) | ||
482 | |||
483 | static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; | ||
484 | static DEFINE_SPINLOCK(tcp_app_lock); | ||
485 | |||
486 | static inline __u16 tcp_app_hashkey(__be16 port) | ||
487 | { | ||
488 | return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) | ||
489 | & TCP_APP_TAB_MASK; | ||
490 | } | ||
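
tcp_app_hashkey() xor-folds the next four bits of the raw (network-order) port into the low four, so that more of the port value influences the 16-bucket index. The same folding on a plain unsigned value, as a sketch with the constants spelled out:

	/* Sketch only: equivalent to tcp_app_hashkey() above,
	 * with TCP_APP_TAB_BITS = 4 and TCP_APP_TAB_MASK = 15. */
	static inline unsigned int app_hashkey_example(unsigned int port_raw)
	{
		return ((port_raw >> 4) ^ port_raw) & 15;
	}
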
491 | |||
492 | |||
493 | static int tcp_register_app(struct ip_vs_app *inc) | ||
494 | { | ||
495 | struct ip_vs_app *i; | ||
496 | __u16 hash; | ||
497 | __be16 port = inc->port; | ||
498 | int ret = 0; | ||
499 | |||
500 | hash = tcp_app_hashkey(port); | ||
501 | |||
502 | spin_lock_bh(&tcp_app_lock); | ||
503 | list_for_each_entry(i, &tcp_apps[hash], p_list) { | ||
504 | if (i->port == port) { | ||
505 | ret = -EEXIST; | ||
506 | goto out; | ||
507 | } | ||
508 | } | ||
509 | list_add(&inc->p_list, &tcp_apps[hash]); | ||
510 | atomic_inc(&ip_vs_protocol_tcp.appcnt); | ||
511 | |||
512 | out: | ||
513 | spin_unlock_bh(&tcp_app_lock); | ||
514 | return ret; | ||
515 | } | ||
516 | |||
517 | |||
518 | static void | ||
519 | tcp_unregister_app(struct ip_vs_app *inc) | ||
520 | { | ||
521 | spin_lock_bh(&tcp_app_lock); | ||
522 | atomic_dec(&ip_vs_protocol_tcp.appcnt); | ||
523 | list_del(&inc->p_list); | ||
524 | spin_unlock_bh(&tcp_app_lock); | ||
525 | } | ||
526 | |||
527 | |||
528 | static int | ||
529 | tcp_app_conn_bind(struct ip_vs_conn *cp) | ||
530 | { | ||
531 | int hash; | ||
532 | struct ip_vs_app *inc; | ||
533 | int result = 0; | ||
534 | |||
535 | /* Default binding: bind app only for NAT */ | ||
536 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) | ||
537 | return 0; | ||
538 | |||
539 | /* Lookup application incarnations and bind the right one */ | ||
540 | hash = tcp_app_hashkey(cp->vport); | ||
541 | |||
542 | spin_lock(&tcp_app_lock); | ||
543 | list_for_each_entry(inc, &tcp_apps[hash], p_list) { | ||
544 | if (inc->port == cp->vport) { | ||
545 | if (unlikely(!ip_vs_app_inc_get(inc))) | ||
546 | break; | ||
547 | spin_unlock(&tcp_app_lock); | ||
548 | |||
549 | IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" | ||
550 | "%u.%u.%u.%u:%u to app %s on port %u\n", | ||
551 | __func__, | ||
552 | NIPQUAD(cp->caddr), ntohs(cp->cport), | ||
553 | NIPQUAD(cp->vaddr), ntohs(cp->vport), | ||
554 | inc->name, ntohs(inc->port)); | ||
555 | cp->app = inc; | ||
556 | if (inc->init_conn) | ||
557 | result = inc->init_conn(inc, cp); | ||
558 | goto out; | ||
559 | } | ||
560 | } | ||
561 | spin_unlock(&tcp_app_lock); | ||
562 | |||
563 | out: | ||
564 | return result; | ||
565 | } | ||
566 | |||
567 | |||
568 | /* | ||
569 | * Set LISTEN timeout. (ip_vs_conn_put will setup timer) | ||
570 | */ | ||
571 | void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) | ||
572 | { | ||
573 | spin_lock(&cp->lock); | ||
574 | cp->state = IP_VS_TCP_S_LISTEN; | ||
575 | cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; | ||
576 | spin_unlock(&cp->lock); | ||
577 | } | ||
578 | |||
579 | |||
580 | static void ip_vs_tcp_init(struct ip_vs_protocol *pp) | ||
581 | { | ||
582 | IP_VS_INIT_HASH_TABLE(tcp_apps); | ||
583 | pp->timeout_table = tcp_timeouts; | ||
584 | } | ||
585 | |||
586 | |||
587 | static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) | ||
588 | { | ||
589 | } | ||
590 | |||
591 | |||
592 | struct ip_vs_protocol ip_vs_protocol_tcp = { | ||
593 | .name = "TCP", | ||
594 | .protocol = IPPROTO_TCP, | ||
595 | .num_states = IP_VS_TCP_S_LAST, | ||
596 | .dont_defrag = 0, | ||
597 | .appcnt = ATOMIC_INIT(0), | ||
598 | .init = ip_vs_tcp_init, | ||
599 | .exit = ip_vs_tcp_exit, | ||
600 | .register_app = tcp_register_app, | ||
601 | .unregister_app = tcp_unregister_app, | ||
602 | .conn_schedule = tcp_conn_schedule, | ||
603 | .conn_in_get = tcp_conn_in_get, | ||
604 | .conn_out_get = tcp_conn_out_get, | ||
605 | .snat_handler = tcp_snat_handler, | ||
606 | .dnat_handler = tcp_dnat_handler, | ||
607 | .csum_check = tcp_csum_check, | ||
608 | .state_name = tcp_state_name, | ||
609 | .state_transition = tcp_state_transition, | ||
610 | .app_conn_bind = tcp_app_conn_bind, | ||
611 | .debug_packet = ip_vs_tcpudp_debug_packet, | ||
612 | .timeout_change = tcp_timeout_change, | ||
613 | .set_state_timeout = tcp_set_state_timeout, | ||
614 | }; | ||
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c deleted file mode 100644 index c6be5d56823f..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ /dev/null | |||
@@ -1,428 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_proto_udp.c: UDP load balancing support for IPVS | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * Julian Anastasov <ja@ssi.bg> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * Changes: | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/in.h> | ||
17 | #include <linux/ip.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/netfilter.h> | ||
20 | #include <linux/netfilter_ipv4.h> | ||
21 | #include <linux/udp.h> | ||
22 | |||
23 | #include <net/ip_vs.h> | ||
24 | #include <net/ip.h> | ||
25 | |||
26 | static struct ip_vs_conn * | ||
27 | udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
28 | const struct iphdr *iph, unsigned int proto_off, int inverse) | ||
29 | { | ||
30 | struct ip_vs_conn *cp; | ||
31 | __be16 _ports[2], *pptr; | ||
32 | |||
33 | pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); | ||
34 | if (pptr == NULL) | ||
35 | return NULL; | ||
36 | |||
37 | if (likely(!inverse)) { | ||
38 | cp = ip_vs_conn_in_get(iph->protocol, | ||
39 | iph->saddr, pptr[0], | ||
40 | iph->daddr, pptr[1]); | ||
41 | } else { | ||
42 | cp = ip_vs_conn_in_get(iph->protocol, | ||
43 | iph->daddr, pptr[1], | ||
44 | iph->saddr, pptr[0]); | ||
45 | } | ||
46 | |||
47 | return cp; | ||
48 | } | ||
49 | |||
50 | |||
51 | static struct ip_vs_conn * | ||
52 | udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
53 | const struct iphdr *iph, unsigned int proto_off, int inverse) | ||
54 | { | ||
55 | struct ip_vs_conn *cp; | ||
56 | __be16 _ports[2], *pptr; | ||
57 | |||
58 | pptr = skb_header_pointer(skb, ip_hdrlen(skb), | ||
59 | sizeof(_ports), _ports); | ||
60 | if (pptr == NULL) | ||
61 | return NULL; | ||
62 | |||
63 | if (likely(!inverse)) { | ||
64 | cp = ip_vs_conn_out_get(iph->protocol, | ||
65 | iph->saddr, pptr[0], | ||
66 | iph->daddr, pptr[1]); | ||
67 | } else { | ||
68 | cp = ip_vs_conn_out_get(iph->protocol, | ||
69 | iph->daddr, pptr[1], | ||
70 | iph->saddr, pptr[0]); | ||
71 | } | ||
72 | |||
73 | return cp; | ||
74 | } | ||
75 | |||
76 | |||
77 | static int | ||
78 | udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, | ||
79 | int *verdict, struct ip_vs_conn **cpp) | ||
80 | { | ||
81 | struct ip_vs_service *svc; | ||
82 | struct udphdr _udph, *uh; | ||
83 | |||
84 | uh = skb_header_pointer(skb, ip_hdrlen(skb), | ||
85 | sizeof(_udph), &_udph); | ||
86 | if (uh == NULL) { | ||
87 | *verdict = NF_DROP; | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | if ((svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, | ||
92 | ip_hdr(skb)->daddr, uh->dest))) { | ||
93 | if (ip_vs_todrop()) { | ||
94 | /* | ||
95 | * It seems that we are very loaded. | ||
96 | * We have to drop this packet :( | ||
97 | */ | ||
98 | ip_vs_service_put(svc); | ||
99 | *verdict = NF_DROP; | ||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * Let the virtual server select a real server for the | ||
105 | * incoming connection, and create a connection entry. | ||
106 | */ | ||
107 | *cpp = ip_vs_schedule(svc, skb); | ||
108 | if (!*cpp) { | ||
109 | *verdict = ip_vs_leave(svc, skb, pp); | ||
110 | return 0; | ||
111 | } | ||
112 | ip_vs_service_put(svc); | ||
113 | } | ||
114 | return 1; | ||
115 | } | ||
116 | |||
117 | |||
118 | static inline void | ||
119 | udp_fast_csum_update(struct udphdr *uhdr, __be32 oldip, __be32 newip, | ||
120 | __be16 oldport, __be16 newport) | ||
121 | { | ||
122 | uhdr->check = | ||
123 | csum_fold(ip_vs_check_diff4(oldip, newip, | ||
124 | ip_vs_check_diff2(oldport, newport, | ||
125 | ~csum_unfold(uhdr->check)))); | ||
126 | if (!uhdr->check) | ||
127 | uhdr->check = CSUM_MANGLED_0; | ||
128 | } | ||
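
udp_fast_csum_update() applies the RFC 1624 incremental-update rule, HC' = ~(~HC + ~m + m'), through the ip_vs_check_diff helpers, so only the changed address and port words enter the arithmetic instead of the whole datagram. A minimal standalone sketch of the same rule for one 16-bit word:

	/* Sketch only: incrementally update a 16-bit one's-complement
	 * checksum when the word old_w in the covered data becomes new_w. */
	static inline unsigned short csum_update16_example(unsigned short check,
							   unsigned short old_w,
							   unsigned short new_w)
	{
		unsigned int sum = (~check & 0xffff) + (~old_w & 0xffff) + new_w;

		sum = (sum & 0xffff) + (sum >> 16);	/* fold the carries */
		sum = (sum & 0xffff) + (sum >> 16);
		return ~sum & 0xffff;
	}
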
129 | |||
130 | static int | ||
131 | udp_snat_handler(struct sk_buff *skb, | ||
132 | struct ip_vs_protocol *pp, struct ip_vs_conn *cp) | ||
133 | { | ||
134 | struct udphdr *udph; | ||
135 | const unsigned int udphoff = ip_hdrlen(skb); | ||
136 | |||
137 | /* csum_check requires unshared skb */ | ||
138 | if (!skb_make_writable(skb, udphoff+sizeof(*udph))) | ||
139 | return 0; | ||
140 | |||
141 | if (unlikely(cp->app != NULL)) { | ||
142 | /* Some checks before mangling */ | ||
143 | if (pp->csum_check && !pp->csum_check(skb, pp)) | ||
144 | return 0; | ||
145 | |||
146 | /* | ||
147 | * Call application helper if needed | ||
148 | */ | ||
149 | if (!ip_vs_app_pkt_out(cp, skb)) | ||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | udph = (void *)ip_hdr(skb) + udphoff; | ||
154 | udph->source = cp->vport; | ||
155 | |||
156 | /* | ||
157 | * Adjust UDP checksums | ||
158 | */ | ||
159 | if (!cp->app && (udph->check != 0)) { | ||
160 | /* Only port and addr are changed, do fast csum update */ | ||
161 | udp_fast_csum_update(udph, cp->daddr, cp->vaddr, | ||
162 | cp->dport, cp->vport); | ||
163 | if (skb->ip_summed == CHECKSUM_COMPLETE) | ||
164 | skb->ip_summed = CHECKSUM_NONE; | ||
165 | } else { | ||
166 | /* full checksum calculation */ | ||
167 | udph->check = 0; | ||
168 | skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); | ||
169 | udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, | ||
170 | skb->len - udphoff, | ||
171 | cp->protocol, skb->csum); | ||
172 | if (udph->check == 0) | ||
173 | udph->check = CSUM_MANGLED_0; | ||
174 | IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", | ||
175 | pp->name, udph->check, | ||
176 | (char*)&(udph->check) - (char*)udph); | ||
177 | } | ||
178 | return 1; | ||
179 | } | ||
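
On the full-recalculation path above, csum_tcpudp_magic() folds the IPv4 pseudo-header into the UDP payload sum. As a sketch, the pseudo-header it accounts for (never sent on the wire, only checksummed) looks like:

	/* Sketch only: the IPv4 pseudo-header covered by
	 * csum_tcpudp_magic() for UDP. */
	struct udp_pseudohdr_example {
		__be32 saddr;		/* source address (cp->vaddr after SNAT) */
		__be32 daddr;		/* destination address */
		__u8   zero;		/* always 0 */
		__u8   protocol;	/* IPPROTO_UDP */
		__be16 len;		/* UDP header + data length */
	};
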
180 | |||
181 | |||
182 | static int | ||
183 | udp_dnat_handler(struct sk_buff *skb, | ||
184 | struct ip_vs_protocol *pp, struct ip_vs_conn *cp) | ||
185 | { | ||
186 | struct udphdr *udph; | ||
187 | unsigned int udphoff = ip_hdrlen(skb); | ||
188 | |||
189 | /* csum_check requires unshared skb */ | ||
190 | if (!skb_make_writable(skb, udphoff+sizeof(*udph))) | ||
191 | return 0; | ||
192 | |||
193 | if (unlikely(cp->app != NULL)) { | ||
194 | /* Some checks before mangling */ | ||
195 | if (pp->csum_check && !pp->csum_check(skb, pp)) | ||
196 | return 0; | ||
197 | |||
198 | /* | ||
199 | * Attempt ip_vs_app call. | ||
200 | * It will fix up the ip_vs_conn if needed | ||
201 | */ | ||
202 | if (!ip_vs_app_pkt_in(cp, skb)) | ||
203 | return 0; | ||
204 | } | ||
205 | |||
206 | udph = (void *)ip_hdr(skb) + udphoff; | ||
207 | udph->dest = cp->dport; | ||
208 | |||
209 | /* | ||
210 | * Adjust UDP checksums | ||
211 | */ | ||
212 | if (!cp->app && (udph->check != 0)) { | ||
213 | /* Only port and addr are changed, do fast csum update */ | ||
214 | udp_fast_csum_update(udph, cp->vaddr, cp->daddr, | ||
215 | cp->vport, cp->dport); | ||
216 | if (skb->ip_summed == CHECKSUM_COMPLETE) | ||
217 | skb->ip_summed = CHECKSUM_NONE; | ||
218 | } else { | ||
219 | /* full checksum calculation */ | ||
220 | udph->check = 0; | ||
221 | skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); | ||
222 | udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, | ||
223 | skb->len - udphoff, | ||
224 | cp->protocol, skb->csum); | ||
225 | if (udph->check == 0) | ||
226 | udph->check = CSUM_MANGLED_0; | ||
227 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
228 | } | ||
229 | return 1; | ||
230 | } | ||
231 | |||
232 | |||
233 | static int | ||
234 | udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) | ||
235 | { | ||
236 | struct udphdr _udph, *uh; | ||
237 | const unsigned int udphoff = ip_hdrlen(skb); | ||
238 | |||
239 | uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); | ||
240 | if (uh == NULL) | ||
241 | return 0; | ||
242 | |||
243 | if (uh->check != 0) { | ||
244 | switch (skb->ip_summed) { | ||
245 | case CHECKSUM_NONE: | ||
246 | skb->csum = skb_checksum(skb, udphoff, | ||
247 | skb->len - udphoff, 0); /* fall through */ | ||
248 | case CHECKSUM_COMPLETE: | ||
249 | if (csum_tcpudp_magic(ip_hdr(skb)->saddr, | ||
250 | ip_hdr(skb)->daddr, | ||
251 | skb->len - udphoff, | ||
252 | ip_hdr(skb)->protocol, | ||
253 | skb->csum)) { | ||
254 | IP_VS_DBG_RL_PKT(0, pp, skb, 0, | ||
255 | "Failed checksum for"); | ||
256 | return 0; | ||
257 | } | ||
258 | break; | ||
259 | default: | ||
260 | /* No need to checksum. */ | ||
261 | break; | ||
262 | } | ||
263 | } | ||
264 | return 1; | ||
265 | } | ||
266 | |||
267 | |||
268 | /* | ||
269 | * Note: the caller guarantees that only one of register_app, | ||
270 | * unregister_app or app_conn_bind is called each time. | ||
271 | */ | ||
272 | |||
273 | #define UDP_APP_TAB_BITS 4 | ||
274 | #define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) | ||
275 | #define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) | ||
276 | |||
277 | static struct list_head udp_apps[UDP_APP_TAB_SIZE]; | ||
278 | static DEFINE_SPINLOCK(udp_app_lock); | ||
279 | |||
280 | static inline __u16 udp_app_hashkey(__be16 port) | ||
281 | { | ||
282 | return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) | ||
283 | & UDP_APP_TAB_MASK; | ||
284 | } | ||
285 | |||
286 | |||
287 | static int udp_register_app(struct ip_vs_app *inc) | ||
288 | { | ||
289 | struct ip_vs_app *i; | ||
290 | __u16 hash; | ||
291 | __be16 port = inc->port; | ||
292 | int ret = 0; | ||
293 | |||
294 | hash = udp_app_hashkey(port); | ||
295 | |||
296 | |||
297 | spin_lock_bh(&udp_app_lock); | ||
298 | list_for_each_entry(i, &udp_apps[hash], p_list) { | ||
299 | if (i->port == port) { | ||
300 | ret = -EEXIST; | ||
301 | goto out; | ||
302 | } | ||
303 | } | ||
304 | list_add(&inc->p_list, &udp_apps[hash]); | ||
305 | atomic_inc(&ip_vs_protocol_udp.appcnt); | ||
306 | |||
307 | out: | ||
308 | spin_unlock_bh(&udp_app_lock); | ||
309 | return ret; | ||
310 | } | ||
311 | |||
312 | |||
313 | static void | ||
314 | udp_unregister_app(struct ip_vs_app *inc) | ||
315 | { | ||
316 | spin_lock_bh(&udp_app_lock); | ||
317 | atomic_dec(&ip_vs_protocol_udp.appcnt); | ||
318 | list_del(&inc->p_list); | ||
319 | spin_unlock_bh(&udp_app_lock); | ||
320 | } | ||
321 | |||
322 | |||
323 | static int udp_app_conn_bind(struct ip_vs_conn *cp) | ||
324 | { | ||
325 | int hash; | ||
326 | struct ip_vs_app *inc; | ||
327 | int result = 0; | ||
328 | |||
329 | /* Default binding: bind app only for NAT */ | ||
330 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) | ||
331 | return 0; | ||
332 | |||
333 | /* Lookup application incarnations and bind the right one */ | ||
334 | hash = udp_app_hashkey(cp->vport); | ||
335 | |||
336 | spin_lock(&udp_app_lock); | ||
337 | list_for_each_entry(inc, &udp_apps[hash], p_list) { | ||
338 | if (inc->port == cp->vport) { | ||
339 | if (unlikely(!ip_vs_app_inc_get(inc))) | ||
340 | break; | ||
341 | spin_unlock(&udp_app_lock); | ||
342 | |||
343 | IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" | ||
344 | "%u.%u.%u.%u:%u to app %s on port %u\n", | ||
345 | __func__, | ||
346 | NIPQUAD(cp->caddr), ntohs(cp->cport), | ||
347 | NIPQUAD(cp->vaddr), ntohs(cp->vport), | ||
348 | inc->name, ntohs(inc->port)); | ||
349 | cp->app = inc; | ||
350 | if (inc->init_conn) | ||
351 | result = inc->init_conn(inc, cp); | ||
352 | goto out; | ||
353 | } | ||
354 | } | ||
355 | spin_unlock(&udp_app_lock); | ||
356 | |||
357 | out: | ||
358 | return result; | ||
359 | } | ||
360 | |||
361 | |||
362 | static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { | ||
363 | [IP_VS_UDP_S_NORMAL] = 5*60*HZ, | ||
364 | [IP_VS_UDP_S_LAST] = 2*HZ, | ||
365 | }; | ||
366 | |||
367 | static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { | ||
368 | [IP_VS_UDP_S_NORMAL] = "UDP", | ||
369 | [IP_VS_UDP_S_LAST] = "BUG!", | ||
370 | }; | ||
371 | |||
372 | |||
373 | static int | ||
374 | udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) | ||
375 | { | ||
376 | return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, | ||
377 | udp_state_name_table, sname, to); | ||
378 | } | ||
379 | |||
380 | static const char * udp_state_name(int state) | ||
381 | { | ||
382 | if (state >= IP_VS_UDP_S_LAST) | ||
383 | return "ERR!"; | ||
384 | return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; | ||
385 | } | ||
386 | |||
387 | static int | ||
388 | udp_state_transition(struct ip_vs_conn *cp, int direction, | ||
389 | const struct sk_buff *skb, | ||
390 | struct ip_vs_protocol *pp) | ||
391 | { | ||
392 | cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; | ||
393 | return 1; | ||
394 | } | ||
395 | |||
396 | static void udp_init(struct ip_vs_protocol *pp) | ||
397 | { | ||
398 | IP_VS_INIT_HASH_TABLE(udp_apps); | ||
399 | pp->timeout_table = udp_timeouts; | ||
400 | } | ||
401 | |||
402 | static void udp_exit(struct ip_vs_protocol *pp) | ||
403 | { | ||
404 | } | ||
405 | |||
406 | |||
407 | struct ip_vs_protocol ip_vs_protocol_udp = { | ||
408 | .name = "UDP", | ||
409 | .protocol = IPPROTO_UDP, | ||
410 | .num_states = IP_VS_UDP_S_LAST, | ||
411 | .dont_defrag = 0, | ||
412 | .init = udp_init, | ||
413 | .exit = udp_exit, | ||
414 | .conn_schedule = udp_conn_schedule, | ||
415 | .conn_in_get = udp_conn_in_get, | ||
416 | .conn_out_get = udp_conn_out_get, | ||
417 | .snat_handler = udp_snat_handler, | ||
418 | .dnat_handler = udp_dnat_handler, | ||
419 | .csum_check = udp_csum_check, | ||
420 | .state_transition = udp_state_transition, | ||
421 | .state_name = udp_state_name, | ||
422 | .register_app = udp_register_app, | ||
423 | .unregister_app = udp_unregister_app, | ||
424 | .app_conn_bind = udp_app_conn_bind, | ||
425 | .debug_packet = ip_vs_tcpudp_debug_packet, | ||
426 | .timeout_change = NULL, | ||
427 | .set_state_timeout = udp_set_state_timeout, | ||
428 | }; | ||
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c deleted file mode 100644 index 358110d17e59..000000000000 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ /dev/null | |||
@@ -1,116 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Round-Robin Scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * Peter Kese <peter.kese@ijs.si> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * Fixes/Changes: | ||
13 | * Wensong Zhang : changed the ip_vs_rr_schedule to return dest | ||
14 | * Julian Anastasov : fixed the NULL pointer access bug in debugging | ||
15 | * Wensong Zhang : changed some cosmetic things for debugging | ||
16 | * Wensong Zhang : changed for the d-linked destination list | ||
17 | * Wensong Zhang : added the ip_vs_rr_update_svc | ||
18 | * Wensong Zhang : added any dest with weight=0 is quiesced | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | #include <linux/kernel.h> | ||
24 | |||
25 | #include <net/ip_vs.h> | ||
26 | |||
27 | |||
28 | static int ip_vs_rr_init_svc(struct ip_vs_service *svc) | ||
29 | { | ||
30 | svc->sched_data = &svc->destinations; | ||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | |||
35 | static int ip_vs_rr_done_svc(struct ip_vs_service *svc) | ||
36 | { | ||
37 | return 0; | ||
38 | } | ||
39 | |||
40 | |||
41 | static int ip_vs_rr_update_svc(struct ip_vs_service *svc) | ||
42 | { | ||
43 | svc->sched_data = &svc->destinations; | ||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | |||
48 | /* | ||
49 | * Round-Robin Scheduling | ||
50 | */ | ||
51 | static struct ip_vs_dest * | ||
52 | ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
53 | { | ||
54 | struct list_head *p, *q; | ||
55 | struct ip_vs_dest *dest; | ||
56 | |||
57 | IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); | ||
58 | |||
59 | write_lock(&svc->sched_lock); | ||
60 | p = (struct list_head *)svc->sched_data; | ||
61 | p = p->next; | ||
62 | q = p; | ||
63 | do { | ||
64 | /* skip list head */ | ||
65 | if (q == &svc->destinations) { | ||
66 | q = q->next; | ||
67 | continue; | ||
68 | } | ||
69 | |||
70 | dest = list_entry(q, struct ip_vs_dest, n_list); | ||
71 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && | ||
72 | atomic_read(&dest->weight) > 0) | ||
73 | /* HIT */ | ||
74 | goto out; | ||
75 | q = q->next; | ||
76 | } while (q != p); | ||
77 | write_unlock(&svc->sched_lock); | ||
78 | return NULL; | ||
79 | |||
80 | out: | ||
81 | svc->sched_data = q; | ||
82 | write_unlock(&svc->sched_lock); | ||
83 | IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " | ||
84 | "activeconns %d refcnt %d weight %d\n", | ||
85 | NIPQUAD(dest->addr), ntohs(dest->port), | ||
86 | atomic_read(&dest->activeconns), | ||
87 | atomic_read(&dest->refcnt), atomic_read(&dest->weight)); | ||
88 | |||
89 | return dest; | ||
90 | } | ||
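
ip_vs_rr_schedule() above rotates svc->sched_data around the circular destination list, skipping the list head and any quiesced (weight 0) or overloaded server. The same policy over a plain array, as a sketch:

	/* Sketch only: round robin over an array of weights; *last plays
	 * the role of svc->sched_data (the position served previously). */
	static int rr_pick_example(const int weight[], int n, int *last)
	{
		int i;

		for (i = 1; i <= n; i++) {
			int q = (*last + i) % n;

			if (weight[q] > 0) {	/* weight 0 = quiesced */
				*last = q;
				return q;
			}
		}
		return -1;			/* nothing available */
	}
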
91 | |||
92 | |||
93 | static struct ip_vs_scheduler ip_vs_rr_scheduler = { | ||
94 | .name = "rr", /* name */ | ||
95 | .refcnt = ATOMIC_INIT(0), | ||
96 | .module = THIS_MODULE, | ||
97 | .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), | ||
98 | .init_service = ip_vs_rr_init_svc, | ||
99 | .done_service = ip_vs_rr_done_svc, | ||
100 | .update_service = ip_vs_rr_update_svc, | ||
101 | .schedule = ip_vs_rr_schedule, | ||
102 | }; | ||
103 | |||
104 | static int __init ip_vs_rr_init(void) | ||
105 | { | ||
106 | return register_ip_vs_scheduler(&ip_vs_rr_scheduler); | ||
107 | } | ||
108 | |||
109 | static void __exit ip_vs_rr_cleanup(void) | ||
110 | { | ||
111 | unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); | ||
112 | } | ||
113 | |||
114 | module_init(ip_vs_rr_init); | ||
115 | module_exit(ip_vs_rr_cleanup); | ||
116 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c deleted file mode 100644 index a46ad9e35016..000000000000 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ /dev/null | |||
@@ -1,251 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the Netfilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
9 | * Peter Kese <peter.kese@ijs.si> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public License | ||
13 | * as published by the Free Software Foundation; either version | ||
14 | * 2 of the License, or (at your option) any later version. | ||
15 | * | ||
16 | * Changes: | ||
17 | * | ||
18 | */ | ||
19 | |||
20 | #include <linux/module.h> | ||
21 | #include <linux/spinlock.h> | ||
22 | #include <linux/interrupt.h> | ||
23 | #include <asm/string.h> | ||
24 | #include <linux/kmod.h> | ||
25 | #include <linux/sysctl.h> | ||
26 | |||
27 | #include <net/ip_vs.h> | ||
28 | |||
29 | /* | ||
30 | * IPVS scheduler list | ||
31 | */ | ||
32 | static LIST_HEAD(ip_vs_schedulers); | ||
33 | |||
34 | /* lock for service table */ | ||
35 | static DEFINE_RWLOCK(__ip_vs_sched_lock); | ||
36 | |||
37 | |||
38 | /* | ||
39 | * Bind a service with a scheduler | ||
40 | */ | ||
41 | int ip_vs_bind_scheduler(struct ip_vs_service *svc, | ||
42 | struct ip_vs_scheduler *scheduler) | ||
43 | { | ||
44 | int ret; | ||
45 | |||
46 | if (svc == NULL) { | ||
47 | IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); | ||
48 | return -EINVAL; | ||
49 | } | ||
50 | if (scheduler == NULL) { | ||
51 | IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); | ||
52 | return -EINVAL; | ||
53 | } | ||
54 | |||
55 | svc->scheduler = scheduler; | ||
56 | |||
57 | if (scheduler->init_service) { | ||
58 | ret = scheduler->init_service(svc); | ||
59 | if (ret) { | ||
60 | IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); | ||
61 | return ret; | ||
62 | } | ||
63 | } | ||
64 | |||
65 | return 0; | ||
66 | } | ||
67 | |||
68 | |||
69 | /* | ||
70 | * Unbind a service with its scheduler | ||
71 | */ | ||
72 | int ip_vs_unbind_scheduler(struct ip_vs_service *svc) | ||
73 | { | ||
74 | struct ip_vs_scheduler *sched; | ||
75 | |||
76 | if (svc == NULL) { | ||
77 | IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); | ||
78 | return -EINVAL; | ||
79 | } | ||
80 | |||
81 | sched = svc->scheduler; | ||
82 | if (sched == NULL) { | ||
83 | IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); | ||
84 | return -EINVAL; | ||
85 | } | ||
86 | |||
87 | if (sched->done_service) { | ||
88 | if (sched->done_service(svc) != 0) { | ||
89 | IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); | ||
90 | return -EINVAL; | ||
91 | } | ||
92 | } | ||
93 | |||
94 | svc->scheduler = NULL; | ||
95 | return 0; | ||
96 | } | ||
97 | |||
98 | |||
99 | /* | ||
100 | * Get scheduler in the scheduler list by name | ||
101 | */ | ||
102 | static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) | ||
103 | { | ||
104 | struct ip_vs_scheduler *sched; | ||
105 | |||
106 | IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", | ||
107 | sched_name); | ||
108 | |||
109 | read_lock_bh(&__ip_vs_sched_lock); | ||
110 | |||
111 | list_for_each_entry(sched, &ip_vs_schedulers, n_list) { | ||
112 | /* | ||
113 | * Test and get the modules atomically | ||
114 | */ | ||
115 | if (sched->module && !try_module_get(sched->module)) { | ||
116 | /* | ||
117 | * This scheduler is just deleted | ||
118 | */ | ||
119 | continue; | ||
120 | } | ||
121 | if (strcmp(sched_name, sched->name)==0) { | ||
122 | /* HIT */ | ||
123 | read_unlock_bh(&__ip_vs_sched_lock); | ||
124 | return sched; | ||
125 | } | ||
126 | if (sched->module) | ||
127 | module_put(sched->module); | ||
128 | } | ||
129 | |||
130 | read_unlock_bh(&__ip_vs_sched_lock); | ||
131 | return NULL; | ||
132 | } | ||
133 | |||
134 | |||
135 | /* | ||
136 | * Lookup scheduler and try to load it if it doesn't exist | ||
137 | */ | ||
138 | struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) | ||
139 | { | ||
140 | struct ip_vs_scheduler *sched; | ||
141 | |||
142 | /* | ||
143 | * Search for the scheduler by sched_name | ||
144 | */ | ||
145 | sched = ip_vs_sched_getbyname(sched_name); | ||
146 | |||
147 | /* | ||
148 | * If scheduler not found, load the module and search again | ||
149 | */ | ||
150 | if (sched == NULL) { | ||
151 | request_module("ip_vs_%s", sched_name); | ||
152 | sched = ip_vs_sched_getbyname(sched_name); | ||
153 | } | ||
154 | |||
155 | return sched; | ||
156 | } | ||
157 | |||
158 | void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) | ||
159 | { | ||
160 | if (scheduler->module) | ||
161 | module_put(scheduler->module); | ||
162 | } | ||
163 | |||
164 | |||
165 | /* | ||
166 | * Register a scheduler in the scheduler list | ||
167 | */ | ||
168 | int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) | ||
169 | { | ||
170 | struct ip_vs_scheduler *sched; | ||
171 | |||
172 | if (!scheduler) { | ||
173 | IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); | ||
174 | return -EINVAL; | ||
175 | } | ||
176 | |||
177 | if (!scheduler->name) { | ||
178 | IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); | ||
179 | return -EINVAL; | ||
180 | } | ||
181 | |||
182 | /* increase the module use count */ | ||
183 | ip_vs_use_count_inc(); | ||
184 | |||
185 | write_lock_bh(&__ip_vs_sched_lock); | ||
186 | |||
187 | if (!list_empty(&scheduler->n_list)) { | ||
188 | write_unlock_bh(&__ip_vs_sched_lock); | ||
189 | ip_vs_use_count_dec(); | ||
190 | IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " | ||
191 | "already linked\n", scheduler->name); | ||
192 | return -EINVAL; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * Make sure that the scheduler with this name doesn't exist | ||
197 | * in the scheduler list. | ||
198 | */ | ||
199 | list_for_each_entry(sched, &ip_vs_schedulers, n_list) { | ||
200 | if (strcmp(scheduler->name, sched->name) == 0) { | ||
201 | write_unlock_bh(&__ip_vs_sched_lock); | ||
202 | ip_vs_use_count_dec(); | ||
203 | IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " | ||
204 | "already existed in the system\n", | ||
205 | scheduler->name); | ||
206 | return -EINVAL; | ||
207 | } | ||
208 | } | ||
209 | /* | ||
210 | * Add it into the d-linked scheduler list | ||
211 | */ | ||
212 | list_add(&scheduler->n_list, &ip_vs_schedulers); | ||
213 | write_unlock_bh(&__ip_vs_sched_lock); | ||
214 | |||
215 | IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); | ||
216 | |||
217 | return 0; | ||
218 | } | ||
219 | |||
220 | |||
221 | /* | ||
222 | * Unregister a scheduler from the scheduler list | ||
223 | */ | ||
224 | int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) | ||
225 | { | ||
226 | if (!scheduler) { | ||
227 | IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); | ||
228 | return -EINVAL; | ||
229 | } | ||
230 | |||
231 | write_lock_bh(&__ip_vs_sched_lock); | ||
232 | if (list_empty(&scheduler->n_list)) { | ||
233 | write_unlock_bh(&__ip_vs_sched_lock); | ||
234 | IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " | ||
235 | "is not in the list. failed\n", scheduler->name); | ||
236 | return -EINVAL; | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * Remove it from the d-linked scheduler list | ||
241 | */ | ||
242 | list_del(&scheduler->n_list); | ||
243 | write_unlock_bh(&__ip_vs_sched_lock); | ||
244 | |||
245 | /* decrease the module use count */ | ||
246 | ip_vs_use_count_dec(); | ||
247 | |||
248 | IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); | ||
249 | |||
250 | return 0; | ||
251 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c deleted file mode 100644 index 77663d84cbd1..000000000000 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ /dev/null | |||
@@ -1,161 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Shortest Expected Delay scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | * The SED algorithm attempts to minimize each job's expected delay until | ||
17 | * completion. The expected delay that the job will experience is | ||
18 | * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of | ||
19 | * jobs on the ith server and Ui is the fixed service rate (weight) of | ||
20 | * the ith server. The SED algorithm adopts a greedy policy in which each | ||
21 | * job does what is in its own best interest, i.e. joins the queue that would | ||
22 | * minimize its expected delay of completion. | ||
23 | * | ||
24 | * See the following paper for more information: | ||
25 | * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing | ||
26 | * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, | ||
27 | * pages 986-994, 1988. | ||
28 | * | ||
29 | * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me. | ||
30 | * | ||
31 | * The difference between SED and WLC is that SED includes the incoming | ||
32 | * job in the cost function (the increment of 1). SED may outperform | ||
33 | * WLC when scheduling big jobs in large heterogeneous systems | ||
34 | * (where the server weights vary a lot). | ||
35 | * | ||
36 | */ | ||
37 | |||
38 | #include <linux/module.h> | ||
39 | #include <linux/kernel.h> | ||
40 | |||
41 | #include <net/ip_vs.h> | ||
42 | |||
43 | |||
44 | static int | ||
45 | ip_vs_sed_init_svc(struct ip_vs_service *svc) | ||
46 | { | ||
47 | return 0; | ||
48 | } | ||
49 | |||
50 | |||
51 | static int | ||
52 | ip_vs_sed_done_svc(struct ip_vs_service *svc) | ||
53 | { | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | |||
58 | static int | ||
59 | ip_vs_sed_update_svc(struct ip_vs_service *svc) | ||
60 | { | ||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | |||
65 | static inline unsigned int | ||
66 | ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) | ||
67 | { | ||
68 | /* | ||
69 | * We only use the active connection number in the cost | ||
70 | * calculation here. | ||
71 | */ | ||
72 | return atomic_read(&dest->activeconns) + 1; | ||
73 | } | ||
74 | |||
75 | |||
76 | /* | ||
77 | * Shortest Expected Delay scheduling | ||
78 | */ | ||
79 | static struct ip_vs_dest * | ||
80 | ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
81 | { | ||
82 | struct ip_vs_dest *dest, *least; | ||
83 | unsigned int loh, doh; | ||
84 | |||
85 | IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); | ||
86 | |||
87 | /* | ||
88 | * We calculate the load of each dest server as follows: | ||
89 | * (server expected overhead) / dest->weight | ||
90 | * | ||
91 | * Remember -- no floats in kernel mode!!! | ||
92 | * The comparison of h1*w2 > h2*w1 is equivalent to that of | ||
93 | * h1/w1 > h2/w2 | ||
94 | * if every weight is larger than zero. | ||
95 | * | ||
96 | * The server with weight=0 is quiesced and will not receive any | ||
97 | * new connections. | ||
98 | */ | ||
99 | |||
100 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
101 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && | ||
102 | atomic_read(&dest->weight) > 0) { | ||
103 | least = dest; | ||
104 | loh = ip_vs_sed_dest_overhead(least); | ||
105 | goto nextstage; | ||
106 | } | ||
107 | } | ||
108 | return NULL; | ||
109 | |||
110 | /* | ||
111 | * Find the destination with the least load. | ||
112 | */ | ||
113 | nextstage: | ||
114 | list_for_each_entry_continue(dest, &svc->destinations, n_list) { | ||
115 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
116 | continue; | ||
117 | doh = ip_vs_sed_dest_overhead(dest); | ||
118 | if (loh * atomic_read(&dest->weight) > | ||
119 | doh * atomic_read(&least->weight)) { | ||
120 | least = dest; | ||
121 | loh = doh; | ||
122 | } | ||
123 | } | ||
124 | |||
125 | IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " | ||
126 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
127 | NIPQUAD(least->addr), ntohs(least->port), | ||
128 | atomic_read(&least->activeconns), | ||
129 | atomic_read(&least->refcnt), | ||
130 | atomic_read(&least->weight), loh); | ||
131 | |||
132 | return least; | ||
133 | } | ||
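
The loop above selects the server minimising (activeconns+1)/weight without kernel floating point: ip_vs_sed_dest_overhead() already adds the +1, and the division is replaced by cross-multiplication, valid while all weights are positive. A minimal sketch of that test:

	/* Sketch only: true when "dest" has strictly lower expected delay
	 * than the current "least", i.e. loh/wl > doh/wd rewritten as
	 * loh*wd > doh*wl for positive weights. */
	static inline int sed_dest_is_better_example(unsigned int loh, int wl,
						     unsigned int doh, int wd)
	{
		return loh * (unsigned int)wd > doh * (unsigned int)wl;
	}
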
134 | |||
135 | |||
136 | static struct ip_vs_scheduler ip_vs_sed_scheduler = | ||
137 | { | ||
138 | .name = "sed", | ||
139 | .refcnt = ATOMIC_INIT(0), | ||
140 | .module = THIS_MODULE, | ||
141 | .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), | ||
142 | .init_service = ip_vs_sed_init_svc, | ||
143 | .done_service = ip_vs_sed_done_svc, | ||
144 | .update_service = ip_vs_sed_update_svc, | ||
145 | .schedule = ip_vs_sed_schedule, | ||
146 | }; | ||
147 | |||
148 | |||
149 | static int __init ip_vs_sed_init(void) | ||
150 | { | ||
151 | return register_ip_vs_scheduler(&ip_vs_sed_scheduler); | ||
152 | } | ||
153 | |||
154 | static void __exit ip_vs_sed_cleanup(void) | ||
155 | { | ||
156 | unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); | ||
157 | } | ||
158 | |||
159 | module_init(ip_vs_sed_init); | ||
160 | module_exit(ip_vs_sed_cleanup); | ||
161 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c deleted file mode 100644 index 7b979e228056..000000000000 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ /dev/null | |||
@@ -1,255 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Source Hashing scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@gnuchina.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | * The sh algorithm is to select server by the hash key of source IP | ||
17 | * address. The pseudo code is as follows: | ||
18 | * | ||
19 | * n <- servernode[src_ip]; | ||
20 | * if (n is dead) OR | ||
21 | * (n is overloaded) or (n.weight <= 0) then | ||
22 | * return NULL; | ||
23 | * | ||
24 | * return n; | ||
25 | * | ||
26 | * Note that servernode is a 256-bucket hash table that maps the hash | ||
27 | * index derived from packet source IP address to the current server | ||
28 | * array. If the sh scheduler is used in a cache cluster, it is good to | ||
29 | * combine it with the cache_bypass feature. When the statically assigned | ||
30 | * server is dead or overloaded, the load balancer can bypass the cache | ||
31 | * server and send requests to the original server directly. | ||
32 | * | ||
33 | */ | ||
34 | |||
35 | #include <linux/ip.h> | ||
36 | #include <linux/module.h> | ||
37 | #include <linux/kernel.h> | ||
38 | #include <linux/skbuff.h> | ||
39 | |||
40 | #include <net/ip_vs.h> | ||
41 | |||
42 | |||
43 | /* | ||
44 | * IPVS SH bucket | ||
45 | */ | ||
46 | struct ip_vs_sh_bucket { | ||
47 | struct ip_vs_dest *dest; /* real server (cache) */ | ||
48 | }; | ||
49 | |||
50 | /* | ||
51 | * for IPVS SH entry hash table | ||
52 | */ | ||
53 | #ifndef CONFIG_IP_VS_SH_TAB_BITS | ||
54 | #define CONFIG_IP_VS_SH_TAB_BITS 8 | ||
55 | #endif | ||
56 | #define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS | ||
57 | #define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) | ||
58 | #define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) | ||
59 | |||
60 | |||
61 | /* | ||
62 | * Returns hash value for IPVS SH entry | ||
63 | */ | ||
64 | static inline unsigned ip_vs_sh_hashkey(__be32 addr) | ||
65 | { | ||
66 | return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; | ||
67 | } | ||
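
The multiplier 2654435761 is a prime close to 2^32/phi (the golden ratio), the classic Knuth multiplicative-hash constant, so nearby client addresses scatter across the 256 buckets. The same computation with the default table constants spelled out, as a sketch:

	/* Sketch only: multiplicative hash of a host-order IPv4 address
	 * into one of 2^8 buckets (IP_VS_SH_TAB_BITS = 8, MASK = 255). */
	static inline unsigned int sh_hashkey_example(unsigned int addr_host)
	{
		return (addr_host * 2654435761UL) & 255;
	}
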
68 | |||
69 | |||
70 | /* | ||
71 | * Get ip_vs_dest associated with supplied parameters. | ||
72 | */ | ||
73 | static inline struct ip_vs_dest * | ||
74 | ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr) | ||
75 | { | ||
76 | return (tbl[ip_vs_sh_hashkey(addr)]).dest; | ||
77 | } | ||
78 | |||
79 | |||
80 | /* | ||
81 | * Assign all the hash buckets of the specified table with the service. | ||
82 | */ | ||
83 | static int | ||
84 | ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) | ||
85 | { | ||
86 | int i; | ||
87 | struct ip_vs_sh_bucket *b; | ||
88 | struct list_head *p; | ||
89 | struct ip_vs_dest *dest; | ||
90 | |||
91 | b = tbl; | ||
92 | p = &svc->destinations; | ||
93 | for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { | ||
94 | if (list_empty(p)) { | ||
95 | b->dest = NULL; | ||
96 | } else { | ||
97 | if (p == &svc->destinations) | ||
98 | p = p->next; | ||
99 | |||
100 | dest = list_entry(p, struct ip_vs_dest, n_list); | ||
101 | atomic_inc(&dest->refcnt); | ||
102 | b->dest = dest; | ||
103 | |||
104 | p = p->next; | ||
105 | } | ||
106 | b++; | ||
107 | } | ||
108 | return 0; | ||
109 | } | ||
110 | |||
111 | |||
112 | /* | ||
113 | * Flush all the hash buckets of the specified table. | ||
114 | */ | ||
115 | static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) | ||
116 | { | ||
117 | int i; | ||
118 | struct ip_vs_sh_bucket *b; | ||
119 | |||
120 | b = tbl; | ||
121 | for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { | ||
122 | if (b->dest) { | ||
123 | atomic_dec(&b->dest->refcnt); | ||
124 | b->dest = NULL; | ||
125 | } | ||
126 | b++; | ||
127 | } | ||
128 | } | ||
129 | |||
130 | |||
131 | static int ip_vs_sh_init_svc(struct ip_vs_service *svc) | ||
132 | { | ||
133 | struct ip_vs_sh_bucket *tbl; | ||
134 | |||
135 | /* allocate the SH table for this service */ | ||
136 | tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, | ||
137 | GFP_ATOMIC); | ||
138 | if (tbl == NULL) { | ||
139 | IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); | ||
140 | return -ENOMEM; | ||
141 | } | ||
142 | svc->sched_data = tbl; | ||
143 | IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " | ||
144 | "current service\n", | ||
145 | sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); | ||
146 | |||
147 | /* assign the hash buckets with the updated service */ | ||
148 | ip_vs_sh_assign(tbl, svc); | ||
149 | |||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | |||
154 | static int ip_vs_sh_done_svc(struct ip_vs_service *svc) | ||
155 | { | ||
156 | struct ip_vs_sh_bucket *tbl = svc->sched_data; | ||
157 | |||
158 | /* got to clean up hash buckets here */ | ||
159 | ip_vs_sh_flush(tbl); | ||
160 | |||
161 | /* release the table itself */ | ||
162 | kfree(svc->sched_data); | ||
163 | IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", | ||
164 | sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); | ||
165 | |||
166 | return 0; | ||
167 | } | ||
168 | |||
169 | |||
170 | static int ip_vs_sh_update_svc(struct ip_vs_service *svc) | ||
171 | { | ||
172 | struct ip_vs_sh_bucket *tbl = svc->sched_data; | ||
173 | |||
174 | /* got to clean up hash buckets here */ | ||
175 | ip_vs_sh_flush(tbl); | ||
176 | |||
177 | /* assign the hash buckets with the updated service */ | ||
178 | ip_vs_sh_assign(tbl, svc); | ||
179 | |||
180 | return 0; | ||
181 | } | ||
182 | |||
183 | |||
184 | /* | ||
185 | * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, | ||
186 | * consider that the server is overloaded here. | ||
187 | */ | ||
188 | static inline int is_overloaded(struct ip_vs_dest *dest) | ||
189 | { | ||
190 | return dest->flags & IP_VS_DEST_F_OVERLOAD; | ||
191 | } | ||
192 | |||
193 | |||
194 | /* | ||
195 | * Source Hashing scheduling | ||
196 | */ | ||
197 | static struct ip_vs_dest * | ||
198 | ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
199 | { | ||
200 | struct ip_vs_dest *dest; | ||
201 | struct ip_vs_sh_bucket *tbl; | ||
202 | struct iphdr *iph = ip_hdr(skb); | ||
203 | |||
204 | IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); | ||
205 | |||
206 | tbl = (struct ip_vs_sh_bucket *)svc->sched_data; | ||
207 | dest = ip_vs_sh_get(tbl, iph->saddr); | ||
208 | if (!dest | ||
209 | || !(dest->flags & IP_VS_DEST_F_AVAILABLE) | ||
210 | || atomic_read(&dest->weight) <= 0 | ||
211 | || is_overloaded(dest)) { | ||
212 | return NULL; | ||
213 | } | ||
214 | |||
215 | IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " | ||
216 | "--> server %u.%u.%u.%u:%d\n", | ||
217 | NIPQUAD(iph->saddr), | ||
218 | NIPQUAD(dest->addr), | ||
219 | ntohs(dest->port)); | ||
220 | |||
221 | return dest; | ||
222 | } | ||
223 | |||
224 | |||
225 | /* | ||
226 | * IPVS SH Scheduler structure | ||
227 | */ | ||
228 | static struct ip_vs_scheduler ip_vs_sh_scheduler = | ||
229 | { | ||
230 | .name = "sh", | ||
231 | .refcnt = ATOMIC_INIT(0), | ||
232 | .module = THIS_MODULE, | ||
233 | .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), | ||
234 | .init_service = ip_vs_sh_init_svc, | ||
235 | .done_service = ip_vs_sh_done_svc, | ||
236 | .update_service = ip_vs_sh_update_svc, | ||
237 | .schedule = ip_vs_sh_schedule, | ||
238 | }; | ||
239 | |||
240 | |||
241 | static int __init ip_vs_sh_init(void) | ||
242 | { | ||
243 | return register_ip_vs_scheduler(&ip_vs_sh_scheduler); | ||
244 | } | ||
245 | |||
246 | |||
247 | static void __exit ip_vs_sh_cleanup(void) | ||
248 | { | ||
249 | unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); | ||
250 | } | ||
251 | |||
252 | |||
253 | module_init(ip_vs_sh_init); | ||
254 | module_exit(ip_vs_sh_cleanup); | ||
255 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c deleted file mode 100644 index a652da2c3200..000000000000 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ /dev/null | |||
@@ -1,930 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS An implementation of the IP virtual server support for the | ||
3 | * LINUX operating system. IPVS is now implemented as a module | ||
4 | * over the NetFilter framework. IPVS can be used to build a | ||
5 | * high-performance and highly available server based on a | ||
6 | * cluster of servers. | ||
7 | * | ||
8 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
9 | * | ||
10 | * ip_vs_sync: sync connection info from master load balancer to backups | ||
11 | * through multicast | ||
12 | * | ||
13 | * Changes: | ||
14 | * Alexandre Cassen : Added master & backup support at a time. | ||
15 | * Alexandre Cassen : Added SyncID support for incoming sync | ||
16 | * messages filtering. | ||
17 | * Justin Ossevoort : Fix endian problem on sync message size. | ||
18 | */ | ||
19 | |||
20 | #include <linux/module.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/inetdevice.h> | ||
23 | #include <linux/net.h> | ||
24 | #include <linux/completion.h> | ||
25 | #include <linux/delay.h> | ||
26 | #include <linux/skbuff.h> | ||
27 | #include <linux/in.h> | ||
28 | #include <linux/igmp.h> /* for ip_mc_join_group */ | ||
29 | #include <linux/udp.h> | ||
30 | #include <linux/err.h> | ||
31 | #include <linux/kthread.h> | ||
32 | #include <linux/wait.h> | ||
33 | |||
34 | #include <net/ip.h> | ||
35 | #include <net/sock.h> | ||
36 | |||
37 | #include <net/ip_vs.h> | ||
38 | |||
39 | #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ | ||
40 | #define IP_VS_SYNC_PORT 8848 /* multicast port */ | ||
41 | |||
42 | |||
43 | /* | ||
44 | * IPVS sync connection entry | ||
45 | */ | ||
46 | struct ip_vs_sync_conn { | ||
47 | __u8 reserved; | ||
48 | |||
49 | /* Protocol, addresses and port numbers */ | ||
50 | __u8 protocol; /* Which protocol (TCP/UDP) */ | ||
51 | __be16 cport; | ||
52 | __be16 vport; | ||
53 | __be16 dport; | ||
54 | __be32 caddr; /* client address */ | ||
55 | __be32 vaddr; /* virtual address */ | ||
56 | __be32 daddr; /* destination address */ | ||
57 | |||
58 | /* Flags and state transition */ | ||
59 | __be16 flags; /* status flags */ | ||
60 | __be16 state; /* state info */ | ||
61 | |||
62 | /* The sequence options start here */ | ||
63 | }; | ||
64 | |||
65 | struct ip_vs_sync_conn_options { | ||
66 | struct ip_vs_seq in_seq; /* incoming seq. struct */ | ||
67 | struct ip_vs_seq out_seq; /* outgoing seq. struct */ | ||
68 | }; | ||
69 | |||
70 | struct ip_vs_sync_thread_data { | ||
71 | struct socket *sock; | ||
72 | char *buf; | ||
73 | }; | ||
74 | |||
75 | #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) | ||
76 | #define FULL_CONN_SIZE \ | ||
77 | (sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) | ||
78 | |||
79 | |||
80 | /* | ||
81 | The master multicasts messages to the backup load balancers in the | ||
82 | following format. | ||
83 | |||
84 | 0 1 2 3 | ||
85 | 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
86 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
87 | | Count Conns | SyncID | Size | | ||
88 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
89 | | | | ||
90 | | IPVS Sync Connection (1) | | ||
91 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
92 | | . | | ||
93 | | . | | ||
94 | | . | | ||
95 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
96 | | | | ||
97 | | IPVS Sync Connection (n) | | ||
98 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
99 | */ | ||
100 | |||
101 | #define SYNC_MESG_HEADER_LEN 4 | ||
102 | |||
103 | struct ip_vs_sync_mesg { | ||
104 | __u8 nr_conns; | ||
105 | __u8 syncid; | ||
106 | __u16 size; | ||
107 | |||
108 | /* ip_vs_sync_conn entries start here */ | ||
109 | }; | ||
110 | |||
111 | /* the maximum length of sync (sending/receiving) message */ | ||
112 | static int sync_send_mesg_maxlen; | ||
113 | static int sync_recv_mesg_maxlen; | ||
114 | |||
115 | struct ip_vs_sync_buff { | ||
116 | struct list_head list; | ||
117 | unsigned long firstuse; | ||
118 | |||
119 | /* pointers for the message data */ | ||
120 | struct ip_vs_sync_mesg *mesg; | ||
121 | unsigned char *head; | ||
122 | unsigned char *end; | ||
123 | }; | ||
124 | |||
125 | |||
126 | /* the sync_buff list head and the lock */ | ||
127 | static LIST_HEAD(ip_vs_sync_queue); | ||
128 | static DEFINE_SPINLOCK(ip_vs_sync_lock); | ||
129 | |||
130 | /* current sync_buff for accepting new conn entries */ | ||
131 | static struct ip_vs_sync_buff *curr_sb = NULL; | ||
132 | static DEFINE_SPINLOCK(curr_sb_lock); | ||
133 | |||
134 | /* ipvs sync daemon state */ | ||
135 | volatile int ip_vs_sync_state = IP_VS_STATE_NONE; | ||
136 | volatile int ip_vs_master_syncid = 0; | ||
137 | volatile int ip_vs_backup_syncid = 0; | ||
138 | |||
139 | /* multicast interface name */ | ||
140 | char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; | ||
141 | char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; | ||
142 | |||
143 | /* sync daemon tasks */ | ||
144 | static struct task_struct *sync_master_thread; | ||
145 | static struct task_struct *sync_backup_thread; | ||
146 | |||
147 | /* multicast addr */ | ||
148 | static struct sockaddr_in mcast_addr = { | ||
149 | .sin_family = AF_INET, | ||
150 | .sin_port = __constant_htons(IP_VS_SYNC_PORT), | ||
151 | .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), | ||
152 | }; | ||
153 | |||
154 | |||
155 | static inline struct ip_vs_sync_buff *sb_dequeue(void) | ||
156 | { | ||
157 | struct ip_vs_sync_buff *sb; | ||
158 | |||
159 | spin_lock_bh(&ip_vs_sync_lock); | ||
160 | if (list_empty(&ip_vs_sync_queue)) { | ||
161 | sb = NULL; | ||
162 | } else { | ||
163 | sb = list_entry(ip_vs_sync_queue.next, | ||
164 | struct ip_vs_sync_buff, | ||
165 | list); | ||
166 | list_del(&sb->list); | ||
167 | } | ||
168 | spin_unlock_bh(&ip_vs_sync_lock); | ||
169 | |||
170 | return sb; | ||
171 | } | ||
172 | |||
173 | static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) | ||
174 | { | ||
175 | struct ip_vs_sync_buff *sb; | ||
176 | |||
177 | if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) | ||
178 | return NULL; | ||
179 | |||
180 | if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { | ||
181 | kfree(sb); | ||
182 | return NULL; | ||
183 | } | ||
184 | sb->mesg->nr_conns = 0; | ||
185 | sb->mesg->syncid = ip_vs_master_syncid; | ||
186 | sb->mesg->size = 4; | ||
187 | sb->head = (unsigned char *)sb->mesg + 4; | ||
188 | sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; | ||
189 | sb->firstuse = jiffies; | ||
190 | return sb; | ||
191 | } | ||
192 | |||
193 | static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) | ||
194 | { | ||
195 | kfree(sb->mesg); | ||
196 | kfree(sb); | ||
197 | } | ||
198 | |||
199 | static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) | ||
200 | { | ||
201 | spin_lock(&ip_vs_sync_lock); | ||
202 | if (ip_vs_sync_state & IP_VS_STATE_MASTER) | ||
203 | list_add_tail(&sb->list, &ip_vs_sync_queue); | ||
204 | else | ||
205 | ip_vs_sync_buff_release(sb); | ||
206 | spin_unlock(&ip_vs_sync_lock); | ||
207 | } | ||
208 | |||
209 | /* | ||
210 | * Get the current sync buffer if it has been created for more | ||
211 | * than the specified time or the specified time is zero. | ||
212 | */ | ||
213 | static inline struct ip_vs_sync_buff * | ||
214 | get_curr_sync_buff(unsigned long time) | ||
215 | { | ||
216 | struct ip_vs_sync_buff *sb; | ||
217 | |||
218 | spin_lock_bh(&curr_sb_lock); | ||
219 | if (curr_sb && (time == 0 || | ||
220 | time_before(jiffies - curr_sb->firstuse, time))) { | ||
221 | sb = curr_sb; | ||
222 | curr_sb = NULL; | ||
223 | } else | ||
224 | sb = NULL; | ||
225 | spin_unlock_bh(&curr_sb_lock); | ||
226 | return sb; | ||
227 | } | ||
228 | |||
229 | |||
230 | /* | ||
231 | * Add an ip_vs_conn information into the current sync_buff. | ||
232 | * Called by ip_vs_in. | ||
233 | */ | ||
234 | void ip_vs_sync_conn(struct ip_vs_conn *cp) | ||
235 | { | ||
236 | struct ip_vs_sync_mesg *m; | ||
237 | struct ip_vs_sync_conn *s; | ||
238 | int len; | ||
239 | |||
240 | spin_lock(&curr_sb_lock); | ||
241 | if (!curr_sb) { | ||
242 | if (!(curr_sb=ip_vs_sync_buff_create())) { | ||
243 | spin_unlock(&curr_sb_lock); | ||
244 | IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); | ||
245 | return; | ||
246 | } | ||
247 | } | ||
248 | |||
249 | len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : | ||
250 | SIMPLE_CONN_SIZE; | ||
251 | m = curr_sb->mesg; | ||
252 | s = (struct ip_vs_sync_conn *)curr_sb->head; | ||
253 | |||
254 | /* copy members */ | ||
255 | s->protocol = cp->protocol; | ||
256 | s->cport = cp->cport; | ||
257 | s->vport = cp->vport; | ||
258 | s->dport = cp->dport; | ||
259 | s->caddr = cp->caddr; | ||
260 | s->vaddr = cp->vaddr; | ||
261 | s->daddr = cp->daddr; | ||
262 | s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); | ||
263 | s->state = htons(cp->state); | ||
264 | if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { | ||
265 | struct ip_vs_sync_conn_options *opt = | ||
266 | (struct ip_vs_sync_conn_options *)&s[1]; | ||
267 | memcpy(opt, &cp->in_seq, sizeof(*opt)); | ||
268 | } | ||
269 | |||
270 | m->nr_conns++; | ||
271 | m->size += len; | ||
272 | curr_sb->head += len; | ||
273 | |||
274 | /* check if there is space for the next entry */ | ||
275 | if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { | ||
276 | sb_queue_tail(curr_sb); | ||
277 | curr_sb = NULL; | ||
278 | } | ||
279 | spin_unlock(&curr_sb_lock); | ||
280 | |||
281 | /* synchronize its controller if it has one */ | ||
282 | if (cp->control) | ||
283 | ip_vs_sync_conn(cp->control); | ||
284 | } | ||
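
A worked example of the conservative full-buffer check, assuming SIMPLE_CONN_SIZE is 24 bytes and FULL_CONN_SIZE is 48 (sizeof(struct ip_vs_sync_conn) without and with the options block — inferred sizes, not stated in this hunk): with sync_send_mesg_maxlen = 1444 (the value a 1500-byte MTU yields, as computed after set_sync_mesg_maxlen() below), after 59 simple entries head sits at 4 + 59*24 = 1420, and 1420 + 48 = 1468 > 1444, so the buffer is queued even though a 60th simple entry would have fit exactly. Testing against FULL_CONN_SIZE guarantees the next entry always has room, whichever size it turns out to be.
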
285 | |||
286 | |||
287 | /* | ||
288 | * Process received multicast message and create the corresponding | ||
289 | * ip_vs_conn entries. | ||
290 | */ | ||
291 | static void ip_vs_process_message(const char *buffer, const size_t buflen) | ||
292 | { | ||
293 | struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; | ||
294 | struct ip_vs_sync_conn *s; | ||
295 | struct ip_vs_sync_conn_options *opt; | ||
296 | struct ip_vs_conn *cp; | ||
297 | struct ip_vs_protocol *pp; | ||
298 | struct ip_vs_dest *dest; | ||
299 | char *p; | ||
300 | int i; | ||
301 | |||
302 | if (buflen < sizeof(struct ip_vs_sync_mesg)) { | ||
303 | IP_VS_ERR_RL("sync message header too short\n"); | ||
304 | return; | ||
305 | } | ||
306 | |||
307 | /* Convert size back to host byte order */ | ||
308 | m->size = ntohs(m->size); | ||
309 | |||
310 | if (buflen != m->size) { | ||
311 | IP_VS_ERR_RL("bogus sync message size\n"); | ||
312 | return; | ||
313 | } | ||
314 | |||
315 | /* SyncID sanity check */ | ||
316 | if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { | ||
317 | IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", | ||
318 | m->syncid); | ||
319 | return; | ||
320 | } | ||
321 | |||
322 | p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); | ||
323 | for (i=0; i<m->nr_conns; i++) { | ||
324 | unsigned flags, state; | ||
325 | |||
326 | if (p + SIMPLE_CONN_SIZE > buffer+buflen) { | ||
327 | IP_VS_ERR_RL("bogus conn in sync message\n"); | ||
328 | return; | ||
329 | } | ||
330 | s = (struct ip_vs_sync_conn *) p; | ||
331 | flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; | ||
332 | flags &= ~IP_VS_CONN_F_HASHED; | ||
333 | if (flags & IP_VS_CONN_F_SEQ_MASK) { | ||
334 | opt = (struct ip_vs_sync_conn_options *)&s[1]; | ||
335 | p += FULL_CONN_SIZE; | ||
336 | if (p > buffer+buflen) { | ||
337 | IP_VS_ERR_RL("bogus conn options in sync message\n"); | ||
338 | return; | ||
339 | } | ||
340 | } else { | ||
341 | opt = NULL; | ||
342 | p += SIMPLE_CONN_SIZE; | ||
343 | } | ||
344 | |||
345 | state = ntohs(s->state); | ||
346 | if (!(flags & IP_VS_CONN_F_TEMPLATE)) { | ||
347 | pp = ip_vs_proto_get(s->protocol); | ||
348 | if (!pp) { | ||
349 | IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", | ||
350 | s->protocol); | ||
351 | continue; | ||
352 | } | ||
353 | if (state >= pp->num_states) { | ||
354 | IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", | ||
355 | pp->name, state); | ||
356 | continue; | ||
357 | } | ||
358 | } else { | ||
359 | /* protocol in templates is not used for state/timeout */ | ||
360 | pp = NULL; | ||
361 | if (state > 0) { | ||
362 | IP_VS_DBG(2, "Invalid template state %u in sync msg\n", | ||
363 | state); | ||
364 | state = 0; | ||
365 | } | ||
366 | } | ||
367 | |||
368 | if (!(flags & IP_VS_CONN_F_TEMPLATE)) | ||
369 | cp = ip_vs_conn_in_get(s->protocol, | ||
370 | s->caddr, s->cport, | ||
371 | s->vaddr, s->vport); | ||
372 | else | ||
373 | cp = ip_vs_ct_in_get(s->protocol, | ||
374 | s->caddr, s->cport, | ||
375 | s->vaddr, s->vport); | ||
376 | if (!cp) { | ||
377 | /* | ||
378 | * Find the appropriate destination for the connection. | ||
379 | * If it is not found the connection will remain unbound | ||
380 | * but still handled. | ||
381 | */ | ||
382 | dest = ip_vs_find_dest(s->daddr, s->dport, | ||
383 | s->vaddr, s->vport, | ||
384 | s->protocol); | ||
385 | /* Set the appropriate activity flag */ | ||
386 | if (s->protocol == IPPROTO_TCP) { | ||
387 | if (state != IP_VS_TCP_S_ESTABLISHED) | ||
388 | flags |= IP_VS_CONN_F_INACTIVE; | ||
389 | else | ||
390 | flags &= ~IP_VS_CONN_F_INACTIVE; | ||
391 | } | ||
392 | cp = ip_vs_conn_new(s->protocol, | ||
393 | s->caddr, s->cport, | ||
394 | s->vaddr, s->vport, | ||
395 | s->daddr, s->dport, | ||
396 | flags, dest); | ||
397 | if (dest) | ||
398 | atomic_dec(&dest->refcnt); | ||
399 | if (!cp) { | ||
400 | IP_VS_ERR("ip_vs_conn_new failed\n"); | ||
401 | return; | ||
402 | } | ||
403 | } else if (!cp->dest) { | ||
404 | dest = ip_vs_try_bind_dest(cp); | ||
405 | if (dest) | ||
406 | atomic_dec(&dest->refcnt); | ||
407 | } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && | ||
408 | (cp->state != state)) { | ||
409 | /* update active/inactive flag for the connection */ | ||
410 | dest = cp->dest; | ||
411 | if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && | ||
412 | (state != IP_VS_TCP_S_ESTABLISHED)) { | ||
413 | atomic_dec(&dest->activeconns); | ||
414 | atomic_inc(&dest->inactconns); | ||
415 | cp->flags |= IP_VS_CONN_F_INACTIVE; | ||
416 | } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && | ||
417 | (state == IP_VS_TCP_S_ESTABLISHED)) { | ||
418 | atomic_inc(&dest->activeconns); | ||
419 | atomic_dec(&dest->inactconns); | ||
420 | cp->flags &= ~IP_VS_CONN_F_INACTIVE; | ||
421 | } | ||
422 | } | ||
423 | |||
424 | if (opt) | ||
425 | memcpy(&cp->in_seq, opt, sizeof(*opt)); | ||
426 | atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); | ||
427 | cp->state = state; | ||
428 | cp->old_state = cp->state; | ||
429 | /* | ||
430 | * We cannot recover the right timeout for templates | ||
431 | * in all cases because we cannot find the right fwmark | ||
432 | * virtual service. If needed, we can do it for | ||
433 | * non-fwmark persistent services. | ||
434 | */ | ||
435 | if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) | ||
436 | cp->timeout = pp->timeout_table[state]; | ||
437 | else | ||
438 | cp->timeout = (3*60*HZ); | ||
439 | ip_vs_conn_put(cp); | ||
440 | } | ||
441 | } | ||
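
For example (hypothetical numbers), a datagram whose header advertises nr_conns = 10 but whose size only covers two simple entries passes the header checks, yet on the third iteration p + SIMPLE_CONN_SIZE overruns buffer + buflen and the loop bails out with "bogus conn in sync message" — so a malformed or truncated packet can never walk the parser past the end of the receive buffer.
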
442 | |||
443 | |||
444 | /* | ||
445 | * Setup loopback of outgoing multicasts on a sending socket | ||
446 | */ | ||
447 | static void set_mcast_loop(struct sock *sk, u_char loop) | ||
448 | { | ||
449 | struct inet_sock *inet = inet_sk(sk); | ||
450 | |||
451 | /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ | ||
452 | lock_sock(sk); | ||
453 | inet->mc_loop = loop ? 1 : 0; | ||
454 | release_sock(sk); | ||
455 | } | ||
456 | |||
457 | /* | ||
458 | * Specify TTL for outgoing multicasts on a sending socket | ||
459 | */ | ||
460 | static void set_mcast_ttl(struct sock *sk, u_char ttl) | ||
461 | { | ||
462 | struct inet_sock *inet = inet_sk(sk); | ||
463 | |||
464 | /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ | ||
465 | lock_sock(sk); | ||
466 | inet->mc_ttl = ttl; | ||
467 | release_sock(sk); | ||
468 | } | ||
469 | |||
470 | /* | ||
471 | * Specify the default interface for outgoing multicasts | ||
472 | */ | ||
473 | static int set_mcast_if(struct sock *sk, char *ifname) | ||
474 | { | ||
475 | struct net_device *dev; | ||
476 | struct inet_sock *inet = inet_sk(sk); | ||
477 | |||
478 | if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) | ||
479 | return -ENODEV; | ||
480 | |||
481 | if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) | ||
482 | return -EINVAL; | ||
483 | |||
484 | lock_sock(sk); | ||
485 | inet->mc_index = dev->ifindex; | ||
486 | /* inet->mc_addr = 0; */ | ||
487 | release_sock(sk); | ||
488 | |||
489 | return 0; | ||
490 | } | ||
491 | |||
492 | |||
493 | /* | ||
494 | * Set the maximum length of sync message according to the | ||
495 | * specified interface's MTU. | ||
496 | */ | ||
497 | static int set_sync_mesg_maxlen(int sync_state) | ||
498 | { | ||
499 | struct net_device *dev; | ||
500 | int num; | ||
501 | |||
502 | if (sync_state == IP_VS_STATE_MASTER) { | ||
503 | if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) | ||
504 | return -ENODEV; | ||
505 | |||
506 | num = (dev->mtu - sizeof(struct iphdr) - | ||
507 | sizeof(struct udphdr) - | ||
508 | SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; | ||
509 | sync_send_mesg_maxlen = | ||
510 | SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num; | ||
511 | IP_VS_DBG(7, "setting the maximum length of sync sending " | ||
512 | "message %d.\n", sync_send_mesg_maxlen); | ||
513 | } else if (sync_state == IP_VS_STATE_BACKUP) { | ||
514 | if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) | ||
515 | return -ENODEV; | ||
516 | |||
517 | sync_recv_mesg_maxlen = dev->mtu - | ||
518 | sizeof(struct iphdr) - sizeof(struct udphdr); | ||
519 | IP_VS_DBG(7, "setting the maximum length of sync receiving " | ||
520 | "message %d.\n", sync_recv_mesg_maxlen); | ||
521 | } | ||
522 | |||
523 | return 0; | ||
524 | } | ||
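
A worked example of the sizing arithmetic, assuming SIMPLE_CONN_SIZE is 24 bytes, i.e. sizeof(struct ip_vs_sync_conn) (an inference, not stated in this hunk): for a 1500-byte MTU, num = (1500 - 20 - 8 - 4 - 20) / 24 = 60, so sync_send_mesg_maxlen = 4 + 24 * 60 = 1444 bytes. The 20-byte iphdr and 8-byte udphdr terms keep the datagram under the MTU; the extra 20 subtracted appears to be slack, e.g. for IP options.
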
525 | |||
526 | |||
527 | /* | ||
528 | * Join a multicast group. | ||
529 | * The group is specified by a class D multicast address (224.0.0.0/4) | ||
530 | * in the in_addr structure passed in as a parameter. | ||
531 | */ | ||
532 | static int | ||
533 | join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) | ||
534 | { | ||
535 | struct ip_mreqn mreq; | ||
536 | struct net_device *dev; | ||
537 | int ret; | ||
538 | |||
539 | memset(&mreq, 0, sizeof(mreq)); | ||
540 | memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); | ||
541 | |||
542 | if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) | ||
543 | return -ENODEV; | ||
544 | if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) | ||
545 | return -EINVAL; | ||
546 | |||
547 | mreq.imr_ifindex = dev->ifindex; | ||
548 | |||
549 | lock_sock(sk); | ||
550 | ret = ip_mc_join_group(sk, &mreq); | ||
551 | release_sock(sk); | ||
552 | |||
553 | return ret; | ||
554 | } | ||
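
The helpers above mirror well-known userspace socket options (the commented-out setsockopt() lines in set_mcast_loop() and set_mcast_ttl() say as much). A hedged userspace sketch of the equivalent setup — the function name and error handling are illustrative, and it assumes glibc exposes struct ip_mreqn:

    #include <arpa/inet.h>
    #include <net/if.h>         /* if_nametoindex */
    #include <netinet/in.h>     /* struct ip_mreqn */
    #include <string.h>
    #include <sys/socket.h>

    static int setup_sync_mcast(int fd, const char *ifname, const char *group)
    {
            struct ip_mreqn mreq;
            unsigned char loop = 0, ttl = 1;

            memset(&mreq, 0, sizeof(mreq));
            if (inet_pton(AF_INET, group, &mreq.imr_multiaddr) != 1)
                    return -1;
            mreq.imr_ifindex = if_nametoindex(ifname);

            /* loop = 0 and ttl = 1 match make_send_sock() below */
            if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)) ||
                setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)) ||
                setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)))
                    return -1;
            return 0;
    }
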
555 | |||
556 | |||
557 | static int bind_mcastif_addr(struct socket *sock, char *ifname) | ||
558 | { | ||
559 | struct net_device *dev; | ||
560 | __be32 addr; | ||
561 | struct sockaddr_in sin; | ||
562 | |||
563 | if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) | ||
564 | return -ENODEV; | ||
565 | |||
566 | addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); | ||
567 | if (!addr) | ||
568 | IP_VS_ERR("You probably need to specify IP address on " | ||
569 | "multicast interface.\n"); | ||
570 | |||
571 | IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", | ||
572 | ifname, NIPQUAD(addr)); | ||
573 | |||
574 | /* Now bind the socket with the address of multicast interface */ | ||
575 | sin.sin_family = AF_INET; | ||
576 | sin.sin_addr.s_addr = addr; | ||
577 | sin.sin_port = 0; | ||
578 | |||
579 | return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); | ||
580 | } | ||
581 | |||
582 | /* | ||
583 | * Set up sending multicast socket over UDP | ||
584 | */ | ||
585 | static struct socket * make_send_sock(void) | ||
586 | { | ||
587 | struct socket *sock; | ||
588 | int result; | ||
589 | |||
590 | /* First create a socket */ | ||
591 | result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); | ||
592 | if (result < 0) { | ||
593 | IP_VS_ERR("Error during creation of socket; terminating\n"); | ||
594 | return ERR_PTR(result); | ||
595 | } | ||
596 | |||
597 | result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); | ||
598 | if (result < 0) { | ||
599 | IP_VS_ERR("Error setting outbound mcast interface\n"); | ||
600 | goto error; | ||
601 | } | ||
602 | |||
603 | set_mcast_loop(sock->sk, 0); | ||
604 | set_mcast_ttl(sock->sk, 1); | ||
605 | |||
606 | result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); | ||
607 | if (result < 0) { | ||
608 | IP_VS_ERR("Error binding address of the mcast interface\n"); | ||
609 | goto error; | ||
610 | } | ||
611 | |||
612 | result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, | ||
613 | sizeof(struct sockaddr), 0); | ||
614 | if (result < 0) { | ||
615 | IP_VS_ERR("Error connecting to the multicast addr\n"); | ||
616 | goto error; | ||
617 | } | ||
618 | |||
619 | return sock; | ||
620 | |||
621 | error: | ||
622 | sock_release(sock); | ||
623 | return ERR_PTR(result); | ||
624 | } | ||
625 | |||
626 | |||
627 | /* | ||
628 | * Set up receiving multicast socket over UDP | ||
629 | */ | ||
630 | static struct socket * make_receive_sock(void) | ||
631 | { | ||
632 | struct socket *sock; | ||
633 | int result; | ||
634 | |||
635 | /* First create a socket */ | ||
636 | result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); | ||
637 | if (result < 0) { | ||
638 | IP_VS_ERR("Error during creation of socket; terminating\n"); | ||
639 | return ERR_PTR(result); | ||
640 | } | ||
641 | |||
642 | /* this is equivalent to the SO_REUSEADDR option in user space */ | ||
643 | sock->sk->sk_reuse = 1; | ||
644 | |||
645 | result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, | ||
646 | sizeof(struct sockaddr)); | ||
647 | if (result < 0) { | ||
648 | IP_VS_ERR("Error binding to the multicast addr\n"); | ||
649 | goto error; | ||
650 | } | ||
651 | |||
652 | /* join the multicast group */ | ||
653 | result = join_mcast_group(sock->sk, | ||
654 | (struct in_addr *) &mcast_addr.sin_addr, | ||
655 | ip_vs_backup_mcast_ifn); | ||
656 | if (result < 0) { | ||
657 | IP_VS_ERR("Error joining to the multicast group\n"); | ||
658 | goto error; | ||
659 | } | ||
660 | |||
661 | return sock; | ||
662 | |||
663 | error: | ||
664 | sock_release(sock); | ||
665 | return ERR_PTR(result); | ||
666 | } | ||
667 | |||
668 | |||
669 | static int | ||
670 | ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) | ||
671 | { | ||
672 | struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; | ||
673 | struct kvec iov; | ||
674 | int len; | ||
675 | |||
676 | EnterFunction(7); | ||
677 | iov.iov_base = (void *)buffer; | ||
678 | iov.iov_len = length; | ||
679 | |||
680 | len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); | ||
681 | |||
682 | LeaveFunction(7); | ||
683 | return len; | ||
684 | } | ||
685 | |||
686 | static void | ||
687 | ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) | ||
688 | { | ||
689 | int msize; | ||
690 | |||
691 | msize = msg->size; | ||
692 | |||
693 | /* Put size in network byte order */ | ||
694 | msg->size = htons(msg->size); | ||
695 | |||
696 | if (ip_vs_send_async(sock, (char *)msg, msize) != msize) | ||
697 | IP_VS_ERR("ip_vs_send_async error\n"); | ||
698 | } | ||
699 | |||
700 | static int | ||
701 | ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) | ||
702 | { | ||
703 | struct msghdr msg = {NULL,}; | ||
704 | struct kvec iov; | ||
705 | int len; | ||
706 | |||
707 | EnterFunction(7); | ||
708 | |||
709 | /* Receive a packet */ | ||
710 | iov.iov_base = buffer; | ||
711 | iov.iov_len = (size_t)buflen; | ||
712 | |||
713 | len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); | ||
714 | |||
715 | if (len < 0) | ||
716 | return -1; | ||
717 | |||
718 | LeaveFunction(7); | ||
719 | return len; | ||
720 | } | ||
721 | |||
722 | |||
723 | static int sync_thread_master(void *data) | ||
724 | { | ||
725 | struct ip_vs_sync_thread_data *tinfo = data; | ||
726 | struct ip_vs_sync_buff *sb; | ||
727 | |||
728 | IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " | ||
729 | "syncid = %d\n", | ||
730 | ip_vs_master_mcast_ifn, ip_vs_master_syncid); | ||
731 | |||
732 | while (!kthread_should_stop()) { | ||
733 | while ((sb = sb_dequeue())) { | ||
734 | ip_vs_send_sync_msg(tinfo->sock, sb->mesg); | ||
735 | ip_vs_sync_buff_release(sb); | ||
736 | } | ||
737 | |||
738 | /* check if entries stay in curr_sb for 2 seconds */ | ||
739 | sb = get_curr_sync_buff(2 * HZ); | ||
740 | if (sb) { | ||
741 | ip_vs_send_sync_msg(tinfo->sock, sb->mesg); | ||
742 | ip_vs_sync_buff_release(sb); | ||
743 | } | ||
744 | |||
745 | schedule_timeout_interruptible(HZ); | ||
746 | } | ||
747 | |||
748 | /* clean up the sync_buff queue */ | ||
749 | while ((sb=sb_dequeue())) { | ||
750 | ip_vs_sync_buff_release(sb); | ||
751 | } | ||
752 | |||
753 | /* clean up the current sync_buff */ | ||
754 | if ((sb = get_curr_sync_buff(0))) { | ||
755 | ip_vs_sync_buff_release(sb); | ||
756 | } | ||
757 | |||
758 | /* release the sending multicast socket */ | ||
759 | sock_release(tinfo->sock); | ||
760 | kfree(tinfo); | ||
761 | |||
762 | return 0; | ||
763 | } | ||
764 | |||
765 | |||
766 | static int sync_thread_backup(void *data) | ||
767 | { | ||
768 | struct ip_vs_sync_thread_data *tinfo = data; | ||
769 | int len; | ||
770 | |||
771 | IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " | ||
772 | "syncid = %d\n", | ||
773 | ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); | ||
774 | |||
775 | while (!kthread_should_stop()) { | ||
776 | wait_event_interruptible(*tinfo->sock->sk->sk_sleep, | ||
777 | !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) | ||
778 | || kthread_should_stop()); | ||
779 | |||
780 | /* do we have data now? */ | ||
781 | while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { | ||
782 | len = ip_vs_receive(tinfo->sock, tinfo->buf, | ||
783 | sync_recv_mesg_maxlen); | ||
784 | if (len <= 0) { | ||
785 | IP_VS_ERR("receiving message error\n"); | ||
786 | break; | ||
787 | } | ||
788 | |||
789 | /* disable bottom halves, because getting/creating conns | ||
790 | accesses data shared with softirq context */ | ||
791 | local_bh_disable(); | ||
792 | ip_vs_process_message(tinfo->buf, len); | ||
793 | local_bh_enable(); | ||
794 | } | ||
795 | } | ||
796 | |||
797 | /* release the receiving multicast socket */ | ||
798 | sock_release(tinfo->sock); | ||
799 | kfree(tinfo->buf); | ||
800 | kfree(tinfo); | ||
801 | |||
802 | return 0; | ||
803 | } | ||
804 | |||
805 | |||
806 | int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) | ||
807 | { | ||
808 | struct ip_vs_sync_thread_data *tinfo; | ||
809 | struct task_struct **realtask, *task; | ||
810 | struct socket *sock; | ||
811 | char *name, *buf = NULL; | ||
812 | int (*threadfn)(void *data); | ||
813 | int result = -ENOMEM; | ||
814 | |||
815 | IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); | ||
816 | IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", | ||
817 | sizeof(struct ip_vs_sync_conn)); | ||
818 | |||
819 | if (state == IP_VS_STATE_MASTER) { | ||
820 | if (sync_master_thread) | ||
821 | return -EEXIST; | ||
822 | |||
823 | strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, | ||
824 | sizeof(ip_vs_master_mcast_ifn)); | ||
825 | ip_vs_master_syncid = syncid; | ||
826 | realtask = &sync_master_thread; | ||
827 | name = "ipvs_syncmaster"; | ||
828 | threadfn = sync_thread_master; | ||
829 | sock = make_send_sock(); | ||
830 | } else if (state == IP_VS_STATE_BACKUP) { | ||
831 | if (sync_backup_thread) | ||
832 | return -EEXIST; | ||
833 | |||
834 | strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, | ||
835 | sizeof(ip_vs_backup_mcast_ifn)); | ||
836 | ip_vs_backup_syncid = syncid; | ||
837 | realtask = &sync_backup_thread; | ||
838 | name = "ipvs_syncbackup"; | ||
839 | threadfn = sync_thread_backup; | ||
840 | sock = make_receive_sock(); | ||
841 | } else { | ||
842 | return -EINVAL; | ||
843 | } | ||
844 | |||
845 | if (IS_ERR(sock)) { | ||
846 | result = PTR_ERR(sock); | ||
847 | goto out; | ||
848 | } | ||
849 | |||
850 | set_sync_mesg_maxlen(state); | ||
851 | if (state == IP_VS_STATE_BACKUP) { | ||
852 | buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); | ||
853 | if (!buf) | ||
854 | goto outsocket; | ||
855 | } | ||
856 | |||
857 | tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); | ||
858 | if (!tinfo) | ||
859 | goto outbuf; | ||
860 | |||
861 | tinfo->sock = sock; | ||
862 | tinfo->buf = buf; | ||
863 | |||
864 | task = kthread_run(threadfn, tinfo, name); | ||
865 | if (IS_ERR(task)) { | ||
866 | result = PTR_ERR(task); | ||
867 | goto outtinfo; | ||
868 | } | ||
869 | |||
870 | /* mark as active */ | ||
871 | *realtask = task; | ||
872 | ip_vs_sync_state |= state; | ||
873 | |||
874 | /* increase the module use count */ | ||
875 | ip_vs_use_count_inc(); | ||
876 | |||
877 | return 0; | ||
878 | |||
879 | outtinfo: | ||
880 | kfree(tinfo); | ||
881 | outbuf: | ||
882 | kfree(buf); | ||
883 | outsocket: | ||
884 | sock_release(sock); | ||
885 | out: | ||
886 | return result; | ||
887 | } | ||
888 | |||
889 | |||
890 | int stop_sync_thread(int state) | ||
891 | { | ||
892 | IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); | ||
893 | |||
894 | if (state == IP_VS_STATE_MASTER) { | ||
895 | if (!sync_master_thread) | ||
896 | return -ESRCH; | ||
897 | |||
898 | IP_VS_INFO("stopping master sync thread %d ...\n", | ||
899 | task_pid_nr(sync_master_thread)); | ||
900 | |||
901 | /* | ||
902 | * The lock synchronizes with sb_queue_tail(), so that we don't | ||
903 | * add sync buffers to the queue, when we are already in | ||
904 | * progress of stopping the master sync daemon. | ||
905 | */ | ||
906 | |||
907 | spin_lock_bh(&ip_vs_sync_lock); | ||
908 | ip_vs_sync_state &= ~IP_VS_STATE_MASTER; | ||
909 | spin_unlock_bh(&ip_vs_sync_lock); | ||
910 | kthread_stop(sync_master_thread); | ||
911 | sync_master_thread = NULL; | ||
912 | } else if (state == IP_VS_STATE_BACKUP) { | ||
913 | if (!sync_backup_thread) | ||
914 | return -ESRCH; | ||
915 | |||
916 | IP_VS_INFO("stopping backup sync thread %d ...\n", | ||
917 | task_pid_nr(sync_backup_thread)); | ||
918 | |||
919 | ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; | ||
920 | kthread_stop(sync_backup_thread); | ||
921 | sync_backup_thread = NULL; | ||
922 | } else { | ||
923 | return -EINVAL; | ||
924 | } | ||
925 | |||
926 | /* decrease the module use count */ | ||
927 | ip_vs_use_count_dec(); | ||
928 | |||
929 | return 0; | ||
930 | } | ||
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c deleted file mode 100644 index 9b0ef86bb1f7..000000000000 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ /dev/null | |||
@@ -1,149 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Weighted Least-Connection Scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * Peter Kese <peter.kese@ijs.si> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * Changes: | ||
13 | * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest | ||
14 | * Wensong Zhang : changed to use the inactconns in scheduling | ||
15 | * Wensong Zhang : changed some cosmetic things for debugging | ||
16 | * Wensong Zhang : changed for the d-linked destination list | ||
17 | * Wensong Zhang : added the ip_vs_wlc_update_svc | ||
18 | * Wensong Zhang : added any dest with weight=0 is quiesced | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #include <linux/module.h> | ||
23 | #include <linux/kernel.h> | ||
24 | |||
25 | #include <net/ip_vs.h> | ||
26 | |||
27 | |||
28 | static int | ||
29 | ip_vs_wlc_init_svc(struct ip_vs_service *svc) | ||
30 | { | ||
31 | return 0; | ||
32 | } | ||
33 | |||
34 | |||
35 | static int | ||
36 | ip_vs_wlc_done_svc(struct ip_vs_service *svc) | ||
37 | { | ||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | |||
42 | static int | ||
43 | ip_vs_wlc_update_svc(struct ip_vs_service *svc) | ||
44 | { | ||
45 | return 0; | ||
46 | } | ||
47 | |||
48 | |||
49 | static inline unsigned int | ||
50 | ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) | ||
51 | { | ||
52 | /* | ||
53 | * We think the overhead of processing active connections is 256 | ||
54 | * times higher than that of inactive connections on average. (This | ||
55 | * factor of 256 may not be accurate; we may change it later.) We | ||
56 | * use the following formula to estimate the overhead now: | ||
57 | * dest->activeconns*256 + dest->inactconns | ||
58 | */ | ||
59 | return (atomic_read(&dest->activeconns) << 8) + | ||
60 | atomic_read(&dest->inactconns); | ||
61 | } | ||
62 | |||
63 | |||
64 | /* | ||
65 | * Weighted Least Connection scheduling | ||
66 | */ | ||
67 | static struct ip_vs_dest * | ||
68 | ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
69 | { | ||
70 | struct ip_vs_dest *dest, *least; | ||
71 | unsigned int loh, doh; | ||
72 | |||
73 | IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); | ||
74 | |||
75 | /* | ||
76 | * We calculate the load of each dest server as follows: | ||
77 | * (dest overhead) / dest->weight | ||
78 | * | ||
79 | * Remember -- no floats in kernel mode!!! | ||
80 | * The comparison of h1*w2 > h2*w1 is equivalent to that of | ||
81 | * h1/w1 > h2/w2 | ||
82 | * if every weight is larger than zero. | ||
83 | * | ||
84 | * The server with weight=0 is quiesced and will not receive any | ||
85 | * new connections. | ||
86 | */ | ||
87 | |||
88 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
89 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && | ||
90 | atomic_read(&dest->weight) > 0) { | ||
91 | least = dest; | ||
92 | loh = ip_vs_wlc_dest_overhead(least); | ||
93 | goto nextstage; | ||
94 | } | ||
95 | } | ||
96 | return NULL; | ||
97 | |||
98 | /* | ||
99 | * Find the destination with the least load. | ||
100 | */ | ||
101 | nextstage: | ||
102 | list_for_each_entry_continue(dest, &svc->destinations, n_list) { | ||
103 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) | ||
104 | continue; | ||
105 | doh = ip_vs_wlc_dest_overhead(dest); | ||
106 | if (loh * atomic_read(&dest->weight) > | ||
107 | doh * atomic_read(&least->weight)) { | ||
108 | least = dest; | ||
109 | loh = doh; | ||
110 | } | ||
111 | } | ||
112 | |||
113 | IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " | ||
114 | "activeconns %d refcnt %d weight %d overhead %d\n", | ||
115 | NIPQUAD(least->addr), ntohs(least->port), | ||
116 | atomic_read(&least->activeconns), | ||
117 | atomic_read(&least->refcnt), | ||
118 | atomic_read(&least->weight), loh); | ||
119 | |||
120 | return least; | ||
121 | } | ||
122 | |||
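
A quick worked example of the cross-multiplied comparison (hypothetical numbers): with least = A (overhead loh = 900, weight 3) and candidate B (overhead doh = 200, weight 1), the test loh * weight(B) > doh * weight(A) gives 900 * 1 > 200 * 3, i.e. 900 > 600, so B replaces A — matching the floating-point comparison 900/3 = 300 > 200/1 = 200 without any kernel-mode floats.
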
123 | |||
124 | static struct ip_vs_scheduler ip_vs_wlc_scheduler = | ||
125 | { | ||
126 | .name = "wlc", | ||
127 | .refcnt = ATOMIC_INIT(0), | ||
128 | .module = THIS_MODULE, | ||
129 | .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), | ||
130 | .init_service = ip_vs_wlc_init_svc, | ||
131 | .done_service = ip_vs_wlc_done_svc, | ||
132 | .update_service = ip_vs_wlc_update_svc, | ||
133 | .schedule = ip_vs_wlc_schedule, | ||
134 | }; | ||
135 | |||
136 | |||
137 | static int __init ip_vs_wlc_init(void) | ||
138 | { | ||
139 | return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); | ||
140 | } | ||
141 | |||
142 | static void __exit ip_vs_wlc_cleanup(void) | ||
143 | { | ||
144 | unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); | ||
145 | } | ||
146 | |||
147 | module_init(ip_vs_wlc_init); | ||
148 | module_exit(ip_vs_wlc_cleanup); | ||
149 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c deleted file mode 100644 index 0d86a79b87b5..000000000000 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ /dev/null | |||
@@ -1,234 +0,0 @@ | |||
1 | /* | ||
2 | * IPVS: Weighted Round-Robin Scheduling module | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * Changes: | ||
12 | * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest | ||
13 | * Wensong Zhang : changed some cosmetic things for debugging | ||
14 | * Wensong Zhang : changed for the d-linked destination list | ||
15 | * Wensong Zhang : added the ip_vs_wrr_update_svc | ||
16 | * Julian Anastasov : fixed the bug of returning destination | ||
17 | * with weight 0 when all weights are zero | ||
18 | * | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/net.h> | ||
24 | |||
25 | #include <net/ip_vs.h> | ||
26 | |||
27 | /* | ||
28 | * current destination pointer for weighted round-robin scheduling | ||
29 | */ | ||
30 | struct ip_vs_wrr_mark { | ||
31 | struct list_head *cl; /* current list head */ | ||
32 | int cw; /* current weight */ | ||
33 | int mw; /* maximum weight */ | ||
34 | int di; /* decreasing interval */ | ||
35 | }; | ||
36 | |||
37 | |||
38 | /* | ||
39 | * Get the gcd of server weights | ||
40 | */ | ||
41 | static int gcd(int a, int b) | ||
42 | { | ||
43 | int c; | ||
44 | |||
45 | while ((c = a % b)) { | ||
46 | a = b; | ||
47 | b = c; | ||
48 | } | ||
49 | return b; | ||
50 | } | ||
51 | |||
52 | static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) | ||
53 | { | ||
54 | struct ip_vs_dest *dest; | ||
55 | int weight; | ||
56 | int g = 0; | ||
57 | |||
58 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
59 | weight = atomic_read(&dest->weight); | ||
60 | if (weight > 0) { | ||
61 | if (g > 0) | ||
62 | g = gcd(weight, g); | ||
63 | else | ||
64 | g = weight; | ||
65 | } | ||
66 | } | ||
67 | return g ? g : 1; | ||
68 | } | ||
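
For example, with weights {300, 200, 100} the gcd is 100, so the WRR pass below can decrement the current weight in steps of 100 instead of 1, visiting exactly the same sequence of servers with far fewer loop iterations.
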
69 | |||
70 | |||
71 | /* | ||
72 | * Get the maximum weight of the service destinations. | ||
73 | */ | ||
74 | static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) | ||
75 | { | ||
76 | struct ip_vs_dest *dest; | ||
77 | int weight = 0; | ||
78 | |||
79 | list_for_each_entry(dest, &svc->destinations, n_list) { | ||
80 | if (atomic_read(&dest->weight) > weight) | ||
81 | weight = atomic_read(&dest->weight); | ||
82 | } | ||
83 | |||
84 | return weight; | ||
85 | } | ||
86 | |||
87 | |||
88 | static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) | ||
89 | { | ||
90 | struct ip_vs_wrr_mark *mark; | ||
91 | |||
92 | /* | ||
93 | * Allocate the mark variable for WRR scheduling | ||
94 | */ | ||
95 | mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); | ||
96 | if (mark == NULL) { | ||
97 | IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); | ||
98 | return -ENOMEM; | ||
99 | } | ||
100 | mark->cl = &svc->destinations; | ||
101 | mark->cw = 0; | ||
102 | mark->mw = ip_vs_wrr_max_weight(svc); | ||
103 | mark->di = ip_vs_wrr_gcd_weight(svc); | ||
104 | svc->sched_data = mark; | ||
105 | |||
106 | return 0; | ||
107 | } | ||
108 | |||
109 | |||
110 | static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) | ||
111 | { | ||
112 | /* | ||
113 | * Release the mark variable | ||
114 | */ | ||
115 | kfree(svc->sched_data); | ||
116 | |||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | |||
121 | static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) | ||
122 | { | ||
123 | struct ip_vs_wrr_mark *mark = svc->sched_data; | ||
124 | |||
125 | mark->cl = &svc->destinations; | ||
126 | mark->mw = ip_vs_wrr_max_weight(svc); | ||
127 | mark->di = ip_vs_wrr_gcd_weight(svc); | ||
128 | if (mark->cw > mark->mw) | ||
129 | mark->cw = 0; | ||
130 | return 0; | ||
131 | } | ||
132 | |||
133 | |||
134 | /* | ||
135 | * Weighted Round-Robin Scheduling | ||
136 | */ | ||
137 | static struct ip_vs_dest * | ||
138 | ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | ||
139 | { | ||
140 | struct ip_vs_dest *dest; | ||
141 | struct ip_vs_wrr_mark *mark = svc->sched_data; | ||
142 | struct list_head *p; | ||
143 | |||
144 | IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); | ||
145 | |||
146 | /* | ||
147 | * This loop always terminates, because mark->cw is in (0, max_weight] | ||
148 | * and at least one server has its weight equal to max_weight. | ||
149 | */ | ||
150 | write_lock(&svc->sched_lock); | ||
151 | p = mark->cl; | ||
152 | while (1) { | ||
153 | if (mark->cl == &svc->destinations) { | ||
154 | /* it is at the head of the destination list */ | ||
155 | |||
156 | if (mark->cl == mark->cl->next) { | ||
157 | /* no dest entry */ | ||
158 | dest = NULL; | ||
159 | goto out; | ||
160 | } | ||
161 | |||
162 | mark->cl = svc->destinations.next; | ||
163 | mark->cw -= mark->di; | ||
164 | if (mark->cw <= 0) { | ||
165 | mark->cw = mark->mw; | ||
166 | /* | ||
167 | * Still zero, which means no available servers. | ||
168 | */ | ||
169 | if (mark->cw == 0) { | ||
170 | mark->cl = &svc->destinations; | ||
171 | IP_VS_ERR_RL("ip_vs_wrr_schedule(): " | ||
172 | "no available servers\n"); | ||
173 | dest = NULL; | ||
174 | goto out; | ||
175 | } | ||
176 | } | ||
177 | } else | ||
178 | mark->cl = mark->cl->next; | ||
179 | |||
180 | if (mark->cl != &svc->destinations) { | ||
181 | /* not at the head of the list */ | ||
182 | dest = list_entry(mark->cl, struct ip_vs_dest, n_list); | ||
183 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && | ||
184 | atomic_read(&dest->weight) >= mark->cw) { | ||
185 | /* got it */ | ||
186 | break; | ||
187 | } | ||
188 | } | ||
189 | |||
190 | if (mark->cl == p && mark->cw == mark->di) { | ||
191 | /* back to the start, and no dest is found. | ||
192 | It is only possible when all dests are OVERLOADED */ | ||
193 | dest = NULL; | ||
194 | goto out; | ||
195 | } | ||
196 | } | ||
197 | |||
198 | IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " | ||
199 | "activeconns %d refcnt %d weight %d\n", | ||
200 | NIPQUAD(dest->addr), ntohs(dest->port), | ||
201 | atomic_read(&dest->activeconns), | ||
202 | atomic_read(&dest->refcnt), | ||
203 | atomic_read(&dest->weight)); | ||
204 | |||
205 | out: | ||
206 | write_unlock(&svc->sched_lock); | ||
207 | return dest; | ||
208 | } | ||
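
A worked trace with hypothetical weights: for destinations A, B, C with weights 4, 3, 2, mark->di = gcd = 1 and mark->mw = 4. The loop walks the list with cw stepping through 4, 3, 2, 1, selecting every server whose weight is >= cw, which yields the repeating 9-pick cycle A, A B, A B C, A B C — 4 picks of A, 3 of B and 2 of C per cycle, exactly as the weights demand.
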
209 | |||
210 | |||
211 | static struct ip_vs_scheduler ip_vs_wrr_scheduler = { | ||
212 | .name = "wrr", | ||
213 | .refcnt = ATOMIC_INIT(0), | ||
214 | .module = THIS_MODULE, | ||
215 | .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), | ||
216 | .init_service = ip_vs_wrr_init_svc, | ||
217 | .done_service = ip_vs_wrr_done_svc, | ||
218 | .update_service = ip_vs_wrr_update_svc, | ||
219 | .schedule = ip_vs_wrr_schedule, | ||
220 | }; | ||
221 | |||
222 | static int __init ip_vs_wrr_init(void) | ||
223 | { | ||
224 | return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; | ||
225 | } | ||
226 | |||
227 | static void __exit ip_vs_wrr_cleanup(void) | ||
228 | { | ||
229 | unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); | ||
230 | } | ||
231 | |||
232 | module_init(ip_vs_wrr_init); | ||
233 | module_exit(ip_vs_wrr_cleanup); | ||
234 | MODULE_LICENSE("GPL"); | ||
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c deleted file mode 100644 index 9892d4aca42e..000000000000 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ /dev/null | |||
@@ -1,559 +0,0 @@ | |||
1 | /* | ||
2 | * ip_vs_xmit.c: various packet transmitters for IPVS | ||
3 | * | ||
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | ||
5 | * Julian Anastasov <ja@ssi.bg> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | * | ||
12 | * Changes: | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/tcp.h> /* for tcphdr */ | ||
18 | #include <net/ip.h> | ||
19 | #include <net/tcp.h> /* for csum_tcpudp_magic */ | ||
20 | #include <net/udp.h> | ||
21 | #include <net/icmp.h> /* for icmp_send */ | ||
22 | #include <net/route.h> /* for ip_route_output */ | ||
23 | #include <linux/netfilter.h> | ||
24 | #include <linux/netfilter_ipv4.h> | ||
25 | |||
26 | #include <net/ip_vs.h> | ||
27 | |||
28 | |||
29 | /* | ||
30 | * Destination cache to speed up outgoing route lookup | ||
31 | */ | ||
32 | static inline void | ||
33 | __ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) | ||
34 | { | ||
35 | struct dst_entry *old_dst; | ||
36 | |||
37 | old_dst = dest->dst_cache; | ||
38 | dest->dst_cache = dst; | ||
39 | dest->dst_rtos = rtos; | ||
40 | dst_release(old_dst); | ||
41 | } | ||
42 | |||
43 | static inline struct dst_entry * | ||
44 | __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) | ||
45 | { | ||
46 | struct dst_entry *dst = dest->dst_cache; | ||
47 | |||
48 | if (!dst) | ||
49 | return NULL; | ||
50 | if ((dst->obsolete || rtos != dest->dst_rtos) && | ||
51 | dst->ops->check(dst, cookie) == NULL) { | ||
52 | dest->dst_cache = NULL; | ||
53 | dst_release(dst); | ||
54 | return NULL; | ||
55 | } | ||
56 | dst_hold(dst); | ||
57 | return dst; | ||
58 | } | ||
59 | |||
60 | static struct rtable * | ||
61 | __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) | ||
62 | { | ||
63 | struct rtable *rt; /* Route to the other host */ | ||
64 | struct ip_vs_dest *dest = cp->dest; | ||
65 | |||
66 | if (dest) { | ||
67 | spin_lock(&dest->dst_lock); | ||
68 | if (!(rt = (struct rtable *) | ||
69 | __ip_vs_dst_check(dest, rtos, 0))) { | ||
70 | struct flowi fl = { | ||
71 | .oif = 0, | ||
72 | .nl_u = { | ||
73 | .ip4_u = { | ||
74 | .daddr = dest->addr, | ||
75 | .saddr = 0, | ||
76 | .tos = rtos, } }, | ||
77 | }; | ||
78 | |||
79 | if (ip_route_output_key(&init_net, &rt, &fl)) { | ||
80 | spin_unlock(&dest->dst_lock); | ||
81 | IP_VS_DBG_RL("ip_route_output error, " | ||
82 | "dest: %u.%u.%u.%u\n", | ||
83 | NIPQUAD(dest->addr)); | ||
84 | return NULL; | ||
85 | } | ||
86 | __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); | ||
87 | IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", | ||
88 | NIPQUAD(dest->addr), | ||
89 | atomic_read(&rt->u.dst.__refcnt), rtos); | ||
90 | } | ||
91 | spin_unlock(&dest->dst_lock); | ||
92 | } else { | ||
93 | struct flowi fl = { | ||
94 | .oif = 0, | ||
95 | .nl_u = { | ||
96 | .ip4_u = { | ||
97 | .daddr = cp->daddr, | ||
98 | .saddr = 0, | ||
99 | .tos = rtos, } }, | ||
100 | }; | ||
101 | |||
102 | if (ip_route_output_key(&init_net, &rt, &fl)) { | ||
103 | IP_VS_DBG_RL("ip_route_output error, dest: " | ||
104 | "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); | ||
105 | return NULL; | ||
106 | } | ||
107 | } | ||
108 | |||
109 | return rt; | ||
110 | } | ||
111 | |||
112 | |||
113 | /* | ||
114 | * Release dest->dst_cache before a dest is removed | ||
115 | */ | ||
116 | void | ||
117 | ip_vs_dst_reset(struct ip_vs_dest *dest) | ||
118 | { | ||
119 | struct dst_entry *old_dst; | ||
120 | |||
121 | old_dst = dest->dst_cache; | ||
122 | dest->dst_cache = NULL; | ||
123 | dst_release(old_dst); | ||
124 | } | ||
125 | |||
126 | #define IP_VS_XMIT(skb, rt) \ | ||
127 | do { \ | ||
128 | (skb)->ipvs_property = 1; \ | ||
129 | skb_forward_csum(skb); \ | ||
130 | NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL, \ | ||
131 | (rt)->u.dst.dev, dst_output); \ | ||
132 | } while (0) | ||
133 | |||
134 | |||
135 | /* | ||
136 | * NULL transmitter (do nothing except return NF_ACCEPT) | ||
137 | */ | ||
138 | int | ||
139 | ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
140 | struct ip_vs_protocol *pp) | ||
141 | { | ||
142 | /* we do not touch skb and do not need pskb ptr */ | ||
143 | return NF_ACCEPT; | ||
144 | } | ||
145 | |||
146 | |||
147 | /* | ||
148 | * Bypass transmitter | ||
149 | * Lets packets bypass the destination when the destination is not | ||
150 | * available; it may only be used in a transparent cache cluster. | ||
151 | */ | ||
152 | int | ||
153 | ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
154 | struct ip_vs_protocol *pp) | ||
155 | { | ||
156 | struct rtable *rt; /* Route to the other host */ | ||
157 | struct iphdr *iph = ip_hdr(skb); | ||
158 | u8 tos = iph->tos; | ||
159 | int mtu; | ||
160 | struct flowi fl = { | ||
161 | .oif = 0, | ||
162 | .nl_u = { | ||
163 | .ip4_u = { | ||
164 | .daddr = iph->daddr, | ||
165 | .saddr = 0, | ||
166 | .tos = RT_TOS(tos), } }, | ||
167 | }; | ||
168 | |||
169 | EnterFunction(10); | ||
170 | |||
171 | if (ip_route_output_key(&init_net, &rt, &fl)) { | ||
172 | IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " | ||
173 | "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); | ||
174 | goto tx_error_icmp; | ||
175 | } | ||
176 | |||
177 | /* MTU checking */ | ||
178 | mtu = dst_mtu(&rt->u.dst); | ||
179 | if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { | ||
180 | ip_rt_put(rt); | ||
181 | icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); | ||
182 | IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); | ||
183 | goto tx_error; | ||
184 | } | ||
185 | |||
186 | /* | ||
187 | * Call ip_send_check because we are not sure it is called | ||
188 | * after ip_defrag. Is copy-on-write needed? | ||
189 | */ | ||
190 | if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { | ||
191 | ip_rt_put(rt); | ||
192 | return NF_STOLEN; | ||
193 | } | ||
194 | ip_send_check(ip_hdr(skb)); | ||
195 | |||
196 | /* drop old route */ | ||
197 | dst_release(skb->dst); | ||
198 | skb->dst = &rt->u.dst; | ||
199 | |||
200 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
201 | skb->local_df = 1; | ||
202 | |||
203 | IP_VS_XMIT(skb, rt); | ||
204 | |||
205 | LeaveFunction(10); | ||
206 | return NF_STOLEN; | ||
207 | |||
208 | tx_error_icmp: | ||
209 | dst_link_failure(skb); | ||
210 | tx_error: | ||
211 | kfree_skb(skb); | ||
212 | LeaveFunction(10); | ||
213 | return NF_STOLEN; | ||
214 | } | ||
215 | |||
216 | |||
217 | /* | ||
218 | * NAT transmitter (only for outside-to-inside nat forwarding) | ||
219 | * Not used for related ICMP | ||
220 | */ | ||
221 | int | ||
222 | ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
223 | struct ip_vs_protocol *pp) | ||
224 | { | ||
225 | struct rtable *rt; /* Route to the other host */ | ||
226 | int mtu; | ||
227 | struct iphdr *iph = ip_hdr(skb); | ||
228 | |||
229 | EnterFunction(10); | ||
230 | |||
231 | /* check if it is a connection of no-client-port */ | ||
232 | if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { | ||
233 | __be16 _pt, *p; | ||
234 | p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); | ||
235 | if (p == NULL) | ||
236 | goto tx_error; | ||
237 | ip_vs_conn_fill_cport(cp, *p); | ||
238 | IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); | ||
239 | } | ||
240 | |||
241 | if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) | ||
242 | goto tx_error_icmp; | ||
243 | |||
244 | /* MTU checking */ | ||
245 | mtu = dst_mtu(&rt->u.dst); | ||
246 | if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { | ||
247 | ip_rt_put(rt); | ||
248 | icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); | ||
249 | IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); | ||
250 | goto tx_error; | ||
251 | } | ||
252 | |||
253 | /* copy-on-write the packet before mangling it */ | ||
254 | if (!skb_make_writable(skb, sizeof(struct iphdr))) | ||
255 | goto tx_error_put; | ||
256 | |||
257 | if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) | ||
258 | goto tx_error_put; | ||
259 | |||
260 | /* drop old route */ | ||
261 | dst_release(skb->dst); | ||
262 | skb->dst = &rt->u.dst; | ||
263 | |||
264 | /* mangle the packet */ | ||
265 | if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) | ||
266 | goto tx_error; | ||
267 | ip_hdr(skb)->daddr = cp->daddr; | ||
268 | ip_send_check(ip_hdr(skb)); | ||
269 | |||
270 | IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); | ||
271 | |||
272 | /* FIXME: when an application helper enlarges the packet and the | ||
273 | length exceeds the MTU of the outgoing device, there will | ||
274 | still be an MTU problem. */ | ||
275 | |||
276 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
277 | skb->local_df = 1; | ||
278 | |||
279 | IP_VS_XMIT(skb, rt); | ||
280 | |||
281 | LeaveFunction(10); | ||
282 | return NF_STOLEN; | ||
283 | |||
284 | tx_error_icmp: | ||
285 | dst_link_failure(skb); | ||
286 | tx_error: | ||
287 | LeaveFunction(10); | ||
288 | kfree_skb(skb); | ||
289 | return NF_STOLEN; | ||
290 | tx_error_put: | ||
291 | ip_rt_put(rt); | ||
292 | goto tx_error; | ||
293 | } | ||
294 | |||
295 | |||
296 | /* | ||
297 | * IP Tunneling transmitter | ||
298 | * | ||
299 | * This function encapsulates the packet in a new IP packet, its | ||
300 | * destination will be set to cp->daddr. Most of the code in this | ||
301 | * function is taken from ipip.c. | ||
302 | * | ||
303 | * It is used in VS/TUN cluster. The load balancer selects a real | ||
304 | * server from a cluster based on a scheduling algorithm, | ||
305 | * encapsulates the request packet and forwards it to the selected | ||
306 | * server. For example, all real servers are configured with | ||
307 | * "ifconfig tunl0 <Virtual IP Address> up". When the server receives | ||
308 | * the encapsulated packet, it will decapsulate the packet, process | ||
309 | * the request and return the response packets directly to the client | ||
310 | * without passing through the load balancer. This can greatly | ||
311 | * increase the scalability of a virtual server. | ||
312 | * | ||
313 | * Used for ANY protocol | ||
314 | */ | ||
315 | int | ||
316 | ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
317 | struct ip_vs_protocol *pp) | ||
318 | { | ||
319 | struct rtable *rt; /* Route to the other host */ | ||
320 | struct net_device *tdev; /* Device to other host */ | ||
321 | struct iphdr *old_iph = ip_hdr(skb); | ||
322 | u8 tos = old_iph->tos; | ||
323 | __be16 df = old_iph->frag_off; | ||
324 | sk_buff_data_t old_transport_header = skb->transport_header; | ||
325 | struct iphdr *iph; /* Our new IP header */ | ||
326 | unsigned int max_headroom; /* The extra header space needed */ | ||
327 | int mtu; | ||
328 | |||
329 | EnterFunction(10); | ||
330 | |||
331 | if (skb->protocol != htons(ETH_P_IP)) { | ||
332 | IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " | ||
333 | "ETH_P_IP: %d, skb protocol: %d\n", | ||
334 | htons(ETH_P_IP), skb->protocol); | ||
335 | goto tx_error; | ||
336 | } | ||
337 | |||
338 | if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) | ||
339 | goto tx_error_icmp; | ||
340 | |||
341 | tdev = rt->u.dst.dev; | ||
342 | |||
343 | mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); | ||
344 | if (mtu < 68) { | ||
345 | ip_rt_put(rt); | ||
346 | IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); | ||
347 | goto tx_error; | ||
348 | } | ||
349 | if (skb->dst) | ||
350 | skb->dst->ops->update_pmtu(skb->dst, mtu); | ||
351 | |||
352 | df |= (old_iph->frag_off & htons(IP_DF)); | ||
353 | |||
354 | if ((old_iph->frag_off & htons(IP_DF)) | ||
355 | && mtu < ntohs(old_iph->tot_len)) { | ||
356 | icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); | ||
357 | ip_rt_put(rt); | ||
358 | IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); | ||
359 | goto tx_error; | ||
360 | } | ||
361 | |||
362 | /* | ||
363 | * Okay, now see if we can stuff it in the buffer as-is. | ||
364 | */ | ||
365 | max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); | ||
366 | |||
367 | if (skb_headroom(skb) < max_headroom | ||
368 | || skb_cloned(skb) || skb_shared(skb)) { | ||
369 | struct sk_buff *new_skb = | ||
370 | skb_realloc_headroom(skb, max_headroom); | ||
371 | if (!new_skb) { | ||
372 | ip_rt_put(rt); | ||
373 | kfree_skb(skb); | ||
374 | IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); | ||
375 | return NF_STOLEN; | ||
376 | } | ||
377 | kfree_skb(skb); | ||
378 | skb = new_skb; | ||
379 | old_iph = ip_hdr(skb); | ||
380 | } | ||
381 | |||
382 | skb->transport_header = old_transport_header; | ||
383 | |||
384 | /* fix old IP header checksum */ | ||
385 | ip_send_check(old_iph); | ||
386 | |||
387 | skb_push(skb, sizeof(struct iphdr)); | ||
388 | skb_reset_network_header(skb); | ||
389 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | ||
390 | |||
391 | /* drop old route */ | ||
392 | dst_release(skb->dst); | ||
393 | skb->dst = &rt->u.dst; | ||
394 | |||
395 | /* | ||
396 | * Push down and install the IPIP header. | ||
397 | */ | ||
398 | iph = ip_hdr(skb); | ||
399 | iph->version = 4; | ||
400 | iph->ihl = sizeof(struct iphdr)>>2; | ||
401 | iph->frag_off = df; | ||
402 | iph->protocol = IPPROTO_IPIP; | ||
403 | iph->tos = tos; | ||
404 | iph->daddr = rt->rt_dst; | ||
405 | iph->saddr = rt->rt_src; | ||
406 | iph->ttl = old_iph->ttl; | ||
407 | ip_select_ident(iph, &rt->u.dst, NULL); | ||
408 | |||
409 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
410 | skb->local_df = 1; | ||
411 | |||
412 | ip_local_out(skb); | ||
413 | |||
414 | LeaveFunction(10); | ||
415 | |||
416 | return NF_STOLEN; | ||
417 | |||
418 | tx_error_icmp: | ||
419 | dst_link_failure(skb); | ||
420 | tx_error: | ||
421 | kfree_skb(skb); | ||
422 | LeaveFunction(10); | ||
423 | return NF_STOLEN; | ||
424 | } | ||
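
A quick worked consequence of the encapsulation: the outer IPv4 header adds sizeof(struct iphdr) = 20 bytes, so with a 1500-byte route MTU the tunnel accepts inner packets of at most 1480 bytes; a larger packet with DF set takes the ICMP_FRAG_NEEDED branch above, advertising the reduced MTU so the sender can shrink its path-MTU estimate.
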
425 | |||
426 | |||
427 | /* | ||
428 | * Direct Routing transmitter | ||
429 | * Used for ANY protocol | ||
430 | */ | ||
431 | int | ||
432 | ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
433 | struct ip_vs_protocol *pp) | ||
434 | { | ||
435 | struct rtable *rt; /* Route to the other host */ | ||
436 | struct iphdr *iph = ip_hdr(skb); | ||
437 | int mtu; | ||
438 | |||
439 | EnterFunction(10); | ||
440 | |||
441 | if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) | ||
442 | goto tx_error_icmp; | ||
443 | |||
444 | /* MTU checking */ | ||
445 | mtu = dst_mtu(&rt->u.dst); | ||
446 | if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { | ||
447 | icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); | ||
448 | ip_rt_put(rt); | ||
449 | IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); | ||
450 | goto tx_error; | ||
451 | } | ||
452 | |||
453 | /* | ||
454 | * Call ip_send_check because we are not sure it is called | ||
455 | * after ip_defrag. Is copy-on-write needed? | ||
456 | */ | ||
457 | if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { | ||
458 | ip_rt_put(rt); | ||
459 | return NF_STOLEN; | ||
460 | } | ||
461 | ip_send_check(ip_hdr(skb)); | ||
462 | |||
463 | /* drop old route */ | ||
464 | dst_release(skb->dst); | ||
465 | skb->dst = &rt->u.dst; | ||
466 | |||
467 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
468 | skb->local_df = 1; | ||
469 | |||
470 | IP_VS_XMIT(skb, rt); | ||
471 | |||
472 | LeaveFunction(10); | ||
473 | return NF_STOLEN; | ||
474 | |||
475 | tx_error_icmp: | ||
476 | dst_link_failure(skb); | ||
477 | tx_error: | ||
478 | kfree_skb(skb); | ||
479 | LeaveFunction(10); | ||
480 | return NF_STOLEN; | ||
481 | } | ||
482 | |||
483 | |||
484 | /* | ||
485 | * ICMP packet transmitter | ||
486 | * called by the ip_vs_in_icmp | ||
487 | */ | ||
488 | int | ||
489 | ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | ||
490 | struct ip_vs_protocol *pp, int offset) | ||
491 | { | ||
492 | struct rtable *rt; /* Route to the other host */ | ||
493 | int mtu; | ||
494 | int rc; | ||
495 | |||
496 | EnterFunction(10); | ||
497 | |||
498 | /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be | ||
499 | forwarded directly here, because there is no need to | ||
500 | translate address/port back */ | ||
501 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { | ||
502 | if (cp->packet_xmit) | ||
503 | rc = cp->packet_xmit(skb, cp, pp); | ||
504 | else | ||
505 | rc = NF_ACCEPT; | ||
506 | /* do not touch skb anymore */ | ||
507 | atomic_inc(&cp->in_pkts); | ||
508 | goto out; | ||
509 | } | ||
510 | |||
511 | /* | ||
512 | * mangle and send the packet here (only for VS/NAT) | ||
513 | */ | ||
514 | |||
515 | if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) | ||
516 | goto tx_error_icmp; | ||
517 | |||
518 | /* MTU checking */ | ||
519 | mtu = dst_mtu(&rt->u.dst); | ||
520 | if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { | ||
521 | ip_rt_put(rt); | ||
522 | icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); | ||
523 | IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); | ||
524 | goto tx_error; | ||
525 | } | ||
526 | |||
527 | /* copy-on-write the packet before mangling it */ | ||
528 | if (!skb_make_writable(skb, offset)) | ||
529 | goto tx_error_put; | ||
530 | |||
531 | if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) | ||
532 | goto tx_error_put; | ||
533 | |||
534 | /* drop the old route when skb is not shared */ | ||
535 | dst_release(skb->dst); | ||
536 | skb->dst = &rt->u.dst; | ||
537 | |||
538 | ip_vs_nat_icmp(skb, pp, cp, 0); | ||
539 | |||
540 | /* Another hack: avoid icmp_send in ip_fragment */ | ||
541 | skb->local_df = 1; | ||
542 | |||
543 | IP_VS_XMIT(skb, rt); | ||
544 | |||
545 | rc = NF_STOLEN; | ||
546 | goto out; | ||
547 | |||
548 | tx_error_icmp: | ||
549 | dst_link_failure(skb); | ||
550 | tx_error: | ||
551 | dev_kfree_skb(skb); | ||
552 | rc = NF_STOLEN; | ||
553 | out: | ||
554 | LeaveFunction(10); | ||
555 | return rc; | ||
556 | tx_error_put: | ||
557 | ip_rt_put(rt); | ||
558 | goto tx_error; | ||
559 | } | ||
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index f8edacdf991d..6efdb70b3eb2 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c | |||
@@ -12,6 +12,7 @@ | |||
12 | /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ | 12 | /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ |
13 | int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) | 13 | int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) |
14 | { | 14 | { |
15 | struct net *net = dev_net(skb->dst->dev); | ||
15 | const struct iphdr *iph = ip_hdr(skb); | 16 | const struct iphdr *iph = ip_hdr(skb); |
16 | struct rtable *rt; | 17 | struct rtable *rt; |
17 | struct flowi fl = {}; | 18 | struct flowi fl = {}; |
@@ -19,7 +20,9 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) | |||
19 | unsigned int hh_len; | 20 | unsigned int hh_len; |
20 | unsigned int type; | 21 | unsigned int type; |
21 | 22 | ||
22 | type = inet_addr_type(&init_net, iph->saddr); | 23 | type = inet_addr_type(net, iph->saddr); |
24 | if (skb->sk && inet_sk(skb->sk)->transparent) | ||
25 | type = RTN_LOCAL; | ||
23 | if (addr_type == RTN_UNSPEC) | 26 | if (addr_type == RTN_UNSPEC) |
24 | addr_type = type; | 27 | addr_type = type; |
25 | 28 | ||
@@ -33,7 +36,8 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) | |||
33 | fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); | 36 | fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); |
34 | fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; | 37 | fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; |
35 | fl.mark = skb->mark; | 38 | fl.mark = skb->mark; |
36 | if (ip_route_output_key(&init_net, &rt, &fl) != 0) | 39 | fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; |
40 | if (ip_route_output_key(net, &rt, &fl) != 0) | ||
37 | return -1; | 41 | return -1; |
38 | 42 | ||
39 | /* Drop old route. */ | 43 | /* Drop old route. */ |
@@ -43,7 +47,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) | |||
43 | /* non-local src, find valid iif to satisfy | 47 | /* non-local src, find valid iif to satisfy |
44 | * rp-filter when calling ip_route_input. */ | 48 | * rp-filter when calling ip_route_input. */ |
45 | fl.nl_u.ip4_u.daddr = iph->saddr; | 49 | fl.nl_u.ip4_u.daddr = iph->saddr; |
46 | if (ip_route_output_key(&init_net, &rt, &fl) != 0) | 50 | if (ip_route_output_key(net, &rt, &fl) != 0) |
47 | return -1; | 51 | return -1; |
48 | 52 | ||
49 | odst = skb->dst; | 53 | odst = skb->dst; |
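Two things change in this hunk: the route lookups become per-namespace (dev_net(skb->dst->dev) rather than the global init_net), and packets owned by a transparent socket are re-routed as if their source address were local, with the socket's flow flags copied into the lookup key. inet_sk_flowi_flags() is presumably a one-line helper along these lines (a sketch of the assumed definition, not code from this patch; FLOWI_FLAG_ANYSRC is the flag name assumed here):

#include <net/inet_sock.h>	/* inet_sk() */

/* Assumed helper: allow a route lookup done on behalf of a
 * transparent socket to use a non-local source address. */
static inline int inet_sk_flowi_flags(const struct sock *sk)
{
	return inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0;
}

Without such a flag, ip_route_output_key() would refuse the spoofed source addresses that transparent proxying relies on.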
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 90eb7cb47e77..3816e1dc9295 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig | |||
@@ -5,10 +5,15 @@ | |||
5 | menu "IP: Netfilter Configuration" | 5 | menu "IP: Netfilter Configuration" |
6 | depends on INET && NETFILTER | 6 | depends on INET && NETFILTER |
7 | 7 | ||
8 | config NF_DEFRAG_IPV4 | ||
9 | tristate | ||
10 | default n | ||
11 | |||
8 | config NF_CONNTRACK_IPV4 | 12 | config NF_CONNTRACK_IPV4 |
9 | tristate "IPv4 connection tracking support (required for NAT)" | 13 | tristate "IPv4 connection tracking support (required for NAT)" |
10 | depends on NF_CONNTRACK | 14 | depends on NF_CONNTRACK |
11 | default m if NETFILTER_ADVANCED=n | 15 | default m if NETFILTER_ADVANCED=n |
16 | select NF_DEFRAG_IPV4 | ||
12 | ---help--- | 17 | ---help--- |
13 | Connection tracking keeps a record of what packets have passed | 18 | Connection tracking keeps a record of what packets have passed |
14 | through your machine, in order to figure out how they are related | 19 | through your machine, in order to figure out how they are related |
@@ -56,23 +61,30 @@ config IP_NF_IPTABLES | |||
56 | 61 | ||
57 | To compile it as a module, choose M here. If unsure, say N. | 62 | To compile it as a module, choose M here. If unsure, say N. |
58 | 63 | ||
64 | if IP_NF_IPTABLES | ||
65 | |||
59 | # The matches. | 66 | # The matches. |
60 | config IP_NF_MATCH_RECENT | 67 | config IP_NF_MATCH_ADDRTYPE |
61 | tristate '"recent" match support' | 68 | tristate '"addrtype" address type match support' |
62 | depends on IP_NF_IPTABLES | ||
63 | depends on NETFILTER_ADVANCED | 69 | depends on NETFILTER_ADVANCED |
64 | help | 70 | help |
65 | This match is used for creating one or many lists of recently | 71 | This option allows you to match what routing thinks of an address, |
66 | used addresses and then matching against that/those list(s). | 72 | e.g. UNICAST, LOCAL, BROADCAST, ... |
67 | 73 | ||
68 | Short options are available by using 'iptables -m recent -h' | 74 | If you want to compile it as a module, say M here and read |
69 | Official Website: <http://snowman.net/projects/ipt_recent/> | 75 | <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. |
76 | |||
77 | config IP_NF_MATCH_AH | ||
78 | tristate '"ah" match support' | ||
79 | depends on NETFILTER_ADVANCED | ||
80 | help | ||
81 | This match extension allows you to match a range of SPIs | ||
82 | inside AH header of IPSec packets. | ||
70 | 83 | ||
71 | To compile it as a module, choose M here. If unsure, say N. | 84 | To compile it as a module, choose M here. If unsure, say N. |
72 | 85 | ||
73 | config IP_NF_MATCH_ECN | 86 | config IP_NF_MATCH_ECN |
74 | tristate '"ecn" match support' | 87 | tristate '"ecn" match support' |
75 | depends on IP_NF_IPTABLES | ||
76 | depends on NETFILTER_ADVANCED | 88 | depends on NETFILTER_ADVANCED |
77 | help | 89 | help |
78 | This option adds an `ECN' match, which allows you to match against | 90 | This option adds an `ECN' match, which allows you to match against |
@@ -80,19 +92,8 @@ config IP_NF_MATCH_ECN | |||
80 | 92 | ||
81 | To compile it as a module, choose M here. If unsure, say N. | 93 | To compile it as a module, choose M here. If unsure, say N. |
82 | 94 | ||
83 | config IP_NF_MATCH_AH | ||
84 | tristate '"ah" match support' | ||
85 | depends on IP_NF_IPTABLES | ||
86 | depends on NETFILTER_ADVANCED | ||
87 | help | ||
88 | This match extension allows you to match a range of SPIs | ||
89 | inside AH header of IPSec packets. | ||
90 | |||
91 | To compile it as a module, choose M here. If unsure, say N. | ||
92 | |||
93 | config IP_NF_MATCH_TTL | 95 | config IP_NF_MATCH_TTL |
94 | tristate '"ttl" match support' | 96 | tristate '"ttl" match support' |
95 | depends on IP_NF_IPTABLES | ||
96 | depends on NETFILTER_ADVANCED | 97 | depends on NETFILTER_ADVANCED |
97 | help | 98 | help |
98 | This adds CONFIG_IP_NF_MATCH_TTL option, which enables the user | 99 | This adds CONFIG_IP_NF_MATCH_TTL option, which enables the user |
@@ -100,21 +101,9 @@ config IP_NF_MATCH_TTL | |||
100 | 101 | ||
101 | To compile it as a module, choose M here. If unsure, say N. | 102 | To compile it as a module, choose M here. If unsure, say N. |
102 | 103 | ||
103 | config IP_NF_MATCH_ADDRTYPE | ||
104 | tristate '"addrtype" address type match support' | ||
105 | depends on IP_NF_IPTABLES | ||
106 | depends on NETFILTER_ADVANCED | ||
107 | help | ||
108 | This option allows you to match what routing thinks of an address, | ||
109 | eg. UNICAST, LOCAL, BROADCAST, ... | ||
110 | |||
111 | If you want to compile it as a module, say M here and read | ||
112 | <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. | ||
113 | |||
114 | # `filter', generic and specific targets | 104 | # `filter', generic and specific targets |
115 | config IP_NF_FILTER | 105 | config IP_NF_FILTER |
116 | tristate "Packet filtering" | 106 | tristate "Packet filtering" |
117 | depends on IP_NF_IPTABLES | ||
118 | default m if NETFILTER_ADVANCED=n | 107 | default m if NETFILTER_ADVANCED=n |
119 | help | 108 | help |
120 | Packet filtering defines a table `filter', which has a series of | 109 | Packet filtering defines a table `filter', which has a series of |
@@ -136,7 +125,6 @@ config IP_NF_TARGET_REJECT | |||
136 | 125 | ||
137 | config IP_NF_TARGET_LOG | 126 | config IP_NF_TARGET_LOG |
138 | tristate "LOG target support" | 127 | tristate "LOG target support" |
139 | depends on IP_NF_IPTABLES | ||
140 | default m if NETFILTER_ADVANCED=n | 128 | default m if NETFILTER_ADVANCED=n |
141 | help | 129 | help |
142 | This option adds a `LOG' target, which allows you to create rules in | 130 | This option adds a `LOG' target, which allows you to create rules in |
@@ -146,7 +134,6 @@ config IP_NF_TARGET_LOG | |||
146 | 134 | ||
147 | config IP_NF_TARGET_ULOG | 135 | config IP_NF_TARGET_ULOG |
148 | tristate "ULOG target support" | 136 | tristate "ULOG target support" |
149 | depends on IP_NF_IPTABLES | ||
150 | default m if NETFILTER_ADVANCED=n | 137 | default m if NETFILTER_ADVANCED=n |
151 | ---help--- | 138 | ---help--- |
152 | 139 | ||
@@ -167,7 +154,7 @@ config IP_NF_TARGET_ULOG | |||
167 | # NAT + specific targets: nf_conntrack | 154 | # NAT + specific targets: nf_conntrack |
168 | config NF_NAT | 155 | config NF_NAT |
169 | tristate "Full NAT" | 156 | tristate "Full NAT" |
170 | depends on IP_NF_IPTABLES && NF_CONNTRACK_IPV4 | 157 | depends on NF_CONNTRACK_IPV4 |
171 | default m if NETFILTER_ADVANCED=n | 158 | default m if NETFILTER_ADVANCED=n |
172 | help | 159 | help |
173 | The Full NAT option allows masquerading, port forwarding and other | 160 | The Full NAT option allows masquerading, port forwarding and other |
@@ -194,26 +181,26 @@ config IP_NF_TARGET_MASQUERADE | |||
194 | 181 | ||
195 | To compile it as a module, choose M here. If unsure, say N. | 182 | To compile it as a module, choose M here. If unsure, say N. |
196 | 183 | ||
197 | config IP_NF_TARGET_REDIRECT | 184 | config IP_NF_TARGET_NETMAP |
198 | tristate "REDIRECT target support" | 185 | tristate "NETMAP target support" |
199 | depends on NF_NAT | 186 | depends on NF_NAT |
200 | depends on NETFILTER_ADVANCED | 187 | depends on NETFILTER_ADVANCED |
201 | help | 188 | help |
202 | REDIRECT is a special case of NAT: all incoming connections are | 189 | NETMAP is an implementation of static 1:1 NAT mapping of network |
203 | mapped onto the incoming interface's address, causing the packets to | 190 | addresses. It maps the network address part, while keeping the host |
204 | come to the local machine instead of passing through. This is | 191 | address part intact. |
205 | useful for transparent proxies. | ||
206 | 192 | ||
207 | To compile it as a module, choose M here. If unsure, say N. | 193 | To compile it as a module, choose M here. If unsure, say N. |
208 | 194 | ||
209 | config IP_NF_TARGET_NETMAP | 195 | config IP_NF_TARGET_REDIRECT |
210 | tristate "NETMAP target support" | 196 | tristate "REDIRECT target support" |
211 | depends on NF_NAT | 197 | depends on NF_NAT |
212 | depends on NETFILTER_ADVANCED | 198 | depends on NETFILTER_ADVANCED |
213 | help | 199 | help |
214 | NETMAP is an implementation of static 1:1 NAT mapping of network | 200 | REDIRECT is a special case of NAT: all incoming connections are |
215 | addresses. It maps the network address part, while keeping the host | 201 | mapped onto the incoming interface's address, causing the packets to |
216 | address part intact. | 202 | come to the local machine instead of passing through. This is |
203 | useful for transparent proxies. | ||
217 | 204 | ||
218 | To compile it as a module, choose M here. If unsure, say N. | 205 | To compile it as a module, choose M here. If unsure, say N. |
219 | 206 | ||
@@ -262,44 +249,43 @@ config NF_NAT_PROTO_SCTP | |||
262 | 249 | ||
263 | config NF_NAT_FTP | 250 | config NF_NAT_FTP |
264 | tristate | 251 | tristate |
265 | depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT | 252 | depends on NF_CONNTRACK && NF_NAT |
266 | default NF_NAT && NF_CONNTRACK_FTP | 253 | default NF_NAT && NF_CONNTRACK_FTP |
267 | 254 | ||
268 | config NF_NAT_IRC | 255 | config NF_NAT_IRC |
269 | tristate | 256 | tristate |
270 | depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT | 257 | depends on NF_CONNTRACK && NF_NAT |
271 | default NF_NAT && NF_CONNTRACK_IRC | 258 | default NF_NAT && NF_CONNTRACK_IRC |
272 | 259 | ||
273 | config NF_NAT_TFTP | 260 | config NF_NAT_TFTP |
274 | tristate | 261 | tristate |
275 | depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT | 262 | depends on NF_CONNTRACK && NF_NAT |
276 | default NF_NAT && NF_CONNTRACK_TFTP | 263 | default NF_NAT && NF_CONNTRACK_TFTP |
277 | 264 | ||
278 | config NF_NAT_AMANDA | 265 | config NF_NAT_AMANDA |
279 | tristate | 266 | tristate |
280 | depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT | 267 | depends on NF_CONNTRACK && NF_NAT |
281 | default NF_NAT && NF_CONNTRACK_AMANDA | 268 | default NF_NAT && NF_CONNTRACK_AMANDA |
282 | 269 | ||
283 | config NF_NAT_PPTP | 270 | config NF_NAT_PPTP |
284 | tristate | 271 | tristate |
285 | depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT | 272 | depends on NF_CONNTRACK && NF_NAT |
286 | default NF_NAT && NF_CONNTRACK_PPTP | 273 | default NF_NAT && NF_CONNTRACK_PPTP |
287 | select NF_NAT_PROTO_GRE | 274 | select NF_NAT_PROTO_GRE |
288 | 275 | ||
289 | config NF_NAT_H323 | 276 | config NF_NAT_H323 |
290 | tristate | 277 | tristate |
291 | depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT | 278 | depends on NF_CONNTRACK && NF_NAT |
292 | default NF_NAT && NF_CONNTRACK_H323 | 279 | default NF_NAT && NF_CONNTRACK_H323 |
293 | 280 | ||
294 | config NF_NAT_SIP | 281 | config NF_NAT_SIP |
295 | tristate | 282 | tristate |
296 | depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT | 283 | depends on NF_CONNTRACK && NF_NAT |
297 | default NF_NAT && NF_CONNTRACK_SIP | 284 | default NF_NAT && NF_CONNTRACK_SIP |
298 | 285 | ||
299 | # mangle + specific targets | 286 | # mangle + specific targets |
300 | config IP_NF_MANGLE | 287 | config IP_NF_MANGLE |
301 | tristate "Packet mangling" | 288 | tristate "Packet mangling" |
302 | depends on IP_NF_IPTABLES | ||
303 | default m if NETFILTER_ADVANCED=n | 289 | default m if NETFILTER_ADVANCED=n |
304 | help | 290 | help |
305 | This option adds a `mangle' table to iptables: see the man page for | 291 | This option adds a `mangle' table to iptables: see the man page for |
@@ -308,6 +294,19 @@ config IP_NF_MANGLE | |||
308 | 294 | ||
309 | To compile it as a module, choose M here. If unsure, say N. | 295 | To compile it as a module, choose M here. If unsure, say N. |
310 | 296 | ||
297 | config IP_NF_TARGET_CLUSTERIP | ||
298 | tristate "CLUSTERIP target support (EXPERIMENTAL)" | ||
299 | depends on IP_NF_MANGLE && EXPERIMENTAL | ||
300 | depends on NF_CONNTRACK_IPV4 | ||
301 | depends on NETFILTER_ADVANCED | ||
302 | select NF_CONNTRACK_MARK | ||
303 | help | ||
304 | The CLUSTERIP target allows you to build load-balancing clusters of | ||
305 | network servers without having a dedicated load-balancing | ||
306 | router/server/switch. | ||
307 | |||
308 | To compile it as a module, choose M here. If unsure, say N. | ||
309 | |||
311 | config IP_NF_TARGET_ECN | 310 | config IP_NF_TARGET_ECN |
312 | tristate "ECN target support" | 311 | tristate "ECN target support" |
313 | depends on IP_NF_MANGLE | 312 | depends on IP_NF_MANGLE |
@@ -338,23 +337,9 @@ config IP_NF_TARGET_TTL | |||
338 | 337 | ||
339 | To compile it as a module, choose M here. If unsure, say N. | 338 | To compile it as a module, choose M here. If unsure, say N. |
340 | 339 | ||
341 | config IP_NF_TARGET_CLUSTERIP | ||
342 | tristate "CLUSTERIP target support (EXPERIMENTAL)" | ||
343 | depends on IP_NF_MANGLE && EXPERIMENTAL | ||
344 | depends on NF_CONNTRACK_IPV4 | ||
345 | depends on NETFILTER_ADVANCED | ||
346 | select NF_CONNTRACK_MARK | ||
347 | help | ||
348 | The CLUSTERIP target allows you to build load-balancing clusters of | ||
349 | network servers without having a dedicated load-balancing | ||
350 | router/server/switch. | ||
351 | |||
352 | To compile it as a module, choose M here. If unsure, say N. | ||
353 | |||
354 | # raw + specific targets | 340 | # raw + specific targets |
355 | config IP_NF_RAW | 341 | config IP_NF_RAW |
356 | tristate 'raw table support (required for NOTRACK/TRACE)' | 342 | tristate 'raw table support (required for NOTRACK/TRACE)' |
357 | depends on IP_NF_IPTABLES | ||
358 | depends on NETFILTER_ADVANCED | 343 | depends on NETFILTER_ADVANCED |
359 | help | 344 | help |
360 | This option adds a `raw' table to iptables. This table is the very | 345 | This option adds a `raw' table to iptables. This table is the very |
@@ -367,7 +352,6 @@ config IP_NF_RAW | |||
367 | # security table for MAC policy | 352 | # security table for MAC policy |
368 | config IP_NF_SECURITY | 353 | config IP_NF_SECURITY |
369 | tristate "Security table" | 354 | tristate "Security table" |
370 | depends on IP_NF_IPTABLES | ||
371 | depends on SECURITY | 355 | depends on SECURITY |
372 | depends on NETFILTER_ADVANCED | 356 | depends on NETFILTER_ADVANCED |
373 | help | 357 | help |
@@ -376,6 +360,8 @@ config IP_NF_SECURITY | |||
376 | 360 | ||
377 | If unsure, say N. | 361 | If unsure, say N. |
378 | 362 | ||
363 | endif # IP_NF_IPTABLES | ||
364 | |||
379 | # ARP tables | 365 | # ARP tables |
380 | config IP_NF_ARPTABLES | 366 | config IP_NF_ARPTABLES |
381 | tristate "ARP tables support" | 367 | tristate "ARP tables support" |
@@ -388,9 +374,10 @@ config IP_NF_ARPTABLES | |||
388 | 374 | ||
389 | To compile it as a module, choose M here. If unsure, say N. | 375 | To compile it as a module, choose M here. If unsure, say N. |
390 | 376 | ||
377 | if IP_NF_ARPTABLES | ||
378 | |||
391 | config IP_NF_ARPFILTER | 379 | config IP_NF_ARPFILTER |
392 | tristate "ARP packet filtering" | 380 | tristate "ARP packet filtering" |
393 | depends on IP_NF_ARPTABLES | ||
394 | help | 381 | help |
395 | ARP packet filtering defines a table `filter', which has a series of | 382 | ARP packet filtering defines a table `filter', which has a series of |
396 | rules for simple ARP packet filtering at local input and | 383 | rules for simple ARP packet filtering at local input and |
@@ -401,10 +388,11 @@ config IP_NF_ARPFILTER | |||
401 | 388 | ||
402 | config IP_NF_ARP_MANGLE | 389 | config IP_NF_ARP_MANGLE |
403 | tristate "ARP payload mangling" | 390 | tristate "ARP payload mangling" |
404 | depends on IP_NF_ARPTABLES | ||
405 | help | 391 | help |
406 | Allows altering the ARP packet payload: source and destination | 392 | Allows altering the ARP packet payload: source and destination |
407 | hardware and network addresses. | 393 | hardware and network addresses. |
408 | 394 | ||
395 | endif # IP_NF_ARPTABLES | ||
396 | |||
409 | endmenu | 397 | endmenu |
410 | 398 | ||
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 3f31291f37ce..5f9b650d90fc 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile | |||
@@ -18,6 +18,9 @@ obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o | |||
18 | 18 | ||
19 | obj-$(CONFIG_NF_NAT) += nf_nat.o | 19 | obj-$(CONFIG_NF_NAT) += nf_nat.o |
20 | 20 | ||
21 | # defrag | ||
22 | obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o | ||
23 | |||
21 | # NAT helpers (nf_conntrack) | 24 | # NAT helpers (nf_conntrack) |
22 | obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o | 25 | obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o |
23 | obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o | 26 | obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o |
@@ -48,7 +51,6 @@ obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o | |||
48 | obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o | 51 | obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o |
49 | obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o | 52 | obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o |
50 | obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o | 53 | obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o |
51 | obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o | ||
52 | obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o | 54 | obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o |
53 | 55 | ||
54 | # targets | 56 | # targets |
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 03e83a65aec5..8d70d29f1ccf 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c | |||
@@ -200,15 +200,12 @@ static inline int arp_checkentry(const struct arpt_arp *arp) | |||
200 | return 1; | 200 | return 1; |
201 | } | 201 | } |
202 | 202 | ||
203 | static unsigned int arpt_error(struct sk_buff *skb, | 203 | static unsigned int |
204 | const struct net_device *in, | 204 | arpt_error(struct sk_buff *skb, const struct xt_target_param *par) |
205 | const struct net_device *out, | ||
206 | unsigned int hooknum, | ||
207 | const struct xt_target *target, | ||
208 | const void *targinfo) | ||
209 | { | 205 | { |
210 | if (net_ratelimit()) | 206 | if (net_ratelimit()) |
211 | printk("arp_tables: error: '%s'\n", (char *)targinfo); | 207 | printk("arp_tables: error: '%s'\n", |
208 | (const char *)par->targinfo); | ||
212 | 209 | ||
213 | return NF_DROP; | 210 | return NF_DROP; |
214 | } | 211 | } |
@@ -232,6 +229,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, | |||
232 | const char *indev, *outdev; | 229 | const char *indev, *outdev; |
233 | void *table_base; | 230 | void *table_base; |
234 | const struct xt_table_info *private; | 231 | const struct xt_table_info *private; |
232 | struct xt_target_param tgpar; | ||
235 | 233 | ||
236 | if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) | 234 | if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) |
237 | return NF_DROP; | 235 | return NF_DROP; |
@@ -245,6 +243,11 @@ unsigned int arpt_do_table(struct sk_buff *skb, | |||
245 | e = get_entry(table_base, private->hook_entry[hook]); | 243 | e = get_entry(table_base, private->hook_entry[hook]); |
246 | back = get_entry(table_base, private->underflow[hook]); | 244 | back = get_entry(table_base, private->underflow[hook]); |
247 | 245 | ||
246 | tgpar.in = in; | ||
247 | tgpar.out = out; | ||
248 | tgpar.hooknum = hook; | ||
249 | tgpar.family = NFPROTO_ARP; | ||
250 | |||
248 | arp = arp_hdr(skb); | 251 | arp = arp_hdr(skb); |
249 | do { | 252 | do { |
250 | if (arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { | 253 | if (arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { |
@@ -290,11 +293,10 @@ unsigned int arpt_do_table(struct sk_buff *skb, | |||
290 | /* Targets which reenter must return | 293 | /* Targets which reenter must return |
291 | * abs. verdicts | 294 | * abs. verdicts |
292 | */ | 295 | */ |
296 | tgpar.target = t->u.kernel.target; | ||
297 | tgpar.targinfo = t->data; | ||
293 | verdict = t->u.kernel.target->target(skb, | 298 | verdict = t->u.kernel.target->target(skb, |
294 | in, out, | 299 | &tgpar); |
295 | hook, | ||
296 | t->u.kernel.target, | ||
297 | t->data); | ||
298 | 300 | ||
299 | /* Target might have changed stuff. */ | 301 | /* Target might have changed stuff. */ |
300 | arp = arp_hdr(skb); | 302 | arp = arp_hdr(skb); |
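This hunk is the heart of the series: the target hook's six-argument call collapses into a single struct xt_target_param. The per-packet fields are filled once before the rule walk, and only the per-rule fields are updated inside the loop. A sketch of the structure's shape as inferred from the call sites in this diff (the authoritative definition lives in include/linux/netfilter/x_tables.h):

/* Field set inferred from the hunks above, not quoted from the header. */
struct xt_target_param {
	const struct net_device *in, *out;	/* constant per packet */
	unsigned int hooknum;			/* constant per packet */
	u_int8_t family;			/* NFPROTO_* constant  */
	const struct xt_target *target;		/* updated per rule    */
	const void *targinfo;			/* updated per rule    */
};

Besides shortening every call site, the compound parameter lets later patches add fields without touching each extension's prototype again.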
@@ -456,23 +458,24 @@ static inline int check_entry(struct arpt_entry *e, const char *name) | |||
456 | 458 | ||
457 | static inline int check_target(struct arpt_entry *e, const char *name) | 459 | static inline int check_target(struct arpt_entry *e, const char *name) |
458 | { | 460 | { |
459 | struct arpt_entry_target *t; | 461 | struct arpt_entry_target *t = arpt_get_target(e); |
460 | struct xt_target *target; | ||
461 | int ret; | 462 | int ret; |
462 | 463 | struct xt_tgchk_param par = { | |
463 | t = arpt_get_target(e); | 464 | .table = name, |
464 | target = t->u.kernel.target; | 465 | .entryinfo = e, |
465 | 466 | .target = t->u.kernel.target, | |
466 | ret = xt_check_target(target, NF_ARP, t->u.target_size - sizeof(*t), | 467 | .targinfo = t->data, |
467 | name, e->comefrom, 0, 0); | 468 | .hook_mask = e->comefrom, |
468 | if (!ret && t->u.kernel.target->checkentry | 469 | .family = NFPROTO_ARP, |
469 | && !t->u.kernel.target->checkentry(name, e, target, t->data, | 470 | }; |
470 | e->comefrom)) { | 471 | |
472 | ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); | ||
473 | if (ret < 0) { | ||
471 | duprintf("arp_tables: check failed for `%s'.\n", | 474 | duprintf("arp_tables: check failed for `%s'.\n", |
472 | t->u.kernel.target->name); | 475 | t->u.kernel.target->name); |
473 | ret = -EINVAL; | 476 | return ret; |
474 | } | 477 | } |
475 | return ret; | 478 | return 0; |
476 | } | 479 | } |
477 | 480 | ||
478 | static inline int | 481 | static inline int |
@@ -488,7 +491,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, | |||
488 | return ret; | 491 | return ret; |
489 | 492 | ||
490 | t = arpt_get_target(e); | 493 | t = arpt_get_target(e); |
491 | target = try_then_request_module(xt_find_target(NF_ARP, t->u.user.name, | 494 | target = try_then_request_module(xt_find_target(NFPROTO_ARP, |
495 | t->u.user.name, | ||
492 | t->u.user.revision), | 496 | t->u.user.revision), |
493 | "arpt_%s", t->u.user.name); | 497 | "arpt_%s", t->u.user.name); |
494 | if (IS_ERR(target) || !target) { | 498 | if (IS_ERR(target) || !target) { |
@@ -554,15 +558,19 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, | |||
554 | 558 | ||
555 | static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) | 559 | static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) |
556 | { | 560 | { |
561 | struct xt_tgdtor_param par; | ||
557 | struct arpt_entry_target *t; | 562 | struct arpt_entry_target *t; |
558 | 563 | ||
559 | if (i && (*i)-- == 0) | 564 | if (i && (*i)-- == 0) |
560 | return 1; | 565 | return 1; |
561 | 566 | ||
562 | t = arpt_get_target(e); | 567 | t = arpt_get_target(e); |
563 | if (t->u.kernel.target->destroy) | 568 | par.target = t->u.kernel.target; |
564 | t->u.kernel.target->destroy(t->u.kernel.target, t->data); | 569 | par.targinfo = t->data; |
565 | module_put(t->u.kernel.target->me); | 570 | par.family = NFPROTO_ARP; |
571 | if (par.target->destroy != NULL) | ||
572 | par.target->destroy(&par); | ||
573 | module_put(par.target->me); | ||
566 | return 0; | 574 | return 0; |
567 | } | 575 | } |
568 | 576 | ||
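The destructor path gets a matching parameter block. Note the asymmetry with check_target() above: here par is a stack variable filled field by field, so the par.target spelling is correct. Its assumed shape, again inferred from the call sites rather than quoted from the header:

/* Sketch of the inferred destructor parameters. */
struct xt_tgdtor_param {
	const struct xt_target *target;
	void *targinfo;
	u_int8_t family;
};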
@@ -788,7 +796,7 @@ static void compat_standard_from_user(void *dst, void *src) | |||
788 | int v = *(compat_int_t *)src; | 796 | int v = *(compat_int_t *)src; |
789 | 797 | ||
790 | if (v > 0) | 798 | if (v > 0) |
791 | v += xt_compat_calc_jump(NF_ARP, v); | 799 | v += xt_compat_calc_jump(NFPROTO_ARP, v); |
792 | memcpy(dst, &v, sizeof(v)); | 800 | memcpy(dst, &v, sizeof(v)); |
793 | } | 801 | } |
794 | 802 | ||
@@ -797,7 +805,7 @@ static int compat_standard_to_user(void __user *dst, void *src) | |||
797 | compat_int_t cv = *(int *)src; | 805 | compat_int_t cv = *(int *)src; |
798 | 806 | ||
799 | if (cv > 0) | 807 | if (cv > 0) |
800 | cv -= xt_compat_calc_jump(NF_ARP, cv); | 808 | cv -= xt_compat_calc_jump(NFPROTO_ARP, cv); |
801 | return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; | 809 | return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; |
802 | } | 810 | } |
803 | 811 | ||
@@ -815,7 +823,7 @@ static int compat_calc_entry(struct arpt_entry *e, | |||
815 | t = arpt_get_target(e); | 823 | t = arpt_get_target(e); |
816 | off += xt_compat_target_offset(t->u.kernel.target); | 824 | off += xt_compat_target_offset(t->u.kernel.target); |
817 | newinfo->size -= off; | 825 | newinfo->size -= off; |
818 | ret = xt_compat_add_offset(NF_ARP, entry_offset, off); | 826 | ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); |
819 | if (ret) | 827 | if (ret) |
820 | return ret; | 828 | return ret; |
821 | 829 | ||
@@ -866,9 +874,9 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) | |||
866 | name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; | 874 | name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; |
867 | #ifdef CONFIG_COMPAT | 875 | #ifdef CONFIG_COMPAT |
868 | if (compat) | 876 | if (compat) |
869 | xt_compat_lock(NF_ARP); | 877 | xt_compat_lock(NFPROTO_ARP); |
870 | #endif | 878 | #endif |
871 | t = try_then_request_module(xt_find_table_lock(net, NF_ARP, name), | 879 | t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), |
872 | "arptable_%s", name); | 880 | "arptable_%s", name); |
873 | if (t && !IS_ERR(t)) { | 881 | if (t && !IS_ERR(t)) { |
874 | struct arpt_getinfo info; | 882 | struct arpt_getinfo info; |
@@ -878,7 +886,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) | |||
878 | if (compat) { | 886 | if (compat) { |
879 | struct xt_table_info tmp; | 887 | struct xt_table_info tmp; |
880 | ret = compat_table_info(private, &tmp); | 888 | ret = compat_table_info(private, &tmp); |
881 | xt_compat_flush_offsets(NF_ARP); | 889 | xt_compat_flush_offsets(NFPROTO_ARP); |
882 | private = &tmp; | 890 | private = &tmp; |
883 | } | 891 | } |
884 | #endif | 892 | #endif |
@@ -901,7 +909,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) | |||
901 | ret = t ? PTR_ERR(t) : -ENOENT; | 909 | ret = t ? PTR_ERR(t) : -ENOENT; |
902 | #ifdef CONFIG_COMPAT | 910 | #ifdef CONFIG_COMPAT |
903 | if (compat) | 911 | if (compat) |
904 | xt_compat_unlock(NF_ARP); | 912 | xt_compat_unlock(NFPROTO_ARP); |
905 | #endif | 913 | #endif |
906 | return ret; | 914 | return ret; |
907 | } | 915 | } |
@@ -925,7 +933,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, | |||
925 | return -EINVAL; | 933 | return -EINVAL; |
926 | } | 934 | } |
927 | 935 | ||
928 | t = xt_find_table_lock(net, NF_ARP, get.name); | 936 | t = xt_find_table_lock(net, NFPROTO_ARP, get.name); |
929 | if (t && !IS_ERR(t)) { | 937 | if (t && !IS_ERR(t)) { |
930 | const struct xt_table_info *private = t->private; | 938 | const struct xt_table_info *private = t->private; |
931 | 939 | ||
@@ -967,7 +975,7 @@ static int __do_replace(struct net *net, const char *name, | |||
967 | goto out; | 975 | goto out; |
968 | } | 976 | } |
969 | 977 | ||
970 | t = try_then_request_module(xt_find_table_lock(net, NF_ARP, name), | 978 | t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name), |
971 | "arptable_%s", name); | 979 | "arptable_%s", name); |
972 | if (!t || IS_ERR(t)) { | 980 | if (!t || IS_ERR(t)) { |
973 | ret = t ? PTR_ERR(t) : -ENOENT; | 981 | ret = t ? PTR_ERR(t) : -ENOENT; |
@@ -1134,7 +1142,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, | |||
1134 | goto free; | 1142 | goto free; |
1135 | } | 1143 | } |
1136 | 1144 | ||
1137 | t = xt_find_table_lock(net, NF_ARP, name); | 1145 | t = xt_find_table_lock(net, NFPROTO_ARP, name); |
1138 | if (!t || IS_ERR(t)) { | 1146 | if (!t || IS_ERR(t)) { |
1139 | ret = t ? PTR_ERR(t) : -ENOENT; | 1147 | ret = t ? PTR_ERR(t) : -ENOENT; |
1140 | goto free; | 1148 | goto free; |
@@ -1218,7 +1226,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, | |||
1218 | entry_offset = (void *)e - (void *)base; | 1226 | entry_offset = (void *)e - (void *)base; |
1219 | 1227 | ||
1220 | t = compat_arpt_get_target(e); | 1228 | t = compat_arpt_get_target(e); |
1221 | target = try_then_request_module(xt_find_target(NF_ARP, | 1229 | target = try_then_request_module(xt_find_target(NFPROTO_ARP, |
1222 | t->u.user.name, | 1230 | t->u.user.name, |
1223 | t->u.user.revision), | 1231 | t->u.user.revision), |
1224 | "arpt_%s", t->u.user.name); | 1232 | "arpt_%s", t->u.user.name); |
@@ -1232,7 +1240,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, | |||
1232 | 1240 | ||
1233 | off += xt_compat_target_offset(target); | 1241 | off += xt_compat_target_offset(target); |
1234 | *size += off; | 1242 | *size += off; |
1235 | ret = xt_compat_add_offset(NF_ARP, entry_offset, off); | 1243 | ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); |
1236 | if (ret) | 1244 | if (ret) |
1237 | goto release_target; | 1245 | goto release_target; |
1238 | 1246 | ||
@@ -1333,7 +1341,7 @@ static int translate_compat_table(const char *name, | |||
1333 | 1341 | ||
1334 | duprintf("translate_compat_table: size %u\n", info->size); | 1342 | duprintf("translate_compat_table: size %u\n", info->size); |
1335 | j = 0; | 1343 | j = 0; |
1336 | xt_compat_lock(NF_ARP); | 1344 | xt_compat_lock(NFPROTO_ARP); |
1337 | /* Walk through entries, checking offsets. */ | 1345 | /* Walk through entries, checking offsets. */ |
1338 | ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, | 1346 | ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, |
1339 | check_compat_entry_size_and_hooks, | 1347 | check_compat_entry_size_and_hooks, |
@@ -1383,8 +1391,8 @@ static int translate_compat_table(const char *name, | |||
1383 | ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, | 1391 | ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, |
1384 | compat_copy_entry_from_user, | 1392 | compat_copy_entry_from_user, |
1385 | &pos, &size, name, newinfo, entry1); | 1393 | &pos, &size, name, newinfo, entry1); |
1386 | xt_compat_flush_offsets(NF_ARP); | 1394 | xt_compat_flush_offsets(NFPROTO_ARP); |
1387 | xt_compat_unlock(NF_ARP); | 1395 | xt_compat_unlock(NFPROTO_ARP); |
1388 | if (ret) | 1396 | if (ret) |
1389 | goto free_newinfo; | 1397 | goto free_newinfo; |
1390 | 1398 | ||
@@ -1420,8 +1428,8 @@ out: | |||
1420 | COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); | 1428 | COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); |
1421 | return ret; | 1429 | return ret; |
1422 | out_unlock: | 1430 | out_unlock: |
1423 | xt_compat_flush_offsets(NF_ARP); | 1431 | xt_compat_flush_offsets(NFPROTO_ARP); |
1424 | xt_compat_unlock(NF_ARP); | 1432 | xt_compat_unlock(NFPROTO_ARP); |
1425 | goto out; | 1433 | goto out; |
1426 | } | 1434 | } |
1427 | 1435 | ||
@@ -1607,8 +1615,8 @@ static int compat_get_entries(struct net *net, | |||
1607 | return -EINVAL; | 1615 | return -EINVAL; |
1608 | } | 1616 | } |
1609 | 1617 | ||
1610 | xt_compat_lock(NF_ARP); | 1618 | xt_compat_lock(NFPROTO_ARP); |
1611 | t = xt_find_table_lock(net, NF_ARP, get.name); | 1619 | t = xt_find_table_lock(net, NFPROTO_ARP, get.name); |
1612 | if (t && !IS_ERR(t)) { | 1620 | if (t && !IS_ERR(t)) { |
1613 | const struct xt_table_info *private = t->private; | 1621 | const struct xt_table_info *private = t->private; |
1614 | struct xt_table_info info; | 1622 | struct xt_table_info info; |
@@ -1623,13 +1631,13 @@ static int compat_get_entries(struct net *net, | |||
1623 | private->size, get.size); | 1631 | private->size, get.size); |
1624 | ret = -EAGAIN; | 1632 | ret = -EAGAIN; |
1625 | } | 1633 | } |
1626 | xt_compat_flush_offsets(NF_ARP); | 1634 | xt_compat_flush_offsets(NFPROTO_ARP); |
1627 | module_put(t->me); | 1635 | module_put(t->me); |
1628 | xt_table_unlock(t); | 1636 | xt_table_unlock(t); |
1629 | } else | 1637 | } else |
1630 | ret = t ? PTR_ERR(t) : -ENOENT; | 1638 | ret = t ? PTR_ERR(t) : -ENOENT; |
1631 | 1639 | ||
1632 | xt_compat_unlock(NF_ARP); | 1640 | xt_compat_unlock(NFPROTO_ARP); |
1633 | return ret; | 1641 | return ret; |
1634 | } | 1642 | } |
1635 | 1643 | ||
@@ -1709,7 +1717,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len | |||
1709 | break; | 1717 | break; |
1710 | } | 1718 | } |
1711 | 1719 | ||
1712 | try_then_request_module(xt_find_revision(NF_ARP, rev.name, | 1720 | try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, |
1713 | rev.revision, 1, &ret), | 1721 | rev.revision, 1, &ret), |
1714 | "arpt_%s", rev.name); | 1722 | "arpt_%s", rev.name); |
1715 | break; | 1723 | break; |
@@ -1787,7 +1795,7 @@ void arpt_unregister_table(struct xt_table *table) | |||
1787 | static struct xt_target arpt_standard_target __read_mostly = { | 1795 | static struct xt_target arpt_standard_target __read_mostly = { |
1788 | .name = ARPT_STANDARD_TARGET, | 1796 | .name = ARPT_STANDARD_TARGET, |
1789 | .targetsize = sizeof(int), | 1797 | .targetsize = sizeof(int), |
1790 | .family = NF_ARP, | 1798 | .family = NFPROTO_ARP, |
1791 | #ifdef CONFIG_COMPAT | 1799 | #ifdef CONFIG_COMPAT |
1792 | .compatsize = sizeof(compat_int_t), | 1800 | .compatsize = sizeof(compat_int_t), |
1793 | .compat_from_user = compat_standard_from_user, | 1801 | .compat_from_user = compat_standard_from_user, |
@@ -1799,7 +1807,7 @@ static struct xt_target arpt_error_target __read_mostly = { | |||
1799 | .name = ARPT_ERROR_TARGET, | 1807 | .name = ARPT_ERROR_TARGET, |
1800 | .target = arpt_error, | 1808 | .target = arpt_error, |
1801 | .targetsize = ARPT_FUNCTION_MAXNAMELEN, | 1809 | .targetsize = ARPT_FUNCTION_MAXNAMELEN, |
1802 | .family = NF_ARP, | 1810 | .family = NFPROTO_ARP, |
1803 | }; | 1811 | }; |
1804 | 1812 | ||
1805 | static struct nf_sockopt_ops arpt_sockopts = { | 1813 | static struct nf_sockopt_ops arpt_sockopts = { |
@@ -1821,12 +1829,12 @@ static struct nf_sockopt_ops arpt_sockopts = { | |||
1821 | 1829 | ||
1822 | static int __net_init arp_tables_net_init(struct net *net) | 1830 | static int __net_init arp_tables_net_init(struct net *net) |
1823 | { | 1831 | { |
1824 | return xt_proto_init(net, NF_ARP); | 1832 | return xt_proto_init(net, NFPROTO_ARP); |
1825 | } | 1833 | } |
1826 | 1834 | ||
1827 | static void __net_exit arp_tables_net_exit(struct net *net) | 1835 | static void __net_exit arp_tables_net_exit(struct net *net) |
1828 | { | 1836 | { |
1829 | xt_proto_fini(net, NF_ARP); | 1837 | xt_proto_fini(net, NFPROTO_ARP); |
1830 | } | 1838 | } |
1831 | 1839 | ||
1832 | static struct pernet_operations arp_tables_net_ops = { | 1840 | static struct pernet_operations arp_tables_net_ops = { |
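The remaining churn in this file is the mechanical NF_ARP to NFPROTO_ARP rename: the series introduces a single NFPROTO_* numbering space for every xtables family so ARP, bridge and IP tables can share generic code. The constants are chosen to coincide with the families they replace; a sketch of the assumed enum (values quoted from memory of that era's include/linux/netfilter.h, so treat them as illustrative):

enum {
	NFPROTO_UNSPEC =  0,
	NFPROTO_IPV4   =  2,	/* == AF_INET   */
	NFPROTO_ARP    =  3,	/* == NF_ARP    */
	NFPROTO_BRIDGE =  7,	/* == PF_BRIDGE */
	NFPROTO_IPV6   = 10,	/* == AF_INET6  */
	NFPROTO_DECNET = 12,	/* == AF_DECnet */
};

Because the values line up with the old ones, the rename is source-level only and changes no binary layout.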
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index a385959d2655..b0d5b1d0a769 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c | |||
@@ -9,12 +9,9 @@ MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); | |||
9 | MODULE_DESCRIPTION("arptables arp payload mangle target"); | 9 | MODULE_DESCRIPTION("arptables arp payload mangle target"); |
10 | 10 | ||
11 | static unsigned int | 11 | static unsigned int |
12 | target(struct sk_buff *skb, | 12 | target(struct sk_buff *skb, const struct xt_target_param *par) |
13 | const struct net_device *in, const struct net_device *out, | ||
14 | unsigned int hooknum, const struct xt_target *target, | ||
15 | const void *targinfo) | ||
16 | { | 13 | { |
17 | const struct arpt_mangle *mangle = targinfo; | 14 | const struct arpt_mangle *mangle = par->targinfo; |
18 | const struct arphdr *arp; | 15 | const struct arphdr *arp; |
19 | unsigned char *arpptr; | 16 | unsigned char *arpptr; |
20 | int pln, hln; | 17 | int pln, hln; |
@@ -57,11 +54,9 @@ target(struct sk_buff *skb, | |||
57 | return mangle->target; | 54 | return mangle->target; |
58 | } | 55 | } |
59 | 56 | ||
60 | static bool | 57 | static bool checkentry(const struct xt_tgchk_param *par) |
61 | checkentry(const char *tablename, const void *e, const struct xt_target *target, | ||
62 | void *targinfo, unsigned int hook_mask) | ||
63 | { | 58 | { |
64 | const struct arpt_mangle *mangle = targinfo; | 59 | const struct arpt_mangle *mangle = par->targinfo; |
65 | 60 | ||
66 | if (mangle->flags & ~ARPT_MANGLE_MASK || | 61 | if (mangle->flags & ~ARPT_MANGLE_MASK || |
67 | !(mangle->flags & ARPT_MANGLE_MASK)) | 62 | !(mangle->flags & ARPT_MANGLE_MASK)) |
@@ -75,7 +70,7 @@ checkentry(const char *tablename, const void *e, const struct xt_target *target, | |||
75 | 70 | ||
76 | static struct xt_target arpt_mangle_reg __read_mostly = { | 71 | static struct xt_target arpt_mangle_reg __read_mostly = { |
77 | .name = "mangle", | 72 | .name = "mangle", |
78 | .family = NF_ARP, | 73 | .family = NFPROTO_ARP, |
79 | .target = target, | 74 | .target = target, |
80 | .targetsize = sizeof(struct arpt_mangle), | 75 | .targetsize = sizeof(struct arpt_mangle), |
81 | .checkentry = checkentry, | 76 | .checkentry = checkentry, |
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 082f5dd3156c..bee3d117661a 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c | |||
@@ -51,7 +51,7 @@ static struct xt_table packet_filter = { | |||
51 | .lock = __RW_LOCK_UNLOCKED(packet_filter.lock), | 51 | .lock = __RW_LOCK_UNLOCKED(packet_filter.lock), |
52 | .private = NULL, | 52 | .private = NULL, |
53 | .me = THIS_MODULE, | 53 | .me = THIS_MODULE, |
54 | .af = NF_ARP, | 54 | .af = NFPROTO_ARP, |
55 | }; | 55 | }; |
56 | 56 | ||
57 | /* The work comes in here from netfilter.c */ | 57 | /* The work comes in here from netfilter.c */ |
@@ -89,21 +89,21 @@ static struct nf_hook_ops arpt_ops[] __read_mostly = { | |||
89 | { | 89 | { |
90 | .hook = arpt_in_hook, | 90 | .hook = arpt_in_hook, |
91 | .owner = THIS_MODULE, | 91 | .owner = THIS_MODULE, |
92 | .pf = NF_ARP, | 92 | .pf = NFPROTO_ARP, |
93 | .hooknum = NF_ARP_IN, | 93 | .hooknum = NF_ARP_IN, |
94 | .priority = NF_IP_PRI_FILTER, | 94 | .priority = NF_IP_PRI_FILTER, |
95 | }, | 95 | }, |
96 | { | 96 | { |
97 | .hook = arpt_out_hook, | 97 | .hook = arpt_out_hook, |
98 | .owner = THIS_MODULE, | 98 | .owner = THIS_MODULE, |
99 | .pf = NF_ARP, | 99 | .pf = NFPROTO_ARP, |
100 | .hooknum = NF_ARP_OUT, | 100 | .hooknum = NF_ARP_OUT, |
101 | .priority = NF_IP_PRI_FILTER, | 101 | .priority = NF_IP_PRI_FILTER, |
102 | }, | 102 | }, |
103 | { | 103 | { |
104 | .hook = arpt_forward_hook, | 104 | .hook = arpt_forward_hook, |
105 | .owner = THIS_MODULE, | 105 | .owner = THIS_MODULE, |
106 | .pf = NF_ARP, | 106 | .pf = NFPROTO_ARP, |
107 | .hooknum = NF_ARP_FORWARD, | 107 | .hooknum = NF_ARP_FORWARD, |
108 | .priority = NF_IP_PRI_FILTER, | 108 | .priority = NF_IP_PRI_FILTER, |
109 | }, | 109 | }, |
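For orientation, a hook array like arpt_ops is registered in one call at module init; a minimal sketch of the surrounding boilerplate, which this hunk does not show (error handling trimmed):

/* Sketch: batch (un)registration of the hook array above. */
static int __init arptable_filter_hook_init(void)
{
	return nf_register_hooks(arpt_ops, ARRAY_SIZE(arpt_ops));
}

static void __exit arptable_filter_hook_exit(void)
{
	nf_unregister_hooks(arpt_ops, ARRAY_SIZE(arpt_ops));
}

nf_register_hooks() unwinds any already-registered entries when a later one fails, so no manual rollback loop is needed.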
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4e7c719445c2..213fb27debc1 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c | |||
@@ -171,31 +171,25 @@ ip_checkentry(const struct ipt_ip *ip) | |||
171 | } | 171 | } |
172 | 172 | ||
173 | static unsigned int | 173 | static unsigned int |
174 | ipt_error(struct sk_buff *skb, | 174 | ipt_error(struct sk_buff *skb, const struct xt_target_param *par) |
175 | const struct net_device *in, | ||
176 | const struct net_device *out, | ||
177 | unsigned int hooknum, | ||
178 | const struct xt_target *target, | ||
179 | const void *targinfo) | ||
180 | { | 175 | { |
181 | if (net_ratelimit()) | 176 | if (net_ratelimit()) |
182 | printk("ip_tables: error: `%s'\n", (char *)targinfo); | 177 | printk("ip_tables: error: `%s'\n", |
178 | (const char *)par->targinfo); | ||
183 | 179 | ||
184 | return NF_DROP; | 180 | return NF_DROP; |
185 | } | 181 | } |
186 | 182 | ||
187 | /* Performance critical - called for every packet */ | 183 | /* Performance critical - called for every packet */ |
188 | static inline bool | 184 | static inline bool |
189 | do_match(struct ipt_entry_match *m, | 185 | do_match(struct ipt_entry_match *m, const struct sk_buff *skb, |
190 | const struct sk_buff *skb, | 186 | struct xt_match_param *par) |
191 | const struct net_device *in, | ||
192 | const struct net_device *out, | ||
193 | int offset, | ||
194 | bool *hotdrop) | ||
195 | { | 187 | { |
188 | par->match = m->u.kernel.match; | ||
189 | par->matchinfo = m->data; | ||
190 | |||
196 | /* Stop iteration if it doesn't match */ | 191 | /* Stop iteration if it doesn't match */ |
197 | if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data, | 192 | if (!m->u.kernel.match->match(skb, par)) |
198 | offset, ip_hdrlen(skb), hotdrop)) | ||
199 | return true; | 193 | return true; |
200 | else | 194 | else |
201 | return false; | 195 | return false; |
@@ -326,7 +320,6 @@ ipt_do_table(struct sk_buff *skb, | |||
326 | struct xt_table *table) | 320 | struct xt_table *table) |
327 | { | 321 | { |
328 | static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); | 322 | static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); |
329 | u_int16_t offset; | ||
330 | const struct iphdr *ip; | 323 | const struct iphdr *ip; |
331 | u_int16_t datalen; | 324 | u_int16_t datalen; |
332 | bool hotdrop = false; | 325 | bool hotdrop = false; |
@@ -336,6 +329,8 @@ ipt_do_table(struct sk_buff *skb, | |||
336 | void *table_base; | 329 | void *table_base; |
337 | struct ipt_entry *e, *back; | 330 | struct ipt_entry *e, *back; |
338 | struct xt_table_info *private; | 331 | struct xt_table_info *private; |
332 | struct xt_match_param mtpar; | ||
333 | struct xt_target_param tgpar; | ||
339 | 334 | ||
340 | /* Initialization */ | 335 | /* Initialization */ |
341 | ip = ip_hdr(skb); | 336 | ip = ip_hdr(skb); |
@@ -348,7 +343,13 @@ ipt_do_table(struct sk_buff *skb, | |||
348 | * things we don't know, ie. tcp syn flag or ports). If the | 343 | * things we don't know, ie. tcp syn flag or ports). If the |
349 | * rule is also a fragment-specific rule, non-fragments won't | 344 | * rule is also a fragment-specific rule, non-fragments won't |
350 | * match it. */ | 345 | * match it. */ |
351 | offset = ntohs(ip->frag_off) & IP_OFFSET; | 346 | mtpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; |
347 | mtpar.thoff = ip_hdrlen(skb); | ||
348 | mtpar.hotdrop = &hotdrop; | ||
349 | mtpar.in = tgpar.in = in; | ||
350 | mtpar.out = tgpar.out = out; | ||
351 | mtpar.family = tgpar.family = NFPROTO_IPV4; | ||
352 | tgpar.hooknum = hook; | ||
352 | 353 | ||
353 | read_lock_bh(&table->lock); | 354 | read_lock_bh(&table->lock); |
354 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); | 355 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); |
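ipt_do_table() mirrors the arptables conversion, but IPv4 needs both halves of the refactor: an xt_match_param for the match walk and an xt_target_param for the verdict. Both are filled once per packet here; do_match() only plugs in the per-rule match and matchinfo. The match-side structure, once more inferred from the call sites rather than quoted from the header:

/* Inferred per-packet match parameters. */
struct xt_match_param {
	const struct net_device *in, *out;
	const struct xt_match *match;	/* set per rule in do_match() */
	const void *matchinfo;		/* set per rule in do_match() */
	int fragoff;			/* IPv4 fragment offset       */
	unsigned int thoff;		/* transport header offset    */
	bool *hotdrop;			/* extension requests a drop  */
	u_int8_t family;
};

Folding the old offset/protoff/hotdrop argument triple into the struct also explains the icmp_match() edits further down: the match now reads par->fragoff and par->thoff and writes *par->hotdrop.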
@@ -362,12 +363,11 @@ ipt_do_table(struct sk_buff *skb, | |||
362 | do { | 363 | do { |
363 | IP_NF_ASSERT(e); | 364 | IP_NF_ASSERT(e); |
364 | IP_NF_ASSERT(back); | 365 | IP_NF_ASSERT(back); |
365 | if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { | 366 | if (ip_packet_match(ip, indev, outdev, |
367 | &e->ip, mtpar.fragoff)) { | ||
366 | struct ipt_entry_target *t; | 368 | struct ipt_entry_target *t; |
367 | 369 | ||
368 | if (IPT_MATCH_ITERATE(e, do_match, | 370 | if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0) |
369 | skb, in, out, | ||
370 | offset, &hotdrop) != 0) | ||
371 | goto no_match; | 371 | goto no_match; |
372 | 372 | ||
373 | ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); | 373 | ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); |
@@ -413,16 +413,14 @@ ipt_do_table(struct sk_buff *skb, | |||
413 | } else { | 413 | } else { |
414 | /* Targets which reenter must return | 414 | /* Targets which reenter must return |
415 | abs. verdicts */ | 415 | abs. verdicts */ |
416 | tgpar.target = t->u.kernel.target; | ||
417 | tgpar.targinfo = t->data; | ||
416 | #ifdef CONFIG_NETFILTER_DEBUG | 418 | #ifdef CONFIG_NETFILTER_DEBUG |
417 | ((struct ipt_entry *)table_base)->comefrom | 419 | ((struct ipt_entry *)table_base)->comefrom |
418 | = 0xeeeeeeec; | 420 | = 0xeeeeeeec; |
419 | #endif | 421 | #endif |
420 | verdict = t->u.kernel.target->target(skb, | 422 | verdict = t->u.kernel.target->target(skb, |
421 | in, out, | 423 | &tgpar); |
422 | hook, | ||
423 | t->u.kernel.target, | ||
424 | t->data); | ||
425 | |||
426 | #ifdef CONFIG_NETFILTER_DEBUG | 424 | #ifdef CONFIG_NETFILTER_DEBUG |
427 | if (((struct ipt_entry *)table_base)->comefrom | 425 | if (((struct ipt_entry *)table_base)->comefrom |
428 | != 0xeeeeeeec | 426 | != 0xeeeeeeec |
@@ -575,12 +573,17 @@ mark_source_chains(struct xt_table_info *newinfo, | |||
575 | static int | 573 | static int |
576 | cleanup_match(struct ipt_entry_match *m, unsigned int *i) | 574 | cleanup_match(struct ipt_entry_match *m, unsigned int *i) |
577 | { | 575 | { |
576 | struct xt_mtdtor_param par; | ||
577 | |||
578 | if (i && (*i)-- == 0) | 578 | if (i && (*i)-- == 0) |
579 | return 1; | 579 | return 1; |
580 | 580 | ||
581 | if (m->u.kernel.match->destroy) | 581 | par.match = m->u.kernel.match; |
582 | m->u.kernel.match->destroy(m->u.kernel.match, m->data); | 582 | par.matchinfo = m->data; |
583 | module_put(m->u.kernel.match->me); | 583 | par.family = NFPROTO_IPV4; |
584 | if (par.match->destroy != NULL) | ||
585 | par.match->destroy(&par); | ||
586 | module_put(par.match->me); | ||
584 | return 0; | 587 | return 0; |
585 | } | 588 | } |
586 | 589 | ||
@@ -606,34 +609,28 @@ check_entry(struct ipt_entry *e, const char *name) | |||
606 | } | 609 | } |
607 | 610 | ||
608 | static int | 611 | static int |
609 | check_match(struct ipt_entry_match *m, const char *name, | 612 | check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, |
610 | const struct ipt_ip *ip, | 613 | unsigned int *i) |
611 | unsigned int hookmask, unsigned int *i) | ||
612 | { | 614 | { |
613 | struct xt_match *match; | 615 | const struct ipt_ip *ip = par->entryinfo; |
614 | int ret; | 616 | int ret; |
615 | 617 | ||
616 | match = m->u.kernel.match; | 618 | par->match = m->u.kernel.match; |
617 | ret = xt_check_match(match, AF_INET, m->u.match_size - sizeof(*m), | 619 | par->matchinfo = m->data; |
618 | name, hookmask, ip->proto, | 620 | |
619 | ip->invflags & IPT_INV_PROTO); | 621 | ret = xt_check_match(par, m->u.match_size - sizeof(*m), |
620 | if (!ret && m->u.kernel.match->checkentry | 622 | ip->proto, ip->invflags & IPT_INV_PROTO); |
621 | && !m->u.kernel.match->checkentry(name, ip, match, m->data, | 623 | if (ret < 0) { |
622 | hookmask)) { | ||
623 | duprintf("ip_tables: check failed for `%s'.\n", | 624 | duprintf("ip_tables: check failed for `%s'.\n", |
624 | m->u.kernel.match->name); | 625 | par->match->name); |
625 | ret = -EINVAL; | 626 | return ret; |
626 | } | 627 | } |
627 | if (!ret) | 628 | ++*i; |
628 | (*i)++; | 629 | return 0; |
629 | return ret; | ||
630 | } | 630 | } |
631 | 631 | ||
632 | static int | 632 | static int |
633 | find_check_match(struct ipt_entry_match *m, | 633 | find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, |
634 | const char *name, | ||
635 | const struct ipt_ip *ip, | ||
636 | unsigned int hookmask, | ||
637 | unsigned int *i) | 634 | unsigned int *i) |
638 | { | 635 | { |
639 | struct xt_match *match; | 636 | struct xt_match *match; |
@@ -648,7 +645,7 @@ find_check_match(struct ipt_entry_match *m, | |||
648 | } | 645 | } |
649 | m->u.kernel.match = match; | 646 | m->u.kernel.match = match; |
650 | 647 | ||
651 | ret = check_match(m, name, ip, hookmask, i); | 648 | ret = check_match(m, par, i); |
652 | if (ret) | 649 | if (ret) |
653 | goto err; | 650 | goto err; |
654 | 651 | ||
@@ -660,23 +657,25 @@ err: | |||
660 | 657 | ||
661 | static int check_target(struct ipt_entry *e, const char *name) | 658 | static int check_target(struct ipt_entry *e, const char *name) |
662 | { | 659 | { |
663 | struct ipt_entry_target *t; | 660 | struct ipt_entry_target *t = ipt_get_target(e); |
664 | struct xt_target *target; | 661 | struct xt_tgchk_param par = { |
662 | .table = name, | ||
663 | .entryinfo = e, | ||
664 | .target = t->u.kernel.target, | ||
665 | .targinfo = t->data, | ||
666 | .hook_mask = e->comefrom, | ||
667 | .family = NFPROTO_IPV4, | ||
668 | }; | ||
665 | int ret; | 669 | int ret; |
666 | 670 | ||
667 | t = ipt_get_target(e); | 671 | ret = xt_check_target(&par, t->u.target_size - sizeof(*t), |
668 | target = t->u.kernel.target; | 672 | e->ip.proto, e->ip.invflags & IPT_INV_PROTO); |
669 | ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t), | 673 | if (ret < 0) { |
670 | name, e->comefrom, e->ip.proto, | ||
671 | e->ip.invflags & IPT_INV_PROTO); | ||
672 | if (!ret && t->u.kernel.target->checkentry | ||
673 | && !t->u.kernel.target->checkentry(name, e, target, t->data, | ||
674 | e->comefrom)) { | ||
675 | duprintf("ip_tables: check failed for `%s'.\n", | 674 | duprintf("ip_tables: check failed for `%s'.\n", |
676 | t->u.kernel.target->name); | 675 | t->u.kernel.target->name); |
677 | ret = -EINVAL; | 676 | return ret; |
678 | } | 677 | } |
679 | return ret; | 678 | return 0; |
680 | } | 679 | } |
681 | 680 | ||
682 | static int | 681 | static int |
@@ -687,14 +686,18 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, | |||
687 | struct xt_target *target; | 686 | struct xt_target *target; |
688 | int ret; | 687 | int ret; |
689 | unsigned int j; | 688 | unsigned int j; |
689 | struct xt_mtchk_param mtpar; | ||
690 | 690 | ||
691 | ret = check_entry(e, name); | 691 | ret = check_entry(e, name); |
692 | if (ret) | 692 | if (ret) |
693 | return ret; | 693 | return ret; |
694 | 694 | ||
695 | j = 0; | 695 | j = 0; |
696 | ret = IPT_MATCH_ITERATE(e, find_check_match, name, &e->ip, | 696 | mtpar.table = name; |
697 | e->comefrom, &j); | 697 | mtpar.entryinfo = &e->ip; |
698 | mtpar.hook_mask = e->comefrom; | ||
699 | mtpar.family = NFPROTO_IPV4; | ||
700 | ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j); | ||
698 | if (ret != 0) | 701 | if (ret != 0) |
699 | goto cleanup_matches; | 702 | goto cleanup_matches; |
700 | 703 | ||
@@ -769,6 +772,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, | |||
769 | static int | 772 | static int |
770 | cleanup_entry(struct ipt_entry *e, unsigned int *i) | 773 | cleanup_entry(struct ipt_entry *e, unsigned int *i) |
771 | { | 774 | { |
775 | struct xt_tgdtor_param par; | ||
772 | struct ipt_entry_target *t; | 776 | struct ipt_entry_target *t; |
773 | 777 | ||
774 | if (i && (*i)-- == 0) | 778 | if (i && (*i)-- == 0) |
@@ -777,9 +781,13 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i) | |||
777 | /* Cleanup all matches */ | 781 | /* Cleanup all matches */ |
778 | IPT_MATCH_ITERATE(e, cleanup_match, NULL); | 782 | IPT_MATCH_ITERATE(e, cleanup_match, NULL); |
779 | t = ipt_get_target(e); | 783 | t = ipt_get_target(e); |
780 | if (t->u.kernel.target->destroy) | 784 | |
781 | t->u.kernel.target->destroy(t->u.kernel.target, t->data); | 785 | par.target = t->u.kernel.target; |
782 | module_put(t->u.kernel.target->me); | 786 | par.targinfo = t->data; |
787 | par.family = NFPROTO_IPV4; | ||
788 | if (par.target->destroy != NULL) | ||
789 | par.target->destroy(&par); | ||
790 | module_put(par.target->me); | ||
783 | return 0; | 791 | return 0; |
784 | } | 792 | } |
785 | 793 | ||
@@ -1648,12 +1656,16 @@ static int | |||
1648 | compat_check_entry(struct ipt_entry *e, const char *name, | 1656 | compat_check_entry(struct ipt_entry *e, const char *name, |
1649 | unsigned int *i) | 1657 | unsigned int *i) |
1650 | { | 1658 | { |
1659 | struct xt_mtchk_param mtpar; | ||
1651 | unsigned int j; | 1660 | unsigned int j; |
1652 | int ret; | 1661 | int ret; |
1653 | 1662 | ||
1654 | j = 0; | 1663 | j = 0; |
1655 | ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, | 1664 | mtpar.table = name; |
1656 | e->comefrom, &j); | 1665 | mtpar.entryinfo = &e->ip; |
1666 | mtpar.hook_mask = e->comefrom; | ||
1667 | mtpar.family = NFPROTO_IPV4; | ||
1668 | ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j); | ||
1657 | if (ret) | 1669 | if (ret) |
1658 | goto cleanup_matches; | 1670 | goto cleanup_matches; |
1659 | 1671 | ||
@@ -2121,30 +2133,23 @@ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, | |||
2121 | } | 2133 | } |
2122 | 2134 | ||
2123 | static bool | 2135 | static bool |
2124 | icmp_match(const struct sk_buff *skb, | 2136 | icmp_match(const struct sk_buff *skb, const struct xt_match_param *par) |
2125 | const struct net_device *in, | ||
2126 | const struct net_device *out, | ||
2127 | const struct xt_match *match, | ||
2128 | const void *matchinfo, | ||
2129 | int offset, | ||
2130 | unsigned int protoff, | ||
2131 | bool *hotdrop) | ||
2132 | { | 2137 | { |
2133 | const struct icmphdr *ic; | 2138 | const struct icmphdr *ic; |
2134 | struct icmphdr _icmph; | 2139 | struct icmphdr _icmph; |
2135 | const struct ipt_icmp *icmpinfo = matchinfo; | 2140 | const struct ipt_icmp *icmpinfo = par->matchinfo; |
2136 | 2141 | ||
2137 | /* Must not be a fragment. */ | 2142 | /* Must not be a fragment. */ |
2138 | if (offset) | 2143 | if (par->fragoff != 0) |
2139 | return false; | 2144 | return false; |
2140 | 2145 | ||
2141 | ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph); | 2146 | ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); |
2142 | if (ic == NULL) { | 2147 | if (ic == NULL) { |
2143 | /* We've been asked to examine this packet, and we | 2148 | /* We've been asked to examine this packet, and we |
2144 | * can't. Hence, no choice but to drop. | 2149 | * can't. Hence, no choice but to drop. |
2145 | */ | 2150 | */ |
2146 | duprintf("Dropping evil ICMP tinygram.\n"); | 2151 | duprintf("Dropping evil ICMP tinygram.\n"); |
2147 | *hotdrop = true; | 2152 | *par->hotdrop = true; |
2148 | return false; | 2153 | return false; |
2149 | } | 2154 | } |
2150 | 2155 | ||
@@ -2155,15 +2160,9 @@ icmp_match(const struct sk_buff *skb, | |||
2155 | !!(icmpinfo->invflags&IPT_ICMP_INV)); | 2160 | !!(icmpinfo->invflags&IPT_ICMP_INV)); |
2156 | } | 2161 | } |
2157 | 2162 | ||
2158 | /* Called when user tries to insert an entry of this type. */ | 2163 | static bool icmp_checkentry(const struct xt_mtchk_param *par) |
2159 | static bool | ||
2160 | icmp_checkentry(const char *tablename, | ||
2161 | const void *entry, | ||
2162 | const struct xt_match *match, | ||
2163 | void *matchinfo, | ||
2164 | unsigned int hook_mask) | ||
2165 | { | 2164 | { |
2166 | const struct ipt_icmp *icmpinfo = matchinfo; | 2165 | const struct ipt_icmp *icmpinfo = par->matchinfo; |
2167 | 2166 | ||
2168 | /* Must specify no unknown invflags */ | 2167 | /* Must specify no unknown invflags */ |
2169 | return !(icmpinfo->invflags & ~IPT_ICMP_INV); | 2168 | return !(icmpinfo->invflags & ~IPT_ICMP_INV); |
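Put together, the new prototypes make a minimal match extension look like the following skeleton (an illustrative out-of-tree module written against the refactored API, not code from this patch):

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>

/* Toy match that accepts every packet; it exists only to show the
 * new callback shapes match(skb, par) and checkentry(par). */
static bool
example_mt(const struct sk_buff *skb, const struct xt_match_param *par)
{
	/* par->matchinfo, par->fragoff and par->thoff are available
	 * here; setting *par->hotdrop = true drops the packet. */
	return true;
}

static bool example_mt_check(const struct xt_mtchk_param *par)
{
	/* Validate par->matchinfo; returning false rejects the rule. */
	return true;
}

static struct xt_match example_mt_reg __read_mostly = {
	.name       = "example",
	.family     = NFPROTO_IPV4,
	.match      = example_mt,
	.checkentry = example_mt_check,
	.me         = THIS_MODULE,
};

static int __init example_mt_init(void)
{
	return xt_register_match(&example_mt_reg);
}

static void __exit example_mt_exit(void)
{
	xt_unregister_match(&example_mt_reg);
}

module_init(example_mt_init);
module_exit(example_mt_exit);
MODULE_LICENSE("GPL");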
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index fafe8ebb4c55..7ac1677419a9 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c | |||
@@ -281,11 +281,9 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) | |||
281 | ***********************************************************************/ | 281 | ***********************************************************************/ |
282 | 282 | ||
283 | static unsigned int | 283 | static unsigned int |
284 | clusterip_tg(struct sk_buff *skb, const struct net_device *in, | 284 | clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par) |
285 | const struct net_device *out, unsigned int hooknum, | ||
286 | const struct xt_target *target, const void *targinfo) | ||
287 | { | 285 | { |
288 | const struct ipt_clusterip_tgt_info *cipinfo = targinfo; | 286 | const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; |
289 | struct nf_conn *ct; | 287 | struct nf_conn *ct; |
290 | enum ip_conntrack_info ctinfo; | 288 | enum ip_conntrack_info ctinfo; |
291 | u_int32_t hash; | 289 | u_int32_t hash; |
@@ -349,13 +347,10 @@ clusterip_tg(struct sk_buff *skb, const struct net_device *in, | |||
349 | return XT_CONTINUE; | 347 | return XT_CONTINUE; |
350 | } | 348 | } |
351 | 349 | ||
352 | static bool | 350 | static bool clusterip_tg_check(const struct xt_tgchk_param *par) |
353 | clusterip_tg_check(const char *tablename, const void *e_void, | ||
354 | const struct xt_target *target, void *targinfo, | ||
355 | unsigned int hook_mask) | ||
356 | { | 351 | { |
357 | struct ipt_clusterip_tgt_info *cipinfo = targinfo; | 352 | struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; |
358 | const struct ipt_entry *e = e_void; | 353 | const struct ipt_entry *e = par->entryinfo; |
359 | 354 | ||
360 | struct clusterip_config *config; | 355 | struct clusterip_config *config; |
361 | 356 | ||
@@ -406,9 +401,9 @@ clusterip_tg_check(const char *tablename, const void *e_void, | |||
406 | } | 401 | } |
407 | cipinfo->config = config; | 402 | cipinfo->config = config; |
408 | 403 | ||
409 | if (nf_ct_l3proto_try_module_get(target->family) < 0) { | 404 | if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { |
410 | printk(KERN_WARNING "can't load conntrack support for " | 405 | printk(KERN_WARNING "can't load conntrack support for " |
411 | "proto=%u\n", target->family); | 406 | "proto=%u\n", par->target->family); |
412 | return false; | 407 | return false; |
413 | } | 408 | } |
414 | 409 | ||
@@ -416,9 +411,9 @@ clusterip_tg_check(const char *tablename, const void *e_void, | |||
416 | } | 411 | } |
417 | 412 | ||
418 | /* drop reference count of cluster config when rule is deleted */ | 413 | /* drop reference count of cluster config when rule is deleted */ |
419 | static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo) | 414 | static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) |
420 | { | 415 | { |
421 | const struct ipt_clusterip_tgt_info *cipinfo = targinfo; | 416 | const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; |
422 | 417 | ||
423 | /* if no more entries are referencing the config, remove it | 418 | /* if no more entries are referencing the config, remove it |
424 | * from the list and destroy the proc entry */ | 419 | * from the list and destroy the proc entry */ |
@@ -426,7 +421,7 @@ static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo) | |||
426 | 421 | ||
427 | clusterip_config_put(cipinfo->config); | 422 | clusterip_config_put(cipinfo->config); |
428 | 423 | ||
429 | nf_ct_l3proto_module_put(target->family); | 424 | nf_ct_l3proto_module_put(par->target->family); |
430 | } | 425 | } |
431 | 426 | ||
432 | #ifdef CONFIG_COMPAT | 427 | #ifdef CONFIG_COMPAT |
@@ -445,7 +440,7 @@ struct compat_ipt_clusterip_tgt_info | |||
445 | 440 | ||
446 | static struct xt_target clusterip_tg_reg __read_mostly = { | 441 | static struct xt_target clusterip_tg_reg __read_mostly = { |
447 | .name = "CLUSTERIP", | 442 | .name = "CLUSTERIP", |
448 | .family = AF_INET, | 443 | .family = NFPROTO_IPV4, |
449 | .target = clusterip_tg, | 444 | .target = clusterip_tg, |
450 | .checkentry = clusterip_tg_check, | 445 | .checkentry = clusterip_tg_check, |
451 | .destroy = clusterip_tg_destroy, | 446 | .destroy = clusterip_tg_destroy, |
@@ -546,7 +541,7 @@ arp_mangle(unsigned int hook, | |||
546 | 541 | ||
547 | static struct nf_hook_ops cip_arp_ops __read_mostly = { | 542 | static struct nf_hook_ops cip_arp_ops __read_mostly = { |
548 | .hook = arp_mangle, | 543 | .hook = arp_mangle, |
549 | .pf = NF_ARP, | 544 | .pf = NFPROTO_ARP, |
550 | .hooknum = NF_ARP_OUT, | 545 | .hooknum = NF_ARP_OUT, |
551 | .priority = -1 | 546 | .priority = -1 |
552 | }; | 547 | }; |
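
CLUSTERIP exercises all three target-side parameter blocks in one file: xt_target_param on the packet path, xt_tgchk_param in checkentry (par->entryinfo replaces the untyped e_void), and xt_tgdtor_param in destroy. Sketched under the same caveat as the match-side blocks above:

    struct xt_target_param {
            const struct net_device *in, *out;
            unsigned int hooknum;
            const struct xt_target *target;
            const void *targinfo;
    };

    struct xt_tgchk_param {
            const char *table;
            const void *entryinfo;   /* a struct ipt_entry here */
            const struct xt_target *target;
            void *targinfo;
            unsigned int hook_mask;
    };

    struct xt_tgdtor_param {
            const struct xt_target *target;
            void *targinfo;
    };

The .family and .pf renames travel with this series: for IPv4 the switch from AF_INET to NFPROTO_IPV4 is value-preserving, while the ARP hook moves from the legacy NF_ARP constant onto NFPROTO_ARP, which, if memory serves, has a different numeric value that the core hook code was taught to accept.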
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index d60139c134ca..f7e2fa0974dc 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c | |||
@@ -77,11 +77,9 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) | |||
77 | } | 77 | } |
78 | 78 | ||
79 | static unsigned int | 79 | static unsigned int |
80 | ecn_tg(struct sk_buff *skb, const struct net_device *in, | 80 | ecn_tg(struct sk_buff *skb, const struct xt_target_param *par) |
81 | const struct net_device *out, unsigned int hooknum, | ||
82 | const struct xt_target *target, const void *targinfo) | ||
83 | { | 81 | { |
84 | const struct ipt_ECN_info *einfo = targinfo; | 82 | const struct ipt_ECN_info *einfo = par->targinfo; |
85 | 83 | ||
86 | if (einfo->operation & IPT_ECN_OP_SET_IP) | 84 | if (einfo->operation & IPT_ECN_OP_SET_IP) |
87 | if (!set_ect_ip(skb, einfo)) | 85 | if (!set_ect_ip(skb, einfo)) |
@@ -95,13 +93,10 @@ ecn_tg(struct sk_buff *skb, const struct net_device *in, | |||
95 | return XT_CONTINUE; | 93 | return XT_CONTINUE; |
96 | } | 94 | } |
97 | 95 | ||
98 | static bool | 96 | static bool ecn_tg_check(const struct xt_tgchk_param *par) |
99 | ecn_tg_check(const char *tablename, const void *e_void, | ||
100 | const struct xt_target *target, void *targinfo, | ||
101 | unsigned int hook_mask) | ||
102 | { | 97 | { |
103 | const struct ipt_ECN_info *einfo = targinfo; | 98 | const struct ipt_ECN_info *einfo = par->targinfo; |
104 | const struct ipt_entry *e = e_void; | 99 | const struct ipt_entry *e = par->entryinfo; |
105 | 100 | ||
106 | if (einfo->operation & IPT_ECN_OP_MASK) { | 101 | if (einfo->operation & IPT_ECN_OP_MASK) { |
107 | printk(KERN_WARNING "ECN: unsupported ECN operation %x\n", | 102 | printk(KERN_WARNING "ECN: unsupported ECN operation %x\n", |
@@ -124,7 +119,7 @@ ecn_tg_check(const char *tablename, const void *e_void, | |||
124 | 119 | ||
125 | static struct xt_target ecn_tg_reg __read_mostly = { | 120 | static struct xt_target ecn_tg_reg __read_mostly = { |
126 | .name = "ECN", | 121 | .name = "ECN", |
127 | .family = AF_INET, | 122 | .family = NFPROTO_IPV4, |
128 | .target = ecn_tg, | 123 | .target = ecn_tg, |
129 | .targetsize = sizeof(struct ipt_ECN_info), | 124 | .targetsize = sizeof(struct ipt_ECN_info), |
130 | .table = "mangle", | 125 | .table = "mangle", |
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 0af14137137b..fc6ce04a3e35 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c | |||
@@ -375,7 +375,7 @@ static struct nf_loginfo default_loginfo = { | |||
375 | }; | 375 | }; |
376 | 376 | ||
377 | static void | 377 | static void |
378 | ipt_log_packet(unsigned int pf, | 378 | ipt_log_packet(u_int8_t pf, |
379 | unsigned int hooknum, | 379 | unsigned int hooknum, |
380 | const struct sk_buff *skb, | 380 | const struct sk_buff *skb, |
381 | const struct net_device *in, | 381 | const struct net_device *in, |
@@ -426,28 +426,23 @@ ipt_log_packet(unsigned int pf, | |||
426 | } | 426 | } |
427 | 427 | ||
428 | static unsigned int | 428 | static unsigned int |
429 | log_tg(struct sk_buff *skb, const struct net_device *in, | 429 | log_tg(struct sk_buff *skb, const struct xt_target_param *par) |
430 | const struct net_device *out, unsigned int hooknum, | ||
431 | const struct xt_target *target, const void *targinfo) | ||
432 | { | 430 | { |
433 | const struct ipt_log_info *loginfo = targinfo; | 431 | const struct ipt_log_info *loginfo = par->targinfo; |
434 | struct nf_loginfo li; | 432 | struct nf_loginfo li; |
435 | 433 | ||
436 | li.type = NF_LOG_TYPE_LOG; | 434 | li.type = NF_LOG_TYPE_LOG; |
437 | li.u.log.level = loginfo->level; | 435 | li.u.log.level = loginfo->level; |
438 | li.u.log.logflags = loginfo->logflags; | 436 | li.u.log.logflags = loginfo->logflags; |
439 | 437 | ||
440 | ipt_log_packet(PF_INET, hooknum, skb, in, out, &li, | 438 | ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, par->out, &li, |
441 | loginfo->prefix); | 439 | loginfo->prefix); |
442 | return XT_CONTINUE; | 440 | return XT_CONTINUE; |
443 | } | 441 | } |
444 | 442 | ||
445 | static bool | 443 | static bool log_tg_check(const struct xt_tgchk_param *par) |
446 | log_tg_check(const char *tablename, const void *e, | ||
447 | const struct xt_target *target, void *targinfo, | ||
448 | unsigned int hook_mask) | ||
449 | { | 444 | { |
450 | const struct ipt_log_info *loginfo = targinfo; | 445 | const struct ipt_log_info *loginfo = par->targinfo; |
451 | 446 | ||
452 | if (loginfo->level >= 8) { | 447 | if (loginfo->level >= 8) { |
453 | pr_debug("LOG: level %u >= 8\n", loginfo->level); | 448 | pr_debug("LOG: level %u >= 8\n", loginfo->level); |
@@ -463,7 +458,7 @@ log_tg_check(const char *tablename, const void *e, | |||
463 | 458 | ||
464 | static struct xt_target log_tg_reg __read_mostly = { | 459 | static struct xt_target log_tg_reg __read_mostly = { |
465 | .name = "LOG", | 460 | .name = "LOG", |
466 | .family = AF_INET, | 461 | .family = NFPROTO_IPV4, |
467 | .target = log_tg, | 462 | .target = log_tg, |
468 | .targetsize = sizeof(struct ipt_log_info), | 463 | .targetsize = sizeof(struct ipt_log_info), |
469 | .checkentry = log_tg_check, | 464 | .checkentry = log_tg_check, |
@@ -483,7 +478,7 @@ static int __init log_tg_init(void) | |||
483 | ret = xt_register_target(&log_tg_reg); | 478 | ret = xt_register_target(&log_tg_reg); |
484 | if (ret < 0) | 479 | if (ret < 0) |
485 | return ret; | 480 | return ret; |
486 | nf_log_register(PF_INET, &ipt_log_logger); | 481 | nf_log_register(NFPROTO_IPV4, &ipt_log_logger); |
487 | return 0; | 482 | return 0; |
488 | } | 483 | } |
489 | 484 | ||
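
ipt_LOG picks up a second, orthogonal cleanup besides the target conversion: the protocol-family argument of the logging path is narrowed from unsigned int to u_int8_t (families fit in a byte), and nf_log_register() is called with NFPROTO_IPV4. The logger hook this file implements should now match the following — a sketch, assuming the nf_logfn typedef of this series:

    typedef void nf_logfn(u_int8_t pf, unsigned int hooknum,
                          const struct sk_buff *skb,
                          const struct net_device *in,
                          const struct net_device *out,
                          const struct nf_loginfo *li,
                          const char *prefix);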
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 0841aefaa503..f389f60cb105 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c | |||
@@ -31,12 +31,9 @@ MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); | |||
31 | static DEFINE_RWLOCK(masq_lock); | 31 | static DEFINE_RWLOCK(masq_lock); |
32 | 32 | ||
33 | /* FIXME: Multiple targets. --RR */ | 33 | /* FIXME: Multiple targets. --RR */ |
34 | static bool | 34 | static bool masquerade_tg_check(const struct xt_tgchk_param *par) |
35 | masquerade_tg_check(const char *tablename, const void *e, | ||
36 | const struct xt_target *target, void *targinfo, | ||
37 | unsigned int hook_mask) | ||
38 | { | 35 | { |
39 | const struct nf_nat_multi_range_compat *mr = targinfo; | 36 | const struct nf_nat_multi_range_compat *mr = par->targinfo; |
40 | 37 | ||
41 | if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { | 38 | if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { |
42 | pr_debug("masquerade_check: bad MAP_IPS.\n"); | 39 | pr_debug("masquerade_check: bad MAP_IPS.\n"); |
@@ -50,9 +47,7 @@ masquerade_tg_check(const char *tablename, const void *e, | |||
50 | } | 47 | } |
51 | 48 | ||
52 | static unsigned int | 49 | static unsigned int |
53 | masquerade_tg(struct sk_buff *skb, const struct net_device *in, | 50 | masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par) |
54 | const struct net_device *out, unsigned int hooknum, | ||
55 | const struct xt_target *target, const void *targinfo) | ||
56 | { | 51 | { |
57 | struct nf_conn *ct; | 52 | struct nf_conn *ct; |
58 | struct nf_conn_nat *nat; | 53 | struct nf_conn_nat *nat; |
@@ -62,7 +57,7 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in, | |||
62 | const struct rtable *rt; | 57 | const struct rtable *rt; |
63 | __be32 newsrc; | 58 | __be32 newsrc; |
64 | 59 | ||
65 | NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); | 60 | NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); |
66 | 61 | ||
67 | ct = nf_ct_get(skb, &ctinfo); | 62 | ct = nf_ct_get(skb, &ctinfo); |
68 | nat = nfct_nat(ct); | 63 | nat = nfct_nat(ct); |
@@ -76,16 +71,16 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in, | |||
76 | if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) | 71 | if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) |
77 | return NF_ACCEPT; | 72 | return NF_ACCEPT; |
78 | 73 | ||
79 | mr = targinfo; | 74 | mr = par->targinfo; |
80 | rt = skb->rtable; | 75 | rt = skb->rtable; |
81 | newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE); | 76 | newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE); |
82 | if (!newsrc) { | 77 | if (!newsrc) { |
83 | printk("MASQUERADE: %s ate my IP address\n", out->name); | 78 | printk("MASQUERADE: %s ate my IP address\n", par->out->name); |
84 | return NF_DROP; | 79 | return NF_DROP; |
85 | } | 80 | } |
86 | 81 | ||
87 | write_lock_bh(&masq_lock); | 82 | write_lock_bh(&masq_lock); |
88 | nat->masq_index = out->ifindex; | 83 | nat->masq_index = par->out->ifindex; |
89 | write_unlock_bh(&masq_lock); | 84 | write_unlock_bh(&masq_lock); |
90 | 85 | ||
91 | /* Transfer from original range. */ | 86 | /* Transfer from original range. */ |
@@ -119,9 +114,7 @@ static int masq_device_event(struct notifier_block *this, | |||
119 | void *ptr) | 114 | void *ptr) |
120 | { | 115 | { |
121 | const struct net_device *dev = ptr; | 116 | const struct net_device *dev = ptr; |
122 | 117 | struct net *net = dev_net(dev); | |
123 | if (!net_eq(dev_net(dev), &init_net)) | ||
124 | return NOTIFY_DONE; | ||
125 | 118 | ||
126 | if (event == NETDEV_DOWN) { | 119 | if (event == NETDEV_DOWN) { |
127 | /* Device was downed. Search entire table for | 120 | /* Device was downed. Search entire table for |
@@ -129,7 +122,8 @@ static int masq_device_event(struct notifier_block *this, | |||
129 | and forget them. */ | 122 | and forget them. */ |
130 | NF_CT_ASSERT(dev->ifindex != 0); | 123 | NF_CT_ASSERT(dev->ifindex != 0); |
131 | 124 | ||
132 | nf_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); | 125 | nf_ct_iterate_cleanup(net, device_cmp, |
126 | (void *)(long)dev->ifindex); | ||
133 | } | 127 | } |
134 | 128 | ||
135 | return NOTIFY_DONE; | 129 | return NOTIFY_DONE; |
@@ -153,7 +147,7 @@ static struct notifier_block masq_inet_notifier = { | |||
153 | 147 | ||
154 | static struct xt_target masquerade_tg_reg __read_mostly = { | 148 | static struct xt_target masquerade_tg_reg __read_mostly = { |
155 | .name = "MASQUERADE", | 149 | .name = "MASQUERADE", |
156 | .family = AF_INET, | 150 | .family = NFPROTO_IPV4, |
157 | .target = masquerade_tg, | 151 | .target = masquerade_tg, |
158 | .targetsize = sizeof(struct nf_nat_multi_range_compat), | 152 | .targetsize = sizeof(struct nf_nat_multi_range_compat), |
159 | .table = "nat", | 153 | .table = "nat", |
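
Besides the target conversion, MASQUERADE gets a netns fix: instead of bailing out for devices outside init_net, the notifier resolves the device's own namespace and hands it to nf_ct_iterate_cleanup(), whose signature grew a struct net * in this series. The iterator callback is outside the hunk; presumably it keys on the masq_index recorded on the packet path, roughly as follows (locking elided, shape assumed):

    /* Assumed shape of the unshown callback: flag any conntrack that
     * was masqueraded out of the interface that just went down. */
    static int device_cmp(struct nf_conn *ct, void *ifindex)
    {
            const struct nf_conn_nat *nat = nfct_nat(ct);

            if (nat == NULL)
                    return 0;
            return nat->masq_index == (int)(long)ifindex;
    }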
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index 6739abfd1521..7c29582d4ec8 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c | |||
@@ -22,12 +22,9 @@ MODULE_LICENSE("GPL"); | |||
22 | MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>"); | 22 | MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>"); |
23 | MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); | 23 | MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); |
24 | 24 | ||
25 | static bool | 25 | static bool netmap_tg_check(const struct xt_tgchk_param *par) |
26 | netmap_tg_check(const char *tablename, const void *e, | ||
27 | const struct xt_target *target, void *targinfo, | ||
28 | unsigned int hook_mask) | ||
29 | { | 26 | { |
30 | const struct nf_nat_multi_range_compat *mr = targinfo; | 27 | const struct nf_nat_multi_range_compat *mr = par->targinfo; |
31 | 28 | ||
32 | if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { | 29 | if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { |
33 | pr_debug("NETMAP:check: bad MAP_IPS.\n"); | 30 | pr_debug("NETMAP:check: bad MAP_IPS.\n"); |
@@ -41,24 +38,23 @@ netmap_tg_check(const char *tablename, const void *e, | |||
41 | } | 38 | } |
42 | 39 | ||
43 | static unsigned int | 40 | static unsigned int |
44 | netmap_tg(struct sk_buff *skb, const struct net_device *in, | 41 | netmap_tg(struct sk_buff *skb, const struct xt_target_param *par) |
45 | const struct net_device *out, unsigned int hooknum, | ||
46 | const struct xt_target *target, const void *targinfo) | ||
47 | { | 42 | { |
48 | struct nf_conn *ct; | 43 | struct nf_conn *ct; |
49 | enum ip_conntrack_info ctinfo; | 44 | enum ip_conntrack_info ctinfo; |
50 | __be32 new_ip, netmask; | 45 | __be32 new_ip, netmask; |
51 | const struct nf_nat_multi_range_compat *mr = targinfo; | 46 | const struct nf_nat_multi_range_compat *mr = par->targinfo; |
52 | struct nf_nat_range newrange; | 47 | struct nf_nat_range newrange; |
53 | 48 | ||
54 | NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING | 49 | NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || |
55 | || hooknum == NF_INET_POST_ROUTING | 50 | par->hooknum == NF_INET_POST_ROUTING || |
56 | || hooknum == NF_INET_LOCAL_OUT); | 51 | par->hooknum == NF_INET_LOCAL_OUT); |
57 | ct = nf_ct_get(skb, &ctinfo); | 52 | ct = nf_ct_get(skb, &ctinfo); |
58 | 53 | ||
59 | netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); | 54 | netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); |
60 | 55 | ||
61 | if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_OUT) | 56 | if (par->hooknum == NF_INET_PRE_ROUTING || |
57 | par->hooknum == NF_INET_LOCAL_OUT) | ||
62 | new_ip = ip_hdr(skb)->daddr & ~netmask; | 58 | new_ip = ip_hdr(skb)->daddr & ~netmask; |
63 | else | 59 | else |
64 | new_ip = ip_hdr(skb)->saddr & ~netmask; | 60 | new_ip = ip_hdr(skb)->saddr & ~netmask; |
@@ -70,12 +66,12 @@ netmap_tg(struct sk_buff *skb, const struct net_device *in, | |||
70 | mr->range[0].min, mr->range[0].max }); | 66 | mr->range[0].min, mr->range[0].max }); |
71 | 67 | ||
72 | /* Hand modified range to generic setup. */ | 68 | /* Hand modified range to generic setup. */ |
73 | return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(hooknum)); | 69 | return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum)); |
74 | } | 70 | } |
75 | 71 | ||
76 | static struct xt_target netmap_tg_reg __read_mostly = { | 72 | static struct xt_target netmap_tg_reg __read_mostly = { |
77 | .name = "NETMAP", | 73 | .name = "NETMAP", |
78 | .family = AF_INET, | 74 | .family = NFPROTO_IPV4, |
79 | .target = netmap_tg, | 75 | .target = netmap_tg, |
80 | .targetsize = sizeof(struct nf_nat_multi_range_compat), | 76 | .targetsize = sizeof(struct nf_nat_multi_range_compat), |
81 | .table = "nat", | 77 | .table = "nat", |
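
The NETMAP netmask trick deserves a worked example. For a rule mapping onto 192.168.1.0-192.168.1.255, min_ip ^ max_ip leaves only the host bits set, so the complement is the network mask; the rewritten address keeps the host bits of the packet and takes the network bits from the configured range (the OR with the range's network bits sits in the lines elided between these two hunks). Values shown in host byte order for readability; the module itself operates on __be32:

    u32 min_ip  = 0xc0a80100;            /* 192.168.1.0   */
    u32 max_ip  = 0xc0a801ff;            /* 192.168.1.255 */
    u32 netmask = ~(min_ip ^ max_ip);    /* 0xffffff00    */
    u32 daddr   = 0x0a000007;            /* 10.0.0.7      */
    u32 new_ip  = (daddr & ~netmask)     /* host bits: .7 */
                | (min_ip & netmask);    /* -> 192.168.1.7 */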
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c index 5c6292449d13..698e5e78685b 100644 --- a/net/ipv4/netfilter/ipt_REDIRECT.c +++ b/net/ipv4/netfilter/ipt_REDIRECT.c | |||
@@ -26,12 +26,9 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); | |||
26 | MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); | 26 | MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); |
27 | 27 | ||
28 | /* FIXME: Take multiple ranges --RR */ | 28 | /* FIXME: Take multiple ranges --RR */ |
29 | static bool | 29 | static bool redirect_tg_check(const struct xt_tgchk_param *par) |
30 | redirect_tg_check(const char *tablename, const void *e, | ||
31 | const struct xt_target *target, void *targinfo, | ||
32 | unsigned int hook_mask) | ||
33 | { | 30 | { |
34 | const struct nf_nat_multi_range_compat *mr = targinfo; | 31 | const struct nf_nat_multi_range_compat *mr = par->targinfo; |
35 | 32 | ||
36 | if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { | 33 | if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { |
37 | pr_debug("redirect_check: bad MAP_IPS.\n"); | 34 | pr_debug("redirect_check: bad MAP_IPS.\n"); |
@@ -45,24 +42,22 @@ redirect_tg_check(const char *tablename, const void *e, | |||
45 | } | 42 | } |
46 | 43 | ||
47 | static unsigned int | 44 | static unsigned int |
48 | redirect_tg(struct sk_buff *skb, const struct net_device *in, | 45 | redirect_tg(struct sk_buff *skb, const struct xt_target_param *par) |
49 | const struct net_device *out, unsigned int hooknum, | ||
50 | const struct xt_target *target, const void *targinfo) | ||
51 | { | 46 | { |
52 | struct nf_conn *ct; | 47 | struct nf_conn *ct; |
53 | enum ip_conntrack_info ctinfo; | 48 | enum ip_conntrack_info ctinfo; |
54 | __be32 newdst; | 49 | __be32 newdst; |
55 | const struct nf_nat_multi_range_compat *mr = targinfo; | 50 | const struct nf_nat_multi_range_compat *mr = par->targinfo; |
56 | struct nf_nat_range newrange; | 51 | struct nf_nat_range newrange; |
57 | 52 | ||
58 | NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING | 53 | NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || |
59 | || hooknum == NF_INET_LOCAL_OUT); | 54 | par->hooknum == NF_INET_LOCAL_OUT); |
60 | 55 | ||
61 | ct = nf_ct_get(skb, &ctinfo); | 56 | ct = nf_ct_get(skb, &ctinfo); |
62 | NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); | 57 | NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); |
63 | 58 | ||
64 | /* Local packets: make them go to loopback */ | 59 | /* Local packets: make them go to loopback */ |
65 | if (hooknum == NF_INET_LOCAL_OUT) | 60 | if (par->hooknum == NF_INET_LOCAL_OUT) |
66 | newdst = htonl(0x7F000001); | 61 | newdst = htonl(0x7F000001); |
67 | else { | 62 | else { |
68 | struct in_device *indev; | 63 | struct in_device *indev; |
@@ -92,7 +87,7 @@ redirect_tg(struct sk_buff *skb, const struct net_device *in, | |||
92 | 87 | ||
93 | static struct xt_target redirect_tg_reg __read_mostly = { | 88 | static struct xt_target redirect_tg_reg __read_mostly = { |
94 | .name = "REDIRECT", | 89 | .name = "REDIRECT", |
95 | .family = AF_INET, | 90 | .family = NFPROTO_IPV4, |
96 | .target = redirect_tg, | 91 | .target = redirect_tg, |
97 | .targetsize = sizeof(struct nf_nat_multi_range_compat), | 92 | .targetsize = sizeof(struct nf_nat_multi_range_compat), |
98 | .table = "nat", | 93 | .table = "nat", |
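
In REDIRECT, htonl(0x7F000001) is 127.0.0.1, covering locally generated packets; the non-local branch is elided by the hunk, but from context it picks the primary address of the receiving interface. A reconstruction under that assumption:

    /* Sketch of the elided else-branch: redirect to the primary
     * address of the interface the packet arrived on. */
    struct in_device *indev;
    struct in_ifaddr *ifa;

    newdst = 0;
    rcu_read_lock();
    indev = __in_dev_get_rcu(skb->dev);
    if (indev != NULL && (ifa = indev->ifa_list) != NULL)
            newdst = ifa->ifa_local;
    rcu_read_unlock();
    if (!newdst)
            return NF_DROP;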
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 2639872849da..0b4b6e0ff2b9 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c | |||
@@ -136,11 +136,9 @@ static inline void send_unreach(struct sk_buff *skb_in, int code) | |||
136 | } | 136 | } |
137 | 137 | ||
138 | static unsigned int | 138 | static unsigned int |
139 | reject_tg(struct sk_buff *skb, const struct net_device *in, | 139 | reject_tg(struct sk_buff *skb, const struct xt_target_param *par) |
140 | const struct net_device *out, unsigned int hooknum, | ||
141 | const struct xt_target *target, const void *targinfo) | ||
142 | { | 140 | { |
143 | const struct ipt_reject_info *reject = targinfo; | 141 | const struct ipt_reject_info *reject = par->targinfo; |
144 | 142 | ||
145 | /* WARNING: This code causes reentry within iptables. | 143 | /* WARNING: This code causes reentry within iptables. |
146 | This means that the iptables jump stack is now crap. We | 144 | This means that the iptables jump stack is now crap. We |
@@ -168,7 +166,7 @@ reject_tg(struct sk_buff *skb, const struct net_device *in, | |||
168 | send_unreach(skb, ICMP_PKT_FILTERED); | 166 | send_unreach(skb, ICMP_PKT_FILTERED); |
169 | break; | 167 | break; |
170 | case IPT_TCP_RESET: | 168 | case IPT_TCP_RESET: |
171 | send_reset(skb, hooknum); | 169 | send_reset(skb, par->hooknum); |
172 | case IPT_ICMP_ECHOREPLY: | 170 | case IPT_ICMP_ECHOREPLY: |
173 | /* Doesn't happen. */ | 171 | /* Doesn't happen. */ |
174 | break; | 172 | break; |
@@ -177,13 +175,10 @@ reject_tg(struct sk_buff *skb, const struct net_device *in, | |||
177 | return NF_DROP; | 175 | return NF_DROP; |
178 | } | 176 | } |
179 | 177 | ||
180 | static bool | 178 | static bool reject_tg_check(const struct xt_tgchk_param *par) |
181 | reject_tg_check(const char *tablename, const void *e_void, | ||
182 | const struct xt_target *target, void *targinfo, | ||
183 | unsigned int hook_mask) | ||
184 | { | 179 | { |
185 | const struct ipt_reject_info *rejinfo = targinfo; | 180 | const struct ipt_reject_info *rejinfo = par->targinfo; |
186 | const struct ipt_entry *e = e_void; | 181 | const struct ipt_entry *e = par->entryinfo; |
187 | 182 | ||
188 | if (rejinfo->with == IPT_ICMP_ECHOREPLY) { | 183 | if (rejinfo->with == IPT_ICMP_ECHOREPLY) { |
189 | printk("ipt_REJECT: ECHOREPLY no longer supported.\n"); | 184 | printk("ipt_REJECT: ECHOREPLY no longer supported.\n"); |
@@ -201,7 +196,7 @@ reject_tg_check(const char *tablename, const void *e_void, | |||
201 | 196 | ||
202 | static struct xt_target reject_tg_reg __read_mostly = { | 197 | static struct xt_target reject_tg_reg __read_mostly = { |
203 | .name = "REJECT", | 198 | .name = "REJECT", |
204 | .family = AF_INET, | 199 | .family = NFPROTO_IPV4, |
205 | .target = reject_tg, | 200 | .target = reject_tg, |
206 | .targetsize = sizeof(struct ipt_reject_info), | 201 | .targetsize = sizeof(struct ipt_reject_info), |
207 | .table = "filter", | 202 | .table = "filter", |
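
One pre-existing quirk survives the conversion untouched: there is no break after send_reset(), so IPT_TCP_RESET falls through into the IPT_ICMP_ECHOREPLY arm. That arm only breaks, so the fallthrough is harmless, but it is worth flagging:

    case IPT_TCP_RESET:
            send_reset(skb, par->hooknum);
            /* deliberate(?) fall through -- the next arm is a no-op,
             * but an explicit break would make the intent clearer */
    case IPT_ICMP_ECHOREPLY:
            /* Doesn't happen. */
            break;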
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c index 30eed65e7338..6d76aae90cc0 100644 --- a/net/ipv4/netfilter/ipt_TTL.c +++ b/net/ipv4/netfilter/ipt_TTL.c | |||
@@ -20,12 +20,10 @@ MODULE_DESCRIPTION("Xtables: IPv4 TTL field modification target"); | |||
20 | MODULE_LICENSE("GPL"); | 20 | MODULE_LICENSE("GPL"); |
21 | 21 | ||
22 | static unsigned int | 22 | static unsigned int |
23 | ttl_tg(struct sk_buff *skb, const struct net_device *in, | 23 | ttl_tg(struct sk_buff *skb, const struct xt_target_param *par) |
24 | const struct net_device *out, unsigned int hooknum, | ||
25 | const struct xt_target *target, const void *targinfo) | ||
26 | { | 24 | { |
27 | struct iphdr *iph; | 25 | struct iphdr *iph; |
28 | const struct ipt_TTL_info *info = targinfo; | 26 | const struct ipt_TTL_info *info = par->targinfo; |
29 | int new_ttl; | 27 | int new_ttl; |
30 | 28 | ||
31 | if (!skb_make_writable(skb, skb->len)) | 29 | if (!skb_make_writable(skb, skb->len)) |
@@ -61,12 +59,9 @@ ttl_tg(struct sk_buff *skb, const struct net_device *in, | |||
61 | return XT_CONTINUE; | 59 | return XT_CONTINUE; |
62 | } | 60 | } |
63 | 61 | ||
64 | static bool | 62 | static bool ttl_tg_check(const struct xt_tgchk_param *par) |
65 | ttl_tg_check(const char *tablename, const void *e, | ||
66 | const struct xt_target *target, void *targinfo, | ||
67 | unsigned int hook_mask) | ||
68 | { | 63 | { |
69 | const struct ipt_TTL_info *info = targinfo; | 64 | const struct ipt_TTL_info *info = par->targinfo; |
70 | 65 | ||
71 | if (info->mode > IPT_TTL_MAXMODE) { | 66 | if (info->mode > IPT_TTL_MAXMODE) { |
72 | printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", | 67 | printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", |
@@ -80,7 +75,7 @@ ttl_tg_check(const char *tablename, const void *e, | |||
80 | 75 | ||
81 | static struct xt_target ttl_tg_reg __read_mostly = { | 76 | static struct xt_target ttl_tg_reg __read_mostly = { |
82 | .name = "TTL", | 77 | .name = "TTL", |
83 | .family = AF_INET, | 78 | .family = NFPROTO_IPV4, |
84 | .target = ttl_tg, | 79 | .target = ttl_tg, |
85 | .targetsize = sizeof(struct ipt_TTL_info), | 80 | .targetsize = sizeof(struct ipt_TTL_info), |
86 | .table = "mangle", | 81 | .table = "mangle", |
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index b192756c6d0d..18a2826b57c6 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c | |||
@@ -281,18 +281,14 @@ alloc_failure: | |||
281 | } | 281 | } |
282 | 282 | ||
283 | static unsigned int | 283 | static unsigned int |
284 | ulog_tg(struct sk_buff *skb, const struct net_device *in, | 284 | ulog_tg(struct sk_buff *skb, const struct xt_target_param *par) |
285 | const struct net_device *out, unsigned int hooknum, | ||
286 | const struct xt_target *target, const void *targinfo) | ||
287 | { | 285 | { |
288 | struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo; | 286 | ipt_ulog_packet(par->hooknum, skb, par->in, par->out, |
289 | 287 | par->targinfo, NULL); | |
290 | ipt_ulog_packet(hooknum, skb, in, out, loginfo, NULL); | ||
291 | |||
292 | return XT_CONTINUE; | 288 | return XT_CONTINUE; |
293 | } | 289 | } |
294 | 290 | ||
295 | static void ipt_logfn(unsigned int pf, | 291 | static void ipt_logfn(u_int8_t pf, |
296 | unsigned int hooknum, | 292 | unsigned int hooknum, |
297 | const struct sk_buff *skb, | 293 | const struct sk_buff *skb, |
298 | const struct net_device *in, | 294 | const struct net_device *in, |
@@ -317,12 +313,9 @@ static void ipt_logfn(unsigned int pf, | |||
317 | ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); | 313 | ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); |
318 | } | 314 | } |
319 | 315 | ||
320 | static bool | 316 | static bool ulog_tg_check(const struct xt_tgchk_param *par) |
321 | ulog_tg_check(const char *tablename, const void *e, | ||
322 | const struct xt_target *target, void *targinfo, | ||
323 | unsigned int hookmask) | ||
324 | { | 317 | { |
325 | const struct ipt_ulog_info *loginfo = targinfo; | 318 | const struct ipt_ulog_info *loginfo = par->targinfo; |
326 | 319 | ||
327 | if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { | 320 | if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { |
328 | pr_debug("ipt_ULOG: prefix term %i\n", | 321 | pr_debug("ipt_ULOG: prefix term %i\n", |
@@ -374,7 +367,7 @@ static int ulog_tg_compat_to_user(void __user *dst, void *src) | |||
374 | 367 | ||
375 | static struct xt_target ulog_tg_reg __read_mostly = { | 368 | static struct xt_target ulog_tg_reg __read_mostly = { |
376 | .name = "ULOG", | 369 | .name = "ULOG", |
377 | .family = AF_INET, | 370 | .family = NFPROTO_IPV4, |
378 | .target = ulog_tg, | 371 | .target = ulog_tg, |
379 | .targetsize = sizeof(struct ipt_ulog_info), | 372 | .targetsize = sizeof(struct ipt_ulog_info), |
380 | .checkentry = ulog_tg_check, | 373 | .checkentry = ulog_tg_check, |
@@ -419,7 +412,7 @@ static int __init ulog_tg_init(void) | |||
419 | return ret; | 412 | return ret; |
420 | } | 413 | } |
421 | if (nflog) | 414 | if (nflog) |
422 | nf_log_register(PF_INET, &ipt_ulog_logger); | 415 | nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger); |
423 | 416 | ||
424 | return 0; | 417 | return 0; |
425 | } | 418 | } |
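
The rewritten ulog_tg() also drops the old const-removing cast ((struct ipt_ulog_info *) targinfo) and feeds par->targinfo straight through, which only compiles if the helper takes a const pointer. Assumed prototype — the helper itself is outside this hunk:

    static void ipt_ulog_packet(unsigned int hooknum,
                                const struct sk_buff *skb,
                                const struct net_device *in,
                                const struct net_device *out,
                                const struct ipt_ulog_info *loginfo,
                                const char *prefix);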
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c index 462a22c97877..88762f02779d 100644 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ b/net/ipv4/netfilter/ipt_addrtype.c | |||
@@ -30,12 +30,9 @@ static inline bool match_type(const struct net_device *dev, __be32 addr, | |||
30 | } | 30 | } |
31 | 31 | ||
32 | static bool | 32 | static bool |
33 | addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in, | 33 | addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) |
34 | const struct net_device *out, const struct xt_match *match, | ||
35 | const void *matchinfo, int offset, unsigned int protoff, | ||
36 | bool *hotdrop) | ||
37 | { | 34 | { |
38 | const struct ipt_addrtype_info *info = matchinfo; | 35 | const struct ipt_addrtype_info *info = par->matchinfo; |
39 | const struct iphdr *iph = ip_hdr(skb); | 36 | const struct iphdr *iph = ip_hdr(skb); |
40 | bool ret = true; | 37 | bool ret = true; |
41 | 38 | ||
@@ -50,20 +47,17 @@ addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in, | |||
50 | } | 47 | } |
51 | 48 | ||
52 | static bool | 49 | static bool |
53 | addrtype_mt_v1(const struct sk_buff *skb, const struct net_device *in, | 50 | addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par) |
54 | const struct net_device *out, const struct xt_match *match, | ||
55 | const void *matchinfo, int offset, unsigned int protoff, | ||
56 | bool *hotdrop) | ||
57 | { | 51 | { |
58 | const struct ipt_addrtype_info_v1 *info = matchinfo; | 52 | const struct ipt_addrtype_info_v1 *info = par->matchinfo; |
59 | const struct iphdr *iph = ip_hdr(skb); | 53 | const struct iphdr *iph = ip_hdr(skb); |
60 | const struct net_device *dev = NULL; | 54 | const struct net_device *dev = NULL; |
61 | bool ret = true; | 55 | bool ret = true; |
62 | 56 | ||
63 | if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) | 57 | if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) |
64 | dev = in; | 58 | dev = par->in; |
65 | else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) | 59 | else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) |
66 | dev = out; | 60 | dev = par->out; |
67 | 61 | ||
68 | if (info->source) | 62 | if (info->source) |
69 | ret &= match_type(dev, iph->saddr, info->source) ^ | 63 | ret &= match_type(dev, iph->saddr, info->source) ^ |
@@ -74,12 +68,9 @@ addrtype_mt_v1(const struct sk_buff *skb, const struct net_device *in, | |||
74 | return ret; | 68 | return ret; |
75 | } | 69 | } |
76 | 70 | ||
77 | static bool | 71 | static bool addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) |
78 | addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void, | ||
79 | const struct xt_match *match, void *matchinfo, | ||
80 | unsigned int hook_mask) | ||
81 | { | 72 | { |
82 | struct ipt_addrtype_info_v1 *info = matchinfo; | 73 | struct ipt_addrtype_info_v1 *info = par->matchinfo; |
83 | 74 | ||
84 | if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && | 75 | if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && |
85 | info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { | 76 | info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { |
@@ -88,14 +79,16 @@ addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void, | |||
88 | return false; | 79 | return false; |
89 | } | 80 | } |
90 | 81 | ||
91 | if (hook_mask & (1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN) && | 82 | if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | |
83 | (1 << NF_INET_LOCAL_IN)) && | ||
92 | info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { | 84 | info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { |
93 | printk(KERN_ERR "ipt_addrtype: output interface limitation " | 85 | printk(KERN_ERR "ipt_addrtype: output interface limitation " |
94 | "not valid in PRE_ROUTING and INPUT\n"); | 86 | "not valid in PRE_ROUTING and INPUT\n"); |
95 | return false; | 87 | return false; |
96 | } | 88 | } |
97 | 89 | ||
98 | if (hook_mask & (1 << NF_INET_POST_ROUTING | 1 << NF_INET_LOCAL_OUT) && | 90 | if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | |
91 | (1 << NF_INET_LOCAL_OUT)) && | ||
99 | info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { | 92 | info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { |
100 | printk(KERN_ERR "ipt_addrtype: input interface limitation " | 93 | printk(KERN_ERR "ipt_addrtype: input interface limitation " |
101 | "not valid in POST_ROUTING and OUTPUT\n"); | 94 | "not valid in POST_ROUTING and OUTPUT\n"); |
@@ -108,14 +101,14 @@ addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void, | |||
108 | static struct xt_match addrtype_mt_reg[] __read_mostly = { | 101 | static struct xt_match addrtype_mt_reg[] __read_mostly = { |
109 | { | 102 | { |
110 | .name = "addrtype", | 103 | .name = "addrtype", |
111 | .family = AF_INET, | 104 | .family = NFPROTO_IPV4, |
112 | .match = addrtype_mt_v0, | 105 | .match = addrtype_mt_v0, |
113 | .matchsize = sizeof(struct ipt_addrtype_info), | 106 | .matchsize = sizeof(struct ipt_addrtype_info), |
114 | .me = THIS_MODULE | 107 | .me = THIS_MODULE |
115 | }, | 108 | }, |
116 | { | 109 | { |
117 | .name = "addrtype", | 110 | .name = "addrtype", |
118 | .family = AF_INET, | 111 | .family = NFPROTO_IPV4, |
119 | .revision = 1, | 112 | .revision = 1, |
120 | .match = addrtype_mt_v1, | 113 | .match = addrtype_mt_v1, |
121 | .checkentry = addrtype_mt_checkentry_v1, | 114 | .checkentry = addrtype_mt_checkentry_v1, |
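
The reworked hook-mask checks in addrtype are a readability change only: "<<" binds tighter than "|", so the added parentheses do not alter the mask. hook_mask carries one bit per hook point the rule is attached to, which is what lets checkentry reject interface constraints that cannot be satisfied at that hook:

    /* Both spellings build the same mask -- the parentheses are
     * cosmetic, since "<<" has higher precedence than "|". */
    unsigned int old_style = 1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN;
    unsigned int new_style = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN);
    /* old_style == new_style */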
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index e977989629c7..0104c0b399de 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c | |||
@@ -36,27 +36,23 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) | |||
36 | return r; | 36 | return r; |
37 | } | 37 | } |
38 | 38 | ||
39 | static bool | 39 | static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par) |
40 | ah_mt(const struct sk_buff *skb, const struct net_device *in, | ||
41 | const struct net_device *out, const struct xt_match *match, | ||
42 | const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) | ||
43 | { | 40 | { |
44 | struct ip_auth_hdr _ahdr; | 41 | struct ip_auth_hdr _ahdr; |
45 | const struct ip_auth_hdr *ah; | 42 | const struct ip_auth_hdr *ah; |
46 | const struct ipt_ah *ahinfo = matchinfo; | 43 | const struct ipt_ah *ahinfo = par->matchinfo; |
47 | 44 | ||
48 | /* Must not be a fragment. */ | 45 | /* Must not be a fragment. */ |
49 | if (offset) | 46 | if (par->fragoff != 0) |
50 | return false; | 47 | return false; |
51 | 48 | ||
52 | ah = skb_header_pointer(skb, protoff, | 49 | ah = skb_header_pointer(skb, par->thoff, sizeof(_ahdr), &_ahdr); |
53 | sizeof(_ahdr), &_ahdr); | ||
54 | if (ah == NULL) { | 50 | if (ah == NULL) { |
55 | /* We've been asked to examine this packet, and we | 51 | /* We've been asked to examine this packet, and we |
56 | * can't. Hence, no choice but to drop. | 52 | * can't. Hence, no choice but to drop. |
57 | */ | 53 | */ |
58 | duprintf("Dropping evil AH tinygram.\n"); | 54 | duprintf("Dropping evil AH tinygram.\n"); |
59 | *hotdrop = true; | 55 | *par->hotdrop = true; |
60 | return 0; | 56 | return 0; |
61 | } | 57 | } |
62 | 58 | ||
@@ -65,13 +61,9 @@ ah_mt(const struct sk_buff *skb, const struct net_device *in, | |||
65 | !!(ahinfo->invflags & IPT_AH_INV_SPI)); | 61 | !!(ahinfo->invflags & IPT_AH_INV_SPI)); |
66 | } | 62 | } |
67 | 63 | ||
68 | /* Called when user tries to insert an entry of this type. */ | 64 | static bool ah_mt_check(const struct xt_mtchk_param *par) |
69 | static bool | ||
70 | ah_mt_check(const char *tablename, const void *ip_void, | ||
71 | const struct xt_match *match, void *matchinfo, | ||
72 | unsigned int hook_mask) | ||
73 | { | 65 | { |
74 | const struct ipt_ah *ahinfo = matchinfo; | 66 | const struct ipt_ah *ahinfo = par->matchinfo; |
75 | 67 | ||
76 | /* Must specify no unknown invflags */ | 68 | /* Must specify no unknown invflags */ |
77 | if (ahinfo->invflags & ~IPT_AH_INV_MASK) { | 69 | if (ahinfo->invflags & ~IPT_AH_INV_MASK) { |
@@ -83,7 +75,7 @@ ah_mt_check(const char *tablename, const void *ip_void, | |||
83 | 75 | ||
84 | static struct xt_match ah_mt_reg __read_mostly = { | 76 | static struct xt_match ah_mt_reg __read_mostly = { |
85 | .name = "ah", | 77 | .name = "ah", |
86 | .family = AF_INET, | 78 | .family = NFPROTO_IPV4, |
87 | .match = ah_mt, | 79 | .match = ah_mt, |
88 | .matchsize = sizeof(struct ipt_ah), | 80 | .matchsize = sizeof(struct ipt_ah), |
89 | .proto = IPPROTO_AH, | 81 | .proto = IPPROTO_AH, |
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index 749de8284ce5..6289b64144c6 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c | |||
@@ -67,12 +67,9 @@ static inline bool match_tcp(const struct sk_buff *skb, | |||
67 | return true; | 67 | return true; |
68 | } | 68 | } |
69 | 69 | ||
70 | static bool | 70 | static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par) |
71 | ecn_mt(const struct sk_buff *skb, const struct net_device *in, | ||
72 | const struct net_device *out, const struct xt_match *match, | ||
73 | const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) | ||
74 | { | 71 | { |
75 | const struct ipt_ecn_info *info = matchinfo; | 72 | const struct ipt_ecn_info *info = par->matchinfo; |
76 | 73 | ||
77 | if (info->operation & IPT_ECN_OP_MATCH_IP) | 74 | if (info->operation & IPT_ECN_OP_MATCH_IP) |
78 | if (!match_ip(skb, info)) | 75 | if (!match_ip(skb, info)) |
@@ -81,20 +78,17 @@ ecn_mt(const struct sk_buff *skb, const struct net_device *in, | |||
81 | if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { | 78 | if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { |
82 | if (ip_hdr(skb)->protocol != IPPROTO_TCP) | 79 | if (ip_hdr(skb)->protocol != IPPROTO_TCP) |
83 | return false; | 80 | return false; |
84 | if (!match_tcp(skb, info, hotdrop)) | 81 | if (!match_tcp(skb, info, par->hotdrop)) |
85 | return false; | 82 | return false; |
86 | } | 83 | } |
87 | 84 | ||
88 | return true; | 85 | return true; |
89 | } | 86 | } |
90 | 87 | ||
91 | static bool | 88 | static bool ecn_mt_check(const struct xt_mtchk_param *par) |
92 | ecn_mt_check(const char *tablename, const void *ip_void, | ||
93 | const struct xt_match *match, void *matchinfo, | ||
94 | unsigned int hook_mask) | ||
95 | { | 89 | { |
96 | const struct ipt_ecn_info *info = matchinfo; | 90 | const struct ipt_ecn_info *info = par->matchinfo; |
97 | const struct ipt_ip *ip = ip_void; | 91 | const struct ipt_ip *ip = par->entryinfo; |
98 | 92 | ||
99 | if (info->operation & IPT_ECN_OP_MATCH_MASK) | 93 | if (info->operation & IPT_ECN_OP_MATCH_MASK) |
100 | return false; | 94 | return false; |
@@ -114,7 +108,7 @@ ecn_mt_check(const char *tablename, const void *ip_void, | |||
114 | 108 | ||
115 | static struct xt_match ecn_mt_reg __read_mostly = { | 109 | static struct xt_match ecn_mt_reg __read_mostly = { |
116 | .name = "ecn", | 110 | .name = "ecn", |
117 | .family = AF_INET, | 111 | .family = NFPROTO_IPV4, |
118 | .match = ecn_mt, | 112 | .match = ecn_mt, |
119 | .matchsize = sizeof(struct ipt_ecn_info), | 113 | .matchsize = sizeof(struct ipt_ecn_info), |
120 | .checkentry = ecn_mt_check, | 114 | .checkentry = ecn_mt_check, |
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c deleted file mode 100644 index 3974d7cae5c0..000000000000 --- a/net/ipv4/netfilter/ipt_recent.c +++ /dev/null | |||
@@ -1,501 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This is a replacement of the old ipt_recent module, which carried the | ||
9 | * following copyright notice: | ||
10 | * | ||
11 | * Author: Stephen Frost <sfrost@snowman.net> | ||
12 | * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org | ||
13 | */ | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/ip.h> | ||
16 | #include <linux/moduleparam.h> | ||
17 | #include <linux/proc_fs.h> | ||
18 | #include <linux/seq_file.h> | ||
19 | #include <linux/string.h> | ||
20 | #include <linux/ctype.h> | ||
21 | #include <linux/list.h> | ||
22 | #include <linux/random.h> | ||
23 | #include <linux/jhash.h> | ||
24 | #include <linux/bitops.h> | ||
25 | #include <linux/skbuff.h> | ||
26 | #include <linux/inet.h> | ||
27 | #include <net/net_namespace.h> | ||
28 | |||
29 | #include <linux/netfilter/x_tables.h> | ||
30 | #include <linux/netfilter_ipv4/ipt_recent.h> | ||
31 | |||
32 | MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); | ||
33 | MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4"); | ||
34 | MODULE_LICENSE("GPL"); | ||
35 | |||
36 | static unsigned int ip_list_tot = 100; | ||
37 | static unsigned int ip_pkt_list_tot = 20; | ||
38 | static unsigned int ip_list_hash_size = 0; | ||
39 | static unsigned int ip_list_perms = 0644; | ||
40 | static unsigned int ip_list_uid = 0; | ||
41 | static unsigned int ip_list_gid = 0; | ||
42 | module_param(ip_list_tot, uint, 0400); | ||
43 | module_param(ip_pkt_list_tot, uint, 0400); | ||
44 | module_param(ip_list_hash_size, uint, 0400); | ||
45 | module_param(ip_list_perms, uint, 0400); | ||
46 | module_param(ip_list_uid, uint, 0400); | ||
47 | module_param(ip_list_gid, uint, 0400); | ||
48 | MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); | ||
49 | MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)"); | ||
50 | MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); | ||
51 | MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); | ||
52 | MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files"); | ||
53 | MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files"); | ||
54 | |||
55 | struct recent_entry { | ||
56 | struct list_head list; | ||
57 | struct list_head lru_list; | ||
58 | __be32 addr; | ||
59 | u_int8_t ttl; | ||
60 | u_int8_t index; | ||
61 | u_int16_t nstamps; | ||
62 | unsigned long stamps[0]; | ||
63 | }; | ||
64 | |||
65 | struct recent_table { | ||
66 | struct list_head list; | ||
67 | char name[IPT_RECENT_NAME_LEN]; | ||
68 | #ifdef CONFIG_PROC_FS | ||
69 | struct proc_dir_entry *proc; | ||
70 | #endif | ||
71 | unsigned int refcnt; | ||
72 | unsigned int entries; | ||
73 | struct list_head lru_list; | ||
74 | struct list_head iphash[0]; | ||
75 | }; | ||
76 | |||
77 | static LIST_HEAD(tables); | ||
78 | static DEFINE_SPINLOCK(recent_lock); | ||
79 | static DEFINE_MUTEX(recent_mutex); | ||
80 | |||
81 | #ifdef CONFIG_PROC_FS | ||
82 | static struct proc_dir_entry *proc_dir; | ||
83 | static const struct file_operations recent_fops; | ||
84 | #endif | ||
85 | |||
86 | static u_int32_t hash_rnd; | ||
87 | static int hash_rnd_initted; | ||
88 | |||
89 | static unsigned int recent_entry_hash(__be32 addr) | ||
90 | { | ||
91 | if (!hash_rnd_initted) { | ||
92 | get_random_bytes(&hash_rnd, 4); | ||
93 | hash_rnd_initted = 1; | ||
94 | } | ||
95 | return jhash_1word((__force u32)addr, hash_rnd) & (ip_list_hash_size - 1); | ||
96 | } | ||
97 | |||
98 | static struct recent_entry * | ||
99 | recent_entry_lookup(const struct recent_table *table, __be32 addr, u_int8_t ttl) | ||
100 | { | ||
101 | struct recent_entry *e; | ||
102 | unsigned int h; | ||
103 | |||
104 | h = recent_entry_hash(addr); | ||
105 | list_for_each_entry(e, &table->iphash[h], list) | ||
106 | if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) | ||
107 | return e; | ||
108 | return NULL; | ||
109 | } | ||
110 | |||
111 | static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) | ||
112 | { | ||
113 | list_del(&e->list); | ||
114 | list_del(&e->lru_list); | ||
115 | kfree(e); | ||
116 | t->entries--; | ||
117 | } | ||
118 | |||
119 | static struct recent_entry * | ||
120 | recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl) | ||
121 | { | ||
122 | struct recent_entry *e; | ||
123 | |||
124 | if (t->entries >= ip_list_tot) { | ||
125 | e = list_entry(t->lru_list.next, struct recent_entry, lru_list); | ||
126 | recent_entry_remove(t, e); | ||
127 | } | ||
128 | e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, | ||
129 | GFP_ATOMIC); | ||
130 | if (e == NULL) | ||
131 | return NULL; | ||
132 | e->addr = addr; | ||
133 | e->ttl = ttl; | ||
134 | e->stamps[0] = jiffies; | ||
135 | e->nstamps = 1; | ||
136 | e->index = 1; | ||
137 | list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); | ||
138 | list_add_tail(&e->lru_list, &t->lru_list); | ||
139 | t->entries++; | ||
140 | return e; | ||
141 | } | ||
142 | |||
143 | static void recent_entry_update(struct recent_table *t, struct recent_entry *e) | ||
144 | { | ||
145 | e->stamps[e->index++] = jiffies; | ||
146 | if (e->index > e->nstamps) | ||
147 | e->nstamps = e->index; | ||
148 | e->index %= ip_pkt_list_tot; | ||
149 | list_move_tail(&e->lru_list, &t->lru_list); | ||
150 | } | ||
151 | |||
152 | static struct recent_table *recent_table_lookup(const char *name) | ||
153 | { | ||
154 | struct recent_table *t; | ||
155 | |||
156 | list_for_each_entry(t, &tables, list) | ||
157 | if (!strcmp(t->name, name)) | ||
158 | return t; | ||
159 | return NULL; | ||
160 | } | ||
161 | |||
162 | static void recent_table_flush(struct recent_table *t) | ||
163 | { | ||
164 | struct recent_entry *e, *next; | ||
165 | unsigned int i; | ||
166 | |||
167 | for (i = 0; i < ip_list_hash_size; i++) | ||
168 | list_for_each_entry_safe(e, next, &t->iphash[i], list) | ||
169 | recent_entry_remove(t, e); | ||
170 | } | ||
171 | |||
172 | static bool | ||
173 | recent_mt(const struct sk_buff *skb, const struct net_device *in, | ||
174 | const struct net_device *out, const struct xt_match *match, | ||
175 | const void *matchinfo, int offset, unsigned int protoff, | ||
176 | bool *hotdrop) | ||
177 | { | ||
178 | const struct ipt_recent_info *info = matchinfo; | ||
179 | struct recent_table *t; | ||
180 | struct recent_entry *e; | ||
181 | __be32 addr; | ||
182 | u_int8_t ttl; | ||
183 | bool ret = info->invert; | ||
184 | |||
185 | if (info->side == IPT_RECENT_DEST) | ||
186 | addr = ip_hdr(skb)->daddr; | ||
187 | else | ||
188 | addr = ip_hdr(skb)->saddr; | ||
189 | |||
190 | ttl = ip_hdr(skb)->ttl; | ||
191 | /* use TTL as seen before forwarding */ | ||
192 | if (out && !skb->sk) | ||
193 | ttl++; | ||
194 | |||
195 | spin_lock_bh(&recent_lock); | ||
196 | t = recent_table_lookup(info->name); | ||
197 | e = recent_entry_lookup(t, addr, | ||
198 | info->check_set & IPT_RECENT_TTL ? ttl : 0); | ||
199 | if (e == NULL) { | ||
200 | if (!(info->check_set & IPT_RECENT_SET)) | ||
201 | goto out; | ||
202 | e = recent_entry_init(t, addr, ttl); | ||
203 | if (e == NULL) | ||
204 | *hotdrop = true; | ||
205 | ret = !ret; | ||
206 | goto out; | ||
207 | } | ||
208 | |||
209 | if (info->check_set & IPT_RECENT_SET) | ||
210 | ret = !ret; | ||
211 | else if (info->check_set & IPT_RECENT_REMOVE) { | ||
212 | recent_entry_remove(t, e); | ||
213 | ret = !ret; | ||
214 | } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) { | ||
215 | unsigned long time = jiffies - info->seconds * HZ; | ||
216 | unsigned int i, hits = 0; | ||
217 | |||
218 | for (i = 0; i < e->nstamps; i++) { | ||
219 | if (info->seconds && time_after(time, e->stamps[i])) | ||
220 | continue; | ||
221 | if (++hits >= info->hit_count) { | ||
222 | ret = !ret; | ||
223 | break; | ||
224 | } | ||
225 | } | ||
226 | } | ||
227 | |||
228 | if (info->check_set & IPT_RECENT_SET || | ||
229 | (info->check_set & IPT_RECENT_UPDATE && ret)) { | ||
230 | recent_entry_update(t, e); | ||
231 | e->ttl = ttl; | ||
232 | } | ||
233 | out: | ||
234 | spin_unlock_bh(&recent_lock); | ||
235 | return ret; | ||
236 | } | ||
237 | |||
238 | static bool | ||
239 | recent_mt_check(const char *tablename, const void *ip, | ||
240 | const struct xt_match *match, void *matchinfo, | ||
241 | unsigned int hook_mask) | ||
242 | { | ||
243 | const struct ipt_recent_info *info = matchinfo; | ||
244 | struct recent_table *t; | ||
245 | unsigned i; | ||
246 | bool ret = false; | ||
247 | |||
248 | if (hweight8(info->check_set & | ||
249 | (IPT_RECENT_SET | IPT_RECENT_REMOVE | | ||
250 | IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1) | ||
251 | return false; | ||
252 | if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) && | ||
253 | (info->seconds || info->hit_count)) | ||
254 | return false; | ||
255 | if (info->hit_count > ip_pkt_list_tot) | ||
256 | return false; | ||
257 | if (info->name[0] == '\0' || | ||
258 | strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) | ||
259 | return false; | ||
260 | |||
261 | mutex_lock(&recent_mutex); | ||
262 | t = recent_table_lookup(info->name); | ||
263 | if (t != NULL) { | ||
264 | t->refcnt++; | ||
265 | ret = true; | ||
266 | goto out; | ||
267 | } | ||
268 | |||
269 | t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, | ||
270 | GFP_KERNEL); | ||
271 | if (t == NULL) | ||
272 | goto out; | ||
273 | t->refcnt = 1; | ||
274 | strcpy(t->name, info->name); | ||
275 | INIT_LIST_HEAD(&t->lru_list); | ||
276 | for (i = 0; i < ip_list_hash_size; i++) | ||
277 | INIT_LIST_HEAD(&t->iphash[i]); | ||
278 | #ifdef CONFIG_PROC_FS | ||
279 | t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops); | ||
280 | if (t->proc == NULL) { | ||
281 | kfree(t); | ||
282 | goto out; | ||
283 | } | ||
284 | t->proc->uid = ip_list_uid; | ||
285 | t->proc->gid = ip_list_gid; | ||
286 | t->proc->data = t; | ||
287 | #endif | ||
288 | spin_lock_bh(&recent_lock); | ||
289 | list_add_tail(&t->list, &tables); | ||
290 | spin_unlock_bh(&recent_lock); | ||
291 | ret = true; | ||
292 | out: | ||
293 | mutex_unlock(&recent_mutex); | ||
294 | return ret; | ||
295 | } | ||
296 | |||
297 | static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) | ||
298 | { | ||
299 | const struct ipt_recent_info *info = matchinfo; | ||
300 | struct recent_table *t; | ||
301 | |||
302 | mutex_lock(&recent_mutex); | ||
303 | t = recent_table_lookup(info->name); | ||
304 | if (--t->refcnt == 0) { | ||
305 | spin_lock_bh(&recent_lock); | ||
306 | list_del(&t->list); | ||
307 | spin_unlock_bh(&recent_lock); | ||
308 | #ifdef CONFIG_PROC_FS | ||
309 | remove_proc_entry(t->name, proc_dir); | ||
310 | #endif | ||
311 | recent_table_flush(t); | ||
312 | kfree(t); | ||
313 | } | ||
314 | mutex_unlock(&recent_mutex); | ||
315 | } | ||
316 | |||
317 | #ifdef CONFIG_PROC_FS | ||
318 | struct recent_iter_state { | ||
319 | struct recent_table *table; | ||
320 | unsigned int bucket; | ||
321 | }; | ||
322 | |||
323 | static void *recent_seq_start(struct seq_file *seq, loff_t *pos) | ||
324 | __acquires(recent_lock) | ||
325 | { | ||
326 | struct recent_iter_state *st = seq->private; | ||
327 | const struct recent_table *t = st->table; | ||
328 | struct recent_entry *e; | ||
329 | loff_t p = *pos; | ||
330 | |||
331 | spin_lock_bh(&recent_lock); | ||
332 | |||
333 | for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) | ||
334 | list_for_each_entry(e, &t->iphash[st->bucket], list) | ||
335 | if (p-- == 0) | ||
336 | return e; | ||
337 | return NULL; | ||
338 | } | ||
339 | |||
340 | static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
341 | { | ||
342 | struct recent_iter_state *st = seq->private; | ||
343 | const struct recent_table *t = st->table; | ||
344 | struct recent_entry *e = v; | ||
345 | struct list_head *head = e->list.next; | ||
346 | |||
347 | while (head == &t->iphash[st->bucket]) { | ||
348 | if (++st->bucket >= ip_list_hash_size) | ||
349 | return NULL; | ||
350 | head = t->iphash[st->bucket].next; | ||
351 | } | ||
352 | (*pos)++; | ||
353 | return list_entry(head, struct recent_entry, list); | ||
354 | } | ||
355 | |||
356 | static void recent_seq_stop(struct seq_file *s, void *v) | ||
357 | __releases(recent_lock) | ||
358 | { | ||
359 | spin_unlock_bh(&recent_lock); | ||
360 | } | ||
361 | |||
362 | static int recent_seq_show(struct seq_file *seq, void *v) | ||
363 | { | ||
364 | const struct recent_entry *e = v; | ||
365 | unsigned int i; | ||
366 | |||
367 | i = (e->index - 1) % ip_pkt_list_tot; | ||
368 | seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", | ||
369 | NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); | ||
370 | for (i = 0; i < e->nstamps; i++) | ||
371 | seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); | ||
372 | seq_printf(seq, "\n"); | ||
373 | return 0; | ||
374 | } | ||
375 | |||
376 | static const struct seq_operations recent_seq_ops = { | ||
377 | .start = recent_seq_start, | ||
378 | .next = recent_seq_next, | ||
379 | .stop = recent_seq_stop, | ||
380 | .show = recent_seq_show, | ||
381 | }; | ||
382 | |||
383 | static int recent_seq_open(struct inode *inode, struct file *file) | ||
384 | { | ||
385 | struct proc_dir_entry *pde = PDE(inode); | ||
386 | struct recent_iter_state *st; | ||
387 | |||
388 | st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); | ||
389 | if (st == NULL) | ||
390 | return -ENOMEM; | ||
391 | |||
392 | st->table = pde->data; | ||
393 | return 0; | ||
394 | } | ||
395 | |||
396 | static ssize_t recent_proc_write(struct file *file, const char __user *input, | ||
397 | size_t size, loff_t *loff) | ||
398 | { | ||
399 | const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); | ||
400 | struct recent_table *t = pde->data; | ||
401 | struct recent_entry *e; | ||
402 | char buf[sizeof("+255.255.255.255")], *c = buf; | ||
403 | __be32 addr; | ||
404 | int add; | ||
405 | |||
406 | if (size > sizeof(buf)) | ||
407 | size = sizeof(buf); | ||
408 | if (copy_from_user(buf, input, size)) | ||
409 | return -EFAULT; | ||
410 | while (isspace(*c)) | ||
411 | c++; | ||
412 | |||
413 | if (size - (c - buf) < 5) | ||
414 | return c - buf; | ||
415 | if (!strncmp(c, "clear", 5)) { | ||
416 | c += 5; | ||
417 | spin_lock_bh(&recent_lock); | ||
418 | recent_table_flush(t); | ||
419 | spin_unlock_bh(&recent_lock); | ||
420 | return c - buf; | ||
421 | } | ||
422 | |||
423 | switch (*c) { | ||
424 | case '-': | ||
425 | add = 0; | ||
426 | c++; | ||
427 | break; | ||
428 | case '+': | ||
429 | c++; | ||
430 | default: | ||
431 | add = 1; | ||
432 | break; | ||
433 | } | ||
434 | addr = in_aton(c); | ||
435 | |||
436 | spin_lock_bh(&recent_lock); | ||
437 | e = recent_entry_lookup(t, addr, 0); | ||
438 | if (e == NULL) { | ||
439 | if (add) | ||
440 | recent_entry_init(t, addr, 0); | ||
441 | } else { | ||
442 | if (add) | ||
443 | recent_entry_update(t, e); | ||
444 | else | ||
445 | recent_entry_remove(t, e); | ||
446 | } | ||
447 | spin_unlock_bh(&recent_lock); | ||
448 | return size; | ||
449 | } | ||
450 | |||
451 | static const struct file_operations recent_fops = { | ||
452 | .open = recent_seq_open, | ||
453 | .read = seq_read, | ||
454 | .write = recent_proc_write, | ||
455 | .release = seq_release_private, | ||
456 | .owner = THIS_MODULE, | ||
457 | }; | ||
458 | #endif /* CONFIG_PROC_FS */ | ||
459 | |||
460 | static struct xt_match recent_mt_reg __read_mostly = { | ||
461 | .name = "recent", | ||
462 | .family = AF_INET, | ||
463 | .match = recent_mt, | ||
464 | .matchsize = sizeof(struct ipt_recent_info), | ||
465 | .checkentry = recent_mt_check, | ||
466 | .destroy = recent_mt_destroy, | ||
467 | .me = THIS_MODULE, | ||
468 | }; | ||
469 | |||
470 | static int __init recent_mt_init(void) | ||
471 | { | ||
472 | int err; | ||
473 | |||
474 | if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) | ||
475 | return -EINVAL; | ||
476 | ip_list_hash_size = 1 << fls(ip_list_tot); | ||
477 | |||
478 | err = xt_register_match(&recent_mt_reg); | ||
479 | #ifdef CONFIG_PROC_FS | ||
480 | if (err) | ||
481 | return err; | ||
482 | proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); | ||
483 | if (proc_dir == NULL) { | ||
484 | xt_unregister_match(&recent_mt_reg); | ||
485 | err = -ENOMEM; | ||
486 | } | ||
487 | #endif | ||
488 | return err; | ||
489 | } | ||
490 | |||
491 | static void __exit recent_mt_exit(void) | ||
492 | { | ||
493 | BUG_ON(!list_empty(&tables)); | ||
494 | xt_unregister_match(&recent_mt_reg); | ||
495 | #ifdef CONFIG_PROC_FS | ||
496 | remove_proc_entry("ipt_recent", init_net.proc_net); | ||
497 | #endif | ||
498 | } | ||
499 | |||
500 | module_init(recent_mt_init); | ||
501 | module_exit(recent_mt_exit); | ||
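For reference, the write handler above parses exactly one command per write(2): "clear" flushes the table, while an address prefixed with '+' (or unprefixed, the default) adds or refreshes an entry and '-' removes it. A minimal userspace sketch of driving the interface; the table name "DEFAULT" is hypothetical, it is whatever --name the iptables rule created:

    /* Sketch: one command per write(); the handler re-parses each buffer. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/net/ipt_recent/DEFAULT", O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            write(fd, "+192.168.0.1", strlen("+192.168.0.1")); /* add or refresh */
            write(fd, "-192.168.0.1", strlen("-192.168.0.1")); /* remove */
            write(fd, "clear", strlen("clear"));               /* flush the table */
            close(fd);
            return 0;
    }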
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c index e0b8caeb710c..297f1cbf4ff5 100644 --- a/net/ipv4/netfilter/ipt_ttl.c +++ b/net/ipv4/netfilter/ipt_ttl.c | |||
@@ -18,12 +18,9 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); | |||
18 | MODULE_DESCRIPTION("Xtables: IPv4 TTL field match"); | 18 | MODULE_DESCRIPTION("Xtables: IPv4 TTL field match"); |
19 | MODULE_LICENSE("GPL"); | 19 | MODULE_LICENSE("GPL"); |
20 | 20 | ||
21 | static bool | 21 | static bool ttl_mt(const struct sk_buff *skb, const struct xt_match_param *par) |
22 | ttl_mt(const struct sk_buff *skb, const struct net_device *in, | ||
23 | const struct net_device *out, const struct xt_match *match, | ||
24 | const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) | ||
25 | { | 22 | { |
26 | const struct ipt_ttl_info *info = matchinfo; | 23 | const struct ipt_ttl_info *info = par->matchinfo; |
27 | const u8 ttl = ip_hdr(skb)->ttl; | 24 | const u8 ttl = ip_hdr(skb)->ttl; |
28 | 25 | ||
29 | switch (info->mode) { | 26 | switch (info->mode) { |
@@ -46,7 +43,7 @@ ttl_mt(const struct sk_buff *skb, const struct net_device *in, | |||
46 | 43 | ||
47 | static struct xt_match ttl_mt_reg __read_mostly = { | 44 | static struct xt_match ttl_mt_reg __read_mostly = { |
48 | .name = "ttl", | 45 | .name = "ttl", |
49 | .family = AF_INET, | 46 | .family = NFPROTO_IPV4, |
50 | .match = ttl_mt, | 47 | .match = ttl_mt, |
51 | .matchsize = sizeof(struct ipt_ttl_info), | 48 | .matchsize = sizeof(struct ipt_ttl_info), |
52 | .me = THIS_MODULE, | 49 | .me = THIS_MODULE, |
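The hunk above elides the body of ttl_mt(); only the signature and the matchinfo access change, since the old in/out/match/matchinfo/offset/protoff/hotdrop arguments were folded into a single struct xt_match_param. For context, a sketch of the unchanged comparison logic, reconstructed from the IPT_TTL_* mode constants in linux/netfilter_ipv4/ipt_ttl.h:

    /* Sketch: the match body kept by the conversion above. */
    static bool ttl_mt_sketch(const struct sk_buff *skb,
                              const struct xt_match_param *par)
    {
            const struct ipt_ttl_info *info = par->matchinfo;
            const u8 ttl = ip_hdr(skb)->ttl;

            switch (info->mode) {
            case IPT_TTL_EQ:
                    return ttl == info->ttl;
            case IPT_TTL_NE:
                    return ttl != info->ttl;
            case IPT_TTL_LT:
                    return ttl < info->ttl;
            case IPT_TTL_GT:
                    return ttl > info->ttl;
            }
            return false;
    }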
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 1ea677dcf845..c9224310ebae 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c | |||
@@ -70,7 +70,7 @@ ipt_local_in_hook(unsigned int hook, | |||
70 | int (*okfn)(struct sk_buff *)) | 70 | int (*okfn)(struct sk_buff *)) |
71 | { | 71 | { |
72 | return ipt_do_table(skb, hook, in, out, | 72 | return ipt_do_table(skb, hook, in, out, |
73 | nf_local_in_net(in, out)->ipv4.iptable_filter); | 73 | dev_net(in)->ipv4.iptable_filter); |
74 | } | 74 | } |
75 | 75 | ||
76 | static unsigned int | 76 | static unsigned int |
@@ -81,7 +81,7 @@ ipt_hook(unsigned int hook, | |||
81 | int (*okfn)(struct sk_buff *)) | 81 | int (*okfn)(struct sk_buff *)) |
82 | { | 82 | { |
83 | return ipt_do_table(skb, hook, in, out, | 83 | return ipt_do_table(skb, hook, in, out, |
84 | nf_forward_net(in, out)->ipv4.iptable_filter); | 84 | dev_net(in)->ipv4.iptable_filter); |
85 | } | 85 | } |
86 | 86 | ||
87 | static unsigned int | 87 | static unsigned int |
@@ -101,7 +101,7 @@ ipt_local_out_hook(unsigned int hook, | |||
101 | } | 101 | } |
102 | 102 | ||
103 | return ipt_do_table(skb, hook, in, out, | 103 | return ipt_do_table(skb, hook, in, out, |
104 | nf_local_out_net(in, out)->ipv4.iptable_filter); | 104 | dev_net(out)->ipv4.iptable_filter); |
105 | } | 105 | } |
106 | 106 | ||
107 | static struct nf_hook_ops ipt_ops[] __read_mostly = { | 107 | static struct nf_hook_ops ipt_ops[] __read_mostly = { |
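The pattern in this file (and in the mangle, raw and security tables below) replaces the nf_*_net(in, out) guessing helpers with dev_net() on whichever device is guaranteed non-NULL at that hook point: "in" for PRE_ROUTING, LOCAL_IN and FORWARD, "out" for LOCAL_OUT and POST_ROUTING. Schematically, with illustrative names:

    /* Sketch: namespace resolution per hook point after the conversion. */
    static unsigned int input_side_hook(unsigned int hook, struct sk_buff *skb,
                                        const struct net_device *in,
                                        const struct net_device *out,
                                        int (*okfn)(struct sk_buff *))
    {
            /* in != NULL on PRE_ROUTING/LOCAL_IN/FORWARD */
            return ipt_do_table(skb, hook, in, out,
                                dev_net(in)->ipv4.iptable_filter);
    }

    static unsigned int output_side_hook(unsigned int hook, struct sk_buff *skb,
                                         const struct net_device *in,
                                         const struct net_device *out,
                                         int (*okfn)(struct sk_buff *))
    {
            /* out != NULL on LOCAL_OUT/POST_ROUTING */
            return ipt_do_table(skb, hook, in, out,
                                dev_net(out)->ipv4.iptable_filter);
    }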
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index da59182f2226..69f2c4287146 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c | |||
@@ -81,7 +81,7 @@ ipt_pre_routing_hook(unsigned int hook, | |||
81 | int (*okfn)(struct sk_buff *)) | 81 | int (*okfn)(struct sk_buff *)) |
82 | { | 82 | { |
83 | return ipt_do_table(skb, hook, in, out, | 83 | return ipt_do_table(skb, hook, in, out, |
84 | nf_pre_routing_net(in, out)->ipv4.iptable_mangle); | 84 | dev_net(in)->ipv4.iptable_mangle); |
85 | } | 85 | } |
86 | 86 | ||
87 | static unsigned int | 87 | static unsigned int |
@@ -92,7 +92,7 @@ ipt_post_routing_hook(unsigned int hook, | |||
92 | int (*okfn)(struct sk_buff *)) | 92 | int (*okfn)(struct sk_buff *)) |
93 | { | 93 | { |
94 | return ipt_do_table(skb, hook, in, out, | 94 | return ipt_do_table(skb, hook, in, out, |
95 | nf_post_routing_net(in, out)->ipv4.iptable_mangle); | 95 | dev_net(out)->ipv4.iptable_mangle); |
96 | } | 96 | } |
97 | 97 | ||
98 | static unsigned int | 98 | static unsigned int |
@@ -103,7 +103,7 @@ ipt_local_in_hook(unsigned int hook, | |||
103 | int (*okfn)(struct sk_buff *)) | 103 | int (*okfn)(struct sk_buff *)) |
104 | { | 104 | { |
105 | return ipt_do_table(skb, hook, in, out, | 105 | return ipt_do_table(skb, hook, in, out, |
106 | nf_local_in_net(in, out)->ipv4.iptable_mangle); | 106 | dev_net(in)->ipv4.iptable_mangle); |
107 | } | 107 | } |
108 | 108 | ||
109 | static unsigned int | 109 | static unsigned int |
@@ -114,7 +114,7 @@ ipt_forward_hook(unsigned int hook, | |||
114 | int (*okfn)(struct sk_buff *)) | 114 | int (*okfn)(struct sk_buff *)) |
115 | { | 115 | { |
116 | return ipt_do_table(skb, hook, in, out, | 116 | return ipt_do_table(skb, hook, in, out, |
117 | nf_forward_net(in, out)->ipv4.iptable_mangle); | 117 | dev_net(in)->ipv4.iptable_mangle); |
118 | } | 118 | } |
119 | 119 | ||
120 | static unsigned int | 120 | static unsigned int |
@@ -147,7 +147,7 @@ ipt_local_hook(unsigned int hook, | |||
147 | tos = iph->tos; | 147 | tos = iph->tos; |
148 | 148 | ||
149 | ret = ipt_do_table(skb, hook, in, out, | 149 | ret = ipt_do_table(skb, hook, in, out, |
150 | nf_local_out_net(in, out)->ipv4.iptable_mangle); | 150 | dev_net(out)->ipv4.iptable_mangle); |
151 | /* Reroute for ANY change. */ | 151 | /* Reroute for ANY change. */ |
152 | if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { | 152 | if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { |
153 | iph = ip_hdr(skb); | 153 | iph = ip_hdr(skb); |
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index fddce7754b72..8faebfe638f1 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c | |||
@@ -53,7 +53,7 @@ ipt_hook(unsigned int hook, | |||
53 | int (*okfn)(struct sk_buff *)) | 53 | int (*okfn)(struct sk_buff *)) |
54 | { | 54 | { |
55 | return ipt_do_table(skb, hook, in, out, | 55 | return ipt_do_table(skb, hook, in, out, |
56 | nf_pre_routing_net(in, out)->ipv4.iptable_raw); | 56 | dev_net(in)->ipv4.iptable_raw); |
57 | } | 57 | } |
58 | 58 | ||
59 | static unsigned int | 59 | static unsigned int |
@@ -72,7 +72,7 @@ ipt_local_hook(unsigned int hook, | |||
72 | return NF_ACCEPT; | 72 | return NF_ACCEPT; |
73 | } | 73 | } |
74 | return ipt_do_table(skb, hook, in, out, | 74 | return ipt_do_table(skb, hook, in, out, |
75 | nf_local_out_net(in, out)->ipv4.iptable_raw); | 75 | dev_net(out)->ipv4.iptable_raw); |
76 | } | 76 | } |
77 | 77 | ||
78 | /* 'raw' is the very first table. */ | 78 | /* 'raw' is the very first table. */ |
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index db6d312128e1..36f3be3cc428 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c | |||
@@ -73,7 +73,7 @@ ipt_local_in_hook(unsigned int hook, | |||
73 | int (*okfn)(struct sk_buff *)) | 73 | int (*okfn)(struct sk_buff *)) |
74 | { | 74 | { |
75 | return ipt_do_table(skb, hook, in, out, | 75 | return ipt_do_table(skb, hook, in, out, |
76 | nf_local_in_net(in, out)->ipv4.iptable_security); | 76 | dev_net(in)->ipv4.iptable_security); |
77 | } | 77 | } |
78 | 78 | ||
79 | static unsigned int | 79 | static unsigned int |
@@ -84,7 +84,7 @@ ipt_forward_hook(unsigned int hook, | |||
84 | int (*okfn)(struct sk_buff *)) | 84 | int (*okfn)(struct sk_buff *)) |
85 | { | 85 | { |
86 | return ipt_do_table(skb, hook, in, out, | 86 | return ipt_do_table(skb, hook, in, out, |
87 | nf_forward_net(in, out)->ipv4.iptable_security); | 87 | dev_net(in)->ipv4.iptable_security); |
88 | } | 88 | } |
89 | 89 | ||
90 | static unsigned int | 90 | static unsigned int |
@@ -103,7 +103,7 @@ ipt_local_out_hook(unsigned int hook, | |||
103 | return NF_ACCEPT; | 103 | return NF_ACCEPT; |
104 | } | 104 | } |
105 | return ipt_do_table(skb, hook, in, out, | 105 | return ipt_do_table(skb, hook, in, out, |
106 | nf_local_out_net(in, out)->ipv4.iptable_security); | 106 | dev_net(out)->ipv4.iptable_security); |
107 | } | 107 | } |
108 | 108 | ||
109 | static struct nf_hook_ops ipt_ops[] __read_mostly = { | 109 | static struct nf_hook_ops ipt_ops[] __read_mostly = { |
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 5a955c440364..4a7c35275396 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | |||
@@ -1,3 +1,4 @@ | |||
1 | |||
1 | /* (C) 1999-2001 Paul `Rusty' Russell | 2 | /* (C) 1999-2001 Paul `Rusty' Russell |
2 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | 3 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> |
3 | * | 4 | * |
@@ -24,6 +25,7 @@ | |||
24 | #include <net/netfilter/nf_conntrack_core.h> | 25 | #include <net/netfilter/nf_conntrack_core.h> |
25 | #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> | 26 | #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> |
26 | #include <net/netfilter/nf_nat_helper.h> | 27 | #include <net/netfilter/nf_nat_helper.h> |
28 | #include <net/netfilter/ipv4/nf_defrag_ipv4.h> | ||
27 | 29 | ||
28 | int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, | 30 | int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, |
29 | struct nf_conn *ct, | 31 | struct nf_conn *ct, |
@@ -63,23 +65,6 @@ static int ipv4_print_tuple(struct seq_file *s, | |||
63 | NIPQUAD(tuple->dst.u3.ip)); | 65 | NIPQUAD(tuple->dst.u3.ip)); |
64 | } | 66 | } |
65 | 67 | ||
66 | /* Returns new sk_buff, or NULL */ | ||
67 | static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) | ||
68 | { | ||
69 | int err; | ||
70 | |||
71 | skb_orphan(skb); | ||
72 | |||
73 | local_bh_disable(); | ||
74 | err = ip_defrag(skb, user); | ||
75 | local_bh_enable(); | ||
76 | |||
77 | if (!err) | ||
78 | ip_send_check(ip_hdr(skb)); | ||
79 | |||
80 | return err; | ||
81 | } | ||
82 | |||
83 | static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, | 68 | static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, |
84 | unsigned int *dataoff, u_int8_t *protonum) | 69 | unsigned int *dataoff, u_int8_t *protonum) |
85 | { | 70 | { |
@@ -144,35 +129,13 @@ out: | |||
144 | return nf_conntrack_confirm(skb); | 129 | return nf_conntrack_confirm(skb); |
145 | } | 130 | } |
146 | 131 | ||
147 | static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, | ||
148 | struct sk_buff *skb, | ||
149 | const struct net_device *in, | ||
150 | const struct net_device *out, | ||
151 | int (*okfn)(struct sk_buff *)) | ||
152 | { | ||
153 | /* Previously seen (loopback)? Ignore. Do this before | ||
154 | fragment check. */ | ||
155 | if (skb->nfct) | ||
156 | return NF_ACCEPT; | ||
157 | |||
158 | /* Gather fragments. */ | ||
159 | if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { | ||
160 | if (nf_ct_ipv4_gather_frags(skb, | ||
161 | hooknum == NF_INET_PRE_ROUTING ? | ||
162 | IP_DEFRAG_CONNTRACK_IN : | ||
163 | IP_DEFRAG_CONNTRACK_OUT)) | ||
164 | return NF_STOLEN; | ||
165 | } | ||
166 | return NF_ACCEPT; | ||
167 | } | ||
168 | |||
169 | static unsigned int ipv4_conntrack_in(unsigned int hooknum, | 132 | static unsigned int ipv4_conntrack_in(unsigned int hooknum, |
170 | struct sk_buff *skb, | 133 | struct sk_buff *skb, |
171 | const struct net_device *in, | 134 | const struct net_device *in, |
172 | const struct net_device *out, | 135 | const struct net_device *out, |
173 | int (*okfn)(struct sk_buff *)) | 136 | int (*okfn)(struct sk_buff *)) |
174 | { | 137 | { |
175 | return nf_conntrack_in(PF_INET, hooknum, skb); | 138 | return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb); |
176 | } | 139 | } |
177 | 140 | ||
178 | static unsigned int ipv4_conntrack_local(unsigned int hooknum, | 141 | static unsigned int ipv4_conntrack_local(unsigned int hooknum, |
@@ -188,20 +151,13 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum, | |||
188 | printk("ipt_hook: happy cracking.\n"); | 151 | printk("ipt_hook: happy cracking.\n"); |
189 | return NF_ACCEPT; | 152 | return NF_ACCEPT; |
190 | } | 153 | } |
191 | return nf_conntrack_in(PF_INET, hooknum, skb); | 154 | return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb); |
192 | } | 155 | } |
193 | 156 | ||
194 | /* Connection tracking may drop packets, but never alters them, so | 157 | /* Connection tracking may drop packets, but never alters them, so |
195 | make it the first hook. */ | 158 | make it the first hook. */ |
196 | static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { | 159 | static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { |
197 | { | 160 | { |
198 | .hook = ipv4_conntrack_defrag, | ||
199 | .owner = THIS_MODULE, | ||
200 | .pf = PF_INET, | ||
201 | .hooknum = NF_INET_PRE_ROUTING, | ||
202 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, | ||
203 | }, | ||
204 | { | ||
205 | .hook = ipv4_conntrack_in, | 161 | .hook = ipv4_conntrack_in, |
206 | .owner = THIS_MODULE, | 162 | .owner = THIS_MODULE, |
207 | .pf = PF_INET, | 163 | .pf = PF_INET, |
@@ -209,13 +165,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { | |||
209 | .priority = NF_IP_PRI_CONNTRACK, | 165 | .priority = NF_IP_PRI_CONNTRACK, |
210 | }, | 166 | }, |
211 | { | 167 | { |
212 | .hook = ipv4_conntrack_defrag, | ||
213 | .owner = THIS_MODULE, | ||
214 | .pf = PF_INET, | ||
215 | .hooknum = NF_INET_LOCAL_OUT, | ||
216 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, | ||
217 | }, | ||
218 | { | ||
219 | .hook = ipv4_conntrack_local, | 168 | .hook = ipv4_conntrack_local, |
220 | .owner = THIS_MODULE, | 169 | .owner = THIS_MODULE, |
221 | .pf = PF_INET, | 170 | .pf = PF_INET, |
@@ -254,7 +203,7 @@ static ctl_table ip_ct_sysctl_table[] = { | |||
254 | { | 203 | { |
255 | .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, | 204 | .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, |
256 | .procname = "ip_conntrack_count", | 205 | .procname = "ip_conntrack_count", |
257 | .data = &nf_conntrack_count, | 206 | .data = &init_net.ct.count, |
258 | .maxlen = sizeof(int), | 207 | .maxlen = sizeof(int), |
259 | .mode = 0444, | 208 | .mode = 0444, |
260 | .proc_handler = &proc_dointvec, | 209 | .proc_handler = &proc_dointvec, |
@@ -270,7 +219,7 @@ static ctl_table ip_ct_sysctl_table[] = { | |||
270 | { | 219 | { |
271 | .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, | 220 | .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, |
272 | .procname = "ip_conntrack_checksum", | 221 | .procname = "ip_conntrack_checksum", |
273 | .data = &nf_conntrack_checksum, | 222 | .data = &init_net.ct.sysctl_checksum, |
274 | .maxlen = sizeof(int), | 223 | .maxlen = sizeof(int), |
275 | .mode = 0644, | 224 | .mode = 0644, |
276 | .proc_handler = &proc_dointvec, | 225 | .proc_handler = &proc_dointvec, |
@@ -278,7 +227,7 @@ static ctl_table ip_ct_sysctl_table[] = { | |||
278 | { | 227 | { |
279 | .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, | 228 | .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, |
280 | .procname = "ip_conntrack_log_invalid", | 229 | .procname = "ip_conntrack_log_invalid", |
281 | .data = &nf_ct_log_invalid, | 230 | .data = &init_net.ct.sysctl_log_invalid, |
282 | .maxlen = sizeof(unsigned int), | 231 | .maxlen = sizeof(unsigned int), |
283 | .mode = 0644, | 232 | .mode = 0644, |
284 | .proc_handler = &proc_dointvec_minmax, | 233 | .proc_handler = &proc_dointvec_minmax, |
@@ -323,7 +272,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) | |||
323 | return -EINVAL; | 272 | return -EINVAL; |
324 | } | 273 | } |
325 | 274 | ||
326 | h = nf_conntrack_find_get(&tuple); | 275 | h = nf_conntrack_find_get(sock_net(sk), &tuple); |
327 | if (h) { | 276 | if (h) { |
328 | struct sockaddr_in sin; | 277 | struct sockaddr_in sin; |
329 | struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); | 278 | struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); |
@@ -422,6 +371,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) | |||
422 | int ret = 0; | 371 | int ret = 0; |
423 | 372 | ||
424 | need_conntrack(); | 373 | need_conntrack(); |
374 | nf_defrag_ipv4_enable(); | ||
425 | 375 | ||
426 | ret = nf_register_sockopt(&so_getorigdst); | 376 | ret = nf_register_sockopt(&so_getorigdst); |
427 | if (ret < 0) { | 377 | if (ret < 0) { |
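Moving defragmentation into its own module (nf_defrag_ipv4.c, added below) does not reorder anything at runtime: netfilter sorts the hooks on a given hook point by priority, not by registration order, so defrag still runs before conntrack. The relevant values, excerpted from include/linux/netfilter_ipv4.h in this tree (lower runs first):

    NF_IP_PRI_CONNTRACK_DEFRAG = -400,  /* reassemble fragments first */
    NF_IP_PRI_RAW              = -300,
    NF_IP_PRI_CONNTRACK        = -200,  /* then track the full datagram */
    NF_IP_PRI_MANGLE           = -150,
    NF_IP_PRI_NAT_DST          = -100,
    NF_IP_PRI_FILTER           =    0,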
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 3a020720e40b..313ebf00ee36 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | |||
@@ -21,18 +21,20 @@ | |||
21 | #include <net/netfilter/nf_conntrack_acct.h> | 21 | #include <net/netfilter/nf_conntrack_acct.h> |
22 | 22 | ||
23 | struct ct_iter_state { | 23 | struct ct_iter_state { |
24 | struct seq_net_private p; | ||
24 | unsigned int bucket; | 25 | unsigned int bucket; |
25 | }; | 26 | }; |
26 | 27 | ||
27 | static struct hlist_node *ct_get_first(struct seq_file *seq) | 28 | static struct hlist_node *ct_get_first(struct seq_file *seq) |
28 | { | 29 | { |
30 | struct net *net = seq_file_net(seq); | ||
29 | struct ct_iter_state *st = seq->private; | 31 | struct ct_iter_state *st = seq->private; |
30 | struct hlist_node *n; | 32 | struct hlist_node *n; |
31 | 33 | ||
32 | for (st->bucket = 0; | 34 | for (st->bucket = 0; |
33 | st->bucket < nf_conntrack_htable_size; | 35 | st->bucket < nf_conntrack_htable_size; |
34 | st->bucket++) { | 36 | st->bucket++) { |
35 | n = rcu_dereference(nf_conntrack_hash[st->bucket].first); | 37 | n = rcu_dereference(net->ct.hash[st->bucket].first); |
36 | if (n) | 38 | if (n) |
37 | return n; | 39 | return n; |
38 | } | 40 | } |
@@ -42,13 +44,14 @@ static struct hlist_node *ct_get_first(struct seq_file *seq) | |||
42 | static struct hlist_node *ct_get_next(struct seq_file *seq, | 44 | static struct hlist_node *ct_get_next(struct seq_file *seq, |
43 | struct hlist_node *head) | 45 | struct hlist_node *head) |
44 | { | 46 | { |
47 | struct net *net = seq_file_net(seq); | ||
45 | struct ct_iter_state *st = seq->private; | 48 | struct ct_iter_state *st = seq->private; |
46 | 49 | ||
47 | head = rcu_dereference(head->next); | 50 | head = rcu_dereference(head->next); |
48 | while (head == NULL) { | 51 | while (head == NULL) { |
49 | if (++st->bucket >= nf_conntrack_htable_size) | 52 | if (++st->bucket >= nf_conntrack_htable_size) |
50 | return NULL; | 53 | return NULL; |
51 | head = rcu_dereference(nf_conntrack_hash[st->bucket].first); | 54 | head = rcu_dereference(net->ct.hash[st->bucket].first); |
52 | } | 55 | } |
53 | return head; | 56 | return head; |
54 | } | 57 | } |
@@ -158,8 +161,8 @@ static const struct seq_operations ct_seq_ops = { | |||
158 | 161 | ||
159 | static int ct_open(struct inode *inode, struct file *file) | 162 | static int ct_open(struct inode *inode, struct file *file) |
160 | { | 163 | { |
161 | return seq_open_private(file, &ct_seq_ops, | 164 | return seq_open_net(inode, file, &ct_seq_ops, |
162 | sizeof(struct ct_iter_state)); | 165 | sizeof(struct ct_iter_state)); |
163 | } | 166 | } |
164 | 167 | ||
165 | static const struct file_operations ct_file_ops = { | 168 | static const struct file_operations ct_file_ops = { |
@@ -167,21 +170,23 @@ static const struct file_operations ct_file_ops = { | |||
167 | .open = ct_open, | 170 | .open = ct_open, |
168 | .read = seq_read, | 171 | .read = seq_read, |
169 | .llseek = seq_lseek, | 172 | .llseek = seq_lseek, |
170 | .release = seq_release_private, | 173 | .release = seq_release_net, |
171 | }; | 174 | }; |
172 | 175 | ||
173 | /* expects */ | 176 | /* expects */ |
174 | struct ct_expect_iter_state { | 177 | struct ct_expect_iter_state { |
178 | struct seq_net_private p; | ||
175 | unsigned int bucket; | 179 | unsigned int bucket; |
176 | }; | 180 | }; |
177 | 181 | ||
178 | static struct hlist_node *ct_expect_get_first(struct seq_file *seq) | 182 | static struct hlist_node *ct_expect_get_first(struct seq_file *seq) |
179 | { | 183 | { |
184 | struct net *net = seq_file_net(seq); | ||
180 | struct ct_expect_iter_state *st = seq->private; | 185 | struct ct_expect_iter_state *st = seq->private; |
181 | struct hlist_node *n; | 186 | struct hlist_node *n; |
182 | 187 | ||
183 | for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { | 188 | for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { |
184 | n = rcu_dereference(nf_ct_expect_hash[st->bucket].first); | 189 | n = rcu_dereference(net->ct.expect_hash[st->bucket].first); |
185 | if (n) | 190 | if (n) |
186 | return n; | 191 | return n; |
187 | } | 192 | } |
@@ -191,13 +196,14 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) | |||
191 | static struct hlist_node *ct_expect_get_next(struct seq_file *seq, | 196 | static struct hlist_node *ct_expect_get_next(struct seq_file *seq, |
192 | struct hlist_node *head) | 197 | struct hlist_node *head) |
193 | { | 198 | { |
199 | struct net *net = seq_file_net(seq); | ||
194 | struct ct_expect_iter_state *st = seq->private; | 200 | struct ct_expect_iter_state *st = seq->private; |
195 | 201 | ||
196 | head = rcu_dereference(head->next); | 202 | head = rcu_dereference(head->next); |
197 | while (head == NULL) { | 203 | while (head == NULL) { |
198 | if (++st->bucket >= nf_ct_expect_hsize) | 204 | if (++st->bucket >= nf_ct_expect_hsize) |
199 | return NULL; | 205 | return NULL; |
200 | head = rcu_dereference(nf_ct_expect_hash[st->bucket].first); | 206 | head = rcu_dereference(net->ct.expect_hash[st->bucket].first); |
201 | } | 207 | } |
202 | return head; | 208 | return head; |
203 | } | 209 | } |
@@ -265,8 +271,8 @@ static const struct seq_operations exp_seq_ops = { | |||
265 | 271 | ||
266 | static int exp_open(struct inode *inode, struct file *file) | 272 | static int exp_open(struct inode *inode, struct file *file) |
267 | { | 273 | { |
268 | return seq_open_private(file, &exp_seq_ops, | 274 | return seq_open_net(inode, file, &exp_seq_ops, |
269 | sizeof(struct ct_expect_iter_state)); | 275 | sizeof(struct ct_expect_iter_state)); |
270 | } | 276 | } |
271 | 277 | ||
272 | static const struct file_operations ip_exp_file_ops = { | 278 | static const struct file_operations ip_exp_file_ops = { |
@@ -274,11 +280,12 @@ static const struct file_operations ip_exp_file_ops = { | |||
274 | .open = exp_open, | 280 | .open = exp_open, |
275 | .read = seq_read, | 281 | .read = seq_read, |
276 | .llseek = seq_lseek, | 282 | .llseek = seq_lseek, |
277 | .release = seq_release_private, | 283 | .release = seq_release_net, |
278 | }; | 284 | }; |
279 | 285 | ||
280 | static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) | 286 | static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) |
281 | { | 287 | { |
288 | struct net *net = seq_file_net(seq); | ||
282 | int cpu; | 289 | int cpu; |
283 | 290 | ||
284 | if (*pos == 0) | 291 | if (*pos == 0) |
@@ -288,7 +295,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) | |||
288 | if (!cpu_possible(cpu)) | 295 | if (!cpu_possible(cpu)) |
289 | continue; | 296 | continue; |
290 | *pos = cpu+1; | 297 | *pos = cpu+1; |
291 | return &per_cpu(nf_conntrack_stat, cpu); | 298 | return per_cpu_ptr(net->ct.stat, cpu); |
292 | } | 299 | } |
293 | 300 | ||
294 | return NULL; | 301 | return NULL; |
@@ -296,13 +303,14 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) | |||
296 | 303 | ||
297 | static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) | 304 | static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
298 | { | 305 | { |
306 | struct net *net = seq_file_net(seq); | ||
299 | int cpu; | 307 | int cpu; |
300 | 308 | ||
301 | for (cpu = *pos; cpu < NR_CPUS; ++cpu) { | 309 | for (cpu = *pos; cpu < NR_CPUS; ++cpu) { |
302 | if (!cpu_possible(cpu)) | 310 | if (!cpu_possible(cpu)) |
303 | continue; | 311 | continue; |
304 | *pos = cpu+1; | 312 | *pos = cpu+1; |
305 | return &per_cpu(nf_conntrack_stat, cpu); | 313 | return per_cpu_ptr(net->ct.stat, cpu); |
306 | } | 314 | } |
307 | 315 | ||
308 | return NULL; | 316 | return NULL; |
@@ -314,7 +322,8 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) | |||
314 | 322 | ||
315 | static int ct_cpu_seq_show(struct seq_file *seq, void *v) | 323 | static int ct_cpu_seq_show(struct seq_file *seq, void *v) |
316 | { | 324 | { |
317 | unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); | 325 | struct net *net = seq_file_net(seq); |
326 | unsigned int nr_conntracks = atomic_read(&net->ct.count); | ||
318 | const struct ip_conntrack_stat *st = v; | 327 | const struct ip_conntrack_stat *st = v; |
319 | 328 | ||
320 | if (v == SEQ_START_TOKEN) { | 329 | if (v == SEQ_START_TOKEN) { |
@@ -354,7 +363,8 @@ static const struct seq_operations ct_cpu_seq_ops = { | |||
354 | 363 | ||
355 | static int ct_cpu_seq_open(struct inode *inode, struct file *file) | 364 | static int ct_cpu_seq_open(struct inode *inode, struct file *file) |
356 | { | 365 | { |
357 | return seq_open(file, &ct_cpu_seq_ops); | 366 | return seq_open_net(inode, file, &ct_cpu_seq_ops, |
367 | sizeof(struct seq_net_private)); | ||
358 | } | 368 | } |
359 | 369 | ||
360 | static const struct file_operations ct_cpu_seq_fops = { | 370 | static const struct file_operations ct_cpu_seq_fops = { |
@@ -362,39 +372,54 @@ static const struct file_operations ct_cpu_seq_fops = { | |||
362 | .open = ct_cpu_seq_open, | 372 | .open = ct_cpu_seq_open, |
363 | .read = seq_read, | 373 | .read = seq_read, |
364 | .llseek = seq_lseek, | 374 | .llseek = seq_lseek, |
365 | .release = seq_release, | 375 | .release = seq_release_net, |
366 | }; | 376 | }; |
367 | 377 | ||
368 | int __init nf_conntrack_ipv4_compat_init(void) | 378 | static int __net_init ip_conntrack_net_init(struct net *net) |
369 | { | 379 | { |
370 | struct proc_dir_entry *proc, *proc_exp, *proc_stat; | 380 | struct proc_dir_entry *proc, *proc_exp, *proc_stat; |
371 | 381 | ||
372 | proc = proc_net_fops_create(&init_net, "ip_conntrack", 0440, &ct_file_ops); | 382 | proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops); |
373 | if (!proc) | 383 | if (!proc) |
374 | goto err1; | 384 | goto err1; |
375 | 385 | ||
376 | proc_exp = proc_net_fops_create(&init_net, "ip_conntrack_expect", 0440, | 386 | proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440, |
377 | &ip_exp_file_ops); | 387 | &ip_exp_file_ops); |
378 | if (!proc_exp) | 388 | if (!proc_exp) |
379 | goto err2; | 389 | goto err2; |
380 | 390 | ||
381 | proc_stat = proc_create("ip_conntrack", S_IRUGO, | 391 | proc_stat = proc_create("ip_conntrack", S_IRUGO, |
382 | init_net.proc_net_stat, &ct_cpu_seq_fops); | 392 | net->proc_net_stat, &ct_cpu_seq_fops); |
383 | if (!proc_stat) | 393 | if (!proc_stat) |
384 | goto err3; | 394 | goto err3; |
385 | return 0; | 395 | return 0; |
386 | 396 | ||
387 | err3: | 397 | err3: |
388 | proc_net_remove(&init_net, "ip_conntrack_expect"); | 398 | proc_net_remove(net, "ip_conntrack_expect"); |
389 | err2: | 399 | err2: |
390 | proc_net_remove(&init_net, "ip_conntrack"); | 400 | proc_net_remove(net, "ip_conntrack"); |
391 | err1: | 401 | err1: |
392 | return -ENOMEM; | 402 | return -ENOMEM; |
393 | } | 403 | } |
394 | 404 | ||
405 | static void __net_exit ip_conntrack_net_exit(struct net *net) | ||
406 | { | ||
407 | remove_proc_entry("ip_conntrack", net->proc_net_stat); | ||
408 | proc_net_remove(net, "ip_conntrack_expect"); | ||
409 | proc_net_remove(net, "ip_conntrack"); | ||
410 | } | ||
411 | |||
412 | static struct pernet_operations ip_conntrack_net_ops = { | ||
413 | .init = ip_conntrack_net_init, | ||
414 | .exit = ip_conntrack_net_exit, | ||
415 | }; | ||
416 | |||
417 | int __init nf_conntrack_ipv4_compat_init(void) | ||
418 | { | ||
419 | return register_pernet_subsys(&ip_conntrack_net_ops); | ||
420 | } | ||
421 | |||
395 | void __exit nf_conntrack_ipv4_compat_fini(void) | 422 | void __exit nf_conntrack_ipv4_compat_fini(void) |
396 | { | 423 | { |
397 | remove_proc_entry("ip_conntrack", init_net.proc_net_stat); | 424 | unregister_pernet_subsys(&ip_conntrack_net_ops); |
398 | proc_net_remove(&init_net, "ip_conntrack_expect"); | ||
399 | proc_net_remove(&init_net, "ip_conntrack"); | ||
400 | } | 425 | } |
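The proc conversion above follows the standard per-namespace seq_file recipe: embed a struct seq_net_private first in the iterator state, open with seq_open_net() and release with seq_release_net(), then recover the opener's namespace with seq_file_net() instead of touching init_net. The minimal shape, with illustrative names:

    struct example_iter_state {
            struct seq_net_private p;   /* must be first: seq_file_net() relies on it */
            unsigned int bucket;
    };

    static const struct seq_operations example_seq_ops;  /* start/next/stop/show */

    static int example_open(struct inode *inode, struct file *file)
    {
            return seq_open_net(inode, file, &example_seq_ops,
                                sizeof(struct example_iter_state));
    }

    static int example_show(struct seq_file *seq, void *v)
    {
            struct net *net = seq_file_net(seq);  /* the opener's netns */

            return seq_printf(seq, "%u\n", atomic_read(&net->ct.count));
    }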
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 97791048fa9b..4e8879220222 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c | |||
@@ -79,7 +79,7 @@ static int icmp_packet(struct nf_conn *ct, | |||
79 | const struct sk_buff *skb, | 79 | const struct sk_buff *skb, |
80 | unsigned int dataoff, | 80 | unsigned int dataoff, |
81 | enum ip_conntrack_info ctinfo, | 81 | enum ip_conntrack_info ctinfo, |
82 | int pf, | 82 | u_int8_t pf, |
83 | unsigned int hooknum) | 83 | unsigned int hooknum) |
84 | { | 84 | { |
85 | /* Try to delete connection immediately after all replies: | 85 | /* Try to delete connection immediately after all replies: |
@@ -91,7 +91,7 @@ static int icmp_packet(struct nf_conn *ct, | |||
91 | nf_ct_kill_acct(ct, ctinfo, skb); | 91 | nf_ct_kill_acct(ct, ctinfo, skb); |
92 | } else { | 92 | } else { |
93 | atomic_inc(&ct->proto.icmp.count); | 93 | atomic_inc(&ct->proto.icmp.count); |
94 | nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); | 94 | nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct); |
95 | nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); | 95 | nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); |
96 | } | 96 | } |
97 | 97 | ||
@@ -123,7 +123,7 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, | |||
123 | 123 | ||
124 | /* Returns a verdict; fills in the skb's conntrack fields if it dealt with ICMP */ | 124 | /* Returns a verdict; fills in the skb's conntrack fields if it dealt with ICMP */ |
125 | static int | 125 | static int |
126 | icmp_error_message(struct sk_buff *skb, | 126 | icmp_error_message(struct net *net, struct sk_buff *skb, |
127 | enum ip_conntrack_info *ctinfo, | 127 | enum ip_conntrack_info *ctinfo, |
128 | unsigned int hooknum) | 128 | unsigned int hooknum) |
129 | { | 129 | { |
@@ -155,7 +155,7 @@ icmp_error_message(struct sk_buff *skb, | |||
155 | 155 | ||
156 | *ctinfo = IP_CT_RELATED; | 156 | *ctinfo = IP_CT_RELATED; |
157 | 157 | ||
158 | h = nf_conntrack_find_get(&innertuple); | 158 | h = nf_conntrack_find_get(net, &innertuple); |
159 | if (!h) { | 159 | if (!h) { |
160 | pr_debug("icmp_error_message: no match\n"); | 160 | pr_debug("icmp_error_message: no match\n"); |
161 | return -NF_ACCEPT; | 161 | return -NF_ACCEPT; |
@@ -172,8 +172,8 @@ icmp_error_message(struct sk_buff *skb, | |||
172 | 172 | ||
173 | /* Small and modified version of icmp_rcv */ | 173 | /* Small and modified version of icmp_rcv */ |
174 | static int | 174 | static int |
175 | icmp_error(struct sk_buff *skb, unsigned int dataoff, | 175 | icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, |
176 | enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) | 176 | enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) |
177 | { | 177 | { |
178 | const struct icmphdr *icmph; | 178 | const struct icmphdr *icmph; |
179 | struct icmphdr _ih; | 179 | struct icmphdr _ih; |
@@ -181,16 +181,16 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, | |||
181 | /* Not enough header? */ | 181 | /* Not enough header? */ |
182 | icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); | 182 | icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); |
183 | if (icmph == NULL) { | 183 | if (icmph == NULL) { |
184 | if (LOG_INVALID(IPPROTO_ICMP)) | 184 | if (LOG_INVALID(net, IPPROTO_ICMP)) |
185 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | 185 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
186 | "nf_ct_icmp: short packet "); | 186 | "nf_ct_icmp: short packet "); |
187 | return -NF_ACCEPT; | 187 | return -NF_ACCEPT; |
188 | } | 188 | } |
189 | 189 | ||
190 | /* See ip_conntrack_proto_tcp.c */ | 190 | /* See ip_conntrack_proto_tcp.c */ |
191 | if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && | 191 | if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && |
192 | nf_ip_checksum(skb, hooknum, dataoff, 0)) { | 192 | nf_ip_checksum(skb, hooknum, dataoff, 0)) { |
193 | if (LOG_INVALID(IPPROTO_ICMP)) | 193 | if (LOG_INVALID(net, IPPROTO_ICMP)) |
194 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | 194 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
195 | "nf_ct_icmp: bad HW ICMP checksum "); | 195 | "nf_ct_icmp: bad HW ICMP checksum "); |
196 | return -NF_ACCEPT; | 196 | return -NF_ACCEPT; |
@@ -203,7 +203,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, | |||
203 | * discarded. | 203 | * discarded. |
204 | */ | 204 | */ |
205 | if (icmph->type > NR_ICMP_TYPES) { | 205 | if (icmph->type > NR_ICMP_TYPES) { |
206 | if (LOG_INVALID(IPPROTO_ICMP)) | 206 | if (LOG_INVALID(net, IPPROTO_ICMP)) |
207 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | 207 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
208 | "nf_ct_icmp: invalid ICMP type "); | 208 | "nf_ct_icmp: invalid ICMP type "); |
209 | return -NF_ACCEPT; | 209 | return -NF_ACCEPT; |
@@ -217,7 +217,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, | |||
217 | && icmph->type != ICMP_REDIRECT) | 217 | && icmph->type != ICMP_REDIRECT) |
218 | return NF_ACCEPT; | 218 | return NF_ACCEPT; |
219 | 219 | ||
220 | return icmp_error_message(skb, ctinfo, hooknum); | 220 | return icmp_error_message(net, skb, ctinfo, hooknum); |
221 | } | 221 | } |
222 | 222 | ||
223 | #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) | 223 | #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) |
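LOG_INVALID() is a file-local macro in the l4proto trackers; after this change it tests the per-namespace sysctl rather than the old global nf_ct_log_invalid. Its per-net form reads approximately:

    /* Approximate per-net form of the macro used above: */
    #define LOG_INVALID(net, proto)                       \
            ((net)->ct.sysctl_log_invalid == (proto) ||   \
             (net)->ct.sysctl_log_invalid == IPPROTO_RAW)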
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c new file mode 100644 index 000000000000..aa2c50a180f7 --- /dev/null +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c | |||
@@ -0,0 +1,96 @@ | |||
1 | /* (C) 1999-2001 Paul `Rusty' Russell | ||
2 | * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | */ | ||
8 | |||
9 | #include <linux/types.h> | ||
10 | #include <linux/ip.h> | ||
11 | #include <linux/netfilter.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/skbuff.h> | ||
14 | #include <net/route.h> | ||
15 | #include <net/ip.h> | ||
16 | |||
17 | #include <linux/netfilter_ipv4.h> | ||
18 | #include <net/netfilter/ipv4/nf_defrag_ipv4.h> | ||
19 | |||
20 | /* Returns 0 when skb holds the complete datagram; nonzero otherwise (ip_defrag may have consumed the skb) */ | ||
21 | static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) | ||
22 | { | ||
23 | int err; | ||
24 | |||
25 | skb_orphan(skb); | ||
26 | |||
27 | local_bh_disable(); | ||
28 | err = ip_defrag(skb, user); | ||
29 | local_bh_enable(); | ||
30 | |||
31 | if (!err) | ||
32 | ip_send_check(ip_hdr(skb)); | ||
33 | |||
34 | return err; | ||
35 | } | ||
36 | |||
37 | static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, | ||
38 | struct sk_buff *skb, | ||
39 | const struct net_device *in, | ||
40 | const struct net_device *out, | ||
41 | int (*okfn)(struct sk_buff *)) | ||
42 | { | ||
43 | #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) | ||
44 | /* Previously seen (loopback)? Ignore. Do this before | ||
45 | fragment check. */ | ||
46 | if (skb->nfct) | ||
47 | return NF_ACCEPT; | ||
48 | #endif | ||
49 | |||
50 | /* Gather fragments. */ | ||
51 | if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { | ||
52 | if (nf_ct_ipv4_gather_frags(skb, | ||
53 | hooknum == NF_INET_PRE_ROUTING ? | ||
54 | IP_DEFRAG_CONNTRACK_IN : | ||
55 | IP_DEFRAG_CONNTRACK_OUT)) | ||
56 | return NF_STOLEN; | ||
57 | } | ||
58 | return NF_ACCEPT; | ||
59 | } | ||
60 | |||
61 | static struct nf_hook_ops ipv4_defrag_ops[] = { | ||
62 | { | ||
63 | .hook = ipv4_conntrack_defrag, | ||
64 | .owner = THIS_MODULE, | ||
65 | .pf = PF_INET, | ||
66 | .hooknum = NF_INET_PRE_ROUTING, | ||
67 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, | ||
68 | }, | ||
69 | { | ||
70 | .hook = ipv4_conntrack_defrag, | ||
71 | .owner = THIS_MODULE, | ||
72 | .pf = PF_INET, | ||
73 | .hooknum = NF_INET_LOCAL_OUT, | ||
74 | .priority = NF_IP_PRI_CONNTRACK_DEFRAG, | ||
75 | }, | ||
76 | }; | ||
77 | |||
78 | static int __init nf_defrag_init(void) | ||
79 | { | ||
80 | return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); | ||
81 | } | ||
82 | |||
83 | static void __exit nf_defrag_fini(void) | ||
84 | { | ||
85 | nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); | ||
86 | } | ||
87 | |||
88 | void nf_defrag_ipv4_enable(void) | ||
89 | { | ||
90 | } | ||
91 | EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); | ||
92 | |||
93 | module_init(nf_defrag_init); | ||
94 | module_exit(nf_defrag_fini); | ||
95 | |||
96 | MODULE_LICENSE("GPL"); | ||
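Note that nf_defrag_ipv4_enable() above deliberately has an empty body: calling it does nothing at runtime, but the symbol reference gives its users (conntrack above; the split lets other hook users share defragmentation too) a hard module dependency, so this module is loaded, and its hooks registered, before theirs. A dependent module's init looks roughly like:

    #include <net/netfilter/ipv4/nf_defrag_ipv4.h>

    static int __init example_user_init(void)
    {
            /* No-op call: the symbol dependency alone pulls in
             * nf_defrag_ipv4 and thereby registers the defrag hooks. */
            nf_defrag_ipv4_enable();

            /* ... register this module's own hooks here ... */
            return 0;
    }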
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 6c6a3cba8d50..2ac9eaf1a8c9 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c | |||
@@ -37,9 +37,6 @@ static struct nf_conntrack_l3proto *l3proto __read_mostly; | |||
37 | 37 | ||
38 | /* Calculated at init based on memory size */ | 38 | /* Calculated at init based on memory size */ |
39 | static unsigned int nf_nat_htable_size __read_mostly; | 39 | static unsigned int nf_nat_htable_size __read_mostly; |
40 | static int nf_nat_vmalloced; | ||
41 | |||
42 | static struct hlist_head *bysource __read_mostly; | ||
43 | 40 | ||
44 | #define MAX_IP_NAT_PROTO 256 | 41 | #define MAX_IP_NAT_PROTO 256 |
45 | static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] | 42 | static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] |
@@ -145,7 +142,8 @@ same_src(const struct nf_conn *ct, | |||
145 | 142 | ||
146 | /* Only called for SRC manip */ | 143 | /* Only called for SRC manip */ |
147 | static int | 144 | static int |
148 | find_appropriate_src(const struct nf_conntrack_tuple *tuple, | 145 | find_appropriate_src(struct net *net, |
146 | const struct nf_conntrack_tuple *tuple, | ||
149 | struct nf_conntrack_tuple *result, | 147 | struct nf_conntrack_tuple *result, |
150 | const struct nf_nat_range *range) | 148 | const struct nf_nat_range *range) |
151 | { | 149 | { |
@@ -155,7 +153,7 @@ find_appropriate_src(const struct nf_conntrack_tuple *tuple, | |||
155 | const struct hlist_node *n; | 153 | const struct hlist_node *n; |
156 | 154 | ||
157 | rcu_read_lock(); | 155 | rcu_read_lock(); |
158 | hlist_for_each_entry_rcu(nat, n, &bysource[h], bysource) { | 156 | hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) { |
159 | ct = nat->ct; | 157 | ct = nat->ct; |
160 | if (same_src(ct, tuple)) { | 158 | if (same_src(ct, tuple)) { |
161 | /* Copy source part from reply tuple. */ | 159 | /* Copy source part from reply tuple. */ |
@@ -231,6 +229,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, | |||
231 | struct nf_conn *ct, | 229 | struct nf_conn *ct, |
232 | enum nf_nat_manip_type maniptype) | 230 | enum nf_nat_manip_type maniptype) |
233 | { | 231 | { |
232 | struct net *net = nf_ct_net(ct); | ||
234 | const struct nf_nat_protocol *proto; | 233 | const struct nf_nat_protocol *proto; |
235 | 234 | ||
236 | /* 1) If this srcip/proto/src-proto-part is currently mapped, | 235 | /* 1) If this srcip/proto/src-proto-part is currently mapped, |
@@ -242,7 +241,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, | |||
242 | manips not an issue. */ | 241 | manips not an issue. */ |
243 | if (maniptype == IP_NAT_MANIP_SRC && | 242 | if (maniptype == IP_NAT_MANIP_SRC && |
244 | !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { | 243 | !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { |
245 | if (find_appropriate_src(orig_tuple, tuple, range)) { | 244 | if (find_appropriate_src(net, orig_tuple, tuple, range)) { |
246 | pr_debug("get_unique_tuple: Found current src map\n"); | 245 | pr_debug("get_unique_tuple: Found current src map\n"); |
247 | if (!nf_nat_used_tuple(tuple, ct)) | 246 | if (!nf_nat_used_tuple(tuple, ct)) |
248 | return; | 247 | return; |
@@ -283,6 +282,7 @@ nf_nat_setup_info(struct nf_conn *ct, | |||
283 | const struct nf_nat_range *range, | 282 | const struct nf_nat_range *range, |
284 | enum nf_nat_manip_type maniptype) | 283 | enum nf_nat_manip_type maniptype) |
285 | { | 284 | { |
285 | struct net *net = nf_ct_net(ct); | ||
286 | struct nf_conntrack_tuple curr_tuple, new_tuple; | 286 | struct nf_conntrack_tuple curr_tuple, new_tuple; |
287 | struct nf_conn_nat *nat; | 287 | struct nf_conn_nat *nat; |
288 | int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); | 288 | int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); |
@@ -334,7 +334,8 @@ nf_nat_setup_info(struct nf_conn *ct, | |||
334 | /* nf_conntrack_alter_reply might re-allocate extension area */ | 334 | /* nf_conntrack_alter_reply might re-allocate extension area */ |
335 | nat = nfct_nat(ct); | 335 | nat = nfct_nat(ct); |
336 | nat->ct = ct; | 336 | nat->ct = ct; |
337 | hlist_add_head_rcu(&nat->bysource, &bysource[srchash]); | 337 | hlist_add_head_rcu(&nat->bysource, |
338 | &net->ipv4.nat_bysource[srchash]); | ||
338 | spin_unlock_bh(&nf_nat_lock); | 339 | spin_unlock_bh(&nf_nat_lock); |
339 | } | 340 | } |
340 | 341 | ||
@@ -583,6 +584,40 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { | |||
583 | .flags = NF_CT_EXT_F_PREALLOC, | 584 | .flags = NF_CT_EXT_F_PREALLOC, |
584 | }; | 585 | }; |
585 | 586 | ||
587 | static int __net_init nf_nat_net_init(struct net *net) | ||
588 | { | ||
589 | net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, | ||
590 | &net->ipv4.nat_vmalloced); | ||
591 | if (!net->ipv4.nat_bysource) | ||
592 | return -ENOMEM; | ||
593 | return 0; | ||
594 | } | ||
595 | |||
596 | /* Clear NAT section of all conntracks, in case we're loaded again. */ | ||
597 | static int clean_nat(struct nf_conn *i, void *data) | ||
598 | { | ||
599 | struct nf_conn_nat *nat = nfct_nat(i); | ||
600 | |||
601 | if (!nat) | ||
602 | return 0; | ||
603 | memset(nat, 0, sizeof(*nat)); | ||
604 | i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); | ||
605 | return 0; | ||
606 | } | ||
607 | |||
608 | static void __net_exit nf_nat_net_exit(struct net *net) | ||
609 | { | ||
610 | nf_ct_iterate_cleanup(net, &clean_nat, NULL); | ||
611 | synchronize_rcu(); | ||
612 | nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, | ||
613 | nf_nat_htable_size); | ||
614 | } | ||
615 | |||
616 | static struct pernet_operations nf_nat_net_ops = { | ||
617 | .init = nf_nat_net_init, | ||
618 | .exit = nf_nat_net_exit, | ||
619 | }; | ||
620 | |||
586 | static int __init nf_nat_init(void) | 621 | static int __init nf_nat_init(void) |
587 | { | 622 | { |
588 | size_t i; | 623 | size_t i; |
@@ -599,12 +634,9 @@ static int __init nf_nat_init(void) | |||
599 | /* Leave them the same for the moment. */ | 634 | /* Leave them the same for the moment. */ |
600 | nf_nat_htable_size = nf_conntrack_htable_size; | 635 | nf_nat_htable_size = nf_conntrack_htable_size; |
601 | 636 | ||
602 | bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, | 637 | ret = register_pernet_subsys(&nf_nat_net_ops); |
603 | &nf_nat_vmalloced); | 638 | if (ret < 0) |
604 | if (!bysource) { | ||
605 | ret = -ENOMEM; | ||
606 | goto cleanup_extend; | 639 | goto cleanup_extend; |
607 | } | ||
608 | 640 | ||
609 | /* Sew in builtin protocols. */ | 641 | /* Sew in builtin protocols. */ |
610 | spin_lock_bh(&nf_nat_lock); | 642 | spin_lock_bh(&nf_nat_lock); |
@@ -629,23 +661,9 @@ static int __init nf_nat_init(void) | |||
629 | return ret; | 661 | return ret; |
630 | } | 662 | } |
631 | 663 | ||
632 | /* Clear NAT section of all conntracks, in case we're loaded again. */ | ||
633 | static int clean_nat(struct nf_conn *i, void *data) | ||
634 | { | ||
635 | struct nf_conn_nat *nat = nfct_nat(i); | ||
636 | |||
637 | if (!nat) | ||
638 | return 0; | ||
639 | memset(nat, 0, sizeof(*nat)); | ||
640 | i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); | ||
641 | return 0; | ||
642 | } | ||
643 | |||
644 | static void __exit nf_nat_cleanup(void) | 664 | static void __exit nf_nat_cleanup(void) |
645 | { | 665 | { |
646 | nf_ct_iterate_cleanup(&clean_nat, NULL); | 666 | unregister_pernet_subsys(&nf_nat_net_ops); |
647 | synchronize_rcu(); | ||
648 | nf_ct_free_hashtable(bysource, nf_nat_vmalloced, nf_nat_htable_size); | ||
649 | nf_ct_l3proto_put(l3proto); | 667 | nf_ct_l3proto_put(l3proto); |
650 | nf_ct_extend_unregister(&nat_extend); | 668 | nf_ct_extend_unregister(&nat_extend); |
651 | rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); | 669 | rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); |
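The conversion above is the canonical pernet pattern: the module-global bysource hash becomes per-namespace state reached through struct net, allocated in .init and freed in .exit. register_pernet_subsys() runs .init immediately for init_net and again for every namespace created later; unregister_pernet_subsys() runs .exit for each namespace still alive. In outline, with illustrative names:

    static int __net_init example_net_init(struct net *net)
    {
            net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
                                                           &net->ipv4.nat_vmalloced);
            return net->ipv4.nat_bysource ? 0 : -ENOMEM;
    }

    static void __net_exit example_net_exit(struct net *net)
    {
            nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
                                 nf_nat_htable_size);
    }

    static struct pernet_operations example_net_ops = {
            .init = example_net_init,   /* init_net now, each new netns later */
            .exit = example_net_exit,   /* every live netns at unregister time */
    };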
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 11976ea29884..cf7a42bf9820 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/udp.h> | 16 | #include <linux/udp.h> |
17 | #include <net/checksum.h> | 17 | #include <net/checksum.h> |
18 | #include <net/tcp.h> | 18 | #include <net/tcp.h> |
19 | #include <net/route.h> | ||
19 | 20 | ||
20 | #include <linux/netfilter_ipv4.h> | 21 | #include <linux/netfilter_ipv4.h> |
21 | #include <net/netfilter/nf_conntrack.h> | 22 | #include <net/netfilter/nf_conntrack.h> |
@@ -192,7 +193,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, | |||
192 | nf_conntrack_tcp_update(skb, ip_hdrlen(skb), | 193 | nf_conntrack_tcp_update(skb, ip_hdrlen(skb), |
193 | ct, CTINFO2DIR(ctinfo)); | 194 | ct, CTINFO2DIR(ctinfo)); |
194 | 195 | ||
195 | nf_conntrack_event_cache(IPCT_NATSEQADJ, skb); | 196 | nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); |
196 | } | 197 | } |
197 | return 1; | 198 | return 1; |
198 | } | 199 | } |
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index da3d91a5ef5c..9eb171056c63 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c | |||
@@ -40,6 +40,7 @@ MODULE_ALIAS("ip_nat_pptp"); | |||
40 | static void pptp_nat_expected(struct nf_conn *ct, | 40 | static void pptp_nat_expected(struct nf_conn *ct, |
41 | struct nf_conntrack_expect *exp) | 41 | struct nf_conntrack_expect *exp) |
42 | { | 42 | { |
43 | struct net *net = nf_ct_net(ct); | ||
43 | const struct nf_conn *master = ct->master; | 44 | const struct nf_conn *master = ct->master; |
44 | struct nf_conntrack_expect *other_exp; | 45 | struct nf_conntrack_expect *other_exp; |
45 | struct nf_conntrack_tuple t; | 46 | struct nf_conntrack_tuple t; |
@@ -73,7 +74,7 @@ static void pptp_nat_expected(struct nf_conn *ct, | |||
73 | 74 | ||
74 | pr_debug("trying to unexpect other dir: "); | 75 | pr_debug("trying to unexpect other dir: "); |
75 | nf_ct_dump_tuple_ip(&t); | 76 | nf_ct_dump_tuple_ip(&t); |
76 | other_exp = nf_ct_expect_find_get(&t); | 77 | other_exp = nf_ct_expect_find_get(net, &t); |
77 | if (other_exp) { | 78 | if (other_exp) { |
78 | nf_ct_unexpect_related(other_exp); | 79 | nf_ct_unexpect_related(other_exp); |
79 | nf_ct_expect_put(other_exp); | 80 | nf_ct_expect_put(other_exp); |
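Helper code like this has no device to derive a namespace from, so it takes it from the conntrack itself: nf_ct_net(ct) returns the namespace the entry was created in, which then feeds the now namespace-aware lookup functions. A sketch of the pattern:

    /* Sketch: namespace-aware expectation lookup from a conntrack entry. */
    static void example_unexpect_other_dir(struct nf_conn *ct,
                                           const struct nf_conntrack_tuple *t)
    {
            struct net *net = nf_ct_net(ct);
            struct nf_conntrack_expect *exp = nf_ct_expect_find_get(net, t);

            if (exp) {
                    nf_ct_unexpect_related(exp);
                    nf_ct_expect_put(exp);
            }
    }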
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index e8b4d0d4439e..bea54a685109 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c | |||
@@ -33,7 +33,7 @@ static struct | |||
33 | struct ipt_replace repl; | 33 | struct ipt_replace repl; |
34 | struct ipt_standard entries[3]; | 34 | struct ipt_standard entries[3]; |
35 | struct ipt_error term; | 35 | struct ipt_error term; |
36 | } nat_initial_table __initdata = { | 36 | } nat_initial_table __net_initdata = { |
37 | .repl = { | 37 | .repl = { |
38 | .name = "nat", | 38 | .name = "nat", |
39 | .valid_hooks = NAT_VALID_HOOKS, | 39 | .valid_hooks = NAT_VALID_HOOKS, |
@@ -58,47 +58,42 @@ static struct | |||
58 | .term = IPT_ERROR_INIT, /* ERROR */ | 58 | .term = IPT_ERROR_INIT, /* ERROR */ |
59 | }; | 59 | }; |
60 | 60 | ||
61 | static struct xt_table __nat_table = { | 61 | static struct xt_table nat_table = { |
62 | .name = "nat", | 62 | .name = "nat", |
63 | .valid_hooks = NAT_VALID_HOOKS, | 63 | .valid_hooks = NAT_VALID_HOOKS, |
64 | .lock = __RW_LOCK_UNLOCKED(__nat_table.lock), | 64 | .lock = __RW_LOCK_UNLOCKED(__nat_table.lock), |
65 | .me = THIS_MODULE, | 65 | .me = THIS_MODULE, |
66 | .af = AF_INET, | 66 | .af = AF_INET, |
67 | }; | 67 | }; |
68 | static struct xt_table *nat_table; | ||
69 | 68 | ||
70 | /* Source NAT */ | 69 | /* Source NAT */ |
71 | static unsigned int ipt_snat_target(struct sk_buff *skb, | 70 | static unsigned int |
72 | const struct net_device *in, | 71 | ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par) |
73 | const struct net_device *out, | ||
74 | unsigned int hooknum, | ||
75 | const struct xt_target *target, | ||
76 | const void *targinfo) | ||
77 | { | 72 | { |
78 | struct nf_conn *ct; | 73 | struct nf_conn *ct; |
79 | enum ip_conntrack_info ctinfo; | 74 | enum ip_conntrack_info ctinfo; |
80 | const struct nf_nat_multi_range_compat *mr = targinfo; | 75 | const struct nf_nat_multi_range_compat *mr = par->targinfo; |
81 | 76 | ||
82 | NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); | 77 | NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING); |
83 | 78 | ||
84 | ct = nf_ct_get(skb, &ctinfo); | 79 | ct = nf_ct_get(skb, &ctinfo); |
85 | 80 | ||
86 | /* Connection must be valid and new. */ | 81 | /* Connection must be valid and new. */ |
87 | NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || | 82 | NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || |
88 | ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); | 83 | ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); |
89 | NF_CT_ASSERT(out); | 84 | NF_CT_ASSERT(par->out != NULL); |
90 | 85 | ||
91 | return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); | 86 | return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); |
92 | } | 87 | } |
93 | 88 | ||
94 | /* Before 2.6.11 we did implicit source NAT if required. Warn about change. */ | 89 | /* Before 2.6.11 we did implicit source NAT if required. Warn about change. */ |
95 | static void warn_if_extra_mangle(__be32 dstip, __be32 srcip) | 90 | static void warn_if_extra_mangle(struct net *net, __be32 dstip, __be32 srcip) |
96 | { | 91 | { |
97 | static int warned = 0; | 92 | static int warned = 0; |
98 | struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } }; | 93 | struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } }; |
99 | struct rtable *rt; | 94 | struct rtable *rt; |
100 | 95 | ||
101 | if (ip_route_output_key(&init_net, &rt, &fl) != 0) | 96 | if (ip_route_output_key(net, &rt, &fl) != 0) |
102 | return; | 97 | return; |
103 | 98 | ||
104 | if (rt->rt_src != srcip && !warned) { | 99 | if (rt->rt_src != srcip && !warned) { |
@@ -110,40 +105,32 @@ static void warn_if_extra_mangle(__be32 dstip, __be32 srcip) | |||
110 | ip_rt_put(rt); | 105 | ip_rt_put(rt); |
111 | } | 106 | } |
112 | 107 | ||
113 | static unsigned int ipt_dnat_target(struct sk_buff *skb, | 108 | static unsigned int |
114 | const struct net_device *in, | 109 | ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par) |
115 | const struct net_device *out, | ||
116 | unsigned int hooknum, | ||
117 | const struct xt_target *target, | ||
118 | const void *targinfo) | ||
119 | { | 110 | { |
120 | struct nf_conn *ct; | 111 | struct nf_conn *ct; |
121 | enum ip_conntrack_info ctinfo; | 112 | enum ip_conntrack_info ctinfo; |
122 | const struct nf_nat_multi_range_compat *mr = targinfo; | 113 | const struct nf_nat_multi_range_compat *mr = par->targinfo; |
123 | 114 | ||
124 | NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING || | 115 | NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING || |
125 | hooknum == NF_INET_LOCAL_OUT); | 116 | par->hooknum == NF_INET_LOCAL_OUT); |
126 | 117 | ||
127 | ct = nf_ct_get(skb, &ctinfo); | 118 | ct = nf_ct_get(skb, &ctinfo); |
128 | 119 | ||
129 | /* Connection must be valid and new. */ | 120 | /* Connection must be valid and new. */ |
130 | NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); | 121 | NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); |
131 | 122 | ||
132 | if (hooknum == NF_INET_LOCAL_OUT && | 123 | if (par->hooknum == NF_INET_LOCAL_OUT && |
133 | mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) | 124 | mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) |
134 | warn_if_extra_mangle(ip_hdr(skb)->daddr, | 125 | warn_if_extra_mangle(dev_net(par->out), ip_hdr(skb)->daddr, |
135 | mr->range[0].min_ip); | 126 | mr->range[0].min_ip); |
136 | 127 | ||
137 | return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); | 128 | return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); |
138 | } | 129 | } |
139 | 130 | ||
140 | static bool ipt_snat_checkentry(const char *tablename, | 131 | static bool ipt_snat_checkentry(const struct xt_tgchk_param *par) |
141 | const void *entry, | ||
142 | const struct xt_target *target, | ||
143 | void *targinfo, | ||
144 | unsigned int hook_mask) | ||
145 | { | 132 | { |
146 | const struct nf_nat_multi_range_compat *mr = targinfo; | 133 | const struct nf_nat_multi_range_compat *mr = par->targinfo; |
147 | 134 | ||
148 | /* Must be a valid range */ | 135 | /* Must be a valid range */ |
149 | if (mr->rangesize != 1) { | 136 | if (mr->rangesize != 1) { |
@@ -153,13 +140,9 @@ static bool ipt_snat_checkentry(const char *tablename, | |||
153 | return true; | 140 | return true; |
154 | } | 141 | } |
155 | 142 | ||
156 | static bool ipt_dnat_checkentry(const char *tablename, | 143 | static bool ipt_dnat_checkentry(const struct xt_tgchk_param *par) |
157 | const void *entry, | ||
158 | const struct xt_target *target, | ||
159 | void *targinfo, | ||
160 | unsigned int hook_mask) | ||
161 | { | 144 | { |
162 | const struct nf_nat_multi_range_compat *mr = targinfo; | 145 | const struct nf_nat_multi_range_compat *mr = par->targinfo; |
163 | 146 | ||
164 | /* Must be a valid range */ | 147 | /* Must be a valid range */ |
165 | if (mr->rangesize != 1) { | 148 | if (mr->rangesize != 1) { |
@@ -194,9 +177,10 @@ int nf_nat_rule_find(struct sk_buff *skb, | |||
194 | const struct net_device *out, | 177 | const struct net_device *out, |
195 | struct nf_conn *ct) | 178 | struct nf_conn *ct) |
196 | { | 179 | { |
180 | struct net *net = nf_ct_net(ct); | ||
197 | int ret; | 181 | int ret; |
198 | 182 | ||
199 | ret = ipt_do_table(skb, hooknum, in, out, nat_table); | 183 | ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table); |
200 | 184 | ||
201 | if (ret == NF_ACCEPT) { | 185 | if (ret == NF_ACCEPT) { |
202 | if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) | 186 | if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) |
@@ -226,14 +210,32 @@ static struct xt_target ipt_dnat_reg __read_mostly = { | |||
226 | .family = AF_INET, | 210 | .family = AF_INET, |
227 | }; | 211 | }; |
228 | 212 | ||
213 | static int __net_init nf_nat_rule_net_init(struct net *net) | ||
214 | { | ||
215 | net->ipv4.nat_table = ipt_register_table(net, &nat_table, | ||
216 | &nat_initial_table.repl); | ||
217 | if (IS_ERR(net->ipv4.nat_table)) | ||
218 | return PTR_ERR(net->ipv4.nat_table); | ||
219 | return 0; | ||
220 | } | ||
221 | |||
222 | static void __net_exit nf_nat_rule_net_exit(struct net *net) | ||
223 | { | ||
224 | ipt_unregister_table(net->ipv4.nat_table); | ||
225 | } | ||
226 | |||
227 | static struct pernet_operations nf_nat_rule_net_ops = { | ||
228 | .init = nf_nat_rule_net_init, | ||
229 | .exit = nf_nat_rule_net_exit, | ||
230 | }; | ||
231 | |||
229 | int __init nf_nat_rule_init(void) | 232 | int __init nf_nat_rule_init(void) |
230 | { | 233 | { |
231 | int ret; | 234 | int ret; |
232 | 235 | ||
233 | nat_table = ipt_register_table(&init_net, &__nat_table, | 236 | ret = register_pernet_subsys(&nf_nat_rule_net_ops); |
234 | &nat_initial_table.repl); | 237 | if (ret != 0) |
235 | if (IS_ERR(nat_table)) | 238 | goto out; |
236 | return PTR_ERR(nat_table); | ||
237 | ret = xt_register_target(&ipt_snat_reg); | 239 | ret = xt_register_target(&ipt_snat_reg); |
238 | if (ret != 0) | 240 | if (ret != 0) |
239 | goto unregister_table; | 241 | goto unregister_table; |
@@ -247,8 +249,8 @@ int __init nf_nat_rule_init(void) | |||
247 | unregister_snat: | 249 | unregister_snat: |
248 | xt_unregister_target(&ipt_snat_reg); | 250 | xt_unregister_target(&ipt_snat_reg); |
249 | unregister_table: | 251 | unregister_table: |
250 | ipt_unregister_table(nat_table); | 252 | unregister_pernet_subsys(&nf_nat_rule_net_ops); |
251 | 253 | out: | |
252 | return ret; | 254 | return ret; |
253 | } | 255 | } |
254 | 256 | ||
@@ -256,5 +258,5 @@ void nf_nat_rule_cleanup(void) | |||
256 | { | 258 | { |
257 | xt_unregister_target(&ipt_dnat_reg); | 259 | xt_unregister_target(&ipt_dnat_reg); |
258 | xt_unregister_target(&ipt_snat_reg); | 260 | xt_unregister_target(&ipt_snat_reg); |
259 | ipt_unregister_table(nat_table); | 261 | unregister_pernet_subsys(&nf_nat_rule_net_ops); |
260 | } | 262 | } |
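
The nf_nat_rule.c conversion above moves the NAT table from a single global into per-namespace state registered through pernet_operations, and the module init path now unwinds whatever already succeeded when a later step fails. A minimal userspace sketch of that goto-unwind idiom follows; the register_*/unregister_* names are stand-ins for the real steps, not kernel APIs:

    #include <stdio.h>

    static int register_pernet(void) { return 0; }
    static int register_snat(void)   { return 0; }
    static int register_dnat(void)   { return -1; /* simulate a failure */ }

    static void unregister_pernet(void) { puts("pernet ops unregistered"); }
    static void unregister_snat(void)   { puts("snat target unregistered"); }

    /* Mirror of the init ordering: on any failure, tear down in the
     * reverse of the order the earlier steps were set up. */
    static int init_all(void)
    {
        int ret;

        ret = register_pernet();
        if (ret != 0)
            goto out;
        ret = register_snat();
        if (ret != 0)
            goto err_pernet;
        ret = register_dnat();
        if (ret != 0)
            goto err_snat;
        return 0;

    err_snat:
        unregister_snat();
    err_pernet:
        unregister_pernet();
    out:
        return ret;
    }

    int main(void)
    {
        printf("init_all() = %d\n", init_all());
        return 0;
    }

With the pernet_operations registered first, ipt_register_table()/ipt_unregister_table() then run once per namespace via the .init/.exit callbacks rather than once globally.
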
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6ee5354c9aa1..a6d7c584f53b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c | |||
@@ -282,6 +282,8 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) | |||
282 | struct rtable *r = NULL; | 282 | struct rtable *r = NULL; |
283 | 283 | ||
284 | for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { | 284 | for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { |
285 | if (!rt_hash_table[st->bucket].chain) | ||
286 | continue; | ||
285 | rcu_read_lock_bh(); | 287 | rcu_read_lock_bh(); |
286 | r = rcu_dereference(rt_hash_table[st->bucket].chain); | 288 | r = rcu_dereference(rt_hash_table[st->bucket].chain); |
287 | while (r) { | 289 | while (r) { |
@@ -299,11 +301,14 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, | |||
299 | struct rtable *r) | 301 | struct rtable *r) |
300 | { | 302 | { |
301 | struct rt_cache_iter_state *st = seq->private; | 303 | struct rt_cache_iter_state *st = seq->private; |
304 | |||
302 | r = r->u.dst.rt_next; | 305 | r = r->u.dst.rt_next; |
303 | while (!r) { | 306 | while (!r) { |
304 | rcu_read_unlock_bh(); | 307 | rcu_read_unlock_bh(); |
305 | if (--st->bucket < 0) | 308 | do { |
306 | break; | 309 | if (--st->bucket < 0) |
310 | return NULL; | ||
311 | } while (!rt_hash_table[st->bucket].chain); | ||
307 | rcu_read_lock_bh(); | 312 | rcu_read_lock_bh(); |
308 | r = rt_hash_table[st->bucket].chain; | 313 | r = rt_hash_table[st->bucket].chain; |
309 | } | 314 | } |
@@ -2356,11 +2361,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, | |||
2356 | ipv4_is_zeronet(oldflp->fl4_src)) | 2361 | ipv4_is_zeronet(oldflp->fl4_src)) |
2357 | goto out; | 2362 | goto out; |
2358 | 2363 | ||
2359 | /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ | ||
2360 | dev_out = ip_dev_find(net, oldflp->fl4_src); | ||
2361 | if (dev_out == NULL) | ||
2362 | goto out; | ||
2363 | |||
2364 | /* I removed check for oif == dev_out->oif here. | 2364 | /* I removed check for oif == dev_out->oif here. |
2365 | It was wrong for two reasons: | 2365 | It was wrong for two reasons: |
2366 | 1. ip_dev_find(net, saddr) can return wrong iface, if saddr | 2366 | 1. ip_dev_find(net, saddr) can return wrong iface, if saddr |
@@ -2372,6 +2372,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, | |||
2372 | if (oldflp->oif == 0 | 2372 | if (oldflp->oif == 0 |
2373 | && (ipv4_is_multicast(oldflp->fl4_dst) || | 2373 | && (ipv4_is_multicast(oldflp->fl4_dst) || |
2374 | oldflp->fl4_dst == htonl(0xFFFFFFFF))) { | 2374 | oldflp->fl4_dst == htonl(0xFFFFFFFF))) { |
2375 | /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ | ||
2376 | dev_out = ip_dev_find(net, oldflp->fl4_src); | ||
2377 | if (dev_out == NULL) | ||
2378 | goto out; | ||
2379 | |||
2375 | /* Special hack: user can direct multicasts | 2380 | /* Special hack: user can direct multicasts |
2376 | and limited broadcast via necessary interface | 2381 | and limited broadcast via necessary interface |
2377 | without fiddling with IP_MULTICAST_IF or IP_PKTINFO. | 2382 | without fiddling with IP_MULTICAST_IF or IP_PKTINFO. |
@@ -2390,9 +2395,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, | |||
2390 | fl.oif = dev_out->ifindex; | 2395 | fl.oif = dev_out->ifindex; |
2391 | goto make_route; | 2396 | goto make_route; |
2392 | } | 2397 | } |
2393 | if (dev_out) | 2398 | |
2399 | if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { | ||
2400 | /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ | ||
2401 | dev_out = ip_dev_find(net, oldflp->fl4_src); | ||
2402 | if (dev_out == NULL) | ||
2403 | goto out; | ||
2394 | dev_put(dev_out); | 2404 | dev_put(dev_out); |
2395 | dev_out = NULL; | 2405 | dev_out = NULL; |
2406 | } | ||
2396 | } | 2407 | } |
2397 | 2408 | ||
2398 | 2409 | ||
@@ -2840,7 +2851,9 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) | |||
2840 | if (s_h < 0) | 2851 | if (s_h < 0) |
2841 | s_h = 0; | 2852 | s_h = 0; |
2842 | s_idx = idx = cb->args[1]; | 2853 | s_idx = idx = cb->args[1]; |
2843 | for (h = s_h; h <= rt_hash_mask; h++) { | 2854 | for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { |
2855 | if (!rt_hash_table[h].chain) | ||
2856 | continue; | ||
2844 | rcu_read_lock_bh(); | 2857 | rcu_read_lock_bh(); |
2845 | for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; | 2858 | for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; |
2846 | rt = rcu_dereference(rt->u.dst.rt_next), idx++) { | 2859 | rt = rcu_dereference(rt->u.dst.rt_next), idx++) { |
@@ -2859,7 +2872,6 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) | |||
2859 | dst_release(xchg(&skb->dst, NULL)); | 2872 | dst_release(xchg(&skb->dst, NULL)); |
2860 | } | 2873 | } |
2861 | rcu_read_unlock_bh(); | 2874 | rcu_read_unlock_bh(); |
2862 | s_idx = 0; | ||
2863 | } | 2875 | } |
2864 | 2876 | ||
2865 | done: | 2877 | done: |
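
Both route.c iterator hunks apply the same two ideas: test the bucket's chain pointer before paying for the RCU read lock, and reset the intra-bucket resume index in the for-loop increment so only the resumed bucket starts mid-chain. A rough userspace model of the resumable dump (plain pointers in place of RCU hash chains; all names illustrative):

    #include <stdio.h>

    struct rt_node { int val; struct rt_node *next; };

    #define NBUCKETS 8
    static struct rt_node *hash[NBUCKETS];

    /* Resumable dump: (*s_h, *s_idx) records where the last call stopped. */
    static void dump(int *s_h, int *s_idx)
    {
        int h, idx;

        /* s_idx is zeroed in the increment clause, so only the resumed
         * bucket starts mid-chain; later buckets start from index 0. */
        for (h = *s_h; h < NBUCKETS; h++, *s_idx = 0) {
            struct rt_node *r;

            if (!hash[h])        /* cheap test before any locking */
                continue;
            for (r = hash[h], idx = 0; r; r = r->next, idx++) {
                if (idx < *s_idx)
                    continue;
                printf("bucket %d idx %d val %d\n", h, idx, r->val);
            }
        }
        *s_h = h;
    }

    int main(void)
    {
        static struct rt_node a = { 1, 0 }, b = { 2, &a };
        int s_h = 0, s_idx = 0;

        hash[3] = &b;
        dump(&s_h, &s_idx);
        return 0;
    }
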
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 9d38005abbac..d346c22aa6ae 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/cryptohash.h> | 16 | #include <linux/cryptohash.h> |
17 | #include <linux/kernel.h> | 17 | #include <linux/kernel.h> |
18 | #include <net/tcp.h> | 18 | #include <net/tcp.h> |
19 | #include <net/route.h> | ||
19 | 20 | ||
20 | /* Timestamps: lowest 9 bits store TCP options */ | 21 | /* Timestamps: lowest 9 bits store TCP options */ |
21 | #define TSBITS 9 | 22 | #define TSBITS 9 |
@@ -296,6 +297,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
296 | treq->rcv_isn = ntohl(th->seq) - 1; | 297 | treq->rcv_isn = ntohl(th->seq) - 1; |
297 | treq->snt_isn = cookie; | 298 | treq->snt_isn = cookie; |
298 | req->mss = mss; | 299 | req->mss = mss; |
300 | ireq->loc_port = th->dest; | ||
299 | ireq->rmt_port = th->source; | 301 | ireq->rmt_port = th->source; |
300 | ireq->loc_addr = ip_hdr(skb)->daddr; | 302 | ireq->loc_addr = ip_hdr(skb)->daddr; |
301 | ireq->rmt_addr = ip_hdr(skb)->saddr; | 303 | ireq->rmt_addr = ip_hdr(skb)->saddr; |
@@ -337,6 +339,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, | |||
337 | .saddr = ireq->loc_addr, | 339 | .saddr = ireq->loc_addr, |
338 | .tos = RT_CONN_FLAGS(sk) } }, | 340 | .tos = RT_CONN_FLAGS(sk) } }, |
339 | .proto = IPPROTO_TCP, | 341 | .proto = IPPROTO_TCP, |
342 | .flags = inet_sk_flowi_flags(sk), | ||
340 | .uli_u = { .ports = | 343 | .uli_u = { .ports = |
341 | { .sport = th->dest, | 344 | { .sport = th->dest, |
342 | .dport = th->source } } }; | 345 | .dport = th->source } } }; |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e0689fd7b798..276d047fb85a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -26,16 +26,13 @@ static int tcp_retr1_max = 255; | |||
26 | static int ip_local_port_range_min[] = { 1, 1 }; | 26 | static int ip_local_port_range_min[] = { 1, 1 }; |
27 | static int ip_local_port_range_max[] = { 65535, 65535 }; | 27 | static int ip_local_port_range_max[] = { 65535, 65535 }; |
28 | 28 | ||
29 | extern seqlock_t sysctl_port_range_lock; | ||
30 | extern int sysctl_local_port_range[2]; | ||
31 | |||
32 | /* Update system visible IP port range */ | 29 | /* Update system visible IP port range */ |
33 | static void set_local_port_range(int range[2]) | 30 | static void set_local_port_range(int range[2]) |
34 | { | 31 | { |
35 | write_seqlock(&sysctl_port_range_lock); | 32 | write_seqlock(&sysctl_local_ports.lock); |
36 | sysctl_local_port_range[0] = range[0]; | 33 | sysctl_local_ports.range[0] = range[0]; |
37 | sysctl_local_port_range[1] = range[1]; | 34 | sysctl_local_ports.range[1] = range[1]; |
38 | write_sequnlock(&sysctl_port_range_lock); | 35 | write_sequnlock(&sysctl_local_ports.lock); |
39 | } | 36 | } |
40 | 37 | ||
41 | /* Validate changes from /proc interface. */ | 38 | /* Validate changes from /proc interface. */ |
@@ -44,8 +41,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, | |||
44 | size_t *lenp, loff_t *ppos) | 41 | size_t *lenp, loff_t *ppos) |
45 | { | 42 | { |
46 | int ret; | 43 | int ret; |
47 | int range[2] = { sysctl_local_port_range[0], | 44 | int range[2]; |
48 | sysctl_local_port_range[1] }; | ||
49 | ctl_table tmp = { | 45 | ctl_table tmp = { |
50 | .data = &range, | 46 | .data = &range, |
51 | .maxlen = sizeof(range), | 47 | .maxlen = sizeof(range), |
@@ -54,6 +50,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, | |||
54 | .extra2 = &ip_local_port_range_max, | 50 | .extra2 = &ip_local_port_range_max, |
55 | }; | 51 | }; |
56 | 52 | ||
53 | inet_get_local_port_range(range, range + 1); | ||
57 | ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); | 54 | ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); |
58 | 55 | ||
59 | if (write && ret == 0) { | 56 | if (write && ret == 0) { |
@@ -73,8 +70,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, | |||
73 | void __user *newval, size_t newlen) | 70 | void __user *newval, size_t newlen) |
74 | { | 71 | { |
75 | int ret; | 72 | int ret; |
76 | int range[2] = { sysctl_local_port_range[0], | 73 | int range[2]; |
77 | sysctl_local_port_range[1] }; | ||
78 | ctl_table tmp = { | 74 | ctl_table tmp = { |
79 | .data = &range, | 75 | .data = &range, |
80 | .maxlen = sizeof(range), | 76 | .maxlen = sizeof(range), |
@@ -83,6 +79,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, | |||
83 | .extra2 = &ip_local_port_range_max, | 79 | .extra2 = &ip_local_port_range_max, |
84 | }; | 80 | }; |
85 | 81 | ||
82 | inet_get_local_port_range(range, range + 1); | ||
86 | ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen); | 83 | ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen); |
87 | if (ret == 0 && newval && newlen) { | 84 | if (ret == 0 && newval && newlen) { |
88 | if (range[1] < range[0]) | 85 | if (range[1] < range[0]) |
@@ -396,8 +393,8 @@ static struct ctl_table ipv4_table[] = { | |||
396 | { | 393 | { |
397 | .ctl_name = NET_IPV4_LOCAL_PORT_RANGE, | 394 | .ctl_name = NET_IPV4_LOCAL_PORT_RANGE, |
398 | .procname = "ip_local_port_range", | 395 | .procname = "ip_local_port_range", |
399 | .data = &sysctl_local_port_range, | 396 | .data = &sysctl_local_ports.range, |
400 | .maxlen = sizeof(sysctl_local_port_range), | 397 | .maxlen = sizeof(sysctl_local_ports.range), |
401 | .mode = 0644, | 398 | .mode = 0644, |
402 | .proc_handler = &ipv4_local_port_range, | 399 | .proc_handler = &ipv4_local_port_range, |
403 | .strategy = &ipv4_sysctl_local_port_range, | 400 | .strategy = &ipv4_sysctl_local_port_range, |
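
With the port range now owned by sysctl_local_ports, both sysctl handlers snapshot it via inet_get_local_port_range(), which reads the pair under the seqlock so a concurrent writer can never expose a torn range. The retry semantics can be sketched with a bare sequence counter; this single-writer model deliberately ignores the memory barriers a real seqlock needs:

    #include <stdio.h>

    static unsigned int seq;   /* even = stable, odd = write in progress */
    static int range[2] = { 32768, 61000 };

    static void set_local_port_range(int lo, int hi)
    {
        seq++;                 /* becomes odd: readers will retry */
        range[0] = lo;
        range[1] = hi;
        seq++;                 /* even again: snapshot is consistent */
    }

    static void get_local_port_range(int *lo, int *hi)
    {
        unsigned int start;

        do {
            start = seq;       /* retry while a write is in flight or
                                * the counter moved under us */
            *lo = range[0];
            *hi = range[1];
        } while ((start & 1) || seq != start);
    }

    int main(void)
    {
        int lo, hi;

        set_local_port_range(1024, 65535);
        get_local_port_range(&lo, &hi);
        printf("%d-%d\n", lo, hi);
        return 0;
    }

The kernel helper follows the same shape with read_seqbegin()/read_seqretry() on sysctl_local_ports.lock.
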
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1ab341e5d3e0..eccb7165a80c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -384,13 +384,17 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) | |||
384 | 384 | ||
385 | /* Connected? */ | 385 | /* Connected? */ |
386 | if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { | 386 | if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { |
387 | int target = sock_rcvlowat(sk, 0, INT_MAX); | ||
388 | |||
389 | if (tp->urg_seq == tp->copied_seq && | ||
390 | !sock_flag(sk, SOCK_URGINLINE) && | ||
391 | tp->urg_data) | ||
392 | target--; | ||
393 | |||
387 | /* Potential race condition. If the read of tp below | 394 | /* Potential race condition. If the read of tp below
388 | * escapes above sk->sk_state, we can be illegally awakened | 395 | * escapes above sk->sk_state, we can be illegally awakened
389 | * in SYN_* states. */ | 396 | * in SYN_* states. */ |
390 | if ((tp->rcv_nxt != tp->copied_seq) && | 397 | if (tp->rcv_nxt - tp->copied_seq >= target) |
391 | (tp->urg_seq != tp->copied_seq || | ||
392 | tp->rcv_nxt != tp->copied_seq + 1 || | ||
393 | sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) | ||
394 | mask |= POLLIN | POLLRDNORM; | 398 | mask |= POLLIN | POLLRDNORM; |
395 | 399 | ||
396 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { | 400 | if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { |
@@ -493,10 +497,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) | |||
493 | static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, | 497 | static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, |
494 | struct sk_buff *skb) | 498 | struct sk_buff *skb) |
495 | { | 499 | { |
496 | if (flags & MSG_OOB) { | 500 | if (flags & MSG_OOB) |
497 | tp->urg_mode = 1; | ||
498 | tp->snd_up = tp->write_seq; | 501 | tp->snd_up = tp->write_seq; |
499 | } | ||
500 | } | 502 | } |
501 | 503 | ||
502 | static inline void tcp_push(struct sock *sk, int flags, int mss_now, | 504 | static inline void tcp_push(struct sock *sk, int flags, int mss_now, |
@@ -1157,7 +1159,7 @@ static void tcp_prequeue_process(struct sock *sk) | |||
1157 | * necessary */ | 1159 | * necessary */ |
1158 | local_bh_disable(); | 1160 | local_bh_disable(); |
1159 | while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) | 1161 | while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) |
1160 | sk->sk_backlog_rcv(sk, skb); | 1162 | sk_backlog_rcv(sk, skb); |
1161 | local_bh_enable(); | 1163 | local_bh_enable(); |
1162 | 1164 | ||
1163 | /* Clear memory counter. */ | 1165 | /* Clear memory counter. */ |
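
The tcp_poll() rewrite above replaces the old hard-to-read compound condition with an explicit byte target: start from SO_RCVLOWAT, treat a pending urgent byte at the read head as non-data when URG-inline is off, and signal POLLIN once enough in-band bytes are queued. A standalone model of the intended predicate follows; the struct is illustrative, and the sketch discounts the urgent byte from the available count, a slightly different but easier-to-read arrangement than the target adjustment in the hunk:

    #include <stdio.h>

    struct tp_state {
        unsigned int rcv_nxt;     /* next sequence expected */
        unsigned int copied_seq;  /* head of yet-unread data */
        unsigned int urg_seq;     /* sequence of the urgent byte, if any */
        int urg_data;             /* non-zero while an urgent byte is pending */
        int urg_inline;           /* SOCK_URGINLINE equivalent */
    };

    static int tcp_readable(const struct tp_state *tp, int rcvlowat)
    {
        unsigned int avail = tp->rcv_nxt - tp->copied_seq;
        int target = rcvlowat > 0 ? rcvlowat : 1;

        /* A pending urgent byte at the read head is skipped by a normal
         * read when URG-inline is off, so it must not count as data. */
        if (tp->urg_data && !tp->urg_inline &&
            tp->urg_seq == tp->copied_seq)
            avail--;

        return avail >= (unsigned int)target;
    }

    int main(void)
    {
        struct tp_state tp = { .rcv_nxt = 101, .copied_seq = 100,
                               .urg_seq = 100, .urg_data = 1,
                               .urg_inline = 0 };

        /* Only the urgent byte has arrived: not readable at lowat 1. */
        printf("readable: %d\n", tcp_readable(&tp, 1));
        return 0;
    }
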
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7abc6b80d47d..d77c0d29e239 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -979,6 +979,39 @@ static void tcp_update_reordering(struct sock *sk, const int metric, | |||
979 | } | 979 | } |
980 | } | 980 | } |
981 | 981 | ||
982 | /* This must be called before lost_out is incremented */ | ||
983 | static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) | ||
984 | { | ||
985 | if ((tp->retransmit_skb_hint == NULL) || | ||
986 | before(TCP_SKB_CB(skb)->seq, | ||
987 | TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) | ||
988 | tp->retransmit_skb_hint = skb; | ||
989 | |||
990 | if (!tp->lost_out || | ||
991 | after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) | ||
992 | tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; | ||
993 | } | ||
994 | |||
995 | static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) | ||
996 | { | ||
997 | if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { | ||
998 | tcp_verify_retransmit_hint(tp, skb); | ||
999 | |||
1000 | tp->lost_out += tcp_skb_pcount(skb); | ||
1001 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | ||
1002 | } | ||
1003 | } | ||
1004 | |||
1005 | void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) | ||
1006 | { | ||
1007 | tcp_verify_retransmit_hint(tp, skb); | ||
1008 | |||
1009 | if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { | ||
1010 | tp->lost_out += tcp_skb_pcount(skb); | ||
1011 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | ||
1012 | } | ||
1013 | } | ||
1014 | |||
982 | /* This procedure tags the retransmission queue when SACKs arrive. | 1015 | /* This procedure tags the retransmission queue when SACKs arrive. |
983 | * | 1016 | * |
984 | * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). | 1017 | * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). |
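
The new helpers above keep retransmit_skb_hint at the lowest-sequence lost segment and retransmit_high at the highest end_seq marked lost, and both lean on the kernel's wraparound-safe before()/after() comparisons. That serial-number idiom is worth spelling out in isolation; this is a standalone reimplementation of the standard 32-bit comparison, not kernel code:

    #include <stdio.h>
    #include <stdint.h>

    /* Wraparound-safe sequence comparison: the difference is computed
     * modulo 2^32 and interpreted as signed, so 0x00000001 counts as
     * "after" 0xfffffffe even though it is numerically smaller. */
    static int before(uint32_t seq1, uint32_t seq2)
    {
        return (int32_t)(seq1 - seq2) < 0;
    }

    static int after(uint32_t seq1, uint32_t seq2)
    {
        return before(seq2, seq1);
    }

    int main(void)
    {
        printf("%d\n", before(0xfffffffeu, 0x00000001u)); /* 1: wrapped */
        printf("%d\n", after(0x00000001u, 0xfffffffeu));  /* 1 */
        printf("%d\n", before(10, 5));                    /* 0 */
        return 0;
    }
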
@@ -1155,13 +1188,7 @@ static void tcp_mark_lost_retrans(struct sock *sk) | |||
1155 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; | 1188 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
1156 | tp->retrans_out -= tcp_skb_pcount(skb); | 1189 | tp->retrans_out -= tcp_skb_pcount(skb); |
1157 | 1190 | ||
1158 | /* clear lost hint */ | 1191 | tcp_skb_mark_lost_uncond_verify(tp, skb); |
1159 | tp->retransmit_skb_hint = NULL; | ||
1160 | |||
1161 | if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { | ||
1162 | tp->lost_out += tcp_skb_pcount(skb); | ||
1163 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | ||
1164 | } | ||
1165 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); | 1192 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); |
1166 | } else { | 1193 | } else { |
1167 | if (before(ack_seq, new_low_seq)) | 1194 | if (before(ack_seq, new_low_seq)) |
@@ -1271,9 +1298,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, | |||
1271 | ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); | 1298 | ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); |
1272 | tp->lost_out -= tcp_skb_pcount(skb); | 1299 | tp->lost_out -= tcp_skb_pcount(skb); |
1273 | tp->retrans_out -= tcp_skb_pcount(skb); | 1300 | tp->retrans_out -= tcp_skb_pcount(skb); |
1274 | |||
1275 | /* clear lost hint */ | ||
1276 | tp->retransmit_skb_hint = NULL; | ||
1277 | } | 1301 | } |
1278 | } else { | 1302 | } else { |
1279 | if (!(sacked & TCPCB_RETRANS)) { | 1303 | if (!(sacked & TCPCB_RETRANS)) { |
@@ -1292,9 +1316,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, | |||
1292 | if (sacked & TCPCB_LOST) { | 1316 | if (sacked & TCPCB_LOST) { |
1293 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; | 1317 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; |
1294 | tp->lost_out -= tcp_skb_pcount(skb); | 1318 | tp->lost_out -= tcp_skb_pcount(skb); |
1295 | |||
1296 | /* clear lost hint */ | ||
1297 | tp->retransmit_skb_hint = NULL; | ||
1298 | } | 1319 | } |
1299 | } | 1320 | } |
1300 | 1321 | ||
@@ -1324,7 +1345,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, | |||
1324 | if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { | 1345 | if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { |
1325 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; | 1346 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
1326 | tp->retrans_out -= tcp_skb_pcount(skb); | 1347 | tp->retrans_out -= tcp_skb_pcount(skb); |
1327 | tp->retransmit_skb_hint = NULL; | ||
1328 | } | 1348 | } |
1329 | 1349 | ||
1330 | return flag; | 1350 | return flag; |
@@ -1726,6 +1746,8 @@ int tcp_use_frto(struct sock *sk) | |||
1726 | return 0; | 1746 | return 0; |
1727 | 1747 | ||
1728 | skb = tcp_write_queue_head(sk); | 1748 | skb = tcp_write_queue_head(sk); |
1749 | if (tcp_skb_is_last(sk, skb)) | ||
1750 | return 1; | ||
1729 | skb = tcp_write_queue_next(sk, skb); /* Skips head */ | 1751 | skb = tcp_write_queue_next(sk, skb); /* Skips head */ |
1730 | tcp_for_write_queue_from(skb, sk) { | 1752 | tcp_for_write_queue_from(skb, sk) { |
1731 | if (skb == tcp_send_head(sk)) | 1753 | if (skb == tcp_send_head(sk)) |
@@ -1867,6 +1889,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) | |||
1867 | if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { | 1889 | if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { |
1868 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | 1890 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
1869 | tp->lost_out += tcp_skb_pcount(skb); | 1891 | tp->lost_out += tcp_skb_pcount(skb); |
1892 | tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; | ||
1870 | } | 1893 | } |
1871 | } | 1894 | } |
1872 | tcp_verify_left_out(tp); | 1895 | tcp_verify_left_out(tp); |
@@ -1883,7 +1906,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) | |||
1883 | tp->high_seq = tp->snd_nxt; | 1906 | tp->high_seq = tp->snd_nxt; |
1884 | TCP_ECN_queue_cwr(tp); | 1907 | TCP_ECN_queue_cwr(tp); |
1885 | 1908 | ||
1886 | tcp_clear_retrans_hints_partial(tp); | 1909 | tcp_clear_all_retrans_hints(tp); |
1887 | } | 1910 | } |
1888 | 1911 | ||
1889 | static void tcp_clear_retrans_partial(struct tcp_sock *tp) | 1912 | static void tcp_clear_retrans_partial(struct tcp_sock *tp) |
@@ -1934,12 +1957,11 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
1934 | /* Push undo marker, if it was plain RTO and nothing | 1957 | /* Push undo marker, if it was plain RTO and nothing |
1935 | * was retransmitted. */ | 1958 | * was retransmitted. */ |
1936 | tp->undo_marker = tp->snd_una; | 1959 | tp->undo_marker = tp->snd_una; |
1937 | tcp_clear_retrans_hints_partial(tp); | ||
1938 | } else { | 1960 | } else { |
1939 | tp->sacked_out = 0; | 1961 | tp->sacked_out = 0; |
1940 | tp->fackets_out = 0; | 1962 | tp->fackets_out = 0; |
1941 | tcp_clear_all_retrans_hints(tp); | ||
1942 | } | 1963 | } |
1964 | tcp_clear_all_retrans_hints(tp); | ||
1943 | 1965 | ||
1944 | tcp_for_write_queue(skb, sk) { | 1966 | tcp_for_write_queue(skb, sk) { |
1945 | if (skb == tcp_send_head(sk)) | 1967 | if (skb == tcp_send_head(sk)) |
@@ -1952,6 +1974,7 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
1952 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; | 1974 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; |
1953 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | 1975 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
1954 | tp->lost_out += tcp_skb_pcount(skb); | 1976 | tp->lost_out += tcp_skb_pcount(skb); |
1977 | tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; | ||
1955 | } | 1978 | } |
1956 | } | 1979 | } |
1957 | tcp_verify_left_out(tp); | 1980 | tcp_verify_left_out(tp); |
@@ -2157,19 +2180,6 @@ static int tcp_time_to_recover(struct sock *sk) | |||
2157 | return 0; | 2180 | return 0; |
2158 | } | 2181 | } |
2159 | 2182 | ||
2160 | /* RFC: This is from the original, I doubt that this is necessary at all: | ||
2161 | * clear xmit_retrans hint if seq of this skb is beyond hint. How could we | ||
2162 | * retransmitted past LOST markings in the first place? I'm not fully sure | ||
2163 | * about undo and end of connection cases, which can cause R without L? | ||
2164 | */ | ||
2165 | static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) | ||
2166 | { | ||
2167 | if ((tp->retransmit_skb_hint != NULL) && | ||
2168 | before(TCP_SKB_CB(skb)->seq, | ||
2169 | TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) | ||
2170 | tp->retransmit_skb_hint = NULL; | ||
2171 | } | ||
2172 | |||
2173 | /* Mark head of queue up as lost. With RFC3517 SACK, the packet count | 2183 | /* Mark head of queue up as lost. With RFC3517 SACK, the packet count
2174 | * is checked against sacked "cnt", otherwise against facked "cnt" | 2184 | * is checked against sacked "cnt", otherwise against facked "cnt"
2175 | */ | 2185 | */ |
@@ -2217,11 +2227,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) | |||
2217 | cnt = packets; | 2227 | cnt = packets; |
2218 | } | 2228 | } |
2219 | 2229 | ||
2220 | if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { | 2230 | tcp_skb_mark_lost(tp, skb); |
2221 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | ||
2222 | tp->lost_out += tcp_skb_pcount(skb); | ||
2223 | tcp_verify_retransmit_hint(tp, skb); | ||
2224 | } | ||
2225 | } | 2231 | } |
2226 | tcp_verify_left_out(tp); | 2232 | tcp_verify_left_out(tp); |
2227 | } | 2233 | } |
@@ -2263,11 +2269,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) | |||
2263 | if (!tcp_skb_timedout(sk, skb)) | 2269 | if (!tcp_skb_timedout(sk, skb)) |
2264 | break; | 2270 | break; |
2265 | 2271 | ||
2266 | if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { | 2272 | tcp_skb_mark_lost(tp, skb); |
2267 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | ||
2268 | tp->lost_out += tcp_skb_pcount(skb); | ||
2269 | tcp_verify_retransmit_hint(tp, skb); | ||
2270 | } | ||
2271 | } | 2273 | } |
2272 | 2274 | ||
2273 | tp->scoreboard_skb_hint = skb; | 2275 | tp->scoreboard_skb_hint = skb; |
@@ -2378,10 +2380,6 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) | |||
2378 | } | 2380 | } |
2379 | tcp_moderate_cwnd(tp); | 2381 | tcp_moderate_cwnd(tp); |
2380 | tp->snd_cwnd_stamp = tcp_time_stamp; | 2382 | tp->snd_cwnd_stamp = tcp_time_stamp; |
2381 | |||
2382 | /* There is something screwy going on with the retrans hints after | ||
2383 | an undo */ | ||
2384 | tcp_clear_all_retrans_hints(tp); | ||
2385 | } | 2383 | } |
2386 | 2384 | ||
2387 | static inline int tcp_may_undo(struct tcp_sock *tp) | 2385 | static inline int tcp_may_undo(struct tcp_sock *tp) |
@@ -2838,7 +2836,8 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) | |||
2838 | * is before the ack sequence we can discard it as it's confirmed to have | 2836 | * is before the ack sequence we can discard it as it's confirmed to have |
2839 | * arrived at the other end. | 2837 | * arrived at the other end. |
2840 | */ | 2838 | */ |
2841 | static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) | 2839 | static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, |
2840 | u32 prior_snd_una) | ||
2842 | { | 2841 | { |
2843 | struct tcp_sock *tp = tcp_sk(sk); | 2842 | struct tcp_sock *tp = tcp_sk(sk); |
2844 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2843 | const struct inet_connection_sock *icsk = inet_csk(sk); |
@@ -2848,6 +2847,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) | |||
2848 | int flag = 0; | 2847 | int flag = 0; |
2849 | u32 pkts_acked = 0; | 2848 | u32 pkts_acked = 0; |
2850 | u32 reord = tp->packets_out; | 2849 | u32 reord = tp->packets_out; |
2850 | u32 prior_sacked = tp->sacked_out; | ||
2851 | s32 seq_rtt = -1; | 2851 | s32 seq_rtt = -1; |
2852 | s32 ca_seq_rtt = -1; | 2852 | s32 ca_seq_rtt = -1; |
2853 | ktime_t last_ackt = net_invalid_timestamp(); | 2853 | ktime_t last_ackt = net_invalid_timestamp(); |
@@ -2904,9 +2904,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) | |||
2904 | if (sacked & TCPCB_LOST) | 2904 | if (sacked & TCPCB_LOST) |
2905 | tp->lost_out -= acked_pcount; | 2905 | tp->lost_out -= acked_pcount; |
2906 | 2906 | ||
2907 | if (unlikely(tp->urg_mode && !before(end_seq, tp->snd_up))) | ||
2908 | tp->urg_mode = 0; | ||
2909 | |||
2910 | tp->packets_out -= acked_pcount; | 2907 | tp->packets_out -= acked_pcount; |
2911 | pkts_acked += acked_pcount; | 2908 | pkts_acked += acked_pcount; |
2912 | 2909 | ||
@@ -2929,9 +2926,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) | |||
2929 | 2926 | ||
2930 | tcp_unlink_write_queue(skb, sk); | 2927 | tcp_unlink_write_queue(skb, sk); |
2931 | sk_wmem_free_skb(sk, skb); | 2928 | sk_wmem_free_skb(sk, skb); |
2932 | tcp_clear_all_retrans_hints(tp); | 2929 | tp->scoreboard_skb_hint = NULL; |
2930 | if (skb == tp->retransmit_skb_hint) | ||
2931 | tp->retransmit_skb_hint = NULL; | ||
2932 | if (skb == tp->lost_skb_hint) | ||
2933 | tp->lost_skb_hint = NULL; | ||
2933 | } | 2934 | } |
2934 | 2935 | ||
2936 | if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) | ||
2937 | tp->snd_up = tp->snd_una; | ||
2938 | |||
2935 | if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) | 2939 | if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) |
2936 | flag |= FLAG_SACK_RENEGING; | 2940 | flag |= FLAG_SACK_RENEGING; |
2937 | 2941 | ||
@@ -2948,6 +2952,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) | |||
2948 | /* Non-retransmitted hole got filled? That's reordering */ | 2952 | /* Non-retransmitted hole got filled? That's reordering */ |
2949 | if (reord < prior_fackets) | 2953 | if (reord < prior_fackets) |
2950 | tcp_update_reordering(sk, tp->fackets_out - reord, 0); | 2954 | tcp_update_reordering(sk, tp->fackets_out - reord, 0); |
2955 | |||
2956 | /* No need to care for underflows here because | ||
2957 | * the lost_skb_hint gets NULLed if we're past it | ||
2958 | * (or something non-trivial happened) | ||
2959 | */ | ||
2960 | if (tcp_is_fack(tp)) | ||
2961 | tp->lost_cnt_hint -= pkts_acked; | ||
2962 | else | ||
2963 | tp->lost_cnt_hint -= prior_sacked - tp->sacked_out; | ||
2951 | } | 2964 | } |
2952 | 2965 | ||
2953 | tp->fackets_out -= min(pkts_acked, tp->fackets_out); | 2966 | tp->fackets_out -= min(pkts_acked, tp->fackets_out); |
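
Instead of wiping every cached queue hint whenever a fully ACKed skb is freed, tcp_clean_rtx_queue() now clears only the hints that actually pointed at that skb and adjusts lost_cnt_hint arithmetically, so later scans can resume where they left off. The pointer-invalidation pattern on its own (illustrative cache struct, not the kernel's tcp_sock):

    #include <stdio.h>
    #include <stdlib.h>

    struct node { int seq; struct node *next; };

    struct hints {
        struct node *retransmit_hint;  /* cached scan positions */
        struct node *lost_hint;
    };

    /* Free 'n' and drop only the cached pointers that referenced it;
     * unrelated hints survive, keeping later scans cheap to resume. */
    static void free_node(struct hints *h, struct node *n)
    {
        if (h->retransmit_hint == n)
            h->retransmit_hint = NULL;
        if (h->lost_hint == n)
            h->lost_hint = NULL;
        free(n);
    }

    int main(void)
    {
        struct node *a = malloc(sizeof(*a));
        struct node *b = malloc(sizeof(*b));
        struct hints h = { .retransmit_hint = a, .lost_hint = b };

        a->seq = 1; a->next = b;
        b->seq = 2; b->next = NULL;

        free_node(&h, a);
        printf("retransmit hint kept? %s\n", h.retransmit_hint ? "yes" : "no");
        printf("lost hint kept?       %s\n", h.lost_hint ? "yes" : "no");
        free(b);
        return 0;
    }
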
@@ -3299,7 +3312,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
3299 | goto no_queue; | 3312 | goto no_queue; |
3300 | 3313 | ||
3301 | /* See if we can take anything off of the retransmit queue. */ | 3314 | /* See if we can take anything off of the retransmit queue. */ |
3302 | flag |= tcp_clean_rtx_queue(sk, prior_fackets); | 3315 | flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); |
3303 | 3316 | ||
3304 | if (tp->frto_counter) | 3317 | if (tp->frto_counter) |
3305 | frto_cwnd = tcp_process_frto(sk, flag); | 3318 | frto_cwnd = tcp_process_frto(sk, flag); |
@@ -3442,6 +3455,22 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, | |||
3442 | } | 3455 | } |
3443 | } | 3456 | } |
3444 | 3457 | ||
3458 | static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) | ||
3459 | { | ||
3460 | __be32 *ptr = (__be32 *)(th + 1); | ||
3461 | |||
3462 | if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | ||
3463 | | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { | ||
3464 | tp->rx_opt.saw_tstamp = 1; | ||
3465 | ++ptr; | ||
3466 | tp->rx_opt.rcv_tsval = ntohl(*ptr); | ||
3467 | ++ptr; | ||
3468 | tp->rx_opt.rcv_tsecr = ntohl(*ptr); | ||
3469 | return 1; | ||
3470 | } | ||
3471 | return 0; | ||
3472 | } | ||
3473 | |||
3445 | /* Fast parse options. This hopes to only see timestamps. | 3474 | /* Fast parse options. This hopes to only see timestamps. |
3446 | * If it is wrong it falls back on tcp_parse_options(). | 3475 | * If it is wrong it falls back on tcp_parse_options(). |
3447 | */ | 3476 | */ |
@@ -3453,16 +3482,8 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, | |||
3453 | return 0; | 3482 | return 0; |
3454 | } else if (tp->rx_opt.tstamp_ok && | 3483 | } else if (tp->rx_opt.tstamp_ok && |
3455 | th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { | 3484 | th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { |
3456 | __be32 *ptr = (__be32 *)(th + 1); | 3485 | if (tcp_parse_aligned_timestamp(tp, th)) |
3457 | if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | ||
3458 | | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { | ||
3459 | tp->rx_opt.saw_tstamp = 1; | ||
3460 | ++ptr; | ||
3461 | tp->rx_opt.rcv_tsval = ntohl(*ptr); | ||
3462 | ++ptr; | ||
3463 | tp->rx_opt.rcv_tsecr = ntohl(*ptr); | ||
3464 | return 1; | 3486 | return 1; |
3465 | } | ||
3466 | } | 3487 | } |
3467 | tcp_parse_options(skb, &tp->rx_opt, 1); | 3488 | tcp_parse_options(skb, &tp->rx_opt, 1); |
3468 | return 1; | 3489 | return 1; |
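
tcp_parse_aligned_timestamp(), now shared by the fast-parse path above and the header-prediction path further down, compares the first option word against the one value a padded timestamp option always encodes (NOP, NOP, kind 8, length 10), turning the common case into a single comparison. A standalone version of that fast path, with local copies of the option constants:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    #define TCPOPT_NOP        1
    #define TCPOPT_TIMESTAMP  8
    #define TCPOLEN_TIMESTAMP 10

    struct ts_opt { uint32_t tsval, tsecr; };

    /* buf points at the option area right after the 20-byte header;
     * returns 1 and fills out on the predicted NOP,NOP,TS layout. */
    static int parse_aligned_timestamp(const unsigned char *buf,
                                       struct ts_opt *out)
    {
        uint32_t word;
        uint32_t predicted = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
                                   (TCPOPT_TIMESTAMP << 8) |
                                   TCPOLEN_TIMESTAMP);

        memcpy(&word, buf, 4);
        if (word != predicted)
            return 0;          /* fall back to the full option walk */
        memcpy(&out->tsval, buf + 4, 4);
        memcpy(&out->tsecr, buf + 8, 4);
        out->tsval = ntohl(out->tsval);
        out->tsecr = ntohl(out->tsecr);
        return 1;
    }

    int main(void)
    {
        unsigned char opts[12] = { 1, 1, 8, 10, 0, 0, 0, 42, 0, 0, 0, 7 };
        struct ts_opt ts;

        if (parse_aligned_timestamp(opts, &ts))
            printf("tsval=%u tsecr=%u\n", ts.tsval, ts.tsecr);
        return 0;
    }
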
@@ -4138,7 +4159,7 @@ drop: | |||
4138 | skb1 = skb1->prev; | 4159 | skb1 = skb1->prev; |
4139 | } | 4160 | } |
4140 | } | 4161 | } |
4141 | __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue); | 4162 | __skb_queue_after(&tp->out_of_order_queue, skb1, skb); |
4142 | 4163 | ||
4143 | /* And clean segments covered by new one as whole. */ | 4164 | /* And clean segments covered by new one as whole. */ |
4144 | while ((skb1 = skb->next) != | 4165 | while ((skb1 = skb->next) != |
@@ -4161,6 +4182,18 @@ add_sack: | |||
4161 | } | 4182 | } |
4162 | } | 4183 | } |
4163 | 4184 | ||
4185 | static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, | ||
4186 | struct sk_buff_head *list) | ||
4187 | { | ||
4188 | struct sk_buff *next = skb->next; | ||
4189 | |||
4190 | __skb_unlink(skb, list); | ||
4191 | __kfree_skb(skb); | ||
4192 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); | ||
4193 | |||
4194 | return next; | ||
4195 | } | ||
4196 | |||
4164 | /* Collapse contiguous sequence of skbs head..tail with | 4197 | /* Collapse contiguous sequence of skbs head..tail with |
4165 | * sequence numbers start..end. | 4198 | * sequence numbers start..end. |
4166 | * Segments with FIN/SYN are not collapsed (only because this | 4199 | * Segments with FIN/SYN are not collapsed (only because this |
@@ -4178,11 +4211,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, | |||
4178 | for (skb = head; skb != tail;) { | 4211 | for (skb = head; skb != tail;) { |
4179 | /* No new bits? It is possible on ofo queue. */ | 4212 | /* No new bits? It is possible on ofo queue. */ |
4180 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { | 4213 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { |
4181 | struct sk_buff *next = skb->next; | 4214 | skb = tcp_collapse_one(sk, skb, list); |
4182 | __skb_unlink(skb, list); | ||
4183 | __kfree_skb(skb); | ||
4184 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); | ||
4185 | skb = next; | ||
4186 | continue; | 4215 | continue; |
4187 | } | 4216 | } |
4188 | 4217 | ||
@@ -4228,7 +4257,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, | |||
4228 | memcpy(nskb->head, skb->head, header); | 4257 | memcpy(nskb->head, skb->head, header); |
4229 | memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); | 4258 | memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); |
4230 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; | 4259 | TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; |
4231 | __skb_insert(nskb, skb->prev, skb, list); | 4260 | __skb_queue_before(list, skb, nskb); |
4232 | skb_set_owner_r(nskb, sk); | 4261 | skb_set_owner_r(nskb, sk); |
4233 | 4262 | ||
4234 | /* Copy data, releasing collapsed skbs. */ | 4263 | /* Copy data, releasing collapsed skbs. */ |
@@ -4246,11 +4275,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, | |||
4246 | start += size; | 4275 | start += size; |
4247 | } | 4276 | } |
4248 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { | 4277 | if (!before(start, TCP_SKB_CB(skb)->end_seq)) { |
4249 | struct sk_buff *next = skb->next; | 4278 | skb = tcp_collapse_one(sk, skb, list); |
4250 | __skb_unlink(skb, list); | ||
4251 | __kfree_skb(skb); | ||
4252 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); | ||
4253 | skb = next; | ||
4254 | if (skb == tail || | 4279 | if (skb == tail || |
4255 | tcp_hdr(skb)->syn || | 4280 | tcp_hdr(skb)->syn || |
4256 | tcp_hdr(skb)->fin) | 4281 | tcp_hdr(skb)->fin) |
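
tcp_collapse_one() factors the unlink/free/advance sequence out of both call sites in tcp_collapse(), returning the successor so the caller's cursor stays valid after the node is gone. The shape of that helper on a plain singly linked list (illustrative types, not sk_buff):

    #include <stdio.h>
    #include <stdlib.h>

    struct node { int seq; struct node *next; };

    /* Unlink skb wherever it sits in *list, free it, and hand back its
     * successor so the caller's walk continues seamlessly. */
    static struct node *collapse_one(struct node **list, struct node *skb)
    {
        struct node **pp = list;
        struct node *next = skb->next;

        while (*pp && *pp != skb)
            pp = &(*pp)->next;
        if (*pp)
            *pp = next;
        free(skb);
        return next;
    }

    int main(void)
    {
        struct node *c = malloc(sizeof(*c));
        struct node *b = malloc(sizeof(*b));
        struct node *a = malloc(sizeof(*a));
        struct node *list, *cur;

        a->seq = 1; a->next = b;
        b->seq = 2; b->next = c;
        c->seq = 3; c->next = NULL;
        list = a;

        cur = list->next;               /* cursor at seq 2 */
        cur = collapse_one(&list, cur); /* drop it, cursor moves to seq 3 */
        printf("cursor now at seq %d\n", cur->seq);

        free(list->next);
        free(list);
        return 0;
    }
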
@@ -4436,8 +4461,8 @@ static void tcp_new_space(struct sock *sk) | |||
4436 | 4461 | ||
4437 | if (tcp_should_expand_sndbuf(sk)) { | 4462 | if (tcp_should_expand_sndbuf(sk)) { |
4438 | int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + | 4463 | int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + |
4439 | MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), | 4464 | MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); |
4440 | demanded = max_t(unsigned int, tp->snd_cwnd, | 4465 | int demanded = max_t(unsigned int, tp->snd_cwnd, |
4441 | tp->reordering + 1); | 4466 | tp->reordering + 1); |
4442 | sndmem *= 2 * demanded; | 4467 | sndmem *= 2 * demanded; |
4443 | if (sndmem > sk->sk_sndbuf) | 4468 | if (sndmem > sk->sk_sndbuf) |
@@ -4691,6 +4716,67 @@ out: | |||
4691 | } | 4716 | } |
4692 | #endif /* CONFIG_NET_DMA */ | 4717 | #endif /* CONFIG_NET_DMA */ |
4693 | 4718 | ||
4719 | /* Does PAWS and seqno-based validation of an incoming segment; flags will | ||
4720 | * play a significant role here. | ||
4721 | */ | ||
4722 | static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | ||
4723 | struct tcphdr *th, int syn_inerr) | ||
4724 | { | ||
4725 | struct tcp_sock *tp = tcp_sk(sk); | ||
4726 | |||
4727 | /* RFC1323: H1. Apply PAWS check first. */ | ||
4728 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && | ||
4729 | tcp_paws_discard(sk, skb)) { | ||
4730 | if (!th->rst) { | ||
4731 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); | ||
4732 | tcp_send_dupack(sk, skb); | ||
4733 | goto discard; | ||
4734 | } | ||
4735 | /* Reset is accepted even if it did not pass PAWS. */ | ||
4736 | } | ||
4737 | |||
4738 | /* Step 1: check sequence number */ | ||
4739 | if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { | ||
4740 | /* RFC793, page 37: "In all states except SYN-SENT, all reset | ||
4741 | * (RST) segments are validated by checking their SEQ-fields." | ||
4742 | * And page 69: "If an incoming segment is not acceptable, | ||
4743 | * an acknowledgment should be sent in reply (unless the RST | ||
4744 | * bit is set, if so drop the segment and return)". | ||
4745 | */ | ||
4746 | if (!th->rst) | ||
4747 | tcp_send_dupack(sk, skb); | ||
4748 | goto discard; | ||
4749 | } | ||
4750 | |||
4751 | /* Step 2: check RST bit */ | ||
4752 | if (th->rst) { | ||
4753 | tcp_reset(sk); | ||
4754 | goto discard; | ||
4755 | } | ||
4756 | |||
4757 | /* ts_recent update must be made after we are sure that the packet | ||
4758 | * is in window. | ||
4759 | */ | ||
4760 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); | ||
4761 | |||
4762 | /* step 3: check security and precedence [ignored] */ | ||
4763 | |||
4764 | /* step 4: Check for a SYN in window. */ | ||
4765 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { | ||
4766 | if (syn_inerr) | ||
4767 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); | ||
4768 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); | ||
4769 | tcp_reset(sk); | ||
4770 | return -1; | ||
4771 | } | ||
4772 | |||
4773 | return 1; | ||
4774 | |||
4775 | discard: | ||
4776 | __kfree_skb(skb); | ||
4777 | return 0; | ||
4778 | } | ||
4779 | |||
4694 | /* | 4780 | /* |
4695 | * TCP receive function for the ESTABLISHED state. | 4781 | * TCP receive function for the ESTABLISHED state. |
4696 | * | 4782 | * |
@@ -4718,6 +4804,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
4718 | struct tcphdr *th, unsigned len) | 4804 | struct tcphdr *th, unsigned len) |
4719 | { | 4805 | { |
4720 | struct tcp_sock *tp = tcp_sk(sk); | 4806 | struct tcp_sock *tp = tcp_sk(sk); |
4807 | int res; | ||
4721 | 4808 | ||
4722 | /* | 4809 | /* |
4723 | * Header prediction. | 4810 | * Header prediction. |
@@ -4756,19 +4843,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
4756 | 4843 | ||
4757 | /* Check timestamp */ | 4844 | /* Check timestamp */ |
4758 | if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { | 4845 | if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { |
4759 | __be32 *ptr = (__be32 *)(th + 1); | ||
4760 | |||
4761 | /* No? Slow path! */ | 4846 | /* No? Slow path! */ |
4762 | if (*ptr != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | 4847 | if (!tcp_parse_aligned_timestamp(tp, th)) |
4763 | | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) | ||
4764 | goto slow_path; | 4848 | goto slow_path; |
4765 | 4849 | ||
4766 | tp->rx_opt.saw_tstamp = 1; | ||
4767 | ++ptr; | ||
4768 | tp->rx_opt.rcv_tsval = ntohl(*ptr); | ||
4769 | ++ptr; | ||
4770 | tp->rx_opt.rcv_tsecr = ntohl(*ptr); | ||
4771 | |||
4772 | /* If PAWS failed, check it more carefully in slow path */ | 4850 | /* If PAWS failed, check it more carefully in slow path */ |
4773 | if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) | 4851 | if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) |
4774 | goto slow_path; | 4852 | goto slow_path; |
@@ -4900,51 +4978,12 @@ slow_path: | |||
4900 | goto csum_error; | 4978 | goto csum_error; |
4901 | 4979 | ||
4902 | /* | 4980 | /* |
4903 | * RFC1323: H1. Apply PAWS check first. | ||
4904 | */ | ||
4905 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && | ||
4906 | tcp_paws_discard(sk, skb)) { | ||
4907 | if (!th->rst) { | ||
4908 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); | ||
4909 | tcp_send_dupack(sk, skb); | ||
4910 | goto discard; | ||
4911 | } | ||
4912 | /* Resets are accepted even if PAWS failed. | ||
4913 | |||
4914 | ts_recent update must be made after we are sure | ||
4915 | that the packet is in window. | ||
4916 | */ | ||
4917 | } | ||
4918 | |||
4919 | /* | ||
4920 | * Standard slow path. | 4981 | * Standard slow path. |
4921 | */ | 4982 | */ |
4922 | 4983 | ||
4923 | if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { | 4984 | res = tcp_validate_incoming(sk, skb, th, 1); |
4924 | /* RFC793, page 37: "In all states except SYN-SENT, all reset | 4985 | if (res <= 0) |
4925 | * (RST) segments are validated by checking their SEQ-fields." | 4986 | return -res; |
4926 | * And page 69: "If an incoming segment is not acceptable, | ||
4927 | * an acknowledgment should be sent in reply (unless the RST bit | ||
4928 | * is set, if so drop the segment and return)". | ||
4929 | */ | ||
4930 | if (!th->rst) | ||
4931 | tcp_send_dupack(sk, skb); | ||
4932 | goto discard; | ||
4933 | } | ||
4934 | |||
4935 | if (th->rst) { | ||
4936 | tcp_reset(sk); | ||
4937 | goto discard; | ||
4938 | } | ||
4939 | |||
4940 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); | ||
4941 | |||
4942 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { | ||
4943 | TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); | ||
4944 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); | ||
4945 | tcp_reset(sk); | ||
4946 | return 1; | ||
4947 | } | ||
4948 | 4987 | ||
4949 | step5: | 4988 | step5: |
4950 | if (th->ack) | 4989 | if (th->ack) |
@@ -5226,6 +5265,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
5226 | struct tcp_sock *tp = tcp_sk(sk); | 5265 | struct tcp_sock *tp = tcp_sk(sk); |
5227 | struct inet_connection_sock *icsk = inet_csk(sk); | 5266 | struct inet_connection_sock *icsk = inet_csk(sk); |
5228 | int queued = 0; | 5267 | int queued = 0; |
5268 | int res; | ||
5229 | 5269 | ||
5230 | tp->rx_opt.saw_tstamp = 0; | 5270 | tp->rx_opt.saw_tstamp = 0; |
5231 | 5271 | ||
@@ -5278,42 +5318,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
5278 | return 0; | 5318 | return 0; |
5279 | } | 5319 | } |
5280 | 5320 | ||
5281 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && | 5321 | res = tcp_validate_incoming(sk, skb, th, 0); |
5282 | tcp_paws_discard(sk, skb)) { | 5322 | if (res <= 0) |
5283 | if (!th->rst) { | 5323 | return -res; |
5284 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); | ||
5285 | tcp_send_dupack(sk, skb); | ||
5286 | goto discard; | ||
5287 | } | ||
5288 | /* Reset is accepted even if it did not pass PAWS. */ | ||
5289 | } | ||
5290 | |||
5291 | /* step 1: check sequence number */ | ||
5292 | if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { | ||
5293 | if (!th->rst) | ||
5294 | tcp_send_dupack(sk, skb); | ||
5295 | goto discard; | ||
5296 | } | ||
5297 | |||
5298 | /* step 2: check RST bit */ | ||
5299 | if (th->rst) { | ||
5300 | tcp_reset(sk); | ||
5301 | goto discard; | ||
5302 | } | ||
5303 | |||
5304 | tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); | ||
5305 | |||
5306 | /* step 3: check security and precedence [ignored] */ | ||
5307 | |||
5308 | /* step 4: | ||
5309 | * | ||
5310 | * Check for a SYN in window. | ||
5311 | */ | ||
5312 | if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { | ||
5313 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); | ||
5314 | tcp_reset(sk); | ||
5315 | return 1; | ||
5316 | } | ||
5317 | 5324 | ||
5318 | /* step 5: check the ACK field */ | 5325 | /* step 5: check the ACK field */ |
5319 | if (th->ack) { | 5326 | if (th->ack) { |
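
tcp_validate_incoming() gives the PAWS/sequence/RST/SYN checks shared by the established path and the state machine a three-way result: 1 to continue processing, 0 when the segment was consumed (dropped or dup-ACKed), -1 when the connection was reset. Both callers then reduce the old duplicated blocks to a guard. A schematic of that convention, with boolean parameters standing in for the real checks:

    #include <stdio.h>

    /* 1 = proceed, 0 = segment consumed, -1 = fatal (connection reset) */
    static int validate_incoming(int seq_ok, int rst, int syn_in_window)
    {
        if (!seq_ok)
            return 0;          /* dup-ACK sent, segment dropped */
        if (rst)
            return 0;          /* reset processed, nothing more to do */
        if (syn_in_window)
            return -1;         /* abort: caller must report an error */
        return 1;
    }

    static int rcv_established(int seq_ok, int rst, int syn_in_window)
    {
        int res = validate_incoming(seq_ok, rst, syn_in_window);

        /* -res maps {0, -1} onto the caller's {0, 1} return convention. */
        if (res <= 0)
            return -res;

        /* ... normal step 5+ processing would continue here ... */
        return 0;
    }

    int main(void)
    {
        printf("%d\n", rcv_established(1, 0, 0)); /* 0: processed */
        printf("%d\n", rcv_established(1, 0, 1)); /* 1: aborted */
        return 0;
    }
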
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 011478e46c40..5c8fa7f1e327 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -583,14 +583,15 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | |||
583 | rep.th.doff = arg.iov[0].iov_len / 4; | 583 | rep.th.doff = arg.iov[0].iov_len / 4; |
584 | 584 | ||
585 | tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], | 585 | tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], |
586 | key, ip_hdr(skb)->daddr, | 586 | key, ip_hdr(skb)->saddr, |
587 | ip_hdr(skb)->saddr, &rep.th); | 587 | ip_hdr(skb)->daddr, &rep.th); |
588 | } | 588 | } |
589 | #endif | 589 | #endif |
590 | arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, | 590 | arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, |
591 | ip_hdr(skb)->saddr, /* XXX */ | 591 | ip_hdr(skb)->saddr, /* XXX */ |
592 | sizeof(struct tcphdr), IPPROTO_TCP, 0); | 592 | arg.iov[0].iov_len, IPPROTO_TCP, 0); |
593 | arg.csumoffset = offsetof(struct tcphdr, check) / 2; | 593 | arg.csumoffset = offsetof(struct tcphdr, check) / 2; |
594 | arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; | ||
594 | 595 | ||
595 | net = dev_net(skb->dst->dev); | 596 | net = dev_net(skb->dst->dev); |
596 | ip_send_reply(net->ipv4.tcp_sock, skb, | 597 | ip_send_reply(net->ipv4.tcp_sock, skb, |
@@ -606,7 +607,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) | |||
606 | 607 | ||
607 | static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, | 608 | static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, |
608 | u32 win, u32 ts, int oif, | 609 | u32 win, u32 ts, int oif, |
609 | struct tcp_md5sig_key *key) | 610 | struct tcp_md5sig_key *key, |
611 | int reply_flags) | ||
610 | { | 612 | { |
611 | struct tcphdr *th = tcp_hdr(skb); | 613 | struct tcphdr *th = tcp_hdr(skb); |
612 | struct { | 614 | struct { |
@@ -659,6 +661,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, | |||
659 | ip_hdr(skb)->daddr, &rep.th); | 661 | ip_hdr(skb)->daddr, &rep.th); |
660 | } | 662 | } |
661 | #endif | 663 | #endif |
664 | arg.flags = reply_flags; | ||
662 | arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, | 665 | arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, |
663 | ip_hdr(skb)->saddr, /* XXX */ | 666 | ip_hdr(skb)->saddr, /* XXX */ |
664 | arg.iov[0].iov_len, IPPROTO_TCP, 0); | 667 | arg.iov[0].iov_len, IPPROTO_TCP, 0); |
@@ -681,7 +684,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) | |||
681 | tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, | 684 | tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, |
682 | tcptw->tw_ts_recent, | 685 | tcptw->tw_ts_recent, |
683 | tw->tw_bound_dev_if, | 686 | tw->tw_bound_dev_if, |
684 | tcp_twsk_md5_key(tcptw) | 687 | tcp_twsk_md5_key(tcptw), |
688 | tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0 | ||
685 | ); | 689 | ); |
686 | 690 | ||
687 | inet_twsk_put(tw); | 691 | inet_twsk_put(tw); |
@@ -694,7 +698,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, | |||
694 | tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, | 698 | tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, |
695 | req->ts_recent, | 699 | req->ts_recent, |
696 | 0, | 700 | 0, |
697 | tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr)); | 701 | tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr), |
702 | inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0); | ||
698 | } | 703 | } |
699 | 704 | ||
700 | /* | 705 | /* |
@@ -1244,6 +1249,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) | |||
1244 | ireq = inet_rsk(req); | 1249 | ireq = inet_rsk(req); |
1245 | ireq->loc_addr = daddr; | 1250 | ireq->loc_addr = daddr; |
1246 | ireq->rmt_addr = saddr; | 1251 | ireq->rmt_addr = saddr; |
1252 | ireq->no_srccheck = inet_sk(sk)->transparent; | ||
1247 | ireq->opt = tcp_v4_save_options(sk, skb); | 1253 | ireq->opt = tcp_v4_save_options(sk, skb); |
1248 | if (!want_cookie) | 1254 | if (!want_cookie) |
1249 | TCP_ECN_create_request(req, tcp_hdr(skb)); | 1255 | TCP_ECN_create_request(req, tcp_hdr(skb)); |
@@ -1364,6 +1370,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, | |||
1364 | tcp_mtup_init(newsk); | 1370 | tcp_mtup_init(newsk); |
1365 | tcp_sync_mss(newsk, dst_mtu(dst)); | 1371 | tcp_sync_mss(newsk, dst_mtu(dst)); |
1366 | newtp->advmss = dst_metric(dst, RTAX_ADVMSS); | 1372 | newtp->advmss = dst_metric(dst, RTAX_ADVMSS); |
1373 | if (tcp_sk(sk)->rx_opt.user_mss && | ||
1374 | tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) | ||
1375 | newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; | ||
1376 | |||
1367 | tcp_initialize_rcv_mss(newsk); | 1377 | tcp_initialize_rcv_mss(newsk); |
1368 | 1378 | ||
1369 | #ifdef CONFIG_TCP_MD5SIG | 1379 | #ifdef CONFIG_TCP_MD5SIG |
@@ -1567,8 +1577,7 @@ int tcp_v4_rcv(struct sk_buff *skb) | |||
1567 | TCP_SKB_CB(skb)->flags = iph->tos; | 1577 | TCP_SKB_CB(skb)->flags = iph->tos; |
1568 | TCP_SKB_CB(skb)->sacked = 0; | 1578 | TCP_SKB_CB(skb)->sacked = 0; |
1569 | 1579 | ||
1570 | sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr, | 1580 | sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); |
1571 | th->source, iph->daddr, th->dest, inet_iif(skb)); | ||
1572 | if (!sk) | 1581 | if (!sk) |
1573 | goto no_tcp_socket; | 1582 | goto no_tcp_socket; |
1574 | 1583 | ||
@@ -1946,6 +1955,12 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) | |||
1946 | return rc; | 1955 | return rc; |
1947 | } | 1956 | } |
1948 | 1957 | ||
1958 | static inline int empty_bucket(struct tcp_iter_state *st) | ||
1959 | { | ||
1960 | return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) && | ||
1961 | hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain); | ||
1962 | } | ||
1963 | |||
1949 | static void *established_get_first(struct seq_file *seq) | 1964 | static void *established_get_first(struct seq_file *seq) |
1950 | { | 1965 | { |
1951 | struct tcp_iter_state* st = seq->private; | 1966 | struct tcp_iter_state* st = seq->private; |
@@ -1958,6 +1973,10 @@ static void *established_get_first(struct seq_file *seq) | |||
1958 | struct inet_timewait_sock *tw; | 1973 | struct inet_timewait_sock *tw; |
1959 | rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); | 1974 | rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); |
1960 | 1975 | ||
1976 | /* Lockless fast path for the common case of empty buckets */ | ||
1977 | if (empty_bucket(st)) | ||
1978 | continue; | ||
1979 | |||
1961 | read_lock_bh(lock); | 1980 | read_lock_bh(lock); |
1962 | sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { | 1981 | sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { |
1963 | if (sk->sk_family != st->family || | 1982 | if (sk->sk_family != st->family || |
@@ -2008,13 +2027,15 @@ get_tw: | |||
2008 | read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); | 2027 | read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); |
2009 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 2028 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
2010 | 2029 | ||
2011 | if (++st->bucket < tcp_hashinfo.ehash_size) { | 2030 | /* Look for next non-empty bucket */
2012 | read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); | 2031 | while (++st->bucket < tcp_hashinfo.ehash_size && |
2013 | sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); | 2032 | empty_bucket(st)) |
2014 | } else { | 2033 | ; |
2015 | cur = NULL; | 2034 | if (st->bucket >= tcp_hashinfo.ehash_size) |
2016 | goto out; | 2035 | return NULL; |
2017 | } | 2036 | |
2037 | read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); | ||
2038 | sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); | ||
2018 | } else | 2039 | } else |
2019 | sk = sk_next(sk); | 2040 | sk = sk_next(sk); |
2020 | 2041 | ||
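
The /proc/net/tcp walker gets the same optimization as the route dump: empty_bucket() checks both the established and time-wait chains locklessly, and the next-bucket loop spins past empty slots with an empty body before any lock is taken. The skip loop in isolation, with an array of flags standing in for the hash chains:

    #include <stdio.h>

    #define EHASH_SIZE 16
    static int occupied[EHASH_SIZE] = { [3] = 1, [9] = 1 };

    static int empty_bucket(int b)
    {
        return !occupied[b];
    }

    /* Return the next non-empty bucket after 'bucket', or -1 at the end;
     * the lock (here just a printf) is only taken for buckets with work. */
    static int next_bucket(int bucket)
    {
        while (++bucket < EHASH_SIZE && empty_bucket(bucket))
            ;
        if (bucket >= EHASH_SIZE)
            return -1;
        printf("lock bucket %d\n", bucket);
        return bucket;
    }

    int main(void)
    {
        int b = -1;

        while ((b = next_bucket(b)) >= 0)
            ;
        return 0;
    }
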
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f976fc57892c..779f2e9d0689 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -395,6 +395,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
395 | newtp->pred_flags = 0; | 395 | newtp->pred_flags = 0; |
396 | newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; | 396 | newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; |
397 | newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; | 397 | newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; |
398 | newtp->snd_up = treq->snt_isn + 1; | ||
398 | 399 | ||
399 | tcp_prequeue_init(newtp); | 400 | tcp_prequeue_init(newtp); |
400 | 401 | ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8165f5aa8c71..990a58493235 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -345,6 +345,11 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) | |||
345 | TCP_SKB_CB(skb)->end_seq = seq; | 345 | TCP_SKB_CB(skb)->end_seq = seq; |
346 | } | 346 | } |
347 | 347 | ||
348 | static inline int tcp_urg_mode(const struct tcp_sock *tp) | ||
349 | { | ||
350 | return tp->snd_una != tp->snd_up; | ||
351 | } | ||
352 | |||
348 | #define OPTION_SACK_ADVERTISE (1 << 0) | 353 | #define OPTION_SACK_ADVERTISE (1 << 0) |
349 | #define OPTION_TS (1 << 1) | 354 | #define OPTION_TS (1 << 1) |
350 | #define OPTION_MD5 (1 << 2) | 355 | #define OPTION_MD5 (1 << 2) |
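
The tcp_output.c side of the urgent-pointer rework deletes the tp->urg_mode flag: urgent mode is now derived on demand from sequence state, since snd_up only runs ahead of snd_una while an urgent point is outstanding (tcp_minisocks.c accordingly seeds snd_up at connection creation, and tcp_clean_rtx_queue() above advances it with snd_una). A tiny model of deriving the mode instead of caching it, using an illustrative struct:

    #include <stdio.h>
    #include <stdint.h>

    struct tp { uint32_t snd_una, snd_up, write_seq; };

    /* Urgent mode holds exactly while the urgent point sits ahead of the
     * lowest unacknowledged byte; there is no flag to set or clear. */
    static int tcp_urg_mode(const struct tp *tp)
    {
        return tp->snd_una != tp->snd_up;
    }

    static void mark_urg(struct tp *tp)
    {
        tp->snd_up = tp->write_seq;   /* MSG_OOB: urgent point = queue tail */
    }

    int main(void)
    {
        struct tp tp = { .snd_una = 100, .snd_up = 100, .write_seq = 150 };

        printf("urg before: %d\n", tcp_urg_mode(&tp)); /* 0 */
        mark_urg(&tp);
        printf("urg after:  %d\n", tcp_urg_mode(&tp)); /* 1 */
        tp.snd_una = 150;             /* everything up to snd_up ACKed... */
        tp.snd_up  = tp.snd_una;      /* ...so snd_up advances with it */
        printf("urg acked:  %d\n", tcp_urg_mode(&tp)); /* 0 */
        return 0;
    }
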
@@ -646,7 +651,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
646 | th->check = 0; | 651 | th->check = 0; |
647 | th->urg_ptr = 0; | 652 | th->urg_ptr = 0; |
648 | 653 | ||
649 | if (unlikely(tp->urg_mode && | 654 | /* The urg_mode check is necessary during a window probe below snd_una */
655 | if (unlikely(tcp_urg_mode(tp) && | ||
650 | between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { | 656 | between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { |
651 | th->urg_ptr = htons(tp->snd_up - tcb->seq); | 657 | th->urg_ptr = htons(tp->snd_up - tcb->seq); |
652 | th->urg = 1; | 658 | th->urg = 1; |
@@ -1012,7 +1018,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) | |||
1012 | /* Compute the current effective MSS, taking SACKs and IP options, | 1018 | /* Compute the current effective MSS, taking SACKs and IP options, |
1013 | * and even PMTU discovery events into account. | 1019 | * and even PMTU discovery events into account. |
1014 | * | 1020 | * |
1015 | * LARGESEND note: !urg_mode is overkill, only frames up to snd_up | 1021 | * LARGESEND note: !tcp_urg_mode is overkill, only frames up to snd_up |
1016 | * cannot be large. However, taking into account rare use of URG, this | 1022 | * cannot be large. However, taking into account rare use of URG, this |
1017 | * is not a big flaw. | 1023 | * is not a big flaw. |
1018 | */ | 1024 | */ |
@@ -1029,7 +1035,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) | |||
1029 | 1035 | ||
1030 | mss_now = tp->mss_cache; | 1036 | mss_now = tp->mss_cache; |
1031 | 1037 | ||
1032 | if (large_allowed && sk_can_gso(sk) && !tp->urg_mode) | 1038 | if (large_allowed && sk_can_gso(sk) && !tcp_urg_mode(tp)) |
1033 | doing_tso = 1; | 1039 | doing_tso = 1; |
1034 | 1040 | ||
1035 | if (dst) { | 1041 | if (dst) { |
@@ -1193,7 +1199,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, | |||
1193 | /* Don't use the nagle rule for urgent data (or for the final FIN). | 1199 | /* Don't use the nagle rule for urgent data (or for the final FIN). |
1194 | * Nagle can be ignored during F-RTO too (see RFC4138). | 1200 | * Nagle can be ignored during F-RTO too (see RFC4138). |
1195 | */ | 1201 | */ |
1196 | if (tp->urg_mode || (tp->frto_counter == 2) || | 1202 | if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || |
1197 | (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) | 1203 | (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) |
1198 | return 1; | 1204 | return 1; |
1199 | 1205 | ||
@@ -1824,6 +1830,8 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, | |||
1824 | 1830 | ||
1825 | /* changed transmit queue under us so clear hints */ | 1831 | /* changed transmit queue under us so clear hints */ |
1826 | tcp_clear_retrans_hints_partial(tp); | 1832 | tcp_clear_retrans_hints_partial(tp); |
1833 | if (next_skb == tp->retransmit_skb_hint) | ||
1834 | tp->retransmit_skb_hint = skb; | ||
1827 | 1835 | ||
1828 | sk_wmem_free_skb(sk, next_skb); | 1836 | sk_wmem_free_skb(sk, next_skb); |
1829 | } | 1837 | } |
@@ -1838,7 +1846,7 @@ void tcp_simple_retransmit(struct sock *sk) | |||
1838 | struct tcp_sock *tp = tcp_sk(sk); | 1846 | struct tcp_sock *tp = tcp_sk(sk); |
1839 | struct sk_buff *skb; | 1847 | struct sk_buff *skb; |
1840 | unsigned int mss = tcp_current_mss(sk, 0); | 1848 | unsigned int mss = tcp_current_mss(sk, 0); |
1841 | int lost = 0; | 1849 | u32 prior_lost = tp->lost_out; |
1842 | 1850 | ||
1843 | tcp_for_write_queue(skb, sk) { | 1851 | tcp_for_write_queue(skb, sk) { |
1844 | if (skb == tcp_send_head(sk)) | 1852 | if (skb == tcp_send_head(sk)) |
@@ -1849,17 +1857,13 @@ void tcp_simple_retransmit(struct sock *sk) | |||
1849 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; | 1857 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
1850 | tp->retrans_out -= tcp_skb_pcount(skb); | 1858 | tp->retrans_out -= tcp_skb_pcount(skb); |
1851 | } | 1859 | } |
1852 | if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) { | 1860 | tcp_skb_mark_lost_uncond_verify(tp, skb); |
1853 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | ||
1854 | tp->lost_out += tcp_skb_pcount(skb); | ||
1855 | lost = 1; | ||
1856 | } | ||
1857 | } | 1861 | } |
1858 | } | 1862 | } |
1859 | 1863 | ||
1860 | tcp_clear_all_retrans_hints(tp); | 1864 | tcp_clear_retrans_hints_partial(tp); |
1861 | 1865 | ||
1862 | if (!lost) | 1866 | if (prior_lost == tp->lost_out) |
1863 | return; | 1867 | return; |
1864 | 1868 | ||
1865 | if (tcp_is_reno(tp)) | 1869 | if (tcp_is_reno(tp)) |
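tcp_simple_retransmit() now detects whether anything was newly marked lost by snapshotting lost_out, and the marking itself moves into tcp_skb_mark_lost_uncond_verify(). A rough userspace model of that pattern (field and flag names are stand-ins; the real helper lives in the TCP input path and also repairs the retransmit hint):

#include <stdint.h>

enum { TCPCB_LOST = 0x1, TCPCB_SACKED_ACKED = 0x2 };

struct tp_stub  { uint32_t lost_out; };
struct skb_stub { unsigned sacked; unsigned pcount; };

/* Idempotent loss marking: count a segment once, never one already SACKed. */
static void mark_lost_uncond(struct tp_stub *tp, struct skb_stub *skb)
{
    if (!(skb->sacked & (TCPCB_LOST | TCPCB_SACKED_ACKED))) {
        tp->lost_out += skb->pcount;
        skb->sacked |= TCPCB_LOST;
    }
}

/* Caller pattern, as in the hunk above:
 *
 *    uint32_t prior_lost = tp->lost_out;
 *    ...mark every too-large candidate via mark_lost_uncond()...
 *    if (prior_lost == tp->lost_out)
 *        return;    // nothing newly lost, skip the recovery work
 */

Because marking is idempotent and keeps lost_out consistent, comparing counters replaces the old ad-hoc "lost" local flag.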
@@ -1934,8 +1938,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
1934 | /* Collapse two adjacent packets if worthwhile and we can. */ | 1938 | /* Collapse two adjacent packets if worthwhile and we can. */ |
1935 | if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && | 1939 | if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && |
1936 | (skb->len < (cur_mss >> 1)) && | 1940 | (skb->len < (cur_mss >> 1)) && |
1937 | (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && | ||
1938 | (!tcp_skb_is_last(sk, skb)) && | 1941 | (!tcp_skb_is_last(sk, skb)) && |
1942 | (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && | ||
1939 | (skb_shinfo(skb)->nr_frags == 0 && | 1943 | (skb_shinfo(skb)->nr_frags == 0 && |
1940 | skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && | 1944 | skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && |
1941 | (tcp_skb_pcount(skb) == 1 && | 1945 | (tcp_skb_pcount(skb) == 1 && |
@@ -1996,86 +2000,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
1996 | return err; | 2000 | return err; |
1997 | } | 2001 | } |
1998 | 2002 | ||
1999 | /* This gets called after a retransmit timeout, and the initially | 2003 | static int tcp_can_forward_retransmit(struct sock *sk) |
2000 | * retransmitted data is acknowledged. It tries to continue | ||
2001 | * resending the rest of the retransmit queue, until either | ||
2002 | * we've sent it all or the congestion window limit is reached. | ||
2003 | * If doing SACK, the first ACK which comes back for a timeout | ||
2004 | * based retransmit packet might feed us FACK information again. | ||
2005 | * If so, we use it to avoid unnecessary retransmissions. | ||
2006 | */ | ||
2007 | void tcp_xmit_retransmit_queue(struct sock *sk) | ||
2008 | { | 2004 | { |
2009 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2005 | const struct inet_connection_sock *icsk = inet_csk(sk); |
2010 | struct tcp_sock *tp = tcp_sk(sk); | 2006 | struct tcp_sock *tp = tcp_sk(sk); |
2011 | struct sk_buff *skb; | ||
2012 | int packet_cnt; | ||
2013 | |||
2014 | if (tp->retransmit_skb_hint) { | ||
2015 | skb = tp->retransmit_skb_hint; | ||
2016 | packet_cnt = tp->retransmit_cnt_hint; | ||
2017 | } else { | ||
2018 | skb = tcp_write_queue_head(sk); | ||
2019 | packet_cnt = 0; | ||
2020 | } | ||
2021 | |||
2022 | /* First pass: retransmit lost packets. */ | ||
2023 | if (tp->lost_out) { | ||
2024 | tcp_for_write_queue_from(skb, sk) { | ||
2025 | __u8 sacked = TCP_SKB_CB(skb)->sacked; | ||
2026 | |||
2027 | if (skb == tcp_send_head(sk)) | ||
2028 | break; | ||
2029 | /* we could do better than to assign each time */ | ||
2030 | tp->retransmit_skb_hint = skb; | ||
2031 | tp->retransmit_cnt_hint = packet_cnt; | ||
2032 | |||
2033 | /* Assume this retransmit will generate | ||
2034 | * only one packet for congestion window | ||
2035 | * calculation purposes. This works because | ||
2036 | * tcp_retransmit_skb() will chop up the | ||
2037 | * packet to be MSS sized and all the | ||
2038 | * packet counting works out. | ||
2039 | */ | ||
2040 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) | ||
2041 | return; | ||
2042 | |||
2043 | if (sacked & TCPCB_LOST) { | ||
2044 | if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { | ||
2045 | int mib_idx; | ||
2046 | |||
2047 | if (tcp_retransmit_skb(sk, skb)) { | ||
2048 | tp->retransmit_skb_hint = NULL; | ||
2049 | return; | ||
2050 | } | ||
2051 | if (icsk->icsk_ca_state != TCP_CA_Loss) | ||
2052 | mib_idx = LINUX_MIB_TCPFASTRETRANS; | ||
2053 | else | ||
2054 | mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; | ||
2055 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | ||
2056 | |||
2057 | if (skb == tcp_write_queue_head(sk)) | ||
2058 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | ||
2059 | inet_csk(sk)->icsk_rto, | ||
2060 | TCP_RTO_MAX); | ||
2061 | } | ||
2062 | |||
2063 | packet_cnt += tcp_skb_pcount(skb); | ||
2064 | if (packet_cnt >= tp->lost_out) | ||
2065 | break; | ||
2066 | } | ||
2067 | } | ||
2068 | } | ||
2069 | |||
2070 | /* OK, demanded retransmission is finished. */ | ||
2071 | 2007 | ||
2072 | /* Forward retransmissions are possible only during Recovery. */ | 2008 | /* Forward retransmissions are possible only during Recovery. */ |
2073 | if (icsk->icsk_ca_state != TCP_CA_Recovery) | 2009 | if (icsk->icsk_ca_state != TCP_CA_Recovery) |
2074 | return; | 2010 | return 0; |
2075 | 2011 | ||
2076 | /* No forward retransmissions in Reno are possible. */ | 2012 | /* No forward retransmissions in Reno are possible. */ |
2077 | if (tcp_is_reno(tp)) | 2013 | if (tcp_is_reno(tp)) |
2078 | return; | 2014 | return 0; |
2079 | 2015 | ||
2080 | /* Yeah, we have to make a difficult choice between forward transmission | 2016 | /* Yeah, we have to make a difficult choice between forward transmission |
2081 | * and retransmission... Both ways have their merits... | 2017 | * and retransmission... Both ways have their merits... |
@@ -2086,43 +2022,104 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
2086 | */ | 2022 | */ |
2087 | 2023 | ||
2088 | if (tcp_may_send_now(sk)) | 2024 | if (tcp_may_send_now(sk)) |
2089 | return; | 2025 | return 0; |
2090 | 2026 | ||
2091 | /* If nothing is SACKed, highest_sack in the loop won't be valid */ | 2027 | return 1; |
2092 | if (!tp->sacked_out) | 2028 | } |
2093 | return; | ||
2094 | 2029 | ||
2095 | if (tp->forward_skb_hint) | 2030 | /* This gets called after a retransmit timeout, and the initially |
2096 | skb = tp->forward_skb_hint; | 2031 | * retransmitted data is acknowledged. It tries to continue |
2097 | else | 2032 | * resending the rest of the retransmit queue, until either |
2033 | * we've sent it all or the congestion window limit is reached. | ||
2034 | * If doing SACK, the first ACK which comes back for a timeout | ||
2035 | * based retransmit packet might feed us FACK information again. | ||
2036 | * If so, we use it to avoid unnecessary retransmissions. | ||
2037 | */ | ||
2038 | void tcp_xmit_retransmit_queue(struct sock *sk) | ||
2039 | { | ||
2040 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
2041 | struct tcp_sock *tp = tcp_sk(sk); | ||
2042 | struct sk_buff *skb; | ||
2043 | struct sk_buff *hole = NULL; | ||
2044 | u32 last_lost; | ||
2045 | int mib_idx; | ||
2046 | int fwd_rexmitting = 0; | ||
2047 | |||
2048 | if (!tp->lost_out) | ||
2049 | tp->retransmit_high = tp->snd_una; | ||
2050 | |||
2051 | if (tp->retransmit_skb_hint) { | ||
2052 | skb = tp->retransmit_skb_hint; | ||
2053 | last_lost = TCP_SKB_CB(skb)->end_seq; | ||
2054 | if (after(last_lost, tp->retransmit_high)) | ||
2055 | last_lost = tp->retransmit_high; | ||
2056 | } else { | ||
2098 | skb = tcp_write_queue_head(sk); | 2057 | skb = tcp_write_queue_head(sk); |
2058 | last_lost = tp->snd_una; | ||
2059 | } | ||
2099 | 2060 | ||
2061 | /* First pass: retransmit lost packets. */ | ||
2100 | tcp_for_write_queue_from(skb, sk) { | 2062 | tcp_for_write_queue_from(skb, sk) { |
2101 | if (skb == tcp_send_head(sk)) | 2063 | __u8 sacked = TCP_SKB_CB(skb)->sacked; |
2102 | break; | ||
2103 | tp->forward_skb_hint = skb; | ||
2104 | 2064 | ||
2105 | if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) | 2065 | if (skb == tcp_send_head(sk)) |
2106 | break; | 2066 | break; |
2067 | /* we could do better than to assign each time */ | ||
2068 | if (hole == NULL) | ||
2069 | tp->retransmit_skb_hint = skb; | ||
2107 | 2070 | ||
2071 | /* Assume this retransmit will generate | ||
2072 | * only one packet for congestion window | ||
2073 | * calculation purposes. This works because | ||
2074 | * tcp_retransmit_skb() will chop up the | ||
2075 | * packet to be MSS sized and all the | ||
2076 | * packet counting works out. | ||
2077 | */ | ||
2108 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) | 2078 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) |
2109 | break; | 2079 | return; |
2080 | |||
2081 | if (fwd_rexmitting) { | ||
2082 | begin_fwd: | ||
2083 | if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) | ||
2084 | break; | ||
2085 | mib_idx = LINUX_MIB_TCPFORWARDRETRANS; | ||
2086 | |||
2087 | } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { | ||
2088 | tp->retransmit_high = last_lost; | ||
2089 | if (!tcp_can_forward_retransmit(sk)) | ||
2090 | break; | ||
2091 | /* Backtrack if necessary to an skb not tagged lost */ | ||
2092 | if (hole != NULL) { | ||
2093 | skb = hole; | ||
2094 | hole = NULL; | ||
2095 | } | ||
2096 | fwd_rexmitting = 1; | ||
2097 | goto begin_fwd; | ||
2110 | 2098 | ||
2111 | if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) | 2099 | } else if (!(sacked & TCPCB_LOST)) { |
2100 | if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS)) | ||
2101 | hole = skb; | ||
2112 | continue; | 2102 | continue; |
2113 | 2103 | ||
2114 | /* Ok, retransmit it. */ | 2104 | } else { |
2115 | if (tcp_retransmit_skb(sk, skb)) { | 2105 | last_lost = TCP_SKB_CB(skb)->end_seq; |
2116 | tp->forward_skb_hint = NULL; | 2106 | if (icsk->icsk_ca_state != TCP_CA_Loss) |
2117 | break; | 2107 | mib_idx = LINUX_MIB_TCPFASTRETRANS; |
2108 | else | ||
2109 | mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; | ||
2118 | } | 2110 | } |
2119 | 2111 | ||
2112 | if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) | ||
2113 | continue; | ||
2114 | |||
2115 | if (tcp_retransmit_skb(sk, skb)) | ||
2116 | return; | ||
2117 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | ||
2118 | |||
2120 | if (skb == tcp_write_queue_head(sk)) | 2119 | if (skb == tcp_write_queue_head(sk)) |
2121 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 2120 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
2122 | inet_csk(sk)->icsk_rto, | 2121 | inet_csk(sk)->icsk_rto, |
2123 | TCP_RTO_MAX); | 2122 | TCP_RTO_MAX); |
2124 | |||
2125 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFORWARDRETRANS); | ||
2126 | } | 2123 | } |
2127 | } | 2124 | } |
2128 | 2125 | ||
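The rewritten tcp_xmit_retransmit_queue() folds the old two-pass walk (lost segments first, then forward retransmissions) into a single pass: it retransmits LOST segments below retransmit_high, remembers the first unmarked segment as a hole, and on leaving the lost range backtracks to that hole to forward-retransmit up to the highest SACKed sequence. A simplified, runnable model over an array of segments; the cwnd check, RTO-timer re-arm, MIB counters and the tcp_can_forward_retransmit() gate are omitted, and the flag names are stand-ins:

#include <stdio.h>

enum { F_LOST = 0x1, F_SACKED = 0x2, F_RETRANS = 0x4 };

struct seg { unsigned seq; unsigned flags; };

static void xmit_walk(struct seg *q, int n,
                      unsigned retransmit_high, unsigned highest_sack_seq)
{
    int hole = -1;      /* first candidate for forward retransmission */
    int fwd = 0;

    for (int i = 0; i < n; i++) {
        struct seg *s = &q[i];

        if (fwd) {
            if (s->seq >= highest_sack_seq)
                break;              /* nothing SACKed beyond this point */
        } else if (s->seq >= retransmit_high) {
            fwd = 1;                /* lost range done: switch modes */
            if (hole >= 0) {
                i = hole - 1;       /* backtrack the walk to the hole */
                hole = -1;
            } else {
                i--;                /* revisit this segment in fwd mode */
            }
            continue;
        } else if (!(s->flags & F_LOST)) {
            if (hole < 0 && !(s->flags & F_RETRANS))
                hole = i;           /* remember the first unmarked skb */
            continue;
        }

        if (s->flags & (F_SACKED | F_RETRANS))
            continue;               /* already delivered or in flight */

        printf("retransmit %u (%s)\n", s->seq, fwd ? "forward" : "lost");
        s->flags |= F_RETRANS;
    }
}

int main(void)
{
    struct seg q[] = {
        { 1000, F_LOST }, { 2000, 0 }, { 3000, F_LOST | F_RETRANS },
        { 4000, F_SACKED }, { 5000, 0 }, { 6000, F_SACKED },
    };

    /* lost range ends at 4000; SACK scoreboard reaches 6000 */
    xmit_walk(q, 6, 4000, 6000);
    return 0;
}

With this sample scoreboard the walk retransmits 1000 as lost, then backtracks to the hole and sends 2000 and 5000 as forward retransmissions, skipping everything SACKed or already in flight.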
@@ -2241,6 +2238,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2241 | struct sk_buff *skb; | 2238 | struct sk_buff *skb; |
2242 | struct tcp_md5sig_key *md5; | 2239 | struct tcp_md5sig_key *md5; |
2243 | __u8 *md5_hash_location; | 2240 | __u8 *md5_hash_location; |
2241 | int mss; | ||
2244 | 2242 | ||
2245 | skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); | 2243 | skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); |
2246 | if (skb == NULL) | 2244 | if (skb == NULL) |
@@ -2251,13 +2249,17 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2251 | 2249 | ||
2252 | skb->dst = dst_clone(dst); | 2250 | skb->dst = dst_clone(dst); |
2253 | 2251 | ||
2252 | mss = dst_metric(dst, RTAX_ADVMSS); | ||
2253 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) | ||
2254 | mss = tp->rx_opt.user_mss; | ||
2255 | |||
2254 | if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ | 2256 | if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ |
2255 | __u8 rcv_wscale; | 2257 | __u8 rcv_wscale; |
2256 | /* Set this up on the first call only */ | 2258 | /* Set this up on the first call only */ |
2257 | req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); | 2259 | req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); |
2258 | /* tcp_full_space because it is guaranteed to be the first packet */ | 2260 | /* tcp_full_space because it is guaranteed to be the first packet */ |
2259 | tcp_select_initial_window(tcp_full_space(sk), | 2261 | tcp_select_initial_window(tcp_full_space(sk), |
2260 | dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), | 2262 | mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), |
2261 | &req->rcv_wnd, | 2263 | &req->rcv_wnd, |
2262 | &req->window_clamp, | 2264 | &req->window_clamp, |
2263 | ireq->wscale_ok, | 2265 | ireq->wscale_ok, |
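tcp_make_synack() now computes the advertised MSS once, clamped to any user-configured value, and feeds the same number to both the initial-window calculation and option generation. A small sketch of the clamp, as a stand-alone function with placeholder parameter names:

/* Advertised MSS for a SYN-ACK: start from the route's ADVMSS metric and
 * lower it if the listener set a smaller TCP_MAXSEG (rx_opt.user_mss). */
static unsigned int synack_advertised_mss(unsigned int route_advmss,
                                          unsigned int user_mss)
{
    unsigned int mss = route_advmss;

    if (user_mss && user_mss < mss)
        mss = user_mss;
    return mss;
}

The tcp_connect_init() hunk below applies the same clamp to tp->advmss on the active-open side.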
@@ -2267,8 +2269,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2267 | 2269 | ||
2268 | memset(&opts, 0, sizeof(opts)); | 2270 | memset(&opts, 0, sizeof(opts)); |
2269 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2271 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
2270 | tcp_header_size = tcp_synack_options(sk, req, | 2272 | tcp_header_size = tcp_synack_options(sk, req, mss, |
2271 | dst_metric(dst, RTAX_ADVMSS), | ||
2272 | skb, &opts, &md5) + | 2273 | skb, &opts, &md5) + |
2273 | sizeof(struct tcphdr); | 2274 | sizeof(struct tcphdr); |
2274 | 2275 | ||
@@ -2280,7 +2281,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
2280 | th->syn = 1; | 2281 | th->syn = 1; |
2281 | th->ack = 1; | 2282 | th->ack = 1; |
2282 | TCP_ECN_make_synack(req, th); | 2283 | TCP_ECN_make_synack(req, th); |
2283 | th->source = inet_sk(sk)->sport; | 2284 | th->source = ireq->loc_port; |
2284 | th->dest = ireq->rmt_port; | 2285 | th->dest = ireq->rmt_port; |
2285 | /* Setting of flags is superfluous here for callers (and ECE is | 2286 | /* Setting of flags is superfluous here for callers (and ECE is |
2286 | * not even correctly set) | 2287 | * not even correctly set) |
@@ -2342,6 +2343,9 @@ static void tcp_connect_init(struct sock *sk) | |||
2342 | if (!tp->window_clamp) | 2343 | if (!tp->window_clamp) |
2343 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); | 2344 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); |
2344 | tp->advmss = dst_metric(dst, RTAX_ADVMSS); | 2345 | tp->advmss = dst_metric(dst, RTAX_ADVMSS); |
2346 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) | ||
2347 | tp->advmss = tp->rx_opt.user_mss; | ||
2348 | |||
2345 | tcp_initialize_rcv_mss(sk); | 2349 | tcp_initialize_rcv_mss(sk); |
2346 | 2350 | ||
2347 | tcp_select_initial_window(tcp_full_space(sk), | 2351 | tcp_select_initial_window(tcp_full_space(sk), |
@@ -2360,6 +2364,7 @@ static void tcp_connect_init(struct sock *sk) | |||
2360 | tcp_init_wl(tp, tp->write_seq, 0); | 2364 | tcp_init_wl(tp, tp->write_seq, 0); |
2361 | tp->snd_una = tp->write_seq; | 2365 | tp->snd_una = tp->write_seq; |
2362 | tp->snd_sml = tp->write_seq; | 2366 | tp->snd_sml = tp->write_seq; |
2367 | tp->snd_up = tp->write_seq; | ||
2363 | tp->rcv_nxt = 0; | 2368 | tp->rcv_nxt = 0; |
2364 | tp->rcv_wup = 0; | 2369 | tp->rcv_wup = 0; |
2365 | tp->copied_seq = 0; | 2370 | tp->copied_seq = 0; |
@@ -2569,8 +2574,7 @@ int tcp_write_wakeup(struct sock *sk) | |||
2569 | tcp_event_new_data_sent(sk, skb); | 2574 | tcp_event_new_data_sent(sk, skb); |
2570 | return err; | 2575 | return err; |
2571 | } else { | 2576 | } else { |
2572 | if (tp->urg_mode && | 2577 | if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) |
2573 | between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) | ||
2574 | tcp_xmit_probe_skb(sk, 1); | 2578 | tcp_xmit_probe_skb(sk, 1); |
2575 | return tcp_xmit_probe_skb(sk, 0); | 2579 | return tcp_xmit_probe_skb(sk, 0); |
2576 | } | 2580 | } |
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5ab6ba19c3ce..6b6dff1164b9 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -201,7 +201,7 @@ static void tcp_delack_timer(unsigned long data) | |||
201 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); | 201 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); |
202 | 202 | ||
203 | while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) | 203 | while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) |
204 | sk->sk_backlog_rcv(sk, skb); | 204 | sk_backlog_rcv(sk, skb); |
205 | 205 | ||
206 | tp->ucopy.memory = 0; | 206 | tp->ucopy.memory = 0; |
207 | } | 207 | } |
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 57e26fa66185..eacf4cfef146 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
@@ -108,9 +108,6 @@ | |||
108 | * Snmp MIB for the UDP layer | 108 | * Snmp MIB for the UDP layer |
109 | */ | 109 | */ |
110 | 110 | ||
111 | DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly; | ||
112 | EXPORT_SYMBOL(udp_stats_in6); | ||
113 | |||
114 | struct hlist_head udp_hash[UDP_HTABLE_SIZE]; | 111 | struct hlist_head udp_hash[UDP_HTABLE_SIZE]; |
115 | DEFINE_RWLOCK(udp_hash_lock); | 112 | DEFINE_RWLOCK(udp_hash_lock); |
116 | 113 | ||
@@ -125,14 +122,23 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min); | |||
125 | atomic_t udp_memory_allocated; | 122 | atomic_t udp_memory_allocated; |
126 | EXPORT_SYMBOL(udp_memory_allocated); | 123 | EXPORT_SYMBOL(udp_memory_allocated); |
127 | 124 | ||
128 | static inline int __udp_lib_lport_inuse(struct net *net, __u16 num, | 125 | static int udp_lib_lport_inuse(struct net *net, __u16 num, |
129 | const struct hlist_head udptable[]) | 126 | const struct hlist_head udptable[], |
127 | struct sock *sk, | ||
128 | int (*saddr_comp)(const struct sock *sk1, | ||
129 | const struct sock *sk2)) | ||
130 | { | 130 | { |
131 | struct sock *sk; | 131 | struct sock *sk2; |
132 | struct hlist_node *node; | 132 | struct hlist_node *node; |
133 | 133 | ||
134 | sk_for_each(sk, node, &udptable[udp_hashfn(net, num)]) | 134 | sk_for_each(sk2, node, &udptable[udp_hashfn(net, num)]) |
135 | if (net_eq(sock_net(sk), net) && sk->sk_hash == num) | 135 | if (net_eq(sock_net(sk2), net) && |
136 | sk2 != sk && | ||
137 | sk2->sk_hash == num && | ||
138 | (!sk2->sk_reuse || !sk->sk_reuse) && | ||
139 | (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if | ||
140 | || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && | ||
141 | (*saddr_comp)(sk, sk2)) | ||
136 | return 1; | 142 | return 1; |
137 | return 0; | 143 | return 0; |
138 | } | 144 | } |
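The reworked udp_lib_lport_inuse() now takes the binding socket plus a protocol-supplied address comparator, so a single helper implements the whole conflict rule: same port, not both SO_REUSEADDR, overlapping bound devices, and local addresses that compare equal under saddr_comp. A compact restatement of that predicate with stub fields (a sketch, not the kernel types):

#include <stdbool.h>

struct sk_stub {
    unsigned hash;          /* bound port */
    bool     reuse;         /* SO_REUSEADDR */
    int      bound_dev_if;  /* 0 = any device */
};

/* saddr_comp is protocol-specific; for IPv4 an INADDR_ANY binding
 * compares equal to any address on the same port. */
static bool ports_clash(const struct sk_stub *sk, const struct sk_stub *sk2,
                        unsigned snum,
                        bool (*saddr_comp)(const struct sk_stub *,
                                           const struct sk_stub *))
{
    return sk2 != sk &&
           sk2->hash == snum &&
           (!sk2->reuse || !sk->reuse) &&
           (!sk2->bound_dev_if || !sk->bound_dev_if ||
            sk2->bound_dev_if == sk->bound_dev_if) &&
           saddr_comp(sk, sk2);
}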
@@ -149,83 +155,37 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, | |||
149 | const struct sock *sk2 ) ) | 155 | const struct sock *sk2 ) ) |
150 | { | 156 | { |
151 | struct hlist_head *udptable = sk->sk_prot->h.udp_hash; | 157 | struct hlist_head *udptable = sk->sk_prot->h.udp_hash; |
152 | struct hlist_node *node; | ||
153 | struct hlist_head *head; | ||
154 | struct sock *sk2; | ||
155 | int error = 1; | 158 | int error = 1; |
156 | struct net *net = sock_net(sk); | 159 | struct net *net = sock_net(sk); |
157 | 160 | ||
158 | write_lock_bh(&udp_hash_lock); | 161 | write_lock_bh(&udp_hash_lock); |
159 | 162 | ||
160 | if (!snum) { | 163 | if (!snum) { |
161 | int i, low, high, remaining; | 164 | int low, high, remaining; |
162 | unsigned rover, best, best_size_so_far; | 165 | unsigned rand; |
166 | unsigned short first; | ||
163 | 167 | ||
164 | inet_get_local_port_range(&low, &high); | 168 | inet_get_local_port_range(&low, &high); |
165 | remaining = (high - low) + 1; | 169 | remaining = (high - low) + 1; |
166 | 170 | ||
167 | best_size_so_far = UINT_MAX; | 171 | rand = net_random(); |
168 | best = rover = net_random() % remaining + low; | 172 | snum = first = rand % remaining + low; |
169 | 173 | rand |= 1; | |
170 | /* 1st pass: look for empty (or shortest) hash chain */ | 174 | while (udp_lib_lport_inuse(net, snum, udptable, sk, |
171 | for (i = 0; i < UDP_HTABLE_SIZE; i++) { | 175 | saddr_comp)) { |
172 | int size = 0; | 176 | do { |
173 | 177 | snum = snum + rand; | |
174 | head = &udptable[udp_hashfn(net, rover)]; | 178 | } while (snum < low || snum > high); |
175 | if (hlist_empty(head)) | 179 | if (snum == first) |
176 | goto gotit; | 180 | goto fail; |
177 | |||
178 | sk_for_each(sk2, node, head) { | ||
179 | if (++size >= best_size_so_far) | ||
180 | goto next; | ||
181 | } | ||
182 | best_size_so_far = size; | ||
183 | best = rover; | ||
184 | next: | ||
185 | /* fold back if end of range */ | ||
186 | if (++rover > high) | ||
187 | rover = low + ((rover - low) | ||
188 | & (UDP_HTABLE_SIZE - 1)); | ||
189 | |||
190 | |||
191 | } | ||
192 | |||
193 | /* 2nd pass: find hole in shortest hash chain */ | ||
194 | rover = best; | ||
195 | for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) { | ||
196 | if (! __udp_lib_lport_inuse(net, rover, udptable)) | ||
197 | goto gotit; | ||
198 | rover += UDP_HTABLE_SIZE; | ||
199 | if (rover > high) | ||
200 | rover = low + ((rover - low) | ||
201 | & (UDP_HTABLE_SIZE - 1)); | ||
202 | } | 181 | } |
203 | 182 | } else if (udp_lib_lport_inuse(net, snum, udptable, sk, saddr_comp)) |
204 | |||
205 | /* All ports in use! */ | ||
206 | goto fail; | 183 | goto fail; |
207 | 184 | ||
208 | gotit: | ||
209 | snum = rover; | ||
210 | } else { | ||
211 | head = &udptable[udp_hashfn(net, snum)]; | ||
212 | |||
213 | sk_for_each(sk2, node, head) | ||
214 | if (sk2->sk_hash == snum && | ||
215 | sk2 != sk && | ||
216 | net_eq(sock_net(sk2), net) && | ||
217 | (!sk2->sk_reuse || !sk->sk_reuse) && | ||
218 | (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if | ||
219 | || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && | ||
220 | (*saddr_comp)(sk, sk2) ) | ||
221 | goto fail; | ||
222 | } | ||
223 | |||
224 | inet_sk(sk)->num = snum; | 185 | inet_sk(sk)->num = snum; |
225 | sk->sk_hash = snum; | 186 | sk->sk_hash = snum; |
226 | if (sk_unhashed(sk)) { | 187 | if (sk_unhashed(sk)) { |
227 | head = &udptable[udp_hashfn(net, snum)]; | 188 | sk_add_node(sk, &udptable[udp_hashfn(net, snum)]); |
228 | sk_add_node(sk, head); | ||
229 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 189 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
230 | } | 190 | } |
231 | error = 0; | 191 | error = 0; |
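The rewritten udp_lib_get_port() drops the old two-pass "shortest hash chain" search in favour of a randomized probe: start at a random port in the local range and step by a random odd stride. Because ports are 16-bit values, an odd stride cycles through the whole space, so every in-range port is visited exactly once before the walk returns to its starting point. A standalone model with a stub in-use check:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Stub: pretend everything below 40010 is already bound. */
static int port_in_use(uint16_t port)
{
    return port < 40010;
}

static int pick_port(uint16_t low, uint16_t high)
{
    unsigned int remaining = (unsigned int)(high - low) + 1;
    uint16_t stride = (uint16_t)rand() | 1;  /* odd => full 2^16 cycle */
    uint16_t first, snum;

    snum = first = rand() % remaining + low;
    while (port_in_use(snum)) {
        do {
            snum += stride;                  /* wraps modulo 2^16 */
        } while (snum < low || snum > high);
        if (snum == first)
            return -1;                       /* whole range is in use */
    }
    return snum;
}

int main(void)
{
    srand(1);
    printf("picked %d\n", pick_port(32768, 61000));
    return 0;
}

The kernel variant differs only in drawing its randomness from net_random() and in checking the hash tables via udp_lib_lport_inuse(); hitting the starting port again means every port in the range is taken.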
@@ -302,6 +262,28 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, | |||
302 | return result; | 262 | return result; |
303 | } | 263 | } |
304 | 264 | ||
265 | static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, | ||
266 | __be16 sport, __be16 dport, | ||
267 | struct hlist_head udptable[]) | ||
268 | { | ||
269 | struct sock *sk; | ||
270 | const struct iphdr *iph = ip_hdr(skb); | ||
271 | |||
272 | if (unlikely(sk = skb_steal_sock(skb))) | ||
273 | return sk; | ||
274 | else | ||
275 | return __udp4_lib_lookup(dev_net(skb->dst->dev), iph->saddr, sport, | ||
276 | iph->daddr, dport, inet_iif(skb), | ||
277 | udptable); | ||
278 | } | ||
279 | |||
280 | struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, | ||
281 | __be32 daddr, __be16 dport, int dif) | ||
282 | { | ||
283 | return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udp_hash); | ||
284 | } | ||
285 | EXPORT_SYMBOL_GPL(udp4_lib_lookup); | ||
286 | |||
305 | static inline struct sock *udp_v4_mcast_next(struct sock *sk, | 287 | static inline struct sock *udp_v4_mcast_next(struct sock *sk, |
306 | __be16 loc_port, __be32 loc_addr, | 288 | __be16 loc_port, __be32 loc_addr, |
307 | __be16 rmt_port, __be32 rmt_addr, | 289 | __be16 rmt_port, __be32 rmt_addr, |
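The new __udp4_lib_lookup_skb() prefers a socket already attached to the skb (via skb_steal_sock(), set for instance by transparent-proxy interception) and only falls back to the four-tuple hash lookup when none is present. A stub-typed sketch of that prefer-attached pattern (the names here are placeholders, not kernel types):

#include <stddef.h>

struct sock_stub { int id; };
struct sk_buff_stub {
    struct sock_stub *sk;   /* socket attached earlier, e.g. by TProxy */
};

/* Take over the reference to a socket someone attached to the skb. */
static struct sock_stub *steal_sock(struct sk_buff_stub *skb)
{
    struct sock_stub *sk = skb->sk;

    skb->sk = NULL;         /* the caller now owns the reference */
    return sk;
}

static struct sock_stub *lookup_from_skb(struct sk_buff_stub *skb,
                                         struct sock_stub *(*hash_lookup)(void))
{
    struct sock_stub *sk = steal_sock(skb);

    return sk ? sk : hash_lookup();  /* fall back to the hash tables */
}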
@@ -1201,8 +1183,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], | |||
1201 | return __udp4_lib_mcast_deliver(net, skb, uh, | 1183 | return __udp4_lib_mcast_deliver(net, skb, uh, |
1202 | saddr, daddr, udptable); | 1184 | saddr, daddr, udptable); |
1203 | 1185 | ||
1204 | sk = __udp4_lib_lookup(net, saddr, uh->source, daddr, | 1186 | sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); |
1205 | uh->dest, inet_iif(skb), udptable); | ||
1206 | 1187 | ||
1207 | if (sk != NULL) { | 1188 | if (sk != NULL) { |
1208 | int ret = udp_queue_rcv_skb(sk, skb); | 1189 | int ret = udp_queue_rcv_skb(sk, skb); |