Diffstat (limited to 'net/ipv4')
-rw-r--r-- net/ipv4/Kconfig | 2
-rw-r--r-- net/ipv4/Makefile | 1
-rw-r--r-- net/ipv4/af_inet.c | 2
-rw-r--r-- net/ipv4/igmp.c | 7
-rw-r--r-- net/ipv4/inet_connection_sock.c | 19
-rw-r--r-- net/ipv4/inet_diag.c | 6
-rw-r--r-- net/ipv4/inet_timewait_sock.c | 1
-rw-r--r-- net/ipv4/ip_gre.c | 487
-rw-r--r-- net/ipv4/ip_output.c | 4
-rw-r--r-- net/ipv4/ip_sockglue.c | 15
-rw-r--r-- net/ipv4/ipvs/Kconfig | 224
-rw-r--r-- net/ipv4/ipvs/Makefile | 34
-rw-r--r-- net/ipv4/ipvs/ip_vs_app.c | 622
-rw-r--r-- net/ipv4/ipvs/ip_vs_conn.c | 1023
-rw-r--r-- net/ipv4/ipvs/ip_vs_core.c | 1125
-rw-r--r-- net/ipv4/ipvs/ip_vs_ctl.c | 2373
-rw-r--r-- net/ipv4/ipvs/ip_vs_dh.c | 258
-rw-r--r-- net/ipv4/ipvs/ip_vs_est.c | 162
-rw-r--r-- net/ipv4/ipvs/ip_vs_ftp.c | 393
-rw-r--r-- net/ipv4/ipvs/ip_vs_lblc.c | 571
-rw-r--r-- net/ipv4/ipvs/ip_vs_lblcr.c | 760
-rw-r--r-- net/ipv4/ipvs/ip_vs_lc.c | 121
-rw-r--r-- net/ipv4/ipvs/ip_vs_nq.c | 159
-rw-r--r-- net/ipv4/ipvs/ip_vs_proto.c | 233
-rw-r--r-- net/ipv4/ipvs/ip_vs_proto_ah.c | 178
-rw-r--r-- net/ipv4/ipvs/ip_vs_proto_esp.c | 176
-rw-r--r-- net/ipv4/ipvs/ip_vs_proto_tcp.c | 614
-rw-r--r-- net/ipv4/ipvs/ip_vs_proto_udp.c | 428
-rw-r--r-- net/ipv4/ipvs/ip_vs_rr.c | 116
-rw-r--r-- net/ipv4/ipvs/ip_vs_sched.c | 251
-rw-r--r-- net/ipv4/ipvs/ip_vs_sed.c | 161
-rw-r--r-- net/ipv4/ipvs/ip_vs_sh.c | 255
-rw-r--r-- net/ipv4/ipvs/ip_vs_sync.c | 930
-rw-r--r-- net/ipv4/ipvs/ip_vs_wlc.c | 149
-rw-r--r-- net/ipv4/ipvs/ip_vs_wrr.c | 234
-rw-r--r-- net/ipv4/ipvs/ip_vs_xmit.c | 559
-rw-r--r-- net/ipv4/netfilter.c | 10
-rw-r--r-- net/ipv4/netfilter/Kconfig | 128
-rw-r--r-- net/ipv4/netfilter/Makefile | 4
-rw-r--r-- net/ipv4/netfilter/arp_tables.c | 116
-rw-r--r-- net/ipv4/netfilter/arpt_mangle.c | 15
-rw-r--r-- net/ipv4/netfilter/arptable_filter.c | 8
-rw-r--r-- net/ipv4/netfilter/ip_tables.c | 177
-rw-r--r-- net/ipv4/netfilter/ipt_CLUSTERIP.c | 29
-rw-r--r-- net/ipv4/netfilter/ipt_ECN.c | 17
-rw-r--r-- net/ipv4/netfilter/ipt_LOG.c | 21
-rw-r--r-- net/ipv4/netfilter/ipt_MASQUERADE.c | 30
-rw-r--r-- net/ipv4/netfilter/ipt_NETMAP.c | 26
-rw-r--r-- net/ipv4/netfilter/ipt_REDIRECT.c | 21
-rw-r--r-- net/ipv4/netfilter/ipt_REJECT.c | 19
-rw-r--r-- net/ipv4/netfilter/ipt_TTL.c | 15
-rw-r--r-- net/ipv4/netfilter/ipt_ULOG.c | 23
-rw-r--r-- net/ipv4/netfilter/ipt_addrtype.c | 35
-rw-r--r-- net/ipv4/netfilter/ipt_ah.c | 24
-rw-r--r-- net/ipv4/netfilter/ipt_ecn.c | 20
-rw-r--r-- net/ipv4/netfilter/ipt_recent.c | 501
-rw-r--r-- net/ipv4/netfilter/ipt_ttl.c | 9
-rw-r--r-- net/ipv4/netfilter/iptable_filter.c | 6
-rw-r--r-- net/ipv4/netfilter/iptable_mangle.c | 10
-rw-r--r-- net/ipv4/netfilter/iptable_raw.c | 4
-rw-r--r-- net/ipv4/netfilter/iptable_security.c | 6
-rw-r--r-- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 68
-rw-r--r-- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 73
-rw-r--r-- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 22
-rw-r--r-- net/ipv4/netfilter/nf_defrag_ipv4.c | 96
-rw-r--r-- net/ipv4/netfilter/nf_nat_core.c | 72
-rw-r--r-- net/ipv4/netfilter/nf_nat_helper.c | 3
-rw-r--r-- net/ipv4/netfilter/nf_nat_pptp.c | 3
-rw-r--r-- net/ipv4/netfilter/nf_nat_rule.c | 92
-rw-r--r-- net/ipv4/route.c | 34
-rw-r--r-- net/ipv4/syncookies.c | 3
-rw-r--r-- net/ipv4/sysctl_net_ipv4.c | 23
-rw-r--r-- net/ipv4/tcp.c | 18
-rw-r--r-- net/ipv4/tcp_input.c | 329
-rw-r--r-- net/ipv4/tcp_ipv4.c | 51
-rw-r--r-- net/ipv4/tcp_minisocks.c | 1
-rw-r--r-- net/ipv4/tcp_output.c | 222
-rw-r--r-- net/ipv4/tcp_timer.c | 2
-rw-r--r-- net/ipv4/udp.c | 123
79 files changed, 1507 insertions(+), 13655 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 591ea23639ca..691268f3a359 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -630,5 +630,3 @@ config TCP_MD5SIG
 
 	  If unsure, say N.
 
-source "net/ipv4/ipvs/Kconfig"
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ad40ef3f9ebc..80ff87ce43aa 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -33,7 +33,6 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
 obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
 obj-$(CONFIG_IP_PNP) += ipconfig.o
 obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
-obj-$(CONFIG_IP_VS) += ipvs/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8a3ac1fa71a9..1fbff5fa4241 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -469,7 +469,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	 */
 	err = -EADDRNOTAVAIL;
 	if (!sysctl_ip_nonlocal_bind &&
-	    !inet->freebind &&
+	    !(inet->freebind || inet->transparent) &&
 	    addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
 	    chk_addr_ret != RTN_LOCAL &&
 	    chk_addr_ret != RTN_MULTICAST &&
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index f70fac612596..7f9e337e3908 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1234,6 +1234,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	write_lock_bh(&in_dev->mc_list_lock);
 	im->next=in_dev->mc_list;
 	in_dev->mc_list=im;
+	in_dev->mc_count++;
 	write_unlock_bh(&in_dev->mc_list_lock);
 #ifdef CONFIG_IP_MULTICAST
 	igmpv3_del_delrec(in_dev, im->multiaddr);
@@ -1282,6 +1283,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
 		if (--i->users == 0) {
 			write_lock_bh(&in_dev->mc_list_lock);
 			*ip = i->next;
+			in_dev->mc_count--;
 			write_unlock_bh(&in_dev->mc_list_lock);
 			igmp_group_dropped(i);
 
@@ -1330,6 +1332,7 @@ void ip_mc_init_dev(struct in_device *in_dev)
 	setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
 			(unsigned long)in_dev);
 	in_dev->mr_ifc_count = 0;
+	in_dev->mc_count = 0;
 	setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
 			(unsigned long)in_dev);
 	in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
@@ -1369,8 +1372,8 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
 	write_lock_bh(&in_dev->mc_list_lock);
 	while ((i = in_dev->mc_list) != NULL) {
 		in_dev->mc_list = i->next;
+		in_dev->mc_count--;
 		write_unlock_bh(&in_dev->mc_list_lock);
-
 		igmp_group_dropped(i);
 		ip_ma_put(i);
 
@@ -2383,7 +2386,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
 
 	if (state->in_dev->mc_list == im) {
 		seq_printf(seq, "%d\t%-10s: %5d %7s\n",
-			   state->dev->ifindex, state->dev->name, state->dev->mc_count, querier);
+			   state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
 	}
 
 	seq_printf(seq,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 0c1ae68ee84b..bd1278a2d828 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -30,20 +30,22 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
 #endif
 
 /*
- * This array holds the first and last local port number.
+ * This struct holds the first and last local port number.
  */
-int sysctl_local_port_range[2] = { 32768, 61000 };
-DEFINE_SEQLOCK(sysctl_port_range_lock);
+struct local_ports sysctl_local_ports __read_mostly = {
+	.lock = SEQLOCK_UNLOCKED,
+	.range = { 32768, 61000 },
+};
 
 void inet_get_local_port_range(int *low, int *high)
 {
 	unsigned seq;
 	do {
-		seq = read_seqbegin(&sysctl_port_range_lock);
+		seq = read_seqbegin(&sysctl_local_ports.lock);
 
-		*low = sysctl_local_port_range[0];
-		*high = sysctl_local_port_range[1];
-	} while (read_seqretry(&sysctl_port_range_lock, seq));
+		*low = sysctl_local_ports.range[0];
+		*high = sysctl_local_ports.range[1];
+	} while (read_seqretry(&sysctl_local_ports.lock, seq));
 }
 EXPORT_SYMBOL(inet_get_local_port_range);
 
@@ -335,6 +337,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
 					    .saddr = ireq->loc_addr,
 					    .tos = RT_CONN_FLAGS(sk) } },
 			    .proto = sk->sk_protocol,
+			    .flags = inet_sk_flowi_flags(sk),
 			    .uli_u = { .ports =
 				       { .sport = inet_sk(sk)->sport,
 					 .dport = ireq->rmt_port } } };
@@ -515,6 +518,8 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
 		newicsk->icsk_bind_hash = NULL;
 
 		inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
+		inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port);
+		inet_sk(newsk)->sport = inet_rsk(req)->loc_port;
 		newsk->sk_write_space = sk_stream_write_space;
 
 		newicsk->icsk_retransmits = 0;
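
For context, a minimal kernel-side sketch (not part of this diff) of how a writer, for example the ipv4_local_port_range sysctl handler that is not shown in this hunk, would be expected to update the new sysctl_local_ports structure under its seqlock; the set_local_port_range() name is assumed here for illustration, and struct local_ports is assumed to be declared alongside inet_get_local_port_range() in net/ip.h:

#include <linux/seqlock.h>
#include <net/ip.h>		/* struct local_ports, inet_get_local_port_range() (assumed) */

/* Hypothetical writer: publishes a new port range without tearing. */
static void set_local_port_range(int low, int high)
{
	write_seqlock(&sysctl_local_ports.lock);
	sysctl_local_ports.range[0] = low;
	sysctl_local_ports.range[1] = high;
	write_sequnlock(&sysctl_local_ports.lock);
}

/* Readers keep using the unchanged helper and retry if a write raced them. */
static int local_port_count(void)
{
	int low, high;

	inet_get_local_port_range(&low, &high);
	return high - low + 1;
}

The point of folding the array and the seqlock into one structure is that readers like the sketch above can never observe a half-updated pair.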
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c10036e7a463..89cb047ab314 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -782,11 +782,15 @@ skip_listen_ht:
 		struct sock *sk;
 		struct hlist_node *node;
 
+		num = 0;
+
+		if (hlist_empty(&head->chain) && hlist_empty(&head->twchain))
+			continue;
+
 		if (i > s_i)
 			s_num = 0;
 
 		read_lock_bh(lock);
-		num = 0;
 		sk_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
 
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 743f011b9a84..1c5fd38f8824 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -126,6 +126,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_reuse	    = sk->sk_reuse;
 		tw->tw_hash	    = sk->sk_hash;
 		tw->tw_ipv6only	    = 0;
+		tw->tw_transparent  = inet->transparent;
 		tw->tw_prot	    = sk->sk_prot_creator;
 		twsk_net_set(tw, hold_net(sock_net(sk)));
 		atomic_set(&tw->tw_refcnt, 1);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2a61158ea722..85c487b8572b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -27,6 +27,7 @@
27#include <linux/inetdevice.h> 27#include <linux/inetdevice.h>
28#include <linux/igmp.h> 28#include <linux/igmp.h>
29#include <linux/netfilter_ipv4.h> 29#include <linux/netfilter_ipv4.h>
30#include <linux/etherdevice.h>
30#include <linux/if_ether.h> 31#include <linux/if_ether.h>
31 32
32#include <net/sock.h> 33#include <net/sock.h>
@@ -41,6 +42,7 @@
41#include <net/xfrm.h> 42#include <net/xfrm.h>
42#include <net/net_namespace.h> 43#include <net/net_namespace.h>
43#include <net/netns/generic.h> 44#include <net/netns/generic.h>
45#include <net/rtnetlink.h>
44 46
45#ifdef CONFIG_IPV6 47#ifdef CONFIG_IPV6
46#include <net/ipv6.h> 48#include <net/ipv6.h>
@@ -117,8 +119,10 @@
117 Alexey Kuznetsov. 119 Alexey Kuznetsov.
118 */ 120 */
119 121
122static struct rtnl_link_ops ipgre_link_ops __read_mostly;
120static int ipgre_tunnel_init(struct net_device *dev); 123static int ipgre_tunnel_init(struct net_device *dev);
121static void ipgre_tunnel_setup(struct net_device *dev); 124static void ipgre_tunnel_setup(struct net_device *dev);
125static int ipgre_tunnel_bind_dev(struct net_device *dev);
122 126
123/* Fallback tunnel: no source, no destination, no key, no options */ 127/* Fallback tunnel: no source, no destination, no key, no options */
124 128
@@ -163,38 +167,64 @@ static DEFINE_RWLOCK(ipgre_lock);
163/* Given src, dst and key, find appropriate for input tunnel. */ 167/* Given src, dst and key, find appropriate for input tunnel. */
164 168
165static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net, 169static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
166 __be32 remote, __be32 local, __be32 key) 170 __be32 remote, __be32 local,
171 __be32 key, __be16 gre_proto)
167{ 172{
168 unsigned h0 = HASH(remote); 173 unsigned h0 = HASH(remote);
169 unsigned h1 = HASH(key); 174 unsigned h1 = HASH(key);
170 struct ip_tunnel *t; 175 struct ip_tunnel *t;
176 struct ip_tunnel *t2 = NULL;
171 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 177 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
178 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
179 ARPHRD_ETHER : ARPHRD_IPGRE;
172 180
173 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) { 181 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
174 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { 182 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
175 if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) 183 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
176 return t; 184 if (t->dev->type == dev_type)
185 return t;
186 if (t->dev->type == ARPHRD_IPGRE && !t2)
187 t2 = t;
188 }
177 } 189 }
178 } 190 }
191
179 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) { 192 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
180 if (remote == t->parms.iph.daddr) { 193 if (remote == t->parms.iph.daddr) {
181 if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) 194 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
182 return t; 195 if (t->dev->type == dev_type)
196 return t;
197 if (t->dev->type == ARPHRD_IPGRE && !t2)
198 t2 = t;
199 }
183 } 200 }
184 } 201 }
202
185 for (t = ign->tunnels_l[h1]; t; t = t->next) { 203 for (t = ign->tunnels_l[h1]; t; t = t->next) {
186 if (local == t->parms.iph.saddr || 204 if (local == t->parms.iph.saddr ||
187 (local == t->parms.iph.daddr && 205 (local == t->parms.iph.daddr &&
188 ipv4_is_multicast(local))) { 206 ipv4_is_multicast(local))) {
189 if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) 207 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
190 return t; 208 if (t->dev->type == dev_type)
209 return t;
210 if (t->dev->type == ARPHRD_IPGRE && !t2)
211 t2 = t;
212 }
191 } 213 }
192 } 214 }
215
193 for (t = ign->tunnels_wc[h1]; t; t = t->next) { 216 for (t = ign->tunnels_wc[h1]; t; t = t->next) {
194 if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) 217 if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
195 return t; 218 if (t->dev->type == dev_type)
219 return t;
220 if (t->dev->type == ARPHRD_IPGRE && !t2)
221 t2 = t;
222 }
196 } 223 }
197 224
225 if (t2)
226 return t2;
227
198 if (ign->fb_tunnel_dev->flags&IFF_UP) 228 if (ign->fb_tunnel_dev->flags&IFF_UP)
199 return netdev_priv(ign->fb_tunnel_dev); 229 return netdev_priv(ign->fb_tunnel_dev);
200 return NULL; 230 return NULL;
@@ -249,25 +279,37 @@ static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
249 } 279 }
250} 280}
251 281
252static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, 282static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
253 struct ip_tunnel_parm *parms, int create) 283 struct ip_tunnel_parm *parms,
284 int type)
254{ 285{
255 __be32 remote = parms->iph.daddr; 286 __be32 remote = parms->iph.daddr;
256 __be32 local = parms->iph.saddr; 287 __be32 local = parms->iph.saddr;
257 __be32 key = parms->i_key; 288 __be32 key = parms->i_key;
258 struct ip_tunnel *t, **tp, *nt; 289 struct ip_tunnel *t, **tp;
290 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
291
292 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
293 if (local == t->parms.iph.saddr &&
294 remote == t->parms.iph.daddr &&
295 key == t->parms.i_key &&
296 type == t->dev->type)
297 break;
298
299 return t;
300}
301
302static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
303 struct ip_tunnel_parm *parms, int create)
304{
305 struct ip_tunnel *t, *nt;
259 struct net_device *dev; 306 struct net_device *dev;
260 char name[IFNAMSIZ]; 307 char name[IFNAMSIZ];
261 struct ipgre_net *ign = net_generic(net, ipgre_net_id); 308 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
262 309
263 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) { 310 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
264 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { 311 if (t || !create)
265 if (key == t->parms.i_key) 312 return t;
266 return t;
267 }
268 }
269 if (!create)
270 return NULL;
271 313
272 if (parms->name[0]) 314 if (parms->name[0])
273 strlcpy(name, parms->name, IFNAMSIZ); 315 strlcpy(name, parms->name, IFNAMSIZ);
@@ -285,9 +327,11 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
285 goto failed_free; 327 goto failed_free;
286 } 328 }
287 329
288 dev->init = ipgre_tunnel_init;
289 nt = netdev_priv(dev); 330 nt = netdev_priv(dev);
290 nt->parms = *parms; 331 nt->parms = *parms;
332 dev->rtnl_link_ops = &ipgre_link_ops;
333
334 dev->mtu = ipgre_tunnel_bind_dev(dev);
291 335
292 if (register_netdevice(dev) < 0) 336 if (register_netdevice(dev) < 0)
293 goto failed_free; 337 goto failed_free;
@@ -380,8 +424,9 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
380 424
381 read_lock(&ipgre_lock); 425 read_lock(&ipgre_lock);
382 t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr, 426 t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
383 (flags&GRE_KEY) ? 427 flags & GRE_KEY ?
384 *(((__be32*)p) + (grehlen>>2) - 1) : 0); 428 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
429 p[1]);
385 if (t == NULL || t->parms.iph.daddr == 0 || 430 if (t == NULL || t->parms.iph.daddr == 0 ||
386 ipv4_is_multicast(t->parms.iph.daddr)) 431 ipv4_is_multicast(t->parms.iph.daddr))
387 goto out; 432 goto out;
@@ -431,6 +476,8 @@ static int ipgre_rcv(struct sk_buff *skb)
431 u32 seqno = 0; 476 u32 seqno = 0;
432 struct ip_tunnel *tunnel; 477 struct ip_tunnel *tunnel;
433 int offset = 4; 478 int offset = 4;
479 __be16 gre_proto;
480 unsigned int len;
434 481
435 if (!pskb_may_pull(skb, 16)) 482 if (!pskb_may_pull(skb, 16))
436 goto drop_nolock; 483 goto drop_nolock;
@@ -470,20 +517,22 @@ static int ipgre_rcv(struct sk_buff *skb)
470 } 517 }
471 } 518 }
472 519
520 gre_proto = *(__be16 *)(h + 2);
521
473 read_lock(&ipgre_lock); 522 read_lock(&ipgre_lock);
474 if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev), 523 if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
475 iph->saddr, iph->daddr, key)) != NULL) { 524 iph->saddr, iph->daddr, key,
525 gre_proto))) {
476 struct net_device_stats *stats = &tunnel->dev->stats; 526 struct net_device_stats *stats = &tunnel->dev->stats;
477 527
478 secpath_reset(skb); 528 secpath_reset(skb);
479 529
480 skb->protocol = *(__be16*)(h + 2); 530 skb->protocol = gre_proto;
481 /* WCCP version 1 and 2 protocol decoding. 531 /* WCCP version 1 and 2 protocol decoding.
482 * - Change protocol to IP 532 * - Change protocol to IP
483 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header 533 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
484 */ 534 */
485 if (flags == 0 && 535 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
486 skb->protocol == htons(ETH_P_WCCP)) {
487 skb->protocol = htons(ETH_P_IP); 536 skb->protocol = htons(ETH_P_IP);
488 if ((*(h + offset) & 0xF0) != 0x40) 537 if ((*(h + offset) & 0xF0) != 0x40)
489 offset += 4; 538 offset += 4;
@@ -491,7 +540,6 @@ static int ipgre_rcv(struct sk_buff *skb)
491 540
492 skb->mac_header = skb->network_header; 541 skb->mac_header = skb->network_header;
493 __pskb_pull(skb, offset); 542 __pskb_pull(skb, offset);
494 skb_reset_network_header(skb);
495 skb_postpull_rcsum(skb, skb_transport_header(skb), offset); 543 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
496 skb->pkt_type = PACKET_HOST; 544 skb->pkt_type = PACKET_HOST;
497#ifdef CONFIG_NET_IPGRE_BROADCAST 545#ifdef CONFIG_NET_IPGRE_BROADCAST
@@ -519,13 +567,32 @@ static int ipgre_rcv(struct sk_buff *skb)
519 } 567 }
520 tunnel->i_seqno = seqno + 1; 568 tunnel->i_seqno = seqno + 1;
521 } 569 }
570
571 len = skb->len;
572
573 /* Warning: All skb pointers will be invalidated! */
574 if (tunnel->dev->type == ARPHRD_ETHER) {
575 if (!pskb_may_pull(skb, ETH_HLEN)) {
576 stats->rx_length_errors++;
577 stats->rx_errors++;
578 goto drop;
579 }
580
581 iph = ip_hdr(skb);
582 skb->protocol = eth_type_trans(skb, tunnel->dev);
583 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
584 }
585
522 stats->rx_packets++; 586 stats->rx_packets++;
523 stats->rx_bytes += skb->len; 587 stats->rx_bytes += len;
524 skb->dev = tunnel->dev; 588 skb->dev = tunnel->dev;
525 dst_release(skb->dst); 589 dst_release(skb->dst);
526 skb->dst = NULL; 590 skb->dst = NULL;
527 nf_reset(skb); 591 nf_reset(skb);
592
593 skb_reset_network_header(skb);
528 ipgre_ecn_decapsulate(iph, skb); 594 ipgre_ecn_decapsulate(iph, skb);
595
529 netif_rx(skb); 596 netif_rx(skb);
530 read_unlock(&ipgre_lock); 597 read_unlock(&ipgre_lock);
531 return(0); 598 return(0);
@@ -560,7 +627,10 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
560 goto tx_error; 627 goto tx_error;
561 } 628 }
562 629
563 if (dev->header_ops) { 630 if (dev->type == ARPHRD_ETHER)
631 IPCB(skb)->flags = 0;
632
633 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
564 gre_hlen = 0; 634 gre_hlen = 0;
565 tiph = (struct iphdr*)skb->data; 635 tiph = (struct iphdr*)skb->data;
566 } else { 636 } else {
@@ -637,7 +707,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
637 707
638 df = tiph->frag_off; 708 df = tiph->frag_off;
639 if (df) 709 if (df)
640 mtu = dst_mtu(&rt->u.dst) - tunnel->hlen; 710 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
641 else 711 else
642 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; 712 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
643 713
@@ -703,7 +773,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
703 old_iph = ip_hdr(skb); 773 old_iph = ip_hdr(skb);
704 } 774 }
705 775
706 skb->transport_header = skb->network_header; 776 skb_reset_transport_header(skb);
707 skb_push(skb, gre_hlen); 777 skb_push(skb, gre_hlen);
708 skb_reset_network_header(skb); 778 skb_reset_network_header(skb);
709 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 779 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -736,8 +806,9 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
736 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); 806 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
737 } 807 }
738 808
739 ((__be16*)(iph+1))[0] = tunnel->parms.o_flags; 809 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
740 ((__be16*)(iph+1))[1] = skb->protocol; 810 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
811 htons(ETH_P_TEB) : skb->protocol;
741 812
742 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { 813 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
743 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4); 814 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
@@ -773,7 +844,7 @@ tx_error:
773 return 0; 844 return 0;
774} 845}
775 846
776static void ipgre_tunnel_bind_dev(struct net_device *dev) 847static int ipgre_tunnel_bind_dev(struct net_device *dev)
777{ 848{
778 struct net_device *tdev = NULL; 849 struct net_device *tdev = NULL;
779 struct ip_tunnel *tunnel; 850 struct ip_tunnel *tunnel;
@@ -785,7 +856,7 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev)
785 tunnel = netdev_priv(dev); 856 tunnel = netdev_priv(dev);
786 iph = &tunnel->parms.iph; 857 iph = &tunnel->parms.iph;
787 858
788 /* Guess output device to choose reasonable mtu and hard_header_len */ 859 /* Guess output device to choose reasonable mtu and needed_headroom */
789 860
790 if (iph->daddr) { 861 if (iph->daddr) {
791 struct flowi fl = { .oif = tunnel->parms.link, 862 struct flowi fl = { .oif = tunnel->parms.link,
@@ -799,14 +870,16 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev)
799 tdev = rt->u.dst.dev; 870 tdev = rt->u.dst.dev;
800 ip_rt_put(rt); 871 ip_rt_put(rt);
801 } 872 }
802 dev->flags |= IFF_POINTOPOINT; 873
874 if (dev->type != ARPHRD_ETHER)
875 dev->flags |= IFF_POINTOPOINT;
803 } 876 }
804 877
805 if (!tdev && tunnel->parms.link) 878 if (!tdev && tunnel->parms.link)
806 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); 879 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
807 880
808 if (tdev) { 881 if (tdev) {
809 hlen = tdev->hard_header_len; 882 hlen = tdev->hard_header_len + tdev->needed_headroom;
810 mtu = tdev->mtu; 883 mtu = tdev->mtu;
811 } 884 }
812 dev->iflink = tunnel->parms.link; 885 dev->iflink = tunnel->parms.link;
@@ -820,10 +893,15 @@ static void ipgre_tunnel_bind_dev(struct net_device *dev)
820 if (tunnel->parms.o_flags&GRE_SEQ) 893 if (tunnel->parms.o_flags&GRE_SEQ)
821 addend += 4; 894 addend += 4;
822 } 895 }
823 dev->hard_header_len = hlen + addend; 896 dev->needed_headroom = addend + hlen;
824 dev->mtu = mtu - addend; 897 mtu -= dev->hard_header_len - addend;
898
899 if (mtu < 68)
900 mtu = 68;
901
825 tunnel->hlen = addend; 902 tunnel->hlen = addend;
826 903
904 return mtu;
827} 905}
828 906
829static int 907static int
@@ -917,7 +995,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
917 t->parms.iph.frag_off = p.iph.frag_off; 995 t->parms.iph.frag_off = p.iph.frag_off;
918 if (t->parms.link != p.link) { 996 if (t->parms.link != p.link) {
919 t->parms.link = p.link; 997 t->parms.link = p.link;
920 ipgre_tunnel_bind_dev(dev); 998 dev->mtu = ipgre_tunnel_bind_dev(dev);
921 netdev_state_change(dev); 999 netdev_state_change(dev);
922 } 1000 }
923 } 1001 }
@@ -959,7 +1037,8 @@ done:
959static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) 1037static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
960{ 1038{
961 struct ip_tunnel *tunnel = netdev_priv(dev); 1039 struct ip_tunnel *tunnel = netdev_priv(dev);
962 if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen) 1040 if (new_mtu < 68 ||
1041 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
963 return -EINVAL; 1042 return -EINVAL;
964 dev->mtu = new_mtu; 1043 dev->mtu = new_mtu;
965 return 0; 1044 return 0;
@@ -1078,6 +1157,7 @@ static int ipgre_close(struct net_device *dev)
1078 1157
1079static void ipgre_tunnel_setup(struct net_device *dev) 1158static void ipgre_tunnel_setup(struct net_device *dev)
1080{ 1159{
1160 dev->init = ipgre_tunnel_init;
1081 dev->uninit = ipgre_tunnel_uninit; 1161 dev->uninit = ipgre_tunnel_uninit;
1082 dev->destructor = free_netdev; 1162 dev->destructor = free_netdev;
1083 dev->hard_start_xmit = ipgre_tunnel_xmit; 1163 dev->hard_start_xmit = ipgre_tunnel_xmit;
@@ -1085,7 +1165,7 @@ static void ipgre_tunnel_setup(struct net_device *dev)
1085 dev->change_mtu = ipgre_tunnel_change_mtu; 1165 dev->change_mtu = ipgre_tunnel_change_mtu;
1086 1166
1087 dev->type = ARPHRD_IPGRE; 1167 dev->type = ARPHRD_IPGRE;
1088 dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4; 1168 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1089 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; 1169 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1090 dev->flags = IFF_NOARP; 1170 dev->flags = IFF_NOARP;
1091 dev->iflink = 0; 1171 dev->iflink = 0;
@@ -1107,8 +1187,6 @@ static int ipgre_tunnel_init(struct net_device *dev)
1107 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); 1187 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1108 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); 1188 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1109 1189
1110 ipgre_tunnel_bind_dev(dev);
1111
1112 if (iph->daddr) { 1190 if (iph->daddr) {
1113#ifdef CONFIG_NET_IPGRE_BROADCAST 1191#ifdef CONFIG_NET_IPGRE_BROADCAST
1114 if (ipv4_is_multicast(iph->daddr)) { 1192 if (ipv4_is_multicast(iph->daddr)) {
@@ -1189,6 +1267,7 @@ static int ipgre_init_net(struct net *net)
1189 1267
1190 ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init; 1268 ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1191 dev_net_set(ign->fb_tunnel_dev, net); 1269 dev_net_set(ign->fb_tunnel_dev, net);
1270 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1192 1271
1193 if ((err = register_netdev(ign->fb_tunnel_dev))) 1272 if ((err = register_netdev(ign->fb_tunnel_dev)))
1194 goto err_reg_dev; 1273 goto err_reg_dev;
@@ -1221,6 +1300,298 @@ static struct pernet_operations ipgre_net_ops = {
1221 .exit = ipgre_exit_net, 1300 .exit = ipgre_exit_net,
1222}; 1301};
1223 1302
1303static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1304{
1305 __be16 flags;
1306
1307 if (!data)
1308 return 0;
1309
1310 flags = 0;
1311 if (data[IFLA_GRE_IFLAGS])
1312 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1313 if (data[IFLA_GRE_OFLAGS])
1314 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1315 if (flags & (GRE_VERSION|GRE_ROUTING))
1316 return -EINVAL;
1317
1318 return 0;
1319}
1320
1321static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1322{
1323 __be32 daddr;
1324
1325 if (tb[IFLA_ADDRESS]) {
1326 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1327 return -EINVAL;
1328 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1329 return -EADDRNOTAVAIL;
1330 }
1331
1332 if (!data)
1333 goto out;
1334
1335 if (data[IFLA_GRE_REMOTE]) {
1336 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1337 if (!daddr)
1338 return -EINVAL;
1339 }
1340
1341out:
1342 return ipgre_tunnel_validate(tb, data);
1343}
1344
1345static void ipgre_netlink_parms(struct nlattr *data[],
1346 struct ip_tunnel_parm *parms)
1347{
1348 memset(parms, 0, sizeof(*parms));
1349
1350 parms->iph.protocol = IPPROTO_GRE;
1351
1352 if (!data)
1353 return;
1354
1355 if (data[IFLA_GRE_LINK])
1356 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1357
1358 if (data[IFLA_GRE_IFLAGS])
1359 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1360
1361 if (data[IFLA_GRE_OFLAGS])
1362 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1363
1364 if (data[IFLA_GRE_IKEY])
1365 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1366
1367 if (data[IFLA_GRE_OKEY])
1368 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1369
1370 if (data[IFLA_GRE_LOCAL])
1371 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1372
1373 if (data[IFLA_GRE_REMOTE])
1374 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1375
1376 if (data[IFLA_GRE_TTL])
1377 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1378
1379 if (data[IFLA_GRE_TOS])
1380 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1381
1382 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1383 parms->iph.frag_off = htons(IP_DF);
1384}
1385
1386static int ipgre_tap_init(struct net_device *dev)
1387{
1388 struct ip_tunnel *tunnel;
1389
1390 tunnel = netdev_priv(dev);
1391
1392 tunnel->dev = dev;
1393 strcpy(tunnel->parms.name, dev->name);
1394
1395 ipgre_tunnel_bind_dev(dev);
1396
1397 return 0;
1398}
1399
1400static void ipgre_tap_setup(struct net_device *dev)
1401{
1402
1403 ether_setup(dev);
1404
1405 dev->init = ipgre_tap_init;
1406 dev->uninit = ipgre_tunnel_uninit;
1407 dev->destructor = free_netdev;
1408 dev->hard_start_xmit = ipgre_tunnel_xmit;
1409 dev->change_mtu = ipgre_tunnel_change_mtu;
1410
1411 dev->iflink = 0;
1412 dev->features |= NETIF_F_NETNS_LOCAL;
1413}
1414
1415static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1416 struct nlattr *data[])
1417{
1418 struct ip_tunnel *nt;
1419 struct net *net = dev_net(dev);
1420 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1421 int mtu;
1422 int err;
1423
1424 nt = netdev_priv(dev);
1425 ipgre_netlink_parms(data, &nt->parms);
1426
1427 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1428 return -EEXIST;
1429
1430 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1431 random_ether_addr(dev->dev_addr);
1432
1433 mtu = ipgre_tunnel_bind_dev(dev);
1434 if (!tb[IFLA_MTU])
1435 dev->mtu = mtu;
1436
1437 err = register_netdevice(dev);
1438 if (err)
1439 goto out;
1440
1441 dev_hold(dev);
1442 ipgre_tunnel_link(ign, nt);
1443
1444out:
1445 return err;
1446}
1447
1448static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1449 struct nlattr *data[])
1450{
1451 struct ip_tunnel *t, *nt;
1452 struct net *net = dev_net(dev);
1453 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1454 struct ip_tunnel_parm p;
1455 int mtu;
1456
1457 if (dev == ign->fb_tunnel_dev)
1458 return -EINVAL;
1459
1460 nt = netdev_priv(dev);
1461 ipgre_netlink_parms(data, &p);
1462
1463 t = ipgre_tunnel_locate(net, &p, 0);
1464
1465 if (t) {
1466 if (t->dev != dev)
1467 return -EEXIST;
1468 } else {
1469 unsigned nflags = 0;
1470
1471 t = nt;
1472
1473 if (ipv4_is_multicast(p.iph.daddr))
1474 nflags = IFF_BROADCAST;
1475 else if (p.iph.daddr)
1476 nflags = IFF_POINTOPOINT;
1477
1478 if ((dev->flags ^ nflags) &
1479 (IFF_POINTOPOINT | IFF_BROADCAST))
1480 return -EINVAL;
1481
1482 ipgre_tunnel_unlink(ign, t);
1483 t->parms.iph.saddr = p.iph.saddr;
1484 t->parms.iph.daddr = p.iph.daddr;
1485 t->parms.i_key = p.i_key;
1486 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1487 memcpy(dev->broadcast, &p.iph.daddr, 4);
1488 ipgre_tunnel_link(ign, t);
1489 netdev_state_change(dev);
1490 }
1491
1492 t->parms.o_key = p.o_key;
1493 t->parms.iph.ttl = p.iph.ttl;
1494 t->parms.iph.tos = p.iph.tos;
1495 t->parms.iph.frag_off = p.iph.frag_off;
1496
1497 if (t->parms.link != p.link) {
1498 t->parms.link = p.link;
1499 mtu = ipgre_tunnel_bind_dev(dev);
1500 if (!tb[IFLA_MTU])
1501 dev->mtu = mtu;
1502 netdev_state_change(dev);
1503 }
1504
1505 return 0;
1506}
1507
1508static size_t ipgre_get_size(const struct net_device *dev)
1509{
1510 return
1511 /* IFLA_GRE_LINK */
1512 nla_total_size(4) +
1513 /* IFLA_GRE_IFLAGS */
1514 nla_total_size(2) +
1515 /* IFLA_GRE_OFLAGS */
1516 nla_total_size(2) +
1517 /* IFLA_GRE_IKEY */
1518 nla_total_size(4) +
1519 /* IFLA_GRE_OKEY */
1520 nla_total_size(4) +
1521 /* IFLA_GRE_LOCAL */
1522 nla_total_size(4) +
1523 /* IFLA_GRE_REMOTE */
1524 nla_total_size(4) +
1525 /* IFLA_GRE_TTL */
1526 nla_total_size(1) +
1527 /* IFLA_GRE_TOS */
1528 nla_total_size(1) +
1529 /* IFLA_GRE_PMTUDISC */
1530 nla_total_size(1) +
1531 0;
1532}
1533
1534static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1535{
1536 struct ip_tunnel *t = netdev_priv(dev);
1537 struct ip_tunnel_parm *p = &t->parms;
1538
1539 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1540 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1541 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1542 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1543 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1544 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1545 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1546 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1547 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1548 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1549
1550 return 0;
1551
1552nla_put_failure:
1553 return -EMSGSIZE;
1554}
1555
1556static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1557 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1558 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1559 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1560 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1561 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
1562 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1563 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1564 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1565 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1566 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
1567};
1568
1569static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1570 .kind = "gre",
1571 .maxtype = IFLA_GRE_MAX,
1572 .policy = ipgre_policy,
1573 .priv_size = sizeof(struct ip_tunnel),
1574 .setup = ipgre_tunnel_setup,
1575 .validate = ipgre_tunnel_validate,
1576 .newlink = ipgre_newlink,
1577 .changelink = ipgre_changelink,
1578 .get_size = ipgre_get_size,
1579 .fill_info = ipgre_fill_info,
1580};
1581
1582static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1583 .kind = "gretap",
1584 .maxtype = IFLA_GRE_MAX,
1585 .policy = ipgre_policy,
1586 .priv_size = sizeof(struct ip_tunnel),
1587 .setup = ipgre_tap_setup,
1588 .validate = ipgre_tap_validate,
1589 .newlink = ipgre_newlink,
1590 .changelink = ipgre_changelink,
1591 .get_size = ipgre_get_size,
1592 .fill_info = ipgre_fill_info,
1593};
1594
1224/* 1595/*
1225 * And now the modules code and kernel interface. 1596 * And now the modules code and kernel interface.
1226 */ 1597 */
@@ -1238,19 +1609,39 @@ static int __init ipgre_init(void)
1238 1609
1239 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops); 1610 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1240 if (err < 0) 1611 if (err < 0)
1241 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); 1612 goto gen_device_failed;
1242 1613
1614 err = rtnl_link_register(&ipgre_link_ops);
1615 if (err < 0)
1616 goto rtnl_link_failed;
1617
1618 err = rtnl_link_register(&ipgre_tap_ops);
1619 if (err < 0)
1620 goto tap_ops_failed;
1621
1622out:
1243 return err; 1623 return err;
1624
1625tap_ops_failed:
1626 rtnl_link_unregister(&ipgre_link_ops);
1627rtnl_link_failed:
1628 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1629gen_device_failed:
1630 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1631 goto out;
1244} 1632}
1245 1633
1246static void __exit ipgre_fini(void) 1634static void __exit ipgre_fini(void)
1247{ 1635{
1636 rtnl_link_unregister(&ipgre_tap_ops);
1637 rtnl_link_unregister(&ipgre_link_ops);
1638 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1248 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) 1639 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1249 printk(KERN_INFO "ipgre close: can't remove protocol\n"); 1640 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1250
1251 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1252} 1641}
1253 1642
1254module_init(ipgre_init); 1643module_init(ipgre_init);
1255module_exit(ipgre_fini); 1644module_exit(ipgre_fini);
1256MODULE_LICENSE("GPL"); 1645MODULE_LICENSE("GPL");
1646MODULE_ALIAS_RTNL_LINK("gre");
1647MODULE_ALIAS_RTNL_LINK("gretap");
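
As a rough illustration of the MTU bookkeeping that ipgre_tunnel_bind_dev() now returns to its callers, here is a simplified sketch. It assumes the kernel's struct iphdr and GRE_* flag definitions and deliberately does not claim to reproduce the exact in-tree arithmetic (which also accounts for the device's hard_header_len and needed_headroom):

#include <linux/ip.h>		/* struct iphdr */
#include <linux/if_tunnel.h>	/* GRE_CSUM, GRE_KEY, GRE_SEQ */
#include <linux/types.h>	/* __be16 */

/* GRE encapsulation overhead: outer IP header, base GRE header,
 * plus 4 bytes for each optional field selected by o_flags. */
static int gre_header_len(__be16 o_flags)
{
	int addend = sizeof(struct iphdr) + 4;

	if (o_flags & GRE_CSUM)
		addend += 4;
	if (o_flags & GRE_KEY)
		addend += 4;
	if (o_flags & GRE_SEQ)
		addend += 4;
	return addend;
}

/* Derive a tunnel MTU from the underlying device's MTU, clamped to the
 * IPv4 minimum of 68 bytes, the same floor used by the patch. */
static int gre_pick_mtu(int underlying_mtu, __be16 o_flags)
{
	int mtu = underlying_mtu - gre_header_len(o_flags);

	return mtu < 68 ? 68 : mtu;
}

The same overhead figure is what the patch stores in dev->needed_headroom instead of inflating hard_header_len, so that Ethernet-style gretap devices can keep a real link-layer header.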
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d533a89e08de..d2a8f8bb78a6 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -340,6 +340,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 					    .saddr = inet->saddr,
 					    .tos = RT_CONN_FLAGS(sk) } },
 			    .proto = sk->sk_protocol,
+			    .flags = inet_sk_flowi_flags(sk),
 			    .uli_u = { .ports =
 				       { .sport = inet->sport,
 					 .dport = inet->dport } } };
@@ -1371,7 +1372,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 				    .uli_u = { .ports =
 					       { .sport = tcp_hdr(skb)->dest,
 						 .dport = tcp_hdr(skb)->source } },
-				    .proto = sk->sk_protocol };
+				    .proto = sk->sk_protocol,
+				    .flags = ip_reply_arg_flowi_flags(arg) };
 	security_skb_classify_flow(skb, &fl);
 	if (ip_route_output_key(sock_net(sk), &rt, &fl))
 		return;
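
The new .flags member is filled from inet_sk_flowi_flags(), a helper added elsewhere in this series; it is assumed to look roughly like the sketch below, so that transparent sockets request routes that allow a non-local source address:

#include <net/inet_sock.h>	/* inet_sk() */
#include <net/flow.h>		/* FLOWI_FLAG_ANYSRC */

/* Assumed shape of the helper referenced by the hunks above. */
static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
{
	return inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0;
}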
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 105d92a039b9..465abf0a9869 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -419,7 +419,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 			     (1<<IP_TTL) | (1<<IP_HDRINCL) |
 			     (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
 			     (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
-			     (1<<IP_PASSSEC))) ||
+			     (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) ||
 	    optname == IP_MULTICAST_TTL ||
 	    optname == IP_MULTICAST_LOOP) {
 		if (optlen >= sizeof(int)) {
@@ -878,6 +878,16 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 		err = xfrm_user_policy(sk, optname, optval, optlen);
 		break;
 
+	case IP_TRANSPARENT:
+		if (!capable(CAP_NET_ADMIN)) {
+			err = -EPERM;
+			break;
+		}
+		if (optlen < 1)
+			goto e_inval;
+		inet->transparent = !!val;
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -1130,6 +1140,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_FREEBIND:
 		val = inet->freebind;
 		break;
+	case IP_TRANSPARENT:
+		val = inet->transparent;
+		break;
 	default:
 		release_sock(sk);
 		return -ENOPROTOOPT;
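
A minimal userspace sketch (not part of the patch) exercising the new option: enable IP_TRANSPARENT, which needs CAP_NET_ADMIN per the setsockopt handler above, then bind to an address not configured on the host, which the af_inet.c change now permits. The port and address below are arbitrary, and IP_TRANSPARENT may need a manual define on older C libraries:

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef IP_TRANSPARENT
#define IP_TRANSPARENT 19
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sin;

	/* Fails with EPERM unless the process has CAP_NET_ADMIN. */
	if (fd < 0 || setsockopt(fd, IPPROTO_IP, IP_TRANSPARENT, &one, sizeof(one)) < 0) {
		perror("IP_TRANSPARENT");
		return 1;
	}

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(8080);
	/* An address that is not local to this host: allowed once transparent is set. */
	inet_pton(AF_INET, "198.51.100.7", &sin.sin_addr);

	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
		perror("bind");
		return 1;
	}
	return 0;
}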
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
deleted file mode 100644
index 09d0c3f35669..000000000000
--- a/net/ipv4/ipvs/Kconfig
+++ /dev/null
@@ -1,224 +0,0 @@
1#
2# IP Virtual Server configuration
3#
4menuconfig IP_VS
5 tristate "IP virtual server support (EXPERIMENTAL)"
6 depends on NETFILTER
7 ---help---
8 IP Virtual Server support will let you build a high-performance
9 virtual server based on cluster of two or more real servers. This
10 option must be enabled for at least one of the clustered computers
11 that will take care of intercepting incoming connections to a
12 single IP address and scheduling them to real servers.
13
14 Three request dispatching techniques are implemented, they are
15 virtual server via NAT, virtual server via tunneling and virtual
16 server via direct routing. The several scheduling algorithms can
17 be used to choose which server the connection is directed to,
18 thus load balancing can be achieved among the servers. For more
19 information and its administration program, please visit the
20 following URL: <http://www.linuxvirtualserver.org/>.
21
22 If you want to compile it in kernel, say Y. To compile it as a
23 module, choose M here. If unsure, say N.
24
25if IP_VS
26
27config IP_VS_DEBUG
28 bool "IP virtual server debugging"
29 ---help---
30 Say Y here if you want to get additional messages useful in
31 debugging the IP virtual server code. You can change the debug
32 level in /proc/sys/net/ipv4/vs/debug_level
33
34config IP_VS_TAB_BITS
35 int "IPVS connection table size (the Nth power of 2)"
36 default "12"
37 ---help---
38 The IPVS connection hash table uses the chaining scheme to handle
39 hash collisions. Using a big IPVS connection hash table will greatly
40 reduce conflicts when there are hundreds of thousands of connections
41 in the hash table.
42
43 Note the table size must be power of 2. The table size will be the
44 value of 2 to the your input number power. The number to choose is
45 from 8 to 20, the default number is 12, which means the table size
46 is 4096. Don't input the number too small, otherwise you will lose
47 performance on it. You can adapt the table size yourself, according
48 to your virtual server application. It is good to set the table size
49 not far less than the number of connections per second multiplying
50 average lasting time of connection in the table. For example, your
51 virtual server gets 200 connections per second, the connection lasts
52 for 200 seconds in average in the connection table, the table size
53 should be not far less than 200x200, it is good to set the table
54 size 32768 (2**15).
55
56 Another note that each connection occupies 128 bytes effectively and
57 each hash entry uses 8 bytes, so you can estimate how much memory is
58 needed for your box.
59
60comment "IPVS transport protocol load balancing support"
61
62config IP_VS_PROTO_TCP
63 bool "TCP load balancing support"
64 ---help---
65 This option enables support for load balancing TCP transport
66 protocol. Say Y if unsure.
67
68config IP_VS_PROTO_UDP
69 bool "UDP load balancing support"
70 ---help---
71 This option enables support for load balancing UDP transport
72 protocol. Say Y if unsure.
73
74config IP_VS_PROTO_ESP
75 bool "ESP load balancing support"
76 ---help---
77 This option enables support for load balancing ESP (Encapsulation
78 Security Payload) transport protocol. Say Y if unsure.
79
80config IP_VS_PROTO_AH
81 bool "AH load balancing support"
82 ---help---
83 This option enables support for load balancing AH (Authentication
84 Header) transport protocol. Say Y if unsure.
85
86comment "IPVS scheduler"
87
88config IP_VS_RR
89 tristate "round-robin scheduling"
90 ---help---
91 The robin-robin scheduling algorithm simply directs network
92 connections to different real servers in a round-robin manner.
93
94 If you want to compile it in kernel, say Y. To compile it as a
95 module, choose M here. If unsure, say N.
96
97config IP_VS_WRR
98 tristate "weighted round-robin scheduling"
99 ---help---
100 The weighted robin-robin scheduling algorithm directs network
101 connections to different real servers based on server weights
102 in a round-robin manner. Servers with higher weights receive
103 new connections first than those with less weights, and servers
104 with higher weights get more connections than those with less
105 weights and servers with equal weights get equal connections.
106
107 If you want to compile it in kernel, say Y. To compile it as a
108 module, choose M here. If unsure, say N.
109
110config IP_VS_LC
111 tristate "least-connection scheduling"
112 ---help---
113 The least-connection scheduling algorithm directs network
114 connections to the server with the least number of active
115 connections.
116
117 If you want to compile it in kernel, say Y. To compile it as a
118 module, choose M here. If unsure, say N.
119
120config IP_VS_WLC
121 tristate "weighted least-connection scheduling"
122 ---help---
123 The weighted least-connection scheduling algorithm directs network
124 connections to the server with the least active connections
125 normalized by the server weight.
126
127 If you want to compile it in kernel, say Y. To compile it as a
128 module, choose M here. If unsure, say N.
129
130config IP_VS_LBLC
131 tristate "locality-based least-connection scheduling"
132 ---help---
133 The locality-based least-connection scheduling algorithm is for
134 destination IP load balancing. It is usually used in cache cluster.
135 This algorithm usually directs packet destined for an IP address to
136 its server if the server is alive and under load. If the server is
137 overloaded (its active connection numbers is larger than its weight)
138 and there is a server in its half load, then allocate the weighted
139 least-connection server to this IP address.
140
141 If you want to compile it in kernel, say Y. To compile it as a
142 module, choose M here. If unsure, say N.
143
144config IP_VS_LBLCR
145 tristate "locality-based least-connection with replication scheduling"
146 ---help---
147 The locality-based least-connection with replication scheduling
148 algorithm is also for destination IP load balancing. It is
149 usually used in cache cluster. It differs from the LBLC scheduling
150 as follows: the load balancer maintains mappings from a target
151 to a set of server nodes that can serve the target. Requests for
152 a target are assigned to the least-connection node in the target's
153 server set. If all the node in the server set are over loaded,
154 it picks up a least-connection node in the cluster and adds it
155 in the sever set for the target. If the server set has not been
156 modified for the specified time, the most loaded node is removed
157 from the server set, in order to avoid high degree of replication.
158
159 If you want to compile it in kernel, say Y. To compile it as a
160 module, choose M here. If unsure, say N.
161
162config IP_VS_DH
163 tristate "destination hashing scheduling"
164 ---help---
165 The destination hashing scheduling algorithm assigns network
166 connections to the servers through looking up a statically assigned
167 hash table by their destination IP addresses.
168
169 If you want to compile it in kernel, say Y. To compile it as a
170 module, choose M here. If unsure, say N.
171
172config IP_VS_SH
173 tristate "source hashing scheduling"
174 ---help---
175 The source hashing scheduling algorithm assigns network
176 connections to the servers through looking up a statically assigned
177 hash table by their source IP addresses.
178
179 If you want to compile it in kernel, say Y. To compile it as a
180 module, choose M here. If unsure, say N.
181
182config IP_VS_SED
183 tristate "shortest expected delay scheduling"
184 ---help---
185 The shortest expected delay scheduling algorithm assigns network
186 connections to the server with the shortest expected delay. The
187 expected delay that the job will experience is (Ci + 1) / Ui if
188 sent to the ith server, in which Ci is the number of connections
189 on the ith server and Ui is the fixed service rate (weight)
190 of the ith server.
191
192 If you want to compile it in kernel, say Y. To compile it as a
193 module, choose M here. If unsure, say N.
194
195config IP_VS_NQ
196 tristate "never queue scheduling"
197 ---help---
198 The never queue scheduling algorithm adopts a two-speed model.
199 When there is an idle server available, the job will be sent to
200 the idle server, instead of waiting for a fast one. When there
201 is no idle server available, the job will be sent to the server
202 that minimize its expected delay (The Shortest Expected Delay
203 scheduling algorithm).
204
205 If you want to compile it in kernel, say Y. To compile it as a
206 module, choose M here. If unsure, say N.
207
208comment 'IPVS application helper'
209
210config IP_VS_FTP
211 tristate "FTP protocol helper"
212 depends on IP_VS_PROTO_TCP
213 ---help---
214 FTP is a protocol that transfers IP address and/or port number in
215 the payload. In the virtual server via Network Address Translation,
216 the IP address and port number of real servers cannot be sent to
217 clients in ftp connections directly, so FTP protocol helper is
218 required for tracking the connection and mangling it back to that of
219 virtual service.
220
221 If you want to compile it in kernel, say Y. To compile it as a
222 module, choose M here. If unsure, say N.
223
224endif # IP_VS
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
deleted file mode 100644
index 30e85de9ffff..000000000000
--- a/net/ipv4/ipvs/Makefile
+++ /dev/null
@@ -1,34 +0,0 @@
1#
2# Makefile for the IPVS modules on top of IPv4.
3#
4
5# IPVS transport protocol load balancing support
6ip_vs_proto-objs-y :=
7ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
8ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
9ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o
10ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
11
12ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
13 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
14 ip_vs_est.o ip_vs_proto.o \
15 $(ip_vs_proto-objs-y)
16
17
18# IPVS core
19obj-$(CONFIG_IP_VS) += ip_vs.o
20
21# IPVS schedulers
22obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
23obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
24obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
25obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
26obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
27obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
28obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
29obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
30obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
31obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
32
33# IPVS application helpers
34obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
deleted file mode 100644
index 201b8ea3020d..000000000000
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ /dev/null
@@ -1,622 +0,0 @@
1/*
2 * ip_vs_app.c: Application module support for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
12 * is that ip_vs_app module handles the reverse direction (incoming requests
13 * and outgoing responses).
14 *
15 * IP_MASQ_APP application masquerading module
16 *
17 * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
18 *
19 */
20
21#include <linux/module.h>
22#include <linux/kernel.h>
23#include <linux/skbuff.h>
24#include <linux/in.h>
25#include <linux/ip.h>
26#include <linux/netfilter.h>
27#include <net/net_namespace.h>
28#include <net/protocol.h>
29#include <net/tcp.h>
30#include <asm/system.h>
31#include <linux/stat.h>
32#include <linux/proc_fs.h>
33#include <linux/seq_file.h>
34#include <linux/mutex.h>
35
36#include <net/ip_vs.h>
37
38EXPORT_SYMBOL(register_ip_vs_app);
39EXPORT_SYMBOL(unregister_ip_vs_app);
40EXPORT_SYMBOL(register_ip_vs_app_inc);
41
42/* ipvs application list head */
43static LIST_HEAD(ip_vs_app_list);
44static DEFINE_MUTEX(__ip_vs_app_mutex);
45
46
47/*
48 * Get an ip_vs_app object
49 */
50static inline int ip_vs_app_get(struct ip_vs_app *app)
51{
52 return try_module_get(app->module);
53}
54
55
56static inline void ip_vs_app_put(struct ip_vs_app *app)
57{
58 module_put(app->module);
59}
60
61
62/*
63 * Allocate/initialize app incarnation and register it in proto apps.
64 */
65static int
66ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
67{
68 struct ip_vs_protocol *pp;
69 struct ip_vs_app *inc;
70 int ret;
71
72 if (!(pp = ip_vs_proto_get(proto)))
73 return -EPROTONOSUPPORT;
74
75 if (!pp->unregister_app)
76 return -EOPNOTSUPP;
77
78 inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
79 if (!inc)
80 return -ENOMEM;
81 INIT_LIST_HEAD(&inc->p_list);
82 INIT_LIST_HEAD(&inc->incs_list);
83 inc->app = app;
84 inc->port = htons(port);
85 atomic_set(&inc->usecnt, 0);
86
87 if (app->timeouts) {
88 inc->timeout_table =
89 ip_vs_create_timeout_table(app->timeouts,
90 app->timeouts_size);
91 if (!inc->timeout_table) {
92 ret = -ENOMEM;
93 goto out;
94 }
95 }
96
97 ret = pp->register_app(inc);
98 if (ret)
99 goto out;
100
101 list_add(&inc->a_list, &app->incs_list);
102 IP_VS_DBG(9, "%s application %s:%u registered\n",
103 pp->name, inc->name, inc->port);
104
105 return 0;
106
107 out:
108 kfree(inc->timeout_table);
109 kfree(inc);
110 return ret;
111}
112
113
114/*
115 * Release app incarnation
116 */
117static void
118ip_vs_app_inc_release(struct ip_vs_app *inc)
119{
120 struct ip_vs_protocol *pp;
121
122 if (!(pp = ip_vs_proto_get(inc->protocol)))
123 return;
124
125 if (pp->unregister_app)
126 pp->unregister_app(inc);
127
128 IP_VS_DBG(9, "%s App %s:%u unregistered\n",
129 pp->name, inc->name, inc->port);
130
131 list_del(&inc->a_list);
132
133 kfree(inc->timeout_table);
134 kfree(inc);
135}
136
137
138/*
139 * Get reference to app inc (only called from softirq)
140 *
141 */
142int ip_vs_app_inc_get(struct ip_vs_app *inc)
143{
144 int result;
145
146 atomic_inc(&inc->usecnt);
147 if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
148 atomic_dec(&inc->usecnt);
149 return result;
150}
151
152
153/*
154 * Put the app inc (only called from timer or net softirq)
155 */
156void ip_vs_app_inc_put(struct ip_vs_app *inc)
157{
158 ip_vs_app_put(inc->app);
159 atomic_dec(&inc->usecnt);
160}
161
162
163/*
164 * Register an application incarnation in protocol applications
165 */
166int
167register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
168{
169 int result;
170
171 mutex_lock(&__ip_vs_app_mutex);
172
173 result = ip_vs_app_inc_new(app, proto, port);
174
175 mutex_unlock(&__ip_vs_app_mutex);
176
177 return result;
178}
179
180
181/*
182 * ip_vs_app registration routine
183 */
184int register_ip_vs_app(struct ip_vs_app *app)
185{
186 /* increase the module use count */
187 ip_vs_use_count_inc();
188
189 mutex_lock(&__ip_vs_app_mutex);
190
191 list_add(&app->a_list, &ip_vs_app_list);
192
193 mutex_unlock(&__ip_vs_app_mutex);
194
195 return 0;
196}
197
198
199/*
200 * ip_vs_app unregistration routine
201 * We are sure there are no app incarnations attached to services
202 */
203void unregister_ip_vs_app(struct ip_vs_app *app)
204{
205 struct ip_vs_app *inc, *nxt;
206
207 mutex_lock(&__ip_vs_app_mutex);
208
209 list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
210 ip_vs_app_inc_release(inc);
211 }
212
213 list_del(&app->a_list);
214
215 mutex_unlock(&__ip_vs_app_mutex);
216
217 /* decrease the module use count */
218 ip_vs_use_count_dec();
219}
220
221
222/*
223 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
224 */
225int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
226{
227 return pp->app_conn_bind(cp);
228}
229
230
231/*
232 * Unbind cp from application incarnation (called by cp destructor)
233 */
234void ip_vs_unbind_app(struct ip_vs_conn *cp)
235{
236 struct ip_vs_app *inc = cp->app;
237
238 if (!inc)
239 return;
240
241 if (inc->unbind_conn)
242 inc->unbind_conn(inc, cp);
243 if (inc->done_conn)
244 inc->done_conn(inc, cp);
245 ip_vs_app_inc_put(inc);
246 cp->app = NULL;
247}
248
249
250/*
251 * Fixes th->seq based on ip_vs_seq info.
252 */
253static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
254{
255 __u32 seq = ntohl(th->seq);
256
257 /*
258 * Adjust seq with delta-offset for all packets after
259 * the most recent resized pkt seq and with previous_delta offset
260	 * for all packets before the most recent resized pkt seq.
261 */
262 if (vseq->delta || vseq->previous_delta) {
263 if(after(seq, vseq->init_seq)) {
264 th->seq = htonl(seq + vseq->delta);
265 IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
266 vseq->delta);
267 } else {
268 th->seq = htonl(seq + vseq->previous_delta);
269 IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
270 "(%d) to seq\n", vseq->previous_delta);
271 }
272 }
273}
274
275
276/*
277 * Fixes th->ack_seq based on ip_vs_seq info.
278 */
279static inline void
280vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
281{
282 __u32 ack_seq = ntohl(th->ack_seq);
283
284 /*
285	 * Adjust ack_seq with delta-offset for packets AFTER the most
286	 * recent resized pkt has caused a shift; for packets before the
287	 * most recent resized pkt, use previous_delta
288 */
289 if (vseq->delta || vseq->previous_delta) {
290		/* since ack_seq is the sequence number of the next expected
291		   octet, compare it with init_seq+delta */
292 if(after(ack_seq, vseq->init_seq+vseq->delta)) {
293 th->ack_seq = htonl(ack_seq - vseq->delta);
294 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
295 "(%d) from ack_seq\n", vseq->delta);
296
297 } else {
298 th->ack_seq = htonl(ack_seq - vseq->previous_delta);
299 IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
300 "previous_delta (%d) from ack_seq\n",
301 vseq->previous_delta);
302 }
303 }
304}
305
306
307/*
308 * Updates ip_vs_seq if pkt has been resized
309 * Assumes already checked proto==IPPROTO_TCP and diff!=0.
310 */
311static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
312 unsigned flag, __u32 seq, int diff)
313{
314 /* spinlock is to keep updating cp->flags atomic */
315 spin_lock(&cp->lock);
316 if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
317 vseq->previous_delta = vseq->delta;
318 vseq->delta += diff;
319 vseq->init_seq = seq;
320 cp->flags |= flag;
321 }
322 spin_unlock(&cp->lock);
323}
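The delta bookkeeping in vs_seq_update() and the two fixers above is easiest to follow with concrete numbers. The following is a minimal standalone sketch (plain userspace C, not kernel code; names are reused only for readability) that mirrors the same arithmetic; the kernel uses the wraparound-safe after() comparison where the sketch uses a plain greater-than.

/* Standalone illustration of the IPVS sequence-delta bookkeeping:
 * one outgoing packet at seq 1000 grows by 5 bytes after a payload
 * rewrite, so later sequence numbers must be shifted by +5.
 */
#include <stdio.h>
#include <stdint.h>

struct seq_delta {
	uint32_t init_seq;	/* seq of the most recently resized packet */
	int delta;		/* offset for packets after init_seq */
	int previous_delta;	/* offset for packets before init_seq */
};

/* mirrors vs_seq_update(): remember the old delta, accumulate the new one */
static void seq_update(struct seq_delta *v, uint32_t seq, int diff)
{
	v->previous_delta = v->delta;
	v->delta += diff;
	v->init_seq = seq;
}

/* mirrors vs_fix_seq(): packets after the resized one get delta,
 * earlier (e.g. retransmitted) ones get previous_delta */
static uint32_t fix_seq(const struct seq_delta *v, uint32_t seq)
{
	return seq > v->init_seq ? seq + v->delta : seq + v->previous_delta;
}

int main(void)
{
	struct seq_delta out = { 0, 0, 0 };

	seq_update(&out, 1000, 5);	/* packet at seq 1000 grew by 5 bytes */

	printf("seq  900 -> %u\n", fix_seq(&out,  900));	/* 900, unchanged */
	printf("seq 1500 -> %u\n", fix_seq(&out, 1500));	/* 1505, shifted  */
	return 0;
}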
324
325static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
326 struct ip_vs_app *app)
327{
328 int diff;
329 const unsigned int tcp_offset = ip_hdrlen(skb);
330 struct tcphdr *th;
331 __u32 seq;
332
333 if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
334 return 0;
335
336 th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
337
338 /*
339 * Remember seq number in case this pkt gets resized
340 */
341 seq = ntohl(th->seq);
342
343 /*
344 * Fix seq stuff if flagged as so.
345 */
346 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
347 vs_fix_seq(&cp->out_seq, th);
348 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
349 vs_fix_ack_seq(&cp->in_seq, th);
350
351 /*
352 * Call private output hook function
353 */
354 if (app->pkt_out == NULL)
355 return 1;
356
357 if (!app->pkt_out(app, cp, skb, &diff))
358 return 0;
359
360 /*
361 * Update ip_vs seq stuff if len has changed.
362 */
363 if (diff != 0)
364 vs_seq_update(cp, &cp->out_seq,
365 IP_VS_CONN_F_OUT_SEQ, seq, diff);
366
367 return 1;
368}
369
370/*
371 * Output pkt hook. Will call bound ip_vs_app specific function
372 * called by ipvs packet handler, assumes previously checked cp!=NULL
373 * returns false if it can't handle packet (oom)
374 */
375int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
376{
377 struct ip_vs_app *app;
378
379 /*
380 * check if application module is bound to
381 * this ip_vs_conn.
382 */
383 if ((app = cp->app) == NULL)
384 return 1;
385
386 /* TCP is complicated */
387 if (cp->protocol == IPPROTO_TCP)
388 return app_tcp_pkt_out(cp, skb, app);
389
390 /*
391 * Call private output hook function
392 */
393 if (app->pkt_out == NULL)
394 return 1;
395
396 return app->pkt_out(app, cp, skb, NULL);
397}
398
399
400static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
401 struct ip_vs_app *app)
402{
403 int diff;
404 const unsigned int tcp_offset = ip_hdrlen(skb);
405 struct tcphdr *th;
406 __u32 seq;
407
408 if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
409 return 0;
410
411 th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
412
413 /*
414 * Remember seq number in case this pkt gets resized
415 */
416 seq = ntohl(th->seq);
417
418 /*
419 * Fix seq stuff if flagged as so.
420 */
421 if (cp->flags & IP_VS_CONN_F_IN_SEQ)
422 vs_fix_seq(&cp->in_seq, th);
423 if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
424 vs_fix_ack_seq(&cp->out_seq, th);
425
426 /*
427 * Call private input hook function
428 */
429 if (app->pkt_in == NULL)
430 return 1;
431
432 if (!app->pkt_in(app, cp, skb, &diff))
433 return 0;
434
435 /*
436 * Update ip_vs seq stuff if len has changed.
437 */
438 if (diff != 0)
439 vs_seq_update(cp, &cp->in_seq,
440 IP_VS_CONN_F_IN_SEQ, seq, diff);
441
442 return 1;
443}
444
445/*
446 * Input pkt hook. Will call bound ip_vs_app specific function
447 * called by ipvs packet handler, assumes previously checked cp!=NULL.
448 * returns false if can't handle packet (oom).
449 */
450int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
451{
452 struct ip_vs_app *app;
453
454 /*
455 * check if application module is bound to
456 * this ip_vs_conn.
457 */
458 if ((app = cp->app) == NULL)
459 return 1;
460
461 /* TCP is complicated */
462 if (cp->protocol == IPPROTO_TCP)
463 return app_tcp_pkt_in(cp, skb, app);
464
465 /*
466 * Call private input hook function
467 */
468 if (app->pkt_in == NULL)
469 return 1;
470
471 return app->pkt_in(app, cp, skb, NULL);
472}
473
474
475#ifdef CONFIG_PROC_FS
476/*
477 * /proc/net/ip_vs_app entry function
478 */
479
480static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
481{
482 struct ip_vs_app *app, *inc;
483
484 list_for_each_entry(app, &ip_vs_app_list, a_list) {
485 list_for_each_entry(inc, &app->incs_list, a_list) {
486 if (pos-- == 0)
487 return inc;
488 }
489 }
490 return NULL;
491
492}
493
494static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
495{
496 mutex_lock(&__ip_vs_app_mutex);
497
498 return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
499}
500
501static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
502{
503 struct ip_vs_app *inc, *app;
504 struct list_head *e;
505
506 ++*pos;
507 if (v == SEQ_START_TOKEN)
508 return ip_vs_app_idx(0);
509
510 inc = v;
511 app = inc->app;
512
513 if ((e = inc->a_list.next) != &app->incs_list)
514 return list_entry(e, struct ip_vs_app, a_list);
515
516 /* go on to next application */
517 for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
518 app = list_entry(e, struct ip_vs_app, a_list);
519 list_for_each_entry(inc, &app->incs_list, a_list) {
520 return inc;
521 }
522 }
523 return NULL;
524}
525
526static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
527{
528 mutex_unlock(&__ip_vs_app_mutex);
529}
530
531static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
532{
533 if (v == SEQ_START_TOKEN)
534 seq_puts(seq, "prot port usecnt name\n");
535 else {
536 const struct ip_vs_app *inc = v;
537
538 seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
539 ip_vs_proto_name(inc->protocol),
540 ntohs(inc->port),
541 atomic_read(&inc->usecnt),
542 inc->name);
543 }
544 return 0;
545}
546
547static const struct seq_operations ip_vs_app_seq_ops = {
548 .start = ip_vs_app_seq_start,
549 .next = ip_vs_app_seq_next,
550 .stop = ip_vs_app_seq_stop,
551 .show = ip_vs_app_seq_show,
552};
553
554static int ip_vs_app_open(struct inode *inode, struct file *file)
555{
556 return seq_open(file, &ip_vs_app_seq_ops);
557}
558
559static const struct file_operations ip_vs_app_fops = {
560 .owner = THIS_MODULE,
561 .open = ip_vs_app_open,
562 .read = seq_read,
563 .llseek = seq_lseek,
564 .release = seq_release,
565};
566#endif
567
568
569/*
570 * Replace a segment of data with a new segment
571 */
572int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
573 char *o_buf, int o_len, char *n_buf, int n_len)
574{
575 int diff;
576 int o_offset;
577 int o_left;
578
579 EnterFunction(9);
580
581 diff = n_len - o_len;
582 o_offset = o_buf - (char *)skb->data;
583 /* The length of left data after o_buf+o_len in the skb data */
584 o_left = skb->len - (o_offset + o_len);
585
586 if (diff <= 0) {
587 memmove(o_buf + n_len, o_buf + o_len, o_left);
588 memcpy(o_buf, n_buf, n_len);
589 skb_trim(skb, skb->len + diff);
590 } else if (diff <= skb_tailroom(skb)) {
591 skb_put(skb, diff);
592 memmove(o_buf + n_len, o_buf + o_len, o_left);
593 memcpy(o_buf, n_buf, n_len);
594 } else {
595 if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
596 return -ENOMEM;
597 skb_put(skb, diff);
598 memmove(skb->data + o_offset + n_len,
599 skb->data + o_offset + o_len, o_left);
600 skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len);
601 }
602
603 /* must update the iph total length here */
604 ip_hdr(skb)->tot_len = htons(skb->len);
605
606 LeaveFunction(9);
607 return 0;
608}
609
610
611int __init ip_vs_app_init(void)
612{
613 /* we will replace it with proc_net_ipvs_create() soon */
614 proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops);
615 return 0;
616}
617
618
619void ip_vs_app_cleanup(void)
620{
621 proc_net_remove(&init_net, "ip_vs_app");
622}
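For context on how the registration interface in this file is consumed by a helper such as ip_vs_ftp: the outline below is only a rough sketch, with "myhelper" and port 7777 as invented placeholders. The pkt_out prototype is inferred from the app->pkt_out(app, cp, skb, &diff) calls above; a real helper would parse and rewrite the payload (for example via ip_vs_skb_replace()) and report any length change through *diff.

/* Sketch of an IPVS application helper, modeled loosely on ip_vs_ftp.c.
 * "myhelper" and port 7777 are placeholders, not an existing helper.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <net/ip_vs.h>

static int myhelper_pkt_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
			    struct sk_buff *skb, int *diff)
{
	if (diff)
		*diff = 0;	/* payload length unchanged in this sketch */
	/* a real helper would rewrite the payload here and set *diff
	 * to the resulting change in length */
	return 1;		/* non-zero: keep processing the packet */
}

static struct ip_vs_app ip_vs_myhelper = {
	.name		= "myhelper",
	.protocol	= IPPROTO_TCP,
	.module		= THIS_MODULE,
	.incs_list	= LIST_HEAD_INIT(ip_vs_myhelper.incs_list),
	.pkt_out	= myhelper_pkt_out,
};

static int __init ip_vs_myhelper_init(void)
{
	int ret = register_ip_vs_app(&ip_vs_myhelper);

	if (ret)
		return ret;

	/* one incarnation per service port the helper should hook;
	 * the port is passed in host byte order (see ip_vs_app_inc_new) */
	ret = register_ip_vs_app_inc(&ip_vs_myhelper, IPPROTO_TCP, 7777);
	if (ret)
		unregister_ip_vs_app(&ip_vs_myhelper);
	return ret;
}

static void __exit ip_vs_myhelper_exit(void)
{
	unregister_ip_vs_app(&ip_vs_myhelper);
}

module_init(ip_vs_myhelper_init);
module_exit(ip_vs_myhelper_exit);
MODULE_LICENSE("GPL");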
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
deleted file mode 100644
index 44a6872dc245..000000000000
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ /dev/null
@@ -1,1023 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
20 *
21 * Changes:
22 *
23 */
24
25#include <linux/interrupt.h>
26#include <linux/in.h>
27#include <linux/net.h>
28#include <linux/kernel.h>
29#include <linux/module.h>
30#include <linux/vmalloc.h>
31#include <linux/proc_fs.h> /* for proc_net_* */
32#include <linux/seq_file.h>
33#include <linux/jhash.h>
34#include <linux/random.h>
35
36#include <net/net_namespace.h>
37#include <net/ip_vs.h>
38
39
40/*
41 * Connection hash table: for input and output packets lookups of IPVS
42 */
43static struct list_head *ip_vs_conn_tab;
44
45/* SLAB cache for IPVS connections */
46static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
47
48/* counter for current IPVS connections */
49static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
50
51/* counter for no client port connections */
52static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
53
54/* random value for IPVS connection hash */
55static unsigned int ip_vs_conn_rnd;
56
57/*
58 * Fine locking granularity for big connection hash table
59 */
60#define CT_LOCKARRAY_BITS 4
61#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
62#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
63
64struct ip_vs_aligned_lock
65{
66 rwlock_t l;
67} __attribute__((__aligned__(SMP_CACHE_BYTES)));
68
69/* lock array for conn table */
70static struct ip_vs_aligned_lock
71__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
72
73static inline void ct_read_lock(unsigned key)
74{
75 read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
76}
77
78static inline void ct_read_unlock(unsigned key)
79{
80 read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
81}
82
83static inline void ct_write_lock(unsigned key)
84{
85 write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
86}
87
88static inline void ct_write_unlock(unsigned key)
89{
90 write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
91}
92
93static inline void ct_read_lock_bh(unsigned key)
94{
95 read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
96}
97
98static inline void ct_read_unlock_bh(unsigned key)
99{
100 read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
101}
102
103static inline void ct_write_lock_bh(unsigned key)
104{
105 write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
106}
107
108static inline void ct_write_unlock_bh(unsigned key)
109{
110 write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
111}
112
113
114/*
115 * Returns hash value for IPVS connection entry
116 */
117static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port)
118{
119 return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd)
120 & IP_VS_CONN_TAB_MASK;
121}
122
123
124/*
125 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
126 * returns bool success.
127 */
128static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
129{
130 unsigned hash;
131 int ret;
132
133 /* Hash by protocol, client address and port */
134 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
135
136 ct_write_lock(hash);
137
138 if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
139 list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
140 cp->flags |= IP_VS_CONN_F_HASHED;
141 atomic_inc(&cp->refcnt);
142 ret = 1;
143 } else {
144 IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
145 "called from %p\n", __builtin_return_address(0));
146 ret = 0;
147 }
148
149 ct_write_unlock(hash);
150
151 return ret;
152}
153
154
155/*
156 * UNhashes ip_vs_conn from ip_vs_conn_tab.
157 * returns bool success.
158 */
159static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
160{
161 unsigned hash;
162 int ret;
163
164 /* unhash it and decrease its reference counter */
165 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
166
167 ct_write_lock(hash);
168
169 if (cp->flags & IP_VS_CONN_F_HASHED) {
170 list_del(&cp->c_list);
171 cp->flags &= ~IP_VS_CONN_F_HASHED;
172 atomic_dec(&cp->refcnt);
173 ret = 1;
174 } else
175 ret = 0;
176
177 ct_write_unlock(hash);
178
179 return ret;
180}
181
182
183/*
184 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
185 * Called for pkts coming from OUTside-to-INside.
186 * s_addr, s_port: pkt source address (foreign host)
187 * d_addr, d_port: pkt dest address (load balancer)
188 */
189static inline struct ip_vs_conn *__ip_vs_conn_in_get
190(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
191{
192 unsigned hash;
193 struct ip_vs_conn *cp;
194
195 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
196
197 ct_read_lock(hash);
198
199 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
200 if (s_addr==cp->caddr && s_port==cp->cport &&
201 d_port==cp->vport && d_addr==cp->vaddr &&
202 ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
203 protocol==cp->protocol) {
204 /* HIT */
205 atomic_inc(&cp->refcnt);
206 ct_read_unlock(hash);
207 return cp;
208 }
209 }
210
211 ct_read_unlock(hash);
212
213 return NULL;
214}
215
216struct ip_vs_conn *ip_vs_conn_in_get
217(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
218{
219 struct ip_vs_conn *cp;
220
221 cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
222 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
223 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
224
225 IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
226 ip_vs_proto_name(protocol),
227 NIPQUAD(s_addr), ntohs(s_port),
228 NIPQUAD(d_addr), ntohs(d_port),
229 cp?"hit":"not hit");
230
231 return cp;
232}
233
234/* Get reference to connection template */
235struct ip_vs_conn *ip_vs_ct_in_get
236(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
237{
238 unsigned hash;
239 struct ip_vs_conn *cp;
240
241 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
242
243 ct_read_lock(hash);
244
245 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
246 if (s_addr==cp->caddr && s_port==cp->cport &&
247 d_port==cp->vport && d_addr==cp->vaddr &&
248 cp->flags & IP_VS_CONN_F_TEMPLATE &&
249 protocol==cp->protocol) {
250 /* HIT */
251 atomic_inc(&cp->refcnt);
252 goto out;
253 }
254 }
255 cp = NULL;
256
257 out:
258 ct_read_unlock(hash);
259
260 IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
261 ip_vs_proto_name(protocol),
262 NIPQUAD(s_addr), ntohs(s_port),
263 NIPQUAD(d_addr), ntohs(d_port),
264 cp?"hit":"not hit");
265
266 return cp;
267}
268
269/*
270 * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
271 * Called for pkts coming from inside-to-OUTside.
272 * s_addr, s_port: pkt source address (inside host)
273 * d_addr, d_port: pkt dest address (foreign host)
274 */
275struct ip_vs_conn *ip_vs_conn_out_get
276(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port)
277{
278 unsigned hash;
279 struct ip_vs_conn *cp, *ret=NULL;
280
281 /*
282 * Check for "full" addressed entries
283 */
284 hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
285
286 ct_read_lock(hash);
287
288 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
289 if (d_addr == cp->caddr && d_port == cp->cport &&
290 s_port == cp->dport && s_addr == cp->daddr &&
291 protocol == cp->protocol) {
292 /* HIT */
293 atomic_inc(&cp->refcnt);
294 ret = cp;
295 break;
296 }
297 }
298
299 ct_read_unlock(hash);
300
301 IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
302 ip_vs_proto_name(protocol),
303 NIPQUAD(s_addr), ntohs(s_port),
304 NIPQUAD(d_addr), ntohs(d_port),
305 ret?"hit":"not hit");
306
307 return ret;
308}
309
310
311/*
312 * Put back the conn and restart its timer with its timeout
313 */
314void ip_vs_conn_put(struct ip_vs_conn *cp)
315{
316	/* reset the timer so the conn expires after its timeout */
317 mod_timer(&cp->timer, jiffies+cp->timeout);
318
319 __ip_vs_conn_put(cp);
320}
321
322
323/*
324 * Fill a no_client_port connection with a client port number
325 */
326void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
327{
328 if (ip_vs_conn_unhash(cp)) {
329 spin_lock(&cp->lock);
330 if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
331 atomic_dec(&ip_vs_conn_no_cport_cnt);
332 cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
333 cp->cport = cport;
334 }
335 spin_unlock(&cp->lock);
336
337		/* rehash on the new cport */
338 ip_vs_conn_hash(cp);
339 }
340}
341
342
343/*
344 * Bind a connection entry with the corresponding packet_xmit.
345 * Called by ip_vs_conn_new.
346 */
347static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
348{
349 switch (IP_VS_FWD_METHOD(cp)) {
350 case IP_VS_CONN_F_MASQ:
351 cp->packet_xmit = ip_vs_nat_xmit;
352 break;
353
354 case IP_VS_CONN_F_TUNNEL:
355 cp->packet_xmit = ip_vs_tunnel_xmit;
356 break;
357
358 case IP_VS_CONN_F_DROUTE:
359 cp->packet_xmit = ip_vs_dr_xmit;
360 break;
361
362 case IP_VS_CONN_F_LOCALNODE:
363 cp->packet_xmit = ip_vs_null_xmit;
364 break;
365
366 case IP_VS_CONN_F_BYPASS:
367 cp->packet_xmit = ip_vs_bypass_xmit;
368 break;
369 }
370}
371
372
373static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
374{
375 return atomic_read(&dest->activeconns)
376 + atomic_read(&dest->inactconns);
377}
378
379/*
380 * Bind a connection entry with a virtual service destination
381 * Called just after a new connection entry is created.
382 */
383static inline void
384ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
385{
386 /* if dest is NULL, then return directly */
387 if (!dest)
388 return;
389
390 /* Increase the refcnt counter of the dest */
391 atomic_inc(&dest->refcnt);
392
393 /* Bind with the destination and its corresponding transmitter */
394 if ((cp->flags & IP_VS_CONN_F_SYNC) &&
395 (!(cp->flags & IP_VS_CONN_F_TEMPLATE)))
396		/* if the connection is not a template and was created
397		 * by sync, preserve the activity flag.
398 */
399 cp->flags |= atomic_read(&dest->conn_flags) &
400 (~IP_VS_CONN_F_INACTIVE);
401 else
402 cp->flags |= atomic_read(&dest->conn_flags);
403 cp->dest = dest;
404
405 IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
406 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
407 "dest->refcnt:%d\n",
408 ip_vs_proto_name(cp->protocol),
409 NIPQUAD(cp->caddr), ntohs(cp->cport),
410 NIPQUAD(cp->vaddr), ntohs(cp->vport),
411 NIPQUAD(cp->daddr), ntohs(cp->dport),
412 ip_vs_fwd_tag(cp), cp->state,
413 cp->flags, atomic_read(&cp->refcnt),
414 atomic_read(&dest->refcnt));
415
416 /* Update the connection counters */
417 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
418 /* It is a normal connection, so increase the inactive
419 connection counter because it is in TCP SYNRECV
420		   state (inactive) or another protocol's inactive state */
421 if ((cp->flags & IP_VS_CONN_F_SYNC) &&
422 (!(cp->flags & IP_VS_CONN_F_INACTIVE)))
423 atomic_inc(&dest->activeconns);
424 else
425 atomic_inc(&dest->inactconns);
426 } else {
427 /* It is a persistent connection/template, so increase
428		   the persistent connection counter */
429 atomic_inc(&dest->persistconns);
430 }
431
432 if (dest->u_threshold != 0 &&
433 ip_vs_dest_totalconns(dest) >= dest->u_threshold)
434 dest->flags |= IP_VS_DEST_F_OVERLOAD;
435}
436
437
438/*
439 * Check if there is a destination for the connection, if so
440 * bind the connection to the destination.
441 */
442struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
443{
444 struct ip_vs_dest *dest;
445
446 if ((cp) && (!cp->dest)) {
447 dest = ip_vs_find_dest(cp->daddr, cp->dport,
448 cp->vaddr, cp->vport, cp->protocol);
449 ip_vs_bind_dest(cp, dest);
450 return dest;
451 } else
452 return NULL;
453}
454
455
456/*
457 * Unbind a connection entry with its VS destination
458 * Called by the ip_vs_conn_expire function.
459 */
460static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
461{
462 struct ip_vs_dest *dest = cp->dest;
463
464 if (!dest)
465 return;
466
467 IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
468 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
469 "dest->refcnt:%d\n",
470 ip_vs_proto_name(cp->protocol),
471 NIPQUAD(cp->caddr), ntohs(cp->cport),
472 NIPQUAD(cp->vaddr), ntohs(cp->vport),
473 NIPQUAD(cp->daddr), ntohs(cp->dport),
474 ip_vs_fwd_tag(cp), cp->state,
475 cp->flags, atomic_read(&cp->refcnt),
476 atomic_read(&dest->refcnt));
477
478 /* Update the connection counters */
479 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
480 /* It is a normal connection, so decrease the inactconns
481 or activeconns counter */
482 if (cp->flags & IP_VS_CONN_F_INACTIVE) {
483 atomic_dec(&dest->inactconns);
484 } else {
485 atomic_dec(&dest->activeconns);
486 }
487 } else {
488 /* It is a persistent connection/template, so decrease
489		   the persistent connection counter */
490 atomic_dec(&dest->persistconns);
491 }
492
493 if (dest->l_threshold != 0) {
494 if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
495 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
496 } else if (dest->u_threshold != 0) {
497 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
498 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
499 } else {
500 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
501 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
502 }
503
504 /*
505 * Simply decrease the refcnt of the dest, because the
506 * dest will be either in service's destination list
507 * or in the trash.
508 */
509 atomic_dec(&dest->refcnt);
510}
511
512
513/*
514 * Checking if the destination of a connection template is available.
515 * If available, return 1, otherwise invalidate this connection
516 * template and return 0.
517 */
518int ip_vs_check_template(struct ip_vs_conn *ct)
519{
520 struct ip_vs_dest *dest = ct->dest;
521
522 /*
523 * Checking the dest server status.
524 */
525 if ((dest == NULL) ||
526 !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
527 (sysctl_ip_vs_expire_quiescent_template &&
528 (atomic_read(&dest->weight) == 0))) {
529 IP_VS_DBG(9, "check_template: dest not available for "
530 "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
531 "-> d:%u.%u.%u.%u:%d\n",
532 ip_vs_proto_name(ct->protocol),
533 NIPQUAD(ct->caddr), ntohs(ct->cport),
534 NIPQUAD(ct->vaddr), ntohs(ct->vport),
535 NIPQUAD(ct->daddr), ntohs(ct->dport));
536
537 /*
538 * Invalidate the connection template
539 */
540 if (ct->vport != htons(0xffff)) {
541 if (ip_vs_conn_unhash(ct)) {
542 ct->dport = htons(0xffff);
543 ct->vport = htons(0xffff);
544 ct->cport = 0;
545 ip_vs_conn_hash(ct);
546 }
547 }
548
549 /*
550 * Simply decrease the refcnt of the template,
551 * don't restart its timer.
552 */
553 atomic_dec(&ct->refcnt);
554 return 0;
555 }
556 return 1;
557}
558
559static void ip_vs_conn_expire(unsigned long data)
560{
561 struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
562
563 cp->timeout = 60*HZ;
564
565 /*
566 * hey, I'm using it
567 */
568 atomic_inc(&cp->refcnt);
569
570 /*
571 * do I control anybody?
572 */
573 if (atomic_read(&cp->n_control))
574 goto expire_later;
575
576 /*
577 * unhash it if it is hashed in the conn table
578 */
579 if (!ip_vs_conn_unhash(cp))
580 goto expire_later;
581
582 /*
583	 * refcnt==1 implies I'm the only referrer
584 */
585 if (likely(atomic_read(&cp->refcnt) == 1)) {
586 /* delete the timer if it is activated by other users */
587 if (timer_pending(&cp->timer))
588 del_timer(&cp->timer);
589
590 /* does anybody control me? */
591 if (cp->control)
592 ip_vs_control_del(cp);
593
594 if (unlikely(cp->app != NULL))
595 ip_vs_unbind_app(cp);
596 ip_vs_unbind_dest(cp);
597 if (cp->flags & IP_VS_CONN_F_NO_CPORT)
598 atomic_dec(&ip_vs_conn_no_cport_cnt);
599 atomic_dec(&ip_vs_conn_count);
600
601 kmem_cache_free(ip_vs_conn_cachep, cp);
602 return;
603 }
604
605 /* hash it back to the table */
606 ip_vs_conn_hash(cp);
607
608 expire_later:
609 IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
610 atomic_read(&cp->refcnt)-1,
611 atomic_read(&cp->n_control));
612
613 ip_vs_conn_put(cp);
614}
615
616
617void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
618{
619 if (del_timer(&cp->timer))
620 mod_timer(&cp->timer, jiffies);
621}
622
623
624/*
625 * Create a new connection entry and hash it into the ip_vs_conn_tab
626 */
627struct ip_vs_conn *
628ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport,
629 __be32 daddr, __be16 dport, unsigned flags,
630 struct ip_vs_dest *dest)
631{
632 struct ip_vs_conn *cp;
633 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
634
635 cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
636 if (cp == NULL) {
637 IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
638 return NULL;
639 }
640
641 INIT_LIST_HEAD(&cp->c_list);
642 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
643 cp->protocol = proto;
644 cp->caddr = caddr;
645 cp->cport = cport;
646 cp->vaddr = vaddr;
647 cp->vport = vport;
648 cp->daddr = daddr;
649 cp->dport = dport;
650 cp->flags = flags;
651 spin_lock_init(&cp->lock);
652
653 /*
654	 * Mark the entry as referenced by the current thread before hashing
655	 * it into the table, so that another thread running ip_vs_random_dropentry
656	 * cannot drop this entry.
657 */
658 atomic_set(&cp->refcnt, 1);
659
660 atomic_set(&cp->n_control, 0);
661 atomic_set(&cp->in_pkts, 0);
662
663 atomic_inc(&ip_vs_conn_count);
664 if (flags & IP_VS_CONN_F_NO_CPORT)
665 atomic_inc(&ip_vs_conn_no_cport_cnt);
666
667 /* Bind the connection with a destination server */
668 ip_vs_bind_dest(cp, dest);
669
670 /* Set its state and timeout */
671 cp->state = 0;
672 cp->timeout = 3*HZ;
673
674 /* Bind its packet transmitter */
675 ip_vs_bind_xmit(cp);
676
677 if (unlikely(pp && atomic_read(&pp->appcnt)))
678 ip_vs_bind_app(cp, pp);
679
680 /* Hash it in the ip_vs_conn_tab finally */
681 ip_vs_conn_hash(cp);
682
683 return cp;
684}
685
686
687/*
688 * /proc/net/ip_vs_conn entries
689 */
690#ifdef CONFIG_PROC_FS
691
692static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
693{
694 int idx;
695 struct ip_vs_conn *cp;
696
697 for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
698 ct_read_lock_bh(idx);
699 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
700 if (pos-- == 0) {
701 seq->private = &ip_vs_conn_tab[idx];
702 return cp;
703 }
704 }
705 ct_read_unlock_bh(idx);
706 }
707
708 return NULL;
709}
710
711static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
712{
713 seq->private = NULL;
714 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
715}
716
717static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
718{
719 struct ip_vs_conn *cp = v;
720 struct list_head *e, *l = seq->private;
721 int idx;
722
723 ++*pos;
724 if (v == SEQ_START_TOKEN)
725 return ip_vs_conn_array(seq, 0);
726
727 /* more on same hash chain? */
728 if ((e = cp->c_list.next) != l)
729 return list_entry(e, struct ip_vs_conn, c_list);
730
731 idx = l - ip_vs_conn_tab;
732 ct_read_unlock_bh(idx);
733
734 while (++idx < IP_VS_CONN_TAB_SIZE) {
735 ct_read_lock_bh(idx);
736 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
737 seq->private = &ip_vs_conn_tab[idx];
738 return cp;
739 }
740 ct_read_unlock_bh(idx);
741 }
742 seq->private = NULL;
743 return NULL;
744}
745
746static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
747{
748 struct list_head *l = seq->private;
749
750 if (l)
751 ct_read_unlock_bh(l - ip_vs_conn_tab);
752}
753
754static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
755{
756
757 if (v == SEQ_START_TOKEN)
758 seq_puts(seq,
759 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n");
760 else {
761 const struct ip_vs_conn *cp = v;
762
763 seq_printf(seq,
764 "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
765 ip_vs_proto_name(cp->protocol),
766 ntohl(cp->caddr), ntohs(cp->cport),
767 ntohl(cp->vaddr), ntohs(cp->vport),
768 ntohl(cp->daddr), ntohs(cp->dport),
769 ip_vs_state_name(cp->protocol, cp->state),
770 (cp->timer.expires-jiffies)/HZ);
771 }
772 return 0;
773}
774
775static const struct seq_operations ip_vs_conn_seq_ops = {
776 .start = ip_vs_conn_seq_start,
777 .next = ip_vs_conn_seq_next,
778 .stop = ip_vs_conn_seq_stop,
779 .show = ip_vs_conn_seq_show,
780};
781
782static int ip_vs_conn_open(struct inode *inode, struct file *file)
783{
784 return seq_open(file, &ip_vs_conn_seq_ops);
785}
786
787static const struct file_operations ip_vs_conn_fops = {
788 .owner = THIS_MODULE,
789 .open = ip_vs_conn_open,
790 .read = seq_read,
791 .llseek = seq_lseek,
792 .release = seq_release,
793};
794
795static const char *ip_vs_origin_name(unsigned flags)
796{
797 if (flags & IP_VS_CONN_F_SYNC)
798 return "SYNC";
799 else
800 return "LOCAL";
801}
802
803static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
804{
805
806 if (v == SEQ_START_TOKEN)
807 seq_puts(seq,
808 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
809 else {
810 const struct ip_vs_conn *cp = v;
811
812 seq_printf(seq,
813 "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n",
814 ip_vs_proto_name(cp->protocol),
815 ntohl(cp->caddr), ntohs(cp->cport),
816 ntohl(cp->vaddr), ntohs(cp->vport),
817 ntohl(cp->daddr), ntohs(cp->dport),
818 ip_vs_state_name(cp->protocol, cp->state),
819 ip_vs_origin_name(cp->flags),
820 (cp->timer.expires-jiffies)/HZ);
821 }
822 return 0;
823}
824
825static const struct seq_operations ip_vs_conn_sync_seq_ops = {
826 .start = ip_vs_conn_seq_start,
827 .next = ip_vs_conn_seq_next,
828 .stop = ip_vs_conn_seq_stop,
829 .show = ip_vs_conn_sync_seq_show,
830};
831
832static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
833{
834 return seq_open(file, &ip_vs_conn_sync_seq_ops);
835}
836
837static const struct file_operations ip_vs_conn_sync_fops = {
838 .owner = THIS_MODULE,
839 .open = ip_vs_conn_sync_open,
840 .read = seq_read,
841 .llseek = seq_lseek,
842 .release = seq_release,
843};
844
845#endif
846
847
848/*
849 * Randomly drop connection entries before running out of memory
850 */
851static inline int todrop_entry(struct ip_vs_conn *cp)
852{
853 /*
854 * The drop rate array needs tuning for real environments.
855 * Called from timer bh only => no locking
856 */
857 static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
858 static char todrop_counter[9] = {0};
859 int i;
860
861 /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
862	   This will leave enough time for normal connections to get
863 through. */
864 if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
865 return 0;
866
867 /* Don't drop the entry if its number of incoming packets is not
868	   in the range [0, 8] */
869 i = atomic_read(&cp->in_pkts);
870 if (i > 8 || i < 0) return 0;
871
872 if (!todrop_rate[i]) return 0;
873 if (--todrop_counter[i] > 0) return 0;
874
875 todrop_counter[i] = todrop_rate[i];
876 return 1;
877}
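To make the drop-rate table above concrete: with todrop_rate[i] = i for 1 <= i <= 8, an ESTABLISHED entry that has seen i packets is dropped roughly once out of every i candidates, so connections with little traffic are shed more aggressively under memory pressure. The standalone snippet below (plain userspace C, same table and counter logic, not kernel code) replays that behaviour.

/* Userspace replay of the todrop_entry() rate/counter logic. */
#include <stdio.h>

static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
static char todrop_counter[9];

static int todrop(int in_pkts)
{
	int i = in_pkts;

	if (i > 8 || i < 0)
		return 0;
	if (!todrop_rate[i])
		return 0;
	if (--todrop_counter[i] > 0)
		return 0;

	todrop_counter[i] = todrop_rate[i];
	return 1;
}

int main(void)
{
	int n, dropped = 0;

	/* 100 candidate entries that have each seen 4 packets */
	for (n = 0; n < 100; n++)
		dropped += todrop(4);

	printf("dropped %d of 100 candidates\n", dropped);	/* prints 25 */
	return 0;
}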
878
879/* Called from keventd and must protect itself from softirqs */
880void ip_vs_random_dropentry(void)
881{
882 int idx;
883 struct ip_vs_conn *cp;
884
885 /*
886 * Randomly scan 1/32 of the whole table every second
887 */
888 for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
889 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
890
891 /*
892 * Lock is actually needed in this loop.
893 */
894 ct_write_lock_bh(hash);
895
896 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
897 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
898 /* connection template */
899 continue;
900
901 if (cp->protocol == IPPROTO_TCP) {
902 switch(cp->state) {
903 case IP_VS_TCP_S_SYN_RECV:
904 case IP_VS_TCP_S_SYNACK:
905 break;
906
907 case IP_VS_TCP_S_ESTABLISHED:
908 if (todrop_entry(cp))
909 break;
910 continue;
911
912 default:
913 continue;
914 }
915 } else {
916 if (!todrop_entry(cp))
917 continue;
918 }
919
920 IP_VS_DBG(4, "del connection\n");
921 ip_vs_conn_expire_now(cp);
922 if (cp->control) {
923 IP_VS_DBG(4, "del conn template\n");
924 ip_vs_conn_expire_now(cp->control);
925 }
926 }
927 ct_write_unlock_bh(hash);
928 }
929}
930
931
932/*
933 * Flush all the connection entries in the ip_vs_conn_tab
934 */
935static void ip_vs_conn_flush(void)
936{
937 int idx;
938 struct ip_vs_conn *cp;
939
940 flush_again:
941 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
942 /*
943 * Lock is actually needed in this loop.
944 */
945 ct_write_lock_bh(idx);
946
947 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
948
949 IP_VS_DBG(4, "del connection\n");
950 ip_vs_conn_expire_now(cp);
951 if (cp->control) {
952 IP_VS_DBG(4, "del conn template\n");
953 ip_vs_conn_expire_now(cp->control);
954 }
955 }
956 ct_write_unlock_bh(idx);
957 }
958
959	/* the counter may not be zero, because some conn entries may still be
960	   handled by the slow timer handler or be unhashed but still referenced */
961 if (atomic_read(&ip_vs_conn_count) != 0) {
962 schedule();
963 goto flush_again;
964 }
965}
966
967
968int __init ip_vs_conn_init(void)
969{
970 int idx;
971
972 /*
973 * Allocate the connection hash table and initialize its list heads
974 */
975 ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
976 if (!ip_vs_conn_tab)
977 return -ENOMEM;
978
979 /* Allocate ip_vs_conn slab cache */
980 ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
981 sizeof(struct ip_vs_conn), 0,
982 SLAB_HWCACHE_ALIGN, NULL);
983 if (!ip_vs_conn_cachep) {
984 vfree(ip_vs_conn_tab);
985 return -ENOMEM;
986 }
987
988 IP_VS_INFO("Connection hash table configured "
989 "(size=%d, memory=%ldKbytes)\n",
990 IP_VS_CONN_TAB_SIZE,
991 (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
992 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
993 sizeof(struct ip_vs_conn));
994
995 for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
996 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
997 }
998
999 for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
1000 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
1001 }
1002
1003 proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops);
1004 proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
1005
1006 /* calculate the random value for connection hash */
1007 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
1008
1009 return 0;
1010}
1011
1012
1013void ip_vs_conn_cleanup(void)
1014{
1015 /* flush all the connection entries first */
1016 ip_vs_conn_flush();
1017
1018 /* Release the empty cache */
1019 kmem_cache_destroy(ip_vs_conn_cachep);
1020 proc_net_remove(&init_net, "ip_vs_conn");
1021 proc_net_remove(&init_net, "ip_vs_conn_sync");
1022 vfree(ip_vs_conn_tab);
1023}
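A quick sanity check of the sizes logged by ip_vs_conn_init() above: IP_VS_CONN_TAB_SIZE is 1 << CONFIG_IP_VS_TAB_BITS, and assuming the default of 12 bits that is 4096 buckets. On a 64-bit machine a struct list_head is two pointers, 16 bytes, so the bucket array alone is 4096 * 16 = 65536 bytes (64 Kbytes), matching the "memory=...Kbytes" message; each tracked connection then adds one ip_vs_conn object from the slab cache on top of that.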
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
deleted file mode 100644
index a7879eafc3b5..000000000000
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ /dev/null
@@ -1,1125 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19 * and others.
20 *
21 * Changes:
22 * Paul `Rusty' Russell properly handle non-linear skbs
23 * Harald Welte don't use nfcache
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/kernel.h>
29#include <linux/ip.h>
30#include <linux/tcp.h>
31#include <linux/icmp.h>
32
33#include <net/ip.h>
34#include <net/tcp.h>
35#include <net/udp.h>
36#include <net/icmp.h> /* for icmp_send */
37#include <net/route.h>
38
39#include <linux/netfilter.h>
40#include <linux/netfilter_ipv4.h>
41
42#include <net/ip_vs.h>
43
44
45EXPORT_SYMBOL(register_ip_vs_scheduler);
46EXPORT_SYMBOL(unregister_ip_vs_scheduler);
47EXPORT_SYMBOL(ip_vs_skb_replace);
48EXPORT_SYMBOL(ip_vs_proto_name);
49EXPORT_SYMBOL(ip_vs_conn_new);
50EXPORT_SYMBOL(ip_vs_conn_in_get);
51EXPORT_SYMBOL(ip_vs_conn_out_get);
52#ifdef CONFIG_IP_VS_PROTO_TCP
53EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
54#endif
55EXPORT_SYMBOL(ip_vs_conn_put);
56#ifdef CONFIG_IP_VS_DEBUG
57EXPORT_SYMBOL(ip_vs_get_debug_level);
58#endif
59
60
61/* ID used in ICMP lookups */
62#define icmp_id(icmph) (((icmph)->un).echo.id)
63
64const char *ip_vs_proto_name(unsigned proto)
65{
66 static char buf[20];
67
68 switch (proto) {
69 case IPPROTO_IP:
70 return "IP";
71 case IPPROTO_UDP:
72 return "UDP";
73 case IPPROTO_TCP:
74 return "TCP";
75 case IPPROTO_ICMP:
76 return "ICMP";
77 default:
78 sprintf(buf, "IP_%d", proto);
79 return buf;
80 }
81}
82
83void ip_vs_init_hash_table(struct list_head *table, int rows)
84{
85 while (--rows >= 0)
86 INIT_LIST_HEAD(&table[rows]);
87}
88
89static inline void
90ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
91{
92 struct ip_vs_dest *dest = cp->dest;
93 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
94 spin_lock(&dest->stats.lock);
95 dest->stats.inpkts++;
96 dest->stats.inbytes += skb->len;
97 spin_unlock(&dest->stats.lock);
98
99 spin_lock(&dest->svc->stats.lock);
100 dest->svc->stats.inpkts++;
101 dest->svc->stats.inbytes += skb->len;
102 spin_unlock(&dest->svc->stats.lock);
103
104 spin_lock(&ip_vs_stats.lock);
105 ip_vs_stats.inpkts++;
106 ip_vs_stats.inbytes += skb->len;
107 spin_unlock(&ip_vs_stats.lock);
108 }
109}
110
111
112static inline void
113ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
114{
115 struct ip_vs_dest *dest = cp->dest;
116 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
117 spin_lock(&dest->stats.lock);
118 dest->stats.outpkts++;
119 dest->stats.outbytes += skb->len;
120 spin_unlock(&dest->stats.lock);
121
122 spin_lock(&dest->svc->stats.lock);
123 dest->svc->stats.outpkts++;
124 dest->svc->stats.outbytes += skb->len;
125 spin_unlock(&dest->svc->stats.lock);
126
127 spin_lock(&ip_vs_stats.lock);
128 ip_vs_stats.outpkts++;
129 ip_vs_stats.outbytes += skb->len;
130 spin_unlock(&ip_vs_stats.lock);
131 }
132}
133
134
135static inline void
136ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
137{
138 spin_lock(&cp->dest->stats.lock);
139 cp->dest->stats.conns++;
140 spin_unlock(&cp->dest->stats.lock);
141
142 spin_lock(&svc->stats.lock);
143 svc->stats.conns++;
144 spin_unlock(&svc->stats.lock);
145
146 spin_lock(&ip_vs_stats.lock);
147 ip_vs_stats.conns++;
148 spin_unlock(&ip_vs_stats.lock);
149}
150
151
152static inline int
153ip_vs_set_state(struct ip_vs_conn *cp, int direction,
154 const struct sk_buff *skb,
155 struct ip_vs_protocol *pp)
156{
157 if (unlikely(!pp->state_transition))
158 return 0;
159 return pp->state_transition(cp, direction, skb, pp);
160}
161
162
163/*
164 * IPVS persistent scheduling function
165 * It creates a connection entry according to its template if exists,
166 * or selects a server and creates a connection entry plus a template.
167 * Locking: we are svc user (svc->refcnt), so we hold all dests too
168 * Protocols supported: TCP, UDP
169 */
170static struct ip_vs_conn *
171ip_vs_sched_persist(struct ip_vs_service *svc,
172 const struct sk_buff *skb,
173 __be16 ports[2])
174{
175 struct ip_vs_conn *cp = NULL;
176 struct iphdr *iph = ip_hdr(skb);
177 struct ip_vs_dest *dest;
178 struct ip_vs_conn *ct;
179 __be16 dport; /* destination port to forward */
180 __be32 snet; /* source network of the client, after masking */
181
182 /* Mask saddr with the netmask to adjust template granularity */
183 snet = iph->saddr & svc->netmask;
184
185 IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
186 "mnet %u.%u.%u.%u\n",
187 NIPQUAD(iph->saddr), ntohs(ports[0]),
188 NIPQUAD(iph->daddr), ntohs(ports[1]),
189 NIPQUAD(snet));
190
191 /*
192	 * FTP is a complicated protocol: it uses a control connection and
193	 * separate data connections. For active FTP, the FTP server initiates
194	 * the data connection to the client, usually from source port 20.
195	 * For passive FTP, the server tells the client which port it is
196	 * passively listening on, and the client opens the data connection.
197	 * In tunneling or direct routing mode, the load balancer only sees
198	 * the client-to-server half of the connection, so the data port is
199	 * unknown to it. Therefore, a conn template like
200	 * <caddr, 0, vaddr, 0, daddr, 0> is created for a persistent FTP
201	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
202	 * is created for other persistent services.
203 */
204 if (ports[1] == svc->port) {
205 /* Check if a template already exists */
206 if (svc->port != FTPPORT)
207 ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
208 iph->daddr, ports[1]);
209 else
210 ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
211 iph->daddr, 0);
212
213 if (!ct || !ip_vs_check_template(ct)) {
214 /*
215 * No template found or the dest of the connection
216 * template is not available.
217 */
218 dest = svc->scheduler->schedule(svc, skb);
219 if (dest == NULL) {
220 IP_VS_DBG(1, "p-schedule: no dest found.\n");
221 return NULL;
222 }
223
224 /*
225 * Create a template like <protocol,caddr,0,
226 * vaddr,vport,daddr,dport> for non-ftp service,
227 * and <protocol,caddr,0,vaddr,0,daddr,0>
228 * for ftp service.
229 */
230 if (svc->port != FTPPORT)
231 ct = ip_vs_conn_new(iph->protocol,
232 snet, 0,
233 iph->daddr,
234 ports[1],
235 dest->addr, dest->port,
236 IP_VS_CONN_F_TEMPLATE,
237 dest);
238 else
239 ct = ip_vs_conn_new(iph->protocol,
240 snet, 0,
241 iph->daddr, 0,
242 dest->addr, 0,
243 IP_VS_CONN_F_TEMPLATE,
244 dest);
245 if (ct == NULL)
246 return NULL;
247
248 ct->timeout = svc->timeout;
249 } else {
250 /* set destination with the found template */
251 dest = ct->dest;
252 }
253 dport = dest->port;
254 } else {
255 /*
256 * Note: persistent fwmark-based services and persistent
257 * port zero service are handled here.
258 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
259 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
260 */
261 if (svc->fwmark)
262 ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0,
263 htonl(svc->fwmark), 0);
264 else
265 ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
266 iph->daddr, 0);
267
268 if (!ct || !ip_vs_check_template(ct)) {
269 /*
270 * If it is not persistent port zero, return NULL,
271 * otherwise create a connection template.
272 */
273 if (svc->port)
274 return NULL;
275
276 dest = svc->scheduler->schedule(svc, skb);
277 if (dest == NULL) {
278 IP_VS_DBG(1, "p-schedule: no dest found.\n");
279 return NULL;
280 }
281
282 /*
283 * Create a template according to the service
284 */
285 if (svc->fwmark)
286 ct = ip_vs_conn_new(IPPROTO_IP,
287 snet, 0,
288 htonl(svc->fwmark), 0,
289 dest->addr, 0,
290 IP_VS_CONN_F_TEMPLATE,
291 dest);
292 else
293 ct = ip_vs_conn_new(iph->protocol,
294 snet, 0,
295 iph->daddr, 0,
296 dest->addr, 0,
297 IP_VS_CONN_F_TEMPLATE,
298 dest);
299 if (ct == NULL)
300 return NULL;
301
302 ct->timeout = svc->timeout;
303 } else {
304 /* set destination with the found template */
305 dest = ct->dest;
306 }
307 dport = ports[1];
308 }
309
310 /*
311 * Create a new connection according to the template
312 */
313 cp = ip_vs_conn_new(iph->protocol,
314 iph->saddr, ports[0],
315 iph->daddr, ports[1],
316 dest->addr, dport,
317 0,
318 dest);
319 if (cp == NULL) {
320 ip_vs_conn_put(ct);
321 return NULL;
322 }
323
324 /*
325 * Add its control
326 */
327 ip_vs_control_add(cp, ct);
328 ip_vs_conn_put(ct);
329
330 ip_vs_conn_stats(cp, svc);
331 return cp;
332}
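As a usage note for the persistence path above: the persistence timeout and the template netmask that produces snet are normally configured from userspace with ipvsadm; the flags below are recalled from the ipvsadm man page rather than taken from this file, for example

    ipvsadm -A -t 192.0.2.10:21 -s rr -p 900 -M 255.255.255.0

which would create a persistent FTP virtual service whose templates are keyed on the client's /24 network and kept for 900 seconds (the svc->timeout copied into ct->timeout above).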
333
334
335/*
336 * IPVS main scheduling function
337 * It selects a server according to the virtual service, and
338 * creates a connection entry.
339 * Protocols supported: TCP, UDP
340 */
341struct ip_vs_conn *
342ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
343{
344 struct ip_vs_conn *cp = NULL;
345 struct iphdr *iph = ip_hdr(skb);
346 struct ip_vs_dest *dest;
347 __be16 _ports[2], *pptr;
348
349 pptr = skb_header_pointer(skb, iph->ihl*4,
350 sizeof(_ports), _ports);
351 if (pptr == NULL)
352 return NULL;
353
354 /*
355 * Persistent service
356 */
357 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
358 return ip_vs_sched_persist(svc, skb, pptr);
359
360 /*
361 * Non-persistent service
362 */
363 if (!svc->fwmark && pptr[1] != svc->port) {
364 if (!svc->port)
365 IP_VS_ERR("Schedule: port zero only supported "
366 "in persistent services, "
367 "check your ipvs configuration\n");
368 return NULL;
369 }
370
371 dest = svc->scheduler->schedule(svc, skb);
372 if (dest == NULL) {
373 IP_VS_DBG(1, "Schedule: no dest found.\n");
374 return NULL;
375 }
376
377 /*
378 * Create a connection entry.
379 */
380 cp = ip_vs_conn_new(iph->protocol,
381 iph->saddr, pptr[0],
382 iph->daddr, pptr[1],
383 dest->addr, dest->port?dest->port:pptr[1],
384 0,
385 dest);
386 if (cp == NULL)
387 return NULL;
388
389 IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
390 "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
391 ip_vs_fwd_tag(cp),
392 NIPQUAD(cp->caddr), ntohs(cp->cport),
393 NIPQUAD(cp->vaddr), ntohs(cp->vport),
394 NIPQUAD(cp->daddr), ntohs(cp->dport),
395 cp->flags, atomic_read(&cp->refcnt));
396
397 ip_vs_conn_stats(cp, svc);
398 return cp;
399}
400
401
402/*
403 * Pass or drop the packet.
404 * Called by ip_vs_in, when the virtual service is available but
405 * no destination is available for a new connection.
406 */
407int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
408 struct ip_vs_protocol *pp)
409{
410 __be16 _ports[2], *pptr;
411 struct iphdr *iph = ip_hdr(skb);
412
413 pptr = skb_header_pointer(skb, iph->ihl*4,
414 sizeof(_ports), _ports);
415 if (pptr == NULL) {
416 ip_vs_service_put(svc);
417 return NF_DROP;
418 }
419
420	/* if this is a fwmark-based service, the cache_bypass sysctl is on
421 and the destination is RTN_UNICAST (and not local), then create
422 a cache_bypass connection entry */
423 if (sysctl_ip_vs_cache_bypass && svc->fwmark
424 && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) {
425 int ret, cs;
426 struct ip_vs_conn *cp;
427
428 ip_vs_service_put(svc);
429
430 /* create a new connection entry */
431 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
432 cp = ip_vs_conn_new(iph->protocol,
433 iph->saddr, pptr[0],
434 iph->daddr, pptr[1],
435 0, 0,
436 IP_VS_CONN_F_BYPASS,
437 NULL);
438 if (cp == NULL)
439 return NF_DROP;
440
441 /* statistics */
442 ip_vs_in_stats(cp, skb);
443
444 /* set state */
445 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
446
447 /* transmit the first SYN packet */
448 ret = cp->packet_xmit(skb, cp, pp);
449 /* do not touch skb anymore */
450
451 atomic_inc(&cp->in_pkts);
452 ip_vs_conn_put(cp);
453 return ret;
454 }
455
456 /*
457	 * When a virtual ftp service is present, packets destined for
458	 * other services on the VIP (except services listed in the ipvs
459	 * table) may get here. Pass them along, because it is not the
460	 * job of ipvs to decide to drop them.
461 */
462 if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
463 ip_vs_service_put(svc);
464 return NF_ACCEPT;
465 }
466
467 ip_vs_service_put(svc);
468
469 /*
470 * Notify the client that the destination is unreachable, and
471 * release the socket buffer.
472	 * Since we are at the IP layer, the TCP socket does not actually
473	 * exist and a TCP RST packet cannot be sent; instead,
474	 * ICMP_PORT_UNREACH is sent here whether it is TCP or UDP. --WZ
475 */
476 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
477 return NF_DROP;
478}
479
480
481/*
482 * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
483 * chain, and is used for VS/NAT.
484 * It detects packets for VS/NAT connections and sends the packets
485 * immediately. This avoids having iptable_nat mangle packets that
486 * belong to VS/NAT connections.
487 */
488static unsigned int ip_vs_post_routing(unsigned int hooknum,
489 struct sk_buff *skb,
490 const struct net_device *in,
491 const struct net_device *out,
492 int (*okfn)(struct sk_buff *))
493{
494 if (!skb->ipvs_property)
495 return NF_ACCEPT;
496 /* The packet was sent from IPVS, exit this chain */
497 return NF_STOP;
498}
499
500__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
501{
502 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
503}
504
505static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
506{
507 int err = ip_defrag(skb, user);
508
509 if (!err)
510 ip_send_check(ip_hdr(skb));
511
512 return err;
513}
514
515/*
516 * Packet has been made sufficiently writable in caller
517 * - inout: 1=in->out, 0=out->in
518 */
519void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
520 struct ip_vs_conn *cp, int inout)
521{
522 struct iphdr *iph = ip_hdr(skb);
523 unsigned int icmp_offset = iph->ihl*4;
524 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
525 icmp_offset);
526 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
527
528 if (inout) {
529 iph->saddr = cp->vaddr;
530 ip_send_check(iph);
531 ciph->daddr = cp->vaddr;
532 ip_send_check(ciph);
533 } else {
534 iph->daddr = cp->daddr;
535 ip_send_check(iph);
536 ciph->saddr = cp->daddr;
537 ip_send_check(ciph);
538 }
539
540 /* the TCP/UDP port */
541 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
542 __be16 *ports = (void *)ciph + ciph->ihl*4;
543
544 if (inout)
545 ports[1] = cp->vport;
546 else
547 ports[0] = cp->dport;
548 }
549
550 /* And finally the ICMP checksum */
551 icmph->checksum = 0;
552 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
553 skb->ip_summed = CHECKSUM_UNNECESSARY;
554
555 if (inout)
556 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
557 "Forwarding altered outgoing ICMP");
558 else
559 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
560 "Forwarding altered incoming ICMP");
561}
562
563/*
564 * Handle ICMP messages in the inside-to-outside direction (outgoing).
565 * Find any that might be relevant, check against existing connections,
566 * forward to the right destination host if relevant.
567 * Currently handles error types - unreachable, quench, ttl exceeded.
568 * (Only used in VS/NAT)
569 */
570static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
571{
572 struct iphdr *iph;
573 struct icmphdr _icmph, *ic;
574 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
575 struct ip_vs_conn *cp;
576 struct ip_vs_protocol *pp;
577 unsigned int offset, ihl, verdict;
578
579 *related = 1;
580
581 /* reassemble IP fragments */
582 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
583 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
584 return NF_STOLEN;
585 }
586
587 iph = ip_hdr(skb);
588 offset = ihl = iph->ihl * 4;
589 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
590 if (ic == NULL)
591 return NF_DROP;
592
593 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
594 ic->type, ntohs(icmp_id(ic)),
595 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
596
597 /*
598 * Work through seeing if this is for us.
599 * These checks are supposed to be in an order that means easy
600 * things are checked first to speed up processing.... however
601 * this means that some packets will manage to get a long way
602 * down this stack and then be rejected, but that's life.
603 */
604 if ((ic->type != ICMP_DEST_UNREACH) &&
605 (ic->type != ICMP_SOURCE_QUENCH) &&
606 (ic->type != ICMP_TIME_EXCEEDED)) {
607 *related = 0;
608 return NF_ACCEPT;
609 }
610
611 /* Now find the contained IP header */
612 offset += sizeof(_icmph);
613 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
614 if (cih == NULL)
615 return NF_ACCEPT; /* The packet looks wrong, ignore */
616
617 pp = ip_vs_proto_get(cih->protocol);
618 if (!pp)
619 return NF_ACCEPT;
620
621 /* Is the embedded protocol header present? */
622 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
623 pp->dont_defrag))
624 return NF_ACCEPT;
625
626 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
627
628 offset += cih->ihl * 4;
629
630 /* The embedded headers contain source and dest in reverse order */
631 cp = pp->conn_out_get(skb, pp, cih, offset, 1);
632 if (!cp)
633 return NF_ACCEPT;
634
635 verdict = NF_DROP;
636
637 if (IP_VS_FWD_METHOD(cp) != 0) {
638 IP_VS_ERR("shouldn't reach here, because the box is on the "
639 "half connection in the tun/dr module.\n");
640 }
641
642 /* Ensure the checksum is correct */
643 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
644 /* Failed checksum! */
645 IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
646 NIPQUAD(iph->saddr));
647 goto out;
648 }
649
650 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
651 offset += 2 * sizeof(__u16);
652 if (!skb_make_writable(skb, offset))
653 goto out;
654
655 ip_vs_nat_icmp(skb, pp, cp, 1);
656
657 /* do the statistics and put it back */
658 ip_vs_out_stats(cp, skb);
659
660 skb->ipvs_property = 1;
661 verdict = NF_ACCEPT;
662
663 out:
664 __ip_vs_conn_put(cp);
665
666 return verdict;
667}
668
669static inline int is_tcp_reset(const struct sk_buff *skb)
670{
671 struct tcphdr _tcph, *th;
672
673 th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
674 if (th == NULL)
675 return 0;
676 return th->rst;
677}
678
679/*
680 * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
681 * Check if outgoing packet belongs to the established ip_vs_conn,
682 * rewrite addresses of the packet and send it on its way...
683 */
684static unsigned int
685ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
686 const struct net_device *in, const struct net_device *out,
687 int (*okfn)(struct sk_buff *))
688{
689 struct iphdr *iph;
690 struct ip_vs_protocol *pp;
691 struct ip_vs_conn *cp;
692 int ihl;
693
694 EnterFunction(11);
695
696 if (skb->ipvs_property)
697 return NF_ACCEPT;
698
699 iph = ip_hdr(skb);
700 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
701 int related, verdict = ip_vs_out_icmp(skb, &related);
702
703 if (related)
704 return verdict;
705 iph = ip_hdr(skb);
706 }
707
708 pp = ip_vs_proto_get(iph->protocol);
709 if (unlikely(!pp))
710 return NF_ACCEPT;
711
712 /* reassemble IP fragments */
713 if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) &&
714 !pp->dont_defrag)) {
715 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
716 return NF_STOLEN;
717 iph = ip_hdr(skb);
718 }
719
720 ihl = iph->ihl << 2;
721
722 /*
723 * Check if the packet belongs to an existing entry
724 */
725 cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
726
727 if (unlikely(!cp)) {
728 if (sysctl_ip_vs_nat_icmp_send &&
729 (pp->protocol == IPPROTO_TCP ||
730 pp->protocol == IPPROTO_UDP)) {
731 __be16 _ports[2], *pptr;
732
733 pptr = skb_header_pointer(skb, ihl,
734 sizeof(_ports), _ports);
735 if (pptr == NULL)
736 return NF_ACCEPT; /* Not for me */
737 if (ip_vs_lookup_real_service(iph->protocol,
738 iph->saddr, pptr[0])) {
739 /*
 740				 * Notify the real server that there is no
 741				 * existing entry, unless the packet is a
 742				 * TCP RST.
743 */
744 if (iph->protocol != IPPROTO_TCP
745 || !is_tcp_reset(skb)) {
746 icmp_send(skb,ICMP_DEST_UNREACH,
747 ICMP_PORT_UNREACH, 0);
748 return NF_DROP;
749 }
750 }
751 }
752 IP_VS_DBG_PKT(12, pp, skb, 0,
753 "packet continues traversal as normal");
754 return NF_ACCEPT;
755 }
756
757 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
758
759 if (!skb_make_writable(skb, ihl))
760 goto drop;
761
762 /* mangle the packet */
763 if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
764 goto drop;
765 ip_hdr(skb)->saddr = cp->vaddr;
766 ip_send_check(ip_hdr(skb));
767
768 /* For policy routing, packets originating from this
769 * machine itself may be routed differently to packets
770 * passing through. We want this packet to be routed as
771 * if it came from this machine itself. So re-compute
772 * the routing information.
773 */
774 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
775 goto drop;
776
777 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
778
779 ip_vs_out_stats(cp, skb);
780 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
781 ip_vs_conn_put(cp);
782
783 skb->ipvs_property = 1;
784
785 LeaveFunction(11);
786 return NF_ACCEPT;
787
788 drop:
789 ip_vs_conn_put(cp);
790 kfree_skb(skb);
791 return NF_STOLEN;
792}
793
794
795/*
796 * Handle ICMP messages in the outside-to-inside direction (incoming).
797 * Find any that might be relevant, check against existing connections,
798 * forward to the right destination host if relevant.
799 * Currently handles error types - unreachable, quench, ttl exceeded.
800 */
801static int
802ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
803{
804 struct iphdr *iph;
805 struct icmphdr _icmph, *ic;
806 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
807 struct ip_vs_conn *cp;
808 struct ip_vs_protocol *pp;
809 unsigned int offset, ihl, verdict;
810
811 *related = 1;
812
813 /* reassemble IP fragments */
814 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
815 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
816 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
817 return NF_STOLEN;
818 }
819
820 iph = ip_hdr(skb);
821 offset = ihl = iph->ihl * 4;
822 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
823 if (ic == NULL)
824 return NF_DROP;
825
826 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
827 ic->type, ntohs(icmp_id(ic)),
828 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
829
830 /*
831 * Work through seeing if this is for us.
832 * These checks are supposed to be in an order that means easy
 833	 * things are checked first to speed up processing.  However,
834 * this means that some packets will manage to get a long way
835 * down this stack and then be rejected, but that's life.
836 */
837 if ((ic->type != ICMP_DEST_UNREACH) &&
838 (ic->type != ICMP_SOURCE_QUENCH) &&
839 (ic->type != ICMP_TIME_EXCEEDED)) {
840 *related = 0;
841 return NF_ACCEPT;
842 }
843
844 /* Now find the contained IP header */
845 offset += sizeof(_icmph);
846 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
847 if (cih == NULL)
848 return NF_ACCEPT; /* The packet looks wrong, ignore */
849
850 pp = ip_vs_proto_get(cih->protocol);
851 if (!pp)
852 return NF_ACCEPT;
853
854 /* Is the embedded protocol header present? */
855 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
856 pp->dont_defrag))
857 return NF_ACCEPT;
858
859 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
860
861 offset += cih->ihl * 4;
862
863 /* The embedded headers contain source and dest in reverse order */
864 cp = pp->conn_in_get(skb, pp, cih, offset, 1);
865 if (!cp)
866 return NF_ACCEPT;
867
868 verdict = NF_DROP;
869
870 /* Ensure the checksum is correct */
871 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
872 /* Failed checksum! */
873 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
874 NIPQUAD(iph->saddr));
875 goto out;
876 }
877
878 /* do the statistics and put it back */
879 ip_vs_in_stats(cp, skb);
880 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
881 offset += 2 * sizeof(__u16);
882 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
883 /* do not touch skb anymore */
884
885 out:
886 __ip_vs_conn_put(cp);
887
888 return verdict;
889}
890
891/*
892 * Check if it's for virtual services, look it up,
893 * and send it on its way...
894 */
895static unsigned int
896ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
897 const struct net_device *in, const struct net_device *out,
898 int (*okfn)(struct sk_buff *))
899{
900 struct iphdr *iph;
901 struct ip_vs_protocol *pp;
902 struct ip_vs_conn *cp;
903 int ret, restart;
904 int ihl;
905
906 /*
907 * Big tappo: only PACKET_HOST (neither loopback nor mcasts)
908 * ... don't know why 1st test DOES NOT include 2nd (?)
909 */
910 if (unlikely(skb->pkt_type != PACKET_HOST
911 || skb->dev->flags & IFF_LOOPBACK || skb->sk)) {
912 IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
913 skb->pkt_type,
914 ip_hdr(skb)->protocol,
915 NIPQUAD(ip_hdr(skb)->daddr));
916 return NF_ACCEPT;
917 }
918
919 iph = ip_hdr(skb);
920 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
921 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
922
923 if (related)
924 return verdict;
925 iph = ip_hdr(skb);
926 }
927
928 /* Protocol supported? */
929 pp = ip_vs_proto_get(iph->protocol);
930 if (unlikely(!pp))
931 return NF_ACCEPT;
932
933 ihl = iph->ihl << 2;
934
935 /*
936 * Check if the packet belongs to an existing connection entry
937 */
938 cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
939
940 if (unlikely(!cp)) {
941 int v;
942
943 if (!pp->conn_schedule(skb, pp, &v, &cp))
944 return v;
945 }
946
947 if (unlikely(!cp)) {
948 /* sorry, all this trouble for a no-hit :) */
949 IP_VS_DBG_PKT(12, pp, skb, 0,
950 "packet continues traversal as normal");
951 return NF_ACCEPT;
952 }
953
954 IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
955
956 /* Check the server status */
957 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
958 /* the destination server is not available */
959
960 if (sysctl_ip_vs_expire_nodest_conn) {
961 /* try to expire the connection immediately */
962 ip_vs_conn_expire_now(cp);
963 }
964 /* don't restart its timer, and silently
965 drop the packet. */
966 __ip_vs_conn_put(cp);
967 return NF_DROP;
968 }
969
970 ip_vs_in_stats(cp, skb);
971 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
972 if (cp->packet_xmit)
973 ret = cp->packet_xmit(skb, cp, pp);
974 /* do not touch skb anymore */
975 else {
976 IP_VS_DBG_RL("warning: packet_xmit is null");
977 ret = NF_ACCEPT;
978 }
979
 980	/* Increase its packet counter and check whether it needs
 981	 * to be synchronized
 982	 *
 983	 * Sync the connection if it is about to close, to
 984	 * encourage the standby servers to update the connection timeouts
985 */
986 atomic_inc(&cp->in_pkts);
987 if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
988 (((cp->protocol != IPPROTO_TCP ||
989 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
990 (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
991 == sysctl_ip_vs_sync_threshold[0])) ||
992 ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
993 ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
994 (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
995 (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
996 ip_vs_sync_conn(cp);
997 cp->old_state = cp->state;
998
999 ip_vs_conn_put(cp);
1000 return ret;
1001}
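
For illustration only, here is a minimal standalone sketch (userspace C, not kernel code) of the sync-threshold arithmetic used in ip_vs_in() above, assuming the default sysctl_ip_vs_sync_threshold pair of {3, 50}; with those values an established connection is pushed to the backup on its 3rd, 53rd, 103rd, ... packet.

#include <stdio.h>

int main(void)
{
	int sync_threshold[2] = { 3, 50 };	/* assumed default {3, 50} pair */
	int in_pkts;

	/* same modulo test as in ip_vs_in() above: hits at packets 3, 53, 103, ... */
	for (in_pkts = 1; in_pkts <= 120; in_pkts++)
		if (in_pkts % sync_threshold[1] == sync_threshold[0])
			printf("packet %3d: the connection would be synced\n",
			       in_pkts);
	return 0;
}
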
1002
1003
1004/*
1005 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1006 * related packets destined for 0.0.0.0/0.
1007 * When a fwmark-based virtual service is used, such as a transparent
1008 * cache cluster, TCP packets can be marked and routed to ip_vs_in,
1009 * but ICMP destined for 0.0.0.0/0 cannot be easily marked and
1010 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1011 * and send them to ip_vs_in_icmp.
1012 */
1013static unsigned int
1014ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1015 const struct net_device *in, const struct net_device *out,
1016 int (*okfn)(struct sk_buff *))
1017{
1018 int r;
1019
1020 if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1021 return NF_ACCEPT;
1022
1023 return ip_vs_in_icmp(skb, &r, hooknum);
1024}
1025
1026
1027static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1028 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1029 * or VS/NAT(change destination), so that filtering rules can be
1030 * applied to IPVS. */
1031 {
1032 .hook = ip_vs_in,
1033 .owner = THIS_MODULE,
1034 .pf = PF_INET,
1035 .hooknum = NF_INET_LOCAL_IN,
1036 .priority = 100,
1037 },
1038 /* After packet filtering, change source only for VS/NAT */
1039 {
1040 .hook = ip_vs_out,
1041 .owner = THIS_MODULE,
1042 .pf = PF_INET,
1043 .hooknum = NF_INET_FORWARD,
1044 .priority = 100,
1045 },
1046 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1047 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1048 {
1049 .hook = ip_vs_forward_icmp,
1050 .owner = THIS_MODULE,
1051 .pf = PF_INET,
1052 .hooknum = NF_INET_FORWARD,
1053 .priority = 99,
1054 },
1055 /* Before the netfilter connection tracking, exit from POST_ROUTING */
1056 {
1057 .hook = ip_vs_post_routing,
1058 .owner = THIS_MODULE,
1059 .pf = PF_INET,
1060 .hooknum = NF_INET_POST_ROUTING,
1061 .priority = NF_IP_PRI_NAT_SRC-1,
1062 },
1063};
1064
1065
1066/*
1067 * Initialize IP Virtual Server
1068 */
1069static int __init ip_vs_init(void)
1070{
1071 int ret;
1072
1073 ret = ip_vs_control_init();
1074 if (ret < 0) {
1075 IP_VS_ERR("can't setup control.\n");
1076 goto cleanup_nothing;
1077 }
1078
1079 ip_vs_protocol_init();
1080
1081 ret = ip_vs_app_init();
1082 if (ret < 0) {
1083 IP_VS_ERR("can't setup application helper.\n");
1084 goto cleanup_protocol;
1085 }
1086
1087 ret = ip_vs_conn_init();
1088 if (ret < 0) {
1089 IP_VS_ERR("can't setup connection table.\n");
1090 goto cleanup_app;
1091 }
1092
1093 ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1094 if (ret < 0) {
1095 IP_VS_ERR("can't register hooks.\n");
1096 goto cleanup_conn;
1097 }
1098
1099 IP_VS_INFO("ipvs loaded.\n");
1100 return ret;
1101
1102 cleanup_conn:
1103 ip_vs_conn_cleanup();
1104 cleanup_app:
1105 ip_vs_app_cleanup();
1106 cleanup_protocol:
1107 ip_vs_protocol_cleanup();
1108 ip_vs_control_cleanup();
1109 cleanup_nothing:
1110 return ret;
1111}
1112
1113static void __exit ip_vs_cleanup(void)
1114{
1115 nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1116 ip_vs_conn_cleanup();
1117 ip_vs_app_cleanup();
1118 ip_vs_protocol_cleanup();
1119 ip_vs_control_cleanup();
1120 IP_VS_INFO("ipvs unloaded.\n");
1121}
1122
1123module_init(ip_vs_init);
1124module_exit(ip_vs_cleanup);
1125MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
deleted file mode 100644
index 6379705a8dcb..000000000000
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ /dev/null
@@ -1,2373 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 * Julian Anastasov <ja@ssi.bg>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * Changes:
18 *
19 */
20
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/types.h>
24#include <linux/capability.h>
25#include <linux/fs.h>
26#include <linux/sysctl.h>
27#include <linux/proc_fs.h>
28#include <linux/workqueue.h>
29#include <linux/swap.h>
30#include <linux/seq_file.h>
31
32#include <linux/netfilter.h>
33#include <linux/netfilter_ipv4.h>
34#include <linux/mutex.h>
35
36#include <net/net_namespace.h>
37#include <net/ip.h>
38#include <net/route.h>
39#include <net/sock.h>
40
41#include <asm/uaccess.h>
42
43#include <net/ip_vs.h>
44
45/* mutex for IPVS sockopts. And, [gs]etsockopt may sleep. */
46static DEFINE_MUTEX(__ip_vs_mutex);
47
48/* lock for service table */
49static DEFINE_RWLOCK(__ip_vs_svc_lock);
50
51/* lock for table with the real services */
52static DEFINE_RWLOCK(__ip_vs_rs_lock);
53
54/* lock for state and timeout tables */
55static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
56
57/* lock for drop entry handling */
58static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
59
60/* lock for drop packet handling */
61static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
62
63/* 1/rate drop and drop-entry variables */
64int ip_vs_drop_rate = 0;
65int ip_vs_drop_counter = 0;
66static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
67
68/* number of virtual services */
69static int ip_vs_num_services = 0;
70
71/* sysctl variables */
72static int sysctl_ip_vs_drop_entry = 0;
73static int sysctl_ip_vs_drop_packet = 0;
74static int sysctl_ip_vs_secure_tcp = 0;
75static int sysctl_ip_vs_amemthresh = 1024;
76static int sysctl_ip_vs_am_droprate = 10;
77int sysctl_ip_vs_cache_bypass = 0;
78int sysctl_ip_vs_expire_nodest_conn = 0;
79int sysctl_ip_vs_expire_quiescent_template = 0;
80int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
81int sysctl_ip_vs_nat_icmp_send = 0;
82
83
84#ifdef CONFIG_IP_VS_DEBUG
85static int sysctl_ip_vs_debug_level = 0;
86
87int ip_vs_get_debug_level(void)
88{
89 return sysctl_ip_vs_debug_level;
90}
91#endif
92
93/*
94 * update_defense_level is called from keventd and from sysctl,
95 * so it needs to protect itself from softirqs
96 */
97static void update_defense_level(void)
98{
99 struct sysinfo i;
100 static int old_secure_tcp = 0;
101 int availmem;
102 int nomem;
103 int to_change = -1;
104
105 /* we only count free and buffered memory (in pages) */
106 si_meminfo(&i);
107 availmem = i.freeram + i.bufferram;
108 /* however in linux 2.5 the i.bufferram is total page cache size,
109 we need adjust it */
110 /* si_swapinfo(&i); */
111 /* availmem = availmem - (i.totalswap - i.freeswap); */
112
113 nomem = (availmem < sysctl_ip_vs_amemthresh);
114
115 local_bh_disable();
116
117 /* drop_entry */
118 spin_lock(&__ip_vs_dropentry_lock);
119 switch (sysctl_ip_vs_drop_entry) {
120 case 0:
121 atomic_set(&ip_vs_dropentry, 0);
122 break;
123 case 1:
124 if (nomem) {
125 atomic_set(&ip_vs_dropentry, 1);
126 sysctl_ip_vs_drop_entry = 2;
127 } else {
128 atomic_set(&ip_vs_dropentry, 0);
129 }
130 break;
131 case 2:
132 if (nomem) {
133 atomic_set(&ip_vs_dropentry, 1);
134 } else {
135 atomic_set(&ip_vs_dropentry, 0);
136 sysctl_ip_vs_drop_entry = 1;
137		}
138 break;
139 case 3:
140 atomic_set(&ip_vs_dropentry, 1);
141 break;
142 }
143 spin_unlock(&__ip_vs_dropentry_lock);
144
145 /* drop_packet */
146 spin_lock(&__ip_vs_droppacket_lock);
147 switch (sysctl_ip_vs_drop_packet) {
148 case 0:
149 ip_vs_drop_rate = 0;
150 break;
151 case 1:
152 if (nomem) {
153 ip_vs_drop_rate = ip_vs_drop_counter
154 = sysctl_ip_vs_amemthresh /
155 (sysctl_ip_vs_amemthresh-availmem);
156 sysctl_ip_vs_drop_packet = 2;
157 } else {
158 ip_vs_drop_rate = 0;
159 }
160 break;
161 case 2:
162 if (nomem) {
163 ip_vs_drop_rate = ip_vs_drop_counter
164 = sysctl_ip_vs_amemthresh /
165 (sysctl_ip_vs_amemthresh-availmem);
166 } else {
167 ip_vs_drop_rate = 0;
168 sysctl_ip_vs_drop_packet = 1;
169 }
170 break;
171 case 3:
172 ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
173 break;
174 }
175 spin_unlock(&__ip_vs_droppacket_lock);
176
177 /* secure_tcp */
178 write_lock(&__ip_vs_securetcp_lock);
179 switch (sysctl_ip_vs_secure_tcp) {
180 case 0:
181 if (old_secure_tcp >= 2)
182 to_change = 0;
183 break;
184 case 1:
185 if (nomem) {
186 if (old_secure_tcp < 2)
187 to_change = 1;
188 sysctl_ip_vs_secure_tcp = 2;
189 } else {
190 if (old_secure_tcp >= 2)
191 to_change = 0;
192 }
193 break;
194 case 2:
195 if (nomem) {
196 if (old_secure_tcp < 2)
197 to_change = 1;
198 } else {
199 if (old_secure_tcp >= 2)
200 to_change = 0;
201 sysctl_ip_vs_secure_tcp = 1;
202 }
203 break;
204 case 3:
205 if (old_secure_tcp < 2)
206 to_change = 1;
207 break;
208 }
209 old_secure_tcp = sysctl_ip_vs_secure_tcp;
210 if (to_change >= 0)
211 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
212 write_unlock(&__ip_vs_securetcp_lock);
213
214 local_bh_enable();
215}
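
A quick standalone sketch (userspace C, illustrative numbers only, not kernel code) of the drop_packet rate computed above: with the default amemthresh of 1024 pages and 924 pages available, the rate works out to 1024/100 = 10, i.e. roughly one packet in ten is dropped.

#include <stdio.h>

int main(void)
{
	int amemthresh = 1024;				/* default sysctl value, in pages */
	int availmem[] = { 1000, 924, 512, 100 };	/* assumed low-memory situations */
	unsigned int i;

	for (i = 0; i < sizeof(availmem) / sizeof(availmem[0]); i++) {
		/* same formula as the drop_packet cases above */
		int rate = amemthresh / (amemthresh - availmem[i]);

		printf("availmem=%4d pages -> drop roughly 1 packet in %d\n",
		       availmem[i], rate);
	}
	return 0;
}
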
216
217
218/*
219 * Timer for checking the defense
220 */
221#define DEFENSE_TIMER_PERIOD 1*HZ
222static void defense_work_handler(struct work_struct *work);
223static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
224
225static void defense_work_handler(struct work_struct *work)
226{
227 update_defense_level();
228 if (atomic_read(&ip_vs_dropentry))
229 ip_vs_random_dropentry();
230
231 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
232}
233
234int
235ip_vs_use_count_inc(void)
236{
237 return try_module_get(THIS_MODULE);
238}
239
240void
241ip_vs_use_count_dec(void)
242{
243 module_put(THIS_MODULE);
244}
245
246
247/*
248 * Hash table: for virtual service lookups
249 */
250#define IP_VS_SVC_TAB_BITS 8
251#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
252#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
253
254/* the service table hashed by <protocol, addr, port> */
255static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
256/* the service table hashed by fwmark */
257static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
258
259/*
260 * Hash table: for real service lookups
261 */
262#define IP_VS_RTAB_BITS 4
263#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
264#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
265
266static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
267
268/*
269 * Trash for destinations
270 */
271static LIST_HEAD(ip_vs_dest_trash);
272
273/*
274 * FTP & NULL virtual service counters
275 */
276static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
277static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
278
279
280/*
281 * Returns hash value for virtual service
282 */
283static __inline__ unsigned
284ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port)
285{
286 register unsigned porth = ntohs(port);
287
288 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
289 & IP_VS_SVC_TAB_MASK;
290}
291
292/*
293 * Returns hash value of fwmark for virtual service lookup
294 */
295static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
296{
297 return fwmark & IP_VS_SVC_TAB_MASK;
298}
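
For reference, a standalone sketch (userspace C) of the <protocol, addr, port> folding performed by ip_vs_svc_hashkey() above; the service address and port used here are only an example. It shows how a TCP service such as 192.168.0.10:80 is reduced to one of the 256 buckets.

#include <stdio.h>

#define TAB_BITS 8
#define TAB_MASK ((1 << TAB_BITS) - 1)

/* same folding as ip_vs_svc_hashkey(), on host-order values */
static unsigned svc_hashkey(unsigned proto, unsigned addr, unsigned short port)
{
	unsigned porth = port;

	return (proto ^ addr ^ (porth >> TAB_BITS) ^ porth) & TAB_MASK;
}

int main(void)
{
	unsigned addr = (192u << 24) | (168u << 16) | (0u << 8) | 10u; /* 192.168.0.10 */

	printf("TCP 192.168.0.10:80 -> bucket %u of %u\n",
	       svc_hashkey(6 /* IPPROTO_TCP */, addr, 80), TAB_MASK + 1);
	return 0;
}
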
299
300/*
301 * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
302 * or in the ip_vs_svc_fwm_table by fwmark.
303 * Should be called with locked tables.
304 */
305static int ip_vs_svc_hash(struct ip_vs_service *svc)
306{
307 unsigned hash;
308
309 if (svc->flags & IP_VS_SVC_F_HASHED) {
310 IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
311 "called from %p\n", __builtin_return_address(0));
312 return 0;
313 }
314
315 if (svc->fwmark == 0) {
316 /*
317 * Hash it by <protocol,addr,port> in ip_vs_svc_table
318 */
319 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
320 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
321 } else {
322 /*
323 * Hash it by fwmark in ip_vs_svc_fwm_table
324 */
325 hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
326 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
327 }
328
329 svc->flags |= IP_VS_SVC_F_HASHED;
330 /* increase its refcnt because it is referenced by the svc table */
331 atomic_inc(&svc->refcnt);
332 return 1;
333}
334
335
336/*
337 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
338 * Should be called with locked tables.
339 */
340static int ip_vs_svc_unhash(struct ip_vs_service *svc)
341{
342 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
343 IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
344 "called from %p\n", __builtin_return_address(0));
345 return 0;
346 }
347
348 if (svc->fwmark == 0) {
349 /* Remove it from the ip_vs_svc_table table */
350 list_del(&svc->s_list);
351 } else {
352 /* Remove it from the ip_vs_svc_fwm_table table */
353 list_del(&svc->f_list);
354 }
355
356 svc->flags &= ~IP_VS_SVC_F_HASHED;
357 atomic_dec(&svc->refcnt);
358 return 1;
359}
360
361
362/*
363 * Get service by {proto,addr,port} in the service table.
364 */
365static __inline__ struct ip_vs_service *
366__ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
367{
368 unsigned hash;
369 struct ip_vs_service *svc;
370
371 /* Check for "full" addressed entries */
372 hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
373
374 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
375 if ((svc->addr == vaddr)
376 && (svc->port == vport)
377 && (svc->protocol == protocol)) {
378 /* HIT */
379 atomic_inc(&svc->usecnt);
380 return svc;
381 }
382 }
383
384 return NULL;
385}
386
387
388/*
389 * Get service by {fwmark} in the service table.
390 */
391static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
392{
393 unsigned hash;
394 struct ip_vs_service *svc;
395
396 /* Check for fwmark addressed entries */
397 hash = ip_vs_svc_fwm_hashkey(fwmark);
398
399 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400 if (svc->fwmark == fwmark) {
401 /* HIT */
402 atomic_inc(&svc->usecnt);
403 return svc;
404 }
405 }
406
407 return NULL;
408}
409
410struct ip_vs_service *
411ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
412{
413 struct ip_vs_service *svc;
414
415 read_lock(&__ip_vs_svc_lock);
416
417 /*
418 * Check the table hashed by fwmark first
419 */
420 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
421 goto out;
422
423 /*
424 * Check the table hashed by <protocol,addr,port>
425 * for "full" addressed entries
426 */
427 svc = __ip_vs_service_get(protocol, vaddr, vport);
428
429 if (svc == NULL
430 && protocol == IPPROTO_TCP
431 && atomic_read(&ip_vs_ftpsvc_counter)
432 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
433 /*
434 * Check if ftp service entry exists, the packet
435 * might belong to FTP data connections.
436 */
437 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
438 }
439
440 if (svc == NULL
441 && atomic_read(&ip_vs_nullsvc_counter)) {
442 /*
443 * Check if the catch-all port (port zero) exists
444 */
445 svc = __ip_vs_service_get(protocol, vaddr, 0);
446 }
447
448 out:
449 read_unlock(&__ip_vs_svc_lock);
450
451 IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
452 fwmark, ip_vs_proto_name(protocol),
453 NIPQUAD(vaddr), ntohs(vport),
454 svc?"hit":"not hit");
455
456 return svc;
457}
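
A simplified standalone sketch (userspace C, with stand-in lookup helpers rather than the kernel ones, and without the TCP/port preconditions) of the fallback order used by ip_vs_service_get() above: the fwmark table is tried first, then the full <proto,addr,port> entry, then the FTP service for possible data connections, and finally the catch-all port-zero service.

#include <stdio.h>

enum hit { HIT_NONE, HIT_FWMARK, HIT_FULL, HIT_FTP, HIT_NULLSVC };

/* stand-in tables: pretend only an FTP (port 21) virtual service exists */
static int fwm_get(unsigned fwmark)      { (void)fwmark; return 0; }
static int full_get(unsigned short port) { return port == 21; }

static enum hit lookup(unsigned fwmark, unsigned short port)
{
	if (fwmark && fwm_get(fwmark))
		return HIT_FWMARK;
	if (full_get(port))
		return HIT_FULL;
	if (full_get(21))	/* FTP data traffic may belong to the port-21 service */
		return HIT_FTP;
	if (full_get(0))	/* catch-all port-zero service */
		return HIT_NULLSVC;
	return HIT_NONE;
}

int main(void)
{
	printf("lookup of port 21 -> %d (HIT_FULL)\n", lookup(0, 21));
	printf("lookup of port 20 -> %d (HIT_FTP)\n", lookup(0, 20));
	return 0;
}
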
458
459
460static inline void
461__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
462{
463 atomic_inc(&svc->refcnt);
464 dest->svc = svc;
465}
466
467static inline void
468__ip_vs_unbind_svc(struct ip_vs_dest *dest)
469{
470 struct ip_vs_service *svc = dest->svc;
471
472 dest->svc = NULL;
473 if (atomic_dec_and_test(&svc->refcnt))
474 kfree(svc);
475}
476
477
478/*
479 * Returns hash value for real service
480 */
481static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port)
482{
483 register unsigned porth = ntohs(port);
484
485 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
486 & IP_VS_RTAB_MASK;
487}
488
489/*
490 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
491 * should be called with locked tables.
492 */
493static int ip_vs_rs_hash(struct ip_vs_dest *dest)
494{
495 unsigned hash;
496
497 if (!list_empty(&dest->d_list)) {
498 return 0;
499 }
500
501 /*
502 * Hash by proto,addr,port,
503 * which are the parameters of the real service.
504 */
505 hash = ip_vs_rs_hashkey(dest->addr, dest->port);
506 list_add(&dest->d_list, &ip_vs_rtable[hash]);
507
508 return 1;
509}
510
511/*
512 * UNhashes ip_vs_dest from ip_vs_rtable.
513 * should be called with locked tables.
514 */
515static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
516{
517 /*
518 * Remove it from the ip_vs_rtable table.
519 */
520 if (!list_empty(&dest->d_list)) {
521 list_del(&dest->d_list);
522 INIT_LIST_HEAD(&dest->d_list);
523 }
524
525 return 1;
526}
527
528/*
529 * Lookup real service by <proto,addr,port> in the real service table.
530 */
531struct ip_vs_dest *
532ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
533{
534 unsigned hash;
535 struct ip_vs_dest *dest;
536
537 /*
538 * Check for "full" addressed entries
539 * Return the first found entry
540 */
541 hash = ip_vs_rs_hashkey(daddr, dport);
542
543 read_lock(&__ip_vs_rs_lock);
544 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
545 if ((dest->addr == daddr)
546 && (dest->port == dport)
547 && ((dest->protocol == protocol) ||
548 dest->vfwmark)) {
549 /* HIT */
550 read_unlock(&__ip_vs_rs_lock);
551 return dest;
552 }
553 }
554 read_unlock(&__ip_vs_rs_lock);
555
556 return NULL;
557}
558
559/*
560 * Lookup destination by {addr,port} in the given service
561 */
562static struct ip_vs_dest *
563ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
564{
565 struct ip_vs_dest *dest;
566
567 /*
568 * Find the destination for the given service
569 */
570 list_for_each_entry(dest, &svc->destinations, n_list) {
571 if ((dest->addr == daddr) && (dest->port == dport)) {
572 /* HIT */
573 return dest;
574 }
575 }
576
577 return NULL;
578}
579
580/*
581 * Find destination by {daddr,dport,vaddr,protocol}
582 * Created to be used in ip_vs_process_message() in
583 * the backup synchronization daemon. It finds the
584 * destination to be bound to the received connection
585 * on the backup.
586 *
587 * ip_vs_lookup_real_service() looked promising, but
588 * does not seem to work as expected.
589 */
590struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
591 __be32 vaddr, __be16 vport, __u16 protocol)
592{
593 struct ip_vs_dest *dest;
594 struct ip_vs_service *svc;
595
596 svc = ip_vs_service_get(0, protocol, vaddr, vport);
597 if (!svc)
598 return NULL;
599 dest = ip_vs_lookup_dest(svc, daddr, dport);
600 if (dest)
601 atomic_inc(&dest->refcnt);
602 ip_vs_service_put(svc);
603 return dest;
604}
605
606/*
607 * Lookup dest by {svc,addr,port} in the destination trash.
608 * The destination trash is used to hold the destinations that are removed
609 * from the service table but are still referenced by some conn entries.
610 * The reason to add the destination trash is that when the dest is temporarily
611 * down (either by the administrator or by a monitor program), the dest can be
612 * picked back from the trash, the remaining connections to the dest can
613 * continue, and the counting information of the dest is also useful for
614 * scheduling.
615 */
616static struct ip_vs_dest *
617ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
618{
619 struct ip_vs_dest *dest, *nxt;
620
621 /*
622 * Find the destination in trash
623 */
624 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
625 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
626 "dest->refcnt=%d\n",
627 dest->vfwmark,
628 NIPQUAD(dest->addr), ntohs(dest->port),
629 atomic_read(&dest->refcnt));
630 if (dest->addr == daddr &&
631 dest->port == dport &&
632 dest->vfwmark == svc->fwmark &&
633 dest->protocol == svc->protocol &&
634 (svc->fwmark ||
635 (dest->vaddr == svc->addr &&
636 dest->vport == svc->port))) {
637 /* HIT */
638 return dest;
639 }
640
641 /*
642 * Try to purge the destination from trash if not referenced
643 */
644 if (atomic_read(&dest->refcnt) == 1) {
645 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
646 "from trash\n",
647 dest->vfwmark,
648 NIPQUAD(dest->addr), ntohs(dest->port));
649 list_del(&dest->n_list);
650 ip_vs_dst_reset(dest);
651 __ip_vs_unbind_svc(dest);
652 kfree(dest);
653 }
654 }
655
656 return NULL;
657}
658
659
660/*
661 * Clean up all the destinations in the trash
662 * Called by the ip_vs_control_cleanup()
663 *
664 * When ip_vs_control_cleanup() is called at ipvs module exit,
665 * the service tables must have been flushed and all the connections
666 * have expired, and the refcnt of each destination in the trash must
667 * be 1, so we simply release them here.
668 */
669static void ip_vs_trash_cleanup(void)
670{
671 struct ip_vs_dest *dest, *nxt;
672
673 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
674 list_del(&dest->n_list);
675 ip_vs_dst_reset(dest);
676 __ip_vs_unbind_svc(dest);
677 kfree(dest);
678 }
679}
680
681
682static void
683ip_vs_zero_stats(struct ip_vs_stats *stats)
684{
685 spin_lock_bh(&stats->lock);
686
687 stats->conns = 0;
688 stats->inpkts = 0;
689 stats->outpkts = 0;
690 stats->inbytes = 0;
691 stats->outbytes = 0;
692
693 stats->cps = 0;
694 stats->inpps = 0;
695 stats->outpps = 0;
696 stats->inbps = 0;
697 stats->outbps = 0;
698
699 ip_vs_zero_estimator(stats);
700
701 spin_unlock_bh(&stats->lock);
702}
703
704/*
705 * Update a destination in the given service
706 */
707static void
708__ip_vs_update_dest(struct ip_vs_service *svc,
709 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
710{
711 int conn_flags;
712
713 /* set the weight and the flags */
714 atomic_set(&dest->weight, udest->weight);
715 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
716
717 /* check if local node and update the flags */
718 if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) {
719 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
720 | IP_VS_CONN_F_LOCALNODE;
721 }
722
723 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
724 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
725 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
726 } else {
727 /*
728 * Put the real service in ip_vs_rtable if not present.
729 * For now only for NAT!
730 */
731 write_lock_bh(&__ip_vs_rs_lock);
732 ip_vs_rs_hash(dest);
733 write_unlock_bh(&__ip_vs_rs_lock);
734 }
735 atomic_set(&dest->conn_flags, conn_flags);
736
737 /* bind the service */
738 if (!dest->svc) {
739 __ip_vs_bind_svc(dest, svc);
740 } else {
741 if (dest->svc != svc) {
742 __ip_vs_unbind_svc(dest);
743 ip_vs_zero_stats(&dest->stats);
744 __ip_vs_bind_svc(dest, svc);
745 }
746 }
747
748 /* set the dest status flags */
749 dest->flags |= IP_VS_DEST_F_AVAILABLE;
750
751 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
752 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
753 dest->u_threshold = udest->u_threshold;
754 dest->l_threshold = udest->l_threshold;
755}
756
757
758/*
759 * Create a destination for the given service
760 */
761static int
762ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
763 struct ip_vs_dest **dest_p)
764{
765 struct ip_vs_dest *dest;
766 unsigned atype;
767
768 EnterFunction(2);
769
770 atype = inet_addr_type(&init_net, udest->addr);
771 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
772 return -EINVAL;
773
774 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
775 if (dest == NULL) {
776 IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
777 return -ENOMEM;
778 }
779
780 dest->protocol = svc->protocol;
781 dest->vaddr = svc->addr;
782 dest->vport = svc->port;
783 dest->vfwmark = svc->fwmark;
784 dest->addr = udest->addr;
785 dest->port = udest->port;
786
787 atomic_set(&dest->activeconns, 0);
788 atomic_set(&dest->inactconns, 0);
789 atomic_set(&dest->persistconns, 0);
790 atomic_set(&dest->refcnt, 0);
791
792 INIT_LIST_HEAD(&dest->d_list);
793 spin_lock_init(&dest->dst_lock);
794 spin_lock_init(&dest->stats.lock);
795 __ip_vs_update_dest(svc, dest, udest);
796 ip_vs_new_estimator(&dest->stats);
797
798 *dest_p = dest;
799
800 LeaveFunction(2);
801 return 0;
802}
803
804
805/*
806 * Add a destination into an existing service
807 */
808static int
809ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
810{
811 struct ip_vs_dest *dest;
812 __be32 daddr = udest->addr;
813 __be16 dport = udest->port;
814 int ret;
815
816 EnterFunction(2);
817
818 if (udest->weight < 0) {
819 IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
820 return -ERANGE;
821 }
822
823 if (udest->l_threshold > udest->u_threshold) {
824 IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
825 "upper threshold\n");
826 return -ERANGE;
827 }
828
829 /*
830 * Check if the dest already exists in the list
831 */
832 dest = ip_vs_lookup_dest(svc, daddr, dport);
833 if (dest != NULL) {
834 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
835 return -EEXIST;
836 }
837
838 /*
839 * Check if the dest already exists in the trash and
840 * is from the same service
841 */
842 dest = ip_vs_trash_get_dest(svc, daddr, dport);
843 if (dest != NULL) {
844 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
845 "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
846 NIPQUAD(daddr), ntohs(dport),
847 atomic_read(&dest->refcnt),
848 dest->vfwmark,
849 NIPQUAD(dest->vaddr),
850 ntohs(dest->vport));
851 __ip_vs_update_dest(svc, dest, udest);
852
853 /*
854 * Get the destination from the trash
855 */
856 list_del(&dest->n_list);
857
858 ip_vs_new_estimator(&dest->stats);
859
860 write_lock_bh(&__ip_vs_svc_lock);
861
862 /*
863 * Wait until all other svc users go away.
864 */
865 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
866
867 list_add(&dest->n_list, &svc->destinations);
868 svc->num_dests++;
869
870 /* call the update_service function of its scheduler */
871 svc->scheduler->update_service(svc);
872
873 write_unlock_bh(&__ip_vs_svc_lock);
874 return 0;
875 }
876
877 /*
878 * Allocate and initialize the dest structure
879 */
880 ret = ip_vs_new_dest(svc, udest, &dest);
881 if (ret) {
882 return ret;
883 }
884
885 /*
886 * Add the dest entry into the list
887 */
888 atomic_inc(&dest->refcnt);
889
890 write_lock_bh(&__ip_vs_svc_lock);
891
892 /*
893 * Wait until all other svc users go away.
894 */
895 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
896
897 list_add(&dest->n_list, &svc->destinations);
898 svc->num_dests++;
899
900 /* call the update_service function of its scheduler */
901 svc->scheduler->update_service(svc);
902
903 write_unlock_bh(&__ip_vs_svc_lock);
904
905 LeaveFunction(2);
906
907 return 0;
908}
909
910
911/*
912 * Edit a destination in the given service
913 */
914static int
915ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
916{
917 struct ip_vs_dest *dest;
918 __be32 daddr = udest->addr;
919 __be16 dport = udest->port;
920
921 EnterFunction(2);
922
923 if (udest->weight < 0) {
924 IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
925 return -ERANGE;
926 }
927
928 if (udest->l_threshold > udest->u_threshold) {
929 IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
930 "upper threshold\n");
931 return -ERANGE;
932 }
933
934 /*
935 * Lookup the destination list
936 */
937 dest = ip_vs_lookup_dest(svc, daddr, dport);
938 if (dest == NULL) {
939 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
940 return -ENOENT;
941 }
942
943 __ip_vs_update_dest(svc, dest, udest);
944
945 write_lock_bh(&__ip_vs_svc_lock);
946
947 /* Wait until all other svc users go away */
948 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
949
950 /* call the update_service, because server weight may be changed */
951 svc->scheduler->update_service(svc);
952
953 write_unlock_bh(&__ip_vs_svc_lock);
954
955 LeaveFunction(2);
956
957 return 0;
958}
959
960
961/*
962 * Delete a destination (must be already unlinked from the service)
963 */
964static void __ip_vs_del_dest(struct ip_vs_dest *dest)
965{
966 ip_vs_kill_estimator(&dest->stats);
967
968 /*
969 * Remove it from the d-linked list with the real services.
970 */
971 write_lock_bh(&__ip_vs_rs_lock);
972 ip_vs_rs_unhash(dest);
973 write_unlock_bh(&__ip_vs_rs_lock);
974
975 /*
976 * Decrease the refcnt of the dest, and free the dest
977 * if nobody refers to it (refcnt=0). Otherwise, throw
978 * the destination into the trash.
979 */
980 if (atomic_dec_and_test(&dest->refcnt)) {
981 ip_vs_dst_reset(dest);
982 /* simply decrease svc->refcnt here, let the caller check
983 and release the service if nobody refers to it.
984 Only user context can release destination and service,
985 and only one user context can update virtual service at a
986 time, so the operation here is OK */
987 atomic_dec(&dest->svc->refcnt);
988 kfree(dest);
989 } else {
990 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, "
991 "dest->refcnt=%d\n",
992 NIPQUAD(dest->addr), ntohs(dest->port),
993 atomic_read(&dest->refcnt));
994 list_add(&dest->n_list, &ip_vs_dest_trash);
995 atomic_inc(&dest->refcnt);
996 }
997}
998
999
1000/*
1001 * Unlink a destination from the given service
1002 */
1003static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1004 struct ip_vs_dest *dest,
1005 int svcupd)
1006{
1007 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1008
1009 /*
1010 * Remove it from the d-linked destination list.
1011 */
1012 list_del(&dest->n_list);
1013 svc->num_dests--;
1014 if (svcupd) {
1015 /*
1016 * Call the update_service function of its scheduler
1017 */
1018 svc->scheduler->update_service(svc);
1019 }
1020}
1021
1022
1023/*
1024 * Delete a destination server in the given service
1025 */
1026static int
1027ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
1028{
1029 struct ip_vs_dest *dest;
1030 __be32 daddr = udest->addr;
1031 __be16 dport = udest->port;
1032
1033 EnterFunction(2);
1034
1035 dest = ip_vs_lookup_dest(svc, daddr, dport);
1036 if (dest == NULL) {
1037 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1038 return -ENOENT;
1039 }
1040
1041 write_lock_bh(&__ip_vs_svc_lock);
1042
1043 /*
1044 * Wait until all other svc users go away.
1045 */
1046 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1047
1048 /*
1049 * Unlink dest from the service
1050 */
1051 __ip_vs_unlink_dest(svc, dest, 1);
1052
1053 write_unlock_bh(&__ip_vs_svc_lock);
1054
1055 /*
1056 * Delete the destination
1057 */
1058 __ip_vs_del_dest(dest);
1059
1060 LeaveFunction(2);
1061
1062 return 0;
1063}
1064
1065
1066/*
1067 * Add a service into the service hash table
1068 */
1069static int
1070ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1071{
1072 int ret = 0;
1073 struct ip_vs_scheduler *sched = NULL;
1074 struct ip_vs_service *svc = NULL;
1075
1076 /* increase the module use count */
1077 ip_vs_use_count_inc();
1078
1079 /* Lookup the scheduler by 'u->sched_name' */
1080 sched = ip_vs_scheduler_get(u->sched_name);
1081 if (sched == NULL) {
1082 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1083 u->sched_name);
1084 ret = -ENOENT;
1085 goto out_mod_dec;
1086 }
1087
1088 svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1089 if (svc == NULL) {
1090 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
1091 ret = -ENOMEM;
1092 goto out_err;
1093 }
1094
1095 /* I'm the first user of the service */
1096 atomic_set(&svc->usecnt, 1);
1097 atomic_set(&svc->refcnt, 0);
1098
1099 svc->protocol = u->protocol;
1100 svc->addr = u->addr;
1101 svc->port = u->port;
1102 svc->fwmark = u->fwmark;
1103 svc->flags = u->flags;
1104 svc->timeout = u->timeout * HZ;
1105 svc->netmask = u->netmask;
1106
1107 INIT_LIST_HEAD(&svc->destinations);
1108 rwlock_init(&svc->sched_lock);
1109 spin_lock_init(&svc->stats.lock);
1110
1111 /* Bind the scheduler */
1112 ret = ip_vs_bind_scheduler(svc, sched);
1113 if (ret)
1114 goto out_err;
1115 sched = NULL;
1116
1117 /* Update the virtual service counters */
1118 if (svc->port == FTPPORT)
1119 atomic_inc(&ip_vs_ftpsvc_counter);
1120 else if (svc->port == 0)
1121 atomic_inc(&ip_vs_nullsvc_counter);
1122
1123 ip_vs_new_estimator(&svc->stats);
1124 ip_vs_num_services++;
1125
1126 /* Hash the service into the service table */
1127 write_lock_bh(&__ip_vs_svc_lock);
1128 ip_vs_svc_hash(svc);
1129 write_unlock_bh(&__ip_vs_svc_lock);
1130
1131 *svc_p = svc;
1132 return 0;
1133
1134 out_err:
1135 if (svc != NULL) {
1136 if (svc->scheduler)
1137 ip_vs_unbind_scheduler(svc);
1138 if (svc->inc) {
1139 local_bh_disable();
1140 ip_vs_app_inc_put(svc->inc);
1141 local_bh_enable();
1142 }
1143 kfree(svc);
1144 }
1145 ip_vs_scheduler_put(sched);
1146
1147 out_mod_dec:
1148 /* decrease the module use count */
1149 ip_vs_use_count_dec();
1150
1151 return ret;
1152}
1153
1154
1155/*
1156 * Edit a service and bind it with a new scheduler
1157 */
1158static int
1159ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1160{
1161 struct ip_vs_scheduler *sched, *old_sched;
1162 int ret = 0;
1163
1164 /*
1165 * Lookup the scheduler, by 'u->sched_name'
1166 */
1167 sched = ip_vs_scheduler_get(u->sched_name);
1168 if (sched == NULL) {
1169 IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
1170 u->sched_name);
1171 return -ENOENT;
1172 }
1173 old_sched = sched;
1174
1175 write_lock_bh(&__ip_vs_svc_lock);
1176
1177 /*
1178 * Wait until all other svc users go away.
1179 */
1180 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1181
1182 /*
1183 * Set the flags and timeout value
1184 */
1185 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1186 svc->timeout = u->timeout * HZ;
1187 svc->netmask = u->netmask;
1188
1189 old_sched = svc->scheduler;
1190 if (sched != old_sched) {
1191 /*
1192 * Unbind the old scheduler
1193 */
1194 if ((ret = ip_vs_unbind_scheduler(svc))) {
1195 old_sched = sched;
1196 goto out;
1197 }
1198
1199 /*
1200 * Bind the new scheduler
1201 */
1202 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1203 /*
1204 * If ip_vs_bind_scheduler fails, restore the old
1205 * scheduler.
1206			 * The main reason for failure is lack of memory.
1207			 *
1208			 * The question is whether the old scheduler can
1209			 * always be restored. TODO: if it cannot be
1210			 * restored in some case, we must delete the service,
1211 * otherwise the system may crash.
1212 */
1213 ip_vs_bind_scheduler(svc, old_sched);
1214 old_sched = sched;
1215 goto out;
1216 }
1217 }
1218
1219 out:
1220 write_unlock_bh(&__ip_vs_svc_lock);
1221
1222 if (old_sched)
1223 ip_vs_scheduler_put(old_sched);
1224
1225 return ret;
1226}
1227
1228
1229/*
1230 * Delete a service from the service list
1231 * - The service must be unlinked, unlocked and not referenced!
1232 * - We are called under _bh lock
1233 */
1234static void __ip_vs_del_service(struct ip_vs_service *svc)
1235{
1236 struct ip_vs_dest *dest, *nxt;
1237 struct ip_vs_scheduler *old_sched;
1238
1239 ip_vs_num_services--;
1240 ip_vs_kill_estimator(&svc->stats);
1241
1242 /* Unbind scheduler */
1243 old_sched = svc->scheduler;
1244 ip_vs_unbind_scheduler(svc);
1245 if (old_sched)
1246 ip_vs_scheduler_put(old_sched);
1247
1248 /* Unbind app inc */
1249 if (svc->inc) {
1250 ip_vs_app_inc_put(svc->inc);
1251 svc->inc = NULL;
1252 }
1253
1254 /*
1255 * Unlink the whole destination list
1256 */
1257 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1258 __ip_vs_unlink_dest(svc, dest, 0);
1259 __ip_vs_del_dest(dest);
1260 }
1261
1262 /*
1263 * Update the virtual service counters
1264 */
1265 if (svc->port == FTPPORT)
1266 atomic_dec(&ip_vs_ftpsvc_counter);
1267 else if (svc->port == 0)
1268 atomic_dec(&ip_vs_nullsvc_counter);
1269
1270 /*
1271 * Free the service if nobody refers to it
1272 */
1273 if (atomic_read(&svc->refcnt) == 0)
1274 kfree(svc);
1275
1276 /* decrease the module use count */
1277 ip_vs_use_count_dec();
1278}
1279
1280/*
1281 * Delete a service from the service list
1282 */
1283static int ip_vs_del_service(struct ip_vs_service *svc)
1284{
1285 if (svc == NULL)
1286 return -EEXIST;
1287
1288 /*
1289 * Unhash it from the service table
1290 */
1291 write_lock_bh(&__ip_vs_svc_lock);
1292
1293 ip_vs_svc_unhash(svc);
1294
1295 /*
1296 * Wait until all the svc users go away.
1297 */
1298 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
1299
1300 __ip_vs_del_service(svc);
1301
1302 write_unlock_bh(&__ip_vs_svc_lock);
1303
1304 return 0;
1305}
1306
1307
1308/*
1309 * Flush all the virtual services
1310 */
1311static int ip_vs_flush(void)
1312{
1313 int idx;
1314 struct ip_vs_service *svc, *nxt;
1315
1316 /*
1317 * Flush the service table hashed by <protocol,addr,port>
1318 */
1319 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1320 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
1321 write_lock_bh(&__ip_vs_svc_lock);
1322 ip_vs_svc_unhash(svc);
1323 /*
1324 * Wait until all the svc users go away.
1325 */
1326 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1327 __ip_vs_del_service(svc);
1328 write_unlock_bh(&__ip_vs_svc_lock);
1329 }
1330 }
1331
1332 /*
1333 * Flush the service table hashed by fwmark
1334 */
1335 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1336 list_for_each_entry_safe(svc, nxt,
1337 &ip_vs_svc_fwm_table[idx], f_list) {
1338 write_lock_bh(&__ip_vs_svc_lock);
1339 ip_vs_svc_unhash(svc);
1340 /*
1341 * Wait until all the svc users go away.
1342 */
1343 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1344 __ip_vs_del_service(svc);
1345 write_unlock_bh(&__ip_vs_svc_lock);
1346 }
1347 }
1348
1349 return 0;
1350}
1351
1352
1353/*
1354 * Zero counters in a service or all services
1355 */
1356static int ip_vs_zero_service(struct ip_vs_service *svc)
1357{
1358 struct ip_vs_dest *dest;
1359
1360 write_lock_bh(&__ip_vs_svc_lock);
1361 list_for_each_entry(dest, &svc->destinations, n_list) {
1362 ip_vs_zero_stats(&dest->stats);
1363 }
1364 ip_vs_zero_stats(&svc->stats);
1365 write_unlock_bh(&__ip_vs_svc_lock);
1366 return 0;
1367}
1368
1369static int ip_vs_zero_all(void)
1370{
1371 int idx;
1372 struct ip_vs_service *svc;
1373
1374 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1375 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1376 ip_vs_zero_service(svc);
1377 }
1378 }
1379
1380 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1381 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1382 ip_vs_zero_service(svc);
1383 }
1384 }
1385
1386 ip_vs_zero_stats(&ip_vs_stats);
1387 return 0;
1388}
1389
1390
1391static int
1392proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1393 void __user *buffer, size_t *lenp, loff_t *ppos)
1394{
1395 int *valp = table->data;
1396 int val = *valp;
1397 int rc;
1398
1399 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1400 if (write && (*valp != val)) {
1401 if ((*valp < 0) || (*valp > 3)) {
1402 /* Restore the correct value */
1403 *valp = val;
1404 } else {
1405 update_defense_level();
1406 }
1407 }
1408 return rc;
1409}
1410
1411
1412static int
1413proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
1414 void __user *buffer, size_t *lenp, loff_t *ppos)
1415{
1416 int *valp = table->data;
1417 int val[2];
1418 int rc;
1419
1420 /* backup the value first */
1421 memcpy(val, valp, sizeof(val));
1422
1423 rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
1424 if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1425 /* Restore the correct value */
1426 memcpy(valp, val, sizeof(val));
1427 }
1428 return rc;
1429}
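
A small standalone sketch (userspace C, illustrative values only) of the check performed by proc_do_sync_threshold() above: a written pair is accepted only when 0 <= first < second, otherwise the previous values are restored.

#include <stdio.h>
#include <string.h>

/* mimic proc_do_sync_threshold(): back up, write, roll back if invalid */
static void try_write(int cur[2], const int new_pair[2])
{
	int backup[2];

	memcpy(backup, cur, sizeof(backup));
	memcpy(cur, new_pair, sizeof(backup));
	if (cur[0] < 0 || cur[1] < 0 || cur[0] >= cur[1])
		memcpy(cur, backup, sizeof(backup));	/* reject, keep old pair */
}

int main(void)
{
	int threshold[2] = { 3, 50 };			/* assumed default pair */
	const int good[2] = { 10, 100 }, bad[2] = { 60, 50 };

	try_write(threshold, good);
	printf("after writing {10,100}: {%d,%d}\n", threshold[0], threshold[1]);
	try_write(threshold, bad);			/* rejected, previous pair kept */
	printf("after writing {60,50}:  {%d,%d}\n", threshold[0], threshold[1]);
	return 0;
}
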
1430
1431
1432/*
1433 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1434 */
1435
1436static struct ctl_table vs_vars[] = {
1437 {
1438 .procname = "amemthresh",
1439 .data = &sysctl_ip_vs_amemthresh,
1440 .maxlen = sizeof(int),
1441 .mode = 0644,
1442 .proc_handler = &proc_dointvec,
1443 },
1444#ifdef CONFIG_IP_VS_DEBUG
1445 {
1446 .procname = "debug_level",
1447 .data = &sysctl_ip_vs_debug_level,
1448 .maxlen = sizeof(int),
1449 .mode = 0644,
1450 .proc_handler = &proc_dointvec,
1451 },
1452#endif
1453 {
1454 .procname = "am_droprate",
1455 .data = &sysctl_ip_vs_am_droprate,
1456 .maxlen = sizeof(int),
1457 .mode = 0644,
1458 .proc_handler = &proc_dointvec,
1459 },
1460 {
1461 .procname = "drop_entry",
1462 .data = &sysctl_ip_vs_drop_entry,
1463 .maxlen = sizeof(int),
1464 .mode = 0644,
1465 .proc_handler = &proc_do_defense_mode,
1466 },
1467 {
1468 .procname = "drop_packet",
1469 .data = &sysctl_ip_vs_drop_packet,
1470 .maxlen = sizeof(int),
1471 .mode = 0644,
1472 .proc_handler = &proc_do_defense_mode,
1473 },
1474 {
1475 .procname = "secure_tcp",
1476 .data = &sysctl_ip_vs_secure_tcp,
1477 .maxlen = sizeof(int),
1478 .mode = 0644,
1479 .proc_handler = &proc_do_defense_mode,
1480 },
1481#if 0
1482 {
1483 .procname = "timeout_established",
1484 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1485 .maxlen = sizeof(int),
1486 .mode = 0644,
1487 .proc_handler = &proc_dointvec_jiffies,
1488 },
1489 {
1490 .procname = "timeout_synsent",
1491 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1492 .maxlen = sizeof(int),
1493 .mode = 0644,
1494 .proc_handler = &proc_dointvec_jiffies,
1495 },
1496 {
1497 .procname = "timeout_synrecv",
1498 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1499 .maxlen = sizeof(int),
1500 .mode = 0644,
1501 .proc_handler = &proc_dointvec_jiffies,
1502 },
1503 {
1504 .procname = "timeout_finwait",
1505 .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1506 .maxlen = sizeof(int),
1507 .mode = 0644,
1508 .proc_handler = &proc_dointvec_jiffies,
1509 },
1510 {
1511 .procname = "timeout_timewait",
1512 .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1513 .maxlen = sizeof(int),
1514 .mode = 0644,
1515 .proc_handler = &proc_dointvec_jiffies,
1516 },
1517 {
1518 .procname = "timeout_close",
1519 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1520 .maxlen = sizeof(int),
1521 .mode = 0644,
1522 .proc_handler = &proc_dointvec_jiffies,
1523 },
1524 {
1525 .procname = "timeout_closewait",
1526 .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1527 .maxlen = sizeof(int),
1528 .mode = 0644,
1529 .proc_handler = &proc_dointvec_jiffies,
1530 },
1531 {
1532 .procname = "timeout_lastack",
1533 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1534 .maxlen = sizeof(int),
1535 .mode = 0644,
1536 .proc_handler = &proc_dointvec_jiffies,
1537 },
1538 {
1539 .procname = "timeout_listen",
1540 .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1541 .maxlen = sizeof(int),
1542 .mode = 0644,
1543 .proc_handler = &proc_dointvec_jiffies,
1544 },
1545 {
1546 .procname = "timeout_synack",
1547 .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1548 .maxlen = sizeof(int),
1549 .mode = 0644,
1550 .proc_handler = &proc_dointvec_jiffies,
1551 },
1552 {
1553 .procname = "timeout_udp",
1554 .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1555 .maxlen = sizeof(int),
1556 .mode = 0644,
1557 .proc_handler = &proc_dointvec_jiffies,
1558 },
1559 {
1560 .procname = "timeout_icmp",
1561 .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1562 .maxlen = sizeof(int),
1563 .mode = 0644,
1564 .proc_handler = &proc_dointvec_jiffies,
1565 },
1566#endif
1567 {
1568 .procname = "cache_bypass",
1569 .data = &sysctl_ip_vs_cache_bypass,
1570 .maxlen = sizeof(int),
1571 .mode = 0644,
1572 .proc_handler = &proc_dointvec,
1573 },
1574 {
1575 .procname = "expire_nodest_conn",
1576 .data = &sysctl_ip_vs_expire_nodest_conn,
1577 .maxlen = sizeof(int),
1578 .mode = 0644,
1579 .proc_handler = &proc_dointvec,
1580 },
1581 {
1582 .procname = "expire_quiescent_template",
1583 .data = &sysctl_ip_vs_expire_quiescent_template,
1584 .maxlen = sizeof(int),
1585 .mode = 0644,
1586 .proc_handler = &proc_dointvec,
1587 },
1588 {
1589 .procname = "sync_threshold",
1590 .data = &sysctl_ip_vs_sync_threshold,
1591 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1592 .mode = 0644,
1593 .proc_handler = &proc_do_sync_threshold,
1594 },
1595 {
1596 .procname = "nat_icmp_send",
1597 .data = &sysctl_ip_vs_nat_icmp_send,
1598 .maxlen = sizeof(int),
1599 .mode = 0644,
1600 .proc_handler = &proc_dointvec,
1601 },
1602 { .ctl_name = 0 }
1603};
1604
1605const struct ctl_path net_vs_ctl_path[] = {
1606 { .procname = "net", .ctl_name = CTL_NET, },
1607 { .procname = "ipv4", .ctl_name = NET_IPV4, },
1608 { .procname = "vs", },
1609 { }
1610};
1611EXPORT_SYMBOL_GPL(net_vs_ctl_path);
1612
1613static struct ctl_table_header * sysctl_header;
1614
1615#ifdef CONFIG_PROC_FS
1616
1617struct ip_vs_iter {
1618 struct list_head *table;
1619 int bucket;
1620};
1621
1622/*
1623 * Write the contents of the VS rule table to a PROCfs file.
1624 * (It is kept just for backward compatibility)
1625 */
1626static inline const char *ip_vs_fwd_name(unsigned flags)
1627{
1628 switch (flags & IP_VS_CONN_F_FWD_MASK) {
1629 case IP_VS_CONN_F_LOCALNODE:
1630 return "Local";
1631 case IP_VS_CONN_F_TUNNEL:
1632 return "Tunnel";
1633 case IP_VS_CONN_F_DROUTE:
1634 return "Route";
1635 default:
1636 return "Masq";
1637 }
1638}
1639
1640
1641/* Get the Nth entry in the two lists */
1642static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1643{
1644 struct ip_vs_iter *iter = seq->private;
1645 int idx;
1646 struct ip_vs_service *svc;
1647
1648 /* look in hash by protocol */
1649 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1650 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1651 if (pos-- == 0){
1652 iter->table = ip_vs_svc_table;
1653 iter->bucket = idx;
1654 return svc;
1655 }
1656 }
1657 }
1658
1659 /* keep looking in fwmark */
1660 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1661 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1662 if (pos-- == 0) {
1663 iter->table = ip_vs_svc_fwm_table;
1664 iter->bucket = idx;
1665 return svc;
1666 }
1667 }
1668 }
1669
1670 return NULL;
1671}
1672
1673static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1674{
1675
1676 read_lock_bh(&__ip_vs_svc_lock);
1677 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1678}
1679
1680
1681static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1682{
1683 struct list_head *e;
1684 struct ip_vs_iter *iter;
1685 struct ip_vs_service *svc;
1686
1687 ++*pos;
1688 if (v == SEQ_START_TOKEN)
1689 return ip_vs_info_array(seq,0);
1690
1691 svc = v;
1692 iter = seq->private;
1693
1694 if (iter->table == ip_vs_svc_table) {
1695 /* next service in table hashed by protocol */
1696 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1697 return list_entry(e, struct ip_vs_service, s_list);
1698
1699
1700 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1701 list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1702 s_list) {
1703 return svc;
1704 }
1705 }
1706
1707 iter->table = ip_vs_svc_fwm_table;
1708 iter->bucket = -1;
1709 goto scan_fwmark;
1710 }
1711
1712 /* next service in hashed by fwmark */
1713 if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1714 return list_entry(e, struct ip_vs_service, f_list);
1715
1716 scan_fwmark:
1717 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1718 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1719 f_list)
1720 return svc;
1721 }
1722
1723 return NULL;
1724}
1725
1726static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1727{
1728 read_unlock_bh(&__ip_vs_svc_lock);
1729}
1730
1731
1732static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1733{
1734 if (v == SEQ_START_TOKEN) {
1735 seq_printf(seq,
1736 "IP Virtual Server version %d.%d.%d (size=%d)\n",
1737 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
1738 seq_puts(seq,
1739 "Prot LocalAddress:Port Scheduler Flags\n");
1740 seq_puts(seq,
1741 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1742 } else {
1743 const struct ip_vs_service *svc = v;
1744 const struct ip_vs_iter *iter = seq->private;
1745 const struct ip_vs_dest *dest;
1746
1747 if (iter->table == ip_vs_svc_table)
1748 seq_printf(seq, "%s %08X:%04X %s ",
1749 ip_vs_proto_name(svc->protocol),
1750 ntohl(svc->addr),
1751 ntohs(svc->port),
1752 svc->scheduler->name);
1753 else
1754 seq_printf(seq, "FWM %08X %s ",
1755 svc->fwmark, svc->scheduler->name);
1756
1757 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1758 seq_printf(seq, "persistent %d %08X\n",
1759 svc->timeout,
1760 ntohl(svc->netmask));
1761 else
1762 seq_putc(seq, '\n');
1763
1764 list_for_each_entry(dest, &svc->destinations, n_list) {
1765 seq_printf(seq,
1766 " -> %08X:%04X %-7s %-6d %-10d %-10d\n",
1767 ntohl(dest->addr), ntohs(dest->port),
1768 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1769 atomic_read(&dest->weight),
1770 atomic_read(&dest->activeconns),
1771 atomic_read(&dest->inactconns));
1772 }
1773 }
1774 return 0;
1775}
1776
1777static const struct seq_operations ip_vs_info_seq_ops = {
1778 .start = ip_vs_info_seq_start,
1779 .next = ip_vs_info_seq_next,
1780 .stop = ip_vs_info_seq_stop,
1781 .show = ip_vs_info_seq_show,
1782};
1783
1784static int ip_vs_info_open(struct inode *inode, struct file *file)
1785{
1786 return seq_open_private(file, &ip_vs_info_seq_ops,
1787 sizeof(struct ip_vs_iter));
1788}
1789
1790static const struct file_operations ip_vs_info_fops = {
1791 .owner = THIS_MODULE,
1792 .open = ip_vs_info_open,
1793 .read = seq_read,
1794 .llseek = seq_lseek,
1795 .release = seq_release_private,
1796};
1797
1798#endif
1799
1800struct ip_vs_stats ip_vs_stats = {
1801 .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
1802};
1803
1804#ifdef CONFIG_PROC_FS
1805static int ip_vs_stats_show(struct seq_file *seq, void *v)
1806{
1807
1808/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1809 seq_puts(seq,
1810 " Total Incoming Outgoing Incoming Outgoing\n");
1811 seq_printf(seq,
1812 " Conns Packets Packets Bytes Bytes\n");
1813
1814 spin_lock_bh(&ip_vs_stats.lock);
1815 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
1816 ip_vs_stats.inpkts, ip_vs_stats.outpkts,
1817 (unsigned long long) ip_vs_stats.inbytes,
1818 (unsigned long long) ip_vs_stats.outbytes);
1819
1820/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1821 seq_puts(seq,
1822 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1823 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1824 ip_vs_stats.cps,
1825 ip_vs_stats.inpps,
1826 ip_vs_stats.outpps,
1827 ip_vs_stats.inbps,
1828 ip_vs_stats.outbps);
1829 spin_unlock_bh(&ip_vs_stats.lock);
1830
1831 return 0;
1832}
1833
1834static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1835{
1836 return single_open(file, ip_vs_stats_show, NULL);
1837}
1838
1839static const struct file_operations ip_vs_stats_fops = {
1840 .owner = THIS_MODULE,
1841 .open = ip_vs_stats_seq_open,
1842 .read = seq_read,
1843 .llseek = seq_lseek,
1844 .release = single_release,
1845};
1846
1847#endif
1848
1849/*
1850 * Set timeout values for tcp tcpfin udp in the timeout_table.
1851 */
1852static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
1853{
1854 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
1855 u->tcp_timeout,
1856 u->tcp_fin_timeout,
1857 u->udp_timeout);
1858
1859#ifdef CONFIG_IP_VS_PROTO_TCP
1860 if (u->tcp_timeout) {
1861 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
1862 = u->tcp_timeout * HZ;
1863 }
1864
1865 if (u->tcp_fin_timeout) {
1866 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
1867 = u->tcp_fin_timeout * HZ;
1868 }
1869#endif
1870
1871#ifdef CONFIG_IP_VS_PROTO_UDP
1872 if (u->udp_timeout) {
1873 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
1874 = u->udp_timeout * HZ;
1875 }
1876#endif
1877 return 0;
1878}
1879
1880
1881#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
1882#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
1883#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
1884 sizeof(struct ip_vs_dest_user))
1885#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
1886#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
1887#define MAX_ARG_LEN SVCDEST_ARG_LEN
1888
1889static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1890 [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
1891 [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
1892 [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
1893 [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
1894 [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
1895 [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
1896 [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
1897 [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
1898 [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
1899 [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
1900 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
1901};
1902
1903static int
1904do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1905{
1906 int ret;
1907 unsigned char arg[MAX_ARG_LEN];
1908 struct ip_vs_service_user *usvc;
1909 struct ip_vs_service *svc;
1910 struct ip_vs_dest_user *udest;
1911
1912 if (!capable(CAP_NET_ADMIN))
1913 return -EPERM;
1914
1915 if (len != set_arglen[SET_CMDID(cmd)]) {
1916 IP_VS_ERR("set_ctl: len %u != %u\n",
1917 len, set_arglen[SET_CMDID(cmd)]);
1918 return -EINVAL;
1919 }
1920
1921 if (copy_from_user(arg, user, len) != 0)
1922 return -EFAULT;
1923
1924 /* increase the module use count */
1925 ip_vs_use_count_inc();
1926
1927 if (mutex_lock_interruptible(&__ip_vs_mutex)) {
1928 ret = -ERESTARTSYS;
1929 goto out_dec;
1930 }
1931
1932 if (cmd == IP_VS_SO_SET_FLUSH) {
1933 /* Flush the virtual service */
1934 ret = ip_vs_flush();
1935 goto out_unlock;
1936 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
1937 /* Set timeout values for (tcp tcpfin udp) */
1938 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
1939 goto out_unlock;
1940 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
1941 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1942 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
1943 goto out_unlock;
1944 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
1945 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
1946 ret = stop_sync_thread(dm->state);
1947 goto out_unlock;
1948 }
1949
1950 usvc = (struct ip_vs_service_user *)arg;
1951 udest = (struct ip_vs_dest_user *)(usvc + 1);
1952
1953 if (cmd == IP_VS_SO_SET_ZERO) {
1954 /* if no service address is set, zero counters in all */
1955 if (!usvc->fwmark && !usvc->addr && !usvc->port) {
1956 ret = ip_vs_zero_all();
1957 goto out_unlock;
1958 }
1959 }
1960
1961 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1962 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
1963 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1964 usvc->protocol, NIPQUAD(usvc->addr),
1965 ntohs(usvc->port), usvc->sched_name);
1966 ret = -EFAULT;
1967 goto out_unlock;
1968 }
1969
1970 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1971 if (usvc->fwmark == 0)
1972 svc = __ip_vs_service_get(usvc->protocol,
1973 usvc->addr, usvc->port);
1974 else
1975 svc = __ip_vs_svc_fwm_get(usvc->fwmark);
1976
1977 if (cmd != IP_VS_SO_SET_ADD
1978 && (svc == NULL || svc->protocol != usvc->protocol)) {
1979 ret = -ESRCH;
1980 goto out_unlock;
1981 }
1982
1983 switch (cmd) {
1984 case IP_VS_SO_SET_ADD:
1985 if (svc != NULL)
1986 ret = -EEXIST;
1987 else
1988 ret = ip_vs_add_service(usvc, &svc);
1989 break;
1990 case IP_VS_SO_SET_EDIT:
1991 ret = ip_vs_edit_service(svc, usvc);
1992 break;
1993 case IP_VS_SO_SET_DEL:
1994 ret = ip_vs_del_service(svc);
1995 if (!ret)
1996 goto out_unlock;
1997 break;
1998 case IP_VS_SO_SET_ZERO:
1999 ret = ip_vs_zero_service(svc);
2000 break;
2001 case IP_VS_SO_SET_ADDDEST:
2002 ret = ip_vs_add_dest(svc, udest);
2003 break;
2004 case IP_VS_SO_SET_EDITDEST:
2005 ret = ip_vs_edit_dest(svc, udest);
2006 break;
2007 case IP_VS_SO_SET_DELDEST:
2008 ret = ip_vs_del_dest(svc, udest);
2009 break;
2010 default:
2011 ret = -EINVAL;
2012 }
2013
2014 if (svc)
2015 ip_vs_service_put(svc);
2016
2017 out_unlock:
2018 mutex_unlock(&__ip_vs_mutex);
2019 out_dec:
2020 /* decrease the module use count */
2021 ip_vs_use_count_dec();
2022
2023 return ret;
2024}
2025
2026
2027static void
2028ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2029{
2030 spin_lock_bh(&src->lock);
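	/*
	 * This copies every counter laid out before 'lock' in struct
	 * ip_vs_stats; it assumes struct ip_vs_stats_user mirrors exactly
	 * those leading fields.
	 */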
2031 memcpy(dst, src, (char*)&src->lock - (char*)src);
2032 spin_unlock_bh(&src->lock);
2033}
2034
2035static void
2036ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2037{
2038 dst->protocol = src->protocol;
2039 dst->addr = src->addr;
2040 dst->port = src->port;
2041 dst->fwmark = src->fwmark;
2042 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2043 dst->flags = src->flags;
2044 dst->timeout = src->timeout / HZ;
2045 dst->netmask = src->netmask;
2046 dst->num_dests = src->num_dests;
2047 ip_vs_copy_stats(&dst->stats, &src->stats);
2048}
2049
2050static inline int
2051__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2052 struct ip_vs_get_services __user *uptr)
2053{
2054 int idx, count=0;
2055 struct ip_vs_service *svc;
2056 struct ip_vs_service_entry entry;
2057 int ret = 0;
2058
2059 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2060 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2061 if (count >= get->num_services)
2062 goto out;
2063 memset(&entry, 0, sizeof(entry));
2064 ip_vs_copy_service(&entry, svc);
2065 if (copy_to_user(&uptr->entrytable[count],
2066 &entry, sizeof(entry))) {
2067 ret = -EFAULT;
2068 goto out;
2069 }
2070 count++;
2071 }
2072 }
2073
2074 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2075 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2076 if (count >= get->num_services)
2077 goto out;
2078 memset(&entry, 0, sizeof(entry));
2079 ip_vs_copy_service(&entry, svc);
2080 if (copy_to_user(&uptr->entrytable[count],
2081 &entry, sizeof(entry))) {
2082 ret = -EFAULT;
2083 goto out;
2084 }
2085 count++;
2086 }
2087 }
2088 out:
2089 return ret;
2090}
2091
2092static inline int
2093__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2094 struct ip_vs_get_dests __user *uptr)
2095{
2096 struct ip_vs_service *svc;
2097 int ret = 0;
2098
2099 if (get->fwmark)
2100 svc = __ip_vs_svc_fwm_get(get->fwmark);
2101 else
2102 svc = __ip_vs_service_get(get->protocol,
2103 get->addr, get->port);
2104 if (svc) {
2105 int count = 0;
2106 struct ip_vs_dest *dest;
2107 struct ip_vs_dest_entry entry;
2108
2109 list_for_each_entry(dest, &svc->destinations, n_list) {
2110 if (count >= get->num_dests)
2111 break;
2112
2113 entry.addr = dest->addr;
2114 entry.port = dest->port;
2115 entry.conn_flags = atomic_read(&dest->conn_flags);
2116 entry.weight = atomic_read(&dest->weight);
2117 entry.u_threshold = dest->u_threshold;
2118 entry.l_threshold = dest->l_threshold;
2119 entry.activeconns = atomic_read(&dest->activeconns);
2120 entry.inactconns = atomic_read(&dest->inactconns);
2121 entry.persistconns = atomic_read(&dest->persistconns);
2122 ip_vs_copy_stats(&entry.stats, &dest->stats);
2123 if (copy_to_user(&uptr->entrytable[count],
2124 &entry, sizeof(entry))) {
2125 ret = -EFAULT;
2126 break;
2127 }
2128 count++;
2129 }
2130 ip_vs_service_put(svc);
2131 } else
2132 ret = -ESRCH;
2133 return ret;
2134}
2135
2136static inline void
2137__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
2138{
2139#ifdef CONFIG_IP_VS_PROTO_TCP
2140 u->tcp_timeout =
2141 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2142 u->tcp_fin_timeout =
2143 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2144#endif
2145#ifdef CONFIG_IP_VS_PROTO_UDP
2146 u->udp_timeout =
2147 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2148#endif
2149}
2150
2151
2152#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
2153#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
2154#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
2155#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
2156#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
2157#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
2158#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
2159
2160static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2161 [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
2162 [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
2163 [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
2164 [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
2165 [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
2166 [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
2167 [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
2168};
2169
2170static int
2171do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2172{
2173 unsigned char arg[128];
2174 int ret = 0;
2175
2176 if (!capable(CAP_NET_ADMIN))
2177 return -EPERM;
2178
2179 if (*len < get_arglen[GET_CMDID(cmd)]) {
2180 IP_VS_ERR("get_ctl: len %u < %u\n",
2181 *len, get_arglen[GET_CMDID(cmd)]);
2182 return -EINVAL;
2183 }
2184
2185 if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
2186 return -EFAULT;
2187
2188 if (mutex_lock_interruptible(&__ip_vs_mutex))
2189 return -ERESTARTSYS;
2190
2191 switch (cmd) {
2192 case IP_VS_SO_GET_VERSION:
2193 {
2194 char buf[64];
2195
2196 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2197 NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
2198 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2199 ret = -EFAULT;
2200 goto out;
2201 }
2202 *len = strlen(buf)+1;
2203 }
2204 break;
2205
2206 case IP_VS_SO_GET_INFO:
2207 {
2208 struct ip_vs_getinfo info;
2209 info.version = IP_VS_VERSION_CODE;
2210 info.size = IP_VS_CONN_TAB_SIZE;
2211 info.num_services = ip_vs_num_services;
2212 if (copy_to_user(user, &info, sizeof(info)) != 0)
2213 ret = -EFAULT;
2214 }
2215 break;
2216
2217 case IP_VS_SO_GET_SERVICES:
2218 {
2219 struct ip_vs_get_services *get;
2220 int size;
2221
2222 get = (struct ip_vs_get_services *)arg;
2223 size = sizeof(*get) +
2224 sizeof(struct ip_vs_service_entry) * get->num_services;
2225 if (*len != size) {
2226 IP_VS_ERR("length: %u != %u\n", *len, size);
2227 ret = -EINVAL;
2228 goto out;
2229 }
2230 ret = __ip_vs_get_service_entries(get, user);
2231 }
2232 break;
2233
2234 case IP_VS_SO_GET_SERVICE:
2235 {
2236 struct ip_vs_service_entry *entry;
2237 struct ip_vs_service *svc;
2238
2239 entry = (struct ip_vs_service_entry *)arg;
2240 if (entry->fwmark)
2241 svc = __ip_vs_svc_fwm_get(entry->fwmark);
2242 else
2243 svc = __ip_vs_service_get(entry->protocol,
2244 entry->addr, entry->port);
2245 if (svc) {
2246 ip_vs_copy_service(entry, svc);
2247 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2248 ret = -EFAULT;
2249 ip_vs_service_put(svc);
2250 } else
2251 ret = -ESRCH;
2252 }
2253 break;
2254
2255 case IP_VS_SO_GET_DESTS:
2256 {
2257 struct ip_vs_get_dests *get;
2258 int size;
2259
2260 get = (struct ip_vs_get_dests *)arg;
2261 size = sizeof(*get) +
2262 sizeof(struct ip_vs_dest_entry) * get->num_dests;
2263 if (*len != size) {
2264 IP_VS_ERR("length: %u != %u\n", *len, size);
2265 ret = -EINVAL;
2266 goto out;
2267 }
2268 ret = __ip_vs_get_dest_entries(get, user);
2269 }
2270 break;
2271
2272 case IP_VS_SO_GET_TIMEOUT:
2273 {
2274 struct ip_vs_timeout_user t;
2275
2276 __ip_vs_get_timeouts(&t);
2277 if (copy_to_user(user, &t, sizeof(t)) != 0)
2278 ret = -EFAULT;
2279 }
2280 break;
2281
2282 case IP_VS_SO_GET_DAEMON:
2283 {
2284 struct ip_vs_daemon_user d[2];
2285
2286 memset(&d, 0, sizeof(d));
2287 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2288 d[0].state = IP_VS_STATE_MASTER;
2289 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2290 d[0].syncid = ip_vs_master_syncid;
2291 }
2292 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2293 d[1].state = IP_VS_STATE_BACKUP;
2294 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2295 d[1].syncid = ip_vs_backup_syncid;
2296 }
2297 if (copy_to_user(user, &d, sizeof(d)) != 0)
2298 ret = -EFAULT;
2299 }
2300 break;
2301
2302 default:
2303 ret = -EINVAL;
2304 }
2305
2306 out:
2307 mutex_unlock(&__ip_vs_mutex);
2308 return ret;
2309}
2310
2311
2312static struct nf_sockopt_ops ip_vs_sockopts = {
2313 .pf = PF_INET,
2314 .set_optmin = IP_VS_BASE_CTL,
2315 .set_optmax = IP_VS_SO_SET_MAX+1,
2316 .set = do_ip_vs_set_ctl,
2317 .get_optmin = IP_VS_BASE_CTL,
2318 .get_optmax = IP_VS_SO_GET_MAX+1,
2319 .get = do_ip_vs_get_ctl,
2320 .owner = THIS_MODULE,
2321};
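These hooks are what user-space tools such as ipvsadm reach through plain {get,set}sockopt() on an IPv4 socket. A rough sketch of the get path, assuming <linux/ip_vs.h> (or ipvsadm's private copy of the header) provides IP_VS_SO_GET_VERSION; running it needs CAP_NET_ADMIN, as checked in do_ip_vs_get_ctl() above:

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ip_vs.h>	/* assumption: exports IP_VS_SO_GET_VERSION */

int main(void)
{
	char buf[64];			/* get_arglen for GET_VERSION is 64 */
	socklen_t len = sizeof(buf);
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (fd < 0)
		return 1;
	/* ends up in do_ip_vs_get_ctl() via the nf_sockopt hook */
	if (getsockopt(fd, IPPROTO_IP, IP_VS_SO_GET_VERSION, buf, &len) == 0)
		printf("%s\n", buf);
	close(fd);
	return 0;
}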
2322
2323
2324int __init ip_vs_control_init(void)
2325{
2326 int ret;
2327 int idx;
2328
2329 EnterFunction(2);
2330
2331 ret = nf_register_sockopt(&ip_vs_sockopts);
2332 if (ret) {
2333 IP_VS_ERR("cannot register sockopt.\n");
2334 return ret;
2335 }
2336
2337 proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
2338 proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
2339
2340 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
2341
2342 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
2343 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2344 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
2345 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
2346 }
2347 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
2348 INIT_LIST_HEAD(&ip_vs_rtable[idx]);
2349 }
2350
2351 ip_vs_new_estimator(&ip_vs_stats);
2352
2353 /* Hook the defense timer */
2354 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
2355
2356 LeaveFunction(2);
2357 return 0;
2358}
2359
2360
2361void ip_vs_control_cleanup(void)
2362{
2363 EnterFunction(2);
2364 ip_vs_trash_cleanup();
2365 cancel_rearming_delayed_work(&defense_work);
2366 cancel_work_sync(&defense_work.work);
2367 ip_vs_kill_estimator(&ip_vs_stats);
2368 unregister_sysctl_table(sysctl_header);
2369 proc_net_remove(&init_net, "ip_vs_stats");
2370 proc_net_remove(&init_net, "ip_vs");
2371 nf_unregister_sockopt(&ip_vs_sockopts);
2372 LeaveFunction(2);
2373}
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
deleted file mode 100644
index fa66824d264f..000000000000
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ /dev/null
@@ -1,258 +0,0 @@
1/*
2 * IPVS: Destination Hashing scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 *
6 * Inspired by the consistent hashing scheduler patch from
7 * Thomas Proell <proellt@gmx.de>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Changes:
15 *
16 */
17
18/*
 19 * The dh algorithm selects a server by the hash key of the destination IP
 20 * address. The pseudo code is as follows:
21 *
22 * n <- servernode[dest_ip];
23 * if (n is dead) OR
24 * (n is overloaded) OR (n.weight <= 0) then
25 * return NULL;
26 *
27 * return n;
28 *
 29 * Note that servernode is a 256-bucket hash table that maps the hash
 30 * index derived from the packet destination IP address to the current
 31 * server array. If the dh scheduler is used in a cache cluster, it is good
 32 * to combine it with the cache_bypass feature. When the statically assigned
33 * server is dead or overloaded, the load balancer can bypass the cache
34 * server and send requests to the original server directly.
35 *
36 */
37
38#include <linux/ip.h>
39#include <linux/module.h>
40#include <linux/kernel.h>
41#include <linux/skbuff.h>
42
43#include <net/ip_vs.h>
44
45
46/*
47 * IPVS DH bucket
48 */
49struct ip_vs_dh_bucket {
50 struct ip_vs_dest *dest; /* real server (cache) */
51};
52
53/*
54 * for IPVS DH entry hash table
55 */
56#ifndef CONFIG_IP_VS_DH_TAB_BITS
57#define CONFIG_IP_VS_DH_TAB_BITS 8
58#endif
59#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
60#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
61#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
62
63
64/*
65 * Returns hash value for IPVS DH entry
66 */
67static inline unsigned ip_vs_dh_hashkey(__be32 addr)
68{
69 return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
70}
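The multiplier is essentially the golden-ratio hashing constant commonly credited to Knuth: 2^32 / 1.6180339887 is about 2654435769, and multiplying the host-order address by a value this close to it (2654435761 here) scrambles high and low address bits before the result is masked down to the 256 DH buckets.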
71
72
73/*
74 * Get ip_vs_dest associated with supplied parameters.
75 */
76static inline struct ip_vs_dest *
77ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr)
78{
79 return (tbl[ip_vs_dh_hashkey(addr)]).dest;
80}
81
82
83/*
84 * Assign all the hash buckets of the specified table with the service.
85 */
86static int
87ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
88{
89 int i;
90 struct ip_vs_dh_bucket *b;
91 struct list_head *p;
92 struct ip_vs_dest *dest;
93
94 b = tbl;
95 p = &svc->destinations;
96 for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
97 if (list_empty(p)) {
98 b->dest = NULL;
99 } else {
100 if (p == &svc->destinations)
101 p = p->next;
102
103 dest = list_entry(p, struct ip_vs_dest, n_list);
104 atomic_inc(&dest->refcnt);
105 b->dest = dest;
106
107 p = p->next;
108 }
109 b++;
110 }
111 return 0;
112}
113
114
115/*
116 * Flush all the hash buckets of the specified table.
117 */
118static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
119{
120 int i;
121 struct ip_vs_dh_bucket *b;
122
123 b = tbl;
124 for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
125 if (b->dest) {
126 atomic_dec(&b->dest->refcnt);
127 b->dest = NULL;
128 }
129 b++;
130 }
131}
132
133
134static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
135{
136 struct ip_vs_dh_bucket *tbl;
137
138 /* allocate the DH table for this service */
139 tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
140 GFP_ATOMIC);
141 if (tbl == NULL) {
142 IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
143 return -ENOMEM;
144 }
145 svc->sched_data = tbl;
146 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
147 "current service\n",
148 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
149
150 /* assign the hash buckets with the updated service */
151 ip_vs_dh_assign(tbl, svc);
152
153 return 0;
154}
155
156
157static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
158{
159 struct ip_vs_dh_bucket *tbl = svc->sched_data;
160
161 /* got to clean up hash buckets here */
162 ip_vs_dh_flush(tbl);
163
164 /* release the table itself */
165 kfree(svc->sched_data);
166 IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
167 sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
168
169 return 0;
170}
171
172
173static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
174{
175 struct ip_vs_dh_bucket *tbl = svc->sched_data;
176
177 /* got to clean up hash buckets here */
178 ip_vs_dh_flush(tbl);
179
180 /* assign the hash buckets with the updated service */
181 ip_vs_dh_assign(tbl, svc);
182
183 return 0;
184}
185
186
187/*
188 * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
189 * consider that the server is overloaded here.
190 */
191static inline int is_overloaded(struct ip_vs_dest *dest)
192{
193 return dest->flags & IP_VS_DEST_F_OVERLOAD;
194}
195
196
197/*
198 * Destination hashing scheduling
199 */
200static struct ip_vs_dest *
201ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
202{
203 struct ip_vs_dest *dest;
204 struct ip_vs_dh_bucket *tbl;
205 struct iphdr *iph = ip_hdr(skb);
206
207 IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
208
209 tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
210 dest = ip_vs_dh_get(tbl, iph->daddr);
211 if (!dest
212 || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
213 || atomic_read(&dest->weight) <= 0
214 || is_overloaded(dest)) {
215 return NULL;
216 }
217
218 IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
219 "--> server %u.%u.%u.%u:%d\n",
220 NIPQUAD(iph->daddr),
221 NIPQUAD(dest->addr),
222 ntohs(dest->port));
223
224 return dest;
225}
226
227
228/*
229 * IPVS DH Scheduler structure
230 */
231static struct ip_vs_scheduler ip_vs_dh_scheduler =
232{
233 .name = "dh",
234 .refcnt = ATOMIC_INIT(0),
235 .module = THIS_MODULE,
236 .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
237 .init_service = ip_vs_dh_init_svc,
238 .done_service = ip_vs_dh_done_svc,
239 .update_service = ip_vs_dh_update_svc,
240 .schedule = ip_vs_dh_schedule,
241};
242
243
244static int __init ip_vs_dh_init(void)
245{
246 return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
247}
248
249
250static void __exit ip_vs_dh_cleanup(void)
251{
252 unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
253}
254
255
256module_init(ip_vs_dh_init);
257module_exit(ip_vs_dh_cleanup);
258MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
deleted file mode 100644
index 5a20f93bd7f9..000000000000
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ /dev/null
@@ -1,162 +0,0 @@
1/*
2 * ip_vs_est.c: simple rate estimator for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 *
13 */
14#include <linux/kernel.h>
15#include <linux/jiffies.h>
16#include <linux/slab.h>
17#include <linux/types.h>
18#include <linux/interrupt.h>
19#include <linux/sysctl.h>
20#include <linux/list.h>
21
22#include <net/ip_vs.h>
23
24/*
 25 This code estimates the rate over a short interval (such as 8
 26 seconds) for virtual services and real servers. To measure the rate over
 27 a long interval, it is easy to implement a user-level daemon that
 28 periodically reads these statistical counters and computes the rate.
 29
 30 Currently, the measurement is activated by a slow timer handler. Hopefully
 31 this measurement will not introduce too much load.
32
33 We measure rate during the last 8 seconds every 2 seconds:
34
35 avgrate = avgrate*(1-W) + rate*W
36
37 where W = 2^(-2)
38
39 NOTES.
40
41 * The stored value for average bps is scaled by 2^5, so that maximal
42 rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
43
 44 * A lot of code is taken from net/sched/estimator.c
45 */
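A worked example of the fixed-point update performed below in estimation_timer(), using made-up traffic figures (W = 1/4, conns/s scaled by 2^10):

/*
 * 600 new connections seen in the last 2 seconds, previous average 200 conns/s:
 *
 *   rate    = 600 << 9             = 307200   (300 conns/s scaled by 2^10)
 *   e->cps  = 200 << 10            = 204800
 *   e->cps += (307200-204800) >> 2 -> 230400
 *   s->cps  = (230400+0x1FF) >> 10 -> 225 conns/s
 *
 * i.e. avgrate = 200*(1 - 1/4) + 300*(1/4) = 225, matching the formula above.
 */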
46
47
48static void estimation_timer(unsigned long arg);
49
50static LIST_HEAD(est_list);
51static DEFINE_SPINLOCK(est_lock);
52static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
53
54static void estimation_timer(unsigned long arg)
55{
56 struct ip_vs_estimator *e;
57 struct ip_vs_stats *s;
58 u32 n_conns;
59 u32 n_inpkts, n_outpkts;
60 u64 n_inbytes, n_outbytes;
61 u32 rate;
62
63 spin_lock(&est_lock);
64 list_for_each_entry(e, &est_list, list) {
65 s = container_of(e, struct ip_vs_stats, est);
66
67 spin_lock(&s->lock);
68 n_conns = s->conns;
69 n_inpkts = s->inpkts;
70 n_outpkts = s->outpkts;
71 n_inbytes = s->inbytes;
72 n_outbytes = s->outbytes;
73
74 /* scaled by 2^10, but divided 2 seconds */
75 rate = (n_conns - e->last_conns)<<9;
76 e->last_conns = n_conns;
77 e->cps += ((long)rate - (long)e->cps)>>2;
78 s->cps = (e->cps+0x1FF)>>10;
79
80 rate = (n_inpkts - e->last_inpkts)<<9;
81 e->last_inpkts = n_inpkts;
82 e->inpps += ((long)rate - (long)e->inpps)>>2;
83 s->inpps = (e->inpps+0x1FF)>>10;
84
85 rate = (n_outpkts - e->last_outpkts)<<9;
86 e->last_outpkts = n_outpkts;
87 e->outpps += ((long)rate - (long)e->outpps)>>2;
88 s->outpps = (e->outpps+0x1FF)>>10;
89
90 rate = (n_inbytes - e->last_inbytes)<<4;
91 e->last_inbytes = n_inbytes;
92 e->inbps += ((long)rate - (long)e->inbps)>>2;
93 s->inbps = (e->inbps+0xF)>>5;
94
95 rate = (n_outbytes - e->last_outbytes)<<4;
96 e->last_outbytes = n_outbytes;
97 e->outbps += ((long)rate - (long)e->outbps)>>2;
98 s->outbps = (e->outbps+0xF)>>5;
99 spin_unlock(&s->lock);
100 }
101 spin_unlock(&est_lock);
102 mod_timer(&est_timer, jiffies + 2*HZ);
103}
104
105void ip_vs_new_estimator(struct ip_vs_stats *stats)
106{
107 struct ip_vs_estimator *est = &stats->est;
108
109 INIT_LIST_HEAD(&est->list);
110
111 est->last_conns = stats->conns;
112 est->cps = stats->cps<<10;
113
114 est->last_inpkts = stats->inpkts;
115 est->inpps = stats->inpps<<10;
116
117 est->last_outpkts = stats->outpkts;
118 est->outpps = stats->outpps<<10;
119
120 est->last_inbytes = stats->inbytes;
121 est->inbps = stats->inbps<<5;
122
123 est->last_outbytes = stats->outbytes;
124 est->outbps = stats->outbps<<5;
125
126 spin_lock_bh(&est_lock);
127 if (list_empty(&est_list))
128 mod_timer(&est_timer, jiffies + 2 * HZ);
129 list_add(&est->list, &est_list);
130 spin_unlock_bh(&est_lock);
131}
132
133void ip_vs_kill_estimator(struct ip_vs_stats *stats)
134{
135 struct ip_vs_estimator *est = &stats->est;
136
137 spin_lock_bh(&est_lock);
138 list_del(&est->list);
139 while (list_empty(&est_list) && try_to_del_timer_sync(&est_timer) < 0) {
140 spin_unlock_bh(&est_lock);
141 cpu_relax();
142 spin_lock_bh(&est_lock);
143 }
144 spin_unlock_bh(&est_lock);
145}
146
147void ip_vs_zero_estimator(struct ip_vs_stats *stats)
148{
149 struct ip_vs_estimator *est = &stats->est;
150
151 /* set counters zero, caller must hold the stats->lock lock */
152 est->last_inbytes = 0;
153 est->last_outbytes = 0;
154 est->last_conns = 0;
155 est->last_inpkts = 0;
156 est->last_outpkts = 0;
157 est->cps = 0;
158 est->inpps = 0;
159 est->outpps = 0;
160 est->inbps = 0;
161 est->outbps = 0;
162}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
deleted file mode 100644
index c1c758e4f733..000000000000
--- a/net/ipv4/ipvs/ip_vs_ftp.c
+++ /dev/null
@@ -1,393 +0,0 @@
1/*
2 * ip_vs_ftp.c: IPVS ftp application module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * Changes:
7 *
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
 15 * is that the ip_vs_ftp module handles the reverse direction compared to ip_masq_ftp.
16 *
17 * IP_MASQ_FTP ftp masquerading module
18 *
19 * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
20 *
21 * Author: Wouter Gadeyne
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/kernel.h>
28#include <linux/skbuff.h>
29#include <linux/in.h>
30#include <linux/ip.h>
31#include <linux/netfilter.h>
32#include <net/protocol.h>
33#include <net/tcp.h>
34#include <asm/unaligned.h>
35
36#include <net/ip_vs.h>
37
38
39#define SERVER_STRING "227 Entering Passive Mode ("
40#define CLIENT_STRING "PORT "
41
42
43/*
 44 * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by the helper.
45 * First port is set to the default port.
46 */
47static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
48module_param_array(ports, ushort, NULL, 0);
49MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
50
51
52/* Dummy variable */
53static int ip_vs_ftp_pasv;
54
55
56static int
57ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
58{
59 return 0;
60}
61
62
63static int
64ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
65{
66 return 0;
67}
68
69
70/*
71 * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
72 * with the "pattern" and terminated with the "term" character.
73 * <addr,port> is in network order.
74 */
75static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
76 const char *pattern, size_t plen, char term,
77 __be32 *addr, __be16 *port,
78 char **start, char **end)
79{
80 unsigned char p[6];
81 int i = 0;
82
83 if (data_limit - data < plen) {
84 /* check if there is partial match */
85 if (strnicmp(data, pattern, data_limit - data) == 0)
86 return -1;
87 else
88 return 0;
89 }
90
91 if (strnicmp(data, pattern, plen) != 0) {
92 return 0;
93 }
94 *start = data + plen;
95
96 for (data = *start; *data != term; data++) {
97 if (data == data_limit)
98 return -1;
99 }
100 *end = data;
101
102 memset(p, 0, sizeof(p));
103 for (data = *start; data != *end; data++) {
104 if (*data >= '0' && *data <= '9') {
105 p[i] = p[i]*10 + *data - '0';
106 } else if (*data == ',' && i < 5) {
107 i++;
108 } else {
109 /* unexpected character */
110 return -1;
111 }
112 }
113
114 if (i != 5)
115 return -1;
116
117 *addr = get_unaligned((__be32 *)p);
118 *port = get_unaligned((__be16 *)(p + 4));
119 return 1;
120}
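A worked example of the parsing above, using a made-up PASV reply:

/*
 * "227 Entering Passive Mode (192,168,10,5,4,1)" yields p[] = {192,168,10,5,4,1},
 * so *addr becomes 192.168.10.5 in network order and *port becomes the byte
 * pair {4,1}, i.e. TCP port 4*256 + 1 = 1025.
 */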
121
122
123/*
124 * Look at outgoing ftp packets to catch the response to a PASV command
125 * from the server (inside-to-outside).
126 * When we see one, we build a connection entry with the client address,
127 * client port 0 (unknown at the moment), the server address and the
128 * server port. Mark the current connection entry as a control channel
129 * of the new entry. All this work is done just so that the data connection
130 * can be scheduled to the right server later.
131 *
132 * The outgoing packet should be something like
133 * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
134 * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
135 */
136static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
137 struct sk_buff *skb, int *diff)
138{
139 struct iphdr *iph;
140 struct tcphdr *th;
141 char *data, *data_limit;
142 char *start, *end;
143 __be32 from;
144 __be16 port;
145 struct ip_vs_conn *n_cp;
146 char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
147 unsigned buf_len;
148 int ret;
149
150 *diff = 0;
151
152 /* Only useful for established sessions */
153 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
154 return 1;
155
156 /* Linear packets are much easier to deal with. */
157 if (!skb_make_writable(skb, skb->len))
158 return 0;
159
160 if (cp->app_data == &ip_vs_ftp_pasv) {
161 iph = ip_hdr(skb);
162 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
163 data = (char *)th + (th->doff << 2);
164 data_limit = skb_tail_pointer(skb);
165
166 if (ip_vs_ftp_get_addrport(data, data_limit,
167 SERVER_STRING,
168 sizeof(SERVER_STRING)-1, ')',
169 &from, &port,
170 &start, &end) != 1)
171 return 1;
172
173 IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
174 "%u.%u.%u.%u:%d detected\n",
175 NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0);
176
177 /*
178	 * Now update or create a connection entry for it
179 */
180 n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
181 cp->caddr, 0);
182 if (!n_cp) {
183 n_cp = ip_vs_conn_new(IPPROTO_TCP,
184 cp->caddr, 0,
185 cp->vaddr, port,
186 from, port,
187 IP_VS_CONN_F_NO_CPORT,
188 cp->dest);
189 if (!n_cp)
190 return 0;
191
192 /* add its controller */
193 ip_vs_control_add(n_cp, cp);
194 }
195
196 /*
197 * Replace the old passive address with the new one
198 */
199 from = n_cp->vaddr;
200 port = n_cp->vport;
201 sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
202 (ntohs(port)>>8)&255, ntohs(port)&255);
203 buf_len = strlen(buf);
204
205 /*
206 * Calculate required delta-offset to keep TCP happy
207 */
208 *diff = buf_len - (end-start);
209
210 if (*diff == 0) {
211 /* simply replace it with new passive address */
212 memcpy(start, buf, buf_len);
213 ret = 1;
214 } else {
215 ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start,
216 end-start, buf, buf_len);
217 }
218
219 cp->app_data = NULL;
220 ip_vs_tcp_conn_listen(n_cp);
221 ip_vs_conn_put(n_cp);
222 return ret;
223 }
224 return 1;
225}
226
227
228/*
229 * Look at incoming ftp packets to catch the PASV/PORT command
230 * (outside-to-inside).
231 *
232 * The incoming packet having the PORT command should be something like
233 * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
234 * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
235 * In this case, we create a connection entry using the client address and
236 * port, so that the active ftp data connection from the server can reach
237 * the client.
238 */
239static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
240 struct sk_buff *skb, int *diff)
241{
242 struct iphdr *iph;
243 struct tcphdr *th;
244 char *data, *data_start, *data_limit;
245 char *start, *end;
246 __be32 to;
247 __be16 port;
248 struct ip_vs_conn *n_cp;
249
250 /* no diff required for incoming packets */
251 *diff = 0;
252
253 /* Only useful for established sessions */
254 if (cp->state != IP_VS_TCP_S_ESTABLISHED)
255 return 1;
256
257 /* Linear packets are much easier to deal with. */
258 if (!skb_make_writable(skb, skb->len))
259 return 0;
260
261 /*
262 * Detecting whether it is passive
263 */
264 iph = ip_hdr(skb);
265 th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
266
267	/* Since there may be OPTIONS in the TCP packet and HLEN is the
268	   length of the header in 32-bit words, the data address can be
269	   computed accurately as th + HLEN*4 */
270 data = data_start = (char *)th + (th->doff << 2);
271 data_limit = skb_tail_pointer(skb);
272
273 while (data <= data_limit - 6) {
274 if (strnicmp(data, "PASV\r\n", 6) == 0) {
275 /* Passive mode on */
276 IP_VS_DBG(7, "got PASV at %td of %td\n",
277 data - data_start,
278 data_limit - data_start);
279 cp->app_data = &ip_vs_ftp_pasv;
280 return 1;
281 }
282 data++;
283 }
284
285 /*
286	 * To support a virtual FTP server, the scenario is as follows:
287 * FTP client ----> Load Balancer ----> FTP server
288 * First detect the port number in the application data,
289 * then create a new connection entry for the coming data
290 * connection.
291 */
292 if (ip_vs_ftp_get_addrport(data_start, data_limit,
293 CLIENT_STRING, sizeof(CLIENT_STRING)-1,
294 '\r', &to, &port,
295 &start, &end) != 1)
296 return 1;
297
298 IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n",
299 NIPQUAD(to), ntohs(port));
300
301 /* Passive mode off */
302 cp->app_data = NULL;
303
304 /*
305 * Now update or create a connection entry for it
306 */
307 IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
308 ip_vs_proto_name(iph->protocol),
309 NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0);
310
311 n_cp = ip_vs_conn_in_get(iph->protocol,
312 to, port,
313 cp->vaddr, htons(ntohs(cp->vport)-1));
314 if (!n_cp) {
315 n_cp = ip_vs_conn_new(IPPROTO_TCP,
316 to, port,
317 cp->vaddr, htons(ntohs(cp->vport)-1),
318 cp->daddr, htons(ntohs(cp->dport)-1),
319 0,
320 cp->dest);
321 if (!n_cp)
322 return 0;
323
324 /* add its controller */
325 ip_vs_control_add(n_cp, cp);
326 }
327
328 /*
329 * Move tunnel to listen state
330 */
331 ip_vs_tcp_conn_listen(n_cp);
332 ip_vs_conn_put(n_cp);
333
334 return 1;
335}
336
337
338static struct ip_vs_app ip_vs_ftp = {
339 .name = "ftp",
340 .type = IP_VS_APP_TYPE_FTP,
341 .protocol = IPPROTO_TCP,
342 .module = THIS_MODULE,
343 .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
344 .init_conn = ip_vs_ftp_init_conn,
345 .done_conn = ip_vs_ftp_done_conn,
346 .bind_conn = NULL,
347 .unbind_conn = NULL,
348 .pkt_out = ip_vs_ftp_out,
349 .pkt_in = ip_vs_ftp_in,
350};
351
352
353/*
354 * ip_vs_ftp initialization
355 */
356static int __init ip_vs_ftp_init(void)
357{
358 int i, ret;
359 struct ip_vs_app *app = &ip_vs_ftp;
360
361 ret = register_ip_vs_app(app);
362 if (ret)
363 return ret;
364
365 for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
366 if (!ports[i])
367 continue;
368 ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
369 if (ret)
370 break;
371 IP_VS_INFO("%s: loaded support on port[%d] = %d\n",
372 app->name, i, ports[i]);
373 }
374
375 if (ret)
376 unregister_ip_vs_app(app);
377
378 return ret;
379}
380
381
382/*
383 * ip_vs_ftp finish.
384 */
385static void __exit ip_vs_ftp_exit(void)
386{
387 unregister_ip_vs_app(&ip_vs_ftp);
388}
389
390
391module_init(ip_vs_ftp_init);
392module_exit(ip_vs_ftp_exit);
393MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
deleted file mode 100644
index 7a6a319f544a..000000000000
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ /dev/null
@@ -1,571 +0,0 @@
1/*
2 * IPVS: Locality-Based Least-Connection scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Martin Hamilton : fixed the terrible locking bugs
13 * *lock(tbl->lock) ==> *lock(&tbl->lock)
 14 * Wensong Zhang : fixed the uninitialized tbl->lock bug
15 * Wensong Zhang : added doing full expiration check to
16 * collect stale entries of 24+ hours when
17 * no partial expire check in a half hour
18 * Julian Anastasov : replaced del_timer call with del_timer_sync
19 * to avoid the possible race between timer
20 * handler and del_timer thread in SMP
21 *
22 */
23
24/*
25 * The lblc algorithm is as follows (pseudo code):
26 *
27 * if cachenode[dest_ip] is null then
28 * n, cachenode[dest_ip] <- {weighted least-conn node};
29 * else
30 * n <- cachenode[dest_ip];
31 * if (n is dead) OR
32 * (n.conns>n.weight AND
33 * there is a node m with m.conns<m.weight/2) then
34 * n, cachenode[dest_ip] <- {weighted least-conn node};
35 *
36 * return n;
37 *
38 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
39 * me to write this module.
40 */
41
42#include <linux/ip.h>
43#include <linux/module.h>
44#include <linux/kernel.h>
45#include <linux/skbuff.h>
46#include <linux/jiffies.h>
47
48/* for sysctl */
49#include <linux/fs.h>
50#include <linux/sysctl.h>
51
52#include <net/ip_vs.h>
53
54
55/*
56 * It is for garbage collection of stale IPVS lblc entries,
57 * when the table is full.
58 */
59#define CHECK_EXPIRE_INTERVAL (60*HZ)
60#define ENTRY_TIMEOUT (6*60*HZ)
61
62/*
63 * It is for full expiration check.
64 * When there is no partial expiration check (garbage collection)
65 * in a half hour, do a full expiration check to collect stale
66 * entries that haven't been touched for a day.
67 */
68#define COUNT_FOR_FULL_EXPIRATION 30
69static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
70
71
72/*
73 * for IPVS lblc entry hash table
74 */
75#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
76#define CONFIG_IP_VS_LBLC_TAB_BITS 10
77#endif
78#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
79#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
80#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
81
82
83/*
84 * IPVS lblc entry represents an association between destination
85 * IP address and its destination server
86 */
87struct ip_vs_lblc_entry {
88 struct list_head list;
89 __be32 addr; /* destination IP address */
90 struct ip_vs_dest *dest; /* real server (cache) */
91 unsigned long lastuse; /* last used time */
92};
93
94
95/*
96 * IPVS lblc hash table
97 */
98struct ip_vs_lblc_table {
99 rwlock_t lock; /* lock for this table */
100 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
101 atomic_t entries; /* number of entries */
102 int max_size; /* maximum size of entries */
103 struct timer_list periodic_timer; /* collect stale entries */
104 int rover; /* rover for expire check */
105 int counter; /* counter for no expire */
106};
107
108
109/*
110 * IPVS LBLC sysctl table
111 */
112
113static ctl_table vs_vars_table[] = {
114 {
115 .procname = "lblc_expiration",
116 .data = &sysctl_ip_vs_lblc_expiration,
117 .maxlen = sizeof(int),
118 .mode = 0644,
119 .proc_handler = &proc_dointvec_jiffies,
120 },
121 { .ctl_name = 0 }
122};
123
124static struct ctl_table_header * sysctl_header;
125
126/*
127 * new/free an ip_vs_lblc_entry, which is a mapping of a destination
128 * IP address to a server.
129 */
130static inline struct ip_vs_lblc_entry *
131ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest)
132{
133 struct ip_vs_lblc_entry *en;
134
135 en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
136 if (en == NULL) {
137 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
138 return NULL;
139 }
140
141 INIT_LIST_HEAD(&en->list);
142 en->addr = daddr;
143
144 atomic_inc(&dest->refcnt);
145 en->dest = dest;
146
147 return en;
148}
149
150
151static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
152{
153 list_del(&en->list);
154 /*
155	 * We don't kfree dest because it is referred to either by its service
156	 * or by the trash dest list.
157 */
158 atomic_dec(&en->dest->refcnt);
159 kfree(en);
160}
161
162
163/*
164 * Returns hash value for IPVS LBLC entry
165 */
166static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
167{
168 return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
169}
170
171
172/*
173 * Hash an entry in the ip_vs_lblc_table.
174 * returns bool success.
175 */
176static int
177ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
178{
179 unsigned hash;
180
181 if (!list_empty(&en->list)) {
182 IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
183 "called from %p\n", __builtin_return_address(0));
184 return 0;
185 }
186
187 /*
188 * Hash by destination IP address
189 */
190 hash = ip_vs_lblc_hashkey(en->addr);
191
192 write_lock(&tbl->lock);
193 list_add(&en->list, &tbl->bucket[hash]);
194 atomic_inc(&tbl->entries);
195 write_unlock(&tbl->lock);
196
197 return 1;
198}
199
200
201/*
202 * Get ip_vs_lblc_entry associated with supplied parameters.
203 */
204static inline struct ip_vs_lblc_entry *
205ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
206{
207 unsigned hash;
208 struct ip_vs_lblc_entry *en;
209
210 hash = ip_vs_lblc_hashkey(addr);
211
212 read_lock(&tbl->lock);
213
214 list_for_each_entry(en, &tbl->bucket[hash], list) {
215 if (en->addr == addr) {
216 /* HIT */
217 read_unlock(&tbl->lock);
218 return en;
219 }
220 }
221
222 read_unlock(&tbl->lock);
223
224 return NULL;
225}
226
227
228/*
229 * Flush all the entries of the specified table.
230 */
231static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
232{
233 int i;
234 struct ip_vs_lblc_entry *en, *nxt;
235
236 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
237 write_lock(&tbl->lock);
238 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
239 ip_vs_lblc_free(en);
240 atomic_dec(&tbl->entries);
241 }
242 write_unlock(&tbl->lock);
243 }
244}
245
246
247static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
248{
249 unsigned long now = jiffies;
250 int i, j;
251 struct ip_vs_lblc_entry *en, *nxt;
252
253 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
254 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
255
256 write_lock(&tbl->lock);
257 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
258 if (time_before(now,
259 en->lastuse + sysctl_ip_vs_lblc_expiration))
260 continue;
261
262 ip_vs_lblc_free(en);
263 atomic_dec(&tbl->entries);
264 }
265 write_unlock(&tbl->lock);
266 }
267 tbl->rover = j;
268}
269
270
271/*
272 * Periodical timer handler for IPVS lblc table
273 * It is used to collect stale entries when the number of entries
274 * exceeds the maximum size of the table.
275 *
276 * Fixme: we probably need more complicated algorithm to collect
277 * entries that have not been used for a long time even
278 * if the number of entries doesn't exceed the maximum size
279 * of the table.
280 * The full expiration check is for this purpose now.
281 */
282static void ip_vs_lblc_check_expire(unsigned long data)
283{
284 struct ip_vs_lblc_table *tbl;
285 unsigned long now = jiffies;
286 int goal;
287 int i, j;
288 struct ip_vs_lblc_entry *en, *nxt;
289
290 tbl = (struct ip_vs_lblc_table *)data;
291
292 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
293 /* do full expiration check */
294 ip_vs_lblc_full_check(tbl);
295 tbl->counter = 1;
296 goto out;
297 }
298
299 if (atomic_read(&tbl->entries) <= tbl->max_size) {
300 tbl->counter++;
301 goto out;
302 }
303
304 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
305 if (goal > tbl->max_size/2)
306 goal = tbl->max_size/2;
307
308 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
309 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
310
311 write_lock(&tbl->lock);
312 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
313 if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
314 continue;
315
316 ip_vs_lblc_free(en);
317 atomic_dec(&tbl->entries);
318 goal--;
319 }
320 write_unlock(&tbl->lock);
321 if (goal <= 0)
322 break;
323 }
324 tbl->rover = j;
325
326 out:
327 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
328}
329
330
331static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
332{
333 int i;
334 struct ip_vs_lblc_table *tbl;
335
336 /*
337 * Allocate the ip_vs_lblc_table for this service
338 */
339 tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
340 if (tbl == NULL) {
341 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
342 return -ENOMEM;
343 }
344 svc->sched_data = tbl;
345 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
346 "current service\n",
347 sizeof(struct ip_vs_lblc_table));
348
349 /*
350 * Initialize the hash buckets
351 */
352 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
353 INIT_LIST_HEAD(&tbl->bucket[i]);
354 }
355 rwlock_init(&tbl->lock);
356 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
357 tbl->rover = 0;
358 tbl->counter = 1;
359
360 /*
361 * Hook periodic timer for garbage collection
362 */
363 setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
364 (unsigned long)tbl);
365 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
366 add_timer(&tbl->periodic_timer);
367
368 return 0;
369}
370
371
372static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
373{
374 struct ip_vs_lblc_table *tbl = svc->sched_data;
375
376 /* remove periodic timer */
377 del_timer_sync(&tbl->periodic_timer);
378
379 /* got to clean up table entries here */
380 ip_vs_lblc_flush(tbl);
381
382 /* release the table itself */
383 kfree(svc->sched_data);
384 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
385 sizeof(struct ip_vs_lblc_table));
386
387 return 0;
388}
389
390
391static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
392{
393 return 0;
394}
395
396
397static inline struct ip_vs_dest *
398__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
399{
400 struct ip_vs_dest *dest, *least;
401 int loh, doh;
402
403 /*
404 * We think the overhead of processing active connections is fifty
 405	 * times higher than that of inactive connections on average. (This
 406	 * fifty times might not be accurate; we will change it later.) We
407 * use the following formula to estimate the overhead:
408 * dest->activeconns*50 + dest->inactconns
409 * and the load:
410 * (dest overhead) / dest->weight
411 *
412 * Remember -- no floats in kernel mode!!!
413 * The comparison of h1*w2 > h2*w1 is equivalent to that of
414 * h1/w1 > h2/w2
415 * if every weight is larger than zero.
416 *
417 * The server with weight=0 is quiesced and will not receive any
418 * new connection.
419 */
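	/*
	 * Worked example of the cross-multiplication (made-up numbers):
	 * A: 2 active + 20 inactive conns, weight 3 -> overhead 2*50+20 = 120
	 * B: 1 active + 40 inactive conns, weight 2 -> overhead 1*50+40 =  90
	 * 120*2 = 240 is not greater than 90*3 = 270, so A stays "least",
	 * consistent with A's load 120/3 = 40 being below B's 90/2 = 45.
	 */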
420 list_for_each_entry(dest, &svc->destinations, n_list) {
421 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
422 continue;
423 if (atomic_read(&dest->weight) > 0) {
424 least = dest;
425 loh = atomic_read(&least->activeconns) * 50
426 + atomic_read(&least->inactconns);
427 goto nextstage;
428 }
429 }
430 return NULL;
431
432 /*
433 * Find the destination with the least load.
434 */
435 nextstage:
436 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
437 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
438 continue;
439
440 doh = atomic_read(&dest->activeconns) * 50
441 + atomic_read(&dest->inactconns);
442 if (loh * atomic_read(&dest->weight) >
443 doh * atomic_read(&least->weight)) {
444 least = dest;
445 loh = doh;
446 }
447 }
448
449 IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
450 "activeconns %d refcnt %d weight %d overhead %d\n",
451 NIPQUAD(least->addr), ntohs(least->port),
452 atomic_read(&least->activeconns),
453 atomic_read(&least->refcnt),
454 atomic_read(&least->weight), loh);
455
456 return least;
457}
458
459
460/*
461 * If this destination server is overloaded and there is a less loaded
462 * server, then return true.
463 */
464static inline int
465is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
466{
467 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
468 struct ip_vs_dest *d;
469
470 list_for_each_entry(d, &svc->destinations, n_list) {
471 if (atomic_read(&d->activeconns)*2
472 < atomic_read(&d->weight)) {
473 return 1;
474 }
475 }
476 }
477 return 0;
478}
479
480
481/*
482 * Locality-Based (weighted) Least-Connection scheduling
483 */
484static struct ip_vs_dest *
485ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
486{
487 struct ip_vs_dest *dest;
488 struct ip_vs_lblc_table *tbl;
489 struct ip_vs_lblc_entry *en;
490 struct iphdr *iph = ip_hdr(skb);
491
492 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
493
494 tbl = (struct ip_vs_lblc_table *)svc->sched_data;
495 en = ip_vs_lblc_get(tbl, iph->daddr);
496 if (en == NULL) {
497 dest = __ip_vs_wlc_schedule(svc, iph);
498 if (dest == NULL) {
499 IP_VS_DBG(1, "no destination available\n");
500 return NULL;
501 }
502 en = ip_vs_lblc_new(iph->daddr, dest);
503 if (en == NULL) {
504 return NULL;
505 }
506 ip_vs_lblc_hash(tbl, en);
507 } else {
508 dest = en->dest;
509 if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
510 || atomic_read(&dest->weight) <= 0
511 || is_overloaded(dest, svc)) {
512 dest = __ip_vs_wlc_schedule(svc, iph);
513 if (dest == NULL) {
514 IP_VS_DBG(1, "no destination available\n");
515 return NULL;
516 }
517 atomic_dec(&en->dest->refcnt);
518 atomic_inc(&dest->refcnt);
519 en->dest = dest;
520 }
521 }
522 en->lastuse = jiffies;
523
524 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
525 "--> server %u.%u.%u.%u:%d\n",
526 NIPQUAD(en->addr),
527 NIPQUAD(dest->addr),
528 ntohs(dest->port));
529
530 return dest;
531}
532
533
534/*
535 * IPVS LBLC Scheduler structure
536 */
537static struct ip_vs_scheduler ip_vs_lblc_scheduler =
538{
539 .name = "lblc",
540 .refcnt = ATOMIC_INIT(0),
541 .module = THIS_MODULE,
542 .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
543 .init_service = ip_vs_lblc_init_svc,
544 .done_service = ip_vs_lblc_done_svc,
545 .update_service = ip_vs_lblc_update_svc,
546 .schedule = ip_vs_lblc_schedule,
547};
548
549
550static int __init ip_vs_lblc_init(void)
551{
552 int ret;
553
554 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
555 ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
556 if (ret)
557 unregister_sysctl_table(sysctl_header);
558 return ret;
559}
560
561
562static void __exit ip_vs_lblc_cleanup(void)
563{
564 unregister_sysctl_table(sysctl_header);
565 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
566}
567
568
569module_init(ip_vs_lblc_init);
570module_exit(ip_vs_lblc_cleanup);
571MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
deleted file mode 100644
index c234e73968a6..000000000000
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ /dev/null
@@ -1,760 +0,0 @@
1/*
2 * IPVS: Locality-Based Least-Connection with Replication scheduler
3 *
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Julian Anastasov : Added the missing (dest->weight>0)
13 * condition in the ip_vs_dest_set_max.
14 *
15 */
16
17/*
18 * The lblc/r algorithm is as follows (pseudo code):
19 *
20 * if serverSet[dest_ip] is null then
21 * n, serverSet[dest_ip] <- {weighted least-conn node};
22 * else
23 * n <- {least-conn (alive) node in serverSet[dest_ip]};
24 * if (n is null) OR
25 * (n.conns>n.weight AND
26 * there is a node m with m.conns<m.weight/2) then
27 * n <- {weighted least-conn node};
28 * add n to serverSet[dest_ip];
29 * if |serverSet[dest_ip]| > 1 AND
30 * now - serverSet[dest_ip].lastMod > T then
31 * m <- {most conn node in serverSet[dest_ip]};
32 * remove m from serverSet[dest_ip];
33 * if serverSet[dest_ip] changed then
34 * serverSet[dest_ip].lastMod <- now;
35 *
36 * return n;
37 *
38 */
39
40#include <linux/ip.h>
41#include <linux/module.h>
42#include <linux/kernel.h>
43#include <linux/skbuff.h>
44#include <linux/jiffies.h>
45
46/* for sysctl */
47#include <linux/fs.h>
48#include <linux/sysctl.h>
49#include <net/net_namespace.h>
50
51#include <net/ip_vs.h>
52
53
54/*
55 * It is for garbage collection of stale IPVS lblcr entries,
56 * when the table is full.
57 */
58#define CHECK_EXPIRE_INTERVAL (60*HZ)
59#define ENTRY_TIMEOUT (6*60*HZ)
60
61/*
62 * It is for full expiration check.
63 * When there is no partial expiration check (garbage collection)
64 * in a half hour, do a full expiration check to collect stale
65 * entries that haven't been touched for a day.
66 */
67#define COUNT_FOR_FULL_EXPIRATION 30
68static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
69
70
71/*
72 * for IPVS lblcr entry hash table
73 */
74#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
75#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
76#endif
77#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
78#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
79#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
80
81
82/*
83 * IPVS destination set structure and operations
84 */
85struct ip_vs_dest_list {
86 struct ip_vs_dest_list *next; /* list link */
87 struct ip_vs_dest *dest; /* destination server */
88};
89
90struct ip_vs_dest_set {
91 atomic_t size; /* set size */
92 unsigned long lastmod; /* last modified time */
93 struct ip_vs_dest_list *list; /* destination list */
94 rwlock_t lock; /* lock for this list */
95};
96
97
98static struct ip_vs_dest_list *
99ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
100{
101 struct ip_vs_dest_list *e;
102
103 for (e=set->list; e!=NULL; e=e->next) {
104 if (e->dest == dest)
105 /* already existed */
106 return NULL;
107 }
108
109 e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
110 if (e == NULL) {
111 IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
112 return NULL;
113 }
114
115 atomic_inc(&dest->refcnt);
116 e->dest = dest;
117
118 /* link it to the list */
119 write_lock(&set->lock);
120 e->next = set->list;
121 set->list = e;
122 atomic_inc(&set->size);
123 write_unlock(&set->lock);
124
125 set->lastmod = jiffies;
126 return e;
127}
128
129static void
130ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
131{
132 struct ip_vs_dest_list *e, **ep;
133
134 write_lock(&set->lock);
135 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
136 if (e->dest == dest) {
137 /* HIT */
138 *ep = e->next;
139 atomic_dec(&set->size);
140 set->lastmod = jiffies;
141 atomic_dec(&e->dest->refcnt);
142 kfree(e);
143 break;
144 }
145 ep = &e->next;
146 }
147 write_unlock(&set->lock);
148}
149
150static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
151{
152 struct ip_vs_dest_list *e, **ep;
153
154 write_lock(&set->lock);
155 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
156 *ep = e->next;
157 /*
 158		 * We don't kfree dest because it is referred either
159 * by its service or by the trash dest list.
160 */
161 atomic_dec(&e->dest->refcnt);
162 kfree(e);
163 }
164 write_unlock(&set->lock);
165}
166
167/* get weighted least-connection node in the destination set */
168static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
169{
170 register struct ip_vs_dest_list *e;
171 struct ip_vs_dest *dest, *least;
172 int loh, doh;
173
174 if (set == NULL)
175 return NULL;
176
177 read_lock(&set->lock);
178 /* select the first destination server, whose weight > 0 */
179 for (e=set->list; e!=NULL; e=e->next) {
180 least = e->dest;
181 if (least->flags & IP_VS_DEST_F_OVERLOAD)
182 continue;
183
184 if ((atomic_read(&least->weight) > 0)
185 && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
186 loh = atomic_read(&least->activeconns) * 50
187 + atomic_read(&least->inactconns);
188 goto nextstage;
189 }
190 }
191 read_unlock(&set->lock);
192 return NULL;
193
194 /* find the destination with the weighted least load */
195 nextstage:
196 for (e=e->next; e!=NULL; e=e->next) {
197 dest = e->dest;
198 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
199 continue;
200
201 doh = atomic_read(&dest->activeconns) * 50
202 + atomic_read(&dest->inactconns);
203 if ((loh * atomic_read(&dest->weight) >
204 doh * atomic_read(&least->weight))
205 && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
206 least = dest;
207 loh = doh;
208 }
209 }
210 read_unlock(&set->lock);
211
212 IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
213 "activeconns %d refcnt %d weight %d overhead %d\n",
214 NIPQUAD(least->addr), ntohs(least->port),
215 atomic_read(&least->activeconns),
216 atomic_read(&least->refcnt),
217 atomic_read(&least->weight), loh);
218 return least;
219}
220
221
222/* get weighted most-connection node in the destination set */
223static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
224{
225 register struct ip_vs_dest_list *e;
226 struct ip_vs_dest *dest, *most;
227 int moh, doh;
228
229 if (set == NULL)
230 return NULL;
231
232 read_lock(&set->lock);
233 /* select the first destination server, whose weight > 0 */
234 for (e=set->list; e!=NULL; e=e->next) {
235 most = e->dest;
236 if (atomic_read(&most->weight) > 0) {
237 moh = atomic_read(&most->activeconns) * 50
238 + atomic_read(&most->inactconns);
239 goto nextstage;
240 }
241 }
242 read_unlock(&set->lock);
243 return NULL;
244
245 /* find the destination with the weighted most load */
246 nextstage:
247 for (e=e->next; e!=NULL; e=e->next) {
248 dest = e->dest;
249 doh = atomic_read(&dest->activeconns) * 50
250 + atomic_read(&dest->inactconns);
251 /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
252 if ((moh * atomic_read(&dest->weight) <
253 doh * atomic_read(&most->weight))
254 && (atomic_read(&dest->weight) > 0)) {
255 most = dest;
256 moh = doh;
257 }
258 }
259 read_unlock(&set->lock);
260
261 IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
262 "activeconns %d refcnt %d weight %d overhead %d\n",
263 NIPQUAD(most->addr), ntohs(most->port),
264 atomic_read(&most->activeconns),
265 atomic_read(&most->refcnt),
266 atomic_read(&most->weight), moh);
267 return most;
268}
269
270
271/*
272 * IPVS lblcr entry represents an association between destination
273 * IP address and its destination server set
274 */
275struct ip_vs_lblcr_entry {
276 struct list_head list;
277 __be32 addr; /* destination IP address */
278 struct ip_vs_dest_set set; /* destination server set */
279 unsigned long lastuse; /* last used time */
280};
281
282
283/*
284 * IPVS lblcr hash table
285 */
286struct ip_vs_lblcr_table {
287 rwlock_t lock; /* lock for this table */
288 struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
289 atomic_t entries; /* number of entries */
290 int max_size; /* maximum size of entries */
291 struct timer_list periodic_timer; /* collect stale entries */
292 int rover; /* rover for expire check */
293 int counter; /* counter for no expire */
294};
295
296
297/*
298 * IPVS LBLCR sysctl table
299 */
300
301static ctl_table vs_vars_table[] = {
302 {
303 .procname = "lblcr_expiration",
304 .data = &sysctl_ip_vs_lblcr_expiration,
305 .maxlen = sizeof(int),
306 .mode = 0644,
307 .proc_handler = &proc_dointvec_jiffies,
308 },
309 { .ctl_name = 0 }
310};
311
312static struct ctl_table_header * sysctl_header;
313
314/*
315 * new/free a ip_vs_lblcr_entry, which is a mapping of a destination
316 * IP address to a server.
317 */
318static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr)
319{
320 struct ip_vs_lblcr_entry *en;
321
322 en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
323 if (en == NULL) {
324 IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
325 return NULL;
326 }
327
328 INIT_LIST_HEAD(&en->list);
329 en->addr = daddr;
330
 331	/* initialize its dest set */
332 atomic_set(&(en->set.size), 0);
333 en->set.list = NULL;
334 rwlock_init(&en->set.lock);
335
336 return en;
337}
338
339
340static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
341{
342 list_del(&en->list);
343 ip_vs_dest_set_eraseall(&en->set);
344 kfree(en);
345}
346
347
348/*
349 * Returns hash value for IPVS LBLCR entry
350 */
351static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
352{
353 return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
354}
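/*
 * A stand-alone user-space sketch of the same hashing scheme (not from
 * the kernel tree): the multiplier 2654435761 is close to 2^32 divided
 * by the golden ratio (Knuth-style multiplicative hashing), so nearby
 * addresses land in well-spread buckets.  TAB_BITS mirrors the default
 * of 10 above.
 */
#include <stdio.h>
#include <stdint.h>

#define TAB_BITS	10			/* default CONFIG_IP_VS_LBLCR_TAB_BITS */
#define TAB_MASK	((1u << TAB_BITS) - 1)

/* addr is in host byte order; the kernel applies ntohl() first */
static unsigned int hashkey(uint32_t addr)
{
	return (addr * 2654435761UL) & TAB_MASK;
}

int main(void)
{
	uint32_t a = 0xc0000201;	/* 192.0.2.1 */
	uint32_t b = 0xc0000202;	/* 192.0.2.2 */

	printf("192.0.2.1 -> bucket %u\n", hashkey(a));
	printf("192.0.2.2 -> bucket %u\n", hashkey(b));
	return 0;
}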
355
356
357/*
358 * Hash an entry in the ip_vs_lblcr_table.
359 * returns bool success.
360 */
361static int
362ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
363{
364 unsigned hash;
365
366 if (!list_empty(&en->list)) {
367 IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
368 "called from %p\n", __builtin_return_address(0));
369 return 0;
370 }
371
372 /*
373 * Hash by destination IP address
374 */
375 hash = ip_vs_lblcr_hashkey(en->addr);
376
377 write_lock(&tbl->lock);
378 list_add(&en->list, &tbl->bucket[hash]);
379 atomic_inc(&tbl->entries);
380 write_unlock(&tbl->lock);
381
382 return 1;
383}
384
385
386/*
387 * Get ip_vs_lblcr_entry associated with supplied parameters.
388 */
389static inline struct ip_vs_lblcr_entry *
390ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
391{
392 unsigned hash;
393 struct ip_vs_lblcr_entry *en;
394
395 hash = ip_vs_lblcr_hashkey(addr);
396
397 read_lock(&tbl->lock);
398
399 list_for_each_entry(en, &tbl->bucket[hash], list) {
400 if (en->addr == addr) {
401 /* HIT */
402 read_unlock(&tbl->lock);
403 return en;
404 }
405 }
406
407 read_unlock(&tbl->lock);
408
409 return NULL;
410}
411
412
413/*
414 * Flush all the entries of the specified table.
415 */
416static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
417{
418 int i;
419 struct ip_vs_lblcr_entry *en, *nxt;
420
421 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
422 write_lock(&tbl->lock);
423 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
424 ip_vs_lblcr_free(en);
425 atomic_dec(&tbl->entries);
426 }
427 write_unlock(&tbl->lock);
428 }
429}
430
431
432static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
433{
434 unsigned long now = jiffies;
435 int i, j;
436 struct ip_vs_lblcr_entry *en, *nxt;
437
438 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
439 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
440
441 write_lock(&tbl->lock);
442 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
443 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
444 now))
445 continue;
446
447 ip_vs_lblcr_free(en);
448 atomic_dec(&tbl->entries);
449 }
450 write_unlock(&tbl->lock);
451 }
452 tbl->rover = j;
453}
454
455
456/*
457 * Periodical timer handler for IPVS lblcr table
458 * It is used to collect stale entries when the number of entries
459 * exceeds the maximum size of the table.
460 *
 461 * Fixme: we probably need a more complicated algorithm to collect
462 * entries that have not been used for a long time even
463 * if the number of entries doesn't exceed the maximum size
464 * of the table.
465 * The full expiration check is for this purpose now.
466 */
467static void ip_vs_lblcr_check_expire(unsigned long data)
468{
469 struct ip_vs_lblcr_table *tbl;
470 unsigned long now = jiffies;
471 int goal;
472 int i, j;
473 struct ip_vs_lblcr_entry *en, *nxt;
474
475 tbl = (struct ip_vs_lblcr_table *)data;
476
477 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
478 /* do full expiration check */
479 ip_vs_lblcr_full_check(tbl);
480 tbl->counter = 1;
481 goto out;
482 }
483
484 if (atomic_read(&tbl->entries) <= tbl->max_size) {
485 tbl->counter++;
486 goto out;
487 }
488
489 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
490 if (goal > tbl->max_size/2)
491 goal = tbl->max_size/2;
492
493 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
494 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
495
496 write_lock(&tbl->lock);
497 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
498 if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
499 continue;
500
501 ip_vs_lblcr_free(en);
502 atomic_dec(&tbl->entries);
503 goal--;
504 }
505 write_unlock(&tbl->lock);
506 if (goal <= 0)
507 break;
508 }
509 tbl->rover = j;
510
511 out:
512 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
513}
514
515static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
516{
517 int i;
518 struct ip_vs_lblcr_table *tbl;
519
520 /*
521 * Allocate the ip_vs_lblcr_table for this service
522 */
523 tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
524 if (tbl == NULL) {
525 IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
526 return -ENOMEM;
527 }
528 svc->sched_data = tbl;
529 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
530 "current service\n",
531 sizeof(struct ip_vs_lblcr_table));
532
533 /*
534 * Initialize the hash buckets
535 */
536 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
537 INIT_LIST_HEAD(&tbl->bucket[i]);
538 }
539 rwlock_init(&tbl->lock);
540 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
541 tbl->rover = 0;
542 tbl->counter = 1;
543
544 /*
545 * Hook periodic timer for garbage collection
546 */
547 setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
548 (unsigned long)tbl);
549 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
550 add_timer(&tbl->periodic_timer);
551
552 return 0;
553}
554
555
556static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
557{
558 struct ip_vs_lblcr_table *tbl = svc->sched_data;
559
560 /* remove periodic timer */
561 del_timer_sync(&tbl->periodic_timer);
562
563 /* got to clean up table entries here */
564 ip_vs_lblcr_flush(tbl);
565
566 /* release the table itself */
567 kfree(svc->sched_data);
568 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
569 sizeof(struct ip_vs_lblcr_table));
570
571 return 0;
572}
573
574
575static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
576{
577 return 0;
578}
579
580
581static inline struct ip_vs_dest *
582__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
583{
584 struct ip_vs_dest *dest, *least;
585 int loh, doh;
586
587 /*
588 * We think the overhead of processing active connections is fifty
 589	 * times higher than that of inactive connections on average. (This
590 * fifty times might not be accurate, we will change it later.) We
591 * use the following formula to estimate the overhead:
592 * dest->activeconns*50 + dest->inactconns
593 * and the load:
594 * (dest overhead) / dest->weight
595 *
596 * Remember -- no floats in kernel mode!!!
597 * The comparison of h1*w2 > h2*w1 is equivalent to that of
598 * h1/w1 > h2/w2
599 * if every weight is larger than zero.
600 *
601 * The server with weight=0 is quiesced and will not receive any
602 * new connection.
603 */
604 list_for_each_entry(dest, &svc->destinations, n_list) {
605 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
606 continue;
607
608 if (atomic_read(&dest->weight) > 0) {
609 least = dest;
610 loh = atomic_read(&least->activeconns) * 50
611 + atomic_read(&least->inactconns);
612 goto nextstage;
613 }
614 }
615 return NULL;
616
617 /*
618 * Find the destination with the least load.
619 */
620 nextstage:
621 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
622 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
623 continue;
624
625 doh = atomic_read(&dest->activeconns) * 50
626 + atomic_read(&dest->inactconns);
627 if (loh * atomic_read(&dest->weight) >
628 doh * atomic_read(&least->weight)) {
629 least = dest;
630 loh = doh;
631 }
632 }
633
634 IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
635 "activeconns %d refcnt %d weight %d overhead %d\n",
636 NIPQUAD(least->addr), ntohs(least->port),
637 atomic_read(&least->activeconns),
638 atomic_read(&least->refcnt),
639 atomic_read(&least->weight), loh);
640
641 return least;
642}
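/*
 * A stand-alone user-space sketch (not from the kernel tree, numbers
 * made up) of the integer-only comparison described in the comment
 * above: the smallest overhead/weight ratio is found by
 * cross-multiplying, loh*w2 > doh*w1, so no floating point is needed.
 * All weights here are > 0; the kernel code additionally skips
 * quiesced (weight 0) and overloaded servers.
 */
#include <stdio.h>

struct srv { int activeconns, inactconns, weight; };

static int overhead(const struct srv *s)
{
	return s->activeconns * 50 + s->inactconns;
}

int main(void)
{
	struct srv srvs[3] = { {10, 5, 1}, {30, 2, 4}, {25, 40, 3} };
	int i, least = 0, loh = overhead(&srvs[0]);

	for (i = 1; i < 3; i++) {
		int doh = overhead(&srvs[i]);

		/* loh/lw > doh/dw  <=>  loh*dw > doh*lw  (weights > 0) */
		if (loh * srvs[i].weight > doh * srvs[least].weight) {
			least = i;
			loh = doh;
		}
	}
	printf("least loaded: server %d (overhead %d, weight %d)\n",
	       least, loh, srvs[least].weight);
	return 0;
}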
643
644
645/*
646 * If this destination server is overloaded and there is a less loaded
647 * server, then return true.
648 */
649static inline int
650is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
651{
652 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
653 struct ip_vs_dest *d;
654
655 list_for_each_entry(d, &svc->destinations, n_list) {
656 if (atomic_read(&d->activeconns)*2
657 < atomic_read(&d->weight)) {
658 return 1;
659 }
660 }
661 }
662 return 0;
663}
664
665
666/*
 667 *	  Locality-Based (weighted) Least-Connection with Replication scheduling
668 */
669static struct ip_vs_dest *
670ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
671{
672 struct ip_vs_dest *dest;
673 struct ip_vs_lblcr_table *tbl;
674 struct ip_vs_lblcr_entry *en;
675 struct iphdr *iph = ip_hdr(skb);
676
677 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
678
679 tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
680 en = ip_vs_lblcr_get(tbl, iph->daddr);
681 if (en == NULL) {
682 dest = __ip_vs_wlc_schedule(svc, iph);
683 if (dest == NULL) {
684 IP_VS_DBG(1, "no destination available\n");
685 return NULL;
686 }
687 en = ip_vs_lblcr_new(iph->daddr);
688 if (en == NULL) {
689 return NULL;
690 }
691 ip_vs_dest_set_insert(&en->set, dest);
692 ip_vs_lblcr_hash(tbl, en);
693 } else {
694 dest = ip_vs_dest_set_min(&en->set);
695 if (!dest || is_overloaded(dest, svc)) {
696 dest = __ip_vs_wlc_schedule(svc, iph);
697 if (dest == NULL) {
698 IP_VS_DBG(1, "no destination available\n");
699 return NULL;
700 }
701 ip_vs_dest_set_insert(&en->set, dest);
702 }
703 if (atomic_read(&en->set.size) > 1 &&
704 jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
705 struct ip_vs_dest *m;
706 m = ip_vs_dest_set_max(&en->set);
707 if (m)
708 ip_vs_dest_set_erase(&en->set, m);
709 }
710 }
711 en->lastuse = jiffies;
712
713 IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
714 "--> server %u.%u.%u.%u:%d\n",
715 NIPQUAD(en->addr),
716 NIPQUAD(dest->addr),
717 ntohs(dest->port));
718
719 return dest;
720}
721
722
723/*
724 * IPVS LBLCR Scheduler structure
725 */
726static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
727{
728 .name = "lblcr",
729 .refcnt = ATOMIC_INIT(0),
730 .module = THIS_MODULE,
731 .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
732 .init_service = ip_vs_lblcr_init_svc,
733 .done_service = ip_vs_lblcr_done_svc,
734 .update_service = ip_vs_lblcr_update_svc,
735 .schedule = ip_vs_lblcr_schedule,
736};
737
738
739static int __init ip_vs_lblcr_init(void)
740{
741 int ret;
742
743 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
744 ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
745 if (ret)
746 unregister_sysctl_table(sysctl_header);
747 return ret;
748}
749
750
751static void __exit ip_vs_lblcr_cleanup(void)
752{
753 unregister_sysctl_table(sysctl_header);
754 unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
755}
756
757
758module_init(ip_vs_lblcr_init);
759module_exit(ip_vs_lblcr_cleanup);
760MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
deleted file mode 100644
index ebcdbf75ac65..000000000000
--- a/net/ipv4/ipvs/ip_vs_lc.c
+++ /dev/null
@@ -1,121 +0,0 @@
1/*
2 * IPVS: Least-Connection Scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Wensong Zhang : added the ip_vs_lc_update_svc
13 * Wensong Zhang : added any dest with weight=0 is quiesced
14 *
15 */
16
17#include <linux/module.h>
18#include <linux/kernel.h>
19
20#include <net/ip_vs.h>
21
22
23static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
24{
25 return 0;
26}
27
28
29static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
30{
31 return 0;
32}
33
34
35static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
36{
37 return 0;
38}
39
40
41static inline unsigned int
42ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
43{
44 /*
45 * We think the overhead of processing active connections is 256
 46	 * times higher than that of inactive connections on average. (This
47 * 256 times might not be accurate, we will change it later) We
48 * use the following formula to estimate the overhead now:
49 * dest->activeconns*256 + dest->inactconns
50 */
51 return (atomic_read(&dest->activeconns) << 8) +
52 atomic_read(&dest->inactconns);
53}
54
55
56/*
57 * Least Connection scheduling
58 */
59static struct ip_vs_dest *
60ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
61{
62 struct ip_vs_dest *dest, *least = NULL;
63 unsigned int loh = 0, doh;
64
65 IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
66
67 /*
68 * Simply select the server with the least number of
 69	 * (activeconns<<8) + inactconns
70 * Except whose weight is equal to zero.
71 * If the weight is equal to zero, it means that the server is
72 * quiesced, the existing connections to the server still get
73 * served, but no new connection is assigned to the server.
74 */
75
76 list_for_each_entry(dest, &svc->destinations, n_list) {
77 if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
78 atomic_read(&dest->weight) == 0)
79 continue;
80 doh = ip_vs_lc_dest_overhead(dest);
81 if (!least || doh < loh) {
82 least = dest;
83 loh = doh;
84 }
85 }
86
87 if (least)
88 IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
89 NIPQUAD(least->addr), ntohs(least->port),
90 atomic_read(&least->activeconns),
91 atomic_read(&least->inactconns));
92
93 return least;
94}
95
96
97static struct ip_vs_scheduler ip_vs_lc_scheduler = {
98 .name = "lc",
99 .refcnt = ATOMIC_INIT(0),
100 .module = THIS_MODULE,
101 .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
102 .init_service = ip_vs_lc_init_svc,
103 .done_service = ip_vs_lc_done_svc,
104 .update_service = ip_vs_lc_update_svc,
105 .schedule = ip_vs_lc_schedule,
106};
107
108
109static int __init ip_vs_lc_init(void)
110{
 111	return register_ip_vs_scheduler(&ip_vs_lc_scheduler);
112}
113
114static void __exit ip_vs_lc_cleanup(void)
115{
116 unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
117}
118
119module_init(ip_vs_lc_init);
120module_exit(ip_vs_lc_cleanup);
121MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
deleted file mode 100644
index 92f3a6770031..000000000000
--- a/net/ipv4/ipvs/ip_vs_nq.c
+++ /dev/null
@@ -1,159 +0,0 @@
1/*
2 * IPVS: Never Queue scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 *
13 */
14
15/*
16 * The NQ algorithm adopts a two-speed model. When there is an idle server
17 * available, the job will be sent to the idle server, instead of waiting
18 * for a fast one. When there is no idle server available, the job will be
19 * sent to the server that minimize its expected delay (The Shortest
20 * Expected Delay scheduling algorithm).
21 *
22 * See the following paper for more information:
23 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
24 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
25 * pages 986-994, 1988.
26 *
27 * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
28 *
29 * The difference between NQ and SED is that NQ can improve overall
30 * system utilization.
31 *
32 */
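/*
 * A rough user-space sketch (not from the kernel tree, numbers made
 * up, no locking or overload flags) of the two-speed decision above:
 * an idle server is taken immediately, otherwise the SED-style
 * minimum of (activeconns + 1)/weight is chosen by cross-multiplying.
 */
#include <stdio.h>

struct srv { int activeconns, weight; };

static int pick(const struct srv *s, int n)
{
	int i, least = -1, loh = 0;

	for (i = 0; i < n; i++) {
		int doh = s[i].activeconns + 1;

		if (s[i].weight == 0)
			continue;
		if (s[i].activeconns == 0)
			return i;		/* idle: take it right away */
		if (least < 0 || loh * s[i].weight > doh * s[least].weight) {
			least = i;
			loh = doh;
		}
	}
	return least;
}

int main(void)
{
	struct srv a[3] = { {4, 2}, {1, 1}, {3, 4} };
	struct srv b[3] = { {4, 2}, {0, 1}, {3, 4} };

	printf("no idle server -> server %d\n", pick(a, 3));
	printf("idle server    -> server %d\n", pick(b, 3));
	return 0;
}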
33
34#include <linux/module.h>
35#include <linux/kernel.h>
36
37#include <net/ip_vs.h>
38
39
40static int
41ip_vs_nq_init_svc(struct ip_vs_service *svc)
42{
43 return 0;
44}
45
46
47static int
48ip_vs_nq_done_svc(struct ip_vs_service *svc)
49{
50 return 0;
51}
52
53
54static int
55ip_vs_nq_update_svc(struct ip_vs_service *svc)
56{
57 return 0;
58}
59
60
61static inline unsigned int
62ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
63{
64 /*
65 * We only use the active connection number in the cost
66 * calculation here.
67 */
68 return atomic_read(&dest->activeconns) + 1;
69}
70
71
72/*
 73 *	Never Queue scheduling
74 */
75static struct ip_vs_dest *
76ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
77{
78 struct ip_vs_dest *dest, *least = NULL;
79 unsigned int loh = 0, doh;
80
81 IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
82
83 /*
84 * We calculate the load of each dest server as follows:
85 * (server expected overhead) / dest->weight
86 *
87 * Remember -- no floats in kernel mode!!!
88 * The comparison of h1*w2 > h2*w1 is equivalent to that of
89 * h1/w1 > h2/w2
90 * if every weight is larger than zero.
91 *
92 * The server with weight=0 is quiesced and will not receive any
93 * new connections.
94 */
95
96 list_for_each_entry(dest, &svc->destinations, n_list) {
97
98 if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
99 !atomic_read(&dest->weight))
100 continue;
101
102 doh = ip_vs_nq_dest_overhead(dest);
103
104 /* return the server directly if it is idle */
105 if (atomic_read(&dest->activeconns) == 0) {
106 least = dest;
107 loh = doh;
108 goto out;
109 }
110
111 if (!least ||
112 (loh * atomic_read(&dest->weight) >
113 doh * atomic_read(&least->weight))) {
114 least = dest;
115 loh = doh;
116 }
117 }
118
119 if (!least)
120 return NULL;
121
122 out:
123 IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
124 "activeconns %d refcnt %d weight %d overhead %d\n",
125 NIPQUAD(least->addr), ntohs(least->port),
126 atomic_read(&least->activeconns),
127 atomic_read(&least->refcnt),
128 atomic_read(&least->weight), loh);
129
130 return least;
131}
132
133
134static struct ip_vs_scheduler ip_vs_nq_scheduler =
135{
136 .name = "nq",
137 .refcnt = ATOMIC_INIT(0),
138 .module = THIS_MODULE,
139 .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
140 .init_service = ip_vs_nq_init_svc,
141 .done_service = ip_vs_nq_done_svc,
142 .update_service = ip_vs_nq_update_svc,
143 .schedule = ip_vs_nq_schedule,
144};
145
146
147static int __init ip_vs_nq_init(void)
148{
149 return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
150}
151
152static void __exit ip_vs_nq_cleanup(void)
153{
154 unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
155}
156
157module_init(ip_vs_nq_init);
158module_exit(ip_vs_nq_cleanup);
159MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
deleted file mode 100644
index 6099a88fc200..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ /dev/null
@@ -1,233 +0,0 @@
1/*
2 * ip_vs_proto.c: transport protocol load balancing support for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
16#include <linux/module.h>
17#include <linux/kernel.h>
18#include <linux/skbuff.h>
19#include <linux/in.h>
20#include <linux/ip.h>
21#include <net/protocol.h>
22#include <net/tcp.h>
23#include <net/udp.h>
24#include <asm/system.h>
25#include <linux/stat.h>
26#include <linux/proc_fs.h>
27
28#include <net/ip_vs.h>
29
30
31/*
32 * IPVS protocols can only be registered/unregistered when the ipvs
33 * module is loaded/unloaded, so no lock is needed in accessing the
34 * ipvs protocol table.
35 */
36
37#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
38#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
39
40static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
41
42
43/*
44 * register an ipvs protocol
45 */
46static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
47{
48 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
49
50 pp->next = ip_vs_proto_table[hash];
51 ip_vs_proto_table[hash] = pp;
52
53 if (pp->init != NULL)
54 pp->init(pp);
55
56 return 0;
57}
58
59
60/*
61 * unregister an ipvs protocol
62 */
63static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
64{
65 struct ip_vs_protocol **pp_p;
66 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
67
68 pp_p = &ip_vs_proto_table[hash];
69 for (; *pp_p; pp_p = &(*pp_p)->next) {
70 if (*pp_p == pp) {
71 *pp_p = pp->next;
72 if (pp->exit != NULL)
73 pp->exit(pp);
74 return 0;
75 }
76 }
77
78 return -ESRCH;
79}
80
81
82/*
83 * get ip_vs_protocol object by its proto.
84 */
85struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
86{
87 struct ip_vs_protocol *pp;
88 unsigned hash = IP_VS_PROTO_HASH(proto);
89
90 for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
91 if (pp->protocol == proto)
92 return pp;
93 }
94
95 return NULL;
96}
97
98
99/*
100 * Propagate event for state change to all protocols
101 */
102void ip_vs_protocol_timeout_change(int flags)
103{
104 struct ip_vs_protocol *pp;
105 int i;
106
107 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
108 for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
109 if (pp->timeout_change)
110 pp->timeout_change(pp, flags);
111 }
112 }
113}
114
115
116int *
117ip_vs_create_timeout_table(int *table, int size)
118{
119 return kmemdup(table, size, GFP_ATOMIC);
120}
121
122
123/*
124 * Set timeout value for state specified by name
125 */
126int
127ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
128{
129 int i;
130
131 if (!table || !name || !to)
132 return -EINVAL;
133
134 for (i = 0; i < num; i++) {
135 if (strcmp(names[i], name))
136 continue;
137 table[i] = to * HZ;
138 return 0;
139 }
140 return -ENOENT;
141}
142
143
144const char * ip_vs_state_name(__u16 proto, int state)
145{
146 struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
147
148 if (pp == NULL || pp->state_name == NULL)
149 return (IPPROTO_IP == proto) ? "NONE" : "ERR!";
150 return pp->state_name(state);
151}
152
153
154void
155ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
156 const struct sk_buff *skb,
157 int offset,
158 const char *msg)
159{
160 char buf[128];
161 struct iphdr _iph, *ih;
162
163 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
164 if (ih == NULL)
165 sprintf(buf, "%s TRUNCATED", pp->name);
166 else if (ih->frag_off & htons(IP_OFFSET))
167 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
168 pp->name, NIPQUAD(ih->saddr),
169 NIPQUAD(ih->daddr));
170 else {
 171		__be16 _ports[2], *pptr;
 172
173 pptr = skb_header_pointer(skb, offset + ih->ihl*4,
174 sizeof(_ports), _ports);
175 if (pptr == NULL)
176 sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
177 pp->name,
178 NIPQUAD(ih->saddr),
179 NIPQUAD(ih->daddr));
180 else
181 sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
182 pp->name,
183 NIPQUAD(ih->saddr),
184 ntohs(pptr[0]),
185 NIPQUAD(ih->daddr),
186 ntohs(pptr[1]));
187 }
188
189 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
190}
191
192
193int __init ip_vs_protocol_init(void)
194{
195 char protocols[64];
196#define REGISTER_PROTOCOL(p) \
197 do { \
198 register_ip_vs_protocol(p); \
199 strcat(protocols, ", "); \
200 strcat(protocols, (p)->name); \
201 } while (0)
202
203 protocols[0] = '\0';
204 protocols[2] = '\0';
205#ifdef CONFIG_IP_VS_PROTO_TCP
206 REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
207#endif
208#ifdef CONFIG_IP_VS_PROTO_UDP
209 REGISTER_PROTOCOL(&ip_vs_protocol_udp);
210#endif
211#ifdef CONFIG_IP_VS_PROTO_AH
212 REGISTER_PROTOCOL(&ip_vs_protocol_ah);
213#endif
214#ifdef CONFIG_IP_VS_PROTO_ESP
215 REGISTER_PROTOCOL(&ip_vs_protocol_esp);
216#endif
217 IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
218
219 return 0;
220}
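/*
 * Note on protocols[] above: each REGISTER_PROTOCOL() appends
 * ", <name>", so printing from &protocols[2] skips the leading
 * separator, and protocols[2] = '\0' keeps the message an empty
 * string if no protocol is compiled in.
 */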
221
222
223void ip_vs_protocol_cleanup(void)
224{
225 struct ip_vs_protocol *pp;
226 int i;
227
228 /* unregister all the ipvs protocols */
229 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
230 while ((pp = ip_vs_proto_table[i]) != NULL)
231 unregister_ip_vs_protocol(pp);
232 }
233}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
deleted file mode 100644
index 73e0ea87c1f5..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ /dev/null
@@ -1,178 +0,0 @@
1/*
2 * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
5 * Wensong Zhang <wensong@linuxvirtualserver.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation;
10 *
11 */
12
13#include <linux/in.h>
14#include <linux/ip.h>
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
39
40
41static struct ip_vs_conn *
42ah_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 htons(PORT_ISAKMP),
54 iph->daddr,
55 htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 htons(PORT_ISAKMP),
60 iph->saddr,
61 htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 htons(PORT_ISAKMP),
91 iph->daddr,
92 htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 htons(PORT_ISAKMP),
97 iph->saddr,
98 htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115ah_conn_schedule(struct sk_buff *skb,
116 struct ip_vs_protocol *pp,
117 int *verdict, struct ip_vs_conn **cpp)
118{
119 /*
 120	 * AH is only related traffic. Pass the packet to the IP stack.
121 */
122 *verdict = NF_ACCEPT;
123 return 0;
124}
125
126
127static void
128ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
129 int offset, const char *msg)
130{
131 char buf[256];
132 struct iphdr _iph, *ih;
133
134 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
135 if (ih == NULL)
136 sprintf(buf, "%s TRUNCATED", pp->name);
137 else
138 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
139 pp->name, NIPQUAD(ih->saddr),
140 NIPQUAD(ih->daddr));
141
142 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
143}
144
145
146static void ah_init(struct ip_vs_protocol *pp)
147{
148 /* nothing to do now */
149}
150
151
152static void ah_exit(struct ip_vs_protocol *pp)
153{
154 /* nothing to do now */
155}
156
157
158struct ip_vs_protocol ip_vs_protocol_ah = {
159 .name = "AH",
160 .protocol = IPPROTO_AH,
161 .num_states = 1,
162 .dont_defrag = 1,
163 .init = ah_init,
164 .exit = ah_exit,
165 .conn_schedule = ah_conn_schedule,
166 .conn_in_get = ah_conn_in_get,
167 .conn_out_get = ah_conn_out_get,
168 .snat_handler = NULL,
169 .dnat_handler = NULL,
170 .csum_check = NULL,
171 .state_transition = NULL,
172 .register_app = NULL,
173 .unregister_app = NULL,
174 .app_conn_bind = NULL,
175 .debug_packet = ah_debug_packet,
176 .timeout_change = NULL, /* ISAKMP */
177 .set_state_timeout = NULL,
178};
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
deleted file mode 100644
index 21d70c8ffa54..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
5 * Wensong Zhang <wensong@linuxvirtualserver.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation;
10 *
11 */
12
13#include <linux/in.h>
14#include <linux/ip.h>
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
39
40
41static struct ip_vs_conn *
42esp_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 htons(PORT_ISAKMP),
54 iph->daddr,
55 htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 htons(PORT_ISAKMP),
60 iph->saddr,
61 htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 htons(PORT_ISAKMP),
91 iph->daddr,
92 htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 htons(PORT_ISAKMP),
97 iph->saddr,
98 htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
116 int *verdict, struct ip_vs_conn **cpp)
117{
118 /*
 119	 * ESP is only related traffic. Pass the packet to the IP stack.
120 */
121 *verdict = NF_ACCEPT;
122 return 0;
123}
124
125
126static void
127esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
128 int offset, const char *msg)
129{
130 char buf[256];
131 struct iphdr _iph, *ih;
132
133 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
134 if (ih == NULL)
135 sprintf(buf, "%s TRUNCATED", pp->name);
136 else
137 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
138 pp->name, NIPQUAD(ih->saddr),
139 NIPQUAD(ih->daddr));
140
141 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
142}
143
144
145static void esp_init(struct ip_vs_protocol *pp)
146{
147 /* nothing to do now */
148}
149
150
151static void esp_exit(struct ip_vs_protocol *pp)
152{
153 /* nothing to do now */
154}
155
156
157struct ip_vs_protocol ip_vs_protocol_esp = {
158 .name = "ESP",
159 .protocol = IPPROTO_ESP,
160 .num_states = 1,
161 .dont_defrag = 1,
162 .init = esp_init,
163 .exit = esp_exit,
164 .conn_schedule = esp_conn_schedule,
165 .conn_in_get = esp_conn_in_get,
166 .conn_out_get = esp_conn_out_get,
167 .snat_handler = NULL,
168 .dnat_handler = NULL,
169 .csum_check = NULL,
170 .state_transition = NULL,
171 .register_app = NULL,
172 .unregister_app = NULL,
173 .app_conn_bind = NULL,
174 .debug_packet = esp_debug_packet,
175 .timeout_change = NULL, /* ISAKMP */
176};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
deleted file mode 100644
index d0ea467986a0..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ /dev/null
@@ -1,614 +0,0 @@
1/*
2 * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
16#include <linux/kernel.h>
17#include <linux/ip.h>
18#include <linux/tcp.h> /* for tcphdr */
19#include <net/ip.h>
20#include <net/tcp.h> /* for csum_tcpudp_magic */
21#include <linux/netfilter.h>
22#include <linux/netfilter_ipv4.h>
23
24#include <net/ip_vs.h>
25
26
27static struct ip_vs_conn *
28tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
29 const struct iphdr *iph, unsigned int proto_off, int inverse)
30{
31 __be16 _ports[2], *pptr;
32
33 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
34 if (pptr == NULL)
35 return NULL;
36
37 if (likely(!inverse)) {
38 return ip_vs_conn_in_get(iph->protocol,
39 iph->saddr, pptr[0],
40 iph->daddr, pptr[1]);
41 } else {
42 return ip_vs_conn_in_get(iph->protocol,
43 iph->daddr, pptr[1],
44 iph->saddr, pptr[0]);
45 }
46}
47
48static struct ip_vs_conn *
49tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
50 const struct iphdr *iph, unsigned int proto_off, int inverse)
51{
52 __be16 _ports[2], *pptr;
53
54 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
55 if (pptr == NULL)
56 return NULL;
57
58 if (likely(!inverse)) {
59 return ip_vs_conn_out_get(iph->protocol,
60 iph->saddr, pptr[0],
61 iph->daddr, pptr[1]);
62 } else {
63 return ip_vs_conn_out_get(iph->protocol,
64 iph->daddr, pptr[1],
65 iph->saddr, pptr[0]);
66 }
67}
68
69
70static int
71tcp_conn_schedule(struct sk_buff *skb,
72 struct ip_vs_protocol *pp,
73 int *verdict, struct ip_vs_conn **cpp)
74{
75 struct ip_vs_service *svc;
76 struct tcphdr _tcph, *th;
77
78 th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
79 if (th == NULL) {
80 *verdict = NF_DROP;
81 return 0;
82 }
83
84 if (th->syn &&
85 (svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol,
86 ip_hdr(skb)->daddr, th->dest))) {
87 if (ip_vs_todrop()) {
88 /*
89 * It seems that we are very loaded.
90 * We have to drop this packet :(
91 */
92 ip_vs_service_put(svc);
93 *verdict = NF_DROP;
94 return 0;
95 }
96
97 /*
98 * Let the virtual server select a real server for the
99 * incoming connection, and create a connection entry.
100 */
101 *cpp = ip_vs_schedule(svc, skb);
102 if (!*cpp) {
103 *verdict = ip_vs_leave(svc, skb, pp);
104 return 0;
105 }
106 ip_vs_service_put(svc);
107 }
108 return 1;
109}
110
111
112static inline void
113tcp_fast_csum_update(struct tcphdr *tcph, __be32 oldip, __be32 newip,
114 __be16 oldport, __be16 newport)
115{
116 tcph->check =
117 csum_fold(ip_vs_check_diff4(oldip, newip,
118 ip_vs_check_diff2(oldport, newport,
119 ~csum_unfold(tcph->check))));
120}
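/*
 * A stand-alone user-space sketch (values arbitrary, not from the
 * kernel tree) of the incremental one's-complement update behind
 * tcp_fast_csum_update() above: when one 16-bit field changes from m
 * to m', RFC 1624 gives HC' = ~(~HC + ~m + m'), so the checksum is
 * patched without summing the whole segment again.
 */
#include <stdio.h>
#include <stdint.h>

static uint16_t fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

static uint16_t csum16(const uint16_t *p, int n)
{
	uint32_t sum = 0;

	while (n--)
		sum += *p++;
	return (uint16_t)~fold(sum);
}

int main(void)
{
	uint16_t words[4] = { 0x1234, 0x5678, 0x9abc, 0xdef0 };
	uint16_t hc = csum16(words, 4);
	uint16_t m = words[1], m1 = 0x0050;	/* e.g. a rewritten port */

	words[1] = m1;

	/* incremental update vs. full recomputation: both print 0x73ce */
	printf("incremental %#06x  recomputed %#06x\n",
	       (uint16_t)~fold((uint16_t)~hc + (uint16_t)~m + m1),
	       csum16(words, 4));
	return 0;
}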
121
122
123static int
124tcp_snat_handler(struct sk_buff *skb,
125 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
126{
127 struct tcphdr *tcph;
128 const unsigned int tcphoff = ip_hdrlen(skb);
129
130 /* csum_check requires unshared skb */
131 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
132 return 0;
133
134 if (unlikely(cp->app != NULL)) {
135 /* Some checks before mangling */
136 if (pp->csum_check && !pp->csum_check(skb, pp))
137 return 0;
138
139 /* Call application helper if needed */
140 if (!ip_vs_app_pkt_out(cp, skb))
141 return 0;
142 }
143
144 tcph = (void *)ip_hdr(skb) + tcphoff;
145 tcph->source = cp->vport;
146
147 /* Adjust TCP checksums */
148 if (!cp->app) {
149 /* Only port and addr are changed, do fast csum update */
150 tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
151 cp->dport, cp->vport);
152 if (skb->ip_summed == CHECKSUM_COMPLETE)
153 skb->ip_summed = CHECKSUM_NONE;
154 } else {
155 /* full checksum calculation */
156 tcph->check = 0;
157 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
158 tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
159 skb->len - tcphoff,
160 cp->protocol, skb->csum);
161 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
162 pp->name, tcph->check,
163 (char*)&(tcph->check) - (char*)tcph);
164 }
165 return 1;
166}
167
168
169static int
170tcp_dnat_handler(struct sk_buff *skb,
171 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
172{
173 struct tcphdr *tcph;
174 const unsigned int tcphoff = ip_hdrlen(skb);
175
176 /* csum_check requires unshared skb */
177 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
178 return 0;
179
180 if (unlikely(cp->app != NULL)) {
181 /* Some checks before mangling */
182 if (pp->csum_check && !pp->csum_check(skb, pp))
183 return 0;
184
185 /*
186 * Attempt ip_vs_app call.
187 * It will fix ip_vs_conn and iph ack_seq stuff
188 */
189 if (!ip_vs_app_pkt_in(cp, skb))
190 return 0;
191 }
192
193 tcph = (void *)ip_hdr(skb) + tcphoff;
194 tcph->dest = cp->dport;
195
196 /*
197 * Adjust TCP checksums
198 */
199 if (!cp->app) {
200 /* Only port and addr are changed, do fast csum update */
201 tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
202 cp->vport, cp->dport);
203 if (skb->ip_summed == CHECKSUM_COMPLETE)
204 skb->ip_summed = CHECKSUM_NONE;
205 } else {
206 /* full checksum calculation */
207 tcph->check = 0;
208 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
209 tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
210 skb->len - tcphoff,
211 cp->protocol, skb->csum);
212 skb->ip_summed = CHECKSUM_UNNECESSARY;
213 }
214 return 1;
215}
216
217
218static int
219tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
220{
221 const unsigned int tcphoff = ip_hdrlen(skb);
222
223 switch (skb->ip_summed) {
224 case CHECKSUM_NONE:
225 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
226 case CHECKSUM_COMPLETE:
227 if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
228 skb->len - tcphoff,
229 ip_hdr(skb)->protocol, skb->csum)) {
230 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
231 "Failed checksum for");
232 return 0;
233 }
234 break;
235 default:
236 /* No need to checksum. */
237 break;
238 }
239
240 return 1;
241}
242
243
244#define TCP_DIR_INPUT 0
245#define TCP_DIR_OUTPUT 4
246#define TCP_DIR_INPUT_ONLY 8
247
248static const int tcp_state_off[IP_VS_DIR_LAST] = {
249 [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
250 [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
251 [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
252};
253
254/*
255 * Timeout table[state]
256 */
257static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
258 [IP_VS_TCP_S_NONE] = 2*HZ,
259 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
260 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
261 [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
262 [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
263 [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
264 [IP_VS_TCP_S_CLOSE] = 10*HZ,
265 [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
266 [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
267 [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
268 [IP_VS_TCP_S_SYNACK] = 120*HZ,
269 [IP_VS_TCP_S_LAST] = 2*HZ,
270};
271
272static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
273 [IP_VS_TCP_S_NONE] = "NONE",
274 [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
275 [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
276 [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
277 [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
278 [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
279 [IP_VS_TCP_S_CLOSE] = "CLOSE",
280 [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
281 [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
282 [IP_VS_TCP_S_LISTEN] = "LISTEN",
283 [IP_VS_TCP_S_SYNACK] = "SYNACK",
284 [IP_VS_TCP_S_LAST] = "BUG!",
285};
286
287#define sNO IP_VS_TCP_S_NONE
288#define sES IP_VS_TCP_S_ESTABLISHED
289#define sSS IP_VS_TCP_S_SYN_SENT
290#define sSR IP_VS_TCP_S_SYN_RECV
291#define sFW IP_VS_TCP_S_FIN_WAIT
292#define sTW IP_VS_TCP_S_TIME_WAIT
293#define sCL IP_VS_TCP_S_CLOSE
294#define sCW IP_VS_TCP_S_CLOSE_WAIT
295#define sLA IP_VS_TCP_S_LAST_ACK
296#define sLI IP_VS_TCP_S_LISTEN
297#define sSA IP_VS_TCP_S_SYNACK
298
299struct tcp_states_t {
300 int next_state[IP_VS_TCP_S_LAST];
301};
302
303static const char * tcp_state_name(int state)
304{
305 if (state >= IP_VS_TCP_S_LAST)
306 return "ERR!";
307 return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
308}
309
310static struct tcp_states_t tcp_states [] = {
311/* INPUT */
312/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
313/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
314/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
315/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
316/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
317
318/* OUTPUT */
319/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
320/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
321/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
322/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
323/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
324
325/* INPUT-ONLY */
326/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
327/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
328/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
329/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
330/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
331};
332
333static struct tcp_states_t tcp_states_dos [] = {
334/* INPUT */
335/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
336/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
337/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
338/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
339/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
340
341/* OUTPUT */
342/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
343/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
344/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
345/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
346/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
347
348/* INPUT-ONLY */
349/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
350/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
351/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
352/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
353/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
354};
355
356static struct tcp_states_t *tcp_state_table = tcp_states;
357
358
359static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
360{
361 int on = (flags & 1); /* secure_tcp */
362
363 /*
364 ** FIXME: change secure_tcp to independent sysctl var
365 ** or make it per-service or per-app because it is valid
366 ** for most if not for all of the applications. Something
367 ** like "capabilities" (flags) for each object.
368 */
369 tcp_state_table = (on? tcp_states_dos : tcp_states);
370}
371
372static int
373tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
374{
375 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
376 tcp_state_name_table, sname, to);
377}
378
379static inline int tcp_state_idx(struct tcphdr *th)
380{
381 if (th->rst)
382 return 3;
383 if (th->syn)
384 return 0;
385 if (th->fin)
386 return 1;
387 if (th->ack)
388 return 2;
389 return -1;
390}
391
392static inline void
393set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
394 int direction, struct tcphdr *th)
395{
396 int state_idx;
397 int new_state = IP_VS_TCP_S_CLOSE;
398 int state_off = tcp_state_off[direction];
399
400 /*
401 * Update state offset to INPUT_ONLY if necessary
402 * or delete NO_OUTPUT flag if output packet detected
403 */
404 if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
405 if (state_off == TCP_DIR_OUTPUT)
406 cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
407 else
408 state_off = TCP_DIR_INPUT_ONLY;
409 }
410
411 if ((state_idx = tcp_state_idx(th)) < 0) {
412 IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
413 goto tcp_state_out;
414 }
415
416 new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
417
418 tcp_state_out:
419 if (new_state != cp->state) {
420 struct ip_vs_dest *dest = cp->dest;
421
422 IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
423 "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n",
424 pp->name,
425 (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
426 th->syn? 'S' : '.',
427 th->fin? 'F' : '.',
428 th->ack? 'A' : '.',
429 th->rst? 'R' : '.',
430 NIPQUAD(cp->daddr), ntohs(cp->dport),
431 NIPQUAD(cp->caddr), ntohs(cp->cport),
432 tcp_state_name(cp->state),
433 tcp_state_name(new_state),
434 atomic_read(&cp->refcnt));
435 if (dest) {
436 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
437 (new_state != IP_VS_TCP_S_ESTABLISHED)) {
438 atomic_dec(&dest->activeconns);
439 atomic_inc(&dest->inactconns);
440 cp->flags |= IP_VS_CONN_F_INACTIVE;
441 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
442 (new_state == IP_VS_TCP_S_ESTABLISHED)) {
443 atomic_inc(&dest->activeconns);
444 atomic_dec(&dest->inactconns);
445 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
446 }
447 }
448 }
449
450 cp->timeout = pp->timeout_table[cp->state = new_state];
451}
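/*
 * Worked example of the lookup above, with the default (non
 * secure_tcp) table and IP_VS_CONN_F_NOOUTPUT clear: an incoming SYN
 * gives state_off = TCP_DIR_INPUT = 0 and tcp_state_idx() = 0, so a
 * connection still in IP_VS_TCP_S_NONE moves to
 * tcp_states[0].next_state[sNO] == sSR (SYN_RECV) and its timeout
 * becomes tcp_timeouts[IP_VS_TCP_S_SYN_RECV] = 1*60*HZ.
 */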
452
453
454/*
455 * Handle state transitions
456 */
457static int
458tcp_state_transition(struct ip_vs_conn *cp, int direction,
459 const struct sk_buff *skb,
460 struct ip_vs_protocol *pp)
461{
462 struct tcphdr _tcph, *th;
463
464 th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
465 if (th == NULL)
466 return 0;
467
468 spin_lock(&cp->lock);
469 set_tcp_state(pp, cp, direction, th);
470 spin_unlock(&cp->lock);
471
472 return 1;
473}
474
475
476/*
477 * Hash table for TCP application incarnations
478 */
479#define TCP_APP_TAB_BITS 4
480#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
481#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
482
483static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
484static DEFINE_SPINLOCK(tcp_app_lock);
485
486static inline __u16 tcp_app_hashkey(__be16 port)
487{
488 return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
489 & TCP_APP_TAB_MASK;
490}
491
492
493static int tcp_register_app(struct ip_vs_app *inc)
494{
495 struct ip_vs_app *i;
496 __u16 hash;
497 __be16 port = inc->port;
498 int ret = 0;
499
500 hash = tcp_app_hashkey(port);
501
502 spin_lock_bh(&tcp_app_lock);
503 list_for_each_entry(i, &tcp_apps[hash], p_list) {
504 if (i->port == port) {
505 ret = -EEXIST;
506 goto out;
507 }
508 }
509 list_add(&inc->p_list, &tcp_apps[hash]);
510 atomic_inc(&ip_vs_protocol_tcp.appcnt);
511
512 out:
513 spin_unlock_bh(&tcp_app_lock);
514 return ret;
515}
516
517
518static void
519tcp_unregister_app(struct ip_vs_app *inc)
520{
521 spin_lock_bh(&tcp_app_lock);
522 atomic_dec(&ip_vs_protocol_tcp.appcnt);
523 list_del(&inc->p_list);
524 spin_unlock_bh(&tcp_app_lock);
525}
526
527
528static int
529tcp_app_conn_bind(struct ip_vs_conn *cp)
530{
531 int hash;
532 struct ip_vs_app *inc;
533 int result = 0;
534
535 /* Default binding: bind app only for NAT */
536 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
537 return 0;
538
539 /* Lookup application incarnations and bind the right one */
540 hash = tcp_app_hashkey(cp->vport);
541
542 spin_lock(&tcp_app_lock);
543 list_for_each_entry(inc, &tcp_apps[hash], p_list) {
544 if (inc->port == cp->vport) {
545 if (unlikely(!ip_vs_app_inc_get(inc)))
546 break;
547 spin_unlock(&tcp_app_lock);
548
549 IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
550 "%u.%u.%u.%u:%u to app %s on port %u\n",
551 __func__,
552 NIPQUAD(cp->caddr), ntohs(cp->cport),
553 NIPQUAD(cp->vaddr), ntohs(cp->vport),
554 inc->name, ntohs(inc->port));
555 cp->app = inc;
556 if (inc->init_conn)
557 result = inc->init_conn(inc, cp);
558 goto out;
559 }
560 }
561 spin_unlock(&tcp_app_lock);
562
563 out:
564 return result;
565}
566
567
568/*
569 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
570 */
571void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
572{
573 spin_lock(&cp->lock);
574 cp->state = IP_VS_TCP_S_LISTEN;
575 cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
576 spin_unlock(&cp->lock);
577}
578
579
580static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
581{
582 IP_VS_INIT_HASH_TABLE(tcp_apps);
583 pp->timeout_table = tcp_timeouts;
584}
585
586
587static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
588{
589}
590
591
592struct ip_vs_protocol ip_vs_protocol_tcp = {
593 .name = "TCP",
594 .protocol = IPPROTO_TCP,
595 .num_states = IP_VS_TCP_S_LAST,
596 .dont_defrag = 0,
597 .appcnt = ATOMIC_INIT(0),
598 .init = ip_vs_tcp_init,
599 .exit = ip_vs_tcp_exit,
600 .register_app = tcp_register_app,
601 .unregister_app = tcp_unregister_app,
602 .conn_schedule = tcp_conn_schedule,
603 .conn_in_get = tcp_conn_in_get,
604 .conn_out_get = tcp_conn_out_get,
605 .snat_handler = tcp_snat_handler,
606 .dnat_handler = tcp_dnat_handler,
607 .csum_check = tcp_csum_check,
608 .state_name = tcp_state_name,
609 .state_transition = tcp_state_transition,
610 .app_conn_bind = tcp_app_conn_bind,
611 .debug_packet = ip_vs_tcpudp_debug_packet,
612 .timeout_change = tcp_timeout_change,
613 .set_state_timeout = tcp_set_state_timeout,
614};
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
deleted file mode 100644
index c6be5d56823f..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ /dev/null
@@ -1,428 +0,0 @@
1/*
2 * ip_vs_proto_udp.c: UDP load balancing support for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
16#include <linux/in.h>
17#include <linux/ip.h>
18#include <linux/kernel.h>
19#include <linux/netfilter.h>
20#include <linux/netfilter_ipv4.h>
21#include <linux/udp.h>
22
23#include <net/ip_vs.h>
24#include <net/ip.h>
25
26static struct ip_vs_conn *
27udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
28 const struct iphdr *iph, unsigned int proto_off, int inverse)
29{
30 struct ip_vs_conn *cp;
31 __be16 _ports[2], *pptr;
32
33 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
34 if (pptr == NULL)
35 return NULL;
36
37 if (likely(!inverse)) {
38 cp = ip_vs_conn_in_get(iph->protocol,
39 iph->saddr, pptr[0],
40 iph->daddr, pptr[1]);
41 } else {
42 cp = ip_vs_conn_in_get(iph->protocol,
43 iph->daddr, pptr[1],
44 iph->saddr, pptr[0]);
45 }
46
47 return cp;
48}
49
50
51static struct ip_vs_conn *
52udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
53 const struct iphdr *iph, unsigned int proto_off, int inverse)
54{
55 struct ip_vs_conn *cp;
56 __be16 _ports[2], *pptr;
57
58 pptr = skb_header_pointer(skb, ip_hdrlen(skb),
59 sizeof(_ports), _ports);
60 if (pptr == NULL)
61 return NULL;
62
63 if (likely(!inverse)) {
64 cp = ip_vs_conn_out_get(iph->protocol,
65 iph->saddr, pptr[0],
66 iph->daddr, pptr[1]);
67 } else {
68 cp = ip_vs_conn_out_get(iph->protocol,
69 iph->daddr, pptr[1],
70 iph->saddr, pptr[0]);
71 }
72
73 return cp;
74}
75
76
77static int
78udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
79 int *verdict, struct ip_vs_conn **cpp)
80{
81 struct ip_vs_service *svc;
82 struct udphdr _udph, *uh;
83
84 uh = skb_header_pointer(skb, ip_hdrlen(skb),
85 sizeof(_udph), &_udph);
86 if (uh == NULL) {
87 *verdict = NF_DROP;
88 return 0;
89 }
90
91 if ((svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol,
92 ip_hdr(skb)->daddr, uh->dest))) {
93 if (ip_vs_todrop()) {
94 /*
95 * It seems that we are very loaded.
96 * We have to drop this packet :(
97 */
98 ip_vs_service_put(svc);
99 *verdict = NF_DROP;
100 return 0;
101 }
102
103 /*
104 * Let the virtual server select a real server for the
105 * incoming connection, and create a connection entry.
106 */
107 *cpp = ip_vs_schedule(svc, skb);
108 if (!*cpp) {
109 *verdict = ip_vs_leave(svc, skb, pp);
110 return 0;
111 }
112 ip_vs_service_put(svc);
113 }
114 return 1;
115}
116
117
118static inline void
119udp_fast_csum_update(struct udphdr *uhdr, __be32 oldip, __be32 newip,
120 __be16 oldport, __be16 newport)
121{
122 uhdr->check =
123 csum_fold(ip_vs_check_diff4(oldip, newip,
124 ip_vs_check_diff2(oldport, newport,
125 ~csum_unfold(uhdr->check))));
126 if (!uhdr->check)
127 uhdr->check = CSUM_MANGLED_0;
128}
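/*
 * This helper is the usual incremental checksum update in the spirit of
 * RFC 1624: only the address and port change during NAT, so their old/new
 * difference is folded into the existing checksum instead of recomputing
 * it over the whole datagram.  The final test exists because a UDP
 * checksum of 0 on the wire means "no checksum computed"; a legitimately
 * computed zero must therefore be transmitted as the all-ones value
 * CSUM_MANGLED_0 (0xffff).
 */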
129
130static int
131udp_snat_handler(struct sk_buff *skb,
132 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
133{
134 struct udphdr *udph;
135 const unsigned int udphoff = ip_hdrlen(skb);
136
137 /* csum_check requires unshared skb */
138 if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
139 return 0;
140
141 if (unlikely(cp->app != NULL)) {
142 /* Some checks before mangling */
143 if (pp->csum_check && !pp->csum_check(skb, pp))
144 return 0;
145
146 /*
147 * Call application helper if needed
148 */
149 if (!ip_vs_app_pkt_out(cp, skb))
150 return 0;
151 }
152
153 udph = (void *)ip_hdr(skb) + udphoff;
154 udph->source = cp->vport;
155
156 /*
157 * Adjust UDP checksums
158 */
159 if (!cp->app && (udph->check != 0)) {
160 /* Only port and addr are changed, do fast csum update */
161 udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
162 cp->dport, cp->vport);
163 if (skb->ip_summed == CHECKSUM_COMPLETE)
164 skb->ip_summed = CHECKSUM_NONE;
165 } else {
166 /* full checksum calculation */
167 udph->check = 0;
168 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
169 udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
170 skb->len - udphoff,
171 cp->protocol, skb->csum);
172 if (udph->check == 0)
173 udph->check = CSUM_MANGLED_0;
174 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
175 pp->name, udph->check,
176 (char*)&(udph->check) - (char*)udph);
177 }
178 return 1;
179}
180
181
182static int
183udp_dnat_handler(struct sk_buff *skb,
184 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
185{
186 struct udphdr *udph;
187 unsigned int udphoff = ip_hdrlen(skb);
188
189 /* csum_check requires unshared skb */
190 if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
191 return 0;
192
193 if (unlikely(cp->app != NULL)) {
194 /* Some checks before mangling */
195 if (pp->csum_check && !pp->csum_check(skb, pp))
196 return 0;
197
198 /*
199 * Attempt ip_vs_app call.
200 * It will fix ip_vs_conn
201 */
202 if (!ip_vs_app_pkt_in(cp, skb))
203 return 0;
204 }
205
206 udph = (void *)ip_hdr(skb) + udphoff;
207 udph->dest = cp->dport;
208
209 /*
210 * Adjust UDP checksums
211 */
212 if (!cp->app && (udph->check != 0)) {
213 /* Only port and addr are changed, do fast csum update */
214 udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
215 cp->vport, cp->dport);
216 if (skb->ip_summed == CHECKSUM_COMPLETE)
217 skb->ip_summed = CHECKSUM_NONE;
218 } else {
219 /* full checksum calculation */
220 udph->check = 0;
221 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
222 udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
223 skb->len - udphoff,
224 cp->protocol, skb->csum);
225 if (udph->check == 0)
226 udph->check = CSUM_MANGLED_0;
227 skb->ip_summed = CHECKSUM_UNNECESSARY;
228 }
229 return 1;
230}
231
232
233static int
234udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
235{
236 struct udphdr _udph, *uh;
237 const unsigned int udphoff = ip_hdrlen(skb);
238
239 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
240 if (uh == NULL)
241 return 0;
242
243 if (uh->check != 0) {
244 switch (skb->ip_summed) {
245 case CHECKSUM_NONE:
246 skb->csum = skb_checksum(skb, udphoff,
247 skb->len - udphoff, 0);
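			/* fall through: verify the checksum just computed */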
248 case CHECKSUM_COMPLETE:
249 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
250 ip_hdr(skb)->daddr,
251 skb->len - udphoff,
252 ip_hdr(skb)->protocol,
253 skb->csum)) {
254 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
255 "Failed checksum for");
256 return 0;
257 }
258 break;
259 default:
260 /* No need to checksum. */
261 break;
262 }
263 }
264 return 1;
265}
266
267
268/*
269 * Note: the caller guarantees that only one of register_app,
 270 * unregister_app or app_conn_bind is called at any given time.
271 */
272
273#define UDP_APP_TAB_BITS 4
274#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
275#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
276
277static struct list_head udp_apps[UDP_APP_TAB_SIZE];
278static DEFINE_SPINLOCK(udp_app_lock);
279
280static inline __u16 udp_app_hashkey(__be16 port)
281{
282 return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
283 & UDP_APP_TAB_MASK;
284}
285
286
287static int udp_register_app(struct ip_vs_app *inc)
288{
289 struct ip_vs_app *i;
290 __u16 hash;
291 __be16 port = inc->port;
292 int ret = 0;
293
294 hash = udp_app_hashkey(port);
295
296
297 spin_lock_bh(&udp_app_lock);
298 list_for_each_entry(i, &udp_apps[hash], p_list) {
299 if (i->port == port) {
300 ret = -EEXIST;
301 goto out;
302 }
303 }
304 list_add(&inc->p_list, &udp_apps[hash]);
305 atomic_inc(&ip_vs_protocol_udp.appcnt);
306
307 out:
308 spin_unlock_bh(&udp_app_lock);
309 return ret;
310}
311
312
313static void
314udp_unregister_app(struct ip_vs_app *inc)
315{
316 spin_lock_bh(&udp_app_lock);
317 atomic_dec(&ip_vs_protocol_udp.appcnt);
318 list_del(&inc->p_list);
319 spin_unlock_bh(&udp_app_lock);
320}
321
322
323static int udp_app_conn_bind(struct ip_vs_conn *cp)
324{
325 int hash;
326 struct ip_vs_app *inc;
327 int result = 0;
328
329 /* Default binding: bind app only for NAT */
330 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
331 return 0;
332
333 /* Lookup application incarnations and bind the right one */
334 hash = udp_app_hashkey(cp->vport);
335
336 spin_lock(&udp_app_lock);
337 list_for_each_entry(inc, &udp_apps[hash], p_list) {
338 if (inc->port == cp->vport) {
339 if (unlikely(!ip_vs_app_inc_get(inc)))
340 break;
341 spin_unlock(&udp_app_lock);
342
343 IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
344 "%u.%u.%u.%u:%u to app %s on port %u\n",
345 __func__,
346 NIPQUAD(cp->caddr), ntohs(cp->cport),
347 NIPQUAD(cp->vaddr), ntohs(cp->vport),
348 inc->name, ntohs(inc->port));
349 cp->app = inc;
350 if (inc->init_conn)
351 result = inc->init_conn(inc, cp);
352 goto out;
353 }
354 }
355 spin_unlock(&udp_app_lock);
356
357 out:
358 return result;
359}
360
361
362static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
363 [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
364 [IP_VS_UDP_S_LAST] = 2*HZ,
365};
366
367static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
368 [IP_VS_UDP_S_NORMAL] = "UDP",
369 [IP_VS_UDP_S_LAST] = "BUG!",
370};
371
372
373static int
374udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
375{
376 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
377 udp_state_name_table, sname, to);
378}
379
380static const char * udp_state_name(int state)
381{
382 if (state >= IP_VS_UDP_S_LAST)
383 return "ERR!";
384 return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
385}
386
387static int
388udp_state_transition(struct ip_vs_conn *cp, int direction,
389 const struct sk_buff *skb,
390 struct ip_vs_protocol *pp)
391{
392 cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
393 return 1;
394}
395
396static void udp_init(struct ip_vs_protocol *pp)
397{
398 IP_VS_INIT_HASH_TABLE(udp_apps);
399 pp->timeout_table = udp_timeouts;
400}
401
402static void udp_exit(struct ip_vs_protocol *pp)
403{
404}
405
406
407struct ip_vs_protocol ip_vs_protocol_udp = {
408 .name = "UDP",
409 .protocol = IPPROTO_UDP,
410 .num_states = IP_VS_UDP_S_LAST,
411 .dont_defrag = 0,
412 .init = udp_init,
413 .exit = udp_exit,
414 .conn_schedule = udp_conn_schedule,
415 .conn_in_get = udp_conn_in_get,
416 .conn_out_get = udp_conn_out_get,
417 .snat_handler = udp_snat_handler,
418 .dnat_handler = udp_dnat_handler,
419 .csum_check = udp_csum_check,
420 .state_transition = udp_state_transition,
421 .state_name = udp_state_name,
422 .register_app = udp_register_app,
423 .unregister_app = udp_unregister_app,
424 .app_conn_bind = udp_app_conn_bind,
425 .debug_packet = ip_vs_tcpudp_debug_packet,
426 .timeout_change = NULL,
427 .set_state_timeout = udp_set_state_timeout,
428};
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
deleted file mode 100644
index 358110d17e59..000000000000
--- a/net/ipv4/ipvs/ip_vs_rr.c
+++ /dev/null
@@ -1,116 +0,0 @@
1/*
2 * IPVS: Round-Robin Scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Peter Kese <peter.kese@ijs.si>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Fixes/Changes:
13 * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
14 * Julian Anastasov : fixed the NULL pointer access bug in debugging
 15 * Wensong Zhang : changed some cosmetic things for debugging
16 * Wensong Zhang : changed for the d-linked destination list
17 * Wensong Zhang : added the ip_vs_rr_update_svc
18 * Wensong Zhang : added any dest with weight=0 is quiesced
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24
25#include <net/ip_vs.h>
26
27
28static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
29{
30 svc->sched_data = &svc->destinations;
31 return 0;
32}
33
34
35static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
36{
37 return 0;
38}
39
40
41static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
42{
43 svc->sched_data = &svc->destinations;
44 return 0;
45}
46
47
48/*
49 * Round-Robin Scheduling
50 */
51static struct ip_vs_dest *
52ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
53{
54 struct list_head *p, *q;
55 struct ip_vs_dest *dest;
56
57 IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
58
59 write_lock(&svc->sched_lock);
60 p = (struct list_head *)svc->sched_data;
61 p = p->next;
62 q = p;
63 do {
64 /* skip list head */
65 if (q == &svc->destinations) {
66 q = q->next;
67 continue;
68 }
69
70 dest = list_entry(q, struct ip_vs_dest, n_list);
71 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
72 atomic_read(&dest->weight) > 0)
73 /* HIT */
74 goto out;
75 q = q->next;
76 } while (q != p);
77 write_unlock(&svc->sched_lock);
78 return NULL;
79
80 out:
81 svc->sched_data = q;
82 write_unlock(&svc->sched_lock);
83 IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
84 "activeconns %d refcnt %d weight %d\n",
85 NIPQUAD(dest->addr), ntohs(dest->port),
86 atomic_read(&dest->activeconns),
87 atomic_read(&dest->refcnt), atomic_read(&dest->weight));
88
89 return dest;
90}
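/*
 * Roughly speaking: svc->sched_data remembers where the previous lookup
 * stopped, so each call resumes one node further around the circular
 * destination list, skipping the list head as well as overloaded or
 * quiesced (weight 0) servers.  With three healthy real servers A, B and
 * C, successive new connections are therefore assigned A, B, C, A, B, ...;
 * a server whose weight is set to 0 simply drops out of the rotation until
 * its weight is raised again.
 */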
91
92
93static struct ip_vs_scheduler ip_vs_rr_scheduler = {
94 .name = "rr", /* name */
95 .refcnt = ATOMIC_INIT(0),
96 .module = THIS_MODULE,
97 .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
98 .init_service = ip_vs_rr_init_svc,
99 .done_service = ip_vs_rr_done_svc,
100 .update_service = ip_vs_rr_update_svc,
101 .schedule = ip_vs_rr_schedule,
102};
103
104static int __init ip_vs_rr_init(void)
105{
106 return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
107}
108
109static void __exit ip_vs_rr_cleanup(void)
110{
111 unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
112}
113
114module_init(ip_vs_rr_init);
115module_exit(ip_vs_rr_cleanup);
116MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c
deleted file mode 100644
index a46ad9e35016..000000000000
--- a/net/ipv4/ipvs/ip_vs_sched.c
+++ /dev/null
@@ -1,251 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the Netfilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * Peter Kese <peter.kese@ijs.si>
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version.
15 *
16 * Changes:
17 *
18 */
19
20#include <linux/module.h>
21#include <linux/spinlock.h>
22#include <linux/interrupt.h>
23#include <asm/string.h>
24#include <linux/kmod.h>
25#include <linux/sysctl.h>
26
27#include <net/ip_vs.h>
28
29/*
30 * IPVS scheduler list
31 */
32static LIST_HEAD(ip_vs_schedulers);
33
34/* lock for service table */
35static DEFINE_RWLOCK(__ip_vs_sched_lock);
36
37
38/*
39 * Bind a service with a scheduler
40 */
41int ip_vs_bind_scheduler(struct ip_vs_service *svc,
42 struct ip_vs_scheduler *scheduler)
43{
44 int ret;
45
46 if (svc == NULL) {
47 IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
48 return -EINVAL;
49 }
50 if (scheduler == NULL) {
51 IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
52 return -EINVAL;
53 }
54
55 svc->scheduler = scheduler;
56
57 if (scheduler->init_service) {
58 ret = scheduler->init_service(svc);
59 if (ret) {
60 IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
61 return ret;
62 }
63 }
64
65 return 0;
66}
67
68
69/*
70 * Unbind a service with its scheduler
71 */
72int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
73{
74 struct ip_vs_scheduler *sched;
75
76 if (svc == NULL) {
77 IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
78 return -EINVAL;
79 }
80
81 sched = svc->scheduler;
82 if (sched == NULL) {
83 IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
84 return -EINVAL;
85 }
86
87 if (sched->done_service) {
88 if (sched->done_service(svc) != 0) {
89 IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
90 return -EINVAL;
91 }
92 }
93
94 svc->scheduler = NULL;
95 return 0;
96}
97
98
99/*
100 * Get scheduler in the scheduler list by name
101 */
102static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
103{
104 struct ip_vs_scheduler *sched;
105
106 IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
107 sched_name);
108
109 read_lock_bh(&__ip_vs_sched_lock);
110
111 list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
112 /*
113 * Test and get the modules atomically
114 */
115 if (sched->module && !try_module_get(sched->module)) {
116 /*
117 * This scheduler is just deleted
118 */
119 continue;
120 }
121 if (strcmp(sched_name, sched->name)==0) {
122 /* HIT */
123 read_unlock_bh(&__ip_vs_sched_lock);
124 return sched;
125 }
126 if (sched->module)
127 module_put(sched->module);
128 }
129
130 read_unlock_bh(&__ip_vs_sched_lock);
131 return NULL;
132}
133
134
135/*
136 * Lookup scheduler and try to load it if it doesn't exist
137 */
138struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
139{
140 struct ip_vs_scheduler *sched;
141
142 /*
143 * Search for the scheduler by sched_name
144 */
145 sched = ip_vs_sched_getbyname(sched_name);
146
147 /*
148 * If scheduler not found, load the module and search again
149 */
150 if (sched == NULL) {
151 request_module("ip_vs_%s", sched_name);
152 sched = ip_vs_sched_getbyname(sched_name);
153 }
154
155 return sched;
156}
157
158void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
159{
160 if (scheduler->module)
161 module_put(scheduler->module);
162}
163
164
165/*
166 * Register a scheduler in the scheduler list
167 */
168int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
169{
170 struct ip_vs_scheduler *sched;
171
172 if (!scheduler) {
173 IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
174 return -EINVAL;
175 }
176
177 if (!scheduler->name) {
178 IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
179 return -EINVAL;
180 }
181
182 /* increase the module use count */
183 ip_vs_use_count_inc();
184
185 write_lock_bh(&__ip_vs_sched_lock);
186
187 if (!list_empty(&scheduler->n_list)) {
188 write_unlock_bh(&__ip_vs_sched_lock);
189 ip_vs_use_count_dec();
190 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
191 "already linked\n", scheduler->name);
192 return -EINVAL;
193 }
194
195 /*
196 * Make sure that the scheduler with this name doesn't exist
197 * in the scheduler list.
198 */
199 list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
200 if (strcmp(scheduler->name, sched->name) == 0) {
201 write_unlock_bh(&__ip_vs_sched_lock);
202 ip_vs_use_count_dec();
203 IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
204 "already existed in the system\n",
205 scheduler->name);
206 return -EINVAL;
207 }
208 }
209 /*
210 * Add it into the d-linked scheduler list
211 */
212 list_add(&scheduler->n_list, &ip_vs_schedulers);
213 write_unlock_bh(&__ip_vs_sched_lock);
214
215 IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
216
217 return 0;
218}
219
220
221/*
222 * Unregister a scheduler from the scheduler list
223 */
224int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
225{
226 if (!scheduler) {
227 IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
228 return -EINVAL;
229 }
230
231 write_lock_bh(&__ip_vs_sched_lock);
232 if (list_empty(&scheduler->n_list)) {
233 write_unlock_bh(&__ip_vs_sched_lock);
234 IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
235 "is not in the list. failed\n", scheduler->name);
236 return -EINVAL;
237 }
238
239 /*
240 * Remove it from the d-linked scheduler list
241 */
242 list_del(&scheduler->n_list);
243 write_unlock_bh(&__ip_vs_sched_lock);
244
245 /* decrease the module use count */
246 ip_vs_use_count_dec();
247
248 IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
249
250 return 0;
251}
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
deleted file mode 100644
index 77663d84cbd1..000000000000
--- a/net/ipv4/ipvs/ip_vs_sed.c
+++ /dev/null
@@ -1,161 +0,0 @@
1/*
2 * IPVS: Shortest Expected Delay scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 *
13 */
14
15/*
16 * The SED algorithm attempts to minimize each job's expected delay until
17 * completion. The expected delay that the job will experience is
18 * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
19 * jobs on the ith server and Ui is the fixed service rate (weight) of
 20 * the ith server. The SED algorithm adopts a greedy policy in which
 21 * each job does what is in its own best interest, i.e. it joins the
 22 * queue that would minimize its expected delay of completion.
23 *
24 * See the following paper for more information:
25 * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
26 * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
27 * pages 986-994, 1988.
28 *
29 * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
30 *
31 * The difference between SED and WLC is that SED includes the incoming
32 * job in the cost function (the increment of 1). SED may outperform
33 * WLC, while scheduling big jobs under larger heterogeneous systems
34 * (the server weight varies a lot).
35 *
36 */
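/*
 * A small worked example (illustrative numbers only): take server A with
 * 0 active connections and weight 1, and server B with 1 active connection
 * and weight 3.
 *
 *	SED cost:  A = (0 + 1) / 1 = 1.00,  B = (1 + 1) / 3 = 0.67  -> pick B
 *	Without the "+ 1" (WLC style):  A = 0 / 1,  B = 1 / 3      -> pick A
 *
 * Counting the incoming job itself thus steers it towards the faster
 * server.  The scheduler below avoids the division by cross-multiplying:
 * B replaces A as "least" when loh(A) * weight(B) > doh(B) * weight(A),
 * i.e. 1 * 3 > 2 * 1.
 */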
37
38#include <linux/module.h>
39#include <linux/kernel.h>
40
41#include <net/ip_vs.h>
42
43
44static int
45ip_vs_sed_init_svc(struct ip_vs_service *svc)
46{
47 return 0;
48}
49
50
51static int
52ip_vs_sed_done_svc(struct ip_vs_service *svc)
53{
54 return 0;
55}
56
57
58static int
59ip_vs_sed_update_svc(struct ip_vs_service *svc)
60{
61 return 0;
62}
63
64
65static inline unsigned int
66ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
67{
68 /*
69 * We only use the active connection number in the cost
70 * calculation here.
71 */
72 return atomic_read(&dest->activeconns) + 1;
73}
74
75
76/*
 77 * Shortest Expected Delay scheduling
78 */
79static struct ip_vs_dest *
80ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
81{
82 struct ip_vs_dest *dest, *least;
83 unsigned int loh, doh;
84
85 IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
86
87 /*
88 * We calculate the load of each dest server as follows:
89 * (server expected overhead) / dest->weight
90 *
91 * Remember -- no floats in kernel mode!!!
92 * The comparison of h1*w2 > h2*w1 is equivalent to that of
93 * h1/w1 > h2/w2
94 * if every weight is larger than zero.
95 *
96 * The server with weight=0 is quiesced and will not receive any
97 * new connections.
98 */
99
100 list_for_each_entry(dest, &svc->destinations, n_list) {
101 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
102 atomic_read(&dest->weight) > 0) {
103 least = dest;
104 loh = ip_vs_sed_dest_overhead(least);
105 goto nextstage;
106 }
107 }
108 return NULL;
109
110 /*
111 * Find the destination with the least load.
112 */
113 nextstage:
114 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
115 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
116 continue;
117 doh = ip_vs_sed_dest_overhead(dest);
118 if (loh * atomic_read(&dest->weight) >
119 doh * atomic_read(&least->weight)) {
120 least = dest;
121 loh = doh;
122 }
123 }
124
125 IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u "
126 "activeconns %d refcnt %d weight %d overhead %d\n",
127 NIPQUAD(least->addr), ntohs(least->port),
128 atomic_read(&least->activeconns),
129 atomic_read(&least->refcnt),
130 atomic_read(&least->weight), loh);
131
132 return least;
133}
134
135
136static struct ip_vs_scheduler ip_vs_sed_scheduler =
137{
138 .name = "sed",
139 .refcnt = ATOMIC_INIT(0),
140 .module = THIS_MODULE,
141 .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
142 .init_service = ip_vs_sed_init_svc,
143 .done_service = ip_vs_sed_done_svc,
144 .update_service = ip_vs_sed_update_svc,
145 .schedule = ip_vs_sed_schedule,
146};
147
148
149static int __init ip_vs_sed_init(void)
150{
151 return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
152}
153
154static void __exit ip_vs_sed_cleanup(void)
155{
156 unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
157}
158
159module_init(ip_vs_sed_init);
160module_exit(ip_vs_sed_cleanup);
161MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
deleted file mode 100644
index 7b979e228056..000000000000
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ /dev/null
@@ -1,255 +0,0 @@
1/*
2 * IPVS: Source Hashing scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 *
13 */
14
15/*
16 * The sh algorithm is to select server by the hash key of source IP
17 * address. The pseudo code is as follows:
18 *
19 * n <- servernode[src_ip];
20 * if (n is dead) OR
21 * (n is overloaded) or (n.weight <= 0) then
22 * return NULL;
23 *
24 * return n;
25 *
 26 * Note that servernode is a 256-bucket hash table that maps the hash
 27 * index derived from the packet source IP address to the current server
 28 * array. If the sh scheduler is used in a cache cluster, it is good to
29 * combine it with cache_bypass feature. When the statically assigned
30 * server is dead or overloaded, the load balancer can bypass the cache
31 * server and send requests to the original server directly.
32 *
33 */
34
35#include <linux/ip.h>
36#include <linux/module.h>
37#include <linux/kernel.h>
38#include <linux/skbuff.h>
39
40#include <net/ip_vs.h>
41
42
43/*
44 * IPVS SH bucket
45 */
46struct ip_vs_sh_bucket {
47 struct ip_vs_dest *dest; /* real server (cache) */
48};
49
50/*
51 * for IPVS SH entry hash table
52 */
53#ifndef CONFIG_IP_VS_SH_TAB_BITS
54#define CONFIG_IP_VS_SH_TAB_BITS 8
55#endif
56#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
57#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
58#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
59
60
61/*
62 * Returns hash value for IPVS SH entry
63 */
64static inline unsigned ip_vs_sh_hashkey(__be32 addr)
65{
66 return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
67}
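/*
 * The multiplier 2654435761 is the familiar multiplicative-hashing
 * constant (close to 2^32 divided by the golden ratio), so the host-order
 * source address is scrambled well and the low IP_VS_SH_TAB_BITS bits of
 * the product pick one of the 256 buckets.  Because the mapping depends
 * only on the client address, repeat connections from the same client keep
 * hitting the same bucket, and hence the same real server, until the table
 * is reassigned.
 */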
68
69
70/*
71 * Get ip_vs_dest associated with supplied parameters.
72 */
73static inline struct ip_vs_dest *
74ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr)
75{
76 return (tbl[ip_vs_sh_hashkey(addr)]).dest;
77}
78
79
80/*
81 * Assign all the hash buckets of the specified table with the service.
82 */
83static int
84ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
85{
86 int i;
87 struct ip_vs_sh_bucket *b;
88 struct list_head *p;
89 struct ip_vs_dest *dest;
90
91 b = tbl;
92 p = &svc->destinations;
93 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
94 if (list_empty(p)) {
95 b->dest = NULL;
96 } else {
97 if (p == &svc->destinations)
98 p = p->next;
99
100 dest = list_entry(p, struct ip_vs_dest, n_list);
101 atomic_inc(&dest->refcnt);
102 b->dest = dest;
103
104 p = p->next;
105 }
106 b++;
107 }
108 return 0;
109}
110
111
112/*
113 * Flush all the hash buckets of the specified table.
114 */
115static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
116{
117 int i;
118 struct ip_vs_sh_bucket *b;
119
120 b = tbl;
121 for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
122 if (b->dest) {
123 atomic_dec(&b->dest->refcnt);
124 b->dest = NULL;
125 }
126 b++;
127 }
128}
129
130
131static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
132{
133 struct ip_vs_sh_bucket *tbl;
134
135 /* allocate the SH table for this service */
136 tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
137 GFP_ATOMIC);
138 if (tbl == NULL) {
139 IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
140 return -ENOMEM;
141 }
142 svc->sched_data = tbl;
143 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
144 "current service\n",
145 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
146
147 /* assign the hash buckets with the updated service */
148 ip_vs_sh_assign(tbl, svc);
149
150 return 0;
151}
152
153
154static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
155{
156 struct ip_vs_sh_bucket *tbl = svc->sched_data;
157
158 /* got to clean up hash buckets here */
159 ip_vs_sh_flush(tbl);
160
161 /* release the table itself */
162 kfree(svc->sched_data);
163 IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
164 sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
165
166 return 0;
167}
168
169
170static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
171{
172 struct ip_vs_sh_bucket *tbl = svc->sched_data;
173
174 /* got to clean up hash buckets here */
175 ip_vs_sh_flush(tbl);
176
177 /* assign the hash buckets with the updated service */
178 ip_vs_sh_assign(tbl, svc);
179
180 return 0;
181}
182
183
184/*
185 * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
186 * consider that the server is overloaded here.
187 */
188static inline int is_overloaded(struct ip_vs_dest *dest)
189{
190 return dest->flags & IP_VS_DEST_F_OVERLOAD;
191}
192
193
194/*
195 * Source Hashing scheduling
196 */
197static struct ip_vs_dest *
198ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
199{
200 struct ip_vs_dest *dest;
201 struct ip_vs_sh_bucket *tbl;
202 struct iphdr *iph = ip_hdr(skb);
203
204 IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
205
206 tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
207 dest = ip_vs_sh_get(tbl, iph->saddr);
208 if (!dest
209 || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
210 || atomic_read(&dest->weight) <= 0
211 || is_overloaded(dest)) {
212 return NULL;
213 }
214
215 IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
216 "--> server %u.%u.%u.%u:%d\n",
217 NIPQUAD(iph->saddr),
218 NIPQUAD(dest->addr),
219 ntohs(dest->port));
220
221 return dest;
222}
223
224
225/*
226 * IPVS SH Scheduler structure
227 */
228static struct ip_vs_scheduler ip_vs_sh_scheduler =
229{
230 .name = "sh",
231 .refcnt = ATOMIC_INIT(0),
232 .module = THIS_MODULE,
233 .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
234 .init_service = ip_vs_sh_init_svc,
235 .done_service = ip_vs_sh_done_svc,
236 .update_service = ip_vs_sh_update_svc,
237 .schedule = ip_vs_sh_schedule,
238};
239
240
241static int __init ip_vs_sh_init(void)
242{
243 return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
244}
245
246
247static void __exit ip_vs_sh_cleanup(void)
248{
249 unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
250}
251
252
253module_init(ip_vs_sh_init);
254module_exit(ip_vs_sh_cleanup);
255MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
deleted file mode 100644
index a652da2c3200..000000000000
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ /dev/null
@@ -1,930 +0,0 @@
1/*
2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
6 * cluster of servers.
7 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 *
10 * ip_vs_sync: sync connection info from master load balancer to backups
11 * through multicast
12 *
13 * Changes:
14 * Alexandre Cassen : Added master & backup support at a time.
15 * Alexandre Cassen : Added SyncID support for incoming sync
16 * messages filtering.
17 * Justin Ossevoort : Fix endian problem on sync message size.
18 */
19
20#include <linux/module.h>
21#include <linux/slab.h>
22#include <linux/inetdevice.h>
23#include <linux/net.h>
24#include <linux/completion.h>
25#include <linux/delay.h>
26#include <linux/skbuff.h>
27#include <linux/in.h>
28#include <linux/igmp.h> /* for ip_mc_join_group */
29#include <linux/udp.h>
30#include <linux/err.h>
31#include <linux/kthread.h>
32#include <linux/wait.h>
33
34#include <net/ip.h>
35#include <net/sock.h>
36
37#include <net/ip_vs.h>
38
39#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
40#define IP_VS_SYNC_PORT 8848 /* multicast port */
41
42
43/*
44 * IPVS sync connection entry
45 */
46struct ip_vs_sync_conn {
47 __u8 reserved;
48
49 /* Protocol, addresses and port numbers */
50 __u8 protocol; /* Which protocol (TCP/UDP) */
51 __be16 cport;
52 __be16 vport;
53 __be16 dport;
54 __be32 caddr; /* client address */
55 __be32 vaddr; /* virtual address */
56 __be32 daddr; /* destination address */
57
58 /* Flags and state transition */
59 __be16 flags; /* status flags */
60 __be16 state; /* state info */
61
62 /* The sequence options start here */
63};
64
65struct ip_vs_sync_conn_options {
66 struct ip_vs_seq in_seq; /* incoming seq. struct */
67 struct ip_vs_seq out_seq; /* outgoing seq. struct */
68};
69
70struct ip_vs_sync_thread_data {
71 struct socket *sock;
72 char *buf;
73};
74
75#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
76#define FULL_CONN_SIZE \
77(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
78
79
80/*
 81 The master multicasts messages to the backup load balancers in the
82 following format.
83
84 0 1 2 3
85 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
86 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87 | Count Conns | SyncID | Size |
88 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89 | |
90 | IPVS Sync Connection (1) |
91 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
92 | . |
93 | . |
94 | . |
95 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
96 | |
97 | IPVS Sync Connection (n) |
98 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99*/
100
101#define SYNC_MESG_HEADER_LEN 4
102
103struct ip_vs_sync_mesg {
104 __u8 nr_conns;
105 __u8 syncid;
106 __u16 size;
107
108 /* ip_vs_sync_conn entries start here */
109};
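/*
 * With the fields above and no compiler padding, a simple entry
 * (struct ip_vs_sync_conn) is 24 bytes, so a datagram carrying the 4-byte
 * header plus, say, three simple connections is 4 + 3 * 24 = 76 bytes on
 * the wire.  Note that 'size' is kept in host byte order while the buffer
 * is being filled; it is converted with htons() just before transmission
 * (ip_vs_send_sync_msg) and back with ntohs() on receipt
 * (ip_vs_process_message).
 */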
110
111/* the maximum length of sync (sending/receiving) message */
112static int sync_send_mesg_maxlen;
113static int sync_recv_mesg_maxlen;
114
115struct ip_vs_sync_buff {
116 struct list_head list;
117 unsigned long firstuse;
118
119 /* pointers for the message data */
120 struct ip_vs_sync_mesg *mesg;
121 unsigned char *head;
122 unsigned char *end;
123};
124
125
126/* the sync_buff list head and the lock */
127static LIST_HEAD(ip_vs_sync_queue);
128static DEFINE_SPINLOCK(ip_vs_sync_lock);
129
130/* current sync_buff for accepting new conn entries */
131static struct ip_vs_sync_buff *curr_sb = NULL;
132static DEFINE_SPINLOCK(curr_sb_lock);
133
134/* ipvs sync daemon state */
135volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
136volatile int ip_vs_master_syncid = 0;
137volatile int ip_vs_backup_syncid = 0;
138
139/* multicast interface name */
140char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
141char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
142
143/* sync daemon tasks */
144static struct task_struct *sync_master_thread;
145static struct task_struct *sync_backup_thread;
146
147/* multicast addr */
148static struct sockaddr_in mcast_addr = {
149 .sin_family = AF_INET,
150 .sin_port = __constant_htons(IP_VS_SYNC_PORT),
151 .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP),
152};
153
154
155static inline struct ip_vs_sync_buff *sb_dequeue(void)
156{
157 struct ip_vs_sync_buff *sb;
158
159 spin_lock_bh(&ip_vs_sync_lock);
160 if (list_empty(&ip_vs_sync_queue)) {
161 sb = NULL;
162 } else {
163 sb = list_entry(ip_vs_sync_queue.next,
164 struct ip_vs_sync_buff,
165 list);
166 list_del(&sb->list);
167 }
168 spin_unlock_bh(&ip_vs_sync_lock);
169
170 return sb;
171}
172
173static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
174{
175 struct ip_vs_sync_buff *sb;
176
177 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
178 return NULL;
179
180 if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
181 kfree(sb);
182 return NULL;
183 }
184 sb->mesg->nr_conns = 0;
185 sb->mesg->syncid = ip_vs_master_syncid;
186 sb->mesg->size = 4;
187 sb->head = (unsigned char *)sb->mesg + 4;
188 sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
189 sb->firstuse = jiffies;
190 return sb;
191}
192
193static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
194{
195 kfree(sb->mesg);
196 kfree(sb);
197}
198
199static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
200{
201 spin_lock(&ip_vs_sync_lock);
202 if (ip_vs_sync_state & IP_VS_STATE_MASTER)
203 list_add_tail(&sb->list, &ip_vs_sync_queue);
204 else
205 ip_vs_sync_buff_release(sb);
206 spin_unlock(&ip_vs_sync_lock);
207}
208
209/*
210 * Get the current sync buffer if it has been created for more
211 * than the specified time or the specified time is zero.
212 */
213static inline struct ip_vs_sync_buff *
214get_curr_sync_buff(unsigned long time)
215{
216 struct ip_vs_sync_buff *sb;
217
218 spin_lock_bh(&curr_sb_lock);
219 if (curr_sb && (time == 0 ||
220 time_before(jiffies - curr_sb->firstuse, time))) {
221 sb = curr_sb;
222 curr_sb = NULL;
223 } else
224 sb = NULL;
225 spin_unlock_bh(&curr_sb_lock);
226 return sb;
227}
228
229
230/*
231 * Add an ip_vs_conn information into the current sync_buff.
232 * Called by ip_vs_in.
233 */
234void ip_vs_sync_conn(struct ip_vs_conn *cp)
235{
236 struct ip_vs_sync_mesg *m;
237 struct ip_vs_sync_conn *s;
238 int len;
239
240 spin_lock(&curr_sb_lock);
241 if (!curr_sb) {
242 if (!(curr_sb=ip_vs_sync_buff_create())) {
243 spin_unlock(&curr_sb_lock);
244 IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
245 return;
246 }
247 }
248
249 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
250 SIMPLE_CONN_SIZE;
251 m = curr_sb->mesg;
252 s = (struct ip_vs_sync_conn *)curr_sb->head;
253
254 /* copy members */
255 s->protocol = cp->protocol;
256 s->cport = cp->cport;
257 s->vport = cp->vport;
258 s->dport = cp->dport;
259 s->caddr = cp->caddr;
260 s->vaddr = cp->vaddr;
261 s->daddr = cp->daddr;
262 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
263 s->state = htons(cp->state);
264 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
265 struct ip_vs_sync_conn_options *opt =
266 (struct ip_vs_sync_conn_options *)&s[1];
267 memcpy(opt, &cp->in_seq, sizeof(*opt));
268 }
269
270 m->nr_conns++;
271 m->size += len;
272 curr_sb->head += len;
273
274 /* check if there is a space for next one */
275 if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
276 sb_queue_tail(curr_sb);
277 curr_sb = NULL;
278 }
279 spin_unlock(&curr_sb_lock);
280
 281 /* synchronize its controller if it has one */
282 if (cp->control)
283 ip_vs_sync_conn(cp->control);
284}
285
286
287/*
288 * Process received multicast message and create the corresponding
289 * ip_vs_conn entries.
290 */
291static void ip_vs_process_message(const char *buffer, const size_t buflen)
292{
293 struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
294 struct ip_vs_sync_conn *s;
295 struct ip_vs_sync_conn_options *opt;
296 struct ip_vs_conn *cp;
297 struct ip_vs_protocol *pp;
298 struct ip_vs_dest *dest;
299 char *p;
300 int i;
301
302 if (buflen < sizeof(struct ip_vs_sync_mesg)) {
303 IP_VS_ERR_RL("sync message header too short\n");
304 return;
305 }
306
307 /* Convert size back to host byte order */
308 m->size = ntohs(m->size);
309
310 if (buflen != m->size) {
311 IP_VS_ERR_RL("bogus sync message size\n");
312 return;
313 }
314
315 /* SyncID sanity check */
316 if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
317 IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
318 m->syncid);
319 return;
320 }
321
322 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
323 for (i=0; i<m->nr_conns; i++) {
324 unsigned flags, state;
325
326 if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
327 IP_VS_ERR_RL("bogus conn in sync message\n");
328 return;
329 }
330 s = (struct ip_vs_sync_conn *) p;
331 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
332 flags &= ~IP_VS_CONN_F_HASHED;
333 if (flags & IP_VS_CONN_F_SEQ_MASK) {
334 opt = (struct ip_vs_sync_conn_options *)&s[1];
335 p += FULL_CONN_SIZE;
336 if (p > buffer+buflen) {
337 IP_VS_ERR_RL("bogus conn options in sync message\n");
338 return;
339 }
340 } else {
341 opt = NULL;
342 p += SIMPLE_CONN_SIZE;
343 }
344
345 state = ntohs(s->state);
346 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
347 pp = ip_vs_proto_get(s->protocol);
348 if (!pp) {
349 IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
350 s->protocol);
351 continue;
352 }
353 if (state >= pp->num_states) {
354 IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
355 pp->name, state);
356 continue;
357 }
358 } else {
359 /* protocol in templates is not used for state/timeout */
360 pp = NULL;
361 if (state > 0) {
362 IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
363 state);
364 state = 0;
365 }
366 }
367
368 if (!(flags & IP_VS_CONN_F_TEMPLATE))
369 cp = ip_vs_conn_in_get(s->protocol,
370 s->caddr, s->cport,
371 s->vaddr, s->vport);
372 else
373 cp = ip_vs_ct_in_get(s->protocol,
374 s->caddr, s->cport,
375 s->vaddr, s->vport);
376 if (!cp) {
377 /*
378 * Find the appropriate destination for the connection.
379 * If it is not found the connection will remain unbound
380 * but still handled.
381 */
382 dest = ip_vs_find_dest(s->daddr, s->dport,
383 s->vaddr, s->vport,
384 s->protocol);
 385 /* Set the appropriate activity flag */
386 if (s->protocol == IPPROTO_TCP) {
387 if (state != IP_VS_TCP_S_ESTABLISHED)
388 flags |= IP_VS_CONN_F_INACTIVE;
389 else
390 flags &= ~IP_VS_CONN_F_INACTIVE;
391 }
392 cp = ip_vs_conn_new(s->protocol,
393 s->caddr, s->cport,
394 s->vaddr, s->vport,
395 s->daddr, s->dport,
396 flags, dest);
397 if (dest)
398 atomic_dec(&dest->refcnt);
399 if (!cp) {
400 IP_VS_ERR("ip_vs_conn_new failed\n");
401 return;
402 }
403 } else if (!cp->dest) {
404 dest = ip_vs_try_bind_dest(cp);
405 if (dest)
406 atomic_dec(&dest->refcnt);
407 } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
408 (cp->state != state)) {
409 /* update active/inactive flag for the connection */
410 dest = cp->dest;
411 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
412 (state != IP_VS_TCP_S_ESTABLISHED)) {
413 atomic_dec(&dest->activeconns);
414 atomic_inc(&dest->inactconns);
415 cp->flags |= IP_VS_CONN_F_INACTIVE;
416 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
417 (state == IP_VS_TCP_S_ESTABLISHED)) {
418 atomic_inc(&dest->activeconns);
419 atomic_dec(&dest->inactconns);
420 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
421 }
422 }
423
424 if (opt)
425 memcpy(&cp->in_seq, opt, sizeof(*opt));
426 atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
427 cp->state = state;
428 cp->old_state = cp->state;
429 /*
 430 * We cannot recover the right timeout for templates
 431 * in all cases, since we cannot find the right fwmark
 432 * virtual service. If needed, we can do it for
433 * non-fwmark persistent services.
434 */
435 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
436 cp->timeout = pp->timeout_table[state];
437 else
438 cp->timeout = (3*60*HZ);
439 ip_vs_conn_put(cp);
440 }
441}
442
443
444/*
 445 * Set up loopback of outgoing multicasts on a sending socket
446 */
447static void set_mcast_loop(struct sock *sk, u_char loop)
448{
449 struct inet_sock *inet = inet_sk(sk);
450
451 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
452 lock_sock(sk);
453 inet->mc_loop = loop ? 1 : 0;
454 release_sock(sk);
455}
456
457/*
458 * Specify TTL for outgoing multicasts on a sending socket
459 */
460static void set_mcast_ttl(struct sock *sk, u_char ttl)
461{
462 struct inet_sock *inet = inet_sk(sk);
463
464 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
465 lock_sock(sk);
466 inet->mc_ttl = ttl;
467 release_sock(sk);
468}
469
470/*
 471 * Specify the default interface for outgoing multicasts
472 */
473static int set_mcast_if(struct sock *sk, char *ifname)
474{
475 struct net_device *dev;
476 struct inet_sock *inet = inet_sk(sk);
477
478 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
479 return -ENODEV;
480
481 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
482 return -EINVAL;
483
484 lock_sock(sk);
485 inet->mc_index = dev->ifindex;
486 /* inet->mc_addr = 0; */
487 release_sock(sk);
488
489 return 0;
490}
491
492
493/*
494 * Set the maximum length of sync message according to the
495 * specified interface's MTU.
496 */
497static int set_sync_mesg_maxlen(int sync_state)
498{
499 struct net_device *dev;
500 int num;
501
502 if (sync_state == IP_VS_STATE_MASTER) {
503 if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
504 return -ENODEV;
505
506 num = (dev->mtu - sizeof(struct iphdr) -
507 sizeof(struct udphdr) -
508 SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
509 sync_send_mesg_maxlen =
510 SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
511 IP_VS_DBG(7, "setting the maximum length of sync sending "
512 "message %d.\n", sync_send_mesg_maxlen);
513 } else if (sync_state == IP_VS_STATE_BACKUP) {
514 if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
515 return -ENODEV;
516
517 sync_recv_mesg_maxlen = dev->mtu -
518 sizeof(struct iphdr) - sizeof(struct udphdr);
519 IP_VS_DBG(7, "setting the maximum length of sync receiving "
520 "message %d.\n", sync_recv_mesg_maxlen);
521 }
522
523 return 0;
524}
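/*
 * A worked example, assuming a standard 1500-byte Ethernet MTU and the
 * 24-byte ip_vs_sync_conn described earlier:
 *
 *	num = (1500 - 20 - 8 - 4 - 20) / 24 = 60 connections per datagram
 *	sync_send_mesg_maxlen = 4 + 60 * 24 = 1444 bytes
 *
 * which keeps every multicast datagram below the MTU, so it is never
 * fragmented; the extra "- 20" presumably leaves headroom for IP options.
 */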
525
526
527/*
528 * Join a multicast group.
 529 * The group is specified by a class D multicast address (224.0.0.0/4)
530 * in the in_addr structure passed in as a parameter.
531 */
532static int
533join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
534{
535 struct ip_mreqn mreq;
536 struct net_device *dev;
537 int ret;
538
539 memset(&mreq, 0, sizeof(mreq));
540 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
541
542 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
543 return -ENODEV;
544 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
545 return -EINVAL;
546
547 mreq.imr_ifindex = dev->ifindex;
548
549 lock_sock(sk);
550 ret = ip_mc_join_group(sk, &mreq);
551 release_sock(sk);
552
553 return ret;
554}
555
556
557static int bind_mcastif_addr(struct socket *sock, char *ifname)
558{
559 struct net_device *dev;
560 __be32 addr;
561 struct sockaddr_in sin;
562
563 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
564 return -ENODEV;
565
566 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
567 if (!addr)
 568 IP_VS_ERR("You probably need to specify an IP address on "
 569 	  "the multicast interface.\n");
570
571 IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
572 ifname, NIPQUAD(addr));
573
574 /* Now bind the socket with the address of multicast interface */
575 sin.sin_family = AF_INET;
576 sin.sin_addr.s_addr = addr;
577 sin.sin_port = 0;
578
579 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
580}
581
582/*
583 * Set up sending multicast socket over UDP
584 */
585static struct socket * make_send_sock(void)
586{
587 struct socket *sock;
588 int result;
589
590 /* First create a socket */
591 result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
592 if (result < 0) {
593 IP_VS_ERR("Error during creation of socket; terminating\n");
594 return ERR_PTR(result);
595 }
596
597 result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
598 if (result < 0) {
599 IP_VS_ERR("Error setting outbound mcast interface\n");
600 goto error;
601 }
602
603 set_mcast_loop(sock->sk, 0);
604 set_mcast_ttl(sock->sk, 1);
605
606 result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
607 if (result < 0) {
608 IP_VS_ERR("Error binding address of the mcast interface\n");
609 goto error;
610 }
611
612 result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
613 sizeof(struct sockaddr), 0);
614 if (result < 0) {
615 IP_VS_ERR("Error connecting to the multicast addr\n");
616 goto error;
617 }
618
619 return sock;
620
621 error:
622 sock_release(sock);
623 return ERR_PTR(result);
624}
625
626
627/*
628 * Set up receiving multicast socket over UDP
629 */
630static struct socket * make_receive_sock(void)
631{
632 struct socket *sock;
633 int result;
634
635 /* First create a socket */
636 result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
637 if (result < 0) {
638 IP_VS_ERR("Error during creation of socket; terminating\n");
639 return ERR_PTR(result);
640 }
641
642 /* it is equivalent to the REUSEADDR option in user-space */
643 sock->sk->sk_reuse = 1;
644
645 result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
646 sizeof(struct sockaddr));
647 if (result < 0) {
648 IP_VS_ERR("Error binding to the multicast addr\n");
649 goto error;
650 }
651
652 /* join the multicast group */
653 result = join_mcast_group(sock->sk,
654 (struct in_addr *) &mcast_addr.sin_addr,
655 ip_vs_backup_mcast_ifn);
656 if (result < 0) {
657 IP_VS_ERR("Error joining to the multicast group\n");
658 goto error;
659 }
660
661 return sock;
662
663 error:
664 sock_release(sock);
665 return ERR_PTR(result);
666}
667
668
669static int
670ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
671{
672 struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
673 struct kvec iov;
674 int len;
675
676 EnterFunction(7);
677 iov.iov_base = (void *)buffer;
678 iov.iov_len = length;
679
680 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
681
682 LeaveFunction(7);
683 return len;
684}
685
686static void
687ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
688{
689 int msize;
690
691 msize = msg->size;
692
693 /* Put size in network byte order */
694 msg->size = htons(msg->size);
695
696 if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
697 IP_VS_ERR("ip_vs_send_async error\n");
698}
699
700static int
701ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
702{
703 struct msghdr msg = {NULL,};
704 struct kvec iov;
705 int len;
706
707 EnterFunction(7);
708
709 /* Receive a packet */
710 iov.iov_base = buffer;
711 iov.iov_len = (size_t)buflen;
712
713 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
714
715 if (len < 0)
716 return -1;
717
718 LeaveFunction(7);
719 return len;
720}
721
722
723static int sync_thread_master(void *data)
724{
725 struct ip_vs_sync_thread_data *tinfo = data;
726 struct ip_vs_sync_buff *sb;
727
728 IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
729 "syncid = %d\n",
730 ip_vs_master_mcast_ifn, ip_vs_master_syncid);
731
732 while (!kthread_should_stop()) {
733 while ((sb = sb_dequeue())) {
734 ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
735 ip_vs_sync_buff_release(sb);
736 }
737
738 /* check if entries stay in curr_sb for 2 seconds */
739 sb = get_curr_sync_buff(2 * HZ);
740 if (sb) {
741 ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
742 ip_vs_sync_buff_release(sb);
743 }
744
745 schedule_timeout_interruptible(HZ);
746 }
747
748 /* clean up the sync_buff queue */
749 while ((sb=sb_dequeue())) {
750 ip_vs_sync_buff_release(sb);
751 }
752
753 /* clean up the current sync_buff */
754 if ((sb = get_curr_sync_buff(0))) {
755 ip_vs_sync_buff_release(sb);
756 }
757
758 /* release the sending multicast socket */
759 sock_release(tinfo->sock);
760 kfree(tinfo);
761
762 return 0;
763}
764
765
766static int sync_thread_backup(void *data)
767{
768 struct ip_vs_sync_thread_data *tinfo = data;
769 int len;
770
771 IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
772 "syncid = %d\n",
773 ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
774
775 while (!kthread_should_stop()) {
776 wait_event_interruptible(*tinfo->sock->sk->sk_sleep,
777 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
778 || kthread_should_stop());
779
780 /* do we have data now? */
781 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
782 len = ip_vs_receive(tinfo->sock, tinfo->buf,
783 sync_recv_mesg_maxlen);
784 if (len <= 0) {
785 IP_VS_ERR("receiving message error\n");
786 break;
787 }
788
 789 /* disable bottom halves, because this path touches data that is
 790 also accessed from softirq context while getting/creating conns */
791 local_bh_disable();
792 ip_vs_process_message(tinfo->buf, len);
793 local_bh_enable();
794 }
795 }
796
797 /* release the sending multicast socket */
798 sock_release(tinfo->sock);
799 kfree(tinfo->buf);
800 kfree(tinfo);
801
802 return 0;
803}
804
805
806int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
807{
808 struct ip_vs_sync_thread_data *tinfo;
809 struct task_struct **realtask, *task;
810 struct socket *sock;
811 char *name, *buf = NULL;
812 int (*threadfn)(void *data);
813 int result = -ENOMEM;
814
815 IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
816 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
817 sizeof(struct ip_vs_sync_conn));
818
819 if (state == IP_VS_STATE_MASTER) {
820 if (sync_master_thread)
821 return -EEXIST;
822
823 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
824 sizeof(ip_vs_master_mcast_ifn));
825 ip_vs_master_syncid = syncid;
826 realtask = &sync_master_thread;
827 name = "ipvs_syncmaster";
828 threadfn = sync_thread_master;
829 sock = make_send_sock();
830 } else if (state == IP_VS_STATE_BACKUP) {
831 if (sync_backup_thread)
832 return -EEXIST;
833
834 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
835 sizeof(ip_vs_backup_mcast_ifn));
836 ip_vs_backup_syncid = syncid;
837 realtask = &sync_backup_thread;
838 name = "ipvs_syncbackup";
839 threadfn = sync_thread_backup;
840 sock = make_receive_sock();
841 } else {
842 return -EINVAL;
843 }
844
845 if (IS_ERR(sock)) {
846 result = PTR_ERR(sock);
847 goto out;
848 }
849
850 set_sync_mesg_maxlen(state);
851 if (state == IP_VS_STATE_BACKUP) {
852 buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
853 if (!buf)
854 goto outsocket;
855 }
856
857 tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
858 if (!tinfo)
859 goto outbuf;
860
861 tinfo->sock = sock;
862 tinfo->buf = buf;
863
864 task = kthread_run(threadfn, tinfo, name);
865 if (IS_ERR(task)) {
866 result = PTR_ERR(task);
867 goto outtinfo;
868 }
869
870 /* mark as active */
871 *realtask = task;
872 ip_vs_sync_state |= state;
873
874 /* increase the module use count */
875 ip_vs_use_count_inc();
876
877 return 0;
878
879outtinfo:
880 kfree(tinfo);
881outbuf:
882 kfree(buf);
883outsocket:
884 sock_release(sock);
885out:
886 return result;
887}
888
889
890int stop_sync_thread(int state)
891{
892 IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current));
893
894 if (state == IP_VS_STATE_MASTER) {
895 if (!sync_master_thread)
896 return -ESRCH;
897
898 IP_VS_INFO("stopping master sync thread %d ...\n",
899 task_pid_nr(sync_master_thread));
900
901 /*
 902 * The lock synchronizes with sb_queue_tail(), so that we don't
 903 * add sync buffers to the queue while we are already in the
 904 * process of stopping the master sync daemon.
905 */
906
907 spin_lock_bh(&ip_vs_sync_lock);
908 ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
909 spin_unlock_bh(&ip_vs_sync_lock);
910 kthread_stop(sync_master_thread);
911 sync_master_thread = NULL;
912 } else if (state == IP_VS_STATE_BACKUP) {
913 if (!sync_backup_thread)
914 return -ESRCH;
915
916 IP_VS_INFO("stopping backup sync thread %d ...\n",
917 task_pid_nr(sync_backup_thread));
918
919 ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
920 kthread_stop(sync_backup_thread);
921 sync_backup_thread = NULL;
922 } else {
923 return -EINVAL;
924 }
925
926 /* decrease the module use count */
927 ip_vs_use_count_dec();
928
929 return 0;
930}
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
deleted file mode 100644
index 9b0ef86bb1f7..000000000000
--- a/net/ipv4/ipvs/ip_vs_wlc.c
+++ /dev/null
@@ -1,149 +0,0 @@
1/*
2 * IPVS: Weighted Least-Connection Scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Peter Kese <peter.kese@ijs.si>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
14 * Wensong Zhang : changed to use the inactconns in scheduling
15 * Wensong Zhang : changed some cosmetic things for debugging
16 * Wensong Zhang : changed for the d-linked destination list
17 * Wensong Zhang : added the ip_vs_wlc_update_svc
18 * Wensong Zhang : added any dest with weight=0 is quiesced
19 *
20 */
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24
25#include <net/ip_vs.h>
26
27
28static int
29ip_vs_wlc_init_svc(struct ip_vs_service *svc)
30{
31 return 0;
32}
33
34
35static int
36ip_vs_wlc_done_svc(struct ip_vs_service *svc)
37{
38 return 0;
39}
40
41
42static int
43ip_vs_wlc_update_svc(struct ip_vs_service *svc)
44{
45 return 0;
46}
47
48
49static inline unsigned int
50ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
51{
52 /*
53 * We think the overhead of processing active connections is 256
54 * times higher than that of inactive connections on average. (This
55 * 256 times might not be accurate, we will change it later) We
56 * use the following formula to estimate the overhead now:
57 * dest->activeconns*256 + dest->inactconns
58 */
59 return (atomic_read(&dest->activeconns) << 8) +
60 atomic_read(&dest->inactconns);
61}
62
63
64/*
65 * Weighted Least Connection scheduling
66 */
67static struct ip_vs_dest *
68ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
69{
70 struct ip_vs_dest *dest, *least;
71 unsigned int loh, doh;
72
73 IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
74
75 /*
76 * We calculate the load of each dest server as follows:
77 * (dest overhead) / dest->weight
78 *
79 * Remember -- no floats in kernel mode!!!
80 * The comparison of h1*w2 > h2*w1 is equivalent to that of
81 * h1/w1 > h2/w2
82 * if every weight is larger than zero.
83 *
84 * The server with weight=0 is quiesced and will not receive any
85 * new connections.
86 */
87
88 list_for_each_entry(dest, &svc->destinations, n_list) {
89 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
90 atomic_read(&dest->weight) > 0) {
91 least = dest;
92 loh = ip_vs_wlc_dest_overhead(least);
93 goto nextstage;
94 }
95 }
96 return NULL;
97
98 /*
99 * Find the destination with the least load.
100 */
101 nextstage:
102 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
103 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
104 continue;
105 doh = ip_vs_wlc_dest_overhead(dest);
106 if (loh * atomic_read(&dest->weight) >
107 doh * atomic_read(&least->weight)) {
108 least = dest;
109 loh = doh;
110 }
111 }
112
113 IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u "
114 "activeconns %d refcnt %d weight %d overhead %d\n",
115 NIPQUAD(least->addr), ntohs(least->port),
116 atomic_read(&least->activeconns),
117 atomic_read(&least->refcnt),
118 atomic_read(&least->weight), loh);
119
120 return least;
121}
122
123
124static struct ip_vs_scheduler ip_vs_wlc_scheduler =
125{
126 .name = "wlc",
127 .refcnt = ATOMIC_INIT(0),
128 .module = THIS_MODULE,
129 .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
130 .init_service = ip_vs_wlc_init_svc,
131 .done_service = ip_vs_wlc_done_svc,
132 .update_service = ip_vs_wlc_update_svc,
133 .schedule = ip_vs_wlc_schedule,
134};
135
136
137static int __init ip_vs_wlc_init(void)
138{
139 return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
140}
141
142static void __exit ip_vs_wlc_cleanup(void)
143{
144 unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
145}
146
147module_init(ip_vs_wlc_init);
148module_exit(ip_vs_wlc_cleanup);
149MODULE_LICENSE("GPL");
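
The comments in ip_vs_wlc_dest_overhead() and ip_vs_wlc_schedule() above describe the whole algorithm: estimate each destination's overhead as activeconns*256 + inactconns, then pick the destination with the smallest overhead/weight ratio, comparing loh/w1 against doh/w2 by cross-multiplication so no floating point is needed. The following standalone sketch reproduces that selection over a made-up server array; it ignores the IP_VS_DEST_F_OVERLOAD flag and all locking, and none of its names are kernel symbols.

/*
 * Standalone sketch of the WLC pick: cross-multiply instead of dividing,
 * so the comparison needs no floating point. Example data only.
 */
#include <stdio.h>

struct dest {
	const char *name;
	unsigned int activeconns;
	unsigned int inactconns;
	int weight;		/* weight == 0 means quiesced */
};

static unsigned int overhead(const struct dest *d)
{
	/* active connections assumed ~256x as expensive as inactive ones */
	return (d->activeconns << 8) + d->inactconns;
}

static const struct dest *wlc_pick(const struct dest *d, int n)
{
	const struct dest *least = NULL;
	unsigned int loh = 0, doh;
	int i;

	for (i = 0; i < n; i++) {
		if (d[i].weight <= 0)	/* quiesced, never selected */
			continue;
		doh = overhead(&d[i]);
		/* loh/least->weight > doh/d[i].weight, tested as
		 * loh * d[i].weight > doh * least->weight */
		if (!least ||
		    loh * (unsigned int)d[i].weight >
		    doh * (unsigned int)least->weight) {
			least = &d[i];
			loh = doh;
		}
	}
	return least;
}

int main(void)
{
	struct dest pool[] = {
		{ "rs1", 10, 300, 1 },
		{ "rs2", 12, 100, 3 },
		{ "rs3",  0,   0, 0 },	/* quiesced */
	};
	const struct dest *d = wlc_pick(pool, 3);

	printf("picked %s\n", d ? d->name : "(none)");	/* picks rs2 here */
	return 0;
}
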
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
deleted file mode 100644
index 0d86a79b87b5..000000000000
--- a/net/ipv4/ipvs/ip_vs_wrr.c
+++ /dev/null
@@ -1,234 +0,0 @@
1/*
2 * IPVS: Weighted Round-Robin Scheduling module
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
13 * Wensong Zhang : changed some cosmetic things for debugging
14 * Wensong Zhang : changed for the d-linked destination list
15 * Wensong Zhang : added the ip_vs_wrr_update_svc
16 * Julian Anastasov : fixed the bug of returning destination
17 * with weight 0 when all weights are zero
18 *
19 */
20
21#include <linux/module.h>
22#include <linux/kernel.h>
23#include <linux/net.h>
24
25#include <net/ip_vs.h>
26
27/*
28 * current destination pointer for weighted round-robin scheduling
29 */
30struct ip_vs_wrr_mark {
31 struct list_head *cl; /* current list head */
32 int cw; /* current weight */
33 int mw; /* maximum weight */
34 int di; /* decreasing interval */
35};
36
37
38/*
39 * Get the gcd of server weights
40 */
41static int gcd(int a, int b)
42{
43 int c;
44
45 while ((c = a % b)) {
46 a = b;
47 b = c;
48 }
49 return b;
50}
51
52static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
53{
54 struct ip_vs_dest *dest;
55 int weight;
56 int g = 0;
57
58 list_for_each_entry(dest, &svc->destinations, n_list) {
59 weight = atomic_read(&dest->weight);
60 if (weight > 0) {
61 if (g > 0)
62 g = gcd(weight, g);
63 else
64 g = weight;
65 }
66 }
67 return g ? g : 1;
68}
69
70
71/*
72 * Get the maximum weight of the service destinations.
73 */
74static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
75{
76 struct ip_vs_dest *dest;
77 int weight = 0;
78
79 list_for_each_entry(dest, &svc->destinations, n_list) {
80 if (atomic_read(&dest->weight) > weight)
81 weight = atomic_read(&dest->weight);
82 }
83
84 return weight;
85}
86
87
88static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
89{
90 struct ip_vs_wrr_mark *mark;
91
92 /*
93 * Allocate the mark variable for WRR scheduling
94 */
95 mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
96 if (mark == NULL) {
97 IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
98 return -ENOMEM;
99 }
100 mark->cl = &svc->destinations;
101 mark->cw = 0;
102 mark->mw = ip_vs_wrr_max_weight(svc);
103 mark->di = ip_vs_wrr_gcd_weight(svc);
104 svc->sched_data = mark;
105
106 return 0;
107}
108
109
110static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
111{
112 /*
113 * Release the mark variable
114 */
115 kfree(svc->sched_data);
116
117 return 0;
118}
119
120
121static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
122{
123 struct ip_vs_wrr_mark *mark = svc->sched_data;
124
125 mark->cl = &svc->destinations;
126 mark->mw = ip_vs_wrr_max_weight(svc);
127 mark->di = ip_vs_wrr_gcd_weight(svc);
128 if (mark->cw > mark->mw)
129 mark->cw = 0;
130 return 0;
131}
132
133
134/*
135 * Weighted Round-Robin Scheduling
136 */
137static struct ip_vs_dest *
138ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
139{
140 struct ip_vs_dest *dest;
141 struct ip_vs_wrr_mark *mark = svc->sched_data;
142 struct list_head *p;
143
144 IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
145
146 /*
147 * This loop will always terminate, because mark->cw in (0, max_weight]
148 * and at least one server has its weight equal to max_weight.
149 */
150 write_lock(&svc->sched_lock);
151 p = mark->cl;
152 while (1) {
153 if (mark->cl == &svc->destinations) {
154 /* it is at the head of the destination list */
155
156 if (mark->cl == mark->cl->next) {
157 /* no dest entry */
158 dest = NULL;
159 goto out;
160 }
161
162 mark->cl = svc->destinations.next;
163 mark->cw -= mark->di;
164 if (mark->cw <= 0) {
165 mark->cw = mark->mw;
166 /*
167 * Still zero, which means no available servers.
168 */
169 if (mark->cw == 0) {
170 mark->cl = &svc->destinations;
171 IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
172 "no available servers\n");
173 dest = NULL;
174 goto out;
175 }
176 }
177 } else
178 mark->cl = mark->cl->next;
179
180 if (mark->cl != &svc->destinations) {
181 /* not at the head of the list */
182 dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
183 if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
184 atomic_read(&dest->weight) >= mark->cw) {
185 /* got it */
186 break;
187 }
188 }
189
190 if (mark->cl == p && mark->cw == mark->di) {
191 /* back to the start, and no dest is found.
192 It is only possible when all dests are OVERLOADED */
193 dest = NULL;
194 goto out;
195 }
196 }
197
198 IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
199 "activeconns %d refcnt %d weight %d\n",
200 NIPQUAD(dest->addr), ntohs(dest->port),
201 atomic_read(&dest->activeconns),
202 atomic_read(&dest->refcnt),
203 atomic_read(&dest->weight));
204
205 out:
206 write_unlock(&svc->sched_lock);
207 return dest;
208}
209
210
211static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
212 .name = "wrr",
213 .refcnt = ATOMIC_INIT(0),
214 .module = THIS_MODULE,
215 .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
216 .init_service = ip_vs_wrr_init_svc,
217 .done_service = ip_vs_wrr_done_svc,
218 .update_service = ip_vs_wrr_update_svc,
219 .schedule = ip_vs_wrr_schedule,
220};
221
222static int __init ip_vs_wrr_init(void)
223{
224 return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
225}
226
227static void __exit ip_vs_wrr_cleanup(void)
228{
229 unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
230}
231
232module_init(ip_vs_wrr_init);
233module_exit(ip_vs_wrr_cleanup);
234MODULE_LICENSE("GPL");
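
The scheduling loop above is the classic interleaved weighted round-robin: a current weight cw steps down by the gcd of all weights each time the walk wraps past the list head, and any destination whose weight is >= cw is eligible, so heavier servers are picked proportionally more often yet interleaved rather than in a burst. The self-contained sketch below replays that walk over a fixed array; the weights are hypothetical and the all-servers-overloaded guard from the kernel loop is omitted.

/*
 * Userspace sketch of the interleaved weighted round-robin walk
 * (same idea as the mark->cl / mark->cw logic above, arrays instead
 * of a linked list). Weights are example values.
 */
#include <stdio.h>

static int gcd(int a, int b)
{
	int c;

	while ((c = a % b)) {
		a = b;
		b = c;
	}
	return b;
}

int main(void)
{
	const char *name[] = { "rs1", "rs2", "rs3" };
	const int weight[] = { 4, 2, 1 };
	const int n = 3;

	int mw = 4;				/* maximum weight           */
	int di = gcd(gcd(weight[0], weight[1]), weight[2]);	/* step = 1 */
	int cw = 0;				/* current weight           */
	int cl = -1;				/* current index, -1 = head */
	int picked = 0;

	while (picked < 14) {			/* two full cycles of 4+2+1 picks */
		if (cl == -1) {
			/* wrapped: step the current weight down by the gcd */
			cl = 0;
			cw -= di;
			if (cw <= 0)
				cw = mw;	/* mw == 0 would mean no servers */
		} else if (++cl == n) {
			cl = -1;		/* back at the head, nothing to pick */
			continue;
		}
		if (weight[cl] >= cw) {
			printf("%s ", name[cl]);
			picked++;
		}
	}
	/* prints: rs1 rs1 rs1 rs2 rs1 rs2 rs3, twice */
	printf("\n");
	return 0;
}
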
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
deleted file mode 100644
index 9892d4aca42e..000000000000
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ /dev/null
@@ -1,559 +0,0 @@
1/*
2 * ip_vs_xmit.c: various packet transmitters for IPVS
3 *
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 *
12 * Changes:
13 *
14 */
15
16#include <linux/kernel.h>
17#include <linux/tcp.h> /* for tcphdr */
18#include <net/ip.h>
19#include <net/tcp.h> /* for csum_tcpudp_magic */
20#include <net/udp.h>
21#include <net/icmp.h> /* for icmp_send */
22#include <net/route.h> /* for ip_route_output */
23#include <linux/netfilter.h>
24#include <linux/netfilter_ipv4.h>
25
26#include <net/ip_vs.h>
27
28
29/*
30 * Destination cache to speed up outgoing route lookup
31 */
32static inline void
33__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
34{
35 struct dst_entry *old_dst;
36
37 old_dst = dest->dst_cache;
38 dest->dst_cache = dst;
39 dest->dst_rtos = rtos;
40 dst_release(old_dst);
41}
42
43static inline struct dst_entry *
44__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
45{
46 struct dst_entry *dst = dest->dst_cache;
47
48 if (!dst)
49 return NULL;
50 if ((dst->obsolete || rtos != dest->dst_rtos) &&
51 dst->ops->check(dst, cookie) == NULL) {
52 dest->dst_cache = NULL;
53 dst_release(dst);
54 return NULL;
55 }
56 dst_hold(dst);
57 return dst;
58}
59
60static struct rtable *
61__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
62{
63 struct rtable *rt; /* Route to the other host */
64 struct ip_vs_dest *dest = cp->dest;
65
66 if (dest) {
67 spin_lock(&dest->dst_lock);
68 if (!(rt = (struct rtable *)
69 __ip_vs_dst_check(dest, rtos, 0))) {
70 struct flowi fl = {
71 .oif = 0,
72 .nl_u = {
73 .ip4_u = {
74 .daddr = dest->addr,
75 .saddr = 0,
76 .tos = rtos, } },
77 };
78
79 if (ip_route_output_key(&init_net, &rt, &fl)) {
80 spin_unlock(&dest->dst_lock);
81 IP_VS_DBG_RL("ip_route_output error, "
82 "dest: %u.%u.%u.%u\n",
83 NIPQUAD(dest->addr));
84 return NULL;
85 }
86 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
87 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
88 NIPQUAD(dest->addr),
89 atomic_read(&rt->u.dst.__refcnt), rtos);
90 }
91 spin_unlock(&dest->dst_lock);
92 } else {
93 struct flowi fl = {
94 .oif = 0,
95 .nl_u = {
96 .ip4_u = {
97 .daddr = cp->daddr,
98 .saddr = 0,
99 .tos = rtos, } },
100 };
101
102 if (ip_route_output_key(&init_net, &rt, &fl)) {
103 IP_VS_DBG_RL("ip_route_output error, dest: "
104 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
105 return NULL;
106 }
107 }
108
109 return rt;
110}
111
112
113/*
114 * Release dest->dst_cache before a dest is removed
115 */
116void
117ip_vs_dst_reset(struct ip_vs_dest *dest)
118{
119 struct dst_entry *old_dst;
120
121 old_dst = dest->dst_cache;
122 dest->dst_cache = NULL;
123 dst_release(old_dst);
124}
125
126#define IP_VS_XMIT(skb, rt) \
127do { \
128 (skb)->ipvs_property = 1; \
129 skb_forward_csum(skb); \
130 NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL, \
131 (rt)->u.dst.dev, dst_output); \
132} while (0)
133
134
135/*
136 * NULL transmitter (do nothing except return NF_ACCEPT)
137 */
138int
139ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
140 struct ip_vs_protocol *pp)
141{
142 /* we do not touch skb and do not need pskb ptr */
143 return NF_ACCEPT;
144}
145
146
147/*
148 * Bypass transmitter
149 * Let packets bypass the destination when the destination is not
150 * available; it may only be used in a transparent cache cluster.
151 */
152int
153ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
154 struct ip_vs_protocol *pp)
155{
156 struct rtable *rt; /* Route to the other host */
157 struct iphdr *iph = ip_hdr(skb);
158 u8 tos = iph->tos;
159 int mtu;
160 struct flowi fl = {
161 .oif = 0,
162 .nl_u = {
163 .ip4_u = {
164 .daddr = iph->daddr,
165 .saddr = 0,
166 .tos = RT_TOS(tos), } },
167 };
168
169 EnterFunction(10);
170
171 if (ip_route_output_key(&init_net, &rt, &fl)) {
172 IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
173 "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
174 goto tx_error_icmp;
175 }
176
177 /* MTU checking */
178 mtu = dst_mtu(&rt->u.dst);
179 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
180 ip_rt_put(rt);
181 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
182 IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
183 goto tx_error;
184 }
185
186 /*
187 * Call ip_send_check because we are not sure it is called
188 * after ip_defrag. Is copy-on-write needed?
189 */
190 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
191 ip_rt_put(rt);
192 return NF_STOLEN;
193 }
194 ip_send_check(ip_hdr(skb));
195
196 /* drop old route */
197 dst_release(skb->dst);
198 skb->dst = &rt->u.dst;
199
200 /* Another hack: avoid icmp_send in ip_fragment */
201 skb->local_df = 1;
202
203 IP_VS_XMIT(skb, rt);
204
205 LeaveFunction(10);
206 return NF_STOLEN;
207
208 tx_error_icmp:
209 dst_link_failure(skb);
210 tx_error:
211 kfree_skb(skb);
212 LeaveFunction(10);
213 return NF_STOLEN;
214}
215
216
217/*
218 * NAT transmitter (only for outside-to-inside nat forwarding)
219 * Not used for related ICMP
220 */
221int
222ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
223 struct ip_vs_protocol *pp)
224{
225 struct rtable *rt; /* Route to the other host */
226 int mtu;
227 struct iphdr *iph = ip_hdr(skb);
228
229 EnterFunction(10);
230
231 /* check if it is a connection of no-client-port */
232 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
233 __be16 _pt, *p;
234 p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
235 if (p == NULL)
236 goto tx_error;
237 ip_vs_conn_fill_cport(cp, *p);
238 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
239 }
240
241 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
242 goto tx_error_icmp;
243
244 /* MTU checking */
245 mtu = dst_mtu(&rt->u.dst);
246 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
247 ip_rt_put(rt);
248 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
249 IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
250 goto tx_error;
251 }
252
253 /* copy-on-write the packet before mangling it */
254 if (!skb_make_writable(skb, sizeof(struct iphdr)))
255 goto tx_error_put;
256
257 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
258 goto tx_error_put;
259
260 /* drop old route */
261 dst_release(skb->dst);
262 skb->dst = &rt->u.dst;
263
264 /* mangle the packet */
265 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
266 goto tx_error;
267 ip_hdr(skb)->daddr = cp->daddr;
268 ip_send_check(ip_hdr(skb));
269
270 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
271
272 /* FIXME: when application helper enlarges the packet and the length
273 is larger than the MTU of the outgoing device, there will still be
274 an MTU problem. */
275
276 /* Another hack: avoid icmp_send in ip_fragment */
277 skb->local_df = 1;
278
279 IP_VS_XMIT(skb, rt);
280
281 LeaveFunction(10);
282 return NF_STOLEN;
283
284 tx_error_icmp:
285 dst_link_failure(skb);
286 tx_error:
287 LeaveFunction(10);
288 kfree_skb(skb);
289 return NF_STOLEN;
290 tx_error_put:
291 ip_rt_put(rt);
292 goto tx_error;
293}
294
295
296/*
297 * IP Tunneling transmitter
298 *
299 * This function encapsulates the packet in a new IP packet, its
300 * destination will be set to cp->daddr. Most code of this function
301 * is taken from ipip.c.
302 *
303 * It is used in VS/TUN cluster. The load balancer selects a real
304 * server from a cluster based on a scheduling algorithm,
305 * encapsulates the request packet and forwards it to the selected
306 * server. For example, all real servers are configured with
307 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
308 * the encapsulated packet, it will decapsulate the packet, process
309 * the request and return the response packets directly to the client
310 * without passing the load balancer. This can greatly increase the
311 * scalability of the virtual server.
312 *
313 * Used for ANY protocol
314 */
315int
316ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
317 struct ip_vs_protocol *pp)
318{
319 struct rtable *rt; /* Route to the other host */
320 struct net_device *tdev; /* Device to other host */
321 struct iphdr *old_iph = ip_hdr(skb);
322 u8 tos = old_iph->tos;
323 __be16 df = old_iph->frag_off;
324 sk_buff_data_t old_transport_header = skb->transport_header;
325 struct iphdr *iph; /* Our new IP header */
326 unsigned int max_headroom; /* The extra header space needed */
327 int mtu;
328
329 EnterFunction(10);
330
331 if (skb->protocol != htons(ETH_P_IP)) {
332 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
333 "ETH_P_IP: %d, skb protocol: %d\n",
334 htons(ETH_P_IP), skb->protocol);
335 goto tx_error;
336 }
337
338 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
339 goto tx_error_icmp;
340
341 tdev = rt->u.dst.dev;
342
343 mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
344 if (mtu < 68) {
345 ip_rt_put(rt);
346 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
347 goto tx_error;
348 }
349 if (skb->dst)
350 skb->dst->ops->update_pmtu(skb->dst, mtu);
351
352 df |= (old_iph->frag_off & htons(IP_DF));
353
354 if ((old_iph->frag_off & htons(IP_DF))
355 && mtu < ntohs(old_iph->tot_len)) {
356 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
357 ip_rt_put(rt);
358 IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
359 goto tx_error;
360 }
361
362 /*
363 * Okay, now see if we can stuff it in the buffer as-is.
364 */
365 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
366
367 if (skb_headroom(skb) < max_headroom
368 || skb_cloned(skb) || skb_shared(skb)) {
369 struct sk_buff *new_skb =
370 skb_realloc_headroom(skb, max_headroom);
371 if (!new_skb) {
372 ip_rt_put(rt);
373 kfree_skb(skb);
374 IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
375 return NF_STOLEN;
376 }
377 kfree_skb(skb);
378 skb = new_skb;
379 old_iph = ip_hdr(skb);
380 }
381
382 skb->transport_header = old_transport_header;
383
384 /* fix old IP header checksum */
385 ip_send_check(old_iph);
386
387 skb_push(skb, sizeof(struct iphdr));
388 skb_reset_network_header(skb);
389 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
390
391 /* drop old route */
392 dst_release(skb->dst);
393 skb->dst = &rt->u.dst;
394
395 /*
396 * Push down and install the IPIP header.
397 */
398 iph = ip_hdr(skb);
399 iph->version = 4;
400 iph->ihl = sizeof(struct iphdr)>>2;
401 iph->frag_off = df;
402 iph->protocol = IPPROTO_IPIP;
403 iph->tos = tos;
404 iph->daddr = rt->rt_dst;
405 iph->saddr = rt->rt_src;
406 iph->ttl = old_iph->ttl;
407 ip_select_ident(iph, &rt->u.dst, NULL);
408
409 /* Another hack: avoid icmp_send in ip_fragment */
410 skb->local_df = 1;
411
412 ip_local_out(skb);
413
414 LeaveFunction(10);
415
416 return NF_STOLEN;
417
418 tx_error_icmp:
419 dst_link_failure(skb);
420 tx_error:
421 kfree_skb(skb);
422 LeaveFunction(10);
423 return NF_STOLEN;
424}
425
426
427/*
428 * Direct Routing transmitter
429 * Used for ANY protocol
430 */
431int
432ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
433 struct ip_vs_protocol *pp)
434{
435 struct rtable *rt; /* Route to the other host */
436 struct iphdr *iph = ip_hdr(skb);
437 int mtu;
438
439 EnterFunction(10);
440
441 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
442 goto tx_error_icmp;
443
444 /* MTU checking */
445 mtu = dst_mtu(&rt->u.dst);
446 if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
447 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
448 ip_rt_put(rt);
449 IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
450 goto tx_error;
451 }
452
453 /*
454 * Call ip_send_check because we are not sure it is called
455 * after ip_defrag. Is copy-on-write needed?
456 */
457 if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
458 ip_rt_put(rt);
459 return NF_STOLEN;
460 }
461 ip_send_check(ip_hdr(skb));
462
463 /* drop old route */
464 dst_release(skb->dst);
465 skb->dst = &rt->u.dst;
466
467 /* Another hack: avoid icmp_send in ip_fragment */
468 skb->local_df = 1;
469
470 IP_VS_XMIT(skb, rt);
471
472 LeaveFunction(10);
473 return NF_STOLEN;
474
475 tx_error_icmp:
476 dst_link_failure(skb);
477 tx_error:
478 kfree_skb(skb);
479 LeaveFunction(10);
480 return NF_STOLEN;
481}
482
483
484/*
485 * ICMP packet transmitter
486 * called by the ip_vs_in_icmp
487 */
488int
489ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
490 struct ip_vs_protocol *pp, int offset)
491{
492 struct rtable *rt; /* Route to the other host */
493 int mtu;
494 int rc;
495
496 EnterFunction(10);
497
498 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
499 forwarded directly here, because there is no need to
500 translate address/port back */
501 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
502 if (cp->packet_xmit)
503 rc = cp->packet_xmit(skb, cp, pp);
504 else
505 rc = NF_ACCEPT;
506 /* do not touch skb anymore */
507 atomic_inc(&cp->in_pkts);
508 goto out;
509 }
510
511 /*
512 * mangle and send the packet here (only for VS/NAT)
513 */
514
515 if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
516 goto tx_error_icmp;
517
518 /* MTU checking */
519 mtu = dst_mtu(&rt->u.dst);
520 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
521 ip_rt_put(rt);
522 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
523 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
524 goto tx_error;
525 }
526
527 /* copy-on-write the packet before mangling it */
528 if (!skb_make_writable(skb, offset))
529 goto tx_error_put;
530
531 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
532 goto tx_error_put;
533
534 /* drop the old route when skb is not shared */
535 dst_release(skb->dst);
536 skb->dst = &rt->u.dst;
537
538 ip_vs_nat_icmp(skb, pp, cp, 0);
539
540 /* Another hack: avoid icmp_send in ip_fragment */
541 skb->local_df = 1;
542
543 IP_VS_XMIT(skb, rt);
544
545 rc = NF_STOLEN;
546 goto out;
547
548 tx_error_icmp:
549 dst_link_failure(skb);
550 tx_error:
551 dev_kfree_skb(skb);
552 rc = NF_STOLEN;
553 out:
554 LeaveFunction(10);
555 return rc;
556 tx_error_put:
557 ip_rt_put(rt);
558 goto tx_error;
559}
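
ip_vs_tunnel_xmit() above realizes VS/TUN by pushing a new outer IPv4 header in front of the original datagram, protocol IPPROTO_IPIP, with the director as source and the chosen real server as destination; the inner packet only gets its checksum refreshed. The sketch below builds such a 20-byte outer header in memory as a userspace illustration: the addresses, TTL and payload size are made-up example values, and nothing is routed or transmitted.

/*
 * Userspace illustration of the IPIP encapsulation step performed by
 * ip_vs_tunnel_xmit(); every field value below is example data only.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/ip.h>

static uint16_t ip_checksum(const void *data, size_t len)
{
	const uint16_t *p = data;
	uint32_t sum = 0;

	for (; len > 1; len -= 2)
		sum += *p++;
	if (len)
		sum += *(const uint8_t *)p;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	struct {
		struct iphdr outer;		/* the header the transmitter pushes   */
		unsigned char inner[84];	/* stands in for the original datagram */
	} pkt;

	memset(&pkt, 0, sizeof(pkt));
	pkt.outer.version  = 4;
	pkt.outer.ihl      = sizeof(struct iphdr) >> 2;
	pkt.outer.tos      = 0;				/* IPVS copies the inner TOS  */
	pkt.outer.tot_len  = htons((uint16_t)sizeof(pkt));
	pkt.outer.frag_off = htons(IP_DF);		/* DF inherited from the inner packet */
	pkt.outer.ttl      = 64;			/* IPVS copies the inner TTL  */
	pkt.outer.protocol = IPPROTO_IPIP;
	pkt.outer.saddr    = inet_addr("192.0.2.1");	/* director (example address)    */
	pkt.outer.daddr    = inet_addr("192.0.2.10");	/* real server (example address) */
	pkt.outer.check    = ip_checksum(&pkt.outer, sizeof(pkt.outer));

	printf("outer header: %zu bytes, protocol %u, checksum 0x%04x\n",
	       sizeof(pkt.outer), pkt.outer.protocol, ntohs(pkt.outer.check));
	return 0;
}
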
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index f8edacdf991d..6efdb70b3eb2 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -12,6 +12,7 @@
12/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ 12/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
13int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) 13int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
14{ 14{
15 struct net *net = dev_net(skb->dst->dev);
15 const struct iphdr *iph = ip_hdr(skb); 16 const struct iphdr *iph = ip_hdr(skb);
16 struct rtable *rt; 17 struct rtable *rt;
17 struct flowi fl = {}; 18 struct flowi fl = {};
@@ -19,7 +20,9 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
19 unsigned int hh_len; 20 unsigned int hh_len;
20 unsigned int type; 21 unsigned int type;
21 22
22 type = inet_addr_type(&init_net, iph->saddr); 23 type = inet_addr_type(net, iph->saddr);
24 if (skb->sk && inet_sk(skb->sk)->transparent)
25 type = RTN_LOCAL;
23 if (addr_type == RTN_UNSPEC) 26 if (addr_type == RTN_UNSPEC)
24 addr_type = type; 27 addr_type = type;
25 28
@@ -33,7 +36,8 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
33 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); 36 fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
34 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; 37 fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
35 fl.mark = skb->mark; 38 fl.mark = skb->mark;
36 if (ip_route_output_key(&init_net, &rt, &fl) != 0) 39 fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
40 if (ip_route_output_key(net, &rt, &fl) != 0)
37 return -1; 41 return -1;
38 42
39 /* Drop old route. */ 43 /* Drop old route. */
@@ -43,7 +47,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
43 /* non-local src, find valid iif to satisfy 47 /* non-local src, find valid iif to satisfy
44 * rp-filter when calling ip_route_input. */ 48 * rp-filter when calling ip_route_input. */
45 fl.nl_u.ip4_u.daddr = iph->saddr; 49 fl.nl_u.ip4_u.daddr = iph->saddr;
46 if (ip_route_output_key(&init_net, &rt, &fl) != 0) 50 if (ip_route_output_key(net, &rt, &fl) != 0)
47 return -1; 51 return -1;
48 52
49 odst = skb->dst; 53 odst = skb->dst;
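
The hunk above makes two changes: the route lookup now runs in the skb's own network namespace instead of init_net, and a packet owned by a transparent socket is treated as locally sourced (type = RTN_LOCAL) so re-routing does not discard its non-local source address. On the userspace side, the `transparent` flag it checks corresponds to the IP_TRANSPARENT socket option; the snippet below is only a sketch of setting that option (it needs CAP_NET_ADMIN to succeed).

/*
 * Userspace side of the check added above: a socket gets its "transparent"
 * flag from the IP_TRANSPARENT option. Sketch only; requires CAP_NET_ADMIN.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IP_TRANSPARENT
#define IP_TRANSPARENT 19	/* value from linux/in.h, if libc headers lack it */
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, IPPROTO_IP, IP_TRANSPARENT, &one, sizeof(one)) < 0)
		perror("setsockopt(IP_TRANSPARENT)");	/* typically EPERM without CAP_NET_ADMIN */
	else
		printf("socket %d is now transparent\n", fd);
	return 0;
}
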
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 90eb7cb47e77..3816e1dc9295 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -5,10 +5,15 @@
5menu "IP: Netfilter Configuration" 5menu "IP: Netfilter Configuration"
6 depends on INET && NETFILTER 6 depends on INET && NETFILTER
7 7
8config NF_DEFRAG_IPV4
9 tristate
10 default n
11
8config NF_CONNTRACK_IPV4 12config NF_CONNTRACK_IPV4
9 tristate "IPv4 connection tracking support (required for NAT)" 13 tristate "IPv4 connection tracking support (required for NAT)"
10 depends on NF_CONNTRACK 14 depends on NF_CONNTRACK
11 default m if NETFILTER_ADVANCED=n 15 default m if NETFILTER_ADVANCED=n
16 select NF_DEFRAG_IPV4
12 ---help--- 17 ---help---
13 Connection tracking keeps a record of what packets have passed 18 Connection tracking keeps a record of what packets have passed
14 through your machine, in order to figure out how they are related 19 through your machine, in order to figure out how they are related
@@ -56,23 +61,30 @@ config IP_NF_IPTABLES
56 61
57 To compile it as a module, choose M here. If unsure, say N. 62 To compile it as a module, choose M here. If unsure, say N.
58 63
64if IP_NF_IPTABLES
65
59# The matches. 66# The matches.
60config IP_NF_MATCH_RECENT 67config IP_NF_MATCH_ADDRTYPE
61 tristate '"recent" match support' 68 tristate '"addrtype" address type match support'
62 depends on IP_NF_IPTABLES
63 depends on NETFILTER_ADVANCED 69 depends on NETFILTER_ADVANCED
64 help 70 help
65 This match is used for creating one or many lists of recently 71 This option allows you to match what routing thinks of an address,
66 used addresses and then matching against that/those list(s). 72 eg. UNICAST, LOCAL, BROADCAST, ...
67 73
68 Short options are available by using 'iptables -m recent -h' 74 If you want to compile it as a module, say M here and read
69 Official Website: <http://snowman.net/projects/ipt_recent/> 75 <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
76
77config IP_NF_MATCH_AH
78 tristate '"ah" match support'
79 depends on NETFILTER_ADVANCED
80 help
81 This match extension allows you to match a range of SPIs
82 inside AH header of IPSec packets.
70 83
71 To compile it as a module, choose M here. If unsure, say N. 84 To compile it as a module, choose M here. If unsure, say N.
72 85
73config IP_NF_MATCH_ECN 86config IP_NF_MATCH_ECN
74 tristate '"ecn" match support' 87 tristate '"ecn" match support'
75 depends on IP_NF_IPTABLES
76 depends on NETFILTER_ADVANCED 88 depends on NETFILTER_ADVANCED
77 help 89 help
78 This option adds a `ECN' match, which allows you to match against 90 This option adds a `ECN' match, which allows you to match against
@@ -80,19 +92,8 @@ config IP_NF_MATCH_ECN
80 92
81 To compile it as a module, choose M here. If unsure, say N. 93 To compile it as a module, choose M here. If unsure, say N.
82 94
83config IP_NF_MATCH_AH
84 tristate '"ah" match support'
85 depends on IP_NF_IPTABLES
86 depends on NETFILTER_ADVANCED
87 help
88 This match extension allows you to match a range of SPIs
89 inside AH header of IPSec packets.
90
91 To compile it as a module, choose M here. If unsure, say N.
92
93config IP_NF_MATCH_TTL 95config IP_NF_MATCH_TTL
94 tristate '"ttl" match support' 96 tristate '"ttl" match support'
95 depends on IP_NF_IPTABLES
96 depends on NETFILTER_ADVANCED 97 depends on NETFILTER_ADVANCED
97 help 98 help
98 This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user 99 This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user
@@ -100,21 +101,9 @@ config IP_NF_MATCH_TTL
100 101
101 To compile it as a module, choose M here. If unsure, say N. 102 To compile it as a module, choose M here. If unsure, say N.
102 103
103config IP_NF_MATCH_ADDRTYPE
104 tristate '"addrtype" address type match support'
105 depends on IP_NF_IPTABLES
106 depends on NETFILTER_ADVANCED
107 help
108 This option allows you to match what routing thinks of an address,
109 eg. UNICAST, LOCAL, BROADCAST, ...
110
111 If you want to compile it as a module, say M here and read
112 <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
113
114# `filter', generic and specific targets 104# `filter', generic and specific targets
115config IP_NF_FILTER 105config IP_NF_FILTER
116 tristate "Packet filtering" 106 tristate "Packet filtering"
117 depends on IP_NF_IPTABLES
118 default m if NETFILTER_ADVANCED=n 107 default m if NETFILTER_ADVANCED=n
119 help 108 help
120 Packet filtering defines a table `filter', which has a series of 109 Packet filtering defines a table `filter', which has a series of
@@ -136,7 +125,6 @@ config IP_NF_TARGET_REJECT
136 125
137config IP_NF_TARGET_LOG 126config IP_NF_TARGET_LOG
138 tristate "LOG target support" 127 tristate "LOG target support"
139 depends on IP_NF_IPTABLES
140 default m if NETFILTER_ADVANCED=n 128 default m if NETFILTER_ADVANCED=n
141 help 129 help
142 This option adds a `LOG' target, which allows you to create rules in 130 This option adds a `LOG' target, which allows you to create rules in
@@ -146,7 +134,6 @@ config IP_NF_TARGET_LOG
146 134
147config IP_NF_TARGET_ULOG 135config IP_NF_TARGET_ULOG
148 tristate "ULOG target support" 136 tristate "ULOG target support"
149 depends on IP_NF_IPTABLES
150 default m if NETFILTER_ADVANCED=n 137 default m if NETFILTER_ADVANCED=n
151 ---help--- 138 ---help---
152 139
@@ -167,7 +154,7 @@ config IP_NF_TARGET_ULOG
167# NAT + specific targets: nf_conntrack 154# NAT + specific targets: nf_conntrack
168config NF_NAT 155config NF_NAT
169 tristate "Full NAT" 156 tristate "Full NAT"
170 depends on IP_NF_IPTABLES && NF_CONNTRACK_IPV4 157 depends on NF_CONNTRACK_IPV4
171 default m if NETFILTER_ADVANCED=n 158 default m if NETFILTER_ADVANCED=n
172 help 159 help
173 The Full NAT option allows masquerading, port forwarding and other 160 The Full NAT option allows masquerading, port forwarding and other
@@ -194,26 +181,26 @@ config IP_NF_TARGET_MASQUERADE
194 181
195 To compile it as a module, choose M here. If unsure, say N. 182 To compile it as a module, choose M here. If unsure, say N.
196 183
197config IP_NF_TARGET_REDIRECT 184config IP_NF_TARGET_NETMAP
198 tristate "REDIRECT target support" 185 tristate "NETMAP target support"
199 depends on NF_NAT 186 depends on NF_NAT
200 depends on NETFILTER_ADVANCED 187 depends on NETFILTER_ADVANCED
201 help 188 help
202 REDIRECT is a special case of NAT: all incoming connections are 189 NETMAP is an implementation of static 1:1 NAT mapping of network
203 mapped onto the incoming interface's address, causing the packets to 190 addresses. It maps the network address part, while keeping the host
204 come to the local machine instead of passing through. This is 191 address part intact.
205 useful for transparent proxies.
206 192
207 To compile it as a module, choose M here. If unsure, say N. 193 To compile it as a module, choose M here. If unsure, say N.
208 194
209config IP_NF_TARGET_NETMAP 195config IP_NF_TARGET_REDIRECT
210 tristate "NETMAP target support" 196 tristate "REDIRECT target support"
211 depends on NF_NAT 197 depends on NF_NAT
212 depends on NETFILTER_ADVANCED 198 depends on NETFILTER_ADVANCED
213 help 199 help
214 NETMAP is an implementation of static 1:1 NAT mapping of network 200 REDIRECT is a special case of NAT: all incoming connections are
215 addresses. It maps the network address part, while keeping the host 201 mapped onto the incoming interface's address, causing the packets to
216 address part intact. 202 come to the local machine instead of passing through. This is
203 useful for transparent proxies.
217 204
218 To compile it as a module, choose M here. If unsure, say N. 205 To compile it as a module, choose M here. If unsure, say N.
219 206
@@ -262,44 +249,43 @@ config NF_NAT_PROTO_SCTP
262 249
263config NF_NAT_FTP 250config NF_NAT_FTP
264 tristate 251 tristate
265 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 252 depends on NF_CONNTRACK && NF_NAT
266 default NF_NAT && NF_CONNTRACK_FTP 253 default NF_NAT && NF_CONNTRACK_FTP
267 254
268config NF_NAT_IRC 255config NF_NAT_IRC
269 tristate 256 tristate
270 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 257 depends on NF_CONNTRACK && NF_NAT
271 default NF_NAT && NF_CONNTRACK_IRC 258 default NF_NAT && NF_CONNTRACK_IRC
272 259
273config NF_NAT_TFTP 260config NF_NAT_TFTP
274 tristate 261 tristate
275 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 262 depends on NF_CONNTRACK && NF_NAT
276 default NF_NAT && NF_CONNTRACK_TFTP 263 default NF_NAT && NF_CONNTRACK_TFTP
277 264
278config NF_NAT_AMANDA 265config NF_NAT_AMANDA
279 tristate 266 tristate
280 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 267 depends on NF_CONNTRACK && NF_NAT
281 default NF_NAT && NF_CONNTRACK_AMANDA 268 default NF_NAT && NF_CONNTRACK_AMANDA
282 269
283config NF_NAT_PPTP 270config NF_NAT_PPTP
284 tristate 271 tristate
285 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 272 depends on NF_CONNTRACK && NF_NAT
286 default NF_NAT && NF_CONNTRACK_PPTP 273 default NF_NAT && NF_CONNTRACK_PPTP
287 select NF_NAT_PROTO_GRE 274 select NF_NAT_PROTO_GRE
288 275
289config NF_NAT_H323 276config NF_NAT_H323
290 tristate 277 tristate
291 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 278 depends on NF_CONNTRACK && NF_NAT
292 default NF_NAT && NF_CONNTRACK_H323 279 default NF_NAT && NF_CONNTRACK_H323
293 280
294config NF_NAT_SIP 281config NF_NAT_SIP
295 tristate 282 tristate
296 depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT 283 depends on NF_CONNTRACK && NF_NAT
297 default NF_NAT && NF_CONNTRACK_SIP 284 default NF_NAT && NF_CONNTRACK_SIP
298 285
299# mangle + specific targets 286# mangle + specific targets
300config IP_NF_MANGLE 287config IP_NF_MANGLE
301 tristate "Packet mangling" 288 tristate "Packet mangling"
302 depends on IP_NF_IPTABLES
303 default m if NETFILTER_ADVANCED=n 289 default m if NETFILTER_ADVANCED=n
304 help 290 help
305 This option adds a `mangle' table to iptables: see the man page for 291 This option adds a `mangle' table to iptables: see the man page for
@@ -308,6 +294,19 @@ config IP_NF_MANGLE
308 294
309 To compile it as a module, choose M here. If unsure, say N. 295 To compile it as a module, choose M here. If unsure, say N.
310 296
297config IP_NF_TARGET_CLUSTERIP
298 tristate "CLUSTERIP target support (EXPERIMENTAL)"
299 depends on IP_NF_MANGLE && EXPERIMENTAL
300 depends on NF_CONNTRACK_IPV4
301 depends on NETFILTER_ADVANCED
302 select NF_CONNTRACK_MARK
303 help
304 The CLUSTERIP target allows you to build load-balancing clusters of
305 network servers without having a dedicated load-balancing
306 router/server/switch.
307
308 To compile it as a module, choose M here. If unsure, say N.
309
311config IP_NF_TARGET_ECN 310config IP_NF_TARGET_ECN
312 tristate "ECN target support" 311 tristate "ECN target support"
313 depends on IP_NF_MANGLE 312 depends on IP_NF_MANGLE
@@ -338,23 +337,9 @@ config IP_NF_TARGET_TTL
338 337
339 To compile it as a module, choose M here. If unsure, say N. 338 To compile it as a module, choose M here. If unsure, say N.
340 339
341config IP_NF_TARGET_CLUSTERIP
342 tristate "CLUSTERIP target support (EXPERIMENTAL)"
343 depends on IP_NF_MANGLE && EXPERIMENTAL
344 depends on NF_CONNTRACK_IPV4
345 depends on NETFILTER_ADVANCED
346 select NF_CONNTRACK_MARK
347 help
348 The CLUSTERIP target allows you to build load-balancing clusters of
349 network servers without having a dedicated load-balancing
350 router/server/switch.
351
352 To compile it as a module, choose M here. If unsure, say N.
353
354# raw + specific targets 340# raw + specific targets
355config IP_NF_RAW 341config IP_NF_RAW
356 tristate 'raw table support (required for NOTRACK/TRACE)' 342 tristate 'raw table support (required for NOTRACK/TRACE)'
357 depends on IP_NF_IPTABLES
358 depends on NETFILTER_ADVANCED 343 depends on NETFILTER_ADVANCED
359 help 344 help
360 This option adds a `raw' table to iptables. This table is the very 345 This option adds a `raw' table to iptables. This table is the very
@@ -367,7 +352,6 @@ config IP_NF_RAW
367# security table for MAC policy 352# security table for MAC policy
368config IP_NF_SECURITY 353config IP_NF_SECURITY
369 tristate "Security table" 354 tristate "Security table"
370 depends on IP_NF_IPTABLES
371 depends on SECURITY 355 depends on SECURITY
372 depends on NETFILTER_ADVANCED 356 depends on NETFILTER_ADVANCED
373 help 357 help
@@ -376,6 +360,8 @@ config IP_NF_SECURITY
376 360
377 If unsure, say N. 361 If unsure, say N.
378 362
363endif # IP_NF_IPTABLES
364
379# ARP tables 365# ARP tables
380config IP_NF_ARPTABLES 366config IP_NF_ARPTABLES
381 tristate "ARP tables support" 367 tristate "ARP tables support"
@@ -388,9 +374,10 @@ config IP_NF_ARPTABLES
388 374
389 To compile it as a module, choose M here. If unsure, say N. 375 To compile it as a module, choose M here. If unsure, say N.
390 376
377if IP_NF_ARPTABLES
378
391config IP_NF_ARPFILTER 379config IP_NF_ARPFILTER
392 tristate "ARP packet filtering" 380 tristate "ARP packet filtering"
393 depends on IP_NF_ARPTABLES
394 help 381 help
395 ARP packet filtering defines a table `filter', which has a series of 382 ARP packet filtering defines a table `filter', which has a series of
396 rules for simple ARP packet filtering at local input and 383 rules for simple ARP packet filtering at local input and
@@ -401,10 +388,11 @@ config IP_NF_ARPFILTER
401 388
402config IP_NF_ARP_MANGLE 389config IP_NF_ARP_MANGLE
403 tristate "ARP payload mangling" 390 tristate "ARP payload mangling"
404 depends on IP_NF_ARPTABLES
405 help 391 help
406 Allows altering the ARP packet payload: source and destination 392 Allows altering the ARP packet payload: source and destination
407 hardware and network addresses. 393 hardware and network addresses.
408 394
395endif # IP_NF_ARPTABLES
396
409endmenu 397endmenu
410 398
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 3f31291f37ce..5f9b650d90fc 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -18,6 +18,9 @@ obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
18 18
19obj-$(CONFIG_NF_NAT) += nf_nat.o 19obj-$(CONFIG_NF_NAT) += nf_nat.o
20 20
21# defrag
22obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
23
21# NAT helpers (nf_conntrack) 24# NAT helpers (nf_conntrack)
22obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o 25obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
23obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o 26obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
@@ -48,7 +51,6 @@ obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
48obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o 51obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
49obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o 52obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
50obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o 53obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
51obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
52obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o 54obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
53 55
54# targets 56# targets
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 03e83a65aec5..8d70d29f1ccf 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -200,15 +200,12 @@ static inline int arp_checkentry(const struct arpt_arp *arp)
200 return 1; 200 return 1;
201} 201}
202 202
203static unsigned int arpt_error(struct sk_buff *skb, 203static unsigned int
204 const struct net_device *in, 204arpt_error(struct sk_buff *skb, const struct xt_target_param *par)
205 const struct net_device *out,
206 unsigned int hooknum,
207 const struct xt_target *target,
208 const void *targinfo)
209{ 205{
210 if (net_ratelimit()) 206 if (net_ratelimit())
211 printk("arp_tables: error: '%s'\n", (char *)targinfo); 207 printk("arp_tables: error: '%s'\n",
208 (const char *)par->targinfo);
212 209
213 return NF_DROP; 210 return NF_DROP;
214} 211}
@@ -232,6 +229,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
232 const char *indev, *outdev; 229 const char *indev, *outdev;
233 void *table_base; 230 void *table_base;
234 const struct xt_table_info *private; 231 const struct xt_table_info *private;
232 struct xt_target_param tgpar;
235 233
236 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) 234 if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
237 return NF_DROP; 235 return NF_DROP;
@@ -245,6 +243,11 @@ unsigned int arpt_do_table(struct sk_buff *skb,
245 e = get_entry(table_base, private->hook_entry[hook]); 243 e = get_entry(table_base, private->hook_entry[hook]);
246 back = get_entry(table_base, private->underflow[hook]); 244 back = get_entry(table_base, private->underflow[hook]);
247 245
246 tgpar.in = in;
247 tgpar.out = out;
248 tgpar.hooknum = hook;
249 tgpar.family = NFPROTO_ARP;
250
248 arp = arp_hdr(skb); 251 arp = arp_hdr(skb);
249 do { 252 do {
250 if (arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { 253 if (arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
@@ -290,11 +293,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
290 /* Targets which reenter must return 293 /* Targets which reenter must return
291 * abs. verdicts 294 * abs. verdicts
292 */ 295 */
296 tgpar.target = t->u.kernel.target;
297 tgpar.targinfo = t->data;
293 verdict = t->u.kernel.target->target(skb, 298 verdict = t->u.kernel.target->target(skb,
294 in, out, 299 &tgpar);
295 hook,
296 t->u.kernel.target,
297 t->data);
298 300
299 /* Target might have changed stuff. */ 301 /* Target might have changed stuff. */
300 arp = arp_hdr(skb); 302 arp = arp_hdr(skb);
@@ -456,23 +458,24 @@ static inline int check_entry(struct arpt_entry *e, const char *name)
456 458
457static inline int check_target(struct arpt_entry *e, const char *name) 459static inline int check_target(struct arpt_entry *e, const char *name)
458{ 460{
459 struct arpt_entry_target *t; 461 struct arpt_entry_target *t = arpt_get_target(e);
460 struct xt_target *target;
461 int ret; 462 int ret;
462 463 struct xt_tgchk_param par = {
463 t = arpt_get_target(e); 464 .table = name,
464 target = t->u.kernel.target; 465 .entryinfo = e,
465 466 .target = t->u.kernel.target,
466 ret = xt_check_target(target, NF_ARP, t->u.target_size - sizeof(*t), 467 .targinfo = t->data,
467 name, e->comefrom, 0, 0); 468 .hook_mask = e->comefrom,
468 if (!ret && t->u.kernel.target->checkentry 469 .family = NFPROTO_ARP,
469 && !t->u.kernel.target->checkentry(name, e, target, t->data, 470 };
470 e->comefrom)) { 471
472 ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
473 if (ret < 0) {
471 duprintf("arp_tables: check failed for `%s'.\n", 474 duprintf("arp_tables: check failed for `%s'.\n",
472 t->u.kernel.target->name); 475 t->u.kernel.target->name);
473 ret = -EINVAL; 476 return ret;
474 } 477 }
475 return ret; 478 return 0;
476} 479}
477 480
478static inline int 481static inline int
@@ -488,7 +491,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
488 return ret; 491 return ret;
489 492
490 t = arpt_get_target(e); 493 t = arpt_get_target(e);
491 target = try_then_request_module(xt_find_target(NF_ARP, t->u.user.name, 494 target = try_then_request_module(xt_find_target(NFPROTO_ARP,
495 t->u.user.name,
492 t->u.user.revision), 496 t->u.user.revision),
493 "arpt_%s", t->u.user.name); 497 "arpt_%s", t->u.user.name);
494 if (IS_ERR(target) || !target) { 498 if (IS_ERR(target) || !target) {
@@ -554,15 +558,19 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
554 558
555static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) 559static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
556{ 560{
561 struct xt_tgdtor_param par;
557 struct arpt_entry_target *t; 562 struct arpt_entry_target *t;
558 563
559 if (i && (*i)-- == 0) 564 if (i && (*i)-- == 0)
560 return 1; 565 return 1;
561 566
562 t = arpt_get_target(e); 567 t = arpt_get_target(e);
563 if (t->u.kernel.target->destroy) 568 par.target = t->u.kernel.target;
564 t->u.kernel.target->destroy(t->u.kernel.target, t->data); 569 par.targinfo = t->data;
565 module_put(t->u.kernel.target->me); 570 par.family = NFPROTO_ARP;
571 if (par.target->destroy != NULL)
572 par.target->destroy(&par);
573 module_put(par.target->me);
566 return 0; 574 return 0;
567} 575}
568 576
@@ -788,7 +796,7 @@ static void compat_standard_from_user(void *dst, void *src)
788 int v = *(compat_int_t *)src; 796 int v = *(compat_int_t *)src;
789 797
790 if (v > 0) 798 if (v > 0)
791 v += xt_compat_calc_jump(NF_ARP, v); 799 v += xt_compat_calc_jump(NFPROTO_ARP, v);
792 memcpy(dst, &v, sizeof(v)); 800 memcpy(dst, &v, sizeof(v));
793} 801}
794 802
@@ -797,7 +805,7 @@ static int compat_standard_to_user(void __user *dst, void *src)
797 compat_int_t cv = *(int *)src; 805 compat_int_t cv = *(int *)src;
798 806
799 if (cv > 0) 807 if (cv > 0)
800 cv -= xt_compat_calc_jump(NF_ARP, cv); 808 cv -= xt_compat_calc_jump(NFPROTO_ARP, cv);
801 return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; 809 return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
802} 810}
803 811
@@ -815,7 +823,7 @@ static int compat_calc_entry(struct arpt_entry *e,
815 t = arpt_get_target(e); 823 t = arpt_get_target(e);
816 off += xt_compat_target_offset(t->u.kernel.target); 824 off += xt_compat_target_offset(t->u.kernel.target);
817 newinfo->size -= off; 825 newinfo->size -= off;
818 ret = xt_compat_add_offset(NF_ARP, entry_offset, off); 826 ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
819 if (ret) 827 if (ret)
820 return ret; 828 return ret;
821 829
@@ -866,9 +874,9 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
866 name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; 874 name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
867#ifdef CONFIG_COMPAT 875#ifdef CONFIG_COMPAT
868 if (compat) 876 if (compat)
869 xt_compat_lock(NF_ARP); 877 xt_compat_lock(NFPROTO_ARP);
870#endif 878#endif
871 t = try_then_request_module(xt_find_table_lock(net, NF_ARP, name), 879 t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
872 "arptable_%s", name); 880 "arptable_%s", name);
873 if (t && !IS_ERR(t)) { 881 if (t && !IS_ERR(t)) {
874 struct arpt_getinfo info; 882 struct arpt_getinfo info;
@@ -878,7 +886,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
878 if (compat) { 886 if (compat) {
879 struct xt_table_info tmp; 887 struct xt_table_info tmp;
880 ret = compat_table_info(private, &tmp); 888 ret = compat_table_info(private, &tmp);
881 xt_compat_flush_offsets(NF_ARP); 889 xt_compat_flush_offsets(NFPROTO_ARP);
882 private = &tmp; 890 private = &tmp;
883 } 891 }
884#endif 892#endif
@@ -901,7 +909,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
901 ret = t ? PTR_ERR(t) : -ENOENT; 909 ret = t ? PTR_ERR(t) : -ENOENT;
902#ifdef CONFIG_COMPAT 910#ifdef CONFIG_COMPAT
903 if (compat) 911 if (compat)
904 xt_compat_unlock(NF_ARP); 912 xt_compat_unlock(NFPROTO_ARP);
905#endif 913#endif
906 return ret; 914 return ret;
907} 915}
@@ -925,7 +933,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
925 return -EINVAL; 933 return -EINVAL;
926 } 934 }
927 935
928 t = xt_find_table_lock(net, NF_ARP, get.name); 936 t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
929 if (t && !IS_ERR(t)) { 937 if (t && !IS_ERR(t)) {
930 const struct xt_table_info *private = t->private; 938 const struct xt_table_info *private = t->private;
931 939
@@ -967,7 +975,7 @@ static int __do_replace(struct net *net, const char *name,
967 goto out; 975 goto out;
968 } 976 }
969 977
970 t = try_then_request_module(xt_find_table_lock(net, NF_ARP, name), 978 t = try_then_request_module(xt_find_table_lock(net, NFPROTO_ARP, name),
971 "arptable_%s", name); 979 "arptable_%s", name);
972 if (!t || IS_ERR(t)) { 980 if (!t || IS_ERR(t)) {
973 ret = t ? PTR_ERR(t) : -ENOENT; 981 ret = t ? PTR_ERR(t) : -ENOENT;
@@ -1134,7 +1142,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1134 goto free; 1142 goto free;
1135 } 1143 }
1136 1144
1137 t = xt_find_table_lock(net, NF_ARP, name); 1145 t = xt_find_table_lock(net, NFPROTO_ARP, name);
1138 if (!t || IS_ERR(t)) { 1146 if (!t || IS_ERR(t)) {
1139 ret = t ? PTR_ERR(t) : -ENOENT; 1147 ret = t ? PTR_ERR(t) : -ENOENT;
1140 goto free; 1148 goto free;
@@ -1218,7 +1226,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
1218 entry_offset = (void *)e - (void *)base; 1226 entry_offset = (void *)e - (void *)base;
1219 1227
1220 t = compat_arpt_get_target(e); 1228 t = compat_arpt_get_target(e);
1221 target = try_then_request_module(xt_find_target(NF_ARP, 1229 target = try_then_request_module(xt_find_target(NFPROTO_ARP,
1222 t->u.user.name, 1230 t->u.user.name,
1223 t->u.user.revision), 1231 t->u.user.revision),
1224 "arpt_%s", t->u.user.name); 1232 "arpt_%s", t->u.user.name);
@@ -1232,7 +1240,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
1232 1240
1233 off += xt_compat_target_offset(target); 1241 off += xt_compat_target_offset(target);
1234 *size += off; 1242 *size += off;
1235 ret = xt_compat_add_offset(NF_ARP, entry_offset, off); 1243 ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
1236 if (ret) 1244 if (ret)
1237 goto release_target; 1245 goto release_target;
1238 1246
@@ -1333,7 +1341,7 @@ static int translate_compat_table(const char *name,
1333 1341
1334 duprintf("translate_compat_table: size %u\n", info->size); 1342 duprintf("translate_compat_table: size %u\n", info->size);
1335 j = 0; 1343 j = 0;
1336 xt_compat_lock(NF_ARP); 1344 xt_compat_lock(NFPROTO_ARP);
1337 /* Walk through entries, checking offsets. */ 1345 /* Walk through entries, checking offsets. */
1338 ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, 1346 ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size,
1339 check_compat_entry_size_and_hooks, 1347 check_compat_entry_size_and_hooks,
@@ -1383,8 +1391,8 @@ static int translate_compat_table(const char *name,
1383 ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, 1391 ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size,
1384 compat_copy_entry_from_user, 1392 compat_copy_entry_from_user,
1385 &pos, &size, name, newinfo, entry1); 1393 &pos, &size, name, newinfo, entry1);
1386 xt_compat_flush_offsets(NF_ARP); 1394 xt_compat_flush_offsets(NFPROTO_ARP);
1387 xt_compat_unlock(NF_ARP); 1395 xt_compat_unlock(NFPROTO_ARP);
1388 if (ret) 1396 if (ret)
1389 goto free_newinfo; 1397 goto free_newinfo;
1390 1398
@@ -1420,8 +1428,8 @@ out:
1420 COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); 1428 COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j);
1421 return ret; 1429 return ret;
1422out_unlock: 1430out_unlock:
1423 xt_compat_flush_offsets(NF_ARP); 1431 xt_compat_flush_offsets(NFPROTO_ARP);
1424 xt_compat_unlock(NF_ARP); 1432 xt_compat_unlock(NFPROTO_ARP);
1425 goto out; 1433 goto out;
1426} 1434}
1427 1435
@@ -1607,8 +1615,8 @@ static int compat_get_entries(struct net *net,
1607 return -EINVAL; 1615 return -EINVAL;
1608 } 1616 }
1609 1617
1610 xt_compat_lock(NF_ARP); 1618 xt_compat_lock(NFPROTO_ARP);
1611 t = xt_find_table_lock(net, NF_ARP, get.name); 1619 t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
1612 if (t && !IS_ERR(t)) { 1620 if (t && !IS_ERR(t)) {
1613 const struct xt_table_info *private = t->private; 1621 const struct xt_table_info *private = t->private;
1614 struct xt_table_info info; 1622 struct xt_table_info info;
@@ -1623,13 +1631,13 @@ static int compat_get_entries(struct net *net,
1623 private->size, get.size); 1631 private->size, get.size);
1624 ret = -EAGAIN; 1632 ret = -EAGAIN;
1625 } 1633 }
1626 xt_compat_flush_offsets(NF_ARP); 1634 xt_compat_flush_offsets(NFPROTO_ARP);
1627 module_put(t->me); 1635 module_put(t->me);
1628 xt_table_unlock(t); 1636 xt_table_unlock(t);
1629 } else 1637 } else
1630 ret = t ? PTR_ERR(t) : -ENOENT; 1638 ret = t ? PTR_ERR(t) : -ENOENT;
1631 1639
1632 xt_compat_unlock(NF_ARP); 1640 xt_compat_unlock(NFPROTO_ARP);
1633 return ret; 1641 return ret;
1634} 1642}
1635 1643
@@ -1709,7 +1717,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
1709 break; 1717 break;
1710 } 1718 }
1711 1719
1712 try_then_request_module(xt_find_revision(NF_ARP, rev.name, 1720 try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name,
1713 rev.revision, 1, &ret), 1721 rev.revision, 1, &ret),
1714 "arpt_%s", rev.name); 1722 "arpt_%s", rev.name);
1715 break; 1723 break;
@@ -1787,7 +1795,7 @@ void arpt_unregister_table(struct xt_table *table)
1787static struct xt_target arpt_standard_target __read_mostly = { 1795static struct xt_target arpt_standard_target __read_mostly = {
1788 .name = ARPT_STANDARD_TARGET, 1796 .name = ARPT_STANDARD_TARGET,
1789 .targetsize = sizeof(int), 1797 .targetsize = sizeof(int),
1790 .family = NF_ARP, 1798 .family = NFPROTO_ARP,
1791#ifdef CONFIG_COMPAT 1799#ifdef CONFIG_COMPAT
1792 .compatsize = sizeof(compat_int_t), 1800 .compatsize = sizeof(compat_int_t),
1793 .compat_from_user = compat_standard_from_user, 1801 .compat_from_user = compat_standard_from_user,
@@ -1799,7 +1807,7 @@ static struct xt_target arpt_error_target __read_mostly = {
1799 .name = ARPT_ERROR_TARGET, 1807 .name = ARPT_ERROR_TARGET,
1800 .target = arpt_error, 1808 .target = arpt_error,
1801 .targetsize = ARPT_FUNCTION_MAXNAMELEN, 1809 .targetsize = ARPT_FUNCTION_MAXNAMELEN,
1802 .family = NF_ARP, 1810 .family = NFPROTO_ARP,
1803}; 1811};
1804 1812
1805static struct nf_sockopt_ops arpt_sockopts = { 1813static struct nf_sockopt_ops arpt_sockopts = {
@@ -1821,12 +1829,12 @@ static struct nf_sockopt_ops arpt_sockopts = {
1821 1829
1822static int __net_init arp_tables_net_init(struct net *net) 1830static int __net_init arp_tables_net_init(struct net *net)
1823{ 1831{
1824 return xt_proto_init(net, NF_ARP); 1832 return xt_proto_init(net, NFPROTO_ARP);
1825} 1833}
1826 1834
1827static void __net_exit arp_tables_net_exit(struct net *net) 1835static void __net_exit arp_tables_net_exit(struct net *net)
1828{ 1836{
1829 xt_proto_fini(net, NF_ARP); 1837 xt_proto_fini(net, NFPROTO_ARP);
1830} 1838}
1831 1839
1832static struct pernet_operations arp_tables_net_ops = { 1840static struct pernet_operations arp_tables_net_ops = {
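
The arp_tables changes above (and the arpt_mangle/ip_tables hunks that follow) are part of a wider xtables cleanup: the long argument lists passed to target(), checkentry() and destroy() are collapsed into small parameter structures (xt_target_param, xt_tgchk_param, xt_tgdtor_param), so adding a new piece of context means a new struct field rather than a signature change in every extension. The sketch below shows that refactoring pattern in isolation; the types and names are invented for illustration and are not the xtables API.

/*
 * Sketch of the "bundle callback arguments into a param struct" refactor
 * applied by these hunks; names below are made up, not kernel symbols.
 */
#include <stdio.h>

struct pkt {
	unsigned int len;
};

/* before: every new argument meant touching every callback signature */
typedef unsigned int (*old_target_fn)(struct pkt *p, const char *in,
				      const char *out, unsigned int hooknum,
				      const void *targinfo);

/* after: one struct, extendable without changing the function signature */
struct target_param {
	const char *in, *out;
	unsigned int hooknum;
	const void *targinfo;
};

typedef unsigned int (*new_target_fn)(struct pkt *p,
				      const struct target_param *par);

static unsigned int drop_big(struct pkt *p, const struct target_param *par)
{
	unsigned int limit = *(const unsigned int *)par->targinfo;

	printf("hook %u, %s -> %s, len %u\n",
	       par->hooknum, par->in, par->out, p->len);
	return p->len > limit ? 0 /* drop */ : 1 /* accept */;
}

int main(void)
{
	unsigned int limit = 1500;
	struct pkt p = { .len = 2000 };
	struct target_param par = {
		.in = "eth0", .out = "eth1", .hooknum = 2, .targinfo = &limit,
	};
	new_target_fn fn = drop_big;

	printf("verdict: %u\n", fn(&p, &par));
	return 0;
}
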
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index a385959d2655..b0d5b1d0a769 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -9,12 +9,9 @@ MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
9MODULE_DESCRIPTION("arptables arp payload mangle target"); 9MODULE_DESCRIPTION("arptables arp payload mangle target");
10 10
11static unsigned int 11static unsigned int
12target(struct sk_buff *skb, 12target(struct sk_buff *skb, const struct xt_target_param *par)
13 const struct net_device *in, const struct net_device *out,
14 unsigned int hooknum, const struct xt_target *target,
15 const void *targinfo)
16{ 13{
17 const struct arpt_mangle *mangle = targinfo; 14 const struct arpt_mangle *mangle = par->targinfo;
18 const struct arphdr *arp; 15 const struct arphdr *arp;
19 unsigned char *arpptr; 16 unsigned char *arpptr;
20 int pln, hln; 17 int pln, hln;
@@ -57,11 +54,9 @@ target(struct sk_buff *skb,
57 return mangle->target; 54 return mangle->target;
58} 55}
59 56
60static bool 57static bool checkentry(const struct xt_tgchk_param *par)
61checkentry(const char *tablename, const void *e, const struct xt_target *target,
62 void *targinfo, unsigned int hook_mask)
63{ 58{
64 const struct arpt_mangle *mangle = targinfo; 59 const struct arpt_mangle *mangle = par->targinfo;
65 60
66 if (mangle->flags & ~ARPT_MANGLE_MASK || 61 if (mangle->flags & ~ARPT_MANGLE_MASK ||
67 !(mangle->flags & ARPT_MANGLE_MASK)) 62 !(mangle->flags & ARPT_MANGLE_MASK))
@@ -75,7 +70,7 @@ checkentry(const char *tablename, const void *e, const struct xt_target *target,
75 70
76static struct xt_target arpt_mangle_reg __read_mostly = { 71static struct xt_target arpt_mangle_reg __read_mostly = {
77 .name = "mangle", 72 .name = "mangle",
78 .family = NF_ARP, 73 .family = NFPROTO_ARP,
79 .target = target, 74 .target = target,
80 .targetsize = sizeof(struct arpt_mangle), 75 .targetsize = sizeof(struct arpt_mangle),
81 .checkentry = checkentry, 76 .checkentry = checkentry,
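The arpt_mangle conversion above shows the recurring pattern in this patch: the loose in/out/hooknum/target/targinfo arguments of a target's target() and checkentry() callbacks are folded into const parameter blocks (struct xt_target_param and struct xt_tgchk_param). A minimal sketch of a converted target in that style, assuming the API exactly as used in the hunks above; the "example" identifiers are hypothetical and are not part of this commit:

/*
 * Sketch only: illustrates the xt_target_param/xt_tgchk_param calling
 * convention applied by the hunks above; "example" names are hypothetical.
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>

struct example_info {
	unsigned int flags;
};

static unsigned int
example_tg(struct sk_buff *skb, const struct xt_target_param *par)
{
	/* in/out devices, hook number, target and targinfo all live in *par */
	const struct example_info *info = par->targinfo;

	return info->flags ? XT_CONTINUE : NF_DROP;
}

static bool example_tg_check(const struct xt_tgchk_param *par)
{
	const struct example_info *info = par->targinfo;

	/* table name, entry and hook mask are par->table, par->entryinfo
	 * and par->hook_mask; returning false rejects the rule */
	return (info->flags & ~0x3) == 0;
}

static struct xt_target example_tg_reg __read_mostly = {
	.name       = "example",
	.family     = NFPROTO_ARP,
	.target     = example_tg,
	.checkentry = example_tg_check,
	.targetsize = sizeof(struct example_info),
	.me         = THIS_MODULE,
};

The registration struct itself is otherwise unchanged; only the family constants move from NF_ARP/AF_INET to the NFPROTO_* values, as the surrounding hunks show.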
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 082f5dd3156c..bee3d117661a 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -51,7 +51,7 @@ static struct xt_table packet_filter = {
51 .lock = __RW_LOCK_UNLOCKED(packet_filter.lock), 51 .lock = __RW_LOCK_UNLOCKED(packet_filter.lock),
52 .private = NULL, 52 .private = NULL,
53 .me = THIS_MODULE, 53 .me = THIS_MODULE,
54 .af = NF_ARP, 54 .af = NFPROTO_ARP,
55}; 55};
56 56
57/* The work comes in here from netfilter.c */ 57/* The work comes in here from netfilter.c */
@@ -89,21 +89,21 @@ static struct nf_hook_ops arpt_ops[] __read_mostly = {
89 { 89 {
90 .hook = arpt_in_hook, 90 .hook = arpt_in_hook,
91 .owner = THIS_MODULE, 91 .owner = THIS_MODULE,
92 .pf = NF_ARP, 92 .pf = NFPROTO_ARP,
93 .hooknum = NF_ARP_IN, 93 .hooknum = NF_ARP_IN,
94 .priority = NF_IP_PRI_FILTER, 94 .priority = NF_IP_PRI_FILTER,
95 }, 95 },
96 { 96 {
97 .hook = arpt_out_hook, 97 .hook = arpt_out_hook,
98 .owner = THIS_MODULE, 98 .owner = THIS_MODULE,
99 .pf = NF_ARP, 99 .pf = NFPROTO_ARP,
100 .hooknum = NF_ARP_OUT, 100 .hooknum = NF_ARP_OUT,
101 .priority = NF_IP_PRI_FILTER, 101 .priority = NF_IP_PRI_FILTER,
102 }, 102 },
103 { 103 {
104 .hook = arpt_forward_hook, 104 .hook = arpt_forward_hook,
105 .owner = THIS_MODULE, 105 .owner = THIS_MODULE,
106 .pf = NF_ARP, 106 .pf = NFPROTO_ARP,
107 .hooknum = NF_ARP_FORWARD, 107 .hooknum = NF_ARP_FORWARD,
108 .priority = NF_IP_PRI_FILTER, 108 .priority = NF_IP_PRI_FILTER,
109 }, 109 },
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 4e7c719445c2..213fb27debc1 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -171,31 +171,25 @@ ip_checkentry(const struct ipt_ip *ip)
171} 171}
172 172
173static unsigned int 173static unsigned int
174ipt_error(struct sk_buff *skb, 174ipt_error(struct sk_buff *skb, const struct xt_target_param *par)
175 const struct net_device *in,
176 const struct net_device *out,
177 unsigned int hooknum,
178 const struct xt_target *target,
179 const void *targinfo)
180{ 175{
181 if (net_ratelimit()) 176 if (net_ratelimit())
182 printk("ip_tables: error: `%s'\n", (char *)targinfo); 177 printk("ip_tables: error: `%s'\n",
178 (const char *)par->targinfo);
183 179
184 return NF_DROP; 180 return NF_DROP;
185} 181}
186 182
187/* Performance critical - called for every packet */ 183/* Performance critical - called for every packet */
188static inline bool 184static inline bool
189do_match(struct ipt_entry_match *m, 185do_match(struct ipt_entry_match *m, const struct sk_buff *skb,
190 const struct sk_buff *skb, 186 struct xt_match_param *par)
191 const struct net_device *in,
192 const struct net_device *out,
193 int offset,
194 bool *hotdrop)
195{ 187{
188 par->match = m->u.kernel.match;
189 par->matchinfo = m->data;
190
196 /* Stop iteration if it doesn't match */ 191 /* Stop iteration if it doesn't match */
197 if (!m->u.kernel.match->match(skb, in, out, m->u.kernel.match, m->data, 192 if (!m->u.kernel.match->match(skb, par))
198 offset, ip_hdrlen(skb), hotdrop))
199 return true; 193 return true;
200 else 194 else
201 return false; 195 return false;
@@ -326,7 +320,6 @@ ipt_do_table(struct sk_buff *skb,
326 struct xt_table *table) 320 struct xt_table *table)
327{ 321{
328 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); 322 static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
329 u_int16_t offset;
330 const struct iphdr *ip; 323 const struct iphdr *ip;
331 u_int16_t datalen; 324 u_int16_t datalen;
332 bool hotdrop = false; 325 bool hotdrop = false;
@@ -336,6 +329,8 @@ ipt_do_table(struct sk_buff *skb,
336 void *table_base; 329 void *table_base;
337 struct ipt_entry *e, *back; 330 struct ipt_entry *e, *back;
338 struct xt_table_info *private; 331 struct xt_table_info *private;
332 struct xt_match_param mtpar;
333 struct xt_target_param tgpar;
339 334
340 /* Initialization */ 335 /* Initialization */
341 ip = ip_hdr(skb); 336 ip = ip_hdr(skb);
@@ -348,7 +343,13 @@ ipt_do_table(struct sk_buff *skb,
348 * things we don't know, ie. tcp syn flag or ports). If the 343 * things we don't know, ie. tcp syn flag or ports). If the
349 * rule is also a fragment-specific rule, non-fragments won't 344 * rule is also a fragment-specific rule, non-fragments won't
350 * match it. */ 345 * match it. */
351 offset = ntohs(ip->frag_off) & IP_OFFSET; 346 mtpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
347 mtpar.thoff = ip_hdrlen(skb);
348 mtpar.hotdrop = &hotdrop;
349 mtpar.in = tgpar.in = in;
350 mtpar.out = tgpar.out = out;
351 mtpar.family = tgpar.family = NFPROTO_IPV4;
352 tgpar.hooknum = hook;
352 353
353 read_lock_bh(&table->lock); 354 read_lock_bh(&table->lock);
354 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 355 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
@@ -362,12 +363,11 @@ ipt_do_table(struct sk_buff *skb,
362 do { 363 do {
363 IP_NF_ASSERT(e); 364 IP_NF_ASSERT(e);
364 IP_NF_ASSERT(back); 365 IP_NF_ASSERT(back);
365 if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { 366 if (ip_packet_match(ip, indev, outdev,
367 &e->ip, mtpar.fragoff)) {
366 struct ipt_entry_target *t; 368 struct ipt_entry_target *t;
367 369
368 if (IPT_MATCH_ITERATE(e, do_match, 370 if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
369 skb, in, out,
370 offset, &hotdrop) != 0)
371 goto no_match; 371 goto no_match;
372 372
373 ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); 373 ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
@@ -413,16 +413,14 @@ ipt_do_table(struct sk_buff *skb,
413 } else { 413 } else {
414 /* Targets which reenter must return 414 /* Targets which reenter must return
415 abs. verdicts */ 415 abs. verdicts */
416 tgpar.target = t->u.kernel.target;
417 tgpar.targinfo = t->data;
416#ifdef CONFIG_NETFILTER_DEBUG 418#ifdef CONFIG_NETFILTER_DEBUG
417 ((struct ipt_entry *)table_base)->comefrom 419 ((struct ipt_entry *)table_base)->comefrom
418 = 0xeeeeeeec; 420 = 0xeeeeeeec;
419#endif 421#endif
420 verdict = t->u.kernel.target->target(skb, 422 verdict = t->u.kernel.target->target(skb,
421 in, out, 423 &tgpar);
422 hook,
423 t->u.kernel.target,
424 t->data);
425
426#ifdef CONFIG_NETFILTER_DEBUG 424#ifdef CONFIG_NETFILTER_DEBUG
427 if (((struct ipt_entry *)table_base)->comefrom 425 if (((struct ipt_entry *)table_base)->comefrom
428 != 0xeeeeeeec 426 != 0xeeeeeeec
@@ -575,12 +573,17 @@ mark_source_chains(struct xt_table_info *newinfo,
575static int 573static int
576cleanup_match(struct ipt_entry_match *m, unsigned int *i) 574cleanup_match(struct ipt_entry_match *m, unsigned int *i)
577{ 575{
576 struct xt_mtdtor_param par;
577
578 if (i && (*i)-- == 0) 578 if (i && (*i)-- == 0)
579 return 1; 579 return 1;
580 580
581 if (m->u.kernel.match->destroy) 581 par.match = m->u.kernel.match;
582 m->u.kernel.match->destroy(m->u.kernel.match, m->data); 582 par.matchinfo = m->data;
583 module_put(m->u.kernel.match->me); 583 par.family = NFPROTO_IPV4;
584 if (par.match->destroy != NULL)
585 par.match->destroy(&par);
586 module_put(par.match->me);
584 return 0; 587 return 0;
585} 588}
586 589
@@ -606,34 +609,28 @@ check_entry(struct ipt_entry *e, const char *name)
606} 609}
607 610
608static int 611static int
609check_match(struct ipt_entry_match *m, const char *name, 612check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
610 const struct ipt_ip *ip, 613 unsigned int *i)
611 unsigned int hookmask, unsigned int *i)
612{ 614{
613 struct xt_match *match; 615 const struct ipt_ip *ip = par->entryinfo;
614 int ret; 616 int ret;
615 617
616 match = m->u.kernel.match; 618 par->match = m->u.kernel.match;
617 ret = xt_check_match(match, AF_INET, m->u.match_size - sizeof(*m), 619 par->matchinfo = m->data;
618 name, hookmask, ip->proto, 620
619 ip->invflags & IPT_INV_PROTO); 621 ret = xt_check_match(par, m->u.match_size - sizeof(*m),
620 if (!ret && m->u.kernel.match->checkentry 622 ip->proto, ip->invflags & IPT_INV_PROTO);
621 && !m->u.kernel.match->checkentry(name, ip, match, m->data, 623 if (ret < 0) {
622 hookmask)) {
623 duprintf("ip_tables: check failed for `%s'.\n", 624 duprintf("ip_tables: check failed for `%s'.\n",
624 m->u.kernel.match->name); 625 par.match->name);
625 ret = -EINVAL; 626 return ret;
626 } 627 }
627 if (!ret) 628 ++*i;
628 (*i)++; 629 return 0;
629 return ret;
630} 630}
631 631
632static int 632static int
633find_check_match(struct ipt_entry_match *m, 633find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
634 const char *name,
635 const struct ipt_ip *ip,
636 unsigned int hookmask,
637 unsigned int *i) 634 unsigned int *i)
638{ 635{
639 struct xt_match *match; 636 struct xt_match *match;
@@ -648,7 +645,7 @@ find_check_match(struct ipt_entry_match *m,
648 } 645 }
649 m->u.kernel.match = match; 646 m->u.kernel.match = match;
650 647
651 ret = check_match(m, name, ip, hookmask, i); 648 ret = check_match(m, par, i);
652 if (ret) 649 if (ret)
653 goto err; 650 goto err;
654 651
@@ -660,23 +657,25 @@ err:
660 657
661static int check_target(struct ipt_entry *e, const char *name) 658static int check_target(struct ipt_entry *e, const char *name)
662{ 659{
663 struct ipt_entry_target *t; 660 struct ipt_entry_target *t = ipt_get_target(e);
664 struct xt_target *target; 661 struct xt_tgchk_param par = {
662 .table = name,
663 .entryinfo = e,
664 .target = t->u.kernel.target,
665 .targinfo = t->data,
666 .hook_mask = e->comefrom,
667 .family = NFPROTO_IPV4,
668 };
665 int ret; 669 int ret;
666 670
667 t = ipt_get_target(e); 671 ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
668 target = t->u.kernel.target; 672 e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
669 ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t), 673 if (ret < 0) {
670 name, e->comefrom, e->ip.proto,
671 e->ip.invflags & IPT_INV_PROTO);
672 if (!ret && t->u.kernel.target->checkentry
673 && !t->u.kernel.target->checkentry(name, e, target, t->data,
674 e->comefrom)) {
675 duprintf("ip_tables: check failed for `%s'.\n", 674 duprintf("ip_tables: check failed for `%s'.\n",
676 t->u.kernel.target->name); 675 t->u.kernel.target->name);
677 ret = -EINVAL; 676 return ret;
678 } 677 }
679 return ret; 678 return 0;
680} 679}
681 680
682static int 681static int
@@ -687,14 +686,18 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size,
687 struct xt_target *target; 686 struct xt_target *target;
688 int ret; 687 int ret;
689 unsigned int j; 688 unsigned int j;
689 struct xt_mtchk_param mtpar;
690 690
691 ret = check_entry(e, name); 691 ret = check_entry(e, name);
692 if (ret) 692 if (ret)
693 return ret; 693 return ret;
694 694
695 j = 0; 695 j = 0;
696 ret = IPT_MATCH_ITERATE(e, find_check_match, name, &e->ip, 696 mtpar.table = name;
697 e->comefrom, &j); 697 mtpar.entryinfo = &e->ip;
698 mtpar.hook_mask = e->comefrom;
699 mtpar.family = NFPROTO_IPV4;
700 ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j);
698 if (ret != 0) 701 if (ret != 0)
699 goto cleanup_matches; 702 goto cleanup_matches;
700 703
@@ -769,6 +772,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
769static int 772static int
770cleanup_entry(struct ipt_entry *e, unsigned int *i) 773cleanup_entry(struct ipt_entry *e, unsigned int *i)
771{ 774{
775 struct xt_tgdtor_param par;
772 struct ipt_entry_target *t; 776 struct ipt_entry_target *t;
773 777
774 if (i && (*i)-- == 0) 778 if (i && (*i)-- == 0)
@@ -777,9 +781,13 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i)
777 /* Cleanup all matches */ 781 /* Cleanup all matches */
778 IPT_MATCH_ITERATE(e, cleanup_match, NULL); 782 IPT_MATCH_ITERATE(e, cleanup_match, NULL);
779 t = ipt_get_target(e); 783 t = ipt_get_target(e);
780 if (t->u.kernel.target->destroy) 784
781 t->u.kernel.target->destroy(t->u.kernel.target, t->data); 785 par.target = t->u.kernel.target;
782 module_put(t->u.kernel.target->me); 786 par.targinfo = t->data;
787 par.family = NFPROTO_IPV4;
788 if (par.target->destroy != NULL)
789 par.target->destroy(&par);
790 module_put(par.target->me);
783 return 0; 791 return 0;
784} 792}
785 793
@@ -1648,12 +1656,16 @@ static int
1648compat_check_entry(struct ipt_entry *e, const char *name, 1656compat_check_entry(struct ipt_entry *e, const char *name,
1649 unsigned int *i) 1657 unsigned int *i)
1650{ 1658{
1659 struct xt_mtchk_param mtpar;
1651 unsigned int j; 1660 unsigned int j;
1652 int ret; 1661 int ret;
1653 1662
1654 j = 0; 1663 j = 0;
1655 ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, 1664 mtpar.table = name;
1656 e->comefrom, &j); 1665 mtpar.entryinfo = &e->ip;
1666 mtpar.hook_mask = e->comefrom;
1667 mtpar.family = NFPROTO_IPV4;
1668 ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j);
1657 if (ret) 1669 if (ret)
1658 goto cleanup_matches; 1670 goto cleanup_matches;
1659 1671
@@ -2121,30 +2133,23 @@ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
2121} 2133}
2122 2134
2123static bool 2135static bool
2124icmp_match(const struct sk_buff *skb, 2136icmp_match(const struct sk_buff *skb, const struct xt_match_param *par)
2125 const struct net_device *in,
2126 const struct net_device *out,
2127 const struct xt_match *match,
2128 const void *matchinfo,
2129 int offset,
2130 unsigned int protoff,
2131 bool *hotdrop)
2132{ 2137{
2133 const struct icmphdr *ic; 2138 const struct icmphdr *ic;
2134 struct icmphdr _icmph; 2139 struct icmphdr _icmph;
2135 const struct ipt_icmp *icmpinfo = matchinfo; 2140 const struct ipt_icmp *icmpinfo = par->matchinfo;
2136 2141
2137 /* Must not be a fragment. */ 2142 /* Must not be a fragment. */
2138 if (offset) 2143 if (par->fragoff != 0)
2139 return false; 2144 return false;
2140 2145
2141 ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph); 2146 ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
2142 if (ic == NULL) { 2147 if (ic == NULL) {
2143 /* We've been asked to examine this packet, and we 2148 /* We've been asked to examine this packet, and we
2144 * can't. Hence, no choice but to drop. 2149 * can't. Hence, no choice but to drop.
2145 */ 2150 */
2146 duprintf("Dropping evil ICMP tinygram.\n"); 2151 duprintf("Dropping evil ICMP tinygram.\n");
2147 *hotdrop = true; 2152 *par->hotdrop = true;
2148 return false; 2153 return false;
2149 } 2154 }
2150 2155
@@ -2155,15 +2160,9 @@ icmp_match(const struct sk_buff *skb,
2155 !!(icmpinfo->invflags&IPT_ICMP_INV)); 2160 !!(icmpinfo->invflags&IPT_ICMP_INV));
2156} 2161}
2157 2162
2158/* Called when user tries to insert an entry of this type. */ 2163static bool icmp_checkentry(const struct xt_mtchk_param *par)
2159static bool
2160icmp_checkentry(const char *tablename,
2161 const void *entry,
2162 const struct xt_match *match,
2163 void *matchinfo,
2164 unsigned int hook_mask)
2165{ 2164{
2166 const struct ipt_icmp *icmpinfo = matchinfo; 2165 const struct ipt_icmp *icmpinfo = par->matchinfo;
2167 2166
2168 /* Must specify no unknown invflags */ 2167 /* Must specify no unknown invflags */
2169 return !(icmpinfo->invflags & ~IPT_ICMP_INV); 2168 return !(icmpinfo->invflags & ~IPT_ICMP_INV);
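The ipt_do_table() hunk near the top of this file is where the new parameter blocks are filled: the per-packet state that used to be threaded through every call is written once into stack-allocated xt_match_param/xt_target_param structures. A condensed illustration of that flow, assuming the surrounding ip_tables.c context (ip_hdrlen(), IP_OFFSET); it is not a complete function and the field names simply mirror the hunk:

/* Condensed from the ipt_do_table() hunk above; illustration only. */
static void example_fill_params(const struct sk_buff *skb,
				const struct net_device *in,
				const struct net_device *out,
				unsigned int hook, bool *hotdrop,
				struct xt_match_param *mtpar,
				struct xt_target_param *tgpar)
{
	const struct iphdr *ip = ip_hdr(skb);

	/* fragment offset and transport-header offset, shared by matches */
	mtpar->fragoff = ntohs(ip->frag_off) & IP_OFFSET;
	mtpar->thoff   = ip_hdrlen(skb);
	mtpar->hotdrop = hotdrop;
	mtpar->in  = tgpar->in  = in;
	mtpar->out = tgpar->out = out;
	mtpar->family = tgpar->family = NFPROTO_IPV4;
	tgpar->hooknum = hook;
}

Only the per-extension fields are filled later: do_match() sets par->match and par->matchinfo before each ->match() call, the target dispatch sets tgpar.target and tgpar.targinfo just before ->target(), and cleanup_match()/cleanup_entry() build the analogous xt_mtdtor_param/xt_tgdtor_param blocks for ->destroy().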
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index fafe8ebb4c55..7ac1677419a9 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -281,11 +281,9 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
281 ***********************************************************************/ 281 ***********************************************************************/
282 282
283static unsigned int 283static unsigned int
284clusterip_tg(struct sk_buff *skb, const struct net_device *in, 284clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
285 const struct net_device *out, unsigned int hooknum,
286 const struct xt_target *target, const void *targinfo)
287{ 285{
288 const struct ipt_clusterip_tgt_info *cipinfo = targinfo; 286 const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
289 struct nf_conn *ct; 287 struct nf_conn *ct;
290 enum ip_conntrack_info ctinfo; 288 enum ip_conntrack_info ctinfo;
291 u_int32_t hash; 289 u_int32_t hash;
@@ -349,13 +347,10 @@ clusterip_tg(struct sk_buff *skb, const struct net_device *in,
349 return XT_CONTINUE; 347 return XT_CONTINUE;
350} 348}
351 349
352static bool 350static bool clusterip_tg_check(const struct xt_tgchk_param *par)
353clusterip_tg_check(const char *tablename, const void *e_void,
354 const struct xt_target *target, void *targinfo,
355 unsigned int hook_mask)
356{ 351{
357 struct ipt_clusterip_tgt_info *cipinfo = targinfo; 352 struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
358 const struct ipt_entry *e = e_void; 353 const struct ipt_entry *e = par->entryinfo;
359 354
360 struct clusterip_config *config; 355 struct clusterip_config *config;
361 356
@@ -406,9 +401,9 @@ clusterip_tg_check(const char *tablename, const void *e_void,
406 } 401 }
407 cipinfo->config = config; 402 cipinfo->config = config;
408 403
409 if (nf_ct_l3proto_try_module_get(target->family) < 0) { 404 if (nf_ct_l3proto_try_module_get(par->target->family) < 0) {
410 printk(KERN_WARNING "can't load conntrack support for " 405 printk(KERN_WARNING "can't load conntrack support for "
411 "proto=%u\n", target->family); 406 "proto=%u\n", par->target->family);
412 return false; 407 return false;
413 } 408 }
414 409
@@ -416,9 +411,9 @@ clusterip_tg_check(const char *tablename, const void *e_void,
416} 411}
417 412
418/* drop reference count of cluster config when rule is deleted */ 413/* drop reference count of cluster config when rule is deleted */
419static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo) 414static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
420{ 415{
421 const struct ipt_clusterip_tgt_info *cipinfo = targinfo; 416 const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
422 417
423 /* if no more entries are referencing the config, remove it 418 /* if no more entries are referencing the config, remove it
424 * from the list and destroy the proc entry */ 419 * from the list and destroy the proc entry */
@@ -426,7 +421,7 @@ static void clusterip_tg_destroy(const struct xt_target *target, void *targinfo)
426 421
427 clusterip_config_put(cipinfo->config); 422 clusterip_config_put(cipinfo->config);
428 423
429 nf_ct_l3proto_module_put(target->family); 424 nf_ct_l3proto_module_put(par->target->family);
430} 425}
431 426
432#ifdef CONFIG_COMPAT 427#ifdef CONFIG_COMPAT
@@ -445,7 +440,7 @@ struct compat_ipt_clusterip_tgt_info
445 440
446static struct xt_target clusterip_tg_reg __read_mostly = { 441static struct xt_target clusterip_tg_reg __read_mostly = {
447 .name = "CLUSTERIP", 442 .name = "CLUSTERIP",
448 .family = AF_INET, 443 .family = NFPROTO_IPV4,
449 .target = clusterip_tg, 444 .target = clusterip_tg,
450 .checkentry = clusterip_tg_check, 445 .checkentry = clusterip_tg_check,
451 .destroy = clusterip_tg_destroy, 446 .destroy = clusterip_tg_destroy,
@@ -546,7 +541,7 @@ arp_mangle(unsigned int hook,
546 541
547static struct nf_hook_ops cip_arp_ops __read_mostly = { 542static struct nf_hook_ops cip_arp_ops __read_mostly = {
548 .hook = arp_mangle, 543 .hook = arp_mangle,
549 .pf = NF_ARP, 544 .pf = NFPROTO_ARP,
550 .hooknum = NF_ARP_OUT, 545 .hooknum = NF_ARP_OUT,
551 .priority = -1 546 .priority = -1
552}; 547};
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index d60139c134ca..f7e2fa0974dc 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -77,11 +77,9 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
77} 77}
78 78
79static unsigned int 79static unsigned int
80ecn_tg(struct sk_buff *skb, const struct net_device *in, 80ecn_tg(struct sk_buff *skb, const struct xt_target_param *par)
81 const struct net_device *out, unsigned int hooknum,
82 const struct xt_target *target, const void *targinfo)
83{ 81{
84 const struct ipt_ECN_info *einfo = targinfo; 82 const struct ipt_ECN_info *einfo = par->targinfo;
85 83
86 if (einfo->operation & IPT_ECN_OP_SET_IP) 84 if (einfo->operation & IPT_ECN_OP_SET_IP)
87 if (!set_ect_ip(skb, einfo)) 85 if (!set_ect_ip(skb, einfo))
@@ -95,13 +93,10 @@ ecn_tg(struct sk_buff *skb, const struct net_device *in,
95 return XT_CONTINUE; 93 return XT_CONTINUE;
96} 94}
97 95
98static bool 96static bool ecn_tg_check(const struct xt_tgchk_param *par)
99ecn_tg_check(const char *tablename, const void *e_void,
100 const struct xt_target *target, void *targinfo,
101 unsigned int hook_mask)
102{ 97{
103 const struct ipt_ECN_info *einfo = targinfo; 98 const struct ipt_ECN_info *einfo = par->targinfo;
104 const struct ipt_entry *e = e_void; 99 const struct ipt_entry *e = par->entryinfo;
105 100
106 if (einfo->operation & IPT_ECN_OP_MASK) { 101 if (einfo->operation & IPT_ECN_OP_MASK) {
107 printk(KERN_WARNING "ECN: unsupported ECN operation %x\n", 102 printk(KERN_WARNING "ECN: unsupported ECN operation %x\n",
@@ -124,7 +119,7 @@ ecn_tg_check(const char *tablename, const void *e_void,
124 119
125static struct xt_target ecn_tg_reg __read_mostly = { 120static struct xt_target ecn_tg_reg __read_mostly = {
126 .name = "ECN", 121 .name = "ECN",
127 .family = AF_INET, 122 .family = NFPROTO_IPV4,
128 .target = ecn_tg, 123 .target = ecn_tg,
129 .targetsize = sizeof(struct ipt_ECN_info), 124 .targetsize = sizeof(struct ipt_ECN_info),
130 .table = "mangle", 125 .table = "mangle",
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 0af14137137b..fc6ce04a3e35 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -375,7 +375,7 @@ static struct nf_loginfo default_loginfo = {
375}; 375};
376 376
377static void 377static void
378ipt_log_packet(unsigned int pf, 378ipt_log_packet(u_int8_t pf,
379 unsigned int hooknum, 379 unsigned int hooknum,
380 const struct sk_buff *skb, 380 const struct sk_buff *skb,
381 const struct net_device *in, 381 const struct net_device *in,
@@ -426,28 +426,23 @@ ipt_log_packet(unsigned int pf,
426} 426}
427 427
428static unsigned int 428static unsigned int
429log_tg(struct sk_buff *skb, const struct net_device *in, 429log_tg(struct sk_buff *skb, const struct xt_target_param *par)
430 const struct net_device *out, unsigned int hooknum,
431 const struct xt_target *target, const void *targinfo)
432{ 430{
433 const struct ipt_log_info *loginfo = targinfo; 431 const struct ipt_log_info *loginfo = par->targinfo;
434 struct nf_loginfo li; 432 struct nf_loginfo li;
435 433
436 li.type = NF_LOG_TYPE_LOG; 434 li.type = NF_LOG_TYPE_LOG;
437 li.u.log.level = loginfo->level; 435 li.u.log.level = loginfo->level;
438 li.u.log.logflags = loginfo->logflags; 436 li.u.log.logflags = loginfo->logflags;
439 437
440 ipt_log_packet(PF_INET, hooknum, skb, in, out, &li, 438 ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, par->out, &li,
441 loginfo->prefix); 439 loginfo->prefix);
442 return XT_CONTINUE; 440 return XT_CONTINUE;
443} 441}
444 442
445static bool 443static bool log_tg_check(const struct xt_tgchk_param *par)
446log_tg_check(const char *tablename, const void *e,
447 const struct xt_target *target, void *targinfo,
448 unsigned int hook_mask)
449{ 444{
450 const struct ipt_log_info *loginfo = targinfo; 445 const struct ipt_log_info *loginfo = par->targinfo;
451 446
452 if (loginfo->level >= 8) { 447 if (loginfo->level >= 8) {
453 pr_debug("LOG: level %u >= 8\n", loginfo->level); 448 pr_debug("LOG: level %u >= 8\n", loginfo->level);
@@ -463,7 +458,7 @@ log_tg_check(const char *tablename, const void *e,
463 458
464static struct xt_target log_tg_reg __read_mostly = { 459static struct xt_target log_tg_reg __read_mostly = {
465 .name = "LOG", 460 .name = "LOG",
466 .family = AF_INET, 461 .family = NFPROTO_IPV4,
467 .target = log_tg, 462 .target = log_tg,
468 .targetsize = sizeof(struct ipt_log_info), 463 .targetsize = sizeof(struct ipt_log_info),
469 .checkentry = log_tg_check, 464 .checkentry = log_tg_check,
@@ -483,7 +478,7 @@ static int __init log_tg_init(void)
483 ret = xt_register_target(&log_tg_reg); 478 ret = xt_register_target(&log_tg_reg);
484 if (ret < 0) 479 if (ret < 0)
485 return ret; 480 return ret;
486 nf_log_register(PF_INET, &ipt_log_logger); 481 nf_log_register(NFPROTO_IPV4, &ipt_log_logger);
487 return 0; 482 return 0;
488} 483}
489 484
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 0841aefaa503..f389f60cb105 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -31,12 +31,9 @@ MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
31static DEFINE_RWLOCK(masq_lock); 31static DEFINE_RWLOCK(masq_lock);
32 32
33/* FIXME: Multiple targets. --RR */ 33/* FIXME: Multiple targets. --RR */
34static bool 34static bool masquerade_tg_check(const struct xt_tgchk_param *par)
35masquerade_tg_check(const char *tablename, const void *e,
36 const struct xt_target *target, void *targinfo,
37 unsigned int hook_mask)
38{ 35{
39 const struct nf_nat_multi_range_compat *mr = targinfo; 36 const struct nf_nat_multi_range_compat *mr = par->targinfo;
40 37
41 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { 38 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
42 pr_debug("masquerade_check: bad MAP_IPS.\n"); 39 pr_debug("masquerade_check: bad MAP_IPS.\n");
@@ -50,9 +47,7 @@ masquerade_tg_check(const char *tablename, const void *e,
50} 47}
51 48
52static unsigned int 49static unsigned int
53masquerade_tg(struct sk_buff *skb, const struct net_device *in, 50masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par)
54 const struct net_device *out, unsigned int hooknum,
55 const struct xt_target *target, const void *targinfo)
56{ 51{
57 struct nf_conn *ct; 52 struct nf_conn *ct;
58 struct nf_conn_nat *nat; 53 struct nf_conn_nat *nat;
@@ -62,7 +57,7 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in,
62 const struct rtable *rt; 57 const struct rtable *rt;
63 __be32 newsrc; 58 __be32 newsrc;
64 59
65 NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); 60 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
66 61
67 ct = nf_ct_get(skb, &ctinfo); 62 ct = nf_ct_get(skb, &ctinfo);
68 nat = nfct_nat(ct); 63 nat = nfct_nat(ct);
@@ -76,16 +71,16 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in,
76 if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) 71 if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
77 return NF_ACCEPT; 72 return NF_ACCEPT;
78 73
79 mr = targinfo; 74 mr = par->targinfo;
80 rt = skb->rtable; 75 rt = skb->rtable;
81 newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE); 76 newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
82 if (!newsrc) { 77 if (!newsrc) {
83 printk("MASQUERADE: %s ate my IP address\n", out->name); 78 printk("MASQUERADE: %s ate my IP address\n", par->out->name);
84 return NF_DROP; 79 return NF_DROP;
85 } 80 }
86 81
87 write_lock_bh(&masq_lock); 82 write_lock_bh(&masq_lock);
88 nat->masq_index = out->ifindex; 83 nat->masq_index = par->out->ifindex;
89 write_unlock_bh(&masq_lock); 84 write_unlock_bh(&masq_lock);
90 85
91 /* Transfer from original range. */ 86 /* Transfer from original range. */
@@ -119,9 +114,7 @@ static int masq_device_event(struct notifier_block *this,
119 void *ptr) 114 void *ptr)
120{ 115{
121 const struct net_device *dev = ptr; 116 const struct net_device *dev = ptr;
122 117 struct net *net = dev_net(dev);
123 if (!net_eq(dev_net(dev), &init_net))
124 return NOTIFY_DONE;
125 118
126 if (event == NETDEV_DOWN) { 119 if (event == NETDEV_DOWN) {
127 /* Device was downed. Search entire table for 120 /* Device was downed. Search entire table for
@@ -129,7 +122,8 @@ static int masq_device_event(struct notifier_block *this,
129 and forget them. */ 122 and forget them. */
130 NF_CT_ASSERT(dev->ifindex != 0); 123 NF_CT_ASSERT(dev->ifindex != 0);
131 124
132 nf_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); 125 nf_ct_iterate_cleanup(net, device_cmp,
126 (void *)(long)dev->ifindex);
133 } 127 }
134 128
135 return NOTIFY_DONE; 129 return NOTIFY_DONE;
@@ -153,7 +147,7 @@ static struct notifier_block masq_inet_notifier = {
153 147
154static struct xt_target masquerade_tg_reg __read_mostly = { 148static struct xt_target masquerade_tg_reg __read_mostly = {
155 .name = "MASQUERADE", 149 .name = "MASQUERADE",
156 .family = AF_INET, 150 .family = NFPROTO_IPV4,
157 .target = masquerade_tg, 151 .target = masquerade_tg,
158 .targetsize = sizeof(struct nf_nat_multi_range_compat), 152 .targetsize = sizeof(struct nf_nat_multi_range_compat),
159 .table = "nat", 153 .table = "nat",
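Besides the argument-block conversion, the MASQUERADE hunks above also drop the init_net-only check in the netdevice notifier: conntrack cleanup now runs in the namespace the downed device belongs to, obtained via dev_net(). A condensed sketch of the resulting notifier shape, assuming this file's device_cmp() helper; it only restates what the hunk does:

/* Illustration of the notifier after the change; not a new function. */
static int example_device_event(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	const struct net_device *dev = ptr;
	struct net *net = dev_net(dev);		/* namespace of the device */

	if (event == NETDEV_DOWN)
		/* forget any conntrack entry that was masqueraded
		 * through the interface that just went down */
		nf_ct_iterate_cleanup(net, device_cmp,
				      (void *)(long)dev->ifindex);

	return NOTIFY_DONE;
}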
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 6739abfd1521..7c29582d4ec8 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -22,12 +22,9 @@ MODULE_LICENSE("GPL");
22MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>"); 22MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
23MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets"); 23MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
24 24
25static bool 25static bool netmap_tg_check(const struct xt_tgchk_param *par)
26netmap_tg_check(const char *tablename, const void *e,
27 const struct xt_target *target, void *targinfo,
28 unsigned int hook_mask)
29{ 26{
30 const struct nf_nat_multi_range_compat *mr = targinfo; 27 const struct nf_nat_multi_range_compat *mr = par->targinfo;
31 28
32 if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { 29 if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
33 pr_debug("NETMAP:check: bad MAP_IPS.\n"); 30 pr_debug("NETMAP:check: bad MAP_IPS.\n");
@@ -41,24 +38,23 @@ netmap_tg_check(const char *tablename, const void *e,
41} 38}
42 39
43static unsigned int 40static unsigned int
44netmap_tg(struct sk_buff *skb, const struct net_device *in, 41netmap_tg(struct sk_buff *skb, const struct xt_target_param *par)
45 const struct net_device *out, unsigned int hooknum,
46 const struct xt_target *target, const void *targinfo)
47{ 42{
48 struct nf_conn *ct; 43 struct nf_conn *ct;
49 enum ip_conntrack_info ctinfo; 44 enum ip_conntrack_info ctinfo;
50 __be32 new_ip, netmask; 45 __be32 new_ip, netmask;
51 const struct nf_nat_multi_range_compat *mr = targinfo; 46 const struct nf_nat_multi_range_compat *mr = par->targinfo;
52 struct nf_nat_range newrange; 47 struct nf_nat_range newrange;
53 48
54 NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING 49 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
55 || hooknum == NF_INET_POST_ROUTING 50 par->hooknum == NF_INET_POST_ROUTING ||
56 || hooknum == NF_INET_LOCAL_OUT); 51 par->hooknum == NF_INET_LOCAL_OUT);
57 ct = nf_ct_get(skb, &ctinfo); 52 ct = nf_ct_get(skb, &ctinfo);
58 53
59 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); 54 netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
60 55
61 if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_OUT) 56 if (par->hooknum == NF_INET_PRE_ROUTING ||
57 par->hooknum == NF_INET_LOCAL_OUT)
62 new_ip = ip_hdr(skb)->daddr & ~netmask; 58 new_ip = ip_hdr(skb)->daddr & ~netmask;
63 else 59 else
64 new_ip = ip_hdr(skb)->saddr & ~netmask; 60 new_ip = ip_hdr(skb)->saddr & ~netmask;
@@ -70,12 +66,12 @@ netmap_tg(struct sk_buff *skb, const struct net_device *in,
70 mr->range[0].min, mr->range[0].max }); 66 mr->range[0].min, mr->range[0].max });
71 67
72 /* Hand modified range to generic setup. */ 68 /* Hand modified range to generic setup. */
73 return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(hooknum)); 69 return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
74} 70}
75 71
76static struct xt_target netmap_tg_reg __read_mostly = { 72static struct xt_target netmap_tg_reg __read_mostly = {
77 .name = "NETMAP", 73 .name = "NETMAP",
78 .family = AF_INET, 74 .family = NFPROTO_IPV4,
79 .target = netmap_tg, 75 .target = netmap_tg,
80 .targetsize = sizeof(struct nf_nat_multi_range_compat), 76 .targetsize = sizeof(struct nf_nat_multi_range_compat),
81 .table = "nat", 77 .table = "nat",
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 5c6292449d13..698e5e78685b 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -26,12 +26,9 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
26MODULE_DESCRIPTION("Xtables: Connection redirection to localhost"); 26MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
27 27
28/* FIXME: Take multiple ranges --RR */ 28/* FIXME: Take multiple ranges --RR */
29static bool 29static bool redirect_tg_check(const struct xt_tgchk_param *par)
30redirect_tg_check(const char *tablename, const void *e,
31 const struct xt_target *target, void *targinfo,
32 unsigned int hook_mask)
33{ 30{
34 const struct nf_nat_multi_range_compat *mr = targinfo; 31 const struct nf_nat_multi_range_compat *mr = par->targinfo;
35 32
36 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { 33 if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
37 pr_debug("redirect_check: bad MAP_IPS.\n"); 34 pr_debug("redirect_check: bad MAP_IPS.\n");
@@ -45,24 +42,22 @@ redirect_tg_check(const char *tablename, const void *e,
45} 42}
46 43
47static unsigned int 44static unsigned int
48redirect_tg(struct sk_buff *skb, const struct net_device *in, 45redirect_tg(struct sk_buff *skb, const struct xt_target_param *par)
49 const struct net_device *out, unsigned int hooknum,
50 const struct xt_target *target, const void *targinfo)
51{ 46{
52 struct nf_conn *ct; 47 struct nf_conn *ct;
53 enum ip_conntrack_info ctinfo; 48 enum ip_conntrack_info ctinfo;
54 __be32 newdst; 49 __be32 newdst;
55 const struct nf_nat_multi_range_compat *mr = targinfo; 50 const struct nf_nat_multi_range_compat *mr = par->targinfo;
56 struct nf_nat_range newrange; 51 struct nf_nat_range newrange;
57 52
58 NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING 53 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
59 || hooknum == NF_INET_LOCAL_OUT); 54 par->hooknum == NF_INET_LOCAL_OUT);
60 55
61 ct = nf_ct_get(skb, &ctinfo); 56 ct = nf_ct_get(skb, &ctinfo);
62 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); 57 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
63 58
64 /* Local packets: make them go to loopback */ 59 /* Local packets: make them go to loopback */
65 if (hooknum == NF_INET_LOCAL_OUT) 60 if (par->hooknum == NF_INET_LOCAL_OUT)
66 newdst = htonl(0x7F000001); 61 newdst = htonl(0x7F000001);
67 else { 62 else {
68 struct in_device *indev; 63 struct in_device *indev;
@@ -92,7 +87,7 @@ redirect_tg(struct sk_buff *skb, const struct net_device *in,
92 87
93static struct xt_target redirect_tg_reg __read_mostly = { 88static struct xt_target redirect_tg_reg __read_mostly = {
94 .name = "REDIRECT", 89 .name = "REDIRECT",
95 .family = AF_INET, 90 .family = NFPROTO_IPV4,
96 .target = redirect_tg, 91 .target = redirect_tg,
97 .targetsize = sizeof(struct nf_nat_multi_range_compat), 92 .targetsize = sizeof(struct nf_nat_multi_range_compat),
98 .table = "nat", 93 .table = "nat",
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 2639872849da..0b4b6e0ff2b9 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -136,11 +136,9 @@ static inline void send_unreach(struct sk_buff *skb_in, int code)
136} 136}
137 137
138static unsigned int 138static unsigned int
139reject_tg(struct sk_buff *skb, const struct net_device *in, 139reject_tg(struct sk_buff *skb, const struct xt_target_param *par)
140 const struct net_device *out, unsigned int hooknum,
141 const struct xt_target *target, const void *targinfo)
142{ 140{
143 const struct ipt_reject_info *reject = targinfo; 141 const struct ipt_reject_info *reject = par->targinfo;
144 142
145 /* WARNING: This code causes reentry within iptables. 143 /* WARNING: This code causes reentry within iptables.
146 This means that the iptables jump stack is now crap. We 144 This means that the iptables jump stack is now crap. We
@@ -168,7 +166,7 @@ reject_tg(struct sk_buff *skb, const struct net_device *in,
168 send_unreach(skb, ICMP_PKT_FILTERED); 166 send_unreach(skb, ICMP_PKT_FILTERED);
169 break; 167 break;
170 case IPT_TCP_RESET: 168 case IPT_TCP_RESET:
171 send_reset(skb, hooknum); 169 send_reset(skb, par->hooknum);
172 case IPT_ICMP_ECHOREPLY: 170 case IPT_ICMP_ECHOREPLY:
173 /* Doesn't happen. */ 171 /* Doesn't happen. */
174 break; 172 break;
@@ -177,13 +175,10 @@ reject_tg(struct sk_buff *skb, const struct net_device *in,
177 return NF_DROP; 175 return NF_DROP;
178} 176}
179 177
180static bool 178static bool reject_tg_check(const struct xt_tgchk_param *par)
181reject_tg_check(const char *tablename, const void *e_void,
182 const struct xt_target *target, void *targinfo,
183 unsigned int hook_mask)
184{ 179{
185 const struct ipt_reject_info *rejinfo = targinfo; 180 const struct ipt_reject_info *rejinfo = par->targinfo;
186 const struct ipt_entry *e = e_void; 181 const struct ipt_entry *e = par->entryinfo;
187 182
188 if (rejinfo->with == IPT_ICMP_ECHOREPLY) { 183 if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
189 printk("ipt_REJECT: ECHOREPLY no longer supported.\n"); 184 printk("ipt_REJECT: ECHOREPLY no longer supported.\n");
@@ -201,7 +196,7 @@ reject_tg_check(const char *tablename, const void *e_void,
201 196
202static struct xt_target reject_tg_reg __read_mostly = { 197static struct xt_target reject_tg_reg __read_mostly = {
203 .name = "REJECT", 198 .name = "REJECT",
204 .family = AF_INET, 199 .family = NFPROTO_IPV4,
205 .target = reject_tg, 200 .target = reject_tg,
206 .targetsize = sizeof(struct ipt_reject_info), 201 .targetsize = sizeof(struct ipt_reject_info),
207 .table = "filter", 202 .table = "filter",
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
index 30eed65e7338..6d76aae90cc0 100644
--- a/net/ipv4/netfilter/ipt_TTL.c
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -20,12 +20,10 @@ MODULE_DESCRIPTION("Xtables: IPv4 TTL field modification target");
20MODULE_LICENSE("GPL"); 20MODULE_LICENSE("GPL");
21 21
22static unsigned int 22static unsigned int
23ttl_tg(struct sk_buff *skb, const struct net_device *in, 23ttl_tg(struct sk_buff *skb, const struct xt_target_param *par)
24 const struct net_device *out, unsigned int hooknum,
25 const struct xt_target *target, const void *targinfo)
26{ 24{
27 struct iphdr *iph; 25 struct iphdr *iph;
28 const struct ipt_TTL_info *info = targinfo; 26 const struct ipt_TTL_info *info = par->targinfo;
29 int new_ttl; 27 int new_ttl;
30 28
31 if (!skb_make_writable(skb, skb->len)) 29 if (!skb_make_writable(skb, skb->len))
@@ -61,12 +59,9 @@ ttl_tg(struct sk_buff *skb, const struct net_device *in,
61 return XT_CONTINUE; 59 return XT_CONTINUE;
62} 60}
63 61
64static bool 62static bool ttl_tg_check(const struct xt_tgchk_param *par)
65ttl_tg_check(const char *tablename, const void *e,
66 const struct xt_target *target, void *targinfo,
67 unsigned int hook_mask)
68{ 63{
69 const struct ipt_TTL_info *info = targinfo; 64 const struct ipt_TTL_info *info = par->targinfo;
70 65
71 if (info->mode > IPT_TTL_MAXMODE) { 66 if (info->mode > IPT_TTL_MAXMODE) {
72 printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", 67 printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n",
@@ -80,7 +75,7 @@ ttl_tg_check(const char *tablename, const void *e,
80 75
81static struct xt_target ttl_tg_reg __read_mostly = { 76static struct xt_target ttl_tg_reg __read_mostly = {
82 .name = "TTL", 77 .name = "TTL",
83 .family = AF_INET, 78 .family = NFPROTO_IPV4,
84 .target = ttl_tg, 79 .target = ttl_tg,
85 .targetsize = sizeof(struct ipt_TTL_info), 80 .targetsize = sizeof(struct ipt_TTL_info),
86 .table = "mangle", 81 .table = "mangle",
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index b192756c6d0d..18a2826b57c6 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -281,18 +281,14 @@ alloc_failure:
281} 281}
282 282
283static unsigned int 283static unsigned int
284ulog_tg(struct sk_buff *skb, const struct net_device *in, 284ulog_tg(struct sk_buff *skb, const struct xt_target_param *par)
285 const struct net_device *out, unsigned int hooknum,
286 const struct xt_target *target, const void *targinfo)
287{ 285{
288 struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo; 286 ipt_ulog_packet(par->hooknum, skb, par->in, par->out,
289 287 par->targinfo, NULL);
290 ipt_ulog_packet(hooknum, skb, in, out, loginfo, NULL);
291
292 return XT_CONTINUE; 288 return XT_CONTINUE;
293} 289}
294 290
295static void ipt_logfn(unsigned int pf, 291static void ipt_logfn(u_int8_t pf,
296 unsigned int hooknum, 292 unsigned int hooknum,
297 const struct sk_buff *skb, 293 const struct sk_buff *skb,
298 const struct net_device *in, 294 const struct net_device *in,
@@ -317,12 +313,9 @@ static void ipt_logfn(unsigned int pf,
317 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); 313 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
318} 314}
319 315
320static bool 316static bool ulog_tg_check(const struct xt_tgchk_param *par)
321ulog_tg_check(const char *tablename, const void *e,
322 const struct xt_target *target, void *targinfo,
323 unsigned int hookmask)
324{ 317{
325 const struct ipt_ulog_info *loginfo = targinfo; 318 const struct ipt_ulog_info *loginfo = par->targinfo;
326 319
327 if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { 320 if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
328 pr_debug("ipt_ULOG: prefix term %i\n", 321 pr_debug("ipt_ULOG: prefix term %i\n",
@@ -374,7 +367,7 @@ static int ulog_tg_compat_to_user(void __user *dst, void *src)
374 367
375static struct xt_target ulog_tg_reg __read_mostly = { 368static struct xt_target ulog_tg_reg __read_mostly = {
376 .name = "ULOG", 369 .name = "ULOG",
377 .family = AF_INET, 370 .family = NFPROTO_IPV4,
378 .target = ulog_tg, 371 .target = ulog_tg,
379 .targetsize = sizeof(struct ipt_ulog_info), 372 .targetsize = sizeof(struct ipt_ulog_info),
380 .checkentry = ulog_tg_check, 373 .checkentry = ulog_tg_check,
@@ -419,7 +412,7 @@ static int __init ulog_tg_init(void)
419 return ret; 412 return ret;
420 } 413 }
421 if (nflog) 414 if (nflog)
422 nf_log_register(PF_INET, &ipt_ulog_logger); 415 nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
423 416
424 return 0; 417 return 0;
425} 418}
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index 462a22c97877..88762f02779d 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -30,12 +30,9 @@ static inline bool match_type(const struct net_device *dev, __be32 addr,
30} 30}
31 31
32static bool 32static bool
33addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in, 33addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
34 const struct net_device *out, const struct xt_match *match,
35 const void *matchinfo, int offset, unsigned int protoff,
36 bool *hotdrop)
37{ 34{
38 const struct ipt_addrtype_info *info = matchinfo; 35 const struct ipt_addrtype_info *info = par->matchinfo;
39 const struct iphdr *iph = ip_hdr(skb); 36 const struct iphdr *iph = ip_hdr(skb);
40 bool ret = true; 37 bool ret = true;
41 38
@@ -50,20 +47,17 @@ addrtype_mt_v0(const struct sk_buff *skb, const struct net_device *in,
50} 47}
51 48
52static bool 49static bool
53addrtype_mt_v1(const struct sk_buff *skb, const struct net_device *in, 50addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par)
54 const struct net_device *out, const struct xt_match *match,
55 const void *matchinfo, int offset, unsigned int protoff,
56 bool *hotdrop)
57{ 51{
58 const struct ipt_addrtype_info_v1 *info = matchinfo; 52 const struct ipt_addrtype_info_v1 *info = par->matchinfo;
59 const struct iphdr *iph = ip_hdr(skb); 53 const struct iphdr *iph = ip_hdr(skb);
60 const struct net_device *dev = NULL; 54 const struct net_device *dev = NULL;
61 bool ret = true; 55 bool ret = true;
62 56
63 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) 57 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN)
64 dev = in; 58 dev = par->in;
65 else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) 59 else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT)
66 dev = out; 60 dev = par->out;
67 61
68 if (info->source) 62 if (info->source)
69 ret &= match_type(dev, iph->saddr, info->source) ^ 63 ret &= match_type(dev, iph->saddr, info->source) ^
@@ -74,12 +68,9 @@ addrtype_mt_v1(const struct sk_buff *skb, const struct net_device *in,
74 return ret; 68 return ret;
75} 69}
76 70
77static bool 71static bool addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
78addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void,
79 const struct xt_match *match, void *matchinfo,
80 unsigned int hook_mask)
81{ 72{
82 struct ipt_addrtype_info_v1 *info = matchinfo; 73 struct ipt_addrtype_info_v1 *info = par->matchinfo;
83 74
84 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && 75 if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
85 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { 76 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
@@ -88,14 +79,16 @@ addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void,
88 return false; 79 return false;
89 } 80 }
90 81
91 if (hook_mask & (1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN) && 82 if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
83 (1 << NF_INET_LOCAL_IN)) &&
92 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { 84 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
93 printk(KERN_ERR "ipt_addrtype: output interface limitation " 85 printk(KERN_ERR "ipt_addrtype: output interface limitation "
94 "not valid in PRE_ROUTING and INPUT\n"); 86 "not valid in PRE_ROUTING and INPUT\n");
95 return false; 87 return false;
96 } 88 }
97 89
98 if (hook_mask & (1 << NF_INET_POST_ROUTING | 1 << NF_INET_LOCAL_OUT) && 90 if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
91 (1 << NF_INET_LOCAL_OUT)) &&
99 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { 92 info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) {
100 printk(KERN_ERR "ipt_addrtype: input interface limitation " 93 printk(KERN_ERR "ipt_addrtype: input interface limitation "
101 "not valid in POST_ROUTING and OUTPUT\n"); 94 "not valid in POST_ROUTING and OUTPUT\n");
@@ -108,14 +101,14 @@ addrtype_mt_checkentry_v1(const char *tablename, const void *ip_void,
108static struct xt_match addrtype_mt_reg[] __read_mostly = { 101static struct xt_match addrtype_mt_reg[] __read_mostly = {
109 { 102 {
110 .name = "addrtype", 103 .name = "addrtype",
111 .family = AF_INET, 104 .family = NFPROTO_IPV4,
112 .match = addrtype_mt_v0, 105 .match = addrtype_mt_v0,
113 .matchsize = sizeof(struct ipt_addrtype_info), 106 .matchsize = sizeof(struct ipt_addrtype_info),
114 .me = THIS_MODULE 107 .me = THIS_MODULE
115 }, 108 },
116 { 109 {
117 .name = "addrtype", 110 .name = "addrtype",
118 .family = AF_INET, 111 .family = NFPROTO_IPV4,
119 .revision = 1, 112 .revision = 1,
120 .match = addrtype_mt_v1, 113 .match = addrtype_mt_v1,
121 .checkentry = addrtype_mt_checkentry_v1, 114 .checkentry = addrtype_mt_checkentry_v1,
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index e977989629c7..0104c0b399de 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -36,27 +36,23 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
36 return r; 36 return r;
37} 37}
38 38
39static bool 39static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par)
40ah_mt(const struct sk_buff *skb, const struct net_device *in,
41 const struct net_device *out, const struct xt_match *match,
42 const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
43{ 40{
44 struct ip_auth_hdr _ahdr; 41 struct ip_auth_hdr _ahdr;
45 const struct ip_auth_hdr *ah; 42 const struct ip_auth_hdr *ah;
46 const struct ipt_ah *ahinfo = matchinfo; 43 const struct ipt_ah *ahinfo = par->matchinfo;
47 44
48 /* Must not be a fragment. */ 45 /* Must not be a fragment. */
49 if (offset) 46 if (par->fragoff != 0)
50 return false; 47 return false;
51 48
52 ah = skb_header_pointer(skb, protoff, 49 ah = skb_header_pointer(skb, par->thoff, sizeof(_ahdr), &_ahdr);
53 sizeof(_ahdr), &_ahdr);
54 if (ah == NULL) { 50 if (ah == NULL) {
55 /* We've been asked to examine this packet, and we 51 /* We've been asked to examine this packet, and we
56 * can't. Hence, no choice but to drop. 52 * can't. Hence, no choice but to drop.
57 */ 53 */
58 duprintf("Dropping evil AH tinygram.\n"); 54 duprintf("Dropping evil AH tinygram.\n");
59 *hotdrop = true; 55 *par->hotdrop = true;
60 return 0; 56 return 0;
61 } 57 }
62 58
@@ -65,13 +61,9 @@ ah_mt(const struct sk_buff *skb, const struct net_device *in,
65 !!(ahinfo->invflags & IPT_AH_INV_SPI)); 61 !!(ahinfo->invflags & IPT_AH_INV_SPI));
66} 62}
67 63
68/* Called when user tries to insert an entry of this type. */ 64static bool ah_mt_check(const struct xt_mtchk_param *par)
69static bool
70ah_mt_check(const char *tablename, const void *ip_void,
71 const struct xt_match *match, void *matchinfo,
72 unsigned int hook_mask)
73{ 65{
74 const struct ipt_ah *ahinfo = matchinfo; 66 const struct ipt_ah *ahinfo = par->matchinfo;
75 67
76 /* Must specify no unknown invflags */ 68 /* Must specify no unknown invflags */
77 if (ahinfo->invflags & ~IPT_AH_INV_MASK) { 69 if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
@@ -83,7 +75,7 @@ ah_mt_check(const char *tablename, const void *ip_void,
83 75
84static struct xt_match ah_mt_reg __read_mostly = { 76static struct xt_match ah_mt_reg __read_mostly = {
85 .name = "ah", 77 .name = "ah",
86 .family = AF_INET, 78 .family = NFPROTO_IPV4,
87 .match = ah_mt, 79 .match = ah_mt,
88 .matchsize = sizeof(struct ipt_ah), 80 .matchsize = sizeof(struct ipt_ah),
89 .proto = IPPROTO_AH, 81 .proto = IPPROTO_AH,
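The ah conversion above is representative of the match side of the series: offset, protoff and the hotdrop pointer move into the xt_match_param block as par->fragoff, par->thoff and par->hotdrop. A minimal sketch of a match written against that interface; the "example" identifiers are hypothetical and only the parameter usage mirrors the hunks:

/* Sketch only: hypothetical match showing xt_match_param usage. */
#include <linux/ip.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>

struct example_mtinfo {
	__be32 spi;
};

static bool
example_mt(const struct sk_buff *skb, const struct xt_match_param *par)
{
	const struct example_mtinfo *info = par->matchinfo;
	struct ip_auth_hdr _hdr;
	const struct ip_auth_hdr *hdr;

	if (par->fragoff != 0)		/* was: if (offset) */
		return false;

	hdr = skb_header_pointer(skb, par->thoff, sizeof(_hdr), &_hdr);
	if (hdr == NULL) {
		*par->hotdrop = true;	/* was: *hotdrop = true */
		return false;
	}
	return hdr->spi == info->spi;
}

static struct xt_match example_mt_reg __read_mostly = {
	.name      = "example",
	.family    = NFPROTO_IPV4,
	.match     = example_mt,
	.matchsize = sizeof(struct example_mtinfo),
	.me        = THIS_MODULE,
};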
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index 749de8284ce5..6289b64144c6 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -67,12 +67,9 @@ static inline bool match_tcp(const struct sk_buff *skb,
67 return true; 67 return true;
68} 68}
69 69
70static bool 70static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par)
71ecn_mt(const struct sk_buff *skb, const struct net_device *in,
72 const struct net_device *out, const struct xt_match *match,
73 const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
74{ 71{
75 const struct ipt_ecn_info *info = matchinfo; 72 const struct ipt_ecn_info *info = par->matchinfo;
76 73
77 if (info->operation & IPT_ECN_OP_MATCH_IP) 74 if (info->operation & IPT_ECN_OP_MATCH_IP)
78 if (!match_ip(skb, info)) 75 if (!match_ip(skb, info))
@@ -81,20 +78,17 @@ ecn_mt(const struct sk_buff *skb, const struct net_device *in,
81 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { 78 if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
82 if (ip_hdr(skb)->protocol != IPPROTO_TCP) 79 if (ip_hdr(skb)->protocol != IPPROTO_TCP)
83 return false; 80 return false;
84 if (!match_tcp(skb, info, hotdrop)) 81 if (!match_tcp(skb, info, par->hotdrop))
85 return false; 82 return false;
86 } 83 }
87 84
88 return true; 85 return true;
89} 86}
90 87
91static bool 88static bool ecn_mt_check(const struct xt_mtchk_param *par)
92ecn_mt_check(const char *tablename, const void *ip_void,
93 const struct xt_match *match, void *matchinfo,
94 unsigned int hook_mask)
95{ 89{
96 const struct ipt_ecn_info *info = matchinfo; 90 const struct ipt_ecn_info *info = par->matchinfo;
97 const struct ipt_ip *ip = ip_void; 91 const struct ipt_ip *ip = par->entryinfo;
98 92
99 if (info->operation & IPT_ECN_OP_MATCH_MASK) 93 if (info->operation & IPT_ECN_OP_MATCH_MASK)
100 return false; 94 return false;
@@ -114,7 +108,7 @@ ecn_mt_check(const char *tablename, const void *ip_void,
114 108
115static struct xt_match ecn_mt_reg __read_mostly = { 109static struct xt_match ecn_mt_reg __read_mostly = {
116 .name = "ecn", 110 .name = "ecn",
117 .family = AF_INET, 111 .family = NFPROTO_IPV4,
118 .match = ecn_mt, 112 .match = ecn_mt,
119 .matchsize = sizeof(struct ipt_ecn_info), 113 .matchsize = sizeof(struct ipt_ecn_info),
120 .checkentry = ecn_mt_check, 114 .checkentry = ecn_mt_check,
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
deleted file mode 100644
index 3974d7cae5c0..000000000000
--- a/net/ipv4/netfilter/ipt_recent.c
+++ /dev/null
@@ -1,501 +0,0 @@
1/*
2 * Copyright (c) 2006 Patrick McHardy <kaber@trash.net>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This is a replacement of the old ipt_recent module, which carried the
9 * following copyright notice:
10 *
11 * Author: Stephen Frost <sfrost@snowman.net>
12 * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org
13 */
14#include <linux/init.h>
15#include <linux/ip.h>
16#include <linux/moduleparam.h>
17#include <linux/proc_fs.h>
18#include <linux/seq_file.h>
19#include <linux/string.h>
20#include <linux/ctype.h>
21#include <linux/list.h>
22#include <linux/random.h>
23#include <linux/jhash.h>
24#include <linux/bitops.h>
25#include <linux/skbuff.h>
26#include <linux/inet.h>
27#include <net/net_namespace.h>
28
29#include <linux/netfilter/x_tables.h>
30#include <linux/netfilter_ipv4/ipt_recent.h>
31
32MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
33MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4");
34MODULE_LICENSE("GPL");
35
36static unsigned int ip_list_tot = 100;
37static unsigned int ip_pkt_list_tot = 20;
38static unsigned int ip_list_hash_size = 0;
39static unsigned int ip_list_perms = 0644;
40static unsigned int ip_list_uid = 0;
41static unsigned int ip_list_gid = 0;
42module_param(ip_list_tot, uint, 0400);
43module_param(ip_pkt_list_tot, uint, 0400);
44module_param(ip_list_hash_size, uint, 0400);
45module_param(ip_list_perms, uint, 0400);
46module_param(ip_list_uid, uint, 0400);
47module_param(ip_list_gid, uint, 0400);
48MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
49MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)");
50MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
51MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files");
52MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files");
53MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files");
54
55struct recent_entry {
56 struct list_head list;
57 struct list_head lru_list;
58 __be32 addr;
59 u_int8_t ttl;
60 u_int8_t index;
61 u_int16_t nstamps;
62 unsigned long stamps[0];
63};
64
65struct recent_table {
66 struct list_head list;
67 char name[IPT_RECENT_NAME_LEN];
68#ifdef CONFIG_PROC_FS
69 struct proc_dir_entry *proc;
70#endif
71 unsigned int refcnt;
72 unsigned int entries;
73 struct list_head lru_list;
74 struct list_head iphash[0];
75};
76
77static LIST_HEAD(tables);
78static DEFINE_SPINLOCK(recent_lock);
79static DEFINE_MUTEX(recent_mutex);
80
81#ifdef CONFIG_PROC_FS
82static struct proc_dir_entry *proc_dir;
83static const struct file_operations recent_fops;
84#endif
85
86static u_int32_t hash_rnd;
87static int hash_rnd_initted;
88
89static unsigned int recent_entry_hash(__be32 addr)
90{
91 if (!hash_rnd_initted) {
92 get_random_bytes(&hash_rnd, 4);
93 hash_rnd_initted = 1;
94 }
95 return jhash_1word((__force u32)addr, hash_rnd) & (ip_list_hash_size - 1);
96}
97
98static struct recent_entry *
99recent_entry_lookup(const struct recent_table *table, __be32 addr, u_int8_t ttl)
100{
101 struct recent_entry *e;
102 unsigned int h;
103
104 h = recent_entry_hash(addr);
105 list_for_each_entry(e, &table->iphash[h], list)
106 if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl))
107 return e;
108 return NULL;
109}
110
111static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
112{
113 list_del(&e->list);
114 list_del(&e->lru_list);
115 kfree(e);
116 t->entries--;
117}
118
119static struct recent_entry *
120recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl)
121{
122 struct recent_entry *e;
123
124 if (t->entries >= ip_list_tot) {
125 e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
126 recent_entry_remove(t, e);
127 }
128 e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot,
129 GFP_ATOMIC);
130 if (e == NULL)
131 return NULL;
132 e->addr = addr;
133 e->ttl = ttl;
134 e->stamps[0] = jiffies;
135 e->nstamps = 1;
136 e->index = 1;
137 list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]);
138 list_add_tail(&e->lru_list, &t->lru_list);
139 t->entries++;
140 return e;
141}
142
143static void recent_entry_update(struct recent_table *t, struct recent_entry *e)
144{
145 e->stamps[e->index++] = jiffies;
146 if (e->index > e->nstamps)
147 e->nstamps = e->index;
148 e->index %= ip_pkt_list_tot;
149 list_move_tail(&e->lru_list, &t->lru_list);
150}
151
152static struct recent_table *recent_table_lookup(const char *name)
153{
154 struct recent_table *t;
155
156 list_for_each_entry(t, &tables, list)
157 if (!strcmp(t->name, name))
158 return t;
159 return NULL;
160}
161
162static void recent_table_flush(struct recent_table *t)
163{
164 struct recent_entry *e, *next;
165 unsigned int i;
166
167 for (i = 0; i < ip_list_hash_size; i++)
168 list_for_each_entry_safe(e, next, &t->iphash[i], list)
169 recent_entry_remove(t, e);
170}
171
172static bool
173recent_mt(const struct sk_buff *skb, const struct net_device *in,
174 const struct net_device *out, const struct xt_match *match,
175 const void *matchinfo, int offset, unsigned int protoff,
176 bool *hotdrop)
177{
178 const struct ipt_recent_info *info = matchinfo;
179 struct recent_table *t;
180 struct recent_entry *e;
181 __be32 addr;
182 u_int8_t ttl;
183 bool ret = info->invert;
184
185 if (info->side == IPT_RECENT_DEST)
186 addr = ip_hdr(skb)->daddr;
187 else
188 addr = ip_hdr(skb)->saddr;
189
190 ttl = ip_hdr(skb)->ttl;
191 /* use TTL as seen before forwarding */
192 if (out && !skb->sk)
193 ttl++;
194
195 spin_lock_bh(&recent_lock);
196 t = recent_table_lookup(info->name);
197 e = recent_entry_lookup(t, addr,
198 info->check_set & IPT_RECENT_TTL ? ttl : 0);
199 if (e == NULL) {
200 if (!(info->check_set & IPT_RECENT_SET))
201 goto out;
202 e = recent_entry_init(t, addr, ttl);
203 if (e == NULL)
204 *hotdrop = true;
205 ret = !ret;
206 goto out;
207 }
208
209 if (info->check_set & IPT_RECENT_SET)
210 ret = !ret;
211 else if (info->check_set & IPT_RECENT_REMOVE) {
212 recent_entry_remove(t, e);
213 ret = !ret;
214 } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) {
215 unsigned long time = jiffies - info->seconds * HZ;
216 unsigned int i, hits = 0;
217
218 for (i = 0; i < e->nstamps; i++) {
219 if (info->seconds && time_after(time, e->stamps[i]))
220 continue;
221 if (++hits >= info->hit_count) {
222 ret = !ret;
223 break;
224 }
225 }
226 }
227
228 if (info->check_set & IPT_RECENT_SET ||
229 (info->check_set & IPT_RECENT_UPDATE && ret)) {
230 recent_entry_update(t, e);
231 e->ttl = ttl;
232 }
233out:
234 spin_unlock_bh(&recent_lock);
235 return ret;
236}
237
238static bool
239recent_mt_check(const char *tablename, const void *ip,
240 const struct xt_match *match, void *matchinfo,
241 unsigned int hook_mask)
242{
243 const struct ipt_recent_info *info = matchinfo;
244 struct recent_table *t;
245 unsigned i;
246 bool ret = false;
247
248 if (hweight8(info->check_set &
249 (IPT_RECENT_SET | IPT_RECENT_REMOVE |
250 IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1)
251 return false;
252 if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) &&
253 (info->seconds || info->hit_count))
254 return false;
255 if (info->hit_count > ip_pkt_list_tot)
256 return false;
257 if (info->name[0] == '\0' ||
258 strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN)
259 return false;
260
261 mutex_lock(&recent_mutex);
262 t = recent_table_lookup(info->name);
263 if (t != NULL) {
264 t->refcnt++;
265 ret = true;
266 goto out;
267 }
268
269 t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size,
270 GFP_KERNEL);
271 if (t == NULL)
272 goto out;
273 t->refcnt = 1;
274 strcpy(t->name, info->name);
275 INIT_LIST_HEAD(&t->lru_list);
276 for (i = 0; i < ip_list_hash_size; i++)
277 INIT_LIST_HEAD(&t->iphash[i]);
278#ifdef CONFIG_PROC_FS
279 t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops);
280 if (t->proc == NULL) {
281 kfree(t);
282 goto out;
283 }
284 t->proc->uid = ip_list_uid;
285 t->proc->gid = ip_list_gid;
286 t->proc->data = t;
287#endif
288 spin_lock_bh(&recent_lock);
289 list_add_tail(&t->list, &tables);
290 spin_unlock_bh(&recent_lock);
291 ret = true;
292out:
293 mutex_unlock(&recent_mutex);
294 return ret;
295}
296
297static void recent_mt_destroy(const struct xt_match *match, void *matchinfo)
298{
299 const struct ipt_recent_info *info = matchinfo;
300 struct recent_table *t;
301
302 mutex_lock(&recent_mutex);
303 t = recent_table_lookup(info->name);
304 if (--t->refcnt == 0) {
305 spin_lock_bh(&recent_lock);
306 list_del(&t->list);
307 spin_unlock_bh(&recent_lock);
308#ifdef CONFIG_PROC_FS
309 remove_proc_entry(t->name, proc_dir);
310#endif
311 recent_table_flush(t);
312 kfree(t);
313 }
314 mutex_unlock(&recent_mutex);
315}
316
317#ifdef CONFIG_PROC_FS
318struct recent_iter_state {
319 struct recent_table *table;
320 unsigned int bucket;
321};
322
323static void *recent_seq_start(struct seq_file *seq, loff_t *pos)
324 __acquires(recent_lock)
325{
326 struct recent_iter_state *st = seq->private;
327 const struct recent_table *t = st->table;
328 struct recent_entry *e;
329 loff_t p = *pos;
330
331 spin_lock_bh(&recent_lock);
332
333 for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++)
334 list_for_each_entry(e, &t->iphash[st->bucket], list)
335 if (p-- == 0)
336 return e;
337 return NULL;
338}
339
340static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
341{
342 struct recent_iter_state *st = seq->private;
343 const struct recent_table *t = st->table;
344 struct recent_entry *e = v;
345 struct list_head *head = e->list.next;
346
347 while (head == &t->iphash[st->bucket]) {
348 if (++st->bucket >= ip_list_hash_size)
349 return NULL;
350 head = t->iphash[st->bucket].next;
351 }
352 (*pos)++;
353 return list_entry(head, struct recent_entry, list);
354}
355
356static void recent_seq_stop(struct seq_file *s, void *v)
357 __releases(recent_lock)
358{
359 spin_unlock_bh(&recent_lock);
360}
361
362static int recent_seq_show(struct seq_file *seq, void *v)
363{
364 const struct recent_entry *e = v;
365 unsigned int i;
366
367 i = (e->index - 1) % ip_pkt_list_tot;
368 seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u",
369 NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index);
370 for (i = 0; i < e->nstamps; i++)
371 seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]);
372 seq_printf(seq, "\n");
373 return 0;
374}
375
376static const struct seq_operations recent_seq_ops = {
377 .start = recent_seq_start,
378 .next = recent_seq_next,
379 .stop = recent_seq_stop,
380 .show = recent_seq_show,
381};
382
383static int recent_seq_open(struct inode *inode, struct file *file)
384{
385 struct proc_dir_entry *pde = PDE(inode);
386 struct recent_iter_state *st;
387
388 st = __seq_open_private(file, &recent_seq_ops, sizeof(*st));
389 if (st == NULL)
390 return -ENOMEM;
391
392 st->table = pde->data;
393 return 0;
394}
395
396static ssize_t recent_proc_write(struct file *file, const char __user *input,
397 size_t size, loff_t *loff)
398{
399 const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
400 struct recent_table *t = pde->data;
401 struct recent_entry *e;
402 char buf[sizeof("+255.255.255.255")], *c = buf;
403 __be32 addr;
404 int add;
405
406 if (size > sizeof(buf))
407 size = sizeof(buf);
408 if (copy_from_user(buf, input, size))
409 return -EFAULT;
410 while (isspace(*c))
411 c++;
412
413 if (size - (c - buf) < 5)
414 return c - buf;
415 if (!strncmp(c, "clear", 5)) {
416 c += 5;
417 spin_lock_bh(&recent_lock);
418 recent_table_flush(t);
419 spin_unlock_bh(&recent_lock);
420 return c - buf;
421 }
422
423 switch (*c) {
424 case '-':
425 add = 0;
426 c++;
427 break;
428 case '+':
429 c++;
430 default:
431 add = 1;
432 break;
433 }
434 addr = in_aton(c);
435
436 spin_lock_bh(&recent_lock);
437 e = recent_entry_lookup(t, addr, 0);
438 if (e == NULL) {
439 if (add)
440 recent_entry_init(t, addr, 0);
441 } else {
442 if (add)
443 recent_entry_update(t, e);
444 else
445 recent_entry_remove(t, e);
446 }
447 spin_unlock_bh(&recent_lock);
448 return size;
449}
450
451static const struct file_operations recent_fops = {
452 .open = recent_seq_open,
453 .read = seq_read,
454 .write = recent_proc_write,
455 .release = seq_release_private,
456 .owner = THIS_MODULE,
457};
458#endif /* CONFIG_PROC_FS */
459
460static struct xt_match recent_mt_reg __read_mostly = {
461 .name = "recent",
462 .family = AF_INET,
463 .match = recent_mt,
464 .matchsize = sizeof(struct ipt_recent_info),
465 .checkentry = recent_mt_check,
466 .destroy = recent_mt_destroy,
467 .me = THIS_MODULE,
468};
469
470static int __init recent_mt_init(void)
471{
472 int err;
473
474 if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
475 return -EINVAL;
476 ip_list_hash_size = 1 << fls(ip_list_tot);
477
478 err = xt_register_match(&recent_mt_reg);
479#ifdef CONFIG_PROC_FS
480 if (err)
481 return err;
482 proc_dir = proc_mkdir("ipt_recent", init_net.proc_net);
483 if (proc_dir == NULL) {
484 xt_unregister_match(&recent_mt_reg);
485 err = -ENOMEM;
486 }
487#endif
488 return err;
489}
490
491static void __exit recent_mt_exit(void)
492{
493 BUG_ON(!list_empty(&tables));
494 xt_unregister_match(&recent_mt_reg);
495#ifdef CONFIG_PROC_FS
496 remove_proc_entry("ipt_recent", init_net.proc_net);
497#endif
498}
499
500module_init(recent_mt_init);
501module_exit(recent_mt_exit);
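
The IPv4-only recent match deleted above is not lost: its functionality moves to the address-family-independent xt_recent module under net/netfilter, so existing "-m recent" rules keep working. The registration sketch below is illustrative only and is not copied from that file; the struct name xt_recent_mtinfo and the dual-family array are assumptions based on the generic xtables API of this kernel generation.

	/* Hypothetical sketch: one xt_match entry per address family instead
	 * of a single AF_INET registration (names are assumptions). */
	static struct xt_match recent_mt_reg[] __read_mostly = {
		{
			.name       = "recent",
			.revision   = 0,
			.family     = NFPROTO_IPV4,
			.match      = recent_mt,
			.matchsize  = sizeof(struct xt_recent_mtinfo),
			.checkentry = recent_mt_check,
			.destroy    = recent_mt_destroy,
			.me         = THIS_MODULE,
		},
		{
			.name       = "recent",
			.revision   = 0,
			.family     = NFPROTO_IPV6,
			.match      = recent_mt,
			.matchsize  = sizeof(struct xt_recent_mtinfo),
			.checkentry = recent_mt_check,
			.destroy    = recent_mt_destroy,
			.me         = THIS_MODULE,
		},
	};

Registration would then go through xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)) rather than the single xt_register_match() call seen in the deleted init function.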
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
index e0b8caeb710c..297f1cbf4ff5 100644
--- a/net/ipv4/netfilter/ipt_ttl.c
+++ b/net/ipv4/netfilter/ipt_ttl.c
@@ -18,12 +18,9 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
18MODULE_DESCRIPTION("Xtables: IPv4 TTL field match"); 18MODULE_DESCRIPTION("Xtables: IPv4 TTL field match");
19MODULE_LICENSE("GPL"); 19MODULE_LICENSE("GPL");
20 20
21static bool 21static bool ttl_mt(const struct sk_buff *skb, const struct xt_match_param *par)
22ttl_mt(const struct sk_buff *skb, const struct net_device *in,
23 const struct net_device *out, const struct xt_match *match,
24 const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop)
25{ 22{
26 const struct ipt_ttl_info *info = matchinfo; 23 const struct ipt_ttl_info *info = par->matchinfo;
27 const u8 ttl = ip_hdr(skb)->ttl; 24 const u8 ttl = ip_hdr(skb)->ttl;
28 25
29 switch (info->mode) { 26 switch (info->mode) {
@@ -46,7 +43,7 @@ ttl_mt(const struct sk_buff *skb, const struct net_device *in,
46 43
47static struct xt_match ttl_mt_reg __read_mostly = { 44static struct xt_match ttl_mt_reg __read_mostly = {
48 .name = "ttl", 45 .name = "ttl",
49 .family = AF_INET, 46 .family = NFPROTO_IPV4,
50 .match = ttl_mt, 47 .match = ttl_mt,
51 .matchsize = sizeof(struct ipt_ttl_info), 48 .matchsize = sizeof(struct ipt_ttl_info),
52 .me = THIS_MODULE, 49 .me = THIS_MODULE,
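
The ipt_ttl hunk above is one instance of the tree-wide change that folds the old seven-argument match prototype into a single struct xt_match_param. The fragment below is a minimal sketch of a match written against that interface, assuming the 2.6.28-era field names (matchinfo, fragoff, hotdrop); it is not taken from any file in this diff.

	#include <linux/ip.h>
	#include <linux/skbuff.h>
	#include <linux/netfilter/x_tables.h>
	#include <linux/netfilter_ipv4/ipt_ttl.h>

	/* par->matchinfo replaces the old matchinfo argument, par->fragoff
	 * the old offset argument, and *par->hotdrop the old hotdrop
	 * pointer. */
	static bool example_mt(const struct sk_buff *skb,
			       const struct xt_match_param *par)
	{
		const struct ipt_ttl_info *info = par->matchinfo;

		if (par->fragoff != 0)	/* only inspect first fragments */
			return false;
		return ip_hdr(skb)->ttl == info->ttl;
	}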
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 1ea677dcf845..c9224310ebae 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -70,7 +70,7 @@ ipt_local_in_hook(unsigned int hook,
70 int (*okfn)(struct sk_buff *)) 70 int (*okfn)(struct sk_buff *))
71{ 71{
72 return ipt_do_table(skb, hook, in, out, 72 return ipt_do_table(skb, hook, in, out,
73 nf_local_in_net(in, out)->ipv4.iptable_filter); 73 dev_net(in)->ipv4.iptable_filter);
74} 74}
75 75
76static unsigned int 76static unsigned int
@@ -81,7 +81,7 @@ ipt_hook(unsigned int hook,
81 int (*okfn)(struct sk_buff *)) 81 int (*okfn)(struct sk_buff *))
82{ 82{
83 return ipt_do_table(skb, hook, in, out, 83 return ipt_do_table(skb, hook, in, out,
84 nf_forward_net(in, out)->ipv4.iptable_filter); 84 dev_net(in)->ipv4.iptable_filter);
85} 85}
86 86
87static unsigned int 87static unsigned int
@@ -101,7 +101,7 @@ ipt_local_out_hook(unsigned int hook,
101 } 101 }
102 102
103 return ipt_do_table(skb, hook, in, out, 103 return ipt_do_table(skb, hook, in, out,
104 nf_local_out_net(in, out)->ipv4.iptable_filter); 104 dev_net(out)->ipv4.iptable_filter);
105} 105}
106 106
107static struct nf_hook_ops ipt_ops[] __read_mostly = { 107static struct nf_hook_ops ipt_ops[] __read_mostly = {
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index da59182f2226..69f2c4287146 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -81,7 +81,7 @@ ipt_pre_routing_hook(unsigned int hook,
81 int (*okfn)(struct sk_buff *)) 81 int (*okfn)(struct sk_buff *))
82{ 82{
83 return ipt_do_table(skb, hook, in, out, 83 return ipt_do_table(skb, hook, in, out,
84 nf_pre_routing_net(in, out)->ipv4.iptable_mangle); 84 dev_net(in)->ipv4.iptable_mangle);
85} 85}
86 86
87static unsigned int 87static unsigned int
@@ -92,7 +92,7 @@ ipt_post_routing_hook(unsigned int hook,
92 int (*okfn)(struct sk_buff *)) 92 int (*okfn)(struct sk_buff *))
93{ 93{
94 return ipt_do_table(skb, hook, in, out, 94 return ipt_do_table(skb, hook, in, out,
95 nf_post_routing_net(in, out)->ipv4.iptable_mangle); 95 dev_net(out)->ipv4.iptable_mangle);
96} 96}
97 97
98static unsigned int 98static unsigned int
@@ -103,7 +103,7 @@ ipt_local_in_hook(unsigned int hook,
103 int (*okfn)(struct sk_buff *)) 103 int (*okfn)(struct sk_buff *))
104{ 104{
105 return ipt_do_table(skb, hook, in, out, 105 return ipt_do_table(skb, hook, in, out,
106 nf_local_in_net(in, out)->ipv4.iptable_mangle); 106 dev_net(in)->ipv4.iptable_mangle);
107} 107}
108 108
109static unsigned int 109static unsigned int
@@ -114,7 +114,7 @@ ipt_forward_hook(unsigned int hook,
114 int (*okfn)(struct sk_buff *)) 114 int (*okfn)(struct sk_buff *))
115{ 115{
116 return ipt_do_table(skb, hook, in, out, 116 return ipt_do_table(skb, hook, in, out,
117 nf_forward_net(in, out)->ipv4.iptable_mangle); 117 dev_net(in)->ipv4.iptable_mangle);
118} 118}
119 119
120static unsigned int 120static unsigned int
@@ -147,7 +147,7 @@ ipt_local_hook(unsigned int hook,
147 tos = iph->tos; 147 tos = iph->tos;
148 148
149 ret = ipt_do_table(skb, hook, in, out, 149 ret = ipt_do_table(skb, hook, in, out,
150 nf_local_out_net(in, out)->ipv4.iptable_mangle); 150 dev_net(out)->ipv4.iptable_mangle);
151 /* Reroute for ANY change. */ 151 /* Reroute for ANY change. */
152 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { 152 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
153 iph = ip_hdr(skb); 153 iph = ip_hdr(skb);
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index fddce7754b72..8faebfe638f1 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -53,7 +53,7 @@ ipt_hook(unsigned int hook,
53 int (*okfn)(struct sk_buff *)) 53 int (*okfn)(struct sk_buff *))
54{ 54{
55 return ipt_do_table(skb, hook, in, out, 55 return ipt_do_table(skb, hook, in, out,
56 nf_pre_routing_net(in, out)->ipv4.iptable_raw); 56 dev_net(in)->ipv4.iptable_raw);
57} 57}
58 58
59static unsigned int 59static unsigned int
@@ -72,7 +72,7 @@ ipt_local_hook(unsigned int hook,
72 return NF_ACCEPT; 72 return NF_ACCEPT;
73 } 73 }
74 return ipt_do_table(skb, hook, in, out, 74 return ipt_do_table(skb, hook, in, out,
75 nf_local_out_net(in, out)->ipv4.iptable_raw); 75 dev_net(out)->ipv4.iptable_raw);
76} 76}
77 77
78/* 'raw' is the very first table. */ 78/* 'raw' is the very first table. */
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index db6d312128e1..36f3be3cc428 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -73,7 +73,7 @@ ipt_local_in_hook(unsigned int hook,
73 int (*okfn)(struct sk_buff *)) 73 int (*okfn)(struct sk_buff *))
74{ 74{
75 return ipt_do_table(skb, hook, in, out, 75 return ipt_do_table(skb, hook, in, out,
76 nf_local_in_net(in, out)->ipv4.iptable_security); 76 dev_net(in)->ipv4.iptable_security);
77} 77}
78 78
79static unsigned int 79static unsigned int
@@ -84,7 +84,7 @@ ipt_forward_hook(unsigned int hook,
84 int (*okfn)(struct sk_buff *)) 84 int (*okfn)(struct sk_buff *))
85{ 85{
86 return ipt_do_table(skb, hook, in, out, 86 return ipt_do_table(skb, hook, in, out,
87 nf_forward_net(in, out)->ipv4.iptable_security); 87 dev_net(in)->ipv4.iptable_security);
88} 88}
89 89
90static unsigned int 90static unsigned int
@@ -103,7 +103,7 @@ ipt_local_out_hook(unsigned int hook,
103 return NF_ACCEPT; 103 return NF_ACCEPT;
104 } 104 }
105 return ipt_do_table(skb, hook, in, out, 105 return ipt_do_table(skb, hook, in, out,
106 nf_local_out_net(in, out)->ipv4.iptable_security); 106 dev_net(out)->ipv4.iptable_security);
107} 107}
108 108
109static struct nf_hook_ops ipt_ops[] __read_mostly = { 109static struct nf_hook_ops ipt_ops[] __read_mostly = {
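
The iptable_filter, iptable_mangle, iptable_raw and iptable_security hooks above all switch from the nf_*_net() helpers to deriving the namespace straight from the hook's device: dev_net(in) on the receive and forward paths, dev_net(out) on the output path, then indexing the per-namespace table pointer in net->ipv4. Condensed into one hook for clarity:

	/* Sketch of the lookup pattern shared by the hooks above. */
	static unsigned int
	example_in_hook(unsigned int hook, struct sk_buff *skb,
			const struct net_device *in,
			const struct net_device *out,
			int (*okfn)(struct sk_buff *))
	{
		/* Input-side hooks key off the ingress device; LOCAL_OUT and
		 * POST_ROUTING hooks would use dev_net(out) instead. */
		return ipt_do_table(skb, hook, in, out,
				    dev_net(in)->ipv4.iptable_filter);
	}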
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 5a955c440364..4a7c35275396 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -1,3 +1,4 @@
1
1/* (C) 1999-2001 Paul `Rusty' Russell 2/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> 3 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 * 4 *
@@ -24,6 +25,7 @@
24#include <net/netfilter/nf_conntrack_core.h> 25#include <net/netfilter/nf_conntrack_core.h>
25#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> 26#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
26#include <net/netfilter/nf_nat_helper.h> 27#include <net/netfilter/nf_nat_helper.h>
28#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
27 29
28int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb, 30int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
29 struct nf_conn *ct, 31 struct nf_conn *ct,
@@ -63,23 +65,6 @@ static int ipv4_print_tuple(struct seq_file *s,
63 NIPQUAD(tuple->dst.u3.ip)); 65 NIPQUAD(tuple->dst.u3.ip));
64} 66}
65 67
66/* Returns new sk_buff, or NULL */
67static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
68{
69 int err;
70
71 skb_orphan(skb);
72
73 local_bh_disable();
74 err = ip_defrag(skb, user);
75 local_bh_enable();
76
77 if (!err)
78 ip_send_check(ip_hdr(skb));
79
80 return err;
81}
82
83static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, 68static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
84 unsigned int *dataoff, u_int8_t *protonum) 69 unsigned int *dataoff, u_int8_t *protonum)
85{ 70{
@@ -144,35 +129,13 @@ out:
144 return nf_conntrack_confirm(skb); 129 return nf_conntrack_confirm(skb);
145} 130}
146 131
147static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
148 struct sk_buff *skb,
149 const struct net_device *in,
150 const struct net_device *out,
151 int (*okfn)(struct sk_buff *))
152{
153 /* Previously seen (loopback)? Ignore. Do this before
154 fragment check. */
155 if (skb->nfct)
156 return NF_ACCEPT;
157
158 /* Gather fragments. */
159 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
160 if (nf_ct_ipv4_gather_frags(skb,
161 hooknum == NF_INET_PRE_ROUTING ?
162 IP_DEFRAG_CONNTRACK_IN :
163 IP_DEFRAG_CONNTRACK_OUT))
164 return NF_STOLEN;
165 }
166 return NF_ACCEPT;
167}
168
169static unsigned int ipv4_conntrack_in(unsigned int hooknum, 132static unsigned int ipv4_conntrack_in(unsigned int hooknum,
170 struct sk_buff *skb, 133 struct sk_buff *skb,
171 const struct net_device *in, 134 const struct net_device *in,
172 const struct net_device *out, 135 const struct net_device *out,
173 int (*okfn)(struct sk_buff *)) 136 int (*okfn)(struct sk_buff *))
174{ 137{
175 return nf_conntrack_in(PF_INET, hooknum, skb); 138 return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb);
176} 139}
177 140
178static unsigned int ipv4_conntrack_local(unsigned int hooknum, 141static unsigned int ipv4_conntrack_local(unsigned int hooknum,
@@ -188,20 +151,13 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum,
188 printk("ipt_hook: happy cracking.\n"); 151 printk("ipt_hook: happy cracking.\n");
189 return NF_ACCEPT; 152 return NF_ACCEPT;
190 } 153 }
191 return nf_conntrack_in(PF_INET, hooknum, skb); 154 return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb);
192} 155}
193 156
194/* Connection tracking may drop packets, but never alters them, so 157/* Connection tracking may drop packets, but never alters them, so
195 make it the first hook. */ 158 make it the first hook. */
196static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { 159static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
197 { 160 {
198 .hook = ipv4_conntrack_defrag,
199 .owner = THIS_MODULE,
200 .pf = PF_INET,
201 .hooknum = NF_INET_PRE_ROUTING,
202 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
203 },
204 {
205 .hook = ipv4_conntrack_in, 161 .hook = ipv4_conntrack_in,
206 .owner = THIS_MODULE, 162 .owner = THIS_MODULE,
207 .pf = PF_INET, 163 .pf = PF_INET,
@@ -209,13 +165,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
209 .priority = NF_IP_PRI_CONNTRACK, 165 .priority = NF_IP_PRI_CONNTRACK,
210 }, 166 },
211 { 167 {
212 .hook = ipv4_conntrack_defrag,
213 .owner = THIS_MODULE,
214 .pf = PF_INET,
215 .hooknum = NF_INET_LOCAL_OUT,
216 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
217 },
218 {
219 .hook = ipv4_conntrack_local, 168 .hook = ipv4_conntrack_local,
220 .owner = THIS_MODULE, 169 .owner = THIS_MODULE,
221 .pf = PF_INET, 170 .pf = PF_INET,
@@ -254,7 +203,7 @@ static ctl_table ip_ct_sysctl_table[] = {
254 { 203 {
255 .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, 204 .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT,
256 .procname = "ip_conntrack_count", 205 .procname = "ip_conntrack_count",
257 .data = &nf_conntrack_count, 206 .data = &init_net.ct.count,
258 .maxlen = sizeof(int), 207 .maxlen = sizeof(int),
259 .mode = 0444, 208 .mode = 0444,
260 .proc_handler = &proc_dointvec, 209 .proc_handler = &proc_dointvec,
@@ -270,7 +219,7 @@ static ctl_table ip_ct_sysctl_table[] = {
270 { 219 {
271 .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, 220 .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM,
272 .procname = "ip_conntrack_checksum", 221 .procname = "ip_conntrack_checksum",
273 .data = &nf_conntrack_checksum, 222 .data = &init_net.ct.sysctl_checksum,
274 .maxlen = sizeof(int), 223 .maxlen = sizeof(int),
275 .mode = 0644, 224 .mode = 0644,
276 .proc_handler = &proc_dointvec, 225 .proc_handler = &proc_dointvec,
@@ -278,7 +227,7 @@ static ctl_table ip_ct_sysctl_table[] = {
278 { 227 {
279 .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, 228 .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID,
280 .procname = "ip_conntrack_log_invalid", 229 .procname = "ip_conntrack_log_invalid",
281 .data = &nf_ct_log_invalid, 230 .data = &init_net.ct.sysctl_log_invalid,
282 .maxlen = sizeof(unsigned int), 231 .maxlen = sizeof(unsigned int),
283 .mode = 0644, 232 .mode = 0644,
284 .proc_handler = &proc_dointvec_minmax, 233 .proc_handler = &proc_dointvec_minmax,
@@ -323,7 +272,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
323 return -EINVAL; 272 return -EINVAL;
324 } 273 }
325 274
326 h = nf_conntrack_find_get(&tuple); 275 h = nf_conntrack_find_get(sock_net(sk), &tuple);
327 if (h) { 276 if (h) {
328 struct sockaddr_in sin; 277 struct sockaddr_in sin;
329 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 278 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
@@ -422,6 +371,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
422 int ret = 0; 371 int ret = 0;
423 372
424 need_conntrack(); 373 need_conntrack();
374 nf_defrag_ipv4_enable();
425 375
426 ret = nf_register_sockopt(&so_getorigdst); 376 ret = nf_register_sockopt(&so_getorigdst);
427 if (ret < 0) { 377 if (ret < 0) {
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 3a020720e40b..313ebf00ee36 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -21,18 +21,20 @@
21#include <net/netfilter/nf_conntrack_acct.h> 21#include <net/netfilter/nf_conntrack_acct.h>
22 22
23struct ct_iter_state { 23struct ct_iter_state {
24 struct seq_net_private p;
24 unsigned int bucket; 25 unsigned int bucket;
25}; 26};
26 27
27static struct hlist_node *ct_get_first(struct seq_file *seq) 28static struct hlist_node *ct_get_first(struct seq_file *seq)
28{ 29{
30 struct net *net = seq_file_net(seq);
29 struct ct_iter_state *st = seq->private; 31 struct ct_iter_state *st = seq->private;
30 struct hlist_node *n; 32 struct hlist_node *n;
31 33
32 for (st->bucket = 0; 34 for (st->bucket = 0;
33 st->bucket < nf_conntrack_htable_size; 35 st->bucket < nf_conntrack_htable_size;
34 st->bucket++) { 36 st->bucket++) {
35 n = rcu_dereference(nf_conntrack_hash[st->bucket].first); 37 n = rcu_dereference(net->ct.hash[st->bucket].first);
36 if (n) 38 if (n)
37 return n; 39 return n;
38 } 40 }
@@ -42,13 +44,14 @@ static struct hlist_node *ct_get_first(struct seq_file *seq)
42static struct hlist_node *ct_get_next(struct seq_file *seq, 44static struct hlist_node *ct_get_next(struct seq_file *seq,
43 struct hlist_node *head) 45 struct hlist_node *head)
44{ 46{
47 struct net *net = seq_file_net(seq);
45 struct ct_iter_state *st = seq->private; 48 struct ct_iter_state *st = seq->private;
46 49
47 head = rcu_dereference(head->next); 50 head = rcu_dereference(head->next);
48 while (head == NULL) { 51 while (head == NULL) {
49 if (++st->bucket >= nf_conntrack_htable_size) 52 if (++st->bucket >= nf_conntrack_htable_size)
50 return NULL; 53 return NULL;
51 head = rcu_dereference(nf_conntrack_hash[st->bucket].first); 54 head = rcu_dereference(net->ct.hash[st->bucket].first);
52 } 55 }
53 return head; 56 return head;
54} 57}
@@ -158,8 +161,8 @@ static const struct seq_operations ct_seq_ops = {
158 161
159static int ct_open(struct inode *inode, struct file *file) 162static int ct_open(struct inode *inode, struct file *file)
160{ 163{
161 return seq_open_private(file, &ct_seq_ops, 164 return seq_open_net(inode, file, &ct_seq_ops,
162 sizeof(struct ct_iter_state)); 165 sizeof(struct ct_iter_state));
163} 166}
164 167
165static const struct file_operations ct_file_ops = { 168static const struct file_operations ct_file_ops = {
@@ -167,21 +170,23 @@ static const struct file_operations ct_file_ops = {
167 .open = ct_open, 170 .open = ct_open,
168 .read = seq_read, 171 .read = seq_read,
169 .llseek = seq_lseek, 172 .llseek = seq_lseek,
170 .release = seq_release_private, 173 .release = seq_release_net,
171}; 174};
172 175
173/* expects */ 176/* expects */
174struct ct_expect_iter_state { 177struct ct_expect_iter_state {
178 struct seq_net_private p;
175 unsigned int bucket; 179 unsigned int bucket;
176}; 180};
177 181
178static struct hlist_node *ct_expect_get_first(struct seq_file *seq) 182static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
179{ 183{
184 struct net *net = seq_file_net(seq);
180 struct ct_expect_iter_state *st = seq->private; 185 struct ct_expect_iter_state *st = seq->private;
181 struct hlist_node *n; 186 struct hlist_node *n;
182 187
183 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { 188 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
184 n = rcu_dereference(nf_ct_expect_hash[st->bucket].first); 189 n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
185 if (n) 190 if (n)
186 return n; 191 return n;
187 } 192 }
@@ -191,13 +196,14 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
191static struct hlist_node *ct_expect_get_next(struct seq_file *seq, 196static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
192 struct hlist_node *head) 197 struct hlist_node *head)
193{ 198{
199 struct net *net = seq_file_net(seq);
194 struct ct_expect_iter_state *st = seq->private; 200 struct ct_expect_iter_state *st = seq->private;
195 201
196 head = rcu_dereference(head->next); 202 head = rcu_dereference(head->next);
197 while (head == NULL) { 203 while (head == NULL) {
198 if (++st->bucket >= nf_ct_expect_hsize) 204 if (++st->bucket >= nf_ct_expect_hsize)
199 return NULL; 205 return NULL;
200 head = rcu_dereference(nf_ct_expect_hash[st->bucket].first); 206 head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
201 } 207 }
202 return head; 208 return head;
203} 209}
@@ -265,8 +271,8 @@ static const struct seq_operations exp_seq_ops = {
265 271
266static int exp_open(struct inode *inode, struct file *file) 272static int exp_open(struct inode *inode, struct file *file)
267{ 273{
268 return seq_open_private(file, &exp_seq_ops, 274 return seq_open_net(inode, file, &exp_seq_ops,
269 sizeof(struct ct_expect_iter_state)); 275 sizeof(struct ct_expect_iter_state));
270} 276}
271 277
272static const struct file_operations ip_exp_file_ops = { 278static const struct file_operations ip_exp_file_ops = {
@@ -274,11 +280,12 @@ static const struct file_operations ip_exp_file_ops = {
274 .open = exp_open, 280 .open = exp_open,
275 .read = seq_read, 281 .read = seq_read,
276 .llseek = seq_lseek, 282 .llseek = seq_lseek,
277 .release = seq_release_private, 283 .release = seq_release_net,
278}; 284};
279 285
280static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) 286static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
281{ 287{
288 struct net *net = seq_file_net(seq);
282 int cpu; 289 int cpu;
283 290
284 if (*pos == 0) 291 if (*pos == 0)
@@ -288,7 +295,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
288 if (!cpu_possible(cpu)) 295 if (!cpu_possible(cpu))
289 continue; 296 continue;
290 *pos = cpu+1; 297 *pos = cpu+1;
291 return &per_cpu(nf_conntrack_stat, cpu); 298 return per_cpu_ptr(net->ct.stat, cpu);
292 } 299 }
293 300
294 return NULL; 301 return NULL;
@@ -296,13 +303,14 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
296 303
297static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) 304static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
298{ 305{
306 struct net *net = seq_file_net(seq);
299 int cpu; 307 int cpu;
300 308
301 for (cpu = *pos; cpu < NR_CPUS; ++cpu) { 309 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
302 if (!cpu_possible(cpu)) 310 if (!cpu_possible(cpu))
303 continue; 311 continue;
304 *pos = cpu+1; 312 *pos = cpu+1;
305 return &per_cpu(nf_conntrack_stat, cpu); 313 return per_cpu_ptr(net->ct.stat, cpu);
306 } 314 }
307 315
308 return NULL; 316 return NULL;
@@ -314,7 +322,8 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
314 322
315static int ct_cpu_seq_show(struct seq_file *seq, void *v) 323static int ct_cpu_seq_show(struct seq_file *seq, void *v)
316{ 324{
317 unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); 325 struct net *net = seq_file_net(seq);
326 unsigned int nr_conntracks = atomic_read(&net->ct.count);
318 const struct ip_conntrack_stat *st = v; 327 const struct ip_conntrack_stat *st = v;
319 328
320 if (v == SEQ_START_TOKEN) { 329 if (v == SEQ_START_TOKEN) {
@@ -354,7 +363,8 @@ static const struct seq_operations ct_cpu_seq_ops = {
354 363
355static int ct_cpu_seq_open(struct inode *inode, struct file *file) 364static int ct_cpu_seq_open(struct inode *inode, struct file *file)
356{ 365{
357 return seq_open(file, &ct_cpu_seq_ops); 366 return seq_open_net(inode, file, &ct_cpu_seq_ops,
367 sizeof(struct seq_net_private));
358} 368}
359 369
360static const struct file_operations ct_cpu_seq_fops = { 370static const struct file_operations ct_cpu_seq_fops = {
@@ -362,39 +372,54 @@ static const struct file_operations ct_cpu_seq_fops = {
362 .open = ct_cpu_seq_open, 372 .open = ct_cpu_seq_open,
363 .read = seq_read, 373 .read = seq_read,
364 .llseek = seq_lseek, 374 .llseek = seq_lseek,
365 .release = seq_release, 375 .release = seq_release_net,
366}; 376};
367 377
368int __init nf_conntrack_ipv4_compat_init(void) 378static int __net_init ip_conntrack_net_init(struct net *net)
369{ 379{
370 struct proc_dir_entry *proc, *proc_exp, *proc_stat; 380 struct proc_dir_entry *proc, *proc_exp, *proc_stat;
371 381
372 proc = proc_net_fops_create(&init_net, "ip_conntrack", 0440, &ct_file_ops); 382 proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops);
373 if (!proc) 383 if (!proc)
374 goto err1; 384 goto err1;
375 385
376 proc_exp = proc_net_fops_create(&init_net, "ip_conntrack_expect", 0440, 386 proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440,
377 &ip_exp_file_ops); 387 &ip_exp_file_ops);
378 if (!proc_exp) 388 if (!proc_exp)
379 goto err2; 389 goto err2;
380 390
381 proc_stat = proc_create("ip_conntrack", S_IRUGO, 391 proc_stat = proc_create("ip_conntrack", S_IRUGO,
382 init_net.proc_net_stat, &ct_cpu_seq_fops); 392 net->proc_net_stat, &ct_cpu_seq_fops);
383 if (!proc_stat) 393 if (!proc_stat)
384 goto err3; 394 goto err3;
385 return 0; 395 return 0;
386 396
387err3: 397err3:
388 proc_net_remove(&init_net, "ip_conntrack_expect"); 398 proc_net_remove(net, "ip_conntrack_expect");
389err2: 399err2:
390 proc_net_remove(&init_net, "ip_conntrack"); 400 proc_net_remove(net, "ip_conntrack");
391err1: 401err1:
392 return -ENOMEM; 402 return -ENOMEM;
393} 403}
394 404
405static void __net_exit ip_conntrack_net_exit(struct net *net)
406{
407 remove_proc_entry("ip_conntrack", net->proc_net_stat);
408 proc_net_remove(net, "ip_conntrack_expect");
409 proc_net_remove(net, "ip_conntrack");
410}
411
412static struct pernet_operations ip_conntrack_net_ops = {
413 .init = ip_conntrack_net_init,
414 .exit = ip_conntrack_net_exit,
415};
416
417int __init nf_conntrack_ipv4_compat_init(void)
418{
419 return register_pernet_subsys(&ip_conntrack_net_ops);
420}
421
395void __exit nf_conntrack_ipv4_compat_fini(void) 422void __exit nf_conntrack_ipv4_compat_fini(void)
396{ 423{
397 remove_proc_entry("ip_conntrack", init_net.proc_net_stat); 424 unregister_pernet_subsys(&ip_conntrack_net_ops);
398 proc_net_remove(&init_net, "ip_conntrack_expect");
399 proc_net_remove(&init_net, "ip_conntrack");
400} 425}
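
The /proc/net/ip_conntrack* changes above follow the standard recipe for namespace-aware seq_files: put a struct seq_net_private at the start of the iterator state, open with seq_open_net() and release with seq_release_net(), then recover the namespace with seq_file_net() inside the iterators. A minimal sketch under those assumptions (example_seq_ops is assumed to be defined elsewhere):

	#include <linux/module.h>
	#include <linux/proc_fs.h>
	#include <linux/seq_file.h>
	#include <net/net_namespace.h>

	struct example_iter_state {
		struct seq_net_private p; /* first, for seq_file_net() */
		unsigned int bucket;
	};

	static int example_open(struct inode *inode, struct file *file)
	{
		return seq_open_net(inode, file, &example_seq_ops,
				    sizeof(struct example_iter_state));
	}

	static const struct file_operations example_fops = {
		.owner   = THIS_MODULE,
		.open    = example_open,
		.read    = seq_read,
		.llseek  = seq_lseek,
		.release = seq_release_net, /* pairs with seq_open_net() */
	};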
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 97791048fa9b..4e8879220222 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -79,7 +79,7 @@ static int icmp_packet(struct nf_conn *ct,
79 const struct sk_buff *skb, 79 const struct sk_buff *skb,
80 unsigned int dataoff, 80 unsigned int dataoff,
81 enum ip_conntrack_info ctinfo, 81 enum ip_conntrack_info ctinfo,
82 int pf, 82 u_int8_t pf,
83 unsigned int hooknum) 83 unsigned int hooknum)
84{ 84{
85 /* Try to delete connection immediately after all replies: 85 /* Try to delete connection immediately after all replies:
@@ -91,7 +91,7 @@ static int icmp_packet(struct nf_conn *ct,
91 nf_ct_kill_acct(ct, ctinfo, skb); 91 nf_ct_kill_acct(ct, ctinfo, skb);
92 } else { 92 } else {
93 atomic_inc(&ct->proto.icmp.count); 93 atomic_inc(&ct->proto.icmp.count);
94 nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); 94 nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct);
95 nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); 95 nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout);
96 } 96 }
97 97
@@ -123,7 +123,7 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
123 123
124/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ 124/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
125static int 125static int
126icmp_error_message(struct sk_buff *skb, 126icmp_error_message(struct net *net, struct sk_buff *skb,
127 enum ip_conntrack_info *ctinfo, 127 enum ip_conntrack_info *ctinfo,
128 unsigned int hooknum) 128 unsigned int hooknum)
129{ 129{
@@ -155,7 +155,7 @@ icmp_error_message(struct sk_buff *skb,
155 155
156 *ctinfo = IP_CT_RELATED; 156 *ctinfo = IP_CT_RELATED;
157 157
158 h = nf_conntrack_find_get(&innertuple); 158 h = nf_conntrack_find_get(net, &innertuple);
159 if (!h) { 159 if (!h) {
160 pr_debug("icmp_error_message: no match\n"); 160 pr_debug("icmp_error_message: no match\n");
161 return -NF_ACCEPT; 161 return -NF_ACCEPT;
@@ -172,8 +172,8 @@ icmp_error_message(struct sk_buff *skb,
172 172
173/* Small and modified version of icmp_rcv */ 173/* Small and modified version of icmp_rcv */
174static int 174static int
175icmp_error(struct sk_buff *skb, unsigned int dataoff, 175icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
176 enum ip_conntrack_info *ctinfo, int pf, unsigned int hooknum) 176 enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
177{ 177{
178 const struct icmphdr *icmph; 178 const struct icmphdr *icmph;
179 struct icmphdr _ih; 179 struct icmphdr _ih;
@@ -181,16 +181,16 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
181 /* Not enough header? */ 181 /* Not enough header? */
182 icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); 182 icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
183 if (icmph == NULL) { 183 if (icmph == NULL) {
184 if (LOG_INVALID(IPPROTO_ICMP)) 184 if (LOG_INVALID(net, IPPROTO_ICMP))
185 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 185 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
186 "nf_ct_icmp: short packet "); 186 "nf_ct_icmp: short packet ");
187 return -NF_ACCEPT; 187 return -NF_ACCEPT;
188 } 188 }
189 189
190 /* See ip_conntrack_proto_tcp.c */ 190 /* See ip_conntrack_proto_tcp.c */
191 if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && 191 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
192 nf_ip_checksum(skb, hooknum, dataoff, 0)) { 192 nf_ip_checksum(skb, hooknum, dataoff, 0)) {
193 if (LOG_INVALID(IPPROTO_ICMP)) 193 if (LOG_INVALID(net, IPPROTO_ICMP))
194 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 194 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
195 "nf_ct_icmp: bad HW ICMP checksum "); 195 "nf_ct_icmp: bad HW ICMP checksum ");
196 return -NF_ACCEPT; 196 return -NF_ACCEPT;
@@ -203,7 +203,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
203 * discarded. 203 * discarded.
204 */ 204 */
205 if (icmph->type > NR_ICMP_TYPES) { 205 if (icmph->type > NR_ICMP_TYPES) {
206 if (LOG_INVALID(IPPROTO_ICMP)) 206 if (LOG_INVALID(net, IPPROTO_ICMP))
207 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 207 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
208 "nf_ct_icmp: invalid ICMP type "); 208 "nf_ct_icmp: invalid ICMP type ");
209 return -NF_ACCEPT; 209 return -NF_ACCEPT;
@@ -217,7 +217,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff,
217 && icmph->type != ICMP_REDIRECT) 217 && icmph->type != ICMP_REDIRECT)
218 return NF_ACCEPT; 218 return NF_ACCEPT;
219 219
220 return icmp_error_message(skb, ctinfo, hooknum); 220 return icmp_error_message(net, skb, ctinfo, hooknum);
221} 221}
222 222
223#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 223#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
new file mode 100644
index 000000000000..aa2c50a180f7
--- /dev/null
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -0,0 +1,96 @@
1/* (C) 1999-2001 Paul `Rusty' Russell
2 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8
9#include <linux/types.h>
10#include <linux/ip.h>
11#include <linux/netfilter.h>
12#include <linux/module.h>
13#include <linux/skbuff.h>
14#include <net/route.h>
15#include <net/ip.h>
16
17#include <linux/netfilter_ipv4.h>
18#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
19
20/* Returns new sk_buff, or NULL */
21static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
22{
23 int err;
24
25 skb_orphan(skb);
26
27 local_bh_disable();
28 err = ip_defrag(skb, user);
29 local_bh_enable();
30
31 if (!err)
32 ip_send_check(ip_hdr(skb));
33
34 return err;
35}
36
37static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
38 struct sk_buff *skb,
39 const struct net_device *in,
40 const struct net_device *out,
41 int (*okfn)(struct sk_buff *))
42{
43#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
44 /* Previously seen (loopback)? Ignore. Do this before
45 fragment check. */
46 if (skb->nfct)
47 return NF_ACCEPT;
48#endif
49
50 /* Gather fragments. */
51 if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
52 if (nf_ct_ipv4_gather_frags(skb,
53 hooknum == NF_INET_PRE_ROUTING ?
54 IP_DEFRAG_CONNTRACK_IN :
55 IP_DEFRAG_CONNTRACK_OUT))
56 return NF_STOLEN;
57 }
58 return NF_ACCEPT;
59}
60
61static struct nf_hook_ops ipv4_defrag_ops[] = {
62 {
63 .hook = ipv4_conntrack_defrag,
64 .owner = THIS_MODULE,
65 .pf = PF_INET,
66 .hooknum = NF_INET_PRE_ROUTING,
67 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
68 },
69 {
70 .hook = ipv4_conntrack_defrag,
71 .owner = THIS_MODULE,
72 .pf = PF_INET,
73 .hooknum = NF_INET_LOCAL_OUT,
74 .priority = NF_IP_PRI_CONNTRACK_DEFRAG,
75 },
76};
77
78static int __init nf_defrag_init(void)
79{
80 return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
81}
82
83static void __exit nf_defrag_fini(void)
84{
85 nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
86}
87
88void nf_defrag_ipv4_enable(void)
89{
90}
91EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
92
93module_init(nf_defrag_init);
94module_exit(nf_defrag_fini);
95
96MODULE_LICENSE("GPL");
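
Note how the new module pairs with the conntrack change earlier in this diff: the hooks are registered unconditionally from nf_defrag_init(), while nf_defrag_ipv4_enable() is an intentionally empty exported function whose only job is to create a module dependency. Any code that needs defragmentation simply calls it, as nf_conntrack_l3proto_ipv4_init() now does; a consumer sketch:

	#include <linux/module.h>
	#include <net/netfilter/ipv4/nf_defrag_ipv4.h>

	static int __init example_consumer_init(void)
	{
		/* The call does nothing at runtime, but the symbol reference
		 * makes loading this module pull in nf_defrag_ipv4 and its
		 * PRE_ROUTING/LOCAL_OUT defrag hooks. */
		nf_defrag_ipv4_enable();
		return 0;
	}
	module_init(example_consumer_init);

	MODULE_LICENSE("GPL");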
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 6c6a3cba8d50..2ac9eaf1a8c9 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -37,9 +37,6 @@ static struct nf_conntrack_l3proto *l3proto __read_mostly;
37 37
38/* Calculated at init based on memory size */ 38/* Calculated at init based on memory size */
39static unsigned int nf_nat_htable_size __read_mostly; 39static unsigned int nf_nat_htable_size __read_mostly;
40static int nf_nat_vmalloced;
41
42static struct hlist_head *bysource __read_mostly;
43 40
44#define MAX_IP_NAT_PROTO 256 41#define MAX_IP_NAT_PROTO 256
45static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] 42static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
@@ -145,7 +142,8 @@ same_src(const struct nf_conn *ct,
145 142
146/* Only called for SRC manip */ 143/* Only called for SRC manip */
147static int 144static int
148find_appropriate_src(const struct nf_conntrack_tuple *tuple, 145find_appropriate_src(struct net *net,
146 const struct nf_conntrack_tuple *tuple,
149 struct nf_conntrack_tuple *result, 147 struct nf_conntrack_tuple *result,
150 const struct nf_nat_range *range) 148 const struct nf_nat_range *range)
151{ 149{
@@ -155,7 +153,7 @@ find_appropriate_src(const struct nf_conntrack_tuple *tuple,
155 const struct hlist_node *n; 153 const struct hlist_node *n;
156 154
157 rcu_read_lock(); 155 rcu_read_lock();
158 hlist_for_each_entry_rcu(nat, n, &bysource[h], bysource) { 156 hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
159 ct = nat->ct; 157 ct = nat->ct;
160 if (same_src(ct, tuple)) { 158 if (same_src(ct, tuple)) {
161 /* Copy source part from reply tuple. */ 159 /* Copy source part from reply tuple. */
@@ -231,6 +229,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
231 struct nf_conn *ct, 229 struct nf_conn *ct,
232 enum nf_nat_manip_type maniptype) 230 enum nf_nat_manip_type maniptype)
233{ 231{
232 struct net *net = nf_ct_net(ct);
234 const struct nf_nat_protocol *proto; 233 const struct nf_nat_protocol *proto;
235 234
236 /* 1) If this srcip/proto/src-proto-part is currently mapped, 235 /* 1) If this srcip/proto/src-proto-part is currently mapped,
@@ -242,7 +241,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
242 manips not an issue. */ 241 manips not an issue. */
243 if (maniptype == IP_NAT_MANIP_SRC && 242 if (maniptype == IP_NAT_MANIP_SRC &&
244 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 243 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
245 if (find_appropriate_src(orig_tuple, tuple, range)) { 244 if (find_appropriate_src(net, orig_tuple, tuple, range)) {
246 pr_debug("get_unique_tuple: Found current src map\n"); 245 pr_debug("get_unique_tuple: Found current src map\n");
247 if (!nf_nat_used_tuple(tuple, ct)) 246 if (!nf_nat_used_tuple(tuple, ct))
248 return; 247 return;
@@ -283,6 +282,7 @@ nf_nat_setup_info(struct nf_conn *ct,
283 const struct nf_nat_range *range, 282 const struct nf_nat_range *range,
284 enum nf_nat_manip_type maniptype) 283 enum nf_nat_manip_type maniptype)
285{ 284{
285 struct net *net = nf_ct_net(ct);
286 struct nf_conntrack_tuple curr_tuple, new_tuple; 286 struct nf_conntrack_tuple curr_tuple, new_tuple;
287 struct nf_conn_nat *nat; 287 struct nf_conn_nat *nat;
288 int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); 288 int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
@@ -334,7 +334,8 @@ nf_nat_setup_info(struct nf_conn *ct,
334 /* nf_conntrack_alter_reply might re-allocate exntension aera */ 334 /* nf_conntrack_alter_reply might re-allocate exntension aera */
335 nat = nfct_nat(ct); 335 nat = nfct_nat(ct);
336 nat->ct = ct; 336 nat->ct = ct;
337 hlist_add_head_rcu(&nat->bysource, &bysource[srchash]); 337 hlist_add_head_rcu(&nat->bysource,
338 &net->ipv4.nat_bysource[srchash]);
338 spin_unlock_bh(&nf_nat_lock); 339 spin_unlock_bh(&nf_nat_lock);
339 } 340 }
340 341
@@ -583,6 +584,40 @@ static struct nf_ct_ext_type nat_extend __read_mostly = {
583 .flags = NF_CT_EXT_F_PREALLOC, 584 .flags = NF_CT_EXT_F_PREALLOC,
584}; 585};
585 586
587static int __net_init nf_nat_net_init(struct net *net)
588{
589 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
590 &net->ipv4.nat_vmalloced);
591 if (!net->ipv4.nat_bysource)
592 return -ENOMEM;
593 return 0;
594}
595
596/* Clear NAT section of all conntracks, in case we're loaded again. */
597static int clean_nat(struct nf_conn *i, void *data)
598{
599 struct nf_conn_nat *nat = nfct_nat(i);
600
601 if (!nat)
602 return 0;
603 memset(nat, 0, sizeof(*nat));
604 i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
605 return 0;
606}
607
608static void __net_exit nf_nat_net_exit(struct net *net)
609{
610 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
611 synchronize_rcu();
612 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
613 nf_nat_htable_size);
614}
615
616static struct pernet_operations nf_nat_net_ops = {
617 .init = nf_nat_net_init,
618 .exit = nf_nat_net_exit,
619};
620
586static int __init nf_nat_init(void) 621static int __init nf_nat_init(void)
587{ 622{
588 size_t i; 623 size_t i;
@@ -599,12 +634,9 @@ static int __init nf_nat_init(void)
599 /* Leave them the same for the moment. */ 634 /* Leave them the same for the moment. */
600 nf_nat_htable_size = nf_conntrack_htable_size; 635 nf_nat_htable_size = nf_conntrack_htable_size;
601 636
602 bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 637 ret = register_pernet_subsys(&nf_nat_net_ops);
603 &nf_nat_vmalloced); 638 if (ret < 0)
604 if (!bysource) {
605 ret = -ENOMEM;
606 goto cleanup_extend; 639 goto cleanup_extend;
607 }
608 640
609 /* Sew in builtin protocols. */ 641 /* Sew in builtin protocols. */
610 spin_lock_bh(&nf_nat_lock); 642 spin_lock_bh(&nf_nat_lock);
@@ -629,23 +661,9 @@ static int __init nf_nat_init(void)
629 return ret; 661 return ret;
630} 662}
631 663
632/* Clear NAT section of all conntracks, in case we're loaded again. */
633static int clean_nat(struct nf_conn *i, void *data)
634{
635 struct nf_conn_nat *nat = nfct_nat(i);
636
637 if (!nat)
638 return 0;
639 memset(nat, 0, sizeof(*nat));
640 i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
641 return 0;
642}
643
644static void __exit nf_nat_cleanup(void) 664static void __exit nf_nat_cleanup(void)
645{ 665{
646 nf_ct_iterate_cleanup(&clean_nat, NULL); 666 unregister_pernet_subsys(&nf_nat_net_ops);
647 synchronize_rcu();
648 nf_ct_free_hashtable(bysource, nf_nat_vmalloced, nf_nat_htable_size);
649 nf_ct_l3proto_put(l3proto); 667 nf_ct_l3proto_put(l3proto);
650 nf_ct_extend_unregister(&nat_extend); 668 nf_ct_extend_unregister(&nat_extend);
651 rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); 669 rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL);
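
The NAT core conversion above is the usual per-namespace pattern: state that used to be global (the bysource hash) moves into struct net, an init/exit pair allocates and frees it for each namespace, and module init shrinks to a single registration call. Stripped of the NAT specifics, the skeleton looks like this:

	#include <linux/module.h>
	#include <net/net_namespace.h>

	static int __net_init example_net_init(struct net *net)
	{
		/* allocate per-namespace state here, e.g. a hash table */
		return 0;
	}

	static void __net_exit example_net_exit(struct net *net)
	{
		/* tear down whatever example_net_init() allocated */
	}

	static struct pernet_operations example_net_ops = {
		.init = example_net_init,
		.exit = example_net_exit,
	};

	static int __init example_init(void)
	{
		return register_pernet_subsys(&example_net_ops);
	}

	static void __exit example_exit(void)
	{
		unregister_pernet_subsys(&example_net_ops);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");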
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 11976ea29884..cf7a42bf9820 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -16,6 +16,7 @@
16#include <linux/udp.h> 16#include <linux/udp.h>
17#include <net/checksum.h> 17#include <net/checksum.h>
18#include <net/tcp.h> 18#include <net/tcp.h>
19#include <net/route.h>
19 20
20#include <linux/netfilter_ipv4.h> 21#include <linux/netfilter_ipv4.h>
21#include <net/netfilter/nf_conntrack.h> 22#include <net/netfilter/nf_conntrack.h>
@@ -192,7 +193,7 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb,
192 nf_conntrack_tcp_update(skb, ip_hdrlen(skb), 193 nf_conntrack_tcp_update(skb, ip_hdrlen(skb),
193 ct, CTINFO2DIR(ctinfo)); 194 ct, CTINFO2DIR(ctinfo));
194 195
195 nf_conntrack_event_cache(IPCT_NATSEQADJ, skb); 196 nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
196 } 197 }
197 return 1; 198 return 1;
198} 199}
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index da3d91a5ef5c..9eb171056c63 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -40,6 +40,7 @@ MODULE_ALIAS("ip_nat_pptp");
40static void pptp_nat_expected(struct nf_conn *ct, 40static void pptp_nat_expected(struct nf_conn *ct,
41 struct nf_conntrack_expect *exp) 41 struct nf_conntrack_expect *exp)
42{ 42{
43 struct net *net = nf_ct_net(ct);
43 const struct nf_conn *master = ct->master; 44 const struct nf_conn *master = ct->master;
44 struct nf_conntrack_expect *other_exp; 45 struct nf_conntrack_expect *other_exp;
45 struct nf_conntrack_tuple t; 46 struct nf_conntrack_tuple t;
@@ -73,7 +74,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
73 74
74 pr_debug("trying to unexpect other dir: "); 75 pr_debug("trying to unexpect other dir: ");
75 nf_ct_dump_tuple_ip(&t); 76 nf_ct_dump_tuple_ip(&t);
76 other_exp = nf_ct_expect_find_get(&t); 77 other_exp = nf_ct_expect_find_get(net, &t);
77 if (other_exp) { 78 if (other_exp) {
78 nf_ct_unexpect_related(other_exp); 79 nf_ct_unexpect_related(other_exp);
79 nf_ct_expect_put(other_exp); 80 nf_ct_expect_put(other_exp);
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index e8b4d0d4439e..bea54a685109 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -33,7 +33,7 @@ static struct
33 struct ipt_replace repl; 33 struct ipt_replace repl;
34 struct ipt_standard entries[3]; 34 struct ipt_standard entries[3];
35 struct ipt_error term; 35 struct ipt_error term;
36} nat_initial_table __initdata = { 36} nat_initial_table __net_initdata = {
37 .repl = { 37 .repl = {
38 .name = "nat", 38 .name = "nat",
39 .valid_hooks = NAT_VALID_HOOKS, 39 .valid_hooks = NAT_VALID_HOOKS,
@@ -58,47 +58,42 @@ static struct
58 .term = IPT_ERROR_INIT, /* ERROR */ 58 .term = IPT_ERROR_INIT, /* ERROR */
59}; 59};
60 60
61static struct xt_table __nat_table = { 61static struct xt_table nat_table = {
62 .name = "nat", 62 .name = "nat",
63 .valid_hooks = NAT_VALID_HOOKS, 63 .valid_hooks = NAT_VALID_HOOKS,
64 .lock = __RW_LOCK_UNLOCKED(__nat_table.lock), 64 .lock = __RW_LOCK_UNLOCKED(__nat_table.lock),
65 .me = THIS_MODULE, 65 .me = THIS_MODULE,
66 .af = AF_INET, 66 .af = AF_INET,
67}; 67};
68static struct xt_table *nat_table;
69 68
70/* Source NAT */ 69/* Source NAT */
71static unsigned int ipt_snat_target(struct sk_buff *skb, 70static unsigned int
72 const struct net_device *in, 71ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par)
73 const struct net_device *out,
74 unsigned int hooknum,
75 const struct xt_target *target,
76 const void *targinfo)
77{ 72{
78 struct nf_conn *ct; 73 struct nf_conn *ct;
79 enum ip_conntrack_info ctinfo; 74 enum ip_conntrack_info ctinfo;
80 const struct nf_nat_multi_range_compat *mr = targinfo; 75 const struct nf_nat_multi_range_compat *mr = par->targinfo;
81 76
82 NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); 77 NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
83 78
84 ct = nf_ct_get(skb, &ctinfo); 79 ct = nf_ct_get(skb, &ctinfo);
85 80
86 /* Connection must be valid and new. */ 81 /* Connection must be valid and new. */
87 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || 82 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
88 ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); 83 ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
89 NF_CT_ASSERT(out); 84 NF_CT_ASSERT(par->out != NULL);
90 85
91 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC); 86 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_SRC);
92} 87}
93 88
94/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */ 89/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */
95static void warn_if_extra_mangle(__be32 dstip, __be32 srcip) 90static void warn_if_extra_mangle(struct net *net, __be32 dstip, __be32 srcip)
96{ 91{
97 static int warned = 0; 92 static int warned = 0;
98 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } }; 93 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } };
99 struct rtable *rt; 94 struct rtable *rt;
100 95
101 if (ip_route_output_key(&init_net, &rt, &fl) != 0) 96 if (ip_route_output_key(net, &rt, &fl) != 0)
102 return; 97 return;
103 98
104 if (rt->rt_src != srcip && !warned) { 99 if (rt->rt_src != srcip && !warned) {
@@ -110,40 +105,32 @@ static void warn_if_extra_mangle(__be32 dstip, __be32 srcip)
110 ip_rt_put(rt); 105 ip_rt_put(rt);
111} 106}
112 107
113static unsigned int ipt_dnat_target(struct sk_buff *skb, 108static unsigned int
114 const struct net_device *in, 109ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par)
115 const struct net_device *out,
116 unsigned int hooknum,
117 const struct xt_target *target,
118 const void *targinfo)
119{ 110{
120 struct nf_conn *ct; 111 struct nf_conn *ct;
121 enum ip_conntrack_info ctinfo; 112 enum ip_conntrack_info ctinfo;
122 const struct nf_nat_multi_range_compat *mr = targinfo; 113 const struct nf_nat_multi_range_compat *mr = par->targinfo;
123 114
124 NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING || 115 NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
125 hooknum == NF_INET_LOCAL_OUT); 116 par->hooknum == NF_INET_LOCAL_OUT);
126 117
127 ct = nf_ct_get(skb, &ctinfo); 118 ct = nf_ct_get(skb, &ctinfo);
128 119
129 /* Connection must be valid and new. */ 120 /* Connection must be valid and new. */
130 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); 121 NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
131 122
132 if (hooknum == NF_INET_LOCAL_OUT && 123 if (par->hooknum == NF_INET_LOCAL_OUT &&
133 mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) 124 mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
134 warn_if_extra_mangle(ip_hdr(skb)->daddr, 125 warn_if_extra_mangle(dev_net(par->out), ip_hdr(skb)->daddr,
135 mr->range[0].min_ip); 126 mr->range[0].min_ip);
136 127
137 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST); 128 return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST);
138} 129}
139 130
140static bool ipt_snat_checkentry(const char *tablename, 131static bool ipt_snat_checkentry(const struct xt_tgchk_param *par)
141 const void *entry,
142 const struct xt_target *target,
143 void *targinfo,
144 unsigned int hook_mask)
145{ 132{
146 const struct nf_nat_multi_range_compat *mr = targinfo; 133 const struct nf_nat_multi_range_compat *mr = par->targinfo;
147 134
148 /* Must be a valid range */ 135 /* Must be a valid range */
149 if (mr->rangesize != 1) { 136 if (mr->rangesize != 1) {
@@ -153,13 +140,9 @@ static bool ipt_snat_checkentry(const char *tablename,
153 return true; 140 return true;
154} 141}
155 142
156static bool ipt_dnat_checkentry(const char *tablename, 143static bool ipt_dnat_checkentry(const struct xt_tgchk_param *par)
157 const void *entry,
158 const struct xt_target *target,
159 void *targinfo,
160 unsigned int hook_mask)
161{ 144{
162 const struct nf_nat_multi_range_compat *mr = targinfo; 145 const struct nf_nat_multi_range_compat *mr = par->targinfo;
163 146
164 /* Must be a valid range */ 147 /* Must be a valid range */
165 if (mr->rangesize != 1) { 148 if (mr->rangesize != 1) {
@@ -194,9 +177,10 @@ int nf_nat_rule_find(struct sk_buff *skb,
194 const struct net_device *out, 177 const struct net_device *out,
195 struct nf_conn *ct) 178 struct nf_conn *ct)
196{ 179{
180 struct net *net = nf_ct_net(ct);
197 int ret; 181 int ret;
198 182
199 ret = ipt_do_table(skb, hooknum, in, out, nat_table); 183 ret = ipt_do_table(skb, hooknum, in, out, net->ipv4.nat_table);
200 184
201 if (ret == NF_ACCEPT) { 185 if (ret == NF_ACCEPT) {
202 if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum))) 186 if (!nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
@@ -226,14 +210,32 @@ static struct xt_target ipt_dnat_reg __read_mostly = {
226 .family = AF_INET, 210 .family = AF_INET,
227}; 211};
228 212
213static int __net_init nf_nat_rule_net_init(struct net *net)
214{
215 net->ipv4.nat_table = ipt_register_table(net, &nat_table,
216 &nat_initial_table.repl);
217 if (IS_ERR(net->ipv4.nat_table))
218 return PTR_ERR(net->ipv4.nat_table);
219 return 0;
220}
221
222static void __net_exit nf_nat_rule_net_exit(struct net *net)
223{
224 ipt_unregister_table(net->ipv4.nat_table);
225}
226
227static struct pernet_operations nf_nat_rule_net_ops = {
228 .init = nf_nat_rule_net_init,
229 .exit = nf_nat_rule_net_exit,
230};
231
229int __init nf_nat_rule_init(void) 232int __init nf_nat_rule_init(void)
230{ 233{
231 int ret; 234 int ret;
232 235
233 nat_table = ipt_register_table(&init_net, &__nat_table, 236 ret = register_pernet_subsys(&nf_nat_rule_net_ops);
234 &nat_initial_table.repl); 237 if (ret != 0)
235 if (IS_ERR(nat_table)) 238 goto out;
236 return PTR_ERR(nat_table);
237 ret = xt_register_target(&ipt_snat_reg); 239 ret = xt_register_target(&ipt_snat_reg);
238 if (ret != 0) 240 if (ret != 0)
239 goto unregister_table; 241 goto unregister_table;
@@ -247,8 +249,8 @@ int __init nf_nat_rule_init(void)
247 unregister_snat: 249 unregister_snat:
248 xt_unregister_target(&ipt_snat_reg); 250 xt_unregister_target(&ipt_snat_reg);
249 unregister_table: 251 unregister_table:
250 ipt_unregister_table(nat_table); 252 unregister_pernet_subsys(&nf_nat_rule_net_ops);
251 253 out:
252 return ret; 254 return ret;
253} 255}
254 256
@@ -256,5 +258,5 @@ void nf_nat_rule_cleanup(void)
256{ 258{
257 xt_unregister_target(&ipt_dnat_reg); 259 xt_unregister_target(&ipt_dnat_reg);
258 xt_unregister_target(&ipt_snat_reg); 260 xt_unregister_target(&ipt_snat_reg);
259 ipt_unregister_table(nat_table); 261 unregister_pernet_subsys(&nf_nat_rule_net_ops);
260} 262}
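
The nf_nat_rule.c hunks above do two things: the SNAT/DNAT targets move to the consolidated xtables calling convention (a single "par" argument instead of the long in/out/hooknum/targinfo list), and the NAT table becomes per network namespace, registered through pernet_operations and reached as net->ipv4.nat_table. For orientation, the consolidated parameter block in this kernel generation carries roughly the fields below; the field order here is an approximation, the authoritative definition is in include/linux/netfilter/x_tables.h.

/* Approximate sketch, not the verbatim header */
struct xt_target_param {
	const struct net_device *in, *out;	/* formerly separate arguments */
	unsigned int hooknum;
	const struct xt_target *target;
	const void *targinfo;			/* e.g. nf_nat_multi_range_compat */
	u_int8_t family;			/* NFPROTO_IPV4 for these targets */
};
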
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6ee5354c9aa1..a6d7c584f53b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -282,6 +282,8 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 struct rtable *r = NULL; 282 struct rtable *r = NULL;
283 283
284 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { 284 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285 if (!rt_hash_table[st->bucket].chain)
286 continue;
285 rcu_read_lock_bh(); 287 rcu_read_lock_bh();
286 r = rcu_dereference(rt_hash_table[st->bucket].chain); 288 r = rcu_dereference(rt_hash_table[st->bucket].chain);
287 while (r) { 289 while (r) {
@@ -299,11 +301,14 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
299 struct rtable *r) 301 struct rtable *r)
300{ 302{
301 struct rt_cache_iter_state *st = seq->private; 303 struct rt_cache_iter_state *st = seq->private;
304
302 r = r->u.dst.rt_next; 305 r = r->u.dst.rt_next;
303 while (!r) { 306 while (!r) {
304 rcu_read_unlock_bh(); 307 rcu_read_unlock_bh();
305 if (--st->bucket < 0) 308 do {
306 break; 309 if (--st->bucket < 0)
310 return NULL;
311 } while (!rt_hash_table[st->bucket].chain);
307 rcu_read_lock_bh(); 312 rcu_read_lock_bh();
308 r = rt_hash_table[st->bucket].chain; 313 r = rt_hash_table[st->bucket].chain;
309 } 314 }
@@ -2356,11 +2361,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2356 ipv4_is_zeronet(oldflp->fl4_src)) 2361 ipv4_is_zeronet(oldflp->fl4_src))
2357 goto out; 2362 goto out;
2358 2363
2359 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2360 dev_out = ip_dev_find(net, oldflp->fl4_src);
2361 if (dev_out == NULL)
2362 goto out;
2363
2364 /* I removed check for oif == dev_out->oif here. 2364 /* I removed check for oif == dev_out->oif here.
2365 It was wrong for two reasons: 2365 It was wrong for two reasons:
2366 1. ip_dev_find(net, saddr) can return wrong iface, if saddr 2366 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
@@ -2372,6 +2372,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2372 if (oldflp->oif == 0 2372 if (oldflp->oif == 0
2373 && (ipv4_is_multicast(oldflp->fl4_dst) || 2373 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2374 oldflp->fl4_dst == htonl(0xFFFFFFFF))) { 2374 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2375 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2376 dev_out = ip_dev_find(net, oldflp->fl4_src);
2377 if (dev_out == NULL)
2378 goto out;
2379
2375 /* Special hack: user can direct multicasts 2380 /* Special hack: user can direct multicasts
2376 and limited broadcast via necessary interface 2381 and limited broadcast via necessary interface
2377 without fiddling with IP_MULTICAST_IF or IP_PKTINFO. 2382 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
@@ -2390,9 +2395,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
2390 fl.oif = dev_out->ifindex; 2395 fl.oif = dev_out->ifindex;
2391 goto make_route; 2396 goto make_route;
2392 } 2397 }
2393 if (dev_out) 2398
2399 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2400 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2401 dev_out = ip_dev_find(net, oldflp->fl4_src);
2402 if (dev_out == NULL)
2403 goto out;
2394 dev_put(dev_out); 2404 dev_put(dev_out);
2395 dev_out = NULL; 2405 dev_out = NULL;
2406 }
2396 } 2407 }
2397 2408
2398 2409
@@ -2840,7 +2851,9 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2840 if (s_h < 0) 2851 if (s_h < 0)
2841 s_h = 0; 2852 s_h = 0;
2842 s_idx = idx = cb->args[1]; 2853 s_idx = idx = cb->args[1];
2843 for (h = s_h; h <= rt_hash_mask; h++) { 2854 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2855 if (!rt_hash_table[h].chain)
2856 continue;
2844 rcu_read_lock_bh(); 2857 rcu_read_lock_bh();
2845 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; 2858 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2846 rt = rcu_dereference(rt->u.dst.rt_next), idx++) { 2859 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
@@ -2859,7 +2872,6 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2859 dst_release(xchg(&skb->dst, NULL)); 2872 dst_release(xchg(&skb->dst, NULL));
2860 } 2873 }
2861 rcu_read_unlock_bh(); 2874 rcu_read_unlock_bh();
2862 s_idx = 0;
2863 } 2875 }
2864 2876
2865done: 2877done:
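
The route.c changes avoid taking rcu_read_lock_bh() for every empty hash bucket while walking the routing cache (both in the /proc seq_file iterator and the rtnetlink dump), and defer the ip_dev_find(net, saddr) source check in ip_route_output_slow() to the two places that actually need it: the multicast/broadcast special case and flows without FLOWI_FLAG_ANYSRC. The unlocked "!chain" peek is benign because a bucket that becomes non-empty concurrently is simply seen on a later pass. Pulled together, the iteration pattern now looks roughly like this sketch (not the verbatim post-patch code):

	/* Sketch: skip empty rt_cache buckets without the RCU BH lock */
	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rt_hash_table[st->bucket].chain)	/* lockless peek */
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference(rt_hash_table[st->bucket].chain);
		/* ... walk the chain ... */
		rcu_read_unlock_bh();
	}
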
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 9d38005abbac..d346c22aa6ae 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -16,6 +16,7 @@
16#include <linux/cryptohash.h> 16#include <linux/cryptohash.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <net/tcp.h> 18#include <net/tcp.h>
19#include <net/route.h>
19 20
20/* Timestamps: lowest 9 bits store TCP options */ 21/* Timestamps: lowest 9 bits store TCP options */
21#define TSBITS 9 22#define TSBITS 9
@@ -296,6 +297,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
296 treq->rcv_isn = ntohl(th->seq) - 1; 297 treq->rcv_isn = ntohl(th->seq) - 1;
297 treq->snt_isn = cookie; 298 treq->snt_isn = cookie;
298 req->mss = mss; 299 req->mss = mss;
300 ireq->loc_port = th->dest;
299 ireq->rmt_port = th->source; 301 ireq->rmt_port = th->source;
300 ireq->loc_addr = ip_hdr(skb)->daddr; 302 ireq->loc_addr = ip_hdr(skb)->daddr;
301 ireq->rmt_addr = ip_hdr(skb)->saddr; 303 ireq->rmt_addr = ip_hdr(skb)->saddr;
@@ -337,6 +339,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
337 .saddr = ireq->loc_addr, 339 .saddr = ireq->loc_addr,
338 .tos = RT_CONN_FLAGS(sk) } }, 340 .tos = RT_CONN_FLAGS(sk) } },
339 .proto = IPPROTO_TCP, 341 .proto = IPPROTO_TCP,
342 .flags = inet_sk_flowi_flags(sk),
340 .uli_u = { .ports = 343 .uli_u = { .ports =
341 { .sport = th->dest, 344 { .sport = th->dest,
342 .dport = th->source } } }; 345 .dport = th->source } } };
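
The syncookies.c additions record the local port on the request socket and propagate the socket's flow flags into the route lookup, which matters for transparent (TProxy) sockets whose "local" address is not configured on the machine. A sketch of what the helper is assumed to evaluate to (the real definition lives in include/net/inet_sock.h):

/* Assumed shape of the helper used above */
static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
{
	/* FLOWI_FLAG_ANYSRC lets ip_route_output_slow() skip the
	 * "source must be a local address" check for transparent sockets */
	return inet_sk(sk)->transparent ? FLOWI_FLAG_ANYSRC : 0;
}
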
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e0689fd7b798..276d047fb85a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -26,16 +26,13 @@ static int tcp_retr1_max = 255;
26static int ip_local_port_range_min[] = { 1, 1 }; 26static int ip_local_port_range_min[] = { 1, 1 };
27static int ip_local_port_range_max[] = { 65535, 65535 }; 27static int ip_local_port_range_max[] = { 65535, 65535 };
28 28
29extern seqlock_t sysctl_port_range_lock;
30extern int sysctl_local_port_range[2];
31
32/* Update system visible IP port range */ 29/* Update system visible IP port range */
33static void set_local_port_range(int range[2]) 30static void set_local_port_range(int range[2])
34{ 31{
35 write_seqlock(&sysctl_port_range_lock); 32 write_seqlock(&sysctl_local_ports.lock);
36 sysctl_local_port_range[0] = range[0]; 33 sysctl_local_ports.range[0] = range[0];
37 sysctl_local_port_range[1] = range[1]; 34 sysctl_local_ports.range[1] = range[1];
38 write_sequnlock(&sysctl_port_range_lock); 35 write_sequnlock(&sysctl_local_ports.lock);
39} 36}
40 37
41/* Validate changes from /proc interface. */ 38/* Validate changes from /proc interface. */
@@ -44,8 +41,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
44 size_t *lenp, loff_t *ppos) 41 size_t *lenp, loff_t *ppos)
45{ 42{
46 int ret; 43 int ret;
47 int range[2] = { sysctl_local_port_range[0], 44 int range[2];
48 sysctl_local_port_range[1] };
49 ctl_table tmp = { 45 ctl_table tmp = {
50 .data = &range, 46 .data = &range,
51 .maxlen = sizeof(range), 47 .maxlen = sizeof(range),
@@ -54,6 +50,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp,
54 .extra2 = &ip_local_port_range_max, 50 .extra2 = &ip_local_port_range_max,
55 }; 51 };
56 52
53 inet_get_local_port_range(range, range + 1);
57 ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); 54 ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos);
58 55
59 if (write && ret == 0) { 56 if (write && ret == 0) {
@@ -73,8 +70,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name,
73 void __user *newval, size_t newlen) 70 void __user *newval, size_t newlen)
74{ 71{
75 int ret; 72 int ret;
76 int range[2] = { sysctl_local_port_range[0], 73 int range[2];
77 sysctl_local_port_range[1] };
78 ctl_table tmp = { 74 ctl_table tmp = {
79 .data = &range, 75 .data = &range,
80 .maxlen = sizeof(range), 76 .maxlen = sizeof(range),
@@ -83,6 +79,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name,
83 .extra2 = &ip_local_port_range_max, 79 .extra2 = &ip_local_port_range_max,
84 }; 80 };
85 81
82 inet_get_local_port_range(range, range + 1);
86 ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen); 83 ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen);
87 if (ret == 0 && newval && newlen) { 84 if (ret == 0 && newval && newlen) {
88 if (range[1] < range[0]) 85 if (range[1] < range[0])
@@ -396,8 +393,8 @@ static struct ctl_table ipv4_table[] = {
396 { 393 {
397 .ctl_name = NET_IPV4_LOCAL_PORT_RANGE, 394 .ctl_name = NET_IPV4_LOCAL_PORT_RANGE,
398 .procname = "ip_local_port_range", 395 .procname = "ip_local_port_range",
399 .data = &sysctl_local_port_range, 396 .data = &sysctl_local_ports.range,
400 .maxlen = sizeof(sysctl_local_port_range), 397 .maxlen = sizeof(sysctl_local_ports.range),
401 .mode = 0644, 398 .mode = 0644,
402 .proc_handler = &ipv4_local_port_range, 399 .proc_handler = &ipv4_local_port_range,
403 .strategy = &ipv4_sysctl_local_port_range, 400 .strategy = &ipv4_sysctl_local_port_range,
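
The sysctl handlers stop reading the raw sysctl_local_port_range[] array and instead snapshot the range through inet_get_local_port_range(), so readers and the seqlock-protected writer above share one locking scheme. A minimal sketch of the reader side this code now relies on (the real function lives in net/ipv4/inet_connection_sock.c):

/* Sketch of the assumed reader: retry until a consistent pair is observed */
void inet_get_local_port_range(int *low, int *high)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&sysctl_local_ports.lock);
		*low  = sysctl_local_ports.range[0];
		*high = sysctl_local_ports.range[1];
	} while (read_seqretry(&sysctl_local_ports.lock, seq));
}
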
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1ab341e5d3e0..eccb7165a80c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -384,13 +384,17 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
384 384
385 /* Connected? */ 385 /* Connected? */
386 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { 386 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
387 int target = sock_rcvlowat(sk, 0, INT_MAX);
388
389 if (tp->urg_seq == tp->copied_seq &&
390 !sock_flag(sk, SOCK_URGINLINE) &&
391 tp->urg_data)
392 target--;
393
387 /* Potential race condition. If read of tp below will 394 /* Potential race condition. If read of tp below will
388 * escape above sk->sk_state, we can be illegally awaken 395 * escape above sk->sk_state, we can be illegally awaken
389 * in SYN_* states. */ 396 * in SYN_* states. */
390 if ((tp->rcv_nxt != tp->copied_seq) && 397 if (tp->rcv_nxt - tp->copied_seq >= target)
391 (tp->urg_seq != tp->copied_seq ||
392 tp->rcv_nxt != tp->copied_seq + 1 ||
393 sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
394 mask |= POLLIN | POLLRDNORM; 398 mask |= POLLIN | POLLRDNORM;
395 399
396 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { 400 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
@@ -493,10 +497,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
493static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, 497static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
494 struct sk_buff *skb) 498 struct sk_buff *skb)
495{ 499{
496 if (flags & MSG_OOB) { 500 if (flags & MSG_OOB)
497 tp->urg_mode = 1;
498 tp->snd_up = tp->write_seq; 501 tp->snd_up = tp->write_seq;
499 }
500} 502}
501 503
502static inline void tcp_push(struct sock *sk, int flags, int mss_now, 504static inline void tcp_push(struct sock *sk, int flags, int mss_now,
@@ -1157,7 +1159,7 @@ static void tcp_prequeue_process(struct sock *sk)
1157 * necessary */ 1159 * necessary */
1158 local_bh_disable(); 1160 local_bh_disable();
1159 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) 1161 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1160 sk->sk_backlog_rcv(sk, skb); 1162 sk_backlog_rcv(sk, skb);
1161 local_bh_enable(); 1163 local_bh_enable();
1162 1164
1163 /* Clear memory counter. */ 1165 /* Clear memory counter. */
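
The tcp_poll() rework makes POLLIN honour SO_RCVLOWAT: instead of waking as soon as any byte is queued, readability is reported once rcv_nxt - copied_seq reaches the low-water mark (one less if an out-of-band byte at copied_seq would be skipped). From userspace the effect looks like the illustration below; this snippet is not part of the patch.

#include <poll.h>
#include <sys/socket.h>

/* Wait until at least 4 KiB of TCP payload is readable on fd */
static void wait_for_bulk(int fd)
{
	int lowat = 4096;
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
	poll(&pfd, 1, -1);	/* POLLIN now implies >= lowat queued bytes
				 * (EOF, errors and urgent data still wake) */
}
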
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7abc6b80d47d..d77c0d29e239 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -979,6 +979,39 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
979 } 979 }
980} 980}
981 981
982/* This must be called before lost_out is incremented */
983static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
984{
985 if ((tp->retransmit_skb_hint == NULL) ||
986 before(TCP_SKB_CB(skb)->seq,
987 TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
988 tp->retransmit_skb_hint = skb;
989
990 if (!tp->lost_out ||
991 after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
992 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
993}
994
995static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
996{
997 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
998 tcp_verify_retransmit_hint(tp, skb);
999
1000 tp->lost_out += tcp_skb_pcount(skb);
1001 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1002 }
1003}
1004
1005void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
1006{
1007 tcp_verify_retransmit_hint(tp, skb);
1008
1009 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
1010 tp->lost_out += tcp_skb_pcount(skb);
1011 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1012 }
1013}
1014
982/* This procedure tags the retransmission queue when SACKs arrive. 1015/* This procedure tags the retransmission queue when SACKs arrive.
983 * 1016 *
984 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). 1017 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
@@ -1155,13 +1188,7 @@ static void tcp_mark_lost_retrans(struct sock *sk)
1155 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1188 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1156 tp->retrans_out -= tcp_skb_pcount(skb); 1189 tp->retrans_out -= tcp_skb_pcount(skb);
1157 1190
1158 /* clear lost hint */ 1191 tcp_skb_mark_lost_uncond_verify(tp, skb);
1159 tp->retransmit_skb_hint = NULL;
1160
1161 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
1162 tp->lost_out += tcp_skb_pcount(skb);
1163 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1164 }
1165 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); 1192 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
1166 } else { 1193 } else {
1167 if (before(ack_seq, new_low_seq)) 1194 if (before(ack_seq, new_low_seq))
@@ -1271,9 +1298,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1271 ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); 1298 ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
1272 tp->lost_out -= tcp_skb_pcount(skb); 1299 tp->lost_out -= tcp_skb_pcount(skb);
1273 tp->retrans_out -= tcp_skb_pcount(skb); 1300 tp->retrans_out -= tcp_skb_pcount(skb);
1274
1275 /* clear lost hint */
1276 tp->retransmit_skb_hint = NULL;
1277 } 1301 }
1278 } else { 1302 } else {
1279 if (!(sacked & TCPCB_RETRANS)) { 1303 if (!(sacked & TCPCB_RETRANS)) {
@@ -1292,9 +1316,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1292 if (sacked & TCPCB_LOST) { 1316 if (sacked & TCPCB_LOST) {
1293 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; 1317 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1294 tp->lost_out -= tcp_skb_pcount(skb); 1318 tp->lost_out -= tcp_skb_pcount(skb);
1295
1296 /* clear lost hint */
1297 tp->retransmit_skb_hint = NULL;
1298 } 1319 }
1299 } 1320 }
1300 1321
@@ -1324,7 +1345,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1324 if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { 1345 if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) {
1325 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1346 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1326 tp->retrans_out -= tcp_skb_pcount(skb); 1347 tp->retrans_out -= tcp_skb_pcount(skb);
1327 tp->retransmit_skb_hint = NULL;
1328 } 1348 }
1329 1349
1330 return flag; 1350 return flag;
@@ -1726,6 +1746,8 @@ int tcp_use_frto(struct sock *sk)
1726 return 0; 1746 return 0;
1727 1747
1728 skb = tcp_write_queue_head(sk); 1748 skb = tcp_write_queue_head(sk);
1749 if (tcp_skb_is_last(sk, skb))
1750 return 1;
1729 skb = tcp_write_queue_next(sk, skb); /* Skips head */ 1751 skb = tcp_write_queue_next(sk, skb); /* Skips head */
1730 tcp_for_write_queue_from(skb, sk) { 1752 tcp_for_write_queue_from(skb, sk) {
1731 if (skb == tcp_send_head(sk)) 1753 if (skb == tcp_send_head(sk))
@@ -1867,6 +1889,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
1867 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { 1889 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
1868 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1890 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1869 tp->lost_out += tcp_skb_pcount(skb); 1891 tp->lost_out += tcp_skb_pcount(skb);
1892 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
1870 } 1893 }
1871 } 1894 }
1872 tcp_verify_left_out(tp); 1895 tcp_verify_left_out(tp);
@@ -1883,7 +1906,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
1883 tp->high_seq = tp->snd_nxt; 1906 tp->high_seq = tp->snd_nxt;
1884 TCP_ECN_queue_cwr(tp); 1907 TCP_ECN_queue_cwr(tp);
1885 1908
1886 tcp_clear_retrans_hints_partial(tp); 1909 tcp_clear_all_retrans_hints(tp);
1887} 1910}
1888 1911
1889static void tcp_clear_retrans_partial(struct tcp_sock *tp) 1912static void tcp_clear_retrans_partial(struct tcp_sock *tp)
@@ -1934,12 +1957,11 @@ void tcp_enter_loss(struct sock *sk, int how)
1934 /* Push undo marker, if it was plain RTO and nothing 1957 /* Push undo marker, if it was plain RTO and nothing
1935 * was retransmitted. */ 1958 * was retransmitted. */
1936 tp->undo_marker = tp->snd_una; 1959 tp->undo_marker = tp->snd_una;
1937 tcp_clear_retrans_hints_partial(tp);
1938 } else { 1960 } else {
1939 tp->sacked_out = 0; 1961 tp->sacked_out = 0;
1940 tp->fackets_out = 0; 1962 tp->fackets_out = 0;
1941 tcp_clear_all_retrans_hints(tp);
1942 } 1963 }
1964 tcp_clear_all_retrans_hints(tp);
1943 1965
1944 tcp_for_write_queue(skb, sk) { 1966 tcp_for_write_queue(skb, sk) {
1945 if (skb == tcp_send_head(sk)) 1967 if (skb == tcp_send_head(sk))
@@ -1952,6 +1974,7 @@ void tcp_enter_loss(struct sock *sk, int how)
1952 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; 1974 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1953 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1975 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1954 tp->lost_out += tcp_skb_pcount(skb); 1976 tp->lost_out += tcp_skb_pcount(skb);
1977 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
1955 } 1978 }
1956 } 1979 }
1957 tcp_verify_left_out(tp); 1980 tcp_verify_left_out(tp);
@@ -2157,19 +2180,6 @@ static int tcp_time_to_recover(struct sock *sk)
2157 return 0; 2180 return 0;
2158} 2181}
2159 2182
2160/* RFC: This is from the original, I doubt that this is necessary at all:
2161 * clear xmit_retrans hint if seq of this skb is beyond hint. How could we
2162 * retransmitted past LOST markings in the first place? I'm not fully sure
2163 * about undo and end of connection cases, which can cause R without L?
2164 */
2165static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
2166{
2167 if ((tp->retransmit_skb_hint != NULL) &&
2168 before(TCP_SKB_CB(skb)->seq,
2169 TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
2170 tp->retransmit_skb_hint = NULL;
2171}
2172
2173/* Mark head of queue up as lost. With RFC3517 SACK, the packets is 2183/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
2174 * is against sacked "cnt", otherwise it's against facked "cnt" 2184 * is against sacked "cnt", otherwise it's against facked "cnt"
2175 */ 2185 */
@@ -2217,11 +2227,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2217 cnt = packets; 2227 cnt = packets;
2218 } 2228 }
2219 2229
2220 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { 2230 tcp_skb_mark_lost(tp, skb);
2221 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
2222 tp->lost_out += tcp_skb_pcount(skb);
2223 tcp_verify_retransmit_hint(tp, skb);
2224 }
2225 } 2231 }
2226 tcp_verify_left_out(tp); 2232 tcp_verify_left_out(tp);
2227} 2233}
@@ -2263,11 +2269,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2263 if (!tcp_skb_timedout(sk, skb)) 2269 if (!tcp_skb_timedout(sk, skb))
2264 break; 2270 break;
2265 2271
2266 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { 2272 tcp_skb_mark_lost(tp, skb);
2267 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
2268 tp->lost_out += tcp_skb_pcount(skb);
2269 tcp_verify_retransmit_hint(tp, skb);
2270 }
2271 } 2273 }
2272 2274
2273 tp->scoreboard_skb_hint = skb; 2275 tp->scoreboard_skb_hint = skb;
@@ -2378,10 +2380,6 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
2378 } 2380 }
2379 tcp_moderate_cwnd(tp); 2381 tcp_moderate_cwnd(tp);
2380 tp->snd_cwnd_stamp = tcp_time_stamp; 2382 tp->snd_cwnd_stamp = tcp_time_stamp;
2381
2382 /* There is something screwy going on with the retrans hints after
2383 an undo */
2384 tcp_clear_all_retrans_hints(tp);
2385} 2383}
2386 2384
2387static inline int tcp_may_undo(struct tcp_sock *tp) 2385static inline int tcp_may_undo(struct tcp_sock *tp)
@@ -2838,7 +2836,8 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
2838 * is before the ack sequence we can discard it as it's confirmed to have 2836 * is before the ack sequence we can discard it as it's confirmed to have
2839 * arrived at the other end. 2837 * arrived at the other end.
2840 */ 2838 */
2841static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets) 2839static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
2840 u32 prior_snd_una)
2842{ 2841{
2843 struct tcp_sock *tp = tcp_sk(sk); 2842 struct tcp_sock *tp = tcp_sk(sk);
2844 const struct inet_connection_sock *icsk = inet_csk(sk); 2843 const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2848,6 +2847,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
2848 int flag = 0; 2847 int flag = 0;
2849 u32 pkts_acked = 0; 2848 u32 pkts_acked = 0;
2850 u32 reord = tp->packets_out; 2849 u32 reord = tp->packets_out;
2850 u32 prior_sacked = tp->sacked_out;
2851 s32 seq_rtt = -1; 2851 s32 seq_rtt = -1;
2852 s32 ca_seq_rtt = -1; 2852 s32 ca_seq_rtt = -1;
2853 ktime_t last_ackt = net_invalid_timestamp(); 2853 ktime_t last_ackt = net_invalid_timestamp();
@@ -2904,9 +2904,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
2904 if (sacked & TCPCB_LOST) 2904 if (sacked & TCPCB_LOST)
2905 tp->lost_out -= acked_pcount; 2905 tp->lost_out -= acked_pcount;
2906 2906
2907 if (unlikely(tp->urg_mode && !before(end_seq, tp->snd_up)))
2908 tp->urg_mode = 0;
2909
2910 tp->packets_out -= acked_pcount; 2907 tp->packets_out -= acked_pcount;
2911 pkts_acked += acked_pcount; 2908 pkts_acked += acked_pcount;
2912 2909
@@ -2929,9 +2926,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
2929 2926
2930 tcp_unlink_write_queue(skb, sk); 2927 tcp_unlink_write_queue(skb, sk);
2931 sk_wmem_free_skb(sk, skb); 2928 sk_wmem_free_skb(sk, skb);
2932 tcp_clear_all_retrans_hints(tp); 2929 tp->scoreboard_skb_hint = NULL;
2930 if (skb == tp->retransmit_skb_hint)
2931 tp->retransmit_skb_hint = NULL;
2932 if (skb == tp->lost_skb_hint)
2933 tp->lost_skb_hint = NULL;
2933 } 2934 }
2934 2935
2936 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
2937 tp->snd_up = tp->snd_una;
2938
2935 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 2939 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
2936 flag |= FLAG_SACK_RENEGING; 2940 flag |= FLAG_SACK_RENEGING;
2937 2941
@@ -2948,6 +2952,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
2948 /* Non-retransmitted hole got filled? That's reordering */ 2952 /* Non-retransmitted hole got filled? That's reordering */
2949 if (reord < prior_fackets) 2953 if (reord < prior_fackets)
2950 tcp_update_reordering(sk, tp->fackets_out - reord, 0); 2954 tcp_update_reordering(sk, tp->fackets_out - reord, 0);
2955
2956 /* No need to care for underflows here because
2957 * the lost_skb_hint gets NULLed if we're past it
2958 * (or something non-trivial happened)
2959 */
2960 if (tcp_is_fack(tp))
2961 tp->lost_cnt_hint -= pkts_acked;
2962 else
2963 tp->lost_cnt_hint -= prior_sacked - tp->sacked_out;
2951 } 2964 }
2952 2965
2953 tp->fackets_out -= min(pkts_acked, tp->fackets_out); 2966 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
@@ -3299,7 +3312,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3299 goto no_queue; 3312 goto no_queue;
3300 3313
3301 /* See if we can take anything off of the retransmit queue. */ 3314 /* See if we can take anything off of the retransmit queue. */
3302 flag |= tcp_clean_rtx_queue(sk, prior_fackets); 3315 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
3303 3316
3304 if (tp->frto_counter) 3317 if (tp->frto_counter)
3305 frto_cwnd = tcp_process_frto(sk, flag); 3318 frto_cwnd = tcp_process_frto(sk, flag);
@@ -3442,6 +3455,22 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3442 } 3455 }
3443} 3456}
3444 3457
3458static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
3459{
3460 __be32 *ptr = (__be32 *)(th + 1);
3461
3462 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3463 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
3464 tp->rx_opt.saw_tstamp = 1;
3465 ++ptr;
3466 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3467 ++ptr;
3468 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
3469 return 1;
3470 }
3471 return 0;
3472}
3473
3445/* Fast parse options. This hopes to only see timestamps. 3474/* Fast parse options. This hopes to only see timestamps.
3446 * If it is wrong it falls back on tcp_parse_options(). 3475 * If it is wrong it falls back on tcp_parse_options().
3447 */ 3476 */
@@ -3453,16 +3482,8 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
3453 return 0; 3482 return 0;
3454 } else if (tp->rx_opt.tstamp_ok && 3483 } else if (tp->rx_opt.tstamp_ok &&
3455 th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { 3484 th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
3456 __be32 *ptr = (__be32 *)(th + 1); 3485 if (tcp_parse_aligned_timestamp(tp, th))
3457 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3458 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
3459 tp->rx_opt.saw_tstamp = 1;
3460 ++ptr;
3461 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3462 ++ptr;
3463 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
3464 return 1; 3486 return 1;
3465 }
3466 } 3487 }
3467 tcp_parse_options(skb, &tp->rx_opt, 1); 3488 tcp_parse_options(skb, &tp->rx_opt, 1);
3468 return 1; 3489 return 1;
@@ -4138,7 +4159,7 @@ drop:
4138 skb1 = skb1->prev; 4159 skb1 = skb1->prev;
4139 } 4160 }
4140 } 4161 }
4141 __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue); 4162 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4142 4163
4143 /* And clean segments covered by new one as whole. */ 4164 /* And clean segments covered by new one as whole. */
4144 while ((skb1 = skb->next) != 4165 while ((skb1 = skb->next) !=
@@ -4161,6 +4182,18 @@ add_sack:
4161 } 4182 }
4162} 4183}
4163 4184
4185static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
4186 struct sk_buff_head *list)
4187{
4188 struct sk_buff *next = skb->next;
4189
4190 __skb_unlink(skb, list);
4191 __kfree_skb(skb);
4192 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4193
4194 return next;
4195}
4196
4164/* Collapse contiguous sequence of skbs head..tail with 4197/* Collapse contiguous sequence of skbs head..tail with
4165 * sequence numbers start..end. 4198 * sequence numbers start..end.
4166 * Segments with FIN/SYN are not collapsed (only because this 4199 * Segments with FIN/SYN are not collapsed (only because this
@@ -4178,11 +4211,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
4178 for (skb = head; skb != tail;) { 4211 for (skb = head; skb != tail;) {
4179 /* No new bits? It is possible on ofo queue. */ 4212 /* No new bits? It is possible on ofo queue. */
4180 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 4213 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4181 struct sk_buff *next = skb->next; 4214 skb = tcp_collapse_one(sk, skb, list);
4182 __skb_unlink(skb, list);
4183 __kfree_skb(skb);
4184 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4185 skb = next;
4186 continue; 4215 continue;
4187 } 4216 }
4188 4217
@@ -4228,7 +4257,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
4228 memcpy(nskb->head, skb->head, header); 4257 memcpy(nskb->head, skb->head, header);
4229 memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); 4258 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
4230 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; 4259 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
4231 __skb_insert(nskb, skb->prev, skb, list); 4260 __skb_queue_before(list, skb, nskb);
4232 skb_set_owner_r(nskb, sk); 4261 skb_set_owner_r(nskb, sk);
4233 4262
4234 /* Copy data, releasing collapsed skbs. */ 4263 /* Copy data, releasing collapsed skbs. */
@@ -4246,11 +4275,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
4246 start += size; 4275 start += size;
4247 } 4276 }
4248 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 4277 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4249 struct sk_buff *next = skb->next; 4278 skb = tcp_collapse_one(sk, skb, list);
4250 __skb_unlink(skb, list);
4251 __kfree_skb(skb);
4252 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4253 skb = next;
4254 if (skb == tail || 4279 if (skb == tail ||
4255 tcp_hdr(skb)->syn || 4280 tcp_hdr(skb)->syn ||
4256 tcp_hdr(skb)->fin) 4281 tcp_hdr(skb)->fin)
@@ -4436,8 +4461,8 @@ static void tcp_new_space(struct sock *sk)
4436 4461
4437 if (tcp_should_expand_sndbuf(sk)) { 4462 if (tcp_should_expand_sndbuf(sk)) {
4438 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + 4463 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
4439 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), 4464 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
4440 demanded = max_t(unsigned int, tp->snd_cwnd, 4465 int demanded = max_t(unsigned int, tp->snd_cwnd,
4441 tp->reordering + 1); 4466 tp->reordering + 1);
4442 sndmem *= 2 * demanded; 4467 sndmem *= 2 * demanded;
4443 if (sndmem > sk->sk_sndbuf) 4468 if (sndmem > sk->sk_sndbuf)
@@ -4691,6 +4716,67 @@ out:
4691} 4716}
4692#endif /* CONFIG_NET_DMA */ 4717#endif /* CONFIG_NET_DMA */
4693 4718
4719/* Does PAWS and seqno based validation of an incoming segment, flags will
4720 * play significant role here.
4721 */
4722static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
4723 struct tcphdr *th, int syn_inerr)
4724{
4725 struct tcp_sock *tp = tcp_sk(sk);
4726
4727 /* RFC1323: H1. Apply PAWS check first. */
4728 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4729 tcp_paws_discard(sk, skb)) {
4730 if (!th->rst) {
4731 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
4732 tcp_send_dupack(sk, skb);
4733 goto discard;
4734 }
4735 /* Reset is accepted even if it did not pass PAWS. */
4736 }
4737
4738 /* Step 1: check sequence number */
4739 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
4740 /* RFC793, page 37: "In all states except SYN-SENT, all reset
4741 * (RST) segments are validated by checking their SEQ-fields."
4742 * And page 69: "If an incoming segment is not acceptable,
4743 * an acknowledgment should be sent in reply (unless the RST
4744 * bit is set, if so drop the segment and return)".
4745 */
4746 if (!th->rst)
4747 tcp_send_dupack(sk, skb);
4748 goto discard;
4749 }
4750
4751 /* Step 2: check RST bit */
4752 if (th->rst) {
4753 tcp_reset(sk);
4754 goto discard;
4755 }
4756
4757 /* ts_recent update must be made after we are sure that the packet
4758 * is in window.
4759 */
4760 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
4761
4762 /* step 3: check security and precedence [ignored] */
4763
4764 /* step 4: Check for a SYN in window. */
4765 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4766 if (syn_inerr)
4767 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
4768 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
4769 tcp_reset(sk);
4770 return -1;
4771 }
4772
4773 return 1;
4774
4775discard:
4776 __kfree_skb(skb);
4777 return 0;
4778}
4779
4694/* 4780/*
4695 * TCP receive function for the ESTABLISHED state. 4781 * TCP receive function for the ESTABLISHED state.
4696 * 4782 *
@@ -4718,6 +4804,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4718 struct tcphdr *th, unsigned len) 4804 struct tcphdr *th, unsigned len)
4719{ 4805{
4720 struct tcp_sock *tp = tcp_sk(sk); 4806 struct tcp_sock *tp = tcp_sk(sk);
4807 int res;
4721 4808
4722 /* 4809 /*
4723 * Header prediction. 4810 * Header prediction.
@@ -4756,19 +4843,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4756 4843
4757 /* Check timestamp */ 4844 /* Check timestamp */
4758 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { 4845 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
4759 __be32 *ptr = (__be32 *)(th + 1);
4760
4761 /* No? Slow path! */ 4846 /* No? Slow path! */
4762 if (*ptr != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) 4847 if (!tcp_parse_aligned_timestamp(tp, th))
4763 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
4764 goto slow_path; 4848 goto slow_path;
4765 4849
4766 tp->rx_opt.saw_tstamp = 1;
4767 ++ptr;
4768 tp->rx_opt.rcv_tsval = ntohl(*ptr);
4769 ++ptr;
4770 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
4771
4772 /* If PAWS failed, check it more carefully in slow path */ 4850 /* If PAWS failed, check it more carefully in slow path */
4773 if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) 4851 if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
4774 goto slow_path; 4852 goto slow_path;
@@ -4900,51 +4978,12 @@ slow_path:
4900 goto csum_error; 4978 goto csum_error;
4901 4979
4902 /* 4980 /*
4903 * RFC1323: H1. Apply PAWS check first.
4904 */
4905 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4906 tcp_paws_discard(sk, skb)) {
4907 if (!th->rst) {
4908 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
4909 tcp_send_dupack(sk, skb);
4910 goto discard;
4911 }
4912 /* Resets are accepted even if PAWS failed.
4913
4914 ts_recent update must be made after we are sure
4915 that the packet is in window.
4916 */
4917 }
4918
4919 /*
4920 * Standard slow path. 4981 * Standard slow path.
4921 */ 4982 */
4922 4983
4923 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { 4984 res = tcp_validate_incoming(sk, skb, th, 1);
4924 /* RFC793, page 37: "In all states except SYN-SENT, all reset 4985 if (res <= 0)
4925 * (RST) segments are validated by checking their SEQ-fields." 4986 return -res;
4926 * And page 69: "If an incoming segment is not acceptable,
4927 * an acknowledgment should be sent in reply (unless the RST bit
4928 * is set, if so drop the segment and return)".
4929 */
4930 if (!th->rst)
4931 tcp_send_dupack(sk, skb);
4932 goto discard;
4933 }
4934
4935 if (th->rst) {
4936 tcp_reset(sk);
4937 goto discard;
4938 }
4939
4940 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
4941
4942 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4943 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
4944 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
4945 tcp_reset(sk);
4946 return 1;
4947 }
4948 4987
4949step5: 4988step5:
4950 if (th->ack) 4989 if (th->ack)
@@ -5226,6 +5265,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5226 struct tcp_sock *tp = tcp_sk(sk); 5265 struct tcp_sock *tp = tcp_sk(sk);
5227 struct inet_connection_sock *icsk = inet_csk(sk); 5266 struct inet_connection_sock *icsk = inet_csk(sk);
5228 int queued = 0; 5267 int queued = 0;
5268 int res;
5229 5269
5230 tp->rx_opt.saw_tstamp = 0; 5270 tp->rx_opt.saw_tstamp = 0;
5231 5271
@@ -5278,42 +5318,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5278 return 0; 5318 return 0;
5279 } 5319 }
5280 5320
5281 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && 5321 res = tcp_validate_incoming(sk, skb, th, 0);
5282 tcp_paws_discard(sk, skb)) { 5322 if (res <= 0)
5283 if (!th->rst) { 5323 return -res;
5284 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
5285 tcp_send_dupack(sk, skb);
5286 goto discard;
5287 }
5288 /* Reset is accepted even if it did not pass PAWS. */
5289 }
5290
5291 /* step 1: check sequence number */
5292 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
5293 if (!th->rst)
5294 tcp_send_dupack(sk, skb);
5295 goto discard;
5296 }
5297
5298 /* step 2: check RST bit */
5299 if (th->rst) {
5300 tcp_reset(sk);
5301 goto discard;
5302 }
5303
5304 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
5305
5306 /* step 3: check security and precedence [ignored] */
5307
5308 /* step 4:
5309 *
5310 * Check for a SYN in window.
5311 */
5312 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
5313 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
5314 tcp_reset(sk);
5315 return 1;
5316 }
5317 5324
5318 /* step 5: check the ACK field */ 5325 /* step 5: check the ACK field */
5319 if (th->ack) { 5326 if (th->ack) {
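
The bulk of the tcp_input.c change folds the duplicated PAWS / sequence / RST / in-window-SYN checks from the established slow path and tcp_rcv_state_process() into tcp_validate_incoming(), and reworks the retransmit hints so they are maintained incrementally instead of being cleared on every event. The new helper uses a small three-valued protocol that both callers map back onto their own return convention, roughly:

	/* Caller-side convention (illustrative):
	 *   res == 1  -> segment passed validation, keep processing
	 *   res == 0  -> segment was discarded (skb already freed), return 0
	 *   res == -1 -> fatal (SYN in window, connection was reset), return 1
	 */
	res = tcp_validate_incoming(sk, skb, th, 1);
	if (res <= 0)
		return -res;
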
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 011478e46c40..5c8fa7f1e327 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -583,14 +583,15 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
583 rep.th.doff = arg.iov[0].iov_len / 4; 583 rep.th.doff = arg.iov[0].iov_len / 4;
584 584
585 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1], 585 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
586 key, ip_hdr(skb)->daddr, 586 key, ip_hdr(skb)->saddr,
587 ip_hdr(skb)->saddr, &rep.th); 587 ip_hdr(skb)->daddr, &rep.th);
588 } 588 }
589#endif 589#endif
590 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 590 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
591 ip_hdr(skb)->saddr, /* XXX */ 591 ip_hdr(skb)->saddr, /* XXX */
592 sizeof(struct tcphdr), IPPROTO_TCP, 0); 592 arg.iov[0].iov_len, IPPROTO_TCP, 0);
593 arg.csumoffset = offsetof(struct tcphdr, check) / 2; 593 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
594 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
594 595
595 net = dev_net(skb->dst->dev); 596 net = dev_net(skb->dst->dev);
596 ip_send_reply(net->ipv4.tcp_sock, skb, 597 ip_send_reply(net->ipv4.tcp_sock, skb,
@@ -606,7 +607,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
606 607
607static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, 608static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
608 u32 win, u32 ts, int oif, 609 u32 win, u32 ts, int oif,
609 struct tcp_md5sig_key *key) 610 struct tcp_md5sig_key *key,
611 int reply_flags)
610{ 612{
611 struct tcphdr *th = tcp_hdr(skb); 613 struct tcphdr *th = tcp_hdr(skb);
612 struct { 614 struct {
@@ -659,6 +661,7 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
659 ip_hdr(skb)->daddr, &rep.th); 661 ip_hdr(skb)->daddr, &rep.th);
660 } 662 }
661#endif 663#endif
664 arg.flags = reply_flags;
662 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, 665 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
663 ip_hdr(skb)->saddr, /* XXX */ 666 ip_hdr(skb)->saddr, /* XXX */
664 arg.iov[0].iov_len, IPPROTO_TCP, 0); 667 arg.iov[0].iov_len, IPPROTO_TCP, 0);
@@ -681,7 +684,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
681 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, 684 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
682 tcptw->tw_ts_recent, 685 tcptw->tw_ts_recent,
683 tw->tw_bound_dev_if, 686 tw->tw_bound_dev_if,
684 tcp_twsk_md5_key(tcptw) 687 tcp_twsk_md5_key(tcptw),
688 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
685 ); 689 );
686 690
687 inet_twsk_put(tw); 691 inet_twsk_put(tw);
@@ -694,7 +698,8 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
694 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, 698 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
695 req->ts_recent, 699 req->ts_recent,
696 0, 700 0,
697 tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr)); 701 tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
702 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
698} 703}
699 704
700/* 705/*
@@ -1244,6 +1249,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1244 ireq = inet_rsk(req); 1249 ireq = inet_rsk(req);
1245 ireq->loc_addr = daddr; 1250 ireq->loc_addr = daddr;
1246 ireq->rmt_addr = saddr; 1251 ireq->rmt_addr = saddr;
1252 ireq->no_srccheck = inet_sk(sk)->transparent;
1247 ireq->opt = tcp_v4_save_options(sk, skb); 1253 ireq->opt = tcp_v4_save_options(sk, skb);
1248 if (!want_cookie) 1254 if (!want_cookie)
1249 TCP_ECN_create_request(req, tcp_hdr(skb)); 1255 TCP_ECN_create_request(req, tcp_hdr(skb));
@@ -1364,6 +1370,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1364 tcp_mtup_init(newsk); 1370 tcp_mtup_init(newsk);
1365 tcp_sync_mss(newsk, dst_mtu(dst)); 1371 tcp_sync_mss(newsk, dst_mtu(dst));
1366 newtp->advmss = dst_metric(dst, RTAX_ADVMSS); 1372 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1373 if (tcp_sk(sk)->rx_opt.user_mss &&
1374 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1375 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1376
1367 tcp_initialize_rcv_mss(newsk); 1377 tcp_initialize_rcv_mss(newsk);
1368 1378
1369#ifdef CONFIG_TCP_MD5SIG 1379#ifdef CONFIG_TCP_MD5SIG
@@ -1567,8 +1577,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
1567 TCP_SKB_CB(skb)->flags = iph->tos; 1577 TCP_SKB_CB(skb)->flags = iph->tos;
1568 TCP_SKB_CB(skb)->sacked = 0; 1578 TCP_SKB_CB(skb)->sacked = 0;
1569 1579
1570 sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr, 1580 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1571 th->source, iph->daddr, th->dest, inet_iif(skb));
1572 if (!sk) 1581 if (!sk)
1573 goto no_tcp_socket; 1582 goto no_tcp_socket;
1574 1583
@@ -1946,6 +1955,12 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1946 return rc; 1955 return rc;
1947} 1956}
1948 1957
1958static inline int empty_bucket(struct tcp_iter_state *st)
1959{
1960 return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1961 hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1962}
1963
1949static void *established_get_first(struct seq_file *seq) 1964static void *established_get_first(struct seq_file *seq)
1950{ 1965{
1951 struct tcp_iter_state* st = seq->private; 1966 struct tcp_iter_state* st = seq->private;
@@ -1958,6 +1973,10 @@ static void *established_get_first(struct seq_file *seq)
1958 struct inet_timewait_sock *tw; 1973 struct inet_timewait_sock *tw;
1959 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 1974 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1960 1975
1976 /* Lockless fast path for the common case of empty buckets */
1977 if (empty_bucket(st))
1978 continue;
1979
1961 read_lock_bh(lock); 1980 read_lock_bh(lock);
1962 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 1981 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1963 if (sk->sk_family != st->family || 1982 if (sk->sk_family != st->family ||
@@ -2008,13 +2027,15 @@ get_tw:
2008 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2027 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2009 st->state = TCP_SEQ_STATE_ESTABLISHED; 2028 st->state = TCP_SEQ_STATE_ESTABLISHED;
2010 2029
2011 if (++st->bucket < tcp_hashinfo.ehash_size) { 2030 /* Look for next non empty bucket */
2012 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2031 while (++st->bucket < tcp_hashinfo.ehash_size &&
2013 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); 2032 empty_bucket(st))
2014 } else { 2033 ;
2015 cur = NULL; 2034 if (st->bucket >= tcp_hashinfo.ehash_size)
2016 goto out; 2035 return NULL;
2017 } 2036
2037 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2038 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2018 } else 2039 } else
2019 sk = sk_next(sk); 2040 sk = sk_next(sk);
2020 2041
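
Besides the transparent-socket reply flags and the switch to __inet_lookup_skb(), the tcp_ipv4.c hunks reorder the address pair fed into the RST's MD5 signature to match the reply's direction, and clamp advmss on accepted sockets by rx_opt.user_mss, so an MSS configured on the listener now limits what its children advertise. For illustration only (not part of the patch), the listener-side knob is the ordinary TCP_MAXSEG option:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Cap the MSS advertised by sockets accepted from listen_fd at 1200 bytes */
static int clamp_listener_mss(int listen_fd)
{
	int mss = 1200;

	return setsockopt(listen_fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss));
}
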
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f976fc57892c..779f2e9d0689 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -395,6 +395,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
395 newtp->pred_flags = 0; 395 newtp->pred_flags = 0;
396 newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; 396 newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
397 newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; 397 newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
398 newtp->snd_up = treq->snt_isn + 1;
398 399
399 tcp_prequeue_init(newtp); 400 tcp_prequeue_init(newtp);
400 401
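
The one-line tcp_minisocks.c change belongs to the removal of tp->urg_mode across this series: a socket is "in urgent mode" exactly as long as snd_up is ahead of snd_una, so a freshly created child must start with snd_up equal to its initial snd_una. Pulled together from the tcp.c and tcp_output.c hunks, the resulting model is roughly:

/* Urgent mode is derived, not a stored flag */
static inline int tcp_urg_mode(const struct tcp_sock *tp)
{
	return tp->snd_una != tp->snd_up;	/* urgent data still unacked */
}

	/* send(..., MSG_OOB) simply advances the urgent pointer */
	if (flags & MSG_OOB)
		tp->snd_up = tp->write_seq;
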
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8165f5aa8c71..990a58493235 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -345,6 +345,11 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
345 TCP_SKB_CB(skb)->end_seq = seq; 345 TCP_SKB_CB(skb)->end_seq = seq;
346} 346}
347 347
348static inline int tcp_urg_mode(const struct tcp_sock *tp)
349{
350 return tp->snd_una != tp->snd_up;
351}
352
348#define OPTION_SACK_ADVERTISE (1 << 0) 353#define OPTION_SACK_ADVERTISE (1 << 0)
349#define OPTION_TS (1 << 1) 354#define OPTION_TS (1 << 1)
350#define OPTION_MD5 (1 << 2) 355#define OPTION_MD5 (1 << 2)
@@ -646,7 +651,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
646 th->check = 0; 651 th->check = 0;
647 th->urg_ptr = 0; 652 th->urg_ptr = 0;
648 653
649 if (unlikely(tp->urg_mode && 654 /* The urg_mode check is necessary during a below snd_una win probe */
655 if (unlikely(tcp_urg_mode(tp) &&
650 between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { 656 between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) {
651 th->urg_ptr = htons(tp->snd_up - tcb->seq); 657 th->urg_ptr = htons(tp->snd_up - tcb->seq);
652 th->urg = 1; 658 th->urg = 1;
@@ -1012,7 +1018,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1012/* Compute the current effective MSS, taking SACKs and IP options, 1018/* Compute the current effective MSS, taking SACKs and IP options,
1013 * and even PMTU discovery events into account. 1019 * and even PMTU discovery events into account.
1014 * 1020 *
1015 * LARGESEND note: !urg_mode is overkill, only frames up to snd_up 1021 * LARGESEND note: !tcp_urg_mode is overkill, only frames up to snd_up
1016 * cannot be large. However, taking into account rare use of URG, this 1022 * cannot be large. However, taking into account rare use of URG, this
1017 * is not a big flaw. 1023 * is not a big flaw.
1018 */ 1024 */
@@ -1029,7 +1035,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
1029 1035
1030 mss_now = tp->mss_cache; 1036 mss_now = tp->mss_cache;
1031 1037
1032 if (large_allowed && sk_can_gso(sk) && !tp->urg_mode) 1038 if (large_allowed && sk_can_gso(sk) && !tcp_urg_mode(tp))
1033 doing_tso = 1; 1039 doing_tso = 1;
1034 1040
1035 if (dst) { 1041 if (dst) {
@@ -1193,7 +1199,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
1193 /* Don't use the nagle rule for urgent data (or for the final FIN). 1199 /* Don't use the nagle rule for urgent data (or for the final FIN).
1194 * Nagle can be ignored during F-RTO too (see RFC4138). 1200 * Nagle can be ignored during F-RTO too (see RFC4138).
1195 */ 1201 */
1196 if (tp->urg_mode || (tp->frto_counter == 2) || 1202 if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
1197 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) 1203 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
1198 return 1; 1204 return 1;
1199 1205
@@ -1824,6 +1830,8 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
1824 1830
1825 /* changed transmit queue under us so clear hints */ 1831 /* changed transmit queue under us so clear hints */
1826 tcp_clear_retrans_hints_partial(tp); 1832 tcp_clear_retrans_hints_partial(tp);
1833 if (next_skb == tp->retransmit_skb_hint)
1834 tp->retransmit_skb_hint = skb;
1827 1835
1828 sk_wmem_free_skb(sk, next_skb); 1836 sk_wmem_free_skb(sk, next_skb);
1829} 1837}
@@ -1838,7 +1846,7 @@ void tcp_simple_retransmit(struct sock *sk)
1838 struct tcp_sock *tp = tcp_sk(sk); 1846 struct tcp_sock *tp = tcp_sk(sk);
1839 struct sk_buff *skb; 1847 struct sk_buff *skb;
1840 unsigned int mss = tcp_current_mss(sk, 0); 1848 unsigned int mss = tcp_current_mss(sk, 0);
1841 int lost = 0; 1849 u32 prior_lost = tp->lost_out;
1842 1850
1843 tcp_for_write_queue(skb, sk) { 1851 tcp_for_write_queue(skb, sk) {
1844 if (skb == tcp_send_head(sk)) 1852 if (skb == tcp_send_head(sk))
@@ -1849,17 +1857,13 @@ void tcp_simple_retransmit(struct sock *sk)
1849 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1857 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1850 tp->retrans_out -= tcp_skb_pcount(skb); 1858 tp->retrans_out -= tcp_skb_pcount(skb);
1851 } 1859 }
1852 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) { 1860 tcp_skb_mark_lost_uncond_verify(tp, skb);
1853 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1854 tp->lost_out += tcp_skb_pcount(skb);
1855 lost = 1;
1856 }
1857 } 1861 }
1858 } 1862 }
1859 1863
1860 tcp_clear_all_retrans_hints(tp); 1864 tcp_clear_retrans_hints_partial(tp);
1861 1865
1862 if (!lost) 1866 if (prior_lost == tp->lost_out)
1863 return; 1867 return;
1864 1868
1865 if (tcp_is_reno(tp)) 1869 if (tcp_is_reno(tp))
@@ -1934,8 +1938,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1934 /* Collapse two adjacent packets if worthwhile and we can. */ 1938 /* Collapse two adjacent packets if worthwhile and we can. */
1935 if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && 1939 if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
1936 (skb->len < (cur_mss >> 1)) && 1940 (skb->len < (cur_mss >> 1)) &&
1937 (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
1938 (!tcp_skb_is_last(sk, skb)) && 1941 (!tcp_skb_is_last(sk, skb)) &&
1942 (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
1939 (skb_shinfo(skb)->nr_frags == 0 && 1943 (skb_shinfo(skb)->nr_frags == 0 &&
1940 skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && 1944 skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
1941 (tcp_skb_pcount(skb) == 1 && 1945 (tcp_skb_pcount(skb) == 1 &&
@@ -1996,86 +2000,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1996 return err; 2000 return err;
1997} 2001}
1998 2002
1999/* This gets called after a retransmit timeout, and the initially 2003static int tcp_can_forward_retransmit(struct sock *sk)
2000 * retransmitted data is acknowledged. It tries to continue
2001 * resending the rest of the retransmit queue, until either
2002 * we've sent it all or the congestion window limit is reached.
2003 * If doing SACK, the first ACK which comes back for a timeout
2004 * based retransmit packet might feed us FACK information again.
2005 * If so, we use it to avoid unnecessarily retransmissions.
2006 */
2007void tcp_xmit_retransmit_queue(struct sock *sk)
2008{ 2004{
2009 const struct inet_connection_sock *icsk = inet_csk(sk); 2005 const struct inet_connection_sock *icsk = inet_csk(sk);
2010 struct tcp_sock *tp = tcp_sk(sk); 2006 struct tcp_sock *tp = tcp_sk(sk);
2011 struct sk_buff *skb;
2012 int packet_cnt;
2013
2014 if (tp->retransmit_skb_hint) {
2015 skb = tp->retransmit_skb_hint;
2016 packet_cnt = tp->retransmit_cnt_hint;
2017 } else {
2018 skb = tcp_write_queue_head(sk);
2019 packet_cnt = 0;
2020 }
2021
2022 /* First pass: retransmit lost packets. */
2023 if (tp->lost_out) {
2024 tcp_for_write_queue_from(skb, sk) {
2025 __u8 sacked = TCP_SKB_CB(skb)->sacked;
2026
2027 if (skb == tcp_send_head(sk))
2028 break;
2029 /* we could do better than to assign each time */
2030 tp->retransmit_skb_hint = skb;
2031 tp->retransmit_cnt_hint = packet_cnt;
2032
2033 /* Assume this retransmit will generate
2034 * only one packet for congestion window
2035 * calculation purposes. This works because
2036 * tcp_retransmit_skb() will chop up the
2037 * packet to be MSS sized and all the
2038 * packet counting works out.
2039 */
2040 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
2041 return;
2042
2043 if (sacked & TCPCB_LOST) {
2044 if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
2045 int mib_idx;
2046
2047 if (tcp_retransmit_skb(sk, skb)) {
2048 tp->retransmit_skb_hint = NULL;
2049 return;
2050 }
2051 if (icsk->icsk_ca_state != TCP_CA_Loss)
2052 mib_idx = LINUX_MIB_TCPFASTRETRANS;
2053 else
2054 mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
2055 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2056
2057 if (skb == tcp_write_queue_head(sk))
2058 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2059 inet_csk(sk)->icsk_rto,
2060 TCP_RTO_MAX);
2061 }
2062
2063 packet_cnt += tcp_skb_pcount(skb);
2064 if (packet_cnt >= tp->lost_out)
2065 break;
2066 }
2067 }
2068 }
2069
2070 /* OK, demanded retransmission is finished. */
2071 2007
2072 /* Forward retransmissions are possible only during Recovery. */ 2008 /* Forward retransmissions are possible only during Recovery. */
2073 if (icsk->icsk_ca_state != TCP_CA_Recovery) 2009 if (icsk->icsk_ca_state != TCP_CA_Recovery)
2074 return; 2010 return 0;
2075 2011
2076 /* No forward retransmissions in Reno are possible. */ 2012 /* No forward retransmissions in Reno are possible. */
2077 if (tcp_is_reno(tp)) 2013 if (tcp_is_reno(tp))
2078 return; 2014 return 0;
2079 2015
2080 /* Yeah, we have to make difficult choice between forward transmission 2016 /* Yeah, we have to make difficult choice between forward transmission
2081 * and retransmission... Both ways have their merits... 2017 * and retransmission... Both ways have their merits...
@@ -2086,43 +2022,104 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2086 */ 2022 */
2087 2023
2088 if (tcp_may_send_now(sk)) 2024 if (tcp_may_send_now(sk))
2089 return; 2025 return 0;
2090 2026
2091 /* If nothing is SACKed, highest_sack in the loop won't be valid */ 2027 return 1;
2092 if (!tp->sacked_out) 2028}
2093 return;
2094 2029
2095 if (tp->forward_skb_hint) 2030/* This gets called after a retransmit timeout, and the initially
2096 skb = tp->forward_skb_hint; 2031 * retransmitted data is acknowledged. It tries to continue
2097 else 2032 * resending the rest of the retransmit queue, until either
2033 * we've sent it all or the congestion window limit is reached.
2034 * If doing SACK, the first ACK which comes back for a timeout
2035 * based retransmit packet might feed us FACK information again.
2036 * If so, we use it to avoid unnecessarily retransmissions.
2037 */
2038void tcp_xmit_retransmit_queue(struct sock *sk)
2039{
2040 const struct inet_connection_sock *icsk = inet_csk(sk);
2041 struct tcp_sock *tp = tcp_sk(sk);
2042 struct sk_buff *skb;
2043 struct sk_buff *hole = NULL;
2044 u32 last_lost;
2045 int mib_idx;
2046 int fwd_rexmitting = 0;
2047
2048 if (!tp->lost_out)
2049 tp->retransmit_high = tp->snd_una;
2050
2051 if (tp->retransmit_skb_hint) {
2052 skb = tp->retransmit_skb_hint;
2053 last_lost = TCP_SKB_CB(skb)->end_seq;
2054 if (after(last_lost, tp->retransmit_high))
2055 last_lost = tp->retransmit_high;
2056 } else {
2098 skb = tcp_write_queue_head(sk); 2057 skb = tcp_write_queue_head(sk);
2058 last_lost = tp->snd_una;
2059 }
2099 2060
2061 /* First pass: retransmit lost packets. */
2100 tcp_for_write_queue_from(skb, sk) { 2062 tcp_for_write_queue_from(skb, sk) {
2101 if (skb == tcp_send_head(sk)) 2063 __u8 sacked = TCP_SKB_CB(skb)->sacked;
2102 break;
2103 tp->forward_skb_hint = skb;
2104 2064
2105 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) 2065 if (skb == tcp_send_head(sk))
2106 break; 2066 break;
2067 /* we could do better than to assign each time */
2068 if (hole == NULL)
2069 tp->retransmit_skb_hint = skb;
2107 2070
2071 /* Assume this retransmit will generate
2072 * only one packet for congestion window
2073 * calculation purposes. This works because
2074 * tcp_retransmit_skb() will chop up the
2075 * packet to be MSS sized and all the
2076 * packet counting works out.
2077 */
2108 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) 2078 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
2109 break; 2079 return;
2080
2081 if (fwd_rexmitting) {
2082begin_fwd:
2083 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
2084 break;
2085 mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
2086
2087 } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
2088 tp->retransmit_high = last_lost;
2089 if (!tcp_can_forward_retransmit(sk))
2090 break;
2091 /* Backtrack if necessary to non-L'ed skb */
2092 if (hole != NULL) {
2093 skb = hole;
2094 hole = NULL;
2095 }
2096 fwd_rexmitting = 1;
2097 goto begin_fwd;
2110 2098
2111 if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) 2099 } else if (!(sacked & TCPCB_LOST)) {
2100 if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS))
2101 hole = skb;
2112 continue; 2102 continue;
2113 2103
2114 /* Ok, retransmit it. */ 2104 } else {
2115 if (tcp_retransmit_skb(sk, skb)) { 2105 last_lost = TCP_SKB_CB(skb)->end_seq;
2116 tp->forward_skb_hint = NULL; 2106 if (icsk->icsk_ca_state != TCP_CA_Loss)
2117 break; 2107 mib_idx = LINUX_MIB_TCPFASTRETRANS;
2108 else
2109 mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
2118 } 2110 }
2119 2111
2112 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
2113 continue;
2114
2115 if (tcp_retransmit_skb(sk, skb))
2116 return;
2117 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2118
2120 if (skb == tcp_write_queue_head(sk)) 2119 if (skb == tcp_write_queue_head(sk))
2121 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2120 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2122 inet_csk(sk)->icsk_rto, 2121 inet_csk(sk)->icsk_rto,
2123 TCP_RTO_MAX); 2122 TCP_RTO_MAX);
2124
2125 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFORWARDRETRANS);
2126 } 2123 }
2127} 2124}
2128 2125
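The hunk above collapses the old two-pass retransmit loop (first the lost segments, then a separate forward-retransmission walk) into a single pass over the write queue: the loop remembers the first segment that is neither lost nor already retransmitted as a "hole", and once it runs past tp->retransmit_high it either stops or backtracks to that hole and continues in forward-retransmission mode. The user-space sketch below models only that control flow; the seg struct, the F_* flags and retransmit_walk() are illustrative stand-ins, and congestion-window limiting and timer re-arming are omitted, so this is not the kernel code.

#include <stdio.h>

#define F_LOST    0x1   /* stands in for TCPCB_LOST           */
#define F_SACKED  0x2   /* stands in for TCPCB_SACKED_ACKED   */
#define F_RETRANS 0x4   /* stands in for TCPCB_SACKED_RETRANS */

struct seg { unsigned int seq; unsigned int flags; };

static void retransmit_walk(struct seg *q, int n,
                            unsigned int retransmit_high,
                            unsigned int highest_sack_seq,
                            int can_fwd_rexmit)
{
    int hole = -1;  /* first skipped segment: neither lost nor retransmitted */
    int fwd = 0;    /* set once we switch to forward retransmissions         */
    int i;

    for (i = 0; i < n; i++) {
        struct seg *s = &q[i];

        if (fwd) {
            if (s->seq >= highest_sack_seq)
                break;              /* nothing SACKed beyond this point */
        } else if (s->seq >= retransmit_high) {
            if (!can_fwd_rexmit)
                break;              /* only during Recovery, never Reno */
            if (hole >= 0) {        /* backtrack to the remembered hole */
                i = hole;
                hole = -1;
                s = &q[i];
            }
            fwd = 1;
            if (s->seq >= highest_sack_seq)
                break;
        } else if (!(s->flags & F_LOST)) {
            if (hole < 0 && !(s->flags & F_RETRANS))
                hole = i;           /* candidate for the forward phase  */
            continue;
        }

        if (s->flags & (F_SACKED | F_RETRANS))
            continue;               /* already covered, nothing to send */

        printf("%s retransmit of seq %u\n",
               fwd ? "forward" : "loss-triggered", s->seq);
        s->flags |= F_RETRANS;      /* a real retransmit marks the segment */
    }
}

int main(void)
{
    struct seg q[] = {
        { 1000, F_LOST }, { 2000, 0 }, { 3000, F_SACKED },
        { 4000, F_LOST }, { 5000, 0 },
    };

    /* Lost range ends at 4500: 1000 and 4000 are retransmitted first,
     * then the walk backtracks to the hole at 2000 and forward-retransmits
     * it plus the trailing un-SACKed 5000 (both below the SACK high mark). */
    retransmit_walk(q, 5, 4500, 6000, 1);
    return 0;
}

Running it prints the two loss-triggered retransmits first, followed by the forward retransmits of the skipped hole and the last un-SACKed segment, which is exactly the ordering the rewritten loop produces.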
@@ -2241,6 +2238,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2241 struct sk_buff *skb; 2238 struct sk_buff *skb;
2242 struct tcp_md5sig_key *md5; 2239 struct tcp_md5sig_key *md5;
2243 __u8 *md5_hash_location; 2240 __u8 *md5_hash_location;
2241 int mss;
2244 2242
2245 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); 2243 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
2246 if (skb == NULL) 2244 if (skb == NULL)
@@ -2251,13 +2249,17 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2251 2249
2252 skb->dst = dst_clone(dst); 2250 skb->dst = dst_clone(dst);
2253 2251
2252 mss = dst_metric(dst, RTAX_ADVMSS);
2253 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2254 mss = tp->rx_opt.user_mss;
2255
2254 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ 2256 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
2255 __u8 rcv_wscale; 2257 __u8 rcv_wscale;
2256 /* Set this up on the first call only */ 2258 /* Set this up on the first call only */
2257 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); 2259 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2258 /* tcp_full_space because it is guaranteed to be the first packet */ 2260 /* tcp_full_space because it is guaranteed to be the first packet */
2259 tcp_select_initial_window(tcp_full_space(sk), 2261 tcp_select_initial_window(tcp_full_space(sk),
2260 dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 2262 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2261 &req->rcv_wnd, 2263 &req->rcv_wnd,
2262 &req->window_clamp, 2264 &req->window_clamp,
2263 ireq->wscale_ok, 2265 ireq->wscale_ok,
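In the hunks above (and again in tcp_connect_init further down), the MSS used for the SYN-ACK is no longer taken straight from the route's RTAX_ADVMSS metric: if the application lowered the MSS with setsockopt(TCP_MAXSEG), recorded in rx_opt.user_mss, the smaller value feeds both the initial window computation and tcp_synack_options(). A minimal sketch of that clamping rule follows; synack_mss() is a hypothetical helper written here for illustration, not a kernel function.

/* Hypothetical helper illustrating the clamp applied above: use the
 * route's advertised-MSS metric unless the application configured a
 * smaller one via TCP_MAXSEG (user_mss == 0 means "not set"). */
static unsigned int synack_mss(unsigned int route_advmss,
                               unsigned int user_mss)
{
    unsigned int mss = route_advmss;

    if (user_mss && user_mss < mss)
        mss = user_mss;
    return mss;
}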
@@ -2267,8 +2269,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2267 2269
2268 memset(&opts, 0, sizeof(opts)); 2270 memset(&opts, 0, sizeof(opts));
2269 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2271 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2270 tcp_header_size = tcp_synack_options(sk, req, 2272 tcp_header_size = tcp_synack_options(sk, req, mss,
2271 dst_metric(dst, RTAX_ADVMSS),
2272 skb, &opts, &md5) + 2273 skb, &opts, &md5) +
2273 sizeof(struct tcphdr); 2274 sizeof(struct tcphdr);
2274 2275
@@ -2280,7 +2281,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2280 th->syn = 1; 2281 th->syn = 1;
2281 th->ack = 1; 2282 th->ack = 1;
2282 TCP_ECN_make_synack(req, th); 2283 TCP_ECN_make_synack(req, th);
2283 th->source = inet_sk(sk)->sport; 2284 th->source = ireq->loc_port;
2284 th->dest = ireq->rmt_port; 2285 th->dest = ireq->rmt_port;
2285 /* Setting of flags are superfluous here for callers (and ECE is 2286 /* Setting of flags are superfluous here for callers (and ECE is
2286 * not even correctly set) 2287 * not even correctly set)
@@ -2342,6 +2343,9 @@ static void tcp_connect_init(struct sock *sk)
2342 if (!tp->window_clamp) 2343 if (!tp->window_clamp)
2343 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 2344 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2344 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 2345 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
2346 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
2347 tp->advmss = tp->rx_opt.user_mss;
2348
2345 tcp_initialize_rcv_mss(sk); 2349 tcp_initialize_rcv_mss(sk);
2346 2350
2347 tcp_select_initial_window(tcp_full_space(sk), 2351 tcp_select_initial_window(tcp_full_space(sk),
@@ -2360,6 +2364,7 @@ static void tcp_connect_init(struct sock *sk)
2360 tcp_init_wl(tp, tp->write_seq, 0); 2364 tcp_init_wl(tp, tp->write_seq, 0);
2361 tp->snd_una = tp->write_seq; 2365 tp->snd_una = tp->write_seq;
2362 tp->snd_sml = tp->write_seq; 2366 tp->snd_sml = tp->write_seq;
2367 tp->snd_up = tp->write_seq;
2363 tp->rcv_nxt = 0; 2368 tp->rcv_nxt = 0;
2364 tp->rcv_wup = 0; 2369 tp->rcv_wup = 0;
2365 tp->copied_seq = 0; 2370 tp->copied_seq = 0;
@@ -2569,8 +2574,7 @@ int tcp_write_wakeup(struct sock *sk)
2569 tcp_event_new_data_sent(sk, skb); 2574 tcp_event_new_data_sent(sk, skb);
2570 return err; 2575 return err;
2571 } else { 2576 } else {
2572 if (tp->urg_mode && 2577 if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
2573 between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
2574 tcp_xmit_probe_skb(sk, 1); 2578 tcp_xmit_probe_skb(sk, 1);
2575 return tcp_xmit_probe_skb(sk, 0); 2579 return tcp_xmit_probe_skb(sk, 0);
2576 } 2580 }
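The window-probe path above no longer consults the removed tp->urg_mode flag; whether to send an urgent probe is now derived purely from the sequence relation between snd_up and snd_una. between(x, lo, hi) is the usual wrap-safe "lo <= x <= hi" test on 32-bit sequence numbers. The sketch below restates that test with my own names (seq_between, urgent_outstanding); the exact kernel definition of between() is an assumption here.

#include <stdint.h>

/* Wrap-safe "low <= seq <= high" on 32-bit sequence space (sketch). */
static inline int seq_between(uint32_t seq, uint32_t low, uint32_t high)
{
    return high - low >= seq - low;
}

/* Urgent data is considered outstanding when snd_up falls within
 * (snd_una, snd_una + 0xFFFF], i.e. within reach of the 16-bit
 * urgent pointer; that is the condition that triggers the extra
 * probe in the hunk above. */
static inline int urgent_outstanding(uint32_t snd_up, uint32_t snd_una)
{
    return seq_between(snd_up, snd_una + 1, snd_una + 0xFFFF);
}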
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 5ab6ba19c3ce..6b6dff1164b9 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -201,7 +201,7 @@ static void tcp_delack_timer(unsigned long data)
201 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); 201 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
202 202
203 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) 203 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
204 sk->sk_backlog_rcv(sk, skb); 204 sk_backlog_rcv(sk, skb);
205 205
206 tp->ucopy.memory = 0; 206 tp->ucopy.memory = 0;
207 } 207 }
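The delack-timer hunk above switches the prequeue drain from calling the sk->sk_backlog_rcv function pointer directly to going through a sk_backlog_rcv() helper, which gives the core one common entry point for backlog processing. Presumably that helper is a thin inline wrapper along the following lines; the body shown is an assumption, and the name sk_backlog_rcv_wrapper is mine (the real helper lives in the socket headers).

/* Assumed shape of the wrapper: forward to the protocol's backlog
 * receive handler through one shared call site. */
static inline int sk_backlog_rcv_wrapper(struct sock *sk, struct sk_buff *skb)
{
    return sk->sk_backlog_rcv(sk, skb);
}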
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 57e26fa66185..eacf4cfef146 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -108,9 +108,6 @@
108 * Snmp MIB for the UDP layer 108 * Snmp MIB for the UDP layer
109 */ 109 */
110 110
111DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
112EXPORT_SYMBOL(udp_stats_in6);
113
114struct hlist_head udp_hash[UDP_HTABLE_SIZE]; 111struct hlist_head udp_hash[UDP_HTABLE_SIZE];
115DEFINE_RWLOCK(udp_hash_lock); 112DEFINE_RWLOCK(udp_hash_lock);
116 113
@@ -125,14 +122,23 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min);
125atomic_t udp_memory_allocated; 122atomic_t udp_memory_allocated;
126EXPORT_SYMBOL(udp_memory_allocated); 123EXPORT_SYMBOL(udp_memory_allocated);
127 124
128static inline int __udp_lib_lport_inuse(struct net *net, __u16 num, 125static int udp_lib_lport_inuse(struct net *net, __u16 num,
129 const struct hlist_head udptable[]) 126 const struct hlist_head udptable[],
127 struct sock *sk,
128 int (*saddr_comp)(const struct sock *sk1,
129 const struct sock *sk2))
130{ 130{
131 struct sock *sk; 131 struct sock *sk2;
132 struct hlist_node *node; 132 struct hlist_node *node;
133 133
134 sk_for_each(sk, node, &udptable[udp_hashfn(net, num)]) 134 sk_for_each(sk2, node, &udptable[udp_hashfn(net, num)])
135 if (net_eq(sock_net(sk), net) && sk->sk_hash == num) 135 if (net_eq(sock_net(sk2), net) &&
136 sk2 != sk &&
137 sk2->sk_hash == num &&
138 (!sk2->sk_reuse || !sk->sk_reuse) &&
139 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
140 || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
141 (*saddr_comp)(sk, sk2))
136 return 1; 142 return 1;
137 return 0; 143 return 0;
138} 144}
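The reworked in-use check above now receives the binding socket and the address comparator, so the autobind path and the explicit-bind path share one conflict rule: a chain entry conflicts only if it hashes to the same port, is a different socket, not both sides asked for SO_REUSEADDR, the bound devices do not separate them, and the address comparator reports a clash. The standalone sketch below restates that predicate; bind_info, ports_conflict and saddr_clash are simplified stand-ins for the socket fields tested above, and the network-namespace check is omitted.

#include <stdbool.h>

struct bind_info {
    unsigned short port;
    bool           reuseaddr;      /* SO_REUSEADDR set          */
    int            bound_dev_if;   /* 0 = not bound to a device */
};

static bool ports_conflict(const struct bind_info *a,
                           const struct bind_info *b,
                           bool (*saddr_clash)(const struct bind_info *,
                                               const struct bind_info *))
{
    if (a == b || a->port != b->port)
        return false;
    if (a->reuseaddr && b->reuseaddr)
        return false;              /* both asked for reuse: sharing allowed */
    if (a->bound_dev_if && b->bound_dev_if &&
        a->bound_dev_if != b->bound_dev_if)
        return false;              /* separated by different devices */
    return saddr_clash(a, b);      /* finally, do the local addresses clash? */
}

The only sockets that escape the conflict are pairs where both set SO_REUSEADDR or where each is bound to a distinct interface, which mirrors the condition chain inside udp_lib_lport_inuse().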
@@ -149,83 +155,37 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
149 const struct sock *sk2 ) ) 155 const struct sock *sk2 ) )
150{ 156{
151 struct hlist_head *udptable = sk->sk_prot->h.udp_hash; 157 struct hlist_head *udptable = sk->sk_prot->h.udp_hash;
152 struct hlist_node *node;
153 struct hlist_head *head;
154 struct sock *sk2;
155 int error = 1; 158 int error = 1;
156 struct net *net = sock_net(sk); 159 struct net *net = sock_net(sk);
157 160
158 write_lock_bh(&udp_hash_lock); 161 write_lock_bh(&udp_hash_lock);
159 162
160 if (!snum) { 163 if (!snum) {
161 int i, low, high, remaining; 164 int low, high, remaining;
162 unsigned rover, best, best_size_so_far; 165 unsigned rand;
166 unsigned short first;
163 167
164 inet_get_local_port_range(&low, &high); 168 inet_get_local_port_range(&low, &high);
165 remaining = (high - low) + 1; 169 remaining = (high - low) + 1;
166 170
167 best_size_so_far = UINT_MAX; 171 rand = net_random();
168 best = rover = net_random() % remaining + low; 172 snum = first = rand % remaining + low;
169 173 rand |= 1;
170 /* 1st pass: look for empty (or shortest) hash chain */ 174 while (udp_lib_lport_inuse(net, snum, udptable, sk,
171 for (i = 0; i < UDP_HTABLE_SIZE; i++) { 175 saddr_comp)) {
172 int size = 0; 176 do {
173 177 snum = snum + rand;
174 head = &udptable[udp_hashfn(net, rover)]; 178 } while (snum < low || snum > high);
175 if (hlist_empty(head)) 179 if (snum == first)
176 goto gotit; 180 goto fail;
177
178 sk_for_each(sk2, node, head) {
179 if (++size >= best_size_so_far)
180 goto next;
181 }
182 best_size_so_far = size;
183 best = rover;
184 next:
185 /* fold back if end of range */
186 if (++rover > high)
187 rover = low + ((rover - low)
188 & (UDP_HTABLE_SIZE - 1));
189
190
191 }
192
193 /* 2nd pass: find hole in shortest hash chain */
194 rover = best;
195 for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) {
196 if (! __udp_lib_lport_inuse(net, rover, udptable))
197 goto gotit;
198 rover += UDP_HTABLE_SIZE;
199 if (rover > high)
200 rover = low + ((rover - low)
201 & (UDP_HTABLE_SIZE - 1));
202 } 181 }
203 182 } else if (udp_lib_lport_inuse(net, snum, udptable, sk, saddr_comp))
204
205 /* All ports in use! */
206 goto fail; 183 goto fail;
207 184
208gotit:
209 snum = rover;
210 } else {
211 head = &udptable[udp_hashfn(net, snum)];
212
213 sk_for_each(sk2, node, head)
214 if (sk2->sk_hash == snum &&
215 sk2 != sk &&
216 net_eq(sock_net(sk2), net) &&
217 (!sk2->sk_reuse || !sk->sk_reuse) &&
218 (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
219 || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
220 (*saddr_comp)(sk, sk2) )
221 goto fail;
222 }
223
224 inet_sk(sk)->num = snum; 185 inet_sk(sk)->num = snum;
225 sk->sk_hash = snum; 186 sk->sk_hash = snum;
226 if (sk_unhashed(sk)) { 187 if (sk_unhashed(sk)) {
227 head = &udptable[udp_hashfn(net, snum)]; 188 sk_add_node(sk, &udptable[udp_hashfn(net, snum)]);
228 sk_add_node(sk, head);
229 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 189 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
230 } 190 }
231 error = 0; 191 error = 0;
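The port-allocation rewrite above drops the old two-pass "shortest hash chain" heuristic in favour of a randomized walk: start at a random port inside the configured range, and on each conflict add a random odd stride. Because the stride is odd and the 16-bit port space has power-of-two size, the walk is a full cycle that visits every port exactly once, so returning to the starting port (snum == first) reliably means every port in [low, high] was tried. The user-space demonstration below uses the same strategy; pick_port and always_busy are illustrative names, not kernel code.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Same strategy as the autobind path above: random start, random odd
 * stride, give up when the walk returns to its starting port. */
static int pick_port(int low, int high, bool (*in_use)(unsigned short))
{
    unsigned int remaining = (unsigned int)(high - low) + 1;
    unsigned int stride = (unsigned int)rand();
    unsigned short first, snum;

    snum = first = stride % remaining + low;
    stride |= 1;                    /* odd stride => full 2^16 cycle */

    while (in_use(snum)) {
        do {
            snum = snum + stride;   /* wraps mod 65536 (unsigned short) */
        } while (snum < low || snum > high);
        if (snum == first)
            return -1;              /* every in-range port was tried */
    }
    return snum;
}

static bool always_busy(unsigned short port) { (void)port; return true; }

int main(void)
{
    /* With every port busy, the walk must come back to where it started
     * and report failure rather than loop forever. */
    printf("%d\n", pick_port(32768, 61000, always_busy));
    return 0;
}

Compared with the removed code, this trades the attempt to balance hash-chain length for a bind that needs no full table scan and is harder to predict from the outside.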
@@ -302,6 +262,28 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
302 return result; 262 return result;
303} 263}
304 264
265static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
266 __be16 sport, __be16 dport,
267 struct hlist_head udptable[])
268{
269 struct sock *sk;
270 const struct iphdr *iph = ip_hdr(skb);
271
272 if (unlikely(sk = skb_steal_sock(skb)))
273 return sk;
274 else
275 return __udp4_lib_lookup(dev_net(skb->dst->dev), iph->saddr, sport,
276 iph->daddr, dport, inet_iif(skb),
277 udptable);
278}
279
280struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
281 __be32 daddr, __be16 dport, int dif)
282{
283 return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, udp_hash);
284}
285EXPORT_SYMBOL_GPL(udp4_lib_lookup);
286
305static inline struct sock *udp_v4_mcast_next(struct sock *sk, 287static inline struct sock *udp_v4_mcast_next(struct sock *sk,
306 __be16 loc_port, __be32 loc_addr, 288 __be16 loc_port, __be32 loc_addr,
307 __be16 rmt_port, __be32 rmt_addr, 289 __be16 rmt_port, __be32 rmt_addr,
@@ -1201,8 +1183,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[],
1201 return __udp4_lib_mcast_deliver(net, skb, uh, 1183 return __udp4_lib_mcast_deliver(net, skb, uh,
1202 saddr, daddr, udptable); 1184 saddr, daddr, udptable);
1203 1185
1204 sk = __udp4_lib_lookup(net, saddr, uh->source, daddr, 1186 sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
1205 uh->dest, inet_iif(skb), udptable);
1206 1187
1207 if (sk != NULL) { 1188 if (sk != NULL) {
1208 int ret = udp_queue_rcv_skb(sk, skb); 1189 int ret = udp_queue_rcv_skb(sk, skb);