aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/devinet.c15
-rw-r--r--net/ipv4/inet_diag.c6
-rw-r--r--net/ipv4/inet_timewait_sock.c35
-rw-r--r--net/ipv4/ipvs/Kconfig17
-rw-r--r--net/ipv4/ipvs/Makefile3
-rw-r--r--net/ipv4/ipvs/ip_vs_conn.c249
-rw-r--r--net/ipv4/ipvs/ip_vs_core.c817
-rw-r--r--net/ipv4/ipvs/ip_vs_ctl.c1370
-rw-r--r--net/ipv4/ipvs/ip_vs_dh.c5
-rw-r--r--net/ipv4/ipvs/ip_vs_est.c58
-rw-r--r--net/ipv4/ipvs/ip_vs_ftp.c61
-rw-r--r--net/ipv4/ipvs/ip_vs_lblc.c220
-rw-r--r--net/ipv4/ipvs/ip_vs_lblcr.c249
-rw-r--r--net/ipv4/ipvs/ip_vs_lc.c32
-rw-r--r--net/ipv4/ipvs/ip_vs_nq.c39
-rw-r--r--net/ipv4/ipvs/ip_vs_proto.c65
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_ah.c178
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_ah_esp.c235
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_esp.c176
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_tcp.c254
-rw-r--r--net/ipv4/ipvs/ip_vs_proto_udp.c227
-rw-r--r--net/ipv4/ipvs/ip_vs_rr.c20
-rw-r--r--net/ipv4/ipvs/ip_vs_sed.c39
-rw-r--r--net/ipv4/ipvs/ip_vs_sh.c5
-rw-r--r--net/ipv4/ipvs/ip_vs_sync.c40
-rw-r--r--net/ipv4/ipvs/ip_vs_wlc.c39
-rw-r--r--net/ipv4/ipvs/ip_vs_wrr.c15
-rw-r--r--net/ipv4/ipvs/ip_vs_xmit.c471
-rw-r--r--net/ipv4/route.c14
-rw-r--r--net/ipv4/tcp_input.c314
-rw-r--r--net/ipv4/tcp_ipv4.c31
-rw-r--r--net/ipv4/tcp_output.c202
32 files changed, 3850 insertions, 1651 deletions
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 91d3d96805d0..b12dae2b0b2d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1029,6 +1029,11 @@ skip:
1029 } 1029 }
1030} 1030}
1031 1031
1032static inline bool inetdev_valid_mtu(unsigned mtu)
1033{
1034 return mtu >= 68;
1035}
1036
1032/* Called only under RTNL semaphore */ 1037/* Called only under RTNL semaphore */
1033 1038
1034static int inetdev_event(struct notifier_block *this, unsigned long event, 1039static int inetdev_event(struct notifier_block *this, unsigned long event,
@@ -1048,6 +1053,10 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1048 IN_DEV_CONF_SET(in_dev, NOXFRM, 1); 1053 IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
1049 IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); 1054 IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
1050 } 1055 }
1056 } else if (event == NETDEV_CHANGEMTU) {
1057 /* Re-enabling IP */
1058 if (inetdev_valid_mtu(dev->mtu))
1059 in_dev = inetdev_init(dev);
1051 } 1060 }
1052 goto out; 1061 goto out;
1053 } 1062 }
@@ -1058,7 +1067,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1058 dev->ip_ptr = NULL; 1067 dev->ip_ptr = NULL;
1059 break; 1068 break;
1060 case NETDEV_UP: 1069 case NETDEV_UP:
1061 if (dev->mtu < 68) 1070 if (!inetdev_valid_mtu(dev->mtu))
1062 break; 1071 break;
1063 if (dev->flags & IFF_LOOPBACK) { 1072 if (dev->flags & IFF_LOOPBACK) {
1064 struct in_ifaddr *ifa; 1073 struct in_ifaddr *ifa;
@@ -1080,9 +1089,9 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
1080 ip_mc_down(in_dev); 1089 ip_mc_down(in_dev);
1081 break; 1090 break;
1082 case NETDEV_CHANGEMTU: 1091 case NETDEV_CHANGEMTU:
1083 if (dev->mtu >= 68) 1092 if (inetdev_valid_mtu(dev->mtu))
1084 break; 1093 break;
1085 /* MTU falled under 68, disable IP */ 1094 /* disable IP when MTU is not enough */
1086 case NETDEV_UNREGISTER: 1095 case NETDEV_UNREGISTER:
1087 inetdev_destroy(in_dev); 1096 inetdev_destroy(in_dev);
1088 break; 1097 break;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c10036e7a463..89cb047ab314 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -782,11 +782,15 @@ skip_listen_ht:
782 struct sock *sk; 782 struct sock *sk;
783 struct hlist_node *node; 783 struct hlist_node *node;
784 784
785 num = 0;
786
787 if (hlist_empty(&head->chain) && hlist_empty(&head->twchain))
788 continue;
789
785 if (i > s_i) 790 if (i > s_i)
786 s_num = 0; 791 s_num = 0;
787 792
788 read_lock_bh(lock); 793 read_lock_bh(lock);
789 num = 0;
790 sk_for_each(sk, node, &head->chain) { 794 sk_for_each(sk, node, &head->chain) {
791 struct inet_sock *inet = inet_sk(sk); 795 struct inet_sock *inet = inet_sk(sk);
792 796
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index d985bd613d25..743f011b9a84 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -409,3 +409,38 @@ out:
409} 409}
410 410
411EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); 411EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
412
413void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
414 struct inet_timewait_death_row *twdr, int family)
415{
416 struct inet_timewait_sock *tw;
417 struct sock *sk;
418 struct hlist_node *node;
419 int h;
420
421 local_bh_disable();
422 for (h = 0; h < (hashinfo->ehash_size); h++) {
423 struct inet_ehash_bucket *head =
424 inet_ehash_bucket(hashinfo, h);
425 rwlock_t *lock = inet_ehash_lockp(hashinfo, h);
426restart:
427 write_lock(lock);
428 sk_for_each(sk, node, &head->twchain) {
429
430 tw = inet_twsk(sk);
431 if (!net_eq(twsk_net(tw), net) ||
432 tw->tw_family != family)
433 continue;
434
435 atomic_inc(&tw->tw_refcnt);
436 write_unlock(lock);
437 inet_twsk_deschedule(tw, twdr);
438 inet_twsk_put(tw);
439
440 goto restart;
441 }
442 write_unlock(lock);
443 }
444 local_bh_enable();
445}
446EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 09d0c3f35669..de6004de80bc 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -24,6 +24,14 @@ menuconfig IP_VS
24 24
25if IP_VS 25if IP_VS
26 26
27config IP_VS_IPV6
28 bool "IPv6 support for IPVS (DANGEROUS)"
29 depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
30 ---help---
31 Add IPv6 support to IPVS. This is incomplete and might be dangerous.
32
33 Say N if unsure.
34
27config IP_VS_DEBUG 35config IP_VS_DEBUG
28 bool "IP virtual server debugging" 36 bool "IP virtual server debugging"
29 ---help--- 37 ---help---
@@ -33,7 +41,8 @@ config IP_VS_DEBUG
33 41
34config IP_VS_TAB_BITS 42config IP_VS_TAB_BITS
35 int "IPVS connection table size (the Nth power of 2)" 43 int "IPVS connection table size (the Nth power of 2)"
36 default "12" 44 range 8 20
45 default 12
37 ---help--- 46 ---help---
38 The IPVS connection hash table uses the chaining scheme to handle 47 The IPVS connection hash table uses the chaining scheme to handle
39 hash collisions. Using a big IPVS connection hash table will greatly 48 hash collisions. Using a big IPVS connection hash table will greatly
@@ -71,14 +80,20 @@ config IP_VS_PROTO_UDP
71 This option enables support for load balancing UDP transport 80 This option enables support for load balancing UDP transport
72 protocol. Say Y if unsure. 81 protocol. Say Y if unsure.
73 82
83config IP_VS_PROTO_AH_ESP
84 bool
85 depends on UNDEFINED
86
74config IP_VS_PROTO_ESP 87config IP_VS_PROTO_ESP
75 bool "ESP load balancing support" 88 bool "ESP load balancing support"
89 select IP_VS_PROTO_AH_ESP
76 ---help--- 90 ---help---
77 This option enables support for load balancing ESP (Encapsulation 91 This option enables support for load balancing ESP (Encapsulation
78 Security Payload) transport protocol. Say Y if unsure. 92 Security Payload) transport protocol. Say Y if unsure.
79 93
80config IP_VS_PROTO_AH 94config IP_VS_PROTO_AH
81 bool "AH load balancing support" 95 bool "AH load balancing support"
96 select IP_VS_PROTO_AH_ESP
82 ---help--- 97 ---help---
83 This option enables support for load balancing AH (Authentication 98 This option enables support for load balancing AH (Authentication
84 Header) transport protocol. Say Y if unsure. 99 Header) transport protocol. Say Y if unsure.
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
index 30e85de9ffff..73a46fe1fe4c 100644
--- a/net/ipv4/ipvs/Makefile
+++ b/net/ipv4/ipvs/Makefile
@@ -6,8 +6,7 @@
6ip_vs_proto-objs-y := 6ip_vs_proto-objs-y :=
7ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o 7ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
8ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o 8ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
9ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o 9ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
10ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
11 10
12ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ 11ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
13 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ 12 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 44a6872dc245..9a24332fbed8 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -114,9 +114,18 @@ static inline void ct_write_unlock_bh(unsigned key)
114/* 114/*
115 * Returns hash value for IPVS connection entry 115 * Returns hash value for IPVS connection entry
116 */ 116 */
117static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port) 117static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
118 const union nf_inet_addr *addr,
119 __be16 port)
118{ 120{
119 return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd) 121#ifdef CONFIG_IP_VS_IPV6
122 if (af == AF_INET6)
123 return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
124 (__force u32)port, proto, ip_vs_conn_rnd)
125 & IP_VS_CONN_TAB_MASK;
126#endif
127 return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
128 ip_vs_conn_rnd)
120 & IP_VS_CONN_TAB_MASK; 129 & IP_VS_CONN_TAB_MASK;
121} 130}
122 131
@@ -131,7 +140,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
131 int ret; 140 int ret;
132 141
133 /* Hash by protocol, client address and port */ 142 /* Hash by protocol, client address and port */
134 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); 143 hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
135 144
136 ct_write_lock(hash); 145 ct_write_lock(hash);
137 146
@@ -162,7 +171,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
162 int ret; 171 int ret;
163 172
164 /* unhash it and decrease its reference counter */ 173 /* unhash it and decrease its reference counter */
165 hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); 174 hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
166 175
167 ct_write_lock(hash); 176 ct_write_lock(hash);
168 177
@@ -187,20 +196,23 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
187 * d_addr, d_port: pkt dest address (load balancer) 196 * d_addr, d_port: pkt dest address (load balancer)
188 */ 197 */
189static inline struct ip_vs_conn *__ip_vs_conn_in_get 198static inline struct ip_vs_conn *__ip_vs_conn_in_get
190(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 199(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
200 const union nf_inet_addr *d_addr, __be16 d_port)
191{ 201{
192 unsigned hash; 202 unsigned hash;
193 struct ip_vs_conn *cp; 203 struct ip_vs_conn *cp;
194 204
195 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); 205 hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
196 206
197 ct_read_lock(hash); 207 ct_read_lock(hash);
198 208
199 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 209 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
200 if (s_addr==cp->caddr && s_port==cp->cport && 210 if (cp->af == af &&
201 d_port==cp->vport && d_addr==cp->vaddr && 211 ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
212 ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
213 s_port == cp->cport && d_port == cp->vport &&
202 ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && 214 ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
203 protocol==cp->protocol) { 215 protocol == cp->protocol) {
204 /* HIT */ 216 /* HIT */
205 atomic_inc(&cp->refcnt); 217 atomic_inc(&cp->refcnt);
206 ct_read_unlock(hash); 218 ct_read_unlock(hash);
@@ -214,39 +226,44 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get
214} 226}
215 227
216struct ip_vs_conn *ip_vs_conn_in_get 228struct ip_vs_conn *ip_vs_conn_in_get
217(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 229(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
230 const union nf_inet_addr *d_addr, __be16 d_port)
218{ 231{
219 struct ip_vs_conn *cp; 232 struct ip_vs_conn *cp;
220 233
221 cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); 234 cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port);
222 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) 235 if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
223 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); 236 cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr,
237 d_port);
224 238
225 IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 239 IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
226 ip_vs_proto_name(protocol), 240 ip_vs_proto_name(protocol),
227 NIPQUAD(s_addr), ntohs(s_port), 241 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
228 NIPQUAD(d_addr), ntohs(d_port), 242 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
229 cp?"hit":"not hit"); 243 cp ? "hit" : "not hit");
230 244
231 return cp; 245 return cp;
232} 246}
233 247
234/* Get reference to connection template */ 248/* Get reference to connection template */
235struct ip_vs_conn *ip_vs_ct_in_get 249struct ip_vs_conn *ip_vs_ct_in_get
236(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 250(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
251 const union nf_inet_addr *d_addr, __be16 d_port)
237{ 252{
238 unsigned hash; 253 unsigned hash;
239 struct ip_vs_conn *cp; 254 struct ip_vs_conn *cp;
240 255
241 hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); 256 hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
242 257
243 ct_read_lock(hash); 258 ct_read_lock(hash);
244 259
245 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 260 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
246 if (s_addr==cp->caddr && s_port==cp->cport && 261 if (cp->af == af &&
247 d_port==cp->vport && d_addr==cp->vaddr && 262 ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
263 ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
264 s_port == cp->cport && d_port == cp->vport &&
248 cp->flags & IP_VS_CONN_F_TEMPLATE && 265 cp->flags & IP_VS_CONN_F_TEMPLATE &&
249 protocol==cp->protocol) { 266 protocol == cp->protocol) {
250 /* HIT */ 267 /* HIT */
251 atomic_inc(&cp->refcnt); 268 atomic_inc(&cp->refcnt);
252 goto out; 269 goto out;
@@ -257,11 +274,11 @@ struct ip_vs_conn *ip_vs_ct_in_get
257 out: 274 out:
258 ct_read_unlock(hash); 275 ct_read_unlock(hash);
259 276
260 IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 277 IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
261 ip_vs_proto_name(protocol), 278 ip_vs_proto_name(protocol),
262 NIPQUAD(s_addr), ntohs(s_port), 279 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
263 NIPQUAD(d_addr), ntohs(d_port), 280 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
264 cp?"hit":"not hit"); 281 cp ? "hit" : "not hit");
265 282
266 return cp; 283 return cp;
267} 284}
@@ -273,7 +290,8 @@ struct ip_vs_conn *ip_vs_ct_in_get
273 * d_addr, d_port: pkt dest address (foreign host) 290 * d_addr, d_port: pkt dest address (foreign host)
274 */ 291 */
275struct ip_vs_conn *ip_vs_conn_out_get 292struct ip_vs_conn *ip_vs_conn_out_get
276(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) 293(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port,
294 const union nf_inet_addr *d_addr, __be16 d_port)
277{ 295{
278 unsigned hash; 296 unsigned hash;
279 struct ip_vs_conn *cp, *ret=NULL; 297 struct ip_vs_conn *cp, *ret=NULL;
@@ -281,13 +299,15 @@ struct ip_vs_conn *ip_vs_conn_out_get
281 /* 299 /*
282 * Check for "full" addressed entries 300 * Check for "full" addressed entries
283 */ 301 */
284 hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); 302 hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
285 303
286 ct_read_lock(hash); 304 ct_read_lock(hash);
287 305
288 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 306 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
289 if (d_addr == cp->caddr && d_port == cp->cport && 307 if (cp->af == af &&
290 s_port == cp->dport && s_addr == cp->daddr && 308 ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
309 ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
310 d_port == cp->cport && s_port == cp->dport &&
291 protocol == cp->protocol) { 311 protocol == cp->protocol) {
292 /* HIT */ 312 /* HIT */
293 atomic_inc(&cp->refcnt); 313 atomic_inc(&cp->refcnt);
@@ -298,11 +318,11 @@ struct ip_vs_conn *ip_vs_conn_out_get
298 318
299 ct_read_unlock(hash); 319 ct_read_unlock(hash);
300 320
301 IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", 321 IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
302 ip_vs_proto_name(protocol), 322 ip_vs_proto_name(protocol),
303 NIPQUAD(s_addr), ntohs(s_port), 323 IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port),
304 NIPQUAD(d_addr), ntohs(d_port), 324 IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port),
305 ret?"hit":"not hit"); 325 ret ? "hit" : "not hit");
306 326
307 return ret; 327 return ret;
308} 328}
@@ -369,6 +389,33 @@ static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
369 } 389 }
370} 390}
371 391
392#ifdef CONFIG_IP_VS_IPV6
393static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
394{
395 switch (IP_VS_FWD_METHOD(cp)) {
396 case IP_VS_CONN_F_MASQ:
397 cp->packet_xmit = ip_vs_nat_xmit_v6;
398 break;
399
400 case IP_VS_CONN_F_TUNNEL:
401 cp->packet_xmit = ip_vs_tunnel_xmit_v6;
402 break;
403
404 case IP_VS_CONN_F_DROUTE:
405 cp->packet_xmit = ip_vs_dr_xmit_v6;
406 break;
407
408 case IP_VS_CONN_F_LOCALNODE:
409 cp->packet_xmit = ip_vs_null_xmit;
410 break;
411
412 case IP_VS_CONN_F_BYPASS:
413 cp->packet_xmit = ip_vs_bypass_xmit_v6;
414 break;
415 }
416}
417#endif
418
372 419
373static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) 420static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
374{ 421{
@@ -402,16 +449,16 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
402 cp->flags |= atomic_read(&dest->conn_flags); 449 cp->flags |= atomic_read(&dest->conn_flags);
403 cp->dest = dest; 450 cp->dest = dest;
404 451
405 IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 452 IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
406 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 453 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
407 "dest->refcnt:%d\n", 454 "dest->refcnt:%d\n",
408 ip_vs_proto_name(cp->protocol), 455 ip_vs_proto_name(cp->protocol),
409 NIPQUAD(cp->caddr), ntohs(cp->cport), 456 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
410 NIPQUAD(cp->vaddr), ntohs(cp->vport), 457 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
411 NIPQUAD(cp->daddr), ntohs(cp->dport), 458 IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
412 ip_vs_fwd_tag(cp), cp->state, 459 ip_vs_fwd_tag(cp), cp->state,
413 cp->flags, atomic_read(&cp->refcnt), 460 cp->flags, atomic_read(&cp->refcnt),
414 atomic_read(&dest->refcnt)); 461 atomic_read(&dest->refcnt));
415 462
416 /* Update the connection counters */ 463 /* Update the connection counters */
417 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 464 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
@@ -444,8 +491,9 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
444 struct ip_vs_dest *dest; 491 struct ip_vs_dest *dest;
445 492
446 if ((cp) && (!cp->dest)) { 493 if ((cp) && (!cp->dest)) {
447 dest = ip_vs_find_dest(cp->daddr, cp->dport, 494 dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
448 cp->vaddr, cp->vport, cp->protocol); 495 &cp->vaddr, cp->vport,
496 cp->protocol);
449 ip_vs_bind_dest(cp, dest); 497 ip_vs_bind_dest(cp, dest);
450 return dest; 498 return dest;
451 } else 499 } else
@@ -464,16 +512,16 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
464 if (!dest) 512 if (!dest)
465 return; 513 return;
466 514
467 IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 515 IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
468 "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " 516 "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
469 "dest->refcnt:%d\n", 517 "dest->refcnt:%d\n",
470 ip_vs_proto_name(cp->protocol), 518 ip_vs_proto_name(cp->protocol),
471 NIPQUAD(cp->caddr), ntohs(cp->cport), 519 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
472 NIPQUAD(cp->vaddr), ntohs(cp->vport), 520 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
473 NIPQUAD(cp->daddr), ntohs(cp->dport), 521 IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
474 ip_vs_fwd_tag(cp), cp->state, 522 ip_vs_fwd_tag(cp), cp->state,
475 cp->flags, atomic_read(&cp->refcnt), 523 cp->flags, atomic_read(&cp->refcnt),
476 atomic_read(&dest->refcnt)); 524 atomic_read(&dest->refcnt));
477 525
478 /* Update the connection counters */ 526 /* Update the connection counters */
479 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { 527 if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
@@ -526,13 +574,16 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
526 !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 574 !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
527 (sysctl_ip_vs_expire_quiescent_template && 575 (sysctl_ip_vs_expire_quiescent_template &&
528 (atomic_read(&dest->weight) == 0))) { 576 (atomic_read(&dest->weight) == 0))) {
529 IP_VS_DBG(9, "check_template: dest not available for " 577 IP_VS_DBG_BUF(9, "check_template: dest not available for "
530 "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " 578 "protocol %s s:%s:%d v:%s:%d "
531 "-> d:%u.%u.%u.%u:%d\n", 579 "-> d:%s:%d\n",
532 ip_vs_proto_name(ct->protocol), 580 ip_vs_proto_name(ct->protocol),
533 NIPQUAD(ct->caddr), ntohs(ct->cport), 581 IP_VS_DBG_ADDR(ct->af, &ct->caddr),
534 NIPQUAD(ct->vaddr), ntohs(ct->vport), 582 ntohs(ct->cport),
535 NIPQUAD(ct->daddr), ntohs(ct->dport)); 583 IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
584 ntohs(ct->vport),
585 IP_VS_DBG_ADDR(ct->af, &ct->daddr),
586 ntohs(ct->dport));
536 587
537 /* 588 /*
538 * Invalidate the connection template 589 * Invalidate the connection template
@@ -625,8 +676,9 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
625 * Create a new connection entry and hash it into the ip_vs_conn_tab 676 * Create a new connection entry and hash it into the ip_vs_conn_tab
626 */ 677 */
627struct ip_vs_conn * 678struct ip_vs_conn *
628ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport, 679ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
629 __be32 daddr, __be16 dport, unsigned flags, 680 const union nf_inet_addr *vaddr, __be16 vport,
681 const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
630 struct ip_vs_dest *dest) 682 struct ip_vs_dest *dest)
631{ 683{
632 struct ip_vs_conn *cp; 684 struct ip_vs_conn *cp;
@@ -640,12 +692,13 @@ ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport
640 692
641 INIT_LIST_HEAD(&cp->c_list); 693 INIT_LIST_HEAD(&cp->c_list);
642 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); 694 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
695 cp->af = af;
643 cp->protocol = proto; 696 cp->protocol = proto;
644 cp->caddr = caddr; 697 ip_vs_addr_copy(af, &cp->caddr, caddr);
645 cp->cport = cport; 698 cp->cport = cport;
646 cp->vaddr = vaddr; 699 ip_vs_addr_copy(af, &cp->vaddr, vaddr);
647 cp->vport = vport; 700 cp->vport = vport;
648 cp->daddr = daddr; 701 ip_vs_addr_copy(af, &cp->daddr, daddr);
649 cp->dport = dport; 702 cp->dport = dport;
650 cp->flags = flags; 703 cp->flags = flags;
651 spin_lock_init(&cp->lock); 704 spin_lock_init(&cp->lock);
@@ -672,7 +725,12 @@ ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport
672 cp->timeout = 3*HZ; 725 cp->timeout = 3*HZ;
673 726
674 /* Bind its packet transmitter */ 727 /* Bind its packet transmitter */
675 ip_vs_bind_xmit(cp); 728#ifdef CONFIG_IP_VS_IPV6
729 if (af == AF_INET6)
730 ip_vs_bind_xmit_v6(cp);
731 else
732#endif
733 ip_vs_bind_xmit(cp);
676 734
677 if (unlikely(pp && atomic_read(&pp->appcnt))) 735 if (unlikely(pp && atomic_read(&pp->appcnt)))
678 ip_vs_bind_app(cp, pp); 736 ip_vs_bind_app(cp, pp);
@@ -760,12 +818,26 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
760 else { 818 else {
761 const struct ip_vs_conn *cp = v; 819 const struct ip_vs_conn *cp = v;
762 820
763 seq_printf(seq, 821#ifdef CONFIG_IP_VS_IPV6
764 "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", 822 if (cp->af == AF_INET6)
823 seq_printf(seq,
824 "%-3s " NIP6_FMT " %04X " NIP6_FMT
825 " %04X " NIP6_FMT " %04X %-11s %7lu\n",
826 ip_vs_proto_name(cp->protocol),
827 NIP6(cp->caddr.in6), ntohs(cp->cport),
828 NIP6(cp->vaddr.in6), ntohs(cp->vport),
829 NIP6(cp->daddr.in6), ntohs(cp->dport),
830 ip_vs_state_name(cp->protocol, cp->state),
831 (cp->timer.expires-jiffies)/HZ);
832 else
833#endif
834 seq_printf(seq,
835 "%-3s %08X %04X %08X %04X"
836 " %08X %04X %-11s %7lu\n",
765 ip_vs_proto_name(cp->protocol), 837 ip_vs_proto_name(cp->protocol),
766 ntohl(cp->caddr), ntohs(cp->cport), 838 ntohl(cp->caddr.ip), ntohs(cp->cport),
767 ntohl(cp->vaddr), ntohs(cp->vport), 839 ntohl(cp->vaddr.ip), ntohs(cp->vport),
768 ntohl(cp->daddr), ntohs(cp->dport), 840 ntohl(cp->daddr.ip), ntohs(cp->dport),
769 ip_vs_state_name(cp->protocol, cp->state), 841 ip_vs_state_name(cp->protocol, cp->state),
770 (cp->timer.expires-jiffies)/HZ); 842 (cp->timer.expires-jiffies)/HZ);
771 } 843 }
@@ -809,12 +881,27 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
809 else { 881 else {
810 const struct ip_vs_conn *cp = v; 882 const struct ip_vs_conn *cp = v;
811 883
812 seq_printf(seq, 884#ifdef CONFIG_IP_VS_IPV6
813 "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n", 885 if (cp->af == AF_INET6)
886 seq_printf(seq,
887 "%-3s " NIP6_FMT " %04X " NIP6_FMT
888 " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n",
889 ip_vs_proto_name(cp->protocol),
890 NIP6(cp->caddr.in6), ntohs(cp->cport),
891 NIP6(cp->vaddr.in6), ntohs(cp->vport),
892 NIP6(cp->daddr.in6), ntohs(cp->dport),
893 ip_vs_state_name(cp->protocol, cp->state),
894 ip_vs_origin_name(cp->flags),
895 (cp->timer.expires-jiffies)/HZ);
896 else
897#endif
898 seq_printf(seq,
899 "%-3s %08X %04X %08X %04X "
900 "%08X %04X %-11s %-6s %7lu\n",
814 ip_vs_proto_name(cp->protocol), 901 ip_vs_proto_name(cp->protocol),
815 ntohl(cp->caddr), ntohs(cp->cport), 902 ntohl(cp->caddr.ip), ntohs(cp->cport),
816 ntohl(cp->vaddr), ntohs(cp->vport), 903 ntohl(cp->vaddr.ip), ntohs(cp->vport),
817 ntohl(cp->daddr), ntohs(cp->dport), 904 ntohl(cp->daddr.ip), ntohs(cp->dport),
818 ip_vs_state_name(cp->protocol, cp->state), 905 ip_vs_state_name(cp->protocol, cp->state),
819 ip_vs_origin_name(cp->flags), 906 ip_vs_origin_name(cp->flags),
820 (cp->timer.expires-jiffies)/HZ); 907 (cp->timer.expires-jiffies)/HZ);
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index a7879eafc3b5..958abf3e5f8c 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -39,6 +39,11 @@
39#include <linux/netfilter.h> 39#include <linux/netfilter.h>
40#include <linux/netfilter_ipv4.h> 40#include <linux/netfilter_ipv4.h>
41 41
42#ifdef CONFIG_IP_VS_IPV6
43#include <net/ipv6.h>
44#include <linux/netfilter_ipv6.h>
45#endif
46
42#include <net/ip_vs.h> 47#include <net/ip_vs.h>
43 48
44 49
@@ -60,6 +65,7 @@ EXPORT_SYMBOL(ip_vs_get_debug_level);
60 65
61/* ID used in ICMP lookups */ 66/* ID used in ICMP lookups */
62#define icmp_id(icmph) (((icmph)->un).echo.id) 67#define icmp_id(icmph) (((icmph)->un).echo.id)
68#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier)
63 69
64const char *ip_vs_proto_name(unsigned proto) 70const char *ip_vs_proto_name(unsigned proto)
65{ 71{
@@ -74,6 +80,10 @@ const char *ip_vs_proto_name(unsigned proto)
74 return "TCP"; 80 return "TCP";
75 case IPPROTO_ICMP: 81 case IPPROTO_ICMP:
76 return "ICMP"; 82 return "ICMP";
83#ifdef CONFIG_IP_VS_IPV6
84 case IPPROTO_ICMPV6:
85 return "ICMPv6";
86#endif
77 default: 87 default:
78 sprintf(buf, "IP_%d", proto); 88 sprintf(buf, "IP_%d", proto);
79 return buf; 89 return buf;
@@ -92,18 +102,18 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
92 struct ip_vs_dest *dest = cp->dest; 102 struct ip_vs_dest *dest = cp->dest;
93 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 103 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
94 spin_lock(&dest->stats.lock); 104 spin_lock(&dest->stats.lock);
95 dest->stats.inpkts++; 105 dest->stats.ustats.inpkts++;
96 dest->stats.inbytes += skb->len; 106 dest->stats.ustats.inbytes += skb->len;
97 spin_unlock(&dest->stats.lock); 107 spin_unlock(&dest->stats.lock);
98 108
99 spin_lock(&dest->svc->stats.lock); 109 spin_lock(&dest->svc->stats.lock);
100 dest->svc->stats.inpkts++; 110 dest->svc->stats.ustats.inpkts++;
101 dest->svc->stats.inbytes += skb->len; 111 dest->svc->stats.ustats.inbytes += skb->len;
102 spin_unlock(&dest->svc->stats.lock); 112 spin_unlock(&dest->svc->stats.lock);
103 113
104 spin_lock(&ip_vs_stats.lock); 114 spin_lock(&ip_vs_stats.lock);
105 ip_vs_stats.inpkts++; 115 ip_vs_stats.ustats.inpkts++;
106 ip_vs_stats.inbytes += skb->len; 116 ip_vs_stats.ustats.inbytes += skb->len;
107 spin_unlock(&ip_vs_stats.lock); 117 spin_unlock(&ip_vs_stats.lock);
108 } 118 }
109} 119}
@@ -115,18 +125,18 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
115 struct ip_vs_dest *dest = cp->dest; 125 struct ip_vs_dest *dest = cp->dest;
116 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 126 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
117 spin_lock(&dest->stats.lock); 127 spin_lock(&dest->stats.lock);
118 dest->stats.outpkts++; 128 dest->stats.ustats.outpkts++;
119 dest->stats.outbytes += skb->len; 129 dest->stats.ustats.outbytes += skb->len;
120 spin_unlock(&dest->stats.lock); 130 spin_unlock(&dest->stats.lock);
121 131
122 spin_lock(&dest->svc->stats.lock); 132 spin_lock(&dest->svc->stats.lock);
123 dest->svc->stats.outpkts++; 133 dest->svc->stats.ustats.outpkts++;
124 dest->svc->stats.outbytes += skb->len; 134 dest->svc->stats.ustats.outbytes += skb->len;
125 spin_unlock(&dest->svc->stats.lock); 135 spin_unlock(&dest->svc->stats.lock);
126 136
127 spin_lock(&ip_vs_stats.lock); 137 spin_lock(&ip_vs_stats.lock);
128 ip_vs_stats.outpkts++; 138 ip_vs_stats.ustats.outpkts++;
129 ip_vs_stats.outbytes += skb->len; 139 ip_vs_stats.ustats.outbytes += skb->len;
130 spin_unlock(&ip_vs_stats.lock); 140 spin_unlock(&ip_vs_stats.lock);
131 } 141 }
132} 142}
@@ -136,15 +146,15 @@ static inline void
136ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) 146ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
137{ 147{
138 spin_lock(&cp->dest->stats.lock); 148 spin_lock(&cp->dest->stats.lock);
139 cp->dest->stats.conns++; 149 cp->dest->stats.ustats.conns++;
140 spin_unlock(&cp->dest->stats.lock); 150 spin_unlock(&cp->dest->stats.lock);
141 151
142 spin_lock(&svc->stats.lock); 152 spin_lock(&svc->stats.lock);
143 svc->stats.conns++; 153 svc->stats.ustats.conns++;
144 spin_unlock(&svc->stats.lock); 154 spin_unlock(&svc->stats.lock);
145 155
146 spin_lock(&ip_vs_stats.lock); 156 spin_lock(&ip_vs_stats.lock);
147 ip_vs_stats.conns++; 157 ip_vs_stats.ustats.conns++;
148 spin_unlock(&ip_vs_stats.lock); 158 spin_unlock(&ip_vs_stats.lock);
149} 159}
150 160
@@ -173,20 +183,28 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
173 __be16 ports[2]) 183 __be16 ports[2])
174{ 184{
175 struct ip_vs_conn *cp = NULL; 185 struct ip_vs_conn *cp = NULL;
176 struct iphdr *iph = ip_hdr(skb); 186 struct ip_vs_iphdr iph;
177 struct ip_vs_dest *dest; 187 struct ip_vs_dest *dest;
178 struct ip_vs_conn *ct; 188 struct ip_vs_conn *ct;
179 __be16 dport; /* destination port to forward */ 189 __be16 dport; /* destination port to forward */
180 __be32 snet; /* source network of the client, after masking */ 190 union nf_inet_addr snet; /* source network of the client,
191 after masking */
192
193 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
181 194
182 /* Mask saddr with the netmask to adjust template granularity */ 195 /* Mask saddr with the netmask to adjust template granularity */
183 snet = iph->saddr & svc->netmask; 196#ifdef CONFIG_IP_VS_IPV6
197 if (svc->af == AF_INET6)
198 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
199 else
200#endif
201 snet.ip = iph.saddr.ip & svc->netmask;
184 202
185 IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " 203 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
186 "mnet %u.%u.%u.%u\n", 204 "mnet %s\n",
187 NIPQUAD(iph->saddr), ntohs(ports[0]), 205 IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
188 NIPQUAD(iph->daddr), ntohs(ports[1]), 206 IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
189 NIPQUAD(snet)); 207 IP_VS_DBG_ADDR(svc->af, &snet));
190 208
191 /* 209 /*
192 * As far as we know, FTP is a very complicated network protocol, and 210 * As far as we know, FTP is a very complicated network protocol, and
@@ -204,11 +222,11 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
204 if (ports[1] == svc->port) { 222 if (ports[1] == svc->port) {
205 /* Check if a template already exists */ 223 /* Check if a template already exists */
206 if (svc->port != FTPPORT) 224 if (svc->port != FTPPORT)
207 ct = ip_vs_ct_in_get(iph->protocol, snet, 0, 225 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
208 iph->daddr, ports[1]); 226 &iph.daddr, ports[1]);
209 else 227 else
210 ct = ip_vs_ct_in_get(iph->protocol, snet, 0, 228 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
211 iph->daddr, 0); 229 &iph.daddr, 0);
212 230
213 if (!ct || !ip_vs_check_template(ct)) { 231 if (!ct || !ip_vs_check_template(ct)) {
214 /* 232 /*
@@ -228,18 +246,18 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
228 * for ftp service. 246 * for ftp service.
229 */ 247 */
230 if (svc->port != FTPPORT) 248 if (svc->port != FTPPORT)
231 ct = ip_vs_conn_new(iph->protocol, 249 ct = ip_vs_conn_new(svc->af, iph.protocol,
232 snet, 0, 250 &snet, 0,
233 iph->daddr, 251 &iph.daddr,
234 ports[1], 252 ports[1],
235 dest->addr, dest->port, 253 &dest->addr, dest->port,
236 IP_VS_CONN_F_TEMPLATE, 254 IP_VS_CONN_F_TEMPLATE,
237 dest); 255 dest);
238 else 256 else
239 ct = ip_vs_conn_new(iph->protocol, 257 ct = ip_vs_conn_new(svc->af, iph.protocol,
240 snet, 0, 258 &snet, 0,
241 iph->daddr, 0, 259 &iph.daddr, 0,
242 dest->addr, 0, 260 &dest->addr, 0,
243 IP_VS_CONN_F_TEMPLATE, 261 IP_VS_CONN_F_TEMPLATE,
244 dest); 262 dest);
245 if (ct == NULL) 263 if (ct == NULL)
@@ -258,12 +276,16 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
258 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> 276 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
259 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0> 277 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
260 */ 278 */
261 if (svc->fwmark) 279 if (svc->fwmark) {
262 ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0, 280 union nf_inet_addr fwmark = {
263 htonl(svc->fwmark), 0); 281 .all = { 0, 0, 0, htonl(svc->fwmark) }
264 else 282 };
265 ct = ip_vs_ct_in_get(iph->protocol, snet, 0, 283
266 iph->daddr, 0); 284 ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
285 &fwmark, 0);
286 } else
287 ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
288 &iph.daddr, 0);
267 289
268 if (!ct || !ip_vs_check_template(ct)) { 290 if (!ct || !ip_vs_check_template(ct)) {
269 /* 291 /*
@@ -282,18 +304,22 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
282 /* 304 /*
283 * Create a template according to the service 305 * Create a template according to the service
284 */ 306 */
285 if (svc->fwmark) 307 if (svc->fwmark) {
286 ct = ip_vs_conn_new(IPPROTO_IP, 308 union nf_inet_addr fwmark = {
287 snet, 0, 309 .all = { 0, 0, 0, htonl(svc->fwmark) }
288 htonl(svc->fwmark), 0, 310 };
289 dest->addr, 0, 311
312 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
313 &snet, 0,
314 &fwmark, 0,
315 &dest->addr, 0,
290 IP_VS_CONN_F_TEMPLATE, 316 IP_VS_CONN_F_TEMPLATE,
291 dest); 317 dest);
292 else 318 } else
293 ct = ip_vs_conn_new(iph->protocol, 319 ct = ip_vs_conn_new(svc->af, iph.protocol,
294 snet, 0, 320 &snet, 0,
295 iph->daddr, 0, 321 &iph.daddr, 0,
296 dest->addr, 0, 322 &dest->addr, 0,
297 IP_VS_CONN_F_TEMPLATE, 323 IP_VS_CONN_F_TEMPLATE,
298 dest); 324 dest);
299 if (ct == NULL) 325 if (ct == NULL)
@@ -310,10 +336,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
310 /* 336 /*
311 * Create a new connection according to the template 337 * Create a new connection according to the template
312 */ 338 */
313 cp = ip_vs_conn_new(iph->protocol, 339 cp = ip_vs_conn_new(svc->af, iph.protocol,
314 iph->saddr, ports[0], 340 &iph.saddr, ports[0],
315 iph->daddr, ports[1], 341 &iph.daddr, ports[1],
316 dest->addr, dport, 342 &dest->addr, dport,
317 0, 343 0,
318 dest); 344 dest);
319 if (cp == NULL) { 345 if (cp == NULL) {
@@ -342,12 +368,12 @@ struct ip_vs_conn *
342ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) 368ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
343{ 369{
344 struct ip_vs_conn *cp = NULL; 370 struct ip_vs_conn *cp = NULL;
345 struct iphdr *iph = ip_hdr(skb); 371 struct ip_vs_iphdr iph;
346 struct ip_vs_dest *dest; 372 struct ip_vs_dest *dest;
347 __be16 _ports[2], *pptr; 373 __be16 _ports[2], *pptr;
348 374
349 pptr = skb_header_pointer(skb, iph->ihl*4, 375 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
350 sizeof(_ports), _ports); 376 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
351 if (pptr == NULL) 377 if (pptr == NULL)
352 return NULL; 378 return NULL;
353 379
@@ -377,22 +403,22 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
377 /* 403 /*
378 * Create a connection entry. 404 * Create a connection entry.
379 */ 405 */
380 cp = ip_vs_conn_new(iph->protocol, 406 cp = ip_vs_conn_new(svc->af, iph.protocol,
381 iph->saddr, pptr[0], 407 &iph.saddr, pptr[0],
382 iph->daddr, pptr[1], 408 &iph.daddr, pptr[1],
383 dest->addr, dest->port?dest->port:pptr[1], 409 &dest->addr, dest->port ? dest->port : pptr[1],
384 0, 410 0,
385 dest); 411 dest);
386 if (cp == NULL) 412 if (cp == NULL)
387 return NULL; 413 return NULL;
388 414
389 IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " 415 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
390 "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", 416 "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
391 ip_vs_fwd_tag(cp), 417 ip_vs_fwd_tag(cp),
392 NIPQUAD(cp->caddr), ntohs(cp->cport), 418 IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
393 NIPQUAD(cp->vaddr), ntohs(cp->vport), 419 IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
394 NIPQUAD(cp->daddr), ntohs(cp->dport), 420 IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
395 cp->flags, atomic_read(&cp->refcnt)); 421 cp->flags, atomic_read(&cp->refcnt));
396 422
397 ip_vs_conn_stats(cp, svc); 423 ip_vs_conn_stats(cp, svc);
398 return cp; 424 return cp;
@@ -408,31 +434,39 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
408 struct ip_vs_protocol *pp) 434 struct ip_vs_protocol *pp)
409{ 435{
410 __be16 _ports[2], *pptr; 436 __be16 _ports[2], *pptr;
411 struct iphdr *iph = ip_hdr(skb); 437 struct ip_vs_iphdr iph;
438 int unicast;
439 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
412 440
413 pptr = skb_header_pointer(skb, iph->ihl*4, 441 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
414 sizeof(_ports), _ports);
415 if (pptr == NULL) { 442 if (pptr == NULL) {
416 ip_vs_service_put(svc); 443 ip_vs_service_put(svc);
417 return NF_DROP; 444 return NF_DROP;
418 } 445 }
419 446
447#ifdef CONFIG_IP_VS_IPV6
448 if (svc->af == AF_INET6)
449 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
450 else
451#endif
452 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
453
420 /* if it is fwmark-based service, the cache_bypass sysctl is up 454 /* if it is fwmark-based service, the cache_bypass sysctl is up
421 and the destination is RTN_UNICAST (and not local), then create 455 and the destination is a non-local unicast, then create
422 a cache_bypass connection entry */ 456 a cache_bypass connection entry */
423 if (sysctl_ip_vs_cache_bypass && svc->fwmark 457 if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
424 && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) {
425 int ret, cs; 458 int ret, cs;
426 struct ip_vs_conn *cp; 459 struct ip_vs_conn *cp;
460 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
427 461
428 ip_vs_service_put(svc); 462 ip_vs_service_put(svc);
429 463
430 /* create a new connection entry */ 464 /* create a new connection entry */
431 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); 465 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
432 cp = ip_vs_conn_new(iph->protocol, 466 cp = ip_vs_conn_new(svc->af, iph.protocol,
433 iph->saddr, pptr[0], 467 &iph.saddr, pptr[0],
434 iph->daddr, pptr[1], 468 &iph.daddr, pptr[1],
435 0, 0, 469 &daddr, 0,
436 IP_VS_CONN_F_BYPASS, 470 IP_VS_CONN_F_BYPASS,
437 NULL); 471 NULL);
438 if (cp == NULL) 472 if (cp == NULL)
@@ -473,7 +507,14 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
473 * created, the TCP RST packet cannot be sent, instead that 507 * created, the TCP RST packet cannot be sent, instead that
474 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ 508 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
475 */ 509 */
476 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 510#ifdef CONFIG_IP_VS_IPV6
511 if (svc->af == AF_INET6)
512 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
513 skb->dev);
514 else
515#endif
516 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
517
477 return NF_DROP; 518 return NF_DROP;
478} 519}
479 520
@@ -512,6 +553,14 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
512 return err; 553 return err;
513} 554}
514 555
556#ifdef CONFIG_IP_VS_IPV6
557static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
558{
559 /* TODO IPv6: Find out what to do here for IPv6 */
560 return 0;
561}
562#endif
563
515/* 564/*
516 * Packet has been made sufficiently writable in caller 565 * Packet has been made sufficiently writable in caller
517 * - inout: 1=in->out, 0=out->in 566 * - inout: 1=in->out, 0=out->in
@@ -526,14 +575,14 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
526 struct iphdr *ciph = (struct iphdr *)(icmph + 1); 575 struct iphdr *ciph = (struct iphdr *)(icmph + 1);
527 576
528 if (inout) { 577 if (inout) {
529 iph->saddr = cp->vaddr; 578 iph->saddr = cp->vaddr.ip;
530 ip_send_check(iph); 579 ip_send_check(iph);
531 ciph->daddr = cp->vaddr; 580 ciph->daddr = cp->vaddr.ip;
532 ip_send_check(ciph); 581 ip_send_check(ciph);
533 } else { 582 } else {
534 iph->daddr = cp->daddr; 583 iph->daddr = cp->daddr.ip;
535 ip_send_check(iph); 584 ip_send_check(iph);
536 ciph->saddr = cp->daddr; 585 ciph->saddr = cp->daddr.ip;
537 ip_send_check(ciph); 586 ip_send_check(ciph);
538 } 587 }
539 588
@@ -560,21 +609,112 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
560 "Forwarding altered incoming ICMP"); 609 "Forwarding altered incoming ICMP");
561} 610}
562 611
612#ifdef CONFIG_IP_VS_IPV6
613void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
614 struct ip_vs_conn *cp, int inout)
615{
616 struct ipv6hdr *iph = ipv6_hdr(skb);
617 unsigned int icmp_offset = sizeof(struct ipv6hdr);
618 struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) +
619 icmp_offset);
620 struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1);
621
622 if (inout) {
623 iph->saddr = cp->vaddr.in6;
624 ciph->daddr = cp->vaddr.in6;
625 } else {
626 iph->daddr = cp->daddr.in6;
627 ciph->saddr = cp->daddr.in6;
628 }
629
630 /* the TCP/UDP port */
631 if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
632 __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
633
634 if (inout)
635 ports[1] = cp->vport;
636 else
637 ports[0] = cp->dport;
638 }
639
640 /* And finally the ICMP checksum */
641 icmph->icmp6_cksum = 0;
642 /* TODO IPv6: is this correct for ICMPv6? */
643 ip_vs_checksum_complete(skb, icmp_offset);
644 skb->ip_summed = CHECKSUM_UNNECESSARY;
645
646 if (inout)
647 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
648 "Forwarding altered outgoing ICMPv6");
649 else
650 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
651 "Forwarding altered incoming ICMPv6");
652}
653#endif
654
655/* Handle relevant response ICMP messages - forward to the right
656 * destination host. Used for NAT and local client.
657 */
658static int handle_response_icmp(int af, struct sk_buff *skb,
659 union nf_inet_addr *snet,
660 __u8 protocol, struct ip_vs_conn *cp,
661 struct ip_vs_protocol *pp,
662 unsigned int offset, unsigned int ihl)
663{
664 unsigned int verdict = NF_DROP;
665
666 if (IP_VS_FWD_METHOD(cp) != 0) {
667 IP_VS_ERR("shouldn't reach here, because the box is on the "
668 "half connection in the tun/dr module.\n");
669 }
670
671 /* Ensure the checksum is correct */
672 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
673 /* Failed checksum! */
674 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
675 IP_VS_DBG_ADDR(af, snet));
676 goto out;
677 }
678
679 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
680 offset += 2 * sizeof(__u16);
681 if (!skb_make_writable(skb, offset))
682 goto out;
683
684#ifdef CONFIG_IP_VS_IPV6
685 if (af == AF_INET6)
686 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
687 else
688#endif
689 ip_vs_nat_icmp(skb, pp, cp, 1);
690
691 /* do the statistics and put it back */
692 ip_vs_out_stats(cp, skb);
693
694 skb->ipvs_property = 1;
695 verdict = NF_ACCEPT;
696
697out:
698 __ip_vs_conn_put(cp);
699
700 return verdict;
701}
702
563/* 703/*
564 * Handle ICMP messages in the inside-to-outside direction (outgoing). 704 * Handle ICMP messages in the inside-to-outside direction (outgoing).
565 * Find any that might be relevant, check against existing connections, 705 * Find any that might be relevant, check against existing connections.
566 * forward to the right destination host if relevant.
567 * Currently handles error types - unreachable, quench, ttl exceeded. 706 * Currently handles error types - unreachable, quench, ttl exceeded.
568 * (Only used in VS/NAT)
569 */ 707 */
570static int ip_vs_out_icmp(struct sk_buff *skb, int *related) 708static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
571{ 709{
572 struct iphdr *iph; 710 struct iphdr *iph;
573 struct icmphdr _icmph, *ic; 711 struct icmphdr _icmph, *ic;
574 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 712 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
713 struct ip_vs_iphdr ciph;
575 struct ip_vs_conn *cp; 714 struct ip_vs_conn *cp;
576 struct ip_vs_protocol *pp; 715 struct ip_vs_protocol *pp;
577 unsigned int offset, ihl, verdict; 716 unsigned int offset, ihl;
717 union nf_inet_addr snet;
578 718
579 *related = 1; 719 *related = 1;
580 720
@@ -627,102 +767,231 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
627 767
628 offset += cih->ihl * 4; 768 offset += cih->ihl * 4;
629 769
770 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
630 /* The embedded headers contain source and dest in reverse order */ 771 /* The embedded headers contain source and dest in reverse order */
631 cp = pp->conn_out_get(skb, pp, cih, offset, 1); 772 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
632 if (!cp) 773 if (!cp)
633 return NF_ACCEPT; 774 return NF_ACCEPT;
634 775
635 verdict = NF_DROP; 776 snet.ip = iph->saddr;
777 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
778 pp, offset, ihl);
779}
636 780
637 if (IP_VS_FWD_METHOD(cp) != 0) { 781#ifdef CONFIG_IP_VS_IPV6
638 IP_VS_ERR("shouldn't reach here, because the box is on the " 782static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
639 "half connection in the tun/dr module.\n"); 783{
784 struct ipv6hdr *iph;
785 struct icmp6hdr _icmph, *ic;
786 struct ipv6hdr _ciph, *cih; /* The ip header contained
787 within the ICMP */
788 struct ip_vs_iphdr ciph;
789 struct ip_vs_conn *cp;
790 struct ip_vs_protocol *pp;
791 unsigned int offset;
792 union nf_inet_addr snet;
793
794 *related = 1;
795
796 /* reassemble IP fragments */
797 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
798 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
799 return NF_STOLEN;
640 } 800 }
641 801
642 /* Ensure the checksum is correct */ 802 iph = ipv6_hdr(skb);
643 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { 803 offset = sizeof(struct ipv6hdr);
644 /* Failed checksum! */ 804 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
645 IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", 805 if (ic == NULL)
646 NIPQUAD(iph->saddr)); 806 return NF_DROP;
647 goto out; 807
808 IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
809 ic->icmp6_type, ntohs(icmpv6_id(ic)),
810 NIP6(iph->saddr), NIP6(iph->daddr));
811
812 /*
813 * Work through seeing if this is for us.
814 * These checks are supposed to be in an order that means easy
815 * things are checked first to speed up processing.... however
816 * this means that some packets will manage to get a long way
817 * down this stack and then be rejected, but that's life.
818 */
819 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
820 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
821 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
822 *related = 0;
823 return NF_ACCEPT;
648 } 824 }
649 825
650 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) 826 /* Now find the contained IP header */
651 offset += 2 * sizeof(__u16); 827 offset += sizeof(_icmph);
652 if (!skb_make_writable(skb, offset)) 828 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
653 goto out; 829 if (cih == NULL)
830 return NF_ACCEPT; /* The packet looks wrong, ignore */
654 831
655 ip_vs_nat_icmp(skb, pp, cp, 1); 832 pp = ip_vs_proto_get(cih->nexthdr);
833 if (!pp)
834 return NF_ACCEPT;
656 835
657 /* do the statistics and put it back */ 836 /* Is the embedded protocol header present? */
658 ip_vs_out_stats(cp, skb); 837 /* TODO: we don't support fragmentation at the moment anyways */
838 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
839 return NF_ACCEPT;
659 840
660 skb->ipvs_property = 1; 841 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
661 verdict = NF_ACCEPT;
662 842
663 out: 843 offset += sizeof(struct ipv6hdr);
664 __ip_vs_conn_put(cp);
665 844
666 return verdict; 845 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
846 /* The embedded headers contain source and dest in reverse order */
847 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
848 if (!cp)
849 return NF_ACCEPT;
850
851 ipv6_addr_copy(&snet.in6, &iph->saddr);
852 return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
853 pp, offset, sizeof(struct ipv6hdr));
667} 854}
855#endif
668 856
669static inline int is_tcp_reset(const struct sk_buff *skb) 857static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
670{ 858{
671 struct tcphdr _tcph, *th; 859 struct tcphdr _tcph, *th;
672 860
673 th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); 861 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
674 if (th == NULL) 862 if (th == NULL)
675 return 0; 863 return 0;
676 return th->rst; 864 return th->rst;
677} 865}
678 866
867/* Handle response packets: rewrite addresses and send away...
868 * Used for NAT and local client.
869 */
870static unsigned int
871handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
872 struct ip_vs_conn *cp, int ihl)
873{
874 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
875
876 if (!skb_make_writable(skb, ihl))
877 goto drop;
878
879 /* mangle the packet */
880 if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
881 goto drop;
882
883#ifdef CONFIG_IP_VS_IPV6
884 if (af == AF_INET6)
885 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
886 else
887#endif
888 {
889 ip_hdr(skb)->saddr = cp->vaddr.ip;
890 ip_send_check(ip_hdr(skb));
891 }
892
893 /* For policy routing, packets originating from this
894 * machine itself may be routed differently to packets
895 * passing through. We want this packet to be routed as
896 * if it came from this machine itself. So re-compute
897 * the routing information.
898 */
899#ifdef CONFIG_IP_VS_IPV6
900 if (af == AF_INET6) {
901 if (ip6_route_me_harder(skb) != 0)
902 goto drop;
903 } else
904#endif
905 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
906 goto drop;
907
908 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
909
910 ip_vs_out_stats(cp, skb);
911 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
912 ip_vs_conn_put(cp);
913
914 skb->ipvs_property = 1;
915
916 LeaveFunction(11);
917 return NF_ACCEPT;
918
919drop:
920 ip_vs_conn_put(cp);
921 kfree_skb(skb);
922 return NF_STOLEN;
923}
924
679/* 925/*
680 * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. 926 * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
681 * Check if outgoing packet belongs to the established ip_vs_conn, 927 * Check if outgoing packet belongs to the established ip_vs_conn.
682 * rewrite addresses of the packet and send it on its way...
683 */ 928 */
684static unsigned int 929static unsigned int
685ip_vs_out(unsigned int hooknum, struct sk_buff *skb, 930ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
686 const struct net_device *in, const struct net_device *out, 931 const struct net_device *in, const struct net_device *out,
687 int (*okfn)(struct sk_buff *)) 932 int (*okfn)(struct sk_buff *))
688{ 933{
689 struct iphdr *iph; 934 struct ip_vs_iphdr iph;
690 struct ip_vs_protocol *pp; 935 struct ip_vs_protocol *pp;
691 struct ip_vs_conn *cp; 936 struct ip_vs_conn *cp;
692 int ihl; 937 int af;
693 938
694 EnterFunction(11); 939 EnterFunction(11);
695 940
941 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
942
696 if (skb->ipvs_property) 943 if (skb->ipvs_property)
697 return NF_ACCEPT; 944 return NF_ACCEPT;
698 945
699 iph = ip_hdr(skb); 946 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
700 if (unlikely(iph->protocol == IPPROTO_ICMP)) { 947#ifdef CONFIG_IP_VS_IPV6
701 int related, verdict = ip_vs_out_icmp(skb, &related); 948 if (af == AF_INET6) {
949 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
950 int related, verdict = ip_vs_out_icmp_v6(skb, &related);
702 951
703 if (related) 952 if (related)
704 return verdict; 953 return verdict;
705 iph = ip_hdr(skb); 954 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
706 } 955 }
956 } else
957#endif
958 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
959 int related, verdict = ip_vs_out_icmp(skb, &related);
707 960
708 pp = ip_vs_proto_get(iph->protocol); 961 if (related)
962 return verdict;
963 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
964 }
965
966 pp = ip_vs_proto_get(iph.protocol);
709 if (unlikely(!pp)) 967 if (unlikely(!pp))
710 return NF_ACCEPT; 968 return NF_ACCEPT;
711 969
712 /* reassemble IP fragments */ 970 /* reassemble IP fragments */
713 if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) && 971#ifdef CONFIG_IP_VS_IPV6
714 !pp->dont_defrag)) { 972 if (af == AF_INET6) {
715 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) 973 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
716 return NF_STOLEN; 974 int related, verdict = ip_vs_out_icmp_v6(skb, &related);
717 iph = ip_hdr(skb); 975
718 } 976 if (related)
977 return verdict;
719 978
720 ihl = iph->ihl << 2; 979 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
980 }
981 } else
982#endif
983 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
984 !pp->dont_defrag)) {
985 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
986 return NF_STOLEN;
987
988 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
989 }
721 990
722 /* 991 /*
723 * Check if the packet belongs to an existing entry 992 * Check if the packet belongs to an existing entry
724 */ 993 */
725 cp = pp->conn_out_get(skb, pp, iph, ihl, 0); 994 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
726 995
727 if (unlikely(!cp)) { 996 if (unlikely(!cp)) {
728 if (sysctl_ip_vs_nat_icmp_send && 997 if (sysctl_ip_vs_nat_icmp_send &&
@@ -730,21 +999,31 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
730 pp->protocol == IPPROTO_UDP)) { 999 pp->protocol == IPPROTO_UDP)) {
731 __be16 _ports[2], *pptr; 1000 __be16 _ports[2], *pptr;
732 1001
733 pptr = skb_header_pointer(skb, ihl, 1002 pptr = skb_header_pointer(skb, iph.len,
734 sizeof(_ports), _ports); 1003 sizeof(_ports), _ports);
735 if (pptr == NULL) 1004 if (pptr == NULL)
736 return NF_ACCEPT; /* Not for me */ 1005 return NF_ACCEPT; /* Not for me */
737 if (ip_vs_lookup_real_service(iph->protocol, 1006 if (ip_vs_lookup_real_service(af, iph.protocol,
738 iph->saddr, pptr[0])) { 1007 &iph.saddr,
1008 pptr[0])) {
739 /* 1009 /*
740 * Notify the real server: there is no 1010 * Notify the real server: there is no
741 * existing entry if it is not RST 1011 * existing entry if it is not RST
742 * packet or not TCP packet. 1012 * packet or not TCP packet.
743 */ 1013 */
744 if (iph->protocol != IPPROTO_TCP 1014 if (iph.protocol != IPPROTO_TCP
745 || !is_tcp_reset(skb)) { 1015 || !is_tcp_reset(skb, iph.len)) {
746 icmp_send(skb,ICMP_DEST_UNREACH, 1016#ifdef CONFIG_IP_VS_IPV6
747 ICMP_PORT_UNREACH, 0); 1017 if (af == AF_INET6)
1018 icmpv6_send(skb,
1019 ICMPV6_DEST_UNREACH,
1020 ICMPV6_PORT_UNREACH,
1021 0, skb->dev);
1022 else
1023#endif
1024 icmp_send(skb,
1025 ICMP_DEST_UNREACH,
1026 ICMP_PORT_UNREACH, 0);
748 return NF_DROP; 1027 return NF_DROP;
749 } 1028 }
750 } 1029 }
@@ -754,41 +1033,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
754 return NF_ACCEPT; 1033 return NF_ACCEPT;
755 } 1034 }
756 1035
757 IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); 1036 return handle_response(af, skb, pp, cp, iph.len);
758
759 if (!skb_make_writable(skb, ihl))
760 goto drop;
761
762 /* mangle the packet */
763 if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
764 goto drop;
765 ip_hdr(skb)->saddr = cp->vaddr;
766 ip_send_check(ip_hdr(skb));
767
768 /* For policy routing, packets originating from this
769 * machine itself may be routed differently to packets
770 * passing through. We want this packet to be routed as
771 * if it came from this machine itself. So re-compute
772 * the routing information.
773 */
774 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
775 goto drop;
776
777 IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
778
779 ip_vs_out_stats(cp, skb);
780 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
781 ip_vs_conn_put(cp);
782
783 skb->ipvs_property = 1;
784
785 LeaveFunction(11);
786 return NF_ACCEPT;
787
788 drop:
789 ip_vs_conn_put(cp);
790 kfree_skb(skb);
791 return NF_STOLEN;
792} 1037}
793 1038
794 1039
@@ -804,9 +1049,11 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
804 struct iphdr *iph; 1049 struct iphdr *iph;
805 struct icmphdr _icmph, *ic; 1050 struct icmphdr _icmph, *ic;
806 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 1051 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
1052 struct ip_vs_iphdr ciph;
807 struct ip_vs_conn *cp; 1053 struct ip_vs_conn *cp;
808 struct ip_vs_protocol *pp; 1054 struct ip_vs_protocol *pp;
809 unsigned int offset, ihl, verdict; 1055 unsigned int offset, ihl, verdict;
1056 union nf_inet_addr snet;
810 1057
811 *related = 1; 1058 *related = 1;
812 1059
@@ -860,10 +1107,20 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
860 1107
861 offset += cih->ihl * 4; 1108 offset += cih->ihl * 4;
862 1109
1110 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
863 /* The embedded headers contain source and dest in reverse order */ 1111 /* The embedded headers contain source and dest in reverse order */
864 cp = pp->conn_in_get(skb, pp, cih, offset, 1); 1112 cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
865 if (!cp) 1113 if (!cp) {
1114 /* The packet could also belong to a local client */
1115 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1116 if (cp) {
1117 snet.ip = iph->saddr;
1118 return handle_response_icmp(AF_INET, skb, &snet,
1119 cih->protocol, cp, pp,
1120 offset, ihl);
1121 }
866 return NF_ACCEPT; 1122 return NF_ACCEPT;
1123 }
867 1124
868 verdict = NF_DROP; 1125 verdict = NF_DROP;
869 1126
@@ -888,6 +1145,105 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
888 return verdict; 1145 return verdict;
889} 1146}
890 1147
1148#ifdef CONFIG_IP_VS_IPV6
1149static int
1150ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1151{
1152 struct ipv6hdr *iph;
1153 struct icmp6hdr _icmph, *ic;
1154 struct ipv6hdr _ciph, *cih; /* The ip header contained
1155 within the ICMP */
1156 struct ip_vs_iphdr ciph;
1157 struct ip_vs_conn *cp;
1158 struct ip_vs_protocol *pp;
1159 unsigned int offset, verdict;
1160 union nf_inet_addr snet;
1161
1162 *related = 1;
1163
1164 /* reassemble IP fragments */
1165 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1166 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1167 IP_DEFRAG_VS_IN :
1168 IP_DEFRAG_VS_FWD))
1169 return NF_STOLEN;
1170 }
1171
1172 iph = ipv6_hdr(skb);
1173 offset = sizeof(struct ipv6hdr);
1174 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1175 if (ic == NULL)
1176 return NF_DROP;
1177
1178 IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n",
1179 ic->icmp6_type, ntohs(icmpv6_id(ic)),
1180 NIP6(iph->saddr), NIP6(iph->daddr));
1181
1182 /*
1183 * Work through seeing if this is for us.
1184 * These checks are supposed to be in an order that means easy
1185 * things are checked first to speed up processing.... however
1186 * this means that some packets will manage to get a long way
1187 * down this stack and then be rejected, but that's life.
1188 */
1189 if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1190 (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1191 (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1192 *related = 0;
1193 return NF_ACCEPT;
1194 }
1195
1196 /* Now find the contained IP header */
1197 offset += sizeof(_icmph);
1198 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1199 if (cih == NULL)
1200 return NF_ACCEPT; /* The packet looks wrong, ignore */
1201
1202 pp = ip_vs_proto_get(cih->nexthdr);
1203 if (!pp)
1204 return NF_ACCEPT;
1205
1206 /* Is the embedded protocol header present? */
1207 /* TODO: we don't support fragmentation at the moment anyways */
1208 if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1209 return NF_ACCEPT;
1210
1211 IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1212
1213 offset += sizeof(struct ipv6hdr);
1214
1215 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1216 /* The embedded headers contain source and dest in reverse order */
1217 cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1218 if (!cp) {
1219 /* The packet could also belong to a local client */
1220 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1221 if (cp) {
1222 ipv6_addr_copy(&snet.in6, &iph->saddr);
1223 return handle_response_icmp(AF_INET6, skb, &snet,
1224 cih->nexthdr,
1225 cp, pp, offset,
1226 sizeof(struct ipv6hdr));
1227 }
1228 return NF_ACCEPT;
1229 }
1230
1231 verdict = NF_DROP;
1232
1233 /* do the statistics and put it back */
1234 ip_vs_in_stats(cp, skb);
1235 if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
1236 offset += 2 * sizeof(__u16);
1237 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1238 /* do not touch skb anymore */
1239
1240 __ip_vs_conn_put(cp);
1241
1242 return verdict;
1243}
1244#endif
1245
1246
891/* 1247/*
892 * Check if it's for virtual services, look it up, 1248 * Check if it's for virtual services, look it up,
893 * and send it on its way... 1249 * and send it on its way...
@@ -897,50 +1253,54 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
897 const struct net_device *in, const struct net_device *out, 1253 const struct net_device *in, const struct net_device *out,
898 int (*okfn)(struct sk_buff *)) 1254 int (*okfn)(struct sk_buff *))
899{ 1255{
900 struct iphdr *iph; 1256 struct ip_vs_iphdr iph;
901 struct ip_vs_protocol *pp; 1257 struct ip_vs_protocol *pp;
902 struct ip_vs_conn *cp; 1258 struct ip_vs_conn *cp;
903 int ret, restart; 1259 int ret, restart, af;
904 int ihl; 1260
1261 af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1262
1263 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
905 1264
906 /* 1265 /*
907 * Big tappo: only PACKET_HOST (neither loopback nor mcasts) 1266 * Big tappo: only PACKET_HOST, including loopback for local client
908 * ... don't know why 1st test DOES NOT include 2nd (?) 1267 * Don't handle local packets on IPv6 for now
909 */ 1268 */
910 if (unlikely(skb->pkt_type != PACKET_HOST 1269 if (unlikely(skb->pkt_type != PACKET_HOST)) {
911 || skb->dev->flags & IFF_LOOPBACK || skb->sk)) { 1270 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
912 IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", 1271 skb->pkt_type,
913 skb->pkt_type, 1272 iph.protocol,
914 ip_hdr(skb)->protocol, 1273 IP_VS_DBG_ADDR(af, &iph.daddr));
915 NIPQUAD(ip_hdr(skb)->daddr));
916 return NF_ACCEPT; 1274 return NF_ACCEPT;
917 } 1275 }
918 1276
919 iph = ip_hdr(skb); 1277 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
920 if (unlikely(iph->protocol == IPPROTO_ICMP)) {
921 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); 1278 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
922 1279
923 if (related) 1280 if (related)
924 return verdict; 1281 return verdict;
925 iph = ip_hdr(skb); 1282 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
926 } 1283 }
927 1284
928 /* Protocol supported? */ 1285 /* Protocol supported? */
929 pp = ip_vs_proto_get(iph->protocol); 1286 pp = ip_vs_proto_get(iph.protocol);
930 if (unlikely(!pp)) 1287 if (unlikely(!pp))
931 return NF_ACCEPT; 1288 return NF_ACCEPT;
932 1289
933 ihl = iph->ihl << 2;
934
935 /* 1290 /*
936 * Check if the packet belongs to an existing connection entry 1291 * Check if the packet belongs to an existing connection entry
937 */ 1292 */
938 cp = pp->conn_in_get(skb, pp, iph, ihl, 0); 1293 cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
939 1294
940 if (unlikely(!cp)) { 1295 if (unlikely(!cp)) {
941 int v; 1296 int v;
942 1297
943 if (!pp->conn_schedule(skb, pp, &v, &cp)) 1298 /* For local client packets, it could be a response */
1299 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1300 if (cp)
1301 return handle_response(af, skb, pp, cp, iph.len);
1302
1303 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
944 return v; 1304 return v;
945 } 1305 }
946 1306
@@ -984,7 +1344,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
984 * encorage the standby servers to update the connections timeout 1344 * encorage the standby servers to update the connections timeout
985 */ 1345 */
986 atomic_inc(&cp->in_pkts); 1346 atomic_inc(&cp->in_pkts);
987 if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && 1347 if (af == AF_INET &&
1348 (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
988 (((cp->protocol != IPPROTO_TCP || 1349 (((cp->protocol != IPPROTO_TCP ||
989 cp->state == IP_VS_TCP_S_ESTABLISHED) && 1350 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
990 (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] 1351 (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
@@ -1023,6 +1384,21 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1023 return ip_vs_in_icmp(skb, &r, hooknum); 1384 return ip_vs_in_icmp(skb, &r, hooknum);
1024} 1385}
1025 1386
1387#ifdef CONFIG_IP_VS_IPV6
1388static unsigned int
1389ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1390 const struct net_device *in, const struct net_device *out,
1391 int (*okfn)(struct sk_buff *))
1392{
1393 int r;
1394
1395 if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1396 return NF_ACCEPT;
1397
1398 return ip_vs_in_icmp_v6(skb, &r, hooknum);
1399}
1400#endif
1401
1026 1402
1027static struct nf_hook_ops ip_vs_ops[] __read_mostly = { 1403static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1028 /* After packet filtering, forward packet through VS/DR, VS/TUN, 1404 /* After packet filtering, forward packet through VS/DR, VS/TUN,
@@ -1060,6 +1436,43 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1060 .hooknum = NF_INET_POST_ROUTING, 1436 .hooknum = NF_INET_POST_ROUTING,
1061 .priority = NF_IP_PRI_NAT_SRC-1, 1437 .priority = NF_IP_PRI_NAT_SRC-1,
1062 }, 1438 },
1439#ifdef CONFIG_IP_VS_IPV6
1440 /* After packet filtering, forward packet through VS/DR, VS/TUN,
1441 * or VS/NAT(change destination), so that filtering rules can be
1442 * applied to IPVS. */
1443 {
1444 .hook = ip_vs_in,
1445 .owner = THIS_MODULE,
1446 .pf = PF_INET6,
1447 .hooknum = NF_INET_LOCAL_IN,
1448 .priority = 100,
1449 },
1450 /* After packet filtering, change source only for VS/NAT */
1451 {
1452 .hook = ip_vs_out,
1453 .owner = THIS_MODULE,
1454 .pf = PF_INET6,
1455 .hooknum = NF_INET_FORWARD,
1456 .priority = 100,
1457 },
1458 /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1459 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1460 {
1461 .hook = ip_vs_forward_icmp_v6,
1462 .owner = THIS_MODULE,
1463 .pf = PF_INET6,
1464 .hooknum = NF_INET_FORWARD,
1465 .priority = 99,
1466 },
1467 /* Before the netfilter connection tracking, exit from POST_ROUTING */
1468 {
1469 .hook = ip_vs_post_routing,
1470 .owner = THIS_MODULE,
1471 .pf = PF_INET6,
1472 .hooknum = NF_INET_POST_ROUTING,
1473 .priority = NF_IP6_PRI_NAT_SRC-1,
1474 },
1475#endif
1063}; 1476};
1064 1477
1065 1478
@@ -1070,10 +1483,12 @@ static int __init ip_vs_init(void)
1070{ 1483{
1071 int ret; 1484 int ret;
1072 1485
1486 ip_vs_estimator_init();
1487
1073 ret = ip_vs_control_init(); 1488 ret = ip_vs_control_init();
1074 if (ret < 0) { 1489 if (ret < 0) {
1075 IP_VS_ERR("can't setup control.\n"); 1490 IP_VS_ERR("can't setup control.\n");
1076 goto cleanup_nothing; 1491 goto cleanup_estimator;
1077 } 1492 }
1078 1493
1079 ip_vs_protocol_init(); 1494 ip_vs_protocol_init();
@@ -1106,7 +1521,8 @@ static int __init ip_vs_init(void)
1106 cleanup_protocol: 1521 cleanup_protocol:
1107 ip_vs_protocol_cleanup(); 1522 ip_vs_protocol_cleanup();
1108 ip_vs_control_cleanup(); 1523 ip_vs_control_cleanup();
1109 cleanup_nothing: 1524 cleanup_estimator:
1525 ip_vs_estimator_cleanup();
1110 return ret; 1526 return ret;
1111} 1527}
1112 1528
@@ -1117,6 +1533,7 @@ static void __exit ip_vs_cleanup(void)
1117 ip_vs_app_cleanup(); 1533 ip_vs_app_cleanup();
1118 ip_vs_protocol_cleanup(); 1534 ip_vs_protocol_cleanup();
1119 ip_vs_control_cleanup(); 1535 ip_vs_control_cleanup();
1536 ip_vs_estimator_cleanup();
1120 IP_VS_INFO("ipvs unloaded.\n"); 1537 IP_VS_INFO("ipvs unloaded.\n");
1121} 1538}
1122 1539
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 6379705a8dcb..771551d8fba9 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -35,8 +35,13 @@
35 35
36#include <net/net_namespace.h> 36#include <net/net_namespace.h>
37#include <net/ip.h> 37#include <net/ip.h>
38#ifdef CONFIG_IP_VS_IPV6
39#include <net/ipv6.h>
40#include <net/ip6_route.h>
41#endif
38#include <net/route.h> 42#include <net/route.h>
39#include <net/sock.h> 43#include <net/sock.h>
44#include <net/genetlink.h>
40 45
41#include <asm/uaccess.h> 46#include <asm/uaccess.h>
42 47
@@ -90,6 +95,26 @@ int ip_vs_get_debug_level(void)
90} 95}
91#endif 96#endif
92 97
98#ifdef CONFIG_IP_VS_IPV6
99/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
100static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
101{
102 struct rt6_info *rt;
103 struct flowi fl = {
104 .oif = 0,
105 .nl_u = {
106 .ip6_u = {
107 .daddr = *addr,
108 .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
109 };
110
111 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
112 if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
113 return 1;
114
115 return 0;
116}
117#endif
93/* 118/*
94 * update_defense_level is called from keventd and from sysctl, 119 * update_defense_level is called from keventd and from sysctl,
95 * so it needs to protect itself from softirqs 120 * so it needs to protect itself from softirqs
@@ -281,11 +306,19 @@ static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
281 * Returns hash value for virtual service 306 * Returns hash value for virtual service
282 */ 307 */
283static __inline__ unsigned 308static __inline__ unsigned
284ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port) 309ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
310 __be16 port)
285{ 311{
286 register unsigned porth = ntohs(port); 312 register unsigned porth = ntohs(port);
313 __be32 addr_fold = addr->ip;
314
315#ifdef CONFIG_IP_VS_IPV6
316 if (af == AF_INET6)
317 addr_fold = addr->ip6[0]^addr->ip6[1]^
318 addr->ip6[2]^addr->ip6[3];
319#endif
287 320
288 return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) 321 return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
289 & IP_VS_SVC_TAB_MASK; 322 & IP_VS_SVC_TAB_MASK;
290} 323}
291 324
@@ -316,7 +349,8 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
316 /* 349 /*
317 * Hash it by <protocol,addr,port> in ip_vs_svc_table 350 * Hash it by <protocol,addr,port> in ip_vs_svc_table
318 */ 351 */
319 hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); 352 hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr,
353 svc->port);
320 list_add(&svc->s_list, &ip_vs_svc_table[hash]); 354 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
321 } else { 355 } else {
322 /* 356 /*
@@ -362,17 +396,19 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
362/* 396/*
363 * Get service by {proto,addr,port} in the service table. 397 * Get service by {proto,addr,port} in the service table.
364 */ 398 */
365static __inline__ struct ip_vs_service * 399static inline struct ip_vs_service *
366__ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport) 400__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr,
401 __be16 vport)
367{ 402{
368 unsigned hash; 403 unsigned hash;
369 struct ip_vs_service *svc; 404 struct ip_vs_service *svc;
370 405
371 /* Check for "full" addressed entries */ 406 /* Check for "full" addressed entries */
372 hash = ip_vs_svc_hashkey(protocol, vaddr, vport); 407 hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport);
373 408
374 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ 409 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
375 if ((svc->addr == vaddr) 410 if ((svc->af == af)
411 && ip_vs_addr_equal(af, &svc->addr, vaddr)
376 && (svc->port == vport) 412 && (svc->port == vport)
377 && (svc->protocol == protocol)) { 413 && (svc->protocol == protocol)) {
378 /* HIT */ 414 /* HIT */
@@ -388,7 +424,8 @@ __ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport)
388/* 424/*
389 * Get service by {fwmark} in the service table. 425 * Get service by {fwmark} in the service table.
390 */ 426 */
391static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) 427static inline struct ip_vs_service *
428__ip_vs_svc_fwm_get(int af, __u32 fwmark)
392{ 429{
393 unsigned hash; 430 unsigned hash;
394 struct ip_vs_service *svc; 431 struct ip_vs_service *svc;
@@ -397,7 +434,7 @@ static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
397 hash = ip_vs_svc_fwm_hashkey(fwmark); 434 hash = ip_vs_svc_fwm_hashkey(fwmark);
398 435
399 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { 436 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400 if (svc->fwmark == fwmark) { 437 if (svc->fwmark == fwmark && svc->af == af) {
401 /* HIT */ 438 /* HIT */
402 atomic_inc(&svc->usecnt); 439 atomic_inc(&svc->usecnt);
403 return svc; 440 return svc;
@@ -408,7 +445,8 @@ static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
408} 445}
409 446
410struct ip_vs_service * 447struct ip_vs_service *
411ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) 448ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
449 const union nf_inet_addr *vaddr, __be16 vport)
412{ 450{
413 struct ip_vs_service *svc; 451 struct ip_vs_service *svc;
414 452
@@ -417,14 +455,14 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
417 /* 455 /*
418 * Check the table hashed by fwmark first 456 * Check the table hashed by fwmark first
419 */ 457 */
420 if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark))) 458 if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark)))
421 goto out; 459 goto out;
422 460
423 /* 461 /*
424 * Check the table hashed by <protocol,addr,port> 462 * Check the table hashed by <protocol,addr,port>
425 * for "full" addressed entries 463 * for "full" addressed entries
426 */ 464 */
427 svc = __ip_vs_service_get(protocol, vaddr, vport); 465 svc = __ip_vs_service_get(af, protocol, vaddr, vport);
428 466
429 if (svc == NULL 467 if (svc == NULL
430 && protocol == IPPROTO_TCP 468 && protocol == IPPROTO_TCP
@@ -434,7 +472,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
434 * Check if ftp service entry exists, the packet 472 * Check if ftp service entry exists, the packet
435 * might belong to FTP data connections. 473 * might belong to FTP data connections.
436 */ 474 */
437 svc = __ip_vs_service_get(protocol, vaddr, FTPPORT); 475 svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT);
438 } 476 }
439 477
440 if (svc == NULL 478 if (svc == NULL
@@ -442,16 +480,16 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport)
442 /* 480 /*
443 * Check if the catch-all port (port zero) exists 481 * Check if the catch-all port (port zero) exists
444 */ 482 */
445 svc = __ip_vs_service_get(protocol, vaddr, 0); 483 svc = __ip_vs_service_get(af, protocol, vaddr, 0);
446 } 484 }
447 485
448 out: 486 out:
449 read_unlock(&__ip_vs_svc_lock); 487 read_unlock(&__ip_vs_svc_lock);
450 488
451 IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", 489 IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
452 fwmark, ip_vs_proto_name(protocol), 490 fwmark, ip_vs_proto_name(protocol),
453 NIPQUAD(vaddr), ntohs(vport), 491 IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
454 svc?"hit":"not hit"); 492 svc ? "hit" : "not hit");
455 493
456 return svc; 494 return svc;
457} 495}
@@ -478,11 +516,20 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)
478/* 516/*
479 * Returns hash value for real service 517 * Returns hash value for real service
480 */ 518 */
481static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port) 519static inline unsigned ip_vs_rs_hashkey(int af,
520 const union nf_inet_addr *addr,
521 __be16 port)
482{ 522{
483 register unsigned porth = ntohs(port); 523 register unsigned porth = ntohs(port);
524 __be32 addr_fold = addr->ip;
525
526#ifdef CONFIG_IP_VS_IPV6
527 if (af == AF_INET6)
528 addr_fold = addr->ip6[0]^addr->ip6[1]^
529 addr->ip6[2]^addr->ip6[3];
530#endif
484 531
485 return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) 532 return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
486 & IP_VS_RTAB_MASK; 533 & IP_VS_RTAB_MASK;
487} 534}
488 535
@@ -502,7 +549,8 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest)
502 * Hash by proto,addr,port, 549 * Hash by proto,addr,port,
503 * which are the parameters of the real service. 550 * which are the parameters of the real service.
504 */ 551 */
505 hash = ip_vs_rs_hashkey(dest->addr, dest->port); 552 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
553
506 list_add(&dest->d_list, &ip_vs_rtable[hash]); 554 list_add(&dest->d_list, &ip_vs_rtable[hash]);
507 555
508 return 1; 556 return 1;
@@ -529,7 +577,9 @@ static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
529 * Lookup real service by <proto,addr,port> in the real service table. 577 * Lookup real service by <proto,addr,port> in the real service table.
530 */ 578 */
531struct ip_vs_dest * 579struct ip_vs_dest *
532ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport) 580ip_vs_lookup_real_service(int af, __u16 protocol,
581 const union nf_inet_addr *daddr,
582 __be16 dport)
533{ 583{
534 unsigned hash; 584 unsigned hash;
535 struct ip_vs_dest *dest; 585 struct ip_vs_dest *dest;
@@ -538,11 +588,12 @@ ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
538 * Check for "full" addressed entries 588 * Check for "full" addressed entries
539 * Return the first found entry 589 * Return the first found entry
540 */ 590 */
541 hash = ip_vs_rs_hashkey(daddr, dport); 591 hash = ip_vs_rs_hashkey(af, daddr, dport);
542 592
543 read_lock(&__ip_vs_rs_lock); 593 read_lock(&__ip_vs_rs_lock);
544 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { 594 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
545 if ((dest->addr == daddr) 595 if ((dest->af == af)
596 && ip_vs_addr_equal(af, &dest->addr, daddr)
546 && (dest->port == dport) 597 && (dest->port == dport)
547 && ((dest->protocol == protocol) || 598 && ((dest->protocol == protocol) ||
548 dest->vfwmark)) { 599 dest->vfwmark)) {
@@ -560,7 +611,8 @@ ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport)
560 * Lookup destination by {addr,port} in the given service 611 * Lookup destination by {addr,port} in the given service
561 */ 612 */
562static struct ip_vs_dest * 613static struct ip_vs_dest *
563ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) 614ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
615 __be16 dport)
564{ 616{
565 struct ip_vs_dest *dest; 617 struct ip_vs_dest *dest;
566 618
@@ -568,7 +620,9 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
568 * Find the destination for the given service 620 * Find the destination for the given service
569 */ 621 */
570 list_for_each_entry(dest, &svc->destinations, n_list) { 622 list_for_each_entry(dest, &svc->destinations, n_list) {
571 if ((dest->addr == daddr) && (dest->port == dport)) { 623 if ((dest->af == svc->af)
624 && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
625 && (dest->port == dport)) {
572 /* HIT */ 626 /* HIT */
573 return dest; 627 return dest;
574 } 628 }
@@ -587,13 +641,15 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
587 * ip_vs_lookup_real_service() looked promissing, but 641 * ip_vs_lookup_real_service() looked promissing, but
588 * seems not working as expected. 642 * seems not working as expected.
589 */ 643 */
590struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, 644struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
591 __be32 vaddr, __be16 vport, __u16 protocol) 645 __be16 dport,
646 const union nf_inet_addr *vaddr,
647 __be16 vport, __u16 protocol)
592{ 648{
593 struct ip_vs_dest *dest; 649 struct ip_vs_dest *dest;
594 struct ip_vs_service *svc; 650 struct ip_vs_service *svc;
595 651
596 svc = ip_vs_service_get(0, protocol, vaddr, vport); 652 svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
597 if (!svc) 653 if (!svc)
598 return NULL; 654 return NULL;
599 dest = ip_vs_lookup_dest(svc, daddr, dport); 655 dest = ip_vs_lookup_dest(svc, daddr, dport);
@@ -614,7 +670,8 @@ struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport,
614 * scheduling. 670 * scheduling.
615 */ 671 */
616static struct ip_vs_dest * 672static struct ip_vs_dest *
617ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) 673ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
674 __be16 dport)
618{ 675{
619 struct ip_vs_dest *dest, *nxt; 676 struct ip_vs_dest *dest, *nxt;
620 677
@@ -622,17 +679,19 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
622 * Find the destination in trash 679 * Find the destination in trash
623 */ 680 */
624 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { 681 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
625 IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " 682 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
626 "dest->refcnt=%d\n", 683 "dest->refcnt=%d\n",
627 dest->vfwmark, 684 dest->vfwmark,
628 NIPQUAD(dest->addr), ntohs(dest->port), 685 IP_VS_DBG_ADDR(svc->af, &dest->addr),
629 atomic_read(&dest->refcnt)); 686 ntohs(dest->port),
630 if (dest->addr == daddr && 687 atomic_read(&dest->refcnt));
688 if (dest->af == svc->af &&
689 ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
631 dest->port == dport && 690 dest->port == dport &&
632 dest->vfwmark == svc->fwmark && 691 dest->vfwmark == svc->fwmark &&
633 dest->protocol == svc->protocol && 692 dest->protocol == svc->protocol &&
634 (svc->fwmark || 693 (svc->fwmark ||
635 (dest->vaddr == svc->addr && 694 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
636 dest->vport == svc->port))) { 695 dest->vport == svc->port))) {
637 /* HIT */ 696 /* HIT */
638 return dest; 697 return dest;
@@ -642,10 +701,11 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport)
642 * Try to purge the destination from trash if not referenced 701 * Try to purge the destination from trash if not referenced
643 */ 702 */
644 if (atomic_read(&dest->refcnt) == 1) { 703 if (atomic_read(&dest->refcnt) == 1) {
645 IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u " 704 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
646 "from trash\n", 705 "from trash\n",
647 dest->vfwmark, 706 dest->vfwmark,
648 NIPQUAD(dest->addr), ntohs(dest->port)); 707 IP_VS_DBG_ADDR(svc->af, &dest->addr),
708 ntohs(dest->port));
649 list_del(&dest->n_list); 709 list_del(&dest->n_list);
650 ip_vs_dst_reset(dest); 710 ip_vs_dst_reset(dest);
651 __ip_vs_unbind_svc(dest); 711 __ip_vs_unbind_svc(dest);
@@ -684,18 +744,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
684{ 744{
685 spin_lock_bh(&stats->lock); 745 spin_lock_bh(&stats->lock);
686 746
687 stats->conns = 0; 747 memset(&stats->ustats, 0, sizeof(stats->ustats));
688 stats->inpkts = 0;
689 stats->outpkts = 0;
690 stats->inbytes = 0;
691 stats->outbytes = 0;
692
693 stats->cps = 0;
694 stats->inpps = 0;
695 stats->outpps = 0;
696 stats->inbps = 0;
697 stats->outbps = 0;
698
699 ip_vs_zero_estimator(stats); 748 ip_vs_zero_estimator(stats);
700 749
701 spin_unlock_bh(&stats->lock); 750 spin_unlock_bh(&stats->lock);
@@ -706,7 +755,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats)
706 */ 755 */
707static void 756static void
708__ip_vs_update_dest(struct ip_vs_service *svc, 757__ip_vs_update_dest(struct ip_vs_service *svc,
709 struct ip_vs_dest *dest, struct ip_vs_dest_user *udest) 758 struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest)
710{ 759{
711 int conn_flags; 760 int conn_flags;
712 761
@@ -715,10 +764,18 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
715 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; 764 conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
716 765
717 /* check if local node and update the flags */ 766 /* check if local node and update the flags */
718 if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) { 767#ifdef CONFIG_IP_VS_IPV6
719 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) 768 if (svc->af == AF_INET6) {
720 | IP_VS_CONN_F_LOCALNODE; 769 if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) {
721 } 770 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
771 | IP_VS_CONN_F_LOCALNODE;
772 }
773 } else
774#endif
775 if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) {
776 conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
777 | IP_VS_CONN_F_LOCALNODE;
778 }
722 779
723 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ 780 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
724 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { 781 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
@@ -759,7 +816,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc,
759 * Create a destination for the given service 816 * Create a destination for the given service
760 */ 817 */
761static int 818static int
762ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, 819ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
763 struct ip_vs_dest **dest_p) 820 struct ip_vs_dest **dest_p)
764{ 821{
765 struct ip_vs_dest *dest; 822 struct ip_vs_dest *dest;
@@ -767,9 +824,20 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
767 824
768 EnterFunction(2); 825 EnterFunction(2);
769 826
770 atype = inet_addr_type(&init_net, udest->addr); 827#ifdef CONFIG_IP_VS_IPV6
771 if (atype != RTN_LOCAL && atype != RTN_UNICAST) 828 if (svc->af == AF_INET6) {
772 return -EINVAL; 829 atype = ipv6_addr_type(&udest->addr.in6);
830 if ((!(atype & IPV6_ADDR_UNICAST) ||
831 atype & IPV6_ADDR_LINKLOCAL) &&
832 !__ip_vs_addr_is_local_v6(&udest->addr.in6))
833 return -EINVAL;
834 } else
835#endif
836 {
837 atype = inet_addr_type(&init_net, udest->addr.ip);
838 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
839 return -EINVAL;
840 }
773 841
774 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); 842 dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
775 if (dest == NULL) { 843 if (dest == NULL) {
@@ -777,11 +845,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
777 return -ENOMEM; 845 return -ENOMEM;
778 } 846 }
779 847
848 dest->af = svc->af;
780 dest->protocol = svc->protocol; 849 dest->protocol = svc->protocol;
781 dest->vaddr = svc->addr; 850 dest->vaddr = svc->addr;
782 dest->vport = svc->port; 851 dest->vport = svc->port;
783 dest->vfwmark = svc->fwmark; 852 dest->vfwmark = svc->fwmark;
784 dest->addr = udest->addr; 853 ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
785 dest->port = udest->port; 854 dest->port = udest->port;
786 855
787 atomic_set(&dest->activeconns, 0); 856 atomic_set(&dest->activeconns, 0);
@@ -806,10 +875,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
806 * Add a destination into an existing service 875 * Add a destination into an existing service
807 */ 876 */
808static int 877static int
809ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) 878ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
810{ 879{
811 struct ip_vs_dest *dest; 880 struct ip_vs_dest *dest;
812 __be32 daddr = udest->addr; 881 union nf_inet_addr daddr;
813 __be16 dport = udest->port; 882 __be16 dport = udest->port;
814 int ret; 883 int ret;
815 884
@@ -826,10 +895,13 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
826 return -ERANGE; 895 return -ERANGE;
827 } 896 }
828 897
898 ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
899
829 /* 900 /*
830 * Check if the dest already exists in the list 901 * Check if the dest already exists in the list
831 */ 902 */
832 dest = ip_vs_lookup_dest(svc, daddr, dport); 903 dest = ip_vs_lookup_dest(svc, &daddr, dport);
904
833 if (dest != NULL) { 905 if (dest != NULL) {
834 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); 906 IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
835 return -EEXIST; 907 return -EEXIST;
@@ -839,15 +911,17 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
839 * Check if the dest already exists in the trash and 911 * Check if the dest already exists in the trash and
840 * is from the same service 912 * is from the same service
841 */ 913 */
842 dest = ip_vs_trash_get_dest(svc, daddr, dport); 914 dest = ip_vs_trash_get_dest(svc, &daddr, dport);
915
843 if (dest != NULL) { 916 if (dest != NULL) {
844 IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " 917 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
845 "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", 918 "dest->refcnt=%d, service %u/%s:%u\n",
846 NIPQUAD(daddr), ntohs(dport), 919 IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
847 atomic_read(&dest->refcnt), 920 atomic_read(&dest->refcnt),
848 dest->vfwmark, 921 dest->vfwmark,
849 NIPQUAD(dest->vaddr), 922 IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
850 ntohs(dest->vport)); 923 ntohs(dest->vport));
924
851 __ip_vs_update_dest(svc, dest, udest); 925 __ip_vs_update_dest(svc, dest, udest);
852 926
853 /* 927 /*
@@ -868,7 +942,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
868 svc->num_dests++; 942 svc->num_dests++;
869 943
870 /* call the update_service function of its scheduler */ 944 /* call the update_service function of its scheduler */
871 svc->scheduler->update_service(svc); 945 if (svc->scheduler->update_service)
946 svc->scheduler->update_service(svc);
872 947
873 write_unlock_bh(&__ip_vs_svc_lock); 948 write_unlock_bh(&__ip_vs_svc_lock);
874 return 0; 949 return 0;
@@ -898,7 +973,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
898 svc->num_dests++; 973 svc->num_dests++;
899 974
900 /* call the update_service function of its scheduler */ 975 /* call the update_service function of its scheduler */
901 svc->scheduler->update_service(svc); 976 if (svc->scheduler->update_service)
977 svc->scheduler->update_service(svc);
902 978
903 write_unlock_bh(&__ip_vs_svc_lock); 979 write_unlock_bh(&__ip_vs_svc_lock);
904 980
@@ -912,10 +988,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
912 * Edit a destination in the given service 988 * Edit a destination in the given service
913 */ 989 */
914static int 990static int
915ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) 991ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
916{ 992{
917 struct ip_vs_dest *dest; 993 struct ip_vs_dest *dest;
918 __be32 daddr = udest->addr; 994 union nf_inet_addr daddr;
919 __be16 dport = udest->port; 995 __be16 dport = udest->port;
920 996
921 EnterFunction(2); 997 EnterFunction(2);
@@ -931,10 +1007,13 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
931 return -ERANGE; 1007 return -ERANGE;
932 } 1008 }
933 1009
1010 ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
1011
934 /* 1012 /*
935 * Lookup the destination list 1013 * Lookup the destination list
936 */ 1014 */
937 dest = ip_vs_lookup_dest(svc, daddr, dport); 1015 dest = ip_vs_lookup_dest(svc, &daddr, dport);
1016
938 if (dest == NULL) { 1017 if (dest == NULL) {
939 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); 1018 IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
940 return -ENOENT; 1019 return -ENOENT;
@@ -948,7 +1027,8 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
948 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); 1027 IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
949 1028
950 /* call the update_service, because server weight may be changed */ 1029 /* call the update_service, because server weight may be changed */
951 svc->scheduler->update_service(svc); 1030 if (svc->scheduler->update_service)
1031 svc->scheduler->update_service(svc);
952 1032
953 write_unlock_bh(&__ip_vs_svc_lock); 1033 write_unlock_bh(&__ip_vs_svc_lock);
954 1034
@@ -987,10 +1067,11 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
987 atomic_dec(&dest->svc->refcnt); 1067 atomic_dec(&dest->svc->refcnt);
988 kfree(dest); 1068 kfree(dest);
989 } else { 1069 } else {
990 IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " 1070 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
991 "dest->refcnt=%d\n", 1071 "dest->refcnt=%d\n",
992 NIPQUAD(dest->addr), ntohs(dest->port), 1072 IP_VS_DBG_ADDR(dest->af, &dest->addr),
993 atomic_read(&dest->refcnt)); 1073 ntohs(dest->port),
1074 atomic_read(&dest->refcnt));
994 list_add(&dest->n_list, &ip_vs_dest_trash); 1075 list_add(&dest->n_list, &ip_vs_dest_trash);
995 atomic_inc(&dest->refcnt); 1076 atomic_inc(&dest->refcnt);
996 } 1077 }
@@ -1011,12 +1092,12 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1011 */ 1092 */
1012 list_del(&dest->n_list); 1093 list_del(&dest->n_list);
1013 svc->num_dests--; 1094 svc->num_dests--;
1014 if (svcupd) { 1095
1015 /* 1096 /*
1016 * Call the update_service function of its scheduler 1097 * Call the update_service function of its scheduler
1017 */ 1098 */
1018 svc->scheduler->update_service(svc); 1099 if (svcupd && svc->scheduler->update_service)
1019 } 1100 svc->scheduler->update_service(svc);
1020} 1101}
1021 1102
1022 1103
@@ -1024,15 +1105,15 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1024 * Delete a destination server in the given service 1105 * Delete a destination server in the given service
1025 */ 1106 */
1026static int 1107static int
1027ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest) 1108ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1028{ 1109{
1029 struct ip_vs_dest *dest; 1110 struct ip_vs_dest *dest;
1030 __be32 daddr = udest->addr;
1031 __be16 dport = udest->port; 1111 __be16 dport = udest->port;
1032 1112
1033 EnterFunction(2); 1113 EnterFunction(2);
1034 1114
1035 dest = ip_vs_lookup_dest(svc, daddr, dport); 1115 dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1116
1036 if (dest == NULL) { 1117 if (dest == NULL) {
1037 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); 1118 IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
1038 return -ENOENT; 1119 return -ENOENT;
@@ -1067,7 +1148,8 @@ ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
1067 * Add a service into the service hash table 1148 * Add a service into the service hash table
1068 */ 1149 */
1069static int 1150static int
1070ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) 1151ip_vs_add_service(struct ip_vs_service_user_kern *u,
1152 struct ip_vs_service **svc_p)
1071{ 1153{
1072 int ret = 0; 1154 int ret = 0;
1073 struct ip_vs_scheduler *sched = NULL; 1155 struct ip_vs_scheduler *sched = NULL;
@@ -1085,6 +1167,19 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1085 goto out_mod_dec; 1167 goto out_mod_dec;
1086 } 1168 }
1087 1169
1170#ifdef CONFIG_IP_VS_IPV6
1171 if (u->af == AF_INET6) {
1172 if (!sched->supports_ipv6) {
1173 ret = -EAFNOSUPPORT;
1174 goto out_err;
1175 }
1176 if ((u->netmask < 1) || (u->netmask > 128)) {
1177 ret = -EINVAL;
1178 goto out_err;
1179 }
1180 }
1181#endif
1182
1088 svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); 1183 svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
1089 if (svc == NULL) { 1184 if (svc == NULL) {
1090 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); 1185 IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
@@ -1096,8 +1191,9 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1096 atomic_set(&svc->usecnt, 1); 1191 atomic_set(&svc->usecnt, 1);
1097 atomic_set(&svc->refcnt, 0); 1192 atomic_set(&svc->refcnt, 0);
1098 1193
1194 svc->af = u->af;
1099 svc->protocol = u->protocol; 1195 svc->protocol = u->protocol;
1100 svc->addr = u->addr; 1196 ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1101 svc->port = u->port; 1197 svc->port = u->port;
1102 svc->fwmark = u->fwmark; 1198 svc->fwmark = u->fwmark;
1103 svc->flags = u->flags; 1199 svc->flags = u->flags;
@@ -1121,7 +1217,10 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1121 atomic_inc(&ip_vs_nullsvc_counter); 1217 atomic_inc(&ip_vs_nullsvc_counter);
1122 1218
1123 ip_vs_new_estimator(&svc->stats); 1219 ip_vs_new_estimator(&svc->stats);
1124 ip_vs_num_services++; 1220
1221 /* Count only IPv4 services for old get/setsockopt interface */
1222 if (svc->af == AF_INET)
1223 ip_vs_num_services++;
1125 1224
1126 /* Hash the service into the service table */ 1225 /* Hash the service into the service table */
1127 write_lock_bh(&__ip_vs_svc_lock); 1226 write_lock_bh(&__ip_vs_svc_lock);
@@ -1156,7 +1255,7 @@ ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
1156 * Edit a service and bind it with a new scheduler 1255 * Edit a service and bind it with a new scheduler
1157 */ 1256 */
1158static int 1257static int
1159ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u) 1258ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1160{ 1259{
1161 struct ip_vs_scheduler *sched, *old_sched; 1260 struct ip_vs_scheduler *sched, *old_sched;
1162 int ret = 0; 1261 int ret = 0;
@@ -1172,6 +1271,19 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1172 } 1271 }
1173 old_sched = sched; 1272 old_sched = sched;
1174 1273
1274#ifdef CONFIG_IP_VS_IPV6
1275 if (u->af == AF_INET6) {
1276 if (!sched->supports_ipv6) {
1277 ret = -EAFNOSUPPORT;
1278 goto out;
1279 }
1280 if ((u->netmask < 1) || (u->netmask > 128)) {
1281 ret = -EINVAL;
1282 goto out;
1283 }
1284 }
1285#endif
1286
1175 write_lock_bh(&__ip_vs_svc_lock); 1287 write_lock_bh(&__ip_vs_svc_lock);
1176 1288
1177 /* 1289 /*
@@ -1193,7 +1305,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1193 */ 1305 */
1194 if ((ret = ip_vs_unbind_scheduler(svc))) { 1306 if ((ret = ip_vs_unbind_scheduler(svc))) {
1195 old_sched = sched; 1307 old_sched = sched;
1196 goto out; 1308 goto out_unlock;
1197 } 1309 }
1198 1310
1199 /* 1311 /*
@@ -1212,12 +1324,13 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
1212 */ 1324 */
1213 ip_vs_bind_scheduler(svc, old_sched); 1325 ip_vs_bind_scheduler(svc, old_sched);
1214 old_sched = sched; 1326 old_sched = sched;
1215 goto out; 1327 goto out_unlock;
1216 } 1328 }
1217 } 1329 }
1218 1330
1219 out: 1331 out_unlock:
1220 write_unlock_bh(&__ip_vs_svc_lock); 1332 write_unlock_bh(&__ip_vs_svc_lock);
1333 out:
1221 1334
1222 if (old_sched) 1335 if (old_sched)
1223 ip_vs_scheduler_put(old_sched); 1336 ip_vs_scheduler_put(old_sched);
@@ -1236,7 +1349,10 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
1236 struct ip_vs_dest *dest, *nxt; 1349 struct ip_vs_dest *dest, *nxt;
1237 struct ip_vs_scheduler *old_sched; 1350 struct ip_vs_scheduler *old_sched;
1238 1351
1239 ip_vs_num_services--; 1352 /* Count only IPv4 services for old get/setsockopt interface */
1353 if (svc->af == AF_INET)
1354 ip_vs_num_services--;
1355
1240 ip_vs_kill_estimator(&svc->stats); 1356 ip_vs_kill_estimator(&svc->stats);
1241 1357
1242 /* Unbind scheduler */ 1358 /* Unbind scheduler */
@@ -1671,6 +1787,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1671} 1787}
1672 1788
1673static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) 1789static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1790__acquires(__ip_vs_svc_lock)
1674{ 1791{
1675 1792
1676 read_lock_bh(&__ip_vs_svc_lock); 1793 read_lock_bh(&__ip_vs_svc_lock);
@@ -1724,6 +1841,7 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1724} 1841}
1725 1842
1726static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) 1843static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1844__releases(__ip_vs_svc_lock)
1727{ 1845{
1728 read_unlock_bh(&__ip_vs_svc_lock); 1846 read_unlock_bh(&__ip_vs_svc_lock);
1729} 1847}
@@ -1744,15 +1862,25 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1744 const struct ip_vs_iter *iter = seq->private; 1862 const struct ip_vs_iter *iter = seq->private;
1745 const struct ip_vs_dest *dest; 1863 const struct ip_vs_dest *dest;
1746 1864
1747 if (iter->table == ip_vs_svc_table) 1865 if (iter->table == ip_vs_svc_table) {
1748 seq_printf(seq, "%s %08X:%04X %s ", 1866#ifdef CONFIG_IP_VS_IPV6
1749 ip_vs_proto_name(svc->protocol), 1867 if (svc->af == AF_INET6)
1750 ntohl(svc->addr), 1868 seq_printf(seq, "%s [" NIP6_FMT "]:%04X %s ",
1751 ntohs(svc->port), 1869 ip_vs_proto_name(svc->protocol),
1752 svc->scheduler->name); 1870 NIP6(svc->addr.in6),
1753 else 1871 ntohs(svc->port),
1872 svc->scheduler->name);
1873 else
1874#endif
1875 seq_printf(seq, "%s %08X:%04X %s ",
1876 ip_vs_proto_name(svc->protocol),
1877 ntohl(svc->addr.ip),
1878 ntohs(svc->port),
1879 svc->scheduler->name);
1880 } else {
1754 seq_printf(seq, "FWM %08X %s ", 1881 seq_printf(seq, "FWM %08X %s ",
1755 svc->fwmark, svc->scheduler->name); 1882 svc->fwmark, svc->scheduler->name);
1883 }
1756 1884
1757 if (svc->flags & IP_VS_SVC_F_PERSISTENT) 1885 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1758 seq_printf(seq, "persistent %d %08X\n", 1886 seq_printf(seq, "persistent %d %08X\n",
@@ -1762,13 +1890,29 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1762 seq_putc(seq, '\n'); 1890 seq_putc(seq, '\n');
1763 1891
1764 list_for_each_entry(dest, &svc->destinations, n_list) { 1892 list_for_each_entry(dest, &svc->destinations, n_list) {
1765 seq_printf(seq, 1893#ifdef CONFIG_IP_VS_IPV6
1766 " -> %08X:%04X %-7s %-6d %-10d %-10d\n", 1894 if (dest->af == AF_INET6)
1767 ntohl(dest->addr), ntohs(dest->port), 1895 seq_printf(seq,
1768 ip_vs_fwd_name(atomic_read(&dest->conn_flags)), 1896 " -> [" NIP6_FMT "]:%04X"
1769 atomic_read(&dest->weight), 1897 " %-7s %-6d %-10d %-10d\n",
1770 atomic_read(&dest->activeconns), 1898 NIP6(dest->addr.in6),
1771 atomic_read(&dest->inactconns)); 1899 ntohs(dest->port),
1900 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1901 atomic_read(&dest->weight),
1902 atomic_read(&dest->activeconns),
1903 atomic_read(&dest->inactconns));
1904 else
1905#endif
1906 seq_printf(seq,
1907 " -> %08X:%04X "
1908 "%-7s %-6d %-10d %-10d\n",
1909 ntohl(dest->addr.ip),
1910 ntohs(dest->port),
1911 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1912 atomic_read(&dest->weight),
1913 atomic_read(&dest->activeconns),
1914 atomic_read(&dest->inactconns));
1915
1772 } 1916 }
1773 } 1917 }
1774 return 0; 1918 return 0;
@@ -1812,20 +1956,20 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
1812 " Conns Packets Packets Bytes Bytes\n"); 1956 " Conns Packets Packets Bytes Bytes\n");
1813 1957
1814 spin_lock_bh(&ip_vs_stats.lock); 1958 spin_lock_bh(&ip_vs_stats.lock);
1815 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns, 1959 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns,
1816 ip_vs_stats.inpkts, ip_vs_stats.outpkts, 1960 ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts,
1817 (unsigned long long) ip_vs_stats.inbytes, 1961 (unsigned long long) ip_vs_stats.ustats.inbytes,
1818 (unsigned long long) ip_vs_stats.outbytes); 1962 (unsigned long long) ip_vs_stats.ustats.outbytes);
1819 1963
1820/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 1964/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1821 seq_puts(seq, 1965 seq_puts(seq,
1822 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); 1966 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1823 seq_printf(seq,"%8X %8X %8X %16X %16X\n", 1967 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1824 ip_vs_stats.cps, 1968 ip_vs_stats.ustats.cps,
1825 ip_vs_stats.inpps, 1969 ip_vs_stats.ustats.inpps,
1826 ip_vs_stats.outpps, 1970 ip_vs_stats.ustats.outpps,
1827 ip_vs_stats.inbps, 1971 ip_vs_stats.ustats.inbps,
1828 ip_vs_stats.outbps); 1972 ip_vs_stats.ustats.outbps);
1829 spin_unlock_bh(&ip_vs_stats.lock); 1973 spin_unlock_bh(&ip_vs_stats.lock);
1830 1974
1831 return 0; 1975 return 0;
@@ -1900,14 +2044,44 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
1900 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, 2044 [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
1901}; 2045};
1902 2046
2047static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2048 struct ip_vs_service_user *usvc_compat)
2049{
2050 usvc->af = AF_INET;
2051 usvc->protocol = usvc_compat->protocol;
2052 usvc->addr.ip = usvc_compat->addr;
2053 usvc->port = usvc_compat->port;
2054 usvc->fwmark = usvc_compat->fwmark;
2055
2056 /* Deep copy of sched_name is not needed here */
2057 usvc->sched_name = usvc_compat->sched_name;
2058
2059 usvc->flags = usvc_compat->flags;
2060 usvc->timeout = usvc_compat->timeout;
2061 usvc->netmask = usvc_compat->netmask;
2062}
2063
2064static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2065 struct ip_vs_dest_user *udest_compat)
2066{
2067 udest->addr.ip = udest_compat->addr;
2068 udest->port = udest_compat->port;
2069 udest->conn_flags = udest_compat->conn_flags;
2070 udest->weight = udest_compat->weight;
2071 udest->u_threshold = udest_compat->u_threshold;
2072 udest->l_threshold = udest_compat->l_threshold;
2073}
2074
1903static int 2075static int
1904do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) 2076do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1905{ 2077{
1906 int ret; 2078 int ret;
1907 unsigned char arg[MAX_ARG_LEN]; 2079 unsigned char arg[MAX_ARG_LEN];
1908 struct ip_vs_service_user *usvc; 2080 struct ip_vs_service_user *usvc_compat;
2081 struct ip_vs_service_user_kern usvc;
1909 struct ip_vs_service *svc; 2082 struct ip_vs_service *svc;
1910 struct ip_vs_dest_user *udest; 2083 struct ip_vs_dest_user *udest_compat;
2084 struct ip_vs_dest_user_kern udest;
1911 2085
1912 if (!capable(CAP_NET_ADMIN)) 2086 if (!capable(CAP_NET_ADMIN))
1913 return -EPERM; 2087 return -EPERM;
@@ -1947,35 +2121,40 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1947 goto out_unlock; 2121 goto out_unlock;
1948 } 2122 }
1949 2123
1950 usvc = (struct ip_vs_service_user *)arg; 2124 usvc_compat = (struct ip_vs_service_user *)arg;
1951 udest = (struct ip_vs_dest_user *)(usvc + 1); 2125 udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2126
2127 /* We only use the new structs internally, so copy userspace compat
2128 * structs to extended internal versions */
2129 ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2130 ip_vs_copy_udest_compat(&udest, udest_compat);
1952 2131
1953 if (cmd == IP_VS_SO_SET_ZERO) { 2132 if (cmd == IP_VS_SO_SET_ZERO) {
1954 /* if no service address is set, zero counters in all */ 2133 /* if no service address is set, zero counters in all */
1955 if (!usvc->fwmark && !usvc->addr && !usvc->port) { 2134 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
1956 ret = ip_vs_zero_all(); 2135 ret = ip_vs_zero_all();
1957 goto out_unlock; 2136 goto out_unlock;
1958 } 2137 }
1959 } 2138 }
1960 2139
1961 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ 2140 /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
1962 if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) { 2141 if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) {
1963 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", 2142 IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
1964 usvc->protocol, NIPQUAD(usvc->addr), 2143 usvc.protocol, NIPQUAD(usvc.addr.ip),
1965 ntohs(usvc->port), usvc->sched_name); 2144 ntohs(usvc.port), usvc.sched_name);
1966 ret = -EFAULT; 2145 ret = -EFAULT;
1967 goto out_unlock; 2146 goto out_unlock;
1968 } 2147 }
1969 2148
1970 /* Lookup the exact service by <protocol, addr, port> or fwmark */ 2149 /* Lookup the exact service by <protocol, addr, port> or fwmark */
1971 if (usvc->fwmark == 0) 2150 if (usvc.fwmark == 0)
1972 svc = __ip_vs_service_get(usvc->protocol, 2151 svc = __ip_vs_service_get(usvc.af, usvc.protocol,
1973 usvc->addr, usvc->port); 2152 &usvc.addr, usvc.port);
1974 else 2153 else
1975 svc = __ip_vs_svc_fwm_get(usvc->fwmark); 2154 svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
1976 2155
1977 if (cmd != IP_VS_SO_SET_ADD 2156 if (cmd != IP_VS_SO_SET_ADD
1978 && (svc == NULL || svc->protocol != usvc->protocol)) { 2157 && (svc == NULL || svc->protocol != usvc.protocol)) {
1979 ret = -ESRCH; 2158 ret = -ESRCH;
1980 goto out_unlock; 2159 goto out_unlock;
1981 } 2160 }
@@ -1985,10 +2164,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1985 if (svc != NULL) 2164 if (svc != NULL)
1986 ret = -EEXIST; 2165 ret = -EEXIST;
1987 else 2166 else
1988 ret = ip_vs_add_service(usvc, &svc); 2167 ret = ip_vs_add_service(&usvc, &svc);
1989 break; 2168 break;
1990 case IP_VS_SO_SET_EDIT: 2169 case IP_VS_SO_SET_EDIT:
1991 ret = ip_vs_edit_service(svc, usvc); 2170 ret = ip_vs_edit_service(svc, &usvc);
1992 break; 2171 break;
1993 case IP_VS_SO_SET_DEL: 2172 case IP_VS_SO_SET_DEL:
1994 ret = ip_vs_del_service(svc); 2173 ret = ip_vs_del_service(svc);
@@ -1999,13 +2178,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1999 ret = ip_vs_zero_service(svc); 2178 ret = ip_vs_zero_service(svc);
2000 break; 2179 break;
2001 case IP_VS_SO_SET_ADDDEST: 2180 case IP_VS_SO_SET_ADDDEST:
2002 ret = ip_vs_add_dest(svc, udest); 2181 ret = ip_vs_add_dest(svc, &udest);
2003 break; 2182 break;
2004 case IP_VS_SO_SET_EDITDEST: 2183 case IP_VS_SO_SET_EDITDEST:
2005 ret = ip_vs_edit_dest(svc, udest); 2184 ret = ip_vs_edit_dest(svc, &udest);
2006 break; 2185 break;
2007 case IP_VS_SO_SET_DELDEST: 2186 case IP_VS_SO_SET_DELDEST:
2008 ret = ip_vs_del_dest(svc, udest); 2187 ret = ip_vs_del_dest(svc, &udest);
2009 break; 2188 break;
2010 default: 2189 default:
2011 ret = -EINVAL; 2190 ret = -EINVAL;
@@ -2028,7 +2207,7 @@ static void
2028ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) 2207ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2029{ 2208{
2030 spin_lock_bh(&src->lock); 2209 spin_lock_bh(&src->lock);
2031 memcpy(dst, src, (char*)&src->lock - (char*)src); 2210 memcpy(dst, &src->ustats, sizeof(*dst));
2032 spin_unlock_bh(&src->lock); 2211 spin_unlock_bh(&src->lock);
2033} 2212}
2034 2213
@@ -2036,7 +2215,7 @@ static void
2036ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) 2215ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2037{ 2216{
2038 dst->protocol = src->protocol; 2217 dst->protocol = src->protocol;
2039 dst->addr = src->addr; 2218 dst->addr = src->addr.ip;
2040 dst->port = src->port; 2219 dst->port = src->port;
2041 dst->fwmark = src->fwmark; 2220 dst->fwmark = src->fwmark;
2042 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); 2221 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
@@ -2058,6 +2237,10 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2058 2237
2059 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 2238 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2060 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 2239 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2240 /* Only expose IPv4 entries to old interface */
2241 if (svc->af != AF_INET)
2242 continue;
2243
2061 if (count >= get->num_services) 2244 if (count >= get->num_services)
2062 goto out; 2245 goto out;
2063 memset(&entry, 0, sizeof(entry)); 2246 memset(&entry, 0, sizeof(entry));
@@ -2073,6 +2256,10 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2073 2256
2074 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 2257 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2075 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 2258 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2259 /* Only expose IPv4 entries to old interface */
2260 if (svc->af != AF_INET)
2261 continue;
2262
2076 if (count >= get->num_services) 2263 if (count >= get->num_services)
2077 goto out; 2264 goto out;
2078 memset(&entry, 0, sizeof(entry)); 2265 memset(&entry, 0, sizeof(entry));
@@ -2094,13 +2281,15 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2094 struct ip_vs_get_dests __user *uptr) 2281 struct ip_vs_get_dests __user *uptr)
2095{ 2282{
2096 struct ip_vs_service *svc; 2283 struct ip_vs_service *svc;
2284 union nf_inet_addr addr = { .ip = get->addr };
2097 int ret = 0; 2285 int ret = 0;
2098 2286
2099 if (get->fwmark) 2287 if (get->fwmark)
2100 svc = __ip_vs_svc_fwm_get(get->fwmark); 2288 svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark);
2101 else 2289 else
2102 svc = __ip_vs_service_get(get->protocol, 2290 svc = __ip_vs_service_get(AF_INET, get->protocol, &addr,
2103 get->addr, get->port); 2291 get->port);
2292
2104 if (svc) { 2293 if (svc) {
2105 int count = 0; 2294 int count = 0;
2106 struct ip_vs_dest *dest; 2295 struct ip_vs_dest *dest;
@@ -2110,7 +2299,7 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2110 if (count >= get->num_dests) 2299 if (count >= get->num_dests)
2111 break; 2300 break;
2112 2301
2113 entry.addr = dest->addr; 2302 entry.addr = dest->addr.ip;
2114 entry.port = dest->port; 2303 entry.port = dest->port;
2115 entry.conn_flags = atomic_read(&dest->conn_flags); 2304 entry.conn_flags = atomic_read(&dest->conn_flags);
2116 entry.weight = atomic_read(&dest->weight); 2305 entry.weight = atomic_read(&dest->weight);
@@ -2235,13 +2424,15 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2235 { 2424 {
2236 struct ip_vs_service_entry *entry; 2425 struct ip_vs_service_entry *entry;
2237 struct ip_vs_service *svc; 2426 struct ip_vs_service *svc;
2427 union nf_inet_addr addr;
2238 2428
2239 entry = (struct ip_vs_service_entry *)arg; 2429 entry = (struct ip_vs_service_entry *)arg;
2430 addr.ip = entry->addr;
2240 if (entry->fwmark) 2431 if (entry->fwmark)
2241 svc = __ip_vs_svc_fwm_get(entry->fwmark); 2432 svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark);
2242 else 2433 else
2243 svc = __ip_vs_service_get(entry->protocol, 2434 svc = __ip_vs_service_get(AF_INET, entry->protocol,
2244 entry->addr, entry->port); 2435 &addr, entry->port);
2245 if (svc) { 2436 if (svc) {
2246 ip_vs_copy_service(entry, svc); 2437 ip_vs_copy_service(entry, svc);
2247 if (copy_to_user(user, entry, sizeof(*entry)) != 0) 2438 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
@@ -2320,6 +2511,875 @@ static struct nf_sockopt_ops ip_vs_sockopts = {
2320 .owner = THIS_MODULE, 2511 .owner = THIS_MODULE,
2321}; 2512};
2322 2513
2514/*
2515 * Generic Netlink interface
2516 */
2517
2518/* IPVS genetlink family */
2519static struct genl_family ip_vs_genl_family = {
2520 .id = GENL_ID_GENERATE,
2521 .hdrsize = 0,
2522 .name = IPVS_GENL_NAME,
2523 .version = IPVS_GENL_VERSION,
2524 .maxattr = IPVS_CMD_MAX,
2525};
2526
2527/* Policy used for first-level command attributes */
2528static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2529 [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED },
2530 [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED },
2531 [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED },
2532 [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 },
2533 [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2534 [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 },
2535};
2536
2537/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2538static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2539 [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 },
2540 [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING,
2541 .len = IP_VS_IFNAME_MAXLEN },
2542 [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 },
2543};
2544
2545/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2546static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2547 [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 },
2548 [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 },
2549 [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY,
2550 .len = sizeof(union nf_inet_addr) },
2551 [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 },
2552 [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
2553 [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
2554 .len = IP_VS_SCHEDNAME_MAXLEN },
2555 [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
2556 .len = sizeof(struct ip_vs_flags) },
2557 [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
2558 [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 },
2559 [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED },
2560};
2561
2562/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2563static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2564 [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY,
2565 .len = sizeof(union nf_inet_addr) },
2566 [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 },
2567 [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 },
2568 [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 },
2569 [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 },
2570 [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 },
2571 [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 },
2572 [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 },
2573 [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
2574 [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
2575};
2576
2577static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2578 struct ip_vs_stats *stats)
2579{
2580 struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2581 if (!nl_stats)
2582 return -EMSGSIZE;
2583
2584 spin_lock_bh(&stats->lock);
2585
2586 NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
2587 NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
2588 NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
2589 NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
2590 NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
2591 NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
2592 NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
2593 NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
2594 NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
2595 NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
2596
2597 spin_unlock_bh(&stats->lock);
2598
2599 nla_nest_end(skb, nl_stats);
2600
2601 return 0;
2602
2603nla_put_failure:
2604 spin_unlock_bh(&stats->lock);
2605 nla_nest_cancel(skb, nl_stats);
2606 return -EMSGSIZE;
2607}
2608
2609static int ip_vs_genl_fill_service(struct sk_buff *skb,
2610 struct ip_vs_service *svc)
2611{
2612 struct nlattr *nl_service;
2613 struct ip_vs_flags flags = { .flags = svc->flags,
2614 .mask = ~0 };
2615
2616 nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2617 if (!nl_service)
2618 return -EMSGSIZE;
2619
2620 NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
2621
2622 if (svc->fwmark) {
2623 NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
2624 } else {
2625 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
2626 NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
2627 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
2628 }
2629
2630 NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
2631 NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
2632 NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
2633 NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
2634
2635 if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2636 goto nla_put_failure;
2637
2638 nla_nest_end(skb, nl_service);
2639
2640 return 0;
2641
2642nla_put_failure:
2643 nla_nest_cancel(skb, nl_service);
2644 return -EMSGSIZE;
2645}
2646
2647static int ip_vs_genl_dump_service(struct sk_buff *skb,
2648 struct ip_vs_service *svc,
2649 struct netlink_callback *cb)
2650{
2651 void *hdr;
2652
2653 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2654 &ip_vs_genl_family, NLM_F_MULTI,
2655 IPVS_CMD_NEW_SERVICE);
2656 if (!hdr)
2657 return -EMSGSIZE;
2658
2659 if (ip_vs_genl_fill_service(skb, svc) < 0)
2660 goto nla_put_failure;
2661
2662 return genlmsg_end(skb, hdr);
2663
2664nla_put_failure:
2665 genlmsg_cancel(skb, hdr);
2666 return -EMSGSIZE;
2667}
2668
2669static int ip_vs_genl_dump_services(struct sk_buff *skb,
2670 struct netlink_callback *cb)
2671{
2672 int idx = 0, i;
2673 int start = cb->args[0];
2674 struct ip_vs_service *svc;
2675
2676 mutex_lock(&__ip_vs_mutex);
2677 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2678 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2679 if (++idx <= start)
2680 continue;
2681 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2682 idx--;
2683 goto nla_put_failure;
2684 }
2685 }
2686 }
2687
2688 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2689 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2690 if (++idx <= start)
2691 continue;
2692 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2693 idx--;
2694 goto nla_put_failure;
2695 }
2696 }
2697 }
2698
2699nla_put_failure:
2700 mutex_unlock(&__ip_vs_mutex);
2701 cb->args[0] = idx;
2702
2703 return skb->len;
2704}
2705
2706static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
2707 struct nlattr *nla, int full_entry)
2708{
2709 struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
2710 struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
2711
2712 /* Parse mandatory identifying service fields first */
2713 if (nla == NULL ||
2714 nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
2715 return -EINVAL;
2716
2717 nla_af = attrs[IPVS_SVC_ATTR_AF];
2718 nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL];
2719 nla_addr = attrs[IPVS_SVC_ATTR_ADDR];
2720 nla_port = attrs[IPVS_SVC_ATTR_PORT];
2721 nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK];
2722
2723 if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
2724 return -EINVAL;
2725
2726 usvc->af = nla_get_u16(nla_af);
2727#ifdef CONFIG_IP_VS_IPV6
2728 if (usvc->af != AF_INET && usvc->af != AF_INET6)
2729#else
2730 if (usvc->af != AF_INET)
2731#endif
2732 return -EAFNOSUPPORT;
2733
2734 if (nla_fwmark) {
2735 usvc->protocol = IPPROTO_TCP;
2736 usvc->fwmark = nla_get_u32(nla_fwmark);
2737 } else {
2738 usvc->protocol = nla_get_u16(nla_protocol);
2739 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
2740 usvc->port = nla_get_u16(nla_port);
2741 usvc->fwmark = 0;
2742 }
2743
2744 /* If a full entry was requested, check for the additional fields */
2745 if (full_entry) {
2746 struct nlattr *nla_sched, *nla_flags, *nla_timeout,
2747 *nla_netmask;
2748 struct ip_vs_flags flags;
2749 struct ip_vs_service *svc;
2750
2751 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
2752 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
2753 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
2754 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
2755
2756 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
2757 return -EINVAL;
2758
2759 nla_memcpy(&flags, nla_flags, sizeof(flags));
2760
2761 /* prefill flags from service if it already exists */
2762 if (usvc->fwmark)
2763 svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark);
2764 else
2765 svc = __ip_vs_service_get(usvc->af, usvc->protocol,
2766 &usvc->addr, usvc->port);
2767 if (svc) {
2768 usvc->flags = svc->flags;
2769 ip_vs_service_put(svc);
2770 } else
2771 usvc->flags = 0;
2772
2773 /* set new flags from userland */
2774 usvc->flags = (usvc->flags & ~flags.mask) |
2775 (flags.flags & flags.mask);
2776 usvc->sched_name = nla_data(nla_sched);
2777 usvc->timeout = nla_get_u32(nla_timeout);
2778 usvc->netmask = nla_get_u32(nla_netmask);
2779 }
2780
2781 return 0;
2782}
2783
2784static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
2785{
2786 struct ip_vs_service_user_kern usvc;
2787 int ret;
2788
2789 ret = ip_vs_genl_parse_service(&usvc, nla, 0);
2790 if (ret)
2791 return ERR_PTR(ret);
2792
2793 if (usvc.fwmark)
2794 return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);
2795 else
2796 return __ip_vs_service_get(usvc.af, usvc.protocol,
2797 &usvc.addr, usvc.port);
2798}
2799
/*
 * Fill a nested IPVS_CMD_ATTR_DEST attribute with the parameters and
 * counters of one real server (destination).
 *
 * Returns 0 on success or -EMSGSIZE when the skb runs out of room; on
 * failure the partially-built nest is cancelled.  The NLA_PUT* macros
 * jump to the nla_put_failure label on error.
 */
static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
{
	struct nlattr *nl_dest;

	nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
	if (!nl_dest)
		return -EMSGSIZE;

	NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
	NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);

	/* Only the forwarding-method bits of conn_flags are exported */
	NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
		    atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
	NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
	NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
	NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
	NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
		    atomic_read(&dest->activeconns));
	NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
		    atomic_read(&dest->inactconns));
	NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
		    atomic_read(&dest->persistconns));

	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
		goto nla_put_failure;

	nla_nest_end(skb, nl_dest);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_dest);
	return -EMSGSIZE;
}
2834
2835static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
2836 struct netlink_callback *cb)
2837{
2838 void *hdr;
2839
2840 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2841 &ip_vs_genl_family, NLM_F_MULTI,
2842 IPVS_CMD_NEW_DEST);
2843 if (!hdr)
2844 return -EMSGSIZE;
2845
2846 if (ip_vs_genl_fill_dest(skb, dest) < 0)
2847 goto nla_put_failure;
2848
2849 return genlmsg_end(skb, hdr);
2850
2851nla_put_failure:
2852 genlmsg_cancel(skb, hdr);
2853 return -EMSGSIZE;
2854}
2855
/*
 * Netlink dump callback for IPVS_CMD_GET_DEST: emit one message per
 * destination of the service identified by the request's nested
 * service attribute.
 *
 * cb->args[0] records how many destinations have already been dumped,
 * so a resumed dump skips the entries sent by earlier invocations.
 * Always returns skb->len, as dump callbacks must.
 */
static int ip_vs_genl_dump_dests(struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	int idx = 0;
	int start = cb->args[0];
	struct ip_vs_service *svc;
	struct ip_vs_dest *dest;
	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];

	mutex_lock(&__ip_vs_mutex);

	/* Try to find the service for which to dump destinations */
	if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
			IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
		goto out_err;

	svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
	if (IS_ERR(svc) || svc == NULL)
		goto out_err;

	/* Dump the destinations */
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (++idx <= start)
			continue;
		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
			/* skb full: back up so this entry is retried
			 * on the next dump invocation */
			idx--;
			goto nla_put_failure;
		}
	}

	/* Fall-through from the loop is intentional: on both success and
	 * overflow we record progress and drop the service reference. */
nla_put_failure:
	cb->args[0] = idx;
	ip_vs_service_put(svc);

out_err:
	mutex_unlock(&__ip_vs_mutex);

	return skb->len;
}
2895
2896static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
2897 struct nlattr *nla, int full_entry)
2898{
2899 struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
2900 struct nlattr *nla_addr, *nla_port;
2901
2902 /* Parse mandatory identifying destination fields first */
2903 if (nla == NULL ||
2904 nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
2905 return -EINVAL;
2906
2907 nla_addr = attrs[IPVS_DEST_ATTR_ADDR];
2908 nla_port = attrs[IPVS_DEST_ATTR_PORT];
2909
2910 if (!(nla_addr && nla_port))
2911 return -EINVAL;
2912
2913 nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
2914 udest->port = nla_get_u16(nla_port);
2915
2916 /* If a full entry was requested, check for the additional fields */
2917 if (full_entry) {
2918 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
2919 *nla_l_thresh;
2920
2921 nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
2922 nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
2923 nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
2924 nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
2925
2926 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
2927 return -EINVAL;
2928
2929 udest->conn_flags = nla_get_u32(nla_fwd)
2930 & IP_VS_CONN_F_FWD_MASK;
2931 udest->weight = nla_get_u32(nla_weight);
2932 udest->u_threshold = nla_get_u32(nla_u_thresh);
2933 udest->l_threshold = nla_get_u32(nla_l_thresh);
2934 }
2935
2936 return 0;
2937}
2938
/*
 * Fill a nested IPVS_CMD_ATTR_DAEMON attribute describing one sync
 * daemon (state, multicast interface name and sync ID).
 *
 * Returns 0 on success or -EMSGSIZE when the skb runs out of room; the
 * NLA_PUT* macros jump to the nla_put_failure label on error, where the
 * partially-built nest is cancelled.
 */
static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
				  const char *mcast_ifn, __be32 syncid)
{
	struct nlattr *nl_daemon;

	nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
	if (!nl_daemon)
		return -EMSGSIZE;

	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
	NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);

	nla_nest_end(skb, nl_daemon);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_daemon);
	return -EMSGSIZE;
}
2960
2961static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
2962 const char *mcast_ifn, __be32 syncid,
2963 struct netlink_callback *cb)
2964{
2965 void *hdr;
2966 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2967 &ip_vs_genl_family, NLM_F_MULTI,
2968 IPVS_CMD_NEW_DAEMON);
2969 if (!hdr)
2970 return -EMSGSIZE;
2971
2972 if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
2973 goto nla_put_failure;
2974
2975 return genlmsg_end(skb, hdr);
2976
2977nla_put_failure:
2978 genlmsg_cancel(skb, hdr);
2979 return -EMSGSIZE;
2980}
2981
/*
 * Netlink dump callback for IPVS_CMD_GET_DAEMON: emit one message for
 * the master sync daemon and one for the backup daemon, if running.
 *
 * cb->args[0] / cb->args[1] record whether the master / backup entry
 * has already been sent, so a resumed dump does not duplicate them.
 * Always returns skb->len, as dump callbacks must.
 */
static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
				   struct netlink_callback *cb)
{
	mutex_lock(&__ip_vs_mutex);
	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
					   ip_vs_master_mcast_ifn,
					   ip_vs_master_syncid, cb) < 0)
			goto nla_put_failure;

		cb->args[0] = 1;
	}

	if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
					   ip_vs_backup_mcast_ifn,
					   ip_vs_backup_syncid, cb) < 0)
			goto nla_put_failure;

		cb->args[1] = 1;
	}

	/* Fall-through on success is intentional: unlock and return the
	 * amount of data placed in the skb in either case. */
nla_put_failure:
	mutex_unlock(&__ip_vs_mutex);

	return skb->len;
}
3009
3010static int ip_vs_genl_new_daemon(struct nlattr **attrs)
3011{
3012 if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3013 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3014 attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3015 return -EINVAL;
3016
3017 return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3018 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3019 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3020}
3021
3022static int ip_vs_genl_del_daemon(struct nlattr **attrs)
3023{
3024 if (!attrs[IPVS_DAEMON_ATTR_STATE])
3025 return -EINVAL;
3026
3027 return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3028}
3029
3030static int ip_vs_genl_set_config(struct nlattr **attrs)
3031{
3032 struct ip_vs_timeout_user t;
3033
3034 __ip_vs_get_timeouts(&t);
3035
3036 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3037 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3038
3039 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3040 t.tcp_fin_timeout =
3041 nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3042
3043 if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3044 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3045
3046 return ip_vs_set_timeout(&t);
3047}
3048
/*
 * Single .doit handler for all state-changing Generic Netlink commands
 * (FLUSH, SET_CONFIG, daemon start/stop, service and destination
 * add/edit/delete, ZERO).
 *
 * Commands that need no service argument are dispatched first and jump
 * straight to the unlock path.  For the rest, the service attribute is
 * parsed (fully for NEW/SET_SERVICE, identifying members only
 * otherwise), the service is looked up with a reference held, and the
 * reference is dropped again at "out".  All work runs under
 * __ip_vs_mutex.
 *
 * Returns 0 on success or a negative errno.
 */
static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct ip_vs_service *svc = NULL;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_dest_user_kern udest;
	int ret = 0, cmd;
	int need_full_svc = 0, need_full_dest = 0;

	cmd = info->genlhdr->cmd;

	mutex_lock(&__ip_vs_mutex);

	/* Commands that do not take a service argument */
	if (cmd == IPVS_CMD_FLUSH) {
		ret = ip_vs_flush();
		goto out;
	} else if (cmd == IPVS_CMD_SET_CONFIG) {
		ret = ip_vs_genl_set_config(info->attrs);
		goto out;
	} else if (cmd == IPVS_CMD_NEW_DAEMON ||
		   cmd == IPVS_CMD_DEL_DAEMON) {

		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];

		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
		    nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
				     info->attrs[IPVS_CMD_ATTR_DAEMON],
				     ip_vs_daemon_policy)) {
			ret = -EINVAL;
			goto out;
		}

		if (cmd == IPVS_CMD_NEW_DAEMON)
			ret = ip_vs_genl_new_daemon(daemon_attrs);
		else
			ret = ip_vs_genl_del_daemon(daemon_attrs);
		goto out;
	} else if (cmd == IPVS_CMD_ZERO &&
		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
		/* ZERO without a service attribute zeroes all counters */
		ret = ip_vs_zero_all();
		goto out;
	}

	/* All following commands require a service argument, so check if we
	 * received a valid one. We need a full service specification when
	 * adding / editing a service. Only identifying members otherwise. */
	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
		need_full_svc = 1;

	ret = ip_vs_genl_parse_service(&usvc,
				       info->attrs[IPVS_CMD_ATTR_SERVICE],
				       need_full_svc);
	if (ret)
		goto out;

	/* Lookup the exact service by <protocol, addr, port> or fwmark */
	if (usvc.fwmark == 0)
		svc = __ip_vs_service_get(usvc.af, usvc.protocol,
					  &usvc.addr, usvc.port);
	else
		svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark);

	/* Unless we're adding a new service, the service must already exist */
	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
		ret = -ESRCH;
		goto out;
	}

	/* Destination commands require a valid destination argument. For
	 * adding / editing a destination, we need a full destination
	 * specification. */
	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
	    cmd == IPVS_CMD_DEL_DEST) {
		if (cmd != IPVS_CMD_DEL_DEST)
			need_full_dest = 1;

		ret = ip_vs_genl_parse_dest(&udest,
					    info->attrs[IPVS_CMD_ATTR_DEST],
					    need_full_dest);
		if (ret)
			goto out;
	}

	switch (cmd) {
	case IPVS_CMD_NEW_SERVICE:
		if (svc == NULL)
			/* ip_vs_add_service() takes a reference on the new
			 * service in svc, dropped at "out" below */
			ret = ip_vs_add_service(&usvc, &svc);
		else
			ret = -EEXIST;
		break;
	case IPVS_CMD_SET_SERVICE:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IPVS_CMD_DEL_SERVICE:
		ret = ip_vs_del_service(svc);
		break;
	case IPVS_CMD_NEW_DEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IPVS_CMD_SET_DEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IPVS_CMD_DEL_DEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	case IPVS_CMD_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	default:
		ret = -EINVAL;
	}

out:
	/* Drop the lookup (or add) reference, if any */
	if (svc)
		ip_vs_service_put(svc);
	mutex_unlock(&__ip_vs_mutex);

	return ret;
}
3167
/*
 * Single .doit handler for the non-dump GET commands (GET_SERVICE,
 * GET_INFO, GET_CONFIG).  Builds a unicast reply message carrying the
 * corresponding SET/NEW command so userspace can reuse its parser.
 *
 * Exit paths: "nla_put_failure" (message overflow, falls through to
 * free the skb), "out_err" (free the skb, ret already set) and "out"
 * (success, skb handed off to genlmsg_unicast).  All work runs under
 * __ip_vs_mutex; the NLA_PUT* macros jump to nla_put_failure on error.
 *
 * Returns 0 on success or a negative errno.
 */
static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *reply;
	int ret, cmd, reply_cmd;

	cmd = info->genlhdr->cmd;

	if (cmd == IPVS_CMD_GET_SERVICE)
		reply_cmd = IPVS_CMD_NEW_SERVICE;
	else if (cmd == IPVS_CMD_GET_INFO)
		reply_cmd = IPVS_CMD_SET_INFO;
	else if (cmd == IPVS_CMD_GET_CONFIG)
		reply_cmd = IPVS_CMD_SET_CONFIG;
	else {
		IP_VS_ERR("unknown Generic Netlink command\n");
		return -EINVAL;
	}

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	mutex_lock(&__ip_vs_mutex);

	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
	if (reply == NULL)
		goto nla_put_failure;

	switch (cmd) {
	case IPVS_CMD_GET_SERVICE:
	{
		struct ip_vs_service *svc;

		svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
		if (IS_ERR(svc)) {
			ret = PTR_ERR(svc);
			goto out_err;
		} else if (svc) {
			ret = ip_vs_genl_fill_service(msg, svc);
			ip_vs_service_put(svc);
			if (ret)
				goto nla_put_failure;
		} else {
			ret = -ESRCH;
			goto out_err;
		}

		break;
	}

	case IPVS_CMD_GET_CONFIG:
	{
		struct ip_vs_timeout_user t;

		/* Report the current timeouts for each compiled-in proto */
		__ip_vs_get_timeouts(&t);
#ifdef CONFIG_IP_VS_PROTO_TCP
		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
			    t.tcp_fin_timeout);
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
#endif

		break;
	}

	case IPVS_CMD_GET_INFO:
		NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
		NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
			    IP_VS_CONN_TAB_SIZE);
		break;
	}

	genlmsg_end(msg, reply);
	ret = genlmsg_unicast(msg, info->snd_pid);
	goto out;

nla_put_failure:
	IP_VS_ERR("not enough space in Netlink message\n");
	ret = -EMSGSIZE;

	/* fall through: free the unfinished message */
out_err:
	nlmsg_free(msg);
out:
	mutex_unlock(&__ip_vs_mutex);

	return ret;
}
3258
3259
/*
 * Generic Netlink operations table.  All state-changing commands
 * (NEW/SET/DEL/ZERO/FLUSH/SET_CONFIG and daemon control) share the
 * ip_vs_genl_set_cmd() handler; GET commands use ip_vs_genl_get_cmd()
 * or a dedicated dump callback.  Every command requires CAP_NET_ADMIN
 * (GENL_ADMIN_PERM).
 */
static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
	{
		.cmd	= IPVS_CMD_NEW_SERVICE,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_SET_SERVICE,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_SERVICE,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_SERVICE,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
		.dumpit	= ip_vs_genl_dump_services,
		.policy	= ip_vs_cmd_policy,
	},
	{
		.cmd	= IPVS_CMD_NEW_DEST,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_SET_DEST,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_DEST,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_DEST,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.dumpit	= ip_vs_genl_dump_dests,
	},
	{
		.cmd	= IPVS_CMD_NEW_DAEMON,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_DAEMON,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_DAEMON,
		.flags	= GENL_ADMIN_PERM,
		.dumpit	= ip_vs_genl_dump_daemons,
	},
	{
		.cmd	= IPVS_CMD_SET_CONFIG,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_CONFIG,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_INFO,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
	},
	{
		.cmd	= IPVS_CMD_ZERO,
		.flags	= GENL_ADMIN_PERM,
		.policy	= ip_vs_cmd_policy,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_FLUSH,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
};
3355
3356static int __init ip_vs_genl_register(void)
3357{
3358 int ret, i;
3359
3360 ret = genl_register_family(&ip_vs_genl_family);
3361 if (ret)
3362 return ret;
3363
3364 for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) {
3365 ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]);
3366 if (ret)
3367 goto err_out;
3368 }
3369 return 0;
3370
3371err_out:
3372 genl_unregister_family(&ip_vs_genl_family);
3373 return ret;
3374}
3375
/* Unregister the IPVS Generic Netlink family (detaches all its ops). */
static void ip_vs_genl_unregister(void)
{
	genl_unregister_family(&ip_vs_genl_family);
}
3380
3381/* End of Generic Netlink interface definitions */
3382
2323 3383
2324int __init ip_vs_control_init(void) 3384int __init ip_vs_control_init(void)
2325{ 3385{
@@ -2334,6 +3394,13 @@ int __init ip_vs_control_init(void)
2334 return ret; 3394 return ret;
2335 } 3395 }
2336 3396
3397 ret = ip_vs_genl_register();
3398 if (ret) {
3399 IP_VS_ERR("cannot register Generic Netlink interface.\n");
3400 nf_unregister_sockopt(&ip_vs_sockopts);
3401 return ret;
3402 }
3403
2337 proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); 3404 proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
2338 proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); 3405 proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
2339 3406
@@ -2368,6 +3435,7 @@ void ip_vs_control_cleanup(void)
2368 unregister_sysctl_table(sysctl_header); 3435 unregister_sysctl_table(sysctl_header);
2369 proc_net_remove(&init_net, "ip_vs_stats"); 3436 proc_net_remove(&init_net, "ip_vs_stats");
2370 proc_net_remove(&init_net, "ip_vs"); 3437 proc_net_remove(&init_net, "ip_vs");
3438 ip_vs_genl_unregister();
2371 nf_unregister_sockopt(&ip_vs_sockopts); 3439 nf_unregister_sockopt(&ip_vs_sockopts);
2372 LeaveFunction(2); 3440 LeaveFunction(2);
2373} 3441}
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
index fa66824d264f..a16943fd72f1 100644
--- a/net/ipv4/ipvs/ip_vs_dh.c
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -218,7 +218,7 @@ ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
218 IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " 218 IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
219 "--> server %u.%u.%u.%u:%d\n", 219 "--> server %u.%u.%u.%u:%d\n",
220 NIPQUAD(iph->daddr), 220 NIPQUAD(iph->daddr),
221 NIPQUAD(dest->addr), 221 NIPQUAD(dest->addr.ip),
222 ntohs(dest->port)); 222 ntohs(dest->port));
223 223
224 return dest; 224 return dest;
@@ -234,6 +234,9 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler =
234 .refcnt = ATOMIC_INIT(0), 234 .refcnt = ATOMIC_INIT(0),
235 .module = THIS_MODULE, 235 .module = THIS_MODULE,
236 .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), 236 .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
237#ifdef CONFIG_IP_VS_IPV6
238 .supports_ipv6 = 0,
239#endif
237 .init_service = ip_vs_dh_init_svc, 240 .init_service = ip_vs_dh_init_svc,
238 .done_service = ip_vs_dh_done_svc, 241 .done_service = ip_vs_dh_done_svc,
239 .update_service = ip_vs_dh_update_svc, 242 .update_service = ip_vs_dh_update_svc,
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
index 5a20f93bd7f9..2eb2860dabb5 100644
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -65,37 +65,37 @@ static void estimation_timer(unsigned long arg)
65 s = container_of(e, struct ip_vs_stats, est); 65 s = container_of(e, struct ip_vs_stats, est);
66 66
67 spin_lock(&s->lock); 67 spin_lock(&s->lock);
68 n_conns = s->conns; 68 n_conns = s->ustats.conns;
69 n_inpkts = s->inpkts; 69 n_inpkts = s->ustats.inpkts;
70 n_outpkts = s->outpkts; 70 n_outpkts = s->ustats.outpkts;
71 n_inbytes = s->inbytes; 71 n_inbytes = s->ustats.inbytes;
72 n_outbytes = s->outbytes; 72 n_outbytes = s->ustats.outbytes;
73 73
74 /* scaled by 2^10, but divided 2 seconds */ 74 /* scaled by 2^10, but divided 2 seconds */
75 rate = (n_conns - e->last_conns)<<9; 75 rate = (n_conns - e->last_conns)<<9;
76 e->last_conns = n_conns; 76 e->last_conns = n_conns;
77 e->cps += ((long)rate - (long)e->cps)>>2; 77 e->cps += ((long)rate - (long)e->cps)>>2;
78 s->cps = (e->cps+0x1FF)>>10; 78 s->ustats.cps = (e->cps+0x1FF)>>10;
79 79
80 rate = (n_inpkts - e->last_inpkts)<<9; 80 rate = (n_inpkts - e->last_inpkts)<<9;
81 e->last_inpkts = n_inpkts; 81 e->last_inpkts = n_inpkts;
82 e->inpps += ((long)rate - (long)e->inpps)>>2; 82 e->inpps += ((long)rate - (long)e->inpps)>>2;
83 s->inpps = (e->inpps+0x1FF)>>10; 83 s->ustats.inpps = (e->inpps+0x1FF)>>10;
84 84
85 rate = (n_outpkts - e->last_outpkts)<<9; 85 rate = (n_outpkts - e->last_outpkts)<<9;
86 e->last_outpkts = n_outpkts; 86 e->last_outpkts = n_outpkts;
87 e->outpps += ((long)rate - (long)e->outpps)>>2; 87 e->outpps += ((long)rate - (long)e->outpps)>>2;
88 s->outpps = (e->outpps+0x1FF)>>10; 88 s->ustats.outpps = (e->outpps+0x1FF)>>10;
89 89
90 rate = (n_inbytes - e->last_inbytes)<<4; 90 rate = (n_inbytes - e->last_inbytes)<<4;
91 e->last_inbytes = n_inbytes; 91 e->last_inbytes = n_inbytes;
92 e->inbps += ((long)rate - (long)e->inbps)>>2; 92 e->inbps += ((long)rate - (long)e->inbps)>>2;
93 s->inbps = (e->inbps+0xF)>>5; 93 s->ustats.inbps = (e->inbps+0xF)>>5;
94 94
95 rate = (n_outbytes - e->last_outbytes)<<4; 95 rate = (n_outbytes - e->last_outbytes)<<4;
96 e->last_outbytes = n_outbytes; 96 e->last_outbytes = n_outbytes;
97 e->outbps += ((long)rate - (long)e->outbps)>>2; 97 e->outbps += ((long)rate - (long)e->outbps)>>2;
98 s->outbps = (e->outbps+0xF)>>5; 98 s->ustats.outbps = (e->outbps+0xF)>>5;
99 spin_unlock(&s->lock); 99 spin_unlock(&s->lock);
100 } 100 }
101 spin_unlock(&est_lock); 101 spin_unlock(&est_lock);
@@ -108,24 +108,22 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats)
108 108
109 INIT_LIST_HEAD(&est->list); 109 INIT_LIST_HEAD(&est->list);
110 110
111 est->last_conns = stats->conns; 111 est->last_conns = stats->ustats.conns;
112 est->cps = stats->cps<<10; 112 est->cps = stats->ustats.cps<<10;
113 113
114 est->last_inpkts = stats->inpkts; 114 est->last_inpkts = stats->ustats.inpkts;
115 est->inpps = stats->inpps<<10; 115 est->inpps = stats->ustats.inpps<<10;
116 116
117 est->last_outpkts = stats->outpkts; 117 est->last_outpkts = stats->ustats.outpkts;
118 est->outpps = stats->outpps<<10; 118 est->outpps = stats->ustats.outpps<<10;
119 119
120 est->last_inbytes = stats->inbytes; 120 est->last_inbytes = stats->ustats.inbytes;
121 est->inbps = stats->inbps<<5; 121 est->inbps = stats->ustats.inbps<<5;
122 122
123 est->last_outbytes = stats->outbytes; 123 est->last_outbytes = stats->ustats.outbytes;
124 est->outbps = stats->outbps<<5; 124 est->outbps = stats->ustats.outbps<<5;
125 125
126 spin_lock_bh(&est_lock); 126 spin_lock_bh(&est_lock);
127 if (list_empty(&est_list))
128 mod_timer(&est_timer, jiffies + 2 * HZ);
129 list_add(&est->list, &est_list); 127 list_add(&est->list, &est_list);
130 spin_unlock_bh(&est_lock); 128 spin_unlock_bh(&est_lock);
131} 129}
@@ -136,11 +134,6 @@ void ip_vs_kill_estimator(struct ip_vs_stats *stats)
136 134
137 spin_lock_bh(&est_lock); 135 spin_lock_bh(&est_lock);
138 list_del(&est->list); 136 list_del(&est->list);
139 while (list_empty(&est_list) && try_to_del_timer_sync(&est_timer) < 0) {
140 spin_unlock_bh(&est_lock);
141 cpu_relax();
142 spin_lock_bh(&est_lock);
143 }
144 spin_unlock_bh(&est_lock); 137 spin_unlock_bh(&est_lock);
145} 138}
146 139
@@ -160,3 +153,14 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats)
160 est->inbps = 0; 153 est->inbps = 0;
161 est->outbps = 0; 154 est->outbps = 0;
162} 155}
156
157int __init ip_vs_estimator_init(void)
158{
159 mod_timer(&est_timer, jiffies + 2 * HZ);
160 return 0;
161}
162
163void ip_vs_estimator_cleanup(void)
164{
165 del_timer_sync(&est_timer);
166}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
index c1c758e4f733..2e7dbd8b73a4 100644
--- a/net/ipv4/ipvs/ip_vs_ftp.c
+++ b/net/ipv4/ipvs/ip_vs_ftp.c
@@ -140,13 +140,21 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
140 struct tcphdr *th; 140 struct tcphdr *th;
141 char *data, *data_limit; 141 char *data, *data_limit;
142 char *start, *end; 142 char *start, *end;
143 __be32 from; 143 union nf_inet_addr from;
144 __be16 port; 144 __be16 port;
145 struct ip_vs_conn *n_cp; 145 struct ip_vs_conn *n_cp;
146 char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ 146 char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
147 unsigned buf_len; 147 unsigned buf_len;
148 int ret; 148 int ret;
149 149
150#ifdef CONFIG_IP_VS_IPV6
151 /* This application helper doesn't work with IPv6 yet,
152 * so turn this into a no-op for IPv6 packets
153 */
154 if (cp->af == AF_INET6)
155 return 1;
156#endif
157
150 *diff = 0; 158 *diff = 0;
151 159
152 /* Only useful for established sessions */ 160 /* Only useful for established sessions */
@@ -166,24 +174,25 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
166 if (ip_vs_ftp_get_addrport(data, data_limit, 174 if (ip_vs_ftp_get_addrport(data, data_limit,
167 SERVER_STRING, 175 SERVER_STRING,
168 sizeof(SERVER_STRING)-1, ')', 176 sizeof(SERVER_STRING)-1, ')',
169 &from, &port, 177 &from.ip, &port,
170 &start, &end) != 1) 178 &start, &end) != 1)
171 return 1; 179 return 1;
172 180
173 IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " 181 IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> "
174 "%u.%u.%u.%u:%d detected\n", 182 "%u.%u.%u.%u:%d detected\n",
175 NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0); 183 NIPQUAD(from.ip), ntohs(port),
184 NIPQUAD(cp->caddr.ip), 0);
176 185
177 /* 186 /*
178 * Now update or create an connection entry for it 187 * Now update or create an connection entry for it
179 */ 188 */
180 n_cp = ip_vs_conn_out_get(iph->protocol, from, port, 189 n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
181 cp->caddr, 0); 190 &cp->caddr, 0);
182 if (!n_cp) { 191 if (!n_cp) {
183 n_cp = ip_vs_conn_new(IPPROTO_TCP, 192 n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
184 cp->caddr, 0, 193 &cp->caddr, 0,
185 cp->vaddr, port, 194 &cp->vaddr, port,
186 from, port, 195 &from, port,
187 IP_VS_CONN_F_NO_CPORT, 196 IP_VS_CONN_F_NO_CPORT,
188 cp->dest); 197 cp->dest);
189 if (!n_cp) 198 if (!n_cp)
@@ -196,9 +205,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
196 /* 205 /*
197 * Replace the old passive address with the new one 206 * Replace the old passive address with the new one
198 */ 207 */
199 from = n_cp->vaddr; 208 from.ip = n_cp->vaddr.ip;
200 port = n_cp->vport; 209 port = n_cp->vport;
201 sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), 210 sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip),
202 (ntohs(port)>>8)&255, ntohs(port)&255); 211 (ntohs(port)>>8)&255, ntohs(port)&255);
203 buf_len = strlen(buf); 212 buf_len = strlen(buf);
204 213
@@ -243,10 +252,18 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
243 struct tcphdr *th; 252 struct tcphdr *th;
244 char *data, *data_start, *data_limit; 253 char *data, *data_start, *data_limit;
245 char *start, *end; 254 char *start, *end;
246 __be32 to; 255 union nf_inet_addr to;
247 __be16 port; 256 __be16 port;
248 struct ip_vs_conn *n_cp; 257 struct ip_vs_conn *n_cp;
249 258
259#ifdef CONFIG_IP_VS_IPV6
260 /* This application helper doesn't work with IPv6 yet,
261 * so turn this into a no-op for IPv6 packets
262 */
263 if (cp->af == AF_INET6)
264 return 1;
265#endif
266
250 /* no diff required for incoming packets */ 267 /* no diff required for incoming packets */
251 *diff = 0; 268 *diff = 0;
252 269
@@ -291,12 +308,12 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
291 */ 308 */
292 if (ip_vs_ftp_get_addrport(data_start, data_limit, 309 if (ip_vs_ftp_get_addrport(data_start, data_limit,
293 CLIENT_STRING, sizeof(CLIENT_STRING)-1, 310 CLIENT_STRING, sizeof(CLIENT_STRING)-1,
294 '\r', &to, &port, 311 '\r', &to.ip, &port,
295 &start, &end) != 1) 312 &start, &end) != 1)
296 return 1; 313 return 1;
297 314
298 IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", 315 IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n",
299 NIPQUAD(to), ntohs(port)); 316 NIPQUAD(to.ip), ntohs(port));
300 317
301 /* Passive mode off */ 318 /* Passive mode off */
302 cp->app_data = NULL; 319 cp->app_data = NULL;
@@ -306,16 +323,16 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
306 */ 323 */
307 IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", 324 IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
308 ip_vs_proto_name(iph->protocol), 325 ip_vs_proto_name(iph->protocol),
309 NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0); 326 NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0);
310 327
311 n_cp = ip_vs_conn_in_get(iph->protocol, 328 n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
312 to, port, 329 &to, port,
313 cp->vaddr, htons(ntohs(cp->vport)-1)); 330 &cp->vaddr, htons(ntohs(cp->vport)-1));
314 if (!n_cp) { 331 if (!n_cp) {
315 n_cp = ip_vs_conn_new(IPPROTO_TCP, 332 n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
316 to, port, 333 &to, port,
317 cp->vaddr, htons(ntohs(cp->vport)-1), 334 &cp->vaddr, htons(ntohs(cp->vport)-1),
318 cp->daddr, htons(ntohs(cp->dport)-1), 335 &cp->daddr, htons(ntohs(cp->dport)-1),
319 0, 336 0,
320 cp->dest); 337 cp->dest);
321 if (!n_cp) 338 if (!n_cp)
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index 7a6a319f544a..6ecef3518cac 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -96,7 +96,6 @@ struct ip_vs_lblc_entry {
96 * IPVS lblc hash table 96 * IPVS lblc hash table
97 */ 97 */
98struct ip_vs_lblc_table { 98struct ip_vs_lblc_table {
99 rwlock_t lock; /* lock for this table */
100 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ 99 struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
101 atomic_t entries; /* number of entries */ 100 atomic_t entries; /* number of entries */
102 int max_size; /* maximum size of entries */ 101 int max_size; /* maximum size of entries */
@@ -123,31 +122,6 @@ static ctl_table vs_vars_table[] = {
123 122
124static struct ctl_table_header * sysctl_header; 123static struct ctl_table_header * sysctl_header;
125 124
126/*
127 * new/free a ip_vs_lblc_entry, which is a mapping of a destionation
128 * IP address to a server.
129 */
130static inline struct ip_vs_lblc_entry *
131ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest)
132{
133 struct ip_vs_lblc_entry *en;
134
135 en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
136 if (en == NULL) {
137 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
138 return NULL;
139 }
140
141 INIT_LIST_HEAD(&en->list);
142 en->addr = daddr;
143
144 atomic_inc(&dest->refcnt);
145 en->dest = dest;
146
147 return en;
148}
149
150
151static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) 125static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
152{ 126{
153 list_del(&en->list); 127 list_del(&en->list);
@@ -173,55 +147,66 @@ static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
173 * Hash an entry in the ip_vs_lblc_table. 147 * Hash an entry in the ip_vs_lblc_table.
174 * returns bool success. 148 * returns bool success.
175 */ 149 */
176static int 150static void
177ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) 151ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
178{ 152{
179 unsigned hash; 153 unsigned hash = ip_vs_lblc_hashkey(en->addr);
180
181 if (!list_empty(&en->list)) {
182 IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
183 "called from %p\n", __builtin_return_address(0));
184 return 0;
185 }
186
187 /*
188 * Hash by destination IP address
189 */
190 hash = ip_vs_lblc_hashkey(en->addr);
191 154
192 write_lock(&tbl->lock);
193 list_add(&en->list, &tbl->bucket[hash]); 155 list_add(&en->list, &tbl->bucket[hash]);
194 atomic_inc(&tbl->entries); 156 atomic_inc(&tbl->entries);
195 write_unlock(&tbl->lock);
196
197 return 1;
198} 157}
199 158
200 159
201/* 160/*
202 * Get ip_vs_lblc_entry associated with supplied parameters. 161 * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
162 * lock
203 */ 163 */
204static inline struct ip_vs_lblc_entry * 164static inline struct ip_vs_lblc_entry *
205ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) 165ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
206{ 166{
207 unsigned hash; 167 unsigned hash = ip_vs_lblc_hashkey(addr);
208 struct ip_vs_lblc_entry *en; 168 struct ip_vs_lblc_entry *en;
209 169
210 hash = ip_vs_lblc_hashkey(addr); 170 list_for_each_entry(en, &tbl->bucket[hash], list)
171 if (en->addr == addr)
172 return en;
211 173
212 read_lock(&tbl->lock); 174 return NULL;
175}
213 176
214 list_for_each_entry(en, &tbl->bucket[hash], list) { 177
215 if (en->addr == addr) { 178/*
216 /* HIT */ 179 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
217 read_unlock(&tbl->lock); 180 * address to a server. Called under write lock.
218 return en; 181 */
182static inline struct ip_vs_lblc_entry *
183ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
184 struct ip_vs_dest *dest)
185{
186 struct ip_vs_lblc_entry *en;
187
188 en = ip_vs_lblc_get(tbl, daddr);
189 if (!en) {
190 en = kmalloc(sizeof(*en), GFP_ATOMIC);
191 if (!en) {
192 IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
193 return NULL;
219 } 194 }
220 }
221 195
222 read_unlock(&tbl->lock); 196 en->addr = daddr;
197 en->lastuse = jiffies;
223 198
224 return NULL; 199 atomic_inc(&dest->refcnt);
200 en->dest = dest;
201
202 ip_vs_lblc_hash(tbl, en);
203 } else if (en->dest != dest) {
204 atomic_dec(&en->dest->refcnt);
205 atomic_inc(&dest->refcnt);
206 en->dest = dest;
207 }
208
209 return en;
225} 210}
226 211
227 212
@@ -230,30 +215,29 @@ ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
230 */ 215 */
231static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) 216static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
232{ 217{
233 int i;
234 struct ip_vs_lblc_entry *en, *nxt; 218 struct ip_vs_lblc_entry *en, *nxt;
219 int i;
235 220
236 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 221 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
237 write_lock(&tbl->lock);
238 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { 222 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
239 ip_vs_lblc_free(en); 223 ip_vs_lblc_free(en);
240 atomic_dec(&tbl->entries); 224 atomic_dec(&tbl->entries);
241 } 225 }
242 write_unlock(&tbl->lock);
243 } 226 }
244} 227}
245 228
246 229
247static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) 230static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
248{ 231{
232 struct ip_vs_lblc_table *tbl = svc->sched_data;
233 struct ip_vs_lblc_entry *en, *nxt;
249 unsigned long now = jiffies; 234 unsigned long now = jiffies;
250 int i, j; 235 int i, j;
251 struct ip_vs_lblc_entry *en, *nxt;
252 236
253 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 237 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
254 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 238 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
255 239
256 write_lock(&tbl->lock); 240 write_lock(&svc->sched_lock);
257 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 241 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
258 if (time_before(now, 242 if (time_before(now,
259 en->lastuse + sysctl_ip_vs_lblc_expiration)) 243 en->lastuse + sysctl_ip_vs_lblc_expiration))
@@ -262,7 +246,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
262 ip_vs_lblc_free(en); 246 ip_vs_lblc_free(en);
263 atomic_dec(&tbl->entries); 247 atomic_dec(&tbl->entries);
264 } 248 }
265 write_unlock(&tbl->lock); 249 write_unlock(&svc->sched_lock);
266 } 250 }
267 tbl->rover = j; 251 tbl->rover = j;
268} 252}
@@ -281,17 +265,16 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
281 */ 265 */
282static void ip_vs_lblc_check_expire(unsigned long data) 266static void ip_vs_lblc_check_expire(unsigned long data)
283{ 267{
284 struct ip_vs_lblc_table *tbl; 268 struct ip_vs_service *svc = (struct ip_vs_service *) data;
269 struct ip_vs_lblc_table *tbl = svc->sched_data;
285 unsigned long now = jiffies; 270 unsigned long now = jiffies;
286 int goal; 271 int goal;
287 int i, j; 272 int i, j;
288 struct ip_vs_lblc_entry *en, *nxt; 273 struct ip_vs_lblc_entry *en, *nxt;
289 274
290 tbl = (struct ip_vs_lblc_table *)data;
291
292 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { 275 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
293 /* do full expiration check */ 276 /* do full expiration check */
294 ip_vs_lblc_full_check(tbl); 277 ip_vs_lblc_full_check(svc);
295 tbl->counter = 1; 278 tbl->counter = 1;
296 goto out; 279 goto out;
297 } 280 }
@@ -308,7 +291,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
308 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 291 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
309 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 292 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
310 293
311 write_lock(&tbl->lock); 294 write_lock(&svc->sched_lock);
312 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 295 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
313 if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) 296 if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
314 continue; 297 continue;
@@ -317,7 +300,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
317 atomic_dec(&tbl->entries); 300 atomic_dec(&tbl->entries);
318 goal--; 301 goal--;
319 } 302 }
320 write_unlock(&tbl->lock); 303 write_unlock(&svc->sched_lock);
321 if (goal <= 0) 304 if (goal <= 0)
322 break; 305 break;
323 } 306 }
@@ -336,15 +319,14 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
336 /* 319 /*
337 * Allocate the ip_vs_lblc_table for this service 320 * Allocate the ip_vs_lblc_table for this service
338 */ 321 */
339 tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); 322 tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
340 if (tbl == NULL) { 323 if (tbl == NULL) {
341 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); 324 IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
342 return -ENOMEM; 325 return -ENOMEM;
343 } 326 }
344 svc->sched_data = tbl; 327 svc->sched_data = tbl;
345 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " 328 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
346 "current service\n", 329 "current service\n", sizeof(*tbl));
347 sizeof(struct ip_vs_lblc_table));
348 330
349 /* 331 /*
350 * Initialize the hash buckets 332 * Initialize the hash buckets
@@ -352,7 +334,6 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
352 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { 334 for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
353 INIT_LIST_HEAD(&tbl->bucket[i]); 335 INIT_LIST_HEAD(&tbl->bucket[i]);
354 } 336 }
355 rwlock_init(&tbl->lock);
356 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; 337 tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
357 tbl->rover = 0; 338 tbl->rover = 0;
358 tbl->counter = 1; 339 tbl->counter = 1;
@@ -361,9 +342,8 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
361 * Hook periodic timer for garbage collection 342 * Hook periodic timer for garbage collection
362 */ 343 */
363 setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, 344 setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
364 (unsigned long)tbl); 345 (unsigned long)svc);
365 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; 346 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
366 add_timer(&tbl->periodic_timer);
367 347
368 return 0; 348 return 0;
369} 349}
@@ -380,22 +360,16 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
380 ip_vs_lblc_flush(tbl); 360 ip_vs_lblc_flush(tbl);
381 361
382 /* release the table itself */ 362 /* release the table itself */
383 kfree(svc->sched_data); 363 kfree(tbl);
384 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", 364 IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
385 sizeof(struct ip_vs_lblc_table)); 365 sizeof(*tbl));
386 366
387 return 0; 367 return 0;
388} 368}
389 369
390 370
391static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
392{
393 return 0;
394}
395
396
397static inline struct ip_vs_dest * 371static inline struct ip_vs_dest *
398__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) 372__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
399{ 373{
400 struct ip_vs_dest *dest, *least; 374 struct ip_vs_dest *dest, *least;
401 int loh, doh; 375 int loh, doh;
@@ -448,7 +422,7 @@ __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
448 422
449 IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " 423 IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
450 "activeconns %d refcnt %d weight %d overhead %d\n", 424 "activeconns %d refcnt %d weight %d overhead %d\n",
451 NIPQUAD(least->addr), ntohs(least->port), 425 NIPQUAD(least->addr.ip), ntohs(least->port),
452 atomic_read(&least->activeconns), 426 atomic_read(&least->activeconns),
453 atomic_read(&least->refcnt), 427 atomic_read(&least->refcnt),
454 atomic_read(&least->weight), loh); 428 atomic_read(&least->weight), loh);
@@ -484,47 +458,55 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
484static struct ip_vs_dest * 458static struct ip_vs_dest *
485ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) 459ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
486{ 460{
487 struct ip_vs_dest *dest; 461 struct ip_vs_lblc_table *tbl = svc->sched_data;
488 struct ip_vs_lblc_table *tbl;
489 struct ip_vs_lblc_entry *en;
490 struct iphdr *iph = ip_hdr(skb); 462 struct iphdr *iph = ip_hdr(skb);
463 struct ip_vs_dest *dest = NULL;
464 struct ip_vs_lblc_entry *en;
491 465
492 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); 466 IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
493 467
494 tbl = (struct ip_vs_lblc_table *)svc->sched_data; 468 /* First look in our cache */
469 read_lock(&svc->sched_lock);
495 en = ip_vs_lblc_get(tbl, iph->daddr); 470 en = ip_vs_lblc_get(tbl, iph->daddr);
496 if (en == NULL) { 471 if (en) {
497 dest = __ip_vs_wlc_schedule(svc, iph); 472 /* We only hold a read lock, but this is atomic */
498 if (dest == NULL) { 473 en->lastuse = jiffies;
499 IP_VS_DBG(1, "no destination available\n"); 474
500 return NULL; 475 /*
501 } 476 * If the destination is not available, i.e. it's in the trash,
502 en = ip_vs_lblc_new(iph->daddr, dest); 477 * we must ignore it, as it may be removed from under our feet,
503 if (en == NULL) { 478 * if someone drops our reference count. Our caller only makes
504 return NULL; 479 * sure that destinations, that are not in the trash, are not
505 } 480 * moved to the trash, while we are scheduling. But anyone can
506 ip_vs_lblc_hash(tbl, en); 481 * free up entries from the trash at any time.
507 } else { 482 */
508 dest = en->dest; 483
509 if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) 484 if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
510 || atomic_read(&dest->weight) <= 0 485 dest = en->dest;
511 || is_overloaded(dest, svc)) { 486 }
512 dest = __ip_vs_wlc_schedule(svc, iph); 487 read_unlock(&svc->sched_lock);
513 if (dest == NULL) { 488
514 IP_VS_DBG(1, "no destination available\n"); 489 /* If the destination has a weight and is not overloaded, use it */
515 return NULL; 490 if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
516 } 491 goto out;
517 atomic_dec(&en->dest->refcnt); 492
518 atomic_inc(&dest->refcnt); 493 /* No cache entry or it is invalid, time to schedule */
519 en->dest = dest; 494 dest = __ip_vs_lblc_schedule(svc, iph);
520 } 495 if (!dest) {
496 IP_VS_DBG(1, "no destination available\n");
497 return NULL;
521 } 498 }
522 en->lastuse = jiffies;
523 499
500 /* If we fail to create a cache entry, we'll just use the valid dest */
501 write_lock(&svc->sched_lock);
502 ip_vs_lblc_new(tbl, iph->daddr, dest);
503 write_unlock(&svc->sched_lock);
504
505out:
524 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " 506 IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
525 "--> server %u.%u.%u.%u:%d\n", 507 "--> server %u.%u.%u.%u:%d\n",
526 NIPQUAD(en->addr), 508 NIPQUAD(iph->daddr),
527 NIPQUAD(dest->addr), 509 NIPQUAD(dest->addr.ip),
528 ntohs(dest->port)); 510 ntohs(dest->port));
529 511
530 return dest; 512 return dest;
@@ -540,9 +522,11 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler =
540 .refcnt = ATOMIC_INIT(0), 522 .refcnt = ATOMIC_INIT(0),
541 .module = THIS_MODULE, 523 .module = THIS_MODULE,
542 .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), 524 .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
525#ifdef CONFIG_IP_VS_IPV6
526 .supports_ipv6 = 0,
527#endif
543 .init_service = ip_vs_lblc_init_svc, 528 .init_service = ip_vs_lblc_init_svc,
544 .done_service = ip_vs_lblc_done_svc, 529 .done_service = ip_vs_lblc_done_svc,
545 .update_service = ip_vs_lblc_update_svc,
546 .schedule = ip_vs_lblc_schedule, 530 .schedule = ip_vs_lblc_schedule,
547}; 531};
548 532
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index c234e73968a6..1f75ea83bcf8 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
106 return NULL; 106 return NULL;
107 } 107 }
108 108
109 e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC); 109 e = kmalloc(sizeof(*e), GFP_ATOMIC);
110 if (e == NULL) { 110 if (e == NULL) {
111 IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); 111 IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
112 return NULL; 112 return NULL;
@@ -116,11 +116,9 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
116 e->dest = dest; 116 e->dest = dest;
117 117
118 /* link it to the list */ 118 /* link it to the list */
119 write_lock(&set->lock);
120 e->next = set->list; 119 e->next = set->list;
121 set->list = e; 120 set->list = e;
122 atomic_inc(&set->size); 121 atomic_inc(&set->size);
123 write_unlock(&set->lock);
124 122
125 set->lastmod = jiffies; 123 set->lastmod = jiffies;
126 return e; 124 return e;
@@ -131,7 +129,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
131{ 129{
132 struct ip_vs_dest_list *e, **ep; 130 struct ip_vs_dest_list *e, **ep;
133 131
134 write_lock(&set->lock);
135 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { 132 for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
136 if (e->dest == dest) { 133 if (e->dest == dest) {
137 /* HIT */ 134 /* HIT */
@@ -144,7 +141,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
144 } 141 }
145 ep = &e->next; 142 ep = &e->next;
146 } 143 }
147 write_unlock(&set->lock);
148} 144}
149 145
150static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) 146static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
@@ -174,7 +170,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
174 if (set == NULL) 170 if (set == NULL)
175 return NULL; 171 return NULL;
176 172
177 read_lock(&set->lock);
178 /* select the first destination server, whose weight > 0 */ 173 /* select the first destination server, whose weight > 0 */
179 for (e=set->list; e!=NULL; e=e->next) { 174 for (e=set->list; e!=NULL; e=e->next) {
180 least = e->dest; 175 least = e->dest;
@@ -188,7 +183,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
188 goto nextstage; 183 goto nextstage;
189 } 184 }
190 } 185 }
191 read_unlock(&set->lock);
192 return NULL; 186 return NULL;
193 187
194 /* find the destination with the weighted least load */ 188 /* find the destination with the weighted least load */
@@ -207,11 +201,10 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
207 loh = doh; 201 loh = doh;
208 } 202 }
209 } 203 }
210 read_unlock(&set->lock);
211 204
212 IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " 205 IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
213 "activeconns %d refcnt %d weight %d overhead %d\n", 206 "activeconns %d refcnt %d weight %d overhead %d\n",
214 NIPQUAD(least->addr), ntohs(least->port), 207 NIPQUAD(least->addr.ip), ntohs(least->port),
215 atomic_read(&least->activeconns), 208 atomic_read(&least->activeconns),
216 atomic_read(&least->refcnt), 209 atomic_read(&least->refcnt),
217 atomic_read(&least->weight), loh); 210 atomic_read(&least->weight), loh);
@@ -229,7 +222,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
229 if (set == NULL) 222 if (set == NULL)
230 return NULL; 223 return NULL;
231 224
232 read_lock(&set->lock);
233 /* select the first destination server, whose weight > 0 */ 225 /* select the first destination server, whose weight > 0 */
234 for (e=set->list; e!=NULL; e=e->next) { 226 for (e=set->list; e!=NULL; e=e->next) {
235 most = e->dest; 227 most = e->dest;
@@ -239,7 +231,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
239 goto nextstage; 231 goto nextstage;
240 } 232 }
241 } 233 }
242 read_unlock(&set->lock);
243 return NULL; 234 return NULL;
244 235
245 /* find the destination with the weighted most load */ 236 /* find the destination with the weighted most load */
@@ -256,11 +247,10 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
256 moh = doh; 247 moh = doh;
257 } 248 }
258 } 249 }
259 read_unlock(&set->lock);
260 250
261 IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " 251 IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
262 "activeconns %d refcnt %d weight %d overhead %d\n", 252 "activeconns %d refcnt %d weight %d overhead %d\n",
263 NIPQUAD(most->addr), ntohs(most->port), 253 NIPQUAD(most->addr.ip), ntohs(most->port),
264 atomic_read(&most->activeconns), 254 atomic_read(&most->activeconns),
265 atomic_read(&most->refcnt), 255 atomic_read(&most->refcnt),
266 atomic_read(&most->weight), moh); 256 atomic_read(&most->weight), moh);
@@ -284,7 +274,6 @@ struct ip_vs_lblcr_entry {
284 * IPVS lblcr hash table 274 * IPVS lblcr hash table
285 */ 275 */
286struct ip_vs_lblcr_table { 276struct ip_vs_lblcr_table {
287 rwlock_t lock; /* lock for this table */
288 struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ 277 struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
289 atomic_t entries; /* number of entries */ 278 atomic_t entries; /* number of entries */
290 int max_size; /* maximum size of entries */ 279 int max_size; /* maximum size of entries */
@@ -311,32 +300,6 @@ static ctl_table vs_vars_table[] = {
311 300
312static struct ctl_table_header * sysctl_header; 301static struct ctl_table_header * sysctl_header;
313 302
314/*
315 * new/free a ip_vs_lblcr_entry, which is a mapping of a destination
316 * IP address to a server.
317 */
318static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr)
319{
320 struct ip_vs_lblcr_entry *en;
321
322 en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
323 if (en == NULL) {
324 IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
325 return NULL;
326 }
327
328 INIT_LIST_HEAD(&en->list);
329 en->addr = daddr;
330
331 /* initilize its dest set */
332 atomic_set(&(en->set.size), 0);
333 en->set.list = NULL;
334 rwlock_init(&en->set.lock);
335
336 return en;
337}
338
339
340static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) 303static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
341{ 304{
342 list_del(&en->list); 305 list_del(&en->list);
@@ -358,55 +321,68 @@ static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
358 * Hash an entry in the ip_vs_lblcr_table. 321 * Hash an entry in the ip_vs_lblcr_table.
359 * returns bool success. 322 * returns bool success.
360 */ 323 */
361static int 324static void
362ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) 325ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
363{ 326{
364 unsigned hash; 327 unsigned hash = ip_vs_lblcr_hashkey(en->addr);
365 328
366 if (!list_empty(&en->list)) {
367 IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
368 "called from %p\n", __builtin_return_address(0));
369 return 0;
370 }
371
372 /*
373 * Hash by destination IP address
374 */
375 hash = ip_vs_lblcr_hashkey(en->addr);
376
377 write_lock(&tbl->lock);
378 list_add(&en->list, &tbl->bucket[hash]); 329 list_add(&en->list, &tbl->bucket[hash]);
379 atomic_inc(&tbl->entries); 330 atomic_inc(&tbl->entries);
380 write_unlock(&tbl->lock);
381
382 return 1;
383} 331}
384 332
385 333
386/* 334/*
387 * Get ip_vs_lblcr_entry associated with supplied parameters. 335 * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
336 * read lock.
388 */ 337 */
389static inline struct ip_vs_lblcr_entry * 338static inline struct ip_vs_lblcr_entry *
390ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) 339ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
391{ 340{
392 unsigned hash; 341 unsigned hash = ip_vs_lblcr_hashkey(addr);
393 struct ip_vs_lblcr_entry *en; 342 struct ip_vs_lblcr_entry *en;
394 343
395 hash = ip_vs_lblcr_hashkey(addr); 344 list_for_each_entry(en, &tbl->bucket[hash], list)
345 if (en->addr == addr)
346 return en;
396 347
397 read_lock(&tbl->lock); 348 return NULL;
349}
398 350
399 list_for_each_entry(en, &tbl->bucket[hash], list) { 351
400 if (en->addr == addr) { 352/*
401 /* HIT */ 353 * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
402 read_unlock(&tbl->lock); 354 * IP address to a server. Called under write lock.
403 return en; 355 */
356static inline struct ip_vs_lblcr_entry *
357ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr,
358 struct ip_vs_dest *dest)
359{
360 struct ip_vs_lblcr_entry *en;
361
362 en = ip_vs_lblcr_get(tbl, daddr);
363 if (!en) {
364 en = kmalloc(sizeof(*en), GFP_ATOMIC);
365 if (!en) {
366 IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
367 return NULL;
404 } 368 }
369
370 en->addr = daddr;
371 en->lastuse = jiffies;
372
373 /* initilize its dest set */
374 atomic_set(&(en->set.size), 0);
375 en->set.list = NULL;
376 rwlock_init(&en->set.lock);
377
378 ip_vs_lblcr_hash(tbl, en);
405 } 379 }
406 380
407 read_unlock(&tbl->lock); 381 write_lock(&en->set.lock);
382 ip_vs_dest_set_insert(&en->set, dest);
383 write_unlock(&en->set.lock);
408 384
409 return NULL; 385 return en;
410} 386}
411 387
412 388
@@ -418,19 +394,18 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
418 int i; 394 int i;
419 struct ip_vs_lblcr_entry *en, *nxt; 395 struct ip_vs_lblcr_entry *en, *nxt;
420 396
397 /* No locking required, only called during cleanup. */
421 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { 398 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
422 write_lock(&tbl->lock);
423 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { 399 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
424 ip_vs_lblcr_free(en); 400 ip_vs_lblcr_free(en);
425 atomic_dec(&tbl->entries);
426 } 401 }
427 write_unlock(&tbl->lock);
428 } 402 }
429} 403}
430 404
431 405
432static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) 406static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
433{ 407{
408 struct ip_vs_lblcr_table *tbl = svc->sched_data;
434 unsigned long now = jiffies; 409 unsigned long now = jiffies;
435 int i, j; 410 int i, j;
436 struct ip_vs_lblcr_entry *en, *nxt; 411 struct ip_vs_lblcr_entry *en, *nxt;
@@ -438,7 +413,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
438 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { 413 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
439 j = (j + 1) & IP_VS_LBLCR_TAB_MASK; 414 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
440 415
441 write_lock(&tbl->lock); 416 write_lock(&svc->sched_lock);
442 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 417 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
443 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, 418 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
444 now)) 419 now))
@@ -447,7 +422,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
447 ip_vs_lblcr_free(en); 422 ip_vs_lblcr_free(en);
448 atomic_dec(&tbl->entries); 423 atomic_dec(&tbl->entries);
449 } 424 }
450 write_unlock(&tbl->lock); 425 write_unlock(&svc->sched_lock);
451 } 426 }
452 tbl->rover = j; 427 tbl->rover = j;
453} 428}
@@ -466,17 +441,16 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
466 */ 441 */
467static void ip_vs_lblcr_check_expire(unsigned long data) 442static void ip_vs_lblcr_check_expire(unsigned long data)
468{ 443{
469 struct ip_vs_lblcr_table *tbl; 444 struct ip_vs_service *svc = (struct ip_vs_service *) data;
445 struct ip_vs_lblcr_table *tbl = svc->sched_data;
470 unsigned long now = jiffies; 446 unsigned long now = jiffies;
471 int goal; 447 int goal;
472 int i, j; 448 int i, j;
473 struct ip_vs_lblcr_entry *en, *nxt; 449 struct ip_vs_lblcr_entry *en, *nxt;
474 450
475 tbl = (struct ip_vs_lblcr_table *)data;
476
477 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { 451 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
478 /* do full expiration check */ 452 /* do full expiration check */
479 ip_vs_lblcr_full_check(tbl); 453 ip_vs_lblcr_full_check(svc);
480 tbl->counter = 1; 454 tbl->counter = 1;
481 goto out; 455 goto out;
482 } 456 }
@@ -493,7 +467,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
493 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { 467 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
494 j = (j + 1) & IP_VS_LBLCR_TAB_MASK; 468 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
495 469
496 write_lock(&tbl->lock); 470 write_lock(&svc->sched_lock);
497 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 471 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
498 if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) 472 if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
499 continue; 473 continue;
@@ -502,7 +476,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
502 atomic_dec(&tbl->entries); 476 atomic_dec(&tbl->entries);
503 goal--; 477 goal--;
504 } 478 }
505 write_unlock(&tbl->lock); 479 write_unlock(&svc->sched_lock);
506 if (goal <= 0) 480 if (goal <= 0)
507 break; 481 break;
508 } 482 }
@@ -520,15 +494,14 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
520 /* 494 /*
521 * Allocate the ip_vs_lblcr_table for this service 495 * Allocate the ip_vs_lblcr_table for this service
522 */ 496 */
523 tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); 497 tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
524 if (tbl == NULL) { 498 if (tbl == NULL) {
525 IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); 499 IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
526 return -ENOMEM; 500 return -ENOMEM;
527 } 501 }
528 svc->sched_data = tbl; 502 svc->sched_data = tbl;
529 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " 503 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
530 "current service\n", 504 "current service\n", sizeof(*tbl));
531 sizeof(struct ip_vs_lblcr_table));
532 505
533 /* 506 /*
534 * Initialize the hash buckets 507 * Initialize the hash buckets
@@ -536,7 +509,6 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
536 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { 509 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
537 INIT_LIST_HEAD(&tbl->bucket[i]); 510 INIT_LIST_HEAD(&tbl->bucket[i]);
538 } 511 }
539 rwlock_init(&tbl->lock);
540 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; 512 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
541 tbl->rover = 0; 513 tbl->rover = 0;
542 tbl->counter = 1; 514 tbl->counter = 1;
@@ -545,9 +517,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
545 * Hook periodic timer for garbage collection 517 * Hook periodic timer for garbage collection
546 */ 518 */
547 setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, 519 setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
548 (unsigned long)tbl); 520 (unsigned long)svc);
549 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; 521 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
550 add_timer(&tbl->periodic_timer);
551 522
552 return 0; 523 return 0;
553} 524}
@@ -564,22 +535,16 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
564 ip_vs_lblcr_flush(tbl); 535 ip_vs_lblcr_flush(tbl);
565 536
566 /* release the table itself */ 537 /* release the table itself */
567 kfree(svc->sched_data); 538 kfree(tbl);
568 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", 539 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
569 sizeof(struct ip_vs_lblcr_table)); 540 sizeof(*tbl));
570 541
571 return 0; 542 return 0;
572} 543}
573 544
574 545
575static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
576{
577 return 0;
578}
579
580
581static inline struct ip_vs_dest * 546static inline struct ip_vs_dest *
582__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) 547__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
583{ 548{
584 struct ip_vs_dest *dest, *least; 549 struct ip_vs_dest *dest, *least;
585 int loh, doh; 550 int loh, doh;
@@ -633,7 +598,7 @@ __ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
633 598
634 IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " 599 IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
635 "activeconns %d refcnt %d weight %d overhead %d\n", 600 "activeconns %d refcnt %d weight %d overhead %d\n",
636 NIPQUAD(least->addr), ntohs(least->port), 601 NIPQUAD(least->addr.ip), ntohs(least->port),
637 atomic_read(&least->activeconns), 602 atomic_read(&least->activeconns),
638 atomic_read(&least->refcnt), 603 atomic_read(&least->refcnt),
639 atomic_read(&least->weight), loh); 604 atomic_read(&least->weight), loh);
@@ -669,51 +634,79 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
669static struct ip_vs_dest * 634static struct ip_vs_dest *
670ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) 635ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
671{ 636{
672 struct ip_vs_dest *dest; 637 struct ip_vs_lblcr_table *tbl = svc->sched_data;
673 struct ip_vs_lblcr_table *tbl;
674 struct ip_vs_lblcr_entry *en;
675 struct iphdr *iph = ip_hdr(skb); 638 struct iphdr *iph = ip_hdr(skb);
639 struct ip_vs_dest *dest = NULL;
640 struct ip_vs_lblcr_entry *en;
676 641
677 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); 642 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
678 643
679 tbl = (struct ip_vs_lblcr_table *)svc->sched_data; 644 /* First look in our cache */
645 read_lock(&svc->sched_lock);
680 en = ip_vs_lblcr_get(tbl, iph->daddr); 646 en = ip_vs_lblcr_get(tbl, iph->daddr);
681 if (en == NULL) { 647 if (en) {
682 dest = __ip_vs_wlc_schedule(svc, iph); 648 /* We only hold a read lock, but this is atomic */
683 if (dest == NULL) { 649 en->lastuse = jiffies;
684 IP_VS_DBG(1, "no destination available\n"); 650
685 return NULL; 651 /* Get the least loaded destination */
686 } 652 read_lock(&en->set.lock);
687 en = ip_vs_lblcr_new(iph->daddr);
688 if (en == NULL) {
689 return NULL;
690 }
691 ip_vs_dest_set_insert(&en->set, dest);
692 ip_vs_lblcr_hash(tbl, en);
693 } else {
694 dest = ip_vs_dest_set_min(&en->set); 653 dest = ip_vs_dest_set_min(&en->set);
695 if (!dest || is_overloaded(dest, svc)) { 654 read_unlock(&en->set.lock);
696 dest = __ip_vs_wlc_schedule(svc, iph); 655
697 if (dest == NULL) { 656 /* More than one destination + enough time passed by, cleanup */
698 IP_VS_DBG(1, "no destination available\n");
699 return NULL;
700 }
701 ip_vs_dest_set_insert(&en->set, dest);
702 }
703 if (atomic_read(&en->set.size) > 1 && 657 if (atomic_read(&en->set.size) > 1 &&
704 jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { 658 time_after(jiffies, en->set.lastmod +
659 sysctl_ip_vs_lblcr_expiration)) {
705 struct ip_vs_dest *m; 660 struct ip_vs_dest *m;
661
662 write_lock(&en->set.lock);
706 m = ip_vs_dest_set_max(&en->set); 663 m = ip_vs_dest_set_max(&en->set);
707 if (m) 664 if (m)
708 ip_vs_dest_set_erase(&en->set, m); 665 ip_vs_dest_set_erase(&en->set, m);
666 write_unlock(&en->set.lock);
709 } 667 }
668
669 /* If the destination is not overloaded, use it */
670 if (dest && !is_overloaded(dest, svc)) {
671 read_unlock(&svc->sched_lock);
672 goto out;
673 }
674
675 /* The cache entry is invalid, time to schedule */
676 dest = __ip_vs_lblcr_schedule(svc, iph);
677 if (!dest) {
678 IP_VS_DBG(1, "no destination available\n");
679 read_unlock(&svc->sched_lock);
680 return NULL;
681 }
682
683 /* Update our cache entry */
684 write_lock(&en->set.lock);
685 ip_vs_dest_set_insert(&en->set, dest);
686 write_unlock(&en->set.lock);
687 }
688 read_unlock(&svc->sched_lock);
689
690 if (dest)
691 goto out;
692
693 /* No cache entry, time to schedule */
694 dest = __ip_vs_lblcr_schedule(svc, iph);
695 if (!dest) {
696 IP_VS_DBG(1, "no destination available\n");
697 return NULL;
710 } 698 }
711 en->lastuse = jiffies;
712 699
700 /* If we fail to create a cache entry, we'll just use the valid dest */
701 write_lock(&svc->sched_lock);
702 ip_vs_lblcr_new(tbl, iph->daddr, dest);
703 write_unlock(&svc->sched_lock);
704
705out:
713 IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " 706 IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
714 "--> server %u.%u.%u.%u:%d\n", 707 "--> server %u.%u.%u.%u:%d\n",
715 NIPQUAD(en->addr), 708 NIPQUAD(iph->daddr),
716 NIPQUAD(dest->addr), 709 NIPQUAD(dest->addr.ip),
717 ntohs(dest->port)); 710 ntohs(dest->port));
718 711
719 return dest; 712 return dest;
@@ -729,9 +722,11 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
729 .refcnt = ATOMIC_INIT(0), 722 .refcnt = ATOMIC_INIT(0),
730 .module = THIS_MODULE, 723 .module = THIS_MODULE,
731 .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), 724 .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
725#ifdef CONFIG_IP_VS_IPV6
726 .supports_ipv6 = 0,
727#endif
732 .init_service = ip_vs_lblcr_init_svc, 728 .init_service = ip_vs_lblcr_init_svc,
733 .done_service = ip_vs_lblcr_done_svc, 729 .done_service = ip_vs_lblcr_done_svc,
734 .update_service = ip_vs_lblcr_update_svc,
735 .schedule = ip_vs_lblcr_schedule, 730 .schedule = ip_vs_lblcr_schedule,
736}; 731};
737 732
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
index ebcdbf75ac65..b69f808ac461 100644
--- a/net/ipv4/ipvs/ip_vs_lc.c
+++ b/net/ipv4/ipvs/ip_vs_lc.c
@@ -20,24 +20,6 @@
20#include <net/ip_vs.h> 20#include <net/ip_vs.h>
21 21
22 22
23static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
24{
25 return 0;
26}
27
28
29static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
30{
31 return 0;
32}
33
34
35static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
36{
37 return 0;
38}
39
40
41static inline unsigned int 23static inline unsigned int
42ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) 24ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
43{ 25{
@@ -85,10 +67,10 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
85 } 67 }
86 68
87 if (least) 69 if (least)
88 IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n", 70 IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n",
89 NIPQUAD(least->addr), ntohs(least->port), 71 IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
90 atomic_read(&least->activeconns), 72 atomic_read(&least->activeconns),
91 atomic_read(&least->inactconns)); 73 atomic_read(&least->inactconns));
92 74
93 return least; 75 return least;
94} 76}
@@ -99,9 +81,9 @@ static struct ip_vs_scheduler ip_vs_lc_scheduler = {
99 .refcnt = ATOMIC_INIT(0), 81 .refcnt = ATOMIC_INIT(0),
100 .module = THIS_MODULE, 82 .module = THIS_MODULE,
101 .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), 83 .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
102 .init_service = ip_vs_lc_init_svc, 84#ifdef CONFIG_IP_VS_IPV6
103 .done_service = ip_vs_lc_done_svc, 85 .supports_ipv6 = 1,
104 .update_service = ip_vs_lc_update_svc, 86#endif
105 .schedule = ip_vs_lc_schedule, 87 .schedule = ip_vs_lc_schedule,
106}; 88};
107 89
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
index 92f3a6770031..9a2d8033f08f 100644
--- a/net/ipv4/ipvs/ip_vs_nq.c
+++ b/net/ipv4/ipvs/ip_vs_nq.c
@@ -37,27 +37,6 @@
37#include <net/ip_vs.h> 37#include <net/ip_vs.h>
38 38
39 39
40static int
41ip_vs_nq_init_svc(struct ip_vs_service *svc)
42{
43 return 0;
44}
45
46
47static int
48ip_vs_nq_done_svc(struct ip_vs_service *svc)
49{
50 return 0;
51}
52
53
54static int
55ip_vs_nq_update_svc(struct ip_vs_service *svc)
56{
57 return 0;
58}
59
60
61static inline unsigned int 40static inline unsigned int
62ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) 41ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
63{ 42{
@@ -120,12 +99,12 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
120 return NULL; 99 return NULL;
121 100
122 out: 101 out:
123 IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u " 102 IP_VS_DBG_BUF(6, "NQ: server %s:%u "
124 "activeconns %d refcnt %d weight %d overhead %d\n", 103 "activeconns %d refcnt %d weight %d overhead %d\n",
125 NIPQUAD(least->addr), ntohs(least->port), 104 IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
126 atomic_read(&least->activeconns), 105 atomic_read(&least->activeconns),
127 atomic_read(&least->refcnt), 106 atomic_read(&least->refcnt),
128 atomic_read(&least->weight), loh); 107 atomic_read(&least->weight), loh);
129 108
130 return least; 109 return least;
131} 110}
@@ -137,9 +116,9 @@ static struct ip_vs_scheduler ip_vs_nq_scheduler =
137 .refcnt = ATOMIC_INIT(0), 116 .refcnt = ATOMIC_INIT(0),
138 .module = THIS_MODULE, 117 .module = THIS_MODULE,
139 .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), 118 .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
140 .init_service = ip_vs_nq_init_svc, 119#ifdef CONFIG_IP_VS_IPV6
141 .done_service = ip_vs_nq_done_svc, 120 .supports_ipv6 = 1,
142 .update_service = ip_vs_nq_update_svc, 121#endif
143 .schedule = ip_vs_nq_schedule, 122 .schedule = ip_vs_nq_schedule,
144}; 123};
145 124
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
index 6099a88fc200..0791f9e08feb 100644
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -151,11 +151,11 @@ const char * ip_vs_state_name(__u16 proto, int state)
151} 151}
152 152
153 153
154void 154static void
155ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, 155ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
156 const struct sk_buff *skb, 156 const struct sk_buff *skb,
157 int offset, 157 int offset,
158 const char *msg) 158 const char *msg)
159{ 159{
160 char buf[128]; 160 char buf[128];
161 struct iphdr _iph, *ih; 161 struct iphdr _iph, *ih;
@@ -189,6 +189,61 @@ ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
189 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); 189 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
190} 190}
191 191
192#ifdef CONFIG_IP_VS_IPV6
193static void
194ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
195 const struct sk_buff *skb,
196 int offset,
197 const char *msg)
198{
199 char buf[192];
200 struct ipv6hdr _iph, *ih;
201
202 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
203 if (ih == NULL)
204 sprintf(buf, "%s TRUNCATED", pp->name);
205 else if (ih->nexthdr == IPPROTO_FRAGMENT)
206 sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag",
207 pp->name, NIP6(ih->saddr),
208 NIP6(ih->daddr));
209 else {
210 __be16 _ports[2], *pptr;
211
212 pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
213 sizeof(_ports), _ports);
214 if (pptr == NULL)
215 sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT,
216 pp->name,
217 NIP6(ih->saddr),
218 NIP6(ih->daddr));
219 else
220 sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u",
221 pp->name,
222 NIP6(ih->saddr),
223 ntohs(pptr[0]),
224 NIP6(ih->daddr),
225 ntohs(pptr[1]));
226 }
227
228 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
229}
230#endif
231
232
233void
234ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
235 const struct sk_buff *skb,
236 int offset,
237 const char *msg)
238{
239#ifdef CONFIG_IP_VS_IPV6
240 if (skb->protocol == htons(ETH_P_IPV6))
241 ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
242 else
243#endif
244 ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
245}
246
192 247
193int __init ip_vs_protocol_init(void) 248int __init ip_vs_protocol_init(void)
194{ 249{
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
deleted file mode 100644
index 73e0ea87c1f5..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ /dev/null
@@ -1,178 +0,0 @@
1/*
2 * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
5 * Wensong Zhang <wensong@linuxvirtualserver.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation;
10 *
11 */
12
13#include <linux/in.h>
14#include <linux/ip.h>
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
39
40
41static struct ip_vs_conn *
42ah_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 htons(PORT_ISAKMP),
54 iph->daddr,
55 htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 htons(PORT_ISAKMP),
60 iph->saddr,
61 htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 htons(PORT_ISAKMP),
91 iph->daddr,
92 htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 htons(PORT_ISAKMP),
97 iph->saddr,
98 htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115ah_conn_schedule(struct sk_buff *skb,
116 struct ip_vs_protocol *pp,
117 int *verdict, struct ip_vs_conn **cpp)
118{
119 /*
120 * AH is only related traffic. Pass the packet to IP stack.
121 */
122 *verdict = NF_ACCEPT;
123 return 0;
124}
125
126
127static void
128ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
129 int offset, const char *msg)
130{
131 char buf[256];
132 struct iphdr _iph, *ih;
133
134 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
135 if (ih == NULL)
136 sprintf(buf, "%s TRUNCATED", pp->name);
137 else
138 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
139 pp->name, NIPQUAD(ih->saddr),
140 NIPQUAD(ih->daddr));
141
142 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
143}
144
145
146static void ah_init(struct ip_vs_protocol *pp)
147{
148 /* nothing to do now */
149}
150
151
152static void ah_exit(struct ip_vs_protocol *pp)
153{
154 /* nothing to do now */
155}
156
157
158struct ip_vs_protocol ip_vs_protocol_ah = {
159 .name = "AH",
160 .protocol = IPPROTO_AH,
161 .num_states = 1,
162 .dont_defrag = 1,
163 .init = ah_init,
164 .exit = ah_exit,
165 .conn_schedule = ah_conn_schedule,
166 .conn_in_get = ah_conn_in_get,
167 .conn_out_get = ah_conn_out_get,
168 .snat_handler = NULL,
169 .dnat_handler = NULL,
170 .csum_check = NULL,
171 .state_transition = NULL,
172 .register_app = NULL,
173 .unregister_app = NULL,
174 .app_conn_bind = NULL,
175 .debug_packet = ah_debug_packet,
176 .timeout_change = NULL, /* ISAKMP */
177 .set_state_timeout = NULL,
178};
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah_esp.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
new file mode 100644
index 000000000000..80ab0c8e5b4a
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
@@ -0,0 +1,235 @@
1/*
2 * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
5 * Wensong Zhang <wensong@linuxvirtualserver.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation;
10 *
11 */
12
13#include <linux/in.h>
14#include <linux/ip.h>
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
39
40
41static struct ip_vs_conn *
42ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
43 const struct ip_vs_iphdr *iph, unsigned int proto_off,
44 int inverse)
45{
46 struct ip_vs_conn *cp;
47
48 if (likely(!inverse)) {
49 cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
50 &iph->saddr,
51 htons(PORT_ISAKMP),
52 &iph->daddr,
53 htons(PORT_ISAKMP));
54 } else {
55 cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
56 &iph->daddr,
57 htons(PORT_ISAKMP),
58 &iph->saddr,
59 htons(PORT_ISAKMP));
60 }
61
62 if (!cp) {
63 /*
64 * We are not sure if the packet is from our
65 * service, so our conn_schedule hook should return NF_ACCEPT
66 */
67 IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
68 "%s%s %s->%s\n",
69 inverse ? "ICMP+" : "",
70 pp->name,
71 IP_VS_DBG_ADDR(af, &iph->saddr),
72 IP_VS_DBG_ADDR(af, &iph->daddr));
73 }
74
75 return cp;
76}
77
78
79static struct ip_vs_conn *
80ah_esp_conn_out_get(int af, const struct sk_buff *skb,
81 struct ip_vs_protocol *pp,
82 const struct ip_vs_iphdr *iph,
83 unsigned int proto_off,
84 int inverse)
85{
86 struct ip_vs_conn *cp;
87
88 if (likely(!inverse)) {
89 cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
90 &iph->saddr,
91 htons(PORT_ISAKMP),
92 &iph->daddr,
93 htons(PORT_ISAKMP));
94 } else {
95 cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
96 &iph->daddr,
97 htons(PORT_ISAKMP),
98 &iph->saddr,
99 htons(PORT_ISAKMP));
100 }
101
102 if (!cp) {
103 IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
104 "%s%s %s->%s\n",
105 inverse ? "ICMP+" : "",
106 pp->name,
107 IP_VS_DBG_ADDR(af, &iph->saddr),
108 IP_VS_DBG_ADDR(af, &iph->daddr));
109 }
110
111 return cp;
112}
113
114
115static int
116ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
117 int *verdict, struct ip_vs_conn **cpp)
118{
119 /*
120 * AH/ESP is only related traffic. Pass the packet to IP stack.
121 */
122 *verdict = NF_ACCEPT;
123 return 0;
124}
125
126
127static void
128ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
129 int offset, const char *msg)
130{
131 char buf[256];
132 struct iphdr _iph, *ih;
133
134 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
135 if (ih == NULL)
136 sprintf(buf, "%s TRUNCATED", pp->name);
137 else
138 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
139 pp->name, NIPQUAD(ih->saddr),
140 NIPQUAD(ih->daddr));
141
142 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
143}
144
145#ifdef CONFIG_IP_VS_IPV6
146static void
147ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
148 int offset, const char *msg)
149{
150 char buf[256];
151 struct ipv6hdr _iph, *ih;
152
153 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
154 if (ih == NULL)
155 sprintf(buf, "%s TRUNCATED", pp->name);
156 else
157 sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT,
158 pp->name, NIP6(ih->saddr),
159 NIP6(ih->daddr));
160
161 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
162}
163#endif
164
165static void
166ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
167 int offset, const char *msg)
168{
169#ifdef CONFIG_IP_VS_IPV6
170 if (skb->protocol == htons(ETH_P_IPV6))
171 ah_esp_debug_packet_v6(pp, skb, offset, msg);
172 else
173#endif
174 ah_esp_debug_packet_v4(pp, skb, offset, msg);
175}
176
177
178static void ah_esp_init(struct ip_vs_protocol *pp)
179{
180 /* nothing to do now */
181}
182
183
184static void ah_esp_exit(struct ip_vs_protocol *pp)
185{
186 /* nothing to do now */
187}
188
189
190#ifdef CONFIG_IP_VS_PROTO_AH
191struct ip_vs_protocol ip_vs_protocol_ah = {
192 .name = "AH",
193 .protocol = IPPROTO_AH,
194 .num_states = 1,
195 .dont_defrag = 1,
196 .init = ah_esp_init,
197 .exit = ah_esp_exit,
198 .conn_schedule = ah_esp_conn_schedule,
199 .conn_in_get = ah_esp_conn_in_get,
200 .conn_out_get = ah_esp_conn_out_get,
201 .snat_handler = NULL,
202 .dnat_handler = NULL,
203 .csum_check = NULL,
204 .state_transition = NULL,
205 .register_app = NULL,
206 .unregister_app = NULL,
207 .app_conn_bind = NULL,
208 .debug_packet = ah_esp_debug_packet,
209 .timeout_change = NULL, /* ISAKMP */
210 .set_state_timeout = NULL,
211};
212#endif
213
214#ifdef CONFIG_IP_VS_PROTO_ESP
215struct ip_vs_protocol ip_vs_protocol_esp = {
216 .name = "ESP",
217 .protocol = IPPROTO_ESP,
218 .num_states = 1,
219 .dont_defrag = 1,
220 .init = ah_esp_init,
221 .exit = ah_esp_exit,
222 .conn_schedule = ah_esp_conn_schedule,
223 .conn_in_get = ah_esp_conn_in_get,
224 .conn_out_get = ah_esp_conn_out_get,
225 .snat_handler = NULL,
226 .dnat_handler = NULL,
227 .csum_check = NULL,
228 .state_transition = NULL,
229 .register_app = NULL,
230 .unregister_app = NULL,
231 .app_conn_bind = NULL,
232 .debug_packet = ah_esp_debug_packet,
233 .timeout_change = NULL, /* ISAKMP */
234};
235#endif
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
deleted file mode 100644
index 21d70c8ffa54..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
5 * Wensong Zhang <wensong@linuxvirtualserver.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation;
10 *
11 */
12
13#include <linux/in.h>
14#include <linux/ip.h>
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
39
40
41static struct ip_vs_conn *
42esp_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 htons(PORT_ISAKMP),
54 iph->daddr,
55 htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 htons(PORT_ISAKMP),
60 iph->saddr,
61 htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 htons(PORT_ISAKMP),
91 iph->daddr,
92 htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 htons(PORT_ISAKMP),
97 iph->saddr,
98 htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
116 int *verdict, struct ip_vs_conn **cpp)
117{
118 /*
119 * ESP is only related traffic. Pass the packet to IP stack.
120 */
121 *verdict = NF_ACCEPT;
122 return 0;
123}
124
125
126static void
127esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
128 int offset, const char *msg)
129{
130 char buf[256];
131 struct iphdr _iph, *ih;
132
133 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
134 if (ih == NULL)
135 sprintf(buf, "%s TRUNCATED", pp->name);
136 else
137 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
138 pp->name, NIPQUAD(ih->saddr),
139 NIPQUAD(ih->daddr));
140
141 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
142}
143
144
145static void esp_init(struct ip_vs_protocol *pp)
146{
147 /* nothing to do now */
148}
149
150
151static void esp_exit(struct ip_vs_protocol *pp)
152{
153 /* nothing to do now */
154}
155
156
157struct ip_vs_protocol ip_vs_protocol_esp = {
158 .name = "ESP",
159 .protocol = IPPROTO_ESP,
160 .num_states = 1,
161 .dont_defrag = 1,
162 .init = esp_init,
163 .exit = esp_exit,
164 .conn_schedule = esp_conn_schedule,
165 .conn_in_get = esp_conn_in_get,
166 .conn_out_get = esp_conn_out_get,
167 .snat_handler = NULL,
168 .dnat_handler = NULL,
169 .csum_check = NULL,
170 .state_transition = NULL,
171 .register_app = NULL,
172 .unregister_app = NULL,
173 .app_conn_bind = NULL,
174 .debug_packet = esp_debug_packet,
175 .timeout_change = NULL, /* ISAKMP */
176};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index d0ea467986a0..dd4566ea2bff 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -18,6 +18,7 @@
18#include <linux/tcp.h> /* for tcphdr */ 18#include <linux/tcp.h> /* for tcphdr */
19#include <net/ip.h> 19#include <net/ip.h>
20#include <net/tcp.h> /* for csum_tcpudp_magic */ 20#include <net/tcp.h> /* for csum_tcpudp_magic */
21#include <net/ip6_checksum.h>
21#include <linux/netfilter.h> 22#include <linux/netfilter.h>
22#include <linux/netfilter_ipv4.h> 23#include <linux/netfilter_ipv4.h>
23 24
@@ -25,8 +26,9 @@
25 26
26 27
27static struct ip_vs_conn * 28static struct ip_vs_conn *
28tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, 29tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
29 const struct iphdr *iph, unsigned int proto_off, int inverse) 30 const struct ip_vs_iphdr *iph, unsigned int proto_off,
31 int inverse)
30{ 32{
31 __be16 _ports[2], *pptr; 33 __be16 _ports[2], *pptr;
32 34
@@ -35,19 +37,20 @@ tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
35 return NULL; 37 return NULL;
36 38
37 if (likely(!inverse)) { 39 if (likely(!inverse)) {
38 return ip_vs_conn_in_get(iph->protocol, 40 return ip_vs_conn_in_get(af, iph->protocol,
39 iph->saddr, pptr[0], 41 &iph->saddr, pptr[0],
40 iph->daddr, pptr[1]); 42 &iph->daddr, pptr[1]);
41 } else { 43 } else {
42 return ip_vs_conn_in_get(iph->protocol, 44 return ip_vs_conn_in_get(af, iph->protocol,
43 iph->daddr, pptr[1], 45 &iph->daddr, pptr[1],
44 iph->saddr, pptr[0]); 46 &iph->saddr, pptr[0]);
45 } 47 }
46} 48}
47 49
48static struct ip_vs_conn * 50static struct ip_vs_conn *
49tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, 51tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
50 const struct iphdr *iph, unsigned int proto_off, int inverse) 52 const struct ip_vs_iphdr *iph, unsigned int proto_off,
53 int inverse)
51{ 54{
52 __be16 _ports[2], *pptr; 55 __be16 _ports[2], *pptr;
53 56
@@ -56,34 +59,36 @@ tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
56 return NULL; 59 return NULL;
57 60
58 if (likely(!inverse)) { 61 if (likely(!inverse)) {
59 return ip_vs_conn_out_get(iph->protocol, 62 return ip_vs_conn_out_get(af, iph->protocol,
60 iph->saddr, pptr[0], 63 &iph->saddr, pptr[0],
61 iph->daddr, pptr[1]); 64 &iph->daddr, pptr[1]);
62 } else { 65 } else {
63 return ip_vs_conn_out_get(iph->protocol, 66 return ip_vs_conn_out_get(af, iph->protocol,
64 iph->daddr, pptr[1], 67 &iph->daddr, pptr[1],
65 iph->saddr, pptr[0]); 68 &iph->saddr, pptr[0]);
66 } 69 }
67} 70}
68 71
69 72
70static int 73static int
71tcp_conn_schedule(struct sk_buff *skb, 74tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
72 struct ip_vs_protocol *pp,
73 int *verdict, struct ip_vs_conn **cpp) 75 int *verdict, struct ip_vs_conn **cpp)
74{ 76{
75 struct ip_vs_service *svc; 77 struct ip_vs_service *svc;
76 struct tcphdr _tcph, *th; 78 struct tcphdr _tcph, *th;
79 struct ip_vs_iphdr iph;
77 80
78 th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); 81 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
82
83 th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
79 if (th == NULL) { 84 if (th == NULL) {
80 *verdict = NF_DROP; 85 *verdict = NF_DROP;
81 return 0; 86 return 0;
82 } 87 }
83 88
84 if (th->syn && 89 if (th->syn &&
85 (svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, 90 (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
86 ip_hdr(skb)->daddr, th->dest))) { 91 th->dest))) {
87 if (ip_vs_todrop()) { 92 if (ip_vs_todrop()) {
88 /* 93 /*
89 * It seems that we are very loaded. 94 * It seems that we are very loaded.
@@ -110,22 +115,62 @@ tcp_conn_schedule(struct sk_buff *skb,
110 115
111 116
112static inline void 117static inline void
113tcp_fast_csum_update(struct tcphdr *tcph, __be32 oldip, __be32 newip, 118tcp_fast_csum_update(int af, struct tcphdr *tcph,
119 const union nf_inet_addr *oldip,
120 const union nf_inet_addr *newip,
114 __be16 oldport, __be16 newport) 121 __be16 oldport, __be16 newport)
115{ 122{
123#ifdef CONFIG_IP_VS_IPV6
124 if (af == AF_INET6)
125 tcph->check =
126 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
127 ip_vs_check_diff2(oldport, newport,
128 ~csum_unfold(tcph->check))));
129 else
130#endif
116 tcph->check = 131 tcph->check =
117 csum_fold(ip_vs_check_diff4(oldip, newip, 132 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
118 ip_vs_check_diff2(oldport, newport, 133 ip_vs_check_diff2(oldport, newport,
119 ~csum_unfold(tcph->check)))); 134 ~csum_unfold(tcph->check))));
120} 135}
121 136
122 137
138static inline void
139tcp_partial_csum_update(int af, struct tcphdr *tcph,
140 const union nf_inet_addr *oldip,
141 const union nf_inet_addr *newip,
142 __be16 oldlen, __be16 newlen)
143{
144#ifdef CONFIG_IP_VS_IPV6
145 if (af == AF_INET6)
146 tcph->check =
147 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
148 ip_vs_check_diff2(oldlen, newlen,
149 ~csum_unfold(tcph->check))));
150 else
151#endif
152 tcph->check =
153 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
154 ip_vs_check_diff2(oldlen, newlen,
155 ~csum_unfold(tcph->check))));
156}
157
158
123static int 159static int
124tcp_snat_handler(struct sk_buff *skb, 160tcp_snat_handler(struct sk_buff *skb,
125 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) 161 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
126{ 162{
127 struct tcphdr *tcph; 163 struct tcphdr *tcph;
128 const unsigned int tcphoff = ip_hdrlen(skb); 164 unsigned int tcphoff;
165 int oldlen;
166
167#ifdef CONFIG_IP_VS_IPV6
168 if (cp->af == AF_INET6)
169 tcphoff = sizeof(struct ipv6hdr);
170 else
171#endif
172 tcphoff = ip_hdrlen(skb);
173 oldlen = skb->len - tcphoff;
129 174
130 /* csum_check requires unshared skb */ 175 /* csum_check requires unshared skb */
131 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) 176 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
@@ -133,7 +178,7 @@ tcp_snat_handler(struct sk_buff *skb,
133 178
134 if (unlikely(cp->app != NULL)) { 179 if (unlikely(cp->app != NULL)) {
135 /* Some checks before mangling */ 180 /* Some checks before mangling */
136 if (pp->csum_check && !pp->csum_check(skb, pp)) 181 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
137 return 0; 182 return 0;
138 183
139 /* Call application helper if needed */ 184 /* Call application helper if needed */
@@ -141,13 +186,17 @@ tcp_snat_handler(struct sk_buff *skb,
141 return 0; 186 return 0;
142 } 187 }
143 188
144 tcph = (void *)ip_hdr(skb) + tcphoff; 189 tcph = (void *)skb_network_header(skb) + tcphoff;
145 tcph->source = cp->vport; 190 tcph->source = cp->vport;
146 191
147 /* Adjust TCP checksums */ 192 /* Adjust TCP checksums */
148 if (!cp->app) { 193 if (skb->ip_summed == CHECKSUM_PARTIAL) {
194 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
195 htonl(oldlen),
196 htonl(skb->len - tcphoff));
197 } else if (!cp->app) {
149 /* Only port and addr are changed, do fast csum update */ 198 /* Only port and addr are changed, do fast csum update */
150 tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr, 199 tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
151 cp->dport, cp->vport); 200 cp->dport, cp->vport);
152 if (skb->ip_summed == CHECKSUM_COMPLETE) 201 if (skb->ip_summed == CHECKSUM_COMPLETE)
153 skb->ip_summed = CHECKSUM_NONE; 202 skb->ip_summed = CHECKSUM_NONE;
@@ -155,9 +204,20 @@ tcp_snat_handler(struct sk_buff *skb,
155 /* full checksum calculation */ 204 /* full checksum calculation */
156 tcph->check = 0; 205 tcph->check = 0;
157 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); 206 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
158 tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, 207#ifdef CONFIG_IP_VS_IPV6
159 skb->len - tcphoff, 208 if (cp->af == AF_INET6)
160 cp->protocol, skb->csum); 209 tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
210 &cp->caddr.in6,
211 skb->len - tcphoff,
212 cp->protocol, skb->csum);
213 else
214#endif
215 tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
216 cp->caddr.ip,
217 skb->len - tcphoff,
218 cp->protocol,
219 skb->csum);
220
161 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", 221 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
162 pp->name, tcph->check, 222 pp->name, tcph->check,
163 (char*)&(tcph->check) - (char*)tcph); 223 (char*)&(tcph->check) - (char*)tcph);
@@ -171,7 +231,16 @@ tcp_dnat_handler(struct sk_buff *skb,
171 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) 231 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
172{ 232{
173 struct tcphdr *tcph; 233 struct tcphdr *tcph;
174 const unsigned int tcphoff = ip_hdrlen(skb); 234 unsigned int tcphoff;
235 int oldlen;
236
237#ifdef CONFIG_IP_VS_IPV6
238 if (cp->af == AF_INET6)
239 tcphoff = sizeof(struct ipv6hdr);
240 else
241#endif
242 tcphoff = ip_hdrlen(skb);
243 oldlen = skb->len - tcphoff;
175 244
176 /* csum_check requires unshared skb */ 245 /* csum_check requires unshared skb */
177 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) 246 if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
@@ -179,7 +248,7 @@ tcp_dnat_handler(struct sk_buff *skb,
179 248
180 if (unlikely(cp->app != NULL)) { 249 if (unlikely(cp->app != NULL)) {
181 /* Some checks before mangling */ 250 /* Some checks before mangling */
182 if (pp->csum_check && !pp->csum_check(skb, pp)) 251 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
183 return 0; 252 return 0;
184 253
185 /* 254 /*
@@ -190,15 +259,19 @@ tcp_dnat_handler(struct sk_buff *skb,
190 return 0; 259 return 0;
191 } 260 }
192 261
193 tcph = (void *)ip_hdr(skb) + tcphoff; 262 tcph = (void *)skb_network_header(skb) + tcphoff;
194 tcph->dest = cp->dport; 263 tcph->dest = cp->dport;
195 264
196 /* 265 /*
197 * Adjust TCP checksums 266 * Adjust TCP checksums
198 */ 267 */
199 if (!cp->app) { 268 if (skb->ip_summed == CHECKSUM_PARTIAL) {
269 tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
270 htonl(oldlen),
271 htonl(skb->len - tcphoff));
272 } else if (!cp->app) {
200 /* Only port and addr are changed, do fast csum update */ 273 /* Only port and addr are changed, do fast csum update */
201 tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr, 274 tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
202 cp->vport, cp->dport); 275 cp->vport, cp->dport);
203 if (skb->ip_summed == CHECKSUM_COMPLETE) 276 if (skb->ip_summed == CHECKSUM_COMPLETE)
204 skb->ip_summed = CHECKSUM_NONE; 277 skb->ip_summed = CHECKSUM_NONE;
@@ -206,9 +279,19 @@ tcp_dnat_handler(struct sk_buff *skb,
206 /* full checksum calculation */ 279 /* full checksum calculation */
207 tcph->check = 0; 280 tcph->check = 0;
208 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); 281 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
209 tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, 282#ifdef CONFIG_IP_VS_IPV6
210 skb->len - tcphoff, 283 if (cp->af == AF_INET6)
211 cp->protocol, skb->csum); 284 tcph->check = csum_ipv6_magic(&cp->caddr.in6,
285 &cp->daddr.in6,
286 skb->len - tcphoff,
287 cp->protocol, skb->csum);
288 else
289#endif
290 tcph->check = csum_tcpudp_magic(cp->caddr.ip,
291 cp->daddr.ip,
292 skb->len - tcphoff,
293 cp->protocol,
294 skb->csum);
212 skb->ip_summed = CHECKSUM_UNNECESSARY; 295 skb->ip_summed = CHECKSUM_UNNECESSARY;
213 } 296 }
214 return 1; 297 return 1;
@@ -216,21 +299,43 @@ tcp_dnat_handler(struct sk_buff *skb,
216 299
217 300
218static int 301static int
219tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) 302tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
220{ 303{
221 const unsigned int tcphoff = ip_hdrlen(skb); 304 unsigned int tcphoff;
305
306#ifdef CONFIG_IP_VS_IPV6
307 if (af == AF_INET6)
308 tcphoff = sizeof(struct ipv6hdr);
309 else
310#endif
311 tcphoff = ip_hdrlen(skb);
222 312
223 switch (skb->ip_summed) { 313 switch (skb->ip_summed) {
224 case CHECKSUM_NONE: 314 case CHECKSUM_NONE:
225 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); 315 skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
226 case CHECKSUM_COMPLETE: 316 case CHECKSUM_COMPLETE:
227 if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, 317#ifdef CONFIG_IP_VS_IPV6
228 skb->len - tcphoff, 318 if (af == AF_INET6) {
229 ip_hdr(skb)->protocol, skb->csum)) { 319 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
230 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 320 &ipv6_hdr(skb)->daddr,
231 "Failed checksum for"); 321 skb->len - tcphoff,
232 return 0; 322 ipv6_hdr(skb)->nexthdr,
233 } 323 skb->csum)) {
324 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
325 "Failed checksum for");
326 return 0;
327 }
328 } else
329#endif
330 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
331 ip_hdr(skb)->daddr,
332 skb->len - tcphoff,
333 ip_hdr(skb)->protocol,
334 skb->csum)) {
335 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
336 "Failed checksum for");
337 return 0;
338 }
234 break; 339 break;
235 default: 340 default:
236 /* No need to checksum. */ 341 /* No need to checksum. */
@@ -419,19 +524,23 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
419 if (new_state != cp->state) { 524 if (new_state != cp->state) {
420 struct ip_vs_dest *dest = cp->dest; 525 struct ip_vs_dest *dest = cp->dest;
421 526
422 IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" 527 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
423 "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n", 528 "%s:%d state: %s->%s conn->refcnt:%d\n",
424 pp->name, 529 pp->name,
425 (state_off==TCP_DIR_OUTPUT)?"output ":"input ", 530 ((state_off == TCP_DIR_OUTPUT) ?
426 th->syn? 'S' : '.', 531 "output " : "input "),
427 th->fin? 'F' : '.', 532 th->syn ? 'S' : '.',
428 th->ack? 'A' : '.', 533 th->fin ? 'F' : '.',
429 th->rst? 'R' : '.', 534 th->ack ? 'A' : '.',
430 NIPQUAD(cp->daddr), ntohs(cp->dport), 535 th->rst ? 'R' : '.',
431 NIPQUAD(cp->caddr), ntohs(cp->cport), 536 IP_VS_DBG_ADDR(cp->af, &cp->daddr),
432 tcp_state_name(cp->state), 537 ntohs(cp->dport),
433 tcp_state_name(new_state), 538 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
434 atomic_read(&cp->refcnt)); 539 ntohs(cp->cport),
540 tcp_state_name(cp->state),
541 tcp_state_name(new_state),
542 atomic_read(&cp->refcnt));
543
435 if (dest) { 544 if (dest) {
436 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && 545 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
437 (new_state != IP_VS_TCP_S_ESTABLISHED)) { 546 (new_state != IP_VS_TCP_S_ESTABLISHED)) {
@@ -461,7 +570,13 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
461{ 570{
462 struct tcphdr _tcph, *th; 571 struct tcphdr _tcph, *th;
463 572
464 th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); 573#ifdef CONFIG_IP_VS_IPV6
574 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
575#else
576 int ihl = ip_hdrlen(skb);
577#endif
578
579 th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
465 if (th == NULL) 580 if (th == NULL)
466 return 0; 581 return 0;
467 582
@@ -546,12 +661,15 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
546 break; 661 break;
547 spin_unlock(&tcp_app_lock); 662 spin_unlock(&tcp_app_lock);
548 663
549 IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" 664 IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
550 "%u.%u.%u.%u:%u to app %s on port %u\n", 665 "%s:%u to app %s on port %u\n",
551 __func__, 666 __func__,
552 NIPQUAD(cp->caddr), ntohs(cp->cport), 667 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
553 NIPQUAD(cp->vaddr), ntohs(cp->vport), 668 ntohs(cp->cport),
554 inc->name, ntohs(inc->port)); 669 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
670 ntohs(cp->vport),
671 inc->name, ntohs(inc->port));
672
555 cp->app = inc; 673 cp->app = inc;
556 if (inc->init_conn) 674 if (inc->init_conn)
557 result = inc->init_conn(inc, cp); 675 result = inc->init_conn(inc, cp);
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index c6be5d56823f..6eb6039d6343 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -22,10 +22,12 @@
22 22
23#include <net/ip_vs.h> 23#include <net/ip_vs.h>
24#include <net/ip.h> 24#include <net/ip.h>
25#include <net/ip6_checksum.h>
25 26
26static struct ip_vs_conn * 27static struct ip_vs_conn *
27udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, 28udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
28 const struct iphdr *iph, unsigned int proto_off, int inverse) 29 const struct ip_vs_iphdr *iph, unsigned int proto_off,
30 int inverse)
29{ 31{
30 struct ip_vs_conn *cp; 32 struct ip_vs_conn *cp;
31 __be16 _ports[2], *pptr; 33 __be16 _ports[2], *pptr;
@@ -35,13 +37,13 @@ udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
35 return NULL; 37 return NULL;
36 38
37 if (likely(!inverse)) { 39 if (likely(!inverse)) {
38 cp = ip_vs_conn_in_get(iph->protocol, 40 cp = ip_vs_conn_in_get(af, iph->protocol,
39 iph->saddr, pptr[0], 41 &iph->saddr, pptr[0],
40 iph->daddr, pptr[1]); 42 &iph->daddr, pptr[1]);
41 } else { 43 } else {
42 cp = ip_vs_conn_in_get(iph->protocol, 44 cp = ip_vs_conn_in_get(af, iph->protocol,
43 iph->daddr, pptr[1], 45 &iph->daddr, pptr[1],
44 iph->saddr, pptr[0]); 46 &iph->saddr, pptr[0]);
45 } 47 }
46 48
47 return cp; 49 return cp;
@@ -49,25 +51,25 @@ udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
49 51
50 52
51static struct ip_vs_conn * 53static struct ip_vs_conn *
52udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, 54udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
53 const struct iphdr *iph, unsigned int proto_off, int inverse) 55 const struct ip_vs_iphdr *iph, unsigned int proto_off,
56 int inverse)
54{ 57{
55 struct ip_vs_conn *cp; 58 struct ip_vs_conn *cp;
56 __be16 _ports[2], *pptr; 59 __be16 _ports[2], *pptr;
57 60
58 pptr = skb_header_pointer(skb, ip_hdrlen(skb), 61 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
59 sizeof(_ports), _ports);
60 if (pptr == NULL) 62 if (pptr == NULL)
61 return NULL; 63 return NULL;
62 64
63 if (likely(!inverse)) { 65 if (likely(!inverse)) {
64 cp = ip_vs_conn_out_get(iph->protocol, 66 cp = ip_vs_conn_out_get(af, iph->protocol,
65 iph->saddr, pptr[0], 67 &iph->saddr, pptr[0],
66 iph->daddr, pptr[1]); 68 &iph->daddr, pptr[1]);
67 } else { 69 } else {
68 cp = ip_vs_conn_out_get(iph->protocol, 70 cp = ip_vs_conn_out_get(af, iph->protocol,
69 iph->daddr, pptr[1], 71 &iph->daddr, pptr[1],
70 iph->saddr, pptr[0]); 72 &iph->saddr, pptr[0]);
71 } 73 }
72 74
73 return cp; 75 return cp;
@@ -75,21 +77,24 @@ udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
75 77
76 78
77static int 79static int
78udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, 80udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
79 int *verdict, struct ip_vs_conn **cpp) 81 int *verdict, struct ip_vs_conn **cpp)
80{ 82{
81 struct ip_vs_service *svc; 83 struct ip_vs_service *svc;
82 struct udphdr _udph, *uh; 84 struct udphdr _udph, *uh;
85 struct ip_vs_iphdr iph;
86
87 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
83 88
84 uh = skb_header_pointer(skb, ip_hdrlen(skb), 89 uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
85 sizeof(_udph), &_udph);
86 if (uh == NULL) { 90 if (uh == NULL) {
87 *verdict = NF_DROP; 91 *verdict = NF_DROP;
88 return 0; 92 return 0;
89 } 93 }
90 94
91 if ((svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, 95 svc = ip_vs_service_get(af, skb->mark, iph.protocol,
92 ip_hdr(skb)->daddr, uh->dest))) { 96 &iph.daddr, uh->dest);
97 if (svc) {
93 if (ip_vs_todrop()) { 98 if (ip_vs_todrop()) {
94 /* 99 /*
95 * It seems that we are very loaded. 100 * It seems that we are very loaded.
@@ -116,23 +121,63 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
116 121
117 122
118static inline void 123static inline void
119udp_fast_csum_update(struct udphdr *uhdr, __be32 oldip, __be32 newip, 124udp_fast_csum_update(int af, struct udphdr *uhdr,
125 const union nf_inet_addr *oldip,
126 const union nf_inet_addr *newip,
120 __be16 oldport, __be16 newport) 127 __be16 oldport, __be16 newport)
121{ 128{
122 uhdr->check = 129#ifdef CONFIG_IP_VS_IPV6
123 csum_fold(ip_vs_check_diff4(oldip, newip, 130 if (af == AF_INET6)
124 ip_vs_check_diff2(oldport, newport, 131 uhdr->check =
125 ~csum_unfold(uhdr->check)))); 132 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
133 ip_vs_check_diff2(oldport, newport,
134 ~csum_unfold(uhdr->check))));
135 else
136#endif
137 uhdr->check =
138 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
139 ip_vs_check_diff2(oldport, newport,
140 ~csum_unfold(uhdr->check))));
126 if (!uhdr->check) 141 if (!uhdr->check)
127 uhdr->check = CSUM_MANGLED_0; 142 uhdr->check = CSUM_MANGLED_0;
128} 143}
129 144
145static inline void
146udp_partial_csum_update(int af, struct udphdr *uhdr,
147 const union nf_inet_addr *oldip,
148 const union nf_inet_addr *newip,
149 __be16 oldlen, __be16 newlen)
150{
151#ifdef CONFIG_IP_VS_IPV6
152 if (af == AF_INET6)
153 uhdr->check =
154 csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
155 ip_vs_check_diff2(oldlen, newlen,
156 ~csum_unfold(uhdr->check))));
157 else
158#endif
159 uhdr->check =
160 csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
161 ip_vs_check_diff2(oldlen, newlen,
162 ~csum_unfold(uhdr->check))));
163}
164
165
130static int 166static int
131udp_snat_handler(struct sk_buff *skb, 167udp_snat_handler(struct sk_buff *skb,
132 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) 168 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
133{ 169{
134 struct udphdr *udph; 170 struct udphdr *udph;
135 const unsigned int udphoff = ip_hdrlen(skb); 171 unsigned int udphoff;
172 int oldlen;
173
174#ifdef CONFIG_IP_VS_IPV6
175 if (cp->af == AF_INET6)
176 udphoff = sizeof(struct ipv6hdr);
177 else
178#endif
179 udphoff = ip_hdrlen(skb);
180 oldlen = skb->len - udphoff;
136 181
137 /* csum_check requires unshared skb */ 182 /* csum_check requires unshared skb */
138 if (!skb_make_writable(skb, udphoff+sizeof(*udph))) 183 if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
@@ -140,7 +185,7 @@ udp_snat_handler(struct sk_buff *skb,
140 185
141 if (unlikely(cp->app != NULL)) { 186 if (unlikely(cp->app != NULL)) {
142 /* Some checks before mangling */ 187 /* Some checks before mangling */
143 if (pp->csum_check && !pp->csum_check(skb, pp)) 188 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
144 return 0; 189 return 0;
145 190
146 /* 191 /*
@@ -150,15 +195,19 @@ udp_snat_handler(struct sk_buff *skb,
150 return 0; 195 return 0;
151 } 196 }
152 197
153 udph = (void *)ip_hdr(skb) + udphoff; 198 udph = (void *)skb_network_header(skb) + udphoff;
154 udph->source = cp->vport; 199 udph->source = cp->vport;
155 200
156 /* 201 /*
157 * Adjust UDP checksums 202 * Adjust UDP checksums
158 */ 203 */
159 if (!cp->app && (udph->check != 0)) { 204 if (skb->ip_summed == CHECKSUM_PARTIAL) {
205 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
206 htonl(oldlen),
207 htonl(skb->len - udphoff));
208 } else if (!cp->app && (udph->check != 0)) {
160 /* Only port and addr are changed, do fast csum update */ 209 /* Only port and addr are changed, do fast csum update */
161 udp_fast_csum_update(udph, cp->daddr, cp->vaddr, 210 udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
162 cp->dport, cp->vport); 211 cp->dport, cp->vport);
163 if (skb->ip_summed == CHECKSUM_COMPLETE) 212 if (skb->ip_summed == CHECKSUM_COMPLETE)
164 skb->ip_summed = CHECKSUM_NONE; 213 skb->ip_summed = CHECKSUM_NONE;
@@ -166,9 +215,19 @@ udp_snat_handler(struct sk_buff *skb,
166 /* full checksum calculation */ 215 /* full checksum calculation */
167 udph->check = 0; 216 udph->check = 0;
168 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); 217 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
169 udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, 218#ifdef CONFIG_IP_VS_IPV6
170 skb->len - udphoff, 219 if (cp->af == AF_INET6)
171 cp->protocol, skb->csum); 220 udph->check = csum_ipv6_magic(&cp->vaddr.in6,
221 &cp->caddr.in6,
222 skb->len - udphoff,
223 cp->protocol, skb->csum);
224 else
225#endif
226 udph->check = csum_tcpudp_magic(cp->vaddr.ip,
227 cp->caddr.ip,
228 skb->len - udphoff,
229 cp->protocol,
230 skb->csum);
172 if (udph->check == 0) 231 if (udph->check == 0)
173 udph->check = CSUM_MANGLED_0; 232 udph->check = CSUM_MANGLED_0;
174 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", 233 IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
@@ -184,7 +243,16 @@ udp_dnat_handler(struct sk_buff *skb,
184 struct ip_vs_protocol *pp, struct ip_vs_conn *cp) 243 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
185{ 244{
186 struct udphdr *udph; 245 struct udphdr *udph;
187 unsigned int udphoff = ip_hdrlen(skb); 246 unsigned int udphoff;
247 int oldlen;
248
249#ifdef CONFIG_IP_VS_IPV6
250 if (cp->af == AF_INET6)
251 udphoff = sizeof(struct ipv6hdr);
252 else
253#endif
254 udphoff = ip_hdrlen(skb);
255 oldlen = skb->len - udphoff;
188 256
189 /* csum_check requires unshared skb */ 257 /* csum_check requires unshared skb */
190 if (!skb_make_writable(skb, udphoff+sizeof(*udph))) 258 if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
@@ -192,7 +260,7 @@ udp_dnat_handler(struct sk_buff *skb,
192 260
193 if (unlikely(cp->app != NULL)) { 261 if (unlikely(cp->app != NULL)) {
194 /* Some checks before mangling */ 262 /* Some checks before mangling */
195 if (pp->csum_check && !pp->csum_check(skb, pp)) 263 if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
196 return 0; 264 return 0;
197 265
198 /* 266 /*
@@ -203,15 +271,19 @@ udp_dnat_handler(struct sk_buff *skb,
203 return 0; 271 return 0;
204 } 272 }
205 273
206 udph = (void *)ip_hdr(skb) + udphoff; 274 udph = (void *)skb_network_header(skb) + udphoff;
207 udph->dest = cp->dport; 275 udph->dest = cp->dport;
208 276
209 /* 277 /*
210 * Adjust UDP checksums 278 * Adjust UDP checksums
211 */ 279 */
212 if (!cp->app && (udph->check != 0)) { 280 if (skb->ip_summed == CHECKSUM_PARTIAL) {
281 udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
282 htonl(oldlen),
283 htonl(skb->len - udphoff));
284 } else if (!cp->app && (udph->check != 0)) {
213 /* Only port and addr are changed, do fast csum update */ 285 /* Only port and addr are changed, do fast csum update */
214 udp_fast_csum_update(udph, cp->vaddr, cp->daddr, 286 udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
215 cp->vport, cp->dport); 287 cp->vport, cp->dport);
216 if (skb->ip_summed == CHECKSUM_COMPLETE) 288 if (skb->ip_summed == CHECKSUM_COMPLETE)
217 skb->ip_summed = CHECKSUM_NONE; 289 skb->ip_summed = CHECKSUM_NONE;
@@ -219,9 +291,19 @@ udp_dnat_handler(struct sk_buff *skb,
219 /* full checksum calculation */ 291 /* full checksum calculation */
220 udph->check = 0; 292 udph->check = 0;
221 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); 293 skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
222 udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, 294#ifdef CONFIG_IP_VS_IPV6
223 skb->len - udphoff, 295 if (cp->af == AF_INET6)
224 cp->protocol, skb->csum); 296 udph->check = csum_ipv6_magic(&cp->caddr.in6,
297 &cp->daddr.in6,
298 skb->len - udphoff,
299 cp->protocol, skb->csum);
300 else
301#endif
302 udph->check = csum_tcpudp_magic(cp->caddr.ip,
303 cp->daddr.ip,
304 skb->len - udphoff,
305 cp->protocol,
306 skb->csum);
225 if (udph->check == 0) 307 if (udph->check == 0)
226 udph->check = CSUM_MANGLED_0; 308 udph->check = CSUM_MANGLED_0;
227 skb->ip_summed = CHECKSUM_UNNECESSARY; 309 skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -231,10 +313,17 @@ udp_dnat_handler(struct sk_buff *skb,
231 313
232 314
233static int 315static int
234udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) 316udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
235{ 317{
236 struct udphdr _udph, *uh; 318 struct udphdr _udph, *uh;
237 const unsigned int udphoff = ip_hdrlen(skb); 319 unsigned int udphoff;
320
321#ifdef CONFIG_IP_VS_IPV6
322 if (af == AF_INET6)
323 udphoff = sizeof(struct ipv6hdr);
324 else
325#endif
326 udphoff = ip_hdrlen(skb);
238 327
239 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); 328 uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
240 if (uh == NULL) 329 if (uh == NULL)
@@ -246,15 +335,28 @@ udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
246 skb->csum = skb_checksum(skb, udphoff, 335 skb->csum = skb_checksum(skb, udphoff,
247 skb->len - udphoff, 0); 336 skb->len - udphoff, 0);
248 case CHECKSUM_COMPLETE: 337 case CHECKSUM_COMPLETE:
249 if (csum_tcpudp_magic(ip_hdr(skb)->saddr, 338#ifdef CONFIG_IP_VS_IPV6
250 ip_hdr(skb)->daddr, 339 if (af == AF_INET6) {
251 skb->len - udphoff, 340 if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
252 ip_hdr(skb)->protocol, 341 &ipv6_hdr(skb)->daddr,
253 skb->csum)) { 342 skb->len - udphoff,
254 IP_VS_DBG_RL_PKT(0, pp, skb, 0, 343 ipv6_hdr(skb)->nexthdr,
255 "Failed checksum for"); 344 skb->csum)) {
256 return 0; 345 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
257 } 346 "Failed checksum for");
347 return 0;
348 }
349 } else
350#endif
351 if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
352 ip_hdr(skb)->daddr,
353 skb->len - udphoff,
354 ip_hdr(skb)->protocol,
355 skb->csum)) {
356 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
357 "Failed checksum for");
358 return 0;
359 }
258 break; 360 break;
259 default: 361 default:
260 /* No need to checksum. */ 362 /* No need to checksum. */
@@ -340,12 +442,15 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
340 break; 442 break;
341 spin_unlock(&udp_app_lock); 443 spin_unlock(&udp_app_lock);
342 444
343 IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" 445 IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
344 "%u.%u.%u.%u:%u to app %s on port %u\n", 446 "%s:%u to app %s on port %u\n",
345 __func__, 447 __func__,
346 NIPQUAD(cp->caddr), ntohs(cp->cport), 448 IP_VS_DBG_ADDR(cp->af, &cp->caddr),
347 NIPQUAD(cp->vaddr), ntohs(cp->vport), 449 ntohs(cp->cport),
348 inc->name, ntohs(inc->port)); 450 IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
451 ntohs(cp->vport),
452 inc->name, ntohs(inc->port));
453
349 cp->app = inc; 454 cp->app = inc;
350 if (inc->init_conn) 455 if (inc->init_conn)
351 result = inc->init_conn(inc, cp); 456 result = inc->init_conn(inc, cp);
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
index 358110d17e59..a22195f68ac4 100644
--- a/net/ipv4/ipvs/ip_vs_rr.c
+++ b/net/ipv4/ipvs/ip_vs_rr.c
@@ -32,12 +32,6 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
32} 32}
33 33
34 34
35static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
36{
37 return 0;
38}
39
40
41static int ip_vs_rr_update_svc(struct ip_vs_service *svc) 35static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
42{ 36{
43 svc->sched_data = &svc->destinations; 37 svc->sched_data = &svc->destinations;
@@ -80,11 +74,11 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
80 out: 74 out:
81 svc->sched_data = q; 75 svc->sched_data = q;
82 write_unlock(&svc->sched_lock); 76 write_unlock(&svc->sched_lock);
83 IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " 77 IP_VS_DBG_BUF(6, "RR: server %s:%u "
84 "activeconns %d refcnt %d weight %d\n", 78 "activeconns %d refcnt %d weight %d\n",
85 NIPQUAD(dest->addr), ntohs(dest->port), 79 IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
86 atomic_read(&dest->activeconns), 80 atomic_read(&dest->activeconns),
87 atomic_read(&dest->refcnt), atomic_read(&dest->weight)); 81 atomic_read(&dest->refcnt), atomic_read(&dest->weight));
88 82
89 return dest; 83 return dest;
90} 84}
@@ -95,8 +89,10 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = {
95 .refcnt = ATOMIC_INIT(0), 89 .refcnt = ATOMIC_INIT(0),
96 .module = THIS_MODULE, 90 .module = THIS_MODULE,
97 .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), 91 .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
92#ifdef CONFIG_IP_VS_IPV6
93 .supports_ipv6 = 1,
94#endif
98 .init_service = ip_vs_rr_init_svc, 95 .init_service = ip_vs_rr_init_svc,
99 .done_service = ip_vs_rr_done_svc,
100 .update_service = ip_vs_rr_update_svc, 96 .update_service = ip_vs_rr_update_svc,
101 .schedule = ip_vs_rr_schedule, 97 .schedule = ip_vs_rr_schedule,
102}; 98};
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
index 77663d84cbd1..7d2f22f04b83 100644
--- a/net/ipv4/ipvs/ip_vs_sed.c
+++ b/net/ipv4/ipvs/ip_vs_sed.c
@@ -41,27 +41,6 @@
41#include <net/ip_vs.h> 41#include <net/ip_vs.h>
42 42
43 43
44static int
45ip_vs_sed_init_svc(struct ip_vs_service *svc)
46{
47 return 0;
48}
49
50
51static int
52ip_vs_sed_done_svc(struct ip_vs_service *svc)
53{
54 return 0;
55}
56
57
58static int
59ip_vs_sed_update_svc(struct ip_vs_service *svc)
60{
61 return 0;
62}
63
64
65static inline unsigned int 44static inline unsigned int
66ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) 45ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
67{ 46{
@@ -122,12 +101,12 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
122 } 101 }
123 } 102 }
124 103
125 IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " 104 IP_VS_DBG_BUF(6, "SED: server %s:%u "
126 "activeconns %d refcnt %d weight %d overhead %d\n", 105 "activeconns %d refcnt %d weight %d overhead %d\n",
127 NIPQUAD(least->addr), ntohs(least->port), 106 IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
128 atomic_read(&least->activeconns), 107 atomic_read(&least->activeconns),
129 atomic_read(&least->refcnt), 108 atomic_read(&least->refcnt),
130 atomic_read(&least->weight), loh); 109 atomic_read(&least->weight), loh);
131 110
132 return least; 111 return least;
133} 112}
@@ -139,9 +118,9 @@ static struct ip_vs_scheduler ip_vs_sed_scheduler =
139 .refcnt = ATOMIC_INIT(0), 118 .refcnt = ATOMIC_INIT(0),
140 .module = THIS_MODULE, 119 .module = THIS_MODULE,
141 .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), 120 .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
142 .init_service = ip_vs_sed_init_svc, 121#ifdef CONFIG_IP_VS_IPV6
143 .done_service = ip_vs_sed_done_svc, 122 .supports_ipv6 = 1,
144 .update_service = ip_vs_sed_update_svc, 123#endif
145 .schedule = ip_vs_sed_schedule, 124 .schedule = ip_vs_sed_schedule,
146}; 125};
147 126
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
index 7b979e228056..1d96de27fefd 100644
--- a/net/ipv4/ipvs/ip_vs_sh.c
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -215,7 +215,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
215 IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " 215 IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
216 "--> server %u.%u.%u.%u:%d\n", 216 "--> server %u.%u.%u.%u:%d\n",
217 NIPQUAD(iph->saddr), 217 NIPQUAD(iph->saddr),
218 NIPQUAD(dest->addr), 218 NIPQUAD(dest->addr.ip),
219 ntohs(dest->port)); 219 ntohs(dest->port));
220 220
221 return dest; 221 return dest;
@@ -231,6 +231,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler =
231 .refcnt = ATOMIC_INIT(0), 231 .refcnt = ATOMIC_INIT(0),
232 .module = THIS_MODULE, 232 .module = THIS_MODULE,
233 .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), 233 .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
234#ifdef CONFIG_IP_VS_IPV6
235 .supports_ipv6 = 0,
236#endif
234 .init_service = ip_vs_sh_init_svc, 237 .init_service = ip_vs_sh_init_svc,
235 .done_service = ip_vs_sh_done_svc, 238 .done_service = ip_vs_sh_done_svc,
236 .update_service = ip_vs_sh_update_svc, 239 .update_service = ip_vs_sh_update_svc,
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index a652da2c3200..28237a5f62e2 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -256,9 +256,9 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
256 s->cport = cp->cport; 256 s->cport = cp->cport;
257 s->vport = cp->vport; 257 s->vport = cp->vport;
258 s->dport = cp->dport; 258 s->dport = cp->dport;
259 s->caddr = cp->caddr; 259 s->caddr = cp->caddr.ip;
260 s->vaddr = cp->vaddr; 260 s->vaddr = cp->vaddr.ip;
261 s->daddr = cp->daddr; 261 s->daddr = cp->daddr.ip;
262 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); 262 s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
263 s->state = htons(cp->state); 263 s->state = htons(cp->state);
264 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 264 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
@@ -366,21 +366,28 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
366 } 366 }
367 367
368 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 368 if (!(flags & IP_VS_CONN_F_TEMPLATE))
369 cp = ip_vs_conn_in_get(s->protocol, 369 cp = ip_vs_conn_in_get(AF_INET, s->protocol,
370 s->caddr, s->cport, 370 (union nf_inet_addr *)&s->caddr,
371 s->vaddr, s->vport); 371 s->cport,
372 (union nf_inet_addr *)&s->vaddr,
373 s->vport);
372 else 374 else
373 cp = ip_vs_ct_in_get(s->protocol, 375 cp = ip_vs_ct_in_get(AF_INET, s->protocol,
374 s->caddr, s->cport, 376 (union nf_inet_addr *)&s->caddr,
375 s->vaddr, s->vport); 377 s->cport,
378 (union nf_inet_addr *)&s->vaddr,
379 s->vport);
376 if (!cp) { 380 if (!cp) {
377 /* 381 /*
378 * Find the appropriate destination for the connection. 382 * Find the appropriate destination for the connection.
379 * If it is not found the connection will remain unbound 383 * If it is not found the connection will remain unbound
380 * but still handled. 384 * but still handled.
381 */ 385 */
382 dest = ip_vs_find_dest(s->daddr, s->dport, 386 dest = ip_vs_find_dest(AF_INET,
383 s->vaddr, s->vport, 387 (union nf_inet_addr *)&s->daddr,
388 s->dport,
389 (union nf_inet_addr *)&s->vaddr,
390 s->vport,
384 s->protocol); 391 s->protocol);
385 /* Set the approprite ativity flag */ 392 /* Set the approprite ativity flag */
386 if (s->protocol == IPPROTO_TCP) { 393 if (s->protocol == IPPROTO_TCP) {
@@ -389,10 +396,13 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
389 else 396 else
390 flags &= ~IP_VS_CONN_F_INACTIVE; 397 flags &= ~IP_VS_CONN_F_INACTIVE;
391 } 398 }
392 cp = ip_vs_conn_new(s->protocol, 399 cp = ip_vs_conn_new(AF_INET, s->protocol,
393 s->caddr, s->cport, 400 (union nf_inet_addr *)&s->caddr,
394 s->vaddr, s->vport, 401 s->cport,
395 s->daddr, s->dport, 402 (union nf_inet_addr *)&s->vaddr,
403 s->vport,
404 (union nf_inet_addr *)&s->daddr,
405 s->dport,
396 flags, dest); 406 flags, dest);
397 if (dest) 407 if (dest)
398 atomic_dec(&dest->refcnt); 408 atomic_dec(&dest->refcnt);
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
index 9b0ef86bb1f7..8c596e712599 100644
--- a/net/ipv4/ipvs/ip_vs_wlc.c
+++ b/net/ipv4/ipvs/ip_vs_wlc.c
@@ -25,27 +25,6 @@
25#include <net/ip_vs.h> 25#include <net/ip_vs.h>
26 26
27 27
28static int
29ip_vs_wlc_init_svc(struct ip_vs_service *svc)
30{
31 return 0;
32}
33
34
35static int
36ip_vs_wlc_done_svc(struct ip_vs_service *svc)
37{
38 return 0;
39}
40
41
42static int
43ip_vs_wlc_update_svc(struct ip_vs_service *svc)
44{
45 return 0;
46}
47
48
49static inline unsigned int 28static inline unsigned int
50ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) 29ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
51{ 30{
@@ -110,12 +89,12 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
110 } 89 }
111 } 90 }
112 91
113 IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " 92 IP_VS_DBG_BUF(6, "WLC: server %s:%u "
114 "activeconns %d refcnt %d weight %d overhead %d\n", 93 "activeconns %d refcnt %d weight %d overhead %d\n",
115 NIPQUAD(least->addr), ntohs(least->port), 94 IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
116 atomic_read(&least->activeconns), 95 atomic_read(&least->activeconns),
117 atomic_read(&least->refcnt), 96 atomic_read(&least->refcnt),
118 atomic_read(&least->weight), loh); 97 atomic_read(&least->weight), loh);
119 98
120 return least; 99 return least;
121} 100}
@@ -127,9 +106,9 @@ static struct ip_vs_scheduler ip_vs_wlc_scheduler =
127 .refcnt = ATOMIC_INIT(0), 106 .refcnt = ATOMIC_INIT(0),
128 .module = THIS_MODULE, 107 .module = THIS_MODULE,
129 .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), 108 .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
130 .init_service = ip_vs_wlc_init_svc, 109#ifdef CONFIG_IP_VS_IPV6
131 .done_service = ip_vs_wlc_done_svc, 110 .supports_ipv6 = 1,
132 .update_service = ip_vs_wlc_update_svc, 111#endif
133 .schedule = ip_vs_wlc_schedule, 112 .schedule = ip_vs_wlc_schedule,
134}; 113};
135 114
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
index 0d86a79b87b5..7ea92fed50bf 100644
--- a/net/ipv4/ipvs/ip_vs_wrr.c
+++ b/net/ipv4/ipvs/ip_vs_wrr.c
@@ -195,12 +195,12 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
195 } 195 }
196 } 196 }
197 197
198 IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " 198 IP_VS_DBG_BUF(6, "WRR: server %s:%u "
199 "activeconns %d refcnt %d weight %d\n", 199 "activeconns %d refcnt %d weight %d\n",
200 NIPQUAD(dest->addr), ntohs(dest->port), 200 IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
201 atomic_read(&dest->activeconns), 201 atomic_read(&dest->activeconns),
202 atomic_read(&dest->refcnt), 202 atomic_read(&dest->refcnt),
203 atomic_read(&dest->weight)); 203 atomic_read(&dest->weight));
204 204
205 out: 205 out:
206 write_unlock(&svc->sched_lock); 206 write_unlock(&svc->sched_lock);
@@ -213,6 +213,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
213 .refcnt = ATOMIC_INIT(0), 213 .refcnt = ATOMIC_INIT(0),
214 .module = THIS_MODULE, 214 .module = THIS_MODULE,
215 .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), 215 .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
216#ifdef CONFIG_IP_VS_IPV6
217 .supports_ipv6 = 1,
218#endif
216 .init_service = ip_vs_wrr_init_svc, 219 .init_service = ip_vs_wrr_init_svc,
217 .done_service = ip_vs_wrr_done_svc, 220 .done_service = ip_vs_wrr_done_svc,
218 .update_service = ip_vs_wrr_update_svc, 221 .update_service = ip_vs_wrr_update_svc,
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index 9892d4aca42e..02ddc2b3ce2e 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -20,6 +20,9 @@
20#include <net/udp.h> 20#include <net/udp.h>
21#include <net/icmp.h> /* for icmp_send */ 21#include <net/icmp.h> /* for icmp_send */
22#include <net/route.h> /* for ip_route_output */ 22#include <net/route.h> /* for ip_route_output */
23#include <net/ipv6.h>
24#include <net/ip6_route.h>
25#include <linux/icmpv6.h>
23#include <linux/netfilter.h> 26#include <linux/netfilter.h>
24#include <linux/netfilter_ipv4.h> 27#include <linux/netfilter_ipv4.h>
25 28
@@ -47,7 +50,8 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
47 50
48 if (!dst) 51 if (!dst)
49 return NULL; 52 return NULL;
50 if ((dst->obsolete || rtos != dest->dst_rtos) && 53 if ((dst->obsolete
54 || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
51 dst->ops->check(dst, cookie) == NULL) { 55 dst->ops->check(dst, cookie) == NULL) {
52 dest->dst_cache = NULL; 56 dest->dst_cache = NULL;
53 dst_release(dst); 57 dst_release(dst);
@@ -71,7 +75,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
71 .oif = 0, 75 .oif = 0,
72 .nl_u = { 76 .nl_u = {
73 .ip4_u = { 77 .ip4_u = {
74 .daddr = dest->addr, 78 .daddr = dest->addr.ip,
75 .saddr = 0, 79 .saddr = 0,
76 .tos = rtos, } }, 80 .tos = rtos, } },
77 }; 81 };
@@ -80,12 +84,12 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
80 spin_unlock(&dest->dst_lock); 84 spin_unlock(&dest->dst_lock);
81 IP_VS_DBG_RL("ip_route_output error, " 85 IP_VS_DBG_RL("ip_route_output error, "
82 "dest: %u.%u.%u.%u\n", 86 "dest: %u.%u.%u.%u\n",
83 NIPQUAD(dest->addr)); 87 NIPQUAD(dest->addr.ip));
84 return NULL; 88 return NULL;
85 } 89 }
86 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); 90 __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
87 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", 91 IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
88 NIPQUAD(dest->addr), 92 NIPQUAD(dest->addr.ip),
89 atomic_read(&rt->u.dst.__refcnt), rtos); 93 atomic_read(&rt->u.dst.__refcnt), rtos);
90 } 94 }
91 spin_unlock(&dest->dst_lock); 95 spin_unlock(&dest->dst_lock);
@@ -94,14 +98,14 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
94 .oif = 0, 98 .oif = 0,
95 .nl_u = { 99 .nl_u = {
96 .ip4_u = { 100 .ip4_u = {
97 .daddr = cp->daddr, 101 .daddr = cp->daddr.ip,
98 .saddr = 0, 102 .saddr = 0,
99 .tos = rtos, } }, 103 .tos = rtos, } },
100 }; 104 };
101 105
102 if (ip_route_output_key(&init_net, &rt, &fl)) { 106 if (ip_route_output_key(&init_net, &rt, &fl)) {
103 IP_VS_DBG_RL("ip_route_output error, dest: " 107 IP_VS_DBG_RL("ip_route_output error, dest: "
104 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); 108 "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip));
105 return NULL; 109 return NULL;
106 } 110 }
107 } 111 }
@@ -109,6 +113,70 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
109 return rt; 113 return rt;
110} 114}
111 115
116#ifdef CONFIG_IP_VS_IPV6
117static struct rt6_info *
118__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
119{
120 struct rt6_info *rt; /* Route to the other host */
121 struct ip_vs_dest *dest = cp->dest;
122
123 if (dest) {
124 spin_lock(&dest->dst_lock);
125 rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
126 if (!rt) {
127 struct flowi fl = {
128 .oif = 0,
129 .nl_u = {
130 .ip6_u = {
131 .daddr = dest->addr.in6,
132 .saddr = {
133 .s6_addr32 =
134 { 0, 0, 0, 0 },
135 },
136 },
137 },
138 };
139
140 rt = (struct rt6_info *)ip6_route_output(&init_net,
141 NULL, &fl);
142 if (!rt) {
143 spin_unlock(&dest->dst_lock);
144 IP_VS_DBG_RL("ip6_route_output error, "
145 "dest: " NIP6_FMT "\n",
146 NIP6(dest->addr.in6));
147 return NULL;
148 }
149 __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst));
150 IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n",
151 NIP6(dest->addr.in6),
152 atomic_read(&rt->u.dst.__refcnt));
153 }
154 spin_unlock(&dest->dst_lock);
155 } else {
156 struct flowi fl = {
157 .oif = 0,
158 .nl_u = {
159 .ip6_u = {
160 .daddr = cp->daddr.in6,
161 .saddr = {
162 .s6_addr32 = { 0, 0, 0, 0 },
163 },
164 },
165 },
166 };
167
168 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
169 if (!rt) {
170 IP_VS_DBG_RL("ip6_route_output error, dest: "
171 NIP6_FMT "\n", NIP6(cp->daddr.in6));
172 return NULL;
173 }
174 }
175
176 return rt;
177}
178#endif
179
112 180
113/* 181/*
114 * Release dest->dst_cache before a dest is removed 182 * Release dest->dst_cache before a dest is removed
@@ -123,11 +191,11 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
123 dst_release(old_dst); 191 dst_release(old_dst);
124} 192}
125 193
126#define IP_VS_XMIT(skb, rt) \ 194#define IP_VS_XMIT(pf, skb, rt) \
127do { \ 195do { \
128 (skb)->ipvs_property = 1; \ 196 (skb)->ipvs_property = 1; \
129 skb_forward_csum(skb); \ 197 skb_forward_csum(skb); \
130 NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL, \ 198 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
131 (rt)->u.dst.dev, dst_output); \ 199 (rt)->u.dst.dev, dst_output); \
132} while (0) 200} while (0)
133 201
@@ -200,7 +268,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
200 /* Another hack: avoid icmp_send in ip_fragment */ 268 /* Another hack: avoid icmp_send in ip_fragment */
201 skb->local_df = 1; 269 skb->local_df = 1;
202 270
203 IP_VS_XMIT(skb, rt); 271 IP_VS_XMIT(PF_INET, skb, rt);
204 272
205 LeaveFunction(10); 273 LeaveFunction(10);
206 return NF_STOLEN; 274 return NF_STOLEN;
@@ -213,6 +281,70 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
213 return NF_STOLEN; 281 return NF_STOLEN;
214} 282}
215 283
284#ifdef CONFIG_IP_VS_IPV6
285int
286ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
287 struct ip_vs_protocol *pp)
288{
289 struct rt6_info *rt; /* Route to the other host */
290 struct ipv6hdr *iph = ipv6_hdr(skb);
291 int mtu;
292 struct flowi fl = {
293 .oif = 0,
294 .nl_u = {
295 .ip6_u = {
296 .daddr = iph->daddr,
297 .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
298 };
299
300 EnterFunction(10);
301
302 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
303 if (!rt) {
304 IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, "
305 "dest: " NIP6_FMT "\n", NIP6(iph->daddr));
306 goto tx_error_icmp;
307 }
308
309 /* MTU checking */
310 mtu = dst_mtu(&rt->u.dst);
311 if (skb->len > mtu) {
312 dst_release(&rt->u.dst);
313 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
314 IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n");
315 goto tx_error;
316 }
317
318 /*
319 * Call ip_send_check because we are not sure it is called
320 * after ip_defrag. Is copy-on-write needed?
321 */
322 skb = skb_share_check(skb, GFP_ATOMIC);
323 if (unlikely(skb == NULL)) {
324 dst_release(&rt->u.dst);
325 return NF_STOLEN;
326 }
327
328 /* drop old route */
329 dst_release(skb->dst);
330 skb->dst = &rt->u.dst;
331
332 /* Another hack: avoid icmp_send in ip_fragment */
333 skb->local_df = 1;
334
335 IP_VS_XMIT(PF_INET6, skb, rt);
336
337 LeaveFunction(10);
338 return NF_STOLEN;
339
340 tx_error_icmp:
341 dst_link_failure(skb);
342 tx_error:
343 kfree_skb(skb);
344 LeaveFunction(10);
345 return NF_STOLEN;
346}
347#endif
216 348
217/* 349/*
218 * NAT transmitter (only for outside-to-inside nat forwarding) 350 * NAT transmitter (only for outside-to-inside nat forwarding)
@@ -264,7 +396,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
264 /* mangle the packet */ 396 /* mangle the packet */
265 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) 397 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
266 goto tx_error; 398 goto tx_error;
267 ip_hdr(skb)->daddr = cp->daddr; 399 ip_hdr(skb)->daddr = cp->daddr.ip;
268 ip_send_check(ip_hdr(skb)); 400 ip_send_check(ip_hdr(skb));
269 401
270 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); 402 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
@@ -276,7 +408,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
276 /* Another hack: avoid icmp_send in ip_fragment */ 408 /* Another hack: avoid icmp_send in ip_fragment */
277 skb->local_df = 1; 409 skb->local_df = 1;
278 410
279 IP_VS_XMIT(skb, rt); 411 IP_VS_XMIT(PF_INET, skb, rt);
280 412
281 LeaveFunction(10); 413 LeaveFunction(10);
282 return NF_STOLEN; 414 return NF_STOLEN;
@@ -292,6 +424,83 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
292 goto tx_error; 424 goto tx_error;
293} 425}
294 426
427#ifdef CONFIG_IP_VS_IPV6
428int
429ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
430 struct ip_vs_protocol *pp)
431{
432 struct rt6_info *rt; /* Route to the other host */
433 int mtu;
434
435 EnterFunction(10);
436
437 /* check if it is a connection of no-client-port */
438 if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
439 __be16 _pt, *p;
440 p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
441 sizeof(_pt), &_pt);
442 if (p == NULL)
443 goto tx_error;
444 ip_vs_conn_fill_cport(cp, *p);
445 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
446 }
447
448 rt = __ip_vs_get_out_rt_v6(cp);
449 if (!rt)
450 goto tx_error_icmp;
451
452 /* MTU checking */
453 mtu = dst_mtu(&rt->u.dst);
454 if (skb->len > mtu) {
455 dst_release(&rt->u.dst);
456 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
457 IP_VS_DBG_RL_PKT(0, pp, skb, 0,
458 "ip_vs_nat_xmit_v6(): frag needed for");
459 goto tx_error;
460 }
461
462 /* copy-on-write the packet before mangling it */
463 if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
464 goto tx_error_put;
465
466 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
467 goto tx_error_put;
468
469 /* drop old route */
470 dst_release(skb->dst);
471 skb->dst = &rt->u.dst;
472
473 /* mangle the packet */
474 if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
475 goto tx_error;
476 ipv6_hdr(skb)->daddr = cp->daddr.in6;
477
478 IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
479
480 /* FIXME: when application helper enlarges the packet and the length
481 is larger than the MTU of outgoing device, there will be still
482 MTU problem. */
483
484 /* Another hack: avoid icmp_send in ip_fragment */
485 skb->local_df = 1;
486
487 IP_VS_XMIT(PF_INET6, skb, rt);
488
489 LeaveFunction(10);
490 return NF_STOLEN;
491
492tx_error_icmp:
493 dst_link_failure(skb);
494tx_error:
495 LeaveFunction(10);
496 kfree_skb(skb);
497 return NF_STOLEN;
498tx_error_put:
499 dst_release(&rt->u.dst);
500 goto tx_error;
501}
502#endif
503
295 504
296/* 505/*
297 * IP Tunneling transmitter 506 * IP Tunneling transmitter
@@ -423,6 +632,112 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
423 return NF_STOLEN; 632 return NF_STOLEN;
424} 633}
425 634
635#ifdef CONFIG_IP_VS_IPV6
636int
637ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
638 struct ip_vs_protocol *pp)
639{
640 struct rt6_info *rt; /* Route to the other host */
641 struct net_device *tdev; /* Device to other host */
642 struct ipv6hdr *old_iph = ipv6_hdr(skb);
643 sk_buff_data_t old_transport_header = skb->transport_header;
644 struct ipv6hdr *iph; /* Our new IP header */
645 unsigned int max_headroom; /* The extra header space needed */
646 int mtu;
647
648 EnterFunction(10);
649
650 if (skb->protocol != htons(ETH_P_IPV6)) {
651 IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, "
652 "ETH_P_IPV6: %d, skb protocol: %d\n",
653 htons(ETH_P_IPV6), skb->protocol);
654 goto tx_error;
655 }
656
657 rt = __ip_vs_get_out_rt_v6(cp);
658 if (!rt)
659 goto tx_error_icmp;
660
661 tdev = rt->u.dst.dev;
662
663 mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr);
664 /* TODO IPv6: do we need this check in IPv6? */
665 if (mtu < 1280) {
666 dst_release(&rt->u.dst);
667 IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n");
668 goto tx_error;
669 }
670 if (skb->dst)
671 skb->dst->ops->update_pmtu(skb->dst, mtu);
672
673 if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
674 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
675 dst_release(&rt->u.dst);
676 IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n");
677 goto tx_error;
678 }
679
680 /*
681 * Okay, now see if we can stuff it in the buffer as-is.
682 */
683 max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
684
685 if (skb_headroom(skb) < max_headroom
686 || skb_cloned(skb) || skb_shared(skb)) {
687 struct sk_buff *new_skb =
688 skb_realloc_headroom(skb, max_headroom);
689 if (!new_skb) {
690 dst_release(&rt->u.dst);
691 kfree_skb(skb);
692 IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n");
693 return NF_STOLEN;
694 }
695 kfree_skb(skb);
696 skb = new_skb;
697 old_iph = ipv6_hdr(skb);
698 }
699
700 skb->transport_header = old_transport_header;
701
702 skb_push(skb, sizeof(struct ipv6hdr));
703 skb_reset_network_header(skb);
704 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
705
706 /* drop old route */
707 dst_release(skb->dst);
708 skb->dst = &rt->u.dst;
709
710 /*
711 * Push down and install the IPIP header.
712 */
713 iph = ipv6_hdr(skb);
714 iph->version = 6;
715 iph->nexthdr = IPPROTO_IPV6;
716 iph->payload_len = old_iph->payload_len + sizeof(old_iph);
717 iph->priority = old_iph->priority;
718 memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
719 iph->daddr = rt->rt6i_dst.addr;
720 iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */
721 iph->hop_limit = old_iph->hop_limit;
722
723 /* Another hack: avoid icmp_send in ip_fragment */
724 skb->local_df = 1;
725
726 ip6_local_out(skb);
727
728 LeaveFunction(10);
729
730 return NF_STOLEN;
731
732tx_error_icmp:
733 dst_link_failure(skb);
734tx_error:
735 kfree_skb(skb);
736 LeaveFunction(10);
737 return NF_STOLEN;
738}
739#endif
740
426 741
427/* 742/*
428 * Direct Routing transmitter 743 * Direct Routing transmitter
@@ -467,7 +782,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
467 /* Another hack: avoid icmp_send in ip_fragment */ 782 /* Another hack: avoid icmp_send in ip_fragment */
468 skb->local_df = 1; 783 skb->local_df = 1;
469 784
470 IP_VS_XMIT(skb, rt); 785 IP_VS_XMIT(PF_INET, skb, rt);
471 786
472 LeaveFunction(10); 787 LeaveFunction(10);
473 return NF_STOLEN; 788 return NF_STOLEN;
@@ -480,6 +795,60 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
480 return NF_STOLEN; 795 return NF_STOLEN;
481} 796}
482 797
798#ifdef CONFIG_IP_VS_IPV6
799int
800ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
801 struct ip_vs_protocol *pp)
802{
803 struct rt6_info *rt; /* Route to the other host */
804 int mtu;
805
806 EnterFunction(10);
807
808 rt = __ip_vs_get_out_rt_v6(cp);
809 if (!rt)
810 goto tx_error_icmp;
811
812 /* MTU checking */
813 mtu = dst_mtu(&rt->u.dst);
814 if (skb->len > mtu) {
815 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
816 dst_release(&rt->u.dst);
817 IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n");
818 goto tx_error;
819 }
820
821 /*
822 * Call ip_send_check because we are not sure it is called
823 * after ip_defrag. Is copy-on-write needed?
824 */
825 skb = skb_share_check(skb, GFP_ATOMIC);
826 if (unlikely(skb == NULL)) {
827 dst_release(&rt->u.dst);
828 return NF_STOLEN;
829 }
830
831 /* drop old route */
832 dst_release(skb->dst);
833 skb->dst = &rt->u.dst;
834
835 /* Another hack: avoid icmp_send in ip_fragment */
836 skb->local_df = 1;
837
838 IP_VS_XMIT(PF_INET6, skb, rt);
839
840 LeaveFunction(10);
841 return NF_STOLEN;
842
843tx_error_icmp:
844 dst_link_failure(skb);
845tx_error:
846 kfree_skb(skb);
847 LeaveFunction(10);
848 return NF_STOLEN;
849}
850#endif
851
483 852
484/* 853/*
485 * ICMP packet transmitter 854 * ICMP packet transmitter
@@ -540,7 +909,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
540 /* Another hack: avoid icmp_send in ip_fragment */ 909 /* Another hack: avoid icmp_send in ip_fragment */
541 skb->local_df = 1; 910 skb->local_df = 1;
542 911
543 IP_VS_XMIT(skb, rt); 912 IP_VS_XMIT(PF_INET, skb, rt);
544 913
545 rc = NF_STOLEN; 914 rc = NF_STOLEN;
546 goto out; 915 goto out;
@@ -557,3 +926,79 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
557 ip_rt_put(rt); 926 ip_rt_put(rt);
558 goto tx_error; 927 goto tx_error;
559} 928}
929
930#ifdef CONFIG_IP_VS_IPV6
931int
932ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
933 struct ip_vs_protocol *pp, int offset)
934{
935 struct rt6_info *rt; /* Route to the other host */
936 int mtu;
937 int rc;
938
939 EnterFunction(10);
940
941 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
942 forwarded directly here, because there is no need to
943 translate address/port back */
944 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
945 if (cp->packet_xmit)
946 rc = cp->packet_xmit(skb, cp, pp);
947 else
948 rc = NF_ACCEPT;
949 /* do not touch skb anymore */
950 atomic_inc(&cp->in_pkts);
951 goto out;
952 }
953
954 /*
955 * mangle and send the packet here (only for VS/NAT)
956 */
957
958 rt = __ip_vs_get_out_rt_v6(cp);
959 if (!rt)
960 goto tx_error_icmp;
961
962 /* MTU checking */
963 mtu = dst_mtu(&rt->u.dst);
964 if (skb->len > mtu) {
965 dst_release(&rt->u.dst);
966 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
967 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
968 goto tx_error;
969 }
970
971 /* copy-on-write the packet before mangling it */
972 if (!skb_make_writable(skb, offset))
973 goto tx_error_put;
974
975 if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
976 goto tx_error_put;
977
978 /* drop the old route when skb is not shared */
979 dst_release(skb->dst);
980 skb->dst = &rt->u.dst;
981
982 ip_vs_nat_icmp_v6(skb, pp, cp, 0);
983
984 /* Another hack: avoid icmp_send in ip_fragment */
985 skb->local_df = 1;
986
987 IP_VS_XMIT(PF_INET6, skb, rt);
988
989 rc = NF_STOLEN;
990 goto out;
991
992tx_error_icmp:
993 dst_link_failure(skb);
994tx_error:
995 dev_kfree_skb(skb);
996 rc = NF_STOLEN;
997out:
998 LeaveFunction(10);
999 return rc;
1000tx_error_put:
1001 dst_release(&rt->u.dst);
1002 goto tx_error;
1003}
1004#endif
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6ee5354c9aa1..f62187bb6d08 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -282,6 +282,8 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 struct rtable *r = NULL; 282 struct rtable *r = NULL;
283 283
284 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { 284 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285 if (!rt_hash_table[st->bucket].chain)
286 continue;
285 rcu_read_lock_bh(); 287 rcu_read_lock_bh();
286 r = rcu_dereference(rt_hash_table[st->bucket].chain); 288 r = rcu_dereference(rt_hash_table[st->bucket].chain);
287 while (r) { 289 while (r) {
@@ -299,11 +301,14 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
299 struct rtable *r) 301 struct rtable *r)
300{ 302{
301 struct rt_cache_iter_state *st = seq->private; 303 struct rt_cache_iter_state *st = seq->private;
304
302 r = r->u.dst.rt_next; 305 r = r->u.dst.rt_next;
303 while (!r) { 306 while (!r) {
304 rcu_read_unlock_bh(); 307 rcu_read_unlock_bh();
305 if (--st->bucket < 0) 308 do {
306 break; 309 if (--st->bucket < 0)
310 return NULL;
311 } while (!rt_hash_table[st->bucket].chain);
307 rcu_read_lock_bh(); 312 rcu_read_lock_bh();
308 r = rt_hash_table[st->bucket].chain; 313 r = rt_hash_table[st->bucket].chain;
309 } 314 }
@@ -2840,7 +2845,9 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2840 if (s_h < 0) 2845 if (s_h < 0)
2841 s_h = 0; 2846 s_h = 0;
2842 s_idx = idx = cb->args[1]; 2847 s_idx = idx = cb->args[1];
2843 for (h = s_h; h <= rt_hash_mask; h++) { 2848 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2849 if (!rt_hash_table[h].chain)
2850 continue;
2844 rcu_read_lock_bh(); 2851 rcu_read_lock_bh();
2845 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; 2852 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2846 rt = rcu_dereference(rt->u.dst.rt_next), idx++) { 2853 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
@@ -2859,7 +2866,6 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2859 dst_release(xchg(&skb->dst, NULL)); 2866 dst_release(xchg(&skb->dst, NULL));
2860 } 2867 }
2861 rcu_read_unlock_bh(); 2868 rcu_read_unlock_bh();
2862 s_idx = 0;
2863 } 2869 }
2864 2870
2865done: 2871done:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 67ccce2a96bd..3b76bce769dd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -979,6 +979,39 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
979 } 979 }
980} 980}
981 981
982/* This must be called before lost_out is incremented */
983static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
984{
985 if ((tp->retransmit_skb_hint == NULL) ||
986 before(TCP_SKB_CB(skb)->seq,
987 TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
988 tp->retransmit_skb_hint = skb;
989
990 if (!tp->lost_out ||
991 after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
992 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
993}
994
995static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
996{
997 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
998 tcp_verify_retransmit_hint(tp, skb);
999
1000 tp->lost_out += tcp_skb_pcount(skb);
1001 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1002 }
1003}
1004
1005void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
1006{
1007 tcp_verify_retransmit_hint(tp, skb);
1008
1009 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
1010 tp->lost_out += tcp_skb_pcount(skb);
1011 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1012 }
1013}
1014
982/* This procedure tags the retransmission queue when SACKs arrive. 1015/* This procedure tags the retransmission queue when SACKs arrive.
983 * 1016 *
984 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). 1017 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
@@ -1155,13 +1188,7 @@ static void tcp_mark_lost_retrans(struct sock *sk)
1155 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1188 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1156 tp->retrans_out -= tcp_skb_pcount(skb); 1189 tp->retrans_out -= tcp_skb_pcount(skb);
1157 1190
1158 /* clear lost hint */ 1191 tcp_skb_mark_lost_uncond_verify(tp, skb);
1159 tp->retransmit_skb_hint = NULL;
1160
1161 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
1162 tp->lost_out += tcp_skb_pcount(skb);
1163 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1164 }
1165 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); 1192 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
1166 } else { 1193 } else {
1167 if (before(ack_seq, new_low_seq)) 1194 if (before(ack_seq, new_low_seq))
@@ -1271,9 +1298,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1271 ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); 1298 ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
1272 tp->lost_out -= tcp_skb_pcount(skb); 1299 tp->lost_out -= tcp_skb_pcount(skb);
1273 tp->retrans_out -= tcp_skb_pcount(skb); 1300 tp->retrans_out -= tcp_skb_pcount(skb);
1274
1275 /* clear lost hint */
1276 tp->retransmit_skb_hint = NULL;
1277 } 1301 }
1278 } else { 1302 } else {
1279 if (!(sacked & TCPCB_RETRANS)) { 1303 if (!(sacked & TCPCB_RETRANS)) {
@@ -1292,9 +1316,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1292 if (sacked & TCPCB_LOST) { 1316 if (sacked & TCPCB_LOST) {
1293 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; 1317 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1294 tp->lost_out -= tcp_skb_pcount(skb); 1318 tp->lost_out -= tcp_skb_pcount(skb);
1295
1296 /* clear lost hint */
1297 tp->retransmit_skb_hint = NULL;
1298 } 1319 }
1299 } 1320 }
1300 1321
@@ -1324,7 +1345,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1324 if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) { 1345 if (dup_sack && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) {
1325 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1346 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1326 tp->retrans_out -= tcp_skb_pcount(skb); 1347 tp->retrans_out -= tcp_skb_pcount(skb);
1327 tp->retransmit_skb_hint = NULL;
1328 } 1348 }
1329 1349
1330 return flag; 1350 return flag;
@@ -1726,6 +1746,8 @@ int tcp_use_frto(struct sock *sk)
1726 return 0; 1746 return 0;
1727 1747
1728 skb = tcp_write_queue_head(sk); 1748 skb = tcp_write_queue_head(sk);
1749 if (tcp_skb_is_last(sk, skb))
1750 return 1;
1729 skb = tcp_write_queue_next(sk, skb); /* Skips head */ 1751 skb = tcp_write_queue_next(sk, skb); /* Skips head */
1730 tcp_for_write_queue_from(skb, sk) { 1752 tcp_for_write_queue_from(skb, sk) {
1731 if (skb == tcp_send_head(sk)) 1753 if (skb == tcp_send_head(sk))
@@ -1867,6 +1889,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
1867 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { 1889 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
1868 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1890 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1869 tp->lost_out += tcp_skb_pcount(skb); 1891 tp->lost_out += tcp_skb_pcount(skb);
1892 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
1870 } 1893 }
1871 } 1894 }
1872 tcp_verify_left_out(tp); 1895 tcp_verify_left_out(tp);
@@ -1883,7 +1906,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
1883 tp->high_seq = tp->snd_nxt; 1906 tp->high_seq = tp->snd_nxt;
1884 TCP_ECN_queue_cwr(tp); 1907 TCP_ECN_queue_cwr(tp);
1885 1908
1886 tcp_clear_retrans_hints_partial(tp); 1909 tcp_clear_all_retrans_hints(tp);
1887} 1910}
1888 1911
1889static void tcp_clear_retrans_partial(struct tcp_sock *tp) 1912static void tcp_clear_retrans_partial(struct tcp_sock *tp)
@@ -1934,12 +1957,11 @@ void tcp_enter_loss(struct sock *sk, int how)
1934 /* Push undo marker, if it was plain RTO and nothing 1957 /* Push undo marker, if it was plain RTO and nothing
1935 * was retransmitted. */ 1958 * was retransmitted. */
1936 tp->undo_marker = tp->snd_una; 1959 tp->undo_marker = tp->snd_una;
1937 tcp_clear_retrans_hints_partial(tp);
1938 } else { 1960 } else {
1939 tp->sacked_out = 0; 1961 tp->sacked_out = 0;
1940 tp->fackets_out = 0; 1962 tp->fackets_out = 0;
1941 tcp_clear_all_retrans_hints(tp);
1942 } 1963 }
1964 tcp_clear_all_retrans_hints(tp);
1943 1965
1944 tcp_for_write_queue(skb, sk) { 1966 tcp_for_write_queue(skb, sk) {
1945 if (skb == tcp_send_head(sk)) 1967 if (skb == tcp_send_head(sk))
@@ -1952,6 +1974,7 @@ void tcp_enter_loss(struct sock *sk, int how)
1952 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; 1974 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1953 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1975 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1954 tp->lost_out += tcp_skb_pcount(skb); 1976 tp->lost_out += tcp_skb_pcount(skb);
1977 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
1955 } 1978 }
1956 } 1979 }
1957 tcp_verify_left_out(tp); 1980 tcp_verify_left_out(tp);
@@ -2157,19 +2180,6 @@ static int tcp_time_to_recover(struct sock *sk)
2157 return 0; 2180 return 0;
2158} 2181}
2159 2182
2160/* RFC: This is from the original, I doubt that this is necessary at all:
2161 * clear xmit_retrans hint if seq of this skb is beyond hint. How could we
2162 * retransmitted past LOST markings in the first place? I'm not fully sure
2163 * about undo and end of connection cases, which can cause R without L?
2164 */
2165static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
2166{
2167 if ((tp->retransmit_skb_hint != NULL) &&
2168 before(TCP_SKB_CB(skb)->seq,
2169 TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
2170 tp->retransmit_skb_hint = NULL;
2171}
2172
2173/* Mark head of queue up as lost. With RFC3517 SACK, the packets is 2183/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
2174 * is against sacked "cnt", otherwise it's against facked "cnt" 2184 * is against sacked "cnt", otherwise it's against facked "cnt"
2175 */ 2185 */
@@ -2217,11 +2227,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
2217 cnt = packets; 2227 cnt = packets;
2218 } 2228 }
2219 2229
2220 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { 2230 tcp_skb_mark_lost(tp, skb);
2221 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
2222 tp->lost_out += tcp_skb_pcount(skb);
2223 tcp_verify_retransmit_hint(tp, skb);
2224 }
2225 } 2231 }
2226 tcp_verify_left_out(tp); 2232 tcp_verify_left_out(tp);
2227} 2233}
@@ -2263,11 +2269,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2263 if (!tcp_skb_timedout(sk, skb)) 2269 if (!tcp_skb_timedout(sk, skb))
2264 break; 2270 break;
2265 2271
2266 if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { 2272 tcp_skb_mark_lost(tp, skb);
2267 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
2268 tp->lost_out += tcp_skb_pcount(skb);
2269 tcp_verify_retransmit_hint(tp, skb);
2270 }
2271 } 2273 }
2272 2274
2273 tp->scoreboard_skb_hint = skb; 2275 tp->scoreboard_skb_hint = skb;
@@ -2378,10 +2380,6 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
2378 } 2380 }
2379 tcp_moderate_cwnd(tp); 2381 tcp_moderate_cwnd(tp);
2380 tp->snd_cwnd_stamp = tcp_time_stamp; 2382 tp->snd_cwnd_stamp = tcp_time_stamp;
2381
2382 /* There is something screwy going on with the retrans hints after
2383 an undo */
2384 tcp_clear_all_retrans_hints(tp);
2385} 2383}
2386 2384
2387static inline int tcp_may_undo(struct tcp_sock *tp) 2385static inline int tcp_may_undo(struct tcp_sock *tp)
@@ -2848,6 +2846,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
2848 int flag = 0; 2846 int flag = 0;
2849 u32 pkts_acked = 0; 2847 u32 pkts_acked = 0;
2850 u32 reord = tp->packets_out; 2848 u32 reord = tp->packets_out;
2849 u32 prior_sacked = tp->sacked_out;
2851 s32 seq_rtt = -1; 2850 s32 seq_rtt = -1;
2852 s32 ca_seq_rtt = -1; 2851 s32 ca_seq_rtt = -1;
2853 ktime_t last_ackt = net_invalid_timestamp(); 2852 ktime_t last_ackt = net_invalid_timestamp();
@@ -2929,7 +2928,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
2929 2928
2930 tcp_unlink_write_queue(skb, sk); 2929 tcp_unlink_write_queue(skb, sk);
2931 sk_wmem_free_skb(sk, skb); 2930 sk_wmem_free_skb(sk, skb);
2932 tcp_clear_all_retrans_hints(tp); 2931 tp->scoreboard_skb_hint = NULL;
2932 if (skb == tp->retransmit_skb_hint)
2933 tp->retransmit_skb_hint = NULL;
2934 if (skb == tp->lost_skb_hint)
2935 tp->lost_skb_hint = NULL;
2933 } 2936 }
2934 2937
2935 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 2938 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
@@ -2948,6 +2951,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets)
2948 /* Non-retransmitted hole got filled? That's reordering */ 2951 /* Non-retransmitted hole got filled? That's reordering */
2949 if (reord < prior_fackets) 2952 if (reord < prior_fackets)
2950 tcp_update_reordering(sk, tp->fackets_out - reord, 0); 2953 tcp_update_reordering(sk, tp->fackets_out - reord, 0);
2954
2955 /* No need to care for underflows here because
2956 * the lost_skb_hint gets NULLed if we're past it
2957 * (or something non-trivial happened)
2958 */
2959 if (tcp_is_fack(tp))
2960 tp->lost_cnt_hint -= pkts_acked;
2961 else
2962 tp->lost_cnt_hint -= prior_sacked - tp->sacked_out;
2951 } 2963 }
2952 2964
2953 tp->fackets_out -= min(pkts_acked, tp->fackets_out); 2965 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
@@ -3442,6 +3454,22 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3442 } 3454 }
3443} 3455}
3444 3456
3457static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
3458{
3459 __be32 *ptr = (__be32 *)(th + 1);
3460
3461 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3462 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
3463 tp->rx_opt.saw_tstamp = 1;
3464 ++ptr;
3465 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3466 ++ptr;
3467 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
3468 return 1;
3469 }
3470 return 0;
3471}
3472
3445/* Fast parse options. This hopes to only see timestamps. 3473/* Fast parse options. This hopes to only see timestamps.
3446 * If it is wrong it falls back on tcp_parse_options(). 3474 * If it is wrong it falls back on tcp_parse_options().
3447 */ 3475 */
@@ -3453,16 +3481,8 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
3453 return 0; 3481 return 0;
3454 } else if (tp->rx_opt.tstamp_ok && 3482 } else if (tp->rx_opt.tstamp_ok &&
3455 th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { 3483 th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
3456 __be32 *ptr = (__be32 *)(th + 1); 3484 if (tcp_parse_aligned_timestamp(tp, th))
3457 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3458 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
3459 tp->rx_opt.saw_tstamp = 1;
3460 ++ptr;
3461 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3462 ++ptr;
3463 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
3464 return 1; 3485 return 1;
3465 }
3466 } 3486 }
3467 tcp_parse_options(skb, &tp->rx_opt, 1); 3487 tcp_parse_options(skb, &tp->rx_opt, 1);
3468 return 1; 3488 return 1;
@@ -4138,7 +4158,7 @@ drop:
4138 skb1 = skb1->prev; 4158 skb1 = skb1->prev;
4139 } 4159 }
4140 } 4160 }
4141 __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue); 4161 __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
4142 4162
4143 /* And clean segments covered by new one as whole. */ 4163 /* And clean segments covered by new one as whole. */
4144 while ((skb1 = skb->next) != 4164 while ((skb1 = skb->next) !=
@@ -4161,6 +4181,18 @@ add_sack:
4161 } 4181 }
4162} 4182}
4163 4183
4184static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
4185 struct sk_buff_head *list)
4186{
4187 struct sk_buff *next = skb->next;
4188
4189 __skb_unlink(skb, list);
4190 __kfree_skb(skb);
4191 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4192
4193 return next;
4194}
4195
4164/* Collapse contiguous sequence of skbs head..tail with 4196/* Collapse contiguous sequence of skbs head..tail with
4165 * sequence numbers start..end. 4197 * sequence numbers start..end.
4166 * Segments with FIN/SYN are not collapsed (only because this 4198 * Segments with FIN/SYN are not collapsed (only because this
@@ -4178,11 +4210,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
4178 for (skb = head; skb != tail;) { 4210 for (skb = head; skb != tail;) {
4179 /* No new bits? It is possible on ofo queue. */ 4211 /* No new bits? It is possible on ofo queue. */
4180 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 4212 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4181 struct sk_buff *next = skb->next; 4213 skb = tcp_collapse_one(sk, skb, list);
4182 __skb_unlink(skb, list);
4183 __kfree_skb(skb);
4184 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4185 skb = next;
4186 continue; 4214 continue;
4187 } 4215 }
4188 4216
@@ -4228,7 +4256,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
4228 memcpy(nskb->head, skb->head, header); 4256 memcpy(nskb->head, skb->head, header);
4229 memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); 4257 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
4230 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; 4258 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
4231 __skb_insert(nskb, skb->prev, skb, list); 4259 __skb_queue_before(list, skb, nskb);
4232 skb_set_owner_r(nskb, sk); 4260 skb_set_owner_r(nskb, sk);
4233 4261
4234 /* Copy data, releasing collapsed skbs. */ 4262 /* Copy data, releasing collapsed skbs. */
@@ -4246,11 +4274,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
4246 start += size; 4274 start += size;
4247 } 4275 }
4248 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 4276 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4249 struct sk_buff *next = skb->next; 4277 skb = tcp_collapse_one(sk, skb, list);
4250 __skb_unlink(skb, list);
4251 __kfree_skb(skb);
4252 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4253 skb = next;
4254 if (skb == tail || 4278 if (skb == tail ||
4255 tcp_hdr(skb)->syn || 4279 tcp_hdr(skb)->syn ||
4256 tcp_hdr(skb)->fin) 4280 tcp_hdr(skb)->fin)
@@ -4691,6 +4715,67 @@ out:
4691} 4715}
4692#endif /* CONFIG_NET_DMA */ 4716#endif /* CONFIG_NET_DMA */
4693 4717
4718/* Does PAWS and seqno based validation of an incoming segment, flags will
4719 * play significant role here.
4720 */
4721static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
4722 struct tcphdr *th, int syn_inerr)
4723{
4724 struct tcp_sock *tp = tcp_sk(sk);
4725
4726 /* RFC1323: H1. Apply PAWS check first. */
4727 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4728 tcp_paws_discard(sk, skb)) {
4729 if (!th->rst) {
4730 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
4731 tcp_send_dupack(sk, skb);
4732 goto discard;
4733 }
4734 /* Reset is accepted even if it did not pass PAWS. */
4735 }
4736
4737 /* Step 1: check sequence number */
4738 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
4739 /* RFC793, page 37: "In all states except SYN-SENT, all reset
4740 * (RST) segments are validated by checking their SEQ-fields."
4741 * And page 69: "If an incoming segment is not acceptable,
4742 * an acknowledgment should be sent in reply (unless the RST
4743 * bit is set, if so drop the segment and return)".
4744 */
4745 if (!th->rst)
4746 tcp_send_dupack(sk, skb);
4747 goto discard;
4748 }
4749
4750 /* Step 2: check RST bit */
4751 if (th->rst) {
4752 tcp_reset(sk);
4753 goto discard;
4754 }
4755
4756 /* ts_recent update must be made after we are sure that the packet
4757 * is in window.
4758 */
4759 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
4760
4761 /* step 3: check security and precedence [ignored] */
4762
4763 /* step 4: Check for a SYN in window. */
4764 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4765 if (syn_inerr)
4766 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
4767 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
4768 tcp_reset(sk);
4769 return -1;
4770 }
4771
4772 return 1;
4773
4774discard:
4775 __kfree_skb(skb);
4776 return 0;
4777}
4778
4694/* 4779/*
4695 * TCP receive function for the ESTABLISHED state. 4780 * TCP receive function for the ESTABLISHED state.
4696 * 4781 *
@@ -4718,6 +4803,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4718 struct tcphdr *th, unsigned len) 4803 struct tcphdr *th, unsigned len)
4719{ 4804{
4720 struct tcp_sock *tp = tcp_sk(sk); 4805 struct tcp_sock *tp = tcp_sk(sk);
4806 int res;
4721 4807
4722 /* 4808 /*
4723 * Header prediction. 4809 * Header prediction.
@@ -4756,19 +4842,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4756 4842
4757 /* Check timestamp */ 4843 /* Check timestamp */
4758 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { 4844 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
4759 __be32 *ptr = (__be32 *)(th + 1);
4760
4761 /* No? Slow path! */ 4845 /* No? Slow path! */
4762 if (*ptr != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) 4846 if (!tcp_parse_aligned_timestamp(tp, th))
4763 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
4764 goto slow_path; 4847 goto slow_path;
4765 4848
4766 tp->rx_opt.saw_tstamp = 1;
4767 ++ptr;
4768 tp->rx_opt.rcv_tsval = ntohl(*ptr);
4769 ++ptr;
4770 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
4771
4772 /* If PAWS failed, check it more carefully in slow path */ 4849 /* If PAWS failed, check it more carefully in slow path */
4773 if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) 4850 if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
4774 goto slow_path; 4851 goto slow_path;
@@ -4899,51 +4976,12 @@ slow_path:
4899 goto csum_error; 4976 goto csum_error;
4900 4977
4901 /* 4978 /*
4902 * RFC1323: H1. Apply PAWS check first.
4903 */
4904 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4905 tcp_paws_discard(sk, skb)) {
4906 if (!th->rst) {
4907 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
4908 tcp_send_dupack(sk, skb);
4909 goto discard;
4910 }
4911 /* Resets are accepted even if PAWS failed.
4912
4913 ts_recent update must be made after we are sure
4914 that the packet is in window.
4915 */
4916 }
4917
4918 /*
4919 * Standard slow path. 4979 * Standard slow path.
4920 */ 4980 */
4921 4981
4922 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { 4982 res = tcp_validate_incoming(sk, skb, th, 1);
4923 /* RFC793, page 37: "In all states except SYN-SENT, all reset 4983 if (res <= 0)
4924 * (RST) segments are validated by checking their SEQ-fields." 4984 return -res;
4925 * And page 69: "If an incoming segment is not acceptable,
4926 * an acknowledgment should be sent in reply (unless the RST bit
4927 * is set, if so drop the segment and return)".
4928 */
4929 if (!th->rst)
4930 tcp_send_dupack(sk, skb);
4931 goto discard;
4932 }
4933
4934 if (th->rst) {
4935 tcp_reset(sk);
4936 goto discard;
4937 }
4938
4939 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
4940
4941 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4942 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
4943 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
4944 tcp_reset(sk);
4945 return 1;
4946 }
4947 4985
4948step5: 4986step5:
4949 if (th->ack) 4987 if (th->ack)
@@ -5225,6 +5263,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5225 struct tcp_sock *tp = tcp_sk(sk); 5263 struct tcp_sock *tp = tcp_sk(sk);
5226 struct inet_connection_sock *icsk = inet_csk(sk); 5264 struct inet_connection_sock *icsk = inet_csk(sk);
5227 int queued = 0; 5265 int queued = 0;
5266 int res;
5228 5267
5229 tp->rx_opt.saw_tstamp = 0; 5268 tp->rx_opt.saw_tstamp = 0;
5230 5269
@@ -5277,42 +5316,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5277 return 0; 5316 return 0;
5278 } 5317 }
5279 5318
5280 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && 5319 res = tcp_validate_incoming(sk, skb, th, 0);
5281 tcp_paws_discard(sk, skb)) { 5320 if (res <= 0)
5282 if (!th->rst) { 5321 return -res;
5283 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
5284 tcp_send_dupack(sk, skb);
5285 goto discard;
5286 }
5287 /* Reset is accepted even if it did not pass PAWS. */
5288 }
5289
5290 /* step 1: check sequence number */
5291 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
5292 if (!th->rst)
5293 tcp_send_dupack(sk, skb);
5294 goto discard;
5295 }
5296
5297 /* step 2: check RST bit */
5298 if (th->rst) {
5299 tcp_reset(sk);
5300 goto discard;
5301 }
5302
5303 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
5304
5305 /* step 3: check security and precedence [ignored] */
5306
5307 /* step 4:
5308 *
5309 * Check for a SYN in window.
5310 */
5311 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
5312 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
5313 tcp_reset(sk);
5314 return 1;
5315 }
5316 5322
5317 /* step 5: check the ACK field */ 5323 /* step 5: check the ACK field */
5318 if (th->ack) { 5324 if (th->ack) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 44c1e934824b..44aef1c1f373 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1364,6 +1364,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1364 tcp_mtup_init(newsk); 1364 tcp_mtup_init(newsk);
1365 tcp_sync_mss(newsk, dst_mtu(dst)); 1365 tcp_sync_mss(newsk, dst_mtu(dst));
1366 newtp->advmss = dst_metric(dst, RTAX_ADVMSS); 1366 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1367 if (tcp_sk(sk)->rx_opt.user_mss &&
1368 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1369 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1370
1367 tcp_initialize_rcv_mss(newsk); 1371 tcp_initialize_rcv_mss(newsk);
1368 1372
1369#ifdef CONFIG_TCP_MD5SIG 1373#ifdef CONFIG_TCP_MD5SIG
@@ -1946,6 +1950,12 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1946 return rc; 1950 return rc;
1947} 1951}
1948 1952
1953static inline int empty_bucket(struct tcp_iter_state *st)
1954{
1955 return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1956 hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1957}
1958
1949static void *established_get_first(struct seq_file *seq) 1959static void *established_get_first(struct seq_file *seq)
1950{ 1960{
1951 struct tcp_iter_state* st = seq->private; 1961 struct tcp_iter_state* st = seq->private;
@@ -1958,6 +1968,10 @@ static void *established_get_first(struct seq_file *seq)
1958 struct inet_timewait_sock *tw; 1968 struct inet_timewait_sock *tw;
1959 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 1969 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1960 1970
1971 /* Lockless fast path for the common case of empty buckets */
1972 if (empty_bucket(st))
1973 continue;
1974
1961 read_lock_bh(lock); 1975 read_lock_bh(lock);
1962 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 1976 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1963 if (sk->sk_family != st->family || 1977 if (sk->sk_family != st->family ||
@@ -2008,13 +2022,15 @@ get_tw:
2008 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2022 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2009 st->state = TCP_SEQ_STATE_ESTABLISHED; 2023 st->state = TCP_SEQ_STATE_ESTABLISHED;
2010 2024
2011 if (++st->bucket < tcp_hashinfo.ehash_size) { 2025 /* Look for next non empty bucket */
2012 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2026 while (++st->bucket < tcp_hashinfo.ehash_size &&
2013 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); 2027 empty_bucket(st))
2014 } else { 2028 ;
2015 cur = NULL; 2029 if (st->bucket >= tcp_hashinfo.ehash_size)
2016 goto out; 2030 return NULL;
2017 } 2031
2032 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2033 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2018 } else 2034 } else
2019 sk = sk_next(sk); 2035 sk = sk_next(sk);
2020 2036
@@ -2376,6 +2392,7 @@ static int __net_init tcp_sk_init(struct net *net)
2376static void __net_exit tcp_sk_exit(struct net *net) 2392static void __net_exit tcp_sk_exit(struct net *net)
2377{ 2393{
2378 inet_ctl_sock_destroy(net->ipv4.tcp_sock); 2394 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2395 inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2379} 2396}
2380 2397
2381static struct pernet_operations __net_initdata tcp_sk_ops = { 2398static struct pernet_operations __net_initdata tcp_sk_ops = {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8165f5aa8c71..a8499ef3234a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1824,6 +1824,8 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb,
1824 1824
1825 /* changed transmit queue under us so clear hints */ 1825 /* changed transmit queue under us so clear hints */
1826 tcp_clear_retrans_hints_partial(tp); 1826 tcp_clear_retrans_hints_partial(tp);
1827 if (next_skb == tp->retransmit_skb_hint)
1828 tp->retransmit_skb_hint = skb;
1827 1829
1828 sk_wmem_free_skb(sk, next_skb); 1830 sk_wmem_free_skb(sk, next_skb);
1829} 1831}
@@ -1838,7 +1840,7 @@ void tcp_simple_retransmit(struct sock *sk)
1838 struct tcp_sock *tp = tcp_sk(sk); 1840 struct tcp_sock *tp = tcp_sk(sk);
1839 struct sk_buff *skb; 1841 struct sk_buff *skb;
1840 unsigned int mss = tcp_current_mss(sk, 0); 1842 unsigned int mss = tcp_current_mss(sk, 0);
1841 int lost = 0; 1843 u32 prior_lost = tp->lost_out;
1842 1844
1843 tcp_for_write_queue(skb, sk) { 1845 tcp_for_write_queue(skb, sk) {
1844 if (skb == tcp_send_head(sk)) 1846 if (skb == tcp_send_head(sk))
@@ -1849,17 +1851,13 @@ void tcp_simple_retransmit(struct sock *sk)
1849 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1851 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1850 tp->retrans_out -= tcp_skb_pcount(skb); 1852 tp->retrans_out -= tcp_skb_pcount(skb);
1851 } 1853 }
1852 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) { 1854 tcp_skb_mark_lost_uncond_verify(tp, skb);
1853 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1854 tp->lost_out += tcp_skb_pcount(skb);
1855 lost = 1;
1856 }
1857 } 1855 }
1858 } 1856 }
1859 1857
1860 tcp_clear_all_retrans_hints(tp); 1858 tcp_clear_retrans_hints_partial(tp);
1861 1859
1862 if (!lost) 1860 if (prior_lost == tp->lost_out)
1863 return; 1861 return;
1864 1862
1865 if (tcp_is_reno(tp)) 1863 if (tcp_is_reno(tp))
@@ -1934,8 +1932,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1934 /* Collapse two adjacent packets if worthwhile and we can. */ 1932 /* Collapse two adjacent packets if worthwhile and we can. */
1935 if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && 1933 if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
1936 (skb->len < (cur_mss >> 1)) && 1934 (skb->len < (cur_mss >> 1)) &&
1937 (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
1938 (!tcp_skb_is_last(sk, skb)) && 1935 (!tcp_skb_is_last(sk, skb)) &&
1936 (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) &&
1939 (skb_shinfo(skb)->nr_frags == 0 && 1937 (skb_shinfo(skb)->nr_frags == 0 &&
1940 skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && 1938 skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) &&
1941 (tcp_skb_pcount(skb) == 1 && 1939 (tcp_skb_pcount(skb) == 1 &&
@@ -1996,86 +1994,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1996 return err; 1994 return err;
1997} 1995}
1998 1996
1999/* This gets called after a retransmit timeout, and the initially 1997static int tcp_can_forward_retransmit(struct sock *sk)
2000 * retransmitted data is acknowledged. It tries to continue
2001 * resending the rest of the retransmit queue, until either
2002 * we've sent it all or the congestion window limit is reached.
2003 * If doing SACK, the first ACK which comes back for a timeout
2004 * based retransmit packet might feed us FACK information again.
2005 * If so, we use it to avoid unnecessarily retransmissions.
2006 */
2007void tcp_xmit_retransmit_queue(struct sock *sk)
2008{ 1998{
2009 const struct inet_connection_sock *icsk = inet_csk(sk); 1999 const struct inet_connection_sock *icsk = inet_csk(sk);
2010 struct tcp_sock *tp = tcp_sk(sk); 2000 struct tcp_sock *tp = tcp_sk(sk);
2011 struct sk_buff *skb;
2012 int packet_cnt;
2013
2014 if (tp->retransmit_skb_hint) {
2015 skb = tp->retransmit_skb_hint;
2016 packet_cnt = tp->retransmit_cnt_hint;
2017 } else {
2018 skb = tcp_write_queue_head(sk);
2019 packet_cnt = 0;
2020 }
2021
2022 /* First pass: retransmit lost packets. */
2023 if (tp->lost_out) {
2024 tcp_for_write_queue_from(skb, sk) {
2025 __u8 sacked = TCP_SKB_CB(skb)->sacked;
2026
2027 if (skb == tcp_send_head(sk))
2028 break;
2029 /* we could do better than to assign each time */
2030 tp->retransmit_skb_hint = skb;
2031 tp->retransmit_cnt_hint = packet_cnt;
2032
2033 /* Assume this retransmit will generate
2034 * only one packet for congestion window
2035 * calculation purposes. This works because
2036 * tcp_retransmit_skb() will chop up the
2037 * packet to be MSS sized and all the
2038 * packet counting works out.
2039 */
2040 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
2041 return;
2042
2043 if (sacked & TCPCB_LOST) {
2044 if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
2045 int mib_idx;
2046
2047 if (tcp_retransmit_skb(sk, skb)) {
2048 tp->retransmit_skb_hint = NULL;
2049 return;
2050 }
2051 if (icsk->icsk_ca_state != TCP_CA_Loss)
2052 mib_idx = LINUX_MIB_TCPFASTRETRANS;
2053 else
2054 mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
2055 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2056
2057 if (skb == tcp_write_queue_head(sk))
2058 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2059 inet_csk(sk)->icsk_rto,
2060 TCP_RTO_MAX);
2061 }
2062
2063 packet_cnt += tcp_skb_pcount(skb);
2064 if (packet_cnt >= tp->lost_out)
2065 break;
2066 }
2067 }
2068 }
2069
2070 /* OK, demanded retransmission is finished. */
2071 2001
2072 /* Forward retransmissions are possible only during Recovery. */ 2002 /* Forward retransmissions are possible only during Recovery. */
2073 if (icsk->icsk_ca_state != TCP_CA_Recovery) 2003 if (icsk->icsk_ca_state != TCP_CA_Recovery)
2074 return; 2004 return 0;
2075 2005
2076 /* No forward retransmissions in Reno are possible. */ 2006 /* No forward retransmissions in Reno are possible. */
2077 if (tcp_is_reno(tp)) 2007 if (tcp_is_reno(tp))
2078 return; 2008 return 0;
2079 2009
2080 /* Yeah, we have to make difficult choice between forward transmission 2010 /* Yeah, we have to make difficult choice between forward transmission
2081 * and retransmission... Both ways have their merits... 2011 * and retransmission... Both ways have their merits...
@@ -2086,43 +2016,104 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2086 */ 2016 */
2087 2017
2088 if (tcp_may_send_now(sk)) 2018 if (tcp_may_send_now(sk))
2089 return; 2019 return 0;
2090 2020
2091 /* If nothing is SACKed, highest_sack in the loop won't be valid */ 2021 return 1;
2092 if (!tp->sacked_out) 2022}
2093 return;
2094 2023
2095 if (tp->forward_skb_hint) 2024/* This gets called after a retransmit timeout, and the initially
2096 skb = tp->forward_skb_hint; 2025 * retransmitted data is acknowledged. It tries to continue
2097 else 2026 * resending the rest of the retransmit queue, until either
2027 * we've sent it all or the congestion window limit is reached.
2028 * If doing SACK, the first ACK which comes back for a timeout
2029 * based retransmit packet might feed us FACK information again.
2030 * If so, we use it to avoid unnecessarily retransmissions.
2031 */
2032void tcp_xmit_retransmit_queue(struct sock *sk)
2033{
2034 const struct inet_connection_sock *icsk = inet_csk(sk);
2035 struct tcp_sock *tp = tcp_sk(sk);
2036 struct sk_buff *skb;
2037 struct sk_buff *hole = NULL;
2038 u32 last_lost;
2039 int mib_idx;
2040 int fwd_rexmitting = 0;
2041
2042 if (!tp->lost_out)
2043 tp->retransmit_high = tp->snd_una;
2044
2045 if (tp->retransmit_skb_hint) {
2046 skb = tp->retransmit_skb_hint;
2047 last_lost = TCP_SKB_CB(skb)->end_seq;
2048 if (after(last_lost, tp->retransmit_high))
2049 last_lost = tp->retransmit_high;
2050 } else {
2098 skb = tcp_write_queue_head(sk); 2051 skb = tcp_write_queue_head(sk);
2052 last_lost = tp->snd_una;
2053 }
2099 2054
2055 /* First pass: retransmit lost packets. */
2100 tcp_for_write_queue_from(skb, sk) { 2056 tcp_for_write_queue_from(skb, sk) {
2101 if (skb == tcp_send_head(sk)) 2057 __u8 sacked = TCP_SKB_CB(skb)->sacked;
2102 break;
2103 tp->forward_skb_hint = skb;
2104 2058
2105 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) 2059 if (skb == tcp_send_head(sk))
2106 break; 2060 break;
2061 /* we could do better than to assign each time */
2062 if (hole == NULL)
2063 tp->retransmit_skb_hint = skb;
2107 2064
2065 /* Assume this retransmit will generate
2066 * only one packet for congestion window
2067 * calculation purposes. This works because
2068 * tcp_retransmit_skb() will chop up the
2069 * packet to be MSS sized and all the
2070 * packet counting works out.
2071 */
2108 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) 2072 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
2109 break; 2073 return;
2074
2075 if (fwd_rexmitting) {
2076begin_fwd:
2077 if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
2078 break;
2079 mib_idx = LINUX_MIB_TCPFORWARDRETRANS;
2080
2081 } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) {
2082 tp->retransmit_high = last_lost;
2083 if (!tcp_can_forward_retransmit(sk))
2084 break;
2085 /* Backtrack if necessary to non-L'ed skb */
2086 if (hole != NULL) {
2087 skb = hole;
2088 hole = NULL;
2089 }
2090 fwd_rexmitting = 1;
2091 goto begin_fwd;
2110 2092
2111 if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) 2093 } else if (!(sacked & TCPCB_LOST)) {
2094 if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS))
2095 hole = skb;
2112 continue; 2096 continue;
2113 2097
2114 /* Ok, retransmit it. */ 2098 } else {
2115 if (tcp_retransmit_skb(sk, skb)) { 2099 last_lost = TCP_SKB_CB(skb)->end_seq;
2116 tp->forward_skb_hint = NULL; 2100 if (icsk->icsk_ca_state != TCP_CA_Loss)
2117 break; 2101 mib_idx = LINUX_MIB_TCPFASTRETRANS;
2102 else
2103 mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
2118 } 2104 }
2119 2105
2106 if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
2107 continue;
2108
2109 if (tcp_retransmit_skb(sk, skb))
2110 return;
2111 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2112
2120 if (skb == tcp_write_queue_head(sk)) 2113 if (skb == tcp_write_queue_head(sk))
2121 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2114 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2122 inet_csk(sk)->icsk_rto, 2115 inet_csk(sk)->icsk_rto,
2123 TCP_RTO_MAX); 2116 TCP_RTO_MAX);
2124
2125 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFORWARDRETRANS);
2126 } 2117 }
2127} 2118}
2128 2119
@@ -2241,6 +2232,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2241 struct sk_buff *skb; 2232 struct sk_buff *skb;
2242 struct tcp_md5sig_key *md5; 2233 struct tcp_md5sig_key *md5;
2243 __u8 *md5_hash_location; 2234 __u8 *md5_hash_location;
2235 int mss;
2244 2236
2245 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); 2237 skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
2246 if (skb == NULL) 2238 if (skb == NULL)
@@ -2251,13 +2243,17 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2251 2243
2252 skb->dst = dst_clone(dst); 2244 skb->dst = dst_clone(dst);
2253 2245
2246 mss = dst_metric(dst, RTAX_ADVMSS);
2247 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
2248 mss = tp->rx_opt.user_mss;
2249
2254 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ 2250 if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
2255 __u8 rcv_wscale; 2251 __u8 rcv_wscale;
2256 /* Set this up on the first call only */ 2252 /* Set this up on the first call only */
2257 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); 2253 req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
2258 /* tcp_full_space because it is guaranteed to be the first packet */ 2254 /* tcp_full_space because it is guaranteed to be the first packet */
2259 tcp_select_initial_window(tcp_full_space(sk), 2255 tcp_select_initial_window(tcp_full_space(sk),
2260 dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), 2256 mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
2261 &req->rcv_wnd, 2257 &req->rcv_wnd,
2262 &req->window_clamp, 2258 &req->window_clamp,
2263 ireq->wscale_ok, 2259 ireq->wscale_ok,
@@ -2267,8 +2263,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2267 2263
2268 memset(&opts, 0, sizeof(opts)); 2264 memset(&opts, 0, sizeof(opts));
2269 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2265 TCP_SKB_CB(skb)->when = tcp_time_stamp;
2270 tcp_header_size = tcp_synack_options(sk, req, 2266 tcp_header_size = tcp_synack_options(sk, req, mss,
2271 dst_metric(dst, RTAX_ADVMSS),
2272 skb, &opts, &md5) + 2267 skb, &opts, &md5) +
2273 sizeof(struct tcphdr); 2268 sizeof(struct tcphdr);
2274 2269
@@ -2342,6 +2337,9 @@ static void tcp_connect_init(struct sock *sk)
2342 if (!tp->window_clamp) 2337 if (!tp->window_clamp)
2343 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 2338 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
2344 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 2339 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
2340 if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
2341 tp->advmss = tp->rx_opt.user_mss;
2342
2345 tcp_initialize_rcv_mss(sk); 2343 tcp_initialize_rcv_mss(sk);
2346 2344
2347 tcp_select_initial_window(tcp_full_space(sk), 2345 tcp_select_initial_window(tcp_full_space(sk),