diff options
Diffstat (limited to 'net/netfilter/ipvs/ip_vs_core.c')
-rw-r--r-- | net/netfilter/ipvs/ip_vs_core.c | 456 |
1 files changed, 296 insertions, 160 deletions
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index b4e51e9c5a0..07accf6b240 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include <net/icmp.h> /* for icmp_send */ | 41 | #include <net/icmp.h> /* for icmp_send */ |
42 | #include <net/route.h> | 42 | #include <net/route.h> |
43 | #include <net/ip6_checksum.h> | 43 | #include <net/ip6_checksum.h> |
44 | #include <net/netns/generic.h> /* net_generic() */ | ||
44 | 45 | ||
45 | #include <linux/netfilter.h> | 46 | #include <linux/netfilter.h> |
46 | #include <linux/netfilter_ipv4.h> | 47 | #include <linux/netfilter_ipv4.h> |
@@ -68,6 +69,12 @@ EXPORT_SYMBOL(ip_vs_conn_put); | |||
68 | EXPORT_SYMBOL(ip_vs_get_debug_level); | 69 | EXPORT_SYMBOL(ip_vs_get_debug_level); |
69 | #endif | 70 | #endif |
70 | 71 | ||
72 | int ip_vs_net_id __read_mostly; | ||
73 | #ifdef IP_VS_GENERIC_NETNS | ||
74 | EXPORT_SYMBOL(ip_vs_net_id); | ||
75 | #endif | ||
76 | /* netns cnt used for uniqueness */ | ||
77 | static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); | ||
71 | 78 | ||
72 | /* ID used in ICMP lookups */ | 79 | /* ID used in ICMP lookups */ |
73 | #define icmp_id(icmph) (((icmph)->un).echo.id) | 80 | #define icmp_id(icmph) (((icmph)->un).echo.id) |
@@ -108,21 +115,28 @@ static inline void | |||
108 | ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | 115 | ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) |
109 | { | 116 | { |
110 | struct ip_vs_dest *dest = cp->dest; | 117 | struct ip_vs_dest *dest = cp->dest; |
118 | struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); | ||
119 | |||
111 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | 120 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { |
112 | spin_lock(&dest->stats.lock); | 121 | struct ip_vs_cpu_stats *s; |
113 | dest->stats.ustats.inpkts++; | 122 | |
114 | dest->stats.ustats.inbytes += skb->len; | 123 | s = this_cpu_ptr(dest->stats.cpustats); |
115 | spin_unlock(&dest->stats.lock); | 124 | s->ustats.inpkts++; |
116 | 125 | u64_stats_update_begin(&s->syncp); | |
117 | spin_lock(&dest->svc->stats.lock); | 126 | s->ustats.inbytes += skb->len; |
118 | dest->svc->stats.ustats.inpkts++; | 127 | u64_stats_update_end(&s->syncp); |
119 | dest->svc->stats.ustats.inbytes += skb->len; | 128 | |
120 | spin_unlock(&dest->svc->stats.lock); | 129 | s = this_cpu_ptr(dest->svc->stats.cpustats); |
121 | 130 | s->ustats.inpkts++; | |
122 | spin_lock(&ip_vs_stats.lock); | 131 | u64_stats_update_begin(&s->syncp); |
123 | ip_vs_stats.ustats.inpkts++; | 132 | s->ustats.inbytes += skb->len; |
124 | ip_vs_stats.ustats.inbytes += skb->len; | 133 | u64_stats_update_end(&s->syncp); |
125 | spin_unlock(&ip_vs_stats.lock); | 134 | |
135 | s = this_cpu_ptr(ipvs->tot_stats.cpustats); | ||
136 | s->ustats.inpkts++; | ||
137 | u64_stats_update_begin(&s->syncp); | ||
138 | s->ustats.inbytes += skb->len; | ||
139 | u64_stats_update_end(&s->syncp); | ||
126 | } | 140 | } |
127 | } | 141 | } |
128 | 142 | ||
@@ -131,21 +145,28 @@ static inline void | |||
131 | ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | 145 | ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) |
132 | { | 146 | { |
133 | struct ip_vs_dest *dest = cp->dest; | 147 | struct ip_vs_dest *dest = cp->dest; |
148 | struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); | ||
149 | |||
134 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | 150 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { |
135 | spin_lock(&dest->stats.lock); | 151 | struct ip_vs_cpu_stats *s; |
136 | dest->stats.ustats.outpkts++; | 152 | |
137 | dest->stats.ustats.outbytes += skb->len; | 153 | s = this_cpu_ptr(dest->stats.cpustats); |
138 | spin_unlock(&dest->stats.lock); | 154 | s->ustats.outpkts++; |
139 | 155 | u64_stats_update_begin(&s->syncp); | |
140 | spin_lock(&dest->svc->stats.lock); | 156 | s->ustats.outbytes += skb->len; |
141 | dest->svc->stats.ustats.outpkts++; | 157 | u64_stats_update_end(&s->syncp); |
142 | dest->svc->stats.ustats.outbytes += skb->len; | 158 | |
143 | spin_unlock(&dest->svc->stats.lock); | 159 | s = this_cpu_ptr(dest->svc->stats.cpustats); |
144 | 160 | s->ustats.outpkts++; | |
145 | spin_lock(&ip_vs_stats.lock); | 161 | u64_stats_update_begin(&s->syncp); |
146 | ip_vs_stats.ustats.outpkts++; | 162 | s->ustats.outbytes += skb->len; |
147 | ip_vs_stats.ustats.outbytes += skb->len; | 163 | u64_stats_update_end(&s->syncp); |
148 | spin_unlock(&ip_vs_stats.lock); | 164 | |
165 | s = this_cpu_ptr(ipvs->tot_stats.cpustats); | ||
166 | s->ustats.outpkts++; | ||
167 | u64_stats_update_begin(&s->syncp); | ||
168 | s->ustats.outbytes += skb->len; | ||
169 | u64_stats_update_end(&s->syncp); | ||
149 | } | 170 | } |
150 | } | 171 | } |
151 | 172 | ||
@@ -153,41 +174,44 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) | |||
153 | static inline void | 174 | static inline void |
154 | ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) | 175 | ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) |
155 | { | 176 | { |
156 | spin_lock(&cp->dest->stats.lock); | 177 | struct netns_ipvs *ipvs = net_ipvs(svc->net); |
157 | cp->dest->stats.ustats.conns++; | 178 | struct ip_vs_cpu_stats *s; |
158 | spin_unlock(&cp->dest->stats.lock); | 179 | |
180 | s = this_cpu_ptr(cp->dest->stats.cpustats); | ||
181 | s->ustats.conns++; | ||
159 | 182 | ||
160 | spin_lock(&svc->stats.lock); | 183 | s = this_cpu_ptr(svc->stats.cpustats); |
161 | svc->stats.ustats.conns++; | 184 | s->ustats.conns++; |
162 | spin_unlock(&svc->stats.lock); | ||
163 | 185 | ||
164 | spin_lock(&ip_vs_stats.lock); | 186 | s = this_cpu_ptr(ipvs->tot_stats.cpustats); |
165 | ip_vs_stats.ustats.conns++; | 187 | s->ustats.conns++; |
166 | spin_unlock(&ip_vs_stats.lock); | ||
167 | } | 188 | } |
168 | 189 | ||
169 | 190 | ||
170 | static inline int | 191 | static inline int |
171 | ip_vs_set_state(struct ip_vs_conn *cp, int direction, | 192 | ip_vs_set_state(struct ip_vs_conn *cp, int direction, |
172 | const struct sk_buff *skb, | 193 | const struct sk_buff *skb, |
173 | struct ip_vs_protocol *pp) | 194 | struct ip_vs_proto_data *pd) |
174 | { | 195 | { |
175 | if (unlikely(!pp->state_transition)) | 196 | if (unlikely(!pd->pp->state_transition)) |
176 | return 0; | 197 | return 0; |
177 | return pp->state_transition(cp, direction, skb, pp); | 198 | return pd->pp->state_transition(cp, direction, skb, pd); |
178 | } | 199 | } |
179 | 200 | ||
180 | static inline void | 201 | static inline int |
181 | ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, | 202 | ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, |
182 | struct sk_buff *skb, int protocol, | 203 | struct sk_buff *skb, int protocol, |
183 | const union nf_inet_addr *caddr, __be16 cport, | 204 | const union nf_inet_addr *caddr, __be16 cport, |
184 | const union nf_inet_addr *vaddr, __be16 vport, | 205 | const union nf_inet_addr *vaddr, __be16 vport, |
185 | struct ip_vs_conn_param *p) | 206 | struct ip_vs_conn_param *p) |
186 | { | 207 | { |
187 | ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p); | 208 | ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, |
209 | vport, p); | ||
188 | p->pe = svc->pe; | 210 | p->pe = svc->pe; |
189 | if (p->pe && p->pe->fill_param) | 211 | if (p->pe && p->pe->fill_param) |
190 | p->pe->fill_param(p, skb); | 212 | return p->pe->fill_param(p, skb); |
213 | |||
214 | return 0; | ||
191 | } | 215 | } |
192 | 216 | ||
193 | /* | 217 | /* |
@@ -200,7 +224,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, | |||
200 | static struct ip_vs_conn * | 224 | static struct ip_vs_conn * |
201 | ip_vs_sched_persist(struct ip_vs_service *svc, | 225 | ip_vs_sched_persist(struct ip_vs_service *svc, |
202 | struct sk_buff *skb, | 226 | struct sk_buff *skb, |
203 | __be16 ports[2]) | 227 | __be16 src_port, __be16 dst_port, int *ignored) |
204 | { | 228 | { |
205 | struct ip_vs_conn *cp = NULL; | 229 | struct ip_vs_conn *cp = NULL; |
206 | struct ip_vs_iphdr iph; | 230 | struct ip_vs_iphdr iph; |
@@ -224,8 +248,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, | |||
224 | 248 | ||
225 | IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " | 249 | IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " |
226 | "mnet %s\n", | 250 | "mnet %s\n", |
227 | IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), | 251 | IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port), |
228 | IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), | 252 | IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port), |
229 | IP_VS_DBG_ADDR(svc->af, &snet)); | 253 | IP_VS_DBG_ADDR(svc->af, &snet)); |
230 | 254 | ||
231 | /* | 255 | /* |
@@ -247,14 +271,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc, | |||
247 | const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; | 271 | const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; |
248 | __be16 vport = 0; | 272 | __be16 vport = 0; |
249 | 273 | ||
250 | if (ports[1] == svc->port) { | 274 | if (dst_port == svc->port) { |
251 | /* non-FTP template: | 275 | /* non-FTP template: |
252 | * <protocol, caddr, 0, vaddr, vport, daddr, dport> | 276 | * <protocol, caddr, 0, vaddr, vport, daddr, dport> |
253 | * FTP template: | 277 | * FTP template: |
254 | * <protocol, caddr, 0, vaddr, 0, daddr, 0> | 278 | * <protocol, caddr, 0, vaddr, 0, daddr, 0> |
255 | */ | 279 | */ |
256 | if (svc->port != FTPPORT) | 280 | if (svc->port != FTPPORT) |
257 | vport = ports[1]; | 281 | vport = dst_port; |
258 | } else { | 282 | } else { |
259 | /* Note: persistent fwmark-based services and | 283 | /* Note: persistent fwmark-based services and |
260 | * persistent port zero service are handled here. | 284 | * persistent port zero service are handled here. |
@@ -268,24 +292,31 @@ ip_vs_sched_persist(struct ip_vs_service *svc, | |||
268 | vaddr = &fwmark; | 292 | vaddr = &fwmark; |
269 | } | 293 | } |
270 | } | 294 | } |
271 | ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, | 295 | /* return *ignored = -1 so NF_DROP can be used */ |
272 | vaddr, vport, ¶m); | 296 | if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, |
272 | vaddr, vport, &param); | 296 | if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, |
297 | vaddr, vport, &param) < 0) { | ||
299 | return NULL; | ||
300 | } | ||
273 | } | 301 | } |
274 | 302 | ||
275 | /* Check if a template already exists */ | 303 | /* Check if a template already exists */ |
276 | ct = ip_vs_ct_in_get(&param); | 304 | ct = ip_vs_ct_in_get(&param); |
277 | if (!ct || !ip_vs_check_template(ct)) { | 305 | if (!ct || !ip_vs_check_template(ct)) { |
278 | /* No template found or the dest of the connection | 306 | /* |
307 | * No template found or the dest of the connection | ||
279 | * template is not available. | 308 | * template is not available. |
309 | * return *ignored=0 i.e. ICMP and NF_DROP | ||
280 | */ | 310 | */ |
281 | dest = svc->scheduler->schedule(svc, skb); | 311 | dest = svc->scheduler->schedule(svc, skb); |
282 | if (!dest) { | 312 | if (!dest) { |
283 | IP_VS_DBG(1, "p-schedule: no dest found.\n"); | 313 | IP_VS_DBG(1, "p-schedule: no dest found.\n"); |
284 | kfree(param.pe_data); | 314 | kfree(param.pe_data); |
315 | *ignored = 0; | ||
285 | return NULL; | 316 | return NULL; |
286 | } | 317 | } |
287 | 318 | ||
288 | if (ports[1] == svc->port && svc->port != FTPPORT) | 319 | if (dst_port == svc->port && svc->port != FTPPORT) |
289 | dport = dest->port; | 320 | dport = dest->port; |
290 | 321 | ||
291 | /* Create a template | 322 | /* Create a template |
@@ -293,9 +324,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc, | |||
293 | * and thus param.pe_data will be destroyed | 324 | * and thus param.pe_data will be destroyed |
294 | * when the template expires */ | 325 | * when the template expires */ |
295 | ct = ip_vs_conn_new(&param, &dest->addr, dport, | 326 | ct = ip_vs_conn_new(&param, &dest->addr, dport, |
296 | IP_VS_CONN_F_TEMPLATE, dest); | 327 | IP_VS_CONN_F_TEMPLATE, dest, skb->mark); |
297 | if (ct == NULL) { | 328 | if (ct == NULL) { |
298 | kfree(param.pe_data); | 329 | kfree(param.pe_data); |
330 | *ignored = -1; | ||
299 | return NULL; | 331 | return NULL; |
300 | } | 332 | } |
301 | 333 | ||
@@ -306,7 +338,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, | |||
306 | kfree(param.pe_data); | 338 | kfree(param.pe_data); |
307 | } | 339 | } |
308 | 340 | ||
309 | dport = ports[1]; | 341 | dport = dst_port; |
310 | if (dport == svc->port && dest->port) | 342 | if (dport == svc->port && dest->port) |
311 | dport = dest->port; | 343 | dport = dest->port; |
312 | 344 | ||
@@ -317,11 +349,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc, | |||
317 | /* | 349 | /* |
318 | * Create a new connection according to the template | 350 | * Create a new connection according to the template |
319 | */ | 351 | */ |
320 | ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0], | 352 | ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr, |
321 | &iph.daddr, ports[1], &param); | 353 | src_port, &iph.daddr, dst_port, &param); |
322 | cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest); | 354 | |
355 | cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark); | ||
323 | if (cp == NULL) { | 356 | if (cp == NULL) { |
324 | ip_vs_conn_put(ct); | 357 | ip_vs_conn_put(ct); |
358 | *ignored = -1; | ||
325 | return NULL; | 359 | return NULL; |
326 | } | 360 | } |
327 | 361 | ||
@@ -341,11 +375,27 @@ ip_vs_sched_persist(struct ip_vs_service *svc, | |||
341 | * It selects a server according to the virtual service, and | 375 | * It selects a server according to the virtual service, and |
342 | * creates a connection entry. | 376 | * creates a connection entry. |
343 | * Protocols supported: TCP, UDP | 377 | * Protocols supported: TCP, UDP |
378 | * | ||
379 | * Usage of *ignored | ||
380 | * | ||
381 | * 1 : protocol tried to schedule (eg. on SYN), found svc but the | ||
382 | * svc/scheduler decides that this packet should be accepted with | ||
383 | * NF_ACCEPT because it must not be scheduled. | ||
384 | * | ||
385 | * 0 : scheduler can not find destination, so try bypass or | ||
386 | * return ICMP and then NF_DROP (ip_vs_leave). | ||
387 | * | ||
388 | * -1 : scheduler tried to schedule but fatal error occurred, eg. | ||
389 | * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param | ||
390 | * failure such as missing Call-ID, ENOMEM on skb_linearize | ||
391 | * or pe_data. In this case we should return NF_DROP without | ||
392 | * any attempts to send ICMP with ip_vs_leave. | ||
344 | */ | 393 | */ |
345 | struct ip_vs_conn * | 394 | struct ip_vs_conn * |
346 | ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, | 395 | ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, |
347 | struct ip_vs_protocol *pp, int *ignored) | 396 | struct ip_vs_proto_data *pd, int *ignored) |
348 | { | 397 | { |
398 | struct ip_vs_protocol *pp = pd->pp; | ||
349 | struct ip_vs_conn *cp = NULL; | 399 | struct ip_vs_conn *cp = NULL; |
350 | struct ip_vs_iphdr iph; | 400 | struct ip_vs_iphdr iph; |
351 | struct ip_vs_dest *dest; | 401 | struct ip_vs_dest *dest; |
@@ -371,12 +421,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, | |||
371 | } | 421 | } |
372 | 422 | ||
373 | /* | 423 | /* |
374 | * Do not schedule replies from local real server. It is risky | 424 | * Do not schedule replies from local real server. |
375 | * for fwmark services but mostly for persistent services. | ||
376 | */ | 425 | */ |
377 | if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && | 426 | if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && |
378 | (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) && | 427 | (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) { |
379 | (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) { | ||
380 | IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, | 428 | IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, |
381 | "Not scheduling reply for existing connection"); | 429 | "Not scheduling reply for existing connection"); |
382 | __ip_vs_conn_put(cp); | 430 | __ip_vs_conn_put(cp); |
@@ -386,10 +434,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, | |||
386 | /* | 434 | /* |
387 | * Persistent service | 435 | * Persistent service |
388 | */ | 436 | */ |
389 | if (svc->flags & IP_VS_SVC_F_PERSISTENT) { | 437 | if (svc->flags & IP_VS_SVC_F_PERSISTENT) |
390 | *ignored = 0; | 438 | return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored); |
391 | return ip_vs_sched_persist(svc, skb, pptr); | 439 | |
392 | } | 440 | *ignored = 0; |
393 | 441 | ||
394 | /* | 442 | /* |
395 | * Non-persistent service | 443 | * Non-persistent service |
@@ -402,8 +450,6 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, | |||
402 | return NULL; | 450 | return NULL; |
403 | } | 451 | } |
404 | 452 | ||
405 | *ignored = 0; | ||
406 | |||
407 | dest = svc->scheduler->schedule(svc, skb); | 453 | dest = svc->scheduler->schedule(svc, skb); |
408 | if (dest == NULL) { | 454 | if (dest == NULL) { |
409 | IP_VS_DBG(1, "Schedule: no dest found.\n"); | 455 | IP_VS_DBG(1, "Schedule: no dest found.\n"); |
@@ -419,13 +465,17 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, | |||
419 | */ | 465 | */ |
420 | { | 466 | { |
421 | struct ip_vs_conn_param p; | 467 | struct ip_vs_conn_param p; |
422 | ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, | 468 | |
423 | pptr[0], &iph.daddr, pptr[1], &p); | 469 | ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, |
470 | &iph.saddr, pptr[0], &iph.daddr, pptr[1], | ||
471 | &p); | ||
424 | cp = ip_vs_conn_new(&p, &dest->addr, | 472 | cp = ip_vs_conn_new(&p, &dest->addr, |
425 | dest->port ? dest->port : pptr[1], | 473 | dest->port ? dest->port : pptr[1], |
426 | flags, dest); | 474 | flags, dest, skb->mark); |
427 | if (!cp) | 475 | if (!cp) { |
476 | *ignored = -1; | ||
428 | return NULL; | 477 | return NULL; |
478 | } | ||
429 | } | 479 | } |
430 | 480 | ||
431 | IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " | 481 | IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " |
@@ -447,11 +497,16 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, | |||
447 | * no destination is available for a new connection. | 497 | * no destination is available for a new connection. |
448 | */ | 498 | */ |
449 | int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, | 499 | int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, |
450 | struct ip_vs_protocol *pp) | 500 | struct ip_vs_proto_data *pd) |
451 | { | 501 | { |
452 | __be16 _ports[2], *pptr; | 502 | __be16 _ports[2], *pptr; |
453 | struct ip_vs_iphdr iph; | 503 | struct ip_vs_iphdr iph; |
504 | #ifdef CONFIG_SYSCTL | ||
505 | struct net *net; | ||
506 | struct netns_ipvs *ipvs; | ||
454 | int unicast; | 507 | int unicast; |
508 | #endif | ||
509 | |||
455 | ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); | 510 | ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); |
456 | 511 | ||
457 | pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); | 512 | pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); |
@@ -460,17 +515,21 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, | |||
460 | return NF_DROP; | 515 | return NF_DROP; |
461 | } | 516 | } |
462 | 517 | ||
518 | #ifdef CONFIG_SYSCTL | ||
519 | net = skb_net(skb); | ||
520 | |||
463 | #ifdef CONFIG_IP_VS_IPV6 | 521 | #ifdef CONFIG_IP_VS_IPV6 |
464 | if (svc->af == AF_INET6) | 522 | if (svc->af == AF_INET6) |
465 | unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; | 523 | unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; |
466 | else | 524 | else |
467 | #endif | 525 | #endif |
468 | unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); | 526 | unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST); |
469 | 527 | ||
470 | /* if it is fwmark-based service, the cache_bypass sysctl is up | 528 | /* if it is fwmark-based service, the cache_bypass sysctl is up |
471 | and the destination is a non-local unicast, then create | 529 | and the destination is a non-local unicast, then create |
472 | a cache_bypass connection entry */ | 530 | a cache_bypass connection entry */ |
473 | if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { | 531 | ipvs = net_ipvs(net); |
532 | if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { | ||
474 | int ret, cs; | 533 | int ret, cs; |
475 | struct ip_vs_conn *cp; | 534 | struct ip_vs_conn *cp; |
476 | unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && | 535 | unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && |
@@ -484,12 +543,12 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, | |||
484 | IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); | 543 | IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); |
485 | { | 544 | { |
486 | struct ip_vs_conn_param p; | 545 | struct ip_vs_conn_param p; |
487 | ip_vs_conn_fill_param(svc->af, iph.protocol, | 546 | ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, |
488 | &iph.saddr, pptr[0], | 547 | &iph.saddr, pptr[0], |
489 | &iph.daddr, pptr[1], &p); | 548 | &iph.daddr, pptr[1], &p); |
490 | cp = ip_vs_conn_new(&p, &daddr, 0, | 549 | cp = ip_vs_conn_new(&p, &daddr, 0, |
491 | IP_VS_CONN_F_BYPASS | flags, | 550 | IP_VS_CONN_F_BYPASS | flags, |
492 | NULL); | 551 | NULL, skb->mark); |
493 | if (!cp) | 552 | if (!cp) |
494 | return NF_DROP; | 553 | return NF_DROP; |
495 | } | 554 | } |
@@ -498,16 +557,17 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, | |||
498 | ip_vs_in_stats(cp, skb); | 557 | ip_vs_in_stats(cp, skb); |
499 | 558 | ||
500 | /* set state */ | 559 | /* set state */ |
501 | cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); | 560 | cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); |
502 | 561 | ||
503 | /* transmit the first SYN packet */ | 562 | /* transmit the first SYN packet */ |
504 | ret = cp->packet_xmit(skb, cp, pp); | 563 | ret = cp->packet_xmit(skb, cp, pd->pp); |
505 | /* do not touch skb anymore */ | 564 | /* do not touch skb anymore */ |
506 | 565 | ||
507 | atomic_inc(&cp->in_pkts); | 566 | atomic_inc(&cp->in_pkts); |
508 | ip_vs_conn_put(cp); | 567 | ip_vs_conn_put(cp); |
509 | return ret; | 568 | return ret; |
510 | } | 569 | } |
570 | #endif | ||
511 | 571 | ||
512 | /* | 572 | /* |
513 | * When the virtual ftp service is presented, packets destined | 573 | * When the virtual ftp service is presented, packets destined |
@@ -544,6 +604,33 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, | |||
544 | return NF_DROP; | 604 | return NF_DROP; |
545 | } | 605 | } |
546 | 606 | ||
607 | #ifdef CONFIG_SYSCTL | ||
608 | |||
609 | static int sysctl_snat_reroute(struct sk_buff *skb) | ||
610 | { | ||
611 | struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); | ||
612 | return ipvs->sysctl_snat_reroute; | ||
613 | } | ||
614 | |||
615 | static int sysctl_nat_icmp_send(struct net *net) | ||
616 | { | ||
617 | struct netns_ipvs *ipvs = net_ipvs(net); | ||
618 | return ipvs->sysctl_nat_icmp_send; | ||
619 | } | ||
620 | |||
621 | static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) | ||
622 | { | ||
623 | return ipvs->sysctl_expire_nodest_conn; | ||
624 | } | ||
625 | |||
626 | #else | ||
627 | |||
628 | static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } | ||
629 | static int sysctl_nat_icmp_send(struct net *net) { return 0; } | ||
630 | static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } | ||
631 | |||
632 | #endif | ||
633 | |||
547 | __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) | 634 | __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) |
548 | { | 635 | { |
549 | return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); | 636 | return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); |
@@ -576,6 +663,22 @@ static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) | |||
576 | } | 663 | } |
577 | #endif | 664 | #endif |
578 | 665 | ||
666 | static int ip_vs_route_me_harder(int af, struct sk_buff *skb) | ||
667 | { | ||
668 | #ifdef CONFIG_IP_VS_IPV6 | ||
669 | if (af == AF_INET6) { | ||
670 | if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0) | ||
671 | return 1; | ||
672 | } else | ||
673 | #endif | ||
674 | if ((sysctl_snat_reroute(skb) || | ||
675 | skb_rtable(skb)->rt_flags & RTCF_LOCAL) && | ||
676 | ip_route_me_harder(skb, RTN_LOCAL) != 0) | ||
677 | return 1; | ||
678 | |||
679 | return 0; | ||
680 | } | ||
681 | |||
579 | /* | 682 | /* |
580 | * Packet has been made sufficiently writable in caller | 683 | * Packet has been made sufficiently writable in caller |
581 | * - inout: 1=in->out, 0=out->in | 684 | * - inout: 1=in->out, 0=out->in |
@@ -674,7 +777,7 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, | |||
674 | #endif | 777 | #endif |
675 | 778 | ||
676 | /* Handle relevant response ICMP messages - forward to the right | 779 | /* Handle relevant response ICMP messages - forward to the right |
677 | * destination host. Used for NAT and local client. | 780 | * destination host. |
678 | */ | 781 | */ |
679 | static int handle_response_icmp(int af, struct sk_buff *skb, | 782 | static int handle_response_icmp(int af, struct sk_buff *skb, |
680 | union nf_inet_addr *snet, | 783 | union nf_inet_addr *snet, |
@@ -710,16 +813,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, | |||
710 | #endif | 813 | #endif |
711 | ip_vs_nat_icmp(skb, pp, cp, 1); | 814 | ip_vs_nat_icmp(skb, pp, cp, 1); |
712 | 815 | ||
713 | #ifdef CONFIG_IP_VS_IPV6 | 816 | if (ip_vs_route_me_harder(af, skb)) |
714 | if (af == AF_INET6) { | 817 | goto out; |
715 | if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) | ||
716 | goto out; | ||
717 | } else | ||
718 | #endif | ||
719 | if ((sysctl_ip_vs_snat_reroute || | ||
720 | skb_rtable(skb)->rt_flags & RTCF_LOCAL) && | ||
721 | ip_route_me_harder(skb, RTN_LOCAL) != 0) | ||
722 | goto out; | ||
723 | 818 | ||
724 | /* do the statistics and put it back */ | 819 | /* do the statistics and put it back */ |
725 | ip_vs_out_stats(cp, skb); | 820 | ip_vs_out_stats(cp, skb); |
@@ -808,7 +903,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, | |||
808 | 903 | ||
809 | ip_vs_fill_iphdr(AF_INET, cih, &ciph); | 904 | ip_vs_fill_iphdr(AF_INET, cih, &ciph); |
810 | /* The embedded headers contain source and dest in reverse order */ | 905 | /* The embedded headers contain source and dest in reverse order */ |
811 | cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); | 906 | cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1); |
812 | if (!cp) | 907 | if (!cp) |
813 | return NF_ACCEPT; | 908 | return NF_ACCEPT; |
814 | 909 | ||
@@ -885,7 +980,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, | |||
885 | 980 | ||
886 | ip_vs_fill_iphdr(AF_INET6, cih, &ciph); | 981 | ip_vs_fill_iphdr(AF_INET6, cih, &ciph); |
887 | /* The embedded headers contain source and dest in reverse order */ | 982 | /* The embedded headers contain source and dest in reverse order */ |
888 | cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); | 983 | cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1); |
889 | if (!cp) | 984 | if (!cp) |
890 | return NF_ACCEPT; | 985 | return NF_ACCEPT; |
891 | 986 | ||
@@ -921,12 +1016,13 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) | |||
921 | } | 1016 | } |
922 | 1017 | ||
923 | /* Handle response packets: rewrite addresses and send away... | 1018 | /* Handle response packets: rewrite addresses and send away... |
924 | * Used for NAT and local client. | ||
925 | */ | 1019 | */ |
926 | static unsigned int | 1020 | static unsigned int |
927 | handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, | 1021 | handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, |
928 | struct ip_vs_conn *cp, int ihl) | 1022 | struct ip_vs_conn *cp, int ihl) |
929 | { | 1023 | { |
1024 | struct ip_vs_protocol *pp = pd->pp; | ||
1025 | |||
930 | IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); | 1026 | IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); |
931 | 1027 | ||
932 | if (!skb_make_writable(skb, ihl)) | 1028 | if (!skb_make_writable(skb, ihl)) |
@@ -961,21 +1057,13 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, | |||
961 | * if it came from this machine itself. So re-compute | 1057 | * if it came from this machine itself. So re-compute |
962 | * the routing information. | 1058 | * the routing information. |
963 | */ | 1059 | */ |
964 | #ifdef CONFIG_IP_VS_IPV6 | 1060 | if (ip_vs_route_me_harder(af, skb)) |
965 | if (af == AF_INET6) { | 1061 | goto drop; |
966 | if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) | ||
967 | goto drop; | ||
968 | } else | ||
969 | #endif | ||
970 | if ((sysctl_ip_vs_snat_reroute || | ||
971 | skb_rtable(skb)->rt_flags & RTCF_LOCAL) && | ||
972 | ip_route_me_harder(skb, RTN_LOCAL) != 0) | ||
973 | goto drop; | ||
974 | 1062 | ||
975 | IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); | 1063 | IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); |
976 | 1064 | ||
977 | ip_vs_out_stats(cp, skb); | 1065 | ip_vs_out_stats(cp, skb); |
978 | ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); | 1066 | ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); |
979 | skb->ipvs_property = 1; | 1067 | skb->ipvs_property = 1; |
980 | if (!(cp->flags & IP_VS_CONN_F_NFCT)) | 1068 | if (!(cp->flags & IP_VS_CONN_F_NFCT)) |
981 | ip_vs_notrack(skb); | 1069 | ip_vs_notrack(skb); |
@@ -999,8 +1087,10 @@ drop: | |||
999 | static unsigned int | 1087 | static unsigned int |
1000 | ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) | 1088 | ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) |
1001 | { | 1089 | { |
1090 | struct net *net = NULL; | ||
1002 | struct ip_vs_iphdr iph; | 1091 | struct ip_vs_iphdr iph; |
1003 | struct ip_vs_protocol *pp; | 1092 | struct ip_vs_protocol *pp; |
1093 | struct ip_vs_proto_data *pd; | ||
1004 | struct ip_vs_conn *cp; | 1094 | struct ip_vs_conn *cp; |
1005 | 1095 | ||
1006 | EnterFunction(11); | 1096 | EnterFunction(11); |
@@ -1022,6 +1112,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) | |||
1022 | if (unlikely(!skb_dst(skb))) | 1112 | if (unlikely(!skb_dst(skb))) |
1023 | return NF_ACCEPT; | 1113 | return NF_ACCEPT; |
1024 | 1114 | ||
1115 | net = skb_net(skb); | ||
1025 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); | 1116 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); |
1026 | #ifdef CONFIG_IP_VS_IPV6 | 1117 | #ifdef CONFIG_IP_VS_IPV6 |
1027 | if (af == AF_INET6) { | 1118 | if (af == AF_INET6) { |
@@ -1045,9 +1136,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) | |||
1045 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); | 1136 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); |
1046 | } | 1137 | } |
1047 | 1138 | ||
1048 | pp = ip_vs_proto_get(iph.protocol); | 1139 | pd = ip_vs_proto_data_get(net, iph.protocol); |
1049 | if (unlikely(!pp)) | 1140 | if (unlikely(!pd)) |
1050 | return NF_ACCEPT; | 1141 | return NF_ACCEPT; |
1142 | pp = pd->pp; | ||
1051 | 1143 | ||
1052 | /* reassemble IP fragments */ | 1144 | /* reassemble IP fragments */ |
1053 | #ifdef CONFIG_IP_VS_IPV6 | 1145 | #ifdef CONFIG_IP_VS_IPV6 |
@@ -1073,11 +1165,11 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) | |||
1073 | /* | 1165 | /* |
1074 | * Check if the packet belongs to an existing entry | 1166 | * Check if the packet belongs to an existing entry |
1075 | */ | 1167 | */ |
1076 | cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); | 1168 | cp = pp->conn_out_get(af, skb, &iph, iph.len, 0); |
1077 | 1169 | ||
1078 | if (likely(cp)) | 1170 | if (likely(cp)) |
1079 | return handle_response(af, skb, pp, cp, iph.len); | 1171 | return handle_response(af, skb, pd, cp, iph.len); |
1080 | if (sysctl_ip_vs_nat_icmp_send && | 1172 | if (sysctl_nat_icmp_send(net) && |
1081 | (pp->protocol == IPPROTO_TCP || | 1173 | (pp->protocol == IPPROTO_TCP || |
1082 | pp->protocol == IPPROTO_UDP || | 1174 | pp->protocol == IPPROTO_UDP || |
1083 | pp->protocol == IPPROTO_SCTP)) { | 1175 | pp->protocol == IPPROTO_SCTP)) { |
@@ -1087,7 +1179,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) | |||
1087 | sizeof(_ports), _ports); | 1179 | sizeof(_ports), _ports); |
1088 | if (pptr == NULL) | 1180 | if (pptr == NULL) |
1089 | return NF_ACCEPT; /* Not for me */ | 1181 | return NF_ACCEPT; /* Not for me */ |
1090 | if (ip_vs_lookup_real_service(af, iph.protocol, | 1182 | if (ip_vs_lookup_real_service(net, af, iph.protocol, |
1091 | &iph.saddr, | 1183 | &iph.saddr, |
1092 | pptr[0])) { | 1184 | pptr[0])) { |
1093 | /* | 1185 | /* |
@@ -1202,14 +1294,15 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, | |||
1202 | static int | 1294 | static int |
1203 | ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) | 1295 | ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) |
1204 | { | 1296 | { |
1297 | struct net *net = NULL; | ||
1205 | struct iphdr *iph; | 1298 | struct iphdr *iph; |
1206 | struct icmphdr _icmph, *ic; | 1299 | struct icmphdr _icmph, *ic; |
1207 | struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ | 1300 | struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ |
1208 | struct ip_vs_iphdr ciph; | 1301 | struct ip_vs_iphdr ciph; |
1209 | struct ip_vs_conn *cp; | 1302 | struct ip_vs_conn *cp; |
1210 | struct ip_vs_protocol *pp; | 1303 | struct ip_vs_protocol *pp; |
1304 | struct ip_vs_proto_data *pd; | ||
1211 | unsigned int offset, ihl, verdict; | 1305 | unsigned int offset, ihl, verdict; |
1212 | union nf_inet_addr snet; | ||
1213 | 1306 | ||
1214 | *related = 1; | 1307 | *related = 1; |
1215 | 1308 | ||
@@ -1249,9 +1342,11 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) | |||
1249 | if (cih == NULL) | 1342 | if (cih == NULL) |
1250 | return NF_ACCEPT; /* The packet looks wrong, ignore */ | 1343 | return NF_ACCEPT; /* The packet looks wrong, ignore */ |
1251 | 1344 | ||
1252 | pp = ip_vs_proto_get(cih->protocol); | 1345 | net = skb_net(skb); |
1253 | if (!pp) | 1346 | pd = ip_vs_proto_data_get(net, cih->protocol); |
1347 | if (!pd) | ||
1254 | return NF_ACCEPT; | 1348 | return NF_ACCEPT; |
1349 | pp = pd->pp; | ||
1255 | 1350 | ||
1256 | /* Is the embedded protocol header present? */ | 1351 | /* Is the embedded protocol header present? */ |
1257 | if (unlikely(cih->frag_off & htons(IP_OFFSET) && | 1352 | if (unlikely(cih->frag_off & htons(IP_OFFSET) && |
@@ -1265,18 +1360,9 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) | |||
1265 | 1360 | ||
1266 | ip_vs_fill_iphdr(AF_INET, cih, &ciph); | 1361 | ip_vs_fill_iphdr(AF_INET, cih, &ciph); |
1267 | /* The embedded headers contain source and dest in reverse order */ | 1362 | /* The embedded headers contain source and dest in reverse order */ |
1268 | cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); | 1363 | cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1); |
1269 | if (!cp) { | 1364 | if (!cp) |
1270 | /* The packet could also belong to a local client */ | ||
1271 | cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); | ||
1272 | if (cp) { | ||
1273 | snet.ip = iph->saddr; | ||
1274 | return handle_response_icmp(AF_INET, skb, &snet, | ||
1275 | cih->protocol, cp, pp, | ||
1276 | offset, ihl); | ||
1277 | } | ||
1278 | return NF_ACCEPT; | 1365 | return NF_ACCEPT; |
1279 | } | ||
1280 | 1366 | ||
1281 | verdict = NF_DROP; | 1367 | verdict = NF_DROP; |
1282 | 1368 | ||
@@ -1312,6 +1398,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) | |||
1312 | static int | 1398 | static int |
1313 | ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) | 1399 | ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) |
1314 | { | 1400 | { |
1401 | struct net *net = NULL; | ||
1315 | struct ipv6hdr *iph; | 1402 | struct ipv6hdr *iph; |
1316 | struct icmp6hdr _icmph, *ic; | 1403 | struct icmp6hdr _icmph, *ic; |
1317 | struct ipv6hdr _ciph, *cih; /* The ip header contained | 1404 | struct ipv6hdr _ciph, *cih; /* The ip header contained |
@@ -1319,8 +1406,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) | |||
1319 | struct ip_vs_iphdr ciph; | 1406 | struct ip_vs_iphdr ciph; |
1320 | struct ip_vs_conn *cp; | 1407 | struct ip_vs_conn *cp; |
1321 | struct ip_vs_protocol *pp; | 1408 | struct ip_vs_protocol *pp; |
1409 | struct ip_vs_proto_data *pd; | ||
1322 | unsigned int offset, verdict; | 1410 | unsigned int offset, verdict; |
1323 | union nf_inet_addr snet; | ||
1324 | struct rt6_info *rt; | 1411 | struct rt6_info *rt; |
1325 | 1412 | ||
1326 | *related = 1; | 1413 | *related = 1; |
@@ -1361,9 +1448,11 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) | |||
1361 | if (cih == NULL) | 1448 | if (cih == NULL) |
1362 | return NF_ACCEPT; /* The packet looks wrong, ignore */ | 1449 | return NF_ACCEPT; /* The packet looks wrong, ignore */ |
1363 | 1450 | ||
1364 | pp = ip_vs_proto_get(cih->nexthdr); | 1451 | net = skb_net(skb); |
1365 | if (!pp) | 1452 | pd = ip_vs_proto_data_get(net, cih->nexthdr); |
1453 | if (!pd) | ||
1366 | return NF_ACCEPT; | 1454 | return NF_ACCEPT; |
1455 | pp = pd->pp; | ||
1367 | 1456 | ||
1368 | /* Is the embedded protocol header present? */ | 1457 | /* Is the embedded protocol header present? */ |
1369 | /* TODO: we don't support fragmentation at the moment anyways */ | 1458 | /* TODO: we don't support fragmentation at the moment anyways */ |
@@ -1377,19 +1466,9 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) | |||
1377 | 1466 | ||
1378 | ip_vs_fill_iphdr(AF_INET6, cih, &ciph); | 1467 | ip_vs_fill_iphdr(AF_INET6, cih, &ciph); |
1379 | /* The embedded headers contain source and dest in reverse order */ | 1468 | /* The embedded headers contain source and dest in reverse order */ |
1380 | cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); | 1469 | cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1); |
1381 | if (!cp) { | 1470 | if (!cp) |
1382 | /* The packet could also belong to a local client */ | ||
1383 | cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); | ||
1384 | if (cp) { | ||
1385 | ipv6_addr_copy(&snet.in6, &iph->saddr); | ||
1386 | return handle_response_icmp(AF_INET6, skb, &snet, | ||
1387 | cih->nexthdr, | ||
1388 | cp, pp, offset, | ||
1389 | sizeof(struct ipv6hdr)); | ||
1390 | } | ||
1391 | return NF_ACCEPT; | 1471 | return NF_ACCEPT; |
1392 | } | ||
1393 | 1472 | ||
1394 | verdict = NF_DROP; | 1473 | verdict = NF_DROP; |
1395 | 1474 | ||
@@ -1423,10 +1502,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) | |||
1423 | static unsigned int | 1502 | static unsigned int |
1424 | ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) | 1503 | ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) |
1425 | { | 1504 | { |
1505 | struct net *net; | ||
1426 | struct ip_vs_iphdr iph; | 1506 | struct ip_vs_iphdr iph; |
1427 | struct ip_vs_protocol *pp; | 1507 | struct ip_vs_protocol *pp; |
1508 | struct ip_vs_proto_data *pd; | ||
1428 | struct ip_vs_conn *cp; | 1509 | struct ip_vs_conn *cp; |
1429 | int ret, restart, pkts; | 1510 | int ret, restart, pkts; |
1511 | struct netns_ipvs *ipvs; | ||
1430 | 1512 | ||
1431 | /* Already marked as IPVS request or reply? */ | 1513 | /* Already marked as IPVS request or reply? */ |
1432 | if (skb->ipvs_property) | 1514 | if (skb->ipvs_property) |
@@ -1480,20 +1562,21 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) | |||
1480 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); | 1562 | ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); |
1481 | } | 1563 | } |
1482 | 1564 | ||
1565 | net = skb_net(skb); | ||
1483 | /* Protocol supported? */ | 1566 | /* Protocol supported? */ |
1484 | pp = ip_vs_proto_get(iph.protocol); | 1567 | pd = ip_vs_proto_data_get(net, iph.protocol); |
1485 | if (unlikely(!pp)) | 1568 | if (unlikely(!pd)) |
1486 | return NF_ACCEPT; | 1569 | return NF_ACCEPT; |
1487 | 1570 | pp = pd->pp; | |
1488 | /* | 1571 | /* |
1489 | * Check if the packet belongs to an existing connection entry | 1572 | * Check if the packet belongs to an existing connection entry |
1490 | */ | 1573 | */ |
1491 | cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); | 1574 | cp = pp->conn_in_get(af, skb, &iph, iph.len, 0); |
1492 | 1575 | ||
1493 | if (unlikely(!cp)) { | 1576 | if (unlikely(!cp)) { |
1494 | int v; | 1577 | int v; |
1495 | 1578 | ||
1496 | if (!pp->conn_schedule(af, skb, pp, &v, &cp)) | 1579 | if (!pp->conn_schedule(af, skb, pd, &v, &cp)) |
1497 | return v; | 1580 | return v; |
1498 | } | 1581 | } |
1499 | 1582 | ||
@@ -1505,12 +1588,13 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) | |||
1505 | } | 1588 | } |
1506 | 1589 | ||
1507 | IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); | 1590 | IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); |
1508 | 1591 | net = skb_net(skb); | |
1592 | ipvs = net_ipvs(net); | ||
1509 | /* Check the server status */ | 1593 | /* Check the server status */ |
1510 | if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { | 1594 | if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { |
1511 | /* the destination server is not available */ | 1595 | /* the destination server is not available */ |
1512 | 1596 | ||
1513 | if (sysctl_ip_vs_expire_nodest_conn) { | 1597 | if (sysctl_expire_nodest_conn(ipvs)) { |
1514 | /* try to expire the connection immediately */ | 1598 | /* try to expire the connection immediately */ |
1515 | ip_vs_conn_expire_now(cp); | 1599 | ip_vs_conn_expire_now(cp); |
1516 | } | 1600 | } |
@@ -1521,7 +1605,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) | |||
1521 | } | 1605 | } |
1522 | 1606 | ||
1523 | ip_vs_in_stats(cp, skb); | 1607 | ip_vs_in_stats(cp, skb); |
1524 | restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); | 1608 | restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); |
1525 | if (cp->packet_xmit) | 1609 | if (cp->packet_xmit) |
1526 | ret = cp->packet_xmit(skb, cp, pp); | 1610 | ret = cp->packet_xmit(skb, cp, pp); |
1527 | /* do not touch skb anymore */ | 1611 | /* do not touch skb anymore */ |
@@ -1535,35 +1619,41 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) | |||
1535 | * | 1619 | * |
1536 | * Sync connection if it is about to close to | 1620 | * Sync connection if it is about to close to |
1537 | * encourage the standby servers to update the connection's timeout | 1621 | * encourage the standby servers to update the connection's timeout |
1622 | * | ||
1623 | * For ONE_PKT let ip_vs_sync_conn() do the filter work. | ||
1538 | */ | 1624 | */ |
1539 | pkts = atomic_add_return(1, &cp->in_pkts); | 1625 | |
1540 | if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) && | 1626 | if (cp->flags & IP_VS_CONN_F_ONE_PACKET) |
1627 | pkts = sysctl_sync_threshold(ipvs); | ||
1628 | else | ||
1629 | pkts = atomic_add_return(1, &cp->in_pkts); | ||
1630 | |||
1631 | if ((ipvs->sync_state & IP_VS_STATE_MASTER) && | ||
1541 | cp->protocol == IPPROTO_SCTP) { | 1632 | cp->protocol == IPPROTO_SCTP) { |
1542 | if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && | 1633 | if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && |
1543 | (pkts % sysctl_ip_vs_sync_threshold[1] | 1634 | (pkts % sysctl_sync_period(ipvs) |
1544 | == sysctl_ip_vs_sync_threshold[0])) || | 1635 | == sysctl_sync_threshold(ipvs))) || |
1545 | (cp->old_state != cp->state && | 1636 | (cp->old_state != cp->state && |
1546 | ((cp->state == IP_VS_SCTP_S_CLOSED) || | 1637 | ((cp->state == IP_VS_SCTP_S_CLOSED) || |
1547 | (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || | 1638 | (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || |
1548 | (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { | 1639 | (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { |
1549 | ip_vs_sync_conn(cp); | 1640 | ip_vs_sync_conn(net, cp); |
1550 | goto out; | 1641 | goto out; |
1551 | } | 1642 | } |
1552 | } | 1643 | } |
1553 | 1644 | ||
1554 | /* Keep this block last: TCP and others with pp->num_states <= 1 */ | 1645 | /* Keep this block last: TCP and others with pp->num_states <= 1 */ |
1555 | else if (af == AF_INET && | 1646 | else if ((ipvs->sync_state & IP_VS_STATE_MASTER) && |
1556 | (ip_vs_sync_state & IP_VS_STATE_MASTER) && | ||
1557 | (((cp->protocol != IPPROTO_TCP || | 1647 | (((cp->protocol != IPPROTO_TCP || |
1558 | cp->state == IP_VS_TCP_S_ESTABLISHED) && | 1648 | cp->state == IP_VS_TCP_S_ESTABLISHED) && |
1559 | (pkts % sysctl_ip_vs_sync_threshold[1] | 1649 | (pkts % sysctl_sync_period(ipvs) |
1560 | == sysctl_ip_vs_sync_threshold[0])) || | 1650 | == sysctl_sync_threshold(ipvs))) || |
1561 | ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && | 1651 | ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && |
1562 | ((cp->state == IP_VS_TCP_S_FIN_WAIT) || | 1652 | ((cp->state == IP_VS_TCP_S_FIN_WAIT) || |
1563 | (cp->state == IP_VS_TCP_S_CLOSE) || | 1653 | (cp->state == IP_VS_TCP_S_CLOSE) || |
1564 | (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || | 1654 | (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || |
1565 | (cp->state == IP_VS_TCP_S_TIME_WAIT))))) | 1655 | (cp->state == IP_VS_TCP_S_TIME_WAIT))))) |
1566 | ip_vs_sync_conn(cp); | 1656 | ip_vs_sync_conn(net, cp); |
1567 | out: | 1657 | out: |
1568 | cp->old_state = cp->state; | 1658 | cp->old_state = cp->state; |
1569 | 1659 | ||
@@ -1782,7 +1872,39 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { | |||
1782 | }, | 1872 | }, |
1783 | #endif | 1873 | #endif |
1784 | }; | 1874 | }; |
1875 | /* | ||
1876 | * Initialize IP Virtual Server netns mem. | ||
1877 | */ | ||
1878 | static int __net_init __ip_vs_init(struct net *net) | ||
1879 | { | ||
1880 | struct netns_ipvs *ipvs; | ||
1881 | |||
1882 | ipvs = net_generic(net, ip_vs_net_id); | ||
1883 | if (ipvs == NULL) { | ||
1884 | pr_err("%s(): no memory.\n", __func__); | ||
1885 | return -ENOMEM; | ||
1886 | } | ||
1887 | ipvs->net = net; | ||
1888 | /* Counters used for creating unique names */ | ||
1889 | ipvs->gen = atomic_read(&ipvs_netns_cnt); | ||
1890 | atomic_inc(&ipvs_netns_cnt); | ||
1891 | net->ipvs = ipvs; | ||
1892 | printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", | ||
1893 | sizeof(struct netns_ipvs), ipvs->gen); | ||
1894 | return 0; | ||
1895 | } | ||
1785 | 1896 | ||
1897 | static void __net_exit __ip_vs_cleanup(struct net *net) | ||
1898 | { | ||
1899 | IP_VS_DBG(10, "ipvs netns %d released\n", net_ipvs(net)->gen); | ||
1900 | } | ||
1901 | |||
1902 | static struct pernet_operations ipvs_core_ops = { | ||
1903 | .init = __ip_vs_init, | ||
1904 | .exit = __ip_vs_cleanup, | ||
1905 | .id = &ip_vs_net_id, | ||
1906 | .size = sizeof(struct netns_ipvs), | ||
1907 | }; | ||
1786 | 1908 | ||
1787 | /* | 1909 | /* |
1788 | * Initialize IP Virtual Server | 1910 | * Initialize IP Virtual Server |
@@ -1791,8 +1913,11 @@ static int __init ip_vs_init(void) | |||
1791 | { | 1913 | { |
1792 | int ret; | 1914 | int ret; |
1793 | 1915 | ||
1794 | ip_vs_estimator_init(); | 1916 | ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ |
1917 | if (ret < 0) | ||
1918 | return ret; | ||
1795 | 1919 | ||
1920 | ip_vs_estimator_init(); | ||
1796 | ret = ip_vs_control_init(); | 1921 | ret = ip_vs_control_init(); |
1797 | if (ret < 0) { | 1922 | if (ret < 0) { |
1798 | pr_err("can't setup control.\n"); | 1923 | pr_err("can't setup control.\n"); |
@@ -1813,15 +1938,23 @@ static int __init ip_vs_init(void) | |||
1813 | goto cleanup_app; | 1938 | goto cleanup_app; |
1814 | } | 1939 | } |
1815 | 1940 | ||
1941 | ret = ip_vs_sync_init(); | ||
1942 | if (ret < 0) { | ||
1943 | pr_err("can't setup sync data.\n"); | ||
1944 | goto cleanup_conn; | ||
1945 | } | ||
1946 | |||
1816 | ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); | 1947 | ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); |
1817 | if (ret < 0) { | 1948 | if (ret < 0) { |
1818 | pr_err("can't register hooks.\n"); | 1949 | pr_err("can't register hooks.\n"); |
1819 | goto cleanup_conn; | 1950 | goto cleanup_sync; |
1820 | } | 1951 | } |
1821 | 1952 | ||
1822 | pr_info("ipvs loaded.\n"); | 1953 | pr_info("ipvs loaded.\n"); |
1823 | return ret; | 1954 | return ret; |
1824 | 1955 | ||
1956 | cleanup_sync: | ||
1957 | ip_vs_sync_cleanup(); | ||
1825 | cleanup_conn: | 1958 | cleanup_conn: |
1826 | ip_vs_conn_cleanup(); | 1959 | ip_vs_conn_cleanup(); |
1827 | cleanup_app: | 1960 | cleanup_app: |
@@ -1831,17 +1964,20 @@ static int __init ip_vs_init(void) | |||
1831 | ip_vs_control_cleanup(); | 1964 | ip_vs_control_cleanup(); |
1832 | cleanup_estimator: | 1965 | cleanup_estimator: |
1833 | ip_vs_estimator_cleanup(); | 1966 | ip_vs_estimator_cleanup(); |
1967 | unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ | ||
1834 | return ret; | 1968 | return ret; |
1835 | } | 1969 | } |
1836 | 1970 | ||
1837 | static void __exit ip_vs_cleanup(void) | 1971 | static void __exit ip_vs_cleanup(void) |
1838 | { | 1972 | { |
1839 | nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); | 1973 | nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); |
1974 | ip_vs_sync_cleanup(); | ||
1840 | ip_vs_conn_cleanup(); | 1975 | ip_vs_conn_cleanup(); |
1841 | ip_vs_app_cleanup(); | 1976 | ip_vs_app_cleanup(); |
1842 | ip_vs_protocol_cleanup(); | 1977 | ip_vs_protocol_cleanup(); |
1843 | ip_vs_control_cleanup(); | 1978 | ip_vs_control_cleanup(); |
1844 | ip_vs_estimator_cleanup(); | 1979 | ip_vs_estimator_cleanup(); |
1980 | unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ | ||
1845 | pr_info("ipvs unloaded.\n"); | 1981 | pr_info("ipvs unloaded.\n"); |
1846 | } | 1982 | } |
1847 | 1983 | ||