Diffstat (limited to 'net/ipv4')

 net/ipv4/inet_diag.c               |   6
 net/ipv4/ipvs/Kconfig              |   6
 net/ipv4/ipvs/Makefile             |   3
 net/ipv4/ipvs/ip_vs_core.c         |   8
 net/ipv4/ipvs/ip_vs_ctl.c          | 896
 net/ipv4/ipvs/ip_vs_est.c          |  18
 net/ipv4/ipvs/ip_vs_lblc.c         | 213
 net/ipv4/ipvs/ip_vs_lblcr.c        | 238
 net/ipv4/ipvs/ip_vs_lc.c           |  21
 net/ipv4/ipvs/ip_vs_nq.c           |  24
 net/ipv4/ipvs/ip_vs_proto_ah_esp.c (renamed from net/ipv4/ipvs/ip_vs_proto_ah.c) | 69
 net/ipv4/ipvs/ip_vs_proto_esp.c    | 176
 net/ipv4/ipvs/ip_vs_rr.c           |   7
 net/ipv4/ipvs/ip_vs_sed.c          |  24
 net/ipv4/ipvs/ip_vs_wlc.c          |  24
 net/ipv4/route.c                   |  14
 net/ipv4/tcp_input.c               | 208
 net/ipv4/tcp_ipv4.c                |  26
 18 files changed, 1305 insertions(+), 676 deletions(-)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index c10036e7a463..89cb047ab314 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -782,11 +782,15 @@ skip_listen_ht:
 		struct sock *sk;
 		struct hlist_node *node;
 
+		num = 0;
+
+		if (hlist_empty(&head->chain) && hlist_empty(&head->twchain))
+			continue;
+
 		if (i > s_i)
 			s_num = 0;
 
 		read_lock_bh(lock);
-		num = 0;
 		sk_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
 
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 09d0c3f35669..2e48a7e27223 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -71,14 +71,20 @@ config IP_VS_PROTO_UDP
 	  This option enables support for load balancing UDP transport
 	  protocol. Say Y if unsure.
 
+config IP_VS_PROTO_AH_ESP
+	bool
+	depends on UNDEFINED
+
 config IP_VS_PROTO_ESP
 	bool "ESP load balancing support"
+	select IP_VS_PROTO_AH_ESP
 	---help---
 	  This option enables support for load balancing ESP (Encapsulation
 	  Security Payload) transport protocol. Say Y if unsure.
 
 config IP_VS_PROTO_AH
 	bool "AH load balancing support"
+	select IP_VS_PROTO_AH_ESP
 	---help---
 	  This option enables support for load balancing AH (Authentication
 	  Header) transport protocol. Say Y if unsure.
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
index 30e85de9ffff..73a46fe1fe4c 100644
--- a/net/ipv4/ipvs/Makefile
+++ b/net/ipv4/ipvs/Makefile
@@ -6,8 +6,7 @@
 ip_vs_proto-objs-y :=
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
 ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
-ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o
-ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
 
 ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
 		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index a7879eafc3b5..9fbf0a6d7392 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -1070,10 +1070,12 @@ static int __init ip_vs_init(void)
 {
 	int ret;
 
+	ip_vs_estimator_init();
+
 	ret = ip_vs_control_init();
 	if (ret < 0) {
 		IP_VS_ERR("can't setup control.\n");
-		goto cleanup_nothing;
+		goto cleanup_estimator;
 	}
 
 	ip_vs_protocol_init();
@@ -1106,7 +1108,8 @@ static int __init ip_vs_init(void)
  cleanup_protocol:
 	ip_vs_protocol_cleanup();
 	ip_vs_control_cleanup();
- cleanup_nothing:
+ cleanup_estimator:
+	ip_vs_estimator_cleanup();
 	return ret;
 }
 
@@ -1117,6 +1120,7 @@ static void __exit ip_vs_cleanup(void)
 	ip_vs_app_cleanup();
 	ip_vs_protocol_cleanup();
 	ip_vs_control_cleanup();
+	ip_vs_estimator_cleanup();
 	IP_VS_INFO("ipvs unloaded.\n");
 }
 
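Note: the init/exit reordering above follows the usual kernel bring-up idiom — subsystems start in dependency order and are torn down in reverse, with each error label undoing only the steps that already succeeded. A minimal sketch of the pattern (the subsys_* names are illustrative, not from this patch):

	static int __init example_init(void)
	{
		int ret;

		ret = subsys_a_init();		/* here: the estimator */
		if (ret < 0)
			return ret;

		ret = subsys_b_init();		/* here: the control interface */
		if (ret < 0)
			goto cleanup_a;		/* undo only what succeeded */

		return 0;

	cleanup_a:
		subsys_a_cleanup();
		return ret;
	}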
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 6379705a8dcb..ede101eeec17 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -37,6 +37,7 @@
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/sock.h>
+#include <net/genetlink.h>
 
 #include <asm/uaccess.h>
 
@@ -868,7 +869,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
 	svc->num_dests++;
 
 	/* call the update_service function of its scheduler */
-	svc->scheduler->update_service(svc);
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
 
 	write_unlock_bh(&__ip_vs_svc_lock);
 	return 0;
@@ -898,7 +900,8 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
 	svc->num_dests++;
 
 	/* call the update_service function of its scheduler */
-	svc->scheduler->update_service(svc);
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
 
 	write_unlock_bh(&__ip_vs_svc_lock);
 
@@ -948,7 +951,8 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
 	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
 
 	/* call the update_service, because server weight may be changed */
-	svc->scheduler->update_service(svc);
+	if (svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
 
 	write_unlock_bh(&__ip_vs_svc_lock);
 
@@ -1011,12 +1015,12 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
 	 */
 	list_del(&dest->n_list);
 	svc->num_dests--;
-	if (svcupd) {
-		/*
-		 *  Call the update_service function of its scheduler
-		 */
-		svc->scheduler->update_service(svc);
-	}
+
+	/*
+	 *  Call the update_service function of its scheduler
+	 */
+	if (svcupd && svc->scheduler->update_service)
+		svc->scheduler->update_service(svc);
 }
 
 
@@ -2320,6 +2324,872 @@ static struct nf_sockopt_ops ip_vs_sockopts = {
 	.owner		= THIS_MODULE,
 };
 
+/*
+ * Generic Netlink interface
+ */
+
+/* IPVS genetlink family */
+static struct genl_family ip_vs_genl_family = {
+	.id		= GENL_ID_GENERATE,
+	.hdrsize	= 0,
+	.name		= IPVS_GENL_NAME,
+	.version	= IPVS_GENL_VERSION,
+	.maxattr	= IPVS_CMD_MAX,
+};
+
+/* Policy used for first-level command attributes */
+static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
+	[IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },
+	[IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
+	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
+	[IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
+static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
+	[IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
+	[IPVS_DAEMON_ATTR_MCAST_IFN]	= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_IFNAME_MAXLEN },
+	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
+static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
+	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
+					    .len = sizeof(union nf_inet_addr) },
+	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
+	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
+					    .len = IP_VS_SCHEDNAME_MAXLEN },
+	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
+					    .len = sizeof(struct ip_vs_flags) },
+	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
+	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
+static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
+	[IPVS_DEST_ATTR_ADDR]		= { .type = NLA_BINARY,
+					    .len = sizeof(union nf_inet_addr) },
+	[IPVS_DEST_ATTR_PORT]		= { .type = NLA_U16 },
+	[IPVS_DEST_ATTR_FWD_METHOD]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_WEIGHT]		= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_U_THRESH]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_L_THRESH]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_ACTIVE_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_INACT_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_PERSIST_CONNS]	= { .type = NLA_U32 },
+	[IPVS_DEST_ATTR_STATS]		= { .type = NLA_NESTED },
+};
+
+static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
+				 struct ip_vs_stats *stats)
+{
+	struct nlattr *nl_stats = nla_nest_start(skb, container_type);
+	if (!nl_stats)
+		return -EMSGSIZE;
+
+	spin_lock_bh(&stats->lock);
+
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->conns);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->inpkts);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->outpkts);
+	NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->inbytes);
+	NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->outbytes);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->cps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->inpps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->outpps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->inbps);
+	NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->outbps);
+
+	spin_unlock_bh(&stats->lock);
+
+	nla_nest_end(skb, nl_stats);
+
+	return 0;
+
+nla_put_failure:
+	spin_unlock_bh(&stats->lock);
+	nla_nest_cancel(skb, nl_stats);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_fill_service(struct sk_buff *skb,
+				   struct ip_vs_service *svc)
+{
+	struct nlattr *nl_service;
+	struct ip_vs_flags flags = { .flags = svc->flags,
+				     .mask = ~0 };
+
+	nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
+	if (!nl_service)
+		return -EMSGSIZE;
+
+	NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, AF_INET);
+
+	if (svc->fwmark) {
+		NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
+	} else {
+		NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
+		NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
+		NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
+	}
+
+	NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
+	NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
+	NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
+	NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
+
+	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nl_service);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nl_service);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_service(struct sk_buff *skb,
+				   struct ip_vs_service *svc,
+				   struct netlink_callback *cb)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			  &ip_vs_genl_family, NLM_F_MULTI,
+			  IPVS_CMD_NEW_SERVICE);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_service(skb, svc) < 0)
+		goto nla_put_failure;
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_services(struct sk_buff *skb,
+				    struct netlink_callback *cb)
+{
+	int idx = 0, i;
+	int start = cb->args[0];
+	struct ip_vs_service *svc;
+
+	mutex_lock(&__ip_vs_mutex);
+	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
+			if (++idx <= start)
+				continue;
+			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+				idx--;
+				goto nla_put_failure;
+			}
+		}
+	}
+
+	for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
+			if (++idx <= start)
+				continue;
+			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+				idx--;
+				goto nla_put_failure;
+			}
+		}
+	}
+
+nla_put_failure:
+	mutex_unlock(&__ip_vs_mutex);
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
+static int ip_vs_genl_parse_service(struct ip_vs_service_user *usvc,
+				    struct nlattr *nla, int full_entry)
+{
+	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
+	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+
+	/* Parse mandatory identifying service fields first */
+	if (nla == NULL ||
+	    nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
+		return -EINVAL;
+
+	nla_af		= attrs[IPVS_SVC_ATTR_AF];
+	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
+	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
+	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
+	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];
+
+	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
+		return -EINVAL;
+
+	/* For now, only support IPv4 */
+	if (nla_get_u16(nla_af) != AF_INET)
+		return -EAFNOSUPPORT;
+
+	if (nla_fwmark) {
+		usvc->protocol = IPPROTO_TCP;
+		usvc->fwmark = nla_get_u32(nla_fwmark);
+	} else {
+		usvc->protocol = nla_get_u16(nla_protocol);
+		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
+		usvc->port = nla_get_u16(nla_port);
+		usvc->fwmark = 0;
+	}
+
+	/* If a full entry was requested, check for the additional fields */
+	if (full_entry) {
+		struct nlattr *nla_sched, *nla_flags, *nla_timeout,
+			      *nla_netmask;
+		struct ip_vs_flags flags;
+		struct ip_vs_service *svc;
+
+		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
+		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
+		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
+
+		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
+			return -EINVAL;
+
+		nla_memcpy(&flags, nla_flags, sizeof(flags));
+
+		/* prefill flags from service if it already exists */
+		if (usvc->fwmark)
+			svc = __ip_vs_svc_fwm_get(usvc->fwmark);
+		else
+			svc = __ip_vs_service_get(usvc->protocol, usvc->addr,
+						  usvc->port);
+		if (svc) {
+			usvc->flags = svc->flags;
+			ip_vs_service_put(svc);
+		} else
+			usvc->flags = 0;
+
+		/* set new flags from userland */
+		usvc->flags = (usvc->flags & ~flags.mask) |
+			      (flags.flags & flags.mask);
+
+		strlcpy(usvc->sched_name, nla_data(nla_sched),
+			sizeof(usvc->sched_name));
+		usvc->timeout = nla_get_u32(nla_timeout);
+		usvc->netmask = nla_get_u32(nla_netmask);
+	}
+
+	return 0;
+}
+
+static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla)
+{
+	struct ip_vs_service_user usvc;
+	int ret;
+
+	ret = ip_vs_genl_parse_service(&usvc, nla, 0);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (usvc.fwmark)
+		return __ip_vs_svc_fwm_get(usvc.fwmark);
+	else
+		return __ip_vs_service_get(usvc.protocol, usvc.addr,
+					   usvc.port);
+}
+
+static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
+{
+	struct nlattr *nl_dest;
+
+	nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
+	if (!nl_dest)
+		return -EMSGSIZE;
+
+	NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
+	NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
+
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
+		    atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
+		    atomic_read(&dest->activeconns));
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
+		    atomic_read(&dest->inactconns));
+	NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
+		    atomic_read(&dest->persistconns));
+
+	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nl_dest);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nl_dest);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
+				struct netlink_callback *cb)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			  &ip_vs_genl_family, NLM_F_MULTI,
+			  IPVS_CMD_NEW_DEST);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_dest(skb, dest) < 0)
+		goto nla_put_failure;
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_dests(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	int idx = 0;
+	int start = cb->args[0];
+	struct ip_vs_service *svc;
+	struct ip_vs_dest *dest;
+	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+
+	mutex_lock(&__ip_vs_mutex);
+
+	/* Try to find the service for which to dump destinations */
+	if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
+			IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
+		goto out_err;
+
+	svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]);
+	if (IS_ERR(svc) || svc == NULL)
+		goto out_err;
+
+	/* Dump the destinations */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (++idx <= start)
+			continue;
+		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
+			idx--;
+			goto nla_put_failure;
+		}
+	}
+
+nla_put_failure:
+	cb->args[0] = idx;
+	ip_vs_service_put(svc);
+
+out_err:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return skb->len;
+}
+
+static int ip_vs_genl_parse_dest(struct ip_vs_dest_user *udest,
+				 struct nlattr *nla, int full_entry)
+{
+	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
+	struct nlattr *nla_addr, *nla_port;
+
+	/* Parse mandatory identifying destination fields first */
+	if (nla == NULL ||
+	    nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
+		return -EINVAL;
+
+	nla_addr	= attrs[IPVS_DEST_ATTR_ADDR];
+	nla_port	= attrs[IPVS_DEST_ATTR_PORT];
+
+	if (!(nla_addr && nla_port))
+		return -EINVAL;
+
+	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
+	udest->port = nla_get_u16(nla_port);
+
+	/* If a full entry was requested, check for the additional fields */
+	if (full_entry) {
+		struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
+			      *nla_l_thresh;
+
+		nla_fwd		= attrs[IPVS_DEST_ATTR_FWD_METHOD];
+		nla_weight	= attrs[IPVS_DEST_ATTR_WEIGHT];
+		nla_u_thresh	= attrs[IPVS_DEST_ATTR_U_THRESH];
+		nla_l_thresh	= attrs[IPVS_DEST_ATTR_L_THRESH];
+
+		if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
+			return -EINVAL;
+
+		udest->conn_flags = nla_get_u32(nla_fwd)
+				    & IP_VS_CONN_F_FWD_MASK;
+		udest->weight = nla_get_u32(nla_weight);
+		udest->u_threshold = nla_get_u32(nla_u_thresh);
+		udest->l_threshold = nla_get_u32(nla_l_thresh);
+	}
+
+	return 0;
+}
+
+static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
+				  const char *mcast_ifn, __be32 syncid)
+{
+	struct nlattr *nl_daemon;
+
+	nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
+	if (!nl_daemon)
+		return -EMSGSIZE;
+
+	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
+	NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
+	NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
+
+	nla_nest_end(skb, nl_daemon);
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nl_daemon);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
+				  const char *mcast_ifn, __be32 syncid,
+				  struct netlink_callback *cb)
+{
+	void *hdr;
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
+			  &ip_vs_genl_family, NLM_F_MULTI,
+			  IPVS_CMD_NEW_DAEMON);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
+		goto nla_put_failure;
+
+	return genlmsg_end(skb, hdr);
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	mutex_lock(&__ip_vs_mutex);
+	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
+		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
+					   ip_vs_master_mcast_ifn,
+					   ip_vs_master_syncid, cb) < 0)
+			goto nla_put_failure;
+
+		cb->args[0] = 1;
+	}
+
+	if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
+		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
+					   ip_vs_backup_mcast_ifn,
+					   ip_vs_backup_syncid, cb) < 0)
+			goto nla_put_failure;
+
+		cb->args[1] = 1;
+	}
+
+nla_put_failure:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return skb->len;
+}
+
+static int ip_vs_genl_new_daemon(struct nlattr **attrs)
+{
+	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
+	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
+	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
+		return -EINVAL;
+
+	return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
+				 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
+				 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
+}
+
+static int ip_vs_genl_del_daemon(struct nlattr **attrs)
+{
+	if (!attrs[IPVS_DAEMON_ATTR_STATE])
+		return -EINVAL;
+
+	return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+}
+
+static int ip_vs_genl_set_config(struct nlattr **attrs)
+{
+	struct ip_vs_timeout_user t;
+
+	__ip_vs_get_timeouts(&t);
+
+	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
+		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
+
+	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
+		t.tcp_fin_timeout =
+			nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
+
+	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
+		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
+
+	return ip_vs_set_timeout(&t);
+}
+
+static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ip_vs_service *svc = NULL;
+	struct ip_vs_service_user usvc;
+	struct ip_vs_dest_user udest;
+	int ret = 0, cmd;
+	int need_full_svc = 0, need_full_dest = 0;
+
+	cmd = info->genlhdr->cmd;
+
+	mutex_lock(&__ip_vs_mutex);
+
+	if (cmd == IPVS_CMD_FLUSH) {
+		ret = ip_vs_flush();
+		goto out;
+	} else if (cmd == IPVS_CMD_SET_CONFIG) {
+		ret = ip_vs_genl_set_config(info->attrs);
+		goto out;
+	} else if (cmd == IPVS_CMD_NEW_DAEMON ||
+		   cmd == IPVS_CMD_DEL_DAEMON) {
+
+		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
+
+		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
+		    nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
+				     info->attrs[IPVS_CMD_ATTR_DAEMON],
+				     ip_vs_daemon_policy)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (cmd == IPVS_CMD_NEW_DAEMON)
+			ret = ip_vs_genl_new_daemon(daemon_attrs);
+		else
+			ret = ip_vs_genl_del_daemon(daemon_attrs);
+		goto out;
+	} else if (cmd == IPVS_CMD_ZERO &&
+		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
+		ret = ip_vs_zero_all();
+		goto out;
+	}
+
+	/* All following commands require a service argument, so check if we
+	 * received a valid one. We need a full service specification when
+	 * adding / editing a service. Only identifying members otherwise. */
+	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
+		need_full_svc = 1;
+
+	ret = ip_vs_genl_parse_service(&usvc,
+				       info->attrs[IPVS_CMD_ATTR_SERVICE],
+				       need_full_svc);
+	if (ret)
+		goto out;
+
+	/* Lookup the exact service by <protocol, addr, port> or fwmark */
+	if (usvc.fwmark == 0)
+		svc = __ip_vs_service_get(usvc.protocol, usvc.addr, usvc.port);
+	else
+		svc = __ip_vs_svc_fwm_get(usvc.fwmark);
+
+	/* Unless we're adding a new service, the service must already exist */
+	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
+		ret = -ESRCH;
+		goto out;
+	}
+
+	/* Destination commands require a valid destination argument. For
+	 * adding / editing a destination, we need a full destination
+	 * specification. */
+	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
+	    cmd == IPVS_CMD_DEL_DEST) {
+		if (cmd != IPVS_CMD_DEL_DEST)
+			need_full_dest = 1;
+
+		ret = ip_vs_genl_parse_dest(&udest,
+					    info->attrs[IPVS_CMD_ATTR_DEST],
+					    need_full_dest);
+		if (ret)
+			goto out;
+	}
+
+	switch (cmd) {
+	case IPVS_CMD_NEW_SERVICE:
+		if (svc == NULL)
+			ret = ip_vs_add_service(&usvc, &svc);
+		else
+			ret = -EEXIST;
+		break;
+	case IPVS_CMD_SET_SERVICE:
+		ret = ip_vs_edit_service(svc, &usvc);
+		break;
+	case IPVS_CMD_DEL_SERVICE:
+		ret = ip_vs_del_service(svc);
+		break;
+	case IPVS_CMD_NEW_DEST:
+		ret = ip_vs_add_dest(svc, &udest);
+		break;
+	case IPVS_CMD_SET_DEST:
+		ret = ip_vs_edit_dest(svc, &udest);
+		break;
+	case IPVS_CMD_DEL_DEST:
+		ret = ip_vs_del_dest(svc, &udest);
+		break;
+	case IPVS_CMD_ZERO:
+		ret = ip_vs_zero_service(svc);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+out:
+	if (svc)
+		ip_vs_service_put(svc);
+	mutex_unlock(&__ip_vs_mutex);
+
+	return ret;
+}
+
+static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+	struct sk_buff *msg;
+	void *reply;
+	int ret, cmd, reply_cmd;
+
+	cmd = info->genlhdr->cmd;
+
+	if (cmd == IPVS_CMD_GET_SERVICE)
+		reply_cmd = IPVS_CMD_NEW_SERVICE;
+	else if (cmd == IPVS_CMD_GET_INFO)
+		reply_cmd = IPVS_CMD_SET_INFO;
+	else if (cmd == IPVS_CMD_GET_CONFIG)
+		reply_cmd = IPVS_CMD_SET_CONFIG;
+	else {
+		IP_VS_ERR("unknown Generic Netlink command\n");
+		return -EINVAL;
+	}
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	mutex_lock(&__ip_vs_mutex);
+
+	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
+	if (reply == NULL)
+		goto nla_put_failure;
+
+	switch (cmd) {
+	case IPVS_CMD_GET_SERVICE:
+	{
+		struct ip_vs_service *svc;
+
+		svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]);
+		if (IS_ERR(svc)) {
+			ret = PTR_ERR(svc);
+			goto out_err;
+		} else if (svc) {
+			ret = ip_vs_genl_fill_service(msg, svc);
+			ip_vs_service_put(svc);
+			if (ret)
+				goto nla_put_failure;
+		} else {
+			ret = -ESRCH;
+			goto out_err;
+		}
+
+		break;
+	}
+
+	case IPVS_CMD_GET_CONFIG:
+	{
+		struct ip_vs_timeout_user t;
+
+		__ip_vs_get_timeouts(&t);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
+		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
+			    t.tcp_fin_timeout);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+		NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
+#endif
+
+		break;
+	}
+
+	case IPVS_CMD_GET_INFO:
+		NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
+		NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
+			    IP_VS_CONN_TAB_SIZE);
+		break;
+	}
+
+	genlmsg_end(msg, reply);
+	ret = genlmsg_unicast(msg, info->snd_pid);
+	goto out;
+
+nla_put_failure:
+	IP_VS_ERR("not enough space in Netlink message\n");
+	ret = -EMSGSIZE;
+
+out_err:
+	nlmsg_free(msg);
+out:
+	mutex_unlock(&__ip_vs_mutex);
+
+	return ret;
+}
+
+
+static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
+	{
+		.cmd	= IPVS_CMD_NEW_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_SERVICE,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+		.dumpit	= ip_vs_genl_dump_services,
+		.policy	= ip_vs_cmd_policy,
+	},
+	{
+		.cmd	= IPVS_CMD_NEW_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.dumpit	= ip_vs_genl_dump_dests,
+	},
+	{
+		.cmd	= IPVS_CMD_NEW_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.dumpit	= ip_vs_genl_dump_daemons,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_CONFIG,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_CONFIG,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_INFO,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_ZERO,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_FLUSH,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+};
+
+static int __init ip_vs_genl_register(void)
+{
+	int ret, i;
+
+	ret = genl_register_family(&ip_vs_genl_family);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) {
+		ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]);
+		if (ret)
+			goto err_out;
+	}
+	return 0;
+
+err_out:
+	genl_unregister_family(&ip_vs_genl_family);
+	return ret;
+}
+
+static void ip_vs_genl_unregister(void)
+{
+	genl_unregister_family(&ip_vs_genl_family);
+}
+
+/* End of Generic Netlink interface definitions */
+
 
 int __init ip_vs_control_init(void)
 {
@@ -2334,6 +3204,13 @@ int __init ip_vs_control_init(void)
 		return ret;
 	}
 
+	ret = ip_vs_genl_register();
+	if (ret) {
+		IP_VS_ERR("cannot register Generic Netlink interface.\n");
+		nf_unregister_sockopt(&ip_vs_sockopts);
+		return ret;
+	}
+
 	proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
 	proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
 
@@ -2368,6 +3245,7 @@ void ip_vs_control_cleanup(void)
 	unregister_sysctl_table(sysctl_header);
 	proc_net_remove(&init_net, "ip_vs_stats");
 	proc_net_remove(&init_net, "ip_vs");
+	ip_vs_genl_unregister();
 	nf_unregister_sockopt(&ip_vs_sockopts);
 	LeaveFunction(2);
 }
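Note on usage: the hunks above register a generic netlink family whose id is assigned dynamically (GENL_ID_GENERATE), so user space must resolve it by name before issuing commands. The fragment below is an illustrative sketch of querying IPVS_CMD_GET_INFO with libnl 1.x — it is not part of the patch; the family name "IPVS" (IPVS_GENL_NAME) and the IPVS_* constants are assumed to come from the ip_vs.h header that accompanies this series, and error handling is minimal.

	#include <netlink/netlink.h>
	#include <netlink/genl/genl.h>
	#include <netlink/genl/ctrl.h>
	/* IPVS_CMD_* / IPVS_GENL_VERSION assumed to come from the
	 * userspace ip_vs.h shipped with this patch series */

	int ipvs_query_info(void)
	{
		struct nl_handle *sk = nl_handle_alloc();
		struct nl_msg *msg;
		int family, err = -1;

		if (!sk || genl_connect(sk) < 0)
			goto out;

		/* resolve the dynamically assigned family id by name */
		family = genl_ctrl_resolve(sk, "IPVS");
		if (family < 0)
			goto out;

		msg = nlmsg_alloc();
		if (!msg)
			goto out;

		/* IPVS_CMD_GET_INFO takes no attributes; the kernel replies
		 * with IPVS_CMD_SET_INFO carrying version and table size */
		genlmsg_put(msg, NL_AUTO_PID, NL_AUTO_SEQ, family, 0, 0,
			    IPVS_CMD_GET_INFO, IPVS_GENL_VERSION);
		err = nl_send_auto_complete(sk, msg);
		nlmsg_free(msg);
	out:
		if (sk)
			nl_handle_free(sk);
		return err;
	}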
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
index 5a20f93bd7f9..4fb620ec2086 100644
--- a/net/ipv4/ipvs/ip_vs_est.c
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -124,8 +124,6 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats)
 	est->outbps = stats->outbps<<5;
 
 	spin_lock_bh(&est_lock);
-	if (list_empty(&est_list))
-		mod_timer(&est_timer, jiffies + 2 * HZ);
 	list_add(&est->list, &est_list);
 	spin_unlock_bh(&est_lock);
 }
@@ -136,11 +134,6 @@ void ip_vs_kill_estimator(struct ip_vs_stats *stats)
 
 	spin_lock_bh(&est_lock);
 	list_del(&est->list);
-	while (list_empty(&est_list) && try_to_del_timer_sync(&est_timer) < 0) {
-		spin_unlock_bh(&est_lock);
-		cpu_relax();
-		spin_lock_bh(&est_lock);
-	}
 	spin_unlock_bh(&est_lock);
 }
 
@@ -160,3 +153,14 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats)
 	est->inbps = 0;
 	est->outbps = 0;
 }
+
+int __init ip_vs_estimator_init(void)
+{
+	mod_timer(&est_timer, jiffies + 2 * HZ);
+	return 0;
+}
+
+void ip_vs_estimator_cleanup(void)
+{
+	del_timer_sync(&est_timer);
+}
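With the lazy start/stop logic removed above, the estimation timer simply runs for the whole lifetime of the module: ip_vs_estimator_init() arms it once, the handler re-arms itself every 2 seconds, and ip_vs_estimator_cleanup() stops it on unload. This trades a few idle wakeups when no estimators exist for the removal of the try_to_del_timer_sync()/cpu_relax() dance in the old add/kill paths. A condensed sketch of the self-rearming shape (illustrative, using the 4-argument DEFINE_TIMER of this kernel generation; the handler body is elided):

	static void estimation_timer(unsigned long arg);
	static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);

	static void estimation_timer(unsigned long arg)
	{
		/* walk est_list under est_lock, update the rate estimates */
		mod_timer(&est_timer, jiffies + 2 * HZ);	/* 2s period */
	}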
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
index 7a6a319f544a..d2a43aa3fe4c 100644
--- a/net/ipv4/ipvs/ip_vs_lblc.c
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -96,7 +96,6 @@ struct ip_vs_lblc_entry {
  *     IPVS lblc hash table
  */
 struct ip_vs_lblc_table {
-	rwlock_t		lock;		/* lock for this table */
 	struct list_head	bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
 	atomic_t		entries;	/* number of entries */
 	int			max_size;	/* maximum size of entries */
@@ -123,31 +122,6 @@ static ctl_table vs_vars_table[] = {
 
 static struct ctl_table_header * sysctl_header;
 
-/*
- *      new/free a ip_vs_lblc_entry, which is a mapping of a destionation
- *      IP address to a server.
- */
-static inline struct ip_vs_lblc_entry *
-ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest)
-{
-	struct ip_vs_lblc_entry *en;
-
-	en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
-	if (en == NULL) {
-		IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
-		return NULL;
-	}
-
-	INIT_LIST_HEAD(&en->list);
-	en->addr = daddr;
-
-	atomic_inc(&dest->refcnt);
-	en->dest = dest;
-
-	return en;
-}
-
-
 static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
 {
 	list_del(&en->list);
@@ -173,55 +147,66 @@ static inline unsigned ip_vs_lblc_hashkey(__be32 addr)
  *	Hash an entry in the ip_vs_lblc_table.
  *	returns bool success.
  */
-static int
+static void
 ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
 {
-	unsigned hash;
-
-	if (!list_empty(&en->list)) {
-		IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
-			  "called from %p\n", __builtin_return_address(0));
-		return 0;
-	}
+	unsigned hash = ip_vs_lblc_hashkey(en->addr);
 
-	/*
-	 *	Hash by destination IP address
-	 */
-	hash = ip_vs_lblc_hashkey(en->addr);
-
-	write_lock(&tbl->lock);
 	list_add(&en->list, &tbl->bucket[hash]);
 	atomic_inc(&tbl->entries);
-	write_unlock(&tbl->lock);
-
-	return 1;
 }
 
 
 /*
- *  Get ip_vs_lblc_entry associated with supplied parameters.
+ *  Get ip_vs_lblc_entry associated with supplied parameters. Called under read
+ *  lock
  */
 static inline struct ip_vs_lblc_entry *
 ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
 {
-	unsigned hash;
+	unsigned hash = ip_vs_lblc_hashkey(addr);
 	struct ip_vs_lblc_entry *en;
 
-	hash = ip_vs_lblc_hashkey(addr);
-
-	read_lock(&tbl->lock);
-
-	list_for_each_entry(en, &tbl->bucket[hash], list) {
-		if (en->addr == addr) {
-			/* HIT */
-			read_unlock(&tbl->lock);
-			return en;
-		}
-	}
+	list_for_each_entry(en, &tbl->bucket[hash], list)
+		if (en->addr == addr)
+			return en;
 
-	read_unlock(&tbl->lock);
+	return NULL;
+}
+
+
+/*
+ * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
+ * address to a server. Called under write lock.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr,
+	       struct ip_vs_dest *dest)
+{
+	struct ip_vs_lblc_entry *en;
+
+	en = ip_vs_lblc_get(tbl, daddr);
+	if (!en) {
+		en = kmalloc(sizeof(*en), GFP_ATOMIC);
+		if (!en) {
+			IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
+			return NULL;
+		}
 
-	return NULL;
+		en->addr = daddr;
+		en->lastuse = jiffies;
+
+		atomic_inc(&dest->refcnt);
+		en->dest = dest;
+
+		ip_vs_lblc_hash(tbl, en);
+	} else if (en->dest != dest) {
+		atomic_dec(&en->dest->refcnt);
+		atomic_inc(&dest->refcnt);
+		en->dest = dest;
+	}
+
+	return en;
 }
 
 
@@ -230,30 +215,29 @@ ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr)
  */
 static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
 {
-	int i;
 	struct ip_vs_lblc_entry *en, *nxt;
+	int i;
 
 	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
-		write_lock(&tbl->lock);
 		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
 			ip_vs_lblc_free(en);
 			atomic_dec(&tbl->entries);
 		}
-		write_unlock(&tbl->lock);
 	}
 }
 
 
-static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
+static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
 {
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+	struct ip_vs_lblc_entry *en, *nxt;
 	unsigned long now = jiffies;
 	int i, j;
-	struct ip_vs_lblc_entry *en, *nxt;
 
 	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
 
-		write_lock(&tbl->lock);
+		write_lock(&svc->sched_lock);
 		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
 			if (time_before(now,
 					en->lastuse + sysctl_ip_vs_lblc_expiration))
@@ -262,7 +246,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
 			ip_vs_lblc_free(en);
 			atomic_dec(&tbl->entries);
 		}
-		write_unlock(&tbl->lock);
+		write_unlock(&svc->sched_lock);
 	}
 	tbl->rover = j;
 }
@@ -281,17 +265,16 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
  */
 static void ip_vs_lblc_check_expire(unsigned long data)
 {
-	struct ip_vs_lblc_table *tbl;
+	struct ip_vs_service *svc = (struct ip_vs_service *) data;
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
 	unsigned long now = jiffies;
 	int goal;
 	int i, j;
 	struct ip_vs_lblc_entry *en, *nxt;
 
-	tbl = (struct ip_vs_lblc_table *)data;
-
 	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
 		/* do full expiration check */
-		ip_vs_lblc_full_check(tbl);
+		ip_vs_lblc_full_check(svc);
 		tbl->counter = 1;
 		goto out;
 	}
@@ -308,7 +291,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
 	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
 		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
 
-		write_lock(&tbl->lock);
+		write_lock(&svc->sched_lock);
 		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
 			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
 				continue;
@@ -317,7 +300,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
 			atomic_dec(&tbl->entries);
 			goal--;
 		}
-		write_unlock(&tbl->lock);
+		write_unlock(&svc->sched_lock);
 		if (goal <= 0)
 			break;
 	}
@@ -336,15 +319,14 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 	/*
 	 *    Allocate the ip_vs_lblc_table for this service
 	 */
-	tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
+	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
 	if (tbl == NULL) {
 		IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
 		return -ENOMEM;
 	}
 	svc->sched_data = tbl;
 	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
-		  "current service\n",
-		  sizeof(struct ip_vs_lblc_table));
+		  "current service\n", sizeof(*tbl));
 
 	/*
 	 *    Initialize the hash buckets
@@ -352,7 +334,6 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
 		INIT_LIST_HEAD(&tbl->bucket[i]);
 	}
-	rwlock_init(&tbl->lock);
 	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
 	tbl->rover = 0;
 	tbl->counter = 1;
@@ -361,9 +342,8 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 	 *    Hook periodic timer for garbage collection
 	 */
 	setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
-			(unsigned long)tbl);
-	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
-	add_timer(&tbl->periodic_timer);
+			(unsigned long)svc);
+	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
 
 	return 0;
 }
@@ -380,22 +360,16 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
 	ip_vs_lblc_flush(tbl);
 
 	/* release the table itself */
-	kfree(svc->sched_data);
+	kfree(tbl);
 	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
-		  sizeof(struct ip_vs_lblc_table));
+		  sizeof(*tbl));
 
 	return 0;
 }
 
 
-static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
-{
-	return 0;
-}
-
-
 static inline struct ip_vs_dest *
-__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
 {
 	struct ip_vs_dest *dest, *least;
 	int loh, doh;
@@ -484,46 +458,54 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
 static struct ip_vs_dest *
 ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 {
-	struct ip_vs_dest *dest;
-	struct ip_vs_lblc_table *tbl;
-	struct ip_vs_lblc_entry *en;
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
 	struct iphdr *iph = ip_hdr(skb);
+	struct ip_vs_dest *dest = NULL;
+	struct ip_vs_lblc_entry *en;
 
 	IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
 
-	tbl = (struct ip_vs_lblc_table *)svc->sched_data;
+	/* First look in our cache */
+	read_lock(&svc->sched_lock);
 	en = ip_vs_lblc_get(tbl, iph->daddr);
-	if (en == NULL) {
-		dest = __ip_vs_wlc_schedule(svc, iph);
-		if (dest == NULL) {
-			IP_VS_DBG(1, "no destination available\n");
-			return NULL;
-		}
-		en = ip_vs_lblc_new(iph->daddr, dest);
-		if (en == NULL) {
-			return NULL;
-		}
-		ip_vs_lblc_hash(tbl, en);
-	} else {
-		dest = en->dest;
-		if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
-		    || atomic_read(&dest->weight) <= 0
-		    || is_overloaded(dest, svc)) {
-			dest = __ip_vs_wlc_schedule(svc, iph);
-			if (dest == NULL) {
-				IP_VS_DBG(1, "no destination available\n");
-				return NULL;
-			}
-			atomic_dec(&en->dest->refcnt);
-			atomic_inc(&dest->refcnt);
-			en->dest = dest;
-		}
+	if (en) {
+		/* We only hold a read lock, but this is atomic */
+		en->lastuse = jiffies;
+
+		/*
+		 * If the destination is not available, i.e. it's in the trash,
+		 * we must ignore it, as it may be removed from under our feet,
+		 * if someone drops our reference count. Our caller only makes
+		 * sure that destinations, that are not in the trash, are not
+		 * moved to the trash, while we are scheduling. But anyone can
+		 * free up entries from the trash at any time.
+		 */
+
+		if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
+			dest = en->dest;
+	}
+	read_unlock(&svc->sched_lock);
+
+	/* If the destination has a weight and is not overloaded, use it */
+	if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+		goto out;
+
+	/* No cache entry or it is invalid, time to schedule */
+	dest = __ip_vs_lblc_schedule(svc, iph);
+	if (!dest) {
+		IP_VS_DBG(1, "no destination available\n");
+		return NULL;
 	}
-	en->lastuse = jiffies;
 
+	/* If we fail to create a cache entry, we'll just use the valid dest */
+	write_lock(&svc->sched_lock);
+	ip_vs_lblc_new(tbl, iph->daddr, dest);
+	write_unlock(&svc->sched_lock);
+
+out:
 	IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
 		  "--> server %u.%u.%u.%u:%d\n",
-		  NIPQUAD(en->addr),
+		  NIPQUAD(iph->daddr),
 		  NIPQUAD(dest->addr),
 		  ntohs(dest->port));
 
@@ -542,7 +524,6 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler =
 	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
 	.init_service =		ip_vs_lblc_init_svc,
 	.done_service =		ip_vs_lblc_done_svc,
-	.update_service =	ip_vs_lblc_update_svc,
 	.schedule =		ip_vs_lblc_schedule,
 };
 
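The scheduler rewrite above turns the lblc table into a read-mostly cache guarded by the per-service svc->sched_lock (replacing the table-private rwlock): the hot path takes only the read lock, and the write lock is held just long enough to insert or refresh an entry after a miss. Condensed to its core, as a sketch (this compresses the hunk above and drops the weight/overload and NULL checks for brevity):

	read_lock(&svc->sched_lock);
	en = ip_vs_lblc_get(tbl, iph->daddr);	/* cache hit: cheap */
	if (en && (en->dest->flags & IP_VS_DEST_F_AVAILABLE))
		dest = en->dest;
	read_unlock(&svc->sched_lock);

	if (!dest) {				/* miss or stale entry */
		dest = __ip_vs_lblc_schedule(svc, iph);
		write_lock(&svc->sched_lock);
		ip_vs_lblc_new(tbl, iph->daddr, dest);	/* insert/update */
		write_unlock(&svc->sched_lock);
	}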
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
index c234e73968a6..375a1ffb6b65 100644
--- a/net/ipv4/ipvs/ip_vs_lblcr.c
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
 		return NULL;
 	}
 
-	e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
+	e = kmalloc(sizeof(*e), GFP_ATOMIC);
 	if (e == NULL) {
 		IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
 		return NULL;
@@ -116,11 +116,9 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
 	e->dest = dest;
 
 	/* link it to the list */
-	write_lock(&set->lock);
 	e->next = set->list;
 	set->list = e;
 	atomic_inc(&set->size);
-	write_unlock(&set->lock);
 
 	set->lastmod = jiffies;
 	return e;
@@ -131,7 +129,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
 {
 	struct ip_vs_dest_list *e, **ep;
 
-	write_lock(&set->lock);
 	for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
 		if (e->dest == dest) {
 			/* HIT */
@@ -144,7 +141,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
 		}
 		ep = &e->next;
 	}
-	write_unlock(&set->lock);
 }
 
 static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
@@ -174,7 +170,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
 	if (set == NULL)
 		return NULL;
 
-	read_lock(&set->lock);
 	/* select the first destination server, whose weight > 0 */
 	for (e=set->list; e!=NULL; e=e->next) {
 		least = e->dest;
@@ -188,7 +183,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
 			goto nextstage;
 		}
 	}
-	read_unlock(&set->lock);
 	return NULL;
 
 	/* find the destination with the weighted least load */
@@ -207,7 +201,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
 			loh = doh;
 		}
 	}
-	read_unlock(&set->lock);
 
 	IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
 		  "activeconns %d refcnt %d weight %d overhead %d\n",
@@ -229,7 +222,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
 	if (set == NULL)
 		return NULL;
 
-	read_lock(&set->lock);
 	/* select the first destination server, whose weight > 0 */
 	for (e=set->list; e!=NULL; e=e->next) {
 		most = e->dest;
@@ -239,7 +231,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
 			goto nextstage;
 		}
 	}
-	read_unlock(&set->lock);
 	return NULL;
 
 	/* find the destination with the weighted most load */
@@ -256,7 +247,6 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
 			moh = doh;
 		}
 	}
-	read_unlock(&set->lock);
 
 	IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
 		  "activeconns %d refcnt %d weight %d overhead %d\n",
@@ -284,7 +274,6 @@ struct ip_vs_lblcr_entry {
  * IPVS lblcr hash table
  */
 struct ip_vs_lblcr_table {
-	rwlock_t		lock;		/* lock for this table */
 	struct list_head	bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
 	atomic_t		entries;	/* number of entries */
 	int			max_size;	/* maximum size of entries */
@@ -311,32 +300,6 @@ static ctl_table vs_vars_table[] = {
 
 static struct ctl_table_header * sysctl_header;
 
-/*
- *      new/free a ip_vs_lblcr_entry, which is a mapping of a destination
- *      IP address to a server.
- */
-static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr)
-{
-	struct ip_vs_lblcr_entry *en;
-
-	en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
-	if (en == NULL) {
-		IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
-		return NULL;
-	}
-
-	INIT_LIST_HEAD(&en->list);
-	en->addr = daddr;
-
-	/* initilize its dest set */
-	atomic_set(&(en->set.size), 0);
-	en->set.list = NULL;
-	rwlock_init(&en->set.lock);
-
-	return en;
-}
-
-
 static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
 {
 	list_del(&en->list);
@@ -358,55 +321,68 @@ static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
358 * Hash an entry in the ip_vs_lblcr_table. 321 * Hash an entry in the ip_vs_lblcr_table.
359 * returns bool success. 322 * returns bool success.
360 */ 323 */
361static int 324static void
362ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) 325ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
363{ 326{
364 unsigned hash; 327 unsigned hash = ip_vs_lblcr_hashkey(en->addr);
365
366 if (!list_empty(&en->list)) {
367 IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
368 "called from %p\n", __builtin_return_address(0));
369 return 0;
370 }
371 328
372 /*
373 * Hash by destination IP address
374 */
375 hash = ip_vs_lblcr_hashkey(en->addr);
376
377 write_lock(&tbl->lock);
378 list_add(&en->list, &tbl->bucket[hash]); 329 list_add(&en->list, &tbl->bucket[hash]);
379 atomic_inc(&tbl->entries); 330 atomic_inc(&tbl->entries);
380 write_unlock(&tbl->lock);
381
382 return 1;
383} 331}
384 332
385 333
386/* 334/*
387 * Get ip_vs_lblcr_entry associated with supplied parameters. 335 * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
336 * read lock.
388 */ 337 */
389static inline struct ip_vs_lblcr_entry * 338static inline struct ip_vs_lblcr_entry *
390ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) 339ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
391{ 340{
392 unsigned hash; 341 unsigned hash = ip_vs_lblcr_hashkey(addr);
393 struct ip_vs_lblcr_entry *en; 342 struct ip_vs_lblcr_entry *en;
394 343
395 hash = ip_vs_lblcr_hashkey(addr); 344 list_for_each_entry(en, &tbl->bucket[hash], list)
345 if (en->addr == addr)
346 return en;
396 347
397 read_lock(&tbl->lock); 348 return NULL;
349}
398 350
399 list_for_each_entry(en, &tbl->bucket[hash], list) { 351
400 if (en->addr == addr) { 352/*
401 /* HIT */ 353 * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
402 read_unlock(&tbl->lock); 354 * IP address to a server. Called under write lock.
403 return en; 355 */
356static inline struct ip_vs_lblcr_entry *
357ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr,
358 struct ip_vs_dest *dest)
359{
360 struct ip_vs_lblcr_entry *en;
361
362 en = ip_vs_lblcr_get(tbl, daddr);
363 if (!en) {
364 en = kmalloc(sizeof(*en), GFP_ATOMIC);
365 if (!en) {
366 IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
367 return NULL;
404 } 368 }
369
370 en->addr = daddr;
371 en->lastuse = jiffies;
372
 373 /* initialize its dest set */
374 atomic_set(&(en->set.size), 0);
375 en->set.list = NULL;
376 rwlock_init(&en->set.lock);
377
378 ip_vs_lblcr_hash(tbl, en);
405 } 379 }
406 380
407 read_unlock(&tbl->lock); 381 write_lock(&en->set.lock);
382 ip_vs_dest_set_insert(&en->set, dest);
383 write_unlock(&en->set.lock);
408 384
409 return NULL; 385 return en;
410} 386}
411 387
412 388
@@ -418,19 +394,18 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
418 int i; 394 int i;
419 struct ip_vs_lblcr_entry *en, *nxt; 395 struct ip_vs_lblcr_entry *en, *nxt;
420 396
397 /* No locking required, only called during cleanup. */
421 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { 398 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
422 write_lock(&tbl->lock);
423 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { 399 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
424 ip_vs_lblcr_free(en); 400 ip_vs_lblcr_free(en);
425 atomic_dec(&tbl->entries);
426 } 401 }
427 write_unlock(&tbl->lock);
428 } 402 }
429} 403}
430 404
431 405
432static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) 406static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
433{ 407{
408 struct ip_vs_lblcr_table *tbl = svc->sched_data;
434 unsigned long now = jiffies; 409 unsigned long now = jiffies;
435 int i, j; 410 int i, j;
436 struct ip_vs_lblcr_entry *en, *nxt; 411 struct ip_vs_lblcr_entry *en, *nxt;
@@ -438,7 +413,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
438 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { 413 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
439 j = (j + 1) & IP_VS_LBLCR_TAB_MASK; 414 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
440 415
441 write_lock(&tbl->lock); 416 write_lock(&svc->sched_lock);
442 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 417 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
443 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, 418 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
444 now)) 419 now))
@@ -447,7 +422,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
447 ip_vs_lblcr_free(en); 422 ip_vs_lblcr_free(en);
448 atomic_dec(&tbl->entries); 423 atomic_dec(&tbl->entries);
449 } 424 }
450 write_unlock(&tbl->lock); 425 write_unlock(&svc->sched_lock);
451 } 426 }
452 tbl->rover = j; 427 tbl->rover = j;
453} 428}
@@ -466,17 +441,16 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
466 */ 441 */
467static void ip_vs_lblcr_check_expire(unsigned long data) 442static void ip_vs_lblcr_check_expire(unsigned long data)
468{ 443{
469 struct ip_vs_lblcr_table *tbl; 444 struct ip_vs_service *svc = (struct ip_vs_service *) data;
445 struct ip_vs_lblcr_table *tbl = svc->sched_data;
470 unsigned long now = jiffies; 446 unsigned long now = jiffies;
471 int goal; 447 int goal;
472 int i, j; 448 int i, j;
473 struct ip_vs_lblcr_entry *en, *nxt; 449 struct ip_vs_lblcr_entry *en, *nxt;
474 450
475 tbl = (struct ip_vs_lblcr_table *)data;
476
477 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { 451 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
478 /* do full expiration check */ 452 /* do full expiration check */
479 ip_vs_lblcr_full_check(tbl); 453 ip_vs_lblcr_full_check(svc);
480 tbl->counter = 1; 454 tbl->counter = 1;
481 goto out; 455 goto out;
482 } 456 }
@@ -493,7 +467,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
493 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { 467 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
494 j = (j + 1) & IP_VS_LBLCR_TAB_MASK; 468 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
495 469
496 write_lock(&tbl->lock); 470 write_lock(&svc->sched_lock);
497 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 471 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
498 if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) 472 if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
499 continue; 473 continue;
@@ -502,7 +476,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
502 atomic_dec(&tbl->entries); 476 atomic_dec(&tbl->entries);
503 goal--; 477 goal--;
504 } 478 }
505 write_unlock(&tbl->lock); 479 write_unlock(&svc->sched_lock);
506 if (goal <= 0) 480 if (goal <= 0)
507 break; 481 break;
508 } 482 }
@@ -520,15 +494,14 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
520 /* 494 /*
521 * Allocate the ip_vs_lblcr_table for this service 495 * Allocate the ip_vs_lblcr_table for this service
522 */ 496 */
523 tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); 497 tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
524 if (tbl == NULL) { 498 if (tbl == NULL) {
525 IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); 499 IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
526 return -ENOMEM; 500 return -ENOMEM;
527 } 501 }
528 svc->sched_data = tbl; 502 svc->sched_data = tbl;
529 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " 503 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
530 "current service\n", 504 "current service\n", sizeof(*tbl));
531 sizeof(struct ip_vs_lblcr_table));
532 505
533 /* 506 /*
534 * Initialize the hash buckets 507 * Initialize the hash buckets
@@ -536,7 +509,6 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
536 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { 509 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
537 INIT_LIST_HEAD(&tbl->bucket[i]); 510 INIT_LIST_HEAD(&tbl->bucket[i]);
538 } 511 }
539 rwlock_init(&tbl->lock);
540 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; 512 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
541 tbl->rover = 0; 513 tbl->rover = 0;
542 tbl->counter = 1; 514 tbl->counter = 1;
@@ -545,9 +517,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
545 * Hook periodic timer for garbage collection 517 * Hook periodic timer for garbage collection
546 */ 518 */
547 setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, 519 setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
548 (unsigned long)tbl); 520 (unsigned long)svc);
549 tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; 521 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
550 add_timer(&tbl->periodic_timer);
551 522
552 return 0; 523 return 0;
553} 524}
@@ -564,22 +535,16 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
564 ip_vs_lblcr_flush(tbl); 535 ip_vs_lblcr_flush(tbl);
565 536
566 /* release the table itself */ 537 /* release the table itself */
567 kfree(svc->sched_data); 538 kfree(tbl);
568 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", 539 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
569 sizeof(struct ip_vs_lblcr_table)); 540 sizeof(*tbl));
570 541
571 return 0; 542 return 0;
572} 543}
573 544
574 545
575static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
576{
577 return 0;
578}
579
580
581static inline struct ip_vs_dest * 546static inline struct ip_vs_dest *
582__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) 547__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
583{ 548{
584 struct ip_vs_dest *dest, *least; 549 struct ip_vs_dest *dest, *least;
585 int loh, doh; 550 int loh, doh;
@@ -669,50 +634,78 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
669static struct ip_vs_dest * 634static struct ip_vs_dest *
670ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) 635ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
671{ 636{
672 struct ip_vs_dest *dest; 637 struct ip_vs_lblcr_table *tbl = svc->sched_data;
673 struct ip_vs_lblcr_table *tbl;
674 struct ip_vs_lblcr_entry *en;
675 struct iphdr *iph = ip_hdr(skb); 638 struct iphdr *iph = ip_hdr(skb);
639 struct ip_vs_dest *dest = NULL;
640 struct ip_vs_lblcr_entry *en;
676 641
677 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); 642 IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
678 643
679 tbl = (struct ip_vs_lblcr_table *)svc->sched_data; 644 /* First look in our cache */
645 read_lock(&svc->sched_lock);
680 en = ip_vs_lblcr_get(tbl, iph->daddr); 646 en = ip_vs_lblcr_get(tbl, iph->daddr);
681 if (en == NULL) { 647 if (en) {
682 dest = __ip_vs_wlc_schedule(svc, iph); 648 /* We only hold a read lock, but this is atomic */
683 if (dest == NULL) { 649 en->lastuse = jiffies;
684 IP_VS_DBG(1, "no destination available\n"); 650
685 return NULL; 651 /* Get the least loaded destination */
686 } 652 read_lock(&en->set.lock);
687 en = ip_vs_lblcr_new(iph->daddr);
688 if (en == NULL) {
689 return NULL;
690 }
691 ip_vs_dest_set_insert(&en->set, dest);
692 ip_vs_lblcr_hash(tbl, en);
693 } else {
694 dest = ip_vs_dest_set_min(&en->set); 653 dest = ip_vs_dest_set_min(&en->set);
695 if (!dest || is_overloaded(dest, svc)) { 654 read_unlock(&en->set.lock);
696 dest = __ip_vs_wlc_schedule(svc, iph); 655
697 if (dest == NULL) { 656 /* More than one destination + enough time passed by, cleanup */
698 IP_VS_DBG(1, "no destination available\n");
699 return NULL;
700 }
701 ip_vs_dest_set_insert(&en->set, dest);
702 }
703 if (atomic_read(&en->set.size) > 1 && 657 if (atomic_read(&en->set.size) > 1 &&
704 jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { 658 time_after(jiffies, en->set.lastmod +
659 sysctl_ip_vs_lblcr_expiration)) {
705 struct ip_vs_dest *m; 660 struct ip_vs_dest *m;
661
662 write_lock(&en->set.lock);
706 m = ip_vs_dest_set_max(&en->set); 663 m = ip_vs_dest_set_max(&en->set);
707 if (m) 664 if (m)
708 ip_vs_dest_set_erase(&en->set, m); 665 ip_vs_dest_set_erase(&en->set, m);
666 write_unlock(&en->set.lock);
667 }
668
669 /* If the destination is not overloaded, use it */
670 if (dest && !is_overloaded(dest, svc)) {
671 read_unlock(&svc->sched_lock);
672 goto out;
709 } 673 }
674
675 /* The cache entry is invalid, time to schedule */
676 dest = __ip_vs_lblcr_schedule(svc, iph);
677 if (!dest) {
678 IP_VS_DBG(1, "no destination available\n");
679 read_unlock(&svc->sched_lock);
680 return NULL;
681 }
682
683 /* Update our cache entry */
684 write_lock(&en->set.lock);
685 ip_vs_dest_set_insert(&en->set, dest);
686 write_unlock(&en->set.lock);
687 }
688 read_unlock(&svc->sched_lock);
689
690 if (dest)
691 goto out;
692
693 /* No cache entry, time to schedule */
694 dest = __ip_vs_lblcr_schedule(svc, iph);
695 if (!dest) {
696 IP_VS_DBG(1, "no destination available\n");
697 return NULL;
710 } 698 }
711 en->lastuse = jiffies;
712 699
700 /* If we fail to create a cache entry, we'll just use the valid dest */
701 write_lock(&svc->sched_lock);
702 ip_vs_lblcr_new(tbl, iph->daddr, dest);
703 write_unlock(&svc->sched_lock);
704
705out:
713 IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " 706 IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
714 "--> server %u.%u.%u.%u:%d\n", 707 "--> server %u.%u.%u.%u:%d\n",
715 NIPQUAD(en->addr), 708 NIPQUAD(iph->daddr),
716 NIPQUAD(dest->addr), 709 NIPQUAD(dest->addr),
717 ntohs(dest->port)); 710 ntohs(dest->port));
718 711
@@ -731,7 +724,6 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
731 .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), 724 .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
732 .init_service = ip_vs_lblcr_init_svc, 725 .init_service = ip_vs_lblcr_init_svc,
733 .done_service = ip_vs_lblcr_done_svc, 726 .done_service = ip_vs_lblcr_done_svc,
734 .update_service = ip_vs_lblcr_update_svc,
735 .schedule = ip_vs_lblcr_schedule, 727 .schedule = ip_vs_lblcr_schedule,
736}; 728};
737 729
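The lblcr rework above drops the table's private rwlock in favour of the shared svc->sched_lock plus a per-entry lock on each destination set: the common case runs under a read lock, and the write lock is taken only when a cache entry has to be created or updated. A minimal userspace sketch of that lookup-then-insert discipline using POSIX rwlocks; struct cache, cache_get, cache_new, and schedule_one are illustrative stand-ins, not kernel API:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	uint32_t addr;
	struct entry *next;
};

struct cache {
	pthread_rwlock_t lock;	/* plays the role of svc->sched_lock */
	struct entry *head;
};

static struct entry *cache_get(struct cache *c, uint32_t addr)
{
	struct entry *e;

	for (e = c->head; e; e = e->next)
		if (e->addr == addr)
			return e;
	return NULL;
}

/* Create-or-update under the write lock, like ip_vs_lblcr_new(). */
static struct entry *cache_new(struct cache *c, uint32_t addr)
{
	struct entry *e = cache_get(c, addr);

	if (!e) {
		e = malloc(sizeof(*e));
		if (!e)
			return NULL;
		e->addr = addr;
		e->next = c->head;
		c->head = e;
	}
	return e;
}

static struct entry *schedule_one(struct cache *c, uint32_t daddr)
{
	struct entry *e;

	pthread_rwlock_rdlock(&c->lock);
	e = cache_get(c, daddr);
	pthread_rwlock_unlock(&c->lock);
	if (e)
		return e;	/* fast path: no write lock taken */

	pthread_rwlock_wrlock(&c->lock);
	e = cache_new(c, daddr);
	pthread_rwlock_unlock(&c->lock);
	return e;
}

int main(void)
{
	struct cache c = { .head = NULL };

	pthread_rwlock_init(&c.lock, NULL);
	printf("first:  %p\n", (void *)schedule_one(&c, 0x0a000001));
	printf("second: %p\n", (void *)schedule_one(&c, 0x0a000001));
	return 0;
}

Note that cache_new() repeats the lookup under the write lock, just as ip_vs_lblcr_new() calls ip_vs_lblcr_get() again; that re-check closes the race window between releasing the read lock and acquiring the write lock.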
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
index ebcdbf75ac65..2c3de1b63518 100644
--- a/net/ipv4/ipvs/ip_vs_lc.c
+++ b/net/ipv4/ipvs/ip_vs_lc.c
@@ -20,24 +20,6 @@
20#include <net/ip_vs.h> 20#include <net/ip_vs.h>
21 21
22 22
23static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
24{
25 return 0;
26}
27
28
29static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
30{
31 return 0;
32}
33
34
35static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
36{
37 return 0;
38}
39
40
41static inline unsigned int 23static inline unsigned int
42ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) 24ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
43{ 25{
@@ -99,9 +81,6 @@ static struct ip_vs_scheduler ip_vs_lc_scheduler = {
99 .refcnt = ATOMIC_INIT(0), 81 .refcnt = ATOMIC_INIT(0),
100 .module = THIS_MODULE, 82 .module = THIS_MODULE,
101 .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), 83 .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
102 .init_service = ip_vs_lc_init_svc,
103 .done_service = ip_vs_lc_done_svc,
104 .update_service = ip_vs_lc_update_svc,
105 .schedule = ip_vs_lc_schedule, 84 .schedule = ip_vs_lc_schedule,
106}; 85};
107 86
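The empty init_service/done_service/update_service stubs removed here (and from the nq, sed, and wlc schedulers below) suggest the scheduler core now treats those callbacks as optional and guards them with a NULL check before calling. A toy sketch of such a guard; bind_scheduler() and this struct scheduler are hypothetical names, not the actual ip_vs code:

#include <stdio.h>

struct scheduler {
	const char *name;
	int (*init_service)(void *svc);	/* optional hook */
};

static int bind_scheduler(struct scheduler *s, void *svc)
{
	/* Invoke the hook only when the scheduler provides one. */
	if (s->init_service)
		return s->init_service(svc);
	return 0;
}

int main(void)
{
	struct scheduler lc = { .name = "lc" };	/* no init_service */

	printf("bind %s -> %d\n", lc.name, bind_scheduler(&lc, NULL));
	return 0;
}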
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
index 92f3a6770031..5330d5a2de14 100644
--- a/net/ipv4/ipvs/ip_vs_nq.c
+++ b/net/ipv4/ipvs/ip_vs_nq.c
@@ -37,27 +37,6 @@
37#include <net/ip_vs.h> 37#include <net/ip_vs.h>
38 38
39 39
40static int
41ip_vs_nq_init_svc(struct ip_vs_service *svc)
42{
43 return 0;
44}
45
46
47static int
48ip_vs_nq_done_svc(struct ip_vs_service *svc)
49{
50 return 0;
51}
52
53
54static int
55ip_vs_nq_update_svc(struct ip_vs_service *svc)
56{
57 return 0;
58}
59
60
61static inline unsigned int 40static inline unsigned int
62ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) 41ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
63{ 42{
@@ -137,9 +116,6 @@ static struct ip_vs_scheduler ip_vs_nq_scheduler =
137 .refcnt = ATOMIC_INIT(0), 116 .refcnt = ATOMIC_INIT(0),
138 .module = THIS_MODULE, 117 .module = THIS_MODULE,
139 .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), 118 .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
140 .init_service = ip_vs_nq_init_svc,
141 .done_service = ip_vs_nq_done_svc,
142 .update_service = ip_vs_nq_update_svc,
143 .schedule = ip_vs_nq_schedule, 119 .schedule = ip_vs_nq_schedule,
144}; 120};
145 121
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
index 73e0ea87c1f5..3f9ebd7639ae 100644
--- a/net/ipv4/ipvs/ip_vs_proto_ah.c
+++ b/net/ipv4/ipvs/ip_vs_proto_ah_esp.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS 2 * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS
3 * 3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 4 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
5 * Wensong Zhang <wensong@linuxvirtualserver.org> 5 * Wensong Zhang <wensong@linuxvirtualserver.org>
@@ -39,11 +39,11 @@ struct isakmp_hdr {
39 39
40 40
41static struct ip_vs_conn * 41static struct ip_vs_conn *
42ah_conn_in_get(const struct sk_buff *skb, 42ah_esp_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp, 43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph, 44 const struct iphdr *iph,
45 unsigned int proto_off, 45 unsigned int proto_off,
46 int inverse) 46 int inverse)
47{ 47{
48 struct ip_vs_conn *cp; 48 struct ip_vs_conn *cp;
49 49
@@ -79,8 +79,8 @@ ah_conn_in_get(const struct sk_buff *skb,
79 79
80 80
81static struct ip_vs_conn * 81static struct ip_vs_conn *
82ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, 82ah_esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse) 83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{ 84{
85 struct ip_vs_conn *cp; 85 struct ip_vs_conn *cp;
86 86
@@ -112,12 +112,12 @@ ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
112 112
113 113
114static int 114static int
115ah_conn_schedule(struct sk_buff *skb, 115ah_esp_conn_schedule(struct sk_buff *skb,
116 struct ip_vs_protocol *pp, 116 struct ip_vs_protocol *pp,
117 int *verdict, struct ip_vs_conn **cpp) 117 int *verdict, struct ip_vs_conn **cpp)
118{ 118{
119 /* 119 /*
120 * AH is only related traffic. Pass the packet to IP stack. 120 * AH/ESP is only related traffic. Pass the packet to IP stack.
121 */ 121 */
122 *verdict = NF_ACCEPT; 122 *verdict = NF_ACCEPT;
123 return 0; 123 return 0;
@@ -125,8 +125,8 @@ ah_conn_schedule(struct sk_buff *skb,
125 125
126 126
127static void 127static void
128ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, 128ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
129 int offset, const char *msg) 129 int offset, const char *msg)
130{ 130{
131 char buf[256]; 131 char buf[256];
132 struct iphdr _iph, *ih; 132 struct iphdr _iph, *ih;
@@ -143,28 +143,29 @@ ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
143} 143}
144 144
145 145
146static void ah_init(struct ip_vs_protocol *pp) 146static void ah_esp_init(struct ip_vs_protocol *pp)
147{ 147{
148 /* nothing to do now */ 148 /* nothing to do now */
149} 149}
150 150
151 151
152static void ah_exit(struct ip_vs_protocol *pp) 152static void ah_esp_exit(struct ip_vs_protocol *pp)
153{ 153{
154 /* nothing to do now */ 154 /* nothing to do now */
155} 155}
156 156
157 157
158#ifdef CONFIG_IP_VS_PROTO_AH
158struct ip_vs_protocol ip_vs_protocol_ah = { 159struct ip_vs_protocol ip_vs_protocol_ah = {
159 .name = "AH", 160 .name = "AH",
160 .protocol = IPPROTO_AH, 161 .protocol = IPPROTO_AH,
161 .num_states = 1, 162 .num_states = 1,
162 .dont_defrag = 1, 163 .dont_defrag = 1,
163 .init = ah_init, 164 .init = ah_esp_init,
164 .exit = ah_exit, 165 .exit = ah_esp_exit,
165 .conn_schedule = ah_conn_schedule, 166 .conn_schedule = ah_esp_conn_schedule,
166 .conn_in_get = ah_conn_in_get, 167 .conn_in_get = ah_esp_conn_in_get,
167 .conn_out_get = ah_conn_out_get, 168 .conn_out_get = ah_esp_conn_out_get,
168 .snat_handler = NULL, 169 .snat_handler = NULL,
169 .dnat_handler = NULL, 170 .dnat_handler = NULL,
170 .csum_check = NULL, 171 .csum_check = NULL,
@@ -172,7 +173,31 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
172 .register_app = NULL, 173 .register_app = NULL,
173 .unregister_app = NULL, 174 .unregister_app = NULL,
174 .app_conn_bind = NULL, 175 .app_conn_bind = NULL,
175 .debug_packet = ah_debug_packet, 176 .debug_packet = ah_esp_debug_packet,
176 .timeout_change = NULL, /* ISAKMP */ 177 .timeout_change = NULL, /* ISAKMP */
177 .set_state_timeout = NULL, 178 .set_state_timeout = NULL,
178}; 179};
180#endif
181
182#ifdef CONFIG_IP_VS_PROTO_ESP
183struct ip_vs_protocol ip_vs_protocol_esp = {
184 .name = "ESP",
185 .protocol = IPPROTO_ESP,
186 .num_states = 1,
187 .dont_defrag = 1,
188 .init = ah_esp_init,
189 .exit = ah_esp_exit,
190 .conn_schedule = ah_esp_conn_schedule,
191 .conn_in_get = ah_esp_conn_in_get,
192 .conn_out_get = ah_esp_conn_out_get,
193 .snat_handler = NULL,
194 .dnat_handler = NULL,
195 .csum_check = NULL,
196 .state_transition = NULL,
197 .register_app = NULL,
198 .unregister_app = NULL,
199 .app_conn_bind = NULL,
200 .debug_packet = ah_esp_debug_packet,
201 .timeout_change = NULL, /* ISAKMP */
202};
203#endif
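After the merge, a single source file carries both protocol definitions and gates each behind its own Kconfig symbol, while the ah_esp_* handlers are shared. A toy illustration of the layout; compile with -DCONFIG_IP_VS_PROTO_AH and/or -DCONFIG_IP_VS_PROTO_ESP to pull in the matching instance (the IPPROTO numbers, 51 for AH and 50 for ESP, are real; the rest is illustrative):

#include <stdio.h>

struct proto {
	const char *name;
	int protocol;
};

#ifdef CONFIG_IP_VS_PROTO_AH
static struct proto proto_ah = { "AH", 51 };	/* IPPROTO_AH */
#endif

#ifdef CONFIG_IP_VS_PROTO_ESP
static struct proto proto_esp = { "ESP", 50 };	/* IPPROTO_ESP */
#endif

int main(void)
{
#ifdef CONFIG_IP_VS_PROTO_AH
	printf("%s -> IP protocol %d\n", proto_ah.name, proto_ah.protocol);
#endif
#ifdef CONFIG_IP_VS_PROTO_ESP
	printf("%s -> IP protocol %d\n", proto_esp.name, proto_esp.protocol);
#endif
	return 0;
}

This mirrors how the Kconfig change routes both IP_VS_PROTO_AH and IP_VS_PROTO_ESP through the new IP_VS_PROTO_AH_ESP symbol that builds the shared object file.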
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
deleted file mode 100644
index 21d70c8ffa54..000000000000
--- a/net/ipv4/ipvs/ip_vs_proto_esp.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS
3 *
4 * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
5 * Wensong Zhang <wensong@linuxvirtualserver.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation;
10 *
11 */
12
13#include <linux/in.h>
14#include <linux/ip.h>
15#include <linux/module.h>
16#include <linux/kernel.h>
17#include <linux/netfilter.h>
18#include <linux/netfilter_ipv4.h>
19
20#include <net/ip_vs.h>
21
22
23/* TODO:
24
25struct isakmp_hdr {
26 __u8 icookie[8];
27 __u8 rcookie[8];
28 __u8 np;
29 __u8 version;
30 __u8 xchgtype;
31 __u8 flags;
32 __u32 msgid;
33 __u32 length;
34};
35
36*/
37
38#define PORT_ISAKMP 500
39
40
41static struct ip_vs_conn *
42esp_conn_in_get(const struct sk_buff *skb,
43 struct ip_vs_protocol *pp,
44 const struct iphdr *iph,
45 unsigned int proto_off,
46 int inverse)
47{
48 struct ip_vs_conn *cp;
49
50 if (likely(!inverse)) {
51 cp = ip_vs_conn_in_get(IPPROTO_UDP,
52 iph->saddr,
53 htons(PORT_ISAKMP),
54 iph->daddr,
55 htons(PORT_ISAKMP));
56 } else {
57 cp = ip_vs_conn_in_get(IPPROTO_UDP,
58 iph->daddr,
59 htons(PORT_ISAKMP),
60 iph->saddr,
61 htons(PORT_ISAKMP));
62 }
63
64 if (!cp) {
65 /*
66 * We are not sure if the packet is from our
67 * service, so our conn_schedule hook should return NF_ACCEPT
68 */
69 IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
70 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
71 inverse ? "ICMP+" : "",
72 pp->name,
73 NIPQUAD(iph->saddr),
74 NIPQUAD(iph->daddr));
75 }
76
77 return cp;
78}
79
80
81static struct ip_vs_conn *
82esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 const struct iphdr *iph, unsigned int proto_off, int inverse)
84{
85 struct ip_vs_conn *cp;
86
87 if (likely(!inverse)) {
88 cp = ip_vs_conn_out_get(IPPROTO_UDP,
89 iph->saddr,
90 htons(PORT_ISAKMP),
91 iph->daddr,
92 htons(PORT_ISAKMP));
93 } else {
94 cp = ip_vs_conn_out_get(IPPROTO_UDP,
95 iph->daddr,
96 htons(PORT_ISAKMP),
97 iph->saddr,
98 htons(PORT_ISAKMP));
99 }
100
101 if (!cp) {
102 IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
103 "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
104 inverse ? "ICMP+" : "",
105 pp->name,
106 NIPQUAD(iph->saddr),
107 NIPQUAD(iph->daddr));
108 }
109
110 return cp;
111}
112
113
114static int
115esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
116 int *verdict, struct ip_vs_conn **cpp)
117{
118 /*
119 * ESP is only related traffic. Pass the packet to IP stack.
120 */
121 *verdict = NF_ACCEPT;
122 return 0;
123}
124
125
126static void
127esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
128 int offset, const char *msg)
129{
130 char buf[256];
131 struct iphdr _iph, *ih;
132
133 ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
134 if (ih == NULL)
135 sprintf(buf, "%s TRUNCATED", pp->name);
136 else
137 sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
138 pp->name, NIPQUAD(ih->saddr),
139 NIPQUAD(ih->daddr));
140
141 printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
142}
143
144
145static void esp_init(struct ip_vs_protocol *pp)
146{
147 /* nothing to do now */
148}
149
150
151static void esp_exit(struct ip_vs_protocol *pp)
152{
153 /* nothing to do now */
154}
155
156
157struct ip_vs_protocol ip_vs_protocol_esp = {
158 .name = "ESP",
159 .protocol = IPPROTO_ESP,
160 .num_states = 1,
161 .dont_defrag = 1,
162 .init = esp_init,
163 .exit = esp_exit,
164 .conn_schedule = esp_conn_schedule,
165 .conn_in_get = esp_conn_in_get,
166 .conn_out_get = esp_conn_out_get,
167 .snat_handler = NULL,
168 .dnat_handler = NULL,
169 .csum_check = NULL,
170 .state_transition = NULL,
171 .register_app = NULL,
172 .unregister_app = NULL,
173 .app_conn_bind = NULL,
174 .debug_packet = esp_debug_packet,
175 .timeout_change = NULL, /* ISAKMP */
176};
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
index 358110d17e59..f74929117534 100644
--- a/net/ipv4/ipvs/ip_vs_rr.c
+++ b/net/ipv4/ipvs/ip_vs_rr.c
@@ -32,12 +32,6 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
32} 32}
33 33
34 34
35static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
36{
37 return 0;
38}
39
40
41static int ip_vs_rr_update_svc(struct ip_vs_service *svc) 35static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
42{ 36{
43 svc->sched_data = &svc->destinations; 37 svc->sched_data = &svc->destinations;
@@ -96,7 +90,6 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = {
96 .module = THIS_MODULE, 90 .module = THIS_MODULE,
97 .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), 91 .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
98 .init_service = ip_vs_rr_init_svc, 92 .init_service = ip_vs_rr_init_svc,
99 .done_service = ip_vs_rr_done_svc,
100 .update_service = ip_vs_rr_update_svc, 93 .update_service = ip_vs_rr_update_svc,
101 .schedule = ip_vs_rr_schedule, 94 .schedule = ip_vs_rr_schedule,
102}; 95};
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
index 77663d84cbd1..53f73bea66ce 100644
--- a/net/ipv4/ipvs/ip_vs_sed.c
+++ b/net/ipv4/ipvs/ip_vs_sed.c
@@ -41,27 +41,6 @@
41#include <net/ip_vs.h> 41#include <net/ip_vs.h>
42 42
43 43
44static int
45ip_vs_sed_init_svc(struct ip_vs_service *svc)
46{
47 return 0;
48}
49
50
51static int
52ip_vs_sed_done_svc(struct ip_vs_service *svc)
53{
54 return 0;
55}
56
57
58static int
59ip_vs_sed_update_svc(struct ip_vs_service *svc)
60{
61 return 0;
62}
63
64
65static inline unsigned int 44static inline unsigned int
66ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) 45ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
67{ 46{
@@ -139,9 +118,6 @@ static struct ip_vs_scheduler ip_vs_sed_scheduler =
139 .refcnt = ATOMIC_INIT(0), 118 .refcnt = ATOMIC_INIT(0),
140 .module = THIS_MODULE, 119 .module = THIS_MODULE,
141 .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), 120 .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
142 .init_service = ip_vs_sed_init_svc,
143 .done_service = ip_vs_sed_done_svc,
144 .update_service = ip_vs_sed_update_svc,
145 .schedule = ip_vs_sed_schedule, 121 .schedule = ip_vs_sed_schedule,
146}; 122};
147 123
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
index 9b0ef86bb1f7..df7ad8d74766 100644
--- a/net/ipv4/ipvs/ip_vs_wlc.c
+++ b/net/ipv4/ipvs/ip_vs_wlc.c
@@ -25,27 +25,6 @@
25#include <net/ip_vs.h> 25#include <net/ip_vs.h>
26 26
27 27
28static int
29ip_vs_wlc_init_svc(struct ip_vs_service *svc)
30{
31 return 0;
32}
33
34
35static int
36ip_vs_wlc_done_svc(struct ip_vs_service *svc)
37{
38 return 0;
39}
40
41
42static int
43ip_vs_wlc_update_svc(struct ip_vs_service *svc)
44{
45 return 0;
46}
47
48
49static inline unsigned int 28static inline unsigned int
50ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) 29ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
51{ 30{
@@ -127,9 +106,6 @@ static struct ip_vs_scheduler ip_vs_wlc_scheduler =
127 .refcnt = ATOMIC_INIT(0), 106 .refcnt = ATOMIC_INIT(0),
128 .module = THIS_MODULE, 107 .module = THIS_MODULE,
129 .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), 108 .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
130 .init_service = ip_vs_wlc_init_svc,
131 .done_service = ip_vs_wlc_done_svc,
132 .update_service = ip_vs_wlc_update_svc,
133 .schedule = ip_vs_wlc_schedule, 109 .schedule = ip_vs_wlc_schedule,
134}; 110};
135 111
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6ee5354c9aa1..f62187bb6d08 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -282,6 +282,8 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 struct rtable *r = NULL; 282 struct rtable *r = NULL;
283 283
284 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { 284 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285 if (!rt_hash_table[st->bucket].chain)
286 continue;
285 rcu_read_lock_bh(); 287 rcu_read_lock_bh();
286 r = rcu_dereference(rt_hash_table[st->bucket].chain); 288 r = rcu_dereference(rt_hash_table[st->bucket].chain);
287 while (r) { 289 while (r) {
@@ -299,11 +301,14 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
299 struct rtable *r) 301 struct rtable *r)
300{ 302{
301 struct rt_cache_iter_state *st = seq->private; 303 struct rt_cache_iter_state *st = seq->private;
304
302 r = r->u.dst.rt_next; 305 r = r->u.dst.rt_next;
303 while (!r) { 306 while (!r) {
304 rcu_read_unlock_bh(); 307 rcu_read_unlock_bh();
305 if (--st->bucket < 0) 308 do {
306 break; 309 if (--st->bucket < 0)
310 return NULL;
311 } while (!rt_hash_table[st->bucket].chain);
307 rcu_read_lock_bh(); 312 rcu_read_lock_bh();
308 r = rt_hash_table[st->bucket].chain; 313 r = rt_hash_table[st->bucket].chain;
309 } 314 }
@@ -2840,7 +2845,9 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2840 if (s_h < 0) 2845 if (s_h < 0)
2841 s_h = 0; 2846 s_h = 0;
2842 s_idx = idx = cb->args[1]; 2847 s_idx = idx = cb->args[1];
2843 for (h = s_h; h <= rt_hash_mask; h++) { 2848 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
2849 if (!rt_hash_table[h].chain)
2850 continue;
2844 rcu_read_lock_bh(); 2851 rcu_read_lock_bh();
2845 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; 2852 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2846 rt = rcu_dereference(rt->u.dst.rt_next), idx++) { 2853 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
@@ -2859,7 +2866,6 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2859 dst_release(xchg(&skb->dst, NULL)); 2866 dst_release(xchg(&skb->dst, NULL));
2860 } 2867 }
2861 rcu_read_unlock_bh(); 2868 rcu_read_unlock_bh();
2862 s_idx = 0;
2863 } 2869 }
2864 2870
2865done: 2871done:
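Both route.c hunks apply the same optimization: test the bucket's chain pointer for NULL before taking the RCU read lock, so iterating a sparsely populated cache skips empty buckets without any lock traffic. A toy model of the skip-ahead pattern used by __rt_cache_get_next(); the table contents are made up:

#include <stdio.h>

#define NBUCKETS 8

/* Stand-in for rt_hash_table[]: most chains are NULL. */
static const char *table[NBUCKETS] = {
	[2] = "entry-a",
	[5] = "entry-b",
};

static int next_nonempty(int bucket)
{
	/* Advance past empty chains before any locking would happen. */
	while (++bucket < NBUCKETS && !table[bucket])
		;
	return bucket;
}

int main(void)
{
	int b = -1;

	while ((b = next_nonempty(b)) < NBUCKETS)
		printf("bucket %d: %s\n", b, table[b]);
	return 0;
}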
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 67ccce2a96bd..f79a51607292 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3442,6 +3442,22 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
3442 } 3442 }
3443} 3443}
3444 3444
3445static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
3446{
3447 __be32 *ptr = (__be32 *)(th + 1);
3448
3449 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3450 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
3451 tp->rx_opt.saw_tstamp = 1;
3452 ++ptr;
3453 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3454 ++ptr;
3455 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
3456 return 1;
3457 }
3458 return 0;
3459}
3460
3445/* Fast parse options. This hopes to only see timestamps. 3461/* Fast parse options. This hopes to only see timestamps.
3446 * If it is wrong it falls back on tcp_parse_options(). 3462 * If it is wrong it falls back on tcp_parse_options().
3447 */ 3463 */
@@ -3453,16 +3469,8 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
3453 return 0; 3469 return 0;
3454 } else if (tp->rx_opt.tstamp_ok && 3470 } else if (tp->rx_opt.tstamp_ok &&
3455 th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { 3471 th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
3456 __be32 *ptr = (__be32 *)(th + 1); 3472 if (tcp_parse_aligned_timestamp(tp, th))
3457 if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
3458 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
3459 tp->rx_opt.saw_tstamp = 1;
3460 ++ptr;
3461 tp->rx_opt.rcv_tsval = ntohl(*ptr);
3462 ++ptr;
3463 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
3464 return 1; 3473 return 1;
3465 }
3466 } 3474 }
3467 tcp_parse_options(skb, &tp->rx_opt, 1); 3475 tcp_parse_options(skb, &tp->rx_opt, 1);
3468 return 1; 3476 return 1;
@@ -4161,6 +4169,18 @@ add_sack:
4161 } 4169 }
4162} 4170}
4163 4171
4172static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
4173 struct sk_buff_head *list)
4174{
4175 struct sk_buff *next = skb->next;
4176
4177 __skb_unlink(skb, list);
4178 __kfree_skb(skb);
4179 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4180
4181 return next;
4182}
4183
4164/* Collapse contiguous sequence of skbs head..tail with 4184/* Collapse contiguous sequence of skbs head..tail with
4165 * sequence numbers start..end. 4185 * sequence numbers start..end.
4166 * Segments with FIN/SYN are not collapsed (only because this 4186 * Segments with FIN/SYN are not collapsed (only because this
@@ -4178,11 +4198,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
4178 for (skb = head; skb != tail;) { 4198 for (skb = head; skb != tail;) {
4179 /* No new bits? It is possible on ofo queue. */ 4199 /* No new bits? It is possible on ofo queue. */
4180 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 4200 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4181 struct sk_buff *next = skb->next; 4201 skb = tcp_collapse_one(sk, skb, list);
4182 __skb_unlink(skb, list);
4183 __kfree_skb(skb);
4184 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4185 skb = next;
4186 continue; 4202 continue;
4187 } 4203 }
4188 4204
@@ -4246,11 +4262,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
4246 start += size; 4262 start += size;
4247 } 4263 }
4248 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 4264 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4249 struct sk_buff *next = skb->next; 4265 skb = tcp_collapse_one(sk, skb, list);
4250 __skb_unlink(skb, list);
4251 __kfree_skb(skb);
4252 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
4253 skb = next;
4254 if (skb == tail || 4266 if (skb == tail ||
4255 tcp_hdr(skb)->syn || 4267 tcp_hdr(skb)->syn ||
4256 tcp_hdr(skb)->fin) 4268 tcp_hdr(skb)->fin)
@@ -4691,6 +4703,67 @@ out:
4691} 4703}
4692#endif /* CONFIG_NET_DMA */ 4704#endif /* CONFIG_NET_DMA */
4693 4705
 4706/* Does PAWS and seqno based validation of an incoming segment, flags will
 4707 * play a significant role here.
4708 */
4709static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
4710 struct tcphdr *th, int syn_inerr)
4711{
4712 struct tcp_sock *tp = tcp_sk(sk);
4713
4714 /* RFC1323: H1. Apply PAWS check first. */
4715 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4716 tcp_paws_discard(sk, skb)) {
4717 if (!th->rst) {
4718 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
4719 tcp_send_dupack(sk, skb);
4720 goto discard;
4721 }
4722 /* Reset is accepted even if it did not pass PAWS. */
4723 }
4724
4725 /* Step 1: check sequence number */
4726 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
4727 /* RFC793, page 37: "In all states except SYN-SENT, all reset
4728 * (RST) segments are validated by checking their SEQ-fields."
4729 * And page 69: "If an incoming segment is not acceptable,
4730 * an acknowledgment should be sent in reply (unless the RST
4731 * bit is set, if so drop the segment and return)".
4732 */
4733 if (!th->rst)
4734 tcp_send_dupack(sk, skb);
4735 goto discard;
4736 }
4737
4738 /* Step 2: check RST bit */
4739 if (th->rst) {
4740 tcp_reset(sk);
4741 goto discard;
4742 }
4743
4744 /* ts_recent update must be made after we are sure that the packet
4745 * is in window.
4746 */
4747 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
4748
4749 /* step 3: check security and precedence [ignored] */
4750
4751 /* step 4: Check for a SYN in window. */
4752 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4753 if (syn_inerr)
4754 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
4755 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
4756 tcp_reset(sk);
4757 return -1;
4758 }
4759
4760 return 1;
4761
4762discard:
4763 __kfree_skb(skb);
4764 return 0;
4765}
4766
4694/* 4767/*
4695 * TCP receive function for the ESTABLISHED state. 4768 * TCP receive function for the ESTABLISHED state.
4696 * 4769 *
@@ -4718,6 +4791,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4718 struct tcphdr *th, unsigned len) 4791 struct tcphdr *th, unsigned len)
4719{ 4792{
4720 struct tcp_sock *tp = tcp_sk(sk); 4793 struct tcp_sock *tp = tcp_sk(sk);
4794 int res;
4721 4795
4722 /* 4796 /*
4723 * Header prediction. 4797 * Header prediction.
@@ -4756,19 +4830,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4756 4830
4757 /* Check timestamp */ 4831 /* Check timestamp */
4758 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { 4832 if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
4759 __be32 *ptr = (__be32 *)(th + 1);
4760
4761 /* No? Slow path! */ 4833 /* No? Slow path! */
4762 if (*ptr != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) 4834 if (!tcp_parse_aligned_timestamp(tp, th))
4763 | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
4764 goto slow_path; 4835 goto slow_path;
4765 4836
4766 tp->rx_opt.saw_tstamp = 1;
4767 ++ptr;
4768 tp->rx_opt.rcv_tsval = ntohl(*ptr);
4769 ++ptr;
4770 tp->rx_opt.rcv_tsecr = ntohl(*ptr);
4771
4772 /* If PAWS failed, check it more carefully in slow path */ 4837 /* If PAWS failed, check it more carefully in slow path */
4773 if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) 4838 if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
4774 goto slow_path; 4839 goto slow_path;
@@ -4899,51 +4964,12 @@ slow_path:
4899 goto csum_error; 4964 goto csum_error;
4900 4965
4901 /* 4966 /*
4902 * RFC1323: H1. Apply PAWS check first.
4903 */
4904 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
4905 tcp_paws_discard(sk, skb)) {
4906 if (!th->rst) {
4907 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
4908 tcp_send_dupack(sk, skb);
4909 goto discard;
4910 }
4911 /* Resets are accepted even if PAWS failed.
4912
4913 ts_recent update must be made after we are sure
4914 that the packet is in window.
4915 */
4916 }
4917
4918 /*
4919 * Standard slow path. 4967 * Standard slow path.
4920 */ 4968 */
4921 4969
4922 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { 4970 res = tcp_validate_incoming(sk, skb, th, 1);
4923 /* RFC793, page 37: "In all states except SYN-SENT, all reset 4971 if (res <= 0)
4924 * (RST) segments are validated by checking their SEQ-fields." 4972 return -res;
4925 * And page 69: "If an incoming segment is not acceptable,
4926 * an acknowledgment should be sent in reply (unless the RST bit
4927 * is set, if so drop the segment and return)".
4928 */
4929 if (!th->rst)
4930 tcp_send_dupack(sk, skb);
4931 goto discard;
4932 }
4933
4934 if (th->rst) {
4935 tcp_reset(sk);
4936 goto discard;
4937 }
4938
4939 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
4940
4941 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
4942 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
4943 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
4944 tcp_reset(sk);
4945 return 1;
4946 }
4947 4973
4948step5: 4974step5:
4949 if (th->ack) 4975 if (th->ack)
@@ -5225,6 +5251,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5225 struct tcp_sock *tp = tcp_sk(sk); 5251 struct tcp_sock *tp = tcp_sk(sk);
5226 struct inet_connection_sock *icsk = inet_csk(sk); 5252 struct inet_connection_sock *icsk = inet_csk(sk);
5227 int queued = 0; 5253 int queued = 0;
5254 int res;
5228 5255
5229 tp->rx_opt.saw_tstamp = 0; 5256 tp->rx_opt.saw_tstamp = 0;
5230 5257
@@ -5277,42 +5304,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5277 return 0; 5304 return 0;
5278 } 5305 }
5279 5306
5280 if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && 5307 res = tcp_validate_incoming(sk, skb, th, 0);
5281 tcp_paws_discard(sk, skb)) { 5308 if (res <= 0)
5282 if (!th->rst) { 5309 return -res;
5283 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
5284 tcp_send_dupack(sk, skb);
5285 goto discard;
5286 }
5287 /* Reset is accepted even if it did not pass PAWS. */
5288 }
5289
5290 /* step 1: check sequence number */
5291 if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
5292 if (!th->rst)
5293 tcp_send_dupack(sk, skb);
5294 goto discard;
5295 }
5296
5297 /* step 2: check RST bit */
5298 if (th->rst) {
5299 tcp_reset(sk);
5300 goto discard;
5301 }
5302
5303 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
5304
5305 /* step 3: check security and precedence [ignored] */
5306
5307 /* step 4:
5308 *
5309 * Check for a SYN in window.
5310 */
5311 if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
5312 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
5313 tcp_reset(sk);
5314 return 1;
5315 }
5316 5310
5317 /* step 5: check the ACK field */ 5311 /* step 5: check the ACK field */
5318 if (th->ack) { 5312 if (th->ack) {
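The tcp_input.c changes consolidate the PAWS, sequence, RST, and in-window-SYN checks into tcp_validate_incoming(), which both tcp_rcv_established() and tcp_rcv_state_process() now map through "return -res": 1 means keep processing, 0 means the segment was discarded, -1 means the connection was reset. A standalone sketch of that return-value contract; the integer predicates stand in for the real checks and none of this is kernel code:

#include <stdio.h>

static int validate_incoming(int seq_ok, int rst, int syn_in_window)
{
	if (!seq_ok)
		return 0;	/* dup-ACK sent, segment discarded */
	if (rst)
		return 0;	/* reset processed, segment discarded */
	if (syn_in_window)
		return -1;	/* connection reset: caller returns 1 */
	return 1;		/* valid: continue with steps 5..7 */
}

static int rcv_established(int seq_ok, int rst, int syn_in_window)
{
	int res = validate_incoming(seq_ok, rst, syn_in_window);

	if (res <= 0)
		return -res;	/* 0 -> 0 (done), -1 -> 1 (reset) */
	/* ... step 5, ACK processing, would follow here ... */
	return 0;
}

int main(void)
{
	printf("good segment  -> %d\n", rcv_established(1, 0, 0));
	printf("bad sequence  -> %d\n", rcv_established(0, 0, 0));
	printf("SYN in window -> %d\n", rcv_established(1, 0, 1));
	return 0;
}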
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1b4fee20fc93..3dfbc21e555a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1946,6 +1946,12 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1946 return rc; 1946 return rc;
1947} 1947}
1948 1948
1949static inline int empty_bucket(struct tcp_iter_state *st)
1950{
1951 return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
1952 hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
1953}
1954
1949static void *established_get_first(struct seq_file *seq) 1955static void *established_get_first(struct seq_file *seq)
1950{ 1956{
1951 struct tcp_iter_state* st = seq->private; 1957 struct tcp_iter_state* st = seq->private;
@@ -1958,6 +1964,10 @@ static void *established_get_first(struct seq_file *seq)
1958 struct inet_timewait_sock *tw; 1964 struct inet_timewait_sock *tw;
1959 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); 1965 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1960 1966
1967 /* Lockless fast path for the common case of empty buckets */
1968 if (empty_bucket(st))
1969 continue;
1970
1961 read_lock_bh(lock); 1971 read_lock_bh(lock);
1962 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 1972 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1963 if (sk->sk_family != st->family || 1973 if (sk->sk_family != st->family ||
@@ -2008,13 +2018,15 @@ get_tw:
2008 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2018 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2009 st->state = TCP_SEQ_STATE_ESTABLISHED; 2019 st->state = TCP_SEQ_STATE_ESTABLISHED;
2010 2020
 2011 if (++st->bucket < tcp_hashinfo.ehash_size) { 2021 /* Look for next non-empty bucket */
2012 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); 2022 while (++st->bucket < tcp_hashinfo.ehash_size &&
2013 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); 2023 empty_bucket(st))
2014 } else { 2024 ;
2015 cur = NULL; 2025 if (st->bucket >= tcp_hashinfo.ehash_size)
2016 goto out; 2026 return NULL;
2017 } 2027
2028 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2029 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2018 } else 2030 } else
2019 sk = sk_next(sk); 2031 sk = sk_next(sk);
2020 2032
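tcp_ipv4.c gets the same empty-bucket fast path as route.c, with the twist that an ehash bucket counts as empty only when both of its chains, the established chain and the timewait chain, are empty; that is exactly what the new empty_bucket() helper tests before the read lock is taken. A toy version of the predicate (the bucket layout is invented for illustration):

#include <stdio.h>

struct bucket {
	int chain_len;		/* established sockets */
	int twchain_len;	/* timewait sockets */
};

static int empty_bucket(const struct bucket *b)
{
	return b->chain_len == 0 && b->twchain_len == 0;
}

int main(void)
{
	struct bucket tab[4] = { {0, 0}, {0, 3}, {2, 0}, {0, 0} };
	int i;

	for (i = 0; i < 4; i++)
		if (!empty_bucket(&tab[i]))
			printf("bucket %d needs the lock\n", i);
	return 0;
}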