aboutsummaryrefslogtreecommitdiffstats
path: root/net/netfilter/ipvs/ip_vs_conn.c
diff options
context:
space:
mode:
authorCatalin(ux) M. BOIE <catab@embedromix.ro>2010-01-04 23:50:24 -0500
committerPatrick McHardy <kaber@trash.net>2010-01-04 23:50:24 -0500
commit6f7edb4881bf51300060e89915926e070ace8c4d (patch)
treec4f1a6d337748fea6a9640cf41885e0f431d189f /net/netfilter/ipvs/ip_vs_conn.c
parent294188ae32f984a072c64c959354b2f6f52f80a7 (diff)
IPVS: Allow boot time change of hash size
I was very frustrated about the fact that I have to recompile the kernel to change the hash size. So, I created this patch. If IPVS is built-in you can append ip_vs.conn_tab_bits=?? to the kernel command line, or, if you built IPVS as modules, you can add options ip_vs conn_tab_bits=??. To keep everything backward compatible, you still can select the size at compile time, and that will be used as default. It has been about a year since this patch was originally posted and subsequently dropped on the basis of insufficient test data. Mark Bergsma has provided the following test results which seem to strongly support the need for larger hash table sizes: We do however run into the same problem with the default setting (2^12 = 4096 entries), as most of our LVS balancers handle around a million connections/SLAB entries at any point in time (around 100-150 kpps load). With only 4096 hash table entries this implies that each entry consists of a linked list of 256 connections *on average*. To provide some statistics, I did an oprofile run on a 2.6.31 kernel, with both the default 4096 table size, and the same kernel recompiled with IP_VS_CONN_TAB_BITS set to 18 (2^18 = 262144 entries). I built a quick test setup with a part of Wikimedia/Wikipedia's live traffic mirrored by the switch to the test host. 
With the default setting, at ~ 120 kpps packet load we saw a typical %si CPU usage of around 30-35%, and oprofile reported a hot spot in ip_vs_conn_in_get: samples % image name app name symbol name 1719761 42.3741 ip_vs.ko ip_vs.ko ip_vs_conn_in_get 302577 7.4554 bnx2 bnx2 /bnx2 181984 4.4840 vmlinux vmlinux __ticket_spin_lock 128636 3.1695 vmlinux vmlinux ip_route_input 74345 1.8318 ip_vs.ko ip_vs.ko ip_vs_conn_out_get 68482 1.6874 vmlinux vmlinux mwait_idle After loading the recompiled kernel with 2^18 entries, %si CPU usage dropped in half to around 12-18%, and oprofile looks much healthier, with only 7% spent in ip_vs_conn_in_get: samples % image name app name symbol name 265641 14.4616 bnx2 bnx2 /bnx2 143251 7.7986 vmlinux vmlinux __ticket_spin_lock 140661 7.6576 ip_vs.ko ip_vs.ko ip_vs_conn_in_get 94364 5.1372 vmlinux vmlinux mwait_idle 86267 4.6964 vmlinux vmlinux ip_route_input [ horms@verge.net.au: trivial up-port and minor style fixes ] Signed-off-by: Catalin(ux) M. BOIE <catab@embedromix.ro> Cc: Mark Bergsma <mark@wikimedia.org> Signed-off-by: Simon Horman <horms@verge.net.au> Signed-off-by: Patrick McHardy <kaber@trash.net>
Diffstat (limited to 'net/netfilter/ipvs/ip_vs_conn.c')
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c42
1 files changed, 31 insertions, 11 deletions
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 27c30cf933d..60bb41a8d8d 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -40,6 +40,21 @@
40#include <net/ip_vs.h> 40#include <net/ip_vs.h>
41 41
42 42
43#ifndef CONFIG_IP_VS_TAB_BITS
44#define CONFIG_IP_VS_TAB_BITS 12
45#endif
46
47/*
48 * Connection hash size. Default is what was selected at compile time.
49*/
50int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
51module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
52MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
53
54/* size and mask values */
55int ip_vs_conn_tab_size;
56int ip_vs_conn_tab_mask;
57
43/* 58/*
44 * Connection hash table: for input and output packets lookups of IPVS 59 * Connection hash table: for input and output packets lookups of IPVS
45 */ 60 */
@@ -125,11 +140,11 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto,
125 if (af == AF_INET6) 140 if (af == AF_INET6)
126 return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), 141 return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
127 (__force u32)port, proto, ip_vs_conn_rnd) 142 (__force u32)port, proto, ip_vs_conn_rnd)
128 & IP_VS_CONN_TAB_MASK; 143 & ip_vs_conn_tab_mask;
129#endif 144#endif
130 return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, 145 return jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
131 ip_vs_conn_rnd) 146 ip_vs_conn_rnd)
132 & IP_VS_CONN_TAB_MASK; 147 & ip_vs_conn_tab_mask;
133} 148}
134 149
135 150
@@ -760,7 +775,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
760 int idx; 775 int idx;
761 struct ip_vs_conn *cp; 776 struct ip_vs_conn *cp;
762 777
763 for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { 778 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
764 ct_read_lock_bh(idx); 779 ct_read_lock_bh(idx);
765 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 780 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
766 if (pos-- == 0) { 781 if (pos-- == 0) {
@@ -797,7 +812,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
797 idx = l - ip_vs_conn_tab; 812 idx = l - ip_vs_conn_tab;
798 ct_read_unlock_bh(idx); 813 ct_read_unlock_bh(idx);
799 814
800 while (++idx < IP_VS_CONN_TAB_SIZE) { 815 while (++idx < ip_vs_conn_tab_size) {
801 ct_read_lock_bh(idx); 816 ct_read_lock_bh(idx);
802 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 817 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
803 seq->private = &ip_vs_conn_tab[idx]; 818 seq->private = &ip_vs_conn_tab[idx];
@@ -976,8 +991,8 @@ void ip_vs_random_dropentry(void)
976 /* 991 /*
977 * Randomly scan 1/32 of the whole table every second 992 * Randomly scan 1/32 of the whole table every second
978 */ 993 */
979 for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { 994 for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
980 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; 995 unsigned hash = net_random() & ip_vs_conn_tab_mask;
981 996
982 /* 997 /*
983 * Lock is actually needed in this loop. 998 * Lock is actually needed in this loop.
@@ -1029,7 +1044,7 @@ static void ip_vs_conn_flush(void)
1029 struct ip_vs_conn *cp; 1044 struct ip_vs_conn *cp;
1030 1045
1031 flush_again: 1046 flush_again:
1032 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { 1047 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
1033 /* 1048 /*
1034 * Lock is actually needed in this loop. 1049 * Lock is actually needed in this loop.
1035 */ 1050 */
@@ -1060,10 +1075,15 @@ int __init ip_vs_conn_init(void)
1060{ 1075{
1061 int idx; 1076 int idx;
1062 1077
1078 /* Compute size and mask */
1079 ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
1080 ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
1081
1063 /* 1082 /*
1064 * Allocate the connection hash table and initialize its list heads 1083 * Allocate the connection hash table and initialize its list heads
1065 */ 1084 */
1066 ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); 1085 ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size *
1086 sizeof(struct list_head));
1067 if (!ip_vs_conn_tab) 1087 if (!ip_vs_conn_tab)
1068 return -ENOMEM; 1088 return -ENOMEM;
1069 1089
@@ -1078,12 +1098,12 @@ int __init ip_vs_conn_init(void)
1078 1098
1079 pr_info("Connection hash table configured " 1099 pr_info("Connection hash table configured "
1080 "(size=%d, memory=%ldKbytes)\n", 1100 "(size=%d, memory=%ldKbytes)\n",
1081 IP_VS_CONN_TAB_SIZE, 1101 ip_vs_conn_tab_size,
1082 (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); 1102 (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024);
1083 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", 1103 IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
1084 sizeof(struct ip_vs_conn)); 1104 sizeof(struct ip_vs_conn));
1085 1105
1086 for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { 1106 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
1087 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); 1107 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
1088 } 1108 }
1089 1109