author    Evgeniy Polyakov <zbr@ioremap.net>    2009-01-19 19:46:02 -0500
committer David S. Miller <davem@davemloft.net>    2009-01-21 17:34:31 -0500
commit    a9d8f9110d7e953c2f2b521087a4179677843c2a (patch)
tree      4945343dd94bd66964a29055b03c2dd1d119f4d5
parent    5c0999b72b34541a3734a9138c43d5c024a42d47 (diff)
inet: Allow more than 64k connections and heavily optimize bind(0) time
With a simple extension to the binding mechanism that allows binding more than 64k sockets (or fewer, depending on the sysctl parameters), we have to traverse the whole bind hash table to find an empty bucket. While this is not a problem for, say, 32k connections, bind() completion time grows exponentially (after each successful binding we have to traverse one more bucket to find an empty one), even if we start each time from a random offset inside the hash table. So when the hash table is full and we want to add another socket, we have to traverse the whole table no matter what; effectively this is the worst-case performance, and it is constant.

The attached picture shows bind() time depending on the number of already bound sockets.

The green area corresponds to the usual bind-to-port-zero process, which turns on the kernel port selection described above. The red area is the bind process when the number of reuse-bound sockets is not limited to 64k (or by the sysctl parameters); it shows the same exponential growth (hidden by the green area) until the number of ports reaches the sysctl limit. At that point the bind hash table has exactly one reuse-enabled socket per bucket, though it is possible that they have different addresses. The kernel actually selects the first port to try at random, so at the beginning bind() takes roughly constant time, but over time the number of ports to check after the random start increases. That growth is exponential, but because of the random start, not every port selection necessarily takes longer than the previous one; so we have to consider the area underneath in the graph (if you could zoom in, you would find many different times plotted there), and one area can hide another.

The blue area corresponds to the port selection optimization. The design approach is rather simple: the hash table now maintains an (imprecise and racily updated) count of currently bound sockets, and when that count becomes greater than a predefined value (I use the maximum port range defined by the sysctls), we stop traversing the whole bind hash table and just stop at the first matching bucket after the random start. This limit roughly corresponds to the case when the bind hash table is full and we have turned on the mechanism that allows more reuse-enabled sockets to bind, so it does not change the behaviour of other sockets.

Signed-off-by: Evgeniy Polyakov <zbr@ioremap.net>
Tested-by: Denys Fedoryschenko <denys@visp.net.lb>
Signed-off-by: David S. Miller <davem@davemloft.net>
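To make the shortcut concrete, here is a minimal userspace model of the idea; this is NOT the kernel implementation, and all names (owners, bsockets, pick_port) are hypothetical. A toy array of per-port owner counts stands in for the bind hash table: once more sockets are bound than there are ports in the range, no bucket can be empty, so the scan stops at the first reusable candidate instead of walking the whole table.

/*
 * Minimal userspace model of the port-selection shortcut; this is
 * NOT the kernel code. A toy array of per-port owner counts stands
 * in for the bind hash table.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define LOW   32768                     /* first port in the range */
#define HIGH  61000                     /* last port in the range */
#define RANGE (HIGH - LOW + 1)

static int owners[RANGE];               /* sockets bound per port */
static int bsockets;                    /* imprecise global count, as in the patch */

static int pick_port(void)
{
	int start = rand() % RANGE;     /* random start, as bind(0) does */
	int best = -1, best_owners = -1;
	int i;

	for (i = 0; i < RANGE; i++) {
		int p = (start + i) % RANGE;

		if (owners[p] == 0)
			return LOW + p; /* empty bucket found: use it */

		if (best_owners == -1 || owners[p] < best_owners) {
			best_owners = owners[p];
			best = p;
			/*
			 * The shortcut: once more sockets are bound than
			 * there are ports, no bucket can be empty, so stop
			 * at the first reusable candidate instead of
			 * scanning the rest of the table.
			 */
			if (bsockets > RANGE)
				return LOW + p;
		}
	}
	return best >= 0 ? LOW + best : -1;     /* least-crowded bucket */
}

int main(void)
{
	int i;

	srand((unsigned)time(NULL));
	for (i = 0; i < 3 * RANGE; i++) {
		int port = pick_port();

		if (port < 0)
			return 1;
		owners[port - LOW]++;
		bsockets++;
	}
	printf("bound %d sockets across %d ports\n", bsockets, RANGE);
	return 0;
}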
-rw-r--r--  include/net/inet_hashtables.h    |  3
-rw-r--r--  net/ipv4/inet_connection_sock.c  | 41
-rw-r--r--  net/ipv4/inet_hashtables.c       | 11
3 files changed, 46 insertions(+), 9 deletions(-)
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index f44bb5c77a70..cdc08c190638 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -82,6 +82,7 @@ struct inet_bind_bucket {
 #endif
 	unsigned short		port;
 	signed short		fastreuse;
+	int			num_owners;
 	struct hlist_node	node;
 	struct hlist_head	owners;
 };
@@ -133,7 +134,7 @@ struct inet_hashinfo {
 	struct inet_bind_hashbucket	*bhash;

 	unsigned int			bhash_size;
-	/* Note : 4 bytes padding on 64 bit arches */
+	int				bsockets;

 	struct kmem_cache		*bind_bucket_cachep;

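For reference, after this change the two structures look roughly as follows. Fields not touched by the patch are elided and the comments are editorial, not from the source:

/* Sketch of the structures after the patch; unrelated fields elided. */
struct inet_bind_bucket {
	unsigned short		port;
	signed short		fastreuse;	/* > 0: current owners allow address reuse */
	int			num_owners;	/* sockets currently hashed into ->owners */
	struct hlist_node	node;
	struct hlist_head	owners;
};

struct inet_hashinfo {
	struct inet_bind_hashbucket	*bhash;
	unsigned int			bhash_size;
	int				bsockets;	/* imprecise count of bound sockets */
	struct kmem_cache		*bind_bucket_cachep;
	/* ... */
};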
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f26ab38680de..df8e72f07478 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -93,24 +93,40 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 	struct inet_bind_hashbucket *head;
 	struct hlist_node *node;
 	struct inet_bind_bucket *tb;
-	int ret;
+	int ret, attempts = 5;
 	struct net *net = sock_net(sk);
+	int smallest_size = -1, smallest_rover;

 	local_bh_disable();
 	if (!snum) {
 		int remaining, rover, low, high;

+again:
 		inet_get_local_port_range(&low, &high);
 		remaining = (high - low) + 1;
-		rover = net_random() % remaining + low;
+		smallest_rover = rover = net_random() % remaining + low;

+		smallest_size = -1;
 		do {
 			head = &hashinfo->bhash[inet_bhashfn(net, rover,
 					hashinfo->bhash_size)];
 			spin_lock(&head->lock);
 			inet_bind_bucket_for_each(tb, node, &head->chain)
-				if (ib_net(tb) == net && tb->port == rover)
+				if (ib_net(tb) == net && tb->port == rover) {
+					if (tb->fastreuse > 0 &&
+					    sk->sk_reuse &&
+					    sk->sk_state != TCP_LISTEN &&
+					    (tb->num_owners < smallest_size || smallest_size == -1)) {
+						smallest_size = tb->num_owners;
+						smallest_rover = rover;
+						if (hashinfo->bsockets > (high - low) + 1) {
+							spin_unlock(&head->lock);
+							snum = smallest_rover;
+							goto have_snum;
+						}
+					}
 					goto next;
+				}
 			break;
 		next:
 			spin_unlock(&head->lock);
@@ -125,14 +141,19 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
 		 * the top level, not from the 'break;' statement.
 		 */
 		ret = 1;
-		if (remaining <= 0)
+		if (remaining <= 0) {
+			if (smallest_size != -1) {
+				snum = smallest_rover;
+				goto have_snum;
+			}
 			goto fail;
-
+		}
 		/* OK, here is the one we will use.  HEAD is
 		 * non-NULL and we hold it's mutex.
 		 */
 		snum = rover;
 	} else {
+have_snum:
 		head = &hashinfo->bhash[inet_bhashfn(net, snum,
 				hashinfo->bhash_size)];
 		spin_lock(&head->lock);
@@ -145,12 +166,18 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
tb_found:
 	if (!hlist_empty(&tb->owners)) {
 		if (tb->fastreuse > 0 &&
-		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
+		    sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
+		    smallest_size == -1) {
 			goto success;
 		} else {
 			ret = 1;
-			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb))
+			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
+				if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && --attempts >= 0) {
+					spin_unlock(&head->lock);
+					goto again;
+				}
 				goto fail_unlock;
+			}
 		}
 	}
tb_not_found:
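The attempts/again pair added above bounds how many times a reuse-enabled socket rescans from a new random start when its candidate port still conflicts on address. A standalone toy of that control flow, where bind_conflict() is a stand-in for the kernel's icsk_af_ops->bind_conflict(), not the real check:

/* Toy model of the bounded-retry control flow; not kernel code. */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Stand-in conflict check: pretend one candidate in four conflicts. */
static int bind_conflict(int port)
{
	(void)port;
	return rand() % 4 == 0;
}

static int get_port(void)
{
	int attempts = 5;	/* same bound the patch uses */
	int port;

again:
	port = 32768 + rand() % 28232;	/* new random start each retry */
	if (bind_conflict(port)) {
		if (--attempts >= 0)
			goto again;	/* rescan from another random start */
		return -1;		/* give up, as with EADDRINUSE */
	}
	return port;
}

int main(void)
{
	srand((unsigned)time(NULL));
	int p = get_port();

	printf(p < 0 ? "no port available\n" : "picked port %d\n", p);
	return 0;
}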
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 6a1045da48d2..d7b6178bf48b 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -38,6 +38,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
 		write_pnet(&tb->ib_net, hold_net(net));
 		tb->port = snum;
 		tb->fastreuse = 0;
+		tb->num_owners = 0;
 		INIT_HLIST_HEAD(&tb->owners);
 		hlist_add_head(&tb->node, &head->chain);
 	}
@@ -59,8 +60,13 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket
 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
 		    const unsigned short snum)
 {
+	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+
+	hashinfo->bsockets++;
+
 	inet_sk(sk)->num = snum;
 	sk_add_bind_node(sk, &tb->owners);
+	tb->num_owners++;
 	inet_csk(sk)->icsk_bind_hash = tb;
 }

@@ -75,9 +81,12 @@ static void __inet_put_port(struct sock *sk)
 	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
 	struct inet_bind_bucket *tb;

+	hashinfo->bsockets--;
+
 	spin_lock(&head->lock);
 	tb = inet_csk(sk)->icsk_bind_hash;
 	__sk_del_bind_node(sk);
+	tb->num_owners--;
 	inet_csk(sk)->icsk_bind_hash = NULL;
 	inet_sk(sk)->num = 0;
 	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
@@ -444,9 +453,9 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		 */
 		inet_bind_bucket_for_each(tb, node, &head->chain) {
 			if (ib_net(tb) == net && tb->port == port) {
-				WARN_ON(hlist_empty(&tb->owners));
 				if (tb->fastreuse >= 0)
 					goto next_port;
+				WARN_ON(hlist_empty(&tb->owners));
 				if (!check_established(death_row, sk,
 						       port, &tw))
 					goto ok;
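For context, the workload this patch speeds up can be approximated from user space with a loop like the one below: many SO_REUSEADDR TCP sockets bound to port 0, forcing the kernel port selection path. This is an illustration only, and it assumes the open-file limit has been raised far above the default (e.g. via ulimit -n):

/* Userspace sketch of the measured workload; illustration only. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int i, one = 1;

	for (i = 0; i < 100000; i++) {
		struct sockaddr_in sa;
		int fd = socket(AF_INET, SOCK_STREAM, 0);

		if (fd < 0)
			break;	/* likely EMFILE: raise the fd limit */

		/* SO_REUSEADDR makes the socket "reuse-enabled". */
		setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

		memset(&sa, 0, sizeof(sa));
		sa.sin_family = AF_INET;
		sa.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		sa.sin_port = 0;	/* port 0: kernel selects the port */

		if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
			perror("bind");
			close(fd);
			break;
		}
		/* fds stay open on purpose so the ports remain bound */
	}
	printf("bound %d sockets\n", i);
	return 0;
}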