aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Eric Dumazet <dada1@cosmosbay.com> 2006-11-16 05:30:37 -0500
committer David S. Miller <davem@sunset.davemloft.net> 2006-12-03 00:21:44 -0500
commit 72a3effaf633bcae9034b7e176bdbd78d64a71db (patch)
tree b7a331527f1b15335a358f97809134f35587e57a
parent 3c62f75aac7348ee262b1295cfcfeb3473f76815 (diff)
[NET]: Size listen hash tables using backlog hint
We currently allocate a fixed-size (TCP_SYNQ_HSIZE=512 slots) hash table for each LISTEN socket, regardless of various parameters (the listen backlog, for example).

On x86_64, this means order-1 allocations (which might fail), even for 'small' sockets expecting few connections. Conversely, a huge server wanting a backlog of 50000 is slowed down a bit because of this fixed limit.

This patch makes the size of the listen hash table a dynamic parameter, depending on:
- the net.core.somaxconn tunable (default is 128)
- the net.ipv4.tcp_max_syn_backlog tunable (default: 256, 1024 or 128)
- the backlog value given by the user application (2nd parameter of listen())

For large allocations (bigger than PAGE_SIZE), we use vmalloc() instead of kmalloc().

We still limit memory allocation with the two existing tunables (somaxconn & tcp_max_syn_backlog). So for standard setups, this patch actually reduces RAM usage.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/net/request_sock.h 8
-rw-r--r-- include/net/tcp.h 1
-rw-r--r-- net/core/request_sock.c 35
-rw-r--r-- net/dccp/ipv4.c 2
-rw-r--r-- net/dccp/proto.c 6
-rw-r--r-- net/ipv4/af_inet.c 2
-rw-r--r-- net/ipv4/inet_connection_sock.c 2
-rw-r--r-- net/ipv4/tcp_ipv4.c 6
-rw-r--r-- net/ipv6/tcp_ipv6.c 2
9 files changed, 39 insertions, 25 deletions
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index f743a941a4f2..b5b023e79e5f 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -28,8 +28,8 @@ struct proto;
28 28
29struct request_sock_ops { 29struct request_sock_ops {
30 int family; 30 int family;
31 kmem_cache_t *slab;
32 int obj_size; 31 int obj_size;
32 kmem_cache_t *slab;
33 int (*rtx_syn_ack)(struct sock *sk, 33 int (*rtx_syn_ack)(struct sock *sk,
34 struct request_sock *req, 34 struct request_sock *req,
35 struct dst_entry *dst); 35 struct dst_entry *dst);
@@ -51,13 +51,13 @@ struct request_sock {
51 u32 rcv_wnd; /* rcv_wnd offered first time */ 51 u32 rcv_wnd; /* rcv_wnd offered first time */
52 u32 ts_recent; 52 u32 ts_recent;
53 unsigned long expires; 53 unsigned long expires;
54 struct request_sock_ops *rsk_ops; 54 const struct request_sock_ops *rsk_ops;
55 struct sock *sk; 55 struct sock *sk;
56 u32 secid; 56 u32 secid;
57 u32 peer_secid; 57 u32 peer_secid;
58}; 58};
59 59
60static inline struct request_sock *reqsk_alloc(struct request_sock_ops *ops) 60static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
61{ 61{
62 struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC); 62 struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC);
63 63
@@ -121,7 +121,7 @@ struct request_sock_queue {
121}; 121};
122 122
123extern int reqsk_queue_alloc(struct request_sock_queue *queue, 123extern int reqsk_queue_alloc(struct request_sock_queue *queue,
124 const int nr_table_entries); 124 unsigned int nr_table_entries);
125 125
126static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue) 126static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue)
127{ 127{
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7a093d0aa0fe..246916c2321e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -138,7 +138,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
138#define MAX_TCP_SYNCNT 127 138#define MAX_TCP_SYNCNT 127
139 139
140#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ 140#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
141#define TCP_SYNQ_HSIZE 512 /* Size of SYNACK hash table */
142 141
143#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) 142#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
144#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated 143#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 79ebd75fbe4d..5f0818d815e6 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -15,6 +15,7 @@
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/string.h> 17#include <linux/string.h>
18#include <linux/vmalloc.h>
18 19
19#include <net/request_sock.h> 20#include <net/request_sock.h>
20 21
@@ -29,22 +30,31 @@
29 * it is absolutely not enough even at 100conn/sec. 256 cures most 30 * it is absolutely not enough even at 100conn/sec. 256 cures most
30 * of problems. This value is adjusted to 128 for very small machines 31 * of problems. This value is adjusted to 128 for very small machines
31 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). 32 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
32 * Further increasing requires to change hash table size. 33 * Note : Dont forget somaxconn that may limit backlog too.
33 */ 34 */
34int sysctl_max_syn_backlog = 256; 35int sysctl_max_syn_backlog = 256;
35 36
36int reqsk_queue_alloc(struct request_sock_queue *queue, 37int reqsk_queue_alloc(struct request_sock_queue *queue,
37 const int nr_table_entries) 38 unsigned int nr_table_entries)
38{ 39{
39 const int lopt_size = sizeof(struct listen_sock) + 40 size_t lopt_size = sizeof(struct listen_sock);
40 nr_table_entries * sizeof(struct request_sock *); 41 struct listen_sock *lopt;
41 struct listen_sock *lopt = kzalloc(lopt_size, GFP_KERNEL); 42
42 43 nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
44 nr_table_entries = max_t(u32, nr_table_entries, 8);
45 nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
46 lopt_size += nr_table_entries * sizeof(struct request_sock *);
47 if (lopt_size > PAGE_SIZE)
48 lopt = __vmalloc(lopt_size,
49 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
50 PAGE_KERNEL);
51 else
52 lopt = kzalloc(lopt_size, GFP_KERNEL);
43 if (lopt == NULL) 53 if (lopt == NULL)
44 return -ENOMEM; 54 return -ENOMEM;
45 55
46 for (lopt->max_qlen_log = 6; 56 for (lopt->max_qlen_log = 3;
47 (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog; 57 (1 << lopt->max_qlen_log) < nr_table_entries;
48 lopt->max_qlen_log++); 58 lopt->max_qlen_log++);
49 59
50 get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); 60 get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
@@ -65,9 +75,11 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
65{ 75{
66 /* make all the listen_opt local to us */ 76 /* make all the listen_opt local to us */
67 struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); 77 struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
78 size_t lopt_size = sizeof(struct listen_sock) +
79 lopt->nr_table_entries * sizeof(struct request_sock *);
68 80
69 if (lopt->qlen != 0) { 81 if (lopt->qlen != 0) {
70 int i; 82 unsigned int i;
71 83
72 for (i = 0; i < lopt->nr_table_entries; i++) { 84 for (i = 0; i < lopt->nr_table_entries; i++) {
73 struct request_sock *req; 85 struct request_sock *req;
@@ -81,7 +93,10 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
81 } 93 }
82 94
83 BUG_TRAP(lopt->qlen == 0); 95 BUG_TRAP(lopt->qlen == 0);
84 kfree(lopt); 96 if (lopt_size > PAGE_SIZE)
97 vfree(lopt);
98 else
99 kfree(lopt);
85} 100}
86 101
87EXPORT_SYMBOL(reqsk_queue_destroy); 102EXPORT_SYMBOL(reqsk_queue_destroy);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index e08e7688a263..0a5d68dbb418 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -1022,7 +1022,7 @@ static void dccp_v4_reqsk_destructor(struct request_sock *req)
1022 kfree(inet_rsk(req)->opt); 1022 kfree(inet_rsk(req)->opt);
1023} 1023}
1024 1024
1025static struct request_sock_ops dccp_request_sock_ops = { 1025static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
1026 .family = PF_INET, 1026 .family = PF_INET,
1027 .obj_size = sizeof(struct dccp_request_sock), 1027 .obj_size = sizeof(struct dccp_request_sock),
1028 .rtx_syn_ack = dccp_v4_send_response, 1028 .rtx_syn_ack = dccp_v4_send_response,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 72cbdcfc2c65..047d170a363a 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -262,12 +262,12 @@ int dccp_destroy_sock(struct sock *sk)
262 262
263EXPORT_SYMBOL_GPL(dccp_destroy_sock); 263EXPORT_SYMBOL_GPL(dccp_destroy_sock);
264 264
265static inline int dccp_listen_start(struct sock *sk) 265static inline int dccp_listen_start(struct sock *sk, int backlog)
266{ 266{
267 struct dccp_sock *dp = dccp_sk(sk); 267 struct dccp_sock *dp = dccp_sk(sk);
268 268
269 dp->dccps_role = DCCP_ROLE_LISTEN; 269 dp->dccps_role = DCCP_ROLE_LISTEN;
270 return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); 270 return inet_csk_listen_start(sk, backlog);
271} 271}
272 272
273int dccp_disconnect(struct sock *sk, int flags) 273int dccp_disconnect(struct sock *sk, int flags)
@@ -788,7 +788,7 @@ int inet_dccp_listen(struct socket *sock, int backlog)
788 * FIXME: here it probably should be sk->sk_prot->listen_start 788 * FIXME: here it probably should be sk->sk_prot->listen_start
789 * see tcp_listen_start 789 * see tcp_listen_start
790 */ 790 */
791 err = dccp_listen_start(sk); 791 err = dccp_listen_start(sk, backlog);
792 if (err) 792 if (err)
793 goto out; 793 goto out;
794 } 794 }
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index edcf0932ac6d..4a81d54a7569 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -204,7 +204,7 @@ int inet_listen(struct socket *sock, int backlog)
204 * we can only allow the backlog to be adjusted. 204 * we can only allow the backlog to be adjusted.
205 */ 205 */
206 if (old_state != TCP_LISTEN) { 206 if (old_state != TCP_LISTEN) {
207 err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); 207 err = inet_csk_listen_start(sk, backlog);
208 if (err) 208 if (err)
209 goto out; 209 goto out;
210 } 210 }
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 96bbe2a0aa1b..9d68837888d3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -343,7 +343,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
343EXPORT_SYMBOL_GPL(inet_csk_route_req); 343EXPORT_SYMBOL_GPL(inet_csk_route_req);
344 344
345static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, 345static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
346 const u32 rnd, const u16 synq_hsize) 346 const u32 rnd, const u32 synq_hsize)
347{ 347{
348 return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); 348 return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
349} 349}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 22ef8bd26620..5fbf96552cac 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -715,7 +715,7 @@ static struct ip_options *tcp_v4_save_options(struct sock *sk,
715 return dopt; 715 return dopt;
716} 716}
717 717
718struct request_sock_ops tcp_request_sock_ops = { 718struct request_sock_ops tcp_request_sock_ops __read_mostly = {
719 .family = PF_INET, 719 .family = PF_INET,
720 .obj_size = sizeof(struct tcp_request_sock), 720 .obj_size = sizeof(struct tcp_request_sock),
721 .rtx_syn_ack = tcp_v4_send_synack, 721 .rtx_syn_ack = tcp_v4_send_synack,
@@ -1385,7 +1385,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
1385 if (st->state == TCP_SEQ_STATE_OPENREQ) { 1385 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1386 struct request_sock *req = cur; 1386 struct request_sock *req = cur;
1387 1387
1388 icsk = inet_csk(st->syn_wait_sk); 1388 icsk = inet_csk(st->syn_wait_sk);
1389 req = req->dl_next; 1389 req = req->dl_next;
1390 while (1) { 1390 while (1) {
1391 while (req) { 1391 while (req) {
@@ -1395,7 +1395,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
1395 } 1395 }
1396 req = req->dl_next; 1396 req = req->dl_next;
1397 } 1397 }
1398 if (++st->sbucket >= TCP_SYNQ_HSIZE) 1398 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1399 break; 1399 break;
1400get_req: 1400get_req:
1401 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; 1401 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index eb6d145ecfd7..1a3c46c139f8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -526,7 +526,7 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req)
526 kfree_skb(inet6_rsk(req)->pktopts); 526 kfree_skb(inet6_rsk(req)->pktopts);
527} 527}
528 528
529static struct request_sock_ops tcp6_request_sock_ops = { 529static struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
530 .family = AF_INET6, 530 .family = AF_INET6,
531 .obj_size = sizeof(struct tcp6_request_sock), 531 .obj_size = sizeof(struct tcp6_request_sock),
532 .rtx_syn_ack = tcp_v6_send_synack, 532 .rtx_syn_ack = tcp_v6_send_synack,