diff options
author | Eric Dumazet <dada1@cosmosbay.com> | 2006-11-16 05:30:37 -0500 |
---|---|---|
committer | David S. Miller <davem@sunset.davemloft.net> | 2006-12-03 00:21:44 -0500 |
commit | 72a3effaf633bcae9034b7e176bdbd78d64a71db (patch) | |
tree | b7a331527f1b15335a358f97809134f35587e57a | |
parent | 3c62f75aac7348ee262b1295cfcfeb3473f76815 (diff) |
[NET]: Size listen hash tables using backlog hint
We currently allocate a fixed-size hash table (TCP_SYNQ_HSIZE=512 slots) for
each LISTEN socket, regardless of various parameters (the listen backlog, for
example).
On x86_64, this means order-1 allocations (which might fail), even for 'small'
sockets expecting few connections. Conversely, a huge server wanting a
backlog of 50000 is slowed down a bit because of this fixed limit.
This patch makes the sizing of the listen hash table a dynamic parameter,
depending on:
- net.core.somaxconn tunable (default is 128)
- net.ipv4.tcp_max_syn_backlog tunable (default : 256, 1024 or 128)
- backlog value given by user application (2nd parameter of listen())
For large allocations (bigger than PAGE_SIZE), we use vmalloc() instead of
kmalloc().
We still limit memory allocation with the two existing tunables (somaxconn &
tcp_max_syn_backlog). So for standard setups, this patch actually reduces RAM
usage.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/request_sock.h | 8 | ||||
-rw-r--r-- | include/net/tcp.h | 1 | ||||
-rw-r--r-- | net/core/request_sock.c | 35 | ||||
-rw-r--r-- | net/dccp/ipv4.c | 2 | ||||
-rw-r--r-- | net/dccp/proto.c | 6 | ||||
-rw-r--r-- | net/ipv4/af_inet.c | 2 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 6 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 2 |
9 files changed, 39 insertions, 25 deletions
diff --git a/include/net/request_sock.h b/include/net/request_sock.h index f743a941a4f2..b5b023e79e5f 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h | |||
@@ -28,8 +28,8 @@ struct proto; | |||
28 | 28 | ||
29 | struct request_sock_ops { | 29 | struct request_sock_ops { |
30 | int family; | 30 | int family; |
31 | kmem_cache_t *slab; | ||
32 | int obj_size; | 31 | int obj_size; |
32 | kmem_cache_t *slab; | ||
33 | int (*rtx_syn_ack)(struct sock *sk, | 33 | int (*rtx_syn_ack)(struct sock *sk, |
34 | struct request_sock *req, | 34 | struct request_sock *req, |
35 | struct dst_entry *dst); | 35 | struct dst_entry *dst); |
@@ -51,13 +51,13 @@ struct request_sock { | |||
51 | u32 rcv_wnd; /* rcv_wnd offered first time */ | 51 | u32 rcv_wnd; /* rcv_wnd offered first time */ |
52 | u32 ts_recent; | 52 | u32 ts_recent; |
53 | unsigned long expires; | 53 | unsigned long expires; |
54 | struct request_sock_ops *rsk_ops; | 54 | const struct request_sock_ops *rsk_ops; |
55 | struct sock *sk; | 55 | struct sock *sk; |
56 | u32 secid; | 56 | u32 secid; |
57 | u32 peer_secid; | 57 | u32 peer_secid; |
58 | }; | 58 | }; |
59 | 59 | ||
60 | static inline struct request_sock *reqsk_alloc(struct request_sock_ops *ops) | 60 | static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops) |
61 | { | 61 | { |
62 | struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC); | 62 | struct request_sock *req = kmem_cache_alloc(ops->slab, SLAB_ATOMIC); |
63 | 63 | ||
@@ -121,7 +121,7 @@ struct request_sock_queue { | |||
121 | }; | 121 | }; |
122 | 122 | ||
123 | extern int reqsk_queue_alloc(struct request_sock_queue *queue, | 123 | extern int reqsk_queue_alloc(struct request_sock_queue *queue, |
124 | const int nr_table_entries); | 124 | unsigned int nr_table_entries); |
125 | 125 | ||
126 | static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue) | 126 | static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue) |
127 | { | 127 | { |
diff --git a/include/net/tcp.h b/include/net/tcp.h index 7a093d0aa0fe..246916c2321e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
@@ -138,7 +138,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); | |||
138 | #define MAX_TCP_SYNCNT 127 | 138 | #define MAX_TCP_SYNCNT 127 |
139 | 139 | ||
140 | #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ | 140 | #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ |
141 | #define TCP_SYNQ_HSIZE 512 /* Size of SYNACK hash table */ | ||
142 | 141 | ||
143 | #define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) | 142 | #define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) |
144 | #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated | 143 | #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated |
diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 79ebd75fbe4d..5f0818d815e6 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/random.h> | 15 | #include <linux/random.h> |
16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
17 | #include <linux/string.h> | 17 | #include <linux/string.h> |
18 | #include <linux/vmalloc.h> | ||
18 | 19 | ||
19 | #include <net/request_sock.h> | 20 | #include <net/request_sock.h> |
20 | 21 | ||
@@ -29,22 +30,31 @@ | |||
29 | * it is absolutely not enough even at 100conn/sec. 256 cures most | 30 | * it is absolutely not enough even at 100conn/sec. 256 cures most |
30 | * of problems. This value is adjusted to 128 for very small machines | 31 | * of problems. This value is adjusted to 128 for very small machines |
31 | * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). | 32 | * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). |
32 | * Further increasing requires to change hash table size. | 33 | * Note : Dont forget somaxconn that may limit backlog too. |
33 | */ | 34 | */ |
34 | int sysctl_max_syn_backlog = 256; | 35 | int sysctl_max_syn_backlog = 256; |
35 | 36 | ||
36 | int reqsk_queue_alloc(struct request_sock_queue *queue, | 37 | int reqsk_queue_alloc(struct request_sock_queue *queue, |
37 | const int nr_table_entries) | 38 | unsigned int nr_table_entries) |
38 | { | 39 | { |
39 | const int lopt_size = sizeof(struct listen_sock) + | 40 | size_t lopt_size = sizeof(struct listen_sock); |
40 | nr_table_entries * sizeof(struct request_sock *); | 41 | struct listen_sock *lopt; |
41 | struct listen_sock *lopt = kzalloc(lopt_size, GFP_KERNEL); | 42 | |
42 | 43 | nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); | |
44 | nr_table_entries = max_t(u32, nr_table_entries, 8); | ||
45 | nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); | ||
46 | lopt_size += nr_table_entries * sizeof(struct request_sock *); | ||
47 | if (lopt_size > PAGE_SIZE) | ||
48 | lopt = __vmalloc(lopt_size, | ||
49 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | ||
50 | PAGE_KERNEL); | ||
51 | else | ||
52 | lopt = kzalloc(lopt_size, GFP_KERNEL); | ||
43 | if (lopt == NULL) | 53 | if (lopt == NULL) |
44 | return -ENOMEM; | 54 | return -ENOMEM; |
45 | 55 | ||
46 | for (lopt->max_qlen_log = 6; | 56 | for (lopt->max_qlen_log = 3; |
47 | (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog; | 57 | (1 << lopt->max_qlen_log) < nr_table_entries; |
48 | lopt->max_qlen_log++); | 58 | lopt->max_qlen_log++); |
49 | 59 | ||
50 | get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); | 60 | get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); |
@@ -65,9 +75,11 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) | |||
65 | { | 75 | { |
66 | /* make all the listen_opt local to us */ | 76 | /* make all the listen_opt local to us */ |
67 | struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); | 77 | struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); |
78 | size_t lopt_size = sizeof(struct listen_sock) + | ||
79 | lopt->nr_table_entries * sizeof(struct request_sock *); | ||
68 | 80 | ||
69 | if (lopt->qlen != 0) { | 81 | if (lopt->qlen != 0) { |
70 | int i; | 82 | unsigned int i; |
71 | 83 | ||
72 | for (i = 0; i < lopt->nr_table_entries; i++) { | 84 | for (i = 0; i < lopt->nr_table_entries; i++) { |
73 | struct request_sock *req; | 85 | struct request_sock *req; |
@@ -81,7 +93,10 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) | |||
81 | } | 93 | } |
82 | 94 | ||
83 | BUG_TRAP(lopt->qlen == 0); | 95 | BUG_TRAP(lopt->qlen == 0); |
84 | kfree(lopt); | 96 | if (lopt_size > PAGE_SIZE) |
97 | vfree(lopt); | ||
98 | else | ||
99 | kfree(lopt); | ||
85 | } | 100 | } |
86 | 101 | ||
87 | EXPORT_SYMBOL(reqsk_queue_destroy); | 102 | EXPORT_SYMBOL(reqsk_queue_destroy); |
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index e08e7688a263..0a5d68dbb418 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c | |||
@@ -1022,7 +1022,7 @@ static void dccp_v4_reqsk_destructor(struct request_sock *req) | |||
1022 | kfree(inet_rsk(req)->opt); | 1022 | kfree(inet_rsk(req)->opt); |
1023 | } | 1023 | } |
1024 | 1024 | ||
1025 | static struct request_sock_ops dccp_request_sock_ops = { | 1025 | static struct request_sock_ops dccp_request_sock_ops __read_mostly = { |
1026 | .family = PF_INET, | 1026 | .family = PF_INET, |
1027 | .obj_size = sizeof(struct dccp_request_sock), | 1027 | .obj_size = sizeof(struct dccp_request_sock), |
1028 | .rtx_syn_ack = dccp_v4_send_response, | 1028 | .rtx_syn_ack = dccp_v4_send_response, |
diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 72cbdcfc2c65..047d170a363a 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c | |||
@@ -262,12 +262,12 @@ int dccp_destroy_sock(struct sock *sk) | |||
262 | 262 | ||
263 | EXPORT_SYMBOL_GPL(dccp_destroy_sock); | 263 | EXPORT_SYMBOL_GPL(dccp_destroy_sock); |
264 | 264 | ||
265 | static inline int dccp_listen_start(struct sock *sk) | 265 | static inline int dccp_listen_start(struct sock *sk, int backlog) |
266 | { | 266 | { |
267 | struct dccp_sock *dp = dccp_sk(sk); | 267 | struct dccp_sock *dp = dccp_sk(sk); |
268 | 268 | ||
269 | dp->dccps_role = DCCP_ROLE_LISTEN; | 269 | dp->dccps_role = DCCP_ROLE_LISTEN; |
270 | return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); | 270 | return inet_csk_listen_start(sk, backlog); |
271 | } | 271 | } |
272 | 272 | ||
273 | int dccp_disconnect(struct sock *sk, int flags) | 273 | int dccp_disconnect(struct sock *sk, int flags) |
@@ -788,7 +788,7 @@ int inet_dccp_listen(struct socket *sock, int backlog) | |||
788 | * FIXME: here it probably should be sk->sk_prot->listen_start | 788 | * FIXME: here it probably should be sk->sk_prot->listen_start |
789 | * see tcp_listen_start | 789 | * see tcp_listen_start |
790 | */ | 790 | */ |
791 | err = dccp_listen_start(sk); | 791 | err = dccp_listen_start(sk, backlog); |
792 | if (err) | 792 | if (err) |
793 | goto out; | 793 | goto out; |
794 | } | 794 | } |
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index edcf0932ac6d..4a81d54a7569 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c | |||
@@ -204,7 +204,7 @@ int inet_listen(struct socket *sock, int backlog) | |||
204 | * we can only allow the backlog to be adjusted. | 204 | * we can only allow the backlog to be adjusted. |
205 | */ | 205 | */ |
206 | if (old_state != TCP_LISTEN) { | 206 | if (old_state != TCP_LISTEN) { |
207 | err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); | 207 | err = inet_csk_listen_start(sk, backlog); |
208 | if (err) | 208 | if (err) |
209 | goto out; | 209 | goto out; |
210 | } | 210 | } |
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 96bbe2a0aa1b..9d68837888d3 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c | |||
@@ -343,7 +343,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk, | |||
343 | EXPORT_SYMBOL_GPL(inet_csk_route_req); | 343 | EXPORT_SYMBOL_GPL(inet_csk_route_req); |
344 | 344 | ||
345 | static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, | 345 | static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, |
346 | const u32 rnd, const u16 synq_hsize) | 346 | const u32 rnd, const u32 synq_hsize) |
347 | { | 347 | { |
348 | return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); | 348 | return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); |
349 | } | 349 | } |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 22ef8bd26620..5fbf96552cac 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -715,7 +715,7 @@ static struct ip_options *tcp_v4_save_options(struct sock *sk, | |||
715 | return dopt; | 715 | return dopt; |
716 | } | 716 | } |
717 | 717 | ||
718 | struct request_sock_ops tcp_request_sock_ops = { | 718 | struct request_sock_ops tcp_request_sock_ops __read_mostly = { |
719 | .family = PF_INET, | 719 | .family = PF_INET, |
720 | .obj_size = sizeof(struct tcp_request_sock), | 720 | .obj_size = sizeof(struct tcp_request_sock), |
721 | .rtx_syn_ack = tcp_v4_send_synack, | 721 | .rtx_syn_ack = tcp_v4_send_synack, |
@@ -1385,7 +1385,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) | |||
1385 | if (st->state == TCP_SEQ_STATE_OPENREQ) { | 1385 | if (st->state == TCP_SEQ_STATE_OPENREQ) { |
1386 | struct request_sock *req = cur; | 1386 | struct request_sock *req = cur; |
1387 | 1387 | ||
1388 | icsk = inet_csk(st->syn_wait_sk); | 1388 | icsk = inet_csk(st->syn_wait_sk); |
1389 | req = req->dl_next; | 1389 | req = req->dl_next; |
1390 | while (1) { | 1390 | while (1) { |
1391 | while (req) { | 1391 | while (req) { |
@@ -1395,7 +1395,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) | |||
1395 | } | 1395 | } |
1396 | req = req->dl_next; | 1396 | req = req->dl_next; |
1397 | } | 1397 | } |
1398 | if (++st->sbucket >= TCP_SYNQ_HSIZE) | 1398 | if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) |
1399 | break; | 1399 | break; |
1400 | get_req: | 1400 | get_req: |
1401 | req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; | 1401 | req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index eb6d145ecfd7..1a3c46c139f8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -526,7 +526,7 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req) | |||
526 | kfree_skb(inet6_rsk(req)->pktopts); | 526 | kfree_skb(inet6_rsk(req)->pktopts); |
527 | } | 527 | } |
528 | 528 | ||
529 | static struct request_sock_ops tcp6_request_sock_ops = { | 529 | static struct request_sock_ops tcp6_request_sock_ops __read_mostly = { |
530 | .family = AF_INET6, | 530 | .family = AF_INET6, |
531 | .obj_size = sizeof(struct tcp6_request_sock), | 531 | .obj_size = sizeof(struct tcp6_request_sock), |
532 | .rtx_syn_ack = tcp_v6_send_synack, | 532 | .rtx_syn_ack = tcp_v6_send_synack, |