diff options
author | Eric Dumazet <dada1@cosmosbay.com> | 2008-11-20 03:40:07 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-11-20 03:40:07 -0500 |
commit | 5caea4ea7088e80ac5410d04660346094608b909 (patch) | |
tree | fad95133683c002d24ff5de7fb756dad806b41ed /net/ipv4 | |
parent | d8b83c57a7e497cba9b5cb156e63176323035785 (diff) |
net: listening_hash gets a spinlock per bucket
This patch prepares the RCU migration of the listening_hash table for
the TCP/DCCP protocols.
Since the listening_hash table is small (32 slots per protocol), we add
a spinlock for each slot instead of a single rwlock for the whole table.
This should reduce the lock hold time of readers and improve concurrency among writers.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/inet_diag.c | 12 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 86 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 24 |
3 files changed, 50 insertions, 72 deletions
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 41b36720e977..1cb154ed75ad 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c | |||
@@ -718,13 +718,15 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) | |||
718 | if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) | 718 | if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) |
719 | goto skip_listen_ht; | 719 | goto skip_listen_ht; |
720 | 720 | ||
721 | inet_listen_lock(hashinfo); | ||
722 | for (i = s_i; i < INET_LHTABLE_SIZE; i++) { | 721 | for (i = s_i; i < INET_LHTABLE_SIZE; i++) { |
723 | struct sock *sk; | 722 | struct sock *sk; |
724 | struct hlist_node *node; | 723 | struct hlist_node *node; |
724 | struct inet_listen_hashbucket *ilb; | ||
725 | 725 | ||
726 | num = 0; | 726 | num = 0; |
727 | sk_for_each(sk, node, &hashinfo->listening_hash[i]) { | 727 | ilb = &hashinfo->listening_hash[i]; |
728 | spin_lock_bh(&ilb->lock); | ||
729 | sk_for_each(sk, node, &ilb->head) { | ||
728 | struct inet_sock *inet = inet_sk(sk); | 730 | struct inet_sock *inet = inet_sk(sk); |
729 | 731 | ||
730 | if (num < s_num) { | 732 | if (num < s_num) { |
@@ -742,7 +744,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) | |||
742 | goto syn_recv; | 744 | goto syn_recv; |
743 | 745 | ||
744 | if (inet_csk_diag_dump(sk, skb, cb) < 0) { | 746 | if (inet_csk_diag_dump(sk, skb, cb) < 0) { |
745 | inet_listen_unlock(hashinfo); | 747 | spin_unlock_bh(&ilb->lock); |
746 | goto done; | 748 | goto done; |
747 | } | 749 | } |
748 | 750 | ||
@@ -751,7 +753,7 @@ syn_recv: | |||
751 | goto next_listen; | 753 | goto next_listen; |
752 | 754 | ||
753 | if (inet_diag_dump_reqs(skb, sk, cb) < 0) { | 755 | if (inet_diag_dump_reqs(skb, sk, cb) < 0) { |
754 | inet_listen_unlock(hashinfo); | 756 | spin_unlock_bh(&ilb->lock); |
755 | goto done; | 757 | goto done; |
756 | } | 758 | } |
757 | 759 | ||
@@ -760,12 +762,12 @@ next_listen: | |||
760 | cb->args[4] = 0; | 762 | cb->args[4] = 0; |
761 | ++num; | 763 | ++num; |
762 | } | 764 | } |
765 | spin_unlock_bh(&ilb->lock); | ||
763 | 766 | ||
764 | s_num = 0; | 767 | s_num = 0; |
765 | cb->args[3] = 0; | 768 | cb->args[3] = 0; |
766 | cb->args[4] = 0; | 769 | cb->args[4] = 0; |
767 | } | 770 | } |
768 | inet_listen_unlock(hashinfo); | ||
769 | skip_listen_ht: | 771 | skip_listen_ht: |
770 | cb->args[0] = 1; | 772 | cb->args[0] = 1; |
771 | s_i = num = s_num = 0; | 773 | s_i = num = s_num = 0; |
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fd269cfef0ec..377d004e5723 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c | |||
@@ -111,35 +111,6 @@ void __inet_inherit_port(struct sock *sk, struct sock *child) | |||
111 | EXPORT_SYMBOL_GPL(__inet_inherit_port); | 111 | EXPORT_SYMBOL_GPL(__inet_inherit_port); |
112 | 112 | ||
113 | /* | 113 | /* |
114 | * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. | ||
115 | * Look, when several writers sleep and reader wakes them up, all but one | ||
116 | * immediately hit write lock and grab all the cpus. Exclusive sleep solves | ||
117 | * this, _but_ remember, it adds useless work on UP machines (wake up each | ||
118 | * exclusive lock release). It should be ifdefed really. | ||
119 | */ | ||
120 | void inet_listen_wlock(struct inet_hashinfo *hashinfo) | ||
121 | __acquires(hashinfo->lhash_lock) | ||
122 | { | ||
123 | write_lock(&hashinfo->lhash_lock); | ||
124 | |||
125 | if (atomic_read(&hashinfo->lhash_users)) { | ||
126 | DEFINE_WAIT(wait); | ||
127 | |||
128 | for (;;) { | ||
129 | prepare_to_wait_exclusive(&hashinfo->lhash_wait, | ||
130 | &wait, TASK_UNINTERRUPTIBLE); | ||
131 | if (!atomic_read(&hashinfo->lhash_users)) | ||
132 | break; | ||
133 | write_unlock_bh(&hashinfo->lhash_lock); | ||
134 | schedule(); | ||
135 | write_lock_bh(&hashinfo->lhash_lock); | ||
136 | } | ||
137 | |||
138 | finish_wait(&hashinfo->lhash_wait, &wait); | ||
139 | } | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * Don't inline this cruft. Here are some nice properties to exploit here. The | 114 | * Don't inline this cruft. Here are some nice properties to exploit here. The |
144 | * BSD API does not allow a listening sock to specify the remote port nor the | 115 | * BSD API does not allow a listening sock to specify the remote port nor the |
145 | * remote address for the connection. So always assume those are both | 116 | * remote address for the connection. So always assume those are both |
@@ -191,25 +162,25 @@ struct sock *__inet_lookup_listener(struct net *net, | |||
191 | const int dif) | 162 | const int dif) |
192 | { | 163 | { |
193 | struct sock *sk = NULL; | 164 | struct sock *sk = NULL; |
194 | const struct hlist_head *head; | 165 | struct inet_listen_hashbucket *ilb; |
195 | 166 | ||
196 | read_lock(&hashinfo->lhash_lock); | 167 | ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)]; |
197 | head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)]; | 168 | spin_lock(&ilb->lock); |
198 | if (!hlist_empty(head)) { | 169 | if (!hlist_empty(&ilb->head)) { |
199 | const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); | 170 | const struct inet_sock *inet = inet_sk((sk = __sk_head(&ilb->head))); |
200 | 171 | ||
201 | if (inet->num == hnum && !sk->sk_node.next && | 172 | if (inet->num == hnum && !sk->sk_node.next && |
202 | (!inet->rcv_saddr || inet->rcv_saddr == daddr) && | 173 | (!inet->rcv_saddr || inet->rcv_saddr == daddr) && |
203 | (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && | 174 | (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && |
204 | !sk->sk_bound_dev_if && net_eq(sock_net(sk), net)) | 175 | !sk->sk_bound_dev_if && net_eq(sock_net(sk), net)) |
205 | goto sherry_cache; | 176 | goto sherry_cache; |
206 | sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif); | 177 | sk = inet_lookup_listener_slow(net, &ilb->head, daddr, hnum, dif); |
207 | } | 178 | } |
208 | if (sk) { | 179 | if (sk) { |
209 | sherry_cache: | 180 | sherry_cache: |
210 | sock_hold(sk); | 181 | sock_hold(sk); |
211 | } | 182 | } |
212 | read_unlock(&hashinfo->lhash_lock); | 183 | spin_unlock(&ilb->lock); |
213 | return sk; | 184 | return sk; |
214 | } | 185 | } |
215 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); | 186 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); |
@@ -389,8 +360,7 @@ EXPORT_SYMBOL_GPL(__inet_hash_nolisten); | |||
389 | static void __inet_hash(struct sock *sk) | 360 | static void __inet_hash(struct sock *sk) |
390 | { | 361 | { |
391 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; | 362 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
392 | struct hlist_head *list; | 363 | struct inet_listen_hashbucket *ilb; |
393 | rwlock_t *lock; | ||
394 | 364 | ||
395 | if (sk->sk_state != TCP_LISTEN) { | 365 | if (sk->sk_state != TCP_LISTEN) { |
396 | __inet_hash_nolisten(sk); | 366 | __inet_hash_nolisten(sk); |
@@ -398,14 +368,12 @@ static void __inet_hash(struct sock *sk) | |||
398 | } | 368 | } |
399 | 369 | ||
400 | WARN_ON(!sk_unhashed(sk)); | 370 | WARN_ON(!sk_unhashed(sk)); |
401 | list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; | 371 | ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
402 | lock = &hashinfo->lhash_lock; | ||
403 | 372 | ||
404 | inet_listen_wlock(hashinfo); | 373 | spin_lock(&ilb->lock); |
405 | __sk_add_node(sk, list); | 374 | __sk_add_node(sk, &ilb->head); |
406 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 375 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
407 | write_unlock(lock); | 376 | spin_unlock(&ilb->lock); |
408 | wake_up(&hashinfo->lhash_wait); | ||
409 | } | 377 | } |
410 | 378 | ||
411 | void inet_hash(struct sock *sk) | 379 | void inet_hash(struct sock *sk) |
@@ -420,29 +388,27 @@ EXPORT_SYMBOL_GPL(inet_hash); | |||
420 | 388 | ||
421 | void inet_unhash(struct sock *sk) | 389 | void inet_unhash(struct sock *sk) |
422 | { | 390 | { |
423 | rwlock_t *lock; | ||
424 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; | 391 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
425 | 392 | ||
426 | if (sk_unhashed(sk)) | 393 | if (sk_unhashed(sk)) |
427 | goto out; | 394 | return; |
428 | 395 | ||
429 | if (sk->sk_state == TCP_LISTEN) { | 396 | if (sk->sk_state == TCP_LISTEN) { |
430 | local_bh_disable(); | 397 | struct inet_listen_hashbucket *ilb; |
431 | inet_listen_wlock(hashinfo); | 398 | |
432 | lock = &hashinfo->lhash_lock; | 399 | ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
400 | spin_lock_bh(&ilb->lock); | ||
433 | if (__sk_del_node_init(sk)) | 401 | if (__sk_del_node_init(sk)) |
434 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | 402 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
403 | spin_unlock_bh(&ilb->lock); | ||
435 | } else { | 404 | } else { |
436 | lock = inet_ehash_lockp(hashinfo, sk->sk_hash); | 405 | rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); |
406 | |||
437 | write_lock_bh(lock); | 407 | write_lock_bh(lock); |
438 | if (__sk_nulls_del_node_init_rcu(sk)) | 408 | if (__sk_nulls_del_node_init_rcu(sk)) |
439 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | 409 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
410 | write_unlock_bh(lock); | ||
440 | } | 411 | } |
441 | |||
442 | write_unlock_bh(lock); | ||
443 | out: | ||
444 | if (sk->sk_state == TCP_LISTEN) | ||
445 | wake_up(&hashinfo->lhash_wait); | ||
446 | } | 412 | } |
447 | EXPORT_SYMBOL_GPL(inet_unhash); | 413 | EXPORT_SYMBOL_GPL(inet_unhash); |
448 | 414 | ||
@@ -556,3 +522,13 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, | |||
556 | } | 522 | } |
557 | 523 | ||
558 | EXPORT_SYMBOL_GPL(inet_hash_connect); | 524 | EXPORT_SYMBOL_GPL(inet_hash_connect); |
525 | |||
526 | void inet_hashinfo_init(struct inet_hashinfo *h) | ||
527 | { | ||
528 | int i; | ||
529 | |||
530 | for (i = 0; i < INET_LHTABLE_SIZE; i++) | ||
531 | spin_lock_init(&h->listening_hash[i].lock); | ||
532 | } | ||
533 | |||
534 | EXPORT_SYMBOL_GPL(inet_hashinfo_init); | ||
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5559fea61e87..330b08a12274 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -97,11 +97,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) | |||
97 | } | 97 | } |
98 | #endif | 98 | #endif |
99 | 99 | ||
100 | struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { | 100 | struct inet_hashinfo tcp_hashinfo; |
101 | .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock), | ||
102 | .lhash_users = ATOMIC_INIT(0), | ||
103 | .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), | ||
104 | }; | ||
105 | 101 | ||
106 | static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) | 102 | static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) |
107 | { | 103 | { |
@@ -1874,15 +1870,18 @@ static void *listening_get_next(struct seq_file *seq, void *cur) | |||
1874 | struct inet_connection_sock *icsk; | 1870 | struct inet_connection_sock *icsk; |
1875 | struct hlist_node *node; | 1871 | struct hlist_node *node; |
1876 | struct sock *sk = cur; | 1872 | struct sock *sk = cur; |
1873 | struct inet_listen_hashbucket *ilb; | ||
1877 | struct tcp_iter_state *st = seq->private; | 1874 | struct tcp_iter_state *st = seq->private; |
1878 | struct net *net = seq_file_net(seq); | 1875 | struct net *net = seq_file_net(seq); |
1879 | 1876 | ||
1880 | if (!sk) { | 1877 | if (!sk) { |
1881 | st->bucket = 0; | 1878 | st->bucket = 0; |
1882 | sk = sk_head(&tcp_hashinfo.listening_hash[0]); | 1879 | ilb = &tcp_hashinfo.listening_hash[0]; |
1880 | spin_lock_bh(&ilb->lock); | ||
1881 | sk = sk_head(&ilb->head); | ||
1883 | goto get_sk; | 1882 | goto get_sk; |
1884 | } | 1883 | } |
1885 | 1884 | ilb = &tcp_hashinfo.listening_hash[st->bucket]; | |
1886 | ++st->num; | 1885 | ++st->num; |
1887 | 1886 | ||
1888 | if (st->state == TCP_SEQ_STATE_OPENREQ) { | 1887 | if (st->state == TCP_SEQ_STATE_OPENREQ) { |
@@ -1932,8 +1931,11 @@ start_req: | |||
1932 | } | 1931 | } |
1933 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); | 1932 | read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); |
1934 | } | 1933 | } |
1934 | spin_unlock_bh(&ilb->lock); | ||
1935 | if (++st->bucket < INET_LHTABLE_SIZE) { | 1935 | if (++st->bucket < INET_LHTABLE_SIZE) { |
1936 | sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); | 1936 | ilb = &tcp_hashinfo.listening_hash[st->bucket]; |
1937 | spin_lock_bh(&ilb->lock); | ||
1938 | sk = sk_head(&ilb->head); | ||
1937 | goto get_sk; | 1939 | goto get_sk; |
1938 | } | 1940 | } |
1939 | cur = NULL; | 1941 | cur = NULL; |
@@ -2066,12 +2068,10 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) | |||
2066 | void *rc; | 2068 | void *rc; |
2067 | struct tcp_iter_state *st = seq->private; | 2069 | struct tcp_iter_state *st = seq->private; |
2068 | 2070 | ||
2069 | inet_listen_lock(&tcp_hashinfo); | ||
2070 | st->state = TCP_SEQ_STATE_LISTENING; | 2071 | st->state = TCP_SEQ_STATE_LISTENING; |
2071 | rc = listening_get_idx(seq, &pos); | 2072 | rc = listening_get_idx(seq, &pos); |
2072 | 2073 | ||
2073 | if (!rc) { | 2074 | if (!rc) { |
2074 | inet_listen_unlock(&tcp_hashinfo); | ||
2075 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 2075 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
2076 | rc = established_get_idx(seq, pos); | 2076 | rc = established_get_idx(seq, pos); |
2077 | } | 2077 | } |
@@ -2103,7 +2103,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
2103 | case TCP_SEQ_STATE_LISTENING: | 2103 | case TCP_SEQ_STATE_LISTENING: |
2104 | rc = listening_get_next(seq, v); | 2104 | rc = listening_get_next(seq, v); |
2105 | if (!rc) { | 2105 | if (!rc) { |
2106 | inet_listen_unlock(&tcp_hashinfo); | ||
2107 | st->state = TCP_SEQ_STATE_ESTABLISHED; | 2106 | st->state = TCP_SEQ_STATE_ESTABLISHED; |
2108 | rc = established_get_first(seq); | 2107 | rc = established_get_first(seq); |
2109 | } | 2108 | } |
@@ -2130,7 +2129,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) | |||
2130 | } | 2129 | } |
2131 | case TCP_SEQ_STATE_LISTENING: | 2130 | case TCP_SEQ_STATE_LISTENING: |
2132 | if (v != SEQ_START_TOKEN) | 2131 | if (v != SEQ_START_TOKEN) |
2133 | inet_listen_unlock(&tcp_hashinfo); | 2132 | spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); |
2134 | break; | 2133 | break; |
2135 | case TCP_SEQ_STATE_TIME_WAIT: | 2134 | case TCP_SEQ_STATE_TIME_WAIT: |
2136 | case TCP_SEQ_STATE_ESTABLISHED: | 2135 | case TCP_SEQ_STATE_ESTABLISHED: |
@@ -2405,6 +2404,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { | |||
2405 | 2404 | ||
2406 | void __init tcp_v4_init(void) | 2405 | void __init tcp_v4_init(void) |
2407 | { | 2406 | { |
2407 | inet_hashinfo_init(&tcp_hashinfo); | ||
2408 | if (register_pernet_device(&tcp_sk_ops)) | 2408 | if (register_pernet_device(&tcp_sk_ops)) |
2409 | panic("Failed to create the TCP control socket.\n"); | 2409 | panic("Failed to create the TCP control socket.\n"); |
2410 | } | 2410 | } |