aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2008-11-20 03:40:07 -0500
committerDavid S. Miller <davem@davemloft.net>2008-11-20 03:40:07 -0500
commit5caea4ea7088e80ac5410d04660346094608b909 (patch)
treefad95133683c002d24ff5de7fb756dad806b41ed /net/ipv4
parentd8b83c57a7e497cba9b5cb156e63176323035785 (diff)
net: listening_hash get a spinlock per bucket
This patch prepares the RCU migration of the listening_hash table for the TCP/DCCP protocols. Because the listening_hash table is small (32 slots per protocol), we add a spinlock to each slot instead of using a single rwlock for the whole table. This should reduce the lock hold time of readers and the contention between writers. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/inet_diag.c12
-rw-r--r--net/ipv4/inet_hashtables.c86
-rw-r--r--net/ipv4/tcp_ipv4.c24
3 files changed, 50 insertions, 72 deletions
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 41b36720e977..1cb154ed75ad 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -718,13 +718,15 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
718 if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) 718 if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
719 goto skip_listen_ht; 719 goto skip_listen_ht;
720 720
721 inet_listen_lock(hashinfo);
722 for (i = s_i; i < INET_LHTABLE_SIZE; i++) { 721 for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
723 struct sock *sk; 722 struct sock *sk;
724 struct hlist_node *node; 723 struct hlist_node *node;
724 struct inet_listen_hashbucket *ilb;
725 725
726 num = 0; 726 num = 0;
727 sk_for_each(sk, node, &hashinfo->listening_hash[i]) { 727 ilb = &hashinfo->listening_hash[i];
728 spin_lock_bh(&ilb->lock);
729 sk_for_each(sk, node, &ilb->head) {
728 struct inet_sock *inet = inet_sk(sk); 730 struct inet_sock *inet = inet_sk(sk);
729 731
730 if (num < s_num) { 732 if (num < s_num) {
@@ -742,7 +744,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
742 goto syn_recv; 744 goto syn_recv;
743 745
744 if (inet_csk_diag_dump(sk, skb, cb) < 0) { 746 if (inet_csk_diag_dump(sk, skb, cb) < 0) {
745 inet_listen_unlock(hashinfo); 747 spin_unlock_bh(&ilb->lock);
746 goto done; 748 goto done;
747 } 749 }
748 750
@@ -751,7 +753,7 @@ syn_recv:
751 goto next_listen; 753 goto next_listen;
752 754
753 if (inet_diag_dump_reqs(skb, sk, cb) < 0) { 755 if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
754 inet_listen_unlock(hashinfo); 756 spin_unlock_bh(&ilb->lock);
755 goto done; 757 goto done;
756 } 758 }
757 759
@@ -760,12 +762,12 @@ next_listen:
760 cb->args[4] = 0; 762 cb->args[4] = 0;
761 ++num; 763 ++num;
762 } 764 }
765 spin_unlock_bh(&ilb->lock);
763 766
764 s_num = 0; 767 s_num = 0;
765 cb->args[3] = 0; 768 cb->args[3] = 0;
766 cb->args[4] = 0; 769 cb->args[4] = 0;
767 } 770 }
768 inet_listen_unlock(hashinfo);
769skip_listen_ht: 771skip_listen_ht:
770 cb->args[0] = 1; 772 cb->args[0] = 1;
771 s_i = num = s_num = 0; 773 s_i = num = s_num = 0;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fd269cfef0ec..377d004e5723 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -111,35 +111,6 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
111EXPORT_SYMBOL_GPL(__inet_inherit_port); 111EXPORT_SYMBOL_GPL(__inet_inherit_port);
112 112
113/* 113/*
114 * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
115 * Look, when several writers sleep and reader wakes them up, all but one
116 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
117 * this, _but_ remember, it adds useless work on UP machines (wake up each
118 * exclusive lock release). It should be ifdefed really.
119 */
120void inet_listen_wlock(struct inet_hashinfo *hashinfo)
121 __acquires(hashinfo->lhash_lock)
122{
123 write_lock(&hashinfo->lhash_lock);
124
125 if (atomic_read(&hashinfo->lhash_users)) {
126 DEFINE_WAIT(wait);
127
128 for (;;) {
129 prepare_to_wait_exclusive(&hashinfo->lhash_wait,
130 &wait, TASK_UNINTERRUPTIBLE);
131 if (!atomic_read(&hashinfo->lhash_users))
132 break;
133 write_unlock_bh(&hashinfo->lhash_lock);
134 schedule();
135 write_lock_bh(&hashinfo->lhash_lock);
136 }
137
138 finish_wait(&hashinfo->lhash_wait, &wait);
139 }
140}
141
142/*
143 * Don't inline this cruft. Here are some nice properties to exploit here. The 114 * Don't inline this cruft. Here are some nice properties to exploit here. The
144 * BSD API does not allow a listening sock to specify the remote port nor the 115 * BSD API does not allow a listening sock to specify the remote port nor the
145 * remote address for the connection. So always assume those are both 116 * remote address for the connection. So always assume those are both
@@ -191,25 +162,25 @@ struct sock *__inet_lookup_listener(struct net *net,
191 const int dif) 162 const int dif)
192{ 163{
193 struct sock *sk = NULL; 164 struct sock *sk = NULL;
194 const struct hlist_head *head; 165 struct inet_listen_hashbucket *ilb;
195 166
196 read_lock(&hashinfo->lhash_lock); 167 ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
197 head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)]; 168 spin_lock(&ilb->lock);
198 if (!hlist_empty(head)) { 169 if (!hlist_empty(&ilb->head)) {
199 const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); 170 const struct inet_sock *inet = inet_sk((sk = __sk_head(&ilb->head)));
200 171
201 if (inet->num == hnum && !sk->sk_node.next && 172 if (inet->num == hnum && !sk->sk_node.next &&
202 (!inet->rcv_saddr || inet->rcv_saddr == daddr) && 173 (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
203 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && 174 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
204 !sk->sk_bound_dev_if && net_eq(sock_net(sk), net)) 175 !sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
205 goto sherry_cache; 176 goto sherry_cache;
206 sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif); 177 sk = inet_lookup_listener_slow(net, &ilb->head, daddr, hnum, dif);
207 } 178 }
208 if (sk) { 179 if (sk) {
209sherry_cache: 180sherry_cache:
210 sock_hold(sk); 181 sock_hold(sk);
211 } 182 }
212 read_unlock(&hashinfo->lhash_lock); 183 spin_unlock(&ilb->lock);
213 return sk; 184 return sk;
214} 185}
215EXPORT_SYMBOL_GPL(__inet_lookup_listener); 186EXPORT_SYMBOL_GPL(__inet_lookup_listener);
@@ -389,8 +360,7 @@ EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
389static void __inet_hash(struct sock *sk) 360static void __inet_hash(struct sock *sk)
390{ 361{
391 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 362 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
392 struct hlist_head *list; 363 struct inet_listen_hashbucket *ilb;
393 rwlock_t *lock;
394 364
395 if (sk->sk_state != TCP_LISTEN) { 365 if (sk->sk_state != TCP_LISTEN) {
396 __inet_hash_nolisten(sk); 366 __inet_hash_nolisten(sk);
@@ -398,14 +368,12 @@ static void __inet_hash(struct sock *sk)
398 } 368 }
399 369
400 WARN_ON(!sk_unhashed(sk)); 370 WARN_ON(!sk_unhashed(sk));
401 list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 371 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
402 lock = &hashinfo->lhash_lock;
403 372
404 inet_listen_wlock(hashinfo); 373 spin_lock(&ilb->lock);
405 __sk_add_node(sk, list); 374 __sk_add_node(sk, &ilb->head);
406 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 375 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
407 write_unlock(lock); 376 spin_unlock(&ilb->lock);
408 wake_up(&hashinfo->lhash_wait);
409} 377}
410 378
411void inet_hash(struct sock *sk) 379void inet_hash(struct sock *sk)
@@ -420,29 +388,27 @@ EXPORT_SYMBOL_GPL(inet_hash);
420 388
421void inet_unhash(struct sock *sk) 389void inet_unhash(struct sock *sk)
422{ 390{
423 rwlock_t *lock;
424 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 391 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
425 392
426 if (sk_unhashed(sk)) 393 if (sk_unhashed(sk))
427 goto out; 394 return;
428 395
429 if (sk->sk_state == TCP_LISTEN) { 396 if (sk->sk_state == TCP_LISTEN) {
430 local_bh_disable(); 397 struct inet_listen_hashbucket *ilb;
431 inet_listen_wlock(hashinfo); 398
432 lock = &hashinfo->lhash_lock; 399 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
400 spin_lock_bh(&ilb->lock);
433 if (__sk_del_node_init(sk)) 401 if (__sk_del_node_init(sk))
434 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 402 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
403 spin_unlock_bh(&ilb->lock);
435 } else { 404 } else {
436 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 405 rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
406
437 write_lock_bh(lock); 407 write_lock_bh(lock);
438 if (__sk_nulls_del_node_init_rcu(sk)) 408 if (__sk_nulls_del_node_init_rcu(sk))
439 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 409 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
410 write_unlock_bh(lock);
440 } 411 }
441
442 write_unlock_bh(lock);
443out:
444 if (sk->sk_state == TCP_LISTEN)
445 wake_up(&hashinfo->lhash_wait);
446} 412}
447EXPORT_SYMBOL_GPL(inet_unhash); 413EXPORT_SYMBOL_GPL(inet_unhash);
448 414
@@ -556,3 +522,13 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
556} 522}
557 523
558EXPORT_SYMBOL_GPL(inet_hash_connect); 524EXPORT_SYMBOL_GPL(inet_hash_connect);
525
526void inet_hashinfo_init(struct inet_hashinfo *h)
527{
528 int i;
529
530 for (i = 0; i < INET_LHTABLE_SIZE; i++)
531 spin_lock_init(&h->listening_hash[i].lock);
532}
533
534EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5559fea61e87..330b08a12274 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -97,11 +97,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
97} 97}
98#endif 98#endif
99 99
100struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { 100struct inet_hashinfo tcp_hashinfo;
101 .lhash_lock = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
102 .lhash_users = ATOMIC_INIT(0),
103 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
104};
105 101
106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb) 102static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107{ 103{
@@ -1874,15 +1870,18 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
1874 struct inet_connection_sock *icsk; 1870 struct inet_connection_sock *icsk;
1875 struct hlist_node *node; 1871 struct hlist_node *node;
1876 struct sock *sk = cur; 1872 struct sock *sk = cur;
1873 struct inet_listen_hashbucket *ilb;
1877 struct tcp_iter_state *st = seq->private; 1874 struct tcp_iter_state *st = seq->private;
1878 struct net *net = seq_file_net(seq); 1875 struct net *net = seq_file_net(seq);
1879 1876
1880 if (!sk) { 1877 if (!sk) {
1881 st->bucket = 0; 1878 st->bucket = 0;
1882 sk = sk_head(&tcp_hashinfo.listening_hash[0]); 1879 ilb = &tcp_hashinfo.listening_hash[0];
1880 spin_lock_bh(&ilb->lock);
1881 sk = sk_head(&ilb->head);
1883 goto get_sk; 1882 goto get_sk;
1884 } 1883 }
1885 1884 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1886 ++st->num; 1885 ++st->num;
1887 1886
1888 if (st->state == TCP_SEQ_STATE_OPENREQ) { 1887 if (st->state == TCP_SEQ_STATE_OPENREQ) {
@@ -1932,8 +1931,11 @@ start_req:
1932 } 1931 }
1933 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); 1932 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1934 } 1933 }
1934 spin_unlock_bh(&ilb->lock);
1935 if (++st->bucket < INET_LHTABLE_SIZE) { 1935 if (++st->bucket < INET_LHTABLE_SIZE) {
1936 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); 1936 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1937 spin_lock_bh(&ilb->lock);
1938 sk = sk_head(&ilb->head);
1937 goto get_sk; 1939 goto get_sk;
1938 } 1940 }
1939 cur = NULL; 1941 cur = NULL;
@@ -2066,12 +2068,10 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2066 void *rc; 2068 void *rc;
2067 struct tcp_iter_state *st = seq->private; 2069 struct tcp_iter_state *st = seq->private;
2068 2070
2069 inet_listen_lock(&tcp_hashinfo);
2070 st->state = TCP_SEQ_STATE_LISTENING; 2071 st->state = TCP_SEQ_STATE_LISTENING;
2071 rc = listening_get_idx(seq, &pos); 2072 rc = listening_get_idx(seq, &pos);
2072 2073
2073 if (!rc) { 2074 if (!rc) {
2074 inet_listen_unlock(&tcp_hashinfo);
2075 st->state = TCP_SEQ_STATE_ESTABLISHED; 2075 st->state = TCP_SEQ_STATE_ESTABLISHED;
2076 rc = established_get_idx(seq, pos); 2076 rc = established_get_idx(seq, pos);
2077 } 2077 }
@@ -2103,7 +2103,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2103 case TCP_SEQ_STATE_LISTENING: 2103 case TCP_SEQ_STATE_LISTENING:
2104 rc = listening_get_next(seq, v); 2104 rc = listening_get_next(seq, v);
2105 if (!rc) { 2105 if (!rc) {
2106 inet_listen_unlock(&tcp_hashinfo);
2107 st->state = TCP_SEQ_STATE_ESTABLISHED; 2106 st->state = TCP_SEQ_STATE_ESTABLISHED;
2108 rc = established_get_first(seq); 2107 rc = established_get_first(seq);
2109 } 2108 }
@@ -2130,7 +2129,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2130 } 2129 }
2131 case TCP_SEQ_STATE_LISTENING: 2130 case TCP_SEQ_STATE_LISTENING:
2132 if (v != SEQ_START_TOKEN) 2131 if (v != SEQ_START_TOKEN)
2133 inet_listen_unlock(&tcp_hashinfo); 2132 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2134 break; 2133 break;
2135 case TCP_SEQ_STATE_TIME_WAIT: 2134 case TCP_SEQ_STATE_TIME_WAIT:
2136 case TCP_SEQ_STATE_ESTABLISHED: 2135 case TCP_SEQ_STATE_ESTABLISHED:
@@ -2405,6 +2404,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
2405 2404
2406void __init tcp_v4_init(void) 2405void __init tcp_v4_init(void)
2407{ 2406{
2407 inet_hashinfo_init(&tcp_hashinfo);
2408 if (register_pernet_device(&tcp_sk_ops)) 2408 if (register_pernet_device(&tcp_sk_ops))
2409 panic("Failed to create the TCP control socket.\n"); 2409 panic("Failed to create the TCP control socket.\n");
2410} 2410}