author     Eric Dumazet <dada1@cosmosbay.com>       2008-11-23 20:22:55 -0500
committer  David S. Miller <davem@davemloft.net>    2008-11-23 20:22:55 -0500
commit     c25eb3bfb97294d0543a81230fbc237046b4b84c (patch)
tree       6c9deabfb12f4d31f280cfcfe7e7580a2089931c
parent     8c862c23e2563e6aedfc6c4aa6827cadb83f2414 (diff)
net: Convert TCP/DCCP listening hash tables to use RCU
This is the last step to be able to perform full RCU lookups in
__inet_lookup(): after the established/timewait tables, we add RCU
lookups to the listening hash table.

The only trick here is that a socket of a given type (TCP ipv4,
TCP ipv6, ...) can now be in flight between two different tables
(established and listening) during an RCU grace period, so we must use
different 'nulls' end-of-chain values for the two tables.

We define a large value:

#define LISTENING_NULLS_BASE (1U << 29)

so that slots in the listening table are guaranteed to have different
end-of-chain values than slots in the established table. A reader can
still detect that it finished its lookup in the right chain.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
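To make the 'nulls' trick concrete, here is a minimal user-space sketch (not part of the patch, and not the kernel hlist_nulls API: struct node, make_nulls(), is_nulls(), get_nulls_value() and lookup() below are hypothetical stand-ins modeled on the kernel helpers). It only illustrates the check a lockless reader performs when it falls off a chain: compare the end-of-chain marker against hash + LISTENING_NULLS_BASE and restart if the walk ended on some other chain, as __inet_lookup_listener() does in the diff below.

/* User-space sketch of the nulls end-of-chain idea (hypothetical helpers). */
#include <stdint.h>
#include <stdio.h>

#define LISTENING_NULLS_BASE (1U << 29)

struct node {
        struct node *next;      /* real node, or an odd 'nulls' marker */
        unsigned int port;
};

/* Encode a nulls marker as an odd pointer value: (value << 1) | 1. */
static struct node *make_nulls(unsigned int value)
{
        return (struct node *)((((uintptr_t)value) << 1) | 1);
}

static int is_nulls(const struct node *p)
{
        return (uintptr_t)p & 1;
}

static unsigned int get_nulls_value(const struct node *p)
{
        return (unsigned int)((uintptr_t)p >> 1);
}

/* Walk a chain; if we fell off the end of a chain whose marker is not the
 * one expected for this slot, we may have been moved to another chain
 * during the walk, so restart. */
static struct node *lookup(struct node *head, unsigned int hash,
                           unsigned int port)
{
        struct node *p;
begin:
        for (p = head; !is_nulls(p); p = p->next)
                if (p->port == port)
                        return p;
        if (get_nulls_value(p) != hash + LISTENING_NULLS_BASE)
                goto begin;
        return NULL;
}

int main(void)
{
        unsigned int hash = 3;
        struct node b = { .next = make_nulls(hash + LISTENING_NULLS_BASE),
                          .port = 443 };
        struct node a = { .next = &b, .port = 80 };

        printf("443 %s\n", lookup(&a, hash, 443) ? "found" : "missing");
        printf("22 %s\n",  lookup(&a, hash, 22)  ? "found" : "missing");
        return 0;
}

In the kernel the restart is meaningful because the bucket head is re-read under rcu_read_lock() and a final atomic_inc_not_zero() plus score recheck revalidates the match; the sketch only models the marker comparison.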
-rw-r--r--  include/net/inet_hashtables.h    9
-rw-r--r--  net/ipv4/inet_diag.c             4
-rw-r--r--  net/ipv4/inet_hashtables.c     148
-rw-r--r--  net/ipv4/tcp_ipv4.c              8
-rw-r--r--  net/ipv6/inet6_hashtables.c     94
5 files changed, 147 insertions(+), 116 deletions(-)
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index ec7ee2e46d8c..f44bb5c77a70 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -99,9 +99,16 @@ struct inet_bind_hashbucket {
         struct hlist_head chain;
 };
 
+/*
+ * Sockets can be hashed in established or listening table
+ * We must use different 'nulls' end-of-chain value for listening
+ * hash table, or we might find a socket that was closed and
+ * reallocated/inserted into established hash table
+ */
+#define LISTENING_NULLS_BASE (1U << 29)
 struct inet_listen_hashbucket {
         spinlock_t lock;
-        struct hlist_head head;
+        struct hlist_nulls_head head;
 };
 
 /* This is for listening sockets, thus all sockets which possess wildcards. */
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 998a78f169ff..588a7796e3e3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -720,13 +720,13 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
         for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
                 struct sock *sk;
-                struct hlist_node *node;
+                struct hlist_nulls_node *node;
                 struct inet_listen_hashbucket *ilb;
 
                 num = 0;
                 ilb = &hashinfo->listening_hash[i];
                 spin_lock_bh(&ilb->lock);
-                sk_for_each(sk, node, &ilb->head) {
+                sk_nulls_for_each(sk, node, &ilb->head) {
                         struct inet_sock *inet = inet_sk(sk);
 
                         if (num < s_num) {
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 4c273a9981a6..11fcb87a1fdd 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -110,78 +110,79 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
 
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
+static inline int compute_score(struct sock *sk, struct net *net,
+                                const unsigned short hnum, const __be32 daddr,
+                                const int dif)
+{
+        int score = -1;
+        struct inet_sock *inet = inet_sk(sk);
+
+        if (net_eq(sock_net(sk), net) && inet->num == hnum &&
+                        !ipv6_only_sock(sk)) {
+                __be32 rcv_saddr = inet->rcv_saddr;
+                score = sk->sk_family == PF_INET ? 1 : 0;
+                if (rcv_saddr) {
+                        if (rcv_saddr != daddr)
+                                return -1;
+                        score += 2;
+                }
+                if (sk->sk_bound_dev_if) {
+                        if (sk->sk_bound_dev_if != dif)
+                                return -1;
+                        score += 2;
+                }
+        }
+        return score;
+}
+
 /*
  * Don't inline this cruft. Here are some nice properties to exploit here. The
  * BSD API does not allow a listening sock to specify the remote port nor the
  * remote address for the connection. So always assume those are both
  * wildcarded during the search since they can never be otherwise.
  */
-static struct sock *inet_lookup_listener_slow(struct net *net,
-                                              const struct hlist_head *head,
-                                              const __be32 daddr,
-                                              const unsigned short hnum,
-                                              const int dif)
-{
-        struct sock *result = NULL, *sk;
-        const struct hlist_node *node;
-        int hiscore = -1;
-
-        sk_for_each(sk, node, head) {
-                const struct inet_sock *inet = inet_sk(sk);
-
-                if (net_eq(sock_net(sk), net) && inet->num == hnum &&
-                                !ipv6_only_sock(sk)) {
-                        const __be32 rcv_saddr = inet->rcv_saddr;
-                        int score = sk->sk_family == PF_INET ? 1 : 0;
-
-                        if (rcv_saddr) {
-                                if (rcv_saddr != daddr)
-                                        continue;
-                                score += 2;
-                        }
-                        if (sk->sk_bound_dev_if) {
-                                if (sk->sk_bound_dev_if != dif)
-                                        continue;
-                                score += 2;
-                        }
-                        if (score == 5)
-                                return sk;
-                        if (score > hiscore) {
-                                hiscore = score;
-                                result = sk;
-                        }
-                }
-        }
-        return result;
-}
 
-/* Optimize the common listener case. */
+
 struct sock *__inet_lookup_listener(struct net *net,
                                     struct inet_hashinfo *hashinfo,
                                     const __be32 daddr, const unsigned short hnum,
                                     const int dif)
 {
-        struct sock *sk = NULL;
-        struct inet_listen_hashbucket *ilb;
+        struct sock *sk, *result;
+        struct hlist_nulls_node *node;
+        unsigned int hash = inet_lhashfn(net, hnum);
+        struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+        int score, hiscore;
 
-        ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
-        spin_lock(&ilb->lock);
-        if (!hlist_empty(&ilb->head)) {
-                const struct inet_sock *inet = inet_sk((sk = __sk_head(&ilb->head)));
-
-                if (inet->num == hnum && !sk->sk_node.next &&
-                    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
-                    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
-                    !sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
-                        goto sherry_cache;
-                sk = inet_lookup_listener_slow(net, &ilb->head, daddr, hnum, dif);
+        rcu_read_lock();
+begin:
+        result = NULL;
+        hiscore = -1;
+        sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+                score = compute_score(sk, net, hnum, daddr, dif);
+                if (score > hiscore) {
+                        result = sk;
+                        hiscore = score;
+                }
         }
-        if (sk) {
-sherry_cache:
-                sock_hold(sk);
+        /*
+         * if the nulls value we got at the end of this lookup is
+         * not the expected one, we must restart lookup.
+         * We probably met an item that was moved to another chain.
+         */
+        if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+                goto begin;
+        if (result) {
+                if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+                        result = NULL;
+                else if (unlikely(compute_score(result, net, hnum, daddr,
+                                  dif) < hiscore)) {
+                        sock_put(result);
+                        goto begin;
+                }
         }
-        spin_unlock(&ilb->lock);
-        return sk;
+        rcu_read_unlock();
+        return result;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
 
@@ -370,7 +371,7 @@ static void __inet_hash(struct sock *sk)
         ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 
         spin_lock(&ilb->lock);
-        __sk_add_node(sk, &ilb->head);
+        __sk_nulls_add_node_rcu(sk, &ilb->head);
         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
         spin_unlock(&ilb->lock);
 }
@@ -388,26 +389,22 @@ EXPORT_SYMBOL_GPL(inet_hash);
 void inet_unhash(struct sock *sk)
 {
         struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+        spinlock_t *lock;
+        int done;
 
         if (sk_unhashed(sk))
                 return;
 
-        if (sk->sk_state == TCP_LISTEN) {
-                struct inet_listen_hashbucket *ilb;
+        if (sk->sk_state == TCP_LISTEN)
+                lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
+        else
+                lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
-                ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-                spin_lock_bh(&ilb->lock);
-                if (__sk_del_node_init(sk))
-                        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-                spin_unlock_bh(&ilb->lock);
-        } else {
-                spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-
-                spin_lock_bh(lock);
-                if (__sk_nulls_del_node_init_rcu(sk))
-                        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-                spin_unlock_bh(lock);
-        }
+        spin_lock_bh(lock);
+        done =__sk_nulls_del_node_init_rcu(sk);
+        spin_unlock_bh(lock);
+        if (done)
+                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
 
@@ -526,8 +523,11 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
 {
         int i;
 
-        for (i = 0; i < INET_LHTABLE_SIZE; i++)
+        for (i = 0; i < INET_LHTABLE_SIZE; i++) {
                 spin_lock_init(&h->listening_hash[i].lock);
+                INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
+                                      i + LISTENING_NULLS_BASE);
+        }
 }
 
 EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a81caa1be0cf..cab2458f86fd 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1868,7 +1868,7 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
         struct inet_connection_sock *icsk;
-        struct hlist_node *node;
+        struct hlist_nulls_node *node;
         struct sock *sk = cur;
         struct inet_listen_hashbucket *ilb;
         struct tcp_iter_state *st = seq->private;
@@ -1878,7 +1878,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
                 st->bucket = 0;
                 ilb = &tcp_hashinfo.listening_hash[0];
                 spin_lock_bh(&ilb->lock);
-                sk = sk_head(&ilb->head);
+                sk = sk_nulls_head(&ilb->head);
                 goto get_sk;
         }
         ilb = &tcp_hashinfo.listening_hash[st->bucket];
@@ -1914,7 +1914,7 @@ get_req:
                 sk = sk_next(sk);
         }
 get_sk:
-        sk_for_each_from(sk, node) {
+        sk_nulls_for_each_from(sk, node) {
                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
                         cur = sk;
                         goto out;
@@ -1935,7 +1935,7 @@ start_req:
         if (++st->bucket < INET_LHTABLE_SIZE) {
                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
                 spin_lock_bh(&ilb->lock);
-                sk = sk_head(&ilb->head);
+                sk = sk_nulls_head(&ilb->head);
                 goto get_sk;
         }
         cur = NULL;
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index e0fd68187f83..8fe267feb81e 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -33,7 +33,7 @@ void __inet6_hash(struct sock *sk)
 
                 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
                 spin_lock(&ilb->lock);
-                __sk_add_node(sk, &ilb->head);
+                __sk_nulls_add_node_rcu(sk, &ilb->head);
                 spin_unlock(&ilb->lock);
         } else {
                 unsigned int hash;
@@ -118,47 +118,71 @@ out:
 }
 EXPORT_SYMBOL(__inet6_lookup_established);
 
+static int inline compute_score(struct sock *sk, struct net *net,
+                                const unsigned short hnum,
+                                const struct in6_addr *daddr,
+                                const int dif)
+{
+        int score = -1;
+
+        if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum &&
+            sk->sk_family == PF_INET6) {
+                const struct ipv6_pinfo *np = inet6_sk(sk);
+
+                score = 1;
+                if (!ipv6_addr_any(&np->rcv_saddr)) {
+                        if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
+                                return -1;
+                        score++;
+                }
+                if (sk->sk_bound_dev_if) {
+                        if (sk->sk_bound_dev_if != dif)
+                                return -1;
+                        score++;
+                }
+        }
+        return score;
+}
+
 struct sock *inet6_lookup_listener(struct net *net,
                 struct inet_hashinfo *hashinfo, const struct in6_addr *daddr,
                 const unsigned short hnum, const int dif)
 {
         struct sock *sk;
-        const struct hlist_node *node;
-        struct sock *result = NULL;
-        int score, hiscore = 0;
-        struct inet_listen_hashbucket *ilb;
-
-        ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
-        spin_lock(&ilb->lock);
-        sk_for_each(sk, node, &ilb->head) {
-                if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum &&
-                                sk->sk_family == PF_INET6) {
-                        const struct ipv6_pinfo *np = inet6_sk(sk);
-
-                        score = 1;
-                        if (!ipv6_addr_any(&np->rcv_saddr)) {
-                                if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
-                                        continue;
-                                score++;
-                        }
-                        if (sk->sk_bound_dev_if) {
-                                if (sk->sk_bound_dev_if != dif)
-                                        continue;
-                                score++;
-                        }
-                        if (score == 3) {
-                                result = sk;
-                                break;
-                        }
-                        if (score > hiscore) {
-                                hiscore = score;
-                                result = sk;
-                        }
+        const struct hlist_nulls_node *node;
+        struct sock *result;
+        int score, hiscore;
+        unsigned int hash = inet_lhashfn(net, hnum);
+        struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
+
+        rcu_read_lock();
+begin:
+        result = NULL;
+        hiscore = -1;
+        sk_nulls_for_each(sk, node, &ilb->head) {
+                score = compute_score(sk, net, hnum, daddr, dif);
+                if (score > hiscore) {
+                        hiscore = score;
+                        result = sk;
                 }
         }
-        if (result)
-                sock_hold(result);
-        spin_unlock(&ilb->lock);
+        /*
+         * if the nulls value we got at the end of this lookup is
+         * not the expected one, we must restart lookup.
+         * We probably met an item that was moved to another chain.
+         */
+        if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
+                goto begin;
+        if (result) {
+                if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+                        result = NULL;
+                else if (unlikely(compute_score(result, net, hnum, daddr,
+                                  dif) < hiscore)) {
+                        sock_put(result);
+                        goto begin;
+                }
+        }
+        rcu_read_unlock();
         return result;
 }
 