diff options
author | Eric Dumazet <dada1@cosmosbay.com> | 2008-11-23 20:22:55 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-11-23 20:22:55 -0500 |
commit | c25eb3bfb97294d0543a81230fbc237046b4b84c (patch) | |
tree | 6c9deabfb12f4d31f280cfcfe7e7580a2089931c /net/ipv4/inet_hashtables.c | |
parent | 8c862c23e2563e6aedfc6c4aa6827cadb83f2414 (diff) |
net: Convert TCP/DCCP listening hash tables to use RCU
This is the last step to be able to perform full RCU lookups
in __inet_lookup() : After established/timewait tables, we
add RCU lookups to listening hash table.
The only trick here is that a socket of a given type (TCP ipv4,
TCP ipv6, ...) can now move between two different tables
(established and listening) during an RCU grace period, so we
must use different 'nulls' end-of-chain values for the two tables.
We define a large value :
#define LISTENING_NULLS_BASE (1U << 29)
So that slots in listening table are guaranteed to have different
end-of-chain values than slots in established table. A reader can
still detect it finished its lookup in the right chain.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/inet_hashtables.c')
-rw-r--r-- | net/ipv4/inet_hashtables.c | 148 |
1 files changed, 74 insertions, 74 deletions
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4c273a9981a6..11fcb87a1fdd 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c | |||
@@ -110,78 +110,79 @@ void __inet_inherit_port(struct sock *sk, struct sock *child) | |||
110 | 110 | ||
111 | EXPORT_SYMBOL_GPL(__inet_inherit_port); | 111 | EXPORT_SYMBOL_GPL(__inet_inherit_port); |
112 | 112 | ||
113 | static inline int compute_score(struct sock *sk, struct net *net, | ||
114 | const unsigned short hnum, const __be32 daddr, | ||
115 | const int dif) | ||
116 | { | ||
117 | int score = -1; | ||
118 | struct inet_sock *inet = inet_sk(sk); | ||
119 | |||
120 | if (net_eq(sock_net(sk), net) && inet->num == hnum && | ||
121 | !ipv6_only_sock(sk)) { | ||
122 | __be32 rcv_saddr = inet->rcv_saddr; | ||
123 | score = sk->sk_family == PF_INET ? 1 : 0; | ||
124 | if (rcv_saddr) { | ||
125 | if (rcv_saddr != daddr) | ||
126 | return -1; | ||
127 | score += 2; | ||
128 | } | ||
129 | if (sk->sk_bound_dev_if) { | ||
130 | if (sk->sk_bound_dev_if != dif) | ||
131 | return -1; | ||
132 | score += 2; | ||
133 | } | ||
134 | } | ||
135 | return score; | ||
136 | } | ||
137 | |||
/*
 * Don't inline this cruft. Here are some nice properties to exploit here. The
 * BSD API does not allow a listening sock to specify the remote port nor the
 * remote address for the connection. So always assume those are both
 * wildcarded during the search since they can never be otherwise.
 */


/*
 * Lockless (RCU) lookup of the best-matching listening socket for
 * (net, daddr, hnum, dif).  Returns a socket with its refcount taken,
 * or NULL.  Writers still serialize on ilb->lock; readers rely on RCU
 * plus the per-table 'nulls' end-of-chain marker to detect a socket
 * that migrated to another chain mid-walk.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore;

	rcu_read_lock();
begin:
	result = NULL;
	hiscore = -1;
	/* Walk the bucket without the lock; keep the best-scoring socket. */
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif);
		if (score > hiscore) {
			result = sk;
			hiscore = score;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
		goto begin;
	if (result) {
		/* Only take a reference if the socket is not being freed. */
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		/*
		 * Re-check the score under the reference: the socket may
		 * have been rebound/changed since we scored it; if it no
		 * longer matches as well, drop it and restart.
		 */
		else if (unlikely(compute_score(result, net, hnum, daddr,
				  dif) < hiscore)) {
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
187 | 188 | ||
@@ -370,7 +371,7 @@ static void __inet_hash(struct sock *sk) | |||
370 | ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; | 371 | ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
371 | 372 | ||
372 | spin_lock(&ilb->lock); | 373 | spin_lock(&ilb->lock); |
373 | __sk_add_node(sk, &ilb->head); | 374 | __sk_nulls_add_node_rcu(sk, &ilb->head); |
374 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 375 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
375 | spin_unlock(&ilb->lock); | 376 | spin_unlock(&ilb->lock); |
376 | } | 377 | } |
@@ -388,26 +389,22 @@ EXPORT_SYMBOL_GPL(inet_hash); | |||
388 | void inet_unhash(struct sock *sk) | 389 | void inet_unhash(struct sock *sk) |
389 | { | 390 | { |
390 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; | 391 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
392 | spinlock_t *lock; | ||
393 | int done; | ||
391 | 394 | ||
392 | if (sk_unhashed(sk)) | 395 | if (sk_unhashed(sk)) |
393 | return; | 396 | return; |
394 | 397 | ||
395 | if (sk->sk_state == TCP_LISTEN) { | 398 | if (sk->sk_state == TCP_LISTEN) |
396 | struct inet_listen_hashbucket *ilb; | 399 | lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock; |
400 | else | ||
401 | lock = inet_ehash_lockp(hashinfo, sk->sk_hash); | ||
397 | 402 | ||
398 | ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; | 403 | spin_lock_bh(lock); |
399 | spin_lock_bh(&ilb->lock); | 404 | done =__sk_nulls_del_node_init_rcu(sk); |
400 | if (__sk_del_node_init(sk)) | 405 | spin_unlock_bh(lock); |
401 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | 406 | if (done) |
402 | spin_unlock_bh(&ilb->lock); | 407 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
403 | } else { | ||
404 | spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); | ||
405 | |||
406 | spin_lock_bh(lock); | ||
407 | if (__sk_nulls_del_node_init_rcu(sk)) | ||
408 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | ||
409 | spin_unlock_bh(lock); | ||
410 | } | ||
411 | } | 408 | } |
412 | EXPORT_SYMBOL_GPL(inet_unhash); | 409 | EXPORT_SYMBOL_GPL(inet_unhash); |
413 | 410 | ||
/*
 * Initialize the listening hash buckets: a spinlock per slot for
 * writers, and a nulls list head whose end-of-chain marker encodes
 * the slot index offset by LISTENING_NULLS_BASE.  The offset keeps
 * listening-chain markers distinct from established-chain markers, so
 * an RCU reader can detect that it finished its walk in the right
 * table even if a socket moved between tables during the walk.
 */
void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
				      i + LISTENING_NULLS_BASE);
	}
}

EXPORT_SYMBOL_GPL(inet_hashinfo_init);