aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4/inet_hashtables.c
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2008-11-23 20:22:55 -0500
committerDavid S. Miller <davem@davemloft.net>2008-11-23 20:22:55 -0500
commitc25eb3bfb97294d0543a81230fbc237046b4b84c (patch)
tree6c9deabfb12f4d31f280cfcfe7e7580a2089931c /net/ipv4/inet_hashtables.c
parent8c862c23e2563e6aedfc6c4aa6827cadb83f2414 (diff)
net: Convert TCP/DCCP listening hash tables to use RCU
This is the last step needed to perform full RCU lookups in __inet_lookup(): after the established/timewait tables, we add RCU lookups to the listening hash table. The only trick here is that a socket of a given type (TCP ipv4, TCP ipv6, ...) can now move between two different tables (established and listening) during an RCU grace period, so we must use different 'nulls' end-of-chain values for the two tables. We define a large value: #define LISTENING_NULLS_BASE (1U << 29) so that slots in the listening table are guaranteed to have different end-of-chain values than slots in the established table. A reader can therefore still detect whether it finished its lookup in the right chain. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/inet_hashtables.c')
-rw-r--r--net/ipv4/inet_hashtables.c148
1 files changed, 74 insertions, 74 deletions
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 4c273a9981a6..11fcb87a1fdd 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -110,78 +110,79 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
110 110
111EXPORT_SYMBOL_GPL(__inet_inherit_port); 111EXPORT_SYMBOL_GPL(__inet_inherit_port);
112 112
113static inline int compute_score(struct sock *sk, struct net *net,
114 const unsigned short hnum, const __be32 daddr,
115 const int dif)
116{
117 int score = -1;
118 struct inet_sock *inet = inet_sk(sk);
119
120 if (net_eq(sock_net(sk), net) && inet->num == hnum &&
121 !ipv6_only_sock(sk)) {
122 __be32 rcv_saddr = inet->rcv_saddr;
123 score = sk->sk_family == PF_INET ? 1 : 0;
124 if (rcv_saddr) {
125 if (rcv_saddr != daddr)
126 return -1;
127 score += 2;
128 }
129 if (sk->sk_bound_dev_if) {
130 if (sk->sk_bound_dev_if != dif)
131 return -1;
132 score += 2;
133 }
134 }
135 return score;
136}
137
113/* 138/*
114 * Don't inline this cruft. Here are some nice properties to exploit here. The 139 * Don't inline this cruft. Here are some nice properties to exploit here. The
115 * BSD API does not allow a listening sock to specify the remote port nor the 140 * BSD API does not allow a listening sock to specify the remote port nor the
116 * remote address for the connection. So always assume those are both 141 * remote address for the connection. So always assume those are both
117 * wildcarded during the search since they can never be otherwise. 142 * wildcarded during the search since they can never be otherwise.
118 */ 143 */
119static struct sock *inet_lookup_listener_slow(struct net *net,
120 const struct hlist_head *head,
121 const __be32 daddr,
122 const unsigned short hnum,
123 const int dif)
124{
125 struct sock *result = NULL, *sk;
126 const struct hlist_node *node;
127 int hiscore = -1;
128
129 sk_for_each(sk, node, head) {
130 const struct inet_sock *inet = inet_sk(sk);
131
132 if (net_eq(sock_net(sk), net) && inet->num == hnum &&
133 !ipv6_only_sock(sk)) {
134 const __be32 rcv_saddr = inet->rcv_saddr;
135 int score = sk->sk_family == PF_INET ? 1 : 0;
136
137 if (rcv_saddr) {
138 if (rcv_saddr != daddr)
139 continue;
140 score += 2;
141 }
142 if (sk->sk_bound_dev_if) {
143 if (sk->sk_bound_dev_if != dif)
144 continue;
145 score += 2;
146 }
147 if (score == 5)
148 return sk;
149 if (score > hiscore) {
150 hiscore = score;
151 result = sk;
152 }
153 }
154 }
155 return result;
156}
157 144
158/* Optimize the common listener case. */ 145
159struct sock *__inet_lookup_listener(struct net *net, 146struct sock *__inet_lookup_listener(struct net *net,
160 struct inet_hashinfo *hashinfo, 147 struct inet_hashinfo *hashinfo,
161 const __be32 daddr, const unsigned short hnum, 148 const __be32 daddr, const unsigned short hnum,
162 const int dif) 149 const int dif)
163{ 150{
164 struct sock *sk = NULL; 151 struct sock *sk, *result;
165 struct inet_listen_hashbucket *ilb; 152 struct hlist_nulls_node *node;
153 unsigned int hash = inet_lhashfn(net, hnum);
154 struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
155 int score, hiscore;
166 156
167 ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)]; 157 rcu_read_lock();
168 spin_lock(&ilb->lock); 158begin:
169 if (!hlist_empty(&ilb->head)) { 159 result = NULL;
170 const struct inet_sock *inet = inet_sk((sk = __sk_head(&ilb->head))); 160 hiscore = -1;
171 161 sk_nulls_for_each_rcu(sk, node, &ilb->head) {
172 if (inet->num == hnum && !sk->sk_node.next && 162 score = compute_score(sk, net, hnum, daddr, dif);
173 (!inet->rcv_saddr || inet->rcv_saddr == daddr) && 163 if (score > hiscore) {
174 (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && 164 result = sk;
175 !sk->sk_bound_dev_if && net_eq(sock_net(sk), net)) 165 hiscore = score;
176 goto sherry_cache; 166 }
177 sk = inet_lookup_listener_slow(net, &ilb->head, daddr, hnum, dif);
178 } 167 }
179 if (sk) { 168 /*
180sherry_cache: 169 * if the nulls value we got at the end of this lookup is
181 sock_hold(sk); 170 * not the expected one, we must restart lookup.
171 * We probably met an item that was moved to another chain.
172 */
173 if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
174 goto begin;
175 if (result) {
176 if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
177 result = NULL;
178 else if (unlikely(compute_score(result, net, hnum, daddr,
179 dif) < hiscore)) {
180 sock_put(result);
181 goto begin;
182 }
182 } 183 }
183 spin_unlock(&ilb->lock); 184 rcu_read_unlock();
184 return sk; 185 return result;
185} 186}
186EXPORT_SYMBOL_GPL(__inet_lookup_listener); 187EXPORT_SYMBOL_GPL(__inet_lookup_listener);
187 188
@@ -370,7 +371,7 @@ static void __inet_hash(struct sock *sk)
370 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 371 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
371 372
372 spin_lock(&ilb->lock); 373 spin_lock(&ilb->lock);
373 __sk_add_node(sk, &ilb->head); 374 __sk_nulls_add_node_rcu(sk, &ilb->head);
374 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 375 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
375 spin_unlock(&ilb->lock); 376 spin_unlock(&ilb->lock);
376} 377}
@@ -388,26 +389,22 @@ EXPORT_SYMBOL_GPL(inet_hash);
388void inet_unhash(struct sock *sk) 389void inet_unhash(struct sock *sk)
389{ 390{
390 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 391 struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
392 spinlock_t *lock;
393 int done;
391 394
392 if (sk_unhashed(sk)) 395 if (sk_unhashed(sk))
393 return; 396 return;
394 397
395 if (sk->sk_state == TCP_LISTEN) { 398 if (sk->sk_state == TCP_LISTEN)
396 struct inet_listen_hashbucket *ilb; 399 lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
400 else
401 lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
397 402
398 ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; 403 spin_lock_bh(lock);
399 spin_lock_bh(&ilb->lock); 404 done =__sk_nulls_del_node_init_rcu(sk);
400 if (__sk_del_node_init(sk)) 405 spin_unlock_bh(lock);
401 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 406 if (done)
402 spin_unlock_bh(&ilb->lock); 407 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
403 } else {
404 spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
405
406 spin_lock_bh(lock);
407 if (__sk_nulls_del_node_init_rcu(sk))
408 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
409 spin_unlock_bh(lock);
410 }
411} 408}
412EXPORT_SYMBOL_GPL(inet_unhash); 409EXPORT_SYMBOL_GPL(inet_unhash);
413 410
@@ -526,8 +523,11 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
526{ 523{
527 int i; 524 int i;
528 525
529 for (i = 0; i < INET_LHTABLE_SIZE; i++) 526 for (i = 0; i < INET_LHTABLE_SIZE; i++) {
530 spin_lock_init(&h->listening_hash[i].lock); 527 spin_lock_init(&h->listening_hash[i].lock);
528 INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
529 i + LISTENING_NULLS_BASE);
530 }
531} 531}
532 532
533EXPORT_SYMBOL_GPL(inet_hashinfo_init); 533EXPORT_SYMBOL_GPL(inet_hashinfo_init);