diff options
author | Eric Dumazet <dada1@cosmosbay.com> | 2008-11-23 20:22:55 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-11-23 20:22:55 -0500 |
commit | c25eb3bfb97294d0543a81230fbc237046b4b84c (patch) | |
tree | 6c9deabfb12f4d31f280cfcfe7e7580a2089931c | |
parent | 8c862c23e2563e6aedfc6c4aa6827cadb83f2414 (diff) |
net: Convert TCP/DCCP listening hash tables to use RCU
This is the last step needed to perform full RCU lookups
in __inet_lookup(): after the established/timewait tables, we
add RCU lookups to the listening hash table.
The only trick here is that a socket of a given type (TCP ipv4,
TCP ipv6, ...) can now move between two different tables
(established and listening) during an RCU grace period, so we
must use different 'nulls' end-of-chain values for the two tables.
We define a large base value:
#define LISTENING_NULLS_BASE (1U << 29)
so that slots in the listening table are guaranteed to have different
end-of-chain values than slots in the established table. A reader can
therefore still detect whether it finished its lookup in the right chain.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/inet_hashtables.h | 9 | ||||
-rw-r--r-- | net/ipv4/inet_diag.c | 4 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 148 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 8 | ||||
-rw-r--r-- | net/ipv6/inet6_hashtables.c | 94 |
5 files changed, 147 insertions, 116 deletions
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ec7ee2e46d8c..f44bb5c77a70 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h | |||
@@ -99,9 +99,16 @@ struct inet_bind_hashbucket { | |||
99 | struct hlist_head chain; | 99 | struct hlist_head chain; |
100 | }; | 100 | }; |
101 | 101 | ||
102 | /* | ||
103 | * Sockets can be hashed in established or listening table | ||
104 | * We must use different 'nulls' end-of-chain value for listening | ||
105 | * hash table, or we might find a socket that was closed and | ||
106 | * reallocated/inserted into established hash table | ||
107 | */ | ||
108 | #define LISTENING_NULLS_BASE (1U << 29) | ||
102 | struct inet_listen_hashbucket { | 109 | struct inet_listen_hashbucket { |
103 | spinlock_t lock; | 110 | spinlock_t lock; |
104 | struct hlist_head head; | 111 | struct hlist_nulls_head head; |
105 | }; | 112 | }; |
106 | 113 | ||
107 | /* This is for listening sockets, thus all sockets which possess wildcards. */ | 114 | /* This is for listening sockets, thus all sockets which possess wildcards. */ |
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 998a78f169ff..588a7796e3e3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c | |||
@@ -720,13 +720,13 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) | |||
720 | 720 | ||
721 | for (i = s_i; i < INET_LHTABLE_SIZE; i++) { | 721 | for (i = s_i; i < INET_LHTABLE_SIZE; i++) { |
722 | struct sock *sk; | 722 | struct sock *sk; |
723 | struct hlist_node *node; | 723 | struct hlist_nulls_node *node; |
724 | struct inet_listen_hashbucket *ilb; | 724 | struct inet_listen_hashbucket *ilb; |
725 | 725 | ||
726 | num = 0; | 726 | num = 0; |
727 | ilb = &hashinfo->listening_hash[i]; | 727 | ilb = &hashinfo->listening_hash[i]; |
728 | spin_lock_bh(&ilb->lock); | 728 | spin_lock_bh(&ilb->lock); |
729 | sk_for_each(sk, node, &ilb->head) { | 729 | sk_nulls_for_each(sk, node, &ilb->head) { |
730 | struct inet_sock *inet = inet_sk(sk); | 730 | struct inet_sock *inet = inet_sk(sk); |
731 | 731 | ||
732 | if (num < s_num) { | 732 | if (num < s_num) { |
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4c273a9981a6..11fcb87a1fdd 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c | |||
@@ -110,78 +110,79 @@ void __inet_inherit_port(struct sock *sk, struct sock *child) | |||
110 | 110 | ||
111 | EXPORT_SYMBOL_GPL(__inet_inherit_port); | 111 | EXPORT_SYMBOL_GPL(__inet_inherit_port); |
112 | 112 | ||
113 | static inline int compute_score(struct sock *sk, struct net *net, | ||
114 | const unsigned short hnum, const __be32 daddr, | ||
115 | const int dif) | ||
116 | { | ||
117 | int score = -1; | ||
118 | struct inet_sock *inet = inet_sk(sk); | ||
119 | |||
120 | if (net_eq(sock_net(sk), net) && inet->num == hnum && | ||
121 | !ipv6_only_sock(sk)) { | ||
122 | __be32 rcv_saddr = inet->rcv_saddr; | ||
123 | score = sk->sk_family == PF_INET ? 1 : 0; | ||
124 | if (rcv_saddr) { | ||
125 | if (rcv_saddr != daddr) | ||
126 | return -1; | ||
127 | score += 2; | ||
128 | } | ||
129 | if (sk->sk_bound_dev_if) { | ||
130 | if (sk->sk_bound_dev_if != dif) | ||
131 | return -1; | ||
132 | score += 2; | ||
133 | } | ||
134 | } | ||
135 | return score; | ||
136 | } | ||
137 | |||
113 | /* | 138 | /* |
114 | * Don't inline this cruft. Here are some nice properties to exploit here. The | 139 | * Don't inline this cruft. Here are some nice properties to exploit here. The |
115 | * BSD API does not allow a listening sock to specify the remote port nor the | 140 | * BSD API does not allow a listening sock to specify the remote port nor the |
116 | * remote address for the connection. So always assume those are both | 141 | * remote address for the connection. So always assume those are both |
117 | * wildcarded during the search since they can never be otherwise. | 142 | * wildcarded during the search since they can never be otherwise. |
118 | */ | 143 | */ |
119 | static struct sock *inet_lookup_listener_slow(struct net *net, | ||
120 | const struct hlist_head *head, | ||
121 | const __be32 daddr, | ||
122 | const unsigned short hnum, | ||
123 | const int dif) | ||
124 | { | ||
125 | struct sock *result = NULL, *sk; | ||
126 | const struct hlist_node *node; | ||
127 | int hiscore = -1; | ||
128 | |||
129 | sk_for_each(sk, node, head) { | ||
130 | const struct inet_sock *inet = inet_sk(sk); | ||
131 | |||
132 | if (net_eq(sock_net(sk), net) && inet->num == hnum && | ||
133 | !ipv6_only_sock(sk)) { | ||
134 | const __be32 rcv_saddr = inet->rcv_saddr; | ||
135 | int score = sk->sk_family == PF_INET ? 1 : 0; | ||
136 | |||
137 | if (rcv_saddr) { | ||
138 | if (rcv_saddr != daddr) | ||
139 | continue; | ||
140 | score += 2; | ||
141 | } | ||
142 | if (sk->sk_bound_dev_if) { | ||
143 | if (sk->sk_bound_dev_if != dif) | ||
144 | continue; | ||
145 | score += 2; | ||
146 | } | ||
147 | if (score == 5) | ||
148 | return sk; | ||
149 | if (score > hiscore) { | ||
150 | hiscore = score; | ||
151 | result = sk; | ||
152 | } | ||
153 | } | ||
154 | } | ||
155 | return result; | ||
156 | } | ||
157 | 144 | ||
158 | /* Optimize the common listener case. */ | 145 | |
159 | struct sock *__inet_lookup_listener(struct net *net, | 146 | struct sock *__inet_lookup_listener(struct net *net, |
160 | struct inet_hashinfo *hashinfo, | 147 | struct inet_hashinfo *hashinfo, |
161 | const __be32 daddr, const unsigned short hnum, | 148 | const __be32 daddr, const unsigned short hnum, |
162 | const int dif) | 149 | const int dif) |
163 | { | 150 | { |
164 | struct sock *sk = NULL; | 151 | struct sock *sk, *result; |
165 | struct inet_listen_hashbucket *ilb; | 152 | struct hlist_nulls_node *node; |
153 | unsigned int hash = inet_lhashfn(net, hnum); | ||
154 | struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; | ||
155 | int score, hiscore; | ||
166 | 156 | ||
167 | ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)]; | 157 | rcu_read_lock(); |
168 | spin_lock(&ilb->lock); | 158 | begin: |
169 | if (!hlist_empty(&ilb->head)) { | 159 | result = NULL; |
170 | const struct inet_sock *inet = inet_sk((sk = __sk_head(&ilb->head))); | 160 | hiscore = -1; |
171 | 161 | sk_nulls_for_each_rcu(sk, node, &ilb->head) { | |
172 | if (inet->num == hnum && !sk->sk_node.next && | 162 | score = compute_score(sk, net, hnum, daddr, dif); |
173 | (!inet->rcv_saddr || inet->rcv_saddr == daddr) && | 163 | if (score > hiscore) { |
174 | (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && | 164 | result = sk; |
175 | !sk->sk_bound_dev_if && net_eq(sock_net(sk), net)) | 165 | hiscore = score; |
176 | goto sherry_cache; | 166 | } |
177 | sk = inet_lookup_listener_slow(net, &ilb->head, daddr, hnum, dif); | ||
178 | } | 167 | } |
179 | if (sk) { | 168 | /* |
180 | sherry_cache: | 169 | * if the nulls value we got at the end of this lookup is |
181 | sock_hold(sk); | 170 | * not the expected one, we must restart lookup. |
171 | * We probably met an item that was moved to another chain. | ||
172 | */ | ||
173 | if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) | ||
174 | goto begin; | ||
175 | if (result) { | ||
176 | if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) | ||
177 | result = NULL; | ||
178 | else if (unlikely(compute_score(result, net, hnum, daddr, | ||
179 | dif) < hiscore)) { | ||
180 | sock_put(result); | ||
181 | goto begin; | ||
182 | } | ||
182 | } | 183 | } |
183 | spin_unlock(&ilb->lock); | 184 | rcu_read_unlock(); |
184 | return sk; | 185 | return result; |
185 | } | 186 | } |
186 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); | 187 | EXPORT_SYMBOL_GPL(__inet_lookup_listener); |
187 | 188 | ||
@@ -370,7 +371,7 @@ static void __inet_hash(struct sock *sk) | |||
370 | ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; | 371 | ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
371 | 372 | ||
372 | spin_lock(&ilb->lock); | 373 | spin_lock(&ilb->lock); |
373 | __sk_add_node(sk, &ilb->head); | 374 | __sk_nulls_add_node_rcu(sk, &ilb->head); |
374 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 375 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
375 | spin_unlock(&ilb->lock); | 376 | spin_unlock(&ilb->lock); |
376 | } | 377 | } |
@@ -388,26 +389,22 @@ EXPORT_SYMBOL_GPL(inet_hash); | |||
388 | void inet_unhash(struct sock *sk) | 389 | void inet_unhash(struct sock *sk) |
389 | { | 390 | { |
390 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; | 391 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
392 | spinlock_t *lock; | ||
393 | int done; | ||
391 | 394 | ||
392 | if (sk_unhashed(sk)) | 395 | if (sk_unhashed(sk)) |
393 | return; | 396 | return; |
394 | 397 | ||
395 | if (sk->sk_state == TCP_LISTEN) { | 398 | if (sk->sk_state == TCP_LISTEN) |
396 | struct inet_listen_hashbucket *ilb; | 399 | lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock; |
400 | else | ||
401 | lock = inet_ehash_lockp(hashinfo, sk->sk_hash); | ||
397 | 402 | ||
398 | ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; | 403 | spin_lock_bh(lock); |
399 | spin_lock_bh(&ilb->lock); | 404 | done =__sk_nulls_del_node_init_rcu(sk); |
400 | if (__sk_del_node_init(sk)) | 405 | spin_unlock_bh(lock); |
401 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | 406 | if (done) |
402 | spin_unlock_bh(&ilb->lock); | 407 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); |
403 | } else { | ||
404 | spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); | ||
405 | |||
406 | spin_lock_bh(lock); | ||
407 | if (__sk_nulls_del_node_init_rcu(sk)) | ||
408 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); | ||
409 | spin_unlock_bh(lock); | ||
410 | } | ||
411 | } | 408 | } |
412 | EXPORT_SYMBOL_GPL(inet_unhash); | 409 | EXPORT_SYMBOL_GPL(inet_unhash); |
413 | 410 | ||
@@ -526,8 +523,11 @@ void inet_hashinfo_init(struct inet_hashinfo *h) | |||
526 | { | 523 | { |
527 | int i; | 524 | int i; |
528 | 525 | ||
529 | for (i = 0; i < INET_LHTABLE_SIZE; i++) | 526 | for (i = 0; i < INET_LHTABLE_SIZE; i++) { |
530 | spin_lock_init(&h->listening_hash[i].lock); | 527 | spin_lock_init(&h->listening_hash[i].lock); |
528 | INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, | ||
529 | i + LISTENING_NULLS_BASE); | ||
530 | } | ||
531 | } | 531 | } |
532 | 532 | ||
533 | EXPORT_SYMBOL_GPL(inet_hashinfo_init); | 533 | EXPORT_SYMBOL_GPL(inet_hashinfo_init); |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a81caa1be0cf..cab2458f86fd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -1868,7 +1868,7 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) | |||
1868 | static void *listening_get_next(struct seq_file *seq, void *cur) | 1868 | static void *listening_get_next(struct seq_file *seq, void *cur) |
1869 | { | 1869 | { |
1870 | struct inet_connection_sock *icsk; | 1870 | struct inet_connection_sock *icsk; |
1871 | struct hlist_node *node; | 1871 | struct hlist_nulls_node *node; |
1872 | struct sock *sk = cur; | 1872 | struct sock *sk = cur; |
1873 | struct inet_listen_hashbucket *ilb; | 1873 | struct inet_listen_hashbucket *ilb; |
1874 | struct tcp_iter_state *st = seq->private; | 1874 | struct tcp_iter_state *st = seq->private; |
@@ -1878,7 +1878,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) | |||
1878 | st->bucket = 0; | 1878 | st->bucket = 0; |
1879 | ilb = &tcp_hashinfo.listening_hash[0]; | 1879 | ilb = &tcp_hashinfo.listening_hash[0]; |
1880 | spin_lock_bh(&ilb->lock); | 1880 | spin_lock_bh(&ilb->lock); |
1881 | sk = sk_head(&ilb->head); | 1881 | sk = sk_nulls_head(&ilb->head); |
1882 | goto get_sk; | 1882 | goto get_sk; |
1883 | } | 1883 | } |
1884 | ilb = &tcp_hashinfo.listening_hash[st->bucket]; | 1884 | ilb = &tcp_hashinfo.listening_hash[st->bucket]; |
@@ -1914,7 +1914,7 @@ get_req: | |||
1914 | sk = sk_next(sk); | 1914 | sk = sk_next(sk); |
1915 | } | 1915 | } |
1916 | get_sk: | 1916 | get_sk: |
1917 | sk_for_each_from(sk, node) { | 1917 | sk_nulls_for_each_from(sk, node) { |
1918 | if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { | 1918 | if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { |
1919 | cur = sk; | 1919 | cur = sk; |
1920 | goto out; | 1920 | goto out; |
@@ -1935,7 +1935,7 @@ start_req: | |||
1935 | if (++st->bucket < INET_LHTABLE_SIZE) { | 1935 | if (++st->bucket < INET_LHTABLE_SIZE) { |
1936 | ilb = &tcp_hashinfo.listening_hash[st->bucket]; | 1936 | ilb = &tcp_hashinfo.listening_hash[st->bucket]; |
1937 | spin_lock_bh(&ilb->lock); | 1937 | spin_lock_bh(&ilb->lock); |
1938 | sk = sk_head(&ilb->head); | 1938 | sk = sk_nulls_head(&ilb->head); |
1939 | goto get_sk; | 1939 | goto get_sk; |
1940 | } | 1940 | } |
1941 | cur = NULL; | 1941 | cur = NULL; |
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index e0fd68187f83..8fe267feb81e 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c | |||
@@ -33,7 +33,7 @@ void __inet6_hash(struct sock *sk) | |||
33 | 33 | ||
34 | ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; | 34 | ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
35 | spin_lock(&ilb->lock); | 35 | spin_lock(&ilb->lock); |
36 | __sk_add_node(sk, &ilb->head); | 36 | __sk_nulls_add_node_rcu(sk, &ilb->head); |
37 | spin_unlock(&ilb->lock); | 37 | spin_unlock(&ilb->lock); |
38 | } else { | 38 | } else { |
39 | unsigned int hash; | 39 | unsigned int hash; |
@@ -118,47 +118,71 @@ out: | |||
118 | } | 118 | } |
119 | EXPORT_SYMBOL(__inet6_lookup_established); | 119 | EXPORT_SYMBOL(__inet6_lookup_established); |
120 | 120 | ||
121 | static int inline compute_score(struct sock *sk, struct net *net, | ||
122 | const unsigned short hnum, | ||
123 | const struct in6_addr *daddr, | ||
124 | const int dif) | ||
125 | { | ||
126 | int score = -1; | ||
127 | |||
128 | if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum && | ||
129 | sk->sk_family == PF_INET6) { | ||
130 | const struct ipv6_pinfo *np = inet6_sk(sk); | ||
131 | |||
132 | score = 1; | ||
133 | if (!ipv6_addr_any(&np->rcv_saddr)) { | ||
134 | if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) | ||
135 | return -1; | ||
136 | score++; | ||
137 | } | ||
138 | if (sk->sk_bound_dev_if) { | ||
139 | if (sk->sk_bound_dev_if != dif) | ||
140 | return -1; | ||
141 | score++; | ||
142 | } | ||
143 | } | ||
144 | return score; | ||
145 | } | ||
146 | |||
121 | struct sock *inet6_lookup_listener(struct net *net, | 147 | struct sock *inet6_lookup_listener(struct net *net, |
122 | struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, | 148 | struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, |
123 | const unsigned short hnum, const int dif) | 149 | const unsigned short hnum, const int dif) |
124 | { | 150 | { |
125 | struct sock *sk; | 151 | struct sock *sk; |
126 | const struct hlist_node *node; | 152 | const struct hlist_nulls_node *node; |
127 | struct sock *result = NULL; | 153 | struct sock *result; |
128 | int score, hiscore = 0; | 154 | int score, hiscore; |
129 | struct inet_listen_hashbucket *ilb; | 155 | unsigned int hash = inet_lhashfn(net, hnum); |
130 | 156 | struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; | |
131 | ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)]; | 157 | |
132 | spin_lock(&ilb->lock); | 158 | rcu_read_lock(); |
133 | sk_for_each(sk, node, &ilb->head) { | 159 | begin: |
134 | if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum && | 160 | result = NULL; |
135 | sk->sk_family == PF_INET6) { | 161 | hiscore = -1; |
136 | const struct ipv6_pinfo *np = inet6_sk(sk); | 162 | sk_nulls_for_each(sk, node, &ilb->head) { |
137 | 163 | score = compute_score(sk, net, hnum, daddr, dif); | |
138 | score = 1; | 164 | if (score > hiscore) { |
139 | if (!ipv6_addr_any(&np->rcv_saddr)) { | 165 | hiscore = score; |
140 | if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) | 166 | result = sk; |
141 | continue; | ||
142 | score++; | ||
143 | } | ||
144 | if (sk->sk_bound_dev_if) { | ||
145 | if (sk->sk_bound_dev_if != dif) | ||
146 | continue; | ||
147 | score++; | ||
148 | } | ||
149 | if (score == 3) { | ||
150 | result = sk; | ||
151 | break; | ||
152 | } | ||
153 | if (score > hiscore) { | ||
154 | hiscore = score; | ||
155 | result = sk; | ||
156 | } | ||
157 | } | 167 | } |
158 | } | 168 | } |
159 | if (result) | 169 | /* |
160 | sock_hold(result); | 170 | * if the nulls value we got at the end of this lookup is |
161 | spin_unlock(&ilb->lock); | 171 | * not the expected one, we must restart lookup. |
172 | * We probably met an item that was moved to another chain. | ||
173 | */ | ||
174 | if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) | ||
175 | goto begin; | ||
176 | if (result) { | ||
177 | if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) | ||
178 | result = NULL; | ||
179 | else if (unlikely(compute_score(result, net, hnum, daddr, | ||
180 | dif) < hiscore)) { | ||
181 | sock_put(result); | ||
182 | goto begin; | ||
183 | } | ||
184 | } | ||
185 | rcu_read_unlock(); | ||
162 | return result; | 186 | return result; |
163 | } | 187 | } |
164 | 188 | ||