diff options
author | Eric Dumazet <dada1@cosmosbay.com> | 2008-11-16 22:40:17 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-11-16 22:40:17 -0500 |
commit | 3ab5aee7fe840b5b1b35a8d1ac11c3de5281e611 (patch) | |
tree | 468296b7be813643248d4ca67497d6ddb6934fc6 /net/ipv6 | |
parent | 88ab1932eac721c6e7336708558fa5ed02c85c80 (diff) |
net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls
RCU was added to UDP lookups, using a fast infrastructure :
- sockets kmem_cache use SLAB_DESTROY_BY_RCU and dont pay the
price of call_rcu() at freeing time.
- hlist_nulls permits to use few memory barriers.
This patch uses same infrastructure for TCP/DCCP established
and timewait sockets.
Thanks to SLAB_DESTROY_BY_RCU, no slowdown for applications
using short lived TCP connections. A followup patch, converting
rwlocks to spinlocks will even speedup this case.
__inet_lookup_established() is pretty fast now we dont have to
dirty a contended cache line (read_lock/read_unlock)
Only established and timewait hashtable are converted to RCU
(bind table and listen table are still using traditional locking)
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv6')
-rw-r--r-- | net/ipv6/inet6_hashtables.c | 70 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 1 |
2 files changed, 48 insertions, 23 deletions
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 1646a5658255..c1b4d401fd95 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c | |||
@@ -25,24 +25,28 @@ | |||
25 | void __inet6_hash(struct sock *sk) | 25 | void __inet6_hash(struct sock *sk) |
26 | { | 26 | { |
27 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; | 27 | struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; |
28 | struct hlist_head *list; | ||
29 | rwlock_t *lock; | 28 | rwlock_t *lock; |
30 | 29 | ||
31 | WARN_ON(!sk_unhashed(sk)); | 30 | WARN_ON(!sk_unhashed(sk)); |
32 | 31 | ||
33 | if (sk->sk_state == TCP_LISTEN) { | 32 | if (sk->sk_state == TCP_LISTEN) { |
33 | struct hlist_head *list; | ||
34 | |||
34 | list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; | 35 | list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; |
35 | lock = &hashinfo->lhash_lock; | 36 | lock = &hashinfo->lhash_lock; |
36 | inet_listen_wlock(hashinfo); | 37 | inet_listen_wlock(hashinfo); |
38 | __sk_add_node(sk, list); | ||
37 | } else { | 39 | } else { |
38 | unsigned int hash; | 40 | unsigned int hash; |
41 | struct hlist_nulls_head *list; | ||
42 | |||
39 | sk->sk_hash = hash = inet6_sk_ehashfn(sk); | 43 | sk->sk_hash = hash = inet6_sk_ehashfn(sk); |
40 | list = &inet_ehash_bucket(hashinfo, hash)->chain; | 44 | list = &inet_ehash_bucket(hashinfo, hash)->chain; |
41 | lock = inet_ehash_lockp(hashinfo, hash); | 45 | lock = inet_ehash_lockp(hashinfo, hash); |
42 | write_lock(lock); | 46 | write_lock(lock); |
47 | __sk_nulls_add_node_rcu(sk, list); | ||
43 | } | 48 | } |
44 | 49 | ||
45 | __sk_add_node(sk, list); | ||
46 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 50 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
47 | write_unlock(lock); | 51 | write_unlock(lock); |
48 | } | 52 | } |
@@ -63,33 +67,53 @@ struct sock *__inet6_lookup_established(struct net *net, | |||
63 | const int dif) | 67 | const int dif) |
64 | { | 68 | { |
65 | struct sock *sk; | 69 | struct sock *sk; |
66 | const struct hlist_node *node; | 70 | const struct hlist_nulls_node *node; |
67 | const __portpair ports = INET_COMBINED_PORTS(sport, hnum); | 71 | const __portpair ports = INET_COMBINED_PORTS(sport, hnum); |
68 | /* Optimize here for direct hit, only listening connections can | 72 | /* Optimize here for direct hit, only listening connections can |
69 | * have wildcards anyways. | 73 | * have wildcards anyways. |
70 | */ | 74 | */ |
71 | unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); | 75 | unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); |
72 | struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); | 76 | unsigned int slot = hash & (hashinfo->ehash_size - 1); |
73 | rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); | 77 | struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; |
74 | 78 | ||
75 | prefetch(head->chain.first); | 79 | |
76 | read_lock(lock); | 80 | rcu_read_lock(); |
77 | sk_for_each(sk, node, &head->chain) { | 81 | begin: |
82 | sk_nulls_for_each_rcu(sk, node, &head->chain) { | ||
78 | /* For IPV6 do the cheaper port and family tests first. */ | 83 | /* For IPV6 do the cheaper port and family tests first. */ |
79 | if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) | 84 | if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { |
80 | goto hit; /* You sunk my battleship! */ | 85 | if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) |
86 | goto begintw; | ||
87 | if (!INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { | ||
88 | sock_put(sk); | ||
89 | goto begin; | ||
90 | } | ||
91 | goto out; | ||
92 | } | ||
81 | } | 93 | } |
94 | if (get_nulls_value(node) != slot) | ||
95 | goto begin; | ||
96 | |||
97 | begintw: | ||
82 | /* Must check for a TIME_WAIT'er before going to listener hash. */ | 98 | /* Must check for a TIME_WAIT'er before going to listener hash. */ |
83 | sk_for_each(sk, node, &head->twchain) { | 99 | sk_nulls_for_each_rcu(sk, node, &head->twchain) { |
84 | if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) | 100 | if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { |
85 | goto hit; | 101 | if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { |
102 | sk = NULL; | ||
103 | goto out; | ||
104 | } | ||
105 | if (!INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { | ||
106 | sock_put(sk); | ||
107 | goto begintw; | ||
108 | } | ||
109 | goto out; | ||
110 | } | ||
86 | } | 111 | } |
87 | read_unlock(lock); | 112 | if (get_nulls_value(node) != slot) |
88 | return NULL; | 113 | goto begintw; |
89 | 114 | sk = NULL; | |
90 | hit: | 115 | out: |
91 | sock_hold(sk); | 116 | rcu_read_unlock(); |
92 | read_unlock(lock); | ||
93 | return sk; | 117 | return sk; |
94 | } | 118 | } |
95 | EXPORT_SYMBOL(__inet6_lookup_established); | 119 | EXPORT_SYMBOL(__inet6_lookup_established); |
@@ -172,14 +196,14 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, | |||
172 | struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); | 196 | struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); |
173 | rwlock_t *lock = inet_ehash_lockp(hinfo, hash); | 197 | rwlock_t *lock = inet_ehash_lockp(hinfo, hash); |
174 | struct sock *sk2; | 198 | struct sock *sk2; |
175 | const struct hlist_node *node; | 199 | const struct hlist_nulls_node *node; |
176 | struct inet_timewait_sock *tw; | 200 | struct inet_timewait_sock *tw; |
177 | 201 | ||
178 | prefetch(head->chain.first); | 202 | prefetch(head->chain.first); |
179 | write_lock(lock); | 203 | write_lock(lock); |
180 | 204 | ||
181 | /* Check TIME-WAIT sockets first. */ | 205 | /* Check TIME-WAIT sockets first. */ |
182 | sk_for_each(sk2, node, &head->twchain) { | 206 | sk_nulls_for_each(sk2, node, &head->twchain) { |
183 | tw = inet_twsk(sk2); | 207 | tw = inet_twsk(sk2); |
184 | 208 | ||
185 | if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) { | 209 | if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) { |
@@ -192,7 +216,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, | |||
192 | tw = NULL; | 216 | tw = NULL; |
193 | 217 | ||
194 | /* And established part... */ | 218 | /* And established part... */ |
195 | sk_for_each(sk2, node, &head->chain) { | 219 | sk_nulls_for_each(sk2, node, &head->chain) { |
196 | if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) | 220 | if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) |
197 | goto not_unique; | 221 | goto not_unique; |
198 | } | 222 | } |
@@ -203,7 +227,7 @@ unique: | |||
203 | inet->num = lport; | 227 | inet->num = lport; |
204 | inet->sport = htons(lport); | 228 | inet->sport = htons(lport); |
205 | WARN_ON(!sk_unhashed(sk)); | 229 | WARN_ON(!sk_unhashed(sk)); |
206 | __sk_add_node(sk, &head->chain); | 230 | __sk_nulls_add_node_rcu(sk, &head->chain); |
207 | sk->sk_hash = hash; | 231 | sk->sk_hash = hash; |
208 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); | 232 | sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); |
209 | write_unlock(lock); | 233 | write_unlock(lock); |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 984276463a8d..b35787056313 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -2043,6 +2043,7 @@ struct proto tcpv6_prot = { | |||
2043 | .sysctl_rmem = sysctl_tcp_rmem, | 2043 | .sysctl_rmem = sysctl_tcp_rmem, |
2044 | .max_header = MAX_TCP_HEADER, | 2044 | .max_header = MAX_TCP_HEADER, |
2045 | .obj_size = sizeof(struct tcp6_sock), | 2045 | .obj_size = sizeof(struct tcp6_sock), |
2046 | .slab_flags = SLAB_DESTROY_BY_RCU, | ||
2046 | .twsk_prot = &tcp6_timewait_sock_ops, | 2047 | .twsk_prot = &tcp6_timewait_sock_ops, |
2047 | .rsk_prot = &tcp6_request_sock_ops, | 2048 | .rsk_prot = &tcp6_request_sock_ops, |
2048 | .h.hashinfo = &tcp_hashinfo, | 2049 | .h.hashinfo = &tcp_hashinfo, |