aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2007-11-07 05:40:20 -0500
committerDavid S. Miller <davem@sunset.davemloft.net>2007-11-07 07:15:11 -0500
commit230140cffa7feae90ad50bf259db1fa07674f3a7 (patch)
tree815472add31606423a508a17806b7884f0ab3e2e /net
parentefac52762b1e3fe3035d29e82d8ee1aebc45e4a7 (diff)
[INET]: Remove per bucket rwlock in tcp/dccp ehash table.
As done two years ago on IP route cache table (commit 22c047ccbc68fa8f3fa57f0e8f906479a062c426) , we can avoid using one lock per hash bucket for the huge TCP/DCCP hash tables. On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for litle performance differences. (we hit a different cache line for the rwlock, but then the bucket cache line have a better sharing factor among cpus, since we dirty it less often). For netstat or ss commands that want a full scan of hash table, we perform fewer memory accesses. Using a 'small' table of hashed rwlocks should be more than enough to provide correct SMP concurrency between different buckets, without using too much memory. Sizing of this table depends on num_possible_cpus() and various CONFIG settings. This patch provides some locking abstraction that may ease a future work using a different model for TCP/DCCP table. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/dccp/proto.c9
-rw-r--r--net/ipv4/inet_diag.c9
-rw-r--r--net/ipv4/inet_hashtables.c7
-rw-r--r--net/ipv4/inet_timewait_sock.c13
-rw-r--r--net/ipv4/tcp.c4
-rw-r--r--net/ipv4/tcp_ipv4.c11
-rw-r--r--net/ipv6/inet6_hashtables.c19
7 files changed, 41 insertions, 31 deletions
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index d84973928033..7a3bea9c28c1 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1072,11 +1072,13 @@ static int __init dccp_init(void)
1072 } 1072 }
1073 1073
1074 for (i = 0; i < dccp_hashinfo.ehash_size; i++) { 1074 for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
1075 rwlock_init(&dccp_hashinfo.ehash[i].lock);
1076 INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain); 1075 INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
1077 INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain); 1076 INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
1078 } 1077 }
1079 1078
1079 if (inet_ehash_locks_alloc(&dccp_hashinfo))
1080 goto out_free_dccp_ehash;
1081
1080 bhash_order = ehash_order; 1082 bhash_order = ehash_order;
1081 1083
1082 do { 1084 do {
@@ -1091,7 +1093,7 @@ static int __init dccp_init(void)
1091 1093
1092 if (!dccp_hashinfo.bhash) { 1094 if (!dccp_hashinfo.bhash) {
1093 DCCP_CRIT("Failed to allocate DCCP bind hash table"); 1095 DCCP_CRIT("Failed to allocate DCCP bind hash table");
1094 goto out_free_dccp_ehash; 1096 goto out_free_dccp_locks;
1095 } 1097 }
1096 1098
1097 for (i = 0; i < dccp_hashinfo.bhash_size; i++) { 1099 for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
@@ -1121,6 +1123,8 @@ out_free_dccp_mib:
1121out_free_dccp_bhash: 1123out_free_dccp_bhash:
1122 free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); 1124 free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
1123 dccp_hashinfo.bhash = NULL; 1125 dccp_hashinfo.bhash = NULL;
1126out_free_dccp_locks:
1127 inet_ehash_locks_free(&dccp_hashinfo);
1124out_free_dccp_ehash: 1128out_free_dccp_ehash:
1125 free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); 1129 free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
1126 dccp_hashinfo.ehash = NULL; 1130 dccp_hashinfo.ehash = NULL;
@@ -1139,6 +1143,7 @@ static void __exit dccp_fini(void)
1139 free_pages((unsigned long)dccp_hashinfo.ehash, 1143 free_pages((unsigned long)dccp_hashinfo.ehash,
1140 get_order(dccp_hashinfo.ehash_size * 1144 get_order(dccp_hashinfo.ehash_size *
1141 sizeof(struct inet_ehash_bucket))); 1145 sizeof(struct inet_ehash_bucket)));
1146 inet_ehash_locks_free(&dccp_hashinfo);
1142 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); 1147 kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
1143 dccp_ackvec_exit(); 1148 dccp_ackvec_exit();
1144 dccp_sysctl_exit(); 1149 dccp_sysctl_exit();
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index dc429b6b0ba6..b0170732b5e9 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -747,13 +747,14 @@ skip_listen_ht:
747 747
748 for (i = s_i; i < hashinfo->ehash_size; i++) { 748 for (i = s_i; i < hashinfo->ehash_size; i++) {
749 struct inet_ehash_bucket *head = &hashinfo->ehash[i]; 749 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
750 rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
750 struct sock *sk; 751 struct sock *sk;
751 struct hlist_node *node; 752 struct hlist_node *node;
752 753
753 if (i > s_i) 754 if (i > s_i)
754 s_num = 0; 755 s_num = 0;
755 756
756 read_lock_bh(&head->lock); 757 read_lock_bh(lock);
757 num = 0; 758 num = 0;
758 sk_for_each(sk, node, &head->chain) { 759 sk_for_each(sk, node, &head->chain) {
759 struct inet_sock *inet = inet_sk(sk); 760 struct inet_sock *inet = inet_sk(sk);
@@ -769,7 +770,7 @@ skip_listen_ht:
769 r->id.idiag_dport) 770 r->id.idiag_dport)
770 goto next_normal; 771 goto next_normal;
771 if (inet_csk_diag_dump(sk, skb, cb) < 0) { 772 if (inet_csk_diag_dump(sk, skb, cb) < 0) {
772 read_unlock_bh(&head->lock); 773 read_unlock_bh(lock);
773 goto done; 774 goto done;
774 } 775 }
775next_normal: 776next_normal:
@@ -791,14 +792,14 @@ next_normal:
791 r->id.idiag_dport) 792 r->id.idiag_dport)
792 goto next_dying; 793 goto next_dying;
793 if (inet_twsk_diag_dump(tw, skb, cb) < 0) { 794 if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
794 read_unlock_bh(&head->lock); 795 read_unlock_bh(lock);
795 goto done; 796 goto done;
796 } 797 }
797next_dying: 798next_dying:
798 ++num; 799 ++num;
799 } 800 }
800 } 801 }
801 read_unlock_bh(&head->lock); 802 read_unlock_bh(lock);
802 } 803 }
803 804
804done: 805done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 16eecc7046a3..67704da04fc4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -204,12 +204,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
204 const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); 204 const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
205 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); 205 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
206 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 206 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
207 rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
207 struct sock *sk2; 208 struct sock *sk2;
208 const struct hlist_node *node; 209 const struct hlist_node *node;
209 struct inet_timewait_sock *tw; 210 struct inet_timewait_sock *tw;
210 211
211 prefetch(head->chain.first); 212 prefetch(head->chain.first);
212 write_lock(&head->lock); 213 write_lock(lock);
213 214
214 /* Check TIME-WAIT sockets first. */ 215 /* Check TIME-WAIT sockets first. */
215 sk_for_each(sk2, node, &head->twchain) { 216 sk_for_each(sk2, node, &head->twchain) {
@@ -239,7 +240,7 @@ unique:
239 BUG_TRAP(sk_unhashed(sk)); 240 BUG_TRAP(sk_unhashed(sk));
240 __sk_add_node(sk, &head->chain); 241 __sk_add_node(sk, &head->chain);
241 sock_prot_inc_use(sk->sk_prot); 242 sock_prot_inc_use(sk->sk_prot);
242 write_unlock(&head->lock); 243 write_unlock(lock);
243 244
244 if (twp) { 245 if (twp) {
245 *twp = tw; 246 *twp = tw;
@@ -255,7 +256,7 @@ unique:
255 return 0; 256 return 0;
256 257
257not_unique: 258not_unique:
258 write_unlock(&head->lock); 259 write_unlock(lock);
259 return -EADDRNOTAVAIL; 260 return -EADDRNOTAVAIL;
260} 261}
261 262
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 4e189e28f306..a60b99e0ebdc 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
20 struct inet_bind_hashbucket *bhead; 20 struct inet_bind_hashbucket *bhead;
21 struct inet_bind_bucket *tb; 21 struct inet_bind_bucket *tb;
22 /* Unlink from established hashes. */ 22 /* Unlink from established hashes. */
23 struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash); 23 rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
24 24
25 write_lock(&ehead->lock); 25 write_lock(lock);
26 if (hlist_unhashed(&tw->tw_node)) { 26 if (hlist_unhashed(&tw->tw_node)) {
27 write_unlock(&ehead->lock); 27 write_unlock(lock);
28 return; 28 return;
29 } 29 }
30 __hlist_del(&tw->tw_node); 30 __hlist_del(&tw->tw_node);
31 sk_node_init(&tw->tw_node); 31 sk_node_init(&tw->tw_node);
32 write_unlock(&ehead->lock); 32 write_unlock(lock);
33 33
34 /* Disassociate with bind bucket. */ 34 /* Disassociate with bind bucket. */
35 bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; 35 bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
@@ -59,6 +59,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
59 const struct inet_sock *inet = inet_sk(sk); 59 const struct inet_sock *inet = inet_sk(sk);
60 const struct inet_connection_sock *icsk = inet_csk(sk); 60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); 61 struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
62 rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
62 struct inet_bind_hashbucket *bhead; 63 struct inet_bind_hashbucket *bhead;
63 /* Step 1: Put TW into bind hash. Original socket stays there too. 64 /* Step 1: Put TW into bind hash. Original socket stays there too.
64 Note, that any socket with inet->num != 0 MUST be bound in 65 Note, that any socket with inet->num != 0 MUST be bound in
@@ -71,7 +72,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
71 inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); 72 inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
72 spin_unlock(&bhead->lock); 73 spin_unlock(&bhead->lock);
73 74
74 write_lock(&ehead->lock); 75 write_lock(lock);
75 76
76 /* Step 2: Remove SK from established hash. */ 77 /* Step 2: Remove SK from established hash. */
77 if (__sk_del_node_init(sk)) 78 if (__sk_del_node_init(sk))
@@ -81,7 +82,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
81 inet_twsk_add_node(tw, &ehead->twchain); 82 inet_twsk_add_node(tw, &ehead->twchain);
82 atomic_inc(&tw->tw_refcnt); 83 atomic_inc(&tw->tw_refcnt);
83 84
84 write_unlock(&ehead->lock); 85 write_unlock(lock);
85} 86}
86 87
87EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); 88EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c64072bb504b..8e65182f7af1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2456,11 +2456,11 @@ void __init tcp_init(void)
2456 thash_entries ? 0 : 512 * 1024); 2456 thash_entries ? 0 : 512 * 1024);
2457 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; 2457 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
2458 for (i = 0; i < tcp_hashinfo.ehash_size; i++) { 2458 for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2459 rwlock_init(&tcp_hashinfo.ehash[i].lock);
2460 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); 2459 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2461 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); 2460 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
2462 } 2461 }
2463 2462 if (inet_ehash_locks_alloc(&tcp_hashinfo))
2463 panic("TCP: failed to alloc ehash_locks");
2464 tcp_hashinfo.bhash = 2464 tcp_hashinfo.bhash =
2465 alloc_large_system_hash("TCP bind", 2465 alloc_large_system_hash("TCP bind",
2466 sizeof(struct inet_bind_hashbucket), 2466 sizeof(struct inet_bind_hashbucket),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e9127cdced20..e566f3c67677 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2049,8 +2049,9 @@ static void *established_get_first(struct seq_file *seq)
2049 struct sock *sk; 2049 struct sock *sk;
2050 struct hlist_node *node; 2050 struct hlist_node *node;
2051 struct inet_timewait_sock *tw; 2051 struct inet_timewait_sock *tw;
2052 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2052 2053
2053 read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock); 2054 read_lock_bh(lock);
2054 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2055 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2055 if (sk->sk_family != st->family) { 2056 if (sk->sk_family != st->family) {
2056 continue; 2057 continue;
@@ -2067,7 +2068,7 @@ static void *established_get_first(struct seq_file *seq)
2067 rc = tw; 2068 rc = tw;
2068 goto out; 2069 goto out;
2069 } 2070 }
2070 read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); 2071 read_unlock_bh(lock);
2071 st->state = TCP_SEQ_STATE_ESTABLISHED; 2072 st->state = TCP_SEQ_STATE_ESTABLISHED;
2072 } 2073 }
2073out: 2074out:
@@ -2094,11 +2095,11 @@ get_tw:
2094 cur = tw; 2095 cur = tw;
2095 goto out; 2096 goto out;
2096 } 2097 }
2097 read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); 2098 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2098 st->state = TCP_SEQ_STATE_ESTABLISHED; 2099 st->state = TCP_SEQ_STATE_ESTABLISHED;
2099 2100
2100 if (++st->bucket < tcp_hashinfo.ehash_size) { 2101 if (++st->bucket < tcp_hashinfo.ehash_size) {
2101 read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock); 2102 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2102 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); 2103 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2103 } else { 2104 } else {
2104 cur = NULL; 2105 cur = NULL;
@@ -2206,7 +2207,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2206 case TCP_SEQ_STATE_TIME_WAIT: 2207 case TCP_SEQ_STATE_TIME_WAIT:
2207 case TCP_SEQ_STATE_ESTABLISHED: 2208 case TCP_SEQ_STATE_ESTABLISHED:
2208 if (v) 2209 if (v)
2209 read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); 2210 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2210 break; 2211 break;
2211 } 2212 }
2212} 2213}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index d6f1026f1943..adc73adadfae 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -37,9 +37,8 @@ void __inet6_hash(struct inet_hashinfo *hashinfo,
37 } else { 37 } else {
38 unsigned int hash; 38 unsigned int hash;
39 sk->sk_hash = hash = inet6_sk_ehashfn(sk); 39 sk->sk_hash = hash = inet6_sk_ehashfn(sk);
40 hash &= (hashinfo->ehash_size - 1); 40 list = &inet_ehash_bucket(hashinfo, hash)->chain;
41 list = &hashinfo->ehash[hash].chain; 41 lock = inet_ehash_lockp(hashinfo, hash);
42 lock = &hashinfo->ehash[hash].lock;
43 write_lock(lock); 42 write_lock(lock);
44 } 43 }
45 44
@@ -70,9 +69,10 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo,
70 */ 69 */
71 unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport); 70 unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport);
72 struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); 71 struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
72 rwlock_t *lock = inet_ehash_lockp(hashinfo, hash);
73 73
74 prefetch(head->chain.first); 74 prefetch(head->chain.first);
75 read_lock(&head->lock); 75 read_lock(lock);
76 sk_for_each(sk, node, &head->chain) { 76 sk_for_each(sk, node, &head->chain) {
77 /* For IPV6 do the cheaper port and family tests first. */ 77 /* For IPV6 do the cheaper port and family tests first. */
78 if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif)) 78 if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif))
@@ -92,12 +92,12 @@ struct sock *__inet6_lookup_established(struct inet_hashinfo *hashinfo,
92 goto hit; 92 goto hit;
93 } 93 }
94 } 94 }
95 read_unlock(&head->lock); 95 read_unlock(lock);
96 return NULL; 96 return NULL;
97 97
98hit: 98hit:
99 sock_hold(sk); 99 sock_hold(sk);
100 read_unlock(&head->lock); 100 read_unlock(lock);
101 return sk; 101 return sk;
102} 102}
103EXPORT_SYMBOL(__inet6_lookup_established); 103EXPORT_SYMBOL(__inet6_lookup_established);
@@ -175,12 +175,13 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
175 const unsigned int hash = inet6_ehashfn(daddr, lport, saddr, 175 const unsigned int hash = inet6_ehashfn(daddr, lport, saddr,
176 inet->dport); 176 inet->dport);
177 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 177 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
178 rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
178 struct sock *sk2; 179 struct sock *sk2;
179 const struct hlist_node *node; 180 const struct hlist_node *node;
180 struct inet_timewait_sock *tw; 181 struct inet_timewait_sock *tw;
181 182
182 prefetch(head->chain.first); 183 prefetch(head->chain.first);
183 write_lock(&head->lock); 184 write_lock(lock);
184 185
185 /* Check TIME-WAIT sockets first. */ 186 /* Check TIME-WAIT sockets first. */
186 sk_for_each(sk2, node, &head->twchain) { 187 sk_for_each(sk2, node, &head->twchain) {
@@ -216,7 +217,7 @@ unique:
216 __sk_add_node(sk, &head->chain); 217 __sk_add_node(sk, &head->chain);
217 sk->sk_hash = hash; 218 sk->sk_hash = hash;
218 sock_prot_inc_use(sk->sk_prot); 219 sock_prot_inc_use(sk->sk_prot);
219 write_unlock(&head->lock); 220 write_unlock(lock);
220 221
221 if (twp != NULL) { 222 if (twp != NULL) {
222 *twp = tw; 223 *twp = tw;
@@ -231,7 +232,7 @@ unique:
231 return 0; 232 return 0;
232 233
233not_unique: 234not_unique:
234 write_unlock(&head->lock); 235 write_unlock(lock);
235 return -EADDRNOTAVAIL; 236 return -EADDRNOTAVAIL;
236} 237}
237 238