aboutsummaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2007-11-07 05:40:20 -0500
committerDavid S. Miller <davem@sunset.davemloft.net>2007-11-07 07:15:11 -0500
commit230140cffa7feae90ad50bf259db1fa07674f3a7 (patch)
tree815472add31606423a508a17806b7884f0ab3e2e /net/ipv4
parentefac52762b1e3fe3035d29e82d8ee1aebc45e4a7 (diff)
[INET]: Remove per bucket rwlock in tcp/dccp ehash table.
As done two years ago on IP route cache table (commit 22c047ccbc68fa8f3fa57f0e8f906479a062c426) , we can avoid using one lock per hash bucket for the huge TCP/DCCP hash tables. On a typical x86_64 platform, this saves about 2MB or 4MB of ram, for litle performance differences. (we hit a different cache line for the rwlock, but then the bucket cache line have a better sharing factor among cpus, since we dirty it less often). For netstat or ss commands that want a full scan of hash table, we perform fewer memory accesses. Using a 'small' table of hashed rwlocks should be more than enough to provide correct SMP concurrency between different buckets, without using too much memory. Sizing of this table depends on num_possible_cpus() and various CONFIG settings. This patch provides some locking abstraction that may ease a future work using a different model for TCP/DCCP table. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/inet_diag.c9
-rw-r--r--net/ipv4/inet_hashtables.c7
-rw-r--r--net/ipv4/inet_timewait_sock.c13
-rw-r--r--net/ipv4/tcp.c4
-rw-r--r--net/ipv4/tcp_ipv4.c11
5 files changed, 24 insertions, 20 deletions
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index dc429b6b0ba6..b0170732b5e9 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -747,13 +747,14 @@ skip_listen_ht:
747 747
748 for (i = s_i; i < hashinfo->ehash_size; i++) { 748 for (i = s_i; i < hashinfo->ehash_size; i++) {
749 struct inet_ehash_bucket *head = &hashinfo->ehash[i]; 749 struct inet_ehash_bucket *head = &hashinfo->ehash[i];
750 rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
750 struct sock *sk; 751 struct sock *sk;
751 struct hlist_node *node; 752 struct hlist_node *node;
752 753
753 if (i > s_i) 754 if (i > s_i)
754 s_num = 0; 755 s_num = 0;
755 756
756 read_lock_bh(&head->lock); 757 read_lock_bh(lock);
757 num = 0; 758 num = 0;
758 sk_for_each(sk, node, &head->chain) { 759 sk_for_each(sk, node, &head->chain) {
759 struct inet_sock *inet = inet_sk(sk); 760 struct inet_sock *inet = inet_sk(sk);
@@ -769,7 +770,7 @@ skip_listen_ht:
769 r->id.idiag_dport) 770 r->id.idiag_dport)
770 goto next_normal; 771 goto next_normal;
771 if (inet_csk_diag_dump(sk, skb, cb) < 0) { 772 if (inet_csk_diag_dump(sk, skb, cb) < 0) {
772 read_unlock_bh(&head->lock); 773 read_unlock_bh(lock);
773 goto done; 774 goto done;
774 } 775 }
775next_normal: 776next_normal:
@@ -791,14 +792,14 @@ next_normal:
791 r->id.idiag_dport) 792 r->id.idiag_dport)
792 goto next_dying; 793 goto next_dying;
793 if (inet_twsk_diag_dump(tw, skb, cb) < 0) { 794 if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
794 read_unlock_bh(&head->lock); 795 read_unlock_bh(lock);
795 goto done; 796 goto done;
796 } 797 }
797next_dying: 798next_dying:
798 ++num; 799 ++num;
799 } 800 }
800 } 801 }
801 read_unlock_bh(&head->lock); 802 read_unlock_bh(lock);
802 } 803 }
803 804
804done: 805done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 16eecc7046a3..67704da04fc4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -204,12 +204,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
204 const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); 204 const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
205 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); 205 unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
206 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 206 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
207 rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
207 struct sock *sk2; 208 struct sock *sk2;
208 const struct hlist_node *node; 209 const struct hlist_node *node;
209 struct inet_timewait_sock *tw; 210 struct inet_timewait_sock *tw;
210 211
211 prefetch(head->chain.first); 212 prefetch(head->chain.first);
212 write_lock(&head->lock); 213 write_lock(lock);
213 214
214 /* Check TIME-WAIT sockets first. */ 215 /* Check TIME-WAIT sockets first. */
215 sk_for_each(sk2, node, &head->twchain) { 216 sk_for_each(sk2, node, &head->twchain) {
@@ -239,7 +240,7 @@ unique:
239 BUG_TRAP(sk_unhashed(sk)); 240 BUG_TRAP(sk_unhashed(sk));
240 __sk_add_node(sk, &head->chain); 241 __sk_add_node(sk, &head->chain);
241 sock_prot_inc_use(sk->sk_prot); 242 sock_prot_inc_use(sk->sk_prot);
242 write_unlock(&head->lock); 243 write_unlock(lock);
243 244
244 if (twp) { 245 if (twp) {
245 *twp = tw; 246 *twp = tw;
@@ -255,7 +256,7 @@ unique:
255 return 0; 256 return 0;
256 257
257not_unique: 258not_unique:
258 write_unlock(&head->lock); 259 write_unlock(lock);
259 return -EADDRNOTAVAIL; 260 return -EADDRNOTAVAIL;
260} 261}
261 262
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 4e189e28f306..a60b99e0ebdc 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
20 struct inet_bind_hashbucket *bhead; 20 struct inet_bind_hashbucket *bhead;
21 struct inet_bind_bucket *tb; 21 struct inet_bind_bucket *tb;
22 /* Unlink from established hashes. */ 22 /* Unlink from established hashes. */
23 struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash); 23 rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
24 24
25 write_lock(&ehead->lock); 25 write_lock(lock);
26 if (hlist_unhashed(&tw->tw_node)) { 26 if (hlist_unhashed(&tw->tw_node)) {
27 write_unlock(&ehead->lock); 27 write_unlock(lock);
28 return; 28 return;
29 } 29 }
30 __hlist_del(&tw->tw_node); 30 __hlist_del(&tw->tw_node);
31 sk_node_init(&tw->tw_node); 31 sk_node_init(&tw->tw_node);
32 write_unlock(&ehead->lock); 32 write_unlock(lock);
33 33
34 /* Disassociate with bind bucket. */ 34 /* Disassociate with bind bucket. */
35 bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; 35 bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
@@ -59,6 +59,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
59 const struct inet_sock *inet = inet_sk(sk); 59 const struct inet_sock *inet = inet_sk(sk);
60 const struct inet_connection_sock *icsk = inet_csk(sk); 60 const struct inet_connection_sock *icsk = inet_csk(sk);
61 struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); 61 struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
62 rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
62 struct inet_bind_hashbucket *bhead; 63 struct inet_bind_hashbucket *bhead;
63 /* Step 1: Put TW into bind hash. Original socket stays there too. 64 /* Step 1: Put TW into bind hash. Original socket stays there too.
64 Note, that any socket with inet->num != 0 MUST be bound in 65 Note, that any socket with inet->num != 0 MUST be bound in
@@ -71,7 +72,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
71 inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); 72 inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
72 spin_unlock(&bhead->lock); 73 spin_unlock(&bhead->lock);
73 74
74 write_lock(&ehead->lock); 75 write_lock(lock);
75 76
76 /* Step 2: Remove SK from established hash. */ 77 /* Step 2: Remove SK from established hash. */
77 if (__sk_del_node_init(sk)) 78 if (__sk_del_node_init(sk))
@@ -81,7 +82,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
81 inet_twsk_add_node(tw, &ehead->twchain); 82 inet_twsk_add_node(tw, &ehead->twchain);
82 atomic_inc(&tw->tw_refcnt); 83 atomic_inc(&tw->tw_refcnt);
83 84
84 write_unlock(&ehead->lock); 85 write_unlock(lock);
85} 86}
86 87
87EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); 88EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c64072bb504b..8e65182f7af1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2456,11 +2456,11 @@ void __init tcp_init(void)
2456 thash_entries ? 0 : 512 * 1024); 2456 thash_entries ? 0 : 512 * 1024);
2457 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; 2457 tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
2458 for (i = 0; i < tcp_hashinfo.ehash_size; i++) { 2458 for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2459 rwlock_init(&tcp_hashinfo.ehash[i].lock);
2460 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); 2459 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2461 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); 2460 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
2462 } 2461 }
2463 2462 if (inet_ehash_locks_alloc(&tcp_hashinfo))
2463 panic("TCP: failed to alloc ehash_locks");
2464 tcp_hashinfo.bhash = 2464 tcp_hashinfo.bhash =
2465 alloc_large_system_hash("TCP bind", 2465 alloc_large_system_hash("TCP bind",
2466 sizeof(struct inet_bind_hashbucket), 2466 sizeof(struct inet_bind_hashbucket),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e9127cdced20..e566f3c67677 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2049,8 +2049,9 @@ static void *established_get_first(struct seq_file *seq)
2049 struct sock *sk; 2049 struct sock *sk;
2050 struct hlist_node *node; 2050 struct hlist_node *node;
2051 struct inet_timewait_sock *tw; 2051 struct inet_timewait_sock *tw;
2052 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2052 2053
2053 read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock); 2054 read_lock_bh(lock);
2054 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { 2055 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2055 if (sk->sk_family != st->family) { 2056 if (sk->sk_family != st->family) {
2056 continue; 2057 continue;
@@ -2067,7 +2068,7 @@ static void *established_get_first(struct seq_file *seq)
2067 rc = tw; 2068 rc = tw;
2068 goto out; 2069 goto out;
2069 } 2070 }
2070 read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); 2071 read_unlock_bh(lock);
2071 st->state = TCP_SEQ_STATE_ESTABLISHED; 2072 st->state = TCP_SEQ_STATE_ESTABLISHED;
2072 } 2073 }
2073out: 2074out:
@@ -2094,11 +2095,11 @@ get_tw:
2094 cur = tw; 2095 cur = tw;
2095 goto out; 2096 goto out;
2096 } 2097 }
2097 read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); 2098 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2098 st->state = TCP_SEQ_STATE_ESTABLISHED; 2099 st->state = TCP_SEQ_STATE_ESTABLISHED;
2099 2100
2100 if (++st->bucket < tcp_hashinfo.ehash_size) { 2101 if (++st->bucket < tcp_hashinfo.ehash_size) {
2101 read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock); 2102 read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2102 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); 2103 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2103 } else { 2104 } else {
2104 cur = NULL; 2105 cur = NULL;
@@ -2206,7 +2207,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
2206 case TCP_SEQ_STATE_TIME_WAIT: 2207 case TCP_SEQ_STATE_TIME_WAIT:
2207 case TCP_SEQ_STATE_ESTABLISHED: 2208 case TCP_SEQ_STATE_ESTABLISHED:
2208 if (v) 2209 if (v)
2209 read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock); 2210 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2210 break; 2211 break;
2211 } 2212 }
2212} 2213}