| author | Eric Dumazet <dada1@cosmosbay.com> | 2007-11-07 05:40:20 -0500 |
|---|---|---|
| committer | David S. Miller <davem@sunset.davemloft.net> | 2007-11-07 07:15:11 -0500 |
| commit | 230140cffa7feae90ad50bf259db1fa07674f3a7 (patch) | |
| tree | 815472add31606423a508a17806b7884f0ab3e2e /net/ipv4 | |
| parent | efac52762b1e3fe3035d29e82d8ee1aebc45e4a7 (diff) | |
[INET]: Remove per bucket rwlock in tcp/dccp ehash table.
As done two years ago on the IP route cache table (commit
22c047ccbc68fa8f3fa57f0e8f906479a062c426), we can avoid using one
lock per hash bucket for the huge TCP/DCCP hash tables.
On a typical x86_64 platform, this saves about 2MB or 4MB of RAM,
with little performance difference. (We hit a different cache line for
the rwlock, but the bucket cache line then has a better sharing factor
among CPUs, since we dirty it less often.) For netstat or ss commands
that want a full scan of the hash table, we perform fewer memory accesses.
Using a 'small' table of hashed rwlocks should be more than enough to
provide correct SMP concurrency between different buckets, without
using too much memory. Sizing of this table depends on
num_possible_cpus() and various CONFIG settings.
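For illustration, the lock-table size could scale with num_possible_cpus()
roughly as in the sketch below. This is a hedged sketch only: the actual
inet_ehash_locks_alloc() helper lives in include/net/inet_hashtables.h,
outside this net/ipv4 diffstat, and its exact thresholds and
CONFIG-dependent branches may differ.

/*
 * Hypothetical sketch: pick a power-of-two number of rwlocks based on
 * num_possible_cpus().  The real inet_ehash_locks_alloc() uses similar
 * scaling, but its exact constants and CONFIG handling (e.g. lockdep
 * builds) are not shown in this diff.
 */
#include <linux/cpumask.h>

static unsigned int ehash_locks_size(void)
{
	unsigned int size = 256;	/* floor: a few KB of locks instead of MBs */
	unsigned int nr_pcpus = num_possible_cpus();

	if (nr_pcpus >= 4)
		size = 512;
	if (nr_pcpus >= 8)
		size = 1024;
	if (nr_pcpus >= 16)
		size = 2048;
	if (nr_pcpus >= 32)
		size = 4096;
	return size;
}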
This patch provides some locking abstraction that may ease future
work on a different locking model for the TCP/DCCP tables.
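The abstraction is essentially a hash-to-lock mapping used by every call
site in the diff below; a minimal sketch is shown here, assuming
ehash_locks and ehash_locks_mask fields in struct inet_hashinfo (the real
definition is added to include/net/inet_hashtables.h, which is outside
this net/ipv4 diffstat).

/*
 * Minimal sketch of the lookup helper used throughout this patch:
 * map a bucket hash onto one entry of a small, power-of-two sized
 * array of rwlocks.  The field names are assumptions; only the
 * inet_ehash_lockp() calls themselves appear in the diff.
 */
static inline rwlock_t *inet_ehash_lockp(struct inet_hashinfo *hashinfo,
					 unsigned int hash)
{
	return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask];
}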
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
| -rw-r--r-- | net/ipv4/inet_diag.c | 9 |
| -rw-r--r-- | net/ipv4/inet_hashtables.c | 7 |
| -rw-r--r-- | net/ipv4/inet_timewait_sock.c | 13 |
| -rw-r--r-- | net/ipv4/tcp.c | 4 |
| -rw-r--r-- | net/ipv4/tcp_ipv4.c | 11 |
5 files changed, 24 insertions, 20 deletions
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index dc429b6b0ba6..b0170732b5e9 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -747,13 +747,14 @@ skip_listen_ht:
 
 	for (i = s_i; i < hashinfo->ehash_size; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+		rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
 		struct sock *sk;
 		struct hlist_node *node;
 
 		if (i > s_i)
 			s_num = 0;
 
-		read_lock_bh(&head->lock);
+		read_lock_bh(lock);
 		num = 0;
 		sk_for_each(sk, node, &head->chain) {
 			struct inet_sock *inet = inet_sk(sk);
@@ -769,7 +770,7 @@ skip_listen_ht:
 			    r->id.idiag_dport)
 				goto next_normal;
 			if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-				read_unlock_bh(&head->lock);
+				read_unlock_bh(lock);
 				goto done;
 			}
 next_normal:
@@ -791,14 +792,14 @@ next_normal:
 				    r->id.idiag_dport)
 					goto next_dying;
 				if (inet_twsk_diag_dump(tw, skb, cb) < 0) {
-					read_unlock_bh(&head->lock);
+					read_unlock_bh(lock);
 					goto done;
 				}
 next_dying:
 				++num;
 			}
 		}
-		read_unlock_bh(&head->lock);
+		read_unlock_bh(lock);
 	}
 
 done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 16eecc7046a3..67704da04fc4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -204,12 +204,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
 	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+	rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
 	struct sock *sk2;
 	const struct hlist_node *node;
 	struct inet_timewait_sock *tw;
 
 	prefetch(head->chain.first);
-	write_lock(&head->lock);
+	write_lock(lock);
 
 	/* Check TIME-WAIT sockets first. */
 	sk_for_each(sk2, node, &head->twchain) {
@@ -239,7 +240,7 @@ unique:
 	BUG_TRAP(sk_unhashed(sk));
 	__sk_add_node(sk, &head->chain);
 	sock_prot_inc_use(sk->sk_prot);
-	write_unlock(&head->lock);
+	write_unlock(lock);
 
 	if (twp) {
 		*twp = tw;
@@ -255,7 +256,7 @@ unique:
 	return 0;
 
 not_unique:
-	write_unlock(&head->lock);
+	write_unlock(lock);
 	return -EADDRNOTAVAIL;
 }
 
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 4e189e28f306..a60b99e0ebdc 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -20,16 +20,16 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 	struct inet_bind_hashbucket *bhead;
 	struct inet_bind_bucket *tb;
 	/* Unlink from established hashes. */
-	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
+	rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
 
-	write_lock(&ehead->lock);
+	write_lock(lock);
 	if (hlist_unhashed(&tw->tw_node)) {
-		write_unlock(&ehead->lock);
+		write_unlock(lock);
 		return;
 	}
 	__hlist_del(&tw->tw_node);
 	sk_node_init(&tw->tw_node);
-	write_unlock(&ehead->lock);
+	write_unlock(lock);
 
 	/* Disassociate with bind bucket. */
 	bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
@@ -59,6 +59,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	const struct inet_sock *inet = inet_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
+	rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 	struct inet_bind_hashbucket *bhead;
 	/* Step 1: Put TW into bind hash. Original socket stays there too.
 	   Note, that any socket with inet->num != 0 MUST be bound in
@@ -71,7 +72,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
 	spin_unlock(&bhead->lock);
 
-	write_lock(&ehead->lock);
+	write_lock(lock);
 
 	/* Step 2: Remove SK from established hash. */
 	if (__sk_del_node_init(sk))
@@ -81,7 +82,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 	inet_twsk_add_node(tw, &ehead->twchain);
 	atomic_inc(&tw->tw_refcnt);
 
-	write_unlock(&ehead->lock);
+	write_unlock(lock);
 }
 
 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c64072bb504b..8e65182f7af1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2456,11 +2456,11 @@ void __init tcp_init(void)
 					thash_entries ? 0 : 512 * 1024);
 	tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
 	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
-		rwlock_init(&tcp_hashinfo.ehash[i].lock);
 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
 		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
 	}
-
+	if (inet_ehash_locks_alloc(&tcp_hashinfo))
+		panic("TCP: failed to alloc ehash_locks");
 	tcp_hashinfo.bhash =
 		alloc_large_system_hash("TCP bind",
 					sizeof(struct inet_bind_hashbucket),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e9127cdced20..e566f3c67677 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2049,8 +2049,9 @@ static void *established_get_first(struct seq_file *seq)
 		struct sock *sk;
 		struct hlist_node *node;
 		struct inet_timewait_sock *tw;
+		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
 
-		read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+		read_lock_bh(lock);
 		sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
 			if (sk->sk_family != st->family) {
 				continue;
@@ -2067,7 +2068,7 @@ static void *established_get_first(struct seq_file *seq)
 			rc = tw;
 			goto out;
 		}
-		read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+		read_unlock_bh(lock);
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 	}
 out:
@@ -2094,11 +2095,11 @@ get_tw:
 		cur = tw;
 		goto out;
 	}
-	read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+	read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 	st->state = TCP_SEQ_STATE_ESTABLISHED;
 
 	if (++st->bucket < tcp_hashinfo.ehash_size) {
-		read_lock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+		read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 		sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
 	} else {
 		cur = NULL;
@@ -2206,7 +2207,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
 	case TCP_SEQ_STATE_TIME_WAIT:
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (v)
-			read_unlock_bh(&tcp_hashinfo.ehash[st->bucket].lock);
+			read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
 		break;
 	}
 }