diff options
author | Chris Mason <chris.mason@oracle.com> | 2010-05-11 18:09:45 -0400 |
---|---|---|
committer | Andy Grover <andy.grover@oracle.com> | 2010-09-08 21:15:08 -0400 |
commit | 38a4e5e61344490f18241333d7b1b368a3a38748 (patch) | |
tree | 60268171e32aff9938334806d0debd5c35b3dc56 | |
parent | e4c52c98e04937ea87b0979a81354d0040d284f9 (diff) |
rds: Use RCU for the bind lookup searches
The RDS bind lookups are somewhat expensive in terms of CPU
time and locking overhead. This commit changes them into a
faster RCU-based hash table instead of the rbtrees they were using
before.
On large NUMA systems it is a significant improvement.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r-- | net/rds/af_rds.c | 8 | ||||
-rw-r--r-- | net/rds/bind.c | 90 | ||||
-rw-r--r-- | net/rds/ib_rdma.c | 2 | ||||
-rw-r--r-- | net/rds/rds.h | 2 |
4 files changed, 57 insertions, 45 deletions
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index ef09340cf7a9..f16d2a92cb89 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c | |||
@@ -72,7 +72,15 @@ static int rds_release(struct socket *sock) | |||
72 | * with the socket. */ | 72 | * with the socket. */ |
73 | rds_clear_recv_queue(rs); | 73 | rds_clear_recv_queue(rs); |
74 | rds_cong_remove_socket(rs); | 74 | rds_cong_remove_socket(rs); |
75 | |||
76 | /* | ||
77 | * the binding lookup hash uses rcu, we need to | ||
78 | * make sure we synchronize_rcu before we free our | ||
79 | * entry | ||
80 | */ | ||
75 | rds_remove_bound(rs); | 81 | rds_remove_bound(rs); |
82 | synchronize_rcu(); | ||
83 | |||
76 | rds_send_drop_to(rs, NULL); | 84 | rds_send_drop_to(rs, NULL); |
77 | rds_rdma_drop_keys(rs); | 85 | rds_rdma_drop_keys(rs); |
78 | rds_notify_queue_get(rs, NULL); | 86 | rds_notify_queue_get(rs, NULL); |
diff --git a/net/rds/bind.c b/net/rds/bind.c index a65afff6f4b5..2f6b3fcc79f8 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c | |||
@@ -34,45 +34,52 @@ | |||
34 | #include <net/sock.h> | 34 | #include <net/sock.h> |
35 | #include <linux/in.h> | 35 | #include <linux/in.h> |
36 | #include <linux/if_arp.h> | 36 | #include <linux/if_arp.h> |
37 | #include <linux/jhash.h> | ||
37 | #include "rds.h" | 38 | #include "rds.h" |
38 | 39 | ||
39 | /* | 40 | #define BIND_HASH_SIZE 1024 |
40 | * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't | 41 | static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; |
41 | * particularly zippy. | 42 | static DEFINE_SPINLOCK(rds_bind_lock); |
42 | * | 43 | |
43 | * This is now called for every incoming frame so we arguably care much more | 44 | static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) |
44 | * about it than we used to. | 45 | { |
45 | */ | 46 | return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & |
46 | static DEFINE_RWLOCK(rds_bind_lock); | 47 | (BIND_HASH_SIZE - 1)); |
47 | static struct rb_root rds_bind_tree = RB_ROOT; | 48 | } |
48 | 49 | ||
49 | static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, | 50 | static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, |
50 | struct rds_sock *insert) | 51 | struct rds_sock *insert) |
51 | { | 52 | { |
52 | struct rb_node **p = &rds_bind_tree.rb_node; | ||
53 | struct rb_node *parent = NULL; | ||
54 | struct rds_sock *rs; | 53 | struct rds_sock *rs; |
54 | struct hlist_node *node; | ||
55 | struct hlist_head *head = hash_to_bucket(addr, port); | ||
55 | u64 cmp; | 56 | u64 cmp; |
56 | u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); | 57 | u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); |
57 | 58 | ||
58 | while (*p) { | 59 | rcu_read_lock(); |
59 | parent = *p; | 60 | hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) { |
60 | rs = rb_entry(parent, struct rds_sock, rs_bound_node); | ||
61 | |||
62 | cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | | 61 | cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | |
63 | be16_to_cpu(rs->rs_bound_port); | 62 | be16_to_cpu(rs->rs_bound_port); |
64 | 63 | ||
65 | if (needle < cmp) | 64 | if (cmp == needle) { |
66 | p = &(*p)->rb_left; | 65 | rcu_read_unlock(); |
67 | else if (needle > cmp) | ||
68 | p = &(*p)->rb_right; | ||
69 | else | ||
70 | return rs; | 66 | return rs; |
67 | } | ||
71 | } | 68 | } |
69 | rcu_read_unlock(); | ||
72 | 70 | ||
73 | if (insert) { | 71 | if (insert) { |
74 | rb_link_node(&insert->rs_bound_node, parent, p); | 72 | /* |
75 | rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); | 73 | * make sure our addr and port are set before |
74 | * we are added to the list, other people | ||
75 | * in rcu will find us as soon as the | ||
76 | * hlist_add_head_rcu is done | ||
77 | */ | ||
78 | insert->rs_bound_addr = addr; | ||
79 | insert->rs_bound_port = port; | ||
80 | rds_sock_addref(insert); | ||
81 | |||
82 | hlist_add_head_rcu(&insert->rs_bound_node, head); | ||
76 | } | 83 | } |
77 | return NULL; | 84 | return NULL; |
78 | } | 85 | } |
@@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, | |||
86 | struct rds_sock *rds_find_bound(__be32 addr, __be16 port) | 93 | struct rds_sock *rds_find_bound(__be32 addr, __be16 port) |
87 | { | 94 | { |
88 | struct rds_sock *rs; | 95 | struct rds_sock *rs; |
89 | unsigned long flags; | ||
90 | 96 | ||
91 | read_lock_irqsave(&rds_bind_lock, flags); | 97 | rs = rds_bind_lookup(addr, port, NULL); |
92 | rs = rds_bind_tree_walk(addr, port, NULL); | 98 | |
93 | if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) | 99 | if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) |
94 | rds_sock_addref(rs); | 100 | rds_sock_addref(rs); |
95 | else | 101 | else |
96 | rs = NULL; | 102 | rs = NULL; |
97 | read_unlock_irqrestore(&rds_bind_lock, flags); | ||
98 | 103 | ||
99 | rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, | 104 | rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, |
100 | ntohs(port)); | 105 | ntohs(port)); |
@@ -116,28 +121,21 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) | |||
116 | last = rover - 1; | 121 | last = rover - 1; |
117 | } | 122 | } |
118 | 123 | ||
119 | write_lock_irqsave(&rds_bind_lock, flags); | 124 | spin_lock_irqsave(&rds_bind_lock, flags); |
120 | 125 | ||
121 | do { | 126 | do { |
122 | if (rover == 0) | 127 | if (rover == 0) |
123 | rover++; | 128 | rover++; |
124 | if (!rds_bind_tree_walk(addr, cpu_to_be16(rover), rs)) { | 129 | if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { |
125 | *port = cpu_to_be16(rover); | 130 | *port = rs->rs_bound_port; |
126 | ret = 0; | 131 | ret = 0; |
132 | rdsdebug("rs %p binding to %pI4:%d\n", | ||
133 | rs, &addr, (int)ntohs(*port)); | ||
127 | break; | 134 | break; |
128 | } | 135 | } |
129 | } while (rover++ != last); | 136 | } while (rover++ != last); |
130 | 137 | ||
131 | if (ret == 0) { | 138 | spin_unlock_irqrestore(&rds_bind_lock, flags); |
132 | rs->rs_bound_addr = addr; | ||
133 | rs->rs_bound_port = *port; | ||
134 | rds_sock_addref(rs); | ||
135 | |||
136 | rdsdebug("rs %p binding to %pI4:%d\n", | ||
137 | rs, &addr, (int)ntohs(*port)); | ||
138 | } | ||
139 | |||
140 | write_unlock_irqrestore(&rds_bind_lock, flags); | ||
141 | 139 | ||
142 | return ret; | 140 | return ret; |
143 | } | 141 | } |
@@ -146,19 +144,19 @@ void rds_remove_bound(struct rds_sock *rs) | |||
146 | { | 144 | { |
147 | unsigned long flags; | 145 | unsigned long flags; |
148 | 146 | ||
149 | write_lock_irqsave(&rds_bind_lock, flags); | 147 | spin_lock_irqsave(&rds_bind_lock, flags); |
150 | 148 | ||
151 | if (rs->rs_bound_addr) { | 149 | if (rs->rs_bound_addr) { |
152 | rdsdebug("rs %p unbinding from %pI4:%d\n", | 150 | rdsdebug("rs %p unbinding from %pI4:%d\n", |
153 | rs, &rs->rs_bound_addr, | 151 | rs, &rs->rs_bound_addr, |
154 | ntohs(rs->rs_bound_port)); | 152 | ntohs(rs->rs_bound_port)); |
155 | 153 | ||
156 | rb_erase(&rs->rs_bound_node, &rds_bind_tree); | 154 | hlist_del_init_rcu(&rs->rs_bound_node); |
157 | rds_sock_put(rs); | 155 | rds_sock_put(rs); |
158 | rs->rs_bound_addr = 0; | 156 | rs->rs_bound_addr = 0; |
159 | } | 157 | } |
160 | 158 | ||
161 | write_unlock_irqrestore(&rds_bind_lock, flags); | 159 | spin_unlock_irqrestore(&rds_bind_lock, flags); |
162 | } | 160 | } |
163 | 161 | ||
164 | int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | 162 | int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) |
@@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | |||
198 | 196 | ||
199 | out: | 197 | out: |
200 | release_sock(sk); | 198 | release_sock(sk); |
199 | |||
200 | /* we might have called rds_remove_bound on error */ | ||
201 | if (ret) | ||
202 | synchronize_rcu(); | ||
201 | return ret; | 203 | return ret; |
202 | } | 204 | } |
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index cc341cd70c87..4ba01b9ffd44 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c | |||
@@ -303,6 +303,8 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) | |||
303 | goto out_no_cigar; | 303 | goto out_no_cigar; |
304 | } | 304 | } |
305 | 305 | ||
306 | memset(ibmr, 0, sizeof(*ibmr)); | ||
307 | |||
306 | ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, | 308 | ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, |
307 | (IB_ACCESS_LOCAL_WRITE | | 309 | (IB_ACCESS_LOCAL_WRITE | |
308 | IB_ACCESS_REMOTE_READ | | 310 | IB_ACCESS_REMOTE_READ | |
diff --git a/net/rds/rds.h b/net/rds/rds.h index c22bd7b49460..241a0859d16e 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h | |||
@@ -452,7 +452,7 @@ struct rds_sock { | |||
452 | * bound_addr used for both incoming and outgoing, no INADDR_ANY | 452 | * bound_addr used for both incoming and outgoing, no INADDR_ANY |
453 | * support. | 453 | * support. |
454 | */ | 454 | */ |
455 | struct rb_node rs_bound_node; | 455 | struct hlist_node rs_bound_node; |
456 | __be32 rs_bound_addr; | 456 | __be32 rs_bound_addr; |
457 | __be32 rs_conn_addr; | 457 | __be32 rs_conn_addr; |
458 | __be16 rs_bound_port; | 458 | __be16 rs_bound_port; |