diff options
Diffstat (limited to 'net/rds')
40 files changed, 2552 insertions, 1577 deletions
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index aebfecbdb84..bb6ad81b671 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c | |||
@@ -39,7 +39,15 @@ | |||
39 | #include <net/sock.h> | 39 | #include <net/sock.h> |
40 | 40 | ||
41 | #include "rds.h" | 41 | #include "rds.h" |
42 | #include "rdma.h" | 42 | |
43 | char *rds_str_array(char **array, size_t elements, size_t index) | ||
44 | { | ||
45 | if ((index < elements) && array[index]) | ||
46 | return array[index]; | ||
47 | else | ||
48 | return "unknown"; | ||
49 | } | ||
50 | EXPORT_SYMBOL(rds_str_array); | ||
43 | 51 | ||
44 | /* this is just used for stats gathering :/ */ | 52 | /* this is just used for stats gathering :/ */ |
45 | static DEFINE_SPINLOCK(rds_sock_lock); | 53 | static DEFINE_SPINLOCK(rds_sock_lock); |
@@ -62,7 +70,7 @@ static int rds_release(struct socket *sock) | |||
62 | struct rds_sock *rs; | 70 | struct rds_sock *rs; |
63 | unsigned long flags; | 71 | unsigned long flags; |
64 | 72 | ||
65 | if (sk == NULL) | 73 | if (!sk) |
66 | goto out; | 74 | goto out; |
67 | 75 | ||
68 | rs = rds_sk_to_rs(sk); | 76 | rs = rds_sk_to_rs(sk); |
@@ -73,7 +81,15 @@ static int rds_release(struct socket *sock) | |||
73 | * with the socket. */ | 81 | * with the socket. */ |
74 | rds_clear_recv_queue(rs); | 82 | rds_clear_recv_queue(rs); |
75 | rds_cong_remove_socket(rs); | 83 | rds_cong_remove_socket(rs); |
84 | |||
85 | /* | ||
86 | * the binding lookup hash uses rcu, we need to | ||
87 | * make sure we synchronize_rcu before we free our | ||
88 | * entry | ||
89 | */ | ||
76 | rds_remove_bound(rs); | 90 | rds_remove_bound(rs); |
91 | synchronize_rcu(); | ||
92 | |||
77 | rds_send_drop_to(rs, NULL); | 93 | rds_send_drop_to(rs, NULL); |
78 | rds_rdma_drop_keys(rs); | 94 | rds_rdma_drop_keys(rs); |
79 | rds_notify_queue_get(rs, NULL); | 95 | rds_notify_queue_get(rs, NULL); |
@@ -83,6 +99,8 @@ static int rds_release(struct socket *sock) | |||
83 | rds_sock_count--; | 99 | rds_sock_count--; |
84 | spin_unlock_irqrestore(&rds_sock_lock, flags); | 100 | spin_unlock_irqrestore(&rds_sock_lock, flags); |
85 | 101 | ||
102 | rds_trans_put(rs->rs_transport); | ||
103 | |||
86 | sock->sk = NULL; | 104 | sock->sk = NULL; |
87 | sock_put(sk); | 105 | sock_put(sk); |
88 | out: | 106 | out: |
@@ -514,7 +532,7 @@ out: | |||
514 | spin_unlock_irqrestore(&rds_sock_lock, flags); | 532 | spin_unlock_irqrestore(&rds_sock_lock, flags); |
515 | } | 533 | } |
516 | 534 | ||
517 | static void __exit rds_exit(void) | 535 | static void rds_exit(void) |
518 | { | 536 | { |
519 | sock_unregister(rds_family_ops.family); | 537 | sock_unregister(rds_family_ops.family); |
520 | proto_unregister(&rds_proto); | 538 | proto_unregister(&rds_proto); |
@@ -529,7 +547,7 @@ static void __exit rds_exit(void) | |||
529 | } | 547 | } |
530 | module_exit(rds_exit); | 548 | module_exit(rds_exit); |
531 | 549 | ||
532 | static int __init rds_init(void) | 550 | static int rds_init(void) |
533 | { | 551 | { |
534 | int ret; | 552 | int ret; |
535 | 553 | ||
diff --git a/net/rds/bind.c b/net/rds/bind.c index 5d95fc007f1..2f6b3fcc79f 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c | |||
@@ -34,45 +34,52 @@ | |||
34 | #include <net/sock.h> | 34 | #include <net/sock.h> |
35 | #include <linux/in.h> | 35 | #include <linux/in.h> |
36 | #include <linux/if_arp.h> | 36 | #include <linux/if_arp.h> |
37 | #include <linux/jhash.h> | ||
37 | #include "rds.h" | 38 | #include "rds.h" |
38 | 39 | ||
39 | /* | 40 | #define BIND_HASH_SIZE 1024 |
40 | * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't | 41 | static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; |
41 | * particularly zippy. | ||
42 | * | ||
43 | * This is now called for every incoming frame so we arguably care much more | ||
44 | * about it than we used to. | ||
45 | */ | ||
46 | static DEFINE_SPINLOCK(rds_bind_lock); | 42 | static DEFINE_SPINLOCK(rds_bind_lock); |
47 | static struct rb_root rds_bind_tree = RB_ROOT; | ||
48 | 43 | ||
49 | static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, | 44 | static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) |
50 | struct rds_sock *insert) | 45 | { |
46 | return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & | ||
47 | (BIND_HASH_SIZE - 1)); | ||
48 | } | ||
49 | |||
50 | static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, | ||
51 | struct rds_sock *insert) | ||
51 | { | 52 | { |
52 | struct rb_node **p = &rds_bind_tree.rb_node; | ||
53 | struct rb_node *parent = NULL; | ||
54 | struct rds_sock *rs; | 53 | struct rds_sock *rs; |
54 | struct hlist_node *node; | ||
55 | struct hlist_head *head = hash_to_bucket(addr, port); | ||
55 | u64 cmp; | 56 | u64 cmp; |
56 | u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); | 57 | u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); |
57 | 58 | ||
58 | while (*p) { | 59 | rcu_read_lock(); |
59 | parent = *p; | 60 | hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) { |
60 | rs = rb_entry(parent, struct rds_sock, rs_bound_node); | ||
61 | |||
62 | cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | | 61 | cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | |
63 | be16_to_cpu(rs->rs_bound_port); | 62 | be16_to_cpu(rs->rs_bound_port); |
64 | 63 | ||
65 | if (needle < cmp) | 64 | if (cmp == needle) { |
66 | p = &(*p)->rb_left; | 65 | rcu_read_unlock(); |
67 | else if (needle > cmp) | ||
68 | p = &(*p)->rb_right; | ||
69 | else | ||
70 | return rs; | 66 | return rs; |
67 | } | ||
71 | } | 68 | } |
69 | rcu_read_unlock(); | ||
72 | 70 | ||
73 | if (insert) { | 71 | if (insert) { |
74 | rb_link_node(&insert->rs_bound_node, parent, p); | 72 | /* |
75 | rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); | 73 | * make sure our addr and port are set before |
74 | * we are added to the list, other people | ||
75 | * in rcu will find us as soon as the | ||
76 | * hlist_add_head_rcu is done | ||
77 | */ | ||
78 | insert->rs_bound_addr = addr; | ||
79 | insert->rs_bound_port = port; | ||
80 | rds_sock_addref(insert); | ||
81 | |||
82 | hlist_add_head_rcu(&insert->rs_bound_node, head); | ||
76 | } | 83 | } |
77 | return NULL; | 84 | return NULL; |
78 | } | 85 | } |
@@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, | |||
86 | struct rds_sock *rds_find_bound(__be32 addr, __be16 port) | 93 | struct rds_sock *rds_find_bound(__be32 addr, __be16 port) |
87 | { | 94 | { |
88 | struct rds_sock *rs; | 95 | struct rds_sock *rs; |
89 | unsigned long flags; | ||
90 | 96 | ||
91 | spin_lock_irqsave(&rds_bind_lock, flags); | 97 | rs = rds_bind_lookup(addr, port, NULL); |
92 | rs = rds_bind_tree_walk(addr, port, NULL); | 98 | |
93 | if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) | 99 | if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) |
94 | rds_sock_addref(rs); | 100 | rds_sock_addref(rs); |
95 | else | 101 | else |
96 | rs = NULL; | 102 | rs = NULL; |
97 | spin_unlock_irqrestore(&rds_bind_lock, flags); | ||
98 | 103 | ||
99 | rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, | 104 | rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, |
100 | ntohs(port)); | 105 | ntohs(port)); |
@@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) | |||
121 | do { | 126 | do { |
122 | if (rover == 0) | 127 | if (rover == 0) |
123 | rover++; | 128 | rover++; |
124 | if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) { | 129 | if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { |
125 | *port = cpu_to_be16(rover); | 130 | *port = rs->rs_bound_port; |
126 | ret = 0; | 131 | ret = 0; |
132 | rdsdebug("rs %p binding to %pI4:%d\n", | ||
133 | rs, &addr, (int)ntohs(*port)); | ||
127 | break; | 134 | break; |
128 | } | 135 | } |
129 | } while (rover++ != last); | 136 | } while (rover++ != last); |
130 | 137 | ||
131 | if (ret == 0) { | ||
132 | rs->rs_bound_addr = addr; | ||
133 | rs->rs_bound_port = *port; | ||
134 | rds_sock_addref(rs); | ||
135 | |||
136 | rdsdebug("rs %p binding to %pI4:%d\n", | ||
137 | rs, &addr, (int)ntohs(*port)); | ||
138 | } | ||
139 | |||
140 | spin_unlock_irqrestore(&rds_bind_lock, flags); | 138 | spin_unlock_irqrestore(&rds_bind_lock, flags); |
141 | 139 | ||
142 | return ret; | 140 | return ret; |
@@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs) | |||
153 | rs, &rs->rs_bound_addr, | 151 | rs, &rs->rs_bound_addr, |
154 | ntohs(rs->rs_bound_port)); | 152 | ntohs(rs->rs_bound_port)); |
155 | 153 | ||
156 | rb_erase(&rs->rs_bound_node, &rds_bind_tree); | 154 | hlist_del_init_rcu(&rs->rs_bound_node); |
157 | rds_sock_put(rs); | 155 | rds_sock_put(rs); |
158 | rs->rs_bound_addr = 0; | 156 | rs->rs_bound_addr = 0; |
159 | } | 157 | } |
@@ -184,7 +182,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | |||
184 | goto out; | 182 | goto out; |
185 | 183 | ||
186 | trans = rds_trans_get_preferred(sin->sin_addr.s_addr); | 184 | trans = rds_trans_get_preferred(sin->sin_addr.s_addr); |
187 | if (trans == NULL) { | 185 | if (!trans) { |
188 | ret = -EADDRNOTAVAIL; | 186 | ret = -EADDRNOTAVAIL; |
189 | rds_remove_bound(rs); | 187 | rds_remove_bound(rs); |
190 | if (printk_ratelimit()) | 188 | if (printk_ratelimit()) |
@@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) | |||
198 | 196 | ||
199 | out: | 197 | out: |
200 | release_sock(sk); | 198 | release_sock(sk); |
199 | |||
200 | /* we might have called rds_remove_bound on error */ | ||
201 | if (ret) | ||
202 | synchronize_rcu(); | ||
201 | return ret; | 203 | return ret; |
202 | } | 204 | } |
diff --git a/net/rds/cong.c b/net/rds/cong.c index 0871a29f078..75ea686f27d 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c | |||
@@ -141,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) | |||
141 | unsigned long flags; | 141 | unsigned long flags; |
142 | 142 | ||
143 | map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); | 143 | map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); |
144 | if (map == NULL) | 144 | if (!map) |
145 | return NULL; | 145 | return NULL; |
146 | 146 | ||
147 | map->m_addr = addr; | 147 | map->m_addr = addr; |
@@ -159,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) | |||
159 | ret = rds_cong_tree_walk(addr, map); | 159 | ret = rds_cong_tree_walk(addr, map); |
160 | spin_unlock_irqrestore(&rds_cong_lock, flags); | 160 | spin_unlock_irqrestore(&rds_cong_lock, flags); |
161 | 161 | ||
162 | if (ret == NULL) { | 162 | if (!ret) { |
163 | ret = map; | 163 | ret = map; |
164 | map = NULL; | 164 | map = NULL; |
165 | } | 165 | } |
@@ -205,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn) | |||
205 | conn->c_lcong = rds_cong_from_addr(conn->c_laddr); | 205 | conn->c_lcong = rds_cong_from_addr(conn->c_laddr); |
206 | conn->c_fcong = rds_cong_from_addr(conn->c_faddr); | 206 | conn->c_fcong = rds_cong_from_addr(conn->c_faddr); |
207 | 207 | ||
208 | if (conn->c_lcong == NULL || conn->c_fcong == NULL) | 208 | if (!(conn->c_lcong && conn->c_fcong)) |
209 | return -ENOMEM; | 209 | return -ENOMEM; |
210 | 210 | ||
211 | return 0; | 211 | return 0; |
@@ -221,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map) | |||
221 | list_for_each_entry(conn, &map->m_conn_list, c_map_item) { | 221 | list_for_each_entry(conn, &map->m_conn_list, c_map_item) { |
222 | if (!test_and_set_bit(0, &conn->c_map_queued)) { | 222 | if (!test_and_set_bit(0, &conn->c_map_queued)) { |
223 | rds_stats_inc(s_cong_update_queued); | 223 | rds_stats_inc(s_cong_update_queued); |
224 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | 224 | rds_send_xmit(conn); |
225 | } | 225 | } |
226 | } | 226 | } |
227 | 227 | ||
diff --git a/net/rds/connection.c b/net/rds/connection.c index 7619b671ca2..870992e08ca 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c | |||
@@ -37,7 +37,6 @@ | |||
37 | 37 | ||
38 | #include "rds.h" | 38 | #include "rds.h" |
39 | #include "loop.h" | 39 | #include "loop.h" |
40 | #include "rdma.h" | ||
41 | 40 | ||
42 | #define RDS_CONNECTION_HASH_BITS 12 | 41 | #define RDS_CONNECTION_HASH_BITS 12 |
43 | #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) | 42 | #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) |
@@ -63,18 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) | |||
63 | var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ | 62 | var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ |
64 | } while (0) | 63 | } while (0) |
65 | 64 | ||
66 | static inline int rds_conn_is_sending(struct rds_connection *conn) | 65 | /* rcu read lock must be held or the connection spinlock */ |
67 | { | ||
68 | int ret = 0; | ||
69 | |||
70 | if (!mutex_trylock(&conn->c_send_lock)) | ||
71 | ret = 1; | ||
72 | else | ||
73 | mutex_unlock(&conn->c_send_lock); | ||
74 | |||
75 | return ret; | ||
76 | } | ||
77 | |||
78 | static struct rds_connection *rds_conn_lookup(struct hlist_head *head, | 66 | static struct rds_connection *rds_conn_lookup(struct hlist_head *head, |
79 | __be32 laddr, __be32 faddr, | 67 | __be32 laddr, __be32 faddr, |
80 | struct rds_transport *trans) | 68 | struct rds_transport *trans) |
@@ -82,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, | |||
82 | struct rds_connection *conn, *ret = NULL; | 70 | struct rds_connection *conn, *ret = NULL; |
83 | struct hlist_node *pos; | 71 | struct hlist_node *pos; |
84 | 72 | ||
85 | hlist_for_each_entry(conn, pos, head, c_hash_node) { | 73 | hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { |
86 | if (conn->c_faddr == faddr && conn->c_laddr == laddr && | 74 | if (conn->c_faddr == faddr && conn->c_laddr == laddr && |
87 | conn->c_trans == trans) { | 75 | conn->c_trans == trans) { |
88 | ret = conn; | 76 | ret = conn; |
@@ -129,10 +117,11 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | |||
129 | { | 117 | { |
130 | struct rds_connection *conn, *parent = NULL; | 118 | struct rds_connection *conn, *parent = NULL; |
131 | struct hlist_head *head = rds_conn_bucket(laddr, faddr); | 119 | struct hlist_head *head = rds_conn_bucket(laddr, faddr); |
120 | struct rds_transport *loop_trans; | ||
132 | unsigned long flags; | 121 | unsigned long flags; |
133 | int ret; | 122 | int ret; |
134 | 123 | ||
135 | spin_lock_irqsave(&rds_conn_lock, flags); | 124 | rcu_read_lock(); |
136 | conn = rds_conn_lookup(head, laddr, faddr, trans); | 125 | conn = rds_conn_lookup(head, laddr, faddr, trans); |
137 | if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && | 126 | if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && |
138 | !is_outgoing) { | 127 | !is_outgoing) { |
@@ -143,12 +132,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | |||
143 | parent = conn; | 132 | parent = conn; |
144 | conn = parent->c_passive; | 133 | conn = parent->c_passive; |
145 | } | 134 | } |
146 | spin_unlock_irqrestore(&rds_conn_lock, flags); | 135 | rcu_read_unlock(); |
147 | if (conn) | 136 | if (conn) |
148 | goto out; | 137 | goto out; |
149 | 138 | ||
150 | conn = kmem_cache_zalloc(rds_conn_slab, gfp); | 139 | conn = kmem_cache_zalloc(rds_conn_slab, gfp); |
151 | if (conn == NULL) { | 140 | if (!conn) { |
152 | conn = ERR_PTR(-ENOMEM); | 141 | conn = ERR_PTR(-ENOMEM); |
153 | goto out; | 142 | goto out; |
154 | } | 143 | } |
@@ -159,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | |||
159 | spin_lock_init(&conn->c_lock); | 148 | spin_lock_init(&conn->c_lock); |
160 | conn->c_next_tx_seq = 1; | 149 | conn->c_next_tx_seq = 1; |
161 | 150 | ||
162 | mutex_init(&conn->c_send_lock); | 151 | init_waitqueue_head(&conn->c_waitq); |
163 | INIT_LIST_HEAD(&conn->c_send_queue); | 152 | INIT_LIST_HEAD(&conn->c_send_queue); |
164 | INIT_LIST_HEAD(&conn->c_retrans); | 153 | INIT_LIST_HEAD(&conn->c_retrans); |
165 | 154 | ||
@@ -175,7 +164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | |||
175 | * can bind to the destination address then we'd rather the messages | 164 | * can bind to the destination address then we'd rather the messages |
176 | * flow through loopback rather than either transport. | 165 | * flow through loopback rather than either transport. |
177 | */ | 166 | */ |
178 | if (rds_trans_get_preferred(faddr)) { | 167 | loop_trans = rds_trans_get_preferred(faddr); |
168 | if (loop_trans) { | ||
169 | rds_trans_put(loop_trans); | ||
179 | conn->c_loopback = 1; | 170 | conn->c_loopback = 1; |
180 | if (is_outgoing && trans->t_prefer_loopback) { | 171 | if (is_outgoing && trans->t_prefer_loopback) { |
181 | /* "outgoing" connection - and the transport | 172 | /* "outgoing" connection - and the transport |
@@ -238,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, | |||
238 | kmem_cache_free(rds_conn_slab, conn); | 229 | kmem_cache_free(rds_conn_slab, conn); |
239 | conn = found; | 230 | conn = found; |
240 | } else { | 231 | } else { |
241 | hlist_add_head(&conn->c_hash_node, head); | 232 | hlist_add_head_rcu(&conn->c_hash_node, head); |
242 | rds_cong_add_conn(conn); | 233 | rds_cong_add_conn(conn); |
243 | rds_conn_count++; | 234 | rds_conn_count++; |
244 | } | 235 | } |
@@ -263,21 +254,91 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, | |||
263 | } | 254 | } |
264 | EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); | 255 | EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); |
265 | 256 | ||
257 | void rds_conn_shutdown(struct rds_connection *conn) | ||
258 | { | ||
259 | /* shut it down unless it's down already */ | ||
260 | if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { | ||
261 | /* | ||
262 | * Quiesce the connection mgmt handlers before we start tearing | ||
263 | * things down. We don't hold the mutex for the entire | ||
264 | * duration of the shutdown operation, else we may be | ||
265 | * deadlocking with the CM handler. Instead, the CM event | ||
266 | * handler is supposed to check for state DISCONNECTING | ||
267 | */ | ||
268 | mutex_lock(&conn->c_cm_lock); | ||
269 | if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) | ||
270 | && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { | ||
271 | rds_conn_error(conn, "shutdown called in state %d\n", | ||
272 | atomic_read(&conn->c_state)); | ||
273 | mutex_unlock(&conn->c_cm_lock); | ||
274 | return; | ||
275 | } | ||
276 | mutex_unlock(&conn->c_cm_lock); | ||
277 | |||
278 | wait_event(conn->c_waitq, | ||
279 | !test_bit(RDS_IN_XMIT, &conn->c_flags)); | ||
280 | |||
281 | conn->c_trans->conn_shutdown(conn); | ||
282 | rds_conn_reset(conn); | ||
283 | |||
284 | if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { | ||
285 | /* This can happen - eg when we're in the middle of tearing | ||
286 | * down the connection, and someone unloads the rds module. | ||
287 | * Quite reproducible with loopback connections. | ||
288 | * Mostly harmless. | ||
289 | */ | ||
290 | rds_conn_error(conn, | ||
291 | "%s: failed to transition to state DOWN, " | ||
292 | "current state is %d\n", | ||
293 | __func__, | ||
294 | atomic_read(&conn->c_state)); | ||
295 | return; | ||
296 | } | ||
297 | } | ||
298 | |||
299 | /* Then reconnect if it's still live. | ||
300 | * The passive side of an IB loopback connection is never added | ||
301 | * to the conn hash, so we never trigger a reconnect on this | ||
302 | * conn - the reconnect is always triggered by the active peer. */ | ||
303 | cancel_delayed_work_sync(&conn->c_conn_w); | ||
304 | rcu_read_lock(); | ||
305 | if (!hlist_unhashed(&conn->c_hash_node)) { | ||
306 | rcu_read_unlock(); | ||
307 | rds_queue_reconnect(conn); | ||
308 | } else { | ||
309 | rcu_read_unlock(); | ||
310 | } | ||
311 | } | ||
312 | |||
313 | /* | ||
314 | * Stop and free a connection. | ||
315 | * | ||
316 | * This can only be used in very limited circumstances. It assumes that once | ||
317 | * the conn has been shutdown that no one else is referencing the connection. | ||
318 | * We can only ensure this in the rmmod path in the current code. | ||
319 | */ | ||
266 | void rds_conn_destroy(struct rds_connection *conn) | 320 | void rds_conn_destroy(struct rds_connection *conn) |
267 | { | 321 | { |
268 | struct rds_message *rm, *rtmp; | 322 | struct rds_message *rm, *rtmp; |
323 | unsigned long flags; | ||
269 | 324 | ||
270 | rdsdebug("freeing conn %p for %pI4 -> " | 325 | rdsdebug("freeing conn %p for %pI4 -> " |
271 | "%pI4\n", conn, &conn->c_laddr, | 326 | "%pI4\n", conn, &conn->c_laddr, |
272 | &conn->c_faddr); | 327 | &conn->c_faddr); |
273 | 328 | ||
274 | hlist_del_init(&conn->c_hash_node); | 329 | /* Ensure conn will not be scheduled for reconnect */ |
330 | spin_lock_irq(&rds_conn_lock); | ||
331 | hlist_del_init_rcu(&conn->c_hash_node); | ||
332 | spin_unlock_irq(&rds_conn_lock); | ||
333 | synchronize_rcu(); | ||
275 | 334 | ||
276 | /* wait for the rds thread to shut it down */ | 335 | /* shut the connection down */ |
277 | atomic_set(&conn->c_state, RDS_CONN_ERROR); | 336 | rds_conn_drop(conn); |
278 | cancel_delayed_work(&conn->c_conn_w); | 337 | flush_work(&conn->c_down_w); |
279 | queue_work(rds_wq, &conn->c_down_w); | 338 | |
280 | flush_workqueue(rds_wq); | 339 | /* make sure lingering queued work won't try to ref the conn */ |
340 | cancel_delayed_work_sync(&conn->c_send_w); | ||
341 | cancel_delayed_work_sync(&conn->c_recv_w); | ||
281 | 342 | ||
282 | /* tear down queued messages */ | 343 | /* tear down queued messages */ |
283 | list_for_each_entry_safe(rm, rtmp, | 344 | list_for_each_entry_safe(rm, rtmp, |
@@ -302,7 +363,9 @@ void rds_conn_destroy(struct rds_connection *conn) | |||
302 | BUG_ON(!list_empty(&conn->c_retrans)); | 363 | BUG_ON(!list_empty(&conn->c_retrans)); |
303 | kmem_cache_free(rds_conn_slab, conn); | 364 | kmem_cache_free(rds_conn_slab, conn); |
304 | 365 | ||
366 | spin_lock_irqsave(&rds_conn_lock, flags); | ||
305 | rds_conn_count--; | 367 | rds_conn_count--; |
368 | spin_unlock_irqrestore(&rds_conn_lock, flags); | ||
306 | } | 369 | } |
307 | EXPORT_SYMBOL_GPL(rds_conn_destroy); | 370 | EXPORT_SYMBOL_GPL(rds_conn_destroy); |
308 | 371 | ||
@@ -316,23 +379,23 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, | |||
316 | struct list_head *list; | 379 | struct list_head *list; |
317 | struct rds_connection *conn; | 380 | struct rds_connection *conn; |
318 | struct rds_message *rm; | 381 | struct rds_message *rm; |
319 | unsigned long flags; | ||
320 | unsigned int total = 0; | 382 | unsigned int total = 0; |
383 | unsigned long flags; | ||
321 | size_t i; | 384 | size_t i; |
322 | 385 | ||
323 | len /= sizeof(struct rds_info_message); | 386 | len /= sizeof(struct rds_info_message); |
324 | 387 | ||
325 | spin_lock_irqsave(&rds_conn_lock, flags); | 388 | rcu_read_lock(); |
326 | 389 | ||
327 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); | 390 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); |
328 | i++, head++) { | 391 | i++, head++) { |
329 | hlist_for_each_entry(conn, pos, head, c_hash_node) { | 392 | hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { |
330 | if (want_send) | 393 | if (want_send) |
331 | list = &conn->c_send_queue; | 394 | list = &conn->c_send_queue; |
332 | else | 395 | else |
333 | list = &conn->c_retrans; | 396 | list = &conn->c_retrans; |
334 | 397 | ||
335 | spin_lock(&conn->c_lock); | 398 | spin_lock_irqsave(&conn->c_lock, flags); |
336 | 399 | ||
337 | /* XXX too lazy to maintain counts.. */ | 400 | /* XXX too lazy to maintain counts.. */ |
338 | list_for_each_entry(rm, list, m_conn_item) { | 401 | list_for_each_entry(rm, list, m_conn_item) { |
@@ -343,11 +406,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, | |||
343 | conn->c_faddr, 0); | 406 | conn->c_faddr, 0); |
344 | } | 407 | } |
345 | 408 | ||
346 | spin_unlock(&conn->c_lock); | 409 | spin_unlock_irqrestore(&conn->c_lock, flags); |
347 | } | 410 | } |
348 | } | 411 | } |
349 | 412 | rcu_read_unlock(); | |
350 | spin_unlock_irqrestore(&rds_conn_lock, flags); | ||
351 | 413 | ||
352 | lens->nr = total; | 414 | lens->nr = total; |
353 | lens->each = sizeof(struct rds_info_message); | 415 | lens->each = sizeof(struct rds_info_message); |
@@ -377,19 +439,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, | |||
377 | uint64_t buffer[(item_len + 7) / 8]; | 439 | uint64_t buffer[(item_len + 7) / 8]; |
378 | struct hlist_head *head; | 440 | struct hlist_head *head; |
379 | struct hlist_node *pos; | 441 | struct hlist_node *pos; |
380 | struct hlist_node *tmp; | ||
381 | struct rds_connection *conn; | 442 | struct rds_connection *conn; |
382 | unsigned long flags; | ||
383 | size_t i; | 443 | size_t i; |
384 | 444 | ||
385 | spin_lock_irqsave(&rds_conn_lock, flags); | 445 | rcu_read_lock(); |
386 | 446 | ||
387 | lens->nr = 0; | 447 | lens->nr = 0; |
388 | lens->each = item_len; | 448 | lens->each = item_len; |
389 | 449 | ||
390 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); | 450 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); |
391 | i++, head++) { | 451 | i++, head++) { |
392 | hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) { | 452 | hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { |
393 | 453 | ||
394 | /* XXX no c_lock usage.. */ | 454 | /* XXX no c_lock usage.. */ |
395 | if (!visitor(conn, buffer)) | 455 | if (!visitor(conn, buffer)) |
@@ -405,8 +465,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, | |||
405 | lens->nr++; | 465 | lens->nr++; |
406 | } | 466 | } |
407 | } | 467 | } |
408 | 468 | rcu_read_unlock(); | |
409 | spin_unlock_irqrestore(&rds_conn_lock, flags); | ||
410 | } | 469 | } |
411 | EXPORT_SYMBOL_GPL(rds_for_each_conn_info); | 470 | EXPORT_SYMBOL_GPL(rds_for_each_conn_info); |
412 | 471 | ||
@@ -423,8 +482,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn, | |||
423 | sizeof(cinfo->transport)); | 482 | sizeof(cinfo->transport)); |
424 | cinfo->flags = 0; | 483 | cinfo->flags = 0; |
425 | 484 | ||
426 | rds_conn_info_set(cinfo->flags, | 485 | rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags), |
427 | rds_conn_is_sending(conn), SENDING); | 486 | SENDING); |
428 | /* XXX Future: return the state rather than these funky bits */ | 487 | /* XXX Future: return the state rather than these funky bits */ |
429 | rds_conn_info_set(cinfo->flags, | 488 | rds_conn_info_set(cinfo->flags, |
430 | atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, | 489 | atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, |
@@ -444,12 +503,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len, | |||
444 | sizeof(struct rds_info_connection)); | 503 | sizeof(struct rds_info_connection)); |
445 | } | 504 | } |
446 | 505 | ||
447 | int __init rds_conn_init(void) | 506 | int rds_conn_init(void) |
448 | { | 507 | { |
449 | rds_conn_slab = kmem_cache_create("rds_connection", | 508 | rds_conn_slab = kmem_cache_create("rds_connection", |
450 | sizeof(struct rds_connection), | 509 | sizeof(struct rds_connection), |
451 | 0, 0, NULL); | 510 | 0, 0, NULL); |
452 | if (rds_conn_slab == NULL) | 511 | if (!rds_conn_slab) |
453 | return -ENOMEM; | 512 | return -ENOMEM; |
454 | 513 | ||
455 | rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); | 514 | rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); |
@@ -487,6 +546,18 @@ void rds_conn_drop(struct rds_connection *conn) | |||
487 | EXPORT_SYMBOL_GPL(rds_conn_drop); | 546 | EXPORT_SYMBOL_GPL(rds_conn_drop); |
488 | 547 | ||
489 | /* | 548 | /* |
549 | * If the connection is down, trigger a connect. We may have scheduled a | ||
550 | * delayed reconnect however - in this case we should not interfere. | ||
551 | */ | ||
552 | void rds_conn_connect_if_down(struct rds_connection *conn) | ||
553 | { | ||
554 | if (rds_conn_state(conn) == RDS_CONN_DOWN && | ||
555 | !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) | ||
556 | queue_delayed_work(rds_wq, &conn->c_conn_w, 0); | ||
557 | } | ||
558 | EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); | ||
559 | |||
560 | /* | ||
490 | * An error occurred on the connection | 561 | * An error occurred on the connection |
491 | */ | 562 | */ |
492 | void | 563 | void |
diff --git a/net/rds/ib.c b/net/rds/ib.c index 8f2d6dd7700..b12a3951167 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c | |||
@@ -53,12 +53,71 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); | |||
53 | module_param(rds_ib_retry_count, int, 0444); | 53 | module_param(rds_ib_retry_count, int, 0444); |
54 | MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); | 54 | MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); |
55 | 55 | ||
56 | /* | ||
57 | * we have a clumsy combination of RCU and a rwsem protecting this list | ||
58 | * because it is used both in the get_mr fast path and while blocking in | ||
59 | * the FMR flushing path. | ||
60 | */ | ||
61 | DECLARE_RWSEM(rds_ib_devices_lock); | ||
56 | struct list_head rds_ib_devices; | 62 | struct list_head rds_ib_devices; |
57 | 63 | ||
58 | /* NOTE: if also grabbing ibdev lock, grab this first */ | 64 | /* NOTE: if also grabbing ibdev lock, grab this first */ |
59 | DEFINE_SPINLOCK(ib_nodev_conns_lock); | 65 | DEFINE_SPINLOCK(ib_nodev_conns_lock); |
60 | LIST_HEAD(ib_nodev_conns); | 66 | LIST_HEAD(ib_nodev_conns); |
61 | 67 | ||
68 | void rds_ib_nodev_connect(void) | ||
69 | { | ||
70 | struct rds_ib_connection *ic; | ||
71 | |||
72 | spin_lock(&ib_nodev_conns_lock); | ||
73 | list_for_each_entry(ic, &ib_nodev_conns, ib_node) | ||
74 | rds_conn_connect_if_down(ic->conn); | ||
75 | spin_unlock(&ib_nodev_conns_lock); | ||
76 | } | ||
77 | |||
78 | void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) | ||
79 | { | ||
80 | struct rds_ib_connection *ic; | ||
81 | unsigned long flags; | ||
82 | |||
83 | spin_lock_irqsave(&rds_ibdev->spinlock, flags); | ||
84 | list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) | ||
85 | rds_conn_drop(ic->conn); | ||
86 | spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references | ||
92 | * from interrupt context so we push freeing off into a work struct in krdsd. | ||
92 | */ | ||
93 | static void rds_ib_dev_free(struct work_struct *work) | ||
94 | { | ||
95 | struct rds_ib_ipaddr *i_ipaddr, *i_next; | ||
96 | struct rds_ib_device *rds_ibdev = container_of(work, | ||
97 | struct rds_ib_device, free_work); | ||
98 | |||
99 | if (rds_ibdev->mr_pool) | ||
100 | rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); | ||
101 | if (rds_ibdev->mr) | ||
102 | ib_dereg_mr(rds_ibdev->mr); | ||
103 | if (rds_ibdev->pd) | ||
104 | ib_dealloc_pd(rds_ibdev->pd); | ||
105 | |||
106 | list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { | ||
107 | list_del(&i_ipaddr->list); | ||
108 | kfree(i_ipaddr); | ||
109 | } | ||
110 | |||
111 | kfree(rds_ibdev); | ||
112 | } | ||
113 | |||
114 | void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) | ||
115 | { | ||
116 | BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); | ||
117 | if (atomic_dec_and_test(&rds_ibdev->refcount)) | ||
118 | queue_work(rds_wq, &rds_ibdev->free_work); | ||
119 | } | ||
120 | |||
62 | void rds_ib_add_one(struct ib_device *device) | 121 | void rds_ib_add_one(struct ib_device *device) |
63 | { | 122 | { |
64 | struct rds_ib_device *rds_ibdev; | 123 | struct rds_ib_device *rds_ibdev; |
@@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device) | |||
77 | goto free_attr; | 136 | goto free_attr; |
78 | } | 137 | } |
79 | 138 | ||
80 | rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); | 139 | rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, |
140 | ibdev_to_node(device)); | ||
81 | if (!rds_ibdev) | 141 | if (!rds_ibdev) |
82 | goto free_attr; | 142 | goto free_attr; |
83 | 143 | ||
84 | spin_lock_init(&rds_ibdev->spinlock); | 144 | spin_lock_init(&rds_ibdev->spinlock); |
145 | atomic_set(&rds_ibdev->refcount, 1); | ||
146 | INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); | ||
85 | 147 | ||
86 | rds_ibdev->max_wrs = dev_attr->max_qp_wr; | 148 | rds_ibdev->max_wrs = dev_attr->max_qp_wr; |
87 | rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); | 149 | rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); |
@@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device) | |||
91 | min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : | 153 | min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : |
92 | fmr_pool_size; | 154 | fmr_pool_size; |
93 | 155 | ||
156 | rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; | ||
157 | rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; | ||
158 | |||
94 | rds_ibdev->dev = device; | 159 | rds_ibdev->dev = device; |
95 | rds_ibdev->pd = ib_alloc_pd(device); | 160 | rds_ibdev->pd = ib_alloc_pd(device); |
96 | if (IS_ERR(rds_ibdev->pd)) | 161 | if (IS_ERR(rds_ibdev->pd)) { |
97 | goto free_dev; | 162 | rds_ibdev->pd = NULL; |
163 | goto put_dev; | ||
164 | } | ||
98 | 165 | ||
99 | rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, | 166 | rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); |
100 | IB_ACCESS_LOCAL_WRITE); | 167 | if (IS_ERR(rds_ibdev->mr)) { |
101 | if (IS_ERR(rds_ibdev->mr)) | 168 | rds_ibdev->mr = NULL; |
102 | goto err_pd; | 169 | goto put_dev; |
170 | } | ||
103 | 171 | ||
104 | rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); | 172 | rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); |
105 | if (IS_ERR(rds_ibdev->mr_pool)) { | 173 | if (IS_ERR(rds_ibdev->mr_pool)) { |
106 | rds_ibdev->mr_pool = NULL; | 174 | rds_ibdev->mr_pool = NULL; |
107 | goto err_mr; | 175 | goto put_dev; |
108 | } | 176 | } |
109 | 177 | ||
110 | INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); | 178 | INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); |
111 | INIT_LIST_HEAD(&rds_ibdev->conn_list); | 179 | INIT_LIST_HEAD(&rds_ibdev->conn_list); |
112 | list_add_tail(&rds_ibdev->list, &rds_ib_devices); | 180 | |
181 | down_write(&rds_ib_devices_lock); | ||
182 | list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); | ||
183 | up_write(&rds_ib_devices_lock); | ||
184 | atomic_inc(&rds_ibdev->refcount); | ||
113 | 185 | ||
114 | ib_set_client_data(device, &rds_ib_client, rds_ibdev); | 186 | ib_set_client_data(device, &rds_ib_client, rds_ibdev); |
187 | atomic_inc(&rds_ibdev->refcount); | ||
115 | 188 | ||
116 | goto free_attr; | 189 | rds_ib_nodev_connect(); |
117 | 190 | ||
118 | err_mr: | 191 | put_dev: |
119 | ib_dereg_mr(rds_ibdev->mr); | 192 | rds_ib_dev_put(rds_ibdev); |
120 | err_pd: | ||
121 | ib_dealloc_pd(rds_ibdev->pd); | ||
122 | free_dev: | ||
123 | kfree(rds_ibdev); | ||
124 | free_attr: | 193 | free_attr: |
125 | kfree(dev_attr); | 194 | kfree(dev_attr); |
126 | } | 195 | } |
127 | 196 | ||
197 | /* | ||
198 | * New connections use this to find the device to associate with the | ||
199 | * connection. It's not in the fast path so we're not concerned about the | ||
200 | * performance of the IB call. (As of this writing, it uses an interrupt | ||
201 | * blocking spinlock to serialize walking a per-device list of all registered | ||
202 | * clients.) | ||
203 | * | ||
204 | * RCU is used to handle incoming connections racing with device teardown. | ||
205 | * Rather than use a lock to serialize removal from the client_data and | ||
206 | * getting a new reference, we use an RCU grace period. The destruction | ||
207 | * path removes the device from client_data and then waits for all RCU | ||
208 | * readers to finish. | ||
209 | * | ||
210 | * A new connection can get NULL from this if it's arriving on a | ||
211 | * device that is in the process of being removed. | ||
212 | */ | ||
213 | struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) | ||
214 | { | ||
215 | struct rds_ib_device *rds_ibdev; | ||
216 | |||
217 | rcu_read_lock(); | ||
218 | rds_ibdev = ib_get_client_data(device, &rds_ib_client); | ||
219 | if (rds_ibdev) | ||
220 | atomic_inc(&rds_ibdev->refcount); | ||
221 | rcu_read_unlock(); | ||
222 | return rds_ibdev; | ||
223 | } | ||
224 | |||
225 | /* | ||
226 | * The IB stack is letting us know that a device is going away. This can | ||
227 | * happen if the underlying HCA driver is removed or if PCI hotplug is removing | ||
228 | * the pci function, for example. | ||
229 | * | ||
230 | * This can be called at any time and can be racing with any other RDS path. | ||
231 | */ | ||
128 | void rds_ib_remove_one(struct ib_device *device) | 232 | void rds_ib_remove_one(struct ib_device *device) |
129 | { | 233 | { |
130 | struct rds_ib_device *rds_ibdev; | 234 | struct rds_ib_device *rds_ibdev; |
131 | struct rds_ib_ipaddr *i_ipaddr, *i_next; | ||
132 | 235 | ||
133 | rds_ibdev = ib_get_client_data(device, &rds_ib_client); | 236 | rds_ibdev = ib_get_client_data(device, &rds_ib_client); |
134 | if (!rds_ibdev) | 237 | if (!rds_ibdev) |
135 | return; | 238 | return; |
136 | 239 | ||
137 | list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { | 240 | rds_ib_dev_shutdown(rds_ibdev); |
138 | list_del(&i_ipaddr->list); | ||
139 | kfree(i_ipaddr); | ||
140 | } | ||
141 | 241 | ||
142 | rds_ib_destroy_conns(rds_ibdev); | 242 | /* stop connection attempts from getting a reference to this device. */ |
243 | ib_set_client_data(device, &rds_ib_client, NULL); | ||
143 | 244 | ||
144 | if (rds_ibdev->mr_pool) | 245 | down_write(&rds_ib_devices_lock); |
145 | rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); | 246 | list_del_rcu(&rds_ibdev->list); |
146 | 247 | up_write(&rds_ib_devices_lock); | |
147 | ib_dereg_mr(rds_ibdev->mr); | ||
148 | |||
149 | while (ib_dealloc_pd(rds_ibdev->pd)) { | ||
150 | rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); | ||
151 | msleep(1); | ||
152 | } | ||
153 | 248 | ||
154 | list_del(&rds_ibdev->list); | 249 | /* |
155 | kfree(rds_ibdev); | 250 | * This synchronize rcu is waiting for readers of both the ib |
251 | * client data and the devices list to finish before we drop | ||
252 | * both of those references. | ||
253 | */ | ||
254 | synchronize_rcu(); | ||
255 | rds_ib_dev_put(rds_ibdev); | ||
256 | rds_ib_dev_put(rds_ibdev); | ||
156 | } | 257 | } |
157 | 258 | ||
158 | struct ib_client rds_ib_client = { | 259 | struct ib_client rds_ib_client = { |
@@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, | |||
186 | rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); | 287 | rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); |
187 | rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); | 288 | rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); |
188 | 289 | ||
189 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | 290 | rds_ibdev = ic->rds_ibdev; |
190 | iinfo->max_send_wr = ic->i_send_ring.w_nr; | 291 | iinfo->max_send_wr = ic->i_send_ring.w_nr; |
191 | iinfo->max_recv_wr = ic->i_recv_ring.w_nr; | 292 | iinfo->max_recv_wr = ic->i_recv_ring.w_nr; |
192 | iinfo->max_send_sge = rds_ibdev->max_sge; | 293 | iinfo->max_send_sge = rds_ibdev->max_sge; |
@@ -248,29 +349,36 @@ static int rds_ib_laddr_check(__be32 addr) | |||
248 | return ret; | 349 | return ret; |
249 | } | 350 | } |
250 | 351 | ||
352 | static void rds_ib_unregister_client(void) | ||
353 | { | ||
354 | ib_unregister_client(&rds_ib_client); | ||
355 | /* wait for rds_ib_dev_free() to complete */ | ||
356 | flush_workqueue(rds_wq); | ||
357 | } | ||
358 | |||
251 | void rds_ib_exit(void) | 359 | void rds_ib_exit(void) |
252 | { | 360 | { |
253 | rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); | 361 | rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); |
362 | rds_ib_unregister_client(); | ||
254 | rds_ib_destroy_nodev_conns(); | 363 | rds_ib_destroy_nodev_conns(); |
255 | ib_unregister_client(&rds_ib_client); | ||
256 | rds_ib_sysctl_exit(); | 364 | rds_ib_sysctl_exit(); |
257 | rds_ib_recv_exit(); | 365 | rds_ib_recv_exit(); |
258 | rds_trans_unregister(&rds_ib_transport); | 366 | rds_trans_unregister(&rds_ib_transport); |
367 | rds_ib_fmr_exit(); | ||
259 | } | 368 | } |
260 | 369 | ||
261 | struct rds_transport rds_ib_transport = { | 370 | struct rds_transport rds_ib_transport = { |
262 | .laddr_check = rds_ib_laddr_check, | 371 | .laddr_check = rds_ib_laddr_check, |
263 | .xmit_complete = rds_ib_xmit_complete, | 372 | .xmit_complete = rds_ib_xmit_complete, |
264 | .xmit = rds_ib_xmit, | 373 | .xmit = rds_ib_xmit, |
265 | .xmit_cong_map = NULL, | ||
266 | .xmit_rdma = rds_ib_xmit_rdma, | 374 | .xmit_rdma = rds_ib_xmit_rdma, |
375 | .xmit_atomic = rds_ib_xmit_atomic, | ||
267 | .recv = rds_ib_recv, | 376 | .recv = rds_ib_recv, |
268 | .conn_alloc = rds_ib_conn_alloc, | 377 | .conn_alloc = rds_ib_conn_alloc, |
269 | .conn_free = rds_ib_conn_free, | 378 | .conn_free = rds_ib_conn_free, |
270 | .conn_connect = rds_ib_conn_connect, | 379 | .conn_connect = rds_ib_conn_connect, |
271 | .conn_shutdown = rds_ib_conn_shutdown, | 380 | .conn_shutdown = rds_ib_conn_shutdown, |
272 | .inc_copy_to_user = rds_ib_inc_copy_to_user, | 381 | .inc_copy_to_user = rds_ib_inc_copy_to_user, |
273 | .inc_purge = rds_ib_inc_purge, | ||
274 | .inc_free = rds_ib_inc_free, | 382 | .inc_free = rds_ib_inc_free, |
275 | .cm_initiate_connect = rds_ib_cm_initiate_connect, | 383 | .cm_initiate_connect = rds_ib_cm_initiate_connect, |
276 | .cm_handle_connect = rds_ib_cm_handle_connect, | 384 | .cm_handle_connect = rds_ib_cm_handle_connect, |
@@ -286,16 +394,20 @@ struct rds_transport rds_ib_transport = { | |||
286 | .t_type = RDS_TRANS_IB | 394 | .t_type = RDS_TRANS_IB |
287 | }; | 395 | }; |
288 | 396 | ||
289 | int __init rds_ib_init(void) | 397 | int rds_ib_init(void) |
290 | { | 398 | { |
291 | int ret; | 399 | int ret; |
292 | 400 | ||
293 | INIT_LIST_HEAD(&rds_ib_devices); | 401 | INIT_LIST_HEAD(&rds_ib_devices); |
294 | 402 | ||
295 | ret = ib_register_client(&rds_ib_client); | 403 | ret = rds_ib_fmr_init(); |
296 | if (ret) | 404 | if (ret) |
297 | goto out; | 405 | goto out; |
298 | 406 | ||
407 | ret = ib_register_client(&rds_ib_client); | ||
408 | if (ret) | ||
409 | goto out_fmr_exit; | ||
410 | |||
299 | ret = rds_ib_sysctl_init(); | 411 | ret = rds_ib_sysctl_init(); |
300 | if (ret) | 412 | if (ret) |
301 | goto out_ibreg; | 413 | goto out_ibreg; |
@@ -317,7 +429,9 @@ out_recv: | |||
317 | out_sysctl: | 429 | out_sysctl: |
318 | rds_ib_sysctl_exit(); | 430 | rds_ib_sysctl_exit(); |
319 | out_ibreg: | 431 | out_ibreg: |
320 | ib_unregister_client(&rds_ib_client); | 432 | rds_ib_unregister_client(); |
433 | out_fmr_exit: | ||
434 | rds_ib_fmr_exit(); | ||
321 | out: | 435 | out: |
322 | return ret; | 436 | return ret; |
323 | } | 437 | } |
diff --git a/net/rds/ib.h b/net/rds/ib.h index 64df4e79b29..7ad3d57e06a 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h | |||
@@ -3,11 +3,13 @@ | |||
3 | 3 | ||
4 | #include <rdma/ib_verbs.h> | 4 | #include <rdma/ib_verbs.h> |
5 | #include <rdma/rdma_cm.h> | 5 | #include <rdma/rdma_cm.h> |
6 | #include <linux/pci.h> | ||
7 | #include <linux/slab.h> | ||
6 | #include "rds.h" | 8 | #include "rds.h" |
7 | #include "rdma_transport.h" | 9 | #include "rdma_transport.h" |
8 | 10 | ||
9 | #define RDS_FMR_SIZE 256 | 11 | #define RDS_FMR_SIZE 256 |
10 | #define RDS_FMR_POOL_SIZE 4096 | 12 | #define RDS_FMR_POOL_SIZE 8192 |
11 | 13 | ||
12 | #define RDS_IB_MAX_SGE 8 | 14 | #define RDS_IB_MAX_SGE 8 |
13 | #define RDS_IB_RECV_SGE 2 | 15 | #define RDS_IB_RECV_SGE 2 |
@@ -19,6 +21,9 @@ | |||
19 | 21 | ||
20 | #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ | 22 | #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ |
21 | 23 | ||
24 | #define RDS_IB_RECYCLE_BATCH_COUNT 32 | ||
25 | |||
26 | extern struct rw_semaphore rds_ib_devices_lock; | ||
22 | extern struct list_head rds_ib_devices; | 27 | extern struct list_head rds_ib_devices; |
23 | 28 | ||
24 | /* | 29 | /* |
@@ -26,20 +31,29 @@ extern struct list_head rds_ib_devices; | |||
26 | * try and minimize the amount of memory tied up both the device and | 31 | * try and minimize the amount of memory tied up both the device and |
27 | * socket receive queues. | 32 | * socket receive queues. |
28 | */ | 33 | */ |
29 | /* page offset of the final full frag that fits in the page */ | ||
30 | #define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) | ||
31 | struct rds_page_frag { | 34 | struct rds_page_frag { |
32 | struct list_head f_item; | 35 | struct list_head f_item; |
33 | struct page *f_page; | 36 | struct list_head f_cache_entry; |
34 | unsigned long f_offset; | 37 | struct scatterlist f_sg; |
35 | dma_addr_t f_mapped; | ||
36 | }; | 38 | }; |
37 | 39 | ||
38 | struct rds_ib_incoming { | 40 | struct rds_ib_incoming { |
39 | struct list_head ii_frags; | 41 | struct list_head ii_frags; |
42 | struct list_head ii_cache_entry; | ||
40 | struct rds_incoming ii_inc; | 43 | struct rds_incoming ii_inc; |
41 | }; | 44 | }; |
42 | 45 | ||
46 | struct rds_ib_cache_head { | ||
47 | struct list_head *first; | ||
48 | unsigned long count; | ||
49 | }; | ||
50 | |||
51 | struct rds_ib_refill_cache { | ||
52 | struct rds_ib_cache_head *percpu; | ||
53 | struct list_head *xfer; | ||
54 | struct list_head *ready; | ||
55 | }; | ||
56 | |||
43 | struct rds_ib_connect_private { | 57 | struct rds_ib_connect_private { |
44 | /* Add new fields at the end, and don't permute existing fields. */ | 58 | /* Add new fields at the end, and don't permute existing fields. */ |
45 | __be32 dp_saddr; | 59 | __be32 dp_saddr; |
@@ -53,8 +67,7 @@ struct rds_ib_connect_private { | |||
53 | }; | 67 | }; |
54 | 68 | ||
55 | struct rds_ib_send_work { | 69 | struct rds_ib_send_work { |
56 | struct rds_message *s_rm; | 70 | void *s_op; |
57 | struct rds_rdma_op *s_op; | ||
58 | struct ib_send_wr s_wr; | 71 | struct ib_send_wr s_wr; |
59 | struct ib_sge s_sge[RDS_IB_MAX_SGE]; | 72 | struct ib_sge s_sge[RDS_IB_MAX_SGE]; |
60 | unsigned long s_queued; | 73 | unsigned long s_queued; |
@@ -92,10 +105,11 @@ struct rds_ib_connection { | |||
92 | 105 | ||
93 | /* tx */ | 106 | /* tx */ |
94 | struct rds_ib_work_ring i_send_ring; | 107 | struct rds_ib_work_ring i_send_ring; |
95 | struct rds_message *i_rm; | 108 | struct rm_data_op *i_data_op; |
96 | struct rds_header *i_send_hdrs; | 109 | struct rds_header *i_send_hdrs; |
97 | u64 i_send_hdrs_dma; | 110 | u64 i_send_hdrs_dma; |
98 | struct rds_ib_send_work *i_sends; | 111 | struct rds_ib_send_work *i_sends; |
112 | atomic_t i_signaled_sends; | ||
99 | 113 | ||
100 | /* rx */ | 114 | /* rx */ |
101 | struct tasklet_struct i_recv_tasklet; | 115 | struct tasklet_struct i_recv_tasklet; |
@@ -106,8 +120,9 @@ struct rds_ib_connection { | |||
106 | struct rds_header *i_recv_hdrs; | 120 | struct rds_header *i_recv_hdrs; |
107 | u64 i_recv_hdrs_dma; | 121 | u64 i_recv_hdrs_dma; |
108 | struct rds_ib_recv_work *i_recvs; | 122 | struct rds_ib_recv_work *i_recvs; |
109 | struct rds_page_frag i_frag; | ||
110 | u64 i_ack_recv; /* last ACK received */ | 123 | u64 i_ack_recv; /* last ACK received */ |
124 | struct rds_ib_refill_cache i_cache_incs; | ||
125 | struct rds_ib_refill_cache i_cache_frags; | ||
111 | 126 | ||
112 | /* sending acks */ | 127 | /* sending acks */ |
113 | unsigned long i_ack_flags; | 128 | unsigned long i_ack_flags; |
@@ -138,7 +153,6 @@ struct rds_ib_connection { | |||
138 | 153 | ||
139 | /* Batched completions */ | 154 | /* Batched completions */ |
140 | unsigned int i_unsignaled_wrs; | 155 | unsigned int i_unsignaled_wrs; |
141 | long i_unsignaled_bytes; | ||
142 | }; | 156 | }; |
143 | 157 | ||
144 | /* This assumes that atomic_t is at least 32 bits */ | 158 | /* This assumes that atomic_t is at least 32 bits */ |
@@ -164,9 +178,17 @@ struct rds_ib_device { | |||
164 | unsigned int max_fmrs; | 178 | unsigned int max_fmrs; |
165 | int max_sge; | 179 | int max_sge; |
166 | unsigned int max_wrs; | 180 | unsigned int max_wrs; |
181 | unsigned int max_initiator_depth; | ||
182 | unsigned int max_responder_resources; | ||
167 | spinlock_t spinlock; /* protect the above */ | 183 | spinlock_t spinlock; /* protect the above */ |
184 | atomic_t refcount; | ||
185 | struct work_struct free_work; | ||
168 | }; | 186 | }; |
169 | 187 | ||
188 | #define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus) | ||
189 | #define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device)) | ||
190 | #define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) | ||
191 | |||
170 | /* bits for i_ack_flags */ | 192 | /* bits for i_ack_flags */ |
171 | #define IB_ACK_IN_FLIGHT 0 | 193 | #define IB_ACK_IN_FLIGHT 0 |
172 | #define IB_ACK_REQUESTED 1 | 194 | #define IB_ACK_REQUESTED 1 |
@@ -202,6 +224,8 @@ struct rds_ib_statistics { | |||
202 | uint64_t s_ib_rdma_mr_pool_flush; | 224 | uint64_t s_ib_rdma_mr_pool_flush; |
203 | uint64_t s_ib_rdma_mr_pool_wait; | 225 | uint64_t s_ib_rdma_mr_pool_wait; |
204 | uint64_t s_ib_rdma_mr_pool_depleted; | 226 | uint64_t s_ib_rdma_mr_pool_depleted; |
227 | uint64_t s_ib_atomic_cswp; | ||
228 | uint64_t s_ib_atomic_fadd; | ||
205 | }; | 229 | }; |
206 | 230 | ||
207 | extern struct workqueue_struct *rds_ib_wq; | 231 | extern struct workqueue_struct *rds_ib_wq; |
@@ -243,6 +267,8 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, | |||
243 | extern struct rds_transport rds_ib_transport; | 267 | extern struct rds_transport rds_ib_transport; |
244 | extern void rds_ib_add_one(struct ib_device *device); | 268 | extern void rds_ib_add_one(struct ib_device *device); |
245 | extern void rds_ib_remove_one(struct ib_device *device); | 269 | extern void rds_ib_remove_one(struct ib_device *device); |
270 | struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); | ||
271 | void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); | ||
246 | extern struct ib_client rds_ib_client; | 272 | extern struct ib_client rds_ib_client; |
247 | 273 | ||
248 | extern unsigned int fmr_pool_size; | 274 | extern unsigned int fmr_pool_size; |
@@ -258,7 +284,7 @@ void rds_ib_conn_free(void *arg); | |||
258 | int rds_ib_conn_connect(struct rds_connection *conn); | 284 | int rds_ib_conn_connect(struct rds_connection *conn); |
259 | void rds_ib_conn_shutdown(struct rds_connection *conn); | 285 | void rds_ib_conn_shutdown(struct rds_connection *conn); |
260 | void rds_ib_state_change(struct sock *sk); | 286 | void rds_ib_state_change(struct sock *sk); |
261 | int __init rds_ib_listen_init(void); | 287 | int rds_ib_listen_init(void); |
262 | void rds_ib_listen_stop(void); | 288 | void rds_ib_listen_stop(void); |
263 | void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); | 289 | void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); |
264 | int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | 290 | int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, |
@@ -275,15 +301,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, | |||
275 | int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); | 301 | int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); |
276 | void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); | 302 | void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); |
277 | void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); | 303 | void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); |
278 | void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock); | 304 | void rds_ib_destroy_nodev_conns(void); |
279 | static inline void rds_ib_destroy_nodev_conns(void) | ||
280 | { | ||
281 | __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock); | ||
282 | } | ||
283 | static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev) | ||
284 | { | ||
285 | __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock); | ||
286 | } | ||
287 | struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); | 305 | struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); |
288 | void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); | 306 | void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); |
289 | void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); | 307 | void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); |
@@ -292,14 +310,16 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | |||
292 | void rds_ib_sync_mr(void *trans_private, int dir); | 310 | void rds_ib_sync_mr(void *trans_private, int dir); |
293 | void rds_ib_free_mr(void *trans_private, int invalidate); | 311 | void rds_ib_free_mr(void *trans_private, int invalidate); |
294 | void rds_ib_flush_mrs(void); | 312 | void rds_ib_flush_mrs(void); |
313 | int rds_ib_fmr_init(void); | ||
314 | void rds_ib_fmr_exit(void); | ||
295 | 315 | ||
296 | /* ib_recv.c */ | 316 | /* ib_recv.c */ |
297 | int __init rds_ib_recv_init(void); | 317 | int rds_ib_recv_init(void); |
298 | void rds_ib_recv_exit(void); | 318 | void rds_ib_recv_exit(void); |
299 | int rds_ib_recv(struct rds_connection *conn); | 319 | int rds_ib_recv(struct rds_connection *conn); |
300 | int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | 320 | int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); |
301 | gfp_t page_gfp, int prefill); | 321 | void rds_ib_recv_free_caches(struct rds_ib_connection *ic); |
302 | void rds_ib_inc_purge(struct rds_incoming *inc); | 322 | void rds_ib_recv_refill(struct rds_connection *conn, int prefill); |
303 | void rds_ib_inc_free(struct rds_incoming *inc); | 323 | void rds_ib_inc_free(struct rds_incoming *inc); |
304 | int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, | 324 | int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, |
305 | size_t size); | 325 | size_t size); |
@@ -325,17 +345,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); | |||
325 | extern wait_queue_head_t rds_ib_ring_empty_wait; | 345 | extern wait_queue_head_t rds_ib_ring_empty_wait; |
326 | 346 | ||
327 | /* ib_send.c */ | 347 | /* ib_send.c */ |
348 | char *rds_ib_wc_status_str(enum ib_wc_status status); | ||
328 | void rds_ib_xmit_complete(struct rds_connection *conn); | 349 | void rds_ib_xmit_complete(struct rds_connection *conn); |
329 | int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | 350 | int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, |
330 | unsigned int hdr_off, unsigned int sg, unsigned int off); | 351 | unsigned int hdr_off, unsigned int sg, unsigned int off); |
331 | void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); | 352 | void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); |
332 | void rds_ib_send_init_ring(struct rds_ib_connection *ic); | 353 | void rds_ib_send_init_ring(struct rds_ib_connection *ic); |
333 | void rds_ib_send_clear_ring(struct rds_ib_connection *ic); | 354 | void rds_ib_send_clear_ring(struct rds_ib_connection *ic); |
334 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); | 355 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); |
335 | void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); | 356 | void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); |
336 | void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); | 357 | void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); |
337 | int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, | 358 | int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, |
338 | u32 *adv_credits, int need_posted, int max_posted); | 359 | u32 *adv_credits, int need_posted, int max_posted); |
360 | int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); | ||
339 | 361 | ||
340 | /* ib_stats.c */ | 362 | /* ib_stats.c */ |
341 | DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); | 363 | DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); |
@@ -344,7 +366,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, | |||
344 | unsigned int avail); | 366 | unsigned int avail); |
345 | 367 | ||
346 | /* ib_sysctl.c */ | 368 | /* ib_sysctl.c */ |
347 | int __init rds_ib_sysctl_init(void); | 369 | int rds_ib_sysctl_init(void); |
348 | void rds_ib_sysctl_exit(void); | 370 | void rds_ib_sysctl_exit(void); |
349 | extern unsigned long rds_ib_sysctl_max_send_wr; | 371 | extern unsigned long rds_ib_sysctl_max_send_wr; |
350 | extern unsigned long rds_ib_sysctl_max_recv_wr; | 372 | extern unsigned long rds_ib_sysctl_max_recv_wr; |
@@ -354,28 +376,4 @@ extern unsigned long rds_ib_sysctl_max_recv_allocation; | |||
354 | extern unsigned int rds_ib_sysctl_flow_control; | 376 | extern unsigned int rds_ib_sysctl_flow_control; |
355 | extern ctl_table rds_ib_sysctl_table[]; | 377 | extern ctl_table rds_ib_sysctl_table[]; |
356 | 378 | ||
357 | /* | ||
358 | * Helper functions for getting/setting the header and data SGEs in | ||
359 | * RDS packets (not RDMA) | ||
360 | * | ||
361 | * From version 3.1 onwards, header is in front of data in the sge. | ||
362 | */ | ||
363 | static inline struct ib_sge * | ||
364 | rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) | ||
365 | { | ||
366 | if (ic->conn->c_version > RDS_PROTOCOL_3_0) | ||
367 | return &sge[0]; | ||
368 | else | ||
369 | return &sge[1]; | ||
370 | } | ||
371 | |||
372 | static inline struct ib_sge * | ||
373 | rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) | ||
374 | { | ||
375 | if (ic->conn->c_version > RDS_PROTOCOL_3_0) | ||
376 | return &sge[1]; | ||
377 | else | ||
378 | return &sge[0]; | ||
379 | } | ||
380 | |||
381 | #endif | 379 | #endif |
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index f68832798db..ee369d201a6 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c | |||
@@ -38,6 +38,36 @@ | |||
38 | #include "rds.h" | 38 | #include "rds.h" |
39 | #include "ib.h" | 39 | #include "ib.h" |
40 | 40 | ||
41 | static char *rds_ib_event_type_strings[] = { | ||
42 | #define RDS_IB_EVENT_STRING(foo) \ | ||
43 | [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo) | ||
44 | RDS_IB_EVENT_STRING(CQ_ERR), | ||
45 | RDS_IB_EVENT_STRING(QP_FATAL), | ||
46 | RDS_IB_EVENT_STRING(QP_REQ_ERR), | ||
47 | RDS_IB_EVENT_STRING(QP_ACCESS_ERR), | ||
48 | RDS_IB_EVENT_STRING(COMM_EST), | ||
49 | RDS_IB_EVENT_STRING(SQ_DRAINED), | ||
50 | RDS_IB_EVENT_STRING(PATH_MIG), | ||
51 | RDS_IB_EVENT_STRING(PATH_MIG_ERR), | ||
52 | RDS_IB_EVENT_STRING(DEVICE_FATAL), | ||
53 | RDS_IB_EVENT_STRING(PORT_ACTIVE), | ||
54 | RDS_IB_EVENT_STRING(PORT_ERR), | ||
55 | RDS_IB_EVENT_STRING(LID_CHANGE), | ||
56 | RDS_IB_EVENT_STRING(PKEY_CHANGE), | ||
57 | RDS_IB_EVENT_STRING(SM_CHANGE), | ||
58 | RDS_IB_EVENT_STRING(SRQ_ERR), | ||
59 | RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED), | ||
60 | RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED), | ||
61 | RDS_IB_EVENT_STRING(CLIENT_REREGISTER), | ||
62 | #undef RDS_IB_EVENT_STRING | ||
63 | }; | ||
64 | |||
65 | static char *rds_ib_event_str(enum ib_event_type type) | ||
66 | { | ||
67 | return rds_str_array(rds_ib_event_type_strings, | ||
68 | ARRAY_SIZE(rds_ib_event_type_strings), type); | ||
69 | }; | ||
70 | |||
41 | /* | 71 | /* |
42 | * Set the selected protocol version | 72 | * Set the selected protocol version |
43 | */ | 73 | */ |
@@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even | |||
95 | { | 125 | { |
96 | const struct rds_ib_connect_private *dp = NULL; | 126 | const struct rds_ib_connect_private *dp = NULL; |
97 | struct rds_ib_connection *ic = conn->c_transport_data; | 127 | struct rds_ib_connection *ic = conn->c_transport_data; |
98 | struct rds_ib_device *rds_ibdev; | ||
99 | struct ib_qp_attr qp_attr; | 128 | struct ib_qp_attr qp_attr; |
100 | int err; | 129 | int err; |
101 | 130 | ||
@@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even | |||
111 | } | 140 | } |
112 | } | 141 | } |
113 | 142 | ||
114 | printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", | 143 | if (conn->c_version < RDS_PROTOCOL(3,1)) { |
115 | &conn->c_faddr, | 144 | printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," |
116 | RDS_PROTOCOL_MAJOR(conn->c_version), | 145 | " no longer supported\n", |
117 | RDS_PROTOCOL_MINOR(conn->c_version), | 146 | &conn->c_faddr, |
118 | ic->i_flowctl ? ", flow control" : ""); | 147 | RDS_PROTOCOL_MAJOR(conn->c_version), |
148 | RDS_PROTOCOL_MINOR(conn->c_version)); | ||
149 | rds_conn_destroy(conn); | ||
150 | return; | ||
151 | } else { | ||
152 | printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", | ||
153 | &conn->c_faddr, | ||
154 | RDS_PROTOCOL_MAJOR(conn->c_version), | ||
155 | RDS_PROTOCOL_MINOR(conn->c_version), | ||
156 | ic->i_flowctl ? ", flow control" : ""); | ||
157 | } | ||
119 | 158 | ||
120 | /* | 159 | /* |
121 | * Init rings and fill recv. this needs to wait until protocol negotiation | 160 | * Init rings and fill recv. this needs to wait until protocol negotiation |
@@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even | |||
125 | rds_ib_recv_init_ring(ic); | 164 | rds_ib_recv_init_ring(ic); |
126 | /* Post receive buffers - as a side effect, this will update | 165 | /* Post receive buffers - as a side effect, this will update |
127 | * the posted credit count. */ | 166 | * the posted credit count. */ |
128 | rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); | 167 | rds_ib_recv_refill(conn, 1); |
129 | 168 | ||
130 | /* Tune RNR behavior */ | 169 | /* Tune RNR behavior */ |
131 | rds_ib_tune_rnr(ic, &qp_attr); | 170 | rds_ib_tune_rnr(ic, &qp_attr); |
@@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even | |||
135 | if (err) | 174 | if (err) |
136 | printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); | 175 | printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); |
137 | 176 | ||
138 | /* update ib_device with this local ipaddr & conn */ | 177 | /* update ib_device with this local ipaddr */ |
139 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | 178 | err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); |
140 | err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr); | ||
141 | if (err) | 179 | if (err) |
142 | printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); | 180 | printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", |
143 | rds_ib_add_conn(rds_ibdev, conn); | 181 | err); |
144 | 182 | ||
145 | /* If the peer gave us the last packet it saw, process this as if | 183 | /* If the peer gave us the last packet it saw, process this as if |
146 | * we had received a regular ACK. */ | 184 | * we had received a regular ACK. */ |
@@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even | |||
153 | static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, | 191 | static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, |
154 | struct rdma_conn_param *conn_param, | 192 | struct rdma_conn_param *conn_param, |
155 | struct rds_ib_connect_private *dp, | 193 | struct rds_ib_connect_private *dp, |
156 | u32 protocol_version) | 194 | u32 protocol_version, |
195 | u32 max_responder_resources, | ||
196 | u32 max_initiator_depth) | ||
157 | { | 197 | { |
198 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
199 | struct rds_ib_device *rds_ibdev = ic->rds_ibdev; | ||
200 | |||
158 | memset(conn_param, 0, sizeof(struct rdma_conn_param)); | 201 | memset(conn_param, 0, sizeof(struct rdma_conn_param)); |
159 | /* XXX tune these? */ | 202 | |
160 | conn_param->responder_resources = 1; | 203 | conn_param->responder_resources = |
161 | conn_param->initiator_depth = 1; | 204 | min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources); |
205 | conn_param->initiator_depth = | ||
206 | min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth); | ||
162 | conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); | 207 | conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); |
163 | conn_param->rnr_retry_count = 7; | 208 | conn_param->rnr_retry_count = 7; |
164 | 209 | ||
165 | if (dp) { | 210 | if (dp) { |
166 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
167 | |||
168 | memset(dp, 0, sizeof(*dp)); | 211 | memset(dp, 0, sizeof(*dp)); |
169 | dp->dp_saddr = conn->c_laddr; | 212 | dp->dp_saddr = conn->c_laddr; |
170 | dp->dp_daddr = conn->c_faddr; | 213 | dp->dp_daddr = conn->c_faddr; |
@@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, | |||
189 | 232 | ||
190 | static void rds_ib_cq_event_handler(struct ib_event *event, void *data) | 233 | static void rds_ib_cq_event_handler(struct ib_event *event, void *data) |
191 | { | 234 | { |
192 | rdsdebug("event %u data %p\n", event->event, data); | 235 | rdsdebug("event %u (%s) data %p\n", |
236 | event->event, rds_ib_event_str(event->event), data); | ||
193 | } | 237 | } |
194 | 238 | ||
195 | static void rds_ib_qp_event_handler(struct ib_event *event, void *data) | 239 | static void rds_ib_qp_event_handler(struct ib_event *event, void *data) |
@@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) | |||
197 | struct rds_connection *conn = data; | 241 | struct rds_connection *conn = data; |
198 | struct rds_ib_connection *ic = conn->c_transport_data; | 242 | struct rds_ib_connection *ic = conn->c_transport_data; |
199 | 243 | ||
200 | rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); | 244 | rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event, |
245 | rds_ib_event_str(event->event)); | ||
201 | 246 | ||
202 | switch (event->event) { | 247 | switch (event->event) { |
203 | case IB_EVENT_COMM_EST: | 248 | case IB_EVENT_COMM_EST: |
204 | rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); | 249 | rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); |
205 | break; | 250 | break; |
206 | default: | 251 | default: |
207 | rdsdebug("Fatal QP Event %u " | 252 | rdsdebug("Fatal QP Event %u (%s) " |
208 | "- connection %pI4->%pI4, reconnecting\n", | 253 | "- connection %pI4->%pI4, reconnecting\n", |
209 | event->event, &conn->c_laddr, &conn->c_faddr); | 254 | event->event, rds_ib_event_str(event->event), |
255 | &conn->c_laddr, &conn->c_faddr); | ||
210 | rds_conn_drop(conn); | 256 | rds_conn_drop(conn); |
211 | break; | 257 | break; |
212 | } | 258 | } |
@@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn) | |||
224 | struct rds_ib_device *rds_ibdev; | 270 | struct rds_ib_device *rds_ibdev; |
225 | int ret; | 271 | int ret; |
226 | 272 | ||
227 | /* rds_ib_add_one creates a rds_ib_device object per IB device, | 273 | /* |
228 | * and allocates a protection domain, memory range and FMR pool | 274 | * It's normal to see a null device if an incoming connection races |
229 | * for each. If that fails for any reason, it will not register | 275 | * with device removal, so we don't print a warning. |
230 | * the rds_ibdev at all. | ||
231 | */ | 276 | */ |
232 | rds_ibdev = ib_get_client_data(dev, &rds_ib_client); | 277 | rds_ibdev = rds_ib_get_client_data(dev); |
233 | if (rds_ibdev == NULL) { | 278 | if (!rds_ibdev) |
234 | if (printk_ratelimit()) | ||
235 | printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n", | ||
236 | dev->name); | ||
237 | return -EOPNOTSUPP; | 279 | return -EOPNOTSUPP; |
238 | } | 280 | |
281 | /* add the conn now so that connection establishment has the dev */ | ||
282 | rds_ib_add_conn(rds_ibdev, conn); | ||
239 | 283 | ||
240 | if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) | 284 | if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) |
241 | rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); | 285 | rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); |
@@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) | |||
306 | ic->i_send_ring.w_nr * | 350 | ic->i_send_ring.w_nr * |
307 | sizeof(struct rds_header), | 351 | sizeof(struct rds_header), |
308 | &ic->i_send_hdrs_dma, GFP_KERNEL); | 352 | &ic->i_send_hdrs_dma, GFP_KERNEL); |
309 | if (ic->i_send_hdrs == NULL) { | 353 | if (!ic->i_send_hdrs) { |
310 | ret = -ENOMEM; | 354 | ret = -ENOMEM; |
311 | rdsdebug("ib_dma_alloc_coherent send failed\n"); | 355 | rdsdebug("ib_dma_alloc_coherent send failed\n"); |
312 | goto out; | 356 | goto out; |
@@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) | |||
316 | ic->i_recv_ring.w_nr * | 360 | ic->i_recv_ring.w_nr * |
317 | sizeof(struct rds_header), | 361 | sizeof(struct rds_header), |
318 | &ic->i_recv_hdrs_dma, GFP_KERNEL); | 362 | &ic->i_recv_hdrs_dma, GFP_KERNEL); |
319 | if (ic->i_recv_hdrs == NULL) { | 363 | if (!ic->i_recv_hdrs) { |
320 | ret = -ENOMEM; | 364 | ret = -ENOMEM; |
321 | rdsdebug("ib_dma_alloc_coherent recv failed\n"); | 365 | rdsdebug("ib_dma_alloc_coherent recv failed\n"); |
322 | goto out; | 366 | goto out; |
@@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn) | |||
324 | 368 | ||
325 | ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), | 369 | ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), |
326 | &ic->i_ack_dma, GFP_KERNEL); | 370 | &ic->i_ack_dma, GFP_KERNEL); |
327 | if (ic->i_ack == NULL) { | 371 | if (!ic->i_ack) { |
328 | ret = -ENOMEM; | 372 | ret = -ENOMEM; |
329 | rdsdebug("ib_dma_alloc_coherent ack failed\n"); | 373 | rdsdebug("ib_dma_alloc_coherent ack failed\n"); |
330 | goto out; | 374 | goto out; |
331 | } | 375 | } |
332 | 376 | ||
333 | ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); | 377 | ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), |
334 | if (ic->i_sends == NULL) { | 378 | ibdev_to_node(dev)); |
379 | if (!ic->i_sends) { | ||
335 | ret = -ENOMEM; | 380 | ret = -ENOMEM; |
336 | rdsdebug("send allocation failed\n"); | 381 | rdsdebug("send allocation failed\n"); |
337 | goto out; | 382 | goto out; |
338 | } | 383 | } |
339 | memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); | 384 | memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); |
340 | 385 | ||
341 | ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); | 386 | ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), |
342 | if (ic->i_recvs == NULL) { | 387 | ibdev_to_node(dev)); |
388 | if (!ic->i_recvs) { | ||
343 | ret = -ENOMEM; | 389 | ret = -ENOMEM; |
344 | rdsdebug("recv allocation failed\n"); | 390 | rdsdebug("recv allocation failed\n"); |
345 | goto out; | 391 | goto out; |
@@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) | |||
352 | ic->i_send_cq, ic->i_recv_cq); | 398 | ic->i_send_cq, ic->i_recv_cq); |
353 | 399 | ||
354 | out: | 400 | out: |
401 | rds_ib_dev_put(rds_ibdev); | ||
355 | return ret; | 402 | return ret; |
356 | } | 403 | } |
357 | 404 | ||
@@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | |||
409 | struct rds_ib_connection *ic = NULL; | 456 | struct rds_ib_connection *ic = NULL; |
410 | struct rdma_conn_param conn_param; | 457 | struct rdma_conn_param conn_param; |
411 | u32 version; | 458 | u32 version; |
412 | int err, destroy = 1; | 459 | int err = 1, destroy = 1; |
413 | 460 | ||
414 | /* Check whether the remote protocol version matches ours. */ | 461 | /* Check whether the remote protocol version matches ours. */ |
415 | version = rds_ib_protocol_compatible(event); | 462 | version = rds_ib_protocol_compatible(event); |
@@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | |||
448 | /* Wait and see - our connect may still be succeeding */ | 495 | /* Wait and see - our connect may still be succeeding */ |
449 | rds_ib_stats_inc(s_ib_connect_raced); | 496 | rds_ib_stats_inc(s_ib_connect_raced); |
450 | } | 497 | } |
451 | mutex_unlock(&conn->c_cm_lock); | ||
452 | goto out; | 498 | goto out; |
453 | } | 499 | } |
454 | 500 | ||
@@ -475,24 +521,23 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, | |||
475 | err = rds_ib_setup_qp(conn); | 521 | err = rds_ib_setup_qp(conn); |
476 | if (err) { | 522 | if (err) { |
477 | rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); | 523 | rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); |
478 | mutex_unlock(&conn->c_cm_lock); | ||
479 | goto out; | 524 | goto out; |
480 | } | 525 | } |
481 | 526 | ||
482 | rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); | 527 | rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, |
528 | event->param.conn.responder_resources, | ||
529 | event->param.conn.initiator_depth); | ||
483 | 530 | ||
484 | /* rdma_accept() calls rdma_reject() internally if it fails */ | 531 | /* rdma_accept() calls rdma_reject() internally if it fails */ |
485 | err = rdma_accept(cm_id, &conn_param); | 532 | err = rdma_accept(cm_id, &conn_param); |
486 | mutex_unlock(&conn->c_cm_lock); | 533 | if (err) |
487 | if (err) { | ||
488 | rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); | 534 | rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); |
489 | goto out; | ||
490 | } | ||
491 | |||
492 | return 0; | ||
493 | 535 | ||
494 | out: | 536 | out: |
495 | rdma_reject(cm_id, NULL, 0); | 537 | if (conn) |
538 | mutex_unlock(&conn->c_cm_lock); | ||
539 | if (err) | ||
540 | rdma_reject(cm_id, NULL, 0); | ||
496 | return destroy; | 541 | return destroy; |
497 | } | 542 | } |
498 | 543 | ||
@@ -516,8 +561,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) | |||
516 | goto out; | 561 | goto out; |
517 | } | 562 | } |
518 | 563 | ||
519 | rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); | 564 | rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, |
520 | 565 | UINT_MAX, UINT_MAX); | |
521 | ret = rdma_connect(cm_id, &conn_param); | 566 | ret = rdma_connect(cm_id, &conn_param); |
522 | if (ret) | 567 | if (ret) |
523 | rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); | 568 | rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); |
@@ -601,9 +646,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) | |||
601 | ic->i_cm_id, err); | 646 | ic->i_cm_id, err); |
602 | } | 647 | } |
603 | 648 | ||
649 | /* | ||
650 | * We want to wait for tx and rx completion to finish | ||
651 | * before we tear down the connection, but we have to be | ||
652 | * careful not to get stuck waiting on a send ring that | ||
653 | * only has unsignaled sends in it. We've shutdown new | ||
654 | * sends before getting here so by waiting for signaled | ||
655 | * sends to complete we're ensured that there will be no | ||
656 | * more tx processing. | ||
657 | */ | ||
604 | wait_event(rds_ib_ring_empty_wait, | 658 | wait_event(rds_ib_ring_empty_wait, |
605 | rds_ib_ring_empty(&ic->i_send_ring) && | 659 | rds_ib_ring_empty(&ic->i_recv_ring) && |
606 | rds_ib_ring_empty(&ic->i_recv_ring)); | 660 | (atomic_read(&ic->i_signaled_sends) == 0)); |
661 | tasklet_kill(&ic->i_recv_tasklet); | ||
607 | 662 | ||
608 | if (ic->i_send_hdrs) | 663 | if (ic->i_send_hdrs) |
609 | ib_dma_free_coherent(dev, | 664 | ib_dma_free_coherent(dev, |
@@ -654,9 +709,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) | |||
654 | BUG_ON(ic->rds_ibdev); | 709 | BUG_ON(ic->rds_ibdev); |
655 | 710 | ||
656 | /* Clear pending transmit */ | 711 | /* Clear pending transmit */ |
657 | if (ic->i_rm) { | 712 | if (ic->i_data_op) { |
658 | rds_message_put(ic->i_rm); | 713 | struct rds_message *rm; |
659 | ic->i_rm = NULL; | 714 | |
715 | rm = container_of(ic->i_data_op, struct rds_message, data); | ||
716 | rds_message_put(rm); | ||
717 | ic->i_data_op = NULL; | ||
660 | } | 718 | } |
661 | 719 | ||
662 | /* Clear the ACK state */ | 720 | /* Clear the ACK state */ |
@@ -690,12 +748,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) | |||
690 | { | 748 | { |
691 | struct rds_ib_connection *ic; | 749 | struct rds_ib_connection *ic; |
692 | unsigned long flags; | 750 | unsigned long flags; |
751 | int ret; | ||
693 | 752 | ||
694 | /* XXX too lazy? */ | 753 | /* XXX too lazy? */ |
695 | ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); | 754 | ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); |
696 | if (ic == NULL) | 755 | if (!ic) |
697 | return -ENOMEM; | 756 | return -ENOMEM; |
698 | 757 | ||
758 | ret = rds_ib_recv_alloc_caches(ic); | ||
759 | if (ret) { | ||
760 | kfree(ic); | ||
761 | return ret; | ||
762 | } | ||
763 | |||
699 | INIT_LIST_HEAD(&ic->ib_node); | 764 | INIT_LIST_HEAD(&ic->ib_node); |
700 | tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, | 765 | tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, |
701 | (unsigned long) ic); | 766 | (unsigned long) ic); |
@@ -703,6 +768,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) | |||
703 | #ifndef KERNEL_HAS_ATOMIC64 | 768 | #ifndef KERNEL_HAS_ATOMIC64 |
704 | spin_lock_init(&ic->i_ack_lock); | 769 | spin_lock_init(&ic->i_ack_lock); |
705 | #endif | 770 | #endif |
771 | atomic_set(&ic->i_signaled_sends, 0); | ||
706 | 772 | ||
707 | /* | 773 | /* |
708 | * rds_ib_conn_shutdown() waits for these to be emptied so they | 774 | * rds_ib_conn_shutdown() waits for these to be emptied so they |
@@ -744,6 +810,8 @@ void rds_ib_conn_free(void *arg) | |||
744 | list_del(&ic->ib_node); | 810 | list_del(&ic->ib_node); |
745 | spin_unlock_irq(lock_ptr); | 811 | spin_unlock_irq(lock_ptr); |
746 | 812 | ||
813 | rds_ib_recv_free_caches(ic); | ||
814 | |||
747 | kfree(ic); | 815 | kfree(ic); |
748 | } | 816 | } |
749 | 817 | ||
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index a54cd63f9e3..b5a88415a18 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c | |||
@@ -32,11 +32,16 @@ | |||
32 | */ | 32 | */ |
33 | #include <linux/kernel.h> | 33 | #include <linux/kernel.h> |
34 | #include <linux/slab.h> | 34 | #include <linux/slab.h> |
35 | #include <linux/rculist.h> | ||
35 | 36 | ||
36 | #include "rds.h" | 37 | #include "rds.h" |
37 | #include "rdma.h" | ||
38 | #include "ib.h" | 38 | #include "ib.h" |
39 | #include "xlist.h" | ||
39 | 40 | ||
41 | struct workqueue_struct *rds_ib_fmr_wq; | ||
42 | |||
43 | static DEFINE_PER_CPU(unsigned long, clean_list_grace); | ||
44 | #define CLEAN_LIST_BUSY_BIT 0 | ||
40 | 45 | ||
41 | /* | 46 | /* |
42 | * This is stored as mr->r_trans_private. | 47 | * This is stored as mr->r_trans_private. |
@@ -45,7 +50,11 @@ struct rds_ib_mr { | |||
45 | struct rds_ib_device *device; | 50 | struct rds_ib_device *device; |
46 | struct rds_ib_mr_pool *pool; | 51 | struct rds_ib_mr_pool *pool; |
47 | struct ib_fmr *fmr; | 52 | struct ib_fmr *fmr; |
48 | struct list_head list; | 53 | |
54 | struct xlist_head xlist; | ||
55 | |||
56 | /* unmap_list is for freeing */ | ||
57 | struct list_head unmap_list; | ||
49 | unsigned int remap_count; | 58 | unsigned int remap_count; |
50 | 59 | ||
51 | struct scatterlist *sg; | 60 | struct scatterlist *sg; |
@@ -59,14 +68,16 @@ struct rds_ib_mr { | |||
59 | */ | 68 | */ |
60 | struct rds_ib_mr_pool { | 69 | struct rds_ib_mr_pool { |
61 | struct mutex flush_lock; /* serialize fmr invalidate */ | 70 | struct mutex flush_lock; /* serialize fmr invalidate */ |
62 | struct work_struct flush_worker; /* flush worker */ | 71 | struct delayed_work flush_worker; /* flush worker */ |
63 | 72 | ||
64 | spinlock_t list_lock; /* protect variables below */ | ||
65 | atomic_t item_count; /* total # of MRs */ | 73 | atomic_t item_count; /* total # of MRs */ |
66 | atomic_t dirty_count; /* # dirty of MRs */ | 74 | atomic_t dirty_count; /* # dirty of MRs */ |
67 | struct list_head drop_list; /* MRs that have reached their max_maps limit */ | 75 | |
68 | struct list_head free_list; /* unused MRs */ | 76 | struct xlist_head drop_list; /* MRs that have reached their max_maps limit */ |
69 | struct list_head clean_list; /* unused & unamapped MRs */ | 77 | struct xlist_head free_list; /* unused MRs */ |
78 | struct xlist_head clean_list; /* global unused & unamapped MRs */ | ||
79 | wait_queue_head_t flush_wait; | ||
80 | |||
70 | atomic_t free_pinned; /* memory pinned by free MRs */ | 81 | atomic_t free_pinned; /* memory pinned by free MRs */ |
71 | unsigned long max_items; | 82 | unsigned long max_items; |
72 | unsigned long max_items_soft; | 83 | unsigned long max_items_soft; |
@@ -74,7 +85,7 @@ struct rds_ib_mr_pool { | |||
74 | struct ib_fmr_attr fmr_attr; | 85 | struct ib_fmr_attr fmr_attr; |
75 | }; | 86 | }; |
76 | 87 | ||
77 | static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all); | 88 | static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); |
78 | static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); | 89 | static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); |
79 | static void rds_ib_mr_pool_flush_worker(struct work_struct *work); | 90 | static void rds_ib_mr_pool_flush_worker(struct work_struct *work); |
80 | 91 | ||
@@ -83,16 +94,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) | |||
83 | struct rds_ib_device *rds_ibdev; | 94 | struct rds_ib_device *rds_ibdev; |
84 | struct rds_ib_ipaddr *i_ipaddr; | 95 | struct rds_ib_ipaddr *i_ipaddr; |
85 | 96 | ||
86 | list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { | 97 | rcu_read_lock(); |
87 | spin_lock_irq(&rds_ibdev->spinlock); | 98 | list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { |
88 | list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) { | 99 | list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { |
89 | if (i_ipaddr->ipaddr == ipaddr) { | 100 | if (i_ipaddr->ipaddr == ipaddr) { |
90 | spin_unlock_irq(&rds_ibdev->spinlock); | 101 | atomic_inc(&rds_ibdev->refcount); |
102 | rcu_read_unlock(); | ||
91 | return rds_ibdev; | 103 | return rds_ibdev; |
92 | } | 104 | } |
93 | } | 105 | } |
94 | spin_unlock_irq(&rds_ibdev->spinlock); | ||
95 | } | 106 | } |
107 | rcu_read_unlock(); | ||
96 | 108 | ||
97 | return NULL; | 109 | return NULL; |
98 | } | 110 | } |
@@ -108,7 +120,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | |||
108 | i_ipaddr->ipaddr = ipaddr; | 120 | i_ipaddr->ipaddr = ipaddr; |
109 | 121 | ||
110 | spin_lock_irq(&rds_ibdev->spinlock); | 122 | spin_lock_irq(&rds_ibdev->spinlock); |
111 | list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list); | 123 | list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); |
112 | spin_unlock_irq(&rds_ibdev->spinlock); | 124 | spin_unlock_irq(&rds_ibdev->spinlock); |
113 | 125 | ||
114 | return 0; | 126 | return 0; |
@@ -116,17 +128,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | |||
116 | 128 | ||
117 | static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | 129 | static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) |
118 | { | 130 | { |
119 | struct rds_ib_ipaddr *i_ipaddr, *next; | 131 | struct rds_ib_ipaddr *i_ipaddr; |
132 | struct rds_ib_ipaddr *to_free = NULL; | ||
133 | |||
120 | 134 | ||
121 | spin_lock_irq(&rds_ibdev->spinlock); | 135 | spin_lock_irq(&rds_ibdev->spinlock); |
122 | list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) { | 136 | list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { |
123 | if (i_ipaddr->ipaddr == ipaddr) { | 137 | if (i_ipaddr->ipaddr == ipaddr) { |
124 | list_del(&i_ipaddr->list); | 138 | list_del_rcu(&i_ipaddr->list); |
125 | kfree(i_ipaddr); | 139 | to_free = i_ipaddr; |
126 | break; | 140 | break; |
127 | } | 141 | } |
128 | } | 142 | } |
129 | spin_unlock_irq(&rds_ibdev->spinlock); | 143 | spin_unlock_irq(&rds_ibdev->spinlock); |
144 | |||
145 | if (to_free) { | ||
146 | synchronize_rcu(); | ||
147 | kfree(to_free); | ||
148 | } | ||
130 | } | 149 | } |
131 | 150 | ||
132 | int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | 151 | int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) |
@@ -134,8 +153,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) | |||
134 | struct rds_ib_device *rds_ibdev_old; | 153 | struct rds_ib_device *rds_ibdev_old; |
135 | 154 | ||
136 | rds_ibdev_old = rds_ib_get_device(ipaddr); | 155 | rds_ibdev_old = rds_ib_get_device(ipaddr); |
137 | if (rds_ibdev_old) | 156 | if (rds_ibdev_old) { |
138 | rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); | 157 | rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); |
158 | rds_ib_dev_put(rds_ibdev_old); | ||
159 | } | ||
139 | 160 | ||
140 | return rds_ib_add_ipaddr(rds_ibdev, ipaddr); | 161 | return rds_ib_add_ipaddr(rds_ibdev, ipaddr); |
141 | } | 162 | } |
@@ -150,12 +171,13 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con | |||
150 | BUG_ON(list_empty(&ic->ib_node)); | 171 | BUG_ON(list_empty(&ic->ib_node)); |
151 | list_del(&ic->ib_node); | 172 | list_del(&ic->ib_node); |
152 | 173 | ||
153 | spin_lock_irq(&rds_ibdev->spinlock); | 174 | spin_lock(&rds_ibdev->spinlock); |
154 | list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); | 175 | list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); |
155 | spin_unlock_irq(&rds_ibdev->spinlock); | 176 | spin_unlock(&rds_ibdev->spinlock); |
156 | spin_unlock_irq(&ib_nodev_conns_lock); | 177 | spin_unlock_irq(&ib_nodev_conns_lock); |
157 | 178 | ||
158 | ic->rds_ibdev = rds_ibdev; | 179 | ic->rds_ibdev = rds_ibdev; |
180 | atomic_inc(&rds_ibdev->refcount); | ||
159 | } | 181 | } |
160 | 182 | ||
161 | void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) | 183 | void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) |
@@ -175,18 +197,18 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection * | |||
175 | spin_unlock(&ib_nodev_conns_lock); | 197 | spin_unlock(&ib_nodev_conns_lock); |
176 | 198 | ||
177 | ic->rds_ibdev = NULL; | 199 | ic->rds_ibdev = NULL; |
200 | rds_ib_dev_put(rds_ibdev); | ||
178 | } | 201 | } |
179 | 202 | ||
180 | void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock) | 203 | void rds_ib_destroy_nodev_conns(void) |
181 | { | 204 | { |
182 | struct rds_ib_connection *ic, *_ic; | 205 | struct rds_ib_connection *ic, *_ic; |
183 | LIST_HEAD(tmp_list); | 206 | LIST_HEAD(tmp_list); |
184 | 207 | ||
185 | /* avoid calling conn_destroy with irqs off */ | 208 | /* avoid calling conn_destroy with irqs off */ |
186 | spin_lock_irq(list_lock); | 209 | spin_lock_irq(&ib_nodev_conns_lock); |
187 | list_splice(list, &tmp_list); | 210 | list_splice(&ib_nodev_conns, &tmp_list); |
188 | INIT_LIST_HEAD(list); | 211 | spin_unlock_irq(&ib_nodev_conns_lock); |
189 | spin_unlock_irq(list_lock); | ||
190 | 212 | ||
191 | list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) | 213 | list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) |
192 | rds_conn_destroy(ic->conn); | 214 | rds_conn_destroy(ic->conn); |
@@ -200,12 +222,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) | |||
200 | if (!pool) | 222 | if (!pool) |
201 | return ERR_PTR(-ENOMEM); | 223 | return ERR_PTR(-ENOMEM); |
202 | 224 | ||
203 | INIT_LIST_HEAD(&pool->free_list); | 225 | INIT_XLIST_HEAD(&pool->free_list); |
204 | INIT_LIST_HEAD(&pool->drop_list); | 226 | INIT_XLIST_HEAD(&pool->drop_list); |
205 | INIT_LIST_HEAD(&pool->clean_list); | 227 | INIT_XLIST_HEAD(&pool->clean_list); |
206 | mutex_init(&pool->flush_lock); | 228 | mutex_init(&pool->flush_lock); |
207 | spin_lock_init(&pool->list_lock); | 229 | init_waitqueue_head(&pool->flush_wait); |
208 | INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); | 230 | INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); |
209 | 231 | ||
210 | pool->fmr_attr.max_pages = fmr_message_size; | 232 | pool->fmr_attr.max_pages = fmr_message_size; |
211 | pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; | 233 | pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; |
@@ -233,34 +255,60 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co | |||
233 | 255 | ||
234 | void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) | 256 | void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) |
235 | { | 257 | { |
236 | flush_workqueue(rds_wq); | 258 | cancel_delayed_work_sync(&pool->flush_worker); |
237 | rds_ib_flush_mr_pool(pool, 1); | 259 | rds_ib_flush_mr_pool(pool, 1, NULL); |
238 | WARN_ON(atomic_read(&pool->item_count)); | 260 | WARN_ON(atomic_read(&pool->item_count)); |
239 | WARN_ON(atomic_read(&pool->free_pinned)); | 261 | WARN_ON(atomic_read(&pool->free_pinned)); |
240 | kfree(pool); | 262 | kfree(pool); |
241 | } | 263 | } |
242 | 264 | ||
265 | static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl, | ||
266 | struct rds_ib_mr **ibmr_ret) | ||
267 | { | ||
268 | struct xlist_head *ibmr_xl; | ||
269 | ibmr_xl = xlist_del_head_fast(xl); | ||
270 | *ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist); | ||
271 | } | ||
272 | |||
243 | static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) | 273 | static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) |
244 | { | 274 | { |
245 | struct rds_ib_mr *ibmr = NULL; | 275 | struct rds_ib_mr *ibmr = NULL; |
246 | unsigned long flags; | 276 | struct xlist_head *ret; |
277 | unsigned long *flag; | ||
247 | 278 | ||
248 | spin_lock_irqsave(&pool->list_lock, flags); | 279 | preempt_disable(); |
249 | if (!list_empty(&pool->clean_list)) { | 280 | flag = &__get_cpu_var(clean_list_grace); |
250 | ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list); | 281 | set_bit(CLEAN_LIST_BUSY_BIT, flag); |
251 | list_del_init(&ibmr->list); | 282 | ret = xlist_del_head(&pool->clean_list); |
252 | } | 283 | if (ret) |
253 | spin_unlock_irqrestore(&pool->list_lock, flags); | 284 | ibmr = list_entry(ret, struct rds_ib_mr, xlist); |
254 | 285 | ||
286 | clear_bit(CLEAN_LIST_BUSY_BIT, flag); | ||
287 | preempt_enable(); | ||
255 | return ibmr; | 288 | return ibmr; |
256 | } | 289 | } |
257 | 290 | ||
291 | static inline void wait_clean_list_grace(void) | ||
292 | { | ||
293 | int cpu; | ||
294 | unsigned long *flag; | ||
295 | |||
296 | for_each_online_cpu(cpu) { | ||
297 | flag = &per_cpu(clean_list_grace, cpu); | ||
298 | while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) | ||
299 | cpu_relax(); | ||
300 | } | ||
301 | } | ||
302 | |||
258 | static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) | 303 | static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) |
259 | { | 304 | { |
260 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; | 305 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; |
261 | struct rds_ib_mr *ibmr = NULL; | 306 | struct rds_ib_mr *ibmr = NULL; |
262 | int err = 0, iter = 0; | 307 | int err = 0, iter = 0; |
263 | 308 | ||
309 | if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) | ||
310 | queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); | ||
311 | |||
264 | while (1) { | 312 | while (1) { |
265 | ibmr = rds_ib_reuse_fmr(pool); | 313 | ibmr = rds_ib_reuse_fmr(pool); |
266 | if (ibmr) | 314 | if (ibmr) |
@@ -287,19 +335,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) | |||
287 | 335 | ||
288 | /* We do have some empty MRs. Flush them out. */ | 336 | /* We do have some empty MRs. Flush them out. */ |
289 | rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); | 337 | rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); |
290 | rds_ib_flush_mr_pool(pool, 0); | 338 | rds_ib_flush_mr_pool(pool, 0, &ibmr); |
339 | if (ibmr) | ||
340 | return ibmr; | ||
291 | } | 341 | } |
292 | 342 | ||
293 | ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); | 343 | ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev)); |
294 | if (!ibmr) { | 344 | if (!ibmr) { |
295 | err = -ENOMEM; | 345 | err = -ENOMEM; |
296 | goto out_no_cigar; | 346 | goto out_no_cigar; |
297 | } | 347 | } |
298 | 348 | ||
349 | memset(ibmr, 0, sizeof(*ibmr)); | ||
350 | |||
299 | ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, | 351 | ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, |
300 | (IB_ACCESS_LOCAL_WRITE | | 352 | (IB_ACCESS_LOCAL_WRITE | |
301 | IB_ACCESS_REMOTE_READ | | 353 | IB_ACCESS_REMOTE_READ | |
302 | IB_ACCESS_REMOTE_WRITE), | 354 | IB_ACCESS_REMOTE_WRITE| |
355 | IB_ACCESS_REMOTE_ATOMIC), | ||
303 | &pool->fmr_attr); | 356 | &pool->fmr_attr); |
304 | if (IS_ERR(ibmr->fmr)) { | 357 | if (IS_ERR(ibmr->fmr)) { |
305 | err = PTR_ERR(ibmr->fmr); | 358 | err = PTR_ERR(ibmr->fmr); |
@@ -367,7 +420,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm | |||
367 | if (page_cnt > fmr_message_size) | 420 | if (page_cnt > fmr_message_size) |
368 | return -EINVAL; | 421 | return -EINVAL; |
369 | 422 | ||
370 | dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); | 423 | dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, |
424 | rdsibdev_to_node(rds_ibdev)); | ||
371 | if (!dma_pages) | 425 | if (!dma_pages) |
372 | return -ENOMEM; | 426 | return -ENOMEM; |
373 | 427 | ||
@@ -441,7 +495,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) | |||
441 | 495 | ||
442 | /* FIXME we need a way to tell a r/w MR | 496 | /* FIXME we need a way to tell a r/w MR |
443 | * from a r/o MR */ | 497 | * from a r/o MR */ |
444 | BUG_ON(in_interrupt()); | 498 | BUG_ON(irqs_disabled()); |
445 | set_page_dirty(page); | 499 | set_page_dirty(page); |
446 | put_page(page); | 500 | put_page(page); |
447 | } | 501 | } |
@@ -477,33 +531,109 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr | |||
477 | } | 531 | } |
478 | 532 | ||
479 | /* | 533 | /* |
534 | * given an xlist of mrs, put them all into the list_head for more processing | ||
535 | */ | ||
536 | static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list) | ||
537 | { | ||
538 | struct rds_ib_mr *ibmr; | ||
539 | struct xlist_head splice; | ||
540 | struct xlist_head *cur; | ||
541 | struct xlist_head *next; | ||
542 | |||
543 | splice.next = NULL; | ||
544 | xlist_splice(xlist, &splice); | ||
545 | cur = splice.next; | ||
546 | while (cur) { | ||
547 | next = cur->next; | ||
548 | ibmr = list_entry(cur, struct rds_ib_mr, xlist); | ||
549 | list_add_tail(&ibmr->unmap_list, list); | ||
550 | cur = next; | ||
551 | } | ||
552 | } | ||
553 | |||
554 | /* | ||
555 | * this takes a list head of mrs and turns it into an xlist of clusters. | ||
556 | * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for | ||
557 | * reuse. | ||
558 | */ | ||
559 | static void list_append_to_xlist(struct rds_ib_mr_pool *pool, | ||
560 | struct list_head *list, struct xlist_head *xlist, | ||
561 | struct xlist_head **tail_ret) | ||
562 | { | ||
563 | struct rds_ib_mr *ibmr; | ||
564 | struct xlist_head *cur_mr = xlist; | ||
565 | struct xlist_head *tail_mr = NULL; | ||
566 | |||
567 | list_for_each_entry(ibmr, list, unmap_list) { | ||
568 | tail_mr = &ibmr->xlist; | ||
569 | tail_mr->next = NULL; | ||
570 | cur_mr->next = tail_mr; | ||
571 | cur_mr = tail_mr; | ||
572 | } | ||
573 | *tail_ret = tail_mr; | ||
574 | } | ||
575 | |||
576 | /* | ||
480 | * Flush our pool of MRs. | 577 | * Flush our pool of MRs. |
481 | * At a minimum, all currently unused MRs are unmapped. | 578 | * At a minimum, all currently unused MRs are unmapped. |
482 | * If the number of MRs allocated exceeds the limit, we also try | 579 | * If the number of MRs allocated exceeds the limit, we also try |
483 | * to free as many MRs as needed to get back to this limit. | 580 | * to free as many MRs as needed to get back to this limit. |
484 | */ | 581 | */ |
485 | static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) | 582 | static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, |
583 | int free_all, struct rds_ib_mr **ibmr_ret) | ||
486 | { | 584 | { |
487 | struct rds_ib_mr *ibmr, *next; | 585 | struct rds_ib_mr *ibmr, *next; |
586 | struct xlist_head clean_xlist; | ||
587 | struct xlist_head *clean_tail; | ||
488 | LIST_HEAD(unmap_list); | 588 | LIST_HEAD(unmap_list); |
489 | LIST_HEAD(fmr_list); | 589 | LIST_HEAD(fmr_list); |
490 | unsigned long unpinned = 0; | 590 | unsigned long unpinned = 0; |
491 | unsigned long flags; | ||
492 | unsigned int nfreed = 0, ncleaned = 0, free_goal; | 591 | unsigned int nfreed = 0, ncleaned = 0, free_goal; |
493 | int ret = 0; | 592 | int ret = 0; |
494 | 593 | ||
495 | rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); | 594 | rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); |
496 | 595 | ||
497 | mutex_lock(&pool->flush_lock); | 596 | if (ibmr_ret) { |
597 | DEFINE_WAIT(wait); | ||
598 | while(!mutex_trylock(&pool->flush_lock)) { | ||
599 | ibmr = rds_ib_reuse_fmr(pool); | ||
600 | if (ibmr) { | ||
601 | *ibmr_ret = ibmr; | ||
602 | finish_wait(&pool->flush_wait, &wait); | ||
603 | goto out_nolock; | ||
604 | } | ||
605 | |||
606 | prepare_to_wait(&pool->flush_wait, &wait, | ||
607 | TASK_UNINTERRUPTIBLE); | ||
608 | if (xlist_empty(&pool->clean_list)) | ||
609 | schedule(); | ||
610 | |||
611 | ibmr = rds_ib_reuse_fmr(pool); | ||
612 | if (ibmr) { | ||
613 | *ibmr_ret = ibmr; | ||
614 | finish_wait(&pool->flush_wait, &wait); | ||
615 | goto out_nolock; | ||
616 | } | ||
617 | } | ||
618 | finish_wait(&pool->flush_wait, &wait); | ||
619 | } else | ||
620 | mutex_lock(&pool->flush_lock); | ||
621 | |||
622 | if (ibmr_ret) { | ||
623 | ibmr = rds_ib_reuse_fmr(pool); | ||
624 | if (ibmr) { | ||
625 | *ibmr_ret = ibmr; | ||
626 | goto out; | ||
627 | } | ||
628 | } | ||
498 | 629 | ||
499 | spin_lock_irqsave(&pool->list_lock, flags); | ||
500 | /* Get the list of all MRs to be dropped. Ordering matters - | 630 | /* Get the list of all MRs to be dropped. Ordering matters - |
501 | * we want to put drop_list ahead of free_list. */ | 631 | * we want to put drop_list ahead of free_list. |
502 | list_splice_init(&pool->free_list, &unmap_list); | 632 | */ |
503 | list_splice_init(&pool->drop_list, &unmap_list); | 633 | xlist_append_to_list(&pool->drop_list, &unmap_list); |
634 | xlist_append_to_list(&pool->free_list, &unmap_list); | ||
504 | if (free_all) | 635 | if (free_all) |
505 | list_splice_init(&pool->clean_list, &unmap_list); | 636 | xlist_append_to_list(&pool->clean_list, &unmap_list); |
506 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
507 | 637 | ||
508 | free_goal = rds_ib_flush_goal(pool, free_all); | 638 | free_goal = rds_ib_flush_goal(pool, free_all); |
509 | 639 | ||
@@ -511,19 +641,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) | |||
511 | goto out; | 641 | goto out; |
512 | 642 | ||
513 | /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ | 643 | /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ |
514 | list_for_each_entry(ibmr, &unmap_list, list) | 644 | list_for_each_entry(ibmr, &unmap_list, unmap_list) |
515 | list_add(&ibmr->fmr->list, &fmr_list); | 645 | list_add(&ibmr->fmr->list, &fmr_list); |
646 | |||
516 | ret = ib_unmap_fmr(&fmr_list); | 647 | ret = ib_unmap_fmr(&fmr_list); |
517 | if (ret) | 648 | if (ret) |
518 | printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); | 649 | printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); |
519 | 650 | ||
520 | /* Now we can destroy the DMA mapping and unpin any pages */ | 651 | /* Now we can destroy the DMA mapping and unpin any pages */ |
521 | list_for_each_entry_safe(ibmr, next, &unmap_list, list) { | 652 | list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { |
522 | unpinned += ibmr->sg_len; | 653 | unpinned += ibmr->sg_len; |
523 | __rds_ib_teardown_mr(ibmr); | 654 | __rds_ib_teardown_mr(ibmr); |
524 | if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { | 655 | if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { |
525 | rds_ib_stats_inc(s_ib_rdma_mr_free); | 656 | rds_ib_stats_inc(s_ib_rdma_mr_free); |
526 | list_del(&ibmr->list); | 657 | list_del(&ibmr->unmap_list); |
527 | ib_dealloc_fmr(ibmr->fmr); | 658 | ib_dealloc_fmr(ibmr->fmr); |
528 | kfree(ibmr); | 659 | kfree(ibmr); |
529 | nfreed++; | 660 | nfreed++; |
@@ -531,9 +662,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) | |||
531 | ncleaned++; | 662 | ncleaned++; |
532 | } | 663 | } |
533 | 664 | ||
534 | spin_lock_irqsave(&pool->list_lock, flags); | 665 | if (!list_empty(&unmap_list)) { |
535 | list_splice(&unmap_list, &pool->clean_list); | 666 | /* we have to make sure that none of the things we're about |
536 | spin_unlock_irqrestore(&pool->list_lock, flags); | 667 | * to put on the clean list would race with other cpus trying |
668 | * to pull items off. The xlist would explode if we managed to | ||
669 | * remove something from the clean list and then add it back again | ||
670 | * while another CPU was spinning on that same item in xlist_del_head. | ||
671 | * | ||
672 | * This is pretty unlikely, but just in case wait for an xlist grace period | ||
673 | * here before adding anything back into the clean list. | ||
674 | */ | ||
675 | wait_clean_list_grace(); | ||
676 | |||
677 | list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail); | ||
678 | if (ibmr_ret) | ||
679 | refill_local(pool, &clean_xlist, ibmr_ret); | ||
680 | |||
681 | /* refill_local may have emptied our list */ | ||
682 | if (!xlist_empty(&clean_xlist)) | ||
683 | xlist_add(clean_xlist.next, clean_tail, &pool->clean_list); | ||
684 | |||
685 | } | ||
537 | 686 | ||
538 | atomic_sub(unpinned, &pool->free_pinned); | 687 | atomic_sub(unpinned, &pool->free_pinned); |
539 | atomic_sub(ncleaned, &pool->dirty_count); | 688 | atomic_sub(ncleaned, &pool->dirty_count); |
@@ -541,14 +690,35 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) | |||
541 | 690 | ||
542 | out: | 691 | out: |
543 | mutex_unlock(&pool->flush_lock); | 692 | mutex_unlock(&pool->flush_lock); |
693 | if (waitqueue_active(&pool->flush_wait)) | ||
694 | wake_up(&pool->flush_wait); | ||
695 | out_nolock: | ||
544 | return ret; | 696 | return ret; |
545 | } | 697 | } |
546 | 698 | ||
699 | int rds_ib_fmr_init(void) | ||
700 | { | ||
701 | rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); | ||
702 | if (!rds_ib_fmr_wq) | ||
703 | return -ENOMEM; | ||
704 | return 0; | ||
705 | } | ||
706 | |||
707 | /* | ||
708 | * By the time this is called all the IB devices should have been torn down and | ||
709 | * had their pools freed. As each pool is freed its work struct is waited on, | ||
710 | * so the pool flushing work queue should be idle by the time we get here. | ||
711 | */ | ||
712 | void rds_ib_fmr_exit(void) | ||
713 | { | ||
714 | destroy_workqueue(rds_ib_fmr_wq); | ||
715 | } | ||
716 | |||
547 | static void rds_ib_mr_pool_flush_worker(struct work_struct *work) | 717 | static void rds_ib_mr_pool_flush_worker(struct work_struct *work) |
548 | { | 718 | { |
549 | struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker); | 719 | struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); |
550 | 720 | ||
551 | rds_ib_flush_mr_pool(pool, 0); | 721 | rds_ib_flush_mr_pool(pool, 0, NULL); |
552 | } | 722 | } |
553 | 723 | ||
554 | void rds_ib_free_mr(void *trans_private, int invalidate) | 724 | void rds_ib_free_mr(void *trans_private, int invalidate) |
@@ -556,47 +726,49 @@ void rds_ib_free_mr(void *trans_private, int invalidate) | |||
556 | struct rds_ib_mr *ibmr = trans_private; | 726 | struct rds_ib_mr *ibmr = trans_private; |
557 | struct rds_ib_device *rds_ibdev = ibmr->device; | 727 | struct rds_ib_device *rds_ibdev = ibmr->device; |
558 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; | 728 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; |
559 | unsigned long flags; | ||
560 | 729 | ||
561 | rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); | 730 | rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); |
562 | 731 | ||
563 | /* Return it to the pool's free list */ | 732 | /* Return it to the pool's free list */ |
564 | spin_lock_irqsave(&pool->list_lock, flags); | ||
565 | if (ibmr->remap_count >= pool->fmr_attr.max_maps) | 733 | if (ibmr->remap_count >= pool->fmr_attr.max_maps) |
566 | list_add(&ibmr->list, &pool->drop_list); | 734 | xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list); |
567 | else | 735 | else |
568 | list_add(&ibmr->list, &pool->free_list); | 736 | xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list); |
569 | 737 | ||
570 | atomic_add(ibmr->sg_len, &pool->free_pinned); | 738 | atomic_add(ibmr->sg_len, &pool->free_pinned); |
571 | atomic_inc(&pool->dirty_count); | 739 | atomic_inc(&pool->dirty_count); |
572 | spin_unlock_irqrestore(&pool->list_lock, flags); | ||
573 | 740 | ||
574 | /* If we've pinned too many pages, request a flush */ | 741 | /* If we've pinned too many pages, request a flush */ |
575 | if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || | 742 | if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || |
576 | atomic_read(&pool->dirty_count) >= pool->max_items / 10) | 743 | atomic_read(&pool->dirty_count) >= pool->max_items / 10) |
577 | queue_work(rds_wq, &pool->flush_worker); | 744 | queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); |
578 | 745 | ||
579 | if (invalidate) { | 746 | if (invalidate) { |
580 | if (likely(!in_interrupt())) { | 747 | if (likely(!in_interrupt())) { |
581 | rds_ib_flush_mr_pool(pool, 0); | 748 | rds_ib_flush_mr_pool(pool, 0, NULL); |
582 | } else { | 749 | } else { |
583 | /* We get here if the user created a MR marked | 750 | /* We get here if the user created a MR marked |
584 | * as use_once and invalidate at the same time. */ | 751 | * as use_once and invalidate at the same time. */ |
585 | queue_work(rds_wq, &pool->flush_worker); | 752 | queue_delayed_work(rds_ib_fmr_wq, |
753 | &pool->flush_worker, 10); | ||
586 | } | 754 | } |
587 | } | 755 | } |
756 | |||
757 | rds_ib_dev_put(rds_ibdev); | ||
588 | } | 758 | } |
589 | 759 | ||
590 | void rds_ib_flush_mrs(void) | 760 | void rds_ib_flush_mrs(void) |
591 | { | 761 | { |
592 | struct rds_ib_device *rds_ibdev; | 762 | struct rds_ib_device *rds_ibdev; |
593 | 763 | ||
764 | down_read(&rds_ib_devices_lock); | ||
594 | list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { | 765 | list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { |
595 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; | 766 | struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; |
596 | 767 | ||
597 | if (pool) | 768 | if (pool) |
598 | rds_ib_flush_mr_pool(pool, 0); | 769 | rds_ib_flush_mr_pool(pool, 0, NULL); |
599 | } | 770 | } |
771 | up_read(&rds_ib_devices_lock); | ||
600 | } | 772 | } |
601 | 773 | ||
602 | void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | 774 | void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, |
@@ -628,6 +800,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | |||
628 | printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); | 800 | printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); |
629 | 801 | ||
630 | ibmr->device = rds_ibdev; | 802 | ibmr->device = rds_ibdev; |
803 | rds_ibdev = NULL; | ||
631 | 804 | ||
632 | out: | 805 | out: |
633 | if (ret) { | 806 | if (ret) { |
@@ -635,5 +808,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, | |||
635 | rds_ib_free_mr(ibmr, 0); | 808 | rds_ib_free_mr(ibmr, 0); |
636 | ibmr = ERR_PTR(ret); | 809 | ibmr = ERR_PTR(ret); |
637 | } | 810 | } |
811 | if (rds_ibdev) | ||
812 | rds_ib_dev_put(rds_ibdev); | ||
638 | return ibmr; | 813 | return ibmr; |
639 | } | 814 | } |
815 | |||
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index c74e9904a6b..e29e0ca32f7 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c | |||
@@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab; | |||
43 | static struct kmem_cache *rds_ib_frag_slab; | 43 | static struct kmem_cache *rds_ib_frag_slab; |
44 | static atomic_t rds_ib_allocation = ATOMIC_INIT(0); | 44 | static atomic_t rds_ib_allocation = ATOMIC_INIT(0); |
45 | 45 | ||
46 | static void rds_ib_frag_drop_page(struct rds_page_frag *frag) | ||
47 | { | ||
48 | rdsdebug("frag %p page %p\n", frag, frag->f_page); | ||
49 | __free_page(frag->f_page); | ||
50 | frag->f_page = NULL; | ||
51 | } | ||
52 | |||
53 | static void rds_ib_frag_free(struct rds_page_frag *frag) | ||
54 | { | ||
55 | rdsdebug("frag %p page %p\n", frag, frag->f_page); | ||
56 | BUG_ON(frag->f_page != NULL); | ||
57 | kmem_cache_free(rds_ib_frag_slab, frag); | ||
58 | } | ||
59 | |||
60 | /* | ||
61 | * We map a page at a time. Its fragments are posted in order. This | ||
62 | * is called in fragment order as the fragments get send completion events. | ||
63 | * Only the last frag in the page performs the unmapping. | ||
64 | * | ||
65 | * It's OK for ring cleanup to call this in whatever order it likes because | ||
66 | * DMA is not in flight and so we can unmap while other ring entries still | ||
67 | * hold page references in their frags. | ||
68 | */ | ||
69 | static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic, | ||
70 | struct rds_ib_recv_work *recv) | ||
71 | { | ||
72 | struct rds_page_frag *frag = recv->r_frag; | ||
73 | |||
74 | rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); | ||
75 | if (frag->f_mapped) | ||
76 | ib_dma_unmap_page(ic->i_cm_id->device, | ||
77 | frag->f_mapped, | ||
78 | RDS_FRAG_SIZE, DMA_FROM_DEVICE); | ||
79 | frag->f_mapped = 0; | ||
80 | } | ||
81 | |||
82 | void rds_ib_recv_init_ring(struct rds_ib_connection *ic) | 46 | void rds_ib_recv_init_ring(struct rds_ib_connection *ic) |
83 | { | 47 | { |
84 | struct rds_ib_recv_work *recv; | 48 | struct rds_ib_recv_work *recv; |
@@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) | |||
95 | recv->r_wr.sg_list = recv->r_sge; | 59 | recv->r_wr.sg_list = recv->r_sge; |
96 | recv->r_wr.num_sge = RDS_IB_RECV_SGE; | 60 | recv->r_wr.num_sge = RDS_IB_RECV_SGE; |
97 | 61 | ||
98 | sge = rds_ib_data_sge(ic, recv->r_sge); | 62 | sge = &recv->r_sge[0]; |
63 | sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); | ||
64 | sge->length = sizeof(struct rds_header); | ||
65 | sge->lkey = ic->i_mr->lkey; | ||
66 | |||
67 | sge = &recv->r_sge[1]; | ||
99 | sge->addr = 0; | 68 | sge->addr = 0; |
100 | sge->length = RDS_FRAG_SIZE; | 69 | sge->length = RDS_FRAG_SIZE; |
101 | sge->lkey = ic->i_mr->lkey; | 70 | sge->lkey = ic->i_mr->lkey; |
71 | } | ||
72 | } | ||
102 | 73 | ||
103 | sge = rds_ib_header_sge(ic, recv->r_sge); | 74 | /* |
104 | sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); | 75 | * The entire 'from' list, including the from element itself, is put on |
105 | sge->length = sizeof(struct rds_header); | 76 | * to the tail of the 'to' list. |
106 | sge->lkey = ic->i_mr->lkey; | 77 | */ |
78 | static void list_splice_entire_tail(struct list_head *from, | ||
79 | struct list_head *to) | ||
80 | { | ||
81 | struct list_head *from_last = from->prev; | ||
82 | |||
83 | list_splice_tail(from_last, to); | ||
84 | list_add_tail(from_last, to); | ||
85 | } | ||
86 | |||
87 | static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache) | ||
88 | { | ||
89 | struct list_head *tmp; | ||
90 | |||
91 | tmp = xchg(&cache->xfer, NULL); | ||
92 | if (tmp) { | ||
93 | if (cache->ready) | ||
94 | list_splice_entire_tail(tmp, cache->ready); | ||
95 | else | ||
96 | cache->ready = tmp; | ||
97 | } | ||
98 | } | ||
99 | |||
100 | static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) | ||
101 | { | ||
102 | struct rds_ib_cache_head *head; | ||
103 | int cpu; | ||
104 | |||
105 | cache->percpu = alloc_percpu(struct rds_ib_cache_head); | ||
106 | if (!cache->percpu) | ||
107 | return -ENOMEM; | ||
108 | |||
109 | for_each_possible_cpu(cpu) { | ||
110 | head = per_cpu_ptr(cache->percpu, cpu); | ||
111 | head->first = NULL; | ||
112 | head->count = 0; | ||
113 | } | ||
114 | cache->xfer = NULL; | ||
115 | cache->ready = NULL; | ||
116 | |||
117 | return 0; | ||
118 | } | ||
119 | |||
120 | int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic) | ||
121 | { | ||
122 | int ret; | ||
123 | |||
124 | ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs); | ||
125 | if (!ret) { | ||
126 | ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags); | ||
127 | if (ret) | ||
128 | free_percpu(ic->i_cache_incs.percpu); | ||
107 | } | 129 | } |
130 | |||
131 | return ret; | ||
132 | } | ||
133 | |||
134 | static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache, | ||
135 | struct list_head *caller_list) | ||
136 | { | ||
137 | struct rds_ib_cache_head *head; | ||
138 | int cpu; | ||
139 | |||
140 | for_each_possible_cpu(cpu) { | ||
141 | head = per_cpu_ptr(cache->percpu, cpu); | ||
142 | if (head->first) { | ||
143 | list_splice_entire_tail(head->first, caller_list); | ||
144 | head->first = NULL; | ||
145 | } | ||
146 | } | ||
147 | |||
148 | if (cache->ready) { | ||
149 | list_splice_entire_tail(cache->ready, caller_list); | ||
150 | cache->ready = NULL; | ||
151 | } | ||
152 | } | ||
153 | |||
154 | void rds_ib_recv_free_caches(struct rds_ib_connection *ic) | ||
155 | { | ||
156 | struct rds_ib_incoming *inc; | ||
157 | struct rds_ib_incoming *inc_tmp; | ||
158 | struct rds_page_frag *frag; | ||
159 | struct rds_page_frag *frag_tmp; | ||
160 | LIST_HEAD(list); | ||
161 | |||
162 | rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); | ||
163 | rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list); | ||
164 | free_percpu(ic->i_cache_incs.percpu); | ||
165 | |||
166 | list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) { | ||
167 | list_del(&inc->ii_cache_entry); | ||
168 | WARN_ON(!list_empty(&inc->ii_frags)); | ||
169 | kmem_cache_free(rds_ib_incoming_slab, inc); | ||
170 | } | ||
171 | |||
172 | rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); | ||
173 | rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list); | ||
174 | free_percpu(ic->i_cache_frags.percpu); | ||
175 | |||
176 | list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) { | ||
177 | list_del(&frag->f_cache_entry); | ||
178 | WARN_ON(!list_empty(&frag->f_item)); | ||
179 | kmem_cache_free(rds_ib_frag_slab, frag); | ||
180 | } | ||
181 | } | ||
182 | |||
183 | /* fwd decl */ | ||
184 | static void rds_ib_recv_cache_put(struct list_head *new_item, | ||
185 | struct rds_ib_refill_cache *cache); | ||
186 | static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache); | ||
187 | |||
188 | |||
189 | /* Recycle frag and attached recv buffer f_sg */ | ||
190 | static void rds_ib_frag_free(struct rds_ib_connection *ic, | ||
191 | struct rds_page_frag *frag) | ||
192 | { | ||
193 | rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); | ||
194 | |||
195 | rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); | ||
196 | } | ||
197 | |||
198 | /* Recycle inc after freeing attached frags */ | ||
199 | void rds_ib_inc_free(struct rds_incoming *inc) | ||
200 | { | ||
201 | struct rds_ib_incoming *ibinc; | ||
202 | struct rds_page_frag *frag; | ||
203 | struct rds_page_frag *pos; | ||
204 | struct rds_ib_connection *ic = inc->i_conn->c_transport_data; | ||
205 | |||
206 | ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); | ||
207 | |||
208 | /* Free attached frags */ | ||
209 | list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { | ||
210 | list_del_init(&frag->f_item); | ||
211 | rds_ib_frag_free(ic, frag); | ||
212 | } | ||
213 | BUG_ON(!list_empty(&ibinc->ii_frags)); | ||
214 | |||
215 | rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); | ||
216 | rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs); | ||
108 | } | 217 | } |
109 | 218 | ||
110 | static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, | 219 | static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, |
@@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, | |||
115 | recv->r_ibinc = NULL; | 224 | recv->r_ibinc = NULL; |
116 | } | 225 | } |
117 | if (recv->r_frag) { | 226 | if (recv->r_frag) { |
118 | rds_ib_recv_unmap_page(ic, recv); | 227 | ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); |
119 | if (recv->r_frag->f_page) | 228 | rds_ib_frag_free(ic, recv->r_frag); |
120 | rds_ib_frag_drop_page(recv->r_frag); | ||
121 | rds_ib_frag_free(recv->r_frag); | ||
122 | recv->r_frag = NULL; | 229 | recv->r_frag = NULL; |
123 | } | 230 | } |
124 | } | 231 | } |
@@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) | |||
129 | 236 | ||
130 | for (i = 0; i < ic->i_recv_ring.w_nr; i++) | 237 | for (i = 0; i < ic->i_recv_ring.w_nr; i++) |
131 | rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); | 238 | rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); |
132 | |||
133 | if (ic->i_frag.f_page) | ||
134 | rds_ib_frag_drop_page(&ic->i_frag); | ||
135 | } | 239 | } |
136 | 240 | ||
137 | static int rds_ib_recv_refill_one(struct rds_connection *conn, | 241 | static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic, |
138 | struct rds_ib_recv_work *recv, | 242 | gfp_t slab_mask) |
139 | gfp_t kptr_gfp, gfp_t page_gfp) | ||
140 | { | 243 | { |
141 | struct rds_ib_connection *ic = conn->c_transport_data; | 244 | struct rds_ib_incoming *ibinc; |
142 | dma_addr_t dma_addr; | 245 | struct list_head *cache_item; |
143 | struct ib_sge *sge; | 246 | int avail_allocs; |
144 | int ret = -ENOMEM; | ||
145 | 247 | ||
146 | if (recv->r_ibinc == NULL) { | 248 | cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs); |
147 | if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) { | 249 | if (cache_item) { |
250 | ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry); | ||
251 | } else { | ||
252 | avail_allocs = atomic_add_unless(&rds_ib_allocation, | ||
253 | 1, rds_ib_sysctl_max_recv_allocation); | ||
254 | if (!avail_allocs) { | ||
148 | rds_ib_stats_inc(s_ib_rx_alloc_limit); | 255 | rds_ib_stats_inc(s_ib_rx_alloc_limit); |
149 | goto out; | 256 | return NULL; |
150 | } | 257 | } |
151 | recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, | 258 | ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask); |
152 | kptr_gfp); | 259 | if (!ibinc) { |
153 | if (recv->r_ibinc == NULL) { | ||
154 | atomic_dec(&rds_ib_allocation); | 260 | atomic_dec(&rds_ib_allocation); |
155 | goto out; | 261 | return NULL; |
156 | } | 262 | } |
157 | INIT_LIST_HEAD(&recv->r_ibinc->ii_frags); | ||
158 | rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); | ||
159 | } | 263 | } |
264 | INIT_LIST_HEAD(&ibinc->ii_frags); | ||
265 | rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); | ||
160 | 266 | ||
161 | if (recv->r_frag == NULL) { | 267 | return ibinc; |
162 | recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); | 268 | } |
163 | if (recv->r_frag == NULL) | 269 | |
164 | goto out; | 270 | static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic, |
165 | INIT_LIST_HEAD(&recv->r_frag->f_item); | 271 | gfp_t slab_mask, gfp_t page_mask) |
166 | recv->r_frag->f_page = NULL; | 272 | { |
273 | struct rds_page_frag *frag; | ||
274 | struct list_head *cache_item; | ||
275 | int ret; | ||
276 | |||
277 | cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); | ||
278 | if (cache_item) { | ||
279 | frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); | ||
280 | } else { | ||
281 | frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); | ||
282 | if (!frag) | ||
283 | return NULL; | ||
284 | |||
285 | sg_init_table(&frag->f_sg, 1); | ||
286 | ret = rds_page_remainder_alloc(&frag->f_sg, | ||
287 | RDS_FRAG_SIZE, page_mask); | ||
288 | if (ret) { | ||
289 | kmem_cache_free(rds_ib_frag_slab, frag); | ||
290 | return NULL; | ||
291 | } | ||
167 | } | 292 | } |
168 | 293 | ||
169 | if (ic->i_frag.f_page == NULL) { | 294 | INIT_LIST_HEAD(&frag->f_item); |
170 | ic->i_frag.f_page = alloc_page(page_gfp); | 295 | |
171 | if (ic->i_frag.f_page == NULL) | 296 | return frag; |
172 | goto out; | 297 | } |
173 | ic->i_frag.f_offset = 0; | 298 | |
299 | static int rds_ib_recv_refill_one(struct rds_connection *conn, | ||
300 | struct rds_ib_recv_work *recv, int prefill) | ||
301 | { | ||
302 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
303 | struct ib_sge *sge; | ||
304 | int ret = -ENOMEM; | ||
305 | gfp_t slab_mask = GFP_NOWAIT; | ||
306 | gfp_t page_mask = GFP_NOWAIT; | ||
307 | |||
308 | if (prefill) { | ||
309 | slab_mask = GFP_KERNEL; | ||
310 | page_mask = GFP_HIGHUSER; | ||
174 | } | 311 | } |
175 | 312 | ||
176 | dma_addr = ib_dma_map_page(ic->i_cm_id->device, | 313 | if (!ic->i_cache_incs.ready) |
177 | ic->i_frag.f_page, | 314 | rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); |
178 | ic->i_frag.f_offset, | 315 | if (!ic->i_cache_frags.ready) |
179 | RDS_FRAG_SIZE, | 316 | rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); |
180 | DMA_FROM_DEVICE); | ||
181 | if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) | ||
182 | goto out; | ||
183 | 317 | ||
184 | /* | 318 | /* |
185 | * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap() | 319 | * ibinc was taken from recv if recv contained the start of a message. |
186 | * must be called on this recv. This happens as completions hit | 320 | * recvs that were continuations will still have this allocated. |
187 | * in order or on connection shutdown. | ||
188 | */ | 321 | */ |
189 | recv->r_frag->f_page = ic->i_frag.f_page; | 322 | if (!recv->r_ibinc) { |
190 | recv->r_frag->f_offset = ic->i_frag.f_offset; | 323 | recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask); |
191 | recv->r_frag->f_mapped = dma_addr; | 324 | if (!recv->r_ibinc) |
325 | goto out; | ||
326 | } | ||
192 | 327 | ||
193 | sge = rds_ib_data_sge(ic, recv->r_sge); | 328 | WARN_ON(recv->r_frag); /* leak! */ |
194 | sge->addr = dma_addr; | 329 | recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask); |
195 | sge->length = RDS_FRAG_SIZE; | 330 | if (!recv->r_frag) |
331 | goto out; | ||
332 | |||
333 | ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, | ||
334 | 1, DMA_FROM_DEVICE); | ||
335 | WARN_ON(ret != 1); | ||
196 | 336 | ||
197 | sge = rds_ib_header_sge(ic, recv->r_sge); | 337 | sge = &recv->r_sge[0]; |
198 | sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); | 338 | sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); |
199 | sge->length = sizeof(struct rds_header); | 339 | sge->length = sizeof(struct rds_header); |
200 | 340 | ||
201 | get_page(recv->r_frag->f_page); | 341 | sge = &recv->r_sge[1]; |
202 | 342 | sge->addr = sg_dma_address(&recv->r_frag->f_sg); | |
203 | if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { | 343 | sge->length = sg_dma_len(&recv->r_frag->f_sg); |
204 | ic->i_frag.f_offset += RDS_FRAG_SIZE; | ||
205 | } else { | ||
206 | put_page(ic->i_frag.f_page); | ||
207 | ic->i_frag.f_page = NULL; | ||
208 | ic->i_frag.f_offset = 0; | ||
209 | } | ||
210 | 344 | ||
211 | ret = 0; | 345 | ret = 0; |
212 | out: | 346 | out: |
@@ -216,13 +350,11 @@ out: | |||
216 | /* | 350 | /* |
217 | * This tries to allocate and post unused work requests after making sure that | 351 | * This tries to allocate and post unused work requests after making sure that |
218 | * they have all the allocations they need to queue received fragments into | 352 | * they have all the allocations they need to queue received fragments into |
219 | * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc | 353 | * sockets. |
220 | * pairs don't go unmatched. | ||
221 | * | 354 | * |
222 | * -1 is returned if posting fails due to temporary resource exhaustion. | 355 | * -1 is returned if posting fails due to temporary resource exhaustion. |
223 | */ | 356 | */ |
224 | int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | 357 | void rds_ib_recv_refill(struct rds_connection *conn, int prefill) |
225 | gfp_t page_gfp, int prefill) | ||
226 | { | 358 | { |
227 | struct rds_ib_connection *ic = conn->c_transport_data; | 359 | struct rds_ib_connection *ic = conn->c_transport_data; |
228 | struct rds_ib_recv_work *recv; | 360 | struct rds_ib_recv_work *recv; |
@@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | |||
236 | if (pos >= ic->i_recv_ring.w_nr) { | 368 | if (pos >= ic->i_recv_ring.w_nr) { |
237 | printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", | 369 | printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", |
238 | pos); | 370 | pos); |
239 | ret = -EINVAL; | ||
240 | break; | 371 | break; |
241 | } | 372 | } |
242 | 373 | ||
243 | recv = &ic->i_recvs[pos]; | 374 | recv = &ic->i_recvs[pos]; |
244 | ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); | 375 | ret = rds_ib_recv_refill_one(conn, recv, prefill); |
245 | if (ret) { | 376 | if (ret) { |
246 | ret = -1; | ||
247 | break; | 377 | break; |
248 | } | 378 | } |
249 | 379 | ||
250 | /* XXX when can this fail? */ | 380 | /* XXX when can this fail? */ |
251 | ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); | 381 | ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); |
252 | rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, | 382 | rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, |
253 | recv->r_ibinc, recv->r_frag->f_page, | 383 | recv->r_ibinc, sg_page(&recv->r_frag->f_sg), |
254 | (long) recv->r_frag->f_mapped, ret); | 384 | (long) sg_dma_address(&recv->r_frag->f_sg), ret); |
255 | if (ret) { | 385 | if (ret) { |
256 | rds_ib_conn_error(conn, "recv post on " | 386 | rds_ib_conn_error(conn, "recv post on " |
257 | "%pI4 returned %d, disconnecting and " | 387 | "%pI4 returned %d, disconnecting and " |
258 | "reconnecting\n", &conn->c_faddr, | 388 | "reconnecting\n", &conn->c_faddr, |
259 | ret); | 389 | ret); |
260 | ret = -1; | ||
261 | break; | 390 | break; |
262 | } | 391 | } |
263 | 392 | ||
@@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | |||
270 | 399 | ||
271 | if (ret) | 400 | if (ret) |
272 | rds_ib_ring_unalloc(&ic->i_recv_ring, 1); | 401 | rds_ib_ring_unalloc(&ic->i_recv_ring, 1); |
273 | return ret; | ||
274 | } | 402 | } |
275 | 403 | ||
276 | void rds_ib_inc_purge(struct rds_incoming *inc) | 404 | /* |
405 | * We want to recycle several types of recv allocations, like incs and frags. | ||
406 | * To use this, the *_free() function passes in the ptr to a list_head within | ||
407 | * the recyclee, as well as the cache to put it on. | ||
408 | * | ||
409 | * First, we put the memory on a percpu list. When this reaches a certain size, | ||
410 | * we move it to an intermediate non-percpu list in a lockless manner, with some | ||
411 | * xchg/cmpxchg wizardry. | ||
412 | * | ||
413 | * N.B. Instead of a list_head as the anchor, we use a single pointer, which can | ||
414 | * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and | ||
415 | * list_empty() will return true when one element is actually present. | ||
416 | */ | ||
417 | static void rds_ib_recv_cache_put(struct list_head *new_item, | ||
418 | struct rds_ib_refill_cache *cache) | ||
277 | { | 419 | { |
278 | struct rds_ib_incoming *ibinc; | 420 | unsigned long flags; |
279 | struct rds_page_frag *frag; | 421 | struct rds_ib_cache_head *chp; |
280 | struct rds_page_frag *pos; | 422 | struct list_head *old; |
281 | 423 | ||
282 | ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); | 424 | local_irq_save(flags); |
283 | rdsdebug("purging ibinc %p inc %p\n", ibinc, inc); | ||
284 | 425 | ||
285 | list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { | 426 | chp = per_cpu_ptr(cache->percpu, smp_processor_id()); |
286 | list_del_init(&frag->f_item); | 427 | if (!chp->first) |
287 | rds_ib_frag_drop_page(frag); | 428 | INIT_LIST_HEAD(new_item); |
288 | rds_ib_frag_free(frag); | 429 | else /* put on front */ |
289 | } | 430 | list_add_tail(new_item, chp->first); |
431 | chp->first = new_item; | ||
432 | chp->count++; | ||
433 | |||
434 | if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT) | ||
435 | goto end; | ||
436 | |||
437 | /* | ||
438 | * Return our per-cpu first list to the cache's xfer by atomically | ||
439 | * grabbing the current xfer list, appending it to our per-cpu list, | ||
440 | * and then atomically returning that entire list back to the | ||
441 | * cache's xfer list as long as it's still empty. | ||
442 | */ | ||
443 | do { | ||
444 | old = xchg(&cache->xfer, NULL); | ||
445 | if (old) | ||
446 | list_splice_entire_tail(old, chp->first); | ||
447 | old = cmpxchg(&cache->xfer, NULL, chp->first); | ||
448 | } while (old); | ||
449 | |||
450 | chp->first = NULL; | ||
451 | chp->count = 0; | ||
452 | end: | ||
453 | local_irq_restore(flags); | ||
290 | } | 454 | } |
291 | 455 | ||
292 | void rds_ib_inc_free(struct rds_incoming *inc) | 456 | static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache) |
293 | { | 457 | { |
294 | struct rds_ib_incoming *ibinc; | 458 | struct list_head *head = cache->ready; |
295 | 459 | ||
296 | ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); | 460 | if (head) { |
461 | if (!list_empty(head)) { | ||
462 | cache->ready = head->next; | ||
463 | list_del_init(head); | ||
464 | } else | ||
465 | cache->ready = NULL; | ||
466 | } | ||
297 | 467 | ||
298 | rds_ib_inc_purge(inc); | 468 | return head; |
299 | rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); | ||
300 | BUG_ON(!list_empty(&ibinc->ii_frags)); | ||
301 | kmem_cache_free(rds_ib_incoming_slab, ibinc); | ||
302 | atomic_dec(&rds_ib_allocation); | ||
303 | BUG_ON(atomic_read(&rds_ib_allocation) < 0); | ||
304 | } | 469 | } |
305 | 470 | ||
306 | int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, | 471 | int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, |
@@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, | |||
336 | to_copy = min_t(unsigned long, to_copy, len - copied); | 501 | to_copy = min_t(unsigned long, to_copy, len - copied); |
337 | 502 | ||
338 | rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " | 503 | rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " |
339 | "[%p, %lu] + %lu\n", | 504 | "[%p, %u] + %lu\n", |
340 | to_copy, iov->iov_base, iov->iov_len, iov_off, | 505 | to_copy, iov->iov_base, iov->iov_len, iov_off, |
341 | frag->f_page, frag->f_offset, frag_off); | 506 | sg_page(&frag->f_sg), frag->f_sg.offset, frag_off); |
342 | 507 | ||
343 | /* XXX needs + offset for multiple recvs per page */ | 508 | /* XXX needs + offset for multiple recvs per page */ |
344 | ret = rds_page_copy_to_user(frag->f_page, | 509 | ret = rds_page_copy_to_user(sg_page(&frag->f_sg), |
345 | frag->f_offset + frag_off, | 510 | frag->f_sg.offset + frag_off, |
346 | iov->iov_base + iov_off, | 511 | iov->iov_base + iov_off, |
347 | to_copy); | 512 | to_copy); |
348 | if (ret) { | 513 | if (ret) { |
@@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) | |||
557 | return rds_ib_get_ack(ic); | 722 | return rds_ib_get_ack(ic); |
558 | } | 723 | } |
559 | 724 | ||
560 | static struct rds_header *rds_ib_get_header(struct rds_connection *conn, | ||
561 | struct rds_ib_recv_work *recv, | ||
562 | u32 data_len) | ||
563 | { | ||
564 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
565 | void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs]; | ||
566 | void *addr; | ||
567 | u32 misplaced_hdr_bytes; | ||
568 | |||
569 | /* | ||
570 | * Support header at the front (RDS 3.1+) as well as header-at-end. | ||
571 | * | ||
572 | * Cases: | ||
573 | * 1) header all in header buff (great!) | ||
574 | * 2) header all in data page (copy all to header buff) | ||
575 | * 3) header split across hdr buf + data page | ||
576 | * (move bit in hdr buff to end before copying other bit from data page) | ||
577 | */ | ||
578 | if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE) | ||
579 | return hdr_buff; | ||
580 | |||
581 | if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) { | ||
582 | addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); | ||
583 | memcpy(hdr_buff, | ||
584 | addr + recv->r_frag->f_offset + data_len, | ||
585 | sizeof(struct rds_header)); | ||
586 | kunmap_atomic(addr, KM_SOFTIRQ0); | ||
587 | return hdr_buff; | ||
588 | } | ||
589 | |||
590 | misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len)); | ||
591 | |||
592 | memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes); | ||
593 | |||
594 | addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); | ||
595 | memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len, | ||
596 | sizeof(struct rds_header) - misplaced_hdr_bytes); | ||
597 | kunmap_atomic(addr, KM_SOFTIRQ0); | ||
598 | return hdr_buff; | ||
599 | } | ||
600 | |||
601 | /* | 725 | /* |
602 | * It's kind of lame that we're copying from the posted receive pages into | 726 | * It's kind of lame that we're copying from the posted receive pages into |
603 | * long-lived bitmaps. We could have posted the bitmaps and rdma written into | 727 | * long-lived bitmaps. We could have posted the bitmaps and rdma written into |
@@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, | |||
639 | to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); | 763 | to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); |
640 | BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ | 764 | BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ |
641 | 765 | ||
642 | addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); | 766 | addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0); |
643 | 767 | ||
644 | src = addr + frag_off; | 768 | src = addr + frag_off; |
645 | dst = (void *)map->m_page_addrs[map_page] + map_off; | 769 | dst = (void *)map->m_page_addrs[map_page] + map_off; |
@@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, | |||
710 | } | 834 | } |
711 | data_len -= sizeof(struct rds_header); | 835 | data_len -= sizeof(struct rds_header); |
712 | 836 | ||
713 | ihdr = rds_ib_get_header(conn, recv, data_len); | 837 | ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; |
714 | 838 | ||
715 | /* Validate the checksum. */ | 839 | /* Validate the checksum. */ |
716 | if (!rds_message_verify_checksum(ihdr)) { | 840 | if (!rds_message_verify_checksum(ihdr)) { |
@@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn, | |||
742 | * the inc is freed. We don't go that route, so we have to drop the | 866 | * the inc is freed. We don't go that route, so we have to drop the |
743 | * page ref ourselves. We can't just leave the page on the recv | 867 | * page ref ourselves. We can't just leave the page on the recv |
744 | * because that confuses the dma mapping of pages and each recv's use | 868 | * because that confuses the dma mapping of pages and each recv's use |
745 | * of a partial page. We can leave the frag, though, it will be | 869 | * of a partial page. |
746 | * reused. | ||
747 | * | 870 | * |
748 | * FIXME: Fold this into the code path below. | 871 | * FIXME: Fold this into the code path below. |
749 | */ | 872 | */ |
750 | rds_ib_frag_drop_page(recv->r_frag); | 873 | rds_ib_frag_free(ic, recv->r_frag); |
874 | recv->r_frag = NULL; | ||
751 | return; | 875 | return; |
752 | } | 876 | } |
753 | 877 | ||
@@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, | |||
757 | * into the inc and save the inc so we can hang upcoming fragments | 881 | * into the inc and save the inc so we can hang upcoming fragments |
758 | * off its list. | 882 | * off its list. |
759 | */ | 883 | */ |
760 | if (ibinc == NULL) { | 884 | if (!ibinc) { |
761 | ibinc = recv->r_ibinc; | 885 | ibinc = recv->r_ibinc; |
762 | recv->r_ibinc = NULL; | 886 | recv->r_ibinc = NULL; |
763 | ic->i_ibinc = ibinc; | 887 | ic->i_ibinc = ibinc; |
@@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, | |||
842 | struct rds_ib_recv_work *recv; | 966 | struct rds_ib_recv_work *recv; |
843 | 967 | ||
844 | while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { | 968 | while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { |
845 | rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", | 969 | rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", |
846 | (unsigned long long)wc.wr_id, wc.status, wc.byte_len, | 970 | (unsigned long long)wc.wr_id, wc.status, |
971 | rds_ib_wc_status_str(wc.status), wc.byte_len, | ||
847 | be32_to_cpu(wc.ex.imm_data)); | 972 | be32_to_cpu(wc.ex.imm_data)); |
848 | rds_ib_stats_inc(s_ib_rx_cq_event); | 973 | rds_ib_stats_inc(s_ib_rx_cq_event); |
849 | 974 | ||
850 | recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; | 975 | recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; |
851 | 976 | ||
852 | rds_ib_recv_unmap_page(ic, recv); | 977 | ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); |
853 | 978 | ||
854 | /* | 979 | /* |
855 | * Also process recvs in connecting state because it is possible | 980 | * Also process recvs in connecting state because it is possible |
856 | * to get a recv completion _before_ the rdmacm ESTABLISHED | 981 | * to get a recv completion _before_ the rdmacm ESTABLISHED |
857 | * event is processed. | 982 | * event is processed. |
858 | */ | 983 | */ |
859 | if (rds_conn_up(conn) || rds_conn_connecting(conn)) { | 984 | if (wc.status == IB_WC_SUCCESS) { |
985 | rds_ib_process_recv(conn, recv, wc.byte_len, state); | ||
986 | } else { | ||
860 | /* We expect errors as the qp is drained during shutdown */ | 987 | /* We expect errors as the qp is drained during shutdown */ |
861 | if (wc.status == IB_WC_SUCCESS) { | 988 | if (rds_conn_up(conn) || rds_conn_connecting(conn)) |
862 | rds_ib_process_recv(conn, recv, wc.byte_len, state); | 989 | rds_ib_conn_error(conn, "recv completion on %pI4 had " |
863 | } else { | 990 | "status %u (%s), disconnecting and " |
864 | rds_ib_conn_error(conn, "recv completion on " | 991 | "reconnecting\n", &conn->c_faddr, |
865 | "%pI4 had status %u, disconnecting and " | 992 | wc.status, |
866 | "reconnecting\n", &conn->c_faddr, | 993 | rds_ib_wc_status_str(wc.status)); |
867 | wc.status); | ||
868 | } | ||
869 | } | 994 | } |
870 | 995 | ||
996 | /* | ||
997 | * It's very important that we only free this ring entry if we've truly | ||
998 | * freed the resources allocated to the entry. The refilling path can | ||
999 | * leak if we don't. | ||
1000 | */ | ||
871 | rds_ib_ring_free(&ic->i_recv_ring, 1); | 1001 | rds_ib_ring_free(&ic->i_recv_ring, 1); |
872 | } | 1002 | } |
873 | } | 1003 | } |
@@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data) | |||
897 | if (rds_ib_ring_empty(&ic->i_recv_ring)) | 1027 | if (rds_ib_ring_empty(&ic->i_recv_ring)) |
898 | rds_ib_stats_inc(s_ib_rx_ring_empty); | 1028 | rds_ib_stats_inc(s_ib_rx_ring_empty); |
899 | 1029 | ||
900 | /* | ||
901 | * If the ring is running low, then schedule the thread to refill. | ||
902 | */ | ||
903 | if (rds_ib_ring_low(&ic->i_recv_ring)) | 1030 | if (rds_ib_ring_low(&ic->i_recv_ring)) |
904 | queue_delayed_work(rds_wq, &conn->c_recv_w, 0); | 1031 | rds_ib_recv_refill(conn, 0); |
905 | } | 1032 | } |
906 | 1033 | ||
907 | int rds_ib_recv(struct rds_connection *conn) | 1034 | int rds_ib_recv(struct rds_connection *conn) |
@@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn) | |||
910 | int ret = 0; | 1037 | int ret = 0; |
911 | 1038 | ||
912 | rdsdebug("conn %p\n", conn); | 1039 | rdsdebug("conn %p\n", conn); |
913 | |||
914 | /* | ||
915 | * If we get a temporary posting failure in this context then | ||
916 | * we're really low and we want the caller to back off for a bit. | ||
917 | */ | ||
918 | mutex_lock(&ic->i_recv_mutex); | ||
919 | if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) | ||
920 | ret = -ENOMEM; | ||
921 | else | ||
922 | rds_ib_stats_inc(s_ib_rx_refill_from_thread); | ||
923 | mutex_unlock(&ic->i_recv_mutex); | ||
924 | |||
925 | if (rds_conn_up(conn)) | 1040 | if (rds_conn_up(conn)) |
926 | rds_ib_attempt_ack(ic); | 1041 | rds_ib_attempt_ack(ic); |
927 | 1042 | ||
928 | return ret; | 1043 | return ret; |
929 | } | 1044 | } |
930 | 1045 | ||
931 | int __init rds_ib_recv_init(void) | 1046 | int rds_ib_recv_init(void) |
932 | { | 1047 | { |
933 | struct sysinfo si; | 1048 | struct sysinfo si; |
934 | int ret = -ENOMEM; | 1049 | int ret = -ENOMEM; |
@@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void) | |||
939 | 1054 | ||
940 | rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", | 1055 | rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", |
941 | sizeof(struct rds_ib_incoming), | 1056 | sizeof(struct rds_ib_incoming), |
942 | 0, 0, NULL); | 1057 | 0, SLAB_HWCACHE_ALIGN, NULL); |
943 | if (rds_ib_incoming_slab == NULL) | 1058 | if (!rds_ib_incoming_slab) |
944 | goto out; | 1059 | goto out; |
945 | 1060 | ||
946 | rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", | 1061 | rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", |
947 | sizeof(struct rds_page_frag), | 1062 | sizeof(struct rds_page_frag), |
948 | 0, 0, NULL); | 1063 | 0, SLAB_HWCACHE_ALIGN, NULL); |
949 | if (rds_ib_frag_slab == NULL) | 1064 | if (!rds_ib_frag_slab) |
950 | kmem_cache_destroy(rds_ib_incoming_slab); | 1065 | kmem_cache_destroy(rds_ib_incoming_slab); |
951 | else | 1066 | else |
952 | ret = 0; | 1067 | ret = 0; |
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 17fa80803ab..71f373c421b 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c | |||
@@ -36,11 +36,49 @@ | |||
36 | #include <linux/dmapool.h> | 36 | #include <linux/dmapool.h> |
37 | 37 | ||
38 | #include "rds.h" | 38 | #include "rds.h" |
39 | #include "rdma.h" | ||
40 | #include "ib.h" | 39 | #include "ib.h" |
41 | 40 | ||
42 | static void rds_ib_send_rdma_complete(struct rds_message *rm, | 41 | static char *rds_ib_wc_status_strings[] = { |
43 | int wc_status) | 42 | #define RDS_IB_WC_STATUS_STR(foo) \ |
43 | [IB_WC_##foo] = __stringify(IB_WC_##foo) | ||
44 | RDS_IB_WC_STATUS_STR(SUCCESS), | ||
45 | RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), | ||
46 | RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), | ||
47 | RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), | ||
48 | RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), | ||
49 | RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), | ||
50 | RDS_IB_WC_STATUS_STR(MW_BIND_ERR), | ||
51 | RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), | ||
52 | RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), | ||
53 | RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), | ||
54 | RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), | ||
55 | RDS_IB_WC_STATUS_STR(REM_OP_ERR), | ||
56 | RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), | ||
57 | RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), | ||
58 | RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), | ||
59 | RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), | ||
60 | RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), | ||
61 | RDS_IB_WC_STATUS_STR(INV_EECN_ERR), | ||
62 | RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), | ||
63 | RDS_IB_WC_STATUS_STR(FATAL_ERR), | ||
64 | RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), | ||
65 | RDS_IB_WC_STATUS_STR(GENERAL_ERR), | ||
66 | #undef RDS_IB_WC_STATUS_STR | ||
67 | }; | ||
68 | |||
69 | char *rds_ib_wc_status_str(enum ib_wc_status status) | ||
70 | { | ||
71 | return rds_str_array(rds_ib_wc_status_strings, | ||
72 | ARRAY_SIZE(rds_ib_wc_status_strings), status); | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * Convert IB-specific error message to RDS error message and call core | ||
77 | * completion handler. | ||
78 | */ | ||
79 | static void rds_ib_send_complete(struct rds_message *rm, | ||
80 | int wc_status, | ||
81 | void (*complete)(struct rds_message *rm, int status)) | ||
44 | { | 82 | { |
45 | int notify_status; | 83 | int notify_status; |
46 | 84 | ||
@@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm, | |||
60 | notify_status = RDS_RDMA_OTHER_ERROR; | 98 | notify_status = RDS_RDMA_OTHER_ERROR; |
61 | break; | 99 | break; |
62 | } | 100 | } |
63 | rds_rdma_send_complete(rm, notify_status); | 101 | complete(rm, notify_status); |
102 | } | ||
103 | |||
104 | static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, | ||
105 | struct rm_data_op *op, | ||
106 | int wc_status) | ||
107 | { | ||
108 | if (op->op_nents) | ||
109 | ib_dma_unmap_sg(ic->i_cm_id->device, | ||
110 | op->op_sg, op->op_nents, | ||
111 | DMA_TO_DEVICE); | ||
64 | } | 112 | } |
65 | 113 | ||
66 | static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, | 114 | static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, |
67 | struct rds_rdma_op *op) | 115 | struct rm_rdma_op *op, |
116 | int wc_status) | ||
68 | { | 117 | { |
69 | if (op->r_mapped) { | 118 | if (op->op_mapped) { |
70 | ib_dma_unmap_sg(ic->i_cm_id->device, | 119 | ib_dma_unmap_sg(ic->i_cm_id->device, |
71 | op->r_sg, op->r_nents, | 120 | op->op_sg, op->op_nents, |
72 | op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); | 121 | op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); |
73 | op->r_mapped = 0; | 122 | op->op_mapped = 0; |
74 | } | 123 | } |
124 | |||
125 | /* If the user asked for a completion notification on this | ||
126 | * message, we can implement three different semantics: | ||
127 | * 1. Notify when we received the ACK on the RDS message | ||
128 | * that was queued with the RDMA. This provides reliable | ||
129 | * notification of RDMA status at the expense of a one-way | ||
130 | * packet delay. | ||
131 | * 2. Notify when the IB stack gives us the completion event for | ||
132 | * the RDMA operation. | ||
133 | * 3. Notify when the IB stack gives us the completion event for | ||
134 | * the accompanying RDS messages. | ||
135 | * Here, we implement approach #3. To implement approach #2, | ||
136 | * we would need to take an event for the rdma WR. To implement #1, | ||
137 | * don't call rds_rdma_send_complete at all, and fall back to the notify | ||
138 | * handling in the ACK processing code. | ||
139 | * | ||
140 | * Note: There's no need to explicitly sync any RDMA buffers using | ||
141 | * ib_dma_sync_sg_for_cpu - the completion for the RDMA | ||
142 | * operation itself unmapped the RDMA buffers, which takes care | ||
143 | * of synching. | ||
144 | */ | ||
145 | rds_ib_send_complete(container_of(op, struct rds_message, rdma), | ||
146 | wc_status, rds_rdma_send_complete); | ||
147 | |||
148 | if (op->op_write) | ||
149 | rds_stats_add(s_send_rdma_bytes, op->op_bytes); | ||
150 | else | ||
151 | rds_stats_add(s_recv_rdma_bytes, op->op_bytes); | ||
75 | } | 152 | } |
76 | 153 | ||
77 | static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, | 154 | static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, |
78 | struct rds_ib_send_work *send, | 155 | struct rm_atomic_op *op, |
79 | int wc_status) | 156 | int wc_status) |
80 | { | 157 | { |
81 | struct rds_message *rm = send->s_rm; | 158 | /* unmap atomic recvbuf */ |
82 | 159 | if (op->op_mapped) { | |
83 | rdsdebug("ic %p send %p rm %p\n", ic, send, rm); | 160 | ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, |
84 | 161 | DMA_FROM_DEVICE); | |
85 | ib_dma_unmap_sg(ic->i_cm_id->device, | 162 | op->op_mapped = 0; |
86 | rm->m_sg, rm->m_nents, | 163 | } |
87 | DMA_TO_DEVICE); | ||
88 | |||
89 | if (rm->m_rdma_op != NULL) { | ||
90 | rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); | ||
91 | |||
92 | /* If the user asked for a completion notification on this | ||
93 | * message, we can implement three different semantics: | ||
94 | * 1. Notify when we received the ACK on the RDS message | ||
95 | * that was queued with the RDMA. This provides reliable | ||
96 | * notification of RDMA status at the expense of a one-way | ||
97 | * packet delay. | ||
98 | * 2. Notify when the IB stack gives us the completion event for | ||
99 | * the RDMA operation. | ||
100 | * 3. Notify when the IB stack gives us the completion event for | ||
101 | * the accompanying RDS messages. | ||
102 | * Here, we implement approach #3. To implement approach #2, | ||
103 | * call rds_rdma_send_complete from the cq_handler. To implement #1, | ||
104 | * don't call rds_rdma_send_complete at all, and fall back to the notify | ||
105 | * handling in the ACK processing code. | ||
106 | * | ||
107 | * Note: There's no need to explicitly sync any RDMA buffers using | ||
108 | * ib_dma_sync_sg_for_cpu - the completion for the RDMA | ||
109 | * operation itself unmapped the RDMA buffers, which takes care | ||
110 | * of synching. | ||
111 | */ | ||
112 | rds_ib_send_rdma_complete(rm, wc_status); | ||
113 | 164 | ||
114 | if (rm->m_rdma_op->r_write) | 165 | rds_ib_send_complete(container_of(op, struct rds_message, atomic), |
115 | rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); | 166 | wc_status, rds_atomic_send_complete); |
116 | else | 167 | |
117 | rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); | 168 | if (op->op_type == RDS_ATOMIC_TYPE_CSWP) |
169 | rds_ib_stats_inc(s_ib_atomic_cswp); | ||
170 | else | ||
171 | rds_ib_stats_inc(s_ib_atomic_fadd); | ||
172 | } | ||
173 | |||
174 | /* | ||
175 | * Unmap the resources associated with a struct send_work. | ||
176 | * | ||
177 | * Returns the rm for no good reason other than it is unobtainable | ||
178 | * other than by switching on wr.opcode, currently, and the caller, | ||
179 | * the event handler, needs it. | ||
180 | */ | ||
181 | static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic, | ||
182 | struct rds_ib_send_work *send, | ||
183 | int wc_status) | ||
184 | { | ||
185 | struct rds_message *rm = NULL; | ||
186 | |||
187 | /* In the error case, wc.opcode sometimes contains garbage */ | ||
188 | switch (send->s_wr.opcode) { | ||
189 | case IB_WR_SEND: | ||
190 | if (send->s_op) { | ||
191 | rm = container_of(send->s_op, struct rds_message, data); | ||
192 | rds_ib_send_unmap_data(ic, send->s_op, wc_status); | ||
193 | } | ||
194 | break; | ||
195 | case IB_WR_RDMA_WRITE: | ||
196 | case IB_WR_RDMA_READ: | ||
197 | if (send->s_op) { | ||
198 | rm = container_of(send->s_op, struct rds_message, rdma); | ||
199 | rds_ib_send_unmap_rdma(ic, send->s_op, wc_status); | ||
200 | } | ||
201 | break; | ||
202 | case IB_WR_ATOMIC_FETCH_AND_ADD: | ||
203 | case IB_WR_ATOMIC_CMP_AND_SWP: | ||
204 | if (send->s_op) { | ||
205 | rm = container_of(send->s_op, struct rds_message, atomic); | ||
206 | rds_ib_send_unmap_atomic(ic, send->s_op, wc_status); | ||
207 | } | ||
208 | break; | ||
209 | default: | ||
210 | if (printk_ratelimit()) | ||
211 | printk(KERN_NOTICE | ||
212 | "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", | ||
213 | __func__, send->s_wr.opcode); | ||
214 | break; | ||
118 | } | 215 | } |
119 | 216 | ||
120 | /* If anyone waited for this message to get flushed out, wake | 217 | send->s_wr.opcode = 0xdead; |
121 | * them up now */ | ||
122 | rds_message_unmapped(rm); | ||
123 | 218 | ||
124 | rds_message_put(rm); | 219 | return rm; |
125 | send->s_rm = NULL; | ||
126 | } | 220 | } |
127 | 221 | ||
128 | void rds_ib_send_init_ring(struct rds_ib_connection *ic) | 222 | void rds_ib_send_init_ring(struct rds_ib_connection *ic) |
@@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) | |||
133 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { | 227 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { |
134 | struct ib_sge *sge; | 228 | struct ib_sge *sge; |
135 | 229 | ||
136 | send->s_rm = NULL; | ||
137 | send->s_op = NULL; | 230 | send->s_op = NULL; |
138 | 231 | ||
139 | send->s_wr.wr_id = i; | 232 | send->s_wr.wr_id = i; |
140 | send->s_wr.sg_list = send->s_sge; | 233 | send->s_wr.sg_list = send->s_sge; |
141 | send->s_wr.num_sge = 1; | ||
142 | send->s_wr.opcode = IB_WR_SEND; | ||
143 | send->s_wr.send_flags = 0; | ||
144 | send->s_wr.ex.imm_data = 0; | 234 | send->s_wr.ex.imm_data = 0; |
145 | 235 | ||
146 | sge = rds_ib_data_sge(ic, send->s_sge); | 236 | sge = &send->s_sge[0]; |
147 | sge->lkey = ic->i_mr->lkey; | ||
148 | |||
149 | sge = rds_ib_header_sge(ic, send->s_sge); | ||
150 | sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); | 237 | sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); |
151 | sge->length = sizeof(struct rds_header); | 238 | sge->length = sizeof(struct rds_header); |
152 | sge->lkey = ic->i_mr->lkey; | 239 | sge->lkey = ic->i_mr->lkey; |
240 | |||
241 | send->s_sge[1].lkey = ic->i_mr->lkey; | ||
153 | } | 242 | } |
154 | } | 243 | } |
155 | 244 | ||
@@ -159,16 +248,24 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic) | |||
159 | u32 i; | 248 | u32 i; |
160 | 249 | ||
161 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { | 250 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { |
162 | if (send->s_wr.opcode == 0xdead) | 251 | if (send->s_op && send->s_wr.opcode != 0xdead) |
163 | continue; | 252 | rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR); |
164 | if (send->s_rm) | ||
165 | rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); | ||
166 | if (send->s_op) | ||
167 | rds_ib_send_unmap_rdma(ic, send->s_op); | ||
168 | } | 253 | } |
169 | } | 254 | } |
170 | 255 | ||
171 | /* | 256 | /* |
257 | * The only fast path caller always has a non-zero nr, so we don't | ||
258 | * bother testing nr before performing the atomic sub. | ||
259 | */ | ||
260 | static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) | ||
261 | { | ||
262 | if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) && | ||
263 | waitqueue_active(&rds_ib_ring_empty_wait)) | ||
264 | wake_up(&rds_ib_ring_empty_wait); | ||
265 | BUG_ON(atomic_read(&ic->i_signaled_sends) < 0); | ||
266 | } | ||
267 | |||
268 | /* | ||
172 | * The _oldest/_free ring operations here race cleanly with the alloc/unalloc | 269 | * The _oldest/_free ring operations here race cleanly with the alloc/unalloc |
173 | * operations performed in the send path. As the sender allocs and potentially | 270 | * operations performed in the send path. As the sender allocs and potentially |
174 | * unallocs the next free entry in the ring it doesn't alter which is | 271 | * unallocs the next free entry in the ring it doesn't alter which is |
@@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) | |||
178 | { | 275 | { |
179 | struct rds_connection *conn = context; | 276 | struct rds_connection *conn = context; |
180 | struct rds_ib_connection *ic = conn->c_transport_data; | 277 | struct rds_ib_connection *ic = conn->c_transport_data; |
278 | struct rds_message *rm = NULL; | ||
181 | struct ib_wc wc; | 279 | struct ib_wc wc; |
182 | struct rds_ib_send_work *send; | 280 | struct rds_ib_send_work *send; |
183 | u32 completed; | 281 | u32 completed; |
184 | u32 oldest; | 282 | u32 oldest; |
185 | u32 i = 0; | 283 | u32 i = 0; |
186 | int ret; | 284 | int ret; |
285 | int nr_sig = 0; | ||
187 | 286 | ||
188 | rdsdebug("cq %p conn %p\n", cq, conn); | 287 | rdsdebug("cq %p conn %p\n", cq, conn); |
189 | rds_ib_stats_inc(s_ib_tx_cq_call); | 288 | rds_ib_stats_inc(s_ib_tx_cq_call); |
@@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) | |||
192 | rdsdebug("ib_req_notify_cq send failed: %d\n", ret); | 291 | rdsdebug("ib_req_notify_cq send failed: %d\n", ret); |
193 | 292 | ||
194 | while (ib_poll_cq(cq, 1, &wc) > 0) { | 293 | while (ib_poll_cq(cq, 1, &wc) > 0) { |
195 | rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", | 294 | rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", |
196 | (unsigned long long)wc.wr_id, wc.status, wc.byte_len, | 295 | (unsigned long long)wc.wr_id, wc.status, |
296 | rds_ib_wc_status_str(wc.status), wc.byte_len, | ||
197 | be32_to_cpu(wc.ex.imm_data)); | 297 | be32_to_cpu(wc.ex.imm_data)); |
198 | rds_ib_stats_inc(s_ib_tx_cq_event); | 298 | rds_ib_stats_inc(s_ib_tx_cq_event); |
199 | 299 | ||
@@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) | |||
210 | 310 | ||
211 | for (i = 0; i < completed; i++) { | 311 | for (i = 0; i < completed; i++) { |
212 | send = &ic->i_sends[oldest]; | 312 | send = &ic->i_sends[oldest]; |
313 | if (send->s_wr.send_flags & IB_SEND_SIGNALED) | ||
314 | nr_sig++; | ||
213 | 315 | ||
214 | /* In the error case, wc.opcode sometimes contains garbage */ | 316 | rm = rds_ib_send_unmap_op(ic, send, wc.status); |
215 | switch (send->s_wr.opcode) { | ||
216 | case IB_WR_SEND: | ||
217 | if (send->s_rm) | ||
218 | rds_ib_send_unmap_rm(ic, send, wc.status); | ||
219 | break; | ||
220 | case IB_WR_RDMA_WRITE: | ||
221 | case IB_WR_RDMA_READ: | ||
222 | /* Nothing to be done - the SG list will be unmapped | ||
223 | * when the SEND completes. */ | ||
224 | break; | ||
225 | default: | ||
226 | if (printk_ratelimit()) | ||
227 | printk(KERN_NOTICE | ||
228 | "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", | ||
229 | __func__, send->s_wr.opcode); | ||
230 | break; | ||
231 | } | ||
232 | 317 | ||
233 | send->s_wr.opcode = 0xdead; | ||
234 | send->s_wr.num_sge = 1; | ||
235 | if (send->s_queued + HZ/2 < jiffies) | 318 | if (send->s_queued + HZ/2 < jiffies) |
236 | rds_ib_stats_inc(s_ib_tx_stalled); | 319 | rds_ib_stats_inc(s_ib_tx_stalled); |
237 | 320 | ||
238 | /* If a RDMA operation produced an error, signal this right | 321 | if (send->s_op) { |
239 | * away. If we don't, the subsequent SEND that goes with this | 322 | if (send->s_op == rm->m_final_op) { |
240 | * RDMA will be canceled with ERR_WFLUSH, and the application | 323 | /* If anyone waited for this message to get flushed out, wake |
241 | * never learn that the RDMA failed. */ | 324 | * them up now */ |
242 | if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { | 325 | rds_message_unmapped(rm); |
243 | struct rds_message *rm; | ||
244 | |||
245 | rm = rds_send_get_message(conn, send->s_op); | ||
246 | if (rm) { | ||
247 | if (rm->m_rdma_op) | ||
248 | rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); | ||
249 | rds_ib_send_rdma_complete(rm, wc.status); | ||
250 | rds_message_put(rm); | ||
251 | } | 326 | } |
327 | rds_message_put(rm); | ||
328 | send->s_op = NULL; | ||
252 | } | 329 | } |
253 | 330 | ||
254 | oldest = (oldest + 1) % ic->i_send_ring.w_nr; | 331 | oldest = (oldest + 1) % ic->i_send_ring.w_nr; |
255 | } | 332 | } |
256 | 333 | ||
257 | rds_ib_ring_free(&ic->i_send_ring, completed); | 334 | rds_ib_ring_free(&ic->i_send_ring, completed); |
335 | rds_ib_sub_signaled(ic, nr_sig); | ||
336 | nr_sig = 0; | ||
258 | 337 | ||
259 | if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || | 338 | if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || |
260 | test_bit(0, &conn->c_map_queued)) | 339 | test_bit(0, &conn->c_map_queued)) |
@@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) | |||
262 | 341 | ||
263 | /* We expect errors as the qp is drained during shutdown */ | 342 | /* We expect errors as the qp is drained during shutdown */ |
264 | if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { | 343 | if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { |
265 | rds_ib_conn_error(conn, | 344 | rds_ib_conn_error(conn, "send completion on %pI4 had status " |
266 | "send completion on %pI4 " | 345 | "%u (%s), disconnecting and reconnecting\n", |
267 | "had status %u, disconnecting and reconnecting\n", | 346 | &conn->c_faddr, wc.status, |
268 | &conn->c_faddr, wc.status); | 347 | rds_ib_wc_status_str(wc.status)); |
269 | } | 348 | } |
270 | } | 349 | } |
271 | } | 350 | } |
@@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) | |||
294 | * credits (see rds_ib_send_add_credits below). | 373 | * credits (see rds_ib_send_add_credits below). |
295 | * | 374 | * |
296 | * The RDS send code is essentially single-threaded; rds_send_xmit | 375 | * The RDS send code is essentially single-threaded; rds_send_xmit |
297 | * grabs c_send_lock to ensure exclusive access to the send ring. | 376 | * sets RDS_IN_XMIT to ensure exclusive access to the send ring. |
298 | * However, the ACK sending code is independent and can race with | 377 | * However, the ACK sending code is independent and can race with |
299 | * message SENDs. | 378 | * message SENDs. |
300 | * | 379 | * |
@@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) | |||
413 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | 492 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); |
414 | } | 493 | } |
415 | 494 | ||
416 | static inline void | 495 | static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, |
417 | rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, | 496 | struct rds_ib_send_work *send, |
418 | struct rds_ib_send_work *send, unsigned int pos, | 497 | bool notify) |
419 | unsigned long buffer, unsigned int length, | ||
420 | int send_flags) | ||
421 | { | 498 | { |
422 | struct ib_sge *sge; | 499 | /* |
423 | 500 | * We want to delay signaling completions just enough to get | |
424 | WARN_ON(pos != send - ic->i_sends); | 501 | * the batching benefits but not so much that we create dead time |
425 | 502 | * on the wire. | |
426 | send->s_wr.send_flags = send_flags; | 503 | */ |
427 | send->s_wr.opcode = IB_WR_SEND; | 504 | if (ic->i_unsignaled_wrs-- == 0 || notify) { |
428 | send->s_wr.num_sge = 2; | 505 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; |
429 | send->s_wr.next = NULL; | 506 | send->s_wr.send_flags |= IB_SEND_SIGNALED; |
430 | send->s_queued = jiffies; | 507 | return 1; |
431 | send->s_op = NULL; | ||
432 | |||
433 | if (length != 0) { | ||
434 | sge = rds_ib_data_sge(ic, send->s_sge); | ||
435 | sge->addr = buffer; | ||
436 | sge->length = length; | ||
437 | sge->lkey = ic->i_mr->lkey; | ||
438 | |||
439 | sge = rds_ib_header_sge(ic, send->s_sge); | ||
440 | } else { | ||
441 | /* We're sending a packet with no payload. There is only | ||
442 | * one SGE */ | ||
443 | send->s_wr.num_sge = 1; | ||
444 | sge = &send->s_sge[0]; | ||
445 | } | 508 | } |
446 | 509 | return 0; | |
447 | sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); | ||
448 | sge->length = sizeof(struct rds_header); | ||
449 | sge->lkey = ic->i_mr->lkey; | ||
450 | } | 510 | } |
451 | 511 | ||
452 | /* | 512 | /* |
@@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
475 | u32 pos; | 535 | u32 pos; |
476 | u32 i; | 536 | u32 i; |
477 | u32 work_alloc; | 537 | u32 work_alloc; |
478 | u32 credit_alloc; | 538 | u32 credit_alloc = 0; |
479 | u32 posted; | 539 | u32 posted; |
480 | u32 adv_credits = 0; | 540 | u32 adv_credits = 0; |
481 | int send_flags = 0; | 541 | int send_flags = 0; |
482 | int sent; | 542 | int bytes_sent = 0; |
483 | int ret; | 543 | int ret; |
484 | int flow_controlled = 0; | 544 | int flow_controlled = 0; |
545 | int nr_sig = 0; | ||
485 | 546 | ||
486 | BUG_ON(off % RDS_FRAG_SIZE); | 547 | BUG_ON(off % RDS_FRAG_SIZE); |
487 | BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); | 548 | BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); |
@@ -507,14 +568,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
507 | goto out; | 568 | goto out; |
508 | } | 569 | } |
509 | 570 | ||
510 | credit_alloc = work_alloc; | ||
511 | if (ic->i_flowctl) { | 571 | if (ic->i_flowctl) { |
512 | credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); | 572 | credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); |
513 | adv_credits += posted; | 573 | adv_credits += posted; |
514 | if (credit_alloc < work_alloc) { | 574 | if (credit_alloc < work_alloc) { |
515 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); | 575 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); |
516 | work_alloc = credit_alloc; | 576 | work_alloc = credit_alloc; |
517 | flow_controlled++; | 577 | flow_controlled = 1; |
518 | } | 578 | } |
519 | if (work_alloc == 0) { | 579 | if (work_alloc == 0) { |
520 | set_bit(RDS_LL_SEND_FULL, &conn->c_flags); | 580 | set_bit(RDS_LL_SEND_FULL, &conn->c_flags); |
@@ -525,31 +585,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
525 | } | 585 | } |
526 | 586 | ||
527 | /* map the message the first time we see it */ | 587 | /* map the message the first time we see it */ |
528 | if (ic->i_rm == NULL) { | 588 | if (!ic->i_data_op) { |
529 | /* | 589 | if (rm->data.op_nents) { |
530 | printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", | 590 | rm->data.op_count = ib_dma_map_sg(dev, |
531 | be16_to_cpu(rm->m_inc.i_hdr.h_dport), | 591 | rm->data.op_sg, |
532 | rm->m_inc.i_hdr.h_flags, | 592 | rm->data.op_nents, |
533 | be32_to_cpu(rm->m_inc.i_hdr.h_len)); | 593 | DMA_TO_DEVICE); |
534 | */ | 594 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); |
535 | if (rm->m_nents) { | 595 | if (rm->data.op_count == 0) { |
536 | rm->m_count = ib_dma_map_sg(dev, | ||
537 | rm->m_sg, rm->m_nents, DMA_TO_DEVICE); | ||
538 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); | ||
539 | if (rm->m_count == 0) { | ||
540 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | 596 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); |
541 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | 597 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); |
542 | ret = -ENOMEM; /* XXX ? */ | 598 | ret = -ENOMEM; /* XXX ? */ |
543 | goto out; | 599 | goto out; |
544 | } | 600 | } |
545 | } else { | 601 | } else { |
546 | rm->m_count = 0; | 602 | rm->data.op_count = 0; |
547 | } | 603 | } |
548 | 604 | ||
549 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
550 | ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; | ||
551 | rds_message_addref(rm); | 605 | rds_message_addref(rm); |
552 | ic->i_rm = rm; | 606 | ic->i_data_op = &rm->data; |
553 | 607 | ||
554 | /* Finalize the header */ | 608 | /* Finalize the header */ |
555 | if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) | 609 | if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) |
@@ -559,10 +613,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
559 | 613 | ||
560 | /* If it has a RDMA op, tell the peer we did it. This is | 614 | /* If it has a RDMA op, tell the peer we did it. This is |
561 | * used by the peer to release use-once RDMA MRs. */ | 615 | * used by the peer to release use-once RDMA MRs. */ |
562 | if (rm->m_rdma_op) { | 616 | if (rm->rdma.op_active) { |
563 | struct rds_ext_header_rdma ext_hdr; | 617 | struct rds_ext_header_rdma ext_hdr; |
564 | 618 | ||
565 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); | 619 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); |
566 | rds_message_add_extension(&rm->m_inc.i_hdr, | 620 | rds_message_add_extension(&rm->m_inc.i_hdr, |
567 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); | 621 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); |
568 | } | 622 | } |
@@ -582,99 +636,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
582 | /* | 636 | /* |
583 | * Update adv_credits since we reset the ACK_REQUIRED bit. | 637 | * Update adv_credits since we reset the ACK_REQUIRED bit. |
584 | */ | 638 | */ |
585 | rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); | 639 | if (ic->i_flowctl) { |
586 | adv_credits += posted; | 640 | rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); |
587 | BUG_ON(adv_credits > 255); | 641 | adv_credits += posted; |
642 | BUG_ON(adv_credits > 255); | ||
643 | } | ||
588 | } | 644 | } |
589 | 645 | ||
590 | send = &ic->i_sends[pos]; | ||
591 | first = send; | ||
592 | prev = NULL; | ||
593 | scat = &rm->m_sg[sg]; | ||
594 | sent = 0; | ||
595 | i = 0; | ||
596 | |||
597 | /* Sometimes you want to put a fence between an RDMA | 646 | /* Sometimes you want to put a fence between an RDMA |
598 | * READ and the following SEND. | 647 | * READ and the following SEND. |
599 | * We could either do this all the time | 648 | * We could either do this all the time |
600 | * or when requested by the user. Right now, we let | 649 | * or when requested by the user. Right now, we let |
601 | * the application choose. | 650 | * the application choose. |
602 | */ | 651 | */ |
603 | if (rm->m_rdma_op && rm->m_rdma_op->r_fence) | 652 | if (rm->rdma.op_active && rm->rdma.op_fence) |
604 | send_flags = IB_SEND_FENCE; | 653 | send_flags = IB_SEND_FENCE; |
605 | 654 | ||
606 | /* | 655 | /* Each frag gets a header. Msgs may be 0 bytes */ |
607 | * We could be copying the header into the unused tail of the page. | 656 | send = &ic->i_sends[pos]; |
608 | * That would need to be changed in the future when those pages might | 657 | first = send; |
609 | * be mapped userspace pages or page cache pages. So instead we always | 658 | prev = NULL; |
610 | * use a second sge and our long-lived ring of mapped headers. We send | 659 | scat = &ic->i_data_op->op_sg[sg]; |
611 | * the header after the data so that the data payload can be aligned on | 660 | i = 0; |
612 | * the receiver. | 661 | do { |
613 | */ | 662 | unsigned int len = 0; |
614 | 663 | ||
615 | /* handle a 0-len message */ | 664 | /* Set up the header */ |
616 | if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { | 665 | send->s_wr.send_flags = send_flags; |
617 | rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); | 666 | send->s_wr.opcode = IB_WR_SEND; |
618 | goto add_header; | 667 | send->s_wr.num_sge = 1; |
619 | } | 668 | send->s_wr.next = NULL; |
669 | send->s_queued = jiffies; | ||
670 | send->s_op = NULL; | ||
620 | 671 | ||
621 | /* if there's data reference it with a chain of work reqs */ | 672 | send->s_sge[0].addr = ic->i_send_hdrs_dma |
622 | for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { | 673 | + (pos * sizeof(struct rds_header)); |
623 | unsigned int len; | 674 | send->s_sge[0].length = sizeof(struct rds_header); |
624 | 675 | ||
625 | send = &ic->i_sends[pos]; | 676 | memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); |
626 | 677 | ||
627 | len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); | 678 | /* Set up the data, if present */ |
628 | rds_ib_xmit_populate_wr(ic, send, pos, | 679 | if (i < work_alloc |
629 | ib_sg_dma_address(dev, scat) + off, len, | 680 | && scat != &rm->data.op_sg[rm->data.op_count]) { |
630 | send_flags); | 681 | len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); |
682 | send->s_wr.num_sge = 2; | ||
631 | 683 | ||
632 | /* | 684 | send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; |
633 | * We want to delay signaling completions just enough to get | 685 | send->s_sge[1].length = len; |
634 | * the batching benefits but not so much that we create dead time | ||
635 | * on the wire. | ||
636 | */ | ||
637 | if (ic->i_unsignaled_wrs-- == 0) { | ||
638 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
639 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
640 | } | ||
641 | 686 | ||
642 | ic->i_unsignaled_bytes -= len; | 687 | bytes_sent += len; |
643 | if (ic->i_unsignaled_bytes <= 0) { | 688 | off += len; |
644 | ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; | 689 | if (off == ib_sg_dma_len(dev, scat)) { |
645 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 690 | scat++; |
691 | off = 0; | ||
692 | } | ||
646 | } | 693 | } |
647 | 694 | ||
695 | rds_ib_set_wr_signal_state(ic, send, 0); | ||
696 | |||
648 | /* | 697 | /* |
649 | * Always signal the last one if we're stopping due to flow control. | 698 | * Always signal the last one if we're stopping due to flow control. |
650 | */ | 699 | */ |
651 | if (flow_controlled && i == (work_alloc-1)) | 700 | if (ic->i_flowctl && flow_controlled && i == (work_alloc-1)) |
652 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 701 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; |
653 | 702 | ||
703 | if (send->s_wr.send_flags & IB_SEND_SIGNALED) | ||
704 | nr_sig++; | ||
705 | |||
654 | rdsdebug("send %p wr %p num_sge %u next %p\n", send, | 706 | rdsdebug("send %p wr %p num_sge %u next %p\n", send, |
655 | &send->s_wr, send->s_wr.num_sge, send->s_wr.next); | 707 | &send->s_wr, send->s_wr.num_sge, send->s_wr.next); |
656 | 708 | ||
657 | sent += len; | 709 | if (ic->i_flowctl && adv_credits) { |
658 | off += len; | ||
659 | if (off == ib_sg_dma_len(dev, scat)) { | ||
660 | scat++; | ||
661 | off = 0; | ||
662 | } | ||
663 | |||
664 | add_header: | ||
665 | /* Tack on the header after the data. The header SGE should already | ||
666 | * have been set up to point to the right header buffer. */ | ||
667 | memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); | ||
668 | |||
669 | if (0) { | ||
670 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; | ||
671 | |||
672 | printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", | ||
673 | be16_to_cpu(hdr->h_dport), | ||
674 | hdr->h_flags, | ||
675 | be32_to_cpu(hdr->h_len)); | ||
676 | } | ||
677 | if (adv_credits) { | ||
678 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; | 710 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; |
679 | 711 | ||
680 | /* add credit and redo the header checksum */ | 712 | /* add credit and redo the header checksum */ |
@@ -689,20 +721,25 @@ add_header: | |||
689 | prev = send; | 721 | prev = send; |
690 | 722 | ||
691 | pos = (pos + 1) % ic->i_send_ring.w_nr; | 723 | pos = (pos + 1) % ic->i_send_ring.w_nr; |
692 | } | 724 | send = &ic->i_sends[pos]; |
725 | i++; | ||
726 | |||
727 | } while (i < work_alloc | ||
728 | && scat != &rm->data.op_sg[rm->data.op_count]); | ||
693 | 729 | ||
694 | /* Account the RDS header in the number of bytes we sent, but just once. | 730 | /* Account the RDS header in the number of bytes we sent, but just once. |
695 | * The caller has no concept of fragmentation. */ | 731 | * The caller has no concept of fragmentation. */ |
696 | if (hdr_off == 0) | 732 | if (hdr_off == 0) |
697 | sent += sizeof(struct rds_header); | 733 | bytes_sent += sizeof(struct rds_header); |
698 | 734 | ||
699 | /* if we finished the message then send completion owns it */ | 735 | /* if we finished the message then send completion owns it */ |
700 | if (scat == &rm->m_sg[rm->m_count]) { | 736 | if (scat == &rm->data.op_sg[rm->data.op_count]) { |
701 | prev->s_rm = ic->i_rm; | 737 | prev->s_op = ic->i_data_op; |
702 | prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 738 | prev->s_wr.send_flags |= IB_SEND_SOLICITED; |
703 | ic->i_rm = NULL; | 739 | ic->i_data_op = NULL; |
704 | } | 740 | } |
705 | 741 | ||
742 | /* Put back wrs & credits we didn't use */ | ||
706 | if (i < work_alloc) { | 743 | if (i < work_alloc) { |
707 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); | 744 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); |
708 | work_alloc = i; | 745 | work_alloc = i; |
@@ -710,6 +747,9 @@ add_header: | |||
710 | if (ic->i_flowctl && i < credit_alloc) | 747 | if (ic->i_flowctl && i < credit_alloc) |
711 | rds_ib_send_add_credits(conn, credit_alloc - i); | 748 | rds_ib_send_add_credits(conn, credit_alloc - i); |
712 | 749 | ||
750 | if (nr_sig) | ||
751 | atomic_add(nr_sig, &ic->i_signaled_sends); | ||
752 | |||
713 | /* XXX need to worry about failed_wr and partial sends. */ | 753 | /* XXX need to worry about failed_wr and partial sends. */ |
714 | failed_wr = &first->s_wr; | 754 | failed_wr = &first->s_wr; |
715 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); | 755 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); |
@@ -720,32 +760,127 @@ add_header: | |||
720 | printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " | 760 | printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " |
721 | "returned %d\n", &conn->c_faddr, ret); | 761 | "returned %d\n", &conn->c_faddr, ret); |
722 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | 762 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); |
723 | if (prev->s_rm) { | 763 | rds_ib_sub_signaled(ic, nr_sig); |
724 | ic->i_rm = prev->s_rm; | 764 | if (prev->s_op) { |
725 | prev->s_rm = NULL; | 765 | ic->i_data_op = prev->s_op; |
766 | prev->s_op = NULL; | ||
726 | } | 767 | } |
727 | 768 | ||
728 | rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); | 769 | rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); |
729 | goto out; | 770 | goto out; |
730 | } | 771 | } |
731 | 772 | ||
732 | ret = sent; | 773 | ret = bytes_sent; |
733 | out: | 774 | out: |
734 | BUG_ON(adv_credits); | 775 | BUG_ON(adv_credits); |
735 | return ret; | 776 | return ret; |
736 | } | 777 | } |
737 | 778 | ||
738 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | 779 | /* |
780 | * Issue atomic operation. | ||
781 | * A simplified version of the rdma case, we always map 1 SG, and | ||
782 | * only 8 bytes, for the return value from the atomic operation. | ||
783 | */ | ||
784 | int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) | ||
785 | { | ||
786 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
787 | struct rds_ib_send_work *send = NULL; | ||
788 | struct ib_send_wr *failed_wr; | ||
789 | struct rds_ib_device *rds_ibdev; | ||
790 | u32 pos; | ||
791 | u32 work_alloc; | ||
792 | int ret; | ||
793 | int nr_sig = 0; | ||
794 | |||
795 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | ||
796 | |||
797 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); | ||
798 | if (work_alloc != 1) { | ||
799 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
800 | rds_ib_stats_inc(s_ib_tx_ring_full); | ||
801 | ret = -ENOMEM; | ||
802 | goto out; | ||
803 | } | ||
804 | |||
805 | /* address of send request in ring */ | ||
806 | send = &ic->i_sends[pos]; | ||
807 | send->s_queued = jiffies; | ||
808 | |||
809 | if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { | ||
810 | send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; | ||
811 | send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; | ||
812 | send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; | ||
813 | send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask; | ||
814 | send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; | ||
815 | } else { /* FADD */ | ||
816 | send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; | ||
817 | send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; | ||
818 | send->s_wr.wr.atomic.swap = 0; | ||
819 | send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; | ||
820 | send->s_wr.wr.atomic.swap_mask = 0; | ||
821 | } | ||
822 | nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); | ||
823 | send->s_wr.num_sge = 1; | ||
824 | send->s_wr.next = NULL; | ||
825 | send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; | ||
826 | send->s_wr.wr.atomic.rkey = op->op_rkey; | ||
827 | send->s_op = op; | ||
828 | rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); | ||
829 | |||
830 | /* map 8 byte retval buffer to the device */ | ||
831 | ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); | ||
832 | rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); | ||
833 | if (ret != 1) { | ||
834 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
835 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | ||
836 | ret = -ENOMEM; /* XXX ? */ | ||
837 | goto out; | ||
838 | } | ||
839 | |||
840 | /* Convert our struct scatterlist to struct ib_sge */ | ||
841 | send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); | ||
842 | send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); | ||
843 | send->s_sge[0].lkey = ic->i_mr->lkey; | ||
844 | |||
845 | rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, | ||
846 | send->s_sge[0].addr, send->s_sge[0].length); | ||
847 | |||
848 | if (nr_sig) | ||
849 | atomic_add(nr_sig, &ic->i_signaled_sends); | ||
850 | |||
851 | failed_wr = &send->s_wr; | ||
852 | ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); | ||
853 | rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, | ||
854 | send, &send->s_wr, ret, failed_wr); | ||
855 | BUG_ON(failed_wr != &send->s_wr); | ||
856 | if (ret) { | ||
857 | printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " | ||
858 | "returned %d\n", &conn->c_faddr, ret); | ||
859 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
860 | rds_ib_sub_signaled(ic, nr_sig); | ||
861 | goto out; | ||
862 | } | ||
863 | |||
864 | if (unlikely(failed_wr != &send->s_wr)) { | ||
865 | printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); | ||
866 | BUG_ON(failed_wr != &send->s_wr); | ||
867 | } | ||
868 | |||
869 | out: | ||
870 | return ret; | ||
871 | } | ||
872 | |||
873 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) | ||
739 | { | 874 | { |
740 | struct rds_ib_connection *ic = conn->c_transport_data; | 875 | struct rds_ib_connection *ic = conn->c_transport_data; |
741 | struct rds_ib_send_work *send = NULL; | 876 | struct rds_ib_send_work *send = NULL; |
742 | struct rds_ib_send_work *first; | 877 | struct rds_ib_send_work *first; |
743 | struct rds_ib_send_work *prev; | 878 | struct rds_ib_send_work *prev; |
744 | struct ib_send_wr *failed_wr; | 879 | struct ib_send_wr *failed_wr; |
745 | struct rds_ib_device *rds_ibdev; | ||
746 | struct scatterlist *scat; | 880 | struct scatterlist *scat; |
747 | unsigned long len; | 881 | unsigned long len; |
748 | u64 remote_addr = op->r_remote_addr; | 882 | u64 remote_addr = op->op_remote_addr; |
883 | u32 max_sge = ic->rds_ibdev->max_sge; | ||
749 | u32 pos; | 884 | u32 pos; |
750 | u32 work_alloc; | 885 | u32 work_alloc; |
751 | u32 i; | 886 | u32 i; |
@@ -753,29 +888,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
753 | int sent; | 888 | int sent; |
754 | int ret; | 889 | int ret; |
755 | int num_sge; | 890 | int num_sge; |
756 | 891 | int nr_sig = 0; | |
757 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | 892 | |
758 | 893 | /* map the op the first time we see it */ | |
759 | /* map the message the first time we see it */ | 894 | if (!op->op_mapped) { |
760 | if (!op->r_mapped) { | 895 | op->op_count = ib_dma_map_sg(ic->i_cm_id->device, |
761 | op->r_count = ib_dma_map_sg(ic->i_cm_id->device, | 896 | op->op_sg, op->op_nents, (op->op_write) ? |
762 | op->r_sg, op->r_nents, (op->r_write) ? | 897 | DMA_TO_DEVICE : DMA_FROM_DEVICE); |
763 | DMA_TO_DEVICE : DMA_FROM_DEVICE); | 898 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); |
764 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); | 899 | if (op->op_count == 0) { |
765 | if (op->r_count == 0) { | ||
766 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | 900 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); |
767 | ret = -ENOMEM; /* XXX ? */ | 901 | ret = -ENOMEM; /* XXX ? */ |
768 | goto out; | 902 | goto out; |
769 | } | 903 | } |
770 | 904 | ||
771 | op->r_mapped = 1; | 905 | op->op_mapped = 1; |
772 | } | 906 | } |
773 | 907 | ||
774 | /* | 908 | /* |
775 | * Instead of knowing how to return a partial rdma read/write we insist that there | 909 | * Instead of knowing how to return a partial rdma read/write we insist that there |
776 | * be enough work requests to send the entire message. | 910 | * be enough work requests to send the entire message. |
777 | */ | 911 | */ |
778 | i = ceil(op->r_count, rds_ibdev->max_sge); | 912 | i = ceil(op->op_count, max_sge); |
779 | 913 | ||
780 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); | 914 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); |
781 | if (work_alloc != i) { | 915 | if (work_alloc != i) { |
@@ -788,30 +922,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
788 | send = &ic->i_sends[pos]; | 922 | send = &ic->i_sends[pos]; |
789 | first = send; | 923 | first = send; |
790 | prev = NULL; | 924 | prev = NULL; |
791 | scat = &op->r_sg[0]; | 925 | scat = &op->op_sg[0]; |
792 | sent = 0; | 926 | sent = 0; |
793 | num_sge = op->r_count; | 927 | num_sge = op->op_count; |
794 | 928 | ||
795 | for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { | 929 | for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { |
796 | send->s_wr.send_flags = 0; | 930 | send->s_wr.send_flags = 0; |
797 | send->s_queued = jiffies; | 931 | send->s_queued = jiffies; |
798 | /* | 932 | send->s_op = NULL; |
799 | * We want to delay signaling completions just enough to get | 933 | |
800 | * the batching benefits but not so much that we create dead time on the wire. | 934 | nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); |
801 | */ | ||
802 | if (ic->i_unsignaled_wrs-- == 0) { | ||
803 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
804 | send->s_wr.send_flags = IB_SEND_SIGNALED; | ||
805 | } | ||
806 | 935 | ||
807 | send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; | 936 | send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; |
808 | send->s_wr.wr.rdma.remote_addr = remote_addr; | 937 | send->s_wr.wr.rdma.remote_addr = remote_addr; |
809 | send->s_wr.wr.rdma.rkey = op->r_key; | 938 | send->s_wr.wr.rdma.rkey = op->op_rkey; |
810 | send->s_op = op; | ||
811 | 939 | ||
812 | if (num_sge > rds_ibdev->max_sge) { | 940 | if (num_sge > max_sge) { |
813 | send->s_wr.num_sge = rds_ibdev->max_sge; | 941 | send->s_wr.num_sge = max_sge; |
814 | num_sge -= rds_ibdev->max_sge; | 942 | num_sge -= max_sge; |
815 | } else { | 943 | } else { |
816 | send->s_wr.num_sge = num_sge; | 944 | send->s_wr.num_sge = num_sge; |
817 | } | 945 | } |
@@ -821,7 +949,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
821 | if (prev) | 949 | if (prev) |
822 | prev->s_wr.next = &send->s_wr; | 950 | prev->s_wr.next = &send->s_wr; |
823 | 951 | ||
824 | for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { | 952 | for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { |
825 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); | 953 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); |
826 | send->s_sge[j].addr = | 954 | send->s_sge[j].addr = |
827 | ib_sg_dma_address(ic->i_cm_id->device, scat); | 955 | ib_sg_dma_address(ic->i_cm_id->device, scat); |
@@ -843,15 +971,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
843 | send = ic->i_sends; | 971 | send = ic->i_sends; |
844 | } | 972 | } |
845 | 973 | ||
846 | /* if we finished the message then send completion owns it */ | 974 | /* give a reference to the last op */ |
847 | if (scat == &op->r_sg[op->r_count]) | 975 | if (scat == &op->op_sg[op->op_count]) { |
848 | prev->s_wr.send_flags = IB_SEND_SIGNALED; | 976 | prev->s_op = op; |
977 | rds_message_addref(container_of(op, struct rds_message, rdma)); | ||
978 | } | ||
849 | 979 | ||
850 | if (i < work_alloc) { | 980 | if (i < work_alloc) { |
851 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); | 981 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); |
852 | work_alloc = i; | 982 | work_alloc = i; |
853 | } | 983 | } |
854 | 984 | ||
985 | if (nr_sig) | ||
986 | atomic_add(nr_sig, &ic->i_signaled_sends); | ||
987 | |||
855 | failed_wr = &first->s_wr; | 988 | failed_wr = &first->s_wr; |
856 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); | 989 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); |
857 | rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, | 990 | rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, |
@@ -861,6 +994,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
861 | printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " | 994 | printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " |
862 | "returned %d\n", &conn->c_faddr, ret); | 995 | "returned %d\n", &conn->c_faddr, ret); |
863 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | 996 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); |
997 | rds_ib_sub_signaled(ic, nr_sig); | ||
864 | goto out; | 998 | goto out; |
865 | } | 999 | } |
866 | 1000 | ||
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index d2c904dd6fb..2d5965d6e97 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c | |||
@@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = { | |||
67 | "ib_rdma_mr_pool_flush", | 67 | "ib_rdma_mr_pool_flush", |
68 | "ib_rdma_mr_pool_wait", | 68 | "ib_rdma_mr_pool_wait", |
69 | "ib_rdma_mr_pool_depleted", | 69 | "ib_rdma_mr_pool_depleted", |
70 | "ib_atomic_cswp", | ||
71 | "ib_atomic_fadd", | ||
70 | }; | 72 | }; |
71 | 73 | ||
72 | unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, | 74 | unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, |
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index 03f01cb4e0f..fc3da37220f 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c | |||
@@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16; | |||
49 | static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; | 49 | static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; |
50 | static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; | 50 | static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; |
51 | 51 | ||
52 | unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20); | ||
53 | static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1; | ||
54 | static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL; | ||
55 | |||
56 | /* | 52 | /* |
57 | * This sysctl does nothing. | 53 | * This sysctl does nothing. |
58 | * | 54 | * |
@@ -94,15 +90,6 @@ ctl_table rds_ib_sysctl_table[] = { | |||
94 | .extra2 = &rds_ib_sysctl_max_unsig_wr_max, | 90 | .extra2 = &rds_ib_sysctl_max_unsig_wr_max, |
95 | }, | 91 | }, |
96 | { | 92 | { |
97 | .procname = "max_unsignaled_bytes", | ||
98 | .data = &rds_ib_sysctl_max_unsig_bytes, | ||
99 | .maxlen = sizeof(unsigned long), | ||
100 | .mode = 0644, | ||
101 | .proc_handler = proc_doulongvec_minmax, | ||
102 | .extra1 = &rds_ib_sysctl_max_unsig_bytes_min, | ||
103 | .extra2 = &rds_ib_sysctl_max_unsig_bytes_max, | ||
104 | }, | ||
105 | { | ||
106 | .procname = "max_recv_allocation", | 93 | .procname = "max_recv_allocation", |
107 | .data = &rds_ib_sysctl_max_recv_allocation, | 94 | .data = &rds_ib_sysctl_max_recv_allocation, |
108 | .maxlen = sizeof(unsigned long), | 95 | .maxlen = sizeof(unsigned long), |
@@ -132,10 +119,10 @@ void rds_ib_sysctl_exit(void) | |||
132 | unregister_sysctl_table(rds_ib_sysctl_hdr); | 119 | unregister_sysctl_table(rds_ib_sysctl_hdr); |
133 | } | 120 | } |
134 | 121 | ||
135 | int __init rds_ib_sysctl_init(void) | 122 | int rds_ib_sysctl_init(void) |
136 | { | 123 | { |
137 | rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); | 124 | rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); |
138 | if (rds_ib_sysctl_hdr == NULL) | 125 | if (!rds_ib_sysctl_hdr) |
139 | return -ENOMEM; | 126 | return -ENOMEM; |
140 | return 0; | 127 | return 0; |
141 | } | 128 | } |
diff --git a/net/rds/info.c b/net/rds/info.c index c45c4173a44..4fdf1b6e84f 100644 --- a/net/rds/info.c +++ b/net/rds/info.c | |||
@@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func) | |||
76 | BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); | 76 | BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); |
77 | 77 | ||
78 | spin_lock(&rds_info_lock); | 78 | spin_lock(&rds_info_lock); |
79 | BUG_ON(rds_info_funcs[offset] != NULL); | 79 | BUG_ON(rds_info_funcs[offset]); |
80 | rds_info_funcs[offset] = func; | 80 | rds_info_funcs[offset] = func; |
81 | spin_unlock(&rds_info_lock); | 81 | spin_unlock(&rds_info_lock); |
82 | } | 82 | } |
@@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func); | |||
102 | */ | 102 | */ |
103 | void rds_info_iter_unmap(struct rds_info_iterator *iter) | 103 | void rds_info_iter_unmap(struct rds_info_iterator *iter) |
104 | { | 104 | { |
105 | if (iter->addr != NULL) { | 105 | if (iter->addr) { |
106 | kunmap_atomic(iter->addr, KM_USER0); | 106 | kunmap_atomic(iter->addr, KM_USER0); |
107 | iter->addr = NULL; | 107 | iter->addr = NULL; |
108 | } | 108 | } |
@@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data, | |||
117 | unsigned long this; | 117 | unsigned long this; |
118 | 118 | ||
119 | while (bytes) { | 119 | while (bytes) { |
120 | if (iter->addr == NULL) | 120 | if (!iter->addr) |
121 | iter->addr = kmap_atomic(*iter->pages, KM_USER0); | 121 | iter->addr = kmap_atomic(*iter->pages, KM_USER0); |
122 | 122 | ||
123 | this = min(bytes, PAGE_SIZE - iter->offset); | 123 | this = min(bytes, PAGE_SIZE - iter->offset); |
@@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, | |||
188 | >> PAGE_SHIFT; | 188 | >> PAGE_SHIFT; |
189 | 189 | ||
190 | pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); | 190 | pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); |
191 | if (pages == NULL) { | 191 | if (!pages) { |
192 | ret = -ENOMEM; | 192 | ret = -ENOMEM; |
193 | goto out; | 193 | goto out; |
194 | } | 194 | } |
@@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, | |||
206 | 206 | ||
207 | call_func: | 207 | call_func: |
208 | func = rds_info_funcs[optname - RDS_INFO_FIRST]; | 208 | func = rds_info_funcs[optname - RDS_INFO_FIRST]; |
209 | if (func == NULL) { | 209 | if (!func) { |
210 | ret = -ENOPROTOOPT; | 210 | ret = -ENOPROTOOPT; |
211 | goto out; | 211 | goto out; |
212 | } | 212 | } |
@@ -234,7 +234,7 @@ call_func: | |||
234 | ret = -EFAULT; | 234 | ret = -EFAULT; |
235 | 235 | ||
236 | out: | 236 | out: |
237 | for (i = 0; pages != NULL && i < nr_pages; i++) | 237 | for (i = 0; pages && i < nr_pages; i++) |
238 | put_page(pages[i]); | 238 | put_page(pages[i]); |
239 | kfree(pages); | 239 | kfree(pages); |
240 | 240 | ||
diff --git a/net/rds/iw.c b/net/rds/iw.c index c8f3d3525cb..56808cac0fc 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c | |||
@@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = { | |||
264 | .laddr_check = rds_iw_laddr_check, | 264 | .laddr_check = rds_iw_laddr_check, |
265 | .xmit_complete = rds_iw_xmit_complete, | 265 | .xmit_complete = rds_iw_xmit_complete, |
266 | .xmit = rds_iw_xmit, | 266 | .xmit = rds_iw_xmit, |
267 | .xmit_cong_map = NULL, | ||
268 | .xmit_rdma = rds_iw_xmit_rdma, | 267 | .xmit_rdma = rds_iw_xmit_rdma, |
269 | .recv = rds_iw_recv, | 268 | .recv = rds_iw_recv, |
270 | .conn_alloc = rds_iw_conn_alloc, | 269 | .conn_alloc = rds_iw_conn_alloc, |
@@ -272,7 +271,6 @@ struct rds_transport rds_iw_transport = { | |||
272 | .conn_connect = rds_iw_conn_connect, | 271 | .conn_connect = rds_iw_conn_connect, |
273 | .conn_shutdown = rds_iw_conn_shutdown, | 272 | .conn_shutdown = rds_iw_conn_shutdown, |
274 | .inc_copy_to_user = rds_iw_inc_copy_to_user, | 273 | .inc_copy_to_user = rds_iw_inc_copy_to_user, |
275 | .inc_purge = rds_iw_inc_purge, | ||
276 | .inc_free = rds_iw_inc_free, | 274 | .inc_free = rds_iw_inc_free, |
277 | .cm_initiate_connect = rds_iw_cm_initiate_connect, | 275 | .cm_initiate_connect = rds_iw_cm_initiate_connect, |
278 | .cm_handle_connect = rds_iw_cm_handle_connect, | 276 | .cm_handle_connect = rds_iw_cm_handle_connect, |
@@ -289,7 +287,7 @@ struct rds_transport rds_iw_transport = { | |||
289 | .t_prefer_loopback = 1, | 287 | .t_prefer_loopback = 1, |
290 | }; | 288 | }; |
291 | 289 | ||
292 | int __init rds_iw_init(void) | 290 | int rds_iw_init(void) |
293 | { | 291 | { |
294 | int ret; | 292 | int ret; |
295 | 293 | ||
diff --git a/net/rds/iw.h b/net/rds/iw.h index eef2f0c2847..543e665fafe 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h | |||
@@ -70,7 +70,7 @@ struct rds_iw_send_work { | |||
70 | struct rds_message *s_rm; | 70 | struct rds_message *s_rm; |
71 | 71 | ||
72 | /* We should really put these into a union: */ | 72 | /* We should really put these into a union: */ |
73 | struct rds_rdma_op *s_op; | 73 | struct rm_rdma_op *s_op; |
74 | struct rds_iw_mapping *s_mapping; | 74 | struct rds_iw_mapping *s_mapping; |
75 | struct ib_mr *s_mr; | 75 | struct ib_mr *s_mr; |
76 | struct ib_fast_reg_page_list *s_page_list; | 76 | struct ib_fast_reg_page_list *s_page_list; |
@@ -284,7 +284,7 @@ void rds_iw_conn_free(void *arg); | |||
284 | int rds_iw_conn_connect(struct rds_connection *conn); | 284 | int rds_iw_conn_connect(struct rds_connection *conn); |
285 | void rds_iw_conn_shutdown(struct rds_connection *conn); | 285 | void rds_iw_conn_shutdown(struct rds_connection *conn); |
286 | void rds_iw_state_change(struct sock *sk); | 286 | void rds_iw_state_change(struct sock *sk); |
287 | int __init rds_iw_listen_init(void); | 287 | int rds_iw_listen_init(void); |
288 | void rds_iw_listen_stop(void); | 288 | void rds_iw_listen_stop(void); |
289 | void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); | 289 | void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); |
290 | int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, | 290 | int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, |
@@ -321,12 +321,11 @@ void rds_iw_flush_mrs(void); | |||
321 | void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); | 321 | void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); |
322 | 322 | ||
323 | /* ib_recv.c */ | 323 | /* ib_recv.c */ |
324 | int __init rds_iw_recv_init(void); | 324 | int rds_iw_recv_init(void); |
325 | void rds_iw_recv_exit(void); | 325 | void rds_iw_recv_exit(void); |
326 | int rds_iw_recv(struct rds_connection *conn); | 326 | int rds_iw_recv(struct rds_connection *conn); |
327 | int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | 327 | int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, |
328 | gfp_t page_gfp, int prefill); | 328 | gfp_t page_gfp, int prefill); |
329 | void rds_iw_inc_purge(struct rds_incoming *inc); | ||
330 | void rds_iw_inc_free(struct rds_incoming *inc); | 329 | void rds_iw_inc_free(struct rds_incoming *inc); |
331 | int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, | 330 | int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, |
332 | size_t size); | 331 | size_t size); |
@@ -358,7 +357,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
358 | void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); | 357 | void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); |
359 | void rds_iw_send_init_ring(struct rds_iw_connection *ic); | 358 | void rds_iw_send_init_ring(struct rds_iw_connection *ic); |
360 | void rds_iw_send_clear_ring(struct rds_iw_connection *ic); | 359 | void rds_iw_send_clear_ring(struct rds_iw_connection *ic); |
361 | int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); | 360 | int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); |
362 | void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); | 361 | void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); |
363 | void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); | 362 | void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); |
364 | int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, | 363 | int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, |
@@ -371,7 +370,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, | |||
371 | unsigned int avail); | 370 | unsigned int avail); |
372 | 371 | ||
373 | /* ib_sysctl.c */ | 372 | /* ib_sysctl.c */ |
374 | int __init rds_iw_sysctl_init(void); | 373 | int rds_iw_sysctl_init(void); |
375 | void rds_iw_sysctl_exit(void); | 374 | void rds_iw_sysctl_exit(void); |
376 | extern unsigned long rds_iw_sysctl_max_send_wr; | 375 | extern unsigned long rds_iw_sysctl_max_send_wr; |
377 | extern unsigned long rds_iw_sysctl_max_recv_wr; | 376 | extern unsigned long rds_iw_sysctl_max_recv_wr; |
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index b5dd6ac39be..712cf2d1f28 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c | |||
@@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) | |||
257 | * the rds_iwdev at all. | 257 | * the rds_iwdev at all. |
258 | */ | 258 | */ |
259 | rds_iwdev = ib_get_client_data(dev, &rds_iw_client); | 259 | rds_iwdev = ib_get_client_data(dev, &rds_iw_client); |
260 | if (rds_iwdev == NULL) { | 260 | if (!rds_iwdev) { |
261 | if (printk_ratelimit()) | 261 | if (printk_ratelimit()) |
262 | printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", | 262 | printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", |
263 | dev->name); | 263 | dev->name); |
@@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) | |||
292 | ic->i_send_ring.w_nr * | 292 | ic->i_send_ring.w_nr * |
293 | sizeof(struct rds_header), | 293 | sizeof(struct rds_header), |
294 | &ic->i_send_hdrs_dma, GFP_KERNEL); | 294 | &ic->i_send_hdrs_dma, GFP_KERNEL); |
295 | if (ic->i_send_hdrs == NULL) { | 295 | if (!ic->i_send_hdrs) { |
296 | ret = -ENOMEM; | 296 | ret = -ENOMEM; |
297 | rdsdebug("ib_dma_alloc_coherent send failed\n"); | 297 | rdsdebug("ib_dma_alloc_coherent send failed\n"); |
298 | goto out; | 298 | goto out; |
@@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) | |||
302 | ic->i_recv_ring.w_nr * | 302 | ic->i_recv_ring.w_nr * |
303 | sizeof(struct rds_header), | 303 | sizeof(struct rds_header), |
304 | &ic->i_recv_hdrs_dma, GFP_KERNEL); | 304 | &ic->i_recv_hdrs_dma, GFP_KERNEL); |
305 | if (ic->i_recv_hdrs == NULL) { | 305 | if (!ic->i_recv_hdrs) { |
306 | ret = -ENOMEM; | 306 | ret = -ENOMEM; |
307 | rdsdebug("ib_dma_alloc_coherent recv failed\n"); | 307 | rdsdebug("ib_dma_alloc_coherent recv failed\n"); |
308 | goto out; | 308 | goto out; |
@@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn) | |||
310 | 310 | ||
311 | ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), | 311 | ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), |
312 | &ic->i_ack_dma, GFP_KERNEL); | 312 | &ic->i_ack_dma, GFP_KERNEL); |
313 | if (ic->i_ack == NULL) { | 313 | if (!ic->i_ack) { |
314 | ret = -ENOMEM; | 314 | ret = -ENOMEM; |
315 | rdsdebug("ib_dma_alloc_coherent ack failed\n"); | 315 | rdsdebug("ib_dma_alloc_coherent ack failed\n"); |
316 | goto out; | 316 | goto out; |
317 | } | 317 | } |
318 | 318 | ||
319 | ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); | 319 | ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); |
320 | if (ic->i_sends == NULL) { | 320 | if (!ic->i_sends) { |
321 | ret = -ENOMEM; | 321 | ret = -ENOMEM; |
322 | rdsdebug("send allocation failed\n"); | 322 | rdsdebug("send allocation failed\n"); |
323 | goto out; | 323 | goto out; |
@@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) | |||
325 | rds_iw_send_init_ring(ic); | 325 | rds_iw_send_init_ring(ic); |
326 | 326 | ||
327 | ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); | 327 | ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); |
328 | if (ic->i_recvs == NULL) { | 328 | if (!ic->i_recvs) { |
329 | ret = -ENOMEM; | 329 | ret = -ENOMEM; |
330 | rdsdebug("recv allocation failed\n"); | 330 | rdsdebug("recv allocation failed\n"); |
331 | goto out; | 331 | goto out; |
@@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) | |||
696 | 696 | ||
697 | /* XXX too lazy? */ | 697 | /* XXX too lazy? */ |
698 | ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); | 698 | ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); |
699 | if (ic == NULL) | 699 | if (!ic) |
700 | return -ENOMEM; | 700 | return -ENOMEM; |
701 | 701 | ||
702 | INIT_LIST_HEAD(&ic->iw_node); | 702 | INIT_LIST_HEAD(&ic->iw_node); |
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index 13dc1862d86..0e7accc23ee 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c | |||
@@ -34,7 +34,6 @@ | |||
34 | #include <linux/slab.h> | 34 | #include <linux/slab.h> |
35 | 35 | ||
36 | #include "rds.h" | 36 | #include "rds.h" |
37 | #include "rdma.h" | ||
38 | #include "iw.h" | 37 | #include "iw.h" |
39 | 38 | ||
40 | 39 | ||
@@ -207,9 +206,9 @@ void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *con | |||
207 | BUG_ON(list_empty(&ic->iw_node)); | 206 | BUG_ON(list_empty(&ic->iw_node)); |
208 | list_del(&ic->iw_node); | 207 | list_del(&ic->iw_node); |
209 | 208 | ||
210 | spin_lock_irq(&rds_iwdev->spinlock); | 209 | spin_lock(&rds_iwdev->spinlock); |
211 | list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); | 210 | list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); |
212 | spin_unlock_irq(&rds_iwdev->spinlock); | 211 | spin_unlock(&rds_iwdev->spinlock); |
213 | spin_unlock_irq(&iw_nodev_conns_lock); | 212 | spin_unlock_irq(&iw_nodev_conns_lock); |
214 | 213 | ||
215 | ic->rds_iwdev = rds_iwdev; | 214 | ic->rds_iwdev = rds_iwdev; |
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index 3d479067d54..5e57347f49f 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c | |||
@@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag) | |||
53 | static void rds_iw_frag_free(struct rds_page_frag *frag) | 53 | static void rds_iw_frag_free(struct rds_page_frag *frag) |
54 | { | 54 | { |
55 | rdsdebug("frag %p page %p\n", frag, frag->f_page); | 55 | rdsdebug("frag %p page %p\n", frag, frag->f_page); |
56 | BUG_ON(frag->f_page != NULL); | 56 | BUG_ON(frag->f_page); |
57 | kmem_cache_free(rds_iw_frag_slab, frag); | 57 | kmem_cache_free(rds_iw_frag_slab, frag); |
58 | } | 58 | } |
59 | 59 | ||
@@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn, | |||
143 | struct ib_sge *sge; | 143 | struct ib_sge *sge; |
144 | int ret = -ENOMEM; | 144 | int ret = -ENOMEM; |
145 | 145 | ||
146 | if (recv->r_iwinc == NULL) { | 146 | if (!recv->r_iwinc) { |
147 | if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { | 147 | if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { |
148 | rds_iw_stats_inc(s_iw_rx_alloc_limit); | 148 | rds_iw_stats_inc(s_iw_rx_alloc_limit); |
149 | goto out; | 149 | goto out; |
150 | } | 150 | } |
151 | recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, | 151 | recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, |
152 | kptr_gfp); | 152 | kptr_gfp); |
153 | if (recv->r_iwinc == NULL) { | 153 | if (!recv->r_iwinc) { |
154 | atomic_dec(&rds_iw_allocation); | 154 | atomic_dec(&rds_iw_allocation); |
155 | goto out; | 155 | goto out; |
156 | } | 156 | } |
@@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn, | |||
158 | rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); | 158 | rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); |
159 | } | 159 | } |
160 | 160 | ||
161 | if (recv->r_frag == NULL) { | 161 | if (!recv->r_frag) { |
162 | recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); | 162 | recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); |
163 | if (recv->r_frag == NULL) | 163 | if (!recv->r_frag) |
164 | goto out; | 164 | goto out; |
165 | INIT_LIST_HEAD(&recv->r_frag->f_item); | 165 | INIT_LIST_HEAD(&recv->r_frag->f_item); |
166 | recv->r_frag->f_page = NULL; | 166 | recv->r_frag->f_page = NULL; |
167 | } | 167 | } |
168 | 168 | ||
169 | if (ic->i_frag.f_page == NULL) { | 169 | if (!ic->i_frag.f_page) { |
170 | ic->i_frag.f_page = alloc_page(page_gfp); | 170 | ic->i_frag.f_page = alloc_page(page_gfp); |
171 | if (ic->i_frag.f_page == NULL) | 171 | if (!ic->i_frag.f_page) |
172 | goto out; | 172 | goto out; |
173 | ic->i_frag.f_offset = 0; | 173 | ic->i_frag.f_offset = 0; |
174 | } | 174 | } |
@@ -273,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, | |||
273 | return ret; | 273 | return ret; |
274 | } | 274 | } |
275 | 275 | ||
276 | void rds_iw_inc_purge(struct rds_incoming *inc) | 276 | static void rds_iw_inc_purge(struct rds_incoming *inc) |
277 | { | 277 | { |
278 | struct rds_iw_incoming *iwinc; | 278 | struct rds_iw_incoming *iwinc; |
279 | struct rds_page_frag *frag; | 279 | struct rds_page_frag *frag; |
@@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn, | |||
716 | * into the inc and save the inc so we can hang upcoming fragments | 716 | * into the inc and save the inc so we can hang upcoming fragments |
717 | * off its list. | 717 | * off its list. |
718 | */ | 718 | */ |
719 | if (iwinc == NULL) { | 719 | if (!iwinc) { |
720 | iwinc = recv->r_iwinc; | 720 | iwinc = recv->r_iwinc; |
721 | recv->r_iwinc = NULL; | 721 | recv->r_iwinc = NULL; |
722 | ic->i_iwinc = iwinc; | 722 | ic->i_iwinc = iwinc; |
@@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn) | |||
887 | return ret; | 887 | return ret; |
888 | } | 888 | } |
889 | 889 | ||
890 | int __init rds_iw_recv_init(void) | 890 | int rds_iw_recv_init(void) |
891 | { | 891 | { |
892 | struct sysinfo si; | 892 | struct sysinfo si; |
893 | int ret = -ENOMEM; | 893 | int ret = -ENOMEM; |
@@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void) | |||
899 | rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", | 899 | rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", |
900 | sizeof(struct rds_iw_incoming), | 900 | sizeof(struct rds_iw_incoming), |
901 | 0, 0, NULL); | 901 | 0, 0, NULL); |
902 | if (rds_iw_incoming_slab == NULL) | 902 | if (!rds_iw_incoming_slab) |
903 | goto out; | 903 | goto out; |
904 | 904 | ||
905 | rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", | 905 | rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", |
906 | sizeof(struct rds_page_frag), | 906 | sizeof(struct rds_page_frag), |
907 | 0, 0, NULL); | 907 | 0, 0, NULL); |
908 | if (rds_iw_frag_slab == NULL) | 908 | if (!rds_iw_frag_slab) |
909 | kmem_cache_destroy(rds_iw_incoming_slab); | 909 | kmem_cache_destroy(rds_iw_incoming_slab); |
910 | else | 910 | else |
911 | ret = 0; | 911 | ret = 0; |
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 52182ff7519..6280ea020d4 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c | |||
@@ -36,7 +36,6 @@ | |||
36 | #include <linux/dmapool.h> | 36 | #include <linux/dmapool.h> |
37 | 37 | ||
38 | #include "rds.h" | 38 | #include "rds.h" |
39 | #include "rdma.h" | ||
40 | #include "iw.h" | 39 | #include "iw.h" |
41 | 40 | ||
42 | static void rds_iw_send_rdma_complete(struct rds_message *rm, | 41 | static void rds_iw_send_rdma_complete(struct rds_message *rm, |
@@ -64,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm, | |||
64 | } | 63 | } |
65 | 64 | ||
66 | static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, | 65 | static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, |
67 | struct rds_rdma_op *op) | 66 | struct rm_rdma_op *op) |
68 | { | 67 | { |
69 | if (op->r_mapped) { | 68 | if (op->op_mapped) { |
70 | ib_dma_unmap_sg(ic->i_cm_id->device, | 69 | ib_dma_unmap_sg(ic->i_cm_id->device, |
71 | op->r_sg, op->r_nents, | 70 | op->op_sg, op->op_nents, |
72 | op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); | 71 | op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); |
73 | op->r_mapped = 0; | 72 | op->op_mapped = 0; |
74 | } | 73 | } |
75 | } | 74 | } |
76 | 75 | ||
@@ -83,11 +82,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, | |||
83 | rdsdebug("ic %p send %p rm %p\n", ic, send, rm); | 82 | rdsdebug("ic %p send %p rm %p\n", ic, send, rm); |
84 | 83 | ||
85 | ib_dma_unmap_sg(ic->i_cm_id->device, | 84 | ib_dma_unmap_sg(ic->i_cm_id->device, |
86 | rm->m_sg, rm->m_nents, | 85 | rm->data.op_sg, rm->data.op_nents, |
87 | DMA_TO_DEVICE); | 86 | DMA_TO_DEVICE); |
88 | 87 | ||
89 | if (rm->m_rdma_op != NULL) { | 88 | if (rm->rdma.op_active) { |
90 | rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); | 89 | rds_iw_send_unmap_rdma(ic, &rm->rdma); |
91 | 90 | ||
92 | /* If the user asked for a completion notification on this | 91 | /* If the user asked for a completion notification on this |
93 | * message, we can implement three different semantics: | 92 | * message, we can implement three different semantics: |
@@ -111,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, | |||
111 | */ | 110 | */ |
112 | rds_iw_send_rdma_complete(rm, wc_status); | 111 | rds_iw_send_rdma_complete(rm, wc_status); |
113 | 112 | ||
114 | if (rm->m_rdma_op->r_write) | 113 | if (rm->rdma.op_write) |
115 | rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); | 114 | rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); |
116 | else | 115 | else |
117 | rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); | 116 | rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); |
118 | } | 117 | } |
119 | 118 | ||
120 | /* If anyone waited for this message to get flushed out, wake | 119 | /* If anyone waited for this message to get flushed out, wake |
@@ -556,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
556 | } | 555 | } |
557 | 556 | ||
558 | /* map the message the first time we see it */ | 557 | /* map the message the first time we see it */ |
559 | if (ic->i_rm == NULL) { | 558 | if (!ic->i_rm) { |
560 | /* | 559 | /* |
561 | printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", | 560 | printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", |
562 | be16_to_cpu(rm->m_inc.i_hdr.h_dport), | 561 | be16_to_cpu(rm->m_inc.i_hdr.h_dport), |
563 | rm->m_inc.i_hdr.h_flags, | 562 | rm->m_inc.i_hdr.h_flags, |
564 | be32_to_cpu(rm->m_inc.i_hdr.h_len)); | 563 | be32_to_cpu(rm->m_inc.i_hdr.h_len)); |
565 | */ | 564 | */ |
566 | if (rm->m_nents) { | 565 | if (rm->data.op_nents) { |
567 | rm->m_count = ib_dma_map_sg(dev, | 566 | rm->data.op_count = ib_dma_map_sg(dev, |
568 | rm->m_sg, rm->m_nents, DMA_TO_DEVICE); | 567 | rm->data.op_sg, |
569 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); | 568 | rm->data.op_nents, |
570 | if (rm->m_count == 0) { | 569 | DMA_TO_DEVICE); |
570 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); | ||
571 | if (rm->data.op_count == 0) { | ||
571 | rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); | 572 | rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); |
572 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); | 573 | rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); |
573 | ret = -ENOMEM; /* XXX ? */ | 574 | ret = -ENOMEM; /* XXX ? */ |
574 | goto out; | 575 | goto out; |
575 | } | 576 | } |
576 | } else { | 577 | } else { |
577 | rm->m_count = 0; | 578 | rm->data.op_count = 0; |
578 | } | 579 | } |
579 | 580 | ||
580 | ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; | 581 | ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; |
@@ -590,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
590 | 591 | ||
591 | /* If it has a RDMA op, tell the peer we did it. This is | 592 | /* If it has a RDMA op, tell the peer we did it. This is |
592 | * used by the peer to release use-once RDMA MRs. */ | 593 | * used by the peer to release use-once RDMA MRs. */ |
593 | if (rm->m_rdma_op) { | 594 | if (rm->rdma.op_active) { |
594 | struct rds_ext_header_rdma ext_hdr; | 595 | struct rds_ext_header_rdma ext_hdr; |
595 | 596 | ||
596 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); | 597 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); |
597 | rds_message_add_extension(&rm->m_inc.i_hdr, | 598 | rds_message_add_extension(&rm->m_inc.i_hdr, |
598 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); | 599 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); |
599 | } | 600 | } |
@@ -621,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
621 | send = &ic->i_sends[pos]; | 622 | send = &ic->i_sends[pos]; |
622 | first = send; | 623 | first = send; |
623 | prev = NULL; | 624 | prev = NULL; |
624 | scat = &rm->m_sg[sg]; | 625 | scat = &rm->data.op_sg[sg]; |
625 | sent = 0; | 626 | sent = 0; |
626 | i = 0; | 627 | i = 0; |
627 | 628 | ||
@@ -631,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
631 | * or when requested by the user. Right now, we let | 632 | * or when requested by the user. Right now, we let |
632 | * the application choose. | 633 | * the application choose. |
633 | */ | 634 | */ |
634 | if (rm->m_rdma_op && rm->m_rdma_op->r_fence) | 635 | if (rm->rdma.op_active && rm->rdma.op_fence) |
635 | send_flags = IB_SEND_FENCE; | 636 | send_flags = IB_SEND_FENCE; |
636 | 637 | ||
637 | /* | 638 | /* |
@@ -650,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
650 | } | 651 | } |
651 | 652 | ||
652 | /* if there's data reference it with a chain of work reqs */ | 653 | /* if there's data reference it with a chain of work reqs */ |
653 | for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { | 654 | for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) { |
654 | unsigned int len; | 655 | unsigned int len; |
655 | 656 | ||
656 | send = &ic->i_sends[pos]; | 657 | send = &ic->i_sends[pos]; |
@@ -728,7 +729,7 @@ add_header: | |||
728 | sent += sizeof(struct rds_header); | 729 | sent += sizeof(struct rds_header); |
729 | 730 | ||
730 | /* if we finished the message then send completion owns it */ | 731 | /* if we finished the message then send completion owns it */ |
731 | if (scat == &rm->m_sg[rm->m_count]) { | 732 | if (scat == &rm->data.op_sg[rm->data.op_count]) { |
732 | prev->s_rm = ic->i_rm; | 733 | prev->s_rm = ic->i_rm; |
733 | prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 734 | prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; |
734 | ic->i_rm = NULL; | 735 | ic->i_rm = NULL; |
@@ -784,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd | |||
784 | ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); | 785 | ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); |
785 | } | 786 | } |
786 | 787 | ||
787 | int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | 788 | int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) |
788 | { | 789 | { |
789 | struct rds_iw_connection *ic = conn->c_transport_data; | 790 | struct rds_iw_connection *ic = conn->c_transport_data; |
790 | struct rds_iw_send_work *send = NULL; | 791 | struct rds_iw_send_work *send = NULL; |
@@ -794,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
794 | struct rds_iw_device *rds_iwdev; | 795 | struct rds_iw_device *rds_iwdev; |
795 | struct scatterlist *scat; | 796 | struct scatterlist *scat; |
796 | unsigned long len; | 797 | unsigned long len; |
797 | u64 remote_addr = op->r_remote_addr; | 798 | u64 remote_addr = op->op_remote_addr; |
798 | u32 pos, fr_pos; | 799 | u32 pos, fr_pos; |
799 | u32 work_alloc; | 800 | u32 work_alloc; |
800 | u32 i; | 801 | u32 i; |
@@ -806,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
806 | rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); | 807 | rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); |
807 | 808 | ||
808 | /* map the message the first time we see it */ | 809 | /* map the message the first time we see it */ |
809 | if (!op->r_mapped) { | 810 | if (!op->op_mapped) { |
810 | op->r_count = ib_dma_map_sg(ic->i_cm_id->device, | 811 | op->op_count = ib_dma_map_sg(ic->i_cm_id->device, |
811 | op->r_sg, op->r_nents, (op->r_write) ? | 812 | op->op_sg, op->op_nents, (op->op_write) ? |
812 | DMA_TO_DEVICE : DMA_FROM_DEVICE); | 813 | DMA_TO_DEVICE : DMA_FROM_DEVICE); |
813 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); | 814 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); |
814 | if (op->r_count == 0) { | 815 | if (op->op_count == 0) { |
815 | rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); | 816 | rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); |
816 | ret = -ENOMEM; /* XXX ? */ | 817 | ret = -ENOMEM; /* XXX ? */ |
817 | goto out; | 818 | goto out; |
818 | } | 819 | } |
819 | 820 | ||
820 | op->r_mapped = 1; | 821 | op->op_mapped = 1; |
821 | } | 822 | } |
822 | 823 | ||
823 | if (!op->r_write) { | 824 | if (!op->op_write) { |
824 | /* Alloc space on the send queue for the fastreg */ | 825 | /* Alloc space on the send queue for the fastreg */ |
825 | work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); | 826 | work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); |
826 | if (work_alloc != 1) { | 827 | if (work_alloc != 1) { |
@@ -835,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
835 | * Instead of knowing how to return a partial rdma read/write we insist that there | 836 | * Instead of knowing how to return a partial rdma read/write we insist that there |
836 | * be enough work requests to send the entire message. | 837 | * be enough work requests to send the entire message. |
837 | */ | 838 | */ |
838 | i = ceil(op->r_count, rds_iwdev->max_sge); | 839 | i = ceil(op->op_count, rds_iwdev->max_sge); |
839 | 840 | ||
840 | work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); | 841 | work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); |
841 | if (work_alloc != i) { | 842 | if (work_alloc != i) { |
@@ -846,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
846 | } | 847 | } |
847 | 848 | ||
848 | send = &ic->i_sends[pos]; | 849 | send = &ic->i_sends[pos]; |
849 | if (!op->r_write) { | 850 | if (!op->op_write) { |
850 | first = prev = &ic->i_sends[fr_pos]; | 851 | first = prev = &ic->i_sends[fr_pos]; |
851 | } else { | 852 | } else { |
852 | first = send; | 853 | first = send; |
853 | prev = NULL; | 854 | prev = NULL; |
854 | } | 855 | } |
855 | scat = &op->r_sg[0]; | 856 | scat = &op->op_sg[0]; |
856 | sent = 0; | 857 | sent = 0; |
857 | num_sge = op->r_count; | 858 | num_sge = op->op_count; |
858 | 859 | ||
859 | for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { | 860 | for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { |
860 | send->s_wr.send_flags = 0; | 861 | send->s_wr.send_flags = 0; |
861 | send->s_queued = jiffies; | 862 | send->s_queued = jiffies; |
862 | 863 | ||
@@ -873,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
873 | * for local access after RDS is finished with it, using | 874 | * for local access after RDS is finished with it, using |
874 | * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. | 875 | * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. |
875 | */ | 876 | */ |
876 | if (op->r_write) | 877 | if (op->op_write) |
877 | send->s_wr.opcode = IB_WR_RDMA_WRITE; | 878 | send->s_wr.opcode = IB_WR_RDMA_WRITE; |
878 | else | 879 | else |
879 | send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; | 880 | send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; |
880 | 881 | ||
881 | send->s_wr.wr.rdma.remote_addr = remote_addr; | 882 | send->s_wr.wr.rdma.remote_addr = remote_addr; |
882 | send->s_wr.wr.rdma.rkey = op->r_key; | 883 | send->s_wr.wr.rdma.rkey = op->op_rkey; |
883 | send->s_op = op; | 884 | send->s_op = op; |
884 | 885 | ||
885 | if (num_sge > rds_iwdev->max_sge) { | 886 | if (num_sge > rds_iwdev->max_sge) { |
@@ -893,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
893 | if (prev) | 894 | if (prev) |
894 | prev->s_wr.next = &send->s_wr; | 895 | prev->s_wr.next = &send->s_wr; |
895 | 896 | ||
896 | for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { | 897 | for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { |
897 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); | 898 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); |
898 | 899 | ||
899 | if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) | 900 | if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) |
@@ -927,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
927 | } | 928 | } |
928 | 929 | ||
929 | /* if we finished the message then send completion owns it */ | 930 | /* if we finished the message then send completion owns it */ |
930 | if (scat == &op->r_sg[op->r_count]) | 931 | if (scat == &op->op_sg[op->op_count]) |
931 | first->s_wr.send_flags = IB_SEND_SIGNALED; | 932 | first->s_wr.send_flags = IB_SEND_SIGNALED; |
932 | 933 | ||
933 | if (i < work_alloc) { | 934 | if (i < work_alloc) { |
@@ -941,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | |||
941 | * adapters do not allow using the lkey for this at all. To bypass this use a | 942 | * adapters do not allow using the lkey for this at all. To bypass this use a |
942 | * fastreg_mr (or possibly a dma_mr) | 943 | * fastreg_mr (or possibly a dma_mr) |
943 | */ | 944 | */ |
944 | if (!op->r_write) { | 945 | if (!op->op_write) { |
945 | rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], | 946 | rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], |
946 | op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); | 947 | op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); |
947 | work_alloc++; | 948 | work_alloc++; |
948 | } | 949 | } |
949 | 950 | ||
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c index 1c4428a61a0..23e3a9a26aa 100644 --- a/net/rds/iw_sysctl.c +++ b/net/rds/iw_sysctl.c | |||
@@ -122,10 +122,10 @@ void rds_iw_sysctl_exit(void) | |||
122 | unregister_sysctl_table(rds_iw_sysctl_hdr); | 122 | unregister_sysctl_table(rds_iw_sysctl_hdr); |
123 | } | 123 | } |
124 | 124 | ||
125 | int __init rds_iw_sysctl_init(void) | 125 | int rds_iw_sysctl_init(void) |
126 | { | 126 | { |
127 | rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); | 127 | rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); |
128 | if (rds_iw_sysctl_hdr == NULL) | 128 | if (!rds_iw_sysctl_hdr) |
129 | return -ENOMEM; | 129 | return -ENOMEM; |
130 | return 0; | 130 | return 0; |
131 | } | 131 | } |
diff --git a/net/rds/loop.c b/net/rds/loop.c index dd987937945..c390156b426 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c | |||
@@ -61,10 +61,17 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
61 | unsigned int hdr_off, unsigned int sg, | 61 | unsigned int hdr_off, unsigned int sg, |
62 | unsigned int off) | 62 | unsigned int off) |
63 | { | 63 | { |
64 | /* Do not send cong updates to loopback */ | ||
65 | if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { | ||
66 | rds_cong_map_updated(conn->c_fcong, ~(u64) 0); | ||
67 | return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; | ||
68 | } | ||
69 | |||
64 | BUG_ON(hdr_off || sg || off); | 70 | BUG_ON(hdr_off || sg || off); |
65 | 71 | ||
66 | rds_inc_init(&rm->m_inc, conn, conn->c_laddr); | 72 | rds_inc_init(&rm->m_inc, conn, conn->c_laddr); |
67 | rds_message_addref(rm); /* for the inc */ | 73 | /* For the embedded inc. Matching put is in loop_inc_free() */ |
74 | rds_message_addref(rm); | ||
68 | 75 | ||
69 | rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, | 76 | rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, |
70 | GFP_KERNEL, KM_USER0); | 77 | GFP_KERNEL, KM_USER0); |
@@ -77,16 +84,14 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
77 | return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); | 84 | return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); |
78 | } | 85 | } |
79 | 86 | ||
80 | static int rds_loop_xmit_cong_map(struct rds_connection *conn, | 87 | /* |
81 | struct rds_cong_map *map, | 88 | * See rds_loop_xmit(). Since our inc is embedded in the rm, we |
82 | unsigned long offset) | 89 | * make sure the rm lives at least until the inc is done. |
90 | */ | ||
91 | static void rds_loop_inc_free(struct rds_incoming *inc) | ||
83 | { | 92 | { |
84 | BUG_ON(offset); | 93 | struct rds_message *rm = container_of(inc, struct rds_message, m_inc); |
85 | BUG_ON(map != conn->c_lcong); | 94 | rds_message_put(rm); |
86 | |||
87 | rds_cong_map_updated(conn->c_fcong, ~(u64) 0); | ||
88 | |||
89 | return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; | ||
90 | } | 95 | } |
91 | 96 | ||
92 | /* we need to at least give the thread something to succeed */ | 97 | /* we need to at least give the thread something to succeed */ |
@@ -112,7 +117,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) | |||
112 | unsigned long flags; | 117 | unsigned long flags; |
113 | 118 | ||
114 | lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); | 119 | lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); |
115 | if (lc == NULL) | 120 | if (!lc) |
116 | return -ENOMEM; | 121 | return -ENOMEM; |
117 | 122 | ||
118 | INIT_LIST_HEAD(&lc->loop_node); | 123 | INIT_LIST_HEAD(&lc->loop_node); |
@@ -169,14 +174,12 @@ void rds_loop_exit(void) | |||
169 | */ | 174 | */ |
170 | struct rds_transport rds_loop_transport = { | 175 | struct rds_transport rds_loop_transport = { |
171 | .xmit = rds_loop_xmit, | 176 | .xmit = rds_loop_xmit, |
172 | .xmit_cong_map = rds_loop_xmit_cong_map, | ||
173 | .recv = rds_loop_recv, | 177 | .recv = rds_loop_recv, |
174 | .conn_alloc = rds_loop_conn_alloc, | 178 | .conn_alloc = rds_loop_conn_alloc, |
175 | .conn_free = rds_loop_conn_free, | 179 | .conn_free = rds_loop_conn_free, |
176 | .conn_connect = rds_loop_conn_connect, | 180 | .conn_connect = rds_loop_conn_connect, |
177 | .conn_shutdown = rds_loop_conn_shutdown, | 181 | .conn_shutdown = rds_loop_conn_shutdown, |
178 | .inc_copy_to_user = rds_message_inc_copy_to_user, | 182 | .inc_copy_to_user = rds_message_inc_copy_to_user, |
179 | .inc_purge = rds_message_inc_purge, | 183 | .inc_free = rds_loop_inc_free, |
180 | .inc_free = rds_message_inc_free, | ||
181 | .t_name = "loopback", | 184 | .t_name = "loopback", |
182 | }; | 185 | }; |
diff --git a/net/rds/message.c b/net/rds/message.c index 9a1d67e001b..84f937f11d4 100644 --- a/net/rds/message.c +++ b/net/rds/message.c | |||
@@ -34,9 +34,6 @@ | |||
34 | #include <linux/slab.h> | 34 | #include <linux/slab.h> |
35 | 35 | ||
36 | #include "rds.h" | 36 | #include "rds.h" |
37 | #include "rdma.h" | ||
38 | |||
39 | static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq); | ||
40 | 37 | ||
41 | static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { | 38 | static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { |
42 | [RDS_EXTHDR_NONE] = 0, | 39 | [RDS_EXTHDR_NONE] = 0, |
@@ -63,29 +60,31 @@ static void rds_message_purge(struct rds_message *rm) | |||
63 | if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) | 60 | if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) |
64 | return; | 61 | return; |
65 | 62 | ||
66 | for (i = 0; i < rm->m_nents; i++) { | 63 | for (i = 0; i < rm->data.op_nents; i++) { |
67 | rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); | 64 | rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); |
68 | /* XXX will have to put_page for page refs */ | 65 | /* XXX will have to put_page for page refs */ |
69 | __free_page(sg_page(&rm->m_sg[i])); | 66 | __free_page(sg_page(&rm->data.op_sg[i])); |
70 | } | 67 | } |
71 | rm->m_nents = 0; | 68 | rm->data.op_nents = 0; |
72 | 69 | ||
73 | if (rm->m_rdma_op) | 70 | if (rm->rdma.op_active) |
74 | rds_rdma_free_op(rm->m_rdma_op); | 71 | rds_rdma_free_op(&rm->rdma); |
75 | if (rm->m_rdma_mr) | 72 | if (rm->rdma.op_rdma_mr) |
76 | rds_mr_put(rm->m_rdma_mr); | 73 | rds_mr_put(rm->rdma.op_rdma_mr); |
77 | } | ||
78 | 74 | ||
79 | void rds_message_inc_purge(struct rds_incoming *inc) | 75 | if (rm->atomic.op_active) |
80 | { | 76 | rds_atomic_free_op(&rm->atomic); |
81 | struct rds_message *rm = container_of(inc, struct rds_message, m_inc); | 77 | if (rm->atomic.op_rdma_mr) |
82 | rds_message_purge(rm); | 78 | rds_mr_put(rm->atomic.op_rdma_mr); |
83 | } | 79 | } |
84 | 80 | ||
85 | void rds_message_put(struct rds_message *rm) | 81 | void rds_message_put(struct rds_message *rm) |
86 | { | 82 | { |
87 | rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); | 83 | rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); |
88 | 84 | if (atomic_read(&rm->m_refcount) == 0) { | |
85 | printk(KERN_CRIT "danger refcount zero on %p\n", rm); | ||
86 | WARN_ON(1); | ||
87 | } | ||
89 | if (atomic_dec_and_test(&rm->m_refcount)) { | 88 | if (atomic_dec_and_test(&rm->m_refcount)) { |
90 | BUG_ON(!list_empty(&rm->m_sock_item)); | 89 | BUG_ON(!list_empty(&rm->m_sock_item)); |
91 | BUG_ON(!list_empty(&rm->m_conn_item)); | 90 | BUG_ON(!list_empty(&rm->m_conn_item)); |
@@ -96,12 +95,6 @@ void rds_message_put(struct rds_message *rm) | |||
96 | } | 95 | } |
97 | EXPORT_SYMBOL_GPL(rds_message_put); | 96 | EXPORT_SYMBOL_GPL(rds_message_put); |
98 | 97 | ||
99 | void rds_message_inc_free(struct rds_incoming *inc) | ||
100 | { | ||
101 | struct rds_message *rm = container_of(inc, struct rds_message, m_inc); | ||
102 | rds_message_put(rm); | ||
103 | } | ||
104 | |||
105 | void rds_message_populate_header(struct rds_header *hdr, __be16 sport, | 98 | void rds_message_populate_header(struct rds_header *hdr, __be16 sport, |
106 | __be16 dport, u64 seq) | 99 | __be16 dport, u64 seq) |
107 | { | 100 | { |
@@ -214,41 +207,68 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o | |||
214 | } | 207 | } |
215 | EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); | 208 | EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); |
216 | 209 | ||
217 | struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) | 210 | /* |
211 | * Each rds_message is allocated with extra space for the scatterlist entries | ||
212 | * rds ops will need. This is to minimize memory allocation count. Then, each rds op | ||
213 | * can grab SGs when initializing its part of the rds_message. | ||
214 | */ | ||
215 | struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) | ||
218 | { | 216 | { |
219 | struct rds_message *rm; | 217 | struct rds_message *rm; |
220 | 218 | ||
221 | rm = kzalloc(sizeof(struct rds_message) + | 219 | rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp); |
222 | (nents * sizeof(struct scatterlist)), gfp); | ||
223 | if (!rm) | 220 | if (!rm) |
224 | goto out; | 221 | goto out; |
225 | 222 | ||
226 | if (nents) | 223 | rm->m_used_sgs = 0; |
227 | sg_init_table(rm->m_sg, nents); | 224 | rm->m_total_sgs = extra_len / sizeof(struct scatterlist); |
225 | |||
228 | atomic_set(&rm->m_refcount, 1); | 226 | atomic_set(&rm->m_refcount, 1); |
229 | INIT_LIST_HEAD(&rm->m_sock_item); | 227 | INIT_LIST_HEAD(&rm->m_sock_item); |
230 | INIT_LIST_HEAD(&rm->m_conn_item); | 228 | INIT_LIST_HEAD(&rm->m_conn_item); |
231 | spin_lock_init(&rm->m_rs_lock); | 229 | spin_lock_init(&rm->m_rs_lock); |
230 | init_waitqueue_head(&rm->m_flush_wait); | ||
232 | 231 | ||
233 | out: | 232 | out: |
234 | return rm; | 233 | return rm; |
235 | } | 234 | } |
236 | 235 | ||
236 | /* | ||
237 | * RDS ops use this to grab SG entries from the rm's sg pool. | ||
238 | */ | ||
239 | struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) | ||
240 | { | ||
241 | struct scatterlist *sg_first = (struct scatterlist *) &rm[1]; | ||
242 | struct scatterlist *sg_ret; | ||
243 | |||
244 | WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); | ||
245 | WARN_ON(!nents); | ||
246 | |||
247 | sg_ret = &sg_first[rm->m_used_sgs]; | ||
248 | sg_init_table(sg_ret, nents); | ||
249 | rm->m_used_sgs += nents; | ||
250 | |||
251 | return sg_ret; | ||
252 | } | ||
253 | |||
237 | struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) | 254 | struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) |
238 | { | 255 | { |
239 | struct rds_message *rm; | 256 | struct rds_message *rm; |
240 | unsigned int i; | 257 | unsigned int i; |
258 | int num_sgs = ceil(total_len, PAGE_SIZE); | ||
259 | int extra_bytes = num_sgs * sizeof(struct scatterlist); | ||
241 | 260 | ||
242 | rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); | 261 | rm = rds_message_alloc(extra_bytes, GFP_NOWAIT); |
243 | if (rm == NULL) | 262 | if (!rm) |
244 | return ERR_PTR(-ENOMEM); | 263 | return ERR_PTR(-ENOMEM); |
245 | 264 | ||
246 | set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); | 265 | set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); |
247 | rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); | 266 | rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); |
248 | rm->m_nents = ceil(total_len, PAGE_SIZE); | 267 | rm->data.op_nents = ceil(total_len, PAGE_SIZE); |
268 | rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); | ||
249 | 269 | ||
250 | for (i = 0; i < rm->m_nents; ++i) { | 270 | for (i = 0; i < rm->data.op_nents; ++i) { |
251 | sg_set_page(&rm->m_sg[i], | 271 | sg_set_page(&rm->data.op_sg[i], |
252 | virt_to_page(page_addrs[i]), | 272 | virt_to_page(page_addrs[i]), |
253 | PAGE_SIZE, 0); | 273 | PAGE_SIZE, 0); |
254 | } | 274 | } |
@@ -256,40 +276,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in | |||
256 | return rm; | 276 | return rm; |
257 | } | 277 | } |
258 | 278 | ||
259 | struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, | 279 | int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, |
260 | size_t total_len) | 280 | size_t total_len) |
261 | { | 281 | { |
262 | unsigned long to_copy; | 282 | unsigned long to_copy; |
263 | unsigned long iov_off; | 283 | unsigned long iov_off; |
264 | unsigned long sg_off; | 284 | unsigned long sg_off; |
265 | struct rds_message *rm; | ||
266 | struct iovec *iov; | 285 | struct iovec *iov; |
267 | struct scatterlist *sg; | 286 | struct scatterlist *sg; |
268 | int ret; | 287 | int ret = 0; |
269 | |||
270 | rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); | ||
271 | if (rm == NULL) { | ||
272 | ret = -ENOMEM; | ||
273 | goto out; | ||
274 | } | ||
275 | 288 | ||
276 | rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); | 289 | rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); |
277 | 290 | ||
278 | /* | 291 | /* |
279 | * now allocate and copy in the data payload. | 292 | * now allocate and copy in the data payload. |
280 | */ | 293 | */ |
281 | sg = rm->m_sg; | 294 | sg = rm->data.op_sg; |
282 | iov = first_iov; | 295 | iov = first_iov; |
283 | iov_off = 0; | 296 | iov_off = 0; |
284 | sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ | 297 | sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ |
285 | 298 | ||
286 | while (total_len) { | 299 | while (total_len) { |
287 | if (sg_page(sg) == NULL) { | 300 | if (!sg_page(sg)) { |
288 | ret = rds_page_remainder_alloc(sg, total_len, | 301 | ret = rds_page_remainder_alloc(sg, total_len, |
289 | GFP_HIGHUSER); | 302 | GFP_HIGHUSER); |
290 | if (ret) | 303 | if (ret) |
291 | goto out; | 304 | goto out; |
292 | rm->m_nents++; | 305 | rm->data.op_nents++; |
293 | sg_off = 0; | 306 | sg_off = 0; |
294 | } | 307 | } |
295 | 308 | ||
@@ -320,14 +333,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, | |||
320 | sg++; | 333 | sg++; |
321 | } | 334 | } |
322 | 335 | ||
323 | ret = 0; | ||
324 | out: | 336 | out: |
325 | if (ret) { | 337 | return ret; |
326 | if (rm) | ||
327 | rds_message_put(rm); | ||
328 | rm = ERR_PTR(ret); | ||
329 | } | ||
330 | return rm; | ||
331 | } | 338 | } |
332 | 339 | ||
333 | int rds_message_inc_copy_to_user(struct rds_incoming *inc, | 340 | int rds_message_inc_copy_to_user(struct rds_incoming *inc, |
@@ -348,7 +355,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, | |||
348 | 355 | ||
349 | iov = first_iov; | 356 | iov = first_iov; |
350 | iov_off = 0; | 357 | iov_off = 0; |
351 | sg = rm->m_sg; | 358 | sg = rm->data.op_sg; |
352 | vec_off = 0; | 359 | vec_off = 0; |
353 | copied = 0; | 360 | copied = 0; |
354 | 361 | ||
@@ -394,15 +401,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, | |||
394 | */ | 401 | */ |
395 | void rds_message_wait(struct rds_message *rm) | 402 | void rds_message_wait(struct rds_message *rm) |
396 | { | 403 | { |
397 | wait_event(rds_message_flush_waitq, | 404 | wait_event_interruptible(rm->m_flush_wait, |
398 | !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); | 405 | !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); |
399 | } | 406 | } |
400 | 407 | ||
401 | void rds_message_unmapped(struct rds_message *rm) | 408 | void rds_message_unmapped(struct rds_message *rm) |
402 | { | 409 | { |
403 | clear_bit(RDS_MSG_MAPPED, &rm->m_flags); | 410 | clear_bit(RDS_MSG_MAPPED, &rm->m_flags); |
404 | if (waitqueue_active(&rds_message_flush_waitq)) | 411 | wake_up_interruptible(&rm->m_flush_wait); |
405 | wake_up(&rds_message_flush_waitq); | ||
406 | } | 412 | } |
407 | EXPORT_SYMBOL_GPL(rds_message_unmapped); | 413 | EXPORT_SYMBOL_GPL(rds_message_unmapped); |
408 | 414 | ||
diff --git a/net/rds/page.c b/net/rds/page.c index 595a952d4b1..5e44f5ae789 100644 --- a/net/rds/page.c +++ b/net/rds/page.c | |||
@@ -116,7 +116,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, | |||
116 | /* jump straight to allocation if we're trying for a huge page */ | 116 | /* jump straight to allocation if we're trying for a huge page */ |
117 | if (bytes >= PAGE_SIZE) { | 117 | if (bytes >= PAGE_SIZE) { |
118 | page = alloc_page(gfp); | 118 | page = alloc_page(gfp); |
119 | if (page == NULL) { | 119 | if (!page) { |
120 | ret = -ENOMEM; | 120 | ret = -ENOMEM; |
121 | } else { | 121 | } else { |
122 | sg_set_page(scat, page, PAGE_SIZE, 0); | 122 | sg_set_page(scat, page, PAGE_SIZE, 0); |
@@ -162,7 +162,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, | |||
162 | rem = &per_cpu(rds_page_remainders, get_cpu()); | 162 | rem = &per_cpu(rds_page_remainders, get_cpu()); |
163 | local_irq_save(flags); | 163 | local_irq_save(flags); |
164 | 164 | ||
165 | if (page == NULL) { | 165 | if (!page) { |
166 | ret = -ENOMEM; | 166 | ret = -ENOMEM; |
167 | break; | 167 | break; |
168 | } | 168 | } |
@@ -186,6 +186,7 @@ out: | |||
186 | ret ? 0 : scat->length); | 186 | ret ? 0 : scat->length); |
187 | return ret; | 187 | return ret; |
188 | } | 188 | } |
189 | EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); | ||
189 | 190 | ||
190 | static int rds_page_remainder_cpu_notify(struct notifier_block *self, | 191 | static int rds_page_remainder_cpu_notify(struct notifier_block *self, |
191 | unsigned long action, void *hcpu) | 192 | unsigned long action, void *hcpu) |
diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 75fd13bb631..1a41debca1c 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c | |||
@@ -35,7 +35,7 @@ | |||
35 | #include <linux/rbtree.h> | 35 | #include <linux/rbtree.h> |
36 | #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ | 36 | #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ |
37 | 37 | ||
38 | #include "rdma.h" | 38 | #include "rds.h" |
39 | 39 | ||
40 | /* | 40 | /* |
41 | * XXX | 41 | * XXX |
@@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs) | |||
130 | { | 130 | { |
131 | struct rds_mr *mr; | 131 | struct rds_mr *mr; |
132 | struct rb_node *node; | 132 | struct rb_node *node; |
133 | unsigned long flags; | ||
133 | 134 | ||
134 | /* Release any MRs associated with this socket */ | 135 | /* Release any MRs associated with this socket */ |
136 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | ||
135 | while ((node = rb_first(&rs->rs_rdma_keys))) { | 137 | while ((node = rb_first(&rs->rs_rdma_keys))) { |
136 | mr = container_of(node, struct rds_mr, r_rb_node); | 138 | mr = container_of(node, struct rds_mr, r_rb_node); |
137 | if (mr->r_trans == rs->rs_transport) | 139 | if (mr->r_trans == rs->rs_transport) |
138 | mr->r_invalidate = 0; | 140 | mr->r_invalidate = 0; |
141 | rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); | ||
142 | RB_CLEAR_NODE(&mr->r_rb_node); | ||
143 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | ||
144 | rds_destroy_mr(mr); | ||
139 | rds_mr_put(mr); | 145 | rds_mr_put(mr); |
146 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | ||
140 | } | 147 | } |
148 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | ||
141 | 149 | ||
142 | if (rs->rs_transport && rs->rs_transport->flush_mrs) | 150 | if (rs->rs_transport && rs->rs_transport->flush_mrs) |
143 | rs->rs_transport->flush_mrs(); | 151 | rs->rs_transport->flush_mrs(); |
@@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, | |||
181 | goto out; | 189 | goto out; |
182 | } | 190 | } |
183 | 191 | ||
184 | if (rs->rs_transport->get_mr == NULL) { | 192 | if (!rs->rs_transport->get_mr) { |
185 | ret = -EOPNOTSUPP; | 193 | ret = -EOPNOTSUPP; |
186 | goto out; | 194 | goto out; |
187 | } | 195 | } |
@@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, | |||
197 | 205 | ||
198 | /* XXX clamp nr_pages to limit the size of this alloc? */ | 206 | /* XXX clamp nr_pages to limit the size of this alloc? */ |
199 | pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); | 207 | pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); |
200 | if (pages == NULL) { | 208 | if (!pages) { |
201 | ret = -ENOMEM; | 209 | ret = -ENOMEM; |
202 | goto out; | 210 | goto out; |
203 | } | 211 | } |
204 | 212 | ||
205 | mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); | 213 | mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); |
206 | if (mr == NULL) { | 214 | if (!mr) { |
207 | ret = -ENOMEM; | 215 | ret = -ENOMEM; |
208 | goto out; | 216 | goto out; |
209 | } | 217 | } |
@@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, | |||
230 | * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to | 238 | * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to |
231 | * the zero page. | 239 | * the zero page. |
232 | */ | 240 | */ |
233 | ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1); | 241 | ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); |
234 | if (ret < 0) | 242 | if (ret < 0) |
235 | goto out; | 243 | goto out; |
236 | 244 | ||
237 | nents = ret; | 245 | nents = ret; |
238 | sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); | 246 | sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); |
239 | if (sg == NULL) { | 247 | if (!sg) { |
240 | ret = -ENOMEM; | 248 | ret = -ENOMEM; |
241 | goto out; | 249 | goto out; |
242 | } | 250 | } |
@@ -406,68 +414,127 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) | |||
406 | 414 | ||
407 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | 415 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); |
408 | mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); | 416 | mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); |
409 | if (mr && (mr->r_use_once || force)) { | 417 | if (!mr) { |
418 | printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); | ||
419 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | ||
420 | return; | ||
421 | } | ||
422 | |||
423 | if (mr->r_use_once || force) { | ||
410 | rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); | 424 | rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); |
411 | RB_CLEAR_NODE(&mr->r_rb_node); | 425 | RB_CLEAR_NODE(&mr->r_rb_node); |
412 | zot_me = 1; | 426 | zot_me = 1; |
413 | } else if (mr) | 427 | } |
414 | atomic_inc(&mr->r_refcount); | ||
415 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); | 428 | spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); |
416 | 429 | ||
417 | /* May have to issue a dma_sync on this memory region. | 430 | /* May have to issue a dma_sync on this memory region. |
418 | * Note we could avoid this if the operation was a RDMA READ, | 431 | * Note we could avoid this if the operation was a RDMA READ, |
419 | * but at this point we can't tell. */ | 432 | * but at this point we can't tell. */ |
420 | if (mr != NULL) { | 433 | if (mr->r_trans->sync_mr) |
421 | if (mr->r_trans->sync_mr) | 434 | mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); |
422 | mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); | 435 | |
423 | 436 | /* If the MR was marked as invalidate, this will | |
424 | /* If the MR was marked as invalidate, this will | 437 | * trigger an async flush. */ |
425 | * trigger an async flush. */ | 438 | if (zot_me) |
426 | if (zot_me) | 439 | rds_destroy_mr(mr); |
427 | rds_destroy_mr(mr); | 440 | rds_mr_put(mr); |
428 | rds_mr_put(mr); | ||
429 | } | ||
430 | } | 441 | } |
431 | 442 | ||
432 | void rds_rdma_free_op(struct rds_rdma_op *ro) | 443 | void rds_rdma_free_op(struct rm_rdma_op *ro) |
433 | { | 444 | { |
434 | unsigned int i; | 445 | unsigned int i; |
435 | 446 | ||
436 | for (i = 0; i < ro->r_nents; i++) { | 447 | for (i = 0; i < ro->op_nents; i++) { |
437 | struct page *page = sg_page(&ro->r_sg[i]); | 448 | struct page *page = sg_page(&ro->op_sg[i]); |
438 | 449 | ||
439 | /* Mark page dirty if it was possibly modified, which | 450 | /* Mark page dirty if it was possibly modified, which |
440 | * is the case for a RDMA_READ which copies from remote | 451 | * is the case for a RDMA_READ which copies from remote |
441 | * to local memory */ | 452 | * to local memory */ |
442 | if (!ro->r_write) { | 453 | if (!ro->op_write) { |
443 | BUG_ON(in_interrupt()); | 454 | BUG_ON(irqs_disabled()); |
444 | set_page_dirty(page); | 455 | set_page_dirty(page); |
445 | } | 456 | } |
446 | put_page(page); | 457 | put_page(page); |
447 | } | 458 | } |
448 | 459 | ||
449 | kfree(ro->r_notifier); | 460 | kfree(ro->op_notifier); |
450 | kfree(ro); | 461 | ro->op_notifier = NULL; |
462 | ro->op_active = 0; | ||
463 | } | ||
464 | |||
465 | void rds_atomic_free_op(struct rm_atomic_op *ao) | ||
466 | { | ||
467 | struct page *page = sg_page(ao->op_sg); | ||
468 | |||
469 | /* Mark page dirty if it was possibly modified, which | ||
470 | * is the case for a RDMA_READ which copies from remote | ||
471 | * to local memory */ | ||
472 | set_page_dirty(page); | ||
473 | put_page(page); | ||
474 | |||
475 | kfree(ao->op_notifier); | ||
476 | ao->op_notifier = NULL; | ||
477 | ao->op_active = 0; | ||
451 | } | 478 | } |
452 | 479 | ||
480 | |||
453 | /* | 481 | /* |
454 | * args is a pointer to an in-kernel copy in the sendmsg cmsg. | 482 | * Count the number of pages needed to describe an incoming iovec. |
455 | */ | 483 | */ |
456 | static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | 484 | static int rds_rdma_pages(struct rds_rdma_args *args) |
457 | struct rds_rdma_args *args) | ||
458 | { | 485 | { |
459 | struct rds_iovec vec; | 486 | struct rds_iovec vec; |
460 | struct rds_rdma_op *op = NULL; | 487 | struct rds_iovec __user *local_vec; |
488 | unsigned int tot_pages = 0; | ||
461 | unsigned int nr_pages; | 489 | unsigned int nr_pages; |
462 | unsigned int max_pages; | 490 | unsigned int i; |
491 | |||
492 | local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; | ||
493 | |||
494 | /* figure out the number of pages in the vector */ | ||
495 | for (i = 0; i < args->nr_local; i++) { | ||
496 | if (copy_from_user(&vec, &local_vec[i], | ||
497 | sizeof(struct rds_iovec))) | ||
498 | return -EFAULT; | ||
499 | |||
500 | nr_pages = rds_pages_in_vec(&vec); | ||
501 | if (nr_pages == 0) | ||
502 | return -EINVAL; | ||
503 | |||
504 | tot_pages += nr_pages; | ||
505 | } | ||
506 | |||
507 | return tot_pages; | ||
508 | } | ||
509 | |||
510 | int rds_rdma_extra_size(struct rds_rdma_args *args) | ||
511 | { | ||
512 | return rds_rdma_pages(args) * sizeof(struct scatterlist); | ||
513 | } | ||
514 | |||
515 | /* | ||
516 | * The application asks for a RDMA transfer. | ||
517 | * Extract all arguments and set up the rdma_op | ||
518 | */ | ||
519 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
520 | struct cmsghdr *cmsg) | ||
521 | { | ||
522 | struct rds_rdma_args *args; | ||
523 | struct rds_iovec vec; | ||
524 | struct rm_rdma_op *op = &rm->rdma; | ||
525 | int nr_pages; | ||
463 | unsigned int nr_bytes; | 526 | unsigned int nr_bytes; |
464 | struct page **pages = NULL; | 527 | struct page **pages = NULL; |
465 | struct rds_iovec __user *local_vec; | 528 | struct rds_iovec __user *local_vec; |
466 | struct scatterlist *sg; | ||
467 | unsigned int nr; | 529 | unsigned int nr; |
468 | unsigned int i, j; | 530 | unsigned int i, j; |
469 | int ret; | 531 | int ret = 0; |
470 | 532 | ||
533 | if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) | ||
534 | || rm->rdma.op_active) | ||
535 | return -EINVAL; | ||
536 | |||
537 | args = CMSG_DATA(cmsg); | ||
471 | 538 | ||
472 | if (rs->rs_bound_addr == 0) { | 539 | if (rs->rs_bound_addr == 0) { |
473 | ret = -ENOTCONN; /* XXX not a great errno */ | 540 | ret = -ENOTCONN; /* XXX not a great errno */ |
@@ -479,61 +546,38 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | |||
479 | goto out; | 546 | goto out; |
480 | } | 547 | } |
481 | 548 | ||
482 | nr_pages = 0; | 549 | nr_pages = rds_rdma_pages(args); |
483 | max_pages = 0; | 550 | if (nr_pages < 0) |
484 | |||
485 | local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; | ||
486 | |||
487 | /* figure out the number of pages in the vector */ | ||
488 | for (i = 0; i < args->nr_local; i++) { | ||
489 | if (copy_from_user(&vec, &local_vec[i], | ||
490 | sizeof(struct rds_iovec))) { | ||
491 | ret = -EFAULT; | ||
492 | goto out; | ||
493 | } | ||
494 | |||
495 | nr = rds_pages_in_vec(&vec); | ||
496 | if (nr == 0) { | ||
497 | ret = -EINVAL; | ||
498 | goto out; | ||
499 | } | ||
500 | |||
501 | max_pages = max(nr, max_pages); | ||
502 | nr_pages += nr; | ||
503 | } | ||
504 | |||
505 | pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); | ||
506 | if (pages == NULL) { | ||
507 | ret = -ENOMEM; | ||
508 | goto out; | 551 | goto out; |
509 | } | ||
510 | 552 | ||
511 | op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); | 553 | pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); |
512 | if (op == NULL) { | 554 | if (!pages) { |
513 | ret = -ENOMEM; | 555 | ret = -ENOMEM; |
514 | goto out; | 556 | goto out; |
515 | } | 557 | } |
516 | 558 | ||
517 | op->r_write = !!(args->flags & RDS_RDMA_READWRITE); | 559 | op->op_write = !!(args->flags & RDS_RDMA_READWRITE); |
518 | op->r_fence = !!(args->flags & RDS_RDMA_FENCE); | 560 | op->op_fence = !!(args->flags & RDS_RDMA_FENCE); |
519 | op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); | 561 | op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); |
520 | op->r_recverr = rs->rs_recverr; | 562 | op->op_silent = !!(args->flags & RDS_RDMA_SILENT); |
563 | op->op_active = 1; | ||
564 | op->op_recverr = rs->rs_recverr; | ||
521 | WARN_ON(!nr_pages); | 565 | WARN_ON(!nr_pages); |
522 | sg_init_table(op->r_sg, nr_pages); | 566 | op->op_sg = rds_message_alloc_sgs(rm, nr_pages); |
523 | 567 | ||
524 | if (op->r_notify || op->r_recverr) { | 568 | if (op->op_notify || op->op_recverr) { |
525 | /* We allocate an uninitialized notifier here, because | 569 | /* We allocate an uninitialized notifier here, because |
526 | * we don't want to do that in the completion handler. We | 570 | * we don't want to do that in the completion handler. We |
527 | * would have to use GFP_ATOMIC there, and don't want to deal | 571 | * would have to use GFP_ATOMIC there, and don't want to deal |
528 | * with failed allocations. | 572 | * with failed allocations. |
529 | */ | 573 | */ |
530 | op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); | 574 | op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); |
531 | if (!op->r_notifier) { | 575 | if (!op->op_notifier) { |
532 | ret = -ENOMEM; | 576 | ret = -ENOMEM; |
533 | goto out; | 577 | goto out; |
534 | } | 578 | } |
535 | op->r_notifier->n_user_token = args->user_token; | 579 | op->op_notifier->n_user_token = args->user_token; |
536 | op->r_notifier->n_status = RDS_RDMA_SUCCESS; | 580 | op->op_notifier->n_status = RDS_RDMA_SUCCESS; |
537 | } | 581 | } |
538 | 582 | ||
539 | /* The cookie contains the R_Key of the remote memory region, and | 583 | /* The cookie contains the R_Key of the remote memory region, and |
@@ -543,15 +587,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | |||
543 | * destination address (which is really an offset into the MR) | 587 | * destination address (which is really an offset into the MR) |
544 | * FIXME: We may want to move this into ib_rdma.c | 588 | * FIXME: We may want to move this into ib_rdma.c |
545 | */ | 589 | */ |
546 | op->r_key = rds_rdma_cookie_key(args->cookie); | 590 | op->op_rkey = rds_rdma_cookie_key(args->cookie); |
547 | op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); | 591 | op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); |
548 | 592 | ||
549 | nr_bytes = 0; | 593 | nr_bytes = 0; |
550 | 594 | ||
551 | rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", | 595 | rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", |
552 | (unsigned long long)args->nr_local, | 596 | (unsigned long long)args->nr_local, |
553 | (unsigned long long)args->remote_vec.addr, | 597 | (unsigned long long)args->remote_vec.addr, |
554 | op->r_key); | 598 | op->op_rkey); |
599 | |||
600 | local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; | ||
555 | 601 | ||
556 | for (i = 0; i < args->nr_local; i++) { | 602 | for (i = 0; i < args->nr_local; i++) { |
557 | if (copy_from_user(&vec, &local_vec[i], | 603 | if (copy_from_user(&vec, &local_vec[i], |
@@ -569,15 +615,10 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | |||
569 | rs->rs_user_addr = vec.addr; | 615 | rs->rs_user_addr = vec.addr; |
570 | rs->rs_user_bytes = vec.bytes; | 616 | rs->rs_user_bytes = vec.bytes; |
571 | 617 | ||
572 | /* did the user change the vec under us? */ | ||
573 | if (nr > max_pages || op->r_nents + nr > nr_pages) { | ||
574 | ret = -EINVAL; | ||
575 | goto out; | ||
576 | } | ||
577 | /* If it's a WRITE operation, we want to pin the pages for reading. | 618 | /* If it's a WRITE operation, we want to pin the pages for reading. |
578 | * If it's a READ operation, we need to pin the pages for writing. | 619 | * If it's a READ operation, we need to pin the pages for writing. |
579 | */ | 620 | */ |
580 | ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write); | 621 | ret = rds_pin_pages(vec.addr, nr, pages, !op->op_write); |
581 | if (ret < 0) | 622 | if (ret < 0) |
582 | goto out; | 623 | goto out; |
583 | 624 | ||
@@ -588,8 +629,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | |||
588 | 629 | ||
589 | for (j = 0; j < nr; j++) { | 630 | for (j = 0; j < nr; j++) { |
590 | unsigned int offset = vec.addr & ~PAGE_MASK; | 631 | unsigned int offset = vec.addr & ~PAGE_MASK; |
632 | struct scatterlist *sg; | ||
591 | 633 | ||
592 | sg = &op->r_sg[op->r_nents + j]; | 634 | sg = &op->op_sg[op->op_nents + j]; |
593 | sg_set_page(sg, pages[j], | 635 | sg_set_page(sg, pages[j], |
594 | min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), | 636 | min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), |
595 | offset); | 637 | offset); |
@@ -601,10 +643,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | |||
601 | vec.bytes -= sg->length; | 643 | vec.bytes -= sg->length; |
602 | } | 644 | } |
603 | 645 | ||
604 | op->r_nents += nr; | 646 | op->op_nents += nr; |
605 | } | 647 | } |
606 | 648 | ||
607 | |||
608 | if (nr_bytes > args->remote_vec.bytes) { | 649 | if (nr_bytes > args->remote_vec.bytes) { |
609 | rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", | 650 | rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", |
610 | nr_bytes, | 651 | nr_bytes, |
@@ -612,38 +653,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, | |||
612 | ret = -EINVAL; | 653 | ret = -EINVAL; |
613 | goto out; | 654 | goto out; |
614 | } | 655 | } |
615 | op->r_bytes = nr_bytes; | 656 | op->op_bytes = nr_bytes; |
616 | 657 | ||
617 | ret = 0; | 658 | ret = 0; |
618 | out: | 659 | out: |
619 | kfree(pages); | 660 | kfree(pages); |
620 | if (ret) { | 661 | if (ret) |
621 | if (op) | 662 | rds_rdma_free_op(op); |
622 | rds_rdma_free_op(op); | ||
623 | op = ERR_PTR(ret); | ||
624 | } | ||
625 | return op; | ||
626 | } | ||
627 | |||
628 | /* | ||
629 | * The application asks for a RDMA transfer. | ||
630 | * Extract all arguments and set up the rdma_op | ||
631 | */ | ||
632 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
633 | struct cmsghdr *cmsg) | ||
634 | { | ||
635 | struct rds_rdma_op *op; | ||
636 | |||
637 | if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || | ||
638 | rm->m_rdma_op != NULL) | ||
639 | return -EINVAL; | ||
640 | 663 | ||
641 | op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); | ||
642 | if (IS_ERR(op)) | ||
643 | return PTR_ERR(op); | ||
644 | rds_stats_inc(s_send_rdma); | 664 | rds_stats_inc(s_send_rdma); |
645 | rm->m_rdma_op = op; | 665 | |
646 | return 0; | 666 | return ret; |
647 | } | 667 | } |
648 | 668 | ||
649 | /* | 669 | /* |
@@ -673,7 +693,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, | |||
673 | 693 | ||
674 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); | 694 | spin_lock_irqsave(&rs->rs_rdma_lock, flags); |
675 | mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); | 695 | mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); |
676 | if (mr == NULL) | 696 | if (!mr) |
677 | err = -EINVAL; /* invalid r_key */ | 697 | err = -EINVAL; /* invalid r_key */ |
678 | else | 698 | else |
679 | atomic_inc(&mr->r_refcount); | 699 | atomic_inc(&mr->r_refcount); |
@@ -681,7 +701,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, | |||
681 | 701 | ||
682 | if (mr) { | 702 | if (mr) { |
683 | mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); | 703 | mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); |
684 | rm->m_rdma_mr = mr; | 704 | rm->rdma.op_rdma_mr = mr; |
685 | } | 705 | } |
686 | return err; | 706 | return err; |
687 | } | 707 | } |
@@ -699,5 +719,98 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, | |||
699 | rm->m_rdma_cookie != 0) | 719 | rm->m_rdma_cookie != 0) |
700 | return -EINVAL; | 720 | return -EINVAL; |
701 | 721 | ||
702 | return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr); | 722 | return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); |
723 | } | ||
724 | |||
725 | /* | ||
726 | * Fill in rds_message for an atomic request. | ||
727 | */ | ||
728 | int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, | ||
729 | struct cmsghdr *cmsg) | ||
730 | { | ||
731 | struct page *page = NULL; | ||
732 | struct rds_atomic_args *args; | ||
733 | int ret = 0; | ||
734 | |||
735 | if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) | ||
736 | || rm->atomic.op_active) | ||
737 | return -EINVAL; | ||
738 | |||
739 | args = CMSG_DATA(cmsg); | ||
740 | |||
741 | /* Nonmasked & masked cmsg ops converted to masked hw ops */ | ||
742 | switch (cmsg->cmsg_type) { | ||
743 | case RDS_CMSG_ATOMIC_FADD: | ||
744 | rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; | ||
745 | rm->atomic.op_m_fadd.add = args->fadd.add; | ||
746 | rm->atomic.op_m_fadd.nocarry_mask = 0; | ||
747 | break; | ||
748 | case RDS_CMSG_MASKED_ATOMIC_FADD: | ||
749 | rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; | ||
750 | rm->atomic.op_m_fadd.add = args->m_fadd.add; | ||
751 | rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; | ||
752 | break; | ||
753 | case RDS_CMSG_ATOMIC_CSWP: | ||
754 | rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; | ||
755 | rm->atomic.op_m_cswp.compare = args->cswp.compare; | ||
756 | rm->atomic.op_m_cswp.swap = args->cswp.swap; | ||
757 | rm->atomic.op_m_cswp.compare_mask = ~0; | ||
758 | rm->atomic.op_m_cswp.swap_mask = ~0; | ||
759 | break; | ||
760 | case RDS_CMSG_MASKED_ATOMIC_CSWP: | ||
761 | rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; | ||
762 | rm->atomic.op_m_cswp.compare = args->m_cswp.compare; | ||
763 | rm->atomic.op_m_cswp.swap = args->m_cswp.swap; | ||
764 | rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; | ||
765 | rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; | ||
766 | break; | ||
767 | default: | ||
768 | BUG(); /* should never happen */ | ||
769 | } | ||
770 | |||
771 | rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); | ||
772 | rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); | ||
773 | rm->atomic.op_active = 1; | ||
774 | rm->atomic.op_recverr = rs->rs_recverr; | ||
775 | rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); | ||
776 | |||
777 | /* verify 8 byte-aligned */ | ||
778 | if (args->local_addr & 0x7) { | ||
779 | ret = -EFAULT; | ||
780 | goto err; | ||
781 | } | ||
782 | |||
783 | ret = rds_pin_pages(args->local_addr, 1, &page, 1); | ||
784 | if (ret != 1) | ||
785 | goto err; | ||
786 | ret = 0; | ||
787 | |||
788 | sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); | ||
789 | |||
790 | if (rm->atomic.op_notify || rm->atomic.op_recverr) { | ||
791 | /* We allocate an uninitialized notifier here, because | ||
792 | * we don't want to do that in the completion handler. We | ||
793 | * would have to use GFP_ATOMIC there, and don't want to deal | ||
794 | * with failed allocations. | ||
795 | */ | ||
796 | rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); | ||
797 | if (!rm->atomic.op_notifier) { | ||
798 | ret = -ENOMEM; | ||
799 | goto err; | ||
800 | } | ||
801 | |||
802 | rm->atomic.op_notifier->n_user_token = args->user_token; | ||
803 | rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; | ||
804 | } | ||
805 | |||
806 | rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); | ||
807 | rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); | ||
808 | |||
809 | return ret; | ||
810 | err: | ||
811 | if (page) | ||
812 | put_page(page); | ||
813 | kfree(rm->atomic.op_notifier); | ||
814 | |||
815 | return ret; | ||
703 | } | 816 | } |
diff --git a/net/rds/rdma.h b/net/rds/rdma.h deleted file mode 100644 index 909c39835a5..00000000000 --- a/net/rds/rdma.h +++ /dev/null | |||
@@ -1,85 +0,0 @@ | |||
1 | #ifndef _RDS_RDMA_H | ||
2 | #define _RDS_RDMA_H | ||
3 | |||
4 | #include <linux/rbtree.h> | ||
5 | #include <linux/spinlock.h> | ||
6 | #include <linux/scatterlist.h> | ||
7 | |||
8 | #include "rds.h" | ||
9 | |||
10 | struct rds_mr { | ||
11 | struct rb_node r_rb_node; | ||
12 | atomic_t r_refcount; | ||
13 | u32 r_key; | ||
14 | |||
15 | /* A copy of the creation flags */ | ||
16 | unsigned int r_use_once:1; | ||
17 | unsigned int r_invalidate:1; | ||
18 | unsigned int r_write:1; | ||
19 | |||
20 | /* This is for RDS_MR_DEAD. | ||
21 | * It would be nice & consistent to make this part of the above | ||
22 | * bit field here, but we need to use test_and_set_bit. | ||
23 | */ | ||
24 | unsigned long r_state; | ||
25 | struct rds_sock *r_sock; /* back pointer to the socket that owns us */ | ||
26 | struct rds_transport *r_trans; | ||
27 | void *r_trans_private; | ||
28 | }; | ||
29 | |||
30 | /* Flags for mr->r_state */ | ||
31 | #define RDS_MR_DEAD 0 | ||
32 | |||
33 | struct rds_rdma_op { | ||
34 | u32 r_key; | ||
35 | u64 r_remote_addr; | ||
36 | unsigned int r_write:1; | ||
37 | unsigned int r_fence:1; | ||
38 | unsigned int r_notify:1; | ||
39 | unsigned int r_recverr:1; | ||
40 | unsigned int r_mapped:1; | ||
41 | struct rds_notifier *r_notifier; | ||
42 | unsigned int r_bytes; | ||
43 | unsigned int r_nents; | ||
44 | unsigned int r_count; | ||
45 | struct scatterlist r_sg[0]; | ||
46 | }; | ||
47 | |||
48 | static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) | ||
49 | { | ||
50 | return r_key | (((u64) offset) << 32); | ||
51 | } | ||
52 | |||
53 | static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) | ||
54 | { | ||
55 | return cookie; | ||
56 | } | ||
57 | |||
58 | static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) | ||
59 | { | ||
60 | return cookie >> 32; | ||
61 | } | ||
62 | |||
63 | int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); | ||
64 | int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); | ||
65 | int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); | ||
66 | void rds_rdma_drop_keys(struct rds_sock *rs); | ||
67 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
68 | struct cmsghdr *cmsg); | ||
69 | int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, | ||
70 | struct cmsghdr *cmsg); | ||
71 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
72 | struct cmsghdr *cmsg); | ||
73 | int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, | ||
74 | struct cmsghdr *cmsg); | ||
75 | void rds_rdma_free_op(struct rds_rdma_op *ro); | ||
76 | void rds_rdma_send_complete(struct rds_message *rm, int); | ||
77 | |||
78 | extern void __rds_put_mr_final(struct rds_mr *mr); | ||
79 | static inline void rds_mr_put(struct rds_mr *mr) | ||
80 | { | ||
81 | if (atomic_dec_and_test(&mr->r_refcount)) | ||
82 | __rds_put_mr_final(mr); | ||
83 | } | ||
84 | |||
85 | #endif | ||
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index e599ba2f950..e6ed10aee19 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c | |||
@@ -36,6 +36,34 @@ | |||
36 | 36 | ||
37 | static struct rdma_cm_id *rds_rdma_listen_id; | 37 | static struct rdma_cm_id *rds_rdma_listen_id; |
38 | 38 | ||
39 | static char *rds_cm_event_strings[] = { | ||
40 | #define RDS_CM_EVENT_STRING(foo) \ | ||
41 | [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo) | ||
42 | RDS_CM_EVENT_STRING(ADDR_RESOLVED), | ||
43 | RDS_CM_EVENT_STRING(ADDR_ERROR), | ||
44 | RDS_CM_EVENT_STRING(ROUTE_RESOLVED), | ||
45 | RDS_CM_EVENT_STRING(ROUTE_ERROR), | ||
46 | RDS_CM_EVENT_STRING(CONNECT_REQUEST), | ||
47 | RDS_CM_EVENT_STRING(CONNECT_RESPONSE), | ||
48 | RDS_CM_EVENT_STRING(CONNECT_ERROR), | ||
49 | RDS_CM_EVENT_STRING(UNREACHABLE), | ||
50 | RDS_CM_EVENT_STRING(REJECTED), | ||
51 | RDS_CM_EVENT_STRING(ESTABLISHED), | ||
52 | RDS_CM_EVENT_STRING(DISCONNECTED), | ||
53 | RDS_CM_EVENT_STRING(DEVICE_REMOVAL), | ||
54 | RDS_CM_EVENT_STRING(MULTICAST_JOIN), | ||
55 | RDS_CM_EVENT_STRING(MULTICAST_ERROR), | ||
56 | RDS_CM_EVENT_STRING(ADDR_CHANGE), | ||
57 | RDS_CM_EVENT_STRING(TIMEWAIT_EXIT), | ||
58 | #undef RDS_CM_EVENT_STRING | ||
59 | }; | ||
60 | |||
61 | static char *rds_cm_event_str(enum rdma_cm_event_type type) | ||
62 | { | ||
63 | return rds_str_array(rds_cm_event_strings, | ||
64 | ARRAY_SIZE(rds_cm_event_strings), type); | ||
65 | }; | ||
66 | |||
39 | int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, | 67 | int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, |
40 | struct rdma_cm_event *event) | 68 | struct rdma_cm_event *event) |
41 | { | 69 | { |
@@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, | |||
44 | struct rds_transport *trans; | 72 | struct rds_transport *trans; |
45 | int ret = 0; | 73 | int ret = 0; |
46 | 74 | ||
47 | rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, | 75 | rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, |
48 | event->event); | 76 | event->event, rds_cm_event_str(event->event)); |
49 | 77 | ||
50 | if (cm_id->device->node_type == RDMA_NODE_RNIC) | 78 | if (cm_id->device->node_type == RDMA_NODE_RNIC) |
51 | trans = &rds_iw_transport; | 79 | trans = &rds_iw_transport; |
@@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, | |||
109 | 137 | ||
110 | default: | 138 | default: |
111 | /* things like device disconnect? */ | 139 | /* things like device disconnect? */ |
112 | printk(KERN_ERR "RDS: unknown event %u!\n", event->event); | 140 | printk(KERN_ERR "RDS: unknown event %u (%s)!\n", |
141 | event->event, rds_cm_event_str(event->event)); | ||
113 | break; | 142 | break; |
114 | } | 143 | } |
115 | 144 | ||
@@ -117,12 +146,13 @@ out: | |||
117 | if (conn) | 146 | if (conn) |
118 | mutex_unlock(&conn->c_cm_lock); | 147 | mutex_unlock(&conn->c_cm_lock); |
119 | 148 | ||
120 | rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); | 149 | rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event, |
150 | rds_cm_event_str(event->event), ret); | ||
121 | 151 | ||
122 | return ret; | 152 | return ret; |
123 | } | 153 | } |
124 | 154 | ||
125 | static int __init rds_rdma_listen_init(void) | 155 | static int rds_rdma_listen_init(void) |
126 | { | 156 | { |
127 | struct sockaddr_in sin; | 157 | struct sockaddr_in sin; |
128 | struct rdma_cm_id *cm_id; | 158 | struct rdma_cm_id *cm_id; |
@@ -177,7 +207,7 @@ static void rds_rdma_listen_stop(void) | |||
177 | } | 207 | } |
178 | } | 208 | } |
179 | 209 | ||
180 | int __init rds_rdma_init(void) | 210 | int rds_rdma_init(void) |
181 | { | 211 | { |
182 | int ret; | 212 | int ret; |
183 | 213 | ||
diff --git a/net/rds/rds.h b/net/rds/rds.h index c224b5bb3ba..8103dcf8b97 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h | |||
@@ -80,6 +80,7 @@ enum { | |||
80 | /* Bits for c_flags */ | 80 | /* Bits for c_flags */ |
81 | #define RDS_LL_SEND_FULL 0 | 81 | #define RDS_LL_SEND_FULL 0 |
82 | #define RDS_RECONNECT_PENDING 1 | 82 | #define RDS_RECONNECT_PENDING 1 |
83 | #define RDS_IN_XMIT 2 | ||
83 | 84 | ||
84 | struct rds_connection { | 85 | struct rds_connection { |
85 | struct hlist_node c_hash_node; | 86 | struct hlist_node c_hash_node; |
@@ -91,12 +92,13 @@ struct rds_connection { | |||
91 | struct rds_cong_map *c_lcong; | 92 | struct rds_cong_map *c_lcong; |
92 | struct rds_cong_map *c_fcong; | 93 | struct rds_cong_map *c_fcong; |
93 | 94 | ||
94 | struct mutex c_send_lock; /* protect send ring */ | ||
95 | struct rds_message *c_xmit_rm; | 95 | struct rds_message *c_xmit_rm; |
96 | unsigned long c_xmit_sg; | 96 | unsigned long c_xmit_sg; |
97 | unsigned int c_xmit_hdr_off; | 97 | unsigned int c_xmit_hdr_off; |
98 | unsigned int c_xmit_data_off; | 98 | unsigned int c_xmit_data_off; |
99 | unsigned int c_xmit_atomic_sent; | ||
99 | unsigned int c_xmit_rdma_sent; | 100 | unsigned int c_xmit_rdma_sent; |
101 | unsigned int c_xmit_data_sent; | ||
100 | 102 | ||
101 | spinlock_t c_lock; /* protect msg queues */ | 103 | spinlock_t c_lock; /* protect msg queues */ |
102 | u64 c_next_tx_seq; | 104 | u64 c_next_tx_seq; |
@@ -116,11 +118,10 @@ struct rds_connection { | |||
116 | struct delayed_work c_conn_w; | 118 | struct delayed_work c_conn_w; |
117 | struct work_struct c_down_w; | 119 | struct work_struct c_down_w; |
118 | struct mutex c_cm_lock; /* protect conn state & cm */ | 120 | struct mutex c_cm_lock; /* protect conn state & cm */ |
121 | wait_queue_head_t c_waitq; | ||
119 | 122 | ||
120 | struct list_head c_map_item; | 123 | struct list_head c_map_item; |
121 | unsigned long c_map_queued; | 124 | unsigned long c_map_queued; |
122 | unsigned long c_map_offset; | ||
123 | unsigned long c_map_bytes; | ||
124 | 125 | ||
125 | unsigned int c_unacked_packets; | 126 | unsigned int c_unacked_packets; |
126 | unsigned int c_unacked_bytes; | 127 | unsigned int c_unacked_bytes; |
@@ -206,6 +207,48 @@ struct rds_incoming { | |||
206 | rds_rdma_cookie_t i_rdma_cookie; | 207 | rds_rdma_cookie_t i_rdma_cookie; |
207 | }; | 208 | }; |
208 | 209 | ||
210 | struct rds_mr { | ||
211 | struct rb_node r_rb_node; | ||
212 | atomic_t r_refcount; | ||
213 | u32 r_key; | ||
214 | |||
215 | /* A copy of the creation flags */ | ||
216 | unsigned int r_use_once:1; | ||
217 | unsigned int r_invalidate:1; | ||
218 | unsigned int r_write:1; | ||
219 | |||
220 | /* This is for RDS_MR_DEAD. | ||
221 | * It would be nice & consistent to make this part of the above | ||
222 | * bit field here, but we need to use test_and_set_bit. | ||
223 | */ | ||
224 | unsigned long r_state; | ||
225 | struct rds_sock *r_sock; /* back pointer to the socket that owns us */ | ||
226 | struct rds_transport *r_trans; | ||
227 | void *r_trans_private; | ||
228 | }; | ||
229 | |||
230 | /* Flags for mr->r_state */ | ||
231 | #define RDS_MR_DEAD 0 | ||
232 | |||
233 | static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) | ||
234 | { | ||
235 | return r_key | (((u64) offset) << 32); | ||
236 | } | ||
237 | |||
238 | static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) | ||
239 | { | ||
240 | return cookie; | ||
241 | } | ||
242 | |||
243 | static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) | ||
244 | { | ||
245 | return cookie >> 32; | ||
246 | } | ||
247 | |||
248 | /* atomic operation types */ | ||
249 | #define RDS_ATOMIC_TYPE_CSWP 0 | ||
250 | #define RDS_ATOMIC_TYPE_FADD 1 | ||
251 | |||
209 | /* | 252 | /* |
210 | * m_sock_item and m_conn_item are on lists that are serialized under | 253 | * m_sock_item and m_conn_item are on lists that are serialized under |
211 | * conn->c_lock. m_sock_item has additional meaning in that once it is empty | 254 | * conn->c_lock. m_sock_item has additional meaning in that once it is empty |
@@ -258,13 +301,71 @@ struct rds_message { | |||
258 | * -> rs->rs_lock | 301 | * -> rs->rs_lock |
259 | */ | 302 | */ |
260 | spinlock_t m_rs_lock; | 303 | spinlock_t m_rs_lock; |
304 | wait_queue_head_t m_flush_wait; | ||
305 | |||
261 | struct rds_sock *m_rs; | 306 | struct rds_sock *m_rs; |
262 | struct rds_rdma_op *m_rdma_op; | 307 | |
308 | /* cookie to send to remote, in rds header */ | ||
263 | rds_rdma_cookie_t m_rdma_cookie; | 309 | rds_rdma_cookie_t m_rdma_cookie; |
264 | struct rds_mr *m_rdma_mr; | 310 | |
265 | unsigned int m_nents; | 311 | unsigned int m_used_sgs; |
266 | unsigned int m_count; | 312 | unsigned int m_total_sgs; |
267 | struct scatterlist m_sg[0]; | 313 | |
314 | void *m_final_op; | ||
315 | |||
316 | struct { | ||
317 | struct rm_atomic_op { | ||
318 | int op_type; | ||
319 | union { | ||
320 | struct { | ||
321 | uint64_t compare; | ||
322 | uint64_t swap; | ||
323 | uint64_t compare_mask; | ||
324 | uint64_t swap_mask; | ||
325 | } op_m_cswp; | ||
326 | struct { | ||
327 | uint64_t add; | ||
328 | uint64_t nocarry_mask; | ||
329 | } op_m_fadd; | ||
330 | }; | ||
331 | |||
332 | u32 op_rkey; | ||
333 | u64 op_remote_addr; | ||
334 | unsigned int op_notify:1; | ||
335 | unsigned int op_recverr:1; | ||
336 | unsigned int op_mapped:1; | ||
337 | unsigned int op_silent:1; | ||
338 | unsigned int op_active:1; | ||
339 | struct scatterlist *op_sg; | ||
340 | struct rds_notifier *op_notifier; | ||
341 | |||
342 | struct rds_mr *op_rdma_mr; | ||
343 | } atomic; | ||
344 | struct rm_rdma_op { | ||
345 | u32 op_rkey; | ||
346 | u64 op_remote_addr; | ||
347 | unsigned int op_write:1; | ||
348 | unsigned int op_fence:1; | ||
349 | unsigned int op_notify:1; | ||
350 | unsigned int op_recverr:1; | ||
351 | unsigned int op_mapped:1; | ||
352 | unsigned int op_silent:1; | ||
353 | unsigned int op_active:1; | ||
354 | unsigned int op_bytes; | ||
355 | unsigned int op_nents; | ||
356 | unsigned int op_count; | ||
357 | struct scatterlist *op_sg; | ||
358 | struct rds_notifier *op_notifier; | ||
359 | |||
360 | struct rds_mr *op_rdma_mr; | ||
361 | } rdma; | ||
362 | struct rm_data_op { | ||
363 | unsigned int op_active:1; | ||
364 | unsigned int op_nents; | ||
365 | unsigned int op_count; | ||
366 | struct scatterlist *op_sg; | ||
367 | } data; | ||
368 | }; | ||
268 | }; | 369 | }; |
269 | 370 | ||
270 | /* | 371 | /* |
@@ -305,10 +406,6 @@ struct rds_notifier { | |||
305 | * transport is responsible for other serialization, including | 406 | * transport is responsible for other serialization, including |
306 | * rds_recv_incoming(). This is called in process context but | 407 | * rds_recv_incoming(). This is called in process context but |
307 | * should try hard not to block. | 408 | * should try hard not to block. |
308 | * | ||
309 | * @xmit_cong_map: This asks the transport to send the local bitmap down the | ||
310 | * given connection. XXX get a better story about the bitmap | ||
311 | * flag and header. | ||
312 | */ | 409 | */ |
313 | 410 | ||
314 | #define RDS_TRANS_IB 0 | 411 | #define RDS_TRANS_IB 0 |
@@ -332,13 +429,11 @@ struct rds_transport { | |||
332 | void (*xmit_complete)(struct rds_connection *conn); | 429 | void (*xmit_complete)(struct rds_connection *conn); |
333 | int (*xmit)(struct rds_connection *conn, struct rds_message *rm, | 430 | int (*xmit)(struct rds_connection *conn, struct rds_message *rm, |
334 | unsigned int hdr_off, unsigned int sg, unsigned int off); | 431 | unsigned int hdr_off, unsigned int sg, unsigned int off); |
335 | int (*xmit_cong_map)(struct rds_connection *conn, | 432 | int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); |
336 | struct rds_cong_map *map, unsigned long offset); | 433 | int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); |
337 | int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op); | ||
338 | int (*recv)(struct rds_connection *conn); | 434 | int (*recv)(struct rds_connection *conn); |
339 | int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, | 435 | int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, |
340 | size_t size); | 436 | size_t size); |
341 | void (*inc_purge)(struct rds_incoming *inc); | ||
342 | void (*inc_free)(struct rds_incoming *inc); | 437 | void (*inc_free)(struct rds_incoming *inc); |
343 | 438 | ||
344 | int (*cm_handle_connect)(struct rdma_cm_id *cm_id, | 439 | int (*cm_handle_connect)(struct rdma_cm_id *cm_id, |
@@ -367,17 +462,11 @@ struct rds_sock { | |||
367 | * bound_addr used for both incoming and outgoing, no INADDR_ANY | 462 | * bound_addr used for both incoming and outgoing, no INADDR_ANY |
368 | * support. | 463 | * support. |
369 | */ | 464 | */ |
370 | struct rb_node rs_bound_node; | 465 | struct hlist_node rs_bound_node; |
371 | __be32 rs_bound_addr; | 466 | __be32 rs_bound_addr; |
372 | __be32 rs_conn_addr; | 467 | __be32 rs_conn_addr; |
373 | __be16 rs_bound_port; | 468 | __be16 rs_bound_port; |
374 | __be16 rs_conn_port; | 469 | __be16 rs_conn_port; |
375 | |||
376 | /* | ||
377 | * This is only used to communicate the transport between bind and | ||
378 | * initiating connections. All other trans use is referenced through | ||
379 | * the connection. | ||
380 | */ | ||
381 | struct rds_transport *rs_transport; | 470 | struct rds_transport *rs_transport; |
382 | 471 | ||
383 | /* | 472 | /* |
@@ -466,8 +555,8 @@ struct rds_statistics { | |||
466 | uint64_t s_recv_ping; | 555 | uint64_t s_recv_ping; |
467 | uint64_t s_send_queue_empty; | 556 | uint64_t s_send_queue_empty; |
468 | uint64_t s_send_queue_full; | 557 | uint64_t s_send_queue_full; |
469 | uint64_t s_send_sem_contention; | 558 | uint64_t s_send_lock_contention; |
470 | uint64_t s_send_sem_queue_raced; | 559 | uint64_t s_send_lock_queue_raced; |
471 | uint64_t s_send_immediate_retry; | 560 | uint64_t s_send_immediate_retry; |
472 | uint64_t s_send_delayed_retry; | 561 | uint64_t s_send_delayed_retry; |
473 | uint64_t s_send_drop_acked; | 562 | uint64_t s_send_drop_acked; |
@@ -487,6 +576,7 @@ struct rds_statistics { | |||
487 | }; | 576 | }; |
488 | 577 | ||
489 | /* af_rds.c */ | 578 | /* af_rds.c */ |
579 | char *rds_str_array(char **array, size_t elements, size_t index); | ||
490 | void rds_sock_addref(struct rds_sock *rs); | 580 | void rds_sock_addref(struct rds_sock *rs); |
491 | void rds_sock_put(struct rds_sock *rs); | 581 | void rds_sock_put(struct rds_sock *rs); |
492 | void rds_wake_sk_sleep(struct rds_sock *rs); | 582 | void rds_wake_sk_sleep(struct rds_sock *rs); |
@@ -521,15 +611,17 @@ void rds_cong_exit(void); | |||
521 | struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); | 611 | struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); |
522 | 612 | ||
523 | /* conn.c */ | 613 | /* conn.c */ |
524 | int __init rds_conn_init(void); | 614 | int rds_conn_init(void); |
525 | void rds_conn_exit(void); | 615 | void rds_conn_exit(void); |
526 | struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, | 616 | struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, |
527 | struct rds_transport *trans, gfp_t gfp); | 617 | struct rds_transport *trans, gfp_t gfp); |
528 | struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, | 618 | struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, |
529 | struct rds_transport *trans, gfp_t gfp); | 619 | struct rds_transport *trans, gfp_t gfp); |
620 | void rds_conn_shutdown(struct rds_connection *conn); | ||
530 | void rds_conn_destroy(struct rds_connection *conn); | 621 | void rds_conn_destroy(struct rds_connection *conn); |
531 | void rds_conn_reset(struct rds_connection *conn); | 622 | void rds_conn_reset(struct rds_connection *conn); |
532 | void rds_conn_drop(struct rds_connection *conn); | 623 | void rds_conn_drop(struct rds_connection *conn); |
624 | void rds_conn_connect_if_down(struct rds_connection *conn); | ||
533 | void rds_for_each_conn_info(struct socket *sock, unsigned int len, | 625 | void rds_for_each_conn_info(struct socket *sock, unsigned int len, |
534 | struct rds_info_iterator *iter, | 626 | struct rds_info_iterator *iter, |
535 | struct rds_info_lengths *lens, | 627 | struct rds_info_lengths *lens, |
@@ -566,7 +658,8 @@ rds_conn_connecting(struct rds_connection *conn) | |||
566 | 658 | ||
567 | /* message.c */ | 659 | /* message.c */ |
568 | struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); | 660 | struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); |
569 | struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, | 661 | struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); |
662 | int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, | ||
570 | size_t total_len); | 663 | size_t total_len); |
571 | struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); | 664 | struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); |
572 | void rds_message_populate_header(struct rds_header *hdr, __be16 sport, | 665 | void rds_message_populate_header(struct rds_header *hdr, __be16 sport, |
@@ -580,7 +673,6 @@ int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *vers | |||
580 | int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); | 673 | int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); |
581 | int rds_message_inc_copy_to_user(struct rds_incoming *inc, | 674 | int rds_message_inc_copy_to_user(struct rds_incoming *inc, |
582 | struct iovec *first_iov, size_t size); | 675 | struct iovec *first_iov, size_t size); |
583 | void rds_message_inc_purge(struct rds_incoming *inc); | ||
584 | void rds_message_inc_free(struct rds_incoming *inc); | 676 | void rds_message_inc_free(struct rds_incoming *inc); |
585 | void rds_message_addref(struct rds_message *rm); | 677 | void rds_message_addref(struct rds_message *rm); |
586 | void rds_message_put(struct rds_message *rm); | 678 | void rds_message_put(struct rds_message *rm); |
@@ -636,14 +728,39 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); | |||
636 | typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); | 728 | typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); |
637 | void rds_send_drop_acked(struct rds_connection *conn, u64 ack, | 729 | void rds_send_drop_acked(struct rds_connection *conn, u64 ack, |
638 | is_acked_func is_acked); | 730 | is_acked_func is_acked); |
639 | int rds_send_acked_before(struct rds_connection *conn, u64 seq); | ||
640 | void rds_send_remove_from_sock(struct list_head *messages, int status); | 731 | void rds_send_remove_from_sock(struct list_head *messages, int status); |
641 | int rds_send_pong(struct rds_connection *conn, __be16 dport); | 732 | int rds_send_pong(struct rds_connection *conn, __be16 dport); |
642 | struct rds_message *rds_send_get_message(struct rds_connection *, | 733 | struct rds_message *rds_send_get_message(struct rds_connection *, |
643 | struct rds_rdma_op *); | 734 | struct rm_rdma_op *); |
644 | 735 | ||
645 | /* rdma.c */ | 736 | /* rdma.c */ |
646 | void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); | 737 | void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); |
738 | int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); | ||
739 | int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); | ||
740 | int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); | ||
741 | void rds_rdma_drop_keys(struct rds_sock *rs); | ||
742 | int rds_rdma_extra_size(struct rds_rdma_args *args); | ||
743 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
744 | struct cmsghdr *cmsg); | ||
745 | int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, | ||
746 | struct cmsghdr *cmsg); | ||
747 | int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, | ||
748 | struct cmsghdr *cmsg); | ||
749 | int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, | ||
750 | struct cmsghdr *cmsg); | ||
751 | void rds_rdma_free_op(struct rm_rdma_op *ro); | ||
752 | void rds_atomic_free_op(struct rm_atomic_op *ao); | ||
753 | void rds_rdma_send_complete(struct rds_message *rm, int wc_status); | ||
754 | void rds_atomic_send_complete(struct rds_message *rm, int wc_status); | ||
755 | int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, | ||
756 | struct cmsghdr *cmsg); | ||
757 | |||
758 | extern void __rds_put_mr_final(struct rds_mr *mr); | ||
759 | static inline void rds_mr_put(struct rds_mr *mr) | ||
760 | { | ||
761 | if (atomic_dec_and_test(&mr->r_refcount)) | ||
762 | __rds_put_mr_final(mr); | ||
763 | } | ||
647 | 764 | ||
648 | /* stats.c */ | 765 | /* stats.c */ |
649 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); | 766 | DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); |
@@ -657,14 +774,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); | |||
657 | put_cpu(); \ | 774 | put_cpu(); \ |
658 | } while (0) | 775 | } while (0) |
659 | #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) | 776 | #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) |
660 | int __init rds_stats_init(void); | 777 | int rds_stats_init(void); |
661 | void rds_stats_exit(void); | 778 | void rds_stats_exit(void); |
662 | void rds_stats_info_copy(struct rds_info_iterator *iter, | 779 | void rds_stats_info_copy(struct rds_info_iterator *iter, |
663 | uint64_t *values, const char *const *names, | 780 | uint64_t *values, const char *const *names, |
664 | size_t nr); | 781 | size_t nr); |
665 | 782 | ||
666 | /* sysctl.c */ | 783 | /* sysctl.c */ |
667 | int __init rds_sysctl_init(void); | 784 | int rds_sysctl_init(void); |
668 | void rds_sysctl_exit(void); | 785 | void rds_sysctl_exit(void); |
669 | extern unsigned long rds_sysctl_sndbuf_min; | 786 | extern unsigned long rds_sysctl_sndbuf_min; |
670 | extern unsigned long rds_sysctl_sndbuf_default; | 787 | extern unsigned long rds_sysctl_sndbuf_default; |
@@ -678,9 +795,10 @@ extern unsigned long rds_sysctl_trace_flags; | |||
678 | extern unsigned int rds_sysctl_trace_level; | 795 | extern unsigned int rds_sysctl_trace_level; |
679 | 796 | ||
680 | /* threads.c */ | 797 | /* threads.c */ |
681 | int __init rds_threads_init(void); | 798 | int rds_threads_init(void); |
682 | void rds_threads_exit(void); | 799 | void rds_threads_exit(void); |
683 | extern struct workqueue_struct *rds_wq; | 800 | extern struct workqueue_struct *rds_wq; |
801 | void rds_queue_reconnect(struct rds_connection *conn); | ||
684 | void rds_connect_worker(struct work_struct *); | 802 | void rds_connect_worker(struct work_struct *); |
685 | void rds_shutdown_worker(struct work_struct *); | 803 | void rds_shutdown_worker(struct work_struct *); |
686 | void rds_send_worker(struct work_struct *); | 804 | void rds_send_worker(struct work_struct *); |
@@ -691,9 +809,10 @@ void rds_connect_complete(struct rds_connection *conn); | |||
691 | int rds_trans_register(struct rds_transport *trans); | 809 | int rds_trans_register(struct rds_transport *trans); |
692 | void rds_trans_unregister(struct rds_transport *trans); | 810 | void rds_trans_unregister(struct rds_transport *trans); |
693 | struct rds_transport *rds_trans_get_preferred(__be32 addr); | 811 | struct rds_transport *rds_trans_get_preferred(__be32 addr); |
812 | void rds_trans_put(struct rds_transport *trans); | ||
694 | unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, | 813 | unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, |
695 | unsigned int avail); | 814 | unsigned int avail); |
696 | int __init rds_trans_init(void); | 815 | int rds_trans_init(void); |
697 | void rds_trans_exit(void); | 816 | void rds_trans_exit(void); |
698 | 817 | ||
699 | #endif | 818 | #endif |
diff --git a/net/rds/recv.c b/net/rds/recv.c index c93588c2d55..68800f02aa3 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c | |||
@@ -36,7 +36,6 @@ | |||
36 | #include <linux/in.h> | 36 | #include <linux/in.h> |
37 | 37 | ||
38 | #include "rds.h" | 38 | #include "rds.h" |
39 | #include "rdma.h" | ||
40 | 39 | ||
41 | void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, | 40 | void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, |
42 | __be32 saddr) | 41 | __be32 saddr) |
@@ -210,7 +209,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, | |||
210 | } | 209 | } |
211 | 210 | ||
212 | rs = rds_find_bound(daddr, inc->i_hdr.h_dport); | 211 | rs = rds_find_bound(daddr, inc->i_hdr.h_dport); |
213 | if (rs == NULL) { | 212 | if (!rs) { |
214 | rds_stats_inc(s_recv_drop_no_sock); | 213 | rds_stats_inc(s_recv_drop_no_sock); |
215 | goto out; | 214 | goto out; |
216 | } | 215 | } |
@@ -251,7 +250,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) | |||
251 | { | 250 | { |
252 | unsigned long flags; | 251 | unsigned long flags; |
253 | 252 | ||
254 | if (*inc == NULL) { | 253 | if (!*inc) { |
255 | read_lock_irqsave(&rs->rs_recv_lock, flags); | 254 | read_lock_irqsave(&rs->rs_recv_lock, flags); |
256 | if (!list_empty(&rs->rs_recv_queue)) { | 255 | if (!list_empty(&rs->rs_recv_queue)) { |
257 | *inc = list_entry(rs->rs_recv_queue.next, | 256 | *inc = list_entry(rs->rs_recv_queue.next, |
@@ -334,10 +333,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) | |||
334 | 333 | ||
335 | if (msghdr) { | 334 | if (msghdr) { |
336 | cmsg.user_token = notifier->n_user_token; | 335 | cmsg.user_token = notifier->n_user_token; |
337 | cmsg.status = notifier->n_status; | 336 | cmsg.status = notifier->n_status; |
338 | 337 | ||
339 | err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, | 338 | err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, |
340 | sizeof(cmsg), &cmsg); | 339 | sizeof(cmsg), &cmsg); |
341 | if (err) | 340 | if (err) |
342 | break; | 341 | break; |
343 | } | 342 | } |
diff --git a/net/rds/send.c b/net/rds/send.c index 9c1c6bcaa6c..9b951a0ab6b 100644 --- a/net/rds/send.c +++ b/net/rds/send.c | |||
@@ -37,7 +37,6 @@ | |||
37 | #include <linux/list.h> | 37 | #include <linux/list.h> |
38 | 38 | ||
39 | #include "rds.h" | 39 | #include "rds.h" |
40 | #include "rdma.h" | ||
41 | 40 | ||
42 | /* When transmitting messages in rds_send_xmit, we need to emerge from | 41 | /* When transmitting messages in rds_send_xmit, we need to emerge from |
43 | * time to time and briefly release the CPU. Otherwise the softlock watchdog | 42 | * time to time and briefly release the CPU. Otherwise the softlock watchdog |
@@ -54,7 +53,8 @@ module_param(send_batch_count, int, 0444); | |||
54 | MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); | 53 | MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); |
55 | 54 | ||
56 | /* | 55 | /* |
57 | * Reset the send state. Caller must hold c_send_lock when calling here. | 56 | * Reset the send state. Callers must ensure that this doesn't race with |
57 | * rds_send_xmit(). | ||
58 | */ | 58 | */ |
59 | void rds_send_reset(struct rds_connection *conn) | 59 | void rds_send_reset(struct rds_connection *conn) |
60 | { | 60 | { |
@@ -62,18 +62,22 @@ void rds_send_reset(struct rds_connection *conn) | |||
62 | unsigned long flags; | 62 | unsigned long flags; |
63 | 63 | ||
64 | if (conn->c_xmit_rm) { | 64 | if (conn->c_xmit_rm) { |
65 | rm = conn->c_xmit_rm; | ||
66 | conn->c_xmit_rm = NULL; | ||
65 | /* Tell the user the RDMA op is no longer mapped by the | 67 | /* Tell the user the RDMA op is no longer mapped by the |
66 | * transport. This isn't entirely true (it's flushed out | 68 | * transport. This isn't entirely true (it's flushed out |
67 | * independently) but as the connection is down, there's | 69 | * independently) but as the connection is down, there's |
68 | * no ongoing RDMA to/from that memory */ | 70 | * no ongoing RDMA to/from that memory */ |
69 | rds_message_unmapped(conn->c_xmit_rm); | 71 | rds_message_unmapped(rm); |
70 | rds_message_put(conn->c_xmit_rm); | 72 | rds_message_put(rm); |
71 | conn->c_xmit_rm = NULL; | ||
72 | } | 73 | } |
74 | |||
73 | conn->c_xmit_sg = 0; | 75 | conn->c_xmit_sg = 0; |
74 | conn->c_xmit_hdr_off = 0; | 76 | conn->c_xmit_hdr_off = 0; |
75 | conn->c_xmit_data_off = 0; | 77 | conn->c_xmit_data_off = 0; |
78 | conn->c_xmit_atomic_sent = 0; | ||
76 | conn->c_xmit_rdma_sent = 0; | 79 | conn->c_xmit_rdma_sent = 0; |
80 | conn->c_xmit_data_sent = 0; | ||
77 | 81 | ||
78 | conn->c_map_queued = 0; | 82 | conn->c_map_queued = 0; |
79 | 83 | ||
@@ -90,6 +94,25 @@ void rds_send_reset(struct rds_connection *conn) | |||
90 | spin_unlock_irqrestore(&conn->c_lock, flags); | 94 | spin_unlock_irqrestore(&conn->c_lock, flags); |
91 | } | 95 | } |
92 | 96 | ||
97 | static int acquire_in_xmit(struct rds_connection *conn) | ||
98 | { | ||
99 | return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0; | ||
100 | } | ||
101 | |||
102 | static void release_in_xmit(struct rds_connection *conn) | ||
103 | { | ||
104 | clear_bit(RDS_IN_XMIT, &conn->c_flags); | ||
105 | smp_mb__after_clear_bit(); | ||
106 | /* | ||
107 | * We don't use wait_on_bit()/wake_up_bit() because our waking is in a | ||
108 | * hot path and finding waiters is very rare. We don't want to walk | ||
109 | * the system-wide hashed waitqueue buckets in the fast path only to | ||
110 | * almost never find waiters. | ||
111 | */ | ||
112 | if (waitqueue_active(&conn->c_waitq)) | ||
113 | wake_up_all(&conn->c_waitq); | ||
114 | } | ||
115 | |||
93 | /* | 116 | /* |
94 | * We're making the conscious trade-off here to only send one message | 117 | * We're making the conscious trade-off here to only send one message |
95 | * down the connection at a time. | 118 | * down the connection at a time. |
@@ -109,102 +132,69 @@ int rds_send_xmit(struct rds_connection *conn) | |||
109 | struct rds_message *rm; | 132 | struct rds_message *rm; |
110 | unsigned long flags; | 133 | unsigned long flags; |
111 | unsigned int tmp; | 134 | unsigned int tmp; |
112 | unsigned int send_quota = send_batch_count; | ||
113 | struct scatterlist *sg; | 135 | struct scatterlist *sg; |
114 | int ret = 0; | 136 | int ret = 0; |
115 | int was_empty = 0; | ||
116 | LIST_HEAD(to_be_dropped); | 137 | LIST_HEAD(to_be_dropped); |
117 | 138 | ||
139 | restart: | ||
140 | |||
118 | /* | 141 | /* |
119 | * sendmsg calls here after having queued its message on the send | 142 | * sendmsg calls here after having queued its message on the send |
120 | * queue. We only have one task feeding the connection at a time. If | 143 | * queue. We only have one task feeding the connection at a time. If |
121 | * another thread is already feeding the queue then we back off. This | 144 | * another thread is already feeding the queue then we back off. This |
122 | * avoids blocking the caller and trading per-connection data between | 145 | * avoids blocking the caller and trading per-connection data between |
123 | * caches per message. | 146 | * caches per message. |
124 | * | ||
125 | * The sem holder will issue a retry if they notice that someone queued | ||
126 | * a message after they stopped walking the send queue but before they | ||
127 | * dropped the sem. | ||
128 | */ | 147 | */ |
129 | if (!mutex_trylock(&conn->c_send_lock)) { | 148 | if (!acquire_in_xmit(conn)) { |
130 | rds_stats_inc(s_send_sem_contention); | 149 | rds_stats_inc(s_send_lock_contention); |
131 | ret = -ENOMEM; | 150 | ret = -ENOMEM; |
132 | goto out; | 151 | goto out; |
133 | } | 152 | } |
134 | 153 | ||
154 | /* | ||
155 | * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, | ||
156 | * we do the opposite to avoid races. | ||
157 | */ | ||
158 | if (!rds_conn_up(conn)) { | ||
159 | release_in_xmit(conn); | ||
160 | ret = 0; | ||
161 | goto out; | ||
162 | } | ||
163 | |||
135 | if (conn->c_trans->xmit_prepare) | 164 | if (conn->c_trans->xmit_prepare) |
136 | conn->c_trans->xmit_prepare(conn); | 165 | conn->c_trans->xmit_prepare(conn); |
137 | 166 | ||
138 | /* | 167 | /* |
139 | * spin trying to push headers and data down the connection until | 168 | * spin trying to push headers and data down the connection until |
140 | * the connection doens't make forward progress. | 169 | * the connection doesn't make forward progress. |
141 | */ | 170 | */ |
142 | while (--send_quota) { | 171 | while (1) { |
143 | /* | ||
144 | * See if need to send a congestion map update if we're | ||
145 | * between sending messages. The send_sem protects our sole | ||
146 | * use of c_map_offset and _bytes. | ||
147 | * Note this is used only by transports that define a special | ||
148 | * xmit_cong_map function. For all others, we create allocate | ||
149 | * a cong_map message and treat it just like any other send. | ||
150 | */ | ||
151 | if (conn->c_map_bytes) { | ||
152 | ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, | ||
153 | conn->c_map_offset); | ||
154 | if (ret <= 0) | ||
155 | break; | ||
156 | 172 | ||
157 | conn->c_map_offset += ret; | ||
158 | conn->c_map_bytes -= ret; | ||
159 | if (conn->c_map_bytes) | ||
160 | continue; | ||
161 | } | ||
162 | |||
163 | /* If we're done sending the current message, clear the | ||
164 | * offset and S/G temporaries. | ||
165 | */ | ||
166 | rm = conn->c_xmit_rm; | 173 | rm = conn->c_xmit_rm; |
167 | if (rm != NULL && | ||
168 | conn->c_xmit_hdr_off == sizeof(struct rds_header) && | ||
169 | conn->c_xmit_sg == rm->m_nents) { | ||
170 | conn->c_xmit_rm = NULL; | ||
171 | conn->c_xmit_sg = 0; | ||
172 | conn->c_xmit_hdr_off = 0; | ||
173 | conn->c_xmit_data_off = 0; | ||
174 | conn->c_xmit_rdma_sent = 0; | ||
175 | 174 | ||
176 | /* Release the reference to the previous message. */ | 175 | /* |
177 | rds_message_put(rm); | 176 | * If between sending messages, we can send a pending congestion |
178 | rm = NULL; | 177 | * map update. |
179 | } | ||
180 | |||
181 | /* If we're asked to send a cong map update, do so. | ||
182 | */ | 178 | */ |
183 | if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { | 179 | if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { |
184 | if (conn->c_trans->xmit_cong_map != NULL) { | ||
185 | conn->c_map_offset = 0; | ||
186 | conn->c_map_bytes = sizeof(struct rds_header) + | ||
187 | RDS_CONG_MAP_BYTES; | ||
188 | continue; | ||
189 | } | ||
190 | |||
191 | rm = rds_cong_update_alloc(conn); | 180 | rm = rds_cong_update_alloc(conn); |
192 | if (IS_ERR(rm)) { | 181 | if (IS_ERR(rm)) { |
193 | ret = PTR_ERR(rm); | 182 | ret = PTR_ERR(rm); |
194 | break; | 183 | break; |
195 | } | 184 | } |
185 | rm->data.op_active = 1; | ||
196 | 186 | ||
197 | conn->c_xmit_rm = rm; | 187 | conn->c_xmit_rm = rm; |
198 | } | 188 | } |
199 | 189 | ||
200 | /* | 190 | /* |
201 | * Grab the next message from the send queue, if there is one. | 191 | * If not already working on one, grab the next message. |
202 | * | 192 | * |
203 | * c_xmit_rm holds a ref while we're sending this message down | 193 | * c_xmit_rm holds a ref while we're sending this message down |
204 | * the connection. We can use this ref while holding the | 194 | * the connection. We can use this ref while holding the |
205 | * send_sem.. rds_send_reset() is serialized with it. | 195 | * send_sem.. rds_send_reset() is serialized with it. |
206 | */ | 196 | */ |
207 | if (rm == NULL) { | 197 | if (!rm) { |
208 | unsigned int len; | 198 | unsigned int len; |
209 | 199 | ||
210 | spin_lock_irqsave(&conn->c_lock, flags); | 200 | spin_lock_irqsave(&conn->c_lock, flags); |
@@ -224,10 +214,8 @@ int rds_send_xmit(struct rds_connection *conn) | |||
224 | 214 | ||
225 | spin_unlock_irqrestore(&conn->c_lock, flags); | 215 | spin_unlock_irqrestore(&conn->c_lock, flags); |
226 | 216 | ||
227 | if (rm == NULL) { | 217 | if (!rm) |
228 | was_empty = 1; | ||
229 | break; | 218 | break; |
230 | } | ||
231 | 219 | ||
232 | /* Unfortunately, the way Infiniband deals with | 220 | /* Unfortunately, the way Infiniband deals with |
233 | * RDMA to a bad MR key is by moving the entire | 221 | * RDMA to a bad MR key is by moving the entire |
@@ -236,13 +224,12 @@ int rds_send_xmit(struct rds_connection *conn) | |||
236 | * connection. | 224 | * connection. |
237 | * Therefore, we never retransmit messages with RDMA ops. | 225 | * Therefore, we never retransmit messages with RDMA ops. |
238 | */ | 226 | */ |
239 | if (rm->m_rdma_op && | 227 | if (rm->rdma.op_active && |
240 | test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { | 228 | test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { |
241 | spin_lock_irqsave(&conn->c_lock, flags); | 229 | spin_lock_irqsave(&conn->c_lock, flags); |
242 | if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) | 230 | if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) |
243 | list_move(&rm->m_conn_item, &to_be_dropped); | 231 | list_move(&rm->m_conn_item, &to_be_dropped); |
244 | spin_unlock_irqrestore(&conn->c_lock, flags); | 232 | spin_unlock_irqrestore(&conn->c_lock, flags); |
245 | rds_message_put(rm); | ||
246 | continue; | 233 | continue; |
247 | } | 234 | } |
248 | 235 | ||
@@ -263,23 +250,55 @@ int rds_send_xmit(struct rds_connection *conn) | |||
263 | conn->c_xmit_rm = rm; | 250 | conn->c_xmit_rm = rm; |
264 | } | 251 | } |
265 | 252 | ||
266 | /* | 253 | /* The transport either sends the whole rdma or none of it */ |
267 | * Try and send an rdma message. Let's see if we can | 254 | if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { |
268 | * keep this simple and require that the transport either | 255 | rm->m_final_op = &rm->rdma; |
269 | * send the whole rdma or none of it. | 256 | ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); |
270 | */ | ||
271 | if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) { | ||
272 | ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op); | ||
273 | if (ret) | 257 | if (ret) |
274 | break; | 258 | break; |
275 | conn->c_xmit_rdma_sent = 1; | 259 | conn->c_xmit_rdma_sent = 1; |
260 | |||
276 | /* The transport owns the mapped memory for now. | 261 | /* The transport owns the mapped memory for now. |
277 | * You can't unmap it while it's on the send queue */ | 262 | * You can't unmap it while it's on the send queue */ |
278 | set_bit(RDS_MSG_MAPPED, &rm->m_flags); | 263 | set_bit(RDS_MSG_MAPPED, &rm->m_flags); |
279 | } | 264 | } |
280 | 265 | ||
281 | if (conn->c_xmit_hdr_off < sizeof(struct rds_header) || | 266 | if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { |
282 | conn->c_xmit_sg < rm->m_nents) { | 267 | rm->m_final_op = &rm->atomic; |
268 | ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); | ||
269 | if (ret) | ||
270 | break; | ||
271 | conn->c_xmit_atomic_sent = 1; | ||
272 | |||
273 | /* The transport owns the mapped memory for now. | ||
274 | * You can't unmap it while it's on the send queue */ | ||
275 | set_bit(RDS_MSG_MAPPED, &rm->m_flags); | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | * A number of cases require an RDS header to be sent | ||
280 | * even if there is no data. | ||
281 | * We permit 0-byte sends; rds-ping depends on this. | ||
282 | * However, if there are exclusively attached silent ops, | ||
283 | * we skip the hdr/data send, to enable silent operation. | ||
284 | */ | ||
285 | if (rm->data.op_nents == 0) { | ||
286 | int ops_present; | ||
287 | int all_ops_are_silent = 1; | ||
288 | |||
289 | ops_present = (rm->atomic.op_active || rm->rdma.op_active); | ||
290 | if (rm->atomic.op_active && !rm->atomic.op_silent) | ||
291 | all_ops_are_silent = 0; | ||
292 | if (rm->rdma.op_active && !rm->rdma.op_silent) | ||
293 | all_ops_are_silent = 0; | ||
294 | |||
295 | if (ops_present && all_ops_are_silent | ||
296 | && !rm->m_rdma_cookie) | ||
297 | rm->data.op_active = 0; | ||
298 | } | ||
299 | |||
300 | if (rm->data.op_active && !conn->c_xmit_data_sent) { | ||
301 | rm->m_final_op = &rm->data; | ||
283 | ret = conn->c_trans->xmit(conn, rm, | 302 | ret = conn->c_trans->xmit(conn, rm, |
284 | conn->c_xmit_hdr_off, | 303 | conn->c_xmit_hdr_off, |
285 | conn->c_xmit_sg, | 304 | conn->c_xmit_sg, |
@@ -295,7 +314,7 @@ int rds_send_xmit(struct rds_connection *conn) | |||
295 | ret -= tmp; | 314 | ret -= tmp; |
296 | } | 315 | } |
297 | 316 | ||
298 | sg = &rm->m_sg[conn->c_xmit_sg]; | 317 | sg = &rm->data.op_sg[conn->c_xmit_sg]; |
299 | while (ret) { | 318 | while (ret) { |
300 | tmp = min_t(int, ret, sg->length - | 319 | tmp = min_t(int, ret, sg->length - |
301 | conn->c_xmit_data_off); | 320 | conn->c_xmit_data_off); |
@@ -306,49 +325,63 @@ int rds_send_xmit(struct rds_connection *conn) | |||
306 | sg++; | 325 | sg++; |
307 | conn->c_xmit_sg++; | 326 | conn->c_xmit_sg++; |
308 | BUG_ON(ret != 0 && | 327 | BUG_ON(ret != 0 && |
309 | conn->c_xmit_sg == rm->m_nents); | 328 | conn->c_xmit_sg == rm->data.op_nents); |
310 | } | 329 | } |
311 | } | 330 | } |
331 | |||
332 | if (conn->c_xmit_hdr_off == sizeof(struct rds_header) && | ||
333 | (conn->c_xmit_sg == rm->data.op_nents)) | ||
334 | conn->c_xmit_data_sent = 1; | ||
312 | } | 335 | } |
313 | } | ||
314 | 336 | ||
315 | /* Nuke any messages we decided not to retransmit. */ | 337 | /* |
316 | if (!list_empty(&to_be_dropped)) | 338 | * A rm will only take multiple times through this loop |
317 | rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); | 339 | * if there is a data op. Thus, if the data is sent (or there was |
340 | * none), then we're done with the rm. | ||
341 | */ | ||
342 | if (!rm->data.op_active || conn->c_xmit_data_sent) { | ||
343 | conn->c_xmit_rm = NULL; | ||
344 | conn->c_xmit_sg = 0; | ||
345 | conn->c_xmit_hdr_off = 0; | ||
346 | conn->c_xmit_data_off = 0; | ||
347 | conn->c_xmit_rdma_sent = 0; | ||
348 | conn->c_xmit_atomic_sent = 0; | ||
349 | conn->c_xmit_data_sent = 0; | ||
350 | |||
351 | rds_message_put(rm); | ||
352 | } | ||
353 | } | ||
318 | 354 | ||
319 | if (conn->c_trans->xmit_complete) | 355 | if (conn->c_trans->xmit_complete) |
320 | conn->c_trans->xmit_complete(conn); | 356 | conn->c_trans->xmit_complete(conn); |
321 | 357 | ||
322 | /* | 358 | release_in_xmit(conn); |
323 | * We might be racing with another sender who queued a message but | ||
324 | * backed off on noticing that we held the c_send_lock. If we check | ||
325 | * for queued messages after dropping the sem then either we'll | ||
326 | * see the queued message or the queuer will get the sem. If we | ||
327 | * notice the queued message then we trigger an immediate retry. | ||
328 | * | ||
329 | * We need to be careful only to do this when we stopped processing | ||
330 | * the send queue because it was empty. It's the only way we | ||
331 | * stop processing the loop when the transport hasn't taken | ||
332 | * responsibility for forward progress. | ||
333 | */ | ||
334 | mutex_unlock(&conn->c_send_lock); | ||
335 | 359 | ||
336 | if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { | 360 | /* Nuke any messages we decided not to retransmit. */ |
337 | /* We exhausted the send quota, but there's work left to | 361 | if (!list_empty(&to_be_dropped)) { |
338 | * do. Return and (re-)schedule the send worker. | 362 | /* irqs on here, so we can put(), unlike above */ |
339 | */ | 363 | list_for_each_entry(rm, &to_be_dropped, m_conn_item) |
340 | ret = -EAGAIN; | 364 | rds_message_put(rm); |
365 | rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); | ||
341 | } | 366 | } |
342 | 367 | ||
343 | if (ret == 0 && was_empty) { | 368 | /* |
344 | /* A simple bit test would be way faster than taking the | 369 | * Other senders can queue a message after we last test the send queue |
345 | * spin lock */ | 370 | * but before we clear RDS_IN_XMIT. In that case they'd back off and |
346 | spin_lock_irqsave(&conn->c_lock, flags); | 371 | * not try and send their newly queued message. We need to check the |
372 | * send queue after having cleared RDS_IN_XMIT so that their message | ||
373 | * doesn't get stuck on the send queue. | ||
374 | * | ||
375 | * If the transport cannot continue (i.e. ret != 0), then it must | ||
376 | * call us when more room is available, such as from the tx | ||
377 | * completion handler. | ||
378 | */ | ||
379 | if (ret == 0) { | ||
380 | smp_mb(); | ||
347 | if (!list_empty(&conn->c_send_queue)) { | 381 | if (!list_empty(&conn->c_send_queue)) { |
348 | rds_stats_inc(s_send_sem_queue_raced); | 382 | rds_stats_inc(s_send_lock_queue_raced); |
349 | ret = -EAGAIN; | 383 | goto restart; |
350 | } | 384 | } |
351 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
352 | } | 385 | } |
353 | out: | 386 | out: |
354 | return ret; | 387 | return ret; |
@@ -376,52 +409,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, | |||
376 | } | 409 | } |
377 | 410 | ||
378 | /* | 411 | /* |
379 | * Returns true if there are no messages on the send and retransmit queues | 412 | * This is pretty similar to what happens below in the ACK |
380 | * which have a sequence number greater than or equal to the given sequence | 413 | * handling code - except that we call here as soon as we get |
381 | * number. | 414 | * the IB send completion on the RDMA op and the accompanying |
415 | * message. | ||
382 | */ | 416 | */ |
383 | int rds_send_acked_before(struct rds_connection *conn, u64 seq) | 417 | void rds_rdma_send_complete(struct rds_message *rm, int status) |
384 | { | 418 | { |
385 | struct rds_message *rm, *tmp; | 419 | struct rds_sock *rs = NULL; |
386 | int ret = 1; | 420 | struct rm_rdma_op *ro; |
421 | struct rds_notifier *notifier; | ||
422 | unsigned long flags; | ||
387 | 423 | ||
388 | spin_lock(&conn->c_lock); | 424 | spin_lock_irqsave(&rm->m_rs_lock, flags); |
389 | 425 | ||
390 | list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { | 426 | ro = &rm->rdma; |
391 | if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) | 427 | if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && |
392 | ret = 0; | 428 | ro->op_active && ro->op_notify && ro->op_notifier) { |
393 | break; | 429 | notifier = ro->op_notifier; |
394 | } | 430 | rs = rm->m_rs; |
431 | sock_hold(rds_rs_to_sk(rs)); | ||
395 | 432 | ||
396 | list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { | 433 | notifier->n_status = status; |
397 | if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) | 434 | spin_lock(&rs->rs_lock); |
398 | ret = 0; | 435 | list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); |
399 | break; | 436 | spin_unlock(&rs->rs_lock); |
437 | |||
438 | ro->op_notifier = NULL; | ||
400 | } | 439 | } |
401 | 440 | ||
402 | spin_unlock(&conn->c_lock); | 441 | spin_unlock_irqrestore(&rm->m_rs_lock, flags); |
403 | 442 | ||
404 | return ret; | 443 | if (rs) { |
444 | rds_wake_sk_sleep(rs); | ||
445 | sock_put(rds_rs_to_sk(rs)); | ||
446 | } | ||
405 | } | 447 | } |
448 | EXPORT_SYMBOL_GPL(rds_rdma_send_complete); | ||
406 | 449 | ||
407 | /* | 450 | /* |
408 | * This is pretty similar to what happens below in the ACK | 451 | * Just like above, except looks at atomic op |
409 | * handling code - except that we call here as soon as we get | ||
410 | * the IB send completion on the RDMA op and the accompanying | ||
411 | * message. | ||
412 | */ | 452 | */ |
413 | void rds_rdma_send_complete(struct rds_message *rm, int status) | 453 | void rds_atomic_send_complete(struct rds_message *rm, int status) |
414 | { | 454 | { |
415 | struct rds_sock *rs = NULL; | 455 | struct rds_sock *rs = NULL; |
416 | struct rds_rdma_op *ro; | 456 | struct rm_atomic_op *ao; |
417 | struct rds_notifier *notifier; | 457 | struct rds_notifier *notifier; |
458 | unsigned long flags; | ||
418 | 459 | ||
419 | spin_lock(&rm->m_rs_lock); | 460 | spin_lock_irqsave(&rm->m_rs_lock, flags); |
420 | 461 | ||
421 | ro = rm->m_rdma_op; | 462 | ao = &rm->atomic; |
422 | if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && | 463 | if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) |
423 | ro && ro->r_notify && ro->r_notifier) { | 464 | && ao->op_active && ao->op_notify && ao->op_notifier) { |
424 | notifier = ro->r_notifier; | 465 | notifier = ao->op_notifier; |
425 | rs = rm->m_rs; | 466 | rs = rm->m_rs; |
426 | sock_hold(rds_rs_to_sk(rs)); | 467 | sock_hold(rds_rs_to_sk(rs)); |
427 | 468 | ||
@@ -430,17 +471,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) | |||
430 | list_add_tail(&notifier->n_list, &rs->rs_notify_queue); | 471 | list_add_tail(&notifier->n_list, &rs->rs_notify_queue); |
431 | spin_unlock(&rs->rs_lock); | 472 | spin_unlock(&rs->rs_lock); |
432 | 473 | ||
433 | ro->r_notifier = NULL; | 474 | ao->op_notifier = NULL; |
434 | } | 475 | } |
435 | 476 | ||
436 | spin_unlock(&rm->m_rs_lock); | 477 | spin_unlock_irqrestore(&rm->m_rs_lock, flags); |
437 | 478 | ||
438 | if (rs) { | 479 | if (rs) { |
439 | rds_wake_sk_sleep(rs); | 480 | rds_wake_sk_sleep(rs); |
440 | sock_put(rds_rs_to_sk(rs)); | 481 | sock_put(rds_rs_to_sk(rs)); |
441 | } | 482 | } |
442 | } | 483 | } |
443 | EXPORT_SYMBOL_GPL(rds_rdma_send_complete); | 484 | EXPORT_SYMBOL_GPL(rds_atomic_send_complete); |
444 | 485 | ||
445 | /* | 486 | /* |
446 | * This is the same as rds_rdma_send_complete except we | 487 | * This is the same as rds_rdma_send_complete except we |
@@ -448,15 +489,23 @@ EXPORT_SYMBOL_GPL(rds_rdma_send_complete); | |||
448 | * socket, socket lock) and can just move the notifier. | 489 | * socket, socket lock) and can just move the notifier. |
449 | */ | 490 | */ |
450 | static inline void | 491 | static inline void |
451 | __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) | 492 | __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) |
452 | { | 493 | { |
453 | struct rds_rdma_op *ro; | 494 | struct rm_rdma_op *ro; |
495 | struct rm_atomic_op *ao; | ||
496 | |||
497 | ro = &rm->rdma; | ||
498 | if (ro->op_active && ro->op_notify && ro->op_notifier) { | ||
499 | ro->op_notifier->n_status = status; | ||
500 | list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); | ||
501 | ro->op_notifier = NULL; | ||
502 | } | ||
454 | 503 | ||
455 | ro = rm->m_rdma_op; | 504 | ao = &rm->atomic; |
456 | if (ro && ro->r_notify && ro->r_notifier) { | 505 | if (ao->op_active && ao->op_notify && ao->op_notifier) { |
457 | ro->r_notifier->n_status = status; | 506 | ao->op_notifier->n_status = status; |
458 | list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); | 507 | list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); |
459 | ro->r_notifier = NULL; | 508 | ao->op_notifier = NULL; |
460 | } | 509 | } |
461 | 510 | ||
462 | /* No need to wake the app - caller does this */ | 511 | /* No need to wake the app - caller does this */ |
@@ -468,7 +517,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status | |||
468 | * So speed is not an issue here. | 517 | * So speed is not an issue here. |
469 | */ | 518 | */ |
470 | struct rds_message *rds_send_get_message(struct rds_connection *conn, | 519 | struct rds_message *rds_send_get_message(struct rds_connection *conn, |
471 | struct rds_rdma_op *op) | 520 | struct rm_rdma_op *op) |
472 | { | 521 | { |
473 | struct rds_message *rm, *tmp, *found = NULL; | 522 | struct rds_message *rm, *tmp, *found = NULL; |
474 | unsigned long flags; | 523 | unsigned long flags; |
@@ -476,7 +525,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, | |||
476 | spin_lock_irqsave(&conn->c_lock, flags); | 525 | spin_lock_irqsave(&conn->c_lock, flags); |
477 | 526 | ||
478 | list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { | 527 | list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { |
479 | if (rm->m_rdma_op == op) { | 528 | if (&rm->rdma == op) { |
480 | atomic_inc(&rm->m_refcount); | 529 | atomic_inc(&rm->m_refcount); |
481 | found = rm; | 530 | found = rm; |
482 | goto out; | 531 | goto out; |
@@ -484,7 +533,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, | |||
484 | } | 533 | } |
485 | 534 | ||
486 | list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { | 535 | list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { |
487 | if (rm->m_rdma_op == op) { | 536 | if (&rm->rdma == op) { |
488 | atomic_inc(&rm->m_refcount); | 537 | atomic_inc(&rm->m_refcount); |
489 | found = rm; | 538 | found = rm; |
490 | break; | 539 | break; |
@@ -544,19 +593,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status) | |||
544 | spin_lock(&rs->rs_lock); | 593 | spin_lock(&rs->rs_lock); |
545 | 594 | ||
546 | if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { | 595 | if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { |
547 | struct rds_rdma_op *ro = rm->m_rdma_op; | 596 | struct rm_rdma_op *ro = &rm->rdma; |
548 | struct rds_notifier *notifier; | 597 | struct rds_notifier *notifier; |
549 | 598 | ||
550 | list_del_init(&rm->m_sock_item); | 599 | list_del_init(&rm->m_sock_item); |
551 | rds_send_sndbuf_remove(rs, rm); | 600 | rds_send_sndbuf_remove(rs, rm); |
552 | 601 | ||
553 | if (ro && ro->r_notifier && (status || ro->r_notify)) { | 602 | if (ro->op_active && ro->op_notifier && |
554 | notifier = ro->r_notifier; | 603 | (ro->op_notify || (ro->op_recverr && status))) { |
604 | notifier = ro->op_notifier; | ||
555 | list_add_tail(&notifier->n_list, | 605 | list_add_tail(&notifier->n_list, |
556 | &rs->rs_notify_queue); | 606 | &rs->rs_notify_queue); |
557 | if (!notifier->n_status) | 607 | if (!notifier->n_status) |
558 | notifier->n_status = status; | 608 | notifier->n_status = status; |
559 | rm->m_rdma_op->r_notifier = NULL; | 609 | rm->rdma.op_notifier = NULL; |
560 | } | 610 | } |
561 | was_on_sock = 1; | 611 | was_on_sock = 1; |
562 | rm->m_rs = NULL; | 612 | rm->m_rs = NULL; |
@@ -619,9 +669,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) | |||
619 | { | 669 | { |
620 | struct rds_message *rm, *tmp; | 670 | struct rds_message *rm, *tmp; |
621 | struct rds_connection *conn; | 671 | struct rds_connection *conn; |
622 | unsigned long flags, flags2; | 672 | unsigned long flags; |
623 | LIST_HEAD(list); | 673 | LIST_HEAD(list); |
624 | int wake = 0; | ||
625 | 674 | ||
626 | /* get all the messages we're dropping under the rs lock */ | 675 | /* get all the messages we're dropping under the rs lock */ |
627 | spin_lock_irqsave(&rs->rs_lock, flags); | 676 | spin_lock_irqsave(&rs->rs_lock, flags); |
@@ -631,59 +680,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) | |||
631 | dest->sin_port != rm->m_inc.i_hdr.h_dport)) | 680 | dest->sin_port != rm->m_inc.i_hdr.h_dport)) |
632 | continue; | 681 | continue; |
633 | 682 | ||
634 | wake = 1; | ||
635 | list_move(&rm->m_sock_item, &list); | 683 | list_move(&rm->m_sock_item, &list); |
636 | rds_send_sndbuf_remove(rs, rm); | 684 | rds_send_sndbuf_remove(rs, rm); |
637 | clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); | 685 | clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); |
638 | } | 686 | } |
639 | 687 | ||
640 | /* order flag updates with the rs lock */ | 688 | /* order flag updates with the rs lock */ |
641 | if (wake) | 689 | smp_mb__after_clear_bit(); |
642 | smp_mb__after_clear_bit(); | ||
643 | 690 | ||
644 | spin_unlock_irqrestore(&rs->rs_lock, flags); | 691 | spin_unlock_irqrestore(&rs->rs_lock, flags); |
645 | 692 | ||
646 | conn = NULL; | 693 | if (list_empty(&list)) |
694 | return; | ||
647 | 695 | ||
648 | /* now remove the messages from the conn list as needed */ | 696 | /* Remove the messages from the conn */ |
649 | list_for_each_entry(rm, &list, m_sock_item) { | 697 | list_for_each_entry(rm, &list, m_sock_item) { |
650 | /* We do this here rather than in the loop above, so that | ||
651 | * we don't have to nest m_rs_lock under rs->rs_lock */ | ||
652 | spin_lock_irqsave(&rm->m_rs_lock, flags2); | ||
653 | /* If this is a RDMA operation, notify the app. */ | ||
654 | spin_lock(&rs->rs_lock); | ||
655 | __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); | ||
656 | spin_unlock(&rs->rs_lock); | ||
657 | rm->m_rs = NULL; | ||
658 | spin_unlock_irqrestore(&rm->m_rs_lock, flags2); | ||
659 | 698 | ||
699 | conn = rm->m_inc.i_conn; | ||
700 | |||
701 | spin_lock_irqsave(&conn->c_lock, flags); | ||
660 | /* | 702 | /* |
661 | * If we see this flag cleared then we're *sure* that someone | 703 | * Maybe someone else beat us to removing rm from the conn. |
662 | * else beat us to removing it from the conn. If we race | 704 | * If we race with their flag update we'll get the lock and |
663 | * with their flag update we'll get the lock and then really | 705 | * then really see that the flag has been cleared. |
664 | * see that the flag has been cleared. | ||
665 | */ | 706 | */ |
666 | if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) | 707 | if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { |
708 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
667 | continue; | 709 | continue; |
668 | |||
669 | if (conn != rm->m_inc.i_conn) { | ||
670 | if (conn) | ||
671 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
672 | conn = rm->m_inc.i_conn; | ||
673 | spin_lock_irqsave(&conn->c_lock, flags); | ||
674 | } | 710 | } |
711 | list_del_init(&rm->m_conn_item); | ||
712 | spin_unlock_irqrestore(&conn->c_lock, flags); | ||
675 | 713 | ||
676 | if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { | 714 | /* |
677 | list_del_init(&rm->m_conn_item); | 715 | * Couldn't grab m_rs_lock in top loop (lock ordering), |
678 | rds_message_put(rm); | 716 | * but we can now. |
679 | } | 717 | */ |
680 | } | 718 | spin_lock_irqsave(&rm->m_rs_lock, flags); |
681 | 719 | ||
682 | if (conn) | 720 | spin_lock(&rs->rs_lock); |
683 | spin_unlock_irqrestore(&conn->c_lock, flags); | 721 | __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); |
722 | spin_unlock(&rs->rs_lock); | ||
684 | 723 | ||
685 | if (wake) | 724 | rm->m_rs = NULL; |
686 | rds_wake_sk_sleep(rs); | 725 | spin_unlock_irqrestore(&rm->m_rs_lock, flags); |
726 | |||
727 | rds_message_put(rm); | ||
728 | } | ||
729 | |||
730 | rds_wake_sk_sleep(rs); | ||
687 | 731 | ||
688 | while (!list_empty(&list)) { | 732 | while (!list_empty(&list)) { |
689 | rm = list_entry(list.next, struct rds_message, m_sock_item); | 733 | rm = list_entry(list.next, struct rds_message, m_sock_item); |
@@ -763,6 +807,63 @@ out: | |||
763 | return *queued; | 807 | return *queued; |
764 | } | 808 | } |
765 | 809 | ||
810 | /* | ||
811 | * rds_message is getting to be quite complicated, and we'd like to allocate | ||
812 | * it all in one go. This figures out how big it needs to be up front. | ||
813 | */ | ||
814 | static int rds_rm_size(struct msghdr *msg, int data_len) | ||
815 | { | ||
816 | struct cmsghdr *cmsg; | ||
817 | int size = 0; | ||
818 | int cmsg_groups = 0; | ||
819 | int retval; | ||
820 | |||
821 | for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { | ||
822 | if (!CMSG_OK(msg, cmsg)) | ||
823 | return -EINVAL; | ||
824 | |||
825 | if (cmsg->cmsg_level != SOL_RDS) | ||
826 | continue; | ||
827 | |||
828 | switch (cmsg->cmsg_type) { | ||
829 | case RDS_CMSG_RDMA_ARGS: | ||
830 | cmsg_groups |= 1; | ||
831 | retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); | ||
832 | if (retval < 0) | ||
833 | return retval; | ||
834 | size += retval; | ||
835 | |||
836 | break; | ||
837 | |||
838 | case RDS_CMSG_RDMA_DEST: | ||
839 | case RDS_CMSG_RDMA_MAP: | ||
840 | cmsg_groups |= 2; | ||
841 | /* these are valid but do no add any size */ | ||
842 | break; | ||
843 | |||
844 | case RDS_CMSG_ATOMIC_CSWP: | ||
845 | case RDS_CMSG_ATOMIC_FADD: | ||
846 | case RDS_CMSG_MASKED_ATOMIC_CSWP: | ||
847 | case RDS_CMSG_MASKED_ATOMIC_FADD: | ||
848 | cmsg_groups |= 1; | ||
849 | size += sizeof(struct scatterlist); | ||
850 | break; | ||
851 | |||
852 | default: | ||
853 | return -EINVAL; | ||
854 | } | ||
855 | |||
856 | } | ||
857 | |||
858 | size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); | ||
859 | |||
860 | /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ | ||
861 | if (cmsg_groups == 3) | ||
862 | return -EINVAL; | ||
863 | |||
864 | return size; | ||
865 | } | ||
866 | |||
766 | static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, | 867 | static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, |
767 | struct msghdr *msg, int *allocated_mr) | 868 | struct msghdr *msg, int *allocated_mr) |
768 | { | 869 | { |
@@ -777,7 +878,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, | |||
777 | continue; | 878 | continue; |
778 | 879 | ||
779 | /* As a side effect, RDMA_DEST and RDMA_MAP will set | 880 | /* As a side effect, RDMA_DEST and RDMA_MAP will set |
780 | * rm->m_rdma_cookie and rm->m_rdma_mr. | 881 | * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr. |
781 | */ | 882 | */ |
782 | switch (cmsg->cmsg_type) { | 883 | switch (cmsg->cmsg_type) { |
783 | case RDS_CMSG_RDMA_ARGS: | 884 | case RDS_CMSG_RDMA_ARGS: |
@@ -793,6 +894,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, | |||
793 | if (!ret) | 894 | if (!ret) |
794 | *allocated_mr = 1; | 895 | *allocated_mr = 1; |
795 | break; | 896 | break; |
897 | case RDS_CMSG_ATOMIC_CSWP: | ||
898 | case RDS_CMSG_ATOMIC_FADD: | ||
899 | case RDS_CMSG_MASKED_ATOMIC_CSWP: | ||
900 | case RDS_CMSG_MASKED_ATOMIC_FADD: | ||
901 | ret = rds_cmsg_atomic(rs, rm, cmsg); | ||
902 | break; | ||
796 | 903 | ||
797 | default: | 904 | default: |
798 | return -EINVAL; | 905 | return -EINVAL; |
@@ -850,13 +957,26 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, | |||
850 | goto out; | 957 | goto out; |
851 | } | 958 | } |
852 | 959 | ||
853 | rm = rds_message_copy_from_user(msg->msg_iov, payload_len); | 960 | /* size of rm including all sgs */ |
854 | if (IS_ERR(rm)) { | 961 | ret = rds_rm_size(msg, payload_len); |
855 | ret = PTR_ERR(rm); | 962 | if (ret < 0) |
856 | rm = NULL; | 963 | goto out; |
964 | |||
965 | rm = rds_message_alloc(ret, GFP_KERNEL); | ||
966 | if (!rm) { | ||
967 | ret = -ENOMEM; | ||
857 | goto out; | 968 | goto out; |
858 | } | 969 | } |
859 | 970 | ||
971 | /* Attach data to the rm */ | ||
972 | if (payload_len) { | ||
973 | rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); | ||
974 | ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); | ||
975 | if (ret) | ||
976 | goto out; | ||
977 | } | ||
978 | rm->data.op_active = 1; | ||
979 | |||
860 | rm->m_daddr = daddr; | 980 | rm->m_daddr = daddr; |
861 | 981 | ||
862 | /* rds_conn_create has a spinlock that runs with IRQ off. | 982 | /* rds_conn_create has a spinlock that runs with IRQ off. |
@@ -879,22 +999,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, | |||
879 | if (ret) | 999 | if (ret) |
880 | goto out; | 1000 | goto out; |
881 | 1001 | ||
882 | if ((rm->m_rdma_cookie || rm->m_rdma_op) && | 1002 | if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { |
883 | conn->c_trans->xmit_rdma == NULL) { | ||
884 | if (printk_ratelimit()) | 1003 | if (printk_ratelimit()) |
885 | printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", | 1004 | printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", |
886 | rm->m_rdma_op, conn->c_trans->xmit_rdma); | 1005 | &rm->rdma, conn->c_trans->xmit_rdma); |
887 | ret = -EOPNOTSUPP; | 1006 | ret = -EOPNOTSUPP; |
888 | goto out; | 1007 | goto out; |
889 | } | 1008 | } |
890 | 1009 | ||
891 | /* If the connection is down, trigger a connect. We may | 1010 | if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { |
892 | * have scheduled a delayed reconnect however - in this case | 1011 | if (printk_ratelimit()) |
893 | * we should not interfere. | 1012 | printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", |
894 | */ | 1013 | &rm->atomic, conn->c_trans->xmit_atomic); |
895 | if (rds_conn_state(conn) == RDS_CONN_DOWN && | 1014 | ret = -EOPNOTSUPP; |
896 | !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) | 1015 | goto out; |
897 | queue_delayed_work(rds_wq, &conn->c_conn_w, 0); | 1016 | } |
1017 | |||
1018 | rds_conn_connect_if_down(conn); | ||
898 | 1019 | ||
899 | ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); | 1020 | ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); |
900 | if (ret) { | 1021 | if (ret) { |
@@ -938,7 +1059,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, | |||
938 | rds_stats_inc(s_send_queued); | 1059 | rds_stats_inc(s_send_queued); |
939 | 1060 | ||
940 | if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) | 1061 | if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) |
941 | rds_send_worker(&conn->c_send_w.work); | 1062 | rds_send_xmit(conn); |
942 | 1063 | ||
943 | rds_message_put(rm); | 1064 | rds_message_put(rm); |
944 | return payload_len; | 1065 | return payload_len; |
@@ -966,20 +1087,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) | |||
966 | int ret = 0; | 1087 | int ret = 0; |
967 | 1088 | ||
968 | rm = rds_message_alloc(0, GFP_ATOMIC); | 1089 | rm = rds_message_alloc(0, GFP_ATOMIC); |
969 | if (rm == NULL) { | 1090 | if (!rm) { |
970 | ret = -ENOMEM; | 1091 | ret = -ENOMEM; |
971 | goto out; | 1092 | goto out; |
972 | } | 1093 | } |
973 | 1094 | ||
974 | rm->m_daddr = conn->c_faddr; | 1095 | rm->m_daddr = conn->c_faddr; |
1096 | rm->data.op_active = 1; | ||
975 | 1097 | ||
976 | /* If the connection is down, trigger a connect. We may | 1098 | rds_conn_connect_if_down(conn); |
977 | * have scheduled a delayed reconnect however - in this case | ||
978 | * we should not interfere. | ||
979 | */ | ||
980 | if (rds_conn_state(conn) == RDS_CONN_DOWN && | ||
981 | !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) | ||
982 | queue_delayed_work(rds_wq, &conn->c_conn_w, 0); | ||
983 | 1099 | ||
984 | ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); | 1100 | ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); |
985 | if (ret) | 1101 | if (ret) |
@@ -999,7 +1115,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) | |||
999 | rds_stats_inc(s_send_queued); | 1115 | rds_stats_inc(s_send_queued); |
1000 | rds_stats_inc(s_send_pong); | 1116 | rds_stats_inc(s_send_pong); |
1001 | 1117 | ||
1002 | queue_delayed_work(rds_wq, &conn->c_send_w, 0); | 1118 | if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) |
1119 | rds_send_xmit(conn); | ||
1120 | |||
1003 | rds_message_put(rm); | 1121 | rds_message_put(rm); |
1004 | return 0; | 1122 | return 0; |
1005 | 1123 | ||
diff --git a/net/rds/stats.c b/net/rds/stats.c index 7598eb07cfb..10c759ccac0 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c | |||
@@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = { | |||
57 | "recv_ping", | 57 | "recv_ping", |
58 | "send_queue_empty", | 58 | "send_queue_empty", |
59 | "send_queue_full", | 59 | "send_queue_full", |
60 | "send_sem_contention", | 60 | "send_lock_contention", |
61 | "send_sem_queue_raced", | 61 | "send_lock_queue_raced", |
62 | "send_immediate_retry", | 62 | "send_immediate_retry", |
63 | "send_delayed_retry", | 63 | "send_delayed_retry", |
64 | "send_drop_acked", | 64 | "send_drop_acked", |
@@ -143,7 +143,7 @@ void rds_stats_exit(void) | |||
143 | rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); | 143 | rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); |
144 | } | 144 | } |
145 | 145 | ||
146 | int __init rds_stats_init(void) | 146 | int rds_stats_init(void) |
147 | { | 147 | { |
148 | rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); | 148 | rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); |
149 | return 0; | 149 | return 0; |
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index 7829a20325d..25ad0c77a26 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c | |||
@@ -105,13 +105,13 @@ void rds_sysctl_exit(void) | |||
105 | unregister_sysctl_table(rds_sysctl_reg_table); | 105 | unregister_sysctl_table(rds_sysctl_reg_table); |
106 | } | 106 | } |
107 | 107 | ||
108 | int __init rds_sysctl_init(void) | 108 | int rds_sysctl_init(void) |
109 | { | 109 | { |
110 | rds_sysctl_reconnect_min = msecs_to_jiffies(1); | 110 | rds_sysctl_reconnect_min = msecs_to_jiffies(1); |
111 | rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; | 111 | rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; |
112 | 112 | ||
113 | rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); | 113 | rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); |
114 | if (rds_sysctl_reg_table == NULL) | 114 | if (!rds_sysctl_reg_table) |
115 | return -ENOMEM; | 115 | return -ENOMEM; |
116 | return 0; | 116 | return 0; |
117 | } | 117 | } |
diff --git a/net/rds/tcp.c b/net/rds/tcp.c index babf4577ff7..eeb08e6ab96 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c | |||
@@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) | |||
200 | struct rds_tcp_connection *tc; | 200 | struct rds_tcp_connection *tc; |
201 | 201 | ||
202 | tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); | 202 | tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); |
203 | if (tc == NULL) | 203 | if (!tc) |
204 | return -ENOMEM; | 204 | return -ENOMEM; |
205 | 205 | ||
206 | tc->t_sock = NULL; | 206 | tc->t_sock = NULL; |
@@ -258,7 +258,6 @@ struct rds_transport rds_tcp_transport = { | |||
258 | .laddr_check = rds_tcp_laddr_check, | 258 | .laddr_check = rds_tcp_laddr_check, |
259 | .xmit_prepare = rds_tcp_xmit_prepare, | 259 | .xmit_prepare = rds_tcp_xmit_prepare, |
260 | .xmit_complete = rds_tcp_xmit_complete, | 260 | .xmit_complete = rds_tcp_xmit_complete, |
261 | .xmit_cong_map = rds_tcp_xmit_cong_map, | ||
262 | .xmit = rds_tcp_xmit, | 261 | .xmit = rds_tcp_xmit, |
263 | .recv = rds_tcp_recv, | 262 | .recv = rds_tcp_recv, |
264 | .conn_alloc = rds_tcp_conn_alloc, | 263 | .conn_alloc = rds_tcp_conn_alloc, |
@@ -266,7 +265,6 @@ struct rds_transport rds_tcp_transport = { | |||
266 | .conn_connect = rds_tcp_conn_connect, | 265 | .conn_connect = rds_tcp_conn_connect, |
267 | .conn_shutdown = rds_tcp_conn_shutdown, | 266 | .conn_shutdown = rds_tcp_conn_shutdown, |
268 | .inc_copy_to_user = rds_tcp_inc_copy_to_user, | 267 | .inc_copy_to_user = rds_tcp_inc_copy_to_user, |
269 | .inc_purge = rds_tcp_inc_purge, | ||
270 | .inc_free = rds_tcp_inc_free, | 268 | .inc_free = rds_tcp_inc_free, |
271 | .stats_info_copy = rds_tcp_stats_info_copy, | 269 | .stats_info_copy = rds_tcp_stats_info_copy, |
272 | .exit = rds_tcp_exit, | 270 | .exit = rds_tcp_exit, |
@@ -276,14 +274,14 @@ struct rds_transport rds_tcp_transport = { | |||
276 | .t_prefer_loopback = 1, | 274 | .t_prefer_loopback = 1, |
277 | }; | 275 | }; |
278 | 276 | ||
279 | int __init rds_tcp_init(void) | 277 | int rds_tcp_init(void) |
280 | { | 278 | { |
281 | int ret; | 279 | int ret; |
282 | 280 | ||
283 | rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", | 281 | rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", |
284 | sizeof(struct rds_tcp_connection), | 282 | sizeof(struct rds_tcp_connection), |
285 | 0, 0, NULL); | 283 | 0, 0, NULL); |
286 | if (rds_tcp_conn_slab == NULL) { | 284 | if (!rds_tcp_conn_slab) { |
287 | ret = -ENOMEM; | 285 | ret = -ENOMEM; |
288 | goto out; | 286 | goto out; |
289 | } | 287 | } |
diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 844fa6b9cf5..f5e6f7bebb5 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h | |||
@@ -43,7 +43,7 @@ struct rds_tcp_statistics { | |||
43 | }; | 43 | }; |
44 | 44 | ||
45 | /* tcp.c */ | 45 | /* tcp.c */ |
46 | int __init rds_tcp_init(void); | 46 | int rds_tcp_init(void); |
47 | void rds_tcp_exit(void); | 47 | void rds_tcp_exit(void); |
48 | void rds_tcp_tune(struct socket *sock); | 48 | void rds_tcp_tune(struct socket *sock); |
49 | void rds_tcp_nonagle(struct socket *sock); | 49 | void rds_tcp_nonagle(struct socket *sock); |
@@ -61,16 +61,15 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn); | |||
61 | void rds_tcp_state_change(struct sock *sk); | 61 | void rds_tcp_state_change(struct sock *sk); |
62 | 62 | ||
63 | /* tcp_listen.c */ | 63 | /* tcp_listen.c */ |
64 | int __init rds_tcp_listen_init(void); | 64 | int rds_tcp_listen_init(void); |
65 | void rds_tcp_listen_stop(void); | 65 | void rds_tcp_listen_stop(void); |
66 | void rds_tcp_listen_data_ready(struct sock *sk, int bytes); | 66 | void rds_tcp_listen_data_ready(struct sock *sk, int bytes); |
67 | 67 | ||
68 | /* tcp_recv.c */ | 68 | /* tcp_recv.c */ |
69 | int __init rds_tcp_recv_init(void); | 69 | int rds_tcp_recv_init(void); |
70 | void rds_tcp_recv_exit(void); | 70 | void rds_tcp_recv_exit(void); |
71 | void rds_tcp_data_ready(struct sock *sk, int bytes); | 71 | void rds_tcp_data_ready(struct sock *sk, int bytes); |
72 | int rds_tcp_recv(struct rds_connection *conn); | 72 | int rds_tcp_recv(struct rds_connection *conn); |
73 | void rds_tcp_inc_purge(struct rds_incoming *inc); | ||
74 | void rds_tcp_inc_free(struct rds_incoming *inc); | 73 | void rds_tcp_inc_free(struct rds_incoming *inc); |
75 | int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, | 74 | int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, |
76 | size_t size); | 75 | size_t size); |
@@ -81,8 +80,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn); | |||
81 | int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, | 80 | int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, |
82 | unsigned int hdr_off, unsigned int sg, unsigned int off); | 81 | unsigned int hdr_off, unsigned int sg, unsigned int off); |
83 | void rds_tcp_write_space(struct sock *sk); | 82 | void rds_tcp_write_space(struct sock *sk); |
84 | int rds_tcp_xmit_cong_map(struct rds_connection *conn, | ||
85 | struct rds_cong_map *map, unsigned long offset); | ||
86 | 83 | ||
87 | /* tcp_stats.c */ | 84 | /* tcp_stats.c */ |
88 | DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); | 85 | DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); |
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index c519939e8da..af95c8e058f 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c | |||
@@ -45,7 +45,7 @@ void rds_tcp_state_change(struct sock *sk) | |||
45 | 45 | ||
46 | read_lock_bh(&sk->sk_callback_lock); | 46 | read_lock_bh(&sk->sk_callback_lock); |
47 | conn = sk->sk_user_data; | 47 | conn = sk->sk_user_data; |
48 | if (conn == NULL) { | 48 | if (!conn) { |
49 | state_change = sk->sk_state_change; | 49 | state_change = sk->sk_state_change; |
50 | goto out; | 50 | goto out; |
51 | } | 51 | } |
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 27844f231d1..8b5cc4aa886 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c | |||
@@ -116,7 +116,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) | |||
116 | 116 | ||
117 | read_lock_bh(&sk->sk_callback_lock); | 117 | read_lock_bh(&sk->sk_callback_lock); |
118 | ready = sk->sk_user_data; | 118 | ready = sk->sk_user_data; |
119 | if (ready == NULL) { /* check for teardown race */ | 119 | if (!ready) { /* check for teardown race */ |
120 | ready = sk->sk_data_ready; | 120 | ready = sk->sk_data_ready; |
121 | goto out; | 121 | goto out; |
122 | } | 122 | } |
@@ -135,7 +135,7 @@ out: | |||
135 | ready(sk, bytes); | 135 | ready(sk, bytes); |
136 | } | 136 | } |
137 | 137 | ||
138 | int __init rds_tcp_listen_init(void) | 138 | int rds_tcp_listen_init(void) |
139 | { | 139 | { |
140 | struct sockaddr_in sin; | 140 | struct sockaddr_in sin; |
141 | struct socket *sock = NULL; | 141 | struct socket *sock = NULL; |
@@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void) | |||
178 | struct socket *sock = rds_tcp_listen_sock; | 178 | struct socket *sock = rds_tcp_listen_sock; |
179 | struct sock *sk; | 179 | struct sock *sk; |
180 | 180 | ||
181 | if (sock == NULL) | 181 | if (!sock) |
182 | return; | 182 | return; |
183 | 183 | ||
184 | sk = sock->sk; | 184 | sk = sock->sk; |
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index e4379740410..67263fbee62 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c | |||
@@ -39,7 +39,7 @@ | |||
39 | 39 | ||
40 | static struct kmem_cache *rds_tcp_incoming_slab; | 40 | static struct kmem_cache *rds_tcp_incoming_slab; |
41 | 41 | ||
42 | void rds_tcp_inc_purge(struct rds_incoming *inc) | 42 | static void rds_tcp_inc_purge(struct rds_incoming *inc) |
43 | { | 43 | { |
44 | struct rds_tcp_incoming *tinc; | 44 | struct rds_tcp_incoming *tinc; |
45 | tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); | 45 | tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); |
@@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, | |||
190 | * processing. | 190 | * processing. |
191 | */ | 191 | */ |
192 | while (left) { | 192 | while (left) { |
193 | if (tinc == NULL) { | 193 | if (!tinc) { |
194 | tinc = kmem_cache_alloc(rds_tcp_incoming_slab, | 194 | tinc = kmem_cache_alloc(rds_tcp_incoming_slab, |
195 | arg->gfp); | 195 | arg->gfp); |
196 | if (tinc == NULL) { | 196 | if (!tinc) { |
197 | desc->error = -ENOMEM; | 197 | desc->error = -ENOMEM; |
198 | goto out; | 198 | goto out; |
199 | } | 199 | } |
@@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, | |||
229 | 229 | ||
230 | if (left && tc->t_tinc_data_rem) { | 230 | if (left && tc->t_tinc_data_rem) { |
231 | clone = skb_clone(skb, arg->gfp); | 231 | clone = skb_clone(skb, arg->gfp); |
232 | if (clone == NULL) { | 232 | if (!clone) { |
233 | desc->error = -ENOMEM; | 233 | desc->error = -ENOMEM; |
234 | goto out; | 234 | goto out; |
235 | } | 235 | } |
@@ -326,7 +326,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) | |||
326 | 326 | ||
327 | read_lock_bh(&sk->sk_callback_lock); | 327 | read_lock_bh(&sk->sk_callback_lock); |
328 | conn = sk->sk_user_data; | 328 | conn = sk->sk_user_data; |
329 | if (conn == NULL) { /* check for teardown race */ | 329 | if (!conn) { /* check for teardown race */ |
330 | ready = sk->sk_data_ready; | 330 | ready = sk->sk_data_ready; |
331 | goto out; | 331 | goto out; |
332 | } | 332 | } |
@@ -342,12 +342,12 @@ out: | |||
342 | ready(sk, bytes); | 342 | ready(sk, bytes); |
343 | } | 343 | } |
344 | 344 | ||
345 | int __init rds_tcp_recv_init(void) | 345 | int rds_tcp_recv_init(void) |
346 | { | 346 | { |
347 | rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", | 347 | rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", |
348 | sizeof(struct rds_tcp_incoming), | 348 | sizeof(struct rds_tcp_incoming), |
349 | 0, 0, NULL); | 349 | 0, 0, NULL); |
350 | if (rds_tcp_incoming_slab == NULL) | 350 | if (!rds_tcp_incoming_slab) |
351 | return -ENOMEM; | 351 | return -ENOMEM; |
352 | return 0; | 352 | return 0; |
353 | } | 353 | } |
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 2f012a07d94..aa16841afbd 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c | |||
@@ -77,56 +77,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) | |||
77 | } | 77 | } |
78 | 78 | ||
79 | /* the core send_sem serializes this with other xmit and shutdown */ | 79 | /* the core send_sem serializes this with other xmit and shutdown */ |
80 | int rds_tcp_xmit_cong_map(struct rds_connection *conn, | ||
81 | struct rds_cong_map *map, unsigned long offset) | ||
82 | { | ||
83 | static struct rds_header rds_tcp_map_header = { | ||
84 | .h_flags = RDS_FLAG_CONG_BITMAP, | ||
85 | }; | ||
86 | struct rds_tcp_connection *tc = conn->c_transport_data; | ||
87 | unsigned long i; | ||
88 | int ret; | ||
89 | int copied = 0; | ||
90 | |||
91 | /* Some problem claims cpu_to_be32(constant) isn't a constant. */ | ||
92 | rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES); | ||
93 | |||
94 | if (offset < sizeof(struct rds_header)) { | ||
95 | ret = rds_tcp_sendmsg(tc->t_sock, | ||
96 | (void *)&rds_tcp_map_header + offset, | ||
97 | sizeof(struct rds_header) - offset); | ||
98 | if (ret <= 0) | ||
99 | return ret; | ||
100 | offset += ret; | ||
101 | copied = ret; | ||
102 | if (offset < sizeof(struct rds_header)) | ||
103 | return ret; | ||
104 | } | ||
105 | |||
106 | offset -= sizeof(struct rds_header); | ||
107 | i = offset / PAGE_SIZE; | ||
108 | offset = offset % PAGE_SIZE; | ||
109 | BUG_ON(i >= RDS_CONG_MAP_PAGES); | ||
110 | |||
111 | do { | ||
112 | ret = tc->t_sock->ops->sendpage(tc->t_sock, | ||
113 | virt_to_page(map->m_page_addrs[i]), | ||
114 | offset, PAGE_SIZE - offset, | ||
115 | MSG_DONTWAIT); | ||
116 | if (ret <= 0) | ||
117 | break; | ||
118 | copied += ret; | ||
119 | offset += ret; | ||
120 | if (offset == PAGE_SIZE) { | ||
121 | offset = 0; | ||
122 | i++; | ||
123 | } | ||
124 | } while (i < RDS_CONG_MAP_PAGES); | ||
125 | |||
126 | return copied ? copied : ret; | ||
127 | } | ||
128 | |||
129 | /* the core send_sem serializes this with other xmit and shutdown */ | ||
130 | int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, | 80 | int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, |
131 | unsigned int hdr_off, unsigned int sg, unsigned int off) | 81 | unsigned int hdr_off, unsigned int sg, unsigned int off) |
132 | { | 82 | { |
@@ -166,21 +116,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, | |||
166 | goto out; | 116 | goto out; |
167 | } | 117 | } |
168 | 118 | ||
169 | while (sg < rm->m_nents) { | 119 | while (sg < rm->data.op_nents) { |
170 | ret = tc->t_sock->ops->sendpage(tc->t_sock, | 120 | ret = tc->t_sock->ops->sendpage(tc->t_sock, |
171 | sg_page(&rm->m_sg[sg]), | 121 | sg_page(&rm->data.op_sg[sg]), |
172 | rm->m_sg[sg].offset + off, | 122 | rm->data.op_sg[sg].offset + off, |
173 | rm->m_sg[sg].length - off, | 123 | rm->data.op_sg[sg].length - off, |
174 | MSG_DONTWAIT|MSG_NOSIGNAL); | 124 | MSG_DONTWAIT|MSG_NOSIGNAL); |
175 | rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]), | 125 | rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), |
176 | rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off, | 126 | rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, |
177 | ret); | 127 | ret); |
178 | if (ret <= 0) | 128 | if (ret <= 0) |
179 | break; | 129 | break; |
180 | 130 | ||
181 | off += ret; | 131 | off += ret; |
182 | done += ret; | 132 | done += ret; |
183 | if (off == rm->m_sg[sg].length) { | 133 | if (off == rm->data.op_sg[sg].length) { |
184 | off = 0; | 134 | off = 0; |
185 | sg++; | 135 | sg++; |
186 | } | 136 | } |
@@ -226,7 +176,7 @@ void rds_tcp_write_space(struct sock *sk) | |||
226 | 176 | ||
227 | read_lock_bh(&sk->sk_callback_lock); | 177 | read_lock_bh(&sk->sk_callback_lock); |
228 | conn = sk->sk_user_data; | 178 | conn = sk->sk_user_data; |
229 | if (conn == NULL) { | 179 | if (!conn) { |
230 | write_space = sk->sk_write_space; | 180 | write_space = sk->sk_write_space; |
231 | goto out; | 181 | goto out; |
232 | } | 182 | } |
diff --git a/net/rds/threads.c b/net/rds/threads.c index 786c20eaaf5..0fd90f8c5f5 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c | |||
@@ -61,7 +61,7 @@ | |||
61 | * | 61 | * |
62 | * Transition to state DISCONNECTING/DOWN: | 62 | * Transition to state DISCONNECTING/DOWN: |
63 | * - Inside the shutdown worker; synchronizes with xmit path | 63 | * - Inside the shutdown worker; synchronizes with xmit path |
64 | * through c_send_lock, and with connection management callbacks | 64 | * through RDS_IN_XMIT, and with connection management callbacks |
65 | * via c_cm_lock. | 65 | * via c_cm_lock. |
66 | * | 66 | * |
67 | * For receive callbacks, we rely on the underlying transport | 67 | * For receive callbacks, we rely on the underlying transport |
@@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete); | |||
110 | * We should *always* start with a random backoff; otherwise a broken connection | 110 | * We should *always* start with a random backoff; otherwise a broken connection |
111 | * will always take several iterations to be re-established. | 111 | * will always take several iterations to be re-established. |
112 | */ | 112 | */ |
113 | static void rds_queue_reconnect(struct rds_connection *conn) | 113 | void rds_queue_reconnect(struct rds_connection *conn) |
114 | { | 114 | { |
115 | unsigned long rand; | 115 | unsigned long rand; |
116 | 116 | ||
@@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work) | |||
156 | } | 156 | } |
157 | } | 157 | } |
158 | 158 | ||
159 | void rds_shutdown_worker(struct work_struct *work) | ||
160 | { | ||
161 | struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); | ||
162 | |||
163 | /* shut it down unless it's down already */ | ||
164 | if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { | ||
165 | /* | ||
166 | * Quiesce the connection mgmt handlers before we start tearing | ||
167 | * things down. We don't hold the mutex for the entire | ||
168 | * duration of the shutdown operation, else we may be | ||
169 | * deadlocking with the CM handler. Instead, the CM event | ||
170 | * handler is supposed to check for state DISCONNECTING | ||
171 | */ | ||
172 | mutex_lock(&conn->c_cm_lock); | ||
173 | if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) && | ||
174 | !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { | ||
175 | rds_conn_error(conn, "shutdown called in state %d\n", | ||
176 | atomic_read(&conn->c_state)); | ||
177 | mutex_unlock(&conn->c_cm_lock); | ||
178 | return; | ||
179 | } | ||
180 | mutex_unlock(&conn->c_cm_lock); | ||
181 | |||
182 | mutex_lock(&conn->c_send_lock); | ||
183 | conn->c_trans->conn_shutdown(conn); | ||
184 | rds_conn_reset(conn); | ||
185 | mutex_unlock(&conn->c_send_lock); | ||
186 | |||
187 | if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { | ||
188 | /* This can happen - eg when we're in the middle of tearing | ||
189 | * down the connection, and someone unloads the rds module. | ||
190 | * Quite reproduceable with loopback connections. | ||
191 | * Mostly harmless. | ||
192 | */ | ||
193 | rds_conn_error(conn, | ||
194 | "%s: failed to transition to state DOWN, " | ||
195 | "current state is %d\n", | ||
196 | __func__, | ||
197 | atomic_read(&conn->c_state)); | ||
198 | return; | ||
199 | } | ||
200 | } | ||
201 | |||
202 | /* Then reconnect if it's still live. | ||
203 | * The passive side of an IB loopback connection is never added | ||
204 | * to the conn hash, so we never trigger a reconnect on this | ||
205 | * conn - the reconnect is always triggered by the active peer. */ | ||
206 | cancel_delayed_work(&conn->c_conn_w); | ||
207 | if (!hlist_unhashed(&conn->c_hash_node)) | ||
208 | rds_queue_reconnect(conn); | ||
209 | } | ||
210 | |||
211 | void rds_send_worker(struct work_struct *work) | 159 | void rds_send_worker(struct work_struct *work) |
212 | { | 160 | { |
213 | struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); | 161 | struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); |
@@ -252,15 +200,22 @@ void rds_recv_worker(struct work_struct *work) | |||
252 | } | 200 | } |
253 | } | 201 | } |
254 | 202 | ||
203 | void rds_shutdown_worker(struct work_struct *work) | ||
204 | { | ||
205 | struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); | ||
206 | |||
207 | rds_conn_shutdown(conn); | ||
208 | } | ||
209 | |||
255 | void rds_threads_exit(void) | 210 | void rds_threads_exit(void) |
256 | { | 211 | { |
257 | destroy_workqueue(rds_wq); | 212 | destroy_workqueue(rds_wq); |
258 | } | 213 | } |
259 | 214 | ||
260 | int __init rds_threads_init(void) | 215 | int rds_threads_init(void) |
261 | { | 216 | { |
262 | rds_wq = create_workqueue("krdsd"); | 217 | rds_wq = create_singlethread_workqueue("krdsd"); |
263 | if (rds_wq == NULL) | 218 | if (!rds_wq) |
264 | return -ENOMEM; | 219 | return -ENOMEM; |
265 | 220 | ||
266 | return 0; | 221 | return 0; |
diff --git a/net/rds/transport.c b/net/rds/transport.c index 7e106790135..7f2ac4fec36 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c | |||
@@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans) | |||
71 | } | 71 | } |
72 | EXPORT_SYMBOL_GPL(rds_trans_unregister); | 72 | EXPORT_SYMBOL_GPL(rds_trans_unregister); |
73 | 73 | ||
74 | void rds_trans_put(struct rds_transport *trans) | ||
75 | { | ||
76 | if (trans && trans->t_owner) | ||
77 | module_put(trans->t_owner); | ||
78 | } | ||
79 | |||
74 | struct rds_transport *rds_trans_get_preferred(__be32 addr) | 80 | struct rds_transport *rds_trans_get_preferred(__be32 addr) |
75 | { | 81 | { |
76 | struct rds_transport *ret = NULL; | 82 | struct rds_transport *ret = NULL; |
77 | int i; | 83 | struct rds_transport *trans; |
84 | unsigned int i; | ||
78 | 85 | ||
79 | if (IN_LOOPBACK(ntohl(addr))) | 86 | if (IN_LOOPBACK(ntohl(addr))) |
80 | return &rds_loop_transport; | 87 | return &rds_loop_transport; |
81 | 88 | ||
82 | down_read(&rds_trans_sem); | 89 | down_read(&rds_trans_sem); |
83 | for (i = 0; i < RDS_TRANS_COUNT; i++) | 90 | for (i = 0; i < RDS_TRANS_COUNT; i++) { |
84 | { | 91 | trans = transports[i]; |
85 | if (transports[i] && (transports[i]->laddr_check(addr) == 0)) { | 92 | |
86 | ret = transports[i]; | 93 | if (trans && (trans->laddr_check(addr) == 0) && |
94 | (!trans->t_owner || try_module_get(trans->t_owner))) { | ||
95 | ret = trans; | ||
87 | break; | 96 | break; |
88 | } | 97 | } |
89 | } | 98 | } |
diff --git a/net/rds/xlist.h b/net/rds/xlist.h new file mode 100644 index 00000000000..e6b5190dadd --- /dev/null +++ b/net/rds/xlist.h | |||
@@ -0,0 +1,80 @@ | |||
1 | #ifndef _LINUX_XLIST_H | ||
2 | #define _LINUX_XLIST_H | ||
3 | |||
4 | #include <linux/stddef.h> | ||
5 | #include <linux/poison.h> | ||
6 | #include <linux/prefetch.h> | ||
7 | #include <asm/system.h> | ||
8 | |||
9 | struct xlist_head { | ||
10 | struct xlist_head *next; | ||
11 | }; | ||
12 | |||
13 | static inline void INIT_XLIST_HEAD(struct xlist_head *list) | ||
14 | { | ||
15 | list->next = NULL; | ||
16 | } | ||
17 | |||
18 | static inline int xlist_empty(struct xlist_head *head) | ||
19 | { | ||
20 | return head->next == NULL; | ||
21 | } | ||
22 | |||
23 | static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail, | ||
24 | struct xlist_head *head) | ||
25 | { | ||
26 | struct xlist_head *cur; | ||
27 | struct xlist_head *check; | ||
28 | |||
29 | while (1) { | ||
30 | cur = head->next; | ||
31 | tail->next = cur; | ||
32 | check = cmpxchg(&head->next, cur, new); | ||
33 | if (check == cur) | ||
34 | break; | ||
35 | } | ||
36 | } | ||
37 | |||
38 | static inline struct xlist_head *xlist_del_head(struct xlist_head *head) | ||
39 | { | ||
40 | struct xlist_head *cur; | ||
41 | struct xlist_head *check; | ||
42 | struct xlist_head *next; | ||
43 | |||
44 | while (1) { | ||
45 | cur = head->next; | ||
46 | if (!cur) | ||
47 | goto out; | ||
48 | |||
49 | next = cur->next; | ||
50 | check = cmpxchg(&head->next, cur, next); | ||
51 | if (check == cur) | ||
52 | goto out; | ||
53 | } | ||
54 | out: | ||
55 | return cur; | ||
56 | } | ||
57 | |||
58 | static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head) | ||
59 | { | ||
60 | struct xlist_head *cur; | ||
61 | |||
62 | cur = head->next; | ||
63 | if (!cur) | ||
64 | return NULL; | ||
65 | |||
66 | head->next = cur->next; | ||
67 | return cur; | ||
68 | } | ||
69 | |||
70 | static inline void xlist_splice(struct xlist_head *list, | ||
71 | struct xlist_head *head) | ||
72 | { | ||
73 | struct xlist_head *cur; | ||
74 | |||
75 | WARN_ON(head->next); | ||
76 | cur = xchg(&list->next, NULL); | ||
77 | head->next = cur; | ||
78 | } | ||
79 | |||
80 | #endif | ||