Diffstat (limited to 'net/rds')
-rw-r--r--  net/rds/Makefile          |   8
-rw-r--r--  net/rds/af_rds.c          |  26
-rw-r--r--  net/rds/bind.c            |  82
-rw-r--r--  net/rds/cong.c            |  17
-rw-r--r--  net/rds/connection.c      | 159
-rw-r--r--  net/rds/ib.c              | 193
-rw-r--r--  net/rds/ib.h              | 102
-rw-r--r--  net/rds/ib_cm.c           | 186
-rw-r--r--  net/rds/ib_rdma.c         | 297
-rw-r--r--  net/rds/ib_recv.c         | 549
-rw-r--r--  net/rds/ib_send.c         | 689
-rw-r--r--  net/rds/ib_stats.c        |   2
-rw-r--r--  net/rds/ib_sysctl.c       |  19
-rw-r--r--  net/rds/info.c            |  12
-rw-r--r--  net/rds/iw.c              |  10
-rw-r--r--  net/rds/iw.h              |  15
-rw-r--r--  net/rds/iw_cm.c           |  18
-rw-r--r--  net/rds/iw_rdma.c         |  10
-rw-r--r--  net/rds/iw_recv.c         |  24
-rw-r--r--  net/rds/iw_send.c         |  95
-rw-r--r--  net/rds/iw_sysctl.c       |   6
-rw-r--r--  net/rds/loop.c            |  44
-rw-r--r--  net/rds/message.c         | 149
-rw-r--r--  net/rds/page.c            |   8
-rw-r--r--  net/rds/rdma.c            | 423
-rw-r--r--  net/rds/rdma.h            |  85
-rw-r--r--  net/rds/rdma_transport.c  |  47
-rw-r--r--  net/rds/rdma_transport.h  |   4
-rw-r--r--  net/rds/rds.h             | 193
-rw-r--r--  net/rds/recv.c            |  12
-rw-r--r--  net/rds/send.c            | 554
-rw-r--r--  net/rds/stats.c           |   6
-rw-r--r--  net/rds/sysctl.c          |   4
-rw-r--r--  net/rds/tcp.c             |  18
-rw-r--r--  net/rds/tcp.h             |   9
-rw-r--r--  net/rds/tcp_connect.c     |   2
-rw-r--r--  net/rds/tcp_listen.c      |   6
-rw-r--r--  net/rds/tcp_recv.c        |  17
-rw-r--r--  net/rds/tcp_send.c        |  68
-rw-r--r--  net/rds/threads.c         |  69
-rw-r--r--  net/rds/transport.c       |  19
-rw-r--r--  net/rds/xlist.h           |  80
42 files changed, 2657 insertions, 1679 deletions
diff --git a/net/rds/Makefile b/net/rds/Makefile
index b46eca109688..56d3f6023ced 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -4,7 +4,7 @@ rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
4 loop.o page.o rdma.o 4 loop.o page.o rdma.o
5 5
6obj-$(CONFIG_RDS_RDMA) += rds_rdma.o 6obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
7rds_rdma-objs := rdma_transport.o \ 7rds_rdma-y := rdma_transport.o \
8 ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ 8 ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
9 ib_sysctl.o ib_rdma.o \ 9 ib_sysctl.o ib_rdma.o \
10 iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \ 10 iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
@@ -12,10 +12,8 @@ rds_rdma-objs := rdma_transport.o \
12 12
13 13
14obj-$(CONFIG_RDS_TCP) += rds_tcp.o 14obj-$(CONFIG_RDS_TCP) += rds_tcp.o
15rds_tcp-objs := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \ 15rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
16 tcp_send.o tcp_stats.o 16 tcp_send.o tcp_stats.o
17 17
18ifeq ($(CONFIG_RDS_DEBUG), y) 18ccflags-$(CONFIG_RDS_DEBUG) := -DDEBUG
19EXTRA_CFLAGS += -DDEBUG
20endif
21 19
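
The Makefile hunk replaces the ifeq/EXTRA_CFLAGS block with the single-line ccflags-$(CONFIG_RDS_DEBUG) := -DDEBUG kbuild idiom; either way the RDS objects only see -DDEBUG when debugging is configured. As a rough userspace illustration of what that define buys (using a made-up dbg() macro, not the kernel's own debug helper), the following compiles its debug output away unless built with -DDEBUG:

#include <stdio.h>

/* Compiles to a no-op unless the translation unit is built with -DDEBUG,
 * e.g. "cc -DDEBUG demo.c" versus plain "cc demo.c". */
#ifdef DEBUG
#define dbg(fmt, ...) fprintf(stderr, "debug: " fmt, ##__VA_ARGS__)
#else
#define dbg(fmt, ...) do { } while (0)
#endif

int main(void)
{
	dbg("built with -DDEBUG, so this line is printed\n");
	printf("always printed\n");
	return 0;
}
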
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index aebfecbdb841..bb6ad81b671d 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -39,7 +39,15 @@
39#include <net/sock.h> 39#include <net/sock.h>
40 40
41#include "rds.h" 41#include "rds.h"
42#include "rdma.h" 42
43char *rds_str_array(char **array, size_t elements, size_t index)
44{
45 if ((index < elements) && array[index])
46 return array[index];
47 else
48 return "unknown";
49}
50EXPORT_SYMBOL(rds_str_array);
43 51
44/* this is just used for stats gathering :/ */ 52/* this is just used for stats gathering :/ */
45static DEFINE_SPINLOCK(rds_sock_lock); 53static DEFINE_SPINLOCK(rds_sock_lock);
@@ -62,7 +70,7 @@ static int rds_release(struct socket *sock)
62 struct rds_sock *rs; 70 struct rds_sock *rs;
63 unsigned long flags; 71 unsigned long flags;
64 72
65 if (sk == NULL) 73 if (!sk)
66 goto out; 74 goto out;
67 75
68 rs = rds_sk_to_rs(sk); 76 rs = rds_sk_to_rs(sk);
@@ -73,7 +81,15 @@ static int rds_release(struct socket *sock)
73 * with the socket. */ 81 * with the socket. */
74 rds_clear_recv_queue(rs); 82 rds_clear_recv_queue(rs);
75 rds_cong_remove_socket(rs); 83 rds_cong_remove_socket(rs);
84
85 /*
86 * the binding lookup hash uses rcu, we need to
87 * make sure we sychronize_rcu before we free our
88 * entry
89 */
76 rds_remove_bound(rs); 90 rds_remove_bound(rs);
91 synchronize_rcu();
92
77 rds_send_drop_to(rs, NULL); 93 rds_send_drop_to(rs, NULL);
78 rds_rdma_drop_keys(rs); 94 rds_rdma_drop_keys(rs);
79 rds_notify_queue_get(rs, NULL); 95 rds_notify_queue_get(rs, NULL);
@@ -83,6 +99,8 @@ static int rds_release(struct socket *sock)
83 rds_sock_count--; 99 rds_sock_count--;
84 spin_unlock_irqrestore(&rds_sock_lock, flags); 100 spin_unlock_irqrestore(&rds_sock_lock, flags);
85 101
102 rds_trans_put(rs->rs_transport);
103
86 sock->sk = NULL; 104 sock->sk = NULL;
87 sock_put(sk); 105 sock_put(sk);
88out: 106out:
@@ -514,7 +532,7 @@ out:
514 spin_unlock_irqrestore(&rds_sock_lock, flags); 532 spin_unlock_irqrestore(&rds_sock_lock, flags);
515} 533}
516 534
517static void __exit rds_exit(void) 535static void rds_exit(void)
518{ 536{
519 sock_unregister(rds_family_ops.family); 537 sock_unregister(rds_family_ops.family);
520 proto_unregister(&rds_proto); 538 proto_unregister(&rds_proto);
@@ -529,7 +547,7 @@ static void __exit rds_exit(void)
529} 547}
530module_exit(rds_exit); 548module_exit(rds_exit);
531 549
532static int __init rds_init(void) 550static int rds_init(void)
533{ 551{
534 int ret; 552 int ret;
535 553
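
The af_rds.c hunk above adds rds_str_array(), a bounds-checked lookup used later in this patch (the IB event names in ib_cm.c) to turn an enum value into a printable string without risking an out-of-range index or a NULL dereference. A small userspace sketch of the same pattern, with invented state names rather than any real RDS table:

#include <stdio.h>
#include <stddef.h>

/* Same idea as rds_str_array(): return a name if the index is in range
 * and the slot is populated, otherwise fall back to "unknown". */
static const char *str_array(const char * const *array, size_t elements,
			     size_t index)
{
	if (index < elements && array[index])
		return array[index];
	return "unknown";
}

int main(void)
{
	/* Hypothetical sparse table; slot 2 is intentionally left NULL. */
	static const char * const states[] = {
		[0] = "DOWN",
		[1] = "CONNECTING",
		[3] = "UP",
	};

	printf("%s\n", str_array(states, 4, 1));   /* CONNECTING */
	printf("%s\n", str_array(states, 4, 2));   /* unknown (NULL slot) */
	printf("%s\n", str_array(states, 4, 42));  /* unknown (out of range) */
	return 0;
}
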
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5d95fc007f1a..2f6b3fcc79f8 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -34,45 +34,52 @@
34#include <net/sock.h> 34#include <net/sock.h>
35#include <linux/in.h> 35#include <linux/in.h>
36#include <linux/if_arp.h> 36#include <linux/if_arp.h>
37#include <linux/jhash.h>
37#include "rds.h" 38#include "rds.h"
38 39
39/* 40#define BIND_HASH_SIZE 1024
40 * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't 41static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
41 * particularly zippy.
42 *
43 * This is now called for every incoming frame so we arguably care much more
44 * about it than we used to.
45 */
46static DEFINE_SPINLOCK(rds_bind_lock); 42static DEFINE_SPINLOCK(rds_bind_lock);
47static struct rb_root rds_bind_tree = RB_ROOT;
48 43
49static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, 44static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
50 struct rds_sock *insert) 45{
46 return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
47 (BIND_HASH_SIZE - 1));
48}
49
50static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
51 struct rds_sock *insert)
51{ 52{
52 struct rb_node **p = &rds_bind_tree.rb_node;
53 struct rb_node *parent = NULL;
54 struct rds_sock *rs; 53 struct rds_sock *rs;
54 struct hlist_node *node;
55 struct hlist_head *head = hash_to_bucket(addr, port);
55 u64 cmp; 56 u64 cmp;
56 u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); 57 u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
57 58
58 while (*p) { 59 rcu_read_lock();
59 parent = *p; 60 hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) {
60 rs = rb_entry(parent, struct rds_sock, rs_bound_node);
61
62 cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | 61 cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
63 be16_to_cpu(rs->rs_bound_port); 62 be16_to_cpu(rs->rs_bound_port);
64 63
65 if (needle < cmp) 64 if (cmp == needle) {
66 p = &(*p)->rb_left; 65 rcu_read_unlock();
67 else if (needle > cmp)
68 p = &(*p)->rb_right;
69 else
70 return rs; 66 return rs;
67 }
71 } 68 }
69 rcu_read_unlock();
72 70
73 if (insert) { 71 if (insert) {
74 rb_link_node(&insert->rs_bound_node, parent, p); 72 /*
75 rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); 73 * make sure our addr and port are set before
74 * we are added to the list, other people
75 * in rcu will find us as soon as the
76 * hlist_add_head_rcu is done
77 */
78 insert->rs_bound_addr = addr;
79 insert->rs_bound_port = port;
80 rds_sock_addref(insert);
81
82 hlist_add_head_rcu(&insert->rs_bound_node, head);
76 } 83 }
77 return NULL; 84 return NULL;
78} 85}
@@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
86struct rds_sock *rds_find_bound(__be32 addr, __be16 port) 93struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
87{ 94{
88 struct rds_sock *rs; 95 struct rds_sock *rs;
89 unsigned long flags;
90 96
91 spin_lock_irqsave(&rds_bind_lock, flags); 97 rs = rds_bind_lookup(addr, port, NULL);
92 rs = rds_bind_tree_walk(addr, port, NULL); 98
93 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) 99 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
94 rds_sock_addref(rs); 100 rds_sock_addref(rs);
95 else 101 else
96 rs = NULL; 102 rs = NULL;
97 spin_unlock_irqrestore(&rds_bind_lock, flags);
98 103
99 rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, 104 rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
100 ntohs(port)); 105 ntohs(port));
@@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
121 do { 126 do {
122 if (rover == 0) 127 if (rover == 0)
123 rover++; 128 rover++;
124 if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) { 129 if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
125 *port = cpu_to_be16(rover); 130 *port = rs->rs_bound_port;
126 ret = 0; 131 ret = 0;
132 rdsdebug("rs %p binding to %pI4:%d\n",
133 rs, &addr, (int)ntohs(*port));
127 break; 134 break;
128 } 135 }
129 } while (rover++ != last); 136 } while (rover++ != last);
130 137
131 if (ret == 0) {
132 rs->rs_bound_addr = addr;
133 rs->rs_bound_port = *port;
134 rds_sock_addref(rs);
135
136 rdsdebug("rs %p binding to %pI4:%d\n",
137 rs, &addr, (int)ntohs(*port));
138 }
139
140 spin_unlock_irqrestore(&rds_bind_lock, flags); 138 spin_unlock_irqrestore(&rds_bind_lock, flags);
141 139
142 return ret; 140 return ret;
@@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs)
153 rs, &rs->rs_bound_addr, 151 rs, &rs->rs_bound_addr,
154 ntohs(rs->rs_bound_port)); 152 ntohs(rs->rs_bound_port));
155 153
156 rb_erase(&rs->rs_bound_node, &rds_bind_tree); 154 hlist_del_init_rcu(&rs->rs_bound_node);
157 rds_sock_put(rs); 155 rds_sock_put(rs);
158 rs->rs_bound_addr = 0; 156 rs->rs_bound_addr = 0;
159 } 157 }
@@ -184,7 +182,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
184 goto out; 182 goto out;
185 183
186 trans = rds_trans_get_preferred(sin->sin_addr.s_addr); 184 trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
187 if (trans == NULL) { 185 if (!trans) {
188 ret = -EADDRNOTAVAIL; 186 ret = -EADDRNOTAVAIL;
189 rds_remove_bound(rs); 187 rds_remove_bound(rs);
190 if (printk_ratelimit()) 188 if (printk_ratelimit())
@@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
198 196
199out: 197out:
200 release_sock(sk); 198 release_sock(sk);
199
200 /* we might have called rds_remove_bound on error */
201 if (ret)
202 synchronize_rcu();
201 return ret; 203 return ret;
202} 204}
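
The bind.c changes above replace the rbtree keyed on a packed (addr << 32 | port) value with a fixed-size hlist hash table: jhash_2words() picks the bucket and RCU protects lookups on the receive path. Below is a rough userspace sketch of just the bucket selection and 64-bit needle comparison, with no RCU and a trivial mixing function standing in for jhash_2words():

#include <stdio.h>
#include <stdint.h>

#define BIND_HASH_SIZE 1024	/* must stay a power of two for the mask */

struct bound_sock {
	uint32_t addr;		/* stand-ins for rs_bound_addr/port, host order */
	uint16_t port;
	struct bound_sock *next;
};

static struct bound_sock *bind_hash_table[BIND_HASH_SIZE];

/* Trivial stand-in for the kernel's jhash_2words(addr, port, 0). */
static unsigned int hash_two_words(uint32_t a, uint32_t b)
{
	uint64_t x = ((uint64_t)a << 32) | b;

	x *= 0x9e3779b97f4a7c15ULL;	/* Fibonacci hashing constant */
	return (unsigned int)(x >> 40);
}

static struct bound_sock **hash_to_bucket(uint32_t addr, uint16_t port)
{
	return &bind_hash_table[hash_two_words(addr, port) &
				(BIND_HASH_SIZE - 1)];
}

/* Mirrors rds_bind_lookup(): compare a packed 64-bit needle per entry. */
static struct bound_sock *bind_lookup(uint32_t addr, uint16_t port,
				      struct bound_sock *insert)
{
	uint64_t needle = ((uint64_t)addr << 32) | port;
	struct bound_sock **head = hash_to_bucket(addr, port);
	struct bound_sock *bs;

	for (bs = *head; bs; bs = bs->next) {
		uint64_t cmp = ((uint64_t)bs->addr << 32) | bs->port;

		if (cmp == needle)
			return bs;
	}

	if (insert) {
		/* set the key before linking, as the kernel code does */
		insert->addr = addr;
		insert->port = port;
		insert->next = *head;
		*head = insert;
	}
	return NULL;
}

int main(void)
{
	struct bound_sock a = { 0 }, *found;

	bind_lookup(0x7f000001, 4000, &a);	/* bind 127.0.0.1:4000 */
	found = bind_lookup(0x7f000001, 4000, NULL);
	printf("lookup: %s\n", found == &a ? "hit" : "miss");
	printf("other port: %s\n",
	       bind_lookup(0x7f000001, 4001, NULL) ? "hit" : "miss");
	return 0;
}
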
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 0871a29f0780..6daaa49d133f 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -33,8 +33,7 @@
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include <linux/types.h> 34#include <linux/types.h>
35#include <linux/rbtree.h> 35#include <linux/rbtree.h>
36 36#include <linux/bitops.h>
37#include <asm-generic/bitops/le.h>
38 37
39#include "rds.h" 38#include "rds.h"
40 39
@@ -141,7 +140,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
141 unsigned long flags; 140 unsigned long flags;
142 141
143 map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); 142 map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
144 if (map == NULL) 143 if (!map)
145 return NULL; 144 return NULL;
146 145
147 map->m_addr = addr; 146 map->m_addr = addr;
@@ -159,7 +158,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
159 ret = rds_cong_tree_walk(addr, map); 158 ret = rds_cong_tree_walk(addr, map);
160 spin_unlock_irqrestore(&rds_cong_lock, flags); 159 spin_unlock_irqrestore(&rds_cong_lock, flags);
161 160
162 if (ret == NULL) { 161 if (!ret) {
163 ret = map; 162 ret = map;
164 map = NULL; 163 map = NULL;
165 } 164 }
@@ -205,7 +204,7 @@ int rds_cong_get_maps(struct rds_connection *conn)
205 conn->c_lcong = rds_cong_from_addr(conn->c_laddr); 204 conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
206 conn->c_fcong = rds_cong_from_addr(conn->c_faddr); 205 conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
207 206
208 if (conn->c_lcong == NULL || conn->c_fcong == NULL) 207 if (!(conn->c_lcong && conn->c_fcong))
209 return -ENOMEM; 208 return -ENOMEM;
210 209
211 return 0; 210 return 0;
@@ -221,7 +220,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
221 list_for_each_entry(conn, &map->m_conn_list, c_map_item) { 220 list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
222 if (!test_and_set_bit(0, &conn->c_map_queued)) { 221 if (!test_and_set_bit(0, &conn->c_map_queued)) {
223 rds_stats_inc(s_cong_update_queued); 222 rds_stats_inc(s_cong_update_queued);
224 queue_delayed_work(rds_wq, &conn->c_send_w, 0); 223 rds_send_xmit(conn);
225 } 224 }
226 } 225 }
227 226
@@ -285,7 +284,7 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
285 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; 284 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
286 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; 285 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
287 286
288 generic___set_le_bit(off, (void *)map->m_page_addrs[i]); 287 __set_bit_le(off, (void *)map->m_page_addrs[i]);
289} 288}
290 289
291void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) 290void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
@@ -299,7 +298,7 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
299 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; 298 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
300 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; 299 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
301 300
302 generic___clear_le_bit(off, (void *)map->m_page_addrs[i]); 301 __clear_bit_le(off, (void *)map->m_page_addrs[i]);
303} 302}
304 303
305static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) 304static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
@@ -310,7 +309,7 @@ static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
310 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; 309 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
311 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; 310 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
312 311
313 return generic_test_le_bit(off, (void *)map->m_page_addrs[i]); 312 return test_bit_le(off, (void *)map->m_page_addrs[i]);
314} 313}
315 314
316void rds_cong_add_socket(struct rds_sock *rs) 315void rds_cong_add_socket(struct rds_sock *rs)
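
The cong.c hunk swaps the asm-generic generic___set_le_bit() family for the <linux/bitops.h> __set_bit_le()/__clear_bit_le()/test_bit_le() helpers; the congestion map logic itself is unchanged, a port still selects a page and a little-endian bit offset within it. The sketch below open-codes that little-endian bit numbering over a flat byte buffer (an assumption made for illustration; the kernel map is split across pages):

#include <stdio.h>
#include <stdint.h>

#define MAP_BYTES 8192		/* 65536 ports, one bit each */

/* Little-endian bit numbering: bit N lives in byte N/8, at bit N%8,
 * independent of host endianness, which keeps the map layout stable
 * on the wire. */
static void set_bit_le(unsigned int nr, uint8_t *addr)
{
	addr[nr / 8] |= (uint8_t)(1u << (nr % 8));
}

static void clear_bit_le(unsigned int nr, uint8_t *addr)
{
	addr[nr / 8] &= (uint8_t)~(1u << (nr % 8));
}

static int test_bit_le(unsigned int nr, const uint8_t *addr)
{
	return (addr[nr / 8] >> (nr % 8)) & 1;
}

int main(void)
{
	static uint8_t cong_map[MAP_BYTES];
	uint16_t port = 6789;

	set_bit_le(port, cong_map);
	printf("port %u congested: %d\n", port, test_bit_le(port, cong_map));
	clear_bit_le(port, cong_map);
	printf("port %u congested: %d\n", port, test_bit_le(port, cong_map));
	return 0;
}
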
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 7619b671ca28..9334d892366e 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -37,7 +37,6 @@
37 37
38#include "rds.h" 38#include "rds.h"
39#include "loop.h" 39#include "loop.h"
40#include "rdma.h"
41 40
42#define RDS_CONNECTION_HASH_BITS 12 41#define RDS_CONNECTION_HASH_BITS 12
43#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) 42#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
@@ -63,18 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
63 var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ 62 var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
64} while (0) 63} while (0)
65 64
66static inline int rds_conn_is_sending(struct rds_connection *conn) 65/* rcu read lock must be held or the connection spinlock */
67{
68 int ret = 0;
69
70 if (!mutex_trylock(&conn->c_send_lock))
71 ret = 1;
72 else
73 mutex_unlock(&conn->c_send_lock);
74
75 return ret;
76}
77
78static struct rds_connection *rds_conn_lookup(struct hlist_head *head, 66static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
79 __be32 laddr, __be32 faddr, 67 __be32 laddr, __be32 faddr,
80 struct rds_transport *trans) 68 struct rds_transport *trans)
@@ -82,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
82 struct rds_connection *conn, *ret = NULL; 70 struct rds_connection *conn, *ret = NULL;
83 struct hlist_node *pos; 71 struct hlist_node *pos;
84 72
85 hlist_for_each_entry(conn, pos, head, c_hash_node) { 73 hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
86 if (conn->c_faddr == faddr && conn->c_laddr == laddr && 74 if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
87 conn->c_trans == trans) { 75 conn->c_trans == trans) {
88 ret = conn; 76 ret = conn;
@@ -100,7 +88,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
100 * and receiving over this connection again in the future. It is up to 88 * and receiving over this connection again in the future. It is up to
101 * the transport to have serialized this call with its send and recv. 89 * the transport to have serialized this call with its send and recv.
102 */ 90 */
103void rds_conn_reset(struct rds_connection *conn) 91static void rds_conn_reset(struct rds_connection *conn)
104{ 92{
105 rdsdebug("connection %pI4 to %pI4 reset\n", 93 rdsdebug("connection %pI4 to %pI4 reset\n",
106 &conn->c_laddr, &conn->c_faddr); 94 &conn->c_laddr, &conn->c_faddr);
@@ -129,10 +117,11 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
129{ 117{
130 struct rds_connection *conn, *parent = NULL; 118 struct rds_connection *conn, *parent = NULL;
131 struct hlist_head *head = rds_conn_bucket(laddr, faddr); 119 struct hlist_head *head = rds_conn_bucket(laddr, faddr);
120 struct rds_transport *loop_trans;
132 unsigned long flags; 121 unsigned long flags;
133 int ret; 122 int ret;
134 123
135 spin_lock_irqsave(&rds_conn_lock, flags); 124 rcu_read_lock();
136 conn = rds_conn_lookup(head, laddr, faddr, trans); 125 conn = rds_conn_lookup(head, laddr, faddr, trans);
137 if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && 126 if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
138 !is_outgoing) { 127 !is_outgoing) {
@@ -143,12 +132,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
143 parent = conn; 132 parent = conn;
144 conn = parent->c_passive; 133 conn = parent->c_passive;
145 } 134 }
146 spin_unlock_irqrestore(&rds_conn_lock, flags); 135 rcu_read_unlock();
147 if (conn) 136 if (conn)
148 goto out; 137 goto out;
149 138
150 conn = kmem_cache_zalloc(rds_conn_slab, gfp); 139 conn = kmem_cache_zalloc(rds_conn_slab, gfp);
151 if (conn == NULL) { 140 if (!conn) {
152 conn = ERR_PTR(-ENOMEM); 141 conn = ERR_PTR(-ENOMEM);
153 goto out; 142 goto out;
154 } 143 }
@@ -159,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
159 spin_lock_init(&conn->c_lock); 148 spin_lock_init(&conn->c_lock);
160 conn->c_next_tx_seq = 1; 149 conn->c_next_tx_seq = 1;
161 150
162 mutex_init(&conn->c_send_lock); 151 init_waitqueue_head(&conn->c_waitq);
163 INIT_LIST_HEAD(&conn->c_send_queue); 152 INIT_LIST_HEAD(&conn->c_send_queue);
164 INIT_LIST_HEAD(&conn->c_retrans); 153 INIT_LIST_HEAD(&conn->c_retrans);
165 154
@@ -175,7 +164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
175 * can bind to the destination address then we'd rather the messages 164 * can bind to the destination address then we'd rather the messages
176 * flow through loopback rather than either transport. 165 * flow through loopback rather than either transport.
177 */ 166 */
178 if (rds_trans_get_preferred(faddr)) { 167 loop_trans = rds_trans_get_preferred(faddr);
168 if (loop_trans) {
169 rds_trans_put(loop_trans);
179 conn->c_loopback = 1; 170 conn->c_loopback = 1;
180 if (is_outgoing && trans->t_prefer_loopback) { 171 if (is_outgoing && trans->t_prefer_loopback) {
181 /* "outgoing" connection - and the transport 172 /* "outgoing" connection - and the transport
@@ -238,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
238 kmem_cache_free(rds_conn_slab, conn); 229 kmem_cache_free(rds_conn_slab, conn);
239 conn = found; 230 conn = found;
240 } else { 231 } else {
241 hlist_add_head(&conn->c_hash_node, head); 232 hlist_add_head_rcu(&conn->c_hash_node, head);
242 rds_cong_add_conn(conn); 233 rds_cong_add_conn(conn);
243 rds_conn_count++; 234 rds_conn_count++;
244 } 235 }
@@ -263,21 +254,91 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
263} 254}
264EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); 255EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
265 256
257void rds_conn_shutdown(struct rds_connection *conn)
258{
259 /* shut it down unless it's down already */
260 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
261 /*
262 * Quiesce the connection mgmt handlers before we start tearing
263 * things down. We don't hold the mutex for the entire
264 * duration of the shutdown operation, else we may be
265 * deadlocking with the CM handler. Instead, the CM event
266 * handler is supposed to check for state DISCONNECTING
267 */
268 mutex_lock(&conn->c_cm_lock);
269 if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
270 && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
271 rds_conn_error(conn, "shutdown called in state %d\n",
272 atomic_read(&conn->c_state));
273 mutex_unlock(&conn->c_cm_lock);
274 return;
275 }
276 mutex_unlock(&conn->c_cm_lock);
277
278 wait_event(conn->c_waitq,
279 !test_bit(RDS_IN_XMIT, &conn->c_flags));
280
281 conn->c_trans->conn_shutdown(conn);
282 rds_conn_reset(conn);
283
284 if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
285 /* This can happen - eg when we're in the middle of tearing
286 * down the connection, and someone unloads the rds module.
287 * Quite reproduceable with loopback connections.
288 * Mostly harmless.
289 */
290 rds_conn_error(conn,
291 "%s: failed to transition to state DOWN, "
292 "current state is %d\n",
293 __func__,
294 atomic_read(&conn->c_state));
295 return;
296 }
297 }
298
299 /* Then reconnect if it's still live.
300 * The passive side of an IB loopback connection is never added
301 * to the conn hash, so we never trigger a reconnect on this
302 * conn - the reconnect is always triggered by the active peer. */
303 cancel_delayed_work_sync(&conn->c_conn_w);
304 rcu_read_lock();
305 if (!hlist_unhashed(&conn->c_hash_node)) {
306 rcu_read_unlock();
307 rds_queue_reconnect(conn);
308 } else {
309 rcu_read_unlock();
310 }
311}
312
313/*
314 * Stop and free a connection.
315 *
316 * This can only be used in very limited circumstances. It assumes that once
317 * the conn has been shutdown that no one else is referencing the connection.
318 * We can only ensure this in the rmmod path in the current code.
319 */
266void rds_conn_destroy(struct rds_connection *conn) 320void rds_conn_destroy(struct rds_connection *conn)
267{ 321{
268 struct rds_message *rm, *rtmp; 322 struct rds_message *rm, *rtmp;
323 unsigned long flags;
269 324
270 rdsdebug("freeing conn %p for %pI4 -> " 325 rdsdebug("freeing conn %p for %pI4 -> "
271 "%pI4\n", conn, &conn->c_laddr, 326 "%pI4\n", conn, &conn->c_laddr,
272 &conn->c_faddr); 327 &conn->c_faddr);
273 328
274 hlist_del_init(&conn->c_hash_node); 329 /* Ensure conn will not be scheduled for reconnect */
330 spin_lock_irq(&rds_conn_lock);
331 hlist_del_init_rcu(&conn->c_hash_node);
332 spin_unlock_irq(&rds_conn_lock);
333 synchronize_rcu();
275 334
276 /* wait for the rds thread to shut it down */ 335 /* shut the connection down */
277 atomic_set(&conn->c_state, RDS_CONN_ERROR); 336 rds_conn_drop(conn);
278 cancel_delayed_work(&conn->c_conn_w); 337 flush_work(&conn->c_down_w);
279 queue_work(rds_wq, &conn->c_down_w); 338
280 flush_workqueue(rds_wq); 339 /* make sure lingering queued work won't try to ref the conn */
340 cancel_delayed_work_sync(&conn->c_send_w);
341 cancel_delayed_work_sync(&conn->c_recv_w);
281 342
282 /* tear down queued messages */ 343 /* tear down queued messages */
283 list_for_each_entry_safe(rm, rtmp, 344 list_for_each_entry_safe(rm, rtmp,
@@ -302,7 +363,9 @@ void rds_conn_destroy(struct rds_connection *conn)
302 BUG_ON(!list_empty(&conn->c_retrans)); 363 BUG_ON(!list_empty(&conn->c_retrans));
303 kmem_cache_free(rds_conn_slab, conn); 364 kmem_cache_free(rds_conn_slab, conn);
304 365
366 spin_lock_irqsave(&rds_conn_lock, flags);
305 rds_conn_count--; 367 rds_conn_count--;
368 spin_unlock_irqrestore(&rds_conn_lock, flags);
306} 369}
307EXPORT_SYMBOL_GPL(rds_conn_destroy); 370EXPORT_SYMBOL_GPL(rds_conn_destroy);
308 371
@@ -316,23 +379,23 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
316 struct list_head *list; 379 struct list_head *list;
317 struct rds_connection *conn; 380 struct rds_connection *conn;
318 struct rds_message *rm; 381 struct rds_message *rm;
319 unsigned long flags;
320 unsigned int total = 0; 382 unsigned int total = 0;
383 unsigned long flags;
321 size_t i; 384 size_t i;
322 385
323 len /= sizeof(struct rds_info_message); 386 len /= sizeof(struct rds_info_message);
324 387
325 spin_lock_irqsave(&rds_conn_lock, flags); 388 rcu_read_lock();
326 389
327 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); 390 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
328 i++, head++) { 391 i++, head++) {
329 hlist_for_each_entry(conn, pos, head, c_hash_node) { 392 hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
330 if (want_send) 393 if (want_send)
331 list = &conn->c_send_queue; 394 list = &conn->c_send_queue;
332 else 395 else
333 list = &conn->c_retrans; 396 list = &conn->c_retrans;
334 397
335 spin_lock(&conn->c_lock); 398 spin_lock_irqsave(&conn->c_lock, flags);
336 399
337 /* XXX too lazy to maintain counts.. */ 400 /* XXX too lazy to maintain counts.. */
338 list_for_each_entry(rm, list, m_conn_item) { 401 list_for_each_entry(rm, list, m_conn_item) {
@@ -343,11 +406,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
343 conn->c_faddr, 0); 406 conn->c_faddr, 0);
344 } 407 }
345 408
346 spin_unlock(&conn->c_lock); 409 spin_unlock_irqrestore(&conn->c_lock, flags);
347 } 410 }
348 } 411 }
349 412 rcu_read_unlock();
350 spin_unlock_irqrestore(&rds_conn_lock, flags);
351 413
352 lens->nr = total; 414 lens->nr = total;
353 lens->each = sizeof(struct rds_info_message); 415 lens->each = sizeof(struct rds_info_message);
@@ -377,19 +439,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
377 uint64_t buffer[(item_len + 7) / 8]; 439 uint64_t buffer[(item_len + 7) / 8];
378 struct hlist_head *head; 440 struct hlist_head *head;
379 struct hlist_node *pos; 441 struct hlist_node *pos;
380 struct hlist_node *tmp;
381 struct rds_connection *conn; 442 struct rds_connection *conn;
382 unsigned long flags;
383 size_t i; 443 size_t i;
384 444
385 spin_lock_irqsave(&rds_conn_lock, flags); 445 rcu_read_lock();
386 446
387 lens->nr = 0; 447 lens->nr = 0;
388 lens->each = item_len; 448 lens->each = item_len;
389 449
390 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); 450 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
391 i++, head++) { 451 i++, head++) {
392 hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) { 452 hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
393 453
394 /* XXX no c_lock usage.. */ 454 /* XXX no c_lock usage.. */
395 if (!visitor(conn, buffer)) 455 if (!visitor(conn, buffer))
@@ -405,8 +465,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
405 lens->nr++; 465 lens->nr++;
406 } 466 }
407 } 467 }
408 468 rcu_read_unlock();
409 spin_unlock_irqrestore(&rds_conn_lock, flags);
410} 469}
411EXPORT_SYMBOL_GPL(rds_for_each_conn_info); 470EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
412 471
@@ -423,8 +482,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn,
423 sizeof(cinfo->transport)); 482 sizeof(cinfo->transport));
424 cinfo->flags = 0; 483 cinfo->flags = 0;
425 484
426 rds_conn_info_set(cinfo->flags, 485 rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
427 rds_conn_is_sending(conn), SENDING); 486 SENDING);
428 /* XXX Future: return the state rather than these funky bits */ 487 /* XXX Future: return the state rather than these funky bits */
429 rds_conn_info_set(cinfo->flags, 488 rds_conn_info_set(cinfo->flags,
430 atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, 489 atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
@@ -444,12 +503,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
444 sizeof(struct rds_info_connection)); 503 sizeof(struct rds_info_connection));
445} 504}
446 505
447int __init rds_conn_init(void) 506int rds_conn_init(void)
448{ 507{
449 rds_conn_slab = kmem_cache_create("rds_connection", 508 rds_conn_slab = kmem_cache_create("rds_connection",
450 sizeof(struct rds_connection), 509 sizeof(struct rds_connection),
451 0, 0, NULL); 510 0, 0, NULL);
452 if (rds_conn_slab == NULL) 511 if (!rds_conn_slab)
453 return -ENOMEM; 512 return -ENOMEM;
454 513
455 rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); 514 rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
@@ -487,6 +546,18 @@ void rds_conn_drop(struct rds_connection *conn)
487EXPORT_SYMBOL_GPL(rds_conn_drop); 546EXPORT_SYMBOL_GPL(rds_conn_drop);
488 547
489/* 548/*
549 * If the connection is down, trigger a connect. We may have scheduled a
550 * delayed reconnect however - in this case we should not interfere.
551 */
552void rds_conn_connect_if_down(struct rds_connection *conn)
553{
554 if (rds_conn_state(conn) == RDS_CONN_DOWN &&
555 !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
556 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
557}
558EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
559
560/*
490 * An error occurred on the connection 561 * An error occurred on the connection
491 */ 562 */
492void 563void
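
rds_conn_shutdown() above leans on rds_conn_transition(), which elsewhere in RDS (rds.h, not part of this hunk) is a compare-and-swap on conn->c_state, so only one caller wins the UP -> DISCONNECTING transition and the rest back off. A minimal userspace sketch of that style of state machine, using C11 atomics and invented state names in place of the kernel's atomic_cmpxchg()-based helper:

#include <stdio.h>
#include <stdatomic.h>

enum conn_state { CONN_DOWN, CONN_CONNECTING, CONN_UP,
		  CONN_DISCONNECTING, CONN_ERROR };

struct conn {
	_Atomic int state;
};

/* Succeeds only if the connection is currently in 'old'; mirrors the
 * cmpxchg-based transition described in the lead-in. */
static int conn_transition(struct conn *c, int old, int new_state)
{
	int expected = old;

	return atomic_compare_exchange_strong(&c->state, &expected, new_state);
}

static void conn_shutdown(struct conn *c)
{
	if (!conn_transition(c, CONN_UP, CONN_DISCONNECTING) &&
	    !conn_transition(c, CONN_ERROR, CONN_DISCONNECTING)) {
		printf("shutdown called in state %d, nothing to do\n",
		       atomic_load(&c->state));
		return;
	}
	/* ... quiesce senders, tear down transport state ... */
	conn_transition(c, CONN_DISCONNECTING, CONN_DOWN);
}

int main(void)
{
	struct conn c;

	atomic_init(&c.state, CONN_UP);
	conn_shutdown(&c);		/* UP -> DISCONNECTING -> DOWN */
	printf("state after shutdown: %d\n", atomic_load(&c.state));
	conn_shutdown(&c);		/* already DOWN: transition refused */
	return 0;
}
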
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 8f2d6dd7700a..3b83086bcc30 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -42,7 +42,7 @@
42#include "rds.h" 42#include "rds.h"
43#include "ib.h" 43#include "ib.h"
44 44
45unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; 45static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
46unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ 46unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
47unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; 47unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
48 48
@@ -53,13 +53,72 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
53module_param(rds_ib_retry_count, int, 0444); 53module_param(rds_ib_retry_count, int, 0444);
54MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); 54MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
55 55
56/*
57 * we have a clumsy combination of RCU and a rwsem protecting this list
58 * because it is used both in the get_mr fast path and while blocking in
59 * the FMR flushing path.
60 */
61DECLARE_RWSEM(rds_ib_devices_lock);
56struct list_head rds_ib_devices; 62struct list_head rds_ib_devices;
57 63
58/* NOTE: if also grabbing ibdev lock, grab this first */ 64/* NOTE: if also grabbing ibdev lock, grab this first */
59DEFINE_SPINLOCK(ib_nodev_conns_lock); 65DEFINE_SPINLOCK(ib_nodev_conns_lock);
60LIST_HEAD(ib_nodev_conns); 66LIST_HEAD(ib_nodev_conns);
61 67
62void rds_ib_add_one(struct ib_device *device) 68static void rds_ib_nodev_connect(void)
69{
70 struct rds_ib_connection *ic;
71
72 spin_lock(&ib_nodev_conns_lock);
73 list_for_each_entry(ic, &ib_nodev_conns, ib_node)
74 rds_conn_connect_if_down(ic->conn);
75 spin_unlock(&ib_nodev_conns_lock);
76}
77
78static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
79{
80 struct rds_ib_connection *ic;
81 unsigned long flags;
82
83 spin_lock_irqsave(&rds_ibdev->spinlock, flags);
84 list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
85 rds_conn_drop(ic->conn);
86 spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
87}
88
89/*
90 * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
91 * from interrupt context so we push freing off into a work struct in krdsd.
92 */
93static void rds_ib_dev_free(struct work_struct *work)
94{
95 struct rds_ib_ipaddr *i_ipaddr, *i_next;
96 struct rds_ib_device *rds_ibdev = container_of(work,
97 struct rds_ib_device, free_work);
98
99 if (rds_ibdev->mr_pool)
100 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
101 if (rds_ibdev->mr)
102 ib_dereg_mr(rds_ibdev->mr);
103 if (rds_ibdev->pd)
104 ib_dealloc_pd(rds_ibdev->pd);
105
106 list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
107 list_del(&i_ipaddr->list);
108 kfree(i_ipaddr);
109 }
110
111 kfree(rds_ibdev);
112}
113
114void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
115{
116 BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
117 if (atomic_dec_and_test(&rds_ibdev->refcount))
118 queue_work(rds_wq, &rds_ibdev->free_work);
119}
120
121static void rds_ib_add_one(struct ib_device *device)
63{ 122{
64 struct rds_ib_device *rds_ibdev; 123 struct rds_ib_device *rds_ibdev;
65 struct ib_device_attr *dev_attr; 124 struct ib_device_attr *dev_attr;
@@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device)
77 goto free_attr; 136 goto free_attr;
78 } 137 }
79 138
80 rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); 139 rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
140 ibdev_to_node(device));
81 if (!rds_ibdev) 141 if (!rds_ibdev)
82 goto free_attr; 142 goto free_attr;
83 143
84 spin_lock_init(&rds_ibdev->spinlock); 144 spin_lock_init(&rds_ibdev->spinlock);
145 atomic_set(&rds_ibdev->refcount, 1);
146 INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
85 147
86 rds_ibdev->max_wrs = dev_attr->max_qp_wr; 148 rds_ibdev->max_wrs = dev_attr->max_qp_wr;
87 rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); 149 rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
@@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device)
91 min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : 153 min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
92 fmr_pool_size; 154 fmr_pool_size;
93 155
156 rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
157 rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
158
94 rds_ibdev->dev = device; 159 rds_ibdev->dev = device;
95 rds_ibdev->pd = ib_alloc_pd(device); 160 rds_ibdev->pd = ib_alloc_pd(device);
96 if (IS_ERR(rds_ibdev->pd)) 161 if (IS_ERR(rds_ibdev->pd)) {
97 goto free_dev; 162 rds_ibdev->pd = NULL;
163 goto put_dev;
164 }
98 165
99 rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, 166 rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
100 IB_ACCESS_LOCAL_WRITE); 167 if (IS_ERR(rds_ibdev->mr)) {
101 if (IS_ERR(rds_ibdev->mr)) 168 rds_ibdev->mr = NULL;
102 goto err_pd; 169 goto put_dev;
170 }
103 171
104 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); 172 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
105 if (IS_ERR(rds_ibdev->mr_pool)) { 173 if (IS_ERR(rds_ibdev->mr_pool)) {
106 rds_ibdev->mr_pool = NULL; 174 rds_ibdev->mr_pool = NULL;
107 goto err_mr; 175 goto put_dev;
108 } 176 }
109 177
110 INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); 178 INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
111 INIT_LIST_HEAD(&rds_ibdev->conn_list); 179 INIT_LIST_HEAD(&rds_ibdev->conn_list);
112 list_add_tail(&rds_ibdev->list, &rds_ib_devices); 180
181 down_write(&rds_ib_devices_lock);
182 list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
183 up_write(&rds_ib_devices_lock);
184 atomic_inc(&rds_ibdev->refcount);
113 185
114 ib_set_client_data(device, &rds_ib_client, rds_ibdev); 186 ib_set_client_data(device, &rds_ib_client, rds_ibdev);
187 atomic_inc(&rds_ibdev->refcount);
115 188
116 goto free_attr; 189 rds_ib_nodev_connect();
117 190
118err_mr: 191put_dev:
119 ib_dereg_mr(rds_ibdev->mr); 192 rds_ib_dev_put(rds_ibdev);
120err_pd:
121 ib_dealloc_pd(rds_ibdev->pd);
122free_dev:
123 kfree(rds_ibdev);
124free_attr: 193free_attr:
125 kfree(dev_attr); 194 kfree(dev_attr);
126} 195}
127 196
128void rds_ib_remove_one(struct ib_device *device) 197/*
198 * New connections use this to find the device to associate with the
199 * connection. It's not in the fast path so we're not concerned about the
200 * performance of the IB call. (As of this writing, it uses an interrupt
201 * blocking spinlock to serialize walking a per-device list of all registered
202 * clients.)
203 *
204 * RCU is used to handle incoming connections racing with device teardown.
205 * Rather than use a lock to serialize removal from the client_data and
206 * getting a new reference, we use an RCU grace period. The destruction
207 * path removes the device from client_data and then waits for all RCU
208 * readers to finish.
209 *
210 * A new connection can get NULL from this if its arriving on a
211 * device that is in the process of being removed.
212 */
213struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
129{ 214{
130 struct rds_ib_device *rds_ibdev; 215 struct rds_ib_device *rds_ibdev;
131 struct rds_ib_ipaddr *i_ipaddr, *i_next;
132 216
217 rcu_read_lock();
133 rds_ibdev = ib_get_client_data(device, &rds_ib_client); 218 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
134 if (!rds_ibdev) 219 if (rds_ibdev)
135 return; 220 atomic_inc(&rds_ibdev->refcount);
221 rcu_read_unlock();
222 return rds_ibdev;
223}
136 224
137 list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { 225/*
138 list_del(&i_ipaddr->list); 226 * The IB stack is letting us know that a device is going away. This can
139 kfree(i_ipaddr); 227 * happen if the underlying HCA driver is removed or if PCI hotplug is removing
140 } 228 * the pci function, for example.
229 *
230 * This can be called at any time and can be racing with any other RDS path.
231 */
232static void rds_ib_remove_one(struct ib_device *device)
233{
234 struct rds_ib_device *rds_ibdev;
141 235
142 rds_ib_destroy_conns(rds_ibdev); 236 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
237 if (!rds_ibdev)
238 return;
143 239
144 if (rds_ibdev->mr_pool) 240 rds_ib_dev_shutdown(rds_ibdev);
145 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
146 241
147 ib_dereg_mr(rds_ibdev->mr); 242 /* stop connection attempts from getting a reference to this device. */
243 ib_set_client_data(device, &rds_ib_client, NULL);
148 244
149 while (ib_dealloc_pd(rds_ibdev->pd)) { 245 down_write(&rds_ib_devices_lock);
150 rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); 246 list_del_rcu(&rds_ibdev->list);
151 msleep(1); 247 up_write(&rds_ib_devices_lock);
152 }
153 248
154 list_del(&rds_ibdev->list); 249 /*
155 kfree(rds_ibdev); 250 * This synchronize rcu is waiting for readers of both the ib
251 * client data and the devices list to finish before we drop
252 * both of those references.
253 */
254 synchronize_rcu();
255 rds_ib_dev_put(rds_ibdev);
256 rds_ib_dev_put(rds_ibdev);
156} 257}
157 258
158struct ib_client rds_ib_client = { 259struct ib_client rds_ib_client = {
@@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
186 rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); 287 rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
187 rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); 288 rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
188 289
189 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); 290 rds_ibdev = ic->rds_ibdev;
190 iinfo->max_send_wr = ic->i_send_ring.w_nr; 291 iinfo->max_send_wr = ic->i_send_ring.w_nr;
191 iinfo->max_recv_wr = ic->i_recv_ring.w_nr; 292 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
192 iinfo->max_send_sge = rds_ibdev->max_sge; 293 iinfo->max_send_sge = rds_ibdev->max_sge;
@@ -224,7 +325,7 @@ static int rds_ib_laddr_check(__be32 addr)
224 /* Create a CMA ID and try to bind it. This catches both 325 /* Create a CMA ID and try to bind it. This catches both
225 * IB and iWARP capable NICs. 326 * IB and iWARP capable NICs.
226 */ 327 */
227 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); 328 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
228 if (IS_ERR(cm_id)) 329 if (IS_ERR(cm_id))
229 return PTR_ERR(cm_id); 330 return PTR_ERR(cm_id);
230 331
@@ -248,11 +349,18 @@ static int rds_ib_laddr_check(__be32 addr)
248 return ret; 349 return ret;
249} 350}
250 351
352static void rds_ib_unregister_client(void)
353{
354 ib_unregister_client(&rds_ib_client);
355 /* wait for rds_ib_dev_free() to complete */
356 flush_workqueue(rds_wq);
357}
358
251void rds_ib_exit(void) 359void rds_ib_exit(void)
252{ 360{
253 rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 361 rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
362 rds_ib_unregister_client();
254 rds_ib_destroy_nodev_conns(); 363 rds_ib_destroy_nodev_conns();
255 ib_unregister_client(&rds_ib_client);
256 rds_ib_sysctl_exit(); 364 rds_ib_sysctl_exit();
257 rds_ib_recv_exit(); 365 rds_ib_recv_exit();
258 rds_trans_unregister(&rds_ib_transport); 366 rds_trans_unregister(&rds_ib_transport);
@@ -262,15 +370,14 @@ struct rds_transport rds_ib_transport = {
262 .laddr_check = rds_ib_laddr_check, 370 .laddr_check = rds_ib_laddr_check,
263 .xmit_complete = rds_ib_xmit_complete, 371 .xmit_complete = rds_ib_xmit_complete,
264 .xmit = rds_ib_xmit, 372 .xmit = rds_ib_xmit,
265 .xmit_cong_map = NULL,
266 .xmit_rdma = rds_ib_xmit_rdma, 373 .xmit_rdma = rds_ib_xmit_rdma,
374 .xmit_atomic = rds_ib_xmit_atomic,
267 .recv = rds_ib_recv, 375 .recv = rds_ib_recv,
268 .conn_alloc = rds_ib_conn_alloc, 376 .conn_alloc = rds_ib_conn_alloc,
269 .conn_free = rds_ib_conn_free, 377 .conn_free = rds_ib_conn_free,
270 .conn_connect = rds_ib_conn_connect, 378 .conn_connect = rds_ib_conn_connect,
271 .conn_shutdown = rds_ib_conn_shutdown, 379 .conn_shutdown = rds_ib_conn_shutdown,
272 .inc_copy_to_user = rds_ib_inc_copy_to_user, 380 .inc_copy_to_user = rds_ib_inc_copy_to_user,
273 .inc_purge = rds_ib_inc_purge,
274 .inc_free = rds_ib_inc_free, 381 .inc_free = rds_ib_inc_free,
275 .cm_initiate_connect = rds_ib_cm_initiate_connect, 382 .cm_initiate_connect = rds_ib_cm_initiate_connect,
276 .cm_handle_connect = rds_ib_cm_handle_connect, 383 .cm_handle_connect = rds_ib_cm_handle_connect,
@@ -286,7 +393,7 @@ struct rds_transport rds_ib_transport = {
286 .t_type = RDS_TRANS_IB 393 .t_type = RDS_TRANS_IB
287}; 394};
288 395
289int __init rds_ib_init(void) 396int rds_ib_init(void)
290{ 397{
291 int ret; 398 int ret;
292 399
@@ -317,7 +424,7 @@ out_recv:
317out_sysctl: 424out_sysctl:
318 rds_ib_sysctl_exit(); 425 rds_ib_sysctl_exit();
319out_ibreg: 426out_ibreg:
320 ib_unregister_client(&rds_ib_client); 427 rds_ib_unregister_client();
321out: 428out:
322 return ret; 429 return ret;
323} 430}
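
The ib.c rework above puts device teardown behind a reference count: users take a reference through rds_ib_get_client_data(), and rds_ib_dev_put() only queues rds_ib_dev_free() on krdsd once the last reference drops, since freeing (destroying the FMR pool) can block while puts may arrive from interrupt context. A userspace sketch of that last-put-schedules-free shape, with C11 atomics, a direct function call standing in for queue_work(), and a made-up device name:

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct device {
	_Atomic int refcount;
	char name[16];
};

/* Stand-in for the deferred rds_ib_dev_free() work item. */
static void device_free_work(struct device *dev)
{
	printf("freeing %s (deferred)\n", dev->name);
	free(dev);
}

static void device_get(struct device *dev)
{
	atomic_fetch_add(&dev->refcount, 1);
}

static void device_put(struct device *dev)
{
	/* fetch_sub returns the previous value; 1 means we were the last user */
	if (atomic_fetch_sub(&dev->refcount, 1) == 1)
		device_free_work(dev);	/* the kernel queues this on a workqueue */
}

int main(void)
{
	struct device *dev = calloc(1, sizeof(*dev));

	if (!dev)
		return 1;
	atomic_init(&dev->refcount, 1);		/* creator's reference */
	snprintf(dev->name, sizeof(dev->name), "mlx4_0");

	device_get(dev);	/* e.g. a new connection takes a ref */
	device_put(dev);	/* connection goes away */
	device_put(dev);	/* creator drops its ref: free runs now */
	return 0;
}
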
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 64df4e79b29f..4297d92788dc 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -3,11 +3,13 @@
3 3
4#include <rdma/ib_verbs.h> 4#include <rdma/ib_verbs.h>
5#include <rdma/rdma_cm.h> 5#include <rdma/rdma_cm.h>
6#include <linux/pci.h>
7#include <linux/slab.h>
6#include "rds.h" 8#include "rds.h"
7#include "rdma_transport.h" 9#include "rdma_transport.h"
8 10
9#define RDS_FMR_SIZE 256 11#define RDS_FMR_SIZE 256
10#define RDS_FMR_POOL_SIZE 4096 12#define RDS_FMR_POOL_SIZE 8192
11 13
12#define RDS_IB_MAX_SGE 8 14#define RDS_IB_MAX_SGE 8
13#define RDS_IB_RECV_SGE 2 15#define RDS_IB_RECV_SGE 2
@@ -19,6 +21,9 @@
19 21
20#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ 22#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
21 23
24#define RDS_IB_RECYCLE_BATCH_COUNT 32
25
26extern struct rw_semaphore rds_ib_devices_lock;
22extern struct list_head rds_ib_devices; 27extern struct list_head rds_ib_devices;
23 28
24/* 29/*
@@ -26,20 +31,29 @@ extern struct list_head rds_ib_devices;
26 * try and minimize the amount of memory tied up both the device and 31 * try and minimize the amount of memory tied up both the device and
27 * socket receive queues. 32 * socket receive queues.
28 */ 33 */
29/* page offset of the final full frag that fits in the page */
30#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
31struct rds_page_frag { 34struct rds_page_frag {
32 struct list_head f_item; 35 struct list_head f_item;
33 struct page *f_page; 36 struct list_head f_cache_entry;
34 unsigned long f_offset; 37 struct scatterlist f_sg;
35 dma_addr_t f_mapped;
36}; 38};
37 39
38struct rds_ib_incoming { 40struct rds_ib_incoming {
39 struct list_head ii_frags; 41 struct list_head ii_frags;
42 struct list_head ii_cache_entry;
40 struct rds_incoming ii_inc; 43 struct rds_incoming ii_inc;
41}; 44};
42 45
46struct rds_ib_cache_head {
47 struct list_head *first;
48 unsigned long count;
49};
50
51struct rds_ib_refill_cache {
52 struct rds_ib_cache_head *percpu;
53 struct list_head *xfer;
54 struct list_head *ready;
55};
56
43struct rds_ib_connect_private { 57struct rds_ib_connect_private {
44 /* Add new fields at the end, and don't permute existing fields. */ 58 /* Add new fields at the end, and don't permute existing fields. */
45 __be32 dp_saddr; 59 __be32 dp_saddr;
@@ -53,8 +67,7 @@ struct rds_ib_connect_private {
53}; 67};
54 68
55struct rds_ib_send_work { 69struct rds_ib_send_work {
56 struct rds_message *s_rm; 70 void *s_op;
57 struct rds_rdma_op *s_op;
58 struct ib_send_wr s_wr; 71 struct ib_send_wr s_wr;
59 struct ib_sge s_sge[RDS_IB_MAX_SGE]; 72 struct ib_sge s_sge[RDS_IB_MAX_SGE];
60 unsigned long s_queued; 73 unsigned long s_queued;
@@ -92,10 +105,11 @@ struct rds_ib_connection {
92 105
93 /* tx */ 106 /* tx */
94 struct rds_ib_work_ring i_send_ring; 107 struct rds_ib_work_ring i_send_ring;
95 struct rds_message *i_rm; 108 struct rm_data_op *i_data_op;
96 struct rds_header *i_send_hdrs; 109 struct rds_header *i_send_hdrs;
97 u64 i_send_hdrs_dma; 110 u64 i_send_hdrs_dma;
98 struct rds_ib_send_work *i_sends; 111 struct rds_ib_send_work *i_sends;
112 atomic_t i_signaled_sends;
99 113
100 /* rx */ 114 /* rx */
101 struct tasklet_struct i_recv_tasklet; 115 struct tasklet_struct i_recv_tasklet;
@@ -106,8 +120,9 @@ struct rds_ib_connection {
106 struct rds_header *i_recv_hdrs; 120 struct rds_header *i_recv_hdrs;
107 u64 i_recv_hdrs_dma; 121 u64 i_recv_hdrs_dma;
108 struct rds_ib_recv_work *i_recvs; 122 struct rds_ib_recv_work *i_recvs;
109 struct rds_page_frag i_frag;
110 u64 i_ack_recv; /* last ACK received */ 123 u64 i_ack_recv; /* last ACK received */
124 struct rds_ib_refill_cache i_cache_incs;
125 struct rds_ib_refill_cache i_cache_frags;
111 126
112 /* sending acks */ 127 /* sending acks */
113 unsigned long i_ack_flags; 128 unsigned long i_ack_flags;
@@ -138,7 +153,6 @@ struct rds_ib_connection {
138 153
139 /* Batched completions */ 154 /* Batched completions */
140 unsigned int i_unsignaled_wrs; 155 unsigned int i_unsignaled_wrs;
141 long i_unsignaled_bytes;
142}; 156};
143 157
144/* This assumes that atomic_t is at least 32 bits */ 158/* This assumes that atomic_t is at least 32 bits */
@@ -164,9 +178,17 @@ struct rds_ib_device {
164 unsigned int max_fmrs; 178 unsigned int max_fmrs;
165 int max_sge; 179 int max_sge;
166 unsigned int max_wrs; 180 unsigned int max_wrs;
181 unsigned int max_initiator_depth;
182 unsigned int max_responder_resources;
167 spinlock_t spinlock; /* protect the above */ 183 spinlock_t spinlock; /* protect the above */
184 atomic_t refcount;
185 struct work_struct free_work;
168}; 186};
169 187
188#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
189#define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device))
190#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
191
170/* bits for i_ack_flags */ 192/* bits for i_ack_flags */
171#define IB_ACK_IN_FLIGHT 0 193#define IB_ACK_IN_FLIGHT 0
172#define IB_ACK_REQUESTED 1 194#define IB_ACK_REQUESTED 1
@@ -202,6 +224,8 @@ struct rds_ib_statistics {
202 uint64_t s_ib_rdma_mr_pool_flush; 224 uint64_t s_ib_rdma_mr_pool_flush;
203 uint64_t s_ib_rdma_mr_pool_wait; 225 uint64_t s_ib_rdma_mr_pool_wait;
204 uint64_t s_ib_rdma_mr_pool_depleted; 226 uint64_t s_ib_rdma_mr_pool_depleted;
227 uint64_t s_ib_atomic_cswp;
228 uint64_t s_ib_atomic_fadd;
205}; 229};
206 230
207extern struct workqueue_struct *rds_ib_wq; 231extern struct workqueue_struct *rds_ib_wq;
@@ -241,11 +265,10 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
241 265
242/* ib.c */ 266/* ib.c */
243extern struct rds_transport rds_ib_transport; 267extern struct rds_transport rds_ib_transport;
244extern void rds_ib_add_one(struct ib_device *device); 268struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
245extern void rds_ib_remove_one(struct ib_device *device); 269void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
246extern struct ib_client rds_ib_client; 270extern struct ib_client rds_ib_client;
247 271
248extern unsigned int fmr_pool_size;
249extern unsigned int fmr_message_size; 272extern unsigned int fmr_message_size;
250extern unsigned int rds_ib_retry_count; 273extern unsigned int rds_ib_retry_count;
251 274
@@ -258,7 +281,7 @@ void rds_ib_conn_free(void *arg);
258int rds_ib_conn_connect(struct rds_connection *conn); 281int rds_ib_conn_connect(struct rds_connection *conn);
259void rds_ib_conn_shutdown(struct rds_connection *conn); 282void rds_ib_conn_shutdown(struct rds_connection *conn);
260void rds_ib_state_change(struct sock *sk); 283void rds_ib_state_change(struct sock *sk);
261int __init rds_ib_listen_init(void); 284int rds_ib_listen_init(void);
262void rds_ib_listen_stop(void); 285void rds_ib_listen_stop(void);
263void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); 286void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
264int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 287int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -275,15 +298,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
275int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); 298int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
276void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); 299void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
277void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); 300void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
278void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock); 301void rds_ib_destroy_nodev_conns(void);
279static inline void rds_ib_destroy_nodev_conns(void)
280{
281 __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
282}
283static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
284{
285 __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
286}
287struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); 302struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
288void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); 303void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
289void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); 304void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
@@ -294,12 +309,12 @@ void rds_ib_free_mr(void *trans_private, int invalidate);
294void rds_ib_flush_mrs(void); 309void rds_ib_flush_mrs(void);
295 310
296/* ib_recv.c */ 311/* ib_recv.c */
297int __init rds_ib_recv_init(void); 312int rds_ib_recv_init(void);
298void rds_ib_recv_exit(void); 313void rds_ib_recv_exit(void);
299int rds_ib_recv(struct rds_connection *conn); 314int rds_ib_recv(struct rds_connection *conn);
300int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, 315int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
301 gfp_t page_gfp, int prefill); 316void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
302void rds_ib_inc_purge(struct rds_incoming *inc); 317void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
303void rds_ib_inc_free(struct rds_incoming *inc); 318void rds_ib_inc_free(struct rds_incoming *inc);
304int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, 319int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
305 size_t size); 320 size_t size);
@@ -325,17 +340,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
325extern wait_queue_head_t rds_ib_ring_empty_wait; 340extern wait_queue_head_t rds_ib_ring_empty_wait;
326 341
327/* ib_send.c */ 342/* ib_send.c */
343char *rds_ib_wc_status_str(enum ib_wc_status status);
328void rds_ib_xmit_complete(struct rds_connection *conn); 344void rds_ib_xmit_complete(struct rds_connection *conn);
329int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, 345int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
330 unsigned int hdr_off, unsigned int sg, unsigned int off); 346 unsigned int hdr_off, unsigned int sg, unsigned int off);
331void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); 347void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
332void rds_ib_send_init_ring(struct rds_ib_connection *ic); 348void rds_ib_send_init_ring(struct rds_ib_connection *ic);
333void rds_ib_send_clear_ring(struct rds_ib_connection *ic); 349void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
334int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); 350int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
335void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); 351void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
336void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); 352void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
337int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, 353int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
338 u32 *adv_credits, int need_posted, int max_posted); 354 u32 *adv_credits, int need_posted, int max_posted);
355int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
339 356
340/* ib_stats.c */ 357/* ib_stats.c */
341DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); 358DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
@@ -344,7 +361,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
344 unsigned int avail); 361 unsigned int avail);
345 362
346/* ib_sysctl.c */ 363/* ib_sysctl.c */
347int __init rds_ib_sysctl_init(void); 364int rds_ib_sysctl_init(void);
348void rds_ib_sysctl_exit(void); 365void rds_ib_sysctl_exit(void);
349extern unsigned long rds_ib_sysctl_max_send_wr; 366extern unsigned long rds_ib_sysctl_max_send_wr;
350extern unsigned long rds_ib_sysctl_max_recv_wr; 367extern unsigned long rds_ib_sysctl_max_recv_wr;
@@ -352,30 +369,5 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs;
352extern unsigned long rds_ib_sysctl_max_unsig_bytes; 369extern unsigned long rds_ib_sysctl_max_unsig_bytes;
353extern unsigned long rds_ib_sysctl_max_recv_allocation; 370extern unsigned long rds_ib_sysctl_max_recv_allocation;
354extern unsigned int rds_ib_sysctl_flow_control; 371extern unsigned int rds_ib_sysctl_flow_control;
355extern ctl_table rds_ib_sysctl_table[];
356
357/*
358 * Helper functions for getting/setting the header and data SGEs in
359 * RDS packets (not RDMA)
360 *
361 * From version 3.1 onwards, header is in front of data in the sge.
362 */
363static inline struct ib_sge *
364rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
365{
366 if (ic->conn->c_version > RDS_PROTOCOL_3_0)
367 return &sge[0];
368 else
369 return &sge[1];
370}
371
372static inline struct ib_sge *
373rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
374{
375 if (ic->conn->c_version > RDS_PROTOCOL_3_0)
376 return &sge[1];
377 else
378 return &sge[0];
379}
380 372
381#endif 373#endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f68832798db2..fd453dd5124b 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -38,6 +38,36 @@
38#include "rds.h" 38#include "rds.h"
39#include "ib.h" 39#include "ib.h"
40 40
41static char *rds_ib_event_type_strings[] = {
42#define RDS_IB_EVENT_STRING(foo) \
43 [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
44 RDS_IB_EVENT_STRING(CQ_ERR),
45 RDS_IB_EVENT_STRING(QP_FATAL),
46 RDS_IB_EVENT_STRING(QP_REQ_ERR),
47 RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
48 RDS_IB_EVENT_STRING(COMM_EST),
49 RDS_IB_EVENT_STRING(SQ_DRAINED),
50 RDS_IB_EVENT_STRING(PATH_MIG),
51 RDS_IB_EVENT_STRING(PATH_MIG_ERR),
52 RDS_IB_EVENT_STRING(DEVICE_FATAL),
53 RDS_IB_EVENT_STRING(PORT_ACTIVE),
54 RDS_IB_EVENT_STRING(PORT_ERR),
55 RDS_IB_EVENT_STRING(LID_CHANGE),
56 RDS_IB_EVENT_STRING(PKEY_CHANGE),
57 RDS_IB_EVENT_STRING(SM_CHANGE),
58 RDS_IB_EVENT_STRING(SRQ_ERR),
59 RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
60 RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
61 RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
62#undef RDS_IB_EVENT_STRING
63};
64
65static char *rds_ib_event_str(enum ib_event_type type)
66{
67 return rds_str_array(rds_ib_event_type_strings,
68 ARRAY_SIZE(rds_ib_event_type_strings), type);
69};
70
41/* 71/*
42 * Set the selected protocol version 72 * Set the selected protocol version
43 */ 73 */
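(Editor's illustration, not part of the patch.) The rds_ib_event_type_strings[] table above is built with designated initializers keyed by the enum value itself, and rds_str_array() falls back to "unknown" for any hole or out-of-range index. A self-contained userspace version of the same pattern, with made-up names:

#include <stdio.h>

#define STR(x) #x
/* Initializer keyed by the enum value, so gaps in the enum simply leave
 * NULL slots that the lookup reports as "unknown". */
#define EVENT_STRING(e) [EV_##e] = STR(EV_##e)

enum example_event { EV_CQ_ERR, EV_QP_FATAL, EV_PORT_ACTIVE = 5 };

static const char *event_strings[] = {
        EVENT_STRING(CQ_ERR),
        EVENT_STRING(QP_FATAL),
        EVENT_STRING(PORT_ACTIVE),
};

static const char *event_str(size_t idx)
{
        if (idx < sizeof(event_strings) / sizeof(event_strings[0]) &&
            event_strings[idx])
                return event_strings[idx];
        return "unknown";
}

int main(void)
{
        /* prints: EV_CQ_ERR unknown EV_PORT_ACTIVE */
        printf("%s %s %s\n", event_str(0), event_str(3), event_str(5));
        return 0;
}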
@@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
95{ 125{
96 const struct rds_ib_connect_private *dp = NULL; 126 const struct rds_ib_connect_private *dp = NULL;
97 struct rds_ib_connection *ic = conn->c_transport_data; 127 struct rds_ib_connection *ic = conn->c_transport_data;
98 struct rds_ib_device *rds_ibdev;
99 struct ib_qp_attr qp_attr; 128 struct ib_qp_attr qp_attr;
100 int err; 129 int err;
101 130
@@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
111 } 140 }
112 } 141 }
113 142
114 printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", 143 if (conn->c_version < RDS_PROTOCOL(3,1)) {
115 &conn->c_faddr, 144 printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
116 RDS_PROTOCOL_MAJOR(conn->c_version), 145 " no longer supported\n",
117 RDS_PROTOCOL_MINOR(conn->c_version), 146 &conn->c_faddr,
118 ic->i_flowctl ? ", flow control" : ""); 147 RDS_PROTOCOL_MAJOR(conn->c_version),
148 RDS_PROTOCOL_MINOR(conn->c_version));
149 rds_conn_destroy(conn);
150 return;
151 } else {
152 printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
153 &conn->c_faddr,
154 RDS_PROTOCOL_MAJOR(conn->c_version),
155 RDS_PROTOCOL_MINOR(conn->c_version),
156 ic->i_flowctl ? ", flow control" : "");
157 }
119 158
120 /* 159 /*
121 * Init rings and fill recv. this needs to wait until protocol negotiation 160 * Init rings and fill recv. this needs to wait until protocol negotiation
@@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
125 rds_ib_recv_init_ring(ic); 164 rds_ib_recv_init_ring(ic);
126 /* Post receive buffers - as a side effect, this will update 165 /* Post receive buffers - as a side effect, this will update
127 * the posted credit count. */ 166 * the posted credit count. */
128 rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); 167 rds_ib_recv_refill(conn, 1);
129 168
130 /* Tune RNR behavior */ 169 /* Tune RNR behavior */
131 rds_ib_tune_rnr(ic, &qp_attr); 170 rds_ib_tune_rnr(ic, &qp_attr);
@@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
135 if (err) 174 if (err)
136 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); 175 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
137 176
138 /* update ib_device with this local ipaddr & conn */ 177 /* update ib_device with this local ipaddr */
139 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); 178 err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
140 err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
141 if (err) 179 if (err)
142 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); 180 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
143 rds_ib_add_conn(rds_ibdev, conn); 181 err);
144 182
145 /* If the peer gave us the last packet it saw, process this as if 183 /* If the peer gave us the last packet it saw, process this as if
146 * we had received a regular ACK. */ 184 * we had received a regular ACK. */
@@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
153static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, 191static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
154 struct rdma_conn_param *conn_param, 192 struct rdma_conn_param *conn_param,
155 struct rds_ib_connect_private *dp, 193 struct rds_ib_connect_private *dp,
156 u32 protocol_version) 194 u32 protocol_version,
195 u32 max_responder_resources,
196 u32 max_initiator_depth)
157{ 197{
198 struct rds_ib_connection *ic = conn->c_transport_data;
199 struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
200
158 memset(conn_param, 0, sizeof(struct rdma_conn_param)); 201 memset(conn_param, 0, sizeof(struct rdma_conn_param));
159 /* XXX tune these? */ 202
160 conn_param->responder_resources = 1; 203 conn_param->responder_resources =
161 conn_param->initiator_depth = 1; 204 min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
205 conn_param->initiator_depth =
206 min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
162 conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); 207 conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
163 conn_param->rnr_retry_count = 7; 208 conn_param->rnr_retry_count = 7;
164 209
165 if (dp) { 210 if (dp) {
166 struct rds_ib_connection *ic = conn->c_transport_data;
167
168 memset(dp, 0, sizeof(*dp)); 211 memset(dp, 0, sizeof(*dp));
169 dp->dp_saddr = conn->c_laddr; 212 dp->dp_saddr = conn->c_laddr;
170 dp->dp_daddr = conn->c_faddr; 213 dp->dp_daddr = conn->c_faddr;
@@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
189 232
190static void rds_ib_cq_event_handler(struct ib_event *event, void *data) 233static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
191{ 234{
192 rdsdebug("event %u data %p\n", event->event, data); 235 rdsdebug("event %u (%s) data %p\n",
236 event->event, rds_ib_event_str(event->event), data);
193} 237}
194 238
195static void rds_ib_qp_event_handler(struct ib_event *event, void *data) 239static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
@@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
197 struct rds_connection *conn = data; 241 struct rds_connection *conn = data;
198 struct rds_ib_connection *ic = conn->c_transport_data; 242 struct rds_ib_connection *ic = conn->c_transport_data;
199 243
200 rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); 244 rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
245 rds_ib_event_str(event->event));
201 246
202 switch (event->event) { 247 switch (event->event) {
203 case IB_EVENT_COMM_EST: 248 case IB_EVENT_COMM_EST:
204 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); 249 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
205 break; 250 break;
206 default: 251 default:
207 rdsdebug("Fatal QP Event %u " 252 rdsdebug("Fatal QP Event %u (%s) "
208 "- connection %pI4->%pI4, reconnecting\n", 253 "- connection %pI4->%pI4, reconnecting\n",
209 event->event, &conn->c_laddr, &conn->c_faddr); 254 event->event, rds_ib_event_str(event->event),
255 &conn->c_laddr, &conn->c_faddr);
210 rds_conn_drop(conn); 256 rds_conn_drop(conn);
211 break; 257 break;
212 } 258 }
@@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
224 struct rds_ib_device *rds_ibdev; 270 struct rds_ib_device *rds_ibdev;
225 int ret; 271 int ret;
226 272
227 /* rds_ib_add_one creates a rds_ib_device object per IB device, 273 /*
228 * and allocates a protection domain, memory range and FMR pool 274 * It's normal to see a null device if an incoming connection races
229 * for each. If that fails for any reason, it will not register 275 * with device removal, so we don't print a warning.
230 * the rds_ibdev at all.
231 */ 276 */
232 rds_ibdev = ib_get_client_data(dev, &rds_ib_client); 277 rds_ibdev = rds_ib_get_client_data(dev);
233 if (rds_ibdev == NULL) { 278 if (!rds_ibdev)
234 if (printk_ratelimit())
235 printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
236 dev->name);
237 return -EOPNOTSUPP; 279 return -EOPNOTSUPP;
238 } 280
281 /* add the conn now so that connection establishment has the dev */
282 rds_ib_add_conn(rds_ibdev, conn);
239 283
240 if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) 284 if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
241 rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); 285 rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
@@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
306 ic->i_send_ring.w_nr * 350 ic->i_send_ring.w_nr *
307 sizeof(struct rds_header), 351 sizeof(struct rds_header),
308 &ic->i_send_hdrs_dma, GFP_KERNEL); 352 &ic->i_send_hdrs_dma, GFP_KERNEL);
309 if (ic->i_send_hdrs == NULL) { 353 if (!ic->i_send_hdrs) {
310 ret = -ENOMEM; 354 ret = -ENOMEM;
311 rdsdebug("ib_dma_alloc_coherent send failed\n"); 355 rdsdebug("ib_dma_alloc_coherent send failed\n");
312 goto out; 356 goto out;
@@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
316 ic->i_recv_ring.w_nr * 360 ic->i_recv_ring.w_nr *
317 sizeof(struct rds_header), 361 sizeof(struct rds_header),
318 &ic->i_recv_hdrs_dma, GFP_KERNEL); 362 &ic->i_recv_hdrs_dma, GFP_KERNEL);
319 if (ic->i_recv_hdrs == NULL) { 363 if (!ic->i_recv_hdrs) {
320 ret = -ENOMEM; 364 ret = -ENOMEM;
321 rdsdebug("ib_dma_alloc_coherent recv failed\n"); 365 rdsdebug("ib_dma_alloc_coherent recv failed\n");
322 goto out; 366 goto out;
@@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
324 368
325 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), 369 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
326 &ic->i_ack_dma, GFP_KERNEL); 370 &ic->i_ack_dma, GFP_KERNEL);
327 if (ic->i_ack == NULL) { 371 if (!ic->i_ack) {
328 ret = -ENOMEM; 372 ret = -ENOMEM;
329 rdsdebug("ib_dma_alloc_coherent ack failed\n"); 373 rdsdebug("ib_dma_alloc_coherent ack failed\n");
330 goto out; 374 goto out;
331 } 375 }
332 376
333 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); 377 ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
334 if (ic->i_sends == NULL) { 378 ibdev_to_node(dev));
379 if (!ic->i_sends) {
335 ret = -ENOMEM; 380 ret = -ENOMEM;
336 rdsdebug("send allocation failed\n"); 381 rdsdebug("send allocation failed\n");
337 goto out; 382 goto out;
338 } 383 }
339 memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); 384 memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
340 385
341 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); 386 ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
342 if (ic->i_recvs == NULL) { 387 ibdev_to_node(dev));
388 if (!ic->i_recvs) {
343 ret = -ENOMEM; 389 ret = -ENOMEM;
344 rdsdebug("recv allocation failed\n"); 390 rdsdebug("recv allocation failed\n");
345 goto out; 391 goto out;
@@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
352 ic->i_send_cq, ic->i_recv_cq); 398 ic->i_send_cq, ic->i_recv_cq);
353 399
354out: 400out:
401 rds_ib_dev_put(rds_ibdev);
355 return ret; 402 return ret;
356} 403}
357 404
@@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
409 struct rds_ib_connection *ic = NULL; 456 struct rds_ib_connection *ic = NULL;
410 struct rdma_conn_param conn_param; 457 struct rdma_conn_param conn_param;
411 u32 version; 458 u32 version;
412 int err, destroy = 1; 459 int err = 1, destroy = 1;
413 460
414 /* Check whether the remote protocol version matches ours. */ 461 /* Check whether the remote protocol version matches ours. */
415 version = rds_ib_protocol_compatible(event); 462 version = rds_ib_protocol_compatible(event);
@@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
448 /* Wait and see - our connect may still be succeeding */ 495 /* Wait and see - our connect may still be succeeding */
449 rds_ib_stats_inc(s_ib_connect_raced); 496 rds_ib_stats_inc(s_ib_connect_raced);
450 } 497 }
451 mutex_unlock(&conn->c_cm_lock);
452 goto out; 498 goto out;
453 } 499 }
454 500
@@ -475,24 +521,23 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
475 err = rds_ib_setup_qp(conn); 521 err = rds_ib_setup_qp(conn);
476 if (err) { 522 if (err) {
477 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); 523 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
478 mutex_unlock(&conn->c_cm_lock);
479 goto out; 524 goto out;
480 } 525 }
481 526
482 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); 527 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
528 event->param.conn.responder_resources,
529 event->param.conn.initiator_depth);
483 530
484 /* rdma_accept() calls rdma_reject() internally if it fails */ 531 /* rdma_accept() calls rdma_reject() internally if it fails */
485 err = rdma_accept(cm_id, &conn_param); 532 err = rdma_accept(cm_id, &conn_param);
486 mutex_unlock(&conn->c_cm_lock); 533 if (err)
487 if (err) {
488 rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); 534 rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
489 goto out;
490 }
491
492 return 0;
493 535
494out: 536out:
495 rdma_reject(cm_id, NULL, 0); 537 if (conn)
538 mutex_unlock(&conn->c_cm_lock);
539 if (err)
540 rdma_reject(cm_id, NULL, 0);
496 return destroy; 541 return destroy;
497} 542}
498 543
@@ -516,8 +561,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
516 goto out; 561 goto out;
517 } 562 }
518 563
519 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); 564 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
520 565 UINT_MAX, UINT_MAX);
521 ret = rdma_connect(cm_id, &conn_param); 566 ret = rdma_connect(cm_id, &conn_param);
522 if (ret) 567 if (ret)
523 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); 568 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -542,7 +587,7 @@ int rds_ib_conn_connect(struct rds_connection *conn)
 542 /* XXX I wonder what effect the port space has */ 587 /* XXX I wonder what effect the port space has */
543 /* delegate cm event handler to rdma_transport */ 588 /* delegate cm event handler to rdma_transport */
544 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, 589 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
545 RDMA_PS_TCP); 590 RDMA_PS_TCP, IB_QPT_RC);
546 if (IS_ERR(ic->i_cm_id)) { 591 if (IS_ERR(ic->i_cm_id)) {
547 ret = PTR_ERR(ic->i_cm_id); 592 ret = PTR_ERR(ic->i_cm_id);
548 ic->i_cm_id = NULL; 593 ic->i_cm_id = NULL;
@@ -601,9 +646,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
601 ic->i_cm_id, err); 646 ic->i_cm_id, err);
602 } 647 }
603 648
649 /*
650 * We want to wait for tx and rx completion to finish
651 * before we tear down the connection, but we have to be
652 * careful not to get stuck waiting on a send ring that
653 * only has unsignaled sends in it. We've shutdown new
654 * sends before getting here so by waiting for signaled
655 * sends to complete we're ensured that there will be no
656 * more tx processing.
657 */
604 wait_event(rds_ib_ring_empty_wait, 658 wait_event(rds_ib_ring_empty_wait,
605 rds_ib_ring_empty(&ic->i_send_ring) && 659 rds_ib_ring_empty(&ic->i_recv_ring) &&
606 rds_ib_ring_empty(&ic->i_recv_ring)); 660 (atomic_read(&ic->i_signaled_sends) == 0));
661 tasklet_kill(&ic->i_recv_tasklet);
607 662
608 if (ic->i_send_hdrs) 663 if (ic->i_send_hdrs)
609 ib_dma_free_coherent(dev, 664 ib_dma_free_coherent(dev,
@@ -654,9 +709,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
654 BUG_ON(ic->rds_ibdev); 709 BUG_ON(ic->rds_ibdev);
655 710
656 /* Clear pending transmit */ 711 /* Clear pending transmit */
657 if (ic->i_rm) { 712 if (ic->i_data_op) {
658 rds_message_put(ic->i_rm); 713 struct rds_message *rm;
659 ic->i_rm = NULL; 714
715 rm = container_of(ic->i_data_op, struct rds_message, data);
716 rds_message_put(rm);
717 ic->i_data_op = NULL;
660 } 718 }
661 719
662 /* Clear the ACK state */ 720 /* Clear the ACK state */
@@ -690,12 +748,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
690{ 748{
691 struct rds_ib_connection *ic; 749 struct rds_ib_connection *ic;
692 unsigned long flags; 750 unsigned long flags;
751 int ret;
693 752
694 /* XXX too lazy? */ 753 /* XXX too lazy? */
695 ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); 754 ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
696 if (ic == NULL) 755 if (!ic)
697 return -ENOMEM; 756 return -ENOMEM;
698 757
758 ret = rds_ib_recv_alloc_caches(ic);
759 if (ret) {
760 kfree(ic);
761 return ret;
762 }
763
699 INIT_LIST_HEAD(&ic->ib_node); 764 INIT_LIST_HEAD(&ic->ib_node);
700 tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, 765 tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
701 (unsigned long) ic); 766 (unsigned long) ic);
@@ -703,6 +768,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
703#ifndef KERNEL_HAS_ATOMIC64 768#ifndef KERNEL_HAS_ATOMIC64
704 spin_lock_init(&ic->i_ack_lock); 769 spin_lock_init(&ic->i_ack_lock);
705#endif 770#endif
771 atomic_set(&ic->i_signaled_sends, 0);
706 772
707 /* 773 /*
708 * rds_ib_conn_shutdown() waits for these to be emptied so they 774 * rds_ib_conn_shutdown() waits for these to be emptied so they
@@ -744,6 +810,8 @@ void rds_ib_conn_free(void *arg)
744 list_del(&ic->ib_node); 810 list_del(&ic->ib_node);
745 spin_unlock_irq(lock_ptr); 811 spin_unlock_irq(lock_ptr);
746 812
813 rds_ib_recv_free_caches(ic);
814
747 kfree(ic); 815 kfree(ic);
748} 816}
749 817
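(Editor's illustration, not part of the patch.) The shutdown path above now waits for ic->i_signaled_sends to reach zero instead of waiting for the send ring to drain, so teardown cannot hang behind work requests that never generate a completion. A rough userspace analogue of that count-and-wait pattern; pthread primitives stand in for the kernel wait queue, and this is only a sketch of the idea, not the RDS send path:

#include <pthread.h>
#include <stdatomic.h>

/* Posting a signaled send bumps the counter; its completion drops it and
 * wakes the waiter; shutdown blocks until the count reaches zero. */
static atomic_int signaled_sends;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;

void post_signaled_send(void)
{
        atomic_fetch_add(&signaled_sends, 1);
        /* ... post the work request with a signaled completion here ... */
}

void send_completion(void)
{
        if (atomic_fetch_sub(&signaled_sends, 1) == 1) {
                /* last outstanding signaled send: wake any waiter */
                pthread_mutex_lock(&lock);
                pthread_cond_broadcast(&drained);
                pthread_mutex_unlock(&lock);
        }
}

void wait_for_signaled_sends(void)
{
        pthread_mutex_lock(&lock);
        while (atomic_load(&signaled_sends) != 0)
                pthread_cond_wait(&drained, &lock);
        pthread_mutex_unlock(&lock);
}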
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index a54cd63f9e35..819c35a0d9cb 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -32,11 +32,14 @@
32 */ 32 */
33#include <linux/kernel.h> 33#include <linux/kernel.h>
34#include <linux/slab.h> 34#include <linux/slab.h>
35#include <linux/rculist.h>
35 36
36#include "rds.h" 37#include "rds.h"
37#include "rdma.h"
38#include "ib.h" 38#include "ib.h"
39#include "xlist.h"
39 40
41static DEFINE_PER_CPU(unsigned long, clean_list_grace);
42#define CLEAN_LIST_BUSY_BIT 0
40 43
41/* 44/*
42 * This is stored as mr->r_trans_private. 45 * This is stored as mr->r_trans_private.
@@ -45,7 +48,11 @@ struct rds_ib_mr {
45 struct rds_ib_device *device; 48 struct rds_ib_device *device;
46 struct rds_ib_mr_pool *pool; 49 struct rds_ib_mr_pool *pool;
47 struct ib_fmr *fmr; 50 struct ib_fmr *fmr;
48 struct list_head list; 51
52 struct xlist_head xlist;
53
54 /* unmap_list is for freeing */
55 struct list_head unmap_list;
49 unsigned int remap_count; 56 unsigned int remap_count;
50 57
51 struct scatterlist *sg; 58 struct scatterlist *sg;
@@ -59,14 +66,16 @@ struct rds_ib_mr {
59 */ 66 */
60struct rds_ib_mr_pool { 67struct rds_ib_mr_pool {
61 struct mutex flush_lock; /* serialize fmr invalidate */ 68 struct mutex flush_lock; /* serialize fmr invalidate */
62 struct work_struct flush_worker; /* flush worker */ 69 struct delayed_work flush_worker; /* flush worker */
63 70
64 spinlock_t list_lock; /* protect variables below */
65 atomic_t item_count; /* total # of MRs */ 71 atomic_t item_count; /* total # of MRs */
66 atomic_t dirty_count; /* # dirty of MRs */ 72 atomic_t dirty_count; /* # dirty of MRs */
67 struct list_head drop_list; /* MRs that have reached their max_maps limit */ 73
68 struct list_head free_list; /* unused MRs */ 74 struct xlist_head drop_list; /* MRs that have reached their max_maps limit */
69 struct list_head clean_list; /* unused & unamapped MRs */ 75 struct xlist_head free_list; /* unused MRs */
76 struct xlist_head clean_list; /* global unused & unamapped MRs */
77 wait_queue_head_t flush_wait;
78
70 atomic_t free_pinned; /* memory pinned by free MRs */ 79 atomic_t free_pinned; /* memory pinned by free MRs */
71 unsigned long max_items; 80 unsigned long max_items;
72 unsigned long max_items_soft; 81 unsigned long max_items_soft;
@@ -74,7 +83,7 @@ struct rds_ib_mr_pool {
74 struct ib_fmr_attr fmr_attr; 83 struct ib_fmr_attr fmr_attr;
75}; 84};
76 85
77static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all); 86static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
78static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); 87static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
79static void rds_ib_mr_pool_flush_worker(struct work_struct *work); 88static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
80 89
@@ -83,16 +92,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
83 struct rds_ib_device *rds_ibdev; 92 struct rds_ib_device *rds_ibdev;
84 struct rds_ib_ipaddr *i_ipaddr; 93 struct rds_ib_ipaddr *i_ipaddr;
85 94
86 list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { 95 rcu_read_lock();
87 spin_lock_irq(&rds_ibdev->spinlock); 96 list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
88 list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) { 97 list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
89 if (i_ipaddr->ipaddr == ipaddr) { 98 if (i_ipaddr->ipaddr == ipaddr) {
90 spin_unlock_irq(&rds_ibdev->spinlock); 99 atomic_inc(&rds_ibdev->refcount);
100 rcu_read_unlock();
91 return rds_ibdev; 101 return rds_ibdev;
92 } 102 }
93 } 103 }
94 spin_unlock_irq(&rds_ibdev->spinlock);
95 } 104 }
105 rcu_read_unlock();
96 106
97 return NULL; 107 return NULL;
98} 108}
@@ -108,7 +118,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
108 i_ipaddr->ipaddr = ipaddr; 118 i_ipaddr->ipaddr = ipaddr;
109 119
110 spin_lock_irq(&rds_ibdev->spinlock); 120 spin_lock_irq(&rds_ibdev->spinlock);
111 list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list); 121 list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
112 spin_unlock_irq(&rds_ibdev->spinlock); 122 spin_unlock_irq(&rds_ibdev->spinlock);
113 123
114 return 0; 124 return 0;
@@ -116,17 +126,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
116 126
117static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) 127static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
118{ 128{
119 struct rds_ib_ipaddr *i_ipaddr, *next; 129 struct rds_ib_ipaddr *i_ipaddr;
130 struct rds_ib_ipaddr *to_free = NULL;
131
120 132
121 spin_lock_irq(&rds_ibdev->spinlock); 133 spin_lock_irq(&rds_ibdev->spinlock);
122 list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) { 134 list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
123 if (i_ipaddr->ipaddr == ipaddr) { 135 if (i_ipaddr->ipaddr == ipaddr) {
124 list_del(&i_ipaddr->list); 136 list_del_rcu(&i_ipaddr->list);
125 kfree(i_ipaddr); 137 to_free = i_ipaddr;
126 break; 138 break;
127 } 139 }
128 } 140 }
129 spin_unlock_irq(&rds_ibdev->spinlock); 141 spin_unlock_irq(&rds_ibdev->spinlock);
142
143 if (to_free) {
144 synchronize_rcu();
145 kfree(to_free);
146 }
130} 147}
131 148
132int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) 149int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
@@ -134,8 +151,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
134 struct rds_ib_device *rds_ibdev_old; 151 struct rds_ib_device *rds_ibdev_old;
135 152
136 rds_ibdev_old = rds_ib_get_device(ipaddr); 153 rds_ibdev_old = rds_ib_get_device(ipaddr);
137 if (rds_ibdev_old) 154 if (rds_ibdev_old) {
138 rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); 155 rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
156 rds_ib_dev_put(rds_ibdev_old);
157 }
139 158
140 return rds_ib_add_ipaddr(rds_ibdev, ipaddr); 159 return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
141} 160}
@@ -150,12 +169,13 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con
150 BUG_ON(list_empty(&ic->ib_node)); 169 BUG_ON(list_empty(&ic->ib_node));
151 list_del(&ic->ib_node); 170 list_del(&ic->ib_node);
152 171
153 spin_lock_irq(&rds_ibdev->spinlock); 172 spin_lock(&rds_ibdev->spinlock);
154 list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); 173 list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
155 spin_unlock_irq(&rds_ibdev->spinlock); 174 spin_unlock(&rds_ibdev->spinlock);
156 spin_unlock_irq(&ib_nodev_conns_lock); 175 spin_unlock_irq(&ib_nodev_conns_lock);
157 176
158 ic->rds_ibdev = rds_ibdev; 177 ic->rds_ibdev = rds_ibdev;
178 atomic_inc(&rds_ibdev->refcount);
159} 179}
160 180
161void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) 181void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
@@ -175,18 +195,18 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *
175 spin_unlock(&ib_nodev_conns_lock); 195 spin_unlock(&ib_nodev_conns_lock);
176 196
177 ic->rds_ibdev = NULL; 197 ic->rds_ibdev = NULL;
198 rds_ib_dev_put(rds_ibdev);
178} 199}
179 200
180void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock) 201void rds_ib_destroy_nodev_conns(void)
181{ 202{
182 struct rds_ib_connection *ic, *_ic; 203 struct rds_ib_connection *ic, *_ic;
183 LIST_HEAD(tmp_list); 204 LIST_HEAD(tmp_list);
184 205
185 /* avoid calling conn_destroy with irqs off */ 206 /* avoid calling conn_destroy with irqs off */
186 spin_lock_irq(list_lock); 207 spin_lock_irq(&ib_nodev_conns_lock);
187 list_splice(list, &tmp_list); 208 list_splice(&ib_nodev_conns, &tmp_list);
188 INIT_LIST_HEAD(list); 209 spin_unlock_irq(&ib_nodev_conns_lock);
189 spin_unlock_irq(list_lock);
190 210
191 list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) 211 list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
192 rds_conn_destroy(ic->conn); 212 rds_conn_destroy(ic->conn);
@@ -200,12 +220,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
200 if (!pool) 220 if (!pool)
201 return ERR_PTR(-ENOMEM); 221 return ERR_PTR(-ENOMEM);
202 222
203 INIT_LIST_HEAD(&pool->free_list); 223 INIT_XLIST_HEAD(&pool->free_list);
204 INIT_LIST_HEAD(&pool->drop_list); 224 INIT_XLIST_HEAD(&pool->drop_list);
205 INIT_LIST_HEAD(&pool->clean_list); 225 INIT_XLIST_HEAD(&pool->clean_list);
206 mutex_init(&pool->flush_lock); 226 mutex_init(&pool->flush_lock);
207 spin_lock_init(&pool->list_lock); 227 init_waitqueue_head(&pool->flush_wait);
208 INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); 228 INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
209 229
210 pool->fmr_attr.max_pages = fmr_message_size; 230 pool->fmr_attr.max_pages = fmr_message_size;
211 pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; 231 pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
@@ -233,34 +253,60 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
233 253
234void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) 254void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
235{ 255{
236 flush_workqueue(rds_wq); 256 cancel_delayed_work_sync(&pool->flush_worker);
237 rds_ib_flush_mr_pool(pool, 1); 257 rds_ib_flush_mr_pool(pool, 1, NULL);
238 WARN_ON(atomic_read(&pool->item_count)); 258 WARN_ON(atomic_read(&pool->item_count));
239 WARN_ON(atomic_read(&pool->free_pinned)); 259 WARN_ON(atomic_read(&pool->free_pinned));
240 kfree(pool); 260 kfree(pool);
241} 261}
242 262
263static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
264 struct rds_ib_mr **ibmr_ret)
265{
266 struct xlist_head *ibmr_xl;
267 ibmr_xl = xlist_del_head_fast(xl);
268 *ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
269}
270
243static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) 271static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
244{ 272{
245 struct rds_ib_mr *ibmr = NULL; 273 struct rds_ib_mr *ibmr = NULL;
246 unsigned long flags; 274 struct xlist_head *ret;
275 unsigned long *flag;
247 276
248 spin_lock_irqsave(&pool->list_lock, flags); 277 preempt_disable();
249 if (!list_empty(&pool->clean_list)) { 278 flag = &__get_cpu_var(clean_list_grace);
250 ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list); 279 set_bit(CLEAN_LIST_BUSY_BIT, flag);
251 list_del_init(&ibmr->list); 280 ret = xlist_del_head(&pool->clean_list);
252 } 281 if (ret)
253 spin_unlock_irqrestore(&pool->list_lock, flags); 282 ibmr = list_entry(ret, struct rds_ib_mr, xlist);
254 283
284 clear_bit(CLEAN_LIST_BUSY_BIT, flag);
285 preempt_enable();
255 return ibmr; 286 return ibmr;
256} 287}
257 288
289static inline void wait_clean_list_grace(void)
290{
291 int cpu;
292 unsigned long *flag;
293
294 for_each_online_cpu(cpu) {
295 flag = &per_cpu(clean_list_grace, cpu);
296 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
297 cpu_relax();
298 }
299}
300
258static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) 301static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
259{ 302{
260 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; 303 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
261 struct rds_ib_mr *ibmr = NULL; 304 struct rds_ib_mr *ibmr = NULL;
262 int err = 0, iter = 0; 305 int err = 0, iter = 0;
263 306
307 if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
308 schedule_delayed_work(&pool->flush_worker, 10);
309
264 while (1) { 310 while (1) {
265 ibmr = rds_ib_reuse_fmr(pool); 311 ibmr = rds_ib_reuse_fmr(pool);
266 if (ibmr) 312 if (ibmr)
@@ -287,19 +333,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
287 333
288 /* We do have some empty MRs. Flush them out. */ 334 /* We do have some empty MRs. Flush them out. */
289 rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); 335 rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
290 rds_ib_flush_mr_pool(pool, 0); 336 rds_ib_flush_mr_pool(pool, 0, &ibmr);
337 if (ibmr)
338 return ibmr;
291 } 339 }
292 340
293 ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); 341 ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
294 if (!ibmr) { 342 if (!ibmr) {
295 err = -ENOMEM; 343 err = -ENOMEM;
296 goto out_no_cigar; 344 goto out_no_cigar;
297 } 345 }
298 346
347 memset(ibmr, 0, sizeof(*ibmr));
348
299 ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, 349 ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
300 (IB_ACCESS_LOCAL_WRITE | 350 (IB_ACCESS_LOCAL_WRITE |
301 IB_ACCESS_REMOTE_READ | 351 IB_ACCESS_REMOTE_READ |
302 IB_ACCESS_REMOTE_WRITE), 352 IB_ACCESS_REMOTE_WRITE|
353 IB_ACCESS_REMOTE_ATOMIC),
303 &pool->fmr_attr); 354 &pool->fmr_attr);
304 if (IS_ERR(ibmr->fmr)) { 355 if (IS_ERR(ibmr->fmr)) {
305 err = PTR_ERR(ibmr->fmr); 356 err = PTR_ERR(ibmr->fmr);
@@ -367,7 +418,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
367 if (page_cnt > fmr_message_size) 418 if (page_cnt > fmr_message_size)
368 return -EINVAL; 419 return -EINVAL;
369 420
370 dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); 421 dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
422 rdsibdev_to_node(rds_ibdev));
371 if (!dma_pages) 423 if (!dma_pages)
372 return -ENOMEM; 424 return -ENOMEM;
373 425
@@ -441,7 +493,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
441 493
442 /* FIXME we need a way to tell a r/w MR 494 /* FIXME we need a way to tell a r/w MR
443 * from a r/o MR */ 495 * from a r/o MR */
444 BUG_ON(in_interrupt()); 496 BUG_ON(irqs_disabled());
445 set_page_dirty(page); 497 set_page_dirty(page);
446 put_page(page); 498 put_page(page);
447 } 499 }
@@ -477,33 +529,109 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr
477} 529}
478 530
479/* 531/*
532 * given an xlist of mrs, put them all into the list_head for more processing
533 */
534static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
535{
536 struct rds_ib_mr *ibmr;
537 struct xlist_head splice;
538 struct xlist_head *cur;
539 struct xlist_head *next;
540
541 splice.next = NULL;
542 xlist_splice(xlist, &splice);
543 cur = splice.next;
544 while (cur) {
545 next = cur->next;
546 ibmr = list_entry(cur, struct rds_ib_mr, xlist);
547 list_add_tail(&ibmr->unmap_list, list);
548 cur = next;
549 }
550}
551
552/*
553 * this takes a list head of mrs and turns it into an xlist of clusters.
554 * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for
555 * reuse.
556 */
557static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
558 struct list_head *list, struct xlist_head *xlist,
559 struct xlist_head **tail_ret)
560{
561 struct rds_ib_mr *ibmr;
562 struct xlist_head *cur_mr = xlist;
563 struct xlist_head *tail_mr = NULL;
564
565 list_for_each_entry(ibmr, list, unmap_list) {
566 tail_mr = &ibmr->xlist;
567 tail_mr->next = NULL;
568 cur_mr->next = tail_mr;
569 cur_mr = tail_mr;
570 }
571 *tail_ret = tail_mr;
572}
573
574/*
480 * Flush our pool of MRs. 575 * Flush our pool of MRs.
481 * At a minimum, all currently unused MRs are unmapped. 576 * At a minimum, all currently unused MRs are unmapped.
482 * If the number of MRs allocated exceeds the limit, we also try 577 * If the number of MRs allocated exceeds the limit, we also try
483 * to free as many MRs as needed to get back to this limit. 578 * to free as many MRs as needed to get back to this limit.
484 */ 579 */
485static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) 580static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
581 int free_all, struct rds_ib_mr **ibmr_ret)
486{ 582{
487 struct rds_ib_mr *ibmr, *next; 583 struct rds_ib_mr *ibmr, *next;
584 struct xlist_head clean_xlist;
585 struct xlist_head *clean_tail;
488 LIST_HEAD(unmap_list); 586 LIST_HEAD(unmap_list);
489 LIST_HEAD(fmr_list); 587 LIST_HEAD(fmr_list);
490 unsigned long unpinned = 0; 588 unsigned long unpinned = 0;
491 unsigned long flags;
492 unsigned int nfreed = 0, ncleaned = 0, free_goal; 589 unsigned int nfreed = 0, ncleaned = 0, free_goal;
493 int ret = 0; 590 int ret = 0;
494 591
495 rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); 592 rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
496 593
497 mutex_lock(&pool->flush_lock); 594 if (ibmr_ret) {
595 DEFINE_WAIT(wait);
596 while(!mutex_trylock(&pool->flush_lock)) {
597 ibmr = rds_ib_reuse_fmr(pool);
598 if (ibmr) {
599 *ibmr_ret = ibmr;
600 finish_wait(&pool->flush_wait, &wait);
601 goto out_nolock;
602 }
603
604 prepare_to_wait(&pool->flush_wait, &wait,
605 TASK_UNINTERRUPTIBLE);
606 if (xlist_empty(&pool->clean_list))
607 schedule();
608
609 ibmr = rds_ib_reuse_fmr(pool);
610 if (ibmr) {
611 *ibmr_ret = ibmr;
612 finish_wait(&pool->flush_wait, &wait);
613 goto out_nolock;
614 }
615 }
616 finish_wait(&pool->flush_wait, &wait);
617 } else
618 mutex_lock(&pool->flush_lock);
619
620 if (ibmr_ret) {
621 ibmr = rds_ib_reuse_fmr(pool);
622 if (ibmr) {
623 *ibmr_ret = ibmr;
624 goto out;
625 }
626 }
498 627
499 spin_lock_irqsave(&pool->list_lock, flags);
500 /* Get the list of all MRs to be dropped. Ordering matters - 628 /* Get the list of all MRs to be dropped. Ordering matters -
501 * we want to put drop_list ahead of free_list. */ 629 * we want to put drop_list ahead of free_list.
502 list_splice_init(&pool->free_list, &unmap_list); 630 */
503 list_splice_init(&pool->drop_list, &unmap_list); 631 xlist_append_to_list(&pool->drop_list, &unmap_list);
632 xlist_append_to_list(&pool->free_list, &unmap_list);
504 if (free_all) 633 if (free_all)
505 list_splice_init(&pool->clean_list, &unmap_list); 634 xlist_append_to_list(&pool->clean_list, &unmap_list);
506 spin_unlock_irqrestore(&pool->list_lock, flags);
507 635
508 free_goal = rds_ib_flush_goal(pool, free_all); 636 free_goal = rds_ib_flush_goal(pool, free_all);
509 637
@@ -511,19 +639,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
511 goto out; 639 goto out;
512 640
513 /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ 641 /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
514 list_for_each_entry(ibmr, &unmap_list, list) 642 list_for_each_entry(ibmr, &unmap_list, unmap_list)
515 list_add(&ibmr->fmr->list, &fmr_list); 643 list_add(&ibmr->fmr->list, &fmr_list);
644
516 ret = ib_unmap_fmr(&fmr_list); 645 ret = ib_unmap_fmr(&fmr_list);
517 if (ret) 646 if (ret)
518 printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); 647 printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
519 648
520 /* Now we can destroy the DMA mapping and unpin any pages */ 649 /* Now we can destroy the DMA mapping and unpin any pages */
521 list_for_each_entry_safe(ibmr, next, &unmap_list, list) { 650 list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
522 unpinned += ibmr->sg_len; 651 unpinned += ibmr->sg_len;
523 __rds_ib_teardown_mr(ibmr); 652 __rds_ib_teardown_mr(ibmr);
524 if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { 653 if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
525 rds_ib_stats_inc(s_ib_rdma_mr_free); 654 rds_ib_stats_inc(s_ib_rdma_mr_free);
526 list_del(&ibmr->list); 655 list_del(&ibmr->unmap_list);
527 ib_dealloc_fmr(ibmr->fmr); 656 ib_dealloc_fmr(ibmr->fmr);
528 kfree(ibmr); 657 kfree(ibmr);
529 nfreed++; 658 nfreed++;
@@ -531,9 +660,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
531 ncleaned++; 660 ncleaned++;
532 } 661 }
533 662
534 spin_lock_irqsave(&pool->list_lock, flags); 663 if (!list_empty(&unmap_list)) {
535 list_splice(&unmap_list, &pool->clean_list); 664 /* we have to make sure that none of the things we're about
536 spin_unlock_irqrestore(&pool->list_lock, flags); 665 * to put on the clean list would race with other cpus trying
666 * to pull items off. The xlist would explode if we managed to
667 * remove something from the clean list and then add it back again
668 * while another CPU was spinning on that same item in xlist_del_head.
669 *
670 * This is pretty unlikely, but just in case wait for an xlist grace period
671 * here before adding anything back into the clean list.
672 */
673 wait_clean_list_grace();
674
675 list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
676 if (ibmr_ret)
677 refill_local(pool, &clean_xlist, ibmr_ret);
678
679 /* refill_local may have emptied our list */
680 if (!xlist_empty(&clean_xlist))
681 xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);
682
683 }
537 684
538 atomic_sub(unpinned, &pool->free_pinned); 685 atomic_sub(unpinned, &pool->free_pinned);
539 atomic_sub(ncleaned, &pool->dirty_count); 686 atomic_sub(ncleaned, &pool->dirty_count);
@@ -541,14 +688,17 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
541 688
542out: 689out:
543 mutex_unlock(&pool->flush_lock); 690 mutex_unlock(&pool->flush_lock);
691 if (waitqueue_active(&pool->flush_wait))
692 wake_up(&pool->flush_wait);
693out_nolock:
544 return ret; 694 return ret;
545} 695}
546 696
547static void rds_ib_mr_pool_flush_worker(struct work_struct *work) 697static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
548{ 698{
549 struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker); 699 struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
550 700
551 rds_ib_flush_mr_pool(pool, 0); 701 rds_ib_flush_mr_pool(pool, 0, NULL);
552} 702}
553 703
554void rds_ib_free_mr(void *trans_private, int invalidate) 704void rds_ib_free_mr(void *trans_private, int invalidate)
@@ -556,47 +706,48 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
556 struct rds_ib_mr *ibmr = trans_private; 706 struct rds_ib_mr *ibmr = trans_private;
557 struct rds_ib_device *rds_ibdev = ibmr->device; 707 struct rds_ib_device *rds_ibdev = ibmr->device;
558 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; 708 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
559 unsigned long flags;
560 709
561 rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); 710 rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
562 711
563 /* Return it to the pool's free list */ 712 /* Return it to the pool's free list */
564 spin_lock_irqsave(&pool->list_lock, flags);
565 if (ibmr->remap_count >= pool->fmr_attr.max_maps) 713 if (ibmr->remap_count >= pool->fmr_attr.max_maps)
566 list_add(&ibmr->list, &pool->drop_list); 714 xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
567 else 715 else
568 list_add(&ibmr->list, &pool->free_list); 716 xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);
569 717
570 atomic_add(ibmr->sg_len, &pool->free_pinned); 718 atomic_add(ibmr->sg_len, &pool->free_pinned);
571 atomic_inc(&pool->dirty_count); 719 atomic_inc(&pool->dirty_count);
572 spin_unlock_irqrestore(&pool->list_lock, flags);
573 720
574 /* If we've pinned too many pages, request a flush */ 721 /* If we've pinned too many pages, request a flush */
575 if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || 722 if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
576 atomic_read(&pool->dirty_count) >= pool->max_items / 10) 723 atomic_read(&pool->dirty_count) >= pool->max_items / 10)
577 queue_work(rds_wq, &pool->flush_worker); 724 schedule_delayed_work(&pool->flush_worker, 10);
578 725
579 if (invalidate) { 726 if (invalidate) {
580 if (likely(!in_interrupt())) { 727 if (likely(!in_interrupt())) {
581 rds_ib_flush_mr_pool(pool, 0); 728 rds_ib_flush_mr_pool(pool, 0, NULL);
582 } else { 729 } else {
583 /* We get here if the user created a MR marked 730 /* We get here if the user created a MR marked
584 * as use_once and invalidate at the same time. */ 731 * as use_once and invalidate at the same time. */
585 queue_work(rds_wq, &pool->flush_worker); 732 schedule_delayed_work(&pool->flush_worker, 10);
586 } 733 }
587 } 734 }
735
736 rds_ib_dev_put(rds_ibdev);
588} 737}
589 738
590void rds_ib_flush_mrs(void) 739void rds_ib_flush_mrs(void)
591{ 740{
592 struct rds_ib_device *rds_ibdev; 741 struct rds_ib_device *rds_ibdev;
593 742
743 down_read(&rds_ib_devices_lock);
594 list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { 744 list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
595 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; 745 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
596 746
597 if (pool) 747 if (pool)
598 rds_ib_flush_mr_pool(pool, 0); 748 rds_ib_flush_mr_pool(pool, 0, NULL);
599 } 749 }
750 up_read(&rds_ib_devices_lock);
600} 751}
601 752
602void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, 753void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
@@ -628,6 +779,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
628 printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); 779 printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
629 780
630 ibmr->device = rds_ibdev; 781 ibmr->device = rds_ibdev;
782 rds_ibdev = NULL;
631 783
632 out: 784 out:
633 if (ret) { 785 if (ret) {
@@ -635,5 +787,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
635 rds_ib_free_mr(ibmr, 0); 787 rds_ib_free_mr(ibmr, 0);
636 ibmr = ERR_PTR(ret); 788 ibmr = ERR_PTR(ret);
637 } 789 }
790 if (rds_ibdev)
791 rds_ib_dev_put(rds_ibdev);
638 return ibmr; 792 return ibmr;
639} 793}
794
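(Editor's illustration, not part of the patch.) The MR pool above now keeps its drop/free/clean lists on xlists (net/rds/xlist.h): lock-free singly linked lists updated with cmpxchg, paired with the per-cpu CLEAN_LIST_BUSY_BIT grace period so the flush path never re-adds an entry another CPU is still popping. A C11 userspace sketch of just the push/pop idea; it is a plain Treiber stack, it deliberately ignores the ABA hazard the grace-period dance exists to cover, and it is not the kernel xlist implementation:

#include <stdatomic.h>
#include <stddef.h>

struct xnode {
        struct xnode *next;
};

struct xhead {
        _Atomic(struct xnode *) first;
};

/* Push a single node onto the front of the list without taking a lock. */
static void xpush(struct xhead *h, struct xnode *n)
{
        struct xnode *old = atomic_load(&h->first);

        do {
                n->next = old;  /* 'old' is refreshed on CAS failure */
        } while (!atomic_compare_exchange_weak(&h->first, &old, n));
}

/* Pop the head, or return NULL if the list is empty. */
static struct xnode *xpop(struct xhead *h)
{
        struct xnode *old = atomic_load(&h->first);

        while (old &&
               !atomic_compare_exchange_weak(&h->first, &old, old->next))
                ;               /* retry with the reloaded 'old' */
        return old;
}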
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index c74e9904a6b2..e29e0ca32f74 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab;
43static struct kmem_cache *rds_ib_frag_slab; 43static struct kmem_cache *rds_ib_frag_slab;
44static atomic_t rds_ib_allocation = ATOMIC_INIT(0); 44static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
45 45
46static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
47{
48 rdsdebug("frag %p page %p\n", frag, frag->f_page);
49 __free_page(frag->f_page);
50 frag->f_page = NULL;
51}
52
53static void rds_ib_frag_free(struct rds_page_frag *frag)
54{
55 rdsdebug("frag %p page %p\n", frag, frag->f_page);
56 BUG_ON(frag->f_page != NULL);
57 kmem_cache_free(rds_ib_frag_slab, frag);
58}
59
60/*
61 * We map a page at a time. Its fragments are posted in order. This
62 * is called in fragment order as the fragments get send completion events.
63 * Only the last frag in the page performs the unmapping.
64 *
65 * It's OK for ring cleanup to call this in whatever order it likes because
66 * DMA is not in flight and so we can unmap while other ring entries still
67 * hold page references in their frags.
68 */
69static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
70 struct rds_ib_recv_work *recv)
71{
72 struct rds_page_frag *frag = recv->r_frag;
73
74 rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
75 if (frag->f_mapped)
76 ib_dma_unmap_page(ic->i_cm_id->device,
77 frag->f_mapped,
78 RDS_FRAG_SIZE, DMA_FROM_DEVICE);
79 frag->f_mapped = 0;
80}
81
82void rds_ib_recv_init_ring(struct rds_ib_connection *ic) 46void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
83{ 47{
84 struct rds_ib_recv_work *recv; 48 struct rds_ib_recv_work *recv;
@@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
95 recv->r_wr.sg_list = recv->r_sge; 59 recv->r_wr.sg_list = recv->r_sge;
96 recv->r_wr.num_sge = RDS_IB_RECV_SGE; 60 recv->r_wr.num_sge = RDS_IB_RECV_SGE;
97 61
98 sge = rds_ib_data_sge(ic, recv->r_sge); 62 sge = &recv->r_sge[0];
63 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
64 sge->length = sizeof(struct rds_header);
65 sge->lkey = ic->i_mr->lkey;
66
67 sge = &recv->r_sge[1];
99 sge->addr = 0; 68 sge->addr = 0;
100 sge->length = RDS_FRAG_SIZE; 69 sge->length = RDS_FRAG_SIZE;
101 sge->lkey = ic->i_mr->lkey; 70 sge->lkey = ic->i_mr->lkey;
71 }
72}
102 73
103 sge = rds_ib_header_sge(ic, recv->r_sge); 74/*
104 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); 75 * The entire 'from' list, including the from element itself, is put on
105 sge->length = sizeof(struct rds_header); 76 * to the tail of the 'to' list.
106 sge->lkey = ic->i_mr->lkey; 77 */
78static void list_splice_entire_tail(struct list_head *from,
79 struct list_head *to)
80{
81 struct list_head *from_last = from->prev;
82
83 list_splice_tail(from_last, to);
84 list_add_tail(from_last, to);
85}
86
87static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
88{
89 struct list_head *tmp;
90
91 tmp = xchg(&cache->xfer, NULL);
92 if (tmp) {
93 if (cache->ready)
94 list_splice_entire_tail(tmp, cache->ready);
95 else
96 cache->ready = tmp;
97 }
98}
99
100static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
101{
102 struct rds_ib_cache_head *head;
103 int cpu;
104
105 cache->percpu = alloc_percpu(struct rds_ib_cache_head);
106 if (!cache->percpu)
107 return -ENOMEM;
108
109 for_each_possible_cpu(cpu) {
110 head = per_cpu_ptr(cache->percpu, cpu);
111 head->first = NULL;
112 head->count = 0;
113 }
114 cache->xfer = NULL;
115 cache->ready = NULL;
116
117 return 0;
118}
119
120int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
121{
122 int ret;
123
124 ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
125 if (!ret) {
126 ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
127 if (ret)
128 free_percpu(ic->i_cache_incs.percpu);
107 } 129 }
130
131 return ret;
132}
133
134static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
135 struct list_head *caller_list)
136{
137 struct rds_ib_cache_head *head;
138 int cpu;
139
140 for_each_possible_cpu(cpu) {
141 head = per_cpu_ptr(cache->percpu, cpu);
142 if (head->first) {
143 list_splice_entire_tail(head->first, caller_list);
144 head->first = NULL;
145 }
146 }
147
148 if (cache->ready) {
149 list_splice_entire_tail(cache->ready, caller_list);
150 cache->ready = NULL;
151 }
152}
153
154void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
155{
156 struct rds_ib_incoming *inc;
157 struct rds_ib_incoming *inc_tmp;
158 struct rds_page_frag *frag;
159 struct rds_page_frag *frag_tmp;
160 LIST_HEAD(list);
161
162 rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
163 rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
164 free_percpu(ic->i_cache_incs.percpu);
165
166 list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
167 list_del(&inc->ii_cache_entry);
168 WARN_ON(!list_empty(&inc->ii_frags));
169 kmem_cache_free(rds_ib_incoming_slab, inc);
170 }
171
172 rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
173 rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
174 free_percpu(ic->i_cache_frags.percpu);
175
176 list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
177 list_del(&frag->f_cache_entry);
178 WARN_ON(!list_empty(&frag->f_item));
179 kmem_cache_free(rds_ib_frag_slab, frag);
180 }
181}
182
183/* fwd decl */
184static void rds_ib_recv_cache_put(struct list_head *new_item,
185 struct rds_ib_refill_cache *cache);
186static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
187
188
189/* Recycle frag and attached recv buffer f_sg */
190static void rds_ib_frag_free(struct rds_ib_connection *ic,
191 struct rds_page_frag *frag)
192{
193 rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
194
195 rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
196}
197
198/* Recycle inc after freeing attached frags */
199void rds_ib_inc_free(struct rds_incoming *inc)
200{
201 struct rds_ib_incoming *ibinc;
202 struct rds_page_frag *frag;
203 struct rds_page_frag *pos;
204 struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
205
206 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
207
208 /* Free attached frags */
209 list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
210 list_del_init(&frag->f_item);
211 rds_ib_frag_free(ic, frag);
212 }
213 BUG_ON(!list_empty(&ibinc->ii_frags));
214
215 rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
216 rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
108} 217}
109 218
110static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, 219static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
@@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
115 recv->r_ibinc = NULL; 224 recv->r_ibinc = NULL;
116 } 225 }
117 if (recv->r_frag) { 226 if (recv->r_frag) {
118 rds_ib_recv_unmap_page(ic, recv); 227 ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
119 if (recv->r_frag->f_page) 228 rds_ib_frag_free(ic, recv->r_frag);
120 rds_ib_frag_drop_page(recv->r_frag);
121 rds_ib_frag_free(recv->r_frag);
122 recv->r_frag = NULL; 229 recv->r_frag = NULL;
123 } 230 }
124} 231}
@@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
129 236
130 for (i = 0; i < ic->i_recv_ring.w_nr; i++) 237 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
131 rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); 238 rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
132
133 if (ic->i_frag.f_page)
134 rds_ib_frag_drop_page(&ic->i_frag);
135} 239}
136 240
137static int rds_ib_recv_refill_one(struct rds_connection *conn, 241static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
138 struct rds_ib_recv_work *recv, 242 gfp_t slab_mask)
139 gfp_t kptr_gfp, gfp_t page_gfp)
140{ 243{
141 struct rds_ib_connection *ic = conn->c_transport_data; 244 struct rds_ib_incoming *ibinc;
142 dma_addr_t dma_addr; 245 struct list_head *cache_item;
143 struct ib_sge *sge; 246 int avail_allocs;
144 int ret = -ENOMEM;
145 247
146 if (recv->r_ibinc == NULL) { 248 cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
147 if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) { 249 if (cache_item) {
250 ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
251 } else {
252 avail_allocs = atomic_add_unless(&rds_ib_allocation,
253 1, rds_ib_sysctl_max_recv_allocation);
254 if (!avail_allocs) {
148 rds_ib_stats_inc(s_ib_rx_alloc_limit); 255 rds_ib_stats_inc(s_ib_rx_alloc_limit);
149 goto out; 256 return NULL;
150 } 257 }
151 recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, 258 ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
152 kptr_gfp); 259 if (!ibinc) {
153 if (recv->r_ibinc == NULL) {
154 atomic_dec(&rds_ib_allocation); 260 atomic_dec(&rds_ib_allocation);
155 goto out; 261 return NULL;
156 } 262 }
157 INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
158 rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
159 } 263 }
264 INIT_LIST_HEAD(&ibinc->ii_frags);
265 rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
160 266
161 if (recv->r_frag == NULL) { 267 return ibinc;
162 recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); 268}
163 if (recv->r_frag == NULL) 269
164 goto out; 270static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
165 INIT_LIST_HEAD(&recv->r_frag->f_item); 271 gfp_t slab_mask, gfp_t page_mask)
166 recv->r_frag->f_page = NULL; 272{
273 struct rds_page_frag *frag;
274 struct list_head *cache_item;
275 int ret;
276
277 cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
278 if (cache_item) {
279 frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
280 } else {
281 frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
282 if (!frag)
283 return NULL;
284
285 sg_init_table(&frag->f_sg, 1);
286 ret = rds_page_remainder_alloc(&frag->f_sg,
287 RDS_FRAG_SIZE, page_mask);
288 if (ret) {
289 kmem_cache_free(rds_ib_frag_slab, frag);
290 return NULL;
291 }
167 } 292 }
168 293
169 if (ic->i_frag.f_page == NULL) { 294 INIT_LIST_HEAD(&frag->f_item);
170 ic->i_frag.f_page = alloc_page(page_gfp); 295
171 if (ic->i_frag.f_page == NULL) 296 return frag;
172 goto out; 297}
173 ic->i_frag.f_offset = 0; 298
299static int rds_ib_recv_refill_one(struct rds_connection *conn,
300 struct rds_ib_recv_work *recv, int prefill)
301{
302 struct rds_ib_connection *ic = conn->c_transport_data;
303 struct ib_sge *sge;
304 int ret = -ENOMEM;
305 gfp_t slab_mask = GFP_NOWAIT;
306 gfp_t page_mask = GFP_NOWAIT;
307
308 if (prefill) {
309 slab_mask = GFP_KERNEL;
310 page_mask = GFP_HIGHUSER;
174 } 311 }
175 312
176 dma_addr = ib_dma_map_page(ic->i_cm_id->device, 313 if (!ic->i_cache_incs.ready)
177 ic->i_frag.f_page, 314 rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
178 ic->i_frag.f_offset, 315 if (!ic->i_cache_frags.ready)
179 RDS_FRAG_SIZE, 316 rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
180 DMA_FROM_DEVICE);
181 if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
182 goto out;
183 317
184 /* 318 /*
185 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap() 319 * ibinc was taken from recv if recv contained the start of a message.
186 * must be called on this recv. This happens as completions hit 320 * recvs that were continuations will still have this allocated.
187 * in order or on connection shutdown.
188 */ 321 */
189 recv->r_frag->f_page = ic->i_frag.f_page; 322 if (!recv->r_ibinc) {
190 recv->r_frag->f_offset = ic->i_frag.f_offset; 323 recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
191 recv->r_frag->f_mapped = dma_addr; 324 if (!recv->r_ibinc)
325 goto out;
326 }
192 327
193 sge = rds_ib_data_sge(ic, recv->r_sge); 328 WARN_ON(recv->r_frag); /* leak! */
194 sge->addr = dma_addr; 329 recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
195 sge->length = RDS_FRAG_SIZE; 330 if (!recv->r_frag)
331 goto out;
332
333 ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
334 1, DMA_FROM_DEVICE);
335 WARN_ON(ret != 1);
196 336
197 sge = rds_ib_header_sge(ic, recv->r_sge); 337 sge = &recv->r_sge[0];
198 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); 338 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
199 sge->length = sizeof(struct rds_header); 339 sge->length = sizeof(struct rds_header);
200 340
201 get_page(recv->r_frag->f_page); 341 sge = &recv->r_sge[1];
202 342 sge->addr = sg_dma_address(&recv->r_frag->f_sg);
203 if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { 343 sge->length = sg_dma_len(&recv->r_frag->f_sg);
204 ic->i_frag.f_offset += RDS_FRAG_SIZE;
205 } else {
206 put_page(ic->i_frag.f_page);
207 ic->i_frag.f_page = NULL;
208 ic->i_frag.f_offset = 0;
209 }
210 344
211 ret = 0; 345 ret = 0;
212out: 346out:
@@ -216,13 +350,11 @@ out:
216/* 350/*
217 * This tries to allocate and post unused work requests after making sure that 351 * This tries to allocate and post unused work requests after making sure that
218 * they have all the allocations they need to queue received fragments into 352 * they have all the allocations they need to queue received fragments into
219 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc 353 * sockets.
220 * pairs don't go unmatched.
221 * 354 *
222 * -1 is returned if posting fails due to temporary resource exhaustion. 355 * -1 is returned if posting fails due to temporary resource exhaustion.
223 */ 356 */
224int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, 357void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
225 gfp_t page_gfp, int prefill)
226{ 358{
227 struct rds_ib_connection *ic = conn->c_transport_data; 359 struct rds_ib_connection *ic = conn->c_transport_data;
228 struct rds_ib_recv_work *recv; 360 struct rds_ib_recv_work *recv;
@@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
236 if (pos >= ic->i_recv_ring.w_nr) { 368 if (pos >= ic->i_recv_ring.w_nr) {
237 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", 369 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
238 pos); 370 pos);
239 ret = -EINVAL;
240 break; 371 break;
241 } 372 }
242 373
243 recv = &ic->i_recvs[pos]; 374 recv = &ic->i_recvs[pos];
244 ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); 375 ret = rds_ib_recv_refill_one(conn, recv, prefill);
245 if (ret) { 376 if (ret) {
246 ret = -1;
247 break; 377 break;
248 } 378 }
249 379
250 /* XXX when can this fail? */ 380 /* XXX when can this fail? */
251 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); 381 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
252 rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, 382 rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
253 recv->r_ibinc, recv->r_frag->f_page, 383 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
254 (long) recv->r_frag->f_mapped, ret); 384 (long) sg_dma_address(&recv->r_frag->f_sg), ret);
255 if (ret) { 385 if (ret) {
256 rds_ib_conn_error(conn, "recv post on " 386 rds_ib_conn_error(conn, "recv post on "
257 "%pI4 returned %d, disconnecting and " 387 "%pI4 returned %d, disconnecting and "
258 "reconnecting\n", &conn->c_faddr, 388 "reconnecting\n", &conn->c_faddr,
259 ret); 389 ret);
260 ret = -1;
261 break; 390 break;
262 } 391 }
263 392
@@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
270 399
271 if (ret) 400 if (ret)
272 rds_ib_ring_unalloc(&ic->i_recv_ring, 1); 401 rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
273 return ret;
274} 402}
275 403
276void rds_ib_inc_purge(struct rds_incoming *inc) 404/*
405 * We want to recycle several types of recv allocations, like incs and frags.
406 * To use this, the *_free() function passes in the ptr to a list_head within
407 * the recyclee, as well as the cache to put it on.
408 *
409 * First, we put the memory on a percpu list. When this reaches a certain size,
410 * we move it to an intermediate non-percpu list in a lockless manner, with some
411 * xchg/cmpxchg wizardry.
412 *
413 * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
414 * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
415 * list_empty() will return true when one element is actually present.
416 */
417static void rds_ib_recv_cache_put(struct list_head *new_item,
418 struct rds_ib_refill_cache *cache)
277{ 419{
278 struct rds_ib_incoming *ibinc; 420 unsigned long flags;
279 struct rds_page_frag *frag; 421 struct rds_ib_cache_head *chp;
280 struct rds_page_frag *pos; 422 struct list_head *old;
281 423
282 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); 424 local_irq_save(flags);
283 rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
284 425
285 list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { 426 chp = per_cpu_ptr(cache->percpu, smp_processor_id());
286 list_del_init(&frag->f_item); 427 if (!chp->first)
287 rds_ib_frag_drop_page(frag); 428 INIT_LIST_HEAD(new_item);
288 rds_ib_frag_free(frag); 429 else /* put on front */
289 } 430 list_add_tail(new_item, chp->first);
431 chp->first = new_item;
432 chp->count++;
433
434 if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
435 goto end;
436
437 /*
438 * Return our per-cpu first list to the cache's xfer by atomically
439 * grabbing the current xfer list, appending it to our per-cpu list,
440 * and then atomically returning that entire list back to the
441 * cache's xfer list as long as it's still empty.
442 */
443 do {
444 old = xchg(&cache->xfer, NULL);
445 if (old)
446 list_splice_entire_tail(old, chp->first);
447 old = cmpxchg(&cache->xfer, NULL, chp->first);
448 } while (old);
449
450 chp->first = NULL;
451 chp->count = 0;
452end:
453 local_irq_restore(flags);
290} 454}
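
The put path above batches freed incs and frags per CPU, then publishes a full batch through cache->xfer with the xchg/cmpxchg retry loop. A minimal userspace sketch of that hand-off, with C11 atomics standing in for the kernel's xchg/cmpxchg and a bare singly linked list standing in for list_head (every name below is illustrative, not from the patch):

    #include <stdatomic.h>
    #include <stddef.h>

    struct node { struct node *next; };     /* stand-in for list_head */

    static _Atomic(struct node *) xfer;     /* the cache's hand-off slot */

    /* Append the NULL-terminated list 'extra' to the tail of 'batch'. */
    void splice_tail(struct node *batch, struct node *extra)
    {
            struct node *t = batch;

            while (t->next)
                    t = t->next;
            t->next = extra;
    }

    /* Publish a private per-CPU batch, absorbing anything already queued. */
    void cache_put_batch(struct node *batch)
    {
            struct node *old;

            do {
                    /* Steal whatever is currently waiting for transfer... */
                    old = atomic_exchange(&xfer, NULL);
                    if (old)
                            splice_tail(batch, old);
                    /* ...then try to install the combined batch. If another
                     * thread republished meanwhile, the CAS fails, 'old' is
                     * non-NULL, and we loop to absorb that list too. */
                    old = NULL;
                    atomic_compare_exchange_strong(&xfer, &old, batch);
            } while (old);
    }

The kernel version additionally runs with interrupts off on the local CPU and only publishes once RDS_IB_RECYCLE_BATCH_COUNT entries have accumulated.
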
291 455
292void rds_ib_inc_free(struct rds_incoming *inc) 456static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
293{ 457{
294 struct rds_ib_incoming *ibinc; 458 struct list_head *head = cache->ready;
295 459
296 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); 460 if (head) {
461 if (!list_empty(head)) {
462 cache->ready = head->next;
463 list_del_init(head);
464 } else
465 cache->ready = NULL;
466 }
297 467
298 rds_ib_inc_purge(inc); 468 return head;
299 rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
300 BUG_ON(!list_empty(&ibinc->ii_frags));
301 kmem_cache_free(rds_ib_incoming_slab, ibinc);
302 atomic_dec(&rds_ib_allocation);
303 BUG_ON(atomic_read(&rds_ib_allocation) < 0);
304} 469}
305 470
306int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, 471int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
@@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
336 to_copy = min_t(unsigned long, to_copy, len - copied); 501 to_copy = min_t(unsigned long, to_copy, len - copied);
337 502
338 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " 503 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
339 "[%p, %lu] + %lu\n", 504 "[%p, %u] + %lu\n",
340 to_copy, iov->iov_base, iov->iov_len, iov_off, 505 to_copy, iov->iov_base, iov->iov_len, iov_off,
341 frag->f_page, frag->f_offset, frag_off); 506 sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
342 507
343 /* XXX needs + offset for multiple recvs per page */ 508 /* XXX needs + offset for multiple recvs per page */
344 ret = rds_page_copy_to_user(frag->f_page, 509 ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
345 frag->f_offset + frag_off, 510 frag->f_sg.offset + frag_off,
346 iov->iov_base + iov_off, 511 iov->iov_base + iov_off,
347 to_copy); 512 to_copy);
348 if (ret) { 513 if (ret) {
@@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
557 return rds_ib_get_ack(ic); 722 return rds_ib_get_ack(ic);
558} 723}
559 724
560static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
561 struct rds_ib_recv_work *recv,
562 u32 data_len)
563{
564 struct rds_ib_connection *ic = conn->c_transport_data;
565 void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
566 void *addr;
567 u32 misplaced_hdr_bytes;
568
569 /*
570 * Support header at the front (RDS 3.1+) as well as header-at-end.
571 *
572 * Cases:
573 * 1) header all in header buff (great!)
574 * 2) header all in data page (copy all to header buff)
575 * 3) header split across hdr buf + data page
576 * (move bit in hdr buff to end before copying other bit from data page)
577 */
578 if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
579 return hdr_buff;
580
581 if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
582 addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
583 memcpy(hdr_buff,
584 addr + recv->r_frag->f_offset + data_len,
585 sizeof(struct rds_header));
586 kunmap_atomic(addr, KM_SOFTIRQ0);
587 return hdr_buff;
588 }
589
590 misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
591
592 memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
593
594 addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
595 memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
596 sizeof(struct rds_header) - misplaced_hdr_bytes);
597 kunmap_atomic(addr, KM_SOFTIRQ0);
598 return hdr_buff;
599}
600
601/* 725/*
602 * It's kind of lame that we're copying from the posted receive pages into 726 * It's kind of lame that we're copying from the posted receive pages into
603 * long-lived bitmaps. We could have posted the bitmaps and rdma written into 727 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
@@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
639 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); 763 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
640 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ 764 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
641 765
642 addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); 766 addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
643 767
644 src = addr + frag_off; 768 src = addr + frag_off;
645 dst = (void *)map->m_page_addrs[map_page] + map_off; 769 dst = (void *)map->m_page_addrs[map_page] + map_off;
@@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
710 } 834 }
711 data_len -= sizeof(struct rds_header); 835 data_len -= sizeof(struct rds_header);
712 836
713 ihdr = rds_ib_get_header(conn, recv, data_len); 837 ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
714 838
715 /* Validate the checksum. */ 839 /* Validate the checksum. */
716 if (!rds_message_verify_checksum(ihdr)) { 840 if (!rds_message_verify_checksum(ihdr)) {
@@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
742 * the inc is freed. We don't go that route, so we have to drop the 866 * the inc is freed. We don't go that route, so we have to drop the
743 * page ref ourselves. We can't just leave the page on the recv 867 * page ref ourselves. We can't just leave the page on the recv
744 * because that confuses the dma mapping of pages and each recv's use 868 * because that confuses the dma mapping of pages and each recv's use
745 * of a partial page. We can leave the frag, though, it will be 869 * of a partial page.
746 * reused.
747 * 870 *
748 * FIXME: Fold this into the code path below. 871 * FIXME: Fold this into the code path below.
749 */ 872 */
750 rds_ib_frag_drop_page(recv->r_frag); 873 rds_ib_frag_free(ic, recv->r_frag);
874 recv->r_frag = NULL;
751 return; 875 return;
752 } 876 }
753 877
@@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
757 * into the inc and save the inc so we can hang upcoming fragments 881 * into the inc and save the inc so we can hang upcoming fragments
758 * off its list. 882 * off its list.
759 */ 883 */
760 if (ibinc == NULL) { 884 if (!ibinc) {
761 ibinc = recv->r_ibinc; 885 ibinc = recv->r_ibinc;
762 recv->r_ibinc = NULL; 886 recv->r_ibinc = NULL;
763 ic->i_ibinc = ibinc; 887 ic->i_ibinc = ibinc;
@@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
842 struct rds_ib_recv_work *recv; 966 struct rds_ib_recv_work *recv;
843 967
844 while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { 968 while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
845 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", 969 rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
846 (unsigned long long)wc.wr_id, wc.status, wc.byte_len, 970 (unsigned long long)wc.wr_id, wc.status,
971 rds_ib_wc_status_str(wc.status), wc.byte_len,
847 be32_to_cpu(wc.ex.imm_data)); 972 be32_to_cpu(wc.ex.imm_data));
848 rds_ib_stats_inc(s_ib_rx_cq_event); 973 rds_ib_stats_inc(s_ib_rx_cq_event);
849 974
850 recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; 975 recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
851 976
852 rds_ib_recv_unmap_page(ic, recv); 977 ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
853 978
854 /* 979 /*
855 * Also process recvs in connecting state because it is possible 980 * Also process recvs in connecting state because it is possible
856 * to get a recv completion _before_ the rdmacm ESTABLISHED 981 * to get a recv completion _before_ the rdmacm ESTABLISHED
857 * event is processed. 982 * event is processed.
858 */ 983 */
859 if (rds_conn_up(conn) || rds_conn_connecting(conn)) { 984 if (wc.status == IB_WC_SUCCESS) {
985 rds_ib_process_recv(conn, recv, wc.byte_len, state);
986 } else {
860 /* We expect errors as the qp is drained during shutdown */ 987 /* We expect errors as the qp is drained during shutdown */
861 if (wc.status == IB_WC_SUCCESS) { 988 if (rds_conn_up(conn) || rds_conn_connecting(conn))
862 rds_ib_process_recv(conn, recv, wc.byte_len, state); 989 rds_ib_conn_error(conn, "recv completion on %pI4 had "
863 } else { 990 "status %u (%s), disconnecting and "
864 rds_ib_conn_error(conn, "recv completion on " 991 "reconnecting\n", &conn->c_faddr,
865 "%pI4 had status %u, disconnecting and " 992 wc.status,
866 "reconnecting\n", &conn->c_faddr, 993 rds_ib_wc_status_str(wc.status));
867 wc.status);
868 }
869 } 994 }
870 995
996 /*
997 * It's very important that we only free this ring entry if we've truly
998 * freed the resources allocated to the entry. The refilling path can
999 * leak if we don't.
1000 */
871 rds_ib_ring_free(&ic->i_recv_ring, 1); 1001 rds_ib_ring_free(&ic->i_recv_ring, 1);
872 } 1002 }
873} 1003}
@@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
897 if (rds_ib_ring_empty(&ic->i_recv_ring)) 1027 if (rds_ib_ring_empty(&ic->i_recv_ring))
898 rds_ib_stats_inc(s_ib_rx_ring_empty); 1028 rds_ib_stats_inc(s_ib_rx_ring_empty);
899 1029
900 /*
901 * If the ring is running low, then schedule the thread to refill.
902 */
903 if (rds_ib_ring_low(&ic->i_recv_ring)) 1030 if (rds_ib_ring_low(&ic->i_recv_ring))
904 queue_delayed_work(rds_wq, &conn->c_recv_w, 0); 1031 rds_ib_recv_refill(conn, 0);
905} 1032}
906 1033
907int rds_ib_recv(struct rds_connection *conn) 1034int rds_ib_recv(struct rds_connection *conn)
@@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn)
910 int ret = 0; 1037 int ret = 0;
911 1038
912 rdsdebug("conn %p\n", conn); 1039 rdsdebug("conn %p\n", conn);
913
914 /*
915 * If we get a temporary posting failure in this context then
916 * we're really low and we want the caller to back off for a bit.
917 */
918 mutex_lock(&ic->i_recv_mutex);
919 if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
920 ret = -ENOMEM;
921 else
922 rds_ib_stats_inc(s_ib_rx_refill_from_thread);
923 mutex_unlock(&ic->i_recv_mutex);
924
925 if (rds_conn_up(conn)) 1040 if (rds_conn_up(conn))
926 rds_ib_attempt_ack(ic); 1041 rds_ib_attempt_ack(ic);
927 1042
928 return ret; 1043 return ret;
929} 1044}
930 1045
931int __init rds_ib_recv_init(void) 1046int rds_ib_recv_init(void)
932{ 1047{
933 struct sysinfo si; 1048 struct sysinfo si;
934 int ret = -ENOMEM; 1049 int ret = -ENOMEM;
@@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void)
939 1054
940 rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", 1055 rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
941 sizeof(struct rds_ib_incoming), 1056 sizeof(struct rds_ib_incoming),
942 0, 0, NULL); 1057 0, SLAB_HWCACHE_ALIGN, NULL);
943 if (rds_ib_incoming_slab == NULL) 1058 if (!rds_ib_incoming_slab)
944 goto out; 1059 goto out;
945 1060
946 rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", 1061 rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
947 sizeof(struct rds_page_frag), 1062 sizeof(struct rds_page_frag),
948 0, 0, NULL); 1063 0, SLAB_HWCACHE_ALIGN, NULL);
949 if (rds_ib_frag_slab == NULL) 1064 if (!rds_ib_frag_slab)
950 kmem_cache_destroy(rds_ib_incoming_slab); 1065 kmem_cache_destroy(rds_ib_incoming_slab);
951 else 1066 else
952 ret = 0; 1067 ret = 0;
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 17fa80803ab0..7c4dce8fa5e6 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -36,11 +36,49 @@
36#include <linux/dmapool.h> 36#include <linux/dmapool.h>
37 37
38#include "rds.h" 38#include "rds.h"
39#include "rdma.h"
40#include "ib.h" 39#include "ib.h"
41 40
42static void rds_ib_send_rdma_complete(struct rds_message *rm, 41static char *rds_ib_wc_status_strings[] = {
43 int wc_status) 42#define RDS_IB_WC_STATUS_STR(foo) \
43 [IB_WC_##foo] = __stringify(IB_WC_##foo)
44 RDS_IB_WC_STATUS_STR(SUCCESS),
45 RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
46 RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
47 RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
48 RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
49 RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
50 RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
51 RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
52 RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
53 RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
54 RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
55 RDS_IB_WC_STATUS_STR(REM_OP_ERR),
56 RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
57 RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
58 RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
59 RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
60 RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
61 RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
62 RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
63 RDS_IB_WC_STATUS_STR(FATAL_ERR),
64 RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
65 RDS_IB_WC_STATUS_STR(GENERAL_ERR),
66#undef RDS_IB_WC_STATUS_STR
67};
68
69char *rds_ib_wc_status_str(enum ib_wc_status status)
70{
71 return rds_str_array(rds_ib_wc_status_strings,
72 ARRAY_SIZE(rds_ib_wc_status_strings), status);
73}
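
The table above pairs designated initializers with __stringify() so each status value indexes its own name, and the lookup is bounds- and NULL-checked so sparse or unknown codes never dereference garbage. The same pattern in a compact, compilable userspace form (the enum values, names, and fallback string here are invented for illustration):

    #include <stdio.h>

    enum wc_status {                        /* stand-in for enum ib_wc_status */
            WC_SUCCESS,
            WC_LOC_LEN_ERR,
            WC_WR_FLUSH_ERR = 5,            /* leave a hole on purpose */
            WC_NR
    };

    #define WC_STR(foo) [WC_##foo] = #foo   /* plays the __stringify role */

    static const char *wc_status_strings[WC_NR] = {
            WC_STR(SUCCESS),
            WC_STR(LOC_LEN_ERR),
            WC_STR(WR_FLUSH_ERR),
    };

    /* Bounds- and NULL-checked lookup, like rds_ib_wc_status_str() above. */
    static const char *wc_status_str(unsigned int status)
    {
            if (status < WC_NR && wc_status_strings[status])
                    return wc_status_strings[status];
            return "unknown";
    }

    int main(void)
    {
            /* prints: WR_FLUSH_ERR unknown unknown */
            printf("%s %s %s\n", wc_status_str(WC_WR_FLUSH_ERR),
                   wc_status_str(3), wc_status_str(42));
            return 0;
    }
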
74
75/*
76 * Convert IB-specific error message to RDS error message and call core
77 * completion handler.
78 */
79static void rds_ib_send_complete(struct rds_message *rm,
80 int wc_status,
81 void (*complete)(struct rds_message *rm, int status))
44{ 82{
45 int notify_status; 83 int notify_status;
46 84
@@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm,
60 notify_status = RDS_RDMA_OTHER_ERROR; 98 notify_status = RDS_RDMA_OTHER_ERROR;
61 break; 99 break;
62 } 100 }
63 rds_rdma_send_complete(rm, notify_status); 101 complete(rm, notify_status);
102}
103
104static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
105 struct rm_data_op *op,
106 int wc_status)
107{
108 if (op->op_nents)
109 ib_dma_unmap_sg(ic->i_cm_id->device,
110 op->op_sg, op->op_nents,
111 DMA_TO_DEVICE);
64} 112}
65 113
66static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, 114static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
67 struct rds_rdma_op *op) 115 struct rm_rdma_op *op,
116 int wc_status)
68{ 117{
69 if (op->r_mapped) { 118 if (op->op_mapped) {
70 ib_dma_unmap_sg(ic->i_cm_id->device, 119 ib_dma_unmap_sg(ic->i_cm_id->device,
71 op->r_sg, op->r_nents, 120 op->op_sg, op->op_nents,
72 op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 121 op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
73 op->r_mapped = 0; 122 op->op_mapped = 0;
74 } 123 }
124
125 /* If the user asked for a completion notification on this
126 * message, we can implement three different semantics:
127 * 1. Notify when we received the ACK on the RDS message
128 * that was queued with the RDMA. This provides reliable
129 * notification of RDMA status at the expense of a one-way
130 * packet delay.
131 * 2. Notify when the IB stack gives us the completion event for
132 * the RDMA operation.
133 * 3. Notify when the IB stack gives us the completion event for
134 * the accompanying RDS messages.
135 * Here, we implement approach #3. To implement approach #2,
136 * we would need to take an event for the rdma WR. To implement #1,
137 * don't call rds_rdma_send_complete at all, and fall back to the notify
138 * handling in the ACK processing code.
139 *
140 * Note: There's no need to explicitly sync any RDMA buffers using
141 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
142 * operation itself unmapped the RDMA buffers, which takes care
143 * of synching.
144 */
145 rds_ib_send_complete(container_of(op, struct rds_message, rdma),
146 wc_status, rds_rdma_send_complete);
147
148 if (op->op_write)
149 rds_stats_add(s_send_rdma_bytes, op->op_bytes);
150 else
151 rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
75} 152}
76 153
77static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, 154static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
78 struct rds_ib_send_work *send, 155 struct rm_atomic_op *op,
79 int wc_status) 156 int wc_status)
80{ 157{
81 struct rds_message *rm = send->s_rm; 158 /* unmap atomic recvbuf */
82 159 if (op->op_mapped) {
83 rdsdebug("ic %p send %p rm %p\n", ic, send, rm); 160 ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
84 161 DMA_FROM_DEVICE);
85 ib_dma_unmap_sg(ic->i_cm_id->device, 162 op->op_mapped = 0;
86 rm->m_sg, rm->m_nents, 163 }
87 DMA_TO_DEVICE);
88
89 if (rm->m_rdma_op != NULL) {
90 rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
91
92 /* If the user asked for a completion notification on this
93 * message, we can implement three different semantics:
94 * 1. Notify when we received the ACK on the RDS message
95 * that was queued with the RDMA. This provides reliable
96 * notification of RDMA status at the expense of a one-way
97 * packet delay.
98 * 2. Notify when the IB stack gives us the completion event for
99 * the RDMA operation.
100 * 3. Notify when the IB stack gives us the completion event for
101 * the accompanying RDS messages.
102 * Here, we implement approach #3. To implement approach #2,
103 * call rds_rdma_send_complete from the cq_handler. To implement #1,
104 * don't call rds_rdma_send_complete at all, and fall back to the notify
105 * handling in the ACK processing code.
106 *
107 * Note: There's no need to explicitly sync any RDMA buffers using
108 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
109 * operation itself unmapped the RDMA buffers, which takes care
110 * of synching.
111 */
112 rds_ib_send_rdma_complete(rm, wc_status);
113 164
114 if (rm->m_rdma_op->r_write) 165 rds_ib_send_complete(container_of(op, struct rds_message, atomic),
115 rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); 166 wc_status, rds_atomic_send_complete);
116 else 167
117 rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); 168 if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
169 rds_ib_stats_inc(s_ib_atomic_cswp);
170 else
171 rds_ib_stats_inc(s_ib_atomic_fadd);
172}
173
174/*
175 * Unmap the resources associated with a struct send_work.
176 *
177 * Returns the rm for no good reason other than it is unobtainable
178 * other than by switching on wr.opcode, currently, and the caller,
179 * the event handler, needs it.
180 */
181static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
182 struct rds_ib_send_work *send,
183 int wc_status)
184{
185 struct rds_message *rm = NULL;
186
187 /* In the error case, wc.opcode sometimes contains garbage */
188 switch (send->s_wr.opcode) {
189 case IB_WR_SEND:
190 if (send->s_op) {
191 rm = container_of(send->s_op, struct rds_message, data);
192 rds_ib_send_unmap_data(ic, send->s_op, wc_status);
193 }
194 break;
195 case IB_WR_RDMA_WRITE:
196 case IB_WR_RDMA_READ:
197 if (send->s_op) {
198 rm = container_of(send->s_op, struct rds_message, rdma);
199 rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
200 }
201 break;
202 case IB_WR_ATOMIC_FETCH_AND_ADD:
203 case IB_WR_ATOMIC_CMP_AND_SWP:
204 if (send->s_op) {
205 rm = container_of(send->s_op, struct rds_message, atomic);
206 rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
207 }
208 break;
209 default:
210 if (printk_ratelimit())
211 printk(KERN_NOTICE
212 "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
213 __func__, send->s_wr.opcode);
214 break;
118 } 215 }
119 216
120 /* If anyone waited for this message to get flushed out, wake 217 send->s_wr.opcode = 0xdead;
121 * them up now */
122 rds_message_unmapped(rm);
123 218
124 rds_message_put(rm); 219 return rm;
125 send->s_rm = NULL;
126} 220}
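
rds_ib_send_unmap_op() above leans on the fact that the data, rdma and atomic ops are embedded directly in struct rds_message, so container_of() can walk from the stored s_op pointer back to the message that owns it; the switch on wr.opcode is what tells it which member to walk back through. A self-contained illustration of that recovery step (the struct names are simplified stand-ins):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct rdma_op { int nents; };          /* embedded op, like rm_rdma_op */

    struct message {                        /* stand-in for struct rds_message */
            int seq;
            struct rdma_op rdma;
    };

    int main(void)
    {
            struct message m = { .seq = 7 };
            struct rdma_op *op = &m.rdma;   /* what a send_work would remember */

            /* Recover the enclosing message from the embedded op pointer. */
            struct message *rm = container_of(op, struct message, rdma);

            printf("seq=%d same=%d\n", rm->seq, rm == &m);
            return 0;
    }
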
127 221
128void rds_ib_send_init_ring(struct rds_ib_connection *ic) 222void rds_ib_send_init_ring(struct rds_ib_connection *ic)
@@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
133 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { 227 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
134 struct ib_sge *sge; 228 struct ib_sge *sge;
135 229
136 send->s_rm = NULL;
137 send->s_op = NULL; 230 send->s_op = NULL;
138 231
139 send->s_wr.wr_id = i; 232 send->s_wr.wr_id = i;
140 send->s_wr.sg_list = send->s_sge; 233 send->s_wr.sg_list = send->s_sge;
141 send->s_wr.num_sge = 1;
142 send->s_wr.opcode = IB_WR_SEND;
143 send->s_wr.send_flags = 0;
144 send->s_wr.ex.imm_data = 0; 234 send->s_wr.ex.imm_data = 0;
145 235
146 sge = rds_ib_data_sge(ic, send->s_sge); 236 sge = &send->s_sge[0];
147 sge->lkey = ic->i_mr->lkey;
148
149 sge = rds_ib_header_sge(ic, send->s_sge);
150 sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); 237 sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
151 sge->length = sizeof(struct rds_header); 238 sge->length = sizeof(struct rds_header);
152 sge->lkey = ic->i_mr->lkey; 239 sge->lkey = ic->i_mr->lkey;
240
241 send->s_sge[1].lkey = ic->i_mr->lkey;
153 } 242 }
154} 243}
155 244
@@ -159,16 +248,24 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
159 u32 i; 248 u32 i;
160 249
161 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { 250 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
162 if (send->s_wr.opcode == 0xdead) 251 if (send->s_op && send->s_wr.opcode != 0xdead)
163 continue; 252 rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
164 if (send->s_rm)
165 rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
166 if (send->s_op)
167 rds_ib_send_unmap_rdma(ic, send->s_op);
168 } 253 }
169} 254}
170 255
171/* 256/*
257 * The only fast path caller always has a non-zero nr, so we don't
258 * bother testing nr before performing the atomic sub.
259 */
260static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
261{
262 if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
263 waitqueue_active(&rds_ib_ring_empty_wait))
264 wake_up(&rds_ib_ring_empty_wait);
265 BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
266}
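
i_signaled_sends counts work requests that were posted with IB_SEND_SIGNALED and whose completions have not been reaped yet; rds_ib_sub_signaled() above wakes rds_ib_ring_empty_wait when the count drains, presumably so a teardown path (not visible in this hunk) can wait for outstanding signaled work. A rough userspace analogue of that add/sub/wait shape, with a condition variable standing in for the kernel waitqueue (all names invented):

    #include <pthread.h>
    #include <stdatomic.h>

    static atomic_int signaled_sends;
    static pthread_mutex_t drain_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  drain_cv   = PTHREAD_COND_INITIALIZER;

    /* Posting path: count WRs flagged for signaled completion. */
    void add_signaled(int nr)
    {
            if (nr)
                    atomic_fetch_add(&signaled_sends, nr);
    }

    /* Completion (or post-failure) path: drop the count, wake at zero. */
    void sub_signaled(int nr)
    {
            if (atomic_fetch_sub(&signaled_sends, nr) == nr) {
                    pthread_mutex_lock(&drain_lock);
                    pthread_cond_broadcast(&drain_cv);
                    pthread_mutex_unlock(&drain_lock);
            }
    }

    /* Teardown path: wait until every signaled send has completed. */
    void wait_for_drain(void)
    {
            pthread_mutex_lock(&drain_lock);
            while (atomic_load(&signaled_sends) != 0)
                    pthread_cond_wait(&drain_cv, &drain_lock);
            pthread_mutex_unlock(&drain_lock);
    }
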
267
268/*
172 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc 269 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
173 * operations performed in the send path. As the sender allocs and potentially 270 * operations performed in the send path. As the sender allocs and potentially
174 * unallocs the next free entry in the ring it doesn't alter which is 271 * unallocs the next free entry in the ring it doesn't alter which is
@@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
178{ 275{
179 struct rds_connection *conn = context; 276 struct rds_connection *conn = context;
180 struct rds_ib_connection *ic = conn->c_transport_data; 277 struct rds_ib_connection *ic = conn->c_transport_data;
278 struct rds_message *rm = NULL;
181 struct ib_wc wc; 279 struct ib_wc wc;
182 struct rds_ib_send_work *send; 280 struct rds_ib_send_work *send;
183 u32 completed; 281 u32 completed;
184 u32 oldest; 282 u32 oldest;
185 u32 i = 0; 283 u32 i = 0;
186 int ret; 284 int ret;
285 int nr_sig = 0;
187 286
188 rdsdebug("cq %p conn %p\n", cq, conn); 287 rdsdebug("cq %p conn %p\n", cq, conn);
189 rds_ib_stats_inc(s_ib_tx_cq_call); 288 rds_ib_stats_inc(s_ib_tx_cq_call);
@@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
192 rdsdebug("ib_req_notify_cq send failed: %d\n", ret); 291 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
193 292
194 while (ib_poll_cq(cq, 1, &wc) > 0) { 293 while (ib_poll_cq(cq, 1, &wc) > 0) {
195 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", 294 rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
196 (unsigned long long)wc.wr_id, wc.status, wc.byte_len, 295 (unsigned long long)wc.wr_id, wc.status,
296 rds_ib_wc_status_str(wc.status), wc.byte_len,
197 be32_to_cpu(wc.ex.imm_data)); 297 be32_to_cpu(wc.ex.imm_data));
198 rds_ib_stats_inc(s_ib_tx_cq_event); 298 rds_ib_stats_inc(s_ib_tx_cq_event);
199 299
@@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
210 310
211 for (i = 0; i < completed; i++) { 311 for (i = 0; i < completed; i++) {
212 send = &ic->i_sends[oldest]; 312 send = &ic->i_sends[oldest];
313 if (send->s_wr.send_flags & IB_SEND_SIGNALED)
314 nr_sig++;
213 315
214 /* In the error case, wc.opcode sometimes contains garbage */ 316 rm = rds_ib_send_unmap_op(ic, send, wc.status);
215 switch (send->s_wr.opcode) {
216 case IB_WR_SEND:
217 if (send->s_rm)
218 rds_ib_send_unmap_rm(ic, send, wc.status);
219 break;
220 case IB_WR_RDMA_WRITE:
221 case IB_WR_RDMA_READ:
222 /* Nothing to be done - the SG list will be unmapped
223 * when the SEND completes. */
224 break;
225 default:
226 if (printk_ratelimit())
227 printk(KERN_NOTICE
228 "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
229 __func__, send->s_wr.opcode);
230 break;
231 }
232 317
233 send->s_wr.opcode = 0xdead;
234 send->s_wr.num_sge = 1;
235 if (send->s_queued + HZ/2 < jiffies) 318 if (send->s_queued + HZ/2 < jiffies)
236 rds_ib_stats_inc(s_ib_tx_stalled); 319 rds_ib_stats_inc(s_ib_tx_stalled);
237 320
238 /* If a RDMA operation produced an error, signal this right 321 if (send->s_op) {
239 * away. If we don't, the subsequent SEND that goes with this 322 if (send->s_op == rm->m_final_op) {
240 * RDMA will be canceled with ERR_WFLUSH, and the application 323 /* If anyone waited for this message to get flushed out, wake
241 * never learn that the RDMA failed. */ 324 * them up now */
242 if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { 325 rds_message_unmapped(rm);
243 struct rds_message *rm;
244
245 rm = rds_send_get_message(conn, send->s_op);
246 if (rm) {
247 if (rm->m_rdma_op)
248 rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
249 rds_ib_send_rdma_complete(rm, wc.status);
250 rds_message_put(rm);
251 } 326 }
327 rds_message_put(rm);
328 send->s_op = NULL;
252 } 329 }
253 330
254 oldest = (oldest + 1) % ic->i_send_ring.w_nr; 331 oldest = (oldest + 1) % ic->i_send_ring.w_nr;
255 } 332 }
256 333
257 rds_ib_ring_free(&ic->i_send_ring, completed); 334 rds_ib_ring_free(&ic->i_send_ring, completed);
335 rds_ib_sub_signaled(ic, nr_sig);
336 nr_sig = 0;
258 337
259 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || 338 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
260 test_bit(0, &conn->c_map_queued)) 339 test_bit(0, &conn->c_map_queued))
@@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
262 341
263 /* We expect errors as the qp is drained during shutdown */ 342 /* We expect errors as the qp is drained during shutdown */
264 if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { 343 if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
265 rds_ib_conn_error(conn, 344 rds_ib_conn_error(conn, "send completion on %pI4 had status "
266 "send completion on %pI4 " 345 "%u (%s), disconnecting and reconnecting\n",
267 "had status %u, disconnecting and reconnecting\n", 346 &conn->c_faddr, wc.status,
268 &conn->c_faddr, wc.status); 347 rds_ib_wc_status_str(wc.status));
269 } 348 }
270 } 349 }
271} 350}
@@ -276,7 +355,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
276 * 355 *
277 * Conceptually, we have two counters: 356 * Conceptually, we have two counters:
278 * - send credits: this tells us how many WRs we're allowed 357 * - send credits: this tells us how many WRs we're allowed
279 * to submit without overruning the reciever's queue. For 358 * to submit without overruning the receiver's queue. For
280 * each SEND WR we post, we decrement this by one. 359 * each SEND WR we post, we decrement this by one.
281 * 360 *
282 * - posted credits: this tells us how many WRs we recently 361 * - posted credits: this tells us how many WRs we recently
@@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
294 * credits (see rds_ib_send_add_credits below). 373 * credits (see rds_ib_send_add_credits below).
295 * 374 *
296 * The RDS send code is essentially single-threaded; rds_send_xmit 375 * The RDS send code is essentially single-threaded; rds_send_xmit
297 * grabs c_send_lock to ensure exclusive access to the send ring. 376 * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
298 * However, the ACK sending code is independent and can race with 377 * However, the ACK sending code is independent and can race with
299 * message SENDs. 378 * message SENDs.
300 * 379 *
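
One way to keep the two counters coherent under concurrent senders and the ACK path, and (as an assumption about the implementation, not something shown in this hunk) roughly what the IB transport does with a single atomic i_credits word, is to pack both 16-bit counts into one value so a grab or an advertise is a single atomic update. A sketch of the send-credit half, ignoring overflow and the posted-credit handling (all names and the packing layout are illustrative):

    #include <stdatomic.h>
    #include <stdint.h>

    /* low 16 bits: send credits we may spend; high 16 bits: receive
     * buffers posted locally but not yet advertised to the peer */
    static atomic_uint_fast32_t credits;

    #define SEND_CREDITS(v) ((uint32_t)(v) & 0xffff)
    #define POST_CREDITS(v) ((uint32_t)(v) >> 16)

    /* The peer advertised that it posted more receive buffers. */
    void add_send_credits(unsigned int newly_advertised)
    {
            atomic_fetch_add(&credits, newly_advertised);
    }

    /* Try to take 'wanted' send credits; returns how many we got. */
    unsigned int grab_send_credits(unsigned int wanted)
    {
            uint_fast32_t old = atomic_load(&credits);

            for (;;) {
                    unsigned int avail = SEND_CREDITS(old);
                    unsigned int got = wanted < avail ? wanted : avail;

                    if (got == 0)
                            return 0;
                    /* on failure 'old' is reloaded and we retry */
                    if (atomic_compare_exchange_weak(&credits, &old, old - got))
                            return got;
            }
    }
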
@@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
413 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); 492 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
414} 493}
415 494
416static inline void 495static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
417rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, 496 struct rds_ib_send_work *send,
418 struct rds_ib_send_work *send, unsigned int pos, 497 bool notify)
419 unsigned long buffer, unsigned int length,
420 int send_flags)
421{ 498{
422 struct ib_sge *sge; 499 /*
423 500 * We want to delay signaling completions just enough to get
424 WARN_ON(pos != send - ic->i_sends); 501 * the batching benefits but not so much that we create dead time
425 502 * on the wire.
426 send->s_wr.send_flags = send_flags; 503 */
427 send->s_wr.opcode = IB_WR_SEND; 504 if (ic->i_unsignaled_wrs-- == 0 || notify) {
428 send->s_wr.num_sge = 2; 505 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
429 send->s_wr.next = NULL; 506 send->s_wr.send_flags |= IB_SEND_SIGNALED;
430 send->s_queued = jiffies; 507 return 1;
431 send->s_op = NULL;
432
433 if (length != 0) {
434 sge = rds_ib_data_sge(ic, send->s_sge);
435 sge->addr = buffer;
436 sge->length = length;
437 sge->lkey = ic->i_mr->lkey;
438
439 sge = rds_ib_header_sge(ic, send->s_sge);
440 } else {
441 /* We're sending a packet with no payload. There is only
442 * one SGE */
443 send->s_wr.num_sge = 1;
444 sge = &send->s_sge[0];
445 } 508 }
446 509 return 0;
447 sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
448 sge->length = sizeof(struct rds_header);
449 sge->lkey = ic->i_mr->lkey;
450} 510}
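
The helper above encodes the trade-off its comment describes: a completion event per WR is wasteful, but fully unsignaled traffic would never let the CQ handler sweep the ring, since unsignaled slots are only reclaimed when some later signaled WR completes. So it signals every Nth WR, and immediately whenever the caller asked to be notified. The decision in miniature (the constant and flag bit are invented):

    #define MAX_UNSIGNALED 16
    #define SIGNALED_FLAG  (1u << 0)        /* stands in for IB_SEND_SIGNALED */

    static unsigned int unsignaled_left = MAX_UNSIGNALED;

    /* Returns 1 if this WR will generate a completion, 0 otherwise. */
    int set_wr_signal_state(unsigned int *send_flags, int notify)
    {
            if (unsignaled_left-- == 0 || notify) {
                    unsignaled_left = MAX_UNSIGNALED;
                    *send_flags |= SIGNALED_FLAG;
                    return 1;
            }
            return 0;
    }
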
451 511
452/* 512/*
@@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
475 u32 pos; 535 u32 pos;
476 u32 i; 536 u32 i;
477 u32 work_alloc; 537 u32 work_alloc;
478 u32 credit_alloc; 538 u32 credit_alloc = 0;
479 u32 posted; 539 u32 posted;
480 u32 adv_credits = 0; 540 u32 adv_credits = 0;
481 int send_flags = 0; 541 int send_flags = 0;
482 int sent; 542 int bytes_sent = 0;
483 int ret; 543 int ret;
484 int flow_controlled = 0; 544 int flow_controlled = 0;
545 int nr_sig = 0;
485 546
486 BUG_ON(off % RDS_FRAG_SIZE); 547 BUG_ON(off % RDS_FRAG_SIZE);
487 BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); 548 BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
@@ -490,7 +551,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
490 if (conn->c_loopback 551 if (conn->c_loopback
491 && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { 552 && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
492 rds_cong_map_updated(conn->c_fcong, ~(u64) 0); 553 rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
493 return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; 554 scat = &rm->data.op_sg[sg];
555 ret = sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
556 ret = min_t(int, ret, scat->length - conn->c_xmit_data_off);
557 return ret;
494 } 558 }
495 559
496 /* FIXME we may overallocate here */ 560 /* FIXME we may overallocate here */
@@ -507,14 +571,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
507 goto out; 571 goto out;
508 } 572 }
509 573
510 credit_alloc = work_alloc;
511 if (ic->i_flowctl) { 574 if (ic->i_flowctl) {
512 credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); 575 credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
513 adv_credits += posted; 576 adv_credits += posted;
514 if (credit_alloc < work_alloc) { 577 if (credit_alloc < work_alloc) {
515 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); 578 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
516 work_alloc = credit_alloc; 579 work_alloc = credit_alloc;
517 flow_controlled++; 580 flow_controlled = 1;
518 } 581 }
519 if (work_alloc == 0) { 582 if (work_alloc == 0) {
520 set_bit(RDS_LL_SEND_FULL, &conn->c_flags); 583 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
@@ -525,31 +588,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
525 } 588 }
526 589
527 /* map the message the first time we see it */ 590 /* map the message the first time we see it */
528 if (ic->i_rm == NULL) { 591 if (!ic->i_data_op) {
529 /* 592 if (rm->data.op_nents) {
530 printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", 593 rm->data.op_count = ib_dma_map_sg(dev,
531 be16_to_cpu(rm->m_inc.i_hdr.h_dport), 594 rm->data.op_sg,
532 rm->m_inc.i_hdr.h_flags, 595 rm->data.op_nents,
533 be32_to_cpu(rm->m_inc.i_hdr.h_len)); 596 DMA_TO_DEVICE);
534 */ 597 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
535 if (rm->m_nents) { 598 if (rm->data.op_count == 0) {
536 rm->m_count = ib_dma_map_sg(dev,
537 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
538 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
539 if (rm->m_count == 0) {
540 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); 599 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
541 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 600 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
542 ret = -ENOMEM; /* XXX ? */ 601 ret = -ENOMEM; /* XXX ? */
543 goto out; 602 goto out;
544 } 603 }
545 } else { 604 } else {
546 rm->m_count = 0; 605 rm->data.op_count = 0;
547 } 606 }
548 607
549 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
550 ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
551 rds_message_addref(rm); 608 rds_message_addref(rm);
552 ic->i_rm = rm; 609 ic->i_data_op = &rm->data;
553 610
554 /* Finalize the header */ 611 /* Finalize the header */
555 if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) 612 if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
@@ -559,10 +616,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
559 616
560 /* If it has a RDMA op, tell the peer we did it. This is 617 /* If it has a RDMA op, tell the peer we did it. This is
561 * used by the peer to release use-once RDMA MRs. */ 618 * used by the peer to release use-once RDMA MRs. */
562 if (rm->m_rdma_op) { 619 if (rm->rdma.op_active) {
563 struct rds_ext_header_rdma ext_hdr; 620 struct rds_ext_header_rdma ext_hdr;
564 621
565 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); 622 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
566 rds_message_add_extension(&rm->m_inc.i_hdr, 623 rds_message_add_extension(&rm->m_inc.i_hdr,
567 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); 624 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
568 } 625 }
@@ -582,99 +639,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
582 /* 639 /*
583 * Update adv_credits since we reset the ACK_REQUIRED bit. 640 * Update adv_credits since we reset the ACK_REQUIRED bit.
584 */ 641 */
585 rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); 642 if (ic->i_flowctl) {
586 adv_credits += posted; 643 rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
587 BUG_ON(adv_credits > 255); 644 adv_credits += posted;
645 BUG_ON(adv_credits > 255);
646 }
588 } 647 }
589 648
590 send = &ic->i_sends[pos];
591 first = send;
592 prev = NULL;
593 scat = &rm->m_sg[sg];
594 sent = 0;
595 i = 0;
596
597 /* Sometimes you want to put a fence between an RDMA 649 /* Sometimes you want to put a fence between an RDMA
598 * READ and the following SEND. 650 * READ and the following SEND.
599 * We could either do this all the time 651 * We could either do this all the time
600 * or when requested by the user. Right now, we let 652 * or when requested by the user. Right now, we let
601 * the application choose. 653 * the application choose.
602 */ 654 */
603 if (rm->m_rdma_op && rm->m_rdma_op->r_fence) 655 if (rm->rdma.op_active && rm->rdma.op_fence)
604 send_flags = IB_SEND_FENCE; 656 send_flags = IB_SEND_FENCE;
605 657
606 /* 658 /* Each frag gets a header. Msgs may be 0 bytes */
607 * We could be copying the header into the unused tail of the page. 659 send = &ic->i_sends[pos];
608 * That would need to be changed in the future when those pages might 660 first = send;
609 * be mapped userspace pages or page cache pages. So instead we always 661 prev = NULL;
610 * use a second sge and our long-lived ring of mapped headers. We send 662 scat = &ic->i_data_op->op_sg[sg];
611 * the header after the data so that the data payload can be aligned on 663 i = 0;
612 * the receiver. 664 do {
613 */ 665 unsigned int len = 0;
614 666
615 /* handle a 0-len message */ 667 /* Set up the header */
616 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { 668 send->s_wr.send_flags = send_flags;
617 rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); 669 send->s_wr.opcode = IB_WR_SEND;
618 goto add_header; 670 send->s_wr.num_sge = 1;
619 } 671 send->s_wr.next = NULL;
672 send->s_queued = jiffies;
673 send->s_op = NULL;
620 674
621 /* if there's data reference it with a chain of work reqs */ 675 send->s_sge[0].addr = ic->i_send_hdrs_dma
622 for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { 676 + (pos * sizeof(struct rds_header));
623 unsigned int len; 677 send->s_sge[0].length = sizeof(struct rds_header);
624 678
625 send = &ic->i_sends[pos]; 679 memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
626 680
627 len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); 681 /* Set up the data, if present */
628 rds_ib_xmit_populate_wr(ic, send, pos, 682 if (i < work_alloc
629 ib_sg_dma_address(dev, scat) + off, len, 683 && scat != &rm->data.op_sg[rm->data.op_count]) {
630 send_flags); 684 len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
685 send->s_wr.num_sge = 2;
631 686
632 /* 687 send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
633 * We want to delay signaling completions just enough to get 688 send->s_sge[1].length = len;
634 * the batching benefits but not so much that we create dead time
635 * on the wire.
636 */
637 if (ic->i_unsignaled_wrs-- == 0) {
638 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
639 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
640 }
641 689
642 ic->i_unsignaled_bytes -= len; 690 bytes_sent += len;
643 if (ic->i_unsignaled_bytes <= 0) { 691 off += len;
644 ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; 692 if (off == ib_sg_dma_len(dev, scat)) {
645 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; 693 scat++;
694 off = 0;
695 }
646 } 696 }
647 697
698 rds_ib_set_wr_signal_state(ic, send, 0);
699
648 /* 700 /*
649 * Always signal the last one if we're stopping due to flow control. 701 * Always signal the last one if we're stopping due to flow control.
650 */ 702 */
651 if (flow_controlled && i == (work_alloc-1)) 703 if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
652 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; 704 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
653 705
706 if (send->s_wr.send_flags & IB_SEND_SIGNALED)
707 nr_sig++;
708
654 rdsdebug("send %p wr %p num_sge %u next %p\n", send, 709 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
655 &send->s_wr, send->s_wr.num_sge, send->s_wr.next); 710 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
656 711
657 sent += len; 712 if (ic->i_flowctl && adv_credits) {
658 off += len;
659 if (off == ib_sg_dma_len(dev, scat)) {
660 scat++;
661 off = 0;
662 }
663
664add_header:
665 /* Tack on the header after the data. The header SGE should already
666 * have been set up to point to the right header buffer. */
667 memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
668
669 if (0) {
670 struct rds_header *hdr = &ic->i_send_hdrs[pos];
671
672 printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
673 be16_to_cpu(hdr->h_dport),
674 hdr->h_flags,
675 be32_to_cpu(hdr->h_len));
676 }
677 if (adv_credits) {
678 struct rds_header *hdr = &ic->i_send_hdrs[pos]; 713 struct rds_header *hdr = &ic->i_send_hdrs[pos];
679 714
680 /* add credit and redo the header checksum */ 715 /* add credit and redo the header checksum */
@@ -689,20 +724,25 @@ add_header:
689 prev = send; 724 prev = send;
690 725
691 pos = (pos + 1) % ic->i_send_ring.w_nr; 726 pos = (pos + 1) % ic->i_send_ring.w_nr;
692 } 727 send = &ic->i_sends[pos];
728 i++;
729
730 } while (i < work_alloc
731 && scat != &rm->data.op_sg[rm->data.op_count]);
693 732
694 /* Account the RDS header in the number of bytes we sent, but just once. 733 /* Account the RDS header in the number of bytes we sent, but just once.
695 * The caller has no concept of fragmentation. */ 734 * The caller has no concept of fragmentation. */
696 if (hdr_off == 0) 735 if (hdr_off == 0)
697 sent += sizeof(struct rds_header); 736 bytes_sent += sizeof(struct rds_header);
698 737
699 /* if we finished the message then send completion owns it */ 738 /* if we finished the message then send completion owns it */
700 if (scat == &rm->m_sg[rm->m_count]) { 739 if (scat == &rm->data.op_sg[rm->data.op_count]) {
701 prev->s_rm = ic->i_rm; 740 prev->s_op = ic->i_data_op;
702 prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; 741 prev->s_wr.send_flags |= IB_SEND_SOLICITED;
703 ic->i_rm = NULL; 742 ic->i_data_op = NULL;
704 } 743 }
705 744
745 /* Put back wrs & credits we didn't use */
706 if (i < work_alloc) { 746 if (i < work_alloc) {
707 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); 747 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
708 work_alloc = i; 748 work_alloc = i;
@@ -710,6 +750,9 @@ add_header:
710 if (ic->i_flowctl && i < credit_alloc) 750 if (ic->i_flowctl && i < credit_alloc)
711 rds_ib_send_add_credits(conn, credit_alloc - i); 751 rds_ib_send_add_credits(conn, credit_alloc - i);
712 752
753 if (nr_sig)
754 atomic_add(nr_sig, &ic->i_signaled_sends);
755
713 /* XXX need to worry about failed_wr and partial sends. */ 756 /* XXX need to worry about failed_wr and partial sends. */
714 failed_wr = &first->s_wr; 757 failed_wr = &first->s_wr;
715 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); 758 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
@@ -720,32 +763,127 @@ add_header:
720 printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " 763 printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
721 "returned %d\n", &conn->c_faddr, ret); 764 "returned %d\n", &conn->c_faddr, ret);
722 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 765 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
723 if (prev->s_rm) { 766 rds_ib_sub_signaled(ic, nr_sig);
724 ic->i_rm = prev->s_rm; 767 if (prev->s_op) {
725 prev->s_rm = NULL; 768 ic->i_data_op = prev->s_op;
769 prev->s_op = NULL;
726 } 770 }
727 771
728 rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); 772 rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
729 goto out; 773 goto out;
730 } 774 }
731 775
732 ret = sent; 776 ret = bytes_sent;
733out: 777out:
734 BUG_ON(adv_credits); 778 BUG_ON(adv_credits);
735 return ret; 779 return ret;
736} 780}
737 781
738int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) 782/*
783 * Issue atomic operation.
 784 * A simplified version of the rdma case: we always map 1 SG, and
785 * only 8 bytes, for the return value from the atomic operation.
786 */
787int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
788{
789 struct rds_ib_connection *ic = conn->c_transport_data;
790 struct rds_ib_send_work *send = NULL;
791 struct ib_send_wr *failed_wr;
792 struct rds_ib_device *rds_ibdev;
793 u32 pos;
794 u32 work_alloc;
795 int ret;
796 int nr_sig = 0;
797
798 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
799
800 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
801 if (work_alloc != 1) {
802 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
803 rds_ib_stats_inc(s_ib_tx_ring_full);
804 ret = -ENOMEM;
805 goto out;
806 }
807
808 /* address of send request in ring */
809 send = &ic->i_sends[pos];
810 send->s_queued = jiffies;
811
812 if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
813 send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
814 send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
815 send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
816 send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
817 send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
818 } else { /* FADD */
819 send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
820 send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
821 send->s_wr.wr.atomic.swap = 0;
822 send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
823 send->s_wr.wr.atomic.swap_mask = 0;
824 }
825 nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
826 send->s_wr.num_sge = 1;
827 send->s_wr.next = NULL;
828 send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
829 send->s_wr.wr.atomic.rkey = op->op_rkey;
830 send->s_op = op;
831 rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
832
833 /* map 8 byte retval buffer to the device */
834 ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
835 rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
836 if (ret != 1) {
837 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
838 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
839 ret = -ENOMEM; /* XXX ? */
840 goto out;
841 }
842
843 /* Convert our struct scatterlist to struct ib_sge */
844 send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
845 send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
846 send->s_sge[0].lkey = ic->i_mr->lkey;
847
848 rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
849 send->s_sge[0].addr, send->s_sge[0].length);
850
851 if (nr_sig)
852 atomic_add(nr_sig, &ic->i_signaled_sends);
853
854 failed_wr = &send->s_wr;
855 ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
856 rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
857 send, &send->s_wr, ret, failed_wr);
858 BUG_ON(failed_wr != &send->s_wr);
859 if (ret) {
860 printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
861 "returned %d\n", &conn->c_faddr, ret);
862 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
863 rds_ib_sub_signaled(ic, nr_sig);
864 goto out;
865 }
866
867 if (unlikely(failed_wr != &send->s_wr)) {
868 printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
869 BUG_ON(failed_wr != &send->s_wr);
870 }
871
872out:
873 return ret;
874}
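
For reference, a plain-C model of what the remote side logically does for the two opcodes posted above. The masked-operation details below follow the usual extended-atomics definition and should be read as an assumption rather than a quote of the verbs spec; what is certain from the code is that the prior 64-bit value comes back and lands in the single 8-byte sg entry mapped DMA_FROM_DEVICE:

    #include <stdint.h>

    /* Masked compare-and-swap: compare and swap only the masked bits. */
    uint64_t masked_cmp_swap(uint64_t *remote, uint64_t compare,
                             uint64_t compare_mask, uint64_t swap,
                             uint64_t swap_mask)
    {
            uint64_t old = *remote;

            if ((old & compare_mask) == (compare & compare_mask))
                    *remote = (old & ~swap_mask) | (swap & swap_mask);
            return old;             /* the HCA always returns the prior value */
    }

    /* Fetch-and-add; op_m_fadd.nocarry_mask, which stops carries at field
     * boundaries, is not modelled here. */
    uint64_t fetch_add(uint64_t *remote, uint64_t add)
    {
            uint64_t old = *remote;

            *remote = old + add;
            return old;
    }
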
875
876int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
739{ 877{
740 struct rds_ib_connection *ic = conn->c_transport_data; 878 struct rds_ib_connection *ic = conn->c_transport_data;
741 struct rds_ib_send_work *send = NULL; 879 struct rds_ib_send_work *send = NULL;
742 struct rds_ib_send_work *first; 880 struct rds_ib_send_work *first;
743 struct rds_ib_send_work *prev; 881 struct rds_ib_send_work *prev;
744 struct ib_send_wr *failed_wr; 882 struct ib_send_wr *failed_wr;
745 struct rds_ib_device *rds_ibdev;
746 struct scatterlist *scat; 883 struct scatterlist *scat;
747 unsigned long len; 884 unsigned long len;
748 u64 remote_addr = op->r_remote_addr; 885 u64 remote_addr = op->op_remote_addr;
886 u32 max_sge = ic->rds_ibdev->max_sge;
749 u32 pos; 887 u32 pos;
750 u32 work_alloc; 888 u32 work_alloc;
751 u32 i; 889 u32 i;
@@ -753,29 +891,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
753 int sent; 891 int sent;
754 int ret; 892 int ret;
755 int num_sge; 893 int num_sge;
756 894 int nr_sig = 0;
757 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); 895
758 896 /* map the op the first time we see it */
759 /* map the message the first time we see it */ 897 if (!op->op_mapped) {
760 if (!op->r_mapped) { 898 op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
761 op->r_count = ib_dma_map_sg(ic->i_cm_id->device, 899 op->op_sg, op->op_nents, (op->op_write) ?
762 op->r_sg, op->r_nents, (op->r_write) ? 900 DMA_TO_DEVICE : DMA_FROM_DEVICE);
763 DMA_TO_DEVICE : DMA_FROM_DEVICE); 901 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
764 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); 902 if (op->op_count == 0) {
765 if (op->r_count == 0) {
766 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); 903 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
767 ret = -ENOMEM; /* XXX ? */ 904 ret = -ENOMEM; /* XXX ? */
768 goto out; 905 goto out;
769 } 906 }
770 907
771 op->r_mapped = 1; 908 op->op_mapped = 1;
772 } 909 }
773 910
774 /* 911 /*
775 * Instead of knowing how to return a partial rdma read/write we insist that there 912 * Instead of knowing how to return a partial rdma read/write we insist that there
776 * be enough work requests to send the entire message. 913 * be enough work requests to send the entire message.
777 */ 914 */
778 i = ceil(op->r_count, rds_ibdev->max_sge); 915 i = ceil(op->op_count, max_sge);
779 916
780 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); 917 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
781 if (work_alloc != i) { 918 if (work_alloc != i) {
@@ -788,30 +925,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
788 send = &ic->i_sends[pos]; 925 send = &ic->i_sends[pos];
789 first = send; 926 first = send;
790 prev = NULL; 927 prev = NULL;
791 scat = &op->r_sg[0]; 928 scat = &op->op_sg[0];
792 sent = 0; 929 sent = 0;
793 num_sge = op->r_count; 930 num_sge = op->op_count;
794 931
795 for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { 932 for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
796 send->s_wr.send_flags = 0; 933 send->s_wr.send_flags = 0;
797 send->s_queued = jiffies; 934 send->s_queued = jiffies;
798 /* 935 send->s_op = NULL;
799 * We want to delay signaling completions just enough to get 936
800 * the batching benefits but not so much that we create dead time on the wire. 937 nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
801 */
802 if (ic->i_unsignaled_wrs-- == 0) {
803 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
804 send->s_wr.send_flags = IB_SEND_SIGNALED;
805 }
806 938
807 send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; 939 send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
808 send->s_wr.wr.rdma.remote_addr = remote_addr; 940 send->s_wr.wr.rdma.remote_addr = remote_addr;
809 send->s_wr.wr.rdma.rkey = op->r_key; 941 send->s_wr.wr.rdma.rkey = op->op_rkey;
810 send->s_op = op;
811 942
812 if (num_sge > rds_ibdev->max_sge) { 943 if (num_sge > max_sge) {
813 send->s_wr.num_sge = rds_ibdev->max_sge; 944 send->s_wr.num_sge = max_sge;
814 num_sge -= rds_ibdev->max_sge; 945 num_sge -= max_sge;
815 } else { 946 } else {
816 send->s_wr.num_sge = num_sge; 947 send->s_wr.num_sge = num_sge;
817 } 948 }
@@ -821,7 +952,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
821 if (prev) 952 if (prev)
822 prev->s_wr.next = &send->s_wr; 953 prev->s_wr.next = &send->s_wr;
823 954
824 for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { 955 for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
825 len = ib_sg_dma_len(ic->i_cm_id->device, scat); 956 len = ib_sg_dma_len(ic->i_cm_id->device, scat);
826 send->s_sge[j].addr = 957 send->s_sge[j].addr =
827 ib_sg_dma_address(ic->i_cm_id->device, scat); 958 ib_sg_dma_address(ic->i_cm_id->device, scat);
@@ -843,15 +974,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
843 send = ic->i_sends; 974 send = ic->i_sends;
844 } 975 }
845 976
846 /* if we finished the message then send completion owns it */ 977 /* give a reference to the last op */
847 if (scat == &op->r_sg[op->r_count]) 978 if (scat == &op->op_sg[op->op_count]) {
848 prev->s_wr.send_flags = IB_SEND_SIGNALED; 979 prev->s_op = op;
980 rds_message_addref(container_of(op, struct rds_message, rdma));
981 }
849 982
850 if (i < work_alloc) { 983 if (i < work_alloc) {
851 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); 984 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
852 work_alloc = i; 985 work_alloc = i;
853 } 986 }
854 987
988 if (nr_sig)
989 atomic_add(nr_sig, &ic->i_signaled_sends);
990
855 failed_wr = &first->s_wr; 991 failed_wr = &first->s_wr;
856 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); 992 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
857 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, 993 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
@@ -861,6 +997,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
861 printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " 997 printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
862 "returned %d\n", &conn->c_faddr, ret); 998 "returned %d\n", &conn->c_faddr, ret);
863 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 999 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
1000 rds_ib_sub_signaled(ic, nr_sig);
864 goto out; 1001 goto out;
865 } 1002 }
866 1003
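
A small worked example of the sizing rule rds_ib_xmit_rdma() applies above: rather than coping with a partially posted RDMA, it allocates ceil(op_count / max_sge) ring slots up front so the whole operation either posts or is unwound (the max_sge figure below is made up):

    #include <stdio.h>

    /* WRs needed to carry op_count SG entries at max_sge SGEs per WR. */
    static unsigned int wrs_needed(unsigned int op_count, unsigned int max_sge)
    {
            return (op_count + max_sge - 1) / max_sge;      /* ceil() */
    }

    int main(void)
    {
            /* 70 mapped entries with max_sge 32 -> 3 WRs: 32 + 32 + 6 */
            printf("%u\n", wrs_needed(70, 32));
            return 0;
    }
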
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index d2c904dd6fbc..2d5965d6e97c 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = {
67 "ib_rdma_mr_pool_flush", 67 "ib_rdma_mr_pool_flush",
68 "ib_rdma_mr_pool_wait", 68 "ib_rdma_mr_pool_wait",
69 "ib_rdma_mr_pool_depleted", 69 "ib_rdma_mr_pool_depleted",
70 "ib_atomic_cswp",
71 "ib_atomic_fadd",
70}; 72};
71 73
72unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, 74unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
index 03f01cb4e0fe..1253b006efdb 100644
--- a/net/rds/ib_sysctl.c
+++ b/net/rds/ib_sysctl.c
@@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
49static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; 49static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
50static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; 50static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
51 51
52unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
53static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
54static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
55
56/* 52/*
57 * This sysctl does nothing. 53 * This sysctl does nothing.
58 * 54 *
@@ -65,7 +61,7 @@ static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
65 */ 61 */
66unsigned int rds_ib_sysctl_flow_control = 0; 62unsigned int rds_ib_sysctl_flow_control = 0;
67 63
68ctl_table rds_ib_sysctl_table[] = { 64static ctl_table rds_ib_sysctl_table[] = {
69 { 65 {
70 .procname = "max_send_wr", 66 .procname = "max_send_wr",
71 .data = &rds_ib_sysctl_max_send_wr, 67 .data = &rds_ib_sysctl_max_send_wr,
@@ -94,15 +90,6 @@ ctl_table rds_ib_sysctl_table[] = {
94 .extra2 = &rds_ib_sysctl_max_unsig_wr_max, 90 .extra2 = &rds_ib_sysctl_max_unsig_wr_max,
95 }, 91 },
96 { 92 {
97 .procname = "max_unsignaled_bytes",
98 .data = &rds_ib_sysctl_max_unsig_bytes,
99 .maxlen = sizeof(unsigned long),
100 .mode = 0644,
101 .proc_handler = proc_doulongvec_minmax,
102 .extra1 = &rds_ib_sysctl_max_unsig_bytes_min,
103 .extra2 = &rds_ib_sysctl_max_unsig_bytes_max,
104 },
105 {
106 .procname = "max_recv_allocation", 93 .procname = "max_recv_allocation",
107 .data = &rds_ib_sysctl_max_recv_allocation, 94 .data = &rds_ib_sysctl_max_recv_allocation,
108 .maxlen = sizeof(unsigned long), 95 .maxlen = sizeof(unsigned long),
@@ -132,10 +119,10 @@ void rds_ib_sysctl_exit(void)
132 unregister_sysctl_table(rds_ib_sysctl_hdr); 119 unregister_sysctl_table(rds_ib_sysctl_hdr);
133} 120}
134 121
135int __init rds_ib_sysctl_init(void) 122int rds_ib_sysctl_init(void)
136{ 123{
137 rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); 124 rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
138 if (rds_ib_sysctl_hdr == NULL) 125 if (!rds_ib_sysctl_hdr)
139 return -ENOMEM; 126 return -ENOMEM;
140 return 0; 127 return 0;
141} 128}
diff --git a/net/rds/info.c b/net/rds/info.c
index c45c4173a44d..4fdf1b6e84ff 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func)
76 BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); 76 BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
77 77
78 spin_lock(&rds_info_lock); 78 spin_lock(&rds_info_lock);
79 BUG_ON(rds_info_funcs[offset] != NULL); 79 BUG_ON(rds_info_funcs[offset]);
80 rds_info_funcs[offset] = func; 80 rds_info_funcs[offset] = func;
81 spin_unlock(&rds_info_lock); 81 spin_unlock(&rds_info_lock);
82} 82}
@@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func);
102 */ 102 */
103void rds_info_iter_unmap(struct rds_info_iterator *iter) 103void rds_info_iter_unmap(struct rds_info_iterator *iter)
104{ 104{
105 if (iter->addr != NULL) { 105 if (iter->addr) {
106 kunmap_atomic(iter->addr, KM_USER0); 106 kunmap_atomic(iter->addr, KM_USER0);
107 iter->addr = NULL; 107 iter->addr = NULL;
108 } 108 }
@@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
117 unsigned long this; 117 unsigned long this;
118 118
119 while (bytes) { 119 while (bytes) {
120 if (iter->addr == NULL) 120 if (!iter->addr)
121 iter->addr = kmap_atomic(*iter->pages, KM_USER0); 121 iter->addr = kmap_atomic(*iter->pages, KM_USER0);
122 122
123 this = min(bytes, PAGE_SIZE - iter->offset); 123 this = min(bytes, PAGE_SIZE - iter->offset);
@@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
188 >> PAGE_SHIFT; 188 >> PAGE_SHIFT;
189 189
190 pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); 190 pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
191 if (pages == NULL) { 191 if (!pages) {
192 ret = -ENOMEM; 192 ret = -ENOMEM;
193 goto out; 193 goto out;
194 } 194 }
@@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
206 206
207call_func: 207call_func:
208 func = rds_info_funcs[optname - RDS_INFO_FIRST]; 208 func = rds_info_funcs[optname - RDS_INFO_FIRST];
209 if (func == NULL) { 209 if (!func) {
210 ret = -ENOPROTOOPT; 210 ret = -ENOPROTOOPT;
211 goto out; 211 goto out;
212 } 212 }
@@ -234,7 +234,7 @@ call_func:
234 ret = -EFAULT; 234 ret = -EFAULT;
235 235
236out: 236out:
237 for (i = 0; pages != NULL && i < nr_pages; i++) 237 for (i = 0; pages && i < nr_pages; i++)
238 put_page(pages[i]); 238 put_page(pages[i]);
239 kfree(pages); 239 kfree(pages);
240 240
diff --git a/net/rds/iw.c b/net/rds/iw.c
index c8f3d3525cb9..f7474844f096 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -56,7 +56,7 @@ struct list_head rds_iw_devices;
56DEFINE_SPINLOCK(iw_nodev_conns_lock); 56DEFINE_SPINLOCK(iw_nodev_conns_lock);
57LIST_HEAD(iw_nodev_conns); 57LIST_HEAD(iw_nodev_conns);
58 58
59void rds_iw_add_one(struct ib_device *device) 59static void rds_iw_add_one(struct ib_device *device)
60{ 60{
61 struct rds_iw_device *rds_iwdev; 61 struct rds_iw_device *rds_iwdev;
62 struct ib_device_attr *dev_attr; 62 struct ib_device_attr *dev_attr;
@@ -124,7 +124,7 @@ free_attr:
124 kfree(dev_attr); 124 kfree(dev_attr);
125} 125}
126 126
127void rds_iw_remove_one(struct ib_device *device) 127static void rds_iw_remove_one(struct ib_device *device)
128{ 128{
129 struct rds_iw_device *rds_iwdev; 129 struct rds_iw_device *rds_iwdev;
130 struct rds_iw_cm_id *i_cm_id, *next; 130 struct rds_iw_cm_id *i_cm_id, *next;
@@ -226,7 +226,7 @@ static int rds_iw_laddr_check(__be32 addr)
226 /* Create a CMA ID and try to bind it. This catches both 226 /* Create a CMA ID and try to bind it. This catches both
227 * IB and iWARP capable NICs. 227 * IB and iWARP capable NICs.
228 */ 228 */
229 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); 229 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
230 if (IS_ERR(cm_id)) 230 if (IS_ERR(cm_id))
231 return PTR_ERR(cm_id); 231 return PTR_ERR(cm_id);
232 232
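
The comment in this hunk ("Create a CMA ID and try to bind it...") describes a probe pattern without showing it end to end. A hedged sketch of that pattern, with a hypothetical laddr_probe() standing in for rds_iw_laddr_check() and error handling trimmed:

#include <linux/in.h>
#include <linux/errno.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>

static int laddr_probe(__be32 addr)
{
	struct rdma_cm_id *cm_id;
	struct sockaddr_in sin;
	int ret;

	/* same call as in the hunk: TCP port space, RC QP type */
	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id))
		return PTR_ERR(cm_id);

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = addr;

	/* rdma_bind_addr() resolves the address to a local RDMA device, if any */
	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);

	/* an address that bound to a non-iWARP device is still unusable here */
	if (ret == 0 && (!cm_id->device ||
			 cm_id->device->node_type != RDMA_NODE_RNIC))
		ret = -EADDRNOTAVAIL;

	rdma_destroy_id(cm_id);
	return ret;
}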
@@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = {
264 .laddr_check = rds_iw_laddr_check, 264 .laddr_check = rds_iw_laddr_check,
265 .xmit_complete = rds_iw_xmit_complete, 265 .xmit_complete = rds_iw_xmit_complete,
266 .xmit = rds_iw_xmit, 266 .xmit = rds_iw_xmit,
267 .xmit_cong_map = NULL,
268 .xmit_rdma = rds_iw_xmit_rdma, 267 .xmit_rdma = rds_iw_xmit_rdma,
269 .recv = rds_iw_recv, 268 .recv = rds_iw_recv,
270 .conn_alloc = rds_iw_conn_alloc, 269 .conn_alloc = rds_iw_conn_alloc,
@@ -272,7 +271,6 @@ struct rds_transport rds_iw_transport = {
272 .conn_connect = rds_iw_conn_connect, 271 .conn_connect = rds_iw_conn_connect,
273 .conn_shutdown = rds_iw_conn_shutdown, 272 .conn_shutdown = rds_iw_conn_shutdown,
274 .inc_copy_to_user = rds_iw_inc_copy_to_user, 273 .inc_copy_to_user = rds_iw_inc_copy_to_user,
275 .inc_purge = rds_iw_inc_purge,
276 .inc_free = rds_iw_inc_free, 274 .inc_free = rds_iw_inc_free,
277 .cm_initiate_connect = rds_iw_cm_initiate_connect, 275 .cm_initiate_connect = rds_iw_cm_initiate_connect,
278 .cm_handle_connect = rds_iw_cm_handle_connect, 276 .cm_handle_connect = rds_iw_cm_handle_connect,
@@ -289,7 +287,7 @@ struct rds_transport rds_iw_transport = {
289 .t_prefer_loopback = 1, 287 .t_prefer_loopback = 1,
290}; 288};
291 289
292int __init rds_iw_init(void) 290int rds_iw_init(void)
293{ 291{
294 int ret; 292 int ret;
295 293
diff --git a/net/rds/iw.h b/net/rds/iw.h
index eef2f0c28476..90151922178c 100644
--- a/net/rds/iw.h
+++ b/net/rds/iw.h
@@ -70,7 +70,7 @@ struct rds_iw_send_work {
70 struct rds_message *s_rm; 70 struct rds_message *s_rm;
71 71
72 /* We should really put these into a union: */ 72 /* We should really put these into a union: */
73 struct rds_rdma_op *s_op; 73 struct rm_rdma_op *s_op;
74 struct rds_iw_mapping *s_mapping; 74 struct rds_iw_mapping *s_mapping;
75 struct ib_mr *s_mr; 75 struct ib_mr *s_mr;
76 struct ib_fast_reg_page_list *s_page_list; 76 struct ib_fast_reg_page_list *s_page_list;
@@ -268,8 +268,6 @@ static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
268 268
269/* ib.c */ 269/* ib.c */
270extern struct rds_transport rds_iw_transport; 270extern struct rds_transport rds_iw_transport;
271extern void rds_iw_add_one(struct ib_device *device);
272extern void rds_iw_remove_one(struct ib_device *device);
273extern struct ib_client rds_iw_client; 271extern struct ib_client rds_iw_client;
274 272
275extern unsigned int fastreg_pool_size; 273extern unsigned int fastreg_pool_size;
@@ -284,7 +282,7 @@ void rds_iw_conn_free(void *arg);
284int rds_iw_conn_connect(struct rds_connection *conn); 282int rds_iw_conn_connect(struct rds_connection *conn);
285void rds_iw_conn_shutdown(struct rds_connection *conn); 283void rds_iw_conn_shutdown(struct rds_connection *conn);
286void rds_iw_state_change(struct sock *sk); 284void rds_iw_state_change(struct sock *sk);
287int __init rds_iw_listen_init(void); 285int rds_iw_listen_init(void);
288void rds_iw_listen_stop(void); 286void rds_iw_listen_stop(void);
289void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); 287void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
290int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, 288int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -318,15 +316,13 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
318void rds_iw_sync_mr(void *trans_private, int dir); 316void rds_iw_sync_mr(void *trans_private, int dir);
319void rds_iw_free_mr(void *trans_private, int invalidate); 317void rds_iw_free_mr(void *trans_private, int invalidate);
320void rds_iw_flush_mrs(void); 318void rds_iw_flush_mrs(void);
321void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
322 319
323/* ib_recv.c */ 320/* ib_recv.c */
324int __init rds_iw_recv_init(void); 321int rds_iw_recv_init(void);
325void rds_iw_recv_exit(void); 322void rds_iw_recv_exit(void);
326int rds_iw_recv(struct rds_connection *conn); 323int rds_iw_recv(struct rds_connection *conn);
327int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, 324int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
328 gfp_t page_gfp, int prefill); 325 gfp_t page_gfp, int prefill);
329void rds_iw_inc_purge(struct rds_incoming *inc);
330void rds_iw_inc_free(struct rds_incoming *inc); 326void rds_iw_inc_free(struct rds_incoming *inc);
331int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, 327int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
332 size_t size); 328 size_t size);
@@ -358,7 +354,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
358void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); 354void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
359void rds_iw_send_init_ring(struct rds_iw_connection *ic); 355void rds_iw_send_init_ring(struct rds_iw_connection *ic);
360void rds_iw_send_clear_ring(struct rds_iw_connection *ic); 356void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
361int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); 357int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
362void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); 358void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
363void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); 359void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
364int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, 360int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
@@ -371,7 +367,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
371 unsigned int avail); 367 unsigned int avail);
372 368
373/* ib_sysctl.c */ 369/* ib_sysctl.c */
374int __init rds_iw_sysctl_init(void); 370int rds_iw_sysctl_init(void);
375void rds_iw_sysctl_exit(void); 371void rds_iw_sysctl_exit(void);
376extern unsigned long rds_iw_sysctl_max_send_wr; 372extern unsigned long rds_iw_sysctl_max_send_wr;
377extern unsigned long rds_iw_sysctl_max_recv_wr; 373extern unsigned long rds_iw_sysctl_max_recv_wr;
@@ -379,7 +375,6 @@ extern unsigned long rds_iw_sysctl_max_unsig_wrs;
379extern unsigned long rds_iw_sysctl_max_unsig_bytes; 375extern unsigned long rds_iw_sysctl_max_unsig_bytes;
380extern unsigned long rds_iw_sysctl_max_recv_allocation; 376extern unsigned long rds_iw_sysctl_max_recv_allocation;
381extern unsigned int rds_iw_sysctl_flow_control; 377extern unsigned int rds_iw_sysctl_flow_control;
382extern ctl_table rds_iw_sysctl_table[];
383 378
384/* 379/*
385 * Helper functions for getting/setting the header and data SGEs in 380 * Helper functions for getting/setting the header and data SGEs in
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index b5dd6ac39be8..c12db66f24c7 100644
--- a/net/rds/iw_cm.c
+++ b/net/rds/iw_cm.c
@@ -181,7 +181,7 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
181 unsigned int send_size, recv_size; 181 unsigned int send_size, recv_size;
182 int ret; 182 int ret;
183 183
184 /* The offset of 1 is to accomodate the additional ACK WR. */ 184 /* The offset of 1 is to accommodate the additional ACK WR. */
185 send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); 185 send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
186 recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); 186 recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
187 rds_iw_ring_resize(send_ring, send_size - 1); 187 rds_iw_ring_resize(send_ring, send_size - 1);
@@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
257 * the rds_iwdev at all. 257 * the rds_iwdev at all.
258 */ 258 */
259 rds_iwdev = ib_get_client_data(dev, &rds_iw_client); 259 rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
260 if (rds_iwdev == NULL) { 260 if (!rds_iwdev) {
261 if (printk_ratelimit()) 261 if (printk_ratelimit())
262 printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", 262 printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
263 dev->name); 263 dev->name);
@@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
292 ic->i_send_ring.w_nr * 292 ic->i_send_ring.w_nr *
293 sizeof(struct rds_header), 293 sizeof(struct rds_header),
294 &ic->i_send_hdrs_dma, GFP_KERNEL); 294 &ic->i_send_hdrs_dma, GFP_KERNEL);
295 if (ic->i_send_hdrs == NULL) { 295 if (!ic->i_send_hdrs) {
296 ret = -ENOMEM; 296 ret = -ENOMEM;
297 rdsdebug("ib_dma_alloc_coherent send failed\n"); 297 rdsdebug("ib_dma_alloc_coherent send failed\n");
298 goto out; 298 goto out;
@@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
302 ic->i_recv_ring.w_nr * 302 ic->i_recv_ring.w_nr *
303 sizeof(struct rds_header), 303 sizeof(struct rds_header),
304 &ic->i_recv_hdrs_dma, GFP_KERNEL); 304 &ic->i_recv_hdrs_dma, GFP_KERNEL);
305 if (ic->i_recv_hdrs == NULL) { 305 if (!ic->i_recv_hdrs) {
306 ret = -ENOMEM; 306 ret = -ENOMEM;
307 rdsdebug("ib_dma_alloc_coherent recv failed\n"); 307 rdsdebug("ib_dma_alloc_coherent recv failed\n");
308 goto out; 308 goto out;
@@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
310 310
311 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), 311 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
312 &ic->i_ack_dma, GFP_KERNEL); 312 &ic->i_ack_dma, GFP_KERNEL);
313 if (ic->i_ack == NULL) { 313 if (!ic->i_ack) {
314 ret = -ENOMEM; 314 ret = -ENOMEM;
315 rdsdebug("ib_dma_alloc_coherent ack failed\n"); 315 rdsdebug("ib_dma_alloc_coherent ack failed\n");
316 goto out; 316 goto out;
317 } 317 }
318 318
319 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); 319 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
320 if (ic->i_sends == NULL) { 320 if (!ic->i_sends) {
321 ret = -ENOMEM; 321 ret = -ENOMEM;
322 rdsdebug("send allocation failed\n"); 322 rdsdebug("send allocation failed\n");
323 goto out; 323 goto out;
@@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
325 rds_iw_send_init_ring(ic); 325 rds_iw_send_init_ring(ic);
326 326
327 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); 327 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
328 if (ic->i_recvs == NULL) { 328 if (!ic->i_recvs) {
329 ret = -ENOMEM; 329 ret = -ENOMEM;
330 rdsdebug("recv allocation failed\n"); 330 rdsdebug("recv allocation failed\n");
331 goto out; 331 goto out;
@@ -522,7 +522,7 @@ int rds_iw_conn_connect(struct rds_connection *conn)
522 /* XXX I wonder what effect the port space has */ 522 /* XXX I wonder what effect the port space has */
523 /* delegate cm event handler to rdma_transport */ 523 /* delegate cm event handler to rdma_transport */
524 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, 524 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
525 RDMA_PS_TCP); 525 RDMA_PS_TCP, IB_QPT_RC);
526 if (IS_ERR(ic->i_cm_id)) { 526 if (IS_ERR(ic->i_cm_id)) {
527 ret = PTR_ERR(ic->i_cm_id); 527 ret = PTR_ERR(ic->i_cm_id);
528 ic->i_cm_id = NULL; 528 ic->i_cm_id = NULL;
@@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
696 696
697 /* XXX too lazy? */ 697 /* XXX too lazy? */
698 ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); 698 ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
699 if (ic == NULL) 699 if (!ic)
700 return -ENOMEM; 700 return -ENOMEM;
701 701
702 INIT_LIST_HEAD(&ic->iw_node); 702 INIT_LIST_HEAD(&ic->iw_node);
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
index 13dc1862d862..6deaa77495e3 100644
--- a/net/rds/iw_rdma.c
+++ b/net/rds/iw_rdma.c
@@ -34,7 +34,6 @@
34#include <linux/slab.h> 34#include <linux/slab.h>
35 35
36#include "rds.h" 36#include "rds.h"
37#include "rdma.h"
38#include "iw.h" 37#include "iw.h"
39 38
40 39
@@ -123,7 +122,7 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd
123#else 122#else
124 /* FIXME - needs to compare the local and remote 123 /* FIXME - needs to compare the local and remote
125 * ipaddr/port tuple, but the ipaddr is the only 124 * ipaddr/port tuple, but the ipaddr is the only
126 * available infomation in the rds_sock (as the rest are 125 * available information in the rds_sock (as the rest are
127 * zero'ed. It doesn't appear to be properly populated 126 * zero'ed. It doesn't appear to be properly populated
128 * during connection setup... 127 * during connection setup...
129 */ 128 */
@@ -158,7 +157,8 @@ static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *
158 return 0; 157 return 0;
159} 158}
160 159
161void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) 160static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
161 struct rdma_cm_id *cm_id)
162{ 162{
163 struct rds_iw_cm_id *i_cm_id; 163 struct rds_iw_cm_id *i_cm_id;
164 164
@@ -207,9 +207,9 @@ void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *con
207 BUG_ON(list_empty(&ic->iw_node)); 207 BUG_ON(list_empty(&ic->iw_node));
208 list_del(&ic->iw_node); 208 list_del(&ic->iw_node);
209 209
210 spin_lock_irq(&rds_iwdev->spinlock); 210 spin_lock(&rds_iwdev->spinlock);
211 list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); 211 list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
212 spin_unlock_irq(&rds_iwdev->spinlock); 212 spin_unlock(&rds_iwdev->spinlock);
213 spin_unlock_irq(&iw_nodev_conns_lock); 213 spin_unlock_irq(&iw_nodev_conns_lock);
214 214
215 ic->rds_iwdev = rds_iwdev; 215 ic->rds_iwdev = rds_iwdev;
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
index 3d479067d54d..5e57347f49ff 100644
--- a/net/rds/iw_recv.c
+++ b/net/rds/iw_recv.c
@@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
53static void rds_iw_frag_free(struct rds_page_frag *frag) 53static void rds_iw_frag_free(struct rds_page_frag *frag)
54{ 54{
55 rdsdebug("frag %p page %p\n", frag, frag->f_page); 55 rdsdebug("frag %p page %p\n", frag, frag->f_page);
56 BUG_ON(frag->f_page != NULL); 56 BUG_ON(frag->f_page);
57 kmem_cache_free(rds_iw_frag_slab, frag); 57 kmem_cache_free(rds_iw_frag_slab, frag);
58} 58}
59 59
@@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
143 struct ib_sge *sge; 143 struct ib_sge *sge;
144 int ret = -ENOMEM; 144 int ret = -ENOMEM;
145 145
146 if (recv->r_iwinc == NULL) { 146 if (!recv->r_iwinc) {
147 if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { 147 if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
148 rds_iw_stats_inc(s_iw_rx_alloc_limit); 148 rds_iw_stats_inc(s_iw_rx_alloc_limit);
149 goto out; 149 goto out;
150 } 150 }
151 recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, 151 recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
152 kptr_gfp); 152 kptr_gfp);
153 if (recv->r_iwinc == NULL) { 153 if (!recv->r_iwinc) {
154 atomic_dec(&rds_iw_allocation); 154 atomic_dec(&rds_iw_allocation);
155 goto out; 155 goto out;
156 } 156 }
@@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
158 rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); 158 rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
159 } 159 }
160 160
161 if (recv->r_frag == NULL) { 161 if (!recv->r_frag) {
162 recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); 162 recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
163 if (recv->r_frag == NULL) 163 if (!recv->r_frag)
164 goto out; 164 goto out;
165 INIT_LIST_HEAD(&recv->r_frag->f_item); 165 INIT_LIST_HEAD(&recv->r_frag->f_item);
166 recv->r_frag->f_page = NULL; 166 recv->r_frag->f_page = NULL;
167 } 167 }
168 168
169 if (ic->i_frag.f_page == NULL) { 169 if (!ic->i_frag.f_page) {
170 ic->i_frag.f_page = alloc_page(page_gfp); 170 ic->i_frag.f_page = alloc_page(page_gfp);
171 if (ic->i_frag.f_page == NULL) 171 if (!ic->i_frag.f_page)
172 goto out; 172 goto out;
173 ic->i_frag.f_offset = 0; 173 ic->i_frag.f_offset = 0;
174 } 174 }
@@ -273,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
273 return ret; 273 return ret;
274} 274}
275 275
276void rds_iw_inc_purge(struct rds_incoming *inc) 276static void rds_iw_inc_purge(struct rds_incoming *inc)
277{ 277{
278 struct rds_iw_incoming *iwinc; 278 struct rds_iw_incoming *iwinc;
279 struct rds_page_frag *frag; 279 struct rds_page_frag *frag;
@@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn,
716 * into the inc and save the inc so we can hang upcoming fragments 716 * into the inc and save the inc so we can hang upcoming fragments
717 * off its list. 717 * off its list.
718 */ 718 */
719 if (iwinc == NULL) { 719 if (!iwinc) {
720 iwinc = recv->r_iwinc; 720 iwinc = recv->r_iwinc;
721 recv->r_iwinc = NULL; 721 recv->r_iwinc = NULL;
722 ic->i_iwinc = iwinc; 722 ic->i_iwinc = iwinc;
@@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn)
887 return ret; 887 return ret;
888} 888}
889 889
890int __init rds_iw_recv_init(void) 890int rds_iw_recv_init(void)
891{ 891{
892 struct sysinfo si; 892 struct sysinfo si;
893 int ret = -ENOMEM; 893 int ret = -ENOMEM;
@@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void)
899 rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", 899 rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
900 sizeof(struct rds_iw_incoming), 900 sizeof(struct rds_iw_incoming),
901 0, 0, NULL); 901 0, 0, NULL);
902 if (rds_iw_incoming_slab == NULL) 902 if (!rds_iw_incoming_slab)
903 goto out; 903 goto out;
904 904
905 rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", 905 rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
906 sizeof(struct rds_page_frag), 906 sizeof(struct rds_page_frag),
907 0, 0, NULL); 907 0, 0, NULL);
908 if (rds_iw_frag_slab == NULL) 908 if (!rds_iw_frag_slab)
909 kmem_cache_destroy(rds_iw_incoming_slab); 909 kmem_cache_destroy(rds_iw_incoming_slab);
910 else 910 else
911 ret = 0; 911 ret = 0;
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 52182ff7519e..545d8ee3efb1 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -36,7 +36,6 @@
36#include <linux/dmapool.h> 36#include <linux/dmapool.h>
37 37
38#include "rds.h" 38#include "rds.h"
39#include "rdma.h"
40#include "iw.h" 39#include "iw.h"
41 40
42static void rds_iw_send_rdma_complete(struct rds_message *rm, 41static void rds_iw_send_rdma_complete(struct rds_message *rm,
@@ -64,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm,
64} 63}
65 64
66static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, 65static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
67 struct rds_rdma_op *op) 66 struct rm_rdma_op *op)
68{ 67{
69 if (op->r_mapped) { 68 if (op->op_mapped) {
70 ib_dma_unmap_sg(ic->i_cm_id->device, 69 ib_dma_unmap_sg(ic->i_cm_id->device,
71 op->r_sg, op->r_nents, 70 op->op_sg, op->op_nents,
72 op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 71 op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
73 op->r_mapped = 0; 72 op->op_mapped = 0;
74 } 73 }
75} 74}
76 75
@@ -83,11 +82,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
83 rdsdebug("ic %p send %p rm %p\n", ic, send, rm); 82 rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
84 83
85 ib_dma_unmap_sg(ic->i_cm_id->device, 84 ib_dma_unmap_sg(ic->i_cm_id->device,
86 rm->m_sg, rm->m_nents, 85 rm->data.op_sg, rm->data.op_nents,
87 DMA_TO_DEVICE); 86 DMA_TO_DEVICE);
88 87
89 if (rm->m_rdma_op != NULL) { 88 if (rm->rdma.op_active) {
90 rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); 89 rds_iw_send_unmap_rdma(ic, &rm->rdma);
91 90
92 /* If the user asked for a completion notification on this 91 /* If the user asked for a completion notification on this
93 * message, we can implement three different semantics: 92 * message, we can implement three different semantics:
@@ -111,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
111 */ 110 */
112 rds_iw_send_rdma_complete(rm, wc_status); 111 rds_iw_send_rdma_complete(rm, wc_status);
113 112
114 if (rm->m_rdma_op->r_write) 113 if (rm->rdma.op_write)
115 rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); 114 rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
116 else 115 else
117 rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); 116 rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
118 } 117 }
119 118
120 /* If anyone waited for this message to get flushed out, wake 119 /* If anyone waited for this message to get flushed out, wake
@@ -308,7 +307,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
308 * 307 *
309 * Conceptually, we have two counters: 308 * Conceptually, we have two counters:
310 * - send credits: this tells us how many WRs we're allowed 309 * - send credits: this tells us how many WRs we're allowed
311 * to submit without overrunning the reciever's queue. For 310 * to submit without overrunning the receiver's queue. For
312 * each SEND WR we post, we decrement this by one. 311 * each SEND WR we post, we decrement this by one.
313 * 312 *
314 * - posted credits: this tells us how many WRs we recently 313 * - posted credits: this tells us how many WRs we recently
@@ -556,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
556 } 555 }
557 556
558 /* map the message the first time we see it */ 557 /* map the message the first time we see it */
559 if (ic->i_rm == NULL) { 558 if (!ic->i_rm) {
560 /* 559 /*
561 printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", 560 printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
562 be16_to_cpu(rm->m_inc.i_hdr.h_dport), 561 be16_to_cpu(rm->m_inc.i_hdr.h_dport),
563 rm->m_inc.i_hdr.h_flags, 562 rm->m_inc.i_hdr.h_flags,
564 be32_to_cpu(rm->m_inc.i_hdr.h_len)); 563 be32_to_cpu(rm->m_inc.i_hdr.h_len));
565 */ 564 */
566 if (rm->m_nents) { 565 if (rm->data.op_nents) {
567 rm->m_count = ib_dma_map_sg(dev, 566 rm->data.op_count = ib_dma_map_sg(dev,
568 rm->m_sg, rm->m_nents, DMA_TO_DEVICE); 567 rm->data.op_sg,
569 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); 568 rm->data.op_nents,
570 if (rm->m_count == 0) { 569 DMA_TO_DEVICE);
570 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
571 if (rm->data.op_count == 0) {
571 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); 572 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
572 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); 573 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
573 ret = -ENOMEM; /* XXX ? */ 574 ret = -ENOMEM; /* XXX ? */
574 goto out; 575 goto out;
575 } 576 }
576 } else { 577 } else {
577 rm->m_count = 0; 578 rm->data.op_count = 0;
578 } 579 }
579 580
580 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; 581 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
@@ -590,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
590 591
591 /* If it has a RDMA op, tell the peer we did it. This is 592 /* If it has a RDMA op, tell the peer we did it. This is
592 * used by the peer to release use-once RDMA MRs. */ 593 * used by the peer to release use-once RDMA MRs. */
593 if (rm->m_rdma_op) { 594 if (rm->rdma.op_active) {
594 struct rds_ext_header_rdma ext_hdr; 595 struct rds_ext_header_rdma ext_hdr;
595 596
596 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); 597 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
597 rds_message_add_extension(&rm->m_inc.i_hdr, 598 rds_message_add_extension(&rm->m_inc.i_hdr,
598 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); 599 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
599 } 600 }
@@ -621,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
621 send = &ic->i_sends[pos]; 622 send = &ic->i_sends[pos];
622 first = send; 623 first = send;
623 prev = NULL; 624 prev = NULL;
624 scat = &rm->m_sg[sg]; 625 scat = &rm->data.op_sg[sg];
625 sent = 0; 626 sent = 0;
626 i = 0; 627 i = 0;
627 628
@@ -631,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
631 * or when requested by the user. Right now, we let 632 * or when requested by the user. Right now, we let
632 * the application choose. 633 * the application choose.
633 */ 634 */
634 if (rm->m_rdma_op && rm->m_rdma_op->r_fence) 635 if (rm->rdma.op_active && rm->rdma.op_fence)
635 send_flags = IB_SEND_FENCE; 636 send_flags = IB_SEND_FENCE;
636 637
637 /* 638 /*
@@ -650,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
650 } 651 }
651 652
652 /* if there's data reference it with a chain of work reqs */ 653 /* if there's data reference it with a chain of work reqs */
653 for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { 654 for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
654 unsigned int len; 655 unsigned int len;
655 656
656 send = &ic->i_sends[pos]; 657 send = &ic->i_sends[pos];
@@ -728,7 +729,7 @@ add_header:
728 sent += sizeof(struct rds_header); 729 sent += sizeof(struct rds_header);
729 730
730 /* if we finished the message then send completion owns it */ 731 /* if we finished the message then send completion owns it */
731 if (scat == &rm->m_sg[rm->m_count]) { 732 if (scat == &rm->data.op_sg[rm->data.op_count]) {
732 prev->s_rm = ic->i_rm; 733 prev->s_rm = ic->i_rm;
733 prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; 734 prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
734 ic->i_rm = NULL; 735 ic->i_rm = NULL;
@@ -784,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd
784 ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); 785 ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
785} 786}
786 787
787int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) 788int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
788{ 789{
789 struct rds_iw_connection *ic = conn->c_transport_data; 790 struct rds_iw_connection *ic = conn->c_transport_data;
790 struct rds_iw_send_work *send = NULL; 791 struct rds_iw_send_work *send = NULL;
@@ -794,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
794 struct rds_iw_device *rds_iwdev; 795 struct rds_iw_device *rds_iwdev;
795 struct scatterlist *scat; 796 struct scatterlist *scat;
796 unsigned long len; 797 unsigned long len;
797 u64 remote_addr = op->r_remote_addr; 798 u64 remote_addr = op->op_remote_addr;
798 u32 pos, fr_pos; 799 u32 pos, fr_pos;
799 u32 work_alloc; 800 u32 work_alloc;
800 u32 i; 801 u32 i;
@@ -806,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
806 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); 807 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
807 808
808 /* map the message the first time we see it */ 809 /* map the message the first time we see it */
809 if (!op->r_mapped) { 810 if (!op->op_mapped) {
810 op->r_count = ib_dma_map_sg(ic->i_cm_id->device, 811 op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
811 op->r_sg, op->r_nents, (op->r_write) ? 812 op->op_sg, op->op_nents, (op->op_write) ?
812 DMA_TO_DEVICE : DMA_FROM_DEVICE); 813 DMA_TO_DEVICE : DMA_FROM_DEVICE);
813 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); 814 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
814 if (op->r_count == 0) { 815 if (op->op_count == 0) {
815 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); 816 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
816 ret = -ENOMEM; /* XXX ? */ 817 ret = -ENOMEM; /* XXX ? */
817 goto out; 818 goto out;
818 } 819 }
819 820
820 op->r_mapped = 1; 821 op->op_mapped = 1;
821 } 822 }
822 823
823 if (!op->r_write) { 824 if (!op->op_write) {
824 /* Alloc space on the send queue for the fastreg */ 825 /* Alloc space on the send queue for the fastreg */
825 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); 826 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
826 if (work_alloc != 1) { 827 if (work_alloc != 1) {
@@ -835,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
835 * Instead of knowing how to return a partial rdma read/write we insist that there 836 * Instead of knowing how to return a partial rdma read/write we insist that there
836 * be enough work requests to send the entire message. 837 * be enough work requests to send the entire message.
837 */ 838 */
838 i = ceil(op->r_count, rds_iwdev->max_sge); 839 i = ceil(op->op_count, rds_iwdev->max_sge);
839 840
840 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); 841 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
841 if (work_alloc != i) { 842 if (work_alloc != i) {
@@ -846,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
846 } 847 }
847 848
848 send = &ic->i_sends[pos]; 849 send = &ic->i_sends[pos];
849 if (!op->r_write) { 850 if (!op->op_write) {
850 first = prev = &ic->i_sends[fr_pos]; 851 first = prev = &ic->i_sends[fr_pos];
851 } else { 852 } else {
852 first = send; 853 first = send;
853 prev = NULL; 854 prev = NULL;
854 } 855 }
855 scat = &op->r_sg[0]; 856 scat = &op->op_sg[0];
856 sent = 0; 857 sent = 0;
857 num_sge = op->r_count; 858 num_sge = op->op_count;
858 859
859 for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { 860 for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
860 send->s_wr.send_flags = 0; 861 send->s_wr.send_flags = 0;
861 send->s_queued = jiffies; 862 send->s_queued = jiffies;
862 863
@@ -873,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
873 * for local access after RDS is finished with it, using 874 * for local access after RDS is finished with it, using
874 * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. 875 * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
875 */ 876 */
876 if (op->r_write) 877 if (op->op_write)
877 send->s_wr.opcode = IB_WR_RDMA_WRITE; 878 send->s_wr.opcode = IB_WR_RDMA_WRITE;
878 else 879 else
879 send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; 880 send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
880 881
881 send->s_wr.wr.rdma.remote_addr = remote_addr; 882 send->s_wr.wr.rdma.remote_addr = remote_addr;
882 send->s_wr.wr.rdma.rkey = op->r_key; 883 send->s_wr.wr.rdma.rkey = op->op_rkey;
883 send->s_op = op; 884 send->s_op = op;
884 885
885 if (num_sge > rds_iwdev->max_sge) { 886 if (num_sge > rds_iwdev->max_sge) {
@@ -893,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
893 if (prev) 894 if (prev)
894 prev->s_wr.next = &send->s_wr; 895 prev->s_wr.next = &send->s_wr;
895 896
896 for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { 897 for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
897 len = ib_sg_dma_len(ic->i_cm_id->device, scat); 898 len = ib_sg_dma_len(ic->i_cm_id->device, scat);
898 899
899 if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) 900 if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
@@ -927,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
927 } 928 }
928 929
929 /* if we finished the message then send completion owns it */ 930 /* if we finished the message then send completion owns it */
930 if (scat == &op->r_sg[op->r_count]) 931 if (scat == &op->op_sg[op->op_count])
931 first->s_wr.send_flags = IB_SEND_SIGNALED; 932 first->s_wr.send_flags = IB_SEND_SIGNALED;
932 933
933 if (i < work_alloc) { 934 if (i < work_alloc) {
@@ -941,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
941 * adapters do not allow using the lkey for this at all. To bypass this use a 942 * adapters do not allow using the lkey for this at all. To bypass this use a
942 * fastreg_mr (or possibly a dma_mr) 943 * fastreg_mr (or possibly a dma_mr)
943 */ 944 */
944 if (!op->r_write) { 945 if (!op->op_write) {
945 rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], 946 rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
946 op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); 947 op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
947 work_alloc++; 948 work_alloc++;
948 } 949 }
949 950
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
index 1c4428a61a02..e2e47176e729 100644
--- a/net/rds/iw_sysctl.c
+++ b/net/rds/iw_sysctl.c
@@ -55,7 +55,7 @@ static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
55 55
56unsigned int rds_iw_sysctl_flow_control = 1; 56unsigned int rds_iw_sysctl_flow_control = 1;
57 57
58ctl_table rds_iw_sysctl_table[] = { 58static ctl_table rds_iw_sysctl_table[] = {
59 { 59 {
60 .procname = "max_send_wr", 60 .procname = "max_send_wr",
61 .data = &rds_iw_sysctl_max_send_wr, 61 .data = &rds_iw_sysctl_max_send_wr,
@@ -122,10 +122,10 @@ void rds_iw_sysctl_exit(void)
122 unregister_sysctl_table(rds_iw_sysctl_hdr); 122 unregister_sysctl_table(rds_iw_sysctl_hdr);
123} 123}
124 124
125int __init rds_iw_sysctl_init(void) 125int rds_iw_sysctl_init(void)
126{ 126{
127 rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); 127 rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
128 if (rds_iw_sysctl_hdr == NULL) 128 if (!rds_iw_sysctl_hdr)
129 return -ENOMEM; 129 return -ENOMEM;
130 return 0; 130 return 0;
131} 131}
diff --git a/net/rds/loop.c b/net/rds/loop.c
index dd9879379457..bca6761a3ca2 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -61,10 +61,22 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
61 unsigned int hdr_off, unsigned int sg, 61 unsigned int hdr_off, unsigned int sg,
62 unsigned int off) 62 unsigned int off)
63{ 63{
64 struct scatterlist *sgp = &rm->data.op_sg[sg];
65 int ret = sizeof(struct rds_header) +
66 be32_to_cpu(rm->m_inc.i_hdr.h_len);
67
68 /* Do not send cong updates to loopback */
69 if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
70 rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
71 ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off);
72 goto out;
73 }
74
64 BUG_ON(hdr_off || sg || off); 75 BUG_ON(hdr_off || sg || off);
65 76
66 rds_inc_init(&rm->m_inc, conn, conn->c_laddr); 77 rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
67 rds_message_addref(rm); /* for the inc */ 78 /* For the embedded inc. Matching put is in loop_inc_free() */
79 rds_message_addref(rm);
68 80
69 rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, 81 rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
70 GFP_KERNEL, KM_USER0); 82 GFP_KERNEL, KM_USER0);
@@ -73,20 +85,18 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
73 NULL); 85 NULL);
74 86
75 rds_inc_put(&rm->m_inc); 87 rds_inc_put(&rm->m_inc);
76 88out:
77 return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); 89 return ret;
78} 90}
79 91
80static int rds_loop_xmit_cong_map(struct rds_connection *conn, 92/*
81 struct rds_cong_map *map, 93 * See rds_loop_xmit(). Since our inc is embedded in the rm, we
82 unsigned long offset) 94 * make sure the rm lives at least until the inc is done.
95 */
96static void rds_loop_inc_free(struct rds_incoming *inc)
83{ 97{
84 BUG_ON(offset); 98 struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
85 BUG_ON(map != conn->c_lcong); 99 rds_message_put(rm);
86
87 rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
88
89 return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
90} 100}
91 101
92/* we need to at least give the thread something to succeed */ 102/* we need to at least give the thread something to succeed */
@@ -112,7 +122,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
112 unsigned long flags; 122 unsigned long flags;
113 123
114 lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); 124 lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
115 if (lc == NULL) 125 if (!lc)
116 return -ENOMEM; 126 return -ENOMEM;
117 127
118 INIT_LIST_HEAD(&lc->loop_node); 128 INIT_LIST_HEAD(&lc->loop_node);
@@ -129,8 +139,12 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
129static void rds_loop_conn_free(void *arg) 139static void rds_loop_conn_free(void *arg)
130{ 140{
131 struct rds_loop_connection *lc = arg; 141 struct rds_loop_connection *lc = arg;
142 unsigned long flags;
143
132 rdsdebug("lc %p\n", lc); 144 rdsdebug("lc %p\n", lc);
145 spin_lock_irqsave(&loop_conns_lock, flags);
133 list_del(&lc->loop_node); 146 list_del(&lc->loop_node);
147 spin_unlock_irqrestore(&loop_conns_lock, flags);
134 kfree(lc); 148 kfree(lc);
135} 149}
136 150
@@ -169,14 +183,12 @@ void rds_loop_exit(void)
169 */ 183 */
170struct rds_transport rds_loop_transport = { 184struct rds_transport rds_loop_transport = {
171 .xmit = rds_loop_xmit, 185 .xmit = rds_loop_xmit,
172 .xmit_cong_map = rds_loop_xmit_cong_map,
173 .recv = rds_loop_recv, 186 .recv = rds_loop_recv,
174 .conn_alloc = rds_loop_conn_alloc, 187 .conn_alloc = rds_loop_conn_alloc,
175 .conn_free = rds_loop_conn_free, 188 .conn_free = rds_loop_conn_free,
176 .conn_connect = rds_loop_conn_connect, 189 .conn_connect = rds_loop_conn_connect,
177 .conn_shutdown = rds_loop_conn_shutdown, 190 .conn_shutdown = rds_loop_conn_shutdown,
178 .inc_copy_to_user = rds_message_inc_copy_to_user, 191 .inc_copy_to_user = rds_message_inc_copy_to_user,
179 .inc_purge = rds_message_inc_purge, 192 .inc_free = rds_loop_inc_free,
180 .inc_free = rds_message_inc_free,
181 .t_name = "loopback", 193 .t_name = "loopback",
182}; 194};
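
Two of the loop.c changes above encode rules that are easy to miss in diff form: congestion-map messages are applied directly with rds_cong_map_updated() instead of being looped back, and the embedded m_inc pins its containing rds_message until the receive side is done with it. A sketch of that second point, mirroring rds_loop_inc_free() (the helper name below is illustrative):

/*
 * The loopback inc is embedded in the rds_message, so "freeing" the inc is
 * just dropping the message reference rds_loop_xmit() took before handing
 * &rm->m_inc to rds_recv_incoming().
 */
static void loop_inc_free_sketch(struct rds_incoming *inc)
{
	struct rds_message *rm = container_of(inc, struct rds_message, m_inc);

	rds_message_put(rm);	/* pairs with rds_message_addref() in the xmit path */
}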
diff --git a/net/rds/message.c b/net/rds/message.c
index 9a1d67e001ba..1fd3d29023d7 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -34,9 +34,6 @@
34#include <linux/slab.h> 34#include <linux/slab.h>
35 35
36#include "rds.h" 36#include "rds.h"
37#include "rdma.h"
38
39static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
40 37
41static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { 38static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
42[RDS_EXTHDR_NONE] = 0, 39[RDS_EXTHDR_NONE] = 0,
@@ -63,29 +60,31 @@ static void rds_message_purge(struct rds_message *rm)
63 if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) 60 if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
64 return; 61 return;
65 62
66 for (i = 0; i < rm->m_nents; i++) { 63 for (i = 0; i < rm->data.op_nents; i++) {
67 rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); 64 rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
68 /* XXX will have to put_page for page refs */ 65 /* XXX will have to put_page for page refs */
69 __free_page(sg_page(&rm->m_sg[i])); 66 __free_page(sg_page(&rm->data.op_sg[i]));
70 } 67 }
71 rm->m_nents = 0; 68 rm->data.op_nents = 0;
72 69
73 if (rm->m_rdma_op) 70 if (rm->rdma.op_active)
74 rds_rdma_free_op(rm->m_rdma_op); 71 rds_rdma_free_op(&rm->rdma);
75 if (rm->m_rdma_mr) 72 if (rm->rdma.op_rdma_mr)
76 rds_mr_put(rm->m_rdma_mr); 73 rds_mr_put(rm->rdma.op_rdma_mr);
77}
78 74
79void rds_message_inc_purge(struct rds_incoming *inc) 75 if (rm->atomic.op_active)
80{ 76 rds_atomic_free_op(&rm->atomic);
81 struct rds_message *rm = container_of(inc, struct rds_message, m_inc); 77 if (rm->atomic.op_rdma_mr)
82 rds_message_purge(rm); 78 rds_mr_put(rm->atomic.op_rdma_mr);
83} 79}
84 80
85void rds_message_put(struct rds_message *rm) 81void rds_message_put(struct rds_message *rm)
86{ 82{
87 rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); 83 rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
88 84 if (atomic_read(&rm->m_refcount) == 0) {
85printk(KERN_CRIT "danger refcount zero on %p\n", rm);
86WARN_ON(1);
87 }
89 if (atomic_dec_and_test(&rm->m_refcount)) { 88 if (atomic_dec_and_test(&rm->m_refcount)) {
90 BUG_ON(!list_empty(&rm->m_sock_item)); 89 BUG_ON(!list_empty(&rm->m_sock_item));
91 BUG_ON(!list_empty(&rm->m_conn_item)); 90 BUG_ON(!list_empty(&rm->m_conn_item));
@@ -96,12 +95,6 @@ void rds_message_put(struct rds_message *rm)
96} 95}
97EXPORT_SYMBOL_GPL(rds_message_put); 96EXPORT_SYMBOL_GPL(rds_message_put);
98 97
99void rds_message_inc_free(struct rds_incoming *inc)
100{
101 struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
102 rds_message_put(rm);
103}
104
105void rds_message_populate_header(struct rds_header *hdr, __be16 sport, 98void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
106 __be16 dport, u64 seq) 99 __be16 dport, u64 seq)
107{ 100{
@@ -113,8 +106,8 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
113} 106}
114EXPORT_SYMBOL_GPL(rds_message_populate_header); 107EXPORT_SYMBOL_GPL(rds_message_populate_header);
115 108
116int rds_message_add_extension(struct rds_header *hdr, 109int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
117 unsigned int type, const void *data, unsigned int len) 110 const void *data, unsigned int len)
118{ 111{
119 unsigned int ext_len = sizeof(u8) + len; 112 unsigned int ext_len = sizeof(u8) + len;
120 unsigned char *dst; 113 unsigned char *dst;
@@ -184,26 +177,6 @@ none:
184 return RDS_EXTHDR_NONE; 177 return RDS_EXTHDR_NONE;
185} 178}
186 179
187int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
188{
189 struct rds_ext_header_version ext_hdr;
190
191 ext_hdr.h_version = cpu_to_be32(version);
192 return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
193}
194
195int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
196{
197 struct rds_ext_header_version ext_hdr;
198 unsigned int pos = 0, len = sizeof(ext_hdr);
199
200 /* We assume the version extension is the only one present */
201 if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
202 return 0;
203 *version = be32_to_cpu(ext_hdr.h_version);
204 return 1;
205}
206
207int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) 180int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
208{ 181{
209 struct rds_ext_header_rdma_dest ext_hdr; 182 struct rds_ext_header_rdma_dest ext_hdr;
@@ -214,41 +187,75 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
214} 187}
215EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); 188EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
216 189
217struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) 190/*
191 * Each rds_message is allocated with extra space for the scatterlist entries
192 * rds ops will need. This is to minimize memory allocation count. Then, each rds op
193 * can grab SGs when initializing its part of the rds_message.
194 */
195struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
218{ 196{
219 struct rds_message *rm; 197 struct rds_message *rm;
220 198
221 rm = kzalloc(sizeof(struct rds_message) + 199 rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
222 (nents * sizeof(struct scatterlist)), gfp);
223 if (!rm) 200 if (!rm)
224 goto out; 201 goto out;
225 202
226 if (nents) 203 rm->m_used_sgs = 0;
227 sg_init_table(rm->m_sg, nents); 204 rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
205
228 atomic_set(&rm->m_refcount, 1); 206 atomic_set(&rm->m_refcount, 1);
229 INIT_LIST_HEAD(&rm->m_sock_item); 207 INIT_LIST_HEAD(&rm->m_sock_item);
230 INIT_LIST_HEAD(&rm->m_conn_item); 208 INIT_LIST_HEAD(&rm->m_conn_item);
231 spin_lock_init(&rm->m_rs_lock); 209 spin_lock_init(&rm->m_rs_lock);
210 init_waitqueue_head(&rm->m_flush_wait);
232 211
233out: 212out:
234 return rm; 213 return rm;
235} 214}
236 215
216/*
217 * RDS ops use this to grab SG entries from the rm's sg pool.
218 */
219struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
220{
221 struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
222 struct scatterlist *sg_ret;
223
224 WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
225 WARN_ON(!nents);
226
227 if (rm->m_used_sgs + nents > rm->m_total_sgs)
228 return NULL;
229
230 sg_ret = &sg_first[rm->m_used_sgs];
231 sg_init_table(sg_ret, nents);
232 rm->m_used_sgs += nents;
233
234 return sg_ret;
235}
236
237struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) 237struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
238{ 238{
239 struct rds_message *rm; 239 struct rds_message *rm;
240 unsigned int i; 240 unsigned int i;
241 int num_sgs = ceil(total_len, PAGE_SIZE);
242 int extra_bytes = num_sgs * sizeof(struct scatterlist);
241 243
242 rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); 244 rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
243 if (rm == NULL) 245 if (!rm)
244 return ERR_PTR(-ENOMEM); 246 return ERR_PTR(-ENOMEM);
245 247
246 set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); 248 set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
247 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); 249 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
248 rm->m_nents = ceil(total_len, PAGE_SIZE); 250 rm->data.op_nents = ceil(total_len, PAGE_SIZE);
251 rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
252 if (!rm->data.op_sg) {
253 rds_message_put(rm);
254 return ERR_PTR(-ENOMEM);
255 }
249 256
250 for (i = 0; i < rm->m_nents; ++i) { 257 for (i = 0; i < rm->data.op_nents; ++i) {
251 sg_set_page(&rm->m_sg[i], 258 sg_set_page(&rm->data.op_sg[i],
252 virt_to_page(page_addrs[i]), 259 virt_to_page(page_addrs[i]),
253 PAGE_SIZE, 0); 260 PAGE_SIZE, 0);
254 } 261 }
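
The two comments above describe the new sg-pool scheme but not how a caller strings the pieces together. A usage sketch under that scheme, with a hypothetical alloc_data_msg() helper; ceil() is the existing rds.h macro and "rds.h" is assumed to be in scope:

static struct rds_message *alloc_data_msg(size_t payload_len, gfp_t gfp)
{
	int num_sgs = ceil(payload_len, PAGE_SIZE);
	struct rds_message *rm;

	/* one allocation covers the message plus every sg entry its ops will use */
	rm = rds_message_alloc(num_sgs * sizeof(struct scatterlist), gfp);
	if (!rm)
		return ERR_PTR(-ENOMEM);

	/* the data op then carves its entries out of that reserved tail space */
	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
	if (!rm->data.op_sg) {
		rds_message_put(rm);
		return ERR_PTR(-ENOMEM);
	}

	return rm;
}

rds_message_map_pages() in the next hunk follows exactly this shape, and the WARN_ON() in rds_message_alloc_sgs() is what catches callers that under-size extra_len.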
@@ -256,40 +263,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
256 return rm; 263 return rm;
257} 264}
258 265
259struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, 266int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
260 size_t total_len) 267 size_t total_len)
261{ 268{
262 unsigned long to_copy; 269 unsigned long to_copy;
263 unsigned long iov_off; 270 unsigned long iov_off;
264 unsigned long sg_off; 271 unsigned long sg_off;
265 struct rds_message *rm;
266 struct iovec *iov; 272 struct iovec *iov;
267 struct scatterlist *sg; 273 struct scatterlist *sg;
268 int ret; 274 int ret = 0;
269
270 rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
271 if (rm == NULL) {
272 ret = -ENOMEM;
273 goto out;
274 }
275 275
276 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); 276 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
277 277
278 /* 278 /*
279 * now allocate and copy in the data payload. 279 * now allocate and copy in the data payload.
280 */ 280 */
281 sg = rm->m_sg; 281 sg = rm->data.op_sg;
282 iov = first_iov; 282 iov = first_iov;
283 iov_off = 0; 283 iov_off = 0;
284 sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ 284 sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
285 285
286 while (total_len) { 286 while (total_len) {
287 if (sg_page(sg) == NULL) { 287 if (!sg_page(sg)) {
288 ret = rds_page_remainder_alloc(sg, total_len, 288 ret = rds_page_remainder_alloc(sg, total_len,
289 GFP_HIGHUSER); 289 GFP_HIGHUSER);
290 if (ret) 290 if (ret)
291 goto out; 291 goto out;
292 rm->m_nents++; 292 rm->data.op_nents++;
293 sg_off = 0; 293 sg_off = 0;
294 } 294 }
295 295
@@ -320,14 +320,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
320 sg++; 320 sg++;
321 } 321 }
322 322
323 ret = 0;
324out: 323out:
325 if (ret) { 324 return ret;
326 if (rm)
327 rds_message_put(rm);
328 rm = ERR_PTR(ret);
329 }
330 return rm;
331} 325}
332 326
333int rds_message_inc_copy_to_user(struct rds_incoming *inc, 327int rds_message_inc_copy_to_user(struct rds_incoming *inc,
@@ -348,7 +342,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
348 342
349 iov = first_iov; 343 iov = first_iov;
350 iov_off = 0; 344 iov_off = 0;
351 sg = rm->m_sg; 345 sg = rm->data.op_sg;
352 vec_off = 0; 346 vec_off = 0;
353 copied = 0; 347 copied = 0;
354 348
@@ -394,15 +388,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
394 */ 388 */
395void rds_message_wait(struct rds_message *rm) 389void rds_message_wait(struct rds_message *rm)
396{ 390{
397 wait_event(rds_message_flush_waitq, 391 wait_event_interruptible(rm->m_flush_wait,
398 !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); 392 !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
399} 393}
400 394
401void rds_message_unmapped(struct rds_message *rm) 395void rds_message_unmapped(struct rds_message *rm)
402{ 396{
403 clear_bit(RDS_MSG_MAPPED, &rm->m_flags); 397 clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
404 if (waitqueue_active(&rds_message_flush_waitq)) 398 wake_up_interruptible(&rm->m_flush_wait);
405 wake_up(&rds_message_flush_waitq);
406} 399}
407EXPORT_SYMBOL_GPL(rds_message_unmapped); 400EXPORT_SYMBOL_GPL(rds_message_unmapped);
408 401
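
The last message.c hunk replaces the single global rds_message_flush_waitq with a per-message m_flush_wait (initialized in rds_message_alloc()), so a wakeup only disturbs waiters of that one message and the wait is interruptible. The pairing, restated as a sketch with hypothetical names mirroring rds_message_wait()/rds_message_unmapped():

/* waiter: sleeps until the transport has unmapped this particular message */
static void wait_for_unmap(struct rds_message *rm)
{
	wait_event_interruptible(rm->m_flush_wait,
				 !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
}

/* completion path: clear the flag first, then wake only this message's waiters */
static void message_unmapped(struct rds_message *rm)
{
	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
	wake_up_interruptible(&rm->m_flush_wait);
}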
diff --git a/net/rds/page.c b/net/rds/page.c
index 1dfbfea12e9b..d8acdebe3c7c 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -40,7 +40,8 @@ struct rds_page_remainder {
40 unsigned long r_offset; 40 unsigned long r_offset;
41}; 41};
42 42
43DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); 43static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
44 rds_page_remainders);
44 45
45/* 46/*
46 * returns 0 on success or -errno on failure. 47 * returns 0 on success or -errno on failure.
@@ -103,7 +104,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
103 /* jump straight to allocation if we're trying for a huge page */ 104 /* jump straight to allocation if we're trying for a huge page */
104 if (bytes >= PAGE_SIZE) { 105 if (bytes >= PAGE_SIZE) {
105 page = alloc_page(gfp); 106 page = alloc_page(gfp);
106 if (page == NULL) { 107 if (!page) {
107 ret = -ENOMEM; 108 ret = -ENOMEM;
108 } else { 109 } else {
109 sg_set_page(scat, page, PAGE_SIZE, 0); 110 sg_set_page(scat, page, PAGE_SIZE, 0);
@@ -149,7 +150,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
149 rem = &per_cpu(rds_page_remainders, get_cpu()); 150 rem = &per_cpu(rds_page_remainders, get_cpu());
150 local_irq_save(flags); 151 local_irq_save(flags);
151 152
152 if (page == NULL) { 153 if (!page) {
153 ret = -ENOMEM; 154 ret = -ENOMEM;
154 break; 155 break;
155 } 156 }
@@ -173,6 +174,7 @@ out:
173 ret ? 0 : scat->length); 174 ret ? 0 : scat->length);
174 return ret; 175 return ret;
175} 176}
177EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
176 178
177static int rds_page_remainder_cpu_notify(struct notifier_block *self, 179static int rds_page_remainder_cpu_notify(struct notifier_block *self,
178 unsigned long action, void *hcpu) 180 unsigned long action, void *hcpu)
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 75fd13bb631b..4e37c1cbe8b2 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -35,7 +35,7 @@
35#include <linux/rbtree.h> 35#include <linux/rbtree.h>
36#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ 36#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
37 37
38#include "rdma.h" 38#include "rds.h"
39 39
40/* 40/*
41 * XXX 41 * XXX
@@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
130{ 130{
131 struct rds_mr *mr; 131 struct rds_mr *mr;
132 struct rb_node *node; 132 struct rb_node *node;
133 unsigned long flags;
133 134
134 /* Release any MRs associated with this socket */ 135 /* Release any MRs associated with this socket */
136 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
135 while ((node = rb_first(&rs->rs_rdma_keys))) { 137 while ((node = rb_first(&rs->rs_rdma_keys))) {
136 mr = container_of(node, struct rds_mr, r_rb_node); 138 mr = container_of(node, struct rds_mr, r_rb_node);
137 if (mr->r_trans == rs->rs_transport) 139 if (mr->r_trans == rs->rs_transport)
138 mr->r_invalidate = 0; 140 mr->r_invalidate = 0;
141 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
142 RB_CLEAR_NODE(&mr->r_rb_node);
143 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
144 rds_destroy_mr(mr);
139 rds_mr_put(mr); 145 rds_mr_put(mr);
146 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
140 } 147 }
148 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
141 149
142 if (rs->rs_transport && rs->rs_transport->flush_mrs) 150 if (rs->rs_transport && rs->rs_transport->flush_mrs)
143 rs->rs_transport->flush_mrs(); 151 rs->rs_transport->flush_mrs();
@@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
181 goto out; 189 goto out;
182 } 190 }
183 191
184 if (rs->rs_transport->get_mr == NULL) { 192 if (!rs->rs_transport->get_mr) {
185 ret = -EOPNOTSUPP; 193 ret = -EOPNOTSUPP;
186 goto out; 194 goto out;
187 } 195 }
@@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
197 205
198 /* XXX clamp nr_pages to limit the size of this alloc? */ 206 /* XXX clamp nr_pages to limit the size of this alloc? */
199 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); 207 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
200 if (pages == NULL) { 208 if (!pages) {
201 ret = -ENOMEM; 209 ret = -ENOMEM;
202 goto out; 210 goto out;
203 } 211 }
204 212
205 mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); 213 mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
206 if (mr == NULL) { 214 if (!mr) {
207 ret = -ENOMEM; 215 ret = -ENOMEM;
208 goto out; 216 goto out;
209 } 217 }
@@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
230 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to 238 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
231 * the zero page. 239 * the zero page.
232 */ 240 */
233 ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1); 241 ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
234 if (ret < 0) 242 if (ret < 0)
235 goto out; 243 goto out;
236 244
237 nents = ret; 245 nents = ret;
238 sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); 246 sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
239 if (sg == NULL) { 247 if (!sg) {
240 ret = -ENOMEM; 248 ret = -ENOMEM;
241 goto out; 249 goto out;
242 } 250 }
@@ -406,134 +414,217 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
406 414
407 spin_lock_irqsave(&rs->rs_rdma_lock, flags); 415 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
408 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); 416 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
409 if (mr && (mr->r_use_once || force)) { 417 if (!mr) {
418 printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
419 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
420 return;
421 }
422
423 if (mr->r_use_once || force) {
410 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); 424 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
411 RB_CLEAR_NODE(&mr->r_rb_node); 425 RB_CLEAR_NODE(&mr->r_rb_node);
412 zot_me = 1; 426 zot_me = 1;
413 } else if (mr) 427 }
414 atomic_inc(&mr->r_refcount);
415 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); 428 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
416 429
417 /* May have to issue a dma_sync on this memory region. 430 /* May have to issue a dma_sync on this memory region.
418 * Note we could avoid this if the operation was a RDMA READ, 431 * Note we could avoid this if the operation was a RDMA READ,
419 * but at this point we can't tell. */ 432 * but at this point we can't tell. */
420 if (mr != NULL) { 433 if (mr->r_trans->sync_mr)
421 if (mr->r_trans->sync_mr) 434 mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
422 mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); 435
423 436 /* If the MR was marked as invalidate, this will
424 /* If the MR was marked as invalidate, this will 437 * trigger an async flush. */
425 * trigger an async flush. */ 438 if (zot_me)
426 if (zot_me) 439 rds_destroy_mr(mr);
427 rds_destroy_mr(mr); 440 rds_mr_put(mr);
428 rds_mr_put(mr);
429 }
430} 441}
431 442
432void rds_rdma_free_op(struct rds_rdma_op *ro) 443void rds_rdma_free_op(struct rm_rdma_op *ro)
433{ 444{
434 unsigned int i; 445 unsigned int i;
435 446
436 for (i = 0; i < ro->r_nents; i++) { 447 for (i = 0; i < ro->op_nents; i++) {
437 struct page *page = sg_page(&ro->r_sg[i]); 448 struct page *page = sg_page(&ro->op_sg[i]);
438 449
439 /* Mark page dirty if it was possibly modified, which 450 /* Mark page dirty if it was possibly modified, which
440 * is the case for a RDMA_READ which copies from remote 451 * is the case for a RDMA_READ which copies from remote
441 * to local memory */ 452 * to local memory */
442 if (!ro->r_write) { 453 if (!ro->op_write) {
443 BUG_ON(in_interrupt()); 454 BUG_ON(irqs_disabled());
444 set_page_dirty(page); 455 set_page_dirty(page);
445 } 456 }
446 put_page(page); 457 put_page(page);
447 } 458 }
448 459
449 kfree(ro->r_notifier); 460 kfree(ro->op_notifier);
450 kfree(ro); 461 ro->op_notifier = NULL;
462 ro->op_active = 0;
463}
464
465void rds_atomic_free_op(struct rm_atomic_op *ao)
466{
467 struct page *page = sg_page(ao->op_sg);
468
469 /* Mark page dirty if it was possibly modified, which
470 * is the case for a RDMA_READ which copies from remote
471 * to local memory */
472 set_page_dirty(page);
473 put_page(page);
474
475 kfree(ao->op_notifier);
476 ao->op_notifier = NULL;
477 ao->op_active = 0;
451} 478}
452 479
480
453/* 481/*
454 * args is a pointer to an in-kernel copy in the sendmsg cmsg. 482 * Count the number of pages needed to describe an incoming iovec array.
455 */ 483 */
456static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, 484static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
457 struct rds_rdma_args *args) 485{
486 int tot_pages = 0;
487 unsigned int nr_pages;
488 unsigned int i;
489
490 /* figure out the number of pages in the vector */
491 for (i = 0; i < nr_iovecs; i++) {
492 nr_pages = rds_pages_in_vec(&iov[i]);
493 if (nr_pages == 0)
494 return -EINVAL;
495
496 tot_pages += nr_pages;
497
498 /*
499 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
500 * so tot_pages cannot overflow without first going negative.
501 */
502 if (tot_pages < 0)
503 return -EINVAL;
504 }
505
506 return tot_pages;
507}
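
A note on the tot_pages guard in rds_rdma_pages() above: rds_pages_in_vec() bounds each iovec at (UINT_MAX >> PAGE_SHIFT) + 1 pages (about 2^20 with 4 KiB pages), which is tiny compared to INT_MAX, so the first addition that pushes the signed accumulator past INT_MAX lands it deep in negative territory and the if (tot_pages < 0) check catches it before it could wrap back to a small positive count. A minimal standalone sketch of the same pattern (the kernel builds with -fno-strict-overflow; build this with -fwrapv so the signed wraparound is likewise defined):

    #include <limits.h>

    #define SKETCH_PAGE_SHIFT 12                                /* assume 4 KiB pages */
    #define PER_VEC_MAX ((UINT_MAX >> SKETCH_PAGE_SHIFT) + 1)   /* ~2^20 pages per entry */

    /* Mirrors the accumulation in rds_rdma_pages(): reject a zero-sized entry
     * and bail out the moment the running total goes negative. */
    int total_pages(const unsigned int *pages_per_vec, int nr_vecs)
    {
            int tot = 0;
            int i;

            for (i = 0; i < nr_vecs; i++) {
                    if (pages_per_vec[i] == 0 || pages_per_vec[i] > PER_VEC_MAX)
                            return -1;      /* -EINVAL in the kernel code */
                    tot += pages_per_vec[i];
                    /* each step adds at most ~2^20, so overflow shows up here
                     * as a negative value on the very first offending add */
                    if (tot < 0)
                            return -1;
            }
            return tot;
    }
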
508
509int rds_rdma_extra_size(struct rds_rdma_args *args)
458{ 510{
459 struct rds_iovec vec; 511 struct rds_iovec vec;
460 struct rds_rdma_op *op = NULL; 512 struct rds_iovec __user *local_vec;
513 int tot_pages = 0;
461 unsigned int nr_pages; 514 unsigned int nr_pages;
462 unsigned int max_pages; 515 unsigned int i;
516
517 local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
518
519 /* figure out the number of pages in the vector */
520 for (i = 0; i < args->nr_local; i++) {
521 if (copy_from_user(&vec, &local_vec[i],
522 sizeof(struct rds_iovec)))
523 return -EFAULT;
524
525 nr_pages = rds_pages_in_vec(&vec);
526 if (nr_pages == 0)
527 return -EINVAL;
528
529 tot_pages += nr_pages;
530
531 /*
532 * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
533 * so tot_pages cannot overflow without first going negative.
534 */
535 if (tot_pages < 0)
536 return -EINVAL;
537 }
538
539 return tot_pages * sizeof(struct scatterlist);
540}
541
542/*
543 * The application asks for a RDMA transfer.
544 * Extract all arguments and set up the rdma_op
545 */
546int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
547 struct cmsghdr *cmsg)
548{
549 struct rds_rdma_args *args;
550 struct rm_rdma_op *op = &rm->rdma;
551 int nr_pages;
463 unsigned int nr_bytes; 552 unsigned int nr_bytes;
464 struct page **pages = NULL; 553 struct page **pages = NULL;
465 struct rds_iovec __user *local_vec; 554 struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
466 struct scatterlist *sg; 555 int iov_size;
467 unsigned int nr;
468 unsigned int i, j; 556 unsigned int i, j;
469 int ret; 557 int ret = 0;
558
559 if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
560 || rm->rdma.op_active)
561 return -EINVAL;
470 562
563 args = CMSG_DATA(cmsg);
471 564
472 if (rs->rs_bound_addr == 0) { 565 if (rs->rs_bound_addr == 0) {
473 ret = -ENOTCONN; /* XXX not a great errno */ 566 ret = -ENOTCONN; /* XXX not a great errno */
474 goto out; 567 goto out;
475 } 568 }
476 569
477 if (args->nr_local > (u64)UINT_MAX) { 570 if (args->nr_local > UIO_MAXIOV) {
478 ret = -EMSGSIZE; 571 ret = -EMSGSIZE;
479 goto out; 572 goto out;
480 } 573 }
481 574
482 nr_pages = 0; 575 /* Check whether to allocate the iovec area */
483 max_pages = 0; 576 iov_size = args->nr_local * sizeof(struct rds_iovec);
484 577 if (args->nr_local > UIO_FASTIOV) {
485 local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; 578 iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
486 579 if (!iovs) {
487 /* figure out the number of pages in the vector */ 580 ret = -ENOMEM;
488 for (i = 0; i < args->nr_local; i++) {
489 if (copy_from_user(&vec, &local_vec[i],
490 sizeof(struct rds_iovec))) {
491 ret = -EFAULT;
492 goto out;
493 }
494
495 nr = rds_pages_in_vec(&vec);
496 if (nr == 0) {
497 ret = -EINVAL;
498 goto out; 581 goto out;
499 } 582 }
583 }
500 584
501 max_pages = max(nr, max_pages); 585 if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
502 nr_pages += nr; 586 ret = -EFAULT;
587 goto out;
503 } 588 }
504 589
505 pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); 590 nr_pages = rds_rdma_pages(iovs, args->nr_local);
506 if (pages == NULL) { 591 if (nr_pages < 0) {
507 ret = -ENOMEM; 592 ret = -EINVAL;
508 goto out; 593 goto out;
509 } 594 }
510 595
511 op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); 596 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
512 if (op == NULL) { 597 if (!pages) {
513 ret = -ENOMEM; 598 ret = -ENOMEM;
514 goto out; 599 goto out;
515 } 600 }
516 601
517 op->r_write = !!(args->flags & RDS_RDMA_READWRITE); 602 op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
518 op->r_fence = !!(args->flags & RDS_RDMA_FENCE); 603 op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
519 op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); 604 op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
520 op->r_recverr = rs->rs_recverr; 605 op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
606 op->op_active = 1;
607 op->op_recverr = rs->rs_recverr;
521 WARN_ON(!nr_pages); 608 WARN_ON(!nr_pages);
522 sg_init_table(op->r_sg, nr_pages); 609 op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
610 if (!op->op_sg) {
611 ret = -ENOMEM;
612 goto out;
613 }
523 614
524 if (op->r_notify || op->r_recverr) { 615 if (op->op_notify || op->op_recverr) {
525 /* We allocate an uninitialized notifier here, because 616 /* We allocate an uninitialized notifier here, because
526 * we don't want to do that in the completion handler. We 617 * we don't want to do that in the completion handler. We
527 * would have to use GFP_ATOMIC there, and don't want to deal 618 * would have to use GFP_ATOMIC there, and don't want to deal
528 * with failed allocations. 619 * with failed allocations.
529 */ 620 */
530 op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); 621 op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
531 if (!op->r_notifier) { 622 if (!op->op_notifier) {
532 ret = -ENOMEM; 623 ret = -ENOMEM;
533 goto out; 624 goto out;
534 } 625 }
535 op->r_notifier->n_user_token = args->user_token; 626 op->op_notifier->n_user_token = args->user_token;
536 op->r_notifier->n_status = RDS_RDMA_SUCCESS; 627 op->op_notifier->n_status = RDS_RDMA_SUCCESS;
537 } 628 }
538 629
539 /* The cookie contains the R_Key of the remote memory region, and 630 /* The cookie contains the R_Key of the remote memory region, and
@@ -543,68 +634,55 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
543 * destination address (which is really an offset into the MR) 634 * destination address (which is really an offset into the MR)
544 * FIXME: We may want to move this into ib_rdma.c 635 * FIXME: We may want to move this into ib_rdma.c
545 */ 636 */
546 op->r_key = rds_rdma_cookie_key(args->cookie); 637 op->op_rkey = rds_rdma_cookie_key(args->cookie);
547 op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); 638 op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
548 639
549 nr_bytes = 0; 640 nr_bytes = 0;
550 641
551 rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", 642 rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
552 (unsigned long long)args->nr_local, 643 (unsigned long long)args->nr_local,
553 (unsigned long long)args->remote_vec.addr, 644 (unsigned long long)args->remote_vec.addr,
554 op->r_key); 645 op->op_rkey);
555 646
556 for (i = 0; i < args->nr_local; i++) { 647 for (i = 0; i < args->nr_local; i++) {
557 if (copy_from_user(&vec, &local_vec[i], 648 struct rds_iovec *iov = &iovs[i];
558 sizeof(struct rds_iovec))) { 649 /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
559 ret = -EFAULT; 650 unsigned int nr = rds_pages_in_vec(iov);
560 goto out;
561 }
562 651
563 nr = rds_pages_in_vec(&vec); 652 rs->rs_user_addr = iov->addr;
564 if (nr == 0) { 653 rs->rs_user_bytes = iov->bytes;
565 ret = -EINVAL;
566 goto out;
567 }
568 654
569 rs->rs_user_addr = vec.addr;
570 rs->rs_user_bytes = vec.bytes;
571
572 /* did the user change the vec under us? */
573 if (nr > max_pages || op->r_nents + nr > nr_pages) {
574 ret = -EINVAL;
575 goto out;
576 }
577 /* If it's a WRITE operation, we want to pin the pages for reading. 655 /* If it's a WRITE operation, we want to pin the pages for reading.
578 * If it's a READ operation, we need to pin the pages for writing. 656 * If it's a READ operation, we need to pin the pages for writing.
579 */ 657 */
580 ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write); 658 ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
581 if (ret < 0) 659 if (ret < 0)
582 goto out; 660 goto out;
583 661
584 rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n", 662 rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
585 nr_bytes, nr, vec.bytes, vec.addr); 663 nr_bytes, nr, iov->bytes, iov->addr);
586 664
587 nr_bytes += vec.bytes; 665 nr_bytes += iov->bytes;
588 666
589 for (j = 0; j < nr; j++) { 667 for (j = 0; j < nr; j++) {
590 unsigned int offset = vec.addr & ~PAGE_MASK; 668 unsigned int offset = iov->addr & ~PAGE_MASK;
669 struct scatterlist *sg;
591 670
592 sg = &op->r_sg[op->r_nents + j]; 671 sg = &op->op_sg[op->op_nents + j];
593 sg_set_page(sg, pages[j], 672 sg_set_page(sg, pages[j],
594 min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), 673 min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
595 offset); 674 offset);
596 675
597 rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n", 676 rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
598 sg->offset, sg->length, vec.addr, vec.bytes); 677 sg->offset, sg->length, iov->addr, iov->bytes);
599 678
600 vec.addr += sg->length; 679 iov->addr += sg->length;
601 vec.bytes -= sg->length; 680 iov->bytes -= sg->length;
602 } 681 }
603 682
604 op->r_nents += nr; 683 op->op_nents += nr;
605 } 684 }
606 685
607
608 if (nr_bytes > args->remote_vec.bytes) { 686 if (nr_bytes > args->remote_vec.bytes) {
609 rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", 687 rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
610 nr_bytes, 688 nr_bytes,
@@ -612,38 +690,18 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
612 ret = -EINVAL; 690 ret = -EINVAL;
613 goto out; 691 goto out;
614 } 692 }
615 op->r_bytes = nr_bytes; 693 op->op_bytes = nr_bytes;
616 694
617 ret = 0;
618out: 695out:
696 if (iovs != iovstack)
697 sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
619 kfree(pages); 698 kfree(pages);
620 if (ret) { 699 if (ret)
621 if (op) 700 rds_rdma_free_op(op);
622 rds_rdma_free_op(op); 701 else
623 op = ERR_PTR(ret); 702 rds_stats_inc(s_send_rdma);
624 }
625 return op;
626}
627
628/*
629 * The application asks for a RDMA transfer.
630 * Extract all arguments and set up the rdma_op
631 */
632int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
633 struct cmsghdr *cmsg)
634{
635 struct rds_rdma_op *op;
636
637 if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
638 rm->m_rdma_op != NULL)
639 return -EINVAL;
640 703
641 op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); 704 return ret;
642 if (IS_ERR(op))
643 return PTR_ERR(op);
644 rds_stats_inc(s_send_rdma);
645 rm->m_rdma_op = op;
646 return 0;
647} 705}
648 706
649/* 707/*
@@ -673,7 +731,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
673 731
674 spin_lock_irqsave(&rs->rs_rdma_lock, flags); 732 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
675 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); 733 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
676 if (mr == NULL) 734 if (!mr)
677 err = -EINVAL; /* invalid r_key */ 735 err = -EINVAL; /* invalid r_key */
678 else 736 else
679 atomic_inc(&mr->r_refcount); 737 atomic_inc(&mr->r_refcount);
@@ -681,7 +739,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
681 739
682 if (mr) { 740 if (mr) {
683 mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); 741 mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
684 rm->m_rdma_mr = mr; 742 rm->rdma.op_rdma_mr = mr;
685 } 743 }
686 return err; 744 return err;
687} 745}
@@ -699,5 +757,102 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
699 rm->m_rdma_cookie != 0) 757 rm->m_rdma_cookie != 0)
700 return -EINVAL; 758 return -EINVAL;
701 759
702 return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr); 760 return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
761}
762
763/*
764 * Fill in rds_message for an atomic request.
765 */
766int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
767 struct cmsghdr *cmsg)
768{
769 struct page *page = NULL;
770 struct rds_atomic_args *args;
771 int ret = 0;
772
773 if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
774 || rm->atomic.op_active)
775 return -EINVAL;
776
777 args = CMSG_DATA(cmsg);
778
779 /* Nonmasked & masked cmsg ops converted to masked hw ops */
780 switch (cmsg->cmsg_type) {
781 case RDS_CMSG_ATOMIC_FADD:
782 rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
783 rm->atomic.op_m_fadd.add = args->fadd.add;
784 rm->atomic.op_m_fadd.nocarry_mask = 0;
785 break;
786 case RDS_CMSG_MASKED_ATOMIC_FADD:
787 rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
788 rm->atomic.op_m_fadd.add = args->m_fadd.add;
789 rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
790 break;
791 case RDS_CMSG_ATOMIC_CSWP:
792 rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
793 rm->atomic.op_m_cswp.compare = args->cswp.compare;
794 rm->atomic.op_m_cswp.swap = args->cswp.swap;
795 rm->atomic.op_m_cswp.compare_mask = ~0;
796 rm->atomic.op_m_cswp.swap_mask = ~0;
797 break;
798 case RDS_CMSG_MASKED_ATOMIC_CSWP:
799 rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
800 rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
801 rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
802 rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
803 rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
804 break;
805 default:
806 BUG(); /* should never happen */
807 }
808
809 rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
810 rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
811 rm->atomic.op_active = 1;
812 rm->atomic.op_recverr = rs->rs_recverr;
813 rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
814 if (!rm->atomic.op_sg) {
815 ret = -ENOMEM;
816 goto err;
817 }
818
819 /* verify 8 byte-aligned */
820 if (args->local_addr & 0x7) {
821 ret = -EFAULT;
822 goto err;
823 }
824
825 ret = rds_pin_pages(args->local_addr, 1, &page, 1);
826 if (ret != 1)
827 goto err;
828 ret = 0;
829
830 sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
831
832 if (rm->atomic.op_notify || rm->atomic.op_recverr) {
833 /* We allocate an uninitialized notifier here, because
834 * we don't want to do that in the completion handler. We
835 * would have to use GFP_ATOMIC there, and don't want to deal
836 * with failed allocations.
837 */
838 rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
839 if (!rm->atomic.op_notifier) {
840 ret = -ENOMEM;
841 goto err;
842 }
843
844 rm->atomic.op_notifier->n_user_token = args->user_token;
845 rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
846 }
847
848 rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
849 rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
850
851 return ret;
852err:
853 if (page)
854 put_page(page);
855 kfree(rm->atomic.op_notifier);
856
857 return ret;
703} 858}
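
For context on the sendmsg() side of the new rds_cmsg_atomic() handler above, here is a hedged userspace sketch of requesting a masked fetch-and-add through RDS_CMSG_MASKED_ATOMIC_FADD. The struct rds_atomic_args field names come straight from the handler, but the uapi header name, the PF_RDS socket setup, and the earlier RDS_CMSG_RDMA_MAP exchange that produced the cookie are assumptions, not shown in this patch.

    /* Hedged sketch only: field names follow rds_cmsg_atomic() above; the
     * <linux/rds.h> uapi layout and socket setup are assumed, not verified. */
    #include <string.h>
    #include <stdint.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <linux/rds.h>

    ssize_t send_masked_fadd(int fd, struct sockaddr_in *dest,
                             uint64_t cookie, uint64_t remote_addr,
                             uint64_t *local_result,    /* must be 8-byte aligned */
                             uint64_t add, uint64_t nocarry_mask)
    {
            struct rds_atomic_args args;
            union {
                    char buf[CMSG_SPACE(sizeof(args))];
                    struct cmsghdr align;
            } cbuf;
            struct msghdr msg;
            struct cmsghdr *cmsg;

            memset(&args, 0, sizeof(args));
            memset(&cbuf, 0, sizeof(cbuf));
            args.cookie = cookie;                   /* r_key + offset from a prior RDS_CMSG_RDMA_MAP */
            args.remote_addr = remote_addr;
            args.local_addr = (uint64_t)(unsigned long)local_result;
            args.m_fadd.add = add;
            args.m_fadd.nocarry_mask = nocarry_mask;
            args.flags = RDS_RDMA_NOTIFY_ME;        /* completion arrives via RDS_CMSG_RDMA_STATUS */
            args.user_token = 1;                    /* echoed back in the status cmsg */

            memset(&msg, 0, sizeof(msg));
            msg.msg_name = dest;
            msg.msg_namelen = sizeof(*dest);
            msg.msg_control = cbuf.buf;
            msg.msg_controllen = sizeof(cbuf.buf);

            cmsg = CMSG_FIRSTHDR(&msg);
            cmsg->cmsg_level = SOL_RDS;
            cmsg->cmsg_type = RDS_CMSG_MASKED_ATOMIC_FADD;
            cmsg->cmsg_len = CMSG_LEN(sizeof(args));
            memcpy(CMSG_DATA(cmsg), &args, sizeof(args));

            return sendmsg(fd, &msg, 0);
    }
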
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
deleted file mode 100644
index 909c39835a5d..000000000000
--- a/net/rds/rdma.h
+++ /dev/null
@@ -1,85 +0,0 @@
1#ifndef _RDS_RDMA_H
2#define _RDS_RDMA_H
3
4#include <linux/rbtree.h>
5#include <linux/spinlock.h>
6#include <linux/scatterlist.h>
7
8#include "rds.h"
9
10struct rds_mr {
11 struct rb_node r_rb_node;
12 atomic_t r_refcount;
13 u32 r_key;
14
15 /* A copy of the creation flags */
16 unsigned int r_use_once:1;
17 unsigned int r_invalidate:1;
18 unsigned int r_write:1;
19
20 /* This is for RDS_MR_DEAD.
21 * It would be nice & consistent to make this part of the above
22 * bit field here, but we need to use test_and_set_bit.
23 */
24 unsigned long r_state;
25 struct rds_sock *r_sock; /* back pointer to the socket that owns us */
26 struct rds_transport *r_trans;
27 void *r_trans_private;
28};
29
30/* Flags for mr->r_state */
31#define RDS_MR_DEAD 0
32
33struct rds_rdma_op {
34 u32 r_key;
35 u64 r_remote_addr;
36 unsigned int r_write:1;
37 unsigned int r_fence:1;
38 unsigned int r_notify:1;
39 unsigned int r_recverr:1;
40 unsigned int r_mapped:1;
41 struct rds_notifier *r_notifier;
42 unsigned int r_bytes;
43 unsigned int r_nents;
44 unsigned int r_count;
45 struct scatterlist r_sg[0];
46};
47
48static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
49{
50 return r_key | (((u64) offset) << 32);
51}
52
53static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
54{
55 return cookie;
56}
57
58static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
59{
60 return cookie >> 32;
61}
62
63int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
64int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
65int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
66void rds_rdma_drop_keys(struct rds_sock *rs);
67int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
68 struct cmsghdr *cmsg);
69int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
70 struct cmsghdr *cmsg);
71int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
72 struct cmsghdr *cmsg);
73int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
74 struct cmsghdr *cmsg);
75void rds_rdma_free_op(struct rds_rdma_op *ro);
76void rds_rdma_send_complete(struct rds_message *rm, int);
77
78extern void __rds_put_mr_final(struct rds_mr *mr);
79static inline void rds_mr_put(struct rds_mr *mr)
80{
81 if (atomic_dec_and_test(&mr->r_refcount))
82 __rds_put_mr_final(mr);
83}
84
85#endif
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index e599ba2f950d..f8760e1b6688 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -36,6 +36,34 @@
36 36
37static struct rdma_cm_id *rds_rdma_listen_id; 37static struct rdma_cm_id *rds_rdma_listen_id;
38 38
39static char *rds_cm_event_strings[] = {
40#define RDS_CM_EVENT_STRING(foo) \
41 [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
42 RDS_CM_EVENT_STRING(ADDR_RESOLVED),
43 RDS_CM_EVENT_STRING(ADDR_ERROR),
44 RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
45 RDS_CM_EVENT_STRING(ROUTE_ERROR),
46 RDS_CM_EVENT_STRING(CONNECT_REQUEST),
47 RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
48 RDS_CM_EVENT_STRING(CONNECT_ERROR),
49 RDS_CM_EVENT_STRING(UNREACHABLE),
50 RDS_CM_EVENT_STRING(REJECTED),
51 RDS_CM_EVENT_STRING(ESTABLISHED),
52 RDS_CM_EVENT_STRING(DISCONNECTED),
53 RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
54 RDS_CM_EVENT_STRING(MULTICAST_JOIN),
55 RDS_CM_EVENT_STRING(MULTICAST_ERROR),
56 RDS_CM_EVENT_STRING(ADDR_CHANGE),
57 RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
58#undef RDS_CM_EVENT_STRING
59};
60
61static char *rds_cm_event_str(enum rdma_cm_event_type type)
62{
63 return rds_str_array(rds_cm_event_strings,
64 ARRAY_SIZE(rds_cm_event_strings), type);
65};
66
39int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 67int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
40 struct rdma_cm_event *event) 68 struct rdma_cm_event *event)
41{ 69{
@@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
44 struct rds_transport *trans; 72 struct rds_transport *trans;
45 int ret = 0; 73 int ret = 0;
46 74
47 rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, 75 rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
48 event->event); 76 event->event, rds_cm_event_str(event->event));
49 77
50 if (cm_id->device->node_type == RDMA_NODE_RNIC) 78 if (cm_id->device->node_type == RDMA_NODE_RNIC)
51 trans = &rds_iw_transport; 79 trans = &rds_iw_transport;
@@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
109 137
110 default: 138 default:
111 /* things like device disconnect? */ 139 /* things like device disconnect? */
112 printk(KERN_ERR "RDS: unknown event %u!\n", event->event); 140 printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
141 event->event, rds_cm_event_str(event->event));
113 break; 142 break;
114 } 143 }
115 144
@@ -117,18 +146,20 @@ out:
117 if (conn) 146 if (conn)
118 mutex_unlock(&conn->c_cm_lock); 147 mutex_unlock(&conn->c_cm_lock);
119 148
120 rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); 149 rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
150 rds_cm_event_str(event->event), ret);
121 151
122 return ret; 152 return ret;
123} 153}
124 154
125static int __init rds_rdma_listen_init(void) 155static int rds_rdma_listen_init(void)
126{ 156{
127 struct sockaddr_in sin; 157 struct sockaddr_in sin;
128 struct rdma_cm_id *cm_id; 158 struct rdma_cm_id *cm_id;
129 int ret; 159 int ret;
130 160
131 cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP); 161 cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP,
162 IB_QPT_RC);
132 if (IS_ERR(cm_id)) { 163 if (IS_ERR(cm_id)) {
133 ret = PTR_ERR(cm_id); 164 ret = PTR_ERR(cm_id);
134 printk(KERN_ERR "RDS/RDMA: failed to setup listener, " 165 printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
@@ -177,7 +208,7 @@ static void rds_rdma_listen_stop(void)
177 } 208 }
178} 209}
179 210
180int __init rds_rdma_init(void) 211static int rds_rdma_init(void)
181{ 212{
182 int ret; 213 int ret;
183 214
@@ -204,7 +235,7 @@ out:
204} 235}
205module_init(rds_rdma_init); 236module_init(rds_rdma_init);
206 237
207void rds_rdma_exit(void) 238static void rds_rdma_exit(void)
208{ 239{
209 /* stop listening first to ensure no new connections are attempted */ 240 /* stop listening first to ensure no new connections are attempted */
210 rds_rdma_listen_stop(); 241 rds_rdma_listen_stop();
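
The rds_cm_event_strings table plus rds_str_array() lookup added above is a reusable designated-initializer trick for mapping sparse enum values to printable names. A small sketch of the same pattern applied to a hypothetical enum; only rds_str_array(), ARRAY_SIZE() and __stringify() are real, the example names are made up:

    /* Sketch of the same table pattern; 'example_status' is hypothetical. */
    #include <linux/kernel.h>       /* ARRAY_SIZE */
    #include <linux/stringify.h>
    #include "rds.h"                /* rds_str_array() */

    enum example_status { EX_OK, EX_RETRY, EX_FATAL };

    static char *example_status_strings[] = {
    #define EX_STRING(foo) [EX_##foo] = __stringify(EX_##foo)
            EX_STRING(OK),
            EX_STRING(RETRY),
            EX_STRING(FATAL),
    #undef EX_STRING
    };

    static char *example_status_str(enum example_status s)
    {
            /* rds_str_array() handles out-of-range or unset slots with a fallback */
            return rds_str_array(example_status_strings,
                                 ARRAY_SIZE(example_status_strings), s);
    }
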
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index 2f2c7d976c21..faba4e382695 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -11,10 +11,6 @@ int rds_rdma_conn_connect(struct rds_connection *conn);
11int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 11int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
12 struct rdma_cm_event *event); 12 struct rdma_cm_event *event);
13 13
14/* from rdma_transport.c */
15int rds_rdma_init(void);
16void rds_rdma_exit(void);
17
18/* from ib.c */ 14/* from ib.c */
19extern struct rds_transport rds_ib_transport; 15extern struct rds_transport rds_ib_transport;
20int rds_ib_init(void); 16int rds_ib_init(void);
diff --git a/net/rds/rds.h b/net/rds/rds.h
index c224b5bb3ba9..da8adac2bf06 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -50,7 +50,6 @@ rdsdebug(char *fmt, ...)
50#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) 50#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
51 51
52#define RDS_CONG_MAP_BYTES (65536 / 8) 52#define RDS_CONG_MAP_BYTES (65536 / 8)
53#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long))
54#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) 53#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
55#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) 54#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
56 55
@@ -80,6 +79,7 @@ enum {
80/* Bits for c_flags */ 79/* Bits for c_flags */
81#define RDS_LL_SEND_FULL 0 80#define RDS_LL_SEND_FULL 0
82#define RDS_RECONNECT_PENDING 1 81#define RDS_RECONNECT_PENDING 1
82#define RDS_IN_XMIT 2
83 83
84struct rds_connection { 84struct rds_connection {
85 struct hlist_node c_hash_node; 85 struct hlist_node c_hash_node;
@@ -91,12 +91,13 @@ struct rds_connection {
91 struct rds_cong_map *c_lcong; 91 struct rds_cong_map *c_lcong;
92 struct rds_cong_map *c_fcong; 92 struct rds_cong_map *c_fcong;
93 93
94 struct mutex c_send_lock; /* protect send ring */
95 struct rds_message *c_xmit_rm; 94 struct rds_message *c_xmit_rm;
96 unsigned long c_xmit_sg; 95 unsigned long c_xmit_sg;
97 unsigned int c_xmit_hdr_off; 96 unsigned int c_xmit_hdr_off;
98 unsigned int c_xmit_data_off; 97 unsigned int c_xmit_data_off;
98 unsigned int c_xmit_atomic_sent;
99 unsigned int c_xmit_rdma_sent; 99 unsigned int c_xmit_rdma_sent;
100 unsigned int c_xmit_data_sent;
100 101
101 spinlock_t c_lock; /* protect msg queues */ 102 spinlock_t c_lock; /* protect msg queues */
102 u64 c_next_tx_seq; 103 u64 c_next_tx_seq;
@@ -116,11 +117,10 @@ struct rds_connection {
116 struct delayed_work c_conn_w; 117 struct delayed_work c_conn_w;
117 struct work_struct c_down_w; 118 struct work_struct c_down_w;
118 struct mutex c_cm_lock; /* protect conn state & cm */ 119 struct mutex c_cm_lock; /* protect conn state & cm */
120 wait_queue_head_t c_waitq;
119 121
120 struct list_head c_map_item; 122 struct list_head c_map_item;
121 unsigned long c_map_queued; 123 unsigned long c_map_queued;
122 unsigned long c_map_offset;
123 unsigned long c_map_bytes;
124 124
125 unsigned int c_unacked_packets; 125 unsigned int c_unacked_packets;
126 unsigned int c_unacked_bytes; 126 unsigned int c_unacked_bytes;
@@ -206,6 +206,48 @@ struct rds_incoming {
206 rds_rdma_cookie_t i_rdma_cookie; 206 rds_rdma_cookie_t i_rdma_cookie;
207}; 207};
208 208
209struct rds_mr {
210 struct rb_node r_rb_node;
211 atomic_t r_refcount;
212 u32 r_key;
213
214 /* A copy of the creation flags */
215 unsigned int r_use_once:1;
216 unsigned int r_invalidate:1;
217 unsigned int r_write:1;
218
219 /* This is for RDS_MR_DEAD.
220 * It would be nice & consistent to make this part of the above
221 * bit field here, but we need to use test_and_set_bit.
222 */
223 unsigned long r_state;
224 struct rds_sock *r_sock; /* back pointer to the socket that owns us */
225 struct rds_transport *r_trans;
226 void *r_trans_private;
227};
228
229/* Flags for mr->r_state */
230#define RDS_MR_DEAD 0
231
232static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
233{
234 return r_key | (((u64) offset) << 32);
235}
236
237static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
238{
239 return cookie;
240}
241
242static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
243{
244 return cookie >> 32;
245}
246
247/* atomic operation types */
248#define RDS_ATOMIC_TYPE_CSWP 0
249#define RDS_ATOMIC_TYPE_FADD 1
250
209/* 251/*
210 * m_sock_item and m_conn_item are on lists that are serialized under 252 * m_sock_item and m_conn_item are on lists that are serialized under
211 * conn->c_lock. m_sock_item has additional meaning in that once it is empty 253 * conn->c_lock. m_sock_item has additional meaning in that once it is empty
@@ -258,13 +300,71 @@ struct rds_message {
258 * -> rs->rs_lock 300 * -> rs->rs_lock
259 */ 301 */
260 spinlock_t m_rs_lock; 302 spinlock_t m_rs_lock;
303 wait_queue_head_t m_flush_wait;
304
261 struct rds_sock *m_rs; 305 struct rds_sock *m_rs;
262 struct rds_rdma_op *m_rdma_op; 306
307 /* cookie to send to remote, in rds header */
263 rds_rdma_cookie_t m_rdma_cookie; 308 rds_rdma_cookie_t m_rdma_cookie;
264 struct rds_mr *m_rdma_mr; 309
265 unsigned int m_nents; 310 unsigned int m_used_sgs;
266 unsigned int m_count; 311 unsigned int m_total_sgs;
267 struct scatterlist m_sg[0]; 312
313 void *m_final_op;
314
315 struct {
316 struct rm_atomic_op {
317 int op_type;
318 union {
319 struct {
320 uint64_t compare;
321 uint64_t swap;
322 uint64_t compare_mask;
323 uint64_t swap_mask;
324 } op_m_cswp;
325 struct {
326 uint64_t add;
327 uint64_t nocarry_mask;
328 } op_m_fadd;
329 };
330
331 u32 op_rkey;
332 u64 op_remote_addr;
333 unsigned int op_notify:1;
334 unsigned int op_recverr:1;
335 unsigned int op_mapped:1;
336 unsigned int op_silent:1;
337 unsigned int op_active:1;
338 struct scatterlist *op_sg;
339 struct rds_notifier *op_notifier;
340
341 struct rds_mr *op_rdma_mr;
342 } atomic;
343 struct rm_rdma_op {
344 u32 op_rkey;
345 u64 op_remote_addr;
346 unsigned int op_write:1;
347 unsigned int op_fence:1;
348 unsigned int op_notify:1;
349 unsigned int op_recverr:1;
350 unsigned int op_mapped:1;
351 unsigned int op_silent:1;
352 unsigned int op_active:1;
353 unsigned int op_bytes;
354 unsigned int op_nents;
355 unsigned int op_count;
356 struct scatterlist *op_sg;
357 struct rds_notifier *op_notifier;
358
359 struct rds_mr *op_rdma_mr;
360 } rdma;
361 struct rm_data_op {
362 unsigned int op_active:1;
363 unsigned int op_nents;
364 unsigned int op_count;
365 struct scatterlist *op_sg;
366 } data;
367 };
268}; 368};
269 369
270/* 370/*
@@ -305,10 +405,6 @@ struct rds_notifier {
305 * transport is responsible for other serialization, including 405 * transport is responsible for other serialization, including
306 * rds_recv_incoming(). This is called in process context but 406 * rds_recv_incoming(). This is called in process context but
307 * should try hard not to block. 407 * should try hard not to block.
308 *
309 * @xmit_cong_map: This asks the transport to send the local bitmap down the
310 * given connection. XXX get a better story about the bitmap
311 * flag and header.
312 */ 408 */
313 409
314#define RDS_TRANS_IB 0 410#define RDS_TRANS_IB 0
@@ -332,13 +428,11 @@ struct rds_transport {
332 void (*xmit_complete)(struct rds_connection *conn); 428 void (*xmit_complete)(struct rds_connection *conn);
333 int (*xmit)(struct rds_connection *conn, struct rds_message *rm, 429 int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
334 unsigned int hdr_off, unsigned int sg, unsigned int off); 430 unsigned int hdr_off, unsigned int sg, unsigned int off);
335 int (*xmit_cong_map)(struct rds_connection *conn, 431 int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
336 struct rds_cong_map *map, unsigned long offset); 432 int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
337 int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
338 int (*recv)(struct rds_connection *conn); 433 int (*recv)(struct rds_connection *conn);
339 int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, 434 int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
340 size_t size); 435 size_t size);
341 void (*inc_purge)(struct rds_incoming *inc);
342 void (*inc_free)(struct rds_incoming *inc); 436 void (*inc_free)(struct rds_incoming *inc);
343 437
344 int (*cm_handle_connect)(struct rdma_cm_id *cm_id, 438 int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
@@ -367,17 +461,11 @@ struct rds_sock {
367 * bound_addr used for both incoming and outgoing, no INADDR_ANY 461 * bound_addr used for both incoming and outgoing, no INADDR_ANY
368 * support. 462 * support.
369 */ 463 */
370 struct rb_node rs_bound_node; 464 struct hlist_node rs_bound_node;
371 __be32 rs_bound_addr; 465 __be32 rs_bound_addr;
372 __be32 rs_conn_addr; 466 __be32 rs_conn_addr;
373 __be16 rs_bound_port; 467 __be16 rs_bound_port;
374 __be16 rs_conn_port; 468 __be16 rs_conn_port;
375
376 /*
377 * This is only used to communicate the transport between bind and
378 * initiating connections. All other trans use is referenced through
379 * the connection.
380 */
381 struct rds_transport *rs_transport; 469 struct rds_transport *rs_transport;
382 470
383 /* 471 /*
@@ -466,8 +554,8 @@ struct rds_statistics {
466 uint64_t s_recv_ping; 554 uint64_t s_recv_ping;
467 uint64_t s_send_queue_empty; 555 uint64_t s_send_queue_empty;
468 uint64_t s_send_queue_full; 556 uint64_t s_send_queue_full;
469 uint64_t s_send_sem_contention; 557 uint64_t s_send_lock_contention;
470 uint64_t s_send_sem_queue_raced; 558 uint64_t s_send_lock_queue_raced;
471 uint64_t s_send_immediate_retry; 559 uint64_t s_send_immediate_retry;
472 uint64_t s_send_delayed_retry; 560 uint64_t s_send_delayed_retry;
473 uint64_t s_send_drop_acked; 561 uint64_t s_send_drop_acked;
@@ -487,6 +575,7 @@ struct rds_statistics {
487}; 575};
488 576
489/* af_rds.c */ 577/* af_rds.c */
578char *rds_str_array(char **array, size_t elements, size_t index);
490void rds_sock_addref(struct rds_sock *rs); 579void rds_sock_addref(struct rds_sock *rs);
491void rds_sock_put(struct rds_sock *rs); 580void rds_sock_put(struct rds_sock *rs);
492void rds_wake_sk_sleep(struct rds_sock *rs); 581void rds_wake_sk_sleep(struct rds_sock *rs);
@@ -521,15 +610,16 @@ void rds_cong_exit(void);
521struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); 610struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
522 611
523/* conn.c */ 612/* conn.c */
524int __init rds_conn_init(void); 613int rds_conn_init(void);
525void rds_conn_exit(void); 614void rds_conn_exit(void);
526struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, 615struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
527 struct rds_transport *trans, gfp_t gfp); 616 struct rds_transport *trans, gfp_t gfp);
528struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, 617struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
529 struct rds_transport *trans, gfp_t gfp); 618 struct rds_transport *trans, gfp_t gfp);
619void rds_conn_shutdown(struct rds_connection *conn);
530void rds_conn_destroy(struct rds_connection *conn); 620void rds_conn_destroy(struct rds_connection *conn);
531void rds_conn_reset(struct rds_connection *conn);
532void rds_conn_drop(struct rds_connection *conn); 621void rds_conn_drop(struct rds_connection *conn);
622void rds_conn_connect_if_down(struct rds_connection *conn);
533void rds_for_each_conn_info(struct socket *sock, unsigned int len, 623void rds_for_each_conn_info(struct socket *sock, unsigned int len,
534 struct rds_info_iterator *iter, 624 struct rds_info_iterator *iter,
535 struct rds_info_lengths *lens, 625 struct rds_info_lengths *lens,
@@ -566,7 +656,8 @@ rds_conn_connecting(struct rds_connection *conn)
566 656
567/* message.c */ 657/* message.c */
568struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); 658struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
569struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, 659struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
660int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
570 size_t total_len); 661 size_t total_len);
571struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); 662struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
572void rds_message_populate_header(struct rds_header *hdr, __be16 sport, 663void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
@@ -575,12 +666,9 @@ int rds_message_add_extension(struct rds_header *hdr,
575 unsigned int type, const void *data, unsigned int len); 666 unsigned int type, const void *data, unsigned int len);
576int rds_message_next_extension(struct rds_header *hdr, 667int rds_message_next_extension(struct rds_header *hdr,
577 unsigned int *pos, void *buf, unsigned int *buflen); 668 unsigned int *pos, void *buf, unsigned int *buflen);
578int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
579int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
580int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); 669int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
581int rds_message_inc_copy_to_user(struct rds_incoming *inc, 670int rds_message_inc_copy_to_user(struct rds_incoming *inc,
582 struct iovec *first_iov, size_t size); 671 struct iovec *first_iov, size_t size);
583void rds_message_inc_purge(struct rds_incoming *inc);
584void rds_message_inc_free(struct rds_incoming *inc); 672void rds_message_inc_free(struct rds_incoming *inc);
585void rds_message_addref(struct rds_message *rm); 673void rds_message_addref(struct rds_message *rm);
586void rds_message_put(struct rds_message *rm); 674void rds_message_put(struct rds_message *rm);
@@ -614,7 +702,6 @@ void rds_page_exit(void);
614/* recv.c */ 702/* recv.c */
615void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 703void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
616 __be32 saddr); 704 __be32 saddr);
617void rds_inc_addref(struct rds_incoming *inc);
618void rds_inc_put(struct rds_incoming *inc); 705void rds_inc_put(struct rds_incoming *inc);
619void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, 706void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
620 struct rds_incoming *inc, gfp_t gfp, enum km_type km); 707 struct rds_incoming *inc, gfp_t gfp, enum km_type km);
@@ -636,14 +723,38 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
636typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); 723typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
637void rds_send_drop_acked(struct rds_connection *conn, u64 ack, 724void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
638 is_acked_func is_acked); 725 is_acked_func is_acked);
639int rds_send_acked_before(struct rds_connection *conn, u64 seq);
640void rds_send_remove_from_sock(struct list_head *messages, int status);
641int rds_send_pong(struct rds_connection *conn, __be16 dport); 726int rds_send_pong(struct rds_connection *conn, __be16 dport);
642struct rds_message *rds_send_get_message(struct rds_connection *, 727struct rds_message *rds_send_get_message(struct rds_connection *,
643 struct rds_rdma_op *); 728 struct rm_rdma_op *);
644 729
645/* rdma.c */ 730/* rdma.c */
646void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); 731void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
732int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
733int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
734int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
735void rds_rdma_drop_keys(struct rds_sock *rs);
736int rds_rdma_extra_size(struct rds_rdma_args *args);
737int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
738 struct cmsghdr *cmsg);
739int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
740 struct cmsghdr *cmsg);
741int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
742 struct cmsghdr *cmsg);
743int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
744 struct cmsghdr *cmsg);
745void rds_rdma_free_op(struct rm_rdma_op *ro);
746void rds_atomic_free_op(struct rm_atomic_op *ao);
747void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
748void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
749int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
750 struct cmsghdr *cmsg);
751
752extern void __rds_put_mr_final(struct rds_mr *mr);
753static inline void rds_mr_put(struct rds_mr *mr)
754{
755 if (atomic_dec_and_test(&mr->r_refcount))
756 __rds_put_mr_final(mr);
757}
647 758
648/* stats.c */ 759/* stats.c */
649DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); 760DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
@@ -657,14 +768,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
657 put_cpu(); \ 768 put_cpu(); \
658} while (0) 769} while (0)
659#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) 770#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
660int __init rds_stats_init(void); 771int rds_stats_init(void);
661void rds_stats_exit(void); 772void rds_stats_exit(void);
662void rds_stats_info_copy(struct rds_info_iterator *iter, 773void rds_stats_info_copy(struct rds_info_iterator *iter,
663 uint64_t *values, const char *const *names, 774 uint64_t *values, const char *const *names,
664 size_t nr); 775 size_t nr);
665 776
666/* sysctl.c */ 777/* sysctl.c */
667int __init rds_sysctl_init(void); 778int rds_sysctl_init(void);
668void rds_sysctl_exit(void); 779void rds_sysctl_exit(void);
669extern unsigned long rds_sysctl_sndbuf_min; 780extern unsigned long rds_sysctl_sndbuf_min;
670extern unsigned long rds_sysctl_sndbuf_default; 781extern unsigned long rds_sysctl_sndbuf_default;
@@ -678,9 +789,10 @@ extern unsigned long rds_sysctl_trace_flags;
678extern unsigned int rds_sysctl_trace_level; 789extern unsigned int rds_sysctl_trace_level;
679 790
680/* threads.c */ 791/* threads.c */
681int __init rds_threads_init(void); 792int rds_threads_init(void);
682void rds_threads_exit(void); 793void rds_threads_exit(void);
683extern struct workqueue_struct *rds_wq; 794extern struct workqueue_struct *rds_wq;
795void rds_queue_reconnect(struct rds_connection *conn);
684void rds_connect_worker(struct work_struct *); 796void rds_connect_worker(struct work_struct *);
685void rds_shutdown_worker(struct work_struct *); 797void rds_shutdown_worker(struct work_struct *);
686void rds_send_worker(struct work_struct *); 798void rds_send_worker(struct work_struct *);
@@ -691,9 +803,10 @@ void rds_connect_complete(struct rds_connection *conn);
691int rds_trans_register(struct rds_transport *trans); 803int rds_trans_register(struct rds_transport *trans);
692void rds_trans_unregister(struct rds_transport *trans); 804void rds_trans_unregister(struct rds_transport *trans);
693struct rds_transport *rds_trans_get_preferred(__be32 addr); 805struct rds_transport *rds_trans_get_preferred(__be32 addr);
806void rds_trans_put(struct rds_transport *trans);
694unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, 807unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
695 unsigned int avail); 808 unsigned int avail);
696int __init rds_trans_init(void); 809int rds_trans_init(void);
697void rds_trans_exit(void); 810void rds_trans_exit(void);
698 811
699#endif 812#endif
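
The rds_rdma_make_cookie()/rds_rdma_cookie_key()/rds_rdma_cookie_offset() helpers that moved into rds.h above pack an R_Key into the low 32 bits of the cookie and the byte offset into the high 32. A standalone round-trip check of that layout, assuming rds_rdma_cookie_t is a plain u64:

    #include <stdint.h>
    #include <assert.h>

    typedef uint64_t rds_rdma_cookie_t;     /* assumption: matches the kernel typedef */

    static rds_rdma_cookie_t make_cookie(uint32_t r_key, uint32_t offset)
    {
            return r_key | ((uint64_t)offset << 32);
    }

    static uint32_t cookie_key(rds_rdma_cookie_t c)    { return (uint32_t)c; }
    static uint32_t cookie_offset(rds_rdma_cookie_t c) { return (uint32_t)(c >> 32); }

    int main(void)
    {
            rds_rdma_cookie_t c = make_cookie(0xdeadbeef, 0x1000);

            /* key comes back out of the low word, offset out of the high word */
            assert(cookie_key(c) == 0xdeadbeef);
            assert(cookie_offset(c) == 0x1000);
            return 0;
    }
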
diff --git a/net/rds/recv.c b/net/rds/recv.c
index c93588c2d553..596689e59272 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -36,7 +36,6 @@
36#include <linux/in.h> 36#include <linux/in.h>
37 37
38#include "rds.h" 38#include "rds.h"
39#include "rdma.h"
40 39
41void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 40void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
42 __be32 saddr) 41 __be32 saddr)
@@ -49,12 +48,11 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
49} 48}
50EXPORT_SYMBOL_GPL(rds_inc_init); 49EXPORT_SYMBOL_GPL(rds_inc_init);
51 50
52void rds_inc_addref(struct rds_incoming *inc) 51static void rds_inc_addref(struct rds_incoming *inc)
53{ 52{
54 rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); 53 rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
55 atomic_inc(&inc->i_refcount); 54 atomic_inc(&inc->i_refcount);
56} 55}
57EXPORT_SYMBOL_GPL(rds_inc_addref);
58 56
59void rds_inc_put(struct rds_incoming *inc) 57void rds_inc_put(struct rds_incoming *inc)
60{ 58{
@@ -210,7 +208,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
210 } 208 }
211 209
212 rs = rds_find_bound(daddr, inc->i_hdr.h_dport); 210 rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
213 if (rs == NULL) { 211 if (!rs) {
214 rds_stats_inc(s_recv_drop_no_sock); 212 rds_stats_inc(s_recv_drop_no_sock);
215 goto out; 213 goto out;
216 } 214 }
@@ -251,7 +249,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
251{ 249{
252 unsigned long flags; 250 unsigned long flags;
253 251
254 if (*inc == NULL) { 252 if (!*inc) {
255 read_lock_irqsave(&rs->rs_recv_lock, flags); 253 read_lock_irqsave(&rs->rs_recv_lock, flags);
256 if (!list_empty(&rs->rs_recv_queue)) { 254 if (!list_empty(&rs->rs_recv_queue)) {
257 *inc = list_entry(rs->rs_recv_queue.next, 255 *inc = list_entry(rs->rs_recv_queue.next,
@@ -334,10 +332,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
334 332
335 if (msghdr) { 333 if (msghdr) {
336 cmsg.user_token = notifier->n_user_token; 334 cmsg.user_token = notifier->n_user_token;
337 cmsg.status = notifier->n_status; 335 cmsg.status = notifier->n_status;
338 336
339 err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, 337 err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
340 sizeof(cmsg), &cmsg); 338 sizeof(cmsg), &cmsg);
341 if (err) 339 if (err)
342 break; 340 break;
343 } 341 }
diff --git a/net/rds/send.c b/net/rds/send.c
index 9c1c6bcaa6c9..d58ae5f9339e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -37,7 +37,6 @@
37#include <linux/list.h> 37#include <linux/list.h>
38 38
39#include "rds.h" 39#include "rds.h"
40#include "rdma.h"
41 40
42/* When transmitting messages in rds_send_xmit, we need to emerge from 41/* When transmitting messages in rds_send_xmit, we need to emerge from
43 * time to time and briefly release the CPU. Otherwise the softlock watchdog 42 * time to time and briefly release the CPU. Otherwise the softlock watchdog
@@ -53,8 +52,11 @@ static int send_batch_count = 64;
53module_param(send_batch_count, int, 0444); 52module_param(send_batch_count, int, 0444);
54MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); 53MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
55 54
55static void rds_send_remove_from_sock(struct list_head *messages, int status);
56
56/* 57/*
57 * Reset the send state. Caller must hold c_send_lock when calling here. 58 * Reset the send state. Callers must ensure that this doesn't race with
59 * rds_send_xmit().
58 */ 60 */
59void rds_send_reset(struct rds_connection *conn) 61void rds_send_reset(struct rds_connection *conn)
60{ 62{
@@ -62,18 +64,22 @@ void rds_send_reset(struct rds_connection *conn)
62 unsigned long flags; 64 unsigned long flags;
63 65
64 if (conn->c_xmit_rm) { 66 if (conn->c_xmit_rm) {
67 rm = conn->c_xmit_rm;
68 conn->c_xmit_rm = NULL;
65 /* Tell the user the RDMA op is no longer mapped by the 69 /* Tell the user the RDMA op is no longer mapped by the
66 * transport. This isn't entirely true (it's flushed out 70 * transport. This isn't entirely true (it's flushed out
67 * independently) but as the connection is down, there's 71 * independently) but as the connection is down, there's
68 * no ongoing RDMA to/from that memory */ 72 * no ongoing RDMA to/from that memory */
69 rds_message_unmapped(conn->c_xmit_rm); 73 rds_message_unmapped(rm);
70 rds_message_put(conn->c_xmit_rm); 74 rds_message_put(rm);
71 conn->c_xmit_rm = NULL;
72 } 75 }
76
73 conn->c_xmit_sg = 0; 77 conn->c_xmit_sg = 0;
74 conn->c_xmit_hdr_off = 0; 78 conn->c_xmit_hdr_off = 0;
75 conn->c_xmit_data_off = 0; 79 conn->c_xmit_data_off = 0;
80 conn->c_xmit_atomic_sent = 0;
76 conn->c_xmit_rdma_sent = 0; 81 conn->c_xmit_rdma_sent = 0;
82 conn->c_xmit_data_sent = 0;
77 83
78 conn->c_map_queued = 0; 84 conn->c_map_queued = 0;
79 85
@@ -90,8 +96,27 @@ void rds_send_reset(struct rds_connection *conn)
90 spin_unlock_irqrestore(&conn->c_lock, flags); 96 spin_unlock_irqrestore(&conn->c_lock, flags);
91} 97}
92 98
99static int acquire_in_xmit(struct rds_connection *conn)
100{
101 return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
102}
103
104static void release_in_xmit(struct rds_connection *conn)
105{
106 clear_bit(RDS_IN_XMIT, &conn->c_flags);
107 smp_mb__after_clear_bit();
108 /*
109 * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
110 * hot path and finding waiters is very rare. We don't want to walk
111 * the system-wide hashed waitqueue buckets in the fast path only to
112 * almost never find waiters.
113 */
114 if (waitqueue_active(&conn->c_waitq))
115 wake_up_all(&conn->c_waitq);
116}
117
93/* 118/*
94 * We're making the concious trade-off here to only send one message 119 * We're making the conscious trade-off here to only send one message
95 * down the connection at a time. 120 * down the connection at a time.
96 * Pro: 121 * Pro:
97 * - tx queueing is a simple fifo list 122 * - tx queueing is a simple fifo list
@@ -109,102 +134,69 @@ int rds_send_xmit(struct rds_connection *conn)
109 struct rds_message *rm; 134 struct rds_message *rm;
110 unsigned long flags; 135 unsigned long flags;
111 unsigned int tmp; 136 unsigned int tmp;
112 unsigned int send_quota = send_batch_count;
113 struct scatterlist *sg; 137 struct scatterlist *sg;
114 int ret = 0; 138 int ret = 0;
115 int was_empty = 0;
116 LIST_HEAD(to_be_dropped); 139 LIST_HEAD(to_be_dropped);
117 140
141restart:
142
118 /* 143 /*
119 * sendmsg calls here after having queued its message on the send 144 * sendmsg calls here after having queued its message on the send
120 * queue. We only have one task feeding the connection at a time. If 145 * queue. We only have one task feeding the connection at a time. If
121 * another thread is already feeding the queue then we back off. This 146 * another thread is already feeding the queue then we back off. This
122 * avoids blocking the caller and trading per-connection data between 147 * avoids blocking the caller and trading per-connection data between
123 * caches per message. 148 * caches per message.
124 *
125 * The sem holder will issue a retry if they notice that someone queued
126 * a message after they stopped walking the send queue but before they
127 * dropped the sem.
128 */ 149 */
129 if (!mutex_trylock(&conn->c_send_lock)) { 150 if (!acquire_in_xmit(conn)) {
130 rds_stats_inc(s_send_sem_contention); 151 rds_stats_inc(s_send_lock_contention);
131 ret = -ENOMEM; 152 ret = -ENOMEM;
132 goto out; 153 goto out;
133 } 154 }
134 155
156 /*
157 * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
158 * we do the opposite to avoid races.
159 */
160 if (!rds_conn_up(conn)) {
161 release_in_xmit(conn);
162 ret = 0;
163 goto out;
164 }
165
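
Illustrative sketch (not from the patch): a minimal two-thread model of that ordering, assuming C11 seq_cst atomics. Shutdown publishes the state change and then looks at the xmit bit; the xmit path sets the bit and then re-checks the state, so at least one side observes the other and both can never proceed at once.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool conn_up = true;                /* stand-in for the conn state */
static atomic_bool in_xmit;                       /* stand-in for RDS_IN_XMIT    */

static bool xmit_enter(void)
{
        if (atomic_exchange(&in_xmit, true))      /* take the bit first ...      */
                return false;                     /* another thread is sending   */
        if (!atomic_load(&conn_up)) {             /* ... then test the state     */
                atomic_store(&in_xmit, false);    /* release the bit again       */
                return false;
        }
        return true;                              /* safe to transmit            */
}

static bool shutdown_sees_idle(void)
{
        atomic_store(&conn_up, false);            /* set the state first ...     */
        return !atomic_load(&in_xmit);            /* ... then test the bit       */
        /* the real shutdown path waits for the bit to clear instead of polling */
}
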
135 if (conn->c_trans->xmit_prepare) 166 if (conn->c_trans->xmit_prepare)
136 conn->c_trans->xmit_prepare(conn); 167 conn->c_trans->xmit_prepare(conn);
137 168
138 /* 169 /*
139 * spin trying to push headers and data down the connection until 170 * spin trying to push headers and data down the connection until
140 * the connection doens't make forward progress. 171 * the connection doesn't make forward progress.
141 */ 172 */
142 while (--send_quota) { 173 while (1) {
143 /*
144 * See if need to send a congestion map update if we're
145 * between sending messages. The send_sem protects our sole
146 * use of c_map_offset and _bytes.
147 * Note this is used only by transports that define a special
148 * xmit_cong_map function. For all others, we create allocate
149 * a cong_map message and treat it just like any other send.
150 */
151 if (conn->c_map_bytes) {
152 ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
153 conn->c_map_offset);
154 if (ret <= 0)
155 break;
156 174
157 conn->c_map_offset += ret;
158 conn->c_map_bytes -= ret;
159 if (conn->c_map_bytes)
160 continue;
161 }
162
163 /* If we're done sending the current message, clear the
164 * offset and S/G temporaries.
165 */
166 rm = conn->c_xmit_rm; 175 rm = conn->c_xmit_rm;
167 if (rm != NULL &&
168 conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
169 conn->c_xmit_sg == rm->m_nents) {
170 conn->c_xmit_rm = NULL;
171 conn->c_xmit_sg = 0;
172 conn->c_xmit_hdr_off = 0;
173 conn->c_xmit_data_off = 0;
174 conn->c_xmit_rdma_sent = 0;
175
176 /* Release the reference to the previous message. */
177 rds_message_put(rm);
178 rm = NULL;
179 }
180 176
181 /* If we're asked to send a cong map update, do so. 177 /*
178 * If between sending messages, we can send a pending congestion
179 * map update.
182 */ 180 */
183 if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { 181 if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
184 if (conn->c_trans->xmit_cong_map != NULL) {
185 conn->c_map_offset = 0;
186 conn->c_map_bytes = sizeof(struct rds_header) +
187 RDS_CONG_MAP_BYTES;
188 continue;
189 }
190
191 rm = rds_cong_update_alloc(conn); 182 rm = rds_cong_update_alloc(conn);
192 if (IS_ERR(rm)) { 183 if (IS_ERR(rm)) {
193 ret = PTR_ERR(rm); 184 ret = PTR_ERR(rm);
194 break; 185 break;
195 } 186 }
187 rm->data.op_active = 1;
196 188
197 conn->c_xmit_rm = rm; 189 conn->c_xmit_rm = rm;
198 } 190 }
199 191
200 /* 192 /*
201 * Grab the next message from the send queue, if there is one. 193 * If not already working on one, grab the next message.
202 * 194 *
203 * c_xmit_rm holds a ref while we're sending this message down 195 * c_xmit_rm holds a ref while we're sending this message down
204 * the connection. We can use this ref while holding the 196 * the connection. We can use this ref while holding the
205 * send_sem.. rds_send_reset() is serialized with it. 197 * send_sem.. rds_send_reset() is serialized with it.
206 */ 198 */
207 if (rm == NULL) { 199 if (!rm) {
208 unsigned int len; 200 unsigned int len;
209 201
210 spin_lock_irqsave(&conn->c_lock, flags); 202 spin_lock_irqsave(&conn->c_lock, flags);
@@ -224,10 +216,8 @@ int rds_send_xmit(struct rds_connection *conn)
224 216
225 spin_unlock_irqrestore(&conn->c_lock, flags); 217 spin_unlock_irqrestore(&conn->c_lock, flags);
226 218
227 if (rm == NULL) { 219 if (!rm)
228 was_empty = 1;
229 break; 220 break;
230 }
231 221
232 /* Unfortunately, the way Infiniband deals with 222 /* Unfortunately, the way Infiniband deals with
233 * RDMA to a bad MR key is by moving the entire 223 * RDMA to a bad MR key is by moving the entire
@@ -236,13 +226,12 @@ int rds_send_xmit(struct rds_connection *conn)
236 * connection. 226 * connection.
237 * Therefore, we never retransmit messages with RDMA ops. 227 * Therefore, we never retransmit messages with RDMA ops.
238 */ 228 */
239 if (rm->m_rdma_op && 229 if (rm->rdma.op_active &&
240 test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { 230 test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
241 spin_lock_irqsave(&conn->c_lock, flags); 231 spin_lock_irqsave(&conn->c_lock, flags);
242 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) 232 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
243 list_move(&rm->m_conn_item, &to_be_dropped); 233 list_move(&rm->m_conn_item, &to_be_dropped);
244 spin_unlock_irqrestore(&conn->c_lock, flags); 234 spin_unlock_irqrestore(&conn->c_lock, flags);
245 rds_message_put(rm);
246 continue; 235 continue;
247 } 236 }
248 237
@@ -263,23 +252,55 @@ int rds_send_xmit(struct rds_connection *conn)
263 conn->c_xmit_rm = rm; 252 conn->c_xmit_rm = rm;
264 } 253 }
265 254
266 /* 255 /* The transport either sends the whole rdma or none of it */
267 * Try and send an rdma message. Let's see if we can 256 if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
268 * keep this simple and require that the transport either 257 rm->m_final_op = &rm->rdma;
269 * send the whole rdma or none of it. 258 ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
270 */
271 if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
272 ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
273 if (ret) 259 if (ret)
274 break; 260 break;
275 conn->c_xmit_rdma_sent = 1; 261 conn->c_xmit_rdma_sent = 1;
262
263 /* The transport owns the mapped memory for now.
264 * You can't unmap it while it's on the send queue */
265 set_bit(RDS_MSG_MAPPED, &rm->m_flags);
266 }
267
268 if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
269 rm->m_final_op = &rm->atomic;
270 ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
271 if (ret)
272 break;
273 conn->c_xmit_atomic_sent = 1;
274
276 /* The transport owns the mapped memory for now. 275 /* The transport owns the mapped memory for now.
277 * You can't unmap it while it's on the send queue */ 276 * You can't unmap it while it's on the send queue */
278 set_bit(RDS_MSG_MAPPED, &rm->m_flags); 277 set_bit(RDS_MSG_MAPPED, &rm->m_flags);
279 } 278 }
280 279
281 if (conn->c_xmit_hdr_off < sizeof(struct rds_header) || 280 /*
282 conn->c_xmit_sg < rm->m_nents) { 281 * A number of cases require an RDS header to be sent
282 * even if there is no data.
283 * We permit 0-byte sends; rds-ping depends on this.
284 * However, if there are exclusively attached silent ops,
285 * we skip the hdr/data send, to enable silent operation.
286 */
287 if (rm->data.op_nents == 0) {
288 int ops_present;
289 int all_ops_are_silent = 1;
290
291 ops_present = (rm->atomic.op_active || rm->rdma.op_active);
292 if (rm->atomic.op_active && !rm->atomic.op_silent)
293 all_ops_are_silent = 0;
294 if (rm->rdma.op_active && !rm->rdma.op_silent)
295 all_ops_are_silent = 0;
296
297 if (ops_present && all_ops_are_silent
298 && !rm->m_rdma_cookie)
299 rm->data.op_active = 0;
300 }
301
302 if (rm->data.op_active && !conn->c_xmit_data_sent) {
303 rm->m_final_op = &rm->data;
283 ret = conn->c_trans->xmit(conn, rm, 304 ret = conn->c_trans->xmit(conn, rm,
284 conn->c_xmit_hdr_off, 305 conn->c_xmit_hdr_off,
285 conn->c_xmit_sg, 306 conn->c_xmit_sg,
@@ -295,7 +316,7 @@ int rds_send_xmit(struct rds_connection *conn)
295 ret -= tmp; 316 ret -= tmp;
296 } 317 }
297 318
298 sg = &rm->m_sg[conn->c_xmit_sg]; 319 sg = &rm->data.op_sg[conn->c_xmit_sg];
299 while (ret) { 320 while (ret) {
300 tmp = min_t(int, ret, sg->length - 321 tmp = min_t(int, ret, sg->length -
301 conn->c_xmit_data_off); 322 conn->c_xmit_data_off);
@@ -306,49 +327,63 @@ int rds_send_xmit(struct rds_connection *conn)
306 sg++; 327 sg++;
307 conn->c_xmit_sg++; 328 conn->c_xmit_sg++;
308 BUG_ON(ret != 0 && 329 BUG_ON(ret != 0 &&
309 conn->c_xmit_sg == rm->m_nents); 330 conn->c_xmit_sg == rm->data.op_nents);
310 } 331 }
311 } 332 }
333
334 if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
335 (conn->c_xmit_sg == rm->data.op_nents))
336 conn->c_xmit_data_sent = 1;
312 } 337 }
313 }
314 338
315 /* Nuke any messages we decided not to retransmit. */ 339 /*
316 if (!list_empty(&to_be_dropped)) 340 * A rm will only make multiple passes through this loop
317 rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); 341 * if there is a data op. Thus, if the data is sent (or there was
342 * none), then we're done with the rm.
343 */
344 if (!rm->data.op_active || conn->c_xmit_data_sent) {
345 conn->c_xmit_rm = NULL;
346 conn->c_xmit_sg = 0;
347 conn->c_xmit_hdr_off = 0;
348 conn->c_xmit_data_off = 0;
349 conn->c_xmit_rdma_sent = 0;
350 conn->c_xmit_atomic_sent = 0;
351 conn->c_xmit_data_sent = 0;
352
353 rds_message_put(rm);
354 }
355 }
318 356
319 if (conn->c_trans->xmit_complete) 357 if (conn->c_trans->xmit_complete)
320 conn->c_trans->xmit_complete(conn); 358 conn->c_trans->xmit_complete(conn);
321 359
322 /* 360 release_in_xmit(conn);
323 * We might be racing with another sender who queued a message but
324 * backed off on noticing that we held the c_send_lock. If we check
325 * for queued messages after dropping the sem then either we'll
326 * see the queued message or the queuer will get the sem. If we
327 * notice the queued message then we trigger an immediate retry.
328 *
329 * We need to be careful only to do this when we stopped processing
330 * the send queue because it was empty. It's the only way we
331 * stop processing the loop when the transport hasn't taken
332 * responsibility for forward progress.
333 */
334 mutex_unlock(&conn->c_send_lock);
335 361
336 if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { 362 /* Nuke any messages we decided not to retransmit. */
337 /* We exhausted the send quota, but there's work left to 363 if (!list_empty(&to_be_dropped)) {
338 * do. Return and (re-)schedule the send worker. 364 /* irqs on here, so we can put(), unlike above */
339 */ 365 list_for_each_entry(rm, &to_be_dropped, m_conn_item)
340 ret = -EAGAIN; 366 rds_message_put(rm);
367 rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
341 } 368 }
342 369
343 if (ret == 0 && was_empty) { 370 /*
344 /* A simple bit test would be way faster than taking the 371 * Other senders can queue a message after we last test the send queue
345 * spin lock */ 372 * but before we clear RDS_IN_XMIT. In that case they'd back off and
346 spin_lock_irqsave(&conn->c_lock, flags); 373 * not try and send their newly queued message. We need to check the
374 * send queue after having cleared RDS_IN_XMIT so that their message
375 * doesn't get stuck on the send queue.
376 *
377 * If the transport cannot continue (i.e ret != 0), then it must
378 * call us when more room is available, such as from the tx
379 * completion handler.
380 */
381 if (ret == 0) {
382 smp_mb();
347 if (!list_empty(&conn->c_send_queue)) { 383 if (!list_empty(&conn->c_send_queue)) {
348 rds_stats_inc(s_send_sem_queue_raced); 384 rds_stats_inc(s_send_lock_queue_raced);
349 ret = -EAGAIN; 385 goto restart;
350 } 386 }
351 spin_unlock_irqrestore(&conn->c_lock, flags);
352 } 387 }
353out: 388out:
354 return ret; 389 return ret;
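
Illustrative sketch (not from the patch): a small userspace model of the re-check above, assuming C11 seq_cst atomics in place of test_and_set_bit() and smp_mb(), with a counter standing in for the send queue. A sender queues before trying to acquire, and the transmitter re-checks the queue after releasing, so a message queued in that window is never stranded.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool in_xmit;                       /* stand-in for RDS_IN_XMIT  */
static atomic_int  queued;                        /* stand-in for c_send_queue */

static void transmit_loop(void);                  /* runs with in_xmit held    */

static void sender_enqueue(void)
{
        atomic_fetch_add(&queued, 1);             /* queue the message first ...    */
        if (!atomic_exchange(&in_xmit, true))
                transmit_loop();                  /* ... then try to drive the conn */
        /* else: the current owner's re-check below will see our message */
}

static void transmit_loop(void)
{
        for (;;) {
                while (atomic_load(&queued) > 0)
                        atomic_fetch_sub(&queued, 1);     /* "send" one message */

                atomic_store(&in_xmit, false);            /* release the bit    */
                /* seq_cst ordering stands in for the smp_mb() above */
                if (atomic_load(&queued) == 0)
                        return;                           /* truly idle         */
                if (atomic_exchange(&in_xmit, true))
                        return;                   /* another sender took over   */
                /* we re-acquired: go around and drain the late arrival */
        }
}
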
@@ -376,52 +411,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
376} 411}
377 412
378/* 413/*
379 * Returns true if there are no messages on the send and retransmit queues 414 * This is pretty similar to what happens below in the ACK
380 * which have a sequence number greater than or equal to the given sequence 415 * handling code - except that we call here as soon as we get
381 * number. 416 * the IB send completion on the RDMA op and the accompanying
417 * message.
382 */ 418 */
383int rds_send_acked_before(struct rds_connection *conn, u64 seq) 419void rds_rdma_send_complete(struct rds_message *rm, int status)
384{ 420{
385 struct rds_message *rm, *tmp; 421 struct rds_sock *rs = NULL;
386 int ret = 1; 422 struct rm_rdma_op *ro;
423 struct rds_notifier *notifier;
424 unsigned long flags;
387 425
388 spin_lock(&conn->c_lock); 426 spin_lock_irqsave(&rm->m_rs_lock, flags);
389 427
390 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { 428 ro = &rm->rdma;
391 if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) 429 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
392 ret = 0; 430 ro->op_active && ro->op_notify && ro->op_notifier) {
393 break; 431 notifier = ro->op_notifier;
394 } 432 rs = rm->m_rs;
433 sock_hold(rds_rs_to_sk(rs));
395 434
396 list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { 435 notifier->n_status = status;
397 if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) 436 spin_lock(&rs->rs_lock);
398 ret = 0; 437 list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
399 break; 438 spin_unlock(&rs->rs_lock);
439
440 ro->op_notifier = NULL;
400 } 441 }
401 442
402 spin_unlock(&conn->c_lock); 443 spin_unlock_irqrestore(&rm->m_rs_lock, flags);
403 444
404 return ret; 445 if (rs) {
446 rds_wake_sk_sleep(rs);
447 sock_put(rds_rs_to_sk(rs));
448 }
405} 449}
450EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
406 451
407/* 452/*
408 * This is pretty similar to what happens below in the ACK 453 * Just like above, except looks at atomic op
409 * handling code - except that we call here as soon as we get
410 * the IB send completion on the RDMA op and the accompanying
411 * message.
412 */ 454 */
413void rds_rdma_send_complete(struct rds_message *rm, int status) 455void rds_atomic_send_complete(struct rds_message *rm, int status)
414{ 456{
415 struct rds_sock *rs = NULL; 457 struct rds_sock *rs = NULL;
416 struct rds_rdma_op *ro; 458 struct rm_atomic_op *ao;
417 struct rds_notifier *notifier; 459 struct rds_notifier *notifier;
460 unsigned long flags;
418 461
419 spin_lock(&rm->m_rs_lock); 462 spin_lock_irqsave(&rm->m_rs_lock, flags);
420 463
421 ro = rm->m_rdma_op; 464 ao = &rm->atomic;
422 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && 465 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
423 ro && ro->r_notify && ro->r_notifier) { 466 && ao->op_active && ao->op_notify && ao->op_notifier) {
424 notifier = ro->r_notifier; 467 notifier = ao->op_notifier;
425 rs = rm->m_rs; 468 rs = rm->m_rs;
426 sock_hold(rds_rs_to_sk(rs)); 469 sock_hold(rds_rs_to_sk(rs));
427 470
@@ -430,17 +473,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
430 list_add_tail(&notifier->n_list, &rs->rs_notify_queue); 473 list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
431 spin_unlock(&rs->rs_lock); 474 spin_unlock(&rs->rs_lock);
432 475
433 ro->r_notifier = NULL; 476 ao->op_notifier = NULL;
434 } 477 }
435 478
436 spin_unlock(&rm->m_rs_lock); 479 spin_unlock_irqrestore(&rm->m_rs_lock, flags);
437 480
438 if (rs) { 481 if (rs) {
439 rds_wake_sk_sleep(rs); 482 rds_wake_sk_sleep(rs);
440 sock_put(rds_rs_to_sk(rs)); 483 sock_put(rds_rs_to_sk(rs));
441 } 484 }
442} 485}
443EXPORT_SYMBOL_GPL(rds_rdma_send_complete); 486EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
444 487
445/* 488/*
446 * This is the same as rds_rdma_send_complete except we 489 * This is the same as rds_rdma_send_complete except we
@@ -448,15 +491,23 @@ EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
448 * socket, socket lock) and can just move the notifier. 491 * socket, socket lock) and can just move the notifier.
449 */ 492 */
450static inline void 493static inline void
451__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) 494__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
452{ 495{
453 struct rds_rdma_op *ro; 496 struct rm_rdma_op *ro;
497 struct rm_atomic_op *ao;
498
499 ro = &rm->rdma;
500 if (ro->op_active && ro->op_notify && ro->op_notifier) {
501 ro->op_notifier->n_status = status;
502 list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
503 ro->op_notifier = NULL;
504 }
454 505
455 ro = rm->m_rdma_op; 506 ao = &rm->atomic;
456 if (ro && ro->r_notify && ro->r_notifier) { 507 if (ao->op_active && ao->op_notify && ao->op_notifier) {
457 ro->r_notifier->n_status = status; 508 ao->op_notifier->n_status = status;
458 list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); 509 list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
459 ro->r_notifier = NULL; 510 ao->op_notifier = NULL;
460 } 511 }
461 512
462 /* No need to wake the app - caller does this */ 513 /* No need to wake the app - caller does this */
@@ -468,7 +519,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status
468 * So speed is not an issue here. 519 * So speed is not an issue here.
469 */ 520 */
470struct rds_message *rds_send_get_message(struct rds_connection *conn, 521struct rds_message *rds_send_get_message(struct rds_connection *conn,
471 struct rds_rdma_op *op) 522 struct rm_rdma_op *op)
472{ 523{
473 struct rds_message *rm, *tmp, *found = NULL; 524 struct rds_message *rm, *tmp, *found = NULL;
474 unsigned long flags; 525 unsigned long flags;
@@ -476,7 +527,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
476 spin_lock_irqsave(&conn->c_lock, flags); 527 spin_lock_irqsave(&conn->c_lock, flags);
477 528
478 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { 529 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
479 if (rm->m_rdma_op == op) { 530 if (&rm->rdma == op) {
480 atomic_inc(&rm->m_refcount); 531 atomic_inc(&rm->m_refcount);
481 found = rm; 532 found = rm;
482 goto out; 533 goto out;
@@ -484,7 +535,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
484 } 535 }
485 536
486 list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { 537 list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
487 if (rm->m_rdma_op == op) { 538 if (&rm->rdma == op) {
488 atomic_inc(&rm->m_refcount); 539 atomic_inc(&rm->m_refcount);
489 found = rm; 540 found = rm;
490 break; 541 break;
@@ -506,7 +557,7 @@ EXPORT_SYMBOL_GPL(rds_send_get_message);
506 * removing the messages from the 'messages' list regardless of if it found 557 * removing the messages from the 'messages' list regardless of if it found
507 * the messages on the socket list or not. 558 * the messages on the socket list or not.
508 */ 559 */
509void rds_send_remove_from_sock(struct list_head *messages, int status) 560static void rds_send_remove_from_sock(struct list_head *messages, int status)
510{ 561{
511 unsigned long flags; 562 unsigned long flags;
512 struct rds_sock *rs = NULL; 563 struct rds_sock *rs = NULL;
@@ -544,19 +595,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status)
544 spin_lock(&rs->rs_lock); 595 spin_lock(&rs->rs_lock);
545 596
546 if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { 597 if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
547 struct rds_rdma_op *ro = rm->m_rdma_op; 598 struct rm_rdma_op *ro = &rm->rdma;
548 struct rds_notifier *notifier; 599 struct rds_notifier *notifier;
549 600
550 list_del_init(&rm->m_sock_item); 601 list_del_init(&rm->m_sock_item);
551 rds_send_sndbuf_remove(rs, rm); 602 rds_send_sndbuf_remove(rs, rm);
552 603
553 if (ro && ro->r_notifier && (status || ro->r_notify)) { 604 if (ro->op_active && ro->op_notifier &&
554 notifier = ro->r_notifier; 605 (ro->op_notify || (ro->op_recverr && status))) {
606 notifier = ro->op_notifier;
555 list_add_tail(&notifier->n_list, 607 list_add_tail(&notifier->n_list,
556 &rs->rs_notify_queue); 608 &rs->rs_notify_queue);
557 if (!notifier->n_status) 609 if (!notifier->n_status)
558 notifier->n_status = status; 610 notifier->n_status = status;
559 rm->m_rdma_op->r_notifier = NULL; 611 rm->rdma.op_notifier = NULL;
560 } 612 }
561 was_on_sock = 1; 613 was_on_sock = 1;
562 rm->m_rs = NULL; 614 rm->m_rs = NULL;
@@ -619,9 +671,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
619{ 671{
620 struct rds_message *rm, *tmp; 672 struct rds_message *rm, *tmp;
621 struct rds_connection *conn; 673 struct rds_connection *conn;
622 unsigned long flags, flags2; 674 unsigned long flags;
623 LIST_HEAD(list); 675 LIST_HEAD(list);
624 int wake = 0;
625 676
626 /* get all the messages we're dropping under the rs lock */ 677 /* get all the messages we're dropping under the rs lock */
627 spin_lock_irqsave(&rs->rs_lock, flags); 678 spin_lock_irqsave(&rs->rs_lock, flags);
@@ -631,59 +682,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
631 dest->sin_port != rm->m_inc.i_hdr.h_dport)) 682 dest->sin_port != rm->m_inc.i_hdr.h_dport))
632 continue; 683 continue;
633 684
634 wake = 1;
635 list_move(&rm->m_sock_item, &list); 685 list_move(&rm->m_sock_item, &list);
636 rds_send_sndbuf_remove(rs, rm); 686 rds_send_sndbuf_remove(rs, rm);
637 clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); 687 clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
638 } 688 }
639 689
640 /* order flag updates with the rs lock */ 690 /* order flag updates with the rs lock */
641 if (wake) 691 smp_mb__after_clear_bit();
642 smp_mb__after_clear_bit();
643 692
644 spin_unlock_irqrestore(&rs->rs_lock, flags); 693 spin_unlock_irqrestore(&rs->rs_lock, flags);
645 694
646 conn = NULL; 695 if (list_empty(&list))
696 return;
647 697
648 /* now remove the messages from the conn list as needed */ 698 /* Remove the messages from the conn */
649 list_for_each_entry(rm, &list, m_sock_item) { 699 list_for_each_entry(rm, &list, m_sock_item) {
650 /* We do this here rather than in the loop above, so that
651 * we don't have to nest m_rs_lock under rs->rs_lock */
652 spin_lock_irqsave(&rm->m_rs_lock, flags2);
653 /* If this is a RDMA operation, notify the app. */
654 spin_lock(&rs->rs_lock);
655 __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
656 spin_unlock(&rs->rs_lock);
657 rm->m_rs = NULL;
658 spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
659 700
701 conn = rm->m_inc.i_conn;
702
703 spin_lock_irqsave(&conn->c_lock, flags);
660 /* 704 /*
661 * If we see this flag cleared then we're *sure* that someone 705 * Maybe someone else beat us to removing rm from the conn.
662 * else beat us to removing it from the conn. If we race 706 * If we race with their flag update we'll get the lock and
663 * with their flag update we'll get the lock and then really 707 * then really see that the flag has been cleared.
664 * see that the flag has been cleared.
665 */ 708 */
666 if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) 709 if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
710 spin_unlock_irqrestore(&conn->c_lock, flags);
667 continue; 711 continue;
668
669 if (conn != rm->m_inc.i_conn) {
670 if (conn)
671 spin_unlock_irqrestore(&conn->c_lock, flags);
672 conn = rm->m_inc.i_conn;
673 spin_lock_irqsave(&conn->c_lock, flags);
674 } 712 }
713 list_del_init(&rm->m_conn_item);
714 spin_unlock_irqrestore(&conn->c_lock, flags);
675 715
676 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { 716 /*
677 list_del_init(&rm->m_conn_item); 717 * Couldn't grab m_rs_lock in top loop (lock ordering),
678 rds_message_put(rm); 718 * but we can now.
679 } 719 */
680 } 720 spin_lock_irqsave(&rm->m_rs_lock, flags);
681 721
682 if (conn) 722 spin_lock(&rs->rs_lock);
683 spin_unlock_irqrestore(&conn->c_lock, flags); 723 __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
724 spin_unlock(&rs->rs_lock);
684 725
685 if (wake) 726 rm->m_rs = NULL;
686 rds_wake_sk_sleep(rs); 727 spin_unlock_irqrestore(&rm->m_rs_lock, flags);
728
729 rds_message_put(rm);
730 }
731
732 rds_wake_sk_sleep(rs);
687 733
688 while (!list_empty(&list)) { 734 while (!list_empty(&list)) {
689 rm = list_entry(list.next, struct rds_message, m_sock_item); 735 rm = list_entry(list.next, struct rds_message, m_sock_item);
@@ -763,6 +809,63 @@ out:
763 return *queued; 809 return *queued;
764} 810}
765 811
812/*
813 * rds_message is getting to be quite complicated, and we'd like to allocate
814 * it all in one go. This figures out how big it needs to be up front.
815 */
816static int rds_rm_size(struct msghdr *msg, int data_len)
817{
818 struct cmsghdr *cmsg;
819 int size = 0;
820 int cmsg_groups = 0;
821 int retval;
822
823 for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
824 if (!CMSG_OK(msg, cmsg))
825 return -EINVAL;
826
827 if (cmsg->cmsg_level != SOL_RDS)
828 continue;
829
830 switch (cmsg->cmsg_type) {
831 case RDS_CMSG_RDMA_ARGS:
832 cmsg_groups |= 1;
833 retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
834 if (retval < 0)
835 return retval;
836 size += retval;
837
838 break;
839
840 case RDS_CMSG_RDMA_DEST:
841 case RDS_CMSG_RDMA_MAP:
842 cmsg_groups |= 2;
843 /* these are valid but do not add any size */
844 break;
845
846 case RDS_CMSG_ATOMIC_CSWP:
847 case RDS_CMSG_ATOMIC_FADD:
848 case RDS_CMSG_MASKED_ATOMIC_CSWP:
849 case RDS_CMSG_MASKED_ATOMIC_FADD:
850 cmsg_groups |= 1;
851 size += sizeof(struct scatterlist);
852 break;
853
854 default:
855 return -EINVAL;
856 }
857
858 }
859
860 size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
861
862 /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
863 if (cmsg_groups == 3)
864 return -EINVAL;
865
866 return size;
867}
868
766static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, 869static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
767 struct msghdr *msg, int *allocated_mr) 870 struct msghdr *msg, int *allocated_mr)
768{ 871{
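
Illustrative sketch (not from the patch): a tiny userspace model of the sizing rule in rds_rm_size() above, assuming a 4 KiB page and a made-up stand-in size for struct scatterlist. The group bitmask shows why combining (DEST, MAP) with (ARGS, ATOMIC) in one message is rejected.

#include <assert.h>
#include <stddef.h>

#define PAGE_SZ         4096u
#define SG_SZ           16u        /* hypothetical sizeof(struct scatterlist) */
#define ceil_div(x, y)  (((x) + (y) - 1) / (y))

/* group 1 = RDMA_ARGS / atomic ops, group 2 = RDMA_DEST / RDMA_MAP;
 * both groups together (mask == 3) is rejected, as in rds_rm_size() */
static long rm_size_model(unsigned int cmsg_groups, size_t data_len)
{
        if (cmsg_groups == 3)
                return -1;                         /* -EINVAL in the kernel */
        return (long)(ceil_div(data_len, PAGE_SZ) * SG_SZ);
}

int main(void)
{
        assert(rm_size_model(1, 9000) == 3 * SG_SZ);   /* 9000 bytes -> 3 sg entries */
        assert(rm_size_model(2, 0)    == 0);           /* DEST/MAP add no size       */
        assert(rm_size_model(3, 0)    == -1);          /* mixed groups rejected      */
        return 0;
}
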
@@ -777,7 +880,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
777 continue; 880 continue;
778 881
779 /* As a side effect, RDMA_DEST and RDMA_MAP will set 882 /* As a side effect, RDMA_DEST and RDMA_MAP will set
780 * rm->m_rdma_cookie and rm->m_rdma_mr. 883 * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
781 */ 884 */
782 switch (cmsg->cmsg_type) { 885 switch (cmsg->cmsg_type) {
783 case RDS_CMSG_RDMA_ARGS: 886 case RDS_CMSG_RDMA_ARGS:
@@ -793,6 +896,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
793 if (!ret) 896 if (!ret)
794 *allocated_mr = 1; 897 *allocated_mr = 1;
795 break; 898 break;
899 case RDS_CMSG_ATOMIC_CSWP:
900 case RDS_CMSG_ATOMIC_FADD:
901 case RDS_CMSG_MASKED_ATOMIC_CSWP:
902 case RDS_CMSG_MASKED_ATOMIC_FADD:
903 ret = rds_cmsg_atomic(rs, rm, cmsg);
904 break;
796 905
797 default: 906 default:
798 return -EINVAL; 907 return -EINVAL;
@@ -850,13 +959,30 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
850 goto out; 959 goto out;
851 } 960 }
852 961
853 rm = rds_message_copy_from_user(msg->msg_iov, payload_len); 962 /* size of rm including all sgs */
854 if (IS_ERR(rm)) { 963 ret = rds_rm_size(msg, payload_len);
855 ret = PTR_ERR(rm); 964 if (ret < 0)
856 rm = NULL; 965 goto out;
966
967 rm = rds_message_alloc(ret, GFP_KERNEL);
968 if (!rm) {
969 ret = -ENOMEM;
857 goto out; 970 goto out;
858 } 971 }
859 972
973 /* Attach data to the rm */
974 if (payload_len) {
975 rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
976 if (!rm->data.op_sg) {
977 ret = -ENOMEM;
978 goto out;
979 }
980 ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
981 if (ret)
982 goto out;
983 }
984 rm->data.op_active = 1;
985
860 rm->m_daddr = daddr; 986 rm->m_daddr = daddr;
861 987
862 /* rds_conn_create has a spinlock that runs with IRQ off. 988 /* rds_conn_create has a spinlock that runs with IRQ off.
@@ -879,22 +1005,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
879 if (ret) 1005 if (ret)
880 goto out; 1006 goto out;
881 1007
882 if ((rm->m_rdma_cookie || rm->m_rdma_op) && 1008 if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
883 conn->c_trans->xmit_rdma == NULL) {
884 if (printk_ratelimit()) 1009 if (printk_ratelimit())
885 printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", 1010 printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
886 rm->m_rdma_op, conn->c_trans->xmit_rdma); 1011 &rm->rdma, conn->c_trans->xmit_rdma);
887 ret = -EOPNOTSUPP; 1012 ret = -EOPNOTSUPP;
888 goto out; 1013 goto out;
889 } 1014 }
890 1015
891 /* If the connection is down, trigger a connect. We may 1016 if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
892 * have scheduled a delayed reconnect however - in this case 1017 if (printk_ratelimit())
893 * we should not interfere. 1018 printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
894 */ 1019 &rm->atomic, conn->c_trans->xmit_atomic);
895 if (rds_conn_state(conn) == RDS_CONN_DOWN && 1020 ret = -EOPNOTSUPP;
896 !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) 1021 goto out;
897 queue_delayed_work(rds_wq, &conn->c_conn_w, 0); 1022 }
1023
1024 rds_conn_connect_if_down(conn);
898 1025
899 ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); 1026 ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
900 if (ret) { 1027 if (ret) {
@@ -938,7 +1065,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
938 rds_stats_inc(s_send_queued); 1065 rds_stats_inc(s_send_queued);
939 1066
940 if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) 1067 if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
941 rds_send_worker(&conn->c_send_w.work); 1068 rds_send_xmit(conn);
942 1069
943 rds_message_put(rm); 1070 rds_message_put(rm);
944 return payload_len; 1071 return payload_len;
@@ -966,20 +1093,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
966 int ret = 0; 1093 int ret = 0;
967 1094
968 rm = rds_message_alloc(0, GFP_ATOMIC); 1095 rm = rds_message_alloc(0, GFP_ATOMIC);
969 if (rm == NULL) { 1096 if (!rm) {
970 ret = -ENOMEM; 1097 ret = -ENOMEM;
971 goto out; 1098 goto out;
972 } 1099 }
973 1100
974 rm->m_daddr = conn->c_faddr; 1101 rm->m_daddr = conn->c_faddr;
1102 rm->data.op_active = 1;
975 1103
976 /* If the connection is down, trigger a connect. We may 1104 rds_conn_connect_if_down(conn);
977 * have scheduled a delayed reconnect however - in this case
978 * we should not interfere.
979 */
980 if (rds_conn_state(conn) == RDS_CONN_DOWN &&
981 !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
982 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
983 1105
984 ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); 1106 ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
985 if (ret) 1107 if (ret)
@@ -999,7 +1121,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
999 rds_stats_inc(s_send_queued); 1121 rds_stats_inc(s_send_queued);
1000 rds_stats_inc(s_send_pong); 1122 rds_stats_inc(s_send_pong);
1001 1123
1002 queue_delayed_work(rds_wq, &conn->c_send_w, 0); 1124 if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
1125 rds_send_xmit(conn);
1126
1003 rds_message_put(rm); 1127 rds_message_put(rm);
1004 return 0; 1128 return 0;
1005 1129
diff --git a/net/rds/stats.c b/net/rds/stats.c
index 7598eb07cfb1..10c759ccac0c 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = {
57 "recv_ping", 57 "recv_ping",
58 "send_queue_empty", 58 "send_queue_empty",
59 "send_queue_full", 59 "send_queue_full",
60 "send_sem_contention", 60 "send_lock_contention",
61 "send_sem_queue_raced", 61 "send_lock_queue_raced",
62 "send_immediate_retry", 62 "send_immediate_retry",
63 "send_delayed_retry", 63 "send_delayed_retry",
64 "send_drop_acked", 64 "send_drop_acked",
@@ -143,7 +143,7 @@ void rds_stats_exit(void)
143 rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); 143 rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
144} 144}
145 145
146int __init rds_stats_init(void) 146int rds_stats_init(void)
147{ 147{
148 rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); 148 rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
149 return 0; 149 return 0;
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
index 7829a20325d3..25ad0c77a26c 100644
--- a/net/rds/sysctl.c
+++ b/net/rds/sysctl.c
@@ -105,13 +105,13 @@ void rds_sysctl_exit(void)
105 unregister_sysctl_table(rds_sysctl_reg_table); 105 unregister_sysctl_table(rds_sysctl_reg_table);
106} 106}
107 107
108int __init rds_sysctl_init(void) 108int rds_sysctl_init(void)
109{ 109{
110 rds_sysctl_reconnect_min = msecs_to_jiffies(1); 110 rds_sysctl_reconnect_min = msecs_to_jiffies(1);
111 rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; 111 rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
112 112
113 rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); 113 rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
114 if (rds_sysctl_reg_table == NULL) 114 if (!rds_sysctl_reg_table)
115 return -ENOMEM; 115 return -ENOMEM;
116 return 0; 116 return 0;
117} 117}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index babf4577ff7d..8e0a32001c90 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -41,7 +41,7 @@
41/* only for info exporting */ 41/* only for info exporting */
42static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); 42static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
43static LIST_HEAD(rds_tcp_tc_list); 43static LIST_HEAD(rds_tcp_tc_list);
44unsigned int rds_tcp_tc_count; 44static unsigned int rds_tcp_tc_count;
45 45
46/* Track rds_tcp_connection structs so they can be cleaned up */ 46/* Track rds_tcp_connection structs so they can be cleaned up */
47static DEFINE_SPINLOCK(rds_tcp_conn_lock); 47static DEFINE_SPINLOCK(rds_tcp_conn_lock);
@@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
200 struct rds_tcp_connection *tc; 200 struct rds_tcp_connection *tc;
201 201
202 tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); 202 tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
203 if (tc == NULL) 203 if (!tc)
204 return -ENOMEM; 204 return -ENOMEM;
205 205
206 tc->t_sock = NULL; 206 tc->t_sock = NULL;
@@ -221,7 +221,13 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
221static void rds_tcp_conn_free(void *arg) 221static void rds_tcp_conn_free(void *arg)
222{ 222{
223 struct rds_tcp_connection *tc = arg; 223 struct rds_tcp_connection *tc = arg;
224 unsigned long flags;
224 rdsdebug("freeing tc %p\n", tc); 225 rdsdebug("freeing tc %p\n", tc);
226
227 spin_lock_irqsave(&rds_tcp_conn_lock, flags);
228 list_del(&tc->t_tcp_node);
229 spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
230
225 kmem_cache_free(rds_tcp_conn_slab, tc); 231 kmem_cache_free(rds_tcp_conn_slab, tc);
226} 232}
227 233
@@ -243,7 +249,7 @@ static void rds_tcp_destroy_conns(void)
243 } 249 }
244} 250}
245 251
246void rds_tcp_exit(void) 252static void rds_tcp_exit(void)
247{ 253{
248 rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); 254 rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
249 rds_tcp_listen_stop(); 255 rds_tcp_listen_stop();
@@ -258,7 +264,6 @@ struct rds_transport rds_tcp_transport = {
258 .laddr_check = rds_tcp_laddr_check, 264 .laddr_check = rds_tcp_laddr_check,
259 .xmit_prepare = rds_tcp_xmit_prepare, 265 .xmit_prepare = rds_tcp_xmit_prepare,
260 .xmit_complete = rds_tcp_xmit_complete, 266 .xmit_complete = rds_tcp_xmit_complete,
261 .xmit_cong_map = rds_tcp_xmit_cong_map,
262 .xmit = rds_tcp_xmit, 267 .xmit = rds_tcp_xmit,
263 .recv = rds_tcp_recv, 268 .recv = rds_tcp_recv,
264 .conn_alloc = rds_tcp_conn_alloc, 269 .conn_alloc = rds_tcp_conn_alloc,
@@ -266,7 +271,6 @@ struct rds_transport rds_tcp_transport = {
266 .conn_connect = rds_tcp_conn_connect, 271 .conn_connect = rds_tcp_conn_connect,
267 .conn_shutdown = rds_tcp_conn_shutdown, 272 .conn_shutdown = rds_tcp_conn_shutdown,
268 .inc_copy_to_user = rds_tcp_inc_copy_to_user, 273 .inc_copy_to_user = rds_tcp_inc_copy_to_user,
269 .inc_purge = rds_tcp_inc_purge,
270 .inc_free = rds_tcp_inc_free, 274 .inc_free = rds_tcp_inc_free,
271 .stats_info_copy = rds_tcp_stats_info_copy, 275 .stats_info_copy = rds_tcp_stats_info_copy,
272 .exit = rds_tcp_exit, 276 .exit = rds_tcp_exit,
@@ -276,14 +280,14 @@ struct rds_transport rds_tcp_transport = {
276 .t_prefer_loopback = 1, 280 .t_prefer_loopback = 1,
277}; 281};
278 282
279int __init rds_tcp_init(void) 283static int rds_tcp_init(void)
280{ 284{
281 int ret; 285 int ret;
282 286
283 rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", 287 rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
284 sizeof(struct rds_tcp_connection), 288 sizeof(struct rds_tcp_connection),
285 0, 0, NULL); 289 0, 0, NULL);
286 if (rds_tcp_conn_slab == NULL) { 290 if (!rds_tcp_conn_slab) {
287 ret = -ENOMEM; 291 ret = -ENOMEM;
288 goto out; 292 goto out;
289 } 293 }
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 844fa6b9cf5a..9cf2927d0021 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -43,8 +43,6 @@ struct rds_tcp_statistics {
43}; 43};
44 44
45/* tcp.c */ 45/* tcp.c */
46int __init rds_tcp_init(void);
47void rds_tcp_exit(void);
48void rds_tcp_tune(struct socket *sock); 46void rds_tcp_tune(struct socket *sock);
49void rds_tcp_nonagle(struct socket *sock); 47void rds_tcp_nonagle(struct socket *sock);
50void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); 48void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
@@ -61,16 +59,15 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn);
61void rds_tcp_state_change(struct sock *sk); 59void rds_tcp_state_change(struct sock *sk);
62 60
63/* tcp_listen.c */ 61/* tcp_listen.c */
64int __init rds_tcp_listen_init(void); 62int rds_tcp_listen_init(void);
65void rds_tcp_listen_stop(void); 63void rds_tcp_listen_stop(void);
66void rds_tcp_listen_data_ready(struct sock *sk, int bytes); 64void rds_tcp_listen_data_ready(struct sock *sk, int bytes);
67 65
68/* tcp_recv.c */ 66/* tcp_recv.c */
69int __init rds_tcp_recv_init(void); 67int rds_tcp_recv_init(void);
70void rds_tcp_recv_exit(void); 68void rds_tcp_recv_exit(void);
71void rds_tcp_data_ready(struct sock *sk, int bytes); 69void rds_tcp_data_ready(struct sock *sk, int bytes);
72int rds_tcp_recv(struct rds_connection *conn); 70int rds_tcp_recv(struct rds_connection *conn);
73void rds_tcp_inc_purge(struct rds_incoming *inc);
74void rds_tcp_inc_free(struct rds_incoming *inc); 71void rds_tcp_inc_free(struct rds_incoming *inc);
75int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, 72int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
76 size_t size); 73 size_t size);
@@ -81,8 +78,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn);
81int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, 78int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
82 unsigned int hdr_off, unsigned int sg, unsigned int off); 79 unsigned int hdr_off, unsigned int sg, unsigned int off);
83void rds_tcp_write_space(struct sock *sk); 80void rds_tcp_write_space(struct sock *sk);
84int rds_tcp_xmit_cong_map(struct rds_connection *conn,
85 struct rds_cong_map *map, unsigned long offset);
86 81
87/* tcp_stats.c */ 82/* tcp_stats.c */
88DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); 83DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index c519939e8da9..af95c8e058fc 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -45,7 +45,7 @@ void rds_tcp_state_change(struct sock *sk)
45 45
46 read_lock_bh(&sk->sk_callback_lock); 46 read_lock_bh(&sk->sk_callback_lock);
47 conn = sk->sk_user_data; 47 conn = sk->sk_user_data;
48 if (conn == NULL) { 48 if (!conn) {
49 state_change = sk->sk_state_change; 49 state_change = sk->sk_state_change;
50 goto out; 50 goto out;
51 } 51 }
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 27844f231d10..8b5cc4aa8868 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -116,7 +116,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
116 116
117 read_lock_bh(&sk->sk_callback_lock); 117 read_lock_bh(&sk->sk_callback_lock);
118 ready = sk->sk_user_data; 118 ready = sk->sk_user_data;
119 if (ready == NULL) { /* check for teardown race */ 119 if (!ready) { /* check for teardown race */
120 ready = sk->sk_data_ready; 120 ready = sk->sk_data_ready;
121 goto out; 121 goto out;
122 } 122 }
@@ -135,7 +135,7 @@ out:
135 ready(sk, bytes); 135 ready(sk, bytes);
136} 136}
137 137
138int __init rds_tcp_listen_init(void) 138int rds_tcp_listen_init(void)
139{ 139{
140 struct sockaddr_in sin; 140 struct sockaddr_in sin;
141 struct socket *sock = NULL; 141 struct socket *sock = NULL;
@@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void)
178 struct socket *sock = rds_tcp_listen_sock; 178 struct socket *sock = rds_tcp_listen_sock;
179 struct sock *sk; 179 struct sock *sk;
180 180
181 if (sock == NULL) 181 if (!sock)
182 return; 182 return;
183 183
184 sk = sock->sk; 184 sk = sock->sk;
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index e43797404102..78205e25500a 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -39,7 +39,7 @@
39 39
40static struct kmem_cache *rds_tcp_incoming_slab; 40static struct kmem_cache *rds_tcp_incoming_slab;
41 41
42void rds_tcp_inc_purge(struct rds_incoming *inc) 42static void rds_tcp_inc_purge(struct rds_incoming *inc)
43{ 43{
44 struct rds_tcp_incoming *tinc; 44 struct rds_tcp_incoming *tinc;
45 tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); 45 tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
@@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
190 * processing. 190 * processing.
191 */ 191 */
192 while (left) { 192 while (left) {
193 if (tinc == NULL) { 193 if (!tinc) {
194 tinc = kmem_cache_alloc(rds_tcp_incoming_slab, 194 tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
195 arg->gfp); 195 arg->gfp);
196 if (tinc == NULL) { 196 if (!tinc) {
197 desc->error = -ENOMEM; 197 desc->error = -ENOMEM;
198 goto out; 198 goto out;
199 } 199 }
@@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
229 229
230 if (left && tc->t_tinc_data_rem) { 230 if (left && tc->t_tinc_data_rem) {
231 clone = skb_clone(skb, arg->gfp); 231 clone = skb_clone(skb, arg->gfp);
232 if (clone == NULL) { 232 if (!clone) {
233 desc->error = -ENOMEM; 233 desc->error = -ENOMEM;
234 goto out; 234 goto out;
235 } 235 }
@@ -272,7 +272,8 @@ out:
272} 272}
273 273
274/* the caller has to hold the sock lock */ 274/* the caller has to hold the sock lock */
275int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km) 275static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp,
276 enum km_type km)
276{ 277{
277 struct rds_tcp_connection *tc = conn->c_transport_data; 278 struct rds_tcp_connection *tc = conn->c_transport_data;
278 struct socket *sock = tc->t_sock; 279 struct socket *sock = tc->t_sock;
@@ -326,7 +327,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
326 327
327 read_lock_bh(&sk->sk_callback_lock); 328 read_lock_bh(&sk->sk_callback_lock);
328 conn = sk->sk_user_data; 329 conn = sk->sk_user_data;
329 if (conn == NULL) { /* check for teardown race */ 330 if (!conn) { /* check for teardown race */
330 ready = sk->sk_data_ready; 331 ready = sk->sk_data_ready;
331 goto out; 332 goto out;
332 } 333 }
@@ -342,12 +343,12 @@ out:
342 ready(sk, bytes); 343 ready(sk, bytes);
343} 344}
344 345
345int __init rds_tcp_recv_init(void) 346int rds_tcp_recv_init(void)
346{ 347{
347 rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", 348 rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
348 sizeof(struct rds_tcp_incoming), 349 sizeof(struct rds_tcp_incoming),
349 0, 0, NULL); 350 0, 0, NULL);
350 if (rds_tcp_incoming_slab == NULL) 351 if (!rds_tcp_incoming_slab)
351 return -ENOMEM; 352 return -ENOMEM;
352 return 0; 353 return 0;
353} 354}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 2f012a07d94d..1b4fd68f0c7c 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -63,7 +63,7 @@ void rds_tcp_xmit_complete(struct rds_connection *conn)
63} 63}
64 64
65/* the core send_sem serializes this with other xmit and shutdown */ 65/* the core send_sem serializes this with other xmit and shutdown */
66int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) 66static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
67{ 67{
68 struct kvec vec = { 68 struct kvec vec = {
69 .iov_base = data, 69 .iov_base = data,
@@ -77,56 +77,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
77} 77}
78 78
79/* the core send_sem serializes this with other xmit and shutdown */ 79/* the core send_sem serializes this with other xmit and shutdown */
80int rds_tcp_xmit_cong_map(struct rds_connection *conn,
81 struct rds_cong_map *map, unsigned long offset)
82{
83 static struct rds_header rds_tcp_map_header = {
84 .h_flags = RDS_FLAG_CONG_BITMAP,
85 };
86 struct rds_tcp_connection *tc = conn->c_transport_data;
87 unsigned long i;
88 int ret;
89 int copied = 0;
90
91 /* Some problem claims cpu_to_be32(constant) isn't a constant. */
92 rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
93
94 if (offset < sizeof(struct rds_header)) {
95 ret = rds_tcp_sendmsg(tc->t_sock,
96 (void *)&rds_tcp_map_header + offset,
97 sizeof(struct rds_header) - offset);
98 if (ret <= 0)
99 return ret;
100 offset += ret;
101 copied = ret;
102 if (offset < sizeof(struct rds_header))
103 return ret;
104 }
105
106 offset -= sizeof(struct rds_header);
107 i = offset / PAGE_SIZE;
108 offset = offset % PAGE_SIZE;
109 BUG_ON(i >= RDS_CONG_MAP_PAGES);
110
111 do {
112 ret = tc->t_sock->ops->sendpage(tc->t_sock,
113 virt_to_page(map->m_page_addrs[i]),
114 offset, PAGE_SIZE - offset,
115 MSG_DONTWAIT);
116 if (ret <= 0)
117 break;
118 copied += ret;
119 offset += ret;
120 if (offset == PAGE_SIZE) {
121 offset = 0;
122 i++;
123 }
124 } while (i < RDS_CONG_MAP_PAGES);
125
126 return copied ? copied : ret;
127}
128
129/* the core send_sem serializes this with other xmit and shutdown */
130int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, 80int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
131 unsigned int hdr_off, unsigned int sg, unsigned int off) 81 unsigned int hdr_off, unsigned int sg, unsigned int off)
132{ 82{
@@ -166,21 +116,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
166 goto out; 116 goto out;
167 } 117 }
168 118
169 while (sg < rm->m_nents) { 119 while (sg < rm->data.op_nents) {
170 ret = tc->t_sock->ops->sendpage(tc->t_sock, 120 ret = tc->t_sock->ops->sendpage(tc->t_sock,
171 sg_page(&rm->m_sg[sg]), 121 sg_page(&rm->data.op_sg[sg]),
172 rm->m_sg[sg].offset + off, 122 rm->data.op_sg[sg].offset + off,
173 rm->m_sg[sg].length - off, 123 rm->data.op_sg[sg].length - off,
174 MSG_DONTWAIT|MSG_NOSIGNAL); 124 MSG_DONTWAIT|MSG_NOSIGNAL);
175 rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]), 125 rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
176 rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off, 126 rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
177 ret); 127 ret);
178 if (ret <= 0) 128 if (ret <= 0)
179 break; 129 break;
180 130
181 off += ret; 131 off += ret;
182 done += ret; 132 done += ret;
183 if (off == rm->m_sg[sg].length) { 133 if (off == rm->data.op_sg[sg].length) {
184 off = 0; 134 off = 0;
185 sg++; 135 sg++;
186 } 136 }
@@ -226,7 +176,7 @@ void rds_tcp_write_space(struct sock *sk)
226 176
227 read_lock_bh(&sk->sk_callback_lock); 177 read_lock_bh(&sk->sk_callback_lock);
228 conn = sk->sk_user_data; 178 conn = sk->sk_user_data;
229 if (conn == NULL) { 179 if (!conn) {
230 write_space = sk->sk_write_space; 180 write_space = sk->sk_write_space;
231 goto out; 181 goto out;
232 } 182 }
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 786c20eaaf5e..0fd90f8c5f59 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -61,7 +61,7 @@
61 * 61 *
62 * Transition to state DISCONNECTING/DOWN: 62 * Transition to state DISCONNECTING/DOWN:
63 * - Inside the shutdown worker; synchronizes with xmit path 63 * - Inside the shutdown worker; synchronizes with xmit path
64 * through c_send_lock, and with connection management callbacks 64 * through RDS_IN_XMIT, and with connection management callbacks
65 * via c_cm_lock. 65 * via c_cm_lock.
66 * 66 *
67 * For receive callbacks, we rely on the underlying transport 67 * For receive callbacks, we rely on the underlying transport
@@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete);
110 * We should *always* start with a random backoff; otherwise a broken connection 110 * We should *always* start with a random backoff; otherwise a broken connection
111 * will always take several iterations to be re-established. 111 * will always take several iterations to be re-established.
112 */ 112 */
113static void rds_queue_reconnect(struct rds_connection *conn) 113void rds_queue_reconnect(struct rds_connection *conn)
114{ 114{
115 unsigned long rand; 115 unsigned long rand;
116 116
@@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work)
156 } 156 }
157} 157}
158 158
159void rds_shutdown_worker(struct work_struct *work)
160{
161 struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
162
163 /* shut it down unless it's down already */
164 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
165 /*
166 * Quiesce the connection mgmt handlers before we start tearing
167 * things down. We don't hold the mutex for the entire
168 * duration of the shutdown operation, else we may be
169 * deadlocking with the CM handler. Instead, the CM event
170 * handler is supposed to check for state DISCONNECTING
171 */
172 mutex_lock(&conn->c_cm_lock);
173 if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) &&
174 !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
175 rds_conn_error(conn, "shutdown called in state %d\n",
176 atomic_read(&conn->c_state));
177 mutex_unlock(&conn->c_cm_lock);
178 return;
179 }
180 mutex_unlock(&conn->c_cm_lock);
181
182 mutex_lock(&conn->c_send_lock);
183 conn->c_trans->conn_shutdown(conn);
184 rds_conn_reset(conn);
185 mutex_unlock(&conn->c_send_lock);
186
187 if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
188 /* This can happen - eg when we're in the middle of tearing
189 * down the connection, and someone unloads the rds module.
190 * Quite reproduceable with loopback connections.
191 * Mostly harmless.
192 */
193 rds_conn_error(conn,
194 "%s: failed to transition to state DOWN, "
195 "current state is %d\n",
196 __func__,
197 atomic_read(&conn->c_state));
198 return;
199 }
200 }
201
202 /* Then reconnect if it's still live.
203 * The passive side of an IB loopback connection is never added
204 * to the conn hash, so we never trigger a reconnect on this
205 * conn - the reconnect is always triggered by the active peer. */
206 cancel_delayed_work(&conn->c_conn_w);
207 if (!hlist_unhashed(&conn->c_hash_node))
208 rds_queue_reconnect(conn);
209}
210
211void rds_send_worker(struct work_struct *work) 159void rds_send_worker(struct work_struct *work)
212{ 160{
213 struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); 161 struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
@@ -252,15 +200,22 @@ void rds_recv_worker(struct work_struct *work)
252 } 200 }
253} 201}
254 202
203void rds_shutdown_worker(struct work_struct *work)
204{
205 struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
206
207 rds_conn_shutdown(conn);
208}
209
255void rds_threads_exit(void) 210void rds_threads_exit(void)
256{ 211{
257 destroy_workqueue(rds_wq); 212 destroy_workqueue(rds_wq);
258} 213}
259 214
260int __init rds_threads_init(void) 215int rds_threads_init(void)
261{ 216{
262 rds_wq = create_workqueue("krdsd"); 217 rds_wq = create_singlethread_workqueue("krdsd");
263 if (rds_wq == NULL) 218 if (!rds_wq)
264 return -ENOMEM; 219 return -ENOMEM;
265 220
266 return 0; 221 return 0;
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 7e1067901353..7f2ac4fec367 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans)
71} 71}
72EXPORT_SYMBOL_GPL(rds_trans_unregister); 72EXPORT_SYMBOL_GPL(rds_trans_unregister);
73 73
74void rds_trans_put(struct rds_transport *trans)
75{
76 if (trans && trans->t_owner)
77 module_put(trans->t_owner);
78}
79
74struct rds_transport *rds_trans_get_preferred(__be32 addr) 80struct rds_transport *rds_trans_get_preferred(__be32 addr)
75{ 81{
76 struct rds_transport *ret = NULL; 82 struct rds_transport *ret = NULL;
77 int i; 83 struct rds_transport *trans;
84 unsigned int i;
78 85
79 if (IN_LOOPBACK(ntohl(addr))) 86 if (IN_LOOPBACK(ntohl(addr)))
80 return &rds_loop_transport; 87 return &rds_loop_transport;
81 88
82 down_read(&rds_trans_sem); 89 down_read(&rds_trans_sem);
83 for (i = 0; i < RDS_TRANS_COUNT; i++) 90 for (i = 0; i < RDS_TRANS_COUNT; i++) {
84 { 91 trans = transports[i];
85 if (transports[i] && (transports[i]->laddr_check(addr) == 0)) { 92
86 ret = transports[i]; 93 if (trans && (trans->laddr_check(addr) == 0) &&
94 (!trans->t_owner || try_module_get(trans->t_owner))) {
95 ret = trans;
87 break; 96 break;
88 } 97 }
89 } 98 }
diff --git a/net/rds/xlist.h b/net/rds/xlist.h
new file mode 100644
index 000000000000..e6b5190daddd
--- /dev/null
+++ b/net/rds/xlist.h
@@ -0,0 +1,80 @@
1#ifndef _LINUX_XLIST_H
2#define _LINUX_XLIST_H
3
4#include <linux/stddef.h>
5#include <linux/poison.h>
6#include <linux/prefetch.h>
7#include <asm/system.h>
8
9struct xlist_head {
10 struct xlist_head *next;
11};
12
13static inline void INIT_XLIST_HEAD(struct xlist_head *list)
14{
15 list->next = NULL;
16}
17
18static inline int xlist_empty(struct xlist_head *head)
19{
20 return head->next == NULL;
21}
22
23static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail,
24 struct xlist_head *head)
25{
26 struct xlist_head *cur;
27 struct xlist_head *check;
28
29 while (1) {
30 cur = head->next;
31 tail->next = cur;
32 check = cmpxchg(&head->next, cur, new);
33 if (check == cur)
34 break;
35 }
36}
37
38static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
39{
40 struct xlist_head *cur;
41 struct xlist_head *check;
42 struct xlist_head *next;
43
44 while (1) {
45 cur = head->next;
46 if (!cur)
47 goto out;
48
49 next = cur->next;
50 check = cmpxchg(&head->next, cur, next);
51 if (check == cur)
52 goto out;
53 }
54out:
55 return cur;
56}
57
58static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head)
59{
60 struct xlist_head *cur;
61
62 cur = head->next;
63 if (!cur)
64 return NULL;
65
66 head->next = cur->next;
67 return cur;
68}
69
70static inline void xlist_splice(struct xlist_head *list,
71 struct xlist_head *head)
72{
73 struct xlist_head *cur;
74
75 WARN_ON(head->next);
76 cur = xchg(&list->next, NULL);
77 head->next = cur;
78}
79
80#endif
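
Illustrative sketch (not from the patch): a standalone userspace analogue of this lock-free list, assuming C11 atomics in place of cmpxchg()/xchg(). It shows a single-node push (the header's xlist_add() can also push a pre-linked chain via its tail argument) and the whole-list detach that xlist_splice() is built around.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct xnode {
        struct xnode *next;
        int           val;
};

static _Atomic(struct xnode *) list_head;          /* like struct xlist_head */

static void xpush(struct xnode *node)              /* ~ xlist_add(node, node, head) */
{
        struct xnode *cur = atomic_load(&list_head);
        do {
                node->next = cur;                  /* link to the current head ... */
        } while (!atomic_compare_exchange_weak(&list_head, &cur, node));
                                                   /* ... and CAS it into place */
}

static struct xnode *xdetach_all(void)             /* ~ the xchg() in xlist_splice() */
{
        return atomic_exchange(&list_head, NULL);
}

int main(void)
{
        struct xnode a = { .val = 1 }, b = { .val = 2 };

        xpush(&a);
        xpush(&b);
        for (struct xnode *n = xdetach_all(); n; n = n->next)
                printf("%d\n", n->val);            /* prints 2 then 1 (LIFO) */
        return 0;
}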