Diffstat (limited to 'net')
-rw-r--r--  net/9p/trans_fd.c                           |  88
-rw-r--r--  net/9p/trans_virtio.c                       |   2
-rw-r--r--  net/ceph/auth_x.c                           |  49
-rw-r--r--  net/ceph/auth_x.h                           |   2
-rw-r--r--  net/ceph/messenger.c                        | 105
-rw-r--r--  net/ceph/mon_client.c                       |   4
-rw-r--r--  net/ipv4/Makefile                           |   1
-rw-r--r--  net/ipv4/fib_trie.c                         |   4
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c                  |   1
-rw-r--r--  net/ipv4/tcp_ipv4.c                         |   1
-rw-r--r--  net/ipv4/tcp_memcontrol.c                   | 200
-rw-r--r--  net/ipv6/tcp_ipv6.c                         |   1
-rw-r--r--  net/mac80211/debugfs.c                      |   7
-rw-r--r--  net/rds/ib.c                                |  34
-rw-r--r--  net/rds/iw.c                                |  23
-rw-r--r--  net/sunrpc/cache.c                          |  10
-rw-r--r--  net/sunrpc/rpc_pipe.c                       |  60
-rw-r--r--  net/sunrpc/xprt.c                           |   1
-rw-r--r--  net/sunrpc/xprtrdma/Makefile                |   2
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c              |   7
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma.c              |  41
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_backchannel.c  | 371
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_recvfrom.c     |  56
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_sendto.c       |  33
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_transport.c    | 360
-rw-r--r--  net/sunrpc/xprtrdma/transport.c             |  30
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c                 |  24
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h             |  21
28 files changed, 938 insertions(+), 600 deletions(-)
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index bced8c074c12..7bc2208b6cc4 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -108,9 +108,7 @@ struct p9_poll_wait {
108 * @unsent_req_list: accounting for requests that haven't been sent 108 * @unsent_req_list: accounting for requests that haven't been sent
109 * @req: current request being processed (if any) 109 * @req: current request being processed (if any)
110 * @tmp_buf: temporary buffer to read in header 110 * @tmp_buf: temporary buffer to read in header
111 * @rsize: amount to read for current frame 111 * @rc: temporary fcall for reading current frame
112 * @rpos: read position in current frame
113 * @rbuf: current read buffer
114 * @wpos: write position for current frame 112 * @wpos: write position for current frame
115 * @wsize: amount of data to write for current frame 113 * @wsize: amount of data to write for current frame
116 * @wbuf: current write buffer 114 * @wbuf: current write buffer
@@ -131,9 +129,7 @@ struct p9_conn {
131 struct list_head unsent_req_list; 129 struct list_head unsent_req_list;
132 struct p9_req_t *req; 130 struct p9_req_t *req;
133 char tmp_buf[7]; 131 char tmp_buf[7];
134 int rsize; 132 struct p9_fcall rc;
135 int rpos;
136 char *rbuf;
137 int wpos; 133 int wpos;
138 int wsize; 134 int wsize;
139 char *wbuf; 135 char *wbuf;
@@ -305,69 +301,77 @@ static void p9_read_work(struct work_struct *work)
305 if (m->err < 0) 301 if (m->err < 0)
306 return; 302 return;
307 303
308 p9_debug(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos); 304 p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset);
309 305
310 if (!m->rbuf) { 306 if (!m->rc.sdata) {
311 m->rbuf = m->tmp_buf; 307 m->rc.sdata = m->tmp_buf;
312 m->rpos = 0; 308 m->rc.offset = 0;
313 m->rsize = 7; /* start by reading header */ 309 m->rc.capacity = 7; /* start by reading header */
314 } 310 }
315 311
316 clear_bit(Rpending, &m->wsched); 312 clear_bit(Rpending, &m->wsched);
317 p9_debug(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n", 313 p9_debug(P9_DEBUG_TRANS, "read mux %p pos %zd size: %zd = %zd\n",
318 m, m->rpos, m->rsize, m->rsize-m->rpos); 314 m, m->rc.offset, m->rc.capacity,
319 err = p9_fd_read(m->client, m->rbuf + m->rpos, 315 m->rc.capacity - m->rc.offset);
320 m->rsize - m->rpos); 316 err = p9_fd_read(m->client, m->rc.sdata + m->rc.offset,
317 m->rc.capacity - m->rc.offset);
321 p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err); 318 p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err);
322 if (err == -EAGAIN) { 319 if (err == -EAGAIN)
323 goto end_clear; 320 goto end_clear;
324 }
325 321
326 if (err <= 0) 322 if (err <= 0)
327 goto error; 323 goto error;
328 324
329 m->rpos += err; 325 m->rc.offset += err;
330 326
331 if ((!m->req) && (m->rpos == m->rsize)) { /* header read in */ 327 /* header read in */
332 u16 tag; 328 if ((!m->req) && (m->rc.offset == m->rc.capacity)) {
333 p9_debug(P9_DEBUG_TRANS, "got new header\n"); 329 p9_debug(P9_DEBUG_TRANS, "got new header\n");
334 330
335 n = le32_to_cpu(*(__le32 *) m->rbuf); /* read packet size */ 331 err = p9_parse_header(&m->rc, NULL, NULL, NULL, 0);
336 if (n >= m->client->msize) { 332 if (err) {
333 p9_debug(P9_DEBUG_ERROR,
334 "error parsing header: %d\n", err);
335 goto error;
336 }
337
338 if (m->rc.size >= m->client->msize) {
337 p9_debug(P9_DEBUG_ERROR, 339 p9_debug(P9_DEBUG_ERROR,
338 "requested packet size too big: %d\n", n); 340 "requested packet size too big: %d\n",
341 m->rc.size);
339 err = -EIO; 342 err = -EIO;
340 goto error; 343 goto error;
341 } 344 }
342 345
343 tag = le16_to_cpu(*(__le16 *) (m->rbuf+5)); /* read tag */
344 p9_debug(P9_DEBUG_TRANS, 346 p9_debug(P9_DEBUG_TRANS,
345 "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag); 347 "mux %p pkt: size: %d bytes tag: %d\n",
348 m, m->rc.size, m->rc.tag);
346 349
347 m->req = p9_tag_lookup(m->client, tag); 350 m->req = p9_tag_lookup(m->client, m->rc.tag);
348 if (!m->req || (m->req->status != REQ_STATUS_SENT)) { 351 if (!m->req || (m->req->status != REQ_STATUS_SENT)) {
349 p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", 352 p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n",
350 tag); 353 m->rc.tag);
351 err = -EIO; 354 err = -EIO;
352 goto error; 355 goto error;
353 } 356 }
354 357
355 if (m->req->rc == NULL) { 358 if (m->req->rc == NULL) {
356 m->req->rc = kmalloc(sizeof(struct p9_fcall) + 359 p9_debug(P9_DEBUG_ERROR,
357 m->client->msize, GFP_NOFS); 360 "No recv fcall for tag %d (req %p), disconnecting!\n",
358 if (!m->req->rc) { 361 m->rc.tag, m->req);
359 m->req = NULL; 362 m->req = NULL;
360 err = -ENOMEM; 363 err = -EIO;
361 goto error; 364 goto error;
362 }
363 } 365 }
364 m->rbuf = (char *)m->req->rc + sizeof(struct p9_fcall); 366 m->rc.sdata = (char *)m->req->rc + sizeof(struct p9_fcall);
365 memcpy(m->rbuf, m->tmp_buf, m->rsize); 367 memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity);
366 m->rsize = n; 368 m->rc.capacity = m->rc.size;
367 } 369 }
368 370
369 /* not an else because some packets (like clunk) have no payload */ 371 /* packet is read in
370 if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */ 372 * not an else because some packets (like clunk) have no payload
373 */
374 if ((m->req) && (m->rc.offset == m->rc.capacity)) {
371 p9_debug(P9_DEBUG_TRANS, "got new packet\n"); 375 p9_debug(P9_DEBUG_TRANS, "got new packet\n");
372 spin_lock(&m->client->lock); 376 spin_lock(&m->client->lock);
373 if (m->req->status != REQ_STATUS_ERROR) 377 if (m->req->status != REQ_STATUS_ERROR)
@@ -375,9 +379,9 @@ static void p9_read_work(struct work_struct *work)
375 list_del(&m->req->req_list); 379 list_del(&m->req->req_list);
376 spin_unlock(&m->client->lock); 380 spin_unlock(&m->client->lock);
377 p9_client_cb(m->client, m->req, status); 381 p9_client_cb(m->client, m->req, status);
378 m->rbuf = NULL; 382 m->rc.sdata = NULL;
379 m->rpos = 0; 383 m->rc.offset = 0;
380 m->rsize = 0; 384 m->rc.capacity = 0;
381 m->req = NULL; 385 m->req = NULL;
382 } 386 }
383 387
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 199bc76202d2..4acb1d5417aa 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -658,7 +658,7 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
658 mutex_unlock(&virtio_9p_lock); 658 mutex_unlock(&virtio_9p_lock);
659 659
660 if (!found) { 660 if (!found) {
661 pr_err("no channels available\n"); 661 pr_err("no channels available for device %s\n", devname);
662 return ret; 662 return ret;
663 } 663 }
664 664
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index 10d87753ed87..9e43a315e662 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -152,7 +152,6 @@ static int process_one_ticket(struct ceph_auth_client *ac,
152 void *ticket_buf = NULL; 152 void *ticket_buf = NULL;
153 void *tp, *tpend; 153 void *tp, *tpend;
154 void **ptp; 154 void **ptp;
155 struct ceph_timespec new_validity;
156 struct ceph_crypto_key new_session_key; 155 struct ceph_crypto_key new_session_key;
157 struct ceph_buffer *new_ticket_blob; 156 struct ceph_buffer *new_ticket_blob;
158 unsigned long new_expires, new_renew_after; 157 unsigned long new_expires, new_renew_after;
@@ -193,8 +192,8 @@ static int process_one_ticket(struct ceph_auth_client *ac,
193 if (ret) 192 if (ret)
194 goto out; 193 goto out;
195 194
196 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); 195 ceph_decode_timespec(&validity, dp);
197 ceph_decode_timespec(&validity, &new_validity); 196 dp += sizeof(struct ceph_timespec);
198 new_expires = get_seconds() + validity.tv_sec; 197 new_expires = get_seconds() + validity.tv_sec;
199 new_renew_after = new_expires - (validity.tv_sec / 4); 198 new_renew_after = new_expires - (validity.tv_sec / 4);
200 dout(" expires=%lu renew_after=%lu\n", new_expires, 199 dout(" expires=%lu renew_after=%lu\n", new_expires,
@@ -233,10 +232,10 @@ static int process_one_ticket(struct ceph_auth_client *ac,
233 ceph_buffer_put(th->ticket_blob); 232 ceph_buffer_put(th->ticket_blob);
234 th->session_key = new_session_key; 233 th->session_key = new_session_key;
235 th->ticket_blob = new_ticket_blob; 234 th->ticket_blob = new_ticket_blob;
236 th->validity = new_validity;
237 th->secret_id = new_secret_id; 235 th->secret_id = new_secret_id;
238 th->expires = new_expires; 236 th->expires = new_expires;
239 th->renew_after = new_renew_after; 237 th->renew_after = new_renew_after;
238 th->have_key = true;
240 dout(" got ticket service %d (%s) secret_id %lld len %d\n", 239 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
241 type, ceph_entity_type_name(type), th->secret_id, 240 type, ceph_entity_type_name(type), th->secret_id,
242 (int)th->ticket_blob->vec.iov_len); 241 (int)th->ticket_blob->vec.iov_len);
@@ -384,6 +383,24 @@ bad:
384 return -ERANGE; 383 return -ERANGE;
385} 384}
386 385
386static bool need_key(struct ceph_x_ticket_handler *th)
387{
388 if (!th->have_key)
389 return true;
390
391 return get_seconds() >= th->renew_after;
392}
393
394static bool have_key(struct ceph_x_ticket_handler *th)
395{
396 if (th->have_key) {
397 if (get_seconds() >= th->expires)
398 th->have_key = false;
399 }
400
401 return th->have_key;
402}
403
387static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) 404static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
388{ 405{
389 int want = ac->want_keys; 406 int want = ac->want_keys;
@@ -402,20 +419,18 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
402 continue; 419 continue;
403 420
404 th = get_ticket_handler(ac, service); 421 th = get_ticket_handler(ac, service);
405
406 if (IS_ERR(th)) { 422 if (IS_ERR(th)) {
407 *pneed |= service; 423 *pneed |= service;
408 continue; 424 continue;
409 } 425 }
410 426
411 if (get_seconds() >= th->renew_after) 427 if (need_key(th))
412 *pneed |= service; 428 *pneed |= service;
413 if (get_seconds() >= th->expires) 429 if (!have_key(th))
414 xi->have_keys &= ~service; 430 xi->have_keys &= ~service;
415 } 431 }
416} 432}
417 433
418
419static int ceph_x_build_request(struct ceph_auth_client *ac, 434static int ceph_x_build_request(struct ceph_auth_client *ac,
420 void *buf, void *end) 435 void *buf, void *end)
421{ 436{
@@ -667,14 +682,26 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
667 ac->private = NULL; 682 ac->private = NULL;
668} 683}
669 684
670static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, 685static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type)
671 int peer_type)
672{ 686{
673 struct ceph_x_ticket_handler *th; 687 struct ceph_x_ticket_handler *th;
674 688
675 th = get_ticket_handler(ac, peer_type); 689 th = get_ticket_handler(ac, peer_type);
676 if (!IS_ERR(th)) 690 if (!IS_ERR(th))
677 memset(&th->validity, 0, sizeof(th->validity)); 691 th->have_key = false;
692}
693
694static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
695 int peer_type)
696{
697 /*
698 * We are to invalidate a service ticket in the hopes of
699 * getting a new, hopefully more valid, one. But, we won't get
700 * it unless our AUTH ticket is good, so invalidate AUTH ticket
701 * as well, just in case.
702 */
703 invalidate_ticket(ac, peer_type);
704 invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH);
678} 705}
679 706
680static int calcu_signature(struct ceph_x_authorizer *au, 707static int calcu_signature(struct ceph_x_authorizer *au,
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index e8b7c6917d47..40b1a3cf7397 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -16,7 +16,7 @@ struct ceph_x_ticket_handler {
16 unsigned int service; 16 unsigned int service;
17 17
18 struct ceph_crypto_key session_key; 18 struct ceph_crypto_key session_key;
19 struct ceph_timespec validity; 19 bool have_key;
20 20
21 u64 secret_id; 21 u64 secret_id;
22 struct ceph_buffer *ticket_blob; 22 struct ceph_buffer *ticket_blob;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 9981039ef4ff..9cfedf565f5b 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -23,9 +23,6 @@
23#include <linux/ceph/pagelist.h> 23#include <linux/ceph/pagelist.h>
24#include <linux/export.h> 24#include <linux/export.h>
25 25
26#define list_entry_next(pos, member) \
27 list_entry(pos->member.next, typeof(*pos), member)
28
29/* 26/*
30 * Ceph uses the messenger to exchange ceph_msg messages with other 27 * Ceph uses the messenger to exchange ceph_msg messages with other
31 * hosts in the system. The messenger provides ordered and reliable 28 * hosts in the system. The messenger provides ordered and reliable
@@ -672,6 +669,8 @@ static void reset_connection(struct ceph_connection *con)
672 } 669 }
673 con->in_seq = 0; 670 con->in_seq = 0;
674 con->in_seq_acked = 0; 671 con->in_seq_acked = 0;
672
673 con->out_skip = 0;
675} 674}
676 675
677/* 676/*
@@ -771,6 +770,8 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
771 770
772static void con_out_kvec_reset(struct ceph_connection *con) 771static void con_out_kvec_reset(struct ceph_connection *con)
773{ 772{
773 BUG_ON(con->out_skip);
774
774 con->out_kvec_left = 0; 775 con->out_kvec_left = 0;
775 con->out_kvec_bytes = 0; 776 con->out_kvec_bytes = 0;
776 con->out_kvec_cur = &con->out_kvec[0]; 777 con->out_kvec_cur = &con->out_kvec[0];
@@ -779,9 +780,9 @@ static void con_out_kvec_reset(struct ceph_connection *con)
779static void con_out_kvec_add(struct ceph_connection *con, 780static void con_out_kvec_add(struct ceph_connection *con,
780 size_t size, void *data) 781 size_t size, void *data)
781{ 782{
782 int index; 783 int index = con->out_kvec_left;
783 784
784 index = con->out_kvec_left; 785 BUG_ON(con->out_skip);
785 BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); 786 BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
786 787
787 con->out_kvec[index].iov_len = size; 788 con->out_kvec[index].iov_len = size;
@@ -790,6 +791,27 @@ static void con_out_kvec_add(struct ceph_connection *con,
790 con->out_kvec_bytes += size; 791 con->out_kvec_bytes += size;
791} 792}
792 793
794/*
795 * Chop off a kvec from the end. Return residual number of bytes for
796 * that kvec, i.e. how many bytes would have been written if the kvec
797 * hadn't been nuked.
798 */
799static int con_out_kvec_skip(struct ceph_connection *con)
800{
801 int off = con->out_kvec_cur - con->out_kvec;
802 int skip = 0;
803
804 if (con->out_kvec_bytes > 0) {
805 skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
806 BUG_ON(con->out_kvec_bytes < skip);
807 BUG_ON(!con->out_kvec_left);
808 con->out_kvec_bytes -= skip;
809 con->out_kvec_left--;
810 }
811
812 return skip;
813}
814
793#ifdef CONFIG_BLOCK 815#ifdef CONFIG_BLOCK
794 816
795/* 817/*
@@ -1042,7 +1064,7 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
1042 /* Move on to the next page */ 1064 /* Move on to the next page */
1043 1065
1044 BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); 1066 BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
1045 cursor->page = list_entry_next(cursor->page, lru); 1067 cursor->page = list_next_entry(cursor->page, lru);
1046 cursor->last_piece = cursor->resid <= PAGE_SIZE; 1068 cursor->last_piece = cursor->resid <= PAGE_SIZE;
1047 1069
1048 return true; 1070 return true;
@@ -1166,7 +1188,7 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
1166 if (!cursor->resid && cursor->total_resid) { 1188 if (!cursor->resid && cursor->total_resid) {
1167 WARN_ON(!cursor->last_piece); 1189 WARN_ON(!cursor->last_piece);
1168 BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); 1190 BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
1169 cursor->data = list_entry_next(cursor->data, links); 1191 cursor->data = list_next_entry(cursor->data, links);
1170 __ceph_msg_data_cursor_init(cursor); 1192 __ceph_msg_data_cursor_init(cursor);
1171 new_piece = true; 1193 new_piece = true;
1172 } 1194 }
@@ -1197,7 +1219,6 @@ static void prepare_write_message_footer(struct ceph_connection *con)
1197 m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; 1219 m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
1198 1220
1199 dout("prepare_write_message_footer %p\n", con); 1221 dout("prepare_write_message_footer %p\n", con);
1200 con->out_kvec_is_msg = true;
1201 con->out_kvec[v].iov_base = &m->footer; 1222 con->out_kvec[v].iov_base = &m->footer;
1202 if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { 1223 if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
1203 if (con->ops->sign_message) 1224 if (con->ops->sign_message)
@@ -1225,7 +1246,6 @@ static void prepare_write_message(struct ceph_connection *con)
1225 u32 crc; 1246 u32 crc;
1226 1247
1227 con_out_kvec_reset(con); 1248 con_out_kvec_reset(con);
1228 con->out_kvec_is_msg = true;
1229 con->out_msg_done = false; 1249 con->out_msg_done = false;
1230 1250
1231 /* Sneak an ack in there first? If we can get it into the same 1251 /* Sneak an ack in there first? If we can get it into the same
@@ -1265,18 +1285,19 @@ static void prepare_write_message(struct ceph_connection *con)
1265 1285
1266 /* tag + hdr + front + middle */ 1286 /* tag + hdr + front + middle */
1267 con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); 1287 con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
1268 con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); 1288 con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
1269 con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); 1289 con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
1270 1290
1271 if (m->middle) 1291 if (m->middle)
1272 con_out_kvec_add(con, m->middle->vec.iov_len, 1292 con_out_kvec_add(con, m->middle->vec.iov_len,
1273 m->middle->vec.iov_base); 1293 m->middle->vec.iov_base);
1274 1294
1275 /* fill in crc (except data pages), footer */ 1295 /* fill in hdr crc and finalize hdr */
1276 crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); 1296 crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
1277 con->out_msg->hdr.crc = cpu_to_le32(crc); 1297 con->out_msg->hdr.crc = cpu_to_le32(crc);
1278 con->out_msg->footer.flags = 0; 1298 memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
1279 1299
1300 /* fill in front and middle crc, footer */
1280 crc = crc32c(0, m->front.iov_base, m->front.iov_len); 1301 crc = crc32c(0, m->front.iov_base, m->front.iov_len);
1281 con->out_msg->footer.front_crc = cpu_to_le32(crc); 1302 con->out_msg->footer.front_crc = cpu_to_le32(crc);
1282 if (m->middle) { 1303 if (m->middle) {
@@ -1288,6 +1309,7 @@ static void prepare_write_message(struct ceph_connection *con)
1288 dout("%s front_crc %u middle_crc %u\n", __func__, 1309 dout("%s front_crc %u middle_crc %u\n", __func__,
1289 le32_to_cpu(con->out_msg->footer.front_crc), 1310 le32_to_cpu(con->out_msg->footer.front_crc),
1290 le32_to_cpu(con->out_msg->footer.middle_crc)); 1311 le32_to_cpu(con->out_msg->footer.middle_crc));
1312 con->out_msg->footer.flags = 0;
1291 1313
1292 /* is there a data payload? */ 1314 /* is there a data payload? */
1293 con->out_msg->footer.data_crc = 0; 1315 con->out_msg->footer.data_crc = 0;
@@ -1492,7 +1514,6 @@ static int write_partial_kvec(struct ceph_connection *con)
1492 } 1514 }
1493 } 1515 }
1494 con->out_kvec_left = 0; 1516 con->out_kvec_left = 0;
1495 con->out_kvec_is_msg = false;
1496 ret = 1; 1517 ret = 1;
1497out: 1518out:
1498 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, 1519 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
@@ -1584,6 +1605,7 @@ static int write_partial_skip(struct ceph_connection *con)
1584{ 1605{
1585 int ret; 1606 int ret;
1586 1607
1608 dout("%s %p %d left\n", __func__, con, con->out_skip);
1587 while (con->out_skip > 0) { 1609 while (con->out_skip > 0) {
1588 size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); 1610 size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
1589 1611
@@ -2506,13 +2528,13 @@ more:
2506 2528
2507more_kvec: 2529more_kvec:
2508 /* kvec data queued? */ 2530 /* kvec data queued? */
2509 if (con->out_skip) { 2531 if (con->out_kvec_left) {
2510 ret = write_partial_skip(con); 2532 ret = write_partial_kvec(con);
2511 if (ret <= 0) 2533 if (ret <= 0)
2512 goto out; 2534 goto out;
2513 } 2535 }
2514 if (con->out_kvec_left) { 2536 if (con->out_skip) {
2515 ret = write_partial_kvec(con); 2537 ret = write_partial_skip(con);
2516 if (ret <= 0) 2538 if (ret <= 0)
2517 goto out; 2539 goto out;
2518 } 2540 }
@@ -2805,13 +2827,17 @@ static bool con_backoff(struct ceph_connection *con)
2805 2827
2806static void con_fault_finish(struct ceph_connection *con) 2828static void con_fault_finish(struct ceph_connection *con)
2807{ 2829{
2830 dout("%s %p\n", __func__, con);
2831
2808 /* 2832 /*
2809 * in case we faulted due to authentication, invalidate our 2833 * in case we faulted due to authentication, invalidate our
2810 * current tickets so that we can get new ones. 2834 * current tickets so that we can get new ones.
2811 */ 2835 */
2812 if (con->auth_retry && con->ops->invalidate_authorizer) { 2836 if (con->auth_retry) {
2813 dout("calling invalidate_authorizer()\n"); 2837 dout("auth_retry %d, invalidating\n", con->auth_retry);
2814 con->ops->invalidate_authorizer(con); 2838 if (con->ops->invalidate_authorizer)
2839 con->ops->invalidate_authorizer(con);
2840 con->auth_retry = 0;
2815 } 2841 }
2816 2842
2817 if (con->ops->fault) 2843 if (con->ops->fault)
@@ -3050,16 +3076,31 @@ void ceph_msg_revoke(struct ceph_msg *msg)
3050 ceph_msg_put(msg); 3076 ceph_msg_put(msg);
3051 } 3077 }
3052 if (con->out_msg == msg) { 3078 if (con->out_msg == msg) {
3053 dout("%s %p msg %p - was sending\n", __func__, con, msg); 3079 BUG_ON(con->out_skip);
3054 con->out_msg = NULL; 3080 /* footer */
3055 if (con->out_kvec_is_msg) { 3081 if (con->out_msg_done) {
3056 con->out_skip = con->out_kvec_bytes; 3082 con->out_skip += con_out_kvec_skip(con);
3057 con->out_kvec_is_msg = false; 3083 } else {
3084 BUG_ON(!msg->data_length);
3085 if (con->peer_features & CEPH_FEATURE_MSG_AUTH)
3086 con->out_skip += sizeof(msg->footer);
3087 else
3088 con->out_skip += sizeof(msg->old_footer);
3058 } 3089 }
3090 /* data, middle, front */
3091 if (msg->data_length)
3092 con->out_skip += msg->cursor.total_resid;
3093 if (msg->middle)
3094 con->out_skip += con_out_kvec_skip(con);
3095 con->out_skip += con_out_kvec_skip(con);
3096
3097 dout("%s %p msg %p - was sending, will write %d skip %d\n",
3098 __func__, con, msg, con->out_kvec_bytes, con->out_skip);
3059 msg->hdr.seq = 0; 3099 msg->hdr.seq = 0;
3060 3100 con->out_msg = NULL;
3061 ceph_msg_put(msg); 3101 ceph_msg_put(msg);
3062 } 3102 }
3103
3063 mutex_unlock(&con->mutex); 3104 mutex_unlock(&con->mutex);
3064} 3105}
3065 3106
@@ -3361,9 +3402,7 @@ static void ceph_msg_free(struct ceph_msg *m)
3361static void ceph_msg_release(struct kref *kref) 3402static void ceph_msg_release(struct kref *kref)
3362{ 3403{
3363 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); 3404 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
3364 LIST_HEAD(data); 3405 struct ceph_msg_data *data, *next;
3365 struct list_head *links;
3366 struct list_head *next;
3367 3406
3368 dout("%s %p\n", __func__, m); 3407 dout("%s %p\n", __func__, m);
3369 WARN_ON(!list_empty(&m->list_head)); 3408 WARN_ON(!list_empty(&m->list_head));
@@ -3376,12 +3415,8 @@ static void ceph_msg_release(struct kref *kref)
3376 m->middle = NULL; 3415 m->middle = NULL;
3377 } 3416 }
3378 3417
3379 list_splice_init(&m->data, &data); 3418 list_for_each_entry_safe(data, next, &m->data, links) {
3380 list_for_each_safe(links, next, &data) { 3419 list_del_init(&data->links);
3381 struct ceph_msg_data *data;
3382
3383 data = list_entry(links, struct ceph_msg_data, links);
3384 list_del_init(links);
3385 ceph_msg_data_destroy(data); 3420 ceph_msg_data_destroy(data);
3386 } 3421 }
3387 m->data_length = 0; 3422 m->data_length = 0;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index edda01626a45..de85dddc3dc0 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -364,10 +364,6 @@ static bool have_debugfs_info(struct ceph_mon_client *monc)
364 return monc->client->have_fsid && monc->auth->global_id > 0; 364 return monc->client->have_fsid && monc->auth->global_id > 0;
365} 365}
366 366
367/*
368 * The monitor responds with mount ack indicate mount success. The
369 * included client ticket allows the client to talk to MDSs and OSDs.
370 */
371static void ceph_monc_handle_map(struct ceph_mon_client *monc, 367static void ceph_monc_handle_map(struct ceph_mon_client *monc,
372 struct ceph_msg *msg) 368 struct ceph_msg *msg)
373{ 369{
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index c29809f765dc..62c049b647e9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -56,7 +56,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
56obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o 56obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
57obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o 57obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
58obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o 58obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
59obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
60obj-$(CONFIG_NETLABEL) += cipso_ipv4.o 59obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
61 60
62obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 61obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 22e73171ea63..d07fc076bea0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head)
289 289
290 if (!n->tn_bits) 290 if (!n->tn_bits)
291 kmem_cache_free(trie_leaf_kmem, n); 291 kmem_cache_free(trie_leaf_kmem, n);
292 else if (n->tn_bits <= TNODE_KMALLOC_MAX)
293 kfree(n);
294 else 292 else
295 vfree(n); 293 kvfree(n);
296} 294}
297 295
298#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) 296#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 46ce410703b1..4d367b4139a3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -24,7 +24,6 @@
24#include <net/cipso_ipv4.h> 24#include <net/cipso_ipv4.h>
25#include <net/inet_frag.h> 25#include <net/inet_frag.h>
26#include <net/ping.h> 26#include <net/ping.h>
27#include <net/tcp_memcontrol.h>
28 27
29static int zero; 28static int zero;
30static int one = 1; 29static int one = 1;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2a67244f97ca..a4d523709ab3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -73,7 +73,6 @@
73#include <net/timewait_sock.h> 73#include <net/timewait_sock.h>
74#include <net/xfrm.h> 74#include <net/xfrm.h>
75#include <net/secure_seq.h> 75#include <net/secure_seq.h>
76#include <net/tcp_memcontrol.h>
77#include <net/busy_poll.h> 76#include <net/busy_poll.h>
78 77
79#include <linux/inet.h> 78#include <linux/inet.h>
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
deleted file mode 100644
index 18bc7f745e9c..000000000000
--- a/net/ipv4/tcp_memcontrol.c
+++ /dev/null
@@ -1,200 +0,0 @@
1#include <net/tcp.h>
2#include <net/tcp_memcontrol.h>
3#include <net/sock.h>
4#include <net/ip.h>
5#include <linux/nsproxy.h>
6#include <linux/memcontrol.h>
7#include <linux/module.h>
8
9int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
10{
11 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
12 struct page_counter *counter_parent = NULL;
13 /*
14 * The root cgroup does not use page_counters, but rather,
15 * rely on the data already collected by the network
16 * subsystem
17 */
18 if (memcg == root_mem_cgroup)
19 return 0;
20
21 memcg->tcp_mem.memory_pressure = 0;
22
23 if (parent)
24 counter_parent = &parent->tcp_mem.memory_allocated;
25
26 page_counter_init(&memcg->tcp_mem.memory_allocated, counter_parent);
27
28 return 0;
29}
30
31void tcp_destroy_cgroup(struct mem_cgroup *memcg)
32{
33 if (memcg == root_mem_cgroup)
34 return;
35
36 if (memcg->tcp_mem.active)
37 static_branch_dec(&memcg_sockets_enabled_key);
38}
39
40static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
41{
42 int ret;
43
44 if (memcg == root_mem_cgroup)
45 return -EINVAL;
46
47 ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, nr_pages);
48 if (ret)
49 return ret;
50
51 if (!memcg->tcp_mem.active) {
52 /*
53 * The active flag needs to be written after the static_key
54 * update. This is what guarantees that the socket activation
55 * function is the last one to run. See sock_update_memcg() for
56 * details, and note that we don't mark any socket as belonging
57 * to this memcg until that flag is up.
58 *
59 * We need to do this, because static_keys will span multiple
60 * sites, but we can't control their order. If we mark a socket
61 * as accounted, but the accounting functions are not patched in
62 * yet, we'll lose accounting.
63 *
64 * We never race with the readers in sock_update_memcg(),
65 * because when this value change, the code to process it is not
66 * patched in yet.
67 */
68 static_branch_inc(&memcg_sockets_enabled_key);
69 memcg->tcp_mem.active = true;
70 }
71
72 return 0;
73}
74
75enum {
76 RES_USAGE,
77 RES_LIMIT,
78 RES_MAX_USAGE,
79 RES_FAILCNT,
80};
81
82static DEFINE_MUTEX(tcp_limit_mutex);
83
84static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
85 char *buf, size_t nbytes, loff_t off)
86{
87 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
88 unsigned long nr_pages;
89 int ret = 0;
90
91 buf = strstrip(buf);
92
93 switch (of_cft(of)->private) {
94 case RES_LIMIT:
95 /* see memcontrol.c */
96 ret = page_counter_memparse(buf, "-1", &nr_pages);
97 if (ret)
98 break;
99 mutex_lock(&tcp_limit_mutex);
100 ret = tcp_update_limit(memcg, nr_pages);
101 mutex_unlock(&tcp_limit_mutex);
102 break;
103 default:
104 ret = -EINVAL;
105 break;
106 }
107 return ret ?: nbytes;
108}
109
110static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
111{
112 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
113 u64 val;
114
115 switch (cft->private) {
116 case RES_LIMIT:
117 if (memcg == root_mem_cgroup)
118 val = PAGE_COUNTER_MAX;
119 else
120 val = memcg->tcp_mem.memory_allocated.limit;
121 val *= PAGE_SIZE;
122 break;
123 case RES_USAGE:
124 if (memcg == root_mem_cgroup)
125 val = atomic_long_read(&tcp_memory_allocated);
126 else
127 val = page_counter_read(&memcg->tcp_mem.memory_allocated);
128 val *= PAGE_SIZE;
129 break;
130 case RES_FAILCNT:
131 if (memcg == root_mem_cgroup)
132 return 0;
133 val = memcg->tcp_mem.memory_allocated.failcnt;
134 break;
135 case RES_MAX_USAGE:
136 if (memcg == root_mem_cgroup)
137 return 0;
138 val = memcg->tcp_mem.memory_allocated.watermark;
139 val *= PAGE_SIZE;
140 break;
141 default:
142 BUG();
143 }
144 return val;
145}
146
147static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
148 char *buf, size_t nbytes, loff_t off)
149{
150 struct mem_cgroup *memcg;
151
152 memcg = mem_cgroup_from_css(of_css(of));
153 if (memcg == root_mem_cgroup)
154 return nbytes;
155
156 switch (of_cft(of)->private) {
157 case RES_MAX_USAGE:
158 page_counter_reset_watermark(&memcg->tcp_mem.memory_allocated);
159 break;
160 case RES_FAILCNT:
161 memcg->tcp_mem.memory_allocated.failcnt = 0;
162 break;
163 }
164
165 return nbytes;
166}
167
168static struct cftype tcp_files[] = {
169 {
170 .name = "kmem.tcp.limit_in_bytes",
171 .write = tcp_cgroup_write,
172 .read_u64 = tcp_cgroup_read,
173 .private = RES_LIMIT,
174 },
175 {
176 .name = "kmem.tcp.usage_in_bytes",
177 .read_u64 = tcp_cgroup_read,
178 .private = RES_USAGE,
179 },
180 {
181 .name = "kmem.tcp.failcnt",
182 .private = RES_FAILCNT,
183 .write = tcp_cgroup_reset,
184 .read_u64 = tcp_cgroup_read,
185 },
186 {
187 .name = "kmem.tcp.max_usage_in_bytes",
188 .private = RES_MAX_USAGE,
189 .write = tcp_cgroup_reset,
190 .read_u64 = tcp_cgroup_read,
191 },
192 { } /* terminate */
193};
194
195static int __init tcp_memcontrol_init(void)
196{
197 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files));
198 return 0;
199}
200__initcall(tcp_memcontrol_init);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4ad8edb46f7c..006396e31cb0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -61,7 +61,6 @@
61#include <net/timewait_sock.h> 61#include <net/timewait_sock.h>
62#include <net/inet_common.h> 62#include <net/inet_common.h>
63#include <net/secure_seq.h> 63#include <net/secure_seq.h>
64#include <net/tcp_memcontrol.h>
65#include <net/busy_poll.h> 64#include <net/busy_poll.h>
66 65
67#include <linux/proc_fs.h> 66#include <linux/proc_fs.h>
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index abbdff03ce92..3e24d0ddb51b 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -91,7 +91,7 @@ static const struct file_operations reset_ops = {
91}; 91};
92#endif 92#endif
93 93
94static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { 94static const char *hw_flag_names[] = {
95#define FLAG(F) [IEEE80211_HW_##F] = #F 95#define FLAG(F) [IEEE80211_HW_##F] = #F
96 FLAG(HAS_RATE_CONTROL), 96 FLAG(HAS_RATE_CONTROL),
97 FLAG(RX_INCLUDES_FCS), 97 FLAG(RX_INCLUDES_FCS),
@@ -126,9 +126,6 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
126 FLAG(SUPPORTS_AMSDU_IN_AMPDU), 126 FLAG(SUPPORTS_AMSDU_IN_AMPDU),
127 FLAG(BEACON_TX_STATUS), 127 FLAG(BEACON_TX_STATUS),
128 FLAG(NEEDS_UNIQUE_STA_ADDR), 128 FLAG(NEEDS_UNIQUE_STA_ADDR),
129
130 /* keep last for the build bug below */
131 (void *)0x1
132#undef FLAG 129#undef FLAG
133}; 130};
134 131
@@ -148,7 +145,7 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf,
148 /* fail compilation if somebody adds or removes 145 /* fail compilation if somebody adds or removes
149 * a flag without updating the name array above 146 * a flag without updating the name array above
150 */ 147 */
151 BUILD_BUG_ON(hw_flag_names[NUM_IEEE80211_HW_FLAGS] != (void *)0x1); 148 BUILD_BUG_ON(ARRAY_SIZE(hw_flag_names) != NUM_IEEE80211_HW_FLAGS);
152 149
153 for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) { 150 for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) {
154 if (test_bit(i, local->hw.flags)) 151 if (test_bit(i, local->hw.flags))
diff --git a/net/rds/ib.c b/net/rds/ib.c
index f222885ac0c7..9481d55ff6cb 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -122,44 +122,34 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
122static void rds_ib_add_one(struct ib_device *device) 122static void rds_ib_add_one(struct ib_device *device)
123{ 123{
124 struct rds_ib_device *rds_ibdev; 124 struct rds_ib_device *rds_ibdev;
125 struct ib_device_attr *dev_attr;
126 125
127 /* Only handle IB (no iWARP) devices */ 126 /* Only handle IB (no iWARP) devices */
128 if (device->node_type != RDMA_NODE_IB_CA) 127 if (device->node_type != RDMA_NODE_IB_CA)
129 return; 128 return;
130 129
131 dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
132 if (!dev_attr)
133 return;
134
135 if (ib_query_device(device, dev_attr)) {
136 rdsdebug("Query device failed for %s\n", device->name);
137 goto free_attr;
138 }
139
140 rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, 130 rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
141 ibdev_to_node(device)); 131 ibdev_to_node(device));
142 if (!rds_ibdev) 132 if (!rds_ibdev)
143 goto free_attr; 133 return;
144 134
145 spin_lock_init(&rds_ibdev->spinlock); 135 spin_lock_init(&rds_ibdev->spinlock);
146 atomic_set(&rds_ibdev->refcount, 1); 136 atomic_set(&rds_ibdev->refcount, 1);
147 INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); 137 INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
148 138
149 rds_ibdev->max_wrs = dev_attr->max_qp_wr; 139 rds_ibdev->max_wrs = device->attrs.max_qp_wr;
150 rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); 140 rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE);
151 141
152 rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; 142 rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
153 rds_ibdev->max_1m_fmrs = dev_attr->max_mr ? 143 rds_ibdev->max_1m_fmrs = device->attrs.max_mr ?
154 min_t(unsigned int, (dev_attr->max_mr / 2), 144 min_t(unsigned int, (device->attrs.max_mr / 2),
155 rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size; 145 rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size;
156 146
157 rds_ibdev->max_8k_fmrs = dev_attr->max_mr ? 147 rds_ibdev->max_8k_fmrs = device->attrs.max_mr ?
158 min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE), 148 min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE),
159 rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size; 149 rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size;
160 150
161 rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; 151 rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
162 rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; 152 rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
163 153
164 rds_ibdev->dev = device; 154 rds_ibdev->dev = device;
165 rds_ibdev->pd = ib_alloc_pd(device); 155 rds_ibdev->pd = ib_alloc_pd(device);
@@ -183,7 +173,7 @@ static void rds_ib_add_one(struct ib_device *device)
183 } 173 }
184 174
185 rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n", 175 rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n",
186 dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, 176 device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
187 rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs, 177 rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs,
188 rds_ibdev->max_8k_fmrs); 178 rds_ibdev->max_8k_fmrs);
189 179
@@ -202,8 +192,6 @@ static void rds_ib_add_one(struct ib_device *device)
202 192
203put_dev: 193put_dev:
204 rds_ib_dev_put(rds_ibdev); 194 rds_ib_dev_put(rds_ibdev);
205free_attr:
206 kfree(dev_attr);
207} 195}
208 196
209/* 197/*
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 576f1825fc55..f4a9fff829e0 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -60,30 +60,20 @@ LIST_HEAD(iw_nodev_conns);
60static void rds_iw_add_one(struct ib_device *device) 60static void rds_iw_add_one(struct ib_device *device)
61{ 61{
62 struct rds_iw_device *rds_iwdev; 62 struct rds_iw_device *rds_iwdev;
63 struct ib_device_attr *dev_attr;
64 63
65 /* Only handle iwarp devices */ 64 /* Only handle iwarp devices */
66 if (device->node_type != RDMA_NODE_RNIC) 65 if (device->node_type != RDMA_NODE_RNIC)
67 return; 66 return;
68 67
69 dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
70 if (!dev_attr)
71 return;
72
73 if (ib_query_device(device, dev_attr)) {
74 rdsdebug("Query device failed for %s\n", device->name);
75 goto free_attr;
76 }
77
78 rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); 68 rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
79 if (!rds_iwdev) 69 if (!rds_iwdev)
80 goto free_attr; 70 return;
81 71
82 spin_lock_init(&rds_iwdev->spinlock); 72 spin_lock_init(&rds_iwdev->spinlock);
83 73
84 rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); 74 rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
85 rds_iwdev->max_wrs = dev_attr->max_qp_wr; 75 rds_iwdev->max_wrs = device->attrs.max_qp_wr;
86 rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); 76 rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE);
87 77
88 rds_iwdev->dev = device; 78 rds_iwdev->dev = device;
89 rds_iwdev->pd = ib_alloc_pd(device); 79 rds_iwdev->pd = ib_alloc_pd(device);
@@ -111,8 +101,7 @@ static void rds_iw_add_one(struct ib_device *device)
111 list_add_tail(&rds_iwdev->list, &rds_iw_devices); 101 list_add_tail(&rds_iwdev->list, &rds_iw_devices);
112 102
113 ib_set_client_data(device, &rds_iw_client, rds_iwdev); 103 ib_set_client_data(device, &rds_iw_client, rds_iwdev);
114 104 return;
115 goto free_attr;
116 105
117err_mr: 106err_mr:
118 if (rds_iwdev->mr) 107 if (rds_iwdev->mr)
@@ -121,8 +110,6 @@ err_pd:
121 ib_dealloc_pd(rds_iwdev->pd); 110 ib_dealloc_pd(rds_iwdev->pd);
122free_dev: 111free_dev:
123 kfree(rds_iwdev); 112 kfree(rds_iwdev);
124free_attr:
125 kfree(dev_attr);
126} 113}
127 114
128static void rds_iw_remove_one(struct ib_device *device, void *client_data) 115static void rds_iw_remove_one(struct ib_device *device, void *client_data)
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 5e4f815c2b34..2b32fd602669 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -771,7 +771,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
771 if (count == 0) 771 if (count == 0)
772 return 0; 772 return 0;
773 773
774 mutex_lock(&inode->i_mutex); /* protect against multiple concurrent 774 inode_lock(inode); /* protect against multiple concurrent
775 * readers on this file */ 775 * readers on this file */
776 again: 776 again:
777 spin_lock(&queue_lock); 777 spin_lock(&queue_lock);
@@ -784,7 +784,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
784 } 784 }
785 if (rp->q.list.next == &cd->queue) { 785 if (rp->q.list.next == &cd->queue) {
786 spin_unlock(&queue_lock); 786 spin_unlock(&queue_lock);
787 mutex_unlock(&inode->i_mutex); 787 inode_unlock(inode);
788 WARN_ON_ONCE(rp->offset); 788 WARN_ON_ONCE(rp->offset);
789 return 0; 789 return 0;
790 } 790 }
@@ -838,7 +838,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
838 } 838 }
839 if (err == -EAGAIN) 839 if (err == -EAGAIN)
840 goto again; 840 goto again;
841 mutex_unlock(&inode->i_mutex); 841 inode_unlock(inode);
842 return err ? err : count; 842 return err ? err : count;
843} 843}
844 844
@@ -909,9 +909,9 @@ static ssize_t cache_write(struct file *filp, const char __user *buf,
909 if (!cd->cache_parse) 909 if (!cd->cache_parse)
910 goto out; 910 goto out;
911 911
912 mutex_lock(&inode->i_mutex); 912 inode_lock(inode);
913 ret = cache_downcall(mapping, buf, count, cd); 913 ret = cache_downcall(mapping, buf, count, cd);
914 mutex_unlock(&inode->i_mutex); 914 inode_unlock(inode);
915out: 915out:
916 return ret; 916 return ret;
917} 917}
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 14f45bf0410c..31789ef3e614 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -172,7 +172,7 @@ rpc_close_pipes(struct inode *inode)
172 int need_release; 172 int need_release;
173 LIST_HEAD(free_list); 173 LIST_HEAD(free_list);
174 174
175 mutex_lock(&inode->i_mutex); 175 inode_lock(inode);
176 spin_lock(&pipe->lock); 176 spin_lock(&pipe->lock);
177 need_release = pipe->nreaders != 0 || pipe->nwriters != 0; 177 need_release = pipe->nreaders != 0 || pipe->nwriters != 0;
178 pipe->nreaders = 0; 178 pipe->nreaders = 0;
@@ -188,7 +188,7 @@ rpc_close_pipes(struct inode *inode)
188 cancel_delayed_work_sync(&pipe->queue_timeout); 188 cancel_delayed_work_sync(&pipe->queue_timeout);
189 rpc_inode_setowner(inode, NULL); 189 rpc_inode_setowner(inode, NULL);
190 RPC_I(inode)->pipe = NULL; 190 RPC_I(inode)->pipe = NULL;
191 mutex_unlock(&inode->i_mutex); 191 inode_unlock(inode);
192} 192}
193 193
194static struct inode * 194static struct inode *
@@ -221,7 +221,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp)
221 int first_open; 221 int first_open;
222 int res = -ENXIO; 222 int res = -ENXIO;
223 223
224 mutex_lock(&inode->i_mutex); 224 inode_lock(inode);
225 pipe = RPC_I(inode)->pipe; 225 pipe = RPC_I(inode)->pipe;
226 if (pipe == NULL) 226 if (pipe == NULL)
227 goto out; 227 goto out;
@@ -237,7 +237,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp)
237 pipe->nwriters++; 237 pipe->nwriters++;
238 res = 0; 238 res = 0;
239out: 239out:
240 mutex_unlock(&inode->i_mutex); 240 inode_unlock(inode);
241 return res; 241 return res;
242} 242}
243 243
@@ -248,7 +248,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
248 struct rpc_pipe_msg *msg; 248 struct rpc_pipe_msg *msg;
249 int last_close; 249 int last_close;
250 250
251 mutex_lock(&inode->i_mutex); 251 inode_lock(inode);
252 pipe = RPC_I(inode)->pipe; 252 pipe = RPC_I(inode)->pipe;
253 if (pipe == NULL) 253 if (pipe == NULL)
254 goto out; 254 goto out;
@@ -278,7 +278,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
278 if (last_close && pipe->ops->release_pipe) 278 if (last_close && pipe->ops->release_pipe)
279 pipe->ops->release_pipe(inode); 279 pipe->ops->release_pipe(inode);
280out: 280out:
281 mutex_unlock(&inode->i_mutex); 281 inode_unlock(inode);
282 return 0; 282 return 0;
283} 283}
284 284
@@ -290,7 +290,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
290 struct rpc_pipe_msg *msg; 290 struct rpc_pipe_msg *msg;
291 int res = 0; 291 int res = 0;
292 292
293 mutex_lock(&inode->i_mutex); 293 inode_lock(inode);
294 pipe = RPC_I(inode)->pipe; 294 pipe = RPC_I(inode)->pipe;
295 if (pipe == NULL) { 295 if (pipe == NULL) {
296 res = -EPIPE; 296 res = -EPIPE;
@@ -322,7 +322,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
322 pipe->ops->destroy_msg(msg); 322 pipe->ops->destroy_msg(msg);
323 } 323 }
324out_unlock: 324out_unlock:
325 mutex_unlock(&inode->i_mutex); 325 inode_unlock(inode);
326 return res; 326 return res;
327} 327}
328 328
@@ -332,11 +332,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of
332 struct inode *inode = file_inode(filp); 332 struct inode *inode = file_inode(filp);
333 int res; 333 int res;
334 334
335 mutex_lock(&inode->i_mutex); 335 inode_lock(inode);
336 res = -EPIPE; 336 res = -EPIPE;
337 if (RPC_I(inode)->pipe != NULL) 337 if (RPC_I(inode)->pipe != NULL)
338 res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len); 338 res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len);
339 mutex_unlock(&inode->i_mutex); 339 inode_unlock(inode);
340 return res; 340 return res;
341} 341}
342 342
@@ -349,12 +349,12 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
349 349
350 poll_wait(filp, &rpci->waitq, wait); 350 poll_wait(filp, &rpci->waitq, wait);
351 351
352 mutex_lock(&inode->i_mutex); 352 inode_lock(inode);
353 if (rpci->pipe == NULL) 353 if (rpci->pipe == NULL)
354 mask |= POLLERR | POLLHUP; 354 mask |= POLLERR | POLLHUP;
355 else if (filp->private_data || !list_empty(&rpci->pipe->pipe)) 355 else if (filp->private_data || !list_empty(&rpci->pipe->pipe))
356 mask |= POLLIN | POLLRDNORM; 356 mask |= POLLIN | POLLRDNORM;
357 mutex_unlock(&inode->i_mutex); 357 inode_unlock(inode);
358 return mask; 358 return mask;
359} 359}
360 360
@@ -367,10 +367,10 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
367 367
368 switch (cmd) { 368 switch (cmd) {
369 case FIONREAD: 369 case FIONREAD:
370 mutex_lock(&inode->i_mutex); 370 inode_lock(inode);
371 pipe = RPC_I(inode)->pipe; 371 pipe = RPC_I(inode)->pipe;
372 if (pipe == NULL) { 372 if (pipe == NULL) {
373 mutex_unlock(&inode->i_mutex); 373 inode_unlock(inode);
374 return -EPIPE; 374 return -EPIPE;
375 } 375 }
376 spin_lock(&pipe->lock); 376 spin_lock(&pipe->lock);
@@ -381,7 +381,7 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
381 len += msg->len - msg->copied; 381 len += msg->len - msg->copied;
382 } 382 }
383 spin_unlock(&pipe->lock); 383 spin_unlock(&pipe->lock);
384 mutex_unlock(&inode->i_mutex); 384 inode_unlock(inode);
385 return put_user(len, (int __user *)arg); 385 return put_user(len, (int __user *)arg);
386 default: 386 default:
387 return -EINVAL; 387 return -EINVAL;
@@ -617,9 +617,9 @@ int rpc_rmdir(struct dentry *dentry)
617 617
618 parent = dget_parent(dentry); 618 parent = dget_parent(dentry);
619 dir = d_inode(parent); 619 dir = d_inode(parent);
620 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); 620 inode_lock_nested(dir, I_MUTEX_PARENT);
621 error = __rpc_rmdir(dir, dentry); 621 error = __rpc_rmdir(dir, dentry);
622 mutex_unlock(&dir->i_mutex); 622 inode_unlock(dir);
623 dput(parent); 623 dput(parent);
624 return error; 624 return error;
625} 625}
@@ -701,9 +701,9 @@ static void rpc_depopulate(struct dentry *parent,
701{ 701{
702 struct inode *dir = d_inode(parent); 702 struct inode *dir = d_inode(parent);
703 703
704 mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD); 704 inode_lock_nested(dir, I_MUTEX_CHILD);
705 __rpc_depopulate(parent, files, start, eof); 705 __rpc_depopulate(parent, files, start, eof);
706 mutex_unlock(&dir->i_mutex); 706 inode_unlock(dir);
707} 707}
708 708
709static int rpc_populate(struct dentry *parent, 709static int rpc_populate(struct dentry *parent,
@@ -715,7 +715,7 @@ static int rpc_populate(struct dentry *parent,
715 struct dentry *dentry; 715 struct dentry *dentry;
716 int i, err; 716 int i, err;
717 717
718 mutex_lock(&dir->i_mutex); 718 inode_lock(dir);
719 for (i = start; i < eof; i++) { 719 for (i = start; i < eof; i++) {
720 dentry = __rpc_lookup_create_exclusive(parent, files[i].name); 720 dentry = __rpc_lookup_create_exclusive(parent, files[i].name);
721 err = PTR_ERR(dentry); 721 err = PTR_ERR(dentry);
@@ -739,11 +739,11 @@ static int rpc_populate(struct dentry *parent,
739 if (err != 0) 739 if (err != 0)
740 goto out_bad; 740 goto out_bad;
741 } 741 }
742 mutex_unlock(&dir->i_mutex); 742 inode_unlock(dir);
743 return 0; 743 return 0;
744out_bad: 744out_bad:
745 __rpc_depopulate(parent, files, start, eof); 745 __rpc_depopulate(parent, files, start, eof);
746 mutex_unlock(&dir->i_mutex); 746 inode_unlock(dir);
747 printk(KERN_WARNING "%s: %s failed to populate directory %pd\n", 747 printk(KERN_WARNING "%s: %s failed to populate directory %pd\n",
748 __FILE__, __func__, parent); 748 __FILE__, __func__, parent);
749 return err; 749 return err;
@@ -757,7 +757,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent,
757 struct inode *dir = d_inode(parent); 757 struct inode *dir = d_inode(parent);
758 int error; 758 int error;
759 759
760 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); 760 inode_lock_nested(dir, I_MUTEX_PARENT);
761 dentry = __rpc_lookup_create_exclusive(parent, name); 761 dentry = __rpc_lookup_create_exclusive(parent, name);
762 if (IS_ERR(dentry)) 762 if (IS_ERR(dentry))
763 goto out; 763 goto out;
@@ -770,7 +770,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent,
770 goto err_rmdir; 770 goto err_rmdir;
771 } 771 }
772out: 772out:
773 mutex_unlock(&dir->i_mutex); 773 inode_unlock(dir);
774 return dentry; 774 return dentry;
775err_rmdir: 775err_rmdir:
776 __rpc_rmdir(dir, dentry); 776 __rpc_rmdir(dir, dentry);
@@ -788,11 +788,11 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
788 788
789 parent = dget_parent(dentry); 789 parent = dget_parent(dentry);
790 dir = d_inode(parent); 790 dir = d_inode(parent);
791 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); 791 inode_lock_nested(dir, I_MUTEX_PARENT);
792 if (depopulate != NULL) 792 if (depopulate != NULL)
793 depopulate(dentry); 793 depopulate(dentry);
794 error = __rpc_rmdir(dir, dentry); 794 error = __rpc_rmdir(dir, dentry);
795 mutex_unlock(&dir->i_mutex); 795 inode_unlock(dir);
796 dput(parent); 796 dput(parent);
797 return error; 797 return error;
798} 798}
@@ -828,7 +828,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
828 if (pipe->ops->downcall == NULL) 828 if (pipe->ops->downcall == NULL)
829 umode &= ~S_IWUGO; 829 umode &= ~S_IWUGO;
830 830
831 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); 831 inode_lock_nested(dir, I_MUTEX_PARENT);
832 dentry = __rpc_lookup_create_exclusive(parent, name); 832 dentry = __rpc_lookup_create_exclusive(parent, name);
833 if (IS_ERR(dentry)) 833 if (IS_ERR(dentry))
834 goto out; 834 goto out;
@@ -837,7 +837,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
837 if (err) 837 if (err)
838 goto out_err; 838 goto out_err;
839out: 839out:
840 mutex_unlock(&dir->i_mutex); 840 inode_unlock(dir);
841 return dentry; 841 return dentry;
842out_err: 842out_err:
843 dentry = ERR_PTR(err); 843 dentry = ERR_PTR(err);
@@ -865,9 +865,9 @@ rpc_unlink(struct dentry *dentry)
865 865
866 parent = dget_parent(dentry); 866 parent = dget_parent(dentry);
867 dir = d_inode(parent); 867 dir = d_inode(parent);
868 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); 868 inode_lock_nested(dir, I_MUTEX_PARENT);
869 error = __rpc_rmpipe(dir, dentry); 869 error = __rpc_rmpipe(dir, dentry);
870 mutex_unlock(&dir->i_mutex); 870 inode_unlock(dir);
871 dput(parent); 871 dput(parent);
872 return error; 872 return error;
873} 873}
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2e98f4a243e5..37edea6fa92d 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1425,3 +1425,4 @@ void xprt_put(struct rpc_xprt *xprt)
1425 if (atomic_dec_and_test(&xprt->count)) 1425 if (atomic_dec_and_test(&xprt->count))
1426 xprt_destroy(xprt); 1426 xprt_destroy(xprt);
1427} 1427}
1428EXPORT_SYMBOL_GPL(xprt_put);
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 33f99d3004f2..dc9f3b513a05 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
2 2
3rpcrdma-y := transport.o rpc_rdma.o verbs.o \ 3rpcrdma-y := transport.o rpc_rdma.o verbs.o \
4 fmr_ops.o frwr_ops.o physical_ops.o \ 4 fmr_ops.o frwr_ops.o physical_ops.o \
5 svc_rdma.o svc_rdma_transport.o \ 5 svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
6 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ 6 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
7 module.o 7 module.o
8rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o 8rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c6836844bd0e..e16567389e28 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -190,12 +190,11 @@ static int
190frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, 190frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
191 struct rpcrdma_create_data_internal *cdata) 191 struct rpcrdma_create_data_internal *cdata)
192{ 192{
193 struct ib_device_attr *devattr = &ia->ri_devattr;
194 int depth, delta; 193 int depth, delta;
195 194
196 ia->ri_max_frmr_depth = 195 ia->ri_max_frmr_depth =
197 min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, 196 min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
198 devattr->max_fast_reg_page_list_len); 197 ia->ri_device->attrs.max_fast_reg_page_list_len);
199 dprintk("RPC: %s: device's max FR page list len = %u\n", 198 dprintk("RPC: %s: device's max FR page list len = %u\n",
200 __func__, ia->ri_max_frmr_depth); 199 __func__, ia->ri_max_frmr_depth);
201 200
@@ -222,8 +221,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
222 } 221 }
223 222
224 ep->rep_attr.cap.max_send_wr *= depth; 223 ep->rep_attr.cap.max_send_wr *= depth;
225 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { 224 if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) {
226 cdata->max_requests = devattr->max_qp_wr / depth; 225 cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth;
227 if (!cdata->max_requests) 226 if (!cdata->max_requests)
228 return -EINVAL; 227 return -EINVAL;
229 ep->rep_attr.cap.max_send_wr = cdata->max_requests * 228 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 1b7051bdbdc8..c846ca9f1eba 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -55,6 +55,7 @@ unsigned int svcrdma_ord = RPCRDMA_ORD;
55static unsigned int min_ord = 1; 55static unsigned int min_ord = 1;
56static unsigned int max_ord = 4096; 56static unsigned int max_ord = 4096;
57unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; 57unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
58unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
58static unsigned int min_max_requests = 4; 59static unsigned int min_max_requests = 4;
59static unsigned int max_max_requests = 16384; 60static unsigned int max_max_requests = 16384;
60unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; 61unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
@@ -71,10 +72,6 @@ atomic_t rdma_stat_rq_prod;
71atomic_t rdma_stat_sq_poll; 72atomic_t rdma_stat_sq_poll;
72atomic_t rdma_stat_sq_prod; 73atomic_t rdma_stat_sq_prod;
73 74
74/* Temporary NFS request map and context caches */
75struct kmem_cache *svc_rdma_map_cachep;
76struct kmem_cache *svc_rdma_ctxt_cachep;
77
78struct workqueue_struct *svc_rdma_wq; 75struct workqueue_struct *svc_rdma_wq;
79 76
80/* 77/*
@@ -243,17 +240,16 @@ void svc_rdma_cleanup(void)
243 svc_unreg_xprt_class(&svc_rdma_bc_class); 240 svc_unreg_xprt_class(&svc_rdma_bc_class);
244#endif 241#endif
245 svc_unreg_xprt_class(&svc_rdma_class); 242 svc_unreg_xprt_class(&svc_rdma_class);
246 kmem_cache_destroy(svc_rdma_map_cachep);
247 kmem_cache_destroy(svc_rdma_ctxt_cachep);
248} 243}
249 244
250int svc_rdma_init(void) 245int svc_rdma_init(void)
251{ 246{
252 dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); 247 dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
253 dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); 248 dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
254 dprintk("\tmax_requests : %d\n", svcrdma_max_requests); 249 dprintk("\tmax_requests : %u\n", svcrdma_max_requests);
255 dprintk("\tsq_depth : %d\n", 250 dprintk("\tsq_depth : %u\n",
256 svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); 251 svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
252 dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
257 dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); 253 dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
258 254
259 svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); 255 svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
@@ -264,39 +260,10 @@ int svc_rdma_init(void)
264 svcrdma_table_header = 260 svcrdma_table_header =
265 register_sysctl_table(svcrdma_root_table); 261 register_sysctl_table(svcrdma_root_table);
266 262
267 /* Create the temporary map cache */
268 svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
269 sizeof(struct svc_rdma_req_map),
270 0,
271 SLAB_HWCACHE_ALIGN,
272 NULL);
273 if (!svc_rdma_map_cachep) {
274 printk(KERN_INFO "Could not allocate map cache.\n");
275 goto err0;
276 }
277
278 /* Create the temporary context cache */
279 svc_rdma_ctxt_cachep =
280 kmem_cache_create("svc_rdma_ctxt_cache",
281 sizeof(struct svc_rdma_op_ctxt),
282 0,
283 SLAB_HWCACHE_ALIGN,
284 NULL);
285 if (!svc_rdma_ctxt_cachep) {
286 printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
287 goto err1;
288 }
289
290 /* Register RDMA with the SVC transport switch */ 263 /* Register RDMA with the SVC transport switch */
291 svc_reg_xprt_class(&svc_rdma_class); 264 svc_reg_xprt_class(&svc_rdma_class);
292#if defined(CONFIG_SUNRPC_BACKCHANNEL) 265#if defined(CONFIG_SUNRPC_BACKCHANNEL)
293 svc_reg_xprt_class(&svc_rdma_bc_class); 266 svc_reg_xprt_class(&svc_rdma_bc_class);
294#endif 267#endif
295 return 0; 268 return 0;
296 err1:
297 kmem_cache_destroy(svc_rdma_map_cachep);
298 err0:
299 unregister_sysctl_table(svcrdma_table_header);
300 destroy_workqueue(svc_rdma_wq);
301 return -ENOMEM;
302} 269}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
new file mode 100644
index 000000000000..65a7c232a345
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -0,0 +1,371 @@
1/*
2 * Copyright (c) 2015 Oracle. All rights reserved.
3 *
4 * Support for backward direction RPCs on RPC/RDMA (server-side).
5 */
6
7#include <linux/sunrpc/svc_rdma.h>
8#include "xprt_rdma.h"
9
10#define RPCDBG_FACILITY RPCDBG_SVCXPRT
11
12#undef SVCRDMA_BACKCHANNEL_DEBUG
13
14int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
15 struct xdr_buf *rcvbuf)
16{
17 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
18 struct kvec *dst, *src = &rcvbuf->head[0];
19 struct rpc_rqst *req;
20 unsigned long cwnd;
21 u32 credits;
22 size_t len;
23 __be32 xid;
24 __be32 *p;
25 int ret;
26
27 p = (__be32 *)src->iov_base;
28 len = src->iov_len;
29 xid = rmsgp->rm_xid;
30
31#ifdef SVCRDMA_BACKCHANNEL_DEBUG
32 pr_info("%s: xid=%08x, length=%zu\n",
33 __func__, be32_to_cpu(xid), len);
34 pr_info("%s: RPC/RDMA: %*ph\n",
35 __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp);
36 pr_info("%s: RPC: %*ph\n",
37 __func__, (int)len, p);
38#endif
39
40 ret = -EAGAIN;
41 if (src->iov_len < 24)
42 goto out_shortreply;
43
44 spin_lock_bh(&xprt->transport_lock);
45 req = xprt_lookup_rqst(xprt, xid);
46 if (!req)
47 goto out_notfound;
48
49 dst = &req->rq_private_buf.head[0];
50 memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
51 if (dst->iov_len < len)
52 goto out_unlock;
53 memcpy(dst->iov_base, p, len);
54
55 credits = be32_to_cpu(rmsgp->rm_credit);
56 if (credits == 0)
57 credits = 1; /* don't deadlock */
58 else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
59 credits = r_xprt->rx_buf.rb_bc_max_requests;
60
61 cwnd = xprt->cwnd;
62 xprt->cwnd = credits << RPC_CWNDSHIFT;
63 if (xprt->cwnd > cwnd)
64 xprt_release_rqst_cong(req->rq_task);
65
66 ret = 0;
67 xprt_complete_rqst(req->rq_task, rcvbuf->len);
68 rcvbuf->len = 0;
69
70out_unlock:
71 spin_unlock_bh(&xprt->transport_lock);
72out:
73 return ret;
74
75out_shortreply:
76 dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n",
77 xprt, src->iov_len);
78 goto out;
79
80out_notfound:
81 dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n",
82 xprt, be32_to_cpu(xid));
83
84 goto out_unlock;
85}
86
87/* Send a backwards direction RPC call.
88 *
89 * Caller holds the connection's mutex and has already marshaled
90 * the RPC/RDMA request.
91 *
92 * This is similar to svc_rdma_reply, but takes an rpc_rqst
93 * instead, does not support chunks, and avoids blocking memory
94 * allocation.
95 *
96 * XXX: There is still an opportunity to block in svc_rdma_send()
97 * if there are no SQ entries to post the Send. This may occur if
98 * the adapter has a small maximum SQ depth.
99 */
100static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
101 struct rpc_rqst *rqst)
102{
103 struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
104 struct svc_rdma_op_ctxt *ctxt;
105 struct svc_rdma_req_map *vec;
106 struct ib_send_wr send_wr;
107 int ret;
108
109 vec = svc_rdma_get_req_map(rdma);
110 ret = svc_rdma_map_xdr(rdma, sndbuf, vec);
111 if (ret)
112 goto out_err;
113
114 /* Post a recv buffer to handle the reply for this request. */
115 ret = svc_rdma_post_recv(rdma, GFP_NOIO);
116 if (ret) {
117 pr_err("svcrdma: Failed to post bc receive buffer, err=%d.\n",
118 ret);
119 pr_err("svcrdma: closing transport %p.\n", rdma);
120 set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
121 ret = -ENOTCONN;
122 goto out_err;
123 }
124
125 ctxt = svc_rdma_get_context(rdma);
126 ctxt->pages[0] = virt_to_page(rqst->rq_buffer);
127 ctxt->count = 1;
128
129 ctxt->wr_op = IB_WR_SEND;
130 ctxt->direction = DMA_TO_DEVICE;
131 ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
132 ctxt->sge[0].length = sndbuf->len;
133 ctxt->sge[0].addr =
134 ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0,
135 sndbuf->len, DMA_TO_DEVICE);
136 if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) {
137 ret = -EIO;
138 goto out_unmap;
139 }
140 atomic_inc(&rdma->sc_dma_used);
141
142 memset(&send_wr, 0, sizeof(send_wr));
143 send_wr.wr_id = (unsigned long)ctxt;
144 send_wr.sg_list = ctxt->sge;
145 send_wr.num_sge = 1;
146 send_wr.opcode = IB_WR_SEND;
147 send_wr.send_flags = IB_SEND_SIGNALED;
148
149 ret = svc_rdma_send(rdma, &send_wr);
150 if (ret) {
151 ret = -EIO;
152 goto out_unmap;
153 }
154
155out_err:
156 svc_rdma_put_req_map(rdma, vec);
157 dprintk("svcrdma: %s returns %d\n", __func__, ret);
158 return ret;
159
160out_unmap:
161 svc_rdma_unmap_dma(ctxt);
162 svc_rdma_put_context(ctxt, 1);
163 goto out_err;
164}
165
166/* Server-side transport endpoint wants a whole page for its send
167 * buffer. The client RPC code constructs the RPC header in this
168 * buffer before it invokes ->send_request.
169 *
170 * Returns NULL if there was a temporary allocation failure.
171 */
172static void *
173xprt_rdma_bc_allocate(struct rpc_task *task, size_t size)
174{
175 struct rpc_rqst *rqst = task->tk_rqstp;
176 struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
177 struct svcxprt_rdma *rdma;
178 struct page *page;
179
180 rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
181
182 /* Prevent an infinite loop: try to make this case work */
183 if (size > PAGE_SIZE)
184 WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
185 size);
186
187 page = alloc_page(RPCRDMA_DEF_GFP);
188 if (!page)
189 return NULL;
190
191 return page_address(page);
192}
193
194static void
195xprt_rdma_bc_free(void *buffer)
196{
197 /* No-op: ctxt and page have already been freed. */
198}
199
200static int
201rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
202{
203 struct rpc_xprt *xprt = rqst->rq_xprt;
204 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
205 struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer;
206 int rc;
207
208 /* Space in the send buffer for an RPC/RDMA header is reserved
209 * via xprt->tsh_size.
210 */
211 headerp->rm_xid = rqst->rq_xid;
212 headerp->rm_vers = rpcrdma_version;
213 headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
214 headerp->rm_type = rdma_msg;
215 headerp->rm_body.rm_chunks[0] = xdr_zero;
216 headerp->rm_body.rm_chunks[1] = xdr_zero;
217 headerp->rm_body.rm_chunks[2] = xdr_zero;
218
219#ifdef SVCRDMA_BACKCHANNEL_DEBUG
220 pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
221#endif
222
223 rc = svc_rdma_bc_sendto(rdma, rqst);
224 if (rc)
225 goto drop_connection;
226 return rc;
227
228drop_connection:
229 dprintk("svcrdma: failed to send bc call\n");
230 xprt_disconnect_done(xprt);
231 return -ENOTCONN;
232}
233
234/* Send an RPC call on the passive end of a transport
235 * connection.
236 */
237static int
238xprt_rdma_bc_send_request(struct rpc_task *task)
239{
240 struct rpc_rqst *rqst = task->tk_rqstp;
241 struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
242 struct svcxprt_rdma *rdma;
243 int ret;
244
245 dprintk("svcrdma: sending bc call with xid: %08x\n",
246 be32_to_cpu(rqst->rq_xid));
247
248 if (!mutex_trylock(&sxprt->xpt_mutex)) {
249 rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL);
250 if (!mutex_trylock(&sxprt->xpt_mutex))
251 return -EAGAIN;
252 rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task);
253 }
254
255 ret = -ENOTCONN;
256 rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
257 if (!test_bit(XPT_DEAD, &sxprt->xpt_flags))
258 ret = rpcrdma_bc_send_request(rdma, rqst);
259
260 mutex_unlock(&sxprt->xpt_mutex);
261
262 if (ret < 0)
263 return ret;
264 return 0;
265}
266
267static void
268xprt_rdma_bc_close(struct rpc_xprt *xprt)
269{
270 dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
271}
272
273static void
274xprt_rdma_bc_put(struct rpc_xprt *xprt)
275{
276 dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
277
278 xprt_free(xprt);
279 module_put(THIS_MODULE);
280}
281
282static struct rpc_xprt_ops xprt_rdma_bc_procs = {
283 .reserve_xprt = xprt_reserve_xprt_cong,
284 .release_xprt = xprt_release_xprt_cong,
285 .alloc_slot = xprt_alloc_slot,
286 .release_request = xprt_release_rqst_cong,
287 .buf_alloc = xprt_rdma_bc_allocate,
288 .buf_free = xprt_rdma_bc_free,
289 .send_request = xprt_rdma_bc_send_request,
290 .set_retrans_timeout = xprt_set_retrans_timeout_def,
291 .close = xprt_rdma_bc_close,
292 .destroy = xprt_rdma_bc_put,
293 .print_stats = xprt_rdma_print_stats
294};
295
296static const struct rpc_timeout xprt_rdma_bc_timeout = {
297 .to_initval = 60 * HZ,
298 .to_maxval = 60 * HZ,
299};
300
301/* It shouldn't matter if the number of backchannel session slots
302 * doesn't match the number of RPC/RDMA credits. That just means
303 * one or the other will have extra slots that aren't used.
304 */
305static struct rpc_xprt *
306xprt_setup_rdma_bc(struct xprt_create *args)
307{
308 struct rpc_xprt *xprt;
309 struct rpcrdma_xprt *new_xprt;
310
311 if (args->addrlen > sizeof(xprt->addr)) {
312 dprintk("RPC: %s: address too large\n", __func__);
313 return ERR_PTR(-EBADF);
314 }
315
316 xprt = xprt_alloc(args->net, sizeof(*new_xprt),
317 RPCRDMA_MAX_BC_REQUESTS,
318 RPCRDMA_MAX_BC_REQUESTS);
319 if (!xprt) {
320 dprintk("RPC: %s: couldn't allocate rpc_xprt\n",
321 __func__);
322 return ERR_PTR(-ENOMEM);
323 }
324
325 xprt->timeout = &xprt_rdma_bc_timeout;
326 xprt_set_bound(xprt);
327 xprt_set_connected(xprt);
328 xprt->bind_timeout = RPCRDMA_BIND_TO;
329 xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
330 xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
331
332 xprt->prot = XPRT_TRANSPORT_BC_RDMA;
333 xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32);
334 xprt->ops = &xprt_rdma_bc_procs;
335
336 memcpy(&xprt->addr, args->dstaddr, args->addrlen);
337 xprt->addrlen = args->addrlen;
338 xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr);
339 xprt->resvport = 0;
340
341 xprt->max_payload = xprt_rdma_max_inline_read;
342
343 new_xprt = rpcx_to_rdmax(xprt);
344 new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs;
345
346 xprt_get(xprt);
347 args->bc_xprt->xpt_bc_xprt = xprt;
348 xprt->bc_xprt = args->bc_xprt;
349
350 if (!try_module_get(THIS_MODULE))
351 goto out_fail;
352
353 /* Final put for backchannel xprt is in __svc_rdma_free */
354 xprt_get(xprt);
355 return xprt;
356
357out_fail:
358 xprt_rdma_free_addresses(xprt);
359 args->bc_xprt->xpt_bc_xprt = NULL;
360 xprt_put(xprt);
361 xprt_free(xprt);
362 return ERR_PTR(-EINVAL);
363}
364
365struct xprt_class xprt_rdma_bc = {
366 .list = LIST_HEAD_INIT(xprt_rdma_bc.list),
367 .name = "rdma backchannel",
368 .owner = THIS_MODULE,
369 .ident = XPRT_TRANSPORT_BC_RDMA,
370 .setup = xprt_setup_rdma_bc,
371};
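
The svc_rdma_handle_bc_reply() path in the new file above clamps the credit count advertised in the RPC/RDMA header to the range [1, rb_bc_max_requests] before scaling it into the RPC congestion window. A minimal sketch of that arithmetic, not part of the patch; the shift value below is only a stand-in for the kernel's RPC_CWNDSHIFT:

#include <stdio.h>

#define FAKE_CWNDSHIFT 8U   /* stands in for the kernel's RPC_CWNDSHIFT */

/* Clamp the advertised credits, then convert them to a congestion window. */
static unsigned long credits_to_cwnd(unsigned int credits,
                                     unsigned int bc_max_requests)
{
        if (credits == 0)
                credits = 1;                    /* never deadlock the backchannel */
        else if (credits > bc_max_requests)
                credits = bc_max_requests;      /* don't exceed our own slot table */

        return (unsigned long)credits << FAKE_CWNDSHIFT;
}

int main(void)
{
        printf("cwnd for 0 credits:  %lu\n", credits_to_cwnd(0, 16));
        printf("cwnd for 64 credits: %lu\n", credits_to_cwnd(64, 16));
        return 0;
}
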
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index ff4f01e527ec..c8b8a8b4181e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -144,6 +144,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
144 144
145 head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; 145 head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
146 head->arg.page_len += len; 146 head->arg.page_len += len;
147
147 head->arg.len += len; 148 head->arg.len += len;
148 if (!pg_off) 149 if (!pg_off)
149 head->count++; 150 head->count++;
@@ -160,8 +161,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
160 goto err; 161 goto err;
161 atomic_inc(&xprt->sc_dma_used); 162 atomic_inc(&xprt->sc_dma_used);
162 163
163 /* The lkey here is either a local dma lkey or a dma_mr lkey */ 164 ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey;
164 ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
165 ctxt->sge[pno].length = len; 165 ctxt->sge[pno].length = len;
166 ctxt->count++; 166 ctxt->count++;
167 167
@@ -567,6 +567,38 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
567 return ret; 567 return ret;
568} 568}
569 569
570/* By convention, backchannel calls arrive via rdma_msg type
571 * messages, and never populate the chunk lists. This makes
572 * the RPC/RDMA header small and fixed in size, so it is
573 * straightforward to check the RPC header's direction field.
574 */
575static bool
576svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp)
577{
578 __be32 *p = (__be32 *)rmsgp;
579
580 if (!xprt->xpt_bc_xprt)
581 return false;
582
583 if (rmsgp->rm_type != rdma_msg)
584 return false;
585 if (rmsgp->rm_body.rm_chunks[0] != xdr_zero)
586 return false;
587 if (rmsgp->rm_body.rm_chunks[1] != xdr_zero)
588 return false;
589 if (rmsgp->rm_body.rm_chunks[2] != xdr_zero)
590 return false;
591
592 /* sanity */
593 if (p[7] != rmsgp->rm_xid)
594 return false;
595 /* call direction */
596 if (p[8] == cpu_to_be32(RPC_CALL))
597 return false;
598
599 return true;
600}
601
570/* 602/*
571 * Set up the rqstp thread context to point to the RQ buffer. If 603 * Set up the rqstp thread context to point to the RQ buffer. If
572 * necessary, pull additional data from the client with an RDMA_READ 604 * necessary, pull additional data from the client with an RDMA_READ
@@ -632,6 +664,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
632 goto close_out; 664 goto close_out;
633 } 665 }
634 666
667 if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) {
668 ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp,
669 &rqstp->rq_arg);
670 svc_rdma_put_context(ctxt, 0);
671 if (ret)
672 goto repost;
673 return ret;
674 }
675
635 /* Read read-list data. */ 676 /* Read read-list data. */
636 ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); 677 ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt);
637 if (ret > 0) { 678 if (ret > 0) {
@@ -668,4 +709,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
668 set_bit(XPT_CLOSE, &xprt->xpt_flags); 709 set_bit(XPT_CLOSE, &xprt->xpt_flags);
669defer: 710defer:
670 return 0; 711 return 0;
712
713repost:
714 ret = svc_rdma_post_recv(rdma_xprt, GFP_KERNEL);
715 if (ret) {
716 pr_err("svcrdma: could not post a receive buffer, err=%d.\n",
717 ret);
718 pr_err("svcrdma: closing transport %p.\n", rdma_xprt);
719 set_bit(XPT_CLOSE, &rdma_xprt->sc_xprt.xpt_flags);
720 ret = -ENOTCONN;
721 }
722 return ret;
671} 723}
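
svc_rdma_is_backchannel_reply() above leans on the fixed layout of a chunk-less RPC/RDMA message: words 0-3 carry the xid, version, credits and the rdma_msg type, words 4-6 are the three empty chunk lists, and the embedded RPC message begins at word 7, so its xid sits at p[7] and its direction field at p[8]. A minimal userspace sketch of the same checks, not part of the patch; the wire constants here are assumptions made for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>   /* htonl() for big-endian wire words */

#define FAKE_RDMA_MSG  0U    /* assumed rdma_msg procedure type */
#define FAKE_XDR_ZERO  0U    /* an all-zero XDR word */
#define FAKE_RPC_CALL  0U    /* assumed ONC RPC msg_type CALL */

/* hdr points at the received RPC/RDMA header, words still in wire order. */
static bool looks_like_bc_reply(const uint32_t *hdr)
{
        if (hdr[3] != htonl(FAKE_RDMA_MSG))
                return false;                   /* must be a plain rdma_msg */
        if (hdr[4] != FAKE_XDR_ZERO ||          /* read list */
            hdr[5] != FAKE_XDR_ZERO ||          /* write list */
            hdr[6] != FAKE_XDR_ZERO)            /* reply chunk */
                return false;
        if (hdr[7] != hdr[0])
                return false;                   /* RPC xid must match RPC/RDMA xid */
        if (hdr[8] == htonl(FAKE_RPC_CALL))
                return false;                   /* forward direction: not a reply */
        return true;
}

int main(void)
{
        uint32_t hdr[9] = { 0 };

        hdr[0] = htonl(0x1234);                 /* RPC/RDMA xid */
        hdr[3] = htonl(FAKE_RDMA_MSG);
        hdr[7] = htonl(0x1234);                 /* embedded RPC xid matches */
        hdr[8] = htonl(1);                      /* msg_type REPLY */
        return looks_like_bc_reply(hdr) ? 0 : 1;
}
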
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 969a1ab75fc3..df57f3ce6cd2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -50,9 +50,9 @@
50 50
51#define RPCDBG_FACILITY RPCDBG_SVCXPRT 51#define RPCDBG_FACILITY RPCDBG_SVCXPRT
52 52
53static int map_xdr(struct svcxprt_rdma *xprt, 53int svc_rdma_map_xdr(struct svcxprt_rdma *xprt,
54 struct xdr_buf *xdr, 54 struct xdr_buf *xdr,
55 struct svc_rdma_req_map *vec) 55 struct svc_rdma_req_map *vec)
56{ 56{
57 int sge_no; 57 int sge_no;
58 u32 sge_bytes; 58 u32 sge_bytes;
@@ -62,7 +62,7 @@ static int map_xdr(struct svcxprt_rdma *xprt,
62 62
63 if (xdr->len != 63 if (xdr->len !=
64 (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) { 64 (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
65 pr_err("svcrdma: map_xdr: XDR buffer length error\n"); 65 pr_err("svcrdma: %s: XDR buffer length error\n", __func__);
66 return -EIO; 66 return -EIO;
67 } 67 }
68 68
@@ -97,9 +97,9 @@ static int map_xdr(struct svcxprt_rdma *xprt,
97 sge_no++; 97 sge_no++;
98 } 98 }
99 99
100 dprintk("svcrdma: map_xdr: sge_no %d page_no %d " 100 dprintk("svcrdma: %s: sge_no %d page_no %d "
101 "page_base %u page_len %u head_len %zu tail_len %zu\n", 101 "page_base %u page_len %u head_len %zu tail_len %zu\n",
102 sge_no, page_no, xdr->page_base, xdr->page_len, 102 __func__, sge_no, page_no, xdr->page_base, xdr->page_len,
103 xdr->head[0].iov_len, xdr->tail[0].iov_len); 103 xdr->head[0].iov_len, xdr->tail[0].iov_len);
104 104
105 vec->count = sge_no; 105 vec->count = sge_no;
@@ -265,7 +265,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
265 sge[sge_no].addr)) 265 sge[sge_no].addr))
266 goto err; 266 goto err;
267 atomic_inc(&xprt->sc_dma_used); 267 atomic_inc(&xprt->sc_dma_used);
268 sge[sge_no].lkey = xprt->sc_dma_lkey; 268 sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
269 ctxt->count++; 269 ctxt->count++;
270 sge_off = 0; 270 sge_off = 0;
271 sge_no++; 271 sge_no++;
@@ -465,7 +465,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
465 int ret; 465 int ret;
466 466
467 /* Post a recv buffer to handle another request. */ 467 /* Post a recv buffer to handle another request. */
468 ret = svc_rdma_post_recv(rdma); 468 ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
469 if (ret) { 469 if (ret) {
470 printk(KERN_INFO 470 printk(KERN_INFO
471 "svcrdma: could not post a receive buffer, err=%d." 471 "svcrdma: could not post a receive buffer, err=%d."
@@ -480,7 +480,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
480 ctxt->count = 1; 480 ctxt->count = 1;
481 481
482 /* Prepare the SGE for the RPCRDMA Header */ 482 /* Prepare the SGE for the RPCRDMA Header */
483 ctxt->sge[0].lkey = rdma->sc_dma_lkey; 483 ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
484 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); 484 ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
485 ctxt->sge[0].addr = 485 ctxt->sge[0].addr =
486 ib_dma_map_page(rdma->sc_cm_id->device, page, 0, 486 ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
@@ -504,7 +504,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
504 ctxt->sge[sge_no].addr)) 504 ctxt->sge[sge_no].addr))
505 goto err; 505 goto err;
506 atomic_inc(&rdma->sc_dma_used); 506 atomic_inc(&rdma->sc_dma_used);
507 ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; 507 ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
508 ctxt->sge[sge_no].length = sge_bytes; 508 ctxt->sge[sge_no].length = sge_bytes;
509 } 509 }
510 if (byte_count != 0) { 510 if (byte_count != 0) {
@@ -591,14 +591,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
591 /* Build an req vec for the XDR */ 591 /* Build an req vec for the XDR */
592 ctxt = svc_rdma_get_context(rdma); 592 ctxt = svc_rdma_get_context(rdma);
593 ctxt->direction = DMA_TO_DEVICE; 593 ctxt->direction = DMA_TO_DEVICE;
594 vec = svc_rdma_get_req_map(); 594 vec = svc_rdma_get_req_map(rdma);
595 ret = map_xdr(rdma, &rqstp->rq_res, vec); 595 ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec);
596 if (ret) 596 if (ret)
597 goto err0; 597 goto err0;
598 inline_bytes = rqstp->rq_res.len; 598 inline_bytes = rqstp->rq_res.len;
599 599
600 /* Create the RDMA response header */ 600 /* Create the RDMA response header */
601 res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); 601 ret = -ENOMEM;
602 res_page = alloc_page(GFP_KERNEL);
603 if (!res_page)
604 goto err0;
602 rdma_resp = page_address(res_page); 605 rdma_resp = page_address(res_page);
603 reply_ary = svc_rdma_get_reply_array(rdma_argp); 606 reply_ary = svc_rdma_get_reply_array(rdma_argp);
604 if (reply_ary) 607 if (reply_ary)
@@ -630,14 +633,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
630 633
631 ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, 634 ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
632 inline_bytes); 635 inline_bytes);
633 svc_rdma_put_req_map(vec); 636 svc_rdma_put_req_map(rdma, vec);
634 dprintk("svcrdma: send_reply returns %d\n", ret); 637 dprintk("svcrdma: send_reply returns %d\n", ret);
635 return ret; 638 return ret;
636 639
637 err1: 640 err1:
638 put_page(res_page); 641 put_page(res_page);
639 err0: 642 err0:
640 svc_rdma_put_req_map(vec); 643 svc_rdma_put_req_map(rdma, vec);
641 svc_rdma_put_context(ctxt, 0); 644 svc_rdma_put_context(ctxt, 0);
642 return ret; 645 return ret;
643} 646}
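
The sendto hunks above stop relying on __GFP_NOFAIL: the reply-header page is now allocated with plain GFP_KERNEL, and a failed allocation takes the err0 unwind path instead of blocking indefinitely. A minimal sketch of that shape, not part of the patch, with hypothetical names:

#include <stdlib.h>

/* Hypothetical reply-building step: allocate a header buffer and bail out
 * cleanly on failure rather than insisting the allocation cannot fail. */
static int build_reply(size_t hdr_len)
{
        void *hdr = malloc(hdr_len);    /* stands in for alloc_page(GFP_KERNEL) */
        int ret = -1;                   /* stands in for -ENOMEM */

        if (!hdr)
                goto err;               /* caller releases its other resources */

        /* ... encode the reply header and post the Send here ... */
        ret = 0;
        free(hdr);
err:
        return ret;
}

int main(void)
{
        return build_reply(4096) ? EXIT_FAILURE : EXIT_SUCCESS;
}
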
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index b348b4adef29..5763825d09bf 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -153,18 +153,76 @@ static void svc_rdma_bc_free(struct svc_xprt *xprt)
153} 153}
154#endif /* CONFIG_SUNRPC_BACKCHANNEL */ 154#endif /* CONFIG_SUNRPC_BACKCHANNEL */
155 155
156struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) 156static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
157 gfp_t flags)
157{ 158{
158 struct svc_rdma_op_ctxt *ctxt; 159 struct svc_rdma_op_ctxt *ctxt;
159 160
160 ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, 161 ctxt = kmalloc(sizeof(*ctxt), flags);
161 GFP_KERNEL | __GFP_NOFAIL); 162 if (ctxt) {
162 ctxt->xprt = xprt; 163 ctxt->xprt = xprt;
163 INIT_LIST_HEAD(&ctxt->dto_q); 164 INIT_LIST_HEAD(&ctxt->free);
165 INIT_LIST_HEAD(&ctxt->dto_q);
166 }
167 return ctxt;
168}
169
170static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
171{
172 unsigned int i;
173
174 /* Each RPC/RDMA credit can consume a number of send
175 * and receive WQEs. One ctxt is allocated for each.
176 */
177 i = xprt->sc_sq_depth + xprt->sc_rq_depth;
178
179 while (i--) {
180 struct svc_rdma_op_ctxt *ctxt;
181
182 ctxt = alloc_ctxt(xprt, GFP_KERNEL);
183 if (!ctxt) {
184 dprintk("svcrdma: No memory for RDMA ctxt\n");
185 return false;
186 }
187 list_add(&ctxt->free, &xprt->sc_ctxts);
188 }
189 return true;
190}
191
192struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
193{
194 struct svc_rdma_op_ctxt *ctxt = NULL;
195
196 spin_lock_bh(&xprt->sc_ctxt_lock);
197 xprt->sc_ctxt_used++;
198 if (list_empty(&xprt->sc_ctxts))
199 goto out_empty;
200
201 ctxt = list_first_entry(&xprt->sc_ctxts,
202 struct svc_rdma_op_ctxt, free);
203 list_del_init(&ctxt->free);
204 spin_unlock_bh(&xprt->sc_ctxt_lock);
205
206out:
164 ctxt->count = 0; 207 ctxt->count = 0;
165 ctxt->frmr = NULL; 208 ctxt->frmr = NULL;
166 atomic_inc(&xprt->sc_ctxt_used);
167 return ctxt; 209 return ctxt;
210
211out_empty:
212 /* Either pre-allocation missed the mark, or send
213 * queue accounting is broken.
214 */
215 spin_unlock_bh(&xprt->sc_ctxt_lock);
216
217 ctxt = alloc_ctxt(xprt, GFP_NOIO);
218 if (ctxt)
219 goto out;
220
221 spin_lock_bh(&xprt->sc_ctxt_lock);
222 xprt->sc_ctxt_used--;
223 spin_unlock_bh(&xprt->sc_ctxt_lock);
224 WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
225 return NULL;
168} 226}
169 227
170void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) 228void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
@@ -174,11 +232,11 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
174 for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { 232 for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
175 /* 233 /*
176 * Unmap the DMA addr in the SGE if the lkey matches 234 * Unmap the DMA addr in the SGE if the lkey matches
177 * the sc_dma_lkey, otherwise, ignore it since it is 235 * the local_dma_lkey, otherwise, ignore it since it is
178 * an FRMR lkey and will be unmapped later when the 236 * an FRMR lkey and will be unmapped later when the
179 * last WR that uses it completes. 237 * last WR that uses it completes.
180 */ 238 */
181 if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { 239 if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) {
182 atomic_dec(&xprt->sc_dma_used); 240 atomic_dec(&xprt->sc_dma_used);
183 ib_dma_unmap_page(xprt->sc_cm_id->device, 241 ib_dma_unmap_page(xprt->sc_cm_id->device,
184 ctxt->sge[i].addr, 242 ctxt->sge[i].addr,
@@ -190,35 +248,108 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
190 248
191void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) 249void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
192{ 250{
193 struct svcxprt_rdma *xprt; 251 struct svcxprt_rdma *xprt = ctxt->xprt;
194 int i; 252 int i;
195 253
196 xprt = ctxt->xprt;
197 if (free_pages) 254 if (free_pages)
198 for (i = 0; i < ctxt->count; i++) 255 for (i = 0; i < ctxt->count; i++)
199 put_page(ctxt->pages[i]); 256 put_page(ctxt->pages[i]);
200 257
201 kmem_cache_free(svc_rdma_ctxt_cachep, ctxt); 258 spin_lock_bh(&xprt->sc_ctxt_lock);
202 atomic_dec(&xprt->sc_ctxt_used); 259 xprt->sc_ctxt_used--;
260 list_add(&ctxt->free, &xprt->sc_ctxts);
261 spin_unlock_bh(&xprt->sc_ctxt_lock);
203} 262}
204 263
205/* 264static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
206 * Temporary NFS req mappings are shared across all transport 265{
207 * instances. These are short lived and should be bounded by the number 266 while (!list_empty(&xprt->sc_ctxts)) {
208 * of concurrent server threads * depth of the SQ. 267 struct svc_rdma_op_ctxt *ctxt;
209 */ 268
210struct svc_rdma_req_map *svc_rdma_get_req_map(void) 269 ctxt = list_first_entry(&xprt->sc_ctxts,
270 struct svc_rdma_op_ctxt, free);
271 list_del(&ctxt->free);
272 kfree(ctxt);
273 }
274}
275
276static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
211{ 277{
212 struct svc_rdma_req_map *map; 278 struct svc_rdma_req_map *map;
213 map = kmem_cache_alloc(svc_rdma_map_cachep, 279
214 GFP_KERNEL | __GFP_NOFAIL); 280 map = kmalloc(sizeof(*map), flags);
281 if (map)
282 INIT_LIST_HEAD(&map->free);
283 return map;
284}
285
286static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
287{
288 unsigned int i;
289
290 /* One for each receive buffer on this connection. */
291 i = xprt->sc_max_requests;
292
293 while (i--) {
294 struct svc_rdma_req_map *map;
295
296 map = alloc_req_map(GFP_KERNEL);
297 if (!map) {
298 dprintk("svcrdma: No memory for request map\n");
299 return false;
300 }
301 list_add(&map->free, &xprt->sc_maps);
302 }
303 return true;
304}
305
306struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
307{
308 struct svc_rdma_req_map *map = NULL;
309
310 spin_lock(&xprt->sc_map_lock);
311 if (list_empty(&xprt->sc_maps))
312 goto out_empty;
313
314 map = list_first_entry(&xprt->sc_maps,
315 struct svc_rdma_req_map, free);
316 list_del_init(&map->free);
317 spin_unlock(&xprt->sc_map_lock);
318
319out:
215 map->count = 0; 320 map->count = 0;
216 return map; 321 return map;
322
323out_empty:
324 spin_unlock(&xprt->sc_map_lock);
325
326 /* Pre-allocation amount was incorrect */
327 map = alloc_req_map(GFP_NOIO);
328 if (map)
329 goto out;
330
331 WARN_ONCE(1, "svcrdma: empty request map list?\n");
332 return NULL;
333}
334
335void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
336 struct svc_rdma_req_map *map)
337{
338 spin_lock(&xprt->sc_map_lock);
339 list_add(&map->free, &xprt->sc_maps);
340 spin_unlock(&xprt->sc_map_lock);
217} 341}
218 342
219void svc_rdma_put_req_map(struct svc_rdma_req_map *map) 343static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
220{ 344{
221 kmem_cache_free(svc_rdma_map_cachep, map); 345 while (!list_empty(&xprt->sc_maps)) {
346 struct svc_rdma_req_map *map;
347
348 map = list_first_entry(&xprt->sc_maps,
349 struct svc_rdma_req_map, free);
350 list_del(&map->free);
351 kfree(map);
352 }
222} 353}
223 354
224/* ib_cq event handler */ 355/* ib_cq event handler */
@@ -386,46 +517,44 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
386static void process_context(struct svcxprt_rdma *xprt, 517static void process_context(struct svcxprt_rdma *xprt,
387 struct svc_rdma_op_ctxt *ctxt) 518 struct svc_rdma_op_ctxt *ctxt)
388{ 519{
520 struct svc_rdma_op_ctxt *read_hdr;
521 int free_pages = 0;
522
389 svc_rdma_unmap_dma(ctxt); 523 svc_rdma_unmap_dma(ctxt);
390 524
391 switch (ctxt->wr_op) { 525 switch (ctxt->wr_op) {
392 case IB_WR_SEND: 526 case IB_WR_SEND:
393 if (ctxt->frmr) 527 free_pages = 1;
394 pr_err("svcrdma: SEND: ctxt->frmr != NULL\n");
395 svc_rdma_put_context(ctxt, 1);
396 break; 528 break;
397 529
398 case IB_WR_RDMA_WRITE: 530 case IB_WR_RDMA_WRITE:
399 if (ctxt->frmr)
400 pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n");
401 svc_rdma_put_context(ctxt, 0);
402 break; 531 break;
403 532
404 case IB_WR_RDMA_READ: 533 case IB_WR_RDMA_READ:
405 case IB_WR_RDMA_READ_WITH_INV: 534 case IB_WR_RDMA_READ_WITH_INV:
406 svc_rdma_put_frmr(xprt, ctxt->frmr); 535 svc_rdma_put_frmr(xprt, ctxt->frmr);
407 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { 536
408 struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; 537 if (!test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags))
409 if (read_hdr) { 538 break;
410 spin_lock_bh(&xprt->sc_rq_dto_lock); 539
411 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); 540 read_hdr = ctxt->read_hdr;
412 list_add_tail(&read_hdr->dto_q,
413 &xprt->sc_read_complete_q);
414 spin_unlock_bh(&xprt->sc_rq_dto_lock);
415 } else {
416 pr_err("svcrdma: ctxt->read_hdr == NULL\n");
417 }
418 svc_xprt_enqueue(&xprt->sc_xprt);
419 }
420 svc_rdma_put_context(ctxt, 0); 541 svc_rdma_put_context(ctxt, 0);
421 break; 542
543 spin_lock_bh(&xprt->sc_rq_dto_lock);
544 set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
545 list_add_tail(&read_hdr->dto_q,
546 &xprt->sc_read_complete_q);
547 spin_unlock_bh(&xprt->sc_rq_dto_lock);
548 svc_xprt_enqueue(&xprt->sc_xprt);
549 return;
422 550
423 default: 551 default:
424 printk(KERN_ERR "svcrdma: unexpected completion type, " 552 dprintk("svcrdma: unexpected completion opcode=%d\n",
425 "opcode=%d\n", 553 ctxt->wr_op);
426 ctxt->wr_op);
427 break; 554 break;
428 } 555 }
556
557 svc_rdma_put_context(ctxt, free_pages);
429} 558}
430 559
431/* 560/*
@@ -523,19 +652,15 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
523 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); 652 INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
524 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); 653 INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
525 INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); 654 INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
655 INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
656 INIT_LIST_HEAD(&cma_xprt->sc_maps);
526 init_waitqueue_head(&cma_xprt->sc_send_wait); 657 init_waitqueue_head(&cma_xprt->sc_send_wait);
527 658
528 spin_lock_init(&cma_xprt->sc_lock); 659 spin_lock_init(&cma_xprt->sc_lock);
529 spin_lock_init(&cma_xprt->sc_rq_dto_lock); 660 spin_lock_init(&cma_xprt->sc_rq_dto_lock);
530 spin_lock_init(&cma_xprt->sc_frmr_q_lock); 661 spin_lock_init(&cma_xprt->sc_frmr_q_lock);
531 662 spin_lock_init(&cma_xprt->sc_ctxt_lock);
532 cma_xprt->sc_ord = svcrdma_ord; 663 spin_lock_init(&cma_xprt->sc_map_lock);
533
534 cma_xprt->sc_max_req_size = svcrdma_max_req_size;
535 cma_xprt->sc_max_requests = svcrdma_max_requests;
536 cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
537 atomic_set(&cma_xprt->sc_sq_count, 0);
538 atomic_set(&cma_xprt->sc_ctxt_used, 0);
539 664
540 if (listener) 665 if (listener)
541 set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); 666 set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
@@ -543,7 +668,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
543 return cma_xprt; 668 return cma_xprt;
544} 669}
545 670
546int svc_rdma_post_recv(struct svcxprt_rdma *xprt) 671int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
547{ 672{
548 struct ib_recv_wr recv_wr, *bad_recv_wr; 673 struct ib_recv_wr recv_wr, *bad_recv_wr;
549 struct svc_rdma_op_ctxt *ctxt; 674 struct svc_rdma_op_ctxt *ctxt;
@@ -561,7 +686,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
561 pr_err("svcrdma: Too many sges (%d)\n", sge_no); 686 pr_err("svcrdma: Too many sges (%d)\n", sge_no);
562 goto err_put_ctxt; 687 goto err_put_ctxt;
563 } 688 }
564 page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); 689 page = alloc_page(flags);
690 if (!page)
691 goto err_put_ctxt;
565 ctxt->pages[sge_no] = page; 692 ctxt->pages[sge_no] = page;
566 pa = ib_dma_map_page(xprt->sc_cm_id->device, 693 pa = ib_dma_map_page(xprt->sc_cm_id->device,
567 page, 0, PAGE_SIZE, 694 page, 0, PAGE_SIZE,
@@ -571,7 +698,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
571 atomic_inc(&xprt->sc_dma_used); 698 atomic_inc(&xprt->sc_dma_used);
572 ctxt->sge[sge_no].addr = pa; 699 ctxt->sge[sge_no].addr = pa;
573 ctxt->sge[sge_no].length = PAGE_SIZE; 700 ctxt->sge[sge_no].length = PAGE_SIZE;
574 ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; 701 ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
575 ctxt->count = sge_no + 1; 702 ctxt->count = sge_no + 1;
576 buflen += PAGE_SIZE; 703 buflen += PAGE_SIZE;
577 } 704 }
@@ -886,11 +1013,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
886 struct rdma_conn_param conn_param; 1013 struct rdma_conn_param conn_param;
887 struct ib_cq_init_attr cq_attr = {}; 1014 struct ib_cq_init_attr cq_attr = {};
888 struct ib_qp_init_attr qp_attr; 1015 struct ib_qp_init_attr qp_attr;
889 struct ib_device_attr devattr; 1016 struct ib_device *dev;
890 int uninitialized_var(dma_mr_acc); 1017 unsigned int i;
891 int need_dma_mr = 0; 1018 int ret = 0;
892 int ret;
893 int i;
894 1019
895 listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); 1020 listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
896 clear_bit(XPT_CONN, &xprt->xpt_flags); 1021 clear_bit(XPT_CONN, &xprt->xpt_flags);
@@ -910,37 +1035,42 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
910 dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", 1035 dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
911 newxprt, newxprt->sc_cm_id); 1036 newxprt, newxprt->sc_cm_id);
912 1037
913 ret = ib_query_device(newxprt->sc_cm_id->device, &devattr); 1038 dev = newxprt->sc_cm_id->device;
914 if (ret) {
915 dprintk("svcrdma: could not query device attributes on "
916 "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
917 goto errout;
918 }
919 1039
920 /* Qualify the transport resource defaults with the 1040 /* Qualify the transport resource defaults with the
921 * capabilities of this particular device */ 1041 * capabilities of this particular device */
922 newxprt->sc_max_sge = min((size_t)devattr.max_sge, 1042 newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge,
923 (size_t)RPCSVC_MAXPAGES); 1043 (size_t)RPCSVC_MAXPAGES);
924 newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd, 1044 newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd,
925 RPCSVC_MAXPAGES); 1045 RPCSVC_MAXPAGES);
926 newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, 1046 newxprt->sc_max_req_size = svcrdma_max_req_size;
927 (size_t)svcrdma_max_requests); 1047 newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
928 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; 1048 svcrdma_max_requests);
1049 newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
1050 svcrdma_max_bc_requests);
1051 newxprt->sc_rq_depth = newxprt->sc_max_requests +
1052 newxprt->sc_max_bc_requests;
1053 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
1054
1055 if (!svc_rdma_prealloc_ctxts(newxprt))
1056 goto errout;
1057 if (!svc_rdma_prealloc_maps(newxprt))
1058 goto errout;
929 1059
930 /* 1060 /*
931 * Limit ORD based on client limit, local device limit, and 1061 * Limit ORD based on client limit, local device limit, and
932 * configured svcrdma limit. 1062 * configured svcrdma limit.
933 */ 1063 */
934 newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord); 1064 newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
935 newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); 1065 newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
936 1066
937 newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); 1067 newxprt->sc_pd = ib_alloc_pd(dev);
938 if (IS_ERR(newxprt->sc_pd)) { 1068 if (IS_ERR(newxprt->sc_pd)) {
939 dprintk("svcrdma: error creating PD for connect request\n"); 1069 dprintk("svcrdma: error creating PD for connect request\n");
940 goto errout; 1070 goto errout;
941 } 1071 }
942 cq_attr.cqe = newxprt->sc_sq_depth; 1072 cq_attr.cqe = newxprt->sc_sq_depth;
943 newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, 1073 newxprt->sc_sq_cq = ib_create_cq(dev,
944 sq_comp_handler, 1074 sq_comp_handler,
945 cq_event_handler, 1075 cq_event_handler,
946 newxprt, 1076 newxprt,
@@ -949,8 +1079,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
949 dprintk("svcrdma: error creating SQ CQ for connect request\n"); 1079 dprintk("svcrdma: error creating SQ CQ for connect request\n");
950 goto errout; 1080 goto errout;
951 } 1081 }
952 cq_attr.cqe = newxprt->sc_max_requests; 1082 cq_attr.cqe = newxprt->sc_rq_depth;
953 newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, 1083 newxprt->sc_rq_cq = ib_create_cq(dev,
954 rq_comp_handler, 1084 rq_comp_handler,
955 cq_event_handler, 1085 cq_event_handler,
956 newxprt, 1086 newxprt,
@@ -964,7 +1094,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
964 qp_attr.event_handler = qp_event_handler; 1094 qp_attr.event_handler = qp_event_handler;
965 qp_attr.qp_context = &newxprt->sc_xprt; 1095 qp_attr.qp_context = &newxprt->sc_xprt;
966 qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; 1096 qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
967 qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; 1097 qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
968 qp_attr.cap.max_send_sge = newxprt->sc_max_sge; 1098 qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
969 qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; 1099 qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
970 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 1100 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -978,7 +1108,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
978 " cap.max_send_sge = %d\n" 1108 " cap.max_send_sge = %d\n"
979 " cap.max_recv_sge = %d\n", 1109 " cap.max_recv_sge = %d\n",
980 newxprt->sc_cm_id, newxprt->sc_pd, 1110 newxprt->sc_cm_id, newxprt->sc_pd,
981 newxprt->sc_cm_id->device, newxprt->sc_pd->device, 1111 dev, newxprt->sc_pd->device,
982 qp_attr.cap.max_send_wr, 1112 qp_attr.cap.max_send_wr,
983 qp_attr.cap.max_recv_wr, 1113 qp_attr.cap.max_recv_wr,
984 qp_attr.cap.max_send_sge, 1114 qp_attr.cap.max_send_sge,
@@ -1014,9 +1144,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1014 * of an RDMA_READ. IB does not. 1144 * of an RDMA_READ. IB does not.
1015 */ 1145 */
1016 newxprt->sc_reader = rdma_read_chunk_lcl; 1146 newxprt->sc_reader = rdma_read_chunk_lcl;
1017 if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { 1147 if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
1018 newxprt->sc_frmr_pg_list_len = 1148 newxprt->sc_frmr_pg_list_len =
1019 devattr.max_fast_reg_page_list_len; 1149 dev->attrs.max_fast_reg_page_list_len;
1020 newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; 1150 newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
1021 newxprt->sc_reader = rdma_read_chunk_frmr; 1151 newxprt->sc_reader = rdma_read_chunk_frmr;
1022 } 1152 }
@@ -1024,44 +1154,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
1024 /* 1154 /*
1025 * Determine if a DMA MR is required and if so, what privs are required 1155 * Determine if a DMA MR is required and if so, what privs are required
1026 */ 1156 */
1027 if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device, 1157 if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) &&
1028 newxprt->sc_cm_id->port_num) && 1158 !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num))
1029 !rdma_ib_or_roce(newxprt->sc_cm_id->device,
1030 newxprt->sc_cm_id->port_num))
1031 goto errout; 1159 goto errout;
1032 1160
1033 if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) || 1161 if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num))
1034 !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
1035 need_dma_mr = 1;
1036 dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
1037 if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
1038 newxprt->sc_cm_id->port_num) &&
1039 !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG))
1040 dma_mr_acc |= IB_ACCESS_REMOTE_WRITE;
1041 }
1042
1043 if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
1044 newxprt->sc_cm_id->port_num))
1045 newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; 1162 newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
1046 1163
1047 /* Create the DMA MR if needed, otherwise, use the DMA LKEY */
1048 if (need_dma_mr) {
1049 /* Register all of physical memory */
1050 newxprt->sc_phys_mr =
1051 ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
1052 if (IS_ERR(newxprt->sc_phys_mr)) {
1053 dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
1054 ret);
1055 goto errout;
1056 }
1057 newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
1058 } else
1059 newxprt->sc_dma_lkey =
1060 newxprt->sc_cm_id->device->local_dma_lkey;
1061
1062 /* Post receive buffers */ 1164 /* Post receive buffers */
1063 for (i = 0; i < newxprt->sc_max_requests; i++) { 1165 for (i = 0; i < newxprt->sc_rq_depth; i++) {
1064 ret = svc_rdma_post_recv(newxprt); 1166 ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
1065 if (ret) { 1167 if (ret) {
1066 dprintk("svcrdma: failure posting receive buffers\n"); 1168 dprintk("svcrdma: failure posting receive buffers\n");
1067 goto errout; 1169 goto errout;
@@ -1160,12 +1262,14 @@ static void __svc_rdma_free(struct work_struct *work)
1160{ 1262{
1161 struct svcxprt_rdma *rdma = 1263 struct svcxprt_rdma *rdma =
1162 container_of(work, struct svcxprt_rdma, sc_work); 1264 container_of(work, struct svcxprt_rdma, sc_work);
1163 dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); 1265 struct svc_xprt *xprt = &rdma->sc_xprt;
1266
1267 dprintk("svcrdma: %s(%p)\n", __func__, rdma);
1164 1268
1165 /* We should only be called from kref_put */ 1269 /* We should only be called from kref_put */
1166 if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0) 1270 if (atomic_read(&xprt->xpt_ref.refcount) != 0)
1167 pr_err("svcrdma: sc_xprt still in use? (%d)\n", 1271 pr_err("svcrdma: sc_xprt still in use? (%d)\n",
1168 atomic_read(&rdma->sc_xprt.xpt_ref.refcount)); 1272 atomic_read(&xprt->xpt_ref.refcount));
1169 1273
1170 /* 1274 /*
1171 * Destroy queued, but not processed read completions. Note 1275 * Destroy queued, but not processed read completions. Note
@@ -1193,15 +1297,22 @@ static void __svc_rdma_free(struct work_struct *work)
1193 } 1297 }
1194 1298
1195 /* Warn if we leaked a resource or under-referenced */ 1299 /* Warn if we leaked a resource or under-referenced */
1196 if (atomic_read(&rdma->sc_ctxt_used) != 0) 1300 if (rdma->sc_ctxt_used != 0)
1197 pr_err("svcrdma: ctxt still in use? (%d)\n", 1301 pr_err("svcrdma: ctxt still in use? (%d)\n",
1198 atomic_read(&rdma->sc_ctxt_used)); 1302 rdma->sc_ctxt_used);
1199 if (atomic_read(&rdma->sc_dma_used) != 0) 1303 if (atomic_read(&rdma->sc_dma_used) != 0)
1200 pr_err("svcrdma: dma still in use? (%d)\n", 1304 pr_err("svcrdma: dma still in use? (%d)\n",
1201 atomic_read(&rdma->sc_dma_used)); 1305 atomic_read(&rdma->sc_dma_used));
1202 1306
1203 /* De-allocate fastreg mr */ 1307 /* Final put of backchannel client transport */
1308 if (xprt->xpt_bc_xprt) {
1309 xprt_put(xprt->xpt_bc_xprt);
1310 xprt->xpt_bc_xprt = NULL;
1311 }
1312
1204 rdma_dealloc_frmr_q(rdma); 1313 rdma_dealloc_frmr_q(rdma);
1314 svc_rdma_destroy_ctxts(rdma);
1315 svc_rdma_destroy_maps(rdma);
1205 1316
1206 /* Destroy the QP if present (not a listener) */ 1317 /* Destroy the QP if present (not a listener) */
1207 if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) 1318 if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
@@ -1213,9 +1324,6 @@ static void __svc_rdma_free(struct work_struct *work)
1213 if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) 1324 if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
1214 ib_destroy_cq(rdma->sc_rq_cq); 1325 ib_destroy_cq(rdma->sc_rq_cq);
1215 1326
1216 if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr))
1217 ib_dereg_mr(rdma->sc_phys_mr);
1218
1219 if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) 1327 if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
1220 ib_dealloc_pd(rdma->sc_pd); 1328 ib_dealloc_pd(rdma->sc_pd);
1221 1329
@@ -1321,7 +1429,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1321 int length; 1429 int length;
1322 int ret; 1430 int ret;
1323 1431
1324 p = alloc_page(GFP_KERNEL | __GFP_NOFAIL); 1432 p = alloc_page(GFP_KERNEL);
1433 if (!p)
1434 return;
1325 va = page_address(p); 1435 va = page_address(p);
1326 1436
1327 /* XDR encode error */ 1437 /* XDR encode error */
@@ -1341,7 +1451,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1341 return; 1451 return;
1342 } 1452 }
1343 atomic_inc(&xprt->sc_dma_used); 1453 atomic_inc(&xprt->sc_dma_used);
1344 ctxt->sge[0].lkey = xprt->sc_dma_lkey; 1454 ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey;
1345 ctxt->sge[0].length = length; 1455 ctxt->sge[0].length = length;
1346 1456
1347 /* Prepare SEND WR */ 1457 /* Prepare SEND WR */
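
The transport hunks above replace the global kmem_cache pools with per-transport free lists: contexts and request maps are pre-allocated when the connection is accepted, handed out under a lock, and only when a list is unexpectedly empty does the code fall back to a fresh GFP_NOIO allocation. A minimal userspace sketch of that get/put pattern, not part of the patch, with hypothetical names (compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* A pre-allocated object with an intrusive free-list link. */
struct fake_ctxt {
        struct fake_ctxt *next;
        int count;
};

static struct fake_ctxt *free_list;
static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;

/* Fill the free list up front, like svc_rdma_prealloc_ctxts(). */
static int prealloc_ctxts(unsigned int n)
{
        while (n--) {
                struct fake_ctxt *c = malloc(sizeof(*c));

                if (!c)
                        return -1;
                pthread_mutex_lock(&free_lock);
                c->next = free_list;
                free_list = c;
                pthread_mutex_unlock(&free_lock);
        }
        return 0;
}

/* Take an object from the list; fall back to a fresh allocation if empty. */
static struct fake_ctxt *get_ctxt(void)
{
        struct fake_ctxt *c;

        pthread_mutex_lock(&free_lock);
        c = free_list;
        if (c)
                free_list = c->next;
        pthread_mutex_unlock(&free_lock);

        if (!c)
                c = malloc(sizeof(*c)); /* stands in for alloc_ctxt(GFP_NOIO) */
        if (c)
                c->count = 0;
        return c;
}

/* Return an object to the list instead of freeing it. */
static void put_ctxt(struct fake_ctxt *c)
{
        pthread_mutex_lock(&free_lock);
        c->next = free_list;
        free_list = c;
        pthread_mutex_unlock(&free_lock);
}

int main(void)
{
        struct fake_ctxt *c;

        if (prealloc_ctxts(4))
                return 1;
        c = get_ctxt();
        printf("got ctxt %p\n", (void *)c);
        put_ctxt(c);
        return 0;
}
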
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 740bddcf3488..b1b009f10ea3 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -63,7 +63,7 @@
63 */ 63 */
64 64
65static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; 65static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
66static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; 66unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
67static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; 67static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
68static unsigned int xprt_rdma_inline_write_padding; 68static unsigned int xprt_rdma_inline_write_padding;
69static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; 69static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
@@ -143,12 +143,7 @@ static struct ctl_table sunrpc_table[] = {
143 143
144#endif 144#endif
145 145
146#define RPCRDMA_BIND_TO (60U * HZ) 146static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */
147#define RPCRDMA_INIT_REEST_TO (5U * HZ)
148#define RPCRDMA_MAX_REEST_TO (30U * HZ)
149#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
150
151static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
152 147
153static void 148static void
154xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) 149xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
@@ -174,7 +169,7 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
174 xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; 169 xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
175} 170}
176 171
177static void 172void
178xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) 173xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
179{ 174{
180 char buf[128]; 175 char buf[128];
@@ -203,7 +198,7 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
203 xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; 198 xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
204} 199}
205 200
206static void 201void
207xprt_rdma_free_addresses(struct rpc_xprt *xprt) 202xprt_rdma_free_addresses(struct rpc_xprt *xprt)
208{ 203{
209 unsigned int i; 204 unsigned int i;
@@ -499,7 +494,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
499 if (req == NULL) 494 if (req == NULL)
500 return NULL; 495 return NULL;
501 496
502 flags = GFP_NOIO | __GFP_NOWARN; 497 flags = RPCRDMA_DEF_GFP;
503 if (RPC_IS_SWAPPER(task)) 498 if (RPC_IS_SWAPPER(task))
504 flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; 499 flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
505 500
@@ -642,7 +637,7 @@ drop_connection:
642 return -ENOTCONN; /* implies disconnect */ 637 return -ENOTCONN; /* implies disconnect */
643} 638}
644 639
645static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) 640void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
646{ 641{
647 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 642 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
648 long idle_time = 0; 643 long idle_time = 0;
@@ -743,6 +738,11 @@ void xprt_rdma_cleanup(void)
743 738
744 rpcrdma_destroy_wq(); 739 rpcrdma_destroy_wq();
745 frwr_destroy_recovery_wq(); 740 frwr_destroy_recovery_wq();
741
742 rc = xprt_unregister_transport(&xprt_rdma_bc);
743 if (rc)
744 dprintk("RPC: %s: xprt_unregister(bc) returned %i\n",
745 __func__, rc);
746} 746}
747 747
748int xprt_rdma_init(void) 748int xprt_rdma_init(void)
@@ -766,6 +766,14 @@ int xprt_rdma_init(void)
766 return rc; 766 return rc;
767 } 767 }
768 768
769 rc = xprt_register_transport(&xprt_rdma_bc);
770 if (rc) {
771 xprt_unregister_transport(&xprt_rdma);
772 rpcrdma_destroy_wq();
773 frwr_destroy_recovery_wq();
774 return rc;
775 }
776
769 dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); 777 dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
770 778
771 dprintk("Defaults:\n"); 779 dprintk("Defaults:\n");
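
xprt_rdma_init() above now registers a second transport class for the server-side backchannel and, if that registration fails, unwinds the steps already taken; xprt_rdma_cleanup() unregisters it again. A minimal sketch of the register-with-rollback shape, not part of the patch, with hypothetical registration hooks:

#include <stdio.h>

/* Hypothetical hooks standing in for xprt_register_transport() and the
 * workqueue setup performed by the real module init. */
static int register_forward(void)      { return 0; }
static int register_backchannel(void)  { return 0; }
static void unregister_forward(void)   { }

static int module_init_sketch(void)
{
        int rc;

        rc = register_forward();
        if (rc)
                return rc;

        rc = register_backchannel();
        if (rc) {
                /* Roll back everything done so far, in reverse order. */
                unregister_forward();
                return rc;
        }
        return 0;
}

int main(void)
{
        return module_init_sketch() ? 1 : 0;
}
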
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 732c71ce5dca..878f1bfb1db9 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -462,7 +462,6 @@ int
462rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) 462rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
463{ 463{
464 struct rpcrdma_ia *ia = &xprt->rx_ia; 464 struct rpcrdma_ia *ia = &xprt->rx_ia;
465 struct ib_device_attr *devattr = &ia->ri_devattr;
466 int rc; 465 int rc;
467 466
468 ia->ri_dma_mr = NULL; 467 ia->ri_dma_mr = NULL;
@@ -482,16 +481,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
482 goto out2; 481 goto out2;
483 } 482 }
484 483
485 rc = ib_query_device(ia->ri_device, devattr);
486 if (rc) {
487 dprintk("RPC: %s: ib_query_device failed %d\n",
488 __func__, rc);
489 goto out3;
490 }
491
492 if (memreg == RPCRDMA_FRMR) { 484 if (memreg == RPCRDMA_FRMR) {
493 if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) || 485 if (!(ia->ri_device->attrs.device_cap_flags &
494 (devattr->max_fast_reg_page_list_len == 0)) { 486 IB_DEVICE_MEM_MGT_EXTENSIONS) ||
487 (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
495 dprintk("RPC: %s: FRMR registration " 488 dprintk("RPC: %s: FRMR registration "
496 "not supported by HCA\n", __func__); 489 "not supported by HCA\n", __func__);
497 memreg = RPCRDMA_MTHCAFMR; 490 memreg = RPCRDMA_MTHCAFMR;
@@ -566,24 +559,23 @@ int
566rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, 559rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
567 struct rpcrdma_create_data_internal *cdata) 560 struct rpcrdma_create_data_internal *cdata)
568{ 561{
569 struct ib_device_attr *devattr = &ia->ri_devattr;
570 struct ib_cq *sendcq, *recvcq; 562 struct ib_cq *sendcq, *recvcq;
571 struct ib_cq_init_attr cq_attr = {}; 563 struct ib_cq_init_attr cq_attr = {};
572 unsigned int max_qp_wr; 564 unsigned int max_qp_wr;
573 int rc, err; 565 int rc, err;
574 566
575 if (devattr->max_sge < RPCRDMA_MAX_IOVS) { 567 if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) {
576 dprintk("RPC: %s: insufficient sge's available\n", 568 dprintk("RPC: %s: insufficient sge's available\n",
577 __func__); 569 __func__);
578 return -ENOMEM; 570 return -ENOMEM;
579 } 571 }
580 572
581 if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) { 573 if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
582 dprintk("RPC: %s: insufficient wqe's available\n", 574 dprintk("RPC: %s: insufficient wqe's available\n",
583 __func__); 575 __func__);
584 return -ENOMEM; 576 return -ENOMEM;
585 } 577 }
586 max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS; 578 max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS;
587 579
588 /* check provider's send/recv wr limits */ 580 /* check provider's send/recv wr limits */
589 if (cdata->max_requests > max_qp_wr) 581 if (cdata->max_requests > max_qp_wr)
@@ -668,11 +660,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
668 660
669 /* Client offers RDMA Read but does not initiate */ 661 /* Client offers RDMA Read but does not initiate */
670 ep->rep_remote_cma.initiator_depth = 0; 662 ep->rep_remote_cma.initiator_depth = 0;
671 if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */ 663 if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
672 ep->rep_remote_cma.responder_resources = 32; 664 ep->rep_remote_cma.responder_resources = 32;
673 else 665 else
674 ep->rep_remote_cma.responder_resources = 666 ep->rep_remote_cma.responder_resources =
675 devattr->max_qp_rd_atom; 667 ia->ri_device->attrs.max_qp_rd_atom;
676 668
677 ep->rep_remote_cma.retry_count = 7; 669 ep->rep_remote_cma.retry_count = 7;
678 ep->rep_remote_cma.flow_control = 0; 670 ep->rep_remote_cma.flow_control = 0;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 728101ddc44b..38fe11b09875 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -55,6 +55,11 @@
55#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ 55#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
56#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ 56#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
57 57
58#define RPCRDMA_BIND_TO (60U * HZ)
59#define RPCRDMA_INIT_REEST_TO (5U * HZ)
60#define RPCRDMA_MAX_REEST_TO (30U * HZ)
61#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
62
58/* 63/*
59 * Interface Adapter -- one per transport instance 64 * Interface Adapter -- one per transport instance
60 */ 65 */
@@ -68,7 +73,6 @@ struct rpcrdma_ia {
68 struct completion ri_done; 73 struct completion ri_done;
69 int ri_async_rc; 74 int ri_async_rc;
70 unsigned int ri_max_frmr_depth; 75 unsigned int ri_max_frmr_depth;
71 struct ib_device_attr ri_devattr;
72 struct ib_qp_attr ri_qp_attr; 76 struct ib_qp_attr ri_qp_attr;
73 struct ib_qp_init_attr ri_qp_init_attr; 77 struct ib_qp_init_attr ri_qp_init_attr;
74}; 78};
@@ -142,6 +146,8 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
142 return (struct rpcrdma_msg *)rb->rg_base; 146 return (struct rpcrdma_msg *)rb->rg_base;
143} 147}
144 148
149#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
150
145/* 151/*
146 * struct rpcrdma_rep -- this structure encapsulates state required to recv 152 * struct rpcrdma_rep -- this structure encapsulates state required to recv
147 * and complete a reply, asychronously. It needs several pieces of 153 * and complete a reply, asychronously. It needs several pieces of
@@ -309,6 +315,8 @@ struct rpcrdma_buffer {
309 u32 rb_bc_srv_max_requests; 315 u32 rb_bc_srv_max_requests;
310 spinlock_t rb_reqslock; /* protect rb_allreqs */ 316 spinlock_t rb_reqslock; /* protect rb_allreqs */
311 struct list_head rb_allreqs; 317 struct list_head rb_allreqs;
318
319 u32 rb_bc_max_requests;
312}; 320};
313#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) 321#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
314 322
@@ -516,6 +524,10 @@ int rpcrdma_marshal_req(struct rpc_rqst *);
516 524
517/* RPC/RDMA module init - xprtrdma/transport.c 525/* RPC/RDMA module init - xprtrdma/transport.c
518 */ 526 */
527extern unsigned int xprt_rdma_max_inline_read;
528void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
529void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
530void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
519int xprt_rdma_init(void); 531int xprt_rdma_init(void);
520void xprt_rdma_cleanup(void); 532void xprt_rdma_cleanup(void);
521 533
@@ -531,11 +543,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *);
531void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); 543void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
532#endif /* CONFIG_SUNRPC_BACKCHANNEL */ 544#endif /* CONFIG_SUNRPC_BACKCHANNEL */
533 545
534/* Temporary NFS request map cache. Created in svc_rdma.c */ 546extern struct xprt_class xprt_rdma_bc;
535extern struct kmem_cache *svc_rdma_map_cachep;
536/* WR context cache. Created in svc_rdma.c */
537extern struct kmem_cache *svc_rdma_ctxt_cachep;
538/* Workqueue created in svc_rdma.c */
539extern struct workqueue_struct *svc_rdma_wq;
540 547
541#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ 548#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */