author     Trond Myklebust <trond.myklebust@primarydata.com>  2016-07-24 17:09:02 -0400
committer  Trond Myklebust <trond.myklebust@primarydata.com>  2016-07-24 17:09:02 -0400
commit     1592c4d62a89bbca895c568d65ce290dfbc36ecc (patch)
tree       6b979bc02ded2ea7e644c34e6939ffbbb7ee001d /net
parent     668f455dac57050e33a43ff5fe006f6cd947fc65 (diff)
parent     f0445670bd81cae9f46399d98fef5cd1622d9776 (diff)

Merge branch 'nfs-rdma'
Diffstat (limited to 'net')
-rw-r--r--  net/bridge/br_netfilter_hooks.c        |   2
-rw-r--r--  net/core/flow_dissector.c              |  43
-rw-r--r--  net/core/skbuff.c                      |  18
-rw-r--r--  net/decnet/dn_fib.c                    |  21
-rw-r--r--  net/ipv4/ip_output.c                   |   4
-rw-r--r--  net/ipv6/ip6_fib.c                     |   1
-rw-r--r--  net/packet/af_packet.c                 |   2
-rw-r--r--  net/rds/tcp.c                          |   5
-rw-r--r--  net/sched/act_mirred.c                 |   2
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c         |   2
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_mech.c    |   2
-rw-r--r--  net/sunrpc/auth_gss/gss_mech_switch.c  |  12
-rw-r--r--  net/sunrpc/svc.c                       |   8
-rw-r--r--  net/sunrpc/xprtrdma/Makefile           |   2
-rw-r--r--  net/sunrpc/xprtrdma/fmr_ops.c          | 378
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c         | 369
-rw-r--r--  net/sunrpc/xprtrdma/physical_ops.c     | 122
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c         | 274
-rw-r--r--  net/sunrpc/xprtrdma/transport.c        |  40
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c            | 242
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h        | 118
-rw-r--r--  net/tipc/netlink_compat.c              |   2
22 files changed, 783 insertions, 886 deletions
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 2d25979273a6..77e7f69bf80d 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -700,7 +700,7 @@ static int
 br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		  int (*output)(struct net *, struct sock *, struct sk_buff *))
 {
-	unsigned int mtu = ip_skb_dst_mtu(skb);
+	unsigned int mtu = ip_skb_dst_mtu(sk, skb);
 	struct iphdr *iph = ip_hdr(skb);
 
 	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index a669dea146c6..61ad43f61c5e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -651,6 +651,23 @@ void make_flow_keys_digest(struct flow_keys_digest *digest,
 }
 EXPORT_SYMBOL(make_flow_keys_digest);
 
+static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
+
+u32 __skb_get_hash_symmetric(struct sk_buff *skb)
+{
+	struct flow_keys keys;
+
+	__flow_hash_secret_init();
+
+	memset(&keys, 0, sizeof(keys));
+	__skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys,
+			   NULL, 0, 0, 0,
+			   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
+	return __flow_hash_from_keys(&keys, hashrnd);
+}
+EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
+
 /**
  * __skb_get_hash: calculate a flow hash
  * @skb: sk_buff to calculate flow hash from
@@ -868,6 +885,29 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = {
 	},
 };
 
+static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
+	{
+		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
+		.offset = offsetof(struct flow_keys, control),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_BASIC,
+		.offset = offsetof(struct flow_keys, basic),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+		.offset = offsetof(struct flow_keys, addrs.v4addrs),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+		.offset = offsetof(struct flow_keys, addrs.v6addrs),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_PORTS,
+		.offset = offsetof(struct flow_keys, ports),
+	},
+};
+
 static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
 	{
 		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
@@ -889,6 +929,9 @@ static int __init init_default_flow_dissectors(void)
 	skb_flow_dissector_init(&flow_keys_dissector,
 				flow_keys_dissector_keys,
 				ARRAY_SIZE(flow_keys_dissector_keys));
+	skb_flow_dissector_init(&flow_keys_dissector_symmetric,
+				flow_keys_dissector_symmetric_keys,
+				ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
 	skb_flow_dissector_init(&flow_keys_buf_dissector,
 				flow_keys_buf_dissector_keys,
 				ARRAY_SIZE(flow_keys_buf_dissector_keys));
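
The hunks above add __skb_get_hash_symmetric(), which hashes only the control, basic, address, and port keys so that both directions of the same flow produce the same hash value. The standalone C sketch below is a toy model of that idea, not the kernel implementation: it canonicalizes the endpoint pair before mixing, which is enough to make the forward and reverse 5-tuples hash identically. All struct and function names in it are invented for illustration.

#include <stdint.h>
#include <stdio.h>

/* Toy 5-tuple; field names are illustrative, not kernel structs. */
struct tuple {
	uint32_t saddr, daddr;
	uint16_t sport, dport;
	uint8_t  proto;
};

/* Order the two (address, port) endpoints so a flow and its reverse
 * canonicalize to the same pair before hashing.
 */
static uint32_t toy_symmetric_hash(const struct tuple *t)
{
	uint64_t a = ((uint64_t)t->saddr << 16) | t->sport;
	uint64_t b = ((uint64_t)t->daddr << 16) | t->dport;
	uint64_t lo = a < b ? a : b;
	uint64_t hi = a < b ? b : a;
	uint64_t in[3] = { lo, hi, t->proto };
	uint64_t h = 1469598103934665603ULL;	/* FNV-1a style mix for the demo */

	for (int i = 0; i < 3; i++) {
		h ^= in[i];
		h *= 1099511628211ULL;
	}
	return (uint32_t)(h ^ (h >> 32));
}

int main(void)
{
	struct tuple fwd = { 0x0a000001, 0x0a000002, 1234, 80, 6 };
	struct tuple rev = { 0x0a000002, 0x0a000001, 80, 1234, 6 };

	/* Both directions print the same value. */
	printf("fwd=%08x rev=%08x\n",
	       (unsigned)toy_symmetric_hash(&fwd),
	       (unsigned)toy_symmetric_hash(&rev));
	return 0;
}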
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f2b77e549c03..eb12d2161fb2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3016,24 +3016,6 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
 EXPORT_SYMBOL_GPL(skb_append_pagefrags);
 
 /**
- * skb_push_rcsum - push skb and update receive checksum
- * @skb: buffer to update
- * @len: length of data pulled
- *
- * This function performs an skb_push on the packet and updates
- * the CHECKSUM_COMPLETE checksum. It should be used on
- * receive path processing instead of skb_push unless you know
- * that the checksum difference is zero (e.g., a valid IP header)
- * or you are setting ip_summed to CHECKSUM_NONE.
- */
-static unsigned char *skb_push_rcsum(struct sk_buff *skb, unsigned len)
-{
-	skb_push(skb, len);
-	skb_postpush_rcsum(skb, skb->data, len);
-	return skb->data;
-}
-
-/**
  * skb_pull_rcsum - pull skb and update receive checksum
  * @skb: buffer to update
  * @len: length of data pulled
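
skb_push_rcsum() is removed from skbuff.c here, apparently because it becomes a shared helper used elsewhere in this series (the act_mirred hunk below switches to it). Conceptually the helper pushes bytes back into the packet and folds their checksum into the running CHECKSUM_COMPLETE value. The sketch below is a minimal userspace model of that ones'-complement bookkeeping; it assumes nothing about the kernel's csum helpers and uses invented names.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Toy 16-bit ones'-complement accumulator: the arithmetic behind
 * CHECKSUM_COMPLETE bookkeeping, not the kernel implementation.
 */
static uint32_t toy_csum_add(uint32_t sum, const uint8_t *data, size_t len)
{
	for (size_t i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)data[i] << 8 | data[i + 1];
	if (len & 1)
		sum += (uint32_t)data[len - 1] << 8;
	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return sum;
}

int main(void)
{
	uint8_t header[4]  = { 0x45, 0x00, 0x00, 0x54 };
	uint8_t payload[6] = { 1, 2, 3, 4, 5, 6 };

	/* Checksum over the payload alone... */
	uint32_t sum = toy_csum_add(0, payload, sizeof(payload));

	/* ...then "push" the header back in front and fold its sum in,
	 * which is what an skb_push_rcsum-style helper has to do.
	 */
	sum = toy_csum_add(sum, header, sizeof(header));
	printf("running sum: 0x%04x\n", (unsigned)sum);
	return 0;
}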
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index df4803437888..a796fc7cbc35 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -41,6 +41,7 @@
 #include <net/dn_fib.h>
 #include <net/dn_neigh.h>
 #include <net/dn_dev.h>
+#include <net/nexthop.h>
 
 #define RT_MIN_TABLE 1
 
@@ -150,14 +151,13 @@ static int dn_fib_count_nhs(const struct nlattr *attr)
 	struct rtnexthop *nhp = nla_data(attr);
 	int nhs = 0, nhlen = nla_len(attr);
 
-	while(nhlen >= (int)sizeof(struct rtnexthop)) {
-		if ((nhlen -= nhp->rtnh_len) < 0)
-			return 0;
+	while (rtnh_ok(nhp, nhlen)) {
 		nhs++;
-		nhp = RTNH_NEXT(nhp);
+		nhp = rtnh_next(nhp, &nhlen);
 	}
 
-	return nhs;
+	/* leftover implies invalid nexthop configuration, discard it */
+	return nhlen > 0 ? 0 : nhs;
 }
 
 static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct nlattr *attr,
@@ -167,21 +167,24 @@ static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct nlattr *attr,
 	int nhlen = nla_len(attr);
 
 	change_nexthops(fi) {
-		int attrlen = nhlen - sizeof(struct rtnexthop);
-		if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+		int attrlen;
+
+		if (!rtnh_ok(nhp, nhlen))
 			return -EINVAL;
 
 		nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
 		nh->nh_oif = nhp->rtnh_ifindex;
 		nh->nh_weight = nhp->rtnh_hops + 1;
 
-		if (attrlen) {
+		attrlen = rtnh_attrlen(nhp);
+		if (attrlen > 0) {
 			struct nlattr *gw_attr;
 
 			gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY);
 			nh->nh_gw = gw_attr ? nla_get_le16(gw_attr) : 0;
 		}
-		nhp = RTNH_NEXT(nhp);
+
+		nhp = rtnh_next(nhp, &nhlen);
 	} endfor_nexthops(fi);
 
 	return 0;
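
The rewrite above replaces open-coded rtnexthop length arithmetic with the shared rtnh_ok()/rtnh_next() walkers, so each record is bounds-checked before it is dereferenced and leftover bytes are treated as a malformed configuration. The self-contained sketch below models that bounded-walk pattern with a made-up record type; it illustrates the technique, not the netlink code itself.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Minimal stand-in for a length-prefixed record stream, in the spirit
 * of struct rtnexthop; struct and helper names here are invented.
 */
struct rec {
	uint16_t len;	/* total length of this record, incl. header */
	uint16_t type;
};

static int rec_ok(const struct rec *r, int remaining)
{
	return remaining >= (int)sizeof(*r) &&
	       r->len >= sizeof(*r) &&
	       (int)r->len <= remaining;
}

static const struct rec *rec_next(const struct rec *r, int *remaining)
{
	*remaining -= r->len;
	return (const struct rec *)((const uint8_t *)r + r->len);
}

int main(void)
{
	uint16_t buf[16] = { 0 };
	uint8_t *p = (uint8_t *)buf;
	struct rec a = { .len = 8,  .type = 1 };
	struct rec b = { .len = 12, .type = 2 };

	memcpy(p, &a, sizeof(a));
	memcpy(p + 8, &b, sizeof(b));

	int remaining = 20, count = 0;
	const struct rec *r = (const struct rec *)p;
	while (rec_ok(r, remaining)) {	/* bounds-checked, like rtnh_ok() */
		count++;
		r = rec_next(r, &remaining);
	}
	/* Leftover bytes would indicate a malformed stream. */
	printf("records=%d leftover=%d\n", count, remaining);
	return 0;
}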
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 124bf0a66328..4bd4921639c3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -271,7 +271,7 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
 		return dst_output(net, sk, skb);
 	}
 #endif
-	mtu = ip_skb_dst_mtu(skb);
+	mtu = ip_skb_dst_mtu(sk, skb);
 	if (skb_is_gso(skb))
 		return ip_finish_output_gso(net, sk, skb, mtu);
 
@@ -541,7 +541,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 
 	iph = ip_hdr(skb);
 
-	mtu = ip_skb_dst_mtu(skb);
+	mtu = ip_skb_dst_mtu(sk, skb);
 	if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
 		mtu = IPCB(skb)->frag_max_size;
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1bcef2369d64..771be1fa4176 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -177,6 +177,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
 		}
 	}
 
+	free_percpu(non_pcpu_rt->rt6i_pcpu);
 	non_pcpu_rt->rt6i_pcpu = NULL;
 }
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 9bff6ef16fa7..9f0983fa4d52 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1341,7 +1341,7 @@ static unsigned int fanout_demux_hash(struct packet_fanout *f,
 				      struct sk_buff *skb,
 				      unsigned int num)
 {
-	return reciprocal_scale(skb_get_hash(skb), num);
+	return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
 }
 
 static unsigned int fanout_demux_lb(struct packet_fanout *f,
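
With this change, packet fanout hashing uses the symmetric flow hash, so both directions of a connection map to the same socket in the fanout group. reciprocal_scale() then reduces the 32-bit hash into the range [0, num) with a multiply and shift instead of a modulo; the toy below shows that arithmetic (to my understanding it mirrors the kernel helper, but treat it as a sketch).

#include <stdint.h>
#include <stdio.h>

/* Multiplicative range reduction: maps a 32-bit value into [0, n)
 * using one multiply and a shift instead of a modulo.
 */
static uint32_t toy_reciprocal_scale(uint32_t val, uint32_t n)
{
	return (uint32_t)(((uint64_t)val * n) >> 32);
}

int main(void)
{
	uint32_t hash = 0x9e3779b9;	/* pretend flow hash */
	uint32_t nsockets = 4;		/* fanout group size */

	printf("socket index: %u\n",
	       (unsigned)toy_reciprocal_scale(hash, nsockets));
	return 0;
}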
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 74ee126a6fe6..c8a7b4c90190 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -616,7 +616,7 @@ static int rds_tcp_init(void)
 
 	ret = rds_tcp_recv_init();
 	if (ret)
-		goto out_slab;
+		goto out_pernet;
 
 	ret = rds_trans_register(&rds_tcp_transport);
 	if (ret)
@@ -628,8 +628,9 @@ static int rds_tcp_init(void)
 
 out_recv:
 	rds_tcp_recv_exit();
-out_slab:
+out_pernet:
 	unregister_pernet_subsys(&rds_tcp_net_ops);
+out_slab:
 	kmem_cache_destroy(rds_tcp_conn_slab);
 out:
 	return ret;
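
The fix above gives the pernet unregistration its own out_pernet label, so a failure in rds_tcp_recv_init() or later unwinds everything acquired so far, in reverse order, before freeing the slab. A generic, hedged sketch of that goto-unwind ladder (nothing RDS-specific, all names invented):

#include <stdio.h>

/* Generic shape of the goto-unwind ladder the fix restores: each
 * resource gets its own label, and a failure jumps to the label that
 * releases everything acquired so far, in reverse order.
 */
static int  setup_a(void) { return 0; }
static int  setup_b(void) { return 0; }
static int  setup_c(void) { return -1; }	/* pretend this step fails */
static void teardown_a(void) { puts("undo a"); }
static void teardown_b(void) { puts("undo b"); }

static int init_all(void)
{
	int ret;

	ret = setup_a();
	if (ret)
		goto out;
	ret = setup_b();
	if (ret)
		goto out_a;
	ret = setup_c();
	if (ret)
		goto out_b;	/* must undo b *and* a, hence its own label */
	return 0;

out_b:
	teardown_b();
out_a:
	teardown_a();
out:
	return ret;
}

int main(void)
{
	printf("init_all() = %d\n", init_all());
	return 0;
}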
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 128942bc9e42..1f5bd6ccbd2c 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -181,7 +181,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 
 	if (!(at & AT_EGRESS)) {
 		if (m->tcfm_ok_push)
-			skb_push(skb2, skb->mac_len);
+			skb_push_rcsum(skb2, skb->mac_len);
 	}
 
 	/* mirror is always swallowed */
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 813a3cdfb573..23c8e7c39656 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1018,6 +1018,8 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 	auth->au_flags = 0;
 	auth->au_ops = &authgss_ops;
 	auth->au_flavor = flavor;
+	if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
+		auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
 	atomic_set(&auth->au_count, 1);
 	kref_init(&gss_auth->kref);
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 65427492b1c9..60595835317a 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = {
 		.qop = GSS_C_QOP_DEFAULT,
 		.service = RPC_GSS_SVC_INTEGRITY,
 		.name = "krb5i",
+		.datatouch = true,
 	},
 	[2] = {
 		.pseudoflavor = RPC_AUTH_GSS_KRB5P,
 		.qop = GSS_C_QOP_DEFAULT,
 		.service = RPC_GSS_SVC_PRIVACY,
 		.name = "krb5p",
+		.datatouch = true,
 	},
 };
 
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 7063d856a598..5fec3abbe19b 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
 }
 EXPORT_SYMBOL(gss_pseudoflavor_to_service);
 
+bool
+gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor)
+{
+	int i;
+
+	for (i = 0; i < gm->gm_pf_num; i++) {
+		if (gm->gm_pfs[i].pseudoflavor == pseudoflavor)
+			return gm->gm_pfs[i].datatouch;
+	}
+	return false;
+}
+
 char *
 gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
 {
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index cc9852897395..c5b0cb4f4056 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 	*statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
 
 	/* Encode reply */
-	if (test_bit(RQ_DROPME, &rqstp->rq_flags)) {
+	if (*statp == rpc_drop_reply ||
+	    test_bit(RQ_DROPME, &rqstp->rq_flags)) {
 		if (procp->pc_release)
 			procp->pc_release(rqstp, NULL, rqstp->rq_resp);
 		goto dropit;
 	}
+	if (*statp == rpc_autherr_badcred) {
+		if (procp->pc_release)
+			procp->pc_release(rqstp, NULL, rqstp->rq_resp);
+		goto err_bad_auth;
+	}
 	if (*statp == rpc_success &&
 	    (xdr = procp->pc_encode) &&
 	    !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {
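
The dispatcher above now treats two sentinel status values from the procedure specially: rpc_drop_reply releases the response and drops it, and rpc_autherr_badcred releases the response and reports an authentication error instead of encoding a reply. The toy dispatcher below sketches that sentinel-driven pattern with invented names; it is only an illustration, not the sunrpc code.

#include <stdio.h>

/* Toy dispatcher: the procedure returns a status word, and sentinel
 * values tell the dispatcher to drop the reply or fail authentication
 * instead of encoding a normal result.  All names are illustrative.
 */
enum { TOY_SUCCESS = 0, TOY_DROP_REPLY = 1, TOY_BADCRED = 2 };

static int toy_proc(int request)
{
	if (request < 0)
		return TOY_BADCRED;
	if (request == 0)
		return TOY_DROP_REPLY;
	return TOY_SUCCESS;
}

static void toy_release(void) { puts("release reply resources"); }

static void dispatch(int request)
{
	int stat = toy_proc(request);

	if (stat == TOY_DROP_REPLY) {
		toy_release();
		puts("dropit: no reply sent");
		return;
	}
	if (stat == TOY_BADCRED) {
		toy_release();
		puts("err_bad_auth: send an auth error instead");
		return;
	}
	puts("encode normal reply");
}

int main(void)
{
	dispatch(1);	/* normal */
	dispatch(0);	/* dropped */
	dispatch(-1);	/* bad credentials */
	return 0;
}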
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index dc9f3b513a05..ef19fa42c50f 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,7 +1,7 @@
 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
 
 rpcrdma-y := transport.o rpc_rdma.o verbs.o \
-	fmr_ops.o frwr_ops.o physical_ops.o \
+	fmr_ops.o frwr_ops.o \
	svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
	svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
	module.o
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 6326ebe8b595..21cb3b150b37 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -19,13 +19,6 @@
19 * verb (fmr_op_unmap). 19 * verb (fmr_op_unmap).
20 */ 20 */
21 21
22/* Transport recovery
23 *
24 * After a transport reconnect, fmr_op_map re-uses the MR already
25 * allocated for the RPC, but generates a fresh rkey then maps the
26 * MR again. This process is synchronous.
27 */
28
29#include "xprt_rdma.h" 22#include "xprt_rdma.h"
30 23
31#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 24#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -35,62 +28,132 @@
35/* Maximum scatter/gather per FMR */ 28/* Maximum scatter/gather per FMR */
36#define RPCRDMA_MAX_FMR_SGES (64) 29#define RPCRDMA_MAX_FMR_SGES (64)
37 30
38static struct workqueue_struct *fmr_recovery_wq; 31/* Access mode of externally registered pages */
39 32enum {
40#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND) 33 RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE |
34 IB_ACCESS_REMOTE_READ,
35};
41 36
42int 37bool
43fmr_alloc_recovery_wq(void) 38fmr_is_supported(struct rpcrdma_ia *ia)
44{ 39{
45 fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0); 40 if (!ia->ri_device->alloc_fmr) {
46 return !fmr_recovery_wq ? -ENOMEM : 0; 41 pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
42 ia->ri_device->name);
43 return false;
44 }
45 return true;
47} 46}
48 47
49void 48static int
50fmr_destroy_recovery_wq(void) 49fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
51{ 50{
52 struct workqueue_struct *wq; 51 static struct ib_fmr_attr fmr_attr = {
52 .max_pages = RPCRDMA_MAX_FMR_SGES,
53 .max_maps = 1,
54 .page_shift = PAGE_SHIFT
55 };
53 56
54 if (!fmr_recovery_wq) 57 mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
55 return; 58 sizeof(u64), GFP_KERNEL);
59 if (!mw->fmr.fm_physaddrs)
60 goto out_free;
56 61
57 wq = fmr_recovery_wq; 62 mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
58 fmr_recovery_wq = NULL; 63 sizeof(*mw->mw_sg), GFP_KERNEL);
59 destroy_workqueue(wq); 64 if (!mw->mw_sg)
65 goto out_free;
66
67 sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
68
69 mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
70 &fmr_attr);
71 if (IS_ERR(mw->fmr.fm_mr))
72 goto out_fmr_err;
73
74 return 0;
75
76out_fmr_err:
77 dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
78 PTR_ERR(mw->fmr.fm_mr));
79
80out_free:
81 kfree(mw->mw_sg);
82 kfree(mw->fmr.fm_physaddrs);
83 return -ENOMEM;
60} 84}
61 85
62static int 86static int
63__fmr_unmap(struct rpcrdma_mw *mw) 87__fmr_unmap(struct rpcrdma_mw *mw)
64{ 88{
65 LIST_HEAD(l); 89 LIST_HEAD(l);
90 int rc;
66 91
67 list_add(&mw->fmr.fmr->list, &l); 92 list_add(&mw->fmr.fm_mr->list, &l);
68 return ib_unmap_fmr(&l); 93 rc = ib_unmap_fmr(&l);
94 list_del_init(&mw->fmr.fm_mr->list);
95 return rc;
69} 96}
70 97
71/* Deferred reset of a single FMR. Generate a fresh rkey by
72 * replacing the MR. There's no recovery if this fails.
73 */
74static void 98static void
75__fmr_recovery_worker(struct work_struct *work) 99fmr_op_release_mr(struct rpcrdma_mw *r)
76{ 100{
77 struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw, 101 LIST_HEAD(unmap_list);
78 mw_work); 102 int rc;
79 struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
80 103
81 __fmr_unmap(mw); 104 /* Ensure MW is not on any rl_registered list */
82 rpcrdma_put_mw(r_xprt, mw); 105 if (!list_empty(&r->mw_list))
83 return; 106 list_del(&r->mw_list);
107
108 kfree(r->fmr.fm_physaddrs);
109 kfree(r->mw_sg);
110
111 /* In case this one was left mapped, try to unmap it
112 * to prevent dealloc_fmr from failing with EBUSY
113 */
114 rc = __fmr_unmap(r);
115 if (rc)
116 pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
117 r, rc);
118
119 rc = ib_dealloc_fmr(r->fmr.fm_mr);
120 if (rc)
121 pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
122 r, rc);
123
124 kfree(r);
84} 125}
85 126
86/* A broken MR was discovered in a context that can't sleep. 127/* Reset of a single FMR.
87 * Defer recovery to the recovery worker.
88 */ 128 */
89static void 129static void
90__fmr_queue_recovery(struct rpcrdma_mw *mw) 130fmr_op_recover_mr(struct rpcrdma_mw *mw)
91{ 131{
92 INIT_WORK(&mw->mw_work, __fmr_recovery_worker); 132 struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
93 queue_work(fmr_recovery_wq, &mw->mw_work); 133 int rc;
134
135 /* ORDER: invalidate first */
136 rc = __fmr_unmap(mw);
137
138 /* ORDER: then DMA unmap */
139 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
140 mw->mw_sg, mw->mw_nents, mw->mw_dir);
141 if (rc)
142 goto out_release;
143
144 rpcrdma_put_mw(r_xprt, mw);
145 r_xprt->rx_stats.mrs_recovered++;
146 return;
147
148out_release:
149 pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
150 r_xprt->rx_stats.mrs_orphaned++;
151
152 spin_lock(&r_xprt->rx_buf.rb_mwlock);
153 list_del(&mw->mw_all);
154 spin_unlock(&r_xprt->rx_buf.rb_mwlock);
155
156 fmr_op_release_mr(mw);
94} 157}
95 158
96static int 159static int
@@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
112 RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); 175 RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
113} 176}
114 177
115static int
116fmr_op_init(struct rpcrdma_xprt *r_xprt)
117{
118 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
119 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
120 struct ib_fmr_attr fmr_attr = {
121 .max_pages = RPCRDMA_MAX_FMR_SGES,
122 .max_maps = 1,
123 .page_shift = PAGE_SHIFT
124 };
125 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
126 struct rpcrdma_mw *r;
127 int i, rc;
128
129 spin_lock_init(&buf->rb_mwlock);
130 INIT_LIST_HEAD(&buf->rb_mws);
131 INIT_LIST_HEAD(&buf->rb_all);
132
133 i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
134 i += 2; /* head + tail */
135 i *= buf->rb_max_requests; /* one set for each RPC slot */
136 dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
137
138 rc = -ENOMEM;
139 while (i--) {
140 r = kzalloc(sizeof(*r), GFP_KERNEL);
141 if (!r)
142 goto out;
143
144 r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
145 sizeof(u64), GFP_KERNEL);
146 if (!r->fmr.physaddrs)
147 goto out_free;
148
149 r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
150 if (IS_ERR(r->fmr.fmr))
151 goto out_fmr_err;
152
153 r->mw_xprt = r_xprt;
154 list_add(&r->mw_list, &buf->rb_mws);
155 list_add(&r->mw_all, &buf->rb_all);
156 }
157 return 0;
158
159out_fmr_err:
160 rc = PTR_ERR(r->fmr.fmr);
161 dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
162 kfree(r->fmr.physaddrs);
163out_free:
164 kfree(r);
165out:
166 return rc;
167}
168
169/* Use the ib_map_phys_fmr() verb to register a memory region 178/* Use the ib_map_phys_fmr() verb to register a memory region
170 * for remote access via RDMA READ or RDMA WRITE. 179 * for remote access via RDMA READ or RDMA WRITE.
171 */ 180 */
172static int 181static int
173fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 182fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
174 int nsegs, bool writing) 183 int nsegs, bool writing, struct rpcrdma_mw **out)
175{ 184{
176 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
177 struct ib_device *device = ia->ri_device;
178 enum dma_data_direction direction = rpcrdma_data_dir(writing);
179 struct rpcrdma_mr_seg *seg1 = seg; 185 struct rpcrdma_mr_seg *seg1 = seg;
180 int len, pageoff, i, rc; 186 int len, pageoff, i, rc;
181 struct rpcrdma_mw *mw; 187 struct rpcrdma_mw *mw;
188 u64 *dma_pages;
182 189
183 mw = seg1->rl_mw; 190 mw = rpcrdma_get_mw(r_xprt);
184 seg1->rl_mw = NULL; 191 if (!mw)
185 if (!mw) { 192 return -ENOBUFS;
186 mw = rpcrdma_get_mw(r_xprt);
187 if (!mw)
188 return -ENOMEM;
189 } else {
190 /* this is a retransmit; generate a fresh rkey */
191 rc = __fmr_unmap(mw);
192 if (rc)
193 return rc;
194 }
195 193
196 pageoff = offset_in_page(seg1->mr_offset); 194 pageoff = offset_in_page(seg1->mr_offset);
197 seg1->mr_offset -= pageoff; /* start of page */ 195 seg1->mr_offset -= pageoff; /* start of page */
@@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
200 if (nsegs > RPCRDMA_MAX_FMR_SGES) 198 if (nsegs > RPCRDMA_MAX_FMR_SGES)
201 nsegs = RPCRDMA_MAX_FMR_SGES; 199 nsegs = RPCRDMA_MAX_FMR_SGES;
202 for (i = 0; i < nsegs;) { 200 for (i = 0; i < nsegs;) {
203 rpcrdma_map_one(device, seg, direction); 201 if (seg->mr_page)
204 mw->fmr.physaddrs[i] = seg->mr_dma; 202 sg_set_page(&mw->mw_sg[i],
203 seg->mr_page,
204 seg->mr_len,
205 offset_in_page(seg->mr_offset));
206 else
207 sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
208 seg->mr_len);
205 len += seg->mr_len; 209 len += seg->mr_len;
206 ++seg; 210 ++seg;
207 ++i; 211 ++i;
@@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
210 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 214 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
211 break; 215 break;
212 } 216 }
213 217 mw->mw_nents = i;
214 rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs, 218 mw->mw_dir = rpcrdma_data_dir(writing);
215 i, seg1->mr_dma); 219 if (i == 0)
220 goto out_dmamap_err;
221
222 if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
223 mw->mw_sg, mw->mw_nents, mw->mw_dir))
224 goto out_dmamap_err;
225
226 for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
227 dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
228 rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
229 dma_pages[0]);
216 if (rc) 230 if (rc)
217 goto out_maperr; 231 goto out_maperr;
218 232
219 seg1->rl_mw = mw; 233 mw->mw_handle = mw->fmr.fm_mr->rkey;
220 seg1->mr_rkey = mw->fmr.fmr->rkey; 234 mw->mw_length = len;
221 seg1->mr_base = seg1->mr_dma + pageoff; 235 mw->mw_offset = dma_pages[0] + pageoff;
222 seg1->mr_nsegs = i;
223 seg1->mr_len = len;
224 return i;
225 236
226out_maperr: 237 *out = mw;
227 dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", 238 return mw->mw_nents;
228 __func__, len, (unsigned long long)seg1->mr_dma,
229 pageoff, i, rc);
230 while (i--)
231 rpcrdma_unmap_one(device, --seg);
232 return rc;
233}
234 239
235static void 240out_dmamap_err:
236__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) 241 pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
237{ 242 mw->mw_sg, mw->mw_nents);
238 struct ib_device *device = r_xprt->rx_ia.ri_device; 243 rpcrdma_defer_mr_recovery(mw);
239 int nsegs = seg->mr_nsegs; 244 return -EIO;
240 245
241 while (nsegs--) 246out_maperr:
242 rpcrdma_unmap_one(device, seg++); 247 pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
248 len, (unsigned long long)dma_pages[0],
249 pageoff, mw->mw_nents, rc);
250 rpcrdma_defer_mr_recovery(mw);
251 return -EIO;
243} 252}
244 253
245/* Invalidate all memory regions that were registered for "req". 254/* Invalidate all memory regions that were registered for "req".
246 * 255 *
247 * Sleeps until it is safe for the host CPU to access the 256 * Sleeps until it is safe for the host CPU to access the
248 * previously mapped memory regions. 257 * previously mapped memory regions.
258 *
259 * Caller ensures that req->rl_registered is not empty.
249 */ 260 */
250static void 261static void
251fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) 262fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
252{ 263{
253 struct rpcrdma_mr_seg *seg; 264 struct rpcrdma_mw *mw, *tmp;
254 unsigned int i, nchunks;
255 struct rpcrdma_mw *mw;
256 LIST_HEAD(unmap_list); 265 LIST_HEAD(unmap_list);
257 int rc; 266 int rc;
258 267
@@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
261 /* ORDER: Invalidate all of the req's MRs first 270 /* ORDER: Invalidate all of the req's MRs first
262 * 271 *
263 * ib_unmap_fmr() is slow, so use a single call instead 272 * ib_unmap_fmr() is slow, so use a single call instead
264 * of one call per mapped MR. 273 * of one call per mapped FMR.
265 */ 274 */
266 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 275 list_for_each_entry(mw, &req->rl_registered, mw_list)
267 seg = &req->rl_segments[i]; 276 list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
268 mw = seg->rl_mw;
269
270 list_add(&mw->fmr.fmr->list, &unmap_list);
271
272 i += seg->mr_nsegs;
273 }
274 rc = ib_unmap_fmr(&unmap_list); 277 rc = ib_unmap_fmr(&unmap_list);
275 if (rc) 278 if (rc)
276 pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); 279 goto out_reset;
277 280
278 /* ORDER: Now DMA unmap all of the req's MRs, and return 281 /* ORDER: Now DMA unmap all of the req's MRs, and return
279 * them to the free MW list. 282 * them to the free MW list.
280 */ 283 */
281 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 284 list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
282 seg = &req->rl_segments[i]; 285 list_del_init(&mw->mw_list);
286 list_del_init(&mw->fmr.fm_mr->list);
287 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
288 mw->mw_sg, mw->mw_nents, mw->mw_dir);
289 rpcrdma_put_mw(r_xprt, mw);
290 }
283 291
284 __fmr_dma_unmap(r_xprt, seg); 292 return;
285 rpcrdma_put_mw(r_xprt, seg->rl_mw);
286 293
287 i += seg->mr_nsegs; 294out_reset:
288 seg->mr_nsegs = 0; 295 pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
289 seg->rl_mw = NULL;
290 }
291 296
292 req->rl_nchunks = 0; 297 list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
298 list_del_init(&mw->fmr.fm_mr->list);
299 fmr_op_recover_mr(mw);
300 }
293} 301}
294 302
295/* Use a slow, safe mechanism to invalidate all memory regions 303/* Use a slow, safe mechanism to invalidate all memory regions
296 * that were registered for "req". 304 * that were registered for "req".
297 *
298 * In the asynchronous case, DMA unmapping occurs first here
299 * because the rpcrdma_mr_seg is released immediately after this
300 * call. It's contents won't be available in __fmr_dma_unmap later.
301 * FIXME.
302 */ 305 */
303static void 306static void
304fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 307fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
305 bool sync) 308 bool sync)
306{ 309{
307 struct rpcrdma_mr_seg *seg;
308 struct rpcrdma_mw *mw; 310 struct rpcrdma_mw *mw;
309 unsigned int i;
310
311 for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
312 seg = &req->rl_segments[i];
313 mw = seg->rl_mw;
314
315 if (sync) {
316 /* ORDER */
317 __fmr_unmap(mw);
318 __fmr_dma_unmap(r_xprt, seg);
319 rpcrdma_put_mw(r_xprt, mw);
320 } else {
321 __fmr_dma_unmap(r_xprt, seg);
322 __fmr_queue_recovery(mw);
323 }
324
325 i += seg->mr_nsegs;
326 seg->mr_nsegs = 0;
327 seg->rl_mw = NULL;
328 }
329}
330
331static void
332fmr_op_destroy(struct rpcrdma_buffer *buf)
333{
334 struct rpcrdma_mw *r;
335 int rc;
336
337 while (!list_empty(&buf->rb_all)) {
338 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
339 list_del(&r->mw_all);
340 kfree(r->fmr.physaddrs);
341 311
342 rc = ib_dealloc_fmr(r->fmr.fmr); 312 while (!list_empty(&req->rl_registered)) {
343 if (rc) 313 mw = list_first_entry(&req->rl_registered,
344 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", 314 struct rpcrdma_mw, mw_list);
345 __func__, rc); 315 list_del_init(&mw->mw_list);
346 316
347 kfree(r); 317 if (sync)
318 fmr_op_recover_mr(mw);
319 else
320 rpcrdma_defer_mr_recovery(mw);
348 } 321 }
349} 322}
350 323
@@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
352 .ro_map = fmr_op_map, 325 .ro_map = fmr_op_map,
353 .ro_unmap_sync = fmr_op_unmap_sync, 326 .ro_unmap_sync = fmr_op_unmap_sync,
354 .ro_unmap_safe = fmr_op_unmap_safe, 327 .ro_unmap_safe = fmr_op_unmap_safe,
328 .ro_recover_mr = fmr_op_recover_mr,
355 .ro_open = fmr_op_open, 329 .ro_open = fmr_op_open,
356 .ro_maxpages = fmr_op_maxpages, 330 .ro_maxpages = fmr_op_maxpages,
357 .ro_init = fmr_op_init, 331 .ro_init_mr = fmr_op_init_mr,
358 .ro_destroy = fmr_op_destroy, 332 .ro_release_mr = fmr_op_release_mr,
359 .ro_displayname = "fmr", 333 .ro_displayname = "fmr",
360}; 334};
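
The reworked fmr_ops.c above replaces the transport-wide ro_init/ro_destroy hooks with per-MR ro_init_mr/ro_release_mr plus a synchronous ro_recover_mr, all reached through the rpcrdma_memreg_ops method table. The sketch below models that kind of function-pointer strategy table in plain C; every name in it is invented and it makes no claim about the real xprtrdma types.

#include <stdio.h>

/* Illustrative model of a registration-strategy vtable in the spirit
 * of rpcrdma_memreg_ops: per-MR init/release/recover hooks instead of
 * a whole-buffer init/destroy pair.  Names and types are made up.
 */
struct toy_mr {
	int id;
};

struct toy_memreg_ops {
	int  (*init_mr)(struct toy_mr *mr);
	void (*release_mr)(struct toy_mr *mr);
	void (*recover_mr)(struct toy_mr *mr);
	const char *displayname;
};

static int  fake_init_mr(struct toy_mr *mr)    { mr->id = 42; return 0; }
static void fake_release_mr(struct toy_mr *mr) { mr->id = 0; }
static void fake_recover_mr(struct toy_mr *mr) { printf("recover MR %d\n", mr->id); }

static const struct toy_memreg_ops fake_ops = {
	.init_mr     = fake_init_mr,
	.release_mr  = fake_release_mr,
	.recover_mr  = fake_recover_mr,
	.displayname = "fake",
};

int main(void)
{
	struct toy_mr mr;
	const struct toy_memreg_ops *ops = &fake_ops;

	if (ops->init_mr(&mr) == 0) {
		ops->recover_mr(&mr);	/* reset one MR, not the whole pool */
		ops->release_mr(&mr);
	}
	printf("mode: %s\n", ops->displayname);
	return 0;
}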
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c0947544babe..892b5e1d9b09 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -73,29 +73,71 @@
73# define RPCDBG_FACILITY RPCDBG_TRANS 73# define RPCDBG_FACILITY RPCDBG_TRANS
74#endif 74#endif
75 75
76static struct workqueue_struct *frwr_recovery_wq; 76bool
77 77frwr_is_supported(struct rpcrdma_ia *ia)
78#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM) 78{
79 struct ib_device_attr *attrs = &ia->ri_device->attrs;
80
81 if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
82 goto out_not_supported;
83 if (attrs->max_fast_reg_page_list_len == 0)
84 goto out_not_supported;
85 return true;
86
87out_not_supported:
88 pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
89 ia->ri_device->name);
90 return false;
91}
79 92
80int 93static int
81frwr_alloc_recovery_wq(void) 94frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
82{ 95{
83 frwr_recovery_wq = alloc_workqueue("frwr_recovery", 96 unsigned int depth = ia->ri_max_frmr_depth;
84 FRWR_RECOVERY_WQ_FLAGS, 0); 97 struct rpcrdma_frmr *f = &r->frmr;
85 return !frwr_recovery_wq ? -ENOMEM : 0; 98 int rc;
99
100 f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
101 if (IS_ERR(f->fr_mr))
102 goto out_mr_err;
103
104 r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL);
105 if (!r->mw_sg)
106 goto out_list_err;
107
108 sg_init_table(r->mw_sg, depth);
109 init_completion(&f->fr_linv_done);
110 return 0;
111
112out_mr_err:
113 rc = PTR_ERR(f->fr_mr);
114 dprintk("RPC: %s: ib_alloc_mr status %i\n",
115 __func__, rc);
116 return rc;
117
118out_list_err:
119 rc = -ENOMEM;
120 dprintk("RPC: %s: sg allocation failure\n",
121 __func__);
122 ib_dereg_mr(f->fr_mr);
123 return rc;
86} 124}
87 125
88void 126static void
89frwr_destroy_recovery_wq(void) 127frwr_op_release_mr(struct rpcrdma_mw *r)
90{ 128{
91 struct workqueue_struct *wq; 129 int rc;
92 130
93 if (!frwr_recovery_wq) 131 /* Ensure MW is not on any rl_registered list */
94 return; 132 if (!list_empty(&r->mw_list))
133 list_del(&r->mw_list);
95 134
96 wq = frwr_recovery_wq; 135 rc = ib_dereg_mr(r->frmr.fr_mr);
97 frwr_recovery_wq = NULL; 136 if (rc)
98 destroy_workqueue(wq); 137 pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
138 r, rc);
139 kfree(r->mw_sg);
140 kfree(r);
99} 141}
100 142
101static int 143static int
@@ -124,93 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
124 return 0; 166 return 0;
125} 167}
126 168
127static void 169/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
128__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
129{
130 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
131 struct rpcrdma_frmr *f = &mw->frmr;
132 int rc;
133
134 rc = __frwr_reset_mr(ia, mw);
135 ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
136 if (rc)
137 return;
138
139 rpcrdma_put_mw(r_xprt, mw);
140}
141
142/* Deferred reset of a single FRMR. Generate a fresh rkey by
143 * replacing the MR.
144 * 170 *
145 * There's no recovery if this fails. The FRMR is abandoned, but 171 * There's no recovery if this fails. The FRMR is abandoned, but
146 * remains in rb_all. It will be cleaned up when the transport is 172 * remains in rb_all. It will be cleaned up when the transport is
147 * destroyed. 173 * destroyed.
148 */ 174 */
149static void 175static void
150__frwr_recovery_worker(struct work_struct *work) 176frwr_op_recover_mr(struct rpcrdma_mw *mw)
151{
152 struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
153 mw_work);
154
155 __frwr_reset_and_unmap(r->mw_xprt, r);
156 return;
157}
158
159/* A broken MR was discovered in a context that can't sleep.
160 * Defer recovery to the recovery worker.
161 */
162static void
163__frwr_queue_recovery(struct rpcrdma_mw *r)
164{
165 INIT_WORK(&r->mw_work, __frwr_recovery_worker);
166 queue_work(frwr_recovery_wq, &r->mw_work);
167}
168
169static int
170__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
171 unsigned int depth)
172{ 177{
173 struct rpcrdma_frmr *f = &r->frmr; 178 struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
179 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
174 int rc; 180 int rc;
175 181
176 f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); 182 rc = __frwr_reset_mr(ia, mw);
177 if (IS_ERR(f->fr_mr)) 183 ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
178 goto out_mr_err; 184 if (rc)
179 185 goto out_release;
180 f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
181 if (!f->fr_sg)
182 goto out_list_err;
183
184 sg_init_table(f->fr_sg, depth);
185
186 init_completion(&f->fr_linv_done);
187
188 return 0;
189 186
190out_mr_err: 187 rpcrdma_put_mw(r_xprt, mw);
191 rc = PTR_ERR(f->fr_mr); 188 r_xprt->rx_stats.mrs_recovered++;
192 dprintk("RPC: %s: ib_alloc_mr status %i\n", 189 return;
193 __func__, rc);
194 return rc;
195 190
196out_list_err: 191out_release:
197 rc = -ENOMEM; 192 pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw);
198 dprintk("RPC: %s: sg allocation failure\n", 193 r_xprt->rx_stats.mrs_orphaned++;
199 __func__);
200 ib_dereg_mr(f->fr_mr);
201 return rc;
202}
203 194
204static void 195 spin_lock(&r_xprt->rx_buf.rb_mwlock);
205__frwr_release(struct rpcrdma_mw *r) 196 list_del(&mw->mw_all);
206{ 197 spin_unlock(&r_xprt->rx_buf.rb_mwlock);
207 int rc;
208 198
209 rc = ib_dereg_mr(r->frmr.fr_mr); 199 frwr_op_release_mr(mw);
210 if (rc)
211 dprintk("RPC: %s: ib_dereg_mr status %i\n",
212 __func__, rc);
213 kfree(r->frmr.fr_sg);
214} 200}
215 201
216static int 202static int
@@ -346,57 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
346 complete_all(&frmr->fr_linv_done); 332 complete_all(&frmr->fr_linv_done);
347} 333}
348 334
349static int 335/* Post a REG_MR Work Request to register a memory region
350frwr_op_init(struct rpcrdma_xprt *r_xprt)
351{
352 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
353 struct ib_device *device = r_xprt->rx_ia.ri_device;
354 unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
355 struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
356 int i;
357
358 spin_lock_init(&buf->rb_mwlock);
359 INIT_LIST_HEAD(&buf->rb_mws);
360 INIT_LIST_HEAD(&buf->rb_all);
361
362 i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
363 i += 2; /* head + tail */
364 i *= buf->rb_max_requests; /* one set for each RPC slot */
365 dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
366
367 while (i--) {
368 struct rpcrdma_mw *r;
369 int rc;
370
371 r = kzalloc(sizeof(*r), GFP_KERNEL);
372 if (!r)
373 return -ENOMEM;
374
375 rc = __frwr_init(r, pd, device, depth);
376 if (rc) {
377 kfree(r);
378 return rc;
379 }
380
381 r->mw_xprt = r_xprt;
382 list_add(&r->mw_list, &buf->rb_mws);
383 list_add(&r->mw_all, &buf->rb_all);
384 }
385
386 return 0;
387}
388
389/* Post a FAST_REG Work Request to register a memory region
390 * for remote access via RDMA READ or RDMA WRITE. 336 * for remote access via RDMA READ or RDMA WRITE.
391 */ 337 */
392static int 338static int
393frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 339frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
394 int nsegs, bool writing) 340 int nsegs, bool writing, struct rpcrdma_mw **out)
395{ 341{
396 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 342 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
397 struct ib_device *device = ia->ri_device;
398 enum dma_data_direction direction = rpcrdma_data_dir(writing);
399 struct rpcrdma_mr_seg *seg1 = seg;
400 struct rpcrdma_mw *mw; 343 struct rpcrdma_mw *mw;
401 struct rpcrdma_frmr *frmr; 344 struct rpcrdma_frmr *frmr;
402 struct ib_mr *mr; 345 struct ib_mr *mr;
@@ -405,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
405 int rc, i, n, dma_nents; 348 int rc, i, n, dma_nents;
406 u8 key; 349 u8 key;
407 350
408 mw = seg1->rl_mw; 351 mw = NULL;
409 seg1->rl_mw = NULL;
410 do { 352 do {
411 if (mw) 353 if (mw)
412 __frwr_queue_recovery(mw); 354 rpcrdma_defer_mr_recovery(mw);
413 mw = rpcrdma_get_mw(r_xprt); 355 mw = rpcrdma_get_mw(r_xprt);
414 if (!mw) 356 if (!mw)
415 return -ENOMEM; 357 return -ENOBUFS;
416 } while (mw->frmr.fr_state != FRMR_IS_INVALID); 358 } while (mw->frmr.fr_state != FRMR_IS_INVALID);
417 frmr = &mw->frmr; 359 frmr = &mw->frmr;
418 frmr->fr_state = FRMR_IS_VALID; 360 frmr->fr_state = FRMR_IS_VALID;
@@ -421,15 +363,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
421 363
422 if (nsegs > ia->ri_max_frmr_depth) 364 if (nsegs > ia->ri_max_frmr_depth)
423 nsegs = ia->ri_max_frmr_depth; 365 nsegs = ia->ri_max_frmr_depth;
424
425 for (i = 0; i < nsegs;) { 366 for (i = 0; i < nsegs;) {
426 if (seg->mr_page) 367 if (seg->mr_page)
427 sg_set_page(&frmr->fr_sg[i], 368 sg_set_page(&mw->mw_sg[i],
428 seg->mr_page, 369 seg->mr_page,
429 seg->mr_len, 370 seg->mr_len,
430 offset_in_page(seg->mr_offset)); 371 offset_in_page(seg->mr_offset));
431 else 372 else
432 sg_set_buf(&frmr->fr_sg[i], seg->mr_offset, 373 sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
433 seg->mr_len); 374 seg->mr_len);
434 375
435 ++seg; 376 ++seg;
@@ -440,26 +381,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
440 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 381 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
441 break; 382 break;
442 } 383 }
443 frmr->fr_nents = i; 384 mw->mw_nents = i;
444 frmr->fr_dir = direction; 385 mw->mw_dir = rpcrdma_data_dir(writing);
445 386 if (i == 0)
446 dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction); 387 goto out_dmamap_err;
447 if (!dma_nents) {
448 pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
449 __func__, frmr->fr_sg, frmr->fr_nents);
450 return -ENOMEM;
451 }
452 388
453 n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE); 389 dma_nents = ib_dma_map_sg(ia->ri_device,
454 if (unlikely(n != frmr->fr_nents)) { 390 mw->mw_sg, mw->mw_nents, mw->mw_dir);
455 pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", 391 if (!dma_nents)
456 __func__, frmr->fr_mr, n, frmr->fr_nents); 392 goto out_dmamap_err;
457 rc = n < 0 ? n : -EINVAL; 393
458 goto out_senderr; 394 n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE);
459 } 395 if (unlikely(n != mw->mw_nents))
396 goto out_mapmr_err;
460 397
461 dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", 398 dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
462 __func__, mw, frmr->fr_nents, mr->length); 399 __func__, mw, mw->mw_nents, mr->length);
463 400
464 key = (u8)(mr->rkey & 0x000000FF); 401 key = (u8)(mr->rkey & 0x000000FF);
465 ib_update_fast_reg_key(mr, ++key); 402 ib_update_fast_reg_key(mr, ++key);
@@ -481,24 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
481 if (rc) 418 if (rc)
482 goto out_senderr; 419 goto out_senderr;
483 420
484 seg1->rl_mw = mw; 421 mw->mw_handle = mr->rkey;
485 seg1->mr_rkey = mr->rkey; 422 mw->mw_length = mr->length;
486 seg1->mr_base = mr->iova; 423 mw->mw_offset = mr->iova;
487 seg1->mr_nsegs = frmr->fr_nents; 424
488 seg1->mr_len = mr->length; 425 *out = mw;
426 return mw->mw_nents;
489 427
490 return frmr->fr_nents; 428out_dmamap_err:
429 pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
430 mw->mw_sg, mw->mw_nents);
431 rpcrdma_defer_mr_recovery(mw);
432 return -EIO;
433
434out_mapmr_err:
435 pr_err("rpcrdma: failed to map mr %p (%u/%u)\n",
436 frmr->fr_mr, n, mw->mw_nents);
437 rpcrdma_defer_mr_recovery(mw);
438 return -EIO;
491 439
492out_senderr: 440out_senderr:
493 dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); 441 pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
494 __frwr_queue_recovery(mw); 442 rpcrdma_defer_mr_recovery(mw);
495 return rc; 443 return -ENOTCONN;
496} 444}
497 445
498static struct ib_send_wr * 446static struct ib_send_wr *
499__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) 447__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
500{ 448{
501 struct rpcrdma_mw *mw = seg->rl_mw;
502 struct rpcrdma_frmr *f = &mw->frmr; 449 struct rpcrdma_frmr *f = &mw->frmr;
503 struct ib_send_wr *invalidate_wr; 450 struct ib_send_wr *invalidate_wr;
504 451
@@ -518,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
518 * 465 *
519 * Sleeps until it is safe for the host CPU to access the 466 * Sleeps until it is safe for the host CPU to access the
520 * previously mapped memory regions. 467 * previously mapped memory regions.
468 *
469 * Caller ensures that req->rl_registered is not empty.
521 */ 470 */
522static void 471static void
523frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) 472frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
524{ 473{
525 struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; 474 struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
526 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 475 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
527 struct rpcrdma_mr_seg *seg; 476 struct rpcrdma_mw *mw, *tmp;
528 unsigned int i, nchunks;
529 struct rpcrdma_frmr *f; 477 struct rpcrdma_frmr *f;
530 struct rpcrdma_mw *mw;
531 int rc; 478 int rc;
532 479
533 dprintk("RPC: %s: req %p\n", __func__, req); 480 dprintk("RPC: %s: req %p\n", __func__, req);
@@ -537,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
537 * Chain the LOCAL_INV Work Requests and post them with 484 * Chain the LOCAL_INV Work Requests and post them with
538 * a single ib_post_send() call. 485 * a single ib_post_send() call.
539 */ 486 */
487 f = NULL;
540 invalidate_wrs = pos = prev = NULL; 488 invalidate_wrs = pos = prev = NULL;
541 seg = NULL; 489 list_for_each_entry(mw, &req->rl_registered, mw_list) {
542 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 490 pos = __frwr_prepare_linv_wr(mw);
543 seg = &req->rl_segments[i];
544
545 pos = __frwr_prepare_linv_wr(seg);
546 491
547 if (!invalidate_wrs) 492 if (!invalidate_wrs)
548 invalidate_wrs = pos; 493 invalidate_wrs = pos;
549 else 494 else
550 prev->next = pos; 495 prev->next = pos;
551 prev = pos; 496 prev = pos;
552 497 f = &mw->frmr;
553 i += seg->mr_nsegs;
554 } 498 }
555 f = &seg->rl_mw->frmr;
556 499
557 /* Strong send queue ordering guarantees that when the 500 /* Strong send queue ordering guarantees that when the
558 * last WR in the chain completes, all WRs in the chain 501 * last WR in the chain completes, all WRs in the chain
@@ -577,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
577 * them to the free MW list. 520 * them to the free MW list.
578 */ 521 */
579unmap: 522unmap:
580 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 523 list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
581 seg = &req->rl_segments[i]; 524 list_del_init(&mw->mw_list);
582 mw = seg->rl_mw; 525 ib_dma_unmap_sg(ia->ri_device,
583 seg->rl_mw = NULL; 526 mw->mw_sg, mw->mw_nents, mw->mw_dir);
584
585 ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
586 f->fr_dir);
587 rpcrdma_put_mw(r_xprt, mw); 527 rpcrdma_put_mw(r_xprt, mw);
588
589 i += seg->mr_nsegs;
590 seg->mr_nsegs = 0;
591 } 528 }
592
593 req->rl_nchunks = 0;
594 return; 529 return;
595 530
596reset_mrs: 531reset_mrs:
597 pr_warn("%s: ib_post_send failed %i\n", __func__, rc); 532 pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc);
533 rdma_disconnect(ia->ri_id);
598 534
599 /* Find and reset the MRs in the LOCAL_INV WRs that did not 535 /* Find and reset the MRs in the LOCAL_INV WRs that did not
600 * get posted. This is synchronous, and slow. 536 * get posted. This is synchronous, and slow.
601 */ 537 */
602 for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 538 list_for_each_entry(mw, &req->rl_registered, mw_list) {
603 seg = &req->rl_segments[i];
604 mw = seg->rl_mw;
605 f = &mw->frmr; 539 f = &mw->frmr;
606
607 if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { 540 if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
608 __frwr_reset_mr(ia, mw); 541 __frwr_reset_mr(ia, mw);
609 bad_wr = bad_wr->next; 542 bad_wr = bad_wr->next;
610 } 543 }
611
612 i += seg->mr_nsegs;
613 } 544 }
614 goto unmap; 545 goto unmap;
615} 546}
@@ -621,38 +552,17 @@ static void
621frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 552frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
622 bool sync) 553 bool sync)
623{ 554{
624 struct rpcrdma_mr_seg *seg;
625 struct rpcrdma_mw *mw; 555 struct rpcrdma_mw *mw;
626 unsigned int i;
627 556
628 for (i = 0; req->rl_nchunks; req->rl_nchunks--) { 557 while (!list_empty(&req->rl_registered)) {
629 seg = &req->rl_segments[i]; 558 mw = list_first_entry(&req->rl_registered,
630 mw = seg->rl_mw; 559 struct rpcrdma_mw, mw_list);
560 list_del_init(&mw->mw_list);
631 561
632 if (sync) 562 if (sync)
633 __frwr_reset_and_unmap(r_xprt, mw); 563 frwr_op_recover_mr(mw);
634 else 564 else
635 __frwr_queue_recovery(mw); 565 rpcrdma_defer_mr_recovery(mw);
636
637 i += seg->mr_nsegs;
638 seg->mr_nsegs = 0;
639 seg->rl_mw = NULL;
640 }
641}
642
643static void
644frwr_op_destroy(struct rpcrdma_buffer *buf)
645{
646 struct rpcrdma_mw *r;
647
648 /* Ensure stale MWs for "buf" are no longer in flight */
649 flush_workqueue(frwr_recovery_wq);
650
651 while (!list_empty(&buf->rb_all)) {
652 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
653 list_del(&r->mw_all);
654 __frwr_release(r);
655 kfree(r);
656 } 566 }
657} 567}
658 568
@@ -660,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
660 .ro_map = frwr_op_map, 570 .ro_map = frwr_op_map,
661 .ro_unmap_sync = frwr_op_unmap_sync, 571 .ro_unmap_sync = frwr_op_unmap_sync,
662 .ro_unmap_safe = frwr_op_unmap_safe, 572 .ro_unmap_safe = frwr_op_unmap_safe,
573 .ro_recover_mr = frwr_op_recover_mr,
663 .ro_open = frwr_op_open, 574 .ro_open = frwr_op_open,
664 .ro_maxpages = frwr_op_maxpages, 575 .ro_maxpages = frwr_op_maxpages,
665 .ro_init = frwr_op_init, 576 .ro_init_mr = frwr_op_init_mr,
666 .ro_destroy = frwr_op_destroy, 577 .ro_release_mr = frwr_op_release_mr,
667 .ro_displayname = "frwr", 578 .ro_displayname = "frwr",
668}; 579};
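
One detail worth calling out in frwr_op_map() above is the rkey rotation: key = (u8)(mr->rkey & 0x000000FF) followed by ib_update_fast_reg_key(mr, ++key) yields a fresh rkey whose low byte acts as a generation counter. The sketch below models just that arithmetic, under the assumption that only the low 8 bits rotate while the upper bits stay fixed.

#include <stdint.h>
#include <stdio.h>

/* Model of the rkey "key" rotation used before each FRWR registration:
 * the low 8 bits behave like a generation counter while the upper
 * 24 bits stay fixed (assumption: only the low byte rotates).
 */
static uint32_t bump_rkey(uint32_t rkey)
{
	uint8_t key = (uint8_t)(rkey & 0x000000FF);

	key++;				/* new generation */
	return (rkey & 0xFFFFFF00) | key;
}

int main(void)
{
	uint32_t rkey = 0x1234ABFF;

	rkey = bump_rkey(rkey);		/* low byte wraps FF -> 00 */
	printf("new rkey: 0x%08X\n", (unsigned)rkey);
	return 0;
}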
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
deleted file mode 100644
index 3750596cc432..000000000000
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2015 Oracle. All rights reserved.
- * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
- */
-
-/* No-op chunk preparation. All client memory is pre-registered.
- * Sometimes referred to as ALLPHYSICAL mode.
- *
- * Physical registration is simple because all client memory is
- * pre-registered and never deregistered. This mode is good for
- * adapter bring up, but is considered not safe: the server is
- * trusted not to abuse its access to client memory not involved
- * in RDMA I/O.
- */
-
-#include "xprt_rdma.h"
-
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY	RPCDBG_TRANS
-#endif
-
-static int
-physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
-		 struct rpcrdma_create_data_internal *cdata)
-{
-	struct ib_mr *mr;
-
-	/* Obtain an rkey to use for RPC data payloads.
-	 */
-	mr = ib_get_dma_mr(ia->ri_pd,
-			   IB_ACCESS_LOCAL_WRITE |
-			   IB_ACCESS_REMOTE_WRITE |
-			   IB_ACCESS_REMOTE_READ);
-	if (IS_ERR(mr)) {
-		pr_err("%s: ib_get_dma_mr for failed with %lX\n",
-		       __func__, PTR_ERR(mr));
-		return -ENOMEM;
-	}
-	ia->ri_dma_mr = mr;
-
-	rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
-						      RPCRDMA_MAX_DATA_SEGS,
-						      RPCRDMA_MAX_HDR_SEGS));
-	return 0;
-}
-
-/* PHYSICAL memory registration conveys one page per chunk segment.
- */
-static size_t
-physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
-{
-	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
-		     RPCRDMA_MAX_HDR_SEGS);
-}
-
-static int
-physical_op_init(struct rpcrdma_xprt *r_xprt)
-{
-	return 0;
-}
-
-/* The client's physical memory is already exposed for
- * remote access via RDMA READ or RDMA WRITE.
- */
-static int
-physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
-		int nsegs, bool writing)
-{
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
-	rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
-	seg->mr_rkey = ia->ri_dma_mr->rkey;
-	seg->mr_base = seg->mr_dma;
-	return 1;
-}
-
-/* DMA unmap all memory regions that were mapped for "req".
- */
-static void
-physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
-{
-	struct ib_device *device = r_xprt->rx_ia.ri_device;
-	unsigned int i;
-
-	for (i = 0; req->rl_nchunks; --req->rl_nchunks)
-		rpcrdma_unmap_one(device, &req->rl_segments[i++]);
-}
-
-/* Use a slow, safe mechanism to invalidate all memory regions
- * that were registered for "req".
- *
- * For physical memory registration, there is no good way to
- * fence a single MR that has been advertised to the server. The
- * client has already handed the server an R_key that cannot be
- * invalidated and is shared by all MRs on this connection.
- * Tearing down the PD might be the only safe choice, but it's
- * not clear that a freshly acquired DMA R_key would be different
- * than the one used by the PD that was just destroyed.
- * FIXME.
- */
-static void
-physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
-		       bool sync)
-{
-	physical_op_unmap_sync(r_xprt, req);
-}
-
-static void
-physical_op_destroy(struct rpcrdma_buffer *buf)
-{
-}
-
-const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
-	.ro_map		= physical_op_map,
-	.ro_unmap_sync	= physical_op_unmap_sync,
-	.ro_unmap_safe	= physical_op_unmap_safe,
-	.ro_open	= physical_op_open,
-	.ro_maxpages	= physical_op_maxpages,
-	.ro_init	= physical_op_init,
-	.ro_destroy	= physical_op_destroy,
-	.ro_displayname	= "physical",
-};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 35a81096e83d..a47f170b20ef 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf)
196 * MR when they can. 196 * MR when they can.
197 */ 197 */
198static int 198static int
199rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, 199rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
200 int n, int nsegs)
201{ 200{
202 size_t page_offset; 201 size_t page_offset;
203 u32 remaining; 202 u32 remaining;
@@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
206 base = vec->iov_base; 205 base = vec->iov_base;
207 page_offset = offset_in_page(base); 206 page_offset = offset_in_page(base);
208 remaining = vec->iov_len; 207 remaining = vec->iov_len;
209 while (remaining && n < nsegs) { 208 while (remaining && n < RPCRDMA_MAX_SEGS) {
210 seg[n].mr_page = NULL; 209 seg[n].mr_page = NULL;
211 seg[n].mr_offset = base; 210 seg[n].mr_offset = base;
212 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); 211 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
@@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
230 229
231static int 230static int
232rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, 231rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
233 enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) 232 enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
234{ 233{
235 int len, n = 0, p; 234 int len, n, p, page_base;
236 int page_base;
237 struct page **ppages; 235 struct page **ppages;
238 236
237 n = 0;
239 if (pos == 0) { 238 if (pos == 0) {
240 n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs); 239 n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n);
241 if (n == nsegs) 240 if (n == RPCRDMA_MAX_SEGS)
242 return -EIO; 241 goto out_overflow;
243 } 242 }
244 243
245 len = xdrbuf->page_len; 244 len = xdrbuf->page_len;
246 ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); 245 ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
247 page_base = xdrbuf->page_base & ~PAGE_MASK; 246 page_base = xdrbuf->page_base & ~PAGE_MASK;
248 p = 0; 247 p = 0;
249 while (len && n < nsegs) { 248 while (len && n < RPCRDMA_MAX_SEGS) {
250 if (!ppages[p]) { 249 if (!ppages[p]) {
251 /* alloc the pagelist for receiving buffer */ 250 /* alloc the pagelist for receiving buffer */
252 ppages[p] = alloc_page(GFP_ATOMIC); 251 ppages[p] = alloc_page(GFP_ATOMIC);
253 if (!ppages[p]) 252 if (!ppages[p])
254 return -ENOMEM; 253 return -EAGAIN;
255 } 254 }
256 seg[n].mr_page = ppages[p]; 255 seg[n].mr_page = ppages[p];
257 seg[n].mr_offset = (void *)(unsigned long) page_base; 256 seg[n].mr_offset = (void *)(unsigned long) page_base;
258 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); 257 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
259 if (seg[n].mr_len > PAGE_SIZE) 258 if (seg[n].mr_len > PAGE_SIZE)
260 return -EIO; 259 goto out_overflow;
261 len -= seg[n].mr_len; 260 len -= seg[n].mr_len;
262 ++n; 261 ++n;
263 ++p; 262 ++p;
@@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
265 } 264 }
266 265
267 /* Message overflows the seg array */ 266 /* Message overflows the seg array */
268 if (len && n == nsegs) 267 if (len && n == RPCRDMA_MAX_SEGS)
269 return -EIO; 268 goto out_overflow;
270 269
271 /* When encoding the read list, the tail is always sent inline */ 270 /* When encoding the read list, the tail is always sent inline */
272 if (type == rpcrdma_readch) 271 if (type == rpcrdma_readch)
@@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
277 * xdr pad bytes, saving the server an RDMA operation. */ 276 * xdr pad bytes, saving the server an RDMA operation. */
278 if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) 277 if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
279 return n; 278 return n;
280 n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs); 279 n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
281 if (n == nsegs) 280 if (n == RPCRDMA_MAX_SEGS)
282 return -EIO; 281 goto out_overflow;
283 } 282 }
284 283
285 return n; 284 return n;
285
286out_overflow:
287 pr_err("rpcrdma: segment array overflow\n");
288 return -EIO;
286} 289}
287 290
288static inline __be32 * 291static inline __be32 *
289xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg) 292xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
290{ 293{
291 *iptr++ = cpu_to_be32(seg->mr_rkey); 294 *iptr++ = cpu_to_be32(mw->mw_handle);
292 *iptr++ = cpu_to_be32(seg->mr_len); 295 *iptr++ = cpu_to_be32(mw->mw_length);
293 return xdr_encode_hyper(iptr, seg->mr_base); 296 return xdr_encode_hyper(iptr, mw->mw_offset);
294} 297}
295 298
296/* XDR-encode the Read list. Supports encoding a list of read 299/* XDR-encode the Read list. Supports encoding a list of read
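
rpcrdma_convert_kvec() and rpcrdma_convert_iovs() chop the RPC buffer into page-sized rpcrdma_mr_seg entries; the hunks above drop the caller-supplied nsegs bound in favour of the fixed RPCRDMA_MAX_SEGS limit and a single out_overflow exit that logs the failure. A simplified user-space sketch of that splitting loop follows; the 4 KiB page size and the tiny MAX_SEGS value are assumptions for illustration, not kernel constants.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define SEG_PAGE_SIZE   4096UL  /* assumed page size for this example */
#define MAX_SEGS        8       /* stand-in for RPCRDMA_MAX_SEGS */

struct seg {
        const char *base;       /* start of this piece */
        size_t len;             /* never crosses a page boundary */
};

/* Split [base, base+len) into page-bounded pieces starting at index n.
 * Returns the new segment count, or -1 if the fixed array would
 * overflow (the analogue of the out_overflow path above).
 */
static int convert_buf(const char *base, size_t len, struct seg *seg, int n)
{
        size_t page_offset = (uintptr_t)base & (SEG_PAGE_SIZE - 1);

        while (len && n < MAX_SEGS) {
                size_t chunk = SEG_PAGE_SIZE - page_offset;

                if (chunk > len)
                        chunk = len;
                seg[n].base = base;
                seg[n].len = chunk;
                base += chunk;
                len -= chunk;
                page_offset = 0;        /* later pieces start page-aligned */
                n++;
        }
        if (len) {
                fprintf(stderr, "segment array overflow\n");
                return -1;
        }
        return n;
}

int main(void)
{
        static char payload[3 * 4096 + 100];
        struct seg segs[MAX_SEGS];
        int n = convert_buf(payload + 200, sizeof(payload) - 200, segs, 0);

        for (int i = 0; i < n; i++)
                printf("seg[%d]: %zu bytes\n", i, segs[i].len);
        return 0;
}
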
@@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
310 struct rpcrdma_req *req, struct rpc_rqst *rqst, 313 struct rpcrdma_req *req, struct rpc_rqst *rqst,
311 __be32 *iptr, enum rpcrdma_chunktype rtype) 314 __be32 *iptr, enum rpcrdma_chunktype rtype)
312{ 315{
313 struct rpcrdma_mr_seg *seg = req->rl_nextseg; 316 struct rpcrdma_mr_seg *seg;
317 struct rpcrdma_mw *mw;
314 unsigned int pos; 318 unsigned int pos;
315 int n, nsegs; 319 int n, nsegs;
316 320
@@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
322 pos = rqst->rq_snd_buf.head[0].iov_len; 326 pos = rqst->rq_snd_buf.head[0].iov_len;
323 if (rtype == rpcrdma_areadch) 327 if (rtype == rpcrdma_areadch)
324 pos = 0; 328 pos = 0;
325 nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, 329 seg = req->rl_segments;
326 RPCRDMA_MAX_SEGS - req->rl_nchunks); 330 nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
327 if (nsegs < 0) 331 if (nsegs < 0)
328 return ERR_PTR(nsegs); 332 return ERR_PTR(nsegs);
329 333
330 do { 334 do {
331 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false); 335 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
332 if (n <= 0) 336 false, &mw);
337 if (n < 0)
333 return ERR_PTR(n); 338 return ERR_PTR(n);
339 list_add(&mw->mw_list, &req->rl_registered);
334 340
335 *iptr++ = xdr_one; /* item present */ 341 *iptr++ = xdr_one; /* item present */
336 342
@@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
338 * have the same "position". 344 * have the same "position".
339 */ 345 */
340 *iptr++ = cpu_to_be32(pos); 346 *iptr++ = cpu_to_be32(pos);
341 iptr = xdr_encode_rdma_segment(iptr, seg); 347 iptr = xdr_encode_rdma_segment(iptr, mw);
342 348
343 dprintk("RPC: %5u %s: read segment pos %u " 349 dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
344 "%d@0x%016llx:0x%08x (%s)\n",
345 rqst->rq_task->tk_pid, __func__, pos, 350 rqst->rq_task->tk_pid, __func__, pos,
346 seg->mr_len, (unsigned long long)seg->mr_base, 351 mw->mw_length, (unsigned long long)mw->mw_offset,
347 seg->mr_rkey, n < nsegs ? "more" : "last"); 352 mw->mw_handle, n < nsegs ? "more" : "last");
348 353
349 r_xprt->rx_stats.read_chunk_count++; 354 r_xprt->rx_stats.read_chunk_count++;
350 req->rl_nchunks++;
351 seg += n; 355 seg += n;
352 nsegs -= n; 356 nsegs -= n;
353 } while (nsegs); 357 } while (nsegs);
354 req->rl_nextseg = seg;
355 358
356 /* Finish Read list */ 359 /* Finish Read list */
357 *iptr++ = xdr_zero; /* Next item not present */ 360 *iptr++ = xdr_zero; /* Next item not present */
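
With chunk state moving from rpcrdma_mr_seg into rpcrdma_mw, each Read list entry above is now encoded from the MW's handle, length, and offset. On the wire a segment is simply two XDR (big-endian) 32-bit words followed by a 64-bit offset; a stand-alone sketch of that encoding follows, with struct and field names chosen for illustration.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>          /* htonl()/ntohl() */

struct mw {                     /* registration result, like rpcrdma_mw */
        uint32_t handle;        /* R_key the server will use */
        uint32_t length;        /* number of registered bytes */
        uint64_t offset;        /* address of the first registered byte */
};

/* One RDMA segment on the wire: handle, length, then the 64-bit offset
 * split into two big-endian words (the job xdr_encode_hyper() does).
 */
static uint32_t *encode_rdma_segment(uint32_t *p, const struct mw *mw)
{
        *p++ = htonl(mw->handle);
        *p++ = htonl(mw->length);
        *p++ = htonl((uint32_t)(mw->offset >> 32));
        *p++ = htonl((uint32_t)(mw->offset & 0xffffffffUL));
        return p;
}

int main(void)
{
        struct mw mw = { 0x1234abcd, 8192, 0x0000700000001000ULL };
        uint32_t buf[4];

        encode_rdma_segment(buf, &mw);
        printf("%u@0x%016llx:0x%08x\n", mw.length,
               (unsigned long long)mw.offset, mw.handle);
        for (int i = 0; i < 4; i++)
                printf("wire word %d: 0x%08x\n", i, ntohl(buf[i]));
        return 0;
}
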
@@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
375 struct rpc_rqst *rqst, __be32 *iptr, 378 struct rpc_rqst *rqst, __be32 *iptr,
376 enum rpcrdma_chunktype wtype) 379 enum rpcrdma_chunktype wtype)
377{ 380{
378 struct rpcrdma_mr_seg *seg = req->rl_nextseg; 381 struct rpcrdma_mr_seg *seg;
382 struct rpcrdma_mw *mw;
379 int n, nsegs, nchunks; 383 int n, nsegs, nchunks;
380 __be32 *segcount; 384 __be32 *segcount;
381 385
@@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
384 return iptr; 388 return iptr;
385 } 389 }
386 390
391 seg = req->rl_segments;
387 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 392 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
388 rqst->rq_rcv_buf.head[0].iov_len, 393 rqst->rq_rcv_buf.head[0].iov_len,
389 wtype, seg, 394 wtype, seg);
390 RPCRDMA_MAX_SEGS - req->rl_nchunks);
391 if (nsegs < 0) 395 if (nsegs < 0)
392 return ERR_PTR(nsegs); 396 return ERR_PTR(nsegs);
393 397
@@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
396 400
397 nchunks = 0; 401 nchunks = 0;
398 do { 402 do {
399 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); 403 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
400 if (n <= 0) 404 true, &mw);
405 if (n < 0)
401 return ERR_PTR(n); 406 return ERR_PTR(n);
407 list_add(&mw->mw_list, &req->rl_registered);
402 408
403 iptr = xdr_encode_rdma_segment(iptr, seg); 409 iptr = xdr_encode_rdma_segment(iptr, mw);
404 410
405 dprintk("RPC: %5u %s: write segment " 411 dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
406 "%d@0x016%llx:0x%08x (%s)\n",
407 rqst->rq_task->tk_pid, __func__, 412 rqst->rq_task->tk_pid, __func__,
408 seg->mr_len, (unsigned long long)seg->mr_base, 413 mw->mw_length, (unsigned long long)mw->mw_offset,
409 seg->mr_rkey, n < nsegs ? "more" : "last"); 414 mw->mw_handle, n < nsegs ? "more" : "last");
410 415
411 r_xprt->rx_stats.write_chunk_count++; 416 r_xprt->rx_stats.write_chunk_count++;
412 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 417 r_xprt->rx_stats.total_rdma_request += seg->mr_len;
413 req->rl_nchunks++;
414 nchunks++; 418 nchunks++;
415 seg += n; 419 seg += n;
416 nsegs -= n; 420 nsegs -= n;
417 } while (nsegs); 421 } while (nsegs);
418 req->rl_nextseg = seg;
419 422
420 /* Update count of segments in this Write chunk */ 423 /* Update count of segments in this Write chunk */
421 *segcount = cpu_to_be32(nchunks); 424 *segcount = cpu_to_be32(nchunks);
@@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
442 struct rpcrdma_req *req, struct rpc_rqst *rqst, 445 struct rpcrdma_req *req, struct rpc_rqst *rqst,
443 __be32 *iptr, enum rpcrdma_chunktype wtype) 446 __be32 *iptr, enum rpcrdma_chunktype wtype)
444{ 447{
445 struct rpcrdma_mr_seg *seg = req->rl_nextseg; 448 struct rpcrdma_mr_seg *seg;
449 struct rpcrdma_mw *mw;
446 int n, nsegs, nchunks; 450 int n, nsegs, nchunks;
447 __be32 *segcount; 451 __be32 *segcount;
448 452
@@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
451 return iptr; 455 return iptr;
452 } 456 }
453 457
454 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, 458 seg = req->rl_segments;
455 RPCRDMA_MAX_SEGS - req->rl_nchunks); 459 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
456 if (nsegs < 0) 460 if (nsegs < 0)
457 return ERR_PTR(nsegs); 461 return ERR_PTR(nsegs);
458 462
@@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
461 465
462 nchunks = 0; 466 nchunks = 0;
463 do { 467 do {
464 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); 468 n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
465 if (n <= 0) 469 true, &mw);
470 if (n < 0)
466 return ERR_PTR(n); 471 return ERR_PTR(n);
472 list_add(&mw->mw_list, &req->rl_registered);
467 473
468 iptr = xdr_encode_rdma_segment(iptr, seg); 474 iptr = xdr_encode_rdma_segment(iptr, mw);
469 475
470 dprintk("RPC: %5u %s: reply segment " 476 dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
471 "%d@0x%016llx:0x%08x (%s)\n",
472 rqst->rq_task->tk_pid, __func__, 477 rqst->rq_task->tk_pid, __func__,
473 seg->mr_len, (unsigned long long)seg->mr_base, 478 mw->mw_length, (unsigned long long)mw->mw_offset,
474 seg->mr_rkey, n < nsegs ? "more" : "last"); 479 mw->mw_handle, n < nsegs ? "more" : "last");
475 480
476 r_xprt->rx_stats.reply_chunk_count++; 481 r_xprt->rx_stats.reply_chunk_count++;
477 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 482 r_xprt->rx_stats.total_rdma_request += seg->mr_len;
478 req->rl_nchunks++;
479 nchunks++; 483 nchunks++;
480 seg += n; 484 seg += n;
481 nsegs -= n; 485 nsegs -= n;
482 } while (nsegs); 486 } while (nsegs);
483 req->rl_nextseg = seg;
484 487
485 /* Update count of segments in the Reply chunk */ 488 /* Update count of segments in the Reply chunk */
486 *segcount = cpu_to_be32(nchunks); 489 *segcount = cpu_to_be32(nchunks);
@@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
567 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 570 struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
568 enum rpcrdma_chunktype rtype, wtype; 571 enum rpcrdma_chunktype rtype, wtype;
569 struct rpcrdma_msg *headerp; 572 struct rpcrdma_msg *headerp;
573 bool ddp_allowed;
570 ssize_t hdrlen; 574 ssize_t hdrlen;
571 size_t rpclen; 575 size_t rpclen;
572 __be32 *iptr; 576 __be32 *iptr;
@@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
583 headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); 587 headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
584 headerp->rm_type = rdma_msg; 588 headerp->rm_type = rdma_msg;
585 589
590 /* When the ULP employs a GSS flavor that guarantees integrity
591 * or privacy, direct data placement of individual data items
592 * is not allowed.
593 */
594 ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
595 RPCAUTH_AUTH_DATATOUCH);
596
586 /* 597 /*
587 * Chunks needed for results? 598 * Chunks needed for results?
588 * 599 *
@@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
594 */ 605 */
595 if (rpcrdma_results_inline(r_xprt, rqst)) 606 if (rpcrdma_results_inline(r_xprt, rqst))
596 wtype = rpcrdma_noch; 607 wtype = rpcrdma_noch;
597 else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) 608 else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
598 wtype = rpcrdma_writech; 609 wtype = rpcrdma_writech;
599 else 610 else
600 wtype = rpcrdma_replych; 611 wtype = rpcrdma_replych;
@@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
617 rtype = rpcrdma_noch; 628 rtype = rpcrdma_noch;
618 rpcrdma_inline_pullup(rqst); 629 rpcrdma_inline_pullup(rqst);
619 rpclen = rqst->rq_svec[0].iov_len; 630 rpclen = rqst->rq_svec[0].iov_len;
620 } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { 631 } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
621 rtype = rpcrdma_readch; 632 rtype = rpcrdma_readch;
622 rpclen = rqst->rq_svec[0].iov_len; 633 rpclen = rqst->rq_svec[0].iov_len;
623 rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); 634 rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
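
The hunks above gate direct data placement on the GSS flavor: when ddp_allowed is false, data items may not be carried in Write or Read chunks, so large replies fall back to a Reply chunk and large calls to a Position Zero Read chunk. A condensed sketch of that selection logic follows; the flag names and the inline test are simplified stand-ins, not the kernel's.

#include <stdio.h>
#include <stdbool.h>

enum chunktype { NOCH, READCH, AREADCH, WRITECH, REPLYCH };

/* Reply direction: may the server RDMA Write results directly? */
static enum chunktype choose_wtype(bool fits_inline, bool has_read_flag,
                                   bool ddp_allowed)
{
        if (fits_inline)
                return NOCH;            /* small reply, received inline */
        if (ddp_allowed && has_read_flag)
                return WRITECH;         /* data items via a Write chunk */
        return REPLYCH;                 /* whole reply via a Reply chunk */
}

/* Call direction: may the server RDMA Read argument data directly? */
static enum chunktype choose_rtype(bool fits_inline, bool has_write_flag,
                                   bool ddp_allowed)
{
        if (fits_inline)
                return NOCH;            /* pull everything up inline */
        if (ddp_allowed && has_write_flag)
                return READCH;          /* data items via Read chunks */
        return AREADCH;                 /* Position Zero Read chunk */
}

int main(void)
{
        printf("krb5i large READ reply -> wtype %d\n",
               choose_wtype(false, true, false));       /* REPLYCH */
        printf("sys   large READ reply -> wtype %d\n",
               choose_wtype(false, true, true));        /* WRITECH */
        printf("sys   large WRITE call -> rtype %d\n",
               choose_rtype(false, true, true));        /* READCH */
        return 0;
}
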
@@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
650 * send a Call message with a Position Zero Read chunk and a 661 * send a Call message with a Position Zero Read chunk and a
651 * regular Read chunk at the same time. 662 * regular Read chunk at the same time.
652 */ 663 */
653 req->rl_nchunks = 0;
654 req->rl_nextseg = req->rl_segments;
655 iptr = headerp->rm_body.rm_chunks; 664 iptr = headerp->rm_body.rm_chunks;
656 iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); 665 iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
657 if (IS_ERR(iptr)) 666 if (IS_ERR(iptr))
@@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
690out_overflow: 699out_overflow:
691 pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", 700 pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
692 hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); 701 hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
693 /* Terminate this RPC. Chunks registered above will be 702 iptr = ERR_PTR(-EIO);
694 * released by xprt_release -> xprt_rmda_free .
695 */
696 return -EIO;
697 703
698out_unmap: 704out_unmap:
699 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); 705 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
@@ -705,15 +711,13 @@ out_unmap:
705 * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) 711 * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
706 */ 712 */
707static int 713static int
708rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp) 714rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
709{ 715{
710 unsigned int i, total_len; 716 unsigned int i, total_len;
711 struct rpcrdma_write_chunk *cur_wchunk; 717 struct rpcrdma_write_chunk *cur_wchunk;
712 char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); 718 char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
713 719
714 i = be32_to_cpu(**iptrp); 720 i = be32_to_cpu(**iptrp);
715 if (i > max)
716 return -1;
717 cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); 721 cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
718 total_len = 0; 722 total_len = 0;
719 while (i--) { 723 while (i--) {
@@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
744 return total_len; 748 return total_len;
745} 749}
746 750
747/* 751/**
748 * Scatter inline received data back into provided iov's. 752 * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
753 * @rqst: controlling RPC request
754 * @srcp: points to RPC message payload in receive buffer
755 * @copy_len: remaining length of receive buffer content
756 * @pad: Write chunk pad bytes needed (zero for pure inline)
757 *
758 * The upper layer has set the maximum number of bytes it can
759 * receive in each component of rq_rcv_buf. These values are set in
760 * the head.iov_len, page_len, tail.iov_len, and buflen fields.
761 *
762 * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
763 * many cases this function simply updates iov_base pointers in
764 * rq_rcv_buf to point directly to the received reply data, to
765 * avoid copying reply data.
766 *
767 * Returns the count of bytes which had to be memcopied.
749 */ 768 */
750static void 769static unsigned long
751rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) 770rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
752{ 771{
753 int i, npages, curlen, olen; 772 unsigned long fixup_copy_count;
773 int i, npages, curlen;
754 char *destp; 774 char *destp;
755 struct page **ppages; 775 struct page **ppages;
756 int page_base; 776 int page_base;
757 777
778 /* The head iovec is redirected to the RPC reply message
779 * in the receive buffer, to avoid a memcopy.
780 */
781 rqst->rq_rcv_buf.head[0].iov_base = srcp;
782 rqst->rq_private_buf.head[0].iov_base = srcp;
783
784 /* The contents of the receive buffer that follow
785 * head.iov_len bytes are copied into the page list.
786 */
758 curlen = rqst->rq_rcv_buf.head[0].iov_len; 787 curlen = rqst->rq_rcv_buf.head[0].iov_len;
759 if (curlen > copy_len) { /* write chunk header fixup */ 788 if (curlen > copy_len)
760 curlen = copy_len; 789 curlen = copy_len;
761 rqst->rq_rcv_buf.head[0].iov_len = curlen;
762 }
763
764 dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", 790 dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n",
765 __func__, srcp, copy_len, curlen); 791 __func__, srcp, copy_len, curlen);
766
767 /* Shift pointer for first receive segment only */
768 rqst->rq_rcv_buf.head[0].iov_base = srcp;
769 srcp += curlen; 792 srcp += curlen;
770 copy_len -= curlen; 793 copy_len -= curlen;
771 794
772 olen = copy_len;
773 i = 0;
774 rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
775 page_base = rqst->rq_rcv_buf.page_base; 795 page_base = rqst->rq_rcv_buf.page_base;
776 ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); 796 ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
777 page_base &= ~PAGE_MASK; 797 page_base &= ~PAGE_MASK;
778 798 fixup_copy_count = 0;
779 if (copy_len && rqst->rq_rcv_buf.page_len) { 799 if (copy_len && rqst->rq_rcv_buf.page_len) {
780 npages = PAGE_ALIGN(page_base + 800 int pagelist_len;
781 rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; 801
782 for (; i < npages; i++) { 802 pagelist_len = rqst->rq_rcv_buf.page_len;
803 if (pagelist_len > copy_len)
804 pagelist_len = copy_len;
805 npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
806 for (i = 0; i < npages; i++) {
783 curlen = PAGE_SIZE - page_base; 807 curlen = PAGE_SIZE - page_base;
784 if (curlen > copy_len) 808 if (curlen > pagelist_len)
785 curlen = copy_len; 809 curlen = pagelist_len;
810
786 dprintk("RPC: %s: page %d" 811 dprintk("RPC: %s: page %d"
787 " srcp 0x%p len %d curlen %d\n", 812 " srcp 0x%p len %d curlen %d\n",
788 __func__, i, srcp, copy_len, curlen); 813 __func__, i, srcp, copy_len, curlen);
@@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
792 kunmap_atomic(destp); 817 kunmap_atomic(destp);
793 srcp += curlen; 818 srcp += curlen;
794 copy_len -= curlen; 819 copy_len -= curlen;
795 if (copy_len == 0) 820 fixup_copy_count += curlen;
821 pagelist_len -= curlen;
822 if (!pagelist_len)
796 break; 823 break;
797 page_base = 0; 824 page_base = 0;
798 } 825 }
799 }
800 826
801 if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { 827 /* Implicit padding for the last segment in a Write
802 curlen = copy_len; 828 * chunk is inserted inline at the front of the tail
803 if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) 829 * iovec. The upper layer ignores the content of
804 curlen = rqst->rq_rcv_buf.tail[0].iov_len; 830 * the pad. Simply ensure inline content in the tail
805 if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) 831 * that follows the Write chunk is properly aligned.
806 memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); 832 */
807 dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", 833 if (pad)
808 __func__, srcp, copy_len, curlen); 834 srcp -= pad;
809 rqst->rq_rcv_buf.tail[0].iov_len = curlen;
810 copy_len -= curlen; ++i;
811 } else
812 rqst->rq_rcv_buf.tail[0].iov_len = 0;
813
814 if (pad) {
815 /* implicit padding on terminal chunk */
816 unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
817 while (pad--)
818 p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
819 } 835 }
820 836
821 if (copy_len) 837 /* The tail iovec is redirected to the remaining data
822 dprintk("RPC: %s: %d bytes in" 838 * in the receive buffer, to avoid a memcopy.
823 " %d extra segments (%d lost)\n", 839 */
824 __func__, olen, i, copy_len); 840 if (copy_len || pad) {
841 rqst->rq_rcv_buf.tail[0].iov_base = srcp;
842 rqst->rq_private_buf.tail[0].iov_base = srcp;
843 }
825 844
826 /* TBD avoid a warning from call_decode() */ 845 return fixup_copy_count;
827 rqst->rq_private_buf = rqst->rq_rcv_buf;
828} 846}
829 847
830void 848void
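
The rewritten rpcrdma_inline_fixup() avoids copying wherever it can: the head and tail iovecs are pointed straight into the receive buffer, only the page-list portion is memcopied, and the copied byte count is returned to feed the fixup_copy_count statistic. A rough user-space model of that flow follows (Write chunk pad handling omitted; the struct below is a stand-in for rq_rcv_buf, not the kernel type).

#include <stdio.h>
#include <string.h>

struct rcv_buf {
        const char *head;       /* redirected, never copied */
        size_t head_len;
        char pages[256];        /* stands in for the page list */
        size_t page_len;
        const char *tail;       /* redirected, never copied */
};

/* Scatter an inline reply: point head/tail at the receive buffer,
 * copy only the page-list portion, and return the copied byte count.
 */
static size_t inline_fixup(struct rcv_buf *buf, const char *src, size_t len)
{
        size_t copied = 0, curlen;

        buf->head = src;                        /* no memcopy for the head */
        curlen = buf->head_len;
        if (curlen > len)
                curlen = len;
        src += curlen;
        len -= curlen;

        if (len && buf->page_len) {             /* only this part is copied */
                curlen = buf->page_len;
                if (curlen > len)
                        curlen = len;
                memcpy(buf->pages, src, curlen);
                src += curlen;
                len -= curlen;
                copied = curlen;
        }

        if (len)
                buf->tail = src;                /* remainder stays in place */
        return copied;
}

int main(void)
{
        static const char reply[] =
                "HEADERxxxxxxxxxxPAGEDATApagedataPAGEDATAtail-bits";
        struct rcv_buf buf = { .head_len = 16, .page_len = 24 };
        size_t copied = inline_fixup(&buf, reply, sizeof(reply) - 1);

        printf("head -> %.16s\n", buf.head);
        printf("copied %zu page bytes: %.24s\n", copied, buf.pages);
        printf("tail -> %s\n", buf.tail);
        return 0;
}
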
@@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
960 (headerp->rm_body.rm_chunks[1] == xdr_zero && 978 (headerp->rm_body.rm_chunks[1] == xdr_zero &&
961 headerp->rm_body.rm_chunks[2] != xdr_zero) || 979 headerp->rm_body.rm_chunks[2] != xdr_zero) ||
962 (headerp->rm_body.rm_chunks[1] != xdr_zero && 980 (headerp->rm_body.rm_chunks[1] != xdr_zero &&
963 req->rl_nchunks == 0)) 981 list_empty(&req->rl_registered)))
964 goto badheader; 982 goto badheader;
965 if (headerp->rm_body.rm_chunks[1] != xdr_zero) { 983 if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
966 /* count any expected write chunks in read reply */ 984 /* count any expected write chunks in read reply */
967 /* start at write chunk array count */ 985 /* start at write chunk array count */
968 iptr = &headerp->rm_body.rm_chunks[2]; 986 iptr = &headerp->rm_body.rm_chunks[2];
969 rdmalen = rpcrdma_count_chunks(rep, 987 rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
970 req->rl_nchunks, 1, &iptr);
971 /* check for validity, and no reply chunk after */ 988 /* check for validity, and no reply chunk after */
972 if (rdmalen < 0 || *iptr++ != xdr_zero) 989 if (rdmalen < 0 || *iptr++ != xdr_zero)
973 goto badheader; 990 goto badheader;
@@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
988 rep->rr_len -= RPCRDMA_HDRLEN_MIN; 1005 rep->rr_len -= RPCRDMA_HDRLEN_MIN;
989 status = rep->rr_len; 1006 status = rep->rr_len;
990 } 1007 }
991 /* Fix up the rpc results for upper layer */ 1008
992 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); 1009 r_xprt->rx_stats.fixup_copy_count +=
1010 rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
1011 rdmalen);
993 break; 1012 break;
994 1013
995 case rdma_nomsg: 1014 case rdma_nomsg:
@@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
997 if (headerp->rm_body.rm_chunks[0] != xdr_zero || 1016 if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
998 headerp->rm_body.rm_chunks[1] != xdr_zero || 1017 headerp->rm_body.rm_chunks[1] != xdr_zero ||
999 headerp->rm_body.rm_chunks[2] != xdr_one || 1018 headerp->rm_body.rm_chunks[2] != xdr_one ||
1000 req->rl_nchunks == 0) 1019 list_empty(&req->rl_registered))
1001 goto badheader; 1020 goto badheader;
1002 iptr = (__be32 *)((unsigned char *)headerp + 1021 iptr = (__be32 *)((unsigned char *)headerp +
1003 RPCRDMA_HDRLEN_MIN); 1022 RPCRDMA_HDRLEN_MIN);
1004 rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); 1023 rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
1005 if (rdmalen < 0) 1024 if (rdmalen < 0)
1006 goto badheader; 1025 goto badheader;
1007 r_xprt->rx_stats.total_rdma_reply += rdmalen; 1026 r_xprt->rx_stats.total_rdma_reply += rdmalen;
@@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
1014 1033
1015badheader: 1034badheader:
1016 default: 1035 default:
1017 dprintk("%s: invalid rpcrdma reply header (type %d):" 1036 dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
1018 " chunks[012] == %d %d %d" 1037 rqst->rq_task->tk_pid, __func__,
1019 " expected chunks <= %d\n", 1038 be32_to_cpu(headerp->rm_type));
1020 __func__, be32_to_cpu(headerp->rm_type),
1021 headerp->rm_body.rm_chunks[0],
1022 headerp->rm_body.rm_chunks[1],
1023 headerp->rm_body.rm_chunks[2],
1024 req->rl_nchunks);
1025 status = -EIO; 1039 status = -EIO;
1026 r_xprt->rx_stats.bad_reply_count++; 1040 r_xprt->rx_stats.bad_reply_count++;
1027 break; 1041 break;
@@ -1035,7 +1049,7 @@ out:
1035 * control: waking the next RPC waits until this RPC has 1049 * control: waking the next RPC waits until this RPC has
1036 * relinquished all its Send Queue entries. 1050 * relinquished all its Send Queue entries.
1037 */ 1051 */
1038 if (req->rl_nchunks) 1052 if (!list_empty(&req->rl_registered))
1039 r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); 1053 r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
1040 1054
1041 spin_lock_bh(&xprt->transport_lock); 1055 spin_lock_bh(&xprt->transport_lock);
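
On the receive side, rpcrdma_count_chunks() no longer range-checks the advertised segment count against rl_nchunks; it simply walks the chunk list in the reply header and totals the lengths the server claims to have RDMA'd. A small sketch of the core of that walk follows (validity checks omitted; the wire layout is a segment count followed by handle/length/offset triplets, all big-endian).

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* Walk one Write chunk: a segment count followed by
 * (handle, length, offset-hi, offset-lo) quadruples of XDR words.
 * Returns the total byte count and advances *p past the chunk.
 */
static uint32_t count_chunk(const uint32_t **p)
{
        const uint32_t *q = *p;
        uint32_t i = ntohl(*q++);
        uint32_t total = 0;

        while (i--) {
                q++;                            /* skip the handle */
                total += ntohl(*q++);           /* accumulate the length */
                q += 2;                         /* skip the 64-bit offset */
        }
        *p = q;
        return total;
}

int main(void)
{
        /* Two segments: 4096 and 1024 bytes. */
        uint32_t wire[] = {
                htonl(2),
                htonl(0x11), htonl(4096), htonl(0), htonl(0x1000),
                htonl(0x22), htonl(1024), htonl(0), htonl(0x2000),
        };
        const uint32_t *p = wire;

        printf("server RDMA'd %u bytes\n", count_chunk(&p));
        return 0;
}
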
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 99d2e5b72726..81f0e879f019 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -558,7 +558,6 @@ out_sendbuf:
558 558
559out_fail: 559out_fail:
560 rpcrdma_buffer_put(req); 560 rpcrdma_buffer_put(req);
561 r_xprt->rx_stats.failed_marshal_count++;
562 return NULL; 561 return NULL;
563} 562}
564 563
@@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer)
590 rpcrdma_buffer_put(req); 589 rpcrdma_buffer_put(req);
591} 590}
592 591
593/* 592/**
593 * xprt_rdma_send_request - marshal and send an RPC request
594 * @task: RPC task with an RPC message in rq_snd_buf
595 *
596 * Return values:
597 * 0: The request has been sent
598 * ENOTCONN: Caller needs to invoke connect logic then call again
599 * ENOBUFS: Call again later to send the request
600 * EIO: A permanent error occurred. The request was not sent,
601 * and should not be retried
602 *
594 * send_request invokes the meat of RPC RDMA. It must do the following: 603 * send_request invokes the meat of RPC RDMA. It must do the following:
604 *
595 * 1. Marshal the RPC request into an RPC RDMA request, which means 605 * 1. Marshal the RPC request into an RPC RDMA request, which means
596 * putting a header in front of data, and creating IOVs for RDMA 606 * putting a header in front of data, and creating IOVs for RDMA
597 * from those in the request. 607 * from those in the request.
@@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer)
600 * the request (rpcrdma_ep_post). 610 * the request (rpcrdma_ep_post).
601 * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). 611 * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
602 */ 612 */
603
604static int 613static int
605xprt_rdma_send_request(struct rpc_task *task) 614xprt_rdma_send_request(struct rpc_task *task)
606{ 615{
@@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task)
610 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 619 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
611 int rc = 0; 620 int rc = 0;
612 621
622 /* On retransmit, remove any previously registered chunks */
623 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
624
613 rc = rpcrdma_marshal_req(rqst); 625 rc = rpcrdma_marshal_req(rqst);
614 if (rc < 0) 626 if (rc < 0)
615 goto failed_marshal; 627 goto failed_marshal;
@@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task)
630 return 0; 642 return 0;
631 643
632failed_marshal: 644failed_marshal:
633 r_xprt->rx_stats.failed_marshal_count++;
634 dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", 645 dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
635 __func__, rc); 646 __func__, rc);
636 if (rc == -EIO) 647 if (rc == -EIO)
637 return -EIO; 648 r_xprt->rx_stats.failed_marshal_count++;
649 if (rc != -ENOTCONN)
650 return rc;
638drop_connection: 651drop_connection:
639 xprt_disconnect_done(xprt); 652 xprt_disconnect_done(xprt);
640 return -ENOTCONN; /* implies disconnect */ 653 return -ENOTCONN; /* implies disconnect */
@@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
660 xprt->stat.bad_xids, 673 xprt->stat.bad_xids,
661 xprt->stat.req_u, 674 xprt->stat.req_u,
662 xprt->stat.bklog_u); 675 xprt->stat.bklog_u);
663 seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", 676 seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ",
664 r_xprt->rx_stats.read_chunk_count, 677 r_xprt->rx_stats.read_chunk_count,
665 r_xprt->rx_stats.write_chunk_count, 678 r_xprt->rx_stats.write_chunk_count,
666 r_xprt->rx_stats.reply_chunk_count, 679 r_xprt->rx_stats.reply_chunk_count,
@@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
672 r_xprt->rx_stats.failed_marshal_count, 685 r_xprt->rx_stats.failed_marshal_count,
673 r_xprt->rx_stats.bad_reply_count, 686 r_xprt->rx_stats.bad_reply_count,
674 r_xprt->rx_stats.nomsg_call_count); 687 r_xprt->rx_stats.nomsg_call_count);
688 seq_printf(seq, "%lu %lu %lu\n",
689 r_xprt->rx_stats.mrs_recovered,
690 r_xprt->rx_stats.mrs_orphaned,
691 r_xprt->rx_stats.mrs_allocated);
675} 692}
676 693
677static int 694static int
@@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void)
741 __func__, rc); 758 __func__, rc);
742 759
743 rpcrdma_destroy_wq(); 760 rpcrdma_destroy_wq();
744 frwr_destroy_recovery_wq();
745 761
746 rc = xprt_unregister_transport(&xprt_rdma_bc); 762 rc = xprt_unregister_transport(&xprt_rdma_bc);
747 if (rc) 763 if (rc)
@@ -753,20 +769,13 @@ int xprt_rdma_init(void)
753{ 769{
754 int rc; 770 int rc;
755 771
756 rc = frwr_alloc_recovery_wq();
757 if (rc)
758 return rc;
759
760 rc = rpcrdma_alloc_wq(); 772 rc = rpcrdma_alloc_wq();
761 if (rc) { 773 if (rc)
762 frwr_destroy_recovery_wq();
763 return rc; 774 return rc;
764 }
765 775
766 rc = xprt_register_transport(&xprt_rdma); 776 rc = xprt_register_transport(&xprt_rdma);
767 if (rc) { 777 if (rc) {
768 rpcrdma_destroy_wq(); 778 rpcrdma_destroy_wq();
769 frwr_destroy_recovery_wq();
770 return rc; 779 return rc;
771 } 780 }
772 781
@@ -774,7 +783,6 @@ int xprt_rdma_init(void)
774 if (rc) { 783 if (rc) {
775 xprt_unregister_transport(&xprt_rdma); 784 xprt_unregister_transport(&xprt_rdma);
776 rpcrdma_destroy_wq(); 785 rpcrdma_destroy_wq();
777 frwr_destroy_recovery_wq();
778 return rc; 786 return rc;
779 } 787 }
780 788
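
xprt_rdma_send_request() now has an explicit return-value contract: marshal first (after releasing any chunks registered by a previous transmission), then post; -EIO is permanent and counted as a marshaling failure, -ENOTCONN triggers a disconnect, and anything else (for example -ENOBUFS) is handed back for the RPC layer to retry. A compact sketch of that error routing follows; the helpers are stand-ins, not the kernel functions.

#include <stdio.h>
#include <errno.h>

static unsigned long failed_marshal_count;

/* Stand-ins for rpcrdma_marshal_req() and rpcrdma_ep_post(). */
static int marshal_req(int fake_rc) { return fake_rc; }
static int post_send(int fake_rc) { return fake_rc; }

static void disconnect(void) { printf("  -> disconnect\n"); }

static int send_request(int marshal_rc, int post_rc)
{
        int rc;

        /* (The kernel first calls ro_unmap_safe() here to release
         * chunks left over from a previous transmission.) */
        rc = marshal_req(marshal_rc);
        if (rc < 0)
                goto failed_marshal;

        if (post_send(post_rc))
                goto drop_connection;
        return 0;

failed_marshal:
        if (rc == -EIO)
                failed_marshal_count++; /* permanent failure: count it */
        if (rc != -ENOTCONN)
                return rc;              /* e.g. -ENOBUFS: caller retries */
drop_connection:
        disconnect();
        return -ENOTCONN;               /* implies disconnect */
}

int main(void)
{
        int rc;

        printf("ok:      %d\n", send_request(0, 0));
        printf("ENOBUFS: %d\n", send_request(-ENOBUFS, 0));
        rc = send_request(-EIO, 0);
        printf("EIO:     %d (failed_marshal_count=%lu)\n",
               rc, failed_marshal_count);
        printf("posterr: %d\n", send_request(0, -ENOTCONN));
        return 0;
}
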
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b044d98a1370..536d0be3f61b 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -379,8 +379,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
379 struct rpcrdma_ia *ia = &xprt->rx_ia; 379 struct rpcrdma_ia *ia = &xprt->rx_ia;
380 int rc; 380 int rc;
381 381
382 ia->ri_dma_mr = NULL;
383
384 ia->ri_id = rpcrdma_create_id(xprt, ia, addr); 382 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
385 if (IS_ERR(ia->ri_id)) { 383 if (IS_ERR(ia->ri_id)) {
386 rc = PTR_ERR(ia->ri_id); 384 rc = PTR_ERR(ia->ri_id);
@@ -391,47 +389,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
391 ia->ri_pd = ib_alloc_pd(ia->ri_device); 389 ia->ri_pd = ib_alloc_pd(ia->ri_device);
392 if (IS_ERR(ia->ri_pd)) { 390 if (IS_ERR(ia->ri_pd)) {
393 rc = PTR_ERR(ia->ri_pd); 391 rc = PTR_ERR(ia->ri_pd);
394 dprintk("RPC: %s: ib_alloc_pd() failed %i\n", 392 pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
395 __func__, rc);
396 goto out2; 393 goto out2;
397 } 394 }
398 395
399 if (memreg == RPCRDMA_FRMR) {
400 if (!(ia->ri_device->attrs.device_cap_flags &
401 IB_DEVICE_MEM_MGT_EXTENSIONS) ||
402 (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
403 dprintk("RPC: %s: FRMR registration "
404 "not supported by HCA\n", __func__);
405 memreg = RPCRDMA_MTHCAFMR;
406 }
407 }
408 if (memreg == RPCRDMA_MTHCAFMR) {
409 if (!ia->ri_device->alloc_fmr) {
410 dprintk("RPC: %s: MTHCAFMR registration "
411 "not supported by HCA\n", __func__);
412 rc = -EINVAL;
413 goto out3;
414 }
415 }
416
417 switch (memreg) { 396 switch (memreg) {
418 case RPCRDMA_FRMR: 397 case RPCRDMA_FRMR:
419 ia->ri_ops = &rpcrdma_frwr_memreg_ops; 398 if (frwr_is_supported(ia)) {
420 break; 399 ia->ri_ops = &rpcrdma_frwr_memreg_ops;
421 case RPCRDMA_ALLPHYSICAL: 400 break;
422 ia->ri_ops = &rpcrdma_physical_memreg_ops; 401 }
423 break; 402 /*FALLTHROUGH*/
424 case RPCRDMA_MTHCAFMR: 403 case RPCRDMA_MTHCAFMR:
425 ia->ri_ops = &rpcrdma_fmr_memreg_ops; 404 if (fmr_is_supported(ia)) {
426 break; 405 ia->ri_ops = &rpcrdma_fmr_memreg_ops;
406 break;
407 }
408 /*FALLTHROUGH*/
427 default: 409 default:
428 printk(KERN_ERR "RPC: Unsupported memory " 410 pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
429 "registration mode: %d\n", memreg); 411 memreg);
430 rc = -ENOMEM; 412 rc = -EINVAL;
431 goto out3; 413 goto out3;
432 } 414 }
433 dprintk("RPC: %s: memory registration strategy is '%s'\n",
434 __func__, ia->ri_ops->ro_displayname);
435 415
436 return 0; 416 return 0;
437 417
@@ -585,8 +565,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
585out2: 565out2:
586 ib_free_cq(sendcq); 566 ib_free_cq(sendcq);
587out1: 567out1:
588 if (ia->ri_dma_mr)
589 ib_dereg_mr(ia->ri_dma_mr);
590 return rc; 568 return rc;
591} 569}
592 570
@@ -600,8 +578,6 @@ out1:
600void 578void
601rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 579rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
602{ 580{
603 int rc;
604
605 dprintk("RPC: %s: entering, connected is %d\n", 581 dprintk("RPC: %s: entering, connected is %d\n",
606 __func__, ep->rep_connected); 582 __func__, ep->rep_connected);
607 583
@@ -615,12 +591,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
615 591
616 ib_free_cq(ep->rep_attr.recv_cq); 592 ib_free_cq(ep->rep_attr.recv_cq);
617 ib_free_cq(ep->rep_attr.send_cq); 593 ib_free_cq(ep->rep_attr.send_cq);
618
619 if (ia->ri_dma_mr) {
620 rc = ib_dereg_mr(ia->ri_dma_mr);
621 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
622 __func__, rc);
623 }
624} 594}
625 595
626/* 596/*
@@ -777,6 +747,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
777 ib_drain_qp(ia->ri_id->qp); 747 ib_drain_qp(ia->ri_id->qp);
778} 748}
779 749
750static void
751rpcrdma_mr_recovery_worker(struct work_struct *work)
752{
753 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
754 rb_recovery_worker.work);
755 struct rpcrdma_mw *mw;
756
757 spin_lock(&buf->rb_recovery_lock);
758 while (!list_empty(&buf->rb_stale_mrs)) {
759 mw = list_first_entry(&buf->rb_stale_mrs,
760 struct rpcrdma_mw, mw_list);
761 list_del_init(&mw->mw_list);
762 spin_unlock(&buf->rb_recovery_lock);
763
764 dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
765 mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw);
766
767 spin_lock(&buf->rb_recovery_lock);
768 }
769 spin_unlock(&buf->rb_recovery_lock);
770}
771
772void
773rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
774{
775 struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
776 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
777
778 spin_lock(&buf->rb_recovery_lock);
779 list_add(&mw->mw_list, &buf->rb_stale_mrs);
780 spin_unlock(&buf->rb_recovery_lock);
781
782 schedule_delayed_work(&buf->rb_recovery_worker, 0);
783}
784
785static void
786rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
787{
788 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
789 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
790 unsigned int count;
791 LIST_HEAD(free);
792 LIST_HEAD(all);
793
794 for (count = 0; count < 32; count++) {
795 struct rpcrdma_mw *mw;
796 int rc;
797
798 mw = kzalloc(sizeof(*mw), GFP_KERNEL);
799 if (!mw)
800 break;
801
802 rc = ia->ri_ops->ro_init_mr(ia, mw);
803 if (rc) {
804 kfree(mw);
805 break;
806 }
807
808 mw->mw_xprt = r_xprt;
809
810 list_add(&mw->mw_list, &free);
811 list_add(&mw->mw_all, &all);
812 }
813
814 spin_lock(&buf->rb_mwlock);
815 list_splice(&free, &buf->rb_mws);
816 list_splice(&all, &buf->rb_all);
817 r_xprt->rx_stats.mrs_allocated += count;
818 spin_unlock(&buf->rb_mwlock);
819
820 dprintk("RPC: %s: created %u MRs\n", __func__, count);
821}
822
823static void
824rpcrdma_mr_refresh_worker(struct work_struct *work)
825{
826 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
827 rb_refresh_worker.work);
828 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
829 rx_buf);
830
831 rpcrdma_create_mrs(r_xprt);
832}
833
780struct rpcrdma_req * 834struct rpcrdma_req *
781rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) 835rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
782{ 836{
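
rpcrdma_mr_recovery_worker() above uses the usual drain pattern: take one entry off the stale list under the spinlock, drop the lock for the slow recovery work, then retake it and look for more. A user-space sketch of the same pattern with a pthread mutex and a singly linked list follows; it is purely illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

struct mw {
        int id;
        struct mw *next;
};

static pthread_mutex_t recovery_lock = PTHREAD_MUTEX_INITIALIZER;
static struct mw *stale_mrs;            /* protected by recovery_lock */

static void recover_mr(struct mw *mw)
{
        /* Slow work (in the kernel: invalidating or re-allocating the
         * MR) happens without the list lock held. */
        printf("recovering MR %d\n", mw->id);
        free(mw);
}

static void recovery_worker(void)
{
        pthread_mutex_lock(&recovery_lock);
        while (stale_mrs) {
                struct mw *mw = stale_mrs;

                stale_mrs = mw->next;           /* unlink under the lock */
                pthread_mutex_unlock(&recovery_lock);

                recover_mr(mw);                 /* lock dropped here */

                pthread_mutex_lock(&recovery_lock);
        }
        pthread_mutex_unlock(&recovery_lock);
}

static void defer_mr_recovery(int id)
{
        struct mw *mw = malloc(sizeof(*mw));

        if (!mw)
                return;
        mw->id = id;
        pthread_mutex_lock(&recovery_lock);
        mw->next = stale_mrs;                   /* add under the lock */
        stale_mrs = mw;
        pthread_mutex_unlock(&recovery_lock);
        /* the kernel schedules delayed work at this point */
}

int main(void)
{
        defer_mr_recovery(1);
        defer_mr_recovery(2);
        recovery_worker();
        return 0;
}
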
@@ -793,6 +847,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
793 spin_unlock(&buffer->rb_reqslock); 847 spin_unlock(&buffer->rb_reqslock);
794 req->rl_cqe.done = rpcrdma_wc_send; 848 req->rl_cqe.done = rpcrdma_wc_send;
795 req->rl_buffer = &r_xprt->rx_buf; 849 req->rl_buffer = &r_xprt->rx_buf;
850 INIT_LIST_HEAD(&req->rl_registered);
796 return req; 851 return req;
797} 852}
798 853
@@ -832,17 +887,23 @@ int
832rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) 887rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
833{ 888{
834 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 889 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
835 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
836 int i, rc; 890 int i, rc;
837 891
838 buf->rb_max_requests = r_xprt->rx_data.max_requests; 892 buf->rb_max_requests = r_xprt->rx_data.max_requests;
839 buf->rb_bc_srv_max_requests = 0; 893 buf->rb_bc_srv_max_requests = 0;
840 spin_lock_init(&buf->rb_lock);
841 atomic_set(&buf->rb_credits, 1); 894 atomic_set(&buf->rb_credits, 1);
895 spin_lock_init(&buf->rb_mwlock);
896 spin_lock_init(&buf->rb_lock);
897 spin_lock_init(&buf->rb_recovery_lock);
898 INIT_LIST_HEAD(&buf->rb_mws);
899 INIT_LIST_HEAD(&buf->rb_all);
900 INIT_LIST_HEAD(&buf->rb_stale_mrs);
901 INIT_DELAYED_WORK(&buf->rb_refresh_worker,
902 rpcrdma_mr_refresh_worker);
903 INIT_DELAYED_WORK(&buf->rb_recovery_worker,
904 rpcrdma_mr_recovery_worker);
842 905
843 rc = ia->ri_ops->ro_init(r_xprt); 906 rpcrdma_create_mrs(r_xprt);
844 if (rc)
845 goto out;
846 907
847 INIT_LIST_HEAD(&buf->rb_send_bufs); 908 INIT_LIST_HEAD(&buf->rb_send_bufs);
848 INIT_LIST_HEAD(&buf->rb_allreqs); 909 INIT_LIST_HEAD(&buf->rb_allreqs);
@@ -862,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
862 } 923 }
863 924
864 INIT_LIST_HEAD(&buf->rb_recv_bufs); 925 INIT_LIST_HEAD(&buf->rb_recv_bufs);
865 for (i = 0; i < buf->rb_max_requests + 2; i++) { 926 for (i = 0; i < buf->rb_max_requests; i++) {
866 struct rpcrdma_rep *rep; 927 struct rpcrdma_rep *rep;
867 928
868 rep = rpcrdma_create_rep(r_xprt); 929 rep = rpcrdma_create_rep(r_xprt);
@@ -918,11 +979,39 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
918 kfree(req); 979 kfree(req);
919} 980}
920 981
982static void
983rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
984{
985 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
986 rx_buf);
987 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
988 struct rpcrdma_mw *mw;
989 unsigned int count;
990
991 count = 0;
992 spin_lock(&buf->rb_mwlock);
993 while (!list_empty(&buf->rb_all)) {
994 mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
995 list_del(&mw->mw_all);
996
997 spin_unlock(&buf->rb_mwlock);
998 ia->ri_ops->ro_release_mr(mw);
999 count++;
1000 spin_lock(&buf->rb_mwlock);
1001 }
1002 spin_unlock(&buf->rb_mwlock);
1003 r_xprt->rx_stats.mrs_allocated = 0;
1004
1005 dprintk("RPC: %s: released %u MRs\n", __func__, count);
1006}
1007
921void 1008void
922rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 1009rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
923{ 1010{
924 struct rpcrdma_ia *ia = rdmab_to_ia(buf); 1011 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
925 1012
1013 cancel_delayed_work_sync(&buf->rb_recovery_worker);
1014
926 while (!list_empty(&buf->rb_recv_bufs)) { 1015 while (!list_empty(&buf->rb_recv_bufs)) {
927 struct rpcrdma_rep *rep; 1016 struct rpcrdma_rep *rep;
928 1017
@@ -944,7 +1033,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
944 } 1033 }
945 spin_unlock(&buf->rb_reqslock); 1034 spin_unlock(&buf->rb_reqslock);
946 1035
947 ia->ri_ops->ro_destroy(buf); 1036 rpcrdma_destroy_mrs(buf);
948} 1037}
949 1038
950struct rpcrdma_mw * 1039struct rpcrdma_mw *
@@ -962,8 +1051,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
962 spin_unlock(&buf->rb_mwlock); 1051 spin_unlock(&buf->rb_mwlock);
963 1052
964 if (!mw) 1053 if (!mw)
965 pr_err("RPC: %s: no MWs available\n", __func__); 1054 goto out_nomws;
966 return mw; 1055 return mw;
1056
1057out_nomws:
1058 dprintk("RPC: %s: no MWs available\n", __func__);
1059 schedule_delayed_work(&buf->rb_refresh_worker, 0);
1060
1061 /* Allow the reply handler and refresh worker to run */
1062 cond_resched();
1063
1064 return NULL;
967} 1065}
968 1066
969void 1067void
@@ -978,8 +1076,6 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
978 1076
979/* 1077/*
980 * Get a set of request/reply buffers. 1078 * Get a set of request/reply buffers.
981 *
982 * Reply buffer (if available) is attached to send buffer upon return.
983 */ 1079 */
984struct rpcrdma_req * 1080struct rpcrdma_req *
985rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) 1081rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
@@ -998,13 +1094,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
998 1094
999out_reqbuf: 1095out_reqbuf:
1000 spin_unlock(&buffers->rb_lock); 1096 spin_unlock(&buffers->rb_lock);
1001 pr_warn("RPC: %s: out of request buffers\n", __func__); 1097 pr_warn("rpcrdma: out of request buffers (%p)\n", buffers);
1002 return NULL; 1098 return NULL;
1003out_repbuf: 1099out_repbuf:
1100 list_add(&req->rl_free, &buffers->rb_send_bufs);
1004 spin_unlock(&buffers->rb_lock); 1101 spin_unlock(&buffers->rb_lock);
1005 pr_warn("RPC: %s: out of reply buffers\n", __func__); 1102 pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers);
1006 req->rl_reply = NULL; 1103 return NULL;
1007 return req;
1008} 1104}
1009 1105
1010/* 1106/*
@@ -1060,14 +1156,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1060 * Wrappers for internal-use kmalloc memory registration, used by buffer code. 1156 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1061 */ 1157 */
1062 1158
1063void
1064rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
1065{
1066 dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
1067 seg->mr_offset,
1068 (unsigned long long)seg->mr_dma, seg->mr_dmalen);
1069}
1070
1071/** 1159/**
1072 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers 1160 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
1073 * @ia: controlling rpcrdma_ia 1161 * @ia: controlling rpcrdma_ia
@@ -1150,7 +1238,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
1150 if (rep) { 1238 if (rep) {
1151 rc = rpcrdma_ep_post_recv(ia, ep, rep); 1239 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1152 if (rc) 1240 if (rc)
1153 goto out; 1241 return rc;
1154 req->rl_reply = NULL; 1242 req->rl_reply = NULL;
1155 } 1243 }
1156 1244
@@ -1175,10 +1263,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
1175 1263
1176 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); 1264 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1177 if (rc) 1265 if (rc)
1178 dprintk("RPC: %s: ib_post_send returned %i\n", __func__, 1266 goto out_postsend_err;
1179 rc); 1267 return 0;
1180out: 1268
1181 return rc; 1269out_postsend_err:
1270 pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
1271 return -ENOTCONN;
1182} 1272}
1183 1273
1184/* 1274/*
@@ -1203,11 +1293,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1203 DMA_BIDIRECTIONAL); 1293 DMA_BIDIRECTIONAL);
1204 1294
1205 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); 1295 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1206
1207 if (rc) 1296 if (rc)
1208 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, 1297 goto out_postrecv;
1209 rc); 1298 return 0;
1210 return rc; 1299
1300out_postrecv:
1301 pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
1302 return -ENOTCONN;
1211} 1303}
1212 1304
1213/** 1305/**
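
rpcrdma_get_mw() above now degrades gracefully: when the free list is empty it returns NULL, schedules the refresh worker to create more MRs, and calls cond_resched() so the reply handler can return MWs to the pool. A simplified, single-threaded sketch of that take-or-refill pattern follows; the "worker" is just a direct function call here, and the batch size is invented.

#include <stdio.h>
#include <stdlib.h>

struct mw {
        struct mw *next;
};

static struct mw *free_mws;
static unsigned long mrs_allocated;

/* Stand-in for the MR refresh worker / rpcrdma_create_mrs(). */
static void refresh_worker(void)
{
        for (int i = 0; i < 4; i++) {
                struct mw *mw = calloc(1, sizeof(*mw));

                if (!mw)
                        break;
                mw->next = free_mws;
                free_mws = mw;
                mrs_allocated++;
        }
        printf("refreshed pool, %lu MRs allocated\n", mrs_allocated);
}

static struct mw *get_mw(void)
{
        struct mw *mw = free_mws;

        if (!mw) {
                /* Out of MWs: ask for more and let the caller back off
                 * rather than blocking here. */
                refresh_worker();
                return NULL;
        }
        free_mws = mw->next;
        return mw;
}

static void put_mw(struct mw *mw)
{
        mw->next = free_mws;
        free_mws = mw;
}

int main(void)
{
        struct mw *mw = get_mw();       /* pool empty: NULL, refill kicked */

        if (!mw)
                mw = get_mw();          /* now succeeds */
        printf("got MW %p\n", (void *)mw);
        put_mw(mw);
        return 0;
}
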
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 95cdc66225ee..670fad57153a 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -68,7 +68,6 @@ struct rpcrdma_ia {
68 struct ib_device *ri_device; 68 struct ib_device *ri_device;
69 struct rdma_cm_id *ri_id; 69 struct rdma_cm_id *ri_id;
70 struct ib_pd *ri_pd; 70 struct ib_pd *ri_pd;
71 struct ib_mr *ri_dma_mr;
72 struct completion ri_done; 71 struct completion ri_done;
73 int ri_async_rc; 72 int ri_async_rc;
74 unsigned int ri_max_frmr_depth; 73 unsigned int ri_max_frmr_depth;
@@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
172 * o recv buffer (posted to provider) 171 * o recv buffer (posted to provider)
173 * o ib_sge (also donated to provider) 172 * o ib_sge (also donated to provider)
174 * o status of reply (length, success or not) 173 * o status of reply (length, success or not)
175 * o bookkeeping state to get run by tasklet (list, etc) 174 * o bookkeeping state to get run by reply handler (list, etc)
176 * 175 *
177 * These are allocated during initialization, per-transport instance; 176 * These are allocated during initialization, per-transport instance.
178 * however, the tasklet execution list itself is global, as it should
179 * always be pretty short.
180 * 177 *
181 * N of these are associated with a transport instance, and stored in 178 * N of these are associated with a transport instance, and stored in
182 * struct rpcrdma_buffer. N is the max number of outstanding requests. 179 * struct rpcrdma_buffer. N is the max number of outstanding requests.
183 */ 180 */
184 181
185#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
186
187/* data segments + head/tail for Call + head/tail for Reply */
188#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4)
189
190struct rpcrdma_buffer;
191
192struct rpcrdma_rep { 182struct rpcrdma_rep {
193 struct ib_cqe rr_cqe; 183 struct ib_cqe rr_cqe;
194 unsigned int rr_len; 184 unsigned int rr_len;
@@ -221,9 +211,6 @@ enum rpcrdma_frmr_state {
221}; 211};
222 212
223struct rpcrdma_frmr { 213struct rpcrdma_frmr {
224 struct scatterlist *fr_sg;
225 int fr_nents;
226 enum dma_data_direction fr_dir;
227 struct ib_mr *fr_mr; 214 struct ib_mr *fr_mr;
228 struct ib_cqe fr_cqe; 215 struct ib_cqe fr_cqe;
229 enum rpcrdma_frmr_state fr_state; 216 enum rpcrdma_frmr_state fr_state;
@@ -235,18 +222,23 @@ struct rpcrdma_frmr {
235}; 222};
236 223
237struct rpcrdma_fmr { 224struct rpcrdma_fmr {
238 struct ib_fmr *fmr; 225 struct ib_fmr *fm_mr;
239 u64 *physaddrs; 226 u64 *fm_physaddrs;
240}; 227};
241 228
242struct rpcrdma_mw { 229struct rpcrdma_mw {
230 struct list_head mw_list;
231 struct scatterlist *mw_sg;
232 int mw_nents;
233 enum dma_data_direction mw_dir;
243 union { 234 union {
244 struct rpcrdma_fmr fmr; 235 struct rpcrdma_fmr fmr;
245 struct rpcrdma_frmr frmr; 236 struct rpcrdma_frmr frmr;
246 }; 237 };
247 struct work_struct mw_work;
248 struct rpcrdma_xprt *mw_xprt; 238 struct rpcrdma_xprt *mw_xprt;
249 struct list_head mw_list; 239 u32 mw_handle;
240 u32 mw_length;
241 u64 mw_offset;
250 struct list_head mw_all; 242 struct list_head mw_all;
251}; 243};
252 244
@@ -266,33 +258,30 @@ struct rpcrdma_mw {
266 * of iovs for send operations. The reason is that the iovs passed to 258 * of iovs for send operations. The reason is that the iovs passed to
267 * ib_post_{send,recv} must not be modified until the work request 259 * ib_post_{send,recv} must not be modified until the work request
268 * completes. 260 * completes.
269 *
270 * NOTES:
271 * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we
272 * marshal. The number needed varies depending on the iov lists that
273 * are passed to us, the memory registration mode we are in, and if
274 * physical addressing is used, the layout.
275 */ 261 */
276 262
263/* Maximum number of page-sized "segments" per chunk list to be
264 * registered or invalidated. Must handle a Reply chunk:
265 */
266enum {
267 RPCRDMA_MAX_IOV_SEGS = 3,
268 RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1,
269 RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS +
270 RPCRDMA_MAX_IOV_SEGS,
271};
272
277struct rpcrdma_mr_seg { /* chunk descriptors */ 273struct rpcrdma_mr_seg { /* chunk descriptors */
278 struct rpcrdma_mw *rl_mw; /* registered MR */
279 u64 mr_base; /* registration result */
280 u32 mr_rkey; /* registration result */
281 u32 mr_len; /* length of chunk or segment */ 274 u32 mr_len; /* length of chunk or segment */
282 int mr_nsegs; /* number of segments in chunk or 0 */
283 enum dma_data_direction mr_dir; /* segment mapping direction */
284 dma_addr_t mr_dma; /* segment mapping address */
285 size_t mr_dmalen; /* segment mapping length */
286 struct page *mr_page; /* owning page, if any */ 275 struct page *mr_page; /* owning page, if any */
287 char *mr_offset; /* kva if no page, else offset */ 276 char *mr_offset; /* kva if no page, else offset */
288}; 277};
289 278
290#define RPCRDMA_MAX_IOVS (2) 279#define RPCRDMA_MAX_IOVS (2)
291 280
281struct rpcrdma_buffer;
292struct rpcrdma_req { 282struct rpcrdma_req {
293 struct list_head rl_free; 283 struct list_head rl_free;
294 unsigned int rl_niovs; 284 unsigned int rl_niovs;
295 unsigned int rl_nchunks;
296 unsigned int rl_connect_cookie; 285 unsigned int rl_connect_cookie;
297 struct rpc_task *rl_task; 286 struct rpc_task *rl_task;
298 struct rpcrdma_buffer *rl_buffer; 287 struct rpcrdma_buffer *rl_buffer;
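
The new enum above sizes the segment array for the worst case a Reply chunk can need: RPCRDMA_MAX_DATA_SEGS is (1 MiB / PAGE_SIZE) + 1, plus RPCRDMA_MAX_IOV_SEGS = 3 for the head and tail iovecs. Assuming 4 KiB pages (PAGE_SIZE varies by architecture), that is 257 data segments and 260 segments total in each rl_segments array. A trivial check of the arithmetic:

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096  /* assumed for this example */

enum {
        RPCRDMA_MAX_IOV_SEGS    = 3,
        RPCRDMA_MAX_DATA_SEGS   = ((1 * 1024 * 1024) / EXAMPLE_PAGE_SIZE) + 1,
        RPCRDMA_MAX_SEGS        = RPCRDMA_MAX_DATA_SEGS +
                                  RPCRDMA_MAX_IOV_SEGS,
};

int main(void)
{
        printf("%d data + %d iov = %d segments per chunk list\n",
               RPCRDMA_MAX_DATA_SEGS, RPCRDMA_MAX_IOV_SEGS,
               RPCRDMA_MAX_SEGS);
        return 0;
}
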
@@ -300,12 +289,13 @@ struct rpcrdma_req {
300 struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; 289 struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
301 struct rpcrdma_regbuf *rl_rdmabuf; 290 struct rpcrdma_regbuf *rl_rdmabuf;
302 struct rpcrdma_regbuf *rl_sendbuf; 291 struct rpcrdma_regbuf *rl_sendbuf;
303 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
304 struct rpcrdma_mr_seg *rl_nextseg;
305 292
306 struct ib_cqe rl_cqe; 293 struct ib_cqe rl_cqe;
307 struct list_head rl_all; 294 struct list_head rl_all;
308 bool rl_backchannel; 295 bool rl_backchannel;
296
297 struct list_head rl_registered; /* registered segments */
298 struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
309}; 299};
310 300
311static inline struct rpcrdma_req * 301static inline struct rpcrdma_req *
@@ -341,6 +331,11 @@ struct rpcrdma_buffer {
341 struct list_head rb_allreqs; 331 struct list_head rb_allreqs;
342 332
343 u32 rb_bc_max_requests; 333 u32 rb_bc_max_requests;
334
335 spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */
336 struct list_head rb_stale_mrs;
337 struct delayed_work rb_recovery_worker;
338 struct delayed_work rb_refresh_worker;
344}; 339};
345#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) 340#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
346 341
@@ -387,6 +382,9 @@ struct rpcrdma_stats {
387 unsigned long bad_reply_count; 382 unsigned long bad_reply_count;
388 unsigned long nomsg_call_count; 383 unsigned long nomsg_call_count;
389 unsigned long bcall_count; 384 unsigned long bcall_count;
385 unsigned long mrs_recovered;
386 unsigned long mrs_orphaned;
387 unsigned long mrs_allocated;
390}; 388};
391 389
392/* 390/*
@@ -395,23 +393,25 @@ struct rpcrdma_stats {
395struct rpcrdma_xprt; 393struct rpcrdma_xprt;
396struct rpcrdma_memreg_ops { 394struct rpcrdma_memreg_ops {
397 int (*ro_map)(struct rpcrdma_xprt *, 395 int (*ro_map)(struct rpcrdma_xprt *,
398 struct rpcrdma_mr_seg *, int, bool); 396 struct rpcrdma_mr_seg *, int, bool,
397 struct rpcrdma_mw **);
399 void (*ro_unmap_sync)(struct rpcrdma_xprt *, 398 void (*ro_unmap_sync)(struct rpcrdma_xprt *,
400 struct rpcrdma_req *); 399 struct rpcrdma_req *);
401 void (*ro_unmap_safe)(struct rpcrdma_xprt *, 400 void (*ro_unmap_safe)(struct rpcrdma_xprt *,
402 struct rpcrdma_req *, bool); 401 struct rpcrdma_req *, bool);
402 void (*ro_recover_mr)(struct rpcrdma_mw *);
403 int (*ro_open)(struct rpcrdma_ia *, 403 int (*ro_open)(struct rpcrdma_ia *,
404 struct rpcrdma_ep *, 404 struct rpcrdma_ep *,
405 struct rpcrdma_create_data_internal *); 405 struct rpcrdma_create_data_internal *);
406 size_t (*ro_maxpages)(struct rpcrdma_xprt *); 406 size_t (*ro_maxpages)(struct rpcrdma_xprt *);
407 int (*ro_init)(struct rpcrdma_xprt *); 407 int (*ro_init_mr)(struct rpcrdma_ia *,
408 void (*ro_destroy)(struct rpcrdma_buffer *); 408 struct rpcrdma_mw *);
409 void (*ro_release_mr)(struct rpcrdma_mw *);
409 const char *ro_displayname; 410 const char *ro_displayname;
410}; 411};
411 412
412extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; 413extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
413extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; 414extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
414extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
415 415
416/* 416/*
417 * RPCRDMA transport -- encapsulates the structures above for 417 * RPCRDMA transport -- encapsulates the structures above for
@@ -446,6 +446,8 @@ extern int xprt_rdma_pad_optimize;
446 */ 446 */
447int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); 447int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
448void rpcrdma_ia_close(struct rpcrdma_ia *); 448void rpcrdma_ia_close(struct rpcrdma_ia *);
449bool frwr_is_supported(struct rpcrdma_ia *);
450bool fmr_is_supported(struct rpcrdma_ia *);
449 451
450/* 452/*
451 * Endpoint calls - xprtrdma/verbs.c 453 * Endpoint calls - xprtrdma/verbs.c
@@ -477,6 +479,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
477void rpcrdma_recv_buffer_get(struct rpcrdma_req *); 479void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
478void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); 480void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
479 481
482void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
483
480struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, 484struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
481 size_t, gfp_t); 485 size_t, gfp_t);
482void rpcrdma_free_regbuf(struct rpcrdma_ia *, 486void rpcrdma_free_regbuf(struct rpcrdma_ia *,
@@ -484,9 +488,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
484 488
485int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); 489int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
486 490
487int frwr_alloc_recovery_wq(void);
488void frwr_destroy_recovery_wq(void);
489
490int rpcrdma_alloc_wq(void); 491int rpcrdma_alloc_wq(void);
491void rpcrdma_destroy_wq(void); 492void rpcrdma_destroy_wq(void);
492 493
@@ -494,45 +495,12 @@ void rpcrdma_destroy_wq(void);
494 * Wrappers for chunk registration, shared by read/write chunk code. 495 * Wrappers for chunk registration, shared by read/write chunk code.
495 */ 496 */
496 497
497void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
498
499static inline enum dma_data_direction 498static inline enum dma_data_direction
500rpcrdma_data_dir(bool writing) 499rpcrdma_data_dir(bool writing)
501{ 500{
502 return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; 501 return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
503} 502}
504 503
505static inline void
506rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
507 enum dma_data_direction direction)
508{
509 seg->mr_dir = direction;
510 seg->mr_dmalen = seg->mr_len;
511
512 if (seg->mr_page)
513 seg->mr_dma = ib_dma_map_page(device,
514 seg->mr_page, offset_in_page(seg->mr_offset),
515 seg->mr_dmalen, seg->mr_dir);
516 else
517 seg->mr_dma = ib_dma_map_single(device,
518 seg->mr_offset,
519 seg->mr_dmalen, seg->mr_dir);
520
521 if (ib_dma_mapping_error(device, seg->mr_dma))
522 rpcrdma_mapping_error(seg);
523}
524
525static inline void
526rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
527{
528 if (seg->mr_page)
529 ib_dma_unmap_page(device,
530 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
531 else
532 ib_dma_unmap_single(device,
533 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
534}
535
536/* 504/*
537 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c 505 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
538 */ 506 */
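
The header now captures the reworked ro_map contract: the caller hands in an array of rpcrdma_mr_seg and gets back both the number of segments consumed and an rpcrdma_mw carrying the handle, length, and offset to encode; each returned MW is chained onto rl_registered for later invalidation. A mock of that calling convention follows; the "registration" below is fake (it just coalesces segment lengths) and max_per_mr is a made-up knob, not a kernel parameter.

#include <stdio.h>
#include <stdint.h>

struct mr_seg {                 /* like rpcrdma_mr_seg: length + location */
        uint32_t len;
        uint64_t addr;
};

struct mw {                     /* registration result, like rpcrdma_mw */
        uint32_t handle;
        uint32_t length;
        uint64_t offset;
};

/* Mock ro_map: "register" up to max_per_mr segments as one MW and
 * report how many segments were consumed. A negative return would
 * signal a registration failure.
 */
static int ro_map(struct mr_seg *seg, int nsegs, int max_per_mr,
                  struct mw *mw)
{
        static uint32_t next_handle = 0x100;
        int n = nsegs < max_per_mr ? nsegs : max_per_mr;

        mw->handle = next_handle++;
        mw->offset = seg[0].addr;
        mw->length = 0;
        for (int i = 0; i < n; i++)
                mw->length += seg[i].len;
        return n;
}

int main(void)
{
        struct mr_seg segs[] = {
                { 4096, 0x1000 }, { 4096, 0x2000 }, { 512, 0x3000 },
        };
        struct mr_seg *seg = segs;
        int nsegs = 3;

        while (nsegs) {
                struct mw mw;
                int n = ro_map(seg, nsegs, 2, &mw);

                if (n < 0)
                        return 1;
                printf("%u@0x%016llx:0x%08x (%s)\n", mw.length,
                       (unsigned long long)mw.offset, mw.handle,
                       n < nsegs ? "more" : "last");
                seg += n;
                nsegs -= n;
        }
        return 0;
}
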
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 3ad9fab1985f..1fd464764765 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -604,7 +604,7 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg,
604 604
605 link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]); 605 link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]);
606 link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP])); 606 link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP]));
607 nla_strlcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME]), 607 nla_strlcpy(link_info.str, link[TIPC_NLA_LINK_NAME],
608 TIPC_MAX_LINK_NAME); 608 TIPC_MAX_LINK_NAME);
609 609
610 return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO, 610 return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO,