aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/sunrpc/auth_gss/Makefile4
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c10
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c26
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c16
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c72
-rw-r--r--net/sunrpc/svc.c110
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c35
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c84
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c166
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c195
10 files changed, 331 insertions, 387 deletions
diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
index f3431a7e33da..4de8bcf26fa7 100644
--- a/net/sunrpc/auth_gss/Makefile
+++ b/net/sunrpc/auth_gss/Makefile
@@ -5,12 +5,12 @@
5obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o 5obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o
6 6
7auth_rpcgss-objs := auth_gss.o gss_generic_token.o \ 7auth_rpcgss-objs := auth_gss.o gss_generic_token.o \
8 gss_mech_switch.o svcauth_gss.o gss_krb5_crypto.o 8 gss_mech_switch.o svcauth_gss.o
9 9
10obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o 10obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
11 11
12rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ 12rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \
13 gss_krb5_seqnum.o gss_krb5_wrap.o 13 gss_krb5_seqnum.o gss_krb5_wrap.o gss_krb5_crypto.o
14 14
15obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o 15obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o
16 16
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 1d52308ca324..c93fca204558 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -83,8 +83,6 @@ out:
83 return ret; 83 return ret;
84} 84}
85 85
86EXPORT_SYMBOL(krb5_encrypt);
87
88u32 86u32
89krb5_decrypt( 87krb5_decrypt(
90 struct crypto_blkcipher *tfm, 88 struct crypto_blkcipher *tfm,
@@ -118,8 +116,6 @@ out:
118 return ret; 116 return ret;
119} 117}
120 118
121EXPORT_SYMBOL(krb5_decrypt);
122
123static int 119static int
124checksummer(struct scatterlist *sg, void *data) 120checksummer(struct scatterlist *sg, void *data)
125{ 121{
@@ -161,8 +157,6 @@ out:
161 return err ? GSS_S_FAILURE : 0; 157 return err ? GSS_S_FAILURE : 0;
162} 158}
163 159
164EXPORT_SYMBOL(make_checksum);
165
166struct encryptor_desc { 160struct encryptor_desc {
167 u8 iv[8]; /* XXX hard-coded blocksize */ 161 u8 iv[8]; /* XXX hard-coded blocksize */
168 struct blkcipher_desc desc; 162 struct blkcipher_desc desc;
@@ -262,8 +256,6 @@ gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
262 return ret; 256 return ret;
263} 257}
264 258
265EXPORT_SYMBOL(gss_encrypt_xdr_buf);
266
267struct decryptor_desc { 259struct decryptor_desc {
268 u8 iv[8]; /* XXX hard-coded blocksize */ 260 u8 iv[8]; /* XXX hard-coded blocksize */
269 struct blkcipher_desc desc; 261 struct blkcipher_desc desc;
@@ -334,5 +326,3 @@ gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
334 326
335 return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); 327 return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc);
336} 328}
337
338EXPORT_SYMBOL(gss_decrypt_xdr_buf);
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 5f1d36dfbcf7..b8f42ef7178e 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -78,7 +78,7 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
78 struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; 78 struct krb5_ctx *ctx = gss_ctx->internal_ctx_id;
79 char cksumdata[16]; 79 char cksumdata[16];
80 struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; 80 struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata};
81 unsigned char *ptr, *krb5_hdr, *msg_start; 81 unsigned char *ptr, *msg_start;
82 s32 now; 82 s32 now;
83 u32 seq_send; 83 u32 seq_send;
84 84
@@ -87,36 +87,36 @@ gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text,
87 87
88 now = get_seconds(); 88 now = get_seconds();
89 89
90 token->len = g_token_size(&ctx->mech_used, 24); 90 token->len = g_token_size(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8);
91 91
92 ptr = token->data; 92 ptr = token->data;
93 g_make_token_header(&ctx->mech_used, 24, &ptr); 93 g_make_token_header(&ctx->mech_used, GSS_KRB5_TOK_HDR_LEN + 8, &ptr);
94 94
95 *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff); 95 /* ptr now at header described in rfc 1964, section 1.2.1: */
96 *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff); 96 ptr[0] = (unsigned char) ((KG_TOK_MIC_MSG >> 8) & 0xff);
97 ptr[1] = (unsigned char) (KG_TOK_MIC_MSG & 0xff);
97 98
98 /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ 99 msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8;
99 krb5_hdr = ptr - 2;
100 msg_start = krb5_hdr + 24;
101 100
102 *(__be16 *)(krb5_hdr + 2) = htons(SGN_ALG_DES_MAC_MD5); 101 *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5);
103 memset(krb5_hdr + 4, 0xff, 4); 102 memset(ptr + 4, 0xff, 4);
104 103
105 if (make_checksum("md5", krb5_hdr, 8, text, 0, &md5cksum)) 104 if (make_checksum("md5", ptr, 8, text, 0, &md5cksum))
106 return GSS_S_FAILURE; 105 return GSS_S_FAILURE;
107 106
108 if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, 107 if (krb5_encrypt(ctx->seq, NULL, md5cksum.data,
109 md5cksum.data, md5cksum.len)) 108 md5cksum.data, md5cksum.len))
110 return GSS_S_FAILURE; 109 return GSS_S_FAILURE;
111 110
112 memcpy(krb5_hdr + 16, md5cksum.data + md5cksum.len - 8, 8); 111 memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8);
113 112
114 spin_lock(&krb5_seq_lock); 113 spin_lock(&krb5_seq_lock);
115 seq_send = ctx->seq_send++; 114 seq_send = ctx->seq_send++;
116 spin_unlock(&krb5_seq_lock); 115 spin_unlock(&krb5_seq_lock);
117 116
118 if (krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff, 117 if (krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff,
119 seq_send, krb5_hdr + 16, krb5_hdr + 8)) 118 seq_send, ptr + GSS_KRB5_TOK_HDR_LEN,
119 ptr + 8))
120 return GSS_S_FAILURE; 120 return GSS_S_FAILURE;
121 121
122 return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE; 122 return (ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE;
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index d91a5d004803..066ec73c84d6 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -92,30 +92,30 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
92 read_token->len)) 92 read_token->len))
93 return GSS_S_DEFECTIVE_TOKEN; 93 return GSS_S_DEFECTIVE_TOKEN;
94 94
95 if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) || 95 if ((ptr[0] != ((KG_TOK_MIC_MSG >> 8) & 0xff)) ||
96 (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) ) 96 (ptr[1] != (KG_TOK_MIC_MSG & 0xff)))
97 return GSS_S_DEFECTIVE_TOKEN; 97 return GSS_S_DEFECTIVE_TOKEN;
98 98
99 /* XXX sanity-check bodysize?? */ 99 /* XXX sanity-check bodysize?? */
100 100
101 signalg = ptr[0] + (ptr[1] << 8); 101 signalg = ptr[2] + (ptr[3] << 8);
102 if (signalg != SGN_ALG_DES_MAC_MD5) 102 if (signalg != SGN_ALG_DES_MAC_MD5)
103 return GSS_S_DEFECTIVE_TOKEN; 103 return GSS_S_DEFECTIVE_TOKEN;
104 104
105 sealalg = ptr[2] + (ptr[3] << 8); 105 sealalg = ptr[4] + (ptr[5] << 8);
106 if (sealalg != SEAL_ALG_NONE) 106 if (sealalg != SEAL_ALG_NONE)
107 return GSS_S_DEFECTIVE_TOKEN; 107 return GSS_S_DEFECTIVE_TOKEN;
108 108
109 if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) 109 if ((ptr[6] != 0xff) || (ptr[7] != 0xff))
110 return GSS_S_DEFECTIVE_TOKEN; 110 return GSS_S_DEFECTIVE_TOKEN;
111 111
112 if (make_checksum("md5", ptr - 2, 8, message_buffer, 0, &md5cksum)) 112 if (make_checksum("md5", ptr, 8, message_buffer, 0, &md5cksum))
113 return GSS_S_FAILURE; 113 return GSS_S_FAILURE;
114 114
115 if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, 16)) 115 if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, md5cksum.data, 16))
116 return GSS_S_FAILURE; 116 return GSS_S_FAILURE;
117 117
118 if (memcmp(md5cksum.data + 8, ptr + 14, 8)) 118 if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8))
119 return GSS_S_BAD_SIG; 119 return GSS_S_BAD_SIG;
120 120
121 /* it got through unscathed. Make sure the context is unexpired */ 121 /* it got through unscathed. Make sure the context is unexpired */
@@ -127,7 +127,7 @@ gss_verify_mic_kerberos(struct gss_ctx *gss_ctx,
127 127
128 /* do sequencing checks */ 128 /* do sequencing checks */
129 129
130 if (krb5_get_seq_num(ctx->seq, ptr + 14, ptr + 6, &direction, &seqnum)) 130 if (krb5_get_seq_num(ctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8, &direction, &seqnum))
131 return GSS_S_FAILURE; 131 return GSS_S_FAILURE;
132 132
133 if ((ctx->initiate && direction != 0xff) || 133 if ((ctx->initiate && direction != 0xff) ||
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index b00b1b426301..ae8e69b59c4c 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -87,8 +87,8 @@ out:
87 return 0; 87 return 0;
88} 88}
89 89
90static inline void 90static void
91make_confounder(char *p, int blocksize) 91make_confounder(char *p, u32 conflen)
92{ 92{
93 static u64 i = 0; 93 static u64 i = 0;
94 u64 *q = (u64 *)p; 94 u64 *q = (u64 *)p;
@@ -102,8 +102,22 @@ make_confounder(char *p, int blocksize)
102 * uniqueness would mean worrying about atomicity and rollover, and I 102 * uniqueness would mean worrying about atomicity and rollover, and I
103 * don't care enough. */ 103 * don't care enough. */
104 104
105 BUG_ON(blocksize != 8); 105 /* initialize to random value */
106 *q = i++; 106 if (i == 0) {
107 i = random32();
108 i = (i << 32) | random32();
109 }
110
111 switch (conflen) {
112 case 16:
113 *q++ = i++;
114 /* fall through */
115 case 8:
116 *q++ = i++;
117 break;
118 default:
119 BUG();
120 }
107} 121}
108 122
109/* Assumptions: the head and tail of inbuf are ours to play with. 123/* Assumptions: the head and tail of inbuf are ours to play with.
@@ -122,7 +136,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
122 char cksumdata[16]; 136 char cksumdata[16];
123 struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata}; 137 struct xdr_netobj md5cksum = {.len = 0, .data = cksumdata};
124 int blocksize = 0, plainlen; 138 int blocksize = 0, plainlen;
125 unsigned char *ptr, *krb5_hdr, *msg_start; 139 unsigned char *ptr, *msg_start;
126 s32 now; 140 s32 now;
127 int headlen; 141 int headlen;
128 struct page **tmp_pages; 142 struct page **tmp_pages;
@@ -149,26 +163,26 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
149 buf->len += headlen; 163 buf->len += headlen;
150 BUG_ON((buf->len - offset - headlen) % blocksize); 164 BUG_ON((buf->len - offset - headlen) % blocksize);
151 165
152 g_make_token_header(&kctx->mech_used, 24 + plainlen, &ptr); 166 g_make_token_header(&kctx->mech_used,
167 GSS_KRB5_TOK_HDR_LEN + 8 + plainlen, &ptr);
153 168
154 169
155 *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff); 170 /* ptr now at header described in rfc 1964, section 1.2.1: */
156 *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff); 171 ptr[0] = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff);
172 ptr[1] = (unsigned char) (KG_TOK_WRAP_MSG & 0xff);
157 173
158 /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ 174 msg_start = ptr + 24;
159 krb5_hdr = ptr - 2;
160 msg_start = krb5_hdr + 24;
161 175
162 *(__be16 *)(krb5_hdr + 2) = htons(SGN_ALG_DES_MAC_MD5); 176 *(__be16 *)(ptr + 2) = htons(SGN_ALG_DES_MAC_MD5);
163 memset(krb5_hdr + 4, 0xff, 4); 177 memset(ptr + 4, 0xff, 4);
164 *(__be16 *)(krb5_hdr + 4) = htons(SEAL_ALG_DES); 178 *(__be16 *)(ptr + 4) = htons(SEAL_ALG_DES);
165 179
166 make_confounder(msg_start, blocksize); 180 make_confounder(msg_start, blocksize);
167 181
168 /* XXXJBF: UGH!: */ 182 /* XXXJBF: UGH!: */
169 tmp_pages = buf->pages; 183 tmp_pages = buf->pages;
170 buf->pages = pages; 184 buf->pages = pages;
171 if (make_checksum("md5", krb5_hdr, 8, buf, 185 if (make_checksum("md5", ptr, 8, buf,
172 offset + headlen - blocksize, &md5cksum)) 186 offset + headlen - blocksize, &md5cksum))
173 return GSS_S_FAILURE; 187 return GSS_S_FAILURE;
174 buf->pages = tmp_pages; 188 buf->pages = tmp_pages;
@@ -176,7 +190,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
176 if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, 190 if (krb5_encrypt(kctx->seq, NULL, md5cksum.data,
177 md5cksum.data, md5cksum.len)) 191 md5cksum.data, md5cksum.len))
178 return GSS_S_FAILURE; 192 return GSS_S_FAILURE;
179 memcpy(krb5_hdr + 16, md5cksum.data + md5cksum.len - 8, 8); 193 memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data + md5cksum.len - 8, 8);
180 194
181 spin_lock(&krb5_seq_lock); 195 spin_lock(&krb5_seq_lock);
182 seq_send = kctx->seq_send++; 196 seq_send = kctx->seq_send++;
@@ -185,7 +199,7 @@ gss_wrap_kerberos(struct gss_ctx *ctx, int offset,
185 /* XXX would probably be more efficient to compute checksum 199 /* XXX would probably be more efficient to compute checksum
186 * and encrypt at the same time: */ 200 * and encrypt at the same time: */
187 if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff, 201 if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff,
188 seq_send, krb5_hdr + 16, krb5_hdr + 8))) 202 seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)))
189 return GSS_S_FAILURE; 203 return GSS_S_FAILURE;
190 204
191 if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, 205 if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize,
@@ -219,38 +233,38 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf)
219 buf->len - offset)) 233 buf->len - offset))
220 return GSS_S_DEFECTIVE_TOKEN; 234 return GSS_S_DEFECTIVE_TOKEN;
221 235
222 if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) || 236 if ((ptr[0] != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) ||
223 (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) ) 237 (ptr[1] != (KG_TOK_WRAP_MSG & 0xff)))
224 return GSS_S_DEFECTIVE_TOKEN; 238 return GSS_S_DEFECTIVE_TOKEN;
225 239
226 /* XXX sanity-check bodysize?? */ 240 /* XXX sanity-check bodysize?? */
227 241
228 /* get the sign and seal algorithms */ 242 /* get the sign and seal algorithms */
229 243
230 signalg = ptr[0] + (ptr[1] << 8); 244 signalg = ptr[2] + (ptr[3] << 8);
231 if (signalg != SGN_ALG_DES_MAC_MD5) 245 if (signalg != SGN_ALG_DES_MAC_MD5)
232 return GSS_S_DEFECTIVE_TOKEN; 246 return GSS_S_DEFECTIVE_TOKEN;
233 247
234 sealalg = ptr[2] + (ptr[3] << 8); 248 sealalg = ptr[4] + (ptr[5] << 8);
235 if (sealalg != SEAL_ALG_DES) 249 if (sealalg != SEAL_ALG_DES)
236 return GSS_S_DEFECTIVE_TOKEN; 250 return GSS_S_DEFECTIVE_TOKEN;
237 251
238 if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) 252 if ((ptr[6] != 0xff) || (ptr[7] != 0xff))
239 return GSS_S_DEFECTIVE_TOKEN; 253 return GSS_S_DEFECTIVE_TOKEN;
240 254
241 if (gss_decrypt_xdr_buf(kctx->enc, buf, 255 if (gss_decrypt_xdr_buf(kctx->enc, buf,
242 ptr + 22 - (unsigned char *)buf->head[0].iov_base)) 256 ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base))
243 return GSS_S_DEFECTIVE_TOKEN; 257 return GSS_S_DEFECTIVE_TOKEN;
244 258
245 if (make_checksum("md5", ptr - 2, 8, buf, 259 if (make_checksum("md5", ptr, 8, buf,
246 ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum)) 260 ptr + GSS_KRB5_TOK_HDR_LEN + 8 - (unsigned char *)buf->head[0].iov_base, &md5cksum))
247 return GSS_S_FAILURE; 261 return GSS_S_FAILURE;
248 262
249 if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, 263 if (krb5_encrypt(kctx->seq, NULL, md5cksum.data,
250 md5cksum.data, md5cksum.len)) 264 md5cksum.data, md5cksum.len))
251 return GSS_S_FAILURE; 265 return GSS_S_FAILURE;
252 266
253 if (memcmp(md5cksum.data + 8, ptr + 14, 8)) 267 if (memcmp(md5cksum.data + 8, ptr + GSS_KRB5_TOK_HDR_LEN, 8))
254 return GSS_S_BAD_SIG; 268 return GSS_S_BAD_SIG;
255 269
256 /* it got through unscathed. Make sure the context is unexpired */ 270 /* it got through unscathed. Make sure the context is unexpired */
@@ -262,8 +276,8 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf)
262 276
263 /* do sequencing checks */ 277 /* do sequencing checks */
264 278
265 if (krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction, 279 if (krb5_get_seq_num(kctx->seq, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8,
266 &seqnum)) 280 &direction, &seqnum))
267 return GSS_S_BAD_SIG; 281 return GSS_S_BAD_SIG;
268 282
269 if ((kctx->initiate && direction != 0xff) || 283 if ((kctx->initiate && direction != 0xff) ||
@@ -274,7 +288,7 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf)
274 * better to copy and encrypt at the same time. */ 288 * better to copy and encrypt at the same time. */
275 289
276 blocksize = crypto_blkcipher_blocksize(kctx->enc); 290 blocksize = crypto_blkcipher_blocksize(kctx->enc);
277 data_start = ptr + 22 + blocksize; 291 data_start = ptr + GSS_KRB5_TOK_HDR_LEN + 8 + blocksize;
278 orig_start = buf->head[0].iov_base + offset; 292 orig_start = buf->head[0].iov_base + offset;
279 data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; 293 data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start;
280 memmove(orig_start, data_start, data_len); 294 memmove(orig_start, data_start, data_len);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 01c7e311b904..5a32cb7c4bb4 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -18,6 +18,7 @@
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/kthread.h>
21 22
22#include <linux/sunrpc/types.h> 23#include <linux/sunrpc/types.h>
23#include <linux/sunrpc/xdr.h> 24#include <linux/sunrpc/xdr.h>
@@ -291,15 +292,14 @@ svc_pool_map_put(void)
291 292
292 293
293/* 294/*
294 * Set the current thread's cpus_allowed mask so that it 295 * Set the given thread's cpus_allowed mask so that it
295 * will only run on cpus in the given pool. 296 * will only run on cpus in the given pool.
296 *
297 * Returns 1 and fills in oldmask iff a cpumask was applied.
298 */ 297 */
299static inline int 298static inline void
300svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) 299svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
301{ 300{
302 struct svc_pool_map *m = &svc_pool_map; 301 struct svc_pool_map *m = &svc_pool_map;
302 unsigned int node = m->pool_to[pidx];
303 303
304 /* 304 /*
305 * The caller checks for sv_nrpools > 1, which 305 * The caller checks for sv_nrpools > 1, which
@@ -307,26 +307,17 @@ svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
307 */ 307 */
308 BUG_ON(m->count == 0); 308 BUG_ON(m->count == 0);
309 309
310 switch (m->mode) 310 switch (m->mode) {
311 {
312 default:
313 return 0;
314 case SVC_POOL_PERCPU: 311 case SVC_POOL_PERCPU:
315 { 312 {
316 unsigned int cpu = m->pool_to[pidx]; 313 set_cpus_allowed_ptr(task, &cpumask_of_cpu(node));
317 314 break;
318 *oldmask = current->cpus_allowed;
319 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
320 return 1;
321 } 315 }
322 case SVC_POOL_PERNODE: 316 case SVC_POOL_PERNODE:
323 { 317 {
324 unsigned int node = m->pool_to[pidx];
325 node_to_cpumask_ptr(nodecpumask, node); 318 node_to_cpumask_ptr(nodecpumask, node);
326 319 set_cpus_allowed_ptr(task, nodecpumask);
327 *oldmask = current->cpus_allowed; 320 break;
328 set_cpus_allowed_ptr(current, nodecpumask);
329 return 1;
330 } 321 }
331 } 322 }
332} 323}
@@ -443,7 +434,7 @@ EXPORT_SYMBOL(svc_create);
443struct svc_serv * 434struct svc_serv *
444svc_create_pooled(struct svc_program *prog, unsigned int bufsize, 435svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
445 void (*shutdown)(struct svc_serv *serv), 436 void (*shutdown)(struct svc_serv *serv),
446 svc_thread_fn func, int sig, struct module *mod) 437 svc_thread_fn func, struct module *mod)
447{ 438{
448 struct svc_serv *serv; 439 struct svc_serv *serv;
449 unsigned int npools = svc_pool_map_get(); 440 unsigned int npools = svc_pool_map_get();
@@ -452,7 +443,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
452 443
453 if (serv != NULL) { 444 if (serv != NULL) {
454 serv->sv_function = func; 445 serv->sv_function = func;
455 serv->sv_kill_signal = sig;
456 serv->sv_module = mod; 446 serv->sv_module = mod;
457 } 447 }
458 448
@@ -461,7 +451,8 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
461EXPORT_SYMBOL(svc_create_pooled); 451EXPORT_SYMBOL(svc_create_pooled);
462 452
463/* 453/*
464 * Destroy an RPC service. Should be called with the BKL held 454 * Destroy an RPC service. Should be called with appropriate locking to
455 * protect the sv_nrthreads, sv_permsocks and sv_tempsocks.
465 */ 456 */
466void 457void
467svc_destroy(struct svc_serv *serv) 458svc_destroy(struct svc_serv *serv)
@@ -578,46 +569,6 @@ out_enomem:
578EXPORT_SYMBOL(svc_prepare_thread); 569EXPORT_SYMBOL(svc_prepare_thread);
579 570
580/* 571/*
581 * Create a thread in the given pool. Caller must hold BKL.
582 * On a NUMA or SMP machine, with a multi-pool serv, the thread
583 * will be restricted to run on the cpus belonging to the pool.
584 */
585static int
586__svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
587 struct svc_pool *pool)
588{
589 struct svc_rqst *rqstp;
590 int error = -ENOMEM;
591 int have_oldmask = 0;
592 cpumask_t uninitialized_var(oldmask);
593
594 rqstp = svc_prepare_thread(serv, pool);
595 if (IS_ERR(rqstp)) {
596 error = PTR_ERR(rqstp);
597 goto out;
598 }
599
600 if (serv->sv_nrpools > 1)
601 have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
602
603 error = kernel_thread((int (*)(void *)) func, rqstp, 0);
604
605 if (have_oldmask)
606 set_cpus_allowed(current, oldmask);
607
608 if (error < 0)
609 goto out_thread;
610 svc_sock_update_bufs(serv);
611 error = 0;
612out:
613 return error;
614
615out_thread:
616 svc_exit_thread(rqstp);
617 goto out;
618}
619
620/*
621 * Choose a pool in which to create a new thread, for svc_set_num_threads 572 * Choose a pool in which to create a new thread, for svc_set_num_threads
622 */ 573 */
623static inline struct svc_pool * 574static inline struct svc_pool *
@@ -674,7 +625,7 @@ found_pool:
674 * of threads the given number. If `pool' is non-NULL, applies 625 * of threads the given number. If `pool' is non-NULL, applies
675 * only to threads in that pool, otherwise round-robins between 626 * only to threads in that pool, otherwise round-robins between
676 * all pools. Must be called with a svc_get() reference and 627 * all pools. Must be called with a svc_get() reference and
677 * the BKL held. 628 * the BKL or another lock to protect access to svc_serv fields.
678 * 629 *
679 * Destroying threads relies on the service threads filling in 630 * Destroying threads relies on the service threads filling in
680 * rqstp->rq_task, which only the nfs ones do. Assumes the serv 631 * rqstp->rq_task, which only the nfs ones do. Assumes the serv
@@ -686,7 +637,9 @@ found_pool:
686int 637int
687svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) 638svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
688{ 639{
689 struct task_struct *victim; 640 struct svc_rqst *rqstp;
641 struct task_struct *task;
642 struct svc_pool *chosen_pool;
690 int error = 0; 643 int error = 0;
691 unsigned int state = serv->sv_nrthreads-1; 644 unsigned int state = serv->sv_nrthreads-1;
692 645
@@ -702,18 +655,34 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
702 /* create new threads */ 655 /* create new threads */
703 while (nrservs > 0) { 656 while (nrservs > 0) {
704 nrservs--; 657 nrservs--;
658 chosen_pool = choose_pool(serv, pool, &state);
659
660 rqstp = svc_prepare_thread(serv, chosen_pool);
661 if (IS_ERR(rqstp)) {
662 error = PTR_ERR(rqstp);
663 break;
664 }
665
705 __module_get(serv->sv_module); 666 __module_get(serv->sv_module);
706 error = __svc_create_thread(serv->sv_function, serv, 667 task = kthread_create(serv->sv_function, rqstp, serv->sv_name);
707 choose_pool(serv, pool, &state)); 668 if (IS_ERR(task)) {
708 if (error < 0) { 669 error = PTR_ERR(task);
709 module_put(serv->sv_module); 670 module_put(serv->sv_module);
671 svc_exit_thread(rqstp);
710 break; 672 break;
711 } 673 }
674
675 rqstp->rq_task = task;
676 if (serv->sv_nrpools > 1)
677 svc_pool_map_set_cpumask(task, chosen_pool->sp_id);
678
679 svc_sock_update_bufs(serv);
680 wake_up_process(task);
712 } 681 }
713 /* destroy old threads */ 682 /* destroy old threads */
714 while (nrservs < 0 && 683 while (nrservs < 0 &&
715 (victim = choose_victim(serv, pool, &state)) != NULL) { 684 (task = choose_victim(serv, pool, &state)) != NULL) {
716 send_sig(serv->sv_kill_signal, victim, 1); 685 send_sig(SIGINT, task, 1);
717 nrservs++; 686 nrservs++;
718 } 687 }
719 688
@@ -722,7 +691,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
722EXPORT_SYMBOL(svc_set_num_threads); 691EXPORT_SYMBOL(svc_set_num_threads);
723 692
724/* 693/*
725 * Called from a server thread as it's exiting. Caller must hold BKL. 694 * Called from a server thread as it's exiting. Caller must hold the BKL or
695 * the "service mutex", whichever is appropriate for the service.
726 */ 696 */
727void 697void
728svc_exit_thread(struct svc_rqst *rqstp) 698svc_exit_thread(struct svc_rqst *rqstp)
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 88c0ca20bb1e..87101177825b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -69,6 +69,10 @@ atomic_t rdma_stat_rq_prod;
69atomic_t rdma_stat_sq_poll; 69atomic_t rdma_stat_sq_poll;
70atomic_t rdma_stat_sq_prod; 70atomic_t rdma_stat_sq_prod;
71 71
72/* Temporary NFS request map and context caches */
73struct kmem_cache *svc_rdma_map_cachep;
74struct kmem_cache *svc_rdma_ctxt_cachep;
75
72/* 76/*
73 * This function implements reading and resetting an atomic_t stat 77 * This function implements reading and resetting an atomic_t stat
74 * variable through read/write to a proc file. Any write to the file 78 * variable through read/write to a proc file. Any write to the file
@@ -236,11 +240,14 @@ static ctl_table svcrdma_root_table[] = {
236void svc_rdma_cleanup(void) 240void svc_rdma_cleanup(void)
237{ 241{
238 dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); 242 dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
243 flush_scheduled_work();
239 if (svcrdma_table_header) { 244 if (svcrdma_table_header) {
240 unregister_sysctl_table(svcrdma_table_header); 245 unregister_sysctl_table(svcrdma_table_header);
241 svcrdma_table_header = NULL; 246 svcrdma_table_header = NULL;
242 } 247 }
243 svc_unreg_xprt_class(&svc_rdma_class); 248 svc_unreg_xprt_class(&svc_rdma_class);
249 kmem_cache_destroy(svc_rdma_map_cachep);
250 kmem_cache_destroy(svc_rdma_ctxt_cachep);
244} 251}
245 252
246int svc_rdma_init(void) 253int svc_rdma_init(void)
@@ -255,9 +262,37 @@ int svc_rdma_init(void)
255 svcrdma_table_header = 262 svcrdma_table_header =
256 register_sysctl_table(svcrdma_root_table); 263 register_sysctl_table(svcrdma_root_table);
257 264
265 /* Create the temporary map cache */
266 svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
267 sizeof(struct svc_rdma_req_map),
268 0,
269 SLAB_HWCACHE_ALIGN,
270 NULL);
271 if (!svc_rdma_map_cachep) {
272 printk(KERN_INFO "Could not allocate map cache.\n");
273 goto err0;
274 }
275
276 /* Create the temporary context cache */
277 svc_rdma_ctxt_cachep =
278 kmem_cache_create("svc_rdma_ctxt_cache",
279 sizeof(struct svc_rdma_op_ctxt),
280 0,
281 SLAB_HWCACHE_ALIGN,
282 NULL);
283 if (!svc_rdma_ctxt_cachep) {
284 printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
285 goto err1;
286 }
287
258 /* Register RDMA with the SVC transport switch */ 288 /* Register RDMA with the SVC transport switch */
259 svc_reg_xprt_class(&svc_rdma_class); 289 svc_reg_xprt_class(&svc_rdma_class);
260 return 0; 290 return 0;
291 err1:
292 kmem_cache_destroy(svc_rdma_map_cachep);
293 err0:
294 unregister_sysctl_table(svcrdma_table_header);
295 return -ENOMEM;
261} 296}
262MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); 297MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
263MODULE_DESCRIPTION("SVC RDMA Transport"); 298MODULE_DESCRIPTION("SVC RDMA Transport");
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 06ab4841537b..b4b17f44cb29 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -112,11 +112,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
112 rqstp->rq_arg.tail[0].iov_len = 0; 112 rqstp->rq_arg.tail[0].iov_len = 0;
113} 113}
114 114
115struct chunk_sge {
116 int start; /* sge no for this chunk */
117 int count; /* sge count for this chunk */
118};
119
120/* Encode a read-chunk-list as an array of IB SGE 115/* Encode a read-chunk-list as an array of IB SGE
121 * 116 *
122 * Assumptions: 117 * Assumptions:
@@ -134,8 +129,8 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
134 struct svc_rqst *rqstp, 129 struct svc_rqst *rqstp,
135 struct svc_rdma_op_ctxt *head, 130 struct svc_rdma_op_ctxt *head,
136 struct rpcrdma_msg *rmsgp, 131 struct rpcrdma_msg *rmsgp,
137 struct ib_sge *sge, 132 struct svc_rdma_req_map *rpl_map,
138 struct chunk_sge *ch_sge_ary, 133 struct svc_rdma_req_map *chl_map,
139 int ch_count, 134 int ch_count,
140 int byte_count) 135 int byte_count)
141{ 136{
@@ -156,22 +151,18 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
156 head->arg.head[0] = rqstp->rq_arg.head[0]; 151 head->arg.head[0] = rqstp->rq_arg.head[0];
157 head->arg.tail[0] = rqstp->rq_arg.tail[0]; 152 head->arg.tail[0] = rqstp->rq_arg.tail[0];
158 head->arg.pages = &head->pages[head->count]; 153 head->arg.pages = &head->pages[head->count];
159 head->sge[0].length = head->count; /* save count of hdr pages */ 154 head->hdr_count = head->count; /* save count of hdr pages */
160 head->arg.page_base = 0; 155 head->arg.page_base = 0;
161 head->arg.page_len = ch_bytes; 156 head->arg.page_len = ch_bytes;
162 head->arg.len = rqstp->rq_arg.len + ch_bytes; 157 head->arg.len = rqstp->rq_arg.len + ch_bytes;
163 head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; 158 head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
164 head->count++; 159 head->count++;
165 ch_sge_ary[0].start = 0; 160 chl_map->ch[0].start = 0;
166 while (byte_count) { 161 while (byte_count) {
162 rpl_map->sge[sge_no].iov_base =
163 page_address(rqstp->rq_arg.pages[page_no]) + page_off;
167 sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); 164 sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
168 sge[sge_no].addr = 165 rpl_map->sge[sge_no].iov_len = sge_bytes;
169 ib_dma_map_page(xprt->sc_cm_id->device,
170 rqstp->rq_arg.pages[page_no],
171 page_off, sge_bytes,
172 DMA_FROM_DEVICE);
173 sge[sge_no].length = sge_bytes;
174 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
175 /* 166 /*
176 * Don't bump head->count here because the same page 167 * Don't bump head->count here because the same page
177 * may be used by multiple SGE. 168 * may be used by multiple SGE.
@@ -187,11 +178,11 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
187 * SGE, move to the next SGE 178 * SGE, move to the next SGE
188 */ 179 */
189 if (ch_bytes == 0) { 180 if (ch_bytes == 0) {
190 ch_sge_ary[ch_no].count = 181 chl_map->ch[ch_no].count =
191 sge_no - ch_sge_ary[ch_no].start; 182 sge_no - chl_map->ch[ch_no].start;
192 ch_no++; 183 ch_no++;
193 ch++; 184 ch++;
194 ch_sge_ary[ch_no].start = sge_no; 185 chl_map->ch[ch_no].start = sge_no;
195 ch_bytes = ch->rc_target.rs_length; 186 ch_bytes = ch->rc_target.rs_length;
196 /* If bytes remaining account for next chunk */ 187 /* If bytes remaining account for next chunk */
197 if (byte_count) { 188 if (byte_count) {
@@ -220,18 +211,25 @@ static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
220 return sge_no; 211 return sge_no;
221} 212}
222 213
223static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt, 214static void rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
224 struct ib_sge *sge, 215 struct svc_rdma_op_ctxt *ctxt,
216 struct kvec *vec,
225 u64 *sgl_offset, 217 u64 *sgl_offset,
226 int count) 218 int count)
227{ 219{
228 int i; 220 int i;
229 221
230 ctxt->count = count; 222 ctxt->count = count;
223 ctxt->direction = DMA_FROM_DEVICE;
231 for (i = 0; i < count; i++) { 224 for (i = 0; i < count; i++) {
232 ctxt->sge[i].addr = sge[i].addr; 225 atomic_inc(&xprt->sc_dma_used);
233 ctxt->sge[i].length = sge[i].length; 226 ctxt->sge[i].addr =
234 *sgl_offset = *sgl_offset + sge[i].length; 227 ib_dma_map_single(xprt->sc_cm_id->device,
228 vec[i].iov_base, vec[i].iov_len,
229 DMA_FROM_DEVICE);
230 ctxt->sge[i].length = vec[i].iov_len;
231 ctxt->sge[i].lkey = xprt->sc_phys_mr->lkey;
232 *sgl_offset = *sgl_offset + vec[i].iov_len;
235 } 233 }
236} 234}
237 235
@@ -282,34 +280,29 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
282 struct ib_send_wr read_wr; 280 struct ib_send_wr read_wr;
283 int err = 0; 281 int err = 0;
284 int ch_no; 282 int ch_no;
285 struct ib_sge *sge;
286 int ch_count; 283 int ch_count;
287 int byte_count; 284 int byte_count;
288 int sge_count; 285 int sge_count;
289 u64 sgl_offset; 286 u64 sgl_offset;
290 struct rpcrdma_read_chunk *ch; 287 struct rpcrdma_read_chunk *ch;
291 struct svc_rdma_op_ctxt *ctxt = NULL; 288 struct svc_rdma_op_ctxt *ctxt = NULL;
292 struct svc_rdma_op_ctxt *tmp_sge_ctxt; 289 struct svc_rdma_req_map *rpl_map;
293 struct svc_rdma_op_ctxt *tmp_ch_ctxt; 290 struct svc_rdma_req_map *chl_map;
294 struct chunk_sge *ch_sge_ary;
295 291
296 /* If no read list is present, return 0 */ 292 /* If no read list is present, return 0 */
297 ch = svc_rdma_get_read_chunk(rmsgp); 293 ch = svc_rdma_get_read_chunk(rmsgp);
298 if (!ch) 294 if (!ch)
299 return 0; 295 return 0;
300 296
301 /* Allocate temporary contexts to keep SGE */ 297 /* Allocate temporary reply and chunk maps */
302 BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge)); 298 rpl_map = svc_rdma_get_req_map();
303 tmp_sge_ctxt = svc_rdma_get_context(xprt); 299 chl_map = svc_rdma_get_req_map();
304 sge = tmp_sge_ctxt->sge;
305 tmp_ch_ctxt = svc_rdma_get_context(xprt);
306 ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge;
307 300
308 svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); 301 svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
309 if (ch_count > RPCSVC_MAXPAGES) 302 if (ch_count > RPCSVC_MAXPAGES)
310 return -EINVAL; 303 return -EINVAL;
311 sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, 304 sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
312 sge, ch_sge_ary, 305 rpl_map, chl_map,
313 ch_count, byte_count); 306 ch_count, byte_count);
314 sgl_offset = 0; 307 sgl_offset = 0;
315 ch_no = 0; 308 ch_no = 0;
@@ -331,14 +324,15 @@ next_sge:
331 read_wr.wr.rdma.remote_addr = 324 read_wr.wr.rdma.remote_addr =
332 get_unaligned(&(ch->rc_target.rs_offset)) + 325 get_unaligned(&(ch->rc_target.rs_offset)) +
333 sgl_offset; 326 sgl_offset;
334 read_wr.sg_list = &sge[ch_sge_ary[ch_no].start]; 327 read_wr.sg_list = ctxt->sge;
335 read_wr.num_sge = 328 read_wr.num_sge =
336 rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count); 329 rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
337 rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start], 330 rdma_set_ctxt_sge(xprt, ctxt,
331 &rpl_map->sge[chl_map->ch[ch_no].start],
338 &sgl_offset, 332 &sgl_offset,
339 read_wr.num_sge); 333 read_wr.num_sge);
340 if (((ch+1)->rc_discrim == 0) && 334 if (((ch+1)->rc_discrim == 0) &&
341 (read_wr.num_sge == ch_sge_ary[ch_no].count)) { 335 (read_wr.num_sge == chl_map->ch[ch_no].count)) {
342 /* 336 /*
343 * Mark the last RDMA_READ with a bit to 337 * Mark the last RDMA_READ with a bit to
344 * indicate all RPC data has been fetched from 338 * indicate all RPC data has been fetched from
@@ -358,9 +352,9 @@ next_sge:
358 } 352 }
359 atomic_inc(&rdma_stat_read); 353 atomic_inc(&rdma_stat_read);
360 354
361 if (read_wr.num_sge < ch_sge_ary[ch_no].count) { 355 if (read_wr.num_sge < chl_map->ch[ch_no].count) {
362 ch_sge_ary[ch_no].count -= read_wr.num_sge; 356 chl_map->ch[ch_no].count -= read_wr.num_sge;
363 ch_sge_ary[ch_no].start += read_wr.num_sge; 357 chl_map->ch[ch_no].start += read_wr.num_sge;
364 goto next_sge; 358 goto next_sge;
365 } 359 }
366 sgl_offset = 0; 360 sgl_offset = 0;
@@ -368,8 +362,8 @@ next_sge:
368 } 362 }
369 363
370 out: 364 out:
371 svc_rdma_put_context(tmp_sge_ctxt, 0); 365 svc_rdma_put_req_map(rpl_map);
372 svc_rdma_put_context(tmp_ch_ctxt, 0); 366 svc_rdma_put_req_map(chl_map);
373 367
374 /* Detach arg pages. svc_recv will replenish them */ 368 /* Detach arg pages. svc_recv will replenish them */
375 for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) 369 for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
@@ -399,7 +393,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
399 rqstp->rq_pages[page_no] = head->pages[page_no]; 393 rqstp->rq_pages[page_no] = head->pages[page_no];
400 } 394 }
401 /* Point rq_arg.pages past header */ 395 /* Point rq_arg.pages past header */
402 rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length]; 396 rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
403 rqstp->rq_arg.page_len = head->arg.page_len; 397 rqstp->rq_arg.page_len = head->arg.page_len;
404 rqstp->rq_arg.page_base = head->arg.page_base; 398 rqstp->rq_arg.page_base = head->arg.page_base;
405 399
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index fb82b1b683f8..a19b22b452a3 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -63,52 +63,44 @@
63 * SGE[2..sge_count-2] data from xdr->pages[] 63 * SGE[2..sge_count-2] data from xdr->pages[]
64 * SGE[sge_count-1] data from xdr->tail. 64 * SGE[sge_count-1] data from xdr->tail.
65 * 65 *
66 * The max SGE we need is the length of the XDR / pagesize + one for
67 * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES
68 * reserves a page for both the request and the reply header, and this
69 * array is only concerned with the reply we are assured that we have
70 * on extra page for the RPCRMDA header.
66 */ 71 */
67static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt, 72static void xdr_to_sge(struct svcxprt_rdma *xprt,
68 struct xdr_buf *xdr, 73 struct xdr_buf *xdr,
69 struct ib_sge *sge, 74 struct svc_rdma_req_map *vec)
70 int *sge_count)
71{ 75{
72 /* Max we need is the length of the XDR / pagesize + one for
73 * head + one for tail + one for RPCRDMA header
74 */
75 int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; 76 int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
76 int sge_no; 77 int sge_no;
77 u32 byte_count = xdr->len;
78 u32 sge_bytes; 78 u32 sge_bytes;
79 u32 page_bytes; 79 u32 page_bytes;
80 int page_off; 80 u32 page_off;
81 int page_no; 81 int page_no;
82 82
83 BUG_ON(xdr->len !=
84 (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
85
83 /* Skip the first sge, this is for the RPCRDMA header */ 86 /* Skip the first sge, this is for the RPCRDMA header */
84 sge_no = 1; 87 sge_no = 1;
85 88
86 /* Head SGE */ 89 /* Head SGE */
87 sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device, 90 vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
88 xdr->head[0].iov_base, 91 vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
89 xdr->head[0].iov_len,
90 DMA_TO_DEVICE);
91 sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len);
92 byte_count -= sge_bytes;
93 sge[sge_no].length = sge_bytes;
94 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
95 sge_no++; 92 sge_no++;
96 93
97 /* pages SGE */ 94 /* pages SGE */
98 page_no = 0; 95 page_no = 0;
99 page_bytes = xdr->page_len; 96 page_bytes = xdr->page_len;
100 page_off = xdr->page_base; 97 page_off = xdr->page_base;
101 while (byte_count && page_bytes) { 98 while (page_bytes) {
102 sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off)); 99 vec->sge[sge_no].iov_base =
103 sge[sge_no].addr = 100 page_address(xdr->pages[page_no]) + page_off;
104 ib_dma_map_page(xprt->sc_cm_id->device, 101 sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
105 xdr->pages[page_no], page_off,
106 sge_bytes, DMA_TO_DEVICE);
107 sge_bytes = min(sge_bytes, page_bytes);
108 byte_count -= sge_bytes;
109 page_bytes -= sge_bytes; 102 page_bytes -= sge_bytes;
110 sge[sge_no].length = sge_bytes; 103 vec->sge[sge_no].iov_len = sge_bytes;
111 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
112 104
113 sge_no++; 105 sge_no++;
114 page_no++; 106 page_no++;
@@ -116,36 +108,24 @@ static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
116 } 108 }
117 109
118 /* Tail SGE */ 110 /* Tail SGE */
119 if (byte_count && xdr->tail[0].iov_len) { 111 if (xdr->tail[0].iov_len) {
120 sge[sge_no].addr = 112 vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
121 ib_dma_map_single(xprt->sc_cm_id->device, 113 vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
122 xdr->tail[0].iov_base,
123 xdr->tail[0].iov_len,
124 DMA_TO_DEVICE);
125 sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len);
126 byte_count -= sge_bytes;
127 sge[sge_no].length = sge_bytes;
128 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
129 sge_no++; 114 sge_no++;
130 } 115 }
131 116
132 BUG_ON(sge_no > sge_max); 117 BUG_ON(sge_no > sge_max);
133 BUG_ON(byte_count != 0); 118 vec->count = sge_no;
134
135 *sge_count = sge_no;
136 return sge;
137} 119}
138 120
139
140/* Assumptions: 121/* Assumptions:
141 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE 122 * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
142 */ 123 */
143static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, 124static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
144 u32 rmr, u64 to, 125 u32 rmr, u64 to,
145 u32 xdr_off, int write_len, 126 u32 xdr_off, int write_len,
146 struct ib_sge *xdr_sge, int sge_count) 127 struct svc_rdma_req_map *vec)
147{ 128{
148 struct svc_rdma_op_ctxt *tmp_sge_ctxt;
149 struct ib_send_wr write_wr; 129 struct ib_send_wr write_wr;
150 struct ib_sge *sge; 130 struct ib_sge *sge;
151 int xdr_sge_no; 131 int xdr_sge_no;
@@ -154,25 +134,23 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
154 int sge_off; 134 int sge_off;
155 int bc; 135 int bc;
156 struct svc_rdma_op_ctxt *ctxt; 136 struct svc_rdma_op_ctxt *ctxt;
157 int ret = 0;
158 137
159 BUG_ON(sge_count > RPCSVC_MAXPAGES); 138 BUG_ON(vec->count > RPCSVC_MAXPAGES);
160 dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " 139 dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
161 "write_len=%d, xdr_sge=%p, sge_count=%d\n", 140 "write_len=%d, vec->sge=%p, vec->count=%lu\n",
162 rmr, (unsigned long long)to, xdr_off, 141 rmr, (unsigned long long)to, xdr_off,
163 write_len, xdr_sge, sge_count); 142 write_len, vec->sge, vec->count);
164 143
165 ctxt = svc_rdma_get_context(xprt); 144 ctxt = svc_rdma_get_context(xprt);
166 ctxt->count = 0; 145 ctxt->direction = DMA_TO_DEVICE;
167 tmp_sge_ctxt = svc_rdma_get_context(xprt); 146 sge = ctxt->sge;
168 sge = tmp_sge_ctxt->sge;
169 147
170 /* Find the SGE associated with xdr_off */ 148 /* Find the SGE associated with xdr_off */
171 for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count; 149 for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
172 xdr_sge_no++) { 150 xdr_sge_no++) {
173 if (xdr_sge[xdr_sge_no].length > bc) 151 if (vec->sge[xdr_sge_no].iov_len > bc)
174 break; 152 break;
175 bc -= xdr_sge[xdr_sge_no].length; 153 bc -= vec->sge[xdr_sge_no].iov_len;
176 } 154 }
177 155
178 sge_off = bc; 156 sge_off = bc;
@@ -180,21 +158,28 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
180 sge_no = 0; 158 sge_no = 0;
181 159
182 /* Copy the remaining SGE */ 160 /* Copy the remaining SGE */
183 while (bc != 0 && xdr_sge_no < sge_count) { 161 while (bc != 0 && xdr_sge_no < vec->count) {
184 sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off; 162 sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
185 sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey;
186 sge_bytes = min((size_t)bc, 163 sge_bytes = min((size_t)bc,
187 (size_t)(xdr_sge[xdr_sge_no].length-sge_off)); 164 (size_t)(vec->sge[xdr_sge_no].iov_len-sge_off));
188 sge[sge_no].length = sge_bytes; 165 sge[sge_no].length = sge_bytes;
189 166 atomic_inc(&xprt->sc_dma_used);
167 sge[sge_no].addr =
168 ib_dma_map_single(xprt->sc_cm_id->device,
169 (void *)
170 vec->sge[xdr_sge_no].iov_base + sge_off,
171 sge_bytes, DMA_TO_DEVICE);
172 if (dma_mapping_error(sge[sge_no].addr))
173 goto err;
190 sge_off = 0; 174 sge_off = 0;
191 sge_no++; 175 sge_no++;
176 ctxt->count++;
192 xdr_sge_no++; 177 xdr_sge_no++;
193 bc -= sge_bytes; 178 bc -= sge_bytes;
194 } 179 }
195 180
196 BUG_ON(bc != 0); 181 BUG_ON(bc != 0);
197 BUG_ON(xdr_sge_no > sge_count); 182 BUG_ON(xdr_sge_no > vec->count);
198 183
199 /* Prepare WRITE WR */ 184 /* Prepare WRITE WR */
200 memset(&write_wr, 0, sizeof write_wr); 185 memset(&write_wr, 0, sizeof write_wr);
@@ -209,21 +194,20 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
209 194
210 /* Post It */ 195 /* Post It */
211 atomic_inc(&rdma_stat_write); 196 atomic_inc(&rdma_stat_write);
212 if (svc_rdma_send(xprt, &write_wr)) { 197 if (svc_rdma_send(xprt, &write_wr))
213 svc_rdma_put_context(ctxt, 1); 198 goto err;
214 /* Fatal error, close transport */ 199 return 0;
215 ret = -EIO; 200 err:
216 } 201 svc_rdma_put_context(ctxt, 0);
217 svc_rdma_put_context(tmp_sge_ctxt, 0); 202 /* Fatal error, close transport */
218 return ret; 203 return -EIO;
219} 204}
220 205
221static int send_write_chunks(struct svcxprt_rdma *xprt, 206static int send_write_chunks(struct svcxprt_rdma *xprt,
222 struct rpcrdma_msg *rdma_argp, 207 struct rpcrdma_msg *rdma_argp,
223 struct rpcrdma_msg *rdma_resp, 208 struct rpcrdma_msg *rdma_resp,
224 struct svc_rqst *rqstp, 209 struct svc_rqst *rqstp,
225 struct ib_sge *sge, 210 struct svc_rdma_req_map *vec)
226 int sge_count)
227{ 211{
228 u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; 212 u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
229 int write_len; 213 int write_len;
@@ -269,8 +253,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
269 rs_offset + chunk_off, 253 rs_offset + chunk_off,
270 xdr_off, 254 xdr_off,
271 this_write, 255 this_write,
272 sge, 256 vec);
273 sge_count);
274 if (ret) { 257 if (ret) {
275 dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", 258 dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
276 ret); 259 ret);
@@ -292,8 +275,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
292 struct rpcrdma_msg *rdma_argp, 275 struct rpcrdma_msg *rdma_argp,
293 struct rpcrdma_msg *rdma_resp, 276 struct rpcrdma_msg *rdma_resp,
294 struct svc_rqst *rqstp, 277 struct svc_rqst *rqstp,
295 struct ib_sge *sge, 278 struct svc_rdma_req_map *vec)
296 int sge_count)
297{ 279{
298 u32 xfer_len = rqstp->rq_res.len; 280 u32 xfer_len = rqstp->rq_res.len;
299 int write_len; 281 int write_len;
@@ -341,8 +323,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
341 rs_offset + chunk_off, 323 rs_offset + chunk_off,
342 xdr_off, 324 xdr_off,
343 this_write, 325 this_write,
344 sge, 326 vec);
345 sge_count);
346 if (ret) { 327 if (ret) {
347 dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", 328 dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
348 ret); 329 ret);
@@ -380,7 +361,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
380 struct page *page, 361 struct page *page,
381 struct rpcrdma_msg *rdma_resp, 362 struct rpcrdma_msg *rdma_resp,
382 struct svc_rdma_op_ctxt *ctxt, 363 struct svc_rdma_op_ctxt *ctxt,
383 int sge_count, 364 struct svc_rdma_req_map *vec,
384 int byte_count) 365 int byte_count)
385{ 366{
386 struct ib_send_wr send_wr; 367 struct ib_send_wr send_wr;
@@ -405,6 +386,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
405 ctxt->count = 1; 386 ctxt->count = 1;
406 387
407 /* Prepare the SGE for the RPCRDMA Header */ 388 /* Prepare the SGE for the RPCRDMA Header */
389 atomic_inc(&rdma->sc_dma_used);
408 ctxt->sge[0].addr = 390 ctxt->sge[0].addr =
409 ib_dma_map_page(rdma->sc_cm_id->device, 391 ib_dma_map_page(rdma->sc_cm_id->device,
410 page, 0, PAGE_SIZE, DMA_TO_DEVICE); 392 page, 0, PAGE_SIZE, DMA_TO_DEVICE);
@@ -413,10 +395,16 @@ static int send_reply(struct svcxprt_rdma *rdma,
413 ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; 395 ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
414 396
415 /* Determine how many of our SGE are to be transmitted */ 397 /* Determine how many of our SGE are to be transmitted */
416 for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) { 398 for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
417 sge_bytes = min((size_t)ctxt->sge[sge_no].length, 399 sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
418 (size_t)byte_count);
419 byte_count -= sge_bytes; 400 byte_count -= sge_bytes;
401 atomic_inc(&rdma->sc_dma_used);
402 ctxt->sge[sge_no].addr =
403 ib_dma_map_single(rdma->sc_cm_id->device,
404 vec->sge[sge_no].iov_base,
405 sge_bytes, DMA_TO_DEVICE);
406 ctxt->sge[sge_no].length = sge_bytes;
407 ctxt->sge[sge_no].lkey = rdma->sc_phys_mr->lkey;
420 } 408 }
421 BUG_ON(byte_count != 0); 409 BUG_ON(byte_count != 0);
422 410
@@ -428,8 +416,10 @@ static int send_reply(struct svcxprt_rdma *rdma,
428 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; 416 ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
429 ctxt->count++; 417 ctxt->count++;
430 rqstp->rq_respages[page_no] = NULL; 418 rqstp->rq_respages[page_no] = NULL;
419 /* If there are more pages than SGE, terminate SGE list */
420 if (page_no+1 >= sge_no)
421 ctxt->sge[page_no+1].length = 0;
431 } 422 }
432
433 BUG_ON(sge_no > rdma->sc_max_sge); 423 BUG_ON(sge_no > rdma->sc_max_sge);
434 memset(&send_wr, 0, sizeof send_wr); 424 memset(&send_wr, 0, sizeof send_wr);
435 ctxt->wr_op = IB_WR_SEND; 425 ctxt->wr_op = IB_WR_SEND;
@@ -473,20 +463,20 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
473 enum rpcrdma_proc reply_type; 463 enum rpcrdma_proc reply_type;
474 int ret; 464 int ret;
475 int inline_bytes; 465 int inline_bytes;
476 struct ib_sge *sge;
477 int sge_count = 0;
478 struct page *res_page; 466 struct page *res_page;
479 struct svc_rdma_op_ctxt *ctxt; 467 struct svc_rdma_op_ctxt *ctxt;
468 struct svc_rdma_req_map *vec;
480 469
481 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); 470 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
482 471
483 /* Get the RDMA request header. */ 472 /* Get the RDMA request header. */
484 rdma_argp = xdr_start(&rqstp->rq_arg); 473 rdma_argp = xdr_start(&rqstp->rq_arg);
485 474
486 /* Build an SGE for the XDR */ 475 /* Build an req vec for the XDR */
487 ctxt = svc_rdma_get_context(rdma); 476 ctxt = svc_rdma_get_context(rdma);
488 ctxt->direction = DMA_TO_DEVICE; 477 ctxt->direction = DMA_TO_DEVICE;
489 sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count); 478 vec = svc_rdma_get_req_map();
479 xdr_to_sge(rdma, &rqstp->rq_res, vec);
490 480
491 inline_bytes = rqstp->rq_res.len; 481 inline_bytes = rqstp->rq_res.len;
492 482
@@ -503,7 +493,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
503 493
504 /* Send any write-chunk data and build resp write-list */ 494 /* Send any write-chunk data and build resp write-list */
505 ret = send_write_chunks(rdma, rdma_argp, rdma_resp, 495 ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
506 rqstp, sge, sge_count); 496 rqstp, vec);
507 if (ret < 0) { 497 if (ret < 0) {
508 printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", 498 printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
509 ret); 499 ret);
@@ -513,7 +503,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
513 503
514 /* Send any reply-list data and update resp reply-list */ 504 /* Send any reply-list data and update resp reply-list */
515 ret = send_reply_chunks(rdma, rdma_argp, rdma_resp, 505 ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
516 rqstp, sge, sge_count); 506 rqstp, vec);
517 if (ret < 0) { 507 if (ret < 0) {
518 printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", 508 printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
519 ret); 509 ret);
@@ -521,11 +511,13 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
521 } 511 }
522 inline_bytes -= ret; 512 inline_bytes -= ret;
523 513
524 ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count, 514 ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
525 inline_bytes); 515 inline_bytes);
516 svc_rdma_put_req_map(vec);
526 dprintk("svcrdma: send_reply returns %d\n", ret); 517 dprintk("svcrdma: send_reply returns %d\n", ret);
527 return ret; 518 return ret;
528 error: 519 error:
520 svc_rdma_put_req_map(vec);
529 svc_rdma_put_context(ctxt, 0); 521 svc_rdma_put_context(ctxt, 0);
530 put_page(res_page); 522 put_page(res_page);
531 return ret; 523 return ret;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index e132509d1db0..19ddc382b777 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -84,70 +84,37 @@ struct svc_xprt_class svc_rdma_class = {
84 .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, 84 .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
85}; 85};
86 86
87static int rdma_bump_context_cache(struct svcxprt_rdma *xprt) 87/* WR context cache. Created in svc_rdma.c */
88extern struct kmem_cache *svc_rdma_ctxt_cachep;
89
90struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
88{ 91{
89 int target;
90 int at_least_one = 0;
91 struct svc_rdma_op_ctxt *ctxt; 92 struct svc_rdma_op_ctxt *ctxt;
92 93
93 target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump, 94 while (1) {
94 xprt->sc_ctxt_max); 95 ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL);
95 96 if (ctxt)
96 spin_lock_bh(&xprt->sc_ctxt_lock);
97 while (xprt->sc_ctxt_cnt < target) {
98 xprt->sc_ctxt_cnt++;
99 spin_unlock_bh(&xprt->sc_ctxt_lock);
100
101 ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
102
103 spin_lock_bh(&xprt->sc_ctxt_lock);
104 if (ctxt) {
105 at_least_one = 1;
106 INIT_LIST_HEAD(&ctxt->free_list);
107 list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
108 } else {
109 /* kmalloc failed...give up for now */
110 xprt->sc_ctxt_cnt--;
111 break; 97 break;
112 } 98 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
113 } 99 }
114 spin_unlock_bh(&xprt->sc_ctxt_lock); 100 ctxt->xprt = xprt;
115 dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n", 101 INIT_LIST_HEAD(&ctxt->dto_q);
116 xprt->sc_ctxt_max, xprt->sc_ctxt_cnt); 102 ctxt->count = 0;
117 return at_least_one; 103 atomic_inc(&xprt->sc_ctxt_used);
104 return ctxt;
118} 105}
119 106
120struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) 107static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
121{ 108{
122 struct svc_rdma_op_ctxt *ctxt; 109 struct svcxprt_rdma *xprt = ctxt->xprt;
123 110 int i;
124 while (1) { 111 for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
125 spin_lock_bh(&xprt->sc_ctxt_lock); 112 atomic_dec(&xprt->sc_dma_used);
126 if (unlikely(list_empty(&xprt->sc_ctxt_free))) { 113 ib_dma_unmap_single(xprt->sc_cm_id->device,
127 /* Try to bump my cache. */ 114 ctxt->sge[i].addr,
128 spin_unlock_bh(&xprt->sc_ctxt_lock); 115 ctxt->sge[i].length,
129 116 ctxt->direction);
130 if (rdma_bump_context_cache(xprt))
131 continue;
132
133 printk(KERN_INFO "svcrdma: sleeping waiting for "
134 "context memory on xprt=%p\n",
135 xprt);
136 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
137 continue;
138 }
139 ctxt = list_entry(xprt->sc_ctxt_free.next,
140 struct svc_rdma_op_ctxt,
141 free_list);
142 list_del_init(&ctxt->free_list);
143 spin_unlock_bh(&xprt->sc_ctxt_lock);
144 ctxt->xprt = xprt;
145 INIT_LIST_HEAD(&ctxt->dto_q);
146 ctxt->count = 0;
147 atomic_inc(&xprt->sc_ctxt_used);
148 break;
149 } 117 }
150 return ctxt;
151} 118}
152 119
153void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) 120void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -161,18 +128,36 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
161 for (i = 0; i < ctxt->count; i++) 128 for (i = 0; i < ctxt->count; i++)
162 put_page(ctxt->pages[i]); 129 put_page(ctxt->pages[i]);
163 130
164 for (i = 0; i < ctxt->count; i++) 131 kmem_cache_free(svc_rdma_ctxt_cachep, ctxt);
165 ib_dma_unmap_single(xprt->sc_cm_id->device,
166 ctxt->sge[i].addr,
167 ctxt->sge[i].length,
168 ctxt->direction);
169
170 spin_lock_bh(&xprt->sc_ctxt_lock);
171 list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
172 spin_unlock_bh(&xprt->sc_ctxt_lock);
173 atomic_dec(&xprt->sc_ctxt_used); 132 atomic_dec(&xprt->sc_ctxt_used);
174} 133}
175 134
135/* Temporary NFS request map cache. Created in svc_rdma.c */
136extern struct kmem_cache *svc_rdma_map_cachep;
137
138/*
139 * Temporary NFS req mappings are shared across all transport
140 * instances. These are short lived and should be bounded by the number
141 * of concurrent server threads * depth of the SQ.
142 */
143struct svc_rdma_req_map *svc_rdma_get_req_map(void)
144{
145 struct svc_rdma_req_map *map;
146 while (1) {
147 map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL);
148 if (map)
149 break;
150 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
151 }
152 map->count = 0;
153 return map;
154}
155
156void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
157{
158 kmem_cache_free(svc_rdma_map_cachep, map);
159}
160
176/* ib_cq event handler */ 161/* ib_cq event handler */
177static void cq_event_handler(struct ib_event *event, void *context) 162static void cq_event_handler(struct ib_event *event, void *context)
178{ 163{
@@ -302,6 +287,7 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt)
302 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; 287 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
303 ctxt->wc_status = wc.status; 288 ctxt->wc_status = wc.status;
304 ctxt->byte_len = wc.byte_len; 289 ctxt->byte_len = wc.byte_len;
290 svc_rdma_unmap_dma(ctxt);
305 if (wc.status != IB_WC_SUCCESS) { 291 if (wc.status != IB_WC_SUCCESS) {
306 /* Close the transport */ 292 /* Close the transport */
307 dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt); 293 dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt);
@@ -351,6 +337,7 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
351 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; 337 ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
352 xprt = ctxt->xprt; 338 xprt = ctxt->xprt;
353 339
340 svc_rdma_unmap_dma(ctxt);
354 if (wc.status != IB_WC_SUCCESS) 341 if (wc.status != IB_WC_SUCCESS)
355 /* Close the transport */ 342 /* Close the transport */
356 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); 343 set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -361,10 +348,13 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt)
361 348
362 switch (ctxt->wr_op) { 349 switch (ctxt->wr_op) {
363 case IB_WR_SEND: 350 case IB_WR_SEND:
364 case IB_WR_RDMA_WRITE:
365 svc_rdma_put_context(ctxt, 1); 351 svc_rdma_put_context(ctxt, 1);
366 break; 352 break;
367 353
354 case IB_WR_RDMA_WRITE:
355 svc_rdma_put_context(ctxt, 0);
356 break;
357
368 case IB_WR_RDMA_READ: 358 case IB_WR_RDMA_READ:
369 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { 359 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
370 struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; 360 struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
@@ -423,40 +413,6 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
423 tasklet_schedule(&dto_tasklet); 413 tasklet_schedule(&dto_tasklet);
424} 414}
425 415
426static void create_context_cache(struct svcxprt_rdma *xprt,
427 int ctxt_count, int ctxt_bump, int ctxt_max)
428{
429 struct svc_rdma_op_ctxt *ctxt;
430 int i;
431
432 xprt->sc_ctxt_max = ctxt_max;
433 xprt->sc_ctxt_bump = ctxt_bump;
434 xprt->sc_ctxt_cnt = 0;
435 atomic_set(&xprt->sc_ctxt_used, 0);
436
437 INIT_LIST_HEAD(&xprt->sc_ctxt_free);
438 for (i = 0; i < ctxt_count; i++) {
439 ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
440 if (ctxt) {
441 INIT_LIST_HEAD(&ctxt->free_list);
442 list_add(&ctxt->free_list, &xprt->sc_ctxt_free);
443 xprt->sc_ctxt_cnt++;
444 }
445 }
446}
447
448static void destroy_context_cache(struct svcxprt_rdma *xprt)
449{
450 while (!list_empty(&xprt->sc_ctxt_free)) {
451 struct svc_rdma_op_ctxt *ctxt;
452 ctxt = list_entry(xprt->sc_ctxt_free.next,
453 struct svc_rdma_op_ctxt,
454 free_list);
455 list_del_init(&ctxt->free_list);
456 kfree(ctxt);
457 }
458}
459
460static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, 416static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
461 int listener) 417 int listener)
462{ 418{
@@ -473,7 +429,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
473 429
474 spin_lock_init(&cma_xprt->sc_lock); 430 spin_lock_init(&cma_xprt->sc_lock);
475 spin_lock_init(&cma_xprt->sc_read_complete_lock); 431 spin_lock_init(&cma_xprt->sc_read_complete_lock);
476 spin_lock_init(&cma_xprt->sc_ctxt_lock);
477 spin_lock_init(&cma_xprt->sc_rq_dto_lock); 432 spin_lock_init(&cma_xprt->sc_rq_dto_lock);
478 433
479 cma_xprt->sc_ord = svcrdma_ord; 434 cma_xprt->sc_ord = svcrdma_ord;
@@ -482,21 +437,9 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
482 cma_xprt->sc_max_requests = svcrdma_max_requests; 437 cma_xprt->sc_max_requests = svcrdma_max_requests;
483 cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; 438 cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
484 atomic_set(&cma_xprt->sc_sq_count, 0); 439 atomic_set(&cma_xprt->sc_sq_count, 0);
440 atomic_set(&cma_xprt->sc_ctxt_used, 0);
485 441
486 if (!listener) { 442 if (listener)
487 int reqs = cma_xprt->sc_max_requests;
488 create_context_cache(cma_xprt,
489 reqs << 1, /* starting size */
490 reqs, /* bump amount */
491 reqs +
492 cma_xprt->sc_sq_depth +
493 RPCRDMA_MAX_THREADS + 1); /* max */
494 if (list_empty(&cma_xprt->sc_ctxt_free)) {
495 kfree(cma_xprt);
496 return NULL;
497 }
498 clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
499 } else
500 set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); 443 set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
501 444
502 return cma_xprt; 445 return cma_xprt;
@@ -532,6 +475,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
532 BUG_ON(sge_no >= xprt->sc_max_sge); 475 BUG_ON(sge_no >= xprt->sc_max_sge);
533 page = svc_rdma_get_page(); 476 page = svc_rdma_get_page();
534 ctxt->pages[sge_no] = page; 477 ctxt->pages[sge_no] = page;
478 atomic_inc(&xprt->sc_dma_used);
535 pa = ib_dma_map_page(xprt->sc_cm_id->device, 479 pa = ib_dma_map_page(xprt->sc_cm_id->device,
536 page, 0, PAGE_SIZE, 480 page, 0, PAGE_SIZE,
537 DMA_FROM_DEVICE); 481 DMA_FROM_DEVICE);
@@ -566,7 +510,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
566 * will call the recvfrom method on the listen xprt which will accept the new 510 * will call the recvfrom method on the listen xprt which will accept the new
567 * connection. 511 * connection.
568 */ 512 */
569static void handle_connect_req(struct rdma_cm_id *new_cma_id) 513static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
570{ 514{
571 struct svcxprt_rdma *listen_xprt = new_cma_id->context; 515 struct svcxprt_rdma *listen_xprt = new_cma_id->context;
572 struct svcxprt_rdma *newxprt; 516 struct svcxprt_rdma *newxprt;
@@ -583,6 +527,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id)
583 dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", 527 dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
584 newxprt, newxprt->sc_cm_id, listen_xprt); 528 newxprt, newxprt->sc_cm_id, listen_xprt);
585 529
530 /* Save client advertised inbound read limit for use later in accept. */
531 newxprt->sc_ord = client_ird;
532
586 /* Set the local and remote addresses in the transport */ 533 /* Set the local and remote addresses in the transport */
587 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; 534 sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
588 svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); 535 svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
@@ -619,7 +566,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
619 case RDMA_CM_EVENT_CONNECT_REQUEST: 566 case RDMA_CM_EVENT_CONNECT_REQUEST:
620 dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " 567 dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
621 "event=%d\n", cma_id, cma_id->context, event->event); 568 "event=%d\n", cma_id, cma_id->context, event->event);
622 handle_connect_req(cma_id); 569 handle_connect_req(cma_id,
570 event->param.conn.responder_resources);
623 break; 571 break;
624 572
625 case RDMA_CM_EVENT_ESTABLISHED: 573 case RDMA_CM_EVENT_ESTABLISHED:
@@ -793,8 +741,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
793 (size_t)svcrdma_max_requests); 741 (size_t)svcrdma_max_requests);
794 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; 742 newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
795 743
796 newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom, 744 /*
797 (size_t)svcrdma_ord); 745 * Limit ORD based on client limit, local device limit, and
746 * configured svcrdma limit.
747 */
748 newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord);
749 newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
798 750
799 newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); 751 newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
800 if (IS_ERR(newxprt->sc_pd)) { 752 if (IS_ERR(newxprt->sc_pd)) {
@@ -987,7 +939,6 @@ static void __svc_rdma_free(struct work_struct *work)
987 * cm_id because the device ptr is needed to unmap the dma in 939 * cm_id because the device ptr is needed to unmap the dma in
988 * svc_rdma_put_context. 940 * svc_rdma_put_context.
989 */ 941 */
990 spin_lock_bh(&rdma->sc_read_complete_lock);
991 while (!list_empty(&rdma->sc_read_complete_q)) { 942 while (!list_empty(&rdma->sc_read_complete_q)) {
992 struct svc_rdma_op_ctxt *ctxt; 943 struct svc_rdma_op_ctxt *ctxt;
993 ctxt = list_entry(rdma->sc_read_complete_q.next, 944 ctxt = list_entry(rdma->sc_read_complete_q.next,
@@ -996,10 +947,8 @@ static void __svc_rdma_free(struct work_struct *work)
996 list_del_init(&ctxt->dto_q); 947 list_del_init(&ctxt->dto_q);
997 svc_rdma_put_context(ctxt, 1); 948 svc_rdma_put_context(ctxt, 1);
998 } 949 }
999 spin_unlock_bh(&rdma->sc_read_complete_lock);
1000 950
1001 /* Destroy queued, but not processed recv completions */ 951 /* Destroy queued, but not processed recv completions */
1002 spin_lock_bh(&rdma->sc_rq_dto_lock);
1003 while (!list_empty(&rdma->sc_rq_dto_q)) { 952 while (!list_empty(&rdma->sc_rq_dto_q)) {
1004 struct svc_rdma_op_ctxt *ctxt; 953 struct svc_rdma_op_ctxt *ctxt;
1005 ctxt = list_entry(rdma->sc_rq_dto_q.next, 954 ctxt = list_entry(rdma->sc_rq_dto_q.next,
@@ -1008,10 +957,10 @@ static void __svc_rdma_free(struct work_struct *work)
1008 list_del_init(&ctxt->dto_q); 957 list_del_init(&ctxt->dto_q);
1009 svc_rdma_put_context(ctxt, 1); 958 svc_rdma_put_context(ctxt, 1);
1010 } 959 }
1011 spin_unlock_bh(&rdma->sc_rq_dto_lock);
1012 960
1013 /* Warn if we leaked a resource or under-referenced */ 961 /* Warn if we leaked a resource or under-referenced */
1014 WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); 962 WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0);
963 WARN_ON(atomic_read(&rdma->sc_dma_used) != 0);
1015 964
1016 /* Destroy the QP if present (not a listener) */ 965 /* Destroy the QP if present (not a listener) */
1017 if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) 966 if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
@@ -1032,7 +981,6 @@ static void __svc_rdma_free(struct work_struct *work)
1032 /* Destroy the CM ID */ 981 /* Destroy the CM ID */
1033 rdma_destroy_id(rdma->sc_cm_id); 982 rdma_destroy_id(rdma->sc_cm_id);
1034 983
1035 destroy_context_cache(rdma);
1036 kfree(rdma); 984 kfree(rdma);
1037} 985}
1038 986
@@ -1132,6 +1080,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
1132 length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); 1080 length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
1133 1081
1134 /* Prepare SGE for local address */ 1082 /* Prepare SGE for local address */
1083 atomic_inc(&xprt->sc_dma_used);
1135 sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, 1084 sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
1136 p, 0, PAGE_SIZE, DMA_FROM_DEVICE); 1085 p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
1137 sge.lkey = xprt->sc_phys_mr->lkey; 1086 sge.lkey = xprt->sc_phys_mr->lkey;