aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/hw
diff options
context:
space:
mode:
authorRalph Campbell <ralph.campbell@qlogic.com>2006-09-22 18:22:26 -0400
committerRoland Dreier <rolandd@cisco.com>2006-09-22 18:22:26 -0400
commit373d9915803aebbbf7fd3841efd9dac31c32e148 (patch)
treebd0b05d5e78eab4471bc4d623a880013693077ea /drivers/infiniband/hw
parent9bc57e2d19db4da81c1150120658cc3658a99ed4 (diff)
IB/ipath: Performance improvements via mmap of queues
Improve performance of userspace post receive, post SRQ receive, and poll CQ operations for ipath by allowing userspace to directly mmap() receive queues and completion queues. This eliminates the copying between userspace and the kernel in the data path. Signed-off-by: Ralph Campbell <ralph.campbell@qlogic.com> Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband/hw')
-rw-r--r--drivers/infiniband/hw/ipath/Makefile1
-rw-r--r--drivers/infiniband/hw/ipath/ipath_cq.c176
-rw-r--r--drivers/infiniband/hw/ipath/ipath_mmap.c122
-rw-r--r--drivers/infiniband/hw/ipath/ipath_qp.c156
-rw-r--r--drivers/infiniband/hw/ipath/ipath_ruc.c138
-rw-r--r--drivers/infiniband/hw/ipath/ipath_srq.c240
-rw-r--r--drivers/infiniband/hw/ipath/ipath_ud.c169
-rw-r--r--drivers/infiniband/hw/ipath/ipath_verbs.c50
-rw-r--r--drivers/infiniband/hw/ipath/ipath_verbs.h115
9 files changed, 785 insertions, 382 deletions
diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/infiniband/hw/ipath/Makefile
index b0bf72864130..6bb43474d104 100644
--- a/drivers/infiniband/hw/ipath/Makefile
+++ b/drivers/infiniband/hw/ipath/Makefile
@@ -25,6 +25,7 @@ ib_ipath-y := \
25 ipath_cq.o \ 25 ipath_cq.o \
26 ipath_keys.o \ 26 ipath_keys.o \
27 ipath_mad.o \ 27 ipath_mad.o \
28 ipath_mmap.o \
28 ipath_mr.o \ 29 ipath_mr.o \
29 ipath_qp.o \ 30 ipath_qp.o \
30 ipath_rc.o \ 31 ipath_rc.o \
diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c
index 3efee341c9bc..3c4c198a4514 100644
--- a/drivers/infiniband/hw/ipath/ipath_cq.c
+++ b/drivers/infiniband/hw/ipath/ipath_cq.c
@@ -42,20 +42,28 @@
42 * @entry: work completion entry to add 42 * @entry: work completion entry to add
43 * @sig: true if @entry is a solicitated entry 43 * @sig: true if @entry is a solicitated entry
44 * 44 *
45 * This may be called with one of the qp->s_lock or qp->r_rq.lock held. 45 * This may be called with qp->s_lock held.
46 */ 46 */
47void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) 47void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
48{ 48{
49 struct ipath_cq_wc *wc = cq->queue;
49 unsigned long flags; 50 unsigned long flags;
51 u32 head;
50 u32 next; 52 u32 next;
51 53
52 spin_lock_irqsave(&cq->lock, flags); 54 spin_lock_irqsave(&cq->lock, flags);
53 55
54 if (cq->head == cq->ibcq.cqe) 56 /*
57 * Note that the head pointer might be writable by user processes.
58 * Take care to verify it is a sane value.
59 */
60 head = wc->head;
61 if (head >= (unsigned) cq->ibcq.cqe) {
62 head = cq->ibcq.cqe;
55 next = 0; 63 next = 0;
56 else 64 } else
57 next = cq->head + 1; 65 next = head + 1;
58 if (unlikely(next == cq->tail)) { 66 if (unlikely(next == wc->tail)) {
59 spin_unlock_irqrestore(&cq->lock, flags); 67 spin_unlock_irqrestore(&cq->lock, flags);
60 if (cq->ibcq.event_handler) { 68 if (cq->ibcq.event_handler) {
61 struct ib_event ev; 69 struct ib_event ev;
@@ -67,8 +75,8 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
67 } 75 }
68 return; 76 return;
69 } 77 }
70 cq->queue[cq->head] = *entry; 78 wc->queue[head] = *entry;
71 cq->head = next; 79 wc->head = next;
72 80
73 if (cq->notify == IB_CQ_NEXT_COMP || 81 if (cq->notify == IB_CQ_NEXT_COMP ||
74 (cq->notify == IB_CQ_SOLICITED && solicited)) { 82 (cq->notify == IB_CQ_SOLICITED && solicited)) {
@@ -101,19 +109,20 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
101int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) 109int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
102{ 110{
103 struct ipath_cq *cq = to_icq(ibcq); 111 struct ipath_cq *cq = to_icq(ibcq);
112 struct ipath_cq_wc *wc = cq->queue;
104 unsigned long flags; 113 unsigned long flags;
105 int npolled; 114 int npolled;
106 115
107 spin_lock_irqsave(&cq->lock, flags); 116 spin_lock_irqsave(&cq->lock, flags);
108 117
109 for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { 118 for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
110 if (cq->tail == cq->head) 119 if (wc->tail == wc->head)
111 break; 120 break;
112 *entry = cq->queue[cq->tail]; 121 *entry = wc->queue[wc->tail];
113 if (cq->tail == cq->ibcq.cqe) 122 if (wc->tail >= cq->ibcq.cqe)
114 cq->tail = 0; 123 wc->tail = 0;
115 else 124 else
116 cq->tail++; 125 wc->tail++;
117 } 126 }
118 127
119 spin_unlock_irqrestore(&cq->lock, flags); 128 spin_unlock_irqrestore(&cq->lock, flags);
@@ -160,38 +169,74 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries,
160{ 169{
161 struct ipath_ibdev *dev = to_idev(ibdev); 170 struct ipath_ibdev *dev = to_idev(ibdev);
162 struct ipath_cq *cq; 171 struct ipath_cq *cq;
163 struct ib_wc *wc; 172 struct ipath_cq_wc *wc;
164 struct ib_cq *ret; 173 struct ib_cq *ret;
165 174
166 if (entries > ib_ipath_max_cqes) { 175 if (entries > ib_ipath_max_cqes) {
167 ret = ERR_PTR(-EINVAL); 176 ret = ERR_PTR(-EINVAL);
168 goto bail; 177 goto done;
169 } 178 }
170 179
171 if (dev->n_cqs_allocated == ib_ipath_max_cqs) { 180 if (dev->n_cqs_allocated == ib_ipath_max_cqs) {
172 ret = ERR_PTR(-ENOMEM); 181 ret = ERR_PTR(-ENOMEM);
173 goto bail; 182 goto done;
174 } 183 }
175 184
176 /* 185 /* Allocate the completion queue structure. */
177 * Need to use vmalloc() if we want to support large #s of
178 * entries.
179 */
180 cq = kmalloc(sizeof(*cq), GFP_KERNEL); 186 cq = kmalloc(sizeof(*cq), GFP_KERNEL);
181 if (!cq) { 187 if (!cq) {
182 ret = ERR_PTR(-ENOMEM); 188 ret = ERR_PTR(-ENOMEM);
183 goto bail; 189 goto done;
184 } 190 }
185 191
186 /* 192 /*
187 * Need to use vmalloc() if we want to support large #s of entries. 193 * Allocate the completion queue entries and head/tail pointers.
194 * This is allocated separately so that it can be resized and
195 * also mapped into user space.
196 * We need to use vmalloc() in order to support mmap and large
197 * numbers of entries.
188 */ 198 */
189 wc = vmalloc(sizeof(*wc) * (entries + 1)); 199 wc = vmalloc_user(sizeof(*wc) + sizeof(struct ib_wc) * entries);
190 if (!wc) { 200 if (!wc) {
191 kfree(cq);
192 ret = ERR_PTR(-ENOMEM); 201 ret = ERR_PTR(-ENOMEM);
193 goto bail; 202 goto bail_cq;
194 } 203 }
204
205 /*
206 * Return the address of the WC as the offset to mmap.
207 * See ipath_mmap() for details.
208 */
209 if (udata && udata->outlen >= sizeof(__u64)) {
210 struct ipath_mmap_info *ip;
211 __u64 offset = (__u64) wc;
212 int err;
213
214 err = ib_copy_to_udata(udata, &offset, sizeof(offset));
215 if (err) {
216 ret = ERR_PTR(err);
217 goto bail_wc;
218 }
219
220 /* Allocate info for ipath_mmap(). */
221 ip = kmalloc(sizeof(*ip), GFP_KERNEL);
222 if (!ip) {
223 ret = ERR_PTR(-ENOMEM);
224 goto bail_wc;
225 }
226 cq->ip = ip;
227 ip->context = context;
228 ip->obj = wc;
229 kref_init(&ip->ref);
230 ip->mmap_cnt = 0;
231 ip->size = PAGE_ALIGN(sizeof(*wc) +
232 sizeof(struct ib_wc) * entries);
233 spin_lock_irq(&dev->pending_lock);
234 ip->next = dev->pending_mmaps;
235 dev->pending_mmaps = ip;
236 spin_unlock_irq(&dev->pending_lock);
237 } else
238 cq->ip = NULL;
239
195 /* 240 /*
196 * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe. 241 * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
197 * The number of entries should be >= the number requested or return 242 * The number of entries should be >= the number requested or return
@@ -202,15 +247,22 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries,
202 cq->triggered = 0; 247 cq->triggered = 0;
203 spin_lock_init(&cq->lock); 248 spin_lock_init(&cq->lock);
204 tasklet_init(&cq->comptask, send_complete, (unsigned long)cq); 249 tasklet_init(&cq->comptask, send_complete, (unsigned long)cq);
205 cq->head = 0; 250 wc->head = 0;
206 cq->tail = 0; 251 wc->tail = 0;
207 cq->queue = wc; 252 cq->queue = wc;
208 253
209 ret = &cq->ibcq; 254 ret = &cq->ibcq;
210 255
211 dev->n_cqs_allocated++; 256 dev->n_cqs_allocated++;
257 goto done;
212 258
213bail: 259bail_wc:
260 vfree(wc);
261
262bail_cq:
263 kfree(cq);
264
265done:
214 return ret; 266 return ret;
215} 267}
216 268
@@ -229,7 +281,10 @@ int ipath_destroy_cq(struct ib_cq *ibcq)
229 281
230 tasklet_kill(&cq->comptask); 282 tasklet_kill(&cq->comptask);
231 dev->n_cqs_allocated--; 283 dev->n_cqs_allocated--;
232 vfree(cq->queue); 284 if (cq->ip)
285 kref_put(&cq->ip->ref, ipath_release_mmap_info);
286 else
287 vfree(cq->queue);
233 kfree(cq); 288 kfree(cq);
234 289
235 return 0; 290 return 0;
@@ -253,7 +308,7 @@ int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
253 spin_lock_irqsave(&cq->lock, flags); 308 spin_lock_irqsave(&cq->lock, flags);
254 /* 309 /*
255 * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow 310 * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
256 * any other transitions. 311 * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
257 */ 312 */
258 if (cq->notify != IB_CQ_NEXT_COMP) 313 if (cq->notify != IB_CQ_NEXT_COMP)
259 cq->notify = notify; 314 cq->notify = notify;
@@ -264,46 +319,81 @@ int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
264int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) 319int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
265{ 320{
266 struct ipath_cq *cq = to_icq(ibcq); 321 struct ipath_cq *cq = to_icq(ibcq);
267 struct ib_wc *wc, *old_wc; 322 struct ipath_cq_wc *old_wc = cq->queue;
268 u32 n; 323 struct ipath_cq_wc *wc;
324 u32 head, tail, n;
269 int ret; 325 int ret;
270 326
271 /* 327 /*
272 * Need to use vmalloc() if we want to support large #s of entries. 328 * Need to use vmalloc() if we want to support large #s of entries.
273 */ 329 */
274 wc = vmalloc(sizeof(*wc) * (cqe + 1)); 330 wc = vmalloc_user(sizeof(*wc) + sizeof(struct ib_wc) * cqe);
275 if (!wc) { 331 if (!wc) {
276 ret = -ENOMEM; 332 ret = -ENOMEM;
277 goto bail; 333 goto bail;
278 } 334 }
279 335
336 /*
337 * Return the address of the WC as the offset to mmap.
338 * See ipath_mmap() for details.
339 */
340 if (udata && udata->outlen >= sizeof(__u64)) {
341 __u64 offset = (__u64) wc;
342
343 ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
344 if (ret)
345 goto bail;
346 }
347
280 spin_lock_irq(&cq->lock); 348 spin_lock_irq(&cq->lock);
281 if (cq->head < cq->tail) 349 /*
282 n = cq->ibcq.cqe + 1 + cq->head - cq->tail; 350 * Make sure head and tail are sane since they
351 * might be user writable.
352 */
353 head = old_wc->head;
354 if (head > (u32) cq->ibcq.cqe)
355 head = (u32) cq->ibcq.cqe;
356 tail = old_wc->tail;
357 if (tail > (u32) cq->ibcq.cqe)
358 tail = (u32) cq->ibcq.cqe;
359 if (head < tail)
360 n = cq->ibcq.cqe + 1 + head - tail;
283 else 361 else
284 n = cq->head - cq->tail; 362 n = head - tail;
285 if (unlikely((u32)cqe < n)) { 363 if (unlikely((u32)cqe < n)) {
286 spin_unlock_irq(&cq->lock); 364 spin_unlock_irq(&cq->lock);
287 vfree(wc); 365 vfree(wc);
288 ret = -EOVERFLOW; 366 ret = -EOVERFLOW;
289 goto bail; 367 goto bail;
290 } 368 }
291 for (n = 0; cq->tail != cq->head; n++) { 369 for (n = 0; tail != head; n++) {
292 wc[n] = cq->queue[cq->tail]; 370 wc->queue[n] = old_wc->queue[tail];
293 if (cq->tail == cq->ibcq.cqe) 371 if (tail == (u32) cq->ibcq.cqe)
294 cq->tail = 0; 372 tail = 0;
295 else 373 else
296 cq->tail++; 374 tail++;
297 } 375 }
298 cq->ibcq.cqe = cqe; 376 cq->ibcq.cqe = cqe;
299 cq->head = n; 377 wc->head = n;
300 cq->tail = 0; 378 wc->tail = 0;
301 old_wc = cq->queue;
302 cq->queue = wc; 379 cq->queue = wc;
303 spin_unlock_irq(&cq->lock); 380 spin_unlock_irq(&cq->lock);
304 381
305 vfree(old_wc); 382 vfree(old_wc);
306 383
384 if (cq->ip) {
385 struct ipath_ibdev *dev = to_idev(ibcq->device);
386 struct ipath_mmap_info *ip = cq->ip;
387
388 ip->obj = wc;
389 ip->size = PAGE_ALIGN(sizeof(*wc) +
390 sizeof(struct ib_wc) * cqe);
391 spin_lock_irq(&dev->pending_lock);
392 ip->next = dev->pending_mmaps;
393 dev->pending_mmaps = ip;
394 spin_unlock_irq(&dev->pending_lock);
395 }
396
307 ret = 0; 397 ret = 0;
308 398
309bail: 399bail:
diff --git a/drivers/infiniband/hw/ipath/ipath_mmap.c b/drivers/infiniband/hw/ipath/ipath_mmap.c
new file mode 100644
index 000000000000..11b7378ff214
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_mmap.c
@@ -0,0 +1,122 @@
1/*
2 * Copyright (c) 2006 QLogic, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33#include <linux/config.h>
34#include <linux/module.h>
35#include <linux/vmalloc.h>
36#include <linux/mm.h>
37#include <linux/errno.h>
38#include <asm/pgtable.h>
39
40#include "ipath_verbs.h"
41
42/**
43 * ipath_release_mmap_info - free mmap info structure
44 * @ref: a pointer to the kref within struct ipath_mmap_info
45 */
46void ipath_release_mmap_info(struct kref *ref)
47{
48 struct ipath_mmap_info *ip =
49 container_of(ref, struct ipath_mmap_info, ref);
50
51 vfree(ip->obj);
52 kfree(ip);
53}
54
55/*
56 * open and close keep track of how many times the CQ is mapped,
57 * to avoid releasing it.
58 */
59static void ipath_vma_open(struct vm_area_struct *vma)
60{
61 struct ipath_mmap_info *ip = vma->vm_private_data;
62
63 kref_get(&ip->ref);
64 ip->mmap_cnt++;
65}
66
67static void ipath_vma_close(struct vm_area_struct *vma)
68{
69 struct ipath_mmap_info *ip = vma->vm_private_data;
70
71 ip->mmap_cnt--;
72 kref_put(&ip->ref, ipath_release_mmap_info);
73}
74
75static struct vm_operations_struct ipath_vm_ops = {
76 .open = ipath_vma_open,
77 .close = ipath_vma_close,
78};
79
80/**
81 * ipath_mmap - create a new mmap region
82 * @context: the IB user context of the process making the mmap() call
83 * @vma: the VMA to be initialized
84 * Return zero if the mmap is OK. Otherwise, return an errno.
85 */
86int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
87{
88 struct ipath_ibdev *dev = to_idev(context->device);
89 unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
90 unsigned long size = vma->vm_end - vma->vm_start;
91 struct ipath_mmap_info *ip, **pp;
92 int ret = -EINVAL;
93
94 /*
95 * Search the device's list of objects waiting for a mmap call.
96 * Normally, this list is very short since a call to create a
97 * CQ, QP, or SRQ is soon followed by a call to mmap().
98 */
99 spin_lock_irq(&dev->pending_lock);
100 for (pp = &dev->pending_mmaps; (ip = *pp); pp = &ip->next) {
101 /* Only the creator is allowed to mmap the object */
102 if (context != ip->context || (void *) offset != ip->obj)
103 continue;
104 /* Don't allow a mmap larger than the object. */
105 if (size > ip->size)
106 break;
107
108 *pp = ip->next;
109 spin_unlock_irq(&dev->pending_lock);
110
111 ret = remap_vmalloc_range(vma, ip->obj, 0);
112 if (ret)
113 goto done;
114 vma->vm_ops = &ipath_vm_ops;
115 vma->vm_private_data = ip;
116 ipath_vma_open(vma);
117 goto done;
118 }
119 spin_unlock_irq(&dev->pending_lock);
120done:
121 return ret;
122}
diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c
index 44c32d2db990..1ccfc909db1e 100644
--- a/drivers/infiniband/hw/ipath/ipath_qp.c
+++ b/drivers/infiniband/hw/ipath/ipath_qp.c
@@ -35,7 +35,7 @@
35#include <linux/vmalloc.h> 35#include <linux/vmalloc.h>
36 36
37#include "ipath_verbs.h" 37#include "ipath_verbs.h"
38#include "ipath_common.h" 38#include "ipath_kernel.h"
39 39
40#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE) 40#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE)
41#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) 41#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
@@ -44,19 +44,6 @@
44#define find_next_offset(map, off) find_next_zero_bit((map)->page, \ 44#define find_next_offset(map, off) find_next_zero_bit((map)->page, \
45 BITS_PER_PAGE, off) 45 BITS_PER_PAGE, off)
46 46
47#define TRANS_INVALID 0
48#define TRANS_ANY2RST 1
49#define TRANS_RST2INIT 2
50#define TRANS_INIT2INIT 3
51#define TRANS_INIT2RTR 4
52#define TRANS_RTR2RTS 5
53#define TRANS_RTS2RTS 6
54#define TRANS_SQERR2RTS 7
55#define TRANS_ANY2ERR 8
56#define TRANS_RTS2SQD 9 /* XXX Wait for expected ACKs & signal event */
57#define TRANS_SQD2SQD 10 /* error if not drained & parameter change */
58#define TRANS_SQD2RTS 11 /* error if not drained */
59
60/* 47/*
61 * Convert the AETH credit code into the number of credits. 48 * Convert the AETH credit code into the number of credits.
62 */ 49 */
@@ -355,8 +342,10 @@ static void ipath_reset_qp(struct ipath_qp *qp)
355 qp->s_last = 0; 342 qp->s_last = 0;
356 qp->s_ssn = 1; 343 qp->s_ssn = 1;
357 qp->s_lsn = 0; 344 qp->s_lsn = 0;
358 qp->r_rq.head = 0; 345 if (qp->r_rq.wq) {
359 qp->r_rq.tail = 0; 346 qp->r_rq.wq->head = 0;
347 qp->r_rq.wq->tail = 0;
348 }
360 qp->r_reuse_sge = 0; 349 qp->r_reuse_sge = 0;
361} 350}
362 351
@@ -410,15 +399,32 @@ void ipath_error_qp(struct ipath_qp *qp)
410 qp->s_hdrwords = 0; 399 qp->s_hdrwords = 0;
411 qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; 400 qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
412 401
413 wc.opcode = IB_WC_RECV; 402 if (qp->r_rq.wq) {
414 spin_lock(&qp->r_rq.lock); 403 struct ipath_rwq *wq;
415 while (qp->r_rq.tail != qp->r_rq.head) { 404 u32 head;
416 wc.wr_id = get_rwqe_ptr(&qp->r_rq, qp->r_rq.tail)->wr_id; 405 u32 tail;
417 if (++qp->r_rq.tail >= qp->r_rq.size) 406
418 qp->r_rq.tail = 0; 407 spin_lock(&qp->r_rq.lock);
419 ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); 408
409 /* sanity check pointers before trusting them */
410 wq = qp->r_rq.wq;
411 head = wq->head;
412 if (head >= qp->r_rq.size)
413 head = 0;
414 tail = wq->tail;
415 if (tail >= qp->r_rq.size)
416 tail = 0;
417 wc.opcode = IB_WC_RECV;
418 while (tail != head) {
419 wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
420 if (++tail >= qp->r_rq.size)
421 tail = 0;
422 ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
423 }
424 wq->tail = tail;
425
426 spin_unlock(&qp->r_rq.lock);
420 } 427 }
421 spin_unlock(&qp->r_rq.lock);
422} 428}
423 429
424/** 430/**
@@ -544,7 +550,7 @@ int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
544 attr->dest_qp_num = qp->remote_qpn; 550 attr->dest_qp_num = qp->remote_qpn;
545 attr->qp_access_flags = qp->qp_access_flags; 551 attr->qp_access_flags = qp->qp_access_flags;
546 attr->cap.max_send_wr = qp->s_size - 1; 552 attr->cap.max_send_wr = qp->s_size - 1;
547 attr->cap.max_recv_wr = qp->r_rq.size - 1; 553 attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
548 attr->cap.max_send_sge = qp->s_max_sge; 554 attr->cap.max_send_sge = qp->s_max_sge;
549 attr->cap.max_recv_sge = qp->r_rq.max_sge; 555 attr->cap.max_recv_sge = qp->r_rq.max_sge;
550 attr->cap.max_inline_data = 0; 556 attr->cap.max_inline_data = 0;
@@ -597,13 +603,23 @@ __be32 ipath_compute_aeth(struct ipath_qp *qp)
597 } else { 603 } else {
598 u32 min, max, x; 604 u32 min, max, x;
599 u32 credits; 605 u32 credits;
600 606 struct ipath_rwq *wq = qp->r_rq.wq;
607 u32 head;
608 u32 tail;
609
610 /* sanity check pointers before trusting them */
611 head = wq->head;
612 if (head >= qp->r_rq.size)
613 head = 0;
614 tail = wq->tail;
615 if (tail >= qp->r_rq.size)
616 tail = 0;
601 /* 617 /*
602 * Compute the number of credits available (RWQEs). 618 * Compute the number of credits available (RWQEs).
603 * XXX Not holding the r_rq.lock here so there is a small 619 * XXX Not holding the r_rq.lock here so there is a small
604 * chance that the pair of reads are not atomic. 620 * chance that the pair of reads are not atomic.
605 */ 621 */
606 credits = qp->r_rq.head - qp->r_rq.tail; 622 credits = head - tail;
607 if ((int)credits < 0) 623 if ((int)credits < 0)
608 credits += qp->r_rq.size; 624 credits += qp->r_rq.size;
609 /* 625 /*
@@ -680,27 +696,37 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
680 case IB_QPT_UD: 696 case IB_QPT_UD:
681 case IB_QPT_SMI: 697 case IB_QPT_SMI:
682 case IB_QPT_GSI: 698 case IB_QPT_GSI:
683 qp = kmalloc(sizeof(*qp), GFP_KERNEL); 699 sz = sizeof(*qp);
700 if (init_attr->srq) {
701 struct ipath_srq *srq = to_isrq(init_attr->srq);
702
703 sz += sizeof(*qp->r_sg_list) *
704 srq->rq.max_sge;
705 } else
706 sz += sizeof(*qp->r_sg_list) *
707 init_attr->cap.max_recv_sge;
708 qp = kmalloc(sz, GFP_KERNEL);
684 if (!qp) { 709 if (!qp) {
685 vfree(swq);
686 ret = ERR_PTR(-ENOMEM); 710 ret = ERR_PTR(-ENOMEM);
687 goto bail; 711 goto bail_swq;
688 } 712 }
689 if (init_attr->srq) { 713 if (init_attr->srq) {
714 sz = 0;
690 qp->r_rq.size = 0; 715 qp->r_rq.size = 0;
691 qp->r_rq.max_sge = 0; 716 qp->r_rq.max_sge = 0;
692 qp->r_rq.wq = NULL; 717 qp->r_rq.wq = NULL;
718 init_attr->cap.max_recv_wr = 0;
719 init_attr->cap.max_recv_sge = 0;
693 } else { 720 } else {
694 qp->r_rq.size = init_attr->cap.max_recv_wr + 1; 721 qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
695 qp->r_rq.max_sge = init_attr->cap.max_recv_sge; 722 qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
696 sz = (sizeof(struct ipath_sge) * qp->r_rq.max_sge) + 723 sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
697 sizeof(struct ipath_rwqe); 724 sizeof(struct ipath_rwqe);
698 qp->r_rq.wq = vmalloc(qp->r_rq.size * sz); 725 qp->r_rq.wq = vmalloc_user(sizeof(struct ipath_rwq) +
726 qp->r_rq.size * sz);
699 if (!qp->r_rq.wq) { 727 if (!qp->r_rq.wq) {
700 kfree(qp);
701 vfree(swq);
702 ret = ERR_PTR(-ENOMEM); 728 ret = ERR_PTR(-ENOMEM);
703 goto bail; 729 goto bail_qp;
704 } 730 }
705 } 731 }
706 732
@@ -726,12 +752,10 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
726 err = ipath_alloc_qpn(&dev->qp_table, qp, 752 err = ipath_alloc_qpn(&dev->qp_table, qp,
727 init_attr->qp_type); 753 init_attr->qp_type);
728 if (err) { 754 if (err) {
729 vfree(swq);
730 vfree(qp->r_rq.wq);
731 kfree(qp);
732 ret = ERR_PTR(err); 755 ret = ERR_PTR(err);
733 goto bail; 756 goto bail_rwq;
734 } 757 }
758 qp->ip = NULL;
735 ipath_reset_qp(qp); 759 ipath_reset_qp(qp);
736 760
737 /* Tell the core driver that the kernel SMA is present. */ 761 /* Tell the core driver that the kernel SMA is present. */
@@ -748,8 +772,51 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
748 772
749 init_attr->cap.max_inline_data = 0; 773 init_attr->cap.max_inline_data = 0;
750 774
775 /*
776 * Return the address of the RWQ as the offset to mmap.
777 * See ipath_mmap() for details.
778 */
779 if (udata && udata->outlen >= sizeof(__u64)) {
780 struct ipath_mmap_info *ip;
781 __u64 offset = (__u64) qp->r_rq.wq;
782 int err;
783
784 err = ib_copy_to_udata(udata, &offset, sizeof(offset));
785 if (err) {
786 ret = ERR_PTR(err);
787 goto bail_rwq;
788 }
789
790 if (qp->r_rq.wq) {
791 /* Allocate info for ipath_mmap(). */
792 ip = kmalloc(sizeof(*ip), GFP_KERNEL);
793 if (!ip) {
794 ret = ERR_PTR(-ENOMEM);
795 goto bail_rwq;
796 }
797 qp->ip = ip;
798 ip->context = ibpd->uobject->context;
799 ip->obj = qp->r_rq.wq;
800 kref_init(&ip->ref);
801 ip->mmap_cnt = 0;
802 ip->size = PAGE_ALIGN(sizeof(struct ipath_rwq) +
803 qp->r_rq.size * sz);
804 spin_lock_irq(&dev->pending_lock);
805 ip->next = dev->pending_mmaps;
806 dev->pending_mmaps = ip;
807 spin_unlock_irq(&dev->pending_lock);
808 }
809 }
810
751 ret = &qp->ibqp; 811 ret = &qp->ibqp;
812 goto bail;
752 813
814bail_rwq:
815 vfree(qp->r_rq.wq);
816bail_qp:
817 kfree(qp);
818bail_swq:
819 vfree(swq);
753bail: 820bail:
754 return ret; 821 return ret;
755} 822}
@@ -773,11 +840,9 @@ int ipath_destroy_qp(struct ib_qp *ibqp)
773 if (qp->ibqp.qp_type == IB_QPT_SMI) 840 if (qp->ibqp.qp_type == IB_QPT_SMI)
774 ipath_layer_set_verbs_flags(dev->dd, 0); 841 ipath_layer_set_verbs_flags(dev->dd, 0);
775 842
776 spin_lock_irqsave(&qp->r_rq.lock, flags); 843 spin_lock_irqsave(&qp->s_lock, flags);
777 spin_lock(&qp->s_lock);
778 qp->state = IB_QPS_ERR; 844 qp->state = IB_QPS_ERR;
779 spin_unlock(&qp->s_lock); 845 spin_unlock_irqrestore(&qp->s_lock, flags);
780 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
781 846
782 /* Stop the sending tasklet. */ 847 /* Stop the sending tasklet. */
783 tasklet_kill(&qp->s_task); 848 tasklet_kill(&qp->s_task);
@@ -798,8 +863,11 @@ int ipath_destroy_qp(struct ib_qp *ibqp)
798 if (atomic_read(&qp->refcount) != 0) 863 if (atomic_read(&qp->refcount) != 0)
799 ipath_free_qp(&dev->qp_table, qp); 864 ipath_free_qp(&dev->qp_table, qp);
800 865
866 if (qp->ip)
867 kref_put(&qp->ip->ref, ipath_release_mmap_info);
868 else
869 vfree(qp->r_rq.wq);
801 vfree(qp->s_wq); 870 vfree(qp->s_wq);
802 vfree(qp->r_rq.wq);
803 kfree(qp); 871 kfree(qp);
804 return 0; 872 return 0;
805} 873}
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
index 772bc59fb85c..dd09420d677d 100644
--- a/drivers/infiniband/hw/ipath/ipath_ruc.c
+++ b/drivers/infiniband/hw/ipath/ipath_ruc.c
@@ -32,7 +32,7 @@
32 */ 32 */
33 33
34#include "ipath_verbs.h" 34#include "ipath_verbs.h"
35#include "ipath_common.h" 35#include "ipath_kernel.h"
36 36
37/* 37/*
38 * Convert the AETH RNR timeout code into the number of milliseconds. 38 * Convert the AETH RNR timeout code into the number of milliseconds.
@@ -106,6 +106,54 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp)
106 spin_unlock_irqrestore(&dev->pending_lock, flags); 106 spin_unlock_irqrestore(&dev->pending_lock, flags);
107} 107}
108 108
109static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe)
110{
111 struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
112 int user = to_ipd(qp->ibqp.pd)->user;
113 int i, j, ret;
114 struct ib_wc wc;
115
116 qp->r_len = 0;
117 for (i = j = 0; i < wqe->num_sge; i++) {
118 if (wqe->sg_list[i].length == 0)
119 continue;
120 /* Check LKEY */
121 if ((user && wqe->sg_list[i].lkey == 0) ||
122 !ipath_lkey_ok(&dev->lk_table,
123 &qp->r_sg_list[j], &wqe->sg_list[i],
124 IB_ACCESS_LOCAL_WRITE))
125 goto bad_lkey;
126 qp->r_len += wqe->sg_list[i].length;
127 j++;
128 }
129 qp->r_sge.sge = qp->r_sg_list[0];
130 qp->r_sge.sg_list = qp->r_sg_list + 1;
131 qp->r_sge.num_sge = j;
132 ret = 1;
133 goto bail;
134
135bad_lkey:
136 wc.wr_id = wqe->wr_id;
137 wc.status = IB_WC_LOC_PROT_ERR;
138 wc.opcode = IB_WC_RECV;
139 wc.vendor_err = 0;
140 wc.byte_len = 0;
141 wc.imm_data = 0;
142 wc.qp_num = qp->ibqp.qp_num;
143 wc.src_qp = 0;
144 wc.wc_flags = 0;
145 wc.pkey_index = 0;
146 wc.slid = 0;
147 wc.sl = 0;
148 wc.dlid_path_bits = 0;
149 wc.port_num = 0;
150 /* Signal solicited completion event. */
151 ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
152 ret = 0;
153bail:
154 return ret;
155}
156
109/** 157/**
110 * ipath_get_rwqe - copy the next RWQE into the QP's RWQE 158 * ipath_get_rwqe - copy the next RWQE into the QP's RWQE
111 * @qp: the QP 159 * @qp: the QP
@@ -119,71 +167,71 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
119{ 167{
120 unsigned long flags; 168 unsigned long flags;
121 struct ipath_rq *rq; 169 struct ipath_rq *rq;
170 struct ipath_rwq *wq;
122 struct ipath_srq *srq; 171 struct ipath_srq *srq;
123 struct ipath_rwqe *wqe; 172 struct ipath_rwqe *wqe;
124 int ret = 1; 173 void (*handler)(struct ib_event *, void *);
174 u32 tail;
175 int ret;
125 176
126 if (!qp->ibqp.srq) { 177 if (qp->ibqp.srq) {
178 srq = to_isrq(qp->ibqp.srq);
179 handler = srq->ibsrq.event_handler;
180 rq = &srq->rq;
181 } else {
182 srq = NULL;
183 handler = NULL;
127 rq = &qp->r_rq; 184 rq = &qp->r_rq;
128 spin_lock_irqsave(&rq->lock, flags);
129
130 if (unlikely(rq->tail == rq->head)) {
131 ret = 0;
132 goto done;
133 }
134 wqe = get_rwqe_ptr(rq, rq->tail);
135 qp->r_wr_id = wqe->wr_id;
136 if (!wr_id_only) {
137 qp->r_sge.sge = wqe->sg_list[0];
138 qp->r_sge.sg_list = wqe->sg_list + 1;
139 qp->r_sge.num_sge = wqe->num_sge;
140 qp->r_len = wqe->length;
141 }
142 if (++rq->tail >= rq->size)
143 rq->tail = 0;
144 goto done;
145 } 185 }
146 186
147 srq = to_isrq(qp->ibqp.srq);
148 rq = &srq->rq;
149 spin_lock_irqsave(&rq->lock, flags); 187 spin_lock_irqsave(&rq->lock, flags);
150 188 wq = rq->wq;
151 if (unlikely(rq->tail == rq->head)) { 189 tail = wq->tail;
152 ret = 0; 190 /* Validate tail before using it since it is user writable. */
153 goto done; 191 if (tail >= rq->size)
154 } 192 tail = 0;
155 wqe = get_rwqe_ptr(rq, rq->tail); 193 do {
194 if (unlikely(tail == wq->head)) {
195 spin_unlock_irqrestore(&rq->lock, flags);
196 ret = 0;
197 goto bail;
198 }
199 wqe = get_rwqe_ptr(rq, tail);
200 if (++tail >= rq->size)
201 tail = 0;
202 } while (!wr_id_only && !init_sge(qp, wqe));
156 qp->r_wr_id = wqe->wr_id; 203 qp->r_wr_id = wqe->wr_id;
157 if (!wr_id_only) { 204 wq->tail = tail;
158 qp->r_sge.sge = wqe->sg_list[0]; 205
159 qp->r_sge.sg_list = wqe->sg_list + 1; 206 ret = 1;
160 qp->r_sge.num_sge = wqe->num_sge; 207 if (handler) {
161 qp->r_len = wqe->length;
162 }
163 if (++rq->tail >= rq->size)
164 rq->tail = 0;
165 if (srq->ibsrq.event_handler) {
166 struct ib_event ev;
167 u32 n; 208 u32 n;
168 209
169 if (rq->head < rq->tail) 210 /*
170 n = rq->size + rq->head - rq->tail; 211 * validate head pointer value and compute
212 * the number of remaining WQEs.
213 */
214 n = wq->head;
215 if (n >= rq->size)
216 n = 0;
217 if (n < tail)
218 n += rq->size - tail;
171 else 219 else
172 n = rq->head - rq->tail; 220 n -= tail;
173 if (n < srq->limit) { 221 if (n < srq->limit) {
222 struct ib_event ev;
223
174 srq->limit = 0; 224 srq->limit = 0;
175 spin_unlock_irqrestore(&rq->lock, flags); 225 spin_unlock_irqrestore(&rq->lock, flags);
176 ev.device = qp->ibqp.device; 226 ev.device = qp->ibqp.device;
177 ev.element.srq = qp->ibqp.srq; 227 ev.element.srq = qp->ibqp.srq;
178 ev.event = IB_EVENT_SRQ_LIMIT_REACHED; 228 ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
179 srq->ibsrq.event_handler(&ev, 229 handler(&ev, srq->ibsrq.srq_context);
180 srq->ibsrq.srq_context);
181 goto bail; 230 goto bail;
182 } 231 }
183 } 232 }
184
185done:
186 spin_unlock_irqrestore(&rq->lock, flags); 233 spin_unlock_irqrestore(&rq->lock, flags);
234
187bail: 235bail:
188 return ret; 236 return ret;
189} 237}
diff --git a/drivers/infiniband/hw/ipath/ipath_srq.c b/drivers/infiniband/hw/ipath/ipath_srq.c
index fa77da6667ed..941e866d9517 100644
--- a/drivers/infiniband/hw/ipath/ipath_srq.c
+++ b/drivers/infiniband/hw/ipath/ipath_srq.c
@@ -48,66 +48,39 @@ int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
48 struct ib_recv_wr **bad_wr) 48 struct ib_recv_wr **bad_wr)
49{ 49{
50 struct ipath_srq *srq = to_isrq(ibsrq); 50 struct ipath_srq *srq = to_isrq(ibsrq);
51 struct ipath_ibdev *dev = to_idev(ibsrq->device); 51 struct ipath_rwq *wq;
52 unsigned long flags; 52 unsigned long flags;
53 int ret; 53 int ret;
54 54
55 for (; wr; wr = wr->next) { 55 for (; wr; wr = wr->next) {
56 struct ipath_rwqe *wqe; 56 struct ipath_rwqe *wqe;
57 u32 next; 57 u32 next;
58 int i, j; 58 int i;
59 59
60 if (wr->num_sge > srq->rq.max_sge) { 60 if ((unsigned) wr->num_sge > srq->rq.max_sge) {
61 *bad_wr = wr; 61 *bad_wr = wr;
62 ret = -ENOMEM; 62 ret = -ENOMEM;
63 goto bail; 63 goto bail;
64 } 64 }
65 65
66 spin_lock_irqsave(&srq->rq.lock, flags); 66 spin_lock_irqsave(&srq->rq.lock, flags);
67 next = srq->rq.head + 1; 67 wq = srq->rq.wq;
68 next = wq->head + 1;
68 if (next >= srq->rq.size) 69 if (next >= srq->rq.size)
69 next = 0; 70 next = 0;
70 if (next == srq->rq.tail) { 71 if (next == wq->tail) {
71 spin_unlock_irqrestore(&srq->rq.lock, flags); 72 spin_unlock_irqrestore(&srq->rq.lock, flags);
72 *bad_wr = wr; 73 *bad_wr = wr;
73 ret = -ENOMEM; 74 ret = -ENOMEM;
74 goto bail; 75 goto bail;
75 } 76 }
76 77
77 wqe = get_rwqe_ptr(&srq->rq, srq->rq.head); 78 wqe = get_rwqe_ptr(&srq->rq, wq->head);
78 wqe->wr_id = wr->wr_id; 79 wqe->wr_id = wr->wr_id;
79 wqe->sg_list[0].mr = NULL; 80 wqe->num_sge = wr->num_sge;
80 wqe->sg_list[0].vaddr = NULL; 81 for (i = 0; i < wr->num_sge; i++)
81 wqe->sg_list[0].length = 0; 82 wqe->sg_list[i] = wr->sg_list[i];
82 wqe->sg_list[0].sge_length = 0; 83 wq->head = next;
83 wqe->length = 0;
84 for (i = 0, j = 0; i < wr->num_sge; i++) {
85 /* Check LKEY */
86 if (to_ipd(srq->ibsrq.pd)->user &&
87 wr->sg_list[i].lkey == 0) {
88 spin_unlock_irqrestore(&srq->rq.lock,
89 flags);
90 *bad_wr = wr;
91 ret = -EINVAL;
92 goto bail;
93 }
94 if (wr->sg_list[i].length == 0)
95 continue;
96 if (!ipath_lkey_ok(&dev->lk_table,
97 &wqe->sg_list[j],
98 &wr->sg_list[i],
99 IB_ACCESS_LOCAL_WRITE)) {
100 spin_unlock_irqrestore(&srq->rq.lock,
101 flags);
102 *bad_wr = wr;
103 ret = -EINVAL;
104 goto bail;
105 }
106 wqe->length += wr->sg_list[i].length;
107 j++;
108 }
109 wqe->num_sge = j;
110 srq->rq.head = next;
111 spin_unlock_irqrestore(&srq->rq.lock, flags); 84 spin_unlock_irqrestore(&srq->rq.lock, flags);
112 } 85 }
113 ret = 0; 86 ret = 0;
@@ -133,53 +106,95 @@ struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
133 106
134 if (dev->n_srqs_allocated == ib_ipath_max_srqs) { 107 if (dev->n_srqs_allocated == ib_ipath_max_srqs) {
135 ret = ERR_PTR(-ENOMEM); 108 ret = ERR_PTR(-ENOMEM);
136 goto bail; 109 goto done;
137 } 110 }
138 111
139 if (srq_init_attr->attr.max_wr == 0) { 112 if (srq_init_attr->attr.max_wr == 0) {
140 ret = ERR_PTR(-EINVAL); 113 ret = ERR_PTR(-EINVAL);
141 goto bail; 114 goto done;
142 } 115 }
143 116
144 if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) || 117 if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) ||
145 (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) { 118 (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) {
146 ret = ERR_PTR(-EINVAL); 119 ret = ERR_PTR(-EINVAL);
147 goto bail; 120 goto done;
148 } 121 }
149 122
150 srq = kmalloc(sizeof(*srq), GFP_KERNEL); 123 srq = kmalloc(sizeof(*srq), GFP_KERNEL);
151 if (!srq) { 124 if (!srq) {
152 ret = ERR_PTR(-ENOMEM); 125 ret = ERR_PTR(-ENOMEM);
153 goto bail; 126 goto done;
154 } 127 }
155 128
156 /* 129 /*
157 * Need to use vmalloc() if we want to support large #s of entries. 130 * Need to use vmalloc() if we want to support large #s of entries.
158 */ 131 */
159 srq->rq.size = srq_init_attr->attr.max_wr + 1; 132 srq->rq.size = srq_init_attr->attr.max_wr + 1;
160 sz = sizeof(struct ipath_sge) * srq_init_attr->attr.max_sge + 133 srq->rq.max_sge = srq_init_attr->attr.max_sge;
134 sz = sizeof(struct ib_sge) * srq->rq.max_sge +
161 sizeof(struct ipath_rwqe); 135 sizeof(struct ipath_rwqe);
162 srq->rq.wq = vmalloc(srq->rq.size * sz); 136 srq->rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + srq->rq.size * sz);
163 if (!srq->rq.wq) { 137 if (!srq->rq.wq) {
164 kfree(srq);
165 ret = ERR_PTR(-ENOMEM); 138 ret = ERR_PTR(-ENOMEM);
166 goto bail; 139 goto bail_srq;
167 } 140 }
168 141
169 /* 142 /*
143 * Return the address of the RWQ as the offset to mmap.
144 * See ipath_mmap() for details.
145 */
146 if (udata && udata->outlen >= sizeof(__u64)) {
147 struct ipath_mmap_info *ip;
148 __u64 offset = (__u64) srq->rq.wq;
149 int err;
150
151 err = ib_copy_to_udata(udata, &offset, sizeof(offset));
152 if (err) {
153 ret = ERR_PTR(err);
154 goto bail_wq;
155 }
156
157 /* Allocate info for ipath_mmap(). */
158 ip = kmalloc(sizeof(*ip), GFP_KERNEL);
159 if (!ip) {
160 ret = ERR_PTR(-ENOMEM);
161 goto bail_wq;
162 }
163 srq->ip = ip;
164 ip->context = ibpd->uobject->context;
165 ip->obj = srq->rq.wq;
166 kref_init(&ip->ref);
167 ip->mmap_cnt = 0;
168 ip->size = PAGE_ALIGN(sizeof(struct ipath_rwq) +
169 srq->rq.size * sz);
170 spin_lock_irq(&dev->pending_lock);
171 ip->next = dev->pending_mmaps;
172 dev->pending_mmaps = ip;
173 spin_unlock_irq(&dev->pending_lock);
174 } else
175 srq->ip = NULL;
176
177 /*
170 * ib_create_srq() will initialize srq->ibsrq. 178 * ib_create_srq() will initialize srq->ibsrq.
171 */ 179 */
172 spin_lock_init(&srq->rq.lock); 180 spin_lock_init(&srq->rq.lock);
173 srq->rq.head = 0; 181 srq->rq.wq->head = 0;
174 srq->rq.tail = 0; 182 srq->rq.wq->tail = 0;
175 srq->rq.max_sge = srq_init_attr->attr.max_sge; 183 srq->rq.max_sge = srq_init_attr->attr.max_sge;
176 srq->limit = srq_init_attr->attr.srq_limit; 184 srq->limit = srq_init_attr->attr.srq_limit;
177 185
186 dev->n_srqs_allocated++;
187
178 ret = &srq->ibsrq; 188 ret = &srq->ibsrq;
189 goto done;
179 190
180 dev->n_srqs_allocated++; 191bail_wq:
192 vfree(srq->rq.wq);
181 193
182bail: 194bail_srq:
195 kfree(srq);
196
197done:
183 return ret; 198 return ret;
184} 199}
185 200
@@ -195,78 +210,123 @@ int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
195 struct ib_udata *udata) 210 struct ib_udata *udata)
196{ 211{
197 struct ipath_srq *srq = to_isrq(ibsrq); 212 struct ipath_srq *srq = to_isrq(ibsrq);
198 unsigned long flags; 213 int ret = 0;
199 int ret;
200 214
201 if (attr_mask & IB_SRQ_MAX_WR) 215 if (attr_mask & IB_SRQ_MAX_WR) {
202 if ((attr->max_wr > ib_ipath_max_srq_wrs) || 216 struct ipath_rwq *owq;
203 (attr->max_sge > srq->rq.max_sge)) { 217 struct ipath_rwq *wq;
204 ret = -EINVAL; 218 struct ipath_rwqe *p;
205 goto bail; 219 u32 sz, size, n, head, tail;
206 }
207 220
208 if (attr_mask & IB_SRQ_LIMIT) 221 /* Check that the requested sizes are below the limits. */
209 if (attr->srq_limit >= srq->rq.size) { 222 if ((attr->max_wr > ib_ipath_max_srq_wrs) ||
223 ((attr_mask & IB_SRQ_LIMIT) ?
224 attr->srq_limit : srq->limit) > attr->max_wr) {
210 ret = -EINVAL; 225 ret = -EINVAL;
211 goto bail; 226 goto bail;
212 } 227 }
213 228
214 if (attr_mask & IB_SRQ_MAX_WR) {
215 struct ipath_rwqe *wq, *p;
216 u32 sz, size, n;
217
218 sz = sizeof(struct ipath_rwqe) + 229 sz = sizeof(struct ipath_rwqe) +
219 attr->max_sge * sizeof(struct ipath_sge); 230 srq->rq.max_sge * sizeof(struct ib_sge);
220 size = attr->max_wr + 1; 231 size = attr->max_wr + 1;
221 wq = vmalloc(size * sz); 232 wq = vmalloc_user(sizeof(struct ipath_rwq) + size * sz);
222 if (!wq) { 233 if (!wq) {
223 ret = -ENOMEM; 234 ret = -ENOMEM;
224 goto bail; 235 goto bail;
225 } 236 }
226 237
227 spin_lock_irqsave(&srq->rq.lock, flags); 238 /*
228 if (srq->rq.head < srq->rq.tail) 239 * Return the address of the RWQ as the offset to mmap.
229 n = srq->rq.size + srq->rq.head - srq->rq.tail; 240 * See ipath_mmap() for details.
241 */
242 if (udata && udata->inlen >= sizeof(__u64)) {
243 __u64 offset_addr;
244 __u64 offset = (__u64) wq;
245
246 ret = ib_copy_from_udata(&offset_addr, udata,
247 sizeof(offset_addr));
248 if (ret) {
249 vfree(wq);
250 goto bail;
251 }
252 udata->outbuf = (void __user *) offset_addr;
253 ret = ib_copy_to_udata(udata, &offset,
254 sizeof(offset));
255 if (ret) {
256 vfree(wq);
257 goto bail;
258 }
259 }
260
261 spin_lock_irq(&srq->rq.lock);
262 /*
263 * validate head pointer value and compute
264 * the number of remaining WQEs.
265 */
266 owq = srq->rq.wq;
267 head = owq->head;
268 if (head >= srq->rq.size)
269 head = 0;
270 tail = owq->tail;
271 if (tail >= srq->rq.size)
272 tail = 0;
273 n = head;
274 if (n < tail)
275 n += srq->rq.size - tail;
230 else 276 else
231 n = srq->rq.head - srq->rq.tail; 277 n -= tail;
232 if (size <= n || size <= srq->limit) { 278 if (size <= n) {
233 spin_unlock_irqrestore(&srq->rq.lock, flags); 279 spin_unlock_irq(&srq->rq.lock);
234 vfree(wq); 280 vfree(wq);
235 ret = -EINVAL; 281 ret = -EINVAL;
236 goto bail; 282 goto bail;
237 } 283 }
238 n = 0; 284 n = 0;
239 p = wq; 285 p = wq->wq;
240 while (srq->rq.tail != srq->rq.head) { 286 while (tail != head) {
241 struct ipath_rwqe *wqe; 287 struct ipath_rwqe *wqe;
242 int i; 288 int i;
243 289
244 wqe = get_rwqe_ptr(&srq->rq, srq->rq.tail); 290 wqe = get_rwqe_ptr(&srq->rq, tail);
245 p->wr_id = wqe->wr_id; 291 p->wr_id = wqe->wr_id;
246 p->length = wqe->length;
247 p->num_sge = wqe->num_sge; 292 p->num_sge = wqe->num_sge;
248 for (i = 0; i < wqe->num_sge; i++) 293 for (i = 0; i < wqe->num_sge; i++)
249 p->sg_list[i] = wqe->sg_list[i]; 294 p->sg_list[i] = wqe->sg_list[i];
250 n++; 295 n++;
251 p = (struct ipath_rwqe *)((char *) p + sz); 296 p = (struct ipath_rwqe *)((char *) p + sz);
252 if (++srq->rq.tail >= srq->rq.size) 297 if (++tail >= srq->rq.size)
253 srq->rq.tail = 0; 298 tail = 0;
254 } 299 }
255 vfree(srq->rq.wq);
256 srq->rq.wq = wq; 300 srq->rq.wq = wq;
257 srq->rq.size = size; 301 srq->rq.size = size;
258 srq->rq.head = n; 302 wq->head = n;
259 srq->rq.tail = 0; 303 wq->tail = 0;
260 srq->rq.max_sge = attr->max_sge; 304 if (attr_mask & IB_SRQ_LIMIT)
261 spin_unlock_irqrestore(&srq->rq.lock, flags); 305 srq->limit = attr->srq_limit;
262 } 306 spin_unlock_irq(&srq->rq.lock);
263 307
264 if (attr_mask & IB_SRQ_LIMIT) { 308 vfree(owq);
265 spin_lock_irqsave(&srq->rq.lock, flags); 309
266 srq->limit = attr->srq_limit; 310 if (srq->ip) {
267 spin_unlock_irqrestore(&srq->rq.lock, flags); 311 struct ipath_mmap_info *ip = srq->ip;
312 struct ipath_ibdev *dev = to_idev(srq->ibsrq.device);
313
314 ip->obj = wq;
315 ip->size = PAGE_ALIGN(sizeof(struct ipath_rwq) +
316 size * sz);
317 spin_lock_irq(&dev->pending_lock);
318 ip->next = dev->pending_mmaps;
319 dev->pending_mmaps = ip;
320 spin_unlock_irq(&dev->pending_lock);
321 }
322 } else if (attr_mask & IB_SRQ_LIMIT) {
323 spin_lock_irq(&srq->rq.lock);
324 if (attr->srq_limit >= srq->rq.size)
325 ret = -EINVAL;
326 else
327 srq->limit = attr->srq_limit;
328 spin_unlock_irq(&srq->rq.lock);
268 } 329 }
269 ret = 0;
270 330
271bail: 331bail:
272 return ret; 332 return ret;
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c
index 3466129af804..82439fcfc2f8 100644
--- a/drivers/infiniband/hw/ipath/ipath_ud.c
+++ b/drivers/infiniband/hw/ipath/ipath_ud.c
@@ -34,7 +34,54 @@
34#include <rdma/ib_smi.h> 34#include <rdma/ib_smi.h>
35 35
36#include "ipath_verbs.h" 36#include "ipath_verbs.h"
37#include "ipath_common.h" 37#include "ipath_kernel.h"
38
39static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
40 u32 *lengthp, struct ipath_sge_state *ss)
41{
42 struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
43 int user = to_ipd(qp->ibqp.pd)->user;
44 int i, j, ret;
45 struct ib_wc wc;
46
47 *lengthp = 0;
48 for (i = j = 0; i < wqe->num_sge; i++) {
49 if (wqe->sg_list[i].length == 0)
50 continue;
51 /* Check LKEY */
52 if ((user && wqe->sg_list[i].lkey == 0) ||
53 !ipath_lkey_ok(&dev->lk_table,
54 j ? &ss->sg_list[j - 1] : &ss->sge,
55 &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
56 goto bad_lkey;
57 *lengthp += wqe->sg_list[i].length;
58 j++;
59 }
60 ss->num_sge = j;
61 ret = 1;
62 goto bail;
63
64bad_lkey:
65 wc.wr_id = wqe->wr_id;
66 wc.status = IB_WC_LOC_PROT_ERR;
67 wc.opcode = IB_WC_RECV;
68 wc.vendor_err = 0;
69 wc.byte_len = 0;
70 wc.imm_data = 0;
71 wc.qp_num = qp->ibqp.qp_num;
72 wc.src_qp = 0;
73 wc.wc_flags = 0;
74 wc.pkey_index = 0;
75 wc.slid = 0;
76 wc.sl = 0;
77 wc.dlid_path_bits = 0;
78 wc.port_num = 0;
79 /* Signal solicited completion event. */
80 ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
81 ret = 0;
82bail:
83 return ret;
84}
38 85
39/** 86/**
40 * ipath_ud_loopback - handle send on loopback QPs 87 * ipath_ud_loopback - handle send on loopback QPs
@@ -46,6 +93,8 @@
46 * 93 *
47 * This is called from ipath_post_ud_send() to forward a WQE addressed 94 * This is called from ipath_post_ud_send() to forward a WQE addressed
48 * to the same HCA. 95 * to the same HCA.
96 * Note that the receive interrupt handler may be calling ipath_ud_rcv()
97 * while this is being called.
49 */ 98 */
50static void ipath_ud_loopback(struct ipath_qp *sqp, 99static void ipath_ud_loopback(struct ipath_qp *sqp,
51 struct ipath_sge_state *ss, 100 struct ipath_sge_state *ss,
@@ -60,7 +109,11 @@ static void ipath_ud_loopback(struct ipath_qp *sqp,
60 struct ipath_srq *srq; 109 struct ipath_srq *srq;
61 struct ipath_sge_state rsge; 110 struct ipath_sge_state rsge;
62 struct ipath_sge *sge; 111 struct ipath_sge *sge;
112 struct ipath_rwq *wq;
63 struct ipath_rwqe *wqe; 113 struct ipath_rwqe *wqe;
114 void (*handler)(struct ib_event *, void *);
115 u32 tail;
116 u32 rlen;
64 117
65 qp = ipath_lookup_qpn(&dev->qp_table, wr->wr.ud.remote_qpn); 118 qp = ipath_lookup_qpn(&dev->qp_table, wr->wr.ud.remote_qpn);
66 if (!qp) 119 if (!qp)
@@ -94,6 +147,13 @@ static void ipath_ud_loopback(struct ipath_qp *sqp,
94 wc->imm_data = 0; 147 wc->imm_data = 0;
95 } 148 }
96 149
150 if (wr->num_sge > 1) {
151 rsge.sg_list = kmalloc((wr->num_sge - 1) *
152 sizeof(struct ipath_sge),
153 GFP_ATOMIC);
154 } else
155 rsge.sg_list = NULL;
156
97 /* 157 /*
98 * Get the next work request entry to find where to put the data. 158 * Get the next work request entry to find where to put the data.
99 * Note that it is safe to drop the lock after changing rq->tail 159 * Note that it is safe to drop the lock after changing rq->tail
@@ -101,37 +161,52 @@ static void ipath_ud_loopback(struct ipath_qp *sqp,
101 */ 161 */
102 if (qp->ibqp.srq) { 162 if (qp->ibqp.srq) {
103 srq = to_isrq(qp->ibqp.srq); 163 srq = to_isrq(qp->ibqp.srq);
164 handler = srq->ibsrq.event_handler;
104 rq = &srq->rq; 165 rq = &srq->rq;
105 } else { 166 } else {
106 srq = NULL; 167 srq = NULL;
168 handler = NULL;
107 rq = &qp->r_rq; 169 rq = &qp->r_rq;
108 } 170 }
171
109 spin_lock_irqsave(&rq->lock, flags); 172 spin_lock_irqsave(&rq->lock, flags);
110 if (rq->tail == rq->head) { 173 wq = rq->wq;
111 spin_unlock_irqrestore(&rq->lock, flags); 174 tail = wq->tail;
112 dev->n_pkt_drops++; 175 while (1) {
113 goto done; 176 if (unlikely(tail == wq->head)) {
177 spin_unlock_irqrestore(&rq->lock, flags);
178 dev->n_pkt_drops++;
179 goto bail_sge;
180 }
181 wqe = get_rwqe_ptr(rq, tail);
182 if (++tail >= rq->size)
183 tail = 0;
184 if (init_sge(qp, wqe, &rlen, &rsge))
185 break;
186 wq->tail = tail;
114 } 187 }
115 /* Silently drop packets which are too big. */ 188 /* Silently drop packets which are too big. */
116 wqe = get_rwqe_ptr(rq, rq->tail); 189 if (wc->byte_len > rlen) {
117 if (wc->byte_len > wqe->length) {
118 spin_unlock_irqrestore(&rq->lock, flags); 190 spin_unlock_irqrestore(&rq->lock, flags);
119 dev->n_pkt_drops++; 191 dev->n_pkt_drops++;
120 goto done; 192 goto bail_sge;
121 } 193 }
194 wq->tail = tail;
122 wc->wr_id = wqe->wr_id; 195 wc->wr_id = wqe->wr_id;
123 rsge.sge = wqe->sg_list[0]; 196 if (handler) {
124 rsge.sg_list = wqe->sg_list + 1;
125 rsge.num_sge = wqe->num_sge;
126 if (++rq->tail >= rq->size)
127 rq->tail = 0;
128 if (srq && srq->ibsrq.event_handler) {
129 u32 n; 197 u32 n;
130 198
131 if (rq->head < rq->tail) 199 /*
132 n = rq->size + rq->head - rq->tail; 200 * validate head pointer value and compute
201 * the number of remaining WQEs.
202 */
203 n = wq->head;
204 if (n >= rq->size)
205 n = 0;
206 if (n < tail)
207 n += rq->size - tail;
133 else 208 else
134 n = rq->head - rq->tail; 209 n -= tail;
135 if (n < srq->limit) { 210 if (n < srq->limit) {
136 struct ib_event ev; 211 struct ib_event ev;
137 212
@@ -140,12 +215,12 @@ static void ipath_ud_loopback(struct ipath_qp *sqp,
140 ev.device = qp->ibqp.device; 215 ev.device = qp->ibqp.device;
141 ev.element.srq = qp->ibqp.srq; 216 ev.element.srq = qp->ibqp.srq;
142 ev.event = IB_EVENT_SRQ_LIMIT_REACHED; 217 ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
143 srq->ibsrq.event_handler(&ev, 218 handler(&ev, srq->ibsrq.srq_context);
144 srq->ibsrq.srq_context);
145 } else 219 } else
146 spin_unlock_irqrestore(&rq->lock, flags); 220 spin_unlock_irqrestore(&rq->lock, flags);
147 } else 221 } else
148 spin_unlock_irqrestore(&rq->lock, flags); 222 spin_unlock_irqrestore(&rq->lock, flags);
223
149 ah_attr = &to_iah(wr->wr.ud.ah)->attr; 224 ah_attr = &to_iah(wr->wr.ud.ah)->attr;
150 if (ah_attr->ah_flags & IB_AH_GRH) { 225 if (ah_attr->ah_flags & IB_AH_GRH) {
151 ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh)); 226 ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
@@ -186,7 +261,7 @@ static void ipath_ud_loopback(struct ipath_qp *sqp,
186 wc->src_qp = sqp->ibqp.qp_num; 261 wc->src_qp = sqp->ibqp.qp_num;
187 /* XXX do we know which pkey matched? Only needed for GSI. */ 262 /* XXX do we know which pkey matched? Only needed for GSI. */
188 wc->pkey_index = 0; 263 wc->pkey_index = 0;
189 wc->slid = ipath_layer_get_lid(dev->dd) | 264 wc->slid = dev->dd->ipath_lid |
190 (ah_attr->src_path_bits & 265 (ah_attr->src_path_bits &
191 ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1)); 266 ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1));
192 wc->sl = ah_attr->sl; 267 wc->sl = ah_attr->sl;
@@ -196,6 +271,8 @@ static void ipath_ud_loopback(struct ipath_qp *sqp,
196 ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc, 271 ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc,
197 wr->send_flags & IB_SEND_SOLICITED); 272 wr->send_flags & IB_SEND_SOLICITED);
198 273
274bail_sge:
275 kfree(rsge.sg_list);
199done: 276done:
200 if (atomic_dec_and_test(&qp->refcount)) 277 if (atomic_dec_and_test(&qp->refcount))
201 wake_up(&qp->wait); 278 wake_up(&qp->wait);
@@ -433,13 +510,9 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
433 int opcode; 510 int opcode;
434 u32 hdrsize; 511 u32 hdrsize;
435 u32 pad; 512 u32 pad;
436 unsigned long flags;
437 struct ib_wc wc; 513 struct ib_wc wc;
438 u32 qkey; 514 u32 qkey;
439 u32 src_qp; 515 u32 src_qp;
440 struct ipath_rq *rq;
441 struct ipath_srq *srq;
442 struct ipath_rwqe *wqe;
443 u16 dlid; 516 u16 dlid;
444 int header_in_data; 517 int header_in_data;
445 518
@@ -547,19 +620,10 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
547 620
548 /* 621 /*
549 * Get the next work request entry to find where to put the data. 622 * Get the next work request entry to find where to put the data.
550 * Note that it is safe to drop the lock after changing rq->tail
551 * since ipath_post_receive() won't fill the empty slot.
552 */ 623 */
553 if (qp->ibqp.srq) { 624 if (qp->r_reuse_sge)
554 srq = to_isrq(qp->ibqp.srq); 625 qp->r_reuse_sge = 0;
555 rq = &srq->rq; 626 else if (!ipath_get_rwqe(qp, 0)) {
556 } else {
557 srq = NULL;
558 rq = &qp->r_rq;
559 }
560 spin_lock_irqsave(&rq->lock, flags);
561 if (rq->tail == rq->head) {
562 spin_unlock_irqrestore(&rq->lock, flags);
563 /* 627 /*
564 * Count VL15 packets dropped due to no receive buffer. 628 * Count VL15 packets dropped due to no receive buffer.
565 * Otherwise, count them as buffer overruns since usually, 629 * Otherwise, count them as buffer overruns since usually,
@@ -573,39 +637,11 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
573 goto bail; 637 goto bail;
574 } 638 }
575 /* Silently drop packets which are too big. */ 639 /* Silently drop packets which are too big. */
576 wqe = get_rwqe_ptr(rq, rq->tail); 640 if (wc.byte_len > qp->r_len) {
577 if (wc.byte_len > wqe->length) { 641 qp->r_reuse_sge = 1;
578 spin_unlock_irqrestore(&rq->lock, flags);
579 dev->n_pkt_drops++; 642 dev->n_pkt_drops++;
580 goto bail; 643 goto bail;
581 } 644 }
582 wc.wr_id = wqe->wr_id;
583 qp->r_sge.sge = wqe->sg_list[0];
584 qp->r_sge.sg_list = wqe->sg_list + 1;
585 qp->r_sge.num_sge = wqe->num_sge;
586 if (++rq->tail >= rq->size)
587 rq->tail = 0;
588 if (srq && srq->ibsrq.event_handler) {
589 u32 n;
590
591 if (rq->head < rq->tail)
592 n = rq->size + rq->head - rq->tail;
593 else
594 n = rq->head - rq->tail;
595 if (n < srq->limit) {
596 struct ib_event ev;
597
598 srq->limit = 0;
599 spin_unlock_irqrestore(&rq->lock, flags);
600 ev.device = qp->ibqp.device;
601 ev.element.srq = qp->ibqp.srq;
602 ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
603 srq->ibsrq.event_handler(&ev,
604 srq->ibsrq.srq_context);
605 } else
606 spin_unlock_irqrestore(&rq->lock, flags);
607 } else
608 spin_unlock_irqrestore(&rq->lock, flags);
609 if (has_grh) { 645 if (has_grh) {
610 ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh, 646 ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh,
611 sizeof(struct ib_grh)); 647 sizeof(struct ib_grh));
@@ -614,6 +650,7 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
614 ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh)); 650 ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh));
615 ipath_copy_sge(&qp->r_sge, data, 651 ipath_copy_sge(&qp->r_sge, data,
616 wc.byte_len - sizeof(struct ib_grh)); 652 wc.byte_len - sizeof(struct ib_grh));
653 wc.wr_id = qp->r_wr_id;
617 wc.status = IB_WC_SUCCESS; 654 wc.status = IB_WC_SUCCESS;
618 wc.opcode = IB_WC_RECV; 655 wc.opcode = IB_WC_RECV;
619 wc.vendor_err = 0; 656 wc.vendor_err = 0;
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
index d70a9b6b5239..a2b4c70192d8 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -277,11 +277,12 @@ static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
277 struct ib_recv_wr **bad_wr) 277 struct ib_recv_wr **bad_wr)
278{ 278{
279 struct ipath_qp *qp = to_iqp(ibqp); 279 struct ipath_qp *qp = to_iqp(ibqp);
280 struct ipath_rwq *wq = qp->r_rq.wq;
280 unsigned long flags; 281 unsigned long flags;
281 int ret; 282 int ret;
282 283
283 /* Check that state is OK to post receive. */ 284 /* Check that state is OK to post receive. */
284 if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK)) { 285 if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) {
285 *bad_wr = wr; 286 *bad_wr = wr;
286 ret = -EINVAL; 287 ret = -EINVAL;
287 goto bail; 288 goto bail;
@@ -290,59 +291,31 @@ static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
290 for (; wr; wr = wr->next) { 291 for (; wr; wr = wr->next) {
291 struct ipath_rwqe *wqe; 292 struct ipath_rwqe *wqe;
292 u32 next; 293 u32 next;
293 int i, j; 294 int i;
294 295
295 if (wr->num_sge > qp->r_rq.max_sge) { 296 if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
296 *bad_wr = wr; 297 *bad_wr = wr;
297 ret = -ENOMEM; 298 ret = -ENOMEM;
298 goto bail; 299 goto bail;
299 } 300 }
300 301
301 spin_lock_irqsave(&qp->r_rq.lock, flags); 302 spin_lock_irqsave(&qp->r_rq.lock, flags);
302 next = qp->r_rq.head + 1; 303 next = wq->head + 1;
303 if (next >= qp->r_rq.size) 304 if (next >= qp->r_rq.size)
304 next = 0; 305 next = 0;
305 if (next == qp->r_rq.tail) { 306 if (next == wq->tail) {
306 spin_unlock_irqrestore(&qp->r_rq.lock, flags); 307 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
307 *bad_wr = wr; 308 *bad_wr = wr;
308 ret = -ENOMEM; 309 ret = -ENOMEM;
309 goto bail; 310 goto bail;
310 } 311 }
311 312
312 wqe = get_rwqe_ptr(&qp->r_rq, qp->r_rq.head); 313 wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
313 wqe->wr_id = wr->wr_id; 314 wqe->wr_id = wr->wr_id;
314 wqe->sg_list[0].mr = NULL; 315 wqe->num_sge = wr->num_sge;
315 wqe->sg_list[0].vaddr = NULL; 316 for (i = 0; i < wr->num_sge; i++)
316 wqe->sg_list[0].length = 0; 317 wqe->sg_list[i] = wr->sg_list[i];
317 wqe->sg_list[0].sge_length = 0; 318 wq->head = next;
318 wqe->length = 0;
319 for (i = 0, j = 0; i < wr->num_sge; i++) {
320 /* Check LKEY */
321 if (to_ipd(qp->ibqp.pd)->user &&
322 wr->sg_list[i].lkey == 0) {
323 spin_unlock_irqrestore(&qp->r_rq.lock,
324 flags);
325 *bad_wr = wr;
326 ret = -EINVAL;
327 goto bail;
328 }
329 if (wr->sg_list[i].length == 0)
330 continue;
331 if (!ipath_lkey_ok(
332 &to_idev(qp->ibqp.device)->lk_table,
333 &wqe->sg_list[j], &wr->sg_list[i],
334 IB_ACCESS_LOCAL_WRITE)) {
335 spin_unlock_irqrestore(&qp->r_rq.lock,
336 flags);
337 *bad_wr = wr;
338 ret = -EINVAL;
339 goto bail;
340 }
341 wqe->length += wr->sg_list[i].length;
342 j++;
343 }
344 wqe->num_sge = j;
345 qp->r_rq.head = next;
346 spin_unlock_irqrestore(&qp->r_rq.lock, flags); 319 spin_unlock_irqrestore(&qp->r_rq.lock, flags);
347 } 320 }
348 ret = 0; 321 ret = 0;
@@ -1137,6 +1110,7 @@ static void *ipath_register_ib_device(int unit, struct ipath_devdata *dd)
1137 dev->attach_mcast = ipath_multicast_attach; 1110 dev->attach_mcast = ipath_multicast_attach;
1138 dev->detach_mcast = ipath_multicast_detach; 1111 dev->detach_mcast = ipath_multicast_detach;
1139 dev->process_mad = ipath_process_mad; 1112 dev->process_mad = ipath_process_mad;
1113 dev->mmap = ipath_mmap;
1140 1114
1141 snprintf(dev->node_desc, sizeof(dev->node_desc), 1115 snprintf(dev->node_desc, sizeof(dev->node_desc),
1142 IPATH_IDSTR " %s kernel_SMA", system_utsname.nodename); 1116 IPATH_IDSTR " %s kernel_SMA", system_utsname.nodename);
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
index 698396778f00..7d2ba72609f7 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.h
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.h
@@ -38,6 +38,7 @@
38#include <linux/spinlock.h> 38#include <linux/spinlock.h>
39#include <linux/kernel.h> 39#include <linux/kernel.h>
40#include <linux/interrupt.h> 40#include <linux/interrupt.h>
41#include <linux/kref.h>
41#include <rdma/ib_pack.h> 42#include <rdma/ib_pack.h>
42 43
43#include "ipath_layer.h" 44#include "ipath_layer.h"
@@ -50,7 +51,7 @@
50 * Increment this value if any changes that break userspace ABI 51 * Increment this value if any changes that break userspace ABI
51 * compatibility are made. 52 * compatibility are made.
52 */ 53 */
53#define IPATH_UVERBS_ABI_VERSION 1 54#define IPATH_UVERBS_ABI_VERSION 2
54 55
55/* 56/*
56 * Define an ib_cq_notify value that is not valid so we know when CQ 57 * Define an ib_cq_notify value that is not valid so we know when CQ
@@ -178,58 +179,41 @@ struct ipath_ah {
178}; 179};
179 180
180/* 181/*
181 * Quick description of our CQ/QP locking scheme: 182 * This structure is used by ipath_mmap() to validate an offset
182 * 183 * when an mmap() request is made. The vm_area_struct then uses
183 * We have one global lock that protects dev->cq/qp_table. Each 184 * this as its vm_private_data.
184 * struct ipath_cq/qp also has its own lock. An individual qp lock
185 * may be taken inside of an individual cq lock. Both cqs attached to
186 * a qp may be locked, with the send cq locked first. No other
187 * nesting should be done.
188 *
189 * Each struct ipath_cq/qp also has an atomic_t ref count. The
190 * pointer from the cq/qp_table to the struct counts as one reference.
191 * This reference also is good for access through the consumer API, so
192 * modifying the CQ/QP etc doesn't need to take another reference.
193 * Access because of a completion being polled does need a reference.
194 *
195 * Finally, each struct ipath_cq/qp has a wait_queue_head_t for the
196 * destroy function to sleep on.
197 *
198 * This means that access from the consumer API requires nothing but
199 * taking the struct's lock.
200 *
201 * Access because of a completion event should go as follows:
202 * - lock cq/qp_table and look up struct
203 * - increment ref count in struct
204 * - drop cq/qp_table lock
205 * - lock struct, do your thing, and unlock struct
206 * - decrement ref count; if zero, wake up waiters
207 *
208 * To destroy a CQ/QP, we can do the following:
209 * - lock cq/qp_table, remove pointer, unlock cq/qp_table lock
210 * - decrement ref count
211 * - wait_event until ref count is zero
212 *
213 * It is the consumer's responsibilty to make sure that no QP
214 * operations (WQE posting or state modification) are pending when the
215 * QP is destroyed. Also, the consumer must make sure that calls to
216 * qp_modify are serialized.
217 *
218 * Possible optimizations (wait for profile data to see if/where we
219 * have locks bouncing between CPUs):
220 * - split cq/qp table lock into n separate (cache-aligned) locks,
221 * indexed (say) by the page in the table
222 */ 185 */
186struct ipath_mmap_info {
187 struct ipath_mmap_info *next;
188 struct ib_ucontext *context;
189 void *obj;
190 struct kref ref;
191 unsigned size;
192 unsigned mmap_cnt;
193};
223 194
195/*
196 * This structure is used to contain the head pointer, tail pointer,
197 * and completion queue entries as a single memory allocation so
198 * it can be mmap'ed into user space.
199 */
200struct ipath_cq_wc {
201 u32 head; /* index of next entry to fill */
202 u32 tail; /* index of next ib_poll_cq() entry */
203 struct ib_wc queue[1]; /* this is actually size ibcq.cqe + 1 */
204};
205
206/*
207 * The completion queue structure.
208 */
224struct ipath_cq { 209struct ipath_cq {
225 struct ib_cq ibcq; 210 struct ib_cq ibcq;
226 struct tasklet_struct comptask; 211 struct tasklet_struct comptask;
227 spinlock_t lock; 212 spinlock_t lock;
228 u8 notify; 213 u8 notify;
229 u8 triggered; 214 u8 triggered;
230 u32 head; /* new records added to the head */ 215 struct ipath_cq_wc *queue;
231 u32 tail; /* poll_cq() reads from here. */ 216 struct ipath_mmap_info *ip;
232 struct ib_wc *queue; /* this is actually ibcq.cqe + 1 */
233}; 217};
234 218
235/* 219/*
@@ -248,28 +232,40 @@ struct ipath_swqe {
248 232
249/* 233/*
250 * Receive work request queue entry. 234 * Receive work request queue entry.
251 * The size of the sg_list is determined when the QP is created and stored 235 * The size of the sg_list is determined when the QP (or SRQ) is created
252 * in qp->r_max_sge. 236 * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
253 */ 237 */
254struct ipath_rwqe { 238struct ipath_rwqe {
255 u64 wr_id; 239 u64 wr_id;
256 u32 length; /* total length of data in sg_list */
257 u8 num_sge; 240 u8 num_sge;
258 struct ipath_sge sg_list[0]; 241 struct ib_sge sg_list[0];
259}; 242};
260 243
261struct ipath_rq { 244/*
262 spinlock_t lock; 245 * This structure is used to contain the head pointer, tail pointer,
246 * and receive work queue entries as a single memory allocation so
247 * it can be mmap'ed into user space.
248 * Note that the wq array elements are variable size so you can't
249 * just index into the array to get the N'th element;
250 * use get_rwqe_ptr() instead.
251 */
252struct ipath_rwq {
263 u32 head; /* new work requests posted to the head */ 253 u32 head; /* new work requests posted to the head */
264 u32 tail; /* receives pull requests from here. */ 254 u32 tail; /* receives pull requests from here. */
255 struct ipath_rwqe wq[0];
256};
257
258struct ipath_rq {
259 struct ipath_rwq *wq;
260 spinlock_t lock;
265 u32 size; /* size of RWQE array */ 261 u32 size; /* size of RWQE array */
266 u8 max_sge; 262 u8 max_sge;
267 struct ipath_rwqe *wq; /* RWQE array */
268}; 263};
269 264
270struct ipath_srq { 265struct ipath_srq {
271 struct ib_srq ibsrq; 266 struct ib_srq ibsrq;
272 struct ipath_rq rq; 267 struct ipath_rq rq;
268 struct ipath_mmap_info *ip;
273 /* send signal when number of RWQEs < limit */ 269 /* send signal when number of RWQEs < limit */
274 u32 limit; 270 u32 limit;
275}; 271};
@@ -293,6 +289,7 @@ struct ipath_qp {
293 atomic_t refcount; 289 atomic_t refcount;
294 wait_queue_head_t wait; 290 wait_queue_head_t wait;
295 struct tasklet_struct s_task; 291 struct tasklet_struct s_task;
292 struct ipath_mmap_info *ip;
296 struct ipath_sge_state *s_cur_sge; 293 struct ipath_sge_state *s_cur_sge;
297 struct ipath_sge_state s_sge; /* current send request data */ 294 struct ipath_sge_state s_sge; /* current send request data */
298 /* current RDMA read send data */ 295 /* current RDMA read send data */
@@ -345,7 +342,8 @@ struct ipath_qp {
345 u32 s_ssn; /* SSN of tail entry */ 342 u32 s_ssn; /* SSN of tail entry */
346 u32 s_lsn; /* limit sequence number (credit) */ 343 u32 s_lsn; /* limit sequence number (credit) */
347 struct ipath_swqe *s_wq; /* send work queue */ 344 struct ipath_swqe *s_wq; /* send work queue */
348 struct ipath_rq r_rq; /* receive work queue */ 345 struct ipath_rq r_rq; /* receive work queue */
346 struct ipath_sge r_sg_list[0]; /* verified SGEs */
349}; 347};
350 348
351/* 349/*
@@ -369,15 +367,15 @@ static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp,
369 367
370/* 368/*
371 * Since struct ipath_rwqe is not a fixed size, we can't simply index into 369 * Since struct ipath_rwqe is not a fixed size, we can't simply index into
372 * struct ipath_rq.wq. This function does the array index computation. 370 * struct ipath_rwq.wq. This function does the array index computation.
373 */ 371 */
374static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq, 372static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq,
375 unsigned n) 373 unsigned n)
376{ 374{
377 return (struct ipath_rwqe *) 375 return (struct ipath_rwqe *)
378 ((char *) rq->wq + 376 ((char *) rq->wq->wq +
379 (sizeof(struct ipath_rwqe) + 377 (sizeof(struct ipath_rwqe) +
380 rq->max_sge * sizeof(struct ipath_sge)) * n); 378 rq->max_sge * sizeof(struct ib_sge)) * n);
381} 379}
382 380
383/* 381/*
@@ -417,6 +415,7 @@ struct ipath_ibdev {
417 struct ib_device ibdev; 415 struct ib_device ibdev;
418 struct list_head dev_list; 416 struct list_head dev_list;
419 struct ipath_devdata *dd; 417 struct ipath_devdata *dd;
418 struct ipath_mmap_info *pending_mmaps;
420 int ib_unit; /* This is the device number */ 419 int ib_unit; /* This is the device number */
421 u16 sm_lid; /* in host order */ 420 u16 sm_lid; /* in host order */
422 u8 sm_sl; 421 u8 sm_sl;
@@ -681,6 +680,10 @@ int ipath_unmap_fmr(struct list_head *fmr_list);
681 680
682int ipath_dealloc_fmr(struct ib_fmr *ibfmr); 681int ipath_dealloc_fmr(struct ib_fmr *ibfmr);
683 682
683void ipath_release_mmap_info(struct kref *ref);
684
685int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
686
684void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev); 687void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev);
685 688
686void ipath_insert_rnr_queue(struct ipath_qp *qp); 689void ipath_insert_rnr_queue(struct ipath_qp *qp);