path: root/drivers/infiniband
author:    CQ Tang <cq.tang@intel.com>    2013-07-19 13:57:21 -0400
committer: Roland Dreier <roland@purestorage.com>    2013-08-13 14:14:34 -0400
commit:    4668e4b527d2263a74a85334243bb740905da51e
tree:      4ef164dc33ae416ab87cf860a7718fdb7ef662c8 /drivers/infiniband
parent:    c095ba7224d8edc71dcef0d655911399a8bd4a3f
IB/qib: Improve SDMA performance
This patch improves user SDMA performance in three ways:

1. The code accepts chunks of messages and splits each chunk into packets
   when converting packets into SDMA queue entries. Adjacent packets share
   user buffer pages where possible, so the same page is not pinned more
   than once.

2. Instead of discarding all the work when the SDMA queue is full, the work
   is saved on a pending queue. Whenever enough SDMA queue entries become
   free, the pending queue is pushed directly onto the SDMA queue.

3. An interrupt handler is used to make progress on this pending queue.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: CQ Tang <cq.tang@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
[ Fixed up sparse warnings.  - Roland ]
Signed-off-by: Roland Dreier <roland@purestorage.com>
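For readers unfamiliar with the pattern in points 2 and 3, the sketch below is a
minimal, stand-alone user-space C illustration of a deferred-submission queue:
work is always accepted onto a pending list, whatever currently fits is pushed to
the descriptor ring, and a completion event drains the pending list again. It is
only a sketch of the idea, not the driver code; all names here (struct ring,
submit, on_completion, ...) are illustrative, and locking, the real descriptor
format, and most error handling are omitted.

    #include <stdio.h>
    #include <stdlib.h>

    struct pkt {
            int ndesc;              /* descriptor slots this packet needs */
            struct pkt *next;
    };

    struct ring {
            int free;               /* free descriptor slots in the hardware ring */
            struct pkt *pending;    /* accepted but not yet on the ring */
            struct pkt **ptail;     /* tail pointer of the pending list */
    };

    /* Push as many pending packets as currently fit on the ring. */
    static void push_pending(struct ring *r)
    {
            while (r->pending && r->pending->ndesc <= r->free) {
                    struct pkt *p = r->pending;

                    r->free -= p->ndesc;
                    r->pending = p->next;
                    if (!r->pending)
                            r->ptail = &r->pending;
                    printf("pushed pkt needing %d descriptors (%d slots left)\n",
                           p->ndesc, r->free);
                    free(p);
            }
    }

    /* Accept work unconditionally: append it to the pending list and then
     * push whatever fits, instead of failing when the ring is full. */
    static void submit(struct ring *r, int ndesc)
    {
            struct pkt *p = malloc(sizeof(*p));

            if (!p)
                    return;
            p->ndesc = ndesc;
            p->next = NULL;
            *r->ptail = p;
            r->ptail = &p->next;
            push_pending(r);
    }

    /* What a completion interrupt would do: reclaim descriptor slots and
     * drain the pending list again. */
    static void on_completion(struct ring *r, int reclaimed)
    {
            r->free += reclaimed;
            push_pending(r);
    }

    int main(void)
    {
            struct ring r = { .free = 4, .pending = NULL };

            r.ptail = &r.pending;
            submit(&r, 3);          /* fits immediately */
            submit(&r, 3);          /* ring too full: stays pending */
            on_completion(&r, 3);   /* slots reclaimed: pending packet goes out */
            return 0;
    }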
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--  drivers/infiniband/hw/qib/qib.h            |   4
-rw-r--r--  drivers/infiniband/hw/qib/qib_common.h     |  32
-rw-r--r--  drivers/infiniband/hw/qib/qib_file_ops.c   |   2
-rw-r--r--  drivers/infiniband/hw/qib/qib_sdma.c       |   8
-rw-r--r--  drivers/infiniband/hw/qib/qib_user_sdma.c  | 909
5 files changed, 728 insertions, 227 deletions
diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h
index 4a9af795b88f..ae3e4feca718 100644
--- a/drivers/infiniband/hw/qib/qib.h
+++ b/drivers/infiniband/hw/qib/qib.h
@@ -576,11 +576,13 @@ struct qib_pportdata {
         /* read/write using lock */
         spinlock_t sdma_lock ____cacheline_aligned_in_smp;
         struct list_head sdma_activelist;
+        struct list_head sdma_userpending;
         u64 sdma_descq_added;
         u64 sdma_descq_removed;
         u16 sdma_descq_tail;
         u16 sdma_descq_head;
         u8 sdma_generation;
+        u8 sdma_intrequest;
 
         struct tasklet_struct sdma_sw_clean_up_task
                 ____cacheline_aligned_in_smp;
@@ -1326,6 +1328,8 @@ int qib_setup_sdma(struct qib_pportdata *);
 void qib_teardown_sdma(struct qib_pportdata *);
 void __qib_sdma_intr(struct qib_pportdata *);
 void qib_sdma_intr(struct qib_pportdata *);
+void qib_user_sdma_send_desc(struct qib_pportdata *dd,
+                             struct list_head *pktlist);
 int qib_sdma_verbs_send(struct qib_pportdata *, struct qib_sge_state *,
                         u32, struct qib_verbs_txreq *);
 /* ppd->sdma_lock should be locked before calling this. */
diff --git a/drivers/infiniband/hw/qib/qib_common.h b/drivers/infiniband/hw/qib/qib_common.h
index 4f255b723ffd..5670ace27c63 100644
--- a/drivers/infiniband/hw/qib/qib_common.h
+++ b/drivers/infiniband/hw/qib/qib_common.h
@@ -279,7 +279,7 @@ struct qib_base_info {
  * may not be implemented; the user code must deal with this if it
  * cares, or it must abort after initialization reports the difference.
  */
-#define QIB_USER_SWMINOR 12
+#define QIB_USER_SWMINOR 13
 
 #define QIB_USER_SWVERSION ((QIB_USER_SWMAJOR << 16) | QIB_USER_SWMINOR)
 
@@ -701,7 +701,37 @@ struct qib_message_header {
         __be32 bth[3];
         /* fields below this point are in host byte order */
         struct qib_header iph;
+        /* fields below are simplified, but should match PSM */
+        /* some are accessed by driver when packet spliting is needed */
         __u8 sub_opcode;
+        __u8 flags;
+        __u16 commidx;
+        __u32 ack_seq_num;
+        __u8 flowid;
+        __u8 hdr_dlen;
+        __u16 mqhdr;
+        __u32 uwords[4];
+};
+
+/* sequence number bits for message */
+union qib_seqnum {
+        struct {
+                __u32 seq:11;
+                __u32 gen:8;
+                __u32 flow:5;
+        };
+        struct {
+                __u32 pkt:16;
+                __u32 msg:8;
+        };
+        __u32 val;
+};
+
+/* qib receiving-dma tid-session-member */
+struct qib_tid_session_member {
+        __u16 tid;
+        __u16 offset;
+        __u16 length;
 };
 
 /* IB - LRH header consts */
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index b51a51486cb8..275f247f9fca 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -1220,7 +1220,7 @@ static int qib_compatible_subctxts(int user_swmajor, int user_swminor)
                 return user_swminor == 3;
         default:
                 /* >= 4 are compatible (or are expected to be) */
-                return user_swminor >= 4;
+                return user_swminor <= QIB_USER_SWMINOR;
         }
 }
 /* make no promises yet for future major versions */
diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c
index 9b5322d8cd5a..c6d6a54d2e19 100644
--- a/drivers/infiniband/hw/qib/qib_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_sdma.c
@@ -423,8 +423,11 @@ void qib_sdma_intr(struct qib_pportdata *ppd)
 
 void __qib_sdma_intr(struct qib_pportdata *ppd)
 {
-        if (__qib_sdma_running(ppd))
+        if (__qib_sdma_running(ppd)) {
                 qib_sdma_make_progress(ppd);
+                if (!list_empty(&ppd->sdma_userpending))
+                        qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending);
+        }
 }
 
 int qib_setup_sdma(struct qib_pportdata *ppd)
@@ -452,6 +455,9 @@ int qib_setup_sdma(struct qib_pportdata *ppd)
         ppd->sdma_descq_removed = 0;
         ppd->sdma_descq_added = 0;
 
+        ppd->sdma_intrequest = 0;
+        INIT_LIST_HEAD(&ppd->sdma_userpending);
+
         INIT_LIST_HEAD(&ppd->sdma_activelist);
 
         tasklet_init(&ppd->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c
index 82442085cbe6..d0a0ea0c14d6 100644
--- a/drivers/infiniband/hw/qib/qib_user_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
@@ -53,20 +53,36 @@
 #define QIB_USER_SDMA_DRAIN_TIMEOUT 500
 
 struct qib_user_sdma_pkt {
-        u8 naddr;               /* dimension of addr (1..3) ... */
+        struct list_head list;  /* list element */
+
+        u8  tiddma;             /* if this is NEW tid-sdma */
+        u8  largepkt;           /* this is large pkt from kmalloc */
+        u16 frag_size;          /* frag size used by PSM */
+        u16 index;              /* last header index or push index */
+        u16 naddr;              /* dimension of addr (1..3) ... */
+        u16 addrlimit;          /* addr array size */
+        u16 tidsmidx;           /* current tidsm index */
+        u16 tidsmcount;         /* tidsm array item count */
+        u16 payload_size;       /* payload size so far for header */
+        u32 bytes_togo;         /* bytes for processing */
         u32 counter;            /* sdma pkts queued counter for this entry */
+        struct qib_tid_session_member *tidsm;   /* tid session member array */
+        struct qib_user_sdma_queue *pq;         /* which pq this pkt belongs to */
         u64 added;              /* global descq number of entries */
 
         struct {
-                u32 offset;             /* offset for kvaddr, addr */
-                u32 length;             /* length in page */
-                u8  put_page;           /* should we put_page? */
-                u8  dma_mapped;         /* is page dma_mapped? */
+                u16 offset;             /* offset for kvaddr, addr */
+                u16 length;             /* length in page */
+                u16 first_desc;         /* first desc */
+                u16 last_desc;          /* last desc */
+                u16 put_page;           /* should we put_page? */
+                u16 dma_mapped;         /* is page dma_mapped? */
+                u16 dma_length;         /* for dma_unmap_page() */
+                u16 padding;
                 struct page *page;      /* may be NULL (coherent mem) */
                 void *kvaddr;           /* FIXME: only for pio hack */
                 dma_addr_t addr;
         } addr[4];              /* max pages, any more and we coalesce */
-        struct list_head list;  /* list element */
 };
 
 struct qib_user_sdma_queue {
@@ -77,6 +93,12 @@ struct qib_user_sdma_queue {
          */
         struct list_head sent;
 
+        /*
+         * Because above list will be accessed by both process and
+         * signal handler, we need a spinlock for it.
+         */
+        spinlock_t sent_lock ____cacheline_aligned_in_smp;
+
         /* headers with expected length are allocated from here... */
         char header_cache_name[64];
         struct dma_pool *header_cache;
@@ -88,6 +110,12 @@ struct qib_user_sdma_queue {
         /* as packets go on the queued queue, they are counted... */
         u32 counter;
         u32 sent_counter;
+        /* pending packets, not sending yet */
+        u32 num_pending;
+        /* sending packets, not complete yet */
+        u32 num_sending;
+        /* global descq number of entry of last sending packet */
+        u64 added;
 
         /* dma page table */
         struct rb_root dma_pages_root;
@@ -107,8 +135,12 @@ qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt)
 
         pq->counter = 0;
         pq->sent_counter = 0;
-        INIT_LIST_HEAD(&pq->sent);
+        pq->num_pending = 0;
+        pq->num_sending = 0;
+        pq->added = 0;
 
+        INIT_LIST_HEAD(&pq->sent);
+        spin_lock_init(&pq->sent_lock);
         mutex_init(&pq->lock);
 
         snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name),
@@ -144,34 +176,310 @@ done:
 }
 
 static void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt,
-                                    int i, size_t offset, size_t len,
-                                    int put_page, int dma_mapped,
-                                    struct page *page,
-                                    void *kvaddr, dma_addr_t dma_addr)
+                                    int i, u16 offset, u16 len,
+                                    u16 first_desc, u16 last_desc,
+                                    u16 put_page, u16 dma_mapped,
+                                    struct page *page, void *kvaddr,
+                                    dma_addr_t dma_addr, u16 dma_length)
 {
         pkt->addr[i].offset = offset;
         pkt->addr[i].length = len;
+        pkt->addr[i].first_desc = first_desc;
+        pkt->addr[i].last_desc = last_desc;
         pkt->addr[i].put_page = put_page;
         pkt->addr[i].dma_mapped = dma_mapped;
         pkt->addr[i].page = page;
         pkt->addr[i].kvaddr = kvaddr;
         pkt->addr[i].addr = dma_addr;
+        pkt->addr[i].dma_length = dma_length;
 }
 
-static void qib_user_sdma_init_header(struct qib_user_sdma_pkt *pkt,
-                                      u32 counter, size_t offset,
-                                      size_t len, int dma_mapped,
-                                      struct page *page,
-                                      void *kvaddr, dma_addr_t dma_addr)
+static void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq,
+                                        size_t len, dma_addr_t *dma_addr)
 {
-        pkt->naddr = 1;
-        pkt->counter = counter;
-        qib_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page,
-                                kvaddr, dma_addr);
+        void *hdr;
+
+        if (len == QIB_USER_SDMA_EXP_HEADER_LENGTH)
+                hdr = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
+                                     dma_addr);
+        else
+                hdr = NULL;
+
+        if (!hdr) {
+                hdr = kmalloc(len, GFP_KERNEL);
+                if (!hdr)
+                        return NULL;
+
+                *dma_addr = 0;
+        }
+
+        return hdr;
+}
+
+static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd,
+                                       struct qib_user_sdma_queue *pq,
+                                       struct qib_user_sdma_pkt *pkt,
+                                       struct page *page, u16 put,
+                                       u16 offset, u16 len, void *kvaddr)
+{
+        __le16 *pbc16;
+        void *pbcvaddr;
+        struct qib_message_header *hdr;
+        u16 newlen, pbclen, lastdesc, dma_mapped;
+        u32 vcto;
+        union qib_seqnum seqnum;
+        dma_addr_t pbcdaddr;
+        dma_addr_t dma_addr =
+                dma_map_page(&dd->pcidev->dev,
+                             page, offset, len, DMA_TO_DEVICE);
+        int ret = 0;
+
+        if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+                /*
+                 * dma mapping error, pkt has not managed
+                 * this page yet, return the page here so
+                 * the caller can ignore this page.
+                 */
+                if (put) {
+                        put_page(page);
+                } else {
+                        /* coalesce case */
+                        kunmap(page);
+                        __free_page(page);
+                }
+                ret = -ENOMEM;
+                goto done;
+        }
+        offset = 0;
+        dma_mapped = 1;
+
+
+next_fragment:
+
+        /*
+         * In tid-sdma, the transfer length is restricted by
+         * receiver side current tid page length.
+         */
+        if (pkt->tiddma && len > pkt->tidsm[pkt->tidsmidx].length)
+                newlen = pkt->tidsm[pkt->tidsmidx].length;
+        else
+                newlen = len;
+
+        /*
+         * Then the transfer length is restricted by MTU.
+         * the last descriptor flag is determined by:
+         * 1. the current packet is at frag size length.
+         * 2. the current tid page is done if tid-sdma.
+         * 3. there is no more byte togo if sdma.
+         */
+        lastdesc = 0;
+        if ((pkt->payload_size + newlen) >= pkt->frag_size) {
+                newlen = pkt->frag_size - pkt->payload_size;
+                lastdesc = 1;
+        } else if (pkt->tiddma) {
+                if (newlen == pkt->tidsm[pkt->tidsmidx].length)
+                        lastdesc = 1;
+        } else {
+                if (newlen == pkt->bytes_togo)
+                        lastdesc = 1;
+        }
+
+        /* fill the next fragment in this page */
+        qib_user_sdma_init_frag(pkt, pkt->naddr,        /* index */
+                offset, newlen,         /* offset, len */
+                0, lastdesc,            /* first last desc */
+                put, dma_mapped,        /* put page, dma mapped */
+                page, kvaddr,           /* struct page, virt addr */
+                dma_addr, len);         /* dma addr, dma length */
+        pkt->bytes_togo -= newlen;
+        pkt->payload_size += newlen;
+        pkt->naddr++;
+        if (pkt->naddr == pkt->addrlimit) {
+                ret = -EFAULT;
+                goto done;
+        }
+
+        /* If there is no more byte togo. (lastdesc==1) */
+        if (pkt->bytes_togo == 0) {
+                /* The packet is done, header is not dma mapped yet.
+                 * it should be from kmalloc */
+                if (!pkt->addr[pkt->index].addr) {
+                        pkt->addr[pkt->index].addr =
+                                dma_map_single(&dd->pcidev->dev,
+                                        pkt->addr[pkt->index].kvaddr,
+                                        pkt->addr[pkt->index].dma_length,
+                                        DMA_TO_DEVICE);
+                        if (dma_mapping_error(&dd->pcidev->dev,
+                                        pkt->addr[pkt->index].addr)) {
+                                ret = -ENOMEM;
+                                goto done;
+                        }
+                        pkt->addr[pkt->index].dma_mapped = 1;
+                }
+
+                goto done;
+        }
+
+        /* If tid-sdma, advance tid info. */
+        if (pkt->tiddma) {
+                pkt->tidsm[pkt->tidsmidx].length -= newlen;
+                if (pkt->tidsm[pkt->tidsmidx].length) {
+                        pkt->tidsm[pkt->tidsmidx].offset += newlen;
+                } else {
+                        pkt->tidsmidx++;
+                        if (pkt->tidsmidx == pkt->tidsmcount) {
+                                ret = -EFAULT;
+                                goto done;
+                        }
+                }
+        }
+
+        /*
+         * If this is NOT the last descriptor. (newlen==len)
+         * the current packet is not done yet, but the current
+         * send side page is done.
+         */
+        if (lastdesc == 0)
+                goto done;
+
+        /*
+         * If running this driver under PSM with message size
+         * fitting into one transfer unit, it is not possible
+         * to pass this line. otherwise, it is a buggggg.
+         */
+
+        /*
+         * Since the current packet is done, and there are more
+         * bytes togo, we need to create a new sdma header, copying
+         * from previous sdma header and modify both.
+         */
+        pbclen = pkt->addr[pkt->index].length;
+        pbcvaddr = qib_user_sdma_alloc_header(pq, pbclen, &pbcdaddr);
+        if (!pbcvaddr) {
+                ret = -ENOMEM;
+                goto done;
+        }
+        /* Copy the previous sdma header to new sdma header */
+        pbc16 = (__le16 *)pkt->addr[pkt->index].kvaddr;
+        memcpy(pbcvaddr, pbc16, pbclen);
+
+        /* Modify the previous sdma header */
+        hdr = (struct qib_message_header *)&pbc16[4];
+
+        /* New pbc length */
+        pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->bytes_togo>>2));
+
+        /* New packet length */
+        hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0]));
+
+        if (pkt->tiddma) {
+                /* turn on the header suppression */
+                hdr->iph.pkt_flags =
+                        cpu_to_le16(le16_to_cpu(hdr->iph.pkt_flags)|0x2);
+                /* turn off ACK_REQ: 0x04 and EXPECTED_DONE: 0x20 */
+                hdr->flags &= ~(0x04|0x20);
+        } else {
+                /* turn off extra bytes: 20-21 bits */
+                hdr->bth[0] = cpu_to_be32(be32_to_cpu(hdr->bth[0])&0xFFCFFFFF);
+                /* turn off ACK_REQ: 0x04 */
+                hdr->flags &= ~(0x04);
+        }
+
+        /* New kdeth checksum */
+        vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset);
+        hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH +
+                be16_to_cpu(hdr->lrh[2]) -
+                ((vcto>>16)&0xFFFF) - (vcto&0xFFFF) -
+                le16_to_cpu(hdr->iph.pkt_flags));
+
+        /* The packet is done, header is not dma mapped yet.
+         * it should be from kmalloc */
+        if (!pkt->addr[pkt->index].addr) {
+                pkt->addr[pkt->index].addr =
+                        dma_map_single(&dd->pcidev->dev,
+                                pkt->addr[pkt->index].kvaddr,
+                                pkt->addr[pkt->index].dma_length,
+                                DMA_TO_DEVICE);
+                if (dma_mapping_error(&dd->pcidev->dev,
+                                pkt->addr[pkt->index].addr)) {
+                        ret = -ENOMEM;
+                        goto done;
+                }
+                pkt->addr[pkt->index].dma_mapped = 1;
+        }
+
+        /* Modify the new sdma header */
+        pbc16 = (__le16 *)pbcvaddr;
+        hdr = (struct qib_message_header *)&pbc16[4];
+
+        /* New pbc length */
+        pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->payload_size>>2));
+
+        /* New packet length */
+        hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0]));
+
+        if (pkt->tiddma) {
+                /* Set new tid and offset for new sdma header */
+                hdr->iph.ver_ctxt_tid_offset = cpu_to_le32(
+                        (le32_to_cpu(hdr->iph.ver_ctxt_tid_offset)&0xFF000000) +
+                        (pkt->tidsm[pkt->tidsmidx].tid<<QLOGIC_IB_I_TID_SHIFT) +
+                        (pkt->tidsm[pkt->tidsmidx].offset>>2));
+        } else {
+                /* Middle protocol new packet offset */
+                hdr->uwords[2] += pkt->payload_size;
+        }
+
+        /* New kdeth checksum */
+        vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset);
+        hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH +
+                be16_to_cpu(hdr->lrh[2]) -
+                ((vcto>>16)&0xFFFF) - (vcto&0xFFFF) -
+                le16_to_cpu(hdr->iph.pkt_flags));
+
+        /* Next sequence number in new sdma header */
+        seqnum.val = be32_to_cpu(hdr->bth[2]);
+        if (pkt->tiddma)
+                seqnum.seq++;
+        else
+                seqnum.pkt++;
+        hdr->bth[2] = cpu_to_be32(seqnum.val);
+
+        /* Init new sdma header. */
+        qib_user_sdma_init_frag(pkt, pkt->naddr,        /* index */
+                0, pbclen,              /* offset, len */
+                1, 0,                   /* first last desc */
+                0, 0,                   /* put page, dma mapped */
+                NULL, pbcvaddr,         /* struct page, virt addr */
+                pbcdaddr, pbclen);      /* dma addr, dma length */
+        pkt->index = pkt->naddr;
+        pkt->payload_size = 0;
+        pkt->naddr++;
+        if (pkt->naddr == pkt->addrlimit) {
+                ret = -EFAULT;
+                goto done;
+        }
+
+        /* Prepare for next fragment in this page */
+        if (newlen != len) {
+                if (dma_mapped) {
+                        put = 0;
+                        dma_mapped = 0;
+                        page = NULL;
+                        kvaddr = NULL;
+                }
+                len -= newlen;
+                offset += newlen;
+
+                goto next_fragment;
+        }
+
+done:
+        return ret;
 }
 
 /* we've too many pages in the iovec, coalesce to a single page */
 static int qib_user_sdma_coalesce(const struct qib_devdata *dd,
+                                  struct qib_user_sdma_queue *pq,
                                   struct qib_user_sdma_pkt *pkt,
                                   const struct iovec *iov,
                                   unsigned long niov)
@@ -182,7 +490,6 @@ static int qib_user_sdma_coalesce(const struct qib_devdata *dd,
         char *mpage;
         int i;
         int len = 0;
-        dma_addr_t dma_addr;
 
         if (!page) {
                 ret = -ENOMEM;
@@ -205,17 +512,8 @@ static int qib_user_sdma_coalesce(const struct qib_devdata *dd,
                 len += iov[i].iov_len;
         }
 
-        dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len,
-                                DMA_TO_DEVICE);
-        if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
-                ret = -ENOMEM;
-                goto free_unmap;
-        }
-
-        qib_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save,
-                                dma_addr);
-        pkt->naddr = 2;
-
+        ret = qib_user_sdma_page_to_frags(dd, pq, pkt,
+                        page, 0, 0, len, mpage_save);
         goto done;
 
 free_unmap:
@@ -238,16 +536,6 @@ static int qib_user_sdma_num_pages(const struct iovec *iov)
         return 1 + ((epage - spage) >> PAGE_SHIFT);
 }
 
-/*
- * Truncate length to page boundary.
- */
-static int qib_user_sdma_page_length(unsigned long addr, unsigned long len)
-{
-        const unsigned long offset = addr & ~PAGE_MASK;
-
-        return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len;
-}
-
 static void qib_user_sdma_free_pkt_frag(struct device *dev,
                                         struct qib_user_sdma_queue *pq,
                                         struct qib_user_sdma_pkt *pkt,
@@ -256,10 +544,11 @@ static void qib_user_sdma_free_pkt_frag(struct device *dev,
         const int i = frag;
 
         if (pkt->addr[i].page) {
+                /* only user data has page */
                 if (pkt->addr[i].dma_mapped)
                         dma_unmap_page(dev,
                                        pkt->addr[i].addr,
-                                       pkt->addr[i].length,
+                                       pkt->addr[i].dma_length,
                                        DMA_TO_DEVICE);
 
                 if (pkt->addr[i].kvaddr)
@@ -269,55 +558,81 @@ static void qib_user_sdma_free_pkt_frag(struct device *dev,
                         put_page(pkt->addr[i].page);
                 else
                         __free_page(pkt->addr[i].page);
-        } else if (pkt->addr[i].kvaddr)
-                /* free coherent mem from cache... */
-                dma_pool_free(pq->header_cache,
-                              pkt->addr[i].kvaddr, pkt->addr[i].addr);
+        } else if (pkt->addr[i].kvaddr) {
+                /* for headers */
+                if (pkt->addr[i].dma_mapped) {
+                        /* from kmalloc & dma mapped */
+                        dma_unmap_single(dev,
+                                         pkt->addr[i].addr,
+                                         pkt->addr[i].dma_length,
+                                         DMA_TO_DEVICE);
+                        kfree(pkt->addr[i].kvaddr);
+                } else if (pkt->addr[i].addr) {
+                        /* free coherent mem from cache... */
+                        dma_pool_free(pq->header_cache,
+                                      pkt->addr[i].kvaddr, pkt->addr[i].addr);
+                } else {
+                        /* from kmalloc but not dma mapped */
+                        kfree(pkt->addr[i].kvaddr);
+                }
+        }
 }
 
 /* return number of pages pinned... */
 static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
+                                   struct qib_user_sdma_queue *pq,
                                    struct qib_user_sdma_pkt *pkt,
                                    unsigned long addr, int tlen, int npages)
 {
-        struct page *pages[2];
-        int j;
-        int ret;
-
-        ret = get_user_pages(current, current->mm, addr,
-                             npages, 0, 1, pages, NULL);
-
-        if (ret != npages) {
-                int i;
-
-                for (i = 0; i < ret; i++)
-                        put_page(pages[i]);
-
-                ret = -ENOMEM;
-                goto done;
-        }
+        struct page *pages[8];
+        int i, j;
+        int ret = 0;
 
-        for (j = 0; j < npages; j++) {
-                /* map the pages... */
-                const int flen = qib_user_sdma_page_length(addr, tlen);
-                dma_addr_t dma_addr =
-                        dma_map_page(&dd->pcidev->dev,
-                                     pages[j], 0, flen, DMA_TO_DEVICE);
-                unsigned long fofs = addr & ~PAGE_MASK;
+        while (npages) {
+                if (npages > 8)
+                        j = 8;
+                else
+                        j = npages;
 
-                if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+                ret = get_user_pages(current, current->mm, addr,
+                                     j, 0, 1, pages, NULL);
+                if (ret != j) {
+                        i = 0;
+                        j = ret;
                         ret = -ENOMEM;
-                        goto done;
+                        goto free_pages;
                 }
 
-                qib_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1,
-                                        pages[j], kmap(pages[j]), dma_addr);
+                for (i = 0; i < j; i++) {
+                        /* map the pages... */
+                        unsigned long fofs = addr & ~PAGE_MASK;
+                        int flen = ((fofs + tlen) > PAGE_SIZE) ?
+                                (PAGE_SIZE - fofs) : tlen;
+
+                        ret = qib_user_sdma_page_to_frags(dd, pq, pkt,
+                                pages[i], 1, fofs, flen, NULL);
+                        if (ret < 0) {
+                                /* current page has beed taken
+                                 * care of inside above call.
+                                 */
+                                i++;
+                                goto free_pages;
+                        }
 
-                pkt->naddr++;
-                addr += flen;
-                tlen -= flen;
+                        addr += flen;
+                        tlen -= flen;
+                }
+
+                npages -= j;
         }
 
+        goto done;
+
+        /* if error, return all pages not managed by pkt */
+free_pages:
+        while (i < j)
+                put_page(pages[i++]);
+
 done:
         return ret;
 }
@@ -335,7 +650,7 @@ static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd,
                 const int npages = qib_user_sdma_num_pages(iov + idx);
                 const unsigned long addr = (unsigned long) iov[idx].iov_base;
 
-                ret = qib_user_sdma_pin_pages(dd, pkt, addr,
+                ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr,
                                               iov[idx].iov_len, npages);
                 if (ret < 0)
                         goto free_pkt;
@@ -344,9 +659,22 @@ static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd,
         goto done;
 
 free_pkt:
-        for (idx = 0; idx < pkt->naddr; idx++)
+        /* we need to ignore the first entry here */
+        for (idx = 1; idx < pkt->naddr; idx++)
                 qib_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx);
 
+        /* need to dma unmap the first entry, this is to restore to
+         * the original state so that caller can free the memory in
+         * error condition. Caller does not know if dma mapped or not*/
+        if (pkt->addr[0].dma_mapped) {
+                dma_unmap_single(&dd->pcidev->dev,
+                        pkt->addr[0].addr,
+                        pkt->addr[0].dma_length,
+                        DMA_TO_DEVICE);
+                pkt->addr[0].addr = 0;
+                pkt->addr[0].dma_mapped = 0;
+        }
+
 done:
         return ret;
 }
@@ -359,8 +687,9 @@ static int qib_user_sdma_init_payload(const struct qib_devdata *dd,
 {
         int ret = 0;
 
-        if (npages >= ARRAY_SIZE(pkt->addr))
-                ret = qib_user_sdma_coalesce(dd, pkt, iov, niov);
+        if (pkt->frag_size == pkt->bytes_togo &&
+                        npages >= ARRAY_SIZE(pkt->addr))
+                ret = qib_user_sdma_coalesce(dd, pq, pkt, iov, niov);
         else
                 ret = qib_user_sdma_pin_pkt(dd, pq, pkt, iov, niov);
 
@@ -380,7 +709,10 @@ static void qib_user_sdma_free_pkt_list(struct device *dev,
                 for (i = 0; i < pkt->naddr; i++)
                         qib_user_sdma_free_pkt_frag(dev, pq, pkt, i);
 
-                kmem_cache_free(pq->pkt_slab, pkt);
+                if (pkt->largepkt)
+                        kfree(pkt);
+                else
+                        kmem_cache_free(pq->pkt_slab, pkt);
         }
         INIT_LIST_HEAD(list);
 }
@@ -393,63 +725,48 @@ static void qib_user_sdma_free_pkt_list(struct device *dev,
  * as, if there is an error we clean it...
  */
 static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
+                                    struct qib_pportdata *ppd,
                                     struct qib_user_sdma_queue *pq,
-                                    struct list_head *list,
                                     const struct iovec *iov,
                                     unsigned long niov,
-                                    int maxpkts)
+                                    struct list_head *list,
+                                    int *maxpkts, int *ndesc)
 {
         unsigned long idx = 0;
         int ret = 0;
         int npkts = 0;
-        struct page *page = NULL;
         __le32 *pbc;
         dma_addr_t dma_addr;
         struct qib_user_sdma_pkt *pkt = NULL;
         size_t len;
         size_t nw;
         u32 counter = pq->counter;
-        int dma_mapped = 0;
+        u16 frag_size;
 
-        while (idx < niov && npkts < maxpkts) {
+        while (idx < niov && npkts < *maxpkts) {
                 const unsigned long addr = (unsigned long) iov[idx].iov_base;
                 const unsigned long idx_save = idx;
                 unsigned pktnw;
                 unsigned pktnwc;
                 int nfrags = 0;
                 int npages = 0;
+                int bytes_togo = 0;
+                int tiddma = 0;
                 int cfur;
 
-                dma_mapped = 0;
                 len = iov[idx].iov_len;
                 nw = len >> 2;
-                page = NULL;
-
-                pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
-                if (!pkt) {
-                        ret = -ENOMEM;
-                        goto free_list;
-                }
 
                 if (len < QIB_USER_SDMA_MIN_HEADER_LENGTH ||
                     len > PAGE_SIZE || len & 3 || addr & 3) {
                         ret = -EINVAL;
-                        goto free_pkt;
+                        goto free_list;
                 }
 
-                if (len == QIB_USER_SDMA_EXP_HEADER_LENGTH)
-                        pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
-                                             &dma_addr);
-                else
-                        pbc = NULL;
-
+                pbc = qib_user_sdma_alloc_header(pq, len, &dma_addr);
                 if (!pbc) {
-                        page = alloc_page(GFP_KERNEL);
-                        if (!page) {
-                                ret = -ENOMEM;
-                                goto free_pkt;
-                        }
-                        pbc = kmap(page);
+                        ret = -ENOMEM;
+                        goto free_list;
                 }
 
                 cfur = copy_from_user(pbc, iov[idx].iov_base, len);
@@ -474,8 +791,8 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
                  * we can verify that the packet is consistent with the
                  * iovec lengths.
                  */
-                pktnw = le32_to_cpu(*pbc) & QIB_PBC_LENGTH_MASK;
-                if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) {
+                pktnw = le32_to_cpu(*pbc) & 0xFFFF;
+                if (pktnw < pktnwc) {
                         ret = -EINVAL;
                         goto free_pbc;
                 }
@@ -486,17 +803,14 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
                         const unsigned long faddr =
                                 (unsigned long) iov[idx].iov_base;
 
-                        if (slen & 3 || faddr & 3 || !slen ||
-                            slen > PAGE_SIZE) {
+                        if (slen & 3 || faddr & 3 || !slen) {
                                 ret = -EINVAL;
                                 goto free_pbc;
                         }
 
-                        npages++;
-                        if ((faddr & PAGE_MASK) !=
-                            ((faddr + slen - 1) & PAGE_MASK))
-                                npages++;
+                        npages += qib_user_sdma_num_pages(&iov[idx]);
 
+                        bytes_togo += slen;
                         pktnwc += slen >> 2;
                         idx++;
                         nfrags++;
@@ -507,48 +821,139 @@ static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
                         goto free_pbc;
                 }
 
-                if (page) {
-                        dma_addr = dma_map_page(&dd->pcidev->dev,
-                                                page, 0, len, DMA_TO_DEVICE);
-                        if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+                frag_size = ((le32_to_cpu(*pbc))>>16) & 0xFFFF;
+                if (((frag_size ? frag_size : bytes_togo) + len) >
+                                ppd->ibmaxlen) {
+                        ret = -EINVAL;
+                        goto free_pbc;
+                }
+
+                if (frag_size) {
+                        int pktsize, tidsmsize, n;
+
+                        n = npages*((2*PAGE_SIZE/frag_size)+1);
+                        pktsize = sizeof(*pkt) + sizeof(pkt->addr[0])*n;
+
+                        /*
+                         * Determine if this is tid-sdma or just sdma.
+                         */
+                        tiddma = (((le32_to_cpu(pbc[7])>>
+                                QLOGIC_IB_I_TID_SHIFT)&
+                                QLOGIC_IB_I_TID_MASK) !=
+                                QLOGIC_IB_I_TID_MASK);
+
+                        if (tiddma)
+                                tidsmsize = iov[idx].iov_len;
+                        else
+                                tidsmsize = 0;
+
+                        pkt = kmalloc(pktsize+tidsmsize, GFP_KERNEL);
+                        if (!pkt) {
                                 ret = -ENOMEM;
                                 goto free_pbc;
                         }
+                        pkt->largepkt = 1;
+                        pkt->frag_size = frag_size;
+                        pkt->addrlimit = n + ARRAY_SIZE(pkt->addr);
+
+                        if (tiddma) {
+                                char *tidsm = (char *)pkt + pktsize;
+                                cfur = copy_from_user(tidsm,
+                                        iov[idx].iov_base, tidsmsize);
+                                if (cfur) {
+                                        ret = -EFAULT;
+                                        goto free_pkt;
+                                }
+                                pkt->tidsm =
+                                        (struct qib_tid_session_member *)tidsm;
+                                pkt->tidsmcount = tidsmsize/
+                                        sizeof(struct qib_tid_session_member);
+                                pkt->tidsmidx = 0;
+                                idx++;
+                        }
 
-                        dma_mapped = 1;
+                        /*
+                         * pbc 'fill1' field is borrowed to pass frag size,
+                         * we need to clear it after picking frag size, the
+                         * hardware requires this field to be zero.
+                         */
+                        *pbc = cpu_to_le32(le32_to_cpu(*pbc) & 0x0000FFFF);
+                } else {
+                        pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
+                        if (!pkt) {
+                                ret = -ENOMEM;
+                                goto free_pbc;
+                        }
+                        pkt->largepkt = 0;
+                        pkt->frag_size = bytes_togo;
+                        pkt->addrlimit = ARRAY_SIZE(pkt->addr);
                 }
-
-                qib_user_sdma_init_header(pkt, counter, 0, len, dma_mapped,
-                                          page, pbc, dma_addr);
+                pkt->bytes_togo = bytes_togo;
+                pkt->payload_size = 0;
+                pkt->counter = counter;
+                pkt->tiddma = tiddma;
+
+                /* setup the first header */
+                qib_user_sdma_init_frag(pkt, 0,         /* index */
+                        0, len,         /* offset, len */
+                        1, 0,           /* first last desc */
+                        0, 0,           /* put page, dma mapped */
+                        NULL, pbc,      /* struct page, virt addr */
+                        dma_addr, len); /* dma addr, dma length */
+                pkt->index = 0;
+                pkt->naddr = 1;
 
                 if (nfrags) {
                         ret = qib_user_sdma_init_payload(dd, pq, pkt,
                                                          iov + idx_save + 1,
                                                          nfrags, npages);
                         if (ret < 0)
-                                goto free_pbc_dma;
+                                goto free_pkt;
+                } else {
+                        /* since there is no payload, mark the
+                         * header as the last desc. */
+                        pkt->addr[0].last_desc = 1;
+
+                        if (dma_addr == 0) {
+                                /*
+                                 * the header is not dma mapped yet.
+                                 * it should be from kmalloc.
+                                 */
+                                dma_addr = dma_map_single(&dd->pcidev->dev,
+                                        pbc, len, DMA_TO_DEVICE);
+                                if (dma_mapping_error(&dd->pcidev->dev,
+                                                dma_addr)) {
+                                        ret = -ENOMEM;
+                                        goto free_pkt;
+                                }
+                                pkt->addr[0].addr = dma_addr;
+                                pkt->addr[0].dma_mapped = 1;
+                        }
                 }
 
                 counter++;
                 npkts++;
+                pkt->pq = pq;
+                pkt->index = 0; /* reset index for push on hw */
+                *ndesc += pkt->naddr;
 
                 list_add_tail(&pkt->list, list);
         }
 
+        *maxpkts = npkts;
         ret = idx;
         goto done;
 
-free_pbc_dma:
-        if (dma_mapped)
-                dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE);
+free_pkt:
+        if (pkt->largepkt)
+                kfree(pkt);
+        else
+                kmem_cache_free(pq->pkt_slab, pkt);
 free_pbc:
-        if (page) {
-                kunmap(page);
-                __free_page(page);
-        } else
+        if (dma_addr)
                 dma_pool_free(pq->header_cache, pbc, dma_addr);
-free_pkt:
-        kmem_cache_free(pq->pkt_slab, pkt);
+        else
+                kfree(pbc);
 free_list:
         qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list);
 done:
@@ -569,10 +974,20 @@ static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd,
         struct list_head free_list;
         struct qib_user_sdma_pkt *pkt;
         struct qib_user_sdma_pkt *pkt_prev;
+        unsigned long flags;
         int ret = 0;
 
+        if (!pq->num_sending)
+                return 0;
+
         INIT_LIST_HEAD(&free_list);
 
+        /*
+         * We need this spin lock here because interrupt handler
+         * might modify this list in qib_user_sdma_send_desc(), also
+         * we can not get interrupted, otherwise it is a deadlock.
+         */
+        spin_lock_irqsave(&pq->sent_lock, flags);
         list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) {
                 s64 descd = ppd->sdma_descq_removed - pkt->added;
 
@@ -583,7 +998,9 @@ static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd,
 
                 /* one more packet cleaned */
                 ret++;
+                pq->num_sending--;
         }
+        spin_unlock_irqrestore(&pq->sent_lock, flags);
 
         if (!list_empty(&free_list)) {
                 u32 counter;
@@ -627,6 +1044,7 @@ void qib_user_sdma_queue_drain(struct qib_pportdata *ppd,
                               struct qib_user_sdma_queue *pq)
 {
         struct qib_devdata *dd = ppd->dd;
+        unsigned long flags;
         int i;
 
         if (!pq)
@@ -634,7 +1052,7 @@ void qib_user_sdma_queue_drain(struct qib_pportdata *ppd,
 
         for (i = 0; i < QIB_USER_SDMA_DRAIN_TIMEOUT; i++) {
                 mutex_lock(&pq->lock);
-                if (list_empty(&pq->sent)) {
+                if (!pq->num_pending && !pq->num_sending) {
                         mutex_unlock(&pq->lock);
                         break;
                 }
@@ -644,29 +1062,44 @@ void qib_user_sdma_queue_drain(struct qib_pportdata *ppd,
                 msleep(10);
         }
 
-        if (!list_empty(&pq->sent)) {
+        if (pq->num_pending || pq->num_sending) {
+                struct qib_user_sdma_pkt *pkt;
+                struct qib_user_sdma_pkt *pkt_prev;
                 struct list_head free_list;
 
+                mutex_lock(&pq->lock);
+                spin_lock_irqsave(&ppd->sdma_lock, flags);
+                /*
+                 * Since we hold sdma_lock, it is safe without sent_lock.
+                 */
+                if (pq->num_pending) {
+                        list_for_each_entry_safe(pkt, pkt_prev,
+                                        &ppd->sdma_userpending, list) {
+                                if (pkt->pq == pq) {
+                                        list_move_tail(&pkt->list, &pq->sent);
+                                        pq->num_pending--;
+                                        pq->num_sending++;
+                                }
+                        }
+                }
+                spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+
                 qib_dev_err(dd, "user sdma lists not empty: forcing!\n");
                 INIT_LIST_HEAD(&free_list);
-                mutex_lock(&pq->lock);
                 list_splice_init(&pq->sent, &free_list);
+                pq->num_sending = 0;
                 qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
                 mutex_unlock(&pq->lock);
         }
 }
 
-static inline __le64 qib_sdma_make_desc0(struct qib_pportdata *ppd,
+static inline __le64 qib_sdma_make_desc0(u8 gen,
                                          u64 addr, u64 dwlen, u64 dwoffset)
 {
-        u8 tmpgen;
-
-        tmpgen = ppd->sdma_generation;
-
         return cpu_to_le64(/* SDmaPhyAddr[31:0] */
                            ((addr & 0xfffffffcULL) << 32) |
                            /* SDmaGeneration[1:0] */
-                           ((tmpgen & 3ULL) << 30) |
+                           ((gen & 3ULL) << 30) |
                            /* SDmaDwordCount[10:0] */
                            ((dwlen & 0x7ffULL) << 16) |
                            /* SDmaBufOffset[12:2] */
@@ -692,7 +1125,7 @@ static inline __le64 qib_sdma_make_desc1(u64 addr)
 
 static void qib_user_sdma_send_frag(struct qib_pportdata *ppd,
                                     struct qib_user_sdma_pkt *pkt, int idx,
-                                    unsigned ofs, u16 tail)
+                                    unsigned ofs, u16 tail, u8 gen)
 {
         const u64 addr = (u64) pkt->addr[idx].addr +
                 (u64) pkt->addr[idx].offset;
@@ -702,104 +1135,132 @@ static void qib_user_sdma_send_frag(struct qib_pportdata *ppd,
 
         descqp = &ppd->sdma_descq[tail].qw[0];
 
-        descq0 = qib_sdma_make_desc0(ppd, addr, dwlen, ofs);
-        if (idx == 0)
+        descq0 = qib_sdma_make_desc0(gen, addr, dwlen, ofs);
+        if (pkt->addr[idx].first_desc)
                 descq0 = qib_sdma_make_first_desc0(descq0);
-        if (idx == pkt->naddr - 1)
+        if (pkt->addr[idx].last_desc) {
                 descq0 = qib_sdma_make_last_desc0(descq0);
+                if (ppd->sdma_intrequest) {
+                        descq0 |= cpu_to_le64(1ULL << 15);
+                        ppd->sdma_intrequest = 0;
+                }
+        }
 
         descqp[0] = descq0;
         descqp[1] = qib_sdma_make_desc1(addr);
 }
 
-/* pq->lock must be held, get packets on the wire... */
-static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd,
-                                   struct qib_user_sdma_queue *pq,
-                                   struct list_head *pktlist)
+void qib_user_sdma_send_desc(struct qib_pportdata *ppd,
+                             struct list_head *pktlist)
 {
         struct qib_devdata *dd = ppd->dd;
-        int ret = 0;
-        unsigned long flags;
-        u16 tail;
-        u8 generation;
-        u64 descq_added;
-
-        if (list_empty(pktlist))
-                return 0;
+        u16 nfree, nsent;
+        u16 tail, tail_c;
+        u8 gen, gen_c;
 
-        if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE)))
-                return -ECOMM;
-
-        spin_lock_irqsave(&ppd->sdma_lock, flags);
-
-        /* keep a copy for restoring purposes in case of problems */
-        generation = ppd->sdma_generation;
-        descq_added = ppd->sdma_descq_added;
-
-        if (unlikely(!__qib_sdma_running(ppd))) {
-                ret = -ECOMM;
-                goto unlock;
-        }
+        nfree = qib_sdma_descq_freecnt(ppd);
+        if (!nfree)
+                return;
 
-        tail = ppd->sdma_descq_tail;
+retry:
+        nsent = 0;
+        tail_c = tail = ppd->sdma_descq_tail;
+        gen_c = gen = ppd->sdma_generation;
         while (!list_empty(pktlist)) {
                 struct qib_user_sdma_pkt *pkt =
                         list_entry(pktlist->next, struct qib_user_sdma_pkt,
                                    list);
-                int i;
+                int i, j, c = 0;
                 unsigned ofs = 0;
                 u16 dtail = tail;
 
-                if (pkt->naddr > qib_sdma_descq_freecnt(ppd))
-                        goto unlock_check_tail;
-
-                for (i = 0; i < pkt->naddr; i++) {
-                        qib_user_sdma_send_frag(ppd, pkt, i, ofs, tail);
+                for (i = pkt->index; i < pkt->naddr && nfree; i++) {
+                        qib_user_sdma_send_frag(ppd, pkt, i, ofs, tail, gen);
                         ofs += pkt->addr[i].length >> 2;
 
                         if (++tail == ppd->sdma_descq_cnt) {
                                 tail = 0;
-                                ++ppd->sdma_generation;
+                                ++gen;
+                                ppd->sdma_intrequest = 1;
+                        } else if (tail == (ppd->sdma_descq_cnt>>1)) {
+                                ppd->sdma_intrequest = 1;
                         }
-                }
+                        nfree--;
+                        if (pkt->addr[i].last_desc == 0)
+                                continue;
 
-                if ((ofs << 2) > ppd->ibmaxlen) {
-                        ret = -EMSGSIZE;
-                        goto unlock;
-                }
-
-                /*
-                 * If the packet is >= 2KB mtu equivalent, we have to use
-                 * the large buffers, and have to mark each descriptor as
-                 * part of a large buffer packet.
-                 */
-                if (ofs > dd->piosize2kmax_dwords) {
-                        for (i = 0; i < pkt->naddr; i++) {
-                                ppd->sdma_descq[dtail].qw[0] |=
-                                        cpu_to_le64(1ULL << 14);
-                                if (++dtail == ppd->sdma_descq_cnt)
-                                        dtail = 0;
+                        /*
+                         * If the packet is >= 2KB mtu equivalent, we
+                         * have to use the large buffers, and have to
+                         * mark each descriptor as part of a large
+                         * buffer packet.
+                         */
+                        if (ofs > dd->piosize2kmax_dwords) {
+                                for (j = pkt->index; j <= i; j++) {
+                                        ppd->sdma_descq[dtail].qw[0] |=
+                                                cpu_to_le64(1ULL << 14);
+                                        if (++dtail == ppd->sdma_descq_cnt)
+                                                dtail = 0;
+                                }
                         }
+                        c += i + 1 - pkt->index;
+                        pkt->index = i + 1; /* index for next first */
+                        tail_c = dtail = tail;
+                        gen_c = gen;
+                        ofs = 0;  /* reset for next packet */
                 }
 
-                ppd->sdma_descq_added += pkt->naddr;
-                pkt->added = ppd->sdma_descq_added;
-                list_move_tail(&pkt->list, &pq->sent);
-                ret++;
+                ppd->sdma_descq_added += c;
+                nsent += c;
+                if (pkt->index == pkt->naddr) {
+                        pkt->added = ppd->sdma_descq_added;
+                        pkt->pq->added = pkt->added;
+                        pkt->pq->num_pending--;
+                        spin_lock(&pkt->pq->sent_lock);
+                        pkt->pq->num_sending++;
+                        list_move_tail(&pkt->list, &pkt->pq->sent);
+                        spin_unlock(&pkt->pq->sent_lock);
+                }
+                if (!nfree || (nsent<<2) > ppd->sdma_descq_cnt)
+                        break;
         }
 
-unlock_check_tail:
         /* advance the tail on the chip if necessary */
-        if (ppd->sdma_descq_tail != tail)
-                dd->f_sdma_update_tail(ppd, tail);
+        if (ppd->sdma_descq_tail != tail_c) {
+                ppd->sdma_generation = gen_c;
+                dd->f_sdma_update_tail(ppd, tail_c);
+        }
+
+        if (nfree && !list_empty(pktlist))
+                goto retry;
+
+        return;
+}
+
+/* pq->lock must be held, get packets on the wire... */
+static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd,
+                                   struct qib_user_sdma_queue *pq,
+                                   struct list_head *pktlist, int count)
+{
+        int ret = 0;
+        unsigned long flags;
+
+        if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE)))
+                return -ECOMM;
+
+        spin_lock_irqsave(&ppd->sdma_lock, flags);
 
-unlock:
-        if (unlikely(ret < 0)) {
-                ppd->sdma_generation = generation;
-                ppd->sdma_descq_added = descq_added;
+        if (unlikely(!__qib_sdma_running(ppd))) {
+                ret = -ECOMM;
+                goto unlock;
         }
-        spin_unlock_irqrestore(&ppd->sdma_lock, flags);
 
+        pq->num_pending += count;
+        list_splice_tail_init(pktlist, &ppd->sdma_userpending);
+        qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending);
+
+unlock:
+        spin_unlock_irqrestore(&ppd->sdma_lock, flags);
         return ret;
 }
 
@@ -822,19 +1283,23 @@ int qib_user_sdma_writev(struct qib_ctxtdata *rcd,
         if (!qib_sdma_running(ppd))
                 goto done_unlock;
 
-        if (ppd->sdma_descq_added != ppd->sdma_descq_removed) {
+        /* if I have packets not complete yet */
+        if (pq->added > ppd->sdma_descq_removed)
                 qib_user_sdma_hwqueue_clean(ppd);
+        /* if I have complete packets to be freed */
+        if (pq->num_sending)
                 qib_user_sdma_queue_clean(ppd, pq);
-        }
 
         while (dim) {
-                const int mxp = 8;
+                int mxp = 8;
+                int ndesc = 0;
 
                 down_write(&current->mm->mmap_sem);
-                ret = qib_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
+                ret = qib_user_sdma_queue_pkts(dd, ppd, pq,
+                                iov, dim, &list, &mxp, &ndesc);
                 up_write(&current->mm->mmap_sem);
 
-                if (ret <= 0)
+                if (ret < 0)
                         goto done_unlock;
                 else {
                         dim -= ret;
@@ -844,24 +1309,20 @@ int qib_user_sdma_writev(struct qib_ctxtdata *rcd,
                         /* force packets onto the sdma hw queue... */
                         if (!list_empty(&list)) {
                                 /*
-                                 * Lazily clean hw queue. the 4 is a guess of about
-                                 * how many sdma descriptors a packet will take (it
-                                 * doesn't have to be perfect).
+                                 * Lazily clean hw queue.
                                  */
-                                if (qib_sdma_descq_freecnt(ppd) < ret * 4) {
+                                if (qib_sdma_descq_freecnt(ppd) < ndesc) {
                                         qib_user_sdma_hwqueue_clean(ppd);
-                                        qib_user_sdma_queue_clean(ppd, pq);
+                                        if (pq->num_sending)
+                                                qib_user_sdma_queue_clean(ppd, pq);
                                 }
 
-                                ret = qib_user_sdma_push_pkts(ppd, pq, &list);
+                                ret = qib_user_sdma_push_pkts(ppd, pq, &list, mxp);
                                 if (ret < 0)
                                         goto done_unlock;
                                 else {
-                                        npkts += ret;
-                                        pq->counter += ret;
-
-                                        if (!list_empty(&list))
-                                                goto done_unlock;
+                                        npkts += mxp;
+                                        pq->counter += mxp;
                                 }
                         }
                 }