author		Jens Axboe <axboe@fb.com>	2015-01-22 14:07:58 -0500
committer	Jens Axboe <axboe@fb.com>	2015-01-29 12:25:34 -0500
commit		ac3dd5bd128b1d1ce2a037775766f39d06a4848a (patch)
tree		9e6390eac98ffbbe5d7d82e6c585125ed9bba0cf /drivers/block
parent		4ca5829ac8b1297715bf609443ade2c332f3fd0c (diff)
NVMe: avoid kmalloc/kfree for smaller IO
Currently we allocate an nvme_iod for each IO, which holds the
sg list, prps, and other IO related info. Set a threshold of
2 pages and/or 8KB of data, below which we can just embed this
in the per-command pdu in blk-mq. For any IO at or below
NVME_INT_PAGES and NVME_INT_BYTES, we save a kmalloc and kfree.

For higher IOPS, this saves up to 1% of CPU time.

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
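To illustrate the idea outside the driver, below is a minimal userspace sketch of the same trick: a small descriptor is embedded in a preallocated per-command area, a larger one falls back to heap allocation, and bit 0 of its "private" word records which case it is, so the free path knows whether to call free(). The names (demo_iod, demo_cmd, DEMO_INT_SEGS, DEMO_INT_BYTES, alloc_iod, free_iod) are illustrative stand-ins, not symbols from nvme-core.c, and the thresholds are hard-coded rather than derived from the device page size.

/*
 * Sketch of the embed-or-allocate pattern used by this patch.
 * All names here are hypothetical; they mirror, but are not, the
 * driver's NVME_INT_PAGES / NVME_INT_BYTES / iod_should_kfree logic.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>

#define DEMO_INT_SEGS	2	/* stand-in for NVME_INT_PAGES        */
#define DEMO_INT_BYTES	8192	/* stand-in for NVME_INT_BYTES(dev)   */

struct demo_iod {
	uintptr_t private;	/* request pointer, bit 0 = embedded  */
	unsigned length;
};

struct demo_cmd {		/* stand-in for the blk-mq per-command pdu */
	struct demo_iod iod;	/* room reserved up front via cmd_size     */
};

static bool iod_should_free(struct demo_iod *iod)
{
	/* bit 0 set means "embedded in the pdu", so nothing to free */
	return (iod->private & 0x01) == 0;
}

static struct demo_iod *alloc_iod(struct demo_cmd *cmd, void *req,
				  unsigned nseg, unsigned bytes)
{
	if (nseg <= DEMO_INT_SEGS && bytes <= DEMO_INT_BYTES) {
		struct demo_iod *iod = &cmd->iod;

		iod->private = (uintptr_t)req | 0x01;	/* tag as embedded */
		iod->length = bytes;
		return iod;
	}

	struct demo_iod *iod = malloc(sizeof(*iod));
	if (iod) {
		iod->private = (uintptr_t)req;		/* untagged: heap */
		iod->length = bytes;
	}
	return iod;
}

static void free_iod(struct demo_iod *iod)
{
	if (iod_should_free(iod))
		free(iod);
}

int main(void)
{
	struct demo_cmd cmd;
	int req;		/* dummy request object */

	struct demo_iod *small = alloc_iod(&cmd, &req, 1, 4096);
	struct demo_iod *large = alloc_iod(&cmd, &req, 8, 65536);

	printf("small: embedded=%d\n", !iod_should_free(small));
	printf("large: embedded=%d\n", !iod_should_free(large));

	free_iod(small);
	free_iod(large);
	return 0;
}

The tag-in-bit-0 scheme only works because the request pointer is at least 2-byte aligned, which leaves the low bit free to carry the "embedded" flag without any extra storage.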
Diffstat (limited to 'drivers/block')
-rw-r--r--	drivers/block/nvme-core.c	119
1 file changed, 88 insertions, 31 deletions
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index f4aa64160838..3eaa0becc52d 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -144,8 +144,37 @@ struct nvme_cmd_info {
 	void *ctx;
 	int aborted;
 	struct nvme_queue *nvmeq;
+	struct nvme_iod iod[0];
 };
 
+/*
+ * Max size of iod being embedded in the request payload
+ */
+#define NVME_INT_PAGES		2
+#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->page_size)
+
+/*
+ * Will slightly overestimate the number of pages needed.  This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+	unsigned int ret = sizeof(struct nvme_cmd_info);
+
+	ret += sizeof(struct nvme_iod);
+	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
+	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+	return ret;
+}
+
 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 				unsigned int hctx_idx)
 {
@@ -217,6 +246,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
 	cmd->aborted = 0;
 }
 
+static void *iod_get_private(struct nvme_iod *iod)
+{
+	return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+	return (iod->private & 0x01) == 0;
+}
+
 /* Special values must be less than 0x1000 */
 #define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
 #define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
@@ -360,35 +402,53 @@ static __le64 **iod_list(struct nvme_iod *iod)
 	return ((void *)iod) + iod->offset;
 }
 
-/*
- * Will slightly overestimate the number of pages needed.  This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static int nvme_npages(unsigned size, struct nvme_dev *dev)
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+			    unsigned nseg, unsigned long private)
 {
-	unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
-	return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
+	iod->private = private;
+	iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+	iod->npages = -1;
+	iod->length = nbytes;
+	iod->nents = 0;
 }
 
 static struct nvme_iod *
-nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+		 unsigned long priv, gfp_t gfp)
 {
 	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
-				sizeof(__le64 *) * nvme_npages(nbytes, dev) +
+				sizeof(__le64 *) * nvme_npages(bytes, dev) +
 				sizeof(struct scatterlist) * nseg, gfp);
 
-	if (iod) {
-		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
-		iod->npages = -1;
-		iod->length = nbytes;
-		iod->nents = 0;
-		iod->first_dma = 0ULL;
-	}
+	if (iod)
+		iod_init(iod, bytes, nseg, priv);
 
 	return iod;
 }
 
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+				       gfp_t gfp)
+{
+	unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+						sizeof(struct nvme_dsm_range);
+	unsigned long mask = 0;
+	struct nvme_iod *iod;
+
+	if (rq->nr_phys_segments <= NVME_INT_PAGES &&
+	    size <= NVME_INT_BYTES(dev)) {
+		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+		iod = cmd->iod;
+		mask = 0x01;
+		iod_init(iod, size, rq->nr_phys_segments,
+				(unsigned long) rq | 0x01);
+		return iod;
+	}
+
+	return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+				(unsigned long) rq, gfp);
+}
+
 void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 {
 	const int last_prp = dev->page_size / 8 - 1;
@@ -404,7 +464,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
 		prp_dma = next_prp_dma;
 	}
-	kfree(iod);
+
+	if (iod_should_kfree(iod))
+		kfree(iod);
 }
 
 static int nvme_error_status(u16 status)
@@ -423,7 +485,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 						struct nvme_completion *cqe)
 {
 	struct nvme_iod *iod = ctx;
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
@@ -579,7 +641,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
 								struct nvme_ns *ns)
 {
-	struct request *req = iod->private;
+	struct request *req = iod_get_private(iod);
 	struct nvme_command *cmnd;
 	u16 control = 0;
 	u32 dsmgmt = 0;
@@ -620,17 +682,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *req = bd->rq;
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
-	int psegs = req->nr_phys_segments;
 	enum dma_data_direction dma_dir;
-	unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
-						sizeof(struct nvme_dsm_range);
 
-	iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
+	iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
 	if (!iod)
 		return BLK_MQ_RQ_QUEUE_BUSY;
 
-	iod->private = req;
-
 	if (req->cmd_flags & REQ_DISCARD) {
 		void *range;
 		/*
@@ -645,10 +702,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			goto retry_cmd;
 		iod_list(iod)[0] = (__le64 *)range;
 		iod->npages = 0;
-	} else if (psegs) {
+	} else if (req->nr_phys_segments) {
 		dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 
-		sg_init_table(iod->sg, psegs);
+		sg_init_table(iod->sg, req->nr_phys_segments);
 		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
 		if (!iod->nents)
 			goto error_cmd;
@@ -1362,7 +1419,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 	dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
 	dev->admin_tagset.timeout = ADMIN_TIMEOUT;
 	dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
-	dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+	dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
 	dev->admin_tagset.driver_data = dev;
 
 	if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1483,7 +1540,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 	}
 
 	err = -ENOMEM;
-	iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
+	iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
 	if (!iod)
 		goto put_pages;
 
@@ -2109,7 +2166,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
 	dev->tagset.queue_depth =
 				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
-	dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+	dev->tagset.cmd_size = nvme_cmd_size(dev);
 	dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
 	dev->tagset.driver_data = dev;
 