author     Jens Axboe <axboe@fb.com>    2015-01-22 14:07:58 -0500
committer  Jens Axboe <axboe@fb.com>    2015-01-29 12:25:34 -0500
commit     ac3dd5bd128b1d1ce2a037775766f39d06a4848a (patch)
tree       9e6390eac98ffbbe5d7d82e6c585125ed9bba0cf /drivers/block
parent     4ca5829ac8b1297715bf609443ade2c332f3fd0c (diff)
NVMe: avoid kmalloc/kfree for smaller IO
Currently we allocate an nvme_iod for each IO, which holds the
sg list, PRPs, and other IO-related info. Set a threshold of
2 pages and/or 8KB of data, below which we can simply embed the iod
in the per-command pdu in blk-mq. For any IO at or below
NVME_INT_PAGES and NVME_INT_BYTES, we save a kmalloc and a kfree.
At higher IOPS, this saves up to 1% of CPU time.
Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
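The gist of the change is a sizing calculation: blk-mq is told to reserve enough per-command pdu space for the nvme_cmd_info plus a worst-case "small" iod (NVME_INT_PAGES segments, NVME_INT_BYTES of data), so the common path never touches the allocator. As a rough illustration of that arithmetic, the user-space sketch below mirrors the patch's nvme_npages()/nvme_cmd_size() logic with simplified stand-in types; the struct layouts, the helper names (npages, embedded_iod_size, iod_hdr, sg_entry), and the 4KB device page size are assumptions for illustration only, not the driver's real definitions.

#include <stdio.h>
#include <stddef.h>

/*
 * Simplified stand-ins for the driver's types; sizes are illustrative,
 * not the real kernel layouts.
 */
struct sg_entry { void *page; unsigned int offset, length; };
struct iod_hdr  { unsigned long private; int offset, npages, nents, length; };

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
#define NVME_INT_PAGES          2
#define NVME_INT_BYTES(ps)      (NVME_INT_PAGES * (ps))

/* Worst-case number of PRP list pages needed for a transfer of 'size' bytes. */
static unsigned npages(unsigned size, unsigned page_size)
{
        unsigned nprps = DIV_ROUND_UP(size + page_size, page_size);

        return DIV_ROUND_UP(8 * nprps, page_size - 8);
}

/*
 * Extra pdu bytes to reserve per command: the embedded iod header, one
 * PRP-list pointer per worst-case page, and NVME_INT_PAGES sg entries.
 */
static size_t embedded_iod_size(unsigned page_size)
{
        size_t ret = sizeof(struct iod_hdr);

        ret += sizeof(void *) * npages(NVME_INT_BYTES(page_size), page_size);
        ret += sizeof(struct sg_entry) * NVME_INT_PAGES;
        return ret;
}

int main(void)
{
        unsigned page_size = 4096;      /* assumed device page size */

        printf("embed threshold: <= %u segments and <= %u bytes\n",
               NVME_INT_PAGES, NVME_INT_BYTES(page_size));
        printf("extra pdu space per command: %zu bytes\n",
               embedded_iod_size(page_size));
        return 0;
}

The real nvme_cmd_size() adds this on top of sizeof(struct nvme_cmd_info); the per-command overhead is modest (an iod header, a couple of sg entries, and one PRP-list pointer), which is what makes reserving it unconditionally cheap.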
Diffstat (limited to 'drivers/block')
-rw-r--r--    drivers/block/nvme-core.c    119
1 file changed, 88 insertions(+), 31 deletions(-)
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index f4aa64160838..3eaa0becc52d 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -144,8 +144,37 @@ struct nvme_cmd_info {
         void *ctx;
         int aborted;
         struct nvme_queue *nvmeq;
+        struct nvme_iod iod[0];
 };
 
+/*
+ * Max size of iod being embedded in the request payload
+ */
+#define NVME_INT_PAGES          2
+#define NVME_INT_BYTES(dev)     (NVME_INT_PAGES * (dev)->page_size)
+
+/*
+ * Will slightly overestimate the number of pages needed. This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+        unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+        return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+        unsigned int ret = sizeof(struct nvme_cmd_info);
+
+        ret += sizeof(struct nvme_iod);
+        ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
+        ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+        return ret;
+}
+
 static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
                                 unsigned int hctx_idx)
 {
@@ -217,6 +246,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
         cmd->aborted = 0;
 }
 
+static void *iod_get_private(struct nvme_iod *iod)
+{
+        return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+        return (iod->private & 0x01) == 0;
+}
+
 /* Special values must be less than 0x1000 */
 #define CMD_CTX_BASE            ((void *)POISON_POINTER_DELTA)
 #define CMD_CTX_CANCELLED       (0x30C + CMD_CTX_BASE)
@@ -360,35 +402,53 @@ static __le64 **iod_list(struct nvme_iod *iod)
         return ((void *)iod) + iod->offset;
 }
 
-/*
- * Will slightly overestimate the number of pages needed. This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static int nvme_npages(unsigned size, struct nvme_dev *dev)
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+                            unsigned nseg, unsigned long private)
 {
-        unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
-        return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
+        iod->private = private;
+        iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+        iod->npages = -1;
+        iod->length = nbytes;
+        iod->nents = 0;
 }
 
 static struct nvme_iod *
-nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+                 unsigned long priv, gfp_t gfp)
 {
         struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
-                                sizeof(__le64 *) * nvme_npages(nbytes, dev) +
+                                sizeof(__le64 *) * nvme_npages(bytes, dev) +
                                 sizeof(struct scatterlist) * nseg, gfp);
 
-        if (iod) {
-                iod->offset = offsetof(struct nvme_iod, sg[nseg]);
-                iod->npages = -1;
-                iod->length = nbytes;
-                iod->nents = 0;
-                iod->first_dma = 0ULL;
-        }
+        if (iod)
+                iod_init(iod, bytes, nseg, priv);
 
         return iod;
 }
 
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+                                       gfp_t gfp)
+{
+        unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+                                                sizeof(struct nvme_dsm_range);
+        unsigned long mask = 0;
+        struct nvme_iod *iod;
+
+        if (rq->nr_phys_segments <= NVME_INT_PAGES &&
+            size <= NVME_INT_BYTES(dev)) {
+                struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+                iod = cmd->iod;
+                mask = 0x01;
+                iod_init(iod, size, rq->nr_phys_segments,
+                                (unsigned long) rq | 0x01);
+                return iod;
+        }
+
+        return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+                                (unsigned long) rq, gfp);
+}
+
 void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 {
         const int last_prp = dev->page_size / 8 - 1;
@@ -404,7 +464,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
                 dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
                 prp_dma = next_prp_dma;
         }
-        kfree(iod);
+
+        if (iod_should_kfree(iod))
+                kfree(iod);
 }
 
 static int nvme_error_status(u16 status)
@@ -423,7 +485,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
                                                 struct nvme_completion *cqe)
 {
         struct nvme_iod *iod = ctx;
-        struct request *req = iod->private;
+        struct request *req = iod_get_private(iod);
         struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
 
         u16 status = le16_to_cpup(&cqe->status) >> 1;
@@ -579,7 +641,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
 static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
                                                         struct nvme_ns *ns)
 {
-        struct request *req = iod->private;
+        struct request *req = iod_get_private(iod);
         struct nvme_command *cmnd;
         u16 control = 0;
         u32 dsmgmt = 0;
@@ -620,17 +682,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
         struct request *req = bd->rq;
         struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
         struct nvme_iod *iod;
-        int psegs = req->nr_phys_segments;
         enum dma_data_direction dma_dir;
-        unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
-                                                sizeof(struct nvme_dsm_range);
 
-        iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
+        iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
         if (!iod)
                 return BLK_MQ_RQ_QUEUE_BUSY;
 
-        iod->private = req;
-
         if (req->cmd_flags & REQ_DISCARD) {
                 void *range;
                 /*
@@ -645,10 +702,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
                         goto retry_cmd;
                 iod_list(iod)[0] = (__le64 *)range;
                 iod->npages = 0;
-        } else if (psegs) {
+        } else if (req->nr_phys_segments) {
                 dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 
-                sg_init_table(iod->sg, psegs);
+                sg_init_table(iod->sg, req->nr_phys_segments);
                 iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
                 if (!iod->nents)
                         goto error_cmd;
@@ -1362,7 +1419,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
         dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
         dev->admin_tagset.timeout = ADMIN_TIMEOUT;
         dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
-        dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
+        dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
         dev->admin_tagset.driver_data = dev;
 
         if (blk_mq_alloc_tag_set(&dev->admin_tagset))
@@ -1483,7 +1540,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
         }
 
         err = -ENOMEM;
-        iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
+        iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
         if (!iod)
                 goto put_pages;
 
@@ -2109,7 +2166,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
         dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
         dev->tagset.queue_depth =
                                 min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
-        dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
+        dev->tagset.cmd_size = nvme_cmd_size(dev);
         dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
         dev->tagset.driver_data = dev;
 
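The free path relies on a pointer-tagging idiom: a struct request is at least word-aligned, so bit 0 of iod->private can record "this iod is embedded in the pdu, do not kfree it" without disturbing the pointer itself. Below is a minimal user-space sketch of that idiom; the request and iod types and the iod_set_private()/iod_should_free() names are placeholders for illustration (the patch itself uses iod_init(), iod_get_private() and iod_should_kfree()), assuming only that the tagged object is word-aligned.

#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

/* Placeholder for the owning object (a struct request in the driver). */
struct request { int tag; };

struct iod {
        unsigned long private;  /* owner pointer, bit 0 = "embedded" flag */
};

static void iod_set_private(struct iod *iod, struct request *rq, bool embedded)
{
        /*
         * Pointers to word-aligned objects never have bit 0 set, so that bit
         * is free to carry the "lives in the pdu, do not free" flag.
         */
        iod->private = (unsigned long)rq | (embedded ? 0x1UL : 0x0UL);
}

static struct request *iod_get_private(struct iod *iod)
{
        return (struct request *)(iod->private & ~0x1UL);
}

static bool iod_should_free(struct iod *iod)
{
        return (iod->private & 0x1UL) == 0;
}

int main(void)
{
        struct request rq = { .tag = 7 };
        struct iod embedded, *heap = malloc(sizeof(*heap));

        if (!heap)
                return 1;

        iod_set_private(&embedded, &rq, true);  /* lives in the pdu */
        iod_set_private(heap, &rq, false);      /* came from the allocator */

        assert(iod_get_private(&embedded) == &rq);
        assert(iod_get_private(heap) == &rq);
        assert(!iod_should_free(&embedded));
        assert(iod_should_free(heap));

        if (iod_should_free(heap))
                free(heap);
        return 0;
}

Because the flag travels inside the iod itself, both allocation paths share the same submission and completion code; only the final kfree in nvme_free_iod() is conditional.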