author	Jens Axboe <axboe@kernel.dk>	2018-06-21 11:49:37 -0400
committer	Christoph Hellwig <hch@lst.de>	2018-06-21 12:59:46 -0400
commit	943e942e6266f22babee5efeb00f8f672fbff5bd (patch)
tree	9122de26af304afdf313020e689e9e4008de375c
parent	9f9cafc14016f23f982d3ce18f9057923bd3037a (diff)
nvme-pci: limit max IO size and segments to avoid high order allocations
nvme requires an sg table allocation for each request. If the request
is large, then the allocation can become quite large. For instance,
with our default software settings of 1280KB IO size, we'll need
10248 bytes of sg table. That turns into a 2nd order allocation,
which we can't always guarantee. If we fail the allocation, blk-mq
will retry it later. But there's no guarantee that we'll EVER be able
to allocate that much contiguous memory.

Limit the IO size such that we never need more than a single page of
memory. That's a lot faster and more reliable. Then back that
allocation with a mempool, so that we know we'll always be able to
succeed the allocation at some point.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Acked-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
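For reference, here is a small, self-contained userspace sketch of the sizing arithmetic the commit message describes. The constants (4 KiB device pages, 8-byte PRP-list pointers, 32-byte scatterlist entries) and the helper names (prp_list_pages, iod_alloc_size) are illustrative assumptions that mirror the driver's PRP-list path; this is not the driver's exact nvme_pci_iod_alloc_size(). It only shows why a 1280KB request needs roughly 10248 bytes of sg/PRP bookkeeping (an order-2 allocation) while the new 4096KB / 127-segment cap stays within a single 4096-byte page.

#include <stdio.h>
#include <stddef.h>

/* Illustrative constants (assumptions): 4 KiB device pages, 8-byte
 * PRP-list pointers, 32-byte scatterlist entries on 64-bit. */
#define DEV_PAGE_SIZE	4096u
#define PRP_PTR_SIZE	8u
#define SG_ENTRY_SIZE	32u

/* Number of PRP-list pages needed to describe an I/O of `bytes` bytes
 * (worst case, rounded up). */
static unsigned int prp_list_pages(unsigned int bytes)
{
	unsigned int nprps = bytes / DEV_PAGE_SIZE + 1;

	return (nprps * PRP_PTR_SIZE + (DEV_PAGE_SIZE - PRP_PTR_SIZE) - 1) /
	       (DEV_PAGE_SIZE - PRP_PTR_SIZE);
}

/* Per-request bookkeeping: one pointer per PRP-list page plus one
 * scatterlist entry per segment. */
static size_t iod_alloc_size(unsigned int bytes, unsigned int nseg)
{
	return (size_t)PRP_PTR_SIZE * prp_list_pages(bytes) +
	       (size_t)SG_ENTRY_SIZE * nseg;
}

int main(void)
{
	/* Old default: a 1280KB request, one segment per device page. */
	unsigned int old_bytes = 1280u * 1024u;
	unsigned int old_segs  = old_bytes / DEV_PAGE_SIZE;

	/* New caps from this patch, worst-cased independently the same
	 * way the probe-time check does: 4096KB and 127 segments. */
	unsigned int new_bytes = 4096u * 1024u;
	unsigned int new_segs  = 127u;

	printf("1280KB, %u segs -> %zu bytes (spills past 2 pages, order-2)\n",
	       old_segs, iod_alloc_size(old_bytes, old_segs));
	printf("4096KB, %u segs -> %zu bytes (fits in one %u-byte page)\n",
	       new_segs, iod_alloc_size(new_bytes, new_segs), DEV_PAGE_SIZE);
	return 0;
}

Compiled and run (e.g. gcc sg_size.c && ./a.out), it prints 10248 bytes for the old default and 4088 bytes for the new caps, matching the figures above under these assumptions.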
-rw-r--r--	drivers/nvme/host/core.c	 1
-rw-r--r--	drivers/nvme/host/nvme.h	 1
-rw-r--r--	drivers/nvme/host/pci.c 	42
3 files changed, 39 insertions, 5 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 21710a7460c8..46df030b2c3f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1808,6 +1808,7 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 	u32 max_segments =
 		(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
 
+	max_segments = min_not_zero(max_segments, ctrl->max_segments);
 	blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
 	blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
 }
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 231807cbc849..0c4a33df3b2f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -170,6 +170,7 @@ struct nvme_ctrl {
 	u64 cap;
 	u32 page_size;
 	u32 max_hw_sectors;
+	u32 max_segments;
 	u16 oncs;
 	u16 oacs;
 	u16 nssa;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 73a97fcea364..ba943f211687 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -38,6 +38,13 @@
 
 #define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
 
+/*
+ * These can be higher, but we need to ensure that any command doesn't
+ * require an sg allocation that needs more than a page of data.
+ */
+#define NVME_MAX_KB_SZ	4096
+#define NVME_MAX_SEGS	127
+
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
@@ -100,6 +107,8 @@ struct nvme_dev {
 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
 
+	mempool_t *iod_mempool;
+
 	/* shadow doorbell buffer support: */
 	u32 *dbbuf_dbs;
 	dma_addr_t dbbuf_dbs_dma_addr;
@@ -477,10 +486,7 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
 	iod->use_sgl = nvme_pci_use_sgls(dev, rq);
 
 	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
-		size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
-				iod->use_sgl);
-
-		iod->sg = kmalloc(alloc_size, GFP_ATOMIC);
+		iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
 		if (!iod->sg)
 			return BLK_STS_RESOURCE;
 	} else {
@@ -526,7 +532,7 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
 	}
 
 	if (iod->sg != iod->inline_sg)
-		kfree(iod->sg);
+		mempool_free(iod->sg, dev->iod_mempool);
 }
 
 #ifdef CONFIG_BLK_DEV_INTEGRITY
@@ -2280,6 +2286,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 	blk_put_queue(dev->ctrl.admin_q);
 	kfree(dev->queues);
 	free_opal_dev(dev->ctrl.opal_dev);
+	mempool_destroy(dev->iod_mempool);
 	kfree(dev);
 }
 
@@ -2334,6 +2341,13 @@ static void nvme_reset_work(struct work_struct *work)
 	if (result)
 		goto out;
 
+	/*
+	 * Limit the max command size to prevent iod->sg allocations going
+	 * over a single page.
+	 */
+	dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1;
+	dev->ctrl.max_segments = NVME_MAX_SEGS;
+
 	result = nvme_init_identify(&dev->ctrl);
 	if (result)
 		goto out;
@@ -2509,6 +2523,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	int node, result = -ENOMEM;
 	struct nvme_dev *dev;
 	unsigned long quirks = id->driver_data;
+	size_t alloc_size;
 
 	node = dev_to_node(&pdev->dev);
 	if (node == NUMA_NO_NODE)
@@ -2546,6 +2561,23 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (result)
 		goto release_pools;
 
+	/*
+	 * Double check that our mempool alloc size will cover the biggest
+	 * command we support.
+	 */
+	alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
+						NVME_MAX_SEGS, true);
+	WARN_ON_ONCE(alloc_size > PAGE_SIZE);
+
+	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
+						mempool_kfree,
+						(void *) alloc_size,
+						GFP_KERNEL, node);
+	if (!dev->iod_mempool) {
+		result = -ENOMEM;
+		goto release_pools;
+	}
+
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
 	nvme_get_ctrl(&dev->ctrl);