author     Keith Busch <keith.busch@intel.com>          2014-02-21 16:13:44 -0500
committer  Matthew Wilcox <matthew.r.wilcox@intel.com>  2014-03-24 08:45:57 -0400
commit     5a92e700af2e5e0e6404988d6a7f2ed3dad3f46f
tree       0de62a3873a05bb0e21fc5195be6ff20af5366cf
parent     fb35e914b3f88cda9ee6f9d776910c35269c4ecf
NVMe: RCU protected access to io queues
This adds RCU-protected access to nvme_queue to fix a race between a surprise
removal freeing the queue and a thread with an open reference on an NVMe block
device using that queue.

The queues do not need to be RCU-protected during the initialization or
shutdown parts, so I've added a helper function for raw dereferencing to get
around the sparse errors.

There is still a hole in the IOCTL path for the same problem, which is fixed
in a subsequent patch.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
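For reference, the publish/read/retire pattern the patch applies to
dev->queues[] is sketched below with the same kernel RCU primitives. This is
a condensed illustration, not driver code; struct example_queue and the
example_*() functions are made-up stand-ins for nvme_queue, nvme_alloc_queue(),
get_nvmeq()/put_nvmeq() and nvme_free_queues().

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_queue {
	struct rcu_head r_head;
	/* ... per-queue state ... */
};

struct example_dev {
	struct example_queue __rcu **queues;
};

/* Publish: make a newly initialised queue visible to RCU readers. */
static void example_publish(struct example_dev *d, int qid,
						struct example_queue *q)
{
	rcu_assign_pointer(d->queues[qid], q);
}

/* Read side: dereference only inside an RCU read-side critical section. */
static void example_use(struct example_dev *d, int qid)
{
	struct example_queue *q;

	rcu_read_lock();
	q = rcu_dereference(d->queues[qid]);
	if (q) {
		/* submit I/O against q; it cannot be freed while this
		 * read-side critical section is still running */
	}
	rcu_read_unlock();
}

/* Deferred free: runs only after all pre-existing readers have finished. */
static void example_free_queue(struct rcu_head *r)
{
	kfree(container_of(r, struct example_queue, r_head));
}

/* Retire: unpublish first, then defer the free past a grace period.
 * The caller is the only updater here, so a raw dereference is fine;
 * this mirrors the raw_nvmeq() helper added by the patch. */
static void example_retire(struct example_dev *d, int qid)
{
	struct example_queue *q = rcu_dereference_raw(d->queues[qid]);

	rcu_assign_pointer(d->queues[qid], NULL);
	call_rcu(&q->r_head, example_free_queue);
}

The ordering matters: a queue is unpublished with rcu_assign_pointer(..., NULL)
before call_rcu() schedules its free, so a reader that already picked up the
pointer finishes its critical section before the memory is reclaimed.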
-rw-r--r--  drivers/block/nvme-core.c | 91
-rw-r--r--  include/linux/nvme.h      |  2
2 files changed, 46 insertions(+), 47 deletions(-)
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index f565212a9e32..b66ab1db4629 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -74,6 +74,7 @@ struct async_cmd_info {
  * commands and one for I/O commands).
  */
 struct nvme_queue {
+	struct rcu_head r_head;
 	struct device *q_dmadev;
 	struct nvme_dev *dev;
 	char irqname[24];	/* nvme4294967295-65535\0 */
@@ -262,14 +263,21 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
 	return ctx;
 }
 
-struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
+static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
 {
-	return dev->queues[get_cpu() + 1];
+	return rcu_dereference_raw(dev->queues[qid]);
 }
 
-void put_nvmeq(struct nvme_queue *nvmeq)
+struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
+{
+	rcu_read_lock();
+	return rcu_dereference(dev->queues[get_cpu() + 1]);
+}
+
+void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
 {
 	put_cpu();
+	rcu_read_unlock();
 }
 
 /**
@@ -852,13 +860,14 @@ static int nvme_submit_async_cmd(struct nvme_queue *nvmeq,
 int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
 								u32 *result)
 {
-	return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT);
+	return nvme_submit_sync_cmd(raw_nvmeq(dev, 0), cmd, result,
+								ADMIN_TIMEOUT);
 }
 
 static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
 			struct nvme_command *cmd, struct async_cmd_info *cmdinfo)
 {
-	return nvme_submit_async_cmd(dev->queues[0], cmd, cmdinfo,
+	return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo,
 								ADMIN_TIMEOUT);
 }
 
@@ -985,6 +994,7 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
 	struct nvme_command cmd;
 	struct nvme_dev *dev = nvmeq->dev;
 	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+	struct nvme_queue *adminq;
 
 	if (!nvmeq->qid || info[cmdid].aborted) {
 		if (work_busy(&dev->reset_work))
@@ -1001,7 +1011,8 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
 	if (!dev->abort_limit)
 		return;
 
-	a_cmdid = alloc_cmdid(dev->queues[0], CMD_CTX_ABORT, special_completion,
+	adminq = rcu_dereference(dev->queues[0]);
+	a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion,
 						ADMIN_TIMEOUT);
 	if (a_cmdid < 0)
 		return;
@@ -1018,7 +1029,7 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
 
 	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid,
 							nvmeq->qid);
-	nvme_submit_cmd(dev->queues[0], &cmd);
+	nvme_submit_cmd(adminq, &cmd);
 }
 
 /**
@@ -1055,8 +1066,10 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
 	}
 }
 
-static void nvme_free_queue(struct nvme_queue *nvmeq)
+static void nvme_free_queue(struct rcu_head *r)
 {
+	struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);
+
 	spin_lock_irq(&nvmeq->q_lock);
 	while (bio_list_peek(&nvmeq->sq_cong)) {
 		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
@@ -1075,10 +1088,13 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
 {
 	int i;
 
+	for (i = num_possible_cpus(); i > dev->queue_count - 1; i--)
+		rcu_assign_pointer(dev->queues[i], NULL);
 	for (i = dev->queue_count - 1; i >= lowest; i--) {
-		nvme_free_queue(dev->queues[i]);
+		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
+		rcu_assign_pointer(dev->queues[i], NULL);
+		call_rcu(&nvmeq->r_head, nvme_free_queue);
 		dev->queue_count--;
-		dev->queues[i] = NULL;
 	}
 }
 
@@ -1116,7 +1132,7 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq)
 
 static void nvme_disable_queue(struct nvme_dev *dev, int qid)
 {
-	struct nvme_queue *nvmeq = dev->queues[qid];
+	struct nvme_queue *nvmeq = raw_nvmeq(dev, qid);
 
 	if (!nvmeq)
 		return;
@@ -1168,6 +1184,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	nvmeq->qid = qid;
 	nvmeq->q_suspended = 1;
 	dev->queue_count++;
+	rcu_assign_pointer(dev->queues[qid], nvmeq);
 
 	return nvmeq;
 
@@ -1311,12 +1328,11 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	if (result < 0)
 		return result;
 
-	nvmeq = dev->queues[0];
+	nvmeq = raw_nvmeq(dev, 0);
 	if (!nvmeq) {
 		nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
 		if (!nvmeq)
 			return -ENOMEM;
-		dev->queues[0] = nvmeq;
 	}
 
 	aqa = nvmeq->q_depth - 1;
@@ -1581,8 +1597,8 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
 	if (length != cmd.data_len)
 		status = -ENOMEM;
 	else
-		status = nvme_submit_sync_cmd(dev->queues[0], &c, &cmd.result,
-								timeout);
+		status = nvme_submit_sync_cmd(raw_nvmeq(dev, 0), &c,
+							&cmd.result, timeout);
 
 	if (cmd.data_len) {
 		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
@@ -1701,8 +1717,10 @@ static int nvme_kthread(void *data)
 				queue_work(nvme_workq, &dev->reset_work);
 				continue;
 			}
+			rcu_read_lock();
 			for (i = 0; i < dev->queue_count; i++) {
-				struct nvme_queue *nvmeq = dev->queues[i];
+				struct nvme_queue *nvmeq =
+						rcu_dereference(dev->queues[i]);
 				if (!nvmeq)
 					continue;
 				spin_lock_irq(&nvmeq->q_lock);
@@ -1714,6 +1732,7 @@ static int nvme_kthread(void *data)
 unlock:
 				spin_unlock_irq(&nvmeq->q_lock);
 			}
+			rcu_read_unlock();
 		}
 		spin_unlock(&dev_list_lock);
 		schedule_timeout(round_jiffies_relative(HZ));
@@ -1808,7 +1827,7 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
 
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
-	struct nvme_queue *adminq = dev->queues[0];
+	struct nvme_queue *adminq = raw_nvmeq(dev, 0);
 	struct pci_dev *pdev = dev->pci_dev;
 	int result, cpu, i, vecs, nr_io_queues, size, q_depth;
 
@@ -1831,7 +1850,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 			size = db_bar_size(dev, nr_io_queues);
 		} while (1);
 		dev->dbs = ((void __iomem *)dev->bar) + 4096;
-		dev->queues[0]->q_db = dev->dbs;
+		adminq->q_db = dev->dbs;
 	}
 
 	/* Deregister the admin queue's interrupt */
@@ -1880,19 +1899,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	}
 
 	/* Free previously allocated queues that are no longer usable */
-	spin_lock(&dev_list_lock);
-	for (i = dev->queue_count - 1; i > nr_io_queues; i--) {
-		struct nvme_queue *nvmeq = dev->queues[i];
-
-		spin_lock_irq(&nvmeq->q_lock);
-		nvme_cancel_ios(nvmeq, false);
-		spin_unlock_irq(&nvmeq->q_lock);
-
-		nvme_free_queue(nvmeq);
-		dev->queue_count--;
-		dev->queues[i] = NULL;
-	}
-	spin_unlock(&dev_list_lock);
+	nvme_free_queues(dev, nr_io_queues);
 
 	cpu = cpumask_first(cpu_online_mask);
 	for (i = 0; i < nr_io_queues; i++) {
@@ -1903,8 +1910,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
 								NVME_Q_DEPTH);
 	for (i = dev->queue_count - 1; i < nr_io_queues; i++) {
-		dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i);
-		if (!dev->queues[i + 1]) {
+		if (!nvme_alloc_queue(dev, i + 1, q_depth, i)) {
 			result = -ENOMEM;
 			goto free_queues;
 		}
@@ -1912,11 +1918,11 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 
 	for (; i < num_possible_cpus(); i++) {
 		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
-		dev->queues[i + 1] = dev->queues[target + 1];
+		rcu_assign_pointer(dev->queues[i + 1], dev->queues[target + 1]);
 	}
 
 	for (i = 1; i < dev->queue_count; i++) {
-		result = nvme_create_queue(dev->queues[i], i);
+		result = nvme_create_queue(raw_nvmeq(dev, i), i);
 		if (result) {
 			for (--i; i > 0; i--)
 				nvme_disable_queue(dev, i);
@@ -2180,7 +2186,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
 	atomic_set(&dq.refcount, 0);
 	dq.worker = &worker;
 	for (i = dev->queue_count - 1; i > 0; i--) {
-		struct nvme_queue *nvmeq = dev->queues[i];
+		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
 
 		if (nvme_suspend_queue(nvmeq))
 			continue;
@@ -2205,7 +2211,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 
 	if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) {
 		for (i = dev->queue_count - 1; i >= 0; i--) {
-			struct nvme_queue *nvmeq = dev->queues[i];
+			struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
 			nvme_suspend_queue(nvmeq);
 			nvme_clear_queue(nvmeq);
 		}
@@ -2383,18 +2389,10 @@ static int nvme_remove_dead_ctrl(void *arg)
 
 static void nvme_remove_disks(struct work_struct *ws)
 {
-	int i;
 	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
 
 	nvme_dev_remove(dev);
-	spin_lock(&dev_list_lock);
-	for (i = dev->queue_count - 1; i > 0; i--) {
-		BUG_ON(!dev->queues[i] || !dev->queues[i]->q_suspended);
-		nvme_free_queue(dev->queues[i]);
-		dev->queue_count--;
-		dev->queues[i] = NULL;
-	}
-	spin_unlock(&dev_list_lock);
+	nvme_free_queues(dev, 1);
 }
 
 static int nvme_dev_resume(struct nvme_dev *dev)
@@ -2526,6 +2524,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_dev_remove(dev);
 	nvme_dev_shutdown(dev);
 	nvme_free_queues(dev, 0);
+	rcu_barrier();
 	nvme_release_instance(dev);
 	nvme_release_prp_pools(dev);
 	kref_put(&dev->kref, nvme_free_dev);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 69ae03f6eb15..98d367b06f9c 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -73,7 +73,7 @@ enum {
  */
 struct nvme_dev {
 	struct list_head node;
-	struct nvme_queue **queues;
+	struct nvme_queue __rcu **queues;
 	u32 __iomem *dbs;
 	struct pci_dev *pci_dev;
 	struct dma_pool *prp_page_pool;
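A note on the ordering the patch establishes, condensed from the hunks above
(paraphrased, not a literal excerpt): the __rcu annotation on dev->queues is
what makes sparse complain about plain dereferences, which is why raw_nvmeq()
uses rcu_dereference_raw() on the initialization and shutdown paths mentioned
in the commit message. On removal the sequence is roughly

	rcu_assign_pointer(dev->queues[i], NULL);	/* unpublish the queue */
	call_rcu(&nvmeq->r_head, nvme_free_queue);	/* free after a grace period */
	...
	rcu_barrier();					/* wait for queued callbacks */
	/* only then release resources the callbacks may still touch */

so a reader that obtained a queue pointer under rcu_read_lock() always drains
before nvme_free_queue() runs, and all nvme_free_queue() callbacks complete
before the rest of nvme_remove() tears the device down.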