author		Keith Busch <keith.busch@intel.com>	2014-02-21 16:13:44 -0500
committer	Matthew Wilcox <matthew.r.wilcox@intel.com>	2014-03-24 08:45:57 -0400
commit		5a92e700af2e5e0e6404988d6a7f2ed3dad3f46f
tree		0de62a3873a05bb0e21fc5195be6ff20af5366cf
parent		fb35e914b3f88cda9ee6f9d776910c35269c4ecf
NVMe: RCU protected access to io queues
This adds RCU-protected access to nvme_queue to fix a race between a
surprise removal freeing the queue and a thread with an open reference
on an NVMe block device still using that queue.
The queues do not need to be RCU-protected during the initialization or
shutdown paths, so I've added a helper function for raw dereferencing
to get around the sparse errors.
There is still a hole in the IOCTL path for the same problem, which is
fixed in a subsequent patch.
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
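
For readers less familiar with the pattern, the sketch below is illustrative only (the example_* names are hypothetical, not the driver's symbols). It shows the RCU discipline the patch applies: readers pin a queue inside an RCU read-side critical section, init/shutdown paths use a raw accessor to keep sparse quiet, and removal unpublishes the pointer before deferring the free with call_rcu().

/* Minimal sketch of the RCU pattern used by this patch; illustrative only. */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/smp.h>

struct example_queue {
	struct rcu_head r_head;			/* for deferred free via call_rcu() */
	/* ... per-queue state ... */
};

struct example_dev {
	struct example_queue __rcu **queues;	/* __rcu lets sparse check accesses */
};

/* Init/shutdown paths: no concurrent readers, so a raw dereference is fine. */
static struct example_queue *example_raw_queue(struct example_dev *dev, int qid)
{
	return rcu_dereference_raw(dev->queues[qid]);
}

/* Reader side: the queue stays valid until the matching put. */
static struct example_queue *example_get_queue(struct example_dev *dev) __acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(dev->queues[get_cpu() + 1]);
}

static void example_put_queue(struct example_queue *q) __releases(RCU)
{
	put_cpu();
	rcu_read_unlock();
}

/* RCU callback: runs only after all pre-existing readers have finished. */
static void example_free_queue(struct rcu_head *r)
{
	kfree(container_of(r, struct example_queue, r_head));
}

/* Updater side: unpublish the pointer, then defer the free past readers. */
static void example_remove_queue(struct example_dev *dev, int qid)
{
	struct example_queue *q = example_raw_queue(dev, qid);

	rcu_assign_pointer(dev->queues[qid], NULL);
	call_rcu(&q->r_head, example_free_queue);
}

The rcu_barrier() added to nvme_remove() below then waits for all outstanding call_rcu() callbacks to finish before the rest of the device state is torn down.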
-rw-r--r--	drivers/block/nvme-core.c	| 91
-rw-r--r--	include/linux/nvme.h		|  2
2 files changed, 46 insertions, 47 deletions
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index f565212a9e32..b66ab1db4629 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -74,6 +74,7 @@ struct async_cmd_info {
  * commands and one for I/O commands).
  */
 struct nvme_queue {
+	struct rcu_head r_head;
 	struct device *q_dmadev;
 	struct nvme_dev *dev;
 	char irqname[24];	/* nvme4294967295-65535\0 */
@@ -262,14 +263,21 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
 	return ctx;
 }
 
-struct nvme_queue *get_nvmeq(struct nvme_dev *dev)
+static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
 {
-	return dev->queues[get_cpu() + 1];
+	return rcu_dereference_raw(dev->queues[qid]);
 }
 
-void put_nvmeq(struct nvme_queue *nvmeq)
+struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
+{
+	rcu_read_lock();
+	return rcu_dereference(dev->queues[get_cpu() + 1]);
+}
+
+void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
 {
 	put_cpu();
+	rcu_read_unlock();
 }
 
 /**
@@ -852,13 +860,14 @@ static int nvme_submit_async_cmd(struct nvme_queue *nvmeq,
 int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
 						u32 *result)
 {
-	return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT);
+	return nvme_submit_sync_cmd(raw_nvmeq(dev, 0), cmd, result,
+								ADMIN_TIMEOUT);
 }
 
 static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
 		struct nvme_command *cmd, struct async_cmd_info *cmdinfo)
 {
-	return nvme_submit_async_cmd(dev->queues[0], cmd, cmdinfo,
+	return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo,
 						ADMIN_TIMEOUT);
 }
 
@@ -985,6 +994,7 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
 	struct nvme_command cmd;
 	struct nvme_dev *dev = nvmeq->dev;
 	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+	struct nvme_queue *adminq;
 
 	if (!nvmeq->qid || info[cmdid].aborted) {
 		if (work_busy(&dev->reset_work))
@@ -1001,7 +1011,8 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
 	if (!dev->abort_limit)
 		return;
 
-	a_cmdid = alloc_cmdid(dev->queues[0], CMD_CTX_ABORT, special_completion,
+	adminq = rcu_dereference(dev->queues[0]);
+	a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion,
 						ADMIN_TIMEOUT);
 	if (a_cmdid < 0)
 		return;
@@ -1018,7 +1029,7 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
 
 	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid,
 							nvmeq->qid);
-	nvme_submit_cmd(dev->queues[0], &cmd);
+	nvme_submit_cmd(adminq, &cmd);
 }
 
 /**
@@ -1055,8 +1066,10 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
 	}
 }
 
-static void nvme_free_queue(struct nvme_queue *nvmeq)
+static void nvme_free_queue(struct rcu_head *r)
 {
+	struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);
+
 	spin_lock_irq(&nvmeq->q_lock);
 	while (bio_list_peek(&nvmeq->sq_cong)) {
 		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
@@ -1075,10 +1088,13 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
 {
 	int i;
 
+	for (i = num_possible_cpus(); i > dev->queue_count - 1; i--)
+		rcu_assign_pointer(dev->queues[i], NULL);
 	for (i = dev->queue_count - 1; i >= lowest; i--) {
-		nvme_free_queue(dev->queues[i]);
+		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
+		rcu_assign_pointer(dev->queues[i], NULL);
+		call_rcu(&nvmeq->r_head, nvme_free_queue);
 		dev->queue_count--;
-		dev->queues[i] = NULL;
 	}
 }
 
@@ -1116,7 +1132,7 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq)
 
 static void nvme_disable_queue(struct nvme_dev *dev, int qid)
 {
-	struct nvme_queue *nvmeq = dev->queues[qid];
+	struct nvme_queue *nvmeq = raw_nvmeq(dev, qid);
 
 	if (!nvmeq)
 		return;
@@ -1168,6 +1184,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	nvmeq->qid = qid;
 	nvmeq->q_suspended = 1;
 	dev->queue_count++;
+	rcu_assign_pointer(dev->queues[qid], nvmeq);
 
 	return nvmeq;
 
@@ -1311,12 +1328,11 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	if (result < 0)
 		return result;
 
-	nvmeq = dev->queues[0];
+	nvmeq = raw_nvmeq(dev, 0);
 	if (!nvmeq) {
 		nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
 		if (!nvmeq)
 			return -ENOMEM;
-		dev->queues[0] = nvmeq;
 	}
 
 	aqa = nvmeq->q_depth - 1;
@@ -1581,8 +1597,8 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
 	if (length != cmd.data_len)
 		status = -ENOMEM;
 	else
-		status = nvme_submit_sync_cmd(dev->queues[0], &c, &cmd.result,
-								timeout);
+		status = nvme_submit_sync_cmd(raw_nvmeq(dev, 0), &c,
-							&cmd.result, timeout);
 
 	if (cmd.data_len) {
 		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
@@ -1701,8 +1717,10 @@ static int nvme_kthread(void *data)
 				queue_work(nvme_workq, &dev->reset_work);
 				continue;
 			}
+			rcu_read_lock();
 			for (i = 0; i < dev->queue_count; i++) {
-				struct nvme_queue *nvmeq = dev->queues[i];
+				struct nvme_queue *nvmeq =
+						rcu_dereference(dev->queues[i]);
 				if (!nvmeq)
 					continue;
 				spin_lock_irq(&nvmeq->q_lock);
@@ -1714,6 +1732,7 @@ static int nvme_kthread(void *data)
  unlock:
 				spin_unlock_irq(&nvmeq->q_lock);
 			}
+			rcu_read_unlock();
 		}
 		spin_unlock(&dev_list_lock);
 		schedule_timeout(round_jiffies_relative(HZ));
@@ -1808,7 +1827,7 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
 
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
-	struct nvme_queue *adminq = dev->queues[0];
+	struct nvme_queue *adminq = raw_nvmeq(dev, 0);
 	struct pci_dev *pdev = dev->pci_dev;
 	int result, cpu, i, vecs, nr_io_queues, size, q_depth;
 
@@ -1831,7 +1850,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 			size = db_bar_size(dev, nr_io_queues);
 		} while (1);
 		dev->dbs = ((void __iomem *)dev->bar) + 4096;
-		dev->queues[0]->q_db = dev->dbs;
+		adminq->q_db = dev->dbs;
 	}
 
 	/* Deregister the admin queue's interrupt */
@@ -1880,19 +1899,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	}
 
 	/* Free previously allocated queues that are no longer usable */
-	spin_lock(&dev_list_lock);
-	for (i = dev->queue_count - 1; i > nr_io_queues; i--) {
-		struct nvme_queue *nvmeq = dev->queues[i];
-
-		spin_lock_irq(&nvmeq->q_lock);
-		nvme_cancel_ios(nvmeq, false);
-		spin_unlock_irq(&nvmeq->q_lock);
-
-		nvme_free_queue(nvmeq);
-		dev->queue_count--;
-		dev->queues[i] = NULL;
-	}
-	spin_unlock(&dev_list_lock);
+	nvme_free_queues(dev, nr_io_queues);
 
 	cpu = cpumask_first(cpu_online_mask);
 	for (i = 0; i < nr_io_queues; i++) {
@@ -1903,8 +1910,7 @@
 	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
 								NVME_Q_DEPTH);
 	for (i = dev->queue_count - 1; i < nr_io_queues; i++) {
-		dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i);
-		if (!dev->queues[i + 1]) {
+		if (!nvme_alloc_queue(dev, i + 1, q_depth, i)) {
 			result = -ENOMEM;
 			goto free_queues;
 		}
@@ -1912,11 +1918,11 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 
 	for (; i < num_possible_cpus(); i++) {
 		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
-		dev->queues[i + 1] = dev->queues[target + 1];
+		rcu_assign_pointer(dev->queues[i + 1], dev->queues[target + 1]);
 	}
 
 	for (i = 1; i < dev->queue_count; i++) {
-		result = nvme_create_queue(dev->queues[i], i);
+		result = nvme_create_queue(raw_nvmeq(dev, i), i);
 		if (result) {
 			for (--i; i > 0; i--)
 				nvme_disable_queue(dev, i);
@@ -2180,7 +2186,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
 	atomic_set(&dq.refcount, 0);
 	dq.worker = &worker;
 	for (i = dev->queue_count - 1; i > 0; i--) {
-		struct nvme_queue *nvmeq = dev->queues[i];
+		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
 
 		if (nvme_suspend_queue(nvmeq))
 			continue;
@@ -2205,7 +2211,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
 
 	if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) {
 		for (i = dev->queue_count - 1; i >= 0; i--) {
-			struct nvme_queue *nvmeq = dev->queues[i];
+			struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
 			nvme_suspend_queue(nvmeq);
 			nvme_clear_queue(nvmeq);
 		}
@@ -2383,18 +2389,10 @@ static int nvme_remove_dead_ctrl(void *arg)
 
 static void nvme_remove_disks(struct work_struct *ws)
 {
-	int i;
 	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
 
 	nvme_dev_remove(dev);
-	spin_lock(&dev_list_lock);
-	for (i = dev->queue_count - 1; i > 0; i--) {
-		BUG_ON(!dev->queues[i] || !dev->queues[i]->q_suspended);
-		nvme_free_queue(dev->queues[i]);
-		dev->queue_count--;
-		dev->queues[i] = NULL;
-	}
-	spin_unlock(&dev_list_lock);
+	nvme_free_queues(dev, 1);
 }
 
 static int nvme_dev_resume(struct nvme_dev *dev)
@@ -2526,6 +2524,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_dev_remove(dev);
 	nvme_dev_shutdown(dev);
 	nvme_free_queues(dev, 0);
+	rcu_barrier();
 	nvme_release_instance(dev);
 	nvme_release_prp_pools(dev);
 	kref_put(&dev->kref, nvme_free_dev);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 69ae03f6eb15..98d367b06f9c 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -73,7 +73,7 @@ enum {
  */
 struct nvme_dev {
 	struct list_head node;
-	struct nvme_queue **queues;
+	struct nvme_queue __rcu **queues;
 	u32 __iomem *dbs;
 	struct pci_dev *pci_dev;
 	struct dma_pool *prp_page_pool;