author     Keith Busch <keith.busch@intel.com>          2014-03-24 12:46:25 -0400
committer  Matthew Wilcox <matthew.r.wilcox@intel.com>  2014-04-10 17:03:15 -0400
commit     42f614201e80ff4cfb8b285d7190149a8e1e6cec (patch)
tree       03a71487b6015ccca44d00053b1643193926f04f /drivers/block
parent     6eb0d698efa9c2a35ec3ca958699717c603f85ee (diff)
NVMe: per-cpu io queues
The device's IO queues are associated with CPUs, so we can use a per-cpu
variable to map a qid to a cpu. This provides a convenient way to optimally
assign queues to multiple cpus when the device supports fewer queues than the
host has cpus. The previous implementation may have assigned these poorly in
these situations. This patch addresses this by sharing queues among cpus that
are "close" together and should have a lower lock contention penalty.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/nvme-core.c  204
1 file changed, 167 insertions(+), 37 deletions(-)
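Before the diff itself, here is a minimal sketch of the per-cpu lookup pattern the commit message describes, assuming a dynamically allocated per-cpu variable as in the patch. The nvme_dev_sketch, get_nvmeq_sketch and put_nvmeq_sketch names are hypothetical stand-ins for the driver's own structures and its get_nvmeq()/put_nvmeq() helpers shown below; only get_cpu_var()/put_cpu_var(), per-cpu allocation and the RCU calls are real kernel APIs.

/* Sketch only: per-cpu qid lookup; not the driver's actual definitions. */
#include <linux/percpu.h>
#include <linux/rcupdate.h>

struct nvme_queue;				/* opaque here */

struct nvme_dev_sketch {
	unsigned short __percpu *io_queue;	/* per-cpu: qid this cpu submits on */
	struct nvme_queue __rcu **queues;	/* qid -> queue, RCU protected */
};

static struct nvme_queue *get_nvmeq_sketch(struct nvme_dev_sketch *dev)
{
	/* get_cpu_var() disables preemption and reads this cpu's slot */
	unsigned queue_id = get_cpu_var(*dev->io_queue);

	rcu_read_lock();
	return rcu_dereference(dev->queues[queue_id]);
}

static void put_nvmeq_sketch(struct nvme_dev_sketch *dev)
{
	rcu_read_unlock();
	put_cpu_var(*dev->io_queue);		/* re-enable preemption */
}

The assignment side (nvme_assign_io_queues() in the diff) fills each cpu's io_queue slot so that cpus sharing a queue are topologically close, which is what keeps the per-queue submission lock lightly contended.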
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e9495f0bfad3..48d7bd55207a 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -20,6 +20,7 @@
 #include <linux/bio.h>
 #include <linux/bitops.h>
 #include <linux/blkdev.h>
+#include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -35,6 +36,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/pci.h>
+#include <linux/percpu.h>
 #include <linux/poison.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
@@ -96,6 +98,7 @@ struct nvme_queue {
 	u8 cq_phase;
 	u8 cqe_seen;
 	u8 q_suspended;
+	cpumask_var_t cpu_mask;
 	struct async_cmd_info cmdinfo;
 	unsigned long cmdid_data[];
 };
@@ -270,14 +273,15 @@ static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
 
 static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
 {
+	unsigned queue_id = get_cpu_var(*dev->io_queue);
 	rcu_read_lock();
-	return rcu_dereference(dev->queues[get_cpu() + 1]);
+	return rcu_dereference(dev->queues[queue_id]);
 }
 
 static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
 {
-	put_cpu();
 	rcu_read_unlock();
+	put_cpu_var(nvmeq->dev->io_queue);
 }
 
 static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
@@ -1121,6 +1125,8 @@ static void nvme_free_queue(struct rcu_head *r)
 					(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
 	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
 					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+	if (nvmeq->qid)
+		free_cpumask_var(nvmeq->cpu_mask);
 	kfree(nvmeq);
 }
 
@@ -1128,8 +1134,6 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
 {
 	int i;
 
-	for (i = num_possible_cpus(); i > dev->queue_count - 1; i--)
-		rcu_assign_pointer(dev->queues[i], NULL);
 	for (i = dev->queue_count - 1; i >= lowest; i--) {
 		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
 		rcu_assign_pointer(dev->queues[i], NULL);
@@ -1154,6 +1158,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 		return 1;
 	}
 	nvmeq->q_suspended = 1;
+	nvmeq->dev->online_queues--;
 	spin_unlock_irq(&nvmeq->q_lock);
 
 	irq_set_affinity_hint(vector, NULL);
@@ -1208,6 +1213,9 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	if (!nvmeq->sq_cmds)
 		goto free_cqdma;
 
+	if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL))
+		goto free_sqdma;
+
 	nvmeq->q_dmadev = dmadev;
 	nvmeq->dev = dev;
 	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
@@ -1228,6 +1236,9 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 
 	return nvmeq;
 
+ free_sqdma:
+	dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds,
+							nvmeq->sq_dma_addr);
  free_cqdma:
 	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
 							nvmeq->cq_dma_addr);
@@ -1260,6 +1271,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
 	nvme_cancel_ios(nvmeq, false);
 	nvmeq->q_suspended = 0;
+	dev->online_queues++;
 }
 
 static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
@@ -1835,6 +1847,143 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
 	return NULL;
 }
 
+static int nvme_find_closest_node(int node)
+{
+	int n, val, min_val = INT_MAX, best_node = node;
+
+	for_each_online_node(n) {
+		if (n == node)
+			continue;
+		val = node_distance(node, n);
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+	return best_node;
+}
+
+static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq,
+								int count)
+{
+	int cpu;
+	for_each_cpu(cpu, qmask) {
+		if (cpumask_weight(nvmeq->cpu_mask) >= count)
+			break;
+		if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask))
+			*per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid;
+	}
+}
+
+static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
+	const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue)
+{
+	int next_cpu;
+	for_each_cpu(next_cpu, new_mask) {
+		cpumask_or(mask, mask, get_cpu_mask(next_cpu));
+		cpumask_or(mask, mask, topology_thread_cpumask(next_cpu));
+		cpumask_and(mask, mask, unassigned_cpus);
+		nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue);
+	}
+}
+
+static void nvme_create_io_queues(struct nvme_dev *dev)
+{
+	unsigned i, max;
+
+	max = min(dev->max_qid, num_online_cpus());
+	for (i = dev->queue_count; i <= max; i++)
+		if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1))
+			break;
+
+	max = min(dev->queue_count - 1, num_online_cpus());
+	for (i = dev->online_queues; i <= max; i++)
+		if (nvme_create_queue(raw_nvmeq(dev, i), i))
+			break;
+}
+
+/*
+ * If there are fewer queues than online cpus, this will try to optimally
+ * assign a queue to multiple cpus by grouping cpus that are "close" together:
+ * thread siblings, core, socket, closest node, then whatever else is
+ * available.
+ */
+static void nvme_assign_io_queues(struct nvme_dev *dev)
+{
+	unsigned cpu, cpus_per_queue, queues, remainder, i;
+	cpumask_var_t unassigned_cpus;
+
+	nvme_create_io_queues(dev);
+
+	queues = min(dev->online_queues - 1, num_online_cpus());
+	if (!queues)
+		return;
+
+	cpus_per_queue = num_online_cpus() / queues;
+	remainder = queues - (num_online_cpus() - queues * cpus_per_queue);
+
+	if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL))
+		return;
+
+	cpumask_copy(unassigned_cpus, cpu_online_mask);
+	cpu = cpumask_first(unassigned_cpus);
+	for (i = 1; i <= queues; i++) {
+		struct nvme_queue *nvmeq = lock_nvmeq(dev, i);
+		cpumask_t mask;
+
+		cpumask_clear(nvmeq->cpu_mask);
+		if (!cpumask_weight(unassigned_cpus)) {
+			unlock_nvmeq(nvmeq);
+			break;
+		}
+
+		mask = *get_cpu_mask(cpu);
+		nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, unassigned_cpus,
+				topology_thread_cpumask(cpu),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, unassigned_cpus,
+				topology_core_cpumask(cpu),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, unassigned_cpus,
+				cpumask_of_node(cpu_to_node(cpu)),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, unassigned_cpus,
+				cpumask_of_node(
+					nvme_find_closest_node(
+						cpu_to_node(cpu))),
+				nvmeq, cpus_per_queue);
+		if (cpus_weight(mask) < cpus_per_queue)
+			nvme_add_cpus(&mask, unassigned_cpus,
+				unassigned_cpus,
+				nvmeq, cpus_per_queue);
+
+		WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue,
+			"nvme%d qid:%d mis-matched queue-to-cpu assignment\n",
+			dev->instance, i);
+
+		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
+							nvmeq->cpu_mask);
+		cpumask_andnot(unassigned_cpus, unassigned_cpus,
+						nvmeq->cpu_mask);
+		cpu = cpumask_next(cpu, unassigned_cpus);
+		if (remainder && !--remainder)
+			cpus_per_queue++;
+		unlock_nvmeq(nvmeq);
+	}
+	WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n",
+								dev->instance);
+	i = 0;
+	cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
+	for_each_cpu(cpu, unassigned_cpus)
+		*per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1;
+	free_cpumask_var(unassigned_cpus);
+}
+
 static int set_queue_count(struct nvme_dev *dev, int count)
 {
 	int status;
@@ -1857,9 +2006,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	struct nvme_queue *adminq = raw_nvmeq(dev, 0);
 	struct pci_dev *pdev = dev->pci_dev;
-	int result, cpu, i, vecs, nr_io_queues, size, q_depth;
+	int result, i, vecs, nr_io_queues, size;
 
-	nr_io_queues = num_online_cpus();
+	nr_io_queues = num_possible_cpus();
 	result = set_queue_count(dev, nr_io_queues);
 	if (result < 0)
 		return result;
@@ -1919,6 +2068,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	 * number of interrupts.
 	 */
 	nr_io_queues = vecs;
+	dev->max_qid = nr_io_queues;
 
 	result = queue_request_irq(dev, adminq, adminq->irqname);
 	if (result) {
@@ -1927,36 +2077,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	}
 
 	/* Free previously allocated queues that are no longer usable */
-	nvme_free_queues(dev, nr_io_queues);
-
-	cpu = cpumask_first(cpu_online_mask);
-	for (i = 0; i < nr_io_queues; i++) {
-		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
-		cpu = cpumask_next(cpu, cpu_online_mask);
-	}
-
-	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
-								NVME_Q_DEPTH);
-	for (i = dev->queue_count - 1; i < nr_io_queues; i++) {
-		if (!nvme_alloc_queue(dev, i + 1, q_depth, i)) {
-			result = -ENOMEM;
-			goto free_queues;
-		}
-	}
-
-	for (; i < num_possible_cpus(); i++) {
-		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
-		rcu_assign_pointer(dev->queues[i + 1], dev->queues[target + 1]);
-	}
-
-	for (i = 1; i < dev->queue_count; i++) {
-		result = nvme_create_queue(raw_nvmeq(dev, i), i);
-		if (result) {
-			for (--i; i > 0; i--)
-				nvme_disable_queue(dev, i);
-			goto free_queues;
-		}
-	}
+	nvme_free_queues(dev, nr_io_queues + 1);
+	nvme_assign_io_queues(dev);
 
 	return 0;
 
@@ -2035,6 +2157,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 
 static int nvme_dev_map(struct nvme_dev *dev)
 {
+	u64 cap;
 	int bars, result = -ENOMEM;
 	struct pci_dev *pdev = dev->pci_dev;
 
@@ -2058,7 +2181,9 @@ static int nvme_dev_map(struct nvme_dev *dev)
 		result = -ENODEV;
 		goto unmap;
 	}
-	dev->db_stride = 1 << NVME_CAP_STRIDE(readq(&dev->bar->cap));
+	cap = readq(&dev->bar->cap);
+	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
+	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
 	dev->dbs = ((void __iomem *)dev->bar) + 4096;
 
 	return 0;
@@ -2332,6 +2457,7 @@ static void nvme_free_dev(struct kref *kref)
 	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
 
 	nvme_free_namespaces(dev);
+	free_percpu(dev->io_queue);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);
@@ -2477,6 +2603,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 								GFP_KERNEL);
 	if (!dev->queues)
 		goto free;
+	dev->io_queue = alloc_percpu(unsigned short);
+	if (!dev->io_queue)
+		goto free;
 
 	INIT_LIST_HEAD(&dev->namespaces);
 	INIT_WORK(&dev->reset_work, nvme_reset_failed_dev);
@@ -2526,6 +2655,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  release:
 	nvme_release_instance(dev);
  free:
+	free_percpu(dev->io_queue);
 	kfree(dev->queues);
 	kfree(dev->entry);
 	kfree(dev);