-rw-r--r-- | Documentation/ABI/testing/sysfs-bus-pci | 24
-rw-r--r-- | Documentation/driver-api/index.rst | 2
-rw-r--r-- | Documentation/driver-api/pci/index.rst | 22
-rw-r--r-- | Documentation/driver-api/pci/p2pdma.rst | 145
-rw-r--r-- | Documentation/driver-api/pci/pci.rst (renamed from Documentation/driver-api/pci.rst) | 0
-rw-r--r-- | drivers/infiniband/core/rw.c | 11
-rw-r--r-- | drivers/nvme/host/core.c | 4
-rw-r--r-- | drivers/nvme/host/nvme.h | 1
-rw-r--r-- | drivers/nvme/host/pci.c | 97
-rw-r--r-- | drivers/nvme/target/configfs.c | 47
-rw-r--r-- | drivers/nvme/target/core.c | 180
-rw-r--r-- | drivers/nvme/target/io-cmd-bdev.c | 3
-rw-r--r-- | drivers/nvme/target/nvmet.h | 17
-rw-r--r-- | drivers/nvme/target/rdma.c | 22
-rw-r--r-- | drivers/pci/Kconfig | 17
-rw-r--r-- | drivers/pci/Makefile | 1
-rw-r--r-- | drivers/pci/p2pdma.c | 805
-rw-r--r-- | include/linux/blkdev.h | 3
-rw-r--r-- | include/linux/memremap.h | 6
-rw-r--r-- | include/linux/mm.h | 18
-rw-r--r-- | include/linux/pci-p2pdma.h | 114
-rw-r--r-- | include/linux/pci.h | 4
22 files changed, 1493 insertions, 50 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 44d4b2be92fd..8bfee557e50e 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -323,3 +323,27 @@ Description: | |||
323 | 323 | ||
324 | This is similar to /sys/bus/pci/drivers_autoprobe, but | 324 | This is similar to /sys/bus/pci/drivers_autoprobe, but |
325 | affects only the VFs associated with a specific PF. | 325 | affects only the VFs associated with a specific PF. |
326 | |||
327 | What: /sys/bus/pci/devices/.../p2pmem/size | ||
328 | Date: November 2017 | ||
329 | Contact: Logan Gunthorpe <logang@deltatee.com> | ||
330 | Description: | ||
331 | If the device has any Peer-to-Peer memory registered, this | ||
332 | file contains the total amount of memory that the device | ||
333 | provides (in decimal). | ||
334 | |||
335 | What: /sys/bus/pci/devices/.../p2pmem/available | ||
336 | Date: November 2017 | ||
337 | Contact: Logan Gunthorpe <logang@deltatee.com> | ||
338 | Description: | ||
339 | If the device has any Peer-to-Peer memory registered, this | ||
340 | file contains the amount of memory that has not been | ||
341 | allocated (in decimal). | ||
342 | |||
343 | What: /sys/bus/pci/devices/.../p2pmem/published | ||
344 | Date: November 2017 | ||
345 | Contact: Logan Gunthorpe <logang@deltatee.com> | ||
346 | Description: | ||
347 | If the device has any Peer-to-Peer memory registered, this | ||
348 | file contains a '1' if the memory has been published for | ||
349 | use outside the driver that owns the device. | ||
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 6d9f2f9fe20e..e9e7d24169cf 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -29,7 +29,7 @@ available subsections can be seen below. | |||
29 | iio/index | 29 | iio/index |
30 | input | 30 | input |
31 | usb/index | 31 | usb/index |
32 | pci | 32 | pci/index |
33 | spi | 33 | spi |
34 | i2c | 34 | i2c |
35 | hsi | 35 | hsi |
diff --git a/Documentation/driver-api/pci/index.rst b/Documentation/driver-api/pci/index.rst
new file mode 100644
index 000000000000..c6cf1fef61ce
--- /dev/null
+++ b/Documentation/driver-api/pci/index.rst
@@ -0,0 +1,22 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | ============================================ | ||
4 | The Linux PCI driver implementer's API guide | ||
5 | ============================================ | ||
6 | |||
7 | .. class:: toc-title | ||
8 | |||
9 | Table of contents | ||
10 | |||
11 | .. toctree:: | ||
12 | :maxdepth: 2 | ||
13 | |||
14 | pci | ||
15 | p2pdma | ||
16 | |||
17 | .. only:: subproject and html | ||
18 | |||
19 | Indices | ||
20 | ======= | ||
21 | |||
22 | * :ref:`genindex` | ||
diff --git a/Documentation/driver-api/pci/p2pdma.rst b/Documentation/driver-api/pci/p2pdma.rst
new file mode 100644
index 000000000000..4c577fa7bef9
--- /dev/null
+++ b/Documentation/driver-api/pci/p2pdma.rst
@@ -0,0 +1,145 @@ | |||
1 | .. SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | ============================ | ||
4 | PCI Peer-to-Peer DMA Support | ||
5 | ============================ | ||
6 | |||
7 | The PCI bus has pretty decent support for performing DMA transfers | ||
8 | between two devices on the bus. This type of transaction is henceforth | ||
9 | called Peer-to-Peer (or P2P). However, there are a number of issues that | ||
10 | make P2P transactions tricky to do in a perfectly safe way. | ||
11 | |||
12 | One of the biggest issues is that PCI doesn't require forwarding | ||
13 | transactions between hierarchy domains, and in PCIe, each Root Port | ||
14 | defines a separate hierarchy domain. To make things worse, there is no | ||
15 | simple way to determine if a given Root Complex supports this or not. | ||
16 | (See PCIe r4.0, sec 1.3.1). Therefore, as of this writing, the kernel | ||
17 | only supports doing P2P when the endpoints involved are all behind the | ||
18 | same PCI bridge: such devices are all in the same PCI hierarchy | ||
19 | domain, and the spec guarantees that all transactions within the | ||
20 | hierarchy will be routable; it does not, however, require routing | ||
21 | between hierarchies. | ||
22 | |||
23 | The second issue is that to make use of existing interfaces in Linux, | ||
24 | memory that is used for P2P transactions needs to be backed by struct | ||
25 | pages. However, PCI BARs are not typically cache coherent, so these | ||
26 | pages come with a few corner-case gotchas, and developers need to | ||
27 | be careful about what they do with them. | ||
28 | |||
29 | |||
30 | Driver Writer's Guide | ||
31 | ===================== | ||
32 | |||
33 | In a given P2P implementation there may be three or more different | ||
34 | types of kernel drivers in play: | ||
35 | |||
36 | * Provider - A driver which provides or publishes P2P resources like | ||
37 | memory or doorbell registers to other drivers. | ||
38 | * Client - A driver which makes use of a resource by setting up a | ||
39 | DMA transaction to or from it. | ||
40 | * Orchestrator - A driver which orchestrates the flow of data between | ||
41 | clients and providers. | ||
42 | |||
43 | In many cases there could be overlap between these three types (e.g., | ||
44 | it is common for a driver to be both a provider and a client). | ||
45 | |||
46 | For example, in the NVMe Target Copy Offload implementation: | ||
47 | |||
48 | * The NVMe PCI driver is a client, a provider, and an orchestrator | ||
49 | in that it exposes any CMB (Controller Memory Buffer) as a P2P memory | ||
50 | resource (provider), it accepts P2P memory pages as buffers in requests | ||
51 | to be used directly (client), and it can also make use of the CMB as | ||
52 | submission queue entries (orchestrator). | ||
53 | * The RDMA driver is a client in this arrangement so that an RNIC | ||
54 | can DMA directly to the memory exposed by the NVMe device. | ||
55 | * The NVMe Target driver (nvmet) can orchestrate the data from the RNIC | ||
56 | to the P2P memory (CMB) and then to the NVMe device (and vice versa). | ||
57 | |||
58 | This is currently the only arrangement supported by the kernel, but | ||
59 | one could imagine slight tweaks to this that would allow for the same | ||
60 | functionality. For example, if a specific RNIC added a BAR with some | ||
61 | memory behind it, its driver could add support as a P2P provider and | ||
62 | then the NVMe Target could use the RNIC's memory instead of the CMB | ||
63 | in cases where the NVMe cards in use do not have CMB support. | ||
64 | |||
65 | |||
66 | Provider Drivers | ||
67 | ---------------- | ||
68 | |||
69 | A provider simply needs to register a BAR (or a portion of a BAR) | ||
70 | as a P2P DMA resource using :c:func:`pci_p2pdma_add_resource()`. | ||
71 | This will register struct pages for all the specified memory. | ||
72 | |||
73 | After that it may optionally publish all of its resources as | ||
74 | P2P memory using :c:func:`pci_p2pmem_publish()`. This will allow | ||
75 | any orchestrator drivers to find and use the memory. When marked in | ||
76 | this way, the resource must be regular memory with no side effects. | ||
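
A minimal sketch of a provider's probe routine (the BAR number used
here is hypothetical; a real driver would pick the BAR that actually
holds its exposable memory)::

    static int example_provider_probe(struct pci_dev *pdev,
                                      const struct pci_device_id *id)
    {
            int error;

            /* Register all of BAR 4 (size 0 means the whole BAR) */
            error = pci_p2pdma_add_resource(pdev, 4, 0, 0);
            if (error)
                    return error;

            /* Optionally expose the memory to orchestrators */
            pci_p2pmem_publish(pdev, true);

            return 0;
    }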
77 | |||
78 | For the time being this is fairly rudimentary in that all resources | ||
79 | are typically going to be P2P memory. Future work will likely expand | ||
80 | this to include other types of resources like doorbells. | ||
81 | |||
82 | |||
83 | Client Drivers | ||
84 | -------------- | ||
85 | |||
86 | A client driver typically only has to conditionally change its DMA map | ||
87 | routine to use the mapping function :c:func:`pci_p2pdma_map_sg()` instead | ||
88 | of the usual :c:func:`dma_map_sg()` function. Memory mapped in this | ||
89 | way does not need to be unmapped. | ||
90 | |||
91 | The client may also, optionally, make use of | ||
92 | :c:func:`is_pci_p2pdma_page()` to determine when to use the P2P mapping | ||
93 | functions and when to use the regular mapping functions. In some | ||
94 | situations, it may be more appropriate to use a flag to indicate a | ||
95 | given request is P2P memory and map appropriately. It is important to | ||
96 | ensure that struct pages that back P2P memory stay out of code that | ||
97 | does not support them, as such code may treat the pages as | ||
98 | regular memory, which may not be appropriate. | ||
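
A minimal sketch of such a conditional mapping path (the example_
function name is hypothetical; the helpers it calls are the ones named
above)::

    static int example_map_sg(struct device *dev, struct scatterlist *sg,
                              int nents, enum dma_data_direction dir)
    {
            if (is_pci_p2pdma_page(sg_page(sg)))
                    /* P2P mappings do not need to be unmapped later */
                    return pci_p2pdma_map_sg(dev, sg, nents, dir);

            return dma_map_sg(dev, sg, nents, dir);
    }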
99 | |||
100 | |||
101 | Orchestrator Drivers | ||
102 | -------------------- | ||
103 | |||
104 | The first task an orchestrator driver must do is compile a list of | ||
105 | all client devices that will be involved in a given transaction. For | ||
106 | example, the NVMe Target driver creates a list including the namespace | ||
107 | block device and the RNIC in use. If the orchestrator has access to | ||
108 | a specific P2P provider to use, it may check compatibility using | ||
109 | :c:func:`pci_p2pdma_distance()`; otherwise, it may find a memory provider | ||
110 | that's compatible with all clients using :c:func:`pci_p2pmem_find()`. | ||
111 | If more than one provider is supported, the one nearest to all the clients will | ||
112 | be chosen first. If more than one provider is an equal distance away, the | ||
113 | one returned will be chosen at random (the choice is truly random, not | ||
114 | merely arbitrary). This function returns the PCI device of the provider | ||
115 | with a reference taken, so when it is no longer needed it should be | ||
116 | returned with pci_dev_put(). | ||
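
A minimal sketch of provider selection (client_a and client_b are
hypothetical struct device pointers for the two clients)::

    struct device *clients[] = { client_a, client_b };
    struct pci_dev *provider;

    provider = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients));
    if (!provider)
            return -ENODEV;

    /* ... allocate from and use the provider's memory ... */

    pci_dev_put(provider);  /* drop the reference taken by the find */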
117 | |||
118 | Once a provider is selected, the orchestrator can then use | ||
119 | :c:func:`pci_alloc_p2pmem()` and :c:func:`pci_free_p2pmem()` to | ||
120 | allocate P2P memory from the provider. :c:func:`pci_p2pmem_alloc_sgl()` | ||
121 | and :c:func:`pci_p2pmem_free_sgl()` are convenience functions for | ||
122 | allocating scatter-gather lists with P2P memory. | ||
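
Continuing the sketch above (transfer_len is a hypothetical transfer
size), allocation and cleanup might look like::

    struct scatterlist *sgl;
    unsigned int nents;

    sgl = pci_p2pmem_alloc_sgl(provider, &nents, transfer_len);
    if (!sgl)
            return -ENOMEM;

    /* ... hand the SGL to the client drivers for DMA ... */

    pci_p2pmem_free_sgl(provider, sgl);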
123 | |||
124 | Struct Page Caveats | ||
125 | ------------------- | ||
126 | |||
127 | Driver writers should be very careful about not passing these special | ||
128 | struct pages to code that isn't prepared for them. At this time, the kernel | ||
129 | interfaces do not have any checks for ensuring this. This obviously | ||
130 | precludes passing these pages to userspace. | ||
131 | |||
132 | P2P memory is also technically IO memory but should never have any side | ||
133 | effects behind it. Thus, the order of loads and stores should not be important | ||
134 | and ioreadX(), iowriteX() and friends should not be necessary. | ||
135 | However, as the memory is not cache coherent, if access ever needs to | ||
136 | be protected by a spinlock then :c:func:`mmiowb()` must be used before | ||
137 | unlocking the lock. (See ACQUIRES VS I/O ACCESSES in | ||
138 | Documentation/memory-barriers.txt) | ||
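
A minimal sketch of that locking pattern (p2p_lock is a hypothetical
spinlock guarding stores to the P2P memory)::

    spin_lock(&p2p_lock);

    /* ... plain loads and stores to the P2P memory ... */

    mmiowb();       /* order the stores before releasing the lock */
    spin_unlock(&p2p_lock);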
139 | |||
140 | |||
141 | P2P DMA Support Library | ||
142 | ======================= | ||
143 | |||
144 | .. kernel-doc:: drivers/pci/p2pdma.c | ||
145 | :export: | ||
diff --git a/Documentation/driver-api/pci.rst b/Documentation/driver-api/pci/pci.rst
index ca85e5e78b2c..ca85e5e78b2c 100644
--- a/Documentation/driver-api/pci.rst
+++ b/Documentation/driver-api/pci/pci.rst
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 683e6d11a564..d22c4a2ebac6 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -12,6 +12,7 @@ | |||
12 | */ | 12 | */ |
13 | #include <linux/moduleparam.h> | 13 | #include <linux/moduleparam.h> |
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/pci-p2pdma.h> | ||
15 | #include <rdma/mr_pool.h> | 16 | #include <rdma/mr_pool.h> |
16 | #include <rdma/rw.h> | 17 | #include <rdma/rw.h> |
17 | 18 | ||
@@ -280,7 +281,11 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, | |||
280 | struct ib_device *dev = qp->pd->device; | 281 | struct ib_device *dev = qp->pd->device; |
281 | int ret; | 282 | int ret; |
282 | 283 | ||
283 | ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); | 284 | if (is_pci_p2pdma_page(sg_page(sg))) |
285 | ret = pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir); | ||
286 | else | ||
287 | ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); | ||
288 | |||
284 | if (!ret) | 289 | if (!ret) |
285 | return -ENOMEM; | 290 | return -ENOMEM; |
286 | sg_cnt = ret; | 291 | sg_cnt = ret; |
@@ -602,7 +607,9 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, | |||
602 | break; | 607 | break; |
603 | } | 608 | } |
604 | 609 | ||
605 | ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); | 610 | /* P2PDMA contexts do not need to be unmapped */ |
611 | if (!is_pci_p2pdma_page(sg_page(sg))) | ||
612 | ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); | ||
606 | } | 613 | } |
607 | EXPORT_SYMBOL(rdma_rw_ctx_destroy); | 614 | EXPORT_SYMBOL(rdma_rw_ctx_destroy); |
608 | 615 | ||
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index dd8ec1dd9219..6033ce2fd3e9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3051,7 +3051,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) | |||
3051 | ns->queue = blk_mq_init_queue(ctrl->tagset); | 3051 | ns->queue = blk_mq_init_queue(ctrl->tagset); |
3052 | if (IS_ERR(ns->queue)) | 3052 | if (IS_ERR(ns->queue)) |
3053 | goto out_free_ns; | 3053 | goto out_free_ns; |
3054 | |||
3054 | blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); | 3055 | blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); |
3056 | if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) | ||
3057 | blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); | ||
3058 | |||
3055 | ns->queue->queuedata = ns; | 3059 | ns->queue->queuedata = ns; |
3056 | ns->ctrl = ctrl; | 3060 | ns->ctrl = ctrl; |
3057 | 3061 | ||
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bb4a2003c097..4030743c90aa 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -343,6 +343,7 @@ struct nvme_ctrl_ops { | |||
343 | unsigned int flags; | 343 | unsigned int flags; |
344 | #define NVME_F_FABRICS (1 << 0) | 344 | #define NVME_F_FABRICS (1 << 0) |
345 | #define NVME_F_METADATA_SUPPORTED (1 << 1) | 345 | #define NVME_F_METADATA_SUPPORTED (1 << 1) |
346 | #define NVME_F_PCI_P2PDMA (1 << 2) | ||
346 | int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); | 347 | int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); |
347 | int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); | 348 | int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); |
348 | int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); | 349 | int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); |
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8991e79b2b87..7e09e45b0b28 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/types.h> | 30 | #include <linux/types.h> |
31 | #include <linux/io-64-nonatomic-lo-hi.h> | 31 | #include <linux/io-64-nonatomic-lo-hi.h> |
32 | #include <linux/sed-opal.h> | 32 | #include <linux/sed-opal.h> |
33 | #include <linux/pci-p2pdma.h> | ||
33 | 34 | ||
34 | #include "nvme.h" | 35 | #include "nvme.h" |
35 | 36 | ||
@@ -99,9 +100,8 @@ struct nvme_dev { | |||
99 | struct work_struct remove_work; | 100 | struct work_struct remove_work; |
100 | struct mutex shutdown_lock; | 101 | struct mutex shutdown_lock; |
101 | bool subsystem; | 102 | bool subsystem; |
102 | void __iomem *cmb; | ||
103 | pci_bus_addr_t cmb_bus_addr; | ||
104 | u64 cmb_size; | 103 | u64 cmb_size; |
104 | bool cmb_use_sqes; | ||
105 | u32 cmbsz; | 105 | u32 cmbsz; |
106 | u32 cmbloc; | 106 | u32 cmbloc; |
107 | struct nvme_ctrl ctrl; | 107 | struct nvme_ctrl ctrl; |
@@ -158,7 +158,7 @@ struct nvme_queue { | |||
158 | struct nvme_dev *dev; | 158 | struct nvme_dev *dev; |
159 | spinlock_t sq_lock; | 159 | spinlock_t sq_lock; |
160 | struct nvme_command *sq_cmds; | 160 | struct nvme_command *sq_cmds; |
161 | struct nvme_command __iomem *sq_cmds_io; | 161 | bool sq_cmds_is_io; |
162 | spinlock_t cq_lock ____cacheline_aligned_in_smp; | 162 | spinlock_t cq_lock ____cacheline_aligned_in_smp; |
163 | volatile struct nvme_completion *cqes; | 163 | volatile struct nvme_completion *cqes; |
164 | struct blk_mq_tags **tags; | 164 | struct blk_mq_tags **tags; |
@@ -447,11 +447,8 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) | |||
447 | static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) | 447 | static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) |
448 | { | 448 | { |
449 | spin_lock(&nvmeq->sq_lock); | 449 | spin_lock(&nvmeq->sq_lock); |
450 | if (nvmeq->sq_cmds_io) | 450 | |
451 | memcpy_toio(&nvmeq->sq_cmds_io[nvmeq->sq_tail], cmd, | 451 | memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); |
452 | sizeof(*cmd)); | ||
453 | else | ||
454 | memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); | ||
455 | 452 | ||
456 | if (++nvmeq->sq_tail == nvmeq->q_depth) | 453 | if (++nvmeq->sq_tail == nvmeq->q_depth) |
457 | nvmeq->sq_tail = 0; | 454 | nvmeq->sq_tail = 0; |
@@ -748,8 +745,13 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, | |||
748 | goto out; | 745 | goto out; |
749 | 746 | ||
750 | ret = BLK_STS_RESOURCE; | 747 | ret = BLK_STS_RESOURCE; |
751 | nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, | 748 | |
752 | DMA_ATTR_NO_WARN); | 749 | if (is_pci_p2pdma_page(sg_page(iod->sg))) |
750 | nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents, | ||
751 | dma_dir); | ||
752 | else | ||
753 | nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, | ||
754 | dma_dir, DMA_ATTR_NO_WARN); | ||
753 | if (!nr_mapped) | 755 | if (!nr_mapped) |
754 | goto out; | 756 | goto out; |
755 | 757 | ||
@@ -791,7 +793,10 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) | |||
791 | DMA_TO_DEVICE : DMA_FROM_DEVICE; | 793 | DMA_TO_DEVICE : DMA_FROM_DEVICE; |
792 | 794 | ||
793 | if (iod->nents) { | 795 | if (iod->nents) { |
794 | dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); | 796 | /* P2PDMA requests do not need to be unmapped */ |
797 | if (!is_pci_p2pdma_page(sg_page(iod->sg))) | ||
798 | dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); | ||
799 | |||
795 | if (blk_integrity_rq(req)) | 800 | if (blk_integrity_rq(req)) |
796 | dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); | 801 | dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); |
797 | } | 802 | } |
@@ -1232,9 +1237,18 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) | |||
1232 | { | 1237 | { |
1233 | dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), | 1238 | dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), |
1234 | (void *)nvmeq->cqes, nvmeq->cq_dma_addr); | 1239 | (void *)nvmeq->cqes, nvmeq->cq_dma_addr); |
1235 | if (nvmeq->sq_cmds) | 1240 | |
1236 | dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), | 1241 | if (nvmeq->sq_cmds) { |
1237 | nvmeq->sq_cmds, nvmeq->sq_dma_addr); | 1242 | if (nvmeq->sq_cmds_is_io) |
1243 | pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev), | ||
1244 | nvmeq->sq_cmds, | ||
1245 | SQ_SIZE(nvmeq->q_depth)); | ||
1246 | else | ||
1247 | dma_free_coherent(nvmeq->q_dmadev, | ||
1248 | SQ_SIZE(nvmeq->q_depth), | ||
1249 | nvmeq->sq_cmds, | ||
1250 | nvmeq->sq_dma_addr); | ||
1251 | } | ||
1238 | } | 1252 | } |
1239 | 1253 | ||
1240 | static void nvme_free_queues(struct nvme_dev *dev, int lowest) | 1254 | static void nvme_free_queues(struct nvme_dev *dev, int lowest) |
@@ -1323,12 +1337,21 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, | |||
1323 | static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, | 1337 | static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, |
1324 | int qid, int depth) | 1338 | int qid, int depth) |
1325 | { | 1339 | { |
1326 | /* CMB SQEs will be mapped before creation */ | 1340 | struct pci_dev *pdev = to_pci_dev(dev->dev); |
1327 | if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) | 1341 | |
1328 | return 0; | 1342 | if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { |
1343 | nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); | ||
1344 | nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, | ||
1345 | nvmeq->sq_cmds); | ||
1346 | nvmeq->sq_cmds_is_io = true; | ||
1347 | } | ||
1348 | |||
1349 | if (!nvmeq->sq_cmds) { | ||
1350 | nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), | ||
1351 | &nvmeq->sq_dma_addr, GFP_KERNEL); | ||
1352 | nvmeq->sq_cmds_is_io = false; | ||
1353 | } | ||
1329 | 1354 | ||
1330 | nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), | ||
1331 | &nvmeq->sq_dma_addr, GFP_KERNEL); | ||
1332 | if (!nvmeq->sq_cmds) | 1355 | if (!nvmeq->sq_cmds) |
1333 | return -ENOMEM; | 1356 | return -ENOMEM; |
1334 | return 0; | 1357 | return 0; |
@@ -1405,13 +1428,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) | |||
1405 | int result; | 1428 | int result; |
1406 | s16 vector; | 1429 | s16 vector; |
1407 | 1430 | ||
1408 | if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { | ||
1409 | unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth), | ||
1410 | dev->ctrl.page_size); | ||
1411 | nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; | ||
1412 | nvmeq->sq_cmds_io = dev->cmb + offset; | ||
1413 | } | ||
1414 | |||
1415 | /* | 1431 | /* |
1416 | * A queue's vector matches the queue identifier unless the controller | 1432 | * A queue's vector matches the queue identifier unless the controller |
1417 | * has only one vector available. | 1433 | * has only one vector available. |
@@ -1652,9 +1668,6 @@ static void nvme_map_cmb(struct nvme_dev *dev) | |||
1652 | return; | 1668 | return; |
1653 | dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); | 1669 | dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); |
1654 | 1670 | ||
1655 | if (!use_cmb_sqes) | ||
1656 | return; | ||
1657 | |||
1658 | size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); | 1671 | size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); |
1659 | offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); | 1672 | offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); |
1660 | bar = NVME_CMB_BIR(dev->cmbloc); | 1673 | bar = NVME_CMB_BIR(dev->cmbloc); |
@@ -1671,11 +1684,18 @@ static void nvme_map_cmb(struct nvme_dev *dev) | |||
1671 | if (size > bar_size - offset) | 1684 | if (size > bar_size - offset) |
1672 | size = bar_size - offset; | 1685 | size = bar_size - offset; |
1673 | 1686 | ||
1674 | dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); | 1687 | if (pci_p2pdma_add_resource(pdev, bar, size, offset)) { |
1675 | if (!dev->cmb) | 1688 | dev_warn(dev->ctrl.device, |
1689 | "failed to register the CMB\n"); | ||
1676 | return; | 1690 | return; |
1677 | dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; | 1691 | } |
1692 | |||
1678 | dev->cmb_size = size; | 1693 | dev->cmb_size = size; |
1694 | dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS); | ||
1695 | |||
1696 | if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == | ||
1697 | (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) | ||
1698 | pci_p2pmem_publish(pdev, true); | ||
1679 | 1699 | ||
1680 | if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, | 1700 | if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, |
1681 | &dev_attr_cmb.attr, NULL)) | 1701 | &dev_attr_cmb.attr, NULL)) |
@@ -1685,12 +1705,10 @@ static void nvme_map_cmb(struct nvme_dev *dev) | |||
1685 | 1705 | ||
1686 | static inline void nvme_release_cmb(struct nvme_dev *dev) | 1706 | static inline void nvme_release_cmb(struct nvme_dev *dev) |
1687 | { | 1707 | { |
1688 | if (dev->cmb) { | 1708 | if (dev->cmb_size) { |
1689 | iounmap(dev->cmb); | ||
1690 | dev->cmb = NULL; | ||
1691 | sysfs_remove_file_from_group(&dev->ctrl.device->kobj, | 1709 | sysfs_remove_file_from_group(&dev->ctrl.device->kobj, |
1692 | &dev_attr_cmb.attr, NULL); | 1710 | &dev_attr_cmb.attr, NULL); |
1693 | dev->cmbsz = 0; | 1711 | dev->cmb_size = 0; |
1694 | } | 1712 | } |
1695 | } | 1713 | } |
1696 | 1714 | ||
@@ -1889,13 +1907,13 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) | |||
1889 | if (nr_io_queues == 0) | 1907 | if (nr_io_queues == 0) |
1890 | return 0; | 1908 | return 0; |
1891 | 1909 | ||
1892 | if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) { | 1910 | if (dev->cmb_use_sqes) { |
1893 | result = nvme_cmb_qdepth(dev, nr_io_queues, | 1911 | result = nvme_cmb_qdepth(dev, nr_io_queues, |
1894 | sizeof(struct nvme_command)); | 1912 | sizeof(struct nvme_command)); |
1895 | if (result > 0) | 1913 | if (result > 0) |
1896 | dev->q_depth = result; | 1914 | dev->q_depth = result; |
1897 | else | 1915 | else |
1898 | nvme_release_cmb(dev); | 1916 | dev->cmb_use_sqes = false; |
1899 | } | 1917 | } |
1900 | 1918 | ||
1901 | do { | 1919 | do { |
@@ -2390,7 +2408,8 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size) | |||
2390 | static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { | 2408 | static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { |
2391 | .name = "pcie", | 2409 | .name = "pcie", |
2392 | .module = THIS_MODULE, | 2410 | .module = THIS_MODULE, |
2393 | .flags = NVME_F_METADATA_SUPPORTED, | 2411 | .flags = NVME_F_METADATA_SUPPORTED | |
2412 | NVME_F_PCI_P2PDMA, | ||
2394 | .reg_read32 = nvme_pci_reg_read32, | 2413 | .reg_read32 = nvme_pci_reg_read32, |
2395 | .reg_write32 = nvme_pci_reg_write32, | 2414 | .reg_write32 = nvme_pci_reg_write32, |
2396 | .reg_read64 = nvme_pci_reg_read64, | 2415 | .reg_read64 = nvme_pci_reg_read64, |
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index b37a8e3e3f80..d895579b6c5d 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -17,6 +17,8 @@ | |||
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
18 | #include <linux/stat.h> | 18 | #include <linux/stat.h> |
19 | #include <linux/ctype.h> | 19 | #include <linux/ctype.h> |
20 | #include <linux/pci.h> | ||
21 | #include <linux/pci-p2pdma.h> | ||
20 | 22 | ||
21 | #include "nvmet.h" | 23 | #include "nvmet.h" |
22 | 24 | ||
@@ -340,6 +342,48 @@ out_unlock: | |||
340 | 342 | ||
341 | CONFIGFS_ATTR(nvmet_ns_, device_path); | 343 | CONFIGFS_ATTR(nvmet_ns_, device_path); |
342 | 344 | ||
345 | #ifdef CONFIG_PCI_P2PDMA | ||
346 | static ssize_t nvmet_ns_p2pmem_show(struct config_item *item, char *page) | ||
347 | { | ||
348 | struct nvmet_ns *ns = to_nvmet_ns(item); | ||
349 | |||
350 | return pci_p2pdma_enable_show(page, ns->p2p_dev, ns->use_p2pmem); | ||
351 | } | ||
352 | |||
353 | static ssize_t nvmet_ns_p2pmem_store(struct config_item *item, | ||
354 | const char *page, size_t count) | ||
355 | { | ||
356 | struct nvmet_ns *ns = to_nvmet_ns(item); | ||
357 | struct pci_dev *p2p_dev = NULL; | ||
358 | bool use_p2pmem; | ||
359 | int ret = count; | ||
360 | int error; | ||
361 | |||
362 | mutex_lock(&ns->subsys->lock); | ||
363 | if (ns->enabled) { | ||
364 | ret = -EBUSY; | ||
365 | goto out_unlock; | ||
366 | } | ||
367 | |||
368 | error = pci_p2pdma_enable_store(page, &p2p_dev, &use_p2pmem); | ||
369 | if (error) { | ||
370 | ret = error; | ||
371 | goto out_unlock; | ||
372 | } | ||
373 | |||
374 | ns->use_p2pmem = use_p2pmem; | ||
375 | pci_dev_put(ns->p2p_dev); | ||
376 | ns->p2p_dev = p2p_dev; | ||
377 | |||
378 | out_unlock: | ||
379 | mutex_unlock(&ns->subsys->lock); | ||
380 | |||
381 | return ret; | ||
382 | } | ||
383 | |||
384 | CONFIGFS_ATTR(nvmet_ns_, p2pmem); | ||
385 | #endif /* CONFIG_PCI_P2PDMA */ | ||
386 | |||
343 | static ssize_t nvmet_ns_device_uuid_show(struct config_item *item, char *page) | 387 | static ssize_t nvmet_ns_device_uuid_show(struct config_item *item, char *page) |
344 | { | 388 | { |
345 | return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->uuid); | 389 | return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->uuid); |
@@ -509,6 +553,9 @@ static struct configfs_attribute *nvmet_ns_attrs[] = { | |||
509 | &nvmet_ns_attr_ana_grpid, | 553 | &nvmet_ns_attr_ana_grpid, |
510 | &nvmet_ns_attr_enable, | 554 | &nvmet_ns_attr_enable, |
511 | &nvmet_ns_attr_buffered_io, | 555 | &nvmet_ns_attr_buffered_io, |
556 | #ifdef CONFIG_PCI_P2PDMA | ||
557 | &nvmet_ns_attr_p2pmem, | ||
558 | #endif | ||
512 | NULL, | 559 | NULL, |
513 | }; | 560 | }; |
514 | 561 | ||
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index b5ec96abd048..9b4d84cfc224 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/random.h> | 16 | #include <linux/random.h> |
17 | #include <linux/rculist.h> | 17 | #include <linux/rculist.h> |
18 | #include <linux/pci-p2pdma.h> | ||
18 | 19 | ||
19 | #include "nvmet.h" | 20 | #include "nvmet.h" |
20 | 21 | ||
@@ -365,9 +366,93 @@ static void nvmet_ns_dev_disable(struct nvmet_ns *ns) | |||
365 | nvmet_file_ns_disable(ns); | 366 | nvmet_file_ns_disable(ns); |
366 | } | 367 | } |
367 | 368 | ||
369 | static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns) | ||
370 | { | ||
371 | int ret; | ||
372 | struct pci_dev *p2p_dev; | ||
373 | |||
374 | if (!ns->use_p2pmem) | ||
375 | return 0; | ||
376 | |||
377 | if (!ns->bdev) { | ||
378 | pr_err("peer-to-peer DMA is not supported by non-block device namespaces\n"); | ||
379 | return -EINVAL; | ||
380 | } | ||
381 | |||
382 | if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) { | ||
383 | pr_err("peer-to-peer DMA is not supported by the driver of %s\n", | ||
384 | ns->device_path); | ||
385 | return -EINVAL; | ||
386 | } | ||
387 | |||
388 | if (ns->p2p_dev) { | ||
389 | ret = pci_p2pdma_distance(ns->p2p_dev, nvmet_ns_dev(ns), true); | ||
390 | if (ret < 0) | ||
391 | return -EINVAL; | ||
392 | } else { | ||
393 | /* | ||
394 | * Right now we just check that there is p2pmem available so | ||
395 | * we can report an error to the user right away if there | ||
396 | * is not. We'll find the actual device to use once we | ||
397 | * set up the controller when the port's device is available. | ||
398 | */ | ||
399 | |||
400 | p2p_dev = pci_p2pmem_find(nvmet_ns_dev(ns)); | ||
401 | if (!p2p_dev) { | ||
402 | pr_err("no peer-to-peer memory is available for %s\n", | ||
403 | ns->device_path); | ||
404 | return -EINVAL; | ||
405 | } | ||
406 | |||
407 | pci_dev_put(p2p_dev); | ||
408 | } | ||
409 | |||
410 | return 0; | ||
411 | } | ||
412 | |||
413 | /* | ||
414 | * Note: ctrl->subsys->lock should be held when calling this function | ||
415 | */ | ||
416 | static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl, | ||
417 | struct nvmet_ns *ns) | ||
418 | { | ||
419 | struct device *clients[2]; | ||
420 | struct pci_dev *p2p_dev; | ||
421 | int ret; | ||
422 | |||
423 | if (!ctrl->p2p_client) | ||
424 | return; | ||
425 | |||
426 | if (ns->p2p_dev) { | ||
427 | ret = pci_p2pdma_distance(ns->p2p_dev, ctrl->p2p_client, true); | ||
428 | if (ret < 0) | ||
429 | return; | ||
430 | |||
431 | p2p_dev = pci_dev_get(ns->p2p_dev); | ||
432 | } else { | ||
433 | clients[0] = ctrl->p2p_client; | ||
434 | clients[1] = nvmet_ns_dev(ns); | ||
435 | |||
436 | p2p_dev = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients)); | ||
437 | if (!p2p_dev) { | ||
438 | pr_err("no peer-to-peer memory is available that's supported by %s and %s\n", | ||
439 | dev_name(ctrl->p2p_client), ns->device_path); | ||
440 | return; | ||
441 | } | ||
442 | } | ||
443 | |||
444 | ret = radix_tree_insert(&ctrl->p2p_ns_map, ns->nsid, p2p_dev); | ||
445 | if (ret < 0) | ||
446 | pci_dev_put(p2p_dev); | ||
447 | |||
448 | pr_info("using p2pmem on %s for nsid %d\n", pci_name(p2p_dev), | ||
449 | ns->nsid); | ||
450 | } | ||
451 | |||
368 | int nvmet_ns_enable(struct nvmet_ns *ns) | 452 | int nvmet_ns_enable(struct nvmet_ns *ns) |
369 | { | 453 | { |
370 | struct nvmet_subsys *subsys = ns->subsys; | 454 | struct nvmet_subsys *subsys = ns->subsys; |
455 | struct nvmet_ctrl *ctrl; | ||
371 | int ret; | 456 | int ret; |
372 | 457 | ||
373 | mutex_lock(&subsys->lock); | 458 | mutex_lock(&subsys->lock); |
@@ -384,6 +469,13 @@ int nvmet_ns_enable(struct nvmet_ns *ns) | |||
384 | if (ret) | 469 | if (ret) |
385 | goto out_unlock; | 470 | goto out_unlock; |
386 | 471 | ||
472 | ret = nvmet_p2pmem_ns_enable(ns); | ||
473 | if (ret) | ||
474 | goto out_unlock; | ||
475 | |||
476 | list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) | ||
477 | nvmet_p2pmem_ns_add_p2p(ctrl, ns); | ||
478 | |||
387 | ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, | 479 | ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, |
388 | 0, GFP_KERNEL); | 480 | 0, GFP_KERNEL); |
389 | if (ret) | 481 | if (ret) |
@@ -418,6 +510,9 @@ out_unlock: | |||
418 | mutex_unlock(&subsys->lock); | 510 | mutex_unlock(&subsys->lock); |
419 | return ret; | 511 | return ret; |
420 | out_dev_put: | 512 | out_dev_put: |
513 | list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) | ||
514 | pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); | ||
515 | |||
421 | nvmet_ns_dev_disable(ns); | 516 | nvmet_ns_dev_disable(ns); |
422 | goto out_unlock; | 517 | goto out_unlock; |
423 | } | 518 | } |
@@ -425,6 +520,7 @@ out_dev_put: | |||
425 | void nvmet_ns_disable(struct nvmet_ns *ns) | 520 | void nvmet_ns_disable(struct nvmet_ns *ns) |
426 | { | 521 | { |
427 | struct nvmet_subsys *subsys = ns->subsys; | 522 | struct nvmet_subsys *subsys = ns->subsys; |
523 | struct nvmet_ctrl *ctrl; | ||
428 | 524 | ||
429 | mutex_lock(&subsys->lock); | 525 | mutex_lock(&subsys->lock); |
430 | if (!ns->enabled) | 526 | if (!ns->enabled) |
@@ -434,6 +530,10 @@ void nvmet_ns_disable(struct nvmet_ns *ns) | |||
434 | list_del_rcu(&ns->dev_link); | 530 | list_del_rcu(&ns->dev_link); |
435 | if (ns->nsid == subsys->max_nsid) | 531 | if (ns->nsid == subsys->max_nsid) |
436 | subsys->max_nsid = nvmet_max_nsid(subsys); | 532 | subsys->max_nsid = nvmet_max_nsid(subsys); |
533 | |||
534 | list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) | ||
535 | pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); | ||
536 | |||
437 | mutex_unlock(&subsys->lock); | 537 | mutex_unlock(&subsys->lock); |
438 | 538 | ||
439 | /* | 539 | /* |
@@ -450,6 +550,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns) | |||
450 | percpu_ref_exit(&ns->ref); | 550 | percpu_ref_exit(&ns->ref); |
451 | 551 | ||
452 | mutex_lock(&subsys->lock); | 552 | mutex_lock(&subsys->lock); |
553 | |||
453 | subsys->nr_namespaces--; | 554 | subsys->nr_namespaces--; |
454 | nvmet_ns_changed(subsys, ns->nsid); | 555 | nvmet_ns_changed(subsys, ns->nsid); |
455 | nvmet_ns_dev_disable(ns); | 556 | nvmet_ns_dev_disable(ns); |
@@ -725,6 +826,51 @@ void nvmet_req_execute(struct nvmet_req *req) | |||
725 | } | 826 | } |
726 | EXPORT_SYMBOL_GPL(nvmet_req_execute); | 827 | EXPORT_SYMBOL_GPL(nvmet_req_execute); |
727 | 828 | ||
829 | int nvmet_req_alloc_sgl(struct nvmet_req *req) | ||
830 | { | ||
831 | struct pci_dev *p2p_dev = NULL; | ||
832 | |||
833 | if (IS_ENABLED(CONFIG_PCI_P2PDMA)) { | ||
834 | if (req->sq->ctrl && req->ns) | ||
835 | p2p_dev = radix_tree_lookup(&req->sq->ctrl->p2p_ns_map, | ||
836 | req->ns->nsid); | ||
837 | |||
838 | req->p2p_dev = NULL; | ||
839 | if (req->sq->qid && p2p_dev) { | ||
840 | req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt, | ||
841 | req->transfer_len); | ||
842 | if (req->sg) { | ||
843 | req->p2p_dev = p2p_dev; | ||
844 | return 0; | ||
845 | } | ||
846 | } | ||
847 | |||
848 | /* | ||
849 | * If no P2P memory was available we fall back to using | ||
850 | * regular memory | ||
851 | */ | ||
852 | } | ||
853 | |||
854 | req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt); | ||
855 | if (!req->sg) | ||
856 | return -ENOMEM; | ||
857 | |||
858 | return 0; | ||
859 | } | ||
860 | EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgl); | ||
861 | |||
862 | void nvmet_req_free_sgl(struct nvmet_req *req) | ||
863 | { | ||
864 | if (req->p2p_dev) | ||
865 | pci_p2pmem_free_sgl(req->p2p_dev, req->sg); | ||
866 | else | ||
867 | sgl_free(req->sg); | ||
868 | |||
869 | req->sg = NULL; | ||
870 | req->sg_cnt = 0; | ||
871 | } | ||
872 | EXPORT_SYMBOL_GPL(nvmet_req_free_sgl); | ||
873 | |||
728 | static inline bool nvmet_cc_en(u32 cc) | 874 | static inline bool nvmet_cc_en(u32 cc) |
729 | { | 875 | { |
730 | return (cc >> NVME_CC_EN_SHIFT) & 0x1; | 876 | return (cc >> NVME_CC_EN_SHIFT) & 0x1; |
@@ -921,6 +1067,37 @@ bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, | |||
921 | return __nvmet_host_allowed(subsys, hostnqn); | 1067 | return __nvmet_host_allowed(subsys, hostnqn); |
922 | } | 1068 | } |
923 | 1069 | ||
1070 | /* | ||
1071 | * Note: ctrl->subsys->lock should be held when calling this function | ||
1072 | */ | ||
1073 | static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, | ||
1074 | struct nvmet_req *req) | ||
1075 | { | ||
1076 | struct nvmet_ns *ns; | ||
1077 | |||
1078 | if (!req->p2p_client) | ||
1079 | return; | ||
1080 | |||
1081 | ctrl->p2p_client = get_device(req->p2p_client); | ||
1082 | |||
1083 | list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) | ||
1084 | nvmet_p2pmem_ns_add_p2p(ctrl, ns); | ||
1085 | } | ||
1086 | |||
1087 | /* | ||
1088 | * Note: ctrl->subsys->lock should be held when calling this function | ||
1089 | */ | ||
1090 | static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl) | ||
1091 | { | ||
1092 | struct radix_tree_iter iter; | ||
1093 | void __rcu **slot; | ||
1094 | |||
1095 | radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0) | ||
1096 | pci_dev_put(radix_tree_deref_slot(slot)); | ||
1097 | |||
1098 | put_device(ctrl->p2p_client); | ||
1099 | } | ||
1100 | |||
924 | u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, | 1101 | u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, |
925 | struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp) | 1102 | struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp) |
926 | { | 1103 | { |
@@ -962,6 +1139,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, | |||
962 | 1139 | ||
963 | INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); | 1140 | INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); |
964 | INIT_LIST_HEAD(&ctrl->async_events); | 1141 | INIT_LIST_HEAD(&ctrl->async_events); |
1142 | INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL); | ||
965 | 1143 | ||
966 | memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); | 1144 | memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); |
967 | memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); | 1145 | memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); |
@@ -1026,6 +1204,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, | |||
1026 | 1204 | ||
1027 | mutex_lock(&subsys->lock); | 1205 | mutex_lock(&subsys->lock); |
1028 | list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); | 1206 | list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); |
1207 | nvmet_setup_p2p_ns_map(ctrl, req); | ||
1029 | mutex_unlock(&subsys->lock); | 1208 | mutex_unlock(&subsys->lock); |
1030 | 1209 | ||
1031 | *ctrlp = ctrl; | 1210 | *ctrlp = ctrl; |
@@ -1053,6 +1232,7 @@ static void nvmet_ctrl_free(struct kref *ref) | |||
1053 | struct nvmet_subsys *subsys = ctrl->subsys; | 1232 | struct nvmet_subsys *subsys = ctrl->subsys; |
1054 | 1233 | ||
1055 | mutex_lock(&subsys->lock); | 1234 | mutex_lock(&subsys->lock); |
1235 | nvmet_release_p2p_ns_map(ctrl); | ||
1056 | list_del(&ctrl->subsys_entry); | 1236 | list_del(&ctrl->subsys_entry); |
1057 | mutex_unlock(&subsys->lock); | 1237 | mutex_unlock(&subsys->lock); |
1058 | 1238 | ||
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 7bc9f6240432..5660dd7ca755 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -78,6 +78,9 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) | |||
78 | op = REQ_OP_READ; | 78 | op = REQ_OP_READ; |
79 | } | 79 | } |
80 | 80 | ||
81 | if (is_pci_p2pdma_page(sg_page(req->sg))) | ||
82 | op_flags |= REQ_NOMERGE; | ||
83 | |||
81 | sector = le64_to_cpu(req->cmd->rw.slba); | 84 | sector = le64_to_cpu(req->cmd->rw.slba); |
82 | sector <<= (req->ns->blksize_shift - 9); | 85 | sector <<= (req->ns->blksize_shift - 9); |
83 | 86 | ||
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index ec9af4ee03b6..d6be098f342b 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/configfs.h> | 26 | #include <linux/configfs.h> |
27 | #include <linux/rcupdate.h> | 27 | #include <linux/rcupdate.h> |
28 | #include <linux/blkdev.h> | 28 | #include <linux/blkdev.h> |
29 | #include <linux/radix-tree.h> | ||
29 | 30 | ||
30 | #define NVMET_ASYNC_EVENTS 4 | 31 | #define NVMET_ASYNC_EVENTS 4 |
31 | #define NVMET_ERROR_LOG_SLOTS 128 | 32 | #define NVMET_ERROR_LOG_SLOTS 128 |
@@ -77,6 +78,9 @@ struct nvmet_ns { | |||
77 | struct completion disable_done; | 78 | struct completion disable_done; |
78 | mempool_t *bvec_pool; | 79 | mempool_t *bvec_pool; |
79 | struct kmem_cache *bvec_cache; | 80 | struct kmem_cache *bvec_cache; |
81 | |||
82 | int use_p2pmem; | ||
83 | struct pci_dev *p2p_dev; | ||
80 | }; | 84 | }; |
81 | 85 | ||
82 | static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item) | 86 | static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item) |
@@ -84,6 +88,11 @@ static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item) | |||
84 | return container_of(to_config_group(item), struct nvmet_ns, group); | 88 | return container_of(to_config_group(item), struct nvmet_ns, group); |
85 | } | 89 | } |
86 | 90 | ||
91 | static inline struct device *nvmet_ns_dev(struct nvmet_ns *ns) | ||
92 | { | ||
93 | return ns->bdev ? disk_to_dev(ns->bdev->bd_disk) : NULL; | ||
94 | } | ||
95 | |||
87 | struct nvmet_cq { | 96 | struct nvmet_cq { |
88 | u16 qid; | 97 | u16 qid; |
89 | u16 size; | 98 | u16 size; |
@@ -184,6 +193,9 @@ struct nvmet_ctrl { | |||
184 | 193 | ||
185 | char subsysnqn[NVMF_NQN_FIELD_LEN]; | 194 | char subsysnqn[NVMF_NQN_FIELD_LEN]; |
186 | char hostnqn[NVMF_NQN_FIELD_LEN]; | 195 | char hostnqn[NVMF_NQN_FIELD_LEN]; |
196 | |||
197 | struct device *p2p_client; | ||
198 | struct radix_tree_root p2p_ns_map; | ||
187 | }; | 199 | }; |
188 | 200 | ||
189 | struct nvmet_subsys { | 201 | struct nvmet_subsys { |
@@ -294,6 +306,9 @@ struct nvmet_req { | |||
294 | 306 | ||
295 | void (*execute)(struct nvmet_req *req); | 307 | void (*execute)(struct nvmet_req *req); |
296 | const struct nvmet_fabrics_ops *ops; | 308 | const struct nvmet_fabrics_ops *ops; |
309 | |||
310 | struct pci_dev *p2p_dev; | ||
311 | struct device *p2p_client; | ||
297 | }; | 312 | }; |
298 | 313 | ||
299 | extern struct workqueue_struct *buffered_io_wq; | 314 | extern struct workqueue_struct *buffered_io_wq; |
@@ -336,6 +351,8 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, | |||
336 | void nvmet_req_uninit(struct nvmet_req *req); | 351 | void nvmet_req_uninit(struct nvmet_req *req); |
337 | void nvmet_req_execute(struct nvmet_req *req); | 352 | void nvmet_req_execute(struct nvmet_req *req); |
338 | void nvmet_req_complete(struct nvmet_req *req, u16 status); | 353 | void nvmet_req_complete(struct nvmet_req *req, u16 status); |
354 | int nvmet_req_alloc_sgl(struct nvmet_req *req); | ||
355 | void nvmet_req_free_sgl(struct nvmet_req *req); | ||
339 | 356 | ||
340 | void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, | 357 | void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, |
341 | u16 size); | 358 | u16 size); |
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index bfc4da660bb4..3f7971d3706d 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -503,7 +503,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) | |||
503 | } | 503 | } |
504 | 504 | ||
505 | if (rsp->req.sg != rsp->cmd->inline_sg) | 505 | if (rsp->req.sg != rsp->cmd->inline_sg) |
506 | sgl_free(rsp->req.sg); | 506 | nvmet_req_free_sgl(&rsp->req); |
507 | 507 | ||
508 | if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) | 508 | if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) |
509 | nvmet_rdma_process_wr_wait_list(queue); | 509 | nvmet_rdma_process_wr_wait_list(queue); |
@@ -652,24 +652,24 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, | |||
652 | { | 652 | { |
653 | struct rdma_cm_id *cm_id = rsp->queue->cm_id; | 653 | struct rdma_cm_id *cm_id = rsp->queue->cm_id; |
654 | u64 addr = le64_to_cpu(sgl->addr); | 654 | u64 addr = le64_to_cpu(sgl->addr); |
655 | u32 len = get_unaligned_le24(sgl->length); | ||
656 | u32 key = get_unaligned_le32(sgl->key); | 655 | u32 key = get_unaligned_le32(sgl->key); |
657 | int ret; | 656 | int ret; |
658 | 657 | ||
658 | rsp->req.transfer_len = get_unaligned_le24(sgl->length); | ||
659 | |||
659 | /* no data command? */ | 660 | /* no data command? */ |
660 | if (!len) | 661 | if (!rsp->req.transfer_len) |
661 | return 0; | 662 | return 0; |
662 | 663 | ||
663 | rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt); | 664 | ret = nvmet_req_alloc_sgl(&rsp->req); |
664 | if (!rsp->req.sg) | 665 | if (ret < 0) |
665 | return NVME_SC_INTERNAL; | 666 | goto error_out; |
666 | 667 | ||
667 | ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, | 668 | ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, |
668 | rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, | 669 | rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, |
669 | nvmet_data_dir(&rsp->req)); | 670 | nvmet_data_dir(&rsp->req)); |
670 | if (ret < 0) | 671 | if (ret < 0) |
671 | return NVME_SC_INTERNAL; | 672 | goto error_out; |
672 | rsp->req.transfer_len += len; | ||
673 | rsp->n_rdma += ret; | 673 | rsp->n_rdma += ret; |
674 | 674 | ||
675 | if (invalidate) { | 675 | if (invalidate) { |
@@ -678,6 +678,10 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, | |||
678 | } | 678 | } |
679 | 679 | ||
680 | return 0; | 680 | return 0; |
681 | |||
682 | error_out: | ||
683 | rsp->req.transfer_len = 0; | ||
684 | return NVME_SC_INTERNAL; | ||
681 | } | 685 | } |
682 | 686 | ||
683 | static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) | 687 | static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) |
@@ -745,6 +749,8 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, | |||
745 | cmd->send_sge.addr, cmd->send_sge.length, | 749 | cmd->send_sge.addr, cmd->send_sge.length, |
746 | DMA_TO_DEVICE); | 750 | DMA_TO_DEVICE); |
747 | 751 | ||
752 | cmd->req.p2p_client = &queue->dev->device->dev; | ||
753 | |||
748 | if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, | 754 | if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, |
749 | &queue->nvme_sq, &nvmet_rdma_ops)) | 755 | &queue->nvme_sq, &nvmet_rdma_ops)) |
750 | return; | 756 | return; |
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 56ff8f6d31fc..deb68be4fdac 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -132,6 +132,23 @@ config PCI_PASID | |||
132 | 132 | ||
133 | If unsure, say N. | 133 | If unsure, say N. |
134 | 134 | ||
135 | config PCI_P2PDMA | ||
136 | bool "PCI peer-to-peer transfer support" | ||
137 | depends on PCI && ZONE_DEVICE | ||
138 | select GENERIC_ALLOCATOR | ||
139 | help | ||
140 | Enables drivers to do PCI peer-to-peer transactions to and from | ||
141 | BARs that are exposed in other devices that are part of | ||
142 | the hierarchy where peer-to-peer DMA is guaranteed by the PCI | ||
143 | specification to work (i.e., anything below a single PCI bridge). | ||
144 | |||
145 | Many PCIe root complexes do not support P2P transactions and | ||
146 | it's hard to tell which ones support it, so at this time, | ||
147 | P2P DMA transactions must be between devices behind the same root | ||
148 | port. | ||
149 | |||
150 | If unsure, say N. | ||
151 | |||
135 | config PCI_LABEL | 152 | config PCI_LABEL |
136 | def_bool y if (DMI || ACPI) | 153 | def_bool y if (DMI || ACPI) |
137 | depends on PCI | 154 | depends on PCI |
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 1b2cfe51e8d7..85f4a703b2be 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_PCI_SYSCALL) += syscall.o | |||
26 | obj-$(CONFIG_PCI_STUB) += pci-stub.o | 26 | obj-$(CONFIG_PCI_STUB) += pci-stub.o |
27 | obj-$(CONFIG_PCI_PF_STUB) += pci-pf-stub.o | 27 | obj-$(CONFIG_PCI_PF_STUB) += pci-pf-stub.o |
28 | obj-$(CONFIG_PCI_ECAM) += ecam.o | 28 | obj-$(CONFIG_PCI_ECAM) += ecam.o |
29 | obj-$(CONFIG_PCI_P2PDMA) += p2pdma.o | ||
29 | obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o | 30 | obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o |
30 | 31 | ||
31 | # Endpoint library must be initialized before its users | 32 | # Endpoint library must be initialized before its users |
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
new file mode 100644
index 000000000000..ae3c5b25dcc7
--- /dev/null
+++ b/drivers/pci/p2pdma.c
@@ -0,0 +1,805 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * PCI Peer 2 Peer DMA support. | ||
4 | * | ||
5 | * Copyright (c) 2016-2018, Logan Gunthorpe | ||
6 | * Copyright (c) 2016-2017, Microsemi Corporation | ||
7 | * Copyright (c) 2017, Christoph Hellwig | ||
8 | * Copyright (c) 2018, Eideticom Inc. | ||
9 | */ | ||
10 | |||
11 | #define pr_fmt(fmt) "pci-p2pdma: " fmt | ||
12 | #include <linux/ctype.h> | ||
13 | #include <linux/pci-p2pdma.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/genalloc.h> | ||
17 | #include <linux/memremap.h> | ||
18 | #include <linux/percpu-refcount.h> | ||
19 | #include <linux/random.h> | ||
20 | #include <linux/seq_buf.h> | ||
21 | |||
22 | struct pci_p2pdma { | ||
23 | struct percpu_ref devmap_ref; | ||
24 | struct completion devmap_ref_done; | ||
25 | struct gen_pool *pool; | ||
26 | bool p2pmem_published; | ||
27 | }; | ||
28 | |||
29 | static ssize_t size_show(struct device *dev, struct device_attribute *attr, | ||
30 | char *buf) | ||
31 | { | ||
32 | struct pci_dev *pdev = to_pci_dev(dev); | ||
33 | size_t size = 0; | ||
34 | |||
35 | if (pdev->p2pdma->pool) | ||
36 | size = gen_pool_size(pdev->p2pdma->pool); | ||
37 | |||
38 | return snprintf(buf, PAGE_SIZE, "%zd\n", size); | ||
39 | } | ||
40 | static DEVICE_ATTR_RO(size); | ||
41 | |||
42 | static ssize_t available_show(struct device *dev, struct device_attribute *attr, | ||
43 | char *buf) | ||
44 | { | ||
45 | struct pci_dev *pdev = to_pci_dev(dev); | ||
46 | size_t avail = 0; | ||
47 | |||
48 | if (pdev->p2pdma->pool) | ||
49 | avail = gen_pool_avail(pdev->p2pdma->pool); | ||
50 | |||
51 | return snprintf(buf, PAGE_SIZE, "%zd\n", avail); | ||
52 | } | ||
53 | static DEVICE_ATTR_RO(available); | ||
54 | |||
55 | static ssize_t published_show(struct device *dev, struct device_attribute *attr, | ||
56 | char *buf) | ||
57 | { | ||
58 | struct pci_dev *pdev = to_pci_dev(dev); | ||
59 | |||
60 | return snprintf(buf, PAGE_SIZE, "%d\n", | ||
61 | pdev->p2pdma->p2pmem_published); | ||
62 | } | ||
63 | static DEVICE_ATTR_RO(published); | ||
64 | |||
65 | static struct attribute *p2pmem_attrs[] = { | ||
66 | &dev_attr_size.attr, | ||
67 | &dev_attr_available.attr, | ||
68 | &dev_attr_published.attr, | ||
69 | NULL, | ||
70 | }; | ||
71 | |||
72 | static const struct attribute_group p2pmem_group = { | ||
73 | .attrs = p2pmem_attrs, | ||
74 | .name = "p2pmem", | ||
75 | }; | ||
76 | |||
77 | static void pci_p2pdma_percpu_release(struct percpu_ref *ref) | ||
78 | { | ||
79 | struct pci_p2pdma *p2p = | ||
80 | container_of(ref, struct pci_p2pdma, devmap_ref); | ||
81 | |||
82 | complete_all(&p2p->devmap_ref_done); | ||
83 | } | ||
84 | |||
85 | static void pci_p2pdma_percpu_kill(void *data) | ||
86 | { | ||
87 | struct percpu_ref *ref = data; | ||
88 | |||
89 | /* | ||
90 | * pci_p2pdma_add_resource() may be called multiple times | ||
91 | * by a driver and may register the percpu_kill devm action multiple | ||
92 | * times. We only want the first action to actually kill the | ||
93 | * percpu_ref. | ||
94 | */ | ||
95 | if (percpu_ref_is_dying(ref)) | ||
96 | return; | ||
97 | |||
98 | percpu_ref_kill(ref); | ||
99 | } | ||
100 | |||
101 | static void pci_p2pdma_release(void *data) | ||
102 | { | ||
103 | struct pci_dev *pdev = data; | ||
104 | |||
105 | if (!pdev->p2pdma) | ||
106 | return; | ||
107 | |||
108 | wait_for_completion(&pdev->p2pdma->devmap_ref_done); | ||
109 | percpu_ref_exit(&pdev->p2pdma->devmap_ref); | ||
110 | |||
111 | gen_pool_destroy(pdev->p2pdma->pool); | ||
112 | sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group); | ||
113 | pdev->p2pdma = NULL; | ||
114 | } | ||
115 | |||
116 | static int pci_p2pdma_setup(struct pci_dev *pdev) | ||
117 | { | ||
118 | int error = -ENOMEM; | ||
119 | struct pci_p2pdma *p2p; | ||
120 | |||
121 | p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL); | ||
122 | if (!p2p) | ||
123 | return -ENOMEM; | ||
124 | |||
125 | p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); | ||
126 | if (!p2p->pool) | ||
127 | goto out; | ||
128 | |||
129 | init_completion(&p2p->devmap_ref_done); | ||
130 | error = percpu_ref_init(&p2p->devmap_ref, | ||
131 | pci_p2pdma_percpu_release, 0, GFP_KERNEL); | ||
132 | if (error) | ||
133 | goto out_pool_destroy; | ||
134 | |||
135 | error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); | ||
136 | if (error) | ||
137 | goto out_pool_destroy; | ||
138 | |||
139 | pdev->p2pdma = p2p; | ||
140 | |||
141 | error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); | ||
142 | if (error) | ||
143 | goto out_pool_destroy; | ||
144 | |||
145 | return 0; | ||
146 | |||
147 | out_pool_destroy: | ||
148 | pdev->p2pdma = NULL; | ||
149 | gen_pool_destroy(p2p->pool); | ||
150 | out: | ||
151 | devm_kfree(&pdev->dev, p2p); | ||
152 | return error; | ||
153 | } | ||
154 | |||
155 | /** | ||
156 | * pci_p2pdma_add_resource - add memory for use as p2p memory | ||
157 | * @pdev: the device to add the memory to | ||
158 | * @bar: PCI BAR to add | ||
159 | * @size: size of the memory to add, may be zero to use the whole BAR | ||
160 | * @offset: offset into the PCI BAR | ||
161 | * | ||
162 | * The memory will be given ZONE_DEVICE struct pages so that it may | ||
163 | * be used with any DMA request. | ||
164 | */ | ||
165 | int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, | ||
166 | u64 offset) | ||
167 | { | ||
168 | struct dev_pagemap *pgmap; | ||
169 | void *addr; | ||
170 | int error; | ||
171 | |||
172 | if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) | ||
173 | return -EINVAL; | ||
174 | |||
175 | if (offset >= pci_resource_len(pdev, bar)) | ||
176 | return -EINVAL; | ||
177 | |||
178 | if (!size) | ||
179 | size = pci_resource_len(pdev, bar) - offset; | ||
180 | |||
181 | if (size + offset > pci_resource_len(pdev, bar)) | ||
182 | return -EINVAL; | ||
183 | |||
184 | if (!pdev->p2pdma) { | ||
185 | error = pci_p2pdma_setup(pdev); | ||
186 | if (error) | ||
187 | return error; | ||
188 | } | ||
189 | |||
190 | pgmap = devm_kzalloc(&pdev->dev, sizeof(*pgmap), GFP_KERNEL); | ||
191 | if (!pgmap) | ||
192 | return -ENOMEM; | ||
193 | |||
194 | pgmap->res.start = pci_resource_start(pdev, bar) + offset; | ||
195 | pgmap->res.end = pgmap->res.start + size - 1; | ||
196 | pgmap->res.flags = pci_resource_flags(pdev, bar); | ||
197 | pgmap->ref = &pdev->p2pdma->devmap_ref; | ||
198 | pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; | ||
199 | pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) - | ||
200 | pci_resource_start(pdev, bar); | ||
201 | |||
202 | addr = devm_memremap_pages(&pdev->dev, pgmap); | ||
203 | if (IS_ERR(addr)) { | ||
204 | error = PTR_ERR(addr); | ||
205 | goto pgmap_free; | ||
206 | } | ||
207 | |||
208 | error = gen_pool_add_virt(pdev->p2pdma->pool, (unsigned long)addr, | ||
209 | pci_bus_address(pdev, bar) + offset, | ||
210 | resource_size(&pgmap->res), dev_to_node(&pdev->dev)); | ||
211 | if (error) | ||
212 | goto pgmap_free; | ||
213 | |||
214 | error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_percpu_kill, | ||
215 | &pdev->p2pdma->devmap_ref); | ||
216 | if (error) | ||
217 | goto pgmap_free; | ||
218 | |||
219 | pci_info(pdev, "added peer-to-peer DMA memory %pR\n", | ||
220 | &pgmap->res); | ||
221 | |||
222 | return 0; | ||
223 | |||
224 | pgmap_free: | ||
225 | devm_kfree(&pdev->dev, pgmap); | ||
226 | return error; | ||
227 | } | ||
228 | EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource); | ||
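
As a rough sketch of the intended call pattern (not part of this patch), a provider driver would typically register its BAR memory from probe(). The driver name, BAR index, and the decision to publish are illustrative assumptions:

    /* Hypothetical probe excerpt for a p2pmem provider. */
    static int foop2p_probe(struct pci_dev *pdev, const struct pci_device_id *id)
    {
            int rc;

            rc = pcim_enable_device(pdev);
            if (rc)
                    return rc;

            /* size == 0 registers the whole BAR, starting at offset 0 */
            rc = pci_p2pdma_add_resource(pdev, 4, 0, 0);
            if (rc)
                    return rc;

            /* let unrelated drivers allocate from this memory too */
            pci_p2pmem_publish(pdev, true);
            return 0;
    }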
229 | |||
230 | /* | ||
231 | * Note this function returns the parent PCI device with a | ||
232 | * reference taken. It is the caller's responsibility to drop | ||
233 | * the reference. | ||
234 | */ | ||
235 | static struct pci_dev *find_parent_pci_dev(struct device *dev) | ||
236 | { | ||
237 | struct device *parent; | ||
238 | |||
239 | dev = get_device(dev); | ||
240 | |||
241 | while (dev) { | ||
242 | if (dev_is_pci(dev)) | ||
243 | return to_pci_dev(dev); | ||
244 | |||
245 | parent = get_device(dev->parent); | ||
246 | put_device(dev); | ||
247 | dev = parent; | ||
248 | } | ||
249 | |||
250 | return NULL; | ||
251 | } | ||
252 | |||
253 | /* | ||
254 | * Check if a PCI bridge has its ACS redirection bits set to redirect P2P | ||
255 | * TLPs upstream via ACS. Returns 1 if the packets will be redirected | ||
256 | * upstream, 0 otherwise. | ||
257 | */ | ||
258 | static int pci_bridge_has_acs_redir(struct pci_dev *pdev) | ||
259 | { | ||
260 | int pos; | ||
261 | u16 ctrl; | ||
262 | |||
263 | pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ACS); | ||
264 | if (!pos) | ||
265 | return 0; | ||
266 | |||
267 | pci_read_config_word(pdev, pos + PCI_ACS_CTRL, &ctrl); | ||
268 | |||
269 | if (ctrl & (PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC)) | ||
270 | return 1; | ||
271 | |||
272 | return 0; | ||
273 | } | ||
274 | |||
275 | static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev) | ||
276 | { | ||
277 | if (!buf) | ||
278 | return; | ||
279 | |||
280 | seq_buf_printf(buf, "%s;", pci_name(pdev)); | ||
281 | } | ||
282 | |||
283 | /* | ||
284 | * Find the distance through the nearest common upstream bridge between | ||
285 | * two PCI devices. | ||
286 | * | ||
287 | * If the two devices are the same device then 0 will be returned. | ||
288 | * | ||
289 | * If there are two virtual functions of the same device behind the same | ||
290 | * bridge port then 2 will be returned (one hop up from one function to | ||
291 | * the shared bridge port, then one hop back down to the other function). | ||
292 | * | ||
293 | * In the case where two devices are connected to the same PCIe switch, the | ||
294 | * value 4 will be returned. This corresponds to the following PCI tree: | ||
295 | * | ||
296 | * -+ Root Port | ||
297 | * \+ Switch Upstream Port | ||
298 | * +-+ Switch Downstream Port | ||
299 | * + \- Device A | ||
300 | * \-+ Switch Downstream Port | ||
301 | * \- Device B | ||
302 | * | ||
303 | * The distance is 4 because we traverse from Device A through the downstream | ||
304 | * port of the switch, to the common upstream port, back down through the | ||
305 | * second downstream port, and then to Device B. | ||
306 | * | ||
307 | * Any two devices that don't have a common upstream bridge will return -1. | ||
308 | * In this way devices on separate PCIe root ports will be rejected, which | ||
309 | * is what we want for peer-to-peer, because each PCIe root port defines a | ||
310 | * separate hierarchy domain and there's no way to determine whether the root | ||
311 | * complex supports forwarding between them. | ||
312 | * | ||
313 | * In the case where two devices are connected to different PCIe switches, | ||
314 | * this function will still return a positive distance as long as both | ||
315 | * switches eventually have a common upstream bridge. Note this covers | ||
316 | * the case of using multiple PCIe switches to achieve a desired level of | ||
317 | * fan-out from a root port. The exact distance will be a function of the | ||
318 | * number of switches between Device A and Device B. | ||
319 | * | ||
320 | * If a bridge which has any ACS redirection bits set is in the path | ||
321 | * then this function will return -2. This is so we reject any | ||
322 | * cases where the TLPs are forwarded up into the root complex. | ||
323 | * In this case, a list of all infringing bridge addresses will be | ||
324 | * populated in acs_list (assuming it's non-null) for printk purposes. | ||
325 | */ | ||
326 | static int upstream_bridge_distance(struct pci_dev *a, | ||
327 | struct pci_dev *b, | ||
328 | struct seq_buf *acs_list) | ||
329 | { | ||
330 | int dist_a = 0; | ||
331 | int dist_b = 0; | ||
332 | struct pci_dev *bb = NULL; | ||
333 | int acs_cnt = 0; | ||
334 | |||
335 | /* | ||
336 | * Note, we don't need to take references to devices returned by | ||
337 | * pci_upstream_bridge() because we hold a reference to a child | ||
338 | * device which will already hold a reference to the upstream bridge. | ||
339 | */ | ||
340 | |||
341 | while (a) { | ||
342 | dist_b = 0; | ||
343 | |||
344 | if (pci_bridge_has_acs_redir(a)) { | ||
345 | seq_buf_print_bus_devfn(acs_list, a); | ||
346 | acs_cnt++; | ||
347 | } | ||
348 | |||
349 | bb = b; | ||
350 | |||
351 | while (bb) { | ||
352 | if (a == bb) | ||
353 | goto check_b_path_acs; | ||
354 | |||
355 | bb = pci_upstream_bridge(bb); | ||
356 | dist_b++; | ||
357 | } | ||
358 | |||
359 | a = pci_upstream_bridge(a); | ||
360 | dist_a++; | ||
361 | } | ||
362 | |||
363 | return -1; | ||
364 | |||
365 | check_b_path_acs: | ||
366 | bb = b; | ||
367 | |||
368 | while (bb) { | ||
369 | if (a == bb) | ||
370 | break; | ||
371 | |||
372 | if (pci_bridge_has_acs_redir(bb)) { | ||
373 | seq_buf_print_bus_devfn(acs_list, bb); | ||
374 | acs_cnt++; | ||
375 | } | ||
376 | |||
377 | bb = pci_upstream_bridge(bb); | ||
378 | } | ||
379 | |||
380 | if (acs_cnt) | ||
381 | return -2; | ||
382 | |||
383 | return dist_a + dist_b; | ||
384 | } | ||
385 | |||
386 | static int upstream_bridge_distance_warn(struct pci_dev *provider, | ||
387 | struct pci_dev *client) | ||
388 | { | ||
389 | struct seq_buf acs_list; | ||
390 | int ret; | ||
391 | |||
392 | seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); | ||
393 | if (!acs_list.buffer) | ||
394 | return -ENOMEM; | ||
395 | |||
396 | ret = upstream_bridge_distance(provider, client, &acs_list); | ||
397 | if (ret == -2) { | ||
398 | pci_warn(client, "cannot be used for peer-to-peer DMA as ACS redirect is set between the client and provider (%s)\n", | ||
399 | pci_name(provider)); | ||
400 | /* Drop final semicolon */ | ||
401 | acs_list.buffer[acs_list.len-1] = 0; | ||
402 | pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n", | ||
403 | acs_list.buffer); | ||
404 | |||
405 | } else if (ret < 0) { | ||
406 | pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge\n", | ||
407 | pci_name(provider)); | ||
408 | } | ||
409 | |||
410 | kfree(acs_list.buffer); | ||
411 | |||
412 | return ret; | ||
413 | } | ||
414 | |||
415 | /** | ||
416 | * pci_p2pdma_distance_many - Determine the cumulative distance between | ||
417 | * a p2pdma provider and the clients in use. | ||
418 | * @provider: p2pdma provider to check against the client list | ||
419 | * @clients: array of devices to check | ||
420 | * @num_clients: number of clients in the array | ||
421 | * @verbose: if true, print warnings for devices when we return -1 | ||
422 | * | ||
423 | * Returns -1 if any of the clients are not compatible, i.e. do not | ||
424 | * share an upstream bridge with the provider; otherwise returns a | ||
425 | * non-negative number where a lower number is the preferable choice. | ||
426 | * (If one client is the same device as the provider, 0 is returned.) | ||
427 | * | ||
428 | * For now, "compatible" means the provider and the clients are all behind | ||
429 | * the same PCI root port. This excludes some cases that may work, but it | ||
430 | * is the safest option for the user. Future work can expand this to | ||
431 | * whitelist root complexes that can safely forward between their ports. | ||
432 | */ | ||
433 | int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, | ||
434 | int num_clients, bool verbose) | ||
435 | { | ||
436 | bool not_supported = false; | ||
437 | struct pci_dev *pci_client; | ||
438 | int distance = 0; | ||
439 | int i, ret; | ||
440 | |||
441 | if (num_clients == 0) | ||
442 | return -1; | ||
443 | |||
444 | for (i = 0; i < num_clients; i++) { | ||
445 | pci_client = find_parent_pci_dev(clients[i]); | ||
446 | if (!pci_client) { | ||
447 | if (verbose) | ||
448 | dev_warn(clients[i], | ||
449 | "cannot be used for peer-to-peer DMA as it is not a PCI device\n"); | ||
450 | return -1; | ||
451 | } | ||
452 | |||
453 | if (verbose) | ||
454 | ret = upstream_bridge_distance_warn(provider, | ||
455 | pci_client); | ||
456 | else | ||
457 | ret = upstream_bridge_distance(provider, pci_client, | ||
458 | NULL); | ||
459 | |||
460 | pci_dev_put(pci_client); | ||
461 | |||
462 | if (ret < 0) | ||
463 | not_supported = true; | ||
464 | |||
465 | if (not_supported && !verbose) | ||
466 | break; | ||
467 | |||
468 | distance += ret; | ||
469 | } | ||
470 | |||
471 | if (not_supported) | ||
472 | return -1; | ||
473 | |||
474 | return distance; | ||
475 | } | ||
476 | EXPORT_SYMBOL_GPL(pci_p2pdma_distance_many); | ||
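
For example, an orchestrating driver holding an array of client devices might use this roughly as follows (the device names are placeholders, not from this patch):

    struct device *clients[] = { &disk_dev->dev, &nic_dev->dev };
    int dist;

    dist = pci_p2pdma_distance_many(provider, clients, ARRAY_SIZE(clients),
                                    true);
    if (dist < 0)
            return -EINVAL; /* unsuitable topology; use regular system memory */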
477 | |||
478 | /** | ||
479 | * pci_has_p2pmem - check if a given PCI device has published any p2pmem | ||
480 | * @pdev: PCI device to check | ||
481 | */ | ||
482 | bool pci_has_p2pmem(struct pci_dev *pdev) | ||
483 | { | ||
484 | return pdev->p2pdma && pdev->p2pdma->p2pmem_published; | ||
485 | } | ||
486 | EXPORT_SYMBOL_GPL(pci_has_p2pmem); | ||
487 | |||
488 | /** | ||
489 | * pci_p2pmem_find_many - find a peer-to-peer DMA memory device compatible | ||
490 | * with the specified list of clients and at the shortest distance (as | ||
491 | * determined by pci_p2pdma_distance_many()) | ||
492 | * @clients: array of devices to check | ||
493 | * @num_clients: number of client devices in the list | ||
494 | * | ||
495 | * If multiple devices are behind the same switch, the one "closest" to the | ||
496 | * client devices in use will be chosen first. (So if one of the providers is | ||
497 | * the same as one of the clients, that provider will be used ahead of any | ||
498 | * other providers that are unrelated.) If multiple providers are an equal | ||
499 | * distance away, one will be chosen at random. | ||
500 | * | ||
501 | * Returns a pointer to the PCI device with a reference taken (use pci_dev_put | ||
502 | * to return the reference) or NULL if no compatible device is found. The | ||
503 | * found provider will also be assigned to the client list. | ||
504 | */ | ||
505 | struct pci_dev *pci_p2pmem_find_many(struct device **clients, int num_clients) | ||
506 | { | ||
507 | struct pci_dev *pdev = NULL; | ||
508 | int distance; | ||
509 | int closest_distance = INT_MAX; | ||
510 | struct pci_dev **closest_pdevs; | ||
511 | int dev_cnt = 0; | ||
512 | const int max_devs = PAGE_SIZE / sizeof(*closest_pdevs); | ||
513 | int i; | ||
514 | |||
515 | closest_pdevs = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
516 | if (!closest_pdevs) | ||
517 | return NULL; | ||
518 | |||
519 | while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev))) { | ||
520 | if (!pci_has_p2pmem(pdev)) | ||
521 | continue; | ||
522 | |||
523 | distance = pci_p2pdma_distance_many(pdev, clients, | ||
524 | num_clients, false); | ||
525 | if (distance < 0 || distance > closest_distance) | ||
526 | continue; | ||
527 | |||
528 | if (distance == closest_distance && dev_cnt >= max_devs) | ||
529 | continue; | ||
530 | |||
531 | if (distance < closest_distance) { | ||
532 | for (i = 0; i < dev_cnt; i++) | ||
533 | pci_dev_put(closest_pdevs[i]); | ||
534 | |||
535 | dev_cnt = 0; | ||
536 | closest_distance = distance; | ||
537 | } | ||
538 | |||
539 | closest_pdevs[dev_cnt++] = pci_dev_get(pdev); | ||
540 | } | ||
541 | |||
542 | if (dev_cnt) | ||
543 | pdev = pci_dev_get(closest_pdevs[prandom_u32_max(dev_cnt)]); | ||
544 | |||
545 | for (i = 0; i < dev_cnt; i++) | ||
546 | pci_dev_put(closest_pdevs[i]); | ||
547 | |||
548 | kfree(closest_pdevs); | ||
549 | return pdev; | ||
550 | } | ||
551 | EXPORT_SYMBOL_GPL(pci_p2pmem_find_many); | ||
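
A caller with the same client array can then pick a provider automatically; note the returned reference must eventually be dropped (sketch only):

    struct pci_dev *p2p_dev;

    p2p_dev = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients));
    if (!p2p_dev)
            return -ENODEV;         /* no compatible provider found */

    /* ... allocate and use p2p memory from p2p_dev ... */

    pci_dev_put(p2p_dev);           /* drop the reference taken by the find */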
552 | |||
553 | /** | ||
554 | * pci_alloc_p2pmem - allocate peer-to-peer DMA memory | ||
555 | * @pdev: the device to allocate memory from | ||
556 | * @size: number of bytes to allocate | ||
557 | * | ||
558 | * Returns the allocated memory or NULL on error. | ||
559 | */ | ||
560 | void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size) | ||
561 | { | ||
562 | void *ret; | ||
563 | |||
564 | if (unlikely(!pdev->p2pdma)) | ||
565 | return NULL; | ||
566 | |||
567 | if (unlikely(!percpu_ref_tryget_live(&pdev->p2pdma->devmap_ref))) | ||
568 | return NULL; | ||
569 | |||
570 | ret = (void *)gen_pool_alloc(pdev->p2pdma->pool, size); | ||
571 | |||
572 | if (unlikely(!ret)) | ||
573 | percpu_ref_put(&pdev->p2pdma->devmap_ref); | ||
574 | |||
575 | return ret; | ||
576 | } | ||
577 | EXPORT_SYMBOL_GPL(pci_alloc_p2pmem); | ||
578 | |||
579 | /** | ||
580 | * pci_free_p2pmem - free peer-to-peer DMA memory | ||
581 | * @pdev: the device the memory was allocated from | ||
582 | * @addr: address of the memory that was allocated | ||
583 | * @size: number of bytes that was allocated | ||
584 | */ | ||
585 | void pci_free_p2pmem(struct pci_dev *pdev, void *addr, size_t size) | ||
586 | { | ||
587 | gen_pool_free(pdev->p2pdma->pool, (uintptr_t)addr, size); | ||
588 | percpu_ref_put(&pdev->p2pdma->devmap_ref); | ||
589 | } | ||
590 | EXPORT_SYMBOL_GPL(pci_free_p2pmem); | ||
591 | |||
592 | /** | ||
593 | * pci_p2pmem_virt_to_bus - return the PCI bus address for a given virtual | ||
594 | * address obtained with pci_alloc_p2pmem() | ||
595 | * @pdev: the device the memory was allocated from | ||
596 | * @addr: address of the memory that was allocated | ||
597 | */ | ||
598 | pci_bus_addr_t pci_p2pmem_virt_to_bus(struct pci_dev *pdev, void *addr) | ||
599 | { | ||
600 | if (!addr) | ||
601 | return 0; | ||
602 | if (!pdev->p2pdma) | ||
603 | return 0; | ||
604 | |||
605 | /* | ||
606 | * Note: when we added the memory to the pool we used the PCI | ||
607 | * bus address as the physical address. So gen_pool_virt_to_phys() | ||
608 | * actually returns the bus address despite the misleading name. | ||
609 | */ | ||
610 | return gen_pool_virt_to_phys(pdev->p2pdma->pool, (unsigned long)addr); | ||
611 | } | ||
612 | EXPORT_SYMBOL_GPL(pci_p2pmem_virt_to_bus); | ||
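
Tying the allocator and the address translation together, a hypothetical consumer might do the following before handing the buffer to a peer's DMA engine (the programming step is left as a comment because it is device specific):

    void *buf;
    pci_bus_addr_t bus_addr;

    buf = pci_alloc_p2pmem(p2p_dev, SZ_4K);
    if (!buf)
            return -ENOMEM;

    bus_addr = pci_p2pmem_virt_to_bus(p2p_dev, buf);
    /* program bus_addr into the peer device's DMA engine here */

    pci_free_p2pmem(p2p_dev, buf, SZ_4K);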
613 | |||
614 | /** | ||
615 | * pci_p2pmem_alloc_sgl - allocate peer-to-peer DMA memory in a scatterlist | ||
616 | * @pdev: the device to allocate memory from | ||
617 | * @nents: returns the number of SG entries in the allocated list | ||
618 | * @length: number of bytes to allocate | ||
619 | * | ||
620 | * Returns the allocated scatterlist or NULL on error | ||
621 | */ | ||
622 | struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev, | ||
623 | unsigned int *nents, u32 length) | ||
624 | { | ||
625 | struct scatterlist *sg; | ||
626 | void *addr; | ||
627 | |||
628 | sg = kzalloc(sizeof(*sg), GFP_KERNEL); | ||
629 | if (!sg) | ||
630 | return NULL; | ||
631 | |||
632 | sg_init_table(sg, 1); | ||
633 | |||
634 | addr = pci_alloc_p2pmem(pdev, length); | ||
635 | if (!addr) | ||
636 | goto out_free_sg; | ||
637 | |||
638 | sg_set_buf(sg, addr, length); | ||
639 | *nents = 1; | ||
640 | return sg; | ||
641 | |||
642 | out_free_sg: | ||
643 | kfree(sg); | ||
644 | return NULL; | ||
645 | } | ||
646 | EXPORT_SYMBOL_GPL(pci_p2pmem_alloc_sgl); | ||
647 | |||
648 | /** | ||
649 | * pci_p2pmem_free_sgl - free a scatterlist allocated by pci_p2pmem_alloc_sgl() | ||
650 | * @pdev: the device the memory was allocated from | ||
651 | * @sgl: the allocated scatterlist | ||
652 | */ | ||
653 | void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl) | ||
654 | { | ||
655 | struct scatterlist *sg; | ||
656 | int count; | ||
657 | |||
658 | for_each_sg(sgl, sg, INT_MAX, count) { | ||
659 | if (!sg) | ||
660 | break; | ||
661 | |||
662 | pci_free_p2pmem(pdev, sg_virt(sg), sg->length); | ||
663 | } | ||
664 | kfree(sgl); | ||
665 | } | ||
666 | EXPORT_SYMBOL_GPL(pci_p2pmem_free_sgl); | ||
667 | |||
668 | /** | ||
669 | * pci_p2pmem_publish - publish the peer-to-peer DMA memory for use by | ||
670 | * other devices with pci_p2pmem_find() | ||
671 | * @pdev: the device with peer-to-peer DMA memory to publish | ||
672 | * @publish: set to true to publish the memory, false to unpublish it | ||
673 | * | ||
674 | * Published memory can be used by other PCI device drivers for | ||
675 | * peer-to-peer DMA operations. Non-published memory is reserved for the | ||
676 | * exclusive use of the device driver that registers the peer-to-peer | ||
677 | * memory. | ||
678 | */ | ||
679 | void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) | ||
680 | { | ||
681 | if (pdev->p2pdma) | ||
682 | pdev->p2pdma->p2pmem_published = publish; | ||
683 | } | ||
684 | EXPORT_SYMBOL_GPL(pci_p2pmem_publish); | ||
685 | |||
686 | /** | ||
687 | * pci_p2pdma_map_sg - map a PCI peer-to-peer scatterlist for DMA | ||
688 | * @dev: device doing the DMA request | ||
689 | * @sg: scatter list to map | ||
690 | * @nents: elements in the scatterlist | ||
691 | * @dir: DMA direction | ||
692 | * | ||
693 | * Scatterlists mapped with this function should not be unmapped in any way. | ||
694 | * | ||
695 | * Returns the number of SG entries mapped or 0 on error. | ||
696 | */ | ||
697 | int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, | ||
698 | enum dma_data_direction dir) | ||
699 | { | ||
700 | struct dev_pagemap *pgmap; | ||
701 | struct scatterlist *s; | ||
702 | phys_addr_t paddr; | ||
703 | int i; | ||
704 | |||
705 | /* | ||
706 | * p2pdma mappings are not compatible with devices that use | ||
707 | * dma_virt_ops. If the upper layers do the right thing | ||
708 | * this should never happen because it will be prevented | ||
709 | * by the check in pci_p2pdma_add_client() | ||
710 | */ | ||
711 | if (WARN_ON_ONCE(IS_ENABLED(CONFIG_DMA_VIRT_OPS) && | ||
712 | dev->dma_ops == &dma_virt_ops)) | ||
713 | return 0; | ||
714 | |||
715 | for_each_sg(sg, s, nents, i) { | ||
716 | pgmap = sg_page(s)->pgmap; | ||
717 | paddr = sg_phys(s); | ||
718 | |||
719 | s->dma_address = paddr - pgmap->pci_p2pdma_bus_offset; | ||
720 | sg_dma_len(s) = s->length; | ||
721 | } | ||
722 | |||
723 | return nents; | ||
724 | } | ||
725 | EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg); | ||
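
Combined with the SGL helpers above, a consumer might allocate and map a p2p buffer roughly like this (sketch; per the note above, the list must never be passed to dma_unmap_sg()):

    struct scatterlist *sgl;
    unsigned int nents;

    sgl = pci_p2pmem_alloc_sgl(p2p_dev, &nents, SZ_4K);
    if (!sgl)
            return -ENOMEM;

    if (!pci_p2pdma_map_sg(dma_dev, sgl, nents, DMA_TO_DEVICE)) {
            pci_p2pmem_free_sgl(p2p_dev, sgl);
            return -EIO;
    }

    /* issue the transfer; do not unmap the scatterlist afterwards */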
726 | |||
727 | /** | ||
728 | * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store | ||
729 | * to enable p2pdma | ||
730 | * @page: contents of the value to be stored | ||
731 | * @p2p_dev: returns the PCI device that was selected to be used | ||
732 | * (if one was specified in the stored value) | ||
733 | * @use_p2pdma: returns whether to enable p2pdma or not | ||
734 | * | ||
735 | * Parses an attribute value to decide whether to enable p2pdma. | ||
736 | * The value can select a PCI device (using its full BDF device | ||
737 | * name) or a boolean (in any format strtobool() accepts). A false | ||
738 | * value disables p2pdma; a true value tells the caller to | ||
739 | * automatically find a compatible device, and naming a PCI device | ||
740 | * tells the caller to use that specific provider. | ||
741 | * | ||
742 | * pci_p2pdma_enable_show() should be used as the show operation for | ||
743 | * the attribute. | ||
744 | * | ||
745 | * Returns 0 on success | ||
746 | */ | ||
747 | int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, | ||
748 | bool *use_p2pdma) | ||
749 | { | ||
750 | struct device *dev; | ||
751 | |||
752 | dev = bus_find_device_by_name(&pci_bus_type, NULL, page); | ||
753 | if (dev) { | ||
754 | *use_p2pdma = true; | ||
755 | *p2p_dev = to_pci_dev(dev); | ||
756 | |||
757 | if (!pci_has_p2pmem(*p2p_dev)) { | ||
758 | pci_err(*p2p_dev, | ||
759 | "PCI device has no peer-to-peer memory: %s\n", | ||
760 | page); | ||
761 | pci_dev_put(*p2p_dev); | ||
762 | return -ENODEV; | ||
763 | } | ||
764 | |||
765 | return 0; | ||
766 | } else if ((page[0] == '0' || page[0] == '1') && !iscntrl(page[1])) { | ||
767 | /* | ||
768 | * If the user enters a PCI device that doesn't exist | ||
769 | * like "0000:01:00.1", we don't want strtobool to think | ||
770 | * it's a '0' when it's clearly not what the user wanted. | ||
771 | * So we require 0's and 1's to be exactly one character. | ||
772 | */ | ||
773 | } else if (!strtobool(page, use_p2pdma)) { | ||
774 | return 0; | ||
775 | } | ||
776 | |||
777 | pr_err("No such PCI device: %.*s\n", (int)strcspn(page, "\n"), page); | ||
778 | return -ENODEV; | ||
779 | } | ||
780 | EXPORT_SYMBOL_GPL(pci_p2pdma_enable_store); | ||
781 | |||
782 | /** | ||
783 | * pci_p2pdma_enable_show - show a configfs/sysfs attribute indicating | ||
784 | * whether p2pdma is enabled | ||
785 | * @page: buffer into which the attribute value is printed | ||
786 | * @p2p_dev: the selected p2p device (NULL if no device is selected) | ||
787 | * @use_p2pdma: whether p2pdma has been enabled | ||
788 | * | ||
789 | * Attributes that use pci_p2pdma_enable_store() should use this function | ||
790 | * to show the value of the attribute. | ||
791 | * | ||
792 | * Returns the number of bytes printed | ||
793 | */ | ||
794 | ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, | ||
795 | bool use_p2pdma) | ||
796 | { | ||
797 | if (!use_p2pdma) | ||
798 | return sprintf(page, "0\n"); | ||
799 | |||
800 | if (!p2p_dev) | ||
801 | return sprintf(page, "1\n"); | ||
802 | |||
803 | return sprintf(page, "%s\n", pci_name(p2p_dev)); | ||
804 | } | ||
805 | EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show); | ||
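
A configfs attribute wired up to this helper pair might look roughly like the sketch below; struct foo_port and to_foo_port() are invented stand-ins for whatever object owns the setting:

    static ssize_t foo_p2pmem_store(struct config_item *item,
                                    const char *page, size_t count)
    {
            struct foo_port *port = to_foo_port(item);
            struct pci_dev *p2p_dev = NULL;
            bool use_p2pmem;
            int error;

            error = pci_p2pdma_enable_store(page, &p2p_dev, &use_p2pmem);
            if (error)
                    return error;

            pci_dev_put(port->p2p_dev);     /* release any previous selection */
            port->p2p_dev = p2p_dev;
            port->use_p2pmem = use_p2pmem;
            return count;
    }

    static ssize_t foo_p2pmem_show(struct config_item *item, char *page)
    {
            struct foo_port *port = to_foo_port(item);

            return pci_p2pdma_enable_show(page, port->p2p_dev,
                                          port->use_p2pmem);
    }

    CONFIGFS_ATTR(foo_, p2pmem);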
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6980014357d4..c32f7171899b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
@@ -699,6 +699,7 @@ struct request_queue { | |||
699 | #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ | 699 | #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ |
700 | #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ | 700 | #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ |
701 | #define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ | 701 | #define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ |
702 | #define QUEUE_FLAG_PCI_P2PDMA 30 /* device supports PCI p2p requests */ | ||
702 | 703 | ||
703 | #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ | 704 | #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ |
704 | (1 << QUEUE_FLAG_SAME_COMP) | \ | 705 | (1 << QUEUE_FLAG_SAME_COMP) | \ |
@@ -731,6 +732,8 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q); | |||
731 | #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) | 732 | #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) |
732 | #define blk_queue_scsi_passthrough(q) \ | 733 | #define blk_queue_scsi_passthrough(q) \ |
733 | test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags) | 734 | test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags) |
735 | #define blk_queue_pci_p2pdma(q) \ | ||
736 | test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) | ||
734 | 737 | ||
735 | #define blk_noretry_request(rq) \ | 738 | #define blk_noretry_request(rq) \ |
736 | ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ | 739 | ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ |
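
A block driver whose queue can accept p2p pages in its requests would advertise that with the new flag during queue setup, along the lines of this sketch (q being the driver's struct request_queue):

    blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, q);

Submitters are then expected to test blk_queue_pci_p2pdma(q) before sending bios backed by p2p memory.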
diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f91f9e763557..0ac69ddf5fc4 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h | |||
@@ -53,11 +53,16 @@ struct vmem_altmap { | |||
53 | * wakeup event whenever a page is unpinned and becomes idle. This | 53 | * wakeup event whenever a page is unpinned and becomes idle. This |
54 | * wakeup is used to coordinate physical address space management (ex: | 54 | * wakeup is used to coordinate physical address space management (ex: |
55 | * fs truncate/hole punch) vs pinned pages (ex: device dma). | 55 | * fs truncate/hole punch) vs pinned pages (ex: device dma). |
56 | * | ||
57 | * MEMORY_DEVICE_PCI_P2PDMA: | ||
58 | * Device memory residing in a PCI BAR intended for use with Peer-to-Peer | ||
59 | * transactions. | ||
56 | */ | 60 | */ |
57 | enum memory_type { | 61 | enum memory_type { |
58 | MEMORY_DEVICE_PRIVATE = 1, | 62 | MEMORY_DEVICE_PRIVATE = 1, |
59 | MEMORY_DEVICE_PUBLIC, | 63 | MEMORY_DEVICE_PUBLIC, |
60 | MEMORY_DEVICE_FS_DAX, | 64 | MEMORY_DEVICE_FS_DAX, |
65 | MEMORY_DEVICE_PCI_P2PDMA, | ||
61 | }; | 66 | }; |
62 | 67 | ||
63 | /* | 68 | /* |
@@ -120,6 +125,7 @@ struct dev_pagemap { | |||
120 | struct device *dev; | 125 | struct device *dev; |
121 | void *data; | 126 | void *data; |
122 | enum memory_type type; | 127 | enum memory_type type; |
128 | u64 pci_p2pdma_bus_offset; | ||
123 | }; | 129 | }; |
124 | 130 | ||
125 | #ifdef CONFIG_ZONE_DEVICE | 131 | #ifdef CONFIG_ZONE_DEVICE |
diff --git a/include/linux/mm.h b/include/linux/mm.h index a61ebe8ad4ca..2055df412a77 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -890,6 +890,19 @@ static inline bool is_device_public_page(const struct page *page) | |||
890 | page->pgmap->type == MEMORY_DEVICE_PUBLIC; | 890 | page->pgmap->type == MEMORY_DEVICE_PUBLIC; |
891 | } | 891 | } |
892 | 892 | ||
893 | #ifdef CONFIG_PCI_P2PDMA | ||
894 | static inline bool is_pci_p2pdma_page(const struct page *page) | ||
895 | { | ||
896 | return is_zone_device_page(page) && | ||
897 | page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; | ||
898 | } | ||
899 | #else /* CONFIG_PCI_P2PDMA */ | ||
900 | static inline bool is_pci_p2pdma_page(const struct page *page) | ||
901 | { | ||
902 | return false; | ||
903 | } | ||
904 | #endif /* CONFIG_PCI_P2PDMA */ | ||
905 | |||
893 | #else /* CONFIG_DEV_PAGEMAP_OPS */ | 906 | #else /* CONFIG_DEV_PAGEMAP_OPS */ |
894 | static inline void dev_pagemap_get_ops(void) | 907 | static inline void dev_pagemap_get_ops(void) |
895 | { | 908 | { |
@@ -913,6 +926,11 @@ static inline bool is_device_public_page(const struct page *page) | |||
913 | { | 926 | { |
914 | return false; | 927 | return false; |
915 | } | 928 | } |
929 | |||
930 | static inline bool is_pci_p2pdma_page(const struct page *page) | ||
931 | { | ||
932 | return false; | ||
933 | } | ||
916 | #endif /* CONFIG_DEV_PAGEMAP_OPS */ | 934 | #endif /* CONFIG_DEV_PAGEMAP_OPS */ |
917 | 935 | ||
918 | static inline void get_page(struct page *page) | 936 | static inline void get_page(struct page *page) |
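
The new predicate lets DMA-mapping paths dispatch on the page type; a sketch of the pattern (which the rw.c change in this series follows in spirit):

    if (is_pci_p2pdma_page(sg_page(sgl)))
            nents = pci_p2pdma_map_sg(dev, sgl, sg_cnt, dir);
    else
            nents = dma_map_sg(dev, sgl, sg_cnt, dir);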
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h new file mode 100644 index 000000000000..bca9bc3e5be7 --- /dev/null +++ b/include/linux/pci-p2pdma.h | |||
@@ -0,0 +1,114 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | /* | ||
3 | * PCI Peer 2 Peer DMA support. | ||
4 | * | ||
5 | * Copyright (c) 2016-2018, Logan Gunthorpe | ||
6 | * Copyright (c) 2016-2017, Microsemi Corporation | ||
7 | * Copyright (c) 2017, Christoph Hellwig | ||
8 | * Copyright (c) 2018, Eideticom Inc. | ||
9 | */ | ||
10 | |||
11 | #ifndef _LINUX_PCI_P2PDMA_H | ||
12 | #define _LINUX_PCI_P2PDMA_H | ||
13 | |||
14 | #include <linux/pci.h> | ||
15 | |||
16 | struct block_device; | ||
17 | struct scatterlist; | ||
18 | |||
19 | #ifdef CONFIG_PCI_P2PDMA | ||
20 | int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, | ||
21 | u64 offset); | ||
22 | int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, | ||
23 | int num_clients, bool verbose); | ||
24 | bool pci_has_p2pmem(struct pci_dev *pdev); | ||
25 | struct pci_dev *pci_p2pmem_find_many(struct device **clients, int num_clients); | ||
26 | void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size); | ||
27 | void pci_free_p2pmem(struct pci_dev *pdev, void *addr, size_t size); | ||
28 | pci_bus_addr_t pci_p2pmem_virt_to_bus(struct pci_dev *pdev, void *addr); | ||
29 | struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev, | ||
30 | unsigned int *nents, u32 length); | ||
31 | void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl); | ||
32 | void pci_p2pmem_publish(struct pci_dev *pdev, bool publish); | ||
33 | int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, | ||
34 | enum dma_data_direction dir); | ||
35 | int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, | ||
36 | bool *use_p2pdma); | ||
37 | ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, | ||
38 | bool use_p2pdma); | ||
39 | #else /* CONFIG_PCI_P2PDMA */ | ||
40 | static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, | ||
41 | size_t size, u64 offset) | ||
42 | { | ||
43 | return -EOPNOTSUPP; | ||
44 | } | ||
45 | static inline int pci_p2pdma_distance_many(struct pci_dev *provider, | ||
46 | struct device **clients, int num_clients, bool verbose) | ||
47 | { | ||
48 | return -1; | ||
49 | } | ||
50 | static inline bool pci_has_p2pmem(struct pci_dev *pdev) | ||
51 | { | ||
52 | return false; | ||
53 | } | ||
54 | static inline struct pci_dev *pci_p2pmem_find_many(struct device **clients, | ||
55 | int num_clients) | ||
56 | { | ||
57 | return NULL; | ||
58 | } | ||
59 | static inline void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size) | ||
60 | { | ||
61 | return NULL; | ||
62 | } | ||
63 | static inline void pci_free_p2pmem(struct pci_dev *pdev, void *addr, | ||
64 | size_t size) | ||
65 | { | ||
66 | } | ||
67 | static inline pci_bus_addr_t pci_p2pmem_virt_to_bus(struct pci_dev *pdev, | ||
68 | void *addr) | ||
69 | { | ||
70 | return 0; | ||
71 | } | ||
72 | static inline struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev, | ||
73 | unsigned int *nents, u32 length) | ||
74 | { | ||
75 | return NULL; | ||
76 | } | ||
77 | static inline void pci_p2pmem_free_sgl(struct pci_dev *pdev, | ||
78 | struct scatterlist *sgl) | ||
79 | { | ||
80 | } | ||
81 | static inline void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) | ||
82 | { | ||
83 | } | ||
84 | static inline int pci_p2pdma_map_sg(struct device *dev, | ||
85 | struct scatterlist *sg, int nents, enum dma_data_direction dir) | ||
86 | { | ||
87 | return 0; | ||
88 | } | ||
89 | static inline int pci_p2pdma_enable_store(const char *page, | ||
90 | struct pci_dev **p2p_dev, bool *use_p2pdma) | ||
91 | { | ||
92 | *use_p2pdma = false; | ||
93 | return 0; | ||
94 | } | ||
95 | static inline ssize_t pci_p2pdma_enable_show(char *page, | ||
96 | struct pci_dev *p2p_dev, bool use_p2pdma) | ||
97 | { | ||
98 | return sprintf(page, "none\n"); | ||
99 | } | ||
100 | #endif /* CONFIG_PCI_P2PDMA */ | ||
101 | |||
102 | |||
103 | static inline int pci_p2pdma_distance(struct pci_dev *provider, | ||
104 | struct device *client, bool verbose) | ||
105 | { | ||
106 | return pci_p2pdma_distance_many(provider, &client, 1, verbose); | ||
107 | } | ||
108 | |||
109 | static inline struct pci_dev *pci_p2pmem_find(struct device *client) | ||
110 | { | ||
111 | return pci_p2pmem_find_many(&client, 1); | ||
112 | } | ||
113 | |||
114 | #endif /* _LINUX_PCI_P2PDMA_H */ | ||
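
The single-client wrappers reduce the common one-device case to a single call, e.g. (sketch; dma_dev is a placeholder for the client's struct device):

    struct pci_dev *p2p_dev;

    p2p_dev = pci_p2pmem_find(dma_dev);
    if (!p2p_dev)
            return -ENODEV; /* remember to pci_dev_put() it when done */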
diff --git a/include/linux/pci.h b/include/linux/pci.h index 7c4802de1e3a..f9e04c170301 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h | |||
@@ -281,6 +281,7 @@ struct pcie_link_state; | |||
281 | struct pci_vpd; | 281 | struct pci_vpd; |
282 | struct pci_sriov; | 282 | struct pci_sriov; |
283 | struct pci_ats; | 283 | struct pci_ats; |
284 | struct pci_p2pdma; | ||
284 | 285 | ||
285 | /* The pci_dev structure describes PCI devices */ | 286 | /* The pci_dev structure describes PCI devices */ |
286 | struct pci_dev { | 287 | struct pci_dev { |
@@ -441,6 +442,9 @@ struct pci_dev { | |||
441 | #ifdef CONFIG_PCI_PASID | 442 | #ifdef CONFIG_PCI_PASID |
442 | u16 pasid_features; | 443 | u16 pasid_features; |
443 | #endif | 444 | #endif |
445 | #ifdef CONFIG_PCI_P2PDMA | ||
446 | struct pci_p2pdma *p2pdma; | ||
447 | #endif | ||
444 | phys_addr_t rom; /* Physical address if not from BAR */ | 448 | phys_addr_t rom; /* Physical address if not from BAR */ |
445 | size_t romlen; /* Length if not from BAR */ | 449 | size_t romlen; /* Length if not from BAR */ |
446 | char *driver_override; /* Driver name to force a match */ | 450 | char *driver_override; /* Driver name to force a match */ |