-rw-r--r--  Documentation/ABI/testing/sysfs-bus-pci | 24
-rw-r--r--  Documentation/driver-api/index.rst | 2
-rw-r--r--  Documentation/driver-api/pci/index.rst | 22
-rw-r--r--  Documentation/driver-api/pci/p2pdma.rst | 145
-rw-r--r--  Documentation/driver-api/pci/pci.rst (renamed from Documentation/driver-api/pci.rst) | 0
-rw-r--r--  drivers/infiniband/core/rw.c | 11
-rw-r--r--  drivers/nvme/host/core.c | 4
-rw-r--r--  drivers/nvme/host/nvme.h | 1
-rw-r--r--  drivers/nvme/host/pci.c | 97
-rw-r--r--  drivers/nvme/target/configfs.c | 47
-rw-r--r--  drivers/nvme/target/core.c | 180
-rw-r--r--  drivers/nvme/target/io-cmd-bdev.c | 3
-rw-r--r--  drivers/nvme/target/nvmet.h | 17
-rw-r--r--  drivers/nvme/target/rdma.c | 22
-rw-r--r--  drivers/pci/Kconfig | 17
-rw-r--r--  drivers/pci/Makefile | 1
-rw-r--r--  drivers/pci/p2pdma.c | 805
-rw-r--r--  include/linux/blkdev.h | 3
-rw-r--r--  include/linux/memremap.h | 6
-rw-r--r--  include/linux/mm.h | 18
-rw-r--r--  include/linux/pci-p2pdma.h | 114
-rw-r--r--  include/linux/pci.h | 4
22 files changed, 1493 insertions, 50 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 44d4b2be92fd..8bfee557e50e 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -323,3 +323,27 @@ Description:
323 323
324 This is similar to /sys/bus/pci/drivers_autoprobe, but 324 This is similar to /sys/bus/pci/drivers_autoprobe, but
325 affects only the VFs associated with a specific PF. 325 affects only the VFs associated with a specific PF.
326
327What: /sys/bus/pci/devices/.../p2pmem/size
328Date: November 2017
329Contact: Logan Gunthorpe <logang@deltatee.com>
330Description:
331 If the device has any Peer-to-Peer memory registered, this
332 file contains the total amount of memory that the device
333 provides (in decimal).
334
335What: /sys/bus/pci/devices/.../p2pmem/available
336Date: November 2017
337Contact: Logan Gunthorpe <logang@deltatee.com>
338Description:
339 If the device has any Peer-to-Peer memory registered, this
340 file contains the amount of memory that has not been
341 allocated (in decimal).
342
343What: /sys/bus/pci/devices/.../p2pmem/published
344Date: November 2017
345Contact: Logan Gunthorpe <logang@deltatee.com>
346Description:
347 If the device has any Peer-to-Peer memory registered, this
348 file contains a '1' if the memory has been published for
349 use outside the driver that owns the device.
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index 6d9f2f9fe20e..e9e7d24169cf 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -29,7 +29,7 @@ available subsections can be seen below.
29 iio/index 29 iio/index
30 input 30 input
31 usb/index 31 usb/index
32 pci 32 pci/index
33 spi 33 spi
34 i2c 34 i2c
35 hsi 35 hsi
diff --git a/Documentation/driver-api/pci/index.rst b/Documentation/driver-api/pci/index.rst
new file mode 100644
index 000000000000..c6cf1fef61ce
--- /dev/null
+++ b/Documentation/driver-api/pci/index.rst
@@ -0,0 +1,22 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3============================================
4The Linux PCI driver implementer's API guide
5============================================
6
7.. class:: toc-title
8
9 Table of contents
10
11.. toctree::
12 :maxdepth: 2
13
14 pci
15 p2pdma
16
17.. only:: subproject and html
18
19 Indices
20 =======
21
22 * :ref:`genindex`
diff --git a/Documentation/driver-api/pci/p2pdma.rst b/Documentation/driver-api/pci/p2pdma.rst
new file mode 100644
index 000000000000..4c577fa7bef9
--- /dev/null
+++ b/Documentation/driver-api/pci/p2pdma.rst
@@ -0,0 +1,145 @@
1.. SPDX-License-Identifier: GPL-2.0
2
3============================
4PCI Peer-to-Peer DMA Support
5============================
6
7The PCI bus has pretty decent support for performing DMA transfers
8between two devices on the bus. This type of transaction is henceforth
9called Peer-to-Peer (or P2P). However, there are a number of issues that
10make P2P transactions tricky to do in a perfectly safe way.
11
12One of the biggest issues is that PCI doesn't require forwarding
13transactions between hierarchy domains, and in PCIe, each Root Port
14defines a separate hierarchy domain. To make things worse, there is no
15simple way to determine if a given Root Complex supports this or not.
16(See PCIe r4.0, sec 1.3.1). Therefore, as of this writing, the kernel
17only supports doing P2P when the endpoints involved are all behind the
18same PCI bridge, as such devices are all in the same PCI hierarchy
19domain, and the spec guarantees that all transactions within the
20hierarchy will be routable, but it does not require routing
21between hierarchies.
22
23The second issue is that to make use of existing interfaces in Linux,
24memory that is used for P2P transactions needs to be backed by struct
25pages. However, PCI BARs are not typically cache coherent, so there are
26a few corner-case gotchas with these pages, and developers need to
27be careful about what they do with them.
28
29
30Driver Writer's Guide
31=====================
32
33In a given P2P implementation there may be three or more different
34types of kernel drivers in play:
35
36* Provider - A driver which provides or publishes P2P resources like
37 memory or doorbell registers to other drivers.
38* Client - A driver which makes use of a resource by setting up a
39 DMA transaction to or from it.
40* Orchestrator - A driver which orchestrates the flow of data between
41 clients and providers.
42
43In many cases there could be overlap between these three types (i.e.,
44it may be typical for a driver to be both a provider and a client).
45
46For example, in the NVMe Target Copy Offload implementation:
47
48* The NVMe PCI driver is a client, provider, and orchestrator
49 in that it exposes any CMB (Controller Memory Buffer) as a P2P memory
50 resource (provider), it accepts P2P memory pages as buffers in requests
51 to be used directly (client), and it can also make use of the CMB as
52 submission queue entries (orchestrator).
53* The RDMA driver is a client in this arrangement so that an RNIC
54 can DMA directly to the memory exposed by the NVMe device.
55* The NVMe Target driver (nvmet) can orchestrate the data from the RNIC
56 to the P2P memory (CMB) and then to the NVMe device (and vice versa).
57
58This is currently the only arrangement supported by the kernel but
59one could imagine slight tweaks to this that would allow for the same
60functionality. For example, if a specific RNIC added a BAR with some
61memory behind it, its driver could add support as a P2P provider and
62then the NVMe Target could use the RNIC's memory instead of the CMB
63in cases where the NVMe cards in use do not have CMB support.
64
65
66Provider Drivers
67----------------
68
69A provider simply needs to register a BAR (or a portion of a BAR)
70as a P2P DMA resource using :c:func:`pci_p2pdma_add_resource()`.
71This will register struct pages for all the specified memory.
72
73After that it may optionally publish all of its resources as
74P2P memory using :c:func:`pci_p2pmem_publish()`. This will allow
75any orchestrator drivers to find and use the memory. When marked in
76this way, the resource must be regular memory with no side effects.
77
78For the time being this is fairly rudimentary in that all resources
79are typically going to be P2P memory. Future work will likely expand
80this to include other types of resources like doorbells.
81
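A minimal sketch of a provider's probe routine might look like the following;
the function name, the BAR number, and the half-BAR split are hypothetical,
and error handling is abbreviated::

    static int example_provider_probe(struct pci_dev *pdev,
                                      const struct pci_device_id *id)
    {
        resource_size_t half = pci_resource_len(pdev, 4) / 2;
        int rc;

        /* Register the upper half of BAR 4 as a P2P DMA resource */
        rc = pci_p2pdma_add_resource(pdev, 4, half, half);
        if (rc)
            return rc;

        /* Publish the memory so orchestrators can find and use it */
        pci_p2pmem_publish(pdev, true);

        return 0;
    }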
82
83Client Drivers
84--------------
85
86A client driver typically only has to conditionally change its DMA map
87routine to use the mapping function :c:func:`pci_p2pdma_map_sg()` instead
88of the usual :c:func:`dma_map_sg()` function. Memory mapped in this
89way does not need to be unmapped.
90
91The client may also, optionally, make use of
92:c:func:`is_pci_p2pdma_page()` to determine when to use the P2P mapping
93functions and when to use the regular mapping functions. In some
94situations, it may be more appropriate to use a flag to indicate a
95given request is P2P memory and map appropriately. It is important to
96ensure that struct pages that back P2P memory stay out of code that
97does not have support for them, as such code may treat the pages as
98regular memory, which may not be appropriate.
99
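As a rough illustration (mirroring the conditional mapping added to the RDMA
and NVMe drivers by this patch; the helper name is hypothetical)::

    static int example_map_sg(struct device *dev, struct scatterlist *sg,
                              int nents, enum dma_data_direction dir)
    {
        /* P2P pages translate directly to PCI bus addresses */
        if (is_pci_p2pdma_page(sg_page(sg)))
            return pci_p2pdma_map_sg(dev, sg, nents, dir);

        /* Ordinary memory goes through the usual DMA API */
        return dma_map_sg(dev, sg, nents, dir);
    }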
100
101Orchestrator Drivers
102--------------------
103
104The first task an orchestrator driver must do is compile a list of
105all client devices that will be involved in a given transaction. For
106example, the NVMe Target driver creates a list including the namespace
107block device and the RNIC in use. If the orchestrator has access to
108a specific P2P provider to use, it may check compatibility using
109:c:func:`pci_p2pdma_distance()`; otherwise, it may find a memory provider
110that's compatible with all clients using :c:func:`pci_p2pmem_find()`.
111If more than one provider is supported, the one nearest to all the clients will
112be chosen first. If more than one provider is an equal distance away, the
113one returned will be chosen at random (the choice is not arbitrary but
114truly random). This function returns the PCI device of the chosen provider
115with a reference taken, so when it is no longer needed it should be
116returned with pci_dev_put().
117
118Once a provider is selected, the orchestrator can then use
119:c:func:`pci_alloc_p2pmem()` and :c:func:`pci_free_p2pmem()` to
120allocate P2P memory from the provider. :c:func:`pci_p2pmem_alloc_sgl()`
121and :c:func:`pci_p2pmem_free_sgl()` are convenience functions for
122allocating scatter-gather lists with P2P memory.
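
A hedged sketch of that flow follows; the function, its arguments and the
error handling are placeholders rather than part of this patch::

    static int example_orchestrate(struct device *dma_dev, u32 transfer_len)
    {
        struct pci_dev *p2p_dev;
        struct scatterlist *sg;
        unsigned int sg_cnt;

        /* Find the closest published p2pmem compatible with the client */
        p2p_dev = pci_p2pmem_find(dma_dev);
        if (!p2p_dev)
            return -ENODEV;         /* or fall back to regular memory */

        sg = pci_p2pmem_alloc_sgl(p2p_dev, &sg_cnt, transfer_len);
        if (!sg) {
            pci_dev_put(p2p_dev);
            return -ENOMEM;
        }

        /* ... hand sg to the client drivers for the actual DMA ... */

        pci_p2pmem_free_sgl(p2p_dev, sg);
        pci_dev_put(p2p_dev);       /* drop the reference taken by the find */
        return 0;
    }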
123
124Struct Page Caveats
125-------------------
126
127Driver writers should be very careful about not passing these special
128struct pages to code that isn't prepared for them. At this time, the kernel
129interfaces do not have any checks for ensuring this. This obviously
130precludes passing these pages to userspace.
131
132P2P memory is also technically IO memory but should never have any side
133effects behind it. Thus, the order of loads and stores should not be important
134and ioreadX(), iowriteX() and friends should not be necessary.
135However, as the memory is not cache coherent, if access ever needs to
136be protected by a spinlock then :c:func:`mmiowb()` must be used before
137unlocking the lock. (See ACQUIRES VS I/O ACCESSES in
138Documentation/memory-barriers.txt)
139
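For instance, a locked update of a P2P buffer might look like the fragment
below, where p2p_lock, p2p_buf, idx and val are placeholders::

    spin_lock(&p2p_lock);
    p2p_buf[idx] = val;    /* plain store into the P2P BAR memory */
    mmiowb();              /* order the store before releasing the lock */
    spin_unlock(&p2p_lock);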
140
141P2P DMA Support Library
142=======================
143
144.. kernel-doc:: drivers/pci/p2pdma.c
145 :export:
diff --git a/Documentation/driver-api/pci.rst b/Documentation/driver-api/pci/pci.rst
index ca85e5e78b2c..ca85e5e78b2c 100644
--- a/Documentation/driver-api/pci.rst
+++ b/Documentation/driver-api/pci/pci.rst
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 683e6d11a564..d22c4a2ebac6 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -12,6 +12,7 @@
12 */ 12 */
13#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/pci-p2pdma.h>
15#include <rdma/mr_pool.h> 16#include <rdma/mr_pool.h>
16#include <rdma/rw.h> 17#include <rdma/rw.h>
17 18
@@ -280,7 +281,11 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
280 struct ib_device *dev = qp->pd->device; 281 struct ib_device *dev = qp->pd->device;
281 int ret; 282 int ret;
282 283
283 ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); 284 if (is_pci_p2pdma_page(sg_page(sg)))
285 ret = pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir);
286 else
287 ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
288
284 if (!ret) 289 if (!ret)
285 return -ENOMEM; 290 return -ENOMEM;
286 sg_cnt = ret; 291 sg_cnt = ret;
@@ -602,7 +607,9 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
602 break; 607 break;
603 } 608 }
604 609
605 ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); 610 /* P2PDMA contexts do not need to be unmapped */
611 if (!is_pci_p2pdma_page(sg_page(sg)))
612 ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
606} 613}
607EXPORT_SYMBOL(rdma_rw_ctx_destroy); 614EXPORT_SYMBOL(rdma_rw_ctx_destroy);
608 615
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index dd8ec1dd9219..6033ce2fd3e9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3051,7 +3051,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3051 ns->queue = blk_mq_init_queue(ctrl->tagset); 3051 ns->queue = blk_mq_init_queue(ctrl->tagset);
3052 if (IS_ERR(ns->queue)) 3052 if (IS_ERR(ns->queue))
3053 goto out_free_ns; 3053 goto out_free_ns;
3054
3054 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); 3055 blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
3056 if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
3057 blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
3058
3055 ns->queue->queuedata = ns; 3059 ns->queue->queuedata = ns;
3056 ns->ctrl = ctrl; 3060 ns->ctrl = ctrl;
3057 3061
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bb4a2003c097..4030743c90aa 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -343,6 +343,7 @@ struct nvme_ctrl_ops {
343 unsigned int flags; 343 unsigned int flags;
344#define NVME_F_FABRICS (1 << 0) 344#define NVME_F_FABRICS (1 << 0)
345#define NVME_F_METADATA_SUPPORTED (1 << 1) 345#define NVME_F_METADATA_SUPPORTED (1 << 1)
346#define NVME_F_PCI_P2PDMA (1 << 2)
346 int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); 347 int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
347 int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); 348 int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
348 int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); 349 int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8991e79b2b87..7e09e45b0b28 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -30,6 +30,7 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/io-64-nonatomic-lo-hi.h> 31#include <linux/io-64-nonatomic-lo-hi.h>
32#include <linux/sed-opal.h> 32#include <linux/sed-opal.h>
33#include <linux/pci-p2pdma.h>
33 34
34#include "nvme.h" 35#include "nvme.h"
35 36
@@ -99,9 +100,8 @@ struct nvme_dev {
99 struct work_struct remove_work; 100 struct work_struct remove_work;
100 struct mutex shutdown_lock; 101 struct mutex shutdown_lock;
101 bool subsystem; 102 bool subsystem;
102 void __iomem *cmb;
103 pci_bus_addr_t cmb_bus_addr;
104 u64 cmb_size; 103 u64 cmb_size;
104 bool cmb_use_sqes;
105 u32 cmbsz; 105 u32 cmbsz;
106 u32 cmbloc; 106 u32 cmbloc;
107 struct nvme_ctrl ctrl; 107 struct nvme_ctrl ctrl;
@@ -158,7 +158,7 @@ struct nvme_queue {
158 struct nvme_dev *dev; 158 struct nvme_dev *dev;
159 spinlock_t sq_lock; 159 spinlock_t sq_lock;
160 struct nvme_command *sq_cmds; 160 struct nvme_command *sq_cmds;
161 struct nvme_command __iomem *sq_cmds_io; 161 bool sq_cmds_is_io;
162 spinlock_t cq_lock ____cacheline_aligned_in_smp; 162 spinlock_t cq_lock ____cacheline_aligned_in_smp;
163 volatile struct nvme_completion *cqes; 163 volatile struct nvme_completion *cqes;
164 struct blk_mq_tags **tags; 164 struct blk_mq_tags **tags;
@@ -447,11 +447,8 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
447static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) 447static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
448{ 448{
449 spin_lock(&nvmeq->sq_lock); 449 spin_lock(&nvmeq->sq_lock);
450 if (nvmeq->sq_cmds_io) 450
451 memcpy_toio(&nvmeq->sq_cmds_io[nvmeq->sq_tail], cmd, 451 memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
452 sizeof(*cmd));
453 else
454 memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
455 452
456 if (++nvmeq->sq_tail == nvmeq->q_depth) 453 if (++nvmeq->sq_tail == nvmeq->q_depth)
457 nvmeq->sq_tail = 0; 454 nvmeq->sq_tail = 0;
@@ -748,8 +745,13 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
748 goto out; 745 goto out;
749 746
750 ret = BLK_STS_RESOURCE; 747 ret = BLK_STS_RESOURCE;
751 nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, 748
752 DMA_ATTR_NO_WARN); 749 if (is_pci_p2pdma_page(sg_page(iod->sg)))
750 nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents,
751 dma_dir);
752 else
753 nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
754 dma_dir, DMA_ATTR_NO_WARN);
753 if (!nr_mapped) 755 if (!nr_mapped)
754 goto out; 756 goto out;
755 757
@@ -791,7 +793,10 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
791 DMA_TO_DEVICE : DMA_FROM_DEVICE; 793 DMA_TO_DEVICE : DMA_FROM_DEVICE;
792 794
793 if (iod->nents) { 795 if (iod->nents) {
794 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); 796 /* P2PDMA requests do not need to be unmapped */
797 if (!is_pci_p2pdma_page(sg_page(iod->sg)))
798 dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
799
795 if (blk_integrity_rq(req)) 800 if (blk_integrity_rq(req))
796 dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); 801 dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
797 } 802 }
@@ -1232,9 +1237,18 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
1232{ 1237{
1233 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), 1238 dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
1234 (void *)nvmeq->cqes, nvmeq->cq_dma_addr); 1239 (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1235 if (nvmeq->sq_cmds) 1240
1236 dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), 1241 if (nvmeq->sq_cmds) {
1237 nvmeq->sq_cmds, nvmeq->sq_dma_addr); 1242 if (nvmeq->sq_cmds_is_io)
1243 pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
1244 nvmeq->sq_cmds,
1245 SQ_SIZE(nvmeq->q_depth));
1246 else
1247 dma_free_coherent(nvmeq->q_dmadev,
1248 SQ_SIZE(nvmeq->q_depth),
1249 nvmeq->sq_cmds,
1250 nvmeq->sq_dma_addr);
1251 }
1238} 1252}
1239 1253
1240static void nvme_free_queues(struct nvme_dev *dev, int lowest) 1254static void nvme_free_queues(struct nvme_dev *dev, int lowest)
@@ -1323,12 +1337,21 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
1323static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, 1337static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1324 int qid, int depth) 1338 int qid, int depth)
1325{ 1339{
1326 /* CMB SQEs will be mapped before creation */ 1340 struct pci_dev *pdev = to_pci_dev(dev->dev);
1327 if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) 1341
1328 return 0; 1342 if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
1343 nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
1344 nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
1345 nvmeq->sq_cmds);
1346 nvmeq->sq_cmds_is_io = true;
1347 }
1348
1349 if (!nvmeq->sq_cmds) {
1350 nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
1351 &nvmeq->sq_dma_addr, GFP_KERNEL);
1352 nvmeq->sq_cmds_is_io = false;
1353 }
1329 1354
1330 nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
1331 &nvmeq->sq_dma_addr, GFP_KERNEL);
1332 if (!nvmeq->sq_cmds) 1355 if (!nvmeq->sq_cmds)
1333 return -ENOMEM; 1356 return -ENOMEM;
1334 return 0; 1357 return 0;
@@ -1405,13 +1428,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
1405 int result; 1428 int result;
1406 s16 vector; 1429 s16 vector;
1407 1430
1408 if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
1409 unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth),
1410 dev->ctrl.page_size);
1411 nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
1412 nvmeq->sq_cmds_io = dev->cmb + offset;
1413 }
1414
1415 /* 1431 /*
1416 * A queue's vector matches the queue identifier unless the controller 1432 * A queue's vector matches the queue identifier unless the controller
1417 * has only one vector available. 1433 * has only one vector available.
@@ -1652,9 +1668,6 @@ static void nvme_map_cmb(struct nvme_dev *dev)
1652 return; 1668 return;
1653 dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); 1669 dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
1654 1670
1655 if (!use_cmb_sqes)
1656 return;
1657
1658 size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); 1671 size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
1659 offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); 1672 offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
1660 bar = NVME_CMB_BIR(dev->cmbloc); 1673 bar = NVME_CMB_BIR(dev->cmbloc);
@@ -1671,11 +1684,18 @@ static void nvme_map_cmb(struct nvme_dev *dev)
1671 if (size > bar_size - offset) 1684 if (size > bar_size - offset)
1672 size = bar_size - offset; 1685 size = bar_size - offset;
1673 1686
1674 dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); 1687 if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
1675 if (!dev->cmb) 1688 dev_warn(dev->ctrl.device,
1689 "failed to register the CMB\n");
1676 return; 1690 return;
1677 dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; 1691 }
1692
1678 dev->cmb_size = size; 1693 dev->cmb_size = size;
1694 dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);
1695
1696 if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
1697 (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
1698 pci_p2pmem_publish(pdev, true);
1679 1699
1680 if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, 1700 if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
1681 &dev_attr_cmb.attr, NULL)) 1701 &dev_attr_cmb.attr, NULL))
@@ -1685,12 +1705,10 @@ static void nvme_map_cmb(struct nvme_dev *dev)
1685 1705
1686static inline void nvme_release_cmb(struct nvme_dev *dev) 1706static inline void nvme_release_cmb(struct nvme_dev *dev)
1687{ 1707{
1688 if (dev->cmb) { 1708 if (dev->cmb_size) {
1689 iounmap(dev->cmb);
1690 dev->cmb = NULL;
1691 sysfs_remove_file_from_group(&dev->ctrl.device->kobj, 1709 sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
1692 &dev_attr_cmb.attr, NULL); 1710 &dev_attr_cmb.attr, NULL);
1693 dev->cmbsz = 0; 1711 dev->cmb_size = 0;
1694 } 1712 }
1695} 1713}
1696 1714
@@ -1889,13 +1907,13 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
1889 if (nr_io_queues == 0) 1907 if (nr_io_queues == 0)
1890 return 0; 1908 return 0;
1891 1909
1892 if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) { 1910 if (dev->cmb_use_sqes) {
1893 result = nvme_cmb_qdepth(dev, nr_io_queues, 1911 result = nvme_cmb_qdepth(dev, nr_io_queues,
1894 sizeof(struct nvme_command)); 1912 sizeof(struct nvme_command));
1895 if (result > 0) 1913 if (result > 0)
1896 dev->q_depth = result; 1914 dev->q_depth = result;
1897 else 1915 else
1898 nvme_release_cmb(dev); 1916 dev->cmb_use_sqes = false;
1899 } 1917 }
1900 1918
1901 do { 1919 do {
@@ -2390,7 +2408,8 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
2390static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { 2408static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
2391 .name = "pcie", 2409 .name = "pcie",
2392 .module = THIS_MODULE, 2410 .module = THIS_MODULE,
2393 .flags = NVME_F_METADATA_SUPPORTED, 2411 .flags = NVME_F_METADATA_SUPPORTED |
2412 NVME_F_PCI_P2PDMA,
2394 .reg_read32 = nvme_pci_reg_read32, 2413 .reg_read32 = nvme_pci_reg_read32,
2395 .reg_write32 = nvme_pci_reg_write32, 2414 .reg_write32 = nvme_pci_reg_write32,
2396 .reg_read64 = nvme_pci_reg_read64, 2415 .reg_read64 = nvme_pci_reg_read64,
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index b37a8e3e3f80..d895579b6c5d 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -17,6 +17,8 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/stat.h> 18#include <linux/stat.h>
19#include <linux/ctype.h> 19#include <linux/ctype.h>
20#include <linux/pci.h>
21#include <linux/pci-p2pdma.h>
20 22
21#include "nvmet.h" 23#include "nvmet.h"
22 24
@@ -340,6 +342,48 @@ out_unlock:
340 342
341CONFIGFS_ATTR(nvmet_ns_, device_path); 343CONFIGFS_ATTR(nvmet_ns_, device_path);
342 344
345#ifdef CONFIG_PCI_P2PDMA
346static ssize_t nvmet_ns_p2pmem_show(struct config_item *item, char *page)
347{
348 struct nvmet_ns *ns = to_nvmet_ns(item);
349
350 return pci_p2pdma_enable_show(page, ns->p2p_dev, ns->use_p2pmem);
351}
352
353static ssize_t nvmet_ns_p2pmem_store(struct config_item *item,
354 const char *page, size_t count)
355{
356 struct nvmet_ns *ns = to_nvmet_ns(item);
357 struct pci_dev *p2p_dev = NULL;
358 bool use_p2pmem;
359 int ret = count;
360 int error;
361
362 mutex_lock(&ns->subsys->lock);
363 if (ns->enabled) {
364 ret = -EBUSY;
365 goto out_unlock;
366 }
367
368 error = pci_p2pdma_enable_store(page, &p2p_dev, &use_p2pmem);
369 if (error) {
370 ret = error;
371 goto out_unlock;
372 }
373
374 ns->use_p2pmem = use_p2pmem;
375 pci_dev_put(ns->p2p_dev);
376 ns->p2p_dev = p2p_dev;
377
378out_unlock:
379 mutex_unlock(&ns->subsys->lock);
380
381 return ret;
382}
383
384CONFIGFS_ATTR(nvmet_ns_, p2pmem);
385#endif /* CONFIG_PCI_P2PDMA */
386
343static ssize_t nvmet_ns_device_uuid_show(struct config_item *item, char *page) 387static ssize_t nvmet_ns_device_uuid_show(struct config_item *item, char *page)
344{ 388{
345 return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->uuid); 389 return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->uuid);
@@ -509,6 +553,9 @@ static struct configfs_attribute *nvmet_ns_attrs[] = {
509 &nvmet_ns_attr_ana_grpid, 553 &nvmet_ns_attr_ana_grpid,
510 &nvmet_ns_attr_enable, 554 &nvmet_ns_attr_enable,
511 &nvmet_ns_attr_buffered_io, 555 &nvmet_ns_attr_buffered_io,
556#ifdef CONFIG_PCI_P2PDMA
557 &nvmet_ns_attr_p2pmem,
558#endif
512 NULL, 559 NULL,
513}; 560};
514 561
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index b5ec96abd048..9b4d84cfc224 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/random.h> 16#include <linux/random.h>
17#include <linux/rculist.h> 17#include <linux/rculist.h>
18#include <linux/pci-p2pdma.h>
18 19
19#include "nvmet.h" 20#include "nvmet.h"
20 21
@@ -365,9 +366,93 @@ static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
365 nvmet_file_ns_disable(ns); 366 nvmet_file_ns_disable(ns);
366} 367}
367 368
369static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
370{
371 int ret;
372 struct pci_dev *p2p_dev;
373
374 if (!ns->use_p2pmem)
375 return 0;
376
377 if (!ns->bdev) {
378 pr_err("peer-to-peer DMA is not supported by non-block device namespaces\n");
379 return -EINVAL;
380 }
381
382 if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
383 pr_err("peer-to-peer DMA is not supported by the driver of %s\n",
384 ns->device_path);
385 return -EINVAL;
386 }
387
388 if (ns->p2p_dev) {
389 ret = pci_p2pdma_distance(ns->p2p_dev, nvmet_ns_dev(ns), true);
390 if (ret < 0)
391 return -EINVAL;
392 } else {
393 /*
394 * Right now we just check that there is p2pmem available so
395 * we can report an error to the user right away if there
396 * is not. We'll find the actual device to use once we
397 * setup the controller when the port's device is available.
398 */
399
400 p2p_dev = pci_p2pmem_find(nvmet_ns_dev(ns));
401 if (!p2p_dev) {
402 pr_err("no peer-to-peer memory is available for %s\n",
403 ns->device_path);
404 return -EINVAL;
405 }
406
407 pci_dev_put(p2p_dev);
408 }
409
410 return 0;
411}
412
413/*
414 * Note: ctrl->subsys->lock should be held when calling this function
415 */
416static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
417 struct nvmet_ns *ns)
418{
419 struct device *clients[2];
420 struct pci_dev *p2p_dev;
421 int ret;
422
423 if (!ctrl->p2p_client)
424 return;
425
426 if (ns->p2p_dev) {
427 ret = pci_p2pdma_distance(ns->p2p_dev, ctrl->p2p_client, true);
428 if (ret < 0)
429 return;
430
431 p2p_dev = pci_dev_get(ns->p2p_dev);
432 } else {
433 clients[0] = ctrl->p2p_client;
434 clients[1] = nvmet_ns_dev(ns);
435
436 p2p_dev = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients));
437 if (!p2p_dev) {
438 pr_err("no peer-to-peer memory is available that's supported by %s and %s\n",
439 dev_name(ctrl->p2p_client), ns->device_path);
440 return;
441 }
442 }
443
444 ret = radix_tree_insert(&ctrl->p2p_ns_map, ns->nsid, p2p_dev);
445 if (ret < 0)
446 pci_dev_put(p2p_dev);
447
448 pr_info("using p2pmem on %s for nsid %d\n", pci_name(p2p_dev),
449 ns->nsid);
450}
451
368int nvmet_ns_enable(struct nvmet_ns *ns) 452int nvmet_ns_enable(struct nvmet_ns *ns)
369{ 453{
370 struct nvmet_subsys *subsys = ns->subsys; 454 struct nvmet_subsys *subsys = ns->subsys;
455 struct nvmet_ctrl *ctrl;
371 int ret; 456 int ret;
372 457
373 mutex_lock(&subsys->lock); 458 mutex_lock(&subsys->lock);
@@ -384,6 +469,13 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
384 if (ret) 469 if (ret)
385 goto out_unlock; 470 goto out_unlock;
386 471
472 ret = nvmet_p2pmem_ns_enable(ns);
473 if (ret)
474 goto out_unlock;
475
476 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
477 nvmet_p2pmem_ns_add_p2p(ctrl, ns);
478
387 ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 479 ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
388 0, GFP_KERNEL); 480 0, GFP_KERNEL);
389 if (ret) 481 if (ret)
@@ -418,6 +510,9 @@ out_unlock:
418 mutex_unlock(&subsys->lock); 510 mutex_unlock(&subsys->lock);
419 return ret; 511 return ret;
420out_dev_put: 512out_dev_put:
513 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
514 pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
515
421 nvmet_ns_dev_disable(ns); 516 nvmet_ns_dev_disable(ns);
422 goto out_unlock; 517 goto out_unlock;
423} 518}
@@ -425,6 +520,7 @@ out_dev_put:
425void nvmet_ns_disable(struct nvmet_ns *ns) 520void nvmet_ns_disable(struct nvmet_ns *ns)
426{ 521{
427 struct nvmet_subsys *subsys = ns->subsys; 522 struct nvmet_subsys *subsys = ns->subsys;
523 struct nvmet_ctrl *ctrl;
428 524
429 mutex_lock(&subsys->lock); 525 mutex_lock(&subsys->lock);
430 if (!ns->enabled) 526 if (!ns->enabled)
@@ -434,6 +530,10 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
434 list_del_rcu(&ns->dev_link); 530 list_del_rcu(&ns->dev_link);
435 if (ns->nsid == subsys->max_nsid) 531 if (ns->nsid == subsys->max_nsid)
436 subsys->max_nsid = nvmet_max_nsid(subsys); 532 subsys->max_nsid = nvmet_max_nsid(subsys);
533
534 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
535 pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
536
437 mutex_unlock(&subsys->lock); 537 mutex_unlock(&subsys->lock);
438 538
439 /* 539 /*
@@ -450,6 +550,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
450 percpu_ref_exit(&ns->ref); 550 percpu_ref_exit(&ns->ref);
451 551
452 mutex_lock(&subsys->lock); 552 mutex_lock(&subsys->lock);
553
453 subsys->nr_namespaces--; 554 subsys->nr_namespaces--;
454 nvmet_ns_changed(subsys, ns->nsid); 555 nvmet_ns_changed(subsys, ns->nsid);
455 nvmet_ns_dev_disable(ns); 556 nvmet_ns_dev_disable(ns);
@@ -725,6 +826,51 @@ void nvmet_req_execute(struct nvmet_req *req)
725} 826}
726EXPORT_SYMBOL_GPL(nvmet_req_execute); 827EXPORT_SYMBOL_GPL(nvmet_req_execute);
727 828
829int nvmet_req_alloc_sgl(struct nvmet_req *req)
830{
831 struct pci_dev *p2p_dev = NULL;
832
833 if (IS_ENABLED(CONFIG_PCI_P2PDMA)) {
834 if (req->sq->ctrl && req->ns)
835 p2p_dev = radix_tree_lookup(&req->sq->ctrl->p2p_ns_map,
836 req->ns->nsid);
837
838 req->p2p_dev = NULL;
839 if (req->sq->qid && p2p_dev) {
840 req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
841 req->transfer_len);
842 if (req->sg) {
843 req->p2p_dev = p2p_dev;
844 return 0;
845 }
846 }
847
848 /*
849 * If no P2P memory was available we fall back to using
850 * regular memory
851 */
852 }
853
854 req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, &req->sg_cnt);
855 if (!req->sg)
856 return -ENOMEM;
857
858 return 0;
859}
860EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgl);
861
862void nvmet_req_free_sgl(struct nvmet_req *req)
863{
864 if (req->p2p_dev)
865 pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
866 else
867 sgl_free(req->sg);
868
869 req->sg = NULL;
870 req->sg_cnt = 0;
871}
872EXPORT_SYMBOL_GPL(nvmet_req_free_sgl);
873
728static inline bool nvmet_cc_en(u32 cc) 874static inline bool nvmet_cc_en(u32 cc)
729{ 875{
730 return (cc >> NVME_CC_EN_SHIFT) & 0x1; 876 return (cc >> NVME_CC_EN_SHIFT) & 0x1;
@@ -921,6 +1067,37 @@ bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
921 return __nvmet_host_allowed(subsys, hostnqn); 1067 return __nvmet_host_allowed(subsys, hostnqn);
922} 1068}
923 1069
1070/*
1071 * Note: ctrl->subsys->lock should be held when calling this function
1072 */
1073static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
1074 struct nvmet_req *req)
1075{
1076 struct nvmet_ns *ns;
1077
1078 if (!req->p2p_client)
1079 return;
1080
1081 ctrl->p2p_client = get_device(req->p2p_client);
1082
1083 list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
1084 nvmet_p2pmem_ns_add_p2p(ctrl, ns);
1085}
1086
1087/*
1088 * Note: ctrl->subsys->lock should be held when calling this function
1089 */
1090static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl)
1091{
1092 struct radix_tree_iter iter;
1093 void __rcu **slot;
1094
1095 radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0)
1096 pci_dev_put(radix_tree_deref_slot(slot));
1097
1098 put_device(ctrl->p2p_client);
1099}
1100
924u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, 1101u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
925 struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp) 1102 struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
926{ 1103{
@@ -962,6 +1139,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
962 1139
963 INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); 1140 INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
964 INIT_LIST_HEAD(&ctrl->async_events); 1141 INIT_LIST_HEAD(&ctrl->async_events);
1142 INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL);
965 1143
966 memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); 1144 memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
967 memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); 1145 memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
@@ -1026,6 +1204,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
1026 1204
1027 mutex_lock(&subsys->lock); 1205 mutex_lock(&subsys->lock);
1028 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); 1206 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
1207 nvmet_setup_p2p_ns_map(ctrl, req);
1029 mutex_unlock(&subsys->lock); 1208 mutex_unlock(&subsys->lock);
1030 1209
1031 *ctrlp = ctrl; 1210 *ctrlp = ctrl;
@@ -1053,6 +1232,7 @@ static void nvmet_ctrl_free(struct kref *ref)
1053 struct nvmet_subsys *subsys = ctrl->subsys; 1232 struct nvmet_subsys *subsys = ctrl->subsys;
1054 1233
1055 mutex_lock(&subsys->lock); 1234 mutex_lock(&subsys->lock);
1235 nvmet_release_p2p_ns_map(ctrl);
1056 list_del(&ctrl->subsys_entry); 1236 list_del(&ctrl->subsys_entry);
1057 mutex_unlock(&subsys->lock); 1237 mutex_unlock(&subsys->lock);
1058 1238
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 7bc9f6240432..5660dd7ca755 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -78,6 +78,9 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
78 op = REQ_OP_READ; 78 op = REQ_OP_READ;
79 } 79 }
80 80
81 if (is_pci_p2pdma_page(sg_page(req->sg)))
82 op_flags |= REQ_NOMERGE;
83
81 sector = le64_to_cpu(req->cmd->rw.slba); 84 sector = le64_to_cpu(req->cmd->rw.slba);
82 sector <<= (req->ns->blksize_shift - 9); 85 sector <<= (req->ns->blksize_shift - 9);
83 86
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index ec9af4ee03b6..d6be098f342b 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -26,6 +26,7 @@
26#include <linux/configfs.h> 26#include <linux/configfs.h>
27#include <linux/rcupdate.h> 27#include <linux/rcupdate.h>
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/radix-tree.h>
29 30
30#define NVMET_ASYNC_EVENTS 4 31#define NVMET_ASYNC_EVENTS 4
31#define NVMET_ERROR_LOG_SLOTS 128 32#define NVMET_ERROR_LOG_SLOTS 128
@@ -77,6 +78,9 @@ struct nvmet_ns {
77 struct completion disable_done; 78 struct completion disable_done;
78 mempool_t *bvec_pool; 79 mempool_t *bvec_pool;
79 struct kmem_cache *bvec_cache; 80 struct kmem_cache *bvec_cache;
81
82 int use_p2pmem;
83 struct pci_dev *p2p_dev;
80}; 84};
81 85
82static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item) 86static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
@@ -84,6 +88,11 @@ static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
84 return container_of(to_config_group(item), struct nvmet_ns, group); 88 return container_of(to_config_group(item), struct nvmet_ns, group);
85} 89}
86 90
91static inline struct device *nvmet_ns_dev(struct nvmet_ns *ns)
92{
93 return ns->bdev ? disk_to_dev(ns->bdev->bd_disk) : NULL;
94}
95
87struct nvmet_cq { 96struct nvmet_cq {
88 u16 qid; 97 u16 qid;
89 u16 size; 98 u16 size;
@@ -184,6 +193,9 @@ struct nvmet_ctrl {
184 193
185 char subsysnqn[NVMF_NQN_FIELD_LEN]; 194 char subsysnqn[NVMF_NQN_FIELD_LEN];
186 char hostnqn[NVMF_NQN_FIELD_LEN]; 195 char hostnqn[NVMF_NQN_FIELD_LEN];
196
197 struct device *p2p_client;
198 struct radix_tree_root p2p_ns_map;
187}; 199};
188 200
189struct nvmet_subsys { 201struct nvmet_subsys {
@@ -294,6 +306,9 @@ struct nvmet_req {
294 306
295 void (*execute)(struct nvmet_req *req); 307 void (*execute)(struct nvmet_req *req);
296 const struct nvmet_fabrics_ops *ops; 308 const struct nvmet_fabrics_ops *ops;
309
310 struct pci_dev *p2p_dev;
311 struct device *p2p_client;
297}; 312};
298 313
299extern struct workqueue_struct *buffered_io_wq; 314extern struct workqueue_struct *buffered_io_wq;
@@ -336,6 +351,8 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
336void nvmet_req_uninit(struct nvmet_req *req); 351void nvmet_req_uninit(struct nvmet_req *req);
337void nvmet_req_execute(struct nvmet_req *req); 352void nvmet_req_execute(struct nvmet_req *req);
338void nvmet_req_complete(struct nvmet_req *req, u16 status); 353void nvmet_req_complete(struct nvmet_req *req, u16 status);
354int nvmet_req_alloc_sgl(struct nvmet_req *req);
355void nvmet_req_free_sgl(struct nvmet_req *req);
339 356
340void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, 357void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
341 u16 size); 358 u16 size);
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index bfc4da660bb4..3f7971d3706d 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -503,7 +503,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
503 } 503 }
504 504
505 if (rsp->req.sg != rsp->cmd->inline_sg) 505 if (rsp->req.sg != rsp->cmd->inline_sg)
506 sgl_free(rsp->req.sg); 506 nvmet_req_free_sgl(&rsp->req);
507 507
508 if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) 508 if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
509 nvmet_rdma_process_wr_wait_list(queue); 509 nvmet_rdma_process_wr_wait_list(queue);
@@ -652,24 +652,24 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
652{ 652{
653 struct rdma_cm_id *cm_id = rsp->queue->cm_id; 653 struct rdma_cm_id *cm_id = rsp->queue->cm_id;
654 u64 addr = le64_to_cpu(sgl->addr); 654 u64 addr = le64_to_cpu(sgl->addr);
655 u32 len = get_unaligned_le24(sgl->length);
656 u32 key = get_unaligned_le32(sgl->key); 655 u32 key = get_unaligned_le32(sgl->key);
657 int ret; 656 int ret;
658 657
658 rsp->req.transfer_len = get_unaligned_le24(sgl->length);
659
659 /* no data command? */ 660 /* no data command? */
660 if (!len) 661 if (!rsp->req.transfer_len)
661 return 0; 662 return 0;
662 663
663 rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt); 664 ret = nvmet_req_alloc_sgl(&rsp->req);
664 if (!rsp->req.sg) 665 if (ret < 0)
665 return NVME_SC_INTERNAL; 666 goto error_out;
666 667
667 ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, 668 ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
668 rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, 669 rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
669 nvmet_data_dir(&rsp->req)); 670 nvmet_data_dir(&rsp->req));
670 if (ret < 0) 671 if (ret < 0)
671 return NVME_SC_INTERNAL; 672 goto error_out;
672 rsp->req.transfer_len += len;
673 rsp->n_rdma += ret; 673 rsp->n_rdma += ret;
674 674
675 if (invalidate) { 675 if (invalidate) {
@@ -678,6 +678,10 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
678 } 678 }
679 679
680 return 0; 680 return 0;
681
682error_out:
683 rsp->req.transfer_len = 0;
684 return NVME_SC_INTERNAL;
681} 685}
682 686
683static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) 687static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
@@ -745,6 +749,8 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
745 cmd->send_sge.addr, cmd->send_sge.length, 749 cmd->send_sge.addr, cmd->send_sge.length,
746 DMA_TO_DEVICE); 750 DMA_TO_DEVICE);
747 751
752 cmd->req.p2p_client = &queue->dev->device->dev;
753
748 if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, 754 if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
749 &queue->nvme_sq, &nvmet_rdma_ops)) 755 &queue->nvme_sq, &nvmet_rdma_ops))
750 return; 756 return;
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 56ff8f6d31fc..deb68be4fdac 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -132,6 +132,23 @@ config PCI_PASID
132 132
133 If unsure, say N. 133 If unsure, say N.
134 134
135config PCI_P2PDMA
136 bool "PCI peer-to-peer transfer support"
137 depends on PCI && ZONE_DEVICE
138 select GENERIC_ALLOCATOR
139 help
140 Enables drivers to do PCI peer-to-peer transactions to and from
141 BARs that are exposed in other devices that are part of
142 the hierarchy where peer-to-peer DMA is guaranteed by the PCI
143 specification to work (i.e., anything below a single PCI bridge).
144
145 Many PCIe root complexes do not support P2P transactions and
146 it's hard to tell which support it at all, so at this time,
147 P2P DMA transactions must be between devices behind the same root
148 port.
149
150 If unsure, say N.
151
135config PCI_LABEL 152config PCI_LABEL
136 def_bool y if (DMI || ACPI) 153 def_bool y if (DMI || ACPI)
137 depends on PCI 154 depends on PCI
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 1b2cfe51e8d7..85f4a703b2be 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_PCI_SYSCALL) += syscall.o
26obj-$(CONFIG_PCI_STUB) += pci-stub.o 26obj-$(CONFIG_PCI_STUB) += pci-stub.o
27obj-$(CONFIG_PCI_PF_STUB) += pci-pf-stub.o 27obj-$(CONFIG_PCI_PF_STUB) += pci-pf-stub.o
28obj-$(CONFIG_PCI_ECAM) += ecam.o 28obj-$(CONFIG_PCI_ECAM) += ecam.o
29obj-$(CONFIG_PCI_P2PDMA) += p2pdma.o
29obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o 30obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
30 31
31# Endpoint library must be initialized before its users 32# Endpoint library must be initialized before its users
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
new file mode 100644
index 000000000000..ae3c5b25dcc7
--- /dev/null
+++ b/drivers/pci/p2pdma.c
@@ -0,0 +1,805 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * PCI Peer 2 Peer DMA support.
4 *
5 * Copyright (c) 2016-2018, Logan Gunthorpe
6 * Copyright (c) 2016-2017, Microsemi Corporation
7 * Copyright (c) 2017, Christoph Hellwig
8 * Copyright (c) 2018, Eideticom Inc.
9 */
10
11#define pr_fmt(fmt) "pci-p2pdma: " fmt
12#include <linux/ctype.h>
13#include <linux/pci-p2pdma.h>
14#include <linux/module.h>
15#include <linux/slab.h>
16#include <linux/genalloc.h>
17#include <linux/memremap.h>
18#include <linux/percpu-refcount.h>
19#include <linux/random.h>
20#include <linux/seq_buf.h>
21
22struct pci_p2pdma {
23 struct percpu_ref devmap_ref;
24 struct completion devmap_ref_done;
25 struct gen_pool *pool;
26 bool p2pmem_published;
27};
28
29static ssize_t size_show(struct device *dev, struct device_attribute *attr,
30 char *buf)
31{
32 struct pci_dev *pdev = to_pci_dev(dev);
33 size_t size = 0;
34
35 if (pdev->p2pdma->pool)
36 size = gen_pool_size(pdev->p2pdma->pool);
37
38 return snprintf(buf, PAGE_SIZE, "%zd\n", size);
39}
40static DEVICE_ATTR_RO(size);
41
42static ssize_t available_show(struct device *dev, struct device_attribute *attr,
43 char *buf)
44{
45 struct pci_dev *pdev = to_pci_dev(dev);
46 size_t avail = 0;
47
48 if (pdev->p2pdma->pool)
49 avail = gen_pool_avail(pdev->p2pdma->pool);
50
51 return snprintf(buf, PAGE_SIZE, "%zd\n", avail);
52}
53static DEVICE_ATTR_RO(available);
54
55static ssize_t published_show(struct device *dev, struct device_attribute *attr,
56 char *buf)
57{
58 struct pci_dev *pdev = to_pci_dev(dev);
59
60 return snprintf(buf, PAGE_SIZE, "%d\n",
61 pdev->p2pdma->p2pmem_published);
62}
63static DEVICE_ATTR_RO(published);
64
65static struct attribute *p2pmem_attrs[] = {
66 &dev_attr_size.attr,
67 &dev_attr_available.attr,
68 &dev_attr_published.attr,
69 NULL,
70};
71
72static const struct attribute_group p2pmem_group = {
73 .attrs = p2pmem_attrs,
74 .name = "p2pmem",
75};
76
77static void pci_p2pdma_percpu_release(struct percpu_ref *ref)
78{
79 struct pci_p2pdma *p2p =
80 container_of(ref, struct pci_p2pdma, devmap_ref);
81
82 complete_all(&p2p->devmap_ref_done);
83}
84
85static void pci_p2pdma_percpu_kill(void *data)
86{
87 struct percpu_ref *ref = data;
88
89 /*
90 * pci_p2pdma_add_resource() may be called multiple times
91 * by a driver and may register the percpu_kill devm action multiple
92 * times. We only want the first action to actually kill the
93 * percpu_ref.
94 */
95 if (percpu_ref_is_dying(ref))
96 return;
97
98 percpu_ref_kill(ref);
99}
100
101static void pci_p2pdma_release(void *data)
102{
103 struct pci_dev *pdev = data;
104
105 if (!pdev->p2pdma)
106 return;
107
108 wait_for_completion(&pdev->p2pdma->devmap_ref_done);
109 percpu_ref_exit(&pdev->p2pdma->devmap_ref);
110
111 gen_pool_destroy(pdev->p2pdma->pool);
112 sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group);
113 pdev->p2pdma = NULL;
114}
115
116static int pci_p2pdma_setup(struct pci_dev *pdev)
117{
118 int error = -ENOMEM;
119 struct pci_p2pdma *p2p;
120
121 p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL);
122 if (!p2p)
123 return -ENOMEM;
124
125 p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
126 if (!p2p->pool)
127 goto out;
128
129 init_completion(&p2p->devmap_ref_done);
130 error = percpu_ref_init(&p2p->devmap_ref,
131 pci_p2pdma_percpu_release, 0, GFP_KERNEL);
132 if (error)
133 goto out_pool_destroy;
134
135 error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
136 if (error)
137 goto out_pool_destroy;
138
139 pdev->p2pdma = p2p;
140
141 error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group);
142 if (error)
143 goto out_pool_destroy;
144
145 return 0;
146
147out_pool_destroy:
148 pdev->p2pdma = NULL;
149 gen_pool_destroy(p2p->pool);
150out:
151 devm_kfree(&pdev->dev, p2p);
152 return error;
153}
154
155/**
156 * pci_p2pdma_add_resource - add memory for use as p2p memory
157 * @pdev: the device to add the memory to
158 * @bar: PCI BAR to add
159 * @size: size of the memory to add, may be zero to use the whole BAR
160 * @offset: offset into the PCI BAR
161 *
162 * The memory will be given ZONE_DEVICE struct pages so that it may
163 * be used with any DMA request.
164 */
165int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
166 u64 offset)
167{
168 struct dev_pagemap *pgmap;
169 void *addr;
170 int error;
171
172 if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
173 return -EINVAL;
174
175 if (offset >= pci_resource_len(pdev, bar))
176 return -EINVAL;
177
178 if (!size)
179 size = pci_resource_len(pdev, bar) - offset;
180
181 if (size + offset > pci_resource_len(pdev, bar))
182 return -EINVAL;
183
184 if (!pdev->p2pdma) {
185 error = pci_p2pdma_setup(pdev);
186 if (error)
187 return error;
188 }
189
190 pgmap = devm_kzalloc(&pdev->dev, sizeof(*pgmap), GFP_KERNEL);
191 if (!pgmap)
192 return -ENOMEM;
193
194 pgmap->res.start = pci_resource_start(pdev, bar) + offset;
195 pgmap->res.end = pgmap->res.start + size - 1;
196 pgmap->res.flags = pci_resource_flags(pdev, bar);
197 pgmap->ref = &pdev->p2pdma->devmap_ref;
198 pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
199 pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) -
200 pci_resource_start(pdev, bar);
201
202 addr = devm_memremap_pages(&pdev->dev, pgmap);
203 if (IS_ERR(addr)) {
204 error = PTR_ERR(addr);
205 goto pgmap_free;
206 }
207
208 error = gen_pool_add_virt(pdev->p2pdma->pool, (unsigned long)addr,
209 pci_bus_address(pdev, bar) + offset,
210 resource_size(&pgmap->res), dev_to_node(&pdev->dev));
211 if (error)
212 goto pgmap_free;
213
214 error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_percpu_kill,
215 &pdev->p2pdma->devmap_ref);
216 if (error)
217 goto pgmap_free;
218
219 pci_info(pdev, "added peer-to-peer DMA memory %pR\n",
220 &pgmap->res);
221
222 return 0;
223
224pgmap_free:
225 devm_kfree(&pdev->dev, pgmap);
226 return error;
227}
228EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource);
229
230/*
231 * Note this function returns the parent PCI device with a
232 * reference taken. It is the caller's responsibility to drop
233 * the reference.
234 */
235static struct pci_dev *find_parent_pci_dev(struct device *dev)
236{
237 struct device *parent;
238
239 dev = get_device(dev);
240
241 while (dev) {
242 if (dev_is_pci(dev))
243 return to_pci_dev(dev);
244
245 parent = get_device(dev->parent);
246 put_device(dev);
247 dev = parent;
248 }
249
250 return NULL;
251}
252
253/*
254 * Check if a PCI bridge has its ACS redirection bits set to redirect P2P
255 * TLPs upstream via ACS. Returns 1 if the packets will be redirected
256 * upstream, 0 otherwise.
257 */
258static int pci_bridge_has_acs_redir(struct pci_dev *pdev)
259{
260 int pos;
261 u16 ctrl;
262
263 pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ACS);
264 if (!pos)
265 return 0;
266
267 pci_read_config_word(pdev, pos + PCI_ACS_CTRL, &ctrl);
268
269 if (ctrl & (PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC))
270 return 1;
271
272 return 0;
273}
274
275static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev)
276{
277 if (!buf)
278 return;
279
280 seq_buf_printf(buf, "%s;", pci_name(pdev));
281}
282
283/*
284 * Find the distance through the nearest common upstream bridge between
285 * two PCI devices.
286 *
287 * If the two devices are the same device then 0 will be returned.
288 *
289 * If there are two virtual functions of the same device behind the same
290 * bridge port then 2 will be returned (one step down to the PCIe switch,
291 * then one step back to the same device).
292 *
293 * In the case where two devices are connected to the same PCIe switch, the
294 * value 4 will be returned. This corresponds to the following PCI tree:
295 *
296 * -+ Root Port
297 * \+ Switch Upstream Port
298 * +-+ Switch Downstream Port
299 * + \- Device A
300 * \-+ Switch Downstream Port
301 * \- Device B
302 *
303 * The distance is 4 because we traverse from Device A through the downstream
304 * port of the switch, to the common upstream port, back up to the second
305 * downstream port and then to Device B.
306 *
307 * Any two devices that don't have a common upstream bridge will return -1.
308 * In this way devices on separate PCIe root ports will be rejected, which
309 * is what we want for peer-to-peer seeing each PCIe root port defines a
310 * separate hierarchy domain and there's no way to determine whether the root
311 * complex supports forwarding between them.
312 *
313 * In the case where two devices are connected to different PCIe switches,
314 * this function will still return a positive distance as long as both
315 * switches eventually have a common upstream bridge. Note this covers
316 * the case of using multiple PCIe switches to achieve a desired level of
317 * fan-out from a root port. The exact distance will be a function of the
318 * number of switches between Device A and Device B.
319 *
320 * If a bridge which has any ACS redirection bits set is in the path
321 * then this function will return -2. This is so we reject any
322 * cases where the TLPs are forwarded up into the root complex.
323 * In this case, a list of all infringing bridge addresses will be
324 * populated in acs_list (assuming it's non-null) for printk purposes.
325 */
326static int upstream_bridge_distance(struct pci_dev *a,
327 struct pci_dev *b,
328 struct seq_buf *acs_list)
329{
330 int dist_a = 0;
331 int dist_b = 0;
332 struct pci_dev *bb = NULL;
333 int acs_cnt = 0;
334
335 /*
336 * Note, we don't need to take references to devices returned by
337 * pci_upstream_bridge() because we hold a reference to a child
338 * device which will already hold a reference to the upstream bridge.
339 */
340
341 while (a) {
342 dist_b = 0;
343
344 if (pci_bridge_has_acs_redir(a)) {
345 seq_buf_print_bus_devfn(acs_list, a);
346 acs_cnt++;
347 }
348
349 bb = b;
350
351 while (bb) {
352 if (a == bb)
353 goto check_b_path_acs;
354
355 bb = pci_upstream_bridge(bb);
356 dist_b++;
357 }
358
359 a = pci_upstream_bridge(a);
360 dist_a++;
361 }
362
363 return -1;
364
365check_b_path_acs:
366 bb = b;
367
368 while (bb) {
369 if (a == bb)
370 break;
371
372 if (pci_bridge_has_acs_redir(bb)) {
373 seq_buf_print_bus_devfn(acs_list, bb);
374 acs_cnt++;
375 }
376
377 bb = pci_upstream_bridge(bb);
378 }
379
380 if (acs_cnt)
381 return -2;
382
383 return dist_a + dist_b;
384}
385
386static int upstream_bridge_distance_warn(struct pci_dev *provider,
387 struct pci_dev *client)
388{
389 struct seq_buf acs_list;
390 int ret;
391
392 seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
393 if (!acs_list.buffer)
394 return -ENOMEM;
395
396 ret = upstream_bridge_distance(provider, client, &acs_list);
397 if (ret == -2) {
398 pci_warn(client, "cannot be used for peer-to-peer DMA as ACS redirect is set between the client and provider (%s)\n",
399 pci_name(provider));
400 /* Drop final semicolon */
401 acs_list.buffer[acs_list.len-1] = 0;
402 pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n",
403 acs_list.buffer);
404
405 } else if (ret < 0) {
406 pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge\n",
407 pci_name(provider));
408 }
409
410 kfree(acs_list.buffer);
411
412 return ret;
413}
414
415/**
416 * pci_p2pdma_distance_many - Determine the cumulative distance between
417 * a p2pdma provider and the clients in use.
418 * @provider: p2pdma provider to check against the client list
419 * @clients: array of devices to check (NULL-terminated)
420 * @num_clients: number of clients in the array
421 * @verbose: if true, print warnings for devices when we return -1
422 *
423 * Returns -1 if any of the clients are not compatible (behind the same
424 * root port as the provider), otherwise returns a positive number where
425 * a lower number is the preferable choice. (If there's one client
426 * that's the same as the provider it will return 0, which is the best choice).
427 *
428 * For now, "compatible" means the provider and the clients are all behind
429 * the same PCI root port. This cuts out cases that may work but is safest
430 * for the user. Future work can expand this to white-list root complexes that
431 * can safely forward between their ports.
432 */
433int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients,
434 int num_clients, bool verbose)
435{
436 bool not_supported = false;
437 struct pci_dev *pci_client;
438 int distance = 0;
439 int i, ret;
440
441 if (num_clients == 0)
442 return -1;
443
444 for (i = 0; i < num_clients; i++) {
445 pci_client = find_parent_pci_dev(clients[i]);
446 if (!pci_client) {
447 if (verbose)
448 dev_warn(clients[i],
449 "cannot be used for peer-to-peer DMA as it is not a PCI device\n");
450 return -1;
451 }
452
453 if (verbose)
454 ret = upstream_bridge_distance_warn(provider,
455 pci_client);
456 else
457 ret = upstream_bridge_distance(provider, pci_client,
458 NULL);
459
460 pci_dev_put(pci_client);
461
462 if (ret < 0)
463 not_supported = true;
464
465 if (not_supported && !verbose)
466 break;
467
468 distance += ret;
469 }
470
471 if (not_supported)
472 return -1;
473
474 return distance;
475}
476EXPORT_SYMBOL_GPL(pci_p2pdma_distance_many);
477
478/**
479 * pci_has_p2pmem - check if a given PCI device has published any p2pmem
480 * @pdev: PCI device to check
481 */
482bool pci_has_p2pmem(struct pci_dev *pdev)
483{
484 return pdev->p2pdma && pdev->p2pdma->p2pmem_published;
485}
486EXPORT_SYMBOL_GPL(pci_has_p2pmem);
487
488/**
489 * pci_p2pmem_find_many - find a peer-to-peer DMA memory device compatible with
490 * the specified list of clients and shortest distance (as determined
491 * by pci_p2pdma_distance_many())
492 * @clients: array of devices to check (NULL-terminated)
493 * @num_clients: number of client devices in the list
494 *
495 * If multiple devices are behind the same switch, the one "closest" to the
496 * client devices in use will be chosen first. (So if one of the providers is
497 * the same as one of the clients, that provider will be used ahead of any
498 * other providers that are unrelated). If multiple providers are an equal
499 * distance away, one will be chosen at random.
500 *
501 * Returns a pointer to the PCI device with a reference taken (use pci_dev_put
502 * to return the reference) or NULL if no compatible device is found.
503 *
504 */
505struct pci_dev *pci_p2pmem_find_many(struct device **clients, int num_clients)
506{
507 struct pci_dev *pdev = NULL;
508 int distance;
509 int closest_distance = INT_MAX;
510 struct pci_dev **closest_pdevs;
511 int dev_cnt = 0;
512 const int max_devs = PAGE_SIZE / sizeof(*closest_pdevs);
513 int i;
514
515 closest_pdevs = kmalloc(PAGE_SIZE, GFP_KERNEL);
516 if (!closest_pdevs)
517 return NULL;
518
519 while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev))) {
520 if (!pci_has_p2pmem(pdev))
521 continue;
522
523 distance = pci_p2pdma_distance_many(pdev, clients,
524 num_clients, false);
525 if (distance < 0 || distance > closest_distance)
526 continue;
527
528 if (distance == closest_distance && dev_cnt >= max_devs)
529 continue;
530
531 if (distance < closest_distance) {
532 for (i = 0; i < dev_cnt; i++)
533 pci_dev_put(closest_pdevs[i]);
534
535 dev_cnt = 0;
536 closest_distance = distance;
537 }
538
539 closest_pdevs[dev_cnt++] = pci_dev_get(pdev);
540 }
541
542 if (dev_cnt)
543 pdev = pci_dev_get(closest_pdevs[prandom_u32_max(dev_cnt)]);
544
545 for (i = 0; i < dev_cnt; i++)
546 pci_dev_put(closest_pdevs[i]);
547
548 kfree(closest_pdevs);
549 return pdev;
550}
551EXPORT_SYMBOL_GPL(pci_p2pmem_find_many);
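
A consuming driver would typically pair the find helper with pci_dev_put() once it is done with the provider. A minimal sketch, with a hypothetical helper name:

#include <linux/pci-p2pdma.h>

/*
 * Sketch: choose the closest published p2pmem provider for a set of
 * clients. The reference taken by pci_p2pmem_find_many() is the
 * caller's to drop with pci_dev_put() when the provider is no longer
 * needed.
 */
static struct pci_dev *example_pick_provider(struct device **clients,
					     int num_clients)
{
	struct pci_dev *p2p_dev = pci_p2pmem_find_many(clients, num_clients);

	if (!p2p_dev)
		pr_info("no compatible p2pmem provider published\n");

	return p2p_dev;
}
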
552
553/**
554 * pci_alloc_p2pmem - allocate peer-to-peer DMA memory
555 * @pdev: the device to allocate memory from
556 * @size: number of bytes to allocate
557 *
558 * Returns the allocated memory or NULL on error.
559 */
560void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size)
561{
562 void *ret;
563
564 if (unlikely(!pdev->p2pdma))
565 return NULL;
566
567 if (unlikely(!percpu_ref_tryget_live(&pdev->p2pdma->devmap_ref)))
568 return NULL;
569
570 ret = (void *)gen_pool_alloc(pdev->p2pdma->pool, size);
571
572 if (unlikely(!ret))
573 percpu_ref_put(&pdev->p2pdma->devmap_ref);
574
575 return ret;
576}
577EXPORT_SYMBOL_GPL(pci_alloc_p2pmem);
578
579/**
580 * pci_free_p2pmem - free peer-to-peer DMA memory
581 * @pdev: the device the memory was allocated from
582 * @addr: address of the memory that was allocated
583 * @size: number of bytes that was allocated
584 */
585void pci_free_p2pmem(struct pci_dev *pdev, void *addr, size_t size)
586{
587 gen_pool_free(pdev->p2pdma->pool, (uintptr_t)addr, size);
588 percpu_ref_put(&pdev->p2pdma->devmap_ref);
589}
590EXPORT_SYMBOL_GPL(pci_free_p2pmem);
591
592/**
593 * pci_p2pmem_virt_to_bus - return the PCI bus address for a given virtual
594 * address obtained with pci_alloc_p2pmem()
595 * @pdev: the device the memory was allocated from
596 * @addr: address of the memory that was allocated
597 */
598pci_bus_addr_t pci_p2pmem_virt_to_bus(struct pci_dev *pdev, void *addr)
599{
600 if (!addr)
601 return 0;
602 if (!pdev->p2pdma)
603 return 0;
604
605 /*
606 * Note: when we added the memory to the pool we used the PCI
607 * bus address as the physical address. So gen_pool_virt_to_phys()
608 * actually returns the bus address despite the misleading name.
609 */
610 return gen_pool_virt_to_phys(pdev->p2pdma->pool, (unsigned long)addr);
611}
612EXPORT_SYMBOL_GPL(pci_p2pmem_virt_to_bus);
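
Taken together, the three helpers above cover a buffer's lifetime: allocate it from the provider's pool, translate the kernel virtual address into the PCI bus address a peer device would be programmed with, and free it. A hedged sketch, with a hypothetical size and helper name:

#include <linux/pci-p2pdma.h>
#include <linux/sizes.h>

static int example_p2p_buffer_roundtrip(struct pci_dev *provider)
{
	pci_bus_addr_t bus_addr;
	void *buf;

	buf = pci_alloc_p2pmem(provider, SZ_4K);
	if (!buf)
		return -ENOMEM;

	/* Address a peer's DMA engine would target, not a CPU physical address. */
	bus_addr = pci_p2pmem_virt_to_bus(provider, buf);
	pr_debug("peer bus address: %llx\n", (unsigned long long)bus_addr);

	/* ... program bus_addr into the peer device and wait for the transfer ... */

	pci_free_p2pmem(provider, buf, SZ_4K);
	return 0;
}
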
613
614/**
615 * pci_p2pmem_alloc_sgl - allocate peer-to-peer DMA memory in a scatterlist
616 * @pdev: the device to allocate memory from
617 * @nents: returns the number of SG entries in the allocated list
618 * @length: number of bytes to allocate
619 *
620 * Returns the allocated scatterlist or NULL on error
621 */
622struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev,
623 unsigned int *nents, u32 length)
624{
625 struct scatterlist *sg;
626 void *addr;
627
628 sg = kzalloc(sizeof(*sg), GFP_KERNEL);
629 if (!sg)
630 return NULL;
631
632 sg_init_table(sg, 1);
633
634 addr = pci_alloc_p2pmem(pdev, length);
635 if (!addr)
636 goto out_free_sg;
637
638 sg_set_buf(sg, addr, length);
639 *nents = 1;
640 return sg;
641
642out_free_sg:
643 kfree(sg);
644 return NULL;
645}
646EXPORT_SYMBOL_GPL(pci_p2pmem_alloc_sgl);
647
648/**
649 * pci_p2pmem_free_sgl - free a scatterlist allocated by pci_p2pmem_alloc_sgl()
650 * @pdev: the device the memory was allocated from
651 * @sgl: the allocated scatterlist
652 */
653void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl)
654{
655 struct scatterlist *sg;
656 int count;
657
658 for_each_sg(sgl, sg, INT_MAX, count) {
659 if (!sg)
660 break;
661
662 pci_free_p2pmem(pdev, sg_virt(sg), sg->length);
663 }
664 kfree(sgl);
665}
666EXPORT_SYMBOL_GPL(pci_p2pmem_free_sgl);
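
The two SGL helpers pair up as allocate/free around whatever transfer the caller performs. A minimal sketch (hypothetical length and helper name):

#include <linux/pci-p2pdma.h>
#include <linux/scatterlist.h>
#include <linux/sizes.h>

static int example_sgl_roundtrip(struct pci_dev *provider)
{
	struct scatterlist *sgl;
	unsigned int nents;

	sgl = pci_p2pmem_alloc_sgl(provider, &nents, SZ_64K);
	if (!sgl)
		return -ENOMEM;

	/* ... map the list (e.g. with pci_p2pdma_map_sg()) and run the I/O ... */

	pci_p2pmem_free_sgl(provider, sgl);
	return 0;
}
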
667
668/**
669 * pci_p2pmem_publish - publish the peer-to-peer DMA memory for use by
670 * other devices with pci_p2pmem_find()
671 * @pdev: the device with peer-to-peer DMA memory to publish
672 * @publish: set to true to publish the memory, false to unpublish it
673 *
674 * Published memory can be used by other PCI device drivers for
675 * peer-to-peer DMA operations. Non-published memory is reserved for
676 * exclusive use of the device driver that registers the peer-to-peer
677 * memory.
678 */
679void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
680{
681 if (pdev->p2pdma)
682 pdev->p2pdma->p2pmem_published = publish;
683}
684EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
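
On the provider side, publishing is the last step after the BAR memory has been registered with pci_p2pdma_add_resource(). A hedged sketch of a provider's probe path (the BAR number and use of the whole BAR length are assumptions):

#include <linux/pci-p2pdma.h>

static int example_provider_setup(struct pci_dev *pdev)
{
	int rc;

	/* Register all of BAR 4, starting at offset 0, as p2p memory. */
	rc = pci_p2pdma_add_resource(pdev, 4, pci_resource_len(pdev, 4), 0);
	if (rc)
		return rc;

	/* Make it visible to pci_p2pmem_find() users outside this driver. */
	pci_p2pmem_publish(pdev, true);
	return 0;
}
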
685
686/**
687 * pci_p2pdma_map_sg - map a PCI peer-to-peer scatterlist for DMA
688 * @dev: device doing the DMA request
689 * @sg: scatter list to map
690 * @nents: elements in the scatterlist
691 * @dir: DMA direction
692 *
693 * Scatterlists mapped with this function should not be unmapped in any way.
694 *
695 * Returns the number of SG entries mapped or 0 on error.
696 */
697int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
698 enum dma_data_direction dir)
699{
700 struct dev_pagemap *pgmap;
701 struct scatterlist *s;
702 phys_addr_t paddr;
703 int i;
704
705 /*
706 * p2pdma mappings are not compatible with devices that use
707 * dma_virt_ops. If the upper layers do the right thing
708 * this should never happen because it will be prevented
709 * by the check in pci_p2pdma_add_client()
710 */
711 if (WARN_ON_ONCE(IS_ENABLED(CONFIG_DMA_VIRT_OPS) &&
712 dev->dma_ops == &dma_virt_ops))
713 return 0;
714
715 for_each_sg(sg, s, nents, i) {
716 pgmap = sg_page(s)->pgmap;
717 paddr = sg_phys(s);
718
719 s->dma_address = paddr - pgmap->pci_p2pdma_bus_offset;
720 sg_dma_len(s) = s->length;
721 }
722
723 return nents;
724}
725EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg);
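
Callers that may see either ordinary memory or p2pdma pages in a scatterlist have to pick the mapping routine per request; the is_pci_p2pdma_page() helper added to mm.h in this patch is the natural test. A sketch that assumes the whole list holds one kind of memory (hypothetical helper name):

#include <linux/dma-mapping.h>
#include <linux/mm.h>
#include <linux/pci-p2pdma.h>
#include <linux/scatterlist.h>

static int example_map_request(struct device *dma_dev, struct scatterlist *sg,
			       int nents, enum dma_data_direction dir)
{
	/* p2p mappings are bus-offset based and must not be dma_unmap'd. */
	if (is_pci_p2pdma_page(sg_page(sg)))
		return pci_p2pdma_map_sg(dma_dev, sg, nents, dir);

	return dma_map_sg(dma_dev, sg, nents, dir);
}
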
726
727/**
728 * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store
729 * to enable p2pdma
730 * @page: contents of the value to be stored
731 * @p2p_dev: returns the PCI device that was selected to be used
732 * (if one was specified in the stored value)
733 * @use_p2pdma: returns whether to enable p2pdma or not
734 *
735 * Parses an attribute value to decide whether to enable p2pdma.
736 * The value can select a PCI device (using its full BDF device
737 * name) or a boolean (in any format strtobool() accepts). A false
738 * value disables p2pdma; a true value expects the caller to
739 * automatically find a compatible device; and specifying a PCI device
740 * expects the caller to use that specific provider.
741 *
742 * pci_p2pdma_enable_show() should be used as the show operation for
743 * the attribute.
744 *
745 * Returns 0 on success
746 */
747int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
748 bool *use_p2pdma)
749{
750 struct device *dev;
751
752 dev = bus_find_device_by_name(&pci_bus_type, NULL, page);
753 if (dev) {
754 *use_p2pdma = true;
755 *p2p_dev = to_pci_dev(dev);
756
757 if (!pci_has_p2pmem(*p2p_dev)) {
758 pci_err(*p2p_dev,
759 "PCI device has no peer-to-peer memory: %s\n",
760 page);
761 pci_dev_put(*p2p_dev);
762 return -ENODEV;
763 }
764
765 return 0;
766 } else if ((page[0] == '0' || page[0] == '1') && !iscntrl(page[1])) {
767 /*
768 * If the user enters a PCI device that doesn't exist
769 * like "0000:01:00.1", we don't want strtobool to think
770 * it's a '0' when it's clearly not what the user wanted.
771 * So we require 0's and 1's to be exactly one character.
772 */
773 } else if (!strtobool(page, use_p2pdma)) {
774 return 0;
775 }
776
777 pr_err("No such PCI device: %.*s\n", (int)strcspn(page, "\n"), page);
778 return -ENODEV;
779}
780EXPORT_SYMBOL_GPL(pci_p2pdma_enable_store);
781
782/**
783 * pci_p2pdma_enable_show - show a configfs/sysfs attribute indicating
784 * whether p2pdma is enabled
785 * @page: buffer to print the value into
786 * @p2p_dev: the selected p2p device (NULL if no device is selected)
787 * @use_p2pdma: whether p2pdma has been enabled
788 *
789 * Attributes that use pci_p2pdma_enable_store() should use this function
790 * to show the value of the attribute.
791 *
792 * Returns 0 on success
793 */
794ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
795 bool use_p2pdma)
796{
797 if (!use_p2pdma)
798 return sprintf(page, "0\n");
799
800 if (!p2p_dev)
801 return sprintf(page, "1\n");
802
803 return sprintf(page, "%s\n", pci_name(p2p_dev));
804}
805EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
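
These two helpers are meant to back a single attribute. A hedged sketch of the store/show pair for a hypothetical "example_port" object (the structure and any locking are the caller's concern and are omitted here):

#include <linux/pci-p2pdma.h>

struct example_port {
	struct pci_dev *p2p_dev;	/* explicitly selected provider, or NULL */
	bool use_p2pmem;
};

static ssize_t example_p2pmem_show(struct example_port *port, char *page)
{
	return pci_p2pdma_enable_show(page, port->p2p_dev, port->use_p2pmem);
}

static ssize_t example_p2pmem_store(struct example_port *port,
				    const char *page, size_t count)
{
	struct pci_dev *p2p_dev = NULL;
	bool use_p2pmem;
	int rc;

	rc = pci_p2pdma_enable_store(page, &p2p_dev, &use_p2pmem);
	if (rc)
		return rc;

	pci_dev_put(port->p2p_dev);	/* drop any previously chosen provider */
	port->p2p_dev = p2p_dev;
	port->use_p2pmem = use_p2pmem;
	return count;
}
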
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6980014357d4..c32f7171899b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -699,6 +699,7 @@ struct request_queue {
699#define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ 699#define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */
700#define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ 700#define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */
701#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */ 701#define QUEUE_FLAG_PREEMPT_ONLY 29 /* only process REQ_PREEMPT requests */
702#define QUEUE_FLAG_PCI_P2PDMA 30 /* device supports PCI p2p requests */
702 703
703#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 704#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
704 (1 << QUEUE_FLAG_SAME_COMP) | \ 705 (1 << QUEUE_FLAG_SAME_COMP) | \
@@ -731,6 +732,8 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
731#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) 732#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
732#define blk_queue_scsi_passthrough(q) \ 733#define blk_queue_scsi_passthrough(q) \
733 test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags) 734 test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags)
735#define blk_queue_pci_p2pdma(q) \
736 test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
734 737
735#define blk_noretry_request(rq) \ 738#define blk_noretry_request(rq) \
736 ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ 739 ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index f91f9e763557..0ac69ddf5fc4 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -53,11 +53,16 @@ struct vmem_altmap {
53 * wakeup event whenever a page is unpinned and becomes idle. This 53 * wakeup event whenever a page is unpinned and becomes idle. This
54 * wakeup is used to coordinate physical address space management (ex: 54 * wakeup is used to coordinate physical address space management (ex:
55 * fs truncate/hole punch) vs pinned pages (ex: device dma). 55 * fs truncate/hole punch) vs pinned pages (ex: device dma).
56 *
57 * MEMORY_DEVICE_PCI_P2PDMA:
58 * Device memory residing in a PCI BAR intended for use with Peer-to-Peer
59 * transactions.
56 */ 60 */
57enum memory_type { 61enum memory_type {
58 MEMORY_DEVICE_PRIVATE = 1, 62 MEMORY_DEVICE_PRIVATE = 1,
59 MEMORY_DEVICE_PUBLIC, 63 MEMORY_DEVICE_PUBLIC,
60 MEMORY_DEVICE_FS_DAX, 64 MEMORY_DEVICE_FS_DAX,
65 MEMORY_DEVICE_PCI_P2PDMA,
61}; 66};
62 67
63/* 68/*
@@ -120,6 +125,7 @@ struct dev_pagemap {
120 struct device *dev; 125 struct device *dev;
121 void *data; 126 void *data;
122 enum memory_type type; 127 enum memory_type type;
128 u64 pci_p2pdma_bus_offset;
123}; 129};
124 130
125#ifdef CONFIG_ZONE_DEVICE 131#ifdef CONFIG_ZONE_DEVICE
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a61ebe8ad4ca..2055df412a77 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -890,6 +890,19 @@ static inline bool is_device_public_page(const struct page *page)
890 page->pgmap->type == MEMORY_DEVICE_PUBLIC; 890 page->pgmap->type == MEMORY_DEVICE_PUBLIC;
891} 891}
892 892
893#ifdef CONFIG_PCI_P2PDMA
894static inline bool is_pci_p2pdma_page(const struct page *page)
895{
896 return is_zone_device_page(page) &&
897 page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
898}
899#else /* CONFIG_PCI_P2PDMA */
900static inline bool is_pci_p2pdma_page(const struct page *page)
901{
902 return false;
903}
904#endif /* CONFIG_PCI_P2PDMA */
905
893#else /* CONFIG_DEV_PAGEMAP_OPS */ 906#else /* CONFIG_DEV_PAGEMAP_OPS */
894static inline void dev_pagemap_get_ops(void) 907static inline void dev_pagemap_get_ops(void)
895{ 908{
@@ -913,6 +926,11 @@ static inline bool is_device_public_page(const struct page *page)
913{ 926{
914 return false; 927 return false;
915} 928}
929
930static inline bool is_pci_p2pdma_page(const struct page *page)
931{
932 return false;
933}
916#endif /* CONFIG_DEV_PAGEMAP_OPS */ 934#endif /* CONFIG_DEV_PAGEMAP_OPS */
917 935
918static inline void get_page(struct page *page) 936static inline void get_page(struct page *page)
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
new file mode 100644
index 000000000000..bca9bc3e5be7
--- /dev/null
+++ b/include/linux/pci-p2pdma.h
@@ -0,0 +1,114 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * PCI Peer 2 Peer DMA support.
4 *
5 * Copyright (c) 2016-2018, Logan Gunthorpe
6 * Copyright (c) 2016-2017, Microsemi Corporation
7 * Copyright (c) 2017, Christoph Hellwig
8 * Copyright (c) 2018, Eideticom Inc.
9 */
10
11#ifndef _LINUX_PCI_P2PDMA_H
12#define _LINUX_PCI_P2PDMA_H
13
14#include <linux/pci.h>
15
16struct block_device;
17struct scatterlist;
18
19#ifdef CONFIG_PCI_P2PDMA
20int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
21 u64 offset);
22int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients,
23 int num_clients, bool verbose);
24bool pci_has_p2pmem(struct pci_dev *pdev);
25struct pci_dev *pci_p2pmem_find_many(struct device **clients, int num_clients);
26void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size);
27void pci_free_p2pmem(struct pci_dev *pdev, void *addr, size_t size);
28pci_bus_addr_t pci_p2pmem_virt_to_bus(struct pci_dev *pdev, void *addr);
29struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev,
30 unsigned int *nents, u32 length);
31void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl);
32void pci_p2pmem_publish(struct pci_dev *pdev, bool publish);
33int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
34 enum dma_data_direction dir);
35int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
36 bool *use_p2pdma);
37ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
38 bool use_p2pdma);
39#else /* CONFIG_PCI_P2PDMA */
40static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
41 size_t size, u64 offset)
42{
43 return -EOPNOTSUPP;
44}
45static inline int pci_p2pdma_distance_many(struct pci_dev *provider,
46 struct device **clients, int num_clients, bool verbose)
47{
48 return -1;
49}
50static inline bool pci_has_p2pmem(struct pci_dev *pdev)
51{
52 return false;
53}
54static inline struct pci_dev *pci_p2pmem_find_many(struct device **clients,
55 int num_clients)
56{
57 return NULL;
58}
59static inline void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size)
60{
61 return NULL;
62}
63static inline void pci_free_p2pmem(struct pci_dev *pdev, void *addr,
64 size_t size)
65{
66}
67static inline pci_bus_addr_t pci_p2pmem_virt_to_bus(struct pci_dev *pdev,
68 void *addr)
69{
70 return 0;
71}
72static inline struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev,
73 unsigned int *nents, u32 length)
74{
75 return NULL;
76}
77static inline void pci_p2pmem_free_sgl(struct pci_dev *pdev,
78 struct scatterlist *sgl)
79{
80}
81static inline void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
82{
83}
84static inline int pci_p2pdma_map_sg(struct device *dev,
85 struct scatterlist *sg, int nents, enum dma_data_direction dir)
86{
87 return 0;
88}
89static inline int pci_p2pdma_enable_store(const char *page,
90 struct pci_dev **p2p_dev, bool *use_p2pdma)
91{
92 *use_p2pdma = false;
93 return 0;
94}
95static inline ssize_t pci_p2pdma_enable_show(char *page,
96 struct pci_dev *p2p_dev, bool use_p2pdma)
97{
98 return sprintf(page, "none\n");
99}
100#endif /* CONFIG_PCI_P2PDMA */
101
102
103static inline int pci_p2pdma_distance(struct pci_dev *provider,
104 struct device *client, bool verbose)
105{
106 return pci_p2pdma_distance_many(provider, &client, 1, verbose);
107}
108
109static inline struct pci_dev *pci_p2pmem_find(struct device *client)
110{
111 return pci_p2pmem_find_many(&client, 1);
112}
113
114#endif /* _LINUX_PCI_P2P_H */
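
For the common single-client case the header also provides thin inline wrappers, so a driver with exactly one DMA device need not build a client array. A small hedged sketch (hypothetical helper names):

#include <linux/pci-p2pdma.h>

static struct pci_dev *example_single_client_provider(struct device *dma_dev)
{
	/* NULL if nothing compatible is published; caller owns the reference. */
	return pci_p2pmem_find(dma_dev);
}

static bool example_single_client_ok(struct pci_dev *provider,
				     struct device *dma_dev)
{
	return pci_p2pdma_distance(provider, dma_dev, true) >= 0;
}
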
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7c4802de1e3a..f9e04c170301 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -281,6 +281,7 @@ struct pcie_link_state;
281struct pci_vpd; 281struct pci_vpd;
282struct pci_sriov; 282struct pci_sriov;
283struct pci_ats; 283struct pci_ats;
284struct pci_p2pdma;
284 285
285/* The pci_dev structure describes PCI devices */ 286/* The pci_dev structure describes PCI devices */
286struct pci_dev { 287struct pci_dev {
@@ -441,6 +442,9 @@ struct pci_dev {
441#ifdef CONFIG_PCI_PASID 442#ifdef CONFIG_PCI_PASID
442 u16 pasid_features; 443 u16 pasid_features;
443#endif 444#endif
445#ifdef CONFIG_PCI_P2PDMA
446 struct pci_p2pdma *p2pdma;
447#endif
444 phys_addr_t rom; /* Physical address if not from BAR */ 448 phys_addr_t rom; /* Physical address if not from BAR */
445 size_t romlen; /* Length if not from BAR */ 449 size_t romlen; /* Length if not from BAR */
446 char *driver_override; /* Driver name to force a match */ 450 char *driver_override; /* Driver name to force a match */