author	Dave Jiang <dave.jiang@intel.com>	2016-01-13 15:29:48 -0500
committer	Jon Mason <jdmason@kudzu.us>	2016-01-17 22:08:05 -0500
commit	8a7b6a778a8519a879c7b6764a11c0d39eead95f (patch)
tree	00470c865997dda2394da0c8157a660a4b4d2f18 /drivers/ntb
parent	8c874cc140d667f84ae4642bb5b5e0d6396d2ca4 (diff)
ntb: ntb perf tool
Provide raw performance data via a tool that accesses the NTB memory window directly, without any software overhead. This allows measurement of the hardware performance limit. In this first revision only single-direction CPU and DMA writes are supported; bi-directional writes will be added later. When the DMA engine is used, the result does not reflect the raw performance of the DMA engine over NTB because of software overhead, but it should represent the peak performance attainable through the Linux DMA driver.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Allen Hubbe <Allen.Hubbe@emc.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
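A minimal userspace sketch of how the tool is exercised once the module is loaded: the driver creates per-device "threads" and "run" files under debugfs and reports per-thread throughput through the kernel log. The PCI device name below is a placeholder for whatever NTB device is present on your system.

/* Minimal sketch: drive ntb_perf through the debugfs files created by
 * perf_debugfs_setup(). "0000:00:03.0" is a placeholder; the real path
 * is /sys/kernel/debug/ntb_perf/<pci id of the NTB device>. Results
 * are printed to the kernel log by the measurement threads.
 */
#include <stdio.h>

#define PERF_DIR "/sys/kernel/debug/ntb_perf/0000:00:03.0"

static int write_file(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Request four measurement threads (the driver caps this at 32). */
	if (write_file(PERF_DIR "/threads", "4"))
		return 1;

	/* A write to "run" starts the kthreads, or stops a run already in
	 * progress; per-thread MBytes/s figures land in dmesg.
	 */
	if (write_file(PERF_DIR "/run", "1"))
		return 1;

	puts("ntb_perf run started; see dmesg for results");
	return 0;
}

The seg_order, run_order, and use_dma module parameters select the segment size, the total transfer size, and the CPU vs. DMA engine path at load time.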
Diffstat (limited to 'drivers/ntb')
-rw-r--r--	drivers/ntb/test/Kconfig	8
-rw-r--r--	drivers/ntb/test/Makefile	1
-rw-r--r--	drivers/ntb/test/ntb_perf.c	748
3 files changed, 757 insertions(+), 0 deletions(-)
diff --git a/drivers/ntb/test/Kconfig b/drivers/ntb/test/Kconfig
index 01852f98a843..a5d0eda44438 100644
--- a/drivers/ntb/test/Kconfig
+++ b/drivers/ntb/test/Kconfig
@@ -17,3 +17,11 @@ config NTB_TOOL
 	 functioning at a basic level.
 
 	 If unsure, say N.
+
+config NTB_PERF
+	tristate "NTB RAW Perf Measuring Tool"
+	help
+	 This is a tool to measure raw NTB performance by transferring data
+	 to and from the window without additional software interaction.
+
+	 If unsure, say N.
diff --git a/drivers/ntb/test/Makefile b/drivers/ntb/test/Makefile
index 0ea32a324b6c..9e77e0b761c2 100644
--- a/drivers/ntb/test/Makefile
+++ b/drivers/ntb/test/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_NTB_PINGPONG) += ntb_pingpong.o
 obj-$(CONFIG_NTB_TOOL) += ntb_tool.o
+obj-$(CONFIG_NTB_PERF) += ntb_perf.o
diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
new file mode 100644
index 000000000000..c8a37ba4b4f9
--- /dev/null
+++ b/drivers/ntb/test/ntb_perf.c
@@ -0,0 +1,748 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *   PCIe NTB Perf Linux driver
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+#include <linux/dma-mapping.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/dmaengine.h>
+#include <linux/delay.h>
+#include <linux/sizes.h>
+#include <linux/ntb.h>
+
+#define DRIVER_NAME		"ntb_perf"
+#define DRIVER_DESCRIPTION	"PCIe NTB Performance Measurement Tool"
+
+#define DRIVER_LICENSE		"Dual BSD/GPL"
+#define DRIVER_VERSION		"1.0"
+#define DRIVER_AUTHOR		"Dave Jiang <dave.jiang@intel.com>"
+
+#define PERF_LINK_DOWN_TIMEOUT	10
+#define PERF_VERSION		0xffff0001
+#define MAX_THREADS		32
+#define MAX_TEST_SIZE		SZ_1M
+#define MAX_SRCS		32
+#define DMA_OUT_RESOURCE_TO	50
+#define DMA_RETRIES		20
+#define SZ_4G			(1ULL << 32)
+#define MAX_SEG_ORDER		20 /* no larger than 1M for kmalloc buffer */
+
+MODULE_LICENSE(DRIVER_LICENSE);
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESCRIPTION);
+
+static struct dentry *perf_debugfs_dir;
+
+static unsigned int seg_order = 19; /* 512K */
+module_param(seg_order, uint, 0644);
+MODULE_PARM_DESC(seg_order, "size order [n^2] of buffer segment for testing");
+
+static unsigned int run_order = 32; /* 4G */
+module_param(run_order, uint, 0644);
+MODULE_PARM_DESC(run_order, "size order [n^2] of total data to transfer");
+
+static bool use_dma; /* default to 0 */
+module_param(use_dma, bool, 0644);
+MODULE_PARM_DESC(use_dma, "Using DMA engine to measure performance");
+
+struct perf_mw {
+	phys_addr_t	phys_addr;
+	resource_size_t	phys_size;
+	resource_size_t	xlat_align;
+	resource_size_t	xlat_align_size;
+	void __iomem	*vbase;
+	size_t		xlat_size;
+	size_t		buf_size;
+	void		*virt_addr;
+	dma_addr_t	dma_addr;
+};
+
+struct perf_ctx;
+
+struct pthr_ctx {
+	struct task_struct	*thread;
+	struct perf_ctx		*perf;
+	atomic_t		dma_sync;
+	struct dma_chan		*dma_chan;
+	int			dma_prep_err;
+	int			src_idx;
+	void			*srcs[MAX_SRCS];
+};
+
+struct perf_ctx {
+	struct ntb_dev		*ntb;
+	spinlock_t		db_lock;
+	struct perf_mw		mw;
+	bool			link_is_up;
+	struct work_struct	link_cleanup;
+	struct delayed_work	link_work;
+	struct dentry		*debugfs_node_dir;
+	struct dentry		*debugfs_run;
+	struct dentry		*debugfs_threads;
+	u8			perf_threads;
+	bool			run;
+	struct pthr_ctx		pthr_ctx[MAX_THREADS];
+	atomic_t		tsync;
+};
+
+enum {
+	VERSION = 0,
+	MW_SZ_HIGH,
+	MW_SZ_LOW,
+	SPAD_MSG,
+	SPAD_ACK,
+	MAX_SPAD
+};
+
+static void perf_link_event(void *ctx)
+{
+	struct perf_ctx *perf = ctx;
+
+	if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1)
+		schedule_delayed_work(&perf->link_work, 2*HZ);
+	else
+		schedule_work(&perf->link_cleanup);
+}
+
+static void perf_db_event(void *ctx, int vec)
+{
+	struct perf_ctx *perf = ctx;
+	u64 db_bits, db_mask;
+
+	db_mask = ntb_db_vector_mask(perf->ntb, vec);
+	db_bits = ntb_db_read(perf->ntb);
+
+	dev_dbg(&perf->ntb->dev, "doorbell vec %d mask %#llx bits %#llx\n",
+		vec, db_mask, db_bits);
+}
+
+static const struct ntb_ctx_ops perf_ops = {
+	.link_event = perf_link_event,
+	.db_event = perf_db_event,
+};
+
+static void perf_copy_callback(void *data)
+{
+	struct pthr_ctx *pctx = data;
+
+	atomic_dec(&pctx->dma_sync);
+}
+
+static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst,
+			 char *src, size_t size)
+{
+	struct perf_ctx *perf = pctx->perf;
+	struct dma_async_tx_descriptor *txd;
+	struct dma_chan *chan = pctx->dma_chan;
+	struct dma_device *device;
+	struct dmaengine_unmap_data *unmap;
+	dma_cookie_t cookie;
+	size_t src_off, dst_off;
+	struct perf_mw *mw = &perf->mw;
+	u64 vbase, dst_vaddr;
+	dma_addr_t dst_phys;
+	int retries = 0;
+
+	if (!use_dma) {
+		memcpy_toio(dst, src, size);
+		return size;
+	}
+
+	if (!chan) {
+		dev_err(&perf->ntb->dev, "DMA engine does not exist\n");
+		return -EINVAL;
+	}
+
+	device = chan->device;
+	src_off = (size_t)src & ~PAGE_MASK;
+	dst_off = (size_t)dst & ~PAGE_MASK;
+
+	if (!is_dma_copy_aligned(device, src_off, dst_off, size))
+		return -ENODEV;
+
+	vbase = (u64)(u64 *)mw->vbase;
+	dst_vaddr = (u64)(u64 *)dst;
+	dst_phys = mw->phys_addr + (dst_vaddr - vbase);
+
+	unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT);
+	if (!unmap)
+		return -ENOMEM;
+
+	unmap->len = size;
+	unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src),
+				      src_off, size, DMA_TO_DEVICE);
+	if (dma_mapping_error(device->dev, unmap->addr[0]))
+		goto err_get_unmap;
+
+	unmap->to_cnt = 1;
+
+	do {
+		txd = device->device_prep_dma_memcpy(chan, dst_phys,
+						     unmap->addr[0],
+						     size, DMA_PREP_INTERRUPT);
+		if (!txd) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(DMA_OUT_RESOURCE_TO);
+		}
+	} while (!txd && (++retries < DMA_RETRIES));
+
+	if (!txd) {
+		pctx->dma_prep_err++;
+		goto err_get_unmap;
+	}
+
+	txd->callback = perf_copy_callback;
+	txd->callback_param = pctx;
+	dma_set_unmap(txd, unmap);
+
+	cookie = dmaengine_submit(txd);
+	if (dma_submit_error(cookie))
+		goto err_set_unmap;
+
+	atomic_inc(&pctx->dma_sync);
+	dma_async_issue_pending(chan);
+
+	return size;
+
+err_set_unmap:
+	dmaengine_unmap_put(unmap);
+err_get_unmap:
+	dmaengine_unmap_put(unmap);
+	return 0;
+}
+
+static int perf_move_data(struct pthr_ctx *pctx, char *dst, char *src,
+			  u64 buf_size, u64 win_size, u64 total)
+{
+	int chunks, total_chunks, i;
+	int copied_chunks = 0;
+	u64 copied = 0, result;
+	char *tmp = dst;
+	u64 perf, diff_us;
+	ktime_t kstart, kstop, kdiff;
+
+	chunks = div64_u64(win_size, buf_size);
+	total_chunks = div64_u64(total, buf_size);
+	kstart = ktime_get();
+
+	for (i = 0; i < total_chunks; i++) {
+		result = perf_copy(pctx, tmp, src, buf_size);
+		copied += result;
+		copied_chunks++;
+		if (copied_chunks == chunks) {
+			tmp = dst;
+			copied_chunks = 0;
+		} else
+			tmp += buf_size;
+
+		/* Probably should schedule every 4GB to prevent soft hang. */
+		if (((copied % SZ_4G) == 0) && !use_dma) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(1);
+		}
+	}
+
+	if (use_dma) {
+		pr_info("%s: All DMA descriptors submitted\n", current->comm);
+		while (atomic_read(&pctx->dma_sync) != 0)
+			msleep(20);
+	}
+
+	kstop = ktime_get();
+	kdiff = ktime_sub(kstop, kstart);
+	diff_us = ktime_to_us(kdiff);
+
+	pr_info("%s: copied %llu bytes\n", current->comm, copied);
+
+	pr_info("%s: lasted %llu usecs\n", current->comm, diff_us);
+
+	perf = div64_u64(copied, diff_us);
+
+	pr_info("%s: MBytes/s: %llu\n", current->comm, perf);
+
+	return 0;
+}
+
+static bool perf_dma_filter_fn(struct dma_chan *chan, void *node)
+{
+	return dev_to_node(&chan->dev->device) == (int)(unsigned long)node;
+}
+
+static int ntb_perf_thread(void *data)
+{
+	struct pthr_ctx *pctx = data;
+	struct perf_ctx *perf = pctx->perf;
+	struct pci_dev *pdev = perf->ntb->pdev;
+	struct perf_mw *mw = &perf->mw;
+	char *dst;
+	u64 win_size, buf_size, total;
+	void *src;
+	int rc, node, i;
+	struct dma_chan *dma_chan = NULL;
+
+	pr_info("kthread %s starting...\n", current->comm);
+
+	node = dev_to_node(&pdev->dev);
+
+	if (use_dma && !pctx->dma_chan) {
+		dma_cap_mask_t dma_mask;
+
+		dma_cap_zero(dma_mask);
+		dma_cap_set(DMA_MEMCPY, dma_mask);
+		dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn,
+					       (void *)(unsigned long)node);
+		if (!dma_chan) {
+			pr_warn("%s: cannot acquire DMA channel, quitting\n",
+				current->comm);
+			return -ENODEV;
+		}
+		pctx->dma_chan = dma_chan;
+	}
+
+	for (i = 0; i < MAX_SRCS; i++) {
+		pctx->srcs[i] = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node);
+		if (!pctx->srcs[i]) {
+			rc = -ENOMEM;
+			goto err;
+		}
+	}
+
+	win_size = mw->phys_size;
+	buf_size = 1ULL << seg_order;
+	total = 1ULL << run_order;
+
+	if (buf_size > MAX_TEST_SIZE)
+		buf_size = MAX_TEST_SIZE;
+
+	dst = (char *)mw->vbase;
+
+	atomic_inc(&perf->tsync);
+	while (atomic_read(&perf->tsync) != perf->perf_threads)
+		schedule();
+
+	src = pctx->srcs[pctx->src_idx];
+	pctx->src_idx = (pctx->src_idx + 1) & (MAX_SRCS - 1);
+
+	rc = perf_move_data(pctx, dst, src, buf_size, win_size, total);
+
+	atomic_dec(&perf->tsync);
+
+	if (rc < 0) {
+		pr_err("%s: failed\n", current->comm);
+		rc = -ENXIO;
+		goto err;
+	}
+
+	for (i = 0; i < MAX_SRCS; i++) {
+		kfree(pctx->srcs[i]);
+		pctx->srcs[i] = NULL;
+	}
+
+	return 0;
+
+err:
+	for (i = 0; i < MAX_SRCS; i++) {
+		kfree(pctx->srcs[i]);
+		pctx->srcs[i] = NULL;
+	}
+
+	if (dma_chan) {
+		dma_release_channel(dma_chan);
+		pctx->dma_chan = NULL;
+	}
+
+	return rc;
+}
+
+static void perf_free_mw(struct perf_ctx *perf)
+{
+	struct perf_mw *mw = &perf->mw;
+	struct pci_dev *pdev = perf->ntb->pdev;
+
+	if (!mw->virt_addr)
+		return;
+
+	ntb_mw_clear_trans(perf->ntb, 0);
+	dma_free_coherent(&pdev->dev, mw->buf_size,
+			  mw->virt_addr, mw->dma_addr);
+	mw->xlat_size = 0;
+	mw->buf_size = 0;
+	mw->virt_addr = NULL;
+}
+
+static int perf_set_mw(struct perf_ctx *perf, resource_size_t size)
+{
+	struct perf_mw *mw = &perf->mw;
+	size_t xlat_size, buf_size;
+
+	if (!size)
+		return -EINVAL;
+
+	xlat_size = round_up(size, mw->xlat_align_size);
+	buf_size = round_up(size, mw->xlat_align);
+
+	if (mw->xlat_size == xlat_size)
+		return 0;
+
+	if (mw->buf_size)
+		perf_free_mw(perf);
+
+	mw->xlat_size = xlat_size;
+	mw->buf_size = buf_size;
+
+	mw->virt_addr = dma_alloc_coherent(&perf->ntb->pdev->dev, buf_size,
+					   &mw->dma_addr, GFP_KERNEL);
+	if (!mw->virt_addr) {
+		mw->xlat_size = 0;
+		mw->buf_size = 0;
+	}
+
+	return 0;
+}
+
+static void perf_link_work(struct work_struct *work)
+{
+	struct perf_ctx *perf =
+		container_of(work, struct perf_ctx, link_work.work);
+	struct ntb_dev *ndev = perf->ntb;
+	struct pci_dev *pdev = ndev->pdev;
+	u32 val;
+	u64 size;
+	int rc;
+
+	dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
+
+	size = perf->mw.phys_size;
+	ntb_peer_spad_write(ndev, MW_SZ_HIGH, upper_32_bits(size));
+	ntb_peer_spad_write(ndev, MW_SZ_LOW, lower_32_bits(size));
+	ntb_peer_spad_write(ndev, VERSION, PERF_VERSION);
+
+	/* now read what peer wrote */
+	val = ntb_spad_read(ndev, VERSION);
+	if (val != PERF_VERSION) {
+		dev_dbg(&pdev->dev, "Remote version = %#x\n", val);
+		goto out;
+	}
+
+	val = ntb_spad_read(ndev, MW_SZ_HIGH);
+	size = (u64)val << 32;
+
+	val = ntb_spad_read(ndev, MW_SZ_LOW);
+	size |= val;
+
+	dev_dbg(&pdev->dev, "Remote MW size = %#llx\n", size);
+
+	rc = perf_set_mw(perf, size);
+	if (rc)
+		goto out1;
+
+	perf->link_is_up = true;
+
+	return;
+
+out1:
+	perf_free_mw(perf);
+
+out:
+	if (ntb_link_is_up(ndev, NULL, NULL) == 1)
+		schedule_delayed_work(&perf->link_work,
+				      msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT));
+}
+
+static void perf_link_cleanup(struct work_struct *work)
+{
+	struct perf_ctx *perf = container_of(work,
+					     struct perf_ctx,
+					     link_cleanup);
+
+	dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__);
+
+	if (!perf->link_is_up)
+		cancel_delayed_work_sync(&perf->link_work);
+}
+
+static int perf_setup_mw(struct ntb_dev *ntb, struct perf_ctx *perf)
+{
+	struct perf_mw *mw;
+	int rc;
+
+	mw = &perf->mw;
+
+	rc = ntb_mw_get_range(ntb, 0, &mw->phys_addr, &mw->phys_size,
+			      &mw->xlat_align, &mw->xlat_align_size);
+	if (rc)
+		return rc;
+
+	perf->mw.vbase = ioremap_wc(mw->phys_addr, mw->phys_size);
+	if (!mw->vbase)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
+				size_t count, loff_t *offp)
+{
+	struct perf_ctx *perf = filp->private_data;
+	char *buf;
+	ssize_t ret, out_offset;
+
+	if (!perf)
+		return 0;
+
+	buf = kmalloc(64, GFP_KERNEL);
+	out_offset = snprintf(buf, 64, "%d\n", perf->run);
+	ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
+	kfree(buf);
+
+	return ret;
+}
+
+static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
+				 size_t count, loff_t *offp)
+{
+	struct perf_ctx *perf = filp->private_data;
+	int node, i;
+
+	if (!perf->link_is_up)
+		return 0;
+
+	if (perf->perf_threads == 0)
+		return 0;
+
+	if (atomic_read(&perf->tsync) == 0)
+		perf->run = false;
+
+	if (perf->run) {
+		/* lets stop the threads */
+		perf->run = false;
+		for (i = 0; i < MAX_THREADS; i++) {
+			if (perf->pthr_ctx[i].thread) {
+				kthread_stop(perf->pthr_ctx[i].thread);
+				perf->pthr_ctx[i].thread = NULL;
+			} else
+				break;
+		}
+	} else {
+		perf->run = true;
+
+		if (perf->perf_threads > MAX_THREADS) {
+			perf->perf_threads = MAX_THREADS;
+			pr_info("Reset total threads to: %u\n", MAX_THREADS);
+		}
+
+		/* no greater than 1M */
+		if (seg_order > MAX_SEG_ORDER) {
+			seg_order = MAX_SEG_ORDER;
+			pr_info("Fix seg_order to %u\n", seg_order);
+		}
+
+		if (run_order < seg_order) {
+			run_order = seg_order;
+			pr_info("Fix run_order to %u\n", run_order);
+		}
+
+		node = dev_to_node(&perf->ntb->pdev->dev);
+		/* launch kernel thread */
+		for (i = 0; i < perf->perf_threads; i++) {
+			struct pthr_ctx *pctx;
+
+			pctx = &perf->pthr_ctx[i];
+			atomic_set(&pctx->dma_sync, 0);
+			pctx->perf = perf;
+			pctx->thread =
+				kthread_create_on_node(ntb_perf_thread,
+						       (void *)pctx,
+						       node, "ntb_perf %d", i);
+			if (pctx->thread)
+				wake_up_process(pctx->thread);
+			else {
+				perf->run = false;
+				for (i = 0; i < MAX_THREADS; i++) {
+					if (pctx->thread) {
+						kthread_stop(pctx->thread);
+						pctx->thread = NULL;
+					}
+				}
+			}
+
+			if (perf->run == false)
+				return -ENXIO;
+		}
+
+	}
+
+	return count;
+}
+
+static const struct file_operations ntb_perf_debugfs_run = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = debugfs_run_read,
+	.write = debugfs_run_write,
+};
+
+static int perf_debugfs_setup(struct perf_ctx *perf)
+{
+	struct pci_dev *pdev = perf->ntb->pdev;
+
+	if (!debugfs_initialized())
+		return -ENODEV;
+
+	if (!perf_debugfs_dir) {
+		perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+		if (!perf_debugfs_dir)
+			return -ENODEV;
+	}
+
+	perf->debugfs_node_dir = debugfs_create_dir(pci_name(pdev),
+						    perf_debugfs_dir);
+	if (!perf->debugfs_node_dir)
+		return -ENODEV;
+
+	perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR,
+						perf->debugfs_node_dir, perf,
+						&ntb_perf_debugfs_run);
+	if (!perf->debugfs_run)
+		return -ENODEV;
+
+	perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR,
+						  perf->debugfs_node_dir,
+						  &perf->perf_threads);
+	if (!perf->debugfs_threads)
+		return -ENODEV;
+
+	return 0;
+}
+
+static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb)
+{
+	struct pci_dev *pdev = ntb->pdev;
+	struct perf_ctx *perf;
+	int node;
+	int rc = 0;
+
+	node = dev_to_node(&pdev->dev);
+
+	perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, node);
+	if (!perf) {
+		rc = -ENOMEM;
+		goto err_perf;
+	}
+
+	perf->ntb = ntb;
+	perf->perf_threads = 1;
+	atomic_set(&perf->tsync, 0);
+	perf->run = false;
+	spin_lock_init(&perf->db_lock);
+	perf_setup_mw(ntb, perf);
+	INIT_DELAYED_WORK(&perf->link_work, perf_link_work);
+	INIT_WORK(&perf->link_cleanup, perf_link_cleanup);
+
+	rc = ntb_set_ctx(ntb, perf, &perf_ops);
+	if (rc)
+		goto err_ctx;
+
+	perf->link_is_up = false;
+	ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+	ntb_link_event(ntb);
+
+	rc = perf_debugfs_setup(perf);
+	if (rc)
+		goto err_ctx;
+
+	return 0;
+
+err_ctx:
+	cancel_delayed_work_sync(&perf->link_work);
+	cancel_work_sync(&perf->link_cleanup);
+	kfree(perf);
+err_perf:
+	return rc;
+}
+
+static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb)
+{
+	struct perf_ctx *perf = ntb->ctx;
+	int i;
+
+	dev_dbg(&perf->ntb->dev, "%s called\n", __func__);
+
+	cancel_delayed_work_sync(&perf->link_work);
+	cancel_work_sync(&perf->link_cleanup);
+
+	ntb_clear_ctx(ntb);
+	ntb_link_disable(ntb);
+
+	debugfs_remove_recursive(perf_debugfs_dir);
+	perf_debugfs_dir = NULL;
+
+	if (use_dma) {
+		for (i = 0; i < MAX_THREADS; i++) {
+			struct pthr_ctx *pctx = &perf->pthr_ctx[i];
+
+			if (pctx->dma_chan)
+				dma_release_channel(pctx->dma_chan);
+		}
+	}
+
+	kfree(perf);
+}
+
+static struct ntb_client perf_client = {
+	.ops = {
+		.probe = perf_probe,
+		.remove = perf_remove,
+	},
+};
+module_ntb_client(perf_client);