Diffstat (limited to 'drivers/gpu/nvgpu/os')
-rw-r--r--  drivers/gpu/nvgpu/os/linux/dmabuf.c   |   4 +-
-rw-r--r--  drivers/gpu/nvgpu/os/linux/ioctl_as.c | 197 +++++++++++++++
-rw-r--r--  drivers/gpu/nvgpu/os/linux/swap.h     | 117 +++++++++
-rw-r--r--  drivers/gpu/nvgpu/os/linux/vm.c       |  70 ++++++-
4 files changed, 386 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/nvgpu/os/linux/dmabuf.c b/drivers/gpu/nvgpu/os/linux/dmabuf.c
index e8e33130..08f78ae6 100644
--- a/drivers/gpu/nvgpu/os/linux/dmabuf.c
+++ b/drivers/gpu/nvgpu/os/linux/dmabuf.c
@@ -124,8 +124,10 @@ void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
 	struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
 	dma_addr_t dma_addr;
 
-	if (IS_ERR(priv) || !priv)
+	if (IS_ERR(priv) || !priv) {
+		printk(KERN_ERR "nvgpu: Unable to access priv in gk20a_mm_unpin()\n");
 		return;
+	}
 
 	nvgpu_mutex_acquire(&priv->lock);
 	WARN_ON(priv->sgt != sgt);
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_as.c b/drivers/gpu/nvgpu/os/linux/ioctl_as.c
index f0cec178..9708ea1a 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_as.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_as.c
@@ -32,6 +32,9 @@
 #include "platform_gk20a.h"
 #include "ioctl_as.h"
 #include "os_linux.h"
+#include <linux/nvmap.h> // For nvmap_{de,re}alloc_dmabuf()
+#include "dmabuf.h" // struct dma_buf things for swapping
+#include "swap.h"
 
 static u32 gk20a_as_translate_as_alloc_space_flags(struct gk20a *g, u32 flags)
 {
@@ -329,6 +332,192 @@ int gk20a_as_dev_release(struct inode *inode, struct file *filp)
 
 	return gk20a_as_release_share(as_share);
 }
+#define OLD_WALK 0
+
+/* Access the dmabuf associated with the passed file descriptor, copy the
+ * associated pages to an NVMe drive, unpin the pages from DMA'able space,
+ * and free them for use by others.
+ * The dmabuf is put in a deallocated state, and any GPU mappings will be
+ * invalidated. To restore the dmabuf, see nvgpu_as_ioctl_read_swap_buffer().
+ */
+static int nvgpu_as_ioctl_write_swap_buffer(
+		struct gk20a_as_share *as_share,
+		struct nvgpu_as_swap_buffer_args *args)
+{
+	struct gk20a *g = gk20a_from_vm(as_share->vm);
+	int err = 0;
+#if OLD_WALK
+	struct nvgpu_rbtree_node *node;
+#endif
+	struct nvgpu_mapped_buf *m;
+	struct sg_table *sgt;
+	struct vm_gk20a *vm = as_share->vm;
+	struct dma_buf *dmabuf = dma_buf_get(args->dmabuf_fd);
+
+	nvgpu_log_fn(g, " ");
+
+	if (IS_ERR(dmabuf))
+		return PTR_ERR(dmabuf);
+
+	// Other code walking vm->mapped_buffers grabs this lock
+	nvgpu_mutex_acquire(&vm->update_gmmu_lock);
+
+#if OLD_WALK
+	// Get the mapped buffer corresponding to this dmabuf
+	// TODO: Error on buffers mapped more than once
+	for_each_buffer(node, vm->mapped_buffers, m) {
+		if (m->os_priv.dmabuf == dmabuf)
+			break;
+	}
+	// If the search failed
+	if (!node || !m) {
+		// No mapped dmabuf associated with the FD
+		err = -EBADFD;
+		goto out_put_unlock;
+	}
+#else
+	m = dmabuf_to_mapped_buf(dmabuf);
+	// If the search failed
+	if (IS_ERR(m)) {
+		// No mapped dmabuf associated with the FD
+		err = -EBADFD;
+		goto out_put_unlock;
+	}
+#endif
+
+	// Disable lazy unmapping (a custom out-of-tree dma_buf "feature" that defers unmap)
+	if (dma_buf_disable_lazy_unmapping(dev_from_vm(vm))) {
+		err = -ENOTRECOVERABLE;
+		goto out_put_unlock;
+	}
+
+	// Flush dirty GPU L2 cache lines to DRAM
+	// (assuming that NVMe accesses to DRAM are uncached)
+	gk20a_mm_l2_flush(g, false);
+
+	// Copy out (blocking)
+	err = copy_out(m->os_priv.sgt);
+	if (err) {
+		// Inaccessible swap device, etc.
+		goto out_put_unlock;
+	}
+
+	// Unpin needs to happen after the copy out is done
+	// (no return value check, as it's a void function)
+	gk20a_mm_unpin(dev_from_vm(vm), m->os_priv.dmabuf,
+		       m->os_priv.attachment, m->os_priv.sgt);
+
+	// Deallocate the dmabuf's backing pages
+	// TODO: Fail early in these cases (where the dmabuf is mmapped, etc.),
+	//       before we do all of the above (expensive) steps
+	err = nvmap_dealloc_dmabuf(dmabuf);
+	if (err) {
+		// Repin
+		sgt = gk20a_mm_pin(dev_from_vm(vm), m->os_priv.dmabuf,
+				   &m->os_priv.attachment);
+		m->os_priv.sgt = sgt;
+		goto out_put_unlock;
+	}
+
+out_put_unlock:
+	// Done with the dmabuf, so release our ref to it
+	dma_buf_put(dmabuf);
+	nvgpu_mutex_release(&vm->update_gmmu_lock);
+	return err;
+}
+
+// Undoes everything nvgpu_as_ioctl_write_swap_buffer() does
+static int nvgpu_as_ioctl_read_swap_buffer(
+		struct gk20a_as_share *as_share,
+		struct nvgpu_as_swap_buffer_args *args)
+{
+	struct gk20a *g = gk20a_from_vm(as_share->vm);
+	int err = 0;
+#if OLD_WALK
+	struct nvgpu_rbtree_node *node;
+#endif
+	struct nvgpu_mapped_buf *m;
+	struct sg_table *sgt;
+	struct vm_gk20a *vm = as_share->vm;
+	struct dma_buf *dmabuf = dma_buf_get(args->dmabuf_fd);
+
+	nvgpu_log_fn(g, " ");
+
+	if (IS_ERR(dmabuf))
+		return PTR_ERR(dmabuf);
+	// Other code walking vm->mapped_buffers grabs this lock
+	nvgpu_mutex_acquire(&vm->update_gmmu_lock);
+
+#if OLD_WALK
+	// Get the mapped buffer corresponding to this dmabuf
+	// TODO: Error on buffers mapped more than once
+	for_each_buffer(node, vm->mapped_buffers, m) {
+		if (m->os_priv.dmabuf == dmabuf)
+			break;
+	}
+	// If the search failed
+	if (!node || !m) {
+		// No mapped dmabuf associated with the FD
+		err = -EBADFD;
+		goto out_put_unlock;
+	}
+#else
+	m = dmabuf_to_mapped_buf(dmabuf);
+	// If the search failed
+	if (IS_ERR(m)) {
+		// No mapped dmabuf associated with the FD
+		err = -EBADFD;
+		goto out_put_unlock;
+	}
+#endif
+
+	// Reallocate space for this buffer
+	err = nvmap_realloc_dmabuf(dmabuf);
+	if (err) {
+		// Out of memory (?)
+		goto out_put_unlock;
+	}
+
+	// Repin the buffer to DMA'able memory
+	sgt = gk20a_mm_pin(dev_from_vm(vm), m->os_priv.dmabuf,
+			   &m->os_priv.attachment);
+	if (IS_ERR(sgt)) {
+		// Roll back the allocation
+		err = nvmap_dealloc_dmabuf(dmabuf);
+		if (err)
+			printk(KERN_ERR "nvgpu: Error %d while rolling back dmabuf allocation state on error in gk20a_mm_pin()! Consider dmabuf FD %d to be in an inconsistent state!\n", err, args->dmabuf_fd);
+		err = PTR_ERR(sgt);
+		goto out_put_unlock;
+	}
+	// Do any bookkeeping not done by gk20a_mm_pin()
+	m->os_priv.sgt = sgt;
+
+	// Reload page contents from disk (blocking)
+	err = copy_in(sgt);
+	if (err) {
+		int err2;
+		// Roll back the pinning and allocation
+		gk20a_mm_unpin(dev_from_vm(vm), m->os_priv.dmabuf,
+			       m->os_priv.attachment, m->os_priv.sgt);
+		err2 = nvmap_dealloc_dmabuf(dmabuf);
+		if (err2)
+			printk(KERN_ERR "nvgpu: Error %d while rolling back dmabuf allocation state on error in copy_in()! Consider dmabuf FD %d to be in an inconsistent state!\n", err2, args->dmabuf_fd);
+		// Inaccessible swap device, etc.
+		goto out_put_unlock;
+	}
+	// Update the GPU page tables (PTs) to point at the new allocation
+	nvgpu_vm_remap(m);
+	// Due to the PT update, the translation lookaside buffer (TLB) needs clearing
+	g->ops.fb.tlb_invalidate(g, vm->pdb.mem);
+	// Invalidate L2 so that a TLB refill does not load stale PT entries
+	gk20a_mm_l2_flush(g, true);
+
+out_put_unlock:
+	// Done with the dmabuf, so release our ref to it
+	dma_buf_put(dmabuf);
+	nvgpu_mutex_release(&vm->update_gmmu_lock);
+	return err;
+}
 
 long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
@@ -412,6 +601,14 @@ long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		err = nvgpu_as_ioctl_get_sync_ro_map(as_share,
 			(struct nvgpu_as_get_sync_ro_map_args *)buf);
 		break;
+	case NVGPU_AS_IOCTL_READ_SWAP_BUFFER:
+		err = nvgpu_as_ioctl_read_swap_buffer(as_share,
+			(struct nvgpu_as_swap_buffer_args *)buf);
+		break;
+	case NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER:
+		err = nvgpu_as_ioctl_write_swap_buffer(as_share,
+			(struct nvgpu_as_swap_buffer_args *)buf);
+		break;
 	default:
 		err = -ENOTTY;
 		break;
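
For context, a minimal userspace sketch of driving the two new ioctls wired up above. The ioctl names and the dmabuf_fd argument come from this diff; the header location, how the address-space fd is obtained, and the error handling are assumptions.

	/* Hypothetical usage sketch (not part of this commit). Assumes as_fd is
	 * an open nvgpu address-space device fd, dmabuf_fd names a GPU-mapped
	 * nvmap buffer, and the uapi header defines the ioctl numbers. */
	#include <sys/ioctl.h>
	#include <linux/nvgpu.h>

	static int swap_buffer_out_and_in(int as_fd, int dmabuf_fd)
	{
		struct nvgpu_as_swap_buffer_args args = { .dmabuf_fd = dmabuf_fd };

		/* Evict: pages are flushed from L2, copied to NVMe, unpinned, freed */
		if (ioctl(as_fd, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER, &args) < 0)
			return -1;
		/* ...the backing memory is now free for other users... */

		/* Restore: pages are reallocated, repinned, reloaded, and remapped */
		return ioctl(as_fd, NVGPU_AS_IOCTL_READ_SWAP_BUFFER, &args);
	}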
diff --git a/drivers/gpu/nvgpu/os/linux/swap.h b/drivers/gpu/nvgpu/os/linux/swap.h
new file mode 100644
index 00000000..f762ba81
--- /dev/null
+++ b/drivers/gpu/nvgpu/os/linux/swap.h
@@ -0,0 +1,117 @@
+#include <linux/scatterlist.h>
+#include <linux/bio.h>
+//#include <nvgpu/bug.h>
+
+// Copy the pages behind an SGT to or from disk, depending on op
+// TODO: Cache bdev
+// TODO: Asynchronous I/O
+// TODO: Don't hardcode sector 0
+int copy(struct sg_table *sgt, int op) {
+	unsigned int i;
+	struct scatterlist *sg;
+	struct bio *bio;
+	int err = 0;
+	int sg_cnt = sgt->nents;
+	struct bio *bio_orig;
+	sector_t sector = 0; // XXX: For testing
+	// Find and open the block device
+	struct block_device *bdev = blkdev_get_by_path("/dev/nvme0n1", FMODE_READ | FMODE_WRITE, copy);
+	if (unlikely(IS_ERR(bdev))) {
+		printk(KERN_WARNING "nvgpu: Unable to open /dev/nvme0n1, err %ld!\n", PTR_ERR(bdev));
+		return -ENODEV;
+	}
+	// Will never fail when allocating <= BIO_MAX_PAGES
+	bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
+	bio_orig = bio;
+	bio->bi_bdev = bdev; // Switch to bio_set_dev(bdev) in newer kernels
+	bio->bi_iter.bi_sector = sector;
+	bio_set_op_attrs(bio, op, op == REQ_OP_WRITE ? WRITE_ODIRECT : 0); // XXX: Is REQ_SYNC necessary?
+	// Copy the scatter-gather table (sgt) into a block I/O vector (bio vec)
+	// bio_chain() approach borrowed from drivers/nvme/target/io-cmd.c:nvmet_execute_rw()
+	for_each_sg(sgt->sgl, sg, sgt->nents, i) {
+		// On most iterations, this inner loop shouldn't run at all. Its
+		// conditional only triggers if we fill up the bio and are unable
+		// to map the full length of an SGL entry.
+		while (bio_add_page(bio, sg_page(sg), sg_dma_len(sg), sg->offset) != sg_dma_len(sg)) {
+			// Uh oh! We ran out of space in the bio. Allocate a new one and chain it...
+			struct bio *prev = bio;
+			bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
+			bio->bi_bdev = bdev; // Switch to bio_set_dev(bdev) in newer kernels
+			bio->bi_iter.bi_sector = sector;
+			bio_set_op_attrs(bio, op, op == REQ_OP_WRITE ? WRITE_ODIRECT : 0);
+			bio_chain(bio, prev);
+			// Get the I/O started
+			submit_bio(prev);
+			// No need to call bio_put(), as that's automatically managed for chained bios
+		}
+		sector += sg_dma_len(sg) >> 9;
+		sg_cnt--;
+	}
+	// Use a blocking submit for now
+	// TODO: Switch to async via submit_bio(bio)
+	err = submit_bio_wait(bio);
+
+	if (bio->bi_error && bio->bi_error != err)
+		printk(KERN_WARNING "nvgpu: bio->bi_error %d != return val from submit_bio_wait() %d\n", bio->bi_error, err);
+
+//out:
+	bio_put(bio_orig); // TODO: Move to a completion handler
+	blkdev_put(bdev, FMODE_WRITE | FMODE_READ);
+	return err;
+}
+
+// Patterned off how __nvgpu_vm_find_mapped_buf_reverse() works in vm.c.
+// Needs struct nvgpu_rbtree_node *node, struct nvgpu_rbtree_node *root,
+// and struct nvgpu_mapped_buf *m in scope.
+// Steps until the end of the rbtree OR !m.
+#define for_each_buffer(node, root, m) \
+	for (nvgpu_rbtree_enum_start(0, &node, root); \
+	     node && (uintptr_t)(m = mapped_buffer_from_rbtree_node(node)); \
+	     nvgpu_rbtree_enum_next(&node, node))
+
+// New, fast replacement for searching with the above macro to find a match
+struct nvgpu_mapped_buf* dmabuf_to_mapped_buf(struct dma_buf *dmabuf) {
+	struct list_head *nvmap_priv = nvmap_get_priv_list(dmabuf);
+	struct nvgpu_mapped_buf *mapped_buffer;
+	struct nvgpu_mapped_buf_priv *priv;
+
+	if (IS_ERR(nvmap_priv))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	priv = list_first_entry_or_null(nvmap_priv, struct nvgpu_mapped_buf_priv, nvmap_priv_entry);
+	if (unlikely(!priv)) {
+		printk(KERN_ERR "nvgpu: State tracking error for fast reverse lookups. Have an unattached dmabuf!\n");
+		return ERR_PTR(-ENOTRECOVERABLE);
+	}
+
+	mapped_buffer = container_of(priv, struct nvgpu_mapped_buf, os_priv);
+	if (unlikely(mapped_buffer->os_priv.dmabuf != dmabuf)) {
+		printk(KERN_ERR "nvgpu: dmabuf_to_mapped_buf mapping inconsistent! BUG!\n");
+		return ERR_PTR(-ENOTRECOVERABLE);
+	}
+	if (!list_is_singular(&priv->nvmap_priv_entry)) {
+		printk(KERN_WARNING "nvgpu: Requesting paging on memory with multiple mappings! Aborting...\n");
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+	return mapped_buffer;
+}
+
+int copy_all(struct vm_gk20a *vm) {
+	struct nvgpu_rbtree_node *node;
+	struct nvgpu_mapped_buf *m;
+
+	for_each_buffer(node, vm->mapped_buffers, m) {
+		// TODO
+		continue;
+	}
+	return 0;
+}
+
+int copy_out(struct sg_table *sgt) {
+	return copy(sgt, REQ_OP_WRITE);
+}
+
+int copy_in(struct sg_table *sgt) {
+	return copy(sgt, REQ_OP_READ);
+}
+
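
As a usage note, for_each_buffer() above is the slow-path lookup kept under OLD_WALK in ioctl_as.c. A minimal sketch of that walk, assuming vm->update_gmmu_lock is held and dmabuf/vm are in scope:

	struct nvgpu_rbtree_node *node;
	struct nvgpu_mapped_buf *m;

	// Walk every mapped buffer in the VM until one backed by `dmabuf` is found
	for_each_buffer(node, vm->mapped_buffers, m) {
		if (m->os_priv.dmabuf == dmabuf)
			break;
	}
	if (!node || !m) {
		// No mapped buffer is backed by this dmabuf
	}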
diff --git a/drivers/gpu/nvgpu/os/linux/vm.c b/drivers/gpu/nvgpu/os/linux/vm.c
index 8956cce5..fcb58ac4 100644
--- a/drivers/gpu/nvgpu/os/linux/vm.c
+++ b/drivers/gpu/nvgpu/os/linux/vm.c
@@ -15,6 +15,7 @@
  */
 
 #include <linux/dma-buf.h>
+#include <linux/nvmap.h>
 #include <linux/scatterlist.h>
 #include <uapi/linux/nvgpu.h>
 
@@ -71,7 +72,23 @@ static struct nvgpu_mapped_buf *__nvgpu_vm_find_mapped_buf_reverse(
 {
 	struct nvgpu_rbtree_node *node = NULL;
 	struct nvgpu_rbtree_node *root = vm->mapped_buffers;
+	struct list_head *nvmap_priv;
+
+	// Try the fast lookup first
+	if (!IS_ERR(nvmap_priv = nvmap_get_priv_list(dmabuf))) {
+		struct nvgpu_mapped_buf *mapped_buffer;
+		struct nvgpu_mapped_buf_priv *priv;
+
+		list_for_each_entry(priv, nvmap_priv, nvmap_priv_entry) {
+			mapped_buffer = container_of(priv, struct nvgpu_mapped_buf, os_priv);
+			if (mapped_buffer->os_priv.dmabuf == dmabuf &&
+			    mapped_buffer->kind == kind)
+				return mapped_buffer;
+		}
+	}
 
+	// Fall back to a full traversal (not an nvmap buffer?)
+	printk(KERN_INFO "nvgpu: Fast reverse lookup failed!\n");
 	nvgpu_rbtree_enum_start(0, &node, root);
 
 	while (node) {
@@ -158,6 +175,7 @@ struct nvgpu_mapped_buf *nvgpu_vm_find_mapping(struct vm_gk20a *vm,
 	 */
 	gk20a_mm_unpin(os_buf->dev, os_buf->dmabuf, os_buf->attachment,
 		       mapped_buffer->os_priv.sgt);
+	list_del(&mapped_buffer->os_priv.nvmap_priv_entry);
 	dma_buf_put(os_buf->dmabuf);
 
 	nvgpu_log(g, gpu_dbg_map,
@@ -198,6 +216,7 @@ int nvgpu_vm_map_linux(struct vm_gk20a *vm,
 	struct nvgpu_sgt *nvgpu_sgt = NULL;
 	struct nvgpu_mapped_buf *mapped_buffer = NULL;
 	struct dma_buf_attachment *attachment;
+	struct list_head *nvmap_priv;
 	int err = 0;
 
 	sgt = gk20a_mm_pin(dev, dmabuf, &attachment);
@@ -243,6 +262,12 @@ int nvgpu_vm_map_linux(struct vm_gk20a *vm,
 	mapped_buffer->os_priv.dmabuf = dmabuf;
 	mapped_buffer->os_priv.attachment = attachment;
 	mapped_buffer->os_priv.sgt = sgt;
+	nvmap_priv = nvmap_get_priv_list(dmabuf);
+	if (!IS_ERR(nvmap_priv))
+		list_add(&mapped_buffer->os_priv.nvmap_priv_entry, nvmap_priv);
+	else
+		// So we can always safely call list_del()
+		INIT_LIST_HEAD(&mapped_buffer->os_priv.nvmap_priv_entry);
 
 	*gpu_va = mapped_buffer->addr;
 	return 0;
@@ -353,6 +378,49 @@ void nvgpu_vm_unmap_system(struct nvgpu_mapped_buf *mapped_buffer)
 	gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->os_priv.dmabuf,
 		       mapped_buffer->os_priv.attachment,
 		       mapped_buffer->os_priv.sgt);
-
+	list_del(&mapped_buffer->os_priv.nvmap_priv_entry);
 	dma_buf_put(mapped_buffer->os_priv.dmabuf);
 }
+
+/**
+ * Given an nvgpu_mapped_buf m, map m->os_priv.sgt into m->addr.
+ * Very similar to nvgpu_vm_map_buffer(), except that this assumes all necessary
+ * PTEs and PDEs have been created. This merely updates the physical address(es)
+ * in the associated PTEs, leaving all other attributes unchanged.
+ *
+ * NOP if sgt is already mapped for addr.
+ *
+ * vm->update_gmmu_lock must be held.
+ *
+ * Caller is responsible for flushing the TLB and L2 caches.
+ */
+void nvgpu_vm_remap(struct nvgpu_mapped_buf *m)
+{
+	// TODO: Input validation
+	struct scatterlist *sg;
+	unsigned int i = 0;
+	u64 curr_vaddr = m->addr;
+
+	// For each element of the scatterlist
+	// (based off the for_each_sgtable_dma_sg() macro in newer kernels)
+	for_each_sg(m->os_priv.sgt->sgl, sg, m->os_priv.sgt->nents, i) {
+		unsigned int sg_off = 0;
+		// Keep mapping data at the next unmapped virtual address
+		// until each scatterlist element is entirely mapped
+		while (sg_off < sg_dma_len(sg)) {
+			int amt_mapped = __nvgpu_update_paddr(gk20a_from_vm(m->vm),
+							      m->vm,
+							      curr_vaddr,
+							      sg_dma_address(sg) + sg_off);
+			if (amt_mapped < 0) {
+				printk(KERN_ERR "nvgpu: Error %d from __nvgpu_update_paddr() in nvgpu_vm_remap()! Had mapped %llu of %llu bytes.\n", amt_mapped, curr_vaddr - m->addr, m->size);
+				return;
+			}
+			curr_vaddr += amt_mapped;
+			sg_off += amt_mapped;
+		}
+	}
+	if (curr_vaddr != m->addr + m->size) {
+		printk(KERN_ERR "nvgpu: Mapped %llu bytes when %llu bytes expected! Expect page table corruption!\n", curr_vaddr - m->addr, m->size);
+	}
+}
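
Since nvgpu_vm_remap() leaves TLB and cache maintenance to its caller, the minimal post-remap sequence (as used in nvgpu_as_ioctl_read_swap_buffer() above) is:

	// With vm->update_gmmu_lock held and m->os_priv.sgt freshly (re)pinned:
	nvgpu_vm_remap(m);                        // rewrite PTE physical addresses
	g->ops.fb.tlb_invalidate(g, vm->pdb.mem); // drop stale translations
	gk20a_mm_l2_flush(g, true);               // invalidate L2 so TLB refills see new PTEs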