From ee26a2842ca891d3ae8b1de1b066d29234fc0115 Mon Sep 17 00:00:00 2001 From: Joshua Bakita Date: Tue, 24 May 2022 21:11:59 -0400 Subject: gpu-paging: Initial working implementation Supports synchronous page out or in of a specific buffer. Includes fast reverse struct mapped_buf lookup. Requires initial set of changes to nvmap as well. --- drivers/gpu/nvgpu/Makefile | 1 + drivers/gpu/nvgpu/common/mm/gmmu.c | 54 ++++++++ drivers/gpu/nvgpu/include/nvgpu/gmmu.h | 17 +++ drivers/gpu/nvgpu/include/nvgpu/linux/vm.h | 2 + drivers/gpu/nvgpu/include/nvgpu/vm.h | 2 + drivers/gpu/nvgpu/os/linux/dmabuf.c | 4 +- drivers/gpu/nvgpu/os/linux/ioctl_as.c | 197 +++++++++++++++++++++++++++++ drivers/gpu/nvgpu/os/linux/swap.h | 117 +++++++++++++++++ drivers/gpu/nvgpu/os/linux/vm.c | 70 +++++++++- include/uapi/linux/nvgpu.h | 10 +- 10 files changed, 471 insertions(+), 3 deletions(-) create mode 100644 drivers/gpu/nvgpu/os/linux/swap.h diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 8c5b92e1..c23c858a 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -9,6 +9,7 @@ ccflags-y += -I$(srctree.nvgpu-next)/drivers/gpu/nvgpu ccflags-y += -I$(srctree)/drivers/devfreq ccflags-y += -Wno-multichar +ccflags-y += -Wno-sign-compare ccflags-y += -Werror ccflags-y += -Wno-error=cpp ifeq ($(VERSION),4) diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c index 748e9f45..a04e501f 100644 --- a/drivers/gpu/nvgpu/common/mm/gmmu.c +++ b/drivers/gpu/nvgpu/common/mm/gmmu.c @@ -36,6 +36,9 @@ #include "gk20a/mm_gk20a.h" +// XXX: Shouldn't really be here! Needed for __nvgpu_update_paddr() +#include + #define __gmmu_dbg(g, attrs, fmt, args...) \ do { \ if (attrs->debug) { \ @@ -938,3 +941,54 @@ int __nvgpu_set_pte(struct gk20a *g, struct vm_gk20a *vm, u64 vaddr, u32 *pte) return 0; } + +u64 pgsz_enum_to_bytes(int sz) { + if (sz == GMMU_PAGE_SIZE_SMALL) + return SZ_4K; + else + return SZ_64K; // Dangerous! Big pages may also be 128k. Should check ram_in_big_page_size... registers. +} + +// Caller is responsible for TLB/L2 flushing so that this can be called +// repeatedly with low overhead. +int __nvgpu_update_paddr(struct gk20a *g, struct vm_gk20a *vm, u64 vaddr, u64 paddr) +{ + struct nvgpu_gmmu_pd *pd; + u32 pd_idx, pd_offs; + int err; + u32 pte[2]; // Safe for at least gv11b + struct nvgpu_gmmu_attrs attrs = { + .pgsz = 0, + }; +// u32 pte_orig[2]; + + // Get existing pte entry and location + err = __nvgpu_locate_pte(g, vm, &vm->pdb, + vaddr, 0, &attrs, + pte, &pd, &pd_idx, &pd_offs); + if (unlikely(err)) { + printk(KERN_ERR "nvgpu: Unable to find PTE for vaddr %llx in __nvgpu_update_paddr()\n", vaddr); + return err; + } + // TODO: Verify that the PTE is actually in SYSMEM +// pte_orig[0] = pte[0]; +// pte_orig[1] = pte[1]; + + // Following logic is borrowed from __update_pte() for gp10b+ + // TODO: Make this work for gk20a-gp10b! + // Zero-out the address field + pte[0] &= ~gmmu_new_pte_address_sys_f(~0 >> gmmu_new_pte_address_shift_v()); + pte[1] &= ~(~0U >> (24 + gmmu_new_pte_address_shift_v())); + // Write new address (upper and lower bits) + pte[0] |= gmmu_new_pte_address_sys_f(paddr >> gmmu_new_pte_address_shift_v()); + pte[1] |= paddr >> (24 + gmmu_new_pte_address_shift_v()); + // Commit to the page tables + pd_write(g, pd, pd_offs, pte[0]); + pd_write(g, pd, pd_offs + 1, pte[1]); + nvgpu_wmb(); // XXX: Is this needed? +// printk(KERN_INFO "nvgpu: Mapped vaddr %llx @ paddr %llx. %lluKb pg. 
[%08x, %08x]\n", vaddr, paddr, pgsz_enum_to_bytes(attrs.pgsz)/1024, pte[1], pte[0]); +// if (pte_orig[0] != pte[0] || pte_orig[1] != pte[1]) { +// printk(KERN_INFO "nvgpu: Updated PTE entry from {%x,%x} to {%x, %x}\n", pte_orig[0], pte_orig[1], pte[0], pte[1]); +// } + return pgsz_enum_to_bytes(attrs.pgsz); +} diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h index 2fc0d44e..81f829ed 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h @@ -354,6 +354,23 @@ int __nvgpu_get_pte(struct gk20a *g, struct vm_gk20a *vm, u64 vaddr, u32 *pte); */ int __nvgpu_set_pte(struct gk20a *g, struct vm_gk20a *vm, u64 vaddr, u32 *pte); +/** + * __nvgpu_update_paddr - Remap a virtual address to a new physical address + * + * @g - The GPU. + * @vm - VM to look in. + * @vaddr - GPU virtual address. + * @paddr - The new physical address to map to + * + * This function is a combination of __nvgpu_get_pte() and __nvgpu_set_pte(). + * It searches for an existing PTE associated with @vaddr, and then updates + * only the physical address pointed to in the PTE to @paddr. All other + * attributes/fields of the PTE are preserved. + * + * This function returns the number of bytes mapped on success and -EINVAL + * otherwise. + */ +int __nvgpu_update_paddr(struct gk20a *g, struct vm_gk20a *vm, u64 vaddr, u64 paddr); /* * Internal debugging routines. Probably not something you want to use. diff --git a/drivers/gpu/nvgpu/include/nvgpu/linux/vm.h b/drivers/gpu/nvgpu/include/nvgpu/linux/vm.h index 6f3beaa9..b86a428a 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/linux/vm.h +++ b/drivers/gpu/nvgpu/include/nvgpu/linux/vm.h @@ -49,6 +49,8 @@ struct nvgpu_mapped_buf_priv { struct dma_buf *dmabuf; struct dma_buf_attachment *attachment; struct sg_table *sgt; + // For fast reverse lookup (FD -> mapped_buf) + struct list_head nvmap_priv_entry; }; /* NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL must be set */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/vm.h b/drivers/gpu/nvgpu/include/nvgpu/vm.h index 3867c745..f007d880 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/vm.h +++ b/drivers/gpu/nvgpu/include/nvgpu/vm.h @@ -261,6 +261,8 @@ struct nvgpu_mapped_buf *nvgpu_vm_map(struct vm_gk20a *vm, struct vm_gk20a_mapping_batch *batch, enum nvgpu_aperture aperture); +void nvgpu_vm_remap(struct nvgpu_mapped_buf *m); + void nvgpu_vm_unmap(struct vm_gk20a *vm, u64 offset, struct vm_gk20a_mapping_batch *batch); diff --git a/drivers/gpu/nvgpu/os/linux/dmabuf.c b/drivers/gpu/nvgpu/os/linux/dmabuf.c index e8e33130..08f78ae6 100644 --- a/drivers/gpu/nvgpu/os/linux/dmabuf.c +++ b/drivers/gpu/nvgpu/os/linux/dmabuf.c @@ -124,8 +124,10 @@ void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf, struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev); dma_addr_t dma_addr; - if (IS_ERR(priv) || !priv) + if (IS_ERR(priv) || !priv) { + printk(KERN_ERR "nvgpu: Unable to access priv in gk20a_mm_unpin()\n"); return; + } nvgpu_mutex_acquire(&priv->lock); WARN_ON(priv->sgt != sgt); diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_as.c b/drivers/gpu/nvgpu/os/linux/ioctl_as.c index f0cec178..9708ea1a 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_as.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_as.c @@ -32,6 +32,9 @@ #include "platform_gk20a.h" #include "ioctl_as.h" #include "os_linux.h" +#include // For nvmap_dmabuf_{d/r}ealloc() +#include "dmabuf.h" // struct dma_buf things for swapping +#include "swap.h" static u32 gk20a_as_translate_as_alloc_space_flags(struct gk20a *g, u32 
flags) { @@ -329,6 +332,192 @@ int gk20a_as_dev_release(struct inode *inode, struct file *filp) return gk20a_as_release_share(as_share); } +#define OLD_WALK 0 + +/* Access dmabuf associated with passed file descriptor, copy the associated + * pages to an NVME drive, unpin associated pages from DMA'able space, and free + * said pages for use by others. + * dmabuf is put in a deallocated state, and any GPU mappings will be + * invalidated. To restore the dmabuf, see nvgpu_as_ioctl_read_swap_buffer(). + */ +static int nvgpu_as_ioctl_write_swap_buffer( + struct gk20a_as_share *as_share, + struct nvgpu_as_swap_buffer_args *args) +{ + struct gk20a *g = gk20a_from_vm(as_share->vm); + int err = 0; +#if OLD_WALK + struct nvgpu_rbtree_node *node; +#endif + struct nvgpu_mapped_buf *m; + struct sg_table *sgt; + struct vm_gk20a *vm = as_share->vm; + struct dma_buf *dmabuf = dma_buf_get(args->dmabuf_fd); + + nvgpu_log_fn(g, " "); + + if (IS_ERR(dmabuf)) + return PTR_ERR(dmabuf); + + // Other code walking vm->mapped_buffers grabs this lock + nvgpu_mutex_acquire(&vm->update_gmmu_lock); + +#if OLD_WALK + // Get mapped buffer corresponding to this dmabuf + // TODO: Error on buffer mapped >1 + for_each_buffer(node, vm->mapped_buffers, m) { + if (m->os_priv.dmabuf == dmabuf) + break; + } + // If failed search + if (!node || !m) { + // No mapped dmabuf associated with FD + err = -EBADFD; + goto out_put_unlock; + } +#else + m = dmabuf_to_mapped_buf(dmabuf); + // If failed search + if (IS_ERR(m)) { + // No mapped dmabuf associated with FD + err = -EBADFD; + goto out_put_unlock; + } +#endif + + // Disable an annoying custom out-of-tree "feature" of dma_buf which defers unmap + if (dma_buf_disable_lazy_unmapping(dev_from_vm(vm))) { + err = -ENOTRECOVERABLE; + goto out_put_unlock; + } + + // Flush dirty GPU L2 cache lines to DRAM + // (Assuming that NVMe DRAM acceses are uncached) + gk20a_mm_l2_flush(g, false); + + // Copy out (blocking) + err = copy_out(m->os_priv.sgt); + if (err) { + // Inaccessible swap device, etc + goto out_put_unlock; + } + + // Unpin needs to happen after copy out is done + // (No return value check as it's a void function) + gk20a_mm_unpin(dev_from_vm(vm), m->os_priv.dmabuf, + m->os_priv.attachment, m->os_priv.sgt); + + // Deallocate dmabuf's backing pages + // TODO: Fail early for these cases (where the dmabuf is mmaped, etc), + // before we do all the above (expensive) steps + err = nvmap_dealloc_dmabuf(dmabuf); + if (err) { + // Repin + sgt = gk20a_mm_pin(dev_from_vm(vm), m->os_priv.dmabuf, + &m->os_priv.attachment); + m->os_priv.sgt = sgt; + goto out_put_unlock; + } + +out_put_unlock: + // Done with dmabuf, so release our ref to it + dma_buf_put(dmabuf); + nvgpu_mutex_release(&vm->update_gmmu_lock); + return err; +} + +// Undoes everything nvgpu_as_ioctl_write_swap_buffer() does +static int nvgpu_as_ioctl_read_swap_buffer( + struct gk20a_as_share *as_share, + struct nvgpu_as_swap_buffer_args *args) +{ + struct gk20a *g = gk20a_from_vm(as_share->vm); + int err = 0; +#if OLD_WALK + struct nvgpu_rbtree_node *node; +#endif + struct nvgpu_mapped_buf *m; + struct sg_table *sgt; + struct vm_gk20a *vm = as_share->vm; + struct dma_buf *dmabuf = dma_buf_get(args->dmabuf_fd); + + nvgpu_log_fn(g, " "); + + if (!dmabuf) + return -EBADF; + // Other code walking vm->mapped_buffers grabs this lock + nvgpu_mutex_acquire(&vm->update_gmmu_lock); + +#if OLD_WALK + // Get mapped buffer corresponding to this dmabuf + // TODO: Error on buffer mapped >1 + for_each_buffer(node, vm->mapped_buffers, m) { + if 
(m->os_priv.dmabuf == dmabuf) + break; + } + // If failed search + if (!node || !m) { + // No mapped dmabuf associated with FD + err = -EBADFD; + goto out_put_unlock; + } +#else + m = dmabuf_to_mapped_buf(dmabuf); + // If failed search + if (IS_ERR(m)) { + // No mapped dmabuf associated with FD + err = -EBADFD; + goto out_put_unlock; + } +#endif + + // Reallocate space for this buffer + err = nvmap_realloc_dmabuf(dmabuf); + if (err) { + // Out of memory (?) + goto out_put_unlock; + } + + // Repin the buffer to DMA'able memory + sgt = gk20a_mm_pin(dev_from_vm(vm), m->os_priv.dmabuf, + &m->os_priv.attachment); + if (IS_ERR(sgt)) { + // Rollback allocation + err = nvmap_dealloc_dmabuf(dmabuf); + if (err) + printk(KERN_ERR "nvgpu: Error %d while rolling back dmabuf allocation state on error in gk20a_mm_pin()! Consider dmabuf FD %d to be in an inconsistent state!\n", err, args->dmabuf_fd); + err = PTR_ERR(sgt); + goto out_put_unlock; + } + // Do any bookeeping not done by gk20a_mm_pin() + m->os_priv.sgt = sgt; + + // Reload page contents from disk (blocking) + err = copy_in(sgt); + if (err) { + int err2; + // Rollback pinning and allocation + gk20a_mm_unpin(dev_from_vm(vm), m->os_priv.dmabuf, + m->os_priv.attachment, m->os_priv.sgt); + err2 = nvmap_dealloc_dmabuf(dmabuf); + if (err2) + printk(KERN_ERR "nvgpu: Error %d while rolling back dmabuf allocation state on error in copy_in()! Consider dmabuf FD %d to be in an inconsistent state!\n", err2, args->dmabuf_fd); + // Inaccessible swap device, etc + goto out_put_unlock; + } + // Update GPU page tables (PT) to point to new allocation + nvgpu_vm_remap(m); + // Due to PT update, translation lookaside buffer needs clearing + g->ops.fb.tlb_invalidate(g, vm->pdb.mem); + // Invalidate L2 so that TLB refill does not load stale PT + gk20a_mm_l2_flush(g, true); + +out_put_unlock: + // Done with dmabuf, so release our ref to it + dma_buf_put(dmabuf); + nvgpu_mutex_release(&vm->update_gmmu_lock); + return err; +} long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -412,6 +601,14 @@ long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = nvgpu_as_ioctl_get_sync_ro_map(as_share, (struct nvgpu_as_get_sync_ro_map_args *)buf); break; + case NVGPU_AS_IOCTL_READ_SWAP_BUFFER: + err = nvgpu_as_ioctl_read_swap_buffer(as_share, + (struct nvgpu_as_swap_buffer_args *)buf); + break; + case NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER: + err = nvgpu_as_ioctl_write_swap_buffer(as_share, + (struct nvgpu_as_swap_buffer_args *)buf); + break; default: err = -ENOTTY; break; diff --git a/drivers/gpu/nvgpu/os/linux/swap.h b/drivers/gpu/nvgpu/os/linux/swap.h new file mode 100644 index 00000000..f762ba81 --- /dev/null +++ b/drivers/gpu/nvgpu/os/linux/swap.h @@ -0,0 +1,117 @@ +#include +#include +//#include + +// Queue a command to copy out an SGT to disk +// TODO: Cache bdev +// TODO: Asynchronous I/O +// TODO: Don't hardcode sector 0 +int copy(struct sg_table *sgt, int op) { + unsigned int i; + struct scatterlist *sg; + struct bio *bio; + int err = 0; + int sg_cnt = sgt->nents; + struct bio *bio_orig; + sector_t sector = 0; // XXX: For testing + // Find and open the block device + struct block_device *bdev = blkdev_get_by_path("/dev/nvme0n1", FMODE_READ | FMODE_WRITE, copy); + if (unlikely(IS_ERR(bdev))) { + printk(KERN_WARNING "Unabled to find `nvme0`, err %ld!\n", PTR_ERR(bdev)); + return -ENODEV; + } + // Will never fail when allocating <= BIO_MAX_PAGES + bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio_orig = 
bio; + bio->bi_bdev = bdev; // Switch to bio_set_dev(bdev) in newer kernels + bio->bi_iter.bi_sector = sector; + bio_set_op_attrs(bio, op, op == REQ_OP_WRITE ? WRITE_ODIRECT : 0);//REQ_SYNC); // XXX: Is REQ_SYNC necessary? + // Copy the scatter-gather table (sgt) into a block I/O vector (bio vec) + // bio_chain() approach borrowed from drivers/nvme/target/io-cmd.c:nvmet_execute_rw() + for_each_sg(sgt->sgl, sg, sgt->nents, i) { + // On most iterations, this inner loop shouldn't happen at all. This loop + // conditional only triggers if we fill up the bio and are unable to map + // the full length of an SGL entry. + while (bio_add_page(bio, sg_page(sg), sg_dma_len(sg), sg->offset) != sg_dma_len(sg)) { + // Uh oh! We ran out of space in the bio. Allocate a new one and chain it... + struct bio *prev = bio; + bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio->bi_bdev = bdev; // Switch to bio_set_dev(bdev) in newer kernels + bio->bi_iter.bi_sector = sector; + bio_set_op_attrs(bio, op, op == REQ_OP_WRITE ? WRITE_ODIRECT : 0); + bio_chain(bio, prev); + // Get the I/O started + submit_bio(prev); + // No need to call bio_put() as that's automatically managed for chained bios + } + sector += sg_dma_len(sg) >> 9; + sg_cnt--; + } + // Use blocking submit for now + // TODO: Switch to async via submit_bio(bio) + err = submit_bio_wait(bio); + + if (bio->bi_error && bio->bi_error != err) + printk(KERN_WARNING "nvgpu: bio->bi_error %d != return val from submit_bio_wait() %d\n", bio->bi_error, err); + +//out: + bio_put(bio_orig); // TODO: Move to completion handler + blkdev_put(bdev, FMODE_WRITE|FMODE_READ); + return err; +} + +// Patterned off how __nvgpu_vm_find_mapped_buf_reverse() works in vm.c +// Needs struct nvgpu_rbtree_node *node, struct nvgpu_rbtree_node *root, +// and struct nvgpu_mapped_buf *m. +// Steps until end of rbtree OR !m +#define for_each_buffer(node, root, m) \ + for (nvgpu_rbtree_enum_start(0, &node, root); \ + node && (uintptr_t)(m = mapped_buffer_from_rbtree_node(node)); \ + nvgpu_rbtree_enum_next(&node, node)) + +// New, fast replacement to looking through with the above macro to match +struct nvgpu_mapped_buf* dmabuf_to_mapped_buf(struct dma_buf *dmabuf) { + struct list_head *nvmap_priv = nvmap_get_priv_list(dmabuf); + struct nvgpu_mapped_buf *mapped_buffer; + struct nvgpu_mapped_buf_priv *priv; + + if (IS_ERR(nvmap_priv)) + return ERR_PTR(-EOPNOTSUPP); + + priv = list_first_entry_or_null(nvmap_priv, struct nvgpu_mapped_buf_priv, nvmap_priv_entry); + if (unlikely(!priv)) { + printk(KERN_ERR "nvgpu: State tracking error for fast reverse lookups. Have unattached dmabuf!"); + return ERR_PTR(-ENOTRECOVERABLE); + } + + mapped_buffer = container_of(priv, struct nvgpu_mapped_buf, os_priv); + if (unlikely(mapped_buffer->os_priv.dmabuf != dmabuf)) { + printk(KERN_ERR "nvgpu: dmabuf_to_mapped_buf mapping inconsistent! BUG!\n"); + return ERR_PTR(-ENOTRECOVERABLE); + } + if (!list_is_singular(&priv->nvmap_priv_entry)) { + printk(KERN_WARNING "nvgpu: Requesting paging on memory with multiple mappings! 
Aborting...\n"); + return ERR_PTR(-EOPNOTSUPP); + } + return mapped_buffer; +} + +int copy_all(struct vm_gk20a *vm) { + struct nvgpu_rbtree_node *node; + struct nvgpu_mapped_buf *m; + + for_each_buffer(node, vm->mapped_buffers, m) { + // TODO + continue; + } + return 0; +} + +int copy_out(struct sg_table *sgt) { + return copy(sgt, REQ_OP_WRITE); +} + +int copy_in(struct sg_table *sgt) { + return copy(sgt, REQ_OP_READ); +} + diff --git a/drivers/gpu/nvgpu/os/linux/vm.c b/drivers/gpu/nvgpu/os/linux/vm.c index 8956cce5..fcb58ac4 100644 --- a/drivers/gpu/nvgpu/os/linux/vm.c +++ b/drivers/gpu/nvgpu/os/linux/vm.c @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -71,7 +72,23 @@ static struct nvgpu_mapped_buf *__nvgpu_vm_find_mapped_buf_reverse( { struct nvgpu_rbtree_node *node = NULL; struct nvgpu_rbtree_node *root = vm->mapped_buffers; + struct list_head* nvmap_priv; + + // Try fast lookup first + if (!IS_ERR(nvmap_priv = nvmap_get_priv_list(dmabuf))) { + struct nvgpu_mapped_buf *mapped_buffer; + struct nvgpu_mapped_buf_priv *priv; + + list_for_each_entry(priv, nvmap_priv, nvmap_priv_entry) { + mapped_buffer = container_of(priv, struct nvgpu_mapped_buf, os_priv); + if (mapped_buffer->os_priv.dmabuf == dmabuf && + mapped_buffer->kind == kind) + return mapped_buffer; + } + } + // Full traversal (not an nvmap buffer?) + printk(KERN_INFO "nvmap: Fast reverse lookup failed!"); nvgpu_rbtree_enum_start(0, &node, root); while (node) { @@ -158,6 +175,7 @@ struct nvgpu_mapped_buf *nvgpu_vm_find_mapping(struct vm_gk20a *vm, */ gk20a_mm_unpin(os_buf->dev, os_buf->dmabuf, os_buf->attachment, mapped_buffer->os_priv.sgt); + list_del(&mapped_buffer->os_priv.nvmap_priv_entry); dma_buf_put(os_buf->dmabuf); nvgpu_log(g, gpu_dbg_map, @@ -198,6 +216,7 @@ int nvgpu_vm_map_linux(struct vm_gk20a *vm, struct nvgpu_sgt *nvgpu_sgt = NULL; struct nvgpu_mapped_buf *mapped_buffer = NULL; struct dma_buf_attachment *attachment; + struct list_head *nvmap_priv; int err = 0; sgt = gk20a_mm_pin(dev, dmabuf, &attachment); @@ -243,6 +262,12 @@ int nvgpu_vm_map_linux(struct vm_gk20a *vm, mapped_buffer->os_priv.dmabuf = dmabuf; mapped_buffer->os_priv.attachment = attachment; mapped_buffer->os_priv.sgt = sgt; + nvmap_priv = nvmap_get_priv_list(dmabuf); + if (!IS_ERR(nvmap_priv)) + list_add(&mapped_buffer->os_priv.nvmap_priv_entry, nvmap_priv); + else + // So we can always safely call list_del() + INIT_LIST_HEAD(&mapped_buffer->os_priv.nvmap_priv_entry); *gpu_va = mapped_buffer->addr; return 0; @@ -353,6 +378,49 @@ void nvgpu_vm_unmap_system(struct nvgpu_mapped_buf *mapped_buffer) gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->os_priv.dmabuf, mapped_buffer->os_priv.attachment, mapped_buffer->os_priv.sgt); - + list_del(&mapped_buffer->os_priv.nvmap_priv_entry); dma_buf_put(mapped_buffer->os_priv.dmabuf); } + +/** + * Given an nvgpu_mapped_buf m, map m->os_priv.sgt into m->addr + * Very similar to nvgpu_vm_map_buffer, except that this assumes all necessary + * PTEs and PDEs have been created. This merely updates the physical address(es) + * in the associated PTEs, leaving all other attributes unchanged. + * + * NOP if sgt is already mapped for addr. + * + * vm->gmmu_update_lock must be held. + * + * Caller is responsible for flushing the TLB and L2 caches. 
+ */ +void nvgpu_vm_remap(struct nvgpu_mapped_buf *m) +{ + // TODO: Input validation + struct scatterlist *sg; + unsigned int i = 0; + u64 curr_vaddr = m->addr; + + // For each element of the scatterlist + // (based off for_each_sgtable_dma_sg() macro in newer kernels) + for_each_sg(m->os_priv.sgt->sgl, sg, m->os_priv.sgt->nents, i) { + unsigned int sg_off = 0; + // Keep mapping data at the next unmapped virtual address + // until each scatterlist element is entirely mapped + while (sg_off < sg_dma_len(sg)) { + int amt_mapped = __nvgpu_update_paddr(gk20a_from_vm(m->vm), + m->vm, + curr_vaddr, + sg_dma_address(sg) + sg_off); + if (amt_mapped < 0) { + printk(KERN_ERR "nvgpu: Error %d from __nvgpu_update_paddr() in nvgpu_vm_remap()! Had mapped %llu of %llu bytes.\n", amt_mapped, curr_vaddr - m->addr, m->size); + return; + } + curr_vaddr += amt_mapped; + sg_off += amt_mapped; + } + } + if (curr_vaddr != m->addr + m->size) { + printk(KERN_ERR "nvgpu: Mapped %llu bytes when %llu bytes expected! Expect page table corruption!\n", curr_vaddr - m->addr, m->size); + } +} diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h index 873e787f..0138b720 100644 --- a/include/uapi/linux/nvgpu.h +++ b/include/uapi/linux/nvgpu.h @@ -2176,6 +2176,10 @@ struct nvgpu_as_get_sync_ro_map_args { __u32 padding; }; +struct nvgpu_as_swap_buffer_args { + __u32 dmabuf_fd; /* in */ +}; + #define NVGPU_AS_IOCTL_BIND_CHANNEL \ _IOWR(NVGPU_AS_IOCTL_MAGIC, 1, struct nvgpu_as_bind_channel_args) #define NVGPU32_AS_IOCTL_ALLOC_SPACE \ @@ -2198,9 +2202,13 @@ struct nvgpu_as_get_sync_ro_map_args { _IOWR(NVGPU_AS_IOCTL_MAGIC, 11, struct nvgpu_as_map_buffer_batch_args) #define NVGPU_AS_IOCTL_GET_SYNC_RO_MAP \ _IOR(NVGPU_AS_IOCTL_MAGIC, 12, struct nvgpu_as_get_sync_ro_map_args) +#define NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER \ + _IOW(NVGPU_AS_IOCTL_MAGIC, 13, struct nvgpu_as_swap_buffer_args) +#define NVGPU_AS_IOCTL_READ_SWAP_BUFFER \ + _IOW(NVGPU_AS_IOCTL_MAGIC, 14, struct nvgpu_as_swap_buffer_args) #define NVGPU_AS_IOCTL_LAST \ - _IOC_NR(NVGPU_AS_IOCTL_GET_SYNC_RO_MAP) + _IOC_NR(NVGPU_AS_IOCTL_READ_SWAP_BUFFER) #define NVGPU_AS_IOCTL_MAX_ARG_SIZE \ sizeof(struct nvgpu_as_map_buffer_ex_args) -- cgit v1.2.2
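
For reference, here is a minimal userspace sketch of driving the two ioctls this patch adds (NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER and NVGPU_AS_IOCTL_READ_SWAP_BUFFER). It is not part of the patch: it assumes `as_fd` is an already-open nvgpu address-space fd that the buffer behind `dmabuf_fd` has been mapped into, and that the updated uapi header is installed as <linux/nvgpu.h>.

/*
 * Hypothetical usage sketch, not from the patch. Assumes as_fd is an open
 * nvgpu address-space fd and dmabuf_fd refers to an nvmap dmabuf that is
 * currently mapped into that address space.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>

static int swap_out(int as_fd, int dmabuf_fd)
{
	struct nvgpu_as_swap_buffer_args args = { .dmabuf_fd = dmabuf_fd };

	/* Blocks until the buffer contents are written to the swap device
	 * and the backing pages are unpinned and freed; the GPU mapping is
	 * invalid until the matching READ_SWAP_BUFFER call completes. */
	if (ioctl(as_fd, NVGPU_AS_IOCTL_WRITE_SWAP_BUFFER, &args) < 0) {
		fprintf(stderr, "swap out failed: %s\n", strerror(errno));
		return -1;
	}
	return 0;
}

static int swap_in(int as_fd, int dmabuf_fd)
{
	struct nvgpu_as_swap_buffer_args args = { .dmabuf_fd = dmabuf_fd };

	/* Reallocates backing pages, repins them for DMA, copies the saved
	 * contents back from disk, and repoints the GPU page tables at the
	 * new physical pages (TLB and L2 are flushed by the driver). */
	if (ioctl(as_fd, NVGPU_AS_IOCTL_READ_SWAP_BUFFER, &args) < 0) {
		fprintf(stderr, "swap in failed: %s\n", strerror(errno));
		return -1;
	}
	return 0;
}

Both calls are synchronous in this initial implementation, matching the commit message; between swap-out and swap-in the caller must ensure no GPU work references the mapping, since the backing pages are gone and the PTEs still point at the old physical addresses.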