From 9296adcd450143f02faf32fbda5b77dba3f03bc7 Mon Sep 17 00:00:00 2001
From: Joshua Bakita
Date: Thu, 2 Jun 2022 14:29:53 -0400
Subject: gpu-paging: Allow for more than one buffer to be swapped at a time

This uses a very primitive linear disk sector allocation scheme. Sectors
are only reused when userspace resets assignment to 0 with an
NVGPU_AS_IOCTL_SWAP_RESET ioctl (which invalidates all current swap
buffers).

This sector assignment scheme is sufficient for use in a TimeWall-like
system, where all allocations are assumed to be static after task system
release. This is not suitable for a system with dynamic allocations,
unless userspace manually resets swap state regularly (benchmarks run a
reset at start, for example). Support for dynamic allocations is on the
backlog.

No significant speed impact. Benchmarks, 100 iters, after:
gpu_paging_speed, write:                 186.0ms +/- 3.51
gpu_paging_speed, read:                  162.7ms +/- 2.58
gpu_paging_overhead_speed, write start:   35.4ms +/- 4.47
gpu_paging_overhead_speed, write finish:   3.3ms +/- 0.18
gpu_paging_overhead_speed, read start:    69.8ms +/- 6.42
gpu_paging_overhead_speed, read finish:   43.2ms +/- 0.91
---
 drivers/gpu/nvgpu/include/nvgpu/linux/vm.h |  3 +++
 drivers/gpu/nvgpu/os/linux/ioctl_as.c      |  8 ++++++++
 drivers/gpu/nvgpu/os/linux/swap.h          | 31 ++++++++++++++++++++++++++----
 drivers/gpu/nvgpu/os/linux/vm.c            |  1 +
 4 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/nvgpu/include/nvgpu/linux/vm.h b/drivers/gpu/nvgpu/include/nvgpu/linux/vm.h
index 85abce6f..4fa4242c 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/linux/vm.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/linux/vm.h
@@ -51,7 +51,10 @@ struct nvgpu_mapped_buf_priv {
         struct sg_table *sgt;
         // For fast reverse lookup (FD -> mapped_buf)
         struct list_head nvmap_priv_entry;
+        // To allow waiting on swap I/O completion
         struct completion swap_io_done;
+        // Sector assignment for swapped-out data
+        sector_t swap_sector;
 };
 
 /* NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL must be set */
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_as.c b/drivers/gpu/nvgpu/os/linux/ioctl_as.c
index 6348bb2a..2bf8363a 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_as.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_as.c
@@ -672,6 +672,14 @@ long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                 err = nvgpu_as_dev_ioctl_swap(cmd, as_share,
                                 (struct nvgpu_as_swap_buffer_args *)buf);
                 break;
+        case NVGPU_AS_IOCTL_SWAP_RESET:
+                // On-disk sector assignment is linear currently, and needs to
+                // be reset to the start between task systems to avoid disk
+                // space exhaustion.
+                // TODO: Support garbage-collection- or callback-driven sector
+                // reclaiming rather than requiring manual reset.
+                atomic64_set(&nvgpu_swap_next_sector, 4);
+                break;
         default:
                 err = -ENOTTY;
                 break;
diff --git a/drivers/gpu/nvgpu/os/linux/swap.h b/drivers/gpu/nvgpu/os/linux/swap.h
index 1e986095..3a648b26 100644
--- a/drivers/gpu/nvgpu/os/linux/swap.h
+++ b/drivers/gpu/nvgpu/os/linux/swap.h
@@ -1,8 +1,12 @@
 #include 
 #include 
-//#include 
+#include  // For SECTOR_SHIFT
+
+// Next sector to assign a mapped_buf to. Skip first disk block
+atomic64_t nvgpu_swap_next_sector = {4};
 
 // Callback for completion of the I/O chain
+// TODO: Error checking and handling
 static void complete_swap_io(struct bio *bio) {
         struct nvgpu_mapped_buf *m = bio->bi_private;
         bio_put(bio);
@@ -11,21 +15,38 @@
 
 // Queue a command to copy out an SGT to disk
 // TODO: Cache bdev
-// TODO: Don't hardcode sector 0
-// TODO: Figure out if submit_bio() can fail, and what to do then
+// TODO: Track, allocate, and recycle individual swap buffers on disk instead
+// of only supporting a global reset
 int copy(struct sg_table *sgt, int op, struct nvgpu_mapped_buf *m) {
         unsigned int i;
         struct scatterlist *sg;
         struct bio *bio;
         int err = 0;
         int sg_cnt = sgt->nents;
-        sector_t sector = 0; // XXX: For testing
+        sector_t sector = m->os_priv.swap_sector;
         // Find and open the block device
         struct block_device *bdev = blkdev_get_by_path("/dev/nvme0n1", FMODE_READ | FMODE_WRITE, copy);
         if (unlikely(IS_ERR(bdev))) {
                 printk(KERN_WARNING "Unabled to find `nvme0`, err %ld!\n", PTR_ERR(bdev));
                 return -ENODEV;
         }
+        // Assign a sector on-disk (0 indicates unassigned, we start at 4)
+        if (sector == 0) {
+                // Read block device size in sectors, and fail if we'd use more than 1/3rd
+                // of the disk (to stay in SLC-emulation-mode).
+                // TODO: Issue NVMe DSM commands to try to manage this better? Read-only
+                // regions should be able to be moved to TLC safely, whereas other
+                // data should be kept in the SLC cache to reduce wear.
+                if (atomic64_read(&nvgpu_swap_next_sector) >= i_size_read(bdev->bd_inode)/3) {
+                        err = -ENOMEM;
+                        goto out_put;
+                }
+                // Hand out sectors sequentially, and statically
+                // TODO: Intelligent sector allocation
+                sector = atomic64_add_return(m->size >> SECTOR_SHIFT, &nvgpu_swap_next_sector);
+                sector -= (m->size >> SECTOR_SHIFT);
+                m->os_priv.swap_sector = sector;
+        }
         // Reset the .done variable in the completion
         reinit_completion(&m->os_priv.swap_io_done);
         // bio_alloc() will never fail when allocating <= BIO_MAX_PAGES
@@ -58,8 +79,10 @@ int copy(struct sg_table *sgt, int op, struct nvgpu_mapped_buf *m) {
         }
 
         // Async submit. Caller should wait_for_completion_io(&m->os_priv.swap_io_done);
+        // Does not fail. Error reported via completion handler.
         submit_bio(bio);
 
+out_put:
         // Release our block device handle
         blkdev_put(bdev, FMODE_WRITE | FMODE_READ); // Is this safe?
         return err;
diff --git a/drivers/gpu/nvgpu/os/linux/vm.c b/drivers/gpu/nvgpu/os/linux/vm.c
index 9cd17981..a1c19a3a 100644
--- a/drivers/gpu/nvgpu/os/linux/vm.c
+++ b/drivers/gpu/nvgpu/os/linux/vm.c
@@ -269,6 +269,7 @@ int nvgpu_vm_map_linux(struct vm_gk20a *vm,
         else
                 // So we can always safely call list_del()
                 INIT_LIST_HEAD(&mapped_buffer->os_priv.nvmap_priv_entry);
+        mapped_buffer->os_priv.swap_sector = 0;
 
         *gpu_va = mapped_buffer->addr;
         return 0;
-- 
cgit v1.2.2
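
The reset ioctl above is the only way sectors are reclaimed, so a task system is
expected to issue NVGPU_AS_IOCTL_SWAP_RESET on the GPU address-space device
between task-system runs, before any buffer is swapped out again. The sketch
below is illustrative only and is not part of the patch: it assumes the ioctl is
defined with no payload (matching the handler above, which reads no argument),
that the nvgpu uapi header providing it is available to userspace as "nvgpu.h",
and that the address-space device node is /dev/nvhost-as-gpu as on typical
nvgpu/L4T systems.

// Hypothetical userspace reset between task systems. The header name, device
// node path, and no-argument ioctl definition are assumptions, not part of
// the patch; adjust to the actual uapi on the target system.
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "nvgpu.h"        // Assumed to define NVGPU_AS_IOCTL_SWAP_RESET

int main(void)
{
        int as_fd = open("/dev/nvhost-as-gpu", O_RDWR);

        if (as_fd < 0) {
                perror("open GPU address-space device");
                return 1;
        }
        // Rewind the linear sector allocator to its starting sector (4).
        // This invalidates every existing swap buffer, so it should only be
        // issued while no swapped-out data is still needed.
        if (ioctl(as_fd, NVGPU_AS_IOCTL_SWAP_RESET) < 0)
                perror("NVGPU_AS_IOCTL_SWAP_RESET");
        close(as_fd);
        return 0;
}

Because the in-kernel handler only performs an atomic64_set() on
nvgpu_swap_next_sector, the reset itself is cheap; the cost of the scheme is the
restriction noted in the commit message that allocations must be static between
resets.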