From 745b3ef2ac4d7afa99202e6afc441e3f0b97f5b4 Mon Sep 17 00:00:00 2001
From: Joshua Bakita <jbakita@cs.unc.edu>
Date: Mon, 30 May 2022 12:20:48 -0400
Subject: gpu-paging: Support asynchronous paging

- Fully enables *_ASYNC API
- Allows page mapping to be overlapped with I/O, resulting in an 11% speedup
  for synchronous reads

Benchmarks, 1,000 iters, before:
gpu_paging_speed, write: 185.5ms +/- 3.58
gpu_paging_speed, read: 180.5ms +/- 1.42
gpu_paging_overhead_speed, write start: 183.3ms +/- 3.89
gpu_paging_overhead_speed, write finish: 3.4ms +/- 2.61
gpu_paging_overhead_speed, read start: 181.6ms +/- 3.34
gpu_paging_overhead_speed, read finish: 41.1ms +/- 2.69

Benchmarks, 1,000 iters, after:
gpu_paging_speed, write: 185.8ms +/- 3.70
gpu_paging_speed, read: 161.3ms +/- 0.97
gpu_paging_overhead_speed, write start: 38.9ms +/- 5.47
gpu_paging_overhead_speed, write finish: 3.1ms +/- 2.42
gpu_paging_overhead_speed, read start: 79.4ms +/- 6.42
gpu_paging_overhead_speed, read finish: 44.3ms +/- 1.53
---
 drivers/gpu/nvgpu/include/nvgpu/linux/vm.h |  1 +
 drivers/gpu/nvgpu/os/linux/ioctl_as.c      | 13 +++++-----
 drivers/gpu/nvgpu/os/linux/swap.h          | 41 +++++++++++++++++-------------
 drivers/gpu/nvgpu/os/linux/vm.c            |  1 +
 4 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/nvgpu/include/nvgpu/linux/vm.h b/drivers/gpu/nvgpu/include/nvgpu/linux/vm.h
index b86a428a..85abce6f 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/linux/vm.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/linux/vm.h
@@ -51,6 +51,7 @@ struct nvgpu_mapped_buf_priv {
 	struct sg_table *sgt;
 	// For fast reverse lookup (FD -> mapped_buf)
 	struct list_head nvmap_priv_entry;
+	struct completion swap_io_done;
 };
 
 /* NVGPU_AS_MAP_BUFFER_FLAGS_DIRECT_KIND_CTRL must be set */
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_as.c b/drivers/gpu/nvgpu/os/linux/ioctl_as.c
index af6cdb5b..6348bb2a 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_as.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_as.c
@@ -365,9 +365,9 @@ static int nvgpu_as_ioctl_write_swap_buffer(
 	// (Assuming that NVMe DRAM acceses are uncached)
 	gk20a_mm_l2_flush(g, false);
 
-	// Copy out (blocking) TODO: non-blocking
+	// Copy out (non-blocking)
 	// Could fail on inaccessible swap device, etc
-	err = copy_out(m->os_priv.sgt);
+	err = copy_out(m->os_priv.sgt, m);
 
 out:
 	return err;
@@ -393,7 +393,7 @@ static int nvgpu_as_ioctl_write_swap_buffer_finish(
 	nvgpu_log_fn(g, " ");
 
 	// Wait for the pages to get written out
-	//wait_for_completion_io(m->os_priv.swap_completion);
+	wait_for_completion_io(&m->os_priv.swap_io_done);
 
 	// Unpin needs to happen after copy out is done
 	// (No return value check as it's a void function)
@@ -448,9 +448,8 @@ static int nvgpu_as_ioctl_read_swap_buffer(
 	// Do any bookeeping not done by gk20a_mm_pin()
 	m->os_priv.sgt = sgt;
 
-	// Reload page contents from disk (blocking)
-	// TODO: non-blocking
-	err = copy_in(sgt);
+	// Reload page contents from disk (non-blocking)
+	err = copy_in(sgt, m);
 	if (err) {
 		int err2;
 		// Rollback pinning and allocation
@@ -487,7 +486,7 @@ static int nvgpu_as_ioctl_read_swap_buffer_finish(
 	// Invalidate L2 so that TLB refill does not load stale PT
 	gk20a_mm_l2_flush(g, true);
 	// Wait for read to complete if it hasn't yet
-	//wait_for_completion_io(m->os_priv.swap_completion);
+	wait_for_completion_io(&m->os_priv.swap_io_done);
 
 	return err;
 }
diff --git a/drivers/gpu/nvgpu/os/linux/swap.h b/drivers/gpu/nvgpu/os/linux/swap.h
index f762ba81..1e986095 100644
--- a/drivers/gpu/nvgpu/os/linux/swap.h
+++ b/drivers/gpu/nvgpu/os/linux/swap.h
@@ -2,17 +2,23 @@
 #include <linux/bio.h>
 //#include <nvgpu/bug.h>
 
+// Callback for completion of the I/O chain
+static void complete_swap_io(struct bio *bio) {
+  struct nvgpu_mapped_buf *m = bio->bi_private;
+  bio_put(bio);
+  complete(&m->os_priv.swap_io_done);
+}
+
 // Queue a command to copy out an SGT to disk
 // TODO: Cache bdev
-// TODO: Asynchronous I/O
 // TODO: Don't hardcode sector 0
-int copy(struct sg_table *sgt, int op) {
+// TODO: Figure out if submit_bio() can fail, and what to do then
+int copy(struct sg_table *sgt, int op, struct nvgpu_mapped_buf *m) {
   unsigned int i;
   struct scatterlist *sg;
   struct bio *bio;
   int err = 0;
   int sg_cnt = sgt->nents;
-  struct bio *bio_orig;
   sector_t sector = 0; // XXX: For testing
   // Find and open the block device
   struct block_device *bdev = blkdev_get_by_path("/dev/nvme0n1", FMODE_READ | FMODE_WRITE, copy);
@@ -20,12 +26,15 @@ int copy(struct sg_table *sgt, int op) {
     printk(KERN_WARNING "Unabled to find `nvme0`, err %ld!\n", PTR_ERR(bdev));
     return -ENODEV;
   }
-  // Will never fail when allocating <= BIO_MAX_PAGES
+  // Reset the .done variable in the completion
+  reinit_completion(&m->os_priv.swap_io_done);
+  // bio_alloc() will never fail when allocating <= BIO_MAX_PAGES
   bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
-  bio_orig = bio;
   bio->bi_bdev = bdev; // Switch to bio_set_dev(bdev) in newer kernels
   bio->bi_iter.bi_sector = sector;
-  bio_set_op_attrs(bio, op, op == REQ_OP_WRITE ? WRITE_ODIRECT : 0);//REQ_SYNC); // XXX: Is REQ_SYNC necessary?
+  bio_set_op_attrs(bio, op, REQ_SYNC); // REQ_SYNC is identical to WRITE_ODIRECT
+  bio->bi_private = m;
+  bio->bi_end_io = complete_swap_io;
   // Copy the scatter-gather table (sgt) into a block I/O vector (bio vec)
   // bio_chain() approach borrowed from drivers/nvme/target/io-cmd.c:nvmet_execute_rw()
   for_each_sg(sgt->sgl, sg, sgt->nents, i) {
@@ -47,16 +56,12 @@ int copy(struct sg_table *sgt, int op) {
     sector += sg_dma_len(sg) >> 9;
     sg_cnt--;
   }
-  // Use blocking submit for now
-  // TODO: Switch to async via submit_bio(bio)
-  err = submit_bio_wait(bio);
 
-  if (bio->bi_error && bio->bi_error != err)
-    printk(KERN_WARNING "nvgpu: bio->bi_error %d != return val from submit_bio_wait() %d\n", bio->bi_error, err);
+  // Async submit. Caller should wait_for_completion_io(&m->os_priv.swap_io_done);
+  submit_bio(bio);
 
-//out:
-  bio_put(bio_orig); // TODO: Move to completion handler
-  blkdev_put(bdev, FMODE_WRITE|FMODE_READ);
+  // Release our block device handle
+  blkdev_put(bdev, FMODE_WRITE | FMODE_READ); // Is this safe?
   return err;
 }
 
@@ -107,11 +112,11 @@ int copy_all(struct vm_gk20a *vm) {
 	return 0;
 }
 
-int copy_out(struct sg_table *sgt) {
-  return copy(sgt, REQ_OP_WRITE);
+int copy_out(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
+  return copy(sgt, REQ_OP_WRITE, m);
 }
 
-int copy_in(struct sg_table *sgt) {
-  return copy(sgt, REQ_OP_READ);
+int copy_in(struct sg_table *sgt, struct nvgpu_mapped_buf *m) {
+  return copy(sgt, REQ_OP_READ, m);
 }
 
diff --git a/drivers/gpu/nvgpu/os/linux/vm.c b/drivers/gpu/nvgpu/os/linux/vm.c
index fcb58ac4..9cd17981 100644
--- a/drivers/gpu/nvgpu/os/linux/vm.c
+++ b/drivers/gpu/nvgpu/os/linux/vm.c
@@ -262,6 +262,7 @@ int nvgpu_vm_map_linux(struct vm_gk20a *vm,
 	mapped_buffer->os_priv.dmabuf = dmabuf;
 	mapped_buffer->os_priv.attachment = attachment;
 	mapped_buffer->os_priv.sgt    = sgt;
+	init_completion(&mapped_buffer->os_priv.swap_io_done);
 	nvmap_priv = nvmap_get_priv_list(dmabuf);
 	if (!IS_ERR(nvmap_priv))
 		list_add(&mapped_buffer->os_priv.nvmap_priv_entry, nvmap_priv);
-- 
cgit v1.2.2