summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <jbakita@cs.unc.edu>2021-08-26 13:04:27 -0400
committerJoshua Bakita <jbakita@cs.unc.edu>2021-08-26 13:04:27 -0400
commit5f661d8a5db3f7875f6bf36b4843a71fd08ecbea (patch)
treeb18ce3ceb27fd885cd6aec19a3c342bb9e7963ef
Add initial implementation
Supports accessing and printing the runlist on the Jetson Xavier to dmesg. May work on other Jetson boards. Currently requires the nvgpu headers from NVIDIA's Linux4Tegra (L4T) source tree.
-rw-r--r--.gitignore7
-rw-r--r--Makefile13
-rw-r--r--nvdebug.c278
-rw-r--r--nvdebug.h127
-rw-r--r--nvidia_preemption.md36
5 files changed, 461 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..197a191
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
1.*
2*.ko
3*.mod.c
4*.o
5*.o.*
6modules.order
7Module.symvers
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..cc14996
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,13 @@
# Out-of-tree kbuild Makefile for the nvdebug module.
obj-m += nvdebug.o

# TODO: Avoid needing to distribute NVIDIA's headers (at least they're MIT...)
#ccflags-y += -I$(PWD)/include
# Header search paths into NVIDIA's Linux4Tegra (L4T) nvgpu driver source
# tree (hard-coded local checkout; see TODO above).
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu/include
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include/uapi

# Delegate build/clean to the running kernel's kbuild system.
all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
clean:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
diff --git a/nvdebug.c b/nvdebug.c
new file mode 100644
index 0000000..31a797e
--- /dev/null
+++ b/nvdebug.c
@@ -0,0 +1,278 @@
1/* Copyright 2021 Joshua Bakita
2 * SPDX-License-Identifier: MIT
3 */
4
5/* TODO
6 * - Add /proc /sys or debugfs interface
7 * - Add API to trigger a preemption
8 */
9
10#include <linux/module.h>
11#include <linux/kernel.h>
12#include <linux/device.h>
13#include <linux/kallsyms.h>
14#include <linux/iommu.h> // For struct iommu_domain
15#include <asm/io.h>
16
17/* Currently used symbols:
18 * - struct gk20a;
19 * - struct nvgpu_os_linux;
20 * - void nvgpu_writel(struct gk20a *g, u32 reg_addr, u32 value);
21 */
22#include <nvgpu/io.h>
23#include <nvgpu/gk20a.h>
24#include <os/linux/os_linux.h>
25
26#include "nvdebug.h"
27
28MODULE_LICENSE("GPL"); // LIAR
29MODULE_AUTHOR("Joshua Bakita");
30MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");
31MODULE_SOFTDEP("pre: nvgpu"); // We only support the Jetson boards for now
32
33// Bus types are global symbols in the kernel
34extern struct bus_type platform_bus_type;
35
// Recover the nvgpu driver's per-GPU state (struct gk20a) from a platform
// device's driver data.
// XXX: Only works because a `struct gk20a *` is the first member of the
// gk20a_platform structure that dev_get_drvdata() actually returns.
static inline struct gk20a *get_gk20a(struct device *dev) {
	struct gk20a **g_ptr = dev_get_drvdata(dev);
	return *g_ptr;
}
40
41// Functionally identical to nvgpu_readl()
42// (except we don't try to resolve situations where regs is NULL)
43static inline u32 nvdebug_readl(struct gk20a* g, u32 r) {
44 struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
45 if (unlikely(!g_os->regs)) {
46 printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
47 return -1;
48 }
49 return readl(g_os->regs + r);
50}
51
52// Functionally identical to nvgpu_writel()
53static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) {
54 struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
55 if (unlikely(!g_os->regs)) {
56 printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
57 return;
58 }
59 writel_relaxed(v, g_os->regs + r);
60 wmb();
61}
62/*
63#define RUNLIST_PROCFS_NAME "runlist"
64
65static const struct seq_operations runlist_file_seq_ops = {
66 .start =
67 .next =
68 .stop =
69 .show =
70};
71
72static const struct file_operations runlist_file_ops = {
73 .read =
74*/
75/*static void read_bytes(struct gk20a *g, void* target, u32 start, u32 num_bytes) {
76 u32 *output = target;
77 u32 i;
78 // Read u32s from the GPU
79 for (i = 0; i < num_bytes; i += 4) {
80 output[i/4] = _nvgpu_readl(g, start + i);
81 printk(KERN_INFO "[nvdebug] U32 %d: %0x\n", i, output[i/4]);
82 }
83}
84
85static void read_bytes(void* target, void* start, u32 num_bytes) {
86 u32 *output = target;
87 u32 i;
88 // Read u32s from the GPU
89 for (i = 0; i < num_bytes; i += 4) {
90 output[i/4] = readl(start + i);
91 printk(KERN_INFO "[nvdebug] U32 %d: %0x\n", i, output[i/4]);
92 }
93}*/
94
95/*
96 +---- TSG Entry %d ----+
97 | Scale: %d |
98 | Timeout: %d |
99 +----------------------+
100
101
102
103
104
105
106*/
107
108#define PRE KERN_INFO "[nvdebug] "
109
110static void nvdebug_print_tsg(struct entry_tsg* tsg) {
111 if (tsg->entry_type != ENTRY_TYPE_TSG) {
112 printk(KERN_WARNING "[nvdebug] Attempted to print non-TSG in nvdebug_print_tsg()!\n");
113 return;
114 }
115 printk(PRE "+---- TSG Entry %-2d----+", tsg->tsgid);
116 printk(PRE "| Scale: %-13d|", tsg->timeslice_scale);
117 printk(PRE "| Timeout: %-11d|", tsg->timeslice_timeout);
118 printk(PRE "+---------------------+");
119}
120
121static void nvdebug_print_chan(struct runlist_chan* chan) {
122 char* loc_txt;
123 u64 inst_ptr;
124 if (chan->entry_type != ENTRY_TYPE_CHAN) {
125 printk(KERN_WARNING "[nvdebug] Attempted to print non-channel in nvdebug_print_channel()!\n");
126 return;
127 }
128 switch (chan->inst_target) {
129 case TARGET_VID_MEM:
130 loc_txt = "VID_MEM";
131 break;
132 case TARGET_SYS_MEM_COHERENT:
133 loc_txt = "SYS_MEM_COHERENT";
134 break;
135 case TARGET_SYS_MEM_NONCOHERENT:
136 loc_txt = "SYS_MEM_NONCOHERENT";
137 break;
138 default:
139 printk(KERN_WARNING "[nvdebug] Invalid aperture in runlist channel!\n");
140 return;
141 }
142 // Reconstruct pointer to channel instance block
143 inst_ptr = chan->inst_ptr_hi;
144 inst_ptr <<= 32;
145 inst_ptr |= chan->inst_ptr_lo << 12;
146
147 printk(PRE " +- Channel Entry %-4d-+", chan->chid);
148 printk(PRE " | Runqueue Selector: %d|", chan->runqueue_selector);
149 printk(PRE " | Instance PTR: |");
150 printk(PRE " | %#018llx |", inst_ptr);
151 printk(PRE " | %-20s|", loc_txt);
152 printk(PRE " +---------------------+");
153}
154
// Iterate `chan` over every channel entry belonging to TSG `tsg`.
// Channel entries immediately follow their TSG header in the runlist, so
// `tsg + 1` is the first channel. NOTE(review): the bound arithmetic assumes
// sizeof(struct entry_tsg) == sizeof(struct runlist_chan) (both 16 bytes per
// the bitfield layouts in nvdebug.h) — confirm if either struct changes.
#define for_chan_in_tsg(chan, tsg) \
	for (chan = (struct runlist_chan*)(tsg + 1); \
	     (void*)chan < (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length; \
	     chan++)

// Advance to the next TSG header: skip this header plus all of its channels.
// (void* arithmetic is a GNU extension; fine in kernel code.)
#define next_tsg(tsg) \
	(void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length
162
163static void nvdebug_print_runlist(struct entry_tsg* head, runlist_info_t rl_info) {
164 int rl_idx = 0;
165 struct runlist_chan* chan;
166 printk(PRE "tsg->tsg_length: %d\n", head->tsg_length);
167 printk(PRE "rl_info.len: %d\n", rl_info.len);
168 while (rl_idx < rl_info.len) {
169 nvdebug_print_tsg(head);
170 for_chan_in_tsg(chan, head) {
171 nvdebug_print_chan(chan);
172 }
173 rl_idx += 1 + head->tsg_length;
174 head = next_tsg(head);
175 }
176}
177
// Module entry point: locate the integrated GPU's platform device (gv11b on
// the Jetson Xavier), read the active runlist base/info out of the PFIFO
// registers, and dump the runlist contents to dmesg.
// Returns 0 on success, -EIO if no matching device or no register mapping.
static int __init nvdebug_init(void) {
	struct device *dev = NULL;
	struct device *temp_dev;
	struct gk20a *g;
	struct entry_tsg head;
	runlist_base_t rl_base;
	runlist_info_t rl_info;
	u64 runlist_iova;
	// Get the last device that matches our name
	// NOTE(review): "17000000.gv11b" is the Xavier device-tree node; other
	// Jetson boards will need a different name — confirm per-board.
	while ((temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b"))) {
		printk(KERN_INFO "Found a matching device\n");
		dev = temp_dev;
	}
	if (!dev)
		return -EIO;
	g = get_gk20a(dev);
	// This address seems to not be:
	// - A GPU address (type is sysmem_coherent)
	// - A physical address (dereferencing after ioremap crashes)
	// - A kernel virtual address (dereferencing segfaults)
	// So maybe it's some sort of custom thing? This is an address that the GPU
	// can use, so it would make most sense for it to be a physical address.
	//
	// BUT, it can't possibly be a physical address, as it would refer to an
	// address greater than the maximum one on our system (by a lot!).
	// Maybe I'm reading the runlist base wrong?
	// Aha, the driver calls it runlist_iova. Sounds like runlist I/O virtual
	// address! So, what's this I/O address space? All I know is that it's what
	// nvgpu_mem_get_addr() returns. That function returns the result of either:
	// - gpu_phys_addr which is __nvgpu_sgl_phys on our platform which (?)
	//   converts an IPA to a PA?
	// - nvgpu_mem_iommu_translate
	//
	// The original memory is allocated with nvgpu_dma_alloc_flags_sys(), which
	// returns SYSMEM.
	//
	// To convert a physical address to a IOMMU address, we add a bit
	//
	// BUT, it turns out that it IS JUST A PHYSICAL ADDRESS! It wasn't working
	// before because the GPU had simply gone to sleep and invalidated its
	// register state, so nvgpu_readl() was simply returning garbage.

	printk(KERN_INFO "[nvdebug] Pulling runlist base address from %x\n", NV_PFIFO_RUNLIST_BASE);
	printk(KERN_INFO "[nvdebug] Using struct gk20a* of %px\n", g);
	printk(KERN_INFO "[nvdebug] g->name: %s, g->power_on: %d, g->sw_ready: %d, g->is_virtual %d\n", g->name, g->power_on, g->sw_ready, g->is_virtual);
	struct nvgpu_os_linux *l = container_of(g, struct nvgpu_os_linux, g);
	printk(KERN_INFO "[nvdebug] l->regs %px, l->regs_saved %px\n", l->regs, l->regs_saved);
	// Bail out if BAR0 is not currently mapped (e.g. GPU asleep/railgated);
	// register reads through a NULL mapping would be garbage (see above).
	if (!l->regs)
		return -EIO;
	rl_base.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST_BASE);
	rl_info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST);
	// The register holds the base address >> 12 (runlists are 4k-aligned).
	runlist_iova = ((u64)rl_base.ptr) << 12;
	printk(KERN_INFO "[nvdebug] Runlist ptr: %x, type: %d, raw: %x, IOVA: %px\n", rl_base.ptr, rl_base.type, rl_base.raw, (void*)runlist_iova);
	// Segfaults
	//u32 attempted_read = ioread32(runlist_iova);
	//printk(KERN_INFO "[nvdebug] first word of runlist: %0x\n", attempted_read);

	// Errors out
	//u32* virt_rt_addr = ioremap(phys_rl_addr, sizeof(struct entry_tsg));
	//printk(KERN_INFO "[nvdebug] Runlist virt_addr: %px\n", virt_rt_addr);

	/* Overcomplicated?
	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
	if (!domain) {
		printk(KERN_INFO "[nvdebug] No IOMMU domain!\n");
		return -EIO;
	}
	u64 phys_addr = platform_bus_type.iommu_ops->iova_to_phys(domain, runlist_iova);
	printk(KERN_INFO "[nvdebug] Runlist PA: %px\n", phys_addr);
	*/

	// Treat the "IOVA" as a plain physical address (per the investigation
	// above) and access the runlist through the kernel's linear mapping.
	printk(KERN_INFO "[nvdebug] Runlist phys_to_virt: %px\n", (void*)phys_to_virt(runlist_iova));
	printk(KERN_INFO "[nvdebug] Runlist *phys_to_virt: %x\n", *(u32*)phys_to_virt(runlist_iova));
	head = *(struct entry_tsg*)phys_to_virt(runlist_iova);
	nvdebug_print_runlist((struct entry_tsg*)phys_to_virt(runlist_iova), rl_info);
	//nvdebug_print_tsg(&head);
	//nvdebug_print_chan((struct runlist_chan*)(phys_to_virt(runlist_iova) + sizeof(struct entry_tsg)));
	//printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type);
	//printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale);
	//printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout);
	//printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length);
	//printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid);

	//printk(KERN_INFO "[nvdebug] Mem base phys: %p\n", (void*)virt_to_phys((void*)0xffffffc000000000ULL));
	//printk(KERN_INFO "[nvdebug] Mem end phys: %p\n", (void*)virt_to_phys((void*)0xffffffc400000000ULL));
	//printk(KERN_INFO "[nvdebug] Runlist *virt_addr: %x\n", readl(virt_rt_addr)); // This crashes
	//read_bytes(&head, virt_rt_addr, sizeof(struct entry_tsg));
	/*printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type);
	printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale);
	printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout);
	printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length);
	printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid);
	*/return 0;
}
272
// Module teardown: nothing to undo — init only reads registers and memory.
static void __exit nvdebug_exit(void) {
	printk(KERN_INFO "[nvdebug] Exiting...\n");
}
276
277module_init(nvdebug_init);
278module_exit(nvdebug_exit);
diff --git a/nvdebug.h b/nvdebug.h
new file mode 100644
index 0000000..aa5d0cf
--- /dev/null
+++ b/nvdebug.h
@@ -0,0 +1,127 @@
1/* Copyright 2021 Joshua Bakita
2 * SPDX-License-Identifier: MIT
3 */
4
5/* Runlist Channel
6 A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue
7 of GPU commands. These commands are typically queued from userspace.
8
9 `INST_PTR` points to a GPU Instance Block which contains pointers to the GPU
10 virtual address space for this context. All channels in a TSG point to the
11 same GPU Instance Block.
12
13 ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_CHAN
14 CHID (ID) : identifier of the channel to run (overlays ENTRY_ID)
15 RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if
16 more than one PBDMA is supported by the runlist
17
18 INST_PTR_LO : lower 20 bits of the 4k-aligned instance block pointer
19 INST_PTR_HI : upper 32 bit of instance block pointer
20 INST_TARGET (TGI) : aperture of the instance block
21
22 USERD_PTR_LO : upper 24 bits of the low 32 bits, of the 512-byte-aligned USERD pointer
23 USERD_PTR_HI : upper 32 bits of USERD pointer
24 USERD_TARGET (TGU) : aperture of the USERD data structure
25*/
// Discriminator carried in bit 0 of every runlist entry.
enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1};
// Aperture selector for GPU-accessible memory. Value 1 is absent here;
// presumably reserved/invalid — confirm against NVIDIA's dev_ram headers.
enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3};

// Channel runlist entry (128 bits; field meanings in the comment above).
struct runlist_chan {
// 0:63
	enum ENTRY_TYPE entry_type:1;    // ENTRY_TYPE_CHAN for a channel entry
	uint32_t runqueue_selector:1;    // which PBDMA runs this channel
	uint32_t padding:2;
	enum INST_TARGET inst_target:2;  // aperture of the instance block
	uint32_t padding2:2;
	uint32_t userd_ptr_lo:24;        // bits 8:31 of the USERD pointer
	uint32_t userd_ptr_hi:32;        // bits 32:63 of the USERD pointer
// 64:128
	uint32_t chid:12;                // channel identifier
	uint32_t inst_ptr_lo:20;         // bits 12:31 of the instance block pointer
	uint32_t inst_ptr_hi:32;         // bits 32:63 of the instance block pointer
} __attribute__((packed));
43
44/* Runlist TSG (TimeSlice Group)
45 The runlist is composed of timeslice groups (TSG). Each TSG corresponds
46 to a single virtual address space on the GPU and contains `TSG_LENGTH`
47 channels. These channels and virtual address space are accessible to the GPU
48 host unit for use until the timeslice expires or a TSG switch is forcibly
49 initiated via a write to `NV_PFIFO_PREEMPT`.
50
51 timeslice = (TSG_TIMESLICE_TIMEOUT << TSG_TIMESLICE_SCALE) * 1024 nanoseconds
52
53 ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_TSG
54 TSGID : identifier of the Timeslice group (overlays ENTRY_ID)
55 TSG_LENGTH : number of channels that are part of this timeslice group
56 TIMESLICE_SCALE : scale factor for the TSG's timeslice
57 TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice
58*/
// TSG runlist entry (128 bits; field meanings in the comment above).
// timeslice = (timeslice_timeout << timeslice_scale) * 1024 ns
struct entry_tsg {
// 0:63
	enum ENTRY_TYPE entry_type:1;   // ENTRY_TYPE_TSG for a TSG entry
	uint64_t padding:15;
	uint32_t timeslice_scale:4;     // scale factor for this TSG's timeslice
	uint64_t padding2:4;
	uint32_t timeslice_timeout:8;   // timeout amount for this TSG's timeslice
	uint32_t tsg_length:8;          // number of channel entries in this TSG
	uint32_t padding3:24;
// 64:128
	uint32_t tsgid:12;              // timeslice group identifier
	uint64_t padding4:52;
} __attribute__((packed));
72
// Selects whether a preempt request names a single channel or a whole TSG.
enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1};

/* Preempt
   ID/CHID : Id of TSG or channel to preempt
*/
#define NV_PFIFO_PREEMPT 0x00002634
// Layout of the 32-bit NV_PFIFO_PREEMPT register.
struct pfifo_preempt {
	uint32_t id:12;           // TSG or channel id to preempt
	uint32_t padding:8;
	bool is_pending:1;        // NOTE(review): presumably set while a preempt is outstanding — confirm
	uint32_t padding2:3;
	enum PREEMPT_TYPE type:2; // whether `id` names a channel or a TSG
	uint32_t padding3:6;
} __attribute__((packed));
87
#define NV_PFIFO_RUNLIST_PREEMPT 0x00002638
// Layout of the 32-bit NV_PFIFO_RUNLIST_PREEMPT register: one bit per
// runlist. Writing a runlist's bit initiates a preempt of that runlist
// (see nvidia_preemption.md, "Alternate 2").
struct runlist_preempt {
	bool runlist_0:1;
	bool runlist_1:1;
	bool runlist_2:1;
	bool runlist_3:1;
	bool runlist_4:1;
	bool runlist_5:1;
	bool runlist_6:1;
	bool runlist_7:1;
	bool runlist_8:1;
	bool runlist_9:1;
	bool runlist_10:1;
	bool runlist_11:1;
	bool runlist_12:1;
	bool runlist_13:1;
	// Fix: 14 runlist bits + 18 padding bits = 32 bits. The previous
	// `padding:28` summed to 42 bits, so the packed struct was 6 bytes and
	// no longer mirrored the 32-bit register.
	uint32_t padding:18;
} __attribute__((packed));
106
// Note: This is different with Turing
#define NV_PFIFO_RUNLIST_BASE 0x00002270
// 32-bit view of the runlist base-address register. The union lets callers
// read the raw register once and then pick apart the fields.
typedef union {
	struct {
		uint32_t ptr:28;     // runlist base address >> 12 (4k-aligned)
		uint32_t type:2;     // aperture of the runlist; nvdebug.c observes
		                     // sysmem_coherent here — confirm encoding
		uint32_t padding:2;
	} __attribute__((packed));
	uint32_t raw;            // whole register as read via nvdebug_readl()
} runlist_base_t;
117
#define NV_PFIFO_RUNLIST 0x00002274
// 32-bit view of the runlist submit/info register.
typedef union {
	struct {
		uint32_t len:16;     // total number of entries in the runlist
		uint32_t padding:4;
		uint32_t id:4;       // which runlist this describes
		uint32_t padding2:8;
	} __attribute__((packed));
	uint32_t raw;            // whole register as read via nvdebug_readl()
} runlist_info_t;
diff --git a/nvidia_preemption.md b/nvidia_preemption.md
new file mode 100644
index 0000000..051d4a5
--- /dev/null
+++ b/nvidia_preemption.md
@@ -0,0 +1,36 @@
1# NVIDIA GPU Preemption
2
3MVP: Preempt current work on the GPU on the Jetson Xavier
4
5Summary of approach: Create new runlist that excludes the current work and point the GPU to it
6
71. Obtain current runlist
82. Copy runlist to new location, skipping TSG of target to preempt
93. Write new runlist address to NV_PFIFO_RUNLIST, which will preempt current work
10
11It's unclear if this approach is lower-overhead than that of Capodieci et al.
12See approach Alternate 1 which is our new priority.
13
14Notes:
15- Each TSG (timeslice group) corresponds to one context (?)
16- Runlist base must be 4k aligned
17- nvgpu driver gets gk20a struct via container_of an inode which is a struct nvgpu_os_linux
18- gk20a_writel is nvgpu_writel. Define is: `void nvgpu_writel(struct gk20a *g, u32 reg_addr, u32 value);`
19- gk20a_readl is nvgpu_readl. Define is: `u32 nvgpu_readl(struct gk20a *g, u32 reg_addr);`
20
21## Other approaches:
22
23### Alternate 1:
24 "2. Disable all channels in the containing TSG by writing ENABLE_CLR to TRUE
25 in their channel RAM entries in NV_PCCSR_CHANNEL (see dev_fifo.ref).
26 3. Initiate a preempt of the TSG via NV_PFIFO_PREEMPT or
27 NV_PFIFO_RUNLIST_PREEMPT." (PBDMA, "Recovery procedure")
28
29### Alternate 2:
30 "3. Initiate a preempt of the engine by writing the bit associated with its
31 runlist to NV_PFIFO_RUNLIST_PREEMPT. This allows us to begin the preempt
32 process prior to doing the slow register reads needed to determine whether
33 the context has hit any interrupts or is hung. Do not poll
34 NV_PFIFO_RUNLIST_PREEMPT for the preempt to complete." (FIFO, "Context TSG tear-down procedure")
35
36See `nvdebug.c` and `nvdebug.h` for implementation details.