From 47506870790989b5e2d9a6128711d96c487f0d7b Mon Sep 17 00:00:00 2001
From: Joshua Bakita <jbakita@cs.unc.edu>
Date: Mon, 8 Apr 2024 15:35:54 -0400
Subject: Heavily refactor runlist code for correctness and Turing support

- Support differently-formatted runlist registers on Turing
- Support different runlist register offsets on Turing
- Fix incorrect indenting when printing the runlist
- Fix `preempt_tsg` and `switch_to_tsg` API implementations to
  correctly interface with the hardware (previously, they would try
  to disable scheduling for the last-updated runlist pointer, which
  was nonsense, and just an artifact of my early misunderstandings
  of how the NV_PFIFO_RUNLIST* registers worked).
- Remove misused NV_PFIFO_RUNLIST and NV_PFIFO_RUNLIST_BASE registers
- Refactor `runlist.c` to use the APIs from `bus.c`
---
 nvdebug.h        | 116 +++++++++++++++++++++---------
 runlist.c        | 212 +++++++++++++++++++++++--------------------------------
 runlist_procfs.c |  52 ++++++--------
 3 files changed, 192 insertions(+), 188 deletions(-)

diff --git a/nvdebug.h b/nvdebug.h
index 2fc8c63..f65b403 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -131,8 +131,8 @@ struct gm107_runlist_chan {
   GPU instance addresses with Volta.
 */
 
-// Support: Volta, Ampere*, Turing*
-// *These treat the top 8 bits of TSGID as GFID (unused)
+// Support: Volta, Turing*, Ampere*
+// *These treat bits 4:11 (8 bits) as GFID (unused)
 struct gv100_runlist_tsg {
 // 0:63
 	enum ENTRY_TYPE entry_type:1;
@@ -166,7 +166,7 @@ enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1};
 
 /* Preempt a TSG or Channel by ID
   ID/CHID     : Id of TSG or channel to preempt
-  IS_PENDING  : Is a context switch pending?
+  IS_PENDING  : Is a context switch pending? (read-only)
   TYPE        : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG
 
   Support: Kepler, Maxwell, Pascal, Volta, Turing
@@ -201,7 +201,7 @@ typedef union {
     rl_preempt.raw |= BIT(nr);
     nvdebug_writel(g, NV_PFIFO_RUNLIST_PREEMPT, rl_preempt.raw);
 
-  Support: Volta
+  Support: Volta, Turing
 */
 #define NV_PFIFO_RUNLIST_PREEMPT 0x00002638
 typedef union {
@@ -255,39 +255,83 @@ typedef union {
  * cause a system to hang/stop responding."
  */
 
-// Note: This is different with Turing
-// Support: Fermi, Kepler, Maxwell, Pascal, Volta
-#define NV_PFIFO_RUNLIST_BASE 0x00002270
-#define NV_PFIFO_ENG_RUNLIST_BASE(i) (0x00002280+(i)*8)
+/* Runlist Metadata (up through Volta)
+  "Software specifies the GPU contexts that hardware should "run" by writing a
+  list of entries (known as a "runlist") to a 4k-aligned area of memory (beginning
+  at NV_PFIFO_RUNLIST_BASE), and by notifying Host that a new list is available
+  (by writing to NV_PFIFO_RUNLIST).
+
+  Submission of a new runlist causes Host to expire the timeslice of all work
+  scheduled by the previous runlist, allowing it to schedule the channels present
+  in the new runlist once they are fetched. SW can check the status of the runlist
+  by polling NV_PFIFO_ENG_RUNLIST_PENDING. (see dev_fifo.ref NV_PFIFO_RUNLIST for
+  a full description of the runlist submit mechanism).
+
+  Runlists can be stored in system memory or video memory (as specified by
+  NV_PFIFO_RUNLIST_BASE_TARGET). If a runlist is stored in video memory, software
+  will have to execute flush or read the last entry written before submitting the
+  runlist to Host to guarantee coherency." (volta/dev_ram.ref.txt)
+
+  We only document the *_PFIFO_ENG_RUNLIST_*(i) read-only registers here (where
+  i is a runlist index). Runlists are configured via the separate, writable
+  *_PFIFO_RUNLIST_* register; see open-gpu-doc for more on that.
+
+  LEN         : Number of entries in runlist
+  IS_PENDING  : Is a runlist submission still pending (not yet committed)?
+  PTR         : Pointer to start of 4k-aligned runlist (upper 28 of 40 bits)
+  TARGET      : Aperture of runlist (video or system memory)
+
+  Support: Fermi*, Kepler, Maxwell, Pascal, Volta
+  *Fermi may expose this information 8 bytes earlier, starting at 0x227C?
+*/
+#define NV_PFIFO_ENG_RUNLIST_BASE_GF100(i) (0x00002280+(i)*8) // Read-only
 typedef union {
 	struct {
+		// NV_PFIFO_ENG_RUNLIST_BASE_* fields
 		uint32_t ptr:28;
 		enum INST_TARGET target:2;
-		 uint32_t padding:2;
+		 uint32_t padding1:2;
+		// NV_PFIFO_ENG_RUNLIST_* fields
+		uint16_t len:16;
+		 uint32_t padding2:4;
+		bool is_pending:1;
+		 uint32_t padding3:11;
 	} __attribute__((packed));
-	uint32_t raw;
-} runlist_base_t;
+	uint64_t raw;
+} eng_runlist_gf100_t;
 
-// Support: Kepler, Maxwell, Pascal, Volta
-// Works on Fermi, but id is one bit longer and is b11111
-#define NV_PFIFO_RUNLIST 0x00002274
-#define NV_PFIFO_ENG_RUNLIST(i) (0x00002284+(i)*8)
+/*
+  Starting with Turing, the separate registers for reading and writing runlist
+  configuration were dropped in favor of read/write indexed registers. As part
+  of this, the layout was modified to allow for larger runlist pointers (upper
+  52 of 64 bits).
+
+  Support: Turing, Ampere, Lovelace?, Hopper?
+*/
+// Support: Turing
+#define NV_PFIFO_RUNLIST_BASE_TU102(i) (0x00002B00+(i)*16) // Read/write
+#define NV_PFIFO_RUNLIST_SUBMIT_TU102(i) (0x00002B08+(i)*16) // Read/write
 typedef union {
-	// RUNLIST fields
 	struct {
-		uint32_t len:16;
-		 uint32_t padding:4;
-		uint32_t id:4; // Runlist ID (each engine may have a seperate runlist)
-		 uint32_t padding2:8;
+		enum INST_TARGET target:2;
+		 uint32_t padding:10;
+		uint64_t ptr:28;
+		 uint32_t padding2:24;
 	} __attribute__((packed));
-	// ENG_RUNLIST fields that differ
+	uint64_t raw;
+} runlist_base_tu102_t;
+
+typedef union {
 	struct {
-		 uint32_t padding3:20;
-		bool is_pending:1; // Is runlist not yet committed?
-		 uint32_t padding4:11;
+		uint16_t len:16;
+		uint16_t offset:16;
+		uint32_t preempted_tsgid:14;
+		bool valid_preempted_tsgid:1;
+		bool is_pending:1;
+		uint32_t preempted_offset:16;
 	} __attribute__((packed));
-	uint32_t raw;
-} runlist_info_t;
+	uint64_t raw;
+} runlist_submit_tu102_t;
 
 enum CHANNEL_STATUS {
 	CHANNEL_STATUS_IDLE = 0,
@@ -307,8 +351,13 @@ enum CHANNEL_STATUS {
 	CHANNEL_STATUS_ON_ENG_PENDING_ACQ_CTX_RELOAD = 14,
 };
 
+/* Programmable Channel Control System RAM (PCCSR)
+
+  512-entry array of channel control and status data structures.
+
+  Support: Fermi, Maxwell, Pascal, Volta, Turing, [more?]
+*/
 #define NV_PCCSR_CHANNEL_INST(i) (0x00800000+(i)*8)
-// There are a total of 512 possible channels
 #define MAX_CHID 512
 typedef union {
 	struct {
@@ -1023,12 +1072,12 @@ VERSIONED_RL_ACCESSOR(tsg, uint32_t, tsg_length);
 struct runlist_iter {
 	// Pointer to either a TSG or channel entry (they're the same size)
 	void *curr_entry;
-	// This should be set to tsg_length when a TSG is reached, and
-	// decremented as each subsequent channel is printed. This allows us to
-	// track which channel are and are not part of the TSG.
-	int channels_left_in_tsg;
-	// Total runlist length, etc
-	runlist_info_t rl_info;
+	// This should be set to tsg_length + 1 when a TSG is reached, and
+	// decremented each time _next() is called. This allows us to
+	// track which channels are and are not part of the TSG.
+	int entries_left_in_tsg;
+	// Number of entries in runlist
+	int len;
 };
 
 #define NVDEBUG_MAX_DEVICES 8
@@ -1037,6 +1086,7 @@ extern struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
 // Defined in runlist.c
 int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl_iter);
 int preempt_tsg(struct nvdebug_state *g, uint32_t tsg_id);
+int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id);
 
 // Defined in mmu.c
 uint32_t vram2PRAMIN(struct nvdebug_state *g, uint64_t addr);
diff --git a/runlist.c b/runlist.c
index ed35c7e..c725e77 100644
--- a/runlist.c
+++ b/runlist.c
@@ -1,172 +1,134 @@
-#include <linux/kernel.h>  // Kernel types
+/* Copyright 2024 Joshua Bakita
+ * Helpers for dealing with the runlist and other Host (PFIFO) registers
+ */
+#include <linux/printk.h> // For printk()
+#include <asm/errno.h> // For error defines
+#include <asm/io.h> // For phys_to_virt()
 
 #include "nvdebug.h"
 
+// Uncomment to, upon BAR2 access failure, return a PRAMIN-based runlist pointer
+// **If enabled, PRAMIN may not be otherwise used while walking the runlist!**
 #define FALLBACK_TO_PRAMIN
 
 /* Get runlist head and info (incl. length)
-   @param rl_iter Location at which to store output
-   @param rl_id   Which runlist to obtain?
+  @param rl_id   Which runlist to obtain?
+  @param rl_iter Location at which to store output
+  @return 0 or -errno on error
 */
 int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl_iter) {
-	runlist_base_t rl_base;
-	runlist_info_t rl_info;
-	u64 runlist_iova;
+	uint64_t runlist_iova;
+	enum INST_TARGET runlist_target;
+	uint16_t runlist_len;
+#ifdef FALLBACK_TO_PRAMIN
+	int off;
+#endif // FALLBACK_TO_PRAMIN
+	// Zero-initialize the runlist iterator
 	*rl_iter = (struct runlist_iter){0};
-	rl_base.raw = nvdebug_readl(g, NV_PFIFO_ENG_RUNLIST_BASE(rl_id));
-	// Check that reads are working
-	if (rl_base.raw == -1)
-		return -EIO;
-	// The address pointed to `regs` + NV_PFIFO_RUNLIST_BASE seems to not be:
-	// - A GPU address (type is sysmem_coherent)
-	// - A physical address (dereferencing after ioremap crashes)
-	// - A kernel virtual address (dereferencing segfaults)
-	// So maybe it's some sort of custom thing? This is an address that the GPU
-	// can use, so it would make most sense for it to be a physical address.
-	//
-	// BUT, it can't possibly be a physical address, as it would refer to an
-	// address greater than the maximum one on our system (by a lot!).
-	// Maybe I'm reading the runlist base wrong?
-	// Aha, the driver calls it runlist_iova. Sounds like runlist I/O virtual
-	// address! So, what's this I/O address space? All I know is that it's what
-	// nvgpu_mem_get_addr() returns. That function returns the result of either:
-	// - gpu_phys_addr which is  __nvgpu_sgl_phys on our platform which (?)
-	//   converts an IPA to a PA?
-	// - nvgpu_mem_iommu_translate
-	//
-	// The original memory is allocated with nvgpu_dma_alloc_flags_sys(), which
-	// returns SYSMEM.
-	//
-	// To convert a physical address to a IOMMU address, we add a bit
-	//
-	// BUT, it turns out that it IS JUST A PHYSICAL ADDRESS! It wasn't working
-	// before because the GPU had simply gone to sleep and invalidated its
-	// register state, so nvgpu_readl() was simply returning garbage.
-	rl_info.raw = nvdebug_readl(g, NV_PFIFO_ENG_RUNLIST(rl_id));
-	if (rl_info.raw == -1)
-		return -EIO;
-	runlist_iova = ((u64)rl_base.ptr) << 12;
-	printk(KERN_INFO "[nvdebug] Runlist %d @ %llx in %s (config raw: %x)\n",
-	       rl_id, runlist_iova, target_to_text(rl_base.target), rl_base.raw);
-	printk(KERN_INFO "[nvdebug] Runlist length %d, ID %d\n", rl_info.len, rl_info.id);
+
+	// Get runlist location and length using architecture-dependent logic
+	if (g->chip_id < NV_CHIP_ID_TURING) { // Pre-Turing: one 64-bit read-only register
+		eng_runlist_gf100_t rl;
+		if ((rl.raw = nvdebug_readq(g, NV_PFIFO_ENG_RUNLIST_BASE_GF100(rl_id))) == -1)
+			return -EIO;
+		runlist_iova = ((uint64_t)rl.ptr) << 12; // ptr omits the low 12 bits (4k-aligned)
+		runlist_target = rl.target;
+		printk(KERN_INFO "[nvdebug] Runlist %d: %d entries @ %llx in %s (config raw: %#018llx)\n",
+			   rl_id, rl.len, runlist_iova, target_to_text(rl.target), rl.raw);
+		runlist_len = rl.len;
+	} else if (g->chip_id < NV_CHIP_ID_AMPERE) { // Turing: separate BASE and SUBMIT registers
+		runlist_base_tu102_t base;
+		runlist_submit_tu102_t submit;
+		if ((base.raw = nvdebug_readq(g, NV_PFIFO_RUNLIST_BASE_TU102(rl_id))) == -1)
+			return -EIO;
+		if ((submit.raw = nvdebug_readq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id))) == -1)
+			return -EIO;
+		runlist_iova = ((uint64_t)base.ptr) << 12; // ptr omits the low 12 bits (4k-aligned)
+		runlist_target = base.target;
+		runlist_len = submit.len;
+	} else { return -EOPNOTSUPP; } // Ampere+: not handled; don't fall through with unset locals
 	// Return early on an empty runlist
-	if (!rl_info.len)
+	if (!runlist_len)
 		return 0;
+
 	// If the runlist is in VID_MEM, search the BAR2/3 page tables for a mapping
-	if (rl_base.target == TARGET_VID_MEM) {
-		printk(KERN_WARNING "[nvdebug] Runlist is located in video memory. Access to video memory is experimental.");
-		bar_config_block_t bar1_block, bar2_block;
-		bar1_block.raw = nvdebug_readl(g, NV_PBUS_BAR1_BLOCK);
-		printk(KERN_INFO "[nvdebug] BAR1 inst block @ %llx in %s's %s address space.\n", ((u64)bar1_block.ptr) << 12, target_to_text(bar1_block.target), bar1_block.is_virtual ? "virtual" : "physical");
-		bar2_block.raw = nvdebug_readl(g, NV_PBUS_BAR2_BLOCK);
-		printk(KERN_INFO "[nvdebug] BAR2 inst block @ %llx in %s's %s address space.\n", ((u64)bar2_block.ptr) << 12, target_to_text(bar2_block.target), bar1_block.is_virtual ? "virtual" : "physical");
-		uint32_t bar_inst_pramin_offset = vram2PRAMIN(g, (uint64_t)bar2_block.ptr << 12);
-		if (!bar_inst_pramin_offset) {
-			printk(KERN_WARNING "[nvdebug] Unable to find instance block for BAR2/3 in the current NV_PRAMIN window. VRAM inaccessible.\n");
-			goto attempt_pramin_access;
-		}
-		/* TODO: Support BAR1?
-		bar_inst_pramin_offset = vram2PRAMIN(g, bar1_block.ptr << 12);
-		if (!bar_inst_pramin_offset) {
-			printk(KERN_WARNING "[nvdebug] Unable to find instance block for BAR1 in the current NV_PRAMIN window. VRAM inaccessible.\n");
-			return -EOPNOTSUPP;
-		}*/
-		// Instance blocks (size == 1kb) contain many things, but we only care about
-		// the section which describes the location of the page directory (page table)
-		uint32_t bar_pdb_config_pramin_offset = bar_inst_pramin_offset + NV_PRAMIN_PDB_CONFIG_OFF;
-		page_dir_config_t pd_config;
-		pd_config.raw = nvdebug_readq(g, bar_pdb_config_pramin_offset + NV_PRAMIN);
-		uint64_t bar_pdb_vram_addr = pd_config.page_dir_hi;
-		bar_pdb_vram_addr <<= 20;
-		bar_pdb_vram_addr |= pd_config.page_dir_lo;
-		bar_pdb_vram_addr <<= 12;
-		printk(KERN_INFO "[nvdebug] BAR2 PDB @ %llx in %s of version %s (config raw: %llx)\n", bar_pdb_vram_addr, target_to_text(pd_config.target), pd_config.is_ver2 ? "2" : "1", pd_config.raw);
-		// TODO: SYSMEM support for page table location
-		if (pd_config.target != TARGET_VID_MEM) {
-			printk(KERN_WARNING "[nvdebug] BAR2 PDB is in an unsupported location.\n");
-			goto attempt_pramin_access;
-		}
-		uint32_t bar_pdb_pramin_offset = vram2PRAMIN(g, bar_pdb_vram_addr);
-		if (!bar_pdb_pramin_offset) {
-			printk(KERN_WARNING "[nvdebug] Unable to find page directory BAR2/3 in the current NV_PRAMIN window. VRAM inaccessible.\n");
-			goto attempt_pramin_access;
-		}
+	if (runlist_target == TARGET_VID_MEM) {
+		void __iomem *bar2_page_dir;
+		bool pdb_is_ver2;
 		uint64_t runlist_bar_vaddr;
-		if (pd_config.is_ver2)
-			runlist_bar_vaddr = search_page_directory(g, g->regs + NV_PRAMIN + bar_pdb_pramin_offset, phy2PRAMIN, runlist_iova);
+
+		if (get_bar2_pdb(g, &bar2_page_dir, &pdb_is_ver2) < 0)
+			return -EIO;
+
+		if (pdb_is_ver2)
+			runlist_bar_vaddr = search_page_directory(g, bar2_page_dir, phy2PRAMIN, runlist_iova);
 		else
-			runlist_bar_vaddr = search_v1_page_directory(g, g->regs + NV_PRAMIN + bar_pdb_pramin_offset, phy2PRAMIN, runlist_iova);
+			runlist_bar_vaddr = search_v1_page_directory(g, bar2_page_dir, phy2PRAMIN, runlist_iova);
 		if (!runlist_bar_vaddr) {
 			printk(KERN_WARNING "[nvdebug] Unable to find runlist mapping in BAR2/3 page tables.\n");
 			goto attempt_pramin_access;
 		}
 		printk(KERN_INFO "[nvdebug] Runlist @ %llx in BAR2 virtual address space.\n", runlist_bar_vaddr);
-		/* XXX: Old test code
-		uint32_t bar2_pd_pramin_offset = vram_to_pramin_off(bar2_pd);
-		//walk_pd_subtree(bar2_pd_pramin_offset);
-		uint64_t runlist_bar2_vaddr = search_pd_subtree(bar2_pd_pramin_offset, runlist_iova);
-		page_dir_entry_t pde_0;
-		pde_0.raw = nvdebug_readl(g, NV_PRAMIN + bar2_pd_pramin_offset);
-		uint32_t pde_1 = nvdebug_readl(g, NV_PRAMIN + vram_to_pramin_off(((u64)pde_0.addr) << 12));
-		uint64_t pde_bar2_vaddr = search_pd_subtree(bar2_pd_pramin_offset, ((u64)pde_0.addr) << 12);
-		uint32_t pde_2 = readl(g->bar3 + pde_bar2_vaddr);
-		printk(KERN_INFO "[nvdebug] PDE0 via PRAMIN: %x, via BAR3: %x\n", pde_1, pde_2);
-		*/
-		if (!g->bar3) {
+		if (!g->bar2) {
 			printk(KERN_WARNING "[nvdebug] BAR2/3 not mapped.\n");
 			return -ENODEV;
 		}
 		rl_iter->curr_entry = g->bar2 + runlist_bar_vaddr;
 	} else {
 		// Directly access the runlist if stored in SYS_MEM (physically addressed)
-		rl_iter->curr_entry = phys_to_virt(runlist_iova);
+		// XXX: SYS_MEM is an IOMMU address on some platforms, causing this to crash
+		rl_iter->curr_entry = (void*)phys_to_virt(runlist_iova);
 	}
-	rl_iter->rl_info = rl_info;
+	rl_iter->len = runlist_len;
 	return 0;
+
 attempt_pramin_access:
 #ifdef FALLBACK_TO_PRAMIN
 	printk(KERN_INFO "[nvdebug] Attempting to move PRAMIN window to runlist as BAR2/3-based access failed [DANGEROUS SIDE EFFECTS]!\n");
-	bar0_window_t win;
-	win.base = (runlist_iova >> 16);
-	win.target = TARGET_VID_MEM;
-	// Shift PRAMIN window. This will cause problems if it races with driver code
-	// that tries to do the same, or expects the window not to move.
-	nvdebug_writel(g, NV_PBUS_BAR0_WINDOW, win.raw);
-	uint32_t off = vram2PRAMIN(g, runlist_iova);
-	// Workaround bug for if `off` should be zero (vram2PRAMIN normally returns
-	// this on error)
-	if (!off && (runlist_iova & 0xffffull != runlist_iova)) {
-		printk(KERN_INFO "[nvdebug] Unable to shift PRAMIN to runlist. Aborting...\n");
-		return -EOPNOTSUPP;
-	}
+	if ((off = addr_to_pramin_mut(g, runlist_iova, runlist_target)) == -1)
+		return off;
 	rl_iter->curr_entry = g->regs + NV_PRAMIN + off;
-	rl_iter->rl_info = rl_info;
+	rl_iter->len = runlist_len;
 	return 0;
 #else
 	return -EOPNOTSUPP;
 #endif // FALLBACK_TO_PRAMIN
 }
 
+/* Trigger a preempt of the specified TSG
+  @param tsg_id ID of TSG to preempt.
+  @return 0 or -errno on error
+
+  Note: If no other TSGs exist in the associated runlist, this TSG may
+        continue executing, unless NV_PFIFO_SCHED_DISABLE is set, or all the
+        channels of the TSG to be preempted are disabled.
+*/
 int preempt_tsg(struct nvdebug_state *g, uint32_t tsg_id) {
-	runlist_info_t rl_info;
 	pfifo_preempt_t pfifo_preempt;
-	runlist_disable_t rl_disable;
-	if (!g)
-		return -EIO;
-        rl_info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST);
+	if (g->chip_id < NV_CHIP_ID_KEPLER)
+		return -EOPNOTSUPP;
+
+	pfifo_preempt.raw = 0;
 	pfifo_preempt.id = tsg_id;
 	pfifo_preempt.is_pending = 0;
 	pfifo_preempt.type = PREEMPT_TYPE_TSG;
-	// There may be a bug (?) that requires us to disable scheduling before preempting
-	rl_disable.raw = nvdebug_readl(g, NV_PFIFO_SCHED_DISABLE);
-	rl_disable.raw |= BIT(rl_info.id);  // Disable runlist rl_info.id
-	nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
+
 	// Actually trigger the preemption
 	nvdebug_writel(g, NV_PFIFO_PREEMPT, pfifo_preempt.raw);
-	// Renable scheduling
-	rl_disable.raw &= ~BIT(rl_info.id);  // Enable runlist rl_info.id
-	nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
+	return 0;
+}
+
+/* Trigger a preempt of the specified runlist
+  @param rl_id ID of runlist to preempt.
+  @return 0 or -errno on error
+*/
+int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id) {
+	runlist_preempt_t rl_preempt;
+	if (g->chip_id < NV_CHIP_ID_VOLTA)
+		return -EOPNOTSUPP;
 
-	printk(KERN_INFO "[nvdebug] TSG %d preempted (runlist %d)\n", tsg_id, rl_info.id);
+	// Overwrite, as the register contains nothing to preserve
+	rl_preempt.raw = BIT(rl_id);
+	nvdebug_writel(g, NV_PFIFO_RUNLIST_PREEMPT, rl_preempt.raw);
 	return 0;
 }
diff --git a/runlist_procfs.c b/runlist_procfs.c
index f7f937d..7dedee3 100644
--- a/runlist_procfs.c
+++ b/runlist_procfs.c
@@ -69,12 +69,12 @@ static void *runlist_file_seq_start(struct seq_file *s, loff_t *pos) {
 		if (err)
 			return ERR_PTR(err);
 		// Don't try to print an empty runlist
-		if (rl_iter.rl_info.len <= 0)
+		if (rl_iter.len <= 0)
 			return NULL;
 		return &rl_iter;
 	}
 	// If we're resuming an earlier print
-	if (*pos < rl_iter.rl_info.len) {
+	if (*pos < rl_iter.len) {
 #if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)
 		// There's a nasty bug prior to 4.19-rc1 that if the buffer overflows, the
 		// last update to `pos` is not saved. Work around that here by reloading a
@@ -98,14 +98,16 @@ static void* runlist_file_seq_next(struct seq_file *s, void *raw_rl_iter,
 	(*pos)++;
 	rl_iter->curr_entry += NV_RL_ENTRY_SIZE(g);
 	// Verify we haven't reached the end of the runlist
-	// rl_info.len is the num of tsg entries + total num of channel entries
-	if (*pos < rl_iter->rl_info.len) {
+	// len is the num of tsg entries + total num of channel entries
+	if (*pos < rl_iter->len) {
 		ret = rl_iter;
 	}
 #if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)
 	// Bug workaround. See comment in runlist_file_seq_start()
 	pos_fixup = ret ? *pos : 0;
 #endif
+	if (rl_iter->entries_left_in_tsg)
+		rl_iter->entries_left_in_tsg--;
 	return ret;
 }
 
@@ -113,17 +115,19 @@ static void runlist_file_seq_stop(struct seq_file *s, void *raw_rl_iter) {
 	// No cleanup needed
 }
 
+// _show() must be idempotent. This function will be rerun if the seq_printf
+// buffer was too small.
 static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) {
 	struct runlist_iter *rl_iter = raw_rl_iter;
 	void *entry = rl_iter->curr_entry;
 	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
 	if (entry_type(g, entry) == ENTRY_TYPE_TSG) {
-		if (rl_iter->channels_left_in_tsg) {
-			printk(KERN_WARNING "[nvdebug] Found TSG ID%d @ %px when %d channels were still expected under the previous TSG in the runlist!\n", tsgid(g, entry), entry, rl_iter->channels_left_in_tsg);
-			while (rl_iter->channels_left_in_tsg--)
+		if (rl_iter->entries_left_in_tsg) {
+			printk(KERN_WARNING "[nvdebug] Found TSG ID%d @ %px when %d channels were still expected under the previous TSG in the runlist!\n", tsgid(g, entry), entry, rl_iter->entries_left_in_tsg);
+			while (rl_iter->entries_left_in_tsg--)
 				seq_printf(s, "[missing channel]\n");
 		}
-		rl_iter->channels_left_in_tsg = tsg_length(g, entry);
+		rl_iter->entries_left_in_tsg = tsg_length(g, entry) + 1;
 		seq_printf(s, "+---- TSG Entry %-3d---+\n", tsgid(g, entry));
 		seq_printf(s, "| Scale: %-13d|\n", timeslice_scale(g, entry));
 		seq_printf(s, "| Timeout: %-11d|\n", timeslice_timeout(g, entry));
@@ -134,10 +138,8 @@ static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) {
 #ifndef DETAILED_CHANNEL_INFO
 		u64 instance_ptr = 0;
 #endif
-		if (rl_iter->channels_left_in_tsg) {
+		if (rl_iter->entries_left_in_tsg)
 			indt = "  ";
-			rl_iter->channels_left_in_tsg--;
-		}
 #ifdef DETAILED_CHANNEL_INFO
 		runlist_detail_seq_show_chan(s, g, chid(g, entry), indt);
 #else
@@ -193,8 +195,7 @@ ssize_t preempt_tsg_file_write(struct file *f, const char __user *buffer,
 		return -ERANGE;
 
 	// Execute preemption
-	err = preempt_tsg(g, target_tsgid);
-	if (err)
+	if ((err = preempt_tsg(g, target_tsgid)))
 		return err;
 
 	return count;
@@ -210,8 +211,6 @@ ssize_t disable_channel_file_write(struct file *f, const char __user *buffer,
 	uint32_t target_channel;
 	channel_ctrl_t chan;
 	int err;
-	runlist_info_t rl_info;
-	runlist_disable_t rl_disable;
 	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
 	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
 	err = kstrtou32_from_user(buffer, count, 0, &target_channel);
@@ -221,19 +220,12 @@ ssize_t disable_channel_file_write(struct file *f, const char __user *buffer,
 	if (target_channel > MAX_CHID)
 		return -ERANGE;
 
-	// Disable channel
-	chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel));
+	// Read current configuration
+	if ((chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel))) == -1)
+		return -EIO;
+	// Request disablement
 	chan.enable_clear = true;
-	// disable sched
-	rl_info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST);
-	rl_disable.raw = nvdebug_readl(g, NV_PFIFO_SCHED_DISABLE);
-	rl_disable.raw |= BIT(rl_info.id);
-	nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
-	// disable chan
 	nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(target_channel), chan.raw);
-	// enable sched
-	rl_disable.raw &= ~BIT(rl_info.id);
-	nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
 
 	return count;
 }
@@ -270,6 +262,7 @@ struct file_operations enable_channel_file_ops = {
 	.llseek = default_llseek,
 };
 
+// Note: Operates only on runlist 0 (Compute/Graphics)
 ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
 				   size_t count, loff_t *off) {
 	uint32_t target_tsgid;
@@ -292,7 +285,7 @@ ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
 		return err;
 
 	// Iterate through all TSGs
-	while (pos < rl_iter.rl_info.len) {
+	while (pos < rl_iter.len) {
 		if (tsgid(g, rl_iter.curr_entry) == target_tsgid) {
 			// Enable channels of target TSG
 			for_chan_in_tsg(g, chan, rl_iter.curr_entry) {
@@ -313,9 +306,8 @@ ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
 		pos += 1 + tsg_length(g, rl_iter.curr_entry);
 		rl_iter.curr_entry = next_tsg(g, rl_iter.curr_entry);
 	}
-	// Switch to next TSG with active channels (should be our TSG)
-	err = preempt_tsg(g, target_tsgid);
-	if (err)
+	// Trigger a runlist-level preempt to switch to `target_tsgid`
+	if ((err = preempt_runlist(g, 0)))
 		return err;
 
 	return count;
-- 
cgit v1.2.2