From 232eafd04f272ed69d97a250c50a7bbed4d2894c Mon Sep 17 00:00:00 2001
From: Joshua Bakita <bakitajoshua@gmail.com>
Date: Mon, 16 Sep 2024 15:34:41 -0400
Subject: Support printing the runlist and channels on Ampere+ GPUs

**Modifies the user API from `cat /proc/gpuX/runlist0` to
`cat /proc/gpuX/runlist0/runlist` to support runlist-scoped
registers**

- Count number of runlists via Ampere-style PTOP parsing.
- Create a ProcFS directory for each runlist, and create the runlist
  printing file in this directory.
- Document the newly-added/-formatted Runlist RAM and Channel RAM
  registers.
- Add a helper function `get_runlist_ram()` to obtain the location
  of each runlist's registers.
- Support printing Ampere-style Channel RAM entries.

Tested on Jetson Orin (ga10b), A100, H100, and AD102 (RTX 6000 Ada)
---
 nvdebug.h        | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 nvdebug_entry.c  | 77 ++++++++++++++++++++++++++++++++++++---------------
 runlist.c        | 69 ++++++++++++++++++++++++++++++++++++++++++++--
 runlist_procfs.c | 64 ++++++++++++++++++++++++++++++++----------
 4 files changed, 254 insertions(+), 40 deletions(-)

diff --git a/nvdebug.h b/nvdebug.h
index fd88b2e..26689d9 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -365,6 +365,37 @@ enum CHANNEL_STATUS {
 	CHANNEL_STATUS_ON_ENG_PENDING_ACQ_CTX_RELOAD = 14,
 };
 
+/* RunList RAM (RLRAM)
+  Starting with Ampere, the PFIFO register region no longer exists, and each
+  engine has separate runlist RAM and channel RAM. The register (BAR0) offset for
+  Runlist RAM for each engine must be pulled from the runlist_pri_base field
+  (RUNLIST Private Register BASE address) provided by PTOP.
+
+ See get_runlist_ram() in runlist.c
+
+ Support: Ampere+
+*/
+#define NV_RUNLIST_BASE_GA100 0x080
+#define NV_RUNLIST_SUBMIT_GA100 0x088
+#define NV_RUNLIST_CHANNEL_CONFIG_GA100 0x004
+
+/* Channel RAM configuration, as contained in Runlist RAM
+
+  NUM_CHANNELS_LOG2 : 1 << NUM_CHANNELS_LOG2 is the number of channel_ctrl_ga100_t
+                      entries in the described Channel RAM region.
+  BAR0_OFFSET       : BAR0_OFFSET << 4 is the register offset (off BAR0) for the
+                      Channel RAM region.
+
+  Support: Ampere+
+*/
+typedef union {
+	struct {
+		uint8_t num_channels_log2:4; // log2 of the number of Channel RAM entries
+		uint32_t bar0_offset:28; // Pre-shift value; use `bar0_offset << 4` as the BAR0 offset
+	}__attribute__((packed));
+	uint32_t raw; // Whole 32-bit register value, overlaying the bit fields above
+} runlist_channel_config_t;
+
 /* Programmable Channel Control System RAM (PCCSR)
   512-entry array of channel control and status data structures.
 
@@ -425,6 +456,50 @@ typedef union {
 	uint64_t raw;
 } channel_ctrl_t;
 
+/* CHannel RAM (CHRAM) (PCCSR replacement on Ampere+)
+  Starting with Ampere, channel IDs are no longer unique indexes into the
+  global channel RAM region (PCCSR), but are indexes into per-runlist channel
+  RAMs.
+
+  As Channel RAM entries are now subsidiary to a runlist, they do not contain
+  duplicate information, such as the instance pointer (to "result in smaller
+  hardware" per ga100/dev_ram.ref.txt in open-gpu-doc).
+
+  The new format retains and adds to the status information available about a
+  channel, but does so via bit flags rather than an enum. Some bit flags are
+  writable to trigger behavior previously dedicated to a bit (eg. writing to
+  `ctx_reload` triggers the same behavior as writing to `force_ctx_reload` did).
+
+  When the first bit (`is_write_one_clears_bits`) is set in this structure,
+  writing a 1 to any field will clear, rather than set, it. Writing a 0 to any
+  field is a no-op.
+
+  All fields read/write, except the following are read-only: BUSY, ON_PBDMA,
+  ON_ENG, PBDMA_BUSY, ENG_BUSY.
+
+  Support: Ampere, Hopper, Ada (and newer likely)
+  See also: manuals/ampere/ga100/dev_runlist.ref.txt in NVIDIA's open-gpu-doc
+*/
+typedef union {
+	struct {
+		bool is_write_one_clears_bits:1; // new; when set, writing 1 to a field clears it
+		bool enable:1;
+		bool next:1;
+		bool busy:1; // read-only
+		bool pbdma_faulted:1; // write to force_pbdma_faulted
+		bool eng_faulted:1; // write to force_eng_faulted
+		bool on_pbdma:1; // breakout (presumably from the old status enum); read-only
+		bool on_eng:1; // breakout; read-only
+		bool pending:1; // breakout
+		bool ctx_reload:1; // breakout; write to force_ctx_reload
+		bool pbdma_busy:1; // breakout; read-only
+		bool eng_busy:1; // new; read-only
+		bool acquire_fail:1; // breakout
+		 uint32_t :19; // unnamed padding: 13 flag bits + 19 = 32 bits
+	} __attribute__((packed));
+	uint32_t raw; // Whole 32-bit entry, overlaying the flags above
+} channel_ctrl_ga100_t;
+
 /* Control word for runlist enable/disable.
 
   RUNLIST_N           : Is runlist n disabled? (1 == disabled, 0 == enabled)
@@ -1413,14 +1488,19 @@ struct runlist_iter {
 	int entries_left_in_tsg;
 	// Number of entries in runlist
 	int len;
-	// Offset to start of Channel RAM (as this is per-runlist on Ampere+)
-	uint32_t channel_ram;
+	// (Ampere+ only) Offset to the per-runlist "Runlist RAM" register region.
+	// This includes the offset for Channel RAM (per-runlist on Ampere+).
+	uint32_t runlist_pri_base;
 };
 
 #define NVDEBUG_MAX_DEVICES 8
 extern struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
 
 // Defined in runlist.c
+int get_runlist_ram(
+	struct nvdebug_state *g,
+	int rl_id,
+	uint32_t *rl_ram_off /* out */);
 int get_runlist_iter(
 	struct nvdebug_state *g,
 	int rl_id,
diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index eee7351..1f9e1c9 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -159,35 +159,53 @@ int probe_and_cache_devices(void) {
 	return -ENODEV;
 }
 
-// Create files `/proc/gpu#/runlist#`, world readable
 // Support: Fermi, Maxwell, Pascal, Volta, Turing
-int create_runlist_files(int device_id, struct proc_dir_entry *dir) {
+int get_last_runlist_id_gk104(struct nvdebug_state *g) {
 	ptop_device_info_gk104_t info;
-	struct proc_dir_entry *rl_entry;
-	int i, rl_id;
-	char runlist_name[12];
-	int max_rl_id = 0; // Always at least one runlist
+	int i, max_rl_id = 0; // Always at least one runlist
 	// Figure out how many runlists there are by checking the device info
 	// registers. Runlists are always numbered sequentially, so we just have
 	// to find the highest-valued one and add 1 to get the number of runlists.
 	for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_GK104; i++) {
-		info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO_GK104(i));
+		if ((info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GK104(i))) == -1)
+			return -EIO;
 		if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid)
 			continue;
 		if (info.runlist_enum > max_rl_id)
 			max_rl_id = info.runlist_enum;
 	}
-	// Create files to read each runlist. The read handling code looks at the
-	// `pde_data` associated with the file to determine what the runlist ID is.
-	for (rl_id = 0; rl_id <= max_rl_id; rl_id++) {
-		snprintf(runlist_name, 12, "runlist%d", rl_id);
-		rl_entry = proc_create_data(
-			runlist_name, 0444, dir, compat_ops(&runlist_file_ops),
-			(void*)(uintptr_t)rl_id);
-		if (!rl_entry)
-			return -ENOMEM;
+	return max_rl_id;
+}
+
+// Support: Ampere, Hopper, Ada (and newer likely). Returns the highest runlist
+// ID, or -EIO on a failed PTOP read. Same PTOP walk as get_runlist_ram() in runlist.c.
+int get_last_runlist_id_ga100(struct nvdebug_state *g) {
+	ptop_device_info_ga100_t ptop_entry;
+	int i, runlist_count = 0;
+	int ptop_size = NV_PTOP_DEVICE_INFO__SIZE_1_GA100(g);
+	int ptop_entry_subrow = 0; // Zero-based subrow index within the current PTOP entry
+	for (i = 0; i < ptop_size; i++) {
+		if ((ptop_entry.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GA100(i))) == -1)
+			return -EIO;
+		if (!ptop_entry.raw) // Skip empty rows
+			continue;
+		if (ptop_entry_subrow == 2 && ptop_entry.rleng_id == 0) // Count each runlist once
+			runlist_count++;
+		if (ptop_entry.has_next_entry) // Next row continues this entry
+			ptop_entry_subrow += 1;
+		else
+			ptop_entry_subrow = 0;
 	}
-	return 0;
+	return runlist_count - 1; // NOTE(review): yields -1 (not a named errno) if no runlist was found
+}
+
+// Return the maximum runlist ID (e.g. 1 for a two-runlist GPU), or a negative value if the chip-specific PTOP scan fails.
+int get_last_runlist_id(int device_id) {
+	struct nvdebug_state* g = &g_nvdebug_state[device_id]; // No bounds check; device_id must be a valid index
+	if (g->chip_id >= NV_CHIP_ID_AMPERE) // Ampere and newer use the ga100 PTOP parser
+		return get_last_runlist_id_ga100(g);
+	else
+		return get_last_runlist_id_gk104(g);
 }
 
 // Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
@@ -238,6 +256,7 @@ int __init nvdebug_init(void) {
 	g_nvdebug_devices = res;
 	// Create seperate ProcFS directories for each gpu
 	while (res--) {
+		uintptr_t last_runlist = 0;
 		char device_id_str[7];
 		// Create a wider copy of the GPU ID to allow us to abuse the *data
 		// field of proc_dir_entry to store the GPU ID.
@@ -248,10 +267,24 @@ int __init nvdebug_init(void) {
 		snprintf(device_id_str, 7, "gpu%ld", device_id);
 		if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id)))
 			goto out_nomem;
-		// Create files `/proc/gpu#/runlist#`, world readable
-		if (g_nvdebug_state[device_id].chip_id < NV_CHIP_ID_AMPERE)
-			if ((err = create_runlist_files(device_id, dir)))
-				goto out_err;
+		// Create `/proc/gpu#/runlist#/runlist` per runlist. `last_runlist` is
+		// unsigned, so test for a negative errno via the signed `err` first.
+		if ((err = get_last_runlist_id(device_id)) < 0)
+			goto out_err;
+		last_runlist = err;
+		do {
+			char runlist_name[12];
+			struct proc_dir_entry *rl_dir;
+			// Create `/proc/gpu#/runlist#` directory (data pointer: GPU ID)
+			snprintf(runlist_name, sizeof(runlist_name), "runlist%lu", last_runlist);
+			if (!(rl_dir = proc_mkdir_data(runlist_name, 0555, dir, (void*)device_id)))
+				goto out_nomem;
+			// Create file `/proc/gpu#/runlist#/runlist`, world readable (data pointer: runlist ID)
+			if (!proc_create_data(
+					"runlist", 0444, rl_dir, compat_ops(&runlist_file_ops),
+					(void*)last_runlist))
+				goto out_nomem;
+		} while (last_runlist-- > 0);
 		// Create file `/proc/gpu#/preempt_tsg`, world writable
 		if (!proc_create_data(
 				"preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops),
@@ -325,7 +358,7 @@ int __init nvdebug_init(void) {
 					"local_memory", 0444, dir, compat_ops(&local_memory_file_ops),
 					(void*)0x00100ce0))
 				goto out_nomem;
-			}
+		}
 		// Create files exposing LCE and PCE configuration (Pascal+)
 		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
 			// Create file `/proc/gpu#/copy_topology`, world readable
diff --git a/runlist.c b/runlist.c
index 2e9577d..7e6d292 100644
--- a/runlist.c
+++ b/runlist.c
@@ -14,6 +14,52 @@
 // be enabled to print the runlist on the TX2.
 //#define FALLBACK_TO_PRAMIN
 
+/* Get RunList RAM (RLRAM) offset for a runlist from the device topology
+  @param rl_id      Which runlist to obtain [numbered in order of appearance in
+                    the device topology (PTOP) registers]
+  @param rl_ram_off Location at which to store runlist private register
+                    interface base address (PRI base); an offset into the BAR0
+                    register range. Only written on success.
+  @return 0 on success, -EIO if a PTOP read returns -1, or -EINVAL if no runlist matches rl_id
+*/
+int get_runlist_ram(struct nvdebug_state *g, int rl_id, uint32_t *rl_ram_off) {
+	int i;
+	int curr_rl_id = 0;
+	int ptop_size = NV_PTOP_DEVICE_INFO__SIZE_1_GA100(g);
+	// Each PTOP entry is composed of 1--3 subrows, and the fields available
+	// on each row vary. The runlist RAM location is only available on row 3
+	int ptop_entry_subrow = 0;
+	ptop_device_info_ga100_t ptop_entry;
+	// Iterate through all PTOP entries
+	for (i = 0; i < ptop_size; i++) {
+		if ((ptop_entry.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GA100(i))) == -1)
+			return -EIO;
+		// Skip empty entries
+		if (!ptop_entry.raw)
+			continue;
+		// If on subrow 3 (zero-base-index 2), runlist info is available
+		// Multiple engines may be associated with a single runlist, so
+		// multiple PTOP entries may refer to the same runlist. Only match when
+		// on the 0th-associated entry.
+		if (ptop_entry_subrow == 2 && ptop_entry.rleng_id == 0) {
+			// If this is the requested runlist, return it
+			if (curr_rl_id == rl_id) {
+				*rl_ram_off = (uint32_t)ptop_entry.runlist_pri_base << 10; // Field is scaled; shift left by 10 for the BAR0 offset
+				return 0;
+			}
+			// Otherwise, update our accounting of what the next runlist ID is
+			curr_rl_id++;
+		}
+		// Track if the next row is a subrow of the current entry
+		if (ptop_entry.has_next_entry)
+			ptop_entry_subrow += 1;
+		else
+			ptop_entry_subrow = 0;
+	}
+	// Search failed; requested index does not exist
+	return -EINVAL;
+}
+
 /* Get runlist head and info (incl. length)
   @param rl_id   Which runlist to obtain?
   @param rl_iter Location at which to store output
@@ -39,7 +85,7 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl
 		runlist_target = rl.target;
 		runlist_len = rl.len;
 		printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx)\n",
-			   rl_id, g->chip_id, rl.len, runlist_iova, target_to_text(rl.target), rl.raw);
+		       rl_id, g->chip_id, rl.len, runlist_iova, target_to_text(rl.target), rl.raw);
 	} else if (g->chip_id < NV_CHIP_ID_AMPERE) {
 		runlist_base_tu102_t base;
 		runlist_submit_tu102_t submit;
@@ -51,7 +97,26 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl
 		runlist_target = base.target;
 		runlist_len = submit.len;
 		printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n",
-			   rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw);
+		       rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw);
+	} else {
+		runlist_base_tu102_t base;
+		runlist_submit_tu102_t submit;
+		uint32_t runlist_pri_base;
+		// Runlist configurations are stored in per-runlist regions on Ampere+
+		if ((err = get_runlist_ram(g, rl_id, &runlist_pri_base)) < 0)
+			return err;
+		// The runlist configuration region (RLRAM) contains Turing-like BASE
+		// and SUBMIT registers at static offsets
+		if ((base.raw = nvdebug_readq(g, runlist_pri_base + NV_RUNLIST_BASE_GA100)) == -1)
+			return -EIO;
+		if ((submit.raw = nvdebug_readq(g, runlist_pri_base + NV_RUNLIST_SUBMIT_GA100)) == -1)
+			return -EIO;
+		runlist_iova = ((uint64_t)base.ptr) << 12;
+		runlist_target = base.target;
+		runlist_len = submit.len;
+		printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n",
+		       rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw);
+		rl_iter->runlist_pri_base = runlist_pri_base;
 	}
 	// Return early on an empty runlist
 	if (!runlist_len)
diff --git a/runlist_procfs.c b/runlist_procfs.c
index 8152463..c1cfc87 100644
--- a/runlist_procfs.c
+++ b/runlist_procfs.c
@@ -8,11 +8,11 @@
 
 #ifdef DETAILED_CHANNEL_INFO
 /* Print channel details using PCCSR (Programmable Channel Control System RAM?)
- * @param s      Pointer to state from seq_file subsystem to pass to seq_printf
- * @param g      Pointer to our internal GPU state
- * @param chid   ID of channel to print details on, range [0, 512)
- * @param prefix Text string to prefix each line with, or empty string
- */
+  @param s      Pointer to state from seq_file subsystem to pass to seq_printf
+  @param g      Pointer to our internal GPU state
+  @param chid   ID of channel to print details on, range [0, 512)
+  @param prefix Text string to prefix each line with, or empty string
+*/
 static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix) {
 	channel_ctrl_t chan;
 	uint64_t instance_ptr;
@@ -21,7 +21,7 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state
 		return -EIO;
 	instance_ptr = (uint64_t)chan.inst_ptr << 12;
 	// Don't print write-only fields
-	seq_printf(s, "%s+- Channel Info %-4d -+\n", prefix, chid);
+	seq_printf(s, "%s|= Channel Info ======|\n", prefix);
 	seq_printf(s, "%s| Enabled:           %d|\n", prefix, chan.enable);
 	seq_printf(s, "%s| Next:              %d|\n", prefix, chan.next);
 	seq_printf(s, "%s| PBDMA Faulted:     %d|\n", prefix, chan.pbdma_faulted);
@@ -32,7 +32,37 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state
 	seq_printf(s, "%s|   %#018llx|\n", prefix, instance_ptr);
 	seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target));
 	seq_printf(s, "%s| Instance bound:    %d|\n", prefix, chan.inst_bind);
-	seq_printf(s, "%s+---------------------+\n", prefix);
+	return 0;
+}
+
+/* `runlist_detail_seq_show_chan()`, but for Ampere+
+  @param runlist_pri_base Base of the RLRAM region for this runlist
+  @return 0 on success, -EIO if either register read returns -1
+  `runlist_pri_base` is necessary, since Channel RAM is now per-runlist on
+  Ampere+, and its location is configured in Runlist RAM.
+*/
+static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base) {
+	runlist_channel_config_t channel_config;
+	channel_ctrl_ga100_t chan;
+
+	// Channel RAM is subsidiary to Runlist RAM (ie. per-runlist) on Ampere+
+	if ((channel_config.raw = nvdebug_readl(g, runlist_pri_base + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1)
+		return -EIO;
+	if ((chan.raw = nvdebug_readl(g, (((uint32_t)channel_config.bar0_offset << 4) + chid * 4))) == -1) // 4-byte entries; NOTE(review): chid is not checked against 1 << num_channels_log2
+		return -EIO;
+	seq_printf(s, "%s|= Channel Info ======|\n", prefix);
+	seq_printf(s, "%s| Enabled:           %d|\n", prefix, chan.enable);
+	seq_printf(s, "%s| Next:              %d|\n", prefix, chan.next);
+	seq_printf(s, "%s| Busy:              %d|\n", prefix, chan.busy);
+	seq_printf(s, "%s| PBDMA Faulted:     %d|\n", prefix, chan.pbdma_faulted);
+	seq_printf(s, "%s| ENG Faulted:       %d|\n", prefix, chan.eng_faulted);
+	seq_printf(s, "%s| On PBDMA:          %d|\n", prefix, chan.on_pbdma);
+	seq_printf(s, "%s| On ENG:            %d|\n", prefix, chan.on_eng);
+	seq_printf(s, "%s| Pending:           %d|\n", prefix, chan.pending);
+	seq_printf(s, "%s| CTX Reload:        %d|\n", prefix, chan.ctx_reload);
+	seq_printf(s, "%s| PBDMA Busy:        %d|\n", prefix, chan.pbdma_busy);
+	seq_printf(s, "%s| ENG Busy:          %d|\n", prefix, chan.eng_busy);
+	seq_printf(s, "%s| Acquire Fail:      %d|\n", prefix, chan.acquire_fail);
 	return 0;
 }
 #endif
@@ -118,27 +148,33 @@ static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) {
 	} else {
 		char *indt = "";
 		u64 instance_ptr = 0;
-
 		if (rl_iter->entries_left_in_tsg)
 			indt = "  ";
-#ifdef DETAILED_CHANNEL_INFO
-		runlist_detail_seq_show_chan(s, g, chid(g, entry), indt);
-		return 0;
-#endif
 		// Reconstruct pointer to channel instance block
 		if (g->chip_id >= NV_CHIP_ID_VOLTA) {
 			instance_ptr = ((struct gv100_runlist_chan*)entry)->inst_ptr_hi;
 			instance_ptr <<= 32;
 		}
 		instance_ptr |= inst_ptr_lo(g, entry) << 12;
-
+		// Print channel information from runlist
 		seq_printf(s, "%s+- Channel Entry %-4d-+\n", indt, chid(g, entry));
 		if (g->chip_id >= NV_CHIP_ID_VOLTA)
 			seq_printf(s, "%s| Runqueue Selector: %d|\n", indt,
-				   ((struct gv100_runlist_chan*)entry)->runqueue_selector);
+			           ((struct gv100_runlist_chan*)entry)->runqueue_selector);
+		// Not populated on Kepler [ex: gk104 in Bonham (Quadro K5000)], and
+		// populated but unused on Pascal [ex: gp104 in Bonham (GTX 1080 Ti)].
+		// (The aperture field may be incorrectly populated as INVALID, but the
+		// context still works on the aforementioned Pascal GPU.)
 		seq_printf(s, "%s| Instance PTR:       |\n", indt);
 		seq_printf(s, "%s|   %#018llx|\n", indt, instance_ptr);
 		seq_printf(s, "%s| %20s|\n", indt, target_to_text(inst_target(g, entry)));
+#ifdef DETAILED_CHANNEL_INFO
+		// Print channel info from PCCSR/Channel RAM and the instance block
+		if (g->chip_id < NV_CHIP_ID_AMPERE)
+			runlist_detail_seq_show_chan(s, g, chid(g, entry), indt);
+		else
+			runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base);
+#endif
 		seq_printf(s, "%s+---------------------+\n", indt);
 	}
 	return 0;
-- 
cgit v1.2.2