From 232eafd04f272ed69d97a250c50a7bbed4d2894c Mon Sep 17 00:00:00 2001 From: Joshua Bakita Date: Mon, 16 Sep 2024 15:34:41 -0400 Subject: Support printing the runlist and channels on Ampere+ GPUs **Modifies the user API from `cat /proc/gpuX/runlist0` to `cat /proc/gpuX/runlist0/runlist` to support runlist-scoped registers** - Count number of runlists via Ampere-style PTOP parsing. - Create a ProcFS directory for each runlist, and create the runlist printing file in this directory. - Document the newly-added/-formatted Runlist RAM and Channel RAM registers. - Add a helper function `get_runlist_ram()` to obtain the location of each runlist's registers. - Support printing Ampere-style Channel RAM entries. Tested on Jetson Orin (ga10b), A100, H100, and AD102 (RTX 6000 Ada) --- nvdebug.h | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- nvdebug_entry.c | 77 ++++++++++++++++++++++++++++++++++++--------------- runlist.c | 69 ++++++++++++++++++++++++++++++++++++++++++++-- runlist_procfs.c | 64 ++++++++++++++++++++++++++++++++---------- 4 files changed, 254 insertions(+), 40 deletions(-) diff --git a/nvdebug.h b/nvdebug.h index fd88b2e..26689d9 100644 --- a/nvdebug.h +++ b/nvdebug.h @@ -365,6 +365,37 @@ enum CHANNEL_STATUS { CHANNEL_STATUS_ON_ENG_PENDING_ACQ_CTX_RELOAD = 14, }; +/* RunList RAM (RLRAM) + Starting with Ampere, the PFIFO register region no longer exists, and each + engine has separate runlist RAM and channel RAM. The register (BAR0) offset for + Runlist RAM for each engine must be pulled from the runlist_pri_base field + (RUNLIST Private Register BASE address) provided by PTOP. 
+ + See get_runlist_ram() in runlist.c + + Support: Ampere+ +*/ +#define NV_RUNLIST_BASE_GA100 0x080 +#define NV_RUNLIST_SUBMIT_GA100 0x088 +#define NV_RUNLIST_CHANNEL_CONFIG_GA100 0x004 + +/* Channel RAM configuration, as contained in Runlist RAM + + NUM_CHANNELS_LOG2 : 1 << NUM_CHANNELS_LOG2 is the number of channel_ctrl_ga100_t + entries in the described Channel RAM region. + BAR0_OFFSET : BAR0_OFFSET << 4 is the register offset (off BAR0) for the + Channel RAM region. + + Support: Ampere+ +*/ +typedef union { + struct { + uint8_t num_channels_log2:4; + uint32_t bar0_offset:28; + }__attribute__((packed)); + uint32_t raw; +} runlist_channel_config_t; + /* Programmable Channel Control System RAM (PCCSR) 512-entry array of channel control and status data structures. @@ -425,6 +456,50 @@ typedef union { uint64_t raw; } channel_ctrl_t; +/* CHannel RAM (CHRAM) (PCCSR replacement on Ampere+) + Starting with Ampere, channel IDs are no longer unique indexes into the + global channel RAM region (PCCSR), but are indexes into per-runlist channel + RAMs. + + As Channel RAM entries are now subsidiary to a runlist, they do not contain + duplicate information, such as the instance pointer (to "result in smaller + hardware" per ga100/dev_ram.ref.txt in open-gpu-doc). + + The new format retains and adds to the status information available about a + channel, but does so via bit flags rather than an enum. Some bit flags are + writable to trigger behavior previously dedicated to a bit (eg. writing to + `ctx_reload` triggers the same behavior as writing to `force_ctx_reload` did). + + When the first bit (`is_write_one_clears_bits`) is set in this structure, + writing a 1 to any field will clear, rather than set, it. Writing a 0 to any + field is a no-op. + + All fields read/write, except the following are read-only: BUSY, ON_PBDMA, + ON_ENG, PBDMA_BUSY, ENG_BUSY. 
+ + Support: Ampere, Hopper, Ada (and newer likely) + See also: manuals/ampere/ga100/dev_runlist.ref.txt in NVIDIA's open-gpu-doc +*/ +typedef union { + struct { + bool is_write_one_clears_bits:1; // new + bool enable:1; + bool next:1; + bool busy:1; + bool pbdma_faulted:1; // write to force_pbdma_faulted + bool eng_faulted:1; // write to force_eng_faulted + bool on_pbdma:1; // breakout + bool on_eng:1; // breakout + bool pending:1; // breakout + bool ctx_reload:1; // breakout; write to force_ctx_reload + bool pbdma_busy:1; // breakout + bool eng_busy:1; // new + bool acquire_fail:1; // breakout + uint32_t :19; + } __attribute__((packed)); + uint32_t raw; +} channel_ctrl_ga100_t; + /* Control word for runlist enable/disable. RUNLIST_N : Is runlist n disabled? (1 == disabled, 0 == enabled) @@ -1413,14 +1488,19 @@ struct runlist_iter { int entries_left_in_tsg; // Number of entries in runlist int len; - // Offset to start of Channel RAM (as this is per-runlist on Ampere+) - uint32_t channel_ram; + // (Ampere+ only) Offset to the per-runlist "Runlist RAM" register region. + // This includes the offset for Channel RAM (per-runlist on Ampere+). 
+ uint32_t runlist_pri_base; }; #define NVDEBUG_MAX_DEVICES 8 extern struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES]; // Defined in runlist.c +int get_runlist_ram( + struct nvdebug_state *g, + int rl_id, + uint32_t *rl_ram_off /* out */); int get_runlist_iter( struct nvdebug_state *g, int rl_id, diff --git a/nvdebug_entry.c b/nvdebug_entry.c index eee7351..1f9e1c9 100644 --- a/nvdebug_entry.c +++ b/nvdebug_entry.c @@ -159,35 +159,53 @@ int probe_and_cache_devices(void) { return -ENODEV; } -// Create files `/proc/gpu#/runlist#`, world readable // Support: Fermi, Maxwell, Pascal, Volta, Turing -int create_runlist_files(int device_id, struct proc_dir_entry *dir) { +int get_last_runlist_id_gk104(struct nvdebug_state *g) { ptop_device_info_gk104_t info; - struct proc_dir_entry *rl_entry; - int i, rl_id; - char runlist_name[12]; - int max_rl_id = 0; // Always at least one runlist + int i, max_rl_id = 0; // Always at least one runlist // Figure out how many runlists there are by checking the device info // registers. Runlists are always numbered sequentially, so we just have // to find the highest-valued one and add 1 to get the number of runlists. for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_GK104; i++) { - info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO_GK104(i)); + if ((info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GK104(i))) == -1) + return -EIO; if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid) continue; if (info.runlist_enum > max_rl_id) max_rl_id = info.runlist_enum; } - // Create files to read each runlist. The read handling code looks at the - // `pde_data` associated with the file to determine what the runlist ID is. 
- for (rl_id = 0; rl_id <= max_rl_id; rl_id++) { - snprintf(runlist_name, 12, "runlist%d", rl_id); - rl_entry = proc_create_data( - runlist_name, 0444, dir, compat_ops(&runlist_file_ops), - (void*)(uintptr_t)rl_id); - if (!rl_entry) - return -ENOMEM; + return max_rl_id; +} + +// Support: Ampere, Hopper, Ada (and newer likely) +// Identical structure to get_runlist_ram() in runlist.c. See comments there. +int get_last_runlist_id_ga100(struct nvdebug_state *g) { + ptop_device_info_ga100_t ptop_entry; + int i, runlist_count = 0; + int ptop_size = NV_PTOP_DEVICE_INFO__SIZE_1_GA100(g); + int ptop_entry_subrow = 0; + for (i = 0; i < ptop_size; i++) { + if ((ptop_entry.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GA100(i))) == -1) + return -EIO; + if (!ptop_entry.raw) + continue; + if (ptop_entry_subrow == 2 && ptop_entry.rleng_id == 0) + runlist_count++; + if (ptop_entry.has_next_entry) + ptop_entry_subrow += 1; + else + ptop_entry_subrow = 0; } - return 0; + return runlist_count - 1; +} + +// Return the maximum runlist ID. For a two-runlist GPU, this would return 1. +int get_last_runlist_id(int device_id) { + struct nvdebug_state* g = &g_nvdebug_state[device_id]; + if (g->chip_id >= NV_CHIP_ID_AMPERE) + return get_last_runlist_id_ga100(g); + else + return get_last_runlist_id_gk104(g); } // Create files `/proc/gpu#/gpc#_tpc_mask`, world readable @@ -238,6 +256,7 @@ int __init nvdebug_init(void) { g_nvdebug_devices = res; // Create seperate ProcFS directories for each gpu while (res--) { + uintptr_t last_runlist = 0; char device_id_str[7]; // Create a wider copy of the GPU ID to allow us to abuse the *data // field of proc_dir_entry to store the GPU ID. 
@@ -248,10 +267,24 @@ int __init nvdebug_init(void) { snprintf(device_id_str, 7, "gpu%ld", device_id); if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id))) goto out_nomem; - // Create files `/proc/gpu#/runlist#`, world readable - if (g_nvdebug_state[device_id].chip_id < NV_CHIP_ID_AMPERE) - if ((err = create_runlist_files(device_id, dir))) - goto out_err; + // Create `/proc/gpu#/runlist#/runlist`. Directory data is the GPU ID; file data + // is the runlist ID. Error-check via signed `err` (`last_runlist` is unsigned). + if ((err = get_last_runlist_id(device_id)) < 0) + goto out_err; + last_runlist = err; + do { + char runlist_name[12]; + struct proc_dir_entry *rl_dir; + // Create `/proc/gpu#/runlist#` directory + snprintf(runlist_name, 12, "runlist%lu", last_runlist); + if (!(rl_dir = proc_mkdir_data(runlist_name, 0555, dir, (void*)device_id))) + goto out_nomem; + // Create file `/proc/gpu#/runlist#/runlist`, world readable + if (!proc_create_data( + "runlist", 0444, rl_dir, compat_ops(&runlist_file_ops), + (void*)last_runlist)) + goto out_nomem; + } while (last_runlist-- > 0); // Create file `/proc/gpu#/preempt_tsg`, world writable if (!proc_create_data( "preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops), @@ -325,7 +358,7 @@ int __init nvdebug_init(void) { "local_memory", 0444, dir, compat_ops(&local_memory_file_ops), (void*)0x00100ce0)) goto out_nomem; - } + } // Create files exposing LCE and PCE configuration (Pascal+) if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) { // Create file `/proc/gpu#/copy_topology`, world readable diff --git a/runlist.c b/runlist.c index 2e9577d..7e6d292 100644 --- a/runlist.c +++ b/runlist.c @@ -14,6 +14,52 @@ // be enabled to print the runlist on the TX2. 
//#define FALLBACK_TO_PRAMIN +/* Get RunList RAM (RLRAM) offset for a runlist from the device topology + @param rl_id Which runlist to obtain [numbered in order of appearance in + the device topology (PTOP) registers] + @param rl_ram_off Location at which to store runlist private register + interface base address (PRI base); an offset into the BAR0 + register range. + @return 0 or -errno on error +*/ +int get_runlist_ram(struct nvdebug_state *g, int rl_id, uint32_t *rl_ram_off) { + int i; + int curr_rl_id = 0; + int ptop_size = NV_PTOP_DEVICE_INFO__SIZE_1_GA100(g); + // Each PTOP entry is composed of 1--3 subrows, and the fields available + // on each row vary. The runlist RAM location is only available on row 3 + int ptop_entry_subrow = 0; + ptop_device_info_ga100_t ptop_entry; + // Iterate through all PTOP entries + for (i = 0; i < ptop_size; i++) { + if ((ptop_entry.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GA100(i))) == -1) + return -EIO; + // Skip empty entries + if (!ptop_entry.raw) + continue; + // If on subrow 3 (zero-base-index 2), runlist info is available + // Multiple engines may be associated with a single runlist, so + // multiple PTOP entries may refer to the same runlist. Only match when + // on the 0th-associated entry. + if (ptop_entry_subrow == 2 && ptop_entry.rleng_id == 0) { + // If this is the requested runlist, return it + if (curr_rl_id == rl_id) { + *rl_ram_off = (uint32_t)ptop_entry.runlist_pri_base << 10; + return 0; + } + // Otherwise, update our accounting of what the next runlist ID is + curr_rl_id++; + } + // Track if the next row is a subrow of the current entry + if (ptop_entry.has_next_entry) + ptop_entry_subrow += 1; + else + ptop_entry_subrow = 0; + } + // Search failed; requested index does not exist + return -EINVAL; +} + /* Get runlist head and info (incl. length) @param rl_id Which runlist to obtain? 
@param rl_iter Location at which to store output @@ -39,7 +85,7 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl runlist_target = rl.target; runlist_len = rl.len; printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx)\n", - rl_id, g->chip_id, rl.len, runlist_iova, target_to_text(rl.target), rl.raw); + rl_id, g->chip_id, rl.len, runlist_iova, target_to_text(rl.target), rl.raw); } else if (g->chip_id < NV_CHIP_ID_AMPERE) { runlist_base_tu102_t base; runlist_submit_tu102_t submit; @@ -51,7 +97,26 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl runlist_target = base.target; runlist_len = submit.len; printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n", - rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw); + rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw); + } else { + runlist_base_tu102_t base; + runlist_submit_tu102_t submit; + uint32_t runlist_pri_base; + // Runlist configurations are stored in per-runlist regions on Ampere+ + if ((err = get_runlist_ram(g, rl_id, &runlist_pri_base)) < 0) + return err; + // The runlist configuration region (RLRAM) contains Turing-like BASE + // and SUBMIT registers at static offsets + if ((base.raw = nvdebug_readq(g, runlist_pri_base + NV_RUNLIST_BASE_GA100)) == -1) + return -EIO; + if ((submit.raw = nvdebug_readq(g, runlist_pri_base + NV_RUNLIST_SUBMIT_GA100)) == -1) + return -EIO; + runlist_iova = ((uint64_t)base.ptr) << 12; + runlist_target = base.target; + runlist_len = submit.len; + printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n", + rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw); + rl_iter->runlist_pri_base = runlist_pri_base; } // Return early on an empty runlist 
if (!runlist_len) diff --git a/runlist_procfs.c b/runlist_procfs.c index 8152463..c1cfc87 100644 --- a/runlist_procfs.c +++ b/runlist_procfs.c @@ -8,11 +8,11 @@ #ifdef DETAILED_CHANNEL_INFO /* Print channel details using PCCSR (Programmable Channel Control System RAM?) - * @param s Pointer to state from seq_file subsystem to pass to seq_printf - * @param g Pointer to our internal GPU state - * @param chid ID of channel to print details on, range [0, 512) - * @param prefix Text string to prefix each line with, or empty string - */ + @param s Pointer to state from seq_file subsystem to pass to seq_printf + @param g Pointer to our internal GPU state + @param chid ID of channel to print details on, range [0, 512) + @param prefix Text string to prefix each line with, or empty string +*/ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix) { channel_ctrl_t chan; uint64_t instance_ptr; @@ -21,7 +21,7 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state return -EIO; instance_ptr = (uint64_t)chan.inst_ptr << 12; // Don't print write-only fields - seq_printf(s, "%s+- Channel Info %-4d -+\n", prefix, chid); + seq_printf(s, "%s|= Channel Info ======|\n", prefix); seq_printf(s, "%s| Enabled: %d|\n", prefix, chan.enable); seq_printf(s, "%s| Next: %d|\n", prefix, chan.next); seq_printf(s, "%s| PBDMA Faulted: %d|\n", prefix, chan.pbdma_faulted); @@ -32,7 +32,37 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state seq_printf(s, "%s| %#018llx|\n", prefix, instance_ptr); seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target)); seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind); - seq_printf(s, "%s+---------------------+\n", prefix); + return 0; +} + +/* `runlist_detail_seq_show_chan()`, but for Ampere+ + @param runlist_pri_base Base of the RLRAM region for this runlist + + `runlist_pri_base` is necessary, since Channel RAM is now 
per-runlist on + Ampere+, and its location is configured in Runlist RAM. +*/ +static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base) { + runlist_channel_config_t channel_config; + channel_ctrl_ga100_t chan; + + // Channel RAM is subsidiary to Runlist RAM (ie. per-runlist) on Ampere+ + if ((channel_config.raw = nvdebug_readl(g, runlist_pri_base + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1) + return -EIO; + if ((chan.raw = nvdebug_readl(g, (((uint32_t)channel_config.bar0_offset << 4) + chid * 4))) == -1) + return -EIO; + seq_printf(s, "%s|= Channel Info ======|\n", prefix); + seq_printf(s, "%s| Enabled: %d|\n", prefix, chan.enable); + seq_printf(s, "%s| Next: %d|\n", prefix, chan.next); + seq_printf(s, "%s| Busy: %d|\n", prefix, chan.busy); + seq_printf(s, "%s| PBDMA Faulted: %d|\n", prefix, chan.pbdma_faulted); + seq_printf(s, "%s| ENG Faulted: %d|\n", prefix, chan.eng_faulted); + seq_printf(s, "%s| On PBDMA: %d|\n", prefix, chan.on_pbdma); + seq_printf(s, "%s| On ENG: %d|\n", prefix, chan.on_eng); + seq_printf(s, "%s| Pending: %d|\n", prefix, chan.pending); + seq_printf(s, "%s| CTX Reload: %d|\n", prefix, chan.ctx_reload); + seq_printf(s, "%s| PBDMA Busy: %d|\n", prefix, chan.pbdma_busy); + seq_printf(s, "%s| ENG Busy: %d|\n", prefix, chan.eng_busy); + seq_printf(s, "%s| Acquire Fail: %d|\n", prefix, chan.acquire_fail); return 0; } #endif @@ -118,27 +148,33 @@ static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) { } else { char *indt = ""; u64 instance_ptr = 0; - if (rl_iter->entries_left_in_tsg) indt = " "; -#ifdef DETAILED_CHANNEL_INFO - runlist_detail_seq_show_chan(s, g, chid(g, entry), indt); - return 0; -#endif // Reconstruct pointer to channel instance block if (g->chip_id >= NV_CHIP_ID_VOLTA) { instance_ptr = ((struct gv100_runlist_chan*)entry)->inst_ptr_hi; instance_ptr <<= 32; } instance_ptr |= inst_ptr_lo(g, entry) << 12; - + // Print channel 
information from runlist seq_printf(s, "%s+- Channel Entry %-4d-+\n", indt, chid(g, entry)); if (g->chip_id >= NV_CHIP_ID_VOLTA) seq_printf(s, "%s| Runqueue Selector: %d|\n", indt, - ((struct gv100_runlist_chan*)entry)->runqueue_selector); + ((struct gv100_runlist_chan*)entry)->runqueue_selector); + // Not populated on Kepler [ex: gk104 in Bonham (Quadro K5000)], and + // populated but unused on Pascal [ex: gp104 in Bonham (GTX 1080 Ti)]. + // (The aperture field may be incorrectly populated as INVALID, but the + // context still works on the aforementioned Pascal GPU.) seq_printf(s, "%s| Instance PTR: |\n", indt); seq_printf(s, "%s| %#018llx|\n", indt, instance_ptr); seq_printf(s, "%s| %20s|\n", indt, target_to_text(inst_target(g, entry))); +#ifdef DETAILED_CHANNEL_INFO + // Print channel info from PCCSR/Channel RAM and the instance block + if (g->chip_id < NV_CHIP_ID_AMPERE) + runlist_detail_seq_show_chan(s, g, chid(g, entry), indt); + else + runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base); +#endif seq_printf(s, "%s+---------------------+\n", indt); } return 0; -- cgit v1.2.2