diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2024-09-16 15:34:41 -0400 |
---|---|---|
committer | Joshua Bakita <bakitajoshua@gmail.com> | 2024-09-16 15:34:41 -0400 |
commit | 232eafd04f272ed69d97a250c50a7bbed4d2894c (patch) | |
tree | bf1d03cd66e6f37b2c9ac9a9d48e4f359fcdd6b5 | |
parent | 0b1c304e53b88fe628d350d1380a88317f071e69 (diff) |
Support printing the runlist and channels on Ampere+ GPUs
**Modifies the user API from `cat /proc/gpuX/runlist0` to
`cat /proc/gpuX/runlist0/runlist` to support runlist-scoped
registers**
- Count number of runlists via Ampere-style PTOP parsing.
- Create a ProcFS directory for each runlist, and create the runlist
printing file in this directory.
- Document the newly-added/-formatted Runlist RAM and Channel RAM
registers.
- Add a helper function `get_runlist_ram()` to obtain the location
of each runlist's registers.
- Support printing Ampere-style Channel RAM entries.
Tested on Jetson Orin (ga10b), A100, H100, and AD102 (RTX 6000 Ada)
-rw-r--r-- | nvdebug.h | 84 | ||||
-rw-r--r-- | nvdebug_entry.c | 77 | ||||
-rw-r--r-- | runlist.c | 69 | ||||
-rw-r--r-- | runlist_procfs.c | 64 |
4 files changed, 254 insertions, 40 deletions
@@ -365,6 +365,37 @@ enum CHANNEL_STATUS { | |||
365 | CHANNEL_STATUS_ON_ENG_PENDING_ACQ_CTX_RELOAD = 14, | 365 | CHANNEL_STATUS_ON_ENG_PENDING_ACQ_CTX_RELOAD = 14, |
366 | }; | 366 | }; |
367 | 367 | ||
368 | /* RunList RAM (RLRAM) | ||
369 | Starting with Ampere, the PFIFO register region no longer exists, and each | ||
370 | engine has separate runlist RAM and channel RAM. The register (BAR0) offset for | ||
371 | Runlist RAM for each engine must be pulled from the runlist_pri_base field | ||
372 | (RUNLIST Private Register BASE address) provided by PTOP. | ||
373 | |||
374 | See get_runlist_ram() in runlist.c | ||
375 | |||
376 | Support: Ampere+ | ||
377 | */ | ||
378 | #define NV_RUNLIST_BASE_GA100 0x080 | ||
379 | #define NV_RUNLIST_SUBMIT_GA100 0x088 | ||
380 | #define NV_RUNLIST_CHANNEL_CONFIG_GA100 0x004 | ||
381 | |||
382 | /* Channel RAM configuration, as contained in Runlist RAM | ||
383 | |||
384 | NUM_CHANNELS_LOG2 : 1 << NUM_CHANNELS_LOG2 is the number of channel_ctrl_ga100_t | ||
385 | entries in the described Channel RAM region. | ||
386 | BAR0_OFFSET : BAR0_OFFSET << 4 is the register offset (off BAR0) for the | ||
387 | Channel RAM region. | ||
388 | |||
389 | Support: Ampere+ | ||
390 | */ | ||
391 | typedef union { | ||
392 | struct { | ||
393 | uint8_t num_channels_log2:4; | ||
394 | uint32_t bar0_offset:28; | ||
395 | }__attribute__((packed)); | ||
396 | uint32_t raw; | ||
397 | } runlist_channel_config_t; | ||
398 | |||
368 | /* Programmable Channel Control System RAM (PCCSR) | 399 | /* Programmable Channel Control System RAM (PCCSR) |
369 | 512-entry array of channel control and status data structures. | 400 | 512-entry array of channel control and status data structures. |
370 | 401 | ||
@@ -425,6 +456,50 @@ typedef union { | |||
425 | uint64_t raw; | 456 | uint64_t raw; |
426 | } channel_ctrl_t; | 457 | } channel_ctrl_t; |
427 | 458 | ||
459 | /* CHannel RAM (CHRAM) (PCCSR replacement on Ampere+) | ||
460 | Starting with Ampere, channel IDs are no longer unique indexes into the | ||
461 | global channel RAM region (PCCSR), but are indexes into per-runlist channel | ||
462 | RAMs. | ||
463 | |||
464 | As Channel RAM entries are now subsidiary to a runlist, they do not contain | ||
465 | duplicate information, such as the instance pointer (to "result in smaller | ||
466 | hardware" per ga100/dev_ram.ref.txt in open-gpu-doc). | ||
467 | |||
468 | The new format retains and adds to the status information available about a | ||
469 | channel, but does so via bit flags rather than an enum. Some bit flags are | ||
470 | writable to trigger behavior previously dedicated to a bit (eg. writing to | ||
471 | `ctx_reload` triggers the same behavior as writing to `force_ctx_reload` did). | ||
472 | |||
473 | When the first bit (`is_write_one_clears_bits`) is set in this structure, | ||
474 | writing a 1 to any field will clear, rather than set, it. Writing a 0 to any | ||
475 | field is a no-op. | ||
476 | |||
477 | All fields read/write, except the following are read-only: BUSY, ON_PBDMA, | ||
478 | ON_ENG, PBDMA_BUSY, ENG_BUSY. | ||
479 | |||
480 | Support: Ampere, Hopper, Ada (and newer likely) | ||
481 | See also: manuals/ampere/ga100/dev_runlist.ref.txt in NVIDIA's open-gpu-doc | ||
482 | */ | ||
483 | typedef union { | ||
484 | struct { | ||
485 | bool is_write_one_clears_bits:1; // new | ||
486 | bool enable:1; | ||
487 | bool next:1; | ||
488 | bool busy:1; | ||
489 | bool pbdma_faulted:1; // write to force_pbdma_faulted | ||
490 | bool eng_faulted:1; // write to force_eng_faulted | ||
491 | bool on_pbdma:1; // breakout | ||
492 | bool on_eng:1; // breakout | ||
493 | bool pending:1; // breakout | ||
494 | bool ctx_reload:1; // breakout; write to force_ctx_reload | ||
495 | bool pbdma_busy:1; // breakout | ||
496 | bool eng_busy:1; // new | ||
497 | bool acquire_fail:1; // breakout | ||
498 | uint32_t :19; | ||
499 | } __attribute__((packed)); | ||
500 | uint32_t raw; | ||
501 | } channel_ctrl_ga100_t; | ||
502 | |||
428 | /* Control word for runlist enable/disable. | 503 | /* Control word for runlist enable/disable. |
429 | 504 | ||
430 | RUNLIST_N : Is runlist n disabled? (1 == disabled, 0 == enabled) | 505 | RUNLIST_N : Is runlist n disabled? (1 == disabled, 0 == enabled) |
@@ -1413,14 +1488,19 @@ struct runlist_iter { | |||
1413 | int entries_left_in_tsg; | 1488 | int entries_left_in_tsg; |
1414 | // Number of entries in runlist | 1489 | // Number of entries in runlist |
1415 | int len; | 1490 | int len; |
1416 | // Offset to start of Channel RAM (as this is per-runlist on Ampere+) | 1491 | // (Ampere+ only) Offset to the per-runlist "Runlist RAM" register region. |
1417 | uint32_t channel_ram; | 1492 | // This includes the offset for Channel RAM (per-runlist on Ampere+). |
1493 | uint32_t runlist_pri_base; | ||
1418 | }; | 1494 | }; |
1419 | 1495 | ||
1420 | #define NVDEBUG_MAX_DEVICES 8 | 1496 | #define NVDEBUG_MAX_DEVICES 8 |
1421 | extern struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES]; | 1497 | extern struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES]; |
1422 | 1498 | ||
1423 | // Defined in runlist.c | 1499 | // Defined in runlist.c |
1500 | int get_runlist_ram( | ||
1501 | struct nvdebug_state *g, | ||
1502 | int rl_id, | ||
1503 | uint32_t *rl_ram_off /* out */); | ||
1424 | int get_runlist_iter( | 1504 | int get_runlist_iter( |
1425 | struct nvdebug_state *g, | 1505 | struct nvdebug_state *g, |
1426 | int rl_id, | 1506 | int rl_id, |
diff --git a/nvdebug_entry.c b/nvdebug_entry.c index eee7351..1f9e1c9 100644 --- a/nvdebug_entry.c +++ b/nvdebug_entry.c | |||
@@ -159,35 +159,53 @@ int probe_and_cache_devices(void) { | |||
159 | return -ENODEV; | 159 | return -ENODEV; |
160 | } | 160 | } |
161 | 161 | ||
162 | // Create files `/proc/gpu#/runlist#`, world readable | ||
163 | // Support: Fermi, Maxwell, Pascal, Volta, Turing | 162 | // Support: Fermi, Maxwell, Pascal, Volta, Turing |
164 | int create_runlist_files(int device_id, struct proc_dir_entry *dir) { | 163 | int get_last_runlist_id_gk104(struct nvdebug_state *g) { |
165 | ptop_device_info_gk104_t info; | 164 | ptop_device_info_gk104_t info; |
166 | struct proc_dir_entry *rl_entry; | 165 | int i, max_rl_id = 0; // Always at least one runlist |
167 | int i, rl_id; | ||
168 | char runlist_name[12]; | ||
169 | int max_rl_id = 0; // Always at least one runlist | ||
170 | // Figure out how many runlists there are by checking the device info | 166 | // Figure out how many runlists there are by checking the device info |
171 | // registers. Runlists are always numbered sequentially, so we just have | 167 | // registers. Runlists are always numbered sequentially, so we just have |
172 | // to find the highest-valued one and add 1 to get the number of runlists. | 168 | // to find the highest-valued one and add 1 to get the number of runlists. |
173 | for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_GK104; i++) { | 169 | for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_GK104; i++) { |
174 | info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO_GK104(i)); | 170 | if ((info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GK104(i))) == -1) |
171 | return -EIO; | ||
175 | if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid) | 172 | if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid) |
176 | continue; | 173 | continue; |
177 | if (info.runlist_enum > max_rl_id) | 174 | if (info.runlist_enum > max_rl_id) |
178 | max_rl_id = info.runlist_enum; | 175 | max_rl_id = info.runlist_enum; |
179 | } | 176 | } |
180 | // Create files to read each runlist. The read handling code looks at the | 177 | return max_rl_id; |
181 | // `pde_data` associated with the file to determine what the runlist ID is. | 178 | } |
182 | for (rl_id = 0; rl_id <= max_rl_id; rl_id++) { | 179 | |
183 | snprintf(runlist_name, 12, "runlist%d", rl_id); | 180 | // Support: Ampere, Hopper, Ada (and newer likely) |
184 | rl_entry = proc_create_data( | 181 | // Identical structure to get_runlist_ram() in runlist.c. See comments there. |
185 | runlist_name, 0444, dir, compat_ops(&runlist_file_ops), | 182 | int get_last_runlist_id_ga100(struct nvdebug_state *g) { |
186 | (void*)(uintptr_t)rl_id); | 183 | ptop_device_info_ga100_t ptop_entry; |
187 | if (!rl_entry) | 184 | int i, runlist_count = 0; |
188 | return -ENOMEM; | 185 | int ptop_size = NV_PTOP_DEVICE_INFO__SIZE_1_GA100(g); |
186 | int ptop_entry_subrow = 0; | ||
187 | for (i = 0; i < ptop_size; i++) { | ||
188 | if ((ptop_entry.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GA100(i))) == -1) | ||
189 | return -EIO; | ||
190 | if (!ptop_entry.raw) | ||
191 | continue; | ||
192 | if (ptop_entry_subrow == 2 && ptop_entry.rleng_id == 0) | ||
193 | runlist_count++; | ||
194 | if (ptop_entry.has_next_entry) | ||
195 | ptop_entry_subrow += 1; | ||
196 | else | ||
197 | ptop_entry_subrow = 0; | ||
189 | } | 198 | } |
190 | return 0; | 199 | return runlist_count - 1; |
200 | } | ||
201 | |||
202 | // Return the maximum runlist ID. For a two-runlist GPU, this would return 1. | ||
203 | int get_last_runlist_id(int device_id) { | ||
204 | struct nvdebug_state* g = &g_nvdebug_state[device_id]; | ||
205 | if (g->chip_id >= NV_CHIP_ID_AMPERE) | ||
206 | return get_last_runlist_id_ga100(g); | ||
207 | else | ||
208 | return get_last_runlist_id_gk104(g); | ||
191 | } | 209 | } |
192 | 210 | ||
193 | // Create files `/proc/gpu#/gpc#_tpc_mask`, world readable | 211 | // Create files `/proc/gpu#/gpc#_tpc_mask`, world readable |
@@ -238,6 +256,7 @@ int __init nvdebug_init(void) { | |||
238 | g_nvdebug_devices = res; | 256 | g_nvdebug_devices = res; |
239 | // Create separate ProcFS directories for each gpu | 257 | // Create separate ProcFS directories for each gpu |
240 | while (res--) { | 258 | while (res--) { |
259 | uintptr_t last_runlist = 0; | ||
241 | char device_id_str[7]; | 260 | char device_id_str[7]; |
242 | // Create a wider copy of the GPU ID to allow us to abuse the *data | 261 | // Create a wider copy of the GPU ID to allow us to abuse the *data |
243 | // field of proc_dir_entry to store the GPU ID. | 262 | // field of proc_dir_entry to store the GPU ID. |
@@ -248,10 +267,24 @@ int __init nvdebug_init(void) { | |||
248 | snprintf(device_id_str, 7, "gpu%ld", device_id); | 267 | snprintf(device_id_str, 7, "gpu%ld", device_id); |
249 | if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id))) | 268 | if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id))) |
250 | goto out_nomem; | 269 | goto out_nomem; |
251 | // Create files `/proc/gpu#/runlist#`, world readable | 270 | // Create files in the `/proc/gpu#/runlist#/` directory |
252 | if (g_nvdebug_state[device_id].chip_id < NV_CHIP_ID_AMPERE) | 271 | // The read handling code looks at the `pde_data` associated with the parent |
253 | if ((err = create_runlist_files(device_id, dir))) | 272 | // directory to determine what the runlist ID is. |
254 | goto out_err; | 273 | if ((last_runlist = get_last_runlist_id(device_id)) < 0) |
274 | return last_runlist; | ||
275 | do { | ||
276 | char runlist_name[12]; | ||
277 | struct proc_dir_entry *rl_dir; | ||
278 | // Create `/proc/gpu#/runlist#` directory | ||
279 | snprintf(runlist_name, 12, "runlist%lu", last_runlist); | ||
280 | if (!(rl_dir = proc_mkdir_data(runlist_name, 0555, dir, (void*)device_id))) | ||
281 | goto out_nomem; | ||
282 | // Create file `/proc/gpu#/runlist#/runlist`, world readable | ||
283 | if (!proc_create_data( | ||
284 | "runlist", 0444, rl_dir, compat_ops(&runlist_file_ops), | ||
285 | (void*)last_runlist)) | ||
286 | goto out_nomem; | ||
287 | } while (last_runlist-- > 0); | ||
255 | // Create file `/proc/gpu#/preempt_tsg`, world writable | 288 | // Create file `/proc/gpu#/preempt_tsg`, world writable |
256 | if (!proc_create_data( | 289 | if (!proc_create_data( |
257 | "preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops), | 290 | "preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops), |
@@ -325,7 +358,7 @@ int __init nvdebug_init(void) { | |||
325 | "local_memory", 0444, dir, compat_ops(&local_memory_file_ops), | 358 | "local_memory", 0444, dir, compat_ops(&local_memory_file_ops), |
326 | (void*)0x00100ce0)) | 359 | (void*)0x00100ce0)) |
327 | goto out_nomem; | 360 | goto out_nomem; |
328 | } | 361 | } |
329 | // Create files exposing LCE and PCE configuration (Pascal+) | 362 | // Create files exposing LCE and PCE configuration (Pascal+) |
330 | if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) { | 363 | if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) { |
331 | // Create file `/proc/gpu#/copy_topology`, world readable | 364 | // Create file `/proc/gpu#/copy_topology`, world readable |
@@ -14,6 +14,52 @@ | |||
14 | // be enabled to print the runlist on the TX2. | 14 | // be enabled to print the runlist on the TX2. |
15 | //#define FALLBACK_TO_PRAMIN | 15 | //#define FALLBACK_TO_PRAMIN |
16 | 16 | ||
17 | /* Get RunList RAM (RLRAM) offset for a runlist from the device topology | ||
18 | @param rl_id Which runlist to obtain [numbered in order of appearance in | ||
19 | the device topology (PTOP) registers] | ||
20 | @param rl_ram_off Location at which to store runlist private register | ||
21 | interface base address (PRI base); an offset into the BAR0 | ||
22 | register range. | ||
23 | @return 0 or -errno on error | ||
24 | */ | ||
25 | int get_runlist_ram(struct nvdebug_state *g, int rl_id, uint32_t *rl_ram_off) { | ||
26 | int i; | ||
27 | int curr_rl_id = 0; | ||
28 | int ptop_size = NV_PTOP_DEVICE_INFO__SIZE_1_GA100(g); | ||
29 | // Each PTOP entry is composed of 1--3 subrows, and the fields available | ||
30 | // on each row vary. The runlist RAM location is only available on row 3 | ||
31 | int ptop_entry_subrow = 0; | ||
32 | ptop_device_info_ga100_t ptop_entry; | ||
33 | // Iterate through all PTOP entries | ||
34 | for (i = 0; i < ptop_size; i++) { | ||
35 | if ((ptop_entry.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GA100(i))) == -1) | ||
36 | return -EIO; | ||
37 | // Skip empty entries | ||
38 | if (!ptop_entry.raw) | ||
39 | continue; | ||
40 | // If on subrow 3 (zero-base-index 2), runlist info is available | ||
41 | // Multiple engines may be associated with a single runlist, so | ||
42 | // multiple PTOP entries may refer to the same runlist. Only match when | ||
43 | // on the 0th-associated entry. | ||
44 | if (ptop_entry_subrow == 2 && ptop_entry.rleng_id == 0) { | ||
45 | // If this is the requested runlist, return it | ||
46 | if (curr_rl_id == rl_id) { | ||
47 | *rl_ram_off = (uint32_t)ptop_entry.runlist_pri_base << 10; | ||
48 | return 0; | ||
49 | } | ||
50 | // Otherwise, update our accounting of what the next runlist ID is | ||
51 | curr_rl_id++; | ||
52 | } | ||
53 | // Track if the next row is a subrow of the current entry | ||
54 | if (ptop_entry.has_next_entry) | ||
55 | ptop_entry_subrow += 1; | ||
56 | else | ||
57 | ptop_entry_subrow = 0; | ||
58 | } | ||
59 | // Search failed; requested index does not exist | ||
60 | return -EINVAL; | ||
61 | } | ||
62 | |||
17 | /* Get runlist head and info (incl. length) | 63 | /* Get runlist head and info (incl. length) |
18 | @param rl_id Which runlist to obtain? | 64 | @param rl_id Which runlist to obtain? |
19 | @param rl_iter Location at which to store output | 65 | @param rl_iter Location at which to store output |
@@ -39,7 +85,7 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl | |||
39 | runlist_target = rl.target; | 85 | runlist_target = rl.target; |
40 | runlist_len = rl.len; | 86 | runlist_len = rl.len; |
41 | printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx)\n", | 87 | printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx)\n", |
42 | rl_id, g->chip_id, rl.len, runlist_iova, target_to_text(rl.target), rl.raw); | 88 | rl_id, g->chip_id, rl.len, runlist_iova, target_to_text(rl.target), rl.raw); |
43 | } else if (g->chip_id < NV_CHIP_ID_AMPERE) { | 89 | } else if (g->chip_id < NV_CHIP_ID_AMPERE) { |
44 | runlist_base_tu102_t base; | 90 | runlist_base_tu102_t base; |
45 | runlist_submit_tu102_t submit; | 91 | runlist_submit_tu102_t submit; |
@@ -51,7 +97,26 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl | |||
51 | runlist_target = base.target; | 97 | runlist_target = base.target; |
52 | runlist_len = submit.len; | 98 | runlist_len = submit.len; |
53 | printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n", | 99 | printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n", |
54 | rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw); | 100 | rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw); |
101 | } else { | ||
102 | runlist_base_tu102_t base; | ||
103 | runlist_submit_tu102_t submit; | ||
104 | uint32_t runlist_pri_base; | ||
105 | // Runlist configurations are stored in per-runlist regions on Ampere+ | ||
106 | if ((err = get_runlist_ram(g, rl_id, &runlist_pri_base)) < 0) | ||
107 | return err; | ||
108 | // The runlist configuration region (RLRAM) contains Turing-like BASE | ||
109 | // and SUBMIT registers at static offsets | ||
110 | if ((base.raw = nvdebug_readq(g, runlist_pri_base + NV_RUNLIST_BASE_GA100)) == -1) | ||
111 | return -EIO; | ||
112 | if ((submit.raw = nvdebug_readq(g, runlist_pri_base + NV_RUNLIST_SUBMIT_GA100)) == -1) | ||
113 | return -EIO; | ||
114 | runlist_iova = ((uint64_t)base.ptr) << 12; | ||
115 | runlist_target = base.target; | ||
116 | runlist_len = submit.len; | ||
117 | printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n", | ||
118 | rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw); | ||
119 | rl_iter->runlist_pri_base = runlist_pri_base; | ||
55 | } | 120 | } |
56 | // Return early on an empty runlist | 121 | // Return early on an empty runlist |
57 | if (!runlist_len) | 122 | if (!runlist_len) |
diff --git a/runlist_procfs.c b/runlist_procfs.c index 8152463..c1cfc87 100644 --- a/runlist_procfs.c +++ b/runlist_procfs.c | |||
@@ -8,11 +8,11 @@ | |||
8 | 8 | ||
9 | #ifdef DETAILED_CHANNEL_INFO | 9 | #ifdef DETAILED_CHANNEL_INFO |
10 | /* Print channel details using PCCSR (Programmable Channel Control System RAM?) | 10 | /* Print channel details using PCCSR (Programmable Channel Control System RAM?) |
11 | * @param s Pointer to state from seq_file subsystem to pass to seq_printf | 11 | @param s Pointer to state from seq_file subsystem to pass to seq_printf |
12 | * @param g Pointer to our internal GPU state | 12 | @param g Pointer to our internal GPU state |
13 | * @param chid ID of channel to print details on, range [0, 512) | 13 | @param chid ID of channel to print details on, range [0, 512) |
14 | * @param prefix Text string to prefix each line with, or empty string | 14 | @param prefix Text string to prefix each line with, or empty string |
15 | */ | 15 | */ |
16 | static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix) { | 16 | static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix) { |
17 | channel_ctrl_t chan; | 17 | channel_ctrl_t chan; |
18 | uint64_t instance_ptr; | 18 | uint64_t instance_ptr; |
@@ -21,7 +21,7 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state | |||
21 | return -EIO; | 21 | return -EIO; |
22 | instance_ptr = (uint64_t)chan.inst_ptr << 12; | 22 | instance_ptr = (uint64_t)chan.inst_ptr << 12; |
23 | // Don't print write-only fields | 23 | // Don't print write-only fields |
24 | seq_printf(s, "%s+- Channel Info %-4d -+\n", prefix, chid); | 24 | seq_printf(s, "%s|= Channel Info ======|\n", prefix); |
25 | seq_printf(s, "%s| Enabled: %d|\n", prefix, chan.enable); | 25 | seq_printf(s, "%s| Enabled: %d|\n", prefix, chan.enable); |
26 | seq_printf(s, "%s| Next: %d|\n", prefix, chan.next); | 26 | seq_printf(s, "%s| Next: %d|\n", prefix, chan.next); |
27 | seq_printf(s, "%s| PBDMA Faulted: %d|\n", prefix, chan.pbdma_faulted); | 27 | seq_printf(s, "%s| PBDMA Faulted: %d|\n", prefix, chan.pbdma_faulted); |
@@ -32,7 +32,37 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state | |||
32 | seq_printf(s, "%s| %#018llx|\n", prefix, instance_ptr); | 32 | seq_printf(s, "%s| %#018llx|\n", prefix, instance_ptr); |
33 | seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target)); | 33 | seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target)); |
34 | seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind); | 34 | seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind); |
35 | seq_printf(s, "%s+---------------------+\n", prefix); | 35 | return 0; |
36 | } | ||
37 | |||
38 | /* `runlist_detail_seq_show_chan()`, but for Ampere+ | ||
39 | @param runlist_pri_base Base of the RLRAM region for this runlist | ||
40 | |||
41 | `runlist_pri_base` is necessary, since Channel RAM is now per-runlist on | ||
42 | Ampere+, and its location is configured in Runlist RAM. | ||
43 | */ | ||
44 | static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base) { | ||
45 | runlist_channel_config_t channel_config; | ||
46 | channel_ctrl_ga100_t chan; | ||
47 | |||
48 | // Channel RAM is subsidiary to Runlist RAM (ie. per-runlist) on Ampere+ | ||
49 | if ((channel_config.raw = nvdebug_readl(g, runlist_pri_base + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1) | ||
50 | return -EIO; | ||
51 | if ((chan.raw = nvdebug_readl(g, (((uint32_t)channel_config.bar0_offset << 4) + chid * 4))) == -1) | ||
52 | return -EIO; | ||
53 | seq_printf(s, "%s|= Channel Info ======|\n", prefix); | ||
54 | seq_printf(s, "%s| Enabled: %d|\n", prefix, chan.enable); | ||
55 | seq_printf(s, "%s| Next: %d|\n", prefix, chan.next); | ||
56 | seq_printf(s, "%s| Busy: %d|\n", prefix, chan.busy); | ||
57 | seq_printf(s, "%s| PBDMA Faulted: %d|\n", prefix, chan.pbdma_faulted); | ||
58 | seq_printf(s, "%s| ENG Faulted: %d|\n", prefix, chan.eng_faulted); | ||
59 | seq_printf(s, "%s| On PBDMA: %d|\n", prefix, chan.on_pbdma); | ||
60 | seq_printf(s, "%s| On ENG: %d|\n", prefix, chan.on_eng); | ||
61 | seq_printf(s, "%s| Pending: %d|\n", prefix, chan.pending); | ||
62 | seq_printf(s, "%s| CTX Reload: %d|\n", prefix, chan.ctx_reload); | ||
63 | seq_printf(s, "%s| PBDMA Busy: %d|\n", prefix, chan.pbdma_busy); | ||
64 | seq_printf(s, "%s| ENG Busy: %d|\n", prefix, chan.eng_busy); | ||
65 | seq_printf(s, "%s| Acquire Fail: %d|\n", prefix, chan.acquire_fail); | ||
36 | return 0; | 66 | return 0; |
37 | } | 67 | } |
38 | #endif | 68 | #endif |
@@ -118,27 +148,33 @@ static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) { | |||
118 | } else { | 148 | } else { |
119 | char *indt = ""; | 149 | char *indt = ""; |
120 | u64 instance_ptr = 0; | 150 | u64 instance_ptr = 0; |
121 | |||
122 | if (rl_iter->entries_left_in_tsg) | 151 | if (rl_iter->entries_left_in_tsg) |
123 | indt = " "; | 152 | indt = " "; |
124 | #ifdef DETAILED_CHANNEL_INFO | ||
125 | runlist_detail_seq_show_chan(s, g, chid(g, entry), indt); | ||
126 | return 0; | ||
127 | #endif | ||
128 | // Reconstruct pointer to channel instance block | 153 | // Reconstruct pointer to channel instance block |
129 | if (g->chip_id >= NV_CHIP_ID_VOLTA) { | 154 | if (g->chip_id >= NV_CHIP_ID_VOLTA) { |
130 | instance_ptr = ((struct gv100_runlist_chan*)entry)->inst_ptr_hi; | 155 | instance_ptr = ((struct gv100_runlist_chan*)entry)->inst_ptr_hi; |
131 | instance_ptr <<= 32; | 156 | instance_ptr <<= 32; |
132 | } | 157 | } |
133 | instance_ptr |= inst_ptr_lo(g, entry) << 12; | 158 | instance_ptr |= inst_ptr_lo(g, entry) << 12; |
134 | 159 | // Print channel information from runlist | |
135 | seq_printf(s, "%s+- Channel Entry %-4d-+\n", indt, chid(g, entry)); | 160 | seq_printf(s, "%s+- Channel Entry %-4d-+\n", indt, chid(g, entry)); |
136 | if (g->chip_id >= NV_CHIP_ID_VOLTA) | 161 | if (g->chip_id >= NV_CHIP_ID_VOLTA) |
137 | seq_printf(s, "%s| Runqueue Selector: %d|\n", indt, | 162 | seq_printf(s, "%s| Runqueue Selector: %d|\n", indt, |
138 | ((struct gv100_runlist_chan*)entry)->runqueue_selector); | 163 | ((struct gv100_runlist_chan*)entry)->runqueue_selector); |
164 | // Not populated on Kepler [ex: gk104 in Bonham (Quadro K5000)], and | ||
165 | // populated but unused on Pascal [ex: gp104 in Bonham (GTX 1080 Ti)]. | ||
166 | // (The aperture field may be incorrectly populated as INVALID, but the | ||
167 | // context still works on the aforementioned Pascal GPU.) | ||
139 | seq_printf(s, "%s| Instance PTR: |\n", indt); | 168 | seq_printf(s, "%s| Instance PTR: |\n", indt); |
140 | seq_printf(s, "%s| %#018llx|\n", indt, instance_ptr); | 169 | seq_printf(s, "%s| %#018llx|\n", indt, instance_ptr); |
141 | seq_printf(s, "%s| %20s|\n", indt, target_to_text(inst_target(g, entry))); | 170 | seq_printf(s, "%s| %20s|\n", indt, target_to_text(inst_target(g, entry))); |
171 | #ifdef DETAILED_CHANNEL_INFO | ||
172 | // Print channel info from PCCSR/Channel RAM and the instance block | ||
173 | if (g->chip_id < NV_CHIP_ID_AMPERE) | ||
174 | runlist_detail_seq_show_chan(s, g, chid(g, entry), indt); | ||
175 | else | ||
176 | runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base); | ||
177 | #endif | ||
142 | seq_printf(s, "%s+---------------------+\n", indt); | 178 | seq_printf(s, "%s+---------------------+\n", indt); |
143 | } | 179 | } |
144 | return 0; | 180 | return 0; |