path: root/runlist_procfs.c
author    Joshua Bakita <bakitajoshua@gmail.com> 2025-05-05 03:53:01 -0400
committer Joshua Bakita <bakitajoshua@gmail.com> 2025-05-05 03:53:13 -0400
commit    293430fcb5d4013b573556c58457ee706e482b7f (patch)
tree      9328fa680f55b4e1a08d24714275b8437be3be5d /runlist_procfs.c
parent    494df296bf4abe9b2b484bde1a4fad28c989afec (diff)
Snapshot for ECRTS'25 artifact evaluation
Diffstat (limited to 'runlist_procfs.c')
-rw-r--r--  runlist_procfs.c | 645
1 file changed, 636 insertions(+), 9 deletions(-)
diff --git a/runlist_procfs.c b/runlist_procfs.c
index b2159f6..a3a6df3 100644
--- a/runlist_procfs.c
+++ b/runlist_procfs.c
@@ -1,12 +1,117 @@
 #include <linux/seq_file.h> // For seq_* functions and types
 #include <linux/version.h> // Macros to detect kernel version
+#include <linux/platform_device.h> // For platform_get_resource()
+#include <linux/pci.h> // For pci_resource_start()
+#include <linux/iommu.h> // For iommu_ functions
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,10,0)
+#include <linux/dma-map-ops.h> // For get_dma_ops()
+#endif
 
 #include "nvdebug_linux.h"
 
-// Uncomment to expand channel status information when printing the runlist
+// We cannot touch PRAMIN (via page table operations or ctxsw access) if we're
+// using it to walk the runlist
+//#ifndef FALLBACK_TO_PRAMIN
+// Uncomment to expand channel status, instance, and context information when
+// printing the runlist
 #define DETAILED_CHANNEL_INFO
+//#endif
 
 #ifdef DETAILED_CHANNEL_INFO
+// Print the channel instance and context switch blocks
+// XXX: THIS IS UNSAFE ON KEPLER!
+// instance_deref() will call into the page table logic, which may move PRAMIN.
+// PRAMIN appears heavily utilized by the driver on Bonham (at least), and
+// moving it causes problems.
+static int runlist_detail_seq_show_inst(struct seq_file *s, struct nvdebug_state *g, char *prefix, uint64_t instance_ptr, enum INST_TARGET instance_target) {
+	instance_ctrl_t *inst = NULL;
+	context_switch_ctrl_t *ctxsw = NULL;
+	int i;
+
+#ifdef FALLBACK_TO_PRAMIN
+	bar0_window_t win;
+	win.raw = nvdebug_readl(g, NV_XAL_EP_BAR0_WINDOW_BASE);
+	inst = g->regs + NV_PRAMIN + addr_to_pramin_mut(g, instance_ptr, instance_target);
+#else
+	if (IS_ERR(inst = instance_deref(g, instance_ptr, instance_target)))
+		return PTR_ERR(inst);
+#endif // FALLBACK_TO_PRAMIN
+	// If unable to access instance block, skip
+	if (!inst)
+		return 0;
+
+	// Print the channel instance block
+	// As an ID, use upper 52 bits of the instance address (lower 12 are zero)
+	//seq_printf(s, "%s+- Inst %-13llx-+\n", prefix, instance_ptr >> 12);
+	seq_printf(s, "%s|= Instance Block ====|\n", prefix);
+	seq_printf(s, "%s| Target Engine: %2d|\n", prefix, inst->fc_target);
+	seq_printf(s, "%s| Privileged: %1d|\n", prefix, inst->fc_config_is_priv);
+	seq_printf(s, "%s| Channel VEID: %2d|\n", prefix, inst->fc_chan_info_veid);
+	seq_printf(s, "%s| WFI PTR: |\n", prefix);
+	seq_printf(s, "%s| %#018llx|\n", prefix, (uint64_t)inst->engine_wfi_ptr << 12);
+	seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->engine_wfi_target));
+	seq_printf(s, "%s| Virtual address? %d|\n", prefix, inst->engine_wfi_is_virtual);
+	seq_printf(s, "%s| WFI VEID: %2d|\n", prefix, inst->engine_wfi_veid);
+	seq_printf(s, "%s| All PDB PTR: |\n", prefix);
+	seq_printf(s, "%s| %#018llx|\n", prefix, (u64)inst->pdb.page_dir << 12);
+	seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->pdb.target));
+	seq_printf(s, "%s| %20s|\n", prefix, inst->pdb.is_volatile ? "volatile" : "non-volatile");
+//	seq_printf(s, "%s|raw: %0#10lx|\n", prefix, inst->pdb.raw);
+	seq_printf(s, "%s| Num subcontexts: %2ld|\n", prefix, hweight64(inst->subcontext_pdb_valid));
+	// Print configuration of every enabled subcontext
+	for (i = 0; i < 64; i++) {
+		// Skip subcontexts without their enable bit set
+		if (!(1 & (inst->subcontext_pdb_valid >> i)))
+			continue;
+		seq_printf(s, "%s| CPU SC%02d ASID%7d|\n", prefix, i, inst->subcontext[i].pasid);
+		seq_printf(s, "%s| SC%02d PDB PTR: |\n", prefix, i);
+		seq_printf(s, "%s| %#018llx|\n", prefix, ((u64)inst->subcontext[i].pdb.page_dir_hi << 32) | ((u64)inst->subcontext[i].pdb.page_dir_lo << 12));
+		seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->subcontext[i].pdb.target));
+		seq_printf(s, "%s| %20s|\n", prefix, inst->subcontext[i].pdb.is_volatile ? "volatile" : "non-volatile");
+//		seq_printf(s, "%s|raw: %0#10lx|\n", prefix, inst->subcontext[i].pdb.raw);
+	}
+
+	// XXX: CTXSW is only accessible via PRAMIN. Accessing PRAMIN appears to
+	// either be broken, or race with the driver on Kepler (gk104 tested). So,
+	// do not attempt to touch the CTXSW block on Kepler.
+	// TODO: This check should be moved into addr_to_pramin_mut().
+	if (g->chip_id < NV_CHIP_ID_MAXWELL)
+		return 0;
+	// End XXX
+
+	if (IS_ERR(ctxsw = get_ctxsw(g, inst))) {
+#ifdef FALLBACK_TO_PRAMIN
+		nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw);
+#endif
+		return PTR_ERR(ctxsw);
+	}
+	// If unable to access CTXSW block, skip
+	if (!ctxsw) {
+#ifdef FALLBACK_TO_PRAMIN
+		nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw);
+#endif
+		return 0;
+	}
+	// Access and print the preemption mode and context ID
+	seq_printf(s, "%s|= Context State =====|\n", prefix);
+	seq_printf(s, "%s| Ctx. ID: %#10x|\n", prefix, ctxsw->context_id);
+	// No other CTXSW fields are supported pre-Pascal
+	if (g->chip_id < NV_CHIP_ID_PASCAL)
+		return 0;
+	seq_printf(s, "%s| Gfx. Preemption:%4s|\n", prefix,
+	           graphics_preempt_type_to_text(ctxsw->graphics_preemption_options));
+	seq_printf(s, "%s| Cmp. Preemption:%4s|\n", prefix,
+	           compute_preempt_type_to_text(ctxsw->compute_preemption_options));
+	seq_printf(s, "%s| #WFI Saves:%9d|\n", prefix, ctxsw->num_wfi_save_operations);
+	seq_printf(s, "%s| #CTA Saves:%9d|\n", prefix, ctxsw->num_cta_save_operations);
+	seq_printf(s, "%s| #GFXP Saves:%8d|\n", prefix, ctxsw->num_gfxp_save_operations);
+	seq_printf(s, "%s| #CILP Saves:%8d|\n", prefix, ctxsw->num_cilp_save_operations);
+#ifdef FALLBACK_TO_PRAMIN
+	nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw);
+#endif
+	return 0;
+}
+
 /* Print channel details using PCCSR (Programmable Channel Control System RAM?)
    @param s Pointer to state from seq_file subsystem to pass to seq_printf
    @param g Pointer to our internal GPU state
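
Aside: the 64-iteration subcontext scan above could also be written with the
kernel's bit-iteration helper from <linux/bitops.h>. A minimal sketch, not
part of this patch, assuming a 64-bit kernel so one unsigned long holds the
whole subcontext_pdb_valid mask:

	unsigned long valid_mask = inst->subcontext_pdb_valid;
	int i;
	// Visits only the set bits, replacing the manual shift-and-test loop
	for_each_set_bit(i, &valid_mask, 64)
		seq_printf(s, "%s| CPU SC%02d ASID%7d|\n", prefix, i,
		           inst->subcontext[i].pasid);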
@@ -32,16 +137,19 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state
 	seq_printf(s, "%s| %#018llx|\n", prefix, instance_ptr);
 	seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target));
 	seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind);
-	return 0;
+	// Print instance block
+	return runlist_detail_seq_show_inst(s, g, prefix, instance_ptr, chan.inst_target);
 }
 
 /* `runlist_detail_seq_show_chan()`, but for Ampere+
+   @param instance_ptr Address for the channel instance block
+   @param instance_target Aperture of `instance_ptr`
    @param runlist_pri_base Base of the RLRAM region for this runlist
 
    `runlist_pri_base` is necessary, since Channel RAM is now per-runlist on
    Ampere+, and its location is configured in Runlist RAM.
 */
-static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base) {
+static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base, uint64_t instance_ptr, enum INST_TARGET instance_target) {
 	runlist_channel_config_t channel_config;
 	channel_ctrl_ga100_t chan;
 
@@ -63,7 +171,7 @@ static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug
 	seq_printf(s, "%s| PBDMA Busy: %d|\n", prefix, chan.pbdma_busy);
 	seq_printf(s, "%s| ENG Busy: %d|\n", prefix, chan.eng_busy);
 	seq_printf(s, "%s| Acquire Fail: %d|\n", prefix, chan.acquire_fail);
-	return 0;
+	return runlist_detail_seq_show_inst(s, g, prefix, instance_ptr, instance_target);
 }
 #endif
 
@@ -173,7 +281,7 @@ static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) {
 		if (g->chip_id < NV_CHIP_ID_AMPERE)
 			runlist_detail_seq_show_chan(s, g, chid(g, entry), indt);
 		else
-			runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base);
+			runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base, instance_ptr, inst_target(g, entry));
 #endif
 		seq_printf(s, "%s+---------------------+\n", indt);
 	}
@@ -232,15 +340,17 @@ struct file_operations preempt_tsg_file_ops = {
 
 ssize_t resubmit_runlist_file_write(struct file *f, const char __user *buffer,
 		size_t count, loff_t *off) {
-	uint32_t target_runlist;
+	uint32_t target_runlist, target_offset;
 	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
 	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
-	int err = kstrtou32_from_user(buffer, count, 0, &target_runlist);
+	int err = kstrtou32_from_user(buffer, count, 0, &target_offset);
 	if (err)
 		return err;
+	// (Ab)use the PDE_DATA field for the runlist ID
+	target_runlist = file2gpuidx(f);
 
 	// resubmit_runlist() checks that target_runlist is valid
-	if ((err = resubmit_runlist(g, target_runlist)))
+	if ((err = resubmit_runlist(g, target_runlist, target_offset)))
 		return err;
 
 	return count;
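
Aside: the "(Ab)use the PDE_DATA field" trick above stores the runlist ID in
the proc entry's data pointer at registration time. A hedged sketch of the
pattern (the directory and ID names are illustrative; nvdebug's actual
registration code is not part of this hunk):

	#include <linux/proc_fs.h>
	// At procfs setup: stash the runlist ID in the entry's data field
	proc_create_data("resubmit_runlist", 0222, runlist_dir,
	                 &resubmit_runlist_file_ops, (void *)(uintptr_t)rl_id);
	// In the handler, file2gpuidx() presumably reduces to this
	// (pde_data() is spelled PDE_DATA() before Linux 5.17):
	target_runlist = (uintptr_t)pde_data(file_inode(f));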
@@ -351,6 +461,54 @@ struct file_operations enable_channel_file_ops = {
 	.llseek = default_llseek,
 };
 
+ssize_t comm_preempt_channel_file_write(struct file *f, const char __user *buf,
+		size_t count, loff_t *off,
+		enum COMPUTE_PREEMPT_TYPE mode) {
+	uint32_t target_channel, target_runlist;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
+	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
+	int err = kstrtou32_from_user(buf, count, 0, &target_channel);
+	if (err)
+		return err;
+	// (Ab)use the PDE_DATA field used by file2gpuidx() for the runlist ID
+	target_runlist = file2gpuidx(f);
+	// Set preemption mode for the context of this channel
+	if ((err = set_channel_preemption_mode(g, target_channel, target_runlist, mode)))
+		return err;
+
+	return count;
+}
+
+ssize_t wfi_preempt_channel_file_write(struct file *f, const char __user *buf,
+		size_t count, loff_t *off) {
+	return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_WFI);
+}
+
+struct file_operations wfi_preempt_channel_file_ops = {
+	.write = wfi_preempt_channel_file_write,
+	.llseek = default_llseek,
+};
+
+ssize_t cta_preempt_channel_file_write(struct file *f, const char __user *buf,
+		size_t count, loff_t *off) {
+	return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_CTA);
+}
+
+struct file_operations cta_preempt_channel_file_ops = {
+	.write = cta_preempt_channel_file_write,
+	.llseek = default_llseek,
+};
+
+ssize_t cil_preempt_channel_file_write(struct file *f, const char __user *buf,
+		size_t count, loff_t *off) {
+	return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_CILP);
+}
+
+struct file_operations cil_preempt_channel_file_ops = {
+	.write = cil_preempt_channel_file_write,
+	.llseek = default_llseek,
+};
+
 // Tested working on Pascal (gp106) through Ada (ad102)
 ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
 		size_t count, loff_t *off) {
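
Usage note: each of the three preempt files above takes a channel ID and
applies the corresponding compute preemption mode (WFI, CTA, or CILP) to that
channel's context, e.g. `echo 5 > /proc/gpu0/runlist0/cta_preempt_channel`.
The procfs path here is assumed from the per-GPU, per-runlist layout implied
by file2parentgpuidx()/file2gpuidx(), not shown in this hunk.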
@@ -419,11 +577,13 @@ ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
 
 		// TODO: Fix the above for bare channels. Add "for_chan_until_tsg"?
 	}
+#warning switch_to_tsg has preempt_runlist omitted!
+	return count;
 
 	// Resubmit the runlist to ensure that changes to channel enablement are
 	// picked up on Turing+ GPUs (channel enablements may not be otherwise).
 	if (g->chip_id >= NV_CHIP_ID_TURING)
-		if ((err = resubmit_runlist(g, target_runlist)))
+		if ((err = resubmit_runlist(g, target_runlist, -1)))
 			return err;
 
 	// Trigger a runlist-level preempt to stop whatever was running, triggering
@@ -438,3 +598,470 @@ struct file_operations switch_to_tsg_file_ops = {
 	.write = switch_to_tsg_file_write,
 	.llseek = default_llseek,
 };
+
+ssize_t preempt_runlist_file_write(struct file *f, const char __user *buffer,
+		size_t count, loff_t *off) {
+	uint32_t target_runlist;
+	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
+	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
+	int err = kstrtou32_from_user(buffer, count, 0, &target_runlist);
+	if (err)
+		return err;
+
+	// TODO: Check runlist is in-range
+	if ((err = preempt_runlist(g, target_runlist)))
+		return err;
+
+	return count;
+}
+
+struct file_operations preempt_runlist_file_ops = {
+	.write = preempt_runlist_file_write,
+	.llseek = default_llseek,
+};
+
+// Value written to this file is which runlist to ack the IRQ for
+ssize_t ack_bad_tsg_file_write(struct file *f, const char __user *buffer,
+		size_t count, loff_t *off) {
+	uint32_t target_runlist;
+	uint32_t rl_ram_off;
+	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
+	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
+	int err = kstrtou32_from_user(buffer, count, 0, &target_runlist);
+	if (err)
+		return err;
+
+	if ((err = get_runlist_ram(g, target_runlist, &rl_ram_off)))
+		return err;
+
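+	// Editorial note: 0x100 is presumably this runlist's INTR_0 register,
+	// with bit 12 being the BAD_TSG interrupt this file acknowledges; the
+	// register name and bit position are inferred from the file's purpose,
+	// not named in this patch.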
+	nvdebug_writel(g, rl_ram_off + 0x100, 1 << 12);
+
+	return count;
+}
+
+struct file_operations ack_bad_tsg_file_ops = {
+	.write = ack_bad_tsg_file_write,
+	.llseek = default_llseek,
+};
+
+// Rather than mapping all of BAR0, we just map:
+// - On Pascal, Volta, Turing: MC_BOOT, PFIFO, PCCSR, PTOP
+// - On Ampere: MC_BOOT, RAMRL(0), CHRAM(0), PTOP
+// "All CUDA-managed pointers are within the first 40 bits of the process's
+// VA space" (Sec. 4.1, GPUDirect RDMA Documentation)
+// - This means 0x00ff_ffff_ffff is the highest valid CUDA virtual address,
+//   and all higher addresses are unused.
+// - So we use 0x6000_0000_0000+; this falls within the first PDE3 entry, and
+//   at the end of the PDE2 entries
+//   + Using the second PDE3 entry did not appear to work on Jetson (IIRC)
+#define BAR0_USER_ADDR 0x0000700000000000llu
+#define MEM_USER_ADDR  0x0000600000000000llu
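+// Editorial sanity check, not part of the original patch: assuming the v2
+// page-table split in which the top level (PDE3) is indexed by VA bits
+// 48:47, one PDE3 entry spans 1ULL << 47 bytes, so both windows fall in
+// PDE3 entry 0 and sit well above the highest 40-bit CUDA-managed address.
+_Static_assert((MEM_USER_ADDR >> 47) == 0, "MEM_USER_ADDR in first PDE3 entry");
+_Static_assert((BAR0_USER_ADDR >> 47) == 0, "BAR0_USER_ADDR in first PDE3 entry");
+_Static_assert(MEM_USER_ADDR > 0x00ffffffffffull, "clear of CUDA-managed VAs");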
+
+/* Map all of GPU VRAM, and selected BAR0 regions, into a channel instance's
+ * virtual address space at predefined offsets (above).
+ *
+ * @param g        Pointer to the nvdebug state for the selected GPU
+ * @param inst_ptr Dereferenceable pointer to the channel's instance block
+ * @returns 0 on success, -errno on error
+ *
+ * Support: Pascal, Volta, Turing, Ampere
+ */
+int map_mem_for_instance(struct nvdebug_state *g, instance_ctrl_t *inst_ptr) {
+	int ret;
+	uintptr_t off, ram_size;
+	dma_addr_t bus_mc_boot_ram, bus_ptop_ram, bus_fifo_ram, bus_chan_ctrl_ram;
+	uint64_t mc_boot_ram, ptop_ram, fifo_ram, chan_ctrl_ram;
+	page_dir_config_t chan_pd_config;
+	memory_range_t mem_range;
+	uint32_t channel_ram_off, runlist_ram_off, channel_ram_size, bar0_base;
+	struct iommu_domain *dom;
+
+	if (g->chip_id >= NV_CHIP_ID_AMPERE) {
+		runlist_channel_config_t channel_config;
+		if ((ret = get_runlist_ram(g, 0, &runlist_ram_off))) {
+			printk(KERN_ERR "[nvdebug] %s: Unable to determine location of runlist0 RAM!\n", __func__);
+			return ret;
+		}
+		if (runlist_ram_off & 0xfff) {
+			printk(KERN_ERR "[nvdebug] %s: Runlist0 RAM is not page-aligned!\n", __func__);
+			return -EAFNOSUPPORT;
+		}
+		if ((channel_config.raw = nvdebug_readl(g, runlist_ram_off + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1)
+			return -EIO;
+		channel_ram_off = (uint32_t)channel_config.bar0_offset << 4;
+		if (channel_ram_off & 0xfff) {
+			printk(KERN_ERR "[nvdebug] %s: Runlist0 CHRAM is not page-aligned!\n", __func__);
+			return -EAFNOSUPPORT;
+		}
+		channel_ram_size = (1 << channel_config.num_channels_log2) * sizeof(channel_ctrl_ga100_t);
+		printk(KERN_DEBUG "[nvdebug] %s: Mapping CHRAM at %#018llx--%x and RLRAM at %#018llx--%x.\n", __func__, BAR0_USER_ADDR + channel_ram_off, channel_ram_size - 1, BAR0_USER_ADDR + runlist_ram_off, 4095);
+	} else {
+		channel_ram_off = NV_PCCSR;
+		// MAX_CHID * sizeof(channel_ctrl_gf100_t) is < 4 KiB, so hardcode
+		channel_ram_size = 4096;
+		runlist_ram_off = NV_PFIFO;
+	}
+
+	// map_mem_by_chid() pulls the instance block via PRAMIN, so inst_ptr will
+	// be invalid after moving PRAMIN (eg. as part of a page table operation).
+	// To avoid accessing inst_ptr after invalidation, keep a copy of what we
+	// need.
+	chan_pd_config = inst_ptr->pdb;
+
+	// map_page_directory_v1() is unimplemented, precluding Maxwell (or older)
+	// support (as they don't support v2 page tables).
+	if (!chan_pd_config.is_ver2)
+		return -EOPNOTSUPP;
+
+	// Determine the size of GPU physical memory (VRAM).
+	if ((mem_range.raw = nvdebug_readl(g, NV_FB_MMU_LOCAL_MEMORY_RANGE)) == -1)
+		return -EIO;
+	ram_size = memory_range_to_bytes(mem_range);
+
+	// We map memory using huge pages, and thus do not support GPUs with
+	// non-2-MiB-divisible VID_MEM sizes.
+	if (ram_size % (1 << 21) != 0) {
+		printk(KERN_ERR "[nvdebug] %s: GPU VID_MEM of %lu bytes is not a multiple of 2 MiB!\n", __func__, ram_size);
+		return -EAFNOSUPPORT;
+	}
+
+	// Map all of physical GPU memory (VID_MEM) into this channel's GPU virtual
+	// address space using huge (2 MiB) pages.
+	for (off = 0; off < ram_size; off += (1 << 21)) {
+		if ((ret = map_page_directory(g, chan_pd_config,
+				MEM_USER_ADDR + off, off, TARGET_VID_MEM, true)) < 0)
+			return ret;
+		// If the mapping already exists for this page directory, the other
+		// mappings should already exist, and can be skipped.
+		if (ret == 1) {
+			printk(KERN_INFO "[nvdebug] %s: VRAM mapping from %llx to %lx already exists. Assuming all mappings already exist and returning early...\n", __func__, MEM_USER_ADDR + off, off);
+			return 0;
+		}
+	}
+
+	// Map Channel RAM to a GPU-accessible bus address (gets past any IOMMU or
+	// IOVA layers), then map that address into this channel's GPU virtual
+	// address space. NV_PCCSR_CHANNEL_INST(0) is 4k-aligned, so it can be
+	// directly mapped.
+	// XXX: All these mappings are currently returning -1 on all reads on
+	//      sunlight, jbakita-old, jetson-xavier, jetson-orin, and bonham,
+	//      which seems to be returned from the PCIe root (on PCIe GPUs).
+	if (g->pcid)
+		bar0_base = pci_resource_start(g->pcid, 0);
+	else if (g->platd)
+		bar0_base = platform_get_resource(g->platd, IORESOURCE_MEM, 0)->start;
+	else
+		return -ENOTRECOVERABLE;
+	mc_boot_ram = NV_MC_BOOT_0 + bar0_base;
+	// PTOP fits within a page, but is not page-aligned; round down.
+	ptop_ram = (NV_PTOP & ~0xfffu) + bar0_base;
+	fifo_ram = runlist_ram_off + bar0_base;
+	chan_ctrl_ram = channel_ram_off + bar0_base;
+
+	// Check if GPU-accessible bus addresses are the same as CPU-visible
+	// physical addresses. Logic from amdgpu_device_check_iommu_direct_map().
+	dom = iommu_get_domain_for_dev(g->dev);
+	if (!dom || dom->type == IOMMU_DOMAIN_IDENTITY) {
+		// Used for: jbakita-old, sunlight, jetson-xavier, jetson-orin integrated, bonham, ?
+		// (For all these, reads on the mapping return only -1.)
+		// (Forcing these through dma_map_resource()/iommu_map() changes nothing.)
+		// (Note that `ls -l /sys/class/iommu/*/devices` also reports that the
+		// GPU is not available under the I/O MMU on these platforms.)
+		// To fix this, please enable AMD-Vi/ARM SMMU/Intel VT-d in your BIOS
+		// settings, UEFI settings, or device-tree file. Supported on:
+		// - AMD: Bulldozer+ (or Phenom II w/ 890FX or 990FX chipset)
+		// - Intel: Most since Core2 Duo
+		// Note that while the Jetson Orin has an SMMU (I/O MMU), the GPU does
+		// not appear to be configured by any pre-provided device tree files
+		// to use the SMMU.
+		printk(KERN_INFO "[nvdebug] map_mem_ctxid: I/O MMU is unavailable/disabled for GPU %x. Assuming phys and bus addresses are identical...\n", g->chip_id);
+		bus_mc_boot_ram = mc_boot_ram;
+		bus_ptop_ram = ptop_ram;
+		bus_fifo_ram = fifo_ram;
+		bus_chan_ctrl_ram = chan_ctrl_ram;
+	} else {
+		printk(KERN_INFO "[nvdebug] map_mem_ctxid: I/O MMU is enabled. Attempting to use dma_map_resource()...\n");
+		// Used for: tama, yamaha
+		// Fails on tama, yamaha
+		// (Works on jetson-xavier, jetson-orin, and bonham, but appears to be
+		// a no-op, and yields inaccessible memory. Get `mc-err: (255)
+		// csr_nvl7r: EMEM address decode error` on access on jetson boards,
+		// and a -1 read on all.)
+		bus_mc_boot_ram = dma_map_resource(g->dev, mc_boot_ram, 4096*2 /* *2 is a XXX hack to include PBUS */, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+		bus_ptop_ram = dma_map_resource(g->dev, ptop_ram, 4096, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+		bus_fifo_ram = dma_map_resource(g->dev, fifo_ram, 4096*8 /* *8 is a XXX hack */, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+		bus_chan_ctrl_ram = dma_map_resource(g->dev, chan_ctrl_ram, 2*4096, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+		if (dma_mapping_error(g->dev, bus_mc_boot_ram) ||
+		    dma_mapping_error(g->dev, bus_ptop_ram) ||
+		    dma_mapping_error(g->dev, bus_fifo_ram) ||
+		    dma_mapping_error(g->dev, bus_chan_ctrl_ram)) {
+			// Used for: tama, yamaha
+			printk(KERN_WARNING "[nvdebug] map_mem_ctxid: Unable to map BAR0 addresses to device-accessible addresses via dma_map_resource(). Return codes: %d for MC_BOOT, %d for PFIFO, %d for PCCSR.\n",
+			       dma_mapping_error(g->dev, bus_mc_boot_ram),
+			       dma_mapping_error(g->dev, bus_fifo_ram),
+			       dma_mapping_error(g->dev, bus_chan_ctrl_ram));
+			// This fallback does not appear to work on jbakita-old (5.4, GART IOMMU), but works on tama
+			if (!get_dma_ops(g->dev))
+				printk(KERN_WARNING "[nvdebug] Reason: No DMA `ops`, and direct mapping failed.\n");
+			else if (!get_dma_ops(g->dev)->map_resource)
+				// Fires on: tama, yamaha
+				printk(KERN_WARNING "[nvdebug] Reason: `map_resource` function undefined on this platform.\n");
+			if (!dom) {
+				printk(KERN_ERR "[nvdebug] map_mem_ctxid: No I/O MMU available and dma_map_resource() failed. Aborting mapping of BAR0 regions!\n");
+				return -ENOTRECOVERABLE;
+			}
+			printk(KERN_INFO "[nvdebug] map_mem_ctxid: Trying to fall back to direct I/O MMU manipulation...\n");
+			// XXX: Fall back to directly creating the I/O MMU mappings.
+			// This is necessary. Directly accessing BAR0 addresses throws
+			// I/O MMU errors in the kernel log on yamaha.
+			// See also: comment on kfd_mem_dmamap_sg_bo() in amdgpu.
+			// Note: dma_map_resource -> map_resource -> [arm_]iommu_map_resource
+			// -> __iommu_dma_map -> iommu_map is the happy path, but this seems
+			// to regularly fail, even though the iommu_map path works. One key
+			// difference is that the dma_map_resource() path also includes
+			// IOMMU_MMIO in the iommu_map() flags.
+			bus_mc_boot_ram = mc_boot_ram;
+			bus_ptop_ram = ptop_ram;
+			bus_fifo_ram = fifo_ram;
+			bus_chan_ctrl_ram = chan_ctrl_ram;
+			// Create identity mappings
+			ret = iommu_map(dom, mc_boot_ram, mc_boot_ram, 4096*2 /* *2 is a hack to fit in PBUS */, IOMMU_READ | IOMMU_WRITE);
+			if (ret < 0) {
+				printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for MC_BOOT!\n");
+				return ret;
+			}
+			ret = iommu_map(dom, ptop_ram, ptop_ram, 4096, IOMMU_READ | IOMMU_WRITE);
+			if (ret < 0) {
+				printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for PTOP!\n");
+				return ret;
+			}
+			ret = iommu_map(dom, fifo_ram, fifo_ram, 4096*8 /* *8 is XXX hack */, IOMMU_READ | IOMMU_WRITE);
+			if (ret < 0) {
+				printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for FIFO!\n");
+				return ret;
+			}
+			ret = iommu_map(dom, chan_ctrl_ram, chan_ctrl_ram, channel_ram_size, IOMMU_READ | IOMMU_WRITE);
+			if (ret < 0) {
+				printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for PCCSR!\n");
+				return ret;
+			}
+		}
+	}
+	// TARGET_SYS_MEM_NONCOHERENT tells the GPU to bypass the CPU L2 cache for
+	// accesses to this memory.
+	// "Clients should normally use [SYS_MEM_NON_COHERENT]" (nvgpu)
+	//
+	// "Non-coherent system memory.
+	//  (GPU) MMU will NOT maintain coherence with CPU L2 cache.
+	//  Higher-level APIs should only allow this when it is known
+	//  the memory is not cacheable by CPU or the coherency is
+	//  managed explicitly (e.g. w/ flushes in SW).
+	//  Also consider that this path is not necessarily faster." (open-gpu-kernel-modules)
+	//
+	// "Coherent system memory.
+	//  (GPU) MMU will snoop CPU L2 cache if possible.
+	//  This is usually the safer choice over NONCOH since it works
+	//  whether the memory is cached by CPU L2 or not.
+	//  On some CPU architectures going through CPU L2 may
+	//  even be faster than the non-coherent path." (open-gpu-kernel-modules)
+	//
+	// I suspect that for SYS_MEM_NONCOHERENT mappings, the "no snoop"
+	// attribute bit will be set on associated PCIe read/write transactions.
+	//
+	// The only other bits in a PCIe read/write transaction that could be
+	// relevant are the two AT (Address Translation) bits added in PCIe 2.0.
+	if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + NV_MC_BOOT_0,
+			bus_mc_boot_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
+		return ret;
+	// XXX
+	if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + NV_MC_BOOT_0 + 4096,
+			bus_mc_boot_ram + 4096, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
+		return ret;
+	if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + (NV_PTOP & ~0xfffu),
+			bus_ptop_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
+		return ret;
+	if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + runlist_ram_off,
+			bus_fifo_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
+		return ret;
+	// XXX
+	for (off = 4096; off < 8*4096; off += 4096)
+		if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + runlist_ram_off + off,
+				bus_fifo_ram + off, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
+			return ret;
+	// Channel control RAM can span two or more pages on Ampere+
+	for (off = 0; off < channel_ram_size; off += 4096)
+		if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + channel_ram_off + off,
+				bus_chan_ctrl_ram + off, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
+			return ret;
+	return 0;
+}
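+// For scale (editorial note): the VID_MEM loop above issues one
+// map_page_directory() call per 2 MiB huge page, so a hypothetical 12 GiB
+// board takes 12 GiB / 2 MiB = 6144 calls, plus the handful of 4 KiB BAR0
+// mappings at the end of the function.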
+
+// Map by context ID
+// See constituent functions for info on what they do; comments not repeated.
+// Tested on Pascal, Volta, Turing, and Kepler
+ssize_t map_mem_ctxid_file_write(struct file *f, const char __user *buffer,
+		size_t count, loff_t *off) {
+	int err, target_context, target_runlist;
+	loff_t pos;
+	uint64_t instance_ptr;
+	enum INST_TARGET instance_target;
+	struct runlist_iter rl_iter;
+	instance_ctrl_t *inst;
+	context_switch_ctrl_t *ctx_block;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
+	// Passing 0 as the base to kstrtos32 indicates autodetect hex/octal/dec
+	if ((err = kstrtos32_from_user(buffer, count, 0, &target_context)))
+		return err;
+	// (Ab)use the PDE_DATA field used by file2gpuidx() for the runlist ID
+	target_runlist = file2gpuidx(f);
+
+	// Get a dereferenceable pointer to the runlist
+	if ((err = get_runlist_iter(g, target_runlist, &rl_iter)))
+		return err;
+	// Find a channel in the runlist matching the provided context ID
+	for (pos = 0; pos < rl_iter.len; pos++, rl_iter.curr_entry += NV_RL_ENTRY_SIZE(g)) {
+		uint32_t ctxsw_timeout_pri_base = NV_PFIFO_ENG_CTXSW_TIMEOUT;
+		if (entry_type(g, rl_iter.curr_entry) == ENTRY_TYPE_TSG)
+			continue;
+		// Get instance block address
+		if (g->chip_id >= NV_CHIP_ID_AMPERE) {
+			instance_ptr = ((struct gv100_runlist_chan*)rl_iter.curr_entry)->inst_ptr_hi;
+			instance_ptr <<= 32;
+			instance_ptr |= (uint64_t)inst_ptr_lo(g, rl_iter.curr_entry) << 12;
+			instance_target = inst_target(g, rl_iter.curr_entry);
+			ctxsw_timeout_pri_base = rl_iter.runlist_pri_base + NV_RUNLIST_ENGINE_CTXSW_TIMEOUT_CONFIG(0);
+		} else {
+			channel_ctrl_t chan;
+			chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chid(g, rl_iter.curr_entry)));
+			if (chan.raw == -1)
+				return -EIO;
+			instance_ptr = (uint64_t)chan.inst_ptr << 12;
+			instance_target = chan.inst_target;
+		}
+		// Skip channels with unconfigured or INVALID instance blocks
+		if (!instance_ptr || instance_target == 1) {
+			printk(KERN_WARNING "[nvdebug] Channel %d is in runlist %d, but "
+			       "lacks a valid instance block\n", chid(g, rl_iter.curr_entry),
+			       target_runlist);
+			continue;
+		}
+
+		// Get a dereferenceable pointer to the instance block
+		if (IS_ERR(inst = instance_deref(g, instance_ptr, instance_target)))
+			return PTR_ERR(inst);
+		// If unable to access instance block, skip
+		if (!inst)
+			continue;
+
+		// Get a dereferenceable pointer to the CTXSW block
+		if (IS_ERR(ctx_block = get_ctxsw(g, inst)))
+			return PTR_ERR(ctx_block);
+		// If unable to access CTXSW block, skip
+		if (!ctx_block)
+			continue;
+		// Check if the context ID matches
+		if (ctx_block->context_id != target_context)
+			continue;
+
+		// XXX: Disable the context switch timeout while we're here
+		ctxsw_timeout_t timeout_config;
+		if ((timeout_config.raw = nvdebug_readl(g, ctxsw_timeout_pri_base)) == -1)
+			return -EIO;
+		timeout_config.enabled = 0;
+		nvdebug_writel(g, ctxsw_timeout_pri_base, timeout_config.raw);
+		// XXX: Attempt setting preemption mode while we're here
+		ctx_block->compute_preemption_options = PREEMPT_CTA;
+
+		// Map memory and return
+		if ((err = map_mem_for_instance(g, inst)) < 0)
+			return err;
+		return count;
+	}
+	return -ESRCH;
+}
+
+struct file_operations map_mem_ctxid_file_ops = {
+	.write = map_mem_ctxid_file_write,
+	.llseek = default_llseek,
+};
+
+// Map by channel ID (LEGACY; unclear if this needs to be kept)
+// Support: Pascal, Volta, and Turing only
+ssize_t map_mem_chid_file_write(struct file *f, const char __user *buffer,
+		size_t count, loff_t *off) {
+	int ret, target_channel;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
+	channel_ctrl_t chan;
+	instance_ctrl_t *inst_ptr;
+	bool all = false;
+	uint64_t inst_ptr_off;
+	page_dir_config_t bar2_pd_config;
+	// Passing 0 as the base to kstrtos32 indicates autodetect hex/octal/dec
+	if ((ret = kstrtos32_from_user(buffer, count, 0, &target_channel)))
+		return ret;
+
+	if (g->chip_id >= NV_CHIP_ID_AMPERE)
+		return -ENOSYS;
+
+	// This API is for nvsched, which is only supported on GPUs which support
+	// instruction-level preemption (Pascal+).
+	if (g->chip_id < NV_CHIP_ID_PASCAL)
+		return -EOPNOTSUPP;
+
+	if (target_channel > MAX_CHID)
+		return -ERANGE;
+
+	// Passing -1 indicates that all channels should be mapped
+	if (target_channel == -1) {
+		all = true;
+		target_channel = 0;
+	}
+
+	do {
+		printk(KERN_INFO "[nvdebug] Mapping channel %d\n", target_channel);
+		// Read the channel's configuration block, which includes the address
+		// of this channel's instance block, which contains a page table
+		// pointer.
+		// TODO: Verify this works with the channel RAM changes on Ampere+
+		chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel));
+		if (chan.raw == -1)
+			return -EIO;
+
+		// If the instance pointer is unconfigured or the target is 1
+		// (INVALID), this channel is not in-use on any runlist and can be
+		// skipped.
+		if (chan.inst_ptr == 0 || chan.inst_target == 1)
+			continue;
+
+		// Find the page tables which define how BAR2 offsets are translated
+		// to physical VID_MEM/SYS_MEM addresses. (We have to do this every
+		// time, since we reset PRAMIN.)
+		if ((ret = get_bar2_pdb(g, &bar2_pd_config)) < 0)
+			return ret;
+
+		// Pascal+ GPUs use Version 2 page tables, so this shouldn't be a problem
+		if (!bar2_pd_config.is_ver2)
+			return -ENOSYS;
+
+		// To read the instance block, first find where it is mapped in BAR2
+		if ((inst_ptr_off = search_page_directory(g, bar2_pd_config, (u64)chan.inst_ptr << 12, chan.inst_target)) == 0) {
+			// If no mapping can be found in BAR2, fall back to accessing the
+			// instance block via the PRAMIN window.
+			printk(KERN_WARNING "[nvdebug] Warning: Channel %d has no instance "
+			       "block mapped in BAR2. Falling back to PRAMIN...\n", target_channel);
+			if ((ret = addr_to_pramin_mut(g, (u64)chan.inst_ptr << 12, chan.inst_target)) < 0)
+				return -EOPNOTSUPP;
+			inst_ptr = g->regs + NV_PRAMIN + ret;
+		} else {
+			inst_ptr = g->bar2 + inst_ptr_off;
+		}
+
+		if ((ret = map_mem_for_instance(g, inst_ptr)))
+			return ret;
+
+		// If mapping all channels, start again at the next one
+	} while (all && ++target_channel <= MAX_CHID);
+
+	return count;
+}
+
+struct file_operations map_mem_chid_file_ops = {
+	.write = map_mem_chid_file_write,
+	.llseek = default_llseek,
+};
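
Usage note: writing a context ID to map_mem_ctxid (or a channel ID, or -1 for
all channels, to map_mem_chid) triggers the mappings above for the matching
channel, e.g. `echo 1 > /proc/gpu0/runlist0/map_mem_ctxid` (the path is
assumed from nvdebug's per-runlist procfs layout). The program owning that
context can then presumably read GPU registers at BAR0_USER_ADDR and raw VRAM
at MEM_USER_ADDR in its GPU virtual address space.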