path: root/runlist_procfs.c
author    Joshua Bakita <bakitajoshua@gmail.com> 2025-05-05 03:53:01 -0400
committer Joshua Bakita <bakitajoshua@gmail.com> 2025-05-05 03:53:13 -0400
commit    293430fcb5d4013b573556c58457ee706e482b7f (patch)
tree      9328fa680f55b4e1a08d24714275b8437be3be5d /runlist_procfs.c
parent    494df296bf4abe9b2b484bde1a4fad28c989afec (diff)
Snapshot for ECRTS'25 artifact evaluation
Diffstat (limited to 'runlist_procfs.c')
-rw-r--r--  runlist_procfs.c | 645
1 file changed, 636 insertions(+), 9 deletions(-)
diff --git a/runlist_procfs.c b/runlist_procfs.c
index b2159f6..a3a6df3 100644
--- a/runlist_procfs.c
+++ b/runlist_procfs.c
@@ -1,12 +1,117 @@
 #include <linux/seq_file.h> // For seq_* functions and types
 #include <linux/version.h> // Macros to detect kernel version
+#include <linux/platform_device.h> // For platform_get_resource()
+#include <linux/pci.h> // For pci_resource_start()
+#include <linux/iommu.h> // For iommu_ functions
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,10,0)
+#include <linux/dma-map-ops.h> // For get_dma_ops()
+#endif
 
 #include "nvdebug_linux.h"
 
-// Uncomment to expand channel status information when printing the runlist
+// We cannot touch PRAMIN (via page table operations or ctxsw access) if we're
+// using it to walk the runlist
+//#ifndef FALLBACK_TO_PRAMIN
+// Uncomment to expand channel status, instance, and context information when
+// printing the runlist
 #define DETAILED_CHANNEL_INFO
+//#endif
 
 #ifdef DETAILED_CHANNEL_INFO
+// Print the channel instance and context switch blocks
+// XXX: THIS IS UNSAFE ON KEPLER!
+// instance_deref() will call into the page table logic, which may move PRAMIN.
+// PRAMIN appears heavily utilized by the driver on Bonham (at least), and
+// moving it causes problems.
+static int runlist_detail_seq_show_inst(struct seq_file *s, struct nvdebug_state *g, char *prefix, uint64_t instance_ptr, enum INST_TARGET instance_target) {
+	instance_ctrl_t *inst = NULL;
+	context_switch_ctrl_t *ctxsw = NULL;
+	int i;
+
+#ifdef FALLBACK_TO_PRAMIN
+	bar0_window_t win;
+	win.raw = nvdebug_readl(g, NV_XAL_EP_BAR0_WINDOW_BASE);
+	inst = g->regs + NV_PRAMIN + addr_to_pramin_mut(g, instance_ptr, instance_target);
+#else
+	if (IS_ERR(inst = instance_deref(g, instance_ptr, instance_target)))
+		return PTR_ERR(inst);
+#endif // FALLBACK_TO_PRAMIN
+	// If unable to access instance block, skip
+	if (!inst)
+		return 0;
+
+	// Print the channel instance block
+	// As an ID, use upper 52 bits of the instance address (lower 12 are zero)
+	//seq_printf(s, "%s+- Inst %-13llx-+\n", prefix, instance_ptr >> 12);
+	seq_printf(s, "%s|= Instance Block ====|\n", prefix);
+	seq_printf(s, "%s| Target Engine: %2d|\n", prefix, inst->fc_target);
+	seq_printf(s, "%s| Privileged: %1d|\n", prefix, inst->fc_config_is_priv);
+	seq_printf(s, "%s| Channel VEID: %2d|\n", prefix, inst->fc_chan_info_veid);
+	seq_printf(s, "%s| WFI PTR: |\n", prefix);
+	seq_printf(s, "%s| %#018llx|\n", prefix, (uint64_t)inst->engine_wfi_ptr << 12);
+	seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->engine_wfi_target));
+	seq_printf(s, "%s| Virtual address? %d|\n", prefix, inst->engine_wfi_is_virtual);
+	seq_printf(s, "%s| WFI VEID: %2d|\n", prefix, inst->engine_wfi_veid);
+	seq_printf(s, "%s| All PDB PTR: |\n", prefix);
+	seq_printf(s, "%s| %#018llx|\n", prefix, (u64)inst->pdb.page_dir << 12);
+	seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->pdb.target));
+	seq_printf(s, "%s| %20s|\n", prefix, inst->pdb.is_volatile ? "volatile" : "non-volatile");
+//	seq_printf(s, "%s|raw: %0#10lx|\n", prefix, inst->pdb.raw);
+	seq_printf(s, "%s| Num subcontexts: %2ld|\n", prefix, hweight64(inst->subcontext_pdb_valid));
+	// Print configuration of every enabled subcontext
+	for (i = 0; i < 64; i++) {
+		// Skip subcontexts without their enable bit set
+		if (!(1 & (inst->subcontext_pdb_valid >> i)))
+			continue;
+		seq_printf(s, "%s| CPU SC%02d ASID%7d|\n", prefix, i, inst->subcontext[i].pasid);
+		seq_printf(s, "%s| SC%02d PDB PTR: |\n", prefix, i);
+		seq_printf(s, "%s| %#018llx|\n", prefix, ((u64)inst->subcontext[i].pdb.page_dir_hi << 32) | ((u64)inst->subcontext[i].pdb.page_dir_lo << 12));
+		seq_printf(s, "%s| %20s|\n", prefix, target_to_text(inst->subcontext[i].pdb.target));
+		seq_printf(s, "%s| %20s|\n", prefix, inst->subcontext[i].pdb.is_volatile ? "volatile" : "non-volatile");
+//		seq_printf(s, "%s|raw: %0#10lx|\n", prefix, inst->subcontext[i].pdb.raw);
+	}
+
+	// XXX: CTXSW is only accessible via PRAMIN. Accessing PRAMIN appears to
+	// either be broken, or race with the driver on Kepler (gk104 tested). So,
+	// do not attempt to touch the CTXSW block on Kepler.
+	// TODO: This check should be moved into addr_to_pramin_mut().
+	if (g->chip_id < NV_CHIP_ID_MAXWELL)
+		return 0;
+	// End XXX
+
+	if (IS_ERR(ctxsw = get_ctxsw(g, inst))) {
+#ifdef FALLBACK_TO_PRAMIN
+		nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw);
+#endif
+		return PTR_ERR(ctxsw);
+	}
+	// If unable to access CTXSW block, skip
+	if (!ctxsw) {
+#ifdef FALLBACK_TO_PRAMIN
+		nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw);
+#endif
+		return 0;
+	}
+	// Access and print the preemption mode and context ID
+	seq_printf(s, "%s|= Context State =====|\n", prefix);
+	seq_printf(s, "%s| Ctx. ID: %#10x|\n", prefix, ctxsw->context_id);
+	// No other CTXSW fields are supported pre-Pascal
+	if (g->chip_id < NV_CHIP_ID_PASCAL)
+		return 0;
+	seq_printf(s, "%s| Gfx. Preemption:%4s|\n", prefix,
+	           graphics_preempt_type_to_text(ctxsw->graphics_preemption_options));
+	seq_printf(s, "%s| Cmp. Preemption:%4s|\n", prefix,
+	           compute_preempt_type_to_text(ctxsw->compute_preemption_options));
+	seq_printf(s, "%s| #WFI Saves:%9d|\n", prefix, ctxsw->num_wfi_save_operations);
+	seq_printf(s, "%s| #CTA Saves:%9d|\n", prefix, ctxsw->num_cta_save_operations);
+	seq_printf(s, "%s| #GFXP Saves:%8d|\n", prefix, ctxsw->num_gfxp_save_operations);
+	seq_printf(s, "%s| #CILP Saves:%8d|\n", prefix, ctxsw->num_cilp_save_operations);
+#ifdef FALLBACK_TO_PRAMIN
+	nvdebug_writel(g, NV_XAL_EP_BAR0_WINDOW_BASE, win.raw);
+#endif
+	return 0;
+}
+
 /* Print channel details using PCCSR (Programmable Channel Control System RAM?)
    @param s Pointer to state from seq_file subsystem to pass to seq_printf
    @param g Pointer to our internal GPU state
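
Aside: the 64-iteration subcontext scan above could also be written with the
kernel's bit-iteration helper from <linux/bitops.h>. A minimal sketch, not
part of this patch, assuming a 64-bit kernel so one unsigned long holds the
whole subcontext_pdb_valid mask:

	unsigned long valid_mask = inst->subcontext_pdb_valid;
	int i;
	// Visits only the set bits, replacing the manual shift-and-test loop
	for_each_set_bit(i, &valid_mask, 64)
		seq_printf(s, "%s| CPU SC%02d ASID%7d|\n", prefix, i,
		           inst->subcontext[i].pasid);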
@@ -32,16 +137,19 @@ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state
 	seq_printf(s, "%s| %#018llx|\n", prefix, instance_ptr);
 	seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target));
 	seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind);
-	return 0;
+	// Print instance block
+	return runlist_detail_seq_show_inst(s, g, prefix, instance_ptr, chan.inst_target);
 }
 
 /* `runlist_detail_seq_show_chan()`, but for Ampere+
+   @param instance_ptr Address for the channel instance block
+   @param instance_target Aperture of `instance_ptr`
    @param runlist_pri_base Base of the RLRAM region for this runlist
 
    `runlist_pri_base` is necessary, since Channel RAM is now per-runlist on
    Ampere+, and its location is configured in Runlist RAM.
 */
-static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base) {
+static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base, uint64_t instance_ptr, enum INST_TARGET instance_target) {
 	runlist_channel_config_t channel_config;
 	channel_ctrl_ga100_t chan;
 
@@ -63,7 +171,7 @@ static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug
 	seq_printf(s, "%s| PBDMA Busy: %d|\n", prefix, chan.pbdma_busy);
 	seq_printf(s, "%s| ENG Busy: %d|\n", prefix, chan.eng_busy);
 	seq_printf(s, "%s| Acquire Fail: %d|\n", prefix, chan.acquire_fail);
-	return 0;
+	return runlist_detail_seq_show_inst(s, g, prefix, instance_ptr, instance_target);
 }
 #endif
 
@@ -173,7 +281,7 @@ static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) {
 		if (g->chip_id < NV_CHIP_ID_AMPERE)
 			runlist_detail_seq_show_chan(s, g, chid(g, entry), indt);
 		else
-			runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base);
+			runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base, instance_ptr, inst_target(g, entry));
 #endif
 		seq_printf(s, "%s+---------------------+\n", indt);
 	}
@@ -232,15 +340,17 @@ struct file_operations preempt_tsg_file_ops = {
 
 ssize_t resubmit_runlist_file_write(struct file *f, const char __user *buffer,
 		size_t count, loff_t *off) {
-	uint32_t target_runlist;
+	uint32_t target_runlist, target_offset;
 	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
 	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
-	int err = kstrtou32_from_user(buffer, count, 0, &target_runlist);
+	int err = kstrtou32_from_user(buffer, count, 0, &target_offset);
 	if (err)
 		return err;
+	// (Ab)use the PDE_DATA field for the runlist ID
+	target_runlist = file2gpuidx(f);
 
 	// resubmit_runlist() checks that target_runlist is valid
-	if ((err = resubmit_runlist(g, target_runlist)))
+	if ((err = resubmit_runlist(g, target_runlist, target_offset)))
 		return err;
 
 	return count;
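
Aside: the "(Ab)use the PDE_DATA field" trick above stores the runlist ID in
the proc entry's data pointer at registration time. A hedged sketch of the
pattern (the directory and ID names are illustrative; nvdebug's actual
registration code is not part of this hunk):

	#include <linux/proc_fs.h>
	// At procfs setup: stash the runlist ID in the entry's data field
	proc_create_data("resubmit_runlist", 0222, runlist_dir,
	                 &resubmit_runlist_file_ops, (void *)(uintptr_t)rl_id);
	// In the handler, file2gpuidx() presumably reduces to this
	// (pde_data() is spelled PDE_DATA() before Linux 5.17):
	target_runlist = (uintptr_t)pde_data(file_inode(f));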
@@ -351,6 +461,54 @@ struct file_operations enable_channel_file_ops = {
 	.llseek = default_llseek,
 };
 
+ssize_t comm_preempt_channel_file_write(struct file *f, const char __user *buf,
+		size_t count, loff_t *off,
+		enum COMPUTE_PREEMPT_TYPE mode) {
+	uint32_t target_channel, target_runlist;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
+	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
+	int err = kstrtou32_from_user(buf, count, 0, &target_channel);
+	if (err)
+		return err;
+	// (Ab)use the PDE_DATA field used by file2gpuidx() for the runlist ID
+	target_runlist = file2gpuidx(f);
+	// Set preemption mode for the context of this channel
+	if ((err = set_channel_preemption_mode(g, target_channel, target_runlist, mode)))
+		return err;
+
+	return count;
+}
+
+ssize_t wfi_preempt_channel_file_write(struct file *f, const char __user *buf,
+		size_t count, loff_t *off) {
+	return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_WFI);
+}
+
+struct file_operations wfi_preempt_channel_file_ops = {
+	.write = wfi_preempt_channel_file_write,
+	.llseek = default_llseek,
+};
+
+ssize_t cta_preempt_channel_file_write(struct file *f, const char __user *buf,
+		size_t count, loff_t *off) {
+	return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_CTA);
+}
+
+struct file_operations cta_preempt_channel_file_ops = {
+	.write = cta_preempt_channel_file_write,
+	.llseek = default_llseek,
+};
+
+ssize_t cil_preempt_channel_file_write(struct file *f, const char __user *buf,
+		size_t count, loff_t *off) {
+	return comm_preempt_channel_file_write(f, buf, count, off, PREEMPT_CILP);
+}
+
+struct file_operations cil_preempt_channel_file_ops = {
+	.write = cil_preempt_channel_file_write,
+	.llseek = default_llseek,
+};
+
 // Tested working on Pascal (gp106) through Ada (ad102)
 ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
 		size_t count, loff_t *off) {
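
Usage note: each of the three preempt files above takes a channel ID and
applies the corresponding compute preemption mode (WFI, CTA, or CILP) to that
channel's context, e.g. `echo 5 > /proc/gpu0/runlist0/cta_preempt_channel`.
The procfs path here is assumed from the per-GPU, per-runlist layout implied
by file2parentgpuidx()/file2gpuidx(), not shown in this hunk.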
@@ -419,11 +577,13 @@ ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
 
 		// TODO: Fix the above for bare channels. Add "for_chan_until_tsg"?
 	}
+#warning switch_to_tsg has preempt_runlist omitted!
+	return count;
 
 	// Resubmit the runlist to ensure that changes to channel enablement are
 	// picked up on Turing+ GPUs (channel enablements may not be otherwise).
 	if (g->chip_id >= NV_CHIP_ID_TURING)
-		if ((err = resubmit_runlist(g, target_runlist)))
+		if ((err = resubmit_runlist(g, target_runlist, -1)))
 			return err;
 
 	// Trigger a runlist-level preempt to stop whatever was running, triggering
@@ -438,3 +598,470 @@ struct file_operations switch_to_tsg_file_ops = {
 	.write = switch_to_tsg_file_write,
 	.llseek = default_llseek,
 };
+
+ssize_t preempt_runlist_file_write(struct file *f, const char __user *buffer,
+		size_t count, loff_t *off) {
+	uint32_t target_runlist;
+	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
+	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
+	int err = kstrtou32_from_user(buffer, count, 0, &target_runlist);
+	if (err)
+		return err;
+
+	// TODO: Check runlist is in-range
+	if ((err = preempt_runlist(g, target_runlist)))
+		return err;
+
+	return count;
+}
+
+struct file_operations preempt_runlist_file_ops = {
+	.write = preempt_runlist_file_write,
+	.llseek = default_llseek,
+};
+
+// Value written to this file is which runlist to ack the IRQ for
+ssize_t ack_bad_tsg_file_write(struct file *f, const char __user *buffer,
+		size_t count, loff_t *off) {
+	uint32_t target_runlist;
+	uint32_t rl_ram_off;
+	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
+	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
+	int err = kstrtou32_from_user(buffer, count, 0, &target_runlist);
+	if (err)
+		return err;
+
+	if ((err = get_runlist_ram(g, target_runlist, &rl_ram_off)))
+		return err;
+
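+	// Editorial note: 0x100 is presumably this runlist's INTR_0 register,
+	// with bit 12 being the BAD_TSG interrupt this file acknowledges; the
+	// register name and bit position are inferred from the file's purpose,
+	// not named in this patch.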
+	nvdebug_writel(g, rl_ram_off + 0x100, 1 << 12);
+
+	return count;
+}
+
+struct file_operations ack_bad_tsg_file_ops = {
+	.write = ack_bad_tsg_file_write,
+	.llseek = default_llseek,
+};
+
+// Rather than mapping all of BAR0, we just map:
+// - On Pascal, Volta, Turing: MC_BOOT, PFIFO, PCCSR, PTOP
+// - On Ampere: MC_BOOT, RAMRL(0), CHRAM(0), PTOP
+// "All CUDA-managed pointers are within the first 40 bits of the process's
+// VA space" (Sec. 4.1, GPUDirect RDMA Documentation)
+// - This means 0x00ff_ffff_ffff is the highest valid CUDA virtual address,
+//   and all higher addresses are unused.
+// - So we use 0x6000_0000_0000+; this falls within the first PDE3 entry, and
+//   at the end of the PDE2 entries
+//   + Using the second PDE3 entry did not appear to work on Jetson (IIRC)
+#define BAR0_USER_ADDR 0x0000700000000000llu
+#define MEM_USER_ADDR  0x0000600000000000llu
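+// Editorial sanity check, not part of the original patch: assuming the v2
+// page-table split in which the top level (PDE3) is indexed by VA bits
+// 48:47, one PDE3 entry spans 1ULL << 47 bytes, so both windows fall in
+// PDE3 entry 0 and sit well above the highest 40-bit CUDA-managed address.
+_Static_assert((MEM_USER_ADDR >> 47) == 0, "MEM_USER_ADDR in first PDE3 entry");
+_Static_assert((BAR0_USER_ADDR >> 47) == 0, "BAR0_USER_ADDR in first PDE3 entry");
+_Static_assert(MEM_USER_ADDR > 0x00ffffffffffull, "clear of CUDA-managed VAs");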
+
+/* Map all of GPU VRAM, and selected BAR0 regions, into a channel instance's
+ * virtual address space at predefined offsets (above).
+ *
+ * @param g        Pointer to the nvdebug state for the selected GPU
+ * @param inst_ptr Dereferenceable pointer to the channel's instance block
+ * @returns 0 on success, -errno on error
+ *
+ * Support: Pascal, Volta, Turing, Ampere
+ */
+int map_mem_for_instance(struct nvdebug_state *g, instance_ctrl_t *inst_ptr) {
+	int ret;
+	uintptr_t off, ram_size;
+	dma_addr_t bus_mc_boot_ram, bus_ptop_ram, bus_fifo_ram, bus_chan_ctrl_ram;
+	uint64_t mc_boot_ram, ptop_ram, fifo_ram, chan_ctrl_ram;
+	page_dir_config_t chan_pd_config;
+	memory_range_t mem_range;
+	uint32_t channel_ram_off, runlist_ram_off, channel_ram_size, bar0_base;
+	struct iommu_domain *dom;
+
+	if (g->chip_id >= NV_CHIP_ID_AMPERE) {
+		runlist_channel_config_t channel_config;
+		if ((ret = get_runlist_ram(g, 0, &runlist_ram_off))) {
+			printk(KERN_ERR "[nvdebug] %s: Unable to determine location of runlist0 RAM!\n", __func__);
+			return ret;
+		}
+		if (runlist_ram_off & 0xfff) {
+			printk(KERN_ERR "[nvdebug] %s: Runlist0 RAM is not page-aligned!\n", __func__);
+			return -EAFNOSUPPORT;
+		}
+		if ((channel_config.raw = nvdebug_readl(g, runlist_ram_off + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1)
+			return -EIO;
+		channel_ram_off = (uint32_t)channel_config.bar0_offset << 4;
+		if (channel_ram_off & 0xfff) {
+			printk(KERN_ERR "[nvdebug] %s: Runlist0 CHRAM is not page-aligned!\n", __func__);
+			return -EAFNOSUPPORT;
+		}
+		channel_ram_size = (1 << channel_config.num_channels_log2) * sizeof(channel_ctrl_ga100_t);
+		printk(KERN_DEBUG "[nvdebug] %s: Mapping CHRAM at %#018llx--%x and RLRAM at %#018llx--%x.\n", __func__, BAR0_USER_ADDR + channel_ram_off, channel_ram_size - 1, BAR0_USER_ADDR + runlist_ram_off, 4095);
+	} else {
+		channel_ram_off = NV_PCCSR;
+		// MAX_CHID * sizeof(channel_ctrl_gf100_t) is < 4 KiB, so hardcode
+		channel_ram_size = 4096;
+		runlist_ram_off = NV_PFIFO;
+	}
+
+	// map_mem_by_chid() pulls the instance block via PRAMIN, so inst_ptr will
+	// be invalid after moving PRAMIN (eg. as part of a page table operation).
+	// To avoid accessing inst_ptr after invalidation, keep a copy of what we
+	// need.
+	chan_pd_config = inst_ptr->pdb;
+
+	// map_page_directory_v1() is unimplemented, precluding Maxwell (or older)
+	// support (as they don't support v2 page tables).
+	if (!chan_pd_config.is_ver2)
+		return -EOPNOTSUPP;
+
+	// Determine the size of GPU physical memory (VRAM).
+	if ((mem_range.raw = nvdebug_readl(g, NV_FB_MMU_LOCAL_MEMORY_RANGE)) == -1)
+		return -EIO;
+	ram_size = memory_range_to_bytes(mem_range);
+
+	// We map memory using huge pages, and thus do not support GPUs with
+	// non-2-MiB-divisible VID_MEM sizes.
+	if (ram_size % (1 << 21) != 0) {
+		printk(KERN_ERR "[nvdebug] %s: GPU VID_MEM of %lu bytes is not a multiple of 2 MiB!\n", __func__, ram_size);
+		return -EAFNOSUPPORT;
+	}
+
+	// Map all of physical GPU memory (VID_MEM) into this channel's GPU virtual
+	// address space using huge (2 MiB) pages.
+	for (off = 0; off < ram_size; off += (1 << 21)) {
+		if ((ret = map_page_directory(g, chan_pd_config,
+				MEM_USER_ADDR + off, off, TARGET_VID_MEM, true)) < 0)
+			return ret;
+		// If the mapping already exists for this page directory, the other
+		// mappings should already exist, and can be skipped.
+		if (ret == 1) {
+			printk(KERN_INFO "[nvdebug] %s: VRAM mapping from %llx to %lx already exists. Assuming all mappings already exist and returning early...\n", __func__, MEM_USER_ADDR + off, off);
+			return 0;
+		}
+	}
+
+	// Map Channel RAM to a GPU-accessible bus address (gets past any IOMMU or
+	// IOVA layers), then map that address into this channel's GPU virtual
+	// address space. NV_PCCSR_CHANNEL_INST(0) is 4k-aligned, so it can be
+	// directly mapped.
+	// XXX: All these mappings are currently returning -1 on all reads on
+	//      sunlight, jbakita-old, jetson-xavier, jetson-orin, and bonham,
+	//      which seems to be returned from the PCIe root (on PCIe GPUs).
+	if (g->pcid)
+		bar0_base = pci_resource_start(g->pcid, 0);
+	else if (g->platd)
+		bar0_base = platform_get_resource(g->platd, IORESOURCE_MEM, 0)->start;
+	else
+		return -ENOTRECOVERABLE;
+	mc_boot_ram = NV_MC_BOOT_0 + bar0_base;
+	// PTOP fits within a page, but is not page-aligned; round down.
+	ptop_ram = (NV_PTOP & ~0xfffu) + bar0_base;
+	fifo_ram = runlist_ram_off + bar0_base;
+	chan_ctrl_ram = channel_ram_off + bar0_base;
+
+	// Check if GPU-accessible bus addresses are the same as CPU-visible
+	// physical addresses. Logic from amdgpu_device_check_iommu_direct_map().
+	dom = iommu_get_domain_for_dev(g->dev);
+	if (!dom || dom->type == IOMMU_DOMAIN_IDENTITY) {
+		// Used for: jbakita-old, sunlight, jetson-xavier, jetson-orin integrated, bonham, ?
+		// (For all these, reads on the mapping return only -1.)
+		// (Forcing these through dma_map_resource()/iommu_map() changes nothing.)
+		// (Note that `ls -l /sys/class/iommu/*/devices` also reports that the
+		// GPU is not available under the I/O MMU on these platforms.)
+		// To fix this, please enable AMD-Vi/ARM SMMU/Intel VT-d in your BIOS
+		// settings, UEFI settings, or device-tree file. Supported on:
+		// - AMD: Bulldozer+ (or Phenom II w/ 890FX or 990FX chipset)
+		// - Intel: Most since Core2 Duo
+		// Note that while the Jetson Orin has an SMMU (I/O MMU), the GPU does
+		// not appear to be configured by any pre-provided device tree files
+		// to use the SMMU.
+		printk(KERN_INFO "[nvdebug] map_mem_ctxid: I/O MMU is unavailable/disabled for GPU %x. Assuming phys and bus addresses are identical...\n", g->chip_id);
+		bus_mc_boot_ram = mc_boot_ram;
+		bus_ptop_ram = ptop_ram;
+		bus_fifo_ram = fifo_ram;
+		bus_chan_ctrl_ram = chan_ctrl_ram;
+	} else {
+		printk(KERN_INFO "[nvdebug] map_mem_ctxid: I/O MMU is enabled. Attempting to use dma_map_resource()...\n");
+		// Used for: tama, yamaha
+		// Fails on tama, yamaha
+		// (Works on jetson-xavier, jetson-orin, and bonham, but appears to be
+		// a no-op, and yields inaccessible memory. Get `mc-err: (255)
+		// csr_nvl7r: EMEM address decode error` on access on jetson boards,
+		// and a -1 read on all.)
+		bus_mc_boot_ram = dma_map_resource(g->dev, mc_boot_ram, 4096*2 /* *2 is a XXX hack to include PBUS */, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+		bus_ptop_ram = dma_map_resource(g->dev, ptop_ram, 4096, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+		bus_fifo_ram = dma_map_resource(g->dev, fifo_ram, 4096*8 /* *8 is a XXX hack */, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+		bus_chan_ctrl_ram = dma_map_resource(g->dev, chan_ctrl_ram, 2*4096, DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC);
+		if (dma_mapping_error(g->dev, bus_mc_boot_ram) ||
+		    dma_mapping_error(g->dev, bus_ptop_ram) ||
+		    dma_mapping_error(g->dev, bus_fifo_ram) ||
+		    dma_mapping_error(g->dev, bus_chan_ctrl_ram)) {
+			// Used for: tama, yamaha
+			printk(KERN_WARNING "[nvdebug] map_mem_ctxid: Unable to map BAR0 addresses to device-accessible addresses via dma_map_resource(). Return codes: %d for MC_BOOT, %d for PFIFO, %d for PCCSR.\n",
+			       dma_mapping_error(g->dev, bus_mc_boot_ram),
+			       dma_mapping_error(g->dev, bus_fifo_ram),
+			       dma_mapping_error(g->dev, bus_chan_ctrl_ram));
+			// This fallback does not appear to work on jbakita-old (5.4, GART IOMMU), but works on tama
+			if (!get_dma_ops(g->dev))
+				printk(KERN_WARNING "[nvdebug] Reason: No DMA `ops`, and direct mapping failed.\n");
+			else if (!get_dma_ops(g->dev)->map_resource)
+				// Fires on: tama, yamaha
+				printk(KERN_WARNING "[nvdebug] Reason: `map_resource` function undefined on this platform.\n");
+			if (!dom) {
+				printk(KERN_ERR "[nvdebug] map_mem_ctxid: No I/O MMU available and dma_map_resource() failed. Aborting mapping of BAR0 regions!\n");
+				return -ENOTRECOVERABLE;
+			}
+			printk(KERN_INFO "[nvdebug] map_mem_ctxid: Trying to fall back to direct I/O MMU manipulation...\n");
+			// XXX: Fall back to directly creating the I/O MMU mappings.
+			// This is necessary. Directly accessing BAR0 addresses throws
+			// I/O MMU errors in the kernel log on yamaha.
+			// See also: comment on kfd_mem_dmamap_sg_bo() in amdgpu.
+			// Note: dma_map_resource -> map_resource -> [arm_]iommu_map_resource
+			// -> __iommu_dma_map -> iommu_map is the happy path, but this seems
+			// to regularly fail, even though the iommu_map path works. One key
+			// difference is that the dma_map_resource() path also includes
+			// IOMMU_MMIO in the iommu_map() flags.
+			bus_mc_boot_ram = mc_boot_ram;
+			bus_ptop_ram = ptop_ram;
+			bus_fifo_ram = fifo_ram;
+			bus_chan_ctrl_ram = chan_ctrl_ram;
+			// Create identity mappings
+			ret = iommu_map(dom, mc_boot_ram, mc_boot_ram, 4096*2 /* *2 is a hack to fit in PBUS */, IOMMU_READ | IOMMU_WRITE);
+			if (ret < 0) {
+				printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for MC_BOOT!\n");
+				return ret;
+			}
+			ret = iommu_map(dom, ptop_ram, ptop_ram, 4096, IOMMU_READ | IOMMU_WRITE);
+			if (ret < 0) {
+				printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for PTOP!\n");
+				return ret;
+			}
+			ret = iommu_map(dom, fifo_ram, fifo_ram, 4096*8 /* *8 is XXX hack */, IOMMU_READ | IOMMU_WRITE);
+			if (ret < 0) {
+				printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for FIFO!\n");
+				return ret;
+			}
+			ret = iommu_map(dom, chan_ctrl_ram, chan_ctrl_ram, channel_ram_size, IOMMU_READ | IOMMU_WRITE);
+			if (ret < 0) {
+				printk(KERN_ERR "[nvdebug] map_mem_ctxid: Attempt to bypass and go directly to I/O MMU failed for PCCSR!\n");
+				return ret;
+			}
+		}
+	}
+	// TARGET_SYS_MEM_NONCOHERENT tells the GPU to bypass the CPU L2 cache for
+	// accesses to this memory.
+	// "Clients should normally use [SYS_MEM_NON_COHERENT]" (nvgpu)
+	//
+	// "Non-coherent system memory.
+	//  (GPU) MMU will NOT maintain coherence with CPU L2 cache.
+	//  Higher-level APIs should only allow this when it is known
+	//  the memory is not cacheable by CPU or the coherency is
+	//  managed explicitly (e.g. w/ flushes in SW).
+	//  Also consider that this path is not necessarily faster." (open-gpu-kernel-modules)
+	//
+	// "Coherent system memory.
+	//  (GPU) MMU will snoop CPU L2 cache if possible.
+	//  This is usually the safer choice over NONCOH since it works
+	//  whether the memory is cached by CPU L2 or not.
+	//  On some CPU architectures going through CPU L2 may
+	//  even be faster than the non-coherent path." (open-gpu-kernel-modules)
+	//
+	// I suspect that for SYS_MEM_NONCOHERENT mappings, the "no snoop"
+	// attribute bit will be set on associated PCIe read/write transactions.
+	//
+	// The only other bits in a PCIe read/write transaction that could be
+	// relevant are the two AT (Address Translation) bits added in PCIe 2.0.
+	if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + NV_MC_BOOT_0,
+			bus_mc_boot_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
+		return ret;
+	// XXX
+	if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + NV_MC_BOOT_0 + 4096,
+			bus_mc_boot_ram + 4096, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
+		return ret;
+	if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + (NV_PTOP & ~0xfffu),
+			bus_ptop_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
+		return ret;
+	if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + runlist_ram_off,
+			bus_fifo_ram, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
+		return ret;
+	// XXX
+	for (off = 4096; off < 8*4096; off += 4096)
+		if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + runlist_ram_off + off,
+				bus_fifo_ram + off, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
+			return ret;
+	// Channel control RAM can span two or more pages on Ampere+
+	for (off = 0; off < channel_ram_size; off += 4096)
+		if ((ret = map_page_directory(g, chan_pd_config, BAR0_USER_ADDR + channel_ram_off + off,
+				bus_chan_ctrl_ram + off, TARGET_SYS_MEM_NONCOHERENT, false)) < 0)
+			return ret;
+	return 0;
+}
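+// For scale (editorial note): the VID_MEM loop above issues one
+// map_page_directory() call per 2 MiB huge page, so a hypothetical 12 GiB
+// board takes 12 GiB / 2 MiB = 6144 calls, plus the handful of 4 KiB BAR0
+// mappings at the end of the function.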
+
+// Map by context ID
+// See constituent functions for info on what they do; comments not repeated.
+// Tested on Pascal, Volta, Turing, and Kepler
+ssize_t map_mem_ctxid_file_write(struct file *f, const char __user *buffer,
+		size_t count, loff_t *off) {
+	int err, target_context, target_runlist;
+	loff_t pos;
+	uint64_t instance_ptr;
+	enum INST_TARGET instance_target;
+	struct runlist_iter rl_iter;
+	instance_ctrl_t *inst;
+	context_switch_ctrl_t *ctx_block;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
+	// Passing 0 as the base to kstrtos32 indicates autodetect hex/octal/dec
+	if ((err = kstrtos32_from_user(buffer, count, 0, &target_context)))
+		return err;
+	// (Ab)use the PDE_DATA field used by file2gpuidx() for the runlist ID
+	target_runlist = file2gpuidx(f);
+
+	// Get a dereferenceable pointer to the runlist
+	if ((err = get_runlist_iter(g, target_runlist, &rl_iter)))
+		return err;
+	// Find a channel in the runlist matching the provided context ID
+	for (pos = 0; pos < rl_iter.len; pos++, rl_iter.curr_entry += NV_RL_ENTRY_SIZE(g)) {
+		uint32_t ctxsw_timeout_pri_base = NV_PFIFO_ENG_CTXSW_TIMEOUT;
+		if (entry_type(g, rl_iter.curr_entry) == ENTRY_TYPE_TSG)
+			continue;
+		// Get instance block address
+		if (g->chip_id >= NV_CHIP_ID_AMPERE) {
+			instance_ptr = ((struct gv100_runlist_chan*)rl_iter.curr_entry)->inst_ptr_hi;
+			instance_ptr <<= 32;
+			instance_ptr |= (uint64_t)inst_ptr_lo(g, rl_iter.curr_entry) << 12;
+			instance_target = inst_target(g, rl_iter.curr_entry);
+			ctxsw_timeout_pri_base = rl_iter.runlist_pri_base + NV_RUNLIST_ENGINE_CTXSW_TIMEOUT_CONFIG(0);
+		} else {
+			channel_ctrl_t chan;
+			chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chid(g, rl_iter.curr_entry)));
+			if (chan.raw == -1)
+				return -EIO;
+			instance_ptr = (uint64_t)chan.inst_ptr << 12;
+			instance_target = chan.inst_target;
+		}
+		// Skip channels with unconfigured or INVALID instance blocks
+		if (!instance_ptr || instance_target == 1) {
+			printk(KERN_WARNING "[nvdebug] Channel %d is in runlist %d, but "
+			       "lacks a valid instance block\n", chid(g, rl_iter.curr_entry),
+			       target_runlist);
+			continue;
+		}
+
+		// Get a dereferenceable pointer to the instance block
+		if (IS_ERR(inst = instance_deref(g, instance_ptr, instance_target)))
+			return PTR_ERR(inst);
+		// If unable to access instance block, skip
+		if (!inst)
+			continue;
+
+		// Get a dereferenceable pointer to the CTXSW block
+		if (IS_ERR(ctx_block = get_ctxsw(g, inst)))
+			return PTR_ERR(ctx_block);
+		// If unable to access CTXSW block, skip
+		if (!ctx_block)
+			continue;
+		// Check if the context ID matches
+		if (ctx_block->context_id != target_context)
+			continue;
+
+		// XXX: Disable the context switch timeout while we're here
+		ctxsw_timeout_t timeout_config;
+		if ((timeout_config.raw = nvdebug_readl(g, ctxsw_timeout_pri_base)) == -1)
+			return -EIO;
+		timeout_config.enabled = 0;
+		nvdebug_writel(g, ctxsw_timeout_pri_base, timeout_config.raw);
+		// XXX: Attempt setting preemption mode while we're here
+		ctx_block->compute_preemption_options = PREEMPT_CTA;
+
+		// Map memory and return
+		if ((err = map_mem_for_instance(g, inst)) < 0)
+			return err;
+		return count;
+	}
+	return -ESRCH;
+}
+
+struct file_operations map_mem_ctxid_file_ops = {
+	.write = map_mem_ctxid_file_write,
+	.llseek = default_llseek,
+};
+
+// Map by channel ID (LEGACY; unclear if this needs to be kept)
+// Support: Pascal, Volta, and Turing only
+ssize_t map_mem_chid_file_write(struct file *f, const char __user *buffer,
+		size_t count, loff_t *off) {
+	int ret, target_channel;
+	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
+	channel_ctrl_t chan;
+	instance_ctrl_t *inst_ptr;
+	bool all = false;
+	uint64_t inst_ptr_off;
+	page_dir_config_t bar2_pd_config;
+	// Passing 0 as the base to kstrtos32 indicates autodetect hex/octal/dec
+	if ((ret = kstrtos32_from_user(buffer, count, 0, &target_channel)))
+		return ret;
+
+	if (g->chip_id >= NV_CHIP_ID_AMPERE)
+		return -ENOSYS;
+
+	// This API is for nvsched, which is only supported on GPUs which support
+	// instruction-level preemption (Pascal+).
+	if (g->chip_id < NV_CHIP_ID_PASCAL)
+		return -EOPNOTSUPP;
+
+	if (target_channel > MAX_CHID)
+		return -ERANGE;
+
+	// Passing -1 indicates that all channels should be mapped
+	if (target_channel == -1) {
+		all = true;
+		target_channel = 0;
+	}
+
+	do {
+		printk(KERN_INFO "[nvdebug] Mapping channel %d\n", target_channel);
+		// Read the channel's configuration block, which includes the address
+		// of this channel's instance block, which contains a page table
+		// pointer.
+		// TODO: Verify this works with the channel RAM changes on Ampere+
+		chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel));
+		if (chan.raw == -1)
+			return -EIO;
+
+		// If the instance pointer is unconfigured or the target is 1
+		// (INVALID), this channel is not in-use on any runlist and can be
+		// skipped.
+		if (chan.inst_ptr == 0 || chan.inst_target == 1)
+			continue;
+
+		// Find the page tables which define how BAR2 offsets are translated
+		// to physical VID_MEM/SYS_MEM addresses. (We have to do this every
+		// time, since we reset PRAMIN.)
+		if ((ret = get_bar2_pdb(g, &bar2_pd_config)) < 0)
+			return ret;
+
+		// Pascal+ GPUs use Version 2 page tables, so this shouldn't be a problem
+		if (!bar2_pd_config.is_ver2)
+			return -ENOSYS;
+
+		// To read the instance block, first find where it is mapped in BAR2
+		if ((inst_ptr_off = search_page_directory(g, bar2_pd_config, (u64)chan.inst_ptr << 12, chan.inst_target)) == 0) {
+			// If no mapping can be found in BAR2, fall back to accessing the
+			// instance block via the PRAMIN window.
+			printk(KERN_WARNING "[nvdebug] Warning: Channel %d has no instance "
+			       "block mapped in BAR2. Falling back to PRAMIN...\n", target_channel);
+			if ((ret = addr_to_pramin_mut(g, (u64)chan.inst_ptr << 12, chan.inst_target)) < 0)
+				return -EOPNOTSUPP;
+			inst_ptr = g->regs + NV_PRAMIN + ret;
+		} else {
+			inst_ptr = g->bar2 + inst_ptr_off;
+		}
+
+		if ((ret = map_mem_for_instance(g, inst_ptr)))
+			return ret;
+
+		// If mapping all channels, start again at the next one
+	} while (all && ++target_channel <= MAX_CHID);
+
+	return count;
+}
+
+struct file_operations map_mem_chid_file_ops = {
+	.write = map_mem_chid_file_write,
+	.llseek = default_llseek,
+};
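
Usage note: writing a context ID to map_mem_ctxid (or a channel ID, or -1 for
all channels, to map_mem_chid) triggers the mappings above for the matching
channel, e.g. `echo 1 > /proc/gpu0/runlist0/map_mem_ctxid` (the path is
assumed from nvdebug's per-runlist procfs layout). The program owning that
context can then presumably read GPU registers at BAR0_USER_ADDR and raw VRAM
at MEM_USER_ADDR in its GPU virtual address space.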