author    Joshua Bakita <bakitajoshua@gmail.com>  2024-04-11 12:23:18 -0400
committer Joshua Bakita <jbakita@cs.unc.edu>      2024-04-11 13:03:20 -0400
commit    a8fd5a8dee066d0008e7667b0c9e6a60cd5f3a2e (patch)
tree      f05095d4b6458a709034a182649e6d16b6a8558a
parent    5ea953292441e31e37ae074e48d8b3b5ce1d9440 (diff)
Support page directories outside PRAMIN or in SYS_MEM
- Re-read PRAMIN configuration after update to verify the change applies.
- Return a page_dir_config_t rather than just an address and page table version from `get_bar2_pdb()`.
- Less verbose logging for MMU-related functions by default.
- Perform all conversion from SYS_MEM/VID_MEM addresses to kernel addresses inside the translation functions, via the new function `pd_deref()`.
- Support use of an I/O MMU, page tables/directories outside the current PRAMIN window, and page tables/directories arbitrarily located in SYS_MEM or VID_MEM on different levels of the same tree.
- Heavily improve documentation and add references for Version 1 and Version 0 page tables.
- Improve logging in `runlist.c` to include runlist and chip IDs.
- Update all users of search_page_directory* to use the new API.
- Remove now-unused supporting functions from `mmu.c`.

Tested on GTX 970, GTX 1060 3GB, Jetson TX2, Titan V, Jetson Xavier, and RTX 2080 Ti.
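For orientation, here is a minimal sketch (not part of the patch) of how callers are expected to use the reworked API; it mirrors the runlist.c hunk further down. The helper name find_bar2_mapping and its error handling are illustrative only.

#include "nvdebug.h"

// Hypothetical caller of the reworked API; mirrors the runlist.c changes below.
static int find_bar2_mapping(struct nvdebug_state *g, uint64_t phys_addr,
                             uint64_t *bar2_vaddr) {
	page_dir_config_t pd_config;
	int err;
	// get_bar2_pdb() now fills a page_dir_config_t (base, aperture, and page
	// table version) rather than returning a PRAMIN-based pointer.
	if ((err = get_bar2_pdb(g, &pd_config)) < 0)
		return err;
	// The search functions take the configuration directly; translation of
	// VID_MEM/SYS_MEM addresses happens internally via pd_deref(), so no
	// off2addr callback (such as phy2PRAMIN) is passed anymore.
	if (pd_config.is_ver2)
		*bar2_vaddr = search_page_directory(g, pd_config, phys_addr);
	else
		*bar2_vaddr = search_v1_page_directory(g, pd_config, phys_addr);
	return *bar2_vaddr ? 0 : -EOPNOTSUPP;
}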
-rw-r--r--   bus.c              50
-rw-r--r--   mmu.c             206
-rw-r--r--   nvdebug.h         158
-rw-r--r--   nvdebug_entry.c     2
-rw-r--r--   runlist.c          33
5 files changed, 268 insertions, 181 deletions
diff --git a/bus.c b/bus.c
index 802b6df..951ac77 100644
--- a/bus.c
+++ b/bus.c
@@ -57,35 +57,25 @@ relocate:
57 window.base = (u32)(addr >> 16); // Safe, due to above range check 57 window.base = (u32)(addr >> 16); // Safe, due to above range check
58 window.target = target; 58 window.target = target;
59 nvdebug_writel(g, NV_PBUS_BAR0_WINDOW, window.raw); 59 nvdebug_writel(g, NV_PBUS_BAR0_WINDOW, window.raw);
60 // Wait for the window to move by re-reading (as done in nvgpu driver)
61 (void) nvdebug_readl(g, NV_PBUS_BAR0_WINDOW);
60 return (int)(addr & 0xffffull); 62 return (int)(addr & 0xffffull);
61} 63}
62 64
63 65/* Get a copy of the BAR2 page directory configuration (base and aperture)
64/* Get a persistent pointer to the page directory base 66 @param pd Pointer at which to store the configuration, including a pointer
65 @param pdb Dereferencable pointer to the zeroeth entry of top-level page 67 and aperture for the zeroth entry of the top-level page directory
66 directory (PD3) for the BAR2 register region. 68 (PD3 for V2 page tables). This pointer **may not** be directly
67 Note: The returned pointer will be into the PRAMIN space. If the PRAMIN 69 dereferencable, and the caller may need to shift the BAR2 window.
68 window is moved to a region that does not cover the BAR2 page table, 70 @return 0 on success, -errno on error.
69 this ***will move the window***. 71 Note: This may move the PRAMIN window.
70 Note: Even if the page table is located in SYS_MEM, we route reads/writes via
71 PRAMIN. This ensures that we always see what the GPU sees, and that
72 includes any passes through I/O MMUs or IOVA spaces.
73*/ 72*/
74int get_bar2_pdb(struct nvdebug_state *g, void **pdb, bool *is_v2_pdb) { 73int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd) {
75 static void* cached_pdb = NULL;
76 static bool cached_is_v2_pdb = false;
77 static long pd_hash = 0;
78 int ret; 74 int ret;
79 bar_config_block_t bar2_block; 75 bar_config_block_t bar2_block;
80 page_dir_config_t pd_config;
81 uint64_t pdb_vram;
82 76
83 // Use cached base as long as it's still pointing to the same thing 77 if (!pd)
84 if (cached_pdb && readl(cached_pdb) == pd_hash) { 78 return -EINVAL;
85 *pdb = cached_pdb;
86 *is_v2_pdb = cached_is_v2_pdb;
87 return 0;
88 }
89 79
90 if (!g->bar2) 80 if (!g->bar2)
91 return -ENXIO; 81 return -ENXIO;
@@ -107,24 +97,10 @@ int get_bar2_pdb(struct nvdebug_state *g, void **pdb, bool *is_v2_pdb) {
107 } 97 }
108 printk(KERN_INFO "[nvdebug] BAR2 inst block at off %x in PRAMIN\n", ret); 98 printk(KERN_INFO "[nvdebug] BAR2 inst block at off %x in PRAMIN\n", ret);
109 // Pull the page directory base configuration from the instance block 99 // Pull the page directory base configuration from the instance block
110 if ((pd_config.raw = nvdebug_readq(g, NV_PRAMIN + ret + NV_PRAMIN_PDB_CONFIG_OFF)) == -1) { 100 if ((pd->raw = nvdebug_readq(g, NV_PRAMIN + ret + NV_PRAMIN_PDB_CONFIG_OFF)) == -1) {
111 printk(KERN_ERR "[nvdebug] Unable to read BAR2/3 PDB configuration! BAR2/3 inaccessible.\n"); 101 printk(KERN_ERR "[nvdebug] Unable to read BAR2/3 PDB configuration! BAR2/3 inaccessible.\n");
112 return -ENOTSUPP; 102 return -ENOTSUPP;
113 } 103 }
114 pdb_vram = pd_config.page_dir_hi;
115 pdb_vram <<= 20;
116 pdb_vram |= pd_config.page_dir_lo;
117 pdb_vram <<= 12;
118 printk(KERN_INFO "[nvdebug] BAR2 PDB @ %llx (config raw: %llx)\n", pdb_vram, pd_config.raw);
119 // Setup PRAMIN to point at the page directory
120 if ((ret = addr_to_pramin_mut(g, pdb_vram, pd_config.target)) < 0) {
121 printk(KERN_ERR "[nvdebug] Invalid BAR2/3 PDB configuration! BAR2/3 inaccessible.\n");
122 return ret;
123 }
124
125 *pdb = cached_pdb = g->regs + NV_PRAMIN + ret;
126 pd_hash = readl(cached_pdb);
127 *is_v2_pdb = cached_is_v2_pdb = pd_config.is_ver2;
128 104
129 return 0; 105 return 0;
130} 106}
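The lines removed above open-coded the PDB address as ((page_dir_hi << 20) | page_dir_lo) << 12. The nvdebug.h hunk below adds an overlapping 52-bit page_dir bitfield to page_dir_config_t that yields the same value in one step. The stand-alone sketch below (simplified union layout, hypothetical type name pdb_addr_sketch_t) illustrates the equivalence; it is not part of the patch.

#include <stdint.h>
#include <stdio.h>

// Simplified stand-in for page_dir_config_t: only the address bits are modeled.
typedef union {
	struct {
		uint64_t other:12;        // target, volatility, version flags, etc.
		uint64_t page_dir_lo:20;  // bits 12-31 of the configuration word
		uint64_t page_dir_hi:32;  // bits 32-63 of the configuration word
	};
	struct {
		uint64_t pad:12;
		uint64_t page_dir:52;     // added by this patch: bits 12-63 as one field
	};
	uint64_t raw;
} pdb_addr_sketch_t;

int main(void) {
	pdb_addr_sketch_t pd = { .raw = 0x123456789abcd005ULL };
	// Removed bus.c logic: reassemble the address from two fields
	uint64_t old_way = ((((uint64_t)pd.page_dir_hi << 20) | pd.page_dir_lo) << 12);
	// New logic: the 52-bit field already spans the same bits
	uint64_t new_way = (uint64_t)pd.page_dir << 12;
	printf("%#llx == %#llx\n", (unsigned long long)old_way,
	       (unsigned long long)new_way);
	return 0;
}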
diff --git a/mmu.c b/mmu.c
index e420864..70c00f9 100644
--- a/mmu.c
+++ b/mmu.c
@@ -1,117 +1,129 @@
1// Helpers to deal with NVIDIA's MMU and associated page tables 1/* Copyright 2024 Joshua Bakita
2 * Helpers to deal with NVIDIA's MMU and associated page tables
3 */
4#include <linux/err.h> // ERR_PTR() etc.
5#include <linux/iommu.h> // iommu_get_domain_for_dev() and iommu_iova_to_phys()
2#include <linux/kernel.h> // Kernel types 6#include <linux/kernel.h> // Kernel types
3 7
4#include "nvdebug.h" 8#include "nvdebug.h"
5 9
6/* One of the oldest ways to access video memory on NVIDIA GPUs is by using 10// Uncomment to print every PDE and PTE walked for debugging
7 a configurable 1MB window into VRAM which is mapped into BAR0 (register) 11//#define DEBUG
8 space starting at offset NV_PRAMIN. This is still supported on NVIDIA GPUs 12#ifdef DEBUG
9 and appear to be used today to bootstrap page table configuration. 13#define printk_debug printk
14#else
15#define printk_debug(...)
16#endif
10 17
11 Why is it mapped at a location called NVIDIA Private RAM Instance? Because 18/* Convert a page directory (PD) pointer and aperture to be kernel-accessible
12 this used to point to the entirety of intance RAM, which was seperate from
13 VRAM on older NVIDIA GPUs.
14*/
15 19
16/* Convert a physical VRAM address to an offset in the PRAMIN window 20 I/O MMU handling inspired by amdgpu_iomem_read() in amdgpu_ttm.c of the
17 @param addr VRAM address to convert 21 AMDGPU driver.
18 @return -errno on error, PRAMIN offset on success
19 22
20 Note: Use off2PRAMIN() instead if you want a dereferenceable address 23 @param addr Pointer from page directory entry (PDE)
21 Note: PRAMIN window is only 1MB, so returning an int is safe 24 @param pd_ap PD-type aperture (target address space) for `addr`
22*/ 25 @return A dereferencable kernel address, or an ERR_PTR-wrapped error
23static int vram2PRAMIN(struct nvdebug_state *g, uint64_t addr) { 26 */
24 uint64_t pramin_base_va; 27void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, enum PD_TARGET pd_ap) {
25 bar0_window_t window; 28 struct iommu_domain *dom;
26 window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW); 29 phys_addr_t phys;
27 // Check if the address is valid (49 bits are addressable on-GPU) 30
28 if (addr & ~0x0001ffffffffffff) { 31 // Validate arguments
29 printk(KERN_ERR "[nvdebug] Invalid address %llx passed to %s!\n", 32 if (unlikely(!IS_PD_TARGET(pd_ap) || pd_ap == PD_AND_TARGET_INVALID || !addr))
30 addr, __func__); 33 return ERR_PTR(-EINVAL);
31 return -EINVAL; 34
35 // VID_MEM accesses are the simple common-case
36 if (pd_ap == PD_AND_TARGET_VID_MEM) {
37 // Using BAR2 requires a page-table traversal. As this function is part
38 // of the page-table traversal process, it must instead use PRAMIN.
39 int off = addr_to_pramin_mut(g, addr, TARGET_VID_MEM);
40 if (off < 0)
41 return ERR_PTR(off);
42 return g->regs + NV_PRAMIN + off;
32 } 43 }
33 // For unclear (debugging?) reasons, PRAMIN can point to SYSMEM 44 /* SYS_MEM accesses are rare. Only nvgpu (Jetson driver), nouveau, and this
34 if (window.target != TARGET_VID_MEM) 45 * driver are known to create page directory entries in SYS_MEM.
35 return -EFAULT; 46 *
36 pramin_base_va = ((uint64_t)window.base) << 16; 47 * On systems using an I/O MMU, or some other I/O virtual address space,
37 // Protect against out-of-bounds accesses 48 * these are **not** physical addresses, and must first be translated
38 if (addr < pramin_base_va || addr > pramin_base_va + NV_PRAMIN_LEN) 49 * through the I/O MMU before use.
39 return -ERANGE; 50 * Example default meaning of a SYS_MEM address for a few CPUs:
40 return addr - pramin_base_va; 51 * - Jetson Xavier : physical address
41} 52 * - AMD 3950X : I/O MMU address
53 * - Phenom II x4 : physical address
54 */
55 // Check for, and translate through, the I/O MMU (if any)
56 if ((dom = iommu_get_domain_for_dev(g->dev))) {
57 phys = iommu_iova_to_phys(dom, addr);
58 printk(KERN_ERR "[nvdebug] I/O MMU translated SYS_MEM I/O VA %#lx to physical address %llx.\n", addr, phys);
59 } else
60 phys = addr;
42 61
43// Convert a GPU physical address to CPU virtual address via the PRAMIN window 62 if (!phys)
44// @return A dereferencable address, or 0 (an invalid physical address) on err
45void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy) {
46 int off = vram2PRAMIN(g, phy);
47 if (off == -ERANGE)
48 printk(KERN_ERR "[nvdebug] Page table walk off end of PRAMIN!\n");
49 if (off < 0)
50 return 0; 63 return 0;
51 return g->regs + NV_PRAMIN + vram2PRAMIN(g, phy);
52}
53 64
54/* FIXME 65 return phys_to_virt(addr);
55void __iomem *off2BAR2(struct nvdebug_state* g, uint32_t off) {
56 return g->bar2 + off;
57} 66}
58*/
59 67
60// Internal helper for search_page_directory(). 68// Internal helper for search_page_directory().
61uint64_t search_page_directory_subtree(struct nvdebug_state *g, 69uint64_t search_page_directory_subtree(struct nvdebug_state *g,
62 void __iomem *pde_offset, 70 uintptr_t pde_addr,
63 void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t), 71 enum PD_TARGET pde_target,
64 uint64_t addr_to_find, 72 uint64_t addr_to_find,
65 uint32_t level) { 73 uint32_t level) {
66 uint64_t res, i; 74 uint64_t res, i;
67 void __iomem *next; 75 void __iomem *pde_kern;
68 page_dir_entry_t entry; 76 page_dir_entry_t entry;
69 if (level > sizeof(NV_MMU_PT_V2_SZ)) 77 if (level > sizeof(NV_MMU_PT_V2_SZ))
70 return 0; 78 return 0;
71 // Hack to workaround PDE0 being double-size and strangely formatted 79 // Hack to workaround PDE0 being double-size and strangely formatted
72 if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16) 80 if (NV_MMU_PT_V2_ENTRY_SZ[level] == 16)
73 pde_offset += 8; 81 pde_addr += 8;
74 entry.raw_w = readq(pde_offset); 82 // Translate a VID_MEM/SYS_MEM-space address to something kernel-accessible
83 pde_kern = pd_deref(g, pde_addr, pde_target);
84 if (IS_ERR_OR_NULL(pde_kern)) {
85 printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_addr, pd_target_to_text(pde_target), PTR_ERR(pde_kern));
86 return 0;
87 }
88 // Read the page directory entry (a pointer to another directory, or a PTE)
89 entry.raw_w = readq(pde_kern);
75 // If we reached an invalid (unpopulated) PDE, walk back up the tree 90 // If we reached an invalid (unpopulated) PDE, walk back up the tree
76 if (entry.target == PD_AND_TARGET_INVALID) 91 if (entry.target == PD_AND_TARGET_INVALID)
77 return 0; 92 return 0;
78 // Succeed when we reach a PTE with the address we want 93 // Succeed when we reach a PTE with the address we want
79 if (entry.is_pte) { 94 if (entry.is_pte) {
80 // TODO: Handle huge pages here 95 // TODO: Handle huge pages here
81 printk(KERN_INFO "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, entry.is_privileged, entry.is_readonly, entry.atomics_disabled, entry.raw_w); 96 printk_debug(KERN_INFO "[nvdebug] PTE for phy addr %#018llx, ap '%s', vol '%d', priv '%d', ro '%d', no_atomics '%d' (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, entry.is_privileged, entry.is_readonly, entry.atomics_disabled, entry.raw_w);
82 return (uint64_t)entry.addr << 12 == addr_to_find; 97 return (uint64_t)entry.addr << 12 == addr_to_find;
83 } 98 }
84 printk(KERN_INFO "[nvdebug] Found PDE pointing to %#018llx in ap '%s' vol '%d' at lvl %d (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, level, entry.raw_w); 99 printk_debug(KERN_INFO "[nvdebug] Found PDE pointing to %#018llx in ap '%s' vol '%d' at lvl %d (raw: %#018llx)\n", ((u64)entry.addr_w) << 12, pd_target_to_text(entry.target), entry.is_volatile, level, entry.raw_w);
85 // Depth-first search of the page table 100 // Depth-first search of the page table
86 for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) { 101 for (i = 0; i < NV_MMU_PT_V2_SZ[level + 1]; i++) {
87 next = off2addr(g, ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i); 102 uint64_t next = ((uint64_t)entry.addr << 12) + NV_MMU_PT_V2_ENTRY_SZ[level + 1] * i;
88 // off2addr can fail 103 res = search_page_directory_subtree(g, next, entry.target, addr_to_find, level + 1);
89 if (!next || !entry.addr_w) {
90 printk(KERN_ERR "[nvdebug] %s: Unable to resolve GPU PA to CPU PA\n", __func__);
91 return 0;
92 }
93 res = search_page_directory_subtree(g, next, off2addr, addr_to_find, level + 1);
94 if (res) 104 if (res)
95 return res | (i << NV_MMU_PT_V2_LSB[level + 1]); 105 return res | (i << NV_MMU_PT_V2_LSB[level + 1]);
96 } 106 }
97 return 0; 107 return 0;
98} 108}
99 109
100/* GPU Physical address -> Virtual address ("reverse" translation) 110/* GPU Physical address -> Virtual address ("reverse" translation) for V2 tables
111
112 Depth-first search a page directory of the GPU MMU for where a particular
113 physical address is mapped. Upon finding a mapping, the virtual address is
114 returned.
101 115
102 Depth-first search a page directory of the GPU MMU for where a particular 116 The page directory may be located in VID_MEM, SYS_MEM, or some combination of
103 physical address is mapped. Upon finding a mapping, the virtual address is 117 the two.
104 returned.
105 118
106 @param pde_offset Dereferenceable pointer to the start of the PDE3 entries 119 @param pd_config Page Directory configuration, containing pointer and
107 @param off2addr Func to convert VRAM phys addresses to valid CPU VAs 120 aperture for the start of the PDE3 entries
108 @param addr_to_find Physical address to reconstruct the virtual address of 121 @param addr_to_find Physical address to reconstruct the virtual address of
109 @return 0 on error, otherwise the virtual address at which addr_to_find is 122 @return 0 on error, otherwise the virtual address at which addr_to_find is
110 mapped into by this page table. (Zero is not a valid virtual address) 123 mapped into by this page table. (Zero is not a valid virtual address)
111*/ 124*/
112uint64_t search_page_directory(struct nvdebug_state *g, 125uint64_t search_page_directory(struct nvdebug_state *g,
113 void __iomem *pde_offset, 126 page_dir_config_t pd_config,
114 void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
115 uint64_t addr_to_find) { 127 uint64_t addr_to_find) {
116 uint64_t res, i; 128 uint64_t res, i;
117 // Make sure that the query is page-aligned 129 // Make sure that the query is page-aligned
@@ -119,57 +131,62 @@ uint64_t search_page_directory(struct nvdebug_state *g,
119 printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find); 131 printk(KERN_WARNING "[nvdebug] Attempting to search for unaligned address %llx in search_page_directory()!\n", addr_to_find);
120 return 0; 132 return 0;
121 } 133 }
122 printk(KERN_INFO "[nvdebug] Searching for addr %#018llx in page table with base %#018llx\n", (u64)addr_to_find, (u64)pde_offset); 134 printk(KERN_INFO "[nvdebug] Searching for addr %#018llx in page table with base %#018lx\n", addr_to_find, (uintptr_t)pd_config.page_dir << 12);
123 // Search the top-level page directory (PDE3) 135 // Search the top-level page directory (PDE3)
124 for (i = 0; i < NV_MMU_PT_V2_SZ[0]; i++) 136 for (i = 0; i < NV_MMU_PT_V2_SZ[0]; i++)
125 if ((res = search_page_directory_subtree(g, pde_offset + NV_MMU_PT_V2_ENTRY_SZ[0] * i, off2addr, addr_to_find, 0))) 137 if ((res = search_page_directory_subtree(g, ((uintptr_t)pd_config.page_dir << 12) + NV_MMU_PT_V2_ENTRY_SZ[0] * i, INST2PD_TARGET(pd_config.target), addr_to_find, 0)))
126 return (res & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]); 138 return (res & ~0xfff) | (i << NV_MMU_PT_V2_LSB[0]);
127 return 0; 139 return 0;
128} 140}
129 141
130/* GMMU Page Tables Version 1 142/* GPU Physical address -> Virtual address ("reverse" translation) for V1 tables
131 This page table only contains 2 levels and is used in the Fermi, Kepler, and 143 (See `search_page_directory()` for documentation.)
132 Maxwell architectures 144 */
133*/
134// Number of entries in the PDE and PTE levels
135static const int NV_MMU_PT_V1_SZ[2] = {512, 1<<13}; // 2<<13 is an educated guess!!!
136// Which bit index is the least significant in indexing each page level
137static const int NV_MMU_PT_V1_LSB[2] = {25, 12}; // 25 is an educated guess!!!
138uint64_t search_v1_page_directory(struct nvdebug_state *g, 145uint64_t search_v1_page_directory(struct nvdebug_state *g,
139 void __iomem *pde_offset, 146 page_dir_config_t pd_config,
140 void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
141 uint64_t addr_to_find) { 147 uint64_t addr_to_find) {
142 uint64_t j, i = 0; 148 uint64_t j, i = 0;
143 page_dir_entry_v1_t pde; 149 page_dir_entry_v1_t pde;
144 page_tbl_entry_v1_t pte; 150 page_tbl_entry_v1_t pte;
145 void __iomem *pte_offset; 151 uintptr_t pte_offset, pde_offset;
152 void __iomem *pte_addr, *pde_addr;
146 // For each PDE 153 // For each PDE
147 do { 154 do {
155 // Index the list of page directory entries
156 pde_offset = ((uint64_t)pd_config.page_dir << 12) + i * sizeof(page_dir_entry_v1_t);
157 // Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
158 pde_addr = pd_deref(g, pde_offset, INST2PD_TARGET(pd_config.target));
159 if (IS_ERR_OR_NULL(pde_addr)) {
160 printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pde_offset, pd_target_to_text(INST2PD_TARGET(pd_config.target)), -PTR_ERR(pde_addr));
161 return 0;
162 }
148 // readq doesn't seem to work on BAR0 163 // readq doesn't seem to work on BAR0
149 pde.raw = readl(pde_offset + i * sizeof(page_dir_entry_v1_t) + 4); 164 pde.raw = readl(pde_addr + 4);
150 pde.raw <<= 32; 165 pde.raw <<= 32;
151 pde.raw |= readl(pde_offset + i * sizeof(page_dir_entry_v1_t)); 166 pde.raw |= readl(pde_addr);
152 // Verify PDE is present 167 // Verify PDE is present
153 if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID) 168 if (pde.target == PD_TARGET_INVALID && pde.alt_target == PD_TARGET_INVALID)
154 continue; 169 continue;
155 // Convert to a dereferencable pointer from CPU virtual address space
156 pte_offset = off2addr(g, (uint64_t)pde.alt_addr << 12);
157 if (!pte_offset)
158 continue;
159// printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw); 170// printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.is_volatile ? "volatile" : "non-volatile", ((u64)pde.addr) << 12, pde.target, pde.raw);
160// printk(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.target, pde.raw); 171 printk_debug(KERN_INFO "[nvdebug] Found %s PDE pointing to PTEs @ %llx in ap '%d' (raw: %llx)\n", pde.alt_is_volatile ? "volatile" : "non-volatile", ((u64)pde.alt_addr) << 12, pde.alt_target, pde.raw);
161 // For each PTE 172 // For each PTE
162 for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) { 173 for (j = 0; j < NV_MMU_PT_V1_SZ[1]; j++) {
163 // Don't overrun the PRAMIN window 174 // Index the list of page table entries starting at pde.alt_addr
164 if (pte_offset > NV_PRAMIN + g->regs + NV_PRAMIN_LEN) 175 pte_offset = ((uint64_t)pde.alt_addr << 12) + j * sizeof(page_tbl_entry_v1_t);
176 // Convert the VID_MEM/SYS_MEM address to a kernel-accessible addr
177 pte_addr = pd_deref(g, pte_offset, V12PD_TARGET(pde.alt_target));
178 if (IS_ERR_OR_NULL(pte_addr)) {
179 printk(KERN_ERR "[nvdebug] %s: Unable to resolve %#lx in GPU %s to a kernel-accessible address. Error %ld.\n", __func__, pte_offset, pd_target_to_text(V12PD_TARGET(pde.alt_target)), -PTR_ERR(pte_addr));
165 return 0; 180 return 0;
166 pte.raw = readl(pte_offset + j * sizeof(page_tbl_entry_v1_t) + 4); 181 }
182 // Read page table entry, avoiding readq
183 pte.raw = readl(pte_addr + 4);
167 pte.raw <<= 32; 184 pte.raw <<= 32;
168 pte.raw |= readl(pte_offset + j * sizeof(page_tbl_entry_v1_t)); 185 pte.raw |= readl(pte_addr);
169 // Skip non-present PTEs 186 // Skip non-present PTEs
170 if (!pte.is_present) 187 if (!pte.is_present)
171 continue; 188 continue;
172// printk(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw); 189 printk_debug(KERN_INFO "[nvdebug] PTE for phy addr %llx %s (raw: %llx)\n", ((u64)pte.addr) << 12, pte.is_present ? "present" : "non-present", pte.raw);
173 // If we find a matching PTE, return its virtual address 190 // If we find a matching PTE, return its virtual address
174 if ((uint64_t)pte.addr << 12 == addr_to_find) 191 if ((uint64_t)pte.addr << 12 == addr_to_find)
175 return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1]; 192 return i << NV_MMU_PT_V1_LSB[0] | j << NV_MMU_PT_V1_LSB[1];
@@ -178,9 +195,6 @@ uint64_t search_v1_page_directory(struct nvdebug_state *g,
178 return 0; 195 return 0;
179} 196}
180 197
181/* GMMU Page Tables Version 0
182 This page table only contains 2 levels and is used in the Tesla architecture
183*/
184/* *** UNTESTED *** 198/* *** UNTESTED ***
185#define NV_MMU_PT_V0_SZ 2048 199#define NV_MMU_PT_V0_SZ 2048
186#define NV_MMU_PT_V0_LSB 29 200#define NV_MMU_PT_V0_LSB 29
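The reverse translation above rebuilds a virtual address by OR-ing each level's index shifted by NV_MMU_PT_V2_LSB[level]. Those shift amounts follow from the per-level entry counts in NV_MMU_PT_V2_SZ (see the nvdebug.h hunk below) and the 4 KiB page size; the stand-alone sketch below derives them, and should reproduce the driver's LSB constants for the 49-bit GPU virtual address space. It is a hypothetical helper, not part of the patch.

#include <stdio.h>

// Entry counts per level, as declared in nvdebug.h (PD3, PD2, PD1, PD0, PT)
static const int sz[5] = {4, 512, 512, 256, 512};

int main(void) {
	int lsb[5];
	int level, bit = 12; // bits 0-11 are the 4 KiB page offset

	// Work upward from the PTE level: each level's index begins just above
	// the bits consumed by the page offset and the levels below it.
	for (level = 4; level >= 0; level--) {
		lsb[level] = bit;
		while ((1 << (bit - lsb[level])) < sz[level])
			bit++;
	}
	// Prints 47, 38, 29, 21, 12 -- i.e. a 49-bit virtual address space
	for (level = 0; level < 5; level++)
		printf("level %d: %d entries, index starts at bit %d\n",
		       level, sz[level], lsb[level]);
	return 0;
}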
diff --git a/nvdebug.h b/nvdebug.h
index 567806d..eff1470 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -818,6 +818,14 @@ typedef union {
818} bar_config_block_t; 818} bar_config_block_t;
819 819
820/* BAR0 PRAMIN (Private RAM Instance) window configuration 820/* BAR0 PRAMIN (Private RAM Instance) window configuration
821 One of the oldest ways to access video memory on NVIDIA GPUs is by using
822 a configurable 1MB window into VRAM which is mapped into BAR0 (register)
823 space starting at offset NV_PRAMIN. This is still supported on NVIDIA GPUs
 824 and appears to be used today to bootstrap page table configuration.
825
826 Why is it mapped at a location called NVIDIA Private RAM Instance? Because
 827 this used to point to the entirety of instance RAM, which was separate from
828 VRAM on older NVIDIA GPUs.
821 829
822 BASE : Base of window >> 16 in [TARGET] virtual address space 830 BASE : Base of window >> 16 in [TARGET] virtual address space
823 TARGET : Which address space BASE points into 831 TARGET : Which address space BASE points into
@@ -843,7 +851,7 @@ typedef union {
843typedef union { 851typedef union {
844 struct { 852 struct {
845 uint32_t target:2; 853 uint32_t target:2;
846 uint32_t vol:1; 854 uint32_t is_volatile:1;
847 uint32_t padding0:1; 855 uint32_t padding0:1;
848 uint32_t fault_replay_tex:1; 856 uint32_t fault_replay_tex:1;
849 uint32_t fault_replay_gcc:1; 857 uint32_t fault_replay_gcc:1;
@@ -853,6 +861,10 @@ typedef union {
853 uint32_t page_dir_lo:20; 861 uint32_t page_dir_lo:20;
854 uint32_t page_dir_hi:32; 862 uint32_t page_dir_hi:32;
855 } __attribute__((packed)); 863 } __attribute__((packed));
864 struct {
865 uint32_t pad:12;
866 uint64_t page_dir:52; // Confirmed working on Xavier and tama
867 } __attribute__((packed));
856 uint64_t raw; 868 uint64_t raw;
857} page_dir_config_t; 869} page_dir_config_t;
858 870
@@ -888,6 +900,14 @@ typedef union {
888 The following arrays merely represent different projections of Fig. 1, and 900 The following arrays merely represent different projections of Fig. 1, and
889 only one is strictly needed to reconstruct all the others. However, due to 901 only one is strictly needed to reconstruct all the others. However, due to
890 the complexity of page tables, we include all of these to aid in readability. 902 the complexity of page tables, we include all of these to aid in readability.
903
 904 Support: Pascal, Volta, Turing, Ampere, Ada, Hopper*, Blackwell*
905 Note: *Hopper introduces Version 3 Page Tables, but is backwards-compatible.
906 The newer version adds a PD4 level to support 57-bit virtual
907 addresses, and slightly shifts the PDE and PTE fields.
908
909 See also: gp100-mmu-format.pdf in open-gpu-doc. In open-gpu-kernel-modules
910 this is synonymously the "NEW" and "VER2" layout.
891*/ 911*/
892// How many nodes/entries per level in V2 of NVIDIA's page table format 912// How many nodes/entries per level in V2 of NVIDIA's page table format
893static const int NV_MMU_PT_V2_SZ[5] = {4, 512, 512, 256, 512}; 913static const int NV_MMU_PT_V2_SZ[5] = {4, 512, 512, 256, 512};
@@ -907,6 +927,12 @@ enum PD_TARGET {
907 PTE_AND_TARGET_SYS_MEM_COHERENT = 5, // b101 927 PTE_AND_TARGET_SYS_MEM_COHERENT = 5, // b101
908 PTE_AND_TARGET_SYS_MEM_NONCOHERENT = 7, // b111 928 PTE_AND_TARGET_SYS_MEM_NONCOHERENT = 7, // b111
909}; 929};
930// The low bit is unset on page directory (PD) targets
931#define IS_PD_TARGET(target) (!(target & 0x1u))
932// Convert from an enum INST_TARGET to an enum PD_TARGET
933#define INST2PD_TARGET(target) ((target & 0x2) ? (target << 1) : (!target) << 1)
934// Convert from an enum V1_PD_TARGET to an enum PD_TARGET
935#define V12PD_TARGET(target) (target << 1)
910static inline const char *pd_target_to_text(enum PD_TARGET t) { 936static inline const char *pd_target_to_text(enum PD_TARGET t) {
911 switch (t) { 937 switch (t) {
912 case PD_AND_TARGET_INVALID: 938 case PD_AND_TARGET_INVALID:
@@ -928,13 +954,10 @@ static inline const char *pd_target_to_text(enum PD_TARGET t) {
928} 954}
929 955
930// Page Directory Entry/Page Table Entry V2 type 956// Page Directory Entry/Page Table Entry V2 type
 931// Note: As the meaning of target (bits 2:1) changes depending on if the entry 957// Note: The meaning of target (bits 2:1) at the PDE level changes depending
 932// is a PTE or not, this combines them into a single target field to 958// on whether the entry is a large-page PTE. To simplify comparisons, we
 933// simplify comparisons. 959// combine them into a single target field.
934// Support: Pascal, Volta, Turing, Ampere, Ada 960#define TARGET_PEER 1
935//
936// V3 introduced with Hopper, but Hopper and Blackwell also support V2
937//
938typedef union { 961typedef union {
939 // Page Directory Entry (PDE) 962 // Page Directory Entry (PDE)
940 struct { 963 struct {
@@ -965,21 +988,74 @@ typedef union {
965 uint64_t raw_w; 988 uint64_t raw_w;
966} page_dir_entry_t; 989} page_dir_entry_t;
967 990
968// Page Directory Entry/Page Table Entry V1 type 991/* GMMU Page Tables Version 1
969// Support: Fermi, Kepler, Maxwell 992 These page tables contain 2 levels and are used in the Fermi, Kepler, and
993 Maxwell architectures to support a 40-bit virtual address space.
994
995 Version 1 Page Tables may be configured to support either 64 KiB or 128 KiB
996 large pages. Table addressing differs between the modes---even if the table
997 contains no large pages. The format for 4 KiB pages in each mode is shown
998 below.
999
1000 V1 of NVIDIA's page table format uses 1 level of PDEs and a level of PTEs.
1001 How the virtual address is sliced to yield an index into each level and a
1002 page offset is shown by Fig 1 and Fig 2 (for 64 KiB and 128 KiB large page
1003 modes respectively).
1004
1005 == Figure 1: 64 KiB mode ==
1006 Page Offset (12 bits) <----------------------------------+
1007 Page Table Entry (PTE) (13 bits) <--------------+ |
1008 Page Directory Entry (PDE) (13 bits) <-+ | |
1009 ^ ^ ^
1010 Virtual address: [39, 25] [24, 12] [11, 0]
1011
1012 == Figure 2: 128 KiB mode ==
1013 Page Offset (12 bits) <----------------------------------+
1014 Page Table Entry (PTE) (14 bits) <--------------+ |
1015 Page Directory Entry (PDE) (12 bits) <-+ | |
1016 ^ ^ ^
1017 Virtual address: [39, 26] [25, 12] [11, 0]
1018
1019
1020 Support: Fermi, Kepler, Maxwell, Pascal*
1021 Note: *Pascal introduces Version 2 Page Tables, but is backwards-compatible.
1022 Note: We only implement the 64-KiB-large-page mode in nvdebug.
1023
1024 See also: mm_gk20a.c in nvgpu (Jetson GPU driver) and kern_gmmu_fmt_gm10x.c
1025 in open-gpu-kernel-modules (open-source NVRM variant). This is
1026 synonymously the "VER1" and unversioned layout in
1027 open-gpu-kernel-modules, with some differences noted in Appdx 1.
1028
1029 == Appdx 1 ==
1030 In open-gpu-kernel-modules, the unversioned MMU layout adds:
1031 - Bit 35: NV_MMU_PTE_LOCK synonym for NV_MMU_PTE_ATOMIC_DISABLE
1032 - Bit 62: NV_MMU_PTE_READ_DISABLE overlapping NV_MMU_PTE_COMPTAGLINE
1033 - Bit 63: NV_MMU_PTE_WRITE_DISABLE overlapping NV_MMU_PTE_COMPTAGLINE
1034 And removes:
1035 - Bit 40, 41, 42, 43 from NV_MMU_PTE_KIND
1036 The PDE layouts are identical. Given that the unversioned defines seem to
1037 predate renaming and/or field extension/relocation, they are likely artifacts
1038 from the page table development process, and have no meaning now.
1039*/
1040// Number of entries in the PDE and PTE levels
1041static const int NV_MMU_PT_V1_SZ[2] = {8192, 8192};
1042// Which bit index is the least significant in indexing each page level
1043static const int NV_MMU_PT_V1_LSB[2] = {25, 12};
1044
1045// V1 Page Directory Entry target
970enum V1_PD_TARGET { 1046enum V1_PD_TARGET {
971 PD_TARGET_INVALID = 0, 1047 PD_TARGET_INVALID = 0,
972 PD_TARGET_VID_MEM = 1, 1048 PD_TARGET_VID_MEM = 1,
973 PD_TARGET_SYS_MEM_COHERENT = 2, 1049 PD_TARGET_SYS_MEM_COHERENT = 2,
974 PD_TARGET_SYS_MEM_NONCOHERENT = 3, 1050 PD_TARGET_SYS_MEM_NONCOHERENT = 3,
975}; 1051};
976// Page Directory Entry (PDE) 1052// V1 Page Directory Entry (PDE)
977typedef union { 1053typedef union {
978// Large page fields 1054// Large page fields
979 struct { 1055 struct {
980// 0:32 1056// 0:32
981 enum V1_PD_TARGET target:2; 1057 enum V1_PD_TARGET target:2;
982 uint32_t padding0:2; 1058 uint32_t padding0:2; // Documented as "PDE_SIZE"?
983 uint64_t addr:28; // May be wider? 1059 uint64_t addr:28; // May be wider?
984// 32:63 1060// 32:63
985 uint32_t padding2:3; 1061 uint32_t padding2:3;
@@ -998,45 +1074,58 @@ typedef union {
998 } __attribute__((packed)); 1074 } __attribute__((packed));
999 uint64_t raw; 1075 uint64_t raw;
1000} page_dir_entry_v1_t; 1076} page_dir_entry_v1_t;
1001// Page Table Entry (PTE) 1077
1002// Reconstructed from info in Jetson nvgpu driver 1078// V1 Page Table Entry (PTE)
1003typedef union { 1079typedef union {
1004 struct { 1080 struct {
1005// 0:32 1081// 0:32
1006 bool is_present:1; 1082 bool is_present:1;
1007 bool is_privileged:1; 1083 bool is_privileged:1;
1008 bool is_readonly:1; 1084 bool is_readonly:1;
1009 uint32_t padding0:1; 1085 bool is_encrypted:1;
1010 uint64_t addr:28; 1086 uint64_t addr:28;
1011// 32:63 1087// 32:63
1012 bool is_volatile:1; 1088 bool is_volatile:1;
1013 enum INST_TARGET:2; 1089 enum INST_TARGET:2;
1014 uint32_t padding1:1; 1090 bool atomics_disabled:1;
1015 uint32_t kind:8; 1091 uint32_t kind:8;
1016 uint32_t comptag:17; 1092 uint32_t comptag:20;
1017 uint32_t padding2:1;
1018 bool is_read_disabled:1;
1019 bool is_write_disabled:1;
1020 } __attribute__((packed)); 1093 } __attribute__((packed));
1021 uint64_t raw; 1094 uint64_t raw;
1022} page_tbl_entry_v1_t; 1095} page_tbl_entry_v1_t;
1023//enum V0_PDE_TYPE {NOT_PRESENT = 0, PAGE_64K = 1, PAGE_16K = 2, PAGE_4K = 3}; 1096
1024//enum V0_PDE_SIZE {PDE_SZ_128K = 0, PDE_SZ_32K = 1, PDE_SZ_16K = 2, PDE_SZ_8K = 3}; 1097/* GMMU Page Tables Version 0
1025//static const int V0_PDE_SIZE2NUM[4] = {128*1024, 32*1024, 16*1024, 8*1024}; 1098 This page table contains 2 levels to support a 40-bit virtual address space,
1026/* PDE V0 (nv50/Tesla) 1099 and is used in the Tesla (2.0?) architecture.
1100
1101 It is unclear what NVIDIA calls this page table layout. It predates V1, so we
1102 call it V0.
1103
1104 See also: https://envytools.readthedocs.io/en/latest/hw/memory/g80-vm.html
1105 */
1106/*
1107// What size pages are in the pointed-to page table?
1108enum V0_PDE_TYPE {NOT_PRESENT = 0, PAGE_64K = 1, PAGE_16K = 2, PAGE_4K = 3};
1109// How large is the pointed-to page table?
1110enum V0_PDE_SIZE {PDE_SZ_128K = 0, PDE_SZ_32K = 1, PDE_SZ_16K = 2, PDE_SZ_8K = 3};
1111// Given a page table size, how many entries does it have?
1112static const int V0_PDE_SIZE2NUM[4] = {128*1024, 32*1024, 16*1024, 8*1024};
1113
1114// PDE V0 (nv50/Tesla)
1027typedef union { 1115typedef union {
1028 struct { 1116 struct {
1029 enum V1_PDE_TYPE type:2; 1117 enum V0_PDE_TYPE type:2;
1030 enum INST_TARGET target:2; 1118 enum INST_TARGET target:2;
1031 uint32_t padding0:1; 1119 uint32_t padding0:1;
1032 enum V1_PDE_SIZE sublevel_size:2; 1120 enum V0_PDE_SIZE sublevel_size:2;
1033 uint32_t padding1:5; 1121 uint32_t padding1:5;
1034 uint32_t addr:28; 1122 uint32_t addr:28;
1035 uint32_t padding2:24; 1123 uint32_t padding2:24;
1036 } __attribute__((packed)); 1124 } __attribute__((packed));
1037 uint64_t raw; 1125 uint64_t raw;
1038} page_dir_entry_v1_t;*/ 1126} page_dir_entry_v0_t;
1039/* PTE V0 (nv50) 1127
1128// PTE V0 (nv50) for small pages
1040typedef union { 1129typedef union {
1041 struct { 1130 struct {
1042 bool is_present:1; 1131 bool is_present:1;
@@ -1055,7 +1144,8 @@ typedef union {
1055 uint32_t padding5:1; 1144 uint32_t padding5:1;
1056 } __attribute__((packed)); 1145 } __attribute__((packed));
1057 uint64_t raw; 1146 uint64_t raw;
1058} page_tbl_entry_v1_t;*/ 1147} page_tbl_entry_v0_t;
1148*/
1059 1149
1060// TODO(jbakita): Maybe put the above GPU types in a different file. 1150// TODO(jbakita): Maybe put the above GPU types in a different file.
1061 1151
@@ -1077,6 +1167,8 @@ struct nvdebug_state {
1077 struct gk20a *g; 1167 struct gk20a *g;
1078 // Pointer to PCI device needed for pci_iounmap 1168 // Pointer to PCI device needed for pci_iounmap
1079 struct pci_dev *pcid; 1169 struct pci_dev *pcid;
1170 // Pointer to generic device struct (both platform and pcie devices)
1171 struct device *dev;
1080}; 1172};
1081 1173
1082/*const struct runlist_funcs { 1174/*const struct runlist_funcs {
@@ -1152,13 +1244,11 @@ int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id);
1152void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy); 1244void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy);
1153uint64_t search_page_directory( 1245uint64_t search_page_directory(
1154 struct nvdebug_state *g, 1246 struct nvdebug_state *g,
1155 void __iomem *pde_offset, 1247 page_dir_config_t pd_config,
1156 void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
1157 uint64_t addr_to_find); 1248 uint64_t addr_to_find);
1158uint64_t search_v1_page_directory( 1249uint64_t search_v1_page_directory(
1159 struct nvdebug_state *g, 1250 struct nvdebug_state *g,
1160 void __iomem *pde_offset, 1251 page_dir_config_t pd_config,
1161 void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
1162 uint64_t addr_to_find); 1252 uint64_t addr_to_find);
1163 1253
1164 1254
@@ -1252,4 +1342,4 @@ static inline void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) {
1252} 1342}
1253// Defined in bus.c 1343// Defined in bus.c
1254int addr_to_pramin_mut(struct nvdebug_state *g, uint64_t addr, enum INST_TARGET target); 1344int addr_to_pramin_mut(struct nvdebug_state *g, uint64_t addr, enum INST_TARGET target);
1255int get_bar2_pdb(struct nvdebug_state *g, void **pdb, bool *is_v2_pdb); 1345int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd);
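The IS_PD_TARGET/INST2PD_TARGET/V12PD_TARGET macros added above encode the convention that the low bit of enum PD_TARGET marks PTE targets (set) versus PD targets (clear). The self-check sketch below exercises that mapping; the macro definitions are copied from the hunk above, while the INST_TARGET encoding (VID_MEM = 0, PEER = 1, SYS_MEM_COHERENT = 2, SYS_MEM_NONCOHERENT = 3) and the even PD_AND_TARGET_* values are inferred from the macros and from PTE_AND_TARGET_SYS_MEM_COHERENT = 5 above, so treat them as assumptions.

#include <assert.h>

// Copied verbatim from the nvdebug.h hunk above
#define IS_PD_TARGET(target) (!(target & 0x1u))
#define INST2PD_TARGET(target) ((target & 0x2) ? (target << 1) : (!target) << 1)
#define V12PD_TARGET(target) (target << 1)

// Assumed instance-block aperture encoding (only TARGET_PEER = 1 is visible here)
enum inst_target_sketch { TARGET_VID_MEM = 0, TARGET_PEER = 1,
                          TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3 };
// V1 PDE apertures, as declared in the hunk above
enum v1_pd_target_sketch { PD_TARGET_INVALID = 0, PD_TARGET_VID_MEM = 1,
                           PD_TARGET_SYS_MEM_COHERENT = 2, PD_TARGET_SYS_MEM_NONCOHERENT = 3 };
// Even PD_TARGET values (low "is-PTE" bit clear), implied by the macros
enum pd_target_sketch { PD_AND_TARGET_INVALID = 0, PD_AND_TARGET_VID_MEM = 2,
                        PD_AND_TARGET_SYS_MEM_COHERENT = 4, PD_AND_TARGET_SYS_MEM_NONCOHERENT = 6 };

int main(void) {
	// Instance-block apertures map onto PD apertures; PEER has no PD
	// equivalent and collapses to INVALID.
	assert(INST2PD_TARGET(TARGET_VID_MEM) == PD_AND_TARGET_VID_MEM);
	assert(INST2PD_TARGET(TARGET_PEER) == PD_AND_TARGET_INVALID);
	assert(INST2PD_TARGET(TARGET_SYS_MEM_COHERENT) == PD_AND_TARGET_SYS_MEM_COHERENT);
	assert(INST2PD_TARGET(TARGET_SYS_MEM_NONCOHERENT) == PD_AND_TARGET_SYS_MEM_NONCOHERENT);
	// V1 PDE apertures shift up by one bit to make room for the is-PTE flag.
	assert(V12PD_TARGET(PD_TARGET_VID_MEM) == PD_AND_TARGET_VID_MEM);
	assert(V12PD_TARGET(PD_TARGET_SYS_MEM_COHERENT) == PD_AND_TARGET_SYS_MEM_COHERENT);
	// All PD apertures have the low bit clear.
	assert(IS_PD_TARGET(PD_AND_TARGET_VID_MEM) && IS_PD_TARGET(PD_AND_TARGET_SYS_MEM_COHERENT));
	return 0;
}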
diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index 0cf5344..68e4d71 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -109,6 +109,7 @@ int probe_and_cache_devices(void) {
109 g_nvdebug_state[i].chip_id = ids.chip_id; 109 g_nvdebug_state[i].chip_id = ids.chip_id;
110 g_nvdebug_state[i].pcid = NULL; 110 g_nvdebug_state[i].pcid = NULL;
111 g_nvdebug_state[i].bar3 = NULL; 111 g_nvdebug_state[i].bar3 = NULL;
112 g_nvdebug_state[i].dev = dev;
112 printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on platform bus and initialized.", 113 printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on platform bus and initialized.",
113 ids.chip_id, ARCH2NAME(ids.architecture)); 114 ids.chip_id, ARCH2NAME(ids.architecture));
114 i++; 115 i++;
@@ -131,6 +132,7 @@ int probe_and_cache_devices(void) {
131 if (!g_nvdebug_state[i].bar3) 132 if (!g_nvdebug_state[i].bar3)
132 g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2); 133 g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2);
133 g_nvdebug_state[i].pcid = pcid; 134 g_nvdebug_state[i].pcid = pcid;
135 g_nvdebug_state[i].dev = &pcid->dev;
134 ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0); 136 ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
135 if (ids.raw == -1) { 137 if (ids.raw == -1) {
136 pci_err(pcid, "[nvdebug] Unable to read config from Master Controller on this GPU\n"); 138 pci_err(pcid, "[nvdebug] Unable to read config from Master Controller on this GPU\n");
diff --git a/runlist.c b/runlist.c
index c725e77..2eee01c 100644
--- a/runlist.c
+++ b/runlist.c
@@ -9,7 +9,8 @@
9 9
10// Uncomment to, upon BAR2 access failure, return a PRAMIN-based runlist pointer 10// Uncomment to, upon BAR2 access failure, return a PRAMIN-based runlist pointer
11// **If enabled, PRAMIN may not be otherwise used while walking the runlist!** 11// **If enabled, PRAMIN may not be otherwise used while walking the runlist!**
12#define FALLBACK_TO_PRAMIN 12// Runlists can only be printed on the Jetson TX2 if this is enabled.
13//#define FALLBACK_TO_PRAMIN
13 14
14/* Get runlist head and info (incl. length) 15/* Get runlist head and info (incl. length)
15 @param rl_id Which runlist to obtain? 16 @param rl_id Which runlist to obtain?
@@ -20,6 +21,7 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl
20 uint64_t runlist_iova; 21 uint64_t runlist_iova;
21 enum INST_TARGET runlist_target; 22 enum INST_TARGET runlist_target;
22 uint16_t runlist_len; 23 uint16_t runlist_len;
24 int err;
23#ifdef FALLBACK_TO_PRAMIN 25#ifdef FALLBACK_TO_PRAMIN
24 int off; 26 int off;
25#endif // FALLBACK_TO_PRAMIN 27#endif // FALLBACK_TO_PRAMIN
@@ -33,9 +35,9 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl
33 return -EIO; 35 return -EIO;
34 runlist_iova = ((uint64_t)rl.ptr) << 12; 36 runlist_iova = ((uint64_t)rl.ptr) << 12;
35 runlist_target = rl.target; 37 runlist_target = rl.target;
36 printk(KERN_INFO "[nvdebug] Runlist %d: %d entries @ %llx in %s (config raw: %#018llx)\n",
37 rl_id, rl.len, runlist_iova, target_to_text(rl.target), rl.raw);
38 runlist_len = rl.len; 38 runlist_len = rl.len;
39 printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx)\n",
40 rl_id, g->chip_id, rl.len, runlist_iova, target_to_text(rl.target), rl.raw);
39 } else if (g->chip_id < NV_CHIP_ID_AMPERE) { 41 } else if (g->chip_id < NV_CHIP_ID_AMPERE) {
40 runlist_base_tu102_t base; 42 runlist_base_tu102_t base;
41 runlist_submit_tu102_t submit; 43 runlist_submit_tu102_t submit;
@@ -46,6 +48,8 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl
46 runlist_iova = ((uint64_t)base.ptr) << 12; 48 runlist_iova = ((uint64_t)base.ptr) << 12;
47 runlist_target = base.target; 49 runlist_target = base.target;
48 runlist_len = submit.len; 50 runlist_len = submit.len;
51 printk(KERN_INFO "[nvdebug] Runlist %d for %x: %d entries @ %llx in %s (config raw: %#018llx %#018llx)\n",
52 rl_id, g->chip_id, submit.len, runlist_iova, target_to_text(runlist_target), base.raw, submit.raw);
49 } 53 }
50 // Return early on an empty runlist 54 // Return early on an empty runlist
51 if (!runlist_len) 55 if (!runlist_len)
@@ -53,24 +57,25 @@ int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl
53 57
54 // If the runlist is in VID_MEM, search the BAR2/3 page tables for a mapping 58 // If the runlist is in VID_MEM, search the BAR2/3 page tables for a mapping
55 if (runlist_target == TARGET_VID_MEM) { 59 if (runlist_target == TARGET_VID_MEM) {
56 void __iomem *bar2_page_dir;
57 bool pdb_is_ver2;
58 uint64_t runlist_bar_vaddr; 60 uint64_t runlist_bar_vaddr;
61 page_dir_config_t pd_config;
59 62
60 if (get_bar2_pdb(g, &bar2_page_dir, &pdb_is_ver2) < 0) 63 if ((err = get_bar2_pdb(g, &pd_config)) < 0)
61 return -EIO; 64 goto attempt_pramin_access;
62 65
63 if (pdb_is_ver2) 66 if (pd_config.is_ver2)
64 runlist_bar_vaddr = search_page_directory(g, bar2_page_dir, phy2PRAMIN, runlist_iova); 67 runlist_bar_vaddr = search_page_directory(g, pd_config, runlist_iova);
65 else 68 else
66 runlist_bar_vaddr = search_v1_page_directory(g, bar2_page_dir, phy2PRAMIN, runlist_iova); 69 runlist_bar_vaddr = search_v1_page_directory(g, pd_config, runlist_iova);
67 if (!runlist_bar_vaddr) { 70 if (!runlist_bar_vaddr) {
68 printk(KERN_WARNING "[nvdebug] Unable to find runlist mapping in BAR2/3 page tables.\n"); 71 printk(KERN_WARNING "[nvdebug] Unable to find runlist %d mapping in BAR2/3 page tables for %x.\n", rl_id, g->chip_id);
72 err = -EOPNOTSUPP;
69 goto attempt_pramin_access; 73 goto attempt_pramin_access;
70 } 74 }
71 printk(KERN_INFO "[nvdebug] Runlist @ %llx in BAR2 virtual address space.\n", runlist_bar_vaddr); 75
76 printk(KERN_INFO "[nvdebug] Runlist %d for %x @ %llx in BAR2 virtual address space.\n", rl_id, g->chip_id, runlist_bar_vaddr);
72 if (!g->bar2) { 77 if (!g->bar2) {
73 printk(KERN_WARNING "[nvdebug] BAR2/3 not mapped.\n"); 78 printk(KERN_WARNING "[nvdebug] BAR2/3 not mapped for %x.\n", g->chip_id);
74 return -ENODEV; 79 return -ENODEV;
75 } 80 }
76 rl_iter->curr_entry = g->bar2 + runlist_bar_vaddr; 81 rl_iter->curr_entry = g->bar2 + runlist_bar_vaddr;
@@ -91,7 +96,7 @@ attempt_pramin_access:
91 rl_iter->len = runlist_len; 96 rl_iter->len = runlist_len;
92 return 0; 97 return 0;
93#else 98#else
94 return -EOPNOTSUPP; 99 return err;
95#endif // FALLBACK_TO_PRAMIN 100#endif // FALLBACK_TO_PRAMIN
96} 101}
97 102