From 8b9c6400d0c88e127be2d31ab3fb507da49f9d6f Mon Sep 17 00:00:00 2001 From: Joshua Bakita Date: Tue, 23 Apr 2024 17:44:59 -0400 Subject: Style and documentation cleanup - Document topology registers (PTOP) on Ampere+ - Document graphics copy engine configuration registers - Move resubmit_runlist range checks into runlist.c - Miscellaneous spacing, typo, and minor documentation fixes --- mmu.c | 30 ++++++------ nvdebug.h | 137 ++++++++++++++++++++++++++++++++++++++----------------- runlist.c | 10 +++- runlist_procfs.c | 9 +--- 4 files changed, 120 insertions(+), 66 deletions(-) diff --git a/mmu.c b/mmu.c index 6784b9f..ababef5 100644 --- a/mmu.c +++ b/mmu.c @@ -24,7 +24,8 @@ int g_verbose = 0; @param pd_ap PD-type aperture (target address space) for `addr` @return A dereferencable kernel address, or an ERR_PTR-wrapped error */ -static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, enum PD_TARGET pd_ap) { +static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, + enum PD_TARGET pd_ap) { struct iommu_domain *dom; phys_addr_t phys; @@ -67,11 +68,11 @@ static void __iomem *pd_deref(struct nvdebug_state *g, uintptr_t addr, enum PD_T // Internal helper for search_page_directory(). uint64_t search_page_directory_subtree(struct nvdebug_state *g, - uintptr_t pde_addr, - enum PD_TARGET pde_target, - uint64_t addr_to_find, - enum INST_TARGET addr_to_find_aperture, - uint32_t level) { + uintptr_t pde_addr, + enum PD_TARGET pde_target, + uint64_t addr_to_find, + enum INST_TARGET addr_to_find_aperture, + uint32_t level) { uint64_t res, i; void __iomem *pde_kern; page_dir_entry_t entry; @@ -110,13 +111,12 @@ uint64_t search_page_directory_subtree(struct nvdebug_state *g, } /* GPU Physical address -> Virtual address ("reverse" translation) for V2 tables - Depth-first search a page directory of the GPU MMU for where a particular physical address is mapped. Upon finding a mapping, the virtual address is returned. 
- The page directory may be located in VID_MEM, SYS_MEM, or some combination of - the two. + The page directory and tables may be located in VID_MEM, SYS_MEM, or spread + across multiple apertures. @param pd_config Page Directory configuration, containing pointer and aperture for the start of the PDE3 entries @@ -126,9 +126,9 @@ uint64_t search_page_directory_subtree(struct nvdebug_state *g, mapped into by this page table. (Zero is not a valid virtual address) */ uint64_t search_page_directory(struct nvdebug_state *g, - page_dir_config_t pd_config, - uint64_t addr_to_find, - enum INST_TARGET addr_to_find_aperture) { + page_dir_config_t pd_config, + uint64_t addr_to_find, + enum INST_TARGET addr_to_find_aperture) { uint64_t res, i; // Make sure that the query is page-aligned if (addr_to_find & 0xfff) { @@ -147,9 +147,9 @@ uint64_t search_page_directory(struct nvdebug_state *g, (See `search_page_directory()` for documentation.) */ uint64_t search_v1_page_directory(struct nvdebug_state *g, - page_dir_config_t pd_config, - uint64_t addr_to_find, - enum INST_TARGET addr_to_find_aperture) { + page_dir_config_t pd_config, + uint64_t addr_to_find, + enum INST_TARGET addr_to_find_aperture) { uint64_t j, i = 0; page_dir_entry_v1_t pde; page_tbl_entry_v1_t pte; diff --git a/nvdebug.h b/nvdebug.h index e9ae3db..f644500 100644 --- a/nvdebug.h +++ b/nvdebug.h @@ -64,7 +64,7 @@ struct gk20a; add a USERD pointer, a longer INST pointer, and a runqueue selector flag. 
*/ enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1}; -enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3}; +enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_INVALID = 1, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3}; static inline const char *target_to_text(enum INST_TARGET t) { switch (t) { case TARGET_VID_MEM: @@ -78,7 +78,7 @@ static inline const char *target_to_text(enum INST_TARGET t) { } } -// Support: Volta, Ampere, Turing +// Support: Volta, Turing, Ampere struct gv100_runlist_chan { // 0:63 enum ENTRY_TYPE entry_type:1; @@ -308,7 +308,7 @@ typedef union { } eng_runlist_gf100_t; /* - Starting with Turing, the seperate registers for reading and writing runlist + Starting with Turing, the separate registers for reading and writing runlist configuration were dropped in favor of read/write indexed registers. As part of this, the layout was modified to allow for larger runlist pointers (upper 52 of 64 bits). @@ -362,7 +362,6 @@ enum CHANNEL_STATUS { }; /* Programmable Channel Control System RAM (PCCSR) - 512-entry array of channel control and status data structures. === Read/Write Fields === @@ -391,6 +390,7 @@ enum CHANNEL_STATUS { *Field only available on Turing. Support: Fermi, Maxwell, Pascal, Volta, Turing + See also: manuals/turing/tu104/dev_fifo.ref.txt in NVIDIA's open-gpu-doc */ #define NV_PCCSR_CHANNEL_INST(i) (0x00800000+(i)*8) #define MAX_CHID 512 @@ -611,12 +611,10 @@ typedef union { ENGINE_TYPE : What type of engine is this? (see ENGINE_TYPES_NAMES) Support: Kepler, Maxwell, Pascal, Volta, Turing, Ampere - See dev_top.ref.txt of NVIDIA's open-gpu-doc for more info. + See also: manuals/volta/gv100/dev_top.ref.txt in open-gpu-doc. 
*/ -#define NV_PTOP_DEVICE_INFO_GA100(i) (0x00022800+(i)*4) #define NV_PTOP_DEVICE_INFO_GK104(i) (0x00022700+(i)*4) -#define NV_PTOP_DEVICE_INFO__SIZE_1_GA100(g) (nvdebug_readl(g, 0x0224fc) >> 20) #define NV_PTOP_DEVICE_INFO__SIZE_1_GK104 64 enum DEVICE_INFO_TYPE {INFO_TYPE_NOT_VALID = 0, INFO_TYPE_DATA = 1, INFO_TYPE_ENUM = 2, INFO_TYPE_ENGINE_TYPE = 3}; enum ENGINE_TYPES { @@ -670,34 +668,6 @@ static const char* const ENGINE_TYPES_NAMES[ENGINE_TYPES_LEN] = { "FLA: Fabric Logical Addressing", }; -// These field are from nvgpu/include/nvgpu/hw/ga100/hw_top_ga100.h -typedef union { - // _info type fields - struct { - uint32_t fault_id:11; - uint32_t padding0:5; - uint32_t inst_id:8; - enum ENGINE_TYPES engine_type:7; // "type_enum" - bool has_next_entry:1; - } __attribute__((packed)); - // _info2 type fields - struct { - uint32_t reset_id:8; - uint32_t pri_base:18; // "device_pri_base" - uint32_t padding1:4; - uint32_t is_engine:1; - uint32_t padding2:1; - } __attribute__((packed)); - struct { - uint32_t rleng_id:2; - uint32_t padding3:8; - uint32_t runlist_pri_base:16; - uint32_t padding4:6; - } __attribute__((packed)); - uint32_t raw; -} ptop_device_info_ga100_t; - -// These field are from open-gpu-doc/manuals/volta/gv100/dev_top.ref.txt typedef union { // DATA type fields struct { @@ -737,6 +707,70 @@ typedef union { uint32_t raw; } ptop_device_info_gk104_t; +/* GPU TOPology on Ampere and newer GPUs + On Ampere+, the array of device topology entries continues to describe all GPU + engines, but the layout is entirely different, principally to accommodate a + pointer to the runlist configuration region for each engine. (Runlist + configuration was moved out of the Host (PFIFO) region into per-engine spaces + starting with Ampere.) + + Parsing is somewhat more difficult than with the older version, as entries + no longer include an `info_type`. Instead, each entry has 1--3 subrows, where + `has_next_entry` is 0 for the last subrow. + + Empty rows should be skipped. 
+ + HAS_NEXT_ENTRY : Is the following entry a descriptor of the same engine? + + == Subrow 1 fields == + FAULT_ID : [UNKNOWN] + INST_ID : [UNKNOWN] + ENGINE_TYPE : Enumerated name of the type of engine. (Seemingly identical + to ENGINE_ENUM in old PTOP layout.) + + == Subrow 2 fields == + RESET_ID : [UNKNOWN] + PRI_BASE : [UNKNOWN] + IS_ENGINE : Does this entry describe an engine with a runlist? (Seemingly + identical to RUNLIST_IS_VALID in old PTOP layout.) + + == Subrow 3 fields == + RUNLIST_PRI_BASE : Offset in BAR0 of the RunList RAM (RLRAM) region for the + runlist of this engine. + RLENG_ID : What is the per-runlist ID of this engine? + + Support: Ampere, Ada, Hopper, (and newer likely) + See also: hw_top_ga100.h in nvgpu (NVIDIA's open-source Jetson GPU driver) +*/ +#define NV_PTOP_DEVICE_INFO_GA100(i) (0x00022800+(i)*4) +#define NV_PTOP_DEVICE_INFO__SIZE_1_GA100(g) (nvdebug_readl(g, 0x0224fc) >> 20) + +typedef union { + // _info type fields + struct { + uint32_t fault_id:11; + uint32_t padding0:5; + uint32_t inst_id:8; + enum ENGINE_TYPES engine_type:7; // "type_enum" + bool has_next_entry:1; + } __attribute__((packed)); + // _info2 type fields + struct { + uint32_t reset_id:8; + uint32_t pri_base:18; // "device_pri_base" + uint32_t padding1:4; + uint32_t is_engine:1; + uint32_t padding2:1; + } __attribute__((packed)); + struct { + uint32_t rleng_id:2; + uint32_t padding3:8; + uint32_t runlist_pri_base:16; + uint32_t padding4:6; + } __attribute__((packed)); + uint32_t raw; +} ptop_device_info_ga100_t; + /* Graphics Processing Cluster (GPC) on-chip information The GPU's Compute/Graphics engine is subdivided into Graphics Processing Clusters (also known as GPU Processing Clusters, starting with Ampere). @@ -792,21 +826,35 @@ typedef union { SCAL_NUM_CES : Number of externally accessible copy engines Errata: Incorrectly reports "3" on Jetson TX1 and TX2. Should report "1" to be - consistent with PTOP data. + consistent with PTOP data. 
Support: Kepler through (at least) Blackwell Also see dev_ce.ref.txt of NVIDIA's open-gpu-doc for info. */ #define NV_PTOP_SCAL_NUM_CES 0x00022444 +// Defined LCE->PCE mapping offset from nvgpu (same as ce_pce2lce_config_r(i) in nvgpu) +#define NV_LCE_FOR_PCE_GP100 0x0010402c +#define NV_LCE_FOR_PCE_GV100(i) (0x00104040+(i)*4) +#define NV_LCE_FOR_PCE_GA100(i) (0x00104100+(i)*4) +/* GRaphics Copy Engine (GRCE) Information + "There's two types of CE... ASYNC_CEs which are copy engines with their own + runlists and GRCEs which are CEs that share a runlist with GR." (nvgpu, + ioctl_ctrl.c) + + Starting with Pascal, the GRCEs are LCEs 0 and 1, but have the added capability + to share a PCE with another LCE. (Normally a PCE may only be associated with + one LCE.) These registers include that configuration, which should only be set + if no PCE has been directly associated with the specific GRCE. + + Support: Pascal through (at least) Ada + Note that Volta through Ada use a different bit format than Pascal. 
+*/ // Defined max number of GRCEs for a GPU (TX2 has only one) # define NV_GRCE_MAX 2 // Defined GRCE->CE mapping offsets from nvgpu #define NV_GRCE_FOR_CE_GP100(i) (0x00104034+(i)*4) #define NV_GRCE_FOR_CE_GA100(i) (0x001041c0+(i)*4) -// Defined LCE->PCE mapping offset from nvgpu (same as ce_pce2lce_config_r(i) in nvgpu) -#define NV_LCE_FOR_PCE_GP100 0x0010402c -#define NV_LCE_FOR_PCE_GV100(i) (0x00104040+(i)*4) -#define NV_LCE_FOR_PCE_GA100(i) (0x00104100+(i)*4) + // Struct for use with nvdebug_reg_range_read() union reg_range { struct { @@ -1294,13 +1342,18 @@ struct runlist_iter { int entries_left_in_tsg; // Number of entries in runlist int len; + // Offset to start of Channel RAM (as this is per-runlist on Ampere+) + uint32_t channel_ram; }; #define NVDEBUG_MAX_DEVICES 8 extern struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES]; // Defined in runlist.c -int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl_iter); +int get_runlist_iter( + struct nvdebug_state *g, + int rl_id, + struct runlist_iter *rl_iter /* out */); int preempt_tsg(struct nvdebug_state *g, uint32_t tsg_id); int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id); int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id); @@ -1318,7 +1371,7 @@ uint64_t search_v1_page_directory( enum INST_TARGET addr_to_find_aperture); // Defined in bus.c int addr_to_pramin_mut(struct nvdebug_state *g, uint64_t addr, enum INST_TARGET target); -int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd); +int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd /* out */); // Some portions of nvdebug can be included from kernel- or user-space (just // this file at present). 
In order for these compiled object files to be diff --git a/runlist.c b/runlist.c index 91fca82..2e9577d 100644 --- a/runlist.c +++ b/runlist.c @@ -8,8 +8,10 @@ #include "nvdebug.h" // Uncomment to, upon BAR2 access failure, return a PRAMIN-based runlist pointer -// **If enabled, PRAMIN may not be otherwise used while walking the runlist!** -// Runlists can only be printed on the Jetson TX2 if this is enabled. +// in get_runlist_iter(). In order for this pointer to remain valid, PRAMIN +// **must** not be moved during runlist traversal. +// The Jetson TX2 has no BAR2, and stores the runlist in VID_MEM, so this must +// be enabled to print the runlist on the TX2. //#define FALLBACK_TO_PRAMIN /* Get runlist head and info (incl. length) @@ -142,12 +144,16 @@ int preempt_runlist(struct nvdebug_state *g, uint32_t rl_id) { int resubmit_runlist(struct nvdebug_state *g, uint32_t rl_id) { if (g->chip_id < NV_CHIP_ID_TURING) { eng_runlist_gf100_t rl; + if (rl_id > MAX_RUNLISTS_GF100) + return -EINVAL; if ((rl.raw = nvdebug_readq(g, NV_PFIFO_ENG_RUNLIST_BASE_GF100(rl_id))) == -1) return -EIO; rl.id = rl_id; nvdebug_writeq(g, NV_PFIFO_RUNLIST_BASE_GF100, rl.raw); } else if (g->chip_id < NV_CHIP_ID_AMPERE) { runlist_submit_tu102_t submit; + if (rl_id > MAX_RUNLISTS_TU102) + return -EINVAL; if ((submit.raw = nvdebug_readq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id))) == -1) return -EIO; nvdebug_writeq(g, NV_PFIFO_RUNLIST_SUBMIT_TU102(rl_id), submit.raw); diff --git a/runlist_procfs.c b/runlist_procfs.c index 986465d..8152463 100644 --- a/runlist_procfs.c +++ b/runlist_procfs.c @@ -3,7 +3,7 @@ #include "nvdebug_linux.h" -#define RUNLIST_PROCFS_NAME "runlist" +// Uncomment to expand channel status information when printing the runlist #define DETAILED_CHANNEL_INFO #ifdef DETAILED_CHANNEL_INFO @@ -197,12 +197,7 @@ ssize_t resubmit_runlist_file_write(struct file *f, const char __user *buffer, if (err) return err; - // Verify valid runlist (in terms of absolute maximums) - if (g->chip_id 
< NV_CHIP_ID_TURING && target_runlist > MAX_RUNLISTS_GF100) - return -ERANGE; - else if (g->chip_id < NV_CHIP_ID_AMPERE && target_runlist > MAX_RUNLISTS_TU102) - return -ERANGE; - + // resubmit_runlist() checks that target_runlist is valid if ((err = resubmit_runlist(g, target_runlist))) return err; -- cgit v1.2.2