1 files changed, 673 insertions, 46 deletions
diff --git a/nvdebug.h b/nvdebug.h
index 9ac71da..1882756 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -5,14 +5,18 @@
 // TODO(jbakita): Don't depend on these.
 #include <nvgpu/gk20a.h>  // For struct gk20a
 #include <os/linux/os_linux.h>  // For struct nvgpu_os_linux
+#include <linux/proc_fs.h>  // For PDE_DATA() macro
 /* Runlist Channel
  A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue
  of GPU commands. These commands are typically queued from userspace.
-  `INST_PTR` points to a GPU Instance Block which contains pointers to the GPU
+  Prior to Volta, channels could also exist independent of a TSG. These are
-  virtual address space for this context. All channels in a TSG point to the
+  called "bare channels" in the Jetson nvgpu driver.
-  same GPU Instance Block (?).
+  `INST_PTR` points to a GPU Instance Block which contains FIFO states, virtual
+  address space configuration for this context, and a pointer to the page
+  tables. All channels in a TSG point to the same GPU Instance Block (?).
  "RUNQUEUE_SELECTOR determines to which runqueue the channel belongs, and
  thereby which PBDMA will run the channel.  Increasing values select
@@ -30,7 +34,13 @@
  ENTRY_TYPE (T)        : type of this entry: ENTRY_TYPE_CHAN
  CHID (ID)             : identifier of the channel to run (overlays ENTRY_ID)
  RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if
-                          more than one PBDMA is supported by the runlist
+                          more than one PBDMA is supported by the runlist,
+                          additionally, "A value of 0 targets the first FE
+                          pipe, which can process all FE driven engines:
+                          Graphics, Compute, Inline2Memory, and TwoD.  A value
+                          of 1 targets the second FE pipe, which can only
+                          process Compute work.  Note that GRCE work is allowed
+                          on either runqueue.)"
  INST_PTR_LO           : lower 20 bits of the 4k-aligned instance block pointer
  INST_PTR_HI           : upper 32 bit of instance block pointer
@@ -39,6 +49,9 @@
  USERD_PTR_LO          : upper 24 bits of the low 32 bits, of the 512-byte-aligned USERD pointer
  USERD_PTR_HI          : upper 32 bits of USERD pointer
  USERD_TARGET (TGU)    : aperture of the USERD data structure
+  Channels were around since at least Fermi, but were rearranged with Volta to
+  add a USERD pointer, a longer INST pointer, and a runqueue selector flag.
 */
 enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1};
 enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3};
@@ -52,11 +65,12 @@ static inline char* target_to_text(enum INST_TARGET t) {
                        return "SYS_MEM_NONCOHERENT";
                default:
                        printk(KERN_WARNING "[nvdebug] Invalid aperture!\n");
-                        return NULL;
+                        return "INVALID";
        }
 }
-struct runlist_chan {
+// Support: Volta, Ampere, Turing
+struct gv100_runlist_chan {
 // 0:63
        enum ENTRY_TYPE entry_type:1;
        uint32_t runqueue_selector:1;
@@ -71,6 +85,20 @@ struct runlist_chan {
        uint32_t inst_ptr_hi:32;
 } __attribute__((packed));
+// Support: Fermi, Kepler*, Maxwell, Pascal
+// *In Kepler, inst fields may be unpopulated?
+struct gm107_runlist_chan {
+// 0:31
+        uint32_t chid:12;
+         uint32_t padding0:1;
+        enum ENTRY_TYPE entry_type:1;
+         uint32_t padding1:18;
+// 32:63
+        uint32_t inst_ptr_lo:20;
+        enum INST_TARGET inst_target:2;  // Totally guessing on this
+         uint32_t padding2:10;
+} __attribute__((packed));
+#define gk110_runlist_chan gm107_runlist_chan
 /* Runlist TSG (TimeSlice Group)
  The runlist is composed of timeslice groups (TSG). Each TSG corresponds
  to a single virtual address space on the GPU and contains `TSG_LENGTH`
@@ -85,8 +113,15 @@ struct runlist_chan {
  TIMESLICE_TIMEOUT   : timeout amount for the TSG's timeslice
  TSG_LENGTH          : number of channels that are part of this timeslice group
  TSGID               : identifier of the Timeslice group (overlays ENTRY_ID)
+  TSGs appear to have been introduced with Kepler and stayed the same until
+  they were rearranged at the time of channel rearrangement to support longer
+  GPU instance addresses with Volta.
 */
-struct entry_tsg {
+// Support: Volta, Ampere*, Turing*
+// *These treat the top 8 bits of TSGID as GFID (unused)
+struct gv100_runlist_tsg {
 // 0:63
        enum ENTRY_TYPE entry_type:1;
         uint64_t padding:15;
@@ -101,14 +136,28 @@ struct entry_tsg {
 } __attribute__((packed));
 #define MAX_TSGID (1 << 12)
+// Support: Kepler (v2?), Maxwell, Pascal
+// Same fields as Volta except tsg_length is 6 bits rather than 8
+// Last 32 bits appear to contain an undocumented inst ptr
+struct gk110_runlist_tsg {
+// 0:31
+        uint32_t tsgid:12;
+         uint32_t padding0:1;
+        enum ENTRY_TYPE entry_type:1;
+        uint32_t timeslice_scale:4;
+        uint32_t timeslice_timeout:8;
+        uint32_t tsg_length:6;
+// 32:63 (see note above: may hold an undocumented inst ptr)
+         uint32_t padding1:32;
+} __attribute__((packed));
 enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1};
 /* Preempt a TSG or Channel by ID
-  ID/CHID             : Id of TSG or channel to preempt
+  ID/CHID     : Id of TSG or channel to preempt
-  IS_PENDING          : ????
+  IS_PENDING  : Is a context switch pending?
-  TYPE                : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG
+  TYPE        : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG
-  Support: Kepler, Maxwell, Pascal, Volta
+  Support: Kepler, Maxwell, Pascal, Volta, Turing
 */
 #define NV_PFIFO_PREEMPT 0x00002634
 typedef union {
@@ -195,26 +244,36 @@ typedef union {
 */
 // Note: This is different with Turing
-// Support: Kepler, Maxwell, Pascal, Volta
+// Support: Fermi, Kepler, Maxwell, Pascal, Volta
 #define NV_PFIFO_RUNLIST_BASE 0x00002270
+#define NV_PFIFO_ENG_RUNLIST_BASE(i) (0x00002280+(i)*8)
 typedef union {
        struct {
                uint32_t ptr:28;
-                uint32_t type:2;
+                enum INST_TARGET target:2;
                 uint32_t padding:2;
        } __attribute__((packed));
        uint32_t raw;
 } runlist_base_t;
 // Support: Kepler, Maxwell, Pascal, Volta
+// Works on Fermi, but id is one bit longer and is b11111
 #define NV_PFIFO_RUNLIST 0x00002274
+#define NV_PFIFO_ENG_RUNLIST(i) (0x00002284+(i)*8)
 typedef union {
+        // RUNLIST fields
         struct {
                 uint32_t len:16;
                  uint32_t padding:4;
-                uint32_t id:4;
+                uint32_t id:4; // Runlist ID (each engine may have a separate runlist)
                  uint32_t padding2:8;
         } __attribute__((packed));
+        // ENG_RUNLIST fields that differ (is_pending overlays the lowest bit of
+        // the id field in the RUNLIST view above)
+        struct {
+                 uint32_t padding3:20;
+                bool is_pending:1; // Is runlist not yet committed?
+                 uint32_t padding4:11;
+        } __attribute__((packed));
         uint32_t raw;
 } runlist_info_t;
@@ -301,63 +360,631 @@ typedef union {
        uint32_t raw;
 } runlist_disable_t;
+/* Read GPU descriptors from the Master Controller (MC)
+  MINOR_REVISION  : Legacy (only used with Celvin in Nouveau)
+  MAJOR_REVISION  : Legacy (only used with Celvin in Nouveau)
+  IMPLEMENTATION  : Which implementation of the GPU architecture
+  ARCHITECTURE    : Which GPU architecture
+  CHIP_ID = IMPLEMENTATION + ARCHITECTURE << 4
+  CHIP_ID         : Unique ID of all chips since Kelvin
+  Support: Kelvin, Rankine, Curie, Tesla, Fermi, Kepler, Maxwell, Pascal,
+           Volta, Turing, Ampere
+*/
+#define NV_MC_BOOT_0 0x00000000
+#define NV_CHIP_ID_GP106 0x136 // Discrete GeForce GTX 1060
+#define NV_CHIP_ID_GV11B 0x15B // Jetson Xavier embedded GPU
+#define NV_CHIP_ID_KEPLER 0x0E0
+#define NV_CHIP_ID_VOLTA 0x140
+// Map a GPU architecture number (intended for the `architecture` field of
+// mc_boot_0_t) to a human-readable architecture name.
+// Always returns a string literal, never NULL: numbers with no known mapping
+// yield a bracketed placeholder that distinguishes gaps in the historical range
+// from values newer than any listed here.
+inline static const char* ARCH2NAME(uint32_t arch) {
+        switch (arch) {
+        case 0x01:
+                return "Celsius";
+        case 0x02:
+                return "Kelvin";
+        case 0x03:
+                return "Rankine";
+        case 0x04:
+        case 0x06: // 0x06 is (nForce 6XX integrated only)
+                return "Curie";
+        // 0x07 is unused/skipped
+        case 0x05: // First Tesla card was released before the nForce 6XX
+        case 0x08:
+        case 0x09:
+        case 0x0A:
+                return "Tesla";
+        // 0x0B is unused/skipped
+        case 0x0C:
+        case 0x0D:
+                return "Fermi";
+        case 0x0E:
+        case 0x0F:
+        case 0x11:
+                return "Kepler";
+        case 0x12:
+                return "Maxwell";
+        case 0x13:
+                return "Pascal";
+        case 0x14:
+        case 0x15: // Volta integrated
+                return "Volta";
+        case 0x16:
+                return "Turing";
+        case 0x17:
+                return "Ampere";
+        case 0x18:
+        case 0x19:
+                return "Hopper (?) or Lovelace (?)";
+        default:
+                // Anything below the newest listed value is a gap in the known
+                // numbering; anything above it postdates this table.
+                if (arch < 0x19)
+                        return "[unknown historical architecture]";
+                else
+                        return "[future]";
+        }
+}
+// Bit-field views of the 32-bit NV_MC_BOOT_0 value; `raw` aliases both views.
+typedef union {
+        // Fields as defined in the NVIDIA reference
+        struct {
+                uint32_t minor_revision:4;
+                uint32_t major_revision:4;
+                 uint32_t reserved:4;
+                 uint32_t padding0:8;
+                uint32_t implementation:4;
+                uint32_t architecture:5;
+                 uint32_t padding1:3;
+        } __attribute__((packed));
+        uint32_t raw;
+        // (architecture << 4) | implementation is also often used as a single
+        // 9-bit chip ID; chip_id overlays exactly those two fields.
+        struct {
+                 uint32_t padding2:20;
+                uint32_t chip_id:9;
+                 uint32_t padding3:3;
+        } __attribute__((packed));
+} mc_boot_0_t;
+enum DEVICE_INFO_TYPE {INFO_TYPE_NOT_VALID = 0, INFO_TYPE_DATA = 1, INFO_TYPE_ENUM = 2, INFO_TYPE_ENGINE_TYPE = 3};
+enum ENGINE_TYPES {
+        ENGINE_GRAPHICS = 0, // GRAPHICS [/compute]
+        ENGINE_COPY0 = 1, // [raw/physical] COPY #0
+        ENGINE_COPY1 = 2, // [raw/physical] COPY #1
+        ENGINE_COPY2 = 3, // [raw/physical] COPY #2
+        ENGINE_MSPDEC = 8, // Picture DECoder
+        ENGINE_MSPPP = 9, // [Video] Post Processing
+        ENGINE_MSVLD = 10, // [Video] Variable Length Decoder
+        ENGINE_MSENC = 11, // [Video] ENCoding
+        ENGINE_VIC = 12, // Video Image Compositor
+        ENGINE_SEC = 13, // SEquenCer [?]
+        ENGINE_NVENC0 = 14, // Nvidia Video ENCoder #0
+        ENGINE_NVENC1 = 15, // Nvidia Video ENCoder #1
+        ENGINE_NVDEC = 16, // Nvidia Video DECoder
+        ENGINE_IOCTRL = 18, // I/O ConTRoLler [of NVLINK at least]
+        ENGINE_LCE = 19, // Logical Copy Engine
+        ENGINE_GSP = 20, // Gpu System Processor
+        ENGINE_NVJPG = 21, // NVidia JPeG [Decoder] (Ampere+)
+};
+// Number of entries in ENGINE_TYPES_NAMES (one more than the largest value in
+// enum ENGINE_TYPES).
+#define ENGINE_TYPES_LEN 22
+// Printable names indexed by enum ENGINE_TYPES value. Values with no
+// enumerator hold an "Unknown Engine ID#n" placeholder so the table is dense.
+// Callers must check that the index is below ENGINE_TYPES_LEN before use.
+static const char* const ENGINE_TYPES_NAMES[ENGINE_TYPES_LEN] = {
+        "Graphics/Compute",
+        "COPY0",
+        "COPY1",
+        "COPY2",
+        "Unknown Engine ID#4",
+        "Unknown Engine ID#5",
+        "Unknown Engine ID#6",
+        "Unknown Engine ID#7",
+        "MSPDEC: Picture Decoder",
+        "MSPPP: Post Processing",
+        "MSVLD: Variable Length Decoder",
+        "MSENC: Encoder",
+        "VIC: Video Image Compositor",
+        "SEC: Sequencer",
+        "NVENC0: NVIDIA Video Encoder #0",
+        "NVENC1: NVIDIA Video Encoder #1",
+        "NVDEC: NVIDIA Video Decoder",
+        "Unknown Engine ID#17",
+        "IOCTRL: I/O Controller",
+        "LCE: Logical Copy Engine",
+        "GSP: GPU System Processor",
+        "NVJPG: NVIDIA JPEG Decoder",
+};
+/* GPU engine information and control register offsets
+  Each engine is described by one or more entries (terminated by an entry with
+  the `has_next_entry` flag unset) in the fixed-size PTOP_DEVICE_INFO table. A
+  typical device, such as the graphics/compute engine or a copy engine, is
+  described by three entries, one of each type.
+  The PTOP_DEVICE_INFO table is sparsely populated (entries of type
+  INFO_TYPE_NOT_VALID may be intermingled with valid entries), so any traversal
+  code should check all NV_PTOP_DEVICE_INFO__SIZE_1 entries and not terminate
+  upon reaching the first entry of INFO_TYPE_NOT_VALID.
+  INFO_TYPE          : Is this a DATA, ENUM, or ENGINE_TYPE table entry?
+  HAS_NEXT_ENTRY     : Does the following entry refer to the same engine?
+  == INFO_TYPE_DATA fields ==
+  PRI_BASE           : BAR0 base = (PRI_BASE << 12) aka 4k aligned.
+  INST_ID            : "Note that some instanced [engines] (such as logical copy
+                       engines aka LCE) share a PRI_BASE across all [engines] of
+                       the same engine type; such [engines] require an additional
+                       offset: instanced base = BAR0 base + stride * INST_ID."
+  FAULT_ID_IS_VALID  : Does this engine have its own bind point and fault ID
+                       with the MMU?
+  FAULT_ID           : "The MMU fault id used by this [engine]. These IDs
+                       correspond to the NV_PFAULT_MMU_ENG_ID define list."
+  == INFO_TYPE_ENUM fields ==
+  ENGINE_IS_VALID    : Is this engine a host engine?
+  ENGINE_ENUM        : "[T]he host engine ID for the current [engine] if it is
+                       a host engine, meaning Host can send methods to the
+                       engine. This id is used to index into any register array
+                       whose __SIZE_1 is equal to NV_HOST_NUM_ENGINES.  A given
+                       ENGINE_ENUM can be present for at most one device in the
+                       table.  Devices corresponding to all ENGINE_ENUM ids 0
+                       through NV_HOST_NUM_ENGINES - 1 must be present in the
+                       device info table."
+  RUNLIST_IS_VALID   : Is this engine a host engine with a runlist?
+  RUNLIST_ENUM       : "[T]he Host runlist ID on which methods for the current
+                       [engine] should be submitted... The runlist id is used to
+                       index into any register array whose __SIZE_1 is equal to
+                       NV_HOST_NUM_RUNLISTS. [Engines] corresponding to all
+                       RUNLIST_ENUM ids 0 through NV_HOST_NUM_RUNLISTS - 1 must
+                       be present in the device info table."
+  INTR_IS_VALID      : Does this device have an interrupt?
+  INTR_ENUM          : Interrupt ID for use with "the NV_PMC_INTR_*_DEVICE
+                       register bitfields."
+  RESET_IS_VALID     : Does this engine have a reset ID?
+  RESET_ENUM         : Reset ID for use indexing the "NV_PMC_ENABLE_DEVICE(i)
+                       and NV_PMC_ELPG_ENABLE_DEVICE(i) register bitfields."
+  == INFO_TYPE_ENGINE_TYPE fields ==
+  ENGINE_TYPE        : What type of engine is this? (see ENGINE_TYPES_NAMES) 
+  Support: Kepler, Maxwell, Pascal, Volta, Ampere
+  See dev_top.ref.txt of NVIDIA's open-gpu-doc for more info.
+*/
+#define NV_PTOP_DEVICE_INFO(i) (0x00022700+(i)*4)
+#define NV_PTOP_DEVICE_INFO__SIZE_1 64
+// One 32-bit NV_PTOP_DEVICE_INFO table entry. Which view applies depends on
+// info_type. info_type and has_next_entry are declared only in the DATA view,
+// but the other views merely pad over the same bits (the first two and the
+// last one), so both may be read for any entry type.
+typedef union {
+        // DATA type fields
+        struct {
+                enum DEVICE_INFO_TYPE info_type:2;
+                bool fault_id_is_valid:1;
+                uint32_t fault_id:7;
+                 uint32_t padding0:2;
+                uint32_t pri_base:12;
+                 uint32_t padding1:2;
+                uint32_t inst_id:4;
+                uint32_t is_not_enum2:1;
+                bool has_next_entry:1;
+        } __attribute__((packed));
+        // ENUM type fields
+        struct {
+                 uint32_t padding2:2;
+                bool reset_is_valid:1;
+                bool intr_is_valid:1;
+                bool runlist_is_valid:1;
+                bool engine_is_valid:1;
+                 uint32_t padding3:3;
+                uint32_t reset_enum:5;
+                 uint32_t padding4:1;
+                uint32_t intr_enum:5;
+                 uint32_t padding5:1;
+                uint32_t runlist_enum:4;
+                 uint32_t padding6:1;
+                uint32_t engine_enum:4;
+                 uint32_t padding7:2;
+        } __attribute__((packed));
+        // ENGINE_TYPE type fields
+        struct {
+                 uint32_t padding8:2;
+                enum ENGINE_TYPES engine_type:29;
+                 uint32_t padding9:1;
+        } __attribute__((packed));
+        // Whole entry as read from NV_PTOP_DEVICE_INFO(i)
+        uint32_t raw;
+} ptop_device_info_t;
+#define NV_PTOP_SCAL_NUM_GPCS 0x00022430
+#define NV_PTOP_SCAL_NUM_TPC_PER_GPC 0x00022434
+#define NV_PTOP_SCAL_NUM_CES 0x00022444
+// PCE_MAP is Volta+ only
+#define NV_CE_PCE_MAP 0x00104028
+// GPC and TPC masks
+// Support: Maxwell+
+#define NV_FUSE_GPC 0x00021c1c
+#define NV_FUSE_TPC_FOR_GPC(i) (0x00021c38+(i)*4)
+/* Location of the 1Kb instance block with page tables for BAR1 and BAR2.
+  Support: Fermi+ (?), Pascal
+*/
+#define NV_PBUS_BAR1_BLOCK 0x00001704
+#define NV_PBUS_BAR2_BLOCK 0x00001714
+// 32-bit value layout, presumably for the NV_PBUS_BAR1_BLOCK and
+// NV_PBUS_BAR2_BLOCK registers (usage is not visible in this header).
+typedef union {
+        struct {
+                uint32_t ptr:28;
+                enum INST_TARGET target:2;
+                 uint32_t padding0:1;
+                bool is_virtual:1;
+        } __attribute__((packed));
+        uint32_t raw;
+        // Alternate view: map overlays ptr and target as a single 30-bit field
+        struct {
+                uint32_t map:30;
+                 uint32_t padding1:2;
+        } __attribute__((packed));
+} bar_config_block_t;
+/* BAR0 PRAMIN (Private RAM Instance) window configuration
+  BASE    : Base of window >> 16 in [TARGET] virtual address space
+  TARGET  : Which address space BASE points into
+  Note: This seems to be set to 0x0bff00000 - 0x0c0000000 at least sometimes
+  Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere
+*/
+#define NV_PBUS_BAR0_WINDOW 0x00001700
+#define NV_PRAMIN 0x00700000  // Goes until 0x00800000 (1MB window)
+#define NV_PRAMIN_LEN 0x00100000
+// 32-bit value layout with the BASE and TARGET fields of the BAR0 window
+// configuration; `raw` aliases the bit-fields.
+typedef union {
+        struct {
+                uint32_t base:24;
+                enum INST_TARGET target:2;
+                 uint32_t padding0:6;
+        } __attribute__((packed));
+        uint32_t raw;
+} bar0_window_t;
+// Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere
+#define NV_PRAMIN_PDB_CONFIG_OFF 0x200
+// 64-bit page directory configuration; `raw` aliases the bit-fields.
+// NOTE(review): presumably read at offset NV_PRAMIN_PDB_CONFIG_OFF within an
+//               instance block - confirm against the callers.
+typedef union {
+        struct {
+                uint32_t target:2;
+                uint32_t vol:1;
+                 uint32_t padding0:1;
+                uint32_t fault_replay_tex:1;
+                uint32_t fault_replay_gcc:1;
+                 uint32_t padding1:4;
+                bool is_ver2:1;
+                bool is_64k_big_page:1;  // 128Kb otherwise
+                // The 12 bits of fields above precede page_dir_lo, so hi:lo
+                // presumably forms a 4k-aligned address >> 12 (TODO confirm)
+                uint32_t page_dir_lo:20;
+                uint32_t page_dir_hi:32;
+        } __attribute__((packed));
+        uint64_t raw;
+} page_dir_config_t;
+/* Page directory entry
+  Note: Format changed with Pascal (how?)
+  Support: Pascal, Volta, Turing, Ampere
+*/
+// FIXME: PDE/PTEs are actually 64 bits =S
+// Important: Aperture keys are different with PDEs
+enum PD_TARGET {
+        PD_AND_TARGET_INVALID = 0,  // b000
+        PD_AND_TARGET_VID_MEM = 2,  // b010
+        PD_AND_TARGET_SYS_MEM_COHERENT = 4,  // b100
+        PD_AND_TARGET_SYS_MEM_NONCOHERENT = 6,  // b110
+        PTE_AND_TARGET_VID_MEM = 1,  // b001
+        PTE_AND_TARGET_PEER = 3,  // b011
+        PTE_AND_TARGET_SYS_MEM_COHERENT = 5,  // b101
+        PTE_AND_TARGET_SYS_MEM_NONCOHERENT = 7,  // b111
+};
+// Convert a combined PDE/PTE aperture value to a printable name.
+// The PD_AND_* and PTE_AND_* variants of the same aperture map to the same
+// string. For a value outside enum PD_TARGET, a warning is logged and
+// "INVALID" is returned, matching target_to_text(). The result is never NULL,
+// so it can always be passed to a "%s" format.
+static inline char* pd_target_to_text(enum PD_TARGET t) {
+        switch (t) {
+                case PD_AND_TARGET_INVALID:
+                        return "INVALID";
+                case PD_AND_TARGET_VID_MEM:
+                case PTE_AND_TARGET_VID_MEM:
+                        return "VID_MEM";
+                case PTE_AND_TARGET_PEER:
+                        return "PEER";
+                case PD_AND_TARGET_SYS_MEM_COHERENT:
+                case PTE_AND_TARGET_SYS_MEM_COHERENT:
+                        return "SYS_MEM_COHERENT";
+                case PD_AND_TARGET_SYS_MEM_NONCOHERENT:
+                case PTE_AND_TARGET_SYS_MEM_NONCOHERENT:
+                        return "SYS_MEM_NONCOHERENT";
+                default:
+                        printk(KERN_WARNING "[nvdebug] Invalid aperture!\n");
+                        return "INVALID";
+        }
+}
+// PDE/PTE V2 type
+// Note: As the meaning of target (bits 2:1) changes depending on if the entry
+//       is a PTE or not, this combines them into a single target field to
+//       simplify comparisons.
+// Support: Pascal, Turing, Ampere
+// Two views of the same 32 bits. Fields prefixed with a double underscore are
+// placeholders for bits that are named in the other view; use the un-prefixed
+// name (target, is_volatile, addr) instead.
+typedef union {
+        // Page Directory Entry (PDE)
+        struct {
+                bool is_pte:1;
+                 uint32_t __target:2;
+                bool is_volatile:1;
+                 uint32_t padding1:4;
+                uint32_t addr:24;
+        } __attribute__((packed));
+        // Page Table Entry (PTE)
+        struct {
+                // Overlays is_pte and __target, so the low bit of a
+                // enum PD_TARGET value tells PTE_* apart from PD_*
+                enum PD_TARGET target:3;
+                 uint32_t __is_volatile:1;
+                bool is_encrypted:1;
+                bool is_privileged:1;
+                bool is_readonly:1;
+                bool atomics_disabled:1;
+                 uint32_t __addr:24;
+        } __attribute__((packed));
+        uint32_t raw;
+} page_dir_entry_t;
+// PDE/PTE V1 types
+// Support: Fermi, Kepler, Maxwell
+enum V1_PD_TARGET {
+        PD_TARGET_INVALID = 0,
+        PD_TARGET_VID_MEM = 1,
+        PD_TARGET_SYS_MEM_COHERENT = 2,
+        PD_TARGET_SYS_MEM_NONCOHERENT = 3,
+};
+// Page Directory Entry (PDE)
+// 64-bit V1 PDE. The first 32 bits describe the large-page table and the last
+// 32 bits the small-page table; `raw` aliases both views.
+typedef union {
+// Large page fields
+        struct {
+// 0:31
+                enum V1_PD_TARGET target:2;
+                 uint32_t padding0:2;
+                uint64_t addr:28;  // May be wider?
+// 32:63
+                 uint32_t padding2:3;
+                uint32_t is_volatile:1; // Might have counted wrong?
+                 uint32_t padding3:28;
+        } __attribute__((packed));
+// Small page fields
+        struct {
+// 0:31
+                 uint32_t padding00:32;
+// 32:63
+                enum V1_PD_TARGET alt_target:2;
+                uint32_t alt_is_volatile:1; // Might have counted wrong?
+                 uint32_t padding03:1;
+                uint64_t alt_addr:28;
+        } __attribute__((packed));
+        uint64_t raw;
+} page_dir_entry_v1_t;
+// Page Table Entry (PTE)
+// Reconstructed from info in Jetson nvgpu driver
+// 64-bit V1 PTE; `raw` aliases the bit-fields.
+typedef union {
+        struct {
+// 0:31
+                bool is_present:1;
+                bool is_privileged:1;
+                bool is_readonly:1;
+                 uint32_t padding0:1;
+                uint64_t addr:28;
+// 32:63
+                bool is_volatile:1;
+                // Aperture that addr points into. This bit-field was previously
+                // declared without a name, which made it unreadable.
+                enum INST_TARGET target:2;
+                 uint32_t padding1:1;
+                uint32_t kind:8;
+                uint32_t comptag:17;
+                 uint32_t padding2:1;
+                bool is_read_disabled:1;
+                bool is_write_disabled:1;
+        } __attribute__((packed));
+        uint64_t raw;
+} page_tbl_entry_v1_t;
+//enum V0_PDE_TYPE {NOT_PRESENT = 0, PAGE_64K = 1, PAGE_16K = 2, PAGE_4K = 3};
+//enum V0_PDE_SIZE {PDE_SZ_128K = 0, PDE_SZ_32K = 1, PDE_SZ_16K = 2, PDE_SZ_8K = 3};
+//static const int V0_PDE_SIZE2NUM[4] = {128*1024, 32*1024, 16*1024, 8*1024};
+/* PDE V0 (nv50/Tesla)
+typedef union {
+        struct {
+                enum V1_PDE_TYPE type:2;
+                enum INST_TARGET target:2;
+                 uint32_t padding0:1;
+                enum V1_PDE_SIZE sublevel_size:2;
+                 uint32_t padding1:5;
+                uint32_t addr:28;
+                 uint32_t padding2:24;
+        } __attribute__((packed));
+        uint64_t raw;
+} page_dir_entry_v1_t;*/
+/* PTE V0 (nv50)
+typedef union {
+        struct {
+                bool is_present:1;
+                 uint32_t padding3:2;
+                bool is_readonly:1;
+                enum INST_TARGET target:2;
+                bool is_privileged:1;
+                uint32_t contig_blk_sz:3;
+                 uint32_t padding4:2;
+                uint32_t addr:28;
+                uint32_t storage_type:7;  // ???
+                uint32_t compression_mode:2;  // ???
+                uint32_t compression_tag:12;  // ???
+                bool is_long_partition_cycle:1;  // ???
+                bool is_encrypted:1;
+                 uint32_t padding5:1;
+        } __attribute__((packed));
+        uint64_t raw;
+} page_tbl_entry_v1_t;*/
 // TODO(jbakita): Maybe put the above GPU types in a different file.
-#define for_chan_in_tsg(chan, tsg) \
+#define NV_PCI_VENDOR 0x10de
-        for (chan = (struct runlist_chan*)(tsg + 1); \
+struct nvdebug_state {
-             (void*)chan < (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length; \
+        // Pointer to the mapped base address of the GPU control registers (obtained
-             chan++)
+        // via ioremap() originally). For embedded GPUs, we extract this from their
+        // struct nvgpu_os_linux. For discrete GPUs, we create our own mapping of
+        // BAR0 with pci_iomap(). Access via the nvdebug_readl/writel functions.
+        void __iomem *regs;
+        // Depending on the architecture, BAR2 or BAR3 are used to access PRAMIN
+        // (both names alias the same pointer)
+        union {
+                void __iomem *bar2;
+                void __iomem *bar3;
+        };
+        // Chip ID, compared against the NV_CHIP_ID_* constants to select
+        // architecture-specific structure layouts
+        int chip_id;
+        // Additional state from the built-in driver. Set iff
+        // chip_id == NV_CHIP_ID_GV11B
+        struct gk20a *g;
+        // Pointer to PCI device needed for pci_iounmap
+        struct pci_dev *pcid;
+};
+/*const struct runlist_funcs {
+        u8 size;
+        enum ENTRY_TYPE (*entry_type)(struct nvdebug_state *, void *);
+        uint32_t (*chid)(struct nvdebug_state *, void *);
+        uint32_t (*inst_ptr_lo)(struct nvdebug_state *, void *);
+        enum INST_TARGET (*inst_target)(struct nvdebug_state *, void *):
+        uint32_t (*tsgid)(struct nvdebug_state *, void *);
+        uint32_t (*timeslice_scale)(struct nvdebug_state *, void *);
+        uint32_t (*timeslice_timeout)(struct nvdebug_state *, void *);
+        uint32_t (*tsg_length)(struct nvdebug_state *, void *);
+};*/
+// This disgusting macro is a crutch to work around the fact that runlists were
+// different prior to Volta.
+// Defines `static type prop(g, raw)`, which reads field `prop` from the raw
+// runlist entry `raw`. The entry is interpreted as a
+// `struct gv100_runlist_<_ENTRY_TYPE>` on Volta and newer, or as a
+// `struct gk110_runlist_<_ENTRY_TYPE>` on Kepler through Pascal. On older
+// chips, a warning is logged and zero is returned.
+// Note: The thresholds are inclusive so that layout selection agrees with
+//       NV_RL_ENTRY_SIZE; with an exclusive comparison, a chip whose ID is
+//       exactly NV_CHIP_ID_VOLTA would be parsed with the pre-Volta layout
+//       while being strided with the Volta entry size.
+#define VERSIONED_RL_ACCESSOR(_ENTRY_TYPE, type, prop) \
+        __attribute__((unused)) \
+        static type (prop)(const struct nvdebug_state *g, const void *raw) { \
+                if (g->chip_id >= NV_CHIP_ID_VOLTA) { \
+                        const struct gv100_runlist_ ## _ENTRY_TYPE *entry = (const struct gv100_runlist_ ## _ENTRY_TYPE*)raw; \
+                        return entry->prop; \
+                } else if (g->chip_id >= NV_CHIP_ID_KEPLER) { \
+                        const struct gk110_runlist_ ## _ENTRY_TYPE *entry = (const struct gk110_runlist_ ## _ENTRY_TYPE*)raw; \
+                        return entry->prop; \
+                } else { \
+                        printk(KERN_WARNING "[nvdebug] " #prop " unavailable on GPU ID %x, which is older than Kepler.\n", g->chip_id); \
+                        return (type)0; \
+                } \
+        }
+VERSIONED_RL_ACCESSOR(chan, uint32_t, chid);
+VERSIONED_RL_ACCESSOR(chan, uint32_t, inst_ptr_lo);
+VERSIONED_RL_ACCESSOR(chan, enum INST_TARGET, inst_target);
+VERSIONED_RL_ACCESSOR(tsg, uint32_t, tsgid);
+VERSIONED_RL_ACCESSOR(tsg, enum ENTRY_TYPE, entry_type);
+VERSIONED_RL_ACCESSOR(tsg, uint32_t, timeslice_scale);
+VERSIONED_RL_ACCESSOR(tsg, uint32_t, timeslice_timeout);
+VERSIONED_RL_ACCESSOR(tsg, uint32_t, tsg_length);
-#define next_tsg(tsg) \
-        (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length
+// Size in bytes of one runlist entry for the GPU described by `g`. TSG and
+// channel entries are the same size, so the TSG struct is used for both.
+#define NV_RL_ENTRY_SIZE(g) \
+         ((g)->chip_id >= NV_CHIP_ID_VOLTA ? sizeof(struct gv100_runlist_tsg) : sizeof(struct gk110_runlist_tsg))
+// Iterate `chan` over each of the tsg_length(g, tsg) entries that immediately
+// follow the TSG entry `tsg`. `chan` must be an assignable pointer variable.
+// `g`, `chan`, and `tsg` are evaluated multiple times, so they must be free of
+// side effects; all are parenthesized so that expression arguments (such as
+// `base + i`) are cast as a whole.
+#define for_chan_in_tsg(g, chan, tsg) \
+        for ((chan) = (typeof(chan))(((u8*)(tsg)) + NV_RL_ENTRY_SIZE(g)); \
+             (u8*)(chan) < ((u8*)(tsg)) + (1 + tsg_length(g, (tsg))) * NV_RL_ENTRY_SIZE(g); \
+             (chan) = (typeof(chan))(((u8*)(chan)) + NV_RL_ENTRY_SIZE(g)))
+// Pointer to the entry following `tsg` and all tsg_length(g, tsg) entries that
+// come after it, with the same type as `tsg`. `tsg` is evaluated more than once.
+#define next_tsg(g, tsg) \
+        (typeof(tsg))((u8*)(tsg) + NV_RL_ENTRY_SIZE(g) * (tsg_length(g, tsg) + 1))
 struct runlist_iter {
-        struct entry_tsg *curr_tsg;
+        // Pointer to either a TSG or channel entry (they're the same size)
+        void *curr_entry;
+        // This should be set to tsg_length when a TSG is reached, and
+        // decremented as each subsequent channel is printed. This allows us to
+        // track which channels are and are not part of the TSG.
+        int channels_left_in_tsg;
+        // Total runlist length, etc
         runlist_info_t rl_info;
 };
+#define NVDEBUG_MAX_DEVICES 8
+extern struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
 // Defined in runlist.c
-struct gk20a* get_live_gk20a(void);
+int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl_iter);
-int get_runlist_iter(struct runlist_iter *rl_iter);
+int preempt_tsg(struct nvdebug_state *g, uint32_t tsg_id);
-int preempt_tsg(uint32_t tsg_id);
+// Defined in mmu.c
+uint32_t vram2PRAMIN(struct nvdebug_state *g, uint64_t addr);
+void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy);
+uint64_t search_page_directory(
+        struct nvdebug_state *g,
+        void __iomem *pde_offset,
+        void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+        uint64_t addr_to_find);
+uint64_t search_v1_page_directory(
+        struct nvdebug_state *g,
+        void __iomem *pde_offset,
+        void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
+        uint64_t addr_to_find);
 static inline struct gk20a *get_gk20a(struct device *dev) {
        // XXX: Only works because gk20a* is the first member of gk20a_platform
        return *((struct gk20a**)dev_get_drvdata(dev));
 }
-// Functionally identical to nvgpu_readl()
+// The data field of each proc_dir_entry ("PDE" in this function) stores our
+// index into the g_nvdebug_state array; recover it via the seq_file's file.
+static inline int seq2gpuidx(struct seq_file *s) {
+        return (uintptr_t)PDE_DATA(file_inode(s->file));
+}
+// Same lookup as seq2gpuidx(), starting from the struct file directly.
+static inline int file2gpuidx(const struct file *f) {
+        struct inode *ino = file_inode(f);
+        return (uintptr_t)PDE_DATA(ino);
+}
+// Like file2gpuidx(), but reads the index stored on the parent directory's
+// entry rather than on the file's own entry.
+static inline int file2parentgpuidx(const struct file *f) {
+        // Should be safe to call on ProcFS entries, as our parent should (?)
+        // still exist if we're called. If not, there are worse races in this
+        // module.
+        const struct dentry *parent = file_dentry(f)->d_parent;
+        return (uintptr_t)PDE_DATA(parent->d_inode);
+}
+#define gk20a_regs(gk20a) (container_of(gk20a, struct nvgpu_os_linux, g)->regs)
+// Similar to nvgpu_readl()
 // (except we don't try to resolve situations where regs is NULL)
+// Returns the 32-bit register at byte offset `r` from `s->regs`. If `s->regs`
+// is NULL, or the built-in driver state (`s->g`, when set) has no register
+// mapping, logs an error and returns -1 (all bits set).
-static inline u32 nvdebug_readl(struct gk20a* g, u32 r) {
+static inline u32 nvdebug_readl(struct nvdebug_state *s, u32 r) {
-        struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
+        if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
-        if (unlikely(!g_os->regs)) {
+                printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
-                printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
+                return -1;
-                return -1;
+        }
-        }
+        return readl(s->regs + r);
-        return readl(g_os->regs + r);
 }
 // quadword version of nvdebug_readl()
+// Returns the 64-bit value formed from the 32-bit registers at byte offsets
+// `r` (low half) and `r + 4` (high half), read as two separate accesses. On a
+// missing register mapping, logs an error and returns -1 (all bits set).
-static inline u64 nvdebug_readq(struct gk20a* g, u32 r) {
+static inline u64 nvdebug_readq(struct nvdebug_state *s, u32 r) {
-        struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
+        u64 ret;
-        u64 ret;
+        if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
-        if (unlikely(!g_os->regs)) {
+                printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
-                printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
+                return -1;
-                return -1;
+        }
-        }
         // readq seems to always return the uppermost 32 bits as 0, so workaround with readl
-        ret = readl(g_os->regs + r);
+        ret = readl(s->regs + r);
-        ret |= ((u64)readl(g_os->regs + r + 4)) << 32;
+        ret |= ((u64)readl(s->regs + r + 4)) << 32;
         return ret;
 }
-// Functionally identical to nvgpu_writel()
+// Similar to nvgpu_writel()
+// Writes `v` to the 32-bit register at byte offset `r` from `s->regs`, then
+// issues a write barrier. On a missing register mapping, logs an error and
+// writes nothing.
-static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) {
+static inline void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) {
-        struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
+        if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
-        if (unlikely(!g_os->regs)) {
+                printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
+                return;
+        }
+        writel_relaxed(v, s->regs + r);
+        wmb();
+}
+// quadword version of nvdebug_writel()
+// XXX: This probably doesn't work XXX: Untested
+// NOTE(review): nvdebug_readq() avoids readq() because it was observed to
+//               return zero in the upper 32 bits; a single writeq_relaxed()
+//               may be similarly unreliable here - confirm on hardware.
+static inline void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) {
+        if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
                 printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
                 return;
         }
-        writel_relaxed(v, g_os->regs + r);
+        writeq_relaxed(v, s->regs + r);
         wmb();
 }