diff options
Diffstat (limited to 'nvdebug.h')
-rw-r--r-- | nvdebug.h | 93 |
1 files changed, 90 insertions, 3 deletions
@@ -12,7 +12,20 @@ | |||
12 | 12 | ||
13 | `INST_PTR` points to a GPU Instance Block which contains pointers to the GPU | 13 | `INST_PTR` points to a GPU Instance Block which contains pointers to the GPU |
14 | virtual address space for this context. All channels in a TSG point to the | 14 | virtual address space for this context. All channels in a TSG point to the |
15 | same GPU Instance Block. | 15 | same GPU Instance Block (?). |
16 | |||
17 | "RUNQUEUE_SELECTOR determines to which runqueue the channel belongs, and | ||
18 | thereby which PBDMA will run the channel. Increasing values select | ||
19 | increasingly numbered PBDMA IDs serving the runlist. If the selector value | ||
20 | exceeds the number of PBDMAs on the runlist, the hardware will silently | ||
21 | reassign the channel to run on the first PBDMA as though RUNQUEUE_SELECTOR had | ||
22 | been set to 0. (In current hardware, this is used by SCG on the graphics | ||
23 | runlist only to determine which FE pipe should service a given channel. A | ||
24 | value of 0 targets the first FE pipe, which can process all FE driven engines: | ||
25 | Graphics, Compute, Inline2Memory, and TwoD. A value of 1 targets the second | ||
26 | FE pipe, which can only process Compute work. Note that GRCE work is allowed | ||
27 | on either runqueue." (NVIDIA) Note that it appears runqueue 1 is the default | ||
28 | for CUDA work on the Jetson Xavier. | ||
16 | 29 | ||
17 | ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_CHAN | 30 | ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_CHAN |
18 | CHID (ID) : identifier of the channel to run (overlays ENTRY_ID) | 31 | CHID (ID) : identifier of the channel to run (overlays ENTRY_ID) |
@@ -29,6 +42,19 @@ | |||
29 | */ | 42 | */ |
30 | enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1}; | 43 | enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1}; |
31 | enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3}; | 44 | enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3}; |
45 | static inline char* target_to_text(enum INST_TARGET t) { | ||
46 | switch (t) { | ||
47 | case TARGET_VID_MEM: | ||
48 | return "VID_MEM"; | ||
49 | case TARGET_SYS_MEM_COHERENT: | ||
50 | return "SYS_MEM_COHERENT"; | ||
51 | case TARGET_SYS_MEM_NONCOHERENT: | ||
52 | return "SYS_MEM_NONCOHERENT"; | ||
53 | default: | ||
54 | printk(KERN_WARNING "[nvdebug] Invalid aperture!\n"); | ||
55 | return NULL; | ||
56 | } | ||
57 | } | ||
32 | 58 | ||
33 | struct runlist_chan { | 59 | struct runlist_chan { |
34 | // 0:63 | 60 | // 0:63 |
@@ -55,10 +81,10 @@ struct runlist_chan { | |||
55 | timeslice = (TSG_TIMESLICE_TIMEOUT << TSG_TIMESLICE_SCALE) * 1024 nanoseconds | 81 | timeslice = (TSG_TIMESLICE_TIMEOUT << TSG_TIMESLICE_SCALE) * 1024 nanoseconds |
56 | 82 | ||
57 | ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_TSG | 83 | ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_TSG |
58 | TSGID : identifier of the Timeslice group (overlays ENTRY_ID) | ||
59 | TSG_LENGTH : number of channels that are part of this timeslice group | ||
60 | TIMESLICE_SCALE : scale factor for the TSG's timeslice | 84 | TIMESLICE_SCALE : scale factor for the TSG's timeslice |
61 | TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice | 85 | TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice |
86 | TSG_LENGTH : number of channels that are part of this timeslice group | ||
87 | TSGID : identifier of the Timeslice group (overlays ENTRY_ID) | ||
62 | */ | 88 | */ |
63 | struct entry_tsg { | 89 | struct entry_tsg { |
64 | // 0:63 | 90 | // 0:63 |
@@ -130,6 +156,52 @@ typedef union { | |||
130 | uint32_t raw; | 156 | uint32_t raw; |
131 | } runlist_info_t; | 157 | } runlist_info_t; |
132 | 158 | ||
/* Channel status field values, as read back from the per-channel CCSR
 * control/status register (see channel_ctrl_t.status below).
 *
 * The states encode where the channel currently lives (idle, loaded on a
 * PBDMA, loaded on an engine, or both) combined with pending conditions
 * (semaphore/context acquire, context reload).
 *
 * NOTE(review): names and numeric values appear to mirror NVIDIA's
 * NV_PCCSR_CHANNEL_STATUS_* definitions — confirm against the target
 * chip's hardware reference manuals.
 */
enum CHANNEL_STATUS {
	CHANNEL_STATUS_IDLE = 0,
	CHANNEL_STATUS_PENDING = 1,
	CHANNEL_STATUS_PENDING_CTX_RELOAD = 2,
	CHANNEL_STATUS_PENDING_ACQUIRE = 3,
	CHANNEL_STATUS_PENDING_ACQ_CTX_RELOAD = 4,
	CHANNEL_STATUS_ON_PBDMA = 5,
	CHANNEL_STATUS_ON_PBDMA_AND_ENG = 6,
	CHANNEL_STATUS_ON_ENG = 7,
	CHANNEL_STATUS_ON_ENG_PENDING_ACQUIRE = 8,
	CHANNEL_STATUS_ON_ENG_PENDING = 9,
	CHANNEL_STATUS_ON_PBDMA_CTX_RELOAD = 10,
	CHANNEL_STATUS_ON_PBDMA_AND_ENG_CTX_RELOAD = 11,
	CHANNEL_STATUS_ON_ENG_CTX_RELOAD = 12,
	CHANNEL_STATUS_ON_ENG_PENDING_CTX_RELOAD = 13,
	CHANNEL_STATUS_ON_ENG_PENDING_ACQ_CTX_RELOAD = 14,
};
176 | |||
// BAR0 offset of the channel control/status (CCSR) register pair for
// channel i. The *8 stride implies each channel owns two 32-bit
// registers, matching the 64-bit layout of channel_ctrl_t below.
#define NV_PCCSR_CHANNEL_INST(i) (0x00800000+(i)*8)
#define MAX_CHID 512 // TODO: Double-check this is right
// There are a total of 512 possible channels
//
// 64-bit view of a channel's CCSR register pair. Access via .raw (e.g.
// with nvdebug_readq()) or through the packed bitfields.
// NOTE(review): bitfield ordering assumes the compiler allocates bits
// LSB-first within each word (GCC on little-endian) — do not reorder.
typedef union {
	struct {
		// 0:31
		// Instance block pointer; presumably the upper 28 bits of a
		// 4KiB-aligned address — TODO confirm shift against hw docs.
		uint32_t inst_ptr:28;
		// Aperture the instance block lives in (see INST_TARGET).
		enum INST_TARGET inst_target:2;
		uint32_t padding0:1;
		// Set when inst_ptr has been bound to this channel.
		bool inst_bind:1;
		// 32:64
		bool enable:1;
		bool next:1;
		uint32_t padding:6;
		bool force_ctx_reload:1;
		uint32_t padding2:1;
		// Write-1-to-set / write-1-to-clear views of `enable`.
		bool enable_set:1;
		bool enable_clear:1;
		uint32_t padding3:10;
		bool pbdma_faulted:1;
		bool eng_faulted:1;
		// Scheduling state of the channel (see enum CHANNEL_STATUS).
		enum CHANNEL_STATUS status:4;
		bool busy:1;
		uint32_t padding4:3;
	} __attribute__((packed));
	uint64_t raw;
} channel_ctrl_t;
204 | |||
133 | // TODO(jbakita): Maybe put the above GPU types in a different file. | 205 | // TODO(jbakita): Maybe put the above GPU types in a different file. |
134 | 206 | ||
135 | #define for_chan_in_tsg(chan, tsg) \ | 207 | #define for_chan_in_tsg(chan, tsg) \ |
@@ -146,6 +218,7 @@ struct runlist_iter { | |||
146 | }; | 218 | }; |
147 | 219 | ||
148 | // Defined in runlist.c | 220 | // Defined in runlist.c |
221 | struct gk20a* get_live_gk20a(void); | ||
149 | int get_runlist_iter(struct runlist_iter *rl_iter); | 222 | int get_runlist_iter(struct runlist_iter *rl_iter); |
150 | 223 | ||
151 | static inline struct gk20a *get_gk20a(struct device *dev) { | 224 | static inline struct gk20a *get_gk20a(struct device *dev) { |
@@ -164,6 +237,20 @@ static inline u32 nvdebug_readl(struct gk20a* g, u32 r) { | |||
164 | return readl(g_os->regs + r); | 237 | return readl(g_os->regs + r); |
165 | } | 238 | } |
166 | 239 | ||
240 | // quadword version of nvdebug_readl() | ||
241 | static inline u64 nvdebug_readq(struct gk20a* g, u32 r) { | ||
242 | struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g); | ||
243 | u64 ret; | ||
244 | if (unlikely(!g_os->regs)) { | ||
245 | printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n"); | ||
246 | return -1; | ||
247 | } | ||
248 | // readq seems to always return the uppermost 32 bits as 0, so workaround with readl | ||
249 | ret = readl(g_os->regs + r); | ||
250 | ret |= ((u64)readl(g_os->regs + r + 4)) << 32; | ||
251 | return ret; | ||
252 | } | ||
253 | |||
167 | // Functionally identical to nvgpu_writel() | 254 | // Functionally identical to nvgpu_writel() |
168 | static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) { | 255 | static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) { |
169 | struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g); | 256 | struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g); |