/* Copyright 2021 Joshua Bakita
 * SPDX-License-Identifier: MIT
 */

// TODO(jbakita): Don't depend on these.
#include <nvgpu/gk20a.h>  // For struct gk20a
#include <os/linux/os_linux.h>  // For struct nvgpu_os_linux

/* Runlist Channel
  A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue
  of GPU commands. These commands are typically queued from userspace.

  `INST_PTR` points to a GPU Instance Block which contains pointers to the GPU
  virtual address space for this context. All channels in a TSG point to the
  same GPU Instance Block (?).

  "RUNQUEUE_SELECTOR determines to which runqueue the channel belongs, and
  thereby which PBDMA will run the channel.  Increasing values select
  increasingly numbered PBDMA IDs serving the runlist.  If the selector value
  exceeds the number of PBDMAs on the runlist, the hardware will silently
  reassign the channel to run on the first PBDMA as though RUNQUEUE_SELECTOR had
  been set to 0.  (In current hardware, this is used by SCG on the graphics
  runlist only to determine which FE pipe should service a given channel.  A
  value of 0 targets the first FE pipe, which can process all FE driven engines:
  Graphics, Compute, Inline2Memory, and TwoD.  A value of 1 targets the second
  FE pipe, which can only process Compute work.  Note that GRCE work is allowed
  on either runqueue." (NVIDIA) Note that it appears runqueue 1 is the default
  for CUDA work on the Jetson Xavier.

  ENTRY_TYPE (T)        : type of this entry: ENTRY_TYPE_CHAN
  CHID (ID)             : identifier of the channel to run (overlays ENTRY_ID)
  RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if
                          more than one PBDMA is supported by the runlist

  INST_PTR_LO           : lower 20 bits of the 4k-aligned instance block pointer
  INST_PTR_HI           : upper 32 bits of the instance block pointer
  INST_TARGET (TGI)     : aperture of the instance block

  USERD_PTR_LO          : upper 24 bits of the lower 32 bits of the 512-byte-aligned USERD pointer
  USERD_PTR_HI          : upper 32 bits of USERD pointer
  USERD_TARGET (TGU)    : aperture of the USERD data structure
*/
enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1};
enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3};
static inline char* target_to_text(enum INST_TARGET t) {
	switch (t) {
		case TARGET_VID_MEM:
			return "VID_MEM";
		case TARGET_SYS_MEM_COHERENT:
			return "SYS_MEM_COHERENT";
		case TARGET_SYS_MEM_NONCOHERENT:
			return "SYS_MEM_NONCOHERENT";
		default:
			printk(KERN_WARNING "[nvdebug] Invalid aperture!\n");
			return NULL;
	}
}

struct runlist_chan {
// 0:63
	enum ENTRY_TYPE entry_type:1;
	uint32_t runqueue_selector:1;
	 uint32_t padding:2;
	enum INST_TARGET inst_target:2;
	 uint32_t padding2:2;
	uint32_t userd_ptr_lo:24;
	uint32_t userd_ptr_hi:32;
// 64:127
	uint32_t chid:12;
	uint32_t inst_ptr_lo:20;
	uint32_t inst_ptr_hi:32;
} __attribute__((packed));
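
/* Example (sketch): reassembling the split pointers from a runlist_chan entry.
   The shift amounts are inferred from the field widths and alignments
   documented above (4k alignment -> the 20-bit INST_PTR_LO field holds address
   bits 31:12; the 24-bit USERD_PTR_LO field holds address bits 31:8), and the
   helper names are ours, not nvgpu's. */
static inline uint64_t chan_inst_ptr(const struct runlist_chan *chan) {
	return ((uint64_t)chan->inst_ptr_hi << 32) | ((uint64_t)chan->inst_ptr_lo << 12);
}
static inline uint64_t chan_userd_ptr(const struct runlist_chan *chan) {
	return ((uint64_t)chan->userd_ptr_hi << 32) | ((uint64_t)chan->userd_ptr_lo << 8);
}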

/* Runlist TSG (TimeSlice Group)
  The runlist is composed of timeslice groups (TSG). Each TSG corresponds
  to a single virtual address space on the GPU and contains `TSG_LENGTH`
  channels. These channels and virtual address space are accessible to the GPU
  host unit for use until the timeslice expires or a TSG switch is forcibly
  initiated via a write to `NV_PFIFO_PREEMPT`.

  timeslice = (TSG_TIMESLICE_TIMEOUT << TSG_TIMESLICE_SCALE) * 1024 nanoseconds

  ENTRY_TYPE (T)      : type of this entry: ENTRY_TYPE_TSG
  TIMESLICE_SCALE     : scale factor for the TSG's timeslice
  TIMESLICE_TIMEOUT   : timeout amount for the TSG's timeslice
  TSG_LENGTH          : number of channels that are part of this timeslice group
  TSGID               : identifier of the Timeslice group (overlays ENTRY_ID)
*/
struct entry_tsg {
// 0:63
	enum ENTRY_TYPE entry_type:1;
	 uint64_t padding:15;
	uint32_t timeslice_scale:4;
	 uint64_t padding2:4;
	uint32_t timeslice_timeout:8;
	uint32_t tsg_length:8;
	 uint32_t padding3:24;
// 64:127
	uint32_t tsgid:12;
	 uint64_t padding4:52;
} __attribute__((packed));
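
/* Example (sketch): the timeslice formula above as a helper (the name is
   ours). For instance, TIMESLICE_TIMEOUT=128 with TIMESLICE_SCALE=3 yields
   (128 << 3) * 1024 ns, i.e. roughly 1.05 ms. */
static inline uint64_t tsg_timeslice_ns(const struct entry_tsg *tsg) {
	return ((uint64_t)tsg->timeslice_timeout << tsg->timeslice_scale) * 1024;
}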

enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1};

/* Preempt
  ID/CHID             : Id of TSG or channel to preempt
*/
#define NV_PFIFO_PREEMPT 0x00002634
struct pfifo_preempt {
	uint32_t id:12;
	 uint32_t padding:8;
	bool is_pending:1;
	 uint32_t padding2:3;
	enum PREEMPT_TYPE type:2;
	 uint32_t padding3:6;
} __attribute__((packed));
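
/* Example (sketch): forcing a TSG preempt by writing NV_PFIFO_PREEMPT with the
   layout above. nvdebug_writel() is defined near the end of this header, so it
   is forward-declared here; the wrapper union and function name are ours, and
   any quiescing or locking the hardware may expect beforehand is not handled. */
static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v);  // defined below
static inline void preempt_tsg_example(struct gk20a *g, uint32_t tsgid) {
	union {
		struct pfifo_preempt fields;
		uint32_t raw;
	} cmd = {0};
	cmd.fields.id = tsgid;
	cmd.fields.type = PREEMPT_TYPE_TSG;
	nvdebug_writel(g, NV_PFIFO_PREEMPT, cmd.raw);
}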

#define NV_PFIFO_RUNLIST_PREEMPT 0x00002638
struct runlist_preempt {
	bool runlist_0:1;
	bool runlist_1:1;
	bool runlist_2:1;
	bool runlist_3:1;
	bool runlist_4:1;
	bool runlist_5:1;
	bool runlist_6:1;
	bool runlist_7:1;
	bool runlist_8:1;
	bool runlist_9:1;
	bool runlist_10:1;
	bool runlist_11:1;
	bool runlist_12:1;
	bool runlist_13:1;
	 uint32_t padding:28;
} __attribute__((packed));

// Note: This is different on Turing
#define NV_PFIFO_RUNLIST_BASE 0x00002270
typedef union {
	struct {
		uint32_t ptr:28;
		uint32_t type:2;
		 uint32_t padding:2;
	} __attribute__((packed));
	uint32_t raw;
} runlist_base_t;

#define NV_PFIFO_RUNLIST 0x00002274
typedef union {
	struct {
		uint32_t len:16;
		 uint32_t padding:4;
		uint32_t id:4;
		 uint32_t padding2:8;
	} __attribute__((packed));
	uint32_t raw;
} runlist_info_t;
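
/* Example (sketch): decoding the two registers above. This assumes PTR is the
   runlist base address shifted right by 12 (i.e. 4k-aligned) and LEN counts
   entries laid out as the 16-byte structs above; nvdebug_readl() is defined
   near the end of this header and forward-declared here, and the function
   name is ours. */
static inline u32 nvdebug_readl(struct gk20a* g, u32 r);  // defined below
static inline void print_runlist_base_example(struct gk20a *g) {
	runlist_base_t base;
	runlist_info_t info;
	base.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST_BASE);
	info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST);
	printk(KERN_INFO "[nvdebug] Runlist %d: base %#llx (aperture %d), %d entries\n",
	       info.id, ((u64)base.ptr) << 12, base.type, info.len);
}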

enum CHANNEL_STATUS {
	CHANNEL_STATUS_IDLE = 0,
	CHANNEL_STATUS_PENDING = 1,
	CHANNEL_STATUS_PENDING_CTX_RELOAD = 2,
	CHANNEL_STATUS_PENDING_ACQUIRE = 3,
	CHANNEL_STATUS_PENDING_ACQ_CTX_RELOAD = 4,
	CHANNEL_STATUS_ON_PBDMA = 5,
	CHANNEL_STATUS_ON_PBDMA_AND_ENG = 6,
	CHANNEL_STATUS_ON_ENG = 7,
	CHANNEL_STATUS_ON_ENG_PENDING_ACQUIRE = 8,
	CHANNEL_STATUS_ON_ENG_PENDING = 9,
	CHANNEL_STATUS_ON_PBDMA_CTX_RELOAD = 10,
	CHANNEL_STATUS_ON_PBDMA_AND_ENG_CTX_RELOAD = 11,
	CHANNEL_STATUS_ON_ENG_CTX_RELOAD = 12,
	CHANNEL_STATUS_ON_ENG_PENDING_CTX_RELOAD = 13,
	CHANNEL_STATUS_ON_ENG_PENDING_ACQ_CTX_RELOAD = 14,
};

#define NV_PCCSR_CHANNEL_INST(i) (0x00800000+(i)*8)
#define MAX_CHID 512  // TODO: Double-check this is right
// There are a total of 512 possible channels
typedef union {
	struct {
// 0:31
		uint32_t inst_ptr:28;
		enum INST_TARGET inst_target:2;
		 uint32_t padding0:1;
		bool inst_bind:1;
// 32:63
		bool enable:1;
		bool next:1;
		 uint32_t padding:6;
		bool force_ctx_reload:1;
		 uint32_t padding2:1;
		bool enable_set:1;
		bool enable_clear:1;
		 uint32_t padding3:10;
		bool pbdma_faulted:1;
		bool eng_faulted:1;
		enum CHANNEL_STATUS status:4;
		bool busy:1;
		 uint32_t padding4:3;
	} __attribute__((packed));
	uint64_t raw;
} channel_ctrl_t;
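
/* Example (sketch): dumping the control/status words for one channel via the
   PCCSR window above. This assumes the 28-bit INST_PTR field is the 4k-aligned
   instance block address shifted right by 12; nvdebug_readq() is defined near
   the end of this header and forward-declared here, and the function name is
   ours. */
static inline u64 nvdebug_readq(struct gk20a* g, u32 r);  // defined below
static inline void print_channel_example(struct gk20a *g, unsigned int chid) {
	channel_ctrl_t chan;
	if (chid >= MAX_CHID)
		return;
	chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chid));
	printk(KERN_INFO "[nvdebug] Channel %u: %s, status %d, instance %#llx (%s)\n",
	       chid, chan.enable ? "enabled" : "disabled", chan.status,
	       ((u64)chan.inst_ptr) << 12, target_to_text(chan.inst_target));
}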

// TODO(jbakita): Maybe put the above GPU types in a different file.

#define for_chan_in_tsg(chan, tsg) \
        for (chan = (struct runlist_chan*)(tsg + 1); \
             (void*)chan < (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length; \
             chan++)

#define next_tsg(tsg) \
        (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length

struct runlist_iter {
	struct entry_tsg *curr_tsg;
	runlist_info_t rl_info;
};

// Defined in runlist.c
struct gk20a* get_live_gk20a(void);
int get_runlist_iter(struct runlist_iter *rl_iter);
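
/* Example (sketch): walking a runlist with the iterator and macros above.
   This assumes get_runlist_iter() returns 0 on success, leaves curr_tsg at the
   first TSG entry, and that rl_info.len counts all runlist entries (each TSG
   entry plus the channel entries that follow it); none of that is verified
   here, and the function name is ours. */
static inline int print_runlist_example(void) {
	struct runlist_iter rl_iter;
	struct entry_tsg *tsg;
	struct runlist_chan *chan;
	int entries_left, err;
	if ((err = get_runlist_iter(&rl_iter)))
		return err;
	tsg = rl_iter.curr_tsg;
	entries_left = rl_iter.rl_info.len;
	while (entries_left > 0) {
		printk(KERN_INFO "[nvdebug] TSG %d (%d channels)\n", tsg->tsgid, tsg->tsg_length);
		for_chan_in_tsg(chan, tsg)
			printk(KERN_INFO "[nvdebug]   Channel %d\n", chan->chid);
		entries_left -= 1 + tsg->tsg_length;
		tsg = next_tsg(tsg);
	}
	return 0;
}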

static inline struct gk20a *get_gk20a(struct device *dev) {
        // XXX: Only works because gk20a* is the first member of gk20a_platform
        return *((struct gk20a**)dev_get_drvdata(dev));
}

// Functionally identical to nvgpu_readl()
// (except we don't try to resolve situations where regs is NULL)
static inline u32 nvdebug_readl(struct gk20a* g, u32 r) {
        struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
        if (unlikely(!g_os->regs)) {
                printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
                return -1;
        }
        return readl(g_os->regs + r);
}

// quadword version of nvdebug_readl()
static inline u64 nvdebug_readq(struct gk20a* g, u32 r) {
        struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
        u64 ret;
        if (unlikely(!g_os->regs)) {
                printk(KERN_ERR "[nvdebug] Attempted nvdebug_readq on non-existent registers!\n");
                return -1;
        }
        // readq seems to always return the uppermost 32 bits as 0, so work around with two readl calls
        ret = readl(g_os->regs + r);
        ret |= ((u64)readl(g_os->regs + r + 4)) << 32;
        return ret;
}

// Functionally identical to nvgpu_writel()
static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) {
        struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
        if (unlikely(!g_os->regs)) {
                printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
                return;
        }
        writel_relaxed(v, g_os->regs + r);
        wmb();
}