/* Copyright 2021 Joshua Bakita
* SPDX-License-Identifier: MIT
*/
#include <linux/device.h> // For dev_get_drvdata()
#include <linux/io.h> // For readl(), writel_relaxed(), and wmb()
#include <linux/kernel.h> // For container_of()
#include <linux/bitops.h> // For BIT()
#include <linux/printk.h> // For printk()
#include <linux/types.h> // For bool and fixed-width integer types
// TODO(jbakita): Don't depend on these.
#include <nvgpu/gk20a.h> // For struct gk20a
#include <os/linux/os_linux.h> // For struct nvgpu_os_linux
/* Runlist Channel
A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue
of GPU commands. These commands are typically queued from userspace.
`INST_PTR` points to a GPU Instance Block which contains pointers to the GPU
virtual address space for this context. All channels in a TSG point to the
same GPU Instance Block (?).
"RUNQUEUE_SELECTOR determines to which runqueue the channel belongs, and
thereby which PBDMA will run the channel. Increasing values select
increasingly numbered PBDMA IDs serving the runlist. If the selector value
exceeds the number of PBDMAs on the runlist, the hardware will silently
reassign the channel to run on the first PBDMA as though RUNQUEUE_SELECTOR had
been set to 0. (In current hardware, this is used by SCG on the graphics
runlist only to determine which FE pipe should service a given channel. A
value of 0 targets the first FE pipe, which can process all FE driven engines:
Graphics, Compute, Inline2Memory, and TwoD. A value of 1 targets the second
FE pipe, which can only process Compute work. Note that GRCE work is allowed
on either runqueue." (NVIDIA) Note that it appears runqueue 1 is the default
for CUDA work on the Jetson Xavier.
ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_CHAN
CHID (ID) : identifier of the channel to run (overlays ENTRY_ID)
RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if
more than one PBDMA is supported by the runlist
INST_PTR_LO : lower 20 bits of the 4k-aligned instance block pointer
INST_PTR_HI : upper 32 bits of the instance block pointer
INST_TARGET (TGI) : aperture of the instance block
USERD_PTR_LO : upper 24 bits of the lower 32 bits of the 512-byte-aligned USERD pointer
USERD_PTR_HI : upper 32 bits of USERD pointer
USERD_TARGET (TGU) : aperture of the USERD data structure
*/
enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1};
enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3};
static inline const char* target_to_text(enum INST_TARGET t) {
switch (t) {
case TARGET_VID_MEM:
return "VID_MEM";
case TARGET_SYS_MEM_COHERENT:
return "SYS_MEM_COHERENT";
case TARGET_SYS_MEM_NONCOHERENT:
return "SYS_MEM_NONCOHERENT";
default:
printk(KERN_WARNING "[nvdebug] Invalid aperture!\n");
return NULL;
}
}
struct runlist_chan {
// 0:63
enum ENTRY_TYPE entry_type:1;
uint32_t runqueue_selector:1;
uint32_t padding:2;
enum INST_TARGET inst_target:2;
uint32_t padding2:2;
uint32_t userd_ptr_lo:24;
uint32_t userd_ptr_hi:32;
	// 64:127
uint32_t chid:12;
uint32_t inst_ptr_lo:20;
uint32_t inst_ptr_hi:32;
} __attribute__((packed));
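/* Illustrative helpers (not part of the original nvgpu headers): rebuild the
 * full 64-bit pointers from the split runlist fields. These assume the
 * alignments documented above (4k-aligned instance block, 512-byte-aligned
 * USERD), i.e. that the low bits dropped by the hardware encoding are zero.
 */
static inline uint64_t chan_inst_ptr(const struct runlist_chan *c) {
	// INST_PTR_LO holds bits 12:31 of the 4k-aligned instance block pointer
	return ((uint64_t)c->inst_ptr_hi << 32) | ((uint64_t)c->inst_ptr_lo << 12);
}
static inline uint64_t chan_userd_ptr(const struct runlist_chan *c) {
	// USERD_PTR_LO holds bits 8:31 of the 512-byte-aligned USERD pointer
	return ((uint64_t)c->userd_ptr_hi << 32) | ((uint64_t)c->userd_ptr_lo << 8);
}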
/* Runlist TSG (TimeSlice Group)
The runlist is composed of timeslice groups (TSG). Each TSG corresponds
to a single virtual address space on the GPU and contains `TSG_LENGTH`
channels. These channels and this virtual address space are available to the
GPU host unit until the timeslice expires or a TSG switch is forcibly
initiated via a write to `NV_PFIFO_PREEMPT`.
timeslice = (TSG_TIMESLICE_TIMEOUT << TSG_TIMESLICE_SCALE) * 1024 nanoseconds
ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_TSG
TIMESLICE_SCALE : scale factor for the TSG's timeslice
TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice
TSG_LENGTH : number of channels that are part of this timeslice group
TSGID : identifier of the Timeslice group (overlays ENTRY_ID)
*/
struct entry_tsg {
// 0:63
enum ENTRY_TYPE entry_type:1;
uint64_t padding:15;
uint32_t timeslice_scale:4;
uint64_t padding2:4;
uint32_t timeslice_timeout:8;
uint32_t tsg_length:8;
uint32_t padding3:24;
	// 64:127
uint32_t tsgid:12;
uint64_t padding4:52;
} __attribute__((packed));
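/* Illustrative helper (not part of the original nvgpu headers): evaluate the
 * timeslice formula documented above. Returns the TSG's timeslice in
 * nanoseconds. */
static inline uint64_t tsg_timeslice_ns(const struct entry_tsg *t) {
	return ((uint64_t)t->timeslice_timeout << t->timeslice_scale) * 1024;
}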
#define MAX_TSGID (1 << 12)
enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1};
/* Preempt a TSG or Channel by ID
ID/CHID : Id of TSG or channel to preempt
IS_PENDING : ????
TYPE : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG
Support: Kepler, Maxwell, Pascal, Volta
*/
#define NV_PFIFO_PREEMPT 0x00002634
typedef union {
struct {
uint32_t id:12;
uint32_t padding:8;
bool is_pending:1;
uint32_t padding2:3;
enum PREEMPT_TYPE type:2;
uint32_t padding3:6;
} __attribute__((packed));
uint32_t raw;
} pfifo_preempt_t;
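/* Example (sketch, mirroring the NV_PFIFO_SCHED_DISABLE examples below):
   preempt a TSG by ID using the fields defined above. `tsg_id` is a
   caller-supplied value, and nvdebug_writel() is defined at the end of
   this file.
   pfifo_preempt_t preempt;
   preempt.raw = 0;
   preempt.id = tsg_id;
   preempt.type = PREEMPT_TYPE_TSG;
   nvdebug_writel(g, NV_PFIFO_PREEMPT, preempt.raw);
*/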
/*
"Initiate a preempt of the engine by writing the bit associated with its
runlist to NV_PFIFO_RUNLIST_PREEMPT... Do not poll NV_PFIFO_RUNLIST_PREEMPT
for the preempt to complete."
Useful for preempting multiple runlists at once.
Appears to trigger an interrupt or some other side effect on the Jetson
Xavier, as the built-in nvgpu driver seems to be disturbed by writes to this
register.
To select the runlist dynamically, use the BIT(nr) kernel macro.
Example:
runlist_preempt_t rl_preempt;
rl_preempt.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST_PREEMPT);
rl_preempt.raw |= BIT(nr);
nvdebug_writel(g, NV_PFIFO_RUNLIST_PREEMPT, rl_preempt.raw);
Support: Volta
*/
#define NV_PFIFO_RUNLIST_PREEMPT 0x00002638
typedef union {
struct {
bool runlist_0:1;
bool runlist_1:1;
bool runlist_2:1;
bool runlist_3:1;
bool runlist_4:1;
bool runlist_5:1;
bool runlist_6:1;
bool runlist_7:1;
bool runlist_8:1;
bool runlist_9:1;
bool runlist_10:1;
bool runlist_11:1;
bool runlist_12:1;
bool runlist_13:1;
uint32_t padding:18;
} __attribute__((packed));
uint32_t raw;
} runlist_preempt_t;
/* Additional information on preempting from NVIDIA's driver (commit b1d0d8ece)
* "From h/w team
* Engine save can be blocked by eng stalling interrupts.
* FIFO interrupts shouldn’t block an engine save from
* finishing, but could block FIFO from reporting preempt done.
* No immediate reason to reset the engine if FIFO interrupt is
* pending.
* The hub, priv_ring, and ltc interrupts could block context
* switch (or memory), but doesn’t necessarily have to.
* For Hub interrupts they just report access counters and page
* faults. Neither of these necessarily block context switch
* or preemption, but they could.
* For example a page fault for graphics would prevent graphics
* from saving out. An access counter interrupt is a
* notification and has no effect.
* SW should handle page faults though for preempt to complete.
* PRI interrupt (due to a failed PRI transaction) will result
* in ctxsw failure reported to HOST.
* LTC interrupts are generally ECC related and if so,
* certainly don’t block preemption/ctxsw but they could.
* Bus interrupts shouldn’t have anything to do with preemption
* state as they are part of the Host EXT pipe, though they may
* exhibit a symptom that indicates that GPU is in a bad state.
* To be completely fair, when an engine is preempting SW
* really should just handle other interrupts as they come in.
* It’s generally bad to just poll and wait on a preempt
* to complete since there are many things in the GPU which may
* cause a system to hang/stop responding."
*/
// Note: This register is laid out differently on Turing
// Support: Kepler, Maxwell, Pascal, Volta
#define NV_PFIFO_RUNLIST_BASE 0x00002270
typedef union {
struct {
uint32_t ptr:28;
uint32_t type:2;
uint32_t padding:2;
} __attribute__((packed));
uint32_t raw;
} runlist_base_t;
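/* Example (sketch): recover the byte address of the runlist base. This
   assumes PTR holds the 4k-aligned base address shifted right by 12,
   matching the instance-pointer encoding above, and that TYPE is an
   aperture with the same encoding as enum INST_TARGET.
   runlist_base_t base;
   base.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST_BASE);
   uint64_t runlist_base_addr = (uint64_t)base.ptr << 12;
*/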
// Support: Kepler, Maxwell, Pascal, Volta
#define NV_PFIFO_RUNLIST 0x00002274
typedef union {
struct {
uint32_t len:16;
uint32_t padding:4;
uint32_t id:4;
uint32_t padding2:8;
} __attribute__((packed));
uint32_t raw;
} runlist_info_t;
enum CHANNEL_STATUS {
CHANNEL_STATUS_IDLE = 0,
CHANNEL_STATUS_PENDING = 1,
CHANNEL_STATUS_PENDING_CTX_RELOAD = 2,
CHANNEL_STATUS_PENDING_ACQUIRE = 3,
CHANNEL_STATUS_PENDING_ACQ_CTX_RELOAD = 4,
CHANNEL_STATUS_ON_PBDMA = 5,
CHANNEL_STATUS_ON_PBDMA_AND_ENG = 6,
CHANNEL_STATUS_ON_ENG = 7,
CHANNEL_STATUS_ON_ENG_PENDING_ACQUIRE = 8,
CHANNEL_STATUS_ON_ENG_PENDING = 9,
CHANNEL_STATUS_ON_PBDMA_CTX_RELOAD = 10,
CHANNEL_STATUS_ON_PBDMA_AND_ENG_CTX_RELOAD = 11,
CHANNEL_STATUS_ON_ENG_CTX_RELOAD = 12,
CHANNEL_STATUS_ON_ENG_PENDING_CTX_RELOAD = 13,
CHANNEL_STATUS_ON_ENG_PENDING_ACQ_CTX_RELOAD = 14,
};
#define NV_PCCSR_CHANNEL_INST(i) (0x00800000+(i)*8)
// There are a total of 512 possible channels
#define MAX_CHID 512
typedef union {
struct {
// 0:31
uint32_t inst_ptr:28;
enum INST_TARGET inst_target:2;
uint32_t padding0:1;
bool inst_bind:1;
	// 32:63
bool enable:1;
bool next:1;
uint32_t padding:6;
bool force_ctx_reload:1;
uint32_t padding2:1;
bool enable_set:1;
bool enable_clear:1;
uint32_t padding3:10;
bool pbdma_faulted:1;
bool eng_faulted:1;
enum CHANNEL_STATUS status:4;
bool busy:1;
uint32_t padding4:3;
} __attribute__((packed));
uint64_t raw;
} channel_ctrl_t;
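/* Example (sketch): inspect channel `i` via PCCSR. Note that the control
   word is 64 bits, so it must be read with nvdebug_readq() (defined below).
   channel_ctrl_t chan;
   chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(i));
   if (chan.enable && chan.status == CHANNEL_STATUS_ON_ENG)
       printk(KERN_INFO "[nvdebug] Channel %d is running on an engine.\n", i);
*/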
/* Control word for runlist enable/disable.
RUNLIST_N : Is runlist n disabled? (1 == disabled, 0 == enabled)
To select the runlist dynamically, use the BIT(nr) kernel macro.
Disabling example:
runlist_disable_t rl_disable;
rl_disable.raw = nvdebug_readl(g, NV_PFIFO_SCHED_DISABLE);
rl_disable.raw |= BIT(nr);
nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
Enabling example:
runlist_disable_t rl_disable;
rl_disable.raw = nvdebug_readl(g, NV_PFIFO_SCHED_DISABLE);
rl_disable.raw &= ~BIT(nr);
nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
Support: Fermi, Kepler, Maxwell, Pascal, Volta, Turing
*/
#define NV_PFIFO_SCHED_DISABLE 0x00002630
typedef union {
struct {
bool runlist_0:1;
bool runlist_1:1;
bool runlist_2:1;
bool runlist_3:1;
bool runlist_4:1;
bool runlist_5:1;
bool runlist_6:1;
bool runlist_7:1;
bool runlist_8:1;
bool runlist_9:1;
bool runlist_10:1;
uint32_t padding:21;
} __attribute__((packed));
uint32_t raw;
} runlist_disable_t;
// TODO(jbakita): Maybe put the above GPU types in a different file.
#define for_chan_in_tsg(chan, tsg) \
	for ((chan) = (struct runlist_chan*)((tsg) + 1); \
	     (void*)(chan) < (void*)((tsg) + 1) + sizeof(struct runlist_chan) * (tsg)->tsg_length; \
	     (chan)++)
#define next_tsg(tsg) \
	((void*)((tsg) + 1) + sizeof(struct runlist_chan) * (tsg)->tsg_length)
struct runlist_iter {
struct entry_tsg *curr_tsg;
runlist_info_t rl_info;
};
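/* Example (sketch): walk every channel of every TSG in a runlist, using the
   macros above. Assumes `rl_iter` was populated by get_runlist_iter(), and
   that each TSG entry is immediately followed by its `tsg_length` channel
   entries (so one TSG consumes 1 + tsg_length of rl_info.len's entries).
   `entries_left` is an illustrative local, not a field of the API.
   struct entry_tsg *tsg = rl_iter.curr_tsg;
   struct runlist_chan *chan;
   int entries_left = rl_iter.rl_info.len;
   while (entries_left > 0) {
       for_chan_in_tsg(chan, tsg)
           printk(KERN_INFO "[nvdebug] TSG %d -> channel %d\n", tsg->tsgid, chan->chid);
       entries_left -= 1 + tsg->tsg_length;
       tsg = next_tsg(tsg);
   }
*/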
// Defined in runlist.c
struct gk20a* get_live_gk20a(void);
int get_runlist_iter(struct runlist_iter *rl_iter);
int preempt_tsg(uint32_t tsg_id);
static inline struct gk20a *get_gk20a(struct device *dev) {
// XXX: Only works because gk20a* is the first member of gk20a_platform
return *((struct gk20a**)dev_get_drvdata(dev));
}
// Functionally identical to nvgpu_readl()
// (except we don't try to resolve situations where regs is NULL)
static inline u32 nvdebug_readl(struct gk20a* g, u32 r) {
struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
if (unlikely(!g_os->regs)) {
printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
return -1;
}
return readl(g_os->regs + r);
}
// quadword version of nvdebug_readl()
static inline u64 nvdebug_readq(struct gk20a* g, u32 r) {
struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
u64 ret;
if (unlikely(!g_os->regs)) {
		printk(KERN_ERR "[nvdebug] Attempted nvdebug_readq on non-existent registers!\n");
return -1;
}
// readq seems to always return the uppermost 32 bits as 0, so workaround with readl
ret = readl(g_os->regs + r);
ret |= ((u64)readl(g_os->regs + r + 4)) << 32;
return ret;
}
// Functionally identical to nvgpu_writel()
static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) {
struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
if (unlikely(!g_os->regs)) {
printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
return;
}
writel_relaxed(v, g_os->regs + r);
wmb();
}
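/* Illustrative helper (not part of the original interface): apply the
 * read-modify-write pattern from the NV_PFIFO_SCHED_DISABLE documentation
 * above to disable or re-enable scheduling on runlist `nr`. */
static inline void nvdebug_sched_set_disabled(struct gk20a *g, int nr, bool disable) {
	runlist_disable_t rl_disable;
	rl_disable.raw = nvdebug_readl(g, NV_PFIFO_SCHED_DISABLE);
	if (disable)
		rl_disable.raw |= BIT(nr);
	else
		rl_disable.raw &= ~BIT(nr);
	nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
}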