summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJoshua Bakita <jbakita@cs.unc.edu>2021-08-26 13:04:27 -0400
committerJoshua Bakita <jbakita@cs.unc.edu>2021-08-26 13:04:27 -0400
commit5f661d8a5db3f7875f6bf36b4843a71fd08ecbea (patch)
treeb18ce3ceb27fd885cd6aec19a3c342bb9e7963ef
Add initial implementation
Supports accessing and printing the runlist on the Jetson Xavier to dmesg. May work on other Jetson boards. Currently requires the nvgpu headers from NVIDIA's Linux4Tegra (L4T) source tree.
-rw-r--r--.gitignore7
-rw-r--r--Makefile13
-rw-r--r--nvdebug.c278
-rw-r--r--nvdebug.h127
-rw-r--r--nvidia_preemption.md36
5 files changed, 461 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..197a191
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
1.*
2*.ko
3*.mod.c
4*.o
5*.o.*
6modules.order
7Module.symvers
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..cc14996
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,13 @@
# Out-of-tree kbuild Makefile for the nvdebug module.
obj-m += nvdebug.o

# TODO: Avoid needing to distribute NVIDIA's headers (at least they're MIT...)
#ccflags-y += -I$(PWD)/include
# Header search paths into NVIDIA's Linux4Tegra (L4T) nvgpu driver source
# tree (hard-coded local checkout; see TODO above).
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu/include
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include/uapi

# Delegate build/clean to the running kernel's kbuild system.
all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
clean:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
diff --git a/nvdebug.c b/nvdebug.c
new file mode 100644
index 0000000..31a797e
--- /dev/null
+++ b/nvdebug.c
@@ -0,0 +1,278 @@
1/* Copyright 2021 Joshua Bakita
2 * SPDX-License-Identifier: MIT
3 */
4
5/* TODO
6 * - Add /proc /sys or debugfs interface
7 * - Add API to trigger a preemption
8 */
9
10#include <linux/module.h>
11#include <linux/kernel.h>
12#include <linux/device.h>
13#include <linux/kallsyms.h>
14#include <linux/iommu.h> // For struct iommu_domain
15#include <asm/io.h>
16
17/* Currently used symbols:
18 * - struct gk20a;
19 * - struct nvgpu_os_linux;
20 * - void nvgpu_writel(struct gk20a *g, u32 reg_addr, u32 value);
21 */
22#include <nvgpu/io.h>
23#include <nvgpu/gk20a.h>
24#include <os/linux/os_linux.h>
25
26#include "nvdebug.h"
27
28MODULE_LICENSE("GPL"); // LIAR
29MODULE_AUTHOR("Joshua Bakita");
30MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");
31MODULE_SOFTDEP("pre: nvgpu"); // We only support the Jetson boards for now
32
33// Bus types are global symbols in the kernel
34extern struct bus_type platform_bus_type;
35
// Recover the nvgpu driver's per-GPU state (struct gk20a) from a platform
// device's driver data.
// XXX: Only works because a `struct gk20a *` is the first member of the
// gk20a_platform structure that dev_get_drvdata() actually returns.
static inline struct gk20a *get_gk20a(struct device *dev) {
	struct gk20a **g_ptr = dev_get_drvdata(dev);
	return *g_ptr;
}
40
41// Functionally identical to nvgpu_readl()
42// (except we don't try to resolve situations where regs is NULL)
43static inline u32 nvdebug_readl(struct gk20a* g, u32 r) {
44 struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
45 if (unlikely(!g_os->regs)) {
46 printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
47 return -1;
48 }
49 return readl(g_os->regs + r);
50}
51
52// Functionally identical to nvgpu_writel()
53static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) {
54 struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
55 if (unlikely(!g_os->regs)) {
56 printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
57 return;
58 }
59 writel_relaxed(v, g_os->regs + r);
60 wmb();
61}
62/*
63#define RUNLIST_PROCFS_NAME "runlist"
64
65static const struct seq_operations runlist_file_seq_ops = {
66 .start =
67 .next =
68 .stop =
69 .show =
70};
71
72static const struct file_operations runlist_file_ops = {
73 .read =
74*/
75/*static void read_bytes(struct gk20a *g, void* target, u32 start, u32 num_bytes) {
76 u32 *output = target;
77 u32 i;
78 // Read u32s from the GPU
79 for (i = 0; i < num_bytes; i += 4) {
80 output[i/4] = _nvgpu_readl(g, start + i);
81 printk(KERN_INFO "[nvdebug] U32 %d: %0x\n", i, output[i/4]);
82 }
83}
84
85static void read_bytes(void* target, void* start, u32 num_bytes) {
86 u32 *output = target;
87 u32 i;
88 // Read u32s from the GPU
89 for (i = 0; i < num_bytes; i += 4) {
90 output[i/4] = readl(start + i);
91 printk(KERN_INFO "[nvdebug] U32 %d: %0x\n", i, output[i/4]);
92 }
93}*/
94
95/*
96 +---- TSG Entry %d ----+
97 | Scale: %d |
98 | Timeout: %d |
99 +----------------------+
100
101
102
103
104
105
106*/
107
108#define PRE KERN_INFO "[nvdebug] "
109
110static void nvdebug_print_tsg(struct entry_tsg* tsg) {
111 if (tsg->entry_type != ENTRY_TYPE_TSG) {
112 printk(KERN_WARNING "[nvdebug] Attempted to print non-TSG in nvdebug_print_tsg()!\n");
113 return;
114 }
115 printk(PRE "+---- TSG Entry %-2d----+", tsg->tsgid);
116 printk(PRE "| Scale: %-13d|", tsg->timeslice_scale);
117 printk(PRE "| Timeout: %-11d|", tsg->timeslice_timeout);
118 printk(PRE "+---------------------+");
119}
120
121static void nvdebug_print_chan(struct runlist_chan* chan) {
122 char* loc_txt;
123 u64 inst_ptr;
124 if (chan->entry_type != ENTRY_TYPE_CHAN) {
125 printk(KERN_WARNING "[nvdebug] Attempted to print non-channel in nvdebug_print_channel()!\n");
126 return;
127 }
128 switch (chan->inst_target) {
129 case TARGET_VID_MEM:
130 loc_txt = "VID_MEM";
131 break;
132 case TARGET_SYS_MEM_COHERENT:
133 loc_txt = "SYS_MEM_COHERENT";
134 break;
135 case TARGET_SYS_MEM_NONCOHERENT:
136 loc_txt = "SYS_MEM_NONCOHERENT";
137 break;
138 default:
139 printk(KERN_WARNING "[nvdebug] Invalid aperture in runlist channel!\n");
140 return;
141 }
142 // Reconstruct pointer to channel instance block
143 inst_ptr = chan->inst_ptr_hi;
144 inst_ptr <<= 32;
145 inst_ptr |= chan->inst_ptr_lo << 12;
146
147 printk(PRE " +- Channel Entry %-4d-+", chan->chid);
148 printk(PRE " | Runqueue Selector: %d|", chan->runqueue_selector);
149 printk(PRE " | Instance PTR: |");
150 printk(PRE " | %#018llx |", inst_ptr);
151 printk(PRE " | %-20s|", loc_txt);
152 printk(PRE " +---------------------+");
153}
154
// Iterate `chan` over every channel entry belonging to TSG `tsg`.
// Channel entries immediately follow their TSG header in the runlist, so
// `tsg + 1` is the first channel. NOTE(review): the bound arithmetic assumes
// sizeof(struct entry_tsg) == sizeof(struct runlist_chan) (both 16 bytes per
// the bitfield layouts in nvdebug.h) — confirm if either struct changes.
#define for_chan_in_tsg(chan, tsg) \
	for (chan = (struct runlist_chan*)(tsg + 1); \
	     (void*)chan < (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length; \
	     chan++)

// Advance to the next TSG header: skip this header plus all of its channels.
// (void* arithmetic is a GNU extension; fine in kernel code.)
#define next_tsg(tsg) \
	(void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length
162
163static void nvdebug_print_runlist(struct entry_tsg* head, runlist_info_t rl_info) {
164 int rl_idx = 0;
165 struct runlist_chan* chan;
166 printk(PRE "tsg->tsg_length: %d\n", head->tsg_length);
167 printk(PRE "rl_info.len: %d\n", rl_info.len);
168 while (rl_idx < rl_info.len) {
169 nvdebug_print_tsg(head);
170 for_chan_in_tsg(chan, head) {
171 nvdebug_print_chan(chan);
172 }
173 rl_idx += 1 + head->tsg_length;
174 head = next_tsg(head);
175 }
176}
177
// Module entry point: locate the integrated GPU's platform device (gv11b on
// the Jetson Xavier), read the active runlist base/info out of the PFIFO
// registers, and dump the runlist contents to dmesg.
// Returns 0 on success, -EIO if no matching device or no register mapping.
static int __init nvdebug_init(void) {
	struct device *dev = NULL;
	struct device *temp_dev;
	struct gk20a *g;
	struct entry_tsg head;
	runlist_base_t rl_base;
	runlist_info_t rl_info;
	u64 runlist_iova;
	// Get the last device that matches our name
	// NOTE(review): "17000000.gv11b" is the Xavier device-tree node; other
	// Jetson boards will need a different name — confirm per-board.
	while ((temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b"))) {
		printk(KERN_INFO "Found a matching device\n");
		dev = temp_dev;
	}
	if (!dev)
		return -EIO;
	g = get_gk20a(dev);
	// This address seems to not be:
	// - A GPU address (type is sysmem_coherent)
	// - A physical address (dereferencing after ioremap crashes)
	// - A kernel virtual address (dereferencing segfaults)
	// So maybe it's some sort of custom thing? This is an address that the GPU
	// can use, so it would make most sense for it to be a physical address.
	//
	// BUT, it can't possibly be a physical address, as it would refer to an
	// address greater than the maximum one on our system (by a lot!).
	// Maybe I'm reading the runlist base wrong?
	// Aha, the driver calls it runlist_iova. Sounds like runlist I/O virtual
	// address! So, what's this I/O address space? All I know is that it's what
	// nvgpu_mem_get_addr() returns. That function returns the result of either:
	// - gpu_phys_addr which is __nvgpu_sgl_phys on our platform which (?)
	//   converts an IPA to a PA?
	// - nvgpu_mem_iommu_translate
	//
	// The original memory is allocated with nvgpu_dma_alloc_flags_sys(), which
	// returns SYSMEM.
	//
	// To convert a physical address to a IOMMU address, we add a bit
	//
	// BUT, it turns out that it IS JUST A PHYSICAL ADDRESS! It wasn't working
	// before because the GPU had simply gone to sleep and invalidated its
	// register state, so nvgpu_readl() was simply returning garbage.

	printk(KERN_INFO "[nvdebug] Pulling runlist base address from %x\n", NV_PFIFO_RUNLIST_BASE);
	printk(KERN_INFO "[nvdebug] Using struct gk20a* of %px\n", g);
	printk(KERN_INFO "[nvdebug] g->name: %s, g->power_on: %d, g->sw_ready: %d, g->is_virtual %d\n", g->name, g->power_on, g->sw_ready, g->is_virtual);
	struct nvgpu_os_linux *l = container_of(g, struct nvgpu_os_linux, g);
	printk(KERN_INFO "[nvdebug] l->regs %px, l->regs_saved %px\n", l->regs, l->regs_saved);
	// Bail out if BAR0 is not currently mapped (e.g. GPU asleep/railgated);
	// register reads through a NULL mapping would be garbage (see above).
	if (!l->regs)
		return -EIO;
	rl_base.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST_BASE);
	rl_info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST);
	// The register holds the base address >> 12 (runlists are 4k-aligned).
	runlist_iova = ((u64)rl_base.ptr) << 12;
	printk(KERN_INFO "[nvdebug] Runlist ptr: %x, type: %d, raw: %x, IOVA: %px\n", rl_base.ptr, rl_base.type, rl_base.raw, (void*)runlist_iova);
	// Segfaults
	//u32 attempted_read = ioread32(runlist_iova);
	//printk(KERN_INFO "[nvdebug] first word of runlist: %0x\n", attempted_read);

	// Errors out
	//u32* virt_rt_addr = ioremap(phys_rl_addr, sizeof(struct entry_tsg));
	//printk(KERN_INFO "[nvdebug] Runlist virt_addr: %px\n", virt_rt_addr);

	/* Overcomplicated?
	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
	if (!domain) {
		printk(KERN_INFO "[nvdebug] No IOMMU domain!\n");
		return -EIO;
	}
	u64 phys_addr = platform_bus_type.iommu_ops->iova_to_phys(domain, runlist_iova);
	printk(KERN_INFO "[nvdebug] Runlist PA: %px\n", phys_addr);
	*/

	// Treat the "IOVA" as a plain physical address (per the investigation
	// above) and access the runlist through the kernel's linear mapping.
	printk(KERN_INFO "[nvdebug] Runlist phys_to_virt: %px\n", (void*)phys_to_virt(runlist_iova));
	printk(KERN_INFO "[nvdebug] Runlist *phys_to_virt: %x\n", *(u32*)phys_to_virt(runlist_iova));
	head = *(struct entry_tsg*)phys_to_virt(runlist_iova);
	nvdebug_print_runlist((struct entry_tsg*)phys_to_virt(runlist_iova), rl_info);
	//nvdebug_print_tsg(&head);
	//nvdebug_print_chan((struct runlist_chan*)(phys_to_virt(runlist_iova) + sizeof(struct entry_tsg)));
	//printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type);
	//printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale);
	//printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout);
	//printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length);
	//printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid);

	//printk(KERN_INFO "[nvdebug] Mem base phys: %p\n", (void*)virt_to_phys((void*)0xffffffc000000000ULL));
	//printk(KERN_INFO "[nvdebug] Mem end phys: %p\n", (void*)virt_to_phys((void*)0xffffffc400000000ULL));
	//printk(KERN_INFO "[nvdebug] Runlist *virt_addr: %x\n", readl(virt_rt_addr)); // This crashes
	//read_bytes(&head, virt_rt_addr, sizeof(struct entry_tsg));
	/*printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type);
	printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale);
	printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout);
	printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length);
	printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid);
	*/return 0;
}
272
// Module teardown: nothing to undo — init only reads registers and memory.
static void __exit nvdebug_exit(void) {
	printk(KERN_INFO "[nvdebug] Exiting...\n");
}
276
277module_init(nvdebug_init);
278module_exit(nvdebug_exit);
diff --git a/nvdebug.h b/nvdebug.h
new file mode 100644
index 0000000..aa5d0cf
--- /dev/null
+++ b/nvdebug.h
@@ -0,0 +1,127 @@
1/* Copyright 2021 Joshua Bakita
2 * SPDX-License-Identifier: MIT
3 */
4
5/* Runlist Channel
6 A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue
7 of GPU commands. These commands are typically queued from userspace.
8
9 `INST_PTR` points to a GPU Instance Block which contains pointers to the GPU
10 virtual address space for this context. All channels in a TSG point to the
11 same GPU Instance Block.
12
13 ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_CHAN
14 CHID (ID) : identifier of the channel to run (overlays ENTRY_ID)
15 RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if
16 more than one PBDMA is supported by the runlist
17
18 INST_PTR_LO : lower 20 bits of the 4k-aligned instance block pointer
19 INST_PTR_HI : upper 32 bit of instance block pointer
20 INST_TARGET (TGI) : aperture of the instance block
21
22 USERD_PTR_LO : upper 24 bits of the low 32 bits, of the 512-byte-aligned USERD pointer
23 USERD_PTR_HI : upper 32 bits of USERD pointer
24 USERD_TARGET (TGU) : aperture of the USERD data structure
25*/
// Discriminator carried in bit 0 of every runlist entry.
enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1};
// Aperture selector for GPU-accessible memory. Value 1 is absent here;
// presumably reserved/invalid — confirm against NVIDIA's dev_ram headers.
enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3};

// Channel runlist entry (128 bits; field meanings in the comment above).
struct runlist_chan {
// 0:63
	enum ENTRY_TYPE entry_type:1;    // ENTRY_TYPE_CHAN for a channel entry
	uint32_t runqueue_selector:1;    // which PBDMA runs this channel
	uint32_t padding:2;
	enum INST_TARGET inst_target:2;  // aperture of the instance block
	uint32_t padding2:2;
	uint32_t userd_ptr_lo:24;        // bits 8:31 of the USERD pointer
	uint32_t userd_ptr_hi:32;        // bits 32:63 of the USERD pointer
// 64:128
	uint32_t chid:12;                // channel identifier
	uint32_t inst_ptr_lo:20;         // bits 12:31 of the instance block pointer
	uint32_t inst_ptr_hi:32;         // bits 32:63 of the instance block pointer
} __attribute__((packed));
43
44/* Runlist TSG (TimeSlice Group)
45 The runlist is composed of timeslice groups (TSG). Each TSG corresponds
46 to a single virtual address space on the GPU and contains `TSG_LENGTH`
47 channels. These channels and virtual address space are accessible to the GPU
48 host unit for use until the timeslice expires or a TSG switch is forcibly
49 initiated via a write to `NV_PFIFO_PREEMPT`.
50
51 timeslice = (TSG_TIMESLICE_TIMEOUT << TSG_TIMESLICE_SCALE) * 1024 nanoseconds
52
53 ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_TSG
54 TSGID : identifier of the Timeslice group (overlays ENTRY_ID)
55 TSG_LENGTH : number of channels that are part of this timeslice group
56 TIMESLICE_SCALE : scale factor for the TSG's timeslice
57 TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice
58*/
// TSG runlist entry (128 bits; field meanings in the comment above).
// timeslice = (timeslice_timeout << timeslice_scale) * 1024 ns
struct entry_tsg {
// 0:63
	enum ENTRY_TYPE entry_type:1;   // ENTRY_TYPE_TSG for a TSG entry
	uint64_t padding:15;
	uint32_t timeslice_scale:4;     // scale factor for this TSG's timeslice
	uint64_t padding2:4;
	uint32_t timeslice_timeout:8;   // timeout amount for this TSG's timeslice
	uint32_t tsg_length:8;          // number of channel entries in this TSG
	uint32_t padding3:24;
// 64:128
	uint32_t tsgid:12;              // timeslice group identifier
	uint64_t padding4:52;
} __attribute__((packed));
72
// Selects whether a preempt request names a single channel or a whole TSG.
enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1};

/* Preempt
   ID/CHID : Id of TSG or channel to preempt
*/
#define NV_PFIFO_PREEMPT 0x00002634
// Layout of the 32-bit NV_PFIFO_PREEMPT register.
struct pfifo_preempt {
	uint32_t id:12;           // TSG or channel id to preempt
	uint32_t padding:8;
	bool is_pending:1;        // NOTE(review): presumably set while a preempt is outstanding — confirm
	uint32_t padding2:3;
	enum PREEMPT_TYPE type:2; // whether `id` names a channel or a TSG
	uint32_t padding3:6;
} __attribute__((packed));
87
#define NV_PFIFO_RUNLIST_PREEMPT 0x00002638
// Layout of the 32-bit NV_PFIFO_RUNLIST_PREEMPT register: one bit per
// runlist. Writing a runlist's bit initiates a preempt of that runlist
// (see nvidia_preemption.md, "Alternate 2").
struct runlist_preempt {
	bool runlist_0:1;
	bool runlist_1:1;
	bool runlist_2:1;
	bool runlist_3:1;
	bool runlist_4:1;
	bool runlist_5:1;
	bool runlist_6:1;
	bool runlist_7:1;
	bool runlist_8:1;
	bool runlist_9:1;
	bool runlist_10:1;
	bool runlist_11:1;
	bool runlist_12:1;
	bool runlist_13:1;
	// Fix: 14 runlist bits + 18 padding bits = 32 bits. The previous
	// `padding:28` summed to 42 bits, so the packed struct was 6 bytes and
	// no longer mirrored the 32-bit register.
	uint32_t padding:18;
} __attribute__((packed));
106
// Note: This is different with Turing
#define NV_PFIFO_RUNLIST_BASE 0x00002270
// 32-bit view of the runlist base-address register. The union lets callers
// read the raw register once and then pick apart the fields.
typedef union {
	struct {
		uint32_t ptr:28;     // runlist base address >> 12 (4k-aligned)
		uint32_t type:2;     // aperture of the runlist; nvdebug.c observes
		                     // sysmem_coherent here — confirm encoding
		uint32_t padding:2;
	} __attribute__((packed));
	uint32_t raw;            // whole register as read via nvdebug_readl()
} runlist_base_t;
117
#define NV_PFIFO_RUNLIST 0x00002274
// 32-bit view of the runlist submit/info register.
typedef union {
	struct {
		uint32_t len:16;     // total number of entries in the runlist
		uint32_t padding:4;
		uint32_t id:4;       // which runlist this describes
		uint32_t padding2:8;
	} __attribute__((packed));
	uint32_t raw;            // whole register as read via nvdebug_readl()
} runlist_info_t;
diff --git a/nvidia_preemption.md b/nvidia_preemption.md
new file mode 100644
index 0000000..051d4a5
--- /dev/null
+++ b/nvidia_preemption.md
@@ -0,0 +1,36 @@
1# NVIDIA GPU Preemption
2
3MVP: Preempt current work on the GPU on the Jetson Xavier
4
5Summary of approach: Create new runlist that excludes the current work and point the GPU to it
6
71. Obtain current runlist
82. Copy runlist to new location, skipping TSG of target to preempt
93. Write new runlist address to NV_PFIFO_RUNLIST, which will preempt current work
10
11It's unclear if this approach is lower-overhead than that of Capodieci et al.
12See approach Alternate 1 which is our new priority.
13
14Notes:
15- Each TSG (timeslice group) corresponds to one context (?)
16- Runlist base must be 4k aligned
17- nvgpu driver gets gk20a struct via container_of an inode which is a struct nvgpu_os_linux
18- gk20a_writel is nvgpu_writel. Define is: `void nvgpu_writel(struct gk20a *g, u32 reg_addr, u32 value);`
19- gk20a_readl is nvgpu_readl. Define is: `u32 nvgpu_readl(struct gk20a *g, u32 reg_addr);`
20
21## Other approaches:
22
23### Alternate 1:
24 "2. Disable all channels in the containing TSG by writing ENABLE_CLR to TRUE
25 in their channel RAM entries in NV_PCCSR_CHANNEL (see dev_fifo.ref).
26 3. Initiate a preempt of the TSG via NV_PFIFO_PREEMPT or
27 NV_PFIFO_RUNLIST_PREEMPT." (PBDMA, "Recovery procedure")
28
29### Alternate 2:
30 "3. Initiate a preempt of the engine by writing the bit associated with its
31 runlist to NV_PFIFO_RUNLIST_PREEMPT. This allows us to begin the preempt
32 process prior to doing the slow register reads needed to determine whether
33 the context has hit any interrupts or is hung. Do not poll
34 NV_PFIFO_RUNLIST_PREEMPT for the preempt to complete." (FIFO, "Context TSG tear-down procedure")
35
36See `nvdebug.c` and `nvdebug.h` for implementation details.