diff options
-rw-r--r-- | .gitignore | 7 | ||||
-rw-r--r-- | Makefile | 13 | ||||
-rw-r--r-- | nvdebug.c | 278 | ||||
-rw-r--r-- | nvdebug.h | 127 | ||||
-rw-r--r-- | nvidia_preemption.md | 36 |
5 files changed, 461 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..197a191 --- /dev/null +++ b/.gitignore | |||
@@ -0,0 +1,7 @@ | |||
1 | .* | ||
2 | *.ko | ||
3 | *.mod.c | ||
4 | *.o | ||
5 | *.o.* | ||
6 | modules.order | ||
7 | Module.symvers | ||
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..cc14996 --- /dev/null +++ b/Makefile | |||
@@ -0,0 +1,13 @@ | |||
obj-m += nvdebug.o

# TODO: Avoid needing to distribute NVIDIA's headers (at least they're MIT...)
#ccflags-y += -I$(PWD)/include
# Header paths into the L4T (Jetson) nvgpu driver source tree.
# NOTE(review): these are machine-specific absolute paths — TODO: make configurable.
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu/include
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include/uapi

# Standard out-of-tree module build against the running kernel's build tree
all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
clean:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
diff --git a/nvdebug.c b/nvdebug.c new file mode 100644 index 0000000..31a797e --- /dev/null +++ b/nvdebug.c | |||
@@ -0,0 +1,278 @@ | |||
1 | /* Copyright 2021 Joshua Bakita | ||
2 | * SPDX-License-Identifier: MIT | ||
3 | */ | ||
4 | |||
5 | /* TODO | ||
6 | * - Add /proc /sys or debugfs interface | ||
7 | * - Add API to trigger a preemption | ||
8 | */ | ||
9 | |||
10 | #include <linux/module.h> | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/device.h> | ||
13 | #include <linux/kallsyms.h> | ||
14 | #include <linux/iommu.h> // For struct iommu_domain | ||
15 | #include <asm/io.h> | ||
16 | |||
17 | /* Currently used symbols: | ||
18 | * - struct gk20a; | ||
19 | * - struct nvgpu_os_linux; | ||
20 | * - void nvgpu_writel(struct gk20a *g, u32 reg_addr, u32 value); | ||
21 | */ | ||
22 | #include <nvgpu/io.h> | ||
23 | #include <nvgpu/gk20a.h> | ||
24 | #include <os/linux/os_linux.h> | ||
25 | |||
26 | #include "nvdebug.h" | ||
27 | |||
// Module metadata. The source is MIT-licensed (see SPDX header above), but
// "GPL" is declared here so that GPL-only kernel symbols remain accessible.
MODULE_LICENSE("GPL"); // Actual license is MIT; see note above
MODULE_AUTHOR("Joshua Bakita");
MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");
MODULE_SOFTDEP("pre: nvgpu"); // We only support the Jetson boards for now
32 | |||
33 | // Bus types are global symbols in the kernel | ||
34 | extern struct bus_type platform_bus_type; | ||
35 | |||
// Recover the nvgpu driver's struct gk20a handle from a platform device.
// XXX: Relies on the struct gk20a* being the first member of gk20a_platform
// (the device's driver data), so a single dereference recovers it.
static inline struct gk20a *get_gk20a(struct device *dev) {
	struct gk20a **g_store = dev_get_drvdata(dev);
	return *g_store;
}
40 | |||
41 | // Functionally identical to nvgpu_readl() | ||
42 | // (except we don't try to resolve situations where regs is NULL) | ||
43 | static inline u32 nvdebug_readl(struct gk20a* g, u32 r) { | ||
44 | struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g); | ||
45 | if (unlikely(!g_os->regs)) { | ||
46 | printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n"); | ||
47 | return -1; | ||
48 | } | ||
49 | return readl(g_os->regs + r); | ||
50 | } | ||
51 | |||
52 | // Functionally identical to nvgpu_writel() | ||
53 | static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) { | ||
54 | struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g); | ||
55 | if (unlikely(!g_os->regs)) { | ||
56 | printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n"); | ||
57 | return; | ||
58 | } | ||
59 | writel_relaxed(v, g_os->regs + r); | ||
60 | wmb(); | ||
61 | } | ||
62 | /* | ||
63 | #define RUNLIST_PROCFS_NAME "runlist" | ||
64 | |||
65 | static const struct seq_operations runlist_file_seq_ops = { | ||
66 | .start = | ||
67 | .next = | ||
68 | .stop = | ||
69 | .show = | ||
70 | }; | ||
71 | |||
72 | static const struct file_operations runlist_file_ops = { | ||
73 | .read = | ||
74 | */ | ||
75 | /*static void read_bytes(struct gk20a *g, void* target, u32 start, u32 num_bytes) { | ||
76 | u32 *output = target; | ||
77 | u32 i; | ||
78 | // Read u32s from the GPU | ||
79 | for (i = 0; i < num_bytes; i += 4) { | ||
80 | output[i/4] = _nvgpu_readl(g, start + i); | ||
81 | printk(KERN_INFO "[nvdebug] U32 %d: %0x\n", i, output[i/4]); | ||
82 | } | ||
83 | } | ||
84 | |||
85 | static void read_bytes(void* target, void* start, u32 num_bytes) { | ||
86 | u32 *output = target; | ||
87 | u32 i; | ||
88 | // Read u32s from the GPU | ||
89 | for (i = 0; i < num_bytes; i += 4) { | ||
90 | output[i/4] = readl(start + i); | ||
91 | printk(KERN_INFO "[nvdebug] U32 %d: %0x\n", i, output[i/4]); | ||
92 | } | ||
93 | }*/ | ||
94 | |||
95 | /* | ||
96 | +---- TSG Entry %d ----+ | ||
97 | | Scale: %d | | ||
98 | | Timeout: %d | | ||
99 | +----------------------+ | ||
100 | |||
101 | |||
102 | |||
103 | |||
104 | |||
105 | |||
106 | */ | ||
107 | |||
// Common printk prefix: log level plus module tag
#define PRE KERN_INFO "[nvdebug] "

// Pretty-print one TSG runlist entry to the kernel log as a small ASCII box.
// Logs a warning and prints nothing if `tsg` is not a TSG-type entry.
static void nvdebug_print_tsg(struct entry_tsg* tsg) {
	if (tsg->entry_type != ENTRY_TYPE_TSG) {
		printk(KERN_WARNING "[nvdebug] Attempted to print non-TSG in nvdebug_print_tsg()!\n");
		return;
	}
	printk(PRE "+---- TSG Entry %-2d----+", tsg->tsgid);
	printk(PRE "| Scale: %-13d|", tsg->timeslice_scale);
	printk(PRE "| Timeout: %-11d|", tsg->timeslice_timeout);
	printk(PRE "+---------------------+");
}
120 | |||
121 | static void nvdebug_print_chan(struct runlist_chan* chan) { | ||
122 | char* loc_txt; | ||
123 | u64 inst_ptr; | ||
124 | if (chan->entry_type != ENTRY_TYPE_CHAN) { | ||
125 | printk(KERN_WARNING "[nvdebug] Attempted to print non-channel in nvdebug_print_channel()!\n"); | ||
126 | return; | ||
127 | } | ||
128 | switch (chan->inst_target) { | ||
129 | case TARGET_VID_MEM: | ||
130 | loc_txt = "VID_MEM"; | ||
131 | break; | ||
132 | case TARGET_SYS_MEM_COHERENT: | ||
133 | loc_txt = "SYS_MEM_COHERENT"; | ||
134 | break; | ||
135 | case TARGET_SYS_MEM_NONCOHERENT: | ||
136 | loc_txt = "SYS_MEM_NONCOHERENT"; | ||
137 | break; | ||
138 | default: | ||
139 | printk(KERN_WARNING "[nvdebug] Invalid aperture in runlist channel!\n"); | ||
140 | return; | ||
141 | } | ||
142 | // Reconstruct pointer to channel instance block | ||
143 | inst_ptr = chan->inst_ptr_hi; | ||
144 | inst_ptr <<= 32; | ||
145 | inst_ptr |= chan->inst_ptr_lo << 12; | ||
146 | |||
147 | printk(PRE " +- Channel Entry %-4d-+", chan->chid); | ||
148 | printk(PRE " | Runqueue Selector: %d|", chan->runqueue_selector); | ||
149 | printk(PRE " | Instance PTR: |"); | ||
150 | printk(PRE " | %#018llx |", inst_ptr); | ||
151 | printk(PRE " | %-20s|", loc_txt); | ||
152 | printk(PRE " +---------------------+"); | ||
153 | } | ||
154 | |||
// Iterate `chan` over the channel entries that immediately follow TSG entry
// `tsg` in the runlist (TSG and channel entries are the same size, so
// `tsg + 1` is the first channel entry).
// Fix: arguments are now fully parenthesized so expressions expand safely.
// NOTE: `tsg` is evaluated multiple times — pass a plain pointer, not an
// expression with side effects.
#define for_chan_in_tsg(chan, tsg) \
	for ((chan) = (struct runlist_chan*)((tsg) + 1); \
	     (void*)(chan) < (void*)((tsg) + 1) + sizeof(struct runlist_chan) * (tsg)->tsg_length; \
	     (chan)++)

// Address of the TSG entry that follows `tsg` and all of its channels.
// Fix: expansion is now parenthesized so it binds correctly when used
// inside a larger expression.
#define next_tsg(tsg) \
	((void*)((tsg) + 1) + sizeof(struct runlist_chan) * (tsg)->tsg_length)
162 | |||
163 | static void nvdebug_print_runlist(struct entry_tsg* head, runlist_info_t rl_info) { | ||
164 | int rl_idx = 0; | ||
165 | struct runlist_chan* chan; | ||
166 | printk(PRE "tsg->tsg_length: %d\n", head->tsg_length); | ||
167 | printk(PRE "rl_info.len: %d\n", rl_info.len); | ||
168 | while (rl_idx < rl_info.len) { | ||
169 | nvdebug_print_tsg(head); | ||
170 | for_chan_in_tsg(chan, head) { | ||
171 | nvdebug_print_chan(chan); | ||
172 | } | ||
173 | rl_idx += 1 + head->tsg_length; | ||
174 | head = next_tsg(head); | ||
175 | } | ||
176 | } | ||
177 | |||
178 | static int __init nvdebug_init(void) { | ||
179 | struct device *dev = NULL; | ||
180 | struct device *temp_dev; | ||
181 | struct gk20a *g; | ||
182 | struct entry_tsg head; | ||
183 | runlist_base_t rl_base; | ||
184 | runlist_info_t rl_info; | ||
185 | u64 runlist_iova; | ||
186 | // Get the last device that matches our name | ||
187 | while ((temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b"))) { | ||
188 | printk(KERN_INFO "Found a matching device\n"); | ||
189 | dev = temp_dev; | ||
190 | } | ||
191 | if (!dev) | ||
192 | return -EIO; | ||
193 | g = get_gk20a(dev); | ||
194 | // This address seems to not be: | ||
195 | // - A GPU address (type is sysmem_coherent) | ||
196 | // - A physical address (dereferencing after ioremap crashes) | ||
197 | // - A kernel virtual address (dereferencing segfaults) | ||
198 | // So maybe it's some sort of custom thing? This is an address that the GPU | ||
199 | // can use, so it would make most sense for it to be a physical address. | ||
200 | // | ||
201 | // BUT, it can't possibly be a physical address, as it would refer to an | ||
202 | // address greater than the maximum one on our system (by a lot!). | ||
203 | // Maybe I'm reading the runlist base wrong? | ||
204 | // Aha, the driver calls it runlist_iova. Sounds like runlist I/O virtual | ||
205 | // address! So, what's this I/O address space? All I know is that it's what | ||
206 | // nvgpu_mem_get_addr() returns. That function returns the result of either: | ||
207 | // - gpu_phys_addr which is __nvgpu_sgl_phys on our platform which (?) | ||
208 | // converts an IPA to a PA? | ||
209 | // - nvgpu_mem_iommu_translate | ||
210 | // | ||
211 | // The original memory is allocated with nvgpu_dma_alloc_flags_sys(), which | ||
212 | // returns SYSMEM. | ||
213 | // | ||
214 | // To convert a physical address to a IOMMU address, we add a bit | ||
215 | // | ||
216 | // BUT, it turns out that it IS JUST A PHYSICAL ADDRESS! It wasn't working | ||
217 | // before because the GPU had simply gone to sleep and invalidated its | ||
218 | // register state, so nvgpu_readl() was simply returning garbage. | ||
219 | |||
220 | printk(KERN_INFO "[nvdebug] Pulling runlist base address from %x\n", NV_PFIFO_RUNLIST_BASE); | ||
221 | printk(KERN_INFO "[nvdebug] Using struct gk20a* of %px\n", g); | ||
222 | printk(KERN_INFO "[nvdebug] g->name: %s, g->power_on: %d, g->sw_ready: %d, g->is_virtual %d\n", g->name, g->power_on, g->sw_ready, g->is_virtual); | ||
223 | struct nvgpu_os_linux *l = container_of(g, struct nvgpu_os_linux, g); | ||
224 | printk(KERN_INFO "[nvdebug] l->regs %px, l->regs_saved %px\n", l->regs, l->regs_saved); | ||
225 | if (!l->regs) | ||
226 | return -EIO; | ||
227 | rl_base.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST_BASE); | ||
228 | rl_info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST); | ||
229 | runlist_iova = ((u64)rl_base.ptr) << 12; | ||
230 | printk(KERN_INFO "[nvdebug] Runlist ptr: %x, type: %d, raw: %x, IOVA: %px\n", rl_base.ptr, rl_base.type, rl_base.raw, (void*)runlist_iova); | ||
231 | // Segfaults | ||
232 | //u32 attempted_read = ioread32(runlist_iova); | ||
233 | //printk(KERN_INFO "[nvdebug] first word of runlist: %0x\n", attempted_read); | ||
234 | |||
235 | // Errors out | ||
236 | //u32* virt_rt_addr = ioremap(phys_rl_addr, sizeof(struct entry_tsg)); | ||
237 | //printk(KERN_INFO "[nvdebug] Runlist virt_addr: %px\n", virt_rt_addr); | ||
238 | |||
239 | /* Overcomplicated? | ||
240 | struct iommu_domain *domain = iommu_get_domain_for_dev(dev); | ||
241 | if (!domain) { | ||
242 | printk(KERN_INFO "[nvdebug] No IOMMU domain!\n"); | ||
243 | return -EIO; | ||
244 | } | ||
245 | u64 phys_addr = platform_bus_type.iommu_ops->iova_to_phys(domain, runlist_iova); | ||
246 | printk(KERN_INFO "[nvdebug] Runlist PA: %px\n", phys_addr); | ||
247 | */ | ||
248 | |||
249 | printk(KERN_INFO "[nvdebug] Runlist phys_to_virt: %px\n", (void*)phys_to_virt(runlist_iova)); | ||
250 | printk(KERN_INFO "[nvdebug] Runlist *phys_to_virt: %x\n", *(u32*)phys_to_virt(runlist_iova)); | ||
251 | head = *(struct entry_tsg*)phys_to_virt(runlist_iova); | ||
252 | nvdebug_print_runlist((struct entry_tsg*)phys_to_virt(runlist_iova), rl_info); | ||
253 | //nvdebug_print_tsg(&head); | ||
254 | //nvdebug_print_chan((struct runlist_chan*)(phys_to_virt(runlist_iova) + sizeof(struct entry_tsg))); | ||
255 | //printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type); | ||
256 | //printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale); | ||
257 | //printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout); | ||
258 | //printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length); | ||
259 | //printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid); | ||
260 | |||
261 | //printk(KERN_INFO "[nvdebug] Mem base phys: %p\n", (void*)virt_to_phys((void*)0xffffffc000000000ULL)); | ||
262 | //printk(KERN_INFO "[nvdebug] Mem end phys: %p\n", (void*)virt_to_phys((void*)0xffffffc400000000ULL)); | ||
263 | //printk(KERN_INFO "[nvdebug] Runlist *virt_addr: %x\n", readl(virt_rt_addr)); // This crashes | ||
264 | //read_bytes(&head, virt_rt_addr, sizeof(struct entry_tsg)); | ||
265 | /*printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type); | ||
266 | printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale); | ||
267 | printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout); | ||
268 | printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length); | ||
269 | printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid); | ||
270 | */return 0; | ||
271 | } | ||
272 | |||
// Module teardown. Nothing to undo: init only reads registers and memory.
static void __exit nvdebug_exit(void) {
	printk(KERN_INFO "[nvdebug] Exiting...\n");
}

module_init(nvdebug_init);
module_exit(nvdebug_exit);
diff --git a/nvdebug.h b/nvdebug.h new file mode 100644 index 0000000..aa5d0cf --- /dev/null +++ b/nvdebug.h | |||
@@ -0,0 +1,127 @@ | |||
1 | /* Copyright 2021 Joshua Bakita | ||
2 | * SPDX-License-Identifier: MIT | ||
3 | */ | ||
4 | |||
5 | /* Runlist Channel | ||
6 | A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue | ||
7 | of GPU commands. These commands are typically queued from userspace. | ||
8 | |||
9 | `INST_PTR` points to a GPU Instance Block which contains pointers to the GPU | ||
10 | virtual address space for this context. All channels in a TSG point to the | ||
11 | same GPU Instance Block. | ||
12 | |||
13 | ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_CHAN | ||
14 | CHID (ID) : identifier of the channel to run (overlays ENTRY_ID) | ||
15 | RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if | ||
16 | more than one PBDMA is supported by the runlist | ||
17 | |||
18 | INST_PTR_LO : lower 20 bits of the 4k-aligned instance block pointer | ||
   INST_PTR_HI : upper 32 bits of the instance block pointer
   INST_TARGET (TGI) : aperture of the instance block

   USERD_PTR_LO : upper 24 bits of the lower 32 bits of the 512-byte-aligned USERD pointer
23 | USERD_PTR_HI : upper 32 bits of USERD pointer | ||
24 | USERD_TARGET (TGU) : aperture of the USERD data structure | ||
25 | */ | ||
// Runlist entry discriminator (the T field): bare channel or TSG header.
enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1};
// Aperture of a pointer in a runlist entry (TGI/TGU fields). Value 1 is
// not defined here — presumably reserved/invalid; TODO confirm.
enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3};
28 | |||
// Hardware layout of a channel runlist entry (see comment block above).
// NOTE(review): enum-typed bit-fields have implementation-defined layout;
// this matches GCC's behavior on the target platform — confirm if building
// with another compiler.
struct runlist_chan {
// 0:63
	enum ENTRY_TYPE entry_type:1;  // T: ENTRY_TYPE_CHAN for this layout
	uint32_t runqueue_selector:1;  // Q: which PBDMA runs this channel
	uint32_t padding:2;
	enum INST_TARGET inst_target:2; // TGI: aperture of the instance block
	uint32_t padding2:2;
	uint32_t userd_ptr_lo:24;      // bits 31:8 of the 512-byte-aligned USERD pointer
	uint32_t userd_ptr_hi:32;      // bits 63:32 of the USERD pointer
// 64:128
	uint32_t chid:12;              // ID: channel identifier
	uint32_t inst_ptr_lo:20;       // bits 31:12 of the 4k-aligned instance block pointer
	uint32_t inst_ptr_hi:32;       // bits 63:32 of the instance block pointer
} __attribute__((packed));
43 | |||
/* Runlist TSG (TimeSlice Group)
   The runlist is composed of timeslice groups (TSG). Each TSG corresponds
   to a single virtual address space on the GPU and contains `TSG_LENGTH`
   channels. These channels and virtual address space are accessible to the GPU
   host unit for use until the timeslice expires or a TSG switch is forcibly
   initiated via a write to `NV_PFIFO_PREEMPT`.

   timeslice = (TSG_TIMESLICE_TIMEOUT << TSG_TIMESLICE_SCALE) * 1024 nanoseconds

   ENTRY_TYPE (T)    : type of this entry: ENTRY_TYPE_TSG
   TSGID             : identifier of the Timeslice group (overlays ENTRY_ID)
   TSG_LENGTH        : number of channels that are part of this timeslice group
   TIMESLICE_SCALE   : scale factor for the TSG's timeslice
   TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice
*/
// NOTE(review): bit-field members mix uint32_t and uint64_t base types;
// allocation of such fields is implementation-defined. The totals are 64
// bits per word, matching GCC's packed layout — confirm on other compilers.
struct entry_tsg {
// 0:63
	enum ENTRY_TYPE entry_type:1;  // T: ENTRY_TYPE_TSG for this layout
	uint64_t padding:15;
	uint32_t timeslice_scale:4;    // scale factor for the TSG's timeslice
	uint64_t padding2:4;
	uint32_t timeslice_timeout:8;  // timeout amount for the TSG's timeslice
	uint32_t tsg_length:8;         // number of channel entries following this one
	uint32_t padding3:24;
// 64:128
	uint32_t tsgid:12;             // identifier of this timeslice group
	uint64_t padding4:52;
} __attribute__((packed));
72 | |||
// Whether a preempt request targets a single channel or a whole TSG.
enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1};

/* Preempt
   ID/CHID : Id of TSG or channel to preempt
*/
#define NV_PFIFO_PREEMPT 0x00002634
// Layout of the NV_PFIFO_PREEMPT register (32 bits total).
struct pfifo_preempt {
	uint32_t id:12;            // channel or TSG identifier to preempt
	uint32_t padding:8;
	bool is_pending:1;         // presumably set while a preempt is outstanding — TODO confirm
	uint32_t padding2:3;
	enum PREEMPT_TYPE type:2;  // whether `id` names a channel or a TSG
	uint32_t padding3:6;
} __attribute__((packed));
87 | |||
88 | #define NV_PFIFO_RUNLIST_PREEMPT 0x00002638 | ||
89 | struct runlist_preempt { | ||
90 | bool runlist_0:1; | ||
91 | bool runlist_1:1; | ||
92 | bool runlist_2:1; | ||
93 | bool runlist_3:1; | ||
94 | bool runlist_4:1; | ||
95 | bool runlist_5:1; | ||
96 | bool runlist_6:1; | ||
97 | bool runlist_7:1; | ||
98 | bool runlist_8:1; | ||
99 | bool runlist_9:1; | ||
100 | bool runlist_10:1; | ||
101 | bool runlist_11:1; | ||
102 | bool runlist_12:1; | ||
103 | bool runlist_13:1; | ||
104 | uint32_t padding:28; | ||
105 | } __attribute__((packed)); | ||
106 | |||
// Note: This is different with Turing
// NV_PFIFO_RUNLIST_BASE register: where the active runlist lives.
#define NV_PFIFO_RUNLIST_BASE 0x00002270
typedef union {
	struct {
		uint32_t ptr:28;     // bits 39:12 of the 4k-aligned runlist address
		uint32_t type:2;     // aperture of the runlist — presumably the INST_TARGET encoding; TODO confirm
		uint32_t padding:2;
	} __attribute__((packed));
	uint32_t raw;                // whole register as read/written via MMIO
} runlist_base_t;
117 | |||
// NV_PFIFO_RUNLIST register: which runlist is active and how long it is.
// Writing this register submits a runlist (see nvidia_preemption.md).
#define NV_PFIFO_RUNLIST 0x00002274
typedef union {
	struct {
		uint32_t len:16;     // total number of entries (TSG headers + channels)
		uint32_t padding:4;
		uint32_t id:4;       // presumably the runlist identifier — TODO confirm
		uint32_t padding2:8;
	} __attribute__((packed));
	uint32_t raw;                // whole register as read/written via MMIO
} runlist_info_t;
diff --git a/nvidia_preemption.md b/nvidia_preemption.md new file mode 100644 index 0000000..051d4a5 --- /dev/null +++ b/nvidia_preemption.md | |||
@@ -0,0 +1,36 @@ | |||
1 | # NVIDIA GPU Preemption | ||
2 | |||
3 | MVP: Preempt current work on the GPU on the Jetson Xavier | ||
4 | |||
5 | Summary of approach: Create new runlist that excludes the current work and point the GPU to it | ||
6 | |||
7 | 1. Obtain current runlist | ||
8 | 2. Copy runlist to new location, skipping TSG of target to preempt | ||
9 | 3. Write new runlist address to NV_PFIFO_RUNLIST, which will preempt current work | ||
10 | |||
It is unclear whether this approach has lower overhead than that of Capodieci et al.;
see Alternate 1 below, which is now our priority.
13 | |||
14 | Notes: | ||
15 | - Each TSG (timeslice group) corresponds to one context (?) | ||
16 | - Runlist base must be 4k aligned | ||
17 | - nvgpu driver gets gk20a struct via container_of an inode which is a struct nvgpu_os_linux | ||
18 | - gk20a_writel is nvgpu_writel. Define is: `void nvgpu_writel(struct gk20a *g, u32 reg_addr, u32 value);` | ||
19 | - gk20a_readl is nvgpu_readl. Define is: `u32 nvgpu_readl(struct gk20a *g, u32 reg_addr);` | ||
20 | |||
21 | ## Other approaches: | ||
22 | |||
23 | ### Alternate 1: | ||
24 | "2. Disable all channels in the containing TSG by writing ENABLE_CLR to TRUE | ||
25 | in their channel RAM entries in NV_PCCSR_CHANNEL (see dev_fifo.ref). | ||
26 | 3. Initiate a preempt of the TSG via NV_PFIFO_PREEMPT or | ||
27 | NV_PFIFO_RUNLIST_PREEMPT." (PBDMA, "Recovery procedure") | ||
28 | |||
29 | ### Alternate 2: | ||
30 | "3. Initiate a preempt of the engine by writing the bit associated with its | ||
31 | runlist to NV_PFIFO_RUNLIST_PREEMPT. This allows us to begin the preempt | ||
32 | process prior to doing the slow register reads needed to determine whether | ||
33 | the context has hit any interrupts or is hung. Do not poll | ||
34 | NV_PFIFO_RUNLIST_PREEMPT for the preempt to complete." (FIFO, "Context TSG tear-down procedure") | ||
35 | |||
36 | See `nvdebug.c` and `nvdebug.h` for implementation details. | ||