-rw-r--r--  .gitignore            |   7 +
-rw-r--r--  Makefile              |  13 +
-rw-r--r--  nvdebug.c             | 278 +
-rw-r--r--  nvdebug.h             | 127 +
-rw-r--r--  nvidia_preemption.md  |  36 +
5 files changed, 461 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..197a191
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
.*
*.ko
*.mod.c
*.o
*.o.*
modules.order
Module.symvers
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..cc14996
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,13 @@
obj-m += nvdebug.o

# TODO: Avoid needing to distribute NVIDIA's headers (at least they're MIT...)
#ccflags-y += -I$(PWD)/include
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu/include
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include
ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include/uapi

all:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
clean:
	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
diff --git a/nvdebug.c b/nvdebug.c
new file mode 100644
index 0000000..31a797e
--- /dev/null
+++ b/nvdebug.c
@@ -0,0 +1,278 @@
/* Copyright 2021 Joshua Bakita
 * SPDX-License-Identifier: MIT
 */

/* TODO
 * - Add a /proc, /sys, or debugfs interface
 * - Add an API to trigger a preemption
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/kallsyms.h>
#include <linux/iommu.h> // For struct iommu_domain
#include <asm/io.h>

/* Currently used symbols:
 * - struct gk20a;
 * - struct nvgpu_os_linux;
 * - void nvgpu_writel(struct gk20a *g, u32 reg_addr, u32 value);
 */
#include <nvgpu/io.h>
#include <nvgpu/gk20a.h>
#include <os/linux/os_linux.h>

#include "nvdebug.h"

MODULE_LICENSE("GPL"); // LIAR (the source is MIT; "GPL" is claimed to keep GPL-only kernel symbols accessible)
MODULE_AUTHOR("Joshua Bakita");
MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");
MODULE_SOFTDEP("pre: nvgpu"); // We only support the Jetson boards for now

// Bus types are global symbols in the kernel
extern struct bus_type platform_bus_type;

static inline struct gk20a *get_gk20a(struct device *dev) {
	// XXX: Only works because gk20a* is the first member of gk20a_platform
	return *((struct gk20a**)dev_get_drvdata(dev));
}

// Functionally identical to nvgpu_readl()
// (except we don't try to resolve situations where regs is NULL)
static inline u32 nvdebug_readl(struct gk20a* g, u32 r) {
	struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
	if (unlikely(!g_os->regs)) {
		printk(KERN_ERR "[nvdebug] Attempted nvdebug_readl on non-existent registers!\n");
		return -1;
	}
	return readl(g_os->regs + r);
}

// Functionally identical to nvgpu_writel()
static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) {
	struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
	if (unlikely(!g_os->regs)) {
		printk(KERN_ERR "[nvdebug] Attempted nvdebug_writel on non-existent registers!\n");
		return;
	}
	writel_relaxed(v, g_os->regs + r);
	wmb();
}
/*
#define RUNLIST_PROCFS_NAME "runlist"

static const struct seq_operations runlist_file_seq_ops = {
	.start =
	.next =
	.stop =
	.show =
};

static const struct file_operations runlist_file_ops = {
	.read =
*/
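
/* A minimal sketch (our hypothetical plumbing, untested) of how the seq_file
 * callbacks above could look once implemented. It treats the whole runlist
 * dump as a single record; a real version would likely iterate one TSG per
 * step. Requires <linux/seq_file.h>, and is unused until wired into
 * runlist_file_seq_ops, so expect an unused-function warning meanwhile. */
static void *runlist_file_seq_start(struct seq_file *s, loff_t *pos) {
	// Single-record file: only offset 0 yields a record
	return (*pos == 0) ? pos : NULL;
}

static void *runlist_file_seq_next(struct seq_file *s, void *v, loff_t *pos) {
	// Only one record, so iteration always ends here
	(*pos)++;
	return NULL;
}

static void runlist_file_seq_stop(struct seq_file *s, void *v) {}

static int runlist_file_seq_show(struct seq_file *s, void *v) {
	// TODO: Walk the runlist and seq_printf() each TSG and channel entry
	seq_puts(s, "[nvdebug] runlist dump unimplemented\n");
	return 0;
}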
/*static void read_bytes(struct gk20a *g, void* target, u32 start, u32 num_bytes) {
	u32 *output = target;
	u32 i;
	// Read u32s from the GPU
	for (i = 0; i < num_bytes; i += 4) {
		output[i/4] = _nvgpu_readl(g, start + i);
		printk(KERN_INFO "[nvdebug] U32 %d: %0x\n", i, output[i/4]);
	}
}

static void read_bytes(void* target, void* start, u32 num_bytes) {
	u32 *output = target;
	u32 i;
	// Read u32s from the GPU
	for (i = 0; i < num_bytes; i += 4) {
		output[i/4] = readl(start + i);
		printk(KERN_INFO "[nvdebug] U32 %d: %0x\n", i, output[i/4]);
	}
}*/

/*
+---- TSG Entry %d ----+
| Scale: %d            |
| Timeout: %d          |
+----------------------+
*/

#define PRE KERN_INFO "[nvdebug] "

static void nvdebug_print_tsg(struct entry_tsg* tsg) {
	if (tsg->entry_type != ENTRY_TYPE_TSG) {
		printk(KERN_WARNING "[nvdebug] Attempted to print non-TSG in nvdebug_print_tsg()!\n");
		return;
	}
	printk(PRE "+---- TSG Entry %-2d----+\n", tsg->tsgid);
	printk(PRE "| Scale: %-13d|\n", tsg->timeslice_scale);
	printk(PRE "| Timeout: %-11d|\n", tsg->timeslice_timeout);
	printk(PRE "+---------------------+\n");
}

static void nvdebug_print_chan(struct runlist_chan* chan) {
	char* loc_txt;
	u64 inst_ptr;
	if (chan->entry_type != ENTRY_TYPE_CHAN) {
		printk(KERN_WARNING "[nvdebug] Attempted to print non-channel in nvdebug_print_chan()!\n");
		return;
	}
	switch (chan->inst_target) {
	case TARGET_VID_MEM:
		loc_txt = "VID_MEM";
		break;
	case TARGET_SYS_MEM_COHERENT:
		loc_txt = "SYS_MEM_COHERENT";
		break;
	case TARGET_SYS_MEM_NONCOHERENT:
		loc_txt = "SYS_MEM_NONCOHERENT";
		break;
	default:
		printk(KERN_WARNING "[nvdebug] Invalid aperture in runlist channel!\n");
		return;
	}
	// Reconstruct pointer to channel instance block
	inst_ptr = chan->inst_ptr_hi;
	inst_ptr <<= 32;
	// Cast to u64 before shifting: the promoted 20-bit bitfield shifted left
	// by 12 can overflow (and sign-extend out of) a 32-bit int
	inst_ptr |= (u64)chan->inst_ptr_lo << 12;

	printk(PRE " +- Channel Entry %-4d-+\n", chan->chid);
	printk(PRE " | Runqueue Selector: %d|\n", chan->runqueue_selector);
	printk(PRE " | Instance PTR:       |\n");
	printk(PRE " | %#018llx  |\n", inst_ptr);
	printk(PRE " | %-20s|\n", loc_txt);
	printk(PRE " +---------------------+\n");
}

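// The macros below walk a runlist in place. In the runlist format (see
// nvdebug.h), a TSG entry is immediately followed by its tsg_length channel
// entries, and both entry types are 128 bits, so the next TSG begins right
// after the current TSG's channels.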
#define for_chan_in_tsg(chan, tsg) \
	for (chan = (struct runlist_chan*)(tsg + 1); \
	     (void*)chan < (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length; \
	     chan++)

#define next_tsg(tsg) \
	((void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length)

static void nvdebug_print_runlist(struct entry_tsg* head, runlist_info_t rl_info) {
	int rl_idx = 0;
	struct runlist_chan* chan;
	printk(PRE "tsg->tsg_length: %d\n", head->tsg_length);
	printk(PRE "rl_info.len: %d\n", rl_info.len);
	while (rl_idx < rl_info.len) {
		nvdebug_print_tsg(head);
		for_chan_in_tsg(chan, head) {
			nvdebug_print_chan(chan);
		}
		rl_idx += 1 + head->tsg_length;
		head = next_tsg(head);
	}
}

static int __init nvdebug_init(void) {
	struct device *dev = NULL;
	struct device *temp_dev;
	struct gk20a *g;
	struct nvgpu_os_linux *l;
	struct entry_tsg head;
	runlist_base_t rl_base;
	runlist_info_t rl_info;
	u64 runlist_iova;
	// Get the last device that matches our name
	while ((temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b"))) {
		printk(KERN_INFO "[nvdebug] Found a matching device\n");
		dev = temp_dev;
	}
	if (!dev)
		return -EIO;
	g = get_gk20a(dev);
	// This address seems to not be:
	// - A GPU address (type is sysmem_coherent)
	// - A physical address (dereferencing after ioremap crashes)
	// - A kernel virtual address (dereferencing segfaults)
	// So maybe it's some sort of custom thing? This is an address that the GPU
	// can use, so it would make the most sense for it to be a physical address.
	//
	// BUT, it can't possibly be a physical address, as it would refer to an
	// address greater than the maximum one on our system (by a lot!).
	// Maybe I'm reading the runlist base wrong?
	// Aha, the driver calls it runlist_iova. Sounds like runlist I/O virtual
	// address! So, what's this I/O address space? All I know is that it's what
	// nvgpu_mem_get_addr() returns. That function returns the result of either:
	// - gpu_phys_addr, which is __nvgpu_sgl_phys on our platform, which (?)
	//   converts an IPA to a PA?
	// - nvgpu_mem_iommu_translate
	//
	// The original memory is allocated with nvgpu_dma_alloc_flags_sys(), which
	// returns SYSMEM.
	//
	// To convert a physical address to an IOMMU address, we add a bit
	//
	// BUT, it turns out that it IS JUST A PHYSICAL ADDRESS! It wasn't working
	// before because the GPU had simply gone to sleep and invalidated its
	// register state, so nvgpu_readl() was simply returning garbage.

	printk(KERN_INFO "[nvdebug] Pulling runlist base address from %x\n", NV_PFIFO_RUNLIST_BASE);
	printk(KERN_INFO "[nvdebug] Using struct gk20a* of %px\n", g);
	printk(KERN_INFO "[nvdebug] g->name: %s, g->power_on: %d, g->sw_ready: %d, g->is_virtual: %d\n", g->name, g->power_on, g->sw_ready, g->is_virtual);
	l = container_of(g, struct nvgpu_os_linux, g);
	printk(KERN_INFO "[nvdebug] l->regs %px, l->regs_saved %px\n", l->regs, l->regs_saved);
	if (!l->regs)
		return -EIO;
	rl_base.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST_BASE);
	rl_info.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST);
	runlist_iova = ((u64)rl_base.ptr) << 12;
	printk(KERN_INFO "[nvdebug] Runlist ptr: %x, type: %d, raw: %x, IOVA: %px\n", rl_base.ptr, rl_base.type, rl_base.raw, (void*)runlist_iova);
	// Segfaults
	//u32 attempted_read = ioread32(runlist_iova);
	//printk(KERN_INFO "[nvdebug] first word of runlist: %0x\n", attempted_read);

	// Errors out
	//u32* virt_rt_addr = ioremap(phys_rl_addr, sizeof(struct entry_tsg));
	//printk(KERN_INFO "[nvdebug] Runlist virt_addr: %px\n", virt_rt_addr);

	/* Overcomplicated?
	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
	if (!domain) {
		printk(KERN_INFO "[nvdebug] No IOMMU domain!\n");
		return -EIO;
	}
	u64 phys_addr = platform_bus_type.iommu_ops->iova_to_phys(domain, runlist_iova);
	printk(KERN_INFO "[nvdebug] Runlist PA: %px\n", phys_addr);
	*/

	printk(KERN_INFO "[nvdebug] Runlist phys_to_virt: %px\n", (void*)phys_to_virt(runlist_iova));
	printk(KERN_INFO "[nvdebug] Runlist *phys_to_virt: %x\n", *(u32*)phys_to_virt(runlist_iova));
	head = *(struct entry_tsg*)phys_to_virt(runlist_iova);
	nvdebug_print_runlist((struct entry_tsg*)phys_to_virt(runlist_iova), rl_info);
	//nvdebug_print_tsg(&head);
	//nvdebug_print_chan((struct runlist_chan*)(phys_to_virt(runlist_iova) + sizeof(struct entry_tsg)));
	//printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type);
	//printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale);
	//printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout);
	//printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length);
	//printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid);

	//printk(KERN_INFO "[nvdebug] Mem base phys: %p\n", (void*)virt_to_phys((void*)0xffffffc000000000ULL));
	//printk(KERN_INFO "[nvdebug] Mem end phys: %p\n", (void*)virt_to_phys((void*)0xffffffc400000000ULL));
	//printk(KERN_INFO "[nvdebug] Runlist *virt_addr: %x\n", readl(virt_rt_addr)); // This crashes
	//read_bytes(&head, virt_rt_addr, sizeof(struct entry_tsg));
	/*printk(KERN_INFO "[nvdebug] entry_type: %d\n", head.entry_type);
	printk(KERN_INFO "[nvdebug] timeslice_scale: %d\n", head.timeslice_scale);
	printk(KERN_INFO "[nvdebug] timeslice_timeout: %d\n", head.timeslice_timeout);
	printk(KERN_INFO "[nvdebug] tsg_length: %d\n", head.tsg_length);
	printk(KERN_INFO "[nvdebug] tsgid: %d\n", head.tsgid);
	*/
	return 0;
}

static void __exit nvdebug_exit(void) {
	printk(KERN_INFO "[nvdebug] Exiting...\n");
}

module_init(nvdebug_init);
module_exit(nvdebug_exit);
diff --git a/nvdebug.h b/nvdebug.h
new file mode 100644
index 0000000..aa5d0cf
--- /dev/null
+++ b/nvdebug.h
@@ -0,0 +1,127 @@
/* Copyright 2021 Joshua Bakita
 * SPDX-License-Identifier: MIT
 */

/* Runlist Channel
   A timeslice group (TSG) is composed of channels. Each channel is a FIFO
   queue of GPU commands. These commands are typically queued from userspace.

   `INST_PTR` points to a GPU Instance Block which contains pointers to the
   GPU virtual address space for this context. All channels in a TSG point to
   the same GPU Instance Block.

   ENTRY_TYPE (T)        : type of this entry: ENTRY_TYPE_CHAN
   CHID (ID)             : identifier of the channel to run (overlays ENTRY_ID)
   RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if
                           more than one PBDMA is supported by the runlist

   INST_PTR_LO           : lower 20 bits of the 4k-aligned instance block pointer
   INST_PTR_HI           : upper 32 bits of the instance block pointer
   INST_TARGET (TGI)     : aperture of the instance block

   USERD_PTR_LO          : upper 24 bits of the lower 32 bits of the
                           512-byte-aligned USERD pointer
   USERD_PTR_HI          : upper 32 bits of the USERD pointer
   USERD_TARGET (TGU)    : aperture of the USERD data structure
*/
enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1};
enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3};

struct runlist_chan {
	// 0:63
	enum ENTRY_TYPE entry_type:1;
	uint32_t runqueue_selector:1;
	uint32_t padding:2;
	enum INST_TARGET inst_target:2;
	uint32_t padding2:2;
	uint32_t userd_ptr_lo:24;
	uint32_t userd_ptr_hi:32;
	// 64:128
	uint32_t chid:12;
	uint32_t inst_ptr_lo:20;
	uint32_t inst_ptr_hi:32;
} __attribute__((packed));

/* Runlist TSG (TimeSlice Group)
   The runlist is composed of timeslice groups (TSGs). Each TSG corresponds
   to a single virtual address space on the GPU and contains `TSG_LENGTH`
   channels. These channels and their virtual address space are accessible to
   the GPU host unit for use until the timeslice expires or a TSG switch is
   forcibly initiated via a write to `NV_PFIFO_PREEMPT`.

   timeslice = (TSG_TIMESLICE_TIMEOUT << TSG_TIMESLICE_SCALE) * 1024 nanoseconds
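
   For example (our arithmetic, not a value observed on hardware): with
   TIMESLICE_TIMEOUT = 128 and TIMESLICE_SCALE = 3, the timeslice is
   (128 << 3) * 1024 ns = 1,048,576 ns, or roughly 1 ms.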

   ENTRY_TYPE (T)    : type of this entry: ENTRY_TYPE_TSG
   TSGID             : identifier of the Timeslice group (overlays ENTRY_ID)
   TSG_LENGTH        : number of channels that are part of this timeslice group
   TIMESLICE_SCALE   : scale factor for the TSG's timeslice
   TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice
*/
struct entry_tsg {
	// 0:63
	enum ENTRY_TYPE entry_type:1;
	uint64_t padding:15;
	uint32_t timeslice_scale:4;
	uint64_t padding2:4;
	uint32_t timeslice_timeout:8;
	uint32_t tsg_length:8;
	uint32_t padding3:24;
	// 64:128
	uint32_t tsgid:12;
	uint64_t padding4:52;
} __attribute__((packed));

enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1};

/* Preempt
   ID/CHID : ID of the TSG or channel to preempt
*/
#define NV_PFIFO_PREEMPT 0x00002634
struct pfifo_preempt {
	uint32_t id:12;
	uint32_t padding:8;
	bool is_pending:1;
	uint32_t padding2:3;
	enum PREEMPT_TYPE type:2;
	uint32_t padding3:6;
} __attribute__((packed));

#define NV_PFIFO_RUNLIST_PREEMPT 0x00002638
struct runlist_preempt {
	bool runlist_0:1;
	bool runlist_1:1;
	bool runlist_2:1;
	bool runlist_3:1;
	bool runlist_4:1;
	bool runlist_5:1;
	bool runlist_6:1;
	bool runlist_7:1;
	bool runlist_8:1;
	bool runlist_9:1;
	bool runlist_10:1;
	bool runlist_11:1;
	bool runlist_12:1;
	bool runlist_13:1;
	uint32_t padding:18; // Pad the 14 runlist bits to the register's 32 bits
} __attribute__((packed));

// Note: This is different on Turing
#define NV_PFIFO_RUNLIST_BASE 0x00002270
typedef union {
	struct {
		uint32_t ptr:28;
		uint32_t type:2;
		uint32_t padding:2;
	} __attribute__((packed));
	uint32_t raw;
} runlist_base_t;

#define NV_PFIFO_RUNLIST 0x00002274
typedef union {
	struct {
		uint32_t len:16;
		uint32_t padding:4;
		uint32_t id:4;
		uint32_t padding2:8;
	} __attribute__((packed));
	uint32_t raw;
} runlist_info_t;
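
// Sanity checks (our addition): runlist entries are documented as 128 bits,
// and the register overlays as 32 bits, so the packed layouts above should
// have exactly these sizes.
_Static_assert(sizeof(struct runlist_chan) == 16, "runlist_chan should be 128 bits");
_Static_assert(sizeof(struct entry_tsg) == 16, "entry_tsg should be 128 bits");
_Static_assert(sizeof(runlist_base_t) == 4, "runlist_base_t should be 32 bits");
_Static_assert(sizeof(runlist_info_t) == 4, "runlist_info_t should be 32 bits");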
diff --git a/nvidia_preemption.md b/nvidia_preemption.md
new file mode 100644
index 0000000..051d4a5
--- /dev/null
+++ b/nvidia_preemption.md
@@ -0,0 +1,36 @@
# NVIDIA GPU Preemption

MVP: Preempt current work on the GPU on the Jetson Xavier

Summary of approach: Create a new runlist that excludes the work to be preempted, and point the GPU at it

1. Obtain the current runlist
2. Copy the runlist to a new location, skipping the target TSG to preempt
3. Write the new runlist address to NV_PFIFO_RUNLIST, which will preempt the current work

It's unclear if this approach is lower-overhead than that of Capodieci et al.
See approach Alternate 1 below, which is now our priority. A rough sketch of
the runlist-swap idea follows.
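
A minimal sketch of the runlist-swap approach (untested; it assumes the entry
layouts in `nvdebug.h` and the helpers in `nvdebug.c`, and that `dst` is a
4k-aligned, GPU-visible buffer with bus address `dst_iova`, which we do not
yet know how to allocate):

```c
// Copy every TSG except `target_tsgid` into `dst`, then point the GPU at it.
// Both entry types are 16 bytes, so a TSG plus its channels can be copied as
// (1 + tsg_length) 16-byte records.
static void preempt_by_runlist_swap(struct gk20a *g, u32 target_tsgid,
                                    struct entry_tsg *src, u32 src_len,
                                    struct entry_tsg *dst, u64 dst_iova) {
	struct entry_tsg *in = src;
	void *out = dst;
	u32 consumed = 0, copied = 0;
	runlist_base_t new_base = {0};
	runlist_info_t new_info = {0};
	while (consumed < src_len) {
		u32 entries = 1 + in->tsg_length;
		if (in->tsgid != target_tsgid) {
			memcpy(out, in, entries * sizeof(struct entry_tsg));
			out += entries * sizeof(struct entry_tsg);
			copied += entries;
		}
		consumed += entries;
		in = next_tsg(in);
	}
	new_base.ptr = dst_iova >> 12; // Base must be 4k aligned
	new_base.type = TARGET_SYS_MEM_COHERENT; // Guess: matches the aperture we read back
	new_info.len = copied; // A real version should also preserve the id field
	nvdebug_writel(g, NV_PFIFO_RUNLIST_BASE, new_base.raw);
	// Writing NV_PFIFO_RUNLIST submits the new runlist, preempting current work
	nvdebug_writel(g, NV_PFIFO_RUNLIST, new_info.raw);
}
```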

Notes:
- Each TSG (timeslice group) corresponds to one context (?)
- The runlist base must be 4k aligned
- The nvgpu driver gets its gk20a struct via container_of() on a struct nvgpu_os_linux, which it in turn recovers from an inode
- gk20a_writel is nvgpu_writel. Its definition: `void nvgpu_writel(struct gk20a *g, u32 reg_addr, u32 value);`
- gk20a_readl is nvgpu_readl. Its definition: `u32 nvgpu_readl(struct gk20a *g, u32 reg_addr);`

## Other approaches:

### Alternate 1:
"2. Disable all channels in the containing TSG by writing ENABLE_CLR to TRUE
in their channel RAM entries in NV_PCCSR_CHANNEL (see dev_fifo.ref).
3. Initiate a preempt of the TSG via NV_PFIFO_PREEMPT or
NV_PFIFO_RUNLIST_PREEMPT." (PBDMA, "Recovery procedure")
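
A sketch of step 3 using the `pfifo_preempt` layout from `nvdebug.h` (untested;
step 2's NV_PCCSR_CHANNEL channel-RAM writes are not modeled in our headers yet):

```c
// Hypothetical helper: ask the PFIFO unit to preempt one TSG by its ID.
static void preempt_tsg(struct gk20a *g, u32 tsgid) {
	union {
		struct pfifo_preempt fields;
		u32 raw;
	} cmd = { .raw = 0 };
	cmd.fields.id = tsgid;
	cmd.fields.type = PREEMPT_TYPE_TSG;
	nvdebug_writel(g, NV_PFIFO_PREEMPT, cmd.raw);
}
```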

### Alternate 2:
"3. Initiate a preempt of the engine by writing the bit associated with its
runlist to NV_PFIFO_RUNLIST_PREEMPT. This allows us to begin the preempt
process prior to doing the slow register reads needed to determine whether
the context has hit any interrupts or is hung. Do not poll
NV_PFIFO_RUNLIST_PREEMPT for the preempt to complete." (FIFO, "Context TSG tear-down procedure")

See `nvdebug.c` and `nvdebug.h` for implementation details.
