aboutsummaryrefslogtreecommitdiffstats
path: root/nvdebug_entry.c
blob: 3a10e132fb04e88c6a3015848baa51310f063ada (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
/* Copyright 2024 Joshua Bakita
 * SPDX-License-Identifier: MIT
 */

#include <linux/device.h>  // For struct device, bus_find_device*(), struct bus_type
#include <linux/interrupt.h> // For hooking the nvidia driver interrupts
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>  // For PCI device scanning
#include <linux/platform_device.h>  // For platform_device struct
#include <linux/proc_fs.h>  // So we can set up entries in /proc

#include "nvdebug_linux.h"
#include "stubs.h"

// Enable to intercept and log GPU interrupts. Historically used to benchmark
// interrupt latency.
#define INTERRUPT_DEBUG 0

// MIT is GPL-compatible. We need to be GPL-compatible for symbols like
// platform_bus_type or bus_find_device_by_name...
MODULE_LICENSE("Dual MIT/GPL");
MODULE_AUTHOR("Joshua Bakita");
MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");

// runlist_procfs.c
extern struct file_operations runlist_file_ops;
extern struct file_operations preempt_tsg_file_ops;
extern struct file_operations disable_channel_file_ops;
extern struct file_operations enable_channel_file_ops;
extern struct file_operations resubmit_runlist_file_ops;
extern struct file_operations switch_to_tsg_file_ops;
// device_info_procfs.c
extern struct file_operations device_info_file_ops;
extern struct file_operations nvdebug_read_reg32_file_ops;
extern struct file_operations nvdebug_read_reg_range_file_ops;
extern struct file_operations local_memory_file_ops;
// copy_topology_procfs.c
extern struct file_operations copy_topology_file_ops;

struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
unsigned int g_nvdebug_devices = 0;
// Bus types are global symbols in the kernel
extern struct bus_type platform_bus_type;

// Starting in Kernel 5.6, proc_ops is required instead of file_operations.
// As file_operations is larger than proc_ops, we can overwrite the memory
// backing the file_operations struct to follow the proc_ops layout, and then
// cast on newer kernels.
// We use the last byte of the file_operations struct to flag that the memory
// layout has been rearranged.
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0)
// Convert a file_operations struct into a proc_ops struct *in place* (see the
// comment block above for why). The returned pointer aliases `ops`.
// NOTE(review): this deliberately writes through a const-qualified pointer;
// it is only safe because the backing objects are the mutable extern
// file_operations globals declared above (defined without const in the
// *_procfs.c files) -- confirm none are ever defined `const`.
// NOTE(review): also relies on sizeof(struct file_operations) >
// sizeof(struct proc_ops) and on the final byte of the original struct being
// zero (an unset trailing field) -- re-verify on new kernel releases.
const struct proc_ops* compat_ops(const struct file_operations* ops) {
	struct proc_ops new_ops = {};
	// Don't re-layout if it's already been done
	// (the last byte of the struct is set to 1 below on first conversion)
	if (*((uint8_t*)(ops + 1) - 1))
		return (struct proc_ops*)ops;
	// Copy over only the handlers this module actually installs
	new_ops.proc_open = ops->open;
	new_ops.proc_read = ops->read;
	new_ops.proc_write = ops->write;
	new_ops.proc_lseek = ops->llseek;
	new_ops.proc_release = ops->release;
	// Overwrite the file_operations memory with the proc_ops layout
	memcpy((void*)ops, &new_ops, sizeof(new_ops));
	// Flag re-layout as complete in last byte of structure
	*((uint8_t*)(ops + 1) - 1) = 1;
	return (struct proc_ops*)ops;
}
#else
// Pre-5.6 kernels take a file_operations directly; nothing to convert.
const struct file_operations* compat_ops(const struct file_operations* ops) {
	return ops;
}
#endif

#if INTERRUPT_DEBUG
// Shared-IRQ tap used purely for logging interrupt arrivals (e.g. to
// benchmark interrupt latency). Never claims the interrupt.
irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) {
	printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num);
	// Returning IRQ_NONE passes the interrupt on to the real (sharing)
	// handler; we never service the hardware ourselves.
	return IRQ_NONE;
}
#endif // INTERRUPT_DEBUG

// Find any and all NVIDIA GPUs in the system
// Note: This function fails if any of them are in a bad state
// Returns the number of devices found (> 0), or -errno if none are found or
// any device is in a bad state. On error, all BAR mappings made so far are
// unwound (fixes the old FIXME: pci_iomap() used to be leaked on failure).
int probe_and_cache_devices(void) {
	// platform bus (SoC) iterators
	struct device *dev = NULL;
	struct device *temp_dev;
	// PCI search iterator and search query
	struct pci_dev *pcid = NULL;
	int err;
	// This query pattern is mirrored off nouveau
	struct pci_device_id query = {
		.vendor = NV_PCI_VENDOR,  // Match NVIDIA devices
		.device = PCI_ANY_ID,
		.subvendor = PCI_ANY_ID,
		.subdevice = PCI_ANY_ID,
		.class_mask = 0xff << 16,
		.class = PCI_BASE_CLASS_DISPLAY << 16,  // Match display devs
	};
	int i = 0;
	// Search the platform bus for the first device that matches our name
	// Search for embedded GPU on Jetson (generic name starting around L4T 36.3)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gpu")))
		dev = temp_dev;
	// Search for GA10B (Jetson Orin)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.ga10b")))
		dev = temp_dev;
	// Search for GV11B (Jetson Xavier)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b")))
		dev = temp_dev;
	// Search for GP10B (Jetson TX2)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gp10b")))
		dev = temp_dev;
	// Search for GM20B (Jetson TX1)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "57000000.gpu")))
		dev = temp_dev;
	// TODO: Support other platform bus devices (gk20a - TK1)
	if (dev) {
		mc_boot_0_t ids;
		struct platform_device *platd = container_of(dev, struct platform_device, dev);
		struct resource *regs = platform_get_resource(platd, IORESOURCE_MEM, 0);
		g_nvdebug_state[i].g = get_gk20a(dev);
		if (!regs)
			return -EADDRNOTAVAIL;
		// Map BAR0 (GPU control registers) of the integrated GPU
		g_nvdebug_state[i].regs = ioremap(regs->start, resource_size(regs));
		if (!g_nvdebug_state[i].regs) {
			printk(KERN_ERR "[nvdebug] Unable to map BAR0 on the integrated GPU\n");
			return -EADDRNOTAVAIL;
		}
		// The Jetson TX1, TX2, Xavier, and Orin do not have a BAR2 (but do have
		// BAR1). On the TX2+, all their platform resources are:
		//   [nvdebug] Region 0: Memory at 17000000 [size=16777216]
		//   [nvdebug] Region 1: Memory at 18000000 [size=16777216]
		//   [nvdebug] Region 2: Memory at 3b41000 [size=4096]
		// The TX1 has the same regions, but at different base addresses.
		g_nvdebug_state[i].bar3 = NULL;
		g_nvdebug_state[i].pcid = NULL;
		g_nvdebug_state[i].platd = platd;
		g_nvdebug_state[i].dev = dev;
		// Don't check Chip ID until everything else is initalized
		ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
		if (ids.raw == -1) {
			printk(KERN_ERR "[nvdebug] Unable to read config from Master Controller on the integrated GPU\n");
			err = -EADDRNOTAVAIL;
			goto out_unwind;
		}
		g_nvdebug_state[i].chip_id = ids.chip_id;
		printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on platform bus and initialized.",
		       ids.chip_id, ARCH2NAME(ids.architecture));
		i++;
	}
	// Search the PCI bus and iterate through all matches
	while ((pcid = pci_get_dev_by_id(&query, pcid)) && i < NVDEBUG_MAX_DEVICES) {
		mc_boot_0_t ids;
		g_nvdebug_state[i].g = NULL;
		// Record identity before mapping so the error-unwind path below can
		// tell PCI devices (pci_iounmap) from platform ones (iounmap)
		g_nvdebug_state[i].pcid = pcid;
		g_nvdebug_state[i].platd = NULL;
		g_nvdebug_state[i].dev = &pcid->dev;
		g_nvdebug_state[i].bar3 = NULL;
		// Map BAR0 (GPU control registers)
		g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0);
		if (!g_nvdebug_state[i].regs) {
			pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n");
			err = -EADDRNOTAVAIL;
			goto out_unwind;
		}
		// Map BAR3 (CPU-accessible mappings of GPU DRAM)
		g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, 0);
		// XXX: Try mapping only the lower half of BAR3 on fail
		// (vesafb may map the top half for display)
		if (!g_nvdebug_state[i].bar3)
			g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2);
		// Don't check Chip ID until everything else is initalized
		ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
		if (ids.raw == -1) {
			pci_err(pcid, "[nvdebug] Unable to read config from Master Controller on this GPU\n");
			err = -EADDRNOTAVAIL;
			goto out_unwind;
		}
		g_nvdebug_state[i].chip_id = ids.chip_id;
		printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.",
		       ids.chip_id, ARCH2NAME(ids.architecture));
#if INTERRUPT_DEBUG
		if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) {
			printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n");
		}
#endif // INTERRUPT_DEBUG
		i++;
	}
	// Return the number of devices found
	if (i > 0)
		return i;
	return -ENODEV;
out_unwind:
	// Unmap the BARs of every device touched so far, including the
	// partially-initialized one at index i (its regs/bar3 fields are NULL
	// for anything not yet mapped).
	do {
		if (g_nvdebug_state[i].pcid) {
			if (g_nvdebug_state[i].regs)
				pci_iounmap(g_nvdebug_state[i].pcid, g_nvdebug_state[i].regs);
			if (g_nvdebug_state[i].bar3)
				pci_iounmap(g_nvdebug_state[i].pcid, g_nvdebug_state[i].bar3);
		} else if (g_nvdebug_state[i].regs) {
			iounmap(g_nvdebug_state[i].regs);
		}
	} while (i--);
	// Drop the reference pci_get_dev_by_id() holds on the device we were
	// partway through probing (NULL, and thus a no-op, on platform failures)
	pci_dev_put(pcid);
	return err;
}

// Support: Fermi, Maxwell, Pascal, Volta, Turing
// Scan the PTOP device info table and return the highest runlist ID seen,
// or -EIO if any register read fails.
int get_last_runlist_id_gk104(struct nvdebug_state *g) {
	ptop_device_info_gk104_t entry;
	int idx;
	int highest = 0; // Always at least one runlist
	// Runlists are always numbered sequentially, so the largest valid
	// runlist ID found in the device info registers is the last one.
	for (idx = 0; idx < NV_PTOP_DEVICE_INFO__SIZE_1_GK104; idx++) {
		entry.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GK104(idx));
		if (entry.raw == -1)
			return -EIO;
		// Only ENUM-type rows with a valid runlist field carry runlist IDs
		if (entry.info_type != INFO_TYPE_ENUM)
			continue;
		if (!entry.runlist_is_valid)
			continue;
		if (entry.runlist_enum > highest)
			highest = entry.runlist_enum;
	}
	return highest;
}

// Support: Ampere, Hopper, Ada (and newer likely)
// Identical structure to get_runlist_ram() in runlist.c. See comments there.
// Returns the last (highest) runlist ID, or -EIO on a register read failure.
int get_last_runlist_id_ga100(struct nvdebug_state *g) {
	ptop_device_info_ga100_t ptop_entry;
	int i, runlist_count = 0;
	int ptop_size = NV_PTOP_DEVICE_INFO__SIZE_1_GA100(g);
	// On GA100+ one logical device-info entry may span several table rows;
	// track which row (subrow) of the current entry we are on.
	int ptop_entry_subrow = 0;
	for (i = 0; i < ptop_size; i++) {
		if ((ptop_entry.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_GA100(i))) == -1)
			return -EIO;
		// Skip empty rows (all-zero rows do not advance the subrow state)
		if (!ptop_entry.raw)
			continue;
		// Count entries whose third row reports runlist engine 0; each such
		// entry corresponds to one runlist
		if (ptop_entry_subrow == 2 && ptop_entry.rleng_id == 0)
			runlist_count++;
		// Advance within a multi-row entry, or reset for the next entry
		if (ptop_entry.has_next_entry)
			ptop_entry_subrow += 1;
		else
			ptop_entry_subrow = 0;
	}
	// Runlist IDs start at zero, so the last ID is the count minus one
	return runlist_count - 1;
}

// Return the maximum runlist ID. For a two-runlist GPU, this would return 1.
// (May also return a negative error code from the chip-specific helpers.)
int get_last_runlist_id(int device_id) {
	struct nvdebug_state* g = &g_nvdebug_state[device_id];
	// Ampere reworked the PTOP device info layout; dispatch on chip family
	return (g->chip_id >= NV_CHIP_ID_AMPERE) ? get_last_runlist_id_ga100(g)
	                                         : get_last_runlist_id_gk104(g);
}

// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
// Support: Maxwell+
int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) {
	struct nvdebug_state* g = &g_nvdebug_state[device_id];
	char file_name[20];
	int i;
	struct proc_dir_entry *gpc_tpc_mask_entry;
	// Get maximum number of enabled GPCs for this chip
	uint32_t max_gpcs = nvdebug_readl(g, NV_PTOP_SCAL_NUM_GPCS);
	// Get a bitmask of which GPCs are disabled
	uint32_t gpcs_mask;
	if (g->chip_id < NV_CHIP_ID_AMPERE)
		gpcs_mask = nvdebug_readl(g, NV_FUSE_GPC_GM107);
	else
		gpcs_mask = nvdebug_readl(g, NV_FUSE_GPC_GA100);
	// Verify the reads succeeded
	if (max_gpcs == -1 || gpcs_mask == -1)
		return -EIO;
	// For each enabled GPC, expose a mask of disabled TPCs
	for (i = 0; i < max_gpcs; i++) {
		// Do nothing if GPC is disabled
		if ((1 << i) & gpcs_mask)
			continue;
		// If GPC is enabled, create an entry to read disabled TPCs mask
		snprintf(file_name, 20, "gpc%d_tpc_mask", i);
		if (g->chip_id < NV_CHIP_ID_AMPERE)
			gpc_tpc_mask_entry = proc_create_data(
				file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC_GM107(i));
		else
			gpc_tpc_mask_entry = proc_create_data(
				file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC_GA100(i));
		if (!gpc_tpc_mask_entry)
			return -ENOMEM;
	}
	return 0;
}

// Module entry point: probe GPUs, then build the per-GPU /proc/gpu# tree.
// Returns 0 on success or a negative error code (cleaning up any ProcFS
// directories already created).
int __init nvdebug_init(void) {
	struct proc_dir_entry *dir;
	int err, res;
	int last_runlist_id;
	// Check that an NVIDIA GPU is present and initialize g_nvdebug_state
	if ((res = probe_and_cache_devices()) < 0)
		return res;
	g_nvdebug_devices = res;
	// Create seperate ProcFS directories for each gpu
	while (res--) {
		uintptr_t last_runlist;
		char device_id_str[7];
		// Create a wider copy of the GPU ID to allow us to abuse the *data
		// field of proc_dir_entry to store the GPU ID.
		uintptr_t device_id = res;
		// Create directory /proc/gpu# where # is the GPU number
		// As ProcFS entry creation only fails if out of memory, we auto-skip
		// to handling that on any error in creating ProcFS files.
		snprintf(device_id_str, 7, "gpu%ld", device_id);
		if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id)))
			goto out_nomem;
		// Create files in the `/proc/gpu#/runlist#/` directory
		// The read handling code looks at the `pde_data` associated with the parent
		// directory to determine what the runlist ID is.
		// BUGFIX: the return value must land in a *signed* variable. The old
		// code assigned it straight to the unsigned `last_runlist`, so the
		// `< 0` error check could never fire and a negative error code
		// became an enormous loop bound. Also route failure through out_err
		// so the ProcFS directories created so far are removed.
		if ((last_runlist_id = get_last_runlist_id(device_id)) < 0) {
			err = last_runlist_id;
			goto out_err;
		}
		last_runlist = last_runlist_id;
		do {
			char runlist_name[12];
			struct proc_dir_entry *rl_dir;
			// Create `/proc/gpu#/runlist#` directory
			snprintf(runlist_name, 12, "runlist%lu", last_runlist);
			if (!(rl_dir = proc_mkdir_data(runlist_name, 0555, dir, (void*)device_id)))
				goto out_nomem;
			// Create one file for each runlist on Ampere+, or one file for each GPU on older
			if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_AMPERE || last_runlist == 0) {
				struct proc_dir_entry *chram_scope;
				// preempt_tsg, enable_channel, and disable_channel refer to a GPU-global channel
				// RAM on pre-Ampere GPUs
				if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_AMPERE)
					chram_scope = rl_dir;
				else
					chram_scope = dir;
				// Create file `/proc/gpu#/runlist#/preempt_tsg`, world writable
				// On Turing and older, `/proc/gpu#/preempt_tsg`
				if (!proc_create_data(
						"preempt_tsg", 0222, chram_scope, compat_ops(&preempt_tsg_file_ops),
						(void*)last_runlist))
					goto out_nomem;
				// Create file `/proc/gpu#/runlist#/disable_channel`, world writable
				// On Turing and older, `/proc/gpu#/disable_channel`
				if (!proc_create_data(
						"disable_channel", 0222, chram_scope, compat_ops(&disable_channel_file_ops),
						(void*)last_runlist))
					goto out_nomem;
				// Create file `/proc/gpu#/runlist#/enable_channel`, world writable
				// On Turing and older, `/proc/gpu#/enable_channel`
				if (!proc_create_data(
						"enable_channel", 0222, chram_scope, compat_ops(&enable_channel_file_ops),
						(void*)last_runlist))
					goto out_nomem;
			}
			// Create file `/proc/gpu#/runlist#/runlist`, world readable
			if (!proc_create_data(
					"runlist", 0444, rl_dir, compat_ops(&runlist_file_ops),
					(void*)last_runlist))
				goto out_nomem;
			// Create file `/proc/gpu#/runlist#/switch_to_tsg`, world writable
			if (!proc_create_data(
					"switch_to_tsg", 0222, rl_dir, compat_ops(&switch_to_tsg_file_ops),
					(void*)last_runlist))
				goto out_nomem;
		} while (last_runlist-- > 0);
		/* On the TU104, the context scheduler (contained in the Host, aka
		 * PFIFO, unit) has been observed to sometimes to fail to schedule TSGs
		 * containing re-enabled channels. Resubmitting the runlist
		 * configuration appears to remediate this condition, and so this API
		 * is exposed to help reset GPU scheduling as necessary.
		 */
		// Create file `/proc/gpu#/resubmit_runlist`, world writable
		if (!proc_create_data(
				"resubmit_runlist", 0222, dir, compat_ops(&resubmit_runlist_file_ops),
				(void*)device_id))
			goto out_nomem;
		// Create file `/proc/gpu#/device_info`, world readable
		if (!proc_create_data(
				"device_info", 0444, dir, compat_ops(&device_info_file_ops),
				(void*)device_id))
			goto out_nomem;
		// Create file `/proc/gpu#/num_gpcs`, world readable
		if (!proc_create_data(
				"num_gpcs", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)NV_PTOP_SCAL_NUM_GPCS))
			goto out_nomem;
		// Create file `/proc/gpu#/num_tpc_per_gpc`, world readable
		if (!proc_create_data(
				"num_tpc_per_gpc", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)NV_PTOP_SCAL_NUM_TPC_PER_GPC))
			goto out_nomem;
		// Create file `/proc/gpu#/num_ces`, world readable
		if (!proc_create_data(
				"num_ces", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)NV_PTOP_SCAL_NUM_CES))
			goto out_nomem;
		// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable (Maxwell+)
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL)
			if ((err = create_tpc_mask_files(device_id, dir)))
				goto out_err;
		// Create file `/proc/gpu#/gpc_mask`, world readable (Maxwell+)
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_AMPERE) {
			if (!proc_create_data(
					"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
					(void*)NV_FUSE_GPC_GA100))
				goto out_nomem;
		} else if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL) {
			if (!proc_create_data(
					"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
					(void*)NV_FUSE_GPC_GM107))
				goto out_nomem;
		}
		// Create file `/proc/gpu#/local_memory`, world readable (Pascal+)
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
			if (!proc_create_data(
					"local_memory", 0444, dir, compat_ops(&local_memory_file_ops),
					(void*)0x00100ce0))
				goto out_nomem;
		}
		// Create files exposing LCE and PCE configuration (Pascal+)
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
			// Create file `/proc/gpu#/copy_topology`, world readable
			if (!proc_create_data(
					"copy_topology", 0444, dir, compat_ops(&copy_topology_file_ops),
					(void*)0))
				goto out_nomem;
			// Create file `/proc/gpu#/pce_map`, world readable
			if (!proc_create_data(
					"pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
					(void*)NV_CE_PCE_MAP))
				goto out_nomem;
		}
	}
	// (See Makefile if you want to know the origin of GIT_HASH.)
	printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
	return 0;
out_nomem:
	err = -ENOMEM;
out_err:
	// Make sure to clear all ProcFS directories on error. The current `res`
	// and all higher indices (created in earlier iterations of the
	// countdown loop above) may have directories.
	while (res < g_nvdebug_devices) {
		char device_id_str[7];
		snprintf(device_id_str, 7, "gpu%d", res);
		remove_proc_subtree(device_id_str, NULL);
		res++;
	}
	return err;
}

// Module exit: tear down each device's ProcFS tree and BAR mappings.
static void __exit nvdebug_exit(void) {
	struct nvdebug_state *g;
	// Deinitialize each device
	while (g_nvdebug_devices--) {
		// Remove procfs directory
		char device_id[7];
		snprintf(device_id, 7, "gpu%d", g_nvdebug_devices);
		remove_proc_subtree(device_id, NULL);
		g = &g_nvdebug_state[g_nvdebug_devices];
		// Free BAR mappings for PCIe devices (`g` is the address of an
		// array element, so the old `g &&` NULL check was redundant)
		if (g->pcid) {
			if (g->regs)
				pci_iounmap(g->pcid, g->regs);
			if (g->bar2)
				pci_iounmap(g->pcid, g->bar2);
			// BUGFIX: probe_and_cache_devices() maps BAR3 (not BAR2);
			// unmap it too so the mapping isn't leaked on unload.
			// (bar2 is kept as well in case it is mapped elsewhere --
			// it is never set in this file; TODO confirm.)
			if (g->bar3)
				pci_iounmap(g->pcid, g->bar3);
#if INTERRUPT_DEBUG
			free_irq(g->pcid->irq, g->pcid);
#endif // INTERRUPT_DEBUG
		} else {
			// Integrated (platform bus) GPU: BAR0 was ioremap()ed directly
			if (g->regs)
				iounmap(g->regs);
		}
		printk(KERN_INFO "[nvdebug] Chip ID %x deinitialized.", g->chip_id);
	}
	printk(KERN_INFO "[nvdebug] Module exit complete.\n");
}

module_init(nvdebug_init);
module_exit(nvdebug_exit);