/* Copyright 2021 Joshua Bakita
 * SPDX-License-Identifier: MIT
 */

#include <linux/device.h>  // For struct device, bus_find_device*(), struct bus_type
#include <linux/interrupt.h> // For hooking the nvidia driver interrupts
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>  // For PCI device scanning
#include <linux/proc_fs.h>  // So we can set up entries in /proc

#include "nvdebug.h"
#include "stubs.h"

// Enable to intercept and log GPU interrupts
#define INTERRUPT_DEBUG 0

// MIT is GPL-compatible. We need to be GPL-compatible for symbols like
// platform_bus_type or bus_find_device_by_name...
MODULE_LICENSE("Dual MIT/GPL");
MODULE_AUTHOR("Joshua Bakita");
MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");

extern struct file_operations runlist_file_ops;
extern struct file_operations preempt_tsg_file_ops;
extern struct file_operations disable_channel_file_ops;
extern struct file_operations enable_channel_file_ops;
extern struct file_operations switch_to_tsg_file_ops;
extern struct file_operations device_info_file_ops;
extern struct file_operations nvdebug_read_reg32_file_ops;
extern struct file_operations nvdebug_read_reg_range_file_ops;

// Bus types are global symbols in the kernel
extern struct bus_type platform_bus_type;
struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
unsigned int g_nvdebug_devices = 0;
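// After nvdebug_init() has run, entries [0, g_nvdebug_devices) of
// g_nvdebug_state hold per-GPU state (register mappings, chip ID, and, for
// PCIe GPUs, the pci_dev handle).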

// Starting in Kernel 5.6, proc_ops is required instead of file_operations
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0)
// On 5.6+, rewrite the file_operations struct in place so that its storage
// holds the equivalent proc_ops layout, then return it reinterpreted as such
const struct proc_ops* compat_ops(const struct file_operations* ops) {
	struct proc_ops new_ops = {};
	new_ops.proc_open = ops->open;
	new_ops.proc_read = ops->read;
	new_ops.proc_write = ops->write;
	new_ops.proc_lseek = ops->llseek;
	new_ops.proc_release = ops->release;
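	// The in-place overwrite below is destructive; it relies on struct proc_ops
	// being no larger than struct file_operations (true on the kernels this
	// module targets), so the converted layout fits in the existing storage.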
	memcpy((void*)ops, &new_ops, sizeof(new_ops));
	return (struct proc_ops*)ops;
}
#else
const struct file_operations* compat_ops(const struct file_operations* ops) {
	return ops;
}
#endif

#if INTERRUPT_DEBUG
irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) {
	printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num);
	return IRQ_NONE; // We don't actually handle any interrupts. Pass them on.
}
#endif // INTERRUPT_DEBUG

// Find any and all NVIDIA GPUs in the system
// Note: This function fails if any of them are in a bad state
int probe_and_cache_device(void) {
	// platform bus (SoC) iterators
	struct device *dev = NULL;
	struct device *temp_dev;
	// PCI search iterator and search query
	struct pci_dev *pcid = NULL;
	// This query pattern is mirrored from nouveau
	struct pci_device_id query = {
		.vendor = NV_PCI_VENDOR,  // Match NVIDIA devices
		.device = PCI_ANY_ID,
		.subvendor = PCI_ANY_ID,
		.subdevice = PCI_ANY_ID,
		.class_mask = 0xff << 16,
		.class = PCI_BASE_CLASS_DISPLAY << 16,  // Match display devs
	};
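	// The 24-bit PCI class code is matched only on its base-class byte
	// (bits 23:16), so any NVIDIA device in the display class (VGA, 3D
	// controller, etc.) qualifies.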
	int i = 0;
	// Search the platform bus for the first device that matches our name
	// Search for GA10B (Jetson Orin)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.ga10b")))
		dev = temp_dev;
	// Search for GV11B (Jetson Xavier)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b")))
		dev = temp_dev;
	// Search for GP10B (Jetson TX2)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gp10b")))
		dev = temp_dev;
	// TODO: Support other platform bus devices (gk20a - TK1, gm20b - TX1)
	if (dev) {
		mc_boot_0_t ids;
		g_nvdebug_state[i].g = get_gk20a(dev);
		g_nvdebug_state[i].regs = gk20a_regs(g_nvdebug_state[i].g);
		if (!g_nvdebug_state[i].regs)
			return -EADDRNOTAVAIL;
		ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
		if (ids.raw == -1)
			return -EADDRNOTAVAIL;
		g_nvdebug_state[i].chip_id = ids.chip_id;
		g_nvdebug_state[i].pcid = NULL;
		g_nvdebug_state[i].bar3 = NULL;
		printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on platform bus and initialized.",
		       ids.chip_id, ARCH2NAME(ids.architecture));
		i++;
	}
	// Search the PCI bus and iterate through all matches
	// FIXME: State rollback
	while ((pcid = pci_get_dev_by_id(&query, pcid)) && i < NVDEBUG_MAX_DEVICES) {
		mc_boot_0_t ids;
		g_nvdebug_state[i].g = NULL;
		// Map BAR0 (GPU control registers)
		g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0);
		if (!g_nvdebug_state[i].regs) {
			pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n");
			return -EADDRNOTAVAIL;
		}
		// Map BAR3 (CPU-accessible mappings of GPU DRAM)
		g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, 0);
		// Try mapping only the lower half of BAR3 on fail
		// (vesafb may map the top half for display)
		if (!g_nvdebug_state[i].bar3)
			g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2);
		g_nvdebug_state[i].pcid = pcid;
		ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
		if (ids.raw == -1) {
			pci_err(pcid, "[nvdebug] Unable to read config from Master Controller on this GPU\n");
			return -EADDRNOTAVAIL;
		}
		g_nvdebug_state[i].chip_id = ids.chip_id;
		printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.",
		       ids.chip_id, ARCH2NAME(ids.architecture));
#if INTERRUPT_DEBUG
		if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) {
			printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n");
		}
#endif // INTERRUPT_DEBUG
		i++;
	}
	// Return the number of devices we found
	if (i > 0)
		return i;
	return -ENODEV;
}

// Create files `/proc/gpu#/runlist#`, world readable
int create_runlist_files(int device_id, struct proc_dir_entry *dir) {
	ptop_device_info_gk104_t info;
	struct proc_dir_entry *rl_entry;
	int i, rl_id;
	char runlist_name[12];
	int max_rl_id = 0; // Always at least one runlist
	// Figure out how many runlists there are by checking the device info
	// registers. Runlists are always numbered sequentially, so we just have
	// to find the highest-valued one and add 1 to get the number of runlists.
	for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_GK104; i++) {
		info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO_GK104(i));
		if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid)
			continue;
		if (info.runlist_enum > max_rl_id)
			max_rl_id = info.runlist_enum;
	}
	// Create files to read each runlist. The read handling code looks at the
	// PDE_DATA associated with the file to determine what the runlist ID is.
	for (rl_id = 0; rl_id <= max_rl_id; rl_id++) {
		snprintf(runlist_name, 12, "runlist%d", rl_id);
		rl_entry = proc_create_data(
			runlist_name, 0444, dir, compat_ops(&runlist_file_ops),
			(void*)(uintptr_t)rl_id);
		if (!rl_entry)
			return -ENOMEM;
	}
	return 0;
}
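
// Example usage from userspace: `cat /proc/gpu0/runlist0` reads runlist 0 of
// GPU 0; the read handler recovers the runlist ID from the PDE_DATA set above.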

// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
// TODO: Don't run this on unsupported GPUs
int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) {
	char file_name[20];
	int i;
	struct proc_dir_entry *gpc_tpc_mask_entry;
	// Get a bitmask of which GPCs are disabled
	uint32_t gpcs_mask = nvdebug_readl(&g_nvdebug_state[device_id], NV_FUSE_GPC);
	// Get maximum number of enabled GPCs for this chip
	uint32_t max_gpcs = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_SCAL_NUM_GPCS);
	// For each enabled GPC, expose a mask of disabled TPCs
	for (i = 0; i < max_gpcs; i++) {
		// Do nothing if GPC is disabled
		if ((1 << i) & gpcs_mask)
			continue;
		// If GPC is enabled, create an entry to read disabled TPCs mask
		snprintf(file_name, 20, "gpc%d_tpc_mask", i);
		gpc_tpc_mask_entry = proc_create_data(
			file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC(i));
		if (!gpc_tpc_mask_entry)
			return -ENOMEM;
	}
	return 0;
}
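
// Example usage: `cat /proc/gpu0/gpc0_tpc_mask` prints the fuse register
// holding the mask of disabled TPCs for GPC 0 on GPU 0 (entries are only
// created for GPCs that are themselves enabled).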

int __init nvdebug_init(void) {
	struct proc_dir_entry *dir, *preempt_entry, *disable_channel_entry,
			      *enable_channel_entry, *switch_to_tsg_entry, *device_info_entry,
			      *num_gpcs_entry, *lce_for_pce_entry, *grce_for_pce_entry;
	int rl_create_err, tpc_masks_create_err;
	// Check that an NVIDIA GPU is present and initialize g_nvdebug_state
	int res = probe_and_cache_device();
	if (res < 0)
		return res;
	g_nvdebug_devices = res;
	// Create a separate ProcFS directory for each GPU
	while (res--) {
		char device_id_str[7];
		uintptr_t device_id = res;  // uintptr_t, as we abuse the *data field on proc_dir_entry to store the GPU ID
		// Create directory /proc/gpu# where # is the GPU number
		snprintf(device_id_str, 7, "gpu%ld", device_id);
		if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id)))
			goto out_nomem;
		// Create files `/proc/gpu#/runlist#`, world readable (pre-Ampere chips only)
		rl_create_err = 0;
		if (g_nvdebug_state[device_id].chip_id < NV_CHIP_ID_AMPERE)
			rl_create_err = create_runlist_files(device_id, dir);
		// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
		tpc_masks_create_err = create_tpc_mask_files(device_id, dir);
		// Create file `/proc/gpu#/preempt_tsg`, world writable
		preempt_entry = proc_create_data(
			"preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/disable_channel`, world writable
		disable_channel_entry = proc_create_data(
			"disable_channel", 0222, dir, compat_ops(&disable_channel_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/enable_channel`, world writable
		enable_channel_entry = proc_create_data(
			"enable_channel", 0222, dir, compat_ops(&enable_channel_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/switch_to_tsg`, world writable
		switch_to_tsg_entry = proc_create_data(
			"switch_to_tsg", 0222, dir, compat_ops(&switch_to_tsg_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/device_info`, world readable
		device_info_entry = proc_create_data(
			"device_info", 0444, dir, compat_ops(&device_info_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/num_gpcs`, world readable
		num_gpcs_entry = proc_create_data(
			"num_gpcs", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_PTOP_SCAL_NUM_GPCS);
		// Create file `/proc/gpu#/num_tpc_per_gpc`, world readable
		num_gpcs_entry = proc_create_data(
			"num_tpc_per_gpc", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_PTOP_SCAL_NUM_TPC_PER_GPC);
		// Create file `/proc/gpu#/num_ces`, world readable
		num_gpcs_entry = proc_create_data(
			"num_ces", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_PTOP_SCAL_NUM_CES);
		// Create file `/proc/gpu#/gpc_mask`, world readable
		num_gpcs_entry = proc_create_data(
			"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_FUSE_GPC);
		// In both nouveau and nvgpu, the PCE_MAP register is available on Pascal+
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
			// Used for reading a subset of a register on Pascal
			union reg_range pascal_reg;
			// Create a pce mask for iteration
			u32 ce_pce_map = nvdebug_readl(&g_nvdebug_state[device_id], NV_CE_PCE_MAP);
			char file_name[21];
			int pce_id = 0;
			int pce_num = 0;
			int i;
			for (pce_id = 0; pce_id < MAP_SIZE; pce_id++) {
				// If this PCE is enabled, create its file and advance pce_num; otherwise, skip it
				if ((1 << pce_id) & ce_pce_map) {
					snprintf(file_name, 20, "lce_for_pce%d", pce_num);
					// Depending on GPU architecture, fetch data for the LCE of particular PCE
					switch (g_nvdebug_state[res].chip_id & 0xff0) {
						case NV_CHIP_ID_PASCAL:
							// On Pascal, two PCE configurations are packed per-byte.
							// Work around this by leveraging that we only run on 64-bit
							// platforms (can assume that a void* is 64-bits), and that
							// GPU register offsets are only 32-bits. Use the other 32
							// bits to store which bits to print.
							pascal_reg.offset = NV_LCE_FOR_PCE_GP100(0);
							pascal_reg.start_bit = pce_id * 4;
							pascal_reg.stop_bit = pce_id * 4 + 4;
							lce_for_pce_entry = proc_create_data(
											file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops),
											(void*)pascal_reg.raw);
							break;
						case NV_CHIP_ID_VOLTA:
						case NV_CHIP_ID_VOLTA_INTEGRATED:
						case NV_CHIP_ID_TURING:
							lce_for_pce_entry = proc_create_data(
											file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
											(void*)(uintptr_t)NV_LCE_FOR_PCE_GV100(pce_id));
							break;
						case NV_CHIP_ID_AMPERE:
						case NV_CHIP_ID_HOPPER:
						case NV_CHIP_ID_ADA:
							lce_for_pce_entry = proc_create_data(
											file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
											(void*)(uintptr_t)NV_LCE_FOR_PCE_GA100(pce_id));
							break;
					}
					if (!lce_for_pce_entry)
						return -ENOMEM;
					pce_num++;
				}
			}
			// We assume 2 GRCEs (reminder: GRCE0 and 1 are just LCE0 and 1)
			for (i = 0; i < 2; i++) {
				union reg_range grce_reg = {0};
				snprintf(file_name, 21, "shared_lce_for_grce%d", i);
				// The GRCE_FOR_CE offset used here is documented for Pascal through Turing.
				// On Pascal, each entry is 3 bits wide, packed every 8 bits.
				// On Volta through Turing, the register starts at the same offset, but each
				// entry is the lower 4 bits of every 32 bits.
				// On Ampere+, it starts at 0x001041c0, with the same layout as Volta-Turing.
				switch (g_nvdebug_state[res].chip_id & 0xff0) {
					case NV_CHIP_ID_PASCAL:
						grce_reg.offset = NV_GRCE_FOR_CE_GP100(0);
						grce_reg.start_bit = i * 8;
						grce_reg.stop_bit = grce_reg.start_bit + 3;
						break;
					case NV_CHIP_ID_VOLTA:
					case NV_CHIP_ID_VOLTA_INTEGRATED:
					case NV_CHIP_ID_TURING:
						grce_reg.offset = NV_GRCE_FOR_CE_GP100(i);
						grce_reg.start_bit = 0;
						grce_reg.stop_bit = grce_reg.start_bit + 4;
						break;
					case NV_CHIP_ID_AMPERE:
					case NV_CHIP_ID_HOPPER:
					case NV_CHIP_ID_ADA:
						grce_reg.offset = NV_GRCE_FOR_CE_GA100(i);
						grce_reg.start_bit = 0;
						grce_reg.stop_bit = grce_reg.start_bit + 4;
						break;
				}
				grce_for_pce_entry = proc_create_data(
								file_name, 0444, dir, compat_ops(&nvdebug_read_reg_range_file_ops),
								(void*)grce_reg.raw);
				if (!grce_for_pce_entry)
					return -ENOMEM;
			}

			// TODO: Redo to num_pces
			// Create file `/proc/gpu#/pce_map`, world readable
			num_gpcs_entry = proc_create_data(
				"pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)NV_CE_PCE_MAP);
		}
		// ProcFS entry creation only fails if out of memory
		if (rl_create_err || tpc_masks_create_err || !preempt_entry ||
		    !disable_channel_entry || !enable_channel_entry ||
		    !switch_to_tsg_entry || !device_info_entry || !num_gpcs_entry)
			goto out_nomem;
	}
	// (See Makefile if you want to know the origin of GIT_HASH.)
	printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
	return 0;
out_nomem:
	// Make sure to clear all ProcFS directories on error
	while (res < g_nvdebug_devices) {
		char device_id_str[7];
		snprintf(device_id_str, 7, "gpu%d", res);
		remove_proc_subtree(device_id_str, NULL);
		res++;
	}
	return -ENOMEM;
}
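
// Rough usage sketch (assuming the module is built and loaded as nvdebug.ko):
// after `insmod nvdebug.ko`, each detected GPU gets a /proc/gpu# directory
// with world-readable info files (device_info, num_gpcs, runlist#, ...) and
// world-writable control files (preempt_tsg, enable_channel, disable_channel,
// switch_to_tsg) that act on the ID written to them (e.g., something like
// `echo <tsg id> > /proc/gpu0/preempt_tsg`; the exact input format is defined
// by the corresponding write handlers).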

static void __exit nvdebug_exit(void) {
	struct nvdebug_state *g;
	// Deinitialize each device
	while (g_nvdebug_devices--) {
		// Remove procfs directory
		char device_id[7];
		snprintf(device_id, 7, "gpu%d", g_nvdebug_devices);
		remove_proc_subtree(device_id, NULL);
		// Free BAR mappings for PCIe devices
		g = &g_nvdebug_state[g_nvdebug_devices];
		if (g && g->pcid) {
			if (g && g->regs)
				pci_iounmap(g->pcid, g->regs);
			if (g && g->bar2)
				pci_iounmap(g->pcid, g->bar2);
#if INTERRUPT_DEBUG
			free_irq(g->pcid->irq, g->pcid);
#endif // INTERRUPT_DEBUG
		}
		printk(KERN_INFO "[nvdebug] Chip ID %x deinitialized.", g->chip_id);
	}
	printk(KERN_INFO "[nvdebug] Module exit complete.\n");
}

module_init(nvdebug_init);
module_exit(nvdebug_exit);