nvdebug_entry.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361

/* Copyright 2024 Joshua Bakita
 * SPDX-License-Identifier: MIT
 */

#include <linux/device.h>  // For struct device, bus_find_device*(), struct bus_type
#include <linux/interrupt.h> // For hooking the nvidia driver interrupts
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>  // For PCI device scanning
#include <linux/proc_fs.h>  // So we can set up entries in /proc

#include "nvdebug.h"
#include "stubs.h"

// Enable to intercept and log GPU interrupts. Historically used to benchmark
// interrupt latency.
#define INTERRUPT_DEBUG 0

// MIT is GPL-compatible. We need to be GPL-compatible for symbols like
// platform_bus_type or bus_find_device_by_name...
MODULE_LICENSE("Dual MIT/GPL");
MODULE_AUTHOR("Joshua Bakita");
MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");

extern struct file_operations runlist_file_ops;
extern struct file_operations preempt_tsg_file_ops;
extern struct file_operations disable_channel_file_ops;
extern struct file_operations enable_channel_file_ops;
extern struct file_operations switch_to_tsg_file_ops;
extern struct file_operations device_info_file_ops;
extern struct file_operations copy_topology_file_ops;
extern struct file_operations nvdebug_read_reg32_file_ops;
extern struct file_operations nvdebug_read_reg_range_file_ops;

struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
unsigned int g_nvdebug_devices = 0;
// Bus types are global symbols in the kernel
extern struct bus_type platform_bus_type;

// Starting in Kernel 5.6, proc_ops is required instead of file_operations.
// As file_operations is larger than proc_ops, we can overwrite the memory
// backing the file_operations struct to follow the proc_ops layout, and then
// cast on newer kernels.
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0)
const struct proc_ops* compat_ops(const struct file_operations* ops) {
	struct proc_ops new_ops = {};
	new_ops.proc_open = ops->open;
	new_ops.proc_read = ops->read;
	new_ops.proc_write = ops->write;
	new_ops.proc_lseek = ops->llseek;
	new_ops.proc_release = ops->release;
	memcpy((void*)ops, &new_ops, sizeof(new_ops));
	return (struct proc_ops*)ops;
}
#else
const struct file_operations* compat_ops(const struct file_operations* ops) {
	return ops;
}
#endif

#if INTERRUPT_DEBUG
irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) {
	printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num);
	return IRQ_NONE; // We don't actually handle any interrupts. Pass them on.
}
#endif // INTERRUPT_DEBUG

// Find any and all NVIDIA GPUs in the system
// Note: This function fails if any of them are in a bad state
int probe_and_cache_devices(void) {
	// platform bus (SoC) iterators
	struct device *dev = NULL;
	struct device *temp_dev;
	// PCI search iterator and search query
	struct pci_dev *pcid = NULL;
	// This query pattern is mirrored off nouveau
	struct pci_device_id query = {
		.vendor = NV_PCI_VENDOR,  // Match NVIDIA devices
		.device = PCI_ANY_ID,
		.subvendor = PCI_ANY_ID,
		.subdevice = PCI_ANY_ID,
		.class_mask = 0xff << 16,
		.class = PCI_BASE_CLASS_DISPLAY << 16,  // Match display devs
	};
	int i = 0;
	// Search the platform bus for the first device that matches our name
	// Search for GA10B (Jetson Orin)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.ga10b")))
		dev = temp_dev;
	// Search for GV11B (Jetson Xavier)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b")))
		dev = temp_dev;
	// Search for GP10B (Jetson TX2)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gp10b")))
		dev = temp_dev;
	// Search for GM10A (Jetson TX1)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "57000000.gpu")))
		dev = temp_dev;
	// TODO: Support other platform bus devices (gk20a - TK1)
	if (dev) {
		mc_boot_0_t ids;
		g_nvdebug_state[i].g = get_gk20a(dev);
		g_nvdebug_state[i].regs = gk20a_regs(g_nvdebug_state[i].g);
		if (!g_nvdebug_state[i].regs)
			return -EADDRNOTAVAIL;
		ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
		if (ids.raw == -1)
			return -EADDRNOTAVAIL;
		g_nvdebug_state[i].chip_id = ids.chip_id;
		g_nvdebug_state[i].pcid = NULL;
		g_nvdebug_state[i].bar3 = NULL;
		g_nvdebug_state[i].dev = dev;
		printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on platform bus and initialized.",
		       ids.chip_id, ARCH2NAME(ids.architecture));
		i++;
	}
	// Search the PCI bus and iterate through all matches
	// FIXME: State rollback
	while ((pcid = pci_get_dev_by_id(&query, pcid)) && i < NVDEBUG_MAX_DEVICES) {
		mc_boot_0_t ids;
		g_nvdebug_state[i].g = NULL;
		// Map BAR0 (GPU control registers)
		g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0);
		if (!g_nvdebug_state[i].regs) {
			pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n");
			return -EADDRNOTAVAIL;
		}
		// Map BAR3 (CPU-accessible mappings of GPU DRAM)
		g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, 0);
		// Try mapping only the lower half of BAR3 on fail
		// (vesafb may map the top half for display)
		if (!g_nvdebug_state[i].bar3)
			g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2);
		g_nvdebug_state[i].pcid = pcid;
		g_nvdebug_state[i].dev = &pcid->dev;
		ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
		if (ids.raw == -1) {
			pci_err(pcid, "[nvdebug] Unable to read config from Master Controller on this GPU\n");
			return -EADDRNOTAVAIL;
		}
		g_nvdebug_state[i].chip_id = ids.chip_id;
		printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.",
		       ids.chip_id, ARCH2NAME(ids.architecture));
#if INTERRUPT_DEBUG
		if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) {
			printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n");
		}
#endif // INTERRUPT_DEBUG
		i++;
	}
	// Return the number of devices found
	if (i > 0)
		return i;
	return -ENODEV;
}

// Create files `/proc/gpu#/runlist#`, world readable
// Support: Fermi, Maxwell, Pascal, Volta, Turing
int create_runlist_files(int device_id, struct proc_dir_entry *dir) {
	ptop_device_info_gk104_t info;
	struct proc_dir_entry *rl_entry;
	int i, rl_id;
	char runlist_name[12];
	int max_rl_id = 0; // Always at least one runlist
	// Figure out how many runlists there are by checking the device info
	// registers. Runlists are always numbered sequentially, so we just have
	// to find the highest-valued one and add 1 to get the number of runlists.
	for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_GK104; i++) {
		info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO_GK104(i));
		if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid)
			continue;
		if (info.runlist_enum > max_rl_id)
			max_rl_id = info.runlist_enum;
	}
	// Create files to read each runlist. The read handling code looks at the
	// PDE_DATA associated with the file to determine what the runlist ID is.
	for (rl_id = 0; rl_id <= max_rl_id; rl_id++) {
		snprintf(runlist_name, 12, "runlist%d", rl_id);
		rl_entry = proc_create_data(
			runlist_name, 0444, dir, compat_ops(&runlist_file_ops),
			(void*)(uintptr_t)rl_id);
		if (!rl_entry)
			return -ENOMEM;
	}
	return 0;
}

// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
// Support: Maxwell+
int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) {
	struct nvdebug_state* g = &g_nvdebug_state[device_id];
	char file_name[20];
	int i;
	struct proc_dir_entry *gpc_tpc_mask_entry;
	// Get maximum number of enabled GPCs for this chip
	uint32_t max_gpcs = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_SCAL_NUM_GPCS);
	// Get a bitmask of which GPCs are disabled
	uint32_t gpcs_mask;
	if (g->chip_id < NV_CHIP_ID_AMPERE)
		gpcs_mask = nvdebug_readl(g, NV_FUSE_GPC_GM107);
	else
		gpcs_mask = nvdebug_readl(g, NV_FUSE_GPC_GA100);
	// Verify the reads succeeded
	if (max_gpcs == -1 || gpcs_mask == -1)
		return -EIO;
	// For each enabled GPC, expose a mask of disabled TPCs
	for (i = 0; i < max_gpcs; i++) {
		// Do nothing if GPC is disabled
		if ((1 << i) & gpcs_mask)
			continue;
		// If GPC is enabled, create an entry to read disabled TPCs mask
		snprintf(file_name, 20, "gpc%d_tpc_mask", i);
		if (g->chip_id < NV_CHIP_ID_AMPERE)
			gpc_tpc_mask_entry = proc_create_data(
				file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC_GM107(i));
		else
			gpc_tpc_mask_entry = proc_create_data(
				file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC_GA100(i));
		if (!gpc_tpc_mask_entry)
			return -ENOMEM;
	}
	return 0;
}

int __init nvdebug_init(void) {
	struct proc_dir_entry *dir;
	int err, res;
	// Check that an NVIDIA GPU is present and initialize g_nvdebug_state
	if ((res = probe_and_cache_devices()) < 0)
		return res;
	g_nvdebug_devices = res;
	// Create seperate ProcFS directories for each gpu
	while (res--) {
		char device_id_str[7];
		// Create a wider copy of the GPU ID to allow us to abuse the *data
		// field of proc_dir_entry to store the GPU ID.
		uintptr_t device_id = res;
		// Create directory /proc/gpu# where # is the GPU number
		// As ProcFS entry creation only fails if out of memory, we auto-skip
		// to handling that on any error in creating ProcFS files.
		snprintf(device_id_str, 7, "gpu%ld", device_id);
		if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id)))
			goto out_nomem;
		// Create files `/proc/gpu#/runlist#`, world readable
		if (g_nvdebug_state[device_id].chip_id < NV_CHIP_ID_AMPERE)
			if ((err = create_runlist_files(device_id, dir)))
				goto out_err;
		// Create file `/proc/gpu#/preempt_tsg`, world writable
		if (!proc_create_data(
				"preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops),
				(void*)device_id))
			goto out_nomem;
		// Create file `/proc/gpu#/disable_channel`, world writable
		if (!proc_create_data(
				"disable_channel", 0222, dir, compat_ops(&disable_channel_file_ops),
				(void*)device_id))
			goto out_nomem;
		// Create file `/proc/gpu#/enable_channel`, world writable
		if (!proc_create_data(
				"enable_channel", 0222, dir, compat_ops(&enable_channel_file_ops),
				(void*)device_id))
			goto out_nomem;
		// Create file `/proc/gpu#/switch_to_tsg`, world writable
		if (!proc_create_data(
				"switch_to_tsg", 0222, dir, compat_ops(&switch_to_tsg_file_ops),
				(void*)device_id))
			goto out_nomem;
		// Create file `/proc/gpu#/device_info`, world readable
		if (!proc_create_data(
				"device_info", 0444, dir, compat_ops(&device_info_file_ops),
				(void*)device_id))
			goto out_nomem;
		// Create file `/proc/gpu#/num_gpcs`, world readable
		if (!proc_create_data(
				"num_gpcs", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)NV_PTOP_SCAL_NUM_GPCS))
			goto out_nomem;
		// Create file `/proc/gpu#/num_tpc_per_gpc`, world readable
		if (!proc_create_data(
				"num_tpc_per_gpc", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)NV_PTOP_SCAL_NUM_TPC_PER_GPC))
			goto out_nomem;
		// Create file `/proc/gpu#/num_ces`, world readable
		if (!proc_create_data(
				"num_ces", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)NV_PTOP_SCAL_NUM_CES))
			goto out_nomem;
		// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable (Maxwell+)
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL)
			if ((err = create_tpc_mask_files(device_id, dir)))
				goto out_err;
		// Create file `/proc/gpu#/gpc_mask`, world readable (Maxwell+)
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_AMPERE) {
			if (!proc_create_data(
					"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
					(void*)NV_FUSE_GPC_GA100))
				goto out_nomem;
		} else if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_MAXWELL) {
			if (!proc_create_data(
					"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
					(void*)NV_FUSE_GPC_GM107))
				goto out_nomem;
		}
		// Create files exposing LCE and PCE configuration (Pascal+)
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_PASCAL) {
			// Create file `/proc/gpu#/copy_topology`, world readable
			if (!proc_create_data(
					"copy_topology", 0444, dir, compat_ops(&copy_topology_file_ops),
					(void*)0))
				goto out_nomem;
			// Create file `/proc/gpu#/pce_map`, world readable
			if (!proc_create_data(
					"pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
					(void*)NV_CE_PCE_MAP))
				goto out_nomem;
		}
	}
	// (See Makefile if you want to know the origin of GIT_HASH.)
	printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
	return 0;
out_nomem:
	err = -ENOMEM;
out_err:
	// Make sure to clear all ProcFS directories on error
	while (res < g_nvdebug_devices) {
		char device_id_str[7];
		snprintf(device_id_str, 7, "gpu%d", res);
		remove_proc_subtree(device_id_str, NULL);
		res++;
	}
	return err;
}

static void __exit nvdebug_exit(void) {
	struct nvdebug_state *g;
	// Deinitialize each device
	while (g_nvdebug_devices--) {
		// Remove procfs directory
		char device_id[7];
		snprintf(device_id, 7, "gpu%d", g_nvdebug_devices);
		remove_proc_subtree(device_id, NULL);
		// Free BAR mappings for PCIe devices
		g = &g_nvdebug_state[g_nvdebug_devices];
		if (g && g->pcid) {
			if (g && g->regs)
				pci_iounmap(g->pcid, g->regs);
			if (g && g->bar2)
				pci_iounmap(g->pcid, g->bar2);
#if INTERRUPT_DEBUG
			free_irq(g->pcid->irq, g->pcid);
#endif // INTERRUPT_DEBUG
		}
		printk(KERN_INFO "[nvdebug] Chip ID %x deinitialized.", g->chip_id);
	}
	printk(KERN_INFO "[nvdebug] Module exit complete.\n");
}

module_init(nvdebug_init);
module_exit(nvdebug_exit);