nvdebug_entry.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306

/* Copyright 2021 Joshua Bakita
 * SPDX-License-Identifier: MIT
 */

#include <linux/device.h>  // For struct device, bus_find_device*(), struct bus_type
#include <linux/interrupt.h> // For hooking the nvidia driver interrupts
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>  // For PCI device scanning
#include <linux/proc_fs.h>  // So we can set up entries in /proc

#include "nvdebug.h"
#include "stubs.h"

// MIT is GPL-compatible. We need to be GPL-compatible for symbols like
// platform_bus_type or bus_find_device_by_name...
MODULE_LICENSE("Dual MIT/GPL");
MODULE_AUTHOR("Joshua Bakita");
MODULE_DESCRIPTION("A scheduling debugging module for NVIDIA GPUs");

extern struct file_operations runlist_file_ops;
extern struct file_operations preempt_tsg_file_ops;
extern struct file_operations disable_channel_file_ops;
extern struct file_operations enable_channel_file_ops;
extern struct file_operations switch_to_tsg_file_ops;
extern struct file_operations device_info_file_ops;
extern struct file_operations nvdebug_read_reg32_file_ops;

// Bus types are global symbols in the kernel
extern struct bus_type platform_bus_type;
struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
unsigned int g_nvdebug_devices = 0;

// Starting in Kernel 5.6, proc_ops is required instead of file_operations
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0)
// This rewrites the struct to the proc_ops layout on newer kernels
const struct proc_ops* compat_ops(const struct file_operations* ops) {
	struct proc_ops new_ops = {};
	new_ops.proc_open = ops->open;
	new_ops.proc_read = ops->read;
	new_ops.proc_write = ops->write;
	new_ops.proc_lseek = ops->llseek;
	new_ops.proc_release = ops->release;
	memcpy((void*)ops, &new_ops, sizeof(new_ops));
	return (struct proc_ops*)ops;
}
#else
const struct file_operations* compat_ops(const struct file_operations* ops) {
	return ops;
}
#endif

// TEMP
irqreturn_t nvdebug_irq_tap(int irq_num, void * dev) {
	printk(KERN_INFO "[nvdebug] Interrupt tap triggered on IRQ %d.\n", irq_num);
	return IRQ_NONE; // We don't actually handle any interrupts. Pass them on.
}

// Find any and all NVIDIA GPUs in the system
// Note: This function fails if any of them are in a bad state
int probe_and_cache_device(void) {
	// platform bus (SoC) iterators
	struct device *dev = NULL;
	struct device *temp_dev;
	// PCI search iterator and search query
	struct pci_dev *pcid = NULL;
	// This query pattern is mirrored off nouveau
	struct pci_device_id query = {
		.vendor = NV_PCI_VENDOR,  // Match NVIDIA devices
		.device = PCI_ANY_ID,
		.subvendor = PCI_ANY_ID,
		.subdevice = PCI_ANY_ID,
		.class_mask = 0xff << 16,
		.class = PCI_BASE_CLASS_DISPLAY << 16,  // Match display devs
	};
	int i = 0;
	// Search the platform bus for the first device that matches our name
	// Search for GA10B (Jetson Orin)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.ga10b")))
		dev = temp_dev;
	// Search for GV11B (Jetson Xavier)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gv11b")))
		dev = temp_dev;
	// Search for GP10B (Jetson TX2)
	while (!dev && (temp_dev = bus_find_device_by_name(&platform_bus_type, dev, "17000000.gp10b")))
		dev = temp_dev;
	// TODO: Support other platform bus devices (gk20a - TK1, gm20b - TX1)
	if (dev) {
		mc_boot_0_t ids;
		g_nvdebug_state[i].g = get_gk20a(dev);
		g_nvdebug_state[i].regs = gk20a_regs(g_nvdebug_state[i].g);
		if (!g_nvdebug_state[i].regs)
			return -EADDRNOTAVAIL;
		ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
		if (ids.raw == -1)
			return -EADDRNOTAVAIL;
		g_nvdebug_state[i].chip_id = ids.chip_id;
		printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on platform bus and initialized.",
		       ids.chip_id, ARCH2NAME(ids.architecture));
		i++;
	}
	// Search the PCI bus and iterate through all matches
	// FIXME: State rollback
	while ((pcid = pci_get_dev_by_id(&query, pcid)) && i < NVDEBUG_MAX_DEVICES) {
		mc_boot_0_t ids;
		g_nvdebug_state[i].g = NULL;
		// Map BAR0 (GPU control registers)
		g_nvdebug_state[i].regs = pci_iomap(pcid, 0, 0);
		if (!g_nvdebug_state[i].regs) {
			pci_err(pcid, "[nvdebug] Unable to map BAR0 on this GPU\n");
			return -EADDRNOTAVAIL;
		}
		// Map BAR3 (CPU-accessible mappings of GPU DRAM)
		g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, 0);
		// Try mapping only the lower half of BAR3 on fail
		// (vesafb may map the top half for display)
		if (!g_nvdebug_state[i].bar3)
			g_nvdebug_state[i].bar3 = pci_iomap(pcid, 3, pci_resource_len(pcid, 3)/2);
		g_nvdebug_state[i].pcid = pcid;
		ids.raw = nvdebug_readl(&g_nvdebug_state[i], NV_MC_BOOT_0);
		if (ids.raw == -1) {
			pci_err(pcid, "[nvdebug] Unable to read config from Master Controller on this GPU\n");
			return -EADDRNOTAVAIL;
		}
		g_nvdebug_state[i].chip_id = ids.chip_id;
		printk(KERN_INFO "[nvdebug] Chip ID %x (architecture %s) detected on PCI bus and initialized.",
		       ids.chip_id, ARCH2NAME(ids.architecture));
		// TEMP
		if (request_irq(pcid->irq, nvdebug_irq_tap, IRQF_SHARED, "nvdebug tap", pcid)) {
			printk(KERN_WARNING "[nvdebug] Unable to initialize IRQ tap\n");
		}
		i++;
	}
	// Return the number of devices we found
	if (i > 0)
		return i;
	return -ENODEV;
}

// Create files `/proc/gpu#/runlist#`, world readable
int create_runlist_files(int device_id, struct proc_dir_entry *dir) {
	ptop_device_info_t info;
	struct proc_dir_entry *rl_entry;
	int i, rl_id;
	char runlist_name[12];
	int max_rl_id = 0; // Always at least one runlist
	// Figure out how many runlists there are by checking the device info
	// registers. Runlists are always numbered sequentially, so we just have
	// to find the highest-valued one and add 1 to get the number of runlists.
	for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1; i++) {
		info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO(i));
		if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid)
			continue;
		if (info.runlist_enum > max_rl_id)
			max_rl_id = info.runlist_enum;
	}
	// Create files to read each runlist. The read handling code looks at the
	// PDE_DATA associated with the file to determine what the runlist ID is.
	for (rl_id = 0; rl_id <= max_rl_id; rl_id++) {
		snprintf(runlist_name, 12, "runlist%d", rl_id);
		rl_entry = proc_create_data(
			runlist_name, 0444, dir, compat_ops(&runlist_file_ops),
			(void*)(uintptr_t)rl_id);
		if (!rl_entry)
			return -ENOMEM;
	}
	return 0;
}

// Create files /proc/gpu#
// TODO: Don't run this on unsupported GPUs
int create_tpc_mask_files(int device_id, struct proc_dir_entry *dir) {
	char file_name[20];
	int i;
	struct proc_dir_entry *gpc_tpc_mask_entry;
	// Get a bitmask of which GPCs are disabled
	uint32_t gpcs_mask = nvdebug_readl(&g_nvdebug_state[device_id], NV_FUSE_GPC);
	// Get maximum number of enabled GPCs for this chip
	uint32_t max_gpcs = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_SCAL_NUM_GPCS);
	// For each enabled GPC, expose a mask of disabled TPCs
	for (i = 0; i < max_gpcs; i++) {
		// Do nothing if GPC is disabled
		if ((1 << i) & gpcs_mask)
			continue;
		// If GPC is enabled, create an entry to read disabled TPCs mask
		snprintf(file_name, 20, "gpc%d_tpc_mask", i);
		gpc_tpc_mask_entry = proc_create_data(
			file_name, 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)(uintptr_t)NV_FUSE_TPC_FOR_GPC(i));
		if (!gpc_tpc_mask_entry)
			return -ENOMEM;
	}
	return 0;
}

int __init nvdebug_init(void) {
	struct proc_dir_entry *dir, *preempt_entry, *disable_channel_entry,
			      *enable_channel_entry, *switch_to_tsg_entry, *device_info_entry,
			      *num_gpcs_entry;
	int rl_create_err, tpc_masks_create_err;
	// Check that an NVIDIA GPU is present and initialize g_nvdebug_state
	int res = probe_and_cache_device();
	if (res < 0)
		return res;
	g_nvdebug_devices = res;
	// Create seperate ProcFS directories for each gpu
	while (res--) {
		char device_id_str[7];
		uintptr_t device_id = res;  // This is uintptr as we abuse the *data field on proc_dir_entry to store the GPU id
		// Create directory /proc/gpu# where # is the GPU number
		snprintf(device_id_str, 7, "gpu%ld", device_id);
		if (!(dir = proc_mkdir_data(device_id_str, 0555, NULL, (void*)device_id)))
			goto out_nomem;
		// Create files `/proc/gpu#/runlist#`, world readable
		rl_create_err = create_runlist_files(device_id, dir);
		// Create files `/proc/gpu#/gpc#_tpc_mask`, world readable
		tpc_masks_create_err = create_tpc_mask_files(device_id, dir);
		// Create file `/proc/gpu#/preempt_tsg`, world writable
		preempt_entry = proc_create_data(
			"preempt_tsg", 0222, dir, compat_ops(&preempt_tsg_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/disable_channel`, world writable
		disable_channel_entry = proc_create_data(
			"disable_channel", 0222, dir, compat_ops(&disable_channel_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/enable_channel`, world writable
		enable_channel_entry = proc_create_data(
			"enable_channel", 0222, dir, compat_ops(&enable_channel_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/switch_to_tsg`, world writable
		switch_to_tsg_entry = proc_create_data(
			"switch_to_tsg", 0222, dir, compat_ops(&switch_to_tsg_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/device_info`, world readable
		device_info_entry = proc_create_data(
			"device_info", 0444, dir, compat_ops(&device_info_file_ops),
			(void*)device_id);
		// Create file `/proc/gpu#/num_gpcs`, world readable
		num_gpcs_entry = proc_create_data(
			"num_gpcs", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_PTOP_SCAL_NUM_GPCS);
		// Create file `/proc/gpu#/num_tpc_per_gpc`, world readable
		num_gpcs_entry = proc_create_data(
			"num_tpc_per_gpc", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_PTOP_SCAL_NUM_TPC_PER_GPC);
		// Create file `/proc/gpu#/num_ces`, world readable
		num_gpcs_entry = proc_create_data(
			"num_ces", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_PTOP_SCAL_NUM_CES);
		// Create file `/proc/gpu#/num_ces`, world readable
		num_gpcs_entry = proc_create_data(
			"gpc_mask", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
			(void*)NV_FUSE_GPC);
		// In both nouveau and nvgpu, the PCE_MAP register is only available on Volta+
		if (g_nvdebug_state[res].chip_id >= NV_CHIP_ID_VOLTA) {
			// TODO: Redo to num_pces
			// Create file `/proc/gpu#/pce_map`, world readable
			num_gpcs_entry = proc_create_data(
				"pce_map", 0444, dir, compat_ops(&nvdebug_read_reg32_file_ops),
				(void*)NV_CE_PCE_MAP);
		}
		// ProcFS entry creation only fails if out of memory
		if (rl_create_err || tpc_masks_create_err || !preempt_entry ||
		    !disable_channel_entry || !enable_channel_entry ||
		    !switch_to_tsg_entry || !device_info_entry || !num_gpcs_entry)
			goto out_nomem;
	}
	// (See Makefile if you want to know the origin of GIT_HASH.)
	printk(KERN_INFO "[nvdebug] Module version "GIT_HASH" initialized\n");
	return 0;
out_nomem:
	// Make sure to clear all ProcFS directories on error
	while (res < g_nvdebug_devices) {
		char device_id_str[7];
		snprintf(device_id_str, 7, "gpu%d", res);
		remove_proc_subtree(device_id_str, NULL);
		res++;
	}
	return -ENOMEM;
}

static void __exit nvdebug_exit(void) {
	struct nvdebug_state *g;
	// Deinitialize each device
	while (g_nvdebug_devices--) {
		// Remove procfs directory
		char device_id[7];
		snprintf(device_id, 7, "gpu%d", g_nvdebug_devices);
		remove_proc_subtree(device_id, NULL);
		// Free BAR mappings for PCIe devices
		g = &g_nvdebug_state[g_nvdebug_devices];
		if (g && g->pcid) {
			if (g && g->regs)
				pci_iounmap(g->pcid, g->regs);
			if (g && g->bar2)
				pci_iounmap(g->pcid, g->bar2);
			// TEMP
			free_irq(g->pcid->irq, g->pcid);
		}
		printk(KERN_INFO "[nvdebug] Chip ID %x deinitialized.", g->chip_id);
	}
	printk(KERN_INFO "[nvdebug] Module exit complete.\n");
}

module_init(nvdebug_init);
module_exit(nvdebug_exit);