runlist_procfs.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440

#include <linux/seq_file.h> // For seq_* functions and types
#include <linux/version.h>  // Macros to detect kernel version

#include "nvdebug_linux.h"

// When defined, printing the runlist also prints expanded per-channel status
// information. Comment out this define to omit that detail.
#define DETAILED_CHANNEL_INFO

#ifdef DETAILED_CHANNEL_INFO
/* Dump the control state of one channel as read from PCCSR (used on chips
   older than Ampere)
  @param s      seq_file state, forwarded to seq_printf()
  @param g      Our internal state for the GPU being queried
  @param chid   ID of the channel to describe, range [0, 512)
  @param prefix Text emitted at the start of every line (may be empty string)
  @return 0 on success, -EIO if the register read came back as all ones
*/
static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix) {
	channel_ctrl_t ctrl;
	uint64_t inst_addr;

	ctrl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chid));
	if (ctrl.raw == -1)
		return -EIO;

	// Write-only fields are deliberately skipped
	seq_printf(s, "%s|= Channel Info ======|\n", prefix);
	seq_printf(s, "%s| Enabled:           %d|\n", prefix, ctrl.enable);
	seq_printf(s, "%s| Next:              %d|\n", prefix, ctrl.next);
	seq_printf(s, "%s| PBDMA Faulted:     %d|\n", prefix, ctrl.pbdma_faulted);
	seq_printf(s, "%s| ENG Faulted:       %d|\n", prefix, ctrl.eng_faulted);
	seq_printf(s, "%s| Status:           %2d|\n", prefix, ctrl.status);
	seq_printf(s, "%s| Busy:              %d|\n", prefix, ctrl.busy);
	seq_printf(s, "%s| Instance PTR:       |\n", prefix);
	// The register stores the instance pointer shifted right by 12 bits
	inst_addr = (uint64_t)ctrl.inst_ptr << 12;
	seq_printf(s, "%s|   %#018llx|\n", prefix, inst_addr);
	seq_printf(s, "%s| %20s|\n", prefix, target_to_text(ctrl.inst_target));
	seq_printf(s, "%s| Instance bound:    %d|\n", prefix, ctrl.inst_bind);
	return 0;
}

/* Ampere+ counterpart of `runlist_detail_seq_show_chan()`
  @param s                seq_file state, forwarded to seq_printf()
  @param g                Our internal state for the GPU being queried
  @param chid             ID of the channel to describe
  @param prefix           Text emitted at the start of every line
  @param runlist_pri_base Base of the RLRAM region for this runlist
  @return 0 on success, -EIO if either register read came back as all ones

  Channel RAM is per-runlist on Ampere+, and where it lives is configured in
  Runlist RAM; that is why `runlist_pri_base` has to be supplied.
*/
static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base) {
	runlist_channel_config_t cfg;
	channel_ctrl_ga100_t ctrl;
	uint32_t chan_reg;

	// Step 1: ask Runlist RAM where this runlist's Channel RAM is located
	cfg.raw = nvdebug_readl(g, runlist_pri_base + NV_RUNLIST_CHANNEL_CONFIG_GA100);
	if (cfg.raw == -1)
		return -EIO;

	// Step 2: read this channel's entry (entries are indexed by channel ID)
	chan_reg = ((uint32_t)cfg.bar0_offset << 4) + chid * 4;
	ctrl.raw = nvdebug_readl(g, chan_reg);
	if (ctrl.raw == -1)
		return -EIO;

	seq_printf(s, "%s|= Channel Info ======|\n", prefix);
	seq_printf(s, "%s| Enabled:           %d|\n", prefix, ctrl.enable);
	seq_printf(s, "%s| Next:              %d|\n", prefix, ctrl.next);
	seq_printf(s, "%s| Busy:              %d|\n", prefix, ctrl.busy);
	seq_printf(s, "%s| PBDMA Faulted:     %d|\n", prefix, ctrl.pbdma_faulted);
	seq_printf(s, "%s| ENG Faulted:       %d|\n", prefix, ctrl.eng_faulted);
	seq_printf(s, "%s| On PBDMA:          %d|\n", prefix, ctrl.on_pbdma);
	seq_printf(s, "%s| On ENG:            %d|\n", prefix, ctrl.on_eng);
	seq_printf(s, "%s| Pending:           %d|\n", prefix, ctrl.pending);
	seq_printf(s, "%s| CTX Reload:        %d|\n", prefix, ctrl.ctx_reload);
	seq_printf(s, "%s| PBDMA Busy:        %d|\n", prefix, ctrl.pbdma_busy);
	seq_printf(s, "%s| ENG Busy:          %d|\n", prefix, ctrl.eng_busy);
	seq_printf(s, "%s| Acquire Fail:      %d|\n", prefix, ctrl.acquire_fail);
	return 0;
}
#endif

#if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)
// Bug workaround. See comment in runlist_file_seq_start()
// Saved copy of the seq_file position: written by runlist_file_seq_next()
// (set to 0 once the end of the runlist is reached) and reloaded by
// runlist_file_seq_start() when a print is resumed.
static loff_t pos_fixup;
#endif

/* seq_file start callback: load a fresh runlist iterator, or resume the
   previous one
  @param s   seq_file state; its backing file selects the GPU and runlist
  @param pos Position in the runlist; 0 for the first call after a read
  @return The iterator to hand to _show()/_next(), NULL when nothing is left
          to print, or an ERR_PTR() if the runlist iterator could not be set up
*/
static void *runlist_file_seq_start(struct seq_file *s, loff_t *pos) {
	// Kept in static storage so that a later call can resume the traversal
	static struct runlist_iter iter;
	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
	int err;

	if (*pos != 0) {
		// A non-zero position at or beyond the end means that the runlist
		// has already been traversed
		if (!(*pos < iter.len))
			return NULL;
#if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)
		// Prior to 4.19-rc1 there's a nasty bug: if the buffer overflows, the
		// last update to `pos` is lost. Compensate by restoring the copy of
		// `pos` that _next() stashed away.
		if (!pos_fixup)
			return NULL;
		*pos = pos_fixup;
#endif
		// Otherwise we're continuing an earlier print
		return &iter;
	}

	// First call after a read of the file: (re)initialize the iterator
	err = get_runlist_iter(g, seq2gpuidx(s), &iter);
	if (err)
		return ERR_PTR(err);
	// An empty runlist has nothing to print
	return (iter.len > 0) ? &iter : NULL;
}

/* seq_file next callback: move the iterator forward by one runlist entry
  @param s           seq_file state; its backing file selects the GPU
  @param raw_rl_iter The `struct runlist_iter` returned by _start()/_next()
  @param pos         Position in the runlist; incremented by one here
  @return The iterator again, or NULL once the end of the runlist is reached
*/
static void* runlist_file_seq_next(struct seq_file *s, void *raw_rl_iter,
				   loff_t *pos) {
	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
	struct runlist_iter *iter = raw_rl_iter;
	void *next_item;

	// Step over exactly one entry (which is either a TSG or a channel)
	iter->curr_entry += NV_RL_ENTRY_SIZE(g);
	*pos += 1;
	// `len` is the number of TSG entries plus the total number of channel
	// entries, so reaching it means the whole runlist has been visited
	next_item = (*pos < iter->len) ? iter : NULL;
#if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0)
	// Bug workaround. See comment in runlist_file_seq_start()
	pos_fixup = next_item ? *pos : 0;
#endif
	// One fewer entry is outstanding in the current TSG (if we're in one)
	if (iter->entries_left_in_tsg)
		iter->entries_left_in_tsg--;
	return next_item;
}

// seq_file stop callback. Intentionally empty: no cleanup is performed when
// iteration ends or pauses.
static void runlist_file_seq_stop(struct seq_file *s, void *raw_rl_iter) {
}

// _show() must be idempotent. This function will be rerun if the seq_printf
// buffer was too small.
static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) {
	struct runlist_iter *rl_iter = raw_rl_iter;
	void *entry = rl_iter->curr_entry;
	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)];
	if (entry_type(g, entry) == ENTRY_TYPE_TSG) {
		if (rl_iter->entries_left_in_tsg) {
			printk(KERN_WARNING "[nvdebug] Found TSG ID%d @ %px when %d channels were still expected under the previous TSG in the runlist!\n", tsgid(g, entry), entry, rl_iter->entries_left_in_tsg);
			while (rl_iter->entries_left_in_tsg--)
				seq_printf(s, "[missing channel]\n");
		}
		rl_iter->entries_left_in_tsg = tsg_length(g, entry) + 1;
		seq_printf(s, "+---- TSG Entry %-3d---+\n", tsgid(g, entry));
		seq_printf(s, "| Scale: %-13d|\n", timeslice_scale(g, entry));
		seq_printf(s, "| Timeout: %-11d|\n", timeslice_timeout(g, entry));
		seq_printf(s, "| Length: %-12d|\n", tsg_length(g, entry));
		seq_printf(s, "+---------------------+\n");
	} else {
		char *indt = "";
		u64 instance_ptr = 0;
		if (rl_iter->entries_left_in_tsg)
			indt = "  ";
		// Reconstruct pointer to channel instance block
		if (g->chip_id >= NV_CHIP_ID_VOLTA) {
			instance_ptr = ((struct gv100_runlist_chan*)entry)->inst_ptr_hi;
			instance_ptr <<= 32;
		}
		instance_ptr |= inst_ptr_lo(g, entry) << 12;
		// Print channel information from runlist
		seq_printf(s, "%s+- Channel Entry %-4d-+\n", indt, chid(g, entry));
		if (g->chip_id >= NV_CHIP_ID_VOLTA)
			seq_printf(s, "%s| Runqueue Selector: %d|\n", indt,
			           ((struct gv100_runlist_chan*)entry)->runqueue_selector);
		// Not populated on Kepler [ex: gk104 in Bonham (Quadro K5000)], and
		// populated but unused on Pascal [ex: gp104 in Bonham (GTX 1080 Ti)].
		// (The aperture field may be incorrectly populated as INVALID, but the
		// context still works on the aformentioned Pascal GPU.)
		seq_printf(s, "%s| Instance PTR:       |\n", indt);
		seq_printf(s, "%s|   %#018llx|\n", indt, instance_ptr);
		seq_printf(s, "%s| %20s|\n", indt, target_to_text(inst_target(g, entry)));
#ifdef DETAILED_CHANNEL_INFO
		// Print channel info from PCCSR/Channel RAM and the instance block
		if (g->chip_id < NV_CHIP_ID_AMPERE)
			runlist_detail_seq_show_chan(s, g, chid(g, entry), indt);
		else
			runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base);
#endif
		seq_printf(s, "%s+---------------------+\n", indt);
	}
	return 0;
}

// seq_file iterator callbacks for the runlist file; registered with
// seq_open() in runlist_file_open()
static const struct seq_operations runlist_file_seq_ops = {
	.start = runlist_file_seq_start,
	.next = runlist_file_seq_next,
	.stop = runlist_file_seq_stop,
	.show = runlist_file_seq_show,
};

// open() handler for the runlist file: hooks the file up to the seq_file
// iterator callbacks and passes seq_open()'s result straight back
static int runlist_file_open(struct inode *inode, struct file *f) {
	int err = seq_open(f, &runlist_file_seq_ops);

	return err;
}

// File operations for the read-only runlist file. Everything except open()
// is delegated to the generic seq_file helpers.
struct file_operations runlist_file_ops = {
	.open = runlist_file_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};

/* Write handler: preempt the TSG whose ID was written to the file
  @param f      File written to; selects the GPU (and, on Ampere+, the runlist)
  @param buffer User-space buffer holding the TSG ID as text
  @param count  Number of bytes in `buffer`
  @param off    Unused
  @return `count` on success; otherwise the parse error, -ERANGE for an
          out-of-range TSG ID, or the error reported by preempt_tsg()
*/
ssize_t preempt_tsg_file_write(struct file *f, const char __user *buffer,
                               size_t count, loff_t *off) {
	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
	uint32_t target_tsgid;
	// Runlist RAM index 0 is used on everything older than Ampere
	uint32_t target_runlist_ram = 0;
	int err;

	// A base of 0 tells kstrtou32 to autodetect hex/octal/dec
	err = kstrtou32_from_user(buffer, count, 0, &target_tsgid);
	if (err)
		return err;

	// Reject IDs that cannot fit in the 12-bit TSG ID field
	if (target_tsgid > MAX_TSGID)
		return -ERANGE;

	// On Ampere+, TSG IDs are scoped to a Runlist RAM; its index is stored in
	// the (ab)used PDE_DATA field of this file
	if (g->chip_id >= NV_CHIP_ID_AMPERE)
		target_runlist_ram = file2gpuidx(f);

	// Execute preemption
	err = preempt_tsg(g, target_runlist_ram, target_tsgid);
	if (err)
		return err;

	return count;
}

// File operations for the TSG-preemption file (no read handler is provided)
struct file_operations preempt_tsg_file_ops = {
	.write = preempt_tsg_file_write,
	.llseek = default_llseek,
};

/* Write handler: resubmit the runlist whose ID was written to the file
  @param f      File written to; selects the GPU
  @param buffer User-space buffer holding the runlist ID as text
  @param count  Number of bytes in `buffer`
  @param off    Unused
  @return `count` on success; otherwise the parse error or the error reported
          by resubmit_runlist()
*/
ssize_t resubmit_runlist_file_write(struct file *f, const char __user *buffer,
                                    size_t count, loff_t *off) {
	struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)];
	uint32_t target_runlist;
	int err;

	// A base of 0 tells kstrtou32 to autodetect hex/octal/dec
	err = kstrtou32_from_user(buffer, count, 0, &target_runlist);
	if (err)
		return err;

	// No range check here; resubmit_runlist() validates target_runlist
	err = resubmit_runlist(g, target_runlist);
	if (err)
		return err;

	return count;
}

// File operations for the runlist-resubmission file (no read handler is
// provided)
struct file_operations resubmit_runlist_file_ops = {
	.write = resubmit_runlist_file_write,
	.llseek = default_llseek,
};


/* Write handler: disable the channel whose ID was written to the file
  @param f      File written to; selects the GPU (and, on Ampere+, the runlist)
  @param buffer User-space buffer holding the channel ID as text
  @param count  Number of bytes in `buffer`
  @param off    Unused
  @return `count` on success; otherwise the parse error, -ERANGE for an
          out-of-range channel ID, -EIO if a register read returned all ones,
          or the error reported by get_runlist_ram()
*/
ssize_t disable_channel_file_write(struct file *f, const char __user *buffer,
                                   size_t count, loff_t *off) {
	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
	uint32_t target_channel;
	int err;

	// A base of 0 tells kstrtou32 to autodetect hex/octal/dec
	err = kstrtou32_from_user(buffer, count, 0, &target_channel);
	if (err)
		return err;

	if (g->chip_id >= NV_CHIP_ID_AMPERE) {
		uint32_t rl_base, chram_base, num_channels;
		runlist_channel_config_t cfg;
		channel_ctrl_ga100_t ctrl;

		// (Ab)use the PDE_DATA field for the runlist ID
		err = get_runlist_ram(g, file2gpuidx(f), &rl_base);
		if (err)
			return err;
		// On Ampere, Channel RAM is per-runlist and located via Runlist RAM
		cfg.raw = nvdebug_readl(g, rl_base + NV_RUNLIST_CHANNEL_CONFIG_GA100);
		if (cfg.raw == -1)
			return -EIO;
		num_channels = 1u << cfg.num_channels_log2;
		if (target_channel >= num_channels)
			return -ERANGE;
		chram_base = (uint32_t)cfg.bar0_offset << 4;
		// Zeros written to any field of the Ampere+ channel control structure
		// are ignored, so there is no need for a read-modify-write: start
		// from all zeros and set only the bits we care about.
		ctrl.raw = 0;
		ctrl.is_write_one_clears_bits = 1; // Invert meaning of writing 1
		ctrl.enable = 1;
		nvdebug_writel(g, chram_base + sizeof(channel_ctrl_ga100_t) * target_channel, ctrl.raw);
	} else {
		channel_ctrl_t ctrl;

		if (target_channel > MAX_CHID)
			return -ERANGE;
		// Start from the channel's current configuration
		ctrl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel));
		if (ctrl.raw == -1)
			return -EIO;
		// Request disablement and write the configuration back
		ctrl.enable_clear = true;
		nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(target_channel), ctrl.raw);
	}

	return count;
}

// File operations for the channel-disable file (no read handler is provided)
struct file_operations disable_channel_file_ops = {
	.write = disable_channel_file_write,
	.llseek = default_llseek,
};

/* Write handler: enable the channel whose ID was written to the file
  @param f      File written to; selects the GPU (and, on Ampere+, the runlist)
  @param buffer User-space buffer holding the channel ID as text
  @param count  Number of bytes in `buffer`
  @param off    Unused
  @return `count` on success; otherwise the parse error, -ERANGE for an
          out-of-range channel ID, -EIO if a register read returned all ones,
          or the error reported by get_runlist_ram()
*/
ssize_t enable_channel_file_write(struct file *f, const char __user *buffer,
                                  size_t count, loff_t *off) {
	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
	uint32_t target_channel;
	int err;

	// A base of 0 tells kstrtou32 to autodetect hex/octal/dec
	err = kstrtou32_from_user(buffer, count, 0, &target_channel);
	if (err)
		return err;

	if (g->chip_id >= NV_CHIP_ID_AMPERE) {
		uint32_t rl_base, chram_base, num_channels;
		runlist_channel_config_t cfg;
		channel_ctrl_ga100_t ctrl;

		// (Ab)use the PDE_DATA field for the runlist ID
		err = get_runlist_ram(g, file2gpuidx(f), &rl_base);
		if (err)
			return err;
		// On Ampere, Channel RAM is per-runlist and located via Runlist RAM
		cfg.raw = nvdebug_readl(g, rl_base + NV_RUNLIST_CHANNEL_CONFIG_GA100);
		if (cfg.raw == -1)
			return -EIO;
		num_channels = 1u << cfg.num_channels_log2;
		if (target_channel >= num_channels)
			return -ERANGE;
		chram_base = (uint32_t)cfg.bar0_offset << 4;
		// Zeros written to any field of the Ampere+ channel control structure
		// are ignored, so there is no need for a read-modify-write: start
		// from all zeros and set only the enable bit.
		ctrl.raw = 0;
		ctrl.enable = 1;
		nvdebug_writel(g, chram_base + sizeof(channel_ctrl_ga100_t) * target_channel, ctrl.raw);
	} else {
		channel_ctrl_t ctrl;

		if (target_channel > MAX_CHID)
			return -ERANGE;
		// Start from the channel's current configuration
		ctrl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel));
		if (ctrl.raw == -1)
			return -EIO;
		// Request enablement and write the configuration back
		ctrl.enable_set = true;
		nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(target_channel), ctrl.raw);
	}

	return count;
}

// File operations for the channel-enable file (no read handler is provided)
struct file_operations enable_channel_file_ops = {
	.write = enable_channel_file_write,
	.llseek = default_llseek,
};

// Tested working on Pascal (gp106) through Ada (ad102)
/* Write handler: make the TSG whose ID was written the only enabled TSG on
   this file's runlist, then preempt the runlist so the change takes effect
  @param f      File written to; selects the GPU and the runlist
  @param buffer User-space buffer holding the TSG ID as text
  @param count  Number of bytes in `buffer`
  @param off    Unused
  @return `count` on success; otherwise the parse error, -ERANGE for an
          out-of-range TSG ID, -EIO if a register read returned all ones, or
          the error reported by one of the runlist helpers

  Channels are updated one at a time. If an error is returned part-way through
  the runlist, channels already visited keep their new enablement state.
*/
ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer,
                                 size_t count, loff_t *off) {
	uint32_t target_tsgid, target_runlist;
	// Only assigned and read on Ampere+. Zero-initialized so that it never
	// holds an indeterminate value on older chips.
	uint32_t channel_regs_base = 0;
	struct gv100_runlist_chan* chan;
	channel_ctrl_t chan_ctl;
	channel_ctrl_ga100_t chan_ctl_ga100;
	struct runlist_iter rl_iter;
	loff_t pos = 0;
	struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)];
	// Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec
	int err = kstrtou32_from_user(buffer, count, 0, &target_tsgid);
	if (err)
		return err;

	if (target_tsgid > MAX_TSGID)
		return -ERANGE;

	// (Ab)use the PDE_DATA field for the runlist ID
	target_runlist = file2gpuidx(f);

	if ((err = get_runlist_iter(g, target_runlist, &rl_iter)))
		return err;

	// On Ampere, TSG and Channel IDs are only unique per-runlist, so we need
	// to pull the per-runlist copy of Channel RAM.
	if (g->chip_id >= NV_CHIP_ID_AMPERE) {
		uint32_t runlist_regs_base;
		runlist_channel_config_t chan_config;
		if ((err = get_runlist_ram(g, target_runlist, &runlist_regs_base)))
			return err;
		// Channel RAM is subsidiary to Runlist RAM (ie. per-runlist) on Ampere
		if ((chan_config.raw = nvdebug_readl(g, runlist_regs_base + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1)
			return -EIO;
		channel_regs_base = (uint32_t)chan_config.bar0_offset << 4;
	}

	// Iterate through all TSGs. `pos` counts runlist entries consumed so far
	// (each TSG entry plus its channels).
	while (pos < rl_iter.len) {
		// Channels of the target TSG get enabled; all others get disabled
		bool enable = false;
		if (tsgid(g, rl_iter.curr_entry) == target_tsgid)
			enable = true;

		// Either enable or disable all channels of each TSG, dependent on if
		// they are contained within the target TSG or not.
		for_chan_in_tsg(g, chan, rl_iter.curr_entry) {
			if (g->chip_id < NV_CHIP_ID_AMPERE) {
				// Read, update, write for PCCSR
				if ((chan_ctl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chid(g, chan)))) == -1)
					return -EIO;
				if (enable)
					chan_ctl.enable_set = true;
				else
					chan_ctl.enable_clear = true;
				nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(chid(g, chan)), chan_ctl.raw);
			} else {
				// Writing a 0 does nothing on Ampere+, so we can just write.
				// Setting is_write_one_clears_bits turns the write of
				// `enable` into a request to clear it.
				chan_ctl_ga100.raw = 0;
				chan_ctl_ga100.is_write_one_clears_bits = !enable;
				chan_ctl_ga100.enable = true;
				nvdebug_writel(g, channel_regs_base + sizeof(chan_ctl_ga100) * chid(g, chan), chan_ctl_ga100.raw);
			}
		}
		// Skip past this TSG's entry and all of its channel entries
		pos += 1 + tsg_length(g, rl_iter.curr_entry);
		rl_iter.curr_entry = next_tsg(g, rl_iter.curr_entry);

		// TODO: Fix the above for bare channels. Add "for_chan_until_tsg"?
		// (The loop treats every entry it lands on as a TSG.)
	}

	// Resubmit the runlist to ensure that changes to channel enablement are
	// picked up on Turing+ GPUs (channel enablements may not be otherwise).
	if (g->chip_id >= NV_CHIP_ID_TURING)
		if ((err = resubmit_runlist(g, target_runlist)))
			return err;

	// Trigger a runlist-level preempt to stop whatever was running, triggering
	// the runlist scheduler to select and run the next-enabled channel.
	if ((err = preempt_runlist(g, target_runlist)))
		return err;

	return count;
}

// File operations for the switch-to-TSG file (no read handler is provided)
struct file_operations switch_to_tsg_file_ops = {
	.write = switch_to_tsg_file_write,
	.llseek = default_llseek,
};