#include // For seq_* functions and types #include // Macros to detect kernel version #include "nvdebug_linux.h" // Uncomment to expand channel status information when printing the runlist #define DETAILED_CHANNEL_INFO #ifdef DETAILED_CHANNEL_INFO /* Print channel details using PCCSR (Programmable Channel Control System RAM?) @param s Pointer to state from seq_file subsystem to pass to seq_printf @param g Pointer to our internal GPU state @param chid ID of channel to print details on, range [0, 512) @param prefix Text string to prefix each line with, or empty string */ static int runlist_detail_seq_show_chan(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix) { channel_ctrl_t chan; uint64_t instance_ptr; if ((chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chid))) == -1) return -EIO; instance_ptr = (uint64_t)chan.inst_ptr << 12; // Don't print write-only fields seq_printf(s, "%s|= Channel Info ======|\n", prefix); seq_printf(s, "%s| Enabled: %d|\n", prefix, chan.enable); seq_printf(s, "%s| Next: %d|\n", prefix, chan.next); seq_printf(s, "%s| PBDMA Faulted: %d|\n", prefix, chan.pbdma_faulted); seq_printf(s, "%s| ENG Faulted: %d|\n", prefix, chan.eng_faulted); seq_printf(s, "%s| Status: %2d|\n", prefix, chan.status); seq_printf(s, "%s| Busy: %d|\n", prefix, chan.busy); seq_printf(s, "%s| Instance PTR: |\n", prefix); seq_printf(s, "%s| %#018llx|\n", prefix, instance_ptr); seq_printf(s, "%s| %20s|\n", prefix, target_to_text(chan.inst_target)); seq_printf(s, "%s| Instance bound: %d|\n", prefix, chan.inst_bind); return 0; } /* `runlist_detail_seq_show_chan()`, but for Ampere+ @param runlist_pri_base Base of the RLRAM region for this runlist `runlist_pri_base` is necessary, since Channel RAM is now per-runlist on Ampere+, and its location is configured in Runlist RAM. */ static int runlist_detail_seq_show_chan_ga100(struct seq_file *s, struct nvdebug_state *g, uint32_t chid, char *prefix, uint32_t runlist_pri_base) { runlist_channel_config_t channel_config; channel_ctrl_ga100_t chan; // Channel RAM is subsidiary to Runlist RAM (ie. per-runlist) on Ampere+ if ((channel_config.raw = nvdebug_readl(g, runlist_pri_base + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1) return -EIO; if ((chan.raw = nvdebug_readl(g, (((uint32_t)channel_config.bar0_offset << 4) + chid * 4))) == -1) return -EIO; seq_printf(s, "%s|= Channel Info ======|\n", prefix); seq_printf(s, "%s| Enabled: %d|\n", prefix, chan.enable); seq_printf(s, "%s| Next: %d|\n", prefix, chan.next); seq_printf(s, "%s| Busy: %d|\n", prefix, chan.busy); seq_printf(s, "%s| PBDMA Faulted: %d|\n", prefix, chan.pbdma_faulted); seq_printf(s, "%s| ENG Faulted: %d|\n", prefix, chan.eng_faulted); seq_printf(s, "%s| On PBDMA: %d|\n", prefix, chan.on_pbdma); seq_printf(s, "%s| On ENG: %d|\n", prefix, chan.on_eng); seq_printf(s, "%s| Pending: %d|\n", prefix, chan.pending); seq_printf(s, "%s| CTX Reload: %d|\n", prefix, chan.ctx_reload); seq_printf(s, "%s| PBDMA Busy: %d|\n", prefix, chan.pbdma_busy); seq_printf(s, "%s| ENG Busy: %d|\n", prefix, chan.eng_busy); seq_printf(s, "%s| Acquire Fail: %d|\n", prefix, chan.acquire_fail); return 0; } #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0) // Bug workaround. See comment in runlist_file_seq_start() static loff_t pos_fixup; #endif static void *runlist_file_seq_start(struct seq_file *s, loff_t *pos) { static struct runlist_iter rl_iter; struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)]; // *pos == 0 for first call after read of file if (*pos == 0) { int err = get_runlist_iter(g, seq2gpuidx(s), &rl_iter); if (err) return ERR_PTR(err); // Don't try to print an empty runlist if (rl_iter.len <= 0) return NULL; return &rl_iter; } // If we're resuming an earlier print if (*pos < rl_iter.len) { #if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0) // There's a nasty bug prior to 4.19-rc1 that if the buffer overflows, the // last update to `pos` is not saved. Work around that here by reloading a // saved copy of `pos`. if (!pos_fixup) return NULL; *pos = pos_fixup; #endif return &rl_iter; } // When called with *pos != 0, we already traversed the runlist return NULL; } static void* runlist_file_seq_next(struct seq_file *s, void *raw_rl_iter, loff_t *pos) { struct runlist_iter* rl_iter = raw_rl_iter; void *ret = NULL; struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)]; // Advance by one TSG or channel (*pos)++; rl_iter->curr_entry += NV_RL_ENTRY_SIZE(g); // Verify we haven't reached the end of the runlist // len is the num of tsg entries + total num of channel entries if (*pos < rl_iter->len) { ret = rl_iter; } #if LINUX_VERSION_CODE < KERNEL_VERSION(4,19,0) // Bug workaround. See comment in runlist_file_seq_start() pos_fixup = ret ? *pos : 0; #endif if (rl_iter->entries_left_in_tsg) rl_iter->entries_left_in_tsg--; return ret; } static void runlist_file_seq_stop(struct seq_file *s, void *raw_rl_iter) { // No cleanup needed } // _show() must be idempotent. This function will be rerun if the seq_printf // buffer was too small. static int runlist_file_seq_show(struct seq_file *s, void *raw_rl_iter) { struct runlist_iter *rl_iter = raw_rl_iter; void *entry = rl_iter->curr_entry; struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(s->file)]; if (entry_type(g, entry) == ENTRY_TYPE_TSG) { if (rl_iter->entries_left_in_tsg) { printk(KERN_WARNING "[nvdebug] Found TSG ID%d @ %px when %d channels were still expected under the previous TSG in the runlist!\n", tsgid(g, entry), entry, rl_iter->entries_left_in_tsg); while (rl_iter->entries_left_in_tsg--) seq_printf(s, "[missing channel]\n"); } rl_iter->entries_left_in_tsg = tsg_length(g, entry) + 1; seq_printf(s, "+---- TSG Entry %-3d---+\n", tsgid(g, entry)); seq_printf(s, "| Scale: %-13d|\n", timeslice_scale(g, entry)); seq_printf(s, "| Timeout: %-11d|\n", timeslice_timeout(g, entry)); seq_printf(s, "| Length: %-12d|\n", tsg_length(g, entry)); seq_printf(s, "+---------------------+\n"); } else { char *indt = ""; u64 instance_ptr = 0; if (rl_iter->entries_left_in_tsg) indt = " "; // Reconstruct pointer to channel instance block if (g->chip_id >= NV_CHIP_ID_VOLTA) { instance_ptr = ((struct gv100_runlist_chan*)entry)->inst_ptr_hi; instance_ptr <<= 32; } instance_ptr |= inst_ptr_lo(g, entry) << 12; // Print channel information from runlist seq_printf(s, "%s+- Channel Entry %-4d-+\n", indt, chid(g, entry)); if (g->chip_id >= NV_CHIP_ID_VOLTA) seq_printf(s, "%s| Runqueue Selector: %d|\n", indt, ((struct gv100_runlist_chan*)entry)->runqueue_selector); // Not populated on Kepler [ex: gk104 in Bonham (Quadro K5000)], and // populated but unused on Pascal [ex: gp104 in Bonham (GTX 1080 Ti)]. // (The aperture field may be incorrectly populated as INVALID, but the // context still works on the aformentioned Pascal GPU.) seq_printf(s, "%s| Instance PTR: |\n", indt); seq_printf(s, "%s| %#018llx|\n", indt, instance_ptr); seq_printf(s, "%s| %20s|\n", indt, target_to_text(inst_target(g, entry))); #ifdef DETAILED_CHANNEL_INFO // Print channel info from PCCSR/Channel RAM and the instance block if (g->chip_id < NV_CHIP_ID_AMPERE) runlist_detail_seq_show_chan(s, g, chid(g, entry), indt); else runlist_detail_seq_show_chan_ga100(s, g, chid(g, entry), indt, rl_iter->runlist_pri_base); #endif seq_printf(s, "%s+---------------------+\n", indt); } return 0; } static const struct seq_operations runlist_file_seq_ops = { .start = runlist_file_seq_start, .next = runlist_file_seq_next, .stop = runlist_file_seq_stop, .show = runlist_file_seq_show, }; static int runlist_file_open(struct inode *inode, struct file *f) { return seq_open(f, &runlist_file_seq_ops); } struct file_operations runlist_file_ops = { .open = runlist_file_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; ssize_t preempt_tsg_file_write(struct file *f, const char __user *buffer, size_t count, loff_t *off) { uint32_t target_tsgid, target_runlist_ram; struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec int err = kstrtou32_from_user(buffer, count, 0, &target_tsgid); if (err) return err; // TSG IDs are a 12-bit field, so make sure the request is in-range if (target_tsgid > MAX_TSGID) return -ERANGE; // (Ab)use the PDE_DATA field for the index into which Runlist RAM this TSG // ID is scoped to (only applicable on Ampere+) if (g->chip_id >= NV_CHIP_ID_AMPERE) target_runlist_ram = file2gpuidx(f); else target_runlist_ram = 0; // Execute preemption if ((err = preempt_tsg(g, target_runlist_ram, target_tsgid))) return err; return count; } struct file_operations preempt_tsg_file_ops = { .write = preempt_tsg_file_write, .llseek = default_llseek, }; ssize_t resubmit_runlist_file_write(struct file *f, const char __user *buffer, size_t count, loff_t *off) { uint32_t target_runlist; struct nvdebug_state *g = &g_nvdebug_state[file2gpuidx(f)]; // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec int err = kstrtou32_from_user(buffer, count, 0, &target_runlist); if (err) return err; // resubmit_runlist() checks that target_runlist is valid if ((err = resubmit_runlist(g, target_runlist))) return err; return count; } struct file_operations resubmit_runlist_file_ops = { .write = resubmit_runlist_file_write, .llseek = default_llseek, }; ssize_t disable_channel_file_write(struct file *f, const char __user *buffer, size_t count, loff_t *off) { uint32_t target_channel; struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec int err = kstrtou32_from_user(buffer, count, 0, &target_channel); if (err) return err; if (g->chip_id < NV_CHIP_ID_AMPERE) { channel_ctrl_t chan; if (target_channel > MAX_CHID) return -ERANGE; // Read current configuration if ((chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel))) == -1) return -EIO; // Request disablement chan.enable_clear = true; nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(target_channel), chan.raw); } else { uint32_t runlist_reg_base, chram_base, channel_max; runlist_channel_config_t channel_config; channel_ctrl_ga100_t chan; // (Ab)use the PDE_DATA field for the runlist ID if ((err = get_runlist_ram(g, file2gpuidx(f), &runlist_reg_base))) return err; // Channel RAM is subsidiary to Runlist RAM (ie. per-runlist) on Ampere if ((channel_config.raw = nvdebug_readl(g, runlist_reg_base + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1) return -EIO; channel_max = 1u << channel_config.num_channels_log2; if (target_channel >= channel_max) return -ERANGE; chram_base = (uint32_t)channel_config.bar0_offset << 4; // Writing zeros to any field of the Ampere+ channel control structure // does nothing, so don't bother to read the structure first, and just // write zeros to all the fields we don't care about. chan.raw = 0; chan.is_write_one_clears_bits = 1; // Invert meaning of writing 1 chan.enable = 1; nvdebug_writel(g, chram_base + sizeof(channel_ctrl_ga100_t) * target_channel, chan.raw); } return count; } struct file_operations disable_channel_file_ops = { .write = disable_channel_file_write, .llseek = default_llseek, }; ssize_t enable_channel_file_write(struct file *f, const char __user *buffer, size_t count, loff_t *off) { uint32_t target_channel; struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec int err = kstrtou32_from_user(buffer, count, 0, &target_channel); if (err) return err; if (g->chip_id < NV_CHIP_ID_AMPERE) { channel_ctrl_t chan; if (target_channel > MAX_CHID) return -ERANGE; // Read current configuration if ((chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(target_channel))) == -1) return -EIO; // Disable channel chan.enable_set = true; nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(target_channel), chan.raw); } else { uint32_t runlist_reg_base, chram_base, channel_max; runlist_channel_config_t channel_config; channel_ctrl_ga100_t chan; // (Ab)use the PDE_DATA field for the runlist ID if ((err = get_runlist_ram(g, file2gpuidx(f), &runlist_reg_base))) return err; // Channel RAM is subsidiary to Runlist RAM (ie. per-runlist) on Ampere if ((channel_config.raw = nvdebug_readl(g, runlist_reg_base + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1) return -EIO; channel_max = 1u << channel_config.num_channels_log2; if (target_channel >= channel_max) return -ERANGE; chram_base = (uint32_t)channel_config.bar0_offset << 4; // Writing zeros to any field of the Ampere+ channel control structure // does nothing, so don't bother to read the structure first, and just // write zeros to all the fields we don't care about. chan.raw = 0; chan.enable = 1; nvdebug_writel(g, chram_base + sizeof(channel_ctrl_ga100_t) * target_channel, chan.raw); } return count; } struct file_operations enable_channel_file_ops = { .write = enable_channel_file_write, .llseek = default_llseek, }; // Tested working on Pascal (gp106) through Ada (ad102) ssize_t switch_to_tsg_file_write(struct file *f, const char __user *buffer, size_t count, loff_t *off) { uint32_t target_tsgid, target_runlist, channel_regs_base; struct gv100_runlist_chan* chan; channel_ctrl_t chan_ctl; channel_ctrl_ga100_t chan_ctl_ga100; struct runlist_iter rl_iter; loff_t pos = 0; struct nvdebug_state *g = &g_nvdebug_state[file2parentgpuidx(f)]; // Passing 0 as the base to kstrtou32 indicates autodetect hex/octal/dec int err = kstrtou32_from_user(buffer, count, 0, &target_tsgid); if (err) return err; if (target_tsgid > MAX_TSGID) return -ERANGE; // (Ab)use the PDE_DATA field for the runlist ID target_runlist = file2gpuidx(f); if ((err = get_runlist_iter(g, target_runlist, &rl_iter))) return err; // On Ampere, TSG and Channel IDs are only unique per-runlist, so we need // to pull the per-runlist copy of Channel RAM. if (g->chip_id >= NV_CHIP_ID_AMPERE) { uint32_t runlist_regs_base; runlist_channel_config_t chan_config; if ((err = get_runlist_ram(g, target_runlist, &runlist_regs_base))) return err; // Channel RAM is subsidiary to Runlist RAM (ie. per-runlist) on Ampere if ((chan_config.raw = nvdebug_readl(g, runlist_regs_base + NV_RUNLIST_CHANNEL_CONFIG_GA100)) == -1) return -EIO; channel_regs_base = (uint32_t)chan_config.bar0_offset << 4; } // Iterate through all TSGs while (pos < rl_iter.len) { bool enable = false; if (tsgid(g, rl_iter.curr_entry) == target_tsgid) enable = true; // Either enable or disable all channels of each TSG, dependent on if // they are contained within the target TSG or not. for_chan_in_tsg(g, chan, rl_iter.curr_entry) { if (g->chip_id < NV_CHIP_ID_AMPERE) { // Read, update, write for PCCSR if ((chan_ctl.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chid(g, chan)))) == -1) return -EIO; if (enable) chan_ctl.enable_set = true; else chan_ctl.enable_clear = true; nvdebug_writeq(g, NV_PCCSR_CHANNEL_INST(chid(g, chan)), chan_ctl.raw); } else { // Writing a 0 does nothing on Ampere+, so we can just write chan_ctl_ga100.raw = 0; chan_ctl_ga100.is_write_one_clears_bits = !enable; chan_ctl_ga100.enable = true; nvdebug_writel(g, channel_regs_base + sizeof(chan_ctl_ga100) * chid(g, chan), chan_ctl_ga100.raw); } } pos += 1 + tsg_length(g, rl_iter.curr_entry); rl_iter.curr_entry = next_tsg(g, rl_iter.curr_entry); // TODO: Fix the above for bare channels. Add "for_chan_until_tsg"? } // Resubmit the runlist to ensure that changes to channel enablement are // picked up on Turing+ GPUs (channel enablements may not be otherwise). if (g->chip_id >= NV_CHIP_ID_TURING) if ((err = resubmit_runlist(g, target_runlist))) return err; // Trigger a runlist-level preempt to stop whatever was running, triggering // the runlist scheduler to select and run the next-enabled channel. if ((err = preempt_runlist(g, target_runlist))) return err; return count; } struct file_operations switch_to_tsg_file_ops = { .write = switch_to_tsg_file_write, .llseek = default_llseek, };