about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Benjamin Hadad IV <bh4@unc.edu> 2023-07-19 13:02:52 -0400
committer Benjamin Hadad IV <bh4@unc.edu> 2023-07-19 13:02:52 -0400
commit 33c915f08f5dc63674b158ecc18897494256a6d0 (patch)
tree 917bfa0e6b3b5f482ddb3180f40d2dac9b6dfed1
parent bfb4dcf0e78954c0163f3a06a5a088c4d1b437a8 (diff)
Debugged device_info functionality
- Fixed device_info crash bugs
- Made further edits to display functionality
- Refactored code to enhance readability
-rw-r--r-- Makefile 1
-rw-r--r-- device_info_procfs.c 53
-rw-r--r-- nvdebug.h 4
-rw-r--r-- nvdebug_entry.c 4
4 files changed, 24 insertions, 38 deletions
diff --git a/Makefile b/Makefile
index 2dc90c7..8e32bd0 100644
--- a/Makefile
+++ b/Makefile
@@ -5,6 +5,7 @@ KBUILD_CFLAGS += -DGIT_HASH=\"$(shell git --git-dir=$(PWD)/.git rev-parse --shor
5 5
6# TODO: Avoid needing to distribute NVIDIA's headers (at least they're MIT...) 6# TODO: Avoid needing to distribute NVIDIA's headers (at least they're MIT...)
7ccflags-y += -I$(PWD)/include 7ccflags-y += -I$(PWD)/include
8ccflags-y += -std=gnu99
8#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu/include 9#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu/include
9#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu 10#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/drivers/gpu/nvgpu
10#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include 11#ccflags-y += -I/playpen/Linux_for_Tegra/source/public/kernel/nvgpu/include
diff --git a/device_info_procfs.c b/device_info_procfs.c
index b1e58b1..3cf4bc9 100644
--- a/device_info_procfs.c
+++ b/device_info_procfs.c
@@ -37,7 +37,7 @@ static void* device_info_file_seq_start_previous(struct seq_file *s, loff_t *pos
37 if (*pos == 0) 37 if (*pos == 0)
38 idx = 0; 38 idx = 0;
39 // Number of possible info entries is fixed, and list is sparse 39 // Number of possible info entries is fixed, and list is sparse
40 if (idx >= NV_PTOP_DEVICE_INFO__SIZE_1) 40 if (idx >= NV_PTOP_DEVICE_INFO__SIZE_1_PREVIOUS)
41 return NULL; 41 return NULL;
42 return &idx; 42 return &idx;
43} 43}
@@ -49,7 +49,7 @@ static void* device_info_file_seq_start_ampere(struct seq_file *s, loff_t *pos)
49 idx = 0; 49 idx = 0;
50 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)]; 50 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)];
51 // Number of possible info entries is fixed, and list is sparse 51 // Number of possible info entries is fixed, and list is sparse
52 if (idx >= (nvdebug_readl(g, 0x0224fc) >> 20)) 52 if (idx >= NV_PTOP_DEVICE_INFO__SIZE_1_AMPERE(g))
53 return NULL; 53 return NULL;
54 return &idx; 54 return &idx;
55} 55}
@@ -60,7 +60,7 @@ static void* device_info_file_seq_next_previous(struct seq_file *s, void *idx,
60 loff_t *pos) { 60 loff_t *pos) {
61 (*pos)++; // Required by seq interface 61 (*pos)++; // Required by seq interface
62 // Number of possible info entries is fixed, and list is sparse 62 // Number of possible info entries is fixed, and list is sparse
63 if ((*(int*)idx)++ >= NV_PTOP_DEVICE_INFO__SIZE_1) 63 if ((*(int*)idx)++ >= NV_PTOP_DEVICE_INFO__SIZE_1_PREVIOUS)
64 return NULL; 64 return NULL;
65 return idx; 65 return idx;
66} 66}
@@ -72,38 +72,11 @@ static void* device_info_file_seq_next_ampere(struct seq_file *s, void *idx,
72 (*pos)++; // Required by seq interface 72 (*pos)++; // Required by seq interface
73 // Number of possible info entries is fixed, and list is sparse 73 // Number of possible info entries is fixed, and list is sparse
74 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)]; 74 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)];
75 if ((*(int*)idx)++ >= (nvdebug_readl(g, 0x0224fc) >> 20)) 75 if ((*(int*)idx)++ >= NV_PTOP_DEVICE_INFO__SIZE_1_AMPERE(g))
76 return NULL; 76 return NULL;
77 return idx; 77 return idx;
78} 78}
79/* 79
80// Steps to next record on Ampere GPUs. Returns new value of `idx`.
81static void* device_info_file_seq_next_ampere(struct seq_file *s, void *idx,
82 loff_t *pos) {
83 (*pos)++; // Required by seq interface
84 // Number of possible info entries is fixed, and list is sparse
85 while(1) {
86 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)];
87 if ((*(int*)idx)++ >= (nvdebug_readl(g, 0x0224fc) >> 20))
88 return NULL;
89 ptop_device_info_t curr_info;
90 curr_info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_AMPERE(*(int*)idx));
91 if(!curr_info.raw && !*info_type) continue;
92 (*info_type)++;
93 break;
94 }
95 while(1) {
96 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)];
97 if ((*(int*)idx)++ >= (nvdebug_readl(g, 0x0224fc) >> 20))
98 return NULL;
99 ptop_device_info_t curr_info;
100 curr_info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_AMPERE(*(int*)idx));
101 if(curr_info.raw & 0x80000000) continue;
102 break;
103 }
104 return idx;
105}
106*/
107// Print info at index *idx. Returns non-zero on error. 80// Print info at index *idx. Returns non-zero on error.
108static int device_info_file_seq_show_previous(struct seq_file *s, void *idx) { 81static int device_info_file_seq_show_previous(struct seq_file *s, void *idx) {
109 ptop_device_info_t curr_info; 82 ptop_device_info_t curr_info;
@@ -162,11 +135,21 @@ static int device_info_file_seq_show_ampere(struct seq_file *s, void *idx) {
162 // Check for read errors 135 // Check for read errors
163 if (curr_info.raw == -1) 136 if (curr_info.raw == -1)
164 return -EIO; 137 return -EIO;
138 // The info_type field is not available in the Ampere device_info data, so it must be inferred
165 int info_type = -1; 139 int info_type = -1;
166 if(curr_info.raw) { 140 if(curr_info.raw) {
167 if(*(int*)idx < 1 || !nvdebug_readl(g, NV_PTOP_DEVICE_INFO_AMPERE(*(int*)idx) - 1)) info_type = 0; 141 for(int i = 0; i < NV_PTOP_DEVICE_INFO_TYPE_COUNT; i++) {
168 if(*(int*)idx < 2 || !nvdebug_readl(g, NV_PTOP_DEVICE_INFO_AMPERE(*(int*)idx) - 2)) info_type = 1; 142 if(*(int*)idx == i) {
169 if(*(int*)idx < 3 || !nvdebug_readl(g, NV_PTOP_DEVICE_INFO_AMPERE(*(int*)idx) - 3)) info_type = 2; 143 info_type = i;
144 break;
145 }
146 ptop_device_info_t prev_info;
147 prev_info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_AMPERE(*(int*)idx - i - 1));
148 if(!prev_info.raw || !prev_info.has_next_entry_ampere) {
149 info_type = i;
150 break;
151 }
152 }
170 } 153 }
171 // Parse and print the data 154 // Parse and print the data
172 switch(info_type) { 155 switch(info_type) {
diff --git a/nvdebug.h b/nvdebug.h
index 3ccdcfe..bd893aa 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -556,7 +556,9 @@ static const char* const ENGINE_TYPES_NAMES[ENGINE_TYPES_LEN] = {
556 556
557#define NV_PTOP_DEVICE_INFO_AMPERE(i) (0x00022800+(i)*4) 557#define NV_PTOP_DEVICE_INFO_AMPERE(i) (0x00022800+(i)*4)
558#define NV_PTOP_DEVICE_INFO_PREVIOUS(i) (0x00022700+(i)*4) 558#define NV_PTOP_DEVICE_INFO_PREVIOUS(i) (0x00022700+(i)*4)
559#define NV_PTOP_DEVICE_INFO__SIZE_1 64 559#define NV_PTOP_DEVICE_INFO__SIZE_1_AMPERE(g) (nvdebug_readl(g, 0x0224fc) >> 20)
560#define NV_PTOP_DEVICE_INFO__SIZE_1_PREVIOUS 64
561#define NV_PTOP_DEVICE_INFO_TYPE_COUNT 3
560typedef union { 562typedef union {
561 struct { 563 struct {
562 uint32_t fault_id_ampere:7; 564 uint32_t fault_id_ampere:7;
diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index d82c648..3dfe1e8 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -150,7 +150,7 @@ int create_runlist_files_previous(int device_id, struct proc_dir_entry *dir) {
150 // Figure out how many runlists there are by checking the device info 150 // Figure out how many runlists there are by checking the device info
151 // registers. Runlists are always numbered sequentially, so we just have 151 // registers. Runlists are always numbered sequentially, so we just have
152 // to find the highest-valued one and add 1 to get the number of runlists. 152 // to find the highest-valued one and add 1 to get the number of runlists.
153 for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1; i++) { 153 for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_PREVIOUS; i++) {
154 info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO_PREVIOUS(i)); 154 info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO_PREVIOUS(i));
155 if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid) 155 if (info.info_type != INFO_TYPE_ENUM || !info.runlist_is_valid)
156 continue; 156 continue;
@@ -180,7 +180,7 @@ int create_runlist_files_ampere(int device_id, struct proc_dir_entry *dir) {
180 // Figure out how many runlists there are by checking the device info 180 // Figure out how many runlists there are by checking the device info
181 // registers. Runlists are always numbered sequentially, so we just have 181 // registers. Runlists are always numbered sequentially, so we just have
182 // to find the highest-valued one and add 1 to get the number of runlists. 182 // to find the highest-valued one and add 1 to get the number of runlists.
183 for (i = 0; i < (nvdebug_readl(&g_nvdebug_state[device_id], 0x0224fc) >> 20); i++) { 183 for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_AMPERE(&g_nvdebug_state[device_id]); i++) {
184 info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO_AMPERE(i)); 184 info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO_AMPERE(i));
185 if (info.runlist_enum_ampere > max_rl_id) 185 if (info.runlist_enum_ampere > max_rl_id)
186 max_rl_id = info.runlist_enum; 186 max_rl_id = info.runlist_enum;