about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Benjamin Hadad IV <bh4@unc.edu> 2023-07-28 11:39:28 -0400
committer Benjamin Hadad IV <bh4@unc.edu> 2023-07-28 11:39:28 -0400
commit 845960fc1b15995fdbd6d61c384567652a150bc4 (patch)
tree 14aad318d8694c6266576a1d45ced0d8fc502a6d
parent 8a57aaeba41c43233c323d7e0fc8bf1a81ebc65e (diff)
Refactored various systems and debugged minor issues
- Added device_info_iter
- Merged functions in device_info_procfs.c
- Separated device_info data structs by version in nvdebug.h
- Fixed issue with device_info runlist ID data
-rw-r--r-- device_info_procfs.c 110
-rw-r--r-- nvdebug.h 34
-rw-r--r-- nvdebug_entry.c 6
3 files changed, 69 insertions, 81 deletions
diff --git a/device_info_procfs.c b/device_info_procfs.c
index 3cf4bc9..5ddf240 100644
--- a/device_info_procfs.c
+++ b/device_info_procfs.c
@@ -27,61 +27,57 @@ struct file_operations nvdebug_read_reg32_file_ops = {
27 .llseek = default_llseek, 27 .llseek = default_llseek,
28}; 28};
29 29
30typedef struct {
31 int total_entries;
32 int index;
33 int type_of_next_entry;
34} device_info_iter;
35
30//// ==v== PTOP_DEVICE_INFO ==v== //// 36//// ==v== PTOP_DEVICE_INFO ==v== ////
31 37
38static void* device_info_seq_start_backend(struct seq_file *s, loff_t *pos, int initial_entry_value, int total_entries) {
39 static device_info_iter idx;
40 // If start of sequence, reset `idx`
41 if (*pos == 0) {
42 idx.index = 0;
43 idx.type_of_next_entry = initial_entry_value;
44 }
45 idx.total_entries = total_entries;
46 // Number of possible info entries is fixed, and list is sparse
47 if (idx.index >= idx.total_entries)
48 return NULL;
49 return &idx;
50}
51
32// Called to start or resume a sequence. Prior to 4.19, *pos is unreliable. 52// Called to start or resume a sequence. Prior to 4.19, *pos is unreliable.
33// Initializes iterator `idx` state and returns it. Ends sequence on NULL. 53// Initializes iterator `idx` state and returns it. Ends sequence on NULL.
34static void* device_info_file_seq_start_previous(struct seq_file *s, loff_t *pos) { 54static void* device_info_file_seq_start_previous(struct seq_file *s, loff_t *pos) {
35 static int idx; 55 return device_info_seq_start_backend(s, pos, -1, NV_PTOP_DEVICE_INFO__SIZE_1_PREVIOUS);
36 // If start of sequence, reset `idx`
37 if (*pos == 0)
38 idx = 0;
39 // Number of possible info entries is fixed, and list is sparse
40 if (idx >= NV_PTOP_DEVICE_INFO__SIZE_1_PREVIOUS)
41 return NULL;
42 return &idx;
43} 56}
44 57
45static void* device_info_file_seq_start_ampere(struct seq_file *s, loff_t *pos) { 58static void* device_info_file_seq_start_ampere(struct seq_file *s, loff_t *pos) {
46 static int idx; 59 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)];
47 // If start of sequence, reset `idx` 60 return device_info_seq_start_backend(s, pos, 0, NV_PTOP_DEVICE_INFO__SIZE_1_AMPERE(g));
48 if (*pos == 0)
49 idx = 0;
50 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)];
51 // Number of possible info entries is fixed, and list is sparse
52 if (idx >= NV_PTOP_DEVICE_INFO__SIZE_1_AMPERE(g))
53 return NULL;
54 return &idx;
55} 61}
56 62
57// Steps to next record. Returns new value of `idx`. 63// Steps to next record. Returns new value of `idx`.
58// Calls show() on non-NULL return 64// Calls show() on non-NULL return
59static void* device_info_file_seq_next_previous(struct seq_file *s, void *idx, 65static void* device_info_file_seq_next(struct seq_file *s, void *idx,
60 loff_t *pos) { 66 loff_t *pos) {
67 device_info_iter *idx_iter = (device_info_iter*)idx;
61 (*pos)++; // Required by seq interface 68 (*pos)++; // Required by seq interface
62 // Number of possible info entries is fixed, and list is sparse 69 // Number of possible info entries is fixed, and list is sparse
63 if ((*(int*)idx)++ >= NV_PTOP_DEVICE_INFO__SIZE_1_PREVIOUS) 70 if (idx_iter->index++ >= idx_iter->total_entries)
64 return NULL; 71 return NULL;
65 return idx; 72 return idx;
66} 73}
67 74
68// Steps to next record. Returns new value of `idx`.
69// Calls show() on non-NULL return
70static void* device_info_file_seq_next_ampere(struct seq_file *s, void *idx,
71 loff_t *pos) {
72 (*pos)++; // Required by seq interface
73 // Number of possible info entries is fixed, and list is sparse
74 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)];
75 if ((*(int*)idx)++ >= NV_PTOP_DEVICE_INFO__SIZE_1_AMPERE(g))
76 return NULL;
77 return idx;
78}
79
80// Print info at index *idx. Returns non-zero on error. 75// Print info at index *idx. Returns non-zero on error.
81static int device_info_file_seq_show_previous(struct seq_file *s, void *idx) { 76static int device_info_file_seq_show_previous(struct seq_file *s, void *idx) {
82 ptop_device_info_t curr_info; 77 device_info_iter *idx_iter = (device_info_iter*)idx;
78 ptop_device_info_previous_t curr_info;
83 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)]; 79 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)];
84 curr_info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_PREVIOUS(*(int*)idx)); 80 curr_info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_PREVIOUS(idx_iter->index));
85 // Check for read errors 81 // Check for read errors
86 if (curr_info.raw == -1) 82 if (curr_info.raw == -1)
87 return -EIO; 83 return -EIO;
@@ -129,54 +125,42 @@ static int device_info_file_seq_show_previous(struct seq_file *s, void *idx) {
129 125
130// Print info at index *idx for Ampere GPUs. Returns non-zero on error. 126// Print info at index *idx for Ampere GPUs. Returns non-zero on error.
131static int device_info_file_seq_show_ampere(struct seq_file *s, void *idx) { 127static int device_info_file_seq_show_ampere(struct seq_file *s, void *idx) {
132 ptop_device_info_t curr_info; 128 device_info_iter *idx_iter = (device_info_iter*)idx;
129 ptop_device_info_ampere_t curr_info;
133 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)]; 130 struct nvdebug_state *g = &g_nvdebug_state[seq2gpuidx(s)];
134 curr_info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_AMPERE(*(int*)idx)); 131 curr_info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_AMPERE(idx_iter->index));
135 // Check for read errors 132 // Check for read errors
136 if (curr_info.raw == -1) 133 if (curr_info.raw == -1)
137 return -EIO; 134 return -EIO;
138 // The info_type field is not available in the Ampere device_info data, so it must be inferred 135 // The info_type field is not available in the Ampere device_info data, so it must be inferred
139 int info_type = -1; 136 int info_type = curr_info.raw ? idx_iter->type_of_next_entry : -1;
140 if(curr_info.raw) {
141 for(int i = 0; i < NV_PTOP_DEVICE_INFO_TYPE_COUNT; i++) {
142 if(*(int*)idx == i) {
143 info_type = i;
144 break;
145 }
146 ptop_device_info_t prev_info;
147 prev_info.raw = nvdebug_readl(g, NV_PTOP_DEVICE_INFO_AMPERE(*(int*)idx - i - 1));
148 if(!prev_info.raw || !prev_info.has_next_entry_ampere) {
149 info_type = i;
150 break;
151 }
152 }
153 }
154 // Parse and print the data 137 // Parse and print the data
155 switch(info_type) { 138 switch(info_type) {
156 case 0: 139 case 0:
157 seq_printf(s, "| instance %d\n", curr_info.inst_id_ampere); 140 seq_printf(s, "| instance %d\n", curr_info.inst_id);
158 seq_printf(s, "| Fault ID: %3d\n", curr_info.fault_id_ampere); 141 seq_printf(s, "| Fault ID: %3d\n", curr_info.fault_id);
159 seq_printf(s, "| Engine Type: %2d (", curr_info.engine_type_ampere); 142 seq_printf(s, "| Engine Type: %2d (", curr_info.engine_type);
160 if (curr_info.engine_type_ampere < ENGINE_TYPES_LEN) 143 if (curr_info.engine_type < ENGINE_TYPES_LEN)
161 seq_printf(s, "%s)\n", ENGINE_TYPES_NAMES[curr_info.engine_type_ampere]); 144 seq_printf(s, "%s)\n", ENGINE_TYPES_NAMES[curr_info.engine_type]);
162 else 145 else
163 seq_printf(s, "Unknown Engine, introduced post-Ampere)\n"); 146 seq_printf(s, "Unknown Engine, introduced post-Ampere)\n");
164 break; 147 break;
165 case 1: 148 case 1:
166 seq_printf(s, "| BAR0 Base %#.8x\n", curr_info.pri_base_ampere << 12); 149 seq_printf(s, "| BAR0 Base %#.8x\n", curr_info.pri_base << 12);
167 seq_printf(s, "| Reset ID: %2d\n", curr_info.reset_enum_ampere); 150 seq_printf(s, "| Reset ID: %2d\n", curr_info.reset_enum);
168 break; 151 break;
169 case 2: 152 case 2:
170 seq_printf(s, "| Host's Engine ID: %2d\n", curr_info.engine_enum_ampere); 153 seq_printf(s, "| Host's Engine ID: %2d\n", curr_info.engine_enum);
171 seq_printf(s, "| Runlist ID: %2d\n", curr_info.runlist_enum_ampere); 154 seq_printf(s, "| Runlist ID: %2d\n", curr_info.runlist_enum);
172 break; 155 break;
173 default: 156 default:
174 // Device info records are sparse, so skip unset or unknown ones 157 // Device info records are sparse, so skip unset or unknown ones
175 return 0; 158 return 0;
176 } 159 }
177 160 if(info_type != -1) idx_iter->type_of_next_entry++;
178 // Draw a line between each device entry 161 // Draw a line between each device entry
179 if (!curr_info.has_next_entry_ampere) { 162 if (!curr_info.has_next_entry) {
163 idx_iter->type_of_next_entry = 0;
180 seq_printf(s, "+---------------------+\n"); 164 seq_printf(s, "+---------------------+\n");
181 } 165 }
182 return 0; 166 return 0;
@@ -189,14 +173,14 @@ static void device_info_file_seq_stop(struct seq_file *s, void *idx) {
189 173
190static const struct seq_operations device_info_file_seq_ops_previous = { 174static const struct seq_operations device_info_file_seq_ops_previous = {
191 .start = device_info_file_seq_start_previous, 175 .start = device_info_file_seq_start_previous,
192 .next = device_info_file_seq_next_previous, 176 .next = device_info_file_seq_next,
193 .stop = device_info_file_seq_stop, 177 .stop = device_info_file_seq_stop,
194 .show = device_info_file_seq_show_previous, 178 .show = device_info_file_seq_show_previous,
195}; 179};
196 180
197static const struct seq_operations device_info_file_seq_ops_ampere = { 181static const struct seq_operations device_info_file_seq_ops_ampere = {
198 .start = device_info_file_seq_start_ampere, 182 .start = device_info_file_seq_start_ampere,
199 .next = device_info_file_seq_next_ampere, 183 .next = device_info_file_seq_next,
200 .stop = device_info_file_seq_stop, 184 .stop = device_info_file_seq_stop,
201 .show = device_info_file_seq_show_ampere, 185 .show = device_info_file_seq_show_ampere,
202}; 186};
diff --git a/nvdebug.h b/nvdebug.h
index d6b7358..c45e460 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -567,25 +567,29 @@ static const char* const ENGINE_TYPES_NAMES[ENGINE_TYPES_LEN] = {
567#define NV_PTOP_DEVICE_INFO_TYPE_COUNT 3 567#define NV_PTOP_DEVICE_INFO_TYPE_COUNT 3
568typedef union { 568typedef union {
569 struct { 569 struct {
570 uint32_t fault_id_ampere:7; 570 uint32_t fault_id:7;
571 uint32_t padding0_ampere:9; 571 uint32_t padding0:9;
572 uint32_t inst_id_ampere:4; 572 uint32_t inst_id:4;
573 uint32_t padding1_ampere:4; 573 uint32_t padding1:4;
574 enum ENGINE_TYPES engine_type_ampere:7; 574 enum ENGINE_TYPES engine_type:7;
575 bool has_next_entry_ampere:1; 575 bool has_next_entry:1;
576 } __attribute__((packed)); 576 } __attribute__((packed));
577 struct { 577 struct {
578 uint32_t reset_enum_ampere:5; 578 uint32_t reset_enum:5;
579 uint32_t padding2_ampere:7; 579 uint32_t padding2:7;
580 uint32_t pri_base_ampere:12; 580 uint32_t pri_base:12;
581 uint32_t padding3_ampere:8; 581 uint32_t padding3:8;
582 } __attribute__((packed)); 582 } __attribute__((packed));
583 struct { 583 struct {
584 uint32_t engine_enum_ampere:2; 584 uint32_t engine_enum:2;
585 uint32_t padding4_ampere:4; 585 uint32_t padding4:8;
586 uint32_t runlist_enum_ampere:14; 586 uint32_t runlist_enum:14;
587 uint32_t padding5_ampere:12; 587 uint32_t padding5:8;
588 } __attribute__((packed)); 588 } __attribute__((packed));
589 uint32_t raw;
590} ptop_device_info_ampere_t;
591
592typedef union {
589 // DATA type fields 593 // DATA type fields
590 struct { 594 struct {
591 enum DEVICE_INFO_TYPE info_type:2; 595 enum DEVICE_INFO_TYPE info_type:2;
@@ -622,7 +626,7 @@ typedef union {
622 uint32_t padding9:1; 626 uint32_t padding9:1;
623 } __attribute__((packed)); 627 } __attribute__((packed));
624 uint32_t raw; 628 uint32_t raw;
625} ptop_device_info_t; 629} ptop_device_info_previous_t;
626 630
627#define NV_PTOP_SCAL_NUM_GPCS 0x00022430 631#define NV_PTOP_SCAL_NUM_GPCS 0x00022430
628#define NV_PTOP_SCAL_NUM_TPC_PER_GPC 0x00022434 632#define NV_PTOP_SCAL_NUM_TPC_PER_GPC 0x00022434
diff --git a/nvdebug_entry.c b/nvdebug_entry.c
index 3dfe1e8..d3d934e 100644
--- a/nvdebug_entry.c
+++ b/nvdebug_entry.c
@@ -142,7 +142,7 @@ int probe_and_cache_device(void) {
142 142
143// Create files `/proc/gpu#/runlist#`, world readable 143// Create files `/proc/gpu#/runlist#`, world readable
144int create_runlist_files_previous(int device_id, struct proc_dir_entry *dir) { 144int create_runlist_files_previous(int device_id, struct proc_dir_entry *dir) {
145 ptop_device_info_t info; 145 ptop_device_info_previous_t info;
146 struct proc_dir_entry *rl_entry; 146 struct proc_dir_entry *rl_entry;
147 int i, rl_id; 147 int i, rl_id;
148 char runlist_name[12]; 148 char runlist_name[12];
@@ -172,7 +172,7 @@ int create_runlist_files_previous(int device_id, struct proc_dir_entry *dir) {
172 172
173// Create files `/proc/gpu#/runlist#`, world readable 173// Create files `/proc/gpu#/runlist#`, world readable
174int create_runlist_files_ampere(int device_id, struct proc_dir_entry *dir) { 174int create_runlist_files_ampere(int device_id, struct proc_dir_entry *dir) {
175 ptop_device_info_t info; 175 ptop_device_info_ampere_t info;
176 struct proc_dir_entry *rl_entry; 176 struct proc_dir_entry *rl_entry;
177 int i, rl_id; 177 int i, rl_id;
178 char runlist_name[12]; 178 char runlist_name[12];
@@ -182,7 +182,7 @@ int create_runlist_files_ampere(int device_id, struct proc_dir_entry *dir) {
182 // to find the highest-valued one and add 1 to get the number of runlists. 182 // to find the highest-valued one and add 1 to get the number of runlists.
183 for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_AMPERE(&g_nvdebug_state[device_id]); i++) { 183 for (i = 0; i < NV_PTOP_DEVICE_INFO__SIZE_1_AMPERE(&g_nvdebug_state[device_id]); i++) {
184 info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO_AMPERE(i)); 184 info.raw = nvdebug_readl(&g_nvdebug_state[device_id], NV_PTOP_DEVICE_INFO_AMPERE(i));
185 if (info.runlist_enum_ampere > max_rl_id) 185 if (info.runlist_enum > max_rl_id)
186 max_rl_id = info.runlist_enum; 186 max_rl_id = info.runlist_enum;
187 } 187 }
188 // Create files to read each runlist. The read handling code looks at the 188 // Create files to read each runlist. The read handling code looks at the