Diffstat (limited to 'drivers/gpu/nvgpu/gv100/gr_gv100.c')
 -rw-r--r--  drivers/gpu/nvgpu/gv100/gr_gv100.c | 349
 1 file changed, 349 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/gv100/gr_gv100.c b/drivers/gpu/nvgpu/gv100/gr_gv100.c
new file mode 100644
index 00000000..430c7cd0
--- /dev/null
+++ b/drivers/gpu/nvgpu/gv100/gr_gv100.c
@@ -0,0 +1,349 @@
/*
 * GV100 GPU GR
 *
 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <nvgpu/log.h>
#include <nvgpu/debug.h>
#include <nvgpu/enabled.h>

#include "gk20a/gk20a.h"
#include "gk20a/gr_gk20a.h"

#include "gv100/gr_gv100.h"
#include "gv11b/subctx_gv11b.h"

#include <nvgpu/hw/gv100/hw_gr_gv100.h>
#include <nvgpu/hw/gv100/hw_proj_gv100.h>

/*
 * Estimate performance if the given logical TPC in the given logical GPC were
 * removed.
 */
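/*
 * The returned *perf is a fixed-point score (scale_factor = 512, i.e. nine
 * fractional bits) built from three normalized terms packed into disjoint
 * bit ranges: minimum per-GPC pixel throughput in [29:20], world (PES)
 * throughput in [19:10] and TPC balance across GPCs in [9:0].
 * gr_gv100_init_sm_id_table() below uses this score to pick the order in
 * which logical TPCs are assigned SM IDs.
 */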
static int gr_gv100_scg_estimate_perf(struct gk20a *g,
					unsigned long *gpc_tpc_mask,
					u32 disable_gpc_id, u32 disable_tpc_id,
					int *perf)
{
	struct gr_gk20a *gr = &g->gr;
	int err = 0;
	u32 scale_factor = 512UL; /* Use fx23.9 */
	u32 pix_scale = 1024*1024UL; /* Pix perf in [29:20] */
	u32 world_scale = 1024UL; /* World performance in [19:10] */
	u32 tpc_scale = 1; /* TPC balancing in [9:0] */
	u32 scg_num_pes = 0;
	u32 min_scg_gpc_pix_perf = scale_factor; /* Init perf as maximum */
	u32 average_tpcs = 0; /* Average of # of TPCs per GPC */
	u32 deviation; /* absolute diff between TPC# and
			* average_tpcs, averaged across GPCs
			*/
	u32 norm_tpc_deviation; /* deviation/max_tpc_per_gpc */
	u32 tpc_balance;
	u32 scg_gpc_pix_perf;
	u32 scg_world_perf;
	u32 gpc_id;
	u32 pes_id;
	int diff;
	bool is_tpc_removed_gpc = false;
	bool is_tpc_removed_pes = false;
	u32 max_tpc_gpc = 0;
	u32 num_tpc_mask;
	u32 *num_tpc_gpc = nvgpu_kzalloc(g, sizeof(u32) *
			nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));

	if (!num_tpc_gpc)
		return -ENOMEM;

	/* Calculate pix-perf-reduction-rate per GPC and find bottleneck TPC */
	for (gpc_id = 0; gpc_id < gr->gpc_count; gpc_id++) {
		num_tpc_mask = gpc_tpc_mask[gpc_id];

		if ((gpc_id == disable_gpc_id) &&
		    (num_tpc_mask & (0x1 << disable_tpc_id))) {
			/* Safety check if a TPC is removed twice */
			if (is_tpc_removed_gpc) {
				err = -EINVAL;
				goto free_resources;
			}
			/* Remove logical TPC from set */
			num_tpc_mask &= ~(0x1 << disable_tpc_id);
			is_tpc_removed_gpc = true;
		}

		/* track balancing of tpcs across gpcs */
		num_tpc_gpc[gpc_id] = hweight32(num_tpc_mask);
		average_tpcs += num_tpc_gpc[gpc_id];

		/* save the maximum number of TPCs found in any one GPC */
		max_tpc_gpc = num_tpc_gpc[gpc_id] > max_tpc_gpc ?
			num_tpc_gpc[gpc_id] : max_tpc_gpc;

		/*
		 * Calculate the ratio between the post-SCG TPC count and the
		 * post-floorsweeping (post-FS) TPC count; this ratio
		 * represents the relative pixel throughput of the GPC.
		 */
		scg_gpc_pix_perf = scale_factor * num_tpc_gpc[gpc_id] /
					gr->gpc_tpc_count[gpc_id];

		if (min_scg_gpc_pix_perf > scg_gpc_pix_perf)
			min_scg_gpc_pix_perf = scg_gpc_pix_perf;

		/* Calculate # of surviving PES */
		for (pes_id = 0; pes_id < gr->gpc_ppc_count[gpc_id]; pes_id++) {
			/* Count the number of TPC on the set */
			num_tpc_mask = gr->pes_tpc_mask[pes_id][gpc_id] &
				gpc_tpc_mask[gpc_id];

			if ((gpc_id == disable_gpc_id) && (num_tpc_mask &
				(0x1 << disable_tpc_id))) {
				if (is_tpc_removed_pes) {
					err = -EINVAL;
					goto free_resources;
				}
				num_tpc_mask &= ~(0x1 << disable_tpc_id);
				is_tpc_removed_pes = true;
			}
			if (hweight32(num_tpc_mask))
				scg_num_pes++;
		}
	}

	if (!is_tpc_removed_gpc || !is_tpc_removed_pes) {
		err = -EINVAL;
		goto free_resources;
	}

	if (max_tpc_gpc == 0) {
		*perf = 0;
		goto free_resources;
	}

	/* Now calculate perf */
	scg_world_perf = (scale_factor * scg_num_pes) / gr->ppc_count;
	deviation = 0;
	average_tpcs = scale_factor * average_tpcs / gr->gpc_count;
	for (gpc_id = 0; gpc_id < gr->gpc_count; gpc_id++) {
		diff = average_tpcs - scale_factor * num_tpc_gpc[gpc_id];
		if (diff < 0)
			diff = -diff;
		deviation += diff;
	}

	deviation /= gr->gpc_count;

	norm_tpc_deviation = deviation / max_tpc_gpc;

	tpc_balance = scale_factor - norm_tpc_deviation;

	if ((tpc_balance > scale_factor) ||
	    (scg_world_perf > scale_factor) ||
	    (min_scg_gpc_pix_perf > scale_factor) ||
	    (norm_tpc_deviation > scale_factor)) {
		err = -EINVAL;
		goto free_resources;
	}

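	/*
	 * perf = pix_scale * min_scg_gpc_pix_perf +
	 *        world_scale * scg_world_perf +
	 *        tpc_scale * tpc_balance
	 *
	 * All three terms were verified to be <= scale_factor above, so each
	 * product stays inside its own bit range and a plain integer compare
	 * of two scores effectively ranks pixel perf first, world perf second
	 * and TPC balance last.
	 */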
	*perf = (pix_scale * min_scg_gpc_pix_perf) +
		(world_scale * scg_world_perf) +
		(tpc_scale * tpc_balance);
free_resources:
	nvgpu_kfree(g, num_tpc_gpc);
	return err;
}

void gr_gv100_bundle_cb_defaults(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;

	gr->bundle_cb_default_size =
		gr_scc_bundle_cb_size_div_256b__prod_v();
	gr->min_gpm_fifo_depth =
		gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
	gr->bundle_cb_token_limit =
		gr_pd_ab_dist_cfg2_token_limit_init_v();
}

void gr_gv100_cb_size_default(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;

	if (!gr->attrib_cb_default_size)
		gr->attrib_cb_default_size =
			gr_gpc0_ppc0_cbm_beta_cb_size_v_default_v();
	gr->alpha_cb_default_size =
		gr_gpc0_ppc0_cbm_alpha_cb_size_v_default_v();
}

void gr_gv100_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index)
{
}

void gr_gv100_init_sm_id_table(struct gk20a *g)
{
	u32 gpc, tpc, sm, pes, gtpc;
	u32 sm_id = 0;
	u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
	u32 num_sm = sm_per_tpc * g->gr.tpc_count;
	int perf, maxperf;
	int err;
	unsigned long *gpc_tpc_mask;
	u32 *tpc_table, *gpc_table;

	gpc_table = nvgpu_kzalloc(g, g->gr.tpc_count * sizeof(u32));
	tpc_table = nvgpu_kzalloc(g, g->gr.tpc_count * sizeof(u32));
	gpc_tpc_mask = nvgpu_kzalloc(g, sizeof(unsigned long) *
			nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));

	if (!gpc_table || !tpc_table || !gpc_tpc_mask) {
		nvgpu_err(g, "Error allocating memory for sm tables");
		goto exit_build_table;
	}

	for (gpc = 0; gpc < g->gr.gpc_count; gpc++)
		for (pes = 0; pes < g->gr.gpc_ppc_count[gpc]; pes++)
			gpc_tpc_mask[gpc] |= g->gr.pes_tpc_mask[pes][gpc];

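	/*
	 * Greedy placement: for each logical TPC slot, try every (gpc, tpc)
	 * still present in gpc_tpc_mask, estimate the perf that would remain
	 * if that TPC were removed, keep the candidate with the highest
	 * estimate, then clear it from the mask before filling the next slot.
	 */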
	for (gtpc = 0; gtpc < g->gr.tpc_count; gtpc++) {
		maxperf = -1;
		for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
			for_each_set_bit(tpc, &gpc_tpc_mask[gpc],
					 g->gr.gpc_tpc_count[gpc]) {
				perf = -1;
				err = gr_gv100_scg_estimate_perf(g,
						gpc_tpc_mask, gpc, tpc, &perf);

				if (err) {
					nvgpu_err(g,
						"Error while estimating perf");
					goto exit_build_table;
				}

				if (perf >= maxperf) {
					maxperf = perf;
					gpc_table[gtpc] = gpc;
					tpc_table[gtpc] = tpc;
				}
			}
		}
		gpc_tpc_mask[gpc_table[gtpc]] &= ~(0x1 << tpc_table[gtpc]);
	}

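	/*
	 * Each logical TPC chosen above gets sm_per_tpc consecutive SM IDs;
	 * record the resulting GPC/TPC/SM mapping in sm_to_cluster.
	 */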
	for (tpc = 0, sm_id = 0; sm_id < num_sm; tpc++, sm_id += sm_per_tpc) {
		for (sm = 0; sm < sm_per_tpc; sm++) {
			u32 index = sm_id + sm;

			g->gr.sm_to_cluster[index].gpc_index = gpc_table[tpc];
			g->gr.sm_to_cluster[index].tpc_index = tpc_table[tpc];
			g->gr.sm_to_cluster[index].sm_index = sm;
			g->gr.sm_to_cluster[index].global_tpc_index = tpc;
			nvgpu_log_info(g,
				"gpc : %d tpc %d sm_index %d global_index: %d",
				g->gr.sm_to_cluster[index].gpc_index,
				g->gr.sm_to_cluster[index].tpc_index,
				g->gr.sm_to_cluster[index].sm_index,
				g->gr.sm_to_cluster[index].global_tpc_index);
		}
	}

	g->gr.no_of_sm = num_sm;
	nvgpu_log_info(g, " total number of sm = %d", g->gr.no_of_sm);
exit_build_table:
	nvgpu_kfree(g, gpc_table);
	nvgpu_kfree(g, tpc_table);
	nvgpu_kfree(g, gpc_tpc_mask);
}

void gr_gv100_load_tpc_mask(struct gk20a *g)
{
	u64 pes_tpc_mask = 0x0ULL;
	u32 gpc, pes;
	u32 num_tpc_per_gpc = nvgpu_get_litter_value(g,
				GPU_LIT_NUM_TPC_PER_GPC);

	/* gv100 has 6 GPC and 7 TPC/GPC */
	for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
		for (pes = 0; pes < g->gr.pe_count_per_gpc; pes++) {
			pes_tpc_mask |= (u64) g->gr.pes_tpc_mask[pes][gpc] <<
				(num_tpc_per_gpc * gpc);
		}
	}

	nvgpu_log_info(g, "pes_tpc_mask: %016llx\n", pes_tpc_mask);
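	/* The 64-bit mask is programmed as two 32-bit halves of gr_fe_tpc_fs_r() */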
	gk20a_writel(g, gr_fe_tpc_fs_r(0), u64_lo32(pes_tpc_mask));
	gk20a_writel(g, gr_fe_tpc_fs_r(1), u64_hi32(pes_tpc_mask));
}

u32 gr_gv100_get_patch_slots(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;
	struct fifo_gk20a *f = &g->fifo;
	u32 size = 0;

	/*
	 * CMD to update PE table
	 */
	size++;

	/*
	 * Update PE table contents
	 * for PE table, each patch buffer update writes 32 TPCs
	 */
	size += DIV_ROUND_UP(gr->tpc_count, 32);

	/*
	 * Update the PL table contents
	 * For PL table, each patch buffer update configures 4 TPCs
	 */
	size += DIV_ROUND_UP(gr->tpc_count, 4);

	/*
	 * We need this for all subcontexts
	 */
	size *= f->t19x.max_subctx_count;

	/*
	 * Add space for a partition mode change as well
	 * reserve two slots since DYNAMIC -> STATIC requires
	 * DYNAMIC -> NONE -> STATIC
	 */
	size += 2;

	/*
	 * Add current patch buffer size
	 */
	size += gr_gk20a_get_patch_slots(g);

	/*
	 * Align to 4K size
	 */
	size = ALIGN(size, PATCH_CTX_SLOTS_PER_PAGE);

	/*
	 * Increase the size to accommodate for additional TPC partition update
	 */
	size += 2 * PATCH_CTX_SLOTS_PER_PAGE;
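	/*
	 * Illustrative sizing only (real counts depend on floorsweeping):
	 * with 42 TPCs the per-subcontext cost is 1 + DIV_ROUND_UP(42, 32) +
	 * DIV_ROUND_UP(42, 4) = 1 + 2 + 11 = 14 slots; that is multiplied by
	 * the subcontext count, padded by 2, added to the base patch buffer
	 * size, rounded up to a whole page of slots and then grown by two
	 * more pages for the TPC partition updates above.
	 */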

	return size;
}