/*
 * GV100 GPU GR
 *
 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * Note: the angle-bracket header names below were lost when this file was
 * flattened; they are reconstructed here from the symbols used in this file
 * and may not exactly match the original list.
 */
#include <nvgpu/gk20a.h>
#include <nvgpu/kmem.h>
#include <nvgpu/log.h>
#include <nvgpu/io.h>
#include <nvgpu/bitops.h>

#include "gk20a/gr_gk20a.h"
#include "gk20a/gr_pri_gk20a.h"

#include "gv100/gr_gv100.h"
#include "gv11b/subctx_gv11b.h"

#include <nvgpu/hw/gv100/hw_gr_gv100.h>
#include <nvgpu/hw/gv100/hw_top_gv100.h>
#include <nvgpu/hw/gv100/hw_ctxsw_prog_gv100.h>
#include <nvgpu/hw/gv100/hw_perf_gv100.h>
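/*
 * SCG performance scoring used by gr_gv100_scg_estimate_perf() below.
 * All three components are computed in 0.9 fixed point, i.e. as fractions
 * of scale_factor (512 == 1.0), and are packed into a single int so that
 * the pixel term dominates, then world perf, then TPC balance:
 *
 *   perf = (1024 * 1024) * min_scg_gpc_pix_perf   bits [29:20]
 *        +         1024  * scg_world_perf         bits [19:10]
 *        +            1  * tpc_balance            bits  [9:0]
 *
 * For example (illustrative numbers only): if the worst GPC keeps 6 of its
 * 7 TPCs, min_scg_gpc_pix_perf = 512 * 6 / 7 = 438, so the pixel term alone
 * contributes 438 << 20 and outweighs any possible world or balance term.
 */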
/*
 * Estimate performance if the given logical TPC in the given logical GPC were
 * removed.
 */
static int gr_gv100_scg_estimate_perf(struct gk20a *g,
				unsigned long *gpc_tpc_mask,
				u32 disable_gpc_id, u32 disable_tpc_id,
				int *perf)
{
	struct gr_gk20a *gr = &g->gr;
	int err = 0;
	u32 scale_factor = 512UL;	/* Use fx23.9 */
	u32 pix_scale = 1024*1024UL;	/* Pix perf in [29:20] */
	u32 world_scale = 1024UL;	/* World performance in [19:10] */
	u32 tpc_scale = 1;		/* TPC balancing in [9:0] */
	u32 scg_num_pes = 0;
	u32 min_scg_gpc_pix_perf = scale_factor; /* Init perf as maximum */
	u32 average_tpcs = 0;		/* Average of # of TPCs per GPC */
	u32 deviation;			/* absolute diff between TPC# and
					 * average_tpcs, averaged across GPCs
					 */
	u32 norm_tpc_deviation;		/* deviation/max_tpc_per_gpc */
	u32 tpc_balance;
	u32 scg_gpc_pix_perf;
	u32 scg_world_perf;
	u32 gpc_id;
	u32 pes_id;
	int diff;
	bool is_tpc_removed_gpc = false;
	bool is_tpc_removed_pes = false;
	u32 max_tpc_gpc = 0;
	u32 num_tpc_mask;
	u32 *num_tpc_gpc = nvgpu_kzalloc(g, sizeof(u32) *
				nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));

	if (!num_tpc_gpc)
		return -ENOMEM;

	/* Calculate pix-perf-reduction-rate per GPC and find bottleneck TPC */
	for (gpc_id = 0; gpc_id < gr->gpc_count; gpc_id++) {
		num_tpc_mask = gpc_tpc_mask[gpc_id];

		if ((gpc_id == disable_gpc_id) &&
		    (num_tpc_mask & (0x1 << disable_tpc_id))) {
			/* Safety check if a TPC is removed twice */
			if (is_tpc_removed_gpc) {
				err = -EINVAL;
				goto free_resources;
			}
			/* Remove logical TPC from set */
			num_tpc_mask &= ~(0x1 << disable_tpc_id);
			is_tpc_removed_gpc = true;
		}

		/* Track balancing of TPCs across GPCs */
		num_tpc_gpc[gpc_id] = hweight32(num_tpc_mask);
		average_tpcs += num_tpc_gpc[gpc_id];

		/* Save the maximum number of TPCs found in a single GPC */
		max_tpc_gpc = num_tpc_gpc[gpc_id] > max_tpc_gpc ?
			num_tpc_gpc[gpc_id] : max_tpc_gpc;

		/*
		 * Calculate the ratio between the post-SCG and post-FS TPC
		 * counts; the ratio represents the relative throughput of
		 * the GPC.
		 */
		scg_gpc_pix_perf = scale_factor * num_tpc_gpc[gpc_id] /
					gr->gpc_tpc_count[gpc_id];

		if (min_scg_gpc_pix_perf > scg_gpc_pix_perf)
			min_scg_gpc_pix_perf = scg_gpc_pix_perf;

		/* Calculate # of surviving PES */
		for (pes_id = 0; pes_id < gr->gpc_ppc_count[gpc_id]; pes_id++) {
			/* Count the number of TPCs in the set */
			num_tpc_mask = gr->pes_tpc_mask[pes_id][gpc_id] &
						gpc_tpc_mask[gpc_id];

			if ((gpc_id == disable_gpc_id) &&
			    (num_tpc_mask & (0x1 << disable_tpc_id))) {
				if (is_tpc_removed_pes) {
					err = -EINVAL;
					goto free_resources;
				}
				num_tpc_mask &= ~(0x1 << disable_tpc_id);
				is_tpc_removed_pes = true;
			}
			if (hweight32(num_tpc_mask))
				scg_num_pes++;
		}
	}

	if (!is_tpc_removed_gpc || !is_tpc_removed_pes) {
		err = -EINVAL;
		goto free_resources;
	}

	if (max_tpc_gpc == 0) {
		*perf = 0;
		goto free_resources;
	}

	/* Now calculate perf */
	scg_world_perf = (scale_factor * scg_num_pes) / gr->ppc_count;
	deviation = 0;
	average_tpcs = scale_factor * average_tpcs / gr->gpc_count;
	for (gpc_id = 0; gpc_id < gr->gpc_count; gpc_id++) {
		diff = average_tpcs - scale_factor * num_tpc_gpc[gpc_id];
		if (diff < 0)
			diff = -diff;
		deviation += diff;
	}

	deviation /= gr->gpc_count;

	norm_tpc_deviation = deviation / max_tpc_gpc;

	tpc_balance = scale_factor - norm_tpc_deviation;

	if ((tpc_balance > scale_factor) ||
	    (scg_world_perf > scale_factor) ||
	    (min_scg_gpc_pix_perf > scale_factor) ||
	    (norm_tpc_deviation > scale_factor)) {
		err = -EINVAL;
		goto free_resources;
	}

	*perf = (pix_scale * min_scg_gpc_pix_perf) +
		(world_scale * scg_world_perf) +
		(tpc_scale * tpc_balance);

free_resources:
	nvgpu_kfree(g, num_tpc_gpc);
	return err;
}

void gr_gv100_bundle_cb_defaults(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;

	gr->bundle_cb_default_size =
		gr_scc_bundle_cb_size_div_256b__prod_v();
	gr->min_gpm_fifo_depth =
		gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
	gr->bundle_cb_token_limit =
		gr_pd_ab_dist_cfg2_token_limit_init_v();
}

void gr_gv100_cb_size_default(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;

	if (!gr->attrib_cb_default_size)
		gr->attrib_cb_default_size =
			gr_gpc0_ppc0_cbm_beta_cb_size_v_default_v();
	gr->alpha_cb_default_size =
		gr_gpc0_ppc0_cbm_alpha_cb_size_v_default_v();
}

void gr_gv100_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index)
{
}
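/*
 * Build the SM to GPC/TPC mapping table. The function below orders TPCs with
 * a greedy heuristic: for each logical slot it calls
 * gr_gv100_scg_estimate_perf() once per TPC still present in the candidate
 * mask, assigns the TPC whose removal leaves the highest estimated SCG
 * performance, then drops that TPC from the mask and repeats until every TPC
 * has a logical slot.
 */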
int gr_gv100_init_sm_id_table(struct gk20a *g)
{
	u32 gpc, tpc, sm, pes, gtpc;
	u32 sm_id = 0;
	u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
	u32 num_sm = sm_per_tpc * g->gr.tpc_count;
	int perf, maxperf;
	int err = 0;
	unsigned long *gpc_tpc_mask;
	u32 *tpc_table, *gpc_table;

	gpc_table = nvgpu_kzalloc(g, g->gr.tpc_count * sizeof(u32));
	tpc_table = nvgpu_kzalloc(g, g->gr.tpc_count * sizeof(u32));
	gpc_tpc_mask = nvgpu_kzalloc(g, sizeof(unsigned long) *
			nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));

	if (!gpc_table || !tpc_table || !gpc_tpc_mask) {
		nvgpu_err(g, "Error allocating memory for sm tables");
		err = -ENOMEM;
		goto exit_build_table;
	}

	for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
		for (pes = 0; pes < g->gr.gpc_ppc_count[gpc]; pes++) {
			gpc_tpc_mask[gpc] |= g->gr.pes_tpc_mask[pes][gpc];
		}
	}

	for (gtpc = 0; gtpc < g->gr.tpc_count; gtpc++) {
		maxperf = -1;
		for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
			for_each_set_bit(tpc, &gpc_tpc_mask[gpc],
					g->gr.gpc_tpc_count[gpc]) {
				perf = -1;
				err = gr_gv100_scg_estimate_perf(g,
						gpc_tpc_mask, gpc, tpc, &perf);

				if (err) {
					nvgpu_err(g,
						"Error while estimating perf");
					goto exit_build_table;
				}

				if (perf >= maxperf) {
					maxperf = perf;
					gpc_table[gtpc] = gpc;
					tpc_table[gtpc] = tpc;
				}
			}
		}
		gpc_tpc_mask[gpc_table[gtpc]] &= ~(0x1 << tpc_table[gtpc]);
	}

	for (tpc = 0, sm_id = 0; sm_id < num_sm; tpc++, sm_id += sm_per_tpc) {
		for (sm = 0; sm < sm_per_tpc; sm++) {
			u32 index = sm_id + sm;

			g->gr.sm_to_cluster[index].gpc_index = gpc_table[tpc];
			g->gr.sm_to_cluster[index].tpc_index = tpc_table[tpc];
			g->gr.sm_to_cluster[index].sm_index = sm;
			g->gr.sm_to_cluster[index].global_tpc_index = tpc;
			nvgpu_log_info(g,
				"gpc : %d tpc %d sm_index %d global_index: %d",
				g->gr.sm_to_cluster[index].gpc_index,
				g->gr.sm_to_cluster[index].tpc_index,
				g->gr.sm_to_cluster[index].sm_index,
				g->gr.sm_to_cluster[index].global_tpc_index);
		}
	}

	g->gr.no_of_sm = num_sm;
	nvgpu_log_info(g, " total number of sm = %d", g->gr.no_of_sm);

exit_build_table:
	nvgpu_kfree(g, gpc_table);
	nvgpu_kfree(g, tpc_table);
	nvgpu_kfree(g, gpc_tpc_mask);
	return err;
}
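/*
 * Illustrative sizing for gr_gv100_get_patch_slots() below (the TPC and
 * subcontext counts are example values only, not GV100 configuration data):
 * with 40 TPCs the per-subcontext cost is 1 (PE table CMD)
 * + DIV_ROUND_UP(40, 32) = 2 (PE table data)
 * + DIV_ROUND_UP(40, 4) = 10 (PL table data) = 13 slots.
 * With 64 subcontexts that is 13 * 64 = 832 slots, plus 2 slots for the
 * partition mode change, plus the base gk20a patch slot count, rounded up
 * to a whole page of slots and padded with two extra pages for TPC
 * partition updates.
 */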
u32 gr_gv100_get_patch_slots(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;
	struct fifo_gk20a *f = &g->fifo;
	u32 size = 0;

	/*
	 * CMD to update PE table
	 */
	size++;

	/*
	 * Update PE table contents
	 * for PE table, each patch buffer update writes 32 TPCs
	 */
	size += DIV_ROUND_UP(gr->tpc_count, 32);

	/*
	 * Update the PL table contents
	 * For PL table, each patch buffer update configures 4 TPCs
	 */
	size += DIV_ROUND_UP(gr->tpc_count, 4);

	/*
	 * We need this for all subcontexts
	 */
	size *= f->max_subctx_count;

	/*
	 * Add space for a partition mode change as well
	 * reserve two slots since DYNAMIC -> STATIC requires
	 * DYNAMIC -> NONE -> STATIC
	 */
	size += 2;

	/*
	 * Add current patch buffer size
	 */
	size += gr_gk20a_get_patch_slots(g);

	/*
	 * Align to 4K size
	 */
	size = ALIGN(size, PATCH_CTX_SLOTS_PER_PAGE);

	/*
	 * Increase the size to accommodate additional TPC partition updates
	 */
	size += 2 * PATCH_CTX_SLOTS_PER_PAGE;

	return size;
}

static u32 gr_gv100_get_active_fpba_mask(struct gk20a *g)
{
	u32 active_fbpa_mask;
	u32 num_fbpas, val;

	val = nvgpu_readl(g, top_num_fbpas_r());
	num_fbpas = top_num_fbpas_value_v(val);

	/*
	 * Read the active FBPA mask from the fuse.
	 * Note that in the fuse value 0 means enabled and 1 means disabled,
	 * so the bits have to be flipped.
	 * Also set the unused bits to zero.
	 */
	active_fbpa_mask = g->ops.fuse.fuse_status_opt_fbio(g);
	active_fbpa_mask = ~active_fbpa_mask;
	active_fbpa_mask = active_fbpa_mask & ((1 << num_fbpas) - 1);

	return active_fbpa_mask;
}

int gr_gv100_add_ctxsw_reg_pm_fbpa(struct gk20a *g,
				struct ctxsw_buf_offset_map_entry *map,
				struct aiv_list_gk20a *regs,
				u32 *count, u32 *offset,
				u32 max_cnt, u32 base,
				u32 num_fbpas, u32 stride, u32 mask)
{
	u32 fbpa_id;
	u32 idx;
	u32 cnt = *count;
	u32 off = *offset;
	u32 active_fbpa_mask;

	if ((cnt + (regs->count * num_fbpas)) > max_cnt)
		return -EINVAL;

	active_fbpa_mask = gr_gv100_get_active_fpba_mask(g);

	for (idx = 0; idx < regs->count; idx++) {
		for (fbpa_id = 0; fbpa_id < num_fbpas; fbpa_id++) {
			if (active_fbpa_mask & BIT(fbpa_id)) {
				map[cnt].addr = base +
						(regs->l[idx].addr & mask) +
						(fbpa_id * stride);
				map[cnt++].offset = off;
				off += 4;
			}
		}
	}
	*count = cnt;
	*offset = off;
	return 0;
}

int gr_gv100_add_ctxsw_reg_perf_pma(struct ctxsw_buf_offset_map_entry *map,
	struct aiv_list_gk20a *regs,
	u32 *count, u32 *offset,
	u32 max_cnt, u32 base, u32 mask)
{
	*offset = ALIGN(*offset, 256);
	return gr_gk20a_add_ctxsw_reg_perf_pma(map, regs,
			count, offset, max_cnt, base, mask);
}

void gr_gv100_split_fbpa_broadcast_addr(struct gk20a *g, u32 addr,
	u32 num_fbpas,
	u32 *priv_addr_table, u32 *t)
{
	u32 active_fbpa_mask;
	u32 fbpa_id;

	active_fbpa_mask = gr_gv100_get_active_fpba_mask(g);

	for (fbpa_id = 0; fbpa_id < num_fbpas; fbpa_id++) {
		if (active_fbpa_mask & BIT(fbpa_id)) {
			priv_addr_table[(*t)++] = pri_fbpa_addr(g,
					pri_fbpa_addr_mask(g, addr), fbpa_id);
		}
	}
}

u32 gr_gv100_get_hw_accessor_stream_out_mode(void)
{
	return ctxsw_prog_main_image_pm_mode_stream_out_ctxsw_f();
}
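/*
 * gr_gv100_set_pmm_register() below programs every perfmon in every chiplet
 * of a PMM unit with the same value. The register address is derived as
 *
 *   reg_offset = offset
 *              + perfmon_index * perf_pmmsys_perdomain_offset_v()
 *              + chiplet_index * g->ops.gr.get_pmm_per_chiplet_offset()
 *
 * which is exactly what the nested loops compute.
 */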
void gr_gv100_set_pmm_register(struct gk20a *g, u32 offset, u32 val,
				u32 num_chiplets, u32 num_perfmons)
{
	u32 perfmon_index = 0;
	u32 chiplet_index = 0;
	u32 reg_offset = 0;
	u32 chiplet_stride = g->ops.gr.get_pmm_per_chiplet_offset();

	for (chiplet_index = 0; chiplet_index < num_chiplets;
						chiplet_index++) {
		for (perfmon_index = 0; perfmon_index < num_perfmons;
						perfmon_index++) {
			reg_offset = offset + perfmon_index *
				perf_pmmsys_perdomain_offset_v() +
				chiplet_index * chiplet_stride;
			nvgpu_writel(g, reg_offset, val);
		}
	}
}

void gr_gv100_get_num_hwpm_perfmon(struct gk20a *g, u32 *num_sys_perfmon,
				u32 *num_fbp_perfmon, u32 *num_gpc_perfmon)
{
	int err;
	u32 buf_offset_lo, buf_offset_addr, num_offsets;
	u32 perfmon_index = 0;

	for (perfmon_index = 0; perfmon_index <
			perf_pmmsys_engine_sel__size_1_v();
			perfmon_index++) {
		err = gr_gk20a_get_pm_ctx_buffer_offsets(g,
				perf_pmmsys_engine_sel_r(perfmon_index),
				1,
				&buf_offset_lo,
				&buf_offset_addr,
				&num_offsets);
		if (err != 0) {
			break;
		}
	}
	*num_sys_perfmon = perfmon_index;

	for (perfmon_index = 0; perfmon_index <
			perf_pmmfbp_engine_sel__size_1_v();
			perfmon_index++) {
		err = gr_gk20a_get_pm_ctx_buffer_offsets(g,
				perf_pmmfbp_engine_sel_r(perfmon_index),
				1,
				&buf_offset_lo,
				&buf_offset_addr,
				&num_offsets);
		if (err != 0) {
			break;
		}
	}
	*num_fbp_perfmon = perfmon_index;

	for (perfmon_index = 0; perfmon_index <
			perf_pmmgpc_engine_sel__size_1_v();
			perfmon_index++) {
		err = gr_gk20a_get_pm_ctx_buffer_offsets(g,
				perf_pmmgpc_engine_sel_r(perfmon_index),
				1,
				&buf_offset_lo,
				&buf_offset_addr,
				&num_offsets);
		if (err != 0) {
			break;
		}
	}
	*num_gpc_perfmon = perfmon_index;
}

void gr_gv100_init_hwpm_pmm_register(struct gk20a *g)
{
	u32 num_sys_perfmon = 0;
	u32 num_fbp_perfmon = 0;
	u32 num_gpc_perfmon = 0;

	g->ops.gr.get_num_hwpm_perfmon(g, &num_sys_perfmon,
				&num_fbp_perfmon, &num_gpc_perfmon);

	g->ops.gr.set_pmm_register(g, perf_pmmsys_engine_sel_r(0),
			0xFFFFFFFFU, 1U, num_sys_perfmon);
	g->ops.gr.set_pmm_register(g, perf_pmmfbp_engine_sel_r(0),
			0xFFFFFFFFU, g->gr.num_fbps, num_fbp_perfmon);
	g->ops.gr.set_pmm_register(g, perf_pmmgpc_engine_sel_r(0),
			0xFFFFFFFFU, g->gr.gpc_count, num_gpc_perfmon);
}