Diffstat (limited to 'drivers/gpu/nvgpu/gv100/gr_gv100.c')
 -rw-r--r--  drivers/gpu/nvgpu/gv100/gr_gv100.c | 349
 1 file changed, 349 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/gv100/gr_gv100.c b/drivers/gpu/nvgpu/gv100/gr_gv100.c
new file mode 100644
index 00000000..430c7cd0
--- /dev/null
+++ b/drivers/gpu/nvgpu/gv100/gr_gv100.c
@@ -0,0 +1,349 @@
/*
 * GV100 GPU GR
 *
 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <nvgpu/log.h>
#include <nvgpu/debug.h>
#include <nvgpu/enabled.h>

#include "gk20a/gk20a.h"
#include "gk20a/gr_gk20a.h"

#include "gv100/gr_gv100.h"
#include "gv11b/subctx_gv11b.h"

#include <nvgpu/hw/gv100/hw_gr_gv100.h>
#include <nvgpu/hw/gv100/hw_proj_gv100.h>

/*
 * Estimate performance if the given logical TPC in the given logical GPC were
 * removed.
 */
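/*
 * The returned *perf is a fixed-point score (scale_factor = 512, i.e. nine
 * fractional bits) built from three normalized terms packed into disjoint
 * bit ranges: minimum per-GPC pixel throughput in [29:20], world (PES)
 * throughput in [19:10] and TPC balance across GPCs in [9:0].
 * gr_gv100_init_sm_id_table() below uses this score to pick the order in
 * which logical TPCs are assigned SM IDs.
 */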
static int gr_gv100_scg_estimate_perf(struct gk20a *g,
					unsigned long *gpc_tpc_mask,
					u32 disable_gpc_id, u32 disable_tpc_id,
					int *perf)
{
	struct gr_gk20a *gr = &g->gr;
	int err = 0;
	u32 scale_factor = 512UL; /* Use fx23.9 */
	u32 pix_scale = 1024*1024UL; /* Pix perf in [29:20] */
	u32 world_scale = 1024UL; /* World performance in [19:10] */
	u32 tpc_scale = 1; /* TPC balancing in [9:0] */
	u32 scg_num_pes = 0;
	u32 min_scg_gpc_pix_perf = scale_factor; /* Init perf as maximum */
	u32 average_tpcs = 0; /* Average of # of TPCs per GPC */
	u32 deviation; /* absolute diff between TPC# and
			* average_tpcs, averaged across GPCs
			*/
	u32 norm_tpc_deviation; /* deviation/max_tpc_per_gpc */
	u32 tpc_balance;
	u32 scg_gpc_pix_perf;
	u32 scg_world_perf;
	u32 gpc_id;
	u32 pes_id;
	int diff;
	bool is_tpc_removed_gpc = false;
	bool is_tpc_removed_pes = false;
	u32 max_tpc_gpc = 0;
	u32 num_tpc_mask;
	u32 *num_tpc_gpc = nvgpu_kzalloc(g, sizeof(u32) *
			nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));

	if (!num_tpc_gpc)
		return -ENOMEM;

	/* Calculate pix-perf-reduction-rate per GPC and find bottleneck TPC */
	for (gpc_id = 0; gpc_id < gr->gpc_count; gpc_id++) {
		num_tpc_mask = gpc_tpc_mask[gpc_id];

		if ((gpc_id == disable_gpc_id) &&
		    (num_tpc_mask & (0x1 << disable_tpc_id))) {
			/* Safety check if a TPC is removed twice */
			if (is_tpc_removed_gpc) {
				err = -EINVAL;
				goto free_resources;
			}
			/* Remove logical TPC from set */
			num_tpc_mask &= ~(0x1 << disable_tpc_id);
			is_tpc_removed_gpc = true;
		}

		/* track balancing of tpcs across gpcs */
		num_tpc_gpc[gpc_id] = hweight32(num_tpc_mask);
		average_tpcs += num_tpc_gpc[gpc_id];

		/* save the maximum number of TPCs found in any one GPC */
		max_tpc_gpc = num_tpc_gpc[gpc_id] > max_tpc_gpc ?
			num_tpc_gpc[gpc_id] : max_tpc_gpc;

		/*
		 * Calculate the ratio between the post-SCG TPC count and the
		 * post-floorsweeping (post-FS) TPC count; this ratio
		 * represents the relative pixel throughput of the GPC.
		 */
		scg_gpc_pix_perf = scale_factor * num_tpc_gpc[gpc_id] /
					gr->gpc_tpc_count[gpc_id];

		if (min_scg_gpc_pix_perf > scg_gpc_pix_perf)
			min_scg_gpc_pix_perf = scg_gpc_pix_perf;

		/* Calculate # of surviving PES */
		for (pes_id = 0; pes_id < gr->gpc_ppc_count[gpc_id]; pes_id++) {
			/* Count the number of TPC on the set */
			num_tpc_mask = gr->pes_tpc_mask[pes_id][gpc_id] &
				gpc_tpc_mask[gpc_id];

			if ((gpc_id == disable_gpc_id) && (num_tpc_mask &
				(0x1 << disable_tpc_id))) {
				if (is_tpc_removed_pes) {
					err = -EINVAL;
					goto free_resources;
				}
				num_tpc_mask &= ~(0x1 << disable_tpc_id);
				is_tpc_removed_pes = true;
			}
			if (hweight32(num_tpc_mask))
				scg_num_pes++;
		}
	}

	if (!is_tpc_removed_gpc || !is_tpc_removed_pes) {
		err = -EINVAL;
		goto free_resources;
	}

	if (max_tpc_gpc == 0) {
		*perf = 0;
		goto free_resources;
	}

	/* Now calculate perf */
	scg_world_perf = (scale_factor * scg_num_pes) / gr->ppc_count;
	deviation = 0;
	average_tpcs = scale_factor * average_tpcs / gr->gpc_count;
	for (gpc_id = 0; gpc_id < gr->gpc_count; gpc_id++) {
		diff = average_tpcs - scale_factor * num_tpc_gpc[gpc_id];
		if (diff < 0)
			diff = -diff;
		deviation += diff;
	}

	deviation /= gr->gpc_count;

	norm_tpc_deviation = deviation / max_tpc_gpc;

	tpc_balance = scale_factor - norm_tpc_deviation;

	if ((tpc_balance > scale_factor) ||
	    (scg_world_perf > scale_factor) ||
	    (min_scg_gpc_pix_perf > scale_factor) ||
	    (norm_tpc_deviation > scale_factor)) {
		err = -EINVAL;
		goto free_resources;
	}

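	/*
	 * perf = pix_scale * min_scg_gpc_pix_perf +
	 *        world_scale * scg_world_perf +
	 *        tpc_scale * tpc_balance
	 *
	 * All three terms were verified to be <= scale_factor above, so each
	 * product stays inside its own bit range and a plain integer compare
	 * of two scores effectively ranks pixel perf first, world perf second
	 * and TPC balance last.
	 */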
	*perf = (pix_scale * min_scg_gpc_pix_perf) +
		(world_scale * scg_world_perf) +
		(tpc_scale * tpc_balance);
free_resources:
	nvgpu_kfree(g, num_tpc_gpc);
	return err;
}

void gr_gv100_bundle_cb_defaults(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;

	gr->bundle_cb_default_size =
		gr_scc_bundle_cb_size_div_256b__prod_v();
	gr->min_gpm_fifo_depth =
		gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
	gr->bundle_cb_token_limit =
		gr_pd_ab_dist_cfg2_token_limit_init_v();
}

void gr_gv100_cb_size_default(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;

	if (!gr->attrib_cb_default_size)
		gr->attrib_cb_default_size =
			gr_gpc0_ppc0_cbm_beta_cb_size_v_default_v();
	gr->alpha_cb_default_size =
		gr_gpc0_ppc0_cbm_alpha_cb_size_v_default_v();
}

void gr_gv100_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index)
{
}

void gr_gv100_init_sm_id_table(struct gk20a *g)
{
	u32 gpc, tpc, sm, pes, gtpc;
	u32 sm_id = 0;
	u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
	u32 num_sm = sm_per_tpc * g->gr.tpc_count;
	int perf, maxperf;
	int err;
	unsigned long *gpc_tpc_mask;
	u32 *tpc_table, *gpc_table;

	gpc_table = nvgpu_kzalloc(g, g->gr.tpc_count * sizeof(u32));
	tpc_table = nvgpu_kzalloc(g, g->gr.tpc_count * sizeof(u32));
	gpc_tpc_mask = nvgpu_kzalloc(g, sizeof(unsigned long) *
			nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));

	if (!gpc_table || !tpc_table || !gpc_tpc_mask) {
		nvgpu_err(g, "Error allocating memory for sm tables");
		goto exit_build_table;
	}

	for (gpc = 0; gpc < g->gr.gpc_count; gpc++)
		for (pes = 0; pes < g->gr.gpc_ppc_count[gpc]; pes++)
			gpc_tpc_mask[gpc] |= g->gr.pes_tpc_mask[pes][gpc];

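	/*
	 * Greedy placement: for each logical TPC slot, try every (gpc, tpc)
	 * still present in gpc_tpc_mask, estimate the perf that would remain
	 * if that TPC were removed, keep the candidate with the highest
	 * estimate, then clear it from the mask before filling the next slot.
	 */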
	for (gtpc = 0; gtpc < g->gr.tpc_count; gtpc++) {
		maxperf = -1;
		for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
			for_each_set_bit(tpc, &gpc_tpc_mask[gpc],
					 g->gr.gpc_tpc_count[gpc]) {
				perf = -1;
				err = gr_gv100_scg_estimate_perf(g,
						gpc_tpc_mask, gpc, tpc, &perf);

				if (err) {
					nvgpu_err(g,
						"Error while estimating perf");
					goto exit_build_table;
				}

				if (perf >= maxperf) {
					maxperf = perf;
					gpc_table[gtpc] = gpc;
					tpc_table[gtpc] = tpc;
				}
			}
		}
		gpc_tpc_mask[gpc_table[gtpc]] &= ~(0x1 << tpc_table[gtpc]);
	}

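	/*
	 * Each logical TPC chosen above gets sm_per_tpc consecutive SM IDs;
	 * record the resulting GPC/TPC/SM mapping in sm_to_cluster.
	 */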
	for (tpc = 0, sm_id = 0; sm_id < num_sm; tpc++, sm_id += sm_per_tpc) {
		for (sm = 0; sm < sm_per_tpc; sm++) {
			u32 index = sm_id + sm;

			g->gr.sm_to_cluster[index].gpc_index = gpc_table[tpc];
			g->gr.sm_to_cluster[index].tpc_index = tpc_table[tpc];
			g->gr.sm_to_cluster[index].sm_index = sm;
			g->gr.sm_to_cluster[index].global_tpc_index = tpc;
			nvgpu_log_info(g,
				"gpc : %d tpc %d sm_index %d global_index: %d",
				g->gr.sm_to_cluster[index].gpc_index,
				g->gr.sm_to_cluster[index].tpc_index,
				g->gr.sm_to_cluster[index].sm_index,
				g->gr.sm_to_cluster[index].global_tpc_index);
		}
	}

	g->gr.no_of_sm = num_sm;
	nvgpu_log_info(g, " total number of sm = %d", g->gr.no_of_sm);
exit_build_table:
	nvgpu_kfree(g, gpc_table);
	nvgpu_kfree(g, tpc_table);
	nvgpu_kfree(g, gpc_tpc_mask);
}

void gr_gv100_load_tpc_mask(struct gk20a *g)
{
	u64 pes_tpc_mask = 0x0ULL;
	u32 gpc, pes;
	u32 num_tpc_per_gpc = nvgpu_get_litter_value(g,
				GPU_LIT_NUM_TPC_PER_GPC);

	/* gv100 has 6 GPC and 7 TPC/GPC */
	for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
		for (pes = 0; pes < g->gr.pe_count_per_gpc; pes++) {
			pes_tpc_mask |= (u64) g->gr.pes_tpc_mask[pes][gpc] <<
				(num_tpc_per_gpc * gpc);
		}
	}

	nvgpu_log_info(g, "pes_tpc_mask: %016llx\n", pes_tpc_mask);
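	/* The 64-bit mask is programmed as two 32-bit halves of gr_fe_tpc_fs_r() */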
	gk20a_writel(g, gr_fe_tpc_fs_r(0), u64_lo32(pes_tpc_mask));
	gk20a_writel(g, gr_fe_tpc_fs_r(1), u64_hi32(pes_tpc_mask));
}

u32 gr_gv100_get_patch_slots(struct gk20a *g)
{
	struct gr_gk20a *gr = &g->gr;
	struct fifo_gk20a *f = &g->fifo;
	u32 size = 0;

	/*
	 * CMD to update PE table
	 */
	size++;

	/*
	 * Update PE table contents
	 * for PE table, each patch buffer update writes 32 TPCs
	 */
	size += DIV_ROUND_UP(gr->tpc_count, 32);

	/*
	 * Update the PL table contents
	 * For PL table, each patch buffer update configures 4 TPCs
	 */
	size += DIV_ROUND_UP(gr->tpc_count, 4);

	/*
	 * We need this for all subcontexts
	 */
	size *= f->t19x.max_subctx_count;

	/*
	 * Add space for a partition mode change as well
	 * reserve two slots since DYNAMIC -> STATIC requires
	 * DYNAMIC -> NONE -> STATIC
	 */
	size += 2;

	/*
	 * Add current patch buffer size
	 */
	size += gr_gk20a_get_patch_slots(g);

	/*
	 * Align to 4K size
	 */
	size = ALIGN(size, PATCH_CTX_SLOTS_PER_PAGE);

	/*
	 * Increase the size to accommodate for additional TPC partition update
	 */
	size += 2 * PATCH_CTX_SLOTS_PER_PAGE;
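	/*
	 * Illustrative sizing only (real counts depend on floorsweeping):
	 * with 42 TPCs the per-subcontext cost is 1 + DIV_ROUND_UP(42, 32) +
	 * DIV_ROUND_UP(42, 4) = 1 + 2 + 11 = 14 slots; that is multiplied by
	 * the subcontext count, padded by 2, added to the base patch buffer
	 * size, rounded up to a whole page of slots and then grown by two
	 * more pages for the TPC partition updates above.
	 */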

	return size;
}