gpu: nvgpu: fix smid generation of perf tables

SMID tables were generated according to the local tpc and the pagepool and cb buffers from a different chip and did not take performance into consideration, which made compute kernels hang with CTAs on the fly. This change ensures we are using the right sizes and adds proper enumeration of smids. JIRA: NVGPUGV100-36 bug 2004378 Change-Id: Ic8f50c325d6d6720cca41d9740ae4f5f51e1100a Signed-off-by: David Nieto <dmartineznie@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1581664 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
author: David Nieto <dmartineznie@nvidia.com> 2017-10-16 15:24:59 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2017-10-20 14:55:43 -0400
commit: ed8ac6e005d95e051bd03a182bbe0aa09a3c2266 (patch)
tree: 48743edbdca7d14193a5d0ff22e39b692d9936d6
parent: 387ecf8a6360f463a129ab569aaef921fe0a2b0e (diff)
4 files changed, 331 insertions, 5 deletions
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index 8f1d42da..d6d60147 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -28,6 +28,7 @@ nvgpu-y += \
        $(nvgpu-t19x)/gv100/fb_gv100.o \
        $(nvgpu-t19x)/gv100/bios_gv100.o \
        $(nvgpu-t19x)/gv100/fifo_gv100.o \
+        $(nvgpu-t19x)/gv100/gr_gv100.o \
        $(nvgpu-t19x)/gv100/hal_gv100.o
 nvgpu-$(CONFIG_TEGRA_GK20A) += $(nvgpu-t19x)/gv11b/platform_gv11b_tegra.o
diff --git a/drivers/gpu/nvgpu/gv100/gr_gv100.c b/drivers/gpu/nvgpu/gv100/gr_gv100.c
new file mode 100644
index 00000000..4b2038ba
--- /dev/null
+++ b/drivers/gpu/nvgpu/gv100/gr_gv100.c
@@ -0,0 +1,289 @@
+/*
+ * GV100 GPU GR
+ *
+ * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <nvgpu/log.h>
+#include <nvgpu/debug.h>
+#include <nvgpu/enabled.h>
+#include "gk20a/gk20a.h"
+#include "gk20a/gr_gk20a.h"
+#include "gv100/gr_gv100.h"
+#include "gv11b/subctx_gv11b.h"
+#include <nvgpu/hw/gv100/hw_gr_gv100.h>
+#include <nvgpu/hw/gv100/hw_proj_gv100.h>
+/*
+ * Estimate the performance figure that results if the given logical TPC in
+ * the given logical GPC were removed from the set described by gpc_tpc_mask.
+ *
+ * @g:              GPU instance; g->gr supplies the GPC, PES and TPC counts
+ *                  and the per-PES TPC masks.
+ * @gpc_tpc_mask:   per-GPC masks of the logical TPCs still in the set; only
+ *                  read here.
+ * @disable_gpc_id: logical GPC that owns the TPC to remove.
+ * @disable_tpc_id: bit position of the TPC to remove within that GPC's mask.
+ * @perf:           out: pixel perf in [29:20], world perf in [19:10] and TPC
+ *                  balance in [9:0]; written as 0 when no TPC would remain.
+ *
+ * Returns 0 on success, -ENOMEM when the scratch array cannot be allocated,
+ * and -EINVAL when the TPC is not found in the GPC mask and in exactly one
+ * PES mask, when a divisor taken from g->gr is zero, or when an intermediate
+ * term exceeds the fixed-point scale.
+ */
+static int gr_gv100_scg_estimate_perf(struct gk20a *g,
+                                        unsigned long *gpc_tpc_mask,
+                                        u32 disable_gpc_id, u32 disable_tpc_id,
+                                        int *perf)
+{
+        struct gr_gk20a *gr = &g->gr;
+        int err = 0;
+        u32 scale_factor = 512UL; /* Use fx23.9 */
+        u32 pix_scale = 1024*1024UL;    /* Pix perf in [29:20] */
+        u32 world_scale = 1024UL;       /* World performance in [19:10] */
+        u32 tpc_scale = 1;              /* TPC balancing in [9:0] */
+        u32 scg_num_pes = 0;
+        u32 min_scg_gpc_pix_perf = scale_factor; /* Init perf as maximum */
+        u32 average_tpcs = 0;           /* Average of # of TPCs per GPC */
+        u32 deviation;                  /* absolute diff between TPC# and
+                                         * average_tpcs, averaged across GPCs
+                                         */
+        u32 norm_tpc_deviation;         /* deviation/max_tpc_per_gpc */
+        u32 tpc_balance;
+        u32 scg_gpc_pix_perf;
+        u32 scg_world_perf;
+        u32 scaled_tpcs;                /* scale_factor * TPC# of one GPC */
+        u32 gpc_id;
+        u32 pes_id;
+        bool is_tpc_removed_gpc = false;
+        bool is_tpc_removed_pes = false;
+        u32 max_tpc_gpc = 0;
+        u32 num_tpc_mask;
+        u32 disable_tpc_bit;
+        u32 *num_tpc_gpc;
+        /* The masks handled here are 32 bits wide; reject wider shifts */
+        if (disable_tpc_id >= 32)
+                return -EINVAL;
+        disable_tpc_bit = 1U << disable_tpc_id;
+        num_tpc_gpc = nvgpu_kzalloc(g, sizeof(*num_tpc_gpc) *
+                                nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));
+        if (!num_tpc_gpc)
+                return -ENOMEM;
+        /* Calculate pix-perf-reduction-rate per GPC and find bottleneck TPC */
+        for (gpc_id = 0; gpc_id < gr->gpc_count; gpc_id++) {
+                /* The throughput ratio below divides by this count */
+                if (gr->gpc_tpc_count[gpc_id] == 0) {
+                        err = -EINVAL;
+                        goto free_resources;
+                }
+                num_tpc_mask = gpc_tpc_mask[gpc_id];
+                if ((gpc_id == disable_gpc_id) &&
+                    (num_tpc_mask & disable_tpc_bit)) {
+                        /* Safety check if a TPC is removed twice */
+                        if (is_tpc_removed_gpc) {
+                                err = -EINVAL;
+                                goto free_resources;
+                        }
+                        /* Remove logical TPC from set */
+                        num_tpc_mask &= ~disable_tpc_bit;
+                        is_tpc_removed_gpc = true;
+                }
+                /* track balancing of tpcs across gpcs */
+                num_tpc_gpc[gpc_id] = hweight32(num_tpc_mask);
+                average_tpcs += num_tpc_gpc[gpc_id];
+                /* save the maximum number of TPCs found in any one GPC */
+                max_tpc_gpc = num_tpc_gpc[gpc_id] > max_tpc_gpc ?
+                                num_tpc_gpc[gpc_id] : max_tpc_gpc;
+                /*
+                 * Calculate ratio between TPC count and post-FS and post-SCG
+                 *
+                 * ratio represents relative throughput of the GPC
+                 */
+                scg_gpc_pix_perf = scale_factor * num_tpc_gpc[gpc_id] /
+                                        gr->gpc_tpc_count[gpc_id];
+                if (min_scg_gpc_pix_perf > scg_gpc_pix_perf)
+                        min_scg_gpc_pix_perf = scg_gpc_pix_perf;
+                /* Calculate # of surviving PES */
+                for (pes_id = 0; pes_id < gr->gpc_ppc_count[gpc_id]; pes_id++) {
+                        /* Count the number of TPC on the set */
+                        num_tpc_mask = gr->pes_tpc_mask[pes_id][gpc_id] &
+                                        gpc_tpc_mask[gpc_id];
+                        if ((gpc_id == disable_gpc_id) &&
+                            (num_tpc_mask & disable_tpc_bit)) {
+                                /* The TPC may belong to one PES only */
+                                if (is_tpc_removed_pes) {
+                                        err = -EINVAL;
+                                        goto free_resources;
+                                }
+                                num_tpc_mask &= ~disable_tpc_bit;
+                                is_tpc_removed_pes = true;
+                        }
+                        if (hweight32(num_tpc_mask))
+                                scg_num_pes++;
+                }
+        }
+        /* The TPC must have been found in both the GPC and a PES mask */
+        if (!is_tpc_removed_gpc || !is_tpc_removed_pes) {
+                err = -EINVAL;
+                goto free_resources;
+        }
+        if (max_tpc_gpc == 0) {
+                *perf = 0;
+                goto free_resources;
+        }
+        /*
+         * gr->gpc_count is non-zero here because the loop above ran at least
+         * once; gr->ppc_count still needs its own check before dividing.
+         */
+        if (gr->ppc_count == 0) {
+                err = -EINVAL;
+                goto free_resources;
+        }
+        /* Now calculate perf */
+        scg_world_perf = (scale_factor * scg_num_pes) / gr->ppc_count;
+        deviation = 0;
+        average_tpcs = scale_factor * average_tpcs / gr->gpc_count;
+        for (gpc_id = 0; gpc_id < gr->gpc_count; gpc_id++) {
+                /* Absolute difference without leaving unsigned arithmetic */
+                scaled_tpcs = scale_factor * num_tpc_gpc[gpc_id];
+                if (average_tpcs > scaled_tpcs)
+                        deviation += average_tpcs - scaled_tpcs;
+                else
+                        deviation += scaled_tpcs - average_tpcs;
+        }
+        deviation /= gr->gpc_count;
+        norm_tpc_deviation = deviation / max_tpc_gpc;
+        /* Wraps when the deviation exceeds the scale; caught just below */
+        tpc_balance = scale_factor - norm_tpc_deviation;
+        if ((tpc_balance > scale_factor)          ||
+            (scg_world_perf > scale_factor)       ||
+            (min_scg_gpc_pix_perf > scale_factor) ||
+            (norm_tpc_deviation > scale_factor)) {
+                err = -EINVAL;
+                goto free_resources;
+        }
+        *perf = (pix_scale * min_scg_gpc_pix_perf) +
+                (world_scale * scg_world_perf) +
+                (tpc_scale * tpc_balance);
+free_resources:
+        nvgpu_kfree(g, num_tpc_gpc);
+        return err;
+}
+/*
+ * Store the bundle circular-buffer defaults in g->gr: the default bundle CB
+ * size, the minimum GPM FIFO depth and the bundle CB token limit, each taken
+ * from the matching gr_* accessor.
+ */
+void gr_gv100_bundle_cb_defaults(struct gk20a *g)
+{
+        g->gr.bundle_cb_token_limit = gr_pd_ab_dist_cfg2_token_limit_init_v();
+        g->gr.min_gpm_fifo_depth =
+                gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
+        g->gr.bundle_cb_default_size =
+                gr_scc_bundle_cb_size_div_256b__prod_v();
+}
+/*
+ * Store the default alpha and attribute (beta) circular-buffer sizes in
+ * g->gr. The alpha size is always overwritten; the attribute size is only
+ * filled in while it is still zero, so a value set earlier is kept.
+ */
+void gr_gv100_cb_size_default(struct gk20a *g)
+{
+        struct gr_gk20a *gr_state = &g->gr;
+        gr_state->alpha_cb_default_size =
+                gr_gpc0_ppc0_cbm_alpha_cb_size_v_default_v();
+        /* Keep an attribute CB size that has already been chosen */
+        if (gr_state->attrib_cb_default_size != 0)
+                return;
+        gr_state->attrib_cb_default_size =
+                gr_gpc0_ppc0_cbm_beta_cb_size_v_default_v();
+}
+/*
+ * set_gpc_tpc_mask hook for gv100. The body is empty: neither g nor
+ * gpc_index is used and nothing is programmed.
+ */
+void gr_gv100_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index)
+{
+        /* no-op */
+}
+/*
+ * Build g->gr.sm_to_cluster, the table that maps each SM id to its GPC
+ * index, TPC index, SM index and global TPC index, and set g->gr.no_of_sm.
+ *
+ * One global TPC slot is filled per iteration: every TPC still present in
+ * the per-GPC masks is scored with gr_gv100_scg_estimate_perf() and the one
+ * with the highest score takes the slot (on a tie the last one visited wins)
+ * and is then cleared from its mask.
+ *
+ * On an allocation failure, a scoring failure, or when a slot finds no TPC
+ * left to assign, an error is logged and the function returns before
+ * g->gr.sm_to_cluster and g->gr.no_of_sm are written.
+ */
+void gr_gv100_init_sm_id_table(struct gk20a *g)
+{
+        u32 gpc, tpc, sm, pes, gtpc;
+        u32 sm_id = 0;
+        u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
+        u32 num_sm = sm_per_tpc * g->gr.tpc_count;
+        int perf, maxperf;
+        int err;
+        unsigned long *gpc_tpc_mask;
+        u32 *tpc_table, *gpc_table;
+        gpc_table = nvgpu_kzalloc(g, g->gr.tpc_count * sizeof(u32));
+        tpc_table = nvgpu_kzalloc(g, g->gr.tpc_count * sizeof(u32));
+        gpc_tpc_mask = nvgpu_kzalloc(g, sizeof(unsigned long) *
+                        nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));
+        if (!gpc_table || !tpc_table || !gpc_tpc_mask) {
+                nvgpu_err(g, "Error allocating memory for sm tables");
+                goto exit_build_table;
+        }
+        /* Per-GPC TPC mask is the union of the masks of that GPC's PESs */
+        for (gpc = 0; gpc < g->gr.gpc_count; gpc++)
+                for (pes = 0; pes < g->gr.gpc_ppc_count[gpc]; pes++)
+                        gpc_tpc_mask[gpc] |= g->gr.pes_tpc_mask[pes][gpc];
+        for (gtpc = 0; gtpc < g->gr.tpc_count; gtpc++) {
+                maxperf = -1;
+                for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
+                        for_each_set_bit(tpc, &gpc_tpc_mask[gpc],
+                                                g->gr.gpc_tpc_count[gpc]) {
+                                perf = -1;
+                                err = gr_gv100_scg_estimate_perf(g,
+                                                gpc_tpc_mask, gpc, tpc, &perf);
+                                if (err) {
+                                        nvgpu_err(g,
+                                                "Error while estimating perf");
+                                        goto exit_build_table;
+                                }
+                                if (perf >= maxperf) {
+                                        maxperf = perf;
+                                        gpc_table[gtpc] = gpc;
+                                        tpc_table[gtpc] = tpc;
+                                }
+                        }
+                }
+                /*
+                 * maxperf is still -1 only when no TPC was scored at all;
+                 * the zero-initialised table entry must not be used then.
+                 */
+                if (maxperf < 0) {
+                        nvgpu_err(g, "No TPC left for global TPC %u", gtpc);
+                        goto exit_build_table;
+                }
+                /* Shift in the mask's own type before clearing the TPC */
+                gpc_tpc_mask[gpc_table[gtpc]] &= ~(1UL << tpc_table[gtpc]);
+        }
+        /* Expand each chosen TPC into sm_per_tpc consecutive SM ids */
+        for (tpc = 0, sm_id = 0;  sm_id < num_sm; tpc++, sm_id += sm_per_tpc) {
+                for (sm = 0; sm < sm_per_tpc; sm++) {
+                        g->gr.sm_to_cluster[sm_id + sm].gpc_index =
+                                                                gpc_table[tpc];
+                        g->gr.sm_to_cluster[sm_id + sm].tpc_index =
+                                                                tpc_table[tpc];
+                        g->gr.sm_to_cluster[sm_id + sm].sm_index = sm;
+                        g->gr.sm_to_cluster[sm_id + sm].global_tpc_index = tpc;
+                }
+        }
+        g->gr.no_of_sm = num_sm;
+        nvgpu_log_info(g, " total number of sm = %d", g->gr.no_of_sm);
+exit_build_table:
+        nvgpu_kfree(g, gpc_table);
+        nvgpu_kfree(g, tpc_table);
+        nvgpu_kfree(g, gpc_tpc_mask);
+}
+/*
+ * Combine the per-PES TPC masks of every GPC into one 64-bit mask, with each
+ * GPC's bits placed at an offset of (TPCs per GPC * GPC index), log it, and
+ * write the low and high halves to gr_fe_tpc_fs_r(0) and gr_fe_tpc_fs_r(1).
+ */
+void gr_gv100_load_tpc_mask(struct gk20a *g)
+{
+        struct gr_gk20a *gr = &g->gr;
+        u32 tpcs_per_gpc = nvgpu_get_litter_value(g,
+                                GPU_LIT_NUM_TPC_PER_GPC);
+        u64 fs_mask = 0x0ULL;
+        u32 gpc_idx, pes_idx, gpc_shift;
+        /* gv100 has 6 GPC and 7 TPC/GPC */
+        for (gpc_idx = 0; gpc_idx < gr->gpc_count; gpc_idx++) {
+                /* All PES masks of one GPC land at the same bit offset */
+                gpc_shift = tpcs_per_gpc * gpc_idx;
+                for (pes_idx = 0; pes_idx < gr->pe_count_per_gpc; pes_idx++)
+                        fs_mask |= ((u64)gr->pes_tpc_mask[pes_idx][gpc_idx]) <<
+                                        gpc_shift;
+        }
+        nvgpu_log_info(g, "pes_tpc_mask: %016llx\n", fs_mask);
+        gk20a_writel(g, gr_fe_tpc_fs_r(0), u64_lo32(fs_mask));
+        gk20a_writel(g, gr_fe_tpc_fs_r(1), u64_hi32(fs_mask));
+}
diff --git a/drivers/gpu/nvgpu/gv100/gr_gv100.h b/drivers/gpu/nvgpu/gv100/gr_gv100.h
new file mode 100644
index 00000000..460b05ae
--- /dev/null
+++ b/drivers/gpu/nvgpu/gv100/gr_gv100.h
@@ -0,0 +1,36 @@
+/*
+ * GV100 GPU GR
+ *
+ * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _NVGPU_GR_GV100_H_
+#define _NVGPU_GR_GV100_H_
+/* Forward declaration so the prototypes do not depend on include order */
+struct gk20a;
+/* GR entry points implemented in gv100/gr_gv100.c */
+void gr_gv100_bundle_cb_defaults(struct gk20a *g);
+void gr_gv100_cb_size_default(struct gk20a *g);
+void gr_gv100_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index);
+void gr_gv100_init_sm_id_table(struct gk20a *g);
+void gr_gv100_load_tpc_mask(struct gk20a *g);
+/*
+ * NOTE(review): the two prototypes below have no definition in
+ * gv100/gr_gv100.c as added alongside this header - confirm where they are
+ * meant to be implemented.
+ */
+void gr_gv100_program_sm_id_numbering(struct gk20a *g,
+                                        u32 gpc, u32 tpc, u32 smid);
+int gr_gv100_load_smid_config(struct gk20a *g);
+#endif
diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c
index f20d2dcf..cefaf1ae 100644
--- a/drivers/gpu/nvgpu/gv100/hal_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c
@@ -78,7 +78,7 @@
 #include "gv11b/dbg_gpu_gv11b.h"
 #include "gv11b/hal_gv11b.h"
-#include "gv11b/gr_gv11b.h"
+#include "gv100/gr_gv100.h"
 #include "gv11b/mc_gv11b.h"
 #include "gv11b/ltc_gv11b.h"
 #include "gv11b/gv11b.h"
@@ -263,8 +263,8 @@ static const struct gpu_ops gv100_ops = {
        },
        .gr = {
                .init_gpc_mmu = gr_gv11b_init_gpc_mmu,
-                .bundle_cb_defaults = gr_gv11b_bundle_cb_defaults,
+                .bundle_cb_defaults = gr_gv100_bundle_cb_defaults,
-                .cb_size_default = gr_gv11b_cb_size_default,
+                .cb_size_default = gr_gv100_cb_size_default,
                .calc_global_ctx_buffer_size =
                        gr_gv11b_calc_global_ctx_buffer_size,
                .commit_global_attrib_cb = gr_gv11b_commit_global_attrib_cb,
@@ -285,7 +285,7 @@ static const struct gpu_ops gv100_ops = {
                .set_hww_esr_report_mask = gv11b_gr_set_hww_esr_report_mask,
                .falcon_load_ucode = gr_gm20b_load_ctxsw_ucode_segments,
                .load_ctxsw_ucode = gr_gm20b_load_ctxsw_ucode,
-                .set_gpc_tpc_mask = gr_gv11b_set_gpc_tpc_mask,
+                .set_gpc_tpc_mask = gr_gv100_set_gpc_tpc_mask,
                .get_gpc_tpc_mask = gr_gm20b_get_gpc_tpc_mask,
                .free_channel_ctx = gk20a_free_channel_ctx,
                .alloc_obj_ctx = gk20a_alloc_obj_ctx,
@@ -335,7 +335,7 @@ static const struct gpu_ops gv100_ops = {
                .resume_contexts = gr_gk20a_resume_contexts,
                .get_preemption_mode_flags = gr_gp10b_get_preemption_mode_flags,
                .fuse_override = gp10b_gr_fuse_override,
-                .init_sm_id_table = gr_gv11b_init_sm_id_table,
+                .init_sm_id_table = gr_gv100_init_sm_id_table,
                .load_smid_config = gr_gv11b_load_smid_config,
                .program_sm_id_numbering = gr_gv11b_program_sm_id_numbering,
                .is_ltcs_ltss_addr = gr_gm20b_is_ltcs_ltss_addr,
author	David Nieto <dmartineznie@nvidia.com>	2017-10-16 15:24:59 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2017-10-20 14:55:43 -0400
commit	ed8ac6e005d95e051bd03a182bbe0aa09a3c2266 (patch)
tree	48743edbdca7d14193a5d0ff22e39b692d9936d6
parent	387ecf8a6360f463a129ab569aaef921fe0a2b0e (diff)

diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 8f1d42da..d6d60147 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile
@@ -28,6 +28,7 @@ nvgpu-y += \
28	$(nvgpu-t19x)/gv100/fb_gv100.o \	28	$(nvgpu-t19x)/gv100/fb_gv100.o \
29	$(nvgpu-t19x)/gv100/bios_gv100.o \	29	$(nvgpu-t19x)/gv100/bios_gv100.o \
30	$(nvgpu-t19x)/gv100/fifo_gv100.o \	30	$(nvgpu-t19x)/gv100/fifo_gv100.o \
		31	$(nvgpu-t19x)/gv100/gr_gv100.o \
31	$(nvgpu-t19x)/gv100/hal_gv100.o	32	$(nvgpu-t19x)/gv100/hal_gv100.o
32		33
33	nvgpu-$(CONFIG_TEGRA_GK20A) += $(nvgpu-t19x)/gv11b/platform_gv11b_tegra.o	34	nvgpu-$(CONFIG_TEGRA_GK20A) += $(nvgpu-t19x)/gv11b/platform_gv11b_tegra.o


diff --git a/drivers/gpu/nvgpu/gv100/gr_gv100.c b/drivers/gpu/nvgpu/gv100/gr_gv100.c new file mode 100644 index 00000000..4b2038ba --- /dev/null +++ b/drivers/gpu/nvgpu/gv100/gr_gv100.c
@@ -0,0 +1,289 @@
		1	/*
		2	* GV100 GPU GR
		3	*
		4	* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
		5	*
		6	* Permission is hereby granted, free of charge, to any person obtaining a
		7	* copy of this software and associated documentation files (the "Software"),
		8	* to deal in the Software without restriction, including without limitation
		9	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
		10	* and/or sell copies of the Software, and to permit persons to whom the
		11	* Software is furnished to do so, subject to the following conditions:
		12	*
		13	* The above copyright notice and this permission notice shall be included in
		14	* all copies or substantial portions of the Software.
		15	*
		16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
		17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
		18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
		19	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
		20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
		21	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
		22	* DEALINGS IN THE SOFTWARE.
		23	*/
		24
		25	#include <nvgpu/log.h>
		26	#include <nvgpu/debug.h>
		27	#include <nvgpu/enabled.h>
		28
		29	#include "gk20a/gk20a.h"
		30	#include "gk20a/gr_gk20a.h"
		31
		32	#include "gv100/gr_gv100.h"
		33	#include "gv11b/subctx_gv11b.h"
		34
		35	#include <nvgpu/hw/gv100/hw_gr_gv100.h>
		36	#include <nvgpu/hw/gv100/hw_proj_gv100.h>
		37
		38	/*
		39	* Estimate performance if the given logical TPC in the given logical GPC were
		40	* removed.
		41	*/
		42	static int gr_gv100_scg_estimate_perf(struct gk20a *g,
		43	unsigned long *gpc_tpc_mask,
		44	u32 disable_gpc_id, u32 disable_tpc_id,
		45	int *perf)
		46	{
		47	struct gr_gk20a *gr = &g->gr;
		48	int err = 0;
		49	u32 scale_factor = 512UL; /* Use fx23.9 */
		50	u32 pix_scale = 1024*1024UL; /* Pix perf in [29:20] */
		51	u32 world_scale = 1024UL; /* World performance in [19:10] */
		52	u32 tpc_scale = 1; /* TPC balancing in [9:0] */
		53	u32 scg_num_pes = 0;
		54	u32 min_scg_gpc_pix_perf = scale_factor; /* Init perf as maximum */
		55	u32 average_tpcs = 0; /* Average of # of TPCs per GPC */
		56	u32 deviation; /* absolute diff between TPC# and
		57	* average_tpcs, averaged across GPCs
		58	*/
		59	u32 norm_tpc_deviation; /* deviation/max_tpc_per_gpc */
		60	u32 tpc_balance;
		61	u32 scg_gpc_pix_perf;
		62	u32 scg_world_perf;
		63	u32 gpc_id;
		64	u32 pes_id;
		65	int diff;
		66	bool is_tpc_removed_gpc = false;
		67	bool is_tpc_removed_pes = false;
		68	u32 max_tpc_gpc = 0;
		69	u32 num_tpc_mask;
		70	u32 *num_tpc_gpc = nvgpu_kzalloc(g, sizeof(u32) *
		71	nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));
		72
		73	if (!num_tpc_gpc)
		74	return -ENOMEM;
		75
		76	/* Calculate pix-perf-reduction-rate per GPC and find bottleneck TPC */
		77	for (gpc_id = 0; gpc_id < gr->gpc_count; gpc_id++) {
		78	num_tpc_mask = gpc_tpc_mask[gpc_id];
		79
		80	if ((gpc_id == disable_gpc_id) && num_tpc_mask &
		81	(0x1 << disable_tpc_id)) {
		82	/* Safety check if a TPC is removed twice */
		83	if (is_tpc_removed_gpc) {
		84	err = -EINVAL;
		85	goto free_resources;
		86	}
		87	/* Remove logical TPC from set */
		88	num_tpc_mask &= ~(0x1 << disable_tpc_id);
		89	is_tpc_removed_gpc = true;
		90	}
		91
		92	/* track balancing of tpcs across gpcs */
		93	num_tpc_gpc[gpc_id] = hweight32(num_tpc_mask);
		94	average_tpcs += num_tpc_gpc[gpc_id];
		95
		96	/* save the maximum numer of gpcs */
		97	max_tpc_gpc = num_tpc_gpc[gpc_id] > max_tpc_gpc ?
		98	num_tpc_gpc[gpc_id] : max_tpc_gpc;
		99
		100	/*
		101	* Calculate ratio between TPC count and post-FS and post-SCG
		102	*
		103	* ratio represents relative throughput of the GPC
		104	*/
		105	scg_gpc_pix_perf = scale_factor * num_tpc_gpc[gpc_id] /
		106	gr->gpc_tpc_count[gpc_id];
		107
		108	if (min_scg_gpc_pix_perf > scg_gpc_pix_perf)
		109	min_scg_gpc_pix_perf = scg_gpc_pix_perf;
		110
		111	/* Calculate # of surviving PES */
		112	for (pes_id = 0; pes_id < gr->gpc_ppc_count[gpc_id]; pes_id++) {
		113	/* Count the number of TPC on the set */
		114	num_tpc_mask = gr->pes_tpc_mask[pes_id][gpc_id] &
		115	gpc_tpc_mask[gpc_id];
		116
		117	if ((gpc_id == disable_gpc_id) && (num_tpc_mask &
		118	(0x1 << disable_tpc_id))) {
		119
		120	if (is_tpc_removed_pes) {
		121	err = -EINVAL;
		122	goto free_resources;
		123	}
		124	num_tpc_mask &= ~(0x1 << disable_tpc_id);
		125	is_tpc_removed_pes = true;
		126	}
		127	if (hweight32(num_tpc_mask))
		128	scg_num_pes++;
		129	}
		130	}
		131
		132	if (!is_tpc_removed_gpc || !is_tpc_removed_pes) {
		133	err = -EINVAL;
		134	goto free_resources;
		135	}
		136
		137	if (max_tpc_gpc == 0) {
		138	*perf = 0;
		139	goto free_resources;
		140	}
		141
		142	/* Now calculate perf */
		143	scg_world_perf = (scale_factor * scg_num_pes) / gr->ppc_count;
		144	deviation = 0;
		145	average_tpcs = scale_factor * average_tpcs / gr->gpc_count;
		146	for (gpc_id =0; gpc_id < gr->gpc_count; gpc_id++) {
		147	diff = average_tpcs - scale_factor * num_tpc_gpc[gpc_id];
		148	if (diff < 0)
		149	diff = -diff;
		150	deviation += diff;
		151	}
		152
		153	deviation /= gr->gpc_count;
		154
		155	norm_tpc_deviation = deviation / max_tpc_gpc;
		156
		157	tpc_balance = scale_factor - norm_tpc_deviation;
		158
		159	if ((tpc_balance > scale_factor) ||
		160	(scg_world_perf > scale_factor) ||
		161	(min_scg_gpc_pix_perf > scale_factor) ||
		162	(norm_tpc_deviation > scale_factor)) {
		163	err = -EINVAL;
		164	goto free_resources;
		165	}
		166
		167	*perf = (pix_scale * min_scg_gpc_pix_perf) +
		168	(world_scale * scg_world_perf) +
		169	(tpc_scale * tpc_balance);
		170	free_resources:
		171	nvgpu_kfree(g, num_tpc_gpc);
		172	return err;
		173	}
		174
		175	void gr_gv100_bundle_cb_defaults(struct gk20a *g)
		176	{
		177	struct gr_gk20a *gr = &g->gr;
		178
		179	gr->bundle_cb_default_size =
		180	gr_scc_bundle_cb_size_div_256b__prod_v();
		181	gr->min_gpm_fifo_depth =
		182	gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
		183	gr->bundle_cb_token_limit =
		184	gr_pd_ab_dist_cfg2_token_limit_init_v();
		185	}
		186
		187	void gr_gv100_cb_size_default(struct gk20a *g)
		188	{
		189	struct gr_gk20a *gr = &g->gr;
		190
		191	if (!gr->attrib_cb_default_size)
		192	gr->attrib_cb_default_size =
		193	gr_gpc0_ppc0_cbm_beta_cb_size_v_default_v();
		194	gr->alpha_cb_default_size =
		195	gr_gpc0_ppc0_cbm_alpha_cb_size_v_default_v();
		196	}
		197
		198	void gr_gv100_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index)
		199	{
		200	}
		201
		202	void gr_gv100_init_sm_id_table(struct gk20a *g)
		203	{
		204	u32 gpc, tpc, sm, pes, gtpc;
		205	u32 sm_id = 0;
		206	u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
		207	u32 num_sm = sm_per_tpc * g->gr.tpc_count;
		208	int perf, maxperf;
		209	int err;
		210	unsigned long *gpc_tpc_mask;
		211	u32 *tpc_table, *gpc_table;
		212
		213	gpc_table = nvgpu_kzalloc(g, g->gr.tpc_count * sizeof(u32));
		214	tpc_table = nvgpu_kzalloc(g, g->gr.tpc_count * sizeof(u32));
		215	gpc_tpc_mask = nvgpu_kzalloc(g, sizeof(unsigned long) *
		216	nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));
		217
		218	if (!gpc_table || !tpc_table || !gpc_tpc_mask) {
		219	nvgpu_err(g, "Error allocating memory for sm tables");
		220	goto exit_build_table;
		221	}
		222
		223	for (gpc = 0; gpc < g->gr.gpc_count; gpc++)
		224	for (pes = 0; pes < g->gr.gpc_ppc_count[gpc]; pes++)
		225	gpc_tpc_mask[gpc] |= g->gr.pes_tpc_mask[pes][gpc];
		226
		227	for (gtpc = 0; gtpc < g->gr.tpc_count; gtpc++) {
		228	maxperf = -1;
		229	for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
		230	for_each_set_bit(tpc, &gpc_tpc_mask[gpc],
		231	g->gr.gpc_tpc_count[gpc]) {
		232	perf = -1;
		233	err = gr_gv100_scg_estimate_perf(g,
		234	gpc_tpc_mask, gpc, tpc, &perf);
		235
		236	if (err) {
		237	nvgpu_err(g,
		238	"Error while estimating perf");
		239	goto exit_build_table;
		240	}
		241
		242	if (perf >= maxperf) {
		243	maxperf = perf;
		244	gpc_table[gtpc] = gpc;
		245	tpc_table[gtpc] = tpc;
		246	}
		247	}
		248	}
		249	gpc_tpc_mask[gpc_table[gtpc]] &= ~(0x1 << tpc_table[gtpc]);
		250	}
		251
		252	for (tpc = 0, sm_id = 0; sm_id < num_sm; tpc++, sm_id += sm_per_tpc) {
		253	for (sm = 0; sm < sm_per_tpc; sm++) {
		254	g->gr.sm_to_cluster[sm_id + sm].gpc_index =
		255	gpc_table[tpc];
		256	g->gr.sm_to_cluster[sm_id + sm].tpc_index =
		257	tpc_table[tpc];
		258	g->gr.sm_to_cluster[sm_id + sm].sm_index = sm;
		259	g->gr.sm_to_cluster[sm_id + sm].global_tpc_index = tpc;
		260	}
		261	}
		262
		263	g->gr.no_of_sm = num_sm;
		264	nvgpu_log_info(g, " total number of sm = %d", g->gr.no_of_sm);
		265	exit_build_table:
		266	nvgpu_kfree(g, gpc_table);
		267	nvgpu_kfree(g, tpc_table);
		268	nvgpu_kfree(g, gpc_tpc_mask);
		269	}
		270
		271	void gr_gv100_load_tpc_mask(struct gk20a *g)
		272	{
		273	u64 pes_tpc_mask = 0x0ULL;
		274	u32 gpc, pes;
		275	u32 num_tpc_per_gpc = nvgpu_get_litter_value(g,
		276	GPU_LIT_NUM_TPC_PER_GPC);
		277
		278	/* gv100 has 6 GPC and 7 TPC/GPC */
		279	for (gpc = 0; gpc < g->gr.gpc_count; gpc++) {
		280	for (pes = 0; pes < g->gr.pe_count_per_gpc; pes++) {
		281	pes_tpc_mask |= (u64) g->gr.pes_tpc_mask[pes][gpc] <<
		282	(num_tpc_per_gpc * gpc);
		283	}
		284	}
		285
		286	nvgpu_log_info(g, "pes_tpc_mask: %016llx\n", pes_tpc_mask);
		287	gk20a_writel(g, gr_fe_tpc_fs_r(0), u64_lo32(pes_tpc_mask));
		288	gk20a_writel(g, gr_fe_tpc_fs_r(1), u64_hi32(pes_tpc_mask));
		289	}


diff --git a/drivers/gpu/nvgpu/gv100/gr_gv100.h b/drivers/gpu/nvgpu/gv100/gr_gv100.h new file mode 100644 index 00000000..460b05ae --- /dev/null +++ b/drivers/gpu/nvgpu/gv100/gr_gv100.h
@@ -0,0 +1,36 @@
		1	/*
		2	* GV100 GPU GR
		3	*
		4	* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
		5	*
		6	* Permission is hereby granted, free of charge, to any person obtaining a
		7	* copy of this software and associated documentation files (the "Software"),
		8	* to deal in the Software without restriction, including without limitation
		9	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
		10	* and/or sell copies of the Software, and to permit persons to whom the
		11	* Software is furnished to do so, subject to the following conditions:
		12	*
		13	* The above copyright notice and this permission notice shall be included in
		14	* all copies or substantial portions of the Software.
		15	*
		16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
		17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
		18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
		19	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
		20	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
		21	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
		22	* DEALINGS IN THE SOFTWARE.
		23	*/
		24
		25	#ifndef _NVGPU_GR_GV100_H_
		26	#define _NVGPU_GR_GV100_H_
		27
		28	void gr_gv100_bundle_cb_defaults(struct gk20a *g);
		29	void gr_gv100_cb_size_default(struct gk20a *g);
		30	void gr_gv100_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index);
		31	void gr_gv100_init_sm_id_table(struct gk20a *g);
		32	void gr_gv100_program_sm_id_numbering(struct gk20a *g,
		33	u32 gpc, u32 tpc, u32 smid);
		34	int gr_gv100_load_smid_config(struct gk20a *g);
		35
		36	#endif


diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c index f20d2dcf..cefaf1ae 100644 --- a/drivers/gpu/nvgpu/gv100/hal_gv100.c +++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c
@@ -78,7 +78,7 @@
78		78
79	#include "gv11b/dbg_gpu_gv11b.h"	79	#include "gv11b/dbg_gpu_gv11b.h"
80	#include "gv11b/hal_gv11b.h"	80	#include "gv11b/hal_gv11b.h"
81	#include "gv11b/gr_gv11b.h"	81	#include "gv100/gr_gv100.h"
82	#include "gv11b/mc_gv11b.h"	82	#include "gv11b/mc_gv11b.h"
83	#include "gv11b/ltc_gv11b.h"	83	#include "gv11b/ltc_gv11b.h"
84	#include "gv11b/gv11b.h"	84	#include "gv11b/gv11b.h"
@@ -263,8 +263,8 @@ static const struct gpu_ops gv100_ops = {
263	},	263	},
264	.gr = {	264	.gr = {
265	.init_gpc_mmu = gr_gv11b_init_gpc_mmu,	265	.init_gpc_mmu = gr_gv11b_init_gpc_mmu,
266	.bundle_cb_defaults = gr_gv11b_bundle_cb_defaults,	266	.bundle_cb_defaults = gr_gv100_bundle_cb_defaults,
267	.cb_size_default = gr_gv11b_cb_size_default,	267	.cb_size_default = gr_gv100_cb_size_default,
268	.calc_global_ctx_buffer_size =	268	.calc_global_ctx_buffer_size =
269	gr_gv11b_calc_global_ctx_buffer_size,	269	gr_gv11b_calc_global_ctx_buffer_size,
270	.commit_global_attrib_cb = gr_gv11b_commit_global_attrib_cb,	270	.commit_global_attrib_cb = gr_gv11b_commit_global_attrib_cb,
@@ -285,7 +285,7 @@ static const struct gpu_ops gv100_ops = {
285	.set_hww_esr_report_mask = gv11b_gr_set_hww_esr_report_mask,	285	.set_hww_esr_report_mask = gv11b_gr_set_hww_esr_report_mask,
286	.falcon_load_ucode = gr_gm20b_load_ctxsw_ucode_segments,	286	.falcon_load_ucode = gr_gm20b_load_ctxsw_ucode_segments,
287	.load_ctxsw_ucode = gr_gm20b_load_ctxsw_ucode,	287	.load_ctxsw_ucode = gr_gm20b_load_ctxsw_ucode,
288	.set_gpc_tpc_mask = gr_gv11b_set_gpc_tpc_mask,	288	.set_gpc_tpc_mask = gr_gv100_set_gpc_tpc_mask,
289	.get_gpc_tpc_mask = gr_gm20b_get_gpc_tpc_mask,	289	.get_gpc_tpc_mask = gr_gm20b_get_gpc_tpc_mask,
290	.free_channel_ctx = gk20a_free_channel_ctx,	290	.free_channel_ctx = gk20a_free_channel_ctx,
291	.alloc_obj_ctx = gk20a_alloc_obj_ctx,	291	.alloc_obj_ctx = gk20a_alloc_obj_ctx,
@@ -335,7 +335,7 @@ static const struct gpu_ops gv100_ops = {
335	.resume_contexts = gr_gk20a_resume_contexts,	335	.resume_contexts = gr_gk20a_resume_contexts,
336	.get_preemption_mode_flags = gr_gp10b_get_preemption_mode_flags,	336	.get_preemption_mode_flags = gr_gp10b_get_preemption_mode_flags,
337	.fuse_override = gp10b_gr_fuse_override,	337	.fuse_override = gp10b_gr_fuse_override,
338	.init_sm_id_table = gr_gv11b_init_sm_id_table,	338	.init_sm_id_table = gr_gv100_init_sm_id_table,
339	.load_smid_config = gr_gv11b_load_smid_config,	339	.load_smid_config = gr_gv11b_load_smid_config,
340	.program_sm_id_numbering = gr_gv11b_program_sm_id_numbering,	340	.program_sm_id_numbering = gr_gv11b_program_sm_id_numbering,
341	.is_ltcs_ltss_addr = gr_gm20b_is_ltcs_ltss_addr,	341	.is_ltcs_ltss_addr = gr_gm20b_is_ltcs_ltss_addr,