From 8fb6a8562ec033d2d1319f91377cd1782f593979 Mon Sep 17 00:00:00 2001 From: David Nieto Date: Mon, 18 Dec 2017 17:10:19 -0800 Subject: gpu: nvgpu: gv11b: Report LTC errors per slice Add support to report ltc ecc errors per slice (1) use new logic to detect subunits (2) store size of array and check before comparison to prevent out of bounds dereferencing (3) use new hashing to prevent collisions or entries with permuted names bug 2037425 Change-Id: I63b9f0df43b9dceddc1bae17924c4723072f569e Signed-off-by: David Nieto Reviewed-on: https://git-master.nvidia.com/r/1620854 GVS: Gerrit_Virtual_Submit Reviewed-by: Chris Dragan Tested-by: Chris Dragan Reviewed-by: Nirav Patel Reviewed-by: mobile promotions Tested-by: mobile promotions --- .../gpu/nvgpu/common/linux/platform_gp10b_tegra.c | 113 ++++++++++++++++----- .../gpu/nvgpu/common/linux/platform_gp10b_tegra.h | 6 +- .../gpu/nvgpu/common/linux/platform_gv11b_tegra.c | 32 ++++++ drivers/gpu/nvgpu/gk20a/ecc_gk20a.h | 1 + drivers/gpu/nvgpu/gp10b/ltc_gp10b.c | 4 +- 5 files changed, 125 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/nvgpu/common/linux/platform_gp10b_tegra.c b/drivers/gpu/nvgpu/common/linux/platform_gp10b_tegra.c index d8bd12d2..8e4cc0a2 100644 --- a/drivers/gpu/nvgpu/common/linux/platform_gp10b_tegra.c +++ b/drivers/gpu/nvgpu/common/linux/platform_gp10b_tegra.c @@ -450,10 +450,11 @@ static struct device_attribute *dev_attr_l2_ecc_ded_count_array; static u32 gen_ecc_hash_key(char *str) { int i = 0; - u32 hash_key = 0; + u32 hash_key = 0x811c9dc5; while (str[i]) { - hash_key += (u32)(str[i]); + hash_key *= 0x1000193; + hash_key ^= (u32)(str[i]); i++; }; @@ -467,10 +468,16 @@ static ssize_t ecc_stat_show(struct device *dev, const char *ecc_stat_full_name = attr->attr.name; const char *ecc_stat_base_name; unsigned int hw_unit; + unsigned int subunit; struct gk20a_ecc_stat *ecc_stat; u32 hash_key; + struct gk20a *g = get_gk20a(dev); - if (sscanf(ecc_stat_full_name, "ltc%u", &hw_unit) == 1) { + if 
(sscanf(ecc_stat_full_name, "ltc%u_lts%u", &hw_unit, + &subunit) == 2) { + ecc_stat_base_name = &(ecc_stat_full_name[strlen("ltc0_lts0_")]); + hw_unit = g->gr.slices_per_ltc * hw_unit + subunit; + } else if (sscanf(ecc_stat_full_name, "ltc%u", &hw_unit) == 1) { ecc_stat_base_name = &(ecc_stat_full_name[strlen("ltc0_")]); } else if (sscanf(ecc_stat_full_name, "gpc0_tpc%u", &hw_unit) == 1) { ecc_stat_base_name = &(ecc_stat_full_name[strlen("gpc0_tpc0_")]); @@ -485,10 +492,13 @@ static ssize_t ecc_stat_show(struct device *dev, } hash_key = gen_ecc_hash_key((char *)ecc_stat_base_name); + hash_for_each_possible(ecc_hash_table, ecc_stat, hash_node, hash_key) { + if (hw_unit >= ecc_stat->count) + continue; if (!strcmp(ecc_stat_full_name, ecc_stat->names[hw_unit])) return snprintf(buf, PAGE_SIZE, "%u\n", ecc_stat->counters[hw_unit]); } @@ -505,16 +515,22 @@ int gr_gp10b_ecc_stat_create(struct device *dev, struct gk20a *g = get_gk20a(dev); char *ltc_unit_name = "ltc"; char *gr_unit_name = "gpc0_tpc"; + char *lts_unit_name = "lts"; int num_hw_units = 0; + int num_subunits = 0; - if (is_l2) + if (is_l2 == 1) num_hw_units = g->ltc_count; - else + else if (is_l2 == 2) { + num_hw_units = g->ltc_count; + num_subunits = g->gr.slices_per_ltc; + } else num_hw_units = g->gr.tpc_count; - return gp10b_ecc_stat_create(dev, num_hw_units, + return gp10b_ecc_stat_create(dev, num_hw_units, num_subunits, is_l2 ? ltc_unit_name : gr_unit_name, + num_subunits ? 
lts_unit_name: NULL, ecc_stat_name, ecc_stat, dev_attr_array); @@ -522,7 +538,9 @@ int gr_gp10b_ecc_stat_create(struct device *dev, int gp10b_ecc_stat_create(struct device *dev, int num_hw_units, + int num_subunits, char *ecc_unit_name, + char *ecc_subunit_name, char *ecc_stat_name, struct gk20a_ecc_stat *ecc_stat, struct device_attribute **__dev_attr_array) @@ -530,21 +548,56 @@ int gp10b_ecc_stat_create(struct device *dev, int error = 0; struct gk20a *g = get_gk20a(dev); int hw_unit = 0; + int subunit = 0; + int element = 0; u32 hash_key = 0; struct device_attribute *dev_attr_array; + int num_elements = num_subunits ? num_subunits*num_hw_units : + num_hw_units; + /* Allocate arrays */ dev_attr_array = nvgpu_kzalloc(g, sizeof(struct device_attribute) * - num_hw_units); - ecc_stat->counters = nvgpu_kzalloc(g, sizeof(u32) * num_hw_units); - ecc_stat->names = nvgpu_kzalloc(g, sizeof(char *) * num_hw_units); - for (hw_unit = 0; hw_unit < num_hw_units; hw_unit++) { + num_elements); + ecc_stat->counters = nvgpu_kzalloc(g, sizeof(u32) * num_elements); + ecc_stat->names = nvgpu_kzalloc(g, sizeof(char *) * num_elements); + for (hw_unit = 0; hw_unit < num_elements; hw_unit++) { ecc_stat->names[hw_unit] = nvgpu_kzalloc(g, sizeof(char) * ECC_STAT_NAME_MAX_SIZE); } + ecc_stat->count = num_elements; + if (num_subunits) { + for (hw_unit = 0; hw_unit < num_hw_units; hw_unit++) { + for (subunit = 0; subunit < num_subunits; subunit++) { + element = hw_unit*num_subunits + subunit; + + snprintf(ecc_stat->names[element], + ECC_STAT_NAME_MAX_SIZE, + "%s%d_%s%d_%s", + ecc_unit_name, + hw_unit, + ecc_subunit_name, + subunit, + ecc_stat_name); + + sysfs_attr_init(&dev_attr_array[element].attr); + dev_attr_array[element].attr.name = + ecc_stat->names[element]; + dev_attr_array[element].attr.mode = + VERIFY_OCTAL_PERMISSIONS(S_IRUGO); + dev_attr_array[element].show = ecc_stat_show; + dev_attr_array[element].store = NULL; + + /* Create sysfs file */ + error |= device_create_file(dev, + 
&dev_attr_array[element]); + + } + } + } else { + for (hw_unit = 0; hw_unit < num_hw_units; hw_unit++) { - for (hw_unit = 0; hw_unit < num_hw_units; hw_unit++) { - /* Fill in struct device_attribute members */ + /* Fill in struct device_attribute members */ snprintf(ecc_stat->names[hw_unit], ECC_STAT_NAME_MAX_SIZE, "%s%d_%s", @@ -552,14 +605,18 @@ int gp10b_ecc_stat_create(struct device *dev, hw_unit, ecc_stat_name); - sysfs_attr_init(&dev_attr_array[hw_unit].attr); - dev_attr_array[hw_unit].attr.name = ecc_stat->names[hw_unit]; - dev_attr_array[hw_unit].attr.mode = VERIFY_OCTAL_PERMISSIONS(S_IRUGO); - dev_attr_array[hw_unit].show = ecc_stat_show; - dev_attr_array[hw_unit].store = NULL; - - /* Create sysfs file */ - error |= device_create_file(dev, &dev_attr_array[hw_unit]); + sysfs_attr_init(&dev_attr_array[hw_unit].attr); + dev_attr_array[hw_unit].attr.name = + ecc_stat->names[hw_unit]; + dev_attr_array[hw_unit].attr.mode = + VERIFY_OCTAL_PERMISSIONS(S_IRUGO); + dev_attr_array[hw_unit].show = ecc_stat_show; + dev_attr_array[hw_unit].store = NULL; + + /* Create sysfs file */ + error |= device_create_file(dev, + &dev_attr_array[hw_unit]); + } } /* Add hash table entry */ @@ -581,8 +638,10 @@ void gr_gp10b_ecc_stat_remove(struct device *dev, struct gk20a *g = get_gk20a(dev); int num_hw_units = 0; - if (is_l2) + if (is_l2 == 1) num_hw_units = g->ltc_count; + else if (is_l2 == 2) + num_hw_units = g->ltc_count * g->gr.slices_per_ltc; else num_hw_units = g->gr.tpc_count; @@ -695,13 +754,13 @@ void gr_gp10b_create_sysfs(struct gk20a *g) &dev_attr_tex_ecc_unique_ded_pipe1_count_array); error |= gr_gp10b_ecc_stat_create(dev, - 1, - "lts0_ecc_sec_count", + 2, + "ecc_sec_count", &g->ecc.ltc.l2_sec_count, &dev_attr_l2_ecc_sec_count_array); error |= gr_gp10b_ecc_stat_create(dev, - 1, - "lts0_ecc_ded_count", + 2, + "ecc_ded_count", &g->ecc.ltc.l2_ded_count, &dev_attr_l2_ecc_ded_count_array); @@ -769,11 +828,11 @@ static void gr_gp10b_remove_sysfs(struct device *dev) 
dev_attr_tex_ecc_unique_ded_pipe1_count_array); gr_gp10b_ecc_stat_remove(dev, - 1, + 2, &g->ecc.ltc.l2_sec_count, dev_attr_l2_ecc_sec_count_array); gr_gp10b_ecc_stat_remove(dev, - 1, + 2, &g->ecc.ltc.l2_ded_count, dev_attr_l2_ecc_ded_count_array); } diff --git a/drivers/gpu/nvgpu/common/linux/platform_gp10b_tegra.h b/drivers/gpu/nvgpu/common/linux/platform_gp10b_tegra.h index 74db60d1..05832e87 100644 --- a/drivers/gpu/nvgpu/common/linux/platform_gp10b_tegra.h +++ b/drivers/gpu/nvgpu/common/linux/platform_gp10b_tegra.h @@ -25,11 +25,13 @@ int gr_gp10b_ecc_stat_create(struct device *dev, struct gk20a_ecc_stat *ecc_stat, struct device_attribute **dev_attr_array); int gp10b_ecc_stat_create(struct device *dev, - int hw_units, + int num_hw_units, + int num_subunits, char *ecc_unit_name, + char *ecc_subunit_name, char *ecc_stat_name, struct gk20a_ecc_stat *ecc_stat, - struct device_attribute **dev_attr_array); + struct device_attribute **__dev_attr_array); void gr_gp10b_ecc_stat_remove(struct device *dev, int is_l2, diff --git a/drivers/gpu/nvgpu/common/linux/platform_gv11b_tegra.c b/drivers/gpu/nvgpu/common/linux/platform_gv11b_tegra.c index 78461b5d..4f4381a5 100644 --- a/drivers/gpu/nvgpu/common/linux/platform_gv11b_tegra.c +++ b/drivers/gpu/nvgpu/common/linux/platform_gv11b_tegra.c @@ -334,112 +334,144 @@ void gr_gv11b_create_sysfs(struct gk20a *g) error |= gp10b_ecc_stat_create(dev, g->ltc_count, + 0, "ltc", + NULL, "l2_cache_uncorrected_err_count", &g->ecc.ltc.t19x.l2_cache_uncorrected_err_count, &dev_attr_l2_cache_ecc_uncorrected_err_count_array); error |= gp10b_ecc_stat_create(dev, g->ltc_count, + 0, "ltc", + NULL, "l2_cache_corrected_err_count", &g->ecc.ltc.t19x.l2_cache_corrected_err_count, &dev_attr_l2_cache_ecc_corrected_err_count_array); error |= gp10b_ecc_stat_create(dev, 1, + 0, "gpc", + NULL, "fecs_ecc_uncorrected_err_count", &g->ecc.gr.t19x.fecs_uncorrected_err_count, &dev_attr_fecs_ecc_uncorrected_err_count_array); error |= gp10b_ecc_stat_create(dev, 1, 
+ 0, "gpc", + NULL, "fecs_ecc_corrected_err_count", &g->ecc.gr.t19x.fecs_corrected_err_count, &dev_attr_fecs_ecc_corrected_err_count_array); error |= gp10b_ecc_stat_create(dev, g->gr.gpc_count, + 0, "gpc", + NULL, "gpccs_ecc_uncorrected_err_count", &g->ecc.gr.t19x.gpccs_uncorrected_err_count, &dev_attr_gpccs_ecc_uncorrected_err_count_array); error |= gp10b_ecc_stat_create(dev, g->gr.gpc_count, + 0, "gpc", + NULL, "gpccs_ecc_corrected_err_count", &g->ecc.gr.t19x.gpccs_corrected_err_count, &dev_attr_gpccs_ecc_corrected_err_count_array); error |= gp10b_ecc_stat_create(dev, g->gr.gpc_count, + 0, "gpc", + NULL, "mmu_l1tlb_ecc_uncorrected_err_count", &g->ecc.gr.t19x.mmu_l1tlb_uncorrected_err_count, &dev_attr_mmu_l1tlb_ecc_uncorrected_err_count_array); error |= gp10b_ecc_stat_create(dev, g->gr.gpc_count, + 0, "gpc", + NULL, "mmu_l1tlb_ecc_corrected_err_count", &g->ecc.gr.t19x.mmu_l1tlb_corrected_err_count, &dev_attr_mmu_l1tlb_ecc_corrected_err_count_array); error |= gp10b_ecc_stat_create(dev, 1, + 0, "eng", + NULL, "mmu_l2tlb_ecc_uncorrected_err_count", &g->ecc.eng.t19x.mmu_l2tlb_uncorrected_err_count, &dev_attr_mmu_l2tlb_ecc_uncorrected_err_count_array); error |= gp10b_ecc_stat_create(dev, 1, + 0, "eng", + NULL, "mmu_l2tlb_ecc_corrected_err_count", &g->ecc.eng.t19x.mmu_l2tlb_corrected_err_count, &dev_attr_mmu_l2tlb_ecc_corrected_err_count_array); error |= gp10b_ecc_stat_create(dev, 1, + 0, "eng", + NULL, "mmu_hubtlb_ecc_uncorrected_err_count", &g->ecc.eng.t19x.mmu_hubtlb_uncorrected_err_count, &dev_attr_mmu_hubtlb_ecc_uncorrected_err_count_array); error |= gp10b_ecc_stat_create(dev, 1, + 0, "eng", + NULL, "mmu_hubtlb_ecc_corrected_err_count", &g->ecc.eng.t19x.mmu_hubtlb_corrected_err_count, &dev_attr_mmu_hubtlb_ecc_corrected_err_count_array); error |= gp10b_ecc_stat_create(dev, 1, + 0, "eng", + NULL, "mmu_fillunit_ecc_uncorrected_err_count", &g->ecc.eng.t19x.mmu_fillunit_uncorrected_err_count, &dev_attr_mmu_fillunit_ecc_uncorrected_err_count_array); error |= 
gp10b_ecc_stat_create(dev, 1, + 0, "eng", + NULL, "mmu_fillunit_ecc_corrected_err_count", &g->ecc.eng.t19x.mmu_fillunit_corrected_err_count, &dev_attr_mmu_fillunit_ecc_corrected_err_count_array); error |= gp10b_ecc_stat_create(dev, 1, + 0, "eng", + NULL, "pmu_ecc_uncorrected_err_count", &g->ecc.eng.t19x.pmu_uncorrected_err_count, &dev_attr_pmu_ecc_uncorrected_err_count_array); error |= gp10b_ecc_stat_create(dev, 1, + 0, "eng", + NULL, "pmu_ecc_corrected_err_count", &g->ecc.eng.t19x.pmu_corrected_err_count, &dev_attr_pmu_ecc_corrected_err_count_array); diff --git a/drivers/gpu/nvgpu/gk20a/ecc_gk20a.h b/drivers/gpu/nvgpu/gk20a/ecc_gk20a.h index 0d1ed5df..57eec1e0 100644 --- a/drivers/gpu/nvgpu/gk20a/ecc_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/ecc_gk20a.h @@ -27,6 +27,7 @@ struct gk20a_ecc_stat { char **names; u32 *counters; + u32 count; #ifdef CONFIG_SYSFS struct hlist_node hash_node; #endif diff --git a/drivers/gpu/nvgpu/gp10b/ltc_gp10b.c b/drivers/gpu/nvgpu/gp10b/ltc_gp10b.c index 6e8a53c5..bf95f1fd 100644 --- a/drivers/gpu/nvgpu/gp10b/ltc_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/ltc_gp10b.c @@ -155,7 +155,7 @@ void gp10b_ltc_isr(struct gk20a *g) ecc_stats_reg_val = gk20a_readl(g, ltc_ltc0_lts0_dstg_ecc_report_r() + offset); - g->ecc.ltc.l2_sec_count.counters[ltc] += + g->ecc.ltc.l2_sec_count.counters[ltc*g->ltc_count + slice] += ltc_ltc0_lts0_dstg_ecc_report_sec_count_v(ecc_stats_reg_val); ecc_stats_reg_val &= ~(ltc_ltc0_lts0_dstg_ecc_report_sec_count_m()); @@ -175,7 +175,7 @@ void gp10b_ltc_isr(struct gk20a *g) ecc_stats_reg_val = gk20a_readl(g, ltc_ltc0_lts0_dstg_ecc_report_r() + offset); - g->ecc.ltc.l2_ded_count.counters[ltc] += + g->ecc.ltc.l2_ded_count.counters[ltc*g->ltc_count + slice] += ltc_ltc0_lts0_dstg_ecc_report_ded_count_v(ecc_stats_reg_val); ecc_stats_reg_val &= ~(ltc_ltc0_lts0_dstg_ecc_report_ded_count_m()); -- cgit v1.2.2