author     Vinod G <vinodg@nvidia.com>                           2018-05-24 17:00:19 -0400
committer  mobile promotions <svcmobile_promotions@nvidia.com>   2018-08-08 07:27:28 -0400
commit     a09b9cd587d27a3ef6479a17631c3497d447e7a9
tree       df543902311f6c0e87458041e13b9f569fc92318
parent     32bcf21f5712fcd872b26ec70ad8987f7db4478f
gpu: nvgpu: Add IOCTL for SM_EXCEPTION_TYPE_MASK
Add a new IOCTL to set the SM_EXCEPTION_TYPE_MASK for a dbg session.
Currently only the SM_EXCEPTION_TYPE_MASK_FATAL type is supported.
If this type is set, the code will skip RC recovery and instead
trigger CILP preemption.

Bug 200412641
JIRA NVGPU-702

Change-Id: I4b1f18379ee792cd324ccc555939e0f4f5c9e3b4
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1729792
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
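For reference, a minimal user-space sketch of exercising the new IOCTL. The
/dev/nvhost-dbg-gpu node path and the error handling are assumptions; the
request code and argument struct are taken from the include/uapi/linux/nvgpu.h
hunk below:

    /*
     * Usage sketch only. The device node path is an assumption; the
     * IOCTL and struct come from the UAPI hunk in this patch.
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/nvgpu.h>

    int main(void)
    {
            struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args args = {
                    .exception_type_mask =
                            NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL,
            };
            int fd = open("/dev/nvhost-dbg-gpu", O_RDWR); /* assumed node path */

            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            /* Request that fatal SM exceptions skip RC recovery and trigger
             * CILP preemption instead, while this session holds the mask. */
            if (ioctl(fd, NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK, &args) < 0)
                    perror("NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK");

            close(fd);
            return 0;
    }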
-rw-r--r--  drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h  |  6
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_gk20a.h       |  6
-rw-r--r--  drivers/gpu/nvgpu/gv11b/gr_gv11b.c       | 22
-rw-r--r--  drivers/gpu/nvgpu/os/linux/ioctl_dbg.c   | 68
-rw-r--r--  include/uapi/linux/nvgpu.h               | 21
5 files changed, 118 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
index 50002557..4d3c4d74 100644
--- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
@@ -72,6 +72,12 @@ struct dbg_session_gk20a {
 	bool broadcast_stop_trigger;
 
 	struct nvgpu_mutex ioctl_lock;
+
+	/*
+	 * sm set exception type mask flag, to check whether
+	 * exception type mask is requested or not.
+	 */
+	bool is_sm_exception_type_mask_set;
 };
 
 struct dbg_session_data {
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 0c6be57b..804e0e25 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -437,6 +437,12 @@ struct gr_gk20a {
 	u32 no_of_sm;
 	struct sm_info *sm_to_cluster;
 	struct nvgpu_gr_sm_error_state *sm_error_states;
+
+#define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE	(0x0U)
+#define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL	(0x1U << 0)
+	u32 sm_exception_mask_type;
+	u32 sm_exception_mask_refcount;
+
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 	struct nvgpu_mutex cs_lock;
 	struct gk20a_cs_snapshot *cs_data;
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index c925e5b6..9e36071f 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -2182,9 +2182,9 @@ static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
 
 	struct warp_esr_error_table_s warp_esr_error_table[] = {
 		{ gr_gpc0_tpc0_sm0_hww_warp_esr_error_stack_error_f(),
 			"STACK ERROR"},
 		{ gr_gpc0_tpc0_sm0_hww_warp_esr_error_api_stack_error_f(),
 			"API STACK ERROR"},
 		{ gr_gpc0_tpc0_sm0_hww_warp_esr_error_pc_wrap_f(),
 			"PC WRAP ERROR"},
 		{ gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_pc_f(),
@@ -2221,7 +2221,7 @@ static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
 		if (warp_esr_error_table[index].error_value == warp_esr_error) {
 			esr_err = warp_esr_error_table[index].error_value;
 			nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
-				"ESR %s(0x%x)",
+				"WARP_ESR %s(0x%x)",
 				warp_esr_error_table[index].error_name,
 				esr_err);
 			break;
@@ -2250,6 +2250,21 @@ static int gr_gv11b_handle_all_warp_esr_errors(struct gk20a *g,
 		return 0;
 	}
 
+	/*
+	 * Check SET_EXCEPTION_TYPE_MASK is being set.
+	 * If set, skip the recovery and trigger CILP
+	 * If not set, trigger the recovery.
+	 */
+	if ((g->gr.sm_exception_mask_type &
+			NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) ==
+			NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) {
+		nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
+			"SM Exception Type Mask set %d,"
+			"skip recovery",
+			g->gr.sm_exception_mask_type);
+		return 0;
+	}
+
 	if (fault_ch) {
 		tsg = &g->fifo.tsg[fault_ch->tsgid];
 
@@ -2294,7 +2309,6 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g,
 	u32 warp_esr_error = gr_gpc0_tpc0_sm0_hww_warp_esr_error_v(warp_esr);
 	struct tsg_gk20a *tsg;
 
-
 	*early_exit = false;
 	*ignore_debugger = false;
 
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
index eadf1f93..ad4dfc0e 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
@@ -151,6 +151,10 @@ static int dbg_unbind_all_channels_gk20a(struct dbg_session_gk20a *dbg_s);
 static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
 			 struct file *filp, bool is_profiler);
 
+static int nvgpu_set_sm_exception_type_mask_locked(
+		struct dbg_session_gk20a *dbg_s,
+		u32 exception_mask);
+
 unsigned int gk20a_dbg_gpu_dev_poll(struct file *filep, poll_table *wait)
 {
 	unsigned int mask = 0;
@@ -217,6 +221,10 @@ int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
 			nvgpu_kfree(g, prof_obj);
 		}
 	}
+
+	nvgpu_set_sm_exception_type_mask_locked(dbg_s,
+			NVGPU_SM_EXCEPTION_TYPE_MASK_NONE);
+
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 
 	nvgpu_mutex_destroy(&dbg_s->ch_list_lock);
@@ -466,6 +474,7 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
 	dbg_s->is_profiler = is_profiler;
 	dbg_s->is_pg_disabled = false;
 	dbg_s->is_timeout_disabled = false;
+	dbg_s->is_sm_exception_type_mask_set = false;
 
 	nvgpu_cond_init(&dbg_s->dbg_events.wait_queue);
 	nvgpu_init_list_node(&dbg_s->ch_list);
@@ -478,6 +487,9 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
 	dbg_s->dbg_events.events_enabled = false;
 	dbg_s->dbg_events.num_pending_events = 0;
 
+	nvgpu_set_sm_exception_type_mask_locked(dbg_s,
+			NVGPU_SM_EXCEPTION_TYPE_MASK_NONE);
+
 	return 0;
 
 err_destroy_lock:
@@ -1839,6 +1851,57 @@ out:
 	return err;
 }
 
+static int nvgpu_set_sm_exception_type_mask_locked(
+		struct dbg_session_gk20a *dbg_s,
+		u32 exception_mask)
+{
+	struct gk20a *g = dbg_s->g;
+	struct gr_gk20a *gr = &g->gr;
+	int err = 0;
+
+	switch (exception_mask) {
+	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL:
+		gr->sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL;
+		if (dbg_s->is_sm_exception_type_mask_set == false) {
+			gr->sm_exception_mask_refcount++;
+			dbg_s->is_sm_exception_type_mask_set = true;
+		}
+		break;
+	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE:
+		if (dbg_s->is_sm_exception_type_mask_set) {
+			gr->sm_exception_mask_refcount--;
+			dbg_s->is_sm_exception_type_mask_set = false;
+		}
+		if (gr->sm_exception_mask_refcount == 0)
+			gr->sm_exception_mask_type =
+				NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
+		break;
+	default:
+		nvgpu_err(g,
+			"unrecognized dbg sm exception type mask: 0x%x",
+			exception_mask);
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static int nvgpu_dbg_gpu_set_sm_exception_type_mask(
+		struct dbg_session_gk20a *dbg_s,
+		struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args *args)
+{
+	int err = 0;
+	struct gk20a *g = dbg_s->g;
+
+	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+	err = nvgpu_set_sm_exception_type_mask_locked(dbg_s,
+			args->exception_type_mask);
+	nvgpu_mutex_release(&g->dbg_sessions_lock);
+
+	return err;
+}
+
 int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp)
 {
 	struct nvgpu_os_linux *l = container_of(inode->i_cdev,
@@ -1994,6 +2057,11 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
 			(struct nvgpu_dbg_gpu_profiler_reserve_args *)buf);
 		break;
 
+	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK:
+		err = nvgpu_dbg_gpu_set_sm_exception_type_mask(dbg_s,
+			(struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args *)buf);
+		break;
+
 	default:
 		nvgpu_err(g,
 			"unrecognized dbg gpu ioctl cmd: 0x%x",
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index 446f5bd3..0733a7b2 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -1411,8 +1411,27 @@ struct nvgpu_dbg_gpu_profiler_reserve_args {
 #define NVGPU_DBG_GPU_IOCTL_PROFILER_RESERVE \
 	_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 22, struct nvgpu_dbg_gpu_profiler_reserve_args)
 
+/*
+ * This struct helps to set the exception mask. If mask is not set
+ * or set to NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE
+ * then kernel code will follow recovery path on sm exception.
+ * If mask is set to NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL, then
+ * kernel code will skip recovery path on sm exception.
+ */
+struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args {
+#define NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE	(0x0U)
+#define NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL	(0x1U << 0U)
+	/* exception type mask value */
+	__u32 exception_type_mask;
+	__u32 reserved;
+};
+
+#define NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK \
+	_IOW(NVGPU_DBG_GPU_IOCTL_MAGIC, 23, \
+	struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args)
+
 #define NVGPU_DBG_GPU_IOCTL_LAST \
-	_IOC_NR(NVGPU_DBG_GPU_IOCTL_PROFILER_RESERVE)
+	_IOC_NR(NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK)
 
 #define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE \
 	sizeof(struct nvgpu_dbg_gpu_access_fb_memory_args)
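Note on the semantics: nvgpu_set_sm_exception_type_mask_locked() pairs a
per-session flag with a global refcount, so gr->sm_exception_mask_type reverts
to NONE only after every session that requested FATAL has cleared it, either
explicitly or implicitly when the session closes (dev_release clears it). A
small sketch of the set/clear lifecycle against an already-open dbg session
fd; the helper name is hypothetical, the UAPI names come from the hunk above:

    /* Lifecycle sketch: the helper name is ours, not part of the patch. */
    #include <sys/ioctl.h>
    #include <linux/nvgpu.h>

    static int set_sm_exception_mask(int dbg_fd, __u32 mask)
    {
            struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args args = {
                    .exception_type_mask = mask,
            };

            /* Pass ..._MASK_FATAL to take a reference on the global mask,
             * ..._MASK_NONE to drop this session's reference again. */
            return ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK,
                         &args);
    }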