Diffstat:
 -rw-r--r--  drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h |  6
 -rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_gk20a.h      |  5
 -rw-r--r--  drivers/gpu/nvgpu/gk20a/tsg_gk20a.c     |  2
 -rw-r--r--  drivers/gpu/nvgpu/gk20a/tsg_gk20a.h     |  4
 -rw-r--r--  drivers/gpu/nvgpu/gv11b/gr_gv11b.c      | 37
 -rw-r--r--  drivers/gpu/nvgpu/os/linux/ioctl_dbg.c  | 71
 6 files changed, 64 insertions(+), 61 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
index 4d3c4d74..50002557 100644
--- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
@@ -72,12 +72,6 @@ struct dbg_session_gk20a {
 	bool broadcast_stop_trigger;
 
 	struct nvgpu_mutex ioctl_lock;
-
-	/*
-	 * sm set exception type mask flag, to check whether
-	 * exception type mask is requested or not.
-	 */
-	bool is_sm_exception_type_mask_set;
 };
 
 struct dbg_session_data {
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 0d32cca3..303e1f53 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -420,11 +420,6 @@ struct gr_gk20a {
 	u32 no_of_sm;
 	struct sm_info *sm_to_cluster;
 
-#define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE	(0x0U)
-#define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL	(0x1U << 0)
-	u32 sm_exception_mask_type;
-	u32 sm_exception_mask_refcount;
-
 #if defined(CONFIG_GK20A_CYCLE_STATS)
 	struct nvgpu_mutex cs_lock;
 	struct gk20a_cs_snapshot *cs_data;
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
index 624ee1d7..506d4330 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
@@ -304,6 +304,7 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
 	tsg->timeslice_scale = 0;
 	tsg->runlist_id = ~0;
 	tsg->tgid = pid;
+	tsg->sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
 
 	if (g->ops.fifo.init_eng_method_buffers)
 		g->ops.fifo.init_eng_method_buffers(g, tsg);
@@ -373,6 +374,7 @@ void gk20a_tsg_release(struct nvgpu_ref *ref)
 	release_used_tsg(&g->fifo, tsg);
 
 	tsg->runlist_id = ~0;
+	tsg->sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
 
 	nvgpu_log(g, gpu_dbg_fn, "tsg released %d\n", tsg->tsgid);
 }
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
index 67ccb9f5..1e3be553 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
@@ -78,6 +78,10 @@ struct tsg_gk20a {
 	bool in_use;
 
 	struct nvgpu_tsg_sm_error_state *sm_error_states;
+
+#define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE	(0x0U)
+#define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL	(0x1U << 0)
+	u32 sm_exception_mask_type;
 };
 
 int gk20a_enable_tsg(struct tsg_gk20a *tsg);
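
The two NVGPU_SM_EXCEPTION_TYPE_MASK_* bits defined above now live on the TSG, so each context carries its own exception-masking policy instead of sharing the global state that is removed from struct gr_gk20a earlier in this diff. As a minimal sketch, the fatal bit on the new field would be tested like this (struct tsg_like and tsg_fatal_exceptions_masked are illustrative stand-ins, not driver code):

    #include <stdbool.h>
    #include <stdint.h>

    #define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE   (0x0U)
    #define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL  (0x1U << 0)

    /* Illustrative stand-in for struct tsg_gk20a, reduced to the new field. */
    struct tsg_like {
            uint32_t sm_exception_mask_type;
    };

    /* True when a debug session has asked to mask fatal SM exceptions
     * for this context. */
    static bool tsg_fatal_exceptions_masked(const struct tsg_like *tsg)
    {
            return (tsg->sm_exception_mask_type &
                    NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) != 0U;
    }
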
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index 9a6afa3e..aeb49982 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -2239,7 +2239,7 @@ static int gr_gv11b_handle_warp_esr_error_mmu_nack(struct gk20a *g,
 static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
 {
 	u32 index = 0U;
-	u32 esr_err = gr_gpc0_tpc0_sm0_hww_warp_esr_error_none_f();
+	bool esr_err = false;
 
 	struct warp_esr_error_table_s {
 		u32 error_value;
@@ -2285,7 +2285,7 @@ static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
 
 	for (index = 0; index < ARRAY_SIZE(warp_esr_error_table); index++) {
 		if (warp_esr_error_table[index].error_value == warp_esr_error) {
-			esr_err = warp_esr_error_table[index].error_value;
+			esr_err = true;
 			nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
 				"WARP_ESR %s(0x%x)",
 				warp_esr_error_table[index].error_name,
@@ -2294,8 +2294,9 @@ static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
 		}
 	}
 
-	return (esr_err == 0U) ? false : true;
+	return esr_err;
 }
+
 static int gr_gv11b_handle_all_warp_esr_errors(struct gk20a *g,
 		u32 gpc, u32 tpc, u32 sm,
 		u32 warp_esr_error,
@@ -2316,24 +2317,24 @@ static int gr_gv11b_handle_all_warp_esr_errors(struct gk20a *g,
 		return 0;
 	}
 
-	/*
-	 * Check SET_EXCEPTION_TYPE_MASK is being set.
-	 * If set, skip the recovery and trigger CILP
-	 * If not set, trigger the recovery.
-	 */
-	if ((g->gr.sm_exception_mask_type &
-			NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) ==
-			NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) {
-		nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
-			"SM Exception Type Mask set %d,"
-			"skip recovery",
-			g->gr.sm_exception_mask_type);
-		return 0;
-	}
-
 	if (fault_ch) {
 		tsg = &g->fifo.tsg[fault_ch->tsgid];
 
+		/*
+		 * Check SET_EXCEPTION_TYPE_MASK is being set.
+		 * If set, skip the recovery and trigger CILP
+		 * If not set, trigger the recovery.
+		 */
+		if ((tsg->sm_exception_mask_type &
+				NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) ==
+				NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) {
+			nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
+				"SM Exception Type Mask set %d,"
+				"skip recovery",
+				tsg->sm_exception_mask_type);
+			return 0;
+		}
+
 		nvgpu_rwsem_down_read(&tsg->ch_list_lock);
 		nvgpu_list_for_each_entry(ch_tsg, &tsg->ch_list,
 				channel_gk20a, ch_entry) {
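
Note that the relocated check now sits inside the if (fault_ch) block, so the skip-recovery decision is made against the faulting channel's own TSG; with no faulting channel there is no per-context mask to consult. A condensed sketch of the resulting control flow (the types and the recover() helper below are placeholders, not the driver's):

    #include <stddef.h>
    #include <stdint.h>

    #define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL  (0x1U << 0)

    struct tsg { uint32_t sm_exception_mask_type; };
    struct channel { struct tsg *tsg; };

    /* Placeholder for the driver's recovery path. */
    static void recover(struct channel *ch) { (void)ch; }

    static int handle_fatal_warp_error(struct channel *fault_ch)
    {
            if (fault_ch != NULL) {
                    /* Per-context policy: the debugger asked to mask
                     * fatal SM exceptions, so skip recovery. */
                    if ((fault_ch->tsg->sm_exception_mask_type &
                            NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) != 0U)
                            return 0;

                    recover(fault_ch);
            }
            return 0;
    }
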
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
index 4ac4fb62..3931ab12 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
@@ -223,10 +223,6 @@ int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
 			nvgpu_kfree(g, prof_obj);
 		}
 	}
-
-	nvgpu_set_sm_exception_type_mask_locked(dbg_s,
-		NVGPU_SM_EXCEPTION_TYPE_MASK_NONE);
-
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 
 	nvgpu_mutex_destroy(&dbg_s->ch_list_lock);
@@ -499,7 +495,6 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
 	dbg_s->is_profiler = is_profiler;
 	dbg_s->is_pg_disabled = false;
 	dbg_s->is_timeout_disabled = false;
-	dbg_s->is_sm_exception_type_mask_set = false;
 
 	nvgpu_cond_init(&dbg_s->dbg_events.wait_queue);
 	nvgpu_init_list_node(&dbg_s->ch_list);
@@ -512,9 +507,6 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
 	dbg_s->dbg_events.events_enabled = false;
 	dbg_s->dbg_events.num_pending_events = 0;
 
-	nvgpu_set_sm_exception_type_mask_locked(dbg_s,
-		NVGPU_SM_EXCEPTION_TYPE_MASK_NONE);
-
 	return 0;
 
 err_destroy_lock:
@@ -1887,34 +1879,29 @@ static int nvgpu_set_sm_exception_type_mask_locked(
 		u32 exception_mask)
 {
 	struct gk20a *g = dbg_s->g;
-	struct gr_gk20a *gr = &g->gr;
 	int err = 0;
+	struct channel_gk20a *ch = NULL;
 
-	switch (exception_mask) {
-	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL:
-		gr->sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL;
-		if (dbg_s->is_sm_exception_type_mask_set == false) {
-			gr->sm_exception_mask_refcount++;
-			dbg_s->is_sm_exception_type_mask_set = true;
-		}
-		break;
-	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE:
-		if (dbg_s->is_sm_exception_type_mask_set) {
-			gr->sm_exception_mask_refcount--;
-			dbg_s->is_sm_exception_type_mask_set = false;
-		}
-		if (gr->sm_exception_mask_refcount == 0)
-			gr->sm_exception_mask_type =
-				NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
-		break;
-	default:
-		nvgpu_err(g,
-			"unrecognized dbg sm exception type mask: 0x%x",
-			exception_mask);
-		err = -EINVAL;
-		break;
-	}
+	/*
+	 * Obtain the first channel from the channel list in
+	 * dbg_session, find the context associated with the channel
+	 * and set the sm_mask_type on that context.
+	 */
+	ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
+	if (ch != NULL) {
+		struct tsg_gk20a *tsg;
+
+		tsg = tsg_gk20a_from_ch(ch);
+		if (tsg != NULL) {
+			tsg->sm_exception_mask_type = exception_mask;
+			goto type_mask_end;
+		}
+	}
+
+	nvgpu_log_fn(g, "unable to find the TSG\n");
+	err = -EINVAL;
+
+type_mask_end:
 	return err;
 }
 
@@ -1924,10 +1911,30 @@ static int nvgpu_dbg_gpu_set_sm_exception_type_mask(
 {
 	int err = 0;
 	struct gk20a *g = dbg_s->g;
+	u32 sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
+
+	switch (args->exception_type_mask) {
+	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL:
+		sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL;
+		break;
+	case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE:
+		sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
+		break;
+	default:
+		nvgpu_err(g,
+			"unrecognized dbg sm exception type mask: 0x%x",
+			args->exception_type_mask);
+		err = -EINVAL;
+		break;
+	}
+
+	if (err != 0) {
+		return err;
+	}
 
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 	err = nvgpu_set_sm_exception_type_mask_locked(dbg_s,
-		args->exception_type_mask);
+		sm_exception_mask_type);
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 
 	return err;
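
For completeness, a hedged userspace sketch of exercising this path: the mask constants and the exception_type_mask field are taken from the handler above, while the header path, args struct name, and ioctl request macro are assumptions about the nvgpu uapi, not confirmed by this diff:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/nvgpu.h>   /* assumed uapi header for the dbg ioctls */

    /* Ask the driver to mask fatal SM exceptions for the context bound to
     * this debug session. A channel must already be bound to the session,
     * since the handler resolves the TSG through the session's channel
     * list and returns -EINVAL otherwise. */
    static int mask_fatal_sm_exceptions(int dbg_fd)
    {
            struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args args = {
                    .exception_type_mask =
                            NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL,
            };

            if (ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK,
                            &args) != 0) {
                    perror("set sm exception type mask");
                    return -1;
            }
            return 0;
    }
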