From f56874aec2ec61f2c341b813cc76de5acc51ea12 Mon Sep 17 00:00:00 2001 From: Ranjanikar Nikhil Prabhakarrao Date: Thu, 13 Dec 2018 17:29:20 +0530 Subject: gpu: nvgpu: add speculative barrier Data can be speculatively stored and code flow can be hijacked. To mitigate this problem, insert a speculation barrier. Bug 200447167 Change-Id: Ia865ff2add8b30de49aa970715625b13e8f71c08 Signed-off-by: Ranjanikar Nikhil Prabhakarrao Reviewed-on: https://git-master.nvidia.com/r/1972221 (cherry picked from commit f0762ed4831b3fe6cc953a4a4ec26c2537dcb69f) Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/1996052 Reviewed-by: automaticguardword Reviewed-by: Deepak Nibade Reviewed-by: mobile promotions GVS: Gerrit_Virtual_Submit Tested-by: Deepak Nibade Tested-by: mobile promotions --- drivers/gpu/nvgpu/common/fifo/submit.c | 1 + drivers/gpu/nvgpu/common/fifo/tsg.c | 1 + drivers/gpu/nvgpu/gk20a/gr_gk20a.c | 2 + drivers/gpu/nvgpu/gp10b/gr_gp10b.c | 1 + drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 3 + drivers/gpu/nvgpu/os/linux/dmabuf_vidmem.c | 1 + drivers/gpu/nvgpu/os/linux/ioctl_as.c | 2 + drivers/gpu/nvgpu/os/linux/ioctl_channel.c | 5 + drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c | 11 + drivers/gpu/nvgpu/os/linux/ioctl_dbg.c | 10 + drivers/gpu/nvgpu/os/linux/ioctl_tsg.c | 2 + drivers/gpu/nvgpu/os/linux/sched.c | 1 + drivers/gpu/nvgpu/tu104/gr_tu104.c | 549 +++++++++++++++++++++++++++++ 13 files changed, 589 insertions(+) create mode 100644 drivers/gpu/nvgpu/tu104/gr_tu104.c diff --git a/drivers/gpu/nvgpu/common/fifo/submit.c b/drivers/gpu/nvgpu/common/fifo/submit.c index d518fbfb..b0f38ff1 100644 --- a/drivers/gpu/nvgpu/common/fifo/submit.c +++ b/drivers/gpu/nvgpu/common/fifo/submit.c @@ -212,6 +212,7 @@ static int nvgpu_submit_append_gpfifo_user_direct(struct channel_gk20a *c, u32 end = start + len; /* exclusive */ int err; + nvgpu_speculation_barrier(); if (end > gpfifo_size) { /* wrap-around */ int length0 = gpfifo_size - start; diff --git a/drivers/gpu/nvgpu/common/fifo/tsg.c b/drivers/gpu/nvgpu/common/fifo/tsg.c index f6c718f0..841dd465 100644 --- a/drivers/gpu/nvgpu/common/fifo/tsg.c +++ b/drivers/gpu/nvgpu/common/fifo/tsg.c @@ -219,6 +219,7 @@ int gk20a_tsg_set_runlist_interleave(struct tsg_gk20a *tsg, u32 level) nvgpu_log(g, gpu_dbg_sched, "tsgid=%u interleave=%u", tsg->tsgid, level); + nvgpu_speculation_barrier(); switch (level) { case NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW: case NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_MEDIUM: diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 989ee5c9..636d5714 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -3943,6 +3943,7 @@ int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr, /* no endian swap ?
*/ nvgpu_mutex_acquire(&gr->zbc_lock); + nvgpu_speculation_barrier(); switch (zbc_val->type) { case GK20A_ZBC_TYPE_COLOR: /* search existing tables */ @@ -4047,6 +4048,7 @@ int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr, u32 index = query_params->index_size; u32 i; + nvgpu_speculation_barrier(); switch (query_params->type) { case GK20A_ZBC_TYPE_INVALID: query_params->index_size = GK20A_ZBC_TABLE_SIZE; diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c index 4b42678f..2bcb08a4 100644 --- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c @@ -52,6 +52,7 @@ bool gr_gp10b_is_valid_class(struct gk20a *g, u32 class_num) { bool valid = false; + nvgpu_speculation_barrier(); switch (class_num) { case PASCAL_COMPUTE_A: case PASCAL_A: diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 1dfecfc1..5820a695 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -77,6 +77,7 @@ bool gr_gv11b_is_valid_class(struct gk20a *g, u32 class_num) { bool valid = false; + nvgpu_speculation_barrier(); switch (class_num) { case VOLTA_COMPUTE_A: case VOLTA_A: @@ -106,6 +107,7 @@ bool gr_gv11b_is_valid_gfx_class(struct gk20a *g, u32 class_num) { bool valid = false; + nvgpu_speculation_barrier(); switch (class_num) { case VOLTA_A: case PASCAL_A: @@ -140,6 +142,7 @@ bool gr_gv11b_is_valid_compute_class(struct gk20a *g, u32 class_num) { bool valid = false; + nvgpu_speculation_barrier(); switch (class_num) { case VOLTA_COMPUTE_A: case PASCAL_COMPUTE_A: diff --git a/drivers/gpu/nvgpu/os/linux/dmabuf_vidmem.c b/drivers/gpu/nvgpu/os/linux/dmabuf_vidmem.c index 8b38a9e1..bada5dc7 100644 --- a/drivers/gpu/nvgpu/os/linux/dmabuf_vidmem.c +++ b/drivers/gpu/nvgpu/os/linux/dmabuf_vidmem.c @@ -244,6 +244,7 @@ int nvgpu_vidmem_buf_access_memory(struct gk20a *g, struct dma_buf *dmabuf, vidmem_buf = dmabuf->priv; mem = vidmem_buf->mem; + nvgpu_speculation_barrier(); switch (cmd) { case NVGPU_DBG_GPU_IOCTL_ACCESS_FB_MEMORY_CMD_READ: nvgpu_mem_rd_n(g, mem, offset, buffer, size); diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_as.c b/drivers/gpu/nvgpu/os/linux/ioctl_as.c index 3fa8aa2c..f0cec178 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_as.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_as.c @@ -170,6 +170,7 @@ static int gk20a_as_ioctl_map_buffer_batch( nvgpu_vm_unmap(as_share->vm, unmap_args.offset, &batch); } + nvgpu_speculation_barrier(); if (err) { nvgpu_vm_mapping_batch_finish(as_share->vm, &batch); @@ -355,6 +356,7 @@ long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) return err; + nvgpu_speculation_barrier(); switch (cmd) { case NVGPU_AS_IOCTL_BIND_CHANNEL: trace_gk20a_as_ioctl_bind_channel(g->name); diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c index 22177171..3c844491 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c @@ -290,6 +290,7 @@ static int gk20a_channel_cycle_stats_snapshot(struct channel_gk20a *ch, if (!args->dmabuf_fd) return -EINVAL; + nvgpu_speculation_barrier(); /* handle the command (most frequent cases first) */ switch (args->cmd) { case NVGPU_IOCTL_CHANNEL_CYCLE_STATS_SNAPSHOT_CMD_FLUSH: @@ -874,6 +875,7 @@ clean_up: */ u32 nvgpu_get_common_runlist_level(u32 level) { + nvgpu_speculation_barrier(); switch (level) { case NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW: return NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW; @@ -982,6 +984,7 @@ u32 
nvgpu_get_ioctl_compute_preempt_mode(u32 compute_preempt_mode) */ static u32 nvgpu_get_common_graphics_preempt_mode(u32 graphics_preempt_mode) { + nvgpu_speculation_barrier(); switch (graphics_preempt_mode) { case NVGPU_GRAPHICS_PREEMPTION_MODE_WFI: return NVGPU_PREEMPTION_MODE_GRAPHICS_WFI; @@ -998,6 +1001,7 @@ static u32 nvgpu_get_common_graphics_preempt_mode(u32 graphics_preempt_mode) */ static u32 nvgpu_get_common_compute_preempt_mode(u32 compute_preempt_mode) { + nvgpu_speculation_barrier(); switch (compute_preempt_mode) { case NVGPU_COMPUTE_PREEMPTION_MODE_WFI: return NVGPU_PREEMPTION_MODE_COMPUTE_WFI; @@ -1121,6 +1125,7 @@ long gk20a_channel_ioctl(struct file *filp, /* this ioctl call keeps a ref to the file which keeps a ref to the * channel */ + nvgpu_speculation_barrier(); switch (cmd) { case NVGPU_IOCTL_CHANNEL_OPEN: err = gk20a_channel_open_ioctl(ch->g, diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c index 271c5d92..954b08b5 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c @@ -366,6 +366,7 @@ gk20a_ctrl_ioctl_gpu_characteristics( if (request->gpu_characteristics_buf_size > 0) { size_t write_size = sizeof(gpu); + nvgpu_speculation_barrier(); if (write_size > request->gpu_characteristics_buf_size) write_size = request->gpu_characteristics_buf_size; @@ -556,6 +557,7 @@ static int gk20a_ctrl_get_tpc_masks(struct gk20a *g, if (args->mask_buf_size > 0) { size_t write_size = gpc_tpc_mask_size; + nvgpu_speculation_barrier(); if (write_size > args->mask_buf_size) write_size = args->mask_buf_size; @@ -580,6 +582,7 @@ static int gk20a_ctrl_get_fbp_l2_masks( if (args->mask_buf_size > 0) { size_t write_size = fbp_l2_mask_size; + nvgpu_speculation_barrier(); if (write_size > args->mask_buf_size) write_size = args->mask_buf_size; @@ -1245,6 +1248,7 @@ static int nvgpu_gpu_clk_set_info(struct gk20a *g, nvgpu_gpu_convert_clk_domain(clk_info.clk_domain))) return -EINVAL; } + nvgpu_speculation_barrier(); entry = (struct nvgpu_gpu_clk_info __user *) (uintptr_t)args->clk_info_entries; @@ -1264,6 +1268,7 @@ static int nvgpu_gpu_clk_set_info(struct gk20a *g, nvgpu_gpu_convert_clk_domain(clk_info.clk_domain), freq_mhz); } + nvgpu_speculation_barrier(); ret = nvgpu_clk_arb_commit_request_fd(g, session, fd); if (ret < 0) return ret; @@ -1333,6 +1338,7 @@ static int nvgpu_gpu_clk_get_info(struct gk20a *g, clk_info.clk_type = args->clk_type; } + nvgpu_speculation_barrier(); switch (clk_info.clk_type) { case NVGPU_GPU_CLK_TYPE_TARGET: err = nvgpu_clk_arb_get_session_target_mhz(session, @@ -1366,6 +1372,7 @@ static int nvgpu_gpu_clk_get_info(struct gk20a *g, return -EFAULT; } + nvgpu_speculation_barrier(); args->num_entries = num_entries; return 0; @@ -1403,6 +1410,7 @@ static int nvgpu_gpu_get_voltage(struct gk20a *g, if (err) return err; + nvgpu_speculation_barrier(); switch (args->which) { case NVGPU_GPU_VOLTAGE_CORE: err = volt_get_voltage(g, CTRL_VOLT_DOMAIN_LOGIC, &args->voltage); @@ -1625,6 +1633,7 @@ static int nvgpu_gpu_set_deterministic_opts(struct gk20a *g, break; } + nvgpu_speculation_barrier(); nvgpu_rwsem_up_read(&g->deterministic_busy); out: @@ -1668,6 +1677,7 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg gk20a_idle(g); } + nvgpu_speculation_barrier(); switch (cmd) { case NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE: get_ctx_size_args = (struct nvgpu_gpu_zcull_get_ctx_size_args *)buf; @@ -1713,6 +1723,7 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, 
unsigned long arg zbc_val->format = set_table_args->format; zbc_val->type = set_table_args->type; + nvgpu_speculation_barrier(); switch (zbc_val->type) { case GK20A_ZBC_TYPE_COLOR: for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) { diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c index dc732dc5..0c9b10b5 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c @@ -314,6 +314,7 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state( if (args->sm_error_state_record_size > 0) { size_t write_size = sizeof(*sm_error_state); + nvgpu_speculation_barrier(); if (write_size > args->sm_error_state_record_size) write_size = args->sm_error_state_record_size; @@ -361,6 +362,7 @@ static int nvgpu_dbg_timeout_enable(struct dbg_session_gk20a *dbg_s, nvgpu_log(g, gpu_dbg_gpu_dbg, "Timeouts mode requested : %d", timeout_mode); + nvgpu_speculation_barrier(); switch (timeout_mode) { case NVGPU_DBG_GPU_IOCTL_TIMEOUT_ENABLE: if (dbg_s->is_timeout_disabled == true) @@ -917,6 +919,7 @@ static int nvgpu_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s, ops_offset += num_ops; } + nvgpu_speculation_barrier(); nvgpu_kfree(g, linux_fragment); /* enable powergate, if previously disabled */ @@ -1007,6 +1010,7 @@ static int nvgpu_dbg_gpu_ioctl_smpc_ctxsw_mode(struct dbg_session_gk20a *dbg_s, static u32 nvgpu_hwpm_ctxsw_mode_to_common_mode(u32 mode) { + nvgpu_speculation_barrier(); switch (mode){ case NVGPU_DBG_GPU_HWPM_CTXSW_MODE_NO_CTXSW: return NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW; @@ -1153,6 +1157,7 @@ static int nvgpu_dbg_gpu_ioctl_suspend_resume_sm( goto clean_up; } + nvgpu_speculation_barrier(); switch (action) { case NVGPU_DBG_GPU_SUSPEND_ALL_SMS: gr_gk20a_suspend_context(ch); @@ -1366,6 +1371,7 @@ static int gk20a_dbg_gpu_events_ctrl(struct dbg_session_gk20a *dbg_s, return -EINVAL; } + nvgpu_speculation_barrier(); switch (args->cmd) { case NVGPU_DBG_GPU_EVENTS_CTRL_CMD_ENABLE: gk20a_dbg_gpu_events_enable(dbg_s); @@ -1536,6 +1542,7 @@ nvgpu_dbg_gpu_ioctl_suspend_resume_contexts(struct dbg_session_gk20a *dbg_s, if (err) return err; + nvgpu_speculation_barrier(); switch (args->action) { case NVGPU_DBG_GPU_SUSPEND_ALL_CONTEXTS: err = g->ops.gr.suspend_contexts(g, dbg_s, @@ -1627,6 +1634,7 @@ static int nvgpu_dbg_gpu_ioctl_access_fb_memory(struct dbg_session_gk20a *dbg_s, size -= access_size; offset += access_size; } + nvgpu_speculation_barrier(); fail_idle: gk20a_idle(g); @@ -1899,6 +1907,7 @@ static int nvgpu_dbg_gpu_set_sm_exception_type_mask( struct gk20a *g = dbg_s->g; u32 sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_NONE; + nvgpu_speculation_barrier(); switch (args->exception_type_mask) { case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL: sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL; @@ -1970,6 +1979,7 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd, /* protect from threaded user space calls */ nvgpu_mutex_acquire(&dbg_s->ioctl_lock); + nvgpu_speculation_barrier(); switch (cmd) { case NVGPU_DBG_GPU_IOCTL_BIND_CHANNEL: err = dbg_bind_channel_gk20a(dbg_s, diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c b/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c index a26559f5..2f8cb3ae 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c @@ -361,6 +361,7 @@ static int gk20a_tsg_event_id_ctrl(struct gk20a *g, struct tsg_gk20a *tsg, if (args->event_id >= NVGPU_IOCTL_CHANNEL_EVENT_ID_MAX) return -EINVAL; + nvgpu_speculation_barrier(); switch 
(args->cmd) { case NVGPU_IOCTL_CHANNEL_EVENT_ID_CMD_ENABLE: err = gk20a_tsg_event_id_enable(tsg, args->event_id, &fd); @@ -572,6 +573,7 @@ static int gk20a_tsg_ioctl_read_single_sm_error_state(struct gk20a *g, if (args->record_size > 0) { size_t write_size = sizeof(*sm_error_state); + nvgpu_speculation_barrier(); if (write_size > args->record_size) write_size = args->record_size; diff --git a/drivers/gpu/nvgpu/os/linux/sched.c b/drivers/gpu/nvgpu/os/linux/sched.c index 15cbf1ec..30c58a19 100644 --- a/drivers/gpu/nvgpu/os/linux/sched.c +++ b/drivers/gpu/nvgpu/os/linux/sched.c @@ -447,6 +447,7 @@ long gk20a_sched_dev_ioctl(struct file *filp, unsigned int cmd, return -EFAULT; } + nvgpu_speculation_barrier(); switch (cmd) { case NVGPU_SCHED_IOCTL_GET_TSGS: err = gk20a_sched_dev_ioctl_get_tsgs(g, diff --git a/drivers/gpu/nvgpu/tu104/gr_tu104.c b/drivers/gpu/nvgpu/tu104/gr_tu104.c new file mode 100644 index 00000000..fa6995ac --- /dev/null +++ b/drivers/gpu/nvgpu/tu104/gr_tu104.c @@ -0,0 +1,549 @@ +<<<<<<< HEAD (bbef4c gpu: nvgpu: initialize masks for the perfmon counters 3) +======= +/* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gk20a/gr_gk20a.h" +#include "gk20a/gr_pri_gk20a.h" + +#include "gp10b/gr_gp10b.h" + +#include "gv11b/gr_gv11b.h" + +#include "tu104/gr_tu104.h" + +#include + +bool gr_tu104_is_valid_class(struct gk20a *g, u32 class_num) +{ + nvgpu_speculation_barrier(); + switch (class_num) { + case TURING_CHANNEL_GPFIFO_A: + case TURING_A: + case TURING_COMPUTE_A: + case TURING_DMA_COPY_A: + return true; + default: + break; + } + + return gr_gv11b_is_valid_class(g, class_num); +}; + +bool gr_tu104_is_valid_gfx_class(struct gk20a *g, u32 class_num) +{ + nvgpu_speculation_barrier(); + switch (class_num) { + case TURING_A: + return true; + default: + break; + } + + return gr_gv11b_is_valid_gfx_class(g, class_num); +} + +bool gr_tu104_is_valid_compute_class(struct gk20a *g, u32 class_num) +{ + nvgpu_speculation_barrier(); + switch (class_num) { + case TURING_COMPUTE_A: + return true; + default: + break; + } + + return gr_gv11b_is_valid_compute_class(g, class_num); +} + +int gr_tu104_init_sw_bundle64(struct gk20a *g) +{ + u32 i; + u32 last_bundle_data_lo = 0; + u32 last_bundle_data_hi = 0; + int err = 0; + struct netlist_av64_list *sw_bundle64_init = + &g->netlist_vars->sw_bundle64_init; + + for (i = 0U; i < sw_bundle64_init->count; i++) { + if (i == 0U || + (last_bundle_data_lo != sw_bundle64_init->l[i].value_lo) || + (last_bundle_data_hi != sw_bundle64_init->l[i].value_hi)) { + nvgpu_writel(g, gr_pipe_bundle_data_r(), + sw_bundle64_init->l[i].value_lo); + nvgpu_writel(g, gr_pipe_bundle_data_hi_r(), + sw_bundle64_init->l[i].value_hi); + + last_bundle_data_lo = sw_bundle64_init->l[i].value_lo; + last_bundle_data_hi = sw_bundle64_init->l[i].value_hi; + } + + nvgpu_writel(g, gr_pipe_bundle_address_r(), + sw_bundle64_init->l[i].addr); + + if (gr_pipe_bundle_address_value_v(sw_bundle64_init->l[i].addr) + == GR_GO_IDLE_BUNDLE) { + err = gr_gk20a_wait_idle(g, + gk20a_get_gr_idle_timeout(g), + GR_IDLE_CHECK_DEFAULT); + } else if (nvgpu_platform_is_silicon(g)) { + err = gr_gk20a_wait_fe_idle(g, + gk20a_get_gr_idle_timeout(g), + GR_IDLE_CHECK_DEFAULT); + } + if (err != 0) { + break; + } + } + + return err; +} + +int gr_tu104_alloc_global_ctx_buffers(struct gk20a *g) +{ + int err; + struct gr_gk20a *gr = &g->gr; + u32 rtv_circular_buffer_size; + + nvgpu_log_fn(g, " "); + + rtv_circular_buffer_size = + (gr_scc_rm_rtv_cb_size_div_256b_default_f() + + gr_scc_rm_rtv_cb_size_div_256b_db_adder_f()) * + gr_scc_bundle_cb_size_div_256b_byte_granularity_v(); + nvgpu_log_info(g, "rtv_circular_buffer_size : %u", + rtv_circular_buffer_size); + + err = gk20a_gr_alloc_ctx_buffer(g, + &gr->global_ctx_buffer[RTV_CIRCULAR_BUFFER], + rtv_circular_buffer_size); + if (err != 0) { + return err; + } + + err = gr_gk20a_alloc_global_ctx_buffers(g); + if (err != 0) { + goto clean_up; + } + + return 0; + +clean_up: + nvgpu_err(g, "fail"); + gk20a_gr_destroy_ctx_buffer(g, + &gr->global_ctx_buffer[RTV_CIRCULAR_BUFFER]); + + return err; +} + +int gr_tu104_map_global_ctx_buffers(struct gk20a *g, struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx, bool vpr) +{ + int err; + u64 *g_bfr_va; + u64 *g_bfr_size; + int *g_bfr_index; + struct gr_gk20a *gr = &g->gr; + struct nvgpu_mem *mem; + u64 gpu_va; + + nvgpu_log_fn(g, " "); + + g_bfr_va = gr_ctx->global_ctx_buffer_va; + g_bfr_size = gr_ctx->global_ctx_buffer_size; + g_bfr_index = gr_ctx->global_ctx_buffer_index; + + /* RTV circular buffer */ + mem = &gr->global_ctx_buffer[RTV_CIRCULAR_BUFFER].mem; + gpu_va = 
nvgpu_gmmu_map(vm, mem, mem->size, 0, + gk20a_mem_flag_none, true, mem->aperture); + if (gpu_va == 0ULL) { + return -ENOMEM; + } + + g_bfr_va[RTV_CIRCULAR_BUFFER_VA] = gpu_va; + g_bfr_size[RTV_CIRCULAR_BUFFER_VA] = mem->size; + g_bfr_index[RTV_CIRCULAR_BUFFER_VA] = RTV_CIRCULAR_BUFFER; + + err = gr_gk20a_map_global_ctx_buffers(g, vm, gr_ctx, vpr); + if (err != 0) { + goto clean_up; + } + + return 0; + +clean_up: + nvgpu_err(g, "fail"); + nvgpu_gmmu_unmap(vm, mem, gpu_va); + + return err; +} + +static void gr_tu104_commit_rtv_circular_buffer(struct gk20a *g, + struct nvgpu_gr_ctx *gr_ctx, + u64 addr, u32 size, u32 gfxpAddSize, bool patch) +{ + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_rm_rtv_cb_base_r(), + gr_scc_rm_rtv_cb_base_addr_39_8_f(addr), patch); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_rm_rtv_cb_size_r(), + gr_scc_rm_rtv_cb_size_div_256b_f(size), patch); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_rm_rtv_cb_base_r(), + gr_gpcs_gcc_rm_rtv_cb_base_addr_39_8_f(addr), patch); + gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_rm_gfxp_reserve_r(), + gr_scc_rm_gfxp_reserve_rtv_cb_size_div_256b_f(gfxpAddSize), + patch); +} + +int gr_tu104_commit_global_ctx_buffers(struct gk20a *g, + struct nvgpu_gr_ctx *gr_ctx, bool patch) +{ + int err; + u64 addr; + u32 size; + u32 gfxpaddsize = 0; + + nvgpu_log_fn(g, " "); + + err = gr_gk20a_commit_global_ctx_buffers(g, gr_ctx, patch); + if (err != 0) { + return err; + } + + if (patch) { + int err; + err = gr_gk20a_ctx_patch_write_begin(g, gr_ctx, false); + if (err != 0) { + return err; + } + } + + /* RTV circular buffer */ + addr = gr_ctx->global_ctx_buffer_va[RTV_CIRCULAR_BUFFER_VA] >> + U64(gr_scc_rm_rtv_cb_base_addr_39_8_align_bits_f()); + + size = (gr_scc_rm_rtv_cb_size_div_256b_default_f() + + gr_scc_rm_rtv_cb_size_div_256b_db_adder_f()); + + gr_tu104_commit_rtv_circular_buffer(g, gr_ctx, addr, size, + gfxpaddsize, patch); + + if (patch) { + gr_gk20a_ctx_patch_write_end(g, gr_ctx, false); + } + + return 0; +} + +int gr_tu104_alloc_gfxp_rtv_cb(struct gk20a *g, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm) +{ + int err; + u32 rtv_cb_size; + + nvgpu_log_fn(g, " "); + + rtv_cb_size = + (gr_scc_rm_rtv_cb_size_div_256b_default_f() + + gr_scc_rm_rtv_cb_size_div_256b_db_adder_f() + + gr_scc_rm_rtv_cb_size_div_256b_gfxp_adder_f()) * + gr_scc_rm_rtv_cb_size_div_256b_byte_granularity_v(); + + err = gr_gp10b_alloc_buffer(vm, + rtv_cb_size, + &gr_ctx->gfxp_rtvcb_ctxsw_buffer); + + return err; +} + +void gr_tu104_commit_gfxp_rtv_cb(struct gk20a *g, + struct nvgpu_gr_ctx *gr_ctx, bool patch) +{ + u64 addr; + u32 rtv_cb_size; + u32 gfxp_addr_size; + + nvgpu_log_fn(g, " "); + + rtv_cb_size = + (gr_scc_rm_rtv_cb_size_div_256b_default_f() + + gr_scc_rm_rtv_cb_size_div_256b_db_adder_f() + + gr_scc_rm_rtv_cb_size_div_256b_gfxp_adder_f()); + gfxp_addr_size = gr_scc_rm_rtv_cb_size_div_256b_gfxp_adder_f(); + + /* GFXP RTV circular buffer */ + addr = (u64)(u64_lo32(gr_ctx->gfxp_rtvcb_ctxsw_buffer.gpu_va) >> + gr_scc_rm_rtv_cb_base_addr_39_8_align_bits_f()) | + (u64)(u64_hi32(gr_ctx->gfxp_rtvcb_ctxsw_buffer.gpu_va) << + (32U - gr_scc_rm_rtv_cb_base_addr_39_8_align_bits_f())); + + + gr_tu104_commit_rtv_circular_buffer(g, gr_ctx, addr, + rtv_cb_size, + gfxp_addr_size, + patch); +} + +void gr_tu104_bundle_cb_defaults(struct gk20a *g) +{ + struct gr_gk20a *gr = &g->gr; + + gr->bundle_cb_default_size = + gr_scc_bundle_cb_size_div_256b__prod_v(); + gr->min_gpm_fifo_depth = + gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v(); + gr->bundle_cb_token_limit = + 
gr_pd_ab_dist_cfg2_token_limit_init_v(); +} + +void gr_tu104_cb_size_default(struct gk20a *g) +{ + struct gr_gk20a *gr = &g->gr; + + if (gr->attrib_cb_default_size == 0U) { + gr->attrib_cb_default_size = + gr_gpc0_ppc0_cbm_beta_cb_size_v_default_v(); + } + gr->alpha_cb_default_size = + gr_gpc0_ppc0_cbm_alpha_cb_size_v_default_v(); + gr->attrib_cb_gfxp_default_size = + gr_gpc0_ppc0_cbm_beta_cb_size_v_gfxp_v(); + gr->attrib_cb_gfxp_size = + gr_gpc0_ppc0_cbm_beta_cb_size_v_gfxp_v(); +} + +void gr_tu104_free_gr_ctx(struct gk20a *g, + struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx) +{ + nvgpu_log_fn(g, " "); + + if (gr_ctx != NULL) { + nvgpu_dma_unmap_free(vm, &gr_ctx->gfxp_rtvcb_ctxsw_buffer); + } + + gr_gk20a_free_gr_ctx(g, vm, gr_ctx); +} + +void gr_tu104_enable_gpc_exceptions(struct gk20a *g) +{ + struct gr_gk20a *gr = &g->gr; + u32 tpc_mask; + + gk20a_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(), + gr_gpcs_tpcs_tpccs_tpc_exception_en_sm_enabled_f()); + + tpc_mask = + gr_gpcs_gpccs_gpc_exception_en_tpc_f((1 << gr->max_tpc_per_gpc_count) - 1); + + gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), + (tpc_mask | gr_gpcs_gpccs_gpc_exception_en_gcc_f(1) | + gr_gpcs_gpccs_gpc_exception_en_gpccs_f(1) | + gr_gpcs_gpccs_gpc_exception_en_gpcmmu_f(1))); +} + +int gr_tu104_get_offset_in_gpccs_segment(struct gk20a *g, + enum ctxsw_addr_type addr_type, + u32 num_tpcs, + u32 num_ppcs, + u32 reg_list_ppc_count, + u32 *__offset_in_segment) +{ + u32 offset_in_segment = 0; + u32 num_pes_per_gpc = nvgpu_get_litter_value(g, + GPU_LIT_NUM_PES_PER_GPC); + + if (addr_type == CTXSW_ADDR_TYPE_TPC) { + /* + * reg = g->netlist_vars->ctxsw_regs.tpc.l; + * offset_in_segment = 0; + */ + } else if (addr_type == CTXSW_ADDR_TYPE_PPC) { + /* + * The ucode stores TPC data before PPC data. + * Advance offset past TPC data to PPC data. + */ + offset_in_segment = + ((g->netlist_vars->ctxsw_regs.tpc.count * + num_tpcs) << 2); + } else if (addr_type == CTXSW_ADDR_TYPE_GPC) { + /* + * The ucode stores TPC/PPC data before GPC data. + * Advance offset past TPC/PPC data to GPC data. 
+ * + * Note 1 PES_PER_GPC case + */ + if (num_pes_per_gpc > 1U) { + offset_in_segment = + (((g->netlist_vars->ctxsw_regs.tpc.count * + num_tpcs) << 2) + + ((reg_list_ppc_count * num_ppcs) << 2)); + } else { + offset_in_segment = + ((g->netlist_vars->ctxsw_regs.tpc.count * + num_tpcs) << 2); + } + } else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) || + (addr_type == CTXSW_ADDR_TYPE_ETPC)) { + if (num_pes_per_gpc > 1U) { + offset_in_segment = + ((g->netlist_vars->ctxsw_regs.tpc.count * + num_tpcs) << 2) + + ((reg_list_ppc_count * num_ppcs) << 2) + + (g->netlist_vars->ctxsw_regs.gpc.count << 2); + } else { + offset_in_segment = + ((g->netlist_vars->ctxsw_regs.tpc.count * + num_tpcs) << 2) + + (g->netlist_vars->ctxsw_regs.gpc.count << 2); + } + + /* aligned to next 256 byte */ + offset_in_segment = ALIGN(offset_in_segment, 256); + + nvgpu_log(g, gpu_dbg_info | gpu_dbg_gpu_dbg, + "egpc etpc offset_in_segment 0x%#08x", + offset_in_segment); + } else { + nvgpu_log_fn(g, "Unknown address type."); + return -EINVAL; + } + + *__offset_in_segment = offset_in_segment; + return 0; +} + +static void gr_tu104_set_sm_disp_ctrl(struct gk20a *g, u32 data) +{ + u32 reg_val; + + nvgpu_log_fn(g, " "); + + reg_val = nvgpu_readl(g, gr_gpcs_tpcs_sm_disp_ctrl_r()); + + if ((data & NVC5C0_SET_SM_DISP_CTRL_COMPUTE_SHADER_QUAD_MASK) + == NVC5C0_SET_SM_DISP_CTRL_COMPUTE_SHADER_QUAD_DISABLE) { + reg_val = set_field(reg_val, + gr_gpcs_tpcs_sm_disp_ctrl_compute_shader_quad_m(), + gr_gpcs_tpcs_sm_disp_ctrl_compute_shader_quad_disable_f() + ); + } else if ((data & NVC5C0_SET_SM_DISP_CTRL_COMPUTE_SHADER_QUAD_MASK) + == NVC5C0_SET_SM_DISP_CTRL_COMPUTE_SHADER_QUAD_ENABLE) { + reg_val = set_field(reg_val, + gr_gpcs_tpcs_sm_disp_ctrl_compute_shader_quad_m(), + gr_gpcs_tpcs_sm_disp_ctrl_compute_shader_quad_enable_f() + ); + } + + nvgpu_writel(g, gr_gpcs_tpcs_sm_disp_ctrl_r(), reg_val); +} + +int gr_tu104_handle_sw_method(struct gk20a *g, u32 addr, + u32 class_num, u32 offset, u32 data) +{ + nvgpu_log_fn(g, " "); + + if (class_num == TURING_COMPUTE_A) { + switch (offset << 2) { + case NVC5C0_SET_SHADER_EXCEPTIONS: + gv11b_gr_set_shader_exceptions(g, data); + break; + case NVC5C0_SET_SKEDCHECK: + gr_gv11b_set_skedcheck(g, data); + break; + case NVC5C0_SET_SM_DISP_CTRL: + gr_tu104_set_sm_disp_ctrl(g, data); + break; + case NVC5C0_SET_SHADER_CUT_COLLECTOR: + gr_gv11b_set_shader_cut_collector(g, data); + break; + default: + goto fail; + } + } + + if (class_num == TURING_A) { + switch (offset << 2) { + case NVC597_SET_SHADER_EXCEPTIONS: + gv11b_gr_set_shader_exceptions(g, data); + break; + case NVC597_SET_CIRCULAR_BUFFER_SIZE: + g->ops.gr.set_circular_buffer_size(g, data); + break; + case NVC597_SET_ALPHA_CIRCULAR_BUFFER_SIZE: + g->ops.gr.set_alpha_circular_buffer_size(g, data); + break; + case NVC597_SET_GO_IDLE_TIMEOUT: + gr_gv11b_set_go_idle_timeout(g, data); + break; + case NVC097_SET_COALESCE_BUFFER_SIZE: + gr_gv11b_set_coalesce_buffer_size(g, data); + break; + case NVC597_SET_TEX_IN_DBG: + gr_gv11b_set_tex_in_dbg(g, data); + break; + case NVC597_SET_SKEDCHECK: + gr_gv11b_set_skedcheck(g, data); + break; + case NVC597_SET_BES_CROP_DEBUG3: + g->ops.gr.set_bes_crop_debug3(g, data); + break; + case NVC597_SET_BES_CROP_DEBUG4: + g->ops.gr.set_bes_crop_debug4(g, data); + break; + case NVC597_SET_SM_DISP_CTRL: + gr_tu104_set_sm_disp_ctrl(g, data); + break; + case NVC597_SET_SHADER_CUT_COLLECTOR: + gr_gv11b_set_shader_cut_collector(g, data); + break; + default: + goto fail; + } + } + return 0; + +fail: + return -EINVAL; +} + +void 
gr_tu104_init_sm_dsm_reg_info(void) +{ + return; +} + +void gr_tu104_get_sm_dsm_perf_ctrl_regs(struct gk20a *g, + u32 *num_sm_dsm_perf_ctrl_regs, + u32 **sm_dsm_perf_ctrl_regs, + u32 *ctrl_register_stride) +{ + *num_sm_dsm_perf_ctrl_regs = 0; + *sm_dsm_perf_ctrl_regs = NULL; + *ctrl_register_stride = 0; +} +>>>>>>> CHANGE (f0762e gpu: nvgpu: add speculative barrier) --
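The pattern applied throughout this patch is easier to see in isolation: a value copied from user space is bounds-checked (or matched against a case list) and then used to index a table or drive a switch, and nvgpu_speculation_barrier() is inserted between the check and the dependent use so the CPU cannot speculatively execute that use with an out-of-range value (the classic Spectre variant 1 gadget). The sketch below is illustrative only and is not part of the patch; the table, the lookup helper, and the stub barrier are hypothetical stand-ins for the driver's real nvgpu_speculation_barrier() wrapper.

/* demo_spec_barrier.c -- illustrative sketch, not part of the nvgpu patch. */
#include <stdint.h>
#include <stdio.h>

#define DEMO_TABLE_SIZE 4U

static const uint32_t demo_table[DEMO_TABLE_SIZE] = { 10U, 20U, 30U, 40U };

/*
 * Hypothetical stand-in for nvgpu_speculation_barrier(). A plain compiler
 * barrier is used here only so the example builds anywhere; the real wrapper
 * expands to the platform's speculation-barrier instruction sequence.
 */
static inline void demo_speculation_barrier(void)
{
	__asm__ __volatile__("" ::: "memory");
}

/* Bounds-check an untrusted index, then fence before the dependent use. */
static int demo_lookup(uint32_t untrusted_index, uint32_t *out)
{
	if (untrusted_index >= DEMO_TABLE_SIZE)
		return -1;	/* architectural bounds check */

	/*
	 * Without a barrier the CPU may speculatively run the load below with
	 * an out-of-bounds index before the branch above resolves, leaving a
	 * measurable cache footprint. A real speculation barrier here keeps
	 * the dependent access from executing speculatively, mirroring where
	 * the patch places nvgpu_speculation_barrier() before each switch or
	 * array use of user-supplied values.
	 */
	demo_speculation_barrier();

	*out = demo_table[untrusted_index];
	return 0;
}

int main(void)
{
	uint32_t v;

	if (demo_lookup(2U, &v) == 0)
		printf("table[2] = %u\n", (unsigned int)v);
	if (demo_lookup(100U, &v) != 0)
		printf("index 100 rejected\n");
	return 0;
}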