-rw-r--r--  drivers/gpu/nvgpu/Kconfig                         |  10
-rw-r--r--  drivers/gpu/nvgpu/Makefile                        |  14
-rw-r--r--  drivers/gpu/nvgpu/gk20a/as_gk20a.c                |   6
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c           | 158
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.h           |   4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c      |   4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c              |  96
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c       | 586
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h       |  41
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c        | 763
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.h        |  20
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.c              | 328
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.h              |   8
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.c                   |  73
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h                   |  49
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c             |  26
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_gk20a.c                |  26
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hal_gk20a.c               |   2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h     | 190
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h            |   6
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h          |  10
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ltc_gk20a.c               |  14
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c                |  76
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h                |   7
-rw-r--r--  drivers/gpu/nvgpu/gk20a/platform_gk20a.h          |   2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c    |   4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/pmu_gk20a.c               |   2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/tsg_gk20a.c               |   1
-rw-r--r--  drivers/gpu/nvgpu/gk20a/tsg_gk20a.h               |   2
-rw-r--r--  drivers/gpu/nvgpu/gm20b/acr_gm20b.c               |   2
-rw-r--r--  drivers/gpu/nvgpu/gm20b/fifo_gm20b.c              |   4
-rw-r--r--  drivers/gpu/nvgpu/gm20b/hw_ltc_gm20b.h            |   6
-rw-r--r--  drivers/gpu/nvgpu/gm20b/hw_timer_gm20b.h          |  10
-rw-r--r--  drivers/gpu/nvgpu/gm20b/ltc_gm20b.c               |   3
-rw-r--r--  drivers/gpu/nvgpu/gm20b/therm_gm20b.c             |   1
-rw-r--r--  drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.c          |  21
-rw-r--r--  drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.h          |  20
-rw-r--r--  drivers/gpu/nvgpu/vgpu/fifo_vgpu.c                |  63
-rw-r--r--  drivers/gpu/nvgpu/vgpu/vgpu.c                     |   2
-rw-r--r--  include/linux/tegra_vgpu.h                        |  20
-rw-r--r--  include/trace/events/gk20a.h                      |   6
-rw-r--r--  include/uapi/linux/nvgpu.h                        | 154
42 files changed, 2537 insertions(+), 303 deletions(-)
diff --git a/drivers/gpu/nvgpu/Kconfig b/drivers/gpu/nvgpu/Kconfig
index d0e25aa2..94173976 100644
--- a/drivers/gpu/nvgpu/Kconfig
+++ b/drivers/gpu/nvgpu/Kconfig
@@ -54,6 +54,16 @@ config GK20A_CYCLE_STATS
 	help
 	  Say Y here to enable the cycle stats debugging features.
 
+config GK20A_CTXSW_TRACE
+	bool "Support GK20A Context Switch tracing"
+	depends on GK20A
+	default n
+	help
+	  Enable support for the GK20A Context Switch Tracing. In this mode,
+	  FECS collects timestamps for contexts loaded on GR engine. This
+	  allows tracking context switches on GR engine, as well as
+	  identifying processes that submitted work.
+
 config TEGRA_GK20A
 	bool "Enable the GK20A GPU on Tegra"
 	depends on TEGRA_GRHOST || TEGRA_HOST1X
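The option above gates the two new files added below (ctxsw_trace_gk20a.c and fecs_trace_gk20a.c). For orientation, here is a minimal sketch of how a userspace tool might drive the resulting ctxsw-trace device. The device node path is an assumption; the ioctl names and the nvgpu_ctxsw_trace_entry record come from the uapi additions in this patch (include/uapi/linux/nvgpu.h, not shown in this excerpt):

        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/ioctl.h>
        #include <unistd.h>
        #include <linux/nvgpu.h>

        int main(void)
        {
                struct nvgpu_ctxsw_trace_entry ents[32];
                ssize_t n;
                int i, fd;

                fd = open("/dev/nvhost-ctxsw-gpu", O_RDONLY); /* node name is an assumption */
                if (fd < 0)
                        return 1;

                ioctl(fd, NVGPU_CTXSW_IOCTL_TRACE_ENABLE);    /* start FECS tracing */
                n = read(fd, ents, sizeof(ents));             /* read() returns whole records */
                for (i = 0; i < (int)(n / sizeof(ents[0])); i++)
                        printf("tag=%x pid=%lld ts=%llx\n", ents[i].tag,
                                (long long)ents[i].pid, (long long)ents[i].timestamp);
                ioctl(fd, NVGPU_CTXSW_IOCTL_TRACE_DISABLE);
                close(fd);
                return 0;
        }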
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index f6b3a673..df660eb7 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -8,9 +8,9 @@ ccflags-y += -Werror
 ccflags-y += -Wno-error=cpp
 
 ifeq ($(CONFIG_ARCH_TEGRA_18x_SOC),y)
-ccflags-$(CONFIG_GK20A) += -I$(srctree)/../kernel-t18x/drivers/gpu/nvgpu
-ccflags-$(CONFIG_GK20A) += -I$(srctree)/../kernel-t18x/include
-ccflags-$(CONFIG_GK20A) += -I$(srctree)/../kernel-t18x/include/uapi
+ccflags-y += -I$(srctree)/../kernel-t18x/drivers/gpu/nvgpu
+ccflags-y += -I$(srctree)/../kernel-t18x/include
+ccflags-y += -I$(srctree)/../kernel-t18x/include/uapi
 endif
 
 obj-$(CONFIG_GK20A) := nvgpu.o
@@ -46,6 +46,8 @@ nvgpu-y := \
 	gk20a/cde_gk20a.o \
 	gk20a/platform_gk20a_generic.o \
 	gk20a/tsg_gk20a.o \
+	gk20a/ctxsw_trace_gk20a.o \
+	gk20a/fecs_trace_gk20a.o \
 	gk20a/mc_gk20a.o \
 	gm20b/hal_gm20b.o \
 	gm20b/ltc_gm20b.o \
@@ -64,7 +66,6 @@ nvgpu-y := \
 	gm20b/debug_gm20b.o \
 	gm20b/cde_gm20b.o \
 	gm20b/therm_gm20b.o
-
 nvgpu-$(CONFIG_TEGRA_GK20A) += gk20a/platform_gk20a_tegra.o
 nvgpu-$(CONFIG_SYNC) += gk20a/sync_gk20a.o
 
@@ -78,6 +79,7 @@ nvgpu-$(CONFIG_TEGRA_GR_VIRTUALIZATION) += \
 	vgpu/debug_vgpu.o \
 	vgpu/vgpu.o \
 	vgpu/dbg_vgpu.o \
+	vgpu/fecs_trace_vgpu.o \
 	vgpu/gk20a/vgpu_hal_gk20a.o \
 	vgpu/gk20a/vgpu_gr_gk20a.o \
 	vgpu/gm20b/vgpu_hal_gm20b.o \
@@ -94,7 +96,5 @@ nvgpu-$(CONFIG_GK20A_CYCLE_STATS) += \
 	gk20a/css_gr_gk20a.o
 
 ifeq ($(CONFIG_ARCH_TEGRA_18x_SOC),y)
-ccflags-$(CONFIG_GK20A) += -I$(srctree)/../kernel-t18x/drivers/gpu/nvgpu
-ccflags-$(CONFIG_GK20A) += -I$(srctree)/../kernel-t18x/include
-obj-$(CONFIG_GK20A) += ../../../../kernel-t18x/drivers/gpu/nvgpu/
+include ../kernel-t18x/drivers/gpu/nvgpu/Makefile
 endif
diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.c b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
index b6b38541..0571ca1f 100644
--- a/drivers/gpu/nvgpu/gk20a/as_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
@@ -279,13 +279,15 @@ static int gk20a_as_ioctl_get_va_regions(
 
 	for (i = 0; i < write_entries; ++i) {
 		struct nvgpu_as_va_region region;
+		struct gk20a_allocator *vma = vm->fixed.init ?
+			&vm->fixed : &vm->vma[i];
 
 		memset(&region, 0, sizeof(struct nvgpu_as_va_region));
 
 		region.page_size = vm->gmmu_page_sizes[i];
-		region.offset = vm->vma[i].base;
+		region.offset = vma->base;
 		/* No __aeabi_uldivmod() on some platforms... */
-		region.pages = (vm->vma[i].end - vm->vma[i].start) >>
+		region.pages = (vma->end - vma->start) >>
 			ilog2(region.page_size);
 
 		if (copy_to_user(user_region_ptr + i, &region, sizeof(region)))
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 1f63bbd8..20976992 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -28,6 +28,7 @@
 #include <linux/vmalloc.h>
 
 #include "debug_gk20a.h"
+#include "ctxsw_trace_gk20a.h"
 
 #include "gk20a.h"
 #include "dbg_gpu_gk20a.h"
@@ -44,6 +45,9 @@
 
 #define NVGPU_BEGIN_AGGRESSIVE_SYNC_DESTROY_LIMIT	64	/* channels */
 
+#define NVGPU_CHANNEL_MIN_TIMESLICE_US 1000
+#define NVGPU_CHANNEL_MAX_TIMESLICE_US 50000
+
 static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f);
 static void free_channel(struct fifo_gk20a *f, struct channel_gk20a *c);
 
@@ -177,7 +181,7 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g,
 }
 
 static int channel_gk20a_set_schedule_params(struct channel_gk20a *c,
-				u32 timeslice_period, bool interleave)
+				u32 timeslice_period)
 {
 	void *inst_ptr;
 	int shift = 0, value = 0;
@@ -205,30 +209,6 @@ static int channel_gk20a_set_schedule_params(struct channel_gk20a *c,
 		gk20a_readl(c->g, ccsr_channel_r(c->hw_chid)) |
 		ccsr_channel_enable_set_true_f());
 
-	if (c->interleave != interleave) {
-		mutex_lock(&c->g->interleave_lock);
-		c->interleave = interleave;
-		if (interleave)
-			if (c->g->num_interleaved_channels >=
-					MAX_INTERLEAVED_CHANNELS) {
-				gk20a_err(dev_from_gk20a(c->g),
-					"Change of priority would exceed runlist length, only changing timeslice\n");
-				c->interleave = false;
-			} else
-				c->g->num_interleaved_channels += 1;
-		else
-			c->g->num_interleaved_channels -= 1;
-
-		mutex_unlock(&c->g->interleave_lock);
-		gk20a_dbg_info("Set channel %d to interleave %d",
-			c->hw_chid, c->interleave);
-
-		gk20a_fifo_set_channel_priority(
-			c->g, 0, c->hw_chid, c->interleave);
-		c->g->ops.fifo.update_runlist(
-			c->g, 0, ~0, true, false);
-	}
-
 	return 0;
 }
 
@@ -238,6 +218,12 @@ u32 channel_gk20a_pbdma_acquire_val(struct channel_gk20a *c)
 	u64 timeout;
 	int val_len;
 
+	val = pbdma_acquire_retry_man_2_f() |
+		pbdma_acquire_retry_exp_2_f();
+
+	if (!c->g->timeouts_enabled)
+		return val;
+
 	timeout = gk20a_get_channel_watchdog_timeout(c);
 	do_div(timeout, 2); /* set acquire timeout to half of channel wdt */
 	timeout *= 1000000UL; /* ms -> ns */
@@ -256,11 +242,10 @@ u32 channel_gk20a_pbdma_acquire_val(struct channel_gk20a *c)
 		man = timeout;
 	}
 
-	val = pbdma_acquire_retry_man_2_f() |
-		pbdma_acquire_retry_exp_2_f() |
-		pbdma_acquire_timeout_exp_f(exp) |
+	val |= pbdma_acquire_timeout_exp_f(exp) |
 		pbdma_acquire_timeout_man_f(man) |
 		pbdma_acquire_timeout_en_enable_f();
+
 	return val;
 }
 
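The reworked function now programs the retry fields unconditionally and adds the timeout fields only when timeouts are enabled. The timeout itself is encoded as a mantissa/exponent pair so a 64-bit nanosecond value fits in a few register bits; here is a standalone sketch of that style of encoding (the 16-bit mantissa width is an assumption for illustration, not taken from this hunk):

        /* Encode timeout_ns as (man, exp) such that timeout_ns ~= man << exp. */
        static void encode_acquire_timeout(unsigned long long timeout_ns,
                        unsigned int *man, unsigned int *exp)
        {
                *exp = 0;
                while (timeout_ns > 0xffffULL) { /* shrink until it fits the mantissa */
                        timeout_ns >>= 1;
                        (*exp)++;
                }
                *man = (unsigned int)timeout_ns; /* hw waits roughly man << exp ns */
        }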
@@ -711,11 +696,39 @@ static int gk20a_channel_set_wdt_status(struct channel_gk20a *ch,
 	return 0;
 }
 
-static int gk20a_init_error_notifier(struct channel_gk20a *ch,
-		struct nvgpu_set_error_notifier *args) {
-	void *va;
+static int gk20a_channel_set_runlist_interleave(struct channel_gk20a *ch,
+		u32 level)
+{
+	struct gk20a *g = ch->g;
+	int ret;
+
+	if (gk20a_is_channel_marked_as_tsg(ch)) {
+		gk20a_err(dev_from_gk20a(g), "invalid operation for TSG!\n");
+		return -EINVAL;
+	}
+
+	switch (level) {
+	case NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW:
+	case NVGPU_RUNLIST_INTERLEAVE_LEVEL_MEDIUM:
+	case NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH:
+		ret = g->ops.fifo.set_runlist_interleave(g, ch->hw_chid,
+					false, 0, level);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
 
+	return ret ? ret : g->ops.fifo.update_runlist(g, 0, ~0, true, true);
+}
+
+static int gk20a_init_error_notifier(struct channel_gk20a *ch,
+		struct nvgpu_set_error_notifier *args)
+{
+	struct device *dev = dev_from_gk20a(ch->g);
 	struct dma_buf *dmabuf;
+	void *va;
+	u64 end = args->offset + sizeof(struct nvgpu_notification);
 
 	if (!args->mem) {
 		pr_err("gk20a_init_error_notifier: invalid memory handle\n");
@@ -731,6 +744,13 @@ static int gk20a_init_error_notifier(struct channel_gk20a *ch,
 		pr_err("Invalid handle: %d\n", args->mem);
 		return -EINVAL;
 	}
+
+	if (end > dmabuf->size || end < sizeof(struct nvgpu_notification)) {
+		dma_buf_put(dmabuf);
+		gk20a_err(dev, "gk20a_init_error_notifier: invalid offset\n");
+		return -EINVAL;
+	}
+
 	/* map handle */
 	va = dma_buf_vmap(dmabuf);
 	if (!va) {
@@ -890,17 +910,6 @@ static void gk20a_free_channel(struct channel_gk20a *ch)
 	}
 	mutex_unlock(&f->deferred_reset_mutex);
 
-	if (ch->interleave) {
-		ch->interleave = false;
-		gk20a_fifo_set_channel_priority(
-			ch->g, 0, ch->hw_chid, ch->interleave);
-
-		mutex_lock(&f->g->interleave_lock);
-		WARN_ON(f->g->num_interleaved_channels == 0);
-		f->g->num_interleaved_channels -= 1;
-		mutex_unlock(&f->g->interleave_lock);
-	}
-
 	if (!ch->bound)
 		goto release;
 
@@ -912,6 +921,9 @@ static void gk20a_free_channel(struct channel_gk20a *ch)
 
 	gk20a_free_error_notifiers(ch);
 
+	if (g->ops.fecs_trace.unbind_channel)
+		g->ops.fecs_trace.unbind_channel(g, ch);
+
 	/* release channel ctx */
 	g->ops.gr.free_channel_ctx(ch);
 
@@ -1145,11 +1157,8 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g)
 	ch->has_timedout = false;
 	ch->wdt_enabled = true;
 	ch->obj_class = 0;
-	ch->interleave = false;
 	ch->clean_up.scheduled = false;
-	gk20a_fifo_set_channel_priority(
-		ch->g, 0, ch->hw_chid, ch->interleave);
-
+	ch->interleave_level = NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW;
 
 	/* The channel is *not* runnable at this point. It still needs to have
 	 * an address space bound and allocate a gpfifo and grctx. */
@@ -1697,6 +1706,10 @@ static void gk20a_channel_timeout_handler(struct work_struct *work)
 	/* Need global lock since multiple channels can timeout at a time */
 	mutex_lock(&g->ch_wdt_lock);
 
+	gk20a_debug_dump(g->dev);
+	gk20a_gr_debug_dump(g->dev);
+
+
 	/* Get timed out job and reset the timer */
 	mutex_lock(&ch->timeout.lock);
 	job = ch->timeout.job;
@@ -2399,6 +2412,7 @@ static int gk20a_channel_wait(struct channel_gk20a *ch,
 	u32 offset;
 	unsigned long timeout;
 	int remain, ret = 0;
+	u64 end;
 
 	gk20a_dbg_fn("");
 
@@ -2414,6 +2428,7 @@ static int gk20a_channel_wait(struct channel_gk20a *ch,
 	case NVGPU_WAIT_TYPE_NOTIFIER:
 		id = args->condition.notifier.dmabuf_fd;
 		offset = args->condition.notifier.offset;
+		end = offset + sizeof(struct notification);
 
 		dmabuf = dma_buf_get(id);
 		if (IS_ERR(dmabuf)) {
@@ -2422,6 +2437,12 @@ static int gk20a_channel_wait(struct channel_gk20a *ch,
 			return -EINVAL;
 		}
 
+		if (end > dmabuf->size || end < sizeof(struct notification)) {
+			dma_buf_put(dmabuf);
+			gk20a_err(d, "invalid notifier offset\n");
+			return -EINVAL;
+		}
+
 		notif = dma_buf_vmap(dmabuf);
 		if (!notif) {
 			gk20a_err(d, "failed to map notifier memory");
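Both new range checks (here and in gk20a_init_error_notifier above) use the same two-sided test, and the second comparison is what catches unsigned wraparound. A small sketch of the invariant being enforced:

        #include <stdbool.h>
        #include <stdint.h>

        /* Accept offset iff [offset, offset + rec_size) lies inside the buffer.
         * end < rec_size can only be true if offset + rec_size wrapped around,
         * e.g. offset = ~0ULL - 2 with rec_size = 16 gives end = 13. */
        static bool notifier_range_ok(uint64_t offset, uint64_t rec_size,
                        uint64_t buf_size)
        {
                uint64_t end = offset + rec_size;

                return end <= buf_size && end >= rec_size;
        }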
@@ -2596,7 +2617,6 @@ unsigned int gk20a_channel_poll(struct file *filep, poll_table *wait)
 int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority)
 {
 	u32 timeslice_timeout;
-	bool interleave = false;
 
 	if (gk20a_is_channel_marked_as_tsg(ch)) {
 		gk20a_err(dev_from_gk20a(ch->g),
@@ -2613,8 +2633,6 @@ int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority)
 		timeslice_timeout = ch->g->timeslice_medium_priority_us;
 		break;
 	case NVGPU_PRIORITY_HIGH:
-		if (ch->g->interleave_high_priority)
-			interleave = true;
 		timeslice_timeout = ch->g->timeslice_high_priority_us;
 		break;
 	default:
@@ -2623,7 +2641,22 @@ int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority)
 	}
 
 	return channel_gk20a_set_schedule_params(ch,
-			timeslice_timeout, interleave);
+			timeslice_timeout);
+}
+
+int gk20a_channel_set_timeslice(struct channel_gk20a *ch, u32 timeslice)
+{
+	if (gk20a_is_channel_marked_as_tsg(ch)) {
+		gk20a_err(dev_from_gk20a(ch->g),
+			"invalid operation for TSG!\n");
+		return -EINVAL;
+	}
+
+	if (timeslice < NVGPU_CHANNEL_MIN_TIMESLICE_US ||
+		timeslice > NVGPU_CHANNEL_MAX_TIMESLICE_US)
+		return -EINVAL;
+
+	return channel_gk20a_set_schedule_params(ch, timeslice);
 }
 
 static int gk20a_channel_zcull_bind(struct channel_gk20a *ch,
@@ -2778,6 +2811,7 @@ void gk20a_init_channel(struct gpu_ops *gops)
 	gops->fifo.free_inst = channel_gk20a_free_inst;
 	gops->fifo.setup_ramfc = channel_gk20a_setup_ramfc;
 	gops->fifo.channel_set_priority = gk20a_channel_set_priority;
+	gops->fifo.channel_set_timeslice = gk20a_channel_set_timeslice;
 }
 
 long gk20a_channel_ioctl(struct file *filp,
@@ -3028,6 +3062,30 @@ long gk20a_channel_ioctl(struct file *filp,
 		err = gk20a_channel_set_wdt_status(ch,
 				(struct nvgpu_channel_wdt_args *)buf);
 		break;
+	case NVGPU_IOCTL_CHANNEL_SET_RUNLIST_INTERLEAVE:
+		err = gk20a_busy(dev);
+		if (err) {
+			dev_err(&dev->dev,
+				"%s: failed to host gk20a for ioctl cmd: 0x%x",
+				__func__, cmd);
+			break;
+		}
+		err = gk20a_channel_set_runlist_interleave(ch,
+			((struct nvgpu_runlist_interleave_args *)buf)->level);
+		gk20a_idle(dev);
+		break;
+	case NVGPU_IOCTL_CHANNEL_SET_TIMESLICE:
+		err = gk20a_busy(dev);
+		if (err) {
+			dev_err(&dev->dev,
+				"%s: failed to host gk20a for ioctl cmd: 0x%x",
+				__func__, cmd);
+			break;
+		}
+		err = ch->g->ops.fifo.channel_set_timeslice(ch,
+			((struct nvgpu_timeslice_args *)buf)->timeslice_us);
+		gk20a_idle(dev);
+		break;
 	default:
 		dev_dbg(&dev->dev, "unrecognized ioctl cmd: 0x%x", cmd);
 		err = -ENOTTY;
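From userspace, the two new channel ioctls are plain ioctl() calls on the channel fd. A minimal sketch using the argument structs referenced above (field names as used in this hunk; a 2 ms timeslice sits inside the 1000..50000 us range enforced by gk20a_channel_set_timeslice):

        #include <sys/ioctl.h>
        #include <linux/nvgpu.h>

        static int tune_channel(int channel_fd)
        {
                struct nvgpu_timeslice_args ts = { .timeslice_us = 2000 };
                struct nvgpu_runlist_interleave_args il = {
                        .level = NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH,
                };

                if (ioctl(channel_fd, NVGPU_IOCTL_CHANNEL_SET_TIMESLICE, &ts))
                        return -1;
                return ioctl(channel_fd, NVGPU_IOCTL_CHANNEL_SET_RUNLIST_INTERLEAVE, &il);
        }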
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
index 4aea9d19..e3fbba3e 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -188,8 +188,7 @@ struct channel_gk20a {
 	spinlock_t update_fn_lock; /* make access to the two above atomic */
 	struct work_struct update_fn_work;
 
-	/* true if channel is interleaved with lower priority channels */
-	bool interleave;
+	u32 interleave_level;
 };
 
 static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch)
@@ -276,5 +275,6 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g,
 		int timeslice_period,
 		int *__timeslice_timeout, int *__timeslice_scale);
 int gk20a_channel_set_priority(struct channel_gk20a *ch, u32 priority);
+int gk20a_channel_set_timeslice(struct channel_gk20a *ch, u32 timeslice);
 
 #endif /* CHANNEL_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
index 8ff53d17..87f0bf74 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -115,8 +115,10 @@ static int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd,
 	}
 
 	num_wait_cmds = nvhost_sync_num_pts(sync_fence);
-	if (num_wait_cmds == 0)
+	if (num_wait_cmds == 0) {
+		sync_fence_put(sync_fence);
 		return 0;
+	}
 
 	err = gk20a_channel_alloc_priv_cmdbuf(c, 4 * num_wait_cmds, &wait_cmd);
 	if (err) {
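This hunk closes a reference leak: the fence reference taken when the fd was looked up earlier in the function was never dropped on the zero-wait early return, so sync_fence_put() now balances it before returning.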
diff --git a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
index 9ed5fef3..b2ae224f 100644
--- a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
@@ -29,6 +29,7 @@
 #include "hw_gr_gk20a.h"
 #include "hw_fb_gk20a.h"
 #include "hw_proj_gk20a.h"
+#include "hw_timer_gk20a.h"
 
 int gk20a_ctrl_dev_open(struct inode *inode, struct file *filp)
 {
@@ -272,6 +273,8 @@ static int nvgpu_gpu_ioctl_inval_icache(
 	struct nvgpu_dbg_gpu_reg_op ops;
 
 	ch = gk20a_get_channel_from_file(args->channel_fd);
+	if (!ch)
+		return -EINVAL;
 
 	ops.op = REGOP(READ_32);
 	ops.type = REGOP(TYPE_GR_CTX);
@@ -528,6 +531,94 @@ static int gk20a_ctrl_get_buffer_info(
 			&args->out.id, &args->out.length);
 }
 
+static inline u64 get_cpu_timestamp_tsc(void)
+{
+	return ((u64) get_cycles());
+}
+
+static inline u64 get_cpu_timestamp_jiffies(void)
+{
+	return (get_jiffies_64() - INITIAL_JIFFIES);
+}
+
+static inline u64 get_cpu_timestamp_timeofday(void)
+{
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	return timeval_to_jiffies(&tv);
+}
+
+static inline int get_timestamps_zipper(struct gk20a *g,
+		u64 (*get_cpu_timestamp)(void),
+		struct nvgpu_gpu_get_cpu_time_correlation_info_args *args)
+{
+	int err = 0;
+	int i = 0;
+	u32 gpu_timestamp_hi_new = 0;
+	u32 gpu_timestamp_hi_old = 0;
+
+	if (gk20a_busy(g->dev)) {
+		gk20a_err(dev_from_gk20a(g), "GPU not powered on\n");
+		return -EINVAL;
+	}
+
+	/* get zipper reads of gpu and cpu counter values */
+	gpu_timestamp_hi_old = gk20a_readl(g, timer_time_1_r());
+	for (i = 0; i < args->count; i++) {
+		u32 gpu_timestamp_lo = 0;
+		u32 gpu_timestamp_hi = 0;
+
+		gpu_timestamp_lo = gk20a_readl(g, timer_time_0_r());
+		args->samples[i].cpu_timestamp = get_cpu_timestamp();
+		rmb(); /* maintain zipper read order */
+		gpu_timestamp_hi_new = gk20a_readl(g, timer_time_1_r());
+
+		/* pick the appropriate gpu counter hi bits */
+		gpu_timestamp_hi = (gpu_timestamp_lo & (1L << 31)) ?
+			gpu_timestamp_hi_old : gpu_timestamp_hi_new;
+
+		args->samples[i].gpu_timestamp =
+			((u64)gpu_timestamp_hi << 32) | (u64)gpu_timestamp_lo;
+
+		gpu_timestamp_hi_old = gpu_timestamp_hi_new;
+	}
+
+	gk20a_idle(g->dev);
+	return err;
+}
+
+static int nvgpu_gpu_get_cpu_time_correlation_info(
+	struct gk20a *g,
+	struct nvgpu_gpu_get_cpu_time_correlation_info_args *args)
+{
+	int err = 0;
+	u64 (*get_cpu_timestamp)(void) = NULL;
+
+	if (args->count > NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_MAX_COUNT)
+		return -EINVAL;
+
+	switch (args->source_id) {
+	case NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_TSC:
+		get_cpu_timestamp = get_cpu_timestamp_tsc;
+		break;
+	case NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_JIFFIES:
+		get_cpu_timestamp = get_cpu_timestamp_jiffies;
+		break;
+	case NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_TIMEOFDAY:
+		get_cpu_timestamp = get_cpu_timestamp_timeofday;
+		break;
+	default:
+		gk20a_err(dev_from_gk20a(g), "invalid cpu clock source id\n");
+		return -EINVAL;
+	}
+
+	err = get_timestamps_zipper(g, get_cpu_timestamp, args);
+	return err;
+}
+
 long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct platform_device *dev = filp->private_data;
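With the sampling loop above, each args->samples[i] pairs a CPU timestamp with a GPU PTIMER read taken back-to-back. A sketch of user-side post-processing (averaging the per-pair deltas is an assumed strategy; the patch only supplies the raw pairs):

        /* Estimate a constant offset such that cpu_time ~= gpu_time + offset. */
        static long long estimate_clock_offset(
                const struct nvgpu_gpu_get_cpu_time_correlation_info_args *args)
        {
                long long acc = 0;
                unsigned int i;

                for (i = 0; i < args->count; i++)
                        acc += (long long)(args->samples[i].cpu_timestamp -
                                           args->samples[i].gpu_timestamp);

                return args->count ? acc / (long long)args->count : 0;
        }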
@@ -760,6 +851,11 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
 			(struct nvgpu_gpu_get_buffer_info_args *)buf);
 		break;
 
+	case NVGPU_GPU_IOCTL_GET_CPU_TIME_CORRELATION_INFO:
+		err = nvgpu_gpu_get_cpu_time_correlation_info(g,
+			(struct nvgpu_gpu_get_cpu_time_correlation_info_args *)buf);
+		break;
+
 	default:
 		dev_dbg(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x", cmd);
 		err = -ENOTTY;
diff --git a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c
new file mode 100644
index 00000000..9e7c04ad
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.c
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <asm/barrier.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/circ_buf.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <linux/wait.h>
+#include <linux/ktime.h>
+#include <linux/nvgpu.h>
+#include <linux/hashtable.h>
+#include <linux/debugfs.h>
+#include <linux/log2.h>
+#include <uapi/linux/nvgpu.h>
+#include "ctxsw_trace_gk20a.h"
+#include "gk20a.h"
+#include "gr_gk20a.h"
+#include "hw_ctxsw_prog_gk20a.h"
+#include "hw_gr_gk20a.h"
+
+#define GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE	(128*PAGE_SIZE)
+
+/* Userland-facing FIFO (one global + eventually one per VM) */
+struct gk20a_ctxsw_dev {
+	struct gk20a *g;
+
+	struct nvgpu_ctxsw_ring_header *hdr;
+	struct nvgpu_ctxsw_trace_entry *ents;
+	struct nvgpu_ctxsw_trace_filter filter;
+	bool write_enabled;
+	wait_queue_head_t readout_wq;
+	size_t size;
+
+	atomic_t vma_ref;
+
+	struct mutex lock;
+};
+
+
+struct gk20a_ctxsw_trace {
+	struct gk20a_ctxsw_dev devs[GK20A_CTXSW_TRACE_NUM_DEVS];
+};
+
+static inline int ring_is_empty(struct nvgpu_ctxsw_ring_header *hdr)
+{
+	return (hdr->write_idx == hdr->read_idx);
+}
+
+static inline int ring_is_full(struct nvgpu_ctxsw_ring_header *hdr)
+{
+	return ((hdr->write_idx + 1) % hdr->num_ents) == hdr->read_idx;
+}
+
+static inline int ring_len(struct nvgpu_ctxsw_ring_header *hdr)
+{
+	return (hdr->write_idx - hdr->read_idx) % hdr->num_ents;
+}
+
+static inline int ring_space(struct nvgpu_ctxsw_ring_header *hdr)
+{
+	return (hdr->read_idx - hdr->write_idx - 1) % hdr->num_ents;
+}
+
+ssize_t gk20a_ctxsw_dev_read(struct file *filp, char __user *buf, size_t size,
+	loff_t *off)
+{
+	struct gk20a_ctxsw_dev *dev = filp->private_data;
+	struct nvgpu_ctxsw_ring_header *hdr = dev->hdr;
+	struct nvgpu_ctxsw_trace_entry __user *entry =
+		(struct nvgpu_ctxsw_trace_entry *) buf;
+	size_t copied = 0;
+	int err;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
+		"filp=%p buf=%p size=%zu", filp, buf, size);
+
+	mutex_lock(&dev->lock);
+	while (ring_is_empty(hdr)) {
+		mutex_unlock(&dev->lock);
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+		err = wait_event_interruptible(dev->readout_wq,
+			!ring_is_empty(hdr));
+		if (err)
+			return err;
+		mutex_lock(&dev->lock);
+	}
+
+	while (size >= sizeof(struct nvgpu_ctxsw_trace_entry)) {
+		if (ring_is_empty(hdr))
+			break;
+
+		if (copy_to_user(entry, &dev->ents[hdr->read_idx],
+			sizeof(*entry))) {
+			mutex_unlock(&dev->lock);
+			return -EFAULT;
+		}
+
+		hdr->read_idx++;
+		if (hdr->read_idx >= hdr->num_ents)
+			hdr->read_idx = 0;
+
+		entry++;
+		copied += sizeof(*entry);
+		size -= sizeof(*entry);
+	}
+
+	gk20a_dbg(gpu_dbg_ctxsw, "copied=%zu read_idx=%d", copied,
+		hdr->read_idx);
+
+	*off = hdr->read_idx;
+	mutex_unlock(&dev->lock);
+
+	return copied;
+}
+
+static int gk20a_ctxsw_dev_ioctl_trace_enable(struct gk20a_ctxsw_dev *dev)
+{
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace enabled");
+	dev->write_enabled = true;
+	return 0;
+}
+
+static int gk20a_ctxsw_dev_ioctl_trace_disable(struct gk20a_ctxsw_dev *dev)
+{
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "trace disabled");
+	dev->write_enabled = false;
+	return 0;
+}
+
+static int gk20a_ctxsw_dev_ring_alloc(struct gk20a_ctxsw_dev *dev,
+	size_t size)
+{
+	struct nvgpu_ctxsw_ring_header *hdr;
+
+	if (atomic_read(&dev->vma_ref))
+		return -EBUSY;
+
+	if ((dev->write_enabled) || (atomic_read(&dev->vma_ref)))
+		return -EBUSY;
+
+	size = roundup(size, PAGE_SIZE);
+	hdr = vmalloc_user(size);
+	if (!hdr)
+		return -ENOMEM;
+
+	if (dev->hdr)
+		vfree(dev->hdr);
+
+	dev->hdr = hdr;
+	dev->ents = (struct nvgpu_ctxsw_trace_entry *) (dev->hdr + 1);
+	dev->size = size;
+
+	hdr->magic = NVGPU_CTXSW_RING_HEADER_MAGIC;
+	hdr->version = NVGPU_CTXSW_RING_HEADER_VERSION;
+	hdr->num_ents = (size - sizeof(struct nvgpu_ctxsw_ring_header))
+		/ sizeof(struct nvgpu_ctxsw_trace_entry);
+	hdr->ent_size = sizeof(struct nvgpu_ctxsw_trace_entry);
+	hdr->drop_count = 0;
+	hdr->read_idx = 0;
+	hdr->write_idx = 0;
+	hdr->write_seqno = 0;
+
+	gk20a_dbg(gpu_dbg_ctxsw, "size=%zu hdr=%p ents=%p num_ents=%d",
+		dev->size, dev->hdr, dev->ents, hdr->num_ents);
+	return 0;
+}
+
+static int gk20a_ctxsw_dev_ioctl_ring_setup(struct gk20a_ctxsw_dev *dev,
+	struct nvgpu_ctxsw_ring_setup_args *args)
+{
+	size_t size = args->size;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "size=%zu", size);
+
+	if (size > GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE)
+		return -EINVAL;
+
+	return gk20a_ctxsw_dev_ring_alloc(dev, size);
+}
+
+static int gk20a_ctxsw_dev_ioctl_set_filter(struct gk20a_ctxsw_dev *dev,
+	struct nvgpu_ctxsw_trace_filter_args *args)
+{
+	dev->filter = args->filter;
+	return 0;
+}
+
+static int gk20a_ctxsw_dev_ioctl_get_filter(struct gk20a_ctxsw_dev *dev,
+	struct nvgpu_ctxsw_trace_filter_args *args)
+{
+	args->filter = dev->filter;
+	return 0;
+}
+
+static int gk20a_ctxsw_dev_ioctl_poll(struct gk20a_ctxsw_dev *dev)
+{
+	struct gk20a *g = dev->g;
+	int err;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");
+
+	err = gk20a_busy(g->dev);
+	if (err)
+		return err;
+
+	if (g->ops.fecs_trace.flush)
+		err = g->ops.fecs_trace.flush(g);
+
+	if (likely(!err))
+		err = g->ops.fecs_trace.poll(g);
+
+	gk20a_idle(g->dev);
+	return err;
+}
+
+int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp)
+{
+	struct gk20a *g;
+	struct gk20a_ctxsw_trace *trace;
+	struct gk20a_ctxsw_dev *dev;
+	int err;
+	size_t size;
+	u32 n;
+
+	/* only one VM for now */
+	const int vmid = 0;
+
+	g = container_of(inode->i_cdev, struct gk20a, ctxsw.cdev);
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p", g);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	err = gk20a_busy(g->dev);
+	if (err)
+		return err;
+
+	trace = g->ctxsw_trace;
+	if (!trace) {
+		err = -ENODEV;
+		goto idle;
+	}
+
+	/* Allow only one user for this device */
+	dev = &trace->devs[vmid];
+	mutex_lock(&dev->lock);
+	if (dev->hdr) {
+		err = -EBUSY;
+		goto done;
+	}
+
+	/* By default, allocate ring buffer big enough to accommodate
+	 * FECS records with default event filter */
+
+	/* enable all traces by default */
+	NVGPU_CTXSW_FILTER_SET_ALL(&dev->filter);
+
+	/* compute max number of entries generated with this filter */
+	n = g->ops.fecs_trace.max_entries(g, &dev->filter);
+
+	size = sizeof(struct nvgpu_ctxsw_ring_header) +
+			n * sizeof(struct nvgpu_ctxsw_trace_entry);
+	gk20a_dbg(gpu_dbg_ctxsw, "size=%zu entries=%d ent_size=%zu",
+		size, n, sizeof(struct nvgpu_ctxsw_trace_entry));
+
+	err = gk20a_ctxsw_dev_ring_alloc(dev, size);
+	if (!err) {
+		filp->private_data = dev;
+		gk20a_dbg(gpu_dbg_ctxsw, "filp=%p dev=%p size=%zu",
+			filp, dev, size);
+	}
+
+	err = g->ops.fecs_trace.enable(g);
+
+done:
+	mutex_unlock(&dev->lock);
+
+idle:
+	gk20a_idle(g->dev);
+
+	return err;
+}
+
+int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp)
+{
+	struct gk20a_ctxsw_dev *dev = filp->private_data;
+	struct gk20a *g = container_of(inode->i_cdev, struct gk20a, ctxsw.cdev);
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "dev: %p", dev);
+
+	mutex_lock(&dev->lock);
+	dev->write_enabled = false;
+	if (dev->hdr) {
+		vfree(dev->hdr);
+		dev->hdr = NULL;
+	}
+
+	g->ops.fecs_trace.disable(g);
+
+	mutex_unlock(&dev->lock);
+
+	return 0;
+}
+
+long gk20a_ctxsw_dev_ioctl(struct file *filp, unsigned int cmd,
+	unsigned long arg)
+{
+	struct gk20a_ctxsw_dev *dev = filp->private_data;
+	struct gk20a *g = dev->g;
+	u8 buf[NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE];
+	int err = 0;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "nr=%d", _IOC_NR(cmd));
+
+	if ((_IOC_TYPE(cmd) != NVGPU_CTXSW_IOCTL_MAGIC) || (_IOC_NR(cmd) == 0)
+		|| (_IOC_NR(cmd) > NVGPU_CTXSW_IOCTL_LAST))
+		return -EINVAL;
+
+	BUG_ON(_IOC_SIZE(cmd) > NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE);
+
+	memset(buf, 0, sizeof(buf));
+	if (_IOC_DIR(cmd) & _IOC_WRITE) {
+		if (copy_from_user(buf, (void __user *) arg, _IOC_SIZE(cmd)))
+			return -EFAULT;
+	}
+
+	mutex_lock(&dev->lock);
+
+	switch (cmd) {
+	case NVGPU_CTXSW_IOCTL_TRACE_ENABLE:
+		err = gk20a_ctxsw_dev_ioctl_trace_enable(dev);
+		break;
+	case NVGPU_CTXSW_IOCTL_TRACE_DISABLE:
+		err = gk20a_ctxsw_dev_ioctl_trace_disable(dev);
+		break;
+	case NVGPU_CTXSW_IOCTL_RING_SETUP:
+		err = gk20a_ctxsw_dev_ioctl_ring_setup(dev,
+			(struct nvgpu_ctxsw_ring_setup_args *) buf);
+		break;
+	case NVGPU_CTXSW_IOCTL_SET_FILTER:
+		err = gk20a_ctxsw_dev_ioctl_set_filter(dev,
+			(struct nvgpu_ctxsw_trace_filter_args *) buf);
+		break;
+	case NVGPU_CTXSW_IOCTL_GET_FILTER:
+		err = gk20a_ctxsw_dev_ioctl_get_filter(dev,
+			(struct nvgpu_ctxsw_trace_filter_args *) buf);
+		break;
+	case NVGPU_CTXSW_IOCTL_POLL:
+		mutex_unlock(&dev->lock);
+		err = gk20a_ctxsw_dev_ioctl_poll(dev);
+		mutex_lock(&dev->lock);
+		break;
+	default:
+		dev_dbg(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x",
+			cmd);
+		err = -ENOTTY;
+	}
+
+	mutex_unlock(&dev->lock);
+
+	if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ))
+		err = copy_to_user((void __user *) arg, buf, _IOC_SIZE(cmd));
+
+	return err;
+}
+
+unsigned int gk20a_ctxsw_dev_poll(struct file *filp, poll_table *wait)
+{
+	struct gk20a_ctxsw_dev *dev = filp->private_data;
+	struct nvgpu_ctxsw_ring_header *hdr = dev->hdr;
+	unsigned int mask = 0;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");
+
+	mutex_lock(&dev->lock);
+	poll_wait(filp, &dev->readout_wq, wait);
+	if (!ring_is_empty(hdr))
+		mask |= POLLIN | POLLRDNORM;
+	mutex_unlock(&dev->lock);
+
+	return mask;
+}
+
+static void gk20a_ctxsw_dev_vma_open(struct vm_area_struct *vma)
+{
+	struct gk20a_ctxsw_dev *dev = vma->vm_private_data;
+
+	atomic_inc(&dev->vma_ref);
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d",
+		atomic_read(&dev->vma_ref));
+}
+
+static void gk20a_ctxsw_dev_vma_close(struct vm_area_struct *vma)
+{
+	struct gk20a_ctxsw_dev *dev = vma->vm_private_data;
+
+	atomic_dec(&dev->vma_ref);
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vma_ref=%d",
+		atomic_read(&dev->vma_ref));
+}
+
+static struct vm_operations_struct gk20a_ctxsw_dev_vma_ops = {
+	.open = gk20a_ctxsw_dev_vma_open,
+	.close = gk20a_ctxsw_dev_vma_close,
+};
+
+int gk20a_ctxsw_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct gk20a_ctxsw_dev *dev = filp->private_data;
+	int ret;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "vm_start=%lx vm_end=%lx",
+		vma->vm_start, vma->vm_end);
+
+	ret = remap_vmalloc_range(vma, dev->hdr, 0);
+	if (likely(!ret)) {
+		vma->vm_private_data = dev;
+		vma->vm_ops = &gk20a_ctxsw_dev_vma_ops;
+		vma->vm_ops->open(vma);
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_GK20A_CTXSW_TRACE
+static int gk20a_ctxsw_init_devs(struct gk20a *g)
+{
+	struct gk20a_ctxsw_trace *trace = g->ctxsw_trace;
+	struct gk20a_ctxsw_dev *dev = trace->devs;
+	int i;
+
+	for (i = 0; i < GK20A_CTXSW_TRACE_NUM_DEVS; i++) {
+		dev->g = g;
+		dev->hdr = NULL;
+		dev->write_enabled = false;
+		init_waitqueue_head(&dev->readout_wq);
+		mutex_init(&dev->lock);
+		atomic_set(&dev->vma_ref, 0);
+		dev++;
+	}
+	return 0;
+}
+#endif
+
+int gk20a_ctxsw_trace_init(struct gk20a *g)
+{
+#ifdef CONFIG_GK20A_CTXSW_TRACE
+	struct gk20a_ctxsw_trace *trace = g->ctxsw_trace;
+	int err;
+
+	gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "g=%p trace=%p", g, trace);
+
+	if (likely(trace))
+		return 0;
+
+	trace = kzalloc(sizeof(*trace), GFP_KERNEL);
+	if (unlikely(!trace))
+		return -ENOMEM;
+	g->ctxsw_trace = trace;
+
+	err = gk20a_ctxsw_init_devs(g);
+	if (err)
+		goto fail;
+
+	err = g->ops.fecs_trace.init(g);
+	if (unlikely(err))
+		goto fail;
+
+	return 0;
+
+fail:
+	kfree(trace);
+	g->ctxsw_trace = NULL;
+	return err;
+#else
+	return 0;
+#endif
+}
+
+void gk20a_ctxsw_trace_cleanup(struct gk20a *g)
+{
+#ifdef CONFIG_GK20A_CTXSW_TRACE
+	kfree(g->ctxsw_trace);
+	g->ctxsw_trace = NULL;
+
+	g->ops.fecs_trace.deinit(g);
+#endif
+}
+
+int gk20a_ctxsw_trace_write(struct gk20a *g,
+		struct nvgpu_ctxsw_trace_entry *entry)
+{
+	struct nvgpu_ctxsw_ring_header *hdr;
+	struct gk20a_ctxsw_dev *dev;
+	int ret = 0;
+	const char *reason;
+
+	if (unlikely(entry->vmid >= GK20A_CTXSW_TRACE_NUM_DEVS))
+		return -ENODEV;
+
+	dev = &g->ctxsw_trace->devs[entry->vmid];
+	hdr = dev->hdr;
+
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
+		"dev=%p hdr=%p", dev, hdr);
+
+	mutex_lock(&dev->lock);
+
+	if (unlikely(!hdr)) {
+		/* device has been released */
+		ret = -ENODEV;
+		goto done;
+	}
+
+	entry->seqno = hdr->write_seqno++;
+
+	if (!dev->write_enabled) {
+		ret = -EBUSY;
+		reason = "write disabled";
+		goto drop;
+	}
+
+	if (unlikely(ring_is_full(hdr))) {
+		ret = -ENOSPC;
+		reason = "user fifo full";
+		goto drop;
+	}
+
+	if (!NVGPU_CTXSW_FILTER_ISSET(entry->tag, &dev->filter)) {
+		reason = "filtered out";
+		goto filter;
+	}
+
+	gk20a_dbg(gpu_dbg_ctxsw,
+		"seqno=%d context_id=%08x pid=%lld tag=%x timestamp=%llx",
+		entry->seqno, entry->context_id, entry->pid,
+		entry->tag, entry->timestamp);
+
+	dev->ents[hdr->write_idx] = *entry;
+
+	/* ensure record is written before updating write index */
+	smp_wmb();
+
+	hdr->write_idx++;
+	if (unlikely(hdr->write_idx >= hdr->num_ents))
+		hdr->write_idx = 0;
+	gk20a_dbg(gpu_dbg_ctxsw, "added: read=%d write=%d len=%d",
+		hdr->read_idx, hdr->write_idx, ring_len(hdr));
+
+	mutex_unlock(&dev->lock);
+	return ret;
+
+drop:
+	hdr->drop_count++;
+
+filter:
+	gk20a_dbg(gpu_dbg_ctxsw,
+		"dropping seqno=%d context_id=%08x pid=%lld "
+		"tag=%x time=%llx (%s)",
+		entry->seqno, entry->context_id, entry->pid,
+		entry->tag, entry->timestamp, reason);
+
+done:
+	mutex_unlock(&dev->lock);
+	return ret;
+}
+
+void gk20a_ctxsw_trace_wake_up(struct gk20a *g, int vmid)
+{
+	struct gk20a_ctxsw_dev *dev = &g->ctxsw_trace->devs[vmid];
+
+	wake_up_interruptible(&dev->readout_wq);
+}
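The ring helpers at the top of this file implement a classic one-slot-open circular buffer: read_idx and write_idx are always kept in [0, num_ents) by the read/write paths, so the modulo arithmetic stays exact. A tiny standalone check of the arithmetic:

        #include <stdio.h>

        struct hdr { unsigned int write_idx, read_idx, num_ents; };

        static int ring_len(const struct hdr *h)
        {
                return (h->write_idx - h->read_idx) % h->num_ents;
        }

        static int ring_is_full(const struct hdr *h)
        {
                return ((h->write_idx + 1) % h->num_ents) == h->read_idx;
        }

        int main(void)
        {
                struct hdr h = { .write_idx = 3, .read_idx = 0, .num_ents = 4 };

                /* One slot is always left empty, so capacity is num_ents - 1. */
                printf("len=%d full=%d\n", ring_len(&h), ring_is_full(&h)); /* len=3 full=1 */
                return 0;
        }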
diff --git a/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h
new file mode 100644
index 00000000..c57d95d1
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ctxsw_trace_gk20a.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __CTXSW_TRACE_GK20A_H
+#define __CTXSW_TRACE_GK20A_H
+
+#define GK20A_CTXSW_TRACE_NUM_DEVS	1
+
+struct gk20a;
+struct nvgpu_ctxsw_trace_entry;
+struct channel_gk20a;
+struct channel_ctx_gk20a;
+struct gk20a_ctxsw_dev;
+struct gk20a_fecs_trace;
+
+
+int gk20a_ctxsw_dev_release(struct inode *inode, struct file *filp);
+int gk20a_ctxsw_dev_open(struct inode *inode, struct file *filp);
+long gk20a_ctxsw_dev_ioctl(struct file *filp,
+	unsigned int cmd, unsigned long arg);
+ssize_t gk20a_ctxsw_dev_read(struct file *, char __user *, size_t, loff_t *);
+unsigned int gk20a_ctxsw_dev_poll(struct file *, struct poll_table_struct *);
+int gk20a_ctxsw_dev_mmap(struct file *, struct vm_area_struct *);
+
+int gk20a_ctxsw_trace_init(struct gk20a *);
+int gk20a_ctxsw_trace_setup(struct gk20a *, void *ctx_ptr);
+void gk20a_ctxsw_trace_cleanup(struct gk20a *);
+int gk20a_ctxsw_trace_write(struct gk20a *, struct nvgpu_ctxsw_trace_entry *);
+void gk20a_ctxsw_trace_wake_up(struct gk20a *g, int vmid);
+
+#endif /* __CTXSW_TRACE_GK20A_H */
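These handlers carry the standard file_operations signatures; a sketch of the expected hookup (the real wiring is in gk20a.c, which this patch also touches but is not shown in this excerpt):

        static const struct file_operations gk20a_ctxsw_ops = {
                .owner = THIS_MODULE,
                .release = gk20a_ctxsw_dev_release,
                .open = gk20a_ctxsw_dev_open,
                .unlocked_ioctl = gk20a_ctxsw_dev_ioctl,
                .read = gk20a_ctxsw_dev_read,
                .poll = gk20a_ctxsw_dev_poll,
                .mmap = gk20a_ctxsw_dev_mmap,
        };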
diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
new file mode 100644
index 00000000..bac36403
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c
@@ -0,0 +1,763 @@
1/*
2 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#include <asm/barrier.h>
15#include <linux/slab.h>
16#include <linux/kthread.h>
17#include <linux/circ_buf.h>
18#include <linux/delay.h>
19#include <linux/jiffies.h>
20#include <linux/wait.h>
21#include <linux/ktime.h>
22#include <linux/nvgpu.h>
23#include <linux/hashtable.h>
24#include <linux/debugfs.h>
25#include <linux/log2.h>
26#include <uapi/linux/nvgpu.h>
27#include "ctxsw_trace_gk20a.h"
28#include "fecs_trace_gk20a.h"
29#include "gk20a.h"
30#include "gr_gk20a.h"
31#include "hw_ctxsw_prog_gk20a.h"
32#include "hw_gr_gk20a.h"
33
34/*
35 * If HW circular buffer is getting too many "buffer full" conditions,
36 * increasing this constant should help (it drives Linux' internal buffer size).
37 */
38#define GK20A_FECS_TRACE_NUM_RECORDS (1 << 6)
39#define GK20A_FECS_TRACE_HASH_BITS 8 /* 2^8 */
40#define GK20A_FECS_TRACE_FRAME_PERIOD_NS (1000000000ULL/60ULL)
41#define GK20A_FECS_TRACE_PTIMER_SHIFT 5
42
43struct gk20a_fecs_trace_record {
44 u32 magic_lo;
45 u32 magic_hi;
46 u32 context_id;
47 u32 context_ptr;
48 u32 new_context_id;
49 u32 new_context_ptr;
50 u64 ts[];
51};
52
53struct gk20a_fecs_trace_hash_ent {
54 u32 context_ptr;
55 pid_t pid;
56 struct hlist_node node;
57};
58
59struct gk20a_fecs_trace {
60
61 struct mem_desc trace_buf;
62 DECLARE_HASHTABLE(pid_hash_table, GK20A_FECS_TRACE_HASH_BITS);
63 struct mutex hash_lock;
64 struct mutex poll_lock;
65 u64 sof;
66 u32 sof_mask; /* did we already send a SOF for this VM */
67
68 struct task_struct *poll_task;
69};
70
71#ifdef CONFIG_GK20A_CTXSW_TRACE
72static inline u32 gk20a_fecs_trace_record_ts_tag_v(u64 ts)
73{
74 return ctxsw_prog_record_timestamp_timestamp_hi_tag_v((u32) (ts >> 32));
75}
76
77static inline u64 gk20a_fecs_trace_record_ts_timestamp_v(u64 ts)
78{
79 return ts & ~(((u64)ctxsw_prog_record_timestamp_timestamp_hi_tag_m()) << 32);
80}
81
82
83static u32 gk20a_fecs_trace_fecs_context_ptr(struct channel_gk20a *ch)
84{
85 return (u32) (sg_phys(ch->inst_block.sgt->sgl) >> 12LL);
86}
87
88static inline int gk20a_fecs_trace_num_ts(void)
89{
90 return (ctxsw_prog_record_timestamp_record_size_in_bytes_v()
91 - sizeof(struct gk20a_fecs_trace_record)) / sizeof(u64);
92}
93
94struct gk20a_fecs_trace_record *gk20a_fecs_trace_get_record(
95 struct gk20a_fecs_trace *trace, int idx)
96{
97 return (struct gk20a_fecs_trace_record *)
98 ((u8 *) trace->trace_buf.cpu_va
99 + (idx * ctxsw_prog_record_timestamp_record_size_in_bytes_v()));
100}
101
102static bool gk20a_fecs_trace_is_valid_record(struct gk20a_fecs_trace_record *r)
103{
104 /*
105 * testing magic_hi should suffice. magic_lo is sometimes used
106 * as a sequence number in experimental ucode.
107 */
108 return (r->magic_hi
109 == ctxsw_prog_record_timestamp_magic_value_hi_v_value_v());
110}
111
112static int gk20a_fecs_trace_get_read_index(struct gk20a *g)
113{
114 return gr_gk20a_elpg_protected_call(g,
115 gk20a_readl(g, gr_fecs_mailbox1_r()));
116}
117
118static int gk20a_fecs_trace_get_write_index(struct gk20a *g)
119{
120 return gr_gk20a_elpg_protected_call(g,
121 gk20a_readl(g, gr_fecs_mailbox0_r()));
122}
123
124static int gk20a_fecs_trace_set_read_index(struct gk20a *g, int index)
125{
126 gk20a_dbg(gpu_dbg_ctxsw, "set read=%d", index);
127 return gr_gk20a_elpg_protected_call(g,
128 (gk20a_writel(g, gr_fecs_mailbox1_r(), index), 0));
129}
130
131void gk20a_fecs_trace_hash_dump(struct gk20a *g)
132{
133 u32 bkt;
134 struct gk20a_fecs_trace_hash_ent *ent;
135 struct gk20a_fecs_trace *trace = g->fecs_trace;
136
137 gk20a_dbg(gpu_dbg_ctxsw, "dumping hash table");
138
139 mutex_lock(&trace->hash_lock);
140 hash_for_each(trace->pid_hash_table, bkt, ent, node)
141 {
142 gk20a_dbg(gpu_dbg_ctxsw, " ent=%p bkt=%x context_ptr=%x pid=%d",
143 ent, bkt, ent->context_ptr, ent->pid);
144
145 }
146 mutex_unlock(&trace->hash_lock);
147}
148
149static int gk20a_fecs_trace_hash_add(struct gk20a *g, u32 context_ptr, pid_t pid)
150{
151 struct gk20a_fecs_trace_hash_ent *he;
152 struct gk20a_fecs_trace *trace = g->fecs_trace;
153
154 gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
155 "adding hash entry context_ptr=%x -> pid=%d", context_ptr, pid);
156
157 he = kzalloc(sizeof(*he), GFP_KERNEL);
158 if (unlikely(!he)) {
159 gk20a_warn(dev_from_gk20a(g),
160 "can't alloc new hash entry for context_ptr=%x pid=%d",
161 context_ptr, pid);
162 return -ENOMEM;
163 }
164
165 he->context_ptr = context_ptr;
166 he->pid = pid;
167 mutex_lock(&trace->hash_lock);
168 hash_add(trace->pid_hash_table, &he->node, context_ptr);
169 mutex_unlock(&trace->hash_lock);
170 return 0;
171}
172
173static void gk20a_fecs_trace_hash_del(struct gk20a *g, u32 context_ptr)
174{
175 struct hlist_node *tmp;
176 struct gk20a_fecs_trace_hash_ent *ent;
177 struct gk20a_fecs_trace *trace = g->fecs_trace;
178
179 gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
180 "freeing hash entry context_ptr=%x", context_ptr);
181
182 mutex_lock(&trace->hash_lock);
183 hash_for_each_possible_safe(trace->pid_hash_table, ent, tmp, node,
184 context_ptr) {
185 if (ent->context_ptr == context_ptr) {
186 hash_del(&ent->node);
187 gk20a_dbg(gpu_dbg_ctxsw,
188 "freed hash entry=%p context_ptr=%x", ent,
189 ent->context_ptr);
190 kfree(ent);
191 break;
192 }
193 }
194 mutex_unlock(&trace->hash_lock);
195}
196
197static void gk20a_fecs_trace_free_hash_table(struct gk20a *g)
198{
199 u32 bkt;
200 struct hlist_node *tmp;
201 struct gk20a_fecs_trace_hash_ent *ent;
202 struct gk20a_fecs_trace *trace = g->fecs_trace;
203
204 gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw, "trace=%p", trace);
205
206 mutex_lock(&trace->hash_lock);
207 hash_for_each_safe(trace->pid_hash_table, bkt, tmp, ent, node) {
208 hash_del(&ent->node);
209 kfree(ent);
210 }
211 mutex_unlock(&trace->hash_lock);
212
213}
214
215static pid_t gk20a_fecs_trace_find_pid(struct gk20a *g, u32 context_ptr)
216{
217 struct gk20a_fecs_trace_hash_ent *ent;
218 struct gk20a_fecs_trace *trace = g->fecs_trace;
219 pid_t pid = 0;
220
221 mutex_lock(&trace->hash_lock);
222 hash_for_each_possible(trace->pid_hash_table, ent, node, context_ptr) {
223 if (ent->context_ptr == context_ptr) {
224 gk20a_dbg(gpu_dbg_ctxsw,
225 "found context_ptr=%x -> pid=%d",
226 ent->context_ptr, ent->pid);
227 pid = ent->pid;
228 break;
229 }
230 }
231 mutex_unlock(&trace->hash_lock);
232
233 return pid;
234}
235
236/*
237 * Converts HW entry format to userspace-facing format and pushes it to the
238 * queue.
239 */
240static int gk20a_fecs_trace_ring_read(struct gk20a *g, int index)
241{
242 int i;
243 struct nvgpu_ctxsw_trace_entry entry = { };
244 struct gk20a_fecs_trace *trace = g->fecs_trace;
245 pid_t cur_pid;
246 pid_t new_pid;
247
248 /* for now, only one VM */
249 const int vmid = 0;
250
251 struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record(
252 trace, index);
253
254 gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
255 "consuming record trace=%p read=%d record=%p", trace, index, r);
256
257 if (unlikely(!gk20a_fecs_trace_is_valid_record(r))) {
258 gk20a_warn(dev_from_gk20a(g),
259 "trace=%p read=%d record=%p magic_lo=%08x magic_hi=%08x (invalid)",
260 trace, index, r, r->magic_lo, r->magic_hi);
261 return -EINVAL;
262 }
263
264 cur_pid = gk20a_fecs_trace_find_pid(g, r->context_ptr);
265 new_pid = gk20a_fecs_trace_find_pid(g, r->new_context_ptr);
266
267 gk20a_dbg(gpu_dbg_fn | gpu_dbg_ctxsw,
268 "context_ptr=%x (pid=%d) new_context_ptr=%x (pid=%d)",
269 r->context_ptr, cur_pid, r->new_context_ptr, new_pid);
270
271 entry.context_id = r->context_id;
272 entry.vmid = vmid;
273
274 /* insert SOF event if needed */
275 if (!(trace->sof_mask & BIT(vmid))) {
276 entry.tag = NVGPU_CTXSW_TAG_SOF;
277 entry.timestamp = trace->sof;
278 entry.context_id = 0;
279 entry.pid = 0;
280
281 gk20a_dbg(gpu_dbg_ctxsw, "SOF time=%llx", entry.timestamp);
282 gk20a_ctxsw_trace_write(g, &entry);
283 trace->sof_mask |= BIT(vmid);
284 }
285
286 /* break out FECS record into trace events */
287 for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
288
289 entry.tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
290 entry.timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
291 entry.timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
292
293 gk20a_dbg(gpu_dbg_ctxsw,
294 "tag=%x timestamp=%llx context_id=%08x new_context_id=%08x",
295 entry.tag, entry.timestamp, r->context_id,
296 r->new_context_id);
297
298 switch (entry.tag) {
299 case NVGPU_CTXSW_TAG_RESTORE_START:
300 case NVGPU_CTXSW_TAG_CONTEXT_START:
301 entry.context_id = r->new_context_id;
302 entry.pid = new_pid;
303 break;
304
305 case NVGPU_CTXSW_TAG_CTXSW_REQ_BY_HOST:
306 case NVGPU_CTXSW_TAG_FE_ACK:
307 case NVGPU_CTXSW_TAG_FE_ACK_WFI:
308 case NVGPU_CTXSW_TAG_FE_ACK_GFXP:
309 case NVGPU_CTXSW_TAG_FE_ACK_CTAP:
310 case NVGPU_CTXSW_TAG_FE_ACK_CILP:
311 case NVGPU_CTXSW_TAG_SAVE_END:
312 entry.context_id = r->context_id;
313 entry.pid = cur_pid;
314 break;
315
316 default:
317 /* tags are not guaranteed to start at the beginning */
318 WARN_ON(entry.tag && (entry.tag != NVGPU_CTXSW_TAG_INVALID_TIMESTAMP));
319 continue;
320 }
321
322 gk20a_dbg(gpu_dbg_ctxsw, "tag=%x context_id=%x pid=%lld",
323 entry.tag, entry.context_id, entry.pid);
324
325 if (!entry.context_id)
326 continue;
327
328 gk20a_ctxsw_trace_write(g, &entry);
329 }
330
331 gk20a_ctxsw_trace_wake_up(g, vmid);
332 return 0;
333}
334
335static int gk20a_fecs_trace_poll(struct gk20a *g)
336{
337 struct gk20a_fecs_trace *trace = g->fecs_trace;
338
339 int read = 0;
340 int write = 0;
341 int cnt;
342 int err;
343
344 err = gk20a_busy(g->dev);
345 if (unlikely(err))
346 return err;
347
348 mutex_lock(&trace->poll_lock);
349 write = gk20a_fecs_trace_get_write_index(g);
350 if (unlikely((write < 0) || (write >= GK20A_FECS_TRACE_NUM_RECORDS))) {
351 gk20a_err(dev_from_gk20a(g),
352 "failed to acquire write index, write=%d", write);
353 err = write;
354 goto done;
355 }
356
357 read = gk20a_fecs_trace_get_read_index(g);
358
359 cnt = CIRC_CNT(write, read, GK20A_FECS_TRACE_NUM_RECORDS);
360 if (!cnt)
361 goto done;
362
363 gk20a_dbg(gpu_dbg_ctxsw,
364 "circular buffer: read=%d (mailbox=%d) write=%d cnt=%d",
365 read, gk20a_fecs_trace_get_read_index(g), write, cnt);
366
367 /* we did not send any SOF yet */
368 trace->sof_mask = 0;
369
370 /* consume all records */
371 while (read != write) {
372 gk20a_fecs_trace_ring_read(g, read);
373
374 /* Get to next record. */
375 read = (read + 1) & (GK20A_FECS_TRACE_NUM_RECORDS - 1);
376 gk20a_fecs_trace_set_read_index(g, read);
377 }
378
379done:
380 /*
381 * OK, we read out all the entries... a new "frame" starts here.
382 * We remember the Start Of Frame time and insert it on the next
383 * iteration.
384 */
385 trace->sof = gk20a_read_ptimer(g);
386
387 mutex_unlock(&trace->poll_lock);
388 gk20a_idle(g->dev);
389 return err;
390}
391
392static int gk20a_fecs_trace_periodic_polling(void *arg)
393{
394 struct gk20a *g = (struct gk20a *)arg;
395 struct timespec ts = ns_to_timespec(GK20A_FECS_TRACE_FRAME_PERIOD_NS);
396
397 pr_info("%s: running\n", __func__);
398
399 while (!kthread_should_stop()) {
400
401 hrtimer_nanosleep(&ts, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
402
403 gk20a_fecs_trace_poll(g);
404 }
405
406 return 0;
407}
408
409static int gk20a_fecs_trace_alloc_ring(struct gk20a *g)
410{
411 struct gk20a_fecs_trace *trace = g->fecs_trace;
412
413 return gk20a_gmmu_alloc(g, GK20A_FECS_TRACE_NUM_RECORDS
414 * ctxsw_prog_record_timestamp_record_size_in_bytes_v(),
415 &trace->trace_buf);
416}
417
418static void gk20a_fecs_trace_free_ring(struct gk20a *g)
419{
420 struct gk20a_fecs_trace *trace = g->fecs_trace;
421
422 gk20a_gmmu_free(g, &trace->trace_buf);
423}
424
425#ifdef CONFIG_DEBUG_FS
426/*
427 * The sequence iterator functions. We simply use the count of the
428 * next line as our internal position.
429 */
430static void *gk20a_fecs_trace_debugfs_ring_seq_start(
431 struct seq_file *s, loff_t *pos)
432{
433 if (*pos >= GK20A_FECS_TRACE_NUM_RECORDS)
434 return NULL;
435
436 return pos;
437}
438
439static void *gk20a_fecs_trace_debugfs_ring_seq_next(
440 struct seq_file *s, void *v, loff_t *pos)
441{
442 ++(*pos);
443 if (*pos >= GK20A_FECS_TRACE_NUM_RECORDS)
444 return NULL;
445 return pos;
446}
447
448static void gk20a_fecs_trace_debugfs_ring_seq_stop(
449 struct seq_file *s, void *v)
450{
451}
452
453static int gk20a_fecs_trace_debugfs_ring_seq_show(
454 struct seq_file *s, void *v)
455{
456 loff_t *pos = (loff_t *) v;
457 struct gk20a *g = *(struct gk20a **)s->private;
458 struct gk20a_fecs_trace *trace = g->fecs_trace;
459 struct gk20a_fecs_trace_record *r = gk20a_fecs_trace_get_record(trace, *pos);
460 int i;
461 const u32 invalid_tag =
462 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v();
463 u32 tag;
464 u64 timestamp;
465
466 seq_printf(s, "record #%lld (%p)\n", *pos, r);
467 seq_printf(s, "\tmagic_lo=%08x\n", r->magic_lo);
468 seq_printf(s, "\tmagic_hi=%08x\n", r->magic_hi);
469 if (gk20a_fecs_trace_is_valid_record(r)) {
470 seq_printf(s, "\tcontext_ptr=%08x\n", r->context_ptr);
471 seq_printf(s, "\tcontext_id=%08x\n", r->context_id);
472 seq_printf(s, "\tnew_context_ptr=%08x\n", r->new_context_ptr);
473 seq_printf(s, "\tnew_context_id=%08x\n", r->new_context_id);
474 for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
475 tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
476 if (tag == invalid_tag)
477 continue;
478 timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
479 timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
480 seq_printf(s, "\ttag=%02x timestamp=%012llx\n", tag, timestamp);
481 }
482 }
483 return 0;
484}
485
486/*
487 * Tie them all together into a set of seq_operations.
488 */
489const struct seq_operations gk20a_fecs_trace_debugfs_ring_seq_ops = {
490 .start = gk20a_fecs_trace_debugfs_ring_seq_start,
491 .next = gk20a_fecs_trace_debugfs_ring_seq_next,
492 .stop = gk20a_fecs_trace_debugfs_ring_seq_stop,
493 .show = gk20a_fecs_trace_debugfs_ring_seq_show
494};
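
For readers unfamiliar with seq_file: start() yields the first position, show() formats the record at the current position, next() advances, and stop() ends one pass. Roughly, seq_read() drives the ops like this (a sketch of the contract, not the kernel's actual implementation):

    void *v;
    loff_t pos = 0;

    v = ops->start(s, &pos);
    while (v) {
        ops->show(s, v);
        v = ops->next(s, v, &pos);
    }
    ops->stop(s, v);
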
495
496/*
497 * Time to set up the file operations for our debugfs file. In this case,
498 * all we need is an open function that wires up the sequence ops.
499 */
500
501static int gk20a_ctxsw_debugfs_ring_open(struct inode *inode,
502 struct file *file)
503{
504 struct gk20a **p;
505
506 if (!capable(CAP_SYS_ADMIN))
507 return -EPERM;
508
509 p = __seq_open_private(file, &gk20a_fecs_trace_debugfs_ring_seq_ops,
510 sizeof(struct gk20a *));
511 if (!p)
512 return -ENOMEM;
513
514 *p = (struct gk20a *)inode->i_private;
515 return 0;
516}
517
518/*
519 * The file operations structure contains our open function along with
520 * the set of canned seq_ ops.
521 */
522const struct file_operations gk20a_fecs_trace_debugfs_ring_fops = {
523 .owner = THIS_MODULE,
524 .open = gk20a_ctxsw_debugfs_ring_open,
525 .read = seq_read,
526 .llseek = seq_lseek,
527 .release = seq_release_private
528};
529
530static int gk20a_fecs_trace_debugfs_read(void *arg, u64 *val)
531{
532 *val = gk20a_fecs_trace_get_read_index((struct gk20a *)arg);
533 return 0;
534}
535DEFINE_SIMPLE_ATTRIBUTE(gk20a_fecs_trace_debugfs_read_fops,
536 gk20a_fecs_trace_debugfs_read, NULL, "%llu\n");
537
538static int gk20a_fecs_trace_debugfs_write(void *arg, u64 *val)
539{
540 *val = gk20a_fecs_trace_get_write_index((struct gk20a *)arg);
541 return 0;
542}
543DEFINE_SIMPLE_ATTRIBUTE(gk20a_fecs_trace_debugfs_write_fops,
544 gk20a_fecs_trace_debugfs_write, NULL, "%llu\n");
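
DEFINE_SIMPLE_ATTRIBUTE() expands a get/set pair into a complete file_operations suitable for debugfs_create_file(). Passing NULL for the setter, as above, makes the attribute effectively read-only. A minimal sketch with a hypothetical attribute:

    static int my_attr_get(void *data, u64 *val)
    {
        *val = *(u64 *)data; /* fetch the value behind the file */
        return 0;
    }
    DEFINE_SIMPLE_ATTRIBUTE(my_attr_fops, my_attr_get, NULL, "%llu\n");
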
545
546static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
547{
548 struct gk20a_platform *plat = platform_get_drvdata(g->dev);
549
550 debugfs_create_file("ctxsw_trace_read", 0600, plat->debugfs, g,
551 &gk20a_fecs_trace_debugfs_read_fops);
552 debugfs_create_file("ctxsw_trace_write", 0600, plat->debugfs, g,
553 &gk20a_fecs_trace_debugfs_write_fops);
554 debugfs_create_file("ctxsw_trace_ring", 0600, plat->debugfs, g,
555 &gk20a_fecs_trace_debugfs_ring_fops);
556}
557
558static void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
559{
560 struct gk20a_platform *plat = platform_get_drvdata(g->dev);
561
562 debugfs_remove_recursive(plat->debugfs);
563}
564
565#else
566
567static void gk20a_fecs_trace_debugfs_init(struct gk20a *g)
568{
569}
570
571static inline void gk20a_fecs_trace_debugfs_cleanup(struct gk20a *g)
572{
573}
574
575#endif /* CONFIG_DEBUG_FS */
576
577static int gk20a_fecs_trace_init(struct gk20a *g)
578{
579 struct gk20a_fecs_trace *trace;
580 int err;
581
582 trace = kzalloc(sizeof(struct gk20a_fecs_trace), GFP_KERNEL);
583 if (!trace) {
584 gk20a_warn(dev_from_gk20a(g), "failed to allocate fecs_trace");
585 return -ENOMEM;
586 }
587 g->fecs_trace = trace;
588
589 BUG_ON(!is_power_of_2(GK20A_FECS_TRACE_NUM_RECORDS));
590 err = gk20a_fecs_trace_alloc_ring(g);
591 if (err) {
592 gk20a_warn(dev_from_gk20a(g), "failed to allocate FECS ring");
593 goto clean;
594 }
595
596 mutex_init(&trace->poll_lock);
597 mutex_init(&trace->hash_lock);
598 hash_init(trace->pid_hash_table);
599
600 gk20a_fecs_trace_debugfs_init(g);
601 return 0;
602
603clean:
604 kfree(trace);
605 g->fecs_trace = NULL;
606 return err;
607}
608
609static int gk20a_fecs_trace_bind_channel(struct gk20a *g,
610 struct channel_gk20a *ch)
611{
612 /*
613 * map our circular buffer into the context space and store its
614 * physical address in the context image header.
615 */
616
617 u32 lo;
618 u32 hi;
619 phys_addr_t pa;
620 struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
621 struct gk20a_fecs_trace *trace = g->fecs_trace;
622 void *ctx_ptr;
623 u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch);
624
625 gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
626 "hw_chid=%d context_ptr=%x inst_block=%llx",
627 ch->hw_chid, context_ptr, gk20a_mem_phys(&ch->inst_block));
628
629 if (!trace)
630 return -ENOMEM;
631
632 pa = gk20a_mem_phys(&trace->trace_buf);
633 if (!pa)
634 return -ENOMEM;
635
636 ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages,
637 PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, 0,
638 pgprot_writecombine(PAGE_KERNEL));
639 if (!ctx_ptr)
640 return -ENOMEM;
641
642 lo = u64_lo32(pa);
643 hi = u64_hi32(pa);
644
645 gk20a_dbg(gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi,
646 lo, GK20A_FECS_TRACE_NUM_RECORDS);
647
648 gk20a_mem_wr32(ctx_ptr
649 + ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(),
650 0, lo);
651 gk20a_mem_wr32(ctx_ptr
652 + ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(),
653 0, ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi));
654 gk20a_mem_wr32(ctx_ptr
655 + ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
656 0, ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(
657 GK20A_FECS_TRACE_NUM_RECORDS));
658
659 vunmap(ctx_ptr);
660 gk20a_fecs_trace_hash_add(g, context_ptr, ch->pid);
661
662 return 0;
663}
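
The buffer's physical address is split into two 32-bit halves before being patched into the context image. A worked example of the split performed above (address value illustrative only):

    phys_addr_t pa = 0x123458000ULL; /* illustrative */
    u32 lo = u64_lo32(pa); /* 0x23458000 -> ..._timestamp_buffer_ptr */
    u32 hi = u64_hi32(pa); /* 0x00000001 -> ..._timestamp_buffer_ptr_hi */
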
664
665static int gk20a_fecs_trace_unbind_channel(struct gk20a *g, struct channel_gk20a *ch)
666{
667 u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch);
668
669 gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw,
670 "ch=%p context_ptr=%x", ch, context_ptr);
671
672 if (g->ops.fecs_trace.flush)
673 g->ops.fecs_trace.flush(g);
674 gk20a_fecs_trace_poll(g);
675 gk20a_fecs_trace_hash_del(g, context_ptr);
676 return 0;
677}
678
679static int gk20a_fecs_trace_reset(struct gk20a *g)
680{
681 gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, "");
682
683 if (g->ops.fecs_trace.flush)
684 g->ops.fecs_trace.flush(g);
685 gk20a_fecs_trace_poll(g);
686 return gk20a_fecs_trace_set_read_index(g, 0);
687}
688
689static int gk20a_fecs_trace_deinit(struct gk20a *g)
690{
691 struct gk20a_fecs_trace *trace = g->fecs_trace;
692
693 gk20a_fecs_trace_debugfs_cleanup(g);
694 kthread_stop(trace->poll_task);
695 gk20a_fecs_trace_free_ring(g);
696 gk20a_fecs_trace_free_hash_table(g);
697
698 kfree(g->fecs_trace);
699 g->fecs_trace = NULL;
700 return 0;
701}
702
703static int gk20a_gr_max_entries(struct gk20a *g,
704 struct nvgpu_ctxsw_trace_filter *filter)
705{
706 int n;
707 int tag;
708
709 /* Compute number of entries per record, with given filter */
710 for (n = 0, tag = 0; tag < gk20a_fecs_trace_num_ts(); tag++)
711 n += (NVGPU_CTXSW_FILTER_ISSET(tag, filter) != 0);
712
713 /* Return max number of entries generated for the whole ring */
714 return n * GK20A_FECS_TRACE_NUM_RECORDS;
715}
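
For instance, a filter that selects four of the timestamp tags over an illustrative ring of 2048 records bounds one full drain at 4 * 2048 = 8192 entries.
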
716
717static int gk20a_fecs_trace_enable(struct gk20a *g)
718{
719 struct gk20a_fecs_trace *trace = g->fecs_trace;
720 struct task_struct *task;
721
722 if (!trace->poll_task) {
723 task = kthread_run(gk20a_fecs_trace_periodic_polling, g, __func__);
724 if (unlikely(IS_ERR(task))) {
725 gk20a_warn(dev_from_gk20a(g), "failed to create FECS polling task");
726 return PTR_ERR(task);
727 }
728 trace->poll_task = task;
729 }
730
731 return 0;
732}
733
734static int gk20a_fecs_trace_disable(struct gk20a *g)
735{
736 struct gk20a_fecs_trace *trace = g->fecs_trace;
737
738 if (trace->poll_task) {
739 kthread_stop(trace->poll_task);
740 trace->poll_task = NULL;
741 }
742
743 return -EPERM;
744}
745
746void gk20a_init_fecs_trace_ops(struct gpu_ops *ops)
747{
748 ops->fecs_trace.init = gk20a_fecs_trace_init;
749 ops->fecs_trace.deinit = gk20a_fecs_trace_deinit;
750 ops->fecs_trace.enable = gk20a_fecs_trace_enable;
751 ops->fecs_trace.disable = gk20a_fecs_trace_disable;
752 ops->fecs_trace.reset = gk20a_fecs_trace_reset;
753 ops->fecs_trace.flush = NULL;
754 ops->fecs_trace.poll = gk20a_fecs_trace_poll;
755 ops->fecs_trace.bind_channel = gk20a_fecs_trace_bind_channel;
756 ops->fecs_trace.unbind_channel = gk20a_fecs_trace_unbind_channel;
757 ops->fecs_trace.max_entries = gk20a_gr_max_entries;
758}
759#else
760void gk20a_init_fecs_trace_ops(struct gpu_ops *ops)
761{
762}
763#endif /* CONFIG_GK20A_CTXSW_TRACE */
diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.h b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.h
new file mode 100644
index 00000000..4979d6c6
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#ifndef __FECS_TRACE_GK20A_H
15#define __FECS_TRACE_GK20A_H
16
17struct gpu_ops;
18void gk20a_init_fecs_trace_ops(struct gpu_ops *ops);
19
20#endif /* __FECS_TRACE_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 769960af..029a713f 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -25,6 +25,7 @@
25 25
26#include "gk20a.h" 26#include "gk20a.h"
27#include "debug_gk20a.h" 27#include "debug_gk20a.h"
28#include "ctxsw_trace_gk20a.h"
28#include "semaphore_gk20a.h" 29#include "semaphore_gk20a.h"
29#include "hw_fifo_gk20a.h" 30#include "hw_fifo_gk20a.h"
30#include "hw_pbdma_gk20a.h" 31#include "hw_pbdma_gk20a.h"
@@ -303,12 +304,6 @@ static int init_runlist(struct gk20a *g, struct fifo_gk20a *f)
303 if (!runlist->active_tsgs) 304 if (!runlist->active_tsgs)
304 goto clean_up_runlist_info; 305 goto clean_up_runlist_info;
305 306
306 runlist->high_prio_channels =
307 kzalloc(DIV_ROUND_UP(f->num_channels, BITS_PER_BYTE),
308 GFP_KERNEL);
309 if (!runlist->high_prio_channels)
310 goto clean_up_runlist_info;
311
312 runlist_size = ram_rl_entry_size_v() * f->num_runlist_entries; 307 runlist_size = ram_rl_entry_size_v() * f->num_runlist_entries;
313 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) { 308 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
314 int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]); 309 int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]);
@@ -337,9 +332,6 @@ clean_up_runlist_info:
337 kfree(runlist->active_tsgs); 332 kfree(runlist->active_tsgs);
338 runlist->active_tsgs = NULL; 333 runlist->active_tsgs = NULL;
339 334
340 kfree(runlist->high_prio_channels);
341 runlist->high_prio_channels = NULL;
342
343 kfree(f->runlist_info); 335 kfree(f->runlist_info);
344 f->runlist_info = NULL; 336 f->runlist_info = NULL;
345 337
@@ -471,8 +463,7 @@ static void gk20a_init_fifo_pbdma_intr_descs(struct fifo_gk20a *f)
471 /* Can be used for sw-methods, or represents 463 /* Can be used for sw-methods, or represents
472 * a recoverable timeout. */ 464 * a recoverable timeout. */
473 f->intr.pbdma.restartable_0 = 465 f->intr.pbdma.restartable_0 =
474 pbdma_intr_0_device_pending_f() | 466 pbdma_intr_0_device_pending_f();
475 pbdma_intr_0_acquire_pending_f();
476} 467}
477 468
478static int gk20a_init_fifo_setup_sw(struct gk20a *g) 469static int gk20a_init_fifo_setup_sw(struct gk20a *g)
@@ -786,13 +777,17 @@ void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id)
786 if (engine_id == top_device_info_type_enum_graphics_v()) { 777 if (engine_id == top_device_info_type_enum_graphics_v()) {
787 if (support_gk20a_pmu(g->dev) && g->elpg_enabled) 778 if (support_gk20a_pmu(g->dev) && g->elpg_enabled)
788 gk20a_pmu_disable_elpg(g); 779 gk20a_pmu_disable_elpg(g);
789 /*HALT_PIPELINE method, halt GR engine*/ 780 /*HALT_PIPELINE method, halt GR engine*/
790 if (gr_gk20a_halt_pipe(g)) 781 if (gr_gk20a_halt_pipe(g))
791 gk20a_err(dev_from_gk20a(g), 782 gk20a_err(dev_from_gk20a(g), "failed to HALT gr pipe");
792 "failed to HALT gr pipe"); 783 /* resetting engine will alter read/write index.
793 /* resetting engine using mc_enable_r() is not 784 * need to flush circular buffer before re-enabling FECS.
794 enough, we do full init sequence */ 785 */
795 gk20a_gr_reset(g); 786 if (g->ops.fecs_trace.reset)
787 g->ops.fecs_trace.reset(g);
788 /* resetting engine using mc_enable_r() is not
789 enough, we do full init sequence */
790 gk20a_gr_reset(g);
796 if (support_gk20a_pmu(g->dev) && g->elpg_enabled) 791 if (support_gk20a_pmu(g->dev) && g->elpg_enabled)
797 gk20a_pmu_enable_elpg(g); 792 gk20a_pmu_enable_elpg(g);
798 } 793 }
@@ -1662,6 +1657,12 @@ static u32 gk20a_fifo_handle_pbdma_intr(struct device *dev,
1662 u32 val = gk20a_readl(g, pbdma_acquire_r(pbdma_id)); 1657 u32 val = gk20a_readl(g, pbdma_acquire_r(pbdma_id));
1663 val &= ~pbdma_acquire_timeout_en_enable_f(); 1658 val &= ~pbdma_acquire_timeout_en_enable_f();
1664 gk20a_writel(g, pbdma_acquire_r(pbdma_id), val); 1659 gk20a_writel(g, pbdma_acquire_r(pbdma_id), val);
1660 if (g->timeouts_enabled) {
1661 reset = true;
1662 gk20a_err(dev_from_gk20a(g),
1663 "semaphore acquire timeout!");
1664 }
1665 handled |= pbdma_intr_0_acquire_pending_f();
1665 } 1666 }
1666 1667
1667 if (pbdma_intr_0 & pbdma_intr_0_pbentry_pending_f()) { 1668 if (pbdma_intr_0 & pbdma_intr_0_pbentry_pending_f()) {
@@ -2162,32 +2163,153 @@ static inline u32 gk20a_get_tsg_runlist_entry_0(struct tsg_gk20a *tsg)
2162 return runlist_entry_0; 2163 return runlist_entry_0;
2163} 2164}
2164 2165
2165/* add all active high priority channels */ 2166/* recursively construct a runlist with interleaved bare channels and TSGs */
2166static inline u32 gk20a_fifo_runlist_add_high_prio_entries( 2167static u32 *gk20a_runlist_construct_locked(struct fifo_gk20a *f,
2167 struct fifo_gk20a *f, 2168 struct fifo_runlist_info_gk20a *runlist,
2168 struct fifo_runlist_info_gk20a *runlist, 2169 u32 cur_level,
2169 u32 *runlist_entry) 2170 u32 *runlist_entry,
2171 bool interleave_enabled,
2172 bool prev_empty,
2173 u32 *entries_left)
2170{ 2174{
2171 struct channel_gk20a *ch = NULL; 2175 bool last_level = cur_level == NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH;
2172 unsigned long high_prio_chid; 2176 struct channel_gk20a *ch;
2173 u32 count = 0; 2177 bool skip_next = false;
2178 u32 chid, tsgid, count = 0;
2179
2180 gk20a_dbg_fn("");
2181
2182 /* for each bare channel, CH, on this level, insert all higher-level
2183 channels and TSGs before inserting CH. */
2184 for_each_set_bit(chid, runlist->active_channels, f->num_channels) {
2185 ch = &f->channel[chid];
2186
2187 if (ch->interleave_level != cur_level)
2188 continue;
2174 2189
2175 for_each_set_bit(high_prio_chid, 2190 if (gk20a_is_channel_marked_as_tsg(ch))
2176 runlist->high_prio_channels, f->num_channels) { 2191 continue;
2177 ch = &f->channel[high_prio_chid]; 2192
2193 if (!last_level && !skip_next) {
2194 runlist_entry = gk20a_runlist_construct_locked(f,
2195 runlist,
2196 cur_level + 1,
2197 runlist_entry,
2198 interleave_enabled,
2199 false,
2200 entries_left);
2201 /* if interleaving is disabled, higher-level channels
2202 and TSGs only need to be inserted once */
2203 if (!interleave_enabled)
2204 skip_next = true;
2205 }
2178 2206
2179 if (!gk20a_is_channel_marked_as_tsg(ch) && 2207 if (!(*entries_left))
2180 test_bit(high_prio_chid, runlist->active_channels) == 1) { 2208 return NULL;
2181 gk20a_dbg_info("add high prio channel %lu to runlist", 2209
2182 high_prio_chid); 2210 gk20a_dbg_info("add channel %d to runlist", chid);
2183 runlist_entry[0] = ram_rl_entry_chid_f(high_prio_chid); 2211 runlist_entry[0] = ram_rl_entry_chid_f(chid);
2212 runlist_entry[1] = 0;
2213 runlist_entry += 2;
2214 count++;
2215 (*entries_left)--;
2216 }
2217
2218 /* for each TSG, T, on this level, insert all higher-level channels
2219 and TSGs before inserting T. */
2220 for_each_set_bit(tsgid, runlist->active_tsgs, f->num_channels) {
2221 struct tsg_gk20a *tsg = &f->tsg[tsgid];
2222
2223 if (tsg->interleave_level != cur_level)
2224 continue;
2225
2226 if (!last_level && !skip_next) {
2227 runlist_entry = gk20a_runlist_construct_locked(f,
2228 runlist,
2229 cur_level + 1,
2230 runlist_entry,
2231 interleave_enabled,
2232 false,
2233 entries_left);
2234 if (!interleave_enabled)
2235 skip_next = true;
2236 }
2237
2238 if (!(*entries_left))
2239 return NULL;
2240
2241 /* add TSG entry */
2242 gk20a_dbg_info("add TSG %d to runlist", tsg->tsgid);
2243 runlist_entry[0] = gk20a_get_tsg_runlist_entry_0(tsg);
2244 runlist_entry[1] = 0;
2245 runlist_entry += 2;
2246 count++;
2247 (*entries_left)--;
2248
2249 mutex_lock(&tsg->ch_list_lock);
2250 /* add runnable channels bound to this TSG */
2251 list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
2252 if (!test_bit(ch->hw_chid,
2253 runlist->active_channels))
2254 continue;
2255
2256 if (!(*entries_left)) {
2257 mutex_unlock(&tsg->ch_list_lock);
2258 return NULL;
2259 }
2260
2261 gk20a_dbg_info("add channel %d to runlist",
2262 ch->hw_chid);
2263 runlist_entry[0] = ram_rl_entry_chid_f(ch->hw_chid);
2184 runlist_entry[1] = 0; 2264 runlist_entry[1] = 0;
2185 runlist_entry += 2; 2265 runlist_entry += 2;
2186 count++; 2266 count++;
2267 (*entries_left)--;
2187 } 2268 }
2269 mutex_unlock(&tsg->ch_list_lock);
2188 } 2270 }
2189 2271
2190 return count; 2272 /* append entries from higher level if this level is empty */
2273 if (!count && !last_level)
2274 runlist_entry = gk20a_runlist_construct_locked(f,
2275 runlist,
2276 cur_level + 1,
2277 runlist_entry,
2278 interleave_enabled,
2279 true,
2280 entries_left);
2281
2282 /*
2283 * if previous and this level have entries, append
2284 * entries from higher level.
2285 *
2286 * ex. dropping from MEDIUM to LOW, need to insert HIGH
2287 */
2288 if (interleave_enabled && count && !prev_empty && !last_level)
2289 runlist_entry = gk20a_runlist_construct_locked(f,
2290 runlist,
2291 cur_level + 1,
2292 runlist_entry,
2293 interleave_enabled,
2294 false,
2295 entries_left);
2296 return runlist_entry;
2297}
2298
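
A worked example of the recursion: with interleaving enabled, one HIGH channel H, one MEDIUM channel M, and two LOW channels L1 and L2, the construction emits

    H M H L1 H M H L2

since every lower-level insertion is preceded by a full pass over the higher levels, and each non-empty level is closed with one more such pass. With interleaving disabled, skip_next limits the higher levels to a single pass, giving the plain priority order H M L1 L2. The interleaved form bounds a runnable high-priority channel's wait to one lower-level timeslice plus context-switch overhead.
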
2299int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
2300 u32 id,
2301 bool is_tsg,
2302 u32 runlist_id,
2303 u32 new_level)
2304{
2305 gk20a_dbg_fn("");
2306
2307 if (is_tsg)
2308 g->fifo.tsg[id].interleave_level = new_level;
2309 else
2310 g->fifo.channel[id].interleave_level = new_level;
2311
2312 return 0;
2191} 2313}
2192 2314
2193static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id, 2315static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
@@ -2198,14 +2320,11 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
2198 struct fifo_gk20a *f = &g->fifo; 2320 struct fifo_gk20a *f = &g->fifo;
2199 struct fifo_runlist_info_gk20a *runlist = NULL; 2321 struct fifo_runlist_info_gk20a *runlist = NULL;
2200 u32 *runlist_entry_base = NULL; 2322 u32 *runlist_entry_base = NULL;
2201 u32 *runlist_entry = NULL;
2202 u64 runlist_iova; 2323 u64 runlist_iova;
2203 u32 old_buf, new_buf; 2324 u32 old_buf, new_buf;
2204 u32 chid, tsgid;
2205 struct channel_gk20a *ch = NULL; 2325 struct channel_gk20a *ch = NULL;
2206 struct tsg_gk20a *tsg = NULL; 2326 struct tsg_gk20a *tsg = NULL;
2207 u32 count = 0; 2327 u32 count = 0;
2208 u32 count_channels_in_tsg;
2209 runlist = &f->runlist_info[runlist_id]; 2328 runlist = &f->runlist_info[runlist_id];
2210 2329
2211 /* valid channel, add/remove it from active list. 2330 /* valid channel, add/remove it from active list.
@@ -2254,91 +2373,23 @@ static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
2254 2373
2255 if (hw_chid != ~0 || /* add/remove a valid channel */ 2374 if (hw_chid != ~0 || /* add/remove a valid channel */
2256 add /* resume to add all channels back */) { 2375 add /* resume to add all channels back */) {
2257 runlist_entry = runlist_entry_base; 2376 u32 max_entries = f->num_runlist_entries;
2258 2377 u32 *runlist_end;
2259 /* Runlist manipulation:
2260 Insert an entry of all high priority channels inbetween
2261 all lower priority channels. This ensure that the maximum
2262 delay a runnable high priority channel has to wait is one
2263 medium timeslice + any context switching overhead +
2264 wait on other high priority channels.
2265 add non-TSG channels first */
2266 for_each_set_bit(chid,
2267 runlist->active_channels, f->num_channels) {
2268 ch = &f->channel[chid];
2269
2270 if (!gk20a_is_channel_marked_as_tsg(ch) &&
2271 !ch->interleave) {
2272 u32 added;
2273
2274 gk20a_dbg_info("add normal prio channel %d to runlist",
2275 chid);
2276 runlist_entry[0] = ram_rl_entry_chid_f(chid);
2277 runlist_entry[1] = 0;
2278 runlist_entry += 2;
2279 count++;
2280
2281 added = gk20a_fifo_runlist_add_high_prio_entries(
2282 f,
2283 runlist,
2284 runlist_entry);
2285 count += added;
2286 runlist_entry += 2 * added;
2287 }
2288 }
2289 2378
2290 /* if there were no lower priority channels, then just 2379 runlist_end = gk20a_runlist_construct_locked(f,
2291 * add the high priority channels once. */ 2380 runlist,
2292 if (count == 0) { 2381 0,
2293 count = gk20a_fifo_runlist_add_high_prio_entries( 2382 runlist_entry_base,
2294 f, 2383 g->runlist_interleave,
2295 runlist, 2384 true,
2296 runlist_entry); 2385 &max_entries);
2297 runlist_entry += 2 * count; 2386 if (!runlist_end) {
2387 ret = -E2BIG;
2388 goto clean_up;
2298 } 2389 }
2299 2390
2300 /* now add TSG entries and channels bound to TSG */ 2391 count = (runlist_end - runlist_entry_base) / 2;
2301 mutex_lock(&f->tsg_inuse_mutex); 2392 WARN_ON(count > f->num_runlist_entries);
2302 for_each_set_bit(tsgid,
2303 runlist->active_tsgs, f->num_channels) {
2304 u32 added;
2305 tsg = &f->tsg[tsgid];
2306 /* add TSG entry */
2307 gk20a_dbg_info("add TSG %d to runlist", tsg->tsgid);
2308 runlist_entry[0] = gk20a_get_tsg_runlist_entry_0(tsg);
2309 runlist_entry[1] = 0;
2310 runlist_entry += 2;
2311 count++;
2312
2313 /* add runnable channels bound to this TSG */
2314 count_channels_in_tsg = 0;
2315 mutex_lock(&tsg->ch_list_lock);
2316 list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
2317 if (!test_bit(ch->hw_chid,
2318 runlist->active_channels))
2319 continue;
2320 gk20a_dbg_info("add channel %d to runlist",
2321 ch->hw_chid);
2322 runlist_entry[0] =
2323 ram_rl_entry_chid_f(ch->hw_chid);
2324 runlist_entry[1] = 0;
2325 runlist_entry += 2;
2326 count++;
2327 count_channels_in_tsg++;
2328 }
2329 mutex_unlock(&tsg->ch_list_lock);
2330
2331 WARN_ON(tsg->num_active_channels !=
2332 count_channels_in_tsg);
2333
2334 added = gk20a_fifo_runlist_add_high_prio_entries(
2335 f,
2336 runlist,
2337 runlist_entry);
2338 count += added;
2339 runlist_entry += 2 * added;
2340 }
2341 mutex_unlock(&f->tsg_inuse_mutex);
2342 } else /* suspend to remove all channels */ 2393 } else /* suspend to remove all channels */
2343 count = 0; 2394 count = 0;
2344 2395
@@ -2493,42 +2544,6 @@ u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g)
2493 return pbdma_signature_hw_valid_f() | pbdma_signature_sw_zero_f(); 2544 return pbdma_signature_hw_valid_f() | pbdma_signature_sw_zero_f();
2494} 2545}
2495 2546
2496int gk20a_fifo_set_channel_priority(
2497 struct gk20a *g,
2498 u32 runlist_id,
2499 u32 hw_chid,
2500 bool interleave)
2501{
2502 struct fifo_runlist_info_gk20a *runlist = NULL;
2503 struct fifo_gk20a *f = &g->fifo;
2504 struct channel_gk20a *ch = NULL;
2505
2506 if (hw_chid >= f->num_channels)
2507 return -EINVAL;
2508
2509 if (runlist_id >= f->max_runlists)
2510 return -EINVAL;
2511
2512 ch = &f->channel[hw_chid];
2513
2514 gk20a_dbg_fn("");
2515
2516 runlist = &f->runlist_info[runlist_id];
2517
2518 mutex_lock(&runlist->mutex);
2519
2520 if (ch->interleave)
2521 set_bit(hw_chid, runlist->high_prio_channels);
2522 else
2523 clear_bit(hw_chid, runlist->high_prio_channels);
2524
2525 gk20a_dbg_fn("done");
2526
2527 mutex_unlock(&runlist->mutex);
2528
2529 return 0;
2530}
2531
2532struct channel_gk20a *gk20a_fifo_channel_from_hw_chid(struct gk20a *g, 2547struct channel_gk20a *gk20a_fifo_channel_from_hw_chid(struct gk20a *g,
2533 u32 hw_chid) 2548 u32 hw_chid)
2534{ 2549{
@@ -2545,4 +2560,5 @@ void gk20a_init_fifo(struct gpu_ops *gops)
2545 gops->fifo.wait_engine_idle = gk20a_fifo_wait_engine_idle; 2560 gops->fifo.wait_engine_idle = gk20a_fifo_wait_engine_idle;
2546 gops->fifo.get_num_fifos = gk20a_fifo_get_num_fifos; 2561 gops->fifo.get_num_fifos = gk20a_fifo_get_num_fifos;
2547 gops->fifo.get_pbdma_signature = gk20a_fifo_get_pbdma_signature; 2562 gops->fifo.get_pbdma_signature = gk20a_fifo_get_pbdma_signature;
2563 gops->fifo.set_runlist_interleave = gk20a_fifo_set_runlist_interleave;
2548} 2564}
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index ee4e7328..0979bf2b 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -31,7 +31,6 @@
31struct fifo_runlist_info_gk20a { 31struct fifo_runlist_info_gk20a {
32 unsigned long *active_channels; 32 unsigned long *active_channels;
33 unsigned long *active_tsgs; 33 unsigned long *active_tsgs;
34 unsigned long *high_prio_channels;
35 /* Each engine has its own SW and HW runlist buffer.*/ 34 /* Each engine has its own SW and HW runlist buffer.*/
36 struct mem_desc mem[MAX_RUNLIST_BUFFERS]; 35 struct mem_desc mem[MAX_RUNLIST_BUFFERS];
37 u32 cur_buffer; 36 u32 cur_buffer;
@@ -184,8 +183,6 @@ void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g,
184int gk20a_fifo_wait_engine_idle(struct gk20a *g); 183int gk20a_fifo_wait_engine_idle(struct gk20a *g);
185u32 gk20a_fifo_engine_interrupt_mask(struct gk20a *g); 184u32 gk20a_fifo_engine_interrupt_mask(struct gk20a *g);
186u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g); 185u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g);
187int gk20a_fifo_set_channel_priority(struct gk20a *g, u32 runlist_id,
188 u32 hw_chid, bool interleave);
189u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g, 186u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
190 int *__id, bool *__is_tsg); 187 int *__id, bool *__is_tsg);
191bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g, 188bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
@@ -198,4 +195,9 @@ struct channel_gk20a *gk20a_fifo_channel_from_hw_chid(struct gk20a *g,
198 u32 hw_chid); 195 u32 hw_chid);
199 196
200void gk20a_fifo_issue_preempt(struct gk20a *g, u32 id, bool is_tsg); 197void gk20a_fifo_issue_preempt(struct gk20a *g, u32 id, bool is_tsg);
198int gk20a_fifo_set_runlist_interleave(struct gk20a *g,
199 u32 id,
200 bool is_tsg,
201 u32 runlist_id,
202 u32 new_level);
201#endif /*__GR_GK20A_H__*/ 203#endif /*__GR_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 6a5986a7..b8753a21 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -60,6 +60,7 @@
60#include "hw_gr_gk20a.h" 60#include "hw_gr_gk20a.h"
61#include "hw_fb_gk20a.h" 61#include "hw_fb_gk20a.h"
62#include "gk20a_scale.h" 62#include "gk20a_scale.h"
63#include "ctxsw_trace_gk20a.h"
63#include "dbg_gpu_gk20a.h" 64#include "dbg_gpu_gk20a.h"
64#include "gk20a_allocator.h" 65#include "gk20a_allocator.h"
65#include "hal.h" 66#include "hal.h"
@@ -80,7 +81,7 @@
80/* TODO: Change to e.g. "nvidia-gpu%s" once we have symlinks in place. */ 81/* TODO: Change to e.g. "nvidia-gpu%s" once we have symlinks in place. */
81#define INTERFACE_NAME "nvhost%s-gpu" 82#define INTERFACE_NAME "nvhost%s-gpu"
82 83
83#define GK20A_NUM_CDEVS 6 84#define GK20A_NUM_CDEVS 7
84 85
85#define EMC3D_DEFAULT_RATIO 750 86#define EMC3D_DEFAULT_RATIO 750
86 87
@@ -169,6 +170,19 @@ static const struct file_operations gk20a_tsg_ops = {
169 .unlocked_ioctl = gk20a_tsg_dev_ioctl, 170 .unlocked_ioctl = gk20a_tsg_dev_ioctl,
170}; 171};
171 172
173static const struct file_operations gk20a_ctxsw_ops = {
174 .owner = THIS_MODULE,
175 .release = gk20a_ctxsw_dev_release,
176 .open = gk20a_ctxsw_dev_open,
177#ifdef CONFIG_COMPAT
178 .compat_ioctl = gk20a_ctxsw_dev_ioctl,
179#endif
180 .unlocked_ioctl = gk20a_ctxsw_dev_ioctl,
181 .poll = gk20a_ctxsw_dev_poll,
182 .read = gk20a_ctxsw_dev_read,
183 .mmap = gk20a_ctxsw_dev_mmap,
184};
185
172static inline void sim_writel(struct gk20a *g, u32 r, u32 v) 186static inline void sim_writel(struct gk20a *g, u32 r, u32 v)
173{ 187{
174 writel(v, g->sim.regs+r); 188 writel(v, g->sim.regs+r);
@@ -672,9 +686,6 @@ static int gk20a_init_support(struct platform_device *dev)
672 mutex_init(&g->ch_wdt_lock); 686 mutex_init(&g->ch_wdt_lock);
673 mutex_init(&g->poweroff_lock); 687 mutex_init(&g->poweroff_lock);
674 688
675 mutex_init(&g->interleave_lock);
676 g->num_interleaved_channels = 0;
677
678 g->remove_support = gk20a_remove_support; 689 g->remove_support = gk20a_remove_support;
679 return 0; 690 return 0;
680 691
@@ -884,6 +895,10 @@ static int gk20a_pm_finalize_poweron(struct device *dev)
884 goto done; 895 goto done;
885 } 896 }
886 897
898 err = gk20a_ctxsw_trace_init(g);
899 if (err)
900 gk20a_warn(dev, "could not initialize ctxsw tracing");
901
887 /* Restore the debug setting */ 902 /* Restore the debug setting */
888 g->ops.mm.set_debug_mode(g, g->mmu_debug_ctrl); 903 g->ops.mm.set_debug_mode(g, g->mmu_debug_ctrl);
889 904
@@ -1012,6 +1027,11 @@ void gk20a_user_deinit(struct platform_device *dev)
1012 cdev_del(&g->tsg.cdev); 1027 cdev_del(&g->tsg.cdev);
1013 } 1028 }
1014 1029
1030 if (g->ctxsw.node) {
1031 device_destroy(g->class, g->ctxsw.cdev.dev);
1032 cdev_del(&g->ctxsw.cdev);
1033 }
1034
1015 if (g->cdev_region) 1035 if (g->cdev_region)
1016 unregister_chrdev_region(g->cdev_region, GK20A_NUM_CDEVS); 1036 unregister_chrdev_region(g->cdev_region, GK20A_NUM_CDEVS);
1017 1037
@@ -1077,6 +1097,15 @@ int gk20a_user_init(struct platform_device *dev)
1077 if (err) 1097 if (err)
1078 goto fail; 1098 goto fail;
1079 1099
1100#ifdef CONFIG_GK20A_CTXSW_TRACE
1101 err = gk20a_create_device(dev, devno++, "-ctxsw",
1102 &g->ctxsw.cdev, &g->ctxsw.node,
1103 &gk20a_ctxsw_ops);
1104 if (err)
1105 goto fail;
1106#endif
1107
1108
1080 return 0; 1109 return 0;
1081fail: 1110fail:
1082 gk20a_user_deinit(dev); 1111 gk20a_user_deinit(dev);
@@ -1400,9 +1429,11 @@ static int gk20a_probe(struct platform_device *dev)
1400 1429
1401 spin_lock_init(&gk20a->mc_enable_lock); 1430 spin_lock_init(&gk20a->mc_enable_lock);
1402 1431
1432#ifdef CONFIG_RESET_CONTROLLER
1403 platform->reset_control = devm_reset_control_get(&dev->dev, NULL); 1433 platform->reset_control = devm_reset_control_get(&dev->dev, NULL);
1404 if (IS_ERR(platform->reset_control)) 1434 if (IS_ERR(platform->reset_control))
1405 platform->reset_control = NULL; 1435 platform->reset_control = NULL;
1436#endif
1406 1437
1407 gk20a_debug_init(dev); 1438 gk20a_debug_init(dev);
1408 1439
@@ -1439,14 +1470,11 @@ static int gk20a_probe(struct platform_device *dev)
1439 if (tegra_platform_is_silicon()) 1470 if (tegra_platform_is_silicon())
1440 gk20a->timeouts_enabled = true; 1471 gk20a->timeouts_enabled = true;
1441 1472
1442 gk20a->interleave_high_priority = true; 1473 gk20a->runlist_interleave = true;
1443 1474
1444 gk20a->timeslice_low_priority_us = 1300; 1475 gk20a->timeslice_low_priority_us = 1300;
1445 gk20a->timeslice_medium_priority_us = 2600; 1476 gk20a->timeslice_medium_priority_us = 2600;
1446 if (gk20a->interleave_high_priority) 1477 gk20a->timeslice_high_priority_us = 5200;
1447 gk20a->timeslice_high_priority_us = 3000;
1448 else
1449 gk20a->timeslice_high_priority_us = 5200;
1450 1478
1451 /* Set up initial power settings. For non-silicon platforms, disable * 1479 /* Set up initial power settings. For non-silicon platforms, disable *
1452 * power features and for silicon platforms, read from platform data */ 1480 * power features and for silicon platforms, read from platform data */
@@ -1527,16 +1555,17 @@ static int gk20a_probe(struct platform_device *dev)
1527 platform->debugfs, 1555 platform->debugfs,
1528 &gk20a->timeslice_high_priority_us); 1556 &gk20a->timeslice_high_priority_us);
1529 1557
1530 gk20a->debugfs_interleave_high_priority = 1558 gk20a->debugfs_runlist_interleave =
1531 debugfs_create_bool("interleave_high_priority", 1559 debugfs_create_bool("runlist_interleave",
1532 S_IRUGO|S_IWUSR, 1560 S_IRUGO|S_IWUSR,
1533 platform->debugfs, 1561 platform->debugfs,
1534 &gk20a->interleave_high_priority); 1562 &gk20a->runlist_interleave);
1535 1563
1536 gr_gk20a_debugfs_init(gk20a); 1564 gr_gk20a_debugfs_init(gk20a);
1537 gk20a_pmu_debugfs_init(dev); 1565 gk20a_pmu_debugfs_init(dev);
1538 gk20a_cde_debugfs_init(dev); 1566 gk20a_cde_debugfs_init(dev);
1539 gk20a_alloc_debugfs_init(dev); 1567 gk20a_alloc_debugfs_init(dev);
1568 gk20a_mm_debugfs_init(dev);
1540#endif 1569#endif
1541 1570
1542 gk20a_init_gr(gk20a); 1571 gk20a_init_gr(gk20a);
@@ -1558,6 +1587,8 @@ static int __exit gk20a_remove(struct platform_device *dev)
1558 if (platform->has_cde) 1587 if (platform->has_cde)
1559 gk20a_cde_destroy(g); 1588 gk20a_cde_destroy(g);
1560 1589
1590 gk20a_ctxsw_trace_cleanup(g);
1591
1561 if (IS_ENABLED(CONFIG_GK20A_DEVFREQ)) 1592 if (IS_ENABLED(CONFIG_GK20A_DEVFREQ))
1562 gk20a_scale_exit(dev); 1593 gk20a_scale_exit(dev);
1563 1594
@@ -1774,7 +1805,10 @@ void gk20a_enable(struct gk20a *g, u32 units)
1774void gk20a_reset(struct gk20a *g, u32 units) 1805void gk20a_reset(struct gk20a *g, u32 units)
1775{ 1806{
1776 gk20a_disable(g, units); 1807 gk20a_disable(g, units);
1777 udelay(20); 1808 if (units & mc_enable_ce2_enabled_f())
1809 udelay(500);
1810 else
1811 udelay(20);
1778 gk20a_enable(g, units); 1812 gk20a_enable(g, units);
1779} 1813}
1780 1814
@@ -2095,6 +2129,19 @@ gk20a_request_firmware(struct gk20a *g, const char *fw_name)
2095 return fw; 2129 return fw;
2096} 2130}
2097 2131
2132
2133u64 gk20a_read_ptimer(struct gk20a *g)
2134{
2135 u32 time_hi0 = gk20a_readl(g, timer_time_1_r());
2136 u32 time_lo = gk20a_readl(g, timer_time_0_r());
2137 u32 time_hi1 = gk20a_readl(g, timer_time_1_r());
2138 u32 time_hi = (time_lo & (1L << 31)) ? time_hi0 : time_hi1;
2139 u64 time = ((u64)time_hi << 32) | time_lo;
2140
2141 return time;
2142}
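
The helper above samples the high word before and after the low word and uses lo's MSB to pick the consistent sample: if the MSB is set, lo was read in the upper half of its range, so it cannot have wrapped since the first hi read; if it is clear, lo may have just wrapped, so the second hi read matches it. A more common retry-loop variant of the same lock-free idea, with assumed MMIO accessors:

    static u64 read_counter64(void)
    {
        u32 hi, lo;

        do {
            hi = read_hi(); /* read_hi()/read_lo(): hypothetical accessors */
            lo = read_lo();
        } while (hi != read_hi()); /* retry until hi is stable across lo */

        return ((u64)hi << 32) | lo;
    }
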
2143
2144
2098MODULE_LICENSE("GPL v2"); 2145MODULE_LICENSE("GPL v2");
2099module_init(gk20a_init); 2146module_init(gk20a_init);
2100module_exit(gk20a_exit); 2147module_exit(gk20a_exit);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 340f358a..8a1f82bc 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -25,6 +25,8 @@ struct channel_gk20a;
25struct gr_gk20a; 25struct gr_gk20a;
26struct sim_gk20a; 26struct sim_gk20a;
27struct gk20a_ctxsw_ucode_segments; 27struct gk20a_ctxsw_ucode_segments;
28struct gk20a_fecs_trace;
29struct gk20a_ctxsw_trace;
28struct acr_gm20b; 30struct acr_gm20b;
29 31
30#include <linux/sched.h> 32#include <linux/sched.h>
@@ -54,8 +56,6 @@ struct acr_gm20b;
54 32 ns is the resolution of ptimer. */ 56 32 ns is the resolution of ptimer. */
55#define PTIMER_REF_FREQ_HZ 31250000 57#define PTIMER_REF_FREQ_HZ 31250000
56 58
57#define MAX_INTERLEAVED_CHANNELS 32
58
59struct cooling_device_gk20a { 59struct cooling_device_gk20a {
60 struct thermal_cooling_device *gk20a_cooling_dev; 60 struct thermal_cooling_device *gk20a_cooling_dev;
61 unsigned int gk20a_freq_state; 61 unsigned int gk20a_freq_state;
@@ -236,6 +236,7 @@ struct gpu_ops {
236 void (*slcg_therm_load_gating_prod)(struct gk20a *g, bool prod); 236 void (*slcg_therm_load_gating_prod)(struct gk20a *g, bool prod);
237 void (*slcg_xbar_load_gating_prod)(struct gk20a *g, bool prod); 237 void (*slcg_xbar_load_gating_prod)(struct gk20a *g, bool prod);
238 void (*blcg_bus_load_gating_prod)(struct gk20a *g, bool prod); 238 void (*blcg_bus_load_gating_prod)(struct gk20a *g, bool prod);
239 void (*blcg_ce_load_gating_prod)(struct gk20a *g, bool prod);
239 void (*blcg_ctxsw_firmware_load_gating_prod)(struct gk20a *g, bool prod); 240 void (*blcg_ctxsw_firmware_load_gating_prod)(struct gk20a *g, bool prod);
240 void (*blcg_fb_load_gating_prod)(struct gk20a *g, bool prod); 241 void (*blcg_fb_load_gating_prod)(struct gk20a *g, bool prod);
241 void (*blcg_fifo_load_gating_prod)(struct gk20a *g, bool prod); 242 void (*blcg_fifo_load_gating_prod)(struct gk20a *g, bool prod);
@@ -267,6 +268,11 @@ struct gpu_ops {
267 u32 (*get_num_fifos)(struct gk20a *g); 268 u32 (*get_num_fifos)(struct gk20a *g);
268 u32 (*get_pbdma_signature)(struct gk20a *g); 269 u32 (*get_pbdma_signature)(struct gk20a *g);
269 int (*channel_set_priority)(struct channel_gk20a *ch, u32 priority); 270 int (*channel_set_priority)(struct channel_gk20a *ch, u32 priority);
271 int (*set_runlist_interleave)(struct gk20a *g, u32 id,
272 bool is_tsg, u32 runlist_id,
273 u32 new_level);
274 int (*channel_set_timeslice)(struct channel_gk20a *ch,
275 u32 timeslice);
270 } fifo; 276 } fifo;
271 struct pmu_v { 277 struct pmu_v {
272 /*used for change of enum zbc update cmd id from ver 0 to ver1*/ 278 /*used for change of enum zbc update cmd id from ver 0 to ver1*/
@@ -369,6 +375,19 @@ struct gpu_ops {
369 bool use_dma_for_fw_bootstrap; 375 bool use_dma_for_fw_bootstrap;
370 } gr_ctx; 376 } gr_ctx;
371 struct { 377 struct {
378 int (*init)(struct gk20a *g);
379 int (*max_entries)(struct gk20a *,
380 struct nvgpu_ctxsw_trace_filter *);
381 int (*flush)(struct gk20a *g);
382 int (*poll)(struct gk20a *g);
383 int (*enable)(struct gk20a *g);
384 int (*disable)(struct gk20a *g);
385 int (*reset)(struct gk20a *g);
386 int (*bind_channel)(struct gk20a *, struct channel_gk20a *);
387 int (*unbind_channel)(struct gk20a *, struct channel_gk20a *);
388 int (*deinit)(struct gk20a *g);
389 } fecs_trace;
390 struct {
372 bool (*support_sparse)(struct gk20a *g); 391 bool (*support_sparse)(struct gk20a *g);
373 bool (*is_debug_mode_enabled)(struct gk20a *g); 392 bool (*is_debug_mode_enabled)(struct gk20a *g);
374 void (*set_debug_mode)(struct gk20a *g, bool enable); 393 void (*set_debug_mode)(struct gk20a *g, bool enable);
@@ -535,10 +554,7 @@ struct gk20a {
535 u32 timeslice_low_priority_us; 554 u32 timeslice_low_priority_us;
536 u32 timeslice_medium_priority_us; 555 u32 timeslice_medium_priority_us;
537 u32 timeslice_high_priority_us; 556 u32 timeslice_high_priority_us;
538 u32 interleave_high_priority; 557 u32 runlist_interleave;
539
540 struct mutex interleave_lock;
541 u32 num_interleaved_channels;
542 558
543 bool slcg_enabled; 559 bool slcg_enabled;
544 bool blcg_enabled; 560 bool blcg_enabled;
@@ -563,7 +579,7 @@ struct gk20a {
563 struct dentry *debugfs_timeslice_low_priority_us; 579 struct dentry *debugfs_timeslice_low_priority_us;
564 struct dentry *debugfs_timeslice_medium_priority_us; 580 struct dentry *debugfs_timeslice_medium_priority_us;
565 struct dentry *debugfs_timeslice_high_priority_us; 581 struct dentry *debugfs_timeslice_high_priority_us;
566 struct dentry *debugfs_interleave_high_priority; 582 struct dentry *debugfs_runlist_interleave;
567 583
568#endif 584#endif
569 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info; 585 struct gk20a_ctxsw_ucode_info ctxsw_ucode_info;
@@ -575,6 +591,14 @@ struct gk20a {
575 int dbg_powergating_disabled_refcount; /*refcount for pg disable */ 591 int dbg_powergating_disabled_refcount; /*refcount for pg disable */
576 int dbg_timeout_disabled_refcount; /*refcount for timeout disable */ 592 int dbg_timeout_disabled_refcount; /*refcount for timeout disable */
577 593
594 /*
595 * When set subsequent VMAs will separate fixed and non-fixed
596 * allocations. This avoids conflicts with fixed and non-fixed allocs
597 * for some tests. The value in separate_fixed_allocs is used to
598 * determine the split boundary.
599 */
600 u64 separate_fixed_allocs;
601
578 void (*remove_support)(struct platform_device *); 602 void (*remove_support)(struct platform_device *);
579 603
580 u64 pg_ingating_time_us; 604 u64 pg_ingating_time_us;
@@ -612,6 +636,11 @@ struct gk20a {
612 struct device *node; 636 struct device *node;
613 } tsg; 637 } tsg;
614 638
639 struct {
640 struct cdev cdev;
641 struct device *node;
642 } ctxsw;
643
615 struct mutex client_lock; 644 struct mutex client_lock;
616 int client_refcount; /* open channels and ctrl nodes */ 645 int client_refcount; /* open channels and ctrl nodes */
617 646
@@ -638,6 +667,9 @@ struct gk20a {
638 667
639 struct gk20a_scale_profile *scale_profile; 668 struct gk20a_scale_profile *scale_profile;
640 669
670 struct gk20a_ctxsw_trace *ctxsw_trace;
671 struct gk20a_fecs_trace *fecs_trace;
672
641 struct device_dma_parameters dma_parms; 673 struct device_dma_parameters dma_parms;
642 674
643 struct gk20a_cde_app cde_app; 675 struct gk20a_cde_app cde_app;
@@ -715,6 +747,7 @@ enum gk20a_dbg_categories {
715 gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */ 747 gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */
716 gpu_dbg_cde = BIT(10), /* cde info messages */ 748 gpu_dbg_cde = BIT(10), /* cde info messages */
717 gpu_dbg_cde_ctx = BIT(11), /* cde context usage messages */ 749 gpu_dbg_cde_ctx = BIT(11), /* cde context usage messages */
750 gpu_dbg_ctxsw = BIT(12), /* ctxsw tracing */
718 gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */ 751 gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */
719}; 752};
720 753
@@ -961,4 +994,6 @@ static inline u32 scale_ptimer(u32 timeout , u32 scale10x)
961 else 994 else
962 return (timeout * 10) / scale10x; 995 return (timeout * 10) / scale10x;
963} 996}
997
998u64 gk20a_read_ptimer(struct gk20a *g);
964#endif /* GK20A_H */ 999#endif /* GK20A_H */
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c b/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c
index 0e6b576b..d433c9bb 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * GK20A Graphics 4 * GK20A Graphics
5 * 5 *
6 * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved. 6 * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify it 8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License, 9 * under the terms and conditions of the GNU General Public License,
@@ -30,7 +30,6 @@
30#include "fifo_gk20a.h" 30#include "fifo_gk20a.h"
31#include "pmu_gk20a.h" 31#include "pmu_gk20a.h"
32 32
33
34#define PTIMER_FP_FACTOR 1000000 33#define PTIMER_FP_FACTOR 1000000
35 34
36#define ROOTRW (S_IRWXU|S_IRGRP|S_IROTH) 35#define ROOTRW (S_IRWXU|S_IRGRP|S_IROTH)
@@ -100,6 +99,9 @@ static ssize_t blcg_enable_store(struct device *device,
100 99
101 if (g->ops.clock_gating.blcg_bus_load_gating_prod) 100 if (g->ops.clock_gating.blcg_bus_load_gating_prod)
102 g->ops.clock_gating.blcg_bus_load_gating_prod(g, g->blcg_enabled); 101 g->ops.clock_gating.blcg_bus_load_gating_prod(g, g->blcg_enabled);
102 if (g->ops.clock_gating.blcg_ce_load_gating_prod)
103 g->ops.clock_gating.blcg_ce_load_gating_prod(g,
104 g->blcg_enabled);
103 if (g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod) 105 if (g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod)
104 g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod(g, g->blcg_enabled); 106 g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod(g, g->blcg_enabled);
105 if (g->ops.clock_gating.blcg_fb_load_gating_prod) 107 if (g->ops.clock_gating.blcg_fb_load_gating_prod)
@@ -784,8 +786,15 @@ void gk20a_remove_sysfs(struct device *dev)
784 device_remove_file(dev, &dev_attr_allow_all); 786 device_remove_file(dev, &dev_attr_allow_all);
785 device_remove_file(dev, &dev_attr_tpc_fs_mask); 787 device_remove_file(dev, &dev_attr_tpc_fs_mask);
786 788
787 if (g->host1x_dev && (dev->parent != &g->host1x_dev->dev)) 789 if (g->host1x_dev && (dev->parent != &g->host1x_dev->dev)) {
788 sysfs_remove_link(&g->host1x_dev->dev.kobj, dev_name(dev)); 790 sysfs_remove_link(&g->host1x_dev->dev.kobj, dev_name(dev));
791 if (strcmp(dev_name(dev), "gpu.0")) {
792 struct kobject *kobj = &dev->kobj;
793 struct device *parent = container_of((kobj->parent),
794 struct device, kobj);
795 sysfs_remove_link(&parent->kobj, "gpu.0");
796 }
797 }
789} 798}
790 799
791void gk20a_create_sysfs(struct platform_device *dev) 800void gk20a_create_sysfs(struct platform_device *dev)
@@ -817,10 +826,19 @@ void gk20a_create_sysfs(struct platform_device *dev)
817 error |= device_create_file(&dev->dev, &dev_attr_allow_all); 826 error |= device_create_file(&dev->dev, &dev_attr_allow_all);
818 error |= device_create_file(&dev->dev, &dev_attr_tpc_fs_mask); 827 error |= device_create_file(&dev->dev, &dev_attr_tpc_fs_mask);
819 828
820 if (g->host1x_dev && (dev->dev.parent != &g->host1x_dev->dev)) 829 if (g->host1x_dev && (dev->dev.parent != &g->host1x_dev->dev)) {
821 error |= sysfs_create_link(&g->host1x_dev->dev.kobj, 830 error |= sysfs_create_link(&g->host1x_dev->dev.kobj,
822 &dev->dev.kobj, 831 &dev->dev.kobj,
823 dev_name(&dev->dev)); 832 dev_name(&dev->dev));
833 if (strcmp(dev_name(&dev->dev), "gpu.0")) {
834 struct kobject *kobj = &dev->dev.kobj;
835 struct device *parent = container_of((kobj->parent),
836 struct device, kobj);
837 error |= sysfs_create_link(&parent->kobj,
838 &dev->dev.kobj, "gpu.0");
839 }
840
841 }
824 842
825 if (error) 843 if (error)
826 dev_err(&dev->dev, "Failed to create sysfs attributes!\n"); 844 dev_err(&dev->dev, "Failed to create sysfs attributes!\n");
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
index 7e37a965..a10650be 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -56,6 +56,7 @@
56#include "debug_gk20a.h" 56#include "debug_gk20a.h"
57#include "semaphore_gk20a.h" 57#include "semaphore_gk20a.h"
58#include "platform_gk20a.h" 58#include "platform_gk20a.h"
59#include "ctxsw_trace_gk20a.h"
59 60
60#define BLK_SIZE (256) 61#define BLK_SIZE (256)
61 62
@@ -2855,6 +2856,13 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
2855 "fail to load golden ctx image"); 2856 "fail to load golden ctx image");
2856 goto out; 2857 goto out;
2857 } 2858 }
2859 if (g->ops.fecs_trace.bind_channel) {
2860 err = g->ops.fecs_trace.bind_channel(g, c);
2861 if (err) {
2862 gk20a_warn(dev_from_gk20a(g),
2863 "fail to bind channel for ctxsw trace");
2864 }
2865 }
2858 c->first_init = true; 2866 c->first_init = true;
2859 } 2867 }
2860 2868
@@ -4217,7 +4225,15 @@ out:
4217static void gr_gk20a_load_gating_prod(struct gk20a *g) 4225static void gr_gk20a_load_gating_prod(struct gk20a *g)
4218{ 4226{
4219 /* slcg prod values */ 4227 /* slcg prod values */
4220 g->ops.clock_gating.slcg_gr_load_gating_prod(g, g->slcg_enabled); 4228 if (g->ops.clock_gating.slcg_bus_load_gating_prod)
4229 g->ops.clock_gating.slcg_bus_load_gating_prod(g,
4230 g->slcg_enabled);
4231 if (g->ops.clock_gating.slcg_chiplet_load_gating_prod)
4232 g->ops.clock_gating.slcg_chiplet_load_gating_prod(g,
4233 g->slcg_enabled);
4234 if (g->ops.clock_gating.slcg_gr_load_gating_prod)
4235 g->ops.clock_gating.slcg_gr_load_gating_prod(g,
4236 g->slcg_enabled);
4221 if (g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod) 4237 if (g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod)
4222 g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod(g, 4238 g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod(g,
4223 g->slcg_enabled); 4239 g->slcg_enabled);
@@ -4227,6 +4243,12 @@ static void gr_gk20a_load_gating_prod(struct gk20a *g)
4227 g->slcg_enabled); 4243 g->slcg_enabled);
4228 4244
4229 /* blcg prod values */ 4245 /* blcg prod values */
4246 if (g->ops.clock_gating.blcg_bus_load_gating_prod)
4247 g->ops.clock_gating.blcg_bus_load_gating_prod(g,
4248 g->blcg_enabled);
4249 if (g->ops.clock_gating.blcg_ce_load_gating_prod)
4250 g->ops.clock_gating.blcg_ce_load_gating_prod(g,
4251 g->blcg_enabled);
4230 g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled); 4252 g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
4231 if (g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod) 4253 if (g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod)
4232 g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod(g, 4254 g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod(g,
@@ -7463,6 +7485,7 @@ static int gr_gk20a_dump_gr_status_regs(struct gk20a *g,
7463 return 0; 7485 return 0;
7464} 7486}
7465 7487
7488#ifdef CONFIG_DEBUG_FS
7466int gr_gk20a_debugfs_init(struct gk20a *g) 7489int gr_gk20a_debugfs_init(struct gk20a *g)
7467{ 7490{
7468 struct gk20a_platform *platform = platform_get_drvdata(g->dev); 7491 struct gk20a_platform *platform = platform_get_drvdata(g->dev);
@@ -7474,6 +7497,7 @@ int gr_gk20a_debugfs_init(struct gk20a *g)
7474 7497
7475 return 0; 7498 return 0;
7476} 7499}
7500#endif
7477 7501
7478static void gr_gk20a_init_cyclestats(struct gk20a *g) 7502static void gr_gk20a_init_cyclestats(struct gk20a *g)
7479{ 7503{
diff --git a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
index a9ad970a..9718aad2 100644
--- a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
@@ -22,6 +22,7 @@
22#include "gk20a_gating_reglist.h" 22#include "gk20a_gating_reglist.h"
23#include "channel_gk20a.h" 23#include "channel_gk20a.h"
24#include "gr_ctx_gk20a.h" 24#include "gr_ctx_gk20a.h"
25#include "fecs_trace_gk20a.h"
25#include "mm_gk20a.h" 26#include "mm_gk20a.h"
26#include "mc_gk20a.h" 27#include "mc_gk20a.h"
27#include "pmu_gk20a.h" 28#include "pmu_gk20a.h"
@@ -57,6 +58,7 @@ int gk20a_init_hal(struct gk20a *g)
57 gk20a_init_mc(gops); 58 gk20a_init_mc(gops);
58 gk20a_init_ltc(gops); 59 gk20a_init_ltc(gops);
59 gk20a_init_gr_ops(gops); 60 gk20a_init_gr_ops(gops);
61 gk20a_init_fecs_trace_ops(gops);
60 gk20a_init_fb(gops); 62 gk20a_init_fb(gops);
61 gk20a_init_fifo(gops); 63 gk20a_init_fifo(gops);
62 gk20a_init_ce2(gops); 64 gk20a_init_ce2(gops);
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
index 39cbbb58..da555f7c 100644
--- a/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2012-2015, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2012-2016, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -246,4 +246,192 @@ static inline u32 ctxsw_prog_main_image_context_id_o(void)
246{ 246{
247 return 0x000000f0; 247 return 0x000000f0;
248} 248}
249static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_control_o(void)
250{
251 return 0x000000ac;
252}
253static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(u32 v)
254{
255 return (v & 0xffff) << 0;
256}
257static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(void)
258{
259 return 0x000000b0;
260}
261static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_v_m(void)
262{
263 return 0xfffffff << 0;
264}
265static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_m(void)
266{
267 return 0x3 << 28;
268}
269static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_vid_mem_f(void)
270{
271 return 0x0;
272}
273static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_coherent_f(void)
274{
275 return 0x20000000;
276}
277static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_noncoherent_f(void)
278{
279 return 0x30000000;
280}
281static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(void)
282{
283 return 0x000000b4;
284}
285static inline u32 ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(u32 v)
286{
287 return (v & 0xffffffff) << 0;
288}
289static inline u32 ctxsw_prog_record_timestamp_record_size_in_bytes_v(void)
290{
291 return 0x00000080;
292}
293static inline u32 ctxsw_prog_record_timestamp_record_size_in_words_v(void)
294{
295 return 0x00000020;
296}
297static inline u32 ctxsw_prog_record_timestamp_magic_value_lo_o(void)
298{
299 return 0x00000000;
300}
301static inline u32 ctxsw_prog_record_timestamp_magic_value_lo_v_value_v(void)
302{
303 return 0x00000000;
304}
305static inline u32 ctxsw_prog_record_timestamp_magic_value_hi_o(void)
306{
307 return 0x00000004;
308}
309static inline u32 ctxsw_prog_record_timestamp_magic_value_hi_v_value_v(void)
310{
311 return 0x600dbeef;
312}
313static inline u32 ctxsw_prog_record_timestamp_context_id_o(void)
314{
315 return 0x00000008;
316}
317static inline u32 ctxsw_prog_record_timestamp_context_ptr_o(void)
318{
319 return 0x0000000c;
320}
321static inline u32 ctxsw_prog_record_timestamp_new_context_id_o(void)
322{
323 return 0x00000010;
324}
325static inline u32 ctxsw_prog_record_timestamp_new_context_ptr_o(void)
326{
327 return 0x00000014;
328}
329static inline u32 ctxsw_prog_record_timestamp_timestamp_lo_o(void)
330{
331 return 0x00000018;
332}
333static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_o(void)
334{
335 return 0x0000001c;
336}
337static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_v_f(u32 v)
338{
339 return (v & 0xffffff) << 0;
340}
341static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_v_v(u32 r)
342{
343 return (r >> 0) & 0xffffff;
344}
345static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_f(u32 v)
346{
347 return (v & 0xff) << 24;
348}
349static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_m(void)
350{
351 return 0xff << 24;
352}
353static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_v(u32 r)
354{
355 return (r >> 24) & 0xff;
356}
357static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_ctxsw_req_by_host_v(void)
358{
359 return 0x00000001;
360}
361static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_ctxsw_req_by_host_f(void)
362{
363 return 0x1000000;
364}
365static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_v(void)
366{
367 return 0x00000002;
368}
369static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_f(void)
370{
371 return 0x2000000;
372}
373static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_wfi_v(void)
374{
375 return 0x0000000a;
376}
377static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_wfi_f(void)
378{
379 return 0xa000000;
380}
381static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_gfxp_v(void)
382{
383 return 0x0000000b;
384}
385static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_gfxp_f(void)
386{
387 return 0xb000000;
388}
389static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_ctap_v(void)
390{
391 return 0x0000000c;
392}
393static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_ctap_f(void)
394{
395 return 0xc000000;
396}
397static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_cilp_v(void)
398{
399 return 0x0000000d;
400}
401static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_fe_ack_cilp_f(void)
402{
403 return 0xd000000;
404}
405static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_save_end_v(void)
406{
407 return 0x00000003;
408}
409static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_save_end_f(void)
410{
411 return 0x3000000;
412}
413static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_restore_start_v(void)
414{
415 return 0x00000004;
416}
417static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_restore_start_f(void)
418{
419 return 0x4000000;
420}
421static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_context_start_v(void)
422{
423 return 0x00000005;
424}
425static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_context_start_f(void)
426{
427 return 0x5000000;
428}
429static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v(void)
430{
431 return 0x000000ff;
432}
433static inline u32 ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_f(void)
434{
435 return 0xff000000;
436}
249#endif 437#endif
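
The accessors above spell out the FECS timestamp record layout consumed by CTXSW trace: the outgoing/incoming context ids and pointers sit at offsets 0x08-0x14, and a 64-bit timestamp is split across 0x18/0x1c, with the top byte of the high word carrying the event tag. A minimal kernel-side sketch of decoding one record with these helpers, assuming "record" points at the raw little-endian 32-bit words (the function name and the emit step are hypothetical):

static void decode_fecs_record_sketch(const u32 *record)
{
	u32 ctx_id = record[ctxsw_prog_record_timestamp_context_id_o() / sizeof(u32)];
	u32 new_ctx_id = record[ctxsw_prog_record_timestamp_new_context_id_o() / sizeof(u32)];
	u32 lo = record[ctxsw_prog_record_timestamp_timestamp_lo_o() / sizeof(u32)];
	u32 hi = record[ctxsw_prog_record_timestamp_timestamp_hi_o() / sizeof(u32)];
	u32 tag = ctxsw_prog_record_timestamp_timestamp_hi_tag_v(hi);
	/* 56-bit timestamp: the 24 low bits of the hi word sit above the 32-bit lo word */
	u64 ts = ((u64)ctxsw_prog_record_timestamp_timestamp_hi_v_v(hi) << 32) | lo;

	if (tag == ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v())
		return; /* record carries no usable timestamp */
	/* ... emit (ctx_id, new_ctx_id, tag, ts) into the trace ring ... */
}
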
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h
index 6db5654b..94770431 100644
--- a/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2012-2016, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -258,6 +258,10 @@ static inline u32 ltc_ltcs_ltss_intr_en_evicted_cb_m(void)
258{ 258{
259 return 0x1 << 20; 259 return 0x1 << 20;
260} 260}
261static inline u32 ltc_ltcs_ltss_intr_en_illegal_compstat_m(void)
262{
263 return 0x1 << 21;
264}
261static inline u32 ltc_ltc0_lts0_intr_r(void) 265static inline u32 ltc_ltc0_lts0_intr_r(void)
262{ 266{
263 return 0x00141020; 267 return 0x00141020;
diff --git a/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
index 22bc50ac..4cb36cbe 100644
--- a/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2013-2016, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -98,4 +98,12 @@ static inline u32 timer_pri_timeout_fecs_errcode_r(void)
98{ 98{
99 return 0x0000908c; 99 return 0x0000908c;
100} 100}
101static inline u32 timer_time_0_r(void)
102{
103 return 0x00009400;
104}
105static inline u32 timer_time_1_r(void)
106{
107 return 0x00009410;
108}
101#endif 109#endif
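
timer_time_0_r()/timer_time_1_r() expose the free-running PTIMER counter as low/high 32-bit words (the same pair is added to the gm20b header further down). Since the two words cannot be read atomically, a consistent 64-bit sample needs a hi/lo/hi read sequence; a minimal sketch, assuming TIME_1 holds the upper 32 bits:

static u64 read_ptimer_sketch(struct gk20a *g)
{
	u32 hi = gk20a_readl(g, timer_time_1_r());
	u32 lo, hi2;

	/* re-read the high word until it is stable across the low-word read */
	do {
		hi2 = hi;
		lo = gk20a_readl(g, timer_time_0_r());
		hi = gk20a_readl(g, timer_time_1_r());
	} while (hi != hi2);

	return ((u64)hi << 32) | lo;
}

This is the kind of GPU-side sample the new GET_CPU_TIME_CORRELATION_INFO ioctl (see the uapi hunk below) pairs with a CPU clock read.
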
diff --git a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
index c6ff07da..0d9a98b4 100644
--- a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
@@ -1,9 +1,7 @@
1/* 1/*
2 * drivers/video/tegra/host/gk20a/ltc_gk20a.c 2 * GK20A L2
3 * 3 *
4 * GK20A Graphics 4 * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
5 *
6 * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
7 * 5 *
8 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -173,9 +171,17 @@ out:
173 171
174static void gk20a_ltc_init_fs_state(struct gk20a *g) 172static void gk20a_ltc_init_fs_state(struct gk20a *g)
175{ 173{
174 u32 reg;
175
176 gk20a_dbg_info("initialize gk20a L2"); 176 gk20a_dbg_info("initialize gk20a L2");
177 177
178 g->max_ltc_count = g->ltc_count = 1; 178 g->max_ltc_count = g->ltc_count = 1;
179
180 /* Disable LTC interrupts */
181 reg = gk20a_readl(g, ltc_ltcs_ltss_intr_r());
182 reg &= ~ltc_ltcs_ltss_intr_en_evicted_cb_m();
183 reg &= ~ltc_ltcs_ltss_intr_en_illegal_compstat_m();
184 gk20a_writel(g, ltc_ltcs_ltss_intr_r(), reg);
179} 185}
180 186
181static void gk20a_ltc_isr(struct gk20a *g) 187static void gk20a_ltc_isr(struct gk20a *g)
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 738df2af..7a02d68e 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GK20A memory management 2 * GK20A memory management
3 * 3 *
4 * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -323,7 +323,7 @@ static int gk20a_alloc_comptags(struct gk20a *g,
323 if (err) 323 if (err)
324 return err; 324 return err;
325 325
326 /* 326 /*
327 * offset needs to be at the start of a page/cacheline boundary; 327 * offset needs to be at the start of a page/cacheline boundary;
328 * prune the preceding ctaglines that were allocated for alignment. 328 * prune the preceding ctaglines that were allocated for alignment.
329 */ 329 */
@@ -1290,12 +1290,6 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
1290 int ctag_granularity = g->ops.fb.compression_page_size(g); 1290 int ctag_granularity = g->ops.fb.compression_page_size(g);
1291 u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity); 1291 u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity);
1292 1292
1293 if (clear_ctags && ctag_offset) {
1294 /* init/clear the ctag buffer */
1295 g->ops.ltc.cbc_ctrl(g, gk20a_cbc_op_clear,
1296 ctag_offset, ctag_offset + ctag_lines - 1);
1297 }
1298
1299 /* Allocate (or validate when map_offset != 0) the virtual address. */ 1293 /* Allocate (or validate when map_offset != 0) the virtual address. */
1300 if (!map_offset) { 1294 if (!map_offset) {
1301 map_offset = gk20a_vm_alloc_va(vm, size, 1295 map_offset = gk20a_vm_alloc_va(vm, size,
@@ -1651,17 +1645,14 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
1651 bfr.kind_v = bfr.uc_kind_v; 1645 bfr.kind_v = bfr.uc_kind_v;
1652 } else { 1646 } else {
1653 gk20a_get_comptags(d, dmabuf, &comptags); 1647 gk20a_get_comptags(d, dmabuf, &comptags);
1654 clear_ctags = true;
1655
1656 if (comptags.lines < comptags.allocated_lines) {
1657 /* clear tail-padding comptags */
1658 u32 ctagmin = comptags.offset + comptags.lines;
1659 u32 ctagmax = comptags.offset +
1660 comptags.allocated_lines - 1;
1661 1648
1649 if (g->ops.ltc.cbc_ctrl)
1662 g->ops.ltc.cbc_ctrl(g, gk20a_cbc_op_clear, 1650 g->ops.ltc.cbc_ctrl(g, gk20a_cbc_op_clear,
1663 ctagmin, ctagmax); 1651 comptags.offset,
1664 } 1652 comptags.offset +
1653 comptags.allocated_lines - 1);
1654 else
1655 clear_ctags = true;
1665 } 1656 }
1666 } 1657 }
1667 1658
@@ -2815,6 +2806,7 @@ int gk20a_init_vm(struct mm_gk20a *mm,
2815 u64 small_vma_start, small_vma_limit, large_vma_start, large_vma_limit, 2806 u64 small_vma_start, small_vma_limit, large_vma_start, large_vma_limit,
2816 kernel_vma_start, kernel_vma_limit; 2807 kernel_vma_start, kernel_vma_limit;
2817 u32 pde_lo, pde_hi; 2808 u32 pde_lo, pde_hi;
2809 struct gk20a *g = mm->g;
2818 2810
2819 /* note: this must match gmmu_pgsz_gk20a enum */ 2811 /* note: this must match gmmu_pgsz_gk20a enum */
2820 u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, big_page_size, SZ_4K }; 2812 u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, big_page_size, SZ_4K };
@@ -2904,6 +2896,31 @@ int gk20a_init_vm(struct mm_gk20a *mm,
2904 goto clean_up_pdes; 2896 goto clean_up_pdes;
2905 } 2897 }
2906 2898
2899 /*
2900 * Attempt to make a separate VM for fixed allocations.
2901 */
2902 if (g->separate_fixed_allocs &&
2903 small_vma_start < small_vma_limit) {
2904 if (g->separate_fixed_allocs >= small_vma_limit)
2905 goto clean_up_pdes;
2906
2907 snprintf(alloc_name, sizeof(alloc_name),
2908 "gk20a_%s-fixed", name);
2909
2910 err = __gk20a_allocator_init(&vm->fixed,
2911 vm, alloc_name,
2912 small_vma_start,
2913 g->separate_fixed_allocs,
2914 SZ_4K,
2915 GPU_BALLOC_MAX_ORDER,
2916 GPU_BALLOC_GVA_SPACE);
2917 if (err)
2918 goto clean_up_ptes;
2919
2920 /* Make sure to update the user vma size. */
2921 small_vma_start = g->separate_fixed_allocs;
2922 }
2923
2907 if (small_vma_start < small_vma_limit) { 2924 if (small_vma_start < small_vma_limit) {
2908 snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name, 2925 snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name,
2909 vm->gmmu_page_sizes[gmmu_page_size_small] >> 10); 2926 vm->gmmu_page_sizes[gmmu_page_size_small] >> 10);
@@ -3066,14 +3083,17 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
3066 } 3083 }
3067 3084
3068 vma = &vm->vma[pgsz_idx]; 3085 vma = &vm->vma[pgsz_idx];
3069 if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET) 3086 if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET) {
3087 if (vm->fixed.init)
3088 vma = &vm->fixed;
3070 vaddr_start = gk20a_balloc_fixed(vma, args->o_a.offset, 3089 vaddr_start = gk20a_balloc_fixed(vma, args->o_a.offset,
3071 (u64)args->pages * 3090 (u64)args->pages *
3072 (u64)args->page_size); 3091 (u64)args->page_size);
3073 else 3092 } else {
3074 vaddr_start = gk20a_balloc(vma, 3093 vaddr_start = gk20a_balloc(vma,
3075 (u64)args->pages * 3094 (u64)args->pages *
3076 (u64)args->page_size); 3095 (u64)args->page_size);
3096 }
3077 3097
3078 if (!vaddr_start) { 3098 if (!vaddr_start) {
3079 kfree(va_node); 3099 kfree(va_node);
@@ -3140,7 +3160,10 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
3140 pgsz_idx = __nv_gmmu_va_is_big_page_region(vm, args->offset) ? 3160 pgsz_idx = __nv_gmmu_va_is_big_page_region(vm, args->offset) ?
3141 gmmu_page_size_big : gmmu_page_size_small; 3161 gmmu_page_size_big : gmmu_page_size_small;
3142 3162
3143 vma = &vm->vma[pgsz_idx]; 3163 if (vm->fixed.init)
3164 vma = &vm->fixed;
3165 else
3166 vma = &vm->vma[pgsz_idx];
3144 gk20a_bfree(vma, args->offset); 3167 gk20a_bfree(vma, args->offset);
3145 3168
3146 mutex_lock(&vm->update_gmmu_lock); 3169 mutex_lock(&vm->update_gmmu_lock);
@@ -3330,6 +3353,8 @@ void gk20a_deinit_vm(struct vm_gk20a *vm)
3330 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]); 3353 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]);
3331 if (vm->vma[gmmu_page_size_small].init) 3354 if (vm->vma[gmmu_page_size_small].init)
3332 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]); 3355 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]);
3356 if (vm->fixed.init)
3357 gk20a_allocator_destroy(&vm->fixed);
3333 3358
3334 gk20a_vm_free_entries(vm, &vm->pdb, 0); 3359 gk20a_vm_free_entries(vm, &vm->pdb, 0);
3335} 3360}
@@ -3843,6 +3868,16 @@ clean_up:
3843 return err; 3868 return err;
3844} 3869}
3845 3870
3871void gk20a_mm_debugfs_init(struct platform_device *pdev)
3872{
3873 struct gk20a_platform *platform = platform_get_drvdata(pdev);
3874 struct dentry *gpu_root = platform->debugfs;
3875 struct gk20a *g = gk20a_get_platform(pdev)->g;
3876
3877 debugfs_create_x64("separate_fixed_allocs", 0664, gpu_root,
3878 &g->separate_fixed_allocs);
3879}
3880
3846void gk20a_init_mm(struct gpu_ops *gops) 3881void gk20a_init_mm(struct gpu_ops *gops)
3847{ 3882{
3848 gops->mm.is_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled; 3883 gops->mm.is_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled;
@@ -3863,4 +3898,3 @@ void gk20a_init_mm(struct gpu_ops *gops)
3863 gops->mm.init_pdb = gk20a_mm_init_pdb; 3898 gops->mm.init_pdb = gk20a_mm_init_pdb;
3864 gops->mm.init_mm_setup_hw = gk20a_init_mm_setup_hw; 3899 gops->mm.init_mm_setup_hw = gk20a_init_mm_setup_hw;
3865} 3900}
3866
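
Taken together, the mm_gk20a.c hunks make the small-page VMA optionally splittable: when the new separate_fixed_allocs debugfs value is nonzero at VM init, it acts as a GPU VA boundary; fixed-offset allocations get a dedicated vm->fixed buddy allocator over [small_vma_start, boundary), and the regular small-page allocator is moved up to start at the boundary. Condensed, the routing that results (a sketch, not part of the patch):

/* alloc: only FIXED_OFFSET requests are steered to the split allocator */
struct gk20a_allocator *vma = &vm->vma[pgsz_idx];

if ((args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET) && vm->fixed.init)
	vma = &vm->fixed;

/* free: routed through vm->fixed whenever it was initialized */
vma = vm->fixed.init ? &vm->fixed : &vm->vma[pgsz_idx];
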
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index b8b0ca49..368b32d3 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -259,6 +259,10 @@ struct vm_gk20a {
259 struct gk20a_mm_entry pdb; 259 struct gk20a_mm_entry pdb;
260 260
261 struct gk20a_allocator vma[gmmu_nr_page_sizes]; 261 struct gk20a_allocator vma[gmmu_nr_page_sizes];
262
263 /* If necessary, split fixed from non-fixed. */
264 struct gk20a_allocator fixed;
265
262 struct rb_root mapped_buffers; 266 struct rb_root mapped_buffers;
263 267
264 struct list_head reserved_va_list; 268 struct list_head reserved_va_list;
@@ -279,6 +283,7 @@ struct channel_gk20a;
279int gk20a_init_mm_support(struct gk20a *g); 283int gk20a_init_mm_support(struct gk20a *g);
280int gk20a_init_mm_setup_sw(struct gk20a *g); 284int gk20a_init_mm_setup_sw(struct gk20a *g);
281int gk20a_init_mm_setup_hw(struct gk20a *g); 285int gk20a_init_mm_setup_hw(struct gk20a *g);
286void gk20a_mm_debugfs_init(struct platform_device *pdev);
282 287
283int gk20a_mm_fb_flush(struct gk20a *g); 288int gk20a_mm_fb_flush(struct gk20a *g);
284void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate); 289void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate);
@@ -332,9 +337,9 @@ struct mm_gk20a {
332#ifdef CONFIG_DEBUG_FS 337#ifdef CONFIG_DEBUG_FS
333 u32 ltc_enabled; 338 u32 ltc_enabled;
334 u32 ltc_enabled_debug; 339 u32 ltc_enabled_debug;
340#endif
335 u32 bypass_smmu; 341 u32 bypass_smmu;
336 u32 disable_bigpage; 342 u32 disable_bigpage;
337#endif
338}; 343};
339 344
340int gk20a_mm_init(struct mm_gk20a *mm); 345int gk20a_mm_init(struct mm_gk20a *mm);
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
index 84b3fcaf..6bffed9e 100644
--- a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
@@ -59,8 +59,10 @@ struct gk20a_platform {
59 struct clk *clk[3]; 59 struct clk *clk[3];
60 int num_clks; 60 int num_clks;
61 61
62#ifdef CONFIG_RESET_CONTROLLER
62 /* Reset control for device */ 63 /* Reset control for device */
63 struct reset_control *reset_control; 64 struct reset_control *reset_control;
65#endif
64 66
65 /* Delay before rail gated */ 67 /* Delay before rail gated */
66 int railgate_delay; 68 int railgate_delay;
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
index 60ffa381..15d6609d 100644
--- a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * GK20A Tegra Platform Interface 4 * GK20A Tegra Platform Interface
5 * 5 *
6 * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. 6 * Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved.
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify it 8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License, 9 * under the terms and conditions of the GNU General Public License,
@@ -822,7 +822,7 @@ static long gk20a_round_clk_rate(struct platform_device *dev,
822 return gk20a_clk_round_rate(g, rate); 822 return gk20a_clk_round_rate(g, rate);
823} 823}
824 824
825int gk20a_set_clk_rate(struct platform_device *dev, unsigned long rate) 825static int gk20a_set_clk_rate(struct platform_device *dev, unsigned long rate)
826{ 826{
827 struct gk20a_platform *platform = gk20a_get_platform(dev); 827 struct gk20a_platform *platform = gk20a_get_platform(dev);
828 struct gk20a *g = platform->g; 828 struct gk20a *g = platform->g;
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
index 30592ee2..60c87979 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
@@ -4426,7 +4426,7 @@ int gk20a_aelpg_init_and_enable(struct gk20a *g, u8 ctrl_id)
4426 return status; 4426 return status;
4427} 4427}
4428 4428
4429#if CONFIG_DEBUG_FS 4429#ifdef CONFIG_DEBUG_FS
4430static int elpg_residency_show(struct seq_file *s, void *data) 4430static int elpg_residency_show(struct seq_file *s, void *data)
4431{ 4431{
4432 struct gk20a *g = s->private; 4432 struct gk20a *g = s->private;
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
index 4421744c..b41cca08 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.c
@@ -228,6 +228,7 @@ int gk20a_tsg_open(struct gk20a *g, struct file *filp)
228 228
229 tsg->tsg_gr_ctx = NULL; 229 tsg->tsg_gr_ctx = NULL;
230 tsg->vm = NULL; 230 tsg->vm = NULL;
231 tsg->interleave_level = NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW;
231 232
232 filp->private_data = tsg; 233 filp->private_data = tsg;
233 234
diff --git a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
index bcc4d0c4..7e0a75d1 100644
--- a/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/tsg_gk20a.h
@@ -49,6 +49,8 @@ struct tsg_gk20a {
49 struct gr_ctx_desc *tsg_gr_ctx; 49 struct gr_ctx_desc *tsg_gr_ctx;
50 50
51 struct vm_gk20a *vm; 51 struct vm_gk20a *vm;
52
53 u32 interleave_level;
52}; 54};
53 55
54int gk20a_enable_tsg(struct tsg_gk20a *tsg); 56int gk20a_enable_tsg(struct tsg_gk20a *tsg);
diff --git a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
index 9f137246..8a0be106 100644
--- a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c
@@ -362,7 +362,7 @@ int prepare_ucode_blob(struct gk20a *g)
362 gm20b_dbg_pmu("prepare ucode blob return 0\n"); 362 gm20b_dbg_pmu("prepare ucode blob return 0\n");
363 free_acr_resources(g, plsfm); 363 free_acr_resources(g, plsfm);
364 free_sgt: 364 free_sgt:
365 kfree(sgt); 365 gk20a_free_sgtable(&sgt);
366 return err; 366 return err;
367} 367}
368 368
diff --git a/drivers/gpu/nvgpu/gm20b/fifo_gm20b.c b/drivers/gpu/nvgpu/gm20b/fifo_gm20b.c
index d1deffb9..b9763224 100644
--- a/drivers/gpu/nvgpu/gm20b/fifo_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/fifo_gm20b.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GM20B Fifo 2 * GM20B Fifo
3 * 3 *
4 * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -114,6 +114,7 @@ void gm20b_init_fifo(struct gpu_ops *gops)
114 gops->fifo.free_inst = channel_gk20a_free_inst; 114 gops->fifo.free_inst = channel_gk20a_free_inst;
115 gops->fifo.setup_ramfc = channel_gk20a_setup_ramfc; 115 gops->fifo.setup_ramfc = channel_gk20a_setup_ramfc;
116 gops->fifo.channel_set_priority = gk20a_channel_set_priority; 116 gops->fifo.channel_set_priority = gk20a_channel_set_priority;
117 gops->fifo.channel_set_timeslice = gk20a_channel_set_timeslice;
117 118
118 gops->fifo.preempt_channel = gk20a_fifo_preempt_channel; 119 gops->fifo.preempt_channel = gk20a_fifo_preempt_channel;
119 gops->fifo.update_runlist = gk20a_fifo_update_runlist; 120 gops->fifo.update_runlist = gk20a_fifo_update_runlist;
@@ -121,4 +122,5 @@ void gm20b_init_fifo(struct gpu_ops *gops)
121 gops->fifo.wait_engine_idle = gk20a_fifo_wait_engine_idle; 122 gops->fifo.wait_engine_idle = gk20a_fifo_wait_engine_idle;
122 gops->fifo.get_num_fifos = gm20b_fifo_get_num_fifos; 123 gops->fifo.get_num_fifos = gm20b_fifo_get_num_fifos;
123 gops->fifo.get_pbdma_signature = gk20a_fifo_get_pbdma_signature; 124 gops->fifo.get_pbdma_signature = gk20a_fifo_get_pbdma_signature;
125 gops->fifo.set_runlist_interleave = gk20a_fifo_set_runlist_interleave;
124} 126}
diff --git a/drivers/gpu/nvgpu/gm20b/hw_ltc_gm20b.h b/drivers/gpu/nvgpu/gm20b/hw_ltc_gm20b.h
index 95e0c43d..aa01e945 100644
--- a/drivers/gpu/nvgpu/gm20b/hw_ltc_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/hw_ltc_gm20b.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -286,6 +286,10 @@ static inline u32 ltc_ltcs_ltss_intr_en_illegal_compstat_access_m(void)
286{ 286{
287 return 0x1 << 30; 287 return 0x1 << 30;
288} 288}
289static inline u32 ltc_ltcs_ltss_intr_en_illegal_compstat_m(void)
290{
291 return 0x1 << 21;
292}
289static inline u32 ltc_ltc0_lts0_intr_r(void) 293static inline u32 ltc_ltc0_lts0_intr_r(void)
290{ 294{
291 return 0x0014040c; 295 return 0x0014040c;
diff --git a/drivers/gpu/nvgpu/gm20b/hw_timer_gm20b.h b/drivers/gpu/nvgpu/gm20b/hw_timer_gm20b.h
index 126f7c8c..06d02522 100644
--- a/drivers/gpu/nvgpu/gm20b/hw_timer_gm20b.h
+++ b/drivers/gpu/nvgpu/gm20b/hw_timer_gm20b.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -98,4 +98,12 @@ static inline u32 timer_pri_timeout_fecs_errcode_r(void)
98{ 98{
99 return 0x0000908c; 99 return 0x0000908c;
100} 100}
101static inline u32 timer_time_0_r(void)
102{
103 return 0x00009400;
104}
105static inline u32 timer_time_1_r(void)
106{
107 return 0x00009410;
108}
101#endif 109#endif
diff --git a/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c b/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c
index 5b6bff7f..ffc36903 100644
--- a/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/ltc_gm20b.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GM20B L2 2 * GM20B L2
3 * 3 *
4 * Copyright (c) 2014-2015 NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2014-2016 NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -190,6 +190,7 @@ void gm20b_ltc_init_fs_state(struct gk20a *g)
190 reg = gk20a_readl(g, ltc_ltcs_ltss_intr_r()); 190 reg = gk20a_readl(g, ltc_ltcs_ltss_intr_r());
191 reg &= ~ltc_ltcs_ltss_intr_en_evicted_cb_m(); 191 reg &= ~ltc_ltcs_ltss_intr_en_evicted_cb_m();
192 reg &= ~ltc_ltcs_ltss_intr_en_illegal_compstat_access_m(); 192 reg &= ~ltc_ltcs_ltss_intr_en_illegal_compstat_access_m();
193 reg &= ~ltc_ltcs_ltss_intr_en_illegal_compstat_m();
193 gk20a_writel(g, ltc_ltcs_ltss_intr_r(), reg); 194 gk20a_writel(g, ltc_ltcs_ltss_intr_r(), reg);
194} 195}
195 196
diff --git a/drivers/gpu/nvgpu/gm20b/therm_gm20b.c b/drivers/gpu/nvgpu/gm20b/therm_gm20b.c
index 5bd22841..6ebc4c91 100644
--- a/drivers/gpu/nvgpu/gm20b/therm_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/therm_gm20b.c
@@ -15,6 +15,7 @@
15 15
16#include "gk20a/gk20a.h" 16#include "gk20a/gk20a.h"
17#include "hw_therm_gm20b.h" 17#include "hw_therm_gm20b.h"
18#include "therm_gm20b.h"
18 19
19static int gm20b_init_therm_setup_hw(struct gk20a *g) 20static int gm20b_init_therm_setup_hw(struct gk20a *g)
20{ 21{
diff --git a/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.c b/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.c
new file mode 100644
index 00000000..cb955811
--- /dev/null
+++ b/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.c
@@ -0,0 +1,21 @@
1/*
2 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#include <linux/string.h>
15#include "gk20a/gk20a.h"
16#include "fecs_trace_vgpu.h"
17
18void vgpu_init_fecs_trace_ops(struct gpu_ops *ops)
19{
20 memset(&ops->fecs_trace, 0, sizeof(ops->fecs_trace));
21}
diff --git a/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.h b/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.h
new file mode 100644
index 00000000..1aace1fe
--- /dev/null
+++ b/drivers/gpu/nvgpu/vgpu/fecs_trace_vgpu.h
@@ -0,0 +1,20 @@
1/*
2 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13
14#ifndef __FECS_TRACE_VGPU_H
15#define __FECS_TRACE_VGPU_H
16
17struct gpu_ops;
18void vgpu_init_fecs_trace_ops(struct gpu_ops *ops);
19
20#endif /* __FECS_TRACE_VGPU_H */
diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
index e776e97c..9e40218d 100644
--- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Virtualized GPU Fifo 2 * Virtualized GPU Fifo
3 * 3 *
4 * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2014-2016, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -81,6 +81,7 @@ static int vgpu_channel_alloc_inst(struct gk20a *g, struct channel_gk20a *ch)
81 msg.cmd = TEGRA_VGPU_CMD_CHANNEL_ALLOC_HWCTX; 81 msg.cmd = TEGRA_VGPU_CMD_CHANNEL_ALLOC_HWCTX;
82 msg.handle = platform->virt_handle; 82 msg.handle = platform->virt_handle;
83 p->id = ch->hw_chid; 83 p->id = ch->hw_chid;
84 p->pid = (u64)current->pid;
84 err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg)); 85 err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
85 if (err || msg.ret) { 86 if (err || msg.ret) {
86 gk20a_err(dev_from_gk20a(g), "fail"); 87 gk20a_err(dev_from_gk20a(g), "fail");
@@ -194,12 +195,6 @@ static int init_runlist(struct gk20a *g, struct fifo_gk20a *f)
194 if (!runlist->active_channels) 195 if (!runlist->active_channels)
195 goto clean_up_runlist_info; 196 goto clean_up_runlist_info;
196 197
197 runlist->high_prio_channels =
198 kzalloc(DIV_ROUND_UP(f->num_channels, BITS_PER_BYTE),
199 GFP_KERNEL);
200 if (!runlist->high_prio_channels)
201 goto clean_up_runlist_info;
202
203 runlist_size = sizeof(u16) * f->num_channels; 198 runlist_size = sizeof(u16) * f->num_channels;
204 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) { 199 for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
205 int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]); 200 int err = gk20a_gmmu_alloc(g, runlist_size, &runlist->mem[i]);
@@ -222,9 +217,6 @@ clean_up_runlist:
222 gk20a_gmmu_free(g, &runlist->mem[i]); 217 gk20a_gmmu_free(g, &runlist->mem[i]);
223 218
224clean_up_runlist_info: 219clean_up_runlist_info:
225 kfree(runlist->high_prio_channels);
226 runlist->high_prio_channels = NULL;
227
228 kfree(runlist->active_channels); 220 kfree(runlist->active_channels);
229 runlist->active_channels = NULL; 221 runlist->active_channels = NULL;
230 222
@@ -550,6 +542,54 @@ static int vgpu_channel_set_priority(struct channel_gk20a *ch, u32 priority)
550 return err ? err : msg.ret; 542 return err ? err : msg.ret;
551} 543}
552 544
545static int vgpu_fifo_set_runlist_interleave(struct gk20a *g,
546 u32 id,
547 bool is_tsg,
548 u32 runlist_id,
549 u32 new_level)
550{
551 struct gk20a_platform *platform = gk20a_get_platform(g->dev);
552 struct tegra_vgpu_cmd_msg msg;
553 struct tegra_vgpu_channel_runlist_interleave_params *p =
554 &msg.params.channel_interleave;
555 struct channel_gk20a *ch;
556 int err;
557
558 gk20a_dbg_fn("");
559
560 /* FIXME: add support for TSGs */
561 if (is_tsg)
562 return -ENOSYS;
563
564 ch = &g->fifo.channel[id];
565 msg.cmd = TEGRA_VGPU_CMD_CHANNEL_SET_RUNLIST_INTERLEAVE;
566 msg.handle = platform->virt_handle;
567 p->handle = ch->virt_ctx;
568 p->level = new_level;
569 err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
570 WARN_ON(err || msg.ret);
571 return err ? err : msg.ret;
572}
573
574int vgpu_channel_set_timeslice(struct channel_gk20a *ch, u32 timeslice)
575{
576 struct gk20a_platform *platform = gk20a_get_platform(ch->g->dev);
577 struct tegra_vgpu_cmd_msg msg;
578 struct tegra_vgpu_channel_timeslice_params *p =
579 &msg.params.channel_timeslice;
580 int err;
581
582 gk20a_dbg_fn("");
583
584 msg.cmd = TEGRA_VGPU_CMD_CHANNEL_SET_TIMESLICE;
585 msg.handle = platform->virt_handle;
586 p->handle = ch->virt_ctx;
587 p->timeslice_us = timeslice;
588 err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
589 WARN_ON(err || msg.ret);
590 return err ? err : msg.ret;
591}
592
553static void vgpu_fifo_set_ctx_mmu_error(struct gk20a *g, 593static void vgpu_fifo_set_ctx_mmu_error(struct gk20a *g,
554 struct channel_gk20a *ch) 594 struct channel_gk20a *ch)
555{ 595{
@@ -635,5 +675,6 @@ void vgpu_init_fifo_ops(struct gpu_ops *gops)
635 gops->fifo.update_runlist = vgpu_fifo_update_runlist; 675 gops->fifo.update_runlist = vgpu_fifo_update_runlist;
636 gops->fifo.wait_engine_idle = vgpu_fifo_wait_engine_idle; 676 gops->fifo.wait_engine_idle = vgpu_fifo_wait_engine_idle;
637 gops->fifo.channel_set_priority = vgpu_channel_set_priority; 677 gops->fifo.channel_set_priority = vgpu_channel_set_priority;
678 gops->fifo.set_runlist_interleave = vgpu_fifo_set_runlist_interleave;
679 gops->fifo.channel_set_timeslice = vgpu_channel_set_timeslice;
638} 680}
639
diff --git a/drivers/gpu/nvgpu/vgpu/vgpu.c b/drivers/gpu/nvgpu/vgpu/vgpu.c
index e8328326..5a953e20 100644
--- a/drivers/gpu/nvgpu/vgpu/vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/vgpu.c
@@ -18,6 +18,7 @@
18#include <linux/dma-mapping.h> 18#include <linux/dma-mapping.h>
19#include <linux/pm_runtime.h> 19#include <linux/pm_runtime.h>
20#include "vgpu/vgpu.h" 20#include "vgpu/vgpu.h"
21#include "vgpu/fecs_trace_vgpu.h"
21#include "gk20a/debug_gk20a.h" 22#include "gk20a/debug_gk20a.h"
22#include "gk20a/hal_gk20a.h" 23#include "gk20a/hal_gk20a.h"
23#include "gk20a/hw_mc_gk20a.h" 24#include "gk20a/hw_mc_gk20a.h"
@@ -259,6 +260,7 @@ void vgpu_init_hal_common(struct gk20a *g)
259 vgpu_init_ltc_ops(gops); 260 vgpu_init_ltc_ops(gops);
260 vgpu_init_mm_ops(gops); 261 vgpu_init_mm_ops(gops);
261 vgpu_init_debug_ops(gops); 262 vgpu_init_debug_ops(gops);
263 vgpu_init_fecs_trace_ops(gops);
262} 264}
263 265
264static int vgpu_init_hal(struct gk20a *g) 266static int vgpu_init_hal(struct gk20a *g)
diff --git a/include/linux/tegra_vgpu.h b/include/linux/tegra_vgpu.h
index 280ca9c0..c4dd81dd 100644
--- a/include/linux/tegra_vgpu.h
+++ b/include/linux/tegra_vgpu.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Tegra GPU Virtualization Interfaces to Server 2 * Tegra GPU Virtualization Interfaces to Server
3 * 3 *
4 * Copyright (c) 2014-2015, NVIDIA Corporation. All rights reserved. 4 * Copyright (c) 2014-2016, NVIDIA Corporation. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -74,7 +74,9 @@ enum {
74 TEGRA_VGPU_CMD_SET_MMU_DEBUG_MODE, 74 TEGRA_VGPU_CMD_SET_MMU_DEBUG_MODE,
75 TEGRA_VGPU_CMD_SET_SM_DEBUG_MODE, 75 TEGRA_VGPU_CMD_SET_SM_DEBUG_MODE,
76 TEGRA_VGPU_CMD_REG_OPS, 76 TEGRA_VGPU_CMD_REG_OPS,
77 TEGRA_VGPU_CMD_CHANNEL_SET_PRIORITY 77 TEGRA_VGPU_CMD_CHANNEL_SET_PRIORITY,
78 TEGRA_VGPU_CMD_CHANNEL_SET_RUNLIST_INTERLEAVE,
79 TEGRA_VGPU_CMD_CHANNEL_SET_TIMESLICE
78}; 80};
79 81
80struct tegra_vgpu_connect_params { 82struct tegra_vgpu_connect_params {
@@ -84,6 +86,7 @@ struct tegra_vgpu_connect_params {
84 86
85struct tegra_vgpu_channel_hwctx_params { 87struct tegra_vgpu_channel_hwctx_params {
86 u32 id; 88 u32 id;
89 u64 pid;
87 u64 handle; 90 u64 handle;
88}; 91};
89 92
@@ -298,6 +301,17 @@ struct tegra_vgpu_channel_priority_params {
298 u32 priority; 301 u32 priority;
299}; 302};
300 303
304/* level follows nvgpu.h definitions */
305struct tegra_vgpu_channel_runlist_interleave_params {
306 u64 handle;
307 u32 level;
308};
309
310struct tegra_vgpu_channel_timeslice_params {
311 u64 handle;
312 u32 timeslice_us;
313};
314
301struct tegra_vgpu_cmd_msg { 315struct tegra_vgpu_cmd_msg {
302 u32 cmd; 316 u32 cmd;
303 int ret; 317 int ret;
@@ -326,6 +340,8 @@ struct tegra_vgpu_cmd_msg {
326 struct tegra_vgpu_sm_debug_mode sm_debug_mode; 340 struct tegra_vgpu_sm_debug_mode sm_debug_mode;
327 struct tegra_vgpu_reg_ops_params reg_ops; 341 struct tegra_vgpu_reg_ops_params reg_ops;
328 struct tegra_vgpu_channel_priority_params channel_priority; 342 struct tegra_vgpu_channel_priority_params channel_priority;
343 struct tegra_vgpu_channel_runlist_interleave_params channel_interleave;
344 struct tegra_vgpu_channel_timeslice_params channel_timeslice;
329 char padding[192]; 345 char padding[192];
330 } params; 346 } params;
331}; 347};
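
The two new params-union members ride inside the existing 192-byte padding, so sizeof(struct tegra_vgpu_cmd_msg) - and hence the guest/server wire format - is unchanged. A hypothetical compile-time guard (not in the tree) that would catch a member outgrowing the padding:

#include <linux/bug.h>

static inline void tegra_vgpu_msg_size_check(void)
{
	BUILD_BUG_ON(sizeof(struct tegra_vgpu_channel_runlist_interleave_params) >
		     sizeof(((struct tegra_vgpu_cmd_msg *)0)->params.padding));
	BUILD_BUG_ON(sizeof(struct tegra_vgpu_channel_timeslice_params) >
		     sizeof(((struct tegra_vgpu_cmd_msg *)0)->params.padding));
}
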
diff --git a/include/trace/events/gk20a.h b/include/trace/events/gk20a.h
index 461ff6e8..23b5b642 100644
--- a/include/trace/events/gk20a.h
+++ b/include/trace/events/gk20a.h
@@ -387,7 +387,7 @@ TRACE_EVENT(gk20a_as_ioctl_get_va_regions,
387TRACE_EVENT(gk20a_mmu_fault, 387TRACE_EVENT(gk20a_mmu_fault,
388 TP_PROTO(u32 fault_hi, u32 fault_lo, 388 TP_PROTO(u32 fault_hi, u32 fault_lo,
389 u32 fault_info, 389 u32 fault_info,
390 u32 instance, 390 u64 instance,
391 u32 engine_id, 391 u32 engine_id,
392 const char *engine, 392 const char *engine,
393 const char *client, 393 const char *client,
@@ -398,7 +398,7 @@ TRACE_EVENT(gk20a_mmu_fault,
398 __field(u32, fault_hi) 398 __field(u32, fault_hi)
399 __field(u32, fault_lo) 399 __field(u32, fault_lo)
400 __field(u32, fault_info) 400 __field(u32, fault_info)
401 __field(u32, instance) 401 __field(u64, instance)
402 __field(u32, engine_id) 402 __field(u32, engine_id)
403 __field(const char *, engine) 403 __field(const char *, engine)
404 __field(const char *, client) 404 __field(const char *, client)
@@ -414,7 +414,7 @@ TRACE_EVENT(gk20a_mmu_fault,
414 __entry->client = client; 414 __entry->client = client;
415 __entry->fault_type = fault_type; 415 __entry->fault_type = fault_type;
416 ), 416 ),
417 TP_printk("fault=0x%x,%08x info=0x%x instance=0x%x engine_id=%d engine=%s client=%s type=%s", 417 TP_printk("fault=0x%x,%08x info=0x%x instance=0x%llx engine_id=%d engine=%s client=%s type=%s",
418 __entry->fault_hi, __entry->fault_lo, 418 __entry->fault_hi, __entry->fault_lo,
419 __entry->fault_info, __entry->instance, __entry->engine_id, 419 __entry->fault_info, __entry->instance, __entry->engine_id,
420 __entry->engine, __entry->client, __entry->fault_type) 420 __entry->engine, __entry->client, __entry->fault_type)
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index 442a84ac..64ac45b5 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -351,6 +351,28 @@ struct nvgpu_gpu_get_buffer_info_args {
351 }; 351 };
352}; 352};
353 353
354#define NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_MAX_COUNT 16
355#define NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_TSC 1
356#define NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_JIFFIES 2
357#define NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_TIMEOFDAY 3
358
359struct nvgpu_gpu_get_cpu_time_correlation_sample {
360 /* cpu timestamp value */
361 __u64 cpu_timestamp;
362 /* raw GPU counter (PTIMER) value */
363 __u64 gpu_timestamp;
364};
365
366struct nvgpu_gpu_get_cpu_time_correlation_info_args {
367 /* timestamp pairs */
368 struct nvgpu_gpu_get_cpu_time_correlation_sample samples[
369 NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_MAX_COUNT];
370 /* number of pairs to read */
371 __u32 count;
372 /* cpu clock source id */
373 __u32 source_id;
374};
375
354#define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \ 376#define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \
355 _IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args) 377 _IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args)
356#define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \ 378#define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \
@@ -397,11 +419,13 @@ struct nvgpu_gpu_get_buffer_info_args {
397 _IO(NVGPU_GPU_IOCTL_MAGIC, 22) 419 _IO(NVGPU_GPU_IOCTL_MAGIC, 22)
398#define NVGPU_GPU_IOCTL_CLEAR_SM_ERRORS \ 420#define NVGPU_GPU_IOCTL_CLEAR_SM_ERRORS \
399 _IO(NVGPU_GPU_IOCTL_MAGIC, 23) 421 _IO(NVGPU_GPU_IOCTL_MAGIC, 23)
400 422#define NVGPU_GPU_IOCTL_GET_CPU_TIME_CORRELATION_INFO \
423 _IOWR(NVGPU_GPU_IOCTL_MAGIC, 24, \
424 struct nvgpu_gpu_get_cpu_time_correlation_info_args)
401#define NVGPU_GPU_IOCTL_LAST \ 425#define NVGPU_GPU_IOCTL_LAST \
402 _IOC_NR(NVGPU_GPU_IOCTL_CLEAR_SM_ERRORS) 426 _IOC_NR(NVGPU_GPU_IOCTL_GET_CPU_TIME_CORRELATION_INFO)
403#define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \ 427#define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \
404 sizeof(struct nvgpu_gpu_prepare_compressible_read_args) 428 sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args)
405 429
406/* 430/*
407 * /dev/nvhost-tsg-gpu device 431 * /dev/nvhost-tsg-gpu device
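
Userspace reaches the new correlation ioctl through the gpu ctrl node; a minimal sketch (error handling elided; '/dev/nvhost-ctrl-gpu' is the usual ctrl device node, adjust if the platform differs):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>

int sample_correlation(void)
{
	struct nvgpu_gpu_get_cpu_time_correlation_info_args args = {
		.count = 4,
		.source_id = NVGPU_GPU_GET_CPU_TIME_CORRELATION_INFO_SRC_ID_TSC,
	};
	int fd = open("/dev/nvhost-ctrl-gpu", O_RDWR);

	if (fd < 0 || ioctl(fd, NVGPU_GPU_IOCTL_GET_CPU_TIME_CORRELATION_INFO, &args))
		return -1;

	/*
	 * args.samples[0..count-1] now hold paired CPU/GPU timestamps,
	 * enough to fit a linear cpu_time = a * gpu_time + b mapping for
	 * converting GPU-domain timestamps into the CPU time domain.
	 */
	return 0;
}
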
@@ -834,6 +858,34 @@ struct nvgpu_channel_wdt_args {
834#define NVGPU_IOCTL_CHANNEL_DISABLE_WDT 1 858#define NVGPU_IOCTL_CHANNEL_DISABLE_WDT 1
835#define NVGPU_IOCTL_CHANNEL_ENABLE_WDT 2 859#define NVGPU_IOCTL_CHANNEL_ENABLE_WDT 2
836 860
861/*
862 * Interleaving channels in a runlist is an approach to improve
863 * GPU scheduling by allowing certain channels to appear multiple
864 * times on the runlist, governed by its level. Below, L and M
865 * denote how many low- and medium-level channels are currently
866 * on the runlist:
867 *
868 * low (L) : appears once
869 * medium (M): appears L times if any low channels, else once
870 * high (H) : appears (M + 1) x L times if any low channels,
871 * else M times if any medium channels,
872 * else once
873 */
874struct nvgpu_runlist_interleave_args {
875 __u32 level;
876 __u32 reserved;
877};
878#define NVGPU_RUNLIST_INTERLEAVE_LEVEL_LOW 0
879#define NVGPU_RUNLIST_INTERLEAVE_LEVEL_MEDIUM 1
880#define NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH 2
881#define NVGPU_RUNLIST_INTERLEAVE_NUM_LEVELS 3
882
883/* controls how long a channel occupies an engine uninterrupted */
884struct nvgpu_timeslice_args {
885 __u32 timeslice_us;
886 __u32 reserved;
887};
888
837#define NVGPU_IOCTL_CHANNEL_SET_NVMAP_FD \ 889#define NVGPU_IOCTL_CHANNEL_SET_NVMAP_FD \
838 _IOW(NVGPU_IOCTL_MAGIC, 5, struct nvgpu_set_nvmap_fd_args) 890 _IOW(NVGPU_IOCTL_MAGIC, 5, struct nvgpu_set_nvmap_fd_args)
839#define NVGPU_IOCTL_CHANNEL_SET_TIMEOUT \ 891#define NVGPU_IOCTL_CHANNEL_SET_TIMEOUT \
@@ -876,9 +928,13 @@ struct nvgpu_channel_wdt_args {
876 _IOWR(NVGPU_IOCTL_MAGIC, 118, struct nvgpu_cycle_stats_snapshot_args) 928 _IOWR(NVGPU_IOCTL_MAGIC, 118, struct nvgpu_cycle_stats_snapshot_args)
877#define NVGPU_IOCTL_CHANNEL_WDT \ 929#define NVGPU_IOCTL_CHANNEL_WDT \
878 _IOW(NVGPU_IOCTL_MAGIC, 119, struct nvgpu_channel_wdt_args) 930 _IOW(NVGPU_IOCTL_MAGIC, 119, struct nvgpu_channel_wdt_args)
931#define NVGPU_IOCTL_CHANNEL_SET_RUNLIST_INTERLEAVE \
932 _IOW(NVGPU_IOCTL_MAGIC, 120, struct nvgpu_runlist_interleave_args)
933#define NVGPU_IOCTL_CHANNEL_SET_TIMESLICE \
934 _IOW(NVGPU_IOCTL_MAGIC, 121, struct nvgpu_timeslice_args)
879 935
880#define NVGPU_IOCTL_CHANNEL_LAST \ 936#define NVGPU_IOCTL_CHANNEL_LAST \
881 _IOC_NR(NVGPU_IOCTL_CHANNEL_WDT) 937 _IOC_NR(NVGPU_IOCTL_CHANNEL_SET_TIMESLICE)
882#define NVGPU_IOCTL_CHANNEL_MAX_ARG_SIZE sizeof(struct nvgpu_submit_gpfifo_args) 938#define NVGPU_IOCTL_CHANNEL_MAX_ARG_SIZE sizeof(struct nvgpu_submit_gpfifo_args)
883 939
884/* 940/*
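
A short userspace sketch of the two new channel ioctls (ch_fd is an already-open channel fd; error handling elided). Per the interleave comment above, on a runlist holding 2 low and 3 medium channels, a channel raised to HIGH would then appear (3 + 1) x 2 = 8 times:

#include <sys/ioctl.h>
#include <linux/nvgpu.h>

void boost_channel_sketch(int ch_fd)
{
	struct nvgpu_runlist_interleave_args il = {
		.level = NVGPU_RUNLIST_INTERLEAVE_LEVEL_HIGH,
	};
	struct nvgpu_timeslice_args ts = { .timeslice_us = 1000 };

	ioctl(ch_fd, NVGPU_IOCTL_CHANNEL_SET_RUNLIST_INTERLEAVE, &il);
	ioctl(ch_fd, NVGPU_IOCTL_CHANNEL_SET_TIMESLICE, &ts);
}
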
@@ -1159,4 +1215,94 @@ struct nvgpu_as_map_buffer_batch_args {
1159#define NVGPU_AS_IOCTL_MAX_ARG_SIZE \ 1215#define NVGPU_AS_IOCTL_MAX_ARG_SIZE \
1160 sizeof(struct nvgpu_as_map_buffer_ex_args) 1216 sizeof(struct nvgpu_as_map_buffer_ex_args)
1161 1217
1218
1219/*
1220 * /dev/nvhost-ctxsw-gpu device
1221 *
1222 * Opening a '/dev/nvhost-ctxsw-gpu' device node provides an
1223 * interface for tracing context switches on the GR engine.
1224 */
1225
1226#define NVGPU_CTXSW_IOCTL_MAGIC 'C'
1227
1228#define NVGPU_CTXSW_TAG_SOF 0x00
1229#define NVGPU_CTXSW_TAG_CTXSW_REQ_BY_HOST 0x01
1230#define NVGPU_CTXSW_TAG_FE_ACK 0x02
1231#define NVGPU_CTXSW_TAG_FE_ACK_WFI 0x0a
1232#define NVGPU_CTXSW_TAG_FE_ACK_GFXP 0x0b
1233#define NVGPU_CTXSW_TAG_FE_ACK_CTAP 0x0c
1234#define NVGPU_CTXSW_TAG_FE_ACK_CILP 0x0d
1235#define NVGPU_CTXSW_TAG_SAVE_END 0x03
1236#define NVGPU_CTXSW_TAG_RESTORE_START 0x04
1237#define NVGPU_CTXSW_TAG_CONTEXT_START 0x05
1238#define NVGPU_CTXSW_TAG_INVALID_TIMESTAMP 0xff
1239#define NVGPU_CTXSW_TAG_LAST \
1240 NVGPU_CTXSW_TAG_INVALID_TIMESTAMP
1241
1242struct nvgpu_ctxsw_trace_entry {
1243 __u8 tag;
1244 __u8 vmid;
1245 __u16 seqno; /* sequence number to detect drops */
1246 __u32 context_id; /* context_id as allocated by FECS */
1247 __u64 pid; /* 64 bits: wide enough for any OS's pid type */
1248 __u64 timestamp; /* 64-bit time */
1249};
1250
1251#define NVGPU_CTXSW_RING_HEADER_MAGIC 0x7000fade
1252#define NVGPU_CTXSW_RING_HEADER_VERSION 0
1253
1254struct nvgpu_ctxsw_ring_header {
1255 __u32 magic;
1256 __u32 version;
1257 __u32 num_ents;
1258 __u32 ent_size;
1259 volatile __u32 drop_count; /* excluding filtered out events */
1260 volatile __u32 write_seqno;
1261 volatile __u32 write_idx;
1262 volatile __u32 read_idx;
1263};
1264
1265struct nvgpu_ctxsw_ring_setup_args {
1266 __u32 size; /* [in/out] size of ring buffer in bytes (including
1267 header); will be rounded up to the page size, and
1268 updated with the actual allocated size. */
1269};
1270
1271#define NVGPU_CTXSW_FILTER_SIZE (NVGPU_CTXSW_TAG_LAST + 1)
1272#define NVGPU_CTXSW_FILTER_SET(n, p) \
1273	((p)->tag_bits[(n) / 64] |= (1ULL << ((n) & 63)))
1274#define NVGPU_CTXSW_FILTER_CLR(n, p) \
1275	((p)->tag_bits[(n) / 64] &= ~(1ULL << ((n) & 63)))
1276#define NVGPU_CTXSW_FILTER_ISSET(n, p) \
1277	((p)->tag_bits[(n) / 64] & (1ULL << ((n) & 63)))
1278#define NVGPU_CTXSW_FILTER_CLR_ALL(p) memset((void *)(p), 0, sizeof(*(p)))
1279#define NVGPU_CTXSW_FILTER_SET_ALL(p) memset((void *)(p), ~0, sizeof(*(p)))
1280
1281struct nvgpu_ctxsw_trace_filter {
1282 __u64 tag_bits[(NVGPU_CTXSW_FILTER_SIZE + 63) / 64];
1283};
1284
1285struct nvgpu_ctxsw_trace_filter_args {
1286 struct nvgpu_ctxsw_trace_filter filter;
1287};
1288
1289#define NVGPU_CTXSW_IOCTL_TRACE_ENABLE \
1290 _IO(NVGPU_CTXSW_IOCTL_MAGIC, 1)
1291#define NVGPU_CTXSW_IOCTL_TRACE_DISABLE \
1292 _IO(NVGPU_CTXSW_IOCTL_MAGIC, 2)
1293#define NVGPU_CTXSW_IOCTL_RING_SETUP \
1294 _IOWR(NVGPU_CTXSW_IOCTL_MAGIC, 3, struct nvgpu_ctxsw_ring_setup_args)
1295#define NVGPU_CTXSW_IOCTL_SET_FILTER \
1296 _IOW(NVGPU_CTXSW_IOCTL_MAGIC, 4, struct nvgpu_ctxsw_trace_filter_args)
1297#define NVGPU_CTXSW_IOCTL_GET_FILTER \
1298 _IOR(NVGPU_CTXSW_IOCTL_MAGIC, 5, struct nvgpu_ctxsw_trace_filter_args)
1299#define NVGPU_CTXSW_IOCTL_POLL \
1300 _IO(NVGPU_CTXSW_IOCTL_MAGIC, 6)
1301
1302#define NVGPU_CTXSW_IOCTL_LAST \
1303 _IOC_NR(NVGPU_CTXSW_IOCTL_POLL)
1304
1305#define NVGPU_CTXSW_IOCTL_MAX_ARG_SIZE \
1306 sizeof(struct nvgpu_ctxsw_trace_filter_args)
1307
1162#endif 1308#endif
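
Putting the ctxsw uapi together, a consumer sketch (not part of the patch): it assumes the ring - header followed by num_ents entries - is mapped with mmap(2) on the same fd after RING_SETUP, and that ent_size equals sizeof(struct nvgpu_ctxsw_trace_entry); neither is shown in this diff, so treat both as assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/nvgpu.h>

int main(void)
{
	struct nvgpu_ctxsw_ring_setup_args setup = { .size = 64 * 1024 };
	struct nvgpu_ctxsw_ring_header *hdr;
	struct nvgpu_ctxsw_trace_entry *ents;
	int fd = open("/dev/nvhost-ctxsw-gpu", O_RDWR);

	if (fd < 0 || ioctl(fd, NVGPU_CTXSW_IOCTL_RING_SETUP, &setup))
		return 1;

	hdr = mmap(NULL, setup.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (hdr == MAP_FAILED)
		return 1;
	ents = (struct nvgpu_ctxsw_trace_entry *)(hdr + 1);

	ioctl(fd, NVGPU_CTXSW_IOCTL_TRACE_ENABLE, 0);
	for (;;) {
		ioctl(fd, NVGPU_CTXSW_IOCTL_POLL, 0); /* flush pending FECS records */
		while (hdr->read_idx != hdr->write_idx) {
			struct nvgpu_ctxsw_trace_entry *e = &ents[hdr->read_idx];

			printf("tag=0x%02x pid=%llu ts=%llu\n", e->tag,
			       (unsigned long long)e->pid,
			       (unsigned long long)e->timestamp);
			hdr->read_idx = (hdr->read_idx + 1) % hdr->num_ents;
		}
	}
}
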