author     Deepak Nibade <dnibade@nvidia.com>                   2017-10-26 11:29:56 -0400
committer  mobile promotions <svcmobile_promotions@nvidia.com>  2017-11-02 08:09:59 -0400
commit     23c7903eff6ee1ab184dfcc62c054de1557e5b1d
tree       a5122028e181e5c6009f9f8b66bfbf00f69a9290
parent     5f8cfaa250f08499f587da0097f6accaa5eedf15
gpu: nvgpu: move submit path to linux
The nvgpu submit path has many dependencies on the Linux framework, e.g. the use
of copy_from_user, structures defined in the uapi/nvgpu headers, and dma_buf_*
calls for trace support. Hence, to keep the common code independent of Linux
code, move the submit path to the Linux directory.

Move the following APIs to common/linux/channel.c:
  trace_write_pushbuffer()
  trace_write_pushbuffer_range()
  gk20a_submit_prepare_syncs()
  gk20a_submit_append_priv_cmdbuf()
  gk20a_submit_append_gpfifo()
  gk20a_submit_channel_gpfifo()

Move the following API to common/linux/ce2.c:
  gk20a_ce_execute_ops()

Define gk20a_ce_execute_ops() in common/linux/ce2.c and declare it in
gk20a/ce2_gk20a.h since it is needed in common/mm code too. Each OS needs to
implement this API separately.

gk20a_channel_alloc_gpfifo() uses sizeof(struct nvgpu_gpfifo) to get the size of
one gpfifo entry, but the structure nvgpu_gpfifo is Linux-specific. Define a new
nvgpu_get_gpfifo_entry_size() in Linux-specific code and use it in
gk20a_channel_alloc_gpfifo() to get the gpfifo entry size. Each OS needs to
implement this API separately.

Export some APIs from gk20a/ce2_gk20a.h and gk20a/channel_gk20a.h that are
needed in Linux code.

Jira NVGPU-259
Jira NVGPU-313

Change-Id: I360c6cb8ce4494b1e50c66af334a2a379f0d2dc4
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1586277
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
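For reference, a minimal sketch of the per-OS gpfifo entry size hook described
above, assuming a Linux-side definition (the header path and file placement here
are illustrative assumptions, not taken from this patch):

    /*
     * Hypothetical Linux-side implementation, e.g. in common/linux/channel.c:
     * common code calls nvgpu_get_gpfifo_entry_size() instead of
     * sizeof(struct nvgpu_gpfifo), so only Linux code needs to know about
     * the Linux uapi structure.
     */
    #include <uapi/linux/nvgpu.h>	/* struct nvgpu_gpfifo (Linux uapi, assumed path) */

    u32 nvgpu_get_gpfifo_entry_size(void)
    {
    	return sizeof(struct nvgpu_gpfifo);
    }

Other OS ports would supply their own definition returning the size of their
native gpfifo entry type.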
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/channel_gk20a.c')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 666
1 file changed, 12 insertions(+), 654 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
index 00d20357..c938ba6b 100644
--- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -44,45 +44,13 @@
 #include <nvgpu/barrier.h>
 #include <nvgpu/ctxsw_trace.h>
 
-/*
- * This is required for nvgpu_vm_find_buf() which is used in the tracing
- * code. Once we can get and access userspace buffers without requiring
- * direct dma_buf usage this can be removed.
- */
-#include <nvgpu/linux/vm.h>
-
 #include "gk20a.h"
 #include "dbg_gpu_gk20a.h"
 #include "fence_gk20a.h"
 
-#include <nvgpu/hw/gk20a/hw_pbdma_gk20a.h>
-
-/*
- * Note
- * This is added for all the copy_from_user methods in this file which needs to
- * be moved lated to reduce depenedency on Linux
- */
-#include <linux/uaccess.h>
-
-/*
- * Although channels do have pointers back to the gk20a struct that they were
- * created under in cases where the driver is killed that pointer can be bad.
- * The channel memory can be freed before the release() function for a given
- * channel is called. This happens when the driver dies and userspace doesn't
- * get a chance to call release() until after the entire gk20a driver data is
- * unloaded and freed.
- */
-struct channel_priv {
-	struct gk20a *g;
-	struct channel_gk20a *c;
-};
-
 static void free_channel(struct fifo_gk20a *f, struct channel_gk20a *c);
 static void gk20a_channel_dump_ref_actions(struct channel_gk20a *c);
 
-static void free_priv_cmdbuf(struct channel_gk20a *c,
-			     struct priv_cmd_entry *e);
-
 static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c);
 static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c);
 
@@ -97,9 +65,6 @@ static struct channel_gk20a_job *channel_gk20a_joblist_peek(
 
 static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch);
 
-static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
-					bool clean_all);
-
 /* allocate GPU channel */
 static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f)
 {
@@ -1038,7 +1003,7 @@ int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
 
 /* Don't call this to free an explict cmd entry.
  * It doesn't update priv_cmd_queue get/put */
-static void free_priv_cmdbuf(struct channel_gk20a *c,
+void free_priv_cmdbuf(struct channel_gk20a *c,
 		     struct priv_cmd_entry *e)
 {
 	if (channel_gk20a_is_prealloc_enabled(c))
@@ -1047,7 +1012,7 @@ static void free_priv_cmdbuf(struct channel_gk20a *c,
 		nvgpu_kfree(c->g, e);
 }
 
-static int channel_gk20a_alloc_job(struct channel_gk20a *c,
+int channel_gk20a_alloc_job(struct channel_gk20a *c,
 		struct channel_gk20a_job **job_out)
 {
 	int err = 0;
@@ -1080,7 +1045,7 @@ static int channel_gk20a_alloc_job(struct channel_gk20a *c,
 	return err;
 }
 
-static void channel_gk20a_free_job(struct channel_gk20a *c,
+void channel_gk20a_free_job(struct channel_gk20a *c,
 		struct channel_gk20a_job *job)
 {
 	/*
@@ -1267,11 +1232,12 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
 {
 	struct gk20a *g = c->g;
 	struct vm_gk20a *ch_vm;
-	u32 gpfifo_size;
+	u32 gpfifo_size, gpfifo_entry_size;
 	int err = 0;
 	unsigned long acquire_timeout;
 
 	gpfifo_size = num_entries;
+	gpfifo_entry_size = nvgpu_get_gpfifo_entry_size();
 
 	if (flags & NVGPU_ALLOC_GPFIFO_EX_FLAGS_VPR_ENABLED)
 		c->vpr = true;
@@ -1315,7 +1281,7 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
 	}
 
 	err = nvgpu_dma_alloc_map_sys(ch_vm,
-			gpfifo_size * sizeof(struct nvgpu_gpfifo),
+			gpfifo_size * gpfifo_entry_size,
 			&c->gpfifo.mem);
 	if (err) {
 		nvgpu_err(g, "%s: memory allocation failed", __func__);
@@ -1324,7 +1290,7 @@ int gk20a_channel_alloc_gpfifo(struct channel_gk20a *c,
 
 	if (c->gpfifo.mem.aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
 		c->gpfifo.pipe = nvgpu_big_malloc(g,
-				gpfifo_size * sizeof(struct nvgpu_gpfifo));
+				gpfifo_size * gpfifo_entry_size);
 		if (!c->gpfifo.pipe) {
 			err = -ENOMEM;
 			goto clean_up_unmap;
@@ -1427,7 +1393,7 @@ static inline u32 update_gp_get(struct gk20a *g,
 	return new_get;
 }
 
-static inline u32 gp_free_count(struct channel_gk20a *c)
+u32 nvgpu_gp_free_count(struct channel_gk20a *c)
 {
 	return (c->gpfifo.entry_num - (c->gpfifo.put - c->gpfifo.get) - 1) %
 		c->gpfifo.entry_num;
@@ -1460,91 +1426,10 @@ static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch)
 	return ch->g->ch_wdt_timeout_ms;
 }
 
-static u32 get_gp_free_count(struct channel_gk20a *c)
+u32 nvgpu_get_gp_free_count(struct channel_gk20a *c)
 {
 	update_gp_get(c->g, c);
-	return gp_free_count(c);
+	return nvgpu_gp_free_count(c);
-}
-
-#ifdef CONFIG_DEBUG_FS
-static void trace_write_pushbuffer(struct channel_gk20a *c,
-				   struct nvgpu_gpfifo *g)
-{
-	void *mem = NULL;
-	unsigned int words;
-	u64 offset;
-	struct dma_buf *dmabuf = NULL;
-
-	if (gk20a_debug_trace_cmdbuf) {
-		u64 gpu_va = (u64)g->entry0 |
-			(u64)((u64)pbdma_gp_entry1_get_hi_v(g->entry1) << 32);
-		int err;
-
-		words = pbdma_gp_entry1_length_v(g->entry1);
-		err = nvgpu_vm_find_buf(c->vm, gpu_va, &dmabuf, &offset);
-		if (!err)
-			mem = dma_buf_vmap(dmabuf);
-	}
-
-	if (mem) {
-		u32 i;
-		/*
-		 * Write in batches of 128 as there seems to be a limit
-		 * of how much you can output to ftrace at once.
-		 */
-		for (i = 0; i < words; i += 128U) {
-			trace_gk20a_push_cmdbuf(
-				c->g->name,
-				0,
-				min(words - i, 128U),
-				offset + i * sizeof(u32),
-				mem);
-		}
-		dma_buf_vunmap(dmabuf, mem);
-	}
-}
-#endif
-
-static void trace_write_pushbuffer_range(struct channel_gk20a *c,
-					 struct nvgpu_gpfifo *g,
-					 struct nvgpu_gpfifo __user *user_gpfifo,
-					 int offset,
-					 int count)
-{
-#ifdef CONFIG_DEBUG_FS
-	u32 size;
-	int i;
-	struct nvgpu_gpfifo *gp;
-	bool gpfifo_allocated = false;
-
-	if (!gk20a_debug_trace_cmdbuf)
-		return;
-
-	if (!g && !user_gpfifo)
-		return;
-
-	if (!g) {
-		size = count * sizeof(struct nvgpu_gpfifo);
-		if (size) {
-			g = nvgpu_big_malloc(c->g, size);
-			if (!g)
-				return;
-
-			if (copy_from_user(g, user_gpfifo, size)) {
-				nvgpu_big_free(c->g, g);
-				return;
-			}
-		}
-		gpfifo_allocated = true;
-	}
-
-	gp = g + offset;
-	for (i = 0; i < count; i++, gp++)
-		trace_write_pushbuffer(c, gp);
-
-	if (gpfifo_allocated)
-		nvgpu_big_free(c->g, g);
-#endif
 }
 
 static void __gk20a_channel_timeout_start(struct channel_gk20a *ch)
@@ -2032,7 +1917,7 @@ int gk20a_free_priv_cmdbuf(struct channel_gk20a *c, struct priv_cmd_entry *e)
 	return 0;
 }
 
-static int gk20a_channel_add_job(struct channel_gk20a *c,
+int gk20a_channel_add_job(struct channel_gk20a *c,
 				 struct channel_gk20a_job *job,
 				 bool skip_buffer_refcounting)
 {
@@ -2097,7 +1982,7 @@ err_put_buffers:
  * per-job memory for completed jobs; in case of preallocated resources, this
  * opens up slots for new jobs to be submitted.
  */
-static void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
+void gk20a_channel_clean_up_jobs(struct channel_gk20a *c,
 					bool clean_all)
 {
 	struct vm_gk20a *vm;
@@ -2257,533 +2142,6 @@ void gk20a_channel_update(struct channel_gk20a *c)
 	gk20a_channel_worker_enqueue(c);
 }
 
-static void gk20a_submit_append_priv_cmdbuf(struct channel_gk20a *c,
-		struct priv_cmd_entry *cmd)
-{
-	struct gk20a *g = c->g;
-	struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
-	struct nvgpu_gpfifo x = {
-		.entry0 = u64_lo32(cmd->gva),
-		.entry1 = u64_hi32(cmd->gva) |
-			pbdma_gp_entry1_length_f(cmd->size)
-	};
-
-	nvgpu_mem_wr_n(g, gpfifo_mem, c->gpfifo.put * sizeof(x),
-			&x, sizeof(x));
-
-	if (cmd->mem->aperture == APERTURE_SYSMEM)
-		trace_gk20a_push_cmdbuf(g->name, 0, cmd->size, 0,
-				cmd->mem->cpu_va + cmd->off * sizeof(u32));
-
-	c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
-}
-
-/*
- * Copy source gpfifo entries into the gpfifo ring buffer, potentially
- * splitting into two memcpys to handle wrap-around.
- */
-static int gk20a_submit_append_gpfifo(struct channel_gk20a *c,
-		struct nvgpu_gpfifo *kern_gpfifo,
-		struct nvgpu_gpfifo __user *user_gpfifo,
-		u32 num_entries)
-{
-	/* byte offsets */
-	u32 gpfifo_size = c->gpfifo.entry_num * sizeof(struct nvgpu_gpfifo);
-	u32 len = num_entries * sizeof(struct nvgpu_gpfifo);
-	u32 start = c->gpfifo.put * sizeof(struct nvgpu_gpfifo);
-	u32 end = start + len; /* exclusive */
-	struct nvgpu_mem *gpfifo_mem = &c->gpfifo.mem;
-	struct nvgpu_gpfifo *cpu_src;
-	int err;
-
-	if (user_gpfifo && !c->gpfifo.pipe) {
-		/*
-		 * This path (from userspace to sysmem) is special in order to
-		 * avoid two copies unnecessarily (from user to pipe, then from
-		 * pipe to gpu sysmem buffer).
-		 *
-		 * As a special case, the pipe buffer exists if PRAMIN writes
-		 * are forced, although the buffers may not be in vidmem in
-		 * that case.
-		 */
-		if (end > gpfifo_size) {
-			/* wrap-around */
-			int length0 = gpfifo_size - start;
-			int length1 = len - length0;
-			void __user *user2 = (u8 __user *)user_gpfifo + length0;
-
-			err = copy_from_user(gpfifo_mem->cpu_va + start,
-					user_gpfifo, length0);
-			if (err)
-				return err;
-
-			err = copy_from_user(gpfifo_mem->cpu_va,
-					user2, length1);
-			if (err)
-				return err;
-		} else {
-			err = copy_from_user(gpfifo_mem->cpu_va + start,
-					user_gpfifo, len);
-			if (err)
-				return err;
-		}
-
-		trace_write_pushbuffer_range(c, NULL, user_gpfifo,
-				0, num_entries);
-		goto out;
-	} else if (user_gpfifo) {
-		/* from userspace to vidmem or sysmem when pramin forced, use
-		 * the common copy path below */
-		err = copy_from_user(c->gpfifo.pipe, user_gpfifo, len);
-		if (err)
-			return err;
-
-		cpu_src = c->gpfifo.pipe;
-	} else {
-		/* from kernel to either sysmem or vidmem, don't need
-		 * copy_from_user so use the common path below */
-		cpu_src = kern_gpfifo;
-	}
-
-	if (end > gpfifo_size) {
-		/* wrap-around */
-		int length0 = gpfifo_size - start;
-		int length1 = len - length0;
-		void *src2 = (u8 *)cpu_src + length0;
-
-		nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, length0);
-		nvgpu_mem_wr_n(c->g, gpfifo_mem, 0, src2, length1);
-	} else {
-		nvgpu_mem_wr_n(c->g, gpfifo_mem, start, cpu_src, len);
-
-	}
-
-	trace_write_pushbuffer_range(c, cpu_src, NULL, 0, num_entries);
-
-out:
-	c->gpfifo.put = (c->gpfifo.put + num_entries) &
-		(c->gpfifo.entry_num - 1);
-
-	return 0;
-}
-
-/*
- * Handle the submit synchronization - pre-fences and post-fences.
- */
-static int gk20a_submit_prepare_syncs(struct channel_gk20a *c,
-				      struct nvgpu_fence *fence,
-				      struct channel_gk20a_job *job,
-				      struct priv_cmd_entry **wait_cmd,
-				      struct priv_cmd_entry **incr_cmd,
-				      struct gk20a_fence **pre_fence,
-				      struct gk20a_fence **post_fence,
-				      bool force_need_sync_fence,
-				      bool register_irq,
-				      u32 flags)
-{
-	struct gk20a *g = c->g;
-	bool need_sync_fence = false;
-	bool new_sync_created = false;
-	int wait_fence_fd = -1;
-	int err = 0;
-	bool need_wfi = !(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SUPPRESS_WFI);
-	bool pre_alloc_enabled = channel_gk20a_is_prealloc_enabled(c);
-
-	/*
-	 * If user wants to always allocate sync_fence_fds then respect that;
-	 * otherwise, allocate sync_fence_fd based on user flags.
-	 */
-	if (force_need_sync_fence)
-		need_sync_fence = true;
-
-	if (g->aggressive_sync_destroy_thresh) {
-		nvgpu_mutex_acquire(&c->sync_lock);
-		if (!c->sync) {
-			c->sync = gk20a_channel_sync_create(c);
-			if (!c->sync) {
-				err = -ENOMEM;
-				nvgpu_mutex_release(&c->sync_lock);
-				goto fail;
-			}
-			new_sync_created = true;
-		}
-		nvgpu_atomic_inc(&c->sync->refcount);
-		nvgpu_mutex_release(&c->sync_lock);
-	}
-
-	if (g->ops.fifo.resetup_ramfc && new_sync_created) {
-		err = g->ops.fifo.resetup_ramfc(c);
-		if (err)
-			goto fail;
-	}
-
-	/*
-	 * Optionally insert syncpt wait in the beginning of gpfifo submission
-	 * when user requested and the wait hasn't expired. Validate that the id
-	 * makes sense, elide if not. The only reason this isn't being
-	 * unceremoniously killed is to keep running some tests which trigger
-	 * this condition.
-	 */
-	if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) {
-		job->pre_fence = gk20a_alloc_fence(c);
-		if (!job->pre_fence) {
-			err = -ENOMEM;
-			goto fail;
-		}
-
-		if (!pre_alloc_enabled)
-			job->wait_cmd = nvgpu_kzalloc(g,
-				sizeof(struct priv_cmd_entry));
-
-		if (!job->wait_cmd) {
-			err = -ENOMEM;
-			goto clean_up_pre_fence;
-		}
-
-		if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE) {
-			wait_fence_fd = fence->id;
-			err = c->sync->wait_fd(c->sync, wait_fence_fd,
-					job->wait_cmd, job->pre_fence);
-		} else {
-			err = c->sync->wait_syncpt(c->sync, fence->id,
-					fence->value, job->wait_cmd,
-					job->pre_fence);
-		}
-
-		if (!err) {
-			if (job->wait_cmd->valid)
-				*wait_cmd = job->wait_cmd;
-			*pre_fence = job->pre_fence;
-		} else
-			goto clean_up_wait_cmd;
-	}
-
-	if ((flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) &&
-	    (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE))
-		need_sync_fence = true;
-
-	/*
-	 * Always generate an increment at the end of a GPFIFO submission. This
-	 * is used to keep track of method completion for idle railgating. The
-	 * sync_pt/semaphore PB is added to the GPFIFO later on in submit.
-	 */
-	job->post_fence = gk20a_alloc_fence(c);
-	if (!job->post_fence) {
-		err = -ENOMEM;
-		goto clean_up_wait_cmd;
-	}
-	if (!pre_alloc_enabled)
-		job->incr_cmd = nvgpu_kzalloc(g, sizeof(struct priv_cmd_entry));
-
-	if (!job->incr_cmd) {
-		err = -ENOMEM;
-		goto clean_up_post_fence;
-	}
-
-	if (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)
-		err = c->sync->incr_user(c->sync, wait_fence_fd, job->incr_cmd,
-				 job->post_fence, need_wfi, need_sync_fence,
-				 register_irq);
-	else
-		err = c->sync->incr(c->sync, job->incr_cmd,
-				    job->post_fence, need_sync_fence,
-				    register_irq);
-	if (!err) {
-		*incr_cmd = job->incr_cmd;
-		*post_fence = job->post_fence;
-	} else
-		goto clean_up_incr_cmd;
-
-	return 0;
-
-clean_up_incr_cmd:
-	free_priv_cmdbuf(c, job->incr_cmd);
-	if (!pre_alloc_enabled)
-		job->incr_cmd = NULL;
-clean_up_post_fence:
-	gk20a_fence_put(job->post_fence);
-	job->post_fence = NULL;
-clean_up_wait_cmd:
-	free_priv_cmdbuf(c, job->wait_cmd);
-	if (!pre_alloc_enabled)
-		job->wait_cmd = NULL;
-clean_up_pre_fence:
-	gk20a_fence_put(job->pre_fence);
-	job->pre_fence = NULL;
-fail:
-	*wait_cmd = NULL;
-	*pre_fence = NULL;
-	return err;
-}
-
-int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
-				struct nvgpu_gpfifo *gpfifo,
-				struct nvgpu_submit_gpfifo_args *args,
-				u32 num_entries,
-				u32 flags,
-				struct nvgpu_fence *fence,
-				struct gk20a_fence **fence_out,
-				bool force_need_sync_fence,
-				struct fifo_profile_gk20a *profile)
-{
-	struct gk20a *g = c->g;
-	struct priv_cmd_entry *wait_cmd = NULL;
-	struct priv_cmd_entry *incr_cmd = NULL;
-	struct gk20a_fence *pre_fence = NULL;
-	struct gk20a_fence *post_fence = NULL;
-	struct channel_gk20a_job *job = NULL;
-	/* we might need two extra gpfifo entries - one for pre fence
-	 * and one for post fence. */
-	const int extra_entries = 2;
-	bool skip_buffer_refcounting = (flags &
-			NVGPU_SUBMIT_GPFIFO_FLAGS_SKIP_BUFFER_REFCOUNTING);
-	int err = 0;
-	bool need_job_tracking;
-	bool need_deferred_cleanup = false;
-	struct nvgpu_gpfifo __user *user_gpfifo = args ?
-		(struct nvgpu_gpfifo __user *)(uintptr_t)args->gpfifo : NULL;
-
-	if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING))
-		return -ENODEV;
-
-	if (c->has_timedout)
-		return -ETIMEDOUT;
-
-	if (!nvgpu_mem_is_valid(&c->gpfifo.mem))
-		return -ENOMEM;
-
-	/* fifo not large enough for request. Return error immediately.
-	 * Kernel can insert gpfifo entries before and after user gpfifos.
-	 * So, add extra_entries in user request. Also, HW with fifo size N
-	 * can accept only N-1 entreis and so the below condition */
-	if (c->gpfifo.entry_num - 1 < num_entries + extra_entries) {
-		nvgpu_err(g, "not enough gpfifo space allocated");
-		return -ENOMEM;
-	}
-
-	if (!gpfifo && !args)
-		return -EINVAL;
-
-	if ((flags & (NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT |
-		      NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET)) &&
-	    !fence)
-		return -EINVAL;
-
-	/* an address space needs to have been bound at this point. */
-	if (!gk20a_channel_as_bound(c)) {
-		nvgpu_err(g,
-			    "not bound to an address space at time of gpfifo"
-			    " submission.");
-		return -EINVAL;
-	}
-
-	if (profile)
-		profile->timestamp[PROFILE_ENTRY] = sched_clock();
-
-	/* update debug settings */
-	nvgpu_ltc_sync_enabled(g);
-
-	gk20a_dbg_info("channel %d", c->chid);
-
-	/*
-	 * Job tracking is necessary for any of the following conditions:
-	 *  - pre- or post-fence functionality
-	 *  - channel wdt
-	 *  - GPU rail-gating with non-deterministic channels
-	 *  - buffer refcounting
-	 *
-	 * If none of the conditions are met, then job tracking is not
-	 * required and a fast submit can be done (ie. only need to write
-	 * out userspace GPFIFO entries and update GP_PUT).
-	 */
-	need_job_tracking = (flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) ||
-			(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET) ||
-			c->wdt_enabled ||
-			(g->can_railgate && !c->deterministic) ||
-			!skip_buffer_refcounting;
-
-	if (need_job_tracking) {
-		bool need_sync_framework = false;
-
-		/*
-		 * If the channel is to have deterministic latency and
-		 * job tracking is required, the channel must have
-		 * pre-allocated resources. Otherwise, we fail the submit here
-		 */
-		if (c->deterministic && !channel_gk20a_is_prealloc_enabled(c))
-			return -EINVAL;
-
-		need_sync_framework = force_need_sync_fence ||
-			gk20a_channel_sync_needs_sync_framework(g) ||
-			(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE &&
-			(flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT ||
-			 flags & NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET));
-
-		/*
-		 * Deferred clean-up is necessary for any of the following
-		 * conditions:
-		 * - channel's deterministic flag is not set
-		 * - dependency on sync framework, which could make the
-		 *   behavior of the clean-up operation non-deterministic
-		 *   (should not be performed in the submit path)
-		 * - channel wdt
-		 * - GPU rail-gating with non-deterministic channels
-		 * - buffer refcounting
-		 *
-		 * If none of the conditions are met, then deferred clean-up
-		 * is not required, and we clean-up one job-tracking
-		 * resource in the submit path.
-		 */
-		need_deferred_cleanup = !c->deterministic ||
-					need_sync_framework ||
-					c->wdt_enabled ||
-					(g->can_railgate &&
-					!c->deterministic) ||
-					!skip_buffer_refcounting;
-
-		/*
-		 * For deterministic channels, we don't allow deferred clean_up
-		 * processing to occur. In cases we hit this, we fail the submit
-		 */
-		if (c->deterministic && need_deferred_cleanup)
-			return -EINVAL;
-
-		if (!c->deterministic) {
-			/*
-			 * Get a power ref unless this is a deterministic
-			 * channel that holds them during the channel lifetime.
-			 * This one is released by gk20a_channel_clean_up_jobs,
-			 * via syncpt or sema interrupt, whichever is used.
-			 */
-			err = gk20a_busy(g);
-			if (err) {
-				nvgpu_err(g,
-					"failed to host gk20a to submit gpfifo, process %s",
-					current->comm);
-				return err;
-			}
-		}
-
-		if (!need_deferred_cleanup) {
-			/* clean up a single job */
-			gk20a_channel_clean_up_jobs(c, false);
-		}
-	}
-
-
-	/* Grab access to HW to deal with do_idle */
-	if (c->deterministic)
-		nvgpu_rwsem_down_read(&g->deterministic_busy);
-
-	trace_gk20a_channel_submit_gpfifo(g->name,
-					  c->chid,
-					  num_entries,
-					  flags,
-					  fence ? fence->id : 0,
-					  fence ? fence->value : 0);
-
-	gk20a_dbg_info("pre-submit put %d, get %d, size %d",
-		c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
-
-	/*
-	 * Make sure we have enough space for gpfifo entries. Check cached
-	 * values first and then read from HW. If no space, return EAGAIN
-	 * and let userpace decide to re-try request or not.
-	 */
-	if (gp_free_count(c) < num_entries + extra_entries) {
-		if (get_gp_free_count(c) < num_entries + extra_entries) {
-			err = -EAGAIN;
-			goto clean_up;
-		}
-	}
-
-	if (c->has_timedout) {
-		err = -ETIMEDOUT;
-		goto clean_up;
-	}
-
-	if (need_job_tracking) {
-		err = channel_gk20a_alloc_job(c, &job);
-		if (err)
-			goto clean_up;
-
-		err = gk20a_submit_prepare_syncs(c, fence, job,
-						 &wait_cmd, &incr_cmd,
-						 &pre_fence, &post_fence,
-						 force_need_sync_fence,
-						 need_deferred_cleanup,
-						 flags);
-		if (err)
-			goto clean_up_job;
-	}
-
-	if (profile)
-		profile->timestamp[PROFILE_JOB_TRACKING] = sched_clock();
-
-	if (wait_cmd)
-		gk20a_submit_append_priv_cmdbuf(c, wait_cmd);
-
-	if (gpfifo || user_gpfifo)
-		err = gk20a_submit_append_gpfifo(c, gpfifo, user_gpfifo,
-				num_entries);
-	if (err)
-		goto clean_up_job;
-
-	/*
-	 * And here's where we add the incr_cmd we generated earlier. It should
-	 * always run!
-	 */
-	if (incr_cmd)
-		gk20a_submit_append_priv_cmdbuf(c, incr_cmd);
-
-	if (fence_out)
-		*fence_out = gk20a_fence_get(post_fence);
-
-	if (need_job_tracking)
-		/* TODO! Check for errors... */
-		gk20a_channel_add_job(c, job, skip_buffer_refcounting);
-	if (profile)
-		profile->timestamp[PROFILE_APPEND] = sched_clock();
-
-	g->ops.fifo.userd_gp_put(g, c);
-
-	if ((NVGPU_SUBMIT_GPFIFO_FLAGS_RESCHEDULE_RUNLIST & flags) &&
-	    g->ops.fifo.reschedule_runlist)
-		g->ops.fifo.reschedule_runlist(g, c->runlist_id);
-
-	/* No hw access beyond this point */
-	if (c->deterministic)
-		nvgpu_rwsem_up_read(&g->deterministic_busy);
-
-	trace_gk20a_channel_submitted_gpfifo(g->name,
-				c->chid,
-				num_entries,
-				flags,
-				post_fence ? post_fence->syncpt_id : 0,
-				post_fence ? post_fence->syncpt_value : 0);
-
-	gk20a_dbg_info("post-submit put %d, get %d, size %d",
-		c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
-
-	if (profile)
-		profile->timestamp[PROFILE_END] = sched_clock();
-	gk20a_dbg_fn("done");
-	return err;
-
-clean_up_job:
-	channel_gk20a_free_job(c, job);
-clean_up:
-	gk20a_dbg_fn("fail");
-	gk20a_fence_put(pre_fence);
-	gk20a_fence_put(post_fence);
-	if (c->deterministic)
-		nvgpu_rwsem_up_read(&g->deterministic_busy);
-	else if (need_deferred_cleanup)
-		gk20a_idle(g);
-
-	return err;
-}
-
 /*
  * Stop deterministic channel activity for do_idle() when power needs to go off
  * momentarily but deterministic channels keep power refs for potentially a