diff options
Diffstat (limited to 'drivers/gpu')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/cde_gk20a.c | 300 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/cde_gk20a.h | 13 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gk20a.h | 1 |
3 files changed, 237 insertions, 77 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c index ee62f02a..9067aae5 100644 --- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c | |||
@@ -34,7 +34,10 @@ | |||
34 | #include "hw_ccsr_gk20a.h" | 34 | #include "hw_ccsr_gk20a.h" |
35 | #include "hw_pbdma_gk20a.h" | 35 | #include "hw_pbdma_gk20a.h" |
36 | 36 | ||
37 | static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx, bool free_after_use); | 37 | static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx); |
38 | static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g); | ||
39 | |||
40 | #define CTX_DELETE_TIME 1000 | ||
38 | 41 | ||
39 | static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx) | 42 | static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx) |
40 | { | 43 | { |
@@ -67,7 +70,7 @@ static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx) | |||
67 | cde_ctx->init_cmd_executed = false; | 70 | cde_ctx->init_cmd_executed = false; |
68 | } | 71 | } |
69 | 72 | ||
70 | static int gk20a_cde_remove(struct gk20a_cde_ctx *cde_ctx) | 73 | static void gk20a_cde_remove_ctx(struct gk20a_cde_ctx *cde_ctx) |
71 | { | 74 | { |
72 | struct gk20a *g = cde_ctx->g; | 75 | struct gk20a *g = cde_ctx->g; |
73 | struct channel_gk20a *ch = cde_ctx->ch; | 76 | struct channel_gk20a *ch = cde_ctx->ch; |
@@ -81,23 +84,90 @@ static int gk20a_cde_remove(struct gk20a_cde_ctx *cde_ctx) | |||
81 | gk20a_gmmu_unmap(vm, cde_ctx->backing_store_vaddr, | 84 | gk20a_gmmu_unmap(vm, cde_ctx->backing_store_vaddr, |
82 | g->gr.compbit_store.size, 1); | 85 | g->gr.compbit_store.size, 1); |
83 | 86 | ||
84 | return 0; | 87 | /* housekeeping on app */ |
88 | list_del(&cde_ctx->list); | ||
89 | cde_ctx->g->cde_app.lru_len--; | ||
90 | kfree(cde_ctx); | ||
91 | } | ||
92 | |||
93 | static void gk20a_cde_prepare_ctx_remove(struct gk20a_cde_ctx *cde_ctx) | ||
94 | { | ||
95 | struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app; | ||
96 | |||
97 | /* permanent contexts do not have deleter work items */ | ||
98 | if (!cde_ctx->is_temporary) | ||
99 | return; | ||
100 | |||
101 | /* safe to drop the mutex since the app is deinitialised; deleter work | ||
102 | * items can only be waiting for the mutex or, earlier, about to abort */ | ||
103 | mutex_unlock(&cde_app->mutex); | ||
104 | |||
105 | /* the deleter can rearm itself */ | ||
106 | do { | ||
107 | cancel_delayed_work_sync(&cde_ctx->ctx_deleter_work); | ||
108 | } while (delayed_work_pending(&cde_ctx->ctx_deleter_work)); | ||
109 | |||
110 | mutex_lock(&cde_app->mutex); | ||
85 | } | 111 | } |
86 | 112 | ||
87 | int gk20a_cde_destroy(struct gk20a *g) | 113 | static void gk20a_cde_deallocate_contexts(struct gk20a *g) |
88 | { | 114 | { |
89 | struct gk20a_cde_app *cde_app = &g->cde_app; | 115 | struct gk20a_cde_app *cde_app = &g->cde_app; |
90 | struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx; | 116 | struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save; |
91 | int ret, i; | ||
92 | 117 | ||
93 | if (!cde_app->initialised) | 118 | list_for_each_entry_safe(cde_ctx, cde_ctx_save, |
94 | return 0; | 119 | &cde_app->cde_ctx_lru, list) { |
120 | gk20a_cde_prepare_ctx_remove(cde_ctx); | ||
121 | gk20a_cde_remove_ctx(cde_ctx); | ||
122 | } | ||
123 | } | ||
95 | 124 | ||
96 | for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) | 125 | void gk20a_cde_stop(struct gk20a *g) |
97 | ret = gk20a_cde_remove(cde_ctx); | 126 | { |
127 | struct gk20a_cde_app *cde_app = &g->cde_app; | ||
98 | 128 | ||
129 | /* prevent further conversions and delayed works from working */ | ||
99 | cde_app->initialised = false; | 130 | cde_app->initialised = false; |
100 | return ret; | 131 | /* free all data, empty the list */ |
132 | gk20a_cde_deallocate_contexts(g); | ||
133 | } | ||
134 | |||
135 | void gk20a_cde_destroy(struct gk20a *g) | ||
136 | { | ||
137 | struct gk20a_cde_app *cde_app = &g->cde_app; | ||
138 | |||
139 | if (!cde_app->initialised) | ||
140 | return; | ||
141 | |||
142 | mutex_lock(&cde_app->mutex); | ||
143 | gk20a_cde_stop(g); | ||
144 | mutex_unlock(&cde_app->mutex); | ||
145 | } | ||
146 | |||
147 | static int gk20a_cde_allocate_contexts(struct gk20a *g) | ||
148 | { | ||
149 | struct gk20a_cde_app *cde_app = &g->cde_app; | ||
150 | struct gk20a_cde_ctx *cde_ctx; | ||
151 | int err = 0; | ||
152 | int i; | ||
153 | |||
154 | for (i = 0; i < NUM_CDE_CONTEXTS; i++) { | ||
155 | cde_ctx = gk20a_cde_allocate_context(g); | ||
156 | if (IS_ERR(cde_ctx)) { | ||
157 | err = PTR_ERR(cde_ctx); | ||
158 | goto out; | ||
159 | } | ||
160 | |||
161 | list_add(&cde_ctx->list, &cde_app->cde_ctx_lru); | ||
162 | cde_app->lru_len++; | ||
163 | if (cde_app->lru_len > cde_app->lru_max_len) | ||
164 | cde_app->lru_max_len = cde_app->lru_len; | ||
165 | } | ||
166 | |||
167 | return 0; | ||
168 | out: | ||
169 | gk20a_cde_deallocate_contexts(g); | ||
170 | return err; | ||
101 | } | 171 | } |
102 | 172 | ||
103 | static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx, | 173 | static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx, |
@@ -591,29 +661,117 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx, | |||
591 | num_entries, flags, fence, fence_out); | 661 | num_entries, flags, fence, fence_out); |
592 | } | 662 | } |
593 | 663 | ||
664 | static void gk20a_ctx_release(struct gk20a_cde_ctx *cde_ctx) | ||
665 | { | ||
666 | struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app; | ||
667 | |||
668 | gk20a_dbg(gpu_dbg_cde_ctx, "releasing use on %p", cde_ctx); | ||
669 | |||
670 | mutex_lock(&cde_app->mutex); | ||
671 | |||
672 | cde_ctx->in_use = false; | ||
673 | list_move(&cde_ctx->list, &cde_app->cde_ctx_lru); | ||
674 | cde_app->lru_used--; | ||
675 | |||
676 | mutex_unlock(&cde_app->mutex); | ||
677 | } | ||
678 | |||
679 | static void gk20a_cde_ctx_deleter_fn(struct work_struct *work) | ||
680 | { | ||
681 | struct delayed_work *delay_work = to_delayed_work(work); | ||
682 | struct gk20a_cde_ctx *cde_ctx = container_of(delay_work, | ||
683 | struct gk20a_cde_ctx, ctx_deleter_work); | ||
684 | struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app; | ||
685 | struct platform_device *pdev = cde_ctx->pdev; | ||
686 | int err; | ||
687 | |||
688 | /* someone has just taken it? engine deletion started? */ | ||
689 | if (cde_ctx->in_use || !cde_app->initialised) | ||
690 | return; | ||
691 | |||
692 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, | ||
693 | "cde: attempting to delete temporary %p", cde_ctx); | ||
694 | |||
695 | /* this should fail only when shutting down the whole device */ | ||
696 | err = gk20a_busy(pdev); | ||
697 | if (WARN(err, "gk20a cde: cannot set gk20a on, not freeing channel yet." | ||
698 | " rescheduling...")) { | ||
699 | schedule_delayed_work(&cde_ctx->ctx_deleter_work, | ||
700 | msecs_to_jiffies(CTX_DELETE_TIME)); | ||
701 | return; | ||
702 | } | ||
703 | |||
704 | /* mark so that nobody else assumes it's free to take */ | ||
705 | mutex_lock(&cde_app->mutex); | ||
706 | if (cde_ctx->in_use || !cde_app->initialised) { | ||
707 | gk20a_dbg(gpu_dbg_cde_ctx, | ||
708 | "cde: context use raced, not deleting %p", | ||
709 | cde_ctx); | ||
710 | goto out; | ||
711 | } | ||
712 | cde_ctx->in_use = true; | ||
713 | |||
714 | gk20a_cde_remove_ctx(cde_ctx); | ||
715 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, | ||
716 | "cde: destroyed %p len=%d use=%d max=%d", | ||
717 | cde_ctx, cde_app->lru_len, cde_app->lru_used, | ||
718 | cde_app->lru_max_len); | ||
719 | |||
720 | out: | ||
721 | mutex_unlock(&cde_app->mutex); | ||
722 | gk20a_idle(pdev); | ||
723 | } | ||
724 | |||
594 | static struct gk20a_cde_ctx *gk20a_cde_get_context(struct gk20a *g) | 725 | static struct gk20a_cde_ctx *gk20a_cde_get_context(struct gk20a *g) |
595 | { | 726 | { |
596 | struct gk20a_cde_app *cde_app = &g->cde_app; | 727 | struct gk20a_cde_app *cde_app = &g->cde_app; |
597 | struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx; | 728 | struct gk20a_cde_ctx *cde_ctx; |
598 | int i, ret; | ||
599 | 729 | ||
600 | /* try to find a jobless context */ | 730 | /* try to get a jobless context. list is in lru order */ |
731 | |||
732 | cde_ctx = list_first_entry(&cde_app->cde_ctx_lru, | ||
733 | struct gk20a_cde_ctx, list); | ||
734 | |||
735 | if (!cde_ctx->in_use) { | ||
736 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, | ||
737 | "cde: got free %p len=%d use=%d max=%d", | ||
738 | cde_ctx, cde_app->lru_len, cde_app->lru_used, | ||
739 | cde_app->lru_max_len); | ||
740 | /* deleter work may be scheduled, but in_use prevents it */ | ||
741 | cde_ctx->in_use = true; | ||
742 | list_move_tail(&cde_ctx->list, &cde_app->cde_ctx_lru); | ||
743 | cde_app->lru_used++; | ||
744 | return cde_ctx; | ||
745 | } | ||
601 | 746 | ||
602 | for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) { | 747 | /* no free contexts, get a temporary one */ |
603 | struct channel_gk20a *ch = cde_ctx->ch; | ||
604 | bool empty; | ||
605 | 748 | ||
606 | mutex_lock(&ch->jobs_lock); | 749 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, |
607 | empty = list_empty(&ch->jobs); | 750 | "cde: no free contexts, list len=%d", |
608 | mutex_unlock(&ch->jobs_lock); | 751 | cde_app->lru_len); |
609 | 752 | ||
610 | if (empty) | 753 | cde_ctx = gk20a_cde_allocate_context(g); |
611 | return cde_ctx; | 754 | if (IS_ERR(cde_ctx)) { |
755 | gk20a_warn(&g->dev->dev, "cde: cannot allocate context: %ld", | ||
756 | PTR_ERR(cde_ctx)); | ||
757 | return cde_ctx; | ||
612 | } | 758 | } |
613 | 759 | ||
614 | /* could not find a free one, so allocate dynamically */ | 760 | cde_ctx->in_use = true; |
761 | cde_ctx->is_temporary = true; | ||
762 | list_add_tail(&cde_ctx->list, &cde_app->cde_ctx_lru); | ||
763 | cde_app->lru_used++; | ||
764 | cde_app->lru_len++; | ||
765 | if (cde_app->lru_len > cde_app->lru_max_len) | ||
766 | cde_app->lru_max_len = cde_app->lru_len; | ||
767 | |||
768 | return cde_ctx; | ||
769 | } | ||
615 | 770 | ||
616 | gk20a_warn(&g->dev->dev, "cde: no free contexts, allocating temporary"); | 771 | static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g) |
772 | { | ||
773 | struct gk20a_cde_ctx *cde_ctx; | ||
774 | int ret; | ||
617 | 775 | ||
618 | cde_ctx = kzalloc(sizeof(*cde_ctx), GFP_KERNEL); | 776 | cde_ctx = kzalloc(sizeof(*cde_ctx), GFP_KERNEL); |
619 | if (!cde_ctx) | 777 | if (!cde_ctx) |
@@ -622,12 +780,19 @@ static struct gk20a_cde_ctx *gk20a_cde_get_context(struct gk20a *g) | |||
622 | cde_ctx->g = g; | 780 | cde_ctx->g = g; |
623 | cde_ctx->pdev = g->dev; | 781 | cde_ctx->pdev = g->dev; |
624 | 782 | ||
625 | ret = gk20a_cde_load(cde_ctx, true); | 783 | ret = gk20a_cde_load(cde_ctx); |
626 | if (ret) { | 784 | if (ret) { |
627 | gk20a_err(&g->dev->dev, "cde: cde load failed on temporary"); | 785 | kfree(cde_ctx); |
628 | return ERR_PTR(ret); | 786 | return ERR_PTR(ret); |
629 | } | 787 | } |
630 | 788 | ||
789 | INIT_LIST_HEAD(&cde_ctx->list); | ||
790 | cde_ctx->is_temporary = false; | ||
791 | cde_ctx->in_use = false; | ||
792 | INIT_DELAYED_WORK(&cde_ctx->ctx_deleter_work, | ||
793 | gk20a_cde_ctx_deleter_fn); | ||
794 | |||
795 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: allocated %p", cde_ctx); | ||
631 | return cde_ctx; | 796 | return cde_ctx; |
632 | } | 797 | } |
633 | 798 | ||
@@ -653,8 +818,10 @@ int gk20a_cde_convert(struct gk20a *g, | |||
653 | mutex_lock(&cde_app->mutex); | 818 | mutex_lock(&cde_app->mutex); |
654 | 819 | ||
655 | cde_ctx = gk20a_cde_get_context(g); | 820 | cde_ctx = gk20a_cde_get_context(g); |
656 | if (IS_ERR(cde_ctx)) | 821 | if (IS_ERR(cde_ctx)) { |
657 | return PTR_ERR(cde_ctx); | 822 | err = PTR_ERR(cde_ctx); |
823 | goto exit_unlock; | ||
824 | } | ||
658 | 825 | ||
659 | /* First, map the buffers to local va */ | 826 | /* First, map the buffers to local va */ |
660 | 827 | ||
@@ -665,7 +832,7 @@ int gk20a_cde_convert(struct gk20a *g, | |||
665 | 832 | ||
666 | /* map the destination buffer */ | 833 | /* map the destination buffer */ |
667 | get_dma_buf(dst); /* a ref for gk20a_vm_map */ | 834 | get_dma_buf(dst); /* a ref for gk20a_vm_map */ |
668 | dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0, | 835 | dst_vaddr = gk20a_vm_map(cde_ctx->vm, dst, 0, |
669 | NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, | 836 | NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, |
670 | dst_kind, NULL, true, | 837 | dst_kind, NULL, true, |
671 | gk20a_mem_flag_none, | 838 | gk20a_mem_flag_none, |
@@ -757,18 +924,17 @@ exit_unlock: | |||
757 | 924 | ||
758 | /* unmap the buffers - channel holds references to them now */ | 925 | /* unmap the buffers - channel holds references to them now */ |
759 | if (dst_vaddr) | 926 | if (dst_vaddr) |
760 | gk20a_vm_unmap(g->cde_app.vm, dst_vaddr); | 927 | gk20a_vm_unmap(cde_ctx->vm, dst_vaddr); |
761 | 928 | ||
762 | mutex_unlock(&cde_app->mutex); | 929 | mutex_unlock(&cde_app->mutex); |
763 | 930 | ||
764 | return err; | 931 | return err; |
765 | } | 932 | } |
766 | 933 | ||
767 | static void gk20a_free_ctx_cb(struct channel_gk20a *ch, void *data) | 934 | static void gk20a_cde_finished_ctx_cb(struct channel_gk20a *ch, void *data) |
768 | { | 935 | { |
769 | struct gk20a_cde_ctx *cde_ctx = data; | 936 | struct gk20a_cde_ctx *cde_ctx = data; |
770 | bool empty; | 937 | bool empty; |
771 | int err; | ||
772 | 938 | ||
773 | mutex_lock(&ch->jobs_lock); | 939 | mutex_lock(&ch->jobs_lock); |
774 | empty = list_empty(&ch->jobs); | 940 | empty = list_empty(&ch->jobs); |
@@ -777,19 +943,17 @@ static void gk20a_free_ctx_cb(struct channel_gk20a *ch, void *data) | |||
777 | if (!empty) | 943 | if (!empty) |
778 | return; | 944 | return; |
779 | 945 | ||
780 | /* this should fail only when shutting down the whole device */ | 946 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: finished %p", cde_ctx); |
781 | err = gk20a_busy(cde_ctx->pdev); | ||
782 | if (WARN(err, "gk20a cde: cannot set gk20a on, not freeing channel" | ||
783 | ", leaking memory")) | ||
784 | return; | ||
785 | 947 | ||
786 | gk20a_cde_remove(cde_ctx); | 948 | /* delete temporary contexts later */ |
787 | gk20a_idle(cde_ctx->pdev); | 949 | if (cde_ctx->is_temporary) |
950 | schedule_delayed_work(&cde_ctx->ctx_deleter_work, | ||
951 | msecs_to_jiffies(CTX_DELETE_TIME)); | ||
788 | 952 | ||
789 | kfree(cde_ctx); | 953 | gk20a_ctx_release(cde_ctx); |
790 | } | 954 | } |
791 | 955 | ||
792 | static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx, bool free_after_use) | 956 | static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx) |
793 | { | 957 | { |
794 | struct gk20a *g = cde_ctx->g; | 958 | struct gk20a *g = cde_ctx->g; |
795 | const struct firmware *img; | 959 | const struct firmware *img; |
@@ -804,10 +968,8 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx, bool free_after_use) | |||
804 | return -ENOSYS; | 968 | return -ENOSYS; |
805 | } | 969 | } |
806 | 970 | ||
807 | if (free_after_use) | 971 | ch = gk20a_open_new_channel_with_cb(g, gk20a_cde_finished_ctx_cb, |
808 | ch = gk20a_open_new_channel_with_cb(g, gk20a_free_ctx_cb, cde_ctx); | 972 | cde_ctx); |
809 | else | ||
810 | ch = gk20a_open_new_channel(g); | ||
811 | if (!ch) { | 973 | if (!ch) { |
812 | gk20a_warn(&cde_ctx->pdev->dev, "cde: gk20a channel not available"); | 974 | gk20a_warn(&cde_ctx->pdev->dev, "cde: gk20a channel not available"); |
813 | err = -ENOMEM; | 975 | err = -ENOMEM; |
@@ -876,8 +1038,7 @@ err_get_gk20a_channel: | |||
876 | int gk20a_cde_reload(struct gk20a *g) | 1038 | int gk20a_cde_reload(struct gk20a *g) |
877 | { | 1039 | { |
878 | struct gk20a_cde_app *cde_app = &g->cde_app; | 1040 | struct gk20a_cde_app *cde_app = &g->cde_app; |
879 | struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx; | 1041 | int err; |
880 | int err, i; | ||
881 | 1042 | ||
882 | if (!cde_app->initialised) | 1043 | if (!cde_app->initialised) |
883 | return -ENOSYS; | 1044 | return -ENOSYS; |
@@ -887,10 +1048,12 @@ int gk20a_cde_reload(struct gk20a *g) | |||
887 | return err; | 1048 | return err; |
888 | 1049 | ||
889 | mutex_lock(&cde_app->mutex); | 1050 | mutex_lock(&cde_app->mutex); |
890 | for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) { | 1051 | |
891 | gk20a_cde_remove(cde_ctx); | 1052 | gk20a_cde_stop(g); |
892 | err = gk20a_cde_load(cde_ctx, false); | 1053 | |
893 | } | 1054 | err = gk20a_cde_allocate_contexts(g); |
1055 | if (!err) | ||
1056 | cde_app->initialised = true; | ||
894 | 1057 | ||
895 | mutex_unlock(&cde_app->mutex); | 1058 | mutex_unlock(&cde_app->mutex); |
896 | 1059 | ||
@@ -901,39 +1064,28 @@ int gk20a_cde_reload(struct gk20a *g) | |||
901 | int gk20a_init_cde_support(struct gk20a *g) | 1064 | int gk20a_init_cde_support(struct gk20a *g) |
902 | { | 1065 | { |
903 | struct gk20a_cde_app *cde_app = &g->cde_app; | 1066 | struct gk20a_cde_app *cde_app = &g->cde_app; |
904 | struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx; | 1067 | int err; |
905 | int ret, i; | ||
906 | 1068 | ||
907 | if (cde_app->initialised) | 1069 | if (cde_app->initialised) |
908 | return 0; | 1070 | return 0; |
909 | 1071 | ||
1072 | gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: init"); | ||
1073 | |||
910 | mutex_init(&cde_app->mutex); | 1074 | mutex_init(&cde_app->mutex); |
911 | mutex_lock(&cde_app->mutex); | 1075 | mutex_lock(&cde_app->mutex); |
912 | 1076 | ||
913 | for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) { | 1077 | INIT_LIST_HEAD(&cde_app->cde_ctx_lru); |
914 | cde_ctx->g = g; | 1078 | cde_app->lru_len = 0; |
915 | cde_ctx->pdev = g->dev; | 1079 | cde_app->lru_max_len = 0; |
916 | ret = gk20a_cde_load(cde_ctx, false); | 1080 | cde_app->lru_used = 0; |
917 | if (ret) | ||
918 | goto err_init_instance; | ||
919 | } | ||
920 | 1081 | ||
921 | /* take shadow to the vm for general usage */ | 1082 | err = gk20a_cde_allocate_contexts(g); |
922 | cde_app->vm = cde_app->cde_ctx->vm; | 1083 | if (!err) |
1084 | cde_app->initialised = true; | ||
923 | 1085 | ||
924 | cde_app->initialised = true; | ||
925 | mutex_unlock(&cde_app->mutex); | 1086 | mutex_unlock(&cde_app->mutex); |
926 | 1087 | gk20a_dbg(gpu_dbg_cde_ctx, "cde: init finished: %d", err); | |
927 | return 0; | 1088 | return err; |
928 | |||
929 | err_init_instance: | ||
930 | |||
931 | /* deinitialise initialised channels */ | ||
932 | while (i--) { | ||
933 | gk20a_cde_remove(cde_ctx); | ||
934 | cde_ctx--; | ||
935 | } | ||
936 | return ret; | ||
937 | } | 1089 | } |
938 | 1090 | ||
939 | enum cde_launch_patch_offset { | 1091 | enum cde_launch_patch_offset { |
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h index e4d4659d..4120dc94 100644 --- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h | |||
@@ -242,19 +242,26 @@ struct gk20a_cde_ctx { | |||
242 | struct kobj_attribute attr; | 242 | struct kobj_attribute attr; |
243 | 243 | ||
244 | bool init_cmd_executed; | 244 | bool init_cmd_executed; |
245 | |||
246 | struct list_head list; | ||
247 | bool is_temporary; | ||
248 | bool in_use; | ||
249 | struct delayed_work ctx_deleter_work; | ||
245 | }; | 250 | }; |
246 | 251 | ||
247 | struct gk20a_cde_app { | 252 | struct gk20a_cde_app { |
248 | bool initialised; | 253 | bool initialised; |
249 | struct mutex mutex; | 254 | struct mutex mutex; |
250 | struct vm_gk20a *vm; | ||
251 | 255 | ||
252 | struct gk20a_cde_ctx cde_ctx[NUM_CDE_CONTEXTS]; | 256 | struct list_head cde_ctx_lru; |
257 | int lru_len; | ||
258 | int lru_max_len; | ||
259 | int lru_used; | ||
253 | 260 | ||
254 | u32 shader_parameter; | 261 | u32 shader_parameter; |
255 | }; | 262 | }; |
256 | 263 | ||
257 | int gk20a_cde_destroy(struct gk20a *g); | 264 | void gk20a_cde_destroy(struct gk20a *g); |
258 | int gk20a_init_cde_support(struct gk20a *g); | 265 | int gk20a_init_cde_support(struct gk20a *g); |
259 | int gk20a_cde_reload(struct gk20a *g); | 266 | int gk20a_cde_reload(struct gk20a *g); |
260 | int gk20a_cde_convert(struct gk20a *g, struct dma_buf *dst, | 267 | int gk20a_cde_convert(struct gk20a *g, struct dma_buf *dst, |
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index 309a1b08..825cb886 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h | |||
@@ -535,6 +535,7 @@ enum gk20a_dbg_categories { | |||
535 | gpu_dbg_map = BIT(8), /* mem mappings */ | 535 | gpu_dbg_map = BIT(8), /* mem mappings */ |
536 | gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */ | 536 | gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */ |
537 | gpu_dbg_cde = BIT(10), /* cde info messages */ | 537 | gpu_dbg_cde = BIT(10), /* cde info messages */ |
538 | gpu_dbg_cde_ctx = BIT(11), /* cde context usage messages */ | ||
538 | gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */ | 539 | gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */ |
539 | }; | 540 | }; |
540 | 541 | ||