author    Konsta Holtta <kholtta@nvidia.com>    2014-10-23 07:10:57 -0400
committer Dan Willemsen <dwillemsen@nvidia.com> 2015-03-18 15:11:58 -0400
commit    14577a339ccc160ed58f8d936ebcbd96dba3b6ca
tree      2e55969ea66a15b23f799e0054d4cba4ab6d85fe
parent    b1088fe769ea900438a39c9e9920157b4ba7436a
gpu: nvgpu: cde: list for contexts, defer deletion
Instead of the current preallocated array plus dynamically allocated
temporary contexts, use a linked list kept in LRU fashion, with free
contexts always stored at the beginning of the list. Put the
preallocated contexts on the list at init, and store dynamically
allocated temporaries there too so they can be reused quickly when
needed; a delayed work deletes the temporaries once the high load has
diminished.

Bug 200040211

Change-Id: Ibc75a0150109ec9c44b2eeb74607450990584b18
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: http://git-master/r/562856
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
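Read as an algorithm, the commit message describes an LRU free-list with deferred reclamation of overflow entries. The following is a minimal kernel-style sketch of that scheme, not code from this patch: the names (ctx_pool, ctx_get, ctx_put, ctx_deleter_fn) are hypothetical, permanent contexts are assumed to be seeded onto the list at init with their pool pointer set, and the real driver additionally tracks lru_len/lru_used/lru_max_len statistics and holds a runtime-PM reference (gk20a_busy/gk20a_idle) around teardown.

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

#define CTX_DELETE_TIME 1000	/* ms a temporary may sit idle before reaping */

struct ctx_pool;

struct ctx {
	struct list_head node;
	struct ctx_pool *pool;
	bool in_use;
	bool is_temporary;
	struct delayed_work deleter;
};

struct ctx_pool {
	struct mutex lock;
	struct list_head lru;	/* free contexts at the head, busy at the tail */
};

static void ctx_deleter_fn(struct work_struct *work)
{
	struct ctx *c = container_of(to_delayed_work(work),
				     struct ctx, deleter);
	struct ctx_pool *pool = c->pool;

	mutex_lock(&pool->lock);
	if (c->in_use) {	/* reuse raced us; the next release rearms */
		mutex_unlock(&pool->lock);
		return;
	}
	list_del(&c->node);
	mutex_unlock(&pool->lock);
	kfree(c);
}

static struct ctx *ctx_get(struct ctx_pool *pool)
{
	struct ctx *c;

	mutex_lock(&pool->lock);
	/* the list is seeded with preallocated contexts, so it is never
	 * empty; the head is the least recently used entry */
	c = list_first_entry(&pool->lru, struct ctx, node);
	if (!c->in_use) {
		c->in_use = true;
		list_move_tail(&c->node, &pool->lru);
		mutex_unlock(&pool->lock);
		return c;
	}
	mutex_unlock(&pool->lock);

	/* everything is busy: allocate a temporary that reaps itself */
	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (!c)
		return NULL;
	c->pool = pool;
	c->in_use = true;
	c->is_temporary = true;
	INIT_DELAYED_WORK(&c->deleter, ctx_deleter_fn);

	mutex_lock(&pool->lock);
	list_add_tail(&c->node, &pool->lru);
	mutex_unlock(&pool->lock);
	return c;
}

static void ctx_put(struct ctx *c)
{
	struct ctx_pool *pool = c->pool;

	mutex_lock(&pool->lock);
	c->in_use = false;
	list_move(&c->node, &pool->lru);	/* back to the free end */
	mutex_unlock(&pool->lock);

	/* temporaries are not freed eagerly; deletion is deferred so a
	 * burst of conversions can keep reusing them */
	if (c->is_temporary)
		schedule_delayed_work(&c->deleter,
				      msecs_to_jiffies(CTX_DELETE_TIME));
}

The in_use flag doubles as the race guard: a deleter that fires after its context has been handed out again simply backs off, and the next release rearms it.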
Diffstat (limited to 'drivers/gpu')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/cde_gk20a.c | 300
-rw-r--r--  drivers/gpu/nvgpu/gk20a/cde_gk20a.h |  13
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h     |   1
3 files changed, 237 insertions, 77 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index ee62f02a..9067aae5 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -34,7 +34,10 @@
 #include "hw_ccsr_gk20a.h"
 #include "hw_pbdma_gk20a.h"
 
-static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx, bool free_after_use);
+static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx);
+static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g);
+
+#define CTX_DELETE_TIME 1000
 
 static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx)
 {
@@ -67,7 +70,7 @@ static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx)
 	cde_ctx->init_cmd_executed = false;
 }
 
-static int gk20a_cde_remove(struct gk20a_cde_ctx *cde_ctx)
+static void gk20a_cde_remove_ctx(struct gk20a_cde_ctx *cde_ctx)
 {
 	struct gk20a *g = cde_ctx->g;
 	struct channel_gk20a *ch = cde_ctx->ch;
@@ -81,23 +84,90 @@ static int gk20a_cde_remove(struct gk20a_cde_ctx *cde_ctx)
 	gk20a_gmmu_unmap(vm, cde_ctx->backing_store_vaddr,
 			 g->gr.compbit_store.size, 1);
 
-	return 0;
+	/* housekeeping on app */
+	list_del(&cde_ctx->list);
+	cde_ctx->g->cde_app.lru_len--;
+	kfree(cde_ctx);
+}
+
+static void gk20a_cde_prepare_ctx_remove(struct gk20a_cde_ctx *cde_ctx)
+{
+	struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+
+	/* permanent contexts do not have deleter works */
+	if (!cde_ctx->is_temporary)
+		return;
+
+	/* safe to go off the mutex since app is deinitialised. deleter works
+	 * may be only at waiting for the mutex or before, going to abort */
+	mutex_unlock(&cde_app->mutex);
+
+	/* the deleter can rearm itself */
+	do {
+		cancel_delayed_work_sync(&cde_ctx->ctx_deleter_work);
+	} while (delayed_work_pending(&cde_ctx->ctx_deleter_work));
+
+	mutex_lock(&cde_app->mutex);
 }
 
-int gk20a_cde_destroy(struct gk20a *g)
+static void gk20a_cde_deallocate_contexts(struct gk20a *g)
 {
 	struct gk20a_cde_app *cde_app = &g->cde_app;
-	struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx;
-	int ret, i;
+	struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save;
 
-	if (!cde_app->initialised)
-		return 0;
+	list_for_each_entry_safe(cde_ctx, cde_ctx_save,
+			&cde_app->cde_ctx_lru, list) {
+		gk20a_cde_prepare_ctx_remove(cde_ctx);
+		gk20a_cde_remove_ctx(cde_ctx);
+	}
+}
 
-	for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++)
-		ret = gk20a_cde_remove(cde_ctx);
+void gk20a_cde_stop(struct gk20a *g)
+{
+	struct gk20a_cde_app *cde_app = &g->cde_app;
 
+	/* prevent further conversions and delayed works from working */
 	cde_app->initialised = false;
-	return ret;
+	/* free all data, empty the list */
+	gk20a_cde_deallocate_contexts(g);
+}
+
+void gk20a_cde_destroy(struct gk20a *g)
+{
+	struct gk20a_cde_app *cde_app = &g->cde_app;
+
+	if (!cde_app->initialised)
+		return;
+
+	mutex_lock(&cde_app->mutex);
+	gk20a_cde_stop(g);
+	mutex_unlock(&cde_app->mutex);
+}
+
+static int gk20a_cde_allocate_contexts(struct gk20a *g)
+{
+	struct gk20a_cde_app *cde_app = &g->cde_app;
+	struct gk20a_cde_ctx *cde_ctx;
+	int err = 0;
+	int i;
+
+	for (i = 0; i < NUM_CDE_CONTEXTS; i++) {
+		cde_ctx = gk20a_cde_allocate_context(g);
+		if (IS_ERR(cde_ctx)) {
+			err = PTR_ERR(cde_ctx);
+			goto out;
+		}
+
+		list_add(&cde_ctx->list, &cde_app->cde_ctx_lru);
+		cde_app->lru_len++;
+		if (cde_app->lru_len > cde_app->lru_max_len)
+			cde_app->lru_max_len = cde_app->lru_len;
+	}
+
+	return 0;
+out:
+	gk20a_cde_deallocate_contexts(g);
+	return err;
 }
 
 static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx,
@@ -591,29 +661,117 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
 			       num_entries, flags, fence, fence_out);
 }
 
+static void gk20a_ctx_release(struct gk20a_cde_ctx *cde_ctx)
+{
+	struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+
+	gk20a_dbg(gpu_dbg_cde_ctx, "releasing use on %p", cde_ctx);
+
+	mutex_lock(&cde_app->mutex);
+
+	cde_ctx->in_use = false;
+	list_move(&cde_ctx->list, &cde_app->cde_ctx_lru);
+	cde_app->lru_used--;
+
+	mutex_unlock(&cde_app->mutex);
+}
+
+static void gk20a_cde_ctx_deleter_fn(struct work_struct *work)
+{
+	struct delayed_work *delay_work = to_delayed_work(work);
+	struct gk20a_cde_ctx *cde_ctx = container_of(delay_work,
+			struct gk20a_cde_ctx, ctx_deleter_work);
+	struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+	struct platform_device *pdev = cde_ctx->pdev;
+	int err;
+
+	/* someone has just taken it? engine deletion started? */
+	if (cde_ctx->in_use || !cde_app->initialised)
+		return;
+
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
+			"cde: attempting to delete temporary %p", cde_ctx);
+
+	/* this should fail only when shutting down the whole device */
+	err = gk20a_busy(pdev);
+	if (WARN(err, "gk20a cde: cannot set gk20a on, not freeing channel yet."
+			" rescheduling...")) {
+		schedule_delayed_work(&cde_ctx->ctx_deleter_work,
+				msecs_to_jiffies(CTX_DELETE_TIME));
+		return;
+	}
+
+	/* mark so that nobody else assumes it's free to take */
+	mutex_lock(&cde_app->mutex);
+	if (cde_ctx->in_use || !cde_app->initialised) {
+		gk20a_dbg(gpu_dbg_cde_ctx,
+				"cde: context use raced, not deleting %p",
+				cde_ctx);
+		goto out;
+	}
+	cde_ctx->in_use = true;
+
+	gk20a_cde_remove_ctx(cde_ctx);
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
+			"cde: destroyed %p len=%d use=%d max=%d",
+			cde_ctx, cde_app->lru_len, cde_app->lru_used,
+			cde_app->lru_max_len);
+
+out:
+	mutex_unlock(&cde_app->mutex);
+	gk20a_idle(pdev);
+}
+
 static struct gk20a_cde_ctx *gk20a_cde_get_context(struct gk20a *g)
 {
 	struct gk20a_cde_app *cde_app = &g->cde_app;
-	struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx;
-	int i, ret;
+	struct gk20a_cde_ctx *cde_ctx;
 
-	/* try to find a jobless context */
+	/* try to get a jobless context. list is in lru order */
+
+	cde_ctx = list_first_entry(&cde_app->cde_ctx_lru,
+			struct gk20a_cde_ctx, list);
+
+	if (!cde_ctx->in_use) {
+		gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
+				"cde: got free %p len=%d use=%d max=%d",
+				cde_ctx, cde_app->lru_len, cde_app->lru_used,
+				cde_app->lru_max_len);
+		/* deleter work may be scheduled, but in_use prevents it */
+		cde_ctx->in_use = true;
+		list_move_tail(&cde_ctx->list, &cde_app->cde_ctx_lru);
+		cde_app->lru_used++;
+		return cde_ctx;
+	}
 
-	for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) {
-		struct channel_gk20a *ch = cde_ctx->ch;
-		bool empty;
+	/* no free contexts, get a temporary one */
 
-		mutex_lock(&ch->jobs_lock);
-		empty = list_empty(&ch->jobs);
-		mutex_unlock(&ch->jobs_lock);
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
+			"cde: no free contexts, list len=%d",
+			cde_app->lru_len);
 
-		if (empty)
-			return cde_ctx;
+	cde_ctx = gk20a_cde_allocate_context(g);
+	if (IS_ERR(cde_ctx)) {
+		gk20a_warn(&g->dev->dev, "cde: cannot allocate context: %ld",
+				PTR_ERR(cde_ctx));
+		return cde_ctx;
 	}
 
-	/* could not find a free one, so allocate dynamically */
+	cde_ctx->in_use = true;
+	cde_ctx->is_temporary = true;
+	list_add_tail(&cde_ctx->list, &cde_app->cde_ctx_lru);
+	cde_app->lru_used++;
+	cde_app->lru_len++;
+	if (cde_app->lru_len > cde_app->lru_max_len)
+		cde_app->lru_max_len = cde_app->lru_len;
+
+	return cde_ctx;
+}
 
-	gk20a_warn(&g->dev->dev, "cde: no free contexts, allocating temporary");
+static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g)
+{
+	struct gk20a_cde_ctx *cde_ctx;
+	int ret;
 
 	cde_ctx = kzalloc(sizeof(*cde_ctx), GFP_KERNEL);
 	if (!cde_ctx)
@@ -622,12 +780,19 @@ static struct gk20a_cde_ctx *gk20a_cde_get_context(struct gk20a *g)
 	cde_ctx->g = g;
 	cde_ctx->pdev = g->dev;
 
-	ret = gk20a_cde_load(cde_ctx, true);
+	ret = gk20a_cde_load(cde_ctx);
 	if (ret) {
-		gk20a_err(&g->dev->dev, "cde: cde load failed on temporary");
+		kfree(cde_ctx);
 		return ERR_PTR(ret);
 	}
 
+	INIT_LIST_HEAD(&cde_ctx->list);
+	cde_ctx->is_temporary = false;
+	cde_ctx->in_use = false;
+	INIT_DELAYED_WORK(&cde_ctx->ctx_deleter_work,
+			gk20a_cde_ctx_deleter_fn);
+
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: allocated %p", cde_ctx);
 	return cde_ctx;
 }
 
@@ -653,8 +818,10 @@ int gk20a_cde_convert(struct gk20a *g,
 	mutex_lock(&cde_app->mutex);
 
 	cde_ctx = gk20a_cde_get_context(g);
-	if (IS_ERR(cde_ctx))
-		return PTR_ERR(cde_ctx);
+	if (IS_ERR(cde_ctx)) {
+		err = PTR_ERR(cde_ctx);
+		goto exit_unlock;
+	}
 
 	/* First, map the buffers to local va */
 
@@ -665,7 +832,7 @@ int gk20a_cde_convert(struct gk20a *g,
 
 	/* map the destination buffer */
 	get_dma_buf(dst); /* a ref for gk20a_vm_map */
-	dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0,
+	dst_vaddr = gk20a_vm_map(cde_ctx->vm, dst, 0,
 				 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
 				 dst_kind, NULL, true,
 				 gk20a_mem_flag_none,
@@ -757,18 +924,17 @@ exit_unlock:
 
 	/* unmap the buffers - channel holds references to them now */
 	if (dst_vaddr)
-		gk20a_vm_unmap(g->cde_app.vm, dst_vaddr);
+		gk20a_vm_unmap(cde_ctx->vm, dst_vaddr);
 
 	mutex_unlock(&cde_app->mutex);
 
 	return err;
 }
 
-static void gk20a_free_ctx_cb(struct channel_gk20a *ch, void *data)
+static void gk20a_cde_finished_ctx_cb(struct channel_gk20a *ch, void *data)
 {
 	struct gk20a_cde_ctx *cde_ctx = data;
 	bool empty;
-	int err;
 
 	mutex_lock(&ch->jobs_lock);
 	empty = list_empty(&ch->jobs);
@@ -777,19 +943,17 @@ static void gk20a_free_ctx_cb(struct channel_gk20a *ch, void *data)
 	if (!empty)
 		return;
 
-	/* this should fail only when shutting down the whole device */
-	err = gk20a_busy(cde_ctx->pdev);
-	if (WARN(err, "gk20a cde: cannot set gk20a on, not freeing channel"
-		", leaking memory"))
-		return;
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: finished %p", cde_ctx);
 
-	gk20a_cde_remove(cde_ctx);
-	gk20a_idle(cde_ctx->pdev);
+	/* delete temporary contexts later */
+	if (cde_ctx->is_temporary)
+		schedule_delayed_work(&cde_ctx->ctx_deleter_work,
+				msecs_to_jiffies(CTX_DELETE_TIME));
 
-	kfree(cde_ctx);
+	gk20a_ctx_release(cde_ctx);
 }
 
-static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx, bool free_after_use)
+static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 {
 	struct gk20a *g = cde_ctx->g;
 	const struct firmware *img;
@@ -804,10 +968,8 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx, bool free_after_use)
 		return -ENOSYS;
 	}
 
-	if (free_after_use)
-		ch = gk20a_open_new_channel_with_cb(g, gk20a_free_ctx_cb, cde_ctx);
-	else
-		ch = gk20a_open_new_channel(g);
+	ch = gk20a_open_new_channel_with_cb(g, gk20a_cde_finished_ctx_cb,
+			cde_ctx);
 	if (!ch) {
 		gk20a_warn(&cde_ctx->pdev->dev, "cde: gk20a channel not available");
 		err = -ENOMEM;
@@ -876,8 +1038,7 @@ err_get_gk20a_channel:
 int gk20a_cde_reload(struct gk20a *g)
 {
 	struct gk20a_cde_app *cde_app = &g->cde_app;
-	struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx;
-	int err, i;
+	int err;
 
 	if (!cde_app->initialised)
 		return -ENOSYS;
@@ -887,10 +1048,12 @@ int gk20a_cde_reload(struct gk20a *g)
 		return err;
 
 	mutex_lock(&cde_app->mutex);
-	for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) {
-		gk20a_cde_remove(cde_ctx);
-		err = gk20a_cde_load(cde_ctx, false);
-	}
+
+	gk20a_cde_stop(g);
+
+	err = gk20a_cde_allocate_contexts(g);
+	if (!err)
+		cde_app->initialised = true;
 
 	mutex_unlock(&cde_app->mutex);
 
@@ -901,39 +1064,28 @@ int gk20a_cde_reload(struct gk20a *g)
 int gk20a_init_cde_support(struct gk20a *g)
 {
 	struct gk20a_cde_app *cde_app = &g->cde_app;
-	struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx;
-	int ret, i;
+	int err;
 
 	if (cde_app->initialised)
 		return 0;
 
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: init");
+
 	mutex_init(&cde_app->mutex);
 	mutex_lock(&cde_app->mutex);
 
-	for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) {
-		cde_ctx->g = g;
-		cde_ctx->pdev = g->dev;
-		ret = gk20a_cde_load(cde_ctx, false);
-		if (ret)
-			goto err_init_instance;
-	}
+	INIT_LIST_HEAD(&cde_app->cde_ctx_lru);
+	cde_app->lru_len = 0;
+	cde_app->lru_max_len = 0;
+	cde_app->lru_used = 0;
 
-	/* take shadow to the vm for general usage */
-	cde_app->vm = cde_app->cde_ctx->vm;
+	err = gk20a_cde_allocate_contexts(g);
+	if (!err)
+		cde_app->initialised = true;
 
-	cde_app->initialised = true;
 	mutex_unlock(&cde_app->mutex);
-
-	return 0;
-
-err_init_instance:
-
-	/* deinitialise initialised channels */
-	while (i--) {
-		gk20a_cde_remove(cde_ctx);
-		cde_ctx--;
-	}
-	return ret;
+	gk20a_dbg(gpu_dbg_cde_ctx, "cde: init finished: %d", err);
+	return err;
 }
 
 enum cde_launch_patch_offset {
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
index e4d4659d..4120dc94 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
@@ -242,19 +242,26 @@ struct gk20a_cde_ctx {
 	struct kobj_attribute attr;
 
 	bool init_cmd_executed;
+
+	struct list_head list;
+	bool is_temporary;
+	bool in_use;
+	struct delayed_work ctx_deleter_work;
 };
 
 struct gk20a_cde_app {
 	bool initialised;
 	struct mutex mutex;
-	struct vm_gk20a *vm;
 
-	struct gk20a_cde_ctx cde_ctx[NUM_CDE_CONTEXTS];
+	struct list_head cde_ctx_lru;
+	int lru_len;
+	int lru_max_len;
+	int lru_used;
 
 	u32 shader_parameter;
 };
 
-int gk20a_cde_destroy(struct gk20a *g);
+void gk20a_cde_destroy(struct gk20a *g);
 int gk20a_init_cde_support(struct gk20a *g);
 int gk20a_cde_reload(struct gk20a *g);
 int gk20a_cde_convert(struct gk20a *g, struct dma_buf *dst,
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 309a1b08..825cb886 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -535,6 +535,7 @@ enum gk20a_dbg_categories {
 	gpu_dbg_map = BIT(8), /* mem mappings */
 	gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */
 	gpu_dbg_cde = BIT(10), /* cde info messages */
+	gpu_dbg_cde_ctx = BIT(11), /* cde context usage messages */
 	gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */
 };
 