From 14577a339ccc160ed58f8d936ebcbd96dba3b6ca Mon Sep 17 00:00:00 2001
From: Konsta Holtta
Date: Thu, 23 Oct 2014 14:10:57 +0300
Subject: gpu: nvgpu: cde: list for contexts, defer deletion

Instead of the current preallocated array plus dynamically allocated
temporary contexts, keep the contexts in a linked list maintained in
LRU order, with free contexts always stored at the beginning of the
list. Add the preallocated contexts to the list at initialization and
store dynamically allocated temporaries there too so that they can be
reused quickly; a delayed work deletes the temporaries once the high
load has diminished.

Bug 200040211

Change-Id: Ibc75a0150109ec9c44b2eeb74607450990584b18
Signed-off-by: Konsta Holtta
Reviewed-on: http://git-master/r/562856
Reviewed-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/gk20a/cde_gk20a.c | 300 +++++++++++++++++++++++++++---------
 drivers/gpu/nvgpu/gk20a/cde_gk20a.h |  13 +-
 drivers/gpu/nvgpu/gk20a/gk20a.h     |   1 +
 3 files changed, 237 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index ee62f02a..9067aae5 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -34,7 +34,10 @@
 #include "hw_ccsr_gk20a.h"
 #include "hw_pbdma_gk20a.h"
 
-static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx, bool free_after_use);
+static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx);
+static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g);
+
+#define CTX_DELETE_TIME 1000
 
 static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx)
 {
@@ -67,7 +70,7 @@ static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx)
 	cde_ctx->init_cmd_executed = false;
 }
 
-static int gk20a_cde_remove(struct gk20a_cde_ctx *cde_ctx)
+static void gk20a_cde_remove_ctx(struct gk20a_cde_ctx *cde_ctx)
 {
 	struct gk20a *g = cde_ctx->g;
 	struct channel_gk20a *ch = cde_ctx->ch;
@@ -81,23 +84,90 @@ static int gk20a_cde_remove(struct gk20a_cde_ctx *cde_ctx)
 	gk20a_gmmu_unmap(vm, cde_ctx->backing_store_vaddr,
 			 g->gr.compbit_store.size, 1);
 
-	return 0;
+	/* housekeeping on app */
+	list_del(&cde_ctx->list);
+	cde_ctx->g->cde_app.lru_len--;
+	kfree(cde_ctx);
+}
+
+static void gk20a_cde_prepare_ctx_remove(struct gk20a_cde_ctx *cde_ctx)
+{
+	struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+
+	/* permanent contexts do not have deleter works */
+	if (!cde_ctx->is_temporary)
+		return;
+
+	/* safe to go off the mutex since app is deinitialised. deleter works
+	 * may be only at waiting for the mutex or before, going to abort */
+	mutex_unlock(&cde_app->mutex);
+
+	/* the deleter can rearm itself */
+	do {
+		cancel_delayed_work_sync(&cde_ctx->ctx_deleter_work);
+	} while (delayed_work_pending(&cde_ctx->ctx_deleter_work));
+
+	mutex_lock(&cde_app->mutex);
 }
 
-int gk20a_cde_destroy(struct gk20a *g)
+static void gk20a_cde_deallocate_contexts(struct gk20a *g)
 {
 	struct gk20a_cde_app *cde_app = &g->cde_app;
-	struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx;
-	int ret, i;
+	struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save;
 
-	if (!cde_app->initialised)
-		return 0;
+	list_for_each_entry_safe(cde_ctx, cde_ctx_save,
+			&cde_app->cde_ctx_lru, list) {
+		gk20a_cde_prepare_ctx_remove(cde_ctx);
+		gk20a_cde_remove_ctx(cde_ctx);
+	}
+}
 
-	for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++)
-		ret = gk20a_cde_remove(cde_ctx);
+void gk20a_cde_stop(struct gk20a *g)
+{
+	struct gk20a_cde_app *cde_app = &g->cde_app;
 
+	/* prevent further conversions and delayed works from working */
 	cde_app->initialised = false;
 
-	return ret;
+	/* free all data, empty the list */
+	gk20a_cde_deallocate_contexts(g);
+}
+
+void gk20a_cde_destroy(struct gk20a *g)
+{
+	struct gk20a_cde_app *cde_app = &g->cde_app;
+
+	if (!cde_app->initialised)
+		return;
+
+	mutex_lock(&cde_app->mutex);
+	gk20a_cde_stop(g);
+	mutex_unlock(&cde_app->mutex);
+}
+
+static int gk20a_cde_allocate_contexts(struct gk20a *g)
+{
+	struct gk20a_cde_app *cde_app = &g->cde_app;
+	struct gk20a_cde_ctx *cde_ctx;
+	int err = 0;
+	int i;
+
+	for (i = 0; i < NUM_CDE_CONTEXTS; i++) {
+		cde_ctx = gk20a_cde_allocate_context(g);
+		if (IS_ERR(cde_ctx)) {
+			err = PTR_ERR(cde_ctx);
+			goto out;
+		}
+
+		list_add(&cde_ctx->list, &cde_app->cde_ctx_lru);
+		cde_app->lru_len++;
+		if (cde_app->lru_len > cde_app->lru_max_len)
+			cde_app->lru_max_len = cde_app->lru_len;
+	}
+
+	return 0;
+out:
+	gk20a_cde_deallocate_contexts(g);
+	return err;
 }
 
 static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx,
@@ -591,29 +661,117 @@ static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
 			       num_entries, flags, fence, fence_out);
 }
 
+static void gk20a_ctx_release(struct gk20a_cde_ctx *cde_ctx)
+{
+	struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+
+	gk20a_dbg(gpu_dbg_cde_ctx, "releasing use on %p", cde_ctx);
+
+	mutex_lock(&cde_app->mutex);
+
+	cde_ctx->in_use = false;
+	list_move(&cde_ctx->list, &cde_app->cde_ctx_lru);
+	cde_app->lru_used--;
+
+	mutex_unlock(&cde_app->mutex);
+}
+
+static void gk20a_cde_ctx_deleter_fn(struct work_struct *work)
+{
+	struct delayed_work *delay_work = to_delayed_work(work);
+	struct gk20a_cde_ctx *cde_ctx = container_of(delay_work,
+			struct gk20a_cde_ctx, ctx_deleter_work);
+	struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
+	struct platform_device *pdev = cde_ctx->pdev;
+	int err;
+
+	/* someone has just taken it? engine deletion started? */
+	if (cde_ctx->in_use || !cde_app->initialised)
+		return;
+
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
+			"cde: attempting to delete temporary %p", cde_ctx);
+
+	/* this should fail only when shutting down the whole device */
+	err = gk20a_busy(pdev);
+	if (WARN(err, "gk20a cde: cannot set gk20a on, not freeing channel yet."
+ " rescheduling...")) { + schedule_delayed_work(&cde_ctx->ctx_deleter_work, + msecs_to_jiffies(CTX_DELETE_TIME)); + return; + } + + /* mark so that nobody else assumes it's free to take */ + mutex_lock(&cde_app->mutex); + if (cde_ctx->in_use || !cde_app->initialised) { + gk20a_dbg(gpu_dbg_cde_ctx, + "cde: context use raced, not deleting %p", + cde_ctx); + goto out; + } + cde_ctx->in_use = true; + + gk20a_cde_remove_ctx(cde_ctx); + gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, + "cde: destroyed %p len=%d use=%d max=%d", + cde_ctx, cde_app->lru_len, cde_app->lru_used, + cde_app->lru_max_len); + +out: + mutex_unlock(&cde_app->mutex); + gk20a_idle(pdev); +} + static struct gk20a_cde_ctx *gk20a_cde_get_context(struct gk20a *g) { struct gk20a_cde_app *cde_app = &g->cde_app; - struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx; - int i, ret; + struct gk20a_cde_ctx *cde_ctx; - /* try to find a jobless context */ + /* try to get a jobless context. list is in lru order */ + + cde_ctx = list_first_entry(&cde_app->cde_ctx_lru, + struct gk20a_cde_ctx, list); + + if (!cde_ctx->in_use) { + gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, + "cde: got free %p len=%d use=%d max=%d", + cde_ctx, cde_app->lru_len, cde_app->lru_used, + cde_app->lru_max_len); + /* deleter work may be scheduled, but in_use prevents it */ + cde_ctx->in_use = true; + list_move_tail(&cde_ctx->list, &cde_app->cde_ctx_lru); + cde_app->lru_used++; + return cde_ctx; + } - for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) { - struct channel_gk20a *ch = cde_ctx->ch; - bool empty; + /* no free contexts, get a temporary one */ - mutex_lock(&ch->jobs_lock); - empty = list_empty(&ch->jobs); - mutex_unlock(&ch->jobs_lock); + gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, + "cde: no free contexts, list len=%d", + cde_app->lru_len); - if (empty) - return cde_ctx; + cde_ctx = gk20a_cde_allocate_context(g); + if (IS_ERR(cde_ctx)) { + gk20a_warn(&g->dev->dev, "cde: cannot allocate context: %ld", + PTR_ERR(cde_ctx)); + return cde_ctx; } - /* could not find a free one, so allocate dynamically */ + cde_ctx->in_use = true; + cde_ctx->is_temporary = true; + list_add_tail(&cde_ctx->list, &cde_app->cde_ctx_lru); + cde_app->lru_used++; + cde_app->lru_len++; + if (cde_app->lru_len > cde_app->lru_max_len) + cde_app->lru_max_len = cde_app->lru_len; + + return cde_ctx; +} - gk20a_warn(&g->dev->dev, "cde: no free contexts, allocating temporary"); +static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g) +{ + struct gk20a_cde_ctx *cde_ctx; + int ret; cde_ctx = kzalloc(sizeof(*cde_ctx), GFP_KERNEL); if (!cde_ctx) @@ -622,12 +780,19 @@ static struct gk20a_cde_ctx *gk20a_cde_get_context(struct gk20a *g) cde_ctx->g = g; cde_ctx->pdev = g->dev; - ret = gk20a_cde_load(cde_ctx, true); + ret = gk20a_cde_load(cde_ctx); if (ret) { - gk20a_err(&g->dev->dev, "cde: cde load failed on temporary"); + kfree(cde_ctx); return ERR_PTR(ret); } + INIT_LIST_HEAD(&cde_ctx->list); + cde_ctx->is_temporary = false; + cde_ctx->in_use = false; + INIT_DELAYED_WORK(&cde_ctx->ctx_deleter_work, + gk20a_cde_ctx_deleter_fn); + + gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: allocated %p", cde_ctx); return cde_ctx; } @@ -653,8 +818,10 @@ int gk20a_cde_convert(struct gk20a *g, mutex_lock(&cde_app->mutex); cde_ctx = gk20a_cde_get_context(g); - if (IS_ERR(cde_ctx)) - return PTR_ERR(cde_ctx); + if (IS_ERR(cde_ctx)) { + err = PTR_ERR(cde_ctx); + goto exit_unlock; + } /* First, map the buffers to local va */ @@ -665,7 +832,7 @@ int gk20a_cde_convert(struct gk20a *g, /* map the destination buffer 
 	get_dma_buf(dst); /* a ref for gk20a_vm_map */
-	dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0,
+	dst_vaddr = gk20a_vm_map(cde_ctx->vm, dst, 0,
 				 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
 				 dst_kind, NULL, true,
 				 gk20a_mem_flag_none,
@@ -757,18 +924,17 @@ exit_unlock:
 
 	/* unmap the buffers - channel holds references to them now */
 	if (dst_vaddr)
-		gk20a_vm_unmap(g->cde_app.vm, dst_vaddr);
+		gk20a_vm_unmap(cde_ctx->vm, dst_vaddr);
 
 	mutex_unlock(&cde_app->mutex);
 
 	return err;
 }
 
-static void gk20a_free_ctx_cb(struct channel_gk20a *ch, void *data)
+static void gk20a_cde_finished_ctx_cb(struct channel_gk20a *ch, void *data)
 {
 	struct gk20a_cde_ctx *cde_ctx = data;
 	bool empty;
-	int err;
 
 	mutex_lock(&ch->jobs_lock);
 	empty = list_empty(&ch->jobs);
@@ -777,19 +943,17 @@ static void gk20a_free_ctx_cb(struct channel_gk20a *ch, void *data)
 	if (!empty)
 		return;
 
-	/* this should fail only when shutting down the whole device */
-	err = gk20a_busy(cde_ctx->pdev);
-	if (WARN(err, "gk20a cde: cannot set gk20a on, not freeing channel"
-		 ", leaking memory"))
-		return;
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: finished %p", cde_ctx);
 
-	gk20a_cde_remove(cde_ctx);
-	gk20a_idle(cde_ctx->pdev);
+	/* delete temporary contexts later */
+	if (cde_ctx->is_temporary)
+		schedule_delayed_work(&cde_ctx->ctx_deleter_work,
+			msecs_to_jiffies(CTX_DELETE_TIME));
 
-	kfree(cde_ctx);
+	gk20a_ctx_release(cde_ctx);
 }
 
-static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx, bool free_after_use)
+static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 {
 	struct gk20a *g = cde_ctx->g;
 	const struct firmware *img;
@@ -804,10 +968,8 @@ static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx, bool free_after_use)
 		return -ENOSYS;
 	}
 
-	if (free_after_use)
-		ch = gk20a_open_new_channel_with_cb(g, gk20a_free_ctx_cb, cde_ctx);
-	else
-		ch = gk20a_open_new_channel(g);
+	ch = gk20a_open_new_channel_with_cb(g, gk20a_cde_finished_ctx_cb,
+			cde_ctx);
 	if (!ch) {
 		gk20a_warn(&cde_ctx->pdev->dev, "cde: gk20a channel not available");
 		err = -ENOMEM;
@@ -876,8 +1038,7 @@ err_get_gk20a_channel:
 int gk20a_cde_reload(struct gk20a *g)
 {
 	struct gk20a_cde_app *cde_app = &g->cde_app;
-	struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx;
-	int err, i;
+	int err;
 
 	if (!cde_app->initialised)
 		return -ENOSYS;
@@ -887,10 +1048,12 @@ int gk20a_cde_reload(struct gk20a *g)
 		return err;
 
 	mutex_lock(&cde_app->mutex);
-	for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) {
-		gk20a_cde_remove(cde_ctx);
-		err = gk20a_cde_load(cde_ctx, false);
-	}
+
+	gk20a_cde_stop(g);
+
+	err = gk20a_cde_allocate_contexts(g);
+	if (!err)
+		cde_app->initialised = true;
 
 	mutex_unlock(&cde_app->mutex);
 
@@ -901,39 +1064,28 @@ int gk20a_cde_reload(struct gk20a *g)
 int gk20a_init_cde_support(struct gk20a *g)
 {
 	struct gk20a_cde_app *cde_app = &g->cde_app;
-	struct gk20a_cde_ctx *cde_ctx = cde_app->cde_ctx;
-	int ret, i;
+	int err;
 
 	if (cde_app->initialised)
 		return 0;
 
+	gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: init");
+
 	mutex_init(&cde_app->mutex);
 	mutex_lock(&cde_app->mutex);
 
-	for (i = 0; i < ARRAY_SIZE(cde_app->cde_ctx); i++, cde_ctx++) {
-		cde_ctx->g = g;
-		cde_ctx->pdev = g->dev;
-		ret = gk20a_cde_load(cde_ctx, false);
-		if (ret)
-			goto err_init_instance;
-	}
+	INIT_LIST_HEAD(&cde_app->cde_ctx_lru);
+	cde_app->lru_len = 0;
+	cde_app->lru_max_len = 0;
+	cde_app->lru_used = 0;
 
-	/* take shadow to the vm for general usage */
-	cde_app->vm = cde_app->cde_ctx->vm;
+	err = gk20a_cde_allocate_contexts(g);
+	if (!err)
+		cde_app->initialised = true;
 
-	cde_app->initialised = true;
 	mutex_unlock(&cde_app->mutex);
-
-	return 0;
-
-err_init_instance:
-
-	/* deinitialise initialised channels */
-	while (i--) {
-		gk20a_cde_remove(cde_ctx);
-		cde_ctx--;
-	}
-	return ret;
+
+	gk20a_dbg(gpu_dbg_cde_ctx, "cde: init finished: %d", err);
+	return err;
 }
 
 enum cde_launch_patch_offset {
diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
index e4d4659d..4120dc94 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.h
@@ -242,19 +242,26 @@ struct gk20a_cde_ctx {
 	struct kobj_attribute attr;
 
 	bool init_cmd_executed;
+
+	struct list_head list;
+	bool is_temporary;
+	bool in_use;
+	struct delayed_work ctx_deleter_work;
 };
 
 struct gk20a_cde_app {
 	bool initialised;
 	struct mutex mutex;
-	struct vm_gk20a *vm;
 
-	struct gk20a_cde_ctx cde_ctx[NUM_CDE_CONTEXTS];
+	struct list_head cde_ctx_lru;
+	int lru_len;
+	int lru_max_len;
+	int lru_used;
 
 	u32 shader_parameter;
 };
 
-int gk20a_cde_destroy(struct gk20a *g);
+void gk20a_cde_destroy(struct gk20a *g);
 int gk20a_init_cde_support(struct gk20a *g);
 int gk20a_cde_reload(struct gk20a *g);
 int gk20a_cde_convert(struct gk20a *g, struct dma_buf *dst,
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index 309a1b08..825cb886 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -535,6 +535,7 @@ enum gk20a_dbg_categories {
 	gpu_dbg_map = BIT(8), /* mem mappings */
 	gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */
 	gpu_dbg_cde = BIT(10), /* cde info messages */
+	gpu_dbg_cde_ctx = BIT(11), /* cde context usage messages */
 	gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */
 };
 
--
cgit v1.2.2
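
A note on the list discipline, which is easy to miss in the diff: the patch keeps
the invariant that free contexts sit at the head of cde_ctx_lru and busy ones at
the tail, so inspecting in_use on list_first_entry() is enough to decide whether
any free context exists. The stand-alone sketch below models that scheme under
stated assumptions; it is not the driver code, and the names (struct ctx,
ctx_pool, ctx_get, ctx_put) are hypothetical.

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>

/* Minimal model of the LRU context pool (illustrative names). */
struct ctx {
	struct list_head node;
	bool in_use;
	bool is_temporary;
};

struct ctx_pool {
	struct mutex lock;
	struct list_head lru;	/* free contexts first, busy contexts last */
	int len, used;
};

static void ctx_pool_init(struct ctx_pool *pool)
{
	mutex_init(&pool->lock);
	INIT_LIST_HEAD(&pool->lru);
	pool->len = 0;
	pool->used = 0;
}

/* Take a context. Assumes the pool holds at least one permanent context,
 * as the patch guarantees with NUM_CDE_CONTEXTS preallocated entries.
 * If the head of the list is busy, every context is busy, so fall back to
 * allocating a temporary one, like gk20a_cde_get_context() does. */
static struct ctx *ctx_get(struct ctx_pool *pool)
{
	struct ctx *c;

	mutex_lock(&pool->lock);
	c = list_first_entry(&pool->lru, struct ctx, node);
	if (!c->in_use) {
		c->in_use = true;
		/* rotate to the tail; remaining free ones stay in front */
		list_move_tail(&c->node, &pool->lru);
		pool->used++;
		mutex_unlock(&pool->lock);
		return c;
	}
	mutex_unlock(&pool->lock);

	/* all contexts busy: allocate a temporary and queue it as busy */
	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (!c)
		return NULL;
	c->in_use = true;
	c->is_temporary = true;

	mutex_lock(&pool->lock);
	list_add_tail(&c->node, &pool->lru);
	pool->len++;
	pool->used++;
	mutex_unlock(&pool->lock);
	return c;
}

/* Release a context: move it to the head so it is found first next time. */
static void ctx_put(struct ctx_pool *pool, struct ctx *c)
{
	mutex_lock(&pool->lock);
	c->in_use = false;
	list_move(&c->node, &pool->lru);
	pool->used--;
	mutex_unlock(&pool->lock);
}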
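
The deferred-deletion half can be summarized the same way: when a channel's job
list drains, gk20a_cde_finished_ctx_cb() arms ctx_deleter_work instead of freeing
in place, and teardown must cancel that work in a loop because the worker can
re-arm itself (gk20a_cde_ctx_deleter_fn() reschedules when gk20a_busy() fails).
A minimal sketch of that pattern, again with hypothetical names and with the
app-mutex locking of the patch elided:

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

#define TEMP_CTX_DELETE_MS 1000	/* mirrors CTX_DELETE_TIME in the patch */

struct temp_ctx {
	struct delayed_work deleter;
	bool in_use;
};

static void temp_ctx_deleter_fn(struct work_struct *work)
{
	struct temp_ctx *c = container_of(to_delayed_work(work),
					  struct temp_ctx, deleter);

	/* raced with a new user: keep the context; the next idle period
	 * arms the work again (the patch re-checks this under the mutex) */
	if (c->in_use)
		return;

	/* a failing precondition may re-arm the work instead of freeing,
	 * like the gk20a_busy() retry in the patch:
	 *   schedule_delayed_work(&c->deleter,
	 *			   msecs_to_jiffies(TEMP_CTX_DELETE_MS));
	 */
	kfree(c);
}

/* called when the context goes idle, e.g. from a job-finished callback */
static void temp_ctx_mark_idle(struct temp_ctx *c)
{
	c->in_use = false;
	schedule_delayed_work(&c->deleter,
			      msecs_to_jiffies(TEMP_CTX_DELETE_MS));
}

/* teardown: a work item that may re-arm itself needs cancelling in a loop */
static void temp_ctx_stop(struct temp_ctx *c)
{
	do {
		cancel_delayed_work_sync(&c->deleter);
	} while (delayed_work_pending(&c->deleter));
}

With INIT_DELAYED_WORK(&c->deleter, temp_ctx_deleter_fn) done at allocation time,
this is the whole lifecycle a temporary context goes through in the patch.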