From 35b2507fe3ec6c28c27dd7fb289c003c7a0baf33 Mon Sep 17 00:00:00 2001
From: Sachit Kadle <skadle@nvidia.com>
Date: Thu, 1 Sep 2016 20:50:06 -0700
Subject: gpu: nvgpu: implement lockless allocator

Implement a lockless allocator for fixed-size data
structures.

Bug 1795076

Change-Id: I70a5f52cbdb4452cc0fd9a8edf26735be29ede57
Signed-off-by: Sachit Kadle <skadle@nvidia.com>
Reviewed-on: http://git-master/r/1213211
(cherry picked from commit e4bff7da0f39c8f4b5691169c02e482bc9d4166e)
Reviewed-on: http://git-master/r/1223246
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
---
 drivers/gpu/nvgpu/Makefile.nvgpu                   |   1 +
 drivers/gpu/nvgpu/gk20a/gk20a_allocator.h          |   9 +
 drivers/gpu/nvgpu/gk20a/gk20a_allocator_lockless.c | 204 +++++++++++++++++++++
 drivers/gpu/nvgpu/gk20a/lockless_allocator_priv.h  | 121 ++++++++++++
 4 files changed, 335 insertions(+)
 create mode 100644 drivers/gpu/nvgpu/gk20a/gk20a_allocator_lockless.c
 create mode 100644 drivers/gpu/nvgpu/gk20a/lockless_allocator_priv.h

(limited to 'drivers/gpu/nvgpu')

diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu
index b8e38919..33d158bb 100644
--- a/drivers/gpu/nvgpu/Makefile.nvgpu
+++ b/drivers/gpu/nvgpu/Makefile.nvgpu
@@ -57,6 +57,7 @@ nvgpu-y := \
 	gk20a/gk20a_allocator_bitmap.o \
 	gk20a/gk20a_allocator_buddy.o \
 	gk20a/gk20a_allocator_page.o \
+	gk20a/gk20a_allocator_lockless.o \
 	gk20a/cde_gk20a.o \
 	gk20a/platform_gk20a_generic.o \
 	gk20a/tsg_gk20a.o \
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
index 93865190..0d611ba3 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
@@ -165,6 +165,15 @@ int gk20a_page_allocator_init(struct gk20a_allocator *__a,
 			      const char *name, u64 base, u64 length,
 			      u64 blk_size, u64 flags);
 
+/*
+ * Lockless allocatior initializers.
+ * Note: This allocator can only allocate fixed-size structures of a
+ * pre-defined size.
+ */
+int gk20a_lockless_allocator_init(struct gk20a_allocator *__a,
+				  const char *name, u64 base, u64 length,
+				  u64 struct_size, u64 flags);
+
 #define GPU_BALLOC_MAX_ORDER		31
 
 /*
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator_lockless.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator_lockless.c
new file mode 100644
index 00000000..32455c98
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator_lockless.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/atomic.h>
+
+#include "gk20a_allocator.h"
+#include "lockless_allocator_priv.h"
+
+static u64 gk20a_lockless_alloc_length(struct gk20a_allocator *a)
+{
+	struct gk20a_lockless_allocator *pa = a->priv;
+
+	return pa->length;
+}
+
+static u64 gk20a_lockless_alloc_base(struct gk20a_allocator *a)
+{
+	struct gk20a_lockless_allocator *pa = a->priv;
+
+	return pa->base;
+}
+
+static int gk20a_lockless_alloc_inited(struct gk20a_allocator *a)
+{
+	struct gk20a_lockless_allocator *pa = a->priv;
+	int inited = pa->inited;
+
+	rmb();
+	return inited;
+}
+
+static u64 gk20a_lockless_alloc_end(struct gk20a_allocator *a)
+{
+	struct gk20a_lockless_allocator *pa = a->priv;
+
+	return pa->base + pa->length;
+}
+
+static u64 gk20a_lockless_alloc(struct gk20a_allocator *a, u64 len)
+{
+	struct gk20a_lockless_allocator *pa = a->priv;
+	int head, new_head, ret;
+	u64 addr = 0;
+
+	if (len != pa->blk_size)
+		return 0;
+
+	head = ACCESS_ONCE(pa->head);
+	while (head >= 0) {
+		new_head = ACCESS_ONCE(pa->next[head]);
+		ret = cmpxchg(&pa->head, head, new_head);
+		if (ret == head) {
+			addr = pa->base + head * pa->blk_size;
+			atomic_inc(&pa->nr_allocs);
+			alloc_dbg(a, "Alloc node # %d @ addr 0x%llx\n", head,
+				  addr);
+			break;
+		}
+		head = ACCESS_ONCE(pa->head);
+	}
+	return addr;
+}
+
+static void gk20a_lockless_free(struct gk20a_allocator *a, u64 addr)
+{
+	struct gk20a_lockless_allocator *pa = a->priv;
+	int head, ret;
+	u64 cur_idx, rem;
+
+	cur_idx = addr - pa->base;
+	rem = do_div(cur_idx, pa->blk_size);
+
+	while (1) {
+		head = ACCESS_ONCE(pa->head);
+		ACCESS_ONCE(pa->next[cur_idx]) = head;
+		ret = cmpxchg(&pa->head, head, cur_idx);
+		if (ret == head) {
+			atomic_dec(&pa->nr_allocs);
+			alloc_dbg(a, "Free node # %llu\n", cur_idx);
+			break;
+		}
+	}
+}
+
+static void gk20a_lockless_alloc_destroy(struct gk20a_allocator *a)
+{
+	struct gk20a_lockless_allocator *pa = a->priv;
+
+	vfree(pa->next);
+	kfree(pa);
+}
+
+static void gk20a_lockless_print_stats(struct gk20a_allocator *a,
+				   struct seq_file *s, int lock)
+{
+	struct gk20a_lockless_allocator *pa = a->priv;
+
+	__alloc_pstat(s, a, "Lockless allocator params:\n");
+	__alloc_pstat(s, a, "  start = 0x%llx\n", pa->base);
+	__alloc_pstat(s, a, "  end   = 0x%llx\n", pa->base + pa->length);
+
+	/* Actual stats. */
+	__alloc_pstat(s, a, "Stats:\n");
+	__alloc_pstat(s, a, "  Number allocs = %d\n",
+		      atomic_read(&pa->nr_allocs));
+	__alloc_pstat(s, a, "  Number free   = %d\n",
+		      pa->nr_nodes - atomic_read(&pa->nr_allocs));
+}
+
+static const struct gk20a_allocator_ops pool_ops = {
+	.alloc		= gk20a_lockless_alloc,
+	.free		= gk20a_lockless_free,
+
+	.base		= gk20a_lockless_alloc_base,
+	.length		= gk20a_lockless_alloc_length,
+	.end		= gk20a_lockless_alloc_end,
+	.inited		= gk20a_lockless_alloc_inited,
+
+	.fini		= gk20a_lockless_alloc_destroy,
+
+	.print_stats	= gk20a_lockless_print_stats,
+};
+
+int gk20a_lockless_allocator_init(struct gk20a_allocator *__a,
+			      const char *name, u64 base, u64 length,
+			      u64 blk_size, u64 flags)
+{
+	int i;
+	int err;
+	int nr_nodes;
+	u64 count, rem;
+	struct gk20a_lockless_allocator *a;
+
+	if (!blk_size)
+		return -EINVAL;
+
+	/*
+	 * Ensure we have space for atleast one node & there's no overflow.
+	 * In order to control memory footprint, we require count < INT_MAX
+	 */
+	count = length;
+	rem = do_div(count, blk_size);
+	if (!base || !count || count > INT_MAX)
+		return -EINVAL;
+
+	a = kzalloc(sizeof(struct gk20a_lockless_allocator), GFP_KERNEL);
+	if (!a)
+		return -ENOMEM;
+
+	err = __gk20a_alloc_common_init(__a, name, a, false, &pool_ops);
+	if (err)
+		goto fail;
+
+	a->next = vzalloc(sizeof(*a->next) * count);
+	if (!a->next) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	/* chain the elements together to form the initial free list  */
+	nr_nodes = (int)count;
+	for (i = 0; i < nr_nodes; i++)
+		a->next[i] = i + 1;
+	a->next[nr_nodes - 1] = -1;
+
+	a->base = base;
+	a->length = length;
+	a->blk_size = blk_size;
+	a->nr_nodes = nr_nodes;
+	a->flags = flags;
+	atomic_set(&a->nr_allocs, 0);
+
+	wmb();
+	a->inited = true;
+
+	gk20a_init_alloc_debug(__a);
+	alloc_dbg(__a, "New allocator: type          lockless\n");
+	alloc_dbg(__a, "               base          0x%llx\n", a->base);
+	alloc_dbg(__a, "               nodes         %d\n", a->nr_nodes);
+	alloc_dbg(__a, "               blk_size      0x%llx\n", a->blk_size);
+	alloc_dbg(__a, "               flags         0x%llx\n", a->flags);
+
+	return 0;
+
+fail:
+	kfree(a);
+	return err;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/lockless_allocator_priv.h b/drivers/gpu/nvgpu/gk20a/lockless_allocator_priv.h
new file mode 100644
index 00000000..f9b03e0e
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/lockless_allocator_priv.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Basics:
+ *
+ *    - Lockless memory allocator for fixed-size structures, whose
+ *      size is defined up front at init time.
+ *    - Memory footprint scales linearly w/ the number of structures in
+ *      the pool. It is ~= sizeof(int) * N.
+ *    - Memory is pre-allocated by the client. The allocator itself
+ *      only computes the addresses for allocations.
+ *    - Limit of MAX_INT nodes that the allocator can be responsible for.
+ *
+ * Implementation details:
+ *
+ *    The allocator maintains a single list of free nodes. We allocate &
+ *    free nodes from the head of the list. We rely on the cmpxchg() operator
+ *    to maintain atomicity on the head.
+ *
+ *    So, both allocs & frees are O(1)!!
+ *
+ *    -- Definitions --
+ *    Block Size - size of a single structure that this allocator will
+ *                 allocate.
+ *    Node       - one of the elements of size blk_size in the
+ *                 client-allocated buffer.
+ *    Node Index - zero-based index of a node in the client-allocated
+ *                 contiguous buffer.
+ *
+ *    -- Initial State --
+ *    We maintain the following to track the state of the free list:
+ *
+ *    1) A "head" index to track the index of the first free node in the list
+ *    2) A "next" array to track the index of the next free node in the list
+ *       for every node. So next[head], will give the index to the 2nd free
+ *       element in the list.
+ *
+ *    So, to begin with, the free list consists of all node indices, and each
+ *    position in the next array contains index N + 1:
+ *
+ *    head = 0
+ *    next = [1, 2, 3, 4, -1] : Example for a user-allocated buffer of 5 nodes
+ *    free_list = 0->1->2->3->4->-1
+ *
+ *    -- Allocations --
+ *    1) Read the current head (aka acq_head)
+ *    2) Read next[acq_head], to get the 2nd free element (aka new_head)
+ *    3) cmp_xchg(&head, acq_head, new_head)
+ *    4) If it succeeds, compute the address of the node, based on
+ *       base address, blk_size, & acq_head.
+ *
+ *    head = 1;
+ *    next = [1, 2, 3, 4, -1] : Example after allocating Node #0
+ *    free_list = 1->2->3->4->-1
+ *
+ *    head = 2;
+ *    next = [1, 2, 3, 4, -1] : Example after allocating Node #1
+ *    free_list = 2->3->4->-1
+ *
+ *    -- Frees --
+ *    1) Based on the address to be freed, calculate the index of the node
+ *       being freed (cur_idx)
+ *    2) Read the current head (old_head)
+ *    3) So the freed node is going to go at the head of the list, and we
+ *       want to put the old_head after it. So next[cur_idx] = old_head
+ *    4) cmpxchg(head, old_head, cur_idx)
+ *
+ *    head = 0
+ *    next = [2, 2, 3, 4, -1]
+ *    free_list = 0->2->3->4->-1 : Example after freeing Node #0
+ *
+ *    head = 1
+ *    next = [2, 0, 3, 4, -1]
+ *    free_list = 1->0->2->3->4->-1 : Example after freeing Node #1
+ */
+
+#ifndef LOCKLESS_ALLOCATOR_PRIV_H
+#define LOCKLESS_ALLOCATOR_PRIV_H
+
+struct gk20a_allocator;
+
+struct gk20a_lockless_allocator {
+	struct gk20a_allocator *owner;
+
+	u64 base;		/* Base address of the space. */
+	u64 length;		/* Length of the space. */
+	u64 blk_size;		/* Size of the structure being allocated */
+	int nr_nodes;		/* Number of nodes available for allocation */
+
+	int *next;		/* An array holding the next indices per node */
+	int head;		/* Current node at the top of the stack */
+
+	u64 flags;
+
+	bool inited;
+
+	/* Statistics */
+	atomic_t nr_allocs;
+};
+
+static inline struct gk20a_lockless_allocator *lockless_allocator(
+	struct gk20a_allocator *a)
+{
+	return (struct gk20a_lockless_allocator *)(a)->priv;
+}
+
+#endif
-- 
cgit v1.2.2