author		Alex Waterman <alexw@nvidia.com>	2016-04-27 15:27:36 -0400
committer	Terje Bergstrom <tbergstrom@nvidia.com>	2016-06-28 18:49:11 -0400
commit		dfd5ec53fcce4ebae27f78242e6b788350337095 (patch)
tree		073ea380b9ee4734391d381745f57600c3525be5 /drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
parent		b30990ea6db564e885d5aee7a1a5ea87a1e5e8ee (diff)
gpu: nvgpu: Revamp semaphore support
Revamp the nvgpu driver's semaphore support.
The original problem with nvgpu's semaphore support was that it
required a SW-based wait for every semaphore release. This was because
gk20a_channel_semaphore_wait_fd() created a new semaphore for every
fence it waited on; that semaphore was then released by SW when the
fence signaled. As a result, every release necessarily involved a
sync_fence_wait_async() call, which could block. The latency of this
SW wait was enough to cause massive degradation in performance.
To fix this, a fast path was implemented: when a fence passed to
gk20a_channel_semaphore_wait_fd() is backed by a GPU semaphore, a
semaphore acquire is used directly to block the GPU. No
sync_fence_wait_async() call is performed and no extra semaphore is
created.
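
In outline the fast path reduces to something like the sketch below.
Only the gk20a_semaphore_*() helpers come from this patch;
emit_sema_acquire() is a hypothetical stand-in for the channel's
pushbuffer helper that encodes the actual ACQUIRE methods:

    /* Sketch only: emit_sema_acquire() is illustrative, not a real
     * function added by this patch. */
    static void fast_path_wait(struct channel_gk20a *c,
                               struct gk20a_semaphore *s)
    {
        u64 va = gk20a_semaphore_gpu_ro_va(s);      /* global RO alias */
        u32 payload = gk20a_semaphore_get_value(s); /* value to wait for */

        /* The GPU stalls until the releasing channel writes a value
         * >= payload at va; no SW wait and no extra semaphore. */
        emit_sema_acquire(c, va, payload);
    }
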
To implement this fast path the semaphore memory had to be shared
between channels. Previously, since a new semaphore was created on
every pass through gk20a_channel_semaphore_wait_fd(), the address
space a semaphore was mapped into was irrelevant. With the fast path,
however, a semaphore may be released in one address space but acquired
in another.
Sharing the semaphore memory was done by making a fixed GPU mapping in
all channels. This mapping points to the semaphore memory (the
so-called semaphore sea). The global fixed mapping is read-only to
make sure no semaphores can be incremented (i.e. released) by a
malicious channel. Each channel then gets a RW mapping of its own
semaphore. This way a channel may only acquire other channels'
semaphores but may both acquire and release its own semaphore.
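
The new header encodes this split directly: the owner releases through
its private RW mapping while any waiter acquires through the global RO
alias. A two-line sketch using only the helpers declared in this patch:

    /* Owner (releasing) channel: private RW mapping of its own page. */
    u64 release_va = gk20a_semaphore_gpu_rw_va(s);

    /* Any waiting channel: global RO mapping, valid in every VM. */
    u64 acquire_va = gk20a_semaphore_gpu_ro_va(s);
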
The gk20a fence code was updated to allow introspection of GPU-backed
fences. This makes it possible to detect when the fast path can be
taken. If the fast path cannot be used (for example, when a fence is
sync-pt backed) the original slow path is still present; it is used
when the GPU needs to wait on an event from something which only
understands sync-pts.
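
Conceptually the wait path now branches as in the sketch below, where
semaphore_from_fence() is a hypothetical name for the new fence
introspection (the real accessor lives in the gk20a fence code, not in
this header):

    struct gk20a_semaphore *s = semaphore_from_fence(f);

    if (s)  /* GPU-backed: block in HW with a semaphore acquire. */
        fast_path_wait(c, s);
    else    /* sync-pt backed: fall back to the SW slow path. */
        slow_path_wait(c, f);
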
Bug 1732449
JIRA DNVGPU-12
Change-Id: Ic0fea74994da5819a771deac726bb0d47a33c2de
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/1133792
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h')
 drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h | 303 +++++++++++++++++------
 1 file changed, 250 insertions(+), 53 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
index 1f12e262..58081b56 100644
--- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
@@ -15,17 +15,128 @@
 #define SEMAPHORE_GK20A_H
 
 #include <linux/kref.h>
-#include "gk20a_allocator.h"
+#include <linux/list.h>
+#include <linux/delay.h>
+
+#include "gk20a.h"
 #include "mm_gk20a.h"
+#include "channel_gk20a.h"
+
+/*
+ * Max number of channels that can be used is 512. This of course needs to be
+ * fixed to be dynamic but still fast.
+ */
+#define SEMAPHORE_POOL_COUNT 512
+#define SEMAPHORE_SIZE 16
+#define SEMAPHORE_SEA_GROWTH_RATE 32
+
+struct gk20a_semaphore_sea;
+
+/*
+ * Underlying semaphore data structure. This semaphore can be shared amongst
+ * other semaphore instances.
+ */
+struct gk20a_semaphore_int {
+	int idx; /* Semaphore index. */
+	u32 offset; /* Offset into the pool. */
+	atomic_t next_value; /* Next available value. */
+	u32 *value; /* Current value (access w/ readl()). */
+	u32 nr_incrs; /* Number of increments programmed. */
+	struct gk20a_semaphore_pool *p; /* Pool that owns this sema. */
+	struct channel_gk20a *ch; /* Channel that owns this sema. */
+	struct list_head hw_sema_list; /* List of HW semaphores. */
+};
+
+/*
+ * A semaphore which the rest of the driver actually uses. This consists of a
+ * pointer to a real semaphore and a value to wait for. This allows one physical
+ * semaphore to be shared among an essentially infinite number of submits.
+ */
+struct gk20a_semaphore {
+	struct gk20a_semaphore_int *hw_sema;
 
-/* A memory pool for holding semaphores. */
+	atomic_t value;
+	int incremented;
+
+	struct kref ref;
+};
+
+/*
+ * A semaphore pool. Each address space will own exactly one of these.
+ */
 struct gk20a_semaphore_pool {
-	struct mem_desc mem;
-	struct gk20a *g;
-	struct list_head maps;
-	struct mutex maps_mutex;
+	struct page *page; /* This pool's page of memory */
+	struct list_head pool_list_entry; /* Node for list of pools. */
+	void *cpu_va; /* CPU access to the pool. */
+	u64 gpu_va; /* GPU access to the pool. */
+	u64 gpu_va_ro; /* GPU access to the pool. */
+	int page_idx; /* Index into sea bitmap. */
+
+	struct list_head hw_semas; /* List of HW semas. */
+	DECLARE_BITMAP(semas_alloced, PAGE_SIZE / SEMAPHORE_SIZE);
+
+	struct gk20a_semaphore_sea *sema_sea; /* Sea that owns this pool. */
+
+	struct mutex pool_lock;
+
+	/*
+	 * This is the address spaces's personal RW table. Other channels will
+	 * ultimately map this page as RO.
+	 */
+	struct sg_table *rw_sg_table;
+
+	/*
+	 * This is to keep track of whether the pool has had its sg_table
+	 * updated during sea resizing.
+	 */
+	struct sg_table *ro_sg_table;
+
+	int mapped;
+
+	/*
+	 * Sometimes a channel can be released before other channels are
+	 * done waiting on it. This ref count ensures that the pool doesn't
+	 * go away until all semaphores using this pool are cleaned up first.
+	 */
 	struct kref ref;
-	struct gk20a_allocator alloc;
+};
+
+/*
+ * A sea of semaphores pools. Each pool is owned by a single VM. Since multiple
+ * channels can share a VM each channel gets it's own HW semaphore from the
+ * pool. Channels then allocate regular semaphores - basically just a value that
+ * signifies when a particular job is done.
+ */
+struct gk20a_semaphore_sea {
+	struct list_head pool_list; /* List of pools in this sea. */
+	struct gk20a *gk20a;
+
+	size_t size; /* Number of pages available. */
+	u64 gpu_va; /* GPU virtual address of sema sea. */
+	u64 map_size; /* Size of the mapping. */
+
+	/*
+	 * TODO:
+	 * List of pages that we use to back the pools. The number of pages
+	 * can grow dynamically since allocating 512 pages for all channels at
+	 * once would be a tremendous waste.
+	 */
+	int page_count; /* Pages allocated to pools. */
+
+	struct sg_table *ro_sg_table;
+	/*
+	struct page *pages[SEMAPHORE_POOL_COUNT];
+	*/
+
+	struct mem_desc sea_mem;
+
+	/*
+	 * Can't use a regular allocator here since the full range of pools are
+	 * not always allocated. Instead just use a bitmap.
+	 */
+	DECLARE_BITMAP(pools_alloced, SEMAPHORE_POOL_COUNT);
+
+	struct mutex sea_lock; /* Lock alloc/free calls. */
 };
 
 enum gk20a_mem_rw_flag {
@@ -34,64 +145,150 @@ enum gk20a_mem_rw_flag {
 	gk20a_mem_flag_write_only = 2,
 };
 
-/* A semaphore pool can be mapped to multiple GPU address spaces. */
-struct gk20a_semaphore_pool_map {
-	u64 gpu_va;
-	enum gk20a_mem_rw_flag rw_flag;
-	struct vm_gk20a *vm;
-	struct list_head list;
-};
+/*
+ * Semaphore sea functions.
+ */
+struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *gk20a);
+int gk20a_semaphore_sea_map(struct gk20a_semaphore_pool *sea,
+			    struct vm_gk20a *vm);
+void gk20a_semaphore_sea_unmap(struct gk20a_semaphore_pool *sea,
+			       struct vm_gk20a *vm);
+struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g);
+
+/*
+ * Semaphore pool functions.
+ */
+struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(
+		struct gk20a_semaphore_sea *sea);
+int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *pool,
+			     struct vm_gk20a *vm);
+void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *pool,
+				struct vm_gk20a *vm);
+u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global);
+void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p);
+void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p);
+
+/*
+ * Semaphore functions.
+ */
+struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch);
+void gk20a_semaphore_put(struct gk20a_semaphore *s);
+void gk20a_semaphore_get(struct gk20a_semaphore *s);
+
+/*
+ * Return the address of a specific semaphore.
+ *
+ * Don't call this on a semaphore you don't own - the VA returned will make no
+ * sense in your specific channel's VM.
+ */
+static inline u64 gk20a_semaphore_gpu_rw_va(struct gk20a_semaphore *s)
+{
+	return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, false) +
+		s->hw_sema->offset;
+}
+
+/*
+ * Get the global RO address for the semaphore. Can be called on any semaphore
+ * regardless of whether you own it.
+ */
+static inline u64 gk20a_semaphore_gpu_ro_va(struct gk20a_semaphore *s)
+{
+	return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, true) +
+		s->hw_sema->offset;
+}
+
+static inline u64 gk20a_hw_sema_addr(struct gk20a_semaphore_int *hw_sema)
+{
+	return __gk20a_semaphore_pool_gpu_va(hw_sema->p, true) +
+		hw_sema->offset;
+}
+
+/*
+ * TODO: handle wrap around... Hmm, how to do this?
+ */
+static inline bool gk20a_semaphore_is_released(struct gk20a_semaphore *s)
+{
+	u32 sema_val = readl(s->hw_sema->value);
 
-/* A semaphore that lives inside a semaphore pool. */
-struct gk20a_semaphore {
-	struct gk20a_semaphore_pool *pool;
 	/*
-	 * value exists within the pool's memory at the specified offset.
-	 * 0=acquired, 1=released.
+	 * If the underlying semaphore value is greater than or equal to
+	 * the value of the semaphore then the semaphore has been signaled
+	 * (a.k.a. released).
 	 */
-	u32 offset; /* byte offset within pool */
-	struct kref ref;
-};
+	return sema_val >= atomic_read(&s->value);
+}
 
-/* Create a semaphore pool that can hold at most 'capacity' semaphores. */
-struct gk20a_semaphore_pool *
-gk20a_semaphore_pool_alloc(struct gk20a *, const char *unique_name,
-			   size_t capacity);
-void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *);
-int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *,
-			     struct vm_gk20a *,
-			     enum gk20a_mem_rw_flag);
-void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *,
-				struct vm_gk20a *);
-u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *,
-				struct vm_gk20a *);
-
-/* Allocate a semaphore from the semaphore pool. The newly allocated
- * semaphore will be in acquired state (value=0). */
-struct gk20a_semaphore *
-gk20a_semaphore_alloc(struct gk20a_semaphore_pool *);
-void gk20a_semaphore_put(struct gk20a_semaphore *);
-void gk20a_semaphore_get(struct gk20a_semaphore *);
-
-static inline u64 gk20a_semaphore_gpu_va(struct gk20a_semaphore *s,
-					 struct vm_gk20a *vm)
+static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s)
 {
-	return gk20a_semaphore_pool_gpu_va(s->pool, vm) + s->offset;
+	return !gk20a_semaphore_is_released(s);
 }
 
-static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s)
+/*
+ * Read the underlying value from a semaphore.
+ */
+static inline u32 gk20a_semaphore_read(struct gk20a_semaphore *s)
 {
-	u32 v = gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset);
+	return readl(s->hw_sema->value);
+}
 
-	/* When often block on value reaching a certain threshold. We must make
-	 * sure that if we get unblocked, we haven't read anything too early. */
-	smp_rmb();
-	return v == 0;
+static inline u32 gk20a_semaphore_get_value(struct gk20a_semaphore *s)
+{
+	return atomic_read(&s->value);
 }
 
+static inline u32 gk20a_semaphore_next_value(struct gk20a_semaphore *s)
+{
+	return atomic_read(&s->hw_sema->next_value);
+}
+
+/*
+ * Note - if you call this then any prior semaphores will also be released.
+ */
 static inline void gk20a_semaphore_release(struct gk20a_semaphore *s)
 {
-	smp_wmb();
-	gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 1);
+	u32 current_val;
+	u32 val = gk20a_semaphore_get_value(s);
+	int attempts = 0;
+
+	/*
+	 * Wait until the sema value is 1 less than the write value. That
+	 * way this function is essentially an increment.
+	 *
+	 * TODO: tune the wait a little better.
+	 */
+	while ((current_val = gk20a_semaphore_read(s)) < (val - 1)) {
+		msleep(100);
+		attempts += 1;
+		if (attempts > 100) {
+			WARN(1, "Stall on sema release!");
+			return;
+		}
+	}
+
+	/*
+	 * If the semaphore has already passed the value we would write then
+	 * this is really just a NO-OP.
+	 */
+	if (current_val >= val)
+		return;
+
+	writel(val, s->hw_sema->value);
+}
+
+/*
+ * Configure a software based increment on this semaphore. This is useful for
+ * when we want the GPU to wait on a SW event before processing a channel.
+ * Another way to describe this is when the GPU needs to wait on a SW pre-fence.
+ * The pre-fence signals SW which in turn calls gk20a_semaphore_release() which
+ * then allows the GPU to continue.
+ *
+ * Also used to prep a semaphore for an INCR by the GPU.
+ */
+static inline void gk20a_semaphore_incr(struct gk20a_semaphore *s)
+{
+	BUG_ON(s->incremented);
+
+	atomic_set(&s->value, atomic_add_return(1, &s->hw_sema->next_value));
+	s->incremented = 1;
 }
 #endif
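
For orientation, a minimal lifecycle of one of these semaphores - a
sketch using only the functions declared above plus standard kernel
helpers, with error handling omitted:

    struct gk20a_semaphore *s;

    s = gk20a_semaphore_alloc(ch);  /* backed by ch's pool page */
    gk20a_semaphore_incr(s);        /* pick the value this job signals */

    /* The GPU (or SW, via gk20a_semaphore_release()) eventually writes
     * the value; completion can then be polled without a SW wait: */
    while (!gk20a_semaphore_is_released(s))
        usleep_range(10, 20);

    gk20a_semaphore_put(s);         /* drop the reference when done */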