author		Alex Waterman <alexw@nvidia.com>	2016-04-27 15:27:36 -0400
committer	Terje Bergstrom <tbergstrom@nvidia.com>	2016-06-28 18:49:11 -0400
commit		dfd5ec53fcce4ebae27f78242e6b788350337095 (patch)
tree		073ea380b9ee4734391d381745f57600c3525be5 /drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
parent		b30990ea6db564e885d5aee7a1a5ea87a1e5e8ee (diff)
gpu: nvgpu: Revamp semaphore support
Revamp the nvgpu driver's semaphore support. The original problem with nvgpu's
semaphore support was that it required a SW-based wait for every semaphore
release: for every fence that gk20a_channel_semaphore_wait_fd() waited on, a
new semaphore was created, and that semaphore was then released by SW when the
fence signaled. This meant that every release necessarily involved a
sync_fence_wait_async() call, which could block. The latency of this SW wait
was enough to cause massive degradation in performance.

To fix this, a fast path was implemented. When a fence passed to
gk20a_channel_semaphore_wait_fd() is backed by a GPU semaphore, a semaphore
acquire is used directly to block the GPU. No sync_fence_wait_async() is
performed and no extra semaphore is created.

To implement this fast path the semaphore memory had to be shared between
channels. Previously, since a new semaphore was created every time through
gk20a_channel_semaphore_wait_fd(), which address space a semaphore was mapped
into was irrelevant. When using the fast path, however, a semaphore may be
released in one address space but acquired in another.

Sharing the semaphore memory was done by making a fixed GPU mapping in all
channels. This mapping points to the semaphore memory (the so-called semaphore
sea). The global fixed mapping is read-only to make sure no semaphores can be
incremented (i.e. released) by a malicious channel. Each channel then gets a
RW mapping of its own semaphore. This way a channel may only acquire other
channels' semaphores but may both acquire and release its own semaphore.

The gk20a fence code was updated to allow introspection of GPU-backed fences.
This allows detection of when the fast path can be taken. If the fast path
cannot be used (for example when a fence is sync-pt backed), the original slow
path is still present. It is used when the GPU needs to wait on an event from
something which only understands sync-pts.

Bug 1732449
JIRA DNVGPU-12

Change-Id: Ic0fea74994da5819a771deac726bb0d47a33c2de
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/1133792
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
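For illustration only, the fast-path decision described above boils down to
something like the sketch below. This is a minimal, hypothetical sketch and not
the actual gk20a_channel_semaphore_wait_fd() implementation:
fence_to_gpu_semaphore(), emit_sema_acquire() and sw_wait_then_release() are
invented placeholder helpers standing in for the real fence-introspection and
pushbuffer routines, while the gk20a_semaphore_*() calls are the ones declared
in the header below.

/*
 * Illustrative sketch of the wait-path selection (hypothetical helpers,
 * not the actual nvgpu code).
 */
static int wait_fd_sketch(struct channel_gk20a *c, struct sync_fence *f)
{
	/* Introspect the fence: is it backed by a GPU semaphore? */
	struct gk20a_semaphore *sema = fence_to_gpu_semaphore(f); /* hypothetical */

	if (sema) {
		/*
		 * Fast path: program a semaphore ACQUIRE against the global
		 * read-only mapping of the releasing channel's semaphore.
		 * The GPU blocks until that channel releases it; no SW wait
		 * (sync_fence_wait_async()) is needed.
		 */
		return emit_sema_acquire(c, gk20a_semaphore_gpu_ro_va(sema),
					 gk20a_semaphore_get_value(sema)); /* hypothetical */
	}

	/*
	 * Slow path: the fence is only understood by SW (e.g. sync-pt
	 * backed). Allocate a semaphore owned by this channel, have the GPU
	 * acquire it, and release it from SW once the fence signals.
	 */
	sema = gk20a_semaphore_alloc(c);
	if (!sema)
		return -ENOMEM;
	gk20a_semaphore_incr(sema);
	emit_sema_acquire(c, gk20a_semaphore_gpu_rw_va(sema),
			  gk20a_semaphore_get_value(sema));	/* hypothetical */
	return sw_wait_then_release(f, sema);			/* hypothetical */
}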
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h')
-rw-r--r--	drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h	303
1 file changed, 250 insertions(+), 53 deletions(-)
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
index 1f12e262..58081b56 100644
--- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.h
@@ -15,17 +15,128 @@
 #define SEMAPHORE_GK20A_H
 
 #include <linux/kref.h>
-#include "gk20a_allocator.h"
+#include <linux/list.h>
+#include <linux/delay.h>
+
+#include "gk20a.h"
 #include "mm_gk20a.h"
+#include "channel_gk20a.h"
+
+/*
+ * Max number of channels that can be used is 512. This of course needs to be
+ * fixed to be dynamic but still fast.
+ */
+#define SEMAPHORE_POOL_COUNT		512
+#define SEMAPHORE_SIZE			16
+#define SEMAPHORE_SEA_GROWTH_RATE	32
+
+struct gk20a_semaphore_sea;
+
+/*
+ * Underlying semaphore data structure. This semaphore can be shared amongst
+ * other semaphore instances.
+ */
+struct gk20a_semaphore_int {
+	int idx;			/* Semaphore index. */
+	u32 offset;			/* Offset into the pool. */
+	atomic_t next_value;		/* Next available value. */
+	u32 *value;			/* Current value (access w/ readl()). */
+	u32 nr_incrs;			/* Number of increments programmed. */
+	struct gk20a_semaphore_pool *p;	/* Pool that owns this sema. */
+	struct channel_gk20a *ch;	/* Channel that owns this sema. */
+	struct list_head hw_sema_list;	/* List of HW semaphores. */
+};
+
+/*
+ * A semaphore which the rest of the driver actually uses. This consists of a
+ * pointer to a real semaphore and a value to wait for. This allows one physical
+ * semaphore to be shared among an essentially infinite number of submits.
+ */
+struct gk20a_semaphore {
+	struct gk20a_semaphore_int *hw_sema;
 
-/* A memory pool for holding semaphores. */
+	atomic_t value;
+	int incremented;
+
+	struct kref ref;
+};
+
+/*
+ * A semaphore pool. Each address space will own exactly one of these.
+ */
 struct gk20a_semaphore_pool {
-	struct mem_desc mem;
-	struct gk20a *g;
-	struct list_head maps;
-	struct mutex maps_mutex;
+	struct page *page;			/* This pool's page of memory. */
+	struct list_head pool_list_entry;	/* Node for list of pools. */
+	void *cpu_va;				/* CPU access to the pool. */
+	u64 gpu_va;				/* GPU access to the pool. */
+	u64 gpu_va_ro;				/* GPU access to the pool. */
+	int page_idx;				/* Index into sea bitmap. */
+
+	struct list_head hw_semas;		/* List of HW semas. */
+	DECLARE_BITMAP(semas_alloced, PAGE_SIZE / SEMAPHORE_SIZE);
+
+	struct gk20a_semaphore_sea *sema_sea;	/* Sea that owns this pool. */
+
+	struct mutex pool_lock;
+
+	/*
+	 * This is the address space's personal RW table. Other channels will
+	 * ultimately map this page as RO.
+	 */
+	struct sg_table *rw_sg_table;
+
+	/*
+	 * This is to keep track of whether the pool has had its sg_table
+	 * updated during sea resizing.
+	 */
+	struct sg_table *ro_sg_table;
+
+	int mapped;
+
+	/*
+	 * Sometimes a channel can be released before other channels are
+	 * done waiting on it. This ref count ensures that the pool doesn't
+	 * go away until all semaphores using this pool are cleaned up first.
+	 */
 	struct kref ref;
-	struct gk20a_allocator alloc;
+};
+
+/*
+ * A sea of semaphore pools. Each pool is owned by a single VM. Since multiple
+ * channels can share a VM, each channel gets its own HW semaphore from the
+ * pool. Channels then allocate regular semaphores - basically just a value that
+ * signifies when a particular job is done.
+ */
+struct gk20a_semaphore_sea {
+	struct list_head pool_list;	/* List of pools in this sea. */
+	struct gk20a *gk20a;
+
+	size_t size;			/* Number of pages available. */
+	u64 gpu_va;			/* GPU virtual address of sema sea. */
+	u64 map_size;			/* Size of the mapping. */
+
+	/*
+	 * TODO:
+	 * List of pages that we use to back the pools. The number of pages
+	 * can grow dynamically since allocating 512 pages for all channels at
+	 * once would be a tremendous waste.
+	 */
+	int page_count;			/* Pages allocated to pools. */
+
+	struct sg_table *ro_sg_table;
+	/*
+	struct page *pages[SEMAPHORE_POOL_COUNT];
+	*/
+
+	struct mem_desc sea_mem;
+
+	/*
+	 * Can't use a regular allocator here since the full range of pools is
+	 * not always allocated. Instead just use a bitmap.
+	 */
+	DECLARE_BITMAP(pools_alloced, SEMAPHORE_POOL_COUNT);
+
+	struct mutex sea_lock;		/* Lock alloc/free calls. */
 };
 
 enum gk20a_mem_rw_flag {
@@ -34,64 +145,150 @@ enum gk20a_mem_rw_flag {
 	gk20a_mem_flag_write_only = 2,
 };
 
-/* A semaphore pool can be mapped to multiple GPU address spaces. */
-struct gk20a_semaphore_pool_map {
-	u64 gpu_va;
-	enum gk20a_mem_rw_flag rw_flag;
-	struct vm_gk20a *vm;
-	struct list_head list;
-};
+/*
+ * Semaphore sea functions.
+ */
+struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *gk20a);
+int gk20a_semaphore_sea_map(struct gk20a_semaphore_pool *sea,
+			    struct vm_gk20a *vm);
+void gk20a_semaphore_sea_unmap(struct gk20a_semaphore_pool *sea,
+			       struct vm_gk20a *vm);
+struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g);
+
+/*
+ * Semaphore pool functions.
+ */
+struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(
+		struct gk20a_semaphore_sea *sea);
+int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *pool,
+			     struct vm_gk20a *vm);
+void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *pool,
+				struct vm_gk20a *vm);
+u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global);
+void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p);
+void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p);
+
+/*
+ * Semaphore functions.
+ */
+struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch);
+void gk20a_semaphore_put(struct gk20a_semaphore *s);
+void gk20a_semaphore_get(struct gk20a_semaphore *s);
+
+/*
+ * Return the address of a specific semaphore.
+ *
+ * Don't call this on a semaphore you don't own - the VA returned will make no
+ * sense in your specific channel's VM.
+ */
+static inline u64 gk20a_semaphore_gpu_rw_va(struct gk20a_semaphore *s)
+{
+	return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, false) +
+		s->hw_sema->offset;
+}
+
+/*
+ * Get the global RO address for the semaphore. Can be called on any semaphore
+ * regardless of whether you own it.
+ */
+static inline u64 gk20a_semaphore_gpu_ro_va(struct gk20a_semaphore *s)
+{
+	return __gk20a_semaphore_pool_gpu_va(s->hw_sema->p, true) +
+		s->hw_sema->offset;
+}
+
+static inline u64 gk20a_hw_sema_addr(struct gk20a_semaphore_int *hw_sema)
+{
+	return __gk20a_semaphore_pool_gpu_va(hw_sema->p, true) +
+		hw_sema->offset;
+}
+
+/*
+ * TODO: handle wrap around... Hmm, how to do this?
+ */
+static inline bool gk20a_semaphore_is_released(struct gk20a_semaphore *s)
+{
+	u32 sema_val = readl(s->hw_sema->value);
 
-/* A semaphore that lives inside a semaphore pool. */
-struct gk20a_semaphore {
-	struct gk20a_semaphore_pool *pool;
 	/*
-	 * value exists within the pool's memory at the specified offset.
-	 * 0=acquired, 1=released.
+	 * If the underlying semaphore value is greater than or equal to
+	 * the value of the semaphore then the semaphore has been signaled
+	 * (a.k.a. released).
 	 */
-	u32 offset; /* byte offset within pool */
-	struct kref ref;
-};
+	return sema_val >= atomic_read(&s->value);
+}
 
-/* Create a semaphore pool that can hold at most 'capacity' semaphores. */
-struct gk20a_semaphore_pool *
-gk20a_semaphore_pool_alloc(struct gk20a *, const char *unique_name,
-			   size_t capacity);
-void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *);
-int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *,
-			     struct vm_gk20a *,
-			     enum gk20a_mem_rw_flag);
-void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *,
-				struct vm_gk20a *);
-u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *,
-				struct vm_gk20a *);
-
-/* Allocate a semaphore from the semaphore pool. The newly allocated
- * semaphore will be in acquired state (value=0). */
-struct gk20a_semaphore *
-gk20a_semaphore_alloc(struct gk20a_semaphore_pool *);
-void gk20a_semaphore_put(struct gk20a_semaphore *);
-void gk20a_semaphore_get(struct gk20a_semaphore *);
-
-static inline u64 gk20a_semaphore_gpu_va(struct gk20a_semaphore *s,
-					 struct vm_gk20a *vm)
+static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s)
 {
-	return gk20a_semaphore_pool_gpu_va(s->pool, vm) + s->offset;
+	return !gk20a_semaphore_is_released(s);
 }
 
-static inline bool gk20a_semaphore_is_acquired(struct gk20a_semaphore *s)
+/*
+ * Read the underlying value from a semaphore.
+ */
+static inline u32 gk20a_semaphore_read(struct gk20a_semaphore *s)
 {
-	u32 v = gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset);
+	return readl(s->hw_sema->value);
+}
 
-	/* When often block on value reaching a certain threshold. We must make
-	 * sure that if we get unblocked, we haven't read anything too early. */
-	smp_rmb();
-	return v == 0;
+static inline u32 gk20a_semaphore_get_value(struct gk20a_semaphore *s)
+{
+	return atomic_read(&s->value);
 }
 
+static inline u32 gk20a_semaphore_next_value(struct gk20a_semaphore *s)
+{
+	return atomic_read(&s->hw_sema->next_value);
+}
+
+/*
+ * Note - if you call this then any prior semaphores will also be released.
+ */
 static inline void gk20a_semaphore_release(struct gk20a_semaphore *s)
 {
-	smp_wmb();
-	gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 1);
+	u32 current_val;
+	u32 val = gk20a_semaphore_get_value(s);
+	int attempts = 0;
+
+	/*
+	 * Wait until the sema value is 1 less than the write value. That
+	 * way this function is essentially an increment.
+	 *
+	 * TODO: tune the wait a little better.
+	 */
+	while ((current_val = gk20a_semaphore_read(s)) < (val - 1)) {
+		msleep(100);
+		attempts += 1;
+		if (attempts > 100) {
+			WARN(1, "Stall on sema release!");
+			return;
+		}
+	}
+
+	/*
+	 * If the semaphore has already passed the value we would write then
+	 * this is really just a NO-OP.
+	 */
+	if (current_val >= val)
+		return;
+
+	writel(val, s->hw_sema->value);
+}
+
+/*
+ * Configure a software based increment on this semaphore. This is useful for
+ * when we want the GPU to wait on a SW event before processing a channel.
+ * Another way to describe this is when the GPU needs to wait on a SW pre-fence.
+ * The pre-fence signals SW which in turn calls gk20a_semaphore_release() which
+ * then allows the GPU to continue.
+ *
+ * Also used to prep a semaphore for an INCR by the GPU.
+ */
+static inline void gk20a_semaphore_incr(struct gk20a_semaphore *s)
+{
+	BUG_ON(s->incremented);
+
+	atomic_set(&s->value, atomic_add_return(1, &s->hw_sema->next_value));
+	s->incremented = 1;
 }
 #endif
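For reference, here is a minimal usage sketch of the API added above, assuming
an already-initialized channel. Only functions declared in this header are
used; the pushbuffer work that would normally sit between the calls is omitted,
so this is an illustration of the intended call sequence, not driver code.

/*
 * Minimal usage sketch of the semaphore API above. Assumes ch has been set
 * up elsewhere; the actual job submission is omitted.
 */
static void semaphore_usage_sketch(struct channel_gk20a *ch)
{
	struct gk20a_semaphore *s;

	/* Allocate a semaphore backed by this channel's HW semaphore. */
	s = gk20a_semaphore_alloc(ch);
	if (!s)
		return;

	/*
	 * Reserve the next payload value. The job is considered done once
	 * the HW semaphore reaches this value.
	 */
	gk20a_semaphore_incr(s);

	/*
	 * A waiter would program a GPU ACQUIRE against
	 * gk20a_semaphore_gpu_ro_va(s) for gk20a_semaphore_get_value(s),
	 * or poll from SW:
	 */
	if (!gk20a_semaphore_is_released(s)) {
		/* still pending */
	}

	/*
	 * SW release path (e.g. after a SW pre-fence signals). The GPU could
	 * instead write the value through the channel's RW mapping at
	 * gk20a_semaphore_gpu_rw_va(s).
	 */
	gk20a_semaphore_release(s);

	/* Drop the reference once nothing else needs this semaphore. */
	gk20a_semaphore_put(s);
}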