author    Alex Waterman <alexw@nvidia.com>  2016-04-27 15:27:36 -0400
committer Terje Bergstrom <tbergstrom@nvidia.com>  2016-06-28 18:49:11 -0400
commit    dfd5ec53fcce4ebae27f78242e6b788350337095 (patch)
tree      073ea380b9ee4734391d381745f57600c3525be5 /drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
parent    b30990ea6db564e885d5aee7a1a5ea87a1e5e8ee (diff)
gpu: nvgpu: Revamp semaphore support
Revamp the support the nvgpu driver has for semaphores.

The original problem with nvgpu's semaphore support was that it required a
SW-based wait for every semaphore release. This was because a new semaphore
was created for every fence that gk20a_channel_semaphore_wait_fd() waited on.
That semaphore would then be released by SW when the fence signaled. This
meant that every release necessarily involved a sync_fence_wait_async() call,
which could block. The latency of this SW wait was enough to cause massive
degradation in performance.

To fix this a fast path was implemented. When a fence passed to
gk20a_channel_semaphore_wait_fd() is backed by a GPU semaphore, a semaphore
acquire is used directly to block the GPU. No sync_fence_wait_async() is
performed and no extra semaphore is created.

To implement this fast path the semaphore memory had to be shared between
channels. Previously, since a new semaphore was created on every pass through
gk20a_channel_semaphore_wait_fd(), the address space a semaphore was mapped
into was irrelevant. However, when using the fast path a semaphore may be
released in one address space but acquired in another.

Sharing the semaphore memory was done by making a fixed GPU mapping in all
channels. This mapping points to the semaphore memory (the so-called
semaphore sea). The global fixed mapping is read-only to make sure no
semaphores can be incremented (i.e. released) by a malicious channel. Each
channel then gets a RW mapping of its own semaphore. This way a channel may
only acquire other channels' semaphores but may both acquire and release its
own semaphore.

The gk20a fence code was updated to allow introspection of GPU-backed fences.
This allows detection of when the fast path can be taken. If the fast path
cannot be used (for example when a fence is sync-pt backed) the original slow
path is still present. This gets used when the GPU needs to wait on an event
from something which only understands how to use sync-pts.

Bug 1732449
JIRA DNVGPU-12

Change-Id: Ic0fea74994da5819a771deac726bb0d47a33c2de
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/1133792
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
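To make the fast/slow split described above concrete, here is a minimal sketch
of the wait-path selection. It is illustrative only: the types and helpers
(wait_fence(), fence_is_gpu_sema_backed(), emit_gpu_sema_acquire(),
queue_sw_wait()) are hypothetical stand-ins, not functions from the driver.

```c
#include <stdbool.h>

/*
 * Illustrative sketch only -- every name below is a hypothetical
 * stand-in for the real driver code paths described in the commit
 * message above.
 */
struct fence;
struct channel;

bool fence_is_gpu_sema_backed(struct fence *f);  /* introspection added by this patch */
int  emit_gpu_sema_acquire(struct channel *c, struct fence *f); /* GPU blocks itself  */
int  queue_sw_wait(struct channel *c, struct fence *f);         /* SW wait, may block */

static int wait_fence(struct channel *c, struct fence *f)
{
	/*
	 * Fast path: the fence is backed by a GPU semaphore, so push a
	 * semaphore ACQUIRE that reads the releasing channel's semaphore
	 * through the shared read-only mapping. No SW wait and no extra
	 * semaphore allocation.
	 */
	if (fence_is_gpu_sema_backed(f))
		return emit_gpu_sema_acquire(c, f);

	/*
	 * Slow path: e.g. a sync-pt backed fence. Fall back to a SW wait
	 * that releases a locally created semaphore when the fence fires.
	 */
	return queue_sw_wait(c, f);
}
```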
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c | 435
1 file changed, 324 insertions(+), 111 deletions(-)
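The address layout introduced by the patch (one read-only "sea" of semaphore
pages shared by all channels, one page per pool, 16-byte semaphore slots) can
be summarized by the arithmetic below. This is a standalone illustration of
the calculations found in __gk20a_semaphore_pool_gpu_va() and
__gk20a_init_hw_sema() in the diff; the constant values are placeholders (the
real ones live in semaphore_gk20a.h), only the arithmetic mirrors the driver.

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE            4096u  /* typical; platform dependent         */
#define SEMAPHORE_SIZE         16u  /* pre-patch value; placeholder here   */
#define SEMAPHORE_POOL_COUNT   64u  /* placeholder; defined in the header  */

/* RO address of a pool: sea RO base + one page per pool index. */
static uint64_t pool_ro_va(uint64_t sea_ro_base, unsigned page_idx)
{
	return sea_ro_base + (uint64_t)PAGE_SIZE * page_idx;
}

/* Address of a HW semaphore slot within a pool mapping. */
static uint64_t sema_va(uint64_t pool_base, unsigned hw_sema_idx)
{
	return pool_base + (uint64_t)SEMAPHORE_SIZE * hw_sema_idx;
}

int main(void)
{
	uint64_t sea_ro_base = 0x100000000ull;  /* example fixed mapping */

	printf("sea size        : %u bytes\n", PAGE_SIZE * SEMAPHORE_POOL_COUNT);
	printf("pool 3 RO va    : 0x%llx\n",
	       (unsigned long long)pool_ro_va(sea_ro_base, 3));
	printf("sema 5 offset   : 0x%llx\n",
	       (unsigned long long)sema_va(0, 5));
	return 0;
}
```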
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
index 3b17bfcb..aa375b24 100644
--- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
@@ -15,63 +15,284 @@
15 * more details. 15 * more details.
16 */ 16 */
17 17
18#include "semaphore_gk20a.h" 18#define pr_fmt(fmt) "gpu_sema: " fmt
19
19#include <linux/dma-mapping.h> 20#include <linux/dma-mapping.h>
21#include <linux/highmem.h>
20#include <linux/slab.h> 22#include <linux/slab.h>
23
24#include <asm/pgtable.h>
25
21#include "gk20a.h" 26#include "gk20a.h"
22#include "mm_gk20a.h" 27#include "mm_gk20a.h"
28#include "semaphore_gk20a.h"
29
30#define __lock_sema_sea(s) \
31 do { \
32 mutex_lock(&s->sea_lock); \
33 } while (0)
23 34
24static const int SEMAPHORE_SIZE = 16; 35#define __unlock_sema_sea(s) \
36 do { \
37 mutex_unlock(&s->sea_lock); \
38 } while (0)
25 39
26struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct gk20a *g, 40/*
27 const char *unique_name, size_t capacity) 41 * Return the sema_sea pointer.
42 */
43struct gk20a_semaphore_sea *gk20a_semaphore_get_sea(struct gk20a *g)
44{
45 return g->sema_sea;
46}
47
48static int __gk20a_semaphore_sea_grow(struct gk20a_semaphore_sea *sea)
49{
50 int ret = 0;
51 struct gk20a *gk20a = sea->gk20a;
52
53 __lock_sema_sea(sea);
54
55 ret = gk20a_gmmu_alloc_attr(gk20a, DMA_ATTR_NO_KERNEL_MAPPING,
56 PAGE_SIZE * SEMAPHORE_POOL_COUNT,
57 &sea->sea_mem);
58 if (ret)
59 goto out;
60
61 sea->ro_sg_table = sea->sea_mem.sgt;
62 sea->size = SEMAPHORE_POOL_COUNT;
63 sea->map_size = SEMAPHORE_POOL_COUNT * PAGE_SIZE;
64
65out:
66 __unlock_sema_sea(sea);
67 return ret;
68}
69
70/*
71 * Create the semaphore sea. Only create it once - subsequent calls to this will
72 * return the originally created sea pointer.
73 */
74struct gk20a_semaphore_sea *gk20a_semaphore_sea_create(struct gk20a *g)
75{
76 if (g->sema_sea)
77 return g->sema_sea;
78
79 g->sema_sea = kzalloc(sizeof(*g->sema_sea), GFP_KERNEL);
80 if (!g->sema_sea)
81 return NULL;
82
83 g->sema_sea->size = 0;
84 g->sema_sea->page_count = 0;
85 g->sema_sea->gk20a = g;
86 INIT_LIST_HEAD(&g->sema_sea->pool_list);
87 mutex_init(&g->sema_sea->sea_lock);
88
89 if (__gk20a_semaphore_sea_grow(g->sema_sea))
90 goto cleanup;
91
92 return g->sema_sea;
93
94cleanup:
95 kfree(g->sema_sea);
96 g->sema_sea = NULL;
97 return NULL;
98}
99
100static int __semaphore_bitmap_alloc(unsigned long *bitmap, unsigned long len)
101{
102 unsigned long idx = find_first_zero_bit(bitmap, len);
103
104 if (idx == len)
105 return -ENOSPC;
106
107 set_bit(idx, bitmap);
108
109 return (int)idx;
110}
111
112/*
113 * Allocate a pool from the sea.
114 */
115struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(
116 struct gk20a_semaphore_sea *sea)
28{ 117{
29 struct gk20a_semaphore_pool *p; 118 struct gk20a_semaphore_pool *p;
119 unsigned long page_idx;
120 int err = 0;
121
30 p = kzalloc(sizeof(*p), GFP_KERNEL); 122 p = kzalloc(sizeof(*p), GFP_KERNEL);
31 if (!p) 123 if (!p)
32 return NULL; 124 return ERR_PTR(-ENOMEM);
125
126 __lock_sema_sea(sea);
127
128 page_idx = __semaphore_bitmap_alloc(sea->pools_alloced,
129 SEMAPHORE_POOL_COUNT);
130 if (page_idx < 0) {
131 err = page_idx;
132 goto fail;
133 }
33 134
135 p->page = sea->sea_mem.pages[page_idx];
136 p->ro_sg_table = sea->ro_sg_table;
137 p->page_idx = page_idx;
138 p->sema_sea = sea;
139 INIT_LIST_HEAD(&p->hw_semas);
34 kref_init(&p->ref); 140 kref_init(&p->ref);
35 INIT_LIST_HEAD(&p->maps); 141 mutex_init(&p->pool_lock);
36 mutex_init(&p->maps_mutex); 142
37 p->g = g; 143 sea->page_count++;
38 144 list_add(&p->pool_list_entry, &sea->pool_list);
39 /* Alloc one 4k page of semaphore per channel. */ 145 __unlock_sema_sea(sea);
40 if (gk20a_gmmu_alloc(g, roundup(capacity * SEMAPHORE_SIZE, PAGE_SIZE), 146
41 &p->mem))
42 goto clean_up;
43
44 /* Sacrifice one semaphore in the name of returning error codes. */
45 if (gk20a_allocator_init(&p->alloc, unique_name,
46 SEMAPHORE_SIZE, p->mem.size - SEMAPHORE_SIZE,
47 SEMAPHORE_SIZE))
48 goto clean_up;
49
50 gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->mem.cpu_va,
51 (u64)sg_dma_address(p->mem.sgt->sgl),
52 (u64)sg_phys(p->mem.sgt->sgl));
53 return p; 147 return p;
54 148
55clean_up: 149fail:
56 if (p->mem.size) 150 __unlock_sema_sea(sea);
57 gk20a_gmmu_free(p->g, &p->mem);
58 kfree(p); 151 kfree(p);
59 return NULL; 152 return ERR_PTR(err);
153}
154
155/*
156 * Map a pool into the passed vm's address space. This handles both the fixed
157 * global RO mapping and the non-fixed private RW mapping.
158 */
159int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p,
160 struct vm_gk20a *vm)
161{
162 int ents, err = 0;
163 u64 addr;
164
165 p->cpu_va = vmap(&p->page, 1, 0,
166 pgprot_writecombine(PAGE_KERNEL));
167
168 /* First do the RW mapping. */
169 p->rw_sg_table = kzalloc(sizeof(*p->rw_sg_table), GFP_KERNEL);
170 if (!p->rw_sg_table)
171 return -ENOMEM;
172
173 err = sg_alloc_table_from_pages(p->rw_sg_table, &p->page, 1, 0,
174 PAGE_SIZE, GFP_KERNEL);
175 if (err) {
176 err = -ENOMEM;
177 goto fail;
178 }
179
180 /* Add IOMMU mapping... */
181 ents = dma_map_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
182 DMA_BIDIRECTIONAL);
183 if (ents != 1) {
184 err = -ENOMEM;
185 goto fail_free_sgt;
186 }
187
188 /* Map into the GPU... Doesn't need to be fixed. */
189 p->gpu_va = gk20a_gmmu_map(vm, &p->rw_sg_table, PAGE_SIZE,
190 0, gk20a_mem_flag_none, false);
191 if (!p->gpu_va) {
192 err = -ENOMEM;
193 goto fail_unmap_sgt;
194 }
195
196 /*
197 * And now the global mapping. Take the sea lock so that we don't race
198 * with a concurrent remap.
199 */
200 __lock_sema_sea(p->sema_sea);
201
202 BUG_ON(p->mapped);
203 addr = gk20a_gmmu_fixed_map(vm, &p->sema_sea->ro_sg_table,
204 p->sema_sea->gpu_va, p->sema_sea->map_size,
205 0,
206 gk20a_mem_flag_read_only,
207 false);
208 if (!addr) {
209 err = -ENOMEM;
210 BUG();
211 goto fail_unlock;
212 }
213 p->gpu_va_ro = addr;
214 p->mapped = 1;
215
216 __unlock_sema_sea(p->sema_sea);
217
218 return 0;
219
220fail_unlock:
221 __unlock_sema_sea(p->sema_sea);
222fail_unmap_sgt:
223 dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
224 DMA_BIDIRECTIONAL);
225fail_free_sgt:
226 sg_free_table(p->rw_sg_table);
227fail:
228 kfree(p->rw_sg_table);
229 p->rw_sg_table = NULL;
230 return err;
60} 231}
61 232
233/*
234 * Unmap a semaphore_pool.
235 */
236void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p,
237 struct vm_gk20a *vm)
238{
239 struct gk20a_semaphore_int *hw_sema;
240
241 kunmap(p->cpu_va);
242
243 /* First the global RO mapping... */
244 __lock_sema_sea(p->sema_sea);
245 gk20a_gmmu_unmap(vm, p->gpu_va_ro,
246 p->sema_sea->map_size, gk20a_mem_flag_none);
247 p->ro_sg_table = NULL;
248 __unlock_sema_sea(p->sema_sea);
249
250 /* And now the private RW mapping. */
251 gk20a_gmmu_unmap(vm, p->gpu_va, PAGE_SIZE, gk20a_mem_flag_none);
252 p->gpu_va = 0;
253
254 dma_unmap_sg(dev_from_vm(vm), p->rw_sg_table->sgl, 1,
255 DMA_BIDIRECTIONAL);
256
257 sg_free_table(p->rw_sg_table);
258 kfree(p->rw_sg_table);
259 p->rw_sg_table = NULL;
260
261 gk20a_dbg_info("Unmapped sema-pool: idx = %d", p->page_idx);
262 list_for_each_entry(hw_sema, &p->hw_semas, hw_sema_list)
263 /*
264 * Make sure the mem addresses are all NULL so if this gets
265 * reused we will fault.
266 */
267 hw_sema->value = NULL;
268}
269
270/*
271 * Completely free a sempahore_pool. You should make sure this pool is not
272 * mapped otherwise there's going to be a memory leak.
273 */
62static void gk20a_semaphore_pool_free(struct kref *ref) 274static void gk20a_semaphore_pool_free(struct kref *ref)
63{ 275{
64 struct gk20a_semaphore_pool *p = 276 struct gk20a_semaphore_pool *p =
65 container_of(ref, struct gk20a_semaphore_pool, ref); 277 container_of(ref, struct gk20a_semaphore_pool, ref);
66 mutex_lock(&p->maps_mutex); 278 struct gk20a_semaphore_sea *s = p->sema_sea;
67 WARN_ON(!list_empty(&p->maps)); 279 struct gk20a_semaphore_int *hw_sema, *tmp;
68 mutex_unlock(&p->maps_mutex); 280
69 gk20a_gmmu_free(p->g, &p->mem); 281 WARN_ON(p->gpu_va || p->rw_sg_table || p->ro_sg_table);
70 gk20a_allocator_destroy(&p->alloc); 282
283 __lock_sema_sea(s);
284 list_del(&p->pool_list_entry);
285 clear_bit(p->page_idx, s->pools_alloced);
286 s->page_count--;
287 __unlock_sema_sea(s);
288
289 list_for_each_entry_safe(hw_sema, tmp, &p->hw_semas, hw_sema_list)
290 kfree(hw_sema);
291
71 kfree(p); 292 kfree(p);
72} 293}
73 294
74static void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p) 295void gk20a_semaphore_pool_get(struct gk20a_semaphore_pool *p)
75{ 296{
76 kref_get(&p->ref); 297 kref_get(&p->ref);
77} 298}
@@ -81,104 +302,96 @@ void gk20a_semaphore_pool_put(struct gk20a_semaphore_pool *p)
81 kref_put(&p->ref, gk20a_semaphore_pool_free); 302 kref_put(&p->ref, gk20a_semaphore_pool_free);
82} 303}
83 304
84static struct gk20a_semaphore_pool_map * 305/*
85gk20a_semaphore_pool_find_map_locked(struct gk20a_semaphore_pool *p, 306 * Get the address for a semaphore_pool - if global is true then return the
86 struct vm_gk20a *vm) 307 * global RO address instead of the RW address owned by the semaphore's VM.
308 */
309u64 __gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, bool global)
87{ 310{
88 struct gk20a_semaphore_pool_map *map, *found = NULL; 311 if (!global)
89 list_for_each_entry(map, &p->maps, list) { 312 return p->gpu_va;
90 if (map->vm == vm) { 313
91 found = map; 314 return p->gpu_va_ro + (PAGE_SIZE * p->page_idx);
92 break;
93 }
94 }
95 return found;
96} 315}
97 316
98int gk20a_semaphore_pool_map(struct gk20a_semaphore_pool *p, 317static int __gk20a_init_hw_sema(struct channel_gk20a *ch)
99 struct vm_gk20a *vm,
100 enum gk20a_mem_rw_flag rw_flag)
101{ 318{
102 struct gk20a_semaphore_pool_map *map; 319 int hw_sema_idx;
320 int ret = 0;
321 struct gk20a_semaphore_int *hw_sema;
322 struct gk20a_semaphore_pool *p = ch->vm->sema_pool;
103 323
104 map = kzalloc(sizeof(*map), GFP_KERNEL); 324 BUG_ON(!p);
105 if (!map)
106 return -ENOMEM;
107 map->vm = vm;
108 map->rw_flag = rw_flag;
109 map->gpu_va = gk20a_gmmu_map(vm, &p->mem.sgt, p->mem.size,
110 0/*uncached*/, rw_flag,
111 false);
112 if (!map->gpu_va) {
113 kfree(map);
114 return -ENOMEM;
115 }
116 gk20a_vm_get(vm);
117 325
118 mutex_lock(&p->maps_mutex); 326 mutex_lock(&p->pool_lock);
119 WARN_ON(gk20a_semaphore_pool_find_map_locked(p, vm));
120 list_add(&map->list, &p->maps);
121 mutex_unlock(&p->maps_mutex);
122 return 0;
123}
124 327
125void gk20a_semaphore_pool_unmap(struct gk20a_semaphore_pool *p, 328 /* Find an available HW semaphore. */
126 struct vm_gk20a *vm) 329 hw_sema_idx = __semaphore_bitmap_alloc(p->semas_alloced,
127{ 330 PAGE_SIZE / SEMAPHORE_SIZE);
128 struct gk20a_semaphore_pool_map *map; 331 if (hw_sema_idx < 0) {
129 WARN_ON(!vm); 332 ret = hw_sema_idx;
130 333 goto fail;
131 mutex_lock(&p->maps_mutex);
132 map = gk20a_semaphore_pool_find_map_locked(p, vm);
133 if (map) {
134 gk20a_gmmu_unmap(vm, map->gpu_va, p->mem.size, map->rw_flag);
135 gk20a_vm_put(vm);
136 list_del(&map->list);
137 kfree(map);
138 } 334 }
139 mutex_unlock(&p->maps_mutex);
140}
141 335
142u64 gk20a_semaphore_pool_gpu_va(struct gk20a_semaphore_pool *p, 336 hw_sema = kzalloc(sizeof(struct gk20a_semaphore_int), GFP_KERNEL);
143 struct vm_gk20a *vm) 337 if (!hw_sema) {
144{ 338 ret = -ENOMEM;
145 struct gk20a_semaphore_pool_map *map; 339 goto fail_free_idx;
146 u64 gpu_va = 0; 340 }
147 341
148 mutex_lock(&p->maps_mutex); 342 ch->hw_sema = hw_sema;
149 map = gk20a_semaphore_pool_find_map_locked(p, vm); 343 hw_sema->ch = ch;
150 if (map) 344 hw_sema->p = p;
151 gpu_va = map->gpu_va; 345 hw_sema->idx = hw_sema_idx;
152 mutex_unlock(&p->maps_mutex); 346 hw_sema->offset = SEMAPHORE_SIZE * hw_sema_idx;
347 atomic_set(&hw_sema->next_value, 0);
348 hw_sema->value = p->cpu_va + hw_sema->offset;
349 writel(0, hw_sema->value);
153 350
154 return gpu_va; 351 list_add(&hw_sema->hw_sema_list, &p->hw_semas);
352
353 mutex_unlock(&p->pool_lock);
354
355 return 0;
356
357fail_free_idx:
358 clear_bit(hw_sema_idx, p->semas_alloced);
359fail:
360 mutex_unlock(&p->pool_lock);
361 return ret;
155} 362}
156 363
157struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool) 364/*
365 * Allocate a semaphore from the passed pool.
366 *
367 * Since semaphores are ref-counted there's no explicit free for external code
368 * to use. When the ref-count hits 0 the internal free will happen.
369 */
370struct gk20a_semaphore *gk20a_semaphore_alloc(struct channel_gk20a *ch)
158{ 371{
159 struct gk20a_semaphore *s; 372 struct gk20a_semaphore *s;
373 int ret;
374
375 if (!ch->hw_sema) {
376 ret = __gk20a_init_hw_sema(ch);
377 if (ret)
378 return ERR_PTR(ret);
379 }
160 380
161 s = kzalloc(sizeof(*s), GFP_KERNEL); 381 s = kzalloc(sizeof(*s), GFP_KERNEL);
162 if (!s) 382 if (!s)
163 return NULL; 383 return NULL;
164 384
165 s->offset = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE); 385 kref_init(&s->ref);
166 if (!s->offset) { 386 s->hw_sema = ch->hw_sema;
167 gk20a_err(dev_from_gk20a(pool->g), 387 atomic_set(&s->value, 0);
168 "failed to allocate semaphore");
169 kfree(s);
170 return NULL;
171 }
172 388
173 gk20a_semaphore_pool_get(pool); 389 /*
174 s->pool = pool; 390 * Take a ref on the pool so that we can keep this pool alive for
391 * as long as this semaphore is alive.
392 */
393 gk20a_semaphore_pool_get(s->hw_sema->p);
175 394
176 kref_init(&s->ref);
177 /* Initially acquired. */
178 gk20a_mem_wr(s->pool->g, &s->pool->mem, s->offset, 0);
179 gk20a_dbg_info("created semaphore offset=%d, value=%d",
180 s->offset,
181 gk20a_mem_rd(s->pool->g, &s->pool->mem, s->offset));
182 return s; 395 return s;
183} 396}
184 397
@@ -187,8 +400,8 @@ static void gk20a_semaphore_free(struct kref *ref)
187 struct gk20a_semaphore *s = 400 struct gk20a_semaphore *s =
188 container_of(ref, struct gk20a_semaphore, ref); 401 container_of(ref, struct gk20a_semaphore, ref);
189 402
190 gk20a_bfree(&s->pool->alloc, s->offset); 403 gk20a_semaphore_pool_put(s->hw_sema->p);
191 gk20a_semaphore_pool_put(s->pool); 404
192 kfree(s); 405 kfree(s);
193} 406}
194 407