path: root/drivers/gpu/nvgpu/gk20a
author     Alex Waterman <alexw@nvidia.com>         2015-03-18 16:33:09 -0400
committer  Terje Bergstrom <tbergstrom@nvidia.com>  2015-05-11 11:53:25 -0400
commit     a2e852364582e9c337f52bc53ccc33877c8f3b47 (patch)
tree       fb13c5ad80db8eb2424a753a92389c7a3a322a12 /drivers/gpu/nvgpu/gk20a
parent     0566aee853eb32f4f796499b6b00ddf0f1d7de34 (diff)
gpu: nvgpu: New allocator for VA space
Implement a new buddy allocation scheme for the GPU's VA space. The bitmap allocator was using too much memory and is not a scalable solution as the GPU's address space keeps getting bigger. The buddy allocation scheme is much more memory efficient when the majority of the address space is not allocated.

The buddy allocator is not constrained by the notion of a split address space. The bitmap allocator could only manage either small pages or large pages, but not both at the same time. Thus the bottom of the address space was reserved for small pages and the top for large pages. Although that split is not removed quite yet, the new allocator enables it to happen.

The buddy allocator is also very scalable. It can manage anything from the relatively small comptag space to the enormous GPU VA space, and everything in between. This is important since the GPU has lots of different sized spaces that need managing.

Currently there are certain limitations. For one, the allocator does not handle fixed allocations from CUDA very well. It can do so, but with certain caveats: the PTE page size is always set to small, which means the buddy allocator may place other small-page allocations in the buddies around the fixed allocation. It does this to avoid having large- and small-page allocations in the same PDE.

Change-Id: I501cd15af03611536490137331d43761c402c7f9
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/740694
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
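The patch replaces the old per-allocator alloc()/free() function pointers, which dealt in page numbers, with gk20a_balloc()/gk20a_bfree(), which deal in addresses and return 0 on failure. A minimal usage sketch of the interfaces added below; the allocator name, base, and sizes are illustrative only, not taken from the patch:

    struct gk20a_allocator va;
    u64 addr;
    int err;

    /* Manage an illustrative 1 GiB range of 4 KiB blocks based at 1 GiB. */
    err = gk20a_allocator_init(&va, "example-va",
                               0x40000000ULL, 0x40000000ULL, 0x1000);
    if (err)
            return err;

    addr = gk20a_balloc(&va, 64 * 1024);    /* 0 means the alloc failed */
    if (addr)
            gk20a_bfree(&va, addr);

    gk20a_allocator_destroy(&va);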
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/as_gk20a.c           15
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.c               2
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_allocator.c  1167
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_allocator.h   213
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ltc_gk20a.c           5
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c          194
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h            1
-rw-r--r--  drivers/gpu/nvgpu/gk20a/pmu_gk20a.c          68
-rw-r--r--  drivers/gpu/nvgpu/gk20a/pmu_gk20a.h           4
-rw-r--r--  drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c    15
10 files changed, 1368 insertions, 316 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.c b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
index 038fa4c8..d832b792 100644
--- a/drivers/gpu/nvgpu/gk20a/as_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
@@ -199,21 +199,14 @@ static int gk20a_as_ioctl_get_va_regions(
199 199
200 for (i = 0; i < write_entries; ++i) { 200 for (i = 0; i < write_entries; ++i) {
201 struct nvgpu_as_va_region region; 201 struct nvgpu_as_va_region region;
202 u32 base, limit;
203 202
204 memset(&region, 0, sizeof(struct nvgpu_as_va_region)); 203 memset(&region, 0, sizeof(struct nvgpu_as_va_region));
205 204
206 if (!vm->vma[i].constraint.enable) {
207 base = vm->vma[i].base;
208 limit = vm->vma[i].limit;
209 } else {
210 base = vm->vma[i].constraint.base;
211 limit = vm->vma[i].constraint.limit;
212 }
213
214 region.page_size = vm->gmmu_page_sizes[i]; 205 region.page_size = vm->gmmu_page_sizes[i];
215 region.offset = (u64)base * region.page_size; 206 region.offset = vm->vma[i].base;
216 region.pages = limit - base; /* NOTE: limit is exclusive */ 207 /* No __aeabi_uldivmod() on some platforms... */
208 region.pages = (vm->vma[i].end - vm->vma[i].start) >>
209 ilog2(region.page_size);
217 210
218 if (copy_to_user(user_region_ptr + i, &region, sizeof(region))) 211 if (copy_to_user(user_region_ptr + i, &region, sizeof(region)))
219 return -EFAULT; 212 return -EFAULT;
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index f6c9f901..eb52f28d 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -59,6 +59,7 @@
59#include "hw_fb_gk20a.h" 59#include "hw_fb_gk20a.h"
60#include "gk20a_scale.h" 60#include "gk20a_scale.h"
61#include "dbg_gpu_gk20a.h" 61#include "dbg_gpu_gk20a.h"
62#include "gk20a_allocator.h"
62#include "hal.h" 63#include "hal.h"
63#include "vgpu/vgpu.h" 64#include "vgpu/vgpu.h"
64 65
@@ -1510,6 +1511,7 @@ static int gk20a_probe(struct platform_device *dev)
1510 &gk20a->mm.disable_bigpage); 1511 &gk20a->mm.disable_bigpage);
1511 gk20a_pmu_debugfs_init(dev); 1512 gk20a_pmu_debugfs_init(dev);
1512 gk20a_cde_debugfs_init(dev); 1513 gk20a_cde_debugfs_init(dev);
1514 gk20a_alloc_debugfs_init(dev);
1513#endif 1515#endif
1514 1516
1515 gk20a_init_gr(gk20a); 1517 gk20a_init_gr(gk20a);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c
index 0037257c..56fb22df 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * gk20a allocator 2 * gk20a allocator
3 * 3 *
4 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -16,112 +16,1149 @@
16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */ 17 */
18 18
19#include <linux/kernel.h>
20#include <linux/seq_file.h>
21#include <linux/slab.h>
22#include <linux/debugfs.h>
23
24#include "platform_gk20a.h"
19#include "gk20a_allocator.h" 25#include "gk20a_allocator.h"
20#include <linux/vmalloc.h>
21 26
22/* init allocator struct */ 27#include "mm_gk20a.h"
23int gk20a_allocator_init(struct gk20a_allocator *allocator, 28
24 const char *name, u32 start, u32 len) 29static struct dentry *balloc_debugfs_root;
30
31static struct kmem_cache *buddy_cache; /* slab cache for meta data. */
32
33static u32 balloc_tracing_on;
34
35#define balloc_trace_func() \
36 do { \
37 if (balloc_tracing_on) \
38 trace_printk("%s\n", __func__); \
39 } while (0)
40
41#define balloc_trace_func_done() \
42 do { \
43 if (balloc_tracing_on) \
44 trace_printk("%s_done\n", __func__); \
45 } while (0)
46
47
48static void balloc_init_alloc_debug(struct gk20a_allocator *a);
49static void balloc_print_stats(struct gk20a_allocator *a, struct seq_file *s,
50 int lock);
51static struct gk20a_buddy *balloc_free_buddy(struct gk20a_allocator *a,
52 u64 addr);
53static void balloc_coalesce(struct gk20a_allocator *a, struct gk20a_buddy *b);
54static void __balloc_do_free_fixed(struct gk20a_allocator *a,
55 struct gk20a_fixed_alloc *falloc);
56
57/*
58 * This function is not present in older kernel's list.h code.
59 */
60#ifndef list_last_entry
61#define list_last_entry(ptr, type, member) \
62 list_entry((ptr)->prev, type, member)
63#endif
64
65/*
66 * GPU buddy allocator for various address spaces.
67 *
68 * Current limitations:
69 * o A fixed allocation could potentially be made that borders PDEs with
70 * different PTE sizes. This would require that fixed buffer to have
71 * different sized PTEs for different parts of the allocation. Probably
72 * best to just require PDE alignment for fixed address allocs.
73 *
74 * o It is currently possible to make an allocator that has a buddy alignment
75 * out of sync with the PDE block size alignment. A simple example is a
76 * 32GB address space starting at byte 1. Every buddy is shifted off by 1
 77 * which means each buddy corresponds to more than one actual GPU page. The
 78 * best way to fix this is probably to just require PDE block size alignment
79 * for the start of the address space. At the moment all allocators are
80 * easily PDE aligned so this hasn't been a problem.
81 */
82
83/*
84 * Pick a suitable maximum order for this allocator.
85 *
 86 * Heuristic: Just guessing that the best max order is the largest single
87 * block that will fit in the address space.
88 */
89static void balloc_compute_max_order(struct gk20a_allocator *a)
90{
91 u64 true_max_order = ilog2(a->blks);
92
93 if (a->max_order > true_max_order)
94 a->max_order = true_max_order;
95 if (a->max_order > GPU_BALLOC_MAX_ORDER)
96 a->max_order = GPU_BALLOC_MAX_ORDER;
97}
98
99/*
 100 * Since we can only allocate in chunks of a->blk_size, we need to trim off
101 * any excess data that is not aligned to a->blk_size.
102 */
103static void balloc_allocator_align(struct gk20a_allocator *a)
104{
105 a->start = ALIGN(a->base, a->blk_size);
106 a->end = (a->base + a->length) & ~(a->blk_size - 1);
107 a->count = a->end - a->start;
108 a->blks = a->count >> a->blk_shift;
109}
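/*
 * Worked example (illustrative values, not from the patch): with
 * base = 0x1001, length = 0x10000 and blk_size = 0x1000, the space is
 * trimmed to start = 0x2000 and end = 0x11000, giving count = 0xf000
 * and blks = 15. Only whole blk_size blocks are ever handed out.
 */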
110
111/*
112 * Pass NULL for parent if you want a top level buddy.
113 */
114static struct gk20a_buddy *balloc_new_buddy(struct gk20a_allocator *a,
115 struct gk20a_buddy *parent,
116 u64 start, u64 order)
117{
118 struct gk20a_buddy *new_buddy;
119
120 new_buddy = kmem_cache_alloc(buddy_cache, GFP_KERNEL);
121 if (!new_buddy)
122 return NULL;
123
124 memset(new_buddy, 0, sizeof(struct gk20a_buddy));
125
126 new_buddy->parent = parent;
127 new_buddy->start = start;
128 new_buddy->order = order;
129 new_buddy->end = start + (1 << order) * a->blk_size;
130
131 return new_buddy;
132}
133
134static void __balloc_buddy_list_add(struct gk20a_allocator *a,
135 struct gk20a_buddy *b,
136 struct list_head *list)
137{
138 if (buddy_is_in_list(b)) {
139 balloc_dbg(a, "Oops: adding added buddy (%llu:0x%llx)\n",
140 b->order, b->start);
141 BUG();
142 }
143
144 /*
145 * Add big PTE blocks to the tail, small to the head for GVA spaces.
146 * This lets the code that checks if there are available blocks check
147 * without cycling through the entire list.
148 */
149 if (a->flags & GPU_BALLOC_GVA_SPACE &&
150 b->pte_size == BALLOC_PTE_SIZE_BIG)
151 list_add_tail(&b->buddy_entry, list);
152 else
153 list_add(&b->buddy_entry, list);
154
155 buddy_set_in_list(b);
156}
157
158static void __balloc_buddy_list_rem(struct gk20a_allocator *a,
159 struct gk20a_buddy *b)
160{
161 if (!buddy_is_in_list(b)) {
162 balloc_dbg(a, "Oops: removing removed buddy (%llu:0x%llx)\n",
163 b->order, b->start);
164 BUG();
165 }
166
167 list_del_init(&b->buddy_entry);
168 buddy_clr_in_list(b);
169}
170
171/*
172 * Add a buddy to one of the buddy lists and deal with the necessary
173 * book keeping. Adds the buddy to the list specified by the buddy's order.
174 */
175static void balloc_blist_add(struct gk20a_allocator *a, struct gk20a_buddy *b)
176{
177 __balloc_buddy_list_add(a, b, balloc_get_order_list(a, b->order));
178 a->buddy_list_len[b->order]++;
179}
180
181static void balloc_blist_rem(struct gk20a_allocator *a, struct gk20a_buddy *b)
182{
183 __balloc_buddy_list_rem(a, b);
184 a->buddy_list_len[b->order]--;
185}
186
187static u64 balloc_get_order(struct gk20a_allocator *a, u64 len)
188{
189 if (len == 0)
190 return 0;
191
192 len--;
193 len >>= a->blk_shift;
194
195 return fls(len);
196}
197
198static u64 __balloc_max_order_in(struct gk20a_allocator *a, u64 start, u64 end)
199{
200 u64 size = (end - start) >> a->blk_shift;
201
202 if (size > 0)
203 return min_t(u64, ilog2(size), a->max_order);
204 else
205 return GPU_BALLOC_MAX_ORDER;
206}
207
208/*
209 * Initialize the buddy lists.
210 */
211static int balloc_init_lists(struct gk20a_allocator *a)
212{
213 int i;
214 u64 bstart, bend, order;
215 struct gk20a_buddy *buddy;
216
217 bstart = a->start;
218 bend = a->end;
219
220 /* First make sure the LLs are valid. */
221 for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++)
222 INIT_LIST_HEAD(balloc_get_order_list(a, i));
223
224 while (bstart < bend) {
225 order = __balloc_max_order_in(a, bstart, bend);
226
227 buddy = balloc_new_buddy(a, NULL, bstart, order);
228 if (!buddy)
229 goto cleanup;
230
231 balloc_blist_add(a, buddy);
232 bstart += balloc_order_to_len(a, order);
233 }
234
235 return 0;
236
237cleanup:
238 for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) {
239 if (!list_empty(balloc_get_order_list(a, i))) {
240 buddy = list_first_entry(balloc_get_order_list(a, i),
241 struct gk20a_buddy, buddy_entry);
242 balloc_blist_rem(a, buddy);
243 kmem_cache_free(buddy_cache, buddy);
244 }
245 }
246
247 return -ENOMEM;
248}
249
250/*
251 * Initialize a buddy allocator. Returns 0 on success. This allocator does
252 * not necessarily manage bytes. It manages distinct ranges of resources. This
253 * allows the allocator to work for things like comp_tags, semaphores, etc.
254 *
255 * @allocator: Ptr to an allocator struct to init.
256 * @vm: GPU VM to associate this allocator with. Can be NULL. Will be used to
257 * get PTE size for GVA spaces.
258 * @name: Name of the allocator. Doesn't have to be static storage.
259 * @base: The base address of the resource pool being managed.
260 * @size: Number of resources in the pool.
261 * @blk_size: Minimum number of resources to allocate at once. For things like
262 * semaphores this is 1. For GVA this might be as much as 64k. This
263 * corresponds to order 0. Must be power of 2.
264 * @max_order: Pick a maximum order. If you leave this as 0, the buddy allocator
265 * will try and pick a reasonable max order.
266 * @flags: Extra flags necessary. See GPU_BALLOC_*.
267 */
268int __gk20a_allocator_init(struct gk20a_allocator *a,
269 struct vm_gk20a *vm, const char *name,
270 u64 base, u64 size, u64 blk_size, u64 max_order,
271 u64 flags)
25{ 272{
26 memset(allocator, 0, sizeof(struct gk20a_allocator)); 273 int err;
274
275 memset(a, 0, sizeof(struct gk20a_allocator));
276 strncpy(a->name, name, 32);
277
278 a->base = base;
279 a->length = size;
280 a->blk_size = blk_size;
281 a->blk_shift = __ffs(blk_size);
282
283 /* blk_size must be greater than 0 and a power of 2. */
284 if (blk_size == 0)
285 return -EINVAL;
286 if (blk_size & (blk_size - 1))
287 return -EINVAL;
288
289 if (max_order > GPU_BALLOC_MAX_ORDER)
290 return -EINVAL;
291
292 /* If this is to manage a GVA space we need a VM. */
293 if (flags & GPU_BALLOC_GVA_SPACE && !vm)
294 return -EINVAL;
295
296 a->vm = vm;
297 if (flags & GPU_BALLOC_GVA_SPACE)
298 a->pte_blk_order = balloc_get_order(a, vm->big_page_size << 10);
27 299
28 strncpy(allocator->name, name, 32); 300 a->flags = flags;
301 a->max_order = max_order;
29 302
30 allocator->base = start; 303 balloc_allocator_align(a);
31 allocator->limit = start + len - 1; 304 balloc_compute_max_order(a);
32 305
33 allocator->bitmap = vzalloc(BITS_TO_LONGS(len) * sizeof(long)); 306 /* Shared buddy kmem_cache for all allocators. */
34 if (!allocator->bitmap) 307 if (!buddy_cache)
308 buddy_cache = KMEM_CACHE(gk20a_buddy, 0);
309 if (!buddy_cache)
35 return -ENOMEM; 310 return -ENOMEM;
36 311
37 allocator_dbg(allocator, "%s : base %d, limit %d", 312 a->alloced_buddies = RB_ROOT;
38 allocator->name, allocator->base); 313 err = balloc_init_lists(a);
314 if (err)
315 return err;
39 316
40 init_rwsem(&allocator->rw_sema); 317 mutex_init(&a->lock);
41 318
42 allocator->alloc = gk20a_allocator_block_alloc; 319 a->init = 1;
43 allocator->free = gk20a_allocator_block_free; 320
321 balloc_init_alloc_debug(a);
322 balloc_dbg(a, "New allocator: base 0x%llx\n", a->base);
323 balloc_dbg(a, " size 0x%llx\n", a->length);
324 balloc_dbg(a, " blk_size 0x%llx\n", a->blk_size);
325 balloc_dbg(a, " max_order %llu\n", a->max_order);
326 balloc_dbg(a, " flags 0x%llx\n", a->flags);
44 327
45 return 0; 328 return 0;
46} 329}
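/*
 * Example call (taken from the comptag setup updated later in this patch):
 * a small, non-GVA resource pool of single-line blocks with a hand-picked
 * max_order of 10:
 *
 *	__gk20a_allocator_init(&gr->comp_tags, NULL, "comptag",
 *			       1, max_comptag_lines - 1, 1, 10, 0);
 */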
47 330
48/* destroy allocator, free all remaining blocks if any */ 331int gk20a_allocator_init(struct gk20a_allocator *a, const char *name,
49void gk20a_allocator_destroy(struct gk20a_allocator *allocator) 332 u64 base, u64 size, u64 blk_size)
333{
334 return __gk20a_allocator_init(a, NULL, name,
335 base, size, blk_size, 0, 0);
336}
337
338/*
339 * Clean up and destroy the passed allocator.
340 */
341void gk20a_allocator_destroy(struct gk20a_allocator *a)
50{ 342{
51 down_write(&allocator->rw_sema); 343 struct rb_node *node;
344 struct gk20a_buddy *bud;
345 struct gk20a_fixed_alloc *falloc;
346 int i;
347
348 balloc_lock(a);
349
350 if (!IS_ERR_OR_NULL(a->debugfs_entry))
351 debugfs_remove(a->debugfs_entry);
352
353 /*
354 * Free the fixed allocs first.
355 */
356 while ((node = rb_first(&a->fixed_allocs)) != NULL) {
357 falloc = container_of(node,
358 struct gk20a_fixed_alloc, alloced_entry);
359
360 __balloc_do_free_fixed(a, falloc);
361 rb_erase(node, &a->fixed_allocs);
362 }
363
364 /*
365 * And now free all outstanding allocations.
366 */
367 while ((node = rb_first(&a->alloced_buddies)) != NULL) {
368 bud = container_of(node, struct gk20a_buddy, alloced_entry);
369 balloc_free_buddy(a, bud->start);
370 balloc_blist_add(a, bud);
371 balloc_coalesce(a, bud);
372 }
52 373
53 vfree(allocator->bitmap); 374 /*
375 * Now clean up the unallocated buddies.
376 */
377 for (i = 0; i < GPU_BALLOC_ORDER_LIST_LEN; i++) {
378 BUG_ON(a->buddy_list_alloced[i] != 0);
379
380 while (!list_empty(balloc_get_order_list(a, i))) {
381 bud = list_first_entry(balloc_get_order_list(a, i),
382 struct gk20a_buddy, buddy_entry);
383 balloc_blist_rem(a, bud);
384 kmem_cache_free(buddy_cache, bud);
385 }
386
387 if (a->buddy_list_len[i] != 0) {
388 pr_info("Excess buddies!!! (%d: %llu)\n",
389 i, a->buddy_list_len[i]);
390 BUG();
391 }
392 if (a->buddy_list_split[i] != 0) {
393 pr_info("Excess split nodes!!! (%d: %llu)\n",
394 i, a->buddy_list_split[i]);
395 BUG();
396 }
397 if (a->buddy_list_alloced[i] != 0) {
398 pr_info("Excess alloced nodes!!! (%d: %llu)\n",
399 i, a->buddy_list_alloced[i]);
400 BUG();
401 }
402 }
54 403
55 memset(allocator, 0, sizeof(struct gk20a_allocator)); 404 a->init = 0;
405
406 balloc_unlock(a);
407
408 /*
 409 * We can't unlock an allocator after memsetting it. That wipes the
410 * state of the mutex. Hopefully no one uses the allocator after
411 * destroying it...
412 */
413 memset(a, 0, sizeof(struct gk20a_allocator));
56} 414}
57 415
58/* 416/*
59 * *addr != ~0 for fixed address allocation. if *addr == 0, base addr is 417 * Combine the passed buddy if possible. The pointer in @b may not be valid
60 * returned to caller in *addr. 418 * after this as the buddy may be freed.
61 * 419 *
62 * contiguous allocation, which allocates one block of 420 * @a must be locked.
63 * contiguous address. 421 */
64*/ 422static void balloc_coalesce(struct gk20a_allocator *a, struct gk20a_buddy *b)
65int gk20a_allocator_block_alloc(struct gk20a_allocator *allocator,
66 u32 *addr, u32 len, u32 align)
67{ 423{
68 unsigned long _addr; 424 struct gk20a_buddy *parent;
69 425
70 allocator_dbg(allocator, "[in] addr %d, len %d", *addr, len); 426 if (buddy_is_alloced(b) || buddy_is_split(b))
427 return;
71 428
72 if ((*addr != 0 && *addr < allocator->base) || /* check addr range */ 429 /*
 73 *addr + len > allocator->limit || /* check addr range */ 430 * If both our buddy and I are not allocated and not split then
74 *addr & (align - 1) || /* check addr alignment */ 431 * we can coalesce ourselves.
75 len == 0) /* check len */ 432 */
76 return -EINVAL; 433 if (!b->buddy)
434 return;
435 if (buddy_is_alloced(b->buddy) || buddy_is_split(b->buddy))
436 return;
437
438 parent = b->parent;
439
440 balloc_blist_rem(a, b);
441 balloc_blist_rem(a, b->buddy);
442
443 buddy_clr_split(parent);
444 a->buddy_list_split[parent->order]--;
445 balloc_blist_add(a, parent);
446
447 /*
448 * Recursively coalesce as far as we can go.
449 */
450 balloc_coalesce(a, parent);
451
452 /* Clean up the remains. */
453 kmem_cache_free(buddy_cache, b->buddy);
454 kmem_cache_free(buddy_cache, b);
455}
456
457/*
 458 * Split a buddy into two new buddies that are each half the size of the parent buddy.
459 *
460 * @a must be locked.
461 */
462static int balloc_split_buddy(struct gk20a_allocator *a, struct gk20a_buddy *b,
463 int pte_size)
464{
465 struct gk20a_buddy *left, *right;
466 u64 half;
77 467
78 len = ALIGN(len, align); 468 left = balloc_new_buddy(a, b, b->start, b->order - 1);
79 if (!len) 469 if (!left)
80 return -ENOMEM; 470 return -ENOMEM;
81 471
82 down_write(&allocator->rw_sema); 472 half = (b->end - b->start) / 2;
83 473
84 _addr = bitmap_find_next_zero_area(allocator->bitmap, 474 right = balloc_new_buddy(a, b, b->start + half, b->order - 1);
85 allocator->limit - allocator->base + 1, 475 if (!right) {
86 *addr ? (*addr - allocator->base) : 0, 476 kmem_cache_free(buddy_cache, left);
87 len,
88 align - 1);
89 if ((_addr > allocator->limit - allocator->base + 1) ||
90 (*addr && *addr != (_addr + allocator->base))) {
91 up_write(&allocator->rw_sema);
92 return -ENOMEM; 477 return -ENOMEM;
93 } 478 }
94 479
95 bitmap_set(allocator->bitmap, _addr, len); 480 buddy_set_split(b);
96 *addr = allocator->base + _addr; 481 a->buddy_list_split[b->order]++;
97 482
98 up_write(&allocator->rw_sema); 483 b->left = left;
484 b->right = right;
485 left->buddy = right;
486 right->buddy = left;
487 left->parent = b;
488 right->parent = b;
99 489
100 allocator_dbg(allocator, "[out] addr %d, len %d", *addr, len); 490 /* PTE considerations. */
491 if (a->flags & GPU_BALLOC_GVA_SPACE &&
492 left->order <= a->pte_blk_order) {
493 left->pte_size = pte_size;
494 right->pte_size = pte_size;
495 }
496
497 balloc_blist_rem(a, b);
498 balloc_blist_add(a, left);
499 balloc_blist_add(a, right);
101 500
102 return 0; 501 return 0;
103} 502}
104 503
105/* free all blocks between start and end */ 504/*
106int gk20a_allocator_block_free(struct gk20a_allocator *allocator, 505 * Place the passed buddy into the RB tree for allocated buddies. Never fails
107 u32 addr, u32 len, u32 align) 506 * unless the passed entry is a duplicate which is a bug.
507 *
508 * @a must be locked.
509 */
510void balloc_alloc_buddy(struct gk20a_allocator *a, struct gk20a_buddy *b)
108{ 511{
109 allocator_dbg(allocator, "[in] addr %d, len %d", addr, len); 512 struct rb_node **new = &(a->alloced_buddies.rb_node);
513 struct rb_node *parent = NULL;
110 514
111 if (addr + len > allocator->limit || /* check addr range */ 515 while (*new) {
112 addr < allocator->base || 516 struct gk20a_buddy *bud = container_of(*new, struct gk20a_buddy,
113 addr & (align - 1)) /* check addr alignment */ 517 alloced_entry);
114 return -EINVAL;
115 518
116 len = ALIGN(len, align); 519 parent = *new;
117 if (!len) 520 if (b->start < bud->start)
118 return -EINVAL; 521 new = &((*new)->rb_left);
522 else if (b->start > bud->start)
523 new = &((*new)->rb_right);
524 else
525 BUG_ON("Duplicate entries in allocated list!\n");
526 }
527
528 rb_link_node(&b->alloced_entry, parent, new);
529 rb_insert_color(&b->alloced_entry, &a->alloced_buddies);
530
531 buddy_set_alloced(b);
532 a->buddy_list_alloced[b->order]++;
533}
534
535/*
536 * Remove the passed buddy from the allocated buddy RB tree. Returns the
537 * deallocated buddy for further processing.
538 *
539 * @a must be locked.
540 */
541static struct gk20a_buddy *balloc_free_buddy(struct gk20a_allocator *a,
542 u64 addr)
543{
544 struct rb_node *node = a->alloced_buddies.rb_node;
545 struct gk20a_buddy *bud;
546
547 while (node) {
548 bud = container_of(node, struct gk20a_buddy, alloced_entry);
549
550 if (addr < bud->start)
551 node = node->rb_left;
552 else if (addr > bud->start)
553 node = node->rb_right;
554 else
555 break;
556 }
557
558 if (!node)
559 return NULL;
560
561 rb_erase(node, &a->alloced_buddies);
562 buddy_clr_alloced(bud);
563 a->buddy_list_alloced[bud->order]--;
564
565 return bud;
566}
567
568/*
569 * Find a suitable buddy for the given order and PTE type (big or little).
570 */
571static struct gk20a_buddy *__balloc_find_buddy(struct gk20a_allocator *a,
572 u64 order, int pte_size)
573{
574 struct gk20a_buddy *bud;
575
576 if (list_empty(balloc_get_order_list(a, order)))
577 return NULL;
578
579 if (a->flags & GPU_BALLOC_GVA_SPACE &&
580 pte_size == BALLOC_PTE_SIZE_BIG)
581 bud = list_last_entry(balloc_get_order_list(a, order),
582 struct gk20a_buddy, buddy_entry);
583 else
584 bud = list_first_entry(balloc_get_order_list(a, order),
585 struct gk20a_buddy, buddy_entry);
586
587 if (bud->pte_size != BALLOC_PTE_SIZE_ANY &&
588 bud->pte_size != pte_size)
589 return NULL;
590
591 return bud;
592}
593
594/*
595 * Allocate a suitably sized buddy. If no suitable buddy exists split higher
596 * order buddies until we have a suitable buddy to allocate.
597 *
598 * For PDE grouping add an extra check to see if a buddy is suitable: that the
 599 * buddy exists in a PDE whose PTE size is reasonable.
600 *
601 * @a must be locked.
602 */
603static u64 __balloc_do_alloc(struct gk20a_allocator *a, u64 order, int pte_size)
604{
605 u64 split_order;
606 struct gk20a_buddy *bud;
607
608 split_order = order;
609 while (!(bud = __balloc_find_buddy(a, split_order, pte_size)))
610 split_order++;
611
612 while (bud->order != order) {
613 if (balloc_split_buddy(a, bud, pte_size))
614 return 0; /* No mem... */
615 bud = bud->left;
616 }
617
618 balloc_blist_rem(a, bud);
619 balloc_alloc_buddy(a, bud);
119 620
120 down_write(&allocator->rw_sema); 621 return bud->start;
121 bitmap_clear(allocator->bitmap, addr - allocator->base, len); 622}
122 up_write(&allocator->rw_sema); 623
624/*
625 * Allocate memory from the passed allocator.
626 */
627u64 gk20a_balloc(struct gk20a_allocator *a, u64 len)
628{
629 u64 order, addr;
630 int pte_size;
631
632 balloc_trace_func();
633
634 balloc_lock(a);
635
636 order = balloc_get_order(a, len);
637
638 if (order > a->max_order) {
639 balloc_unlock(a);
640 balloc_dbg(a, "Alloc fail\n");
641 balloc_trace_func_done();
642 return 0;
643 }
644
645 /*
646 * For now pass the base address of the allocator's region to
647 * __get_pte_size(). This ensures we get the right page size for
648 * the alloc but we don't have to know what the real address is
649 * going to be quite yet.
650 *
651 * TODO: once userspace supports a unified address space pass 0 for
652 * the base. This will make only 'len' affect the PTE size.
653 */
654 if (a->flags & GPU_BALLOC_GVA_SPACE)
655 pte_size = __get_pte_size(a->vm, a->base, len);
656 else
657 pte_size = BALLOC_PTE_SIZE_ANY;
658
659 addr = __balloc_do_alloc(a, order, pte_size);
660
661 a->bytes_alloced += len;
662 a->bytes_alloced_real += balloc_order_to_len(a, order);
663
664 balloc_unlock(a);
665 balloc_dbg(a, "Alloc 0x%-10llx %3lld:0x%-10llx pte_size=%s\n",
666 addr, order, len,
667 pte_size == gmmu_page_size_big ? "big" :
668 pte_size == gmmu_page_size_small ? "small" :
669 "NA/any");
670
671 balloc_trace_func_done();
672 return addr;
673}
674
675/*
676 * See if the passed range is actually available for allocation. If so, then
677 * return 1, otherwise return 0.
678 *
 679 * TODO: Right now this uses the suboptimal approach of going through all
680 * outstanding allocations and checking their base/ends. This could be better.
681 */
682static int balloc_is_range_free(struct gk20a_allocator *a, u64 base, u64 end)
683{
684 struct rb_node *node;
685 struct gk20a_buddy *bud;
686
687 node = rb_first(&a->alloced_buddies);
688 if (!node)
689 return 1; /* No allocs yet. */
690
691 bud = container_of(node, struct gk20a_buddy, alloced_entry);
692
693 while (bud->start < end) {
694 if ((bud->start > base && bud->start < end) ||
695 (bud->end > base && bud->end < end))
696 return 0;
697
698 node = rb_next(node);
699 if (!node)
700 break;
701 bud = container_of(node, struct gk20a_buddy, alloced_entry);
702 }
703
704 return 1;
705}
706
707static void balloc_alloc_fixed(struct gk20a_allocator *a,
708 struct gk20a_fixed_alloc *f)
709{
710 struct rb_node **new = &(a->fixed_allocs.rb_node);
711 struct rb_node *parent = NULL;
712
713 while (*new) {
714 struct gk20a_fixed_alloc *falloc =
715 container_of(*new, struct gk20a_fixed_alloc,
716 alloced_entry);
717
718 parent = *new;
719 if (f->start < falloc->start)
720 new = &((*new)->rb_left);
721 else if (f->start > falloc->start)
722 new = &((*new)->rb_right);
723 else
724 BUG_ON("Duplicate entries in allocated list!\n");
725 }
726
727 rb_link_node(&f->alloced_entry, parent, new);
728 rb_insert_color(&f->alloced_entry, &a->fixed_allocs);
729}
730
731/*
 732 * Remove the passed fixed allocation from the fixed-alloc RB tree. Returns
 733 * the fixed allocation for further processing.
734 *
735 * @a must be locked.
736 */
737static struct gk20a_fixed_alloc *balloc_free_fixed(struct gk20a_allocator *a,
738 u64 addr)
739{
740 struct rb_node *node = a->fixed_allocs.rb_node;
741 struct gk20a_fixed_alloc *falloc;
742
743 while (node) {
744 falloc = container_of(node,
745 struct gk20a_fixed_alloc, alloced_entry);
746
747 if (addr < falloc->start)
748 node = node->rb_left;
749 else if (addr > falloc->start)
750 node = node->rb_right;
751 else
752 break;
753 }
754
755 if (!node)
756 return NULL;
757
758 rb_erase(node, &a->fixed_allocs);
759
760 return falloc;
761}
762
763/*
764 * Find the parent range - doesn't necessarily need the parent to actually exist
765 * as a buddy. Finding an existing parent comes later...
766 */
767static void __balloc_get_parent_range(struct gk20a_allocator *a,
768 u64 base, u64 order,
769 u64 *pbase, u64 *porder)
770{
771 u64 base_mask;
772 u64 shifted_base = balloc_base_shift(a, base);
773
774 order++;
775 base_mask = ~((a->blk_size << order) - 1);
776
777 shifted_base &= base_mask;
778
779 *pbase = balloc_base_unshift(a, shifted_base);
780 *porder = order;
781}
782
783/*
784 * Makes a buddy at the passed address. This will make all parent buddies
785 * necessary for this buddy to exist as well.
786 */
787static struct gk20a_buddy *__balloc_make_fixed_buddy(struct gk20a_allocator *a,
788 u64 base, u64 order)
789{
790 struct gk20a_buddy *bud = NULL;
791 struct list_head *order_list;
792 u64 cur_order = order, cur_base = base;
793
794 /*
795 * Algo:
796 * 1. Keep jumping up a buddy order until we find the real buddy that
797 * this buddy exists in.
798 * 2. Then work our way down through the buddy tree until we hit a dead
799 * end.
800 * 3. Start splitting buddies until we split to the one we need to
801 * make.
802 */
803 while (cur_order <= a->max_order) {
804 int found = 0;
805
806 order_list = balloc_get_order_list(a, cur_order);
807 list_for_each_entry(bud, order_list, buddy_entry) {
808 if (bud->start == cur_base) {
809 found = 1;
810 break;
811 }
812 }
813
814 if (found)
815 break;
816
817 __balloc_get_parent_range(a, cur_base, cur_order,
818 &cur_base, &cur_order);
819 }
820
821 if (cur_order > a->max_order) {
822 balloc_dbg(a, "No buddy for range ???\n");
823 return NULL;
824 }
825
826 /* Split this buddy as necessary until we get the target buddy. */
827 while (bud->start != base || bud->order != order) {
828 if (balloc_split_buddy(a, bud, BALLOC_PTE_SIZE_ANY)) {
829 balloc_coalesce(a, bud);
830 return NULL;
831 }
832
833 if (base < bud->right->start)
834 bud = bud->left;
835 else
836 bud = bud->right;
837
838 }
839
840 return bud;
841}
842
843static u64 __balloc_do_alloc_fixed(struct gk20a_allocator *a,
844 struct gk20a_fixed_alloc *falloc,
845 u64 base, u64 len)
846{
847 u64 shifted_base, inc_base;
848 u64 align_order;
849
850 shifted_base = balloc_base_shift(a, base);
851 if (shifted_base == 0)
852 align_order = __fls(len >> a->blk_shift);
853 else
854 align_order = min_t(u64,
855 __ffs(shifted_base >> a->blk_shift),
856 __fls(len >> a->blk_shift));
857
858 if (align_order > a->max_order) {
859 balloc_dbg(a, "Align order too big: %llu > %llu\n",
860 align_order, a->max_order);
861 return 0;
862 }
863
864 /*
865 * Generate a list of buddies that satisfy this allocation.
866 */
867 inc_base = shifted_base;
868 while (inc_base < (shifted_base + len)) {
869 u64 order_len = balloc_order_to_len(a, align_order);
870 u64 remaining;
871 struct gk20a_buddy *bud;
872
873 bud = __balloc_make_fixed_buddy(a,
874 balloc_base_unshift(a, inc_base),
875 align_order);
876 if (!bud) {
877 balloc_dbg(a, "Fixed buddy failed: {0x%llx, %llu}!\n",
878 balloc_base_unshift(a, inc_base),
879 align_order);
880 goto err_and_cleanup;
881 }
882
883 balloc_blist_rem(a, bud);
884 balloc_alloc_buddy(a, bud);
885 __balloc_buddy_list_add(a, bud, &falloc->buddies);
886
887 /* Book keeping. */
888 inc_base += order_len;
889 remaining = (shifted_base + len) - inc_base;
890 align_order = __ffs(inc_base >> a->blk_shift);
891
892 /* If we don't have much left - trim down align_order. */
893 if (balloc_order_to_len(a, align_order) > remaining)
894 align_order = __balloc_max_order_in(a, inc_base,
895 inc_base + remaining);
896 }
897
898 return base;
123 899
124 allocator_dbg(allocator, "[out] addr %d, len %d", addr, len); 900err_and_cleanup:
901 while (!list_empty(&falloc->buddies)) {
902 struct gk20a_buddy *bud = list_first_entry(&falloc->buddies,
903 struct gk20a_buddy,
904 buddy_entry);
905
906 __balloc_buddy_list_rem(a, bud);
907 balloc_free_buddy(a, bud->start);
908 kmem_cache_free(buddy_cache, bud);
909 }
910
911 return 0;
912}
913
914/*
915 * Allocate a fixed address allocation. The address of the allocation is @base
916 * and the length is @len. This is not a typical buddy allocator operation and
 917 * as such has a high possibility of failure if the address space is heavily in
918 * use.
919 *
920 * Please do not use this function unless _absolutely_ necessary.
921 */
922u64 gk20a_balloc_fixed(struct gk20a_allocator *a, u64 base, u64 len)
923{
924 struct gk20a_fixed_alloc *falloc = NULL;
925 struct gk20a_buddy *bud;
926 u64 ret, real_bytes = 0;
927
928 balloc_trace_func();
929
930 /* If base isn't aligned to an order 0 block, fail. */
931 if (base & (a->blk_size - 1))
932 goto fail;
933
934 if (len == 0)
935 goto fail;
936
937 falloc = kmalloc(sizeof(*falloc), GFP_KERNEL);
938 if (!falloc)
939 goto fail;
940
941 INIT_LIST_HEAD(&falloc->buddies);
942 falloc->start = base;
943 falloc->end = base + len;
944
945 balloc_lock(a);
946 if (!balloc_is_range_free(a, base, base + len)) {
947 balloc_dbg(a, "Range not free: 0x%llx -> 0x%llx\n",
948 base, base + len);
949 goto fail_unlock;
950 }
951
952 ret = __balloc_do_alloc_fixed(a, falloc, base, len);
953 if (!ret) {
954 balloc_dbg(a, "Alloc-fixed failed ?? 0x%llx -> 0x%llx\n",
955 base, base + len);
956 goto fail_unlock;
957 }
958
959 balloc_alloc_fixed(a, falloc);
960
961 list_for_each_entry(bud, &falloc->buddies, buddy_entry)
962 real_bytes += (bud->end - bud->start);
963
964 a->bytes_alloced += len;
965 a->bytes_alloced_real += real_bytes;
966
967 balloc_unlock(a);
968 balloc_dbg(a, "Alloc (fixed) 0x%llx\n", base);
969
970 balloc_trace_func_done();
971 return base;
972
973fail_unlock:
974 balloc_unlock(a);
975fail:
976 kfree(falloc);
977 balloc_trace_func_done();
978 return 0;
979}
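/*
 * Illustrative call (hypothetical caller, not from this patch): reserving a
 * caller-chosen GPU VA range; 'vma', 'map_offset' and 'size' are assumed
 * names here.
 *
 *	offset = gk20a_balloc_fixed(vma, map_offset, size);
 *	if (!offset)
 *		return -ENOMEM;
 */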
980
981static void __balloc_do_free_fixed(struct gk20a_allocator *a,
982 struct gk20a_fixed_alloc *falloc)
983{
984 struct gk20a_buddy *bud;
985
986 while (!list_empty(&falloc->buddies)) {
987 bud = list_first_entry(&falloc->buddies,
988 struct gk20a_buddy,
989 buddy_entry);
990 __balloc_buddy_list_rem(a, bud);
991
992 balloc_free_buddy(a, bud->start);
993 balloc_blist_add(a, bud);
994 a->bytes_freed += balloc_order_to_len(a, bud->order);
995
996 /*
 997 * Attempt to defrag the allocation.
998 */
999 balloc_coalesce(a, bud);
1000 }
1001
1002 kfree(falloc);
1003}
1004
1005/*
1006 * Free the passed allocation.
1007 */
1008void gk20a_bfree(struct gk20a_allocator *a, u64 addr)
1009{
1010 struct gk20a_buddy *bud;
1011 struct gk20a_fixed_alloc *falloc;
1012
1013 balloc_trace_func();
1014
1015 if (!addr) {
1016 balloc_trace_func_done();
1017 return;
1018 }
1019
1020 balloc_lock(a);
1021
1022 /*
1023 * First see if this is a fixed alloc. If not fall back to a regular
1024 * buddy.
1025 */
1026 falloc = balloc_free_fixed(a, addr);
1027 if (falloc) {
1028 __balloc_do_free_fixed(a, falloc);
1029 goto done;
1030 }
1031
1032 bud = balloc_free_buddy(a, addr);
1033 if (!bud)
1034 goto done;
1035
1036 balloc_blist_add(a, bud);
1037 a->bytes_freed += balloc_order_to_len(a, bud->order);
1038
1039 /*
 1040 * Attempt to defrag the allocation.
1041 */
1042 balloc_coalesce(a, bud);
1043
1044done:
1045 balloc_unlock(a);
1046 balloc_dbg(a, "Free 0x%llx\n", addr);
1047 balloc_trace_func_done();
1048 return;
1049}
1050
1051/*
1052 * Print the buddy allocator top level stats. If you pass @s as NULL then the
1053 * stats are printed to the kernel log. This lets this code be used for
1054 * debugging purposes internal to the allocator.
1055 */
1056static void balloc_print_stats(struct gk20a_allocator *a, struct seq_file *s,
1057 int lock)
1058{
1059#define __balloc_pstat(s, fmt, arg...) \
1060 do { \
1061 if (s) \
1062 seq_printf(s, fmt, ##arg); \
1063 else \
1064 balloc_dbg(a, fmt, ##arg); \
1065 } while (0)
1066
1067 int i;
1068 struct rb_node *node;
1069 struct gk20a_fixed_alloc *falloc;
1070
1071 __balloc_pstat(s, "base = %llu, limit = %llu, blk_size = %llu\n",
1072 a->base, a->length, a->blk_size);
1073 __balloc_pstat(s, "Internal params:\n");
1074 __balloc_pstat(s, " start = %llu\n", a->start);
1075 __balloc_pstat(s, " end = %llu\n", a->end);
1076 __balloc_pstat(s, " count = %llu\n", a->count);
1077 __balloc_pstat(s, " blks = %llu\n", a->blks);
1078 __balloc_pstat(s, " max_order = %llu\n", a->max_order);
1079
1080 __balloc_pstat(s, "Buddy blocks:\n");
1081 __balloc_pstat(s, " Order Free Alloced Split\n");
1082 __balloc_pstat(s, " ----- ---- ------- -----\n");
1083
1084 if (lock)
1085 balloc_lock(a);
1086 for (i = a->max_order; i >= 0; i--) {
1087 if (a->buddy_list_len[i] == 0 &&
1088 a->buddy_list_alloced[i] == 0 &&
1089 a->buddy_list_split[i] == 0)
1090 continue;
1091
1092 __balloc_pstat(s, " %3d %-7llu %-9llu %llu\n", i,
1093 a->buddy_list_len[i],
1094 a->buddy_list_alloced[i],
1095 a->buddy_list_split[i]);
1096 }
1097
1098 __balloc_pstat(s, "\n");
1099
1100 for (node = rb_first(&a->fixed_allocs), i = 1;
1101 node != NULL;
1102 node = rb_next(node)) {
1103 falloc = container_of(node,
1104 struct gk20a_fixed_alloc, alloced_entry);
1105
1106 __balloc_pstat(s, "Fixed alloc (%d): [0x%llx -> 0x%llx]\n",
1107 i, falloc->start, falloc->end);
1108 }
1109
1110 __balloc_pstat(s, "\n");
1111 __balloc_pstat(s, "Bytes allocated: %llu\n", a->bytes_alloced);
1112 __balloc_pstat(s, "Bytes allocated (real): %llu\n",
1113 a->bytes_alloced_real);
1114 __balloc_pstat(s, "Bytes freed: %llu\n", a->bytes_freed);
1115
1116 if (lock)
1117 balloc_unlock(a);
1118
1119#undef __balloc_pstats
1120}
1121
1122static int __alloc_show(struct seq_file *s, void *unused)
1123{
1124 struct gk20a_allocator *a = s->private;
1125
1126 balloc_print_stats(a, s, 1);
125 1127
126 return 0; 1128 return 0;
127} 1129}
1130
1131static int __alloc_open(struct inode *inode, struct file *file)
1132{
1133 return single_open(file, __alloc_show, inode->i_private);
1134}
1135
1136static const struct file_operations __alloc_fops = {
1137 .open = __alloc_open,
1138 .read = seq_read,
1139 .llseek = seq_lseek,
1140 .release = single_release,
1141};
1142
1143static void balloc_init_alloc_debug(struct gk20a_allocator *a)
1144{
1145 if (!balloc_debugfs_root)
1146 return;
1147
1148 a->debugfs_entry = debugfs_create_file(a->name, S_IRUGO,
1149 balloc_debugfs_root,
1150 a, &__alloc_fops);
1151}
1152
1153void gk20a_alloc_debugfs_init(struct platform_device *pdev)
1154{
1155 struct gk20a_platform *platform = platform_get_drvdata(pdev);
1156 struct dentry *gpu_root = platform->debugfs;
1157
1158 balloc_debugfs_root = debugfs_create_dir("allocators", gpu_root);
1159 if (IS_ERR_OR_NULL(balloc_debugfs_root))
1160 return;
1161
1162 debugfs_create_u32("tracing", 0664, balloc_debugfs_root,
1163 &balloc_tracing_on);
1164}
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
index 69a227bd..e86e053b 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 2 * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify it 4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License, 5 * under the terms and conditions of the GNU General Public License,
@@ -17,75 +17,190 @@
17#ifndef GK20A_ALLOCATOR_H 17#ifndef GK20A_ALLOCATOR_H
18#define GK20A_ALLOCATOR_H 18#define GK20A_ALLOCATOR_H
19 19
20#include <linux/list.h>
20#include <linux/rbtree.h> 21#include <linux/rbtree.h>
21#include <linux/rwsem.h> 22#include <linux/debugfs.h>
22#include <linux/slab.h> 23#include <linux/platform_device.h>
23 24
24/* #define ALLOCATOR_DEBUG */ 25/* #define ALLOCATOR_DEBUG */
25 26
26/* main struct */ 27/*
28 * Each buddy is an element in a binary tree.
29 */
30struct gk20a_buddy {
31 struct gk20a_buddy *parent; /* Parent node. */
32 struct gk20a_buddy *buddy; /* This node's buddy. */
33 struct gk20a_buddy *left; /* Lower address sub-node. */
34 struct gk20a_buddy *right; /* Higher address sub-node. */
35
36 struct list_head buddy_entry; /* List entry for various lists. */
37 struct rb_node alloced_entry; /* RB tree of allocations. */
38
39 u64 start; /* Start address of this buddy. */
40 u64 end; /* End address of this buddy. */
41 u64 order; /* Buddy order. */
42
43#define BALLOC_BUDDY_ALLOCED 0x1
44#define BALLOC_BUDDY_SPLIT 0x2
45#define BALLOC_BUDDY_IN_LIST 0x4
46 int flags; /* List of associated flags. */
47
48 /*
49 * Size of the PDE this buddy is using. This allows for grouping like
50 * sized allocations into the same PDE.
51 */
52#define BALLOC_PTE_SIZE_ANY 0x0
53#define BALLOC_PTE_SIZE_SMALL 0x1
54#define BALLOC_PTE_SIZE_BIG 0x2
55 int pte_size;
56};
57
58#define __buddy_flag_ops(flag, flag_up) \
59 static inline int buddy_is_ ## flag(struct gk20a_buddy *b) \
60 { \
61 return b->flags & BALLOC_BUDDY_ ## flag_up; \
62 } \
63 static inline void buddy_set_ ## flag(struct gk20a_buddy *b) \
64 { \
65 b->flags |= BALLOC_BUDDY_ ## flag_up; \
66 } \
67 static inline void buddy_clr_ ## flag(struct gk20a_buddy *b) \
68 { \
69 b->flags &= ~BALLOC_BUDDY_ ## flag_up; \
70 }
71
72/*
73 * int buddy_is_alloced(struct gk20a_buddy *b);
74 * void buddy_set_alloced(struct gk20a_buddy *b);
75 * void buddy_clr_alloced(struct gk20a_buddy *b);
76 *
77 * int buddy_is_split(struct gk20a_buddy *b);
78 * void buddy_set_split(struct gk20a_buddy *b);
79 * void buddy_clr_split(struct gk20a_buddy *b);
80 *
81 * int buddy_is_in_list(struct gk20a_buddy *b);
82 * void buddy_set_in_list(struct gk20a_buddy *b);
83 * void buddy_clr_in_list(struct gk20a_buddy *b);
84 */
85__buddy_flag_ops(alloced, ALLOCED);
86__buddy_flag_ops(split, SPLIT);
87__buddy_flag_ops(in_list, IN_LIST);
88
89/*
90 * Keeps info for a fixed allocation.
91 */
92struct gk20a_fixed_alloc {
93 struct list_head buddies; /* List of buddies. */
94 struct rb_node alloced_entry; /* RB tree of fixed allocations. */
95
96 u64 start; /* Start of fixed block. */
97 u64 end; /* End address. */
98};
99
100struct vm_gk20a;
101
102/*
103 * GPU buddy allocator for the various GPU address spaces. Each addressable unit
104 * doesn't have to correspond to a byte. In some cases each unit is a more
105 * complex object such as a comp_tag line or the like.
106 *
107 * The max order is computed based on the size of the minimum order and the size
108 * of the address space.
109 *
110 * order_size is the size of an order 0 buddy.
111 */
27struct gk20a_allocator { 112struct gk20a_allocator {
28 113
29 char name[32]; /* name for allocator */ 114 struct vm_gk20a *vm; /* Parent VM - can be NULL. */
30 struct rb_root rb_root; /* rb tree root for blocks */
31 115
32 u32 base; /* min value of this linear space */ 116 char name[32]; /* Name of allocator. */
33 u32 limit; /* max value = limit - 1 */
34 117
35 unsigned long *bitmap; /* bitmap */ 118 u64 base; /* Base address of the space. */
119 u64 length; /* Length of the space. */
120 u64 blk_size; /* Size of order 0 allocation. */
121 u64 blk_shift; /* Shift to divide by blk_size. */
36 122
37 struct gk20a_alloc_block *block_first; /* first block in list */ 123 int init; /* Non-zero if initialized. */
38 struct gk20a_alloc_block *block_recent; /* last visited block */
39 124
40 u32 first_free_addr; /* first free addr, non-contigous 125 /* Internal stuff. */
41 allocation preferred start, 126 u64 start; /* Real start (aligned to blk_size). */
42 in order to pick up small holes */ 127 u64 end; /* Real end, trimmed if needed. */
43 u32 last_free_addr; /* last free addr, contiguous 128 u64 count; /* Count of objects in space. */
44 allocation preferred start */ 129 u64 blks; /* Count of blks in the space. */
45 u32 cached_hole_size; /* max free hole size up to 130 u64 max_order; /* Specific maximum order. */
46 last_free_addr */
47 u32 block_count; /* number of blocks */
48 131
49 struct rw_semaphore rw_sema; /* lock */ 132 struct rb_root alloced_buddies; /* Outstanding allocations. */
50 struct kmem_cache *block_cache; /* slab cache */ 133 struct rb_root fixed_allocs; /* Outstanding fixed allocations. */
51 134
52 /* if enabled, constrain to [base, limit) */ 135 struct mutex lock; /* Protects buddy access. */
53 struct {
54 bool enable;
55 u32 base;
56 u32 limit;
57 } constraint;
58 136
59 int (*alloc)(struct gk20a_allocator *allocator, 137#define GPU_BALLOC_GVA_SPACE 0x1
60 u32 *addr, u32 len, u32 align); 138 u64 flags;
61 int (*free)(struct gk20a_allocator *allocator,
62 u32 addr, u32 len, u32 align);
63 139
64}; 140 /*
141 * Impose an upper bound on the maximum order.
142 */
143#define GPU_BALLOC_MAX_ORDER 31
144#define GPU_BALLOC_ORDER_LIST_LEN (GPU_BALLOC_MAX_ORDER + 1)
65 145
66int gk20a_allocator_init(struct gk20a_allocator *allocator, 146 struct list_head buddy_list[GPU_BALLOC_ORDER_LIST_LEN];
67 const char *name, u32 base, u32 size); 147 u64 buddy_list_len[GPU_BALLOC_ORDER_LIST_LEN];
68void gk20a_allocator_destroy(struct gk20a_allocator *allocator); 148 u64 buddy_list_split[GPU_BALLOC_ORDER_LIST_LEN];
149 u64 buddy_list_alloced[GPU_BALLOC_ORDER_LIST_LEN];
69 150
70int gk20a_allocator_block_alloc(struct gk20a_allocator *allocator, 151 /*
71 u32 *addr, u32 len, u32 align); 152 * This is for when the allocator is managing a GVA space (the
153 * GPU_BALLOC_GVA_SPACE bit is set in @flags). This requires
154 * that we group like sized allocations into PDE blocks.
155 */
156 u64 pte_blk_order;
72 157
73int gk20a_allocator_block_free(struct gk20a_allocator *allocator, 158 struct dentry *debugfs_entry;
74 u32 addr, u32 len, u32 align);
75 159
76#if defined(ALLOCATOR_DEBUG) 160 u64 bytes_alloced;
161 u64 bytes_alloced_real;
162 u64 bytes_freed;
163};
77 164
78#define allocator_dbg(alloctor, format, arg...) \ 165#define balloc_lock(a) mutex_lock(&(a)->lock)
79do { \ 166#define balloc_unlock(a) mutex_unlock(&(a)->lock)
80 if (1) \
81 pr_debug("gk20a_allocator (%s) %s: " format "\n",\
82 alloctor->name, __func__, ##arg);\
83} while (0)
84 167
85#else /* ALLOCATOR_DEBUG */ 168#define balloc_get_order_list(a, order) (&(a)->buddy_list[(order)])
169#define balloc_order_to_len(a, order) ((1 << order) * (a)->blk_size)
170#define balloc_base_shift(a, base) ((base) - (a)->start)
171#define balloc_base_unshift(a, base) ((base) + (a)->start)
86 172
87#define allocator_dbg(format, arg...) 173int gk20a_allocator_init(struct gk20a_allocator *allocator,
174 const char *name, u64 base, u64 size, u64 order0);
175int __gk20a_allocator_init(struct gk20a_allocator *allocator,
176 struct vm_gk20a *vm, const char *name,
177 u64 base, u64 size, u64 order0,
178 u64 max_order, u64 flags);
179void gk20a_allocator_destroy(struct gk20a_allocator *allocator);
88 180
89#endif /* ALLOCATOR_DEBUG */ 181/*
182 * Normal alloc/free operations for the buddy allocator.
183 */
184u64 gk20a_balloc(struct gk20a_allocator *allocator, u64 len);
185void gk20a_bfree(struct gk20a_allocator *allocator, u64 addr);
186
187/*
 188 * Special interface to allocate a memory region with a specific starting
189 * address. Yikes.
190 */
191u64 gk20a_balloc_fixed(struct gk20a_allocator *allocator, u64 base, u64 len);
192
193/*
194 * Debugfs init.
195 */
196void gk20a_alloc_debugfs_init(struct platform_device *pdev);
197
198#if defined(ALLOCATOR_DEBUG)
199#define balloc_dbg(alloctor, format, arg...) \
200 pr_info("%-25s %25s() " format, \
201 alloctor->name, __func__, ##arg)
202#else
203#define balloc_dbg(allocator, format, arg...)
204#endif
90 205
91#endif /* GK20A_ALLOCATOR_H */ 206#endif /* GK20A_ALLOCATOR_H */
diff --git a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
index c5d0f0c4..bcadde93 100644
--- a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
@@ -89,9 +89,8 @@ static int gk20a_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
89 if (err) 89 if (err)
90 return err; 90 return err;
91 91
92 gk20a_allocator_init(&gr->comp_tags, "comptag", 92 __gk20a_allocator_init(&gr->comp_tags, NULL, "comptag",
93 1, /* start */ 93 1, max_comptag_lines - 1, 1, 10, 0);
94 max_comptag_lines - 1); /* length*/
95 94
96 gr->comptags_per_cacheline = comptags_per_cacheline; 95 gr->comptags_per_cacheline = comptags_per_cacheline;
97 gr->slices_per_ltc = slices_per_fbp / g->ltc_count; 96 gr->slices_per_ltc = slices_per_fbp / g->ltc_count;
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 5d1ff563..c11414b5 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -132,10 +132,8 @@ static void gk20a_mm_delete_priv(void *_priv)
132 132
133 if (priv->comptags.lines) { 133 if (priv->comptags.lines) {
134 BUG_ON(!priv->comptag_allocator); 134 BUG_ON(!priv->comptag_allocator);
135 priv->comptag_allocator->free(priv->comptag_allocator, 135 gk20a_bfree(priv->comptag_allocator,
136 priv->comptags.offset, 136 priv->comptags.real_offset);
137 priv->comptags.allocated_lines,
138 1);
139 } 137 }
140 138
141 /* Free buffer states */ 139 /* Free buffer states */
@@ -224,10 +222,9 @@ static int gk20a_alloc_comptags(struct gk20a *g,
224 u32 lines, bool user_mappable) 222 u32 lines, bool user_mappable)
225{ 223{
226 struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev); 224 struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
227 u32 offset = 0;
228 int err;
229 u32 ctaglines_to_allocate; 225 u32 ctaglines_to_allocate;
230 u32 ctagline_align; 226 u32 ctagline_align = 1;
227 u32 offset;
231 const u32 aggregate_cacheline_sz = 228 const u32 aggregate_cacheline_sz =
232 g->gr.cacheline_size * g->gr.slices_per_ltc * 229 g->gr.cacheline_size * g->gr.slices_per_ltc *
233 g->ltc_count; 230 g->ltc_count;
@@ -241,7 +238,6 @@ static int gk20a_alloc_comptags(struct gk20a *g,
241 238
242 if (!user_mappable) { 239 if (!user_mappable) {
243 ctaglines_to_allocate = lines; 240 ctaglines_to_allocate = lines;
244 ctagline_align = 1;
245 } else { 241 } else {
246 /* Unfortunately, we cannot use allocation alignment 242 /* Unfortunately, we cannot use allocation alignment
247 * here, since compbits per cacheline is not always a 243 * here, since compbits per cacheline is not always a
@@ -273,71 +269,25 @@ static int gk20a_alloc_comptags(struct gk20a *g,
273 269
274 if (ctaglines_to_allocate < lines) 270 if (ctaglines_to_allocate < lines)
275 return -EINVAL; /* integer overflow */ 271 return -EINVAL; /* integer overflow */
272 pr_info("user-mapped CTAGS: %u\n", ctaglines_to_allocate);
276 } 273 }
277 274
278 /* store the allocator so we can use it when we free the ctags */ 275 /* store the allocator so we can use it when we free the ctags */
279 priv->comptag_allocator = allocator; 276 priv->comptag_allocator = allocator;
280 err = allocator->alloc(allocator, &offset, 277 offset = gk20a_balloc(allocator, ctaglines_to_allocate);
281 ctaglines_to_allocate, 1); 278 if (!offset)
282 if (!err) { 279 return -ENOMEM;
283 const u32 alignment_lines =
284 DIV_ROUND_UP(offset, ctagline_align) * ctagline_align -
285 offset;
286
287 /* prune the preceding ctaglines that were allocated
288 for alignment */
289 if (alignment_lines) {
290 /* free alignment lines */
291 int tmp=
292 allocator->free(allocator, offset,
293 alignment_lines,
294 1);
295 WARN_ON(tmp);
296
297 offset += alignment_lines;
298 ctaglines_to_allocate -= alignment_lines;
299 }
300 280
301 /* check if we can prune the trailing, too */ 281 priv->comptags.lines = lines;
302 if (user_mappable) 282 priv->comptags.real_offset = offset;
303 {
304 u32 needed_cachelines =
305 DIV_ROUND_UP(lines, g->gr.comptags_per_cacheline);
306
307 u32 first_unneeded_cacheline =
308 DIV_ROUND_UP(round_up(needed_cachelines *
309 aggregate_cacheline_sz,
310 small_pgsz),
311 aggregate_cacheline_sz);
312 u32 needed_ctaglines =
313 first_unneeded_cacheline *
314 g->gr.comptags_per_cacheline;
315
316 if (needed_ctaglines < ctaglines_to_allocate) {
317 /* free alignment lines */
318 int tmp=
319 allocator->free(
320 allocator,
321 offset + needed_ctaglines,
322 (ctaglines_to_allocate -
323 needed_ctaglines),
324 1);
325 WARN_ON(tmp);
326
327 ctaglines_to_allocate = needed_ctaglines;
328 }
329 }
330
331 priv->comptags.offset = offset;
332 priv->comptags.lines = lines;
333 priv->comptags.allocated_lines = ctaglines_to_allocate;
334 priv->comptags.user_mappable = user_mappable;
335 }
336 return err;
337}
338 283
284 if (user_mappable)
285 offset = DIV_ROUND_UP(offset, ctagline_align) * ctagline_align;
339 286
287 priv->comptags.offset = offset;
340 288
289 return 0;
290}
341 291
342static int gk20a_init_mm_reset_enable_hw(struct gk20a *g) 292static int gk20a_init_mm_reset_enable_hw(struct gk20a *g)
343{ 293{
@@ -889,14 +839,12 @@ static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
889} 839}
890 840
891u64 gk20a_vm_alloc_va(struct vm_gk20a *vm, 841u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
892 u64 size, 842 u64 size,
893 enum gmmu_pgsz_gk20a gmmu_pgsz_idx) 843 enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
894 844
895{ 845{
896 struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx]; 846 struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx];
897 int err;
898 u64 offset; 847 u64 offset;
899 u32 start_page_nr = 0, num_pages;
900 u64 gmmu_page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx]; 848 u64 gmmu_page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
901 849
902 if (gmmu_pgsz_idx >= gmmu_nr_page_sizes) { 850 if (gmmu_pgsz_idx >= gmmu_nr_page_sizes) {
@@ -912,28 +860,19 @@ u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
912 860
913 } 861 }
914 862
915 /* be certain we round up to gmmu_page_size if needed */ 863 /* Be certain we round up to gmmu_page_size if needed */
916 /* TBD: DIV_ROUND_UP -> undefined reference to __aeabi_uldivmod */
917 size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1); 864 size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1);
918
919 gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size, 865 gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size,
920 vm->gmmu_page_sizes[gmmu_pgsz_idx]>>10); 866 vm->gmmu_page_sizes[gmmu_pgsz_idx]>>10);
921 867
922 /* The vma allocator represents page accounting. */ 868 offset = gk20a_balloc(vma, size);
923 num_pages = size >> ilog2(vm->gmmu_page_sizes[gmmu_pgsz_idx]); 869 if (!offset) {
924
925 err = vma->alloc(vma, &start_page_nr, num_pages, 1);
926
927 if (err) {
928 gk20a_err(dev_from_vm(vm), 870 gk20a_err(dev_from_vm(vm),
929 "%s oom: sz=0x%llx", vma->name, size); 871 "%s oom: sz=0x%llx", vma->name, size);
930 return 0; 872 return 0;
931 } 873 }
932 874
933 offset = (u64)start_page_nr <<
934 ilog2(vm->gmmu_page_sizes[gmmu_pgsz_idx]);
935 gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset); 875 gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset);
936
937 return offset; 876 return offset;
938} 877}
939 878
@@ -942,25 +881,12 @@ int gk20a_vm_free_va(struct vm_gk20a *vm,
942 enum gmmu_pgsz_gk20a pgsz_idx) 881 enum gmmu_pgsz_gk20a pgsz_idx)
943{ 882{
944 struct gk20a_allocator *vma = &vm->vma[pgsz_idx]; 883 struct gk20a_allocator *vma = &vm->vma[pgsz_idx];
945 u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
946 u32 page_shift = ilog2(page_size);
947 u32 start_page_nr, num_pages;
948 int err;
949 884
950 gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx", 885 gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx",
951 vma->name, offset, size); 886 vma->name, offset, size);
887 gk20a_bfree(vma, offset);
952 888
953 start_page_nr = (u32)(offset >> page_shift); 889 return 0;
954 num_pages = (u32)((size + page_size - 1) >> page_shift);
955
956 err = vma->free(vma, start_page_nr, num_pages, 1);
957 if (err) {
958 gk20a_err(dev_from_vm(vm),
959 "not found: offset=0x%llx, sz=0x%llx",
960 offset, size);
961 }
962
963 return err;
964} 890}
965 891
966static int insert_mapped_buffer(struct rb_root *root, 892static int insert_mapped_buffer(struct rb_root *root,
@@ -1136,7 +1062,7 @@ static int validate_fixed_buffer(struct vm_gk20a *vm,
1136 1062
1137 if (map_offset & (vm->gmmu_page_sizes[bfr->pgsz_idx] - 1)) { 1063 if (map_offset & (vm->gmmu_page_sizes[bfr->pgsz_idx] - 1)) {
1138 gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx", 1064 gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx",
1139 map_offset); 1065 map_offset);
1140 return -EINVAL; 1066 return -EINVAL;
1141 } 1067 }
1142 1068
@@ -2433,7 +2359,6 @@ int gk20a_init_vm(struct mm_gk20a *mm,
2433 char *name) 2359 char *name)
2434{ 2360{
2435 int err, i; 2361 int err, i;
2436 u32 num_small_pages, num_large_pages, low_hole_pages;
2437 char alloc_name[32]; 2362 char alloc_name[32];
2438 u64 small_vma_size, large_vma_size; 2363 u64 small_vma_size, large_vma_size;
2439 u32 pde_lo, pde_hi; 2364 u32 pde_lo, pde_hi;
@@ -2494,34 +2419,31 @@ int gk20a_init_vm(struct mm_gk20a *mm,
2494 large_vma_size = vm->va_limit - small_vma_size; 2419 large_vma_size = vm->va_limit - small_vma_size;
2495 } 2420 }
2496 2421
2497 num_small_pages = (u32)(small_vma_size >>
2498 ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
2499
2500 /* num_pages above is without regard to the low-side hole. */
2501 low_hole_pages = (vm->va_start >>
2502 ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
2503
2504 snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name, 2422 snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name,
2505 vm->gmmu_page_sizes[gmmu_page_size_small]>>10); 2423 vm->gmmu_page_sizes[gmmu_page_size_small]>>10);
2506 err = gk20a_allocator_init(&vm->vma[gmmu_page_size_small], 2424 err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_small],
2507 alloc_name, 2425 vm, alloc_name,
2508 low_hole_pages, /*start*/ 2426 vm->va_start,
2509 num_small_pages - low_hole_pages);/* length*/ 2427 small_vma_size - vm->va_start,
2428 SZ_4K,
2429 GPU_BALLOC_MAX_ORDER,
2430 GPU_BALLOC_GVA_SPACE);
2510 if (err) 2431 if (err)
2511 goto clean_up_ptes; 2432 goto clean_up_ptes;
2512 2433
2513 if (big_pages) { 2434 if (big_pages) {
2514 u32 start = (u32)(small_vma_size >>
2515 ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
2516 num_large_pages = (u32)(large_vma_size >>
2517 ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
2518
2519 snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", 2435 snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB",
2520 name, vm->gmmu_page_sizes[gmmu_page_size_big]>>10); 2436 name, vm->gmmu_page_sizes[gmmu_page_size_big]>>10);
2521 err = gk20a_allocator_init(&vm->vma[gmmu_page_size_big], 2437 /*
2522 alloc_name, 2438 * Big page VMA starts at the end of the small page VMA.
2523 start, /* start */ 2439 */
2524 num_large_pages); /* length */ 2440 err = __gk20a_allocator_init(&vm->vma[gmmu_page_size_big],
2441 vm, alloc_name,
2442 small_vma_size,
2443 large_vma_size,
2444 big_page_size,
2445 GPU_BALLOC_MAX_ORDER,
2446 GPU_BALLOC_GVA_SPACE);
2525 if (err) 2447 if (err)
2526 goto clean_up_small_allocator; 2448 goto clean_up_small_allocator;
2527 } 2449 }
@@ -2602,9 +2524,9 @@ int gk20a_vm_release_share(struct gk20a_as_share *as_share)
2602int gk20a_vm_alloc_space(struct gk20a_as_share *as_share, 2524int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
2603 struct nvgpu_as_alloc_space_args *args) 2525 struct nvgpu_as_alloc_space_args *args)
2604 2526
2605{ int err = -ENOMEM; 2527{
2528 int err = -ENOMEM;
2606 int pgsz_idx = gmmu_page_size_small; 2529 int pgsz_idx = gmmu_page_size_small;
2607 u32 start_page_nr;
2608 struct gk20a_allocator *vma; 2530 struct gk20a_allocator *vma;
2609 struct vm_gk20a *vm = as_share->vm; 2531 struct vm_gk20a *vm = as_share->vm;
2610 struct gk20a *g = vm->mm->g; 2532 struct gk20a *g = vm->mm->g;
@@ -2635,21 +2557,19 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
2635 goto clean_up; 2557 goto clean_up;
2636 } 2558 }
2637 2559
2638 start_page_nr = 0; 2560 vma = &vm->vma[pgsz_idx];
2639 if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET) 2561 if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET)
2640 start_page_nr = (u32)(args->o_a.offset >> 2562 vaddr_start = gk20a_balloc_fixed(vma, args->o_a.offset,
2641 ilog2(vm->gmmu_page_sizes[pgsz_idx])); 2563 (u64)args->pages *
2564 (u64)args->page_size);
2565 else
2566 vaddr_start = gk20a_balloc(vma, args->pages * args->page_size);
2642 2567
2643 vma = &vm->vma[pgsz_idx]; 2568 if (!vaddr_start) {
2644 err = vma->alloc(vma, &start_page_nr, args->pages, 1);
2645 if (err) {
2646 kfree(va_node); 2569 kfree(va_node);
2647 goto clean_up; 2570 goto clean_up;
2648 } 2571 }
2649 2572
2650 vaddr_start = (u64)start_page_nr <<
2651 ilog2(vm->gmmu_page_sizes[pgsz_idx]);
2652
2653 va_node->vaddr_start = vaddr_start; 2573 va_node->vaddr_start = vaddr_start;
2654 va_node->size = (u64)args->page_size * (u64)args->pages; 2574 va_node->size = (u64)args->page_size * (u64)args->pages;
2655 va_node->pgsz_idx = pgsz_idx; 2575 va_node->pgsz_idx = pgsz_idx;
@@ -2673,7 +2593,7 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
2673 true); 2593 true);
2674 if (!map_offset) { 2594 if (!map_offset) {
2675 mutex_unlock(&vm->update_gmmu_lock); 2595 mutex_unlock(&vm->update_gmmu_lock);
2676 vma->free(vma, start_page_nr, args->pages, 1); 2596 gk20a_bfree(vma, vaddr_start);
2677 kfree(va_node); 2597 kfree(va_node);
2678 goto clean_up; 2598 goto clean_up;
2679 } 2599 }
@@ -2685,6 +2605,7 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
2685 mutex_unlock(&vm->update_gmmu_lock); 2605 mutex_unlock(&vm->update_gmmu_lock);
2686 2606
2687 args->o_a.offset = vaddr_start; 2607 args->o_a.offset = vaddr_start;
2608 err = 0;
2688 2609
2689clean_up: 2610clean_up:
2690 return err; 2611 return err;
@@ -2695,7 +2616,6 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
2695{ 2616{
2696 int err = -ENOMEM; 2617 int err = -ENOMEM;
2697 int pgsz_idx; 2618 int pgsz_idx;
2698 u32 start_page_nr;
2699 struct gk20a_allocator *vma; 2619 struct gk20a_allocator *vma;
2700 struct vm_gk20a *vm = as_share->vm; 2620 struct vm_gk20a *vm = as_share->vm;
2701 struct vm_reserved_va_node *va_node; 2621 struct vm_reserved_va_node *va_node;
@@ -2708,14 +2628,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
2708 pgsz_idx = __nv_gmmu_va_is_upper(vm, args->offset) ? 2628 pgsz_idx = __nv_gmmu_va_is_upper(vm, args->offset) ?
2709 gmmu_page_size_big : gmmu_page_size_small; 2629 gmmu_page_size_big : gmmu_page_size_small;
2710 2630
2711 start_page_nr = (u32)(args->offset >>
2712 ilog2(vm->gmmu_page_sizes[pgsz_idx]));
2713
2714 vma = &vm->vma[pgsz_idx]; 2631 vma = &vm->vma[pgsz_idx];
2715 err = vma->free(vma, start_page_nr, args->pages, 1); 2632 gk20a_bfree(vma, args->offset);
2716
2717 if (err)
2718 goto clean_up;
2719 2633
2720 mutex_lock(&vm->update_gmmu_lock); 2634 mutex_lock(&vm->update_gmmu_lock);
2721 va_node = addr_to_reservation(vm, args->offset); 2635 va_node = addr_to_reservation(vm, args->offset);
@@ -2745,8 +2659,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
2745 kfree(va_node); 2659 kfree(va_node);
2746 } 2660 }
2747 mutex_unlock(&vm->update_gmmu_lock); 2661 mutex_unlock(&vm->update_gmmu_lock);
2662 err = 0;
2748 2663
2749clean_up:
2750 return err; 2664 return err;
2751} 2665}
2752 2666
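
Note on the mm_gk20a.c hunks above: VA allocation no longer converts sizes into page counts for a bitmap allocator; callers hand byte sizes to the buddy allocator and get byte offsets back, with 0 signalling failure. A minimal sketch of the resulting call pattern, assuming only the gk20a_balloc()/gk20a_bfree() interface visible in this patch (the helper name and the 1 MB size are illustrative, not from the patch):

	/*
	 * Sketch only: obtain and release GPU VA through the buddy allocator.
	 * Assumes the driver's mm_gk20a.h / gk20a_allocator.h declarations.
	 */
	static int example_alloc_one_mb(struct vm_gk20a *vm)
	{
		struct gk20a_allocator *vma = &vm->vma[gmmu_page_size_small];
		u64 va;

		/* Byte size in, byte offset out; 0 means out of space. */
		va = gk20a_balloc(vma, SZ_1M);
		if (!va)
			return -ENOMEM;

		/* ... map [va, va + SZ_1M) via the GMMU ... */

		/* Freeing needs only the offset; the allocator tracks sizes. */
		gk20a_bfree(vma, va);
		return 0;
	}

Because gk20a_bfree() takes no size argument, gk20a_vm_free_va() above no longer needs the page-shift arithmetic that the bitmap allocator required.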
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 57f7a373..cf246744 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -131,6 +131,7 @@ enum gmmu_pgsz_gk20a {
131}; 131};
132 132
133struct gk20a_comptags { 133struct gk20a_comptags {
134 u32 real_offset;
134 u32 offset; 135 u32 offset;
135 u32 lines; 136 u32 lines;
136 u32 allocated_lines; 137 u32 allocated_lines;
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
index 275fbd4e..fc8d130c 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
@@ -2438,7 +2438,6 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu)
2438 struct pmu_payload payload; 2438 struct pmu_payload payload;
2439 u32 seq; 2439 u32 seq;
2440 u32 data; 2440 u32 data;
2441 int err = 0;
2442 2441
2443 gk20a_dbg_fn(""); 2442 gk20a_dbg_fn("");
2444 2443
@@ -2489,12 +2488,11 @@ static int pmu_init_perfmon(struct pmu_gk20a *pmu)
2489 gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data); 2488 gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data);
2490 2489
2491 if (!pmu->sample_buffer) 2490 if (!pmu->sample_buffer)
2492 err = pmu->dmem.alloc(&pmu->dmem, 2491 pmu->sample_buffer = gk20a_balloc(&pmu->dmem,
2493 &pmu->sample_buffer, 2 * sizeof(u16), 2492 2 * sizeof(u16));
2494 PMU_DMEM_ALLOC_ALIGNMENT); 2493 if (!pmu->sample_buffer) {
2495 if (err) {
2496 gk20a_err(dev_from_gk20a(g), 2494 gk20a_err(dev_from_gk20a(g),
2497 "failed to allocate perfmon sample buffer"); 2495 "failed to allocate perfmon sample buffer");
2498 return -ENOMEM; 2496 return -ENOMEM;
2499 } 2497 }
2500 2498
@@ -2592,15 +2590,17 @@ static int pmu_process_init_msg(struct pmu_gk20a *pmu,
2592 for (i = 0; i < PMU_QUEUE_COUNT; i++) 2590 for (i = 0; i < PMU_QUEUE_COUNT; i++)
2593 pmu_queue_init(pmu, i, init); 2591 pmu_queue_init(pmu, i, init);
2594 2592
2595 if (!pmu->dmem.alloc) { 2593 if (!pmu->dmem.init) {
2596 /*Align start and end addresses*/ 2594 /* Align start and end addresses */
2597 u32 start = ALIGN(pv->get_pmu_init_msg_pmu_sw_mg_off(init), 2595 u32 start = ALIGN(pv->get_pmu_init_msg_pmu_sw_mg_off(init),
2598 PMU_DMEM_ALLOC_ALIGNMENT); 2596 PMU_DMEM_ALLOC_ALIGNMENT);
2599 u32 end = (pv->get_pmu_init_msg_pmu_sw_mg_off(init) + 2597 u32 end = (pv->get_pmu_init_msg_pmu_sw_mg_off(init) +
2600 pv->get_pmu_init_msg_pmu_sw_mg_size(init)) & 2598 pv->get_pmu_init_msg_pmu_sw_mg_size(init)) &
2601 ~(PMU_DMEM_ALLOC_ALIGNMENT - 1); 2599 ~(PMU_DMEM_ALLOC_ALIGNMENT - 1);
2602 u32 size = end - start; 2600 u32 size = end - start;
2603 gk20a_allocator_init(&pmu->dmem, "gk20a_pmu_dmem", start, size); 2601 __gk20a_allocator_init(&pmu->dmem, NULL, "gk20a_pmu_dmem",
2602 start, size,
2603 PMU_DMEM_ALLOC_ALIGNMENT, 4, 0);
2604 } 2604 }
2605 2605
2606 pmu->pmu_ready = true; 2606 pmu->pmu_ready = true;
@@ -2737,20 +2737,14 @@ static int pmu_response_handle(struct pmu_gk20a *pmu,
2737 seq->callback = NULL; 2737 seq->callback = NULL;
2738 if (pv->pmu_allocation_get_dmem_size(pmu, 2738 if (pv->pmu_allocation_get_dmem_size(pmu,
2739 pv->get_pmu_seq_in_a_ptr(seq)) != 0) 2739 pv->get_pmu_seq_in_a_ptr(seq)) != 0)
2740 pmu->dmem.free(&pmu->dmem, 2740 gk20a_bfree(&pmu->dmem,
2741 pv->pmu_allocation_get_dmem_offset(pmu, 2741 pv->pmu_allocation_get_dmem_offset(pmu,
2742 pv->get_pmu_seq_in_a_ptr(seq)), 2742 pv->get_pmu_seq_in_a_ptr(seq)));
2743 pv->pmu_allocation_get_dmem_size(pmu,
2744 pv->get_pmu_seq_in_a_ptr(seq)),
2745 PMU_DMEM_ALLOC_ALIGNMENT);
2746 if (pv->pmu_allocation_get_dmem_size(pmu, 2743 if (pv->pmu_allocation_get_dmem_size(pmu,
2747 pv->get_pmu_seq_out_a_ptr(seq)) != 0) 2744 pv->get_pmu_seq_out_a_ptr(seq)) != 0)
2748 pmu->dmem.free(&pmu->dmem, 2745 gk20a_bfree(&pmu->dmem,
2749 pv->pmu_allocation_get_dmem_offset(pmu, 2746 pv->pmu_allocation_get_dmem_offset(pmu,
2750 pv->get_pmu_seq_out_a_ptr(seq)), 2747 pv->get_pmu_seq_out_a_ptr(seq)));
2751 pv->pmu_allocation_get_dmem_size(pmu,
2752 pv->get_pmu_seq_out_a_ptr(seq)),
2753 PMU_DMEM_ALLOC_ALIGNMENT);
2754 2748
2755 if (seq->callback) 2749 if (seq->callback)
2756 seq->callback(g, msg, seq->cb_params, seq->desc, ret); 2750 seq->callback(g, msg, seq->cb_params, seq->desc, ret);
@@ -3387,11 +3381,10 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
3387 pv->pmu_allocation_set_dmem_size(pmu, in, 3381 pv->pmu_allocation_set_dmem_size(pmu, in,
3388 (u16)max(payload->in.size, payload->out.size)); 3382 (u16)max(payload->in.size, payload->out.size));
3389 3383
3390 err = pmu->dmem.alloc(&pmu->dmem, 3384 *(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)) =
3391 pv->pmu_allocation_get_dmem_offset_addr(pmu, in), 3385 gk20a_balloc(&pmu->dmem,
3392 pv->pmu_allocation_get_dmem_size(pmu, in), 3386 pv->pmu_allocation_get_dmem_size(pmu, in));
3393 PMU_DMEM_ALLOC_ALIGNMENT); 3387 if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu, in)))
3394 if (err)
3395 goto clean_up; 3388 goto clean_up;
3396 3389
3397 pmu_copy_to_dmem(pmu, (pv->pmu_allocation_get_dmem_offset(pmu, 3390 pmu_copy_to_dmem(pmu, (pv->pmu_allocation_get_dmem_offset(pmu,
@@ -3412,11 +3405,12 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
3412 (u16)payload->out.size); 3405 (u16)payload->out.size);
3413 3406
3414 if (payload->out.buf != payload->in.buf) { 3407 if (payload->out.buf != payload->in.buf) {
3415 err = pmu->dmem.alloc(&pmu->dmem, 3408
3416 pv->pmu_allocation_get_dmem_offset_addr(pmu, out), 3409 *(pv->pmu_allocation_get_dmem_offset_addr(pmu, out)) =
3417 pv->pmu_allocation_get_dmem_size(pmu, out), 3410 gk20a_balloc(&pmu->dmem,
3418 PMU_DMEM_ALLOC_ALIGNMENT); 3411 pv->pmu_allocation_get_dmem_size(pmu, out));
3419 if (err) 3412 if (!*(pv->pmu_allocation_get_dmem_offset_addr(pmu,
3413 out)))
3420 goto clean_up; 3414 goto clean_up;
3421 } else { 3415 } else {
3422 BUG_ON(in == NULL); 3416 BUG_ON(in == NULL);
@@ -3444,15 +3438,11 @@ int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
3444clean_up: 3438clean_up:
3445 gk20a_dbg_fn("fail"); 3439 gk20a_dbg_fn("fail");
3446 if (in) 3440 if (in)
3447 pmu->dmem.free(&pmu->dmem, 3441 gk20a_bfree(&pmu->dmem,
3448 pv->pmu_allocation_get_dmem_offset(pmu, in), 3442 pv->pmu_allocation_get_dmem_offset(pmu, in));
3449 pv->pmu_allocation_get_dmem_size(pmu, in),
3450 PMU_DMEM_ALLOC_ALIGNMENT);
3451 if (out) 3443 if (out)
3452 pmu->dmem.free(&pmu->dmem, 3444 gk20a_bfree(&pmu->dmem,
3453 pv->pmu_allocation_get_dmem_offset(pmu, out), 3445 pv->pmu_allocation_get_dmem_offset(pmu, out));
3454 pv->pmu_allocation_get_dmem_size(pmu, out),
3455 PMU_DMEM_ALLOC_ALIGNMENT);
3456 3446
3457 pmu_seq_release(pmu, seq); 3447 pmu_seq_release(pmu, seq);
3458 return err; 3448 return err;
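
The pmu_gk20a.c hunks move the PMU DMEM heap onto the same allocator: gk20a_balloc() now returns the DMEM offset directly, with 0 meaning failure, instead of filling an output parameter and returning an error code. A minimal sketch of initialising and using such a byte-granular space, assuming the eight-argument __gk20a_allocator_init() shown in the hunks above (the 0x800/4 KB window and the 32-byte request are illustrative values; teardown is omitted):

	/* Sketch only: a small non-GVA space managed by the buddy allocator. */
	static int example_dmem_heap(void)
	{
		struct gk20a_allocator dmem;
		u64 off;
		int err;

		err = __gk20a_allocator_init(&dmem, NULL, "example_dmem",
					     0x800,  /* start of managed window  */
					     SZ_4K,  /* length of managed window */
					     PMU_DMEM_ALLOC_ALIGNMENT, /* block size */
					     4,      /* max buddy order          */
					     0);     /* flags: not a GVA space   */
		if (err)
			return err;

		off = gk20a_balloc(&dmem, 32);
		if (!off)
			return -ENOMEM;

		gk20a_bfree(&dmem, off);
		return 0;
	}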
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
index 6cd173e8..e54805a6 100644
--- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
@@ -3,7 +3,7 @@
3 * 3 *
4 * GK20A PMU (aka. gPMU outside gk20a context) 4 * GK20A PMU (aka. gPMU outside gk20a context)
5 * 5 *
6 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 6 * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify it 8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License, 9 * under the terms and conditions of the GNU General Public License,
@@ -437,7 +437,7 @@ struct pmu_ucode_desc {
437#define PMU_UNIT_ID_IS_VALID(id) \ 437#define PMU_UNIT_ID_IS_VALID(id) \
438 (((id) < PMU_UNIT_END) || ((id) >= PMU_UNIT_TEST_START)) 438 (((id) < PMU_UNIT_END) || ((id) >= PMU_UNIT_TEST_START))
439 439
440#define PMU_DMEM_ALLOC_ALIGNMENT (32) 440#define PMU_DMEM_ALLOC_ALIGNMENT (4)
441#define PMU_DMEM_ALIGNMENT (4) 441#define PMU_DMEM_ALIGNMENT (4)
442 442
443#define PMU_CMD_FLAGS_PMU_MASK (0xF0) 443#define PMU_CMD_FLAGS_PMU_MASK (0xF0)
diff --git a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
index 04f61c58..053550f6 100644
--- a/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/semaphore_gk20a.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * GK20A Semaphores 4 * GK20A Semaphores
5 * 5 *
6 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved. 6 * Copyright (c) 2014-2015, NVIDIA CORPORATION. All rights reserved.
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify it 8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License, 9 * under the terms and conditions of the GNU General Public License,
@@ -44,8 +44,10 @@ struct gk20a_semaphore_pool *gk20a_semaphore_pool_alloc(struct device *d,
44 if (gk20a_get_sgtable(d, &p->sgt, p->cpu_va, p->iova, p->size)) 44 if (gk20a_get_sgtable(d, &p->sgt, p->cpu_va, p->iova, p->size))
45 goto clean_up; 45 goto clean_up;
46 46
47 if (gk20a_allocator_init(&p->alloc, unique_name, 0, 47 /* Sacrifice one semaphore in the name of returning error codes. */
48 p->size)) 48 if (gk20a_allocator_init(&p->alloc, unique_name,
49 SEMAPHORE_SIZE, p->size - SEMAPHORE_SIZE,
50 SEMAPHORE_SIZE))
49 goto clean_up; 51 goto clean_up;
50 52
51 gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->cpu_va, 53 gk20a_dbg_info("cpuva=%p iova=%llx phys=%llx", p->cpu_va,
@@ -163,8 +165,8 @@ struct gk20a_semaphore *gk20a_semaphore_alloc(struct gk20a_semaphore_pool *pool)
163 if (!s) 165 if (!s)
164 return NULL; 166 return NULL;
165 167
166 if (pool->alloc.alloc(&pool->alloc, &s->offset, SEMAPHORE_SIZE, 168 s->offset = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE);
167 SEMAPHORE_SIZE)) { 169 if (!s->offset) {
168 gk20a_err(pool->dev, "failed to allocate semaphore"); 170 gk20a_err(pool->dev, "failed to allocate semaphore");
169 kfree(s); 171 kfree(s);
170 return NULL; 172 return NULL;
@@ -186,8 +188,7 @@ static void gk20a_semaphore_free(struct kref *ref)
186 struct gk20a_semaphore *s = 188 struct gk20a_semaphore *s =
187 container_of(ref, struct gk20a_semaphore, ref); 189 container_of(ref, struct gk20a_semaphore, ref);
188 190
189 s->pool->alloc.free(&s->pool->alloc, s->offset, SEMAPHORE_SIZE, 191 gk20a_bfree(&s->pool->alloc, s->offset);
190 SEMAPHORE_SIZE);
191 gk20a_semaphore_pool_put(s->pool); 192 gk20a_semaphore_pool_put(s->pool);
192 kfree(s); 193 kfree(s);
193} 194}
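
The semaphore_gk20a.c change shows one consequence of the new return convention: since gk20a_balloc() reports failure by returning 0, offset 0 must never be a valid allocation, so the pool's managed range now starts at SEMAPHORE_SIZE ("sacrifice one semaphore"). A minimal sketch of the caller-side pattern this enables (the helper name is hypothetical):

	/*
	 * Sketch only: with the pool base moved up by SEMAPHORE_SIZE, a zero
	 * offset from gk20a_balloc() can only mean out-of-memory, never a
	 * real semaphore at the very start of the backing buffer.
	 */
	static int example_semaphore_offset(struct gk20a_semaphore_pool *pool,
					    u64 *offset)
	{
		u64 off = gk20a_balloc(&pool->alloc, SEMAPHORE_SIZE);

		if (!off)
			return -ENOMEM;

		*offset = off;
		return 0;
	}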