Diffstat (limited to 'drivers/gpu/nvgpu/common/mm/pd_cache.c')
-rw-r--r-- drivers/gpu/nvgpu/common/mm/pd_cache.c | 426
 1 file changed, 426 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c
new file mode 100644
index 00000000..4f312eff
--- /dev/null
+++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c
@@ -0,0 +1,426 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <nvgpu/log.h>
#include <nvgpu/dma.h>
#include <nvgpu/gmmu.h>
#include <nvgpu/nvgpu_mem.h>
#include <nvgpu/list.h>
#include <nvgpu/log2.h>

#include "gk20a/gk20a.h"
#include "gk20a/mm_gk20a.h"

#define pd_dbg(g, fmt, args...) nvgpu_log(g, gpu_dbg_pd_cache, fmt, ##args)

/**
 * DOC: PD cache
 *
 * Pascal and later GPUs use many sub-page sized PD levels, so to save memory
 * a way of packing multiple PD tables into a single page is necessary. This
 * code does just that. If a PD table only requires 1024 bytes, then four such
 * PDs can share one page. The savings are even more pronounced for 256 byte
 * PD tables.
 *
 * The pd cache is basically just a slab allocator. Each instance of the nvgpu
 * driver makes one of these structs:
 *
 *   struct nvgpu_pd_cache {
 *           struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
 *           struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
 *
 *           struct nvgpu_rbtree_node *mem_tree;
 *   };
 *
 * There are two sets of lists: the full and the partial. The full lists
 * contain pages of memory in which every PD slot is in use. The partial lists
 * contain partially full pages of memory which can be used for more PD
 * allocations. There are a couple of assumptions here:
 *
 *   1. PDs greater than or equal to the page size bypass the pd cache.
 *   2. PDs are always a power of 2 in size and at least %NVGPU_PD_CACHE_MIN
 *      bytes.
 *
 * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial
 * lists. For a 4KB page NVGPU_PD_CACHE_COUNT is 4. This is enough space for
 * 256, 512, 1024, and 2048 byte PDs.
 *
 * __nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check whether
 * the PD size is page size or larger and choose the correct allocation
 * scheme - either from the PD cache or directly. Similarly __nvgpu_pd_free()
 * will free a PD allocated by __nvgpu_pd_alloc().
 *
 * Since the top level PD (the PDB) must be page aligned but may be smaller
 * than a page, the direct functions must be used for allocating PDBs.
 * Otherwise there would be alignment issues for the PDBs when they get
 * packed.
 */
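
/*
 * To illustrate the packing (a sketch only; the real constants -
 * NVGPU_PD_CACHE_MIN, NVGPU_PD_CACHE_MIN_SHIFT and NVGPU_PD_CACHE_COUNT -
 * are defined in the gmmu header, not here): with a 4KB page the per-size
 * bookkeeping works out roughly as follows.
 *
 *   pd_size   PDs per page   full alloc_map   list index
 *   256       16             0xffff           0
 *   512        8             0x00ff           1
 *   1024       4             0x000f           2
 *   2048       2             0x0003           3
 *
 * A 1024 byte PD taken from an otherwise empty page sets bit 0 of alloc_map
 * and leaves the nvgpu_pd_mem_entry on partial[2]; once all four slots are
 * taken (alloc_map == 0xf) the entry moves to the corresponding full list.
 */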

static u32 nvgpu_pd_cache_nr(u32 bytes)
{
        return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1));
}

static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry)
{
        u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size);

        return mask_offset - 1;
}
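
/*
 * For example (illustration only): with a 4KB page and 1024 byte PDs,
 * nvgpu_pd_cache_get_mask() computes 1 << (4096 / 1024) = 0x10 and returns
 * 0xf - one bit in the allocation map per PD slot in the page.
 */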

int nvgpu_pd_cache_init(struct gk20a *g)
{
        struct nvgpu_pd_cache *cache;
        int i;

        /*
         * This gets called from finalize_poweron() so we need to make sure we
         * don't reinit the pd_cache over and over.
         */
        if (g->mm.pd_cache)
                return 0;

        cache = nvgpu_kzalloc(g, sizeof(*cache));
        if (!cache) {
                nvgpu_err(g, "Failed to alloc pd_cache!");
                return -ENOMEM;
        }

        for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) {
                nvgpu_init_list_node(&cache->full[i]);
                nvgpu_init_list_node(&cache->partial[i]);
        }

        cache->mem_tree = NULL;
        g->mm.pd_cache = cache;
        nvgpu_mutex_init(&cache->lock);

        pd_dbg(g, "PD cache initialized!");

        return 0;
}

void nvgpu_pd_cache_fini(struct gk20a *g)
{
        int i;
        struct nvgpu_pd_cache *cache = g->mm.pd_cache;

        if (!cache)
                return;

        for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) {
                WARN_ON(!nvgpu_list_empty(&cache->full[i]));
                WARN_ON(!nvgpu_list_empty(&cache->partial[i]));
        }

        nvgpu_kfree(g, g->mm.pd_cache);
}

/*
 * This is the simple pass-through for PDs that are page sized or larger.
 *
 * Note: this does not need the cache lock since it does not modify any of the
 * PD cache data structures.
 */
int __nvgpu_pd_cache_alloc_direct(struct gk20a *g,
                                  struct nvgpu_gmmu_pd *pd, u32 bytes)
{
        int err;

        pd_dbg(g, "PD-Alloc [D] %u bytes", bytes);

        pd->mem = nvgpu_kzalloc(g, sizeof(*pd->mem));
        if (!pd->mem) {
                pd_dbg(g, "OOM allocating nvgpu_mem struct!");
                return -ENOMEM;
        }

        err = nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS,
                                    bytes, pd->mem);
        if (err) {
                pd_dbg(g, "OOM allocating page directory!");
                nvgpu_kfree(g, pd->mem);
                return -ENOMEM;
        }

        pd->cached = false;
        pd->mem_offs = 0;

        return 0;
}

/*
 * Make a new nvgpu_pd_mem_entry and allocate a PD from it. Update the passed
 * pd to reflect this allocation.
 */
static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
                                    struct nvgpu_pd_cache *cache,
                                    struct nvgpu_gmmu_pd *pd,
                                    u32 bytes)
{
        struct nvgpu_pd_mem_entry *pentry;

        pd_dbg(g, "PD-Alloc [C] New: offs=0");

        pentry = nvgpu_kzalloc(g, sizeof(*pentry));
        if (!pentry) {
                pd_dbg(g, "OOM allocating pentry!");
                return -ENOMEM;
        }

        if (nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS,
                                  PAGE_SIZE, &pentry->mem)) {
                nvgpu_kfree(g, pentry);
                pd_dbg(g, "Unable to DMA alloc!");
                return -ENOMEM;
        }

        pentry->pd_size = bytes;
        nvgpu_list_add(&pentry->list_entry,
                       &cache->partial[nvgpu_pd_cache_nr(bytes)]);

        /*
         * This allocates the very first PD table in the set of tables in this
         * nvgpu_pd_mem_entry.
         */
        pentry->alloc_map = 1;

        /*
         * Now update the nvgpu_gmmu_pd to reflect this allocation.
         */
        pd->mem = &pentry->mem;
        pd->mem_offs = 0;
        pd->cached = true;

        pentry->tree_entry.key_start = (u64)(uintptr_t)&pentry->mem;
        nvgpu_rbtree_insert(&pentry->tree_entry, &cache->mem_tree);

        return 0;
}

static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
                                             struct nvgpu_pd_cache *cache,
                                             struct nvgpu_pd_mem_entry *pentry,
                                             struct nvgpu_gmmu_pd *pd)
{
        unsigned long bit_offs;
        u32 mem_offs;
        u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry);

        /*
         * Find and allocate an open PD.
         */
        bit_offs = ffz(pentry->alloc_map);
        mem_offs = bit_offs * pentry->pd_size;

        /* Bit map full. Something's wrong. */
        if (WARN_ON(bit_offs >= ffz(pentry_mask)))
                return -ENOMEM;

        pentry->alloc_map |= 1 << bit_offs;

        pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs);

        /*
         * First update the pd.
         */
        pd->mem = &pentry->mem;
        pd->mem_offs = mem_offs;
        pd->cached = true;

        /*
         * Now make sure the pentry is in the correct list (full vs partial).
         */
        if ((pentry->alloc_map & pentry_mask) == pentry_mask) {
                pd_dbg(g, "Adding pentry to full list!");
                nvgpu_list_del(&pentry->list_entry);
                nvgpu_list_add(&pentry->list_entry,
                               &cache->full[nvgpu_pd_cache_nr(pentry->pd_size)]);
        }

        return 0;
}
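
/*
 * A worked example of the partial-allocation path (illustration only): for
 * 1024 byte PDs there are four slots per page. Starting from
 * alloc_map == 0x5 (slots 0 and 2 taken), ffz() picks bit 1, mem_offs
 * becomes 1024, and alloc_map becomes 0x7. Only when alloc_map reaches the
 * full mask 0xf does the pentry move from the partial list to the full list.
 */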

/*
 * Get a partially full nvgpu_pd_mem_entry. Returns NULL if no partially full
 * nvgpu_pd_mem_entry is available.
 */
static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_get_partial(
        struct nvgpu_pd_cache *cache, u32 bytes)
{
        struct nvgpu_list_node *list =
                &cache->partial[nvgpu_pd_cache_nr(bytes)];

        if (nvgpu_list_empty(list))
                return NULL;

        return nvgpu_list_first_entry(list,
                                      nvgpu_pd_mem_entry,
                                      list_entry);
}

/*
 * Allocate memory from an nvgpu_mem for the page directory.
 */
static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
                                struct nvgpu_gmmu_pd *pd, u32 bytes)
{
        struct nvgpu_pd_mem_entry *pentry;
        int err;

        pd_dbg(g, "PD-Alloc [C] %u bytes", bytes);

        if (bytes & (bytes - 1) ||
            (bytes >= PAGE_SIZE ||
             bytes < NVGPU_PD_CACHE_MIN)) {
                pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes);
                return -EINVAL;
        }

        pentry = nvgpu_pd_cache_get_partial(cache, bytes);
        if (!pentry)
                err = nvgpu_pd_cache_alloc_new(g, cache, pd, bytes);
        else
                err = nvgpu_pd_cache_alloc_from_partial(g, cache, pentry, pd);

        if (err)
                pd_dbg(g, "PD-Alloc [C] Failed!");

        return err;
}

/*
 * Allocate the DMA memory for a page directory. This handles the necessary PD
 * cache logistics. Since on Parker and later GPUs some of the page directories
 * are smaller than a page, packing these PDs together saves a lot of memory.
 */
int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
{
        struct gk20a *g = gk20a_from_vm(vm);
        int err;

        /*
         * Simple case: PD is a page or larger, so just do a regular DMA
         * alloc.
         */
        if (bytes >= PAGE_SIZE) {
                err = __nvgpu_pd_cache_alloc_direct(g, pd, bytes);
                if (err)
                        return err;

                return 0;
        }

        if (WARN_ON(!g->mm.pd_cache))
                return -ENOMEM;

        nvgpu_mutex_acquire(&g->mm.pd_cache->lock);
        err = nvgpu_pd_cache_alloc(g, g->mm.pd_cache, pd, bytes);
        nvgpu_mutex_release(&g->mm.pd_cache->lock);

        return err;
}

void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
{
        pd_dbg(g, "PD-Free [D] 0x%p", pd->mem);

        if (!pd->mem)
                return;

        nvgpu_dma_free(g, pd->mem);
        nvgpu_kfree(g, pd->mem);
        pd->mem = NULL;
}

static void nvgpu_pd_cache_free_mem_entry(struct gk20a *g,
                                          struct nvgpu_pd_cache *cache,
                                          struct nvgpu_pd_mem_entry *pentry)
{
        nvgpu_dma_free(g, &pentry->mem);
        nvgpu_list_del(&pentry->list_entry);
        nvgpu_rbtree_unlink(&pentry->tree_entry, &cache->mem_tree);
        nvgpu_kfree(g, pentry);
}

static void nvgpu_pd_cache_do_free(struct gk20a *g,
                                   struct nvgpu_pd_cache *cache,
                                   struct nvgpu_pd_mem_entry *pentry,
                                   struct nvgpu_gmmu_pd *pd)
{
        u32 index = pd->mem_offs / pentry->pd_size;
        u32 bit = 1 << index;

        /* Mark entry as free. */
        pentry->alloc_map &= ~bit;

        if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) {
                /*
                 * Partially full still. If it was already on the partial list
                 * this just re-adds it.
                 */
                nvgpu_list_del(&pentry->list_entry);
                nvgpu_list_add(&pentry->list_entry,
                               &cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]);
        } else {
                /* Empty now so free it. */
                nvgpu_pd_cache_free_mem_entry(g, cache, pentry);
        }
}

static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up(
        struct gk20a *g,
        struct nvgpu_pd_cache *cache,
        struct nvgpu_gmmu_pd *pd)
{
        struct nvgpu_rbtree_node *node;

        nvgpu_rbtree_search((u64)(uintptr_t)pd->mem, &node,
                            cache->mem_tree);
        if (!node)
                return NULL;

        return nvgpu_pd_mem_entry_from_tree_entry(node);
}

static void nvgpu_pd_cache_free(struct gk20a *g, struct nvgpu_pd_cache *cache,
                                struct nvgpu_gmmu_pd *pd)
{
        struct nvgpu_pd_mem_entry *pentry;

        pd_dbg(g, "PD-Free [C] 0x%p", pd->mem);

        pentry = nvgpu_pd_cache_look_up(g, cache, pd);
        if (!pentry) {
                WARN(1, "Attempting to free non-existent pd");
                return;
        }

        nvgpu_pd_cache_do_free(g, cache, pentry, pd);
}

void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd)
{
        struct gk20a *g = gk20a_from_vm(vm);

        /*
         * Simple case: just DMA free.
         */
        if (!pd->cached)
                return __nvgpu_pd_cache_free_direct(g, pd);

        nvgpu_mutex_acquire(&g->mm.pd_cache->lock);
        nvgpu_pd_cache_free(g, g->mm.pd_cache, pd);
        nvgpu_mutex_release(&g->mm.pd_cache->lock);
}
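
/*
 * Example usage (a hypothetical sketch, not taken from the driver; the real
 * callers live in the GMMU page table code and the vm pointer below is
 * assumed to exist):
 *
 *         struct nvgpu_gmmu_pd pd = { };
 *         int err;
 *
 *         err = __nvgpu_pd_alloc(vm, &pd, 1024);  // sub-page PD: cached
 *         if (err)
 *                 return err;
 *
 *         // Program the PD entries through pd.mem at byte offset pd.mem_offs.
 *
 *         __nvgpu_pd_free(vm, &pd);
 *
 * The PDB itself should instead go through __nvgpu_pd_cache_alloc_direct()
 * and __nvgpu_pd_cache_free_direct() so that it stays page aligned.
 */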