diff options
author | Alex Waterman <alexw@nvidia.com> | 2017-06-09 14:42:50 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2017-07-06 17:44:16 -0400 |
commit | 583704620db88e391f6b14acc57af859a70127de (patch) | |
tree | 8fc3becf2850b724e87011b0e0250c52d0efb7ee /drivers/gpu/nvgpu/common/mm/pd_cache.c | |
parent | c1393d5b68e63c992f4c689cb788139fdf8c2f1a (diff) |
gpu: nvgpu: Implement PD packing
In some cases page directories require less than a full page of memory.
For example, on Pascal, the final PD level for large pages is only 256 bytes;
thus 16 PDs can fit in a single page. To allocate an entire page for each of
these 256 B PDs is extremely wasteful. This patch aims to alleviate the
wasted DMA memory from having small PDs in a full page by packing multiple
small PDs into a single page.
The packing is implemented as a slab allocator - each page is a slab and
from each page multiple PD instances can be allocated. Several modifications
to the nvgpu_gmmu_pd struct also needed to be made to support this. The
nvgpu_mem is now a pointer and there's an explicit offset into the nvgpu_mem
struct so that each nvgpu_gmmu_pd knows what portion of the memory it's
using.
The nvgpu_pde_phys_addr() function and the pd_write() functions also require
some changes since the PD no longer is always situated at the start of the
nvgpu_mem.
Initialization and cleanup of the page tables for each VM was slightly
modified to work through the new pd_cache implementation. Some PDs (i.e.
the PDB), despite not being a full page, still require a full page for
alignment purposes (HW requirements). Thus a direct allocation method for
PDs is still provided. This is also used when a PD that could in principle
be cached is greater than a page in size.
Lastly a new debug flag was added for the pd_cache code.
JIRA NVGPU-30
Change-Id: I64c8037fc356783c1ef203cc143c4d71bbd5d77c
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: https://git-master/r/1506610
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
GVS: Gerrit_Virtual_Submit
Diffstat (limited to 'drivers/gpu/nvgpu/common/mm/pd_cache.c')
-rw-r--r-- | drivers/gpu/nvgpu/common/mm/pd_cache.c | 426 |
1 files changed, 426 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c new file mode 100644 index 00000000..4f312eff --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c | |||
@@ -0,0 +1,426 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify it | ||
5 | * under the terms and conditions of the GNU General Public License, | ||
6 | * version 2, as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
11 | * more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
15 | */ | ||
16 | |||
17 | #include <nvgpu/log.h> | ||
18 | #include <nvgpu/dma.h> | ||
19 | #include <nvgpu/gmmu.h> | ||
20 | #include <nvgpu/nvgpu_mem.h> | ||
21 | #include <nvgpu/list.h> | ||
22 | #include <nvgpu/log2.h> | ||
23 | |||
24 | #include "gk20a/gk20a.h" | ||
25 | #include "gk20a/mm_gk20a.h" | ||
26 | |||
27 | #define pd_dbg(g, fmt, args...) nvgpu_log(g, gpu_dbg_pd_cache, fmt, ##args) | ||
28 | |||
29 | /** | ||
30 | * DOC: PD cache | ||
31 | * | ||
32 | * In the name of saving memory with the many sub-page sized PD levels in Pascal | ||
33 | * and beyond a way of packing PD tables together is necessary. This code here | ||
34 | * does just that. If a PD table only requires 1024 bytes, then it is possible | ||
35 | * to have 4 of these PDs in one page. This is even more pronounced for 256 byte | ||
36 | * PD tables. | ||
37 | * | ||
38 | * The pd cache is basically just a slab allocator. Each instance of the nvgpu | ||
39 | * driver makes one of these structs: | ||
40 | * | ||
41 | * struct nvgpu_pd_cache { | ||
42 | * struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT]; | ||
43 | * struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT]; | ||
44 | * | ||
45 | * struct nvgpu_rbtree_node *mem_tree; | ||
46 | * }; | ||
47 | * | ||
48 | * There are two sets of lists, the full and the partial. The full lists contain | ||
49 | * pages of memory for which all the memory in that page is in use. The partial | ||
50 | * lists contain partially full pages of memory which can be used for more PD | ||
51 | * allocations. There are a couple of assumptions here: | ||
52 | * | ||
53 | * 1. PDs greater than or equal to the page size bypass the pd cache. | ||
54 | * 2. PDs are always power of 2 and greater than %NVGPU_PD_CACHE_MIN bytes. | ||
55 | * | ||
56 | * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial | ||
57 | * lists. For a 4Kb page NVGPU_PD_CACHE_COUNT is 4. This is enough space for | ||
58 | * 256, 512, 1024, and 2048 byte PDs. | ||
59 | * | ||
60 | * __nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD | ||
61 | * size is page size or larger and choose the correct allocation scheme - either | ||
62 | * from the PD cache or directly. Similarly __nvgpu_pd_free() will free a PD | ||
63 | * allocated by __nvgpu_pd_alloc(). | ||
64 | * | ||
65 | * Since the top level PD (the PDB) is a page aligned pointer but less than a | ||
66 | * page size the direct functions must be used for allocating PDBs. Otherwise | ||
67 | * there would be alignment issues for the PDBs when they get packed. | ||
68 | */ | ||
69 | |||
70 | static u32 nvgpu_pd_cache_nr(u32 bytes) | ||
71 | { | ||
72 | return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1)); | ||
73 | } | ||
74 | |||
75 | static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry) | ||
76 | { | ||
77 | u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size); | ||
78 | |||
79 | return mask_offset - 1; | ||
80 | } | ||
81 | |||
82 | int nvgpu_pd_cache_init(struct gk20a *g) | ||
83 | { | ||
84 | struct nvgpu_pd_cache *cache; | ||
85 | int i; | ||
86 | |||
87 | /* | ||
88 | * This gets called from finalize_poweron() so we need to make sure we | ||
89 | * don't reinit the pd_cache over and over. | ||
90 | */ | ||
91 | if (g->mm.pd_cache) | ||
92 | return 0; | ||
93 | |||
94 | cache = nvgpu_kzalloc(g, sizeof(*cache)); | ||
95 | if (!cache) { | ||
96 | nvgpu_err(g, "Failed to alloc pd_cache!"); | ||
97 | return -ENOMEM; | ||
98 | } | ||
99 | |||
100 | for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) { | ||
101 | nvgpu_init_list_node(&cache->full[i]); | ||
102 | nvgpu_init_list_node(&cache->partial[i]); | ||
103 | } | ||
104 | |||
105 | cache->mem_tree = NULL; | ||
106 | g->mm.pd_cache = cache; | ||
107 | nvgpu_mutex_init(&cache->lock); | ||
108 | |||
109 | pd_dbg(g, "PD cache initialized!"); | ||
110 | |||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | void nvgpu_pd_cache_fini(struct gk20a *g) | ||
115 | { | ||
116 | int i; | ||
117 | struct nvgpu_pd_cache *cache = g->mm.pd_cache; | ||
118 | |||
119 | if (!cache) | ||
120 | return; | ||
121 | |||
122 | for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) { | ||
123 | WARN_ON(!nvgpu_list_empty(&cache->full[i])); | ||
124 | WARN_ON(!nvgpu_list_empty(&cache->partial[i])); | ||
125 | } | ||
126 | |||
127 | nvgpu_kfree(g, g->mm.pd_cache); | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * This is the simple pass-through for greater than page or page sized PDs. | ||
132 | * | ||
133 | * Note: this does not need the cache lock since it does not modify any of the | ||
134 | * PD cache data structures. | ||
135 | */ | ||
136 | int __nvgpu_pd_cache_alloc_direct(struct gk20a *g, | ||
137 | struct nvgpu_gmmu_pd *pd, u32 bytes) | ||
138 | { | ||
139 | int err; | ||
140 | |||
141 | pd_dbg(g, "PD-Alloc [D] %u bytes", bytes); | ||
142 | |||
143 | pd->mem = nvgpu_kzalloc(g, sizeof(*pd->mem)); | ||
144 | if (!pd->mem) { | ||
145 | pd_dbg(g, "OOM allocating nvgpu_mem struct!"); | ||
146 | return -ENOMEM; | ||
147 | } | ||
148 | |||
149 | err = nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS, | ||
150 | bytes, pd->mem); | ||
151 | if (err) { | ||
152 | pd_dbg(g, "OOM allocating page directory!"); | ||
153 | nvgpu_kfree(g, pd->mem); | ||
154 | return -ENOMEM; | ||
155 | } | ||
156 | |||
157 | pd->cached = false; | ||
158 | pd->mem_offs = 0; | ||
159 | |||
160 | return 0; | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * Make a new nvgpu_pd_cache_entry and allocate a PD from it. Update the passed | ||
165 | * pd to reflect this allocation. | ||
166 | */ | ||
167 | static int nvgpu_pd_cache_alloc_new(struct gk20a *g, | ||
168 | struct nvgpu_pd_cache *cache, | ||
169 | struct nvgpu_gmmu_pd *pd, | ||
170 | u32 bytes) | ||
171 | { | ||
172 | struct nvgpu_pd_mem_entry *pentry; | ||
173 | |||
174 | pd_dbg(g, "PD-Alloc [C] New: offs=0"); | ||
175 | |||
176 | pentry = nvgpu_kzalloc(g, sizeof(*pentry)); | ||
177 | if (!pentry) { | ||
178 | pd_dbg(g, "OOM allocating pentry!"); | ||
179 | return -ENOMEM; | ||
180 | } | ||
181 | |||
182 | if (nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS, | ||
183 | PAGE_SIZE, &pentry->mem)) { | ||
184 | nvgpu_kfree(g, pentry); | ||
185 | pd_dbg(g, "Unable to DMA alloc!"); | ||
186 | return -ENOMEM; | ||
187 | } | ||
188 | |||
189 | pentry->pd_size = bytes; | ||
190 | nvgpu_list_add(&pentry->list_entry, | ||
191 | &cache->partial[nvgpu_pd_cache_nr(bytes)]); | ||
192 | |||
193 | /* | ||
194 | * This allocates the very first PD table in the set of tables in this | ||
195 | * nvgpu_pd_mem_entry. | ||
196 | */ | ||
197 | pentry->alloc_map = 1; | ||
198 | |||
199 | /* | ||
200 | * Now update the nvgpu_gmmu_pd to reflect this allocation. | ||
201 | */ | ||
202 | pd->mem = &pentry->mem; | ||
203 | pd->mem_offs = 0; | ||
204 | pd->cached = true; | ||
205 | |||
206 | pentry->tree_entry.key_start = (u64)(uintptr_t)&pentry->mem; | ||
207 | nvgpu_rbtree_insert(&pentry->tree_entry, &cache->mem_tree); | ||
208 | |||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g, | ||
213 | struct nvgpu_pd_cache *cache, | ||
214 | struct nvgpu_pd_mem_entry *pentry, | ||
215 | struct nvgpu_gmmu_pd *pd) | ||
216 | { | ||
217 | unsigned long bit_offs; | ||
218 | u32 mem_offs; | ||
219 | u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry); | ||
220 | |||
221 | /* | ||
222 | * Find and allocate an open PD. | ||
223 | */ | ||
224 | bit_offs = ffz(pentry->alloc_map); | ||
225 | mem_offs = bit_offs * pentry->pd_size; | ||
226 | |||
227 | /* Bit map full. Somethings wrong. */ | ||
228 | if (WARN_ON(bit_offs >= ffz(pentry_mask))) | ||
229 | return -ENOMEM; | ||
230 | |||
231 | pentry->alloc_map |= 1 << bit_offs; | ||
232 | |||
233 | pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs); | ||
234 | |||
235 | /* | ||
236 | * First update the pd. | ||
237 | */ | ||
238 | pd->mem = &pentry->mem; | ||
239 | pd->mem_offs = mem_offs; | ||
240 | pd->cached = true; | ||
241 | |||
242 | /* | ||
243 | * Now make sure the pentry is in the correct list (full vs partial). | ||
244 | */ | ||
245 | if ((pentry->alloc_map & pentry_mask) == pentry_mask) { | ||
246 | pd_dbg(g, "Adding pentry to full list!"); | ||
247 | nvgpu_list_del(&pentry->list_entry); | ||
248 | nvgpu_list_add(&pentry->list_entry, | ||
249 | &cache->full[nvgpu_pd_cache_nr(pentry->pd_size)]); | ||
250 | } | ||
251 | |||
252 | return 0; | ||
253 | } | ||
254 | |||
255 | /* | ||
256 | * Get a partially full nvgpu_pd_mem_entry. Returns NULL if there is no partial | ||
257 | * nvgpu_pd_mem_entry's. | ||
258 | */ | ||
259 | static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_get_partial( | ||
260 | struct nvgpu_pd_cache *cache, u32 bytes) | ||
261 | { | ||
262 | struct nvgpu_list_node *list = | ||
263 | &cache->partial[nvgpu_pd_cache_nr(bytes)]; | ||
264 | |||
265 | if (nvgpu_list_empty(list)) | ||
266 | return NULL; | ||
267 | |||
268 | return nvgpu_list_first_entry(list, | ||
269 | nvgpu_pd_mem_entry, | ||
270 | list_entry); | ||
271 | } | ||
272 | |||
273 | /* | ||
274 | * Allocate memory from an nvgpu_mem for the page directory. | ||
275 | */ | ||
276 | static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache, | ||
277 | struct nvgpu_gmmu_pd *pd, u32 bytes) | ||
278 | { | ||
279 | struct nvgpu_pd_mem_entry *pentry; | ||
280 | int err; | ||
281 | |||
282 | pd_dbg(g, "PD-Alloc [C] %u bytes", bytes); | ||
283 | |||
284 | if (bytes & (bytes - 1) || | ||
285 | (bytes >= PAGE_SIZE || | ||
286 | bytes < NVGPU_PD_CACHE_MIN)) { | ||
287 | pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes); | ||
288 | return -EINVAL; | ||
289 | } | ||
290 | |||
291 | pentry = nvgpu_pd_cache_get_partial(cache, bytes); | ||
292 | if (!pentry) | ||
293 | err = nvgpu_pd_cache_alloc_new(g, cache, pd, bytes); | ||
294 | else | ||
295 | err = nvgpu_pd_cache_alloc_from_partial(g, cache, pentry, pd); | ||
296 | |||
297 | if (err) | ||
298 | pd_dbg(g, "PD-Alloc [C] Failed!"); | ||
299 | |||
300 | return err; | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * Allocate the DMA memory for a page directory. This handles the necessary PD | ||
305 | * cache logistics. Since on Parker and later GPUs some of the page directories | ||
306 | * are smaller than a page packing these PDs together saves a lot of memory. | ||
307 | */ | ||
308 | int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes) | ||
309 | { | ||
310 | struct gk20a *g = gk20a_from_vm(vm); | ||
311 | int err; | ||
312 | |||
313 | /* | ||
314 | * Simple case: PD is bigger than a page so just do a regular DMA | ||
315 | * alloc. | ||
316 | */ | ||
317 | if (bytes >= PAGE_SIZE) { | ||
318 | err = __nvgpu_pd_cache_alloc_direct(g, pd, bytes); | ||
319 | if (err) | ||
320 | return err; | ||
321 | |||
322 | return 0; | ||
323 | } | ||
324 | |||
325 | if (WARN_ON(!g->mm.pd_cache)) | ||
326 | return -ENOMEM; | ||
327 | |||
328 | nvgpu_mutex_acquire(&g->mm.pd_cache->lock); | ||
329 | err = nvgpu_pd_cache_alloc(g, g->mm.pd_cache, pd, bytes); | ||
330 | nvgpu_mutex_release(&g->mm.pd_cache->lock); | ||
331 | |||
332 | return err; | ||
333 | } | ||
334 | |||
335 | void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd) | ||
336 | { | ||
337 | pd_dbg(g, "PD-Free [D] 0x%p", pd->mem); | ||
338 | |||
339 | if (!pd->mem) | ||
340 | return; | ||
341 | |||
342 | nvgpu_dma_free(g, pd->mem); | ||
343 | nvgpu_kfree(g, pd->mem); | ||
344 | pd->mem = NULL; | ||
345 | } | ||
346 | |||
347 | static void nvgpu_pd_cache_free_mem_entry(struct gk20a *g, | ||
348 | struct nvgpu_pd_cache *cache, | ||
349 | struct nvgpu_pd_mem_entry *pentry) | ||
350 | { | ||
351 | nvgpu_dma_free(g, &pentry->mem); | ||
352 | nvgpu_list_del(&pentry->list_entry); | ||
353 | nvgpu_rbtree_unlink(&pentry->tree_entry, &cache->mem_tree); | ||
354 | nvgpu_kfree(g, pentry); | ||
355 | } | ||
356 | |||
357 | static void nvgpu_pd_cache_do_free(struct gk20a *g, | ||
358 | struct nvgpu_pd_cache *cache, | ||
359 | struct nvgpu_pd_mem_entry *pentry, | ||
360 | struct nvgpu_gmmu_pd *pd) | ||
361 | { | ||
362 | u32 index = pd->mem_offs / pentry->pd_size; | ||
363 | u32 bit = 1 << index; | ||
364 | |||
365 | /* Mark entry as free. */ | ||
366 | pentry->alloc_map &= ~bit; | ||
367 | |||
368 | if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) { | ||
369 | /* | ||
370 | * Partially full still. If it was already on the partial list | ||
371 | * this just re-adds it. | ||
372 | */ | ||
373 | nvgpu_list_del(&pentry->list_entry); | ||
374 | nvgpu_list_add(&pentry->list_entry, | ||
375 | &cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]); | ||
376 | } else { | ||
377 | /* Empty now so free it. */ | ||
378 | nvgpu_pd_cache_free_mem_entry(g, cache, pentry); | ||
379 | } | ||
380 | } | ||
381 | |||
382 | static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up( | ||
383 | struct gk20a *g, | ||
384 | struct nvgpu_pd_cache *cache, | ||
385 | struct nvgpu_gmmu_pd *pd) | ||
386 | { | ||
387 | struct nvgpu_rbtree_node *node; | ||
388 | |||
389 | nvgpu_rbtree_search((u64)(uintptr_t)pd->mem, &node, | ||
390 | cache->mem_tree); | ||
391 | if (!node) | ||
392 | return NULL; | ||
393 | |||
394 | return nvgpu_pd_mem_entry_from_tree_entry(node); | ||
395 | } | ||
396 | |||
397 | static void nvgpu_pd_cache_free(struct gk20a *g, struct nvgpu_pd_cache *cache, | ||
398 | struct nvgpu_gmmu_pd *pd) | ||
399 | { | ||
400 | struct nvgpu_pd_mem_entry *pentry; | ||
401 | |||
402 | pd_dbg(g, "PD-Free [C] 0x%p", pd->mem); | ||
403 | |||
404 | pentry = nvgpu_pd_cache_look_up(g, cache, pd); | ||
405 | if (!pentry) { | ||
406 | WARN(1, "Attempting to free non-existent pd"); | ||
407 | return; | ||
408 | } | ||
409 | |||
410 | nvgpu_pd_cache_do_free(g, cache, pentry, pd); | ||
411 | } | ||
412 | |||
413 | void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd) | ||
414 | { | ||
415 | struct gk20a *g = gk20a_from_vm(vm); | ||
416 | |||
417 | /* | ||
418 | * Simple case: just DMA free. | ||
419 | */ | ||
420 | if (!pd->cached) | ||
421 | return __nvgpu_pd_cache_free_direct(g, pd); | ||
422 | |||
423 | nvgpu_mutex_acquire(&g->mm.pd_cache->lock); | ||
424 | nvgpu_pd_cache_free(g, g->mm.pd_cache, pd); | ||
425 | nvgpu_mutex_release(&g->mm.pd_cache->lock); | ||
426 | } | ||