diff options
-rw-r--r-- | drivers/gpu/nvgpu/Makefile.nvgpu | 1 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/common/mm/gmmu.c | 81 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/common/mm/pd_cache.c | 426 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/common/mm/vm.c | 50 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/gk20a.c | 9 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 9 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/mm_gk20a.h | 4 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gp10b/mm_gp10b.c | 10 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/include/nvgpu/gmmu.h | 91 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/include/nvgpu/log.h | 1 |
10 files changed, 609 insertions, 73 deletions
diff --git a/drivers/gpu/nvgpu/Makefile.nvgpu b/drivers/gpu/nvgpu/Makefile.nvgpu index 3a256771..4aaf7bc5 100644 --- a/drivers/gpu/nvgpu/Makefile.nvgpu +++ b/drivers/gpu/nvgpu/Makefile.nvgpu | |||
@@ -50,6 +50,7 @@ nvgpu-y := \ | |||
50 | common/mm/page_allocator.o \ | 50 | common/mm/page_allocator.o \ |
51 | common/mm/lockless_allocator.o \ | 51 | common/mm/lockless_allocator.o \ |
52 | common/mm/gmmu.o \ | 52 | common/mm/gmmu.o \ |
53 | common/mm/pd_cache.o \ | ||
53 | common/mm/vm.o \ | 54 | common/mm/vm.o \ |
54 | common/mm/vm_area.o \ | 55 | common/mm/vm_area.o \ |
55 | common/bus.o \ | 56 | common/bus.o \ |
diff --git a/drivers/gpu/nvgpu/common/mm/gmmu.c b/drivers/gpu/nvgpu/common/mm/gmmu.c index ec1bc095..602dfb3b 100644 --- a/drivers/gpu/nvgpu/common/mm/gmmu.c +++ b/drivers/gpu/nvgpu/common/mm/gmmu.c | |||
@@ -45,7 +45,8 @@ static int pd_allocate(struct vm_gk20a *vm, | |||
45 | struct nvgpu_gmmu_pd *pd, | 45 | struct nvgpu_gmmu_pd *pd, |
46 | const struct gk20a_mmu_level *l, | 46 | const struct gk20a_mmu_level *l, |
47 | struct nvgpu_gmmu_attrs *attrs); | 47 | struct nvgpu_gmmu_attrs *attrs); |
48 | 48 | static u32 pd_size(const struct gk20a_mmu_level *l, | |
49 | struct nvgpu_gmmu_attrs *attrs); | ||
49 | /* | 50 | /* |
50 | * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU | 51 | * Core GMMU map function for the kernel to use. If @addr is 0 then the GPU |
51 | * VA will be allocated for you. If addr is non-zero then the buffer will be | 52 | * VA will be allocated for you. If addr is non-zero then the buffer will be |
@@ -138,6 +139,9 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va) | |||
138 | 139 | ||
139 | int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) | 140 | int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) |
140 | { | 141 | { |
142 | u32 pdb_size; | ||
143 | int err; | ||
144 | |||
141 | /* | 145 | /* |
142 | * Need this just for page size. Everything else can be ignored. Also | 146 | * Need this just for page size. Everything else can be ignored. Also |
143 | * note that we can just use pgsz 0 (i.e small pages) since the number | 147 | * note that we can just use pgsz 0 (i.e small pages) since the number |
@@ -148,56 +152,43 @@ int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm) | |||
148 | .pgsz = 0, | 152 | .pgsz = 0, |
149 | }; | 153 | }; |
150 | 154 | ||
151 | return pd_allocate(vm, &vm->pdb, &vm->mmu_levels[0], &attrs); | 155 | /* |
152 | } | 156 | * PDB size here must be one page so that its address is page size |
157 | * aligned. Although lower PDE tables can be aligned at 256B boundaries | ||
158 | * the main PDB must be page aligned. | ||
159 | */ | ||
160 | pdb_size = ALIGN(pd_size(&vm->mmu_levels[0], &attrs), PAGE_SIZE); | ||
161 | |||
162 | err = __nvgpu_pd_cache_alloc_direct(vm->mm->g, &vm->pdb, pdb_size); | ||
163 | if (WARN_ON(err)) | ||
164 | return err; | ||
153 | 165 | ||
166 | /* | ||
167 | * One mb() is done after all mapping operations. Don't need individual | ||
168 | * barriers for each PD write. | ||
169 | */ | ||
170 | vm->pdb.mem->skip_wmb = true; | ||
171 | |||
172 | return 0; | ||
173 | } | ||
154 | 174 | ||
155 | /* | 175 | /* |
156 | * Ensure that there's a CPU mapping for the page directory memory. This won't | 176 | * Ensure that there's a CPU mapping for the page directory memory. This won't |
157 | * always be the case for 32 bit systems since we may need to save kernel | 177 | * always be the case for 32 bit systems since we may need to save kernel |
158 | * virtual memory. | 178 | * virtual memory. |
159 | */ | 179 | */ |
160 | static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) | 180 | static int map_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *pd) |
161 | { | 181 | { |
162 | return nvgpu_mem_begin(g, &entry->mem); | 182 | return nvgpu_mem_begin(g, pd->mem); |
163 | } | 183 | } |
164 | 184 | ||
165 | /* | 185 | /* |
166 | * Handle any necessary CPU unmap semantics for a page directory's DMA memory. | 186 | * Handle any necessary CPU unmap semantics for a page directory's DMA memory. |
167 | * For 64 bit platforms this is a noop. | 187 | * For 64 bit platforms this is a noop. |
168 | */ | 188 | */ |
169 | static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *entry) | 189 | static void unmap_gmmu_pages(struct gk20a *g, struct nvgpu_gmmu_pd *pd) |
170 | { | ||
171 | nvgpu_mem_end(g, &entry->mem); | ||
172 | } | ||
173 | |||
174 | static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 bytes, | ||
175 | struct nvgpu_gmmu_pd *pd) | ||
176 | { | ||
177 | struct gk20a *g = gk20a_from_vm(vm); | ||
178 | unsigned long flags = NVGPU_DMA_FORCE_CONTIGUOUS; | ||
179 | int err; | ||
180 | |||
181 | /* | ||
182 | * On arm32 vmalloc space is a precious commodity so we do not map pages | ||
183 | * by default. | ||
184 | */ | ||
185 | if (!IS_ENABLED(CONFIG_ARM64)) | ||
186 | flags |= NVGPU_DMA_NO_KERNEL_MAPPING; | ||
187 | |||
188 | err = nvgpu_dma_alloc_flags(g, flags, bytes, &pd->mem); | ||
189 | if (err) | ||
190 | return -ENOMEM; | ||
191 | |||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | ||
196 | struct nvgpu_gmmu_pd *pd) | ||
197 | { | 190 | { |
198 | struct gk20a *g = gk20a_from_vm(vm); | 191 | nvgpu_mem_end(g, pd->mem); |
199 | |||
200 | nvgpu_dma_free(g, &pd->mem); | ||
201 | } | 192 | } |
202 | 193 | ||
203 | /* | 194 | /* |
@@ -205,10 +196,14 @@ void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | |||
205 | */ | 196 | */ |
206 | u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd) | 197 | u64 nvgpu_pde_phys_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd) |
207 | { | 198 | { |
199 | u64 page_addr; | ||
200 | |||
208 | if (g->mm.has_physical_mode) | 201 | if (g->mm.has_physical_mode) |
209 | return sg_phys(pd->mem.priv.sgt->sgl); | 202 | page_addr = sg_phys(pd->mem->priv.sgt->sgl); |
210 | else | 203 | else |
211 | return nvgpu_mem_get_base_addr(g, &pd->mem, 0); | 204 | page_addr = nvgpu_mem_get_base_addr(g, pd->mem, 0); |
205 | |||
206 | return page_addr + pd->mem_offs; | ||
212 | } | 207 | } |
213 | 208 | ||
214 | /* | 209 | /* |
@@ -254,10 +249,10 @@ static int pd_allocate(struct vm_gk20a *vm, | |||
254 | { | 249 | { |
255 | int err; | 250 | int err; |
256 | 251 | ||
257 | if (pd->mem.size) | 252 | if (pd->mem) |
258 | return 0; | 253 | return 0; |
259 | 254 | ||
260 | err = nvgpu_alloc_gmmu_pages(vm, pd_size(l, attrs), pd); | 255 | err = __nvgpu_pd_alloc(vm, pd, pd_size(l, attrs)); |
261 | if (err) { | 256 | if (err) { |
262 | nvgpu_info(vm->mm->g, "error allocating page directory!"); | 257 | nvgpu_info(vm->mm->g, "error allocating page directory!"); |
263 | return err; | 258 | return err; |
@@ -267,7 +262,7 @@ static int pd_allocate(struct vm_gk20a *vm, | |||
267 | * One mb() is done after all mapping operations. Don't need individual | 262 | * One mb() is done after all mapping operations. Don't need individual |
268 | * barriers for each PD write. | 263 | * barriers for each PD write. |
269 | */ | 264 | */ |
270 | pd->mem.skip_wmb = true; | 265 | pd->mem->skip_wmb = true; |
271 | 266 | ||
272 | return 0; | 267 | return 0; |
273 | } | 268 | } |
@@ -778,7 +773,7 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm, | |||
778 | } | 773 | } |
779 | 774 | ||
780 | if (!batch) | 775 | if (!batch) |
781 | g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); | 776 | g->ops.fb.tlb_invalidate(g, vm->pdb.mem); |
782 | else | 777 | else |
783 | batch->need_tlb_invalidate = true; | 778 | batch->need_tlb_invalidate = true; |
784 | 779 | ||
@@ -830,7 +825,7 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm, | |||
830 | 825 | ||
831 | if (!batch) { | 826 | if (!batch) { |
832 | gk20a_mm_l2_flush(g, true); | 827 | gk20a_mm_l2_flush(g, true); |
833 | g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); | 828 | g->ops.fb.tlb_invalidate(g, vm->pdb.mem); |
834 | } else { | 829 | } else { |
835 | if (!batch->gpu_l2_flushed) { | 830 | if (!batch->gpu_l2_flushed) { |
836 | gk20a_mm_l2_flush(g, true); | 831 | gk20a_mm_l2_flush(g, true); |
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c new file mode 100644 index 00000000..4f312eff --- /dev/null +++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c | |||
@@ -0,0 +1,426 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify it | ||
5 | * under the terms and conditions of the GNU General Public License, | ||
6 | * version 2, as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
11 | * more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
15 | */ | ||
16 | |||
17 | #include <nvgpu/log.h> | ||
18 | #include <nvgpu/dma.h> | ||
19 | #include <nvgpu/gmmu.h> | ||
20 | #include <nvgpu/nvgpu_mem.h> | ||
21 | #include <nvgpu/list.h> | ||
22 | #include <nvgpu/log2.h> | ||
23 | |||
24 | #include "gk20a/gk20a.h" | ||
25 | #include "gk20a/mm_gk20a.h" | ||
26 | |||
27 | #define pd_dbg(g, fmt, args...) nvgpu_log(g, gpu_dbg_pd_cache, fmt, ##args) | ||
28 | |||
29 | /** | ||
30 | * DOC: PD cache | ||
31 | * | ||
32 | * In the name of saving memory with the many sub-page sized PD levels in Pascal | ||
33 | * and beyond a way of packing PD tables together is necessary. This code here | ||
34 | * does just that. If a PD table only requires 1024 bytes, then it is possible | ||
35 | * to have 4 of these PDs in one page. This is even more pronounced for 256 byte | ||
36 | * PD tables. | ||
37 | * | ||
38 | * The pd cache is basically just a slab allocator. Each instance of the nvgpu | ||
39 | * driver makes one of these structs: | ||
40 | * | ||
41 | * struct nvgpu_pd_cache { | ||
42 | * struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT]; | ||
43 | * struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT]; | ||
44 | * | ||
45 | * struct nvgpu_rbtree_node *mem_tree; | ||
46 | * }; | ||
47 | * | ||
48 | * There are two sets of lists, the full and the partial. The full lists contain | ||
49 | * pages of memory for which all the memory in that page is in use. The partial | ||
50 | * lists contain partially full pages of memory which can be used for more PD | ||
51 | * allocations. There are a couple of assumptions here: | ||
52 | * | ||
53 | * 1. PDs greater than or equal to the page size bypass the pd cache. | ||
54 | * 2. PDs are always power of 2 and greater than %NVGPU_PD_CACHE_MIN bytes. | ||
55 | * | ||
56 | * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial | ||
57 | * lists. For a 4Kb page NVGPU_PD_CACHE_COUNT is 4. This is enough space for | ||
58 | * 256, 512, 1024, and 2048 byte PDs. | ||
59 | * | ||
60 | * __nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD | ||
61 | * size is page size or larger and choose the correct allocation scheme - either | ||
62 | * from the PD cache or directly. Similarly __nvgpu_pd_free() will free a PD | ||
63 | * allocated by __nvgpu_pd_alloc(). | ||
64 | * | ||
65 | * Since the top level PD (the PDB) is a page aligned pointer but less than a | ||
66 | * page size the direct functions must be used for allocating PDBs. Otherwise | ||
67 | * there would be alignment issues for the PDBs when they get packed. | ||
68 | */ | ||
69 | |||
70 | static u32 nvgpu_pd_cache_nr(u32 bytes) | ||
71 | { | ||
72 | return ilog2(bytes >> (NVGPU_PD_CACHE_MIN_SHIFT - 1)); | ||
73 | } | ||
74 | |||
75 | static u32 nvgpu_pd_cache_get_mask(struct nvgpu_pd_mem_entry *pentry) | ||
76 | { | ||
77 | u32 mask_offset = 1 << (PAGE_SIZE / pentry->pd_size); | ||
78 | |||
79 | return mask_offset - 1; | ||
80 | } | ||
81 | |||
82 | int nvgpu_pd_cache_init(struct gk20a *g) | ||
83 | { | ||
84 | struct nvgpu_pd_cache *cache; | ||
85 | int i; | ||
86 | |||
87 | /* | ||
88 | * This gets called from finalize_poweron() so we need to make sure we | ||
89 | * don't reinit the pd_cache over and over. | ||
90 | */ | ||
91 | if (g->mm.pd_cache) | ||
92 | return 0; | ||
93 | |||
94 | cache = nvgpu_kzalloc(g, sizeof(*cache)); | ||
95 | if (!cache) { | ||
96 | nvgpu_err(g, "Failed to alloc pd_cache!"); | ||
97 | return -ENOMEM; | ||
98 | } | ||
99 | |||
100 | for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) { | ||
101 | nvgpu_init_list_node(&cache->full[i]); | ||
102 | nvgpu_init_list_node(&cache->partial[i]); | ||
103 | } | ||
104 | |||
105 | cache->mem_tree = NULL; | ||
106 | g->mm.pd_cache = cache; | ||
107 | nvgpu_mutex_init(&cache->lock); | ||
108 | |||
109 | pd_dbg(g, "PD cache initialized!"); | ||
110 | |||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | void nvgpu_pd_cache_fini(struct gk20a *g) | ||
115 | { | ||
116 | int i; | ||
117 | struct nvgpu_pd_cache *cache = g->mm.pd_cache; | ||
118 | |||
119 | if (!cache) | ||
120 | return; | ||
121 | |||
122 | for (i = 0; i < NVGPU_PD_CACHE_COUNT; i++) { | ||
123 | WARN_ON(!nvgpu_list_empty(&cache->full[i])); | ||
124 | WARN_ON(!nvgpu_list_empty(&cache->partial[i])); | ||
125 | } | ||
126 | |||
127 | nvgpu_kfree(g, g->mm.pd_cache); | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * This is the simple pass-through for greater than page or page sized PDs. | ||
132 | * | ||
133 | * Note: this does not need the cache lock since it does not modify any of the | ||
134 | * PD cache data structures. | ||
135 | */ | ||
136 | int __nvgpu_pd_cache_alloc_direct(struct gk20a *g, | ||
137 | struct nvgpu_gmmu_pd *pd, u32 bytes) | ||
138 | { | ||
139 | int err; | ||
140 | |||
141 | pd_dbg(g, "PD-Alloc [D] %u bytes", bytes); | ||
142 | |||
143 | pd->mem = nvgpu_kzalloc(g, sizeof(*pd->mem)); | ||
144 | if (!pd->mem) { | ||
145 | pd_dbg(g, "OOM allocating nvgpu_mem struct!"); | ||
146 | return -ENOMEM; | ||
147 | } | ||
148 | |||
149 | err = nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS, | ||
150 | bytes, pd->mem); | ||
151 | if (err) { | ||
152 | pd_dbg(g, "OOM allocating page directory!"); | ||
153 | nvgpu_kfree(g, pd->mem); | ||
154 | return -ENOMEM; | ||
155 | } | ||
156 | |||
157 | pd->cached = false; | ||
158 | pd->mem_offs = 0; | ||
159 | |||
160 | return 0; | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * Make a new nvgpu_pd_cache_entry and allocate a PD from it. Update the passed | ||
165 | * pd to reflect this allocation. | ||
166 | */ | ||
167 | static int nvgpu_pd_cache_alloc_new(struct gk20a *g, | ||
168 | struct nvgpu_pd_cache *cache, | ||
169 | struct nvgpu_gmmu_pd *pd, | ||
170 | u32 bytes) | ||
171 | { | ||
172 | struct nvgpu_pd_mem_entry *pentry; | ||
173 | |||
174 | pd_dbg(g, "PD-Alloc [C] New: offs=0"); | ||
175 | |||
176 | pentry = nvgpu_kzalloc(g, sizeof(*pentry)); | ||
177 | if (!pentry) { | ||
178 | pd_dbg(g, "OOM allocating pentry!"); | ||
179 | return -ENOMEM; | ||
180 | } | ||
181 | |||
182 | if (nvgpu_dma_alloc_flags(g, NVGPU_DMA_FORCE_CONTIGUOUS, | ||
183 | PAGE_SIZE, &pentry->mem)) { | ||
184 | nvgpu_kfree(g, pentry); | ||
185 | pd_dbg(g, "Unable to DMA alloc!"); | ||
186 | return -ENOMEM; | ||
187 | } | ||
188 | |||
189 | pentry->pd_size = bytes; | ||
190 | nvgpu_list_add(&pentry->list_entry, | ||
191 | &cache->partial[nvgpu_pd_cache_nr(bytes)]); | ||
192 | |||
193 | /* | ||
194 | * This allocates the very first PD table in the set of tables in this | ||
195 | * nvgpu_pd_mem_entry. | ||
196 | */ | ||
197 | pentry->alloc_map = 1; | ||
198 | |||
199 | /* | ||
200 | * Now update the nvgpu_gmmu_pd to reflect this allocation. | ||
201 | */ | ||
202 | pd->mem = &pentry->mem; | ||
203 | pd->mem_offs = 0; | ||
204 | pd->cached = true; | ||
205 | |||
206 | pentry->tree_entry.key_start = (u64)(uintptr_t)&pentry->mem; | ||
207 | nvgpu_rbtree_insert(&pentry->tree_entry, &cache->mem_tree); | ||
208 | |||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g, | ||
213 | struct nvgpu_pd_cache *cache, | ||
214 | struct nvgpu_pd_mem_entry *pentry, | ||
215 | struct nvgpu_gmmu_pd *pd) | ||
216 | { | ||
217 | unsigned long bit_offs; | ||
218 | u32 mem_offs; | ||
219 | u32 pentry_mask = nvgpu_pd_cache_get_mask(pentry); | ||
220 | |||
221 | /* | ||
222 | * Find and allocate an open PD. | ||
223 | */ | ||
224 | bit_offs = ffz(pentry->alloc_map); | ||
225 | mem_offs = bit_offs * pentry->pd_size; | ||
226 | |||
227 | /* Bit map full. Something's wrong. */ | ||
228 | if (WARN_ON(bit_offs >= ffz(pentry_mask))) | ||
229 | return -ENOMEM; | ||
230 | |||
231 | pentry->alloc_map |= 1 << bit_offs; | ||
232 | |||
233 | pd_dbg(g, "PD-Alloc [C] Partial: offs=%lu", bit_offs); | ||
234 | |||
235 | /* | ||
236 | * First update the pd. | ||
237 | */ | ||
238 | pd->mem = &pentry->mem; | ||
239 | pd->mem_offs = mem_offs; | ||
240 | pd->cached = true; | ||
241 | |||
242 | /* | ||
243 | * Now make sure the pentry is in the correct list (full vs partial). | ||
244 | */ | ||
245 | if ((pentry->alloc_map & pentry_mask) == pentry_mask) { | ||
246 | pd_dbg(g, "Adding pentry to full list!"); | ||
247 | nvgpu_list_del(&pentry->list_entry); | ||
248 | nvgpu_list_add(&pentry->list_entry, | ||
249 | &cache->full[nvgpu_pd_cache_nr(pentry->pd_size)]); | ||
250 | } | ||
251 | |||
252 | return 0; | ||
253 | } | ||
254 | |||
255 | /* | ||
256 | * Get a partially full nvgpu_pd_mem_entry. Returns NULL if there is no partial | ||
257 | * nvgpu_pd_mem_entry's. | ||
258 | */ | ||
259 | static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_get_partial( | ||
260 | struct nvgpu_pd_cache *cache, u32 bytes) | ||
261 | { | ||
262 | struct nvgpu_list_node *list = | ||
263 | &cache->partial[nvgpu_pd_cache_nr(bytes)]; | ||
264 | |||
265 | if (nvgpu_list_empty(list)) | ||
266 | return NULL; | ||
267 | |||
268 | return nvgpu_list_first_entry(list, | ||
269 | nvgpu_pd_mem_entry, | ||
270 | list_entry); | ||
271 | } | ||
272 | |||
273 | /* | ||
274 | * Allocate memory from an nvgpu_mem for the page directory. | ||
275 | */ | ||
276 | static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache, | ||
277 | struct nvgpu_gmmu_pd *pd, u32 bytes) | ||
278 | { | ||
279 | struct nvgpu_pd_mem_entry *pentry; | ||
280 | int err; | ||
281 | |||
282 | pd_dbg(g, "PD-Alloc [C] %u bytes", bytes); | ||
283 | |||
284 | if (bytes & (bytes - 1) || | ||
285 | (bytes >= PAGE_SIZE || | ||
286 | bytes < NVGPU_PD_CACHE_MIN)) { | ||
287 | pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes); | ||
288 | return -EINVAL; | ||
289 | } | ||
290 | |||
291 | pentry = nvgpu_pd_cache_get_partial(cache, bytes); | ||
292 | if (!pentry) | ||
293 | err = nvgpu_pd_cache_alloc_new(g, cache, pd, bytes); | ||
294 | else | ||
295 | err = nvgpu_pd_cache_alloc_from_partial(g, cache, pentry, pd); | ||
296 | |||
297 | if (err) | ||
298 | pd_dbg(g, "PD-Alloc [C] Failed!"); | ||
299 | |||
300 | return err; | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * Allocate the DMA memory for a page directory. This handles the necessary PD | ||
305 | * cache logistics. Since on Parker and later GPUs some of the page directories | ||
306 | * are smaller than a page packing these PDs together saves a lot of memory. | ||
307 | */ | ||
308 | int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes) | ||
309 | { | ||
310 | struct gk20a *g = gk20a_from_vm(vm); | ||
311 | int err; | ||
312 | |||
313 | /* | ||
314 | * Simple case: PD is bigger than a page so just do a regular DMA | ||
315 | * alloc. | ||
316 | */ | ||
317 | if (bytes >= PAGE_SIZE) { | ||
318 | err = __nvgpu_pd_cache_alloc_direct(g, pd, bytes); | ||
319 | if (err) | ||
320 | return err; | ||
321 | |||
322 | return 0; | ||
323 | } | ||
324 | |||
325 | if (WARN_ON(!g->mm.pd_cache)) | ||
326 | return -ENOMEM; | ||
327 | |||
328 | nvgpu_mutex_acquire(&g->mm.pd_cache->lock); | ||
329 | err = nvgpu_pd_cache_alloc(g, g->mm.pd_cache, pd, bytes); | ||
330 | nvgpu_mutex_release(&g->mm.pd_cache->lock); | ||
331 | |||
332 | return err; | ||
333 | } | ||
334 | |||
335 | void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd) | ||
336 | { | ||
337 | pd_dbg(g, "PD-Free [D] 0x%p", pd->mem); | ||
338 | |||
339 | if (!pd->mem) | ||
340 | return; | ||
341 | |||
342 | nvgpu_dma_free(g, pd->mem); | ||
343 | nvgpu_kfree(g, pd->mem); | ||
344 | pd->mem = NULL; | ||
345 | } | ||
346 | |||
347 | static void nvgpu_pd_cache_free_mem_entry(struct gk20a *g, | ||
348 | struct nvgpu_pd_cache *cache, | ||
349 | struct nvgpu_pd_mem_entry *pentry) | ||
350 | { | ||
351 | nvgpu_dma_free(g, &pentry->mem); | ||
352 | nvgpu_list_del(&pentry->list_entry); | ||
353 | nvgpu_rbtree_unlink(&pentry->tree_entry, &cache->mem_tree); | ||
354 | nvgpu_kfree(g, pentry); | ||
355 | } | ||
356 | |||
357 | static void nvgpu_pd_cache_do_free(struct gk20a *g, | ||
358 | struct nvgpu_pd_cache *cache, | ||
359 | struct nvgpu_pd_mem_entry *pentry, | ||
360 | struct nvgpu_gmmu_pd *pd) | ||
361 | { | ||
362 | u32 index = pd->mem_offs / pentry->pd_size; | ||
363 | u32 bit = 1 << index; | ||
364 | |||
365 | /* Mark entry as free. */ | ||
366 | pentry->alloc_map &= ~bit; | ||
367 | |||
368 | if (pentry->alloc_map & nvgpu_pd_cache_get_mask(pentry)) { | ||
369 | /* | ||
370 | * Partially full still. If it was already on the partial list | ||
371 | * this just re-adds it. | ||
372 | */ | ||
373 | nvgpu_list_del(&pentry->list_entry); | ||
374 | nvgpu_list_add(&pentry->list_entry, | ||
375 | &cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]); | ||
376 | } else { | ||
377 | /* Empty now so free it. */ | ||
378 | nvgpu_pd_cache_free_mem_entry(g, cache, pentry); | ||
379 | } | ||
380 | } | ||
381 | |||
382 | static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up( | ||
383 | struct gk20a *g, | ||
384 | struct nvgpu_pd_cache *cache, | ||
385 | struct nvgpu_gmmu_pd *pd) | ||
386 | { | ||
387 | struct nvgpu_rbtree_node *node; | ||
388 | |||
389 | nvgpu_rbtree_search((u64)(uintptr_t)pd->mem, &node, | ||
390 | cache->mem_tree); | ||
391 | if (!node) | ||
392 | return NULL; | ||
393 | |||
394 | return nvgpu_pd_mem_entry_from_tree_entry(node); | ||
395 | } | ||
396 | |||
397 | static void nvgpu_pd_cache_free(struct gk20a *g, struct nvgpu_pd_cache *cache, | ||
398 | struct nvgpu_gmmu_pd *pd) | ||
399 | { | ||
400 | struct nvgpu_pd_mem_entry *pentry; | ||
401 | |||
402 | pd_dbg(g, "PD-Free [C] 0x%p", pd->mem); | ||
403 | |||
404 | pentry = nvgpu_pd_cache_look_up(g, cache, pd); | ||
405 | if (!pentry) { | ||
406 | WARN(1, "Attempting to free non-existent pd"); | ||
407 | return; | ||
408 | } | ||
409 | |||
410 | nvgpu_pd_cache_do_free(g, cache, pentry, pd); | ||
411 | } | ||
412 | |||
413 | void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd) | ||
414 | { | ||
415 | struct gk20a *g = gk20a_from_vm(vm); | ||
416 | |||
417 | /* | ||
418 | * Simple case: just DMA free. | ||
419 | */ | ||
420 | if (!pd->cached) | ||
421 | return __nvgpu_pd_cache_free_direct(g, pd); | ||
422 | |||
423 | nvgpu_mutex_acquire(&g->mm.pd_cache->lock); | ||
424 | nvgpu_pd_cache_free(g, g->mm.pd_cache, pd); | ||
425 | nvgpu_mutex_release(&g->mm.pd_cache->lock); | ||
426 | } | ||
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c index 3aeba500..3ed3c7fe 100644 --- a/drivers/gpu/nvgpu/common/mm/vm.c +++ b/drivers/gpu/nvgpu/common/mm/vm.c | |||
@@ -35,21 +35,42 @@ int vm_aspace_id(struct vm_gk20a *vm) | |||
35 | return vm->as_share ? vm->as_share->id : -1; | 35 | return vm->as_share ? vm->as_share->id : -1; |
36 | } | 36 | } |
37 | 37 | ||
38 | static void nvgpu_vm_free_entries(struct vm_gk20a *vm, | 38 | static void __nvgpu_vm_free_entries(struct vm_gk20a *vm, |
39 | struct nvgpu_gmmu_pd *parent, | 39 | struct nvgpu_gmmu_pd *pd, |
40 | int level) | 40 | int level) |
41 | { | 41 | { |
42 | int i; | 42 | int i; |
43 | 43 | ||
44 | if (parent->entries) | 44 | if (pd->mem) { |
45 | for (i = 0; i < parent->num_entries; i++) | 45 | __nvgpu_pd_free(vm, pd); |
46 | nvgpu_vm_free_entries(vm, &parent->entries[i], | 46 | pd->mem = NULL; |
47 | } | ||
48 | |||
49 | if (pd->entries) { | ||
50 | for (i = 0; i < pd->num_entries; i++) | ||
51 | __nvgpu_vm_free_entries(vm, &pd->entries[i], | ||
47 | level + 1); | 52 | level + 1); |
53 | nvgpu_vfree(vm->mm->g, pd->entries); | ||
54 | pd->entries = NULL; | ||
55 | } | ||
56 | } | ||
57 | |||
58 | static void nvgpu_vm_free_entries(struct vm_gk20a *vm, | ||
59 | struct nvgpu_gmmu_pd *pdb) | ||
60 | { | ||
61 | struct gk20a *g = vm->mm->g; | ||
62 | int i; | ||
63 | |||
64 | __nvgpu_pd_cache_free_direct(g, pdb); | ||
65 | |||
66 | if (!pdb->entries) | ||
67 | return; | ||
68 | |||
69 | for (i = 0; i < pdb->num_entries; i++) | ||
70 | __nvgpu_vm_free_entries(vm, &pdb->entries[i], 1); | ||
48 | 71 | ||
49 | if (parent->mem.size) | 72 | nvgpu_vfree(g, pdb->entries); |
50 | nvgpu_free_gmmu_pages(vm, parent); | 73 | pdb->entries = NULL; |
51 | nvgpu_vfree(vm->mm->g, parent->entries); | ||
52 | parent->entries = NULL; | ||
53 | } | 74 | } |
54 | 75 | ||
55 | u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, | 76 | u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size, |
@@ -110,7 +131,7 @@ void nvgpu_vm_mapping_batch_finish_locked( | |||
110 | 131 | ||
111 | if (mapping_batch->need_tlb_invalidate) { | 132 | if (mapping_batch->need_tlb_invalidate) { |
112 | struct gk20a *g = gk20a_from_vm(vm); | 133 | struct gk20a *g = gk20a_from_vm(vm); |
113 | g->ops.fb.tlb_invalidate(g, &vm->pdb.mem); | 134 | g->ops.fb.tlb_invalidate(g, vm->pdb.mem); |
114 | } | 135 | } |
115 | } | 136 | } |
116 | 137 | ||
@@ -407,9 +428,8 @@ clean_up_allocators: | |||
407 | if (nvgpu_alloc_initialized(&vm->user_lp)) | 428 | if (nvgpu_alloc_initialized(&vm->user_lp)) |
408 | nvgpu_alloc_destroy(&vm->user_lp); | 429 | nvgpu_alloc_destroy(&vm->user_lp); |
409 | clean_up_page_tables: | 430 | clean_up_page_tables: |
410 | /* Cleans up nvgpu_vm_init_page_tables() */ | 431 | /* Cleans up nvgpu_gmmu_init_page_table() */ |
411 | nvgpu_vfree(g, vm->pdb.entries); | 432 | __nvgpu_pd_cache_free_direct(g, &vm->pdb); |
412 | nvgpu_free_gmmu_pages(vm, &vm->pdb); | ||
413 | clean_up_vgpu_vm: | 433 | clean_up_vgpu_vm: |
414 | #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION | 434 | #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION |
415 | if (g->is_virtual) | 435 | if (g->is_virtual) |
@@ -525,7 +545,7 @@ static void __nvgpu_vm_remove(struct vm_gk20a *vm) | |||
525 | if (nvgpu_alloc_initialized(&vm->user_lp)) | 545 | if (nvgpu_alloc_initialized(&vm->user_lp)) |
526 | nvgpu_alloc_destroy(&vm->user_lp); | 546 | nvgpu_alloc_destroy(&vm->user_lp); |
527 | 547 | ||
528 | nvgpu_vm_free_entries(vm, &vm->pdb, 0); | 548 | nvgpu_vm_free_entries(vm, &vm->pdb); |
529 | 549 | ||
530 | #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION | 550 | #ifdef CONFIG_TEGRA_GR_VIRTUALIZATION |
531 | if (g->is_virtual) | 551 | if (g->is_virtual) |
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c index 380c28ac..a0753770 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gk20a.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <nvgpu/soc.h> | 25 | #include <nvgpu/soc.h> |
26 | #include <nvgpu/enabled.h> | 26 | #include <nvgpu/enabled.h> |
27 | #include <nvgpu/pmu.h> | 27 | #include <nvgpu/pmu.h> |
28 | #include <nvgpu/gmmu.h> | ||
28 | 29 | ||
29 | #include <trace/events/gk20a.h> | 30 | #include <trace/events/gk20a.h> |
30 | 31 | ||
@@ -174,6 +175,14 @@ int gk20a_finalize_poweron(struct gk20a *g) | |||
174 | g->gpu_reset_done = true; | 175 | g->gpu_reset_done = true; |
175 | } | 176 | } |
176 | 177 | ||
178 | /* | ||
179 | * Do this early so any early VMs that get made are capable of mapping | ||
180 | * buffers. | ||
181 | */ | ||
182 | err = nvgpu_pd_cache_init(g); | ||
183 | if (err) | ||
184 | return err; | ||
185 | |||
177 | /* init interface layer support for PMU falcon */ | 186 | /* init interface layer support for PMU falcon */ |
178 | nvgpu_flcn_sw_init(g, FALCON_ID_PMU); | 187 | nvgpu_flcn_sw_init(g, FALCON_ID_PMU); |
179 | nvgpu_flcn_sw_init(g, FALCON_ID_SEC2); | 188 | nvgpu_flcn_sw_init(g, FALCON_ID_SEC2); |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 558a1b06..0a84cabb 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c | |||
@@ -478,6 +478,7 @@ static void gk20a_remove_mm_support(struct mm_gk20a *mm) | |||
478 | 478 | ||
479 | gk20a_semaphore_sea_destroy(g); | 479 | gk20a_semaphore_sea_destroy(g); |
480 | gk20a_vidmem_destroy(g); | 480 | gk20a_vidmem_destroy(g); |
481 | nvgpu_pd_cache_fini(g); | ||
481 | } | 482 | } |
482 | 483 | ||
483 | static int gk20a_alloc_sysmem_flush(struct gk20a *g) | 484 | static int gk20a_alloc_sysmem_flush(struct gk20a *g) |
@@ -1560,7 +1561,7 @@ static inline u32 big_valid_pde0_bits(struct gk20a *g, | |||
1560 | struct nvgpu_gmmu_pd *pd, u64 addr) | 1561 | struct nvgpu_gmmu_pd *pd, u64 addr) |
1561 | { | 1562 | { |
1562 | u32 pde0_bits = | 1563 | u32 pde0_bits = |
1563 | nvgpu_aperture_mask(g, &pd->mem, | 1564 | nvgpu_aperture_mask(g, pd->mem, |
1564 | gmmu_pde_aperture_big_sys_mem_ncoh_f(), | 1565 | gmmu_pde_aperture_big_sys_mem_ncoh_f(), |
1565 | gmmu_pde_aperture_big_video_memory_f()) | | 1566 | gmmu_pde_aperture_big_video_memory_f()) | |
1566 | gmmu_pde_address_big_sys_f( | 1567 | gmmu_pde_address_big_sys_f( |
@@ -1573,7 +1574,7 @@ static inline u32 small_valid_pde1_bits(struct gk20a *g, | |||
1573 | struct nvgpu_gmmu_pd *pd, u64 addr) | 1574 | struct nvgpu_gmmu_pd *pd, u64 addr) |
1574 | { | 1575 | { |
1575 | u32 pde1_bits = | 1576 | u32 pde1_bits = |
1576 | nvgpu_aperture_mask(g, &pd->mem, | 1577 | nvgpu_aperture_mask(g, pd->mem, |
1577 | gmmu_pde_aperture_small_sys_mem_ncoh_f(), | 1578 | gmmu_pde_aperture_small_sys_mem_ncoh_f(), |
1578 | gmmu_pde_aperture_small_video_memory_f()) | | 1579 | gmmu_pde_aperture_small_video_memory_f()) | |
1579 | gmmu_pde_vol_small_true_f() | /* tbd: why? */ | 1580 | gmmu_pde_vol_small_true_f() | /* tbd: why? */ |
@@ -2173,14 +2174,14 @@ static int gk20a_init_ce_vm(struct mm_gk20a *mm) | |||
2173 | void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, | 2174 | void gk20a_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, |
2174 | struct vm_gk20a *vm) | 2175 | struct vm_gk20a *vm) |
2175 | { | 2176 | { |
2176 | u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); | 2177 | u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0); |
2177 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); | 2178 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); |
2178 | u32 pdb_addr_hi = u64_hi32(pdb_addr); | 2179 | u32 pdb_addr_hi = u64_hi32(pdb_addr); |
2179 | 2180 | ||
2180 | gk20a_dbg_info("pde pa=0x%llx", pdb_addr); | 2181 | gk20a_dbg_info("pde pa=0x%llx", pdb_addr); |
2181 | 2182 | ||
2182 | nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), | 2183 | nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), |
2183 | nvgpu_aperture_mask(g, &vm->pdb.mem, | 2184 | nvgpu_aperture_mask(g, vm->pdb.mem, |
2184 | ram_in_page_dir_base_target_sys_mem_ncoh_f(), | 2185 | ram_in_page_dir_base_target_sys_mem_ncoh_f(), |
2185 | ram_in_page_dir_base_target_vid_mem_f()) | | 2186 | ram_in_page_dir_base_target_vid_mem_f()) | |
2186 | ram_in_page_dir_base_vol_true_f() | | 2187 | ram_in_page_dir_base_vol_true_f() | |
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index a245d0e0..cadcffa4 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h | |||
@@ -31,6 +31,8 @@ | |||
31 | #include <nvgpu/rbtree.h> | 31 | #include <nvgpu/rbtree.h> |
32 | #include <nvgpu/kref.h> | 32 | #include <nvgpu/kref.h> |
33 | 33 | ||
34 | struct nvgpu_pd_cache; | ||
35 | |||
34 | #ifdef CONFIG_ARM64 | 36 | #ifdef CONFIG_ARM64 |
35 | #define outer_flush_range(a, b) | 37 | #define outer_flush_range(a, b) |
36 | #define __cpuc_flush_dcache_area __flush_dcache_area | 38 | #define __cpuc_flush_dcache_area __flush_dcache_area |
@@ -217,6 +219,8 @@ struct mm_gk20a { | |||
217 | struct vm_gk20a *vm; | 219 | struct vm_gk20a *vm; |
218 | } ce; | 220 | } ce; |
219 | 221 | ||
222 | struct nvgpu_pd_cache *pd_cache; | ||
223 | |||
220 | struct nvgpu_mutex l2_op_lock; | 224 | struct nvgpu_mutex l2_op_lock; |
221 | struct nvgpu_mutex tlb_lock; | 225 | struct nvgpu_mutex tlb_lock; |
222 | struct nvgpu_mutex priv_lock; | 226 | struct nvgpu_mutex priv_lock; |
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c index c3867e9d..2ff199c6 100644 --- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c | |||
@@ -164,7 +164,7 @@ static void update_gmmu_pde3_locked(struct vm_gk20a *vm, | |||
164 | 164 | ||
165 | phys_addr >>= gmmu_new_pde_address_shift_v(); | 165 | phys_addr >>= gmmu_new_pde_address_shift_v(); |
166 | 166 | ||
167 | pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, | 167 | pde_v[0] |= nvgpu_aperture_mask(g, pd->mem, |
168 | gmmu_new_pde_aperture_sys_mem_ncoh_f(), | 168 | gmmu_new_pde_aperture_sys_mem_ncoh_f(), |
169 | gmmu_new_pde_aperture_video_memory_f()); | 169 | gmmu_new_pde_aperture_video_memory_f()); |
170 | pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr)); | 170 | pde_v[0] |= gmmu_new_pde_address_sys_f(u64_lo32(phys_addr)); |
@@ -209,7 +209,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm, | |||
209 | if (small_valid) { | 209 | if (small_valid) { |
210 | pde_v[2] |= | 210 | pde_v[2] |= |
211 | gmmu_new_dual_pde_address_small_sys_f(small_addr); | 211 | gmmu_new_dual_pde_address_small_sys_f(small_addr); |
212 | pde_v[2] |= nvgpu_aperture_mask(g, &pd->mem, | 212 | pde_v[2] |= nvgpu_aperture_mask(g, pd->mem, |
213 | gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), | 213 | gmmu_new_dual_pde_aperture_small_sys_mem_ncoh_f(), |
214 | gmmu_new_dual_pde_aperture_small_video_memory_f()); | 214 | gmmu_new_dual_pde_aperture_small_video_memory_f()); |
215 | pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); | 215 | pde_v[2] |= gmmu_new_dual_pde_vol_small_true_f(); |
@@ -219,7 +219,7 @@ static void update_gmmu_pde0_locked(struct vm_gk20a *vm, | |||
219 | if (big_valid) { | 219 | if (big_valid) { |
220 | pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr); | 220 | pde_v[0] |= gmmu_new_dual_pde_address_big_sys_f(big_addr); |
221 | pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); | 221 | pde_v[0] |= gmmu_new_dual_pde_vol_big_true_f(); |
222 | pde_v[0] |= nvgpu_aperture_mask(g, &pd->mem, | 222 | pde_v[0] |= nvgpu_aperture_mask(g, pd->mem, |
223 | gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), | 223 | gmmu_new_dual_pde_aperture_big_sys_mem_ncoh_f(), |
224 | gmmu_new_dual_pde_aperture_big_video_memory_f()); | 224 | gmmu_new_dual_pde_aperture_big_video_memory_f()); |
225 | pde_v[1] |= big_addr >> 28; | 225 | pde_v[1] |= big_addr >> 28; |
@@ -365,14 +365,14 @@ static const struct gk20a_mmu_level *gp10b_mm_get_mmu_levels(struct gk20a *g, | |||
365 | static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, | 365 | static void gp10b_mm_init_pdb(struct gk20a *g, struct nvgpu_mem *inst_block, |
366 | struct vm_gk20a *vm) | 366 | struct vm_gk20a *vm) |
367 | { | 367 | { |
368 | u64 pdb_addr = nvgpu_mem_get_base_addr(g, &vm->pdb.mem, 0); | 368 | u64 pdb_addr = nvgpu_mem_get_base_addr(g, vm->pdb.mem, 0); |
369 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); | 369 | u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); |
370 | u32 pdb_addr_hi = u64_hi32(pdb_addr); | 370 | u32 pdb_addr_hi = u64_hi32(pdb_addr); |
371 | 371 | ||
372 | gk20a_dbg_info("pde pa=0x%llx", pdb_addr); | 372 | gk20a_dbg_info("pde pa=0x%llx", pdb_addr); |
373 | 373 | ||
374 | nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), | 374 | nvgpu_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(), |
375 | nvgpu_aperture_mask(g, &vm->pdb.mem, | 375 | nvgpu_aperture_mask(g, vm->pdb.mem, |
376 | ram_in_page_dir_base_target_sys_mem_ncoh_f(), | 376 | ram_in_page_dir_base_target_sys_mem_ncoh_f(), |
377 | ram_in_page_dir_base_target_vid_mem_f()) | | 377 | ram_in_page_dir_base_target_vid_mem_f()) | |
378 | ram_in_page_dir_base_vol_true_f() | | 378 | ram_in_page_dir_base_vol_true_f() | |
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h index 28a2cb82..eff87c31 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h | |||
@@ -19,6 +19,9 @@ | |||
19 | 19 | ||
20 | #include <nvgpu/types.h> | 20 | #include <nvgpu/types.h> |
21 | #include <nvgpu/nvgpu_mem.h> | 21 | #include <nvgpu/nvgpu_mem.h> |
22 | #include <nvgpu/list.h> | ||
23 | #include <nvgpu/rbtree.h> | ||
24 | #include <nvgpu/lock.h> | ||
22 | 25 | ||
23 | struct scatterlist; | 26 | struct scatterlist; |
24 | 27 | ||
@@ -45,14 +48,85 @@ enum gk20a_mem_rw_flag { | |||
45 | }; | 48 | }; |
46 | 49 | ||
47 | /* | 50 | /* |
51 | * Minimum size of a cache. The number of different caches in the nvgpu_pd_cache | ||
52 | * structure of course depends on this. The MIN_SHIFT define is the right | ||
53 | * number of bits to shift to determine which list to use in the array of lists. | ||
54 | */ | ||
55 | #define NVGPU_PD_CACHE_MIN 256 | ||
56 | #define NVGPU_PD_CACHE_MIN_SHIFT 9 | ||
57 | #define NVGPU_PD_CACHE_COUNT 4 | ||
58 | |||
59 | struct nvgpu_pd_mem_entry { | ||
60 | struct nvgpu_mem mem; | ||
61 | |||
62 | /* | ||
63 | * Size of the page directories (not the mem). bmap is a bitmap showing | ||
64 | * which PDs have been allocated. The size of mem will always be one | ||
65 | * page. pd_size will always be a power of 2. | ||
66 | */ | ||
67 | u32 pd_size; | ||
68 | unsigned long alloc_map; | ||
69 | |||
70 | struct nvgpu_list_node list_entry; | ||
71 | struct nvgpu_rbtree_node tree_entry; | ||
72 | }; | ||
73 | |||
74 | static inline struct nvgpu_pd_mem_entry * | ||
75 | nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node) | ||
76 | { | ||
77 | return (struct nvgpu_pd_mem_entry *) | ||
78 | ((uintptr_t)node - | ||
79 | offsetof(struct nvgpu_pd_mem_entry, list_entry)); | ||
80 | }; | ||
81 | |||
82 | static inline struct nvgpu_pd_mem_entry * | ||
83 | nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node) | ||
84 | { | ||
85 | return (struct nvgpu_pd_mem_entry *) | ||
86 | ((uintptr_t)node - | ||
87 | offsetof(struct nvgpu_pd_mem_entry, tree_entry)); | ||
88 | }; | ||
89 | |||
90 | /* | ||
91 | * A cache for allocating PD memory from. This enables smaller PDs to be packed | ||
92 | * into single pages. | ||
93 | * | ||
94 | * This is fairly complex so see the documentation in pd_cache.c for a full | ||
95 | * description of how this is organized. | ||
96 | */ | ||
97 | struct nvgpu_pd_cache { | ||
98 | /* | ||
99 | * Array of lists of full nvgpu_pd_mem_entries and partially full (or | ||
100 | * empty) nvgpu_pd_mem_entries. | ||
101 | */ | ||
102 | struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT]; | ||
103 | struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT]; | ||
104 | |||
105 | /* | ||
106 | * Tree of all allocated struct nvgpu_mem's for fast look up. | ||
107 | */ | ||
108 | struct nvgpu_rbtree_node *mem_tree; | ||
109 | |||
110 | /* | ||
111 | * All access to the cache must be locked. This protects the lists and | ||
112 | * the rb tree. | ||
113 | */ | ||
114 | struct nvgpu_mutex lock; | ||
115 | }; | ||
116 | |||
117 | /* | ||
48 | * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs | 118 | * GMMU page directory. This is the kernel's tracking of a list of PDEs or PTEs |
49 | * in the GMMU. | 119 | * in the GMMU. |
50 | */ | 120 | */ |
51 | struct nvgpu_gmmu_pd { | 121 | struct nvgpu_gmmu_pd { |
52 | /* | 122 | /* |
53 | * DMA memory describing the PTEs or PTEs. | 123 | * DMA memory describing the PTEs or PDEs. @mem_offs describes the |
124 | * offset of the PDE table in @mem. @cached specifies if this PD is | ||
125 | * using pd_cache memory. | ||
54 | */ | 126 | */ |
55 | struct nvgpu_mem mem; | 127 | struct nvgpu_mem *mem; |
128 | u32 mem_offs; | ||
129 | bool cached; | ||
56 | 130 | ||
57 | /* | 131 | /* |
58 | * List of pointers to the next level of page tables. Does not | 132 | * List of pointers to the next level of page tables. Does not |
@@ -66,7 +140,7 @@ struct nvgpu_gmmu_pd { | |||
66 | * Reduce the number of arguments getting passed through the various levels of | 140 | * Reduce the number of arguments getting passed through the various levels of |
67 | * GMMU mapping functions. | 141 | * GMMU mapping functions. |
68 | * | 142 | * |
69 | * The following fields are set statically and do not change throughout | 143 | * The following fields are set statically and do not change throughout the |
70 | * mapping call: | 144 | * mapping call: |
71 | * | 145 | * |
72 | * pgsz: Index into the page size table. | 146 | * pgsz: Index into the page size table. |
@@ -166,8 +240,13 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, | |||
166 | struct nvgpu_mem *mem, | 240 | struct nvgpu_mem *mem, |
167 | u64 gpu_va); | 241 | u64 gpu_va); |
168 | 242 | ||
169 | void nvgpu_free_gmmu_pages(struct vm_gk20a *vm, | 243 | int __nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes); |
170 | struct nvgpu_gmmu_pd *entry); | 244 | void __nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd); |
245 | int __nvgpu_pd_cache_alloc_direct(struct gk20a *g, | ||
246 | struct nvgpu_gmmu_pd *pd, u32 bytes); | ||
247 | void __nvgpu_pd_cache_free_direct(struct gk20a *g, struct nvgpu_gmmu_pd *pd); | ||
248 | int nvgpu_pd_cache_init(struct gk20a *g); | ||
249 | void nvgpu_pd_cache_fini(struct gk20a *g); | ||
171 | 250 | ||
172 | /* | 251 | /* |
173 | * Some useful routines that are shared across chips. | 252 | * Some useful routines that are shared across chips. |
@@ -181,7 +260,7 @@ static inline u32 pd_offset_from_index(const struct gk20a_mmu_level *l, | |||
181 | static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd, | 260 | static inline void pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd, |
182 | size_t w, size_t data) | 261 | size_t w, size_t data) |
183 | { | 262 | { |
184 | nvgpu_mem_wr32(g, &pd->mem, w, data); | 263 | nvgpu_mem_wr32(g, pd->mem, (pd->mem_offs / sizeof(u32)) + w, data); |
185 | } | 264 | } |
186 | 265 | ||
187 | 266 | ||
diff --git a/drivers/gpu/nvgpu/include/nvgpu/log.h b/drivers/gpu/nvgpu/include/nvgpu/log.h index 3b8e6b19..a1110a59 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/log.h +++ b/drivers/gpu/nvgpu/include/nvgpu/log.h | |||
@@ -68,6 +68,7 @@ enum nvgpu_log_categories { | |||
68 | gpu_dbg_xv = BIT(17), /* XVE debugging. */ | 68 | gpu_dbg_xv = BIT(17), /* XVE debugging. */ |
69 | gpu_dbg_shutdown = BIT(18), /* GPU shutdown tracing. */ | 69 | gpu_dbg_shutdown = BIT(18), /* GPU shutdown tracing. */ |
70 | gpu_dbg_kmem = BIT(19), /* Kmem tracking debugging. */ | 70 | gpu_dbg_kmem = BIT(19), /* Kmem tracking debugging. */ |
71 | gpu_dbg_pd_cache = BIT(20), /* PD cache traces. */ | ||
71 | gpu_dbg_mem = BIT(31), /* memory accesses; very verbose. */ | 72 | gpu_dbg_mem = BIT(31), /* memory accesses; very verbose. */ |
72 | }; | 73 | }; |
73 | 74 | ||