| field | value | date |
|---|---|---|
| author | Konsta Holtta <kholtta@nvidia.com> | 2016-09-19 02:24:13 -0400 |
| committer | Deepak Nibade <dnibade@nvidia.com> | 2016-12-27 04:56:49 -0500 |
| commit | 4afc6a1659ec058fd44953ccff7a1030275bcc92 (patch) | |
| tree | a6953c8895f1ab917f3a5eca29f4568a6702749c /drivers | |
| parent | 49c3fb25822565a9078961cdef1222aaa8c7e89a (diff) | |
gpu: nvgpu: compact pte buffers
The lowest page table level may hold very few entries for mappings of
large pages, but a new page is allocated for each list of entries at the
lowest level, wasting memory and hurting performance. Compact these
lists so that the new "allocation" of PTEs is appended at the end of the
previous allocation, if there is space.
Bug 1736604
Change-Id: I4c7c4cad9019de202325750aee6034076e7e61c2
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: http://git-master/r/1222810
(cherry picked from commit 97303ecc946c17150496486a2f52bd481311dbf7)
Reviewed-on: http://git-master/r/1234995
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers')

| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | drivers/gpu/nvgpu/gp10b/mm_gp10b.c | 39 |

1 file changed, 14 insertions(+), 25 deletions(-)
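The gp10b diff below only switches the PDE/PTE accessors over to offset-aware helpers; the compaction itself happens where the page-table backing pages are handed out, in the common gk20a code that is not part of this diff. A rough sketch of the idea follows, with hypothetical names (`struct pte_page`, `alloc_pte_list`) since the real allocator differs in detail: if the previous allocation still has room, the new PTE list is carved out of its tail and remembers its word offset; only otherwise does a fresh page get allocated.

```c
/*
 * Illustrative sketch only: the actual allocator lives in the common
 * gk20a code (outside this diff) and differs in detail. All names
 * here are hypothetical.
 */
#include <linux/types.h>
#include <linux/errno.h>

struct pte_page {
	void *cpu_va;    /* kernel mapping of the backing page */
	u32 used_words;  /* 32-bit words handed out so far */
	u32 size_words;  /* capacity of the page in 32-bit words */
};

/*
 * Hand out 'words' worth of PTE storage. If the previously allocated
 * page still has room, append to it -- this is the compaction -- and
 * record the word offset at which the new list starts. Otherwise tell
 * the caller to allocate a fresh page (which starts at offset 0).
 */
static int alloc_pte_list(struct pte_page *prev, u32 words,
			  u32 **va, u32 *woffset)
{
	if (prev && prev->used_words + words <= prev->size_words) {
		*va = (u32 *)prev->cpu_va + prev->used_words;
		*woffset = prev->used_words;
		prev->used_words += words;
		return 0;
	}
	return -ENOSPC; /* caller falls back to a new page, woffset = 0 */
}
```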
```diff
diff --git a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
index 03bab121..1e073ab2 100644
--- a/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/mm_gp10b.c
@@ -151,18 +151,6 @@ static u32 pte3_from_index(u32 i)
 	return i * gmmu_new_pte__size_v() / sizeof(u32);
 }
 
-static u64 entry_addr(struct gk20a *g, struct gk20a_mm_entry *entry)
-{
-	u64 addr;
-
-	if (g->mm.has_physical_mode)
-		addr = sg_phys(entry->mem.sgt->sgl);
-	else
-		addr = gk20a_mem_get_base_addr(g, &entry->mem, 0);
-
-	return addr;
-}
-
 static int update_gmmu_pde3_locked(struct vm_gk20a *vm,
 				   struct gk20a_mm_entry *parent,
 				   u32 i, u32 gmmu_pgsz_idx,
@@ -176,15 +164,13 @@ static int update_gmmu_pde3_locked(struct vm_gk20a *vm,
 {
 	struct gk20a *g = gk20a_from_vm(vm);
 	u64 pte_addr = 0;
-	u64 pde_addr = 0;
 	struct gk20a_mm_entry *pte = parent->entries + i;
 	u32 pde_v[2] = {0, 0};
 	u32 pde;
 
 	gk20a_dbg_fn("");
 
-	pte_addr = entry_addr(g, pte) >> gmmu_new_pde_address_shift_v();
-	pde_addr = entry_addr(g, parent);
+	pte_addr = gk20a_pde_addr(g, pte) >> gmmu_new_pde_address_shift_v();
 
 	pde_v[0] |= gk20a_aperture_mask(g, &pte->mem,
 			gmmu_new_pde_aperture_sys_mem_ncoh_f(),
@@ -194,8 +180,8 @@ static int update_gmmu_pde3_locked(struct vm_gk20a *vm,
 	pde_v[1] |= pte_addr >> 24;
 	pde = pde3_from_index(i);
 
-	gk20a_mem_wr32(g, &parent->mem, pde + 0, pde_v[0]);
-	gk20a_mem_wr32(g, &parent->mem, pde + 1, pde_v[1]);
+	gk20a_pde_wr32(g, parent, pde + 0, pde_v[0]);
+	gk20a_pde_wr32(g, parent, pde + 1, pde_v[1]);
 
 	gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x",
 		  i, gmmu_pgsz_idx, pde_v[1], pde_v[0]);
@@ -232,12 +218,12 @@ static int update_gmmu_pde0_locked(struct vm_gk20a *vm,
 	big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big;
 
 	if (small_valid) {
-		pte_addr_small = entry_addr(g, entry)
+		pte_addr_small = gk20a_pde_addr(g, entry)
 			>> gmmu_new_dual_pde_address_shift_v();
 	}
 
 	if (big_valid)
-		pte_addr_big = entry_addr(g, entry)
+		pte_addr_big = gk20a_pde_addr(g, entry)
 			>> gmmu_new_dual_pde_address_big_shift_v();
 
 	if (small_valid) {
@@ -260,10 +246,10 @@ static int update_gmmu_pde0_locked(struct vm_gk20a *vm,
 
 	pde = pde0_from_index(i);
 
-	gk20a_mem_wr32(g, &pte->mem, pde + 0, pde_v[0]);
-	gk20a_mem_wr32(g, &pte->mem, pde + 1, pde_v[1]);
-	gk20a_mem_wr32(g, &pte->mem, pde + 2, pde_v[2]);
-	gk20a_mem_wr32(g, &pte->mem, pde + 3, pde_v[3]);
+	gk20a_pde_wr32(g, pte, pde + 0, pde_v[0]);
+	gk20a_pde_wr32(g, pte, pde + 1, pde_v[1]);
+	gk20a_pde_wr32(g, pte, pde + 2, pde_v[2]);
+	gk20a_pde_wr32(g, pte, pde + 3, pde_v[3]);
 
 	gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d [0x%08x, 0x%08x, 0x%x, 0x%08x]",
 		  i, gmmu_pgsz_idx, pde_v[3], pde_v[2], pde_v[1], pde_v[0]);
@@ -286,6 +272,7 @@ static int update_gmmu_pte_locked(struct vm_gk20a *vm,
 	u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
 	u64 ctag_granularity = g->ops.fb.compression_page_size(g);
 	u32 pte_w[2] = {0, 0}; /* invalid pte */
+	u32 pte_i;
 
 	if (*iova) {
 		u32 pte_valid = unmapped_pte ?
@@ -331,8 +318,10 @@ static int update_gmmu_pte_locked(struct vm_gk20a *vm,
 		gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
 	}
 
-	gk20a_mem_wr32(g, &pte->mem, pte3_from_index(i) + 0, pte_w[0]);
-	gk20a_mem_wr32(g, &pte->mem, pte3_from_index(i) + 1, pte_w[1]);
+	pte_i = pte3_from_index(i);
+
+	gk20a_pde_wr32(g, pte, pte_i + 0, pte_w[0]);
+	gk20a_pde_wr32(g, pte, pte_i + 1, pte_w[1]);
 
 	if (*iova) {
 		*iova += page_size;
```
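All page-directory reads and writes above now go through gk20a_pde_addr() and gk20a_pde_wr32() instead of computing addresses from entry->mem directly. Those helpers come from the common-code half of this change (not shown in this gp10b diff); below is a plausible sketch, assuming struct gk20a_mm_entry gains a woffset field recording the word offset at which a compacted entry starts inside a shared allocation. The base-address logic mirrors the removed entry_addr().

```c
/*
 * Sketch of the common helpers this patch switches to; the real
 * implementations live in gk20a/mm_gk20a.c. The 'woffset' field is an
 * assumption here: the word offset of this entry inside a (possibly
 * shared) page-table allocation, 0 for entries that own their page.
 */
static u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry)
{
	u64 base;

	/* same base-address computation as the removed entry_addr() */
	if (g->mm.has_physical_mode)
		base = sg_phys(entry->mem.sgt->sgl);
	else
		base = gk20a_mem_get_base_addr(g, &entry->mem, 0);

	/* step past the entries that precede this one in the shared page */
	return base + entry->woffset * sizeof(u32);
}

static void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry,
			   u32 w, u32 data)
{
	/* writes are relative to this entry's slice of the buffer */
	gk20a_mem_wr32(g, &entry->mem, entry->woffset + w, data);
}
```

With the offset folded into the helpers, the gp10b callers stay oblivious to whether an entry owns its backing page or shares one with its neighbors.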