author     Terje Bergstrom <tbergstrom@nvidia.com>   2015-01-14 11:54:26 -0500
committer  Dan Willemsen <dwillemsen@nvidia.com>     2015-04-04 21:08:16 -0400
commit     f3a920cb01d1517db5432c8062b660d6b60eb4de (patch)
tree       bc366a7df51745ea2d6b740395403cf2add2ebef /drivers/gpu/nvgpu
parent     8d1ab756ed8a7f4d3138dc5da9d2de9f52915261 (diff)
gpu: nvgpu: Refactor page mapping code
Always pass the directory structure to mm functions instead of pointers to its members. Also split update_gmmu_ptes_locked() into smaller functions, and turn the hard-coded MMU levels (PDE, PTE) into run-time parameters.

Change-Id: I315ef7aebbea1e61156705361f2e2a63b5fb7bf1
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: http://git-master/r/672485
Reviewed-by: Automatic_Commit_Validation_User
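For reference, the run-time level description this change introduces is the structure below (quoted from the mm_gk20a.h hunk further down, with explanatory comments added here). Each chip supplies an array of these, one element per page-table level, terminated by an entry whose update_entry is NULL, in place of the previously hard-coded PDE/PTE split:

    struct gk20a_mmu_level {
            int hi_bit[2];          /* highest VA bit this level translates, per page-size index */
            int lo_bit[2];          /* lowest VA bit this level translates, per page-size index */
            int (*update_entry)(struct vm_gk20a *vm,
                            struct gk20a_mm_entry *pte,
                            u32 i, u32 gmmu_pgsz_idx,
                            u64 iova,
                            u32 kind_v, u32 *ctag,
                            bool cacheable, int rw_flag, bool sparse);
            size_t entry_size;      /* bytes per directory/table entry (8 on gk20a) */
    };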
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.c    |   7
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h    |  12
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 658
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h |  37
-rw-r--r--  drivers/gpu/nvgpu/gm20b/mm_gm20b.c | 155
-rw-r--r--  drivers/gpu/nvgpu/vgpu/mm_vgpu.c   |  10
6 files changed, 382 insertions, 497 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 57d5f09a..76237fae 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * GK20A Graphics 2 * GK20A Graphics
3 * 3 *
4 * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -1789,7 +1789,8 @@ int gk20a_init_gpu_characteristics(struct gk20a *g)
1789 1789
1790 gpu->big_page_size = g->mm.pmu.vm.big_page_size; 1790 gpu->big_page_size = g->mm.pmu.vm.big_page_size;
1791 gpu->compression_page_size = g->ops.fb.compression_page_size(g); 1791 gpu->compression_page_size = g->ops.fb.compression_page_size(g);
1792 gpu->pde_coverage_bit_count = g->mm.pmu.vm.pde_stride_shift; 1792 gpu->pde_coverage_bit_count =
1793 gk20a_mm_pde_coverage_bit_count(&g->mm.pmu.vm);
1793 1794
1794 gpu->available_big_page_sizes = gpu->big_page_size; 1795 gpu->available_big_page_sizes = gpu->big_page_size;
1795 if (g->ops.mm.get_big_page_sizes) 1796 if (g->ops.mm.get_big_page_sizes)
@@ -1798,7 +1799,7 @@ int gk20a_init_gpu_characteristics(struct gk20a *g)
1798 gpu->flags = NVGPU_GPU_FLAGS_SUPPORT_PARTIAL_MAPPINGS 1799 gpu->flags = NVGPU_GPU_FLAGS_SUPPORT_PARTIAL_MAPPINGS
1799 | NVGPU_GPU_FLAGS_SUPPORT_SYNC_FENCE_FDS; 1800 | NVGPU_GPU_FLAGS_SUPPORT_SYNC_FENCE_FDS;
1800 1801
1801 if (g->ops.mm.set_sparse) 1802 if (g->ops.mm.support_sparse && g->ops.mm.support_sparse(g))
1802 gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_SPARSE_ALLOCS; 1803 gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_SPARSE_ALLOCS;
1803 1804
1804 if (IS_ENABLED(CONFIG_TEGRA_GK20A) && 1805 if (IS_ENABLED(CONFIG_TEGRA_GK20A) &&
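Two behavioural notes on the gk20a.c hunk above. First, pde_coverage_bit_count is now derived from the level table through a helper added later in mm_gk20a.c:

    int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm)
    {
            return vm->mmu_levels[0].lo_bit[0];
    }

For the gk20a tables this returns 26 with 64 KB big pages and 27 with 128 KB big pages, the same values the removed pde_stride_shift (ilog2(big_page_size << 10)) yielded, so the value reported to userspace should be unchanged. Second, sparse support is now advertised through the support_sparse() predicate rather than inferred from the presence of the set_sparse op, which this patch removes.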
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
index fa80f010..ef8068e5 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -296,8 +296,7 @@ struct gpu_ops {
296 bool (*is_fw_defined)(void); 296 bool (*is_fw_defined)(void);
297 } gr_ctx; 297 } gr_ctx;
298 struct { 298 struct {
299 int (*set_sparse)(struct vm_gk20a *vm, u64 vaddr, 299 bool (*support_sparse)(struct gk20a *g);
300 u32 num_pages, u32 pgsz_idx, bool refplus);
301 bool (*is_debug_mode_enabled)(struct gk20a *g); 300 bool (*is_debug_mode_enabled)(struct gk20a *g);
302 u64 (*gmmu_map)(struct vm_gk20a *vm, 301 u64 (*gmmu_map)(struct vm_gk20a *vm,
303 u64 map_offset, 302 u64 map_offset,
@@ -309,13 +308,15 @@ struct gpu_ops {
309 u32 ctag_offset, 308 u32 ctag_offset,
310 u32 flags, 309 u32 flags,
311 int rw_flag, 310 int rw_flag,
312 bool clear_ctags); 311 bool clear_ctags,
312 bool sparse);
313 void (*gmmu_unmap)(struct vm_gk20a *vm, 313 void (*gmmu_unmap)(struct vm_gk20a *vm,
314 u64 vaddr, 314 u64 vaddr,
315 u64 size, 315 u64 size,
316 int pgsz_idx, 316 int pgsz_idx,
317 bool va_allocated, 317 bool va_allocated,
318 int rw_flag); 318 int rw_flag,
319 bool sparse);
319 void (*vm_remove)(struct vm_gk20a *vm); 320 void (*vm_remove)(struct vm_gk20a *vm);
320 int (*vm_alloc_share)(struct gk20a_as_share *as_share, 321 int (*vm_alloc_share)(struct gk20a_as_share *as_share,
321 u32 flags); 322 u32 flags);
@@ -331,6 +332,9 @@ struct gpu_ops {
331 u32 (*get_physical_addr_bits)(struct gk20a *g); 332 u32 (*get_physical_addr_bits)(struct gk20a *g);
332 int (*init_bar2_vm)(struct gk20a *g); 333 int (*init_bar2_vm)(struct gk20a *g);
333 int (*init_bar2_mm_hw_setup)(struct gk20a *g); 334 int (*init_bar2_mm_hw_setup)(struct gk20a *g);
335 const struct gk20a_mmu_level *
336 (*get_mmu_levels)(struct gk20a *g, u32 big_page_size);
337 void (*init_pdb)(struct gk20a *g, void *inst_ptr, u64 pdb_addr);
334 } mm; 338 } mm;
335 struct { 339 struct {
336 int (*prepare_ucode)(struct gk20a *g); 340 int (*prepare_ucode)(struct gk20a *g);
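The new mm ops declared above are consumed in the common code as sketched below (condensed from the mm_gk20a.c hunks that follow; not a complete listing):

    /* gk20a_init_vm(): pick the level description matching this VM's big page size */
    vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g, vm->big_page_size);

    /* gk20a_init_inst_block(): program the page directory base through the new hook */
    g->ops.mm.init_pdb(g, inst_ptr, pde_addr);

    /* gk20a_init_gpu_characteristics(): sparse support is a query, not a mapping op */
    if (g->ops.mm.support_sparse && g->ops.mm.support_sparse(g))
            gpu->flags |= NVGPU_GPU_FLAGS_SUPPORT_SPARSE_ALLOCS;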
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 80c766b6..d8bd3e70 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -98,7 +98,8 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
98 struct sg_table *sgt, u64 buffer_offset, 98 struct sg_table *sgt, u64 buffer_offset,
99 u64 first_vaddr, u64 last_vaddr, 99 u64 first_vaddr, u64 last_vaddr,
100 u8 kind_v, u32 ctag_offset, bool cacheable, 100 u8 kind_v, u32 ctag_offset, bool cacheable,
101 int rw_flag); 101 int rw_flag,
102 bool sparse);
102static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm); 103static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm);
103static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm); 104static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm);
104static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm); 105static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm);
@@ -605,34 +606,46 @@ void unmap_gmmu_pages(struct gk20a_mm_entry *entry)
605 606
606static int gk20a_zalloc_gmmu_page_table(struct vm_gk20a *vm, 607static int gk20a_zalloc_gmmu_page_table(struct vm_gk20a *vm,
607 enum gmmu_pgsz_gk20a pgsz_idx, 608 enum gmmu_pgsz_gk20a pgsz_idx,
609 const struct gk20a_mmu_level *l,
608 struct gk20a_mm_entry *entry) 610 struct gk20a_mm_entry *entry)
609{ 611{
610 int err; 612 int err;
611 u32 pte_order; 613 int order;
612 614
613 gk20a_dbg_fn(""); 615 gk20a_dbg_fn("");
614 616
615 /* allocate enough pages for the table */ 617 /* allocate enough pages for the table */
616 pte_order = vm->page_table_sizing[pgsz_idx].order; 618 order = l->hi_bit[pgsz_idx] - l->lo_bit[pgsz_idx] + 1;
619 order += ilog2(l->entry_size);
620 order -= PAGE_SHIFT;
621 order = max(0, order);
617 622
618 err = alloc_gmmu_pages(vm, pte_order, entry); 623 err = alloc_gmmu_pages(vm, order, entry);
619 gk20a_dbg(gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d", 624 gk20a_dbg(gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d",
620 entry, gk20a_mm_iova_addr(vm->mm->g, entry->sgt->sgl), 625 entry, gk20a_mm_iova_addr(vm->mm->g, entry->sgt->sgl), order);
621 pte_order); 626 if (err)
627 return err;
622 entry->pgsz = pgsz_idx; 628 entry->pgsz = pgsz_idx;
623 629
624 return err; 630 return err;
625} 631}
626 632
633int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm)
634{
635 return vm->mmu_levels[0].lo_bit[0];
636}
637
627/* given address range (inclusive) determine the pdes crossed */ 638/* given address range (inclusive) determine the pdes crossed */
628void pde_range_from_vaddr_range(struct vm_gk20a *vm, 639void pde_range_from_vaddr_range(struct vm_gk20a *vm,
629 u64 addr_lo, u64 addr_hi, 640 u64 addr_lo, u64 addr_hi,
630 u32 *pde_lo, u32 *pde_hi) 641 u32 *pde_lo, u32 *pde_hi)
631{ 642{
632 *pde_lo = (u32)(addr_lo >> vm->pde_stride_shift); 643 int pde_shift = gk20a_mm_pde_coverage_bit_count(vm);
633 *pde_hi = (u32)(addr_hi >> vm->pde_stride_shift); 644
645 *pde_lo = (u32)(addr_lo >> pde_shift);
646 *pde_hi = (u32)(addr_hi >> pde_shift);
634 gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d", 647 gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d",
635 addr_lo, addr_hi, vm->pde_stride_shift); 648 addr_lo, addr_hi, pde_shift);
636 gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d", 649 gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d",
637 *pde_lo, *pde_hi); 650 *pde_lo, *pde_hi);
638} 651}
@@ -647,7 +660,7 @@ u32 pte_index_from_vaddr(struct vm_gk20a *vm,
647{ 660{
648 u32 ret; 661 u32 ret;
649 /* mask off pde part */ 662 /* mask off pde part */
650 addr = addr & ((((u64)1) << vm->pde_stride_shift) - ((u64)1)); 663 addr = addr & ((1ULL << gk20a_mm_pde_coverage_bit_count(vm)) - 1ULL);
651 664
652 /* shift over to get pte index. note assumption that pte index 665 /* shift over to get pte index. note assumption that pte index
653 * doesn't leak over into the high 32b */ 666 * doesn't leak over into the high 32b */
@@ -657,57 +670,6 @@ u32 pte_index_from_vaddr(struct vm_gk20a *vm,
657 return ret; 670 return ret;
658} 671}
659 672
660static inline void pte_space_page_offset_from_index(u32 i, u32 *pte_page,
661 u32 *pte_offset)
662{
663 /* ptes are 8B regardless of pagesize */
664 /* pte space pages are 4KB. so 512 ptes per 4KB page*/
665 *pte_page = i >> 9;
666
667 /* this offset is a pte offset, not a byte offset */
668 *pte_offset = i & ((1<<9)-1);
669
670 gk20a_dbg(gpu_dbg_pte, "i=0x%x pte_page=0x%x pte_offset=0x%x",
671 i, *pte_page, *pte_offset);
672}
673
674
675/*
676 * given a pde index/page table number make sure it has
677 * backing store and if not go ahead allocate it and
678 * record it in the appropriate pde
679 */
680int validate_gmmu_page_table_gk20a_locked(struct vm_gk20a *vm,
681 u32 i, enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
682{
683 int err;
684 struct gk20a_mm_entry *entry = vm->pdb.entries + i;
685
686 gk20a_dbg_fn("");
687
688 /* if it's already in place it's valid */
689 if (entry->size)
690 return 0;
691
692 gk20a_dbg(gpu_dbg_pte, "alloc %dKB ptes for pde %d",
693 vm->gmmu_page_sizes[gmmu_pgsz_idx]/1024, i);
694
695 err = gk20a_zalloc_gmmu_page_table(vm, gmmu_pgsz_idx, entry);
696 if (err)
697 return err;
698
699 /* rewrite pde */
700 err = map_gmmu_pages(&vm->pdb);
701 if (err)
702 return err;
703
704 update_gmmu_pde_locked(vm, i);
705
706 unmap_gmmu_pages(&vm->pdb);
707
708 return 0;
709}
710
711static struct vm_reserved_va_node *addr_to_reservation(struct vm_gk20a *vm, 673static struct vm_reserved_va_node *addr_to_reservation(struct vm_gk20a *vm,
712 u64 addr) 674 u64 addr)
713{ 675{
@@ -1117,11 +1079,11 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
1117 u32 ctag_offset, 1079 u32 ctag_offset,
1118 u32 flags, 1080 u32 flags,
1119 int rw_flag, 1081 int rw_flag,
1120 bool clear_ctags) 1082 bool clear_ctags,
1083 bool sparse)
1121{ 1084{
1122 int err = 0, i = 0; 1085 int err = 0;
1123 bool allocated = false; 1086 bool allocated = false;
1124 u32 pde_lo, pde_hi;
1125 struct device *d = dev_from_vm(vm); 1087 struct device *d = dev_from_vm(vm);
1126 struct gk20a *g = gk20a_from_vm(vm); 1088 struct gk20a *g = gk20a_from_vm(vm);
1127 int ctag_granularity = g->ops.fb.compression_page_size(g); 1089 int ctag_granularity = g->ops.fb.compression_page_size(g);
@@ -1146,31 +1108,16 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
1146 allocated = true; 1108 allocated = true;
1147 } 1109 }
1148 1110
1149 pde_range_from_vaddr_range(vm,
1150 map_offset,
1151 map_offset + size - 1,
1152 &pde_lo, &pde_hi);
1153
1154 /* mark the addr range valid (but with 0 phys addr, which will fault) */
1155 for (i = pde_lo; i <= pde_hi; i++) {
1156 err = validate_gmmu_page_table_gk20a_locked(vm, i,
1157 pgsz_idx);
1158 if (err) {
1159 gk20a_err(d, "failed to validate page table %d: %d",
1160 i, err);
1161 goto fail_validate;
1162 }
1163 }
1164
1165 err = update_gmmu_ptes_locked(vm, pgsz_idx, 1111 err = update_gmmu_ptes_locked(vm, pgsz_idx,
1166 sgt, 1112 sgt,
1167 buffer_offset, 1113 buffer_offset,
1168 map_offset, map_offset + size - 1, 1114 map_offset, map_offset + size,
1169 kind_v, 1115 kind_v,
1170 ctag_offset, 1116 ctag_offset,
1171 flags & 1117 flags &
1172 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, 1118 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
1173 rw_flag); 1119 rw_flag,
1120 sparse);
1174 if (err) { 1121 if (err) {
1175 gk20a_err(d, "failed to update ptes on map"); 1122 gk20a_err(d, "failed to update ptes on map");
1176 goto fail_validate; 1123 goto fail_validate;
@@ -1192,7 +1139,8 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
1192 u64 size, 1139 u64 size,
1193 int pgsz_idx, 1140 int pgsz_idx,
1194 bool va_allocated, 1141 bool va_allocated,
1195 int rw_flag) 1142 int rw_flag,
1143 bool sparse)
1196{ 1144{
1197 int err = 0; 1145 int err = 0;
1198 struct gk20a *g = gk20a_from_vm(vm); 1146 struct gk20a *g = gk20a_from_vm(vm);
@@ -1212,9 +1160,10 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
1212 NULL, /* n/a for unmap */ 1160 NULL, /* n/a for unmap */
1213 0, 1161 0,
1214 vaddr, 1162 vaddr,
1215 vaddr + size - 1, 1163 vaddr + size,
1216 0, 0, false /* n/a for unmap */, 1164 0, 0, false /* n/a for unmap */,
1217 rw_flag); 1165 rw_flag,
1166 sparse);
1218 if (err) 1167 if (err)
1219 dev_err(dev_from_vm(vm), 1168 dev_err(dev_from_vm(vm),
1220 "failed to update gmmu ptes on unmap"); 1169 "failed to update gmmu ptes on unmap");
@@ -1439,7 +1388,8 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
1439 bfr.kind_v, 1388 bfr.kind_v,
1440 bfr.ctag_offset, 1389 bfr.ctag_offset,
1441 flags, rw_flag, 1390 flags, rw_flag,
1442 clear_ctags); 1391 clear_ctags,
1392 false);
1443 if (!map_offset) 1393 if (!map_offset)
1444 goto clean_up; 1394 goto clean_up;
1445 1395
@@ -1555,7 +1505,7 @@ u64 gk20a_gmmu_map(struct vm_gk20a *vm,
1555 0, /* page size index = 0 i.e. SZ_4K */ 1505 0, /* page size index = 0 i.e. SZ_4K */
1556 0, /* kind */ 1506 0, /* kind */
1557 0, /* ctag_offset */ 1507 0, /* ctag_offset */
1558 flags, rw_flag, false); 1508 flags, rw_flag, false, false);
1559 mutex_unlock(&vm->update_gmmu_lock); 1509 mutex_unlock(&vm->update_gmmu_lock);
1560 if (!vaddr) { 1510 if (!vaddr) {
1561 gk20a_err(dev_from_vm(vm), "failed to allocate va space"); 1511 gk20a_err(dev_from_vm(vm), "failed to allocate va space");
@@ -1642,7 +1592,8 @@ void gk20a_gmmu_unmap(struct vm_gk20a *vm,
1642 size, 1592 size,
1643 0, /* page size 4K */ 1593 0, /* page size 4K */
1644 true, /*va_allocated */ 1594 true, /*va_allocated */
1645 rw_flag); 1595 rw_flag,
1596 false);
1646 mutex_unlock(&vm->update_gmmu_lock); 1597 mutex_unlock(&vm->update_gmmu_lock);
1647} 1598}
1648 1599
@@ -1748,157 +1699,6 @@ u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl)
1748 return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl)); 1699 return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl));
1749} 1700}
1750 1701
1751static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
1752 enum gmmu_pgsz_gk20a pgsz_idx,
1753 struct sg_table *sgt,
1754 u64 buffer_offset,
1755 u64 first_vaddr, u64 last_vaddr,
1756 u8 kind_v, u32 ctag_offset,
1757 bool cacheable,
1758 int rw_flag)
1759{
1760 int err;
1761 u32 pde_lo, pde_hi, pde_i;
1762 struct scatterlist *cur_chunk;
1763 unsigned int cur_offset;
1764 u32 pte_w[2] = {0, 0}; /* invalid pte */
1765 struct gk20a *g = gk20a_from_vm(vm);
1766 u32 ctag_granularity = g->ops.fb.compression_page_size(g);
1767 u32 ctag = ctag_offset * ctag_granularity;
1768 u32 ctag_incr;
1769 u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
1770 u64 addr = 0;
1771 u64 space_to_skip = buffer_offset;
1772
1773 pde_range_from_vaddr_range(vm, first_vaddr, last_vaddr,
1774 &pde_lo, &pde_hi);
1775
1776 gk20a_dbg(gpu_dbg_pte, "size_idx=%d, pde_lo=%d, pde_hi=%d",
1777 pgsz_idx, pde_lo, pde_hi);
1778
1779 ctag_incr = ctag_offset ? page_size : 0;
1780
1781 cur_offset = 0;
1782 if (sgt) {
1783 cur_chunk = sgt->sgl;
1784 /* space_to_skip must be page aligned */
1785 BUG_ON(space_to_skip & (page_size - 1));
1786
1787 while (space_to_skip > 0 && cur_chunk) {
1788 u64 new_addr = gk20a_mm_iova_addr(vm->mm->g, cur_chunk);
1789 if (new_addr) {
1790 addr = new_addr;
1791 addr += cur_offset;
1792 }
1793 cur_offset += page_size;
1794 addr += page_size;
1795 while (cur_chunk &&
1796 cur_offset >= cur_chunk->length) {
1797 cur_offset -= cur_chunk->length;
1798 cur_chunk = sg_next(cur_chunk);
1799 }
1800 space_to_skip -= page_size;
1801 }
1802 }
1803 else
1804 cur_chunk = NULL;
1805
1806 for (pde_i = pde_lo; pde_i <= pde_hi; pde_i++) {
1807 u32 pte_lo, pte_hi;
1808 u32 pte_cur;
1809
1810 struct gk20a_mm_entry *entry = vm->pdb.entries + pde_i;
1811
1812 if (pde_i == pde_lo)
1813 pte_lo = pte_index_from_vaddr(vm, first_vaddr,
1814 pgsz_idx);
1815 else
1816 pte_lo = 0;
1817
1818 if ((pde_i != pde_hi) && (pde_hi != pde_lo))
1819 pte_hi = vm->page_table_sizing[pgsz_idx].num_ptes-1;
1820 else
1821 pte_hi = pte_index_from_vaddr(vm, last_vaddr,
1822 pgsz_idx);
1823
1824 /* get cpu access to the ptes */
1825 err = map_gmmu_pages(entry);
1826 if (err) {
1827 gk20a_err(dev_from_vm(vm),
1828 "couldn't map ptes for update as=%d",
1829 vm_aspace_id(vm));
1830 goto clean_up;
1831 }
1832
1833 gk20a_dbg(gpu_dbg_pte, "pte_lo=%d, pte_hi=%d", pte_lo, pte_hi);
1834 for (pte_cur = pte_lo; pte_cur <= pte_hi; pte_cur++) {
1835 if (likely(sgt)) {
1836 u64 new_addr = gk20a_mm_iova_addr(vm->mm->g,
1837 cur_chunk);
1838 if (new_addr) {
1839 addr = new_addr;
1840 addr += cur_offset;
1841 }
1842 pte_w[0] = gmmu_pte_valid_true_f() |
1843 gmmu_pte_address_sys_f(addr
1844 >> gmmu_pte_address_shift_v());
1845 pte_w[1] = gmmu_pte_aperture_video_memory_f() |
1846 gmmu_pte_kind_f(kind_v) |
1847 gmmu_pte_comptagline_f(ctag
1848 / ctag_granularity);
1849
1850 if (rw_flag == gk20a_mem_flag_read_only) {
1851 pte_w[0] |= gmmu_pte_read_only_true_f();
1852 pte_w[1] |=
1853 gmmu_pte_write_disable_true_f();
1854 } else if (rw_flag ==
1855 gk20a_mem_flag_write_only) {
1856 pte_w[1] |=
1857 gmmu_pte_read_disable_true_f();
1858 }
1859 if (!cacheable)
1860 pte_w[1] |= gmmu_pte_vol_true_f();
1861
1862 gk20a_dbg(gpu_dbg_pte, "pte_cur=%d addr=0x%x,%08x kind=%d"
1863 " ctag=%d vol=%d"
1864 " [0x%08x,0x%08x]",
1865 pte_cur, hi32(addr), lo32(addr),
1866 kind_v, ctag, !cacheable,
1867 pte_w[1], pte_w[0]);
1868 ctag += ctag_incr;
1869 cur_offset += page_size;
1870 addr += page_size;
1871 while (cur_chunk &&
1872 cur_offset >= cur_chunk->length) {
1873 cur_offset -= cur_chunk->length;
1874 cur_chunk = sg_next(cur_chunk);
1875 }
1876
1877 } else {
1878 gk20a_dbg(gpu_dbg_pte,
1879 "pte_cur=%d [0x0,0x0]",
1880 pte_cur);
1881 }
1882
1883 gk20a_mem_wr32(entry->cpu_va + pte_cur*8, 0, pte_w[0]);
1884 gk20a_mem_wr32(entry->cpu_va + pte_cur*8, 1, pte_w[1]);
1885 }
1886
1887 unmap_gmmu_pages(entry);
1888 }
1889
1890 smp_mb();
1891
1892 return 0;
1893
1894clean_up:
1895 /*TBD: potentially rewrite above to pre-map everything it needs to
1896 * as that's the only way it can fail */
1897 return err;
1898
1899}
1900
1901
1902/* for gk20a the "video memory" apertures here are misnomers. */ 1702/* for gk20a the "video memory" apertures here are misnomers. */
1903static inline u32 big_valid_pde0_bits(u64 pte_addr) 1703static inline u32 big_valid_pde0_bits(u64 pte_addr)
1904{ 1704{
@@ -1908,6 +1708,7 @@ static inline u32 big_valid_pde0_bits(u64 pte_addr)
1908 (u32)(pte_addr >> gmmu_pde_address_shift_v())); 1708 (u32)(pte_addr >> gmmu_pde_address_shift_v()));
1909 return pde0_bits; 1709 return pde0_bits;
1910} 1710}
1711
1911static inline u32 small_valid_pde1_bits(u64 pte_addr) 1712static inline u32 small_valid_pde1_bits(u64 pte_addr)
1912{ 1713{
1913 u32 pde1_bits = 1714 u32 pde1_bits =
@@ -1924,10 +1725,15 @@ static inline u32 small_valid_pde1_bits(u64 pte_addr)
1924 made. So, superfluous updates will cause unnecessary 1725 made. So, superfluous updates will cause unnecessary
1925 pde invalidations. 1726 pde invalidations.
1926*/ 1727*/
1927void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i) 1728int update_gmmu_pde_locked(struct vm_gk20a *vm,
1729 struct gk20a_mm_entry *pte,
1730 u32 i, u32 gmmu_pgsz_idx,
1731 u64 iova,
1732 u32 kind_v, u32 *ctag,
1733 bool cacheable, int rw_flag, bool sparse)
1928{ 1734{
1929 bool small_valid, big_valid; 1735 bool small_valid, big_valid;
1930 u64 pte_addr[2] = {0, 0}; 1736 u64 pte_addr_small = 0, pte_addr_big = 0;
1931 struct gk20a_mm_entry *entry = vm->pdb.entries + i; 1737 struct gk20a_mm_entry *entry = vm->pdb.entries + i;
1932 u32 pde_v[2] = {0, 0}; 1738 u32 pde_v[2] = {0, 0};
1933 u32 *pde; 1739 u32 *pde;
@@ -1938,44 +1744,227 @@ void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i)
1938 big_valid = entry->size && entry->pgsz == gmmu_page_size_big; 1744 big_valid = entry->size && entry->pgsz == gmmu_page_size_big;
1939 1745
1940 if (small_valid) 1746 if (small_valid)
1941 pte_addr[gmmu_page_size_small] = 1747 pte_addr_small = gk20a_mm_iova_addr(vm->mm->g, entry->sgt->sgl);
1942 gk20a_mm_iova_addr(vm->mm->g, entry->sgt->sgl);
1943 1748
1944 if (big_valid) 1749 if (big_valid)
1945 pte_addr[gmmu_page_size_big] = 1750 pte_addr_big = gk20a_mm_iova_addr(vm->mm->g, entry->sgt->sgl);
1946 gk20a_mm_iova_addr(vm->mm->g, entry->sgt->sgl);
1947 1751
1948 pde_v[0] = gmmu_pde_size_full_f(); 1752 pde_v[0] = gmmu_pde_size_full_f();
1949 pde_v[0] |= big_valid ? 1753 pde_v[0] |= big_valid ? big_valid_pde0_bits(pte_addr_big) :
1950 big_valid_pde0_bits(pte_addr[gmmu_page_size_big])
1951 :
1952 (gmmu_pde_aperture_big_invalid_f()); 1754 (gmmu_pde_aperture_big_invalid_f());
1953 1755
1954 pde_v[1] |= (small_valid ? 1756 pde_v[1] |= (small_valid ?
1955 small_valid_pde1_bits(pte_addr[gmmu_page_size_small]) 1757 small_valid_pde1_bits(pte_addr_small) :
1956 :
1957 (gmmu_pde_aperture_small_invalid_f() | 1758 (gmmu_pde_aperture_small_invalid_f() |
1958 gmmu_pde_vol_small_false_f()) 1759 gmmu_pde_vol_small_false_f()))
1959 ) 1760 |
1960 | 1761 (big_valid ? (gmmu_pde_vol_big_true_f()) :
1961 (big_valid ? (gmmu_pde_vol_big_true_f()) : 1762 gmmu_pde_vol_big_false_f());
1962 gmmu_pde_vol_big_false_f());
1963 1763
1964 pde = pde_from_index(vm, i); 1764 pde = pde_from_index(vm, i);
1965 1765
1966 gk20a_mem_wr32(pde, 0, pde_v[0]); 1766 gk20a_mem_wr32(pde, 0, pde_v[0]);
1967 gk20a_mem_wr32(pde, 1, pde_v[1]); 1767 gk20a_mem_wr32(pde, 1, pde_v[1]);
1968 1768
1969 smp_mb(); 1769 gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x",
1770 i, gmmu_pgsz_idx, pde_v[1], pde_v[0]);
1771 return 0;
1772}
1773
1774int update_gmmu_pte_locked(struct vm_gk20a *vm,
1775 struct gk20a_mm_entry *pte,
1776 u32 i, u32 gmmu_pgsz_idx,
1777 u64 iova,
1778 u32 kind_v, u32 *ctag,
1779 bool cacheable, int rw_flag, bool sparse)
1780{
1781 struct gk20a *g = gk20a_from_vm(vm);
1782 u32 ctag_granularity = g->ops.fb.compression_page_size(g);
1783 u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
1784 u32 pte_w[2] = {0, 0}; /* invalid pte */
1785
1786 if (iova) {
1787 pte_w[0] = gmmu_pte_valid_true_f() |
1788 gmmu_pte_address_sys_f(iova
1789 >> gmmu_pte_address_shift_v());
1790 pte_w[1] = gmmu_pte_aperture_video_memory_f() |
1791 gmmu_pte_kind_f(kind_v) |
1792 gmmu_pte_comptagline_f(*ctag / ctag_granularity);
1793
1794 if (rw_flag == gk20a_mem_flag_read_only) {
1795 pte_w[0] |= gmmu_pte_read_only_true_f();
1796 pte_w[1] |=
1797 gmmu_pte_write_disable_true_f();
1798 } else if (rw_flag ==
1799 gk20a_mem_flag_write_only) {
1800 pte_w[1] |=
1801 gmmu_pte_read_disable_true_f();
1802 }
1803 if (!cacheable)
1804 pte_w[1] |= gmmu_pte_vol_true_f();
1805
1806 gk20a_dbg(gpu_dbg_pte,
1807 "pte=%d iova=0x%llx kind=%d ctag=%d vol=%d [0x%08x, 0x%08x]",
1808 i, iova,
1809 kind_v, *ctag, !cacheable,
1810 pte_w[1], pte_w[0]);
1811
1812 if (*ctag)
1813 *ctag += page_size;
1814 } else if (sparse) {
1815 pte_w[0] = gmmu_pte_valid_false_f();
1816 pte_w[1] |= gmmu_pte_vol_true_f();
1817 } else {
1818 gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
1819 }
1820
1821 gk20a_mem_wr32(pte->cpu_va + i*8, 0, pte_w[0]);
1822 gk20a_mem_wr32(pte->cpu_va + i*8, 1, pte_w[1]);
1823
1824 return 0;
1825}
1826
1827static int update_gmmu_level_locked(struct vm_gk20a *vm,
1828 struct gk20a_mm_entry *pte,
1829 enum gmmu_pgsz_gk20a pgsz_idx,
1830 u64 iova,
1831 u64 gpu_va, u64 gpu_end,
1832 u8 kind_v, u32 *ctag,
1833 bool cacheable,
1834 int rw_flag,
1835 bool sparse,
1836 int lvl)
1837{
1838 const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl];
1839 const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1];
1840 int err = 0;
1841 u32 pde_i;
1842 u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx];
1843
1844 gk20a_dbg_fn("");
1845
1846 pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL))
1847 >> (u64)l->lo_bit[pgsz_idx];
1848
1849 gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx",
1850 pgsz_idx, lvl, gpu_va, gpu_end-1, iova);
1851
1852 while (gpu_va < gpu_end) {
1853 struct gk20a_mm_entry *next_pte = NULL;
1854 u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end);
1855
1856 /* Allocate next level */
1857 if (next_l->update_entry) {
1858 if (!pte->entries) {
1859 int num_entries =
1860 1 <<
1861 (l->hi_bit[pgsz_idx]
1862 - l->lo_bit[pgsz_idx]);
1863 pte->entries =
1864 kzalloc(sizeof(struct gk20a_mm_entry) *
1865 num_entries, GFP_KERNEL);
1866 pte->pgsz = pgsz_idx;
1867 if (!pte->entries)
1868 return -ENOMEM;
1869 }
1870 next_pte = pte->entries + pde_i;
1871
1872 if (!next_pte->size) {
1873 err = gk20a_zalloc_gmmu_page_table(vm,
1874 pgsz_idx, next_l, next_pte);
1875 if (err)
1876 return err;
1877 }
1878 }
1879
1880 err = l->update_entry(vm, pte, pde_i, pgsz_idx,
1881 iova, kind_v, ctag, cacheable,
1882 rw_flag, sparse);
1883 if (err)
1884 return err;
1885
1886 if (next_l->update_entry) {
1887 /* get cpu access to the ptes */
1888 err = map_gmmu_pages(next_pte);
1889 if (err) {
1890 gk20a_err(dev_from_vm(vm),
1891 "couldn't map ptes for update as=%d",
1892 vm_aspace_id(vm));
1893 return err;
1894 }
1895 err = update_gmmu_level_locked(vm, next_pte,
1896 pgsz_idx,
1897 iova,
1898 gpu_va,
1899 next,
1900 kind_v, ctag,
1901 cacheable, rw_flag, sparse, lvl+1);
1902 unmap_gmmu_pages(next_pte);
1903
1904 if (err)
1905 return err;
1906 }
1907
1908 if (iova)
1909 iova += next - gpu_va;
1910 pde_i++;
1911 gpu_va = next;
1912 }
1913
1914 gk20a_dbg_fn("done");
1915
1916 return 0;
1917}
1918
1919static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
1920 enum gmmu_pgsz_gk20a pgsz_idx,
1921 struct sg_table *sgt,
1922 u64 buffer_offset,
1923 u64 gpu_va, u64 gpu_end,
1924 u8 kind_v, u32 ctag_offset,
1925 bool cacheable,
1926 int rw_flag,
1927 bool sparse)
1928{
1929 struct gk20a *g = gk20a_from_vm(vm);
1930 int ctag_granularity = g->ops.fb.compression_page_size(g);
1931 u32 ctag = ctag_offset * ctag_granularity;
1932 u64 iova = 0;
1933 u64 space_to_skip = buffer_offset;
1934 u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
1935 int err;
1936
1937 gk20a_dbg(gpu_dbg_pte, "size_idx=%d, iova=%llx",
1938 pgsz_idx,
1939 sgt ? gk20a_mm_iova_addr(vm->mm->g, sgt->sgl) : 0ULL);
1970 1940
1971 FLUSH_CPU_DCACHE(pde, 1941 if (space_to_skip & (page_size - 1))
1972 sg_phys(vm->pdb.sgt->sgl) + (i*gmmu_pde__size_v()), 1942 return -EINVAL;
1973 sizeof(u32)*2); 1943
1944 if (sgt)
1945 iova = gk20a_mm_iova_addr(vm->mm->g, sgt->sgl) + space_to_skip;
1974 1946
1975 gk20a_mm_l2_invalidate(vm->mm->g); 1947 gk20a_dbg(gpu_dbg_map, "size_idx=%d, gpu_va=[%llx,%llx], iova=%llx",
1948 pgsz_idx, gpu_va, gpu_end-1, iova);
1949 err = map_gmmu_pages(&vm->pdb);
1950 if (err) {
1951 gk20a_err(dev_from_vm(vm),
1952 "couldn't map ptes for update as=%d",
1953 vm_aspace_id(vm));
1954 return err;
1955 }
1956 err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
1957 iova,
1958 gpu_va, gpu_end,
1959 kind_v, &ctag,
1960 cacheable, rw_flag, sparse, 0);
1961 unmap_gmmu_pages(&vm->pdb);
1962
1963 smp_mb();
1976 1964
1977 gk20a_dbg(gpu_dbg_pte, "pde:%d = 0x%x,0x%08x\n", i, pde_v[1], pde_v[0]);
1978 gk20a_dbg_fn("done"); 1965 gk20a_dbg_fn("done");
1966
1967 return err;
1979} 1968}
1980 1969
1981/* NOTE! mapped_buffers lock must be held */ 1970/* NOTE! mapped_buffers lock must be held */
@@ -1984,29 +1973,14 @@ void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
1984 struct vm_gk20a *vm = mapped_buffer->vm; 1973 struct vm_gk20a *vm = mapped_buffer->vm;
1985 struct gk20a *g = vm->mm->g; 1974 struct gk20a *g = vm->mm->g;
1986 1975
1987 if (mapped_buffer->va_node && 1976 g->ops.mm.gmmu_unmap(vm,
1988 mapped_buffer->va_node->sparse) { 1977 mapped_buffer->addr,
1989 u64 vaddr = mapped_buffer->addr; 1978 mapped_buffer->size,
1990 u32 pgsz_idx = mapped_buffer->pgsz_idx; 1979 mapped_buffer->pgsz_idx,
1991 u32 num_pages = mapped_buffer->size >> 1980 mapped_buffer->va_allocated,
1992 ilog2(vm->gmmu_page_sizes[pgsz_idx]); 1981 gk20a_mem_flag_none,
1993 1982 mapped_buffer->va_node ?
1994 /* there is little we can do if this fails... */ 1983 mapped_buffer->va_node->sparse : false);
1995 g->ops.mm.gmmu_unmap(vm,
1996 mapped_buffer->addr,
1997 mapped_buffer->size,
1998 mapped_buffer->pgsz_idx,
1999 mapped_buffer->va_allocated,
2000 gk20a_mem_flag_none);
2001 g->ops.mm.set_sparse(vm, vaddr,
2002 num_pages, pgsz_idx, false);
2003 } else
2004 g->ops.mm.gmmu_unmap(vm,
2005 mapped_buffer->addr,
2006 mapped_buffer->size,
2007 mapped_buffer->pgsz_idx,
2008 mapped_buffer->va_allocated,
2009 gk20a_mem_flag_none);
2010 1984
2011 gk20a_dbg(gpu_dbg_map, "as=%d pgsz=%d gv=0x%x,%08x own_mem_ref=%d", 1985 gk20a_dbg(gpu_dbg_map, "as=%d pgsz=%d gv=0x%x,%08x own_mem_ref=%d",
2012 vm_aspace_id(vm), 1986 vm_aspace_id(vm),
@@ -2057,7 +2031,7 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
2057 struct vm_reserved_va_node *va_node, *va_node_tmp; 2031 struct vm_reserved_va_node *va_node, *va_node_tmp;
2058 struct rb_node *node; 2032 struct rb_node *node;
2059 int i; 2033 int i;
2060 u32 pde_lo, pde_hi; 2034 u32 pde_lo = 0, pde_hi = 0;
2061 2035
2062 gk20a_dbg_fn(""); 2036 gk20a_dbg_fn("");
2063 mutex_lock(&vm->update_gmmu_lock); 2037 mutex_lock(&vm->update_gmmu_lock);
@@ -2082,7 +2056,8 @@ static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
2082 2056
2083 /* unmapping all buffers above may not actually free 2057 /* unmapping all buffers above may not actually free
2084 * all vm ptes. jettison them here for certain... */ 2058 * all vm ptes. jettison them here for certain... */
2085 pde_range_from_vaddr_range(vm, 0, vm->va_limit-1, 2059 pde_range_from_vaddr_range(vm,
2060 0, vm->va_limit-1,
2086 &pde_lo, &pde_hi); 2061 &pde_lo, &pde_hi);
2087 for (i = 0; i < pde_hi + 1; i++) { 2062 for (i = 0; i < pde_hi + 1; i++) {
2088 struct gk20a_mm_entry *entry = &vm->pdb.entries[i]; 2063 struct gk20a_mm_entry *entry = &vm->pdb.entries[i];
@@ -2125,6 +2100,30 @@ void gk20a_vm_put(struct vm_gk20a *vm)
2125 kref_put(&vm->ref, gk20a_vm_remove_support_kref); 2100 kref_put(&vm->ref, gk20a_vm_remove_support_kref);
2126} 2101}
2127 2102
2103const struct gk20a_mmu_level gk20a_mm_levels_64k[] = {
2104 {.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
2105 .lo_bit = {26, 26},
2106 .update_entry = update_gmmu_pde_locked,
2107 .entry_size = 8},
2108 {.hi_bit = {25, 25},
2109 .lo_bit = {12, 16},
2110 .update_entry = update_gmmu_pte_locked,
2111 .entry_size = 8},
2112 {.update_entry = NULL}
2113};
2114
2115const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
2116 {.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
2117 .lo_bit = {27, 27},
2118 .update_entry = update_gmmu_pde_locked,
2119 .entry_size = 8},
2120 {.hi_bit = {26, 26},
2121 .lo_bit = {12, 17},
2122 .update_entry = update_gmmu_pte_locked,
2123 .entry_size = 8},
2124 {.update_entry = NULL}
2125};
2126
2128int gk20a_init_vm(struct mm_gk20a *mm, 2127int gk20a_init_vm(struct mm_gk20a *mm,
2129 struct vm_gk20a *vm, 2128 struct vm_gk20a *vm,
2130 u32 big_page_size, 2129 u32 big_page_size,
@@ -2149,38 +2148,18 @@ int gk20a_init_vm(struct mm_gk20a *mm,
2149 vm->big_pages = big_pages; 2148 vm->big_pages = big_pages;
2150 2149
2151 vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big]; 2150 vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
2152 vm->pde_stride = vm->big_page_size << 10;
2153 vm->pde_stride_shift = ilog2(vm->pde_stride);
2154 2151
2155 for (i = 0; i < gmmu_nr_page_sizes; i++) { 2152 vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g,
2156 u32 num_ptes, pte_space, num_pages; 2153 vm->big_page_size);
2157 2154
2155 for (i = 0; i < gmmu_nr_page_sizes; i++)
2158 vm->gmmu_page_sizes[i] = gmmu_page_sizes[i]; 2156 vm->gmmu_page_sizes[i] = gmmu_page_sizes[i];
2159 2157
2160 /* assuming "full" page tables */ 2158 gk20a_dbg_info("small page-size (%dKB)",
2161 num_ptes = vm->pde_stride / gmmu_page_sizes[i]; 2159 vm->gmmu_page_sizes[gmmu_page_size_small] >> 10);
2162
2163 pte_space = num_ptes * gmmu_pte__size_v();
2164 /* allocate whole pages */
2165 pte_space = roundup(pte_space, PAGE_SIZE);
2166 2160
2167 num_pages = pte_space / PAGE_SIZE; 2161 gk20a_dbg_info("big page-size (%dKB)",
2168 /* make sure "order" is viable */ 2162 vm->gmmu_page_sizes[gmmu_page_size_big] >> 10);
2169 BUG_ON(!is_power_of_2(num_pages));
2170
2171 vm->page_table_sizing[i].num_ptes = num_ptes;
2172 vm->page_table_sizing[i].order = ilog2(num_pages);
2173 }
2174
2175 gk20a_dbg_info("small page-size (%dKB) pte array: %dKB",
2176 vm->gmmu_page_sizes[gmmu_page_size_small] >> 10,
2177 (vm->page_table_sizing[gmmu_page_size_small].num_ptes *
2178 gmmu_pte__size_v()) >> 10);
2179
2180 gk20a_dbg_info("big page-size (%dKB) pte array: %dKB",
2181 vm->gmmu_page_sizes[gmmu_page_size_big] >> 10,
2182 (vm->page_table_sizing[gmmu_page_size_big].num_ptes *
2183 gmmu_pte__size_v()) >> 10);
2184 2163
2185 pde_range_from_vaddr_range(vm, 2164 pde_range_from_vaddr_range(vm,
2186 0, vm->va_limit-1, 2165 0, vm->va_limit-1,
@@ -2197,7 +2176,7 @@ int gk20a_init_vm(struct mm_gk20a *mm,
2197 name, vm->va_limit, pde_hi + 1); 2176 name, vm->va_limit, pde_hi + 1);
2198 2177
2199 /* allocate the page table directory */ 2178 /* allocate the page table directory */
2200 err = gk20a_zalloc_gmmu_page_table(vm, 0, &vm->pdb); 2179 err = gk20a_zalloc_gmmu_page_table(vm, 0, &vm->mmu_levels[0], &vm->pdb);
2201 if (err) 2180 if (err)
2202 goto clean_up_ptes; 2181 goto clean_up_ptes;
2203 2182
@@ -2382,9 +2361,18 @@ int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
2382 2361
2383 /* mark that we need to use sparse mappings here */ 2362 /* mark that we need to use sparse mappings here */
2384 if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_SPARSE) { 2363 if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_SPARSE) {
2385 err = g->ops.mm.set_sparse(vm, vaddr_start, args->pages, 2364 u64 map_offset = g->ops.mm.gmmu_map(vm, vaddr_start,
2386 pgsz_idx, true); 2365 NULL,
2387 if (err) { 2366 0,
2367 va_node->size,
2368 pgsz_idx,
2369 0,
2370 0,
2371 args->flags,
2372 gk20a_mem_flag_none,
2373 false,
2374 true);
2375 if (!map_offset) {
2388 mutex_unlock(&vm->update_gmmu_lock); 2376 mutex_unlock(&vm->update_gmmu_lock);
2389 vma->free(vma, start_page_nr, args->pages, 1); 2377 vma->free(vma, start_page_nr, args->pages, 1);
2390 kfree(va_node); 2378 kfree(va_node);
@@ -2462,7 +2450,8 @@ int gk20a_vm_free_space(struct gk20a_as_share *as_share,
2462 va_node->size, 2450 va_node->size,
2463 va_node->pgsz_idx, 2451 va_node->pgsz_idx,
2464 true, 2452 true,
2465 gk20a_mem_flag_none); 2453 gk20a_mem_flag_none,
2454 true);
2466 kfree(va_node); 2455 kfree(va_node);
2467 } 2456 }
2468 mutex_unlock(&vm->update_gmmu_lock); 2457 mutex_unlock(&vm->update_gmmu_lock);
@@ -2741,13 +2730,25 @@ static int gk20a_init_hwpm(struct mm_gk20a *mm)
2741 return 0; 2730 return 0;
2742} 2731}
2743 2732
2733void gk20a_mm_init_pdb(struct gk20a *g, void *inst_ptr, u64 pdb_addr)
2734{
2735 u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
2736 u32 pdb_addr_hi = u64_hi32(pdb_addr);
2737
2738 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
2739 ram_in_page_dir_base_target_vid_mem_f() |
2740 ram_in_page_dir_base_vol_true_f() |
2741 ram_in_page_dir_base_lo_f(pdb_addr_lo));
2742
2743 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
2744 ram_in_page_dir_base_hi_f(pdb_addr_hi));
2745}
2746
2744void gk20a_init_inst_block(struct inst_desc *inst_block, struct vm_gk20a *vm, 2747void gk20a_init_inst_block(struct inst_desc *inst_block, struct vm_gk20a *vm,
2745 u32 big_page_size) 2748 u32 big_page_size)
2746{ 2749{
2747 struct gk20a *g = gk20a_from_vm(vm); 2750 struct gk20a *g = gk20a_from_vm(vm);
2748 u64 pde_addr = gk20a_mm_iova_addr(g, vm->pdb.sgt->sgl); 2751 u64 pde_addr = gk20a_mm_iova_addr(g, vm->pdb.sgt->sgl);
2749 u32 pde_addr_lo = u64_lo32(pde_addr >> ram_in_base_shift_v());
2750 u32 pde_addr_hi = u64_hi32(pde_addr);
2751 phys_addr_t inst_pa = inst_block->cpu_pa; 2752 phys_addr_t inst_pa = inst_block->cpu_pa;
2752 void *inst_ptr = inst_block->cpuva; 2753 void *inst_ptr = inst_block->cpuva;
2753 2754
@@ -2756,13 +2757,7 @@ void gk20a_init_inst_block(struct inst_desc *inst_block, struct vm_gk20a *vm,
2756 2757
2757 gk20a_dbg_info("pde pa=0x%llx", (u64)pde_addr); 2758 gk20a_dbg_info("pde pa=0x%llx", (u64)pde_addr);
2758 2759
2759 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(), 2760 g->ops.mm.init_pdb(g, inst_ptr, pde_addr);
2760 ram_in_page_dir_base_target_vid_mem_f() |
2761 ram_in_page_dir_base_vol_true_f() |
2762 ram_in_page_dir_base_lo_f(pde_addr_lo));
2763
2764 gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
2765 ram_in_page_dir_base_hi_f(pde_addr_hi));
2766 2761
2767 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(), 2762 gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
2768 u64_lo32(vm->va_limit) | 0xFFF); 2763 u64_lo32(vm->va_limit) | 0xFFF);
@@ -3030,6 +3025,13 @@ u32 gk20a_mm_get_physical_addr_bits(struct gk20a *g)
3030 return 34; 3025 return 34;
3031} 3026}
3032 3027
3028const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g,
3029 u32 big_page_size)
3030{
3031 return (big_page_size == SZ_64K) ?
3032 gk20a_mm_levels_64k : gk20a_mm_levels_128k;
3033}
3034
3033void gk20a_init_mm(struct gpu_ops *gops) 3035void gk20a_init_mm(struct gpu_ops *gops)
3034{ 3036{
3035 gops->mm.is_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled; 3037 gops->mm.is_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled;
@@ -3043,5 +3045,7 @@ void gk20a_init_mm(struct gpu_ops *gops)
3043 gops->mm.l2_flush = gk20a_mm_l2_flush; 3045 gops->mm.l2_flush = gk20a_mm_l2_flush;
3044 gops->mm.tlb_invalidate = gk20a_mm_tlb_invalidate; 3046 gops->mm.tlb_invalidate = gk20a_mm_tlb_invalidate;
3045 gops->mm.get_physical_addr_bits = gk20a_mm_get_physical_addr_bits; 3047 gops->mm.get_physical_addr_bits = gk20a_mm_get_physical_addr_bits;
3048 gops->mm.get_mmu_levels = gk20a_mm_get_mmu_levels;
3049 gops->mm.init_pdb = gk20a_mm_init_pdb;
3046} 3050}
3047 3051
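A quick sanity check of the new table sizing and walk, using the gk20a_mm_levels_64k table defined above and assuming 4 KB kernel pages (PAGE_SHIFT = 12); this is an illustration, not part of the patch:

    PTE level, small (4 KB) GPU pages, VA bits [25:12]:
        index bits  = hi_bit - lo_bit + 1 = 25 - 12 + 1 = 14
        table size  = 2^14 entries * 8 B  = 128 KB
        alloc order = 14 + ilog2(8) - PAGE_SHIFT = 14 + 3 - 12 = 5   (32 pages)

    PTE level, big (64 KB) GPU pages, VA bits [25:16]:
        index bits  = 25 - 16 + 1 = 10
        table size  = 2^10 entries * 8 B = 8 KB
        alloc order = 10 + 3 - 12 = 1                                (2 pages)

These match what the removed page_table_sizing[] code produced (64 MB of VA per PDE divided by the page size). During the walk, update_gmmu_level_locked() indexes a level with pde_i = (gpu_va & ((1ULL << (hi_bit + 1)) - 1)) >> lo_bit, advances gpu_va to the next boundary of pde_size = 1ULL << lo_bit (or to gpu_end), and recurses into the next level as long as that level defines an update_entry callback.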
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 7b355436..42c164be 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -268,6 +268,18 @@ struct vm_reserved_va_node {
268 bool sparse; 268 bool sparse;
269}; 269};
270 270
271struct gk20a_mmu_level {
272 int hi_bit[2];
273 int lo_bit[2];
274 int (*update_entry)(struct vm_gk20a *vm,
275 struct gk20a_mm_entry *pte,
276 u32 i, u32 gmmu_pgsz_idx,
277 u64 iova,
278 u32 kind_v, u32 *ctag,
279 bool cacheable, int rw_flag, bool sparse);
280 size_t entry_size;
281};
282
271struct vm_gk20a { 283struct vm_gk20a {
272 struct mm_gk20a *mm; 284 struct mm_gk20a *mm;
273 struct gk20a_as_share *as_share; /* as_share this represents */ 285 struct gk20a_as_share *as_share; /* as_share this represents */
@@ -282,13 +294,8 @@ struct vm_gk20a {
282 bool mapped; 294 bool mapped;
283 295
284 u32 big_page_size; 296 u32 big_page_size;
285 u32 pde_stride;
286 u32 pde_stride_shift;
287 297
288 struct { 298 const struct gk20a_mmu_level *mmu_levels;
289 u32 order;
290 u32 num_ptes;
291 } page_table_sizing[gmmu_nr_page_sizes];
292 299
293 struct kref ref; 300 struct kref ref;
294 301
@@ -450,7 +457,8 @@ u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
450 u32 ctag_offset, 457 u32 ctag_offset,
451 u32 flags, 458 u32 flags,
452 int rw_flag, 459 int rw_flag,
453 bool clear_ctags); 460 bool clear_ctags,
461 bool sparse);
454 462
455void gk20a_gmmu_unmap(struct vm_gk20a *vm, 463void gk20a_gmmu_unmap(struct vm_gk20a *vm,
456 u64 vaddr, 464 u64 vaddr,
@@ -462,7 +470,8 @@ void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
462 u64 size, 470 u64 size,
463 int pgsz_idx, 471 int pgsz_idx,
464 bool va_allocated, 472 bool va_allocated,
465 int rw_flag); 473 int rw_flag,
474 bool sparse);
466 475
467struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf); 476struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf);
468void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf, 477void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
@@ -557,13 +566,10 @@ void unmap_gmmu_pages(struct gk20a_mm_entry *entry);
557void pde_range_from_vaddr_range(struct vm_gk20a *vm, 566void pde_range_from_vaddr_range(struct vm_gk20a *vm,
558 u64 addr_lo, u64 addr_hi, 567 u64 addr_lo, u64 addr_hi,
559 u32 *pde_lo, u32 *pde_hi); 568 u32 *pde_lo, u32 *pde_hi);
569int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm);
560u32 *pde_from_index(struct vm_gk20a *vm, u32 i); 570u32 *pde_from_index(struct vm_gk20a *vm, u32 i);
561u32 pte_index_from_vaddr(struct vm_gk20a *vm, 571u32 pte_index_from_vaddr(struct vm_gk20a *vm,
562 u64 addr, enum gmmu_pgsz_gk20a pgsz_idx); 572 u64 addr, enum gmmu_pgsz_gk20a pgsz_idx);
563int validate_gmmu_page_table_gk20a_locked(struct vm_gk20a *vm,
564 u32 i, enum gmmu_pgsz_gk20a gmmu_pgsz_idx);
565
566void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i);
567void free_gmmu_pages(struct vm_gk20a *vm, 573void free_gmmu_pages(struct vm_gk20a *vm,
568 struct gk20a_mm_entry *entry); 574 struct gk20a_mm_entry *entry);
569 575
@@ -571,4 +577,11 @@ u32 gk20a_mm_get_physical_addr_bits(struct gk20a *g);
571 577
572struct gpu_ops; 578struct gpu_ops;
573void gk20a_init_mm(struct gpu_ops *gops); 579void gk20a_init_mm(struct gpu_ops *gops);
580const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g,
581 u32 big_page_size);
582void gk20a_mm_init_pdb(struct gk20a *g, void *inst_ptr, u64 pdb_addr);
583
584extern const struct gk20a_mmu_level gk20a_mm_levels_64k[];
585extern const struct gk20a_mmu_level gk20a_mm_levels_128k[];
586
574#endif /* MM_GK20A_H */ 587#endif /* MM_GK20A_H */
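The two exported level tables encode the same VA split the removed pde_stride/pde_stride_shift fields did, but per level and per page size: with 64 KB big pages a PDE covers VA bits [NV_GMMU_VA_RANGE-1:26] (64 MB) and PTEs cover bits [25:12] for 4 KB mappings or [25:16] for 64 KB mappings; with 128 KB big pages a PDE covers bits down to 27 (128 MB) and PTEs cover [26:12] or [26:17]. This mirrors the old pde_stride = big_page_size << 10 (64 KB << 10 = 64 MB, 128 KB << 10 = 128 MB). Because the tables and gk20a_mm_get_mmu_levels() are now exported, chip code can simply reuse them, as the gm20b hunk below does:

    /* from gm20b_init_mm() below: reuse the gk20a level tables and PDB programming */
    gops->mm.get_mmu_levels = gk20a_mm_get_mmu_levels;
    gops->mm.init_pdb = gk20a_mm_init_pdb;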
diff --git a/drivers/gpu/nvgpu/gm20b/mm_gm20b.c b/drivers/gpu/nvgpu/gm20b/mm_gm20b.c
index 37ab70fa..f85a1718 100644
--- a/drivers/gpu/nvgpu/gm20b/mm_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/mm_gm20b.c
@@ -21,152 +21,6 @@
21#include "hw_gr_gm20b.h" 21#include "hw_gr_gm20b.h"
22#include "hw_ram_gm20b.h" 22#include "hw_ram_gm20b.h"
23 23
24static int allocate_gmmu_ptes_sparse(struct vm_gk20a *vm,
25 enum gmmu_pgsz_gk20a pgsz_idx,
26 u64 first_vaddr, u64 last_vaddr,
27 bool clear, bool refplus)
28{
29 int err;
30 u32 pte_lo, pte_hi;
31 u32 pde_lo, pde_hi;
32 u32 pte_w[2] = {0, 0}; /* invalid pte */
33 u64 addr = 0;
34 u32 pte_cur;
35 struct gk20a_mm_entry *entry;
36 struct gk20a *g = gk20a_from_vm(vm);
37
38 gk20a_dbg_fn("");
39
40 pde_range_from_vaddr_range(vm, first_vaddr, last_vaddr,
41 &pde_lo, &pde_hi);
42
43 gk20a_dbg(gpu_dbg_pte, "size_idx=%d, pde_lo=%d, pde_hi=%d",
44 pgsz_idx, pde_lo, pde_hi);
45
46 /* Expect ptes of the same pde */
47 BUG_ON(pde_lo != pde_hi);
48
49 entry = vm->pdb.entries + pde_lo;
50
51 pte_lo = pte_index_from_vaddr(vm, first_vaddr, pgsz_idx);
52 pte_hi = pte_index_from_vaddr(vm, last_vaddr, pgsz_idx);
53
54 /* get cpu access to the ptes */
55 err = map_gmmu_pages(entry);
56 if (err)
57 goto fail;
58
59 gk20a_dbg(gpu_dbg_pte, "pte_lo=%d, pte_hi=%d", pte_lo, pte_hi);
60 for (pte_cur = pte_lo; pte_cur <= pte_hi; pte_cur++) {
61 pte_w[0] = gmmu_pte_valid_false_f();
62 pte_w[1] = clear ? 0 : gmmu_pte_vol_true_f();
63
64 gk20a_dbg(gpu_dbg_pte,
65 "pte_cur=%d addr=%llx"
66 " [0x%08x,0x%08x]",
67 pte_cur, addr,
68 pte_w[1], pte_w[0]);
69
70 gk20a_mem_wr32(entry->cpu_va + pte_cur*8, 0, pte_w[0]);
71 gk20a_mem_wr32(entry->cpu_va + pte_cur*8, 1, pte_w[1]);
72 }
73
74 unmap_gmmu_pages(entry);
75
76 smp_mb();
77 g->ops.mm.tlb_invalidate(vm);
78
79 return 0;
80fail:
81 return err;
82
83}
84
85static bool gm20b_vm_is_pde_in_range(struct vm_gk20a *vm, u64 vaddr_lo,
86 u64 vaddr_hi, u32 pde)
87{
88 u64 pde_vaddr_lo, pde_vaddr_hi;
89
90 gk20a_dbg_fn("");
91
92 pde_vaddr_lo = (u64)pde << vm->pde_stride_shift;
93 pde_vaddr_hi = pde_vaddr_lo |
94 ((0x1UL << (vm->pde_stride_shift)) - 1);
95
96 return ((vaddr_lo <= pde_vaddr_lo) && (vaddr_hi) >= pde_vaddr_hi);
97}
98
99static int gm20b_vm_put_sparse(struct vm_gk20a *vm, u64 vaddr,
100 u32 num_pages, u32 pgsz_idx, bool refplus)
101{
102 struct mm_gk20a *mm = vm->mm;
103 u32 pgsz = vm->gmmu_page_sizes[pgsz_idx];
104 u32 pde_shift = vm->pde_stride_shift;
105 u64 vaddr_hi;
106 u64 vaddr_pde_start;
107 u32 i;
108 u32 pde_lo, pde_hi;
109 int err;
110
111 gk20a_dbg_fn("");
112
113 vaddr_hi = vaddr + pgsz * (u64)num_pages - 1;
114 pde_range_from_vaddr_range(vm,
115 vaddr,
116 vaddr_hi,
117 &pde_lo, &pde_hi);
118
119 gk20a_dbg_info("vaddr: 0x%llx, vaddr_hi: 0x%llx, pde_lo: 0x%x, "
120 "pde_hi: 0x%x, pgsz: %d, pde_stride_shift: %d",
121 vaddr, vaddr_hi, pde_lo, pde_hi, pgsz,
122 vm->pde_stride_shift);
123
124 for (i = pde_lo; i <= pde_hi; i++) {
125 /* Mark all ptes as sparse. */
126 err = validate_gmmu_page_table_gk20a_locked(vm, i,
127 pgsz_idx);
128 if (err) {
129 gk20a_err(dev_from_vm(vm),
130 "failed to validate page table %d: %d",
131 i, err);
132 goto fail;
133 }
134
135 if (gm20b_vm_is_pde_in_range(vm, vaddr, vaddr_hi, i)) {
136 /* entire pde is marked as sparse */
137 vaddr_pde_start = (u64)i << pde_shift;
138 allocate_gmmu_ptes_sparse(vm, pgsz_idx,
139 vaddr_pde_start,
140 PDE_ADDR_END(vaddr_pde_start,
141 pde_shift), false, refplus);
142 } else {
143 /* Check leading and trailing spaces which doesn't fit
144 * into entire pde. */
145 if (pde_lo == pde_hi)
146 allocate_gmmu_ptes_sparse(vm, pgsz_idx, vaddr,
147 vaddr_hi, false, refplus);
148 else if (i == pde_lo)
149 allocate_gmmu_ptes_sparse(vm, pgsz_idx, vaddr,
150 PDE_ADDR_END(vaddr, pde_shift), false,
151 refplus);
152 else
153 allocate_gmmu_ptes_sparse(vm, pgsz_idx,
154 PDE_ADDR_START(vaddr_hi, pde_shift),
155 vaddr_hi, false,
156 refplus);
157 }
158 }
159
160 gk20a_mm_l2_flush(mm->g, true);
161
162 return 0;
163
164fail:
165 WARN_ON(1);
166
167 return err;
168}
169
170static int gm20b_mm_mmu_vpr_info_fetch_wait(struct gk20a *g, 24static int gm20b_mm_mmu_vpr_info_fetch_wait(struct gk20a *g,
171 const unsigned int msec) 25 const unsigned int msec)
172{ 26{
@@ -249,9 +103,14 @@ static u32 gm20b_mm_get_big_page_sizes(void)
249 return SZ_64K | SZ_128K; 103 return SZ_64K | SZ_128K;
250} 104}
251 105
106static bool gm20b_mm_support_sparse(struct gk20a *g)
107{
108 return true;
109}
110
252void gm20b_init_mm(struct gpu_ops *gops) 111void gm20b_init_mm(struct gpu_ops *gops)
253{ 112{
254 gops->mm.set_sparse = gm20b_vm_put_sparse; 113 gops->mm.support_sparse = gm20b_mm_support_sparse;
255 gops->mm.is_debug_mode_enabled = gm20b_mm_mmu_debug_mode_enabled; 114 gops->mm.is_debug_mode_enabled = gm20b_mm_mmu_debug_mode_enabled;
256 gops->mm.gmmu_map = gk20a_locked_gmmu_map; 115 gops->mm.gmmu_map = gk20a_locked_gmmu_map;
257 gops->mm.gmmu_unmap = gk20a_locked_gmmu_unmap; 116 gops->mm.gmmu_unmap = gk20a_locked_gmmu_unmap;
@@ -265,4 +124,6 @@ void gm20b_init_mm(struct gpu_ops *gops)
265 gops->mm.set_big_page_size = gm20b_mm_set_big_page_size; 124 gops->mm.set_big_page_size = gm20b_mm_set_big_page_size;
266 gops->mm.get_big_page_sizes = gm20b_mm_get_big_page_sizes; 125 gops->mm.get_big_page_sizes = gm20b_mm_get_big_page_sizes;
267 gops->mm.get_physical_addr_bits = gk20a_mm_get_physical_addr_bits; 126 gops->mm.get_physical_addr_bits = gk20a_mm_get_physical_addr_bits;
127 gops->mm.get_mmu_levels = gk20a_mm_get_mmu_levels;
128 gops->mm.init_pdb = gk20a_mm_init_pdb;
268} 129}
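With this change gm20b no longer carries its own sparse PTE writer (gm20b_vm_put_sparse and allocate_gmmu_ptes_sparse above are removed); it only reports support, and the sparse flag is threaded through the common gmmu_map/gmmu_unmap path. In update_gmmu_pte_locked() (mm_gk20a.c hunk earlier), a sparse entry with no backing iova is written with the same bits the removed gm20b helper used:

    } else if (sparse) {
            /* no backing memory: invalid PTE with the volatile bit set (the sparse encoding) */
            pte_w[0] = gmmu_pte_valid_false_f();
            pte_w[1] |= gmmu_pte_vol_true_f();
    }

Sparse reservations are now created by calling g->ops.mm.gmmu_map() with a NULL sg_table and sparse=true (see gk20a_vm_alloc_space), and unmapping a buffer that lives inside a sparse reservation passes sparse=true so its PTEs return to this encoding rather than to a plain invalid entry.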
diff --git a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
index 57814f1b..9b7c7dbd 100644
--- a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
@@ -40,8 +40,8 @@ static int vgpu_init_mm_setup_sw(struct gk20a *g)
40 40
41 /* gk20a_init_gpu_characteristics expects this to be populated */ 41 /* gk20a_init_gpu_characteristics expects this to be populated */
42 vm->big_page_size = big_page_size; 42 vm->big_page_size = big_page_size;
43 vm->pde_stride = vm->big_page_size << 10; 43 vm->mmu_levels = (vm->big_page_size == SZ_64K) ?
44 vm->pde_stride_shift = ilog2(vm->pde_stride); 44 gk20a_mm_levels_64k : gk20a_mm_levels_128k;
45 45
46 mm->sw_ready = true; 46 mm->sw_ready = true;
47 47
@@ -65,7 +65,8 @@ static u64 vgpu_locked_gmmu_map(struct vm_gk20a *vm,
65 u32 ctag_offset, 65 u32 ctag_offset,
66 u32 flags, 66 u32 flags,
67 int rw_flag, 67 int rw_flag,
68 bool clear_ctags) 68 bool clear_ctags,
69 bool sparse)
69{ 70{
70 int err = 0; 71 int err = 0;
71 struct device *d = dev_from_vm(vm); 72 struct device *d = dev_from_vm(vm);
@@ -128,7 +129,8 @@ static void vgpu_locked_gmmu_unmap(struct vm_gk20a *vm,
128 u64 size, 129 u64 size,
129 int pgsz_idx, 130 int pgsz_idx,
130 bool va_allocated, 131 bool va_allocated,
131 int rw_flag) 132 int rw_flag,
133 bool sparse)
132{ 134{
133 struct gk20a *g = gk20a_from_vm(vm); 135 struct gk20a *g = gk20a_from_vm(vm);
134 struct gk20a_platform *platform = gk20a_get_platform(g->dev); 136 struct gk20a_platform *platform = gk20a_get_platform(g->dev);