summaryrefslogtreecommitdiffstats
path: root/drivers/gpu
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu')
-rw-r--r--drivers/gpu/nvgpu/common/mm/pd_cache.c15
-rw-r--r--drivers/gpu/nvgpu/include/nvgpu/gmmu.h19
2 files changed, 30 insertions, 4 deletions
diff --git a/drivers/gpu/nvgpu/common/mm/pd_cache.c b/drivers/gpu/nvgpu/common/mm/pd_cache.c
index a5b3d134..8f7003e5 100644
--- a/drivers/gpu/nvgpu/common/mm/pd_cache.c
+++ b/drivers/gpu/nvgpu/common/mm/pd_cache.c
@@ -423,12 +423,19 @@ static void nvgpu_pd_cache_do_free(struct gk20a *g,
423 * this just re-adds it. 423 * this just re-adds it.
424 * 424 *
425 * Since the memory used for the entries is still mapped, if 425 * Since the memory used for the entries is still mapped, if
426 * igpu make sure the entries are invalidated so that the hw 426 * iommu is being used, make sure PTE entries in particular
427 * doesn't accidentally try to prefetch non-existent fb memory. 427 * are invalidated so that the hw doesn't accidentally try to
428 * prefetch non-existent fb memory.
428 * 429 *
429 * TBD: what about dgpu? (Not supported in Drive 5.0) 430 * Notes:
431 * - The check for NVGPU_PD_CACHE_SIZE > PAGE_SIZE effectively
432 * determines whether PTE entries use the cache.
433 * - In the case where PTE entries uses the cache, we also
434 * end up invalidating the PDE entries, but that's a minor
435 * performance hit, as there are far fewer of those
436 * typically than there are PTE entries.
430 */ 437 */
431 if (pd->mem->cpu_va != NULL) { 438 if (nvgpu_iommuable(g) && (NVGPU_PD_CACHE_SIZE > PAGE_SIZE)) {
432 memset((void *)((u64)pd->mem->cpu_va + pd->mem_offs), 0, 439 memset((void *)((u64)pd->mem->cpu_va + pd->mem_offs), 0,
433 pentry->pd_size); 440 pentry->pd_size);
434 } 441 }
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
index ee2207b8..2fc0d44e 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gmmu.h
@@ -29,6 +29,7 @@
29#include <nvgpu/rbtree.h> 29#include <nvgpu/rbtree.h>
30#include <nvgpu/lock.h> 30#include <nvgpu/lock.h>
31#include <nvgpu/bitops.h> 31#include <nvgpu/bitops.h>
32#include <nvgpu/mm.h>
32 33
33/* 34/*
34 * This is the GMMU API visible to blocks outside of the GMMU. Basically this 35 * This is the GMMU API visible to blocks outside of the GMMU. Basically this
@@ -54,10 +55,28 @@ enum gk20a_mem_rw_flag {
54 * Minimum size of a cache. The number of different caches in the nvgpu_pd_cache 55 * Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
55 * structure is of course depending on this. The MIN_SHIFT define is the right 56 * structure is of course depending on this. The MIN_SHIFT define is the right
56 * number of bits to shift to determine which list to use in the array of lists. 57 * number of bits to shift to determine which list to use in the array of lists.
58 *
59 * For Linux, limit the use of the cache to entries less than the page size, to
60 * avoid potential problems with running out of CMA memory when allocating large,
61 * contiguous slabs, as would be required for non-iommuable chips.
57 */ 62 */
58#define NVGPU_PD_CACHE_MIN 256U 63#define NVGPU_PD_CACHE_MIN 256U
59#define NVGPU_PD_CACHE_MIN_SHIFT 9U 64#define NVGPU_PD_CACHE_MIN_SHIFT 9U
65
66#ifdef __KERNEL__
67
68#if PAGE_SIZE == 4096
69#define NVGPU_PD_CACHE_COUNT 4U
70#elif PAGE_SIZE == 65536
60#define NVGPU_PD_CACHE_COUNT 8U 71#define NVGPU_PD_CACHE_COUNT 8U
72#else
73#error "Unsupported page size."
74#endif
75
76#else
77#define NVGPU_PD_CACHE_COUNT 8U
78#endif
79
61#define NVGPU_PD_CACHE_SIZE (NVGPU_PD_CACHE_MIN * (1U << NVGPU_PD_CACHE_COUNT)) 80#define NVGPU_PD_CACHE_SIZE (NVGPU_PD_CACHE_MIN * (1U << NVGPU_PD_CACHE_COUNT))
62 81
63struct nvgpu_pd_mem_entry { 82struct nvgpu_pd_mem_entry {