author     Sage Weil <sage@inktank.com>    2013-08-15 14:11:45 -0400
committer  Sage Weil <sage@inktank.com>    2013-08-15 14:11:45 -0400
commit     ee3e542fec6e69bc9fb668698889a37d93950ddf (patch)
tree       e74ee766a4764769ef1d3d45d266b4dea64101d3 /arch/powerpc/mm
parent     fe2a801b50c0bb8039d627e5ae1fec249d10ff39 (diff)
parent     f1d6e17f540af37bb1891480143669ba7636c4cf (diff)
Merge remote-tracking branch 'linus/master' into testing
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/44x_mmu.c                 6
-rw-r--r--  arch/powerpc/mm/Makefile                  8
-rw-r--r--  arch/powerpc/mm/gup.c                    18
-rw-r--r--  arch/powerpc/mm/hash_low_64.S            21
-rw-r--r--  arch/powerpc/mm/hash_native_64.c        207
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c          67
-rw-r--r--  arch/powerpc/mm/hugepage-hash64.c       175
-rw-r--r--  arch/powerpc/mm/hugetlbpage-hash64.c      2
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c           301
-rw-r--r--  arch/powerpc/mm/init_64.c                 9
-rw-r--r--  arch/powerpc/mm/mem.c                    63
-rw-r--r--  arch/powerpc/mm/mmap.c (renamed from arch/powerpc/mm/mmap_64.c)   2
-rw-r--r--  arch/powerpc/mm/mmu_context_nohash.c     15
-rw-r--r--  arch/powerpc/mm/numa.c                   71
-rw-r--r--  arch/powerpc/mm/pgtable.c                 8
-rw-r--r--  arch/powerpc/mm/pgtable_64.c            414
-rw-r--r--  arch/powerpc/mm/subpage-prot.c           48
-rw-r--r--  arch/powerpc/mm/tlb_hash64.c             40
-rw-r--r--  arch/powerpc/mm/tlb_nohash.c              2
19 files changed, 1139 insertions(+), 338 deletions(-)
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index 2c9441ee6bb8..82b1ff759e26 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -41,7 +41,7 @@ int icache_44x_need_flush;
 
 unsigned long tlb_47x_boltmap[1024/8];
 
-static void __cpuinit ppc44x_update_tlb_hwater(void)
+static void ppc44x_update_tlb_hwater(void)
 {
 	extern unsigned int tlb_44x_patch_hwater_D[];
 	extern unsigned int tlb_44x_patch_hwater_I[];
@@ -134,7 +134,7 @@ static void __init ppc47x_update_boltmap(void)
 /*
  * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
  */
-static void __cpuinit ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
+static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
 {
 	unsigned int rA;
 	int bolted;
@@ -229,7 +229,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 }
 
 #ifdef CONFIG_SMP
-void __cpuinit mmu_init_secondary(int cpu)
+void mmu_init_secondary(int cpu)
 {
 	unsigned long addr;
 	unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index cf16b5733eaa..51230ee6a407 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -6,17 +6,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
 
-obj-y				:= fault.o mem.o pgtable.o gup.o \
+obj-y				:= fault.o mem.o pgtable.o gup.o mmap.o \
 				   init_$(CONFIG_WORD_SIZE).o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 				   tlb_nohash_low.o
 obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(CONFIG_WORD_SIZE)e.o
-obj-$(CONFIG_PPC64)		+= mmap_64.o
 hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hash_utils_64.o \
 				   slb_low.o slb.o stab.o \
-				   mmap_64.o $(hash64-y)
+				   $(hash64-y)
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
 				   tlb_hash$(CONFIG_WORD_SIZE).o \
@@ -28,11 +27,12 @@ obj-$(CONFIG_44x)		+= 44x_mmu.o
 obj-$(CONFIG_PPC_FSL_BOOK3E)	+= fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
-ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-y				+= hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)	+= hugetlbpage-book3e.o
 endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index 4b921affa495..49822d90ea96 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -34,7 +34,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 
 	ptep = pte_offset_kernel(&pmd, addr);
 	do {
-		pte_t pte = *ptep;
+		pte_t pte = ACCESS_ONCE(*ptep);
 		struct page *page;
 
 		if ((pte_val(pte) & mask) != result)
@@ -63,12 +63,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 
 	pmdp = pmd_offset(&pud, addr);
 	do {
-		pmd_t pmd = *pmdp;
+		pmd_t pmd = ACCESS_ONCE(*pmdp);
 
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(pmd))
+		/*
+		 * If we find a splitting transparent hugepage we
+		 * return zero. That will result in taking the slow
+		 * path which will call wait_split_huge_page()
+		 * if the pmd is still in splitting state
+		 */
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
-		if (pmd_huge(pmd)) {
+		if (pmd_huge(pmd) || pmd_large(pmd)) {
 			if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
 					 write, pages, nr))
 				return 0;
@@ -91,7 +97,7 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 
 	pudp = pud_offset(&pgd, addr);
 	do {
-		pud_t pud = *pudp;
+		pud_t pud = ACCESS_ONCE(*pudp);
 
 		next = pud_addr_end(addr, end);
 		if (pud_none(pud))
@@ -154,7 +160,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 
 	pgdp = pgd_offset(mm, addr);
 	do {
-		pgd_t pgd = *pgdp;
+		pgd_t pgd = ACCESS_ONCE(*pgdp);
 
 		pr_devel(" %016lx: normal pgd %p\n", addr,
 			 (void *)pgd_val(pgd));
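
Each gup_*_range() hunk above applies the same lockless-walk rule: snapshot the page-table entry once with ACCESS_ONCE(), test only that local copy, and give up (return 0, so the caller falls back to the slow get_user_pages() path) when a transparent hugepage is in the middle of a split. A condensed sketch of that rule, using a hypothetical helper name (illustration only, not part of the patch):

/* Sketch: classify one PMD the way gup_pmd_range() above does. */
static int classify_pmd(pmd_t *pmdp)
{
	pmd_t pmd = ACCESS_ONCE(*pmdp);	/* one snapshot; never re-read *pmdp */

	if (pmd_none(pmd) || pmd_trans_splitting(pmd))
		return 0;	/* bail out: slow path will wait_split_huge_page() */
	if (pmd_huge(pmd) || pmd_large(pmd))
		return 1;	/* leaf entry covering a huge page */
	return 2;		/* normal pointer to a PTE page: keep walking */
}
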
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 0e980acae67c..d3cbda62857b 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -289,9 +289,10 @@ htab_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_4K		/* page size */
-	ld	r7,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r8,STK_PARAM(R8)(r1)	/* get "local" param */
+	li	r6,MMU_PAGE_4K		/* base page size */
+	li	r7,MMU_PAGE_4K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
 	bl	.			/* Patched by htab_finish_init() */
 
@@ -649,9 +650,10 @@ htab_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_4K		/* page size */
-	ld	r7,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r8,STK_PARAM(R8)(r1)	/* get "local" param */
+	li	r6,MMU_PAGE_4K		/* base page size */
+	li	r7,MMU_PAGE_4K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
 	bl	.			/* patched by htab_finish_init() */
 
@@ -937,9 +939,10 @@ ht64_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_64K
-	ld	r7,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r8,STK_PARAM(R8)(r1)	/* get "local" param */
+	li	r6,MMU_PAGE_64K		/* base page size */
+	li	r7,MMU_PAGE_64K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
 _GLOBAL(ht64_call_hpte_updatepp)
 	bl	.			/* patched by htab_finish_init() */
 
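
The extra register loaded at each call site above mirrors the widened hpte_updatepp() hook: callers now pass a base page size and an actual page size separately, because the two differ once a 16M transparent hugepage is hashed in a segment whose base page size is 4K or 64K. For an ordinary page the two arguments are simply equal, as the C caller in hugetlbpage-hash64.c below does; a hedged sketch of such a call:

	/* Sketch: regular page, so base psize == actual psize. */
	ret = ppc_md.hpte_updatepp(slot, newpp, vpn,
				   MMU_PAGE_4K,	/* base page size   */
				   MMU_PAGE_4K,	/* actual page size */
				   ssize, local);
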
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 4c122c3f1623..c33d939120c9 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -43,6 +43,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
 {
 	unsigned long va;
 	unsigned int penc;
+	unsigned long sllp;
 
 	/*
 	 * We need 14 to 65 bits of va for a tlibe of 4K page
@@ -64,7 +65,9 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
 		/* clear out bits after (52) [0....52.....63] */
 		va &= ~((1ul << (64 - 52)) - 1);
 		va |= ssize << 8;
-		va |= mmu_psize_defs[apsize].sllp << 6;
+		sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
+			((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+		va |= sllp << 5;
 		asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
 			     : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
 			     : "memory");
@@ -98,6 +101,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
 {
 	unsigned long va;
 	unsigned int penc;
+	unsigned long sllp;
 
 	/* VPN_SHIFT can be atmost 12 */
 	va = vpn << VPN_SHIFT;
@@ -113,7 +117,9 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
 		/* clear out bits after(52) [0....52.....63] */
 		va &= ~((1ul << (64 - 52)) - 1);
 		va |= ssize << 8;
-		va |= mmu_psize_defs[apsize].sllp << 6;
+		sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
+			((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+		va |= sllp << 5;
 		asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
 			     : : "r"(va) : "memory");
 		break;
@@ -273,61 +279,15 @@ static long native_hpte_remove(unsigned long hpte_group)
 	return i;
 }
 
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
-	int i, shift;
-	unsigned int mask;
-
-	/* start from 1 ignoring MMU_PAGE_4K */
-	for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
-		/* invalid penc */
-		if (mmu_psize_defs[psize].penc[i] == -1)
-			continue;
-		/*
-		 * encoding bits per actual page size
-		 *        PTE LP     actual page size
-		 *    rrrr rrrz		>=8KB
-		 *    rrrr rrzz		>=16KB
-		 *    rrrr rzzz		>=32KB
-		 *    rrrr zzzz		>=64KB
-		 *	.......
-		 */
-		shift = mmu_psize_defs[i].shift - LP_SHIFT;
-		if (shift > LP_BITS)
-			shift = LP_BITS;
-		mask = (1 << shift) - 1;
-		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
-			return i;
-	}
-	return -1;
-}
-
-static inline int hpte_actual_psize(struct hash_pte *hptep, int psize)
-{
-	/* Look at the 8 bit LP value */
-	unsigned int lp = (hptep->r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
-
-	if (!(hptep->v & HPTE_V_VALID))
-		return -1;
-
-	/* First check if it is large page */
-	if (!(hptep->v & HPTE_V_LARGE))
-		return MMU_PAGE_4K;
-
-	return __hpte_actual_psize(lp, psize);
-}
-
 static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
-				 unsigned long vpn, int psize, int ssize,
-				 int local)
+				 unsigned long vpn, int bpsize,
+				 int apsize, int ssize, int local)
 {
 	struct hash_pte *hptep = htab_address + slot;
 	unsigned long hpte_v, want_v;
 	int ret = 0;
-	int actual_psize;
 
-	want_v = hpte_encode_avpn(vpn, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, bpsize, ssize);
 
 	DBG_LOW("	update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
 		vpn, want_v & HPTE_V_AVPN, slot, newpp);
@@ -335,7 +295,6 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	native_lock_hpte(hptep);
 
 	hpte_v = hptep->v;
-	actual_psize = hpte_actual_psize(hptep, psize);
 	/*
 	 * We need to invalidate the TLB always because hpte_remove doesn't do
 	 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -343,12 +302,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	 * (hpte_remove) because we assume the old translation is still
 	 * technically "valid".
 	 */
-	if (actual_psize < 0) {
-		actual_psize = psize;
-		ret = -1;
-		goto err_out;
-	}
-	if (!HPTE_V_COMPARE(hpte_v, want_v)) {
+	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
 		DBG_LOW(" -> miss\n");
 		ret = -1;
 	} else {
@@ -357,11 +311,10 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 		hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
 			(newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C));
 	}
-err_out:
 	native_unlock_hpte(hptep);
 
 	/* Ensure it is out of the tlb too. */
-	tlbie(vpn, psize, actual_psize, ssize, local);
+	tlbie(vpn, bpsize, apsize, ssize, local);
 
 	return ret;
 }
@@ -402,7 +355,6 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
 static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 				       int psize, int ssize)
 {
-	int actual_psize;
 	unsigned long vpn;
 	unsigned long vsid;
 	long slot;
@@ -415,36 +367,33 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 	if (slot == -1)
 		panic("could not find page to bolt\n");
 	hptep = htab_address + slot;
-	actual_psize = hpte_actual_psize(hptep, psize);
-	if (actual_psize < 0)
-		actual_psize = psize;
 
 	/* Update the HPTE */
 	hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
 		(newpp & (HPTE_R_PP | HPTE_R_N));
-
-	/* Ensure it is out of the tlb too. */
-	tlbie(vpn, psize, actual_psize, ssize, 0);
+	/*
+	 * Ensure it is out of the tlb too. Bolted entries base and
+	 * actual page size will be same.
+	 */
+	tlbie(vpn, psize, psize, ssize, 0);
 }
 
 static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
-				   int psize, int ssize, int local)
+				   int bpsize, int apsize, int ssize, int local)
 {
 	struct hash_pte *hptep = htab_address + slot;
 	unsigned long hpte_v;
 	unsigned long want_v;
 	unsigned long flags;
-	int actual_psize;
 
 	local_irq_save(flags);
 
 	DBG_LOW("	invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
 
-	want_v = hpte_encode_avpn(vpn, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, bpsize, ssize);
 	native_lock_hpte(hptep);
 	hpte_v = hptep->v;
 
-	actual_psize = hpte_actual_psize(hptep, psize);
 	/*
 	 * We need to invalidate the TLB always because hpte_remove doesn't do
 	 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -452,23 +401,120 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	 * (hpte_remove) because we assume the old translation is still
 	 * technically "valid".
 	 */
-	if (actual_psize < 0) {
-		actual_psize = psize;
-		native_unlock_hpte(hptep);
-		goto err_out;
-	}
-	if (!HPTE_V_COMPARE(hpte_v, want_v))
+	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
 		native_unlock_hpte(hptep);
 	else
 		/* Invalidate the hpte. NOTE: this also unlocks it */
 		hptep->v = 0;
 
-err_out:
 	/* Invalidate the TLB */
-	tlbie(vpn, psize, actual_psize, ssize, local);
+	tlbie(vpn, bpsize, apsize, ssize, local);
+
+	local_irq_restore(flags);
+}
+
+static void native_hugepage_invalidate(struct mm_struct *mm,
+				       unsigned char *hpte_slot_array,
+				       unsigned long addr, int psize)
+{
+	int ssize = 0, i;
+	int lock_tlbie;
+	struct hash_pte *hptep;
+	int actual_psize = MMU_PAGE_16M;
+	unsigned int max_hpte_count, valid;
+	unsigned long flags, s_addr = addr;
+	unsigned long hpte_v, want_v, shift;
+	unsigned long hidx, vpn = 0, vsid, hash, slot;
+
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+	local_irq_save(flags);
+	for (i = 0; i < max_hpte_count; i++) {
+		valid = hpte_valid(hpte_slot_array, i);
+		if (!valid)
+			continue;
+		hidx = hpte_hash_index(hpte_slot_array, i);
+
+		/* get the vpn */
+		addr = s_addr + (i * (1ul << shift));
+		if (!is_kernel_addr(addr)) {
+			ssize = user_segment_size(addr);
+			vsid = get_vsid(mm->context.id, addr, ssize);
+			WARN_ON(vsid == 0);
+		} else {
+			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+			ssize = mmu_kernel_ssize;
+		}
+
+		vpn = hpt_vpn(addr, vsid, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		hptep = htab_address + slot;
+		want_v = hpte_encode_avpn(vpn, psize, ssize);
+		native_lock_hpte(hptep);
+		hpte_v = hptep->v;
+
+		/* Even if we miss, we need to invalidate the TLB */
+		if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+			native_unlock_hpte(hptep);
+		else
+			/* Invalidate the hpte. NOTE: this also unlocks it */
+			hptep->v = 0;
+	}
+	/*
+	 * Since this is a hugepage, we just need a single tlbie.
+	 * use the last vpn.
+	 */
+	lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+	if (lock_tlbie)
+		raw_spin_lock(&native_tlbie_lock);
+
+	asm volatile("ptesync":::"memory");
+	__tlbie(vpn, psize, actual_psize, ssize);
+	asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+	if (lock_tlbie)
+		raw_spin_unlock(&native_tlbie_lock);
+
 	local_irq_restore(flags);
 }
 
+static inline int __hpte_actual_psize(unsigned int lp, int psize)
+{
+	int i, shift;
+	unsigned int mask;
+
+	/* start from 1 ignoring MMU_PAGE_4K */
+	for (i = 1; i < MMU_PAGE_COUNT; i++) {
+
+		/* invalid penc */
+		if (mmu_psize_defs[psize].penc[i] == -1)
+			continue;
+		/*
+		 * encoding bits per actual page size
+		 *        PTE LP     actual page size
+		 *    rrrr rrrz		>=8KB
+		 *    rrrr rrzz		>=16KB
+		 *    rrrr rzzz		>=32KB
+		 *    rrrr zzzz		>=64KB
+		 *	.......
+		 */
+		shift = mmu_psize_defs[i].shift - LP_SHIFT;
+		if (shift > LP_BITS)
+			shift = LP_BITS;
+		mask = (1 << shift) - 1;
+		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+			return i;
+	}
+	return -1;
+}
+
 static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 			int *psize, int *apsize, int *ssize, unsigned long *vpn)
 {
@@ -514,6 +560,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 			seg_off |= vpi << shift;
 		}
 		*vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+		break;
 	case MMU_SEGSIZE_1T:
 		/* We only have 40 - 23 bits of seg_off in avpn */
 		seg_off = (avpn & 0x1ffff) << 23;
@@ -523,6 +570,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 			seg_off |= vpi << shift;
 		}
 		*vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+		break;
 	default:
 		*vpn = size = 0;
 	}
@@ -672,4 +720,5 @@ void __init hpte_init_native(void)
 	ppc_md.hpte_remove	= native_hpte_remove;
 	ppc_md.hpte_clear_all	= native_hpte_clear;
 	ppc_md.flush_hash_range = native_flush_hash_range;
+	ppc_md.hugepage_invalidate   = native_hugepage_invalidate;
 }
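
__hpte_actual_psize(), re-added above next to native_hugepage_invalidate(), recovers the actual page size from the LP field of the HPTE: for each candidate size it compares only as many low LP bits as that size actually encodes. A worked example, assuming the LP_SHIFT = 12 and LP_BITS = 8 values defined earlier in this file (those defines are not shown in the hunk, so treat the numbers as an assumption):

/* MMU_PAGE_64K: shift = 16 - LP_SHIFT = 4, mask = 0xf
 *	-> matches if (lp & 0xf) == mmu_psize_defs[psize].penc[MMU_PAGE_64K]
 * MMU_PAGE_16M: shift = 24 - LP_SHIFT = 12, clamped to LP_BITS = 8, mask = 0xff
 *	-> matches if (lp & 0xff) == mmu_psize_defs[psize].penc[MMU_PAGE_16M]
 */
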
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index e303a6d74e3a..6ecc38bd5b24 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -807,7 +807,7 @@ void __init early_init_mmu(void)
 }
 
 #ifdef CONFIG_SMP
-void __cpuinit early_init_mmu_secondary(void)
+void early_init_mmu_secondary(void)
 {
 	/* Initialize hash table for that CPU */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
@@ -1050,13 +1050,26 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		goto bail;
 	}
 
-#ifdef CONFIG_HUGETLB_PAGE
 	if (hugeshift) {
-		rc = __hash_page_huge(ea, access, vsid, ptep, trap, local,
-					ssize, hugeshift, psize);
+		if (pmd_trans_huge(*(pmd_t *)ptep))
+			rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+					     trap, local, ssize, psize);
+#ifdef CONFIG_HUGETLB_PAGE
+		else
+			rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+					      local, ssize, hugeshift, psize);
+#else
+		else {
+			/*
+			 * if we have hugeshift, and is not transhuge with
+			 * hugetlb disabled, something is really wrong.
+			 */
+			rc = 1;
+			WARN_ON(1);
+		}
+#endif
 		goto bail;
 	}
-#endif /* CONFIG_HUGETLB_PAGE */
 
 #ifndef CONFIG_PPC_64K_PAGES
 	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
@@ -1145,6 +1158,7 @@ EXPORT_SYMBOL_GPL(hash_page);
 void hash_preload(struct mm_struct *mm, unsigned long ea,
 		  unsigned long access, unsigned long trap)
 {
+	int hugepage_shift;
 	unsigned long vsid;
 	pgd_t *pgdir;
 	pte_t *ptep;
@@ -1166,10 +1180,27 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	pgdir = mm->pgd;
 	if (pgdir == NULL)
 		return;
-	ptep = find_linux_pte(pgdir, ea);
-	if (!ptep)
+
+	/* Get VSID */
+	ssize = user_segment_size(ea);
+	vsid = get_vsid(mm->context.id, ea, ssize);
+	if (!vsid)
 		return;
+	/*
+	 * Hash doesn't like irqs. Walking linux page table with irq disabled
+	 * saves us from holding multiple locks.
+	 */
+	local_irq_save(flags);
+
+	/*
+	 * THP pages use update_mmu_cache_pmd. We don't do
+	 * hash preload there. Hence can ignore THP here
+	 */
+	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
+	if (!ptep)
+		goto out_exit;
 
+	WARN_ON(hugepage_shift);
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
 	 * a 64K kernel), then we don't preload, hash_page() will take
@@ -1178,18 +1209,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	 * page size demotion here
 	 */
 	if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
-		return;
+		goto out_exit;
 #endif /* CONFIG_PPC_64K_PAGES */
 
-	/* Get VSID */
-	ssize = user_segment_size(ea);
-	vsid = get_vsid(mm->context.id, ea, ssize);
-	if (!vsid)
-		return;
-
-	/* Hash doesn't like irqs */
-	local_irq_save(flags);
-
 	/* Is that local to this CPU ? */
 	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
 		local = 1;
@@ -1211,7 +1233,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 				    mm->context.user_psize,
 				    mm->context.user_psize,
 				    pte_val(*ptep));
-
+out_exit:
 	local_irq_restore(flags);
 }
 
@@ -1232,7 +1254,11 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 		slot += hidx & _PTEIDX_GROUP_IX;
 		DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
-		ppc_md.hpte_invalidate(slot, vpn, psize, ssize, local);
+		/*
+		 * We use same base page size and actual psize, because we don't
+		 * use these functions for hugepage
+		 */
+		ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local);
 	} pte_iterate_hashed_end();
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -1365,7 +1391,8 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
 		hash = ~hash;
 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 	slot += hidx & _PTEIDX_GROUP_IX;
-	ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_kernel_ssize, 0);
+	ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize,
+			       mmu_kernel_ssize, 0);
 }
 
 void kernel_map_pages(struct page *page, int numpages, int enable)
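
The reworked hash_preload() above boils down to: compute the VSID first, then walk the Linux page table with interrupts disabled (so RCU-freed page-table pages cannot vanish mid-walk), and leave through a single out_exit label that restores interrupts. Condensed shape of that flow (sketch, not a verbatim copy of the function):

	local_irq_save(flags);
	/* THP faults preload via update_mmu_cache_pmd(), so no hugepage here */
	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
	if (!ptep)
		goto out_exit;
	WARN_ON(hugepage_shift);
	/* ... demote/hash the pte as before ... */
out_exit:
	local_irq_restore(flags);
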
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
new file mode 100644
index 000000000000..34de9e0cdc34
--- /dev/null
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -0,0 +1,175 @@
1/*
2 * Copyright IBM Corporation, 2013
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15/*
16 * PPC64 THP Support for hash based MMUs
17 */
18#include <linux/mm.h>
19#include <asm/machdep.h>
20
21int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
22 pmd_t *pmdp, unsigned long trap, int local, int ssize,
23 unsigned int psize)
24{
25 unsigned int index, valid;
26 unsigned char *hpte_slot_array;
27 unsigned long rflags, pa, hidx;
28 unsigned long old_pmd, new_pmd;
29 int ret, lpsize = MMU_PAGE_16M;
30 unsigned long vpn, hash, shift, slot;
31
32 /*
33 * atomically mark the linux large page PMD busy and dirty
34 */
35 do {
36 old_pmd = pmd_val(*pmdp);
37 /* If PMD busy, retry the access */
38 if (unlikely(old_pmd & _PAGE_BUSY))
39 return 0;
40 /* If PMD is trans splitting retry the access */
41 if (unlikely(old_pmd & _PAGE_SPLITTING))
42 return 0;
43 /* If PMD permissions don't match, take page fault */
44 if (unlikely(access & ~old_pmd))
45 return 1;
46 /*
47 * Try to lock the PTE, add ACCESSED and DIRTY if it was
48 * a write access
49 */
50 new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
51 if (access & _PAGE_RW)
52 new_pmd |= _PAGE_DIRTY;
53 } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
54 old_pmd, new_pmd));
55 /*
56 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
57 * need to add in 0x1 if it's a read-only user page
58 */
59 rflags = new_pmd & _PAGE_USER;
60 if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
61 (new_pmd & _PAGE_DIRTY)))
62 rflags |= 0x1;
63 /*
64 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
65 */
66 rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
67
68#if 0
69 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
70
71 /*
72 * No CPU has hugepages but lacks no execute, so we
73 * don't need to worry about that case
74 */
75 rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
76 }
77#endif
78 /*
79 * Find the slot index details for this ea, using base page size.
80 */
81 shift = mmu_psize_defs[psize].shift;
82 index = (ea & ~HPAGE_PMD_MASK) >> shift;
83 BUG_ON(index >= 4096);
84
85 vpn = hpt_vpn(ea, vsid, ssize);
86 hash = hpt_hash(vpn, shift, ssize);
87 hpte_slot_array = get_hpte_slot_array(pmdp);
88
89 valid = hpte_valid(hpte_slot_array, index);
90 if (valid) {
91 /* update the hpte bits */
92 hidx = hpte_hash_index(hpte_slot_array, index);
93 if (hidx & _PTEIDX_SECONDARY)
94 hash = ~hash;
95 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
96 slot += hidx & _PTEIDX_GROUP_IX;
97
98 ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
99 psize, lpsize, ssize, local);
100 /*
101 * We failed to update, try to insert a new entry.
102 */
103 if (ret == -1) {
104 /*
105 * large pte is marked busy, so we can be sure
106 * nobody is looking at hpte_slot_array. hence we can
107 * safely update this here.
108 */
109 valid = 0;
110 new_pmd &= ~_PAGE_HPTEFLAGS;
111 hpte_slot_array[index] = 0;
112 } else
113 /* clear the busy bits and set the hash pte bits */
114 new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
115 }
116
117 if (!valid) {
118 unsigned long hpte_group;
119
120 /* insert new entry */
121 pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
122repeat:
123 hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
124
125 /* clear the busy bits and set the hash pte bits */
126 new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
127
128 /* Add in WIMG bits */
129 rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
130 _PAGE_COHERENT | _PAGE_GUARDED));
131
132 /* Insert into the hash table, primary slot */
133 slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
134 psize, lpsize, ssize);
135 /*
136 * Primary is full, try the secondary
137 */
138 if (unlikely(slot == -1)) {
139 hpte_group = ((~hash & htab_hash_mask) *
140 HPTES_PER_GROUP) & ~0x7UL;
141 slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
142 rflags, HPTE_V_SECONDARY,
143 psize, lpsize, ssize);
144 if (slot == -1) {
145 if (mftb() & 0x1)
146 hpte_group = ((hash & htab_hash_mask) *
147 HPTES_PER_GROUP) & ~0x7UL;
148
149 ppc_md.hpte_remove(hpte_group);
150 goto repeat;
151 }
152 }
153 /*
154 * Hypervisor failure. Restore old pmd and return -1
155 * similar to __hash_page_*
156 */
157 if (unlikely(slot == -2)) {
158 *pmdp = __pmd(old_pmd);
159 hash_failure_debug(ea, access, vsid, trap, ssize,
160 psize, lpsize, old_pmd);
161 return -1;
162 }
163 /*
164 * large pte is marked busy, so we can be sure
165 * nobody is looking at hpte_slot_array. hence we can
166 * safely update this here.
167 */
168 mark_hpte_slot_valid(hpte_slot_array, index, slot);
169 }
170 /*
171 * No need to use ldarx/stdcx here
172 */
173 *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
174 return 0;
175}
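
__hash_page_thp() above serializes against other users of the same 16M PMD by taking a software lock in the PMD itself: a cmpxchg loop sets _PAGE_BUSY (plus ACCESSED, and DIRTY on a write) before the per-PMD hpte_slot_array is touched, and backs off if the PMD is already busy or splitting. Stripped to that core step (sketch of the loop above):

	do {
		old_pmd = pmd_val(*pmdp);
		if (old_pmd & (_PAGE_BUSY | _PAGE_SPLITTING))
			return 0;		/* owned elsewhere: retry the access */
		if (access & ~old_pmd)
			return 1;		/* permissions don't match: take a page fault */
		new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
		if (access & _PAGE_RW)
			new_pmd |= _PAGE_DIRTY;
	} while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp, old_pmd, new_pmd));
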
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 0f1d94a1fb82..0b7fb6761015 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -81,7 +81,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		slot += (old_pte & _PAGE_F_GIX) >> 12;
 
 		if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
-					 ssize, local) == -1)
+					 mmu_psize, ssize, local) == -1)
 			old_pte &= ~_PAGE_HPTEFLAGS;
 	}
 
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 77fdd2cef33b..834ca8eb38f2 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -21,6 +21,9 @@
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/setup.h>
+#include <asm/hugetlb.h>
+
+#ifdef CONFIG_HUGETLB_PAGE
 
 #define PAGE_SHIFT_64K	16
 #define PAGE_SHIFT_16M	24
@@ -100,68 +103,9 @@ int pgd_huge(pgd_t pgd)
100} 103}
101#endif 104#endif
102 105
103/*
104 * We have 4 cases for pgds and pmds:
105 * (1) invalid (all zeroes)
106 * (2) pointer to next table, as normal; bottom 6 bits == 0
107 * (3) leaf pte for huge page, bottom two bits != 00
108 * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
109 */
110pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
111{
112 pgd_t *pg;
113 pud_t *pu;
114 pmd_t *pm;
115 pte_t *ret_pte;
116 hugepd_t *hpdp = NULL;
117 unsigned pdshift = PGDIR_SHIFT;
118
119 if (shift)
120 *shift = 0;
121
122 pg = pgdir + pgd_index(ea);
123
124 if (pgd_huge(*pg)) {
125 ret_pte = (pte_t *) pg;
126 goto out;
127 } else if (is_hugepd(pg))
128 hpdp = (hugepd_t *)pg;
129 else if (!pgd_none(*pg)) {
130 pdshift = PUD_SHIFT;
131 pu = pud_offset(pg, ea);
132
133 if (pud_huge(*pu)) {
134 ret_pte = (pte_t *) pu;
135 goto out;
136 } else if (is_hugepd(pu))
137 hpdp = (hugepd_t *)pu;
138 else if (!pud_none(*pu)) {
139 pdshift = PMD_SHIFT;
140 pm = pmd_offset(pu, ea);
141
142 if (pmd_huge(*pm)) {
143 ret_pte = (pte_t *) pm;
144 goto out;
145 } else if (is_hugepd(pm))
146 hpdp = (hugepd_t *)pm;
147 else if (!pmd_none(*pm))
148 return pte_offset_kernel(pm, ea);
149 }
150 }
151 if (!hpdp)
152 return NULL;
153
154 ret_pte = hugepte_offset(hpdp, ea, pdshift);
155 pdshift = hugepd_shift(*hpdp);
156out:
157 if (shift)
158 *shift = pdshift;
159 return ret_pte;
160}
161EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
162
163pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 106pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
164{ 107{
108 /* Only called for hugetlbfs pages, hence can ignore THP */
165 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); 109 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
166} 110}
167 111
@@ -357,7 +301,7 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 int alloc_bootmem_huge_page(struct hstate *hstate)
 {
 	struct huge_bootmem_page *m;
-	int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
+	int idx = shift_to_mmu_psize(huge_page_shift(hstate));
 	int nr_gpages = gpage_freearray[idx].nr_gpages;
 
 	if (nr_gpages == 0)
@@ -736,11 +680,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 	struct page *page;
 	unsigned shift;
 	unsigned long mask;
-
+	/*
+	 * Transparent hugepages are handled by generic code. We can skip them
+	 * here.
+	 */
 	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 
 	/* Verify it is a huge page else bail. */
-	if (!ptep || !shift)
+	if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
 		return ERR_PTR(-EINVAL);
 
 	mask = (1UL << shift) - 1;
@@ -759,69 +706,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
759 return NULL; 706 return NULL;
760} 707}
761 708
762int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
763 unsigned long end, int write, struct page **pages, int *nr)
764{
765 unsigned long mask;
766 unsigned long pte_end;
767 struct page *head, *page, *tail;
768 pte_t pte;
769 int refs;
770
771 pte_end = (addr + sz) & ~(sz-1);
772 if (pte_end < end)
773 end = pte_end;
774
775 pte = *ptep;
776 mask = _PAGE_PRESENT | _PAGE_USER;
777 if (write)
778 mask |= _PAGE_RW;
779
780 if ((pte_val(pte) & mask) != mask)
781 return 0;
782
783 /* hugepages are never "special" */
784 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
785
786 refs = 0;
787 head = pte_page(pte);
788
789 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
790 tail = page;
791 do {
792 VM_BUG_ON(compound_head(page) != head);
793 pages[*nr] = page;
794 (*nr)++;
795 page++;
796 refs++;
797 } while (addr += PAGE_SIZE, addr != end);
798
799 if (!page_cache_add_speculative(head, refs)) {
800 *nr -= refs;
801 return 0;
802 }
803
804 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
805 /* Could be optimized better */
806 *nr -= refs;
807 while (refs--)
808 put_page(head);
809 return 0;
810 }
811
812 /*
813 * Any tail page need their mapcount reference taken before we
814 * return.
815 */
816 while (refs--) {
817 if (PageTail(tail))
818 get_huge_page_tail(tail);
819 tail++;
820 }
821
822 return 1;
823}
824
825static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, 709static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
826 unsigned long sz) 710 unsigned long sz)
827{ 711{
@@ -1038,3 +922,168 @@ void flush_dcache_icache_hugepage(struct page *page)
1038 } 922 }
1039 } 923 }
1040} 924}
925
926#endif /* CONFIG_HUGETLB_PAGE */
927
928/*
929 * We have 4 cases for pgds and pmds:
930 * (1) invalid (all zeroes)
931 * (2) pointer to next table, as normal; bottom 6 bits == 0
932 * (3) leaf pte for huge page, bottom two bits != 00
933 * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
934 *
935 * So long as we atomically load page table pointers we are safe against teardown,
936 * we can follow the address down to the the page and take a ref on it.
937 */
938
939pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
940{
941 pgd_t pgd, *pgdp;
942 pud_t pud, *pudp;
943 pmd_t pmd, *pmdp;
944 pte_t *ret_pte;
945 hugepd_t *hpdp = NULL;
946 unsigned pdshift = PGDIR_SHIFT;
947
948 if (shift)
949 *shift = 0;
950
951 pgdp = pgdir + pgd_index(ea);
952 pgd = ACCESS_ONCE(*pgdp);
953 /*
954 * Always operate on the local stack value. This make sure the
955 * value don't get updated by a parallel THP split/collapse,
956 * page fault or a page unmap. The return pte_t * is still not
957 * stable. So should be checked there for above conditions.
958 */
959 if (pgd_none(pgd))
960 return NULL;
961 else if (pgd_huge(pgd)) {
962 ret_pte = (pte_t *) pgdp;
963 goto out;
964 } else if (is_hugepd(&pgd))
965 hpdp = (hugepd_t *)&pgd;
966 else {
967 /*
968 * Even if we end up with an unmap, the pgtable will not
969 * be freed, because we do an rcu free and here we are
970 * irq disabled
971 */
972 pdshift = PUD_SHIFT;
973 pudp = pud_offset(&pgd, ea);
974 pud = ACCESS_ONCE(*pudp);
975
976 if (pud_none(pud))
977 return NULL;
978 else if (pud_huge(pud)) {
979 ret_pte = (pte_t *) pudp;
980 goto out;
981 } else if (is_hugepd(&pud))
982 hpdp = (hugepd_t *)&pud;
983 else {
984 pdshift = PMD_SHIFT;
985 pmdp = pmd_offset(&pud, ea);
986 pmd = ACCESS_ONCE(*pmdp);
987 /*
988 * A hugepage collapse is captured by pmd_none, because
989 * it mark the pmd none and do a hpte invalidate.
990 *
991 * A hugepage split is captured by pmd_trans_splitting
992 * because we mark the pmd trans splitting and do a
993 * hpte invalidate
994 *
995 */
996 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
997 return NULL;
998
999 if (pmd_huge(pmd) || pmd_large(pmd)) {
1000 ret_pte = (pte_t *) pmdp;
1001 goto out;
1002 } else if (is_hugepd(&pmd))
1003 hpdp = (hugepd_t *)&pmd;
1004 else
1005 return pte_offset_kernel(&pmd, ea);
1006 }
1007 }
1008 if (!hpdp)
1009 return NULL;
1010
1011 ret_pte = hugepte_offset(hpdp, ea, pdshift);
1012 pdshift = hugepd_shift(*hpdp);
1013out:
1014 if (shift)
1015 *shift = pdshift;
1016 return ret_pte;
1017}
1018EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
1019
1020int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
1021 unsigned long end, int write, struct page **pages, int *nr)
1022{
1023 unsigned long mask;
1024 unsigned long pte_end;
1025 struct page *head, *page, *tail;
1026 pte_t pte;
1027 int refs;
1028
1029 pte_end = (addr + sz) & ~(sz-1);
1030 if (pte_end < end)
1031 end = pte_end;
1032
1033 pte = ACCESS_ONCE(*ptep);
1034 mask = _PAGE_PRESENT | _PAGE_USER;
1035 if (write)
1036 mask |= _PAGE_RW;
1037
1038 if ((pte_val(pte) & mask) != mask)
1039 return 0;
1040
1041#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1042 /*
1043 * check for splitting here
1044 */
1045 if (pmd_trans_splitting(pte_pmd(pte)))
1046 return 0;
1047#endif
1048
1049 /* hugepages are never "special" */
1050 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1051
1052 refs = 0;
1053 head = pte_page(pte);
1054
1055 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
1056 tail = page;
1057 do {
1058 VM_BUG_ON(compound_head(page) != head);
1059 pages[*nr] = page;
1060 (*nr)++;
1061 page++;
1062 refs++;
1063 } while (addr += PAGE_SIZE, addr != end);
1064
1065 if (!page_cache_add_speculative(head, refs)) {
1066 *nr -= refs;
1067 return 0;
1068 }
1069
1070 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1071 /* Could be optimized better */
1072 *nr -= refs;
1073 while (refs--)
1074 put_page(head);
1075 return 0;
1076 }
1077
1078 /*
1079 * Any tail page need their mapcount reference taken before we
1080 * return.
1081 */
1082 while (refs--) {
1083 if (PageTail(tail))
1084 get_huge_page_tail(tail);
1085 tail++;
1086 }
1087
1088 return 1;
1089}
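
find_linux_pte_or_hugepte() now lives outside the CONFIG_HUGETLB_PAGE block and decides, at every level, which of the four cases from its comment it is looking at, always on an ACCESS_ONCE() copy of the entry so a parallel THP split or collapse cannot change the value mid-test. Compressed to one level (sketch; the full three-level walk is the function above):

	pgd = ACCESS_ONCE(*pgdp);		/* operate on the local copy only */
	if (pgd_none(pgd))
		return NULL;			/* case (1): invalid */
	else if (pgd_huge(pgd))
		ret_pte = (pte_t *)pgdp;	/* case (3): leaf pte for a huge page */
	else if (is_hugepd(&pgd))
		hpdp = (hugepd_t *)&pgd;	/* case (4): hugepd pointer */
	else
		pudp = pud_offset(&pgd, ea);	/* case (2): next-level table */
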
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a90b9c458990..d0cd9e4c6837 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -88,7 +88,11 @@ static void pgd_ctor(void *addr)
 
 static void pmd_ctor(void *addr)
 {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	memset(addr, 0, PMD_TABLE_SIZE * 2);
+#else
 	memset(addr, 0, PMD_TABLE_SIZE);
+#endif
 }
 
 struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
@@ -137,10 +141,9 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
 void pgtable_cache_init(void)
 {
 	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
-	pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
-	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+	pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
 		panic("Couldn't allocate pgtable caches");
-
 	/* In all current configs, when the PUD index exists it's the
 	 * same size as either the pgd or pmd index.  Verify that the
 	 * initialization above has also created a PUD cache.  This
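
pmd_ctor() clears twice PMD_TABLE_SIZE under CONFIG_TRANSPARENT_HUGEPAGE because each PMD page now carries extra space behind the hardware entries; the THP hash code added in this series (get_hpte_slot_array() in hugepage-hash64.c) keeps its per-16M-page HPTE slot bookkeeping there, and PMD_CACHE_INDEX selects the correspondingly larger kmem cache. A hedged sketch of how such a header definition could look (the real definition lives in the pgtable headers, not in this hunk):

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define PMD_CACHE_INDEX	(PMD_INDEX_SIZE + 1)	/* assumed: one extra step doubles the object */
#else
#define PMD_CACHE_INDEX	PMD_INDEX_SIZE
#endif
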
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0988a26e0413..7f4bea162026 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -299,47 +299,13 @@ void __init paging_init(void)
 
 void __init mem_init(void)
 {
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-	int nid;
-#endif
-	pg_data_t *pgdat;
-	unsigned long i;
-	struct page *page;
-	unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
-
 #ifdef CONFIG_SWIOTLB
 	swiotlb_init(0);
 #endif
 
-	num_physpages = memblock_phys_mem_size() >> PAGE_SHIFT;
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
-
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-	for_each_online_node(nid) {
-		if (NODE_DATA(nid)->node_spanned_pages != 0) {
-			printk("freeing bootmem node %d\n", nid);
-			totalram_pages +=
-				free_all_bootmem_node(NODE_DATA(nid));
-		}
-	}
-#else
-	max_mapnr = max_pfn;
-	totalram_pages += free_all_bootmem();
-#endif
-	for_each_online_pgdat(pgdat) {
-		for (i = 0; i < pgdat->node_spanned_pages; i++) {
-			if (!pfn_valid(pgdat->node_start_pfn + i))
-				continue;
-			page = pgdat_page_nr(pgdat, i);
-			if (PageReserved(page))
-				reservedpages++;
-		}
-	}
-
-	codesize = (unsigned long)&_sdata - (unsigned long)&_stext;
-	datasize = (unsigned long)&_edata - (unsigned long)&_sdata;
-	initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
-	bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
+	set_max_mapnr(max_pfn);
+	free_all_bootmem();
 
 #ifdef CONFIG_HIGHMEM
 	{
@@ -349,13 +315,9 @@ void __init mem_init(void)
 		for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
 			phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
 			struct page *page = pfn_to_page(pfn);
-			if (memblock_is_reserved(paddr))
-				continue;
-			free_highmem_page(page);
-			reservedpages--;
+			if (!memblock_is_reserved(paddr))
+				free_highmem_page(page);
 		}
-		printk(KERN_DEBUG "High memory: %luk\n",
-		       totalhigh_pages << (PAGE_SHIFT-10));
 	}
 #endif /* CONFIG_HIGHMEM */
 
@@ -368,16 +330,7 @@ void __init mem_init(void)
 		(mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
 #endif
 
-	printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
-	       "%luk reserved, %luk data, %luk bss, %luk init)\n",
-	       nr_free_pages() << (PAGE_SHIFT-10),
-	       num_physpages << (PAGE_SHIFT-10),
-	       codesize >> 10,
-	       reservedpages << (PAGE_SHIFT-10),
-	       datasize >> 10,
-	       bsssize >> 10,
-	       initsize >> 10);
-
+	mem_init_print_info(NULL);
 #ifdef CONFIG_PPC32
 	pr_info("Kernel virtual memory layout:\n");
 	pr_info("  * 0x%08lx..0x%08lx  : fixmap\n", FIXADDR_START, FIXADDR_TOP);
@@ -407,7 +360,7 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
 
@@ -508,6 +461,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 			  pte_t *ptep)
 {
 #ifdef CONFIG_PPC_STD_MMU
+	/*
+	 * We don't need to worry about _PAGE_PRESENT here because we are
+	 * called with either mm->page_table_lock held or ptl lock held
+	 */
 	unsigned long access = 0, trap;
 
 	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
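
With the hand-rolled accounting gone, mem_init() leans on the generic mm helpers: free_all_bootmem() and free_highmem_page() feed the page counters, and mem_init_print_info() prints the boot banner. After the hunks above the function is roughly (condensed sketch, PPC32/Book3E details omitted):

void __init mem_init(void)
{
#ifdef CONFIG_SWIOTLB
	swiotlb_init(0);
#endif
	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
	set_max_mapnr(max_pfn);
	free_all_bootmem();
#ifdef CONFIG_HIGHMEM
	/* hand every non-reserved highmem page to the buddy allocator */
#endif
	mem_init_print_info(NULL);
}
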
diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap.c
index 67a42ed0d2fc..cb8bdbe4972f 100644
--- a/arch/powerpc/mm/mmap_64.c
+++ b/arch/powerpc/mm/mmap.c
@@ -92,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE;
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index e779642c25e5..af3d78e19302 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -112,8 +112,10 @@ static unsigned int steal_context_smp(unsigned int id)
 	 */
 	for_each_cpu(cpu, mm_cpumask(mm)) {
 		for (i = cpu_first_thread_sibling(cpu);
-		     i <= cpu_last_thread_sibling(cpu); i++)
-			__set_bit(id, stale_map[i]);
+		     i <= cpu_last_thread_sibling(cpu); i++) {
+			if (stale_map[i])
+				__set_bit(id, stale_map[i]);
+		}
 		cpu = i - 1;
 	}
 	return id;
@@ -272,7 +274,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
 		for (i = cpu_first_thread_sibling(cpu);
 		     i <= cpu_last_thread_sibling(cpu); i++) {
-			__clear_bit(id, stale_map[i]);
+			if (stale_map[i])
+				__clear_bit(id, stale_map[i]);
 		}
 	}
 
@@ -329,8 +332,8 @@ void destroy_context(struct mm_struct *mm)
 
 #ifdef CONFIG_SMP
 
-static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
-					    unsigned long action, void *hcpu)
+static int mmu_context_cpu_notify(struct notifier_block *self,
+				  unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned int)(long)hcpu;
 
@@ -363,7 +366,7 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata mmu_context_cpu_nb = {
+static struct notifier_block mmu_context_cpu_nb = {
 	.notifier_call	= mmu_context_cpu_notify,
 };
 
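
Both stale-map hunks above add the same guard: stale_map[i] is only dereferenced if the per-CPU map has actually been allocated, which (an assumption about the motivation, not stated in the hunk) covers sibling threads that have not been brought up yet. The guarded form:

	for (i = cpu_first_thread_sibling(cpu);
	     i <= cpu_last_thread_sibling(cpu); i++) {
		if (stale_map[i])
			__set_bit(id, stale_map[i]);
	}
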
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 88c0425dc0a8..5850798826cd 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -27,6 +27,7 @@
 #include <linux/seq_file.h>
 #include <linux/uaccess.h>
 #include <linux/slab.h>
+#include <asm/cputhreads.h>
 #include <asm/sparsemem.h>
 #include <asm/prom.h>
 #include <asm/smp.h>
@@ -516,7 +517,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
  * Figure out to which domain a cpu belongs and stick it there.
  * Return the id of the domain used.
  */
-static int __cpuinit numa_setup_cpu(unsigned long lcpu)
+static int numa_setup_cpu(unsigned long lcpu)
 {
 	int nid = 0;
 	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
@@ -538,8 +539,7 @@ out:
 	return nid;
 }
 
-static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
-			     unsigned long action,
+static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
 			     void *hcpu)
 {
 	unsigned long lcpu = (unsigned long)hcpu;
@@ -919,7 +919,7 @@ static void __init *careful_zallocation(int nid, unsigned long size,
 	return ret;
 }
 
-static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+static struct notifier_block ppc64_numa_nb = {
 	.notifier_call = cpu_numa_callback,
 	.priority = 1 /* Must run before sched domains notifier. */
 };
@@ -1319,7 +1319,8 @@ static int update_cpu_associativity_changes_mask(void)
 			}
 		}
 		if (changed) {
-			cpumask_set_cpu(cpu, changes);
+			cpumask_or(changes, changes, cpu_sibling_mask(cpu));
+			cpu = cpu_last_thread_sibling(cpu);
 		}
 	}
 
@@ -1427,17 +1428,15 @@ static int update_cpu_topology(void *data)
 	if (!data)
 		return -EINVAL;
 
-	cpu = get_cpu();
+	cpu = smp_processor_id();
 
 	for (update = data; update; update = update->next) {
 		if (cpu != update->cpu)
 			continue;
 
-		unregister_cpu_under_node(update->cpu, update->old_nid);
 		unmap_cpu_from_node(update->cpu);
 		map_cpu_to_node(update->cpu, update->new_nid);
 		vdso_getcpu_init();
-		register_cpu_under_node(update->cpu, update->new_nid);
 	}
 
 	return 0;
@@ -1449,12 +1448,12 @@ static int update_cpu_topology(void *data)
  */
 int arch_update_cpu_topology(void)
 {
-	unsigned int cpu, changed = 0;
+	unsigned int cpu, sibling, changed = 0;
 	struct topology_update_data *updates, *ud;
 	unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
 	cpumask_t updated_cpus;
 	struct device *dev;
-	int weight, i = 0;
+	int weight, new_nid, i = 0;
 
 	weight = cpumask_weight(&cpu_associativity_changes_mask);
 	if (!weight)
@@ -1467,24 +1466,54 @@ int arch_update_cpu_topology(void)
 	cpumask_clear(&updated_cpus);
 
 	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
-		ud = &updates[i++];
-		ud->cpu = cpu;
-		vphn_get_associativity(cpu, associativity);
-		ud->new_nid = associativity_to_nid(associativity);
-
-		if (ud->new_nid < 0 || !node_online(ud->new_nid))
-			ud->new_nid = first_online_node;
+		/*
+		 * If siblings aren't flagged for changes, updates list
+		 * will be too short. Skip on this update and set for next
+		 * update.
+		 */
+		if (!cpumask_subset(cpu_sibling_mask(cpu),
+					&cpu_associativity_changes_mask)) {
+			pr_info("Sibling bits not set for associativity "
+					"change, cpu%d\n", cpu);
+			cpumask_or(&cpu_associativity_changes_mask,
+					&cpu_associativity_changes_mask,
+					cpu_sibling_mask(cpu));
+			cpu = cpu_last_thread_sibling(cpu);
+			continue;
+		}
 
-		ud->old_nid = numa_cpu_lookup_table[cpu];
-		cpumask_set_cpu(cpu, &updated_cpus);
+		/* Use associativity from first thread for all siblings */
+		vphn_get_associativity(cpu, associativity);
+		new_nid = associativity_to_nid(associativity);
+		if (new_nid < 0 || !node_online(new_nid))
+			new_nid = first_online_node;
+
+		if (new_nid == numa_cpu_lookup_table[cpu]) {
+			cpumask_andnot(&cpu_associativity_changes_mask,
+					&cpu_associativity_changes_mask,
+					cpu_sibling_mask(cpu));
+			cpu = cpu_last_thread_sibling(cpu);
+			continue;
+		}
 
-		if (i < weight)
-			ud->next = &updates[i];
+		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+			ud = &updates[i++];
+			ud->cpu = sibling;
+			ud->new_nid = new_nid;
+			ud->old_nid = numa_cpu_lookup_table[sibling];
+			cpumask_set_cpu(sibling, &updated_cpus);
+			if (i < weight)
+				ud->next = &updates[i];
+		}
+		cpu = cpu_last_thread_sibling(cpu);
 	}
1484 1510
1485 stop_machine(update_cpu_topology, &updates[0], &updated_cpus); 1511 stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
1486 1512
1487 for (ud = &updates[0]; ud; ud = ud->next) { 1513 for (ud = &updates[0]; ud; ud = ud->next) {
1514 unregister_cpu_under_node(ud->cpu, ud->old_nid);
1515 register_cpu_under_node(ud->cpu, ud->new_nid);
1516
1488 dev = get_cpu_device(ud->cpu); 1517 dev = get_cpu_device(ud->cpu);
1489 if (dev) 1518 if (dev)
1490 kobject_uevent(&dev->kobj, KOBJ_CHANGE); 1519 kobject_uevent(&dev->kobj, KOBJ_CHANGE);
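The reworked arch_update_cpu_topology() above builds one topology_update_data entry per thread of a core and chains the entries through ->next, so stop_machine() and the node re-registration loop afterwards can walk a single list. Below is a minimal userspace sketch of just that list-building pattern; the struct layout, the fixed two-thread sibling grouping, the node numbers and the helper values are illustrative assumptions, not the kernel's definitions.

/*
 * Toy model of the per-sibling update list built in
 * arch_update_cpu_topology().  All values below are made up.
 */
#include <stdio.h>

#define NR_CPUS 8
#define THREADS_PER_CORE 2              /* assumed sibling layout */

struct topo_update {
	int cpu;
	int old_nid;
	int new_nid;
	struct topo_update *next;
};

int main(void)
{
	static int cpu_to_nid[NR_CPUS];            /* stands in for numa_cpu_lookup_table */
	static struct topo_update updates[NR_CPUS];
	int flagged_cpus = 4;                      /* pretend cpus 0-3 changed */
	int weight = flagged_cpus;                 /* number of update entries needed */
	int i = 0;

	for (int cpu = 0; cpu < flagged_cpus; cpu += THREADS_PER_CORE) {
		int new_nid = 1;                   /* pretend VPHN reported node 1 */

		/* every thread of the core gets its own entry, chained via ->next */
		for (int sibling = cpu; sibling < cpu + THREADS_PER_CORE; sibling++) {
			struct topo_update *ud = &updates[i++];

			ud->cpu = sibling;
			ud->new_nid = new_nid;
			ud->old_nid = cpu_to_nid[sibling];
			if (i < weight)
				ud->next = &updates[i];
		}
	}

	for (struct topo_update *ud = &updates[0]; ud; ud = ud->next)
		printf("cpu %d: node %d -> %d\n", ud->cpu, ud->old_nid, ud->new_nid);
	return 0;
}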
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 214130a4edc6..edda589795c3 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -235,6 +235,14 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
235 pud = pud_offset(pgd, addr); 235 pud = pud_offset(pgd, addr);
236 BUG_ON(pud_none(*pud)); 236 BUG_ON(pud_none(*pud));
237 pmd = pmd_offset(pud, addr); 237 pmd = pmd_offset(pud, addr);
238 /*
239 * For khugepaged to collapse normal pages to a hugepage, it first sets
240 * the pmd to none to force page fault/gup to take mmap_sem. After the
241 * pmd is set to none, it does a pte_clear which runs this assertion,
242 * so if we find the pmd none, return.
243 */
244 if (pmd_none(*pmd))
245 return;
238 BUG_ON(!pmd_present(*pmd)); 246 BUG_ON(!pmd_present(*pmd));
239 assert_spin_locked(pte_lockptr(mm, pmd)); 247 assert_spin_locked(pte_lockptr(mm, pmd));
240} 248}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index a854096e1023..536eec72c0f7 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -338,6 +338,19 @@ EXPORT_SYMBOL(iounmap);
338EXPORT_SYMBOL(__iounmap); 338EXPORT_SYMBOL(__iounmap);
339EXPORT_SYMBOL(__iounmap_at); 339EXPORT_SYMBOL(__iounmap_at);
340 340
341/*
342 * For a hugepage we have the pfn in the pmd; we use PTE_RPN_SHIFT bits for flags.
343 * For a PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
344 */
345struct page *pmd_page(pmd_t pmd)
346{
347#ifdef CONFIG_TRANSPARENT_HUGEPAGE
348 if (pmd_trans_huge(pmd))
349 return pfn_to_page(pmd_pfn(pmd));
350#endif
351 return virt_to_page(pmd_page_vaddr(pmd));
352}
353
341#ifdef CONFIG_PPC_64K_PAGES 354#ifdef CONFIG_PPC_64K_PAGES
342static pte_t *get_from_cache(struct mm_struct *mm) 355static pte_t *get_from_cache(struct mm_struct *mm)
343{ 356{
@@ -455,3 +468,404 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
455} 468}
456#endif 469#endif
457#endif /* CONFIG_PPC_64K_PAGES */ 470#endif /* CONFIG_PPC_64K_PAGES */
471
472#ifdef CONFIG_TRANSPARENT_HUGEPAGE
473
474/*
475 * This is called when relaxing access to a hugepage. It's also called in the
476 * page fault path when we don't hit any of the major fault cases, i.e. a minor
477 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will have
478 * handled those two for us; we additionally deal with missing execute
479 * permission here on some processors.
480 */
481int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
482 pmd_t *pmdp, pmd_t entry, int dirty)
483{
484 int changed;
485#ifdef CONFIG_DEBUG_VM
486 WARN_ON(!pmd_trans_huge(*pmdp));
487 assert_spin_locked(&vma->vm_mm->page_table_lock);
488#endif
489 changed = !pmd_same(*(pmdp), entry);
490 if (changed) {
491 __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
492 /*
493 * Since we are not supporting SW TLB systems, we don't
494 * have anything similar to flush_tlb_page_nohash().
495 */
496 }
497 return changed;
498}
499
500unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
501 pmd_t *pmdp, unsigned long clr)
502{
503
504 unsigned long old, tmp;
505
506#ifdef CONFIG_DEBUG_VM
507 WARN_ON(!pmd_trans_huge(*pmdp));
508 assert_spin_locked(&mm->page_table_lock);
509#endif
510
511#ifdef PTE_ATOMIC_UPDATES
512 __asm__ __volatile__(
513 "1: ldarx %0,0,%3\n\
514 andi. %1,%0,%6\n\
515 bne- 1b \n\
516 andc %1,%0,%4 \n\
517 stdcx. %1,0,%3 \n\
518 bne- 1b"
519 : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
520 : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
521 : "cc" );
522#else
523 old = pmd_val(*pmdp);
524 *pmdp = __pmd(old & ~clr);
525#endif
526 if (old & _PAGE_HASHPTE)
527 hpte_do_hugepage_flush(mm, addr, pmdp);
528 return old;
529}
530
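The inline ldarx/stdcx. loop in pmd_hugepage_update() spins while _PAGE_BUSY is set, then atomically clears the requested bits and returns the previous pmd value. Here is a portable userspace sketch of that read-modify-write pattern using compiler atomics; the BUSY_BIT value and the function name are assumptions for illustration, and the hash flush that follows in the kernel is omitted.

/*
 * Sketch of the PTE_ATOMIC_UPDATES loop: spin while the busy bit is set,
 * then atomically clear the requested bits and hand back the old value.
 */
#include <stdint.h>
#include <stdio.h>

#define BUSY_BIT 0x800UL        /* stands in for _PAGE_BUSY, value made up */

static unsigned long pmd_update(unsigned long *pmdp, unsigned long clr)
{
	unsigned long old;

	for (;;) {
		old = __atomic_load_n(pmdp, __ATOMIC_RELAXED);
		if (old & BUSY_BIT)
			continue;       /* hash code owns the entry, retry */
		if (__atomic_compare_exchange_n(pmdp, &old, old & ~clr,
						1 /* weak */,
						__ATOMIC_ACQ_REL,
						__ATOMIC_RELAXED))
			return old;     /* bits cleared, report previous value */
	}
}

int main(void)
{
	unsigned long pmd = 0x12340007UL;       /* toy entry with low flags set */
	unsigned long old = pmd_update(&pmd, 0x4UL);

	printf("old=%#lx new=%#lx\n", old, pmd);
	return 0;
}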
531pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
532 pmd_t *pmdp)
533{
534 pmd_t pmd;
535
536 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
537 if (pmd_trans_huge(*pmdp)) {
538 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
539 } else {
540 /*
541 * khugepaged calls this for normal pmd
542 */
543 pmd = *pmdp;
544 pmd_clear(pmdp);
545 /*
546 * Wait for all pending hash_page to finish. This is needed
547 * in case of subpage collapse. When we collapse normal pages
548 * to a hugepage, we first clear the pmd, then invalidate all
549 * the PTE entries. The assumption here is that any low-level
550 * page fault will see a none pmd and take the slow path that
551 * will wait on mmap_sem. But we could very well be in a
552 * hash_page with a local ptep pointer value. Such a hash_page
553 * can result in adding new HPTE entries for normal subpages.
554 * That means we could be modifying the page content as we
555 * copy them to a huge page. So wait for parallel hash_page
556 * to finish before invalidating HPTE entries. We can do this
557 * by sending an IPI to all the cpus and executing a dummy
558 * function there.
559 */
560 kick_all_cpus_sync();
561 /*
562 * Now invalidate the hpte entries in the range
563 * covered by the pmd. This makes sure we take a
564 * fault and find the pmd as none, which results
565 * in a major fault that takes mmap_sem and hence
566 * waits for collapse to complete. Without this,
567 * __collapse_huge_page_copy can end up copying
568 * the old content.
569 */
570 flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
571 }
572 return pmd;
573}
574
575int pmdp_test_and_clear_young(struct vm_area_struct *vma,
576 unsigned long address, pmd_t *pmdp)
577{
578 return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
579}
580
581/*
582 * We currently remove entries from the hashtable regardless of whether
583 * the entry was young or dirty. The generic routines only flush if the
584 * entry was young or dirty which is not good enough.
585 *
586 * We should be more intelligent about this but for the moment we override
587 * these functions and force a tlb flush unconditionally
588 */
589int pmdp_clear_flush_young(struct vm_area_struct *vma,
590 unsigned long address, pmd_t *pmdp)
591{
592 return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
593}
594
595/*
596 * We mark the pmd splitting and invalidate all the hpte
597 * entries for this hugepage.
598 */
599void pmdp_splitting_flush(struct vm_area_struct *vma,
600 unsigned long address, pmd_t *pmdp)
601{
602 unsigned long old, tmp;
603
604 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
605
606#ifdef CONFIG_DEBUG_VM
607 WARN_ON(!pmd_trans_huge(*pmdp));
608 assert_spin_locked(&vma->vm_mm->page_table_lock);
609#endif
610
611#ifdef PTE_ATOMIC_UPDATES
612
613 __asm__ __volatile__(
614 "1: ldarx %0,0,%3\n\
615 andi. %1,%0,%6\n\
616 bne- 1b \n\
617 ori %1,%0,%4 \n\
618 stdcx. %1,0,%3 \n\
619 bne- 1b"
620 : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
621 : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
622 : "cc" );
623#else
624 old = pmd_val(*pmdp);
625 *pmdp = __pmd(old | _PAGE_SPLITTING);
626#endif
627 /*
628 * If we didn't have the splitting flag set, go and flush the
629 * HPTE entries.
630 */
631 if (!(old & _PAGE_SPLITTING)) {
632 /* We need to flush the hpte */
633 if (old & _PAGE_HASHPTE)
634 hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
635 }
636}
637
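pmdp_splitting_flush() relies on the value that was in the pmd before the OR: only the caller that actually transitions the splitting bit from clear to set goes on to flush the HPTEs. A compact sketch of that "first setter flushes" idea, using a compiler fetch-or instead of the ldarx/stdcx. sequence and leaving out the busy-bit spin; the flag values are made up.

/*
 * Atomically set the splitting flag; only flush if we were the one
 * who set it and the entry has hash PTEs.
 */
#include <stdio.h>

#define F_SPLITTING 0x10UL      /* stands in for _PAGE_SPLITTING */
#define F_HASHPTE   0x20UL      /* stands in for _PAGE_HASHPTE */

static void splitting_flush(unsigned long *pmdp)
{
	unsigned long old = __atomic_fetch_or(pmdp, F_SPLITTING, __ATOMIC_ACQ_REL);

	if (!(old & F_SPLITTING) && (old & F_HASHPTE))
		printf("flushing hash entries for pmd %#lx\n", old);
}

int main(void)
{
	unsigned long pmd = 0x1000UL | F_HASHPTE;

	splitting_flush(&pmd);  /* first caller: flushes */
	splitting_flush(&pmd);  /* second caller: bit already set, no flush */
	return 0;
}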
638/*
639 * We want to put the pgtable in the pmd and use the pgtable for
640 * tracking the base page size hptes.
641 */
642void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
643 pgtable_t pgtable)
644{
645 pgtable_t *pgtable_slot;
646 assert_spin_locked(&mm->page_table_lock);
647 /*
648 * We store the pgtable in the second half of the PMD.
649 */
650 pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
651 *pgtable_slot = pgtable;
652 /*
653 * Expose the deposited pgtable to other cpus
654 * before we set the hugepage PTE at the pmd level;
655 * the hash fault code looks at the deposited pgtable
656 * to store hash index values.
657 */
658 smp_wmb();
659}
660
661pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
662{
663 pgtable_t pgtable;
664 pgtable_t *pgtable_slot;
665
666 assert_spin_locked(&mm->page_table_lock);
667 pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
668 pgtable = *pgtable_slot;
669 /*
670 * Once we withdraw, mark the entry NULL.
671 */
672 *pgtable_slot = NULL;
673 /*
674 * We store HPTE information in the deposited PTE fragment;
675 * zero out the content on withdraw.
676 */
677 memset(pgtable, 0, PTE_FRAG_SIZE);
678 return pgtable;
679}
680
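The deposit/withdraw pair works because the page backing the pmd entries has room for a second array of pointers: slot (pgtable_t *)pmdp + PTRS_PER_PMD in the upper half carries the deposited PTE fragment for that pmd. Below is a userspace sketch of just that slot arithmetic; PTRS_PER_PMD, the types and the 4K fragment are toy stand-ins, not the kernel's values.

/*
 * Toy "pmd page": first half pmd entries, second half deposit slots,
 * mirroring the (pgtable_t *)pmdp + PTRS_PER_PMD layout trick above.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define PTRS_PER_PMD 16                 /* toy value */

typedef unsigned long pmd_t;
typedef void *pgtable_t;

static pmd_t pmd_page_area[2 * PTRS_PER_PMD];

static void deposit(pmd_t *pmdp, pgtable_t pgtable)
{
	pgtable_t *slot = (pgtable_t *)pmdp + PTRS_PER_PMD;

	*slot = pgtable;
}

static pgtable_t withdraw(pmd_t *pmdp)
{
	pgtable_t *slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable_t pgtable = *slot;

	*slot = NULL;                   /* mark the entry empty once withdrawn */
	return pgtable;
}

int main(void)
{
	static char fragment[4096];     /* stands in for the 4K PTE fragment */
	pmd_t *pmdp = &pmd_page_area[3];

	deposit(pmdp, fragment);
	assert(withdraw(pmdp) == (pgtable_t)fragment);
	assert(withdraw(pmdp) == NULL);
	printf("deposit/withdraw slot arithmetic ok\n");
	return 0;
}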
681/*
682 * set a new huge pmd. We should not be called for updating
683 * an existing pmd entry. That should go via pmd_hugepage_update.
684 */
685void set_pmd_at(struct mm_struct *mm, unsigned long addr,
686 pmd_t *pmdp, pmd_t pmd)
687{
688#ifdef CONFIG_DEBUG_VM
689 WARN_ON(!pmd_none(*pmdp));
690 assert_spin_locked(&mm->page_table_lock);
691 WARN_ON(!pmd_trans_huge(pmd));
692#endif
693 return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
694}
695
696void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
697 pmd_t *pmdp)
698{
699 pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT);
700}
701
702/*
703 * A linux hugepage PMD was changed and the corresponding hash table entries
704 * need to be flushed.
705 */
706void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
707 pmd_t *pmdp)
708{
709 int ssize, i;
710 unsigned long s_addr;
711 int max_hpte_count;
712 unsigned int psize, valid;
713 unsigned char *hpte_slot_array;
714 unsigned long hidx, vpn, vsid, hash, shift, slot;
715
716 /*
717 * Flush all the hptes mapping this hugepage
718 */
719 s_addr = addr & HPAGE_PMD_MASK;
720 hpte_slot_array = get_hpte_slot_array(pmdp);
721 /*
722 * If we try to do a huge PTE update after a withdraw is done,
723 * we will find hpte_slot_array NULL below. This happens when we
724 * do split_huge_page_pmd.
725 */
726 if (!hpte_slot_array)
727 return;
728
729 /* get the base page size */
730 psize = get_slice_psize(mm, s_addr);
731
732 if (ppc_md.hugepage_invalidate)
733 return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
734 s_addr, psize);
735 /*
736 * No bulk hpte removal support, invalidate each entry
737 */
738 shift = mmu_psize_defs[psize].shift;
739 max_hpte_count = HPAGE_PMD_SIZE >> shift;
740 for (i = 0; i < max_hpte_count; i++) {
741 /*
742 * 8 bits for each hpte entry:
743 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
744 */
745 valid = hpte_valid(hpte_slot_array, i);
746 if (!valid)
747 continue;
748 hidx = hpte_hash_index(hpte_slot_array, i);
749
750 /* get the vpn */
751 addr = s_addr + (i * (1ul << shift));
752 if (!is_kernel_addr(addr)) {
753 ssize = user_segment_size(addr);
754 vsid = get_vsid(mm->context.id, addr, ssize);
755 WARN_ON(vsid == 0);
756 } else {
757 vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
758 ssize = mmu_kernel_ssize;
759 }
760
761 vpn = hpt_vpn(addr, vsid, ssize);
762 hash = hpt_hash(vpn, shift, ssize);
763 if (hidx & _PTEIDX_SECONDARY)
764 hash = ~hash;
765
766 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
767 slot += hidx & _PTEIDX_GROUP_IX;
768 ppc_md.hpte_invalidate(slot, vpn, psize,
769 MMU_PAGE_16M, ssize, 0);
770 }
771}
772
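Each byte of hpte_slot_array encodes a valid bit, a 3-bit hash-group index and a secondary-group bit, and the loop above turns that into a hash table slot, inverting the hash for the secondary group. The sketch below decodes one such byte using the layout described in the comment above; the mask values, group size and toy hash are assumptions, not the kernel constants.

/*
 * Decode one slot-array byte and derive the HPTE slot:
 * [ 000 | secondary (1 bit) | hidx (3 bits) | valid (1 bit) ].
 */
#include <stdio.h>

#define HPTES_PER_GROUP   8          /* assumed group size */
#define PTEIDX_SECONDARY  0x8        /* bit 3 of the 4-bit index, assumed */
#define PTEIDX_GROUP_IX   0x7        /* low 3 bits of the 4-bit index, assumed */

int main(void)
{
	unsigned char entry = 0x1b;              /* secondary group, index 5, valid */
	unsigned long hash = 0x1234;             /* toy hash of the subpage's VPN */
	unsigned long htab_hash_mask = 0xffff;   /* toy hash table mask */

	if (!(entry & 0x1)) {
		printf("entry not valid, nothing to invalidate\n");
		return 0;
	}

	unsigned int hidx = entry >> 1;          /* secondary bit + group index */

	if (hidx & PTEIDX_SECONDARY)
		hash = ~hash;                    /* secondary hash bucket */

	unsigned long slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
	slot += hidx & PTEIDX_GROUP_IX;

	printf("hidx=%#x -> slot %lu\n", hidx, slot);
	return 0;
}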
773static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
774{
775 pmd_val(pmd) |= pgprot_val(pgprot);
776 return pmd;
777}
778
779pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
780{
781 pmd_t pmd;
782 /*
783 * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
784 * set. We use this to check for a THP page at the pmd level: a
785 * leaf pte for a huge page has bottom two bits != 00.
786 */
787 pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
788 pmd_val(pmd) |= _PAGE_THP_HUGE;
789 pmd = pmd_set_protbits(pmd, pgprot);
790 return pmd;
791}
792
793pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
794{
795 return pfn_pmd(page_to_pfn(page), pgprot);
796}
797
798pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
799{
800
801 pmd_val(pmd) &= _HPAGE_CHG_MASK;
802 pmd = pmd_set_protbits(pmd, newprot);
803 return pmd;
804}
805
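pfn_pmd() composes a huge-pmd by placing the pfn above PTE_RPN_SHIFT and OR'ing in _PAGE_THP_HUGE plus the protection bits, while pmd_modify() keeps only the _HPAGE_CHG_MASK bits before applying a new protection. A toy sketch of that composition; every shift and flag value here is invented for illustration and does not match the ppc64 headers.

/*
 * Toy huge-pmd composition: pfn in the high bits, flags in the low bits.
 */
#include <stdio.h>

#define RPN_SHIFT     12             /* assumed: pfn starts at bit 12 */
#define F_PRESENT     0x001UL
#define F_RW          0x002UL
#define F_THP_HUGE    0x004UL
#define CHG_MASK      (~0xfffUL | F_THP_HUGE)   /* bits preserved across modify */

static unsigned long toy_pfn_pmd(unsigned long pfn, unsigned long prot)
{
	unsigned long pmd = pfn << RPN_SHIFT;

	pmd |= F_THP_HUGE;               /* mark it as a trans-huge leaf entry */
	pmd |= prot;                     /* fold in the protection bits */
	return pmd;
}

static unsigned long toy_pmd_modify(unsigned long pmd, unsigned long newprot)
{
	pmd &= CHG_MASK;                 /* keep pfn + THP marker, drop old prot */
	return pmd | newprot;
}

int main(void)
{
	unsigned long pmd = toy_pfn_pmd(0xabcd, F_PRESENT | F_RW);

	printf("rw pmd = %#lx\n", pmd);
	printf("ro pmd = %#lx\n", toy_pmd_modify(pmd, F_PRESENT));
	return 0;
}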
806/*
807 * This is called at the end of handling a user page fault, when the
808 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
809 * We use it to preload an HPTE into the hash table corresponding to
810 * the updated linux HUGE PMD entry.
811 */
812void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
813 pmd_t *pmd)
814{
815 return;
816}
817
818pmd_t pmdp_get_and_clear(struct mm_struct *mm,
819 unsigned long addr, pmd_t *pmdp)
820{
821 pmd_t old_pmd;
822 pgtable_t pgtable;
823 unsigned long old;
824 pgtable_t *pgtable_slot;
825
826 old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
827 old_pmd = __pmd(old);
828 /*
829 * We have pmd == none and we are holding page_table_lock.
830 * So we can safely go and clear the pgtable hash
831 * index info.
832 */
833 pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
834 pgtable = *pgtable_slot;
835 /*
836 * Let's zero out the old valid and hash index details
837 * that the hash fault code looks at.
838 */
839 memset(pgtable, 0, PTE_FRAG_SIZE);
840 return old_pmd;
841}
842
843int has_transparent_hugepage(void)
844{
845 if (!mmu_has_feature(MMU_FTR_16M_PAGE))
846 return 0;
847 /*
848 * We support THP only if PMD_SIZE is 16MB.
849 */
850 if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
851 return 0;
852 /*
853 * We need to make sure that we support 16MB hugepages in a segment
854 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
855 * of 64K.
856 */
857 /*
858 * If we have 64K HPTEs, we will be using those by default.
859 */
860 if (mmu_psize_defs[MMU_PAGE_64K].shift &&
861 (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
862 return 0;
863 /*
864 * OK, we only have 4K HPTEs.
865 */
866 if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
867 return 0;
868
869 return 1;
870}
871#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 7c415ddde948..aa74acb0fdfc 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -130,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
130 up_write(&mm->mmap_sem); 130 up_write(&mm->mmap_sem);
131} 131}
132 132
133#ifdef CONFIG_TRANSPARENT_HUGEPAGE
134static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
135 unsigned long end, struct mm_walk *walk)
136{
137 struct vm_area_struct *vma = walk->private;
138 split_huge_page_pmd(vma, addr, pmd);
139 return 0;
140}
141
142static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
143 unsigned long len)
144{
145 struct vm_area_struct *vma;
146 struct mm_walk subpage_proto_walk = {
147 .mm = mm,
148 .pmd_entry = subpage_walk_pmd_entry,
149 };
150
151 /*
152 * We don't try too hard; we just mark all the vmas in that range
153 * VM_NOHUGEPAGE and split them.
154 */
155 vma = find_vma(mm, addr);
156 /*
157 * If the range falls entirely in an unmapped area, just return.
158 */
159 if (vma && ((addr + len) <= vma->vm_start))
160 return;
161
162 while (vma) {
163 if (vma->vm_start >= (addr + len))
164 break;
165 vma->vm_flags |= VM_NOHUGEPAGE;
166 subpage_proto_walk.private = vma;
167 walk_page_range(vma->vm_start, vma->vm_end,
168 &subpage_proto_walk);
169 vma = vma->vm_next;
170 }
171}
172#else
173static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
174 unsigned long len)
175{
176 return;
177}
178#endif
179
133/* 180/*
134 * Copy in a subpage protection map for an address range. 181 * Copy in a subpage protection map for an address range.
135 * The map has 2 bits per 4k subpage, so 32 bits per 64k page. 182 * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
@@ -168,6 +215,7 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
168 return -EFAULT; 215 return -EFAULT;
169 216
170 down_write(&mm->mmap_sem); 217 down_write(&mm->mmap_sem);
218 subpage_mark_vma_nohuge(mm, addr, len);
171 for (limit = addr + len; addr < limit; addr = next) { 219 for (limit = addr + len; addr < limit; addr = next) {
172 next = pmd_addr_end(addr, limit); 220 next = pmd_addr_end(addr, limit);
173 err = -ENOMEM; 221 err = -ENOMEM;
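subpage_mark_vma_nohuge() above walks every vma overlapping [addr, addr + len), flags it VM_NOHUGEPAGE and splits any huge pmd in it, so that 4K subpage protections are never applied on top of a 16M mapping. A small userspace sketch of the overlap walk itself follows; the vma struct, the flag value and the addresses are toy stand-ins, and the split step is only noted in a comment.

/*
 * Flag every toy vma that overlaps [addr, addr + len).
 */
#include <stdio.h>

#define VM_NOHUGEPAGE 0x1UL     /* toy flag value */

struct vma {
	unsigned long start, end, flags;
	struct vma *next;
};

static void mark_range_nohuge(struct vma *vma, unsigned long addr,
			      unsigned long len)
{
	/* range entirely below the first mapping: nothing to do */
	if (vma && addr + len <= vma->start)
		return;

	while (vma) {
		if (vma->start >= addr + len)
			break;
		vma->flags |= VM_NOHUGEPAGE;   /* and split_huge_page_pmd() here */
		vma = vma->next;
	}
}

int main(void)
{
	struct vma c = { 0x60000, 0x70000, 0, NULL };
	struct vma b = { 0x40000, 0x50000, 0, &c };
	struct vma a = { 0x20000, 0x30000, 0, &b };

	mark_range_nohuge(&a, 0x28000, 0x20000);   /* overlaps a and b, not c */
	printf("a=%lx b=%lx c=%lx\n", a.flags, b.flags, c.flags);
	return 0;
}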
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 023ec8a13f38..36e44b4260eb 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -183,12 +183,13 @@ void tlb_flush(struct mmu_gather *tlb)
183 * since 64K pages may overlap with other bridges when using 64K pages 183 * since 64K pages may overlap with other bridges when using 64K pages
184 * with 4K HW pages on IO space. 184 * with 4K HW pages on IO space.
185 * 185 *
186 * Because of that usage pattern, it's only available with CONFIG_HOTPLUG 186 * Because of that usage pattern, it is implemented for small size rather
187 * and is implemented for small size rather than speed. 187 * than speed.
188 */ 188 */
189void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, 189void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
190 unsigned long end) 190 unsigned long end)
191{ 191{
192 int hugepage_shift;
192 unsigned long flags; 193 unsigned long flags;
193 194
194 start = _ALIGN_DOWN(start, PAGE_SIZE); 195 start = _ALIGN_DOWN(start, PAGE_SIZE);
@@ -206,7 +207,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
206 local_irq_save(flags); 207 local_irq_save(flags);
207 arch_enter_lazy_mmu_mode(); 208 arch_enter_lazy_mmu_mode();
208 for (; start < end; start += PAGE_SIZE) { 209 for (; start < end; start += PAGE_SIZE) {
209 pte_t *ptep = find_linux_pte(mm->pgd, start); 210 pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
211 &hugepage_shift);
210 unsigned long pte; 212 unsigned long pte;
211 213
212 if (ptep == NULL) 214 if (ptep == NULL)
@@ -214,7 +216,37 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
214 pte = pte_val(*ptep); 216 pte = pte_val(*ptep);
215 if (!(pte & _PAGE_HASHPTE)) 217 if (!(pte & _PAGE_HASHPTE))
216 continue; 218 continue;
217 hpte_need_flush(mm, start, ptep, pte, 0); 219 if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
220 hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
221 else
222 hpte_need_flush(mm, start, ptep, pte, 0);
223 }
224 arch_leave_lazy_mmu_mode();
225 local_irq_restore(flags);
226}
227
228void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
229{
230 pte_t *pte;
231 pte_t *start_pte;
232 unsigned long flags;
233
234 addr = _ALIGN_DOWN(addr, PMD_SIZE);
235 /* Note: Normally, we should only ever use a batch within a
236 * PTE locked section. This violates the rule, but will work
237 * since we don't actually modify the PTEs, we just flush the
238 * hash while leaving the PTEs intact (including their reference
239 * to being hashed). This is not the most performance oriented
240 * way to do things but is fine for our needs here.
241 */
242 local_irq_save(flags);
243 arch_enter_lazy_mmu_mode();
244 start_pte = pte_offset_map(pmd, addr);
245 for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
246 unsigned long pteval = pte_val(*pte);
247 if (pteval & _PAGE_HASHPTE)
248 hpte_need_flush(mm, addr, pte, pteval, 0);
249 addr += PAGE_SIZE;
218 } 250 }
219 arch_leave_lazy_mmu_mode(); 251 arch_leave_lazy_mmu_mode();
220 local_irq_restore(flags); 252 local_irq_restore(flags);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 6888cad5103d..41cd68dee681 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -648,7 +648,7 @@ void __init early_init_mmu(void)
648 __early_init_mmu(1); 648 __early_init_mmu(1);
649} 649}
650 650
651void __cpuinit early_init_mmu_secondary(void) 651void early_init_mmu_secondary(void)
652{ 652{
653 __early_init_mmu(0); 653 __early_init_mmu(0);
654} 654}