author    Linus Torvalds <torvalds@linux-foundation.org>  2013-07-04 13:29:23 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-07-04 13:29:23 -0400
commit    65b97fb7303050fc826e518cf67fc283da23314f (patch)
tree      595e7f04d65d95a39d65bd2dcf2385b3b6ea7969 /arch/powerpc/mm
parent    ddcf6600b133697adbafd96e080818bdc0dfd028 (diff)
parent    1d8b368ab4aacfc3f864655baad4d31a3028ec1a (diff)
Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc
Pull powerpc updates from Ben Herrenschmidt:
 "This is the powerpc changes for the 3.11 merge window.  In addition
  to the usual bug fixes and small updates, the main highlights are:

   - Support for transparent huge pages by Aneesh Kumar for 64-bit
     server processors.  This allows the use of 16M pages as
     transparent huge pages on kernels compiled with a 64K base page
     size.

   - Base VFIO support for KVM on power by Alexey Kardashevskiy

   - Wiring up of our nvram to the pstore infrastructure, including
     putting compressed oopses in there by Aruna Balakrishnaiah

   - Move, rework and improve our "EEH" (basically PCI error handling
     and recovery) infrastructure.  It is no longer specific to pseries
     but is now usable by the new "powernv" platform as well (no
     hypervisor) by Gavin Shan.

   - I fixed some bugs in our math-emu instruction decoding and made it
     usable to emulate some optional FP instructions on processors with
     hard FP that lack them (such as fsqrt on Freescale embedded
     processors).

   - Support for Power8 "Event Based Branch" facility by Michael
     Ellerman.  This facility allows what is basically "userspace
     interrupts" for performance monitor events.

   - A bunch of Transactional Memory vs. Signals bug fixes and HW
     breakpoint/watchpoint fixes by Michael Neuling.

  And more ...  I apologize in advance if I've failed to highlight
  something that somebody deemed worth it."

* 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc: (156 commits)
  pstore: Add hsize argument in write_buf call of pstore_ftrace_call
  powerpc/fsl: add MPIC timer wakeup support
  powerpc/mpic: create mpic subsystem object
  powerpc/mpic: add global timer support
  powerpc/mpic: add irq_set_wake support
  powerpc/85xx: enable coreint for all the 64bit boards
  powerpc/8xx: Erroneous double irq_eoi() on CPM IRQ in MPC8xx
  powerpc/fsl: Enable CONFIG_E1000E in mpc85xx_smp_defconfig
  powerpc/mpic: Add get_version API both for internal and external use
  powerpc: Handle both new style and old style reserve maps
  powerpc/hw_brk: Fix off by one error when validating DAWR region end
  powerpc/pseries: Support compression of oops text via pstore
  powerpc/pseries: Re-organise the oops compression code
  pstore: Pass header size in the pstore write callback
  powerpc/powernv: Fix iommu initialization again
  powerpc/pseries: Inform the hypervisor we are using EBB regs
  powerpc/perf: Add power8 EBB support
  powerpc/perf: Core EBB support for 64-bit book3s
  powerpc/perf: Drop MMCRA from thread_struct
  powerpc/perf: Don't enable if we have zero events
  ...
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/44x_mmu.c                                          6
-rw-r--r--  arch/powerpc/mm/Makefile                                           8
-rw-r--r--  arch/powerpc/mm/gup.c                                             18
-rw-r--r--  arch/powerpc/mm/hash_low_64.S                                     21
-rw-r--r--  arch/powerpc/mm/hash_native_64.c                                 195
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c                                   67
-rw-r--r--  arch/powerpc/mm/hugepage-hash64.c                                175
-rw-r--r--  arch/powerpc/mm/hugetlbpage-hash64.c                               2
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c                                    299
-rw-r--r--  arch/powerpc/mm/init_64.c                                          9
-rw-r--r--  arch/powerpc/mm/mem.c                                              4
-rw-r--r--  arch/powerpc/mm/mmap.c (renamed from arch/powerpc/mm/mmap_64.c)    0
-rw-r--r--  arch/powerpc/mm/mmu_context_nohash.c                              15
-rw-r--r--  arch/powerpc/mm/numa.c                                            12
-rw-r--r--  arch/powerpc/mm/pgtable.c                                          8
-rw-r--r--  arch/powerpc/mm/pgtable_64.c                                     414
-rw-r--r--  arch/powerpc/mm/subpage-prot.c                                    48
-rw-r--r--  arch/powerpc/mm/tlb_hash64.c                                      36
-rw-r--r--  arch/powerpc/mm/tlb_nohash.c                                       2
19 files changed, 1076 insertions, 263 deletions
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index 2c9441ee6bb8..82b1ff759e26 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -41,7 +41,7 @@ int icache_44x_need_flush;
 
 unsigned long tlb_47x_boltmap[1024/8];
 
-static void __cpuinit ppc44x_update_tlb_hwater(void)
+static void ppc44x_update_tlb_hwater(void)
 {
 	extern unsigned int tlb_44x_patch_hwater_D[];
 	extern unsigned int tlb_44x_patch_hwater_I[];
@@ -134,7 +134,7 @@ static void __init ppc47x_update_boltmap(void)
 /*
  * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
  */
-static void __cpuinit ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
+static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
 {
 	unsigned int rA;
 	int bolted;
@@ -229,7 +229,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 }
 
 #ifdef CONFIG_SMP
-void __cpuinit mmu_init_secondary(int cpu)
+void mmu_init_secondary(int cpu)
 {
 	unsigned long addr;
 	unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index cf16b5733eaa..51230ee6a407 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -6,17 +6,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
 
-obj-y				:= fault.o mem.o pgtable.o gup.o \
+obj-y				:= fault.o mem.o pgtable.o gup.o mmap.o \
 				   init_$(CONFIG_WORD_SIZE).o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 				   tlb_nohash_low.o
 obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(CONFIG_WORD_SIZE)e.o
-obj-$(CONFIG_PPC64)		+= mmap_64.o
 hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hash_utils_64.o \
 				   slb_low.o slb.o stab.o \
-				   mmap_64.o $(hash64-y)
+				   $(hash64-y)
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
 				   tlb_hash$(CONFIG_WORD_SIZE).o \
@@ -28,11 +27,12 @@ obj-$(CONFIG_44x) += 44x_mmu.o
 obj-$(CONFIG_PPC_FSL_BOOK3E)	+= fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
-ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-y				+= hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)	+= hugetlbpage-book3e.o
 endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index 4b921affa495..49822d90ea96 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -34,7 +34,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 
 	ptep = pte_offset_kernel(&pmd, addr);
 	do {
-		pte_t pte = *ptep;
+		pte_t pte = ACCESS_ONCE(*ptep);
 		struct page *page;
 
 		if ((pte_val(pte) & mask) != result)
@@ -63,12 +63,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 
 	pmdp = pmd_offset(&pud, addr);
 	do {
-		pmd_t pmd = *pmdp;
+		pmd_t pmd = ACCESS_ONCE(*pmdp);
 
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(pmd))
+		/*
+		 * If we find a splitting transparent hugepage we
+		 * return zero. That will result in taking the slow
+		 * path which will call wait_split_huge_page()
+		 * if the pmd is still in splitting state
+		 */
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
-		if (pmd_huge(pmd)) {
+		if (pmd_huge(pmd) || pmd_large(pmd)) {
 			if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
 					 write, pages, nr))
 				return 0;
@@ -91,7 +97,7 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 
 	pudp = pud_offset(&pgd, addr);
 	do {
-		pud_t pud = *pudp;
+		pud_t pud = ACCESS_ONCE(*pudp);
 
 		next = pud_addr_end(addr, end);
 		if (pud_none(pud))
@@ -154,7 +160,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 
 	pgdp = pgd_offset(mm, addr);
 	do {
-		pgd_t pgd = *pgdp;
+		pgd_t pgd = ACCESS_ONCE(*pgdp);
 
 		pr_devel(" %016lx: normal pgd %p\n", addr,
 			 (void *)pgd_val(pgd));
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 0e980acae67c..d3cbda62857b 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -289,9 +289,10 @@ htab_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_4K		/* page size */
-	ld	r7,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r8,STK_PARAM(R8)(r1)	/* get "local" param */
+	li	r6,MMU_PAGE_4K		/* base page size */
+	li	r7,MMU_PAGE_4K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
 	bl	.			/* Patched by htab_finish_init() */
 
@@ -649,9 +650,10 @@ htab_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_4K		/* page size */
-	ld	r7,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r8,STK_PARAM(R8)(r1)	/* get "local" param */
+	li	r6,MMU_PAGE_4K		/* base page size */
+	li	r7,MMU_PAGE_4K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
 	bl	.			/* patched by htab_finish_init() */
 
@@ -937,9 +939,10 @@ ht64_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_64K
-	ld	r7,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r8,STK_PARAM(R8)(r1)	/* get "local" param */
+	li	r6,MMU_PAGE_64K		/* base page size */
+	li	r7,MMU_PAGE_64K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
 _GLOBAL(ht64_call_hpte_updatepp)
 	bl	.			/* patched by htab_finish_init() */
 
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 4c122c3f1623..3f0c30ae4791 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -273,61 +273,15 @@ static long native_hpte_remove(unsigned long hpte_group)
273 return i; 273 return i;
274} 274}
275 275
276static inline int __hpte_actual_psize(unsigned int lp, int psize)
277{
278 int i, shift;
279 unsigned int mask;
280
281 /* start from 1 ignoring MMU_PAGE_4K */
282 for (i = 1; i < MMU_PAGE_COUNT; i++) {
283
284 /* invalid penc */
285 if (mmu_psize_defs[psize].penc[i] == -1)
286 continue;
287 /*
288 * encoding bits per actual page size
289 * PTE LP actual page size
290 * rrrr rrrz >=8KB
291 * rrrr rrzz >=16KB
292 * rrrr rzzz >=32KB
293 * rrrr zzzz >=64KB
294 * .......
295 */
296 shift = mmu_psize_defs[i].shift - LP_SHIFT;
297 if (shift > LP_BITS)
298 shift = LP_BITS;
299 mask = (1 << shift) - 1;
300 if ((lp & mask) == mmu_psize_defs[psize].penc[i])
301 return i;
302 }
303 return -1;
304}
305
306static inline int hpte_actual_psize(struct hash_pte *hptep, int psize)
307{
308 /* Look at the 8 bit LP value */
309 unsigned int lp = (hptep->r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
310
311 if (!(hptep->v & HPTE_V_VALID))
312 return -1;
313
314 /* First check if it is large page */
315 if (!(hptep->v & HPTE_V_LARGE))
316 return MMU_PAGE_4K;
317
318 return __hpte_actual_psize(lp, psize);
319}
320
321static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, 276static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
322 unsigned long vpn, int psize, int ssize, 277 unsigned long vpn, int bpsize,
323 int local) 278 int apsize, int ssize, int local)
324{ 279{
325 struct hash_pte *hptep = htab_address + slot; 280 struct hash_pte *hptep = htab_address + slot;
326 unsigned long hpte_v, want_v; 281 unsigned long hpte_v, want_v;
327 int ret = 0; 282 int ret = 0;
328 int actual_psize;
329 283
330 want_v = hpte_encode_avpn(vpn, psize, ssize); 284 want_v = hpte_encode_avpn(vpn, bpsize, ssize);
331 285
332 DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", 286 DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
333 vpn, want_v & HPTE_V_AVPN, slot, newpp); 287 vpn, want_v & HPTE_V_AVPN, slot, newpp);
@@ -335,7 +289,6 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
335 native_lock_hpte(hptep); 289 native_lock_hpte(hptep);
336 290
337 hpte_v = hptep->v; 291 hpte_v = hptep->v;
338 actual_psize = hpte_actual_psize(hptep, psize);
339 /* 292 /*
340 * We need to invalidate the TLB always because hpte_remove doesn't do 293 * We need to invalidate the TLB always because hpte_remove doesn't do
341 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less 294 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -343,12 +296,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
343 * (hpte_remove) because we assume the old translation is still 296 * (hpte_remove) because we assume the old translation is still
344 * technically "valid". 297 * technically "valid".
345 */ 298 */
346 if (actual_psize < 0) { 299 if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
347 actual_psize = psize;
348 ret = -1;
349 goto err_out;
350 }
351 if (!HPTE_V_COMPARE(hpte_v, want_v)) {
352 DBG_LOW(" -> miss\n"); 300 DBG_LOW(" -> miss\n");
353 ret = -1; 301 ret = -1;
354 } else { 302 } else {
@@ -357,11 +305,10 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
357 hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | 305 hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
358 (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C)); 306 (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C));
359 } 307 }
360err_out:
361 native_unlock_hpte(hptep); 308 native_unlock_hpte(hptep);
362 309
363 /* Ensure it is out of the tlb too. */ 310 /* Ensure it is out of the tlb too. */
364 tlbie(vpn, psize, actual_psize, ssize, local); 311 tlbie(vpn, bpsize, apsize, ssize, local);
365 312
366 return ret; 313 return ret;
367} 314}
@@ -402,7 +349,6 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
402static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, 349static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
403 int psize, int ssize) 350 int psize, int ssize)
404{ 351{
405 int actual_psize;
406 unsigned long vpn; 352 unsigned long vpn;
407 unsigned long vsid; 353 unsigned long vsid;
408 long slot; 354 long slot;
@@ -415,36 +361,33 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
415 if (slot == -1) 361 if (slot == -1)
416 panic("could not find page to bolt\n"); 362 panic("could not find page to bolt\n");
417 hptep = htab_address + slot; 363 hptep = htab_address + slot;
418 actual_psize = hpte_actual_psize(hptep, psize);
419 if (actual_psize < 0)
420 actual_psize = psize;
421 364
422 /* Update the HPTE */ 365 /* Update the HPTE */
423 hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) | 366 hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
424 (newpp & (HPTE_R_PP | HPTE_R_N)); 367 (newpp & (HPTE_R_PP | HPTE_R_N));
425 368 /*
426 /* Ensure it is out of the tlb too. */ 369 * Ensure it is out of the tlb too. Bolted entries base and
427 tlbie(vpn, psize, actual_psize, ssize, 0); 370 * actual page size will be same.
371 */
372 tlbie(vpn, psize, psize, ssize, 0);
428} 373}
429 374
430static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, 375static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
431 int psize, int ssize, int local) 376 int bpsize, int apsize, int ssize, int local)
432{ 377{
433 struct hash_pte *hptep = htab_address + slot; 378 struct hash_pte *hptep = htab_address + slot;
434 unsigned long hpte_v; 379 unsigned long hpte_v;
435 unsigned long want_v; 380 unsigned long want_v;
436 unsigned long flags; 381 unsigned long flags;
437 int actual_psize;
438 382
439 local_irq_save(flags); 383 local_irq_save(flags);
440 384
441 DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); 385 DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
442 386
443 want_v = hpte_encode_avpn(vpn, psize, ssize); 387 want_v = hpte_encode_avpn(vpn, bpsize, ssize);
444 native_lock_hpte(hptep); 388 native_lock_hpte(hptep);
445 hpte_v = hptep->v; 389 hpte_v = hptep->v;
446 390
447 actual_psize = hpte_actual_psize(hptep, psize);
448 /* 391 /*
449 * We need to invalidate the TLB always because hpte_remove doesn't do 392 * We need to invalidate the TLB always because hpte_remove doesn't do
450 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less 393 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -452,23 +395,120 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
452 * (hpte_remove) because we assume the old translation is still 395 * (hpte_remove) because we assume the old translation is still
453 * technically "valid". 396 * technically "valid".
454 */ 397 */
455 if (actual_psize < 0) { 398 if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
456 actual_psize = psize;
457 native_unlock_hpte(hptep);
458 goto err_out;
459 }
460 if (!HPTE_V_COMPARE(hpte_v, want_v))
461 native_unlock_hpte(hptep); 399 native_unlock_hpte(hptep);
462 else 400 else
463 /* Invalidate the hpte. NOTE: this also unlocks it */ 401 /* Invalidate the hpte. NOTE: this also unlocks it */
464 hptep->v = 0; 402 hptep->v = 0;
465 403
466err_out:
467 /* Invalidate the TLB */ 404 /* Invalidate the TLB */
468 tlbie(vpn, psize, actual_psize, ssize, local); 405 tlbie(vpn, bpsize, apsize, ssize, local);
406
407 local_irq_restore(flags);
408}
409
410static void native_hugepage_invalidate(struct mm_struct *mm,
411 unsigned char *hpte_slot_array,
412 unsigned long addr, int psize)
413{
414 int ssize = 0, i;
415 int lock_tlbie;
416 struct hash_pte *hptep;
417 int actual_psize = MMU_PAGE_16M;
418 unsigned int max_hpte_count, valid;
419 unsigned long flags, s_addr = addr;
420 unsigned long hpte_v, want_v, shift;
421 unsigned long hidx, vpn = 0, vsid, hash, slot;
422
423 shift = mmu_psize_defs[psize].shift;
424 max_hpte_count = 1U << (PMD_SHIFT - shift);
425
426 local_irq_save(flags);
427 for (i = 0; i < max_hpte_count; i++) {
428 valid = hpte_valid(hpte_slot_array, i);
429 if (!valid)
430 continue;
431 hidx = hpte_hash_index(hpte_slot_array, i);
432
433 /* get the vpn */
434 addr = s_addr + (i * (1ul << shift));
435 if (!is_kernel_addr(addr)) {
436 ssize = user_segment_size(addr);
437 vsid = get_vsid(mm->context.id, addr, ssize);
438 WARN_ON(vsid == 0);
439 } else {
440 vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
441 ssize = mmu_kernel_ssize;
442 }
443
444 vpn = hpt_vpn(addr, vsid, ssize);
445 hash = hpt_hash(vpn, shift, ssize);
446 if (hidx & _PTEIDX_SECONDARY)
447 hash = ~hash;
448
449 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
450 slot += hidx & _PTEIDX_GROUP_IX;
451
452 hptep = htab_address + slot;
453 want_v = hpte_encode_avpn(vpn, psize, ssize);
454 native_lock_hpte(hptep);
455 hpte_v = hptep->v;
456
457 /* Even if we miss, we need to invalidate the TLB */
458 if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
459 native_unlock_hpte(hptep);
460 else
461 /* Invalidate the hpte. NOTE: this also unlocks it */
462 hptep->v = 0;
463 }
464 /*
465 * Since this is a hugepage, we just need a single tlbie.
466 * use the last vpn.
467 */
468 lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
469 if (lock_tlbie)
470 raw_spin_lock(&native_tlbie_lock);
471
472 asm volatile("ptesync":::"memory");
473 __tlbie(vpn, psize, actual_psize, ssize);
474 asm volatile("eieio; tlbsync; ptesync":::"memory");
475
476 if (lock_tlbie)
477 raw_spin_unlock(&native_tlbie_lock);
478
469 local_irq_restore(flags); 479 local_irq_restore(flags);
470} 480}
471 481
482static inline int __hpte_actual_psize(unsigned int lp, int psize)
483{
484 int i, shift;
485 unsigned int mask;
486
487 /* start from 1 ignoring MMU_PAGE_4K */
488 for (i = 1; i < MMU_PAGE_COUNT; i++) {
489
490 /* invalid penc */
491 if (mmu_psize_defs[psize].penc[i] == -1)
492 continue;
493 /*
494 * encoding bits per actual page size
495 * PTE LP actual page size
496 * rrrr rrrz >=8KB
497 * rrrr rrzz >=16KB
498 * rrrr rzzz >=32KB
499 * rrrr zzzz >=64KB
500 * .......
501 */
502 shift = mmu_psize_defs[i].shift - LP_SHIFT;
503 if (shift > LP_BITS)
504 shift = LP_BITS;
505 mask = (1 << shift) - 1;
506 if ((lp & mask) == mmu_psize_defs[psize].penc[i])
507 return i;
508 }
509 return -1;
510}
511
472static void hpte_decode(struct hash_pte *hpte, unsigned long slot, 512static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
473 int *psize, int *apsize, int *ssize, unsigned long *vpn) 513 int *psize, int *apsize, int *ssize, unsigned long *vpn)
474{ 514{
@@ -672,4 +712,5 @@ void __init hpte_init_native(void)
672 ppc_md.hpte_remove = native_hpte_remove; 712 ppc_md.hpte_remove = native_hpte_remove;
673 ppc_md.hpte_clear_all = native_hpte_clear; 713 ppc_md.hpte_clear_all = native_hpte_clear;
674 ppc_md.flush_hash_range = native_flush_hash_range; 714 ppc_md.flush_hash_range = native_flush_hash_range;
715 ppc_md.hugepage_invalidate = native_hugepage_invalidate;
675} 716}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index e303a6d74e3a..6ecc38bd5b24 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -807,7 +807,7 @@ void __init early_init_mmu(void)
 }
 
 #ifdef CONFIG_SMP
-void __cpuinit early_init_mmu_secondary(void)
+void early_init_mmu_secondary(void)
 {
 	/* Initialize hash table for that CPU */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
@@ -1050,13 +1050,26 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		goto bail;
 	}
 
-#ifdef CONFIG_HUGETLB_PAGE
 	if (hugeshift) {
-		rc = __hash_page_huge(ea, access, vsid, ptep, trap, local,
-					ssize, hugeshift, psize);
+		if (pmd_trans_huge(*(pmd_t *)ptep))
+			rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+					     trap, local, ssize, psize);
+#ifdef CONFIG_HUGETLB_PAGE
+		else
+			rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+					      local, ssize, hugeshift, psize);
+#else
+		else {
+			/*
+			 * if we have hugeshift, and is not transhuge with
+			 * hugetlb disabled, something is really wrong.
+			 */
+			rc = 1;
+			WARN_ON(1);
+		}
+#endif
 		goto bail;
 	}
-#endif /* CONFIG_HUGETLB_PAGE */
 
 #ifndef CONFIG_PPC_64K_PAGES
 	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
@@ -1145,6 +1158,7 @@ EXPORT_SYMBOL_GPL(hash_page);
 void hash_preload(struct mm_struct *mm, unsigned long ea,
 		  unsigned long access, unsigned long trap)
 {
+	int hugepage_shift;
 	unsigned long vsid;
 	pgd_t *pgdir;
 	pte_t *ptep;
@@ -1166,10 +1180,27 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	pgdir = mm->pgd;
 	if (pgdir == NULL)
 		return;
-	ptep = find_linux_pte(pgdir, ea);
-	if (!ptep)
+
+	/* Get VSID */
+	ssize = user_segment_size(ea);
+	vsid = get_vsid(mm->context.id, ea, ssize);
+	if (!vsid)
 		return;
+	/*
+	 * Hash doesn't like irqs. Walking linux page table with irq disabled
+	 * saves us from holding multiple locks.
+	 */
+	local_irq_save(flags);
+
+	/*
+	 * THP pages use update_mmu_cache_pmd. We don't do
+	 * hash preload there. Hence can ignore THP here
+	 */
+	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
+	if (!ptep)
+		goto out_exit;
 
+	WARN_ON(hugepage_shift);
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
 	 * a 64K kernel), then we don't preload, hash_page() will take
@@ -1178,18 +1209,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	 * page size demotion here
 	 */
 	if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
-		return;
+		goto out_exit;
 #endif /* CONFIG_PPC_64K_PAGES */
 
-	/* Get VSID */
-	ssize = user_segment_size(ea);
-	vsid = get_vsid(mm->context.id, ea, ssize);
-	if (!vsid)
-		return;
-
-	/* Hash doesn't like irqs */
-	local_irq_save(flags);
-
 	/* Is that local to this CPU ? */
 	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
 		local = 1;
@@ -1211,7 +1233,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 				   mm->context.user_psize,
 				   mm->context.user_psize,
 				   pte_val(*ptep));
-
+out_exit:
 	local_irq_restore(flags);
 }
 
@@ -1232,7 +1254,11 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 		slot += hidx & _PTEIDX_GROUP_IX;
 		DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
-		ppc_md.hpte_invalidate(slot, vpn, psize, ssize, local);
+		/*
+		 * We use same base page size and actual psize, because we don't
+		 * use these functions for hugepage
+		 */
+		ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local);
 	} pte_iterate_hashed_end();
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -1365,7 +1391,8 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
 		hash = ~hash;
 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 	slot += hidx & _PTEIDX_GROUP_IX;
-	ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_kernel_ssize, 0);
+	ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize,
+			       mmu_kernel_ssize, 0);
 }
 
 void kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
new file mode 100644
index 000000000000..34de9e0cdc34
--- /dev/null
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -0,0 +1,175 @@
1/*
2 * Copyright IBM Corporation, 2013
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15/*
16 * PPC64 THP Support for hash based MMUs
17 */
18#include <linux/mm.h>
19#include <asm/machdep.h>
20
21int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
22 pmd_t *pmdp, unsigned long trap, int local, int ssize,
23 unsigned int psize)
24{
25 unsigned int index, valid;
26 unsigned char *hpte_slot_array;
27 unsigned long rflags, pa, hidx;
28 unsigned long old_pmd, new_pmd;
29 int ret, lpsize = MMU_PAGE_16M;
30 unsigned long vpn, hash, shift, slot;
31
32 /*
33 * atomically mark the linux large page PMD busy and dirty
34 */
35 do {
36 old_pmd = pmd_val(*pmdp);
37 /* If PMD busy, retry the access */
38 if (unlikely(old_pmd & _PAGE_BUSY))
39 return 0;
40 /* If PMD is trans splitting retry the access */
41 if (unlikely(old_pmd & _PAGE_SPLITTING))
42 return 0;
43 /* If PMD permissions don't match, take page fault */
44 if (unlikely(access & ~old_pmd))
45 return 1;
46 /*
47 * Try to lock the PTE, add ACCESSED and DIRTY if it was
48 * a write access
49 */
50 new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
51 if (access & _PAGE_RW)
52 new_pmd |= _PAGE_DIRTY;
53 } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
54 old_pmd, new_pmd));
55 /*
56 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
57 * need to add in 0x1 if it's a read-only user page
58 */
59 rflags = new_pmd & _PAGE_USER;
60 if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
61 (new_pmd & _PAGE_DIRTY)))
62 rflags |= 0x1;
63 /*
64 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
65 */
66 rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
67
68#if 0
69 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
70
71 /*
72 * No CPU has hugepages but lacks no execute, so we
73 * don't need to worry about that case
74 */
75 rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
76 }
77#endif
78 /*
79 * Find the slot index details for this ea, using base page size.
80 */
81 shift = mmu_psize_defs[psize].shift;
82 index = (ea & ~HPAGE_PMD_MASK) >> shift;
83 BUG_ON(index >= 4096);
84
85 vpn = hpt_vpn(ea, vsid, ssize);
86 hash = hpt_hash(vpn, shift, ssize);
87 hpte_slot_array = get_hpte_slot_array(pmdp);
88
89 valid = hpte_valid(hpte_slot_array, index);
90 if (valid) {
91 /* update the hpte bits */
92 hidx = hpte_hash_index(hpte_slot_array, index);
93 if (hidx & _PTEIDX_SECONDARY)
94 hash = ~hash;
95 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
96 slot += hidx & _PTEIDX_GROUP_IX;
97
98 ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
99 psize, lpsize, ssize, local);
100 /*
101 * We failed to update, try to insert a new entry.
102 */
103 if (ret == -1) {
104 /*
105 * large pte is marked busy, so we can be sure
106 * nobody is looking at hpte_slot_array. hence we can
107 * safely update this here.
108 */
109 valid = 0;
110 new_pmd &= ~_PAGE_HPTEFLAGS;
111 hpte_slot_array[index] = 0;
112 } else
113 /* clear the busy bits and set the hash pte bits */
114 new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
115 }
116
117 if (!valid) {
118 unsigned long hpte_group;
119
120 /* insert new entry */
121 pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
122repeat:
123 hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
124
125 /* clear the busy bits and set the hash pte bits */
126 new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
127
128 /* Add in WIMG bits */
129 rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
130 _PAGE_COHERENT | _PAGE_GUARDED));
131
132 /* Insert into the hash table, primary slot */
133 slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
134 psize, lpsize, ssize);
135 /*
136 * Primary is full, try the secondary
137 */
138 if (unlikely(slot == -1)) {
139 hpte_group = ((~hash & htab_hash_mask) *
140 HPTES_PER_GROUP) & ~0x7UL;
141 slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
142 rflags, HPTE_V_SECONDARY,
143 psize, lpsize, ssize);
144 if (slot == -1) {
145 if (mftb() & 0x1)
146 hpte_group = ((hash & htab_hash_mask) *
147 HPTES_PER_GROUP) & ~0x7UL;
148
149 ppc_md.hpte_remove(hpte_group);
150 goto repeat;
151 }
152 }
153 /*
154 * Hypervisor failure. Restore old pmd and return -1
155 * similar to __hash_page_*
156 */
157 if (unlikely(slot == -2)) {
158 *pmdp = __pmd(old_pmd);
159 hash_failure_debug(ea, access, vsid, trap, ssize,
160 psize, lpsize, old_pmd);
161 return -1;
162 }
163 /*
164 * large pte is marked busy, so we can be sure
165 * nobody is looking at hpte_slot_array. hence we can
166 * safely update this here.
167 */
168 mark_hpte_slot_valid(hpte_slot_array, index, slot);
169 }
170 /*
171 * No need to use ldarx/stdcx here
172 */
173 *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
174 return 0;
175}
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 0f1d94a1fb82..0b7fb6761015 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -81,7 +81,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		slot += (old_pte & _PAGE_F_GIX) >> 12;
 
 		if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
-					 ssize, local) == -1)
+					 mmu_psize, ssize, local) == -1)
 			old_pte &= ~_PAGE_HPTEFLAGS;
 	}
 
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 4210549ac95e..834ca8eb38f2 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -21,6 +21,9 @@
21#include <asm/pgalloc.h> 21#include <asm/pgalloc.h>
22#include <asm/tlb.h> 22#include <asm/tlb.h>
23#include <asm/setup.h> 23#include <asm/setup.h>
24#include <asm/hugetlb.h>
25
26#ifdef CONFIG_HUGETLB_PAGE
24 27
25#define PAGE_SHIFT_64K 16 28#define PAGE_SHIFT_64K 16
26#define PAGE_SHIFT_16M 24 29#define PAGE_SHIFT_16M 24
@@ -100,68 +103,9 @@ int pgd_huge(pgd_t pgd)
100} 103}
101#endif 104#endif
102 105
103/*
104 * We have 4 cases for pgds and pmds:
105 * (1) invalid (all zeroes)
106 * (2) pointer to next table, as normal; bottom 6 bits == 0
107 * (3) leaf pte for huge page, bottom two bits != 00
108 * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
109 */
110pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
111{
112 pgd_t *pg;
113 pud_t *pu;
114 pmd_t *pm;
115 pte_t *ret_pte;
116 hugepd_t *hpdp = NULL;
117 unsigned pdshift = PGDIR_SHIFT;
118
119 if (shift)
120 *shift = 0;
121
122 pg = pgdir + pgd_index(ea);
123
124 if (pgd_huge(*pg)) {
125 ret_pte = (pte_t *) pg;
126 goto out;
127 } else if (is_hugepd(pg))
128 hpdp = (hugepd_t *)pg;
129 else if (!pgd_none(*pg)) {
130 pdshift = PUD_SHIFT;
131 pu = pud_offset(pg, ea);
132
133 if (pud_huge(*pu)) {
134 ret_pte = (pte_t *) pu;
135 goto out;
136 } else if (is_hugepd(pu))
137 hpdp = (hugepd_t *)pu;
138 else if (!pud_none(*pu)) {
139 pdshift = PMD_SHIFT;
140 pm = pmd_offset(pu, ea);
141
142 if (pmd_huge(*pm)) {
143 ret_pte = (pte_t *) pm;
144 goto out;
145 } else if (is_hugepd(pm))
146 hpdp = (hugepd_t *)pm;
147 else if (!pmd_none(*pm))
148 return pte_offset_kernel(pm, ea);
149 }
150 }
151 if (!hpdp)
152 return NULL;
153
154 ret_pte = hugepte_offset(hpdp, ea, pdshift);
155 pdshift = hugepd_shift(*hpdp);
156out:
157 if (shift)
158 *shift = pdshift;
159 return ret_pte;
160}
161EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
162
163pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 106pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
164{ 107{
108 /* Only called for hugetlbfs pages, hence can ignore THP */
165 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); 109 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
166} 110}
167 111
@@ -736,11 +680,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
736 struct page *page; 680 struct page *page;
737 unsigned shift; 681 unsigned shift;
738 unsigned long mask; 682 unsigned long mask;
739 683 /*
684 * Transparent hugepages are handled by generic code. We can skip them
685 * here.
686 */
740 ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); 687 ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
741 688
742 /* Verify it is a huge page else bail. */ 689 /* Verify it is a huge page else bail. */
743 if (!ptep || !shift) 690 if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
744 return ERR_PTR(-EINVAL); 691 return ERR_PTR(-EINVAL);
745 692
746 mask = (1UL << shift) - 1; 693 mask = (1UL << shift) - 1;
@@ -759,69 +706,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
759 return NULL; 706 return NULL;
760} 707}
761 708
762int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
763 unsigned long end, int write, struct page **pages, int *nr)
764{
765 unsigned long mask;
766 unsigned long pte_end;
767 struct page *head, *page, *tail;
768 pte_t pte;
769 int refs;
770
771 pte_end = (addr + sz) & ~(sz-1);
772 if (pte_end < end)
773 end = pte_end;
774
775 pte = *ptep;
776 mask = _PAGE_PRESENT | _PAGE_USER;
777 if (write)
778 mask |= _PAGE_RW;
779
780 if ((pte_val(pte) & mask) != mask)
781 return 0;
782
783 /* hugepages are never "special" */
784 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
785
786 refs = 0;
787 head = pte_page(pte);
788
789 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
790 tail = page;
791 do {
792 VM_BUG_ON(compound_head(page) != head);
793 pages[*nr] = page;
794 (*nr)++;
795 page++;
796 refs++;
797 } while (addr += PAGE_SIZE, addr != end);
798
799 if (!page_cache_add_speculative(head, refs)) {
800 *nr -= refs;
801 return 0;
802 }
803
804 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
805 /* Could be optimized better */
806 *nr -= refs;
807 while (refs--)
808 put_page(head);
809 return 0;
810 }
811
812 /*
813 * Any tail page need their mapcount reference taken before we
814 * return.
815 */
816 while (refs--) {
817 if (PageTail(tail))
818 get_huge_page_tail(tail);
819 tail++;
820 }
821
822 return 1;
823}
824
825static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, 709static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
826 unsigned long sz) 710 unsigned long sz)
827{ 711{
@@ -1038,3 +922,168 @@ void flush_dcache_icache_hugepage(struct page *page)
1038 } 922 }
1039 } 923 }
1040} 924}
925
926#endif /* CONFIG_HUGETLB_PAGE */
927
928/*
929 * We have 4 cases for pgds and pmds:
930 * (1) invalid (all zeroes)
931 * (2) pointer to next table, as normal; bottom 6 bits == 0
932 * (3) leaf pte for huge page, bottom two bits != 00
933 * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
934 *
935 * So long as we atomically load page table pointers we are safe against teardown,
936 * we can follow the address down to the the page and take a ref on it.
937 */
938
939pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
940{
941 pgd_t pgd, *pgdp;
942 pud_t pud, *pudp;
943 pmd_t pmd, *pmdp;
944 pte_t *ret_pte;
945 hugepd_t *hpdp = NULL;
946 unsigned pdshift = PGDIR_SHIFT;
947
948 if (shift)
949 *shift = 0;
950
951 pgdp = pgdir + pgd_index(ea);
952 pgd = ACCESS_ONCE(*pgdp);
953 /*
954 * Always operate on the local stack value. This make sure the
955 * value don't get updated by a parallel THP split/collapse,
956 * page fault or a page unmap. The return pte_t * is still not
957 * stable. So should be checked there for above conditions.
958 */
959 if (pgd_none(pgd))
960 return NULL;
961 else if (pgd_huge(pgd)) {
962 ret_pte = (pte_t *) pgdp;
963 goto out;
964 } else if (is_hugepd(&pgd))
965 hpdp = (hugepd_t *)&pgd;
966 else {
967 /*
968 * Even if we end up with an unmap, the pgtable will not
969 * be freed, because we do an rcu free and here we are
970 * irq disabled
971 */
972 pdshift = PUD_SHIFT;
973 pudp = pud_offset(&pgd, ea);
974 pud = ACCESS_ONCE(*pudp);
975
976 if (pud_none(pud))
977 return NULL;
978 else if (pud_huge(pud)) {
979 ret_pte = (pte_t *) pudp;
980 goto out;
981 } else if (is_hugepd(&pud))
982 hpdp = (hugepd_t *)&pud;
983 else {
984 pdshift = PMD_SHIFT;
985 pmdp = pmd_offset(&pud, ea);
986 pmd = ACCESS_ONCE(*pmdp);
987 /*
988 * A hugepage collapse is captured by pmd_none, because
989 * it mark the pmd none and do a hpte invalidate.
990 *
991 * A hugepage split is captured by pmd_trans_splitting
992 * because we mark the pmd trans splitting and do a
993 * hpte invalidate
994 *
995 */
996 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
997 return NULL;
998
999 if (pmd_huge(pmd) || pmd_large(pmd)) {
1000 ret_pte = (pte_t *) pmdp;
1001 goto out;
1002 } else if (is_hugepd(&pmd))
1003 hpdp = (hugepd_t *)&pmd;
1004 else
1005 return pte_offset_kernel(&pmd, ea);
1006 }
1007 }
1008 if (!hpdp)
1009 return NULL;
1010
1011 ret_pte = hugepte_offset(hpdp, ea, pdshift);
1012 pdshift = hugepd_shift(*hpdp);
1013out:
1014 if (shift)
1015 *shift = pdshift;
1016 return ret_pte;
1017}
1018EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
1019
1020int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
1021 unsigned long end, int write, struct page **pages, int *nr)
1022{
1023 unsigned long mask;
1024 unsigned long pte_end;
1025 struct page *head, *page, *tail;
1026 pte_t pte;
1027 int refs;
1028
1029 pte_end = (addr + sz) & ~(sz-1);
1030 if (pte_end < end)
1031 end = pte_end;
1032
1033 pte = ACCESS_ONCE(*ptep);
1034 mask = _PAGE_PRESENT | _PAGE_USER;
1035 if (write)
1036 mask |= _PAGE_RW;
1037
1038 if ((pte_val(pte) & mask) != mask)
1039 return 0;
1040
1041#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1042 /*
1043 * check for splitting here
1044 */
1045 if (pmd_trans_splitting(pte_pmd(pte)))
1046 return 0;
1047#endif
1048
1049 /* hugepages are never "special" */
1050 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1051
1052 refs = 0;
1053 head = pte_page(pte);
1054
1055 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
1056 tail = page;
1057 do {
1058 VM_BUG_ON(compound_head(page) != head);
1059 pages[*nr] = page;
1060 (*nr)++;
1061 page++;
1062 refs++;
1063 } while (addr += PAGE_SIZE, addr != end);
1064
1065 if (!page_cache_add_speculative(head, refs)) {
1066 *nr -= refs;
1067 return 0;
1068 }
1069
1070 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1071 /* Could be optimized better */
1072 *nr -= refs;
1073 while (refs--)
1074 put_page(head);
1075 return 0;
1076 }
1077
1078 /*
1079 * Any tail page need their mapcount reference taken before we
1080 * return.
1081 */
1082 while (refs--) {
1083 if (PageTail(tail))
1084 get_huge_page_tail(tail);
1085 tail++;
1086 }
1087
1088 return 1;
1089}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a90b9c458990..d0cd9e4c6837 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -88,7 +88,11 @@ static void pgd_ctor(void *addr)
 
 static void pmd_ctor(void *addr)
 {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	memset(addr, 0, PMD_TABLE_SIZE * 2);
+#else
 	memset(addr, 0, PMD_TABLE_SIZE);
+#endif
 }
 
 struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
@@ -137,10 +141,9 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
 void pgtable_cache_init(void)
 {
 	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
-	pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
-	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+	pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
 		panic("Couldn't allocate pgtable caches");
-
 	/* In all current configs, when the PUD index exists it's the
 	 * same size as either the pgd or pmd index.  Verify that the
 	 * initialization above has also created a PUD cache.  This
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 1cb1ea133a2c..7f4bea162026 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -461,6 +461,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		      pte_t *ptep)
 {
 #ifdef CONFIG_PPC_STD_MMU
+	/*
+	 * We don't need to worry about _PAGE_PRESENT here because we are
+	 * called with either mm->page_table_lock held or ptl lock held
+	 */
 	unsigned long access = 0, trap;
 
 	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap.c
index 67a42ed0d2fc..67a42ed0d2fc 100644
--- a/arch/powerpc/mm/mmap_64.c
+++ b/arch/powerpc/mm/mmap.c
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index e779642c25e5..af3d78e19302 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -112,8 +112,10 @@ static unsigned int steal_context_smp(unsigned int id)
 	 */
 	for_each_cpu(cpu, mm_cpumask(mm)) {
 		for (i = cpu_first_thread_sibling(cpu);
-		     i <= cpu_last_thread_sibling(cpu); i++)
-			__set_bit(id, stale_map[i]);
+		     i <= cpu_last_thread_sibling(cpu); i++) {
+			if (stale_map[i])
+				__set_bit(id, stale_map[i]);
+		}
 		cpu = i - 1;
 	}
 	return id;
@@ -272,7 +274,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
 		for (i = cpu_first_thread_sibling(cpu);
 		     i <= cpu_last_thread_sibling(cpu); i++) {
-			__clear_bit(id, stale_map[i]);
+			if (stale_map[i])
+				__clear_bit(id, stale_map[i]);
 		}
 	}
 
@@ -329,8 +332,8 @@ void destroy_context(struct mm_struct *mm)
 
 #ifdef CONFIG_SMP
 
-static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
-					    unsigned long action, void *hcpu)
+static int mmu_context_cpu_notify(struct notifier_block *self,
+				  unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned int)(long)hcpu;
 
@@ -363,7 +366,7 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata mmu_context_cpu_nb = {
+static struct notifier_block mmu_context_cpu_nb = {
 	.notifier_call	= mmu_context_cpu_notify,
 };
 
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 88c0425dc0a8..08397217e8ac 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -516,7 +516,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
  * Figure out to which domain a cpu belongs and stick it there.
  * Return the id of the domain used.
  */
-static int __cpuinit numa_setup_cpu(unsigned long lcpu)
+static int numa_setup_cpu(unsigned long lcpu)
 {
 	int nid = 0;
 	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
@@ -538,8 +538,7 @@ out:
 	return nid;
 }
 
-static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
-			     unsigned long action,
+static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
 			     void *hcpu)
 {
 	unsigned long lcpu = (unsigned long)hcpu;
@@ -919,7 +918,7 @@ static void __init *careful_zallocation(int nid, unsigned long size,
 	return ret;
 }
 
-static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+static struct notifier_block ppc64_numa_nb = {
 	.notifier_call = cpu_numa_callback,
 	.priority = 1 /* Must run before sched domains notifier. */
 };
@@ -1433,11 +1432,9 @@ static int update_cpu_topology(void *data)
 		if (cpu != update->cpu)
 			continue;
 
-		unregister_cpu_under_node(update->cpu, update->old_nid);
 		unmap_cpu_from_node(update->cpu);
 		map_cpu_to_node(update->cpu, update->new_nid);
 		vdso_getcpu_init();
-		register_cpu_under_node(update->cpu, update->new_nid);
 	}
 
 	return 0;
@@ -1485,6 +1482,9 @@ int arch_update_cpu_topology(void)
 	stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
 
 	for (ud = &updates[0]; ud; ud = ud->next) {
+		unregister_cpu_under_node(ud->cpu, ud->old_nid);
+		register_cpu_under_node(ud->cpu, ud->new_nid);
+
 		dev = get_cpu_device(ud->cpu);
 		if (dev)
 			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 214130a4edc6..edda589795c3 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -235,6 +235,14 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 	pud = pud_offset(pgd, addr);
 	BUG_ON(pud_none(*pud));
 	pmd = pmd_offset(pud, addr);
+	/*
+	 * khugepaged to collapse normal pages to hugepage, first set
+	 * pmd to none to force page fault/gup to take mmap_sem. After
+	 * pmd is set to none, we do a pte_clear which does this assertion
+	 * so if we find pmd none, return.
+	 */
+	if (pmd_none(*pmd))
+		return;
 	BUG_ON(!pmd_present(*pmd));
 	assert_spin_locked(pte_lockptr(mm, pmd));
 }
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index a854096e1023..536eec72c0f7 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -338,6 +338,19 @@ EXPORT_SYMBOL(iounmap);
338EXPORT_SYMBOL(__iounmap); 338EXPORT_SYMBOL(__iounmap);
339EXPORT_SYMBOL(__iounmap_at); 339EXPORT_SYMBOL(__iounmap_at);
340 340
341/*
342 * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags
343 * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
344 */
345struct page *pmd_page(pmd_t pmd)
346{
347#ifdef CONFIG_TRANSPARENT_HUGEPAGE
348 if (pmd_trans_huge(pmd))
349 return pfn_to_page(pmd_pfn(pmd));
350#endif
351 return virt_to_page(pmd_page_vaddr(pmd));
352}
353
341#ifdef CONFIG_PPC_64K_PAGES 354#ifdef CONFIG_PPC_64K_PAGES
342static pte_t *get_from_cache(struct mm_struct *mm) 355static pte_t *get_from_cache(struct mm_struct *mm)
343{ 356{
@@ -455,3 +468,404 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
455} 468}
456#endif 469#endif
457#endif /* CONFIG_PPC_64K_PAGES */ 470#endif /* CONFIG_PPC_64K_PAGES */
471
472#ifdef CONFIG_TRANSPARENT_HUGEPAGE
473
474/*
475 * This is called when relaxing access to a hugepage. It's also called in the page
476 * fault path when we don't hit any of the major fault cases, ie, a minor
477 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
478 * handled those two for us, we additionally deal with missing execute
479 * permission here on some processors
480 */
481int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
482 pmd_t *pmdp, pmd_t entry, int dirty)
483{
484 int changed;
485#ifdef CONFIG_DEBUG_VM
486 WARN_ON(!pmd_trans_huge(*pmdp));
487 assert_spin_locked(&vma->vm_mm->page_table_lock);
488#endif
489 changed = !pmd_same(*(pmdp), entry);
490 if (changed) {
491 __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
492 /*
493 * Since we are not supporting SW TLB systems, we don't
494 * have any thing similar to flush_tlb_page_nohash()
495 */
496 }
497 return changed;
498}
499
500unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
501 pmd_t *pmdp, unsigned long clr)
502{
503
504 unsigned long old, tmp;
505
506#ifdef CONFIG_DEBUG_VM
507 WARN_ON(!pmd_trans_huge(*pmdp));
508 assert_spin_locked(&mm->page_table_lock);
509#endif
510
511#ifdef PTE_ATOMIC_UPDATES
512 __asm__ __volatile__(
513 "1: ldarx %0,0,%3\n\
514 andi. %1,%0,%6\n\
515 bne- 1b \n\
516 andc %1,%0,%4 \n\
517 stdcx. %1,0,%3 \n\
518 bne- 1b"
519 : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
520 : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
521 : "cc" );
522#else
523 old = pmd_val(*pmdp);
524 *pmdp = __pmd(old & ~clr);
525#endif
526 if (old & _PAGE_HASHPTE)
527 hpte_do_hugepage_flush(mm, addr, pmdp);
528 return old;
529}
530
531pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
532 pmd_t *pmdp)
533{
534 pmd_t pmd;
535
536 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
537 if (pmd_trans_huge(*pmdp)) {
538 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
539 } else {
540 /*
541 * khugepaged calls this for normal pmd
542 */
543 pmd = *pmdp;
544 pmd_clear(pmdp);
545 /*
546 * Wait for all pending hash_page to finish. This is needed
547 * in case of subpage collapse. When we collapse normal pages
548 * to hugepage, we first clear the pmd, then invalidate all
549 * the PTE entries. The assumption here is that any low level
550 * page fault will see a none pmd and take the slow path that
551 * will wait on mmap_sem. But we could very well be in a
552 * hash_page with local ptep pointer value. Such a hash page
553 * can result in adding new HPTE entries for normal subpages.
554 * That means we could be modifying the page content as we
555 * copy them to a huge page. So wait for parallel hash_page
556 * to finish before invalidating HPTE entries. We can do this
557 * by sending an IPI to all the cpus and executing a dummy
558 * function there.
559 */
560 kick_all_cpus_sync();
561 /*
562 * Now invalidate the hpte entries in the range
 563 * covered by pmd. This makes sure we take a
 564 * fault and will find the pmd as none, which will
 565 * result in a major fault which takes mmap_sem and
 566 * hence waits for collapse to complete. Without this
567 * the __collapse_huge_page_copy can result in copying
568 * the old content.
569 */
570 flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
571 }
572 return pmd;
573}
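The comment above relies on kick_all_cpus_sync() acting as a barrier against any hash_page() still running with a stale ptep: once every CPU has taken an IPI, no CPU can still be inside such an interrupts-off section. A minimal sketch of that idea, assuming only the generic smp_call_function() API (the helper names here are hypothetical, not the kernel's implementation):

#include <linux/smp.h>

static void ipi_do_nothing(void *unused)
{
	/* Nothing to do; taking the IPI is the point. */
}

/*
 * Broadcast a no-op IPI and wait for every CPU to run it. Any CPU that
 * was still inside an interrupts-disabled hash_page() must have left it
 * before this returns, so it cannot insert HPTEs based on the PTEs we
 * are about to invalidate.
 */
static void wait_for_parallel_hash_page(void)
{
	smp_call_function(ipi_do_nothing, NULL, 1 /* wait for completion */);
}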
574
575int pmdp_test_and_clear_young(struct vm_area_struct *vma,
576 unsigned long address, pmd_t *pmdp)
577{
578 return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
579}
580
581/*
582 * We currently remove entries from the hashtable regardless of whether
583 * the entry was young or dirty. The generic routines only flush if the
 584 * entry was young or dirty, which is not good enough.
585 *
586 * We should be more intelligent about this but for the moment we override
587 * these functions and force a tlb flush unconditionally
588 */
589int pmdp_clear_flush_young(struct vm_area_struct *vma,
590 unsigned long address, pmd_t *pmdp)
591{
592 return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
593}
594
595/*
596 * We mark the pmd splitting and invalidate all the hpte
597 * entries for this hugepage.
598 */
599void pmdp_splitting_flush(struct vm_area_struct *vma,
600 unsigned long address, pmd_t *pmdp)
601{
602 unsigned long old, tmp;
603
604 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
605
606#ifdef CONFIG_DEBUG_VM
607 WARN_ON(!pmd_trans_huge(*pmdp));
608 assert_spin_locked(&vma->vm_mm->page_table_lock);
609#endif
610
611#ifdef PTE_ATOMIC_UPDATES
612
613 __asm__ __volatile__(
614 "1: ldarx %0,0,%3\n\
615 andi. %1,%0,%6\n\
616 bne- 1b \n\
617 ori %1,%0,%4 \n\
618 stdcx. %1,0,%3 \n\
619 bne- 1b"
620 : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
621 : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
622 : "cc" );
623#else
624 old = pmd_val(*pmdp);
625 *pmdp = __pmd(old | _PAGE_SPLITTING);
626#endif
627 /*
 628 * If we didn't have the splitting flag set, go and flush the
629 * HPTE entries.
630 */
631 if (!(old & _PAGE_SPLITTING)) {
632 /* We need to flush the hpte */
633 if (old & _PAGE_HASHPTE)
634 hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
635 }
636}
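The ldarx/stdcx. sequence above spins while _PAGE_BUSY is set and then atomically ORs in _PAGE_SPLITTING. A plain-C sketch of the same pattern, using GCC atomic builtins and illustrative bit values rather than the real defines (the kernel still needs the inline asm for the load-reserve/store-conditional semantics):

#define SKETCH_PAGE_BUSY	0x0800UL	/* illustrative, not the real define */
#define SKETCH_PAGE_SPLITTING	0x0001UL	/* illustrative, not the real define */

/* Spin while the entry is busy, then atomically set the splitting bit,
 * returning the previous value (mirrors the asm's "old"). */
static unsigned long pmd_set_splitting_sketch(unsigned long *pmdp)
{
	unsigned long old, new;

	for (;;) {
		old = __atomic_load_n(pmdp, __ATOMIC_RELAXED);
		if (old & SKETCH_PAGE_BUSY)
			continue;			/* "andi. ...; bne- 1b" */
		new = old | SKETCH_PAGE_SPLITTING;	/* "ori" */
		if (__atomic_compare_exchange_n(pmdp, &old, new, 0,
						__ATOMIC_RELAXED, __ATOMIC_RELAXED))
			return old;			/* "stdcx." succeeded */
	}
}

The clearing loop in pmd_hugepage_update() above is the same shape, with andc (clear bits) in place of ori.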
637
638/*
639 * We want to put the pgtable in pmd and use pgtable for tracking
640 * the base page size hptes
641 */
642void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
643 pgtable_t pgtable)
644{
645 pgtable_t *pgtable_slot;
646 assert_spin_locked(&mm->page_table_lock);
647 /*
 648 * We store the pgtable in the second half of the PMD page.
649 */
650 pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
651 *pgtable_slot = pgtable;
652 /*
 653 * Expose the deposited pgtable to other cpus before
 654 * we set the hugepage PTE at the pmd level; the
 655 * hash fault code looks at the deposited pgtable
 656 * to store hash index values.
657 */
658 smp_wmb();
659}
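The slot arithmetic above works because the PMD page reserves a second array of the same size right after the pmd entries: the deposit slot for pmd entry i lives PTRS_PER_PMD pointers further on. A small sketch of that indexing, with illustrative typedefs and a stand-in constant for the kernel types and config:

typedef unsigned long pmd_t;		/* stand-ins for the kernel types */
typedef void *pgtable_t;

#define PTRS_PER_PMD_SKETCH	256	/* illustrative, config dependent */

/* pmdp points at one pmd entry; because sizeof(pmd_t) == sizeof(pgtable_t),
 * stepping PTRS_PER_PMD elements forward lands on the deposit slot with the
 * same index in the second half of the PMD page. */
static pgtable_t *deposit_slot_sketch(pmd_t *pmdp)
{
	return (pgtable_t *)pmdp + PTRS_PER_PMD_SKETCH;
}

pgtable_trans_huge_withdraw() below reads the same slot back and clears it, so deposit and withdraw always pair up on the same pmd entry.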
660
661pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
662{
663 pgtable_t pgtable;
664 pgtable_t *pgtable_slot;
665
666 assert_spin_locked(&mm->page_table_lock);
667 pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
668 pgtable = *pgtable_slot;
669 /*
670 * Once we withdraw, mark the entry NULL.
671 */
672 *pgtable_slot = NULL;
673 /*
674 * We store HPTE information in the deposited PTE fragment.
 675 * Zero out the content on withdraw.
676 */
677 memset(pgtable, 0, PTE_FRAG_SIZE);
678 return pgtable;
679}
680
681/*
 682 * Set a new huge pmd. We should not be called for updating
683 * an existing pmd entry. That should go via pmd_hugepage_update.
684 */
685void set_pmd_at(struct mm_struct *mm, unsigned long addr,
686 pmd_t *pmdp, pmd_t pmd)
687{
688#ifdef CONFIG_DEBUG_VM
689 WARN_ON(!pmd_none(*pmdp));
690 assert_spin_locked(&mm->page_table_lock);
691 WARN_ON(!pmd_trans_huge(pmd));
692#endif
693 return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
694}
695
696void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
697 pmd_t *pmdp)
698{
699 pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT);
700}
701
702/*
703 * A linux hugepage PMD was changed and the corresponding hash table entries
 704 * need to be flushed.
705 */
706void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
707 pmd_t *pmdp)
708{
709 int ssize, i;
710 unsigned long s_addr;
711 int max_hpte_count;
712 unsigned int psize, valid;
713 unsigned char *hpte_slot_array;
714 unsigned long hidx, vpn, vsid, hash, shift, slot;
715
716 /*
717 * Flush all the hptes mapping this hugepage
718 */
719 s_addr = addr & HPAGE_PMD_MASK;
720 hpte_slot_array = get_hpte_slot_array(pmdp);
721 /*
 722 * If we try to do a HUGE PTE update after a withdraw is done,
 723 * we will find the below NULL. This happens when we do
 724 * split_huge_page_pmd().
725 */
726 if (!hpte_slot_array)
727 return;
728
729 /* get the base page size */
730 psize = get_slice_psize(mm, s_addr);
731
732 if (ppc_md.hugepage_invalidate)
733 return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
734 s_addr, psize);
735 /*
 736 * No bulk hpte removal support; invalidate each entry
737 */
738 shift = mmu_psize_defs[psize].shift;
739 max_hpte_count = HPAGE_PMD_SIZE >> shift;
740 for (i = 0; i < max_hpte_count; i++) {
741 /*
 742 * 8 bits per hpte entry:
743 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
744 */
745 valid = hpte_valid(hpte_slot_array, i);
746 if (!valid)
747 continue;
748 hidx = hpte_hash_index(hpte_slot_array, i);
749
750 /* get the vpn */
751 addr = s_addr + (i * (1ul << shift));
752 if (!is_kernel_addr(addr)) {
753 ssize = user_segment_size(addr);
754 vsid = get_vsid(mm->context.id, addr, ssize);
755 WARN_ON(vsid == 0);
756 } else {
757 vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
758 ssize = mmu_kernel_ssize;
759 }
760
761 vpn = hpt_vpn(addr, vsid, ssize);
762 hash = hpt_hash(vpn, shift, ssize);
763 if (hidx & _PTEIDX_SECONDARY)
764 hash = ~hash;
765
766 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
767 slot += hidx & _PTEIDX_GROUP_IX;
768 ppc_md.hpte_invalidate(slot, vpn, psize,
769 MMU_PAGE_16M, ssize, 0);
770 }
771}
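Given the per-subpage byte layout described in the loop above (a valid bit, a 3-bit hash-group index, and a secondary-group bit), the decoding the loop depends on can be sketched as follows; the helper names and exact masks are assumptions drawn from that comment, not the kernel's definitions:

#include <stdbool.h>

#define SLOT_VALID	0x01	/* bit 0: an HPTE exists for this subpage */
#define SLOT_GROUP_IX	0x07	/* after shifting: slot within the 8-entry group */
#define SLOT_SECONDARY	0x08	/* after shifting: entry is in the secondary group */

static bool slot_valid_sketch(const unsigned char *hpte_slot_array, int i)
{
	return hpte_slot_array[i] & SLOT_VALID;
}

static unsigned int slot_hidx_sketch(const unsigned char *hpte_slot_array, int i)
{
	/* Everything above the valid bit: secondary flag plus group index. */
	return hpte_slot_array[i] >> 1;
}

With hidx decoded, the loop above selects the primary or secondary bucket (hash vs. ~hash) and adds (hidx & _PTEIDX_GROUP_IX) to pick one of the eight slots in that hash group.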
772
773static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
774{
775 pmd_val(pmd) |= pgprot_val(pgprot);
776 return pmd;
777}
778
779pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
780{
781 pmd_t pmd;
782 /*
783 * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
 784 * set. We use this to check a THP page at the pmd level:
 785 * a leaf pte for a huge page has its bottom two bits != 00.
786 */
787 pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
788 pmd_val(pmd) |= _PAGE_THP_HUGE;
789 pmd = pmd_set_protbits(pmd, pgprot);
790 return pmd;
791}
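The "bottom two bits != 00" convention in the comment above is what lets pmd-level code tell a huge leaf entry from a pointer to a PTE page: a valid leaf always carries _PAGE_PRESENT or _PAGE_FILE in the low bits, while a table pointer is aligned so that both are clear. A tiny sketch of that check, with an illustrative mask rather than the kernel's macros:

/* A leaf (hugepage) entry has at least one of the two low software bits
 * set; a pointer to a PTE page has both clear because of its alignment.
 * The 0x3 mask illustrates the convention, it is not the real define. */
static int pmd_is_huge_leaf_sketch(unsigned long pmdval)
{
	return (pmdval & 0x3UL) != 0;
}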
792
793pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
794{
795 return pfn_pmd(page_to_pfn(page), pgprot);
796}
797
798pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
799{
800
801 pmd_val(pmd) &= _HPAGE_CHG_MASK;
802 pmd = pmd_set_protbits(pmd, newprot);
803 return pmd;
804}
805
806/*
807 * This is called at the end of handling a user page fault, when the
808 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
809 * We use it to preload an HPTE into the hash table corresponding to
810 * the updated linux HUGE PMD entry.
811 */
812void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
813 pmd_t *pmd)
814{
815 return;
816}
817
818pmd_t pmdp_get_and_clear(struct mm_struct *mm,
819 unsigned long addr, pmd_t *pmdp)
820{
821 pmd_t old_pmd;
822 pgtable_t pgtable;
823 unsigned long old;
824 pgtable_t *pgtable_slot;
825
826 old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
827 old_pmd = __pmd(old);
828 /*
829 * We have pmd == none and we are holding page_table_lock.
830 * So we can safely go and clear the pgtable hash
831 * index info.
832 */
833 pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
834 pgtable = *pgtable_slot;
835 /*
 836 * Let's zero out the old valid bit and hash index details
 837 * that the hash fault code looks at.
838 */
839 memset(pgtable, 0, PTE_FRAG_SIZE);
840 return old_pmd;
841}
842
843int has_transparent_hugepage(void)
844{
845 if (!mmu_has_feature(MMU_FTR_16M_PAGE))
846 return 0;
847 /*
848 * We support THP only if PMD_SIZE is 16MB.
849 */
850 if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
851 return 0;
852 /*
 853 * We need to make sure that we support 16MB hugepages in a segment
854 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
855 * of 64K.
856 */
857 /*
858 * If we have 64K HPTE, we will be using that by default
859 */
860 if (mmu_psize_defs[MMU_PAGE_64K].shift &&
861 (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
862 return 0;
863 /*
864 * Ok we only have 4K HPTE
865 */
866 if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
867 return 0;
868
869 return 1;
870}
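The PMD_SHIFT check above is really a check that one pmd entry spans exactly 16MB. A worked example of the arithmetic for the 64K base-page configuration this series targets (the PTE_INDEX_SIZE value is an assumption about that configuration, not read from the headers):

#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 16;	/* 64K base pages */
	const unsigned int pte_index_size = 8;	/* assumed: 256 PTEs per table */
	const unsigned int pmd_shift = page_shift + pte_index_size;

	/* 16 + 8 = 24, so PMD_SIZE = 1 << 24 = 16MB, matching
	 * mmu_psize_defs[MMU_PAGE_16M].shift and letting a single pmd
	 * entry map one 16M hugepage. */
	printf("PMD_SIZE = %lu MB\n", (1UL << pmd_shift) >> 20);
	return 0;
}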
871#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 7c415ddde948..aa74acb0fdfc 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -130,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
130 up_write(&mm->mmap_sem); 130 up_write(&mm->mmap_sem);
131} 131}
132 132
133#ifdef CONFIG_TRANSPARENT_HUGEPAGE
134static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
135 unsigned long end, struct mm_walk *walk)
136{
137 struct vm_area_struct *vma = walk->private;
138 split_huge_page_pmd(vma, addr, pmd);
139 return 0;
140}
141
142static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
143 unsigned long len)
144{
145 struct vm_area_struct *vma;
146 struct mm_walk subpage_proto_walk = {
147 .mm = mm,
148 .pmd_entry = subpage_walk_pmd_entry,
149 };
150
151 /*
 152 * We don't try too hard; we just mark all the vmas in that range
153 * VM_NOHUGEPAGE and split them.
154 */
155 vma = find_vma(mm, addr);
156 /*
 157 * If the range is in an unmapped region, just return
158 */
159 if (vma && ((addr + len) <= vma->vm_start))
160 return;
161
162 while (vma) {
163 if (vma->vm_start >= (addr + len))
164 break;
165 vma->vm_flags |= VM_NOHUGEPAGE;
166 subpage_proto_walk.private = vma;
167 walk_page_range(vma->vm_start, vma->vm_end,
168 &subpage_proto_walk);
169 vma = vma->vm_next;
170 }
171}
172#else
173static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
174 unsigned long len)
175{
176 return;
177}
178#endif
179
133/* 180/*
134 * Copy in a subpage protection map for an address range. 181 * Copy in a subpage protection map for an address range.
135 * The map has 2 bits per 4k subpage, so 32 bits per 64k page. 182 * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
@@ -168,6 +215,7 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
168 return -EFAULT; 215 return -EFAULT;
169 216
170 down_write(&mm->mmap_sem); 217 down_write(&mm->mmap_sem);
218 subpage_mark_vma_nohuge(mm, addr, len);
171 for (limit = addr + len; addr < limit; addr = next) { 219 for (limit = addr + len; addr < limit; addr = next) {
172 next = pmd_addr_end(addr, limit); 220 next = pmd_addr_end(addr, limit);
173 err = -ENOMEM; 221 err = -ENOMEM;
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 7df1c5edda87..36e44b4260eb 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -189,6 +189,7 @@ void tlb_flush(struct mmu_gather *tlb)
189void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, 189void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
190 unsigned long end) 190 unsigned long end)
191{ 191{
192 int hugepage_shift;
192 unsigned long flags; 193 unsigned long flags;
193 194
194 start = _ALIGN_DOWN(start, PAGE_SIZE); 195 start = _ALIGN_DOWN(start, PAGE_SIZE);
@@ -206,7 +207,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
206 local_irq_save(flags); 207 local_irq_save(flags);
207 arch_enter_lazy_mmu_mode(); 208 arch_enter_lazy_mmu_mode();
208 for (; start < end; start += PAGE_SIZE) { 209 for (; start < end; start += PAGE_SIZE) {
209 pte_t *ptep = find_linux_pte(mm->pgd, start); 210 pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
211 &hugepage_shift);
210 unsigned long pte; 212 unsigned long pte;
211 213
212 if (ptep == NULL) 214 if (ptep == NULL)
@@ -214,7 +216,37 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
214 pte = pte_val(*ptep); 216 pte = pte_val(*ptep);
215 if (!(pte & _PAGE_HASHPTE)) 217 if (!(pte & _PAGE_HASHPTE))
216 continue; 218 continue;
217 hpte_need_flush(mm, start, ptep, pte, 0); 219 if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
220 hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
221 else
222 hpte_need_flush(mm, start, ptep, pte, 0);
223 }
224 arch_leave_lazy_mmu_mode();
225 local_irq_restore(flags);
226}
227
228void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
229{
230 pte_t *pte;
231 pte_t *start_pte;
232 unsigned long flags;
233
234 addr = _ALIGN_DOWN(addr, PMD_SIZE);
235 /* Note: Normally, we should only ever use a batch within a
236 * PTE locked section. This violates the rule, but will work
 237 * since we don't actually modify the PTEs; we just flush the
238 * hash while leaving the PTEs intact (including their reference
239 * to being hashed). This is not the most performance oriented
240 * way to do things but is fine for our needs here.
241 */
242 local_irq_save(flags);
243 arch_enter_lazy_mmu_mode();
244 start_pte = pte_offset_map(pmd, addr);
245 for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
246 unsigned long pteval = pte_val(*pte);
247 if (pteval & _PAGE_HASHPTE)
248 hpte_need_flush(mm, addr, pte, pteval, 0);
249 addr += PAGE_SIZE;
218 } 250 }
219 arch_leave_lazy_mmu_mode(); 251 arch_leave_lazy_mmu_mode();
220 local_irq_restore(flags); 252 local_irq_restore(flags);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 6888cad5103d..41cd68dee681 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -648,7 +648,7 @@ void __init early_init_mmu(void)
648 __early_init_mmu(1); 648 __early_init_mmu(1);
649} 649}
650 650
651void __cpuinit early_init_mmu_secondary(void) 651void early_init_mmu_secondary(void)
652{ 652{
653 __early_init_mmu(0); 653 __early_init_mmu(0);
654} 654}