author     Sage Weil <sage@inktank.com>    2013-08-15 14:11:45 -0400
committer  Sage Weil <sage@inktank.com>    2013-08-15 14:11:45 -0400
commit     ee3e542fec6e69bc9fb668698889a37d93950ddf (patch)
tree       e74ee766a4764769ef1d3d45d266b4dea64101d3 /arch/powerpc/mm
parent     fe2a801b50c0bb8039d627e5ae1fec249d10ff39 (diff)
parent     f1d6e17f540af37bb1891480143669ba7636c4cf (diff)
Merge remote-tracking branch 'linus/master' into testing
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/44x_mmu.c                 6
-rw-r--r--  arch/powerpc/mm/Makefile                  8
-rw-r--r--  arch/powerpc/mm/gup.c                    18
-rw-r--r--  arch/powerpc/mm/hash_low_64.S            21
-rw-r--r--  arch/powerpc/mm/hash_native_64.c        207
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c          67
-rw-r--r--  arch/powerpc/mm/hugepage-hash64.c       175
-rw-r--r--  arch/powerpc/mm/hugetlbpage-hash64.c      2
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c           301
-rw-r--r--  arch/powerpc/mm/init_64.c                 9
-rw-r--r--  arch/powerpc/mm/mem.c                    63
-rw-r--r--  arch/powerpc/mm/mmap.c (renamed from arch/powerpc/mm/mmap_64.c)   2
-rw-r--r--  arch/powerpc/mm/mmu_context_nohash.c     15
-rw-r--r--  arch/powerpc/mm/numa.c                   71
-rw-r--r--  arch/powerpc/mm/pgtable.c                 8
-rw-r--r--  arch/powerpc/mm/pgtable_64.c            414
-rw-r--r--  arch/powerpc/mm/subpage-prot.c           48
-rw-r--r--  arch/powerpc/mm/tlb_hash64.c             40
-rw-r--r--  arch/powerpc/mm/tlb_nohash.c              2
19 files changed, 1139 insertions(+), 338 deletions(-)
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c
index 2c9441ee6bb8..82b1ff759e26 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/44x_mmu.c
@@ -41,7 +41,7 @@ int icache_44x_need_flush;
 
 unsigned long tlb_47x_boltmap[1024/8];
 
-static void __cpuinit ppc44x_update_tlb_hwater(void)
+static void ppc44x_update_tlb_hwater(void)
 {
 	extern unsigned int tlb_44x_patch_hwater_D[];
 	extern unsigned int tlb_44x_patch_hwater_I[];
@@ -134,7 +134,7 @@ static void __init ppc47x_update_boltmap(void)
 /*
  * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU
  */
-static void __cpuinit ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
+static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys)
 {
 	unsigned int rA;
 	int bolted;
@@ -229,7 +229,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 }
 
 #ifdef CONFIG_SMP
-void __cpuinit mmu_init_secondary(int cpu)
+void mmu_init_secondary(int cpu)
 {
 	unsigned long addr;
 	unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index cf16b5733eaa..51230ee6a407 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -6,17 +6,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
 
-obj-y				:= fault.o mem.o pgtable.o gup.o \
+obj-y				:= fault.o mem.o pgtable.o gup.o mmap.o \
 				   init_$(CONFIG_WORD_SIZE).o \
 				   pgtable_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 				   tlb_nohash_low.o
 obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(CONFIG_WORD_SIZE)e.o
-obj-$(CONFIG_PPC64)		+= mmap_64.o
 hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hash_utils_64.o \
 				   slb_low.o slb.o stab.o \
-				   mmap_64.o $(hash64-y)
+				   $(hash64-y)
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= hash_low_$(CONFIG_WORD_SIZE).o \
 				   tlb_hash$(CONFIG_WORD_SIZE).o \
@@ -28,11 +27,12 @@ obj-$(CONFIG_44x)		+= 44x_mmu.o
 obj-$(CONFIG_PPC_FSL_BOOK3E)	+= fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
-ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-y				+= hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)	+= hugetlbpage-book3e.o
 endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
 obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index 4b921affa495..49822d90ea96 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -34,7 +34,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 
 	ptep = pte_offset_kernel(&pmd, addr);
 	do {
-		pte_t pte = *ptep;
+		pte_t pte = ACCESS_ONCE(*ptep);
 		struct page *page;
 
 		if ((pte_val(pte) & mask) != result)
@@ -63,12 +63,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 
 	pmdp = pmd_offset(&pud, addr);
 	do {
-		pmd_t pmd = *pmdp;
+		pmd_t pmd = ACCESS_ONCE(*pmdp);
 
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(pmd))
+		/*
+		 * If we find a splitting transparent hugepage we
+		 * return zero. That will result in taking the slow
+		 * path which will call wait_split_huge_page()
+		 * if the pmd is still in splitting state
+		 */
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
-		if (pmd_huge(pmd)) {
+		if (pmd_huge(pmd) || pmd_large(pmd)) {
 			if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
 					 write, pages, nr))
 				return 0;
@@ -91,7 +97,7 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 
 	pudp = pud_offset(&pgd, addr);
 	do {
-		pud_t pud = *pudp;
+		pud_t pud = ACCESS_ONCE(*pudp);
 
 		next = pud_addr_end(addr, end);
 		if (pud_none(pud))
@@ -154,7 +160,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 
 	pgdp = pgd_offset(mm, addr);
 	do {
-		pgd_t pgd = *pgdp;
+		pgd_t pgd = ACCESS_ONCE(*pgdp);
 
 		pr_devel(" %016lx: normal pgd %p\n", addr,
 			 (void *)pgd_val(pgd));
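
Each gup_*_range() hunk above applies the same lockless-walk rule: snapshot the page-table entry once with ACCESS_ONCE(), test only that local copy, and give up (return 0, so the caller falls back to the slow get_user_pages() path) when a transparent hugepage is in the middle of a split. A condensed sketch of that rule, using a hypothetical helper name (illustration only, not part of the patch):

/* Sketch: classify one PMD the way gup_pmd_range() above does. */
static int classify_pmd(pmd_t *pmdp)
{
	pmd_t pmd = ACCESS_ONCE(*pmdp);	/* one snapshot; never re-read *pmdp */

	if (pmd_none(pmd) || pmd_trans_splitting(pmd))
		return 0;	/* bail out: slow path will wait_split_huge_page() */
	if (pmd_huge(pmd) || pmd_large(pmd))
		return 1;	/* leaf entry covering a huge page */
	return 2;		/* normal pointer to a PTE page: keep walking */
}
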
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index 0e980acae67c..d3cbda62857b 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -289,9 +289,10 @@ htab_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_4K		/* page size */
-	ld	r7,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r8,STK_PARAM(R8)(r1)	/* get "local" param */
+	li	r6,MMU_PAGE_4K		/* base page size */
+	li	r7,MMU_PAGE_4K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
 	bl	.			/* Patched by htab_finish_init() */
 
@@ -649,9 +650,10 @@ htab_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_4K		/* page size */
-	ld	r7,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r8,STK_PARAM(R8)(r1)	/* get "local" param */
+	li	r6,MMU_PAGE_4K		/* base page size */
+	li	r7,MMU_PAGE_4K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
 _GLOBAL(htab_call_hpte_updatepp)
 	bl	.			/* patched by htab_finish_init() */
 
@@ -937,9 +939,10 @@ ht64_modify_pte:
 
 	/* Call ppc_md.hpte_updatepp */
 	mr	r5,r29			/* vpn */
-	li	r6,MMU_PAGE_64K
-	ld	r7,STK_PARAM(R9)(r1)	/* segment size */
-	ld	r8,STK_PARAM(R8)(r1)	/* get "local" param */
+	li	r6,MMU_PAGE_64K		/* base page size */
+	li	r7,MMU_PAGE_64K		/* actual page size */
+	ld	r8,STK_PARAM(R9)(r1)	/* segment size */
+	ld	r9,STK_PARAM(R8)(r1)	/* get "local" param */
 _GLOBAL(ht64_call_hpte_updatepp)
 	bl	.			/* patched by htab_finish_init() */
 
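
The extra register loaded at each call site above mirrors the widened hpte_updatepp() hook: callers now pass a base page size and an actual page size separately, because the two differ once a 16M transparent hugepage is hashed in a segment whose base page size is 4K or 64K. For an ordinary page the two arguments are simply equal, as the C caller in hugetlbpage-hash64.c below does; a hedged sketch of such a call:

	/* Sketch: regular page, so base psize == actual psize. */
	ret = ppc_md.hpte_updatepp(slot, newpp, vpn,
				   MMU_PAGE_4K,	/* base page size   */
				   MMU_PAGE_4K,	/* actual page size */
				   ssize, local);
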
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 4c122c3f1623..c33d939120c9 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -43,6 +43,7 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
 {
 	unsigned long va;
 	unsigned int penc;
+	unsigned long sllp;
 
 	/*
 	 * We need 14 to 65 bits of va for a tlibe of 4K page
@@ -64,7 +65,9 @@ static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize)
 		/* clear out bits after (52) [0....52.....63] */
 		va &= ~((1ul << (64 - 52)) - 1);
 		va |= ssize << 8;
-		va |= mmu_psize_defs[apsize].sllp << 6;
+		sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
+			((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+		va |= sllp << 5;
 		asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2)
 			     : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206)
 			     : "memory");
@@ -98,6 +101,7 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
 {
 	unsigned long va;
 	unsigned int penc;
+	unsigned long sllp;
 
 	/* VPN_SHIFT can be atmost 12 */
 	va = vpn << VPN_SHIFT;
@@ -113,7 +117,9 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize)
 		/* clear out bits after(52) [0....52.....63] */
 		va &= ~((1ul << (64 - 52)) - 1);
 		va |= ssize << 8;
-		va |= mmu_psize_defs[apsize].sllp << 6;
+		sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) |
+			((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4);
+		va |= sllp << 5;
 		asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)"
 			     : : "r"(va) : "memory");
 		break;
@@ -273,61 +279,15 @@ static long native_hpte_remove(unsigned long hpte_group)
 	return i;
 }
 
-static inline int __hpte_actual_psize(unsigned int lp, int psize)
-{
-	int i, shift;
-	unsigned int mask;
-
-	/* start from 1 ignoring MMU_PAGE_4K */
-	for (i = 1; i < MMU_PAGE_COUNT; i++) {
-
-		/* invalid penc */
-		if (mmu_psize_defs[psize].penc[i] == -1)
-			continue;
-		/*
-		 * encoding bits per actual page size
-		 *        PTE LP     actual page size
-		 *    rrrr rrrz		>=8KB
-		 *    rrrr rrzz		>=16KB
-		 *    rrrr rzzz		>=32KB
-		 *    rrrr zzzz		>=64KB
-		 *	.......
-		 */
-		shift = mmu_psize_defs[i].shift - LP_SHIFT;
-		if (shift > LP_BITS)
-			shift = LP_BITS;
-		mask = (1 << shift) - 1;
-		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
-			return i;
-	}
-	return -1;
-}
-
-static inline int hpte_actual_psize(struct hash_pte *hptep, int psize)
-{
-	/* Look at the 8 bit LP value */
-	unsigned int lp = (hptep->r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
-
-	if (!(hptep->v & HPTE_V_VALID))
-		return -1;
-
-	/* First check if it is large page */
-	if (!(hptep->v & HPTE_V_LARGE))
-		return MMU_PAGE_4K;
-
-	return __hpte_actual_psize(lp, psize);
-}
-
 static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
-				 unsigned long vpn, int psize, int ssize,
-				 int local)
+				 unsigned long vpn, int bpsize,
+				 int apsize, int ssize, int local)
 {
 	struct hash_pte *hptep = htab_address + slot;
 	unsigned long hpte_v, want_v;
 	int ret = 0;
-	int actual_psize;
 
-	want_v = hpte_encode_avpn(vpn, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, bpsize, ssize);
 
 	DBG_LOW("	update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)",
 		vpn, want_v & HPTE_V_AVPN, slot, newpp);
@@ -335,7 +295,6 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	native_lock_hpte(hptep);
 
 	hpte_v = hptep->v;
-	actual_psize = hpte_actual_psize(hptep, psize);
 	/*
 	 * We need to invalidate the TLB always because hpte_remove doesn't do
 	 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -343,12 +302,7 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	 * (hpte_remove) because we assume the old translation is still
 	 * technically "valid".
 	 */
-	if (actual_psize < 0) {
-		actual_psize = psize;
-		ret = -1;
-		goto err_out;
-	}
-	if (!HPTE_V_COMPARE(hpte_v, want_v)) {
+	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) {
 		DBG_LOW(" -> miss\n");
 		ret = -1;
 	} else {
@@ -357,11 +311,10 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
 		hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
 			(newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_C));
 	}
-err_out:
 	native_unlock_hpte(hptep);
 
 	/* Ensure it is out of the tlb too. */
-	tlbie(vpn, psize, actual_psize, ssize, local);
+	tlbie(vpn, bpsize, apsize, ssize, local);
 
 	return ret;
 }
@@ -402,7 +355,6 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
 static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 				       int psize, int ssize)
 {
-	int actual_psize;
 	unsigned long vpn;
 	unsigned long vsid;
 	long slot;
@@ -415,36 +367,33 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
 	if (slot == -1)
 		panic("could not find page to bolt\n");
 	hptep = htab_address + slot;
-	actual_psize = hpte_actual_psize(hptep, psize);
-	if (actual_psize < 0)
-		actual_psize = psize;
 
 	/* Update the HPTE */
 	hptep->r = (hptep->r & ~(HPTE_R_PP | HPTE_R_N)) |
 		(newpp & (HPTE_R_PP | HPTE_R_N));
-
-	/* Ensure it is out of the tlb too. */
-	tlbie(vpn, psize, actual_psize, ssize, 0);
+	/*
+	 * Ensure it is out of the tlb too. Bolted entries base and
+	 * actual page size will be same.
+	 */
+	tlbie(vpn, psize, psize, ssize, 0);
 }
 
 static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
-				   int psize, int ssize, int local)
+				   int bpsize, int apsize, int ssize, int local)
 {
 	struct hash_pte *hptep = htab_address + slot;
 	unsigned long hpte_v;
 	unsigned long want_v;
 	unsigned long flags;
-	int actual_psize;
 
 	local_irq_save(flags);
 
 	DBG_LOW("	invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot);
 
-	want_v = hpte_encode_avpn(vpn, psize, ssize);
+	want_v = hpte_encode_avpn(vpn, bpsize, ssize);
 	native_lock_hpte(hptep);
 	hpte_v = hptep->v;
 
-	actual_psize = hpte_actual_psize(hptep, psize);
 	/*
 	 * We need to invalidate the TLB always because hpte_remove doesn't do
 	 * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -452,23 +401,120 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
 	 * (hpte_remove) because we assume the old translation is still
 	 * technically "valid".
 	 */
-	if (actual_psize < 0) {
-		actual_psize = psize;
-		native_unlock_hpte(hptep);
-		goto err_out;
-	}
-	if (!HPTE_V_COMPARE(hpte_v, want_v))
+	if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
 		native_unlock_hpte(hptep);
 	else
 		/* Invalidate the hpte. NOTE: this also unlocks it */
 		hptep->v = 0;
 
-err_out:
 	/* Invalidate the TLB */
-	tlbie(vpn, psize, actual_psize, ssize, local);
+	tlbie(vpn, bpsize, apsize, ssize, local);
+
+	local_irq_restore(flags);
+}
+
+static void native_hugepage_invalidate(struct mm_struct *mm,
+				       unsigned char *hpte_slot_array,
+				       unsigned long addr, int psize)
+{
+	int ssize = 0, i;
+	int lock_tlbie;
+	struct hash_pte *hptep;
+	int actual_psize = MMU_PAGE_16M;
+	unsigned int max_hpte_count, valid;
+	unsigned long flags, s_addr = addr;
+	unsigned long hpte_v, want_v, shift;
+	unsigned long hidx, vpn = 0, vsid, hash, slot;
+
+	shift = mmu_psize_defs[psize].shift;
+	max_hpte_count = 1U << (PMD_SHIFT - shift);
+
+	local_irq_save(flags);
+	for (i = 0; i < max_hpte_count; i++) {
+		valid = hpte_valid(hpte_slot_array, i);
+		if (!valid)
+			continue;
+		hidx = hpte_hash_index(hpte_slot_array, i);
+
+		/* get the vpn */
+		addr = s_addr + (i * (1ul << shift));
+		if (!is_kernel_addr(addr)) {
+			ssize = user_segment_size(addr);
+			vsid = get_vsid(mm->context.id, addr, ssize);
+			WARN_ON(vsid == 0);
+		} else {
+			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+			ssize = mmu_kernel_ssize;
+		}
+
+		vpn = hpt_vpn(addr, vsid, ssize);
+		hash = hpt_hash(vpn, shift, ssize);
+		if (hidx & _PTEIDX_SECONDARY)
+			hash = ~hash;
+
+		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+		slot += hidx & _PTEIDX_GROUP_IX;
+
+		hptep = htab_address + slot;
+		want_v = hpte_encode_avpn(vpn, psize, ssize);
+		native_lock_hpte(hptep);
+		hpte_v = hptep->v;
+
+		/* Even if we miss, we need to invalidate the TLB */
+		if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+			native_unlock_hpte(hptep);
+		else
+			/* Invalidate the hpte. NOTE: this also unlocks it */
+			hptep->v = 0;
+	}
+	/*
+	 * Since this is a hugepage, we just need a single tlbie.
+	 * use the last vpn.
+	 */
+	lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+	if (lock_tlbie)
+		raw_spin_lock(&native_tlbie_lock);
+
+	asm volatile("ptesync":::"memory");
+	__tlbie(vpn, psize, actual_psize, ssize);
+	asm volatile("eieio; tlbsync; ptesync":::"memory");
+
+	if (lock_tlbie)
+		raw_spin_unlock(&native_tlbie_lock);
+
 	local_irq_restore(flags);
 }
 
+static inline int __hpte_actual_psize(unsigned int lp, int psize)
+{
+	int i, shift;
+	unsigned int mask;
+
+	/* start from 1 ignoring MMU_PAGE_4K */
+	for (i = 1; i < MMU_PAGE_COUNT; i++) {
+
+		/* invalid penc */
+		if (mmu_psize_defs[psize].penc[i] == -1)
+			continue;
+		/*
+		 * encoding bits per actual page size
+		 *        PTE LP     actual page size
+		 *    rrrr rrrz		>=8KB
+		 *    rrrr rrzz		>=16KB
+		 *    rrrr rzzz		>=32KB
+		 *    rrrr zzzz		>=64KB
+		 *	.......
+		 */
+		shift = mmu_psize_defs[i].shift - LP_SHIFT;
+		if (shift > LP_BITS)
+			shift = LP_BITS;
+		mask = (1 << shift) - 1;
+		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+			return i;
+	}
+	return -1;
+}
+
 static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 			int *psize, int *apsize, int *ssize, unsigned long *vpn)
 {
@@ -514,6 +560,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 			seg_off |= vpi << shift;
 		}
 		*vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+		break;
 	case MMU_SEGSIZE_1T:
 		/* We only have 40 - 23 bits of seg_off in avpn */
 		seg_off = (avpn & 0x1ffff) << 23;
@@ -523,6 +570,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
 			seg_off |= vpi << shift;
 		}
 		*vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT;
+		break;
 	default:
 		*vpn = size = 0;
 	}
@@ -672,4 +720,5 @@ void __init hpte_init_native(void)
 	ppc_md.hpte_remove	= native_hpte_remove;
 	ppc_md.hpte_clear_all	= native_hpte_clear;
 	ppc_md.flush_hash_range = native_flush_hash_range;
+	ppc_md.hugepage_invalidate   = native_hugepage_invalidate;
 }
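
__hpte_actual_psize(), re-added above next to native_hugepage_invalidate(), recovers the actual page size from the LP field of the HPTE: for each candidate size it compares only as many low LP bits as that size actually encodes. A worked example, assuming the LP_SHIFT = 12 and LP_BITS = 8 values defined earlier in this file (those defines are not shown in the hunk, so treat the numbers as an assumption):

/* MMU_PAGE_64K: shift = 16 - LP_SHIFT = 4, mask = 0xf
 *	-> matches if (lp & 0xf) == mmu_psize_defs[psize].penc[MMU_PAGE_64K]
 * MMU_PAGE_16M: shift = 24 - LP_SHIFT = 12, clamped to LP_BITS = 8, mask = 0xff
 *	-> matches if (lp & 0xff) == mmu_psize_defs[psize].penc[MMU_PAGE_16M]
 */
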
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index e303a6d74e3a..6ecc38bd5b24 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -807,7 +807,7 @@ void __init early_init_mmu(void)
 }
 
 #ifdef CONFIG_SMP
-void __cpuinit early_init_mmu_secondary(void)
+void early_init_mmu_secondary(void)
 {
 	/* Initialize hash table for that CPU */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
@@ -1050,13 +1050,26 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		goto bail;
 	}
 
-#ifdef CONFIG_HUGETLB_PAGE
 	if (hugeshift) {
-		rc = __hash_page_huge(ea, access, vsid, ptep, trap, local,
-					ssize, hugeshift, psize);
+		if (pmd_trans_huge(*(pmd_t *)ptep))
+			rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
+					     trap, local, ssize, psize);
+#ifdef CONFIG_HUGETLB_PAGE
+		else
+			rc = __hash_page_huge(ea, access, vsid, ptep, trap,
+					      local, ssize, hugeshift, psize);
+#else
+		else {
+			/*
+			 * if we have hugeshift, and is not transhuge with
+			 * hugetlb disabled, something is really wrong.
+			 */
+			rc = 1;
+			WARN_ON(1);
+		}
+#endif
 		goto bail;
 	}
-#endif /* CONFIG_HUGETLB_PAGE */
 
 #ifndef CONFIG_PPC_64K_PAGES
 	DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
@@ -1145,6 +1158,7 @@ EXPORT_SYMBOL_GPL(hash_page);
 void hash_preload(struct mm_struct *mm, unsigned long ea,
 		  unsigned long access, unsigned long trap)
 {
+	int hugepage_shift;
 	unsigned long vsid;
 	pgd_t *pgdir;
 	pte_t *ptep;
@@ -1166,10 +1180,27 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	pgdir = mm->pgd;
 	if (pgdir == NULL)
 		return;
-	ptep = find_linux_pte(pgdir, ea);
-	if (!ptep)
+
+	/* Get VSID */
+	ssize = user_segment_size(ea);
+	vsid = get_vsid(mm->context.id, ea, ssize);
+	if (!vsid)
 		return;
+	/*
+	 * Hash doesn't like irqs. Walking linux page table with irq disabled
+	 * saves us from holding multiple locks.
+	 */
+	local_irq_save(flags);
+
+	/*
+	 * THP pages use update_mmu_cache_pmd. We don't do
+	 * hash preload there. Hence can ignore THP here
+	 */
+	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
+	if (!ptep)
+		goto out_exit;
 
+	WARN_ON(hugepage_shift);
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
 	 * a 64K kernel), then we don't preload, hash_page() will take
@@ -1178,18 +1209,9 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	 * page size demotion here
 	 */
 	if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
-		return;
+		goto out_exit;
 #endif /* CONFIG_PPC_64K_PAGES */
 
-	/* Get VSID */
-	ssize = user_segment_size(ea);
-	vsid = get_vsid(mm->context.id, ea, ssize);
-	if (!vsid)
-		return;
-
-	/* Hash doesn't like irqs */
-	local_irq_save(flags);
-
 	/* Is that local to this CPU ? */
 	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
 		local = 1;
@@ -1211,7 +1233,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 				    mm->context.user_psize,
 				    mm->context.user_psize,
 				    pte_val(*ptep));
-
+out_exit:
 	local_irq_restore(flags);
 }
 
@@ -1232,7 +1254,11 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 		slot += hidx & _PTEIDX_GROUP_IX;
 		DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
-		ppc_md.hpte_invalidate(slot, vpn, psize, ssize, local);
+		/*
+		 * We use same base page size and actual psize, because we don't
+		 * use these functions for hugepage
+		 */
+		ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local);
 	} pte_iterate_hashed_end();
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -1365,7 +1391,8 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
 		hash = ~hash;
 	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 	slot += hidx & _PTEIDX_GROUP_IX;
-	ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_kernel_ssize, 0);
+	ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize,
+			       mmu_kernel_ssize, 0);
 }
 
 void kernel_map_pages(struct page *page, int numpages, int enable)
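
The reworked hash_preload() above boils down to: compute the VSID first, then walk the Linux page table with interrupts disabled (so RCU-freed page-table pages cannot vanish mid-walk), and leave through a single out_exit label that restores interrupts. Condensed shape of that flow (sketch, not a verbatim copy of the function):

	local_irq_save(flags);
	/* THP faults preload via update_mmu_cache_pmd(), so no hugepage here */
	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
	if (!ptep)
		goto out_exit;
	WARN_ON(hugepage_shift);
	/* ... demote/hash the pte as before ... */
out_exit:
	local_irq_restore(flags);
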
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
new file mode 100644
index 000000000000..34de9e0cdc34
--- /dev/null
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -0,0 +1,175 @@
1/*
2 * Copyright IBM Corporation, 2013
3 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 *
13 */
14
15/*
16 * PPC64 THP Support for hash based MMUs
17 */
18#include <linux/mm.h>
19#include <asm/machdep.h>
20
21int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
22 pmd_t *pmdp, unsigned long trap, int local, int ssize,
23 unsigned int psize)
24{
25 unsigned int index, valid;
26 unsigned char *hpte_slot_array;
27 unsigned long rflags, pa, hidx;
28 unsigned long old_pmd, new_pmd;
29 int ret, lpsize = MMU_PAGE_16M;
30 unsigned long vpn, hash, shift, slot;
31
32 /*
33 * atomically mark the linux large page PMD busy and dirty
34 */
35 do {
36 old_pmd = pmd_val(*pmdp);
37 /* If PMD busy, retry the access */
38 if (unlikely(old_pmd & _PAGE_BUSY))
39 return 0;
40 /* If PMD is trans splitting retry the access */
41 if (unlikely(old_pmd & _PAGE_SPLITTING))
42 return 0;
43 /* If PMD permissions don't match, take page fault */
44 if (unlikely(access & ~old_pmd))
45 return 1;
46 /*
47 * Try to lock the PTE, add ACCESSED and DIRTY if it was
48 * a write access
49 */
50 new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
51 if (access & _PAGE_RW)
52 new_pmd |= _PAGE_DIRTY;
53 } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp,
54 old_pmd, new_pmd));
55 /*
56 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
57 * need to add in 0x1 if it's a read-only user page
58 */
59 rflags = new_pmd & _PAGE_USER;
60 if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) &&
61 (new_pmd & _PAGE_DIRTY)))
62 rflags |= 0x1;
63 /*
64 * _PAGE_EXEC -> HW_NO_EXEC since it's inverted
65 */
66 rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N);
67
68#if 0
69 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
70
71 /*
72 * No CPU has hugepages but lacks no execute, so we
73 * don't need to worry about that case
74 */
75 rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
76 }
77#endif
78 /*
79 * Find the slot index details for this ea, using base page size.
80 */
81 shift = mmu_psize_defs[psize].shift;
82 index = (ea & ~HPAGE_PMD_MASK) >> shift;
83 BUG_ON(index >= 4096);
84
85 vpn = hpt_vpn(ea, vsid, ssize);
86 hash = hpt_hash(vpn, shift, ssize);
87 hpte_slot_array = get_hpte_slot_array(pmdp);
88
89 valid = hpte_valid(hpte_slot_array, index);
90 if (valid) {
91 /* update the hpte bits */
92 hidx = hpte_hash_index(hpte_slot_array, index);
93 if (hidx & _PTEIDX_SECONDARY)
94 hash = ~hash;
95 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
96 slot += hidx & _PTEIDX_GROUP_IX;
97
98 ret = ppc_md.hpte_updatepp(slot, rflags, vpn,
99 psize, lpsize, ssize, local);
100 /*
101 * We failed to update, try to insert a new entry.
102 */
103 if (ret == -1) {
104 /*
105 * large pte is marked busy, so we can be sure
106 * nobody is looking at hpte_slot_array. hence we can
107 * safely update this here.
108 */
109 valid = 0;
110 new_pmd &= ~_PAGE_HPTEFLAGS;
111 hpte_slot_array[index] = 0;
112 } else
113 /* clear the busy bits and set the hash pte bits */
114 new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
115 }
116
117 if (!valid) {
118 unsigned long hpte_group;
119
120 /* insert new entry */
121 pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
122repeat:
123 hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
124
125 /* clear the busy bits and set the hash pte bits */
126 new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
127
128 /* Add in WIMG bits */
129 rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
130 _PAGE_COHERENT | _PAGE_GUARDED));
131
132 /* Insert into the hash table, primary slot */
133 slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
134 psize, lpsize, ssize);
135 /*
136 * Primary is full, try the secondary
137 */
138 if (unlikely(slot == -1)) {
139 hpte_group = ((~hash & htab_hash_mask) *
140 HPTES_PER_GROUP) & ~0x7UL;
141 slot = ppc_md.hpte_insert(hpte_group, vpn, pa,
142 rflags, HPTE_V_SECONDARY,
143 psize, lpsize, ssize);
144 if (slot == -1) {
145 if (mftb() & 0x1)
146 hpte_group = ((hash & htab_hash_mask) *
147 HPTES_PER_GROUP) & ~0x7UL;
148
149 ppc_md.hpte_remove(hpte_group);
150 goto repeat;
151 }
152 }
153 /*
154 * Hypervisor failure. Restore old pmd and return -1
155 * similar to __hash_page_*
156 */
157 if (unlikely(slot == -2)) {
158 *pmdp = __pmd(old_pmd);
159 hash_failure_debug(ea, access, vsid, trap, ssize,
160 psize, lpsize, old_pmd);
161 return -1;
162 }
163 /*
164 * large pte is marked busy, so we can be sure
165 * nobody is looking at hpte_slot_array. hence we can
166 * safely update this here.
167 */
168 mark_hpte_slot_valid(hpte_slot_array, index, slot);
169 }
170 /*
171 * No need to use ldarx/stdcx here
172 */
173 *pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
174 return 0;
175}
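
__hash_page_thp() above serializes against other users of the same 16M PMD by taking a software lock in the PMD itself: a cmpxchg loop sets _PAGE_BUSY (plus ACCESSED, and DIRTY on a write) before the per-PMD hpte_slot_array is touched, and backs off if the PMD is already busy or splitting. Stripped to that core step (sketch of the loop above):

	do {
		old_pmd = pmd_val(*pmdp);
		if (old_pmd & (_PAGE_BUSY | _PAGE_SPLITTING))
			return 0;		/* owned elsewhere: retry the access */
		if (access & ~old_pmd)
			return 1;		/* permissions don't match: take a page fault */
		new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED;
		if (access & _PAGE_RW)
			new_pmd |= _PAGE_DIRTY;
	} while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp, old_pmd, new_pmd));
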
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 0f1d94a1fb82..0b7fb6761015 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -81,7 +81,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		slot += (old_pte & _PAGE_F_GIX) >> 12;
 
 		if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
-					 ssize, local) == -1)
+					 mmu_psize, ssize, local) == -1)
 			old_pte &= ~_PAGE_HPTEFLAGS;
 	}
 
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 77fdd2cef33b..834ca8eb38f2 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -21,6 +21,9 @@
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/setup.h>
+#include <asm/hugetlb.h>
+
+#ifdef CONFIG_HUGETLB_PAGE
 
 #define PAGE_SHIFT_64K	16
 #define PAGE_SHIFT_16M	24
@@ -100,68 +103,9 @@ int pgd_huge(pgd_t pgd)
100} 103}
101#endif 104#endif
102 105
103/*
104 * We have 4 cases for pgds and pmds:
105 * (1) invalid (all zeroes)
106 * (2) pointer to next table, as normal; bottom 6 bits == 0
107 * (3) leaf pte for huge page, bottom two bits != 00
108 * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
109 */
110pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
111{
112 pgd_t *pg;
113 pud_t *pu;
114 pmd_t *pm;
115 pte_t *ret_pte;
116 hugepd_t *hpdp = NULL;
117 unsigned pdshift = PGDIR_SHIFT;
118
119 if (shift)
120 *shift = 0;
121
122 pg = pgdir + pgd_index(ea);
123
124 if (pgd_huge(*pg)) {
125 ret_pte = (pte_t *) pg;
126 goto out;
127 } else if (is_hugepd(pg))
128 hpdp = (hugepd_t *)pg;
129 else if (!pgd_none(*pg)) {
130 pdshift = PUD_SHIFT;
131 pu = pud_offset(pg, ea);
132
133 if (pud_huge(*pu)) {
134 ret_pte = (pte_t *) pu;
135 goto out;
136 } else if (is_hugepd(pu))
137 hpdp = (hugepd_t *)pu;
138 else if (!pud_none(*pu)) {
139 pdshift = PMD_SHIFT;
140 pm = pmd_offset(pu, ea);
141
142 if (pmd_huge(*pm)) {
143 ret_pte = (pte_t *) pm;
144 goto out;
145 } else if (is_hugepd(pm))
146 hpdp = (hugepd_t *)pm;
147 else if (!pmd_none(*pm))
148 return pte_offset_kernel(pm, ea);
149 }
150 }
151 if (!hpdp)
152 return NULL;
153
154 ret_pte = hugepte_offset(hpdp, ea, pdshift);
155 pdshift = hugepd_shift(*hpdp);
156out:
157 if (shift)
158 *shift = pdshift;
159 return ret_pte;
160}
161EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
162
163pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 106pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
164{ 107{
108 /* Only called for hugetlbfs pages, hence can ignore THP */
165 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); 109 return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
166} 110}
167 111
@@ -357,7 +301,7 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 int alloc_bootmem_huge_page(struct hstate *hstate)
 {
 	struct huge_bootmem_page *m;
-	int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
+	int idx = shift_to_mmu_psize(huge_page_shift(hstate));
 	int nr_gpages = gpage_freearray[idx].nr_gpages;
 
 	if (nr_gpages == 0)
@@ -736,11 +680,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 	struct page *page;
 	unsigned shift;
 	unsigned long mask;
-
+	/*
+	 * Transparent hugepages are handled by generic code. We can skip them
+	 * here.
+	 */
 	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 
 	/* Verify it is a huge page else bail. */
-	if (!ptep || !shift)
+	if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
 		return ERR_PTR(-EINVAL);
 
 	mask = (1UL << shift) - 1;
@@ -759,69 +706,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
759 return NULL; 706 return NULL;
760} 707}
761 708
762int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
763 unsigned long end, int write, struct page **pages, int *nr)
764{
765 unsigned long mask;
766 unsigned long pte_end;
767 struct page *head, *page, *tail;
768 pte_t pte;
769 int refs;
770
771 pte_end = (addr + sz) & ~(sz-1);
772 if (pte_end < end)
773 end = pte_end;
774
775 pte = *ptep;
776 mask = _PAGE_PRESENT | _PAGE_USER;
777 if (write)
778 mask |= _PAGE_RW;
779
780 if ((pte_val(pte) & mask) != mask)
781 return 0;
782
783 /* hugepages are never "special" */
784 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
785
786 refs = 0;
787 head = pte_page(pte);
788
789 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
790 tail = page;
791 do {
792 VM_BUG_ON(compound_head(page) != head);
793 pages[*nr] = page;
794 (*nr)++;
795 page++;
796 refs++;
797 } while (addr += PAGE_SIZE, addr != end);
798
799 if (!page_cache_add_speculative(head, refs)) {
800 *nr -= refs;
801 return 0;
802 }
803
804 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
805 /* Could be optimized better */
806 *nr -= refs;
807 while (refs--)
808 put_page(head);
809 return 0;
810 }
811
812 /*
813 * Any tail page need their mapcount reference taken before we
814 * return.
815 */
816 while (refs--) {
817 if (PageTail(tail))
818 get_huge_page_tail(tail);
819 tail++;
820 }
821
822 return 1;
823}
824
825static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, 709static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
826 unsigned long sz) 710 unsigned long sz)
827{ 711{
@@ -1038,3 +922,168 @@ void flush_dcache_icache_hugepage(struct page *page)
1038 } 922 }
1039 } 923 }
1040} 924}
925
926#endif /* CONFIG_HUGETLB_PAGE */
927
928/*
929 * We have 4 cases for pgds and pmds:
930 * (1) invalid (all zeroes)
931 * (2) pointer to next table, as normal; bottom 6 bits == 0
932 * (3) leaf pte for huge page, bottom two bits != 00
933 * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
934 *
935 * So long as we atomically load page table pointers we are safe against teardown,
936 * we can follow the address down to the the page and take a ref on it.
937 */
938
939pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
940{
941 pgd_t pgd, *pgdp;
942 pud_t pud, *pudp;
943 pmd_t pmd, *pmdp;
944 pte_t *ret_pte;
945 hugepd_t *hpdp = NULL;
946 unsigned pdshift = PGDIR_SHIFT;
947
948 if (shift)
949 *shift = 0;
950
951 pgdp = pgdir + pgd_index(ea);
952 pgd = ACCESS_ONCE(*pgdp);
953 /*
954 * Always operate on the local stack value. This make sure the
955 * value don't get updated by a parallel THP split/collapse,
956 * page fault or a page unmap. The return pte_t * is still not
957 * stable. So should be checked there for above conditions.
958 */
959 if (pgd_none(pgd))
960 return NULL;
961 else if (pgd_huge(pgd)) {
962 ret_pte = (pte_t *) pgdp;
963 goto out;
964 } else if (is_hugepd(&pgd))
965 hpdp = (hugepd_t *)&pgd;
966 else {
967 /*
968 * Even if we end up with an unmap, the pgtable will not
969 * be freed, because we do an rcu free and here we are
970 * irq disabled
971 */
972 pdshift = PUD_SHIFT;
973 pudp = pud_offset(&pgd, ea);
974 pud = ACCESS_ONCE(*pudp);
975
976 if (pud_none(pud))
977 return NULL;
978 else if (pud_huge(pud)) {
979 ret_pte = (pte_t *) pudp;
980 goto out;
981 } else if (is_hugepd(&pud))
982 hpdp = (hugepd_t *)&pud;
983 else {
984 pdshift = PMD_SHIFT;
985 pmdp = pmd_offset(&pud, ea);
986 pmd = ACCESS_ONCE(*pmdp);
987 /*
988 * A hugepage collapse is captured by pmd_none, because
989 * it mark the pmd none and do a hpte invalidate.
990 *
991 * A hugepage split is captured by pmd_trans_splitting
992 * because we mark the pmd trans splitting and do a
993 * hpte invalidate
994 *
995 */
996 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
997 return NULL;
998
999 if (pmd_huge(pmd) || pmd_large(pmd)) {
1000 ret_pte = (pte_t *) pmdp;
1001 goto out;
1002 } else if (is_hugepd(&pmd))
1003 hpdp = (hugepd_t *)&pmd;
1004 else
1005 return pte_offset_kernel(&pmd, ea);
1006 }
1007 }
1008 if (!hpdp)
1009 return NULL;
1010
1011 ret_pte = hugepte_offset(hpdp, ea, pdshift);
1012 pdshift = hugepd_shift(*hpdp);
1013out:
1014 if (shift)
1015 *shift = pdshift;
1016 return ret_pte;
1017}
1018EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
1019
1020int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
1021 unsigned long end, int write, struct page **pages, int *nr)
1022{
1023 unsigned long mask;
1024 unsigned long pte_end;
1025 struct page *head, *page, *tail;
1026 pte_t pte;
1027 int refs;
1028
1029 pte_end = (addr + sz) & ~(sz-1);
1030 if (pte_end < end)
1031 end = pte_end;
1032
1033 pte = ACCESS_ONCE(*ptep);
1034 mask = _PAGE_PRESENT | _PAGE_USER;
1035 if (write)
1036 mask |= _PAGE_RW;
1037
1038 if ((pte_val(pte) & mask) != mask)
1039 return 0;
1040
1041#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1042 /*
1043 * check for splitting here
1044 */
1045 if (pmd_trans_splitting(pte_pmd(pte)))
1046 return 0;
1047#endif
1048
1049 /* hugepages are never "special" */
1050 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1051
1052 refs = 0;
1053 head = pte_page(pte);
1054
1055 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
1056 tail = page;
1057 do {
1058 VM_BUG_ON(compound_head(page) != head);
1059 pages[*nr] = page;
1060 (*nr)++;
1061 page++;
1062 refs++;
1063 } while (addr += PAGE_SIZE, addr != end);
1064
1065 if (!page_cache_add_speculative(head, refs)) {
1066 *nr -= refs;
1067 return 0;
1068 }
1069
1070 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1071 /* Could be optimized better */
1072 *nr -= refs;
1073 while (refs--)
1074 put_page(head);
1075 return 0;
1076 }
1077
1078 /*
1079 * Any tail page need their mapcount reference taken before we
1080 * return.
1081 */
1082 while (refs--) {
1083 if (PageTail(tail))
1084 get_huge_page_tail(tail);
1085 tail++;
1086 }
1087
1088 return 1;
1089}
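
find_linux_pte_or_hugepte() now lives outside the CONFIG_HUGETLB_PAGE block and decides, at every level, which of the four cases from its comment it is looking at, always on an ACCESS_ONCE() copy of the entry so a parallel THP split or collapse cannot change the value mid-test. Compressed to one level (sketch; the full three-level walk is the function above):

	pgd = ACCESS_ONCE(*pgdp);		/* operate on the local copy only */
	if (pgd_none(pgd))
		return NULL;			/* case (1): invalid */
	else if (pgd_huge(pgd))
		ret_pte = (pte_t *)pgdp;	/* case (3): leaf pte for a huge page */
	else if (is_hugepd(&pgd))
		hpdp = (hugepd_t *)&pgd;	/* case (4): hugepd pointer */
	else
		pudp = pud_offset(&pgd, ea);	/* case (2): next-level table */
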
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a90b9c458990..d0cd9e4c6837 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -88,7 +88,11 @@ static void pgd_ctor(void *addr)
 
 static void pmd_ctor(void *addr)
 {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	memset(addr, 0, PMD_TABLE_SIZE * 2);
+#else
 	memset(addr, 0, PMD_TABLE_SIZE);
+#endif
 }
 
 struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
@@ -137,10 +141,9 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
 void pgtable_cache_init(void)
 {
 	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
-	pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
-	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+	pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
 		panic("Couldn't allocate pgtable caches");
-
 	/* In all current configs, when the PUD index exists it's the
 	 * same size as either the pgd or pmd index.  Verify that the
 	 * initialization above has also created a PUD cache.  This
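
pmd_ctor() clears twice PMD_TABLE_SIZE under CONFIG_TRANSPARENT_HUGEPAGE because each PMD page now carries extra space behind the hardware entries; the THP hash code added in this series (get_hpte_slot_array() in hugepage-hash64.c) keeps its per-16M-page HPTE slot bookkeeping there, and PMD_CACHE_INDEX selects the correspondingly larger kmem cache. A hedged sketch of how such a header definition could look (the real definition lives in the pgtable headers, not in this hunk):

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define PMD_CACHE_INDEX	(PMD_INDEX_SIZE + 1)	/* assumed: one extra step doubles the object */
#else
#define PMD_CACHE_INDEX	PMD_INDEX_SIZE
#endif
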
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0988a26e0413..7f4bea162026 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -299,47 +299,13 @@ void __init paging_init(void)
 
 void __init mem_init(void)
 {
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-	int nid;
-#endif
-	pg_data_t *pgdat;
-	unsigned long i;
-	struct page *page;
-	unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
-
 #ifdef CONFIG_SWIOTLB
 	swiotlb_init(0);
 #endif
 
-	num_physpages = memblock_phys_mem_size() >> PAGE_SHIFT;
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
-
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-	for_each_online_node(nid) {
-		if (NODE_DATA(nid)->node_spanned_pages != 0) {
-			printk("freeing bootmem node %d\n", nid);
-			totalram_pages +=
-				free_all_bootmem_node(NODE_DATA(nid));
-		}
-	}
-#else
-	max_mapnr = max_pfn;
-	totalram_pages += free_all_bootmem();
-#endif
-	for_each_online_pgdat(pgdat) {
-		for (i = 0; i < pgdat->node_spanned_pages; i++) {
-			if (!pfn_valid(pgdat->node_start_pfn + i))
-				continue;
-			page = pgdat_page_nr(pgdat, i);
-			if (PageReserved(page))
-				reservedpages++;
-		}
-	}
-
-	codesize = (unsigned long)&_sdata - (unsigned long)&_stext;
-	datasize = (unsigned long)&_edata - (unsigned long)&_sdata;
-	initsize = (unsigned long)&__init_end - (unsigned long)&__init_begin;
-	bsssize = (unsigned long)&__bss_stop - (unsigned long)&__bss_start;
+	set_max_mapnr(max_pfn);
+	free_all_bootmem();
 
 #ifdef CONFIG_HIGHMEM
 	{
@@ -349,13 +315,9 @@ void __init mem_init(void)
 		for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
 			phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
 			struct page *page = pfn_to_page(pfn);
-			if (memblock_is_reserved(paddr))
-				continue;
-			free_highmem_page(page);
-			reservedpages--;
+			if (!memblock_is_reserved(paddr))
+				free_highmem_page(page);
 		}
-		printk(KERN_DEBUG "High memory: %luk\n",
-		       totalhigh_pages << (PAGE_SHIFT-10));
 	}
 #endif /* CONFIG_HIGHMEM */
 
@@ -368,16 +330,7 @@ void __init mem_init(void)
 		(mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1;
 #endif
 
-	printk(KERN_INFO "Memory: %luk/%luk available (%luk kernel code, "
-	       "%luk reserved, %luk data, %luk bss, %luk init)\n",
-	       nr_free_pages() << (PAGE_SHIFT-10),
-	       num_physpages << (PAGE_SHIFT-10),
-	       codesize >> 10,
-	       reservedpages << (PAGE_SHIFT-10),
-	       datasize >> 10,
-	       bsssize >> 10,
-	       initsize >> 10);
-
+	mem_init_print_info(NULL);
 #ifdef CONFIG_PPC32
 	pr_info("Kernel virtual memory layout:\n");
 	pr_info("  * 0x%08lx..0x%08lx  : fixmap\n", FIXADDR_START, FIXADDR_TOP);
@@ -407,7 +360,7 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
 
@@ -508,6 +461,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 			  pte_t *ptep)
 {
 #ifdef CONFIG_PPC_STD_MMU
+	/*
+	 * We don't need to worry about _PAGE_PRESENT here because we are
+	 * called with either mm->page_table_lock held or ptl lock held
+	 */
 	unsigned long access = 0, trap;
 
 	/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
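
With the hand-rolled accounting gone, mem_init() leans on the generic mm helpers: free_all_bootmem() and free_highmem_page() feed the page counters, and mem_init_print_info() prints the boot banner. After the hunks above the function is roughly (condensed sketch, PPC32/Book3E details omitted):

void __init mem_init(void)
{
#ifdef CONFIG_SWIOTLB
	swiotlb_init(0);
#endif
	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
	set_max_mapnr(max_pfn);
	free_all_bootmem();
#ifdef CONFIG_HIGHMEM
	/* hand every non-reserved highmem page to the buddy allocator */
#endif
	mem_init_print_info(NULL);
}
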
diff --git a/arch/powerpc/mm/mmap_64.c b/arch/powerpc/mm/mmap.c
index 67a42ed0d2fc..cb8bdbe4972f 100644
--- a/arch/powerpc/mm/mmap_64.c
+++ b/arch/powerpc/mm/mmap.c
@@ -92,10 +92,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE;
 		mm->get_unmapped_area = arch_get_unmapped_area;
-		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index e779642c25e5..af3d78e19302 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -112,8 +112,10 @@ static unsigned int steal_context_smp(unsigned int id)
 	 */
 	for_each_cpu(cpu, mm_cpumask(mm)) {
 		for (i = cpu_first_thread_sibling(cpu);
-		     i <= cpu_last_thread_sibling(cpu); i++)
-			__set_bit(id, stale_map[i]);
+		     i <= cpu_last_thread_sibling(cpu); i++) {
+			if (stale_map[i])
+				__set_bit(id, stale_map[i]);
+		}
 		cpu = i - 1;
 	}
 	return id;
@@ -272,7 +274,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
 		for (i = cpu_first_thread_sibling(cpu);
 		     i <= cpu_last_thread_sibling(cpu); i++) {
-			__clear_bit(id, stale_map[i]);
+			if (stale_map[i])
+				__clear_bit(id, stale_map[i]);
 		}
 	}
 
@@ -329,8 +332,8 @@ void destroy_context(struct mm_struct *mm)
 
 #ifdef CONFIG_SMP
 
-static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
-					    unsigned long action, void *hcpu)
+static int mmu_context_cpu_notify(struct notifier_block *self,
+				  unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned int)(long)hcpu;
 
@@ -363,7 +366,7 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata mmu_context_cpu_nb = {
+static struct notifier_block mmu_context_cpu_nb = {
 	.notifier_call	= mmu_context_cpu_notify,
 };
 
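
Both stale-map hunks above add the same guard: stale_map[i] is only dereferenced if the per-CPU map has actually been allocated, which (an assumption about the motivation, not stated in the hunk) covers sibling threads that have not been brought up yet. The guarded form:

	for (i = cpu_first_thread_sibling(cpu);
	     i <= cpu_last_thread_sibling(cpu); i++) {
		if (stale_map[i])
			__set_bit(id, stale_map[i]);
	}
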
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 88c0425dc0a8..5850798826cd 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -27,6 +27,7 @@
 #include <linux/seq_file.h>
 #include <linux/uaccess.h>
 #include <linux/slab.h>
+#include <asm/cputhreads.h>
 #include <asm/sparsemem.h>
 #include <asm/prom.h>
 #include <asm/smp.h>
@@ -516,7 +517,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
  * Figure out to which domain a cpu belongs and stick it there.
  * Return the id of the domain used.
  */
-static int __cpuinit numa_setup_cpu(unsigned long lcpu)
+static int numa_setup_cpu(unsigned long lcpu)
 {
 	int nid = 0;
 	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
@@ -538,8 +539,7 @@ out:
 	return nid;
 }
 
-static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
-			     unsigned long action,
+static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
 			     void *hcpu)
 {
 	unsigned long lcpu = (unsigned long)hcpu;
@@ -919,7 +919,7 @@ static void __init *careful_zallocation(int nid, unsigned long size,
 	return ret;
 }
 
-static struct notifier_block __cpuinitdata ppc64_numa_nb = {
+static struct notifier_block ppc64_numa_nb = {
 	.notifier_call = cpu_numa_callback,
 	.priority = 1 /* Must run before sched domains notifier. */
 };
@@ -1319,7 +1319,8 @@ static int update_cpu_associativity_changes_mask(void)
 			}
 		}
 		if (changed) {
-			cpumask_set_cpu(cpu, changes);
+			cpumask_or(changes, changes, cpu_sibling_mask(cpu));
+			cpu = cpu_last_thread_sibling(cpu);
 		}
 	}
 
@@ -1427,17 +1428,15 @@ static int update_cpu_topology(void *data)
 	if (!data)
 		return -EINVAL;
 
-	cpu = get_cpu();
+	cpu = smp_processor_id();
 
 	for (update = data; update; update = update->next) {
 		if (cpu != update->cpu)
 			continue;
 
-		unregister_cpu_under_node(update->cpu, update->old_nid);
 		unmap_cpu_from_node(update->cpu);
 		map_cpu_to_node(update->cpu, update->new_nid);
 		vdso_getcpu_init();
-		register_cpu_under_node(update->cpu, update->new_nid);
 	}
 
 	return 0;
@@ -1449,12 +1448,12 @@ static int update_cpu_topology(void *data)
  */
 int arch_update_cpu_topology(void)
 {
-	unsigned int cpu, changed = 0;
+	unsigned int cpu, sibling, changed = 0;
 	struct topology_update_data *updates, *ud;
 	unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
 	cpumask_t updated_cpus;
 	struct device *dev;
-	int weight, i = 0;
+	int weight, new_nid, i = 0;
 
 	weight = cpumask_weight(&cpu_associativity_changes_mask);
 	if (!weight)
@@ -1467,24 +1466,54 @@ int arch_update_cpu_topology(void)
 	cpumask_clear(&updated_cpus);
 
 	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
-		ud = &updates[i++];
-		ud->cpu = cpu;
-		vphn_get_associativity(cpu, associativity);
-		ud->new_nid = associativity_to_nid(associativity);
-
-		if (ud->new_nid < 0 || !node_online(ud->new_nid))
-			ud->new_nid = first_online_node;
+		/*
+		 * If siblings aren't flagged for changes, updates list
+		 * will be too short. Skip on this update and set for next
+		 * update.
+		 */
+		if (!cpumask_subset(cpu_sibling_mask(cpu),
+					&cpu_associativity_changes_mask)) {
+			pr_info("Sibling bits not set for associativity "
+					"change, cpu%d\n", cpu);
+			cpumask_or(&cpu_associativity_changes_mask,
+					&cpu_associativity_changes_mask,
+					cpu_sibling_mask(cpu));
+			cpu = cpu_last_thread_sibling(cpu);
+			continue;
+		}
 
-		ud->old_nid = numa_cpu_lookup_table[cpu];
-		cpumask_set_cpu(cpu, &updated_cpus);
+		/* Use associativity from first thread for all siblings */
+		vphn_get_associativity(cpu, associativity);
+		new_nid = associativity_to_nid(associativity);
+		if (new_nid < 0 || !node_online(new_nid))
+			new_nid = first_online_node;
+
+		if (new_nid == numa_cpu_lookup_table[cpu]) {
+			cpumask_andnot(&cpu_associativity_changes_mask,
+					&cpu_associativity_changes_mask,
+					cpu_sibling_mask(cpu));
+			cpu = cpu_last_thread_sibling(cpu);
+			continue;
+		}
 
-		if (i < weight)
-			ud->next = &updates[i];
+		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
+			ud = &updates[i++];
+			ud->cpu = sibling;
+			ud->new_nid = new_nid;
+			ud->old_nid = numa_cpu_lookup_table[sibling];
+			cpumask_set_cpu(sibling, &updated_cpus);
+			if (i < weight)
+				ud->next = &updates[i];
+		}
+		cpu = cpu_last_thread_sibling(cpu);
 	}
1484 1510
1485 stop_machine(update_cpu_topology, &updates[0], &updated_cpus); 1511 stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
1486 1512
1487 for (ud = &updates[0]; ud; ud = ud->next) { 1513 for (ud = &updates[0]; ud; ud = ud->next) {
1514 unregister_cpu_under_node(ud->cpu, ud->old_nid);
1515 register_cpu_under_node(ud->cpu, ud->new_nid);
1516
1488 dev = get_cpu_device(ud->cpu); 1517 dev = get_cpu_device(ud->cpu);
1489 if (dev) 1518 if (dev)
1490 kobject_uevent(&dev->kobj, KOBJ_CHANGE); 1519 kobject_uevent(&dev->kobj, KOBJ_CHANGE);
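The reworked arch_update_cpu_topology() above builds one topology_update_data entry per thread of a core and chains the entries through ->next, so stop_machine() and the node re-registration loop afterwards can walk a single list. Below is a minimal userspace sketch of just that list-building pattern; the struct layout, the fixed two-thread sibling grouping, the node numbers and the helper values are illustrative assumptions, not the kernel's definitions.

/*
 * Toy model of the per-sibling update list built in
 * arch_update_cpu_topology().  All values below are made up.
 */
#include <stdio.h>

#define NR_CPUS 8
#define THREADS_PER_CORE 2              /* assumed sibling layout */

struct topo_update {
	int cpu;
	int old_nid;
	int new_nid;
	struct topo_update *next;
};

int main(void)
{
	static int cpu_to_nid[NR_CPUS];            /* stands in for numa_cpu_lookup_table */
	static struct topo_update updates[NR_CPUS];
	int flagged_cpus = 4;                      /* pretend cpus 0-3 changed */
	int weight = flagged_cpus;                 /* number of update entries needed */
	int i = 0;

	for (int cpu = 0; cpu < flagged_cpus; cpu += THREADS_PER_CORE) {
		int new_nid = 1;                   /* pretend VPHN reported node 1 */

		/* every thread of the core gets its own entry, chained via ->next */
		for (int sibling = cpu; sibling < cpu + THREADS_PER_CORE; sibling++) {
			struct topo_update *ud = &updates[i++];

			ud->cpu = sibling;
			ud->new_nid = new_nid;
			ud->old_nid = cpu_to_nid[sibling];
			if (i < weight)
				ud->next = &updates[i];
		}
	}

	for (struct topo_update *ud = &updates[0]; ud; ud = ud->next)
		printf("cpu %d: node %d -> %d\n", ud->cpu, ud->old_nid, ud->new_nid);
	return 0;
}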
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 214130a4edc6..edda589795c3 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -235,6 +235,14 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
235 pud = pud_offset(pgd, addr); 235 pud = pud_offset(pgd, addr);
236 BUG_ON(pud_none(*pud)); 236 BUG_ON(pud_none(*pud));
237 pmd = pmd_offset(pud, addr); 237 pmd = pmd_offset(pud, addr);
238 /*
239 * For khugepaged to collapse normal pages to a hugepage, it first sets
240 * the pmd to none to force page fault/gup to take mmap_sem. After the
241 * pmd is set to none, it does a pte_clear which runs this assertion,
242 * so if we find the pmd none, return.
243 */
244 if (pmd_none(*pmd))
245 return;
238 BUG_ON(!pmd_present(*pmd)); 246 BUG_ON(!pmd_present(*pmd));
239 assert_spin_locked(pte_lockptr(mm, pmd)); 247 assert_spin_locked(pte_lockptr(mm, pmd));
240} 248}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index a854096e1023..536eec72c0f7 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -338,6 +338,19 @@ EXPORT_SYMBOL(iounmap);
338EXPORT_SYMBOL(__iounmap); 338EXPORT_SYMBOL(__iounmap);
339EXPORT_SYMBOL(__iounmap_at); 339EXPORT_SYMBOL(__iounmap_at);
340 340
341/*
342 * For a hugepage we have the pfn in the pmd; we use PTE_RPN_SHIFT bits for flags.
343 * For a PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
344 */
345struct page *pmd_page(pmd_t pmd)
346{
347#ifdef CONFIG_TRANSPARENT_HUGEPAGE
348 if (pmd_trans_huge(pmd))
349 return pfn_to_page(pmd_pfn(pmd));
350#endif
351 return virt_to_page(pmd_page_vaddr(pmd));
352}
353
341#ifdef CONFIG_PPC_64K_PAGES 354#ifdef CONFIG_PPC_64K_PAGES
342static pte_t *get_from_cache(struct mm_struct *mm) 355static pte_t *get_from_cache(struct mm_struct *mm)
343{ 356{
@@ -455,3 +468,404 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
455} 468}
456#endif 469#endif
457#endif /* CONFIG_PPC_64K_PAGES */ 470#endif /* CONFIG_PPC_64K_PAGES */
471
472#ifdef CONFIG_TRANSPARENT_HUGEPAGE
473
474/*
475 * This is called when relaxing access to a hugepage. It's also called in the
476 * page fault path when we don't hit any of the major fault cases, i.e. a minor
477 * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will have
478 * handled those two for us; we additionally deal with missing execute
479 * permission here on some processors.
480 */
481int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
482 pmd_t *pmdp, pmd_t entry, int dirty)
483{
484 int changed;
485#ifdef CONFIG_DEBUG_VM
486 WARN_ON(!pmd_trans_huge(*pmdp));
487 assert_spin_locked(&vma->vm_mm->page_table_lock);
488#endif
489 changed = !pmd_same(*(pmdp), entry);
490 if (changed) {
491 __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
492 /*
493 * Since we are not supporting SW TLB systems, we don't
494 * have anything similar to flush_tlb_page_nohash().
495 */
496 }
497 return changed;
498}
499
500unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
501 pmd_t *pmdp, unsigned long clr)
502{
503
504 unsigned long old, tmp;
505
506#ifdef CONFIG_DEBUG_VM
507 WARN_ON(!pmd_trans_huge(*pmdp));
508 assert_spin_locked(&mm->page_table_lock);
509#endif
510
511#ifdef PTE_ATOMIC_UPDATES
512 __asm__ __volatile__(
513 "1: ldarx %0,0,%3\n\
514 andi. %1,%0,%6\n\
515 bne- 1b \n\
516 andc %1,%0,%4 \n\
517 stdcx. %1,0,%3 \n\
518 bne- 1b"
519 : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
520 : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
521 : "cc" );
522#else
523 old = pmd_val(*pmdp);
524 *pmdp = __pmd(old & ~clr);
525#endif
526 if (old & _PAGE_HASHPTE)
527 hpte_do_hugepage_flush(mm, addr, pmdp);
528 return old;
529}
530
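The inline ldarx/stdcx. loop in pmd_hugepage_update() spins while _PAGE_BUSY is set, then atomically clears the requested bits and returns the previous pmd value. Here is a portable userspace sketch of that read-modify-write pattern using compiler atomics; the BUSY_BIT value and the function name are assumptions for illustration, and the hash flush that follows in the kernel is omitted.

/*
 * Sketch of the PTE_ATOMIC_UPDATES loop: spin while the busy bit is set,
 * then atomically clear the requested bits and hand back the old value.
 */
#include <stdint.h>
#include <stdio.h>

#define BUSY_BIT 0x800UL        /* stands in for _PAGE_BUSY, value made up */

static unsigned long pmd_update(unsigned long *pmdp, unsigned long clr)
{
	unsigned long old;

	for (;;) {
		old = __atomic_load_n(pmdp, __ATOMIC_RELAXED);
		if (old & BUSY_BIT)
			continue;       /* hash code owns the entry, retry */
		if (__atomic_compare_exchange_n(pmdp, &old, old & ~clr,
						1 /* weak */,
						__ATOMIC_ACQ_REL,
						__ATOMIC_RELAXED))
			return old;     /* bits cleared, report previous value */
	}
}

int main(void)
{
	unsigned long pmd = 0x12340007UL;       /* toy entry with low flags set */
	unsigned long old = pmd_update(&pmd, 0x4UL);

	printf("old=%#lx new=%#lx\n", old, pmd);
	return 0;
}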
531pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
532 pmd_t *pmdp)
533{
534 pmd_t pmd;
535
536 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
537 if (pmd_trans_huge(*pmdp)) {
538 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
539 } else {
540 /*
541 * khugepaged calls this for normal pmd
542 */
543 pmd = *pmdp;
544 pmd_clear(pmdp);
545 /*
546 * Wait for all pending hash_page to finish. This is needed
547 * in case of subpage collapse. When we collapse normal pages
548 * to a hugepage, we first clear the pmd, then invalidate all
549 * the PTE entries. The assumption here is that any low-level
550 * page fault will see a none pmd and take the slow path that
551 * will wait on mmap_sem. But we could very well be in a
552 * hash_page with a local ptep pointer value. Such a hash_page
553 * can result in adding new HPTE entries for normal subpages.
554 * That means we could be modifying the page content as we
555 * copy them to a huge page. So wait for parallel hash_page
556 * to finish before invalidating HPTE entries. We can do this
557 * by sending an IPI to all the cpus and executing a dummy
558 * function there.
559 */
560 kick_all_cpus_sync();
561 /*
562 * Now invalidate the hpte entries in the range
563 * covered by the pmd. This makes sure we take a
564 * fault and find the pmd as none, which results
565 * in a major fault that takes mmap_sem and hence
566 * waits for collapse to complete. Without this,
567 * __collapse_huge_page_copy can end up copying
568 * the old content.
569 */
570 flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
571 }
572 return pmd;
573}
574
575int pmdp_test_and_clear_young(struct vm_area_struct *vma,
576 unsigned long address, pmd_t *pmdp)
577{
578 return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
579}
580
581/*
582 * We currently remove entries from the hashtable regardless of whether
583 * the entry was young or dirty. The generic routines only flush if the
584 * entry was young or dirty which is not good enough.
585 *
586 * We should be more intelligent about this but for the moment we override
587 * these functions and force a tlb flush unconditionally
588 */
589int pmdp_clear_flush_young(struct vm_area_struct *vma,
590 unsigned long address, pmd_t *pmdp)
591{
592 return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
593}
594
595/*
596 * We mark the pmd splitting and invalidate all the hpte
597 * entries for this hugepage.
598 */
599void pmdp_splitting_flush(struct vm_area_struct *vma,
600 unsigned long address, pmd_t *pmdp)
601{
602 unsigned long old, tmp;
603
604 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
605
606#ifdef CONFIG_DEBUG_VM
607 WARN_ON(!pmd_trans_huge(*pmdp));
608 assert_spin_locked(&vma->vm_mm->page_table_lock);
609#endif
610
611#ifdef PTE_ATOMIC_UPDATES
612
613 __asm__ __volatile__(
614 "1: ldarx %0,0,%3\n\
615 andi. %1,%0,%6\n\
616 bne- 1b \n\
617 ori %1,%0,%4 \n\
618 stdcx. %1,0,%3 \n\
619 bne- 1b"
620 : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
621 : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
622 : "cc" );
623#else
624 old = pmd_val(*pmdp);
625 *pmdp = __pmd(old | _PAGE_SPLITTING);
626#endif
627 /*
628 * If we didn't have the splitting flag set, go and flush the
629 * HPTE entries.
630 */
631 if (!(old & _PAGE_SPLITTING)) {
632 /* We need to flush the hpte */
633 if (old & _PAGE_HASHPTE)
634 hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
635 }
636}
637
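pmdp_splitting_flush() relies on the value that was in the pmd before the OR: only the caller that actually transitions the splitting bit from clear to set goes on to flush the HPTEs. A compact sketch of that "first setter flushes" idea, using a compiler fetch-or instead of the ldarx/stdcx. sequence and leaving out the busy-bit spin; the flag values are made up.

/*
 * Atomically set the splitting flag; only flush if we were the one
 * who set it and the entry has hash PTEs.
 */
#include <stdio.h>

#define F_SPLITTING 0x10UL      /* stands in for _PAGE_SPLITTING */
#define F_HASHPTE   0x20UL      /* stands in for _PAGE_HASHPTE */

static void splitting_flush(unsigned long *pmdp)
{
	unsigned long old = __atomic_fetch_or(pmdp, F_SPLITTING, __ATOMIC_ACQ_REL);

	if (!(old & F_SPLITTING) && (old & F_HASHPTE))
		printf("flushing hash entries for pmd %#lx\n", old);
}

int main(void)
{
	unsigned long pmd = 0x1000UL | F_HASHPTE;

	splitting_flush(&pmd);  /* first caller: flushes */
	splitting_flush(&pmd);  /* second caller: bit already set, no flush */
	return 0;
}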
638/*
639 * We want to put the pgtable in the pmd and use the pgtable for
640 * tracking the base page size hptes.
641 */
642void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
643 pgtable_t pgtable)
644{
645 pgtable_t *pgtable_slot;
646 assert_spin_locked(&mm->page_table_lock);
647 /*
648 * We store the pgtable in the second half of the PMD.
649 */
650 pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
651 *pgtable_slot = pgtable;
652 /*
653 * Expose the deposited pgtable to other cpus
654 * before we set the hugepage PTE at the pmd level;
655 * the hash fault code looks at the deposited pgtable
656 * to store hash index values.
657 */
658 smp_wmb();
659}
660
661pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
662{
663 pgtable_t pgtable;
664 pgtable_t *pgtable_slot;
665
666 assert_spin_locked(&mm->page_table_lock);
667 pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
668 pgtable = *pgtable_slot;
669 /*
670 * Once we withdraw, mark the entry NULL.
671 */
672 *pgtable_slot = NULL;
673 /*
674 * We store HPTE information in the deposited PTE fragment;
675 * zero out the content on withdraw.
676 */
677 memset(pgtable, 0, PTE_FRAG_SIZE);
678 return pgtable;
679}
680
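The deposit/withdraw pair works because the page backing the pmd entries has room for a second array of pointers: slot (pgtable_t *)pmdp + PTRS_PER_PMD in the upper half carries the deposited PTE fragment for that pmd. Below is a userspace sketch of just that slot arithmetic; PTRS_PER_PMD, the types and the 4K fragment are toy stand-ins, not the kernel's values.

/*
 * Toy "pmd page": first half pmd entries, second half deposit slots,
 * mirroring the (pgtable_t *)pmdp + PTRS_PER_PMD layout trick above.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define PTRS_PER_PMD 16                 /* toy value */

typedef unsigned long pmd_t;
typedef void *pgtable_t;

static pmd_t pmd_page_area[2 * PTRS_PER_PMD];

static void deposit(pmd_t *pmdp, pgtable_t pgtable)
{
	pgtable_t *slot = (pgtable_t *)pmdp + PTRS_PER_PMD;

	*slot = pgtable;
}

static pgtable_t withdraw(pmd_t *pmdp)
{
	pgtable_t *slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable_t pgtable = *slot;

	*slot = NULL;                   /* mark the entry empty once withdrawn */
	return pgtable;
}

int main(void)
{
	static char fragment[4096];     /* stands in for the 4K PTE fragment */
	pmd_t *pmdp = &pmd_page_area[3];

	deposit(pmdp, fragment);
	assert(withdraw(pmdp) == (pgtable_t)fragment);
	assert(withdraw(pmdp) == NULL);
	printf("deposit/withdraw slot arithmetic ok\n");
	return 0;
}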
681/*
682 * set a new huge pmd. We should not be called for updating
683 * an existing pmd entry. That should go via pmd_hugepage_update.
684 */
685void set_pmd_at(struct mm_struct *mm, unsigned long addr,
686 pmd_t *pmdp, pmd_t pmd)
687{
688#ifdef CONFIG_DEBUG_VM
689 WARN_ON(!pmd_none(*pmdp));
690 assert_spin_locked(&mm->page_table_lock);
691 WARN_ON(!pmd_trans_huge(pmd));
692#endif
693 return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
694}
695
696void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
697 pmd_t *pmdp)
698{
699 pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT);
700}
701
702/*
703 * A linux hugepage PMD was changed and the corresponding hash table entries
704 * need to be flushed.
705 */
706void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
707 pmd_t *pmdp)
708{
709 int ssize, i;
710 unsigned long s_addr;
711 int max_hpte_count;
712 unsigned int psize, valid;
713 unsigned char *hpte_slot_array;
714 unsigned long hidx, vpn, vsid, hash, shift, slot;
715
716 /*
717 * Flush all the hptes mapping this hugepage
718 */
719 s_addr = addr & HPAGE_PMD_MASK;
720 hpte_slot_array = get_hpte_slot_array(pmdp);
721 /*
722 * If we try to do a huge PTE update after a withdraw is done,
723 * we will find hpte_slot_array NULL below. This happens when we
724 * do split_huge_page_pmd.
725 */
726 if (!hpte_slot_array)
727 return;
728
729 /* get the base page size */
730 psize = get_slice_psize(mm, s_addr);
731
732 if (ppc_md.hugepage_invalidate)
733 return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
734 s_addr, psize);
735 /*
736 * No bulk hpte removal support, invalidate each entry
737 */
738 shift = mmu_psize_defs[psize].shift;
739 max_hpte_count = HPAGE_PMD_SIZE >> shift;
740 for (i = 0; i < max_hpte_count; i++) {
741 /*
742 * 8 bits for each hpte entry:
743 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
744 */
745 valid = hpte_valid(hpte_slot_array, i);
746 if (!valid)
747 continue;
748 hidx = hpte_hash_index(hpte_slot_array, i);
749
750 /* get the vpn */
751 addr = s_addr + (i * (1ul << shift));
752 if (!is_kernel_addr(addr)) {
753 ssize = user_segment_size(addr);
754 vsid = get_vsid(mm->context.id, addr, ssize);
755 WARN_ON(vsid == 0);
756 } else {
757 vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
758 ssize = mmu_kernel_ssize;
759 }
760
761 vpn = hpt_vpn(addr, vsid, ssize);
762 hash = hpt_hash(vpn, shift, ssize);
763 if (hidx & _PTEIDX_SECONDARY)
764 hash = ~hash;
765
766 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
767 slot += hidx & _PTEIDX_GROUP_IX;
768 ppc_md.hpte_invalidate(slot, vpn, psize,
769 MMU_PAGE_16M, ssize, 0);
770 }
771}
772
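Each byte of hpte_slot_array encodes a valid bit, a 3-bit hash-group index and a secondary-group bit, and the loop above turns that into a hash table slot, inverting the hash for the secondary group. The sketch below decodes one such byte using the layout described in the comment above; the mask values, group size and toy hash are assumptions, not the kernel constants.

/*
 * Decode one slot-array byte and derive the HPTE slot:
 * [ 000 | secondary (1 bit) | hidx (3 bits) | valid (1 bit) ].
 */
#include <stdio.h>

#define HPTES_PER_GROUP   8          /* assumed group size */
#define PTEIDX_SECONDARY  0x8        /* bit 3 of the 4-bit index, assumed */
#define PTEIDX_GROUP_IX   0x7        /* low 3 bits of the 4-bit index, assumed */

int main(void)
{
	unsigned char entry = 0x1b;              /* secondary group, index 5, valid */
	unsigned long hash = 0x1234;             /* toy hash of the subpage's VPN */
	unsigned long htab_hash_mask = 0xffff;   /* toy hash table mask */

	if (!(entry & 0x1)) {
		printf("entry not valid, nothing to invalidate\n");
		return 0;
	}

	unsigned int hidx = entry >> 1;          /* secondary bit + group index */

	if (hidx & PTEIDX_SECONDARY)
		hash = ~hash;                    /* secondary hash bucket */

	unsigned long slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
	slot += hidx & PTEIDX_GROUP_IX;

	printf("hidx=%#x -> slot %lu\n", hidx, slot);
	return 0;
}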
773static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
774{
775 pmd_val(pmd) |= pgprot_val(pgprot);
776 return pmd;
777}
778
779pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
780{
781 pmd_t pmd;
782 /*
783 * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
784 * set. We use this to check for a THP page at the pmd level: a
785 * leaf pte for a huge page has bottom two bits != 00.
786 */
787 pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
788 pmd_val(pmd) |= _PAGE_THP_HUGE;
789 pmd = pmd_set_protbits(pmd, pgprot);
790 return pmd;
791}
792
793pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
794{
795 return pfn_pmd(page_to_pfn(page), pgprot);
796}
797
798pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
799{
800
801 pmd_val(pmd) &= _HPAGE_CHG_MASK;
802 pmd = pmd_set_protbits(pmd, newprot);
803 return pmd;
804}
805
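pfn_pmd() composes a huge-pmd by placing the pfn above PTE_RPN_SHIFT and OR'ing in _PAGE_THP_HUGE plus the protection bits, while pmd_modify() keeps only the _HPAGE_CHG_MASK bits before applying a new protection. A toy sketch of that composition; every shift and flag value here is invented for illustration and does not match the ppc64 headers.

/*
 * Toy huge-pmd composition: pfn in the high bits, flags in the low bits.
 */
#include <stdio.h>

#define RPN_SHIFT     12             /* assumed: pfn starts at bit 12 */
#define F_PRESENT     0x001UL
#define F_RW          0x002UL
#define F_THP_HUGE    0x004UL
#define CHG_MASK      (~0xfffUL | F_THP_HUGE)   /* bits preserved across modify */

static unsigned long toy_pfn_pmd(unsigned long pfn, unsigned long prot)
{
	unsigned long pmd = pfn << RPN_SHIFT;

	pmd |= F_THP_HUGE;               /* mark it as a trans-huge leaf entry */
	pmd |= prot;                     /* fold in the protection bits */
	return pmd;
}

static unsigned long toy_pmd_modify(unsigned long pmd, unsigned long newprot)
{
	pmd &= CHG_MASK;                 /* keep pfn + THP marker, drop old prot */
	return pmd | newprot;
}

int main(void)
{
	unsigned long pmd = toy_pfn_pmd(0xabcd, F_PRESENT | F_RW);

	printf("rw pmd = %#lx\n", pmd);
	printf("ro pmd = %#lx\n", toy_pmd_modify(pmd, F_PRESENT));
	return 0;
}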
806/*
807 * This is called at the end of handling a user page fault, when the
808 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
809 * We use it to preload an HPTE into the hash table corresponding to
810 * the updated linux HUGE PMD entry.
811 */
812void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
813 pmd_t *pmd)
814{
815 return;
816}
817
818pmd_t pmdp_get_and_clear(struct mm_struct *mm,
819 unsigned long addr, pmd_t *pmdp)
820{
821 pmd_t old_pmd;
822 pgtable_t pgtable;
823 unsigned long old;
824 pgtable_t *pgtable_slot;
825
826 old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
827 old_pmd = __pmd(old);
828 /*
829 * We have pmd == none and we are holding page_table_lock.
830 * So we can safely go and clear the pgtable hash
831 * index info.
832 */
833 pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
834 pgtable = *pgtable_slot;
835 /*
836 * Let's zero out the old valid and hash index details
837 * that the hash fault code looks at.
838 */
839 memset(pgtable, 0, PTE_FRAG_SIZE);
840 return old_pmd;
841}
842
843int has_transparent_hugepage(void)
844{
845 if (!mmu_has_feature(MMU_FTR_16M_PAGE))
846 return 0;
847 /*
848 * We support THP only if PMD_SIZE is 16MB.
849 */
850 if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
851 return 0;
852 /*
853 * We need to make sure that we support 16MB hugepages in a segment
854 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
855 * of 64K.
856 */
857 /*
858 * If we have 64K HPTEs, we will be using those by default.
859 */
860 if (mmu_psize_defs[MMU_PAGE_64K].shift &&
861 (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
862 return 0;
863 /*
864 * OK, we only have 4K HPTEs.
865 */
866 if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
867 return 0;
868
869 return 1;
870}
871#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 7c415ddde948..aa74acb0fdfc 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -130,6 +130,53 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len)
130 up_write(&mm->mmap_sem); 130 up_write(&mm->mmap_sem);
131} 131}
132 132
133#ifdef CONFIG_TRANSPARENT_HUGEPAGE
134static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
135 unsigned long end, struct mm_walk *walk)
136{
137 struct vm_area_struct *vma = walk->private;
138 split_huge_page_pmd(vma, addr, pmd);
139 return 0;
140}
141
142static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
143 unsigned long len)
144{
145 struct vm_area_struct *vma;
146 struct mm_walk subpage_proto_walk = {
147 .mm = mm,
148 .pmd_entry = subpage_walk_pmd_entry,
149 };
150
151 /*
152 * We don't try too hard; we just mark all the vmas in that range
153 * VM_NOHUGEPAGE and split them.
154 */
155 vma = find_vma(mm, addr);
156 /*
157 * If the range falls entirely in an unmapped area, just return.
158 */
159 if (vma && ((addr + len) <= vma->vm_start))
160 return;
161
162 while (vma) {
163 if (vma->vm_start >= (addr + len))
164 break;
165 vma->vm_flags |= VM_NOHUGEPAGE;
166 subpage_proto_walk.private = vma;
167 walk_page_range(vma->vm_start, vma->vm_end,
168 &subpage_proto_walk);
169 vma = vma->vm_next;
170 }
171}
172#else
173static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
174 unsigned long len)
175{
176 return;
177}
178#endif
179
133/* 180/*
134 * Copy in a subpage protection map for an address range. 181 * Copy in a subpage protection map for an address range.
135 * The map has 2 bits per 4k subpage, so 32 bits per 64k page. 182 * The map has 2 bits per 4k subpage, so 32 bits per 64k page.
@@ -168,6 +215,7 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
168 return -EFAULT; 215 return -EFAULT;
169 216
170 down_write(&mm->mmap_sem); 217 down_write(&mm->mmap_sem);
218 subpage_mark_vma_nohuge(mm, addr, len);
171 for (limit = addr + len; addr < limit; addr = next) { 219 for (limit = addr + len; addr < limit; addr = next) {
172 next = pmd_addr_end(addr, limit); 220 next = pmd_addr_end(addr, limit);
173 err = -ENOMEM; 221 err = -ENOMEM;
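subpage_mark_vma_nohuge() above walks every vma overlapping [addr, addr + len), flags it VM_NOHUGEPAGE and splits any huge pmd in it, so that 4K subpage protections are never applied on top of a 16M mapping. A small userspace sketch of the overlap walk itself follows; the vma struct, the flag value and the addresses are toy stand-ins, and the split step is only noted in a comment.

/*
 * Flag every toy vma that overlaps [addr, addr + len).
 */
#include <stdio.h>

#define VM_NOHUGEPAGE 0x1UL     /* toy flag value */

struct vma {
	unsigned long start, end, flags;
	struct vma *next;
};

static void mark_range_nohuge(struct vma *vma, unsigned long addr,
			      unsigned long len)
{
	/* range entirely below the first mapping: nothing to do */
	if (vma && addr + len <= vma->start)
		return;

	while (vma) {
		if (vma->start >= addr + len)
			break;
		vma->flags |= VM_NOHUGEPAGE;   /* and split_huge_page_pmd() here */
		vma = vma->next;
	}
}

int main(void)
{
	struct vma c = { 0x60000, 0x70000, 0, NULL };
	struct vma b = { 0x40000, 0x50000, 0, &c };
	struct vma a = { 0x20000, 0x30000, 0, &b };

	mark_range_nohuge(&a, 0x28000, 0x20000);   /* overlaps a and b, not c */
	printf("a=%lx b=%lx c=%lx\n", a.flags, b.flags, c.flags);
	return 0;
}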
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 023ec8a13f38..36e44b4260eb 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -183,12 +183,13 @@ void tlb_flush(struct mmu_gather *tlb)
183 * since 64K pages may overlap with other bridges when using 64K pages 183 * since 64K pages may overlap with other bridges when using 64K pages
184 * with 4K HW pages on IO space. 184 * with 4K HW pages on IO space.
185 * 185 *
186 * Because of that usage pattern, it's only available with CONFIG_HOTPLUG 186 * Because of that usage pattern, it is implemented for small size rather
187 * and is implemented for small size rather than speed. 187 * than speed.
188 */ 188 */
189void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, 189void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
190 unsigned long end) 190 unsigned long end)
191{ 191{
192 int hugepage_shift;
192 unsigned long flags; 193 unsigned long flags;
193 194
194 start = _ALIGN_DOWN(start, PAGE_SIZE); 195 start = _ALIGN_DOWN(start, PAGE_SIZE);
@@ -206,7 +207,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
206 local_irq_save(flags); 207 local_irq_save(flags);
207 arch_enter_lazy_mmu_mode(); 208 arch_enter_lazy_mmu_mode();
208 for (; start < end; start += PAGE_SIZE) { 209 for (; start < end; start += PAGE_SIZE) {
209 pte_t *ptep = find_linux_pte(mm->pgd, start); 210 pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
211 &hugepage_shift);
210 unsigned long pte; 212 unsigned long pte;
211 213
212 if (ptep == NULL) 214 if (ptep == NULL)
@@ -214,7 +216,37 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
214 pte = pte_val(*ptep); 216 pte = pte_val(*ptep);
215 if (!(pte & _PAGE_HASHPTE)) 217 if (!(pte & _PAGE_HASHPTE))
216 continue; 218 continue;
217 hpte_need_flush(mm, start, ptep, pte, 0); 219 if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
220 hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
221 else
222 hpte_need_flush(mm, start, ptep, pte, 0);
223 }
224 arch_leave_lazy_mmu_mode();
225 local_irq_restore(flags);
226}
227
228void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
229{
230 pte_t *pte;
231 pte_t *start_pte;
232 unsigned long flags;
233
234 addr = _ALIGN_DOWN(addr, PMD_SIZE);
235 /* Note: Normally, we should only ever use a batch within a
236 * PTE locked section. This violates the rule, but will work
237 * since we don't actually modify the PTEs, we just flush the
238 * hash while leaving the PTEs intact (including their reference
239 * to being hashed). This is not the most performance oriented
240 * way to do things but is fine for our needs here.
241 */
242 local_irq_save(flags);
243 arch_enter_lazy_mmu_mode();
244 start_pte = pte_offset_map(pmd, addr);
245 for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
246 unsigned long pteval = pte_val(*pte);
247 if (pteval & _PAGE_HASHPTE)
248 hpte_need_flush(mm, addr, pte, pteval, 0);
249 addr += PAGE_SIZE;
218 } 250 }
219 arch_leave_lazy_mmu_mode(); 251 arch_leave_lazy_mmu_mode();
220 local_irq_restore(flags); 252 local_irq_restore(flags);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 6888cad5103d..41cd68dee681 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -648,7 +648,7 @@ void __init early_init_mmu(void)
648 __early_init_mmu(1); 648 __early_init_mmu(1);
649} 649}
650 650
651void __cpuinit early_init_mmu_secondary(void) 651void early_init_mmu_secondary(void)
652{ 652{
653 __early_init_mmu(0); 653 __early_init_mmu(0);
654} 654}