author     Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>   2013-06-20 05:00:15 -0400
committer  Benjamin Herrenschmidt <benh@kernel.crashing.org>    2013-06-21 02:01:53 -0400
commit     074c2eae3e9b66c03a17a12df8f2cd19382b68ab (patch)
tree       1606baa1df8093b5ca0ef638236b2ace837e1d98 /arch/powerpc/mm
parent     f940f5289873af2ad2c4e73f88c24ad2b8fe3f87 (diff)
powerpc/THP: Implement transparent hugepages for ppc64
We now have PMD entries covering a 16MB range, and the PMD table is double its original size. We use the second half of the PMD table to deposit the pgtable (PTE page). The deposited PTE page is further used to track the HPTE information. The information includes [ secondary group | 3 bit hidx | valid ]. We use one byte per HPTE entry. With a 16MB hugepage and 64K HPTEs we need 256 entries, and with 4K HPTEs we need 4096 entries. Both will fit in a 4K PTE page.

On hugepage invalidate we need to walk the PTE page and invalidate all valid HPTEs.

This patch implements the necessary arch-specific functions for THP support and also the hugepage invalidate logic. These PMD-related functions are intentionally kept similar to their PTE counterparts.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
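To make the tracking scheme concrete, here is a small self-contained C sketch of one tracking byte ([ secondary group | hidx | valid ]) and of the sizing arithmetic from the paragraph above. The sketch_* names and the exact bit positions are illustrative assumptions modelled on the comment in hpte_do_hugepage_flush() below; the kernel's real helpers (get_hpte_slot_array(), hpte_valid(), hpte_hash_index()) live in arch headers that are not part of this diff.

#include <assert.h>
#include <stdio.h>

/*
 * Illustrative layout of one tracking byte, following the patch comment:
 * 000 | secondary group (1 bit) | hidx (3 bits) | valid (1 bit).
 * The decoded index carries both the secondary-group bit (0x8) and the
 * 3-bit slot-in-group index (0x7), matching how hpte_do_hugepage_flush()
 * masks it with _PTEIDX_SECONDARY and _PTEIDX_GROUP_IX.
 */
#define SKETCH_HPTE_VALID        0x1
#define SKETCH_PTEIDX_SECONDARY  0x8
#define SKETCH_PTEIDX_GROUP_IX   0x7

static int sketch_hpte_valid(const unsigned char *slot_array, int index)
{
        return slot_array[index] & SKETCH_HPTE_VALID;
}

static unsigned int sketch_hpte_hash_index(const unsigned char *slot_array, int index)
{
        /* secondary bit plus 3-bit group index, stored above the valid bit */
        return slot_array[index] >> 1;
}

int main(void)
{
        /*
         * Sizing from the commit message: one byte per underlying HPTE of a
         * 16MB hugepage gives 256 bytes with a 64K base page size and 4096
         * bytes with a 4K base page size -- both fit in a 4K PTE page.
         */
        assert((16u << 20) / (64u << 10) == 256);
        assert((16u << 20) / (4u << 10) == 4096);

        unsigned char slot_array[4096] = { 0 };
        /* mark entry 0 valid, slot 5 of the secondary hash group */
        slot_array[0] = SKETCH_HPTE_VALID |
                        ((SKETCH_PTEIDX_SECONDARY | 5) << 1);

        unsigned int hidx = sketch_hpte_hash_index(slot_array, 0);
        printf("valid=%d secondary=%u group_ix=%u\n",
               sketch_hpte_valid(slot_array, 0),
               (hidx & SKETCH_PTEIDX_SECONDARY) ? 1u : 0u,
               hidx & SKETCH_PTEIDX_GROUP_IX);
        return 0;
}

One byte per base-page HPTE keeps the lookup a simple array index, and the worst case (a 4K base page size, 4096 entries) exactly fills the deposited 4K PTE page that hpte_do_hugepage_flush() walks.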
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/pgtable_64.c  377
-rw-r--r--  arch/powerpc/mm/tlb_hash64.c   27
2 files changed, 404 insertions(+), 0 deletions(-)
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index a854096e1023..e4d3e9fb59be 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -338,6 +338,19 @@ EXPORT_SYMBOL(iounmap);
 EXPORT_SYMBOL(__iounmap);
 EXPORT_SYMBOL(__iounmap_at);
 
+/*
+ * For a hugepage we have the pfn in the pmd; we use PTE_RPN_SHIFT bits for flags.
+ * For a PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
+ */
+struct page *pmd_page(pmd_t pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        if (pmd_trans_huge(pmd))
+                return pfn_to_page(pmd_pfn(pmd));
+#endif
+        return virt_to_page(pmd_page_vaddr(pmd));
+}
+
 #ifdef CONFIG_PPC_64K_PAGES
 static pte_t *get_from_cache(struct mm_struct *mm)
 {
@@ -455,3 +468,367 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
 }
 #endif
 #endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the
+ * page fault path when we don't hit any of the major fault cases, i.e., a
+ * minor update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will
+ * have handled those two for us; we additionally deal with missing execute
+ * permission here on some processors.
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+                          pmd_t *pmdp, pmd_t entry, int dirty)
+{
+        int changed;
+#ifdef CONFIG_DEBUG_VM
+        WARN_ON(!pmd_trans_huge(*pmdp));
+        assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+        changed = !pmd_same(*(pmdp), entry);
+        if (changed) {
+                __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
+                /*
+                 * Since we are not supporting SW TLB systems, we don't
+                 * have anything similar to flush_tlb_page_nohash().
+                 */
+        }
+        return changed;
+}
+
+unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+                                  pmd_t *pmdp, unsigned long clr)
+{
+
+        unsigned long old, tmp;
+
+#ifdef CONFIG_DEBUG_VM
+        WARN_ON(!pmd_trans_huge(*pmdp));
+        assert_spin_locked(&mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+        __asm__ __volatile__(
+        "1:     ldarx   %0,0,%3\n\
+                andi.   %1,%0,%6\n\
+                bne-    1b \n\
+                andc    %1,%0,%4 \n\
+                stdcx.  %1,0,%3 \n\
+                bne-    1b"
+        : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+        : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
+        : "cc" );
+#else
+        old = pmd_val(*pmdp);
+        *pmdp = __pmd(old & ~clr);
+#endif
+        if (old & _PAGE_HASHPTE)
+                hpte_do_hugepage_flush(mm, addr, pmdp);
+        return old;
+}
+
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+                       pmd_t *pmdp)
+{
+        pmd_t pmd;
+
+        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+        if (pmd_trans_huge(*pmdp)) {
+                pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+        } else {
+                /*
+                 * khugepaged calls this for a normal pmd
+                 */
+                pmd = *pmdp;
+                pmd_clear(pmdp);
+                /*
+                 * Wait for all pending hash_page to finish. This is needed
+                 * in case of subpage collapse. When we collapse normal pages
+                 * to a hugepage, we first clear the pmd, then invalidate all
+                 * the PTE entries. The assumption here is that any low level
+                 * page fault will see a none pmd and take the slow path that
+                 * will wait on mmap_sem. But we could very well be in
+                 * hash_page with a local ptep pointer value. Such a hash_page
+                 * can result in adding new HPTE entries for normal subpages.
+                 * That means we could be modifying the page content as we
+                 * copy them to a huge page. So wait for parallel hash_page
+                 * to finish before invalidating HPTE entries. We can do this
+                 * by sending an IPI to all the cpus and executing a dummy
+                 * function there.
+                 */
+                kick_all_cpus_sync();
+                /*
+                 * Now invalidate the hpte entries in the range
+                 * covered by pmd. This makes sure we take a
+                 * fault and will find the pmd as none, which will
+                 * result in a major fault which takes mmap_sem and
+                 * hence waits for collapse to complete. Without this,
+                 * __collapse_huge_page_copy can end up copying
+                 * the old content.
+                 */
+                flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+        }
+        return pmd;
+}
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                              unsigned long address, pmd_t *pmdp)
+{
+        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty. The generic routines only flush if the
+ * entry was young or dirty, which is not good enough.
+ *
+ * We should be more intelligent about this, but for the moment we override
+ * these functions and force a tlb flush unconditionally.
+ */
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+                           unsigned long address, pmd_t *pmdp)
+{
+        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We mark the pmd splitting and invalidate all the hpte
+ * entries for this hugepage.
+ */
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+                          unsigned long address, pmd_t *pmdp)
+{
+        unsigned long old, tmp;
+
+        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+#ifdef CONFIG_DEBUG_VM
+        WARN_ON(!pmd_trans_huge(*pmdp));
+        assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+
+        __asm__ __volatile__(
+        "1:     ldarx   %0,0,%3\n\
+                andi.   %1,%0,%6\n\
+                bne-    1b \n\
+                ori     %1,%0,%4 \n\
+                stdcx.  %1,0,%3 \n\
+                bne-    1b"
+        : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+        : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
+        : "cc" );
+#else
+        old = pmd_val(*pmdp);
+        *pmdp = __pmd(old | _PAGE_SPLITTING);
+#endif
+        /*
+         * If we didn't have the splitting flag set, go and flush the
+         * HPTE entries.
+         */
+        if (!(old & _PAGE_SPLITTING)) {
+                /* We need to flush the hpte */
+                if (old & _PAGE_HASHPTE)
+                        hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
+        }
+}
+
+/*
+ * We want to put the pgtable in the pmd and use the pgtable to track
+ * the base page size hptes.
+ */
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                pgtable_t pgtable)
+{
+        pgtable_t *pgtable_slot;
+        assert_spin_locked(&mm->page_table_lock);
+        /*
+         * We store the pgtable in the second half of the PMD.
+         */
+        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+        *pgtable_slot = pgtable;
+        /*
+         * Expose the deposited pgtable to other cpus
+         * before we set the hugepage PTE at the pmd level.
+         * The hash fault code looks at the deposited pgtable
+         * to store hash index values.
+         */
+        smp_wmb();
+}
+
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+        pgtable_t pgtable;
+        pgtable_t *pgtable_slot;
+
+        assert_spin_locked(&mm->page_table_lock);
+        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+        pgtable = *pgtable_slot;
+        /*
+         * Once we withdraw, mark the entry NULL.
+         */
+        *pgtable_slot = NULL;
+        /*
+         * We store HPTE information in the deposited PTE fragment;
+         * zero out the content on withdraw.
+         */
+        memset(pgtable, 0, PTE_FRAG_SIZE);
+        return pgtable;
+}
+
+/*
+ * Set a new huge pmd. We should not be called for updating
+ * an existing pmd entry; that should go via pmd_hugepage_update.
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+                pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+        WARN_ON(!pmd_none(*pmdp));
+        assert_spin_locked(&mm->page_table_lock);
+        WARN_ON(!pmd_trans_huge(pmd));
+#endif
+        return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+                     pmd_t *pmdp)
+{
+        pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT);
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * need to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+                            pmd_t *pmdp)
+{
+        int ssize, i;
+        unsigned long s_addr;
+        unsigned int psize, valid;
+        unsigned char *hpte_slot_array;
+        unsigned long hidx, vpn, vsid, hash, shift, slot;
+
+        /*
+         * Flush all the hptes mapping this hugepage
+         */
+        s_addr = addr & HPAGE_PMD_MASK;
+        hpte_slot_array = get_hpte_slot_array(pmdp);
+        /*
+         * If we try to do a huge PTE update after a withdraw is done,
+         * we will find the below NULL. This happens when we do
+         * split_huge_page_pmd.
+         */
+        if (!hpte_slot_array)
+                return;
+
+        /* get the base page size */
+        psize = get_slice_psize(mm, s_addr);
+        shift = mmu_psize_defs[psize].shift;
+
+        for (i = 0; i < (HPAGE_PMD_SIZE >> shift); i++) {
+                /*
+                 * 8 bits per hpte entry:
+                 * 000 | [ secondary group (one bit) | hidx (3 bits) | valid bit ]
+                 */
+                valid = hpte_valid(hpte_slot_array, i);
+                if (!valid)
+                        continue;
+                hidx = hpte_hash_index(hpte_slot_array, i);
+
+                /* get the vpn */
+                addr = s_addr + (i * (1ul << shift));
+                if (!is_kernel_addr(addr)) {
+                        ssize = user_segment_size(addr);
+                        vsid = get_vsid(mm->context.id, addr, ssize);
+                        WARN_ON(vsid == 0);
+                } else {
+                        vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+                        ssize = mmu_kernel_ssize;
+                }
+
+                vpn = hpt_vpn(addr, vsid, ssize);
+                hash = hpt_hash(vpn, shift, ssize);
+                if (hidx & _PTEIDX_SECONDARY)
+                        hash = ~hash;
+
+                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+                slot += hidx & _PTEIDX_GROUP_IX;
+                ppc_md.hpte_invalidate(slot, vpn, psize,
+                                       MMU_PAGE_16M, ssize, 0);
+        }
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+        pmd_val(pmd) |= pgprot_val(pgprot);
+        return pmd;
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+        pmd_t pmd;
+        /*
+         * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
+         * set. We use this to check for a THP page at the pmd level:
+         * for a hugepage leaf pte, the bottom two bits != 00.
+         */
+        pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
+        pmd_val(pmd) |= _PAGE_THP_HUGE;
+        pmd = pmd_set_protbits(pmd, pgprot);
+        return pmd;
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+        return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+
+        pmd_val(pmd) &= _HPAGE_CHG_MASK;
+        pmd = pmd_set_protbits(pmd, newprot);
+        return pmd;
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux HUGE PMD entry.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+                          pmd_t *pmd)
+{
+        return;
+}
+
+pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+                         unsigned long addr, pmd_t *pmdp)
+{
+        pmd_t old_pmd;
+        pgtable_t pgtable;
+        unsigned long old;
+        pgtable_t *pgtable_slot;
+
+        old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
+        old_pmd = __pmd(old);
+        /*
+         * We have pmd == none and we are holding page_table_lock.
+         * So we can safely go and clear the pgtable hash
+         * index info.
+         */
+        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+        pgtable = *pgtable_slot;
+        /*
+         * Let's zero out the old valid and hash index details;
+         * the hash fault code looks at them.
+         */
+        memset(pgtable, 0, PTE_FRAG_SIZE);
+        return old_pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 023ec8a13f38..48bf63ea6525 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -219,3 +219,30 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
         arch_leave_lazy_mmu_mode();
         local_irq_restore(flags);
 }
+
+void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+        pte_t *pte;
+        pte_t *start_pte;
+        unsigned long flags;
+
+        addr = _ALIGN_DOWN(addr, PMD_SIZE);
+        /* Note: Normally, we should only ever use a batch within a
+         * PTE locked section. This violates the rule, but will work
+         * since we don't actually modify the PTEs; we just flush the
+         * hash while leaving the PTEs intact (including their reference
+         * to being hashed). This is not the most performance-oriented
+         * way to do things, but is fine for our needs here.
+         */
+        local_irq_save(flags);
+        arch_enter_lazy_mmu_mode();
+        start_pte = pte_offset_map(pmd, addr);
+        for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
+                unsigned long pteval = pte_val(*pte);
+                if (pteval & _PAGE_HASHPTE)
+                        hpte_need_flush(mm, addr, pte, pteval, 0);
+                addr += PAGE_SIZE;
+        }
+        arch_leave_lazy_mmu_mode();
+        local_irq_restore(flags);
+}
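
The deposit and withdraw helpers above rely on the doubled PMD table: "(pgtable_t *)pmdp + PTRS_PER_PMD" lands on the slot in the second half that mirrors the PMD entry pmdp points at in the first half. Below is a rough userspace sketch of that pointer arithmetic; the PTRS_PER_PMD value and the sketch_* names are illustrative assumptions, and it assumes pmd entries and pointers have the same size, as they do on ppc64.

#include <stdio.h>

/* Illustrative value only; the real PTRS_PER_PMD depends on the kernel config. */
#define SKETCH_PTRS_PER_PMD 2048

typedef unsigned long sketch_pmd_t;
typedef void *sketch_pgtable_t;

/*
 * A doubled PMD page: the first half holds the PMD entries, the second half
 * holds one deposited-pgtable pointer per PMD entry. Stepping PTRS_PER_PMD
 * pointer-sized slots forward from &pmd[i] therefore lands on deposit[i].
 */
struct sketch_pmd_page {
        sketch_pmd_t     pmd[SKETCH_PTRS_PER_PMD];
        sketch_pgtable_t deposit[SKETCH_PTRS_PER_PMD];
};

static void sketch_deposit(sketch_pmd_t *pmdp, sketch_pgtable_t pgtable)
{
        /* same offset trick as pgtable_trans_huge_deposit() in the patch */
        sketch_pgtable_t *slot = (sketch_pgtable_t *)pmdp + SKETCH_PTRS_PER_PMD;
        *slot = pgtable;
}

static sketch_pgtable_t sketch_withdraw(sketch_pmd_t *pmdp)
{
        sketch_pgtable_t *slot = (sketch_pgtable_t *)pmdp + SKETCH_PTRS_PER_PMD;
        sketch_pgtable_t pgtable = *slot;
        *slot = NULL;
        return pgtable;
}

int main(void)
{
        static struct sketch_pmd_page page;
        static char fake_pte_page[4096];

        sketch_deposit(&page.pmd[7], fake_pte_page);
        printf("deposited into second half: %d\n",
               page.deposit[7] == (void *)fake_pte_page);
        printf("withdraw returns it:        %d\n",
               sketch_withdraw(&page.pmd[7]) == (void *)fake_pte_page);
        return 0;
}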