author     Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>   2013-06-20 05:00:15 -0400
committer  Benjamin Herrenschmidt <benh@kernel.crashing.org>    2013-06-21 02:01:53 -0400
commit     074c2eae3e9b66c03a17a12df8f2cd19382b68ab (patch)
tree       1606baa1df8093b5ca0ef638236b2ace837e1d98 /arch/powerpc/mm
parent     f940f5289873af2ad2c4e73f88c24ad2b8fe3f87 (diff)
powerpc/THP: Implement transparent hugepages for ppc64
We now have PMD entries covering a 16MB range, and the PMD table is double its original size. We use the second half of the PMD table to deposit the pgtable (PTE page). The deposited PTE page is further used to track the HPTE information. The information includes [ secondary group | 3 bit hidx | valid ]. We use one byte per HPTE entry. With a 16MB hugepage and 64K HPTEs we need 256 entries, and with 4K HPTEs we need 4096 entries. Both will fit in a 4K PTE page.

On hugepage invalidate we need to walk the PTE page and invalidate all valid HPTEs.

This patch implements the necessary arch-specific functions for THP support and also the hugepage invalidate logic. These PMD-related functions are intentionally kept similar to their PTE counterparts.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
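To make the tracking scheme concrete, here is a small self-contained C sketch of one tracking byte ([ secondary group | hidx | valid ]) and of the sizing arithmetic from the paragraph above. The sketch_* names and the exact bit positions are illustrative assumptions modelled on the comment in hpte_do_hugepage_flush() below; the kernel's real helpers (get_hpte_slot_array(), hpte_valid(), hpte_hash_index()) live in arch headers that are not part of this diff.

#include <assert.h>
#include <stdio.h>

/*
 * Illustrative layout of one tracking byte, following the patch comment:
 * 000 | secondary group (1 bit) | hidx (3 bits) | valid (1 bit).
 * The decoded index carries both the secondary-group bit (0x8) and the
 * 3-bit slot-in-group index (0x7), matching how hpte_do_hugepage_flush()
 * masks it with _PTEIDX_SECONDARY and _PTEIDX_GROUP_IX.
 */
#define SKETCH_HPTE_VALID        0x1
#define SKETCH_PTEIDX_SECONDARY  0x8
#define SKETCH_PTEIDX_GROUP_IX   0x7

static int sketch_hpte_valid(const unsigned char *slot_array, int index)
{
        return slot_array[index] & SKETCH_HPTE_VALID;
}

static unsigned int sketch_hpte_hash_index(const unsigned char *slot_array, int index)
{
        /* secondary bit plus 3-bit group index, stored above the valid bit */
        return slot_array[index] >> 1;
}

int main(void)
{
        /*
         * Sizing from the commit message: one byte per underlying HPTE of a
         * 16MB hugepage gives 256 bytes with a 64K base page size and 4096
         * bytes with a 4K base page size -- both fit in a 4K PTE page.
         */
        assert((16u << 20) / (64u << 10) == 256);
        assert((16u << 20) / (4u << 10) == 4096);

        unsigned char slot_array[4096] = { 0 };
        /* mark entry 0 valid, slot 5 of the secondary hash group */
        slot_array[0] = SKETCH_HPTE_VALID |
                        ((SKETCH_PTEIDX_SECONDARY | 5) << 1);

        unsigned int hidx = sketch_hpte_hash_index(slot_array, 0);
        printf("valid=%d secondary=%u group_ix=%u\n",
               sketch_hpte_valid(slot_array, 0),
               (hidx & SKETCH_PTEIDX_SECONDARY) ? 1u : 0u,
               hidx & SKETCH_PTEIDX_GROUP_IX);
        return 0;
}

One byte per base-page HPTE keeps the lookup a simple array index, and the worst case (a 4K base page size, 4096 entries) exactly fills the deposited 4K PTE page that hpte_do_hugepage_flush() walks.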
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/pgtable_64.c  377
-rw-r--r--  arch/powerpc/mm/tlb_hash64.c   27
2 files changed, 404 insertions(+), 0 deletions(-)
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index a854096e1023..e4d3e9fb59be 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -338,6 +338,19 @@ EXPORT_SYMBOL(iounmap);
 EXPORT_SYMBOL(__iounmap);
 EXPORT_SYMBOL(__iounmap_at);
 
+/*
+ * For a hugepage we have the pfn in the pmd; we use PTE_RPN_SHIFT bits for flags.
+ * For a PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address.
+ */
+struct page *pmd_page(pmd_t pmd)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        if (pmd_trans_huge(pmd))
+                return pfn_to_page(pmd_pfn(pmd));
+#endif
+        return virt_to_page(pmd_page_vaddr(pmd));
+}
+
 #ifdef CONFIG_PPC_64K_PAGES
 static pte_t *get_from_cache(struct mm_struct *mm)
 {
@@ -455,3 +468,367 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
 }
 #endif
 #endif /* CONFIG_PPC_64K_PAGES */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+/*
+ * This is called when relaxing access to a hugepage. It's also called in the
+ * page fault path when we don't hit any of the major fault cases, i.e., a
+ * minor update of _PAGE_ACCESSED, _PAGE_DIRTY, etc. The generic code will
+ * have handled those two for us; we additionally deal with missing execute
+ * permission here on some processors.
+ */
+int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+                          pmd_t *pmdp, pmd_t entry, int dirty)
+{
+        int changed;
+#ifdef CONFIG_DEBUG_VM
+        WARN_ON(!pmd_trans_huge(*pmdp));
+        assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+        changed = !pmd_same(*(pmdp), entry);
+        if (changed) {
+                __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
+                /*
+                 * Since we are not supporting SW TLB systems, we don't
+                 * have anything similar to flush_tlb_page_nohash().
+                 */
+        }
+        return changed;
+}
+
+unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
+                                  pmd_t *pmdp, unsigned long clr)
+{
+
+        unsigned long old, tmp;
+
+#ifdef CONFIG_DEBUG_VM
+        WARN_ON(!pmd_trans_huge(*pmdp));
+        assert_spin_locked(&mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+        __asm__ __volatile__(
+        "1:     ldarx   %0,0,%3\n\
+                andi.   %1,%0,%6\n\
+                bne-    1b \n\
+                andc    %1,%0,%4 \n\
+                stdcx.  %1,0,%3 \n\
+                bne-    1b"
+        : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+        : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
+        : "cc" );
+#else
+        old = pmd_val(*pmdp);
+        *pmdp = __pmd(old & ~clr);
+#endif
+        if (old & _PAGE_HASHPTE)
+                hpte_do_hugepage_flush(mm, addr, pmdp);
+        return old;
+}
+
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+                       pmd_t *pmdp)
+{
+        pmd_t pmd;
+
+        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+        if (pmd_trans_huge(*pmdp)) {
+                pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+        } else {
+                /*
+                 * khugepaged calls this for a normal pmd
+                 */
+                pmd = *pmdp;
+                pmd_clear(pmdp);
+                /*
+                 * Wait for all pending hash_page to finish. This is needed
+                 * in case of subpage collapse. When we collapse normal pages
+                 * to a hugepage, we first clear the pmd, then invalidate all
+                 * the PTE entries. The assumption here is that any low level
+                 * page fault will see a none pmd and take the slow path that
+                 * will wait on mmap_sem. But we could very well be in
+                 * hash_page with a local ptep pointer value. Such a hash_page
+                 * can result in adding new HPTE entries for normal subpages.
+                 * That means we could be modifying the page content as we
+                 * copy them to a huge page. So wait for parallel hash_page
+                 * to finish before invalidating HPTE entries. We can do this
+                 * by sending an IPI to all the cpus and executing a dummy
+                 * function there.
+                 */
+                kick_all_cpus_sync();
+                /*
+                 * Now invalidate the hpte entries in the range
+                 * covered by pmd. This makes sure we take a
+                 * fault and will find the pmd as none, which will
+                 * result in a major fault which takes mmap_sem and
+                 * hence waits for collapse to complete. Without this,
+                 * __collapse_huge_page_copy can end up copying
+                 * the old content.
+                 */
+                flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
+        }
+        return pmd;
+}
+
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                              unsigned long address, pmd_t *pmdp)
+{
+        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We currently remove entries from the hashtable regardless of whether
+ * the entry was young or dirty. The generic routines only flush if the
+ * entry was young or dirty, which is not good enough.
+ *
+ * We should be more intelligent about this, but for the moment we override
+ * these functions and force a tlb flush unconditionally.
+ */
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+                           unsigned long address, pmd_t *pmdp)
+{
+        return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
+}
+
+/*
+ * We mark the pmd splitting and invalidate all the hpte
+ * entries for this hugepage.
+ */
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+                          unsigned long address, pmd_t *pmdp)
+{
+        unsigned long old, tmp;
+
+        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+#ifdef CONFIG_DEBUG_VM
+        WARN_ON(!pmd_trans_huge(*pmdp));
+        assert_spin_locked(&vma->vm_mm->page_table_lock);
+#endif
+
+#ifdef PTE_ATOMIC_UPDATES
+
+        __asm__ __volatile__(
+        "1:     ldarx   %0,0,%3\n\
+                andi.   %1,%0,%6\n\
+                bne-    1b \n\
+                ori     %1,%0,%4 \n\
+                stdcx.  %1,0,%3 \n\
+                bne-    1b"
+        : "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
+        : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
+        : "cc" );
+#else
+        old = pmd_val(*pmdp);
+        *pmdp = __pmd(old | _PAGE_SPLITTING);
+#endif
+        /*
+         * If we didn't have the splitting flag set, go and flush the
+         * HPTE entries.
+         */
+        if (!(old & _PAGE_SPLITTING)) {
+                /* We need to flush the hpte */
+                if (old & _PAGE_HASHPTE)
+                        hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
+        }
+}
+
+/*
+ * We want to put the pgtable in the pmd and use the pgtable to track
+ * the base page size hptes.
+ */
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+                                pgtable_t pgtable)
+{
+        pgtable_t *pgtable_slot;
+        assert_spin_locked(&mm->page_table_lock);
+        /*
+         * We store the pgtable in the second half of the PMD.
+         */
+        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+        *pgtable_slot = pgtable;
+        /*
+         * Expose the deposited pgtable to other cpus
+         * before we set the hugepage PTE at the pmd level.
+         * The hash fault code looks at the deposited pgtable
+         * to store hash index values.
+         */
+        smp_wmb();
+}
+
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
+{
+        pgtable_t pgtable;
+        pgtable_t *pgtable_slot;
+
+        assert_spin_locked(&mm->page_table_lock);
+        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+        pgtable = *pgtable_slot;
+        /*
+         * Once we withdraw, mark the entry NULL.
+         */
+        *pgtable_slot = NULL;
+        /*
+         * We store HPTE information in the deposited PTE fragment;
+         * zero out the content on withdraw.
+         */
+        memset(pgtable, 0, PTE_FRAG_SIZE);
+        return pgtable;
+}
+
+/*
+ * Set a new huge pmd. We should not be called for updating
+ * an existing pmd entry; that should go via pmd_hugepage_update.
+ */
+void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+                pmd_t *pmdp, pmd_t pmd)
+{
+#ifdef CONFIG_DEBUG_VM
+        WARN_ON(!pmd_none(*pmdp));
+        assert_spin_locked(&mm->page_table_lock);
+        WARN_ON(!pmd_trans_huge(pmd));
+#endif
+        return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
+}
+
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+                     pmd_t *pmdp)
+{
+        pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT);
+}
+
+/*
+ * A linux hugepage PMD was changed and the corresponding hash table entries
+ * need to be flushed.
+ */
+void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
+                            pmd_t *pmdp)
+{
+        int ssize, i;
+        unsigned long s_addr;
+        unsigned int psize, valid;
+        unsigned char *hpte_slot_array;
+        unsigned long hidx, vpn, vsid, hash, shift, slot;
+
+        /*
+         * Flush all the hptes mapping this hugepage
+         */
+        s_addr = addr & HPAGE_PMD_MASK;
+        hpte_slot_array = get_hpte_slot_array(pmdp);
+        /*
+         * If we try to do a huge PTE update after a withdraw is done,
+         * we will find the below NULL. This happens when we do
+         * split_huge_page_pmd.
+         */
+        if (!hpte_slot_array)
+                return;
+
+        /* get the base page size */
+        psize = get_slice_psize(mm, s_addr);
+        shift = mmu_psize_defs[psize].shift;
+
+        for (i = 0; i < (HPAGE_PMD_SIZE >> shift); i++) {
+                /*
+                 * 8 bits per hpte entry:
+                 * 000 | [ secondary group (one bit) | hidx (3 bits) | valid bit ]
+                 */
+                valid = hpte_valid(hpte_slot_array, i);
+                if (!valid)
+                        continue;
+                hidx = hpte_hash_index(hpte_slot_array, i);
+
+                /* get the vpn */
+                addr = s_addr + (i * (1ul << shift));
+                if (!is_kernel_addr(addr)) {
+                        ssize = user_segment_size(addr);
+                        vsid = get_vsid(mm->context.id, addr, ssize);
+                        WARN_ON(vsid == 0);
+                } else {
+                        vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
+                        ssize = mmu_kernel_ssize;
+                }
+
+                vpn = hpt_vpn(addr, vsid, ssize);
+                hash = hpt_hash(vpn, shift, ssize);
+                if (hidx & _PTEIDX_SECONDARY)
+                        hash = ~hash;
+
+                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+                slot += hidx & _PTEIDX_GROUP_IX;
+                ppc_md.hpte_invalidate(slot, vpn, psize,
+                                       MMU_PAGE_16M, ssize, 0);
+        }
+}
+
+static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
+{
+        pmd_val(pmd) |= pgprot_val(pgprot);
+        return pmd;
+}
+
+pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+        pmd_t pmd;
+        /*
+         * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
+         * set. We use this to check for a THP page at the pmd level:
+         * for a hugepage leaf pte, the bottom two bits != 00.
+         */
+        pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
+        pmd_val(pmd) |= _PAGE_THP_HUGE;
+        pmd = pmd_set_protbits(pmd, pgprot);
+        return pmd;
+}
+
+pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
+{
+        return pfn_pmd(page_to_pfn(page), pgprot);
+}
+
+pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+
+        pmd_val(pmd) &= _HPAGE_CHG_MASK;
+        pmd = pmd_set_protbits(pmd, newprot);
+        return pmd;
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a HUGE PMD entry in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux HUGE PMD entry.
+ */
+void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
+                          pmd_t *pmd)
+{
+        return;
+}
+
+pmd_t pmdp_get_and_clear(struct mm_struct *mm,
+                         unsigned long addr, pmd_t *pmdp)
+{
+        pmd_t old_pmd;
+        pgtable_t pgtable;
+        unsigned long old;
+        pgtable_t *pgtable_slot;
+
+        old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
+        old_pmd = __pmd(old);
+        /*
+         * We have pmd == none and we are holding page_table_lock.
+         * So we can safely go and clear the pgtable hash
+         * index info.
+         */
+        pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
+        pgtable = *pgtable_slot;
+        /*
+         * Let's zero out the old valid and hash index details;
+         * the hash fault code looks at them.
+         */
+        memset(pgtable, 0, PTE_FRAG_SIZE);
+        return old_pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 023ec8a13f38..48bf63ea6525 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -219,3 +219,30 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
         arch_leave_lazy_mmu_mode();
         local_irq_restore(flags);
 }
+
+void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+        pte_t *pte;
+        pte_t *start_pte;
+        unsigned long flags;
+
+        addr = _ALIGN_DOWN(addr, PMD_SIZE);
+        /* Note: Normally, we should only ever use a batch within a
+         * PTE locked section. This violates the rule, but will work
+         * since we don't actually modify the PTEs; we just flush the
+         * hash while leaving the PTEs intact (including their reference
+         * to being hashed). This is not the most performance-oriented
+         * way to do things, but is fine for our needs here.
+         */
+        local_irq_save(flags);
+        arch_enter_lazy_mmu_mode();
+        start_pte = pte_offset_map(pmd, addr);
+        for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) {
+                unsigned long pteval = pte_val(*pte);
+                if (pteval & _PAGE_HASHPTE)
+                        hpte_need_flush(mm, addr, pte, pteval, 0);
+                addr += PAGE_SIZE;
+        }
+        arch_leave_lazy_mmu_mode();
+        local_irq_restore(flags);
+}
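
The deposit and withdraw helpers above rely on the doubled PMD table: "(pgtable_t *)pmdp + PTRS_PER_PMD" lands on the slot in the second half that mirrors the PMD entry pmdp points at in the first half. Below is a rough userspace sketch of that pointer arithmetic; the PTRS_PER_PMD value and the sketch_* names are illustrative assumptions, and it assumes pmd entries and pointers have the same size, as they do on ppc64.

#include <stdio.h>

/* Illustrative value only; the real PTRS_PER_PMD depends on the kernel config. */
#define SKETCH_PTRS_PER_PMD 2048

typedef unsigned long sketch_pmd_t;
typedef void *sketch_pgtable_t;

/*
 * A doubled PMD page: the first half holds the PMD entries, the second half
 * holds one deposited-pgtable pointer per PMD entry. Stepping PTRS_PER_PMD
 * pointer-sized slots forward from &pmd[i] therefore lands on deposit[i].
 */
struct sketch_pmd_page {
        sketch_pmd_t     pmd[SKETCH_PTRS_PER_PMD];
        sketch_pgtable_t deposit[SKETCH_PTRS_PER_PMD];
};

static void sketch_deposit(sketch_pmd_t *pmdp, sketch_pgtable_t pgtable)
{
        /* same offset trick as pgtable_trans_huge_deposit() in the patch */
        sketch_pgtable_t *slot = (sketch_pgtable_t *)pmdp + SKETCH_PTRS_PER_PMD;
        *slot = pgtable;
}

static sketch_pgtable_t sketch_withdraw(sketch_pmd_t *pmdp)
{
        sketch_pgtable_t *slot = (sketch_pgtable_t *)pmdp + SKETCH_PTRS_PER_PMD;
        sketch_pgtable_t pgtable = *slot;
        *slot = NULL;
        return pgtable;
}

int main(void)
{
        static struct sketch_pmd_page page;
        static char fake_pte_page[4096];

        sketch_deposit(&page.pmd[7], fake_pte_page);
        printf("deposited into second half: %d\n",
               page.deposit[7] == (void *)fake_pte_page);
        printf("withdraw returns it:        %d\n",
               sketch_withdraw(&page.pmd[7]) == (void *)fake_pte_page);
        return 0;
}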