author		Becky Bruce <beckyb@kernel.crashing.org>	2011-06-28 05:54:48 -0400
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2011-09-19 19:19:40 -0400
commit		41151e77a4d96ea138cede6d84c955aa4769ce74
tree		2d997b77b9adf406a2fd30326bff688577d2e64f
parent		7df5659eefad9b6d457ccdee016bd78bd064cfc0
powerpc: Hugetlb for BookE
Enable hugepages on Freescale BookE processors.  This allows the kernel to
use huge TLB entries to map pages, which can greatly reduce the number of
TLB misses and the amount of TLB thrashing experienced by applications with
large memory footprints.  Care should be taken when using this on FSL
processors, as the number of large TLB entries supported by the core is low
(16-64) on current processors.

The supported set of hugepage sizes includes 4m, 16m, 64m, 256m, and 1g.
Page sizes larger than the max zone size are called "gigantic" pages and
must be allocated on the command line (and cannot be deallocated).

This is currently only fully implemented for Freescale 32-bit BookE
processors, but there is some infrastructure in the code for 64-bit BookE.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
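Because gigantic pages must be reserved at boot, the patch parses interleaved
hugepagesz=/hugepages= parameters from the kernel command line (see
do_gpage_early_setup() in the diff below).  As an illustration only (the sizes
and counts here are example values, not defaults), a boot line reserving two
1g and four 256m gigantic pages might look like:

    hugepagesz=1g hugepages=2 hugepagesz=256m hugepages=4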
Diffstat (limited to 'arch/powerpc/mm/hugetlbpage.c')
-rw-r--r--	arch/powerpc/mm/hugetlbpage.c	379
1 file changed, 337 insertions(+), 42 deletions(-)
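A detail worth keeping in mind while reading the diff: a hugepd entry packs
the address of a hugepte table and the huge page shift into one word.  The
table comes from a kmem cache aligned to HUGEPD_SHIFT_MASK + 1, so its low
bits are free to hold the shift, and the PD_HUGE bit is cleared to mark the
entry as a huge-page directory.  The following standalone sketch (not kernel
code; the constant values are illustrative assumptions) mirrors the encode in
__hugepte_alloc() and the decode in the removed hugepd_shift()/hugepd_page()
helpers shown below:

#include <stdio.h>

/* Illustrative stand-ins for the kernel's constants (values assumed). */
#define PD_HUGE			0x80000000UL	/* flag bit cleared to mark a hugepd */
#define HUGEPD_SHIFT_MASK	0x3fUL		/* low bits hold the page shift */

int main(void)
{
	/* Stand-in for a hugepte table from the aligned kmem cache; the
	 * alignment guarantees the HUGEPD_SHIFT_MASK bits start as zero. */
	unsigned long table = 0x12345000UL;
	unsigned int pshift = 24;		/* e.g. a 16M huge page */

	/* Encode, as in __hugepte_alloc(): */
	unsigned long pd = (table & ~PD_HUGE) | pshift;

	/* Decode, as in hugepd_shift() and hugepd_page(): */
	printf("shift = %lu, table = %#lx\n",
	       pd & HUGEPD_SHIFT_MASK, pd & ~HUGEPD_SHIFT_MASK);
	return 0;
}

(On 64-bit the real hugepd_page() also ORs the kernel-address top bits back
in, as the removed helper's 0xc000000000000000 in the diff shows.)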
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0b9a5c1901b9..3a5f59dcbb33 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1,7 +1,8 @@
 /*
- * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ * PPC Huge TLB Page Support for Kernel.
  *
  * Copyright (C) 2003 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
  *
  * Based on the IA-32 version:
  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
@@ -11,24 +12,39 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
+#include <linux/of_fdt.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
+#include <asm/setup.h>
 
 #define PAGE_SHIFT_64K	16
 #define PAGE_SHIFT_16M	24
 #define PAGE_SHIFT_16G	34
 
-#define MAX_NUMBER_GPAGES	1024
+unsigned int HPAGE_SHIFT;
 
-/* Tracks the 16G pages after the device tree is scanned and before the
- * huge_boot_pages list is ready.  */
-static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
+/*
+ * Tracks gpages after the device tree is scanned and before the
+ * huge_boot_pages list is ready.  On 64-bit implementations, this is
+ * just used to track 16G pages and so is a single array.  32-bit
+ * implementations may have more than one gpage size due to limitations
+ * of the memory allocators, so we need multiple arrays
+ */
+#ifdef CONFIG_PPC64
+#define MAX_NUMBER_GPAGES	1024
+static u64 gpage_freearray[MAX_NUMBER_GPAGES];
 static unsigned nr_gpages;
-
-/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
- * will choke on pointers to hugepte tables, which is handy for
- * catching screwups early. */
+#else
+#define MAX_NUMBER_GPAGES	128
+struct psize_gpages {
+	u64 gpage_list[MAX_NUMBER_GPAGES];
+	unsigned int nr_gpages;
+};
+static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
+#endif
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
@@ -49,25 +65,6 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
 
 #define hugepd_none(hpd)	((hpd).pd == 0)
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-	BUG_ON(!hugepd_ok(hpd));
-	return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-	return hpd.pd & HUGEPD_SHIFT_MASK;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
-{
-	unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
-	pte_t *dir = hugepd_page(*hpdp);
-
-	return dir + idx;
-}
-
 pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
 {
 	pgd_t *pg;
@@ -93,7 +90,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 			if (is_hugepd(pm))
 				hpdp = (hugepd_t *)pm;
 			else if (!pmd_none(*pm)) {
-				return pte_offset_map(pm, ea);
+				return pte_offset_kernel(pm, ea);
 			}
 		}
 	}
@@ -114,8 +111,18 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 			   unsigned long address, unsigned pdshift, unsigned pshift)
 {
-	pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
-				       GFP_KERNEL|__GFP_REPEAT);
+	struct kmem_cache *cachep;
+	pte_t *new;
+
+#ifdef CONFIG_PPC64
+	cachep = PGT_CACHE(pdshift - pshift);
+#else
+	int i;
+	int num_hugepd = 1 << (pshift - pdshift);
+	cachep = hugepte_cache;
+#endif
+
+	new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);
 
 	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
 	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
@@ -124,10 +131,31 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
+#ifdef CONFIG_PPC64
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
+		kmem_cache_free(cachep, new);
 	else
-		hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
+		hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+#else
+	/*
+	 * We have multiple higher-level entries that point to the same
+	 * actual pte location.  Fill in each as we go and backtrack on error.
+	 * We need all of these so the DTLB pgtable walk code can find the
+	 * right higher-level entry without knowing if it's a hugepage or not.
+	 */
+	for (i = 0; i < num_hugepd; i++, hpdp++) {
+		if (unlikely(!hugepd_none(*hpdp)))
+			break;
+		else
+			hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+	}
+	/* If we bailed from the for loop early, an error occurred, clean up */
+	if (i < num_hugepd) {
+		for (i = i - 1 ; i >= 0; i--, hpdp--)
+			hpdp->pd = 0;
+		kmem_cache_free(cachep, new);
+	}
+#endif
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
@@ -169,11 +197,132 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 	return hugepte_offset(hpdp, addr, pdshift);
 }
 
+#ifdef CONFIG_PPC32
 /* Build list of addresses of gigantic pages.  This function is used in early
  * boot before the buddy or bootmem allocator is setup.
  */
-void add_gpage(unsigned long addr, unsigned long page_size,
-		unsigned long number_of_pages)
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
+{
+	unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
+	int i;
+
+	if (addr == 0)
+		return;
+
+	gpage_freearray[idx].nr_gpages = number_of_pages;
+
+	for (i = 0; i < number_of_pages; i++) {
+		gpage_freearray[idx].gpage_list[i] = addr;
+		addr += page_size;
+	}
+}
+
+/*
+ * Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
+{
+	struct huge_bootmem_page *m;
+	int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
+	int nr_gpages = gpage_freearray[idx].nr_gpages;
+
+	if (nr_gpages == 0)
+		return 0;
+
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * If gpages can be in highmem we can't use the trick of storing the
+	 * data structure in the page; allocate space for this
+	 */
+	m = alloc_bootmem(sizeof(struct huge_bootmem_page));
+	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
+#else
+	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
+#endif
+
+	list_add(&m->list, &huge_boot_pages);
+	gpage_freearray[idx].nr_gpages = nr_gpages;
+	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
+	m->hstate = hstate;
+
+	return 1;
+}
+/*
+ * Scan the command line hugepagesz= options for gigantic pages; store those in
+ * a list that we use to allocate the memory once all options are parsed.
+ */
+
+unsigned long gpage_npages[MMU_PAGE_COUNT];
+
+static int __init do_gpage_early_setup(char *param, char *val)
+{
+	static phys_addr_t size;
+	unsigned long npages;
+
+	/*
+	 * The hugepagesz and hugepages cmdline options are interleaved.  We
+	 * use the size variable to keep track of whether or not this was done
+	 * properly and skip over instances where it is incorrect.  Other
+	 * command-line parsing code will issue warnings, so we don't need to.
+	 *
+	 */
+	if ((strcmp(param, "default_hugepagesz") == 0) ||
+	    (strcmp(param, "hugepagesz") == 0)) {
+		size = memparse(val, NULL);
+	} else if (strcmp(param, "hugepages") == 0) {
+		if (size != 0) {
+			if (sscanf(val, "%lu", &npages) <= 0)
+				npages = 0;
+			gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
+			size = 0;
+		}
+	}
+	return 0;
+}
+
+
+/*
+ * This function allocates physical space for pages that are larger than the
+ * buddy allocator can handle.  We want to allocate these in highmem because
+ * the amount of lowmem is limited.  This means that this function MUST be
+ * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
+ * allocator to grab highmem.
+ */
+void __init reserve_hugetlb_gpages(void)
+{
+	static __initdata char cmdline[COMMAND_LINE_SIZE];
+	phys_addr_t size, base;
+	int i;
+
+	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
+	parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup);
+
+	/*
+	 * Walk gpage list in reverse, allocating larger page sizes first.
+	 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
+	 * When we reach the point in the list where pages are no longer
+	 * considered gpages, we're done.
+	 */
+	for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
+		if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
+			continue;
+		else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
+			break;
+
+		size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
+		base = memblock_alloc_base(size * gpage_npages[i], size,
+					   MEMBLOCK_ALLOC_ANYWHERE);
+		add_gpage(base, size, gpage_npages[i]);
+	}
+}
+
+#else /* PPC64 */
+
+/* Build list of addresses of gigantic pages.  This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 {
 	if (!addr)
 		return;
@@ -199,19 +348,79 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
 	m->hstate = hstate;
 	return 1;
 }
+#endif
 
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 {
 	return 0;
 }
 
+#ifdef CONFIG_PPC32
+#define HUGEPD_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
+
+struct hugepd_freelist {
+	struct rcu_head	rcu;
+	unsigned int index;
+	void *ptes[0];
+};
+
+static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
+
+static void hugepd_free_rcu_callback(struct rcu_head *head)
+{
+	struct hugepd_freelist *batch =
+		container_of(head, struct hugepd_freelist, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		kmem_cache_free(hugepte_cache, batch->ptes[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
+{
+	struct hugepd_freelist **batchp;
+
+	batchp = &__get_cpu_var(hugepd_freelist_cur);
+
+	if (atomic_read(&tlb->mm->mm_users) < 2 ||
+	    cpumask_equal(mm_cpumask(tlb->mm),
+			  cpumask_of(smp_processor_id()))) {
+		kmem_cache_free(hugepte_cache, hugepte);
+		return;
+	}
+
+	if (*batchp == NULL) {
+		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
+		(*batchp)->index = 0;
+	}
+
+	(*batchp)->ptes[(*batchp)->index++] = hugepte;
+	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
+		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
+		*batchp = NULL;
+	}
+}
+#endif
+
 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
 			      unsigned long start, unsigned long end,
 			      unsigned long floor, unsigned long ceiling)
 {
 	pte_t *hugepte = hugepd_page(*hpdp);
-	unsigned shift = hugepd_shift(*hpdp);
+	int i;
+
 	unsigned long pdmask = ~((1UL << pdshift) - 1);
+	unsigned int num_hugepd = 1;
+
+#ifdef CONFIG_PPC64
+	unsigned int shift = hugepd_shift(*hpdp);
+#else
+	/* Note: On 32-bit the hpdp may be the first of several */
+	num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
+#endif
 
 	start &= pdmask;
 	if (start < floor)
@@ -224,9 +433,15 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
 	if (end - 1 > ceiling - 1)
 		return;
 
-	hpdp->pd = 0;
+	for (i = 0; i < num_hugepd; i++, hpdp++)
+		hpdp->pd = 0;
+
 	tlb->need_flush = 1;
+#ifdef CONFIG_PPC64
 	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
+#else
+	hugepd_free(tlb, hugepte);
+#endif
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -331,18 +546,27 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 	 * too.
 	 */
 
-	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
+		pgd = pgd_offset(tlb->mm, addr);
 		if (!is_hugepd(pgd)) {
 			if (pgd_none_or_clear_bad(pgd))
 				continue;
 			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 		} else {
+#ifdef CONFIG_PPC32
+			/*
+			 * Increment next by the size of the huge mapping since
+			 * on 32-bit there may be more than one entry at the pgd
+			 * level for a single hugepage, but all of them point to
+			 * the same kmem cache that holds the hugepte.
+			 */
+			next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
+#endif
 			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
 					  addr, next, floor, ceiling);
 		}
-	} while (pgd++, addr = next, addr != end);
+	} while (addr = next, addr != end);
 }
 
 struct page *
@@ -466,17 +690,35 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
 					unsigned long flags)
 {
+#ifdef CONFIG_MM_SLICES
 	struct hstate *hstate = hstate_file(file);
 	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 
 	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
+#else
+	return get_unmapped_area(file, addr, len, pgoff, flags);
+#endif
 }
 
 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 {
+#ifdef CONFIG_MM_SLICES
 	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 
 	return 1UL << mmu_psize_to_shift(psize);
+#else
+	if (!is_vm_hugetlb_page(vma))
+		return PAGE_SIZE;
+
+	return huge_page_size(hstate_vma(vma));
+#endif
+}
+
+static inline bool is_power_of_4(unsigned long x)
+{
+	if (is_power_of_2(x))
+		return (__ilog2(x) % 2) ? false : true;
+	return false;
 }
 
 static int __init add_huge_page_size(unsigned long long size)
@@ -486,9 +728,14 @@ static int __init add_huge_page_size(unsigned long long size)
 
 	/* Check that it is a page size supported by the hardware and
 	 * that it fits within pagetable and slice limits. */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if ((size < PAGE_SIZE) || !is_power_of_4(size))
+		return -EINVAL;
+#else
 	if (!is_power_of_2(size)
 	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
 		return -EINVAL;
+#endif
 
 	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
 		return -EINVAL;
@@ -525,6 +772,46 @@ static int __init hugepage_setup_sz(char *str)
 }
 __setup("hugepagesz=", hugepage_setup_sz);
 
+#ifdef CONFIG_FSL_BOOKE
+struct kmem_cache *hugepte_cache;
+static int __init hugetlbpage_init(void)
+{
+	int psize;
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		unsigned shift;
+
+		if (!mmu_psize_defs[psize].shift)
+			continue;
+
+		shift = mmu_psize_to_shift(psize);
+
+		/* Don't treat normal page sizes as huge... */
+		if (shift != PAGE_SHIFT)
+			if (add_huge_page_size(1ULL << shift) < 0)
+				continue;
+	}
+
+	/*
+	 * Create a kmem cache for hugeptes.  The bottom bits in the pte have
+	 * size information encoded in them, so align them to allow this
+	 */
+	hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
+					  HUGEPD_SHIFT_MASK + 1, 0, NULL);
+	if (hugepte_cache == NULL)
+		panic("%s: Unable to create kmem cache for hugeptes\n",
+		      __func__);
+
+	/* Default hpage size = 4M */
+	if (mmu_psize_defs[MMU_PAGE_4M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
+	else
+		panic("%s: Unable to set default huge page size\n", __func__);
+
+
+	return 0;
+}
+#else
 static int __init hugetlbpage_init(void)
 {
 	int psize;
@@ -567,15 +854,23 @@ static int __init hugetlbpage_init(void)
 
 	return 0;
 }
-
+#endif
 module_init(hugetlbpage_init);
 
 void flush_dcache_icache_hugepage(struct page *page)
 {
 	int i;
+	void *start;
 
 	BUG_ON(!PageCompound(page));
 
-	for (i = 0; i < (1UL << compound_order(page)); i++)
-		__flush_dcache_icache(page_address(page+i));
+	for (i = 0; i < (1UL << compound_order(page)); i++) {
+		if (!PageHighMem(page)) {
+			__flush_dcache_icache(page_address(page+i));
+		} else {
+			start = kmap_atomic(page+i, KM_PPC_SYNC_ICACHE);
+			__flush_dcache_icache(start);
+			kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+		}
+	}
 }