author		Tejun Heo <tj@kernel.org>	2009-07-03 19:10:59 -0400
committer	Tejun Heo <tj@kernel.org>	2009-07-03 19:10:59 -0400
commit		8c4bfc6e8801616ab2e01c38140b2159b388d2ff (patch)
tree		e29e8bbfae362362554b870371a6187b41f92d82
parent		8f05a6a65d944f2fed4eb384fb58aa8c8e5a9bab (diff)
x86,percpu: generalize lpage first chunk allocator
Generalize and move x86 setup_pcpu_lpage() into
pcpu_lpage_first_chunk(). setup_pcpu_lpage() is now a simple wrapper
around the generalized version. Other than taking size parameters and
using arch-supplied callbacks to allocate, free and map memory,
pcpu_lpage_first_chunk() is identical to the original implementation.
This simplifies arch code and will help convert more archs to the
dynamic percpu allocator.
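
For reference, the arch side now reduces to a single call that passes the
arch's large page size plus allocate/free/map callbacks. A minimal sketch
taken from the new x86 wrapper in this patch (the !chosen auto-sizing check
is elided; all names appear in the diff below):

static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
{
	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;

	/* PMD_SIZE is the x86 large page size; callbacks are arch-supplied */
	return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
				      reserve - PERCPU_FIRST_CHUNK_RESERVE,
				      PMD_SIZE,
				      pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
}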
While at it, factor out pcpu_calc_fc_sizes() which is common to
pcpu_embed_first_chunk() and pcpu_lpage_first_chunk().
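
pcpu_calc_fc_sizes() page-aligns static + reserved + dynamic and, unless the
caller fixed dyn_size at 0, folds the alignment slack back into the dynamic
area. A rough worked example with made-up sizes, assuming 4k pages (the
helper itself appears in the mm/percpu.c hunk below):

	ssize_t dyn_size = 20480;	/* hypothetical dynamic reserve */
	size_t size_sum = pcpu_calc_fc_sizes(45000, 8192, &dyn_size);
	/* 45000 + 8192 + 20480 = 73672, aligned up to 73728 (18 pages);
	   dyn_size becomes 73728 - 45000 - 8192 = 20536 */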
[ Impact: code reorganization and generalization ]
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
-rw-r--r--	arch/x86/include/asm/percpu.h	|   9
-rw-r--r--	arch/x86/kernel/setup_percpu.c	| 169
-rw-r--r--	arch/x86/mm/pageattr.c		|   1
-rw-r--r--	include/linux/percpu.h		|  27
-rw-r--r--	mm/percpu.c			| 209
5 files changed, 244 insertions, 171 deletions
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 103f1ddb0d85..a18c038a3079 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -156,15 +156,6 @@ do { \
 /* We can use this directly for local CPU (faster). */
 DECLARE_PER_CPU(unsigned long, this_cpu_off);
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-void *pcpu_lpage_remapped(void *kaddr);
-#else
-static inline void *pcpu_lpage_remapped(void *kaddr)
-{
-	return NULL;
-}
-#endif
-
 #endif /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ab896b31e80b..4f2e0ac9130b 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -137,44 +137,21 @@ static void __init pcpu_fc_free(void *ptr, size_t size)
 }
 
 /*
- * Large page remap allocator
- *
- * This allocator uses PMD page as unit. A PMD page is allocated for
- * each cpu and each is remapped into vmalloc area using PMD mapping.
- * As PMD page is quite large, only part of it is used for the first
- * chunk. Unused part is returned to the bootmem allocator.
- *
- * So, the PMD pages are mapped twice - once to the physical mapping
- * and to the vmalloc area for the first percpu chunk. The double
- * mapping does add one more PMD TLB entry pressure but still is much
- * better than only using 4k mappings while still being NUMA friendly.
+ * Large page remapping allocator
  */
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-struct pcpul_ent {
-	unsigned int cpu;
-	void *ptr;
-};
-
-static size_t pcpul_size;
-static struct pcpul_ent *pcpul_map;
-static struct vm_struct pcpul_vm;
-
-static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
+static void __init pcpul_map(void *ptr, size_t size, void *addr)
 {
-	size_t off = (size_t)pageno << PAGE_SHIFT;
+	pmd_t *pmd, pmd_v;
 
-	if (off >= pcpul_size)
-		return NULL;
-
-	return virt_to_page(pcpul_map[cpu].ptr + off);
+	pmd = populate_extra_pmd((unsigned long)addr);
+	pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE);
+	set_pmd(pmd, pmd_v);
 }
 
 static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 {
-	size_t map_size, dyn_size;
-	unsigned int cpu;
-	int i, j;
-	ssize_t ret;
+	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
 
 	if (!chosen) {
 		size_t vm_size = VMALLOC_END - VMALLOC_START;
@@ -198,134 +175,10 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 		return -EINVAL;
 	}
 
-	/*
-	 * Currently supports only single page. Supporting multiple
-	 * pages won't be too difficult if it ever becomes necessary.
-	 */
-	pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
-			       PERCPU_DYNAMIC_RESERVE);
-	if (pcpul_size > PMD_SIZE) {
-		pr_warning("PERCPU: static data is larger than large page, "
-			   "can't use large page\n");
-		return -EINVAL;
-	}
-	dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
-
-	/* allocate pointer array and alloc large pages */
-	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
-	pcpul_map = alloc_bootmem(map_size);
-
-	for_each_possible_cpu(cpu) {
-		pcpul_map[cpu].cpu = cpu;
-		pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
-							PMD_SIZE);
-		if (!pcpul_map[cpu].ptr) {
-			pr_warning("PERCPU: failed to allocate large page "
-				   "for cpu%u\n", cpu);
-			goto enomem;
-		}
-
-		/*
-		 * Only use pcpul_size bytes and give back the rest.
-		 *
-		 * Ingo: The 2MB up-rounding bootmem is needed to make
-		 * sure the partial 2MB page is still fully RAM - it's
-		 * not well-specified to have a PAT-incompatible area
-		 * (unmapped RAM, device memory, etc.) in that hole.
-		 */
-		free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
-			     PMD_SIZE - pcpul_size);
-
-		memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
-	}
-
-	/* allocate address and map */
-	pcpul_vm.flags = VM_ALLOC;
-	pcpul_vm.size = num_possible_cpus() * PMD_SIZE;
-	vm_area_register_early(&pcpul_vm, PMD_SIZE);
-
-	for_each_possible_cpu(cpu) {
-		pmd_t *pmd, pmd_v;
-
-		pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
-					 cpu * PMD_SIZE);
-		pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
-				PAGE_KERNEL_LARGE);
-		set_pmd(pmd, pmd_v);
-	}
-
-	/* we're ready, commit */
-	pr_info("PERCPU: Remapped at %p with large pages, static data "
-		"%zu bytes\n", pcpul_vm.addr, static_size);
-
-	ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
-				     PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
-				     PMD_SIZE, pcpul_vm.addr, NULL);
-
-	/* sort pcpul_map array for pcpu_lpage_remapped() */
-	for (i = 0; i < num_possible_cpus() - 1; i++)
-		for (j = i + 1; j < num_possible_cpus(); j++)
-			if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
-				struct pcpul_ent tmp = pcpul_map[i];
-				pcpul_map[i] = pcpul_map[j];
-				pcpul_map[j] = tmp;
-			}
-
-	return ret;
-
-enomem:
-	for_each_possible_cpu(cpu)
-		if (pcpul_map[cpu].ptr)
-			free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
-	free_bootmem(__pa(pcpul_map), map_size);
-	return -ENOMEM;
-}
-
-/**
- * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
- * @kaddr: the kernel address in question
- *
- * Determine whether @kaddr falls in the pcpul recycled area. This is
- * used by pageattr to detect VM aliases and break up the pcpu PMD
- * mapping such that the same physical page is not mapped under
- * different attributes.
- *
- * The recycled area is always at the tail of a partially used PMD
- * page.
- *
- * RETURNS:
- * Address of corresponding remapped pcpu address if match is found;
- * otherwise, NULL.
- */
-void *pcpu_lpage_remapped(void *kaddr)
-{
-	void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
-	unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
-	int left = 0, right = num_possible_cpus() - 1;
-	int pos;
-
-	/* pcpul in use at all? */
-	if (!pcpul_map)
-		return NULL;
-
-	/* okay, perform binary search */
-	while (left <= right) {
-		pos = (left + right) / 2;
-
-		if (pcpul_map[pos].ptr < pmd_addr)
-			left = pos + 1;
-		else if (pcpul_map[pos].ptr > pmd_addr)
-			right = pos - 1;
-		else {
-			/* it shouldn't be in the area for the first chunk */
-			WARN_ON(offset < pcpul_size);
-
-			return pcpul_vm.addr +
-			       pcpul_map[pos].cpu * PMD_SIZE + offset;
-		}
-	}
-
-	return NULL;
-}
+	return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+				      reserve - PERCPU_FIRST_CHUNK_RESERVE,
+				      PMD_SIZE,
+				      pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
 }
 #else
 static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1b734d7a8966..c106f7852424 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -12,6 +12,7 @@
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
 #include <linux/pfn.h>
+#include <linux/percpu.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 41b5bfab4195..9f6bfd7d4b92 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -62,6 +62,7 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
 typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
+typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				size_t static_size, size_t reserved_size,
@@ -79,6 +80,32 @@ extern ssize_t __init pcpu_4k_first_chunk(
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
 
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+extern ssize_t __init pcpu_lpage_first_chunk(
+				size_t static_size, size_t reserved_size,
+				ssize_t dyn_size, size_t lpage_size,
+				pcpu_fc_alloc_fn_t alloc_fn,
+				pcpu_fc_free_fn_t free_fn,
+				pcpu_fc_map_fn_t map_fn);
+
+extern void *pcpu_lpage_remapped(void *kaddr);
+#else
+static inline ssize_t __init pcpu_lpage_first_chunk(
+				size_t static_size, size_t reserved_size,
+				ssize_t dyn_size, size_t lpage_size,
+				pcpu_fc_alloc_fn_t alloc_fn,
+				pcpu_fc_free_fn_t free_fn,
+				pcpu_fc_map_fn_t map_fn)
+{
+	return -EINVAL;
+}
+
+static inline void *pcpu_lpage_remapped(void *kaddr)
+{
+	return NULL;
+}
+#endif
+
 /*
  * Use this to get to a cpu's version of the per-cpu object
  * dynamically allocated. Non-atomic access to the current CPU's
diff --git a/mm/percpu.c b/mm/percpu.c
index f3fe7bc7378f..17db527ee2e2 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1190,6 +1190,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	return pcpu_unit_size;
 }
 
+static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
+				 ssize_t *dyn_sizep)
+{
+	size_t size_sum;
+
+	size_sum = PFN_ALIGN(static_size + reserved_size +
+			     (*dyn_sizep >= 0 ? *dyn_sizep : 0));
+	if (*dyn_sizep != 0)
+		*dyn_sizep = size_sum - static_size - reserved_size;
+
+	return size_sum;
+}
+
 /*
  * Embedding first chunk setup helper.
  */
@@ -1241,10 +1254,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 	unsigned int cpu;
 
 	/* determine parameters and allocate */
-	pcpue_size = PFN_ALIGN(static_size + reserved_size +
-			       (dyn_size >= 0 ? dyn_size : 0));
-	if (dyn_size != 0)
-		dyn_size = pcpue_size - static_size - reserved_size;
+	pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
 
 	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
 	chunk_size = pcpue_unit_size * num_possible_cpus();
@@ -1391,6 +1401,197 @@ out_free_ar:
 }
 
 /*
+ * Large page remapping first chunk setup helper
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+struct pcpul_ent {
+	unsigned int cpu;
+	void *ptr;
+};
+
+static size_t pcpul_size;
+static size_t pcpul_unit_size;
+static struct pcpul_ent *pcpul_map;
+static struct vm_struct pcpul_vm;
+
+static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
+{
+	size_t off = (size_t)pageno << PAGE_SHIFT;
+
+	if (off >= pcpul_size)
+		return NULL;
+
+	return virt_to_page(pcpul_map[cpu].ptr + off);
+}
+
+/**
+ * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @lpage_size: the size of a large page
+ * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
+ * @free_fn: function to free percpu memory, @size <= lpage_size
+ * @map_fn: function to map percpu lpage, always called with lpage_size
+ *
+ * This allocator uses large page as unit. A large page is allocated
+ * for each cpu and each is remapped into vmalloc area using large
+ * page mapping. As large page can be quite large, only part of it is
+ * used for the first chunk. Unused part is returned to the bootmem
+ * allocator.
+ *
+ * So, the large pages are mapped twice - once to the physical mapping
+ * and to the vmalloc area for the first percpu chunk. The double
+ * mapping does add one more large TLB entry pressure but still is
+ * much better than only using 4k mappings while still being NUMA
+ * friendly.
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access on success, -errno on failure.
+ */
+ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
+				      ssize_t dyn_size, size_t lpage_size,
+				      pcpu_fc_alloc_fn_t alloc_fn,
+				      pcpu_fc_free_fn_t free_fn,
+				      pcpu_fc_map_fn_t map_fn)
+{
+	size_t size_sum;
+	size_t map_size;
+	unsigned int cpu;
+	int i, j;
+	ssize_t ret;
+
+	/*
+	 * Currently supports only single page. Supporting multiple
+	 * pages won't be too difficult if it ever becomes necessary.
+	 */
+	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+
+	pcpul_unit_size = lpage_size;
+	pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+	if (pcpul_size > pcpul_unit_size) {
+		pr_warning("PERCPU: static data is larger than large page, "
+			   "can't use large page\n");
+		return -EINVAL;
+	}
+
+	/* allocate pointer array and alloc large pages */
+	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
+	pcpul_map = alloc_bootmem(map_size);
+
+	for_each_possible_cpu(cpu) {
+		void *ptr;
+
+		ptr = alloc_fn(cpu, lpage_size);
+		if (!ptr) {
+			pr_warning("PERCPU: failed to allocate large page "
+				   "for cpu%u\n", cpu);
+			goto enomem;
+		}
+
+		/*
+		 * Only use pcpul_size bytes and give back the rest.
+		 *
+		 * Ingo: The lpage_size up-rounding bootmem is needed
+		 * to make sure the partial lpage is still fully RAM -
+		 * it's not well-specified to have a incompatible area
+		 * (unmapped RAM, device memory, etc.) in that hole.
+		 */
+		free_fn(ptr + pcpul_size, lpage_size - pcpul_size);
+
+		pcpul_map[cpu].cpu = cpu;
+		pcpul_map[cpu].ptr = ptr;
+
+		memcpy(ptr, __per_cpu_load, static_size);
+	}
+
+	/* allocate address and map */
+	pcpul_vm.flags = VM_ALLOC;
+	pcpul_vm.size = num_possible_cpus() * pcpul_unit_size;
+	vm_area_register_early(&pcpul_vm, pcpul_unit_size);
+
+	for_each_possible_cpu(cpu)
+		map_fn(pcpul_map[cpu].ptr, pcpul_unit_size,
+		       pcpul_vm.addr + cpu * pcpul_unit_size);
+
+	/* we're ready, commit */
+	pr_info("PERCPU: Remapped at %p with large pages, static data "
+		"%zu bytes\n", pcpul_vm.addr, static_size);
+
+	ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
+				     reserved_size, dyn_size, pcpul_unit_size,
+				     pcpul_vm.addr, NULL);
+
+	/* sort pcpul_map array for pcpu_lpage_remapped() */
+	for (i = 0; i < num_possible_cpus() - 1; i++)
+		for (j = i + 1; j < num_possible_cpus(); j++)
+			if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
+				struct pcpul_ent tmp = pcpul_map[i];
+				pcpul_map[i] = pcpul_map[j];
+				pcpul_map[j] = tmp;
+			}
+
+	return ret;
+
+enomem:
+	for_each_possible_cpu(cpu)
+		if (pcpul_map[cpu].ptr)
+			free_fn(pcpul_map[cpu].ptr, pcpul_size);
+	free_bootmem(__pa(pcpul_map), map_size);
+	return -ENOMEM;
+}
+
+/**
+ * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
+ * @kaddr: the kernel address in question
+ *
+ * Determine whether @kaddr falls in the pcpul recycled area. This is
+ * used by pageattr to detect VM aliases and break up the pcpu large
+ * page mapping such that the same physical page is not mapped under
+ * different attributes.
+ *
+ * The recycled area is always at the tail of a partially used large
+ * page.
+ *
+ * RETURNS:
+ * Address of corresponding remapped pcpu address if match is found;
+ * otherwise, NULL.
+ */
+void *pcpu_lpage_remapped(void *kaddr)
+{
+	unsigned long unit_mask = pcpul_unit_size - 1;
+	void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask);
+	unsigned long offset = (unsigned long)kaddr & unit_mask;
+	int left = 0, right = num_possible_cpus() - 1;
+	int pos;
+
+	/* pcpul in use at all? */
+	if (!pcpul_map)
+		return NULL;
+
+	/* okay, perform binary search */
+	while (left <= right) {
+		pos = (left + right) / 2;
+
+		if (pcpul_map[pos].ptr < lpage_addr)
+			left = pos + 1;
+		else if (pcpul_map[pos].ptr > lpage_addr)
+			right = pos - 1;
+		else {
+			/* it shouldn't be in the area for the first chunk */
+			WARN_ON(offset < pcpul_size);
+
+			return pcpul_vm.addr +
+			       pcpul_map[pos].cpu * pcpul_unit_size + offset;
+		}
+	}
+
+	return NULL;
+}
+#endif
+
+/*
  * Generic percpu area setup.
  *
  * The embedding helper is used because its behavior closely resembles