From e6efaa025384f86a18814a6b9f4e5d54484ab9ff Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 18 Jun 2009 19:33:57 +0800 Subject: crypto: aes-ni - Fix cbc mode IV saving Original implementation of aesni_cbc_dec do not save IV if input length % 4 == 0. This will make decryption of next block failed. Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index caba99601703..eb0566e83319 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -845,7 +845,7 @@ ENTRY(aesni_cbc_enc) */ ENTRY(aesni_cbc_dec) cmp $16, LEN - jb .Lcbc_dec_ret + jb .Lcbc_dec_just_ret mov 480(KEYP), KLEN add $240, KEYP movups (IVP), IV @@ -891,6 +891,7 @@ ENTRY(aesni_cbc_dec) add $16, OUTP cmp $16, LEN jge .Lcbc_dec_loop1 - movups IV, (IVP) .Lcbc_dec_ret: + movups IV, (IVP) +.Lcbc_dec_just_ret: ret -- cgit v1.2.2 From 9251b64fb2d2326d28f0e0646a9e4fb8bbb51d8e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 18 Jun 2009 19:41:27 +0800 Subject: crypto: aes-ni - Do not sleep when using the FPU Because AES-NI instructions will touch XMM state, corresponding code must be enclosed within kernel_fpu_begin/end, which used preempt_disable/enable. So sleep should be prevented between kernel_fpu_begin/end. Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 4e663398f77f..c580c5ec1cad 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -198,6 +198,7 @@ static int ecb_encrypt(struct blkcipher_desc *desc, blkcipher_walk_init(&walk, dst, src, nbytes); err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { @@ -221,6 +222,7 @@ static int ecb_decrypt(struct blkcipher_desc *desc, blkcipher_walk_init(&walk, dst, src, nbytes); err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { @@ -266,6 +268,7 @@ static int cbc_encrypt(struct blkcipher_desc *desc, blkcipher_walk_init(&walk, dst, src, nbytes); err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { @@ -289,6 +292,7 @@ static int cbc_decrypt(struct blkcipher_desc *desc, blkcipher_walk_init(&walk, dst, src, nbytes); err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { -- cgit v1.2.2 From b6f34d44cb341ad32f08717d1a2c418e6053a031 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 18 Jun 2009 19:44:01 +0800 Subject: crypto: aes-ni - Remove CRYPTO_TFM_REQ_MAY_SLEEP from fpu template kernel_fpu_begin/end used preempt_disable/enable, so sleep should be prevented between kernel_fpu_begin/end. Signed-off-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/fpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index 5f9781a3815f..daef6cd2b45d 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -48,7 +48,7 @@ static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in, struct blkcipher_desc desc = { .tfm = child, .info = desc_in->info, - .flags = desc_in->flags, + .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, }; kernel_fpu_begin(); @@ -67,7 +67,7 @@ static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in, struct blkcipher_desc desc = { .tfm = child, .info = desc_in->info, - .flags = desc_in->flags, + .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, }; kernel_fpu_begin(); -- cgit v1.2.2 From d06063cc221fdefcab86589e79ddfdb7c0e14b63 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 10 Apr 2009 09:01:23 -0700 Subject: Move FAULT_FLAG_xyz into handle_mm_fault() callers This allows the callers to now pass down the full set of FAULT_FLAG_xyz flags to handle_mm_fault(). All callers have been (mechanically) converted to the new calling convention, there's almost certainly room for architectures to clean up their code and then add FAULT_FLAG_RETRY when that support is added. Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 2 +- arch/arm/mm/fault.c | 2 +- arch/avr32/mm/fault.c | 2 +- arch/cris/mm/fault.c | 2 +- arch/frv/mm/fault.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m32r/mm/fault.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/microblaze/mm/fault.c | 2 +- arch/mips/mm/fault.c | 2 +- arch/mn10300/mm/fault.c | 2 +- arch/parisc/mm/fault.c | 2 +- arch/powerpc/mm/fault.c | 2 +- arch/powerpc/platforms/cell/spu_fault.c | 2 +- arch/s390/lib/uaccess_pt.c | 2 +- arch/s390/mm/fault.c | 2 +- arch/sh/mm/fault_32.c | 2 +- arch/sh/mm/tlbflush_64.c | 2 +- arch/sparc/mm/fault_32.c | 4 ++-- arch/sparc/mm/fault_64.c | 2 +- arch/um/kernel/trap.c | 2 +- arch/x86/mm/fault.c | 2 +- arch/xtensa/mm/fault.c | 2 +- 23 files changed, 24 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 4829f96585b1..00a31deaa96e 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -146,7 +146,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, /* If for any reason at all we couldn't handle the fault, make sure we exit gracefully rather than endlessly redo the fault. */ - fault = handle_mm_fault(mm, vma, address, cause > 0); + fault = handle_mm_fault(mm, vma, address, cause > 0 ? FAULT_FLAG_WRITE : 0); up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 0455557a2899..6fdcbb709827 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -208,7 +208,7 @@ good_area: * than endlessly redo the fault. */ survive: - fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, fsr & (1 << 11)); + fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, (fsr & (1 << 11)) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index 62d4abbaa654..b61d86d3debf 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -133,7 +133,7 @@ good_area: * fault. */ survive: - fault = handle_mm_fault(mm, vma, address, writeaccess); + fault = handle_mm_fault(mm, vma, address, writeaccess ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index c4c76db90f9c..f925115e3250 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -163,7 +163,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, * the fault. */ - fault = handle_mm_fault(mm, vma, address, writeaccess & 1); + fault = handle_mm_fault(mm, vma, address, (writeaccess & 1) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 05093d41d98e..30f5d100a81c 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -163,7 +163,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, ear0, write); + fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 23088bed111e..19261a99e623 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -154,7 +154,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re * sure we exit gracefully rather than endlessly redo the * fault. */ - fault = handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0); + fault = handle_mm_fault(mm, vma, address, (mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { /* * We ran out of memory, or some other thing happened diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 4a71df4c1b30..7274b47f4c22 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -196,7 +196,7 @@ survive: */ addr = (address & PAGE_MASK); set_thread_fault_code(error_code); - fault = handle_mm_fault(mm, vma, addr, write); + fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index f493f03231d5..d0e35cf99fc6 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -155,7 +155,7 @@ good_area: */ survive: - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); #ifdef DEBUG printk("handle_mm_fault returns %d\n",fault); #endif diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 5e67cd1fab40..956607a63f4c 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -232,7 +232,7 @@ good_area: * the fault. */ survive: - fault = handle_mm_fault(mm, vma, address, is_write); + fault = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 55767ad9f00e..6751ce9ede9e 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -102,7 +102,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 33cf25025dac..a62e1e138bc1 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -258,7 +258,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 92c7fa4ecc3f..bfb6dd6ab380 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -202,7 +202,7 @@ good_area: * fault. */ - fault = handle_mm_fault(mm, vma, address, (acc_type & VM_WRITE) != 0); + fault = handle_mm_fault(mm, vma, address, (acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { /* * We hit a shared mapping outside of the file, or some diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5beffc8f481e..830bef0a1131 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -302,7 +302,7 @@ good_area: * the fault. */ survive: - ret = handle_mm_fault(mm, vma, address, is_write); + ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(ret & VM_FAULT_ERROR)) { if (ret & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/powerpc/platforms/cell/spu_fault.c b/arch/powerpc/platforms/cell/spu_fault.c index 95d8dadf2d87..d06ba87f1a19 100644 --- a/arch/powerpc/platforms/cell/spu_fault.c +++ b/arch/powerpc/platforms/cell/spu_fault.c @@ -70,7 +70,7 @@ int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea, } ret = 0; - *flt = handle_mm_fault(mm, vma, ea, is_write); + *flt = handle_mm_fault(mm, vma, ea, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(*flt & VM_FAULT_ERROR)) { if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c index b0b84c35b0ad..cb5d59eab0ee 100644 --- a/arch/s390/lib/uaccess_pt.c +++ b/arch/s390/lib/uaccess_pt.c @@ -66,7 +66,7 @@ static int __handle_fault(struct mm_struct *mm, unsigned long address, } survive: - fault = handle_mm_fault(mm, vma, address, write_access); + fault = handle_mm_fault(mm, vma, address, write_access ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 220a152c836c..74eb26bf1970 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -352,7 +352,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { up_read(&mm->mmap_sem); diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c index 2c50f80fc332..cc8ddbdf3d7a 100644 --- a/arch/sh/mm/fault_32.c +++ b/arch/sh/mm/fault_32.c @@ -133,7 +133,7 @@ good_area: * the fault. */ survive: - fault = handle_mm_fault(mm, vma, address, writeaccess); + fault = handle_mm_fault(mm, vma, address, writeaccess ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c index 7876997ba19a..fcbb6e135cef 100644 --- a/arch/sh/mm/tlbflush_64.c +++ b/arch/sh/mm/tlbflush_64.c @@ -187,7 +187,7 @@ good_area: * the fault. */ survive: - fault = handle_mm_fault(mm, vma, address, writeaccess); + fault = handle_mm_fault(mm, vma, address, writeaccess ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 12e447fc8542..a5e30c642ee3 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -241,7 +241,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; @@ -484,7 +484,7 @@ good_area: if(!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } - switch (handle_mm_fault(mm, vma, address, write)) { + switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) { case VM_FAULT_SIGBUS: case VM_FAULT_OOM: goto do_sigbus; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 4ab8993b0863..e5620b27c8bf 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -398,7 +398,7 @@ good_area: goto bad_area; } - fault = handle_mm_fault(mm, vma, address, (fault_code & FAULT_CODE_WRITE)); + fault = handle_mm_fault(mm, vma, address, (fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 7384d8accfe7..637c6505dc00 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -65,7 +65,7 @@ good_area: do { int fault; - fault = handle_mm_fault(mm, vma, address, is_write); + fault = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { goto out_of_memory; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c403526d5d15..78a5fff857be 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1113,7 +1113,7 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index bdd860d93f72..bc0733359a88 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -106,7 +106,7 @@ good_area: * the fault. */ survive: - fault = handle_mm_fault(mm, vma, address, is_write); + fault = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; -- cgit v1.2.2 From c5806df9232d2a7f554b4839b57cac2e664fc256 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: fix duplicate free in setup_pcpu_remap() failure path In the failure path, setup_pcpu_remap() tries to free the area which has already been freed to make holes in the large page. Fix it. [ Impact: fix duplicate free in failure path ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9c3f0823e6aa..dfbc7e6c64d4 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -228,7 +228,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) enomem: for_each_possible_cpu(cpu) if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); + free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size); ret = -ENOMEM; out_free_ar: free_bootmem(__pa(pcpur_ptrs), ptrs_size); -- cgit v1.2.2 From 97c9bf0618cd40b05b4859c1f8a90d8ad97fefb2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: rename remap percpu first chunk allocator to lpage The "remap" allocator remaps large pages to build the first chunk; however, the name isn't very good because 4k allocator remaps too and the whole point of the remap allocator is using large page mapping. The allocator will be generalized and exported outside of x86, rename it to lpage before that happens. percpu_alloc kernel parameter is updated to accept both "remap" and "lpage" for lpage allocator. [ Impact: code cleanup, kernel parameter argument updated ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 50 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index dfbc7e6c64d4..8794c0c94d2c 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -124,7 +124,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } /* - * Remap allocator + * Large page remap allocator * * This allocator uses PMD page as unit. A PMD page is allocated for * each cpu and each is remapped into vmalloc area using PMD mapping. @@ -137,20 +137,20 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, * better than only using 4k mappings while still being NUMA friendly. */ #ifdef CONFIG_NEED_MULTIPLE_NODES -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; +static size_t pcpul_size __initdata; +static void **pcpul_ptrs __initdata; -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) { size_t off = (size_t)pageno << PAGE_SHIFT; - if (off >= pcpur_size) + if (off >= pcpul_size) return NULL; - return virt_to_page(pcpur_ptrs[cpu] + off); + return virt_to_page(pcpul_ptrs[cpu] + off); } -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size) { static struct vm_struct vm; size_t ptrs_size, dyn_size; @@ -170,36 +170,36 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. */ - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE); - if (pcpur_size > PMD_SIZE) { + if (pcpul_size > PMD_SIZE) { pr_warning("PERCPU: static data is larger than large page, " "can't use large page\n"); return -EINVAL; } - dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; + dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_ptrs[0])); + pcpul_ptrs = alloc_bootmem(ptrs_size); for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpur_ptrs[cpu]) + pcpul_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); + if (!pcpul_ptrs[cpu]) goto enomem; /* - * Only use pcpur_size bytes and give back the rest. + * Only use pcpul_size bytes and give back the rest. * * Ingo: The 2MB up-rounding bootmem is needed to make * sure the partial 2MB page is still fully RAM - it's * not well-specified to have a PAT-incompatible area * (unmapped RAM, device memory, etc.) in that hole. */ - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PMD_SIZE - pcpur_size); + free_bootmem(__pa(pcpul_ptrs[cpu] + pcpul_size), + PMD_SIZE - pcpul_size); - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + memcpy(pcpul_ptrs[cpu], __per_cpu_load, static_size); } /* allocate address and map */ @@ -212,7 +212,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) pmd = populate_extra_pmd((unsigned long)vm.addr + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), + set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpul_ptrs[cpu])), PAGE_KERNEL_LARGE)); } @@ -220,22 +220,22 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, PMD_SIZE, vm.addr, NULL); goto out_free_ar; enomem: for_each_possible_cpu(cpu) - if (pcpur_ptrs[cpu]) - free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size); + if (pcpul_ptrs[cpu]) + free_bootmem(__pa(pcpul_ptrs[cpu]), pcpul_size); ret = -ENOMEM; out_free_ar: - free_bootmem(__pa(pcpur_ptrs), ptrs_size); + free_bootmem(__pa(pcpul_ptrs), ptrs_size); return ret; } #else -static ssize_t __init setup_pcpu_remap(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size) { return -EINVAL; } @@ -367,7 +367,7 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = setup_pcpu_remap(static_size); + ret = setup_pcpu_lpage(static_size); if (ret < 0) ret = setup_pcpu_embed(static_size); if (ret < 0) -- cgit v1.2.2 From 0ff2587fd54bd6f66bc6914ada4eb77a7e819a5b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: prepare setup_pcpu_lpage() for pageattr fix Make the following changes in preparation of coming pageattr updates. * Define and use array of struct pcpul_ent instead of array of pointers. The only difference is ->cpu field which is set but unused yet. * Rename variables according to the above change. * Rename local variable vm to pcpul_vm and move it out of the function. [ Impact: no functional difference ] Signed-off-by: Tejun Heo Cc: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 58 ++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8794c0c94d2c..7d38941e2b8c 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,8 +137,14 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, * better than only using 4k mappings while still being NUMA friendly. */ #ifdef CONFIG_NEED_MULTIPLE_NODES +struct pcpul_ent { + unsigned int cpu; + void *ptr; +}; + static size_t pcpul_size __initdata; -static void **pcpul_ptrs __initdata; +static struct pcpul_ent *pcpul_map __initdata; +static struct vm_struct pcpul_vm; static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) { @@ -147,13 +153,12 @@ static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) if (off >= pcpul_size) return NULL; - return virt_to_page(pcpul_ptrs[cpu] + off); + return virt_to_page(pcpul_map[cpu].ptr + off); } static ssize_t __init setup_pcpu_lpage(size_t static_size) { - static struct vm_struct vm; - size_t ptrs_size, dyn_size; + size_t map_size, dyn_size; unsigned int cpu; ssize_t ret; @@ -180,12 +185,14 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_ptrs[0])); - pcpul_ptrs = alloc_bootmem(ptrs_size); + map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); for_each_possible_cpu(cpu) { - pcpul_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpul_ptrs[cpu]) + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, + PMD_SIZE); + if (!pcpul_map[cpu].ptr) goto enomem; /* @@ -196,42 +203,43 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) * not well-specified to have a PAT-incompatible area * (unmapped RAM, device memory, etc.) in that hole. */ - free_bootmem(__pa(pcpul_ptrs[cpu] + pcpul_size), + free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), PMD_SIZE - pcpul_size); - memcpy(pcpul_ptrs[cpu], __per_cpu_load, static_size); + memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); } /* allocate address and map */ - vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&vm, PMD_SIZE); + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = num_possible_cpus() * PMD_SIZE; + vm_area_register_early(&pcpul_vm, PMD_SIZE); for_each_possible_cpu(cpu) { - pmd_t *pmd; + pmd_t *pmd, pmd_v; - pmd = populate_extra_pmd((unsigned long)vm.addr - + cpu * PMD_SIZE); - set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpul_ptrs[cpu])), - PAGE_KERNEL_LARGE)); + pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + + cpu * PMD_SIZE); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), + PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", vm.addr, static_size); + "%zu bytes\n", pcpul_vm.addr, static_size); ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, vm.addr, NULL); - goto out_free_ar; + PMD_SIZE, pcpul_vm.addr, NULL); + goto out_free_map; enomem: for_each_possible_cpu(cpu) - if (pcpul_ptrs[cpu]) - free_bootmem(__pa(pcpul_ptrs[cpu]), pcpul_size); + if (pcpul_map[cpu].ptr) + free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpul_ptrs), ptrs_size); +out_free_map: + free_bootmem(__pa(pcpul_map), map_size); return ret; } #else -- cgit v1.2.2 From 992f4c1c2c1583cef3296ec4bf5205843a9a5f3d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: reorganize cpa_process_alias() Reorganize cpa_process_alias() so that new alias condition can be added easily. Jan Beulich spotted problem in the original cleanup thread which incorrectly assumed the two existing conditions were mutially exclusive. [ Impact: code reorganization ] Signed-off-by: Tejun Heo Cc: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/mm/pageattr.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3cfe9ced8a4c..2ab058b0947d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -681,8 +681,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; - int ret = 0; - unsigned long temp_cpa_vaddr, vaddr; + unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); + unsigned long vaddr; + int ret; if (cpa->pfn >= max_pfn_mapped) return 0; @@ -706,42 +707,37 @@ static int cpa_process_alias(struct cpa_data *cpa) PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; - temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); - alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; } #ifdef CONFIG_X86_64 - if (ret) - return ret; /* - * No need to redo, when the primary call touched the high - * mapping already: - */ - if (within(vaddr, (unsigned long) _text, _brk_end)) - return 0; - - /* - * If the physical address is inside the kernel map, we need + * If the primary call didn't touch the high mapping already + * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ - if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) - return 0; - - alias_cpa = *cpa; - temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; - alias_cpa.vaddr = &temp_cpa_vaddr; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + if (!within(vaddr, (unsigned long)_text, _brk_end) && + within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { + unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + + __START_KERNEL_map - phys_base; + alias_cpa = *cpa; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - /* - * The high mapping range is imprecise, so ignore the return value. - */ - __change_page_attr_set_clr(&alias_cpa, 0); + /* + * The high mapping range is imprecise, so ignore the + * return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); + } #endif - return ret; + + return 0; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) -- cgit v1.2.2 From e59a1bb2fdfb745c685f5b40ffbed126331d3223 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: fix pageattr handling for lpage percpu allocator and re-enable it lpage allocator aliases a PMD page for each cpu and returns whatever is unused to the page allocator. When the pageattr of the recycled pages are changed, this makes the two aliases point to the overlapping regions with different attributes which isn't allowed and known to cause subtle data corruption in certain cases. This can be handled in simliar manner to the x86_64 highmap alias. pageattr code should detect if the target pages have PMD alias and split the PMD alias and synchronize the attributes. pcpur allocator is updated to keep the allocated PMD pages map sorted in ascending address order and provide pcpu_lpage_remapped() function which binary searches the array to determine whether the given address is aliased and if so to which address. pageattr is updated to use pcpu_lpage_remapped() to detect the PMD alias and split it up as necessary from cpa_process_alias(). Jan Beulich spotted the original problem and incorrect usage of vaddr instead of laddr for lookup. With this, lpage percpu allocator should work correctly. Re-enable it. [ Impact: fix subtle lpage pageattr bug and re-enable lpage ] Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/include/asm/percpu.h | 10 ++++++ arch/x86/kernel/setup_percpu.c | 72 ++++++++++++++++++++++++++++++++++++------ arch/x86/mm/pageattr.c | 21 +++++++++++- 3 files changed, 93 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 02ecb30982a3..103f1ddb0d85 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -42,6 +42,7 @@ #else /* ...!ASSEMBLY */ +#include #include #ifdef CONFIG_SMP @@ -155,6 +156,15 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); +#ifdef CONFIG_NEED_MULTIPLE_NODES +void *pcpu_lpage_remapped(void *kaddr); +#else +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7d38941e2b8c..bad2fd223114 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -142,8 +142,8 @@ struct pcpul_ent { void *ptr; }; -static size_t pcpul_size __initdata; -static struct pcpul_ent *pcpul_map __initdata; +static size_t pcpul_size; +static struct pcpul_ent *pcpul_map; static struct vm_struct pcpul_vm; static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) @@ -160,15 +160,14 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) { size_t map_size, dyn_size; unsigned int cpu; + int i, j; ssize_t ret; /* * If large page isn't supported, there's no benefit in doing * this. Also, on non-NUMA, embedding is better. - * - * NOTE: disabled for now. */ - if (true || !cpu_has_pse || !pcpu_need_numa()) + if (!cpu_has_pse || !pcpu_need_numa()) return -EINVAL; /* @@ -231,16 +230,71 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, PERCPU_FIRST_CHUNK_RESERVE, dyn_size, PMD_SIZE, pcpul_vm.addr, NULL); - goto out_free_map; + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < num_possible_cpus() - 1; i++) + for (j = i + 1; j < num_possible_cpus(); j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; enomem: for_each_possible_cpu(cpu) if (pcpul_map[cpu].ptr) free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - ret = -ENOMEM; -out_free_map: free_bootmem(__pa(pcpul_map), map_size); - return ret; + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu PMD + * mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used PMD + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); + unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; + int left = 0, right = num_possible_cpus() - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < pmd_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > pmd_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * PMD_SIZE + offset; + } + } + + return NULL; } #else static ssize_t __init setup_pcpu_lpage(size_t static_size) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2ab058b0947d..1b734d7a8966 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -682,7 +683,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); - unsigned long vaddr; + unsigned long vaddr, remapped; int ret; if (cpa->pfn >= max_pfn_mapped) @@ -737,6 +738,24 @@ static int cpa_process_alias(struct cpa_data *cpa) } #endif + /* + * If the PMD page was partially used for per-cpu remapping, + * the recycled area needs to be split and modified. Because + * the area is always proper subset of a PMD page + * cpa->numpages is guaranteed to be 1 for these areas, so + * there's no need to loop over and check for further remaps. + */ + remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); + if (remapped) { + WARN_ON(cpa->numpages > 1); + alias_cpa = *cpa; + alias_cpa.vaddr = &remapped; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; + } + return 0; } -- cgit v1.2.2 From fa8a7094ba1679b4b9b443e0ac9f5e046c79ee8d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: implement percpu_alloc kernel parameter According to Andi, it isn't clear whether lpage allocator is worth the trouble as there are many processors where PMD TLB is far scarcer than PTE TLB. The advantage or disadvantage probably depends on the actual size of percpu area and specific processor. As performance degradation due to TLB pressure tends to be highly workload specific and subtle, it is difficult to decide which way to go without more data. This patch implements percpu_alloc kernel parameter to allow selecting which first chunk allocator to use to ease debugging and testing. While at it, make sure all the failure paths report why something failed to help determining why certain allocator isn't working. Also, kill the "Great future plan" comment which had already been realized quite some time ago. [ Impact: allow explicit percpu first chunk allocator selection ] Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 69 ++++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index bad2fd223114..165ebd5ba83b 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -156,20 +156,23 @@ static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) return virt_to_page(pcpul_map[cpu].ptr + off); } -static ssize_t __init setup_pcpu_lpage(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { size_t map_size, dyn_size; unsigned int cpu; int i, j; ssize_t ret; - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, on non-NUMA, embedding is better. - */ - if (!cpu_has_pse || !pcpu_need_numa()) + /* on non-NUMA, embedding is better */ + if (!chosen && !pcpu_need_numa()) return -EINVAL; + /* need PSE */ + if (!cpu_has_pse) { + pr_warning("PERCPU: lpage allocator requires PSE\n"); + return -EINVAL; + } + /* * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. @@ -191,8 +194,11 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) pcpul_map[cpu].cpu = cpu; pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); - if (!pcpul_map[cpu].ptr) + if (!pcpul_map[cpu].ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); goto enomem; + } /* * Only use pcpul_size bytes and give back the rest. @@ -297,7 +303,7 @@ void *pcpu_lpage_remapped(void *kaddr) return NULL; } #else -static ssize_t __init setup_pcpu_lpage(size_t static_size) +static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { return -EINVAL; } @@ -311,7 +317,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(size_t static_size) +static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -320,7 +326,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) * this. Also, embedding allocation doesn't play well with * NUMA. */ - if (!cpu_has_pse || pcpu_need_numa()) + if (!chosen && (!cpu_has_pse || pcpu_need_numa())) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, @@ -370,8 +376,11 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) void *ptr; ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); goto enomem; + } memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); pcpu4k_pages[j++] = virt_to_page(ptr); @@ -395,6 +404,16 @@ out_free_ar: return ret; } +/* for explicit first chunk allocator selection */ +static char pcpu_chosen_alloc[16] __initdata; + +static int __init percpu_alloc_setup(char *str) +{ + strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -408,11 +427,6 @@ static inline void setup_percpu_segment(int cpu) #endif } -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. - * Always point %gs to its beginning - */ void __init setup_per_cpu_areas(void) { size_t static_size = __per_cpu_end - __per_cpu_start; @@ -429,9 +443,26 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = setup_pcpu_lpage(static_size); - if (ret < 0) - ret = setup_pcpu_embed(static_size); + ret = -EINVAL; + if (strlen(pcpu_chosen_alloc)) { + if (strcmp(pcpu_chosen_alloc, "4k")) { + if (!strcmp(pcpu_chosen_alloc, "lpage")) + ret = setup_pcpu_lpage(static_size, true); + else if (!strcmp(pcpu_chosen_alloc, "embed")) + ret = setup_pcpu_embed(static_size, true); + else + pr_warning("PERCPU: unknown allocator %s " + "specified\n", pcpu_chosen_alloc); + if (ret < 0) + pr_warning("PERCPU: %s allocator failed (%zd), " + "falling back to 4k\n", + pcpu_chosen_alloc, ret); + } + } else { + ret = setup_pcpu_lpage(static_size, false); + if (ret < 0) + ret = setup_pcpu_embed(static_size, false); + } if (ret < 0) ret = setup_pcpu_4k(static_size); if (ret < 0) -- cgit v1.2.2 From 0017c869ddcb73069905d09f9e98e68627466237 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Jun 2009 11:56:24 +0900 Subject: x86: ensure percpu lpage doesn't consume too much vmalloc space On extreme configuration (e.g. 32bit 32-way NUMA machine), lpage percpu first chunk allocator can consume too much of vmalloc space. Make it fall back to 4k allocator if the consumption goes over 20%. [ Impact: add sanity check for lpage percpu first chunk allocator ] Signed-off-by: Tejun Heo Reported-by: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 165ebd5ba83b..29a3eef7cf4a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -163,9 +163,21 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) int i, j; ssize_t ret; - /* on non-NUMA, embedding is better */ - if (!chosen && !pcpu_need_numa()) - return -EINVAL; + if (!chosen) { + size_t vm_size = VMALLOC_END - VMALLOC_START; + size_t tot_size = num_possible_cpus() * PMD_SIZE; + + /* on non-NUMA, embedding is better */ + if (!pcpu_need_numa()) + return -EINVAL; + + /* don't consume more than 20% of vmalloc area */ + if (tot_size > vm_size / 5) { + pr_info("PERCPU: too large chunk size %zuMB for " + "large page remap\n", tot_size >> 20); + return -EINVAL; + } + } /* need PSE */ if (!cpu_has_pse) { -- cgit v1.2.2