author		Linus Torvalds <torvalds@linux-foundation.org>	2010-10-21 16:47:29 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-21 16:47:29 -0400
commit		c3b86a29429dac1033e3f602f51fa8d00006a8eb
tree		bcedd0a553ca2396eeb58318ef6ee6b426e83652 /arch/x86
parent		8d8d2e9ccd331a1345c88b292ebee9d256fd8749
parent		2aeb66d3036dbafc297ac553a257a40283dadb3e
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86-32, percpu: Correct the ordering of the percpu readmostly section
x86, mm: Enable ARCH_DMA_ADDR_T_64BIT with X86_64 || HIGHMEM64G
x86: Spread tlb flush vector between nodes
percpu: Introduce a read-mostly percpu API
x86, mm: Fix incorrect data type in vmalloc_sync_all()
x86, mm: Hold mm->page_table_lock while doing vmalloc_sync
x86, mm: Fix bogus whitespace in sync_global_pgds()
x86-32: Fix sparse warning for the __PHYSICAL_MASK calculation
x86, mm: Add RESERVE_BRK_ARRAY() helper
mm, x86: Saving vmcore with non-lazy freeing of vmas
x86, kdump: Change copy_oldmem_page() to use cached addressing
x86, mm: fix uninitialized addr in kernel_physical_mapping_init()
x86, kmemcheck: Remove double test
x86, mm: Make spurious_fault check explicitly check the PRESENT bit
x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes
x86, mm: Separate x86_64 vmalloc_sync_all() into separate functions
x86, mm: Avoid unnecessary TLB flush
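
One of the commits above introduces a read-mostly variant of the per-CPU declaration macros; this merge then uses it in arch/x86/mm/tlb.c (the tlb_vector_offset hunk below). A minimal kernel-context sketch of the pattern, with an illustrative variable name that is not part of this merge:

	#include <linux/percpu.h>

	/* Read-mostly per-CPU data is grouped away from frequently written
	 * per-CPU data to avoid false sharing. */
	static DEFINE_PER_CPU_READ_MOSTLY(int, example_offset);

	static int read_example_offset(void)
	{
		return this_cpu_read(example_offset);	/* cheap read on the local CPU */
	}
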
Diffstat (limited to 'arch/x86')
-rw-r--r--   arch/x86/Kconfig                      3
-rw-r--r--   arch/x86/include/asm/io.h             1
-rw-r--r--   arch/x86/include/asm/page_types.h     2
-rw-r--r--   arch/x86/include/asm/pgtable.h        4
-rw-r--r--   arch/x86/include/asm/pgtable_64.h     2
-rw-r--r--   arch/x86/include/asm/setup.h          5
-rw-r--r--   arch/x86/kernel/crash_dump_64.c       3
-rw-r--r--   arch/x86/mm/fault.c                  43
-rw-r--r--   arch/x86/mm/init_64.c                47
-rw-r--r--   arch/x86/mm/kmemcheck/opcode.c        2
-rw-r--r--   arch/x86/mm/pgtable.c                20
-rw-r--r--   arch/x86/mm/tlb.c                    48
12 files changed, 147 insertions, 33 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b8676498d8df..8e9c4d4772fb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1163,6 +1163,9 @@ config X86_PAE
 config ARCH_PHYS_ADDR_T_64BIT
 	def_bool X86_64 || X86_PAE
 
+config ARCH_DMA_ADDR_T_64BIT
+	def_bool X86_64 || HIGHMEM64G
+
 config DIRECT_GBPAGES
 	bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
 	default y
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 30a3e9776123..6a45ec41ec26 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -206,6 +206,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
 
 extern void iounmap(volatile void __iomem *addr);
 
+extern void set_iounmap_nonlazy(void);
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index a667f24c7254..1df66211fd1b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -8,7 +8,7 @@
 #define PAGE_SIZE	(_AC(1,UL) << PAGE_SHIFT)
 #define PAGE_MASK	(~(PAGE_SIZE-1))
 
-#define __PHYSICAL_MASK		((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define __PHYSICAL_MASK		((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
 #define __VIRTUAL_MASK		((1UL << __VIRTUAL_MASK_SHIFT) - 1)
 
 /* Cast PAGE_MASK to a signed type so that it is sign-extended if
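
The __PHYSICAL_MASK change above moves the "- 1" inside the cast, so the constant being cast already fits in phys_addr_t; that reordering is presumably what silences sparse on 32-bit builds, where the old form first truncated 1ULL << __PHYSICAL_MASK_SHIFT. A standalone illustration, assuming a 32-bit phys_addr_t and a shift of 32 (values illustrative of the non-PAE case, not taken from this commit):

	#include <stdint.h>
	#include <stdio.h>

	typedef uint32_t phys_addr_t;	/* assumption: 32-bit, non-PAE */
	#define SHIFT 32		/* illustrative __PHYSICAL_MASK_SHIFT */

	int main(void)
	{
		/* old form: the cast truncates 1ULL << 32 to 0, then subtracts 1 */
		phys_addr_t old_mask = (phys_addr_t)(1ULL << SHIFT) - 1;
		/* new form: subtract first, then cast a value that already fits */
		phys_addr_t new_mask = (phys_addr_t)((1ULL << SHIFT) - 1);

		printf("old=%#x new=%#x\n", old_mask, new_mask);	/* both 0xffffffff */
		return 0;
	}

Both forms evaluate to the same mask; only the truncating cast of an out-of-range constant goes away.
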
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a34c785c5a63..ada823a13c7c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
 
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
@@ -603,6 +605,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
 	pte_update(mm, addr, ptep);
 }
 
+#define flush_tlb_fix_spurious_fault(vma, address)
+
 /*
  * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
  *
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 076052cd62be..f96ac9bedf75 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -102,6 +102,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
 	native_set_pgd(pgd, native_make_pgd(0));
 }
 
+extern void sync_global_pgds(unsigned long start, unsigned long end);
+
 /*
  * Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ef292c792d74..d6763b139a84 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align);
 			: : "i" (sz));		\
 	}
 
+/* Helper for reserving space for arrays of things */
+#define RESERVE_BRK_ARRAY(type, name, entries)		\
+	type *name;					\
+	RESERVE_BRK(name, sizeof(type) * entries)
+
 #ifdef __i386__
 
 void __init i386_start_kernel(void);
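
RESERVE_BRK_ARRAY() above only declares a pointer and reserves matching brk space; the pointer still has to be aimed at that space during early boot, typically via extend_brk(), whose prototype appears in this hunk's context. A hedged kernel-context sketch of that pairing; the array name and size are illustrative, not from this merge:

	/* File scope: declares 'early_stats' and reserves brk room for 256 entries. */
	RESERVE_BRK_ARRAY(unsigned long, early_stats, 256);

	void __init early_stats_init(void)
	{
		/* Hand the reserved brk space to the pointer at early-boot time. */
		early_stats = extend_brk(sizeof(*early_stats) * 256,
					 sizeof(unsigned long));
	}
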
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045b36cada65..994828899e09 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
 	if (!csize)
 		return 0;
 
-	vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+	vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
 	if (!vaddr)
 		return -ENOMEM;
 
@@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
 	} else
 		memcpy(buf, vaddr + offset, csize);
 
+	set_iounmap_nonlazy();
 	iounmap(vaddr);
 	return csize;
 }
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a24c6cfdccc4..79b0b372d2d0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
 
 	spin_lock_irqsave(&pgd_lock, flags);
 	list_for_each_entry(page, &pgd_list, lru) {
-		if (!vmalloc_sync_one(page_address(page), address))
+		spinlock_t *pgt_lock;
+		pmd_t *ret;
+
+		pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+		spin_lock(pgt_lock);
+		ret = vmalloc_sync_one(page_address(page), address);
+		spin_unlock(pgt_lock);
+
+		if (!ret)
 			break;
 	}
 	spin_unlock_irqrestore(&pgd_lock, flags);
@@ -328,29 +337,7 @@ out:
 
 void vmalloc_sync_all(void)
 {
-	unsigned long address;
-
-	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
-	     address += PGDIR_SIZE) {
-
-		const pgd_t *pgd_ref = pgd_offset_k(address);
-		unsigned long flags;
-		struct page *page;
-
-		if (pgd_none(*pgd_ref))
-			continue;
-
-		spin_lock_irqsave(&pgd_lock, flags);
-		list_for_each_entry(page, &pgd_list, lru) {
-			pgd_t *pgd;
-			pgd = (pgd_t *)page_address(page) + pgd_index(address);
-			if (pgd_none(*pgd))
-				set_pgd(pgd, *pgd_ref);
-			else
-				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
-	}
+	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
 }
 
 /*
@@ -898,8 +885,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	if (pmd_large(*pmd))
 		return spurious_fault_check(error_code, (pte_t *) pmd);
 
+	/*
+	 * Note: don't use pte_present() here, since it returns true
+	 * if the _PAGE_PROTNONE bit is set. However, this aliases the
+	 * _PAGE_GLOBAL bit, which for kernel pages give false positives
+	 * when CONFIG_DEBUG_PAGEALLOC is used.
+	 */
 	pte = pte_offset_kernel(pmd, address);
-	if (!pte_present(*pte))
+	if (!(pte_flags(*pte) & _PAGE_PRESENT))
 		return 0;
 
 	ret = spurious_fault_check(error_code, pte);
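
The comment block added above carries the reasoning behind this fix: pte_present() also reports PTEs whose _PAGE_PROTNONE bit is set, and for kernel mappings that bit position doubles as _PAGE_GLOBAL. A kernel-context sketch of the contrast; the pte_present() body shown is how kernels of this era roughly defined it, stated here as an assumption rather than part of this diff:

	/* Roughly what pte_present() checks (assumption, shown for contrast): */
	static inline int pte_present_approx(pte_t a)
	{
		return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
	}

	/* What spurious_fault() now checks: only the hardware PRESENT bit, so a
	 * global kernel PTE unmapped by CONFIG_DEBUG_PAGEALLOC (PRESENT clear,
	 * GLOBAL/PROTNONE alias still set) is not mistaken for a live mapping. */
	static inline int pte_hw_present(pte_t a)
	{
		return pte_flags(a) & _PAGE_PRESENT;
	}
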
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 7c48ad4faca3..c55f900fbf89 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -98,6 +98,43 @@ static int __init nonx32_setup(char *str)
 __setup("noexec32=", nonx32_setup);
 
 /*
+ * When memory was added/removed make sure all the processes MM have
+ * suitable PGD entries in the local PGD level page.
+ */
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+	unsigned long address;
+
+	for (address = start; address <= end; address += PGDIR_SIZE) {
+		const pgd_t *pgd_ref = pgd_offset_k(address);
+		unsigned long flags;
+		struct page *page;
+
+		if (pgd_none(*pgd_ref))
+			continue;
+
+		spin_lock_irqsave(&pgd_lock, flags);
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd;
+			spinlock_t *pgt_lock;
+
+			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+			spin_lock(pgt_lock);
+
+			if (pgd_none(*pgd))
+				set_pgd(pgd, *pgd_ref);
+			else
+				BUG_ON(pgd_page_vaddr(*pgd)
+				       != pgd_page_vaddr(*pgd_ref));
+
+			spin_unlock(pgt_lock);
+		}
+		spin_unlock_irqrestore(&pgd_lock, flags);
+	}
+}
+
+/*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
  */
@@ -534,11 +571,13 @@ kernel_physical_mapping_init(unsigned long start,
 			     unsigned long end,
 			     unsigned long page_size_mask)
 {
-
+	bool pgd_changed = false;
 	unsigned long next, last_map_addr = end;
+	unsigned long addr;
 
 	start = (unsigned long)__va(start);
 	end = (unsigned long)__va(end);
+	addr = start;
 
 	for (; start < end; start = next) {
 		pgd_t *pgd = pgd_offset_k(start);
@@ -563,7 +602,12 @@ kernel_physical_mapping_init(unsigned long start,
 		spin_lock(&init_mm.page_table_lock);
 		pgd_populate(&init_mm, pgd, __va(pud_phys));
 		spin_unlock(&init_mm.page_table_lock);
+		pgd_changed = true;
 	}
+
+	if (pgd_changed)
+		sync_global_pgds(addr, end);
+
 	__flush_tlb_all();
 
 	return last_map_addr;
@@ -1003,6 +1047,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
 		}
 
 	}
+	sync_global_pgds((unsigned long)start_page, end);
 	return 0;
 }
 
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 63c19e27aa6f..324aa3f07237 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
 		b == 0xf0 || b == 0xf2 || b == 0xf3
 		/* Group 2 */
 		|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
-		|| b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
+		|| b == 0x64 || b == 0x65
 		/* Group 3 */
 		|| b == 0x66
 		/* Group 4 */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee422590e..c70e57dbb491 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD	\
 	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 
-static void pgd_ctor(pgd_t *pgd)
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+	virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+	return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 {
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -105,8 +117,10 @@ static void pgd_ctor(pgd_t *pgd)
 	}
 
 	/* list required to sync kernel mapping updates */
-	if (!SHARED_KERNEL_PMD)
+	if (!SHARED_KERNEL_PMD) {
+		pgd_set_mm(pgd, mm);
 		pgd_list_add(pgd);
+	}
 }
 
 static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +286,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 */
 	spin_lock_irqsave(&pgd_lock, flags);
 
-	pgd_ctor(pgd);
+	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
 	spin_unlock_irqrestore(&pgd_lock, flags);
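
pgd_set_mm() above stashes the owning mm in the pgd page's otherwise-unused index field and guards the reuse with a compile-time size check, so the sync paths can later recover the mm via pgd_page_get_mm() and take its page_table_lock. A standalone, non-kernel illustration of that pattern (all types and names here are invented for the demo):

	#include <stdio.h>

	struct owner { int id; };

	struct page_like {
		unsigned long index;	/* stands in for struct page's ->index */
	};

	/* Compile-time check in the spirit of the kernel's BUILD_BUG_ON(). */
	#define BUILD_BUG_ON_DEMO(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

	static void set_owner(struct page_like *p, struct owner *o)
	{
		BUILD_BUG_ON_DEMO(sizeof(p->index) < sizeof(o));
		p->index = (unsigned long)o;
	}

	static struct owner *get_owner(struct page_like *p)
	{
		return (struct owner *)p->index;
	}

	int main(void)
	{
		struct owner o = { .id = 42 };
		struct page_like pg;

		set_owner(&pg, &o);
		printf("owner id = %d\n", get_owner(&pg)->id);
		return 0;
	}
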
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c03f14ab6667..49358481c733 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/cpu.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
    want false sharing in the per cpu data segment. */
 static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
 
+static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
+
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
@@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
 	union smp_flush_state *f;
 
 	/* Caller has disabled preemption */
-	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+	sender = this_cpu_read(tlb_vector_offset);
 	f = &flush_state[sender];
 
 	/*
@@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 		flush_tlb_others_ipi(cpumask, mm, va);
 }
 
+static void __cpuinit calculate_tlb_offset(void)
+{
+	int cpu, node, nr_node_vecs;
+	/*
+	 * we are changing tlb_vector_offset for each CPU in runtime, but this
+	 * will not cause inconsistency, as the write is atomic under X86. we
+	 * might see more lock contentions in a short time, but after all CPU's
+	 * tlb_vector_offset are changed, everything should go normal
+	 *
+	 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
+	 * waste some vectors.
+	 **/
+	if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
+		nr_node_vecs = 1;
+	else
+		nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
+
+	for_each_online_node(node) {
+		int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
+			nr_node_vecs;
+		int cpu_offset = 0;
+		for_each_cpu(cpu, cpumask_of_node(node)) {
+			per_cpu(tlb_vector_offset, cpu) = node_offset +
+				cpu_offset;
+			cpu_offset++;
+			cpu_offset = cpu_offset % nr_node_vecs;
+		}
+	}
+}
+
+static int tlb_cpuhp_notify(struct notifier_block *n,
+		unsigned long action, void *hcpu)
+{
+	switch (action & 0xf) {
+	case CPU_ONLINE:
+	case CPU_DEAD:
+		calculate_tlb_offset();
+	}
+	return NOTIFY_OK;
+}
+
 static int __cpuinit init_smp_flush(void)
 {
 	int i;
@@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void)
 	for (i = 0; i < ARRAY_SIZE(flush_state); i++)
 		raw_spin_lock_init(&flush_state[i].tlbstate_lock);
 
+	calculate_tlb_offset();
+	hotcpu_notifier(tlb_cpuhp_notify, 0);
 	return 0;
 }
 core_initcall(init_smp_flush);
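
calculate_tlb_offset() above partitions the invalidate vectors between NUMA nodes and then round-robins each node's CPUs over its share. A standalone sketch of the same arithmetic for a made-up topology (8 vectors as the assumed NUM_INVALIDATE_TLB_VECTORS, 2 online nodes, 4 CPUs per node; none of these numbers come from this commit):

	#include <stdio.h>

	#define NUM_INVALIDATE_TLB_VECTORS 8	/* assumed value for the demo */

	int main(void)
	{
		int nr_online_nodes = 2, cpus_per_node = 4;	/* hypothetical box */
		int nr_node_vecs;

		if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
			nr_node_vecs = 1;
		else
			nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes;

		for (int node = 0; node < nr_online_nodes; node++) {
			int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * nr_node_vecs;
			int cpu_offset = 0;

			for (int cpu = 0; cpu < cpus_per_node; cpu++) {
				printf("node %d cpu %d -> vector offset %d\n",
				       node, cpu, node_offset + cpu_offset);
				cpu_offset = (cpu_offset + 1) % nr_node_vecs;
			}
		}
		return 0;
	}

With these numbers, node 0's CPUs land on vector offsets 0-3 and node 1's on 4-7, which is the cross-node spreading the "Spread tlb flush vector between nodes" commit describes.
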