author     Linus Torvalds <torvalds@linux-foundation.org>   2012-07-26 16:17:17 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-07-26 16:17:17 -0400
commit     4cb38750d49010ae72e718d46605ac9ba5a851b4 (patch)
tree       8c991a900fd176288f4acbc340512b90d604374d
parent     0a2fe19ccc4bc552a8083a595a3aa737b8bea727 (diff)
parent     7efa1c87963d23cc57ba40c07316d3e28cc75a3a (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86/mm changes from Peter Anvin:
 "The big change here is the patchset by Alex Shi to use INVLPG to
  flush only the affected pages when we only need to flush a small
  page range.

  It also removes the special INVALIDATE_TLB_VECTOR interrupts (32
  vectors!) and replace it with an ordinary IPI function call."

Fix up trivial conflicts in arch/x86/include/asm/apic.h (added code
next to changed line)

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/tlb: Fix build warning and crash when building for !SMP
  x86/tlb: do flush_tlb_kernel_range by 'invlpg'
  x86/tlb: replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR
  x86/tlb: enable tlb flush range support for x86
  mm/mmu_gather: enable tlb flush range in generic mmu_gather
  x86/tlb: add tlb_flushall_shift knob into debugfs
  x86/tlb: add tlb_flushall_shift for specific CPU
  x86/tlb: fall back to flush all when meet a THP large page
  x86/flush_tlb: try flush_tlb_single one by one in flush_tlb_range
  x86/tlb_info: get last level TLB entry number of CPU
  x86: Add read_mostly declaration/definition to variables from smp.h
  x86: Define early read-mostly per-cpu macros
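
The core of the series is the heuristic that flush_tlb_mm_range() applies in
arch/x86/mm/tlb.c below: invalidate page by page with INVLPG only while the
number of pages to flush stays at or under active_tlb_entries >>
tlb_flushall_shift, and fall back to a full CR3-write flush otherwise (or
whenever the shift is -1). A small user-space model of just that decision rule
follows; the TLB size, the shift value and the helper name want_full_flush()
are illustrative, not taken from the kernel or from any particular CPU.

/*
 * Stand-alone model of the flush-range heuristic that the patchset adds
 * to flush_tlb_mm_range().  Only the decision rule mirrors the kernel
 * code; the constants below are made up for the demonstration.
 */
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

static int tlb_flushall_shift = 5;       /* -1 would disable per-page flushing */
static unsigned long tlb_entries = 512;  /* assumed last-level 4K dTLB entries */

/* Return 1 for a full TLB flush, 0 for per-page INVLPG. */
static int want_full_flush(unsigned long start, unsigned long end,
                           unsigned long task_pages)
{
        unsigned long act_entries;

        if (tlb_flushall_shift == -1)
                return 1;
        /* Assume at most the whole TLB is occupied by this task. */
        act_entries = task_pages > tlb_entries ? tlb_entries : task_pages;
        return ((end - start) >> PAGE_SHIFT) > (act_entries >> tlb_flushall_shift);
}

int main(void)
{
        unsigned long start = 0x400000UL;
        unsigned long sizes[] = { 4, 16, 64, 4096 };    /* pages to flush */
        unsigned long task_pages = 1024;                /* mm->total_vm stand-in */
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                unsigned long end = start + sizes[i] * PAGE_SIZE;
                printf("%4lu pages -> %s\n", sizes[i],
                       want_full_flush(start, end, task_pages) ?
                       "full flush (CR3 write)" : "per-page INVLPG");
        }
        return 0;
}

With a 512-entry TLB and a shift of 5 the cut-over lands at 16 pages: the 4-
and 16-page flushes go through INVLPG and anything larger falls back to a full
flush, which is the formula spelled out in the new Kconfig help text and in the
comment added to arch/x86/kernel/cpu/common.c.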
-rw-r--r--  arch/x86/Kconfig.debug                 |  19
-rw-r--r--  arch/x86/include/asm/apic.h            |   2
-rw-r--r--  arch/x86/include/asm/entry_arch.h      |   9
-rw-r--r--  arch/x86/include/asm/irq_vectors.h     |  11
-rw-r--r--  arch/x86/include/asm/paravirt.h        |   5
-rw-r--r--  arch/x86/include/asm/paravirt_types.h  |   3
-rw-r--r--  arch/x86/include/asm/percpu.h          |  17
-rw-r--r--  arch/x86/include/asm/processor.h       |  13
-rw-r--r--  arch/x86/include/asm/smp.h             |  16
-rw-r--r--  arch/x86/include/asm/tlb.h             |   9
-rw-r--r--  arch/x86/include/asm/tlbflush.h        |  49
-rw-r--r--  arch/x86/include/asm/uv/uv.h           |   5
-rw-r--r--  arch/x86/kernel/apic/apic.c            |   6
-rw-r--r--  arch/x86/kernel/cpu/common.c           |  31
-rw-r--r--  arch/x86/kernel/cpu/cpu.h              |   9
-rw-r--r--  arch/x86/kernel/cpu/intel.c            | 176
-rw-r--r--  arch/x86/kernel/entry_64.S             |  18
-rw-r--r--  arch/x86/kernel/irqinit.c              |  73
-rw-r--r--  arch/x86/kernel/setup_percpu.c         |   2
-rw-r--r--  arch/x86/kernel/smpboot.c              |   8
-rw-r--r--  arch/x86/mm/tlb.c                      | 401
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c          |   6
-rw-r--r--  arch/x86/xen/mmu.c                     |  12
-rw-r--r--  include/asm-generic/tlb.h              |   5
-rw-r--r--  include/trace/events/xen.h             |  12
-rw-r--r--  mm/memory.c                            |   9
26 files changed, 561 insertions, 365 deletions
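
The bulk of the removed lines comes from retiring the 32 dedicated
INVALIDATE_TLB_VECTOR interrupts (entry_64.S, irqinit.c and the per-vector
state in arch/x86/mm/tlb.c): the sender now fills a small struct
flush_tlb_info and hands it to the ordinary function-call IPI, and each
receiving CPU either walks the range with INVLPG or does a full flush. The
sketch below models only that descriptor-plus-callback shape in user space;
fake_smp_call_function_many() and the printf() calls stand in for the real
IPI and for INVLPG, and the lazy-TLB, active_mm and single-page special cases
of the real flush_tlb_func() are left out.

/*
 * User-space model of the descriptor passed through the generic
 * function-call IPI after the patchset; see flush_tlb_func() and
 * native_flush_tlb_others() in arch/x86/mm/tlb.c below.
 */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE       4096UL
#define TLB_FLUSH_ALL   (~0UL)

struct flush_tlb_info {
        void *flush_mm;                 /* stands in for struct mm_struct * */
        unsigned long flush_start;
        unsigned long flush_end;
};

/* What a remote CPU runs when the function-call IPI arrives. */
static void flush_tlb_func(void *info)
{
        struct flush_tlb_info *f = info;
        unsigned long addr;

        if (f->flush_end == TLB_FLUSH_ALL) {
                printf("  remote: full TLB flush\n");
                return;
        }
        for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
                printf("  remote: invlpg %#lx\n", addr);
}

/* Stand-in for smp_call_function_many(cpumask, func, info, wait). */
static void fake_smp_call_function_many(void (*func)(void *), void *info)
{
        func(info);
}

int main(void)
{
        struct flush_tlb_info info = {
                .flush_mm    = NULL,
                .flush_start = 0x400000UL,
                .flush_end   = 0x400000UL + 4 * PAGE_SIZE,
        };

        printf("range flush:\n");
        fake_smp_call_function_many(flush_tlb_func, &info);

        info.flush_end = TLB_FLUSH_ALL;
        printf("full flush:\n");
        fake_smp_call_function_many(flush_tlb_func, &info);
        return 0;
}

Because the descriptor travels with the generic IPI, the old tlbstate_lock,
the flush_state[] array and the tlb_vector_offset balancing logic disappear
along with the vectors themselves, as the diff below shows.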
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index e46c2147397f..b322f124ee3c 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -129,6 +129,25 @@ config DOUBLEFAULT
129 option saves about 4k and might cause you much additional grey 129 option saves about 4k and might cause you much additional grey
130 hair. 130 hair.
131 131
132config DEBUG_TLBFLUSH
133 bool "Set upper limit of TLB entries to flush one-by-one"
134 depends on DEBUG_KERNEL && (X86_64 || X86_INVLPG)
135 ---help---
136
137 X86-only for now.
138
139 This option allows the user to tune the amount of TLB entries the
140 kernel flushes one-by-one instead of doing a full TLB flush. In
141 certain situations, the former is cheaper. This is controlled by the
142 tlb_flushall_shift knob under /sys/kernel/debug/x86. If you set it
143 to -1, the code flushes the whole TLB unconditionally. Otherwise,
144 for positive values of it, the kernel will use single TLB entry
145 invalidating instructions according to the following formula:
146
147 flush_entries <= active_tlb_entries / 2^tlb_flushall_shift
148
149 If in doubt, say "N".
150
132config IOMMU_DEBUG 151config IOMMU_DEBUG
133 bool "Enable IOMMU debugging" 152 bool "Enable IOMMU debugging"
134 depends on GART_IOMMU && DEBUG_KERNEL 153 depends on GART_IOMMU && DEBUG_KERNEL
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3ea51a84a0e4..f34261296ffb 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -546,7 +546,7 @@ static inline const struct cpumask *online_target_cpus(void)
546 return cpu_online_mask; 546 return cpu_online_mask;
547} 547}
548 548
549DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); 549DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
550 550
551 551
552static inline unsigned int read_apic_id(void) 552static inline unsigned int read_apic_id(void)
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 0baa628e330c..40afa0005c69 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -15,15 +15,6 @@ BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) 15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) 17BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
18
19.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
20 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
21.if NUM_INVALIDATE_TLB_VECTORS > \idx
22BUILD_INTERRUPT3(invalidate_interrupt\idx,
23 (INVALIDATE_TLB_VECTOR_START)+\idx,
24 smp_invalidate_interrupt)
25.endif
26.endr
27#endif 18#endif
28 19
29BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) 20BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 4b4448761e88..1508e518c7e3 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -119,17 +119,6 @@
119 */ 119 */
120#define LOCAL_TIMER_VECTOR 0xef 120#define LOCAL_TIMER_VECTOR 0xef
121 121
122/* up to 32 vectors used for spreading out TLB flushes: */
123#if NR_CPUS <= 32
124# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS)
125#else
126# define NUM_INVALIDATE_TLB_VECTORS (32)
127#endif
128
129#define INVALIDATE_TLB_VECTOR_END (0xee)
130#define INVALIDATE_TLB_VECTOR_START \
131 (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
132
133#define NR_VECTORS 256 122#define NR_VECTORS 256
134 123
135#define FPU_IRQ 13 124#define FPU_IRQ 13
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0b47ddb6f00b..a0facf3908d7 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -360,9 +360,10 @@ static inline void __flush_tlb_single(unsigned long addr)
360 360
361static inline void flush_tlb_others(const struct cpumask *cpumask, 361static inline void flush_tlb_others(const struct cpumask *cpumask,
362 struct mm_struct *mm, 362 struct mm_struct *mm,
363 unsigned long va) 363 unsigned long start,
364 unsigned long end)
364{ 365{
365 PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va); 366 PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
366} 367}
367 368
368static inline int paravirt_pgd_alloc(struct mm_struct *mm) 369static inline int paravirt_pgd_alloc(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 8613cbb7ba41..142236ed83af 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -248,7 +248,8 @@ struct pv_mmu_ops {
248 void (*flush_tlb_single)(unsigned long addr); 248 void (*flush_tlb_single)(unsigned long addr);
249 void (*flush_tlb_others)(const struct cpumask *cpus, 249 void (*flush_tlb_others)(const struct cpumask *cpus,
250 struct mm_struct *mm, 250 struct mm_struct *mm,
251 unsigned long va); 251 unsigned long start,
252 unsigned long end);
252 253
253 /* Hooks for allocating and freeing a pagetable top-level */ 254 /* Hooks for allocating and freeing a pagetable top-level */
254 int (*pgd_alloc)(struct mm_struct *mm); 255 int (*pgd_alloc)(struct mm_struct *mm);
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index d9b8e3f7f42a..1104afaba52b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -551,6 +551,12 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);
551 { [0 ... NR_CPUS-1] = _initvalue }; \ 551 { [0 ... NR_CPUS-1] = _initvalue }; \
552 __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map 552 __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
553 553
554#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \
555 DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue; \
556 __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \
557 { [0 ... NR_CPUS-1] = _initvalue }; \
558 __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
559
554#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ 560#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
555 EXPORT_PER_CPU_SYMBOL(_name) 561 EXPORT_PER_CPU_SYMBOL(_name)
556 562
@@ -559,6 +565,11 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);
559 extern __typeof__(_type) *_name##_early_ptr; \ 565 extern __typeof__(_type) *_name##_early_ptr; \
560 extern __typeof__(_type) _name##_early_map[] 566 extern __typeof__(_type) _name##_early_map[]
561 567
568#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
569 DECLARE_PER_CPU_READ_MOSTLY(_type, _name); \
570 extern __typeof__(_type) *_name##_early_ptr; \
571 extern __typeof__(_type) _name##_early_map[]
572
562#define early_per_cpu_ptr(_name) (_name##_early_ptr) 573#define early_per_cpu_ptr(_name) (_name##_early_ptr)
563#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) 574#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
564#define early_per_cpu(_name, _cpu) \ 575#define early_per_cpu(_name, _cpu) \
@@ -570,12 +581,18 @@ DECLARE_PER_CPU(unsigned long, this_cpu_off);
570#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ 581#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
571 DEFINE_PER_CPU(_type, _name) = _initvalue 582 DEFINE_PER_CPU(_type, _name) = _initvalue
572 583
584#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \
585 DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue
586
573#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ 587#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
574 EXPORT_PER_CPU_SYMBOL(_name) 588 EXPORT_PER_CPU_SYMBOL(_name)
575 589
576#define DECLARE_EARLY_PER_CPU(_type, _name) \ 590#define DECLARE_EARLY_PER_CPU(_type, _name) \
577 DECLARE_PER_CPU(_type, _name) 591 DECLARE_PER_CPU(_type, _name)
578 592
593#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
594 DECLARE_PER_CPU_READ_MOSTLY(_type, _name)
595
579#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu) 596#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
580#define early_per_cpu_ptr(_name) NULL 597#define early_per_cpu_ptr(_name) NULL
581/* no early_per_cpu_map() */ 598/* no early_per_cpu_map() */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 39bc5777211a..d048cad9bcad 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -61,6 +61,19 @@ static inline void *current_text_addr(void)
61# define ARCH_MIN_MMSTRUCT_ALIGN 0 61# define ARCH_MIN_MMSTRUCT_ALIGN 0
62#endif 62#endif
63 63
64enum tlb_infos {
65 ENTRIES,
66 NR_INFO
67};
68
69extern u16 __read_mostly tlb_lli_4k[NR_INFO];
70extern u16 __read_mostly tlb_lli_2m[NR_INFO];
71extern u16 __read_mostly tlb_lli_4m[NR_INFO];
72extern u16 __read_mostly tlb_lld_4k[NR_INFO];
73extern u16 __read_mostly tlb_lld_2m[NR_INFO];
74extern u16 __read_mostly tlb_lld_4m[NR_INFO];
75extern s8 __read_mostly tlb_flushall_shift;
76
64/* 77/*
65 * CPU type and hardware bug flags. Kept separately for each CPU. 78 * CPU type and hardware bug flags. Kept separately for each CPU.
66 * Members of this structure are referenced in head.S, so think twice 79 * Members of this structure are referenced in head.S, so think twice
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 2ffa95dc2333..4f19a1526037 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -31,12 +31,12 @@ static inline bool cpu_has_ht_siblings(void)
31 return has_siblings; 31 return has_siblings;
32} 32}
33 33
34DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); 34DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
35DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); 35DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
36/* cpus sharing the last level cache: */ 36/* cpus sharing the last level cache: */
37DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); 37DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
38DECLARE_PER_CPU(u16, cpu_llc_id); 38DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
39DECLARE_PER_CPU(int, cpu_number); 39DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
40 40
41static inline struct cpumask *cpu_sibling_mask(int cpu) 41static inline struct cpumask *cpu_sibling_mask(int cpu)
42{ 42{
@@ -53,10 +53,10 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
53 return per_cpu(cpu_llc_shared_map, cpu); 53 return per_cpu(cpu_llc_shared_map, cpu);
54} 54}
55 55
56DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); 56DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
57DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); 57DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
58#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 58#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
59DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); 59DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
60#endif 60#endif
61 61
62/* Static state in head.S used to set up a CPU */ 62/* Static state in head.S used to set up a CPU */
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 829215fef9ee..4fef20773b8f 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -4,7 +4,14 @@
4#define tlb_start_vma(tlb, vma) do { } while (0) 4#define tlb_start_vma(tlb, vma) do { } while (0)
5#define tlb_end_vma(tlb, vma) do { } while (0) 5#define tlb_end_vma(tlb, vma) do { } while (0)
6#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) 6#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
7#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) 7
8#define tlb_flush(tlb) \
9{ \
10 if (tlb->fullmm == 0) \
11 flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \
12 else \
13 flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \
14}
8 15
9#include <asm-generic/tlb.h> 16#include <asm-generic/tlb.h>
10 17
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 36a1a2ab87d2..74a44333545a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -73,14 +73,10 @@ static inline void __flush_tlb_one(unsigned long addr)
73 * - flush_tlb_page(vma, vmaddr) flushes one page 73 * - flush_tlb_page(vma, vmaddr) flushes one page
74 * - flush_tlb_range(vma, start, end) flushes a range of pages 74 * - flush_tlb_range(vma, start, end) flushes a range of pages
75 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages 75 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
76 * - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus 76 * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus
77 * 77 *
78 * ..but the i386 has somewhat limited tlb flushing capabilities, 78 * ..but the i386 has somewhat limited tlb flushing capabilities,
79 * and page-granular flushes are available only on i486 and up. 79 * and page-granular flushes are available only on i486 and up.
80 *
81 * x86-64 can only flush individual pages or full VMs. For a range flush
82 * we always do the full VM. Might be worth trying if for a small
83 * range a few INVLPGs in a row are a win.
84 */ 80 */
85 81
86#ifndef CONFIG_SMP 82#ifndef CONFIG_SMP
@@ -109,9 +105,17 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
109 __flush_tlb(); 105 __flush_tlb();
110} 106}
111 107
108static inline void flush_tlb_mm_range(struct mm_struct *mm,
109 unsigned long start, unsigned long end, unsigned long vmflag)
110{
111 if (mm == current->active_mm)
112 __flush_tlb();
113}
114
112static inline void native_flush_tlb_others(const struct cpumask *cpumask, 115static inline void native_flush_tlb_others(const struct cpumask *cpumask,
113 struct mm_struct *mm, 116 struct mm_struct *mm,
114 unsigned long va) 117 unsigned long start,
118 unsigned long end)
115{ 119{
116} 120}
117 121
@@ -119,27 +123,35 @@ static inline void reset_lazy_tlbstate(void)
119{ 123{
120} 124}
121 125
126static inline void flush_tlb_kernel_range(unsigned long start,
127 unsigned long end)
128{
129 flush_tlb_all();
130}
131
122#else /* SMP */ 132#else /* SMP */
123 133
124#include <asm/smp.h> 134#include <asm/smp.h>
125 135
126#define local_flush_tlb() __flush_tlb() 136#define local_flush_tlb() __flush_tlb()
127 137
138#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
139
140#define flush_tlb_range(vma, start, end) \
141 flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
142
128extern void flush_tlb_all(void); 143extern void flush_tlb_all(void);
129extern void flush_tlb_current_task(void); 144extern void flush_tlb_current_task(void);
130extern void flush_tlb_mm(struct mm_struct *);
131extern void flush_tlb_page(struct vm_area_struct *, unsigned long); 145extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
146extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
147 unsigned long end, unsigned long vmflag);
148extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
132 149
133#define flush_tlb() flush_tlb_current_task() 150#define flush_tlb() flush_tlb_current_task()
134 151
135static inline void flush_tlb_range(struct vm_area_struct *vma,
136 unsigned long start, unsigned long end)
137{
138 flush_tlb_mm(vma->vm_mm);
139}
140
141void native_flush_tlb_others(const struct cpumask *cpumask, 152void native_flush_tlb_others(const struct cpumask *cpumask,
142 struct mm_struct *mm, unsigned long va); 153 struct mm_struct *mm,
154 unsigned long start, unsigned long end);
143 155
144#define TLBSTATE_OK 1 156#define TLBSTATE_OK 1
145#define TLBSTATE_LAZY 2 157#define TLBSTATE_LAZY 2
@@ -159,13 +171,8 @@ static inline void reset_lazy_tlbstate(void)
159#endif /* SMP */ 171#endif /* SMP */
160 172
161#ifndef CONFIG_PARAVIRT 173#ifndef CONFIG_PARAVIRT
162#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va) 174#define flush_tlb_others(mask, mm, start, end) \
175 native_flush_tlb_others(mask, mm, start, end)
163#endif 176#endif
164 177
165static inline void flush_tlb_kernel_range(unsigned long start,
166 unsigned long end)
167{
168 flush_tlb_all();
169}
170
171#endif /* _ASM_X86_TLBFLUSH_H */ 178#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 3bb9491b7659..b47c2a82ff15 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -15,7 +15,8 @@ extern void uv_nmi_init(void);
15extern void uv_system_init(void); 15extern void uv_system_init(void);
16extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 16extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
17 struct mm_struct *mm, 17 struct mm_struct *mm,
18 unsigned long va, 18 unsigned long start,
19 unsigned end,
19 unsigned int cpu); 20 unsigned int cpu);
20 21
21#else /* X86_UV */ 22#else /* X86_UV */
@@ -26,7 +27,7 @@ static inline void uv_cpu_init(void) { }
26static inline void uv_system_init(void) { } 27static inline void uv_system_init(void) { }
27static inline const struct cpumask * 28static inline const struct cpumask *
28uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, 29uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
29 unsigned long va, unsigned int cpu) 30 unsigned long start, unsigned long end, unsigned int cpu)
30{ return cpumask; } 31{ return cpumask; }
31 32
32#endif /* X86_UV */ 33#endif /* X86_UV */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 98e24131ff3a..24deb3082328 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -75,8 +75,8 @@ physid_mask_t phys_cpu_present_map;
75/* 75/*
76 * Map cpu index to physical APIC ID 76 * Map cpu index to physical APIC ID
77 */ 77 */
78DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); 78DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
79DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); 79DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);
80EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); 80EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
81EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); 81EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
82 82
@@ -88,7 +88,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
88 * used for the mapping. This is where the behaviors of x86_64 and 32 88 * used for the mapping. This is where the behaviors of x86_64 and 32
89 * actually diverge. Let's keep it ugly for now. 89 * actually diverge. Let's keep it ugly for now.
90 */ 90 */
91DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID); 91DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
92 92
93/* 93/*
94 * Knob to control our willingness to enable the local APIC. 94 * Knob to control our willingness to enable the local APIC.
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5bbc082c47ad..46d8786d655e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -452,6 +452,35 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
452 c->x86_cache_size = l2size; 452 c->x86_cache_size = l2size;
453} 453}
454 454
455u16 __read_mostly tlb_lli_4k[NR_INFO];
456u16 __read_mostly tlb_lli_2m[NR_INFO];
457u16 __read_mostly tlb_lli_4m[NR_INFO];
458u16 __read_mostly tlb_lld_4k[NR_INFO];
459u16 __read_mostly tlb_lld_2m[NR_INFO];
460u16 __read_mostly tlb_lld_4m[NR_INFO];
461
462/*
463 * tlb_flushall_shift shows the balance point in replacing cr3 write
464 * with multiple 'invlpg'. It will do this replacement when
465 * flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
466 * If tlb_flushall_shift is -1, means the replacement will be disabled.
467 */
468s8 __read_mostly tlb_flushall_shift = -1;
469
470void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)
471{
472 if (this_cpu->c_detect_tlb)
473 this_cpu->c_detect_tlb(c);
474
475 printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
476 "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
477 "tlb_flushall_shift is 0x%x\n",
478 tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
479 tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
480 tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
481 tlb_flushall_shift);
482}
483
455void __cpuinit detect_ht(struct cpuinfo_x86 *c) 484void __cpuinit detect_ht(struct cpuinfo_x86 *c)
456{ 485{
457#ifdef CONFIG_X86_HT 486#ifdef CONFIG_X86_HT
@@ -911,6 +940,8 @@ void __init identify_boot_cpu(void)
911#else 940#else
912 vgetcpu_set_mode(); 941 vgetcpu_set_mode();
913#endif 942#endif
943 if (boot_cpu_data.cpuid_level >= 2)
944 cpu_detect_tlb(&boot_cpu_data);
914} 945}
915 946
916void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 947void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 8bacc7826fb3..4041c24ae7db 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -20,10 +20,19 @@ struct cpu_dev {
20 void (*c_bsp_init)(struct cpuinfo_x86 *); 20 void (*c_bsp_init)(struct cpuinfo_x86 *);
21 void (*c_init)(struct cpuinfo_x86 *); 21 void (*c_init)(struct cpuinfo_x86 *);
22 void (*c_identify)(struct cpuinfo_x86 *); 22 void (*c_identify)(struct cpuinfo_x86 *);
23 void (*c_detect_tlb)(struct cpuinfo_x86 *);
23 unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); 24 unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
24 int c_x86_vendor; 25 int c_x86_vendor;
25}; 26};
26 27
28struct _tlb_table {
29 unsigned char descriptor;
30 char tlb_type;
31 unsigned int entries;
32 /* unsigned int ways; */
33 char info[128];
34};
35
27#define cpu_dev_register(cpu_devX) \ 36#define cpu_dev_register(cpu_devX) \
28 static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ 37 static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \ 38 __attribute__((__section__(".x86_cpu_dev.init"))) = \
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 3e6ff6cbf42a..0a4ce2980a5a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -491,6 +491,181 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
491} 491}
492#endif 492#endif
493 493
494#define TLB_INST_4K 0x01
495#define TLB_INST_4M 0x02
496#define TLB_INST_2M_4M 0x03
497
498#define TLB_INST_ALL 0x05
499#define TLB_INST_1G 0x06
500
501#define TLB_DATA_4K 0x11
502#define TLB_DATA_4M 0x12
503#define TLB_DATA_2M_4M 0x13
504#define TLB_DATA_4K_4M 0x14
505
506#define TLB_DATA_1G 0x16
507
508#define TLB_DATA0_4K 0x21
509#define TLB_DATA0_4M 0x22
510#define TLB_DATA0_2M_4M 0x23
511
512#define STLB_4K 0x41
513
514static const struct _tlb_table intel_tlb_table[] __cpuinitconst = {
515 { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
516 { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
517 { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
518 { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
519 { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
520 { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
521 { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
522 { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
523 { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
524 { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
525 { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
526 { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
527 { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
528 { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
529 { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
530 { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
531 { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
532 { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
533 { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
534 { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
535 { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
536 { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
537 { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
538 { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
539 { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
540 { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
541 { 0x00, 0, 0 }
542};
543
544static void __cpuinit intel_tlb_lookup(const unsigned char desc)
545{
546 unsigned char k;
547 if (desc == 0)
548 return;
549
550 /* look up this descriptor in the table */
551 for (k = 0; intel_tlb_table[k].descriptor != desc && \
552 intel_tlb_table[k].descriptor != 0; k++)
553 ;
554
555 if (intel_tlb_table[k].tlb_type == 0)
556 return;
557
558 switch (intel_tlb_table[k].tlb_type) {
559 case STLB_4K:
560 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
561 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
562 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
563 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
564 break;
565 case TLB_INST_ALL:
566 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
567 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
568 if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
569 tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
570 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
571 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
572 break;
573 case TLB_INST_4K:
574 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
575 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
576 break;
577 case TLB_INST_4M:
578 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
579 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
580 break;
581 case TLB_INST_2M_4M:
582 if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
583 tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
584 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
585 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
586 break;
587 case TLB_DATA_4K:
588 case TLB_DATA0_4K:
589 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
590 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
591 break;
592 case TLB_DATA_4M:
593 case TLB_DATA0_4M:
594 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
595 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
596 break;
597 case TLB_DATA_2M_4M:
598 case TLB_DATA0_2M_4M:
599 if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
600 tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
601 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
602 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
603 break;
604 case TLB_DATA_4K_4M:
605 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
606 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
607 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
608 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
609 break;
610 }
611}
612
613static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
614{
615 if (!cpu_has_invlpg) {
616 tlb_flushall_shift = -1;
617 return;
618 }
619 switch ((c->x86 << 8) + c->x86_model) {
620 case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
621 case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
622 case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
623 case 0x61d: /* six-core 45 nm xeon "Dunnington" */
624 tlb_flushall_shift = -1;
625 break;
626 case 0x61a: /* 45 nm nehalem, "Bloomfield" */
627 case 0x61e: /* 45 nm nehalem, "Lynnfield" */
628 case 0x625: /* 32 nm nehalem, "Clarkdale" */
629 case 0x62c: /* 32 nm nehalem, "Gulftown" */
630 case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
631 case 0x62f: /* 32 nm Xeon E7 */
632 tlb_flushall_shift = 6;
633 break;
634 case 0x62a: /* SandyBridge */
635 case 0x62d: /* SandyBridge, "Romely-EP" */
636 tlb_flushall_shift = 5;
637 break;
638 case 0x63a: /* Ivybridge */
639 tlb_flushall_shift = 1;
640 break;
641 default:
642 tlb_flushall_shift = 6;
643 }
644}
645
646static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c)
647{
648 int i, j, n;
649 unsigned int regs[4];
650 unsigned char *desc = (unsigned char *)regs;
651 /* Number of times to iterate */
652 n = cpuid_eax(2) & 0xFF;
653
654 for (i = 0 ; i < n ; i++) {
655 cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
656
657 /* If bit 31 is set, this is an unknown format */
658 for (j = 0 ; j < 3 ; j++)
659 if (regs[j] & (1 << 31))
660 regs[j] = 0;
661
662 /* Byte 0 is level count, not a descriptor */
663 for (j = 1 ; j < 16 ; j++)
664 intel_tlb_lookup(desc[j]);
665 }
666 intel_tlb_flushall_shift_set(c);
667}
668
494static const struct cpu_dev __cpuinitconst intel_cpu_dev = { 669static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
495 .c_vendor = "Intel", 670 .c_vendor = "Intel",
496 .c_ident = { "GenuineIntel" }, 671 .c_ident = { "GenuineIntel" },
@@ -546,6 +721,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
546 }, 721 },
547 .c_size_cache = intel_size_cache, 722 .c_size_cache = intel_size_cache,
548#endif 723#endif
724 .c_detect_tlb = intel_detect_tlb,
549 .c_early_init = early_init_intel, 725 .c_early_init = early_init_intel,
550 .c_init = init_intel, 726 .c_init = init_intel,
551 .c_x86_vendor = X86_VENDOR_INTEL, 727 .c_x86_vendor = X86_VENDOR_INTEL,
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 111f6bbd8b38..69babd8c834f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1048,24 +1048,6 @@ apicinterrupt LOCAL_TIMER_VECTOR \
1048apicinterrupt X86_PLATFORM_IPI_VECTOR \ 1048apicinterrupt X86_PLATFORM_IPI_VECTOR \
1049 x86_platform_ipi smp_x86_platform_ipi 1049 x86_platform_ipi smp_x86_platform_ipi
1050 1050
1051#ifdef CONFIG_SMP
1052 ALIGN
1053 INTR_FRAME
1054.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
1055 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
1056.if NUM_INVALIDATE_TLB_VECTORS > \idx
1057ENTRY(invalidate_interrupt\idx)
1058 pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
1059 jmp .Lcommon_invalidate_interrupt0
1060 CFI_ADJUST_CFA_OFFSET -8
1061END(invalidate_interrupt\idx)
1062.endif
1063.endr
1064 CFI_ENDPROC
1065apicinterrupt INVALIDATE_TLB_VECTOR_START, \
1066 invalidate_interrupt0, smp_invalidate_interrupt
1067#endif
1068
1069apicinterrupt THRESHOLD_APIC_VECTOR \ 1051apicinterrupt THRESHOLD_APIC_VECTOR \
1070 threshold_interrupt smp_threshold_interrupt 1052 threshold_interrupt smp_threshold_interrupt
1071apicinterrupt THERMAL_APIC_VECTOR \ 1053apicinterrupt THERMAL_APIC_VECTOR \
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 252981afd6c4..6e03b0d69138 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -171,79 +171,6 @@ static void __init smp_intr_init(void)
171 */ 171 */
172 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 172 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
173 173
174 /* IPIs for invalidation */
175#define ALLOC_INVTLB_VEC(NR) \
176 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
177 invalidate_interrupt##NR)
178
179 switch (NUM_INVALIDATE_TLB_VECTORS) {
180 default:
181 ALLOC_INVTLB_VEC(31);
182 case 31:
183 ALLOC_INVTLB_VEC(30);
184 case 30:
185 ALLOC_INVTLB_VEC(29);
186 case 29:
187 ALLOC_INVTLB_VEC(28);
188 case 28:
189 ALLOC_INVTLB_VEC(27);
190 case 27:
191 ALLOC_INVTLB_VEC(26);
192 case 26:
193 ALLOC_INVTLB_VEC(25);
194 case 25:
195 ALLOC_INVTLB_VEC(24);
196 case 24:
197 ALLOC_INVTLB_VEC(23);
198 case 23:
199 ALLOC_INVTLB_VEC(22);
200 case 22:
201 ALLOC_INVTLB_VEC(21);
202 case 21:
203 ALLOC_INVTLB_VEC(20);
204 case 20:
205 ALLOC_INVTLB_VEC(19);
206 case 19:
207 ALLOC_INVTLB_VEC(18);
208 case 18:
209 ALLOC_INVTLB_VEC(17);
210 case 17:
211 ALLOC_INVTLB_VEC(16);
212 case 16:
213 ALLOC_INVTLB_VEC(15);
214 case 15:
215 ALLOC_INVTLB_VEC(14);
216 case 14:
217 ALLOC_INVTLB_VEC(13);
218 case 13:
219 ALLOC_INVTLB_VEC(12);
220 case 12:
221 ALLOC_INVTLB_VEC(11);
222 case 11:
223 ALLOC_INVTLB_VEC(10);
224 case 10:
225 ALLOC_INVTLB_VEC(9);
226 case 9:
227 ALLOC_INVTLB_VEC(8);
228 case 8:
229 ALLOC_INVTLB_VEC(7);
230 case 7:
231 ALLOC_INVTLB_VEC(6);
232 case 6:
233 ALLOC_INVTLB_VEC(5);
234 case 5:
235 ALLOC_INVTLB_VEC(4);
236 case 4:
237 ALLOC_INVTLB_VEC(3);
238 case 3:
239 ALLOC_INVTLB_VEC(2);
240 case 2:
241 ALLOC_INVTLB_VEC(1);
242 case 1:
243 ALLOC_INVTLB_VEC(0);
244 break;
245 }
246
247 /* IPI for generic function call */ 174 /* IPI for generic function call */
248 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 175 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
249 176
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5a98aa272184..5cdff0357746 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -21,7 +21,7 @@
21#include <asm/cpu.h> 21#include <asm/cpu.h>
22#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
23 23
24DEFINE_PER_CPU(int, cpu_number); 24DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
25EXPORT_PER_CPU_SYMBOL(cpu_number); 25EXPORT_PER_CPU_SYMBOL(cpu_number);
26 26
27#ifdef CONFIG_X86_64 27#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c1a310fb8309..7c5a8c314c02 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -106,17 +106,17 @@ int smp_num_siblings = 1;
106EXPORT_SYMBOL(smp_num_siblings); 106EXPORT_SYMBOL(smp_num_siblings);
107 107
108/* Last level cache ID of each logical CPU */ 108/* Last level cache ID of each logical CPU */
109DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; 109DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
110 110
111/* representing HT siblings of each logical CPU */ 111/* representing HT siblings of each logical CPU */
112DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); 112DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
113EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); 113EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
114 114
115/* representing HT and core siblings of each logical CPU */ 115/* representing HT and core siblings of each logical CPU */
116DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); 116DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
117EXPORT_PER_CPU_SYMBOL(cpu_core_map); 117EXPORT_PER_CPU_SYMBOL(cpu_core_map);
118 118
119DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); 119DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
120 120
121/* Per CPU bogomips and other parameters */ 121/* Per CPU bogomips and other parameters */
122DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 122DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5e57e113b72c..613cd83e8c0c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
12#include <asm/cache.h> 12#include <asm/cache.h>
13#include <asm/apic.h> 13#include <asm/apic.h>
14#include <asm/uv/uv.h> 14#include <asm/uv/uv.h>
15#include <linux/debugfs.h>
15 16
16DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) 17DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
17 = { &init_mm, 0, }; 18 = { &init_mm, 0, };
@@ -27,33 +28,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
27 * 28 *
28 * More scalable flush, from Andi Kleen 29 * More scalable flush, from Andi Kleen
29 * 30 *
30 * To avoid global state use 8 different call vectors. 31 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
31 * Each CPU uses a specific vector to trigger flushes on other
32 * CPUs. Depending on the received vector the target CPUs look into
33 * the right array slot for the flush data.
34 *
35 * With more than 8 CPUs they are hashed to the 8 available
36 * vectors. The limited global vector space forces us to this right now.
37 * In future when interrupts are split into per CPU domains this could be
38 * fixed, at the cost of triggering multiple IPIs in some cases.
39 */ 32 */
40 33
41union smp_flush_state { 34struct flush_tlb_info {
42 struct { 35 struct mm_struct *flush_mm;
43 struct mm_struct *flush_mm; 36 unsigned long flush_start;
44 unsigned long flush_va; 37 unsigned long flush_end;
45 raw_spinlock_t tlbstate_lock; 38};
46 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
47 };
48 char pad[INTERNODE_CACHE_BYTES];
49} ____cacheline_internodealigned_in_smp;
50
51/* State is put into the per CPU data section, but padded
52 to a full cache line because other CPUs can access it and we don't
53 want false sharing in the per cpu data segment. */
54static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
55
56static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
57 39
58/* 40/*
59 * We cannot call mmdrop() because we are in interrupt context, 41 * We cannot call mmdrop() because we are in interrupt context,
@@ -72,28 +54,25 @@ void leave_mm(int cpu)
72EXPORT_SYMBOL_GPL(leave_mm); 54EXPORT_SYMBOL_GPL(leave_mm);
73 55
74/* 56/*
75 *
76 * The flush IPI assumes that a thread switch happens in this order: 57 * The flush IPI assumes that a thread switch happens in this order:
77 * [cpu0: the cpu that switches] 58 * [cpu0: the cpu that switches]
78 * 1) switch_mm() either 1a) or 1b) 59 * 1) switch_mm() either 1a) or 1b)
79 * 1a) thread switch to a different mm 60 * 1a) thread switch to a different mm
80 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); 61 * 1a1) set cpu_tlbstate to TLBSTATE_OK
81 * Stop ipi delivery for the old mm. This is not synchronized with 62 * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
82 * the other cpus, but smp_invalidate_interrupt ignore flush ipis 63 * if cpu0 was in lazy tlb mode.
83 * for the wrong mm, and in the worst case we perform a superfluous 64 * 1a2) update cpu active_mm
84 * tlb flush.
85 * 1a2) set cpu mmu_state to TLBSTATE_OK
86 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
87 * was in lazy tlb mode.
88 * 1a3) update cpu active_mm
89 * Now cpu0 accepts tlb flushes for the new mm. 65 * Now cpu0 accepts tlb flushes for the new mm.
90 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); 66 * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
91 * Now the other cpus will send tlb flush ipis. 67 * Now the other cpus will send tlb flush ipis.
92 * 1a4) change cr3. 68 * 1a4) change cr3.
69 * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
70 * Stop ipi delivery for the old mm. This is not synchronized with
71 * the other cpus, but flush_tlb_func ignore flush ipis for the wrong
72 * mm, and in the worst case we perform a superfluous tlb flush.
93 * 1b) thread switch without mm change 73 * 1b) thread switch without mm change
94 * cpu active_mm is correct, cpu0 already handles 74 * cpu active_mm is correct, cpu0 already handles flush ipis.
95 * flush ipis. 75 * 1b1) set cpu_tlbstate to TLBSTATE_OK
96 * 1b1) set cpu mmu_state to TLBSTATE_OK
97 * 1b2) test_and_set the cpu bit in cpu_vm_mask. 76 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
98 * Atomically set the bit [other cpus will start sending flush ipis], 77 * Atomically set the bit [other cpus will start sending flush ipis],
99 * and test the bit. 78 * and test the bit.
@@ -106,174 +85,62 @@ EXPORT_SYMBOL_GPL(leave_mm);
106 * runs in kernel space, the cpu could load tlb entries for user space 85 * runs in kernel space, the cpu could load tlb entries for user space
107 * pages. 86 * pages.
108 * 87 *
109 * The good news is that cpu mmu_state is local to each cpu, no 88 * The good news is that cpu_tlbstate is local to each cpu, no
110 * write/read ordering problems. 89 * write/read ordering problems.
111 */ 90 */
112 91
113/* 92/*
114 * TLB flush IPI: 93 * TLB flush funcation:
115 *
116 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. 94 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
117 * 2) Leave the mm if we are in the lazy tlb mode. 95 * 2) Leave the mm if we are in the lazy tlb mode.
118 *
119 * Interrupts are disabled.
120 */
121
122/*
123 * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop
124 * but still used for documentation purpose but the usage is slightly
125 * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
126 * entry calls in with the first parameter in %eax. Maybe define
127 * intrlinkage?
128 */ 96 */
129#ifdef CONFIG_X86_64 97static void flush_tlb_func(void *info)
130asmlinkage
131#endif
132void smp_invalidate_interrupt(struct pt_regs *regs)
133{ 98{
134 unsigned int cpu; 99 struct flush_tlb_info *f = info;
135 unsigned int sender;
136 union smp_flush_state *f;
137
138 cpu = smp_processor_id();
139 /*
140 * orig_rax contains the negated interrupt vector.
141 * Use that to determine where the sender put the data.
142 */
143 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
144 f = &flush_state[sender];
145
146 if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
147 goto out;
148 /*
149 * This was a BUG() but until someone can quote me the
150 * line from the intel manual that guarantees an IPI to
151 * multiple CPUs is retried _only_ on the erroring CPUs
152 * its staying as a return
153 *
154 * BUG();
155 */
156
157 if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) {
158 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
159 if (f->flush_va == TLB_FLUSH_ALL)
160 local_flush_tlb();
161 else
162 __flush_tlb_one(f->flush_va);
163 } else
164 leave_mm(cpu);
165 }
166out:
167 ack_APIC_irq();
168 smp_mb__before_clear_bit();
169 cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
170 smp_mb__after_clear_bit();
171 inc_irq_stat(irq_tlb_count);
172}
173 100
174static void flush_tlb_others_ipi(const struct cpumask *cpumask, 101 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
175 struct mm_struct *mm, unsigned long va) 102 return;
176{ 103
177 unsigned int sender; 104 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
178 union smp_flush_state *f; 105 if (f->flush_end == TLB_FLUSH_ALL || !cpu_has_invlpg)
179 106 local_flush_tlb();
180 /* Caller has disabled preemption */ 107 else if (!f->flush_end)
181 sender = this_cpu_read(tlb_vector_offset); 108 __flush_tlb_single(f->flush_start);
182 f = &flush_state[sender]; 109 else {
183 110 unsigned long addr;
184 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) 111 addr = f->flush_start;
185 raw_spin_lock(&f->tlbstate_lock); 112 while (addr < f->flush_end) {
186 113 __flush_tlb_single(addr);
187 f->flush_mm = mm; 114 addr += PAGE_SIZE;
188 f->flush_va = va; 115 }
189 if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { 116 }
190 /* 117 } else
191 * We have to send the IPI only to 118 leave_mm(smp_processor_id());
192 * CPUs affected.
193 */
194 apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
195 INVALIDATE_TLB_VECTOR_START + sender);
196
197 while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
198 cpu_relax();
199 }
200 119
201 f->flush_mm = NULL;
202 f->flush_va = 0;
203 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
204 raw_spin_unlock(&f->tlbstate_lock);
205} 120}
206 121
207void native_flush_tlb_others(const struct cpumask *cpumask, 122void native_flush_tlb_others(const struct cpumask *cpumask,
208 struct mm_struct *mm, unsigned long va) 123 struct mm_struct *mm, unsigned long start,
124 unsigned long end)
209{ 125{
126 struct flush_tlb_info info;
127 info.flush_mm = mm;
128 info.flush_start = start;
129 info.flush_end = end;
130
210 if (is_uv_system()) { 131 if (is_uv_system()) {
211 unsigned int cpu; 132 unsigned int cpu;
212 133
213 cpu = smp_processor_id(); 134 cpu = smp_processor_id();
214 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); 135 cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
215 if (cpumask) 136 if (cpumask)
216 flush_tlb_others_ipi(cpumask, mm, va); 137 smp_call_function_many(cpumask, flush_tlb_func,
138 &info, 1);
217 return; 139 return;
218 } 140 }
219 flush_tlb_others_ipi(cpumask, mm, va); 141 smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
220} 142}
221 143
222static void __cpuinit calculate_tlb_offset(void)
223{
224 int cpu, node, nr_node_vecs, idx = 0;
225 /*
226 * we are changing tlb_vector_offset for each CPU in runtime, but this
227 * will not cause inconsistency, as the write is atomic under X86. we
228 * might see more lock contentions in a short time, but after all CPU's
229 * tlb_vector_offset are changed, everything should go normal
230 *
231 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
232 * waste some vectors.
233 **/
234 if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
235 nr_node_vecs = 1;
236 else
237 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
238
239 for_each_online_node(node) {
240 int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
241 nr_node_vecs;
242 int cpu_offset = 0;
243 for_each_cpu(cpu, cpumask_of_node(node)) {
244 per_cpu(tlb_vector_offset, cpu) = node_offset +
245 cpu_offset;
246 cpu_offset++;
247 cpu_offset = cpu_offset % nr_node_vecs;
248 }
249 idx++;
250 }
251}
252
253static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
254 unsigned long action, void *hcpu)
255{
256 switch (action & 0xf) {
257 case CPU_ONLINE:
258 case CPU_DEAD:
259 calculate_tlb_offset();
260 }
261 return NOTIFY_OK;
262}
263
264static int __cpuinit init_smp_flush(void)
265{
266 int i;
267
268 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
269 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
270
271 calculate_tlb_offset();
272 hotcpu_notifier(tlb_cpuhp_notify, 0);
273 return 0;
274}
275core_initcall(init_smp_flush);
276
277void flush_tlb_current_task(void) 144void flush_tlb_current_task(void)
278{ 145{
279 struct mm_struct *mm = current->mm; 146 struct mm_struct *mm = current->mm;
@@ -282,27 +149,91 @@ void flush_tlb_current_task(void)
282 149
283 local_flush_tlb(); 150 local_flush_tlb();
284 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 151 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
285 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); 152 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
286 preempt_enable(); 153 preempt_enable();
287} 154}
288 155
289void flush_tlb_mm(struct mm_struct *mm) 156/*
157 * It can find out the THP large page, or
158 * HUGETLB page in tlb_flush when THP disabled
159 */
160static inline unsigned long has_large_page(struct mm_struct *mm,
161 unsigned long start, unsigned long end)
162{
163 pgd_t *pgd;
164 pud_t *pud;
165 pmd_t *pmd;
166 unsigned long addr = ALIGN(start, HPAGE_SIZE);
167 for (; addr < end; addr += HPAGE_SIZE) {
168 pgd = pgd_offset(mm, addr);
169 if (likely(!pgd_none(*pgd))) {
170 pud = pud_offset(pgd, addr);
171 if (likely(!pud_none(*pud))) {
172 pmd = pmd_offset(pud, addr);
173 if (likely(!pmd_none(*pmd)))
174 if (pmd_large(*pmd))
175 return addr;
176 }
177 }
178 }
179 return 0;
180}
181
182void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
183 unsigned long end, unsigned long vmflag)
290{ 184{
185 unsigned long addr;
186 unsigned act_entries, tlb_entries = 0;
187
291 preempt_disable(); 188 preempt_disable();
189 if (current->active_mm != mm)
190 goto flush_all;
292 191
293 if (current->active_mm == mm) { 192 if (!current->mm) {
294 if (current->mm) 193 leave_mm(smp_processor_id());
194 goto flush_all;
195 }
196
197 if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
198 || vmflag == VM_HUGETLB) {
199 local_flush_tlb();
200 goto flush_all;
201 }
202
203 /* In modern CPU, last level tlb used for both data/ins */
204 if (vmflag & VM_EXEC)
205 tlb_entries = tlb_lli_4k[ENTRIES];
206 else
207 tlb_entries = tlb_lld_4k[ENTRIES];
208 /* Assume all of TLB entries was occupied by this task */
209 act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
210
211 /* tlb_flushall_shift is on balance point, details in commit log */
212 if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
213 local_flush_tlb();
214 else {
215 if (has_large_page(mm, start, end)) {
295 local_flush_tlb(); 216 local_flush_tlb();
296 else 217 goto flush_all;
297 leave_mm(smp_processor_id()); 218 }
219 /* flush range by one by one 'invlpg' */
220 for (addr = start; addr < end; addr += PAGE_SIZE)
221 __flush_tlb_single(addr);
222
223 if (cpumask_any_but(mm_cpumask(mm),
224 smp_processor_id()) < nr_cpu_ids)
225 flush_tlb_others(mm_cpumask(mm), mm, start, end);
226 preempt_enable();
227 return;
298 } 228 }
299 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
300 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
301 229
230flush_all:
231 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
232 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
302 preempt_enable(); 233 preempt_enable();
303} 234}
304 235
305void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) 236void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
306{ 237{
307 struct mm_struct *mm = vma->vm_mm; 238 struct mm_struct *mm = vma->vm_mm;
308 239
@@ -310,13 +241,13 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
310 241
311 if (current->active_mm == mm) { 242 if (current->active_mm == mm) {
312 if (current->mm) 243 if (current->mm)
313 __flush_tlb_one(va); 244 __flush_tlb_one(start);
314 else 245 else
315 leave_mm(smp_processor_id()); 246 leave_mm(smp_processor_id());
316 } 247 }
317 248
318 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 249 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
319 flush_tlb_others(mm_cpumask(mm), mm, va); 250 flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
320 251
321 preempt_enable(); 252 preempt_enable();
322} 253}
@@ -332,3 +263,83 @@ void flush_tlb_all(void)
332{ 263{
333 on_each_cpu(do_flush_tlb_all, NULL, 1); 264 on_each_cpu(do_flush_tlb_all, NULL, 1);
334} 265}
266
267static void do_kernel_range_flush(void *info)
268{
269 struct flush_tlb_info *f = info;
270 unsigned long addr;
271
272 /* flush range by one by one 'invlpg' */
273 for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
274 __flush_tlb_single(addr);
275}
276
277void flush_tlb_kernel_range(unsigned long start, unsigned long end)
278{
279 unsigned act_entries;
280 struct flush_tlb_info info;
281
282 /* In modern CPU, last level tlb used for both data/ins */
283 act_entries = tlb_lld_4k[ENTRIES];
284
285 /* Balance as user space task's flush, a bit conservative */
286 if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
287 (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
288
289 on_each_cpu(do_flush_tlb_all, NULL, 1);
290 else {
291 info.flush_start = start;
292 info.flush_end = end;
293 on_each_cpu(do_kernel_range_flush, &info, 1);
294 }
295}
296
297#ifdef CONFIG_DEBUG_TLBFLUSH
298static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
299 size_t count, loff_t *ppos)
300{
301 char buf[32];
302 unsigned int len;
303
304 len = sprintf(buf, "%hd\n", tlb_flushall_shift);
305 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
306}
307
308static ssize_t tlbflush_write_file(struct file *file,
309 const char __user *user_buf, size_t count, loff_t *ppos)
310{
311 char buf[32];
312 ssize_t len;
313 s8 shift;
314
315 len = min(count, sizeof(buf) - 1);
316 if (copy_from_user(buf, user_buf, len))
317 return -EFAULT;
318
319 buf[len] = '\0';
320 if (kstrtos8(buf, 0, &shift))
321 return -EINVAL;
322
323 if (shift > 64)
324 return -EINVAL;
325
326 tlb_flushall_shift = shift;
327 return count;
328}
329
330static const struct file_operations fops_tlbflush = {
331 .read = tlbflush_read_file,
332 .write = tlbflush_write_file,
333 .llseek = default_llseek,
334};
335
336static int __cpuinit create_tlb_flushall_shift(void)
337{
338 if (cpu_has_invlpg) {
339 debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
340 arch_debugfs_dir, NULL, &fops_tlbflush);
341 }
342 return 0;
343}
344late_initcall(create_tlb_flushall_shift);
345#endif
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 71b5d5a07d7b..b8b3a37c80cd 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1055,8 +1055,8 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
1055 * done. The returned pointer is valid till preemption is re-enabled. 1055 * done. The returned pointer is valid till preemption is re-enabled.
1056 */ 1056 */
1057const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 1057const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1058 struct mm_struct *mm, unsigned long va, 1058 struct mm_struct *mm, unsigned long start,
1059 unsigned int cpu) 1059 unsigned end, unsigned int cpu)
1060{ 1060{
1061 int locals = 0; 1061 int locals = 0;
1062 int remotes = 0; 1062 int remotes = 0;
@@ -1113,7 +1113,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1113 1113
1114 record_send_statistics(stat, locals, hubs, remotes, bau_desc); 1114 record_send_statistics(stat, locals, hubs, remotes, bau_desc);
1115 1115
1116 bau_desc->payload.address = va; 1116 bau_desc->payload.address = start;
1117 bau_desc->payload.sending_cpu = cpu; 1117 bau_desc->payload.sending_cpu = cpu;
1118 /* 1118 /*
1119 * uv_flush_send_and_wait returns 0 if all cpu's were messaged, 1119 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 27336dfcda8e..b65a76133f4f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1256,7 +1256,8 @@ static void xen_flush_tlb_single(unsigned long addr)
1256} 1256}
1257 1257
1258static void xen_flush_tlb_others(const struct cpumask *cpus, 1258static void xen_flush_tlb_others(const struct cpumask *cpus,
1259 struct mm_struct *mm, unsigned long va) 1259 struct mm_struct *mm, unsigned long start,
1260 unsigned long end)
1260{ 1261{
1261 struct { 1262 struct {
1262 struct mmuext_op op; 1263 struct mmuext_op op;
@@ -1268,7 +1269,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1268 } *args; 1269 } *args;
1269 struct multicall_space mcs; 1270 struct multicall_space mcs;
1270 1271
1271 trace_xen_mmu_flush_tlb_others(cpus, mm, va); 1272 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1272 1273
1273 if (cpumask_empty(cpus)) 1274 if (cpumask_empty(cpus))
1274 return; /* nothing to do */ 1275 return; /* nothing to do */
@@ -1281,11 +1282,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1281 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); 1282 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1282 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1283 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1283 1284
1284 if (va == TLB_FLUSH_ALL) { 1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1286 if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1286 } else {
1287 args->op.cmd = MMUEXT_INVLPG_MULTI; 1287 args->op.cmd = MMUEXT_INVLPG_MULTI;
1288 args->op.arg1.linear_addr = va; 1288 args->op.arg1.linear_addr = start;
1289 } 1289 }
1290 1290
1291 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 1291 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index f96a5b58a975..ed6642ad03e0 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -86,6 +86,8 @@ struct mmu_gather {
86#ifdef CONFIG_HAVE_RCU_TABLE_FREE 86#ifdef CONFIG_HAVE_RCU_TABLE_FREE
87 struct mmu_table_batch *batch; 87 struct mmu_table_batch *batch;
88#endif 88#endif
89 unsigned long start;
90 unsigned long end;
89 unsigned int need_flush : 1, /* Did free PTEs */ 91 unsigned int need_flush : 1, /* Did free PTEs */
90 fast_mode : 1; /* No batching */ 92 fast_mode : 1; /* No batching */
91 93
@@ -113,7 +115,8 @@ static inline int tlb_fast_mode(struct mmu_gather *tlb)
113 115
114void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm); 116void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm);
115void tlb_flush_mmu(struct mmu_gather *tlb); 117void tlb_flush_mmu(struct mmu_gather *tlb);
116void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end); 118void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start,
119 unsigned long end);
117int __tlb_remove_page(struct mmu_gather *tlb, struct page *page); 120int __tlb_remove_page(struct mmu_gather *tlb, struct page *page);
118 121
119/* tlb_remove_page 122/* tlb_remove_page
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h
index 92f1a796829e..15ba03bdd7c6 100644
--- a/include/trace/events/xen.h
+++ b/include/trace/events/xen.h
@@ -397,18 +397,20 @@ TRACE_EVENT(xen_mmu_flush_tlb_single,
397 397
398TRACE_EVENT(xen_mmu_flush_tlb_others, 398TRACE_EVENT(xen_mmu_flush_tlb_others,
399 TP_PROTO(const struct cpumask *cpus, struct mm_struct *mm, 399 TP_PROTO(const struct cpumask *cpus, struct mm_struct *mm,
400 unsigned long addr), 400 unsigned long addr, unsigned long end),
401 TP_ARGS(cpus, mm, addr), 401 TP_ARGS(cpus, mm, addr, end),
402 TP_STRUCT__entry( 402 TP_STRUCT__entry(
403 __field(unsigned, ncpus) 403 __field(unsigned, ncpus)
404 __field(struct mm_struct *, mm) 404 __field(struct mm_struct *, mm)
405 __field(unsigned long, addr) 405 __field(unsigned long, addr)
406 __field(unsigned long, end)
406 ), 407 ),
407 TP_fast_assign(__entry->ncpus = cpumask_weight(cpus); 408 TP_fast_assign(__entry->ncpus = cpumask_weight(cpus);
408 __entry->mm = mm; 409 __entry->mm = mm;
409 __entry->addr = addr), 410 __entry->addr = addr,
410 TP_printk("ncpus %d mm %p addr %lx", 411 __entry->end = end),
411 __entry->ncpus, __entry->mm, __entry->addr) 412 TP_printk("ncpus %d mm %p addr %lx, end %lx",
413 __entry->ncpus, __entry->mm, __entry->addr, __entry->end)
412 ); 414 );
413 415
414TRACE_EVENT(xen_mmu_write_cr3, 416TRACE_EVENT(xen_mmu_write_cr3,
diff --git a/mm/memory.c b/mm/memory.c
index 2466d1250231..91f69459d3e8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -206,6 +206,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
206 tlb->mm = mm; 206 tlb->mm = mm;
207 207
208 tlb->fullmm = fullmm; 208 tlb->fullmm = fullmm;
209 tlb->start = -1UL;
210 tlb->end = 0;
209 tlb->need_flush = 0; 211 tlb->need_flush = 0;
210 tlb->fast_mode = (num_possible_cpus() == 1); 212 tlb->fast_mode = (num_possible_cpus() == 1);
211 tlb->local.next = NULL; 213 tlb->local.next = NULL;
@@ -248,6 +250,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
248{ 250{
249 struct mmu_gather_batch *batch, *next; 251 struct mmu_gather_batch *batch, *next;
250 252
253 tlb->start = start;
254 tlb->end = end;
251 tlb_flush_mmu(tlb); 255 tlb_flush_mmu(tlb);
252 256
253 /* keep the page table cache within bounds */ 257 /* keep the page table cache within bounds */
@@ -1204,6 +1208,11 @@ again:
1204 */ 1208 */
1205 if (force_flush) { 1209 if (force_flush) {
1206 force_flush = 0; 1210 force_flush = 0;
1211
1212#ifdef HAVE_GENERIC_MMU_GATHER
1213 tlb->start = addr;
1214 tlb->end = end;
1215#endif
1207 tlb_flush_mmu(tlb); 1216 tlb_flush_mmu(tlb);
1208 if (addr != end) 1217 if (addr != end)
1209 goto again; 1218 goto again;