Diffstat (limited to 'arch/x86/mm/tlb.c')
-rw-r--r--  arch/x86/mm/tlb.c | 401
1 file changed, 206 insertions(+), 195 deletions(-)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5e57e113b72c..613cd83e8c0c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
 #include <asm/cache.h>
 #include <asm/apic.h>
 #include <asm/uv/uv.h>
+#include <linux/debugfs.h>
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
 			= { &init_mm, 0, };
@@ -27,33 +28,14 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
  *
  * More scalable flush, from Andi Kleen
  *
- * To avoid global state use 8 different call vectors.
- * Each CPU uses a specific vector to trigger flushes on other
- * CPUs. Depending on the received vector the target CPUs look into
- * the right array slot for the flush data.
- *
- * With more than 8 CPUs they are hashed to the 8 available
- * vectors. The limited global vector space forces us to this right now.
- * In future when interrupts are split into per CPU domains this could be
- * fixed, at the cost of triggering multiple IPIs in some cases.
+ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
-union smp_flush_state {
-	struct {
-		struct mm_struct *flush_mm;
-		unsigned long flush_va;
-		raw_spinlock_t tlbstate_lock;
-		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
-	};
-	char pad[INTERNODE_CACHE_BYTES];
-} ____cacheline_internodealigned_in_smp;
-
-/* State is put into the per CPU data section, but padded
-   to a full cache line because other CPUs can access it and we don't
-   want false sharing in the per cpu data segment. */
-static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
-
-static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
 
 /*
  * We cannot call mmdrop() because we are in interrupt context,
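
Editorially, the hunk above replaces the dedicated INVALIDATE_TLB_VECTOR IPIs and their padded, spinlock-protected per-vector state with a small flush_tlb_info descriptor that is handed to the generic SMP function-call IPI machinery. A minimal sketch of that dispatch pattern follows; it is simplified from the hunks further down, and the helper name flush_range_on is invented here for illustration only:

	/* Sketch: pass flush parameters through the generic function-call IPI.
	 * Simplified from the real hunks below; not the kernel's exact code. */
	struct flush_tlb_info {
		struct mm_struct *flush_mm;
		unsigned long flush_start;
		unsigned long flush_end;
	};

	static void flush_tlb_func(void *info);	/* runs on each target CPU */

	static void flush_range_on(const struct cpumask *cpus, struct mm_struct *mm,
				   unsigned long start, unsigned long end)
	{
		struct flush_tlb_info info = {
			.flush_mm    = mm,
			.flush_start = start,
			.flush_end   = end,
		};

		/*
		 * wait=1: info lives on this stack, so we must not return
		 * before every target CPU has finished reading it.
		 */
		smp_call_function_many(cpus, flush_tlb_func, &info, 1);
	}

Because the callback and its argument go through smp_call_function_many(), no per-vector bitmap, lock or padding is needed any more; the waiting is done by the generic code.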
@@ -72,28 +54,25 @@ void leave_mm(int cpu)
 EXPORT_SYMBOL_GPL(leave_mm);
 
 /*
- *
  * The flush IPI assumes that a thread switch happens in this order:
  * [cpu0: the cpu that switches]
  * 1) switch_mm() either 1a) or 1b)
  * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- *	Stop ipi delivery for the old mm. This is not synchronized with
- *	the other cpus, but smp_invalidate_interrupt ignore flush ipis
- *	for the wrong mm, and in the worst case we perform a superfluous
- *	tlb flush.
- * 1a2) set cpu mmu_state to TLBSTATE_OK
- *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- *	was in lazy tlb mode.
- * 1a3) update cpu active_mm
+ * 1a1) set cpu_tlbstate to TLBSTATE_OK
+ *	Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
+ *	if cpu0 was in lazy tlb mode.
+ * 1a2) update cpu active_mm
  *	Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
  *	Now the other cpus will send tlb flush ipis.
  * 1a4) change cr3.
+ * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ *	Stop ipi delivery for the old mm. This is not synchronized with
+ *	the other cpus, but flush_tlb_func ignore flush ipis for the wrong
+ *	mm, and in the worst case we perform a superfluous tlb flush.
 * 1b) thread switch without mm change
- *	cpu active_mm is correct, cpu0 already handles
- *	flush ipis.
- * 1b1) set cpu mmu_state to TLBSTATE_OK
+ *	cpu active_mm is correct, cpu0 already handles flush ipis.
+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
 *	Atomically set the bit [other cpus will start sending flush ipis],
 *	and test the bit.
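
To make the numbered 1a) steps in the comment above concrete, here is a simplified sketch of how they would appear in switch_mm(); this is an illustration of the ordering only, not the actual arch/x86 mmu_context implementation:

	/* Sketch of the 1a) ordering, illustrative only */
	static void sketch_switch_mm(struct mm_struct *prev, struct mm_struct *next,
				     unsigned int cpu)
	{
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);	/* 1a1 */
		this_cpu_write(cpu_tlbstate.active_mm, next);		/* 1a2 */
		cpumask_set_cpu(cpu, mm_cpumask(next));			/* 1a3 */
		load_cr3(next->pgd);					/* 1a4 */
		cpumask_clear_cpu(cpu, mm_cpumask(prev));		/* 1a5 */
	}

The key point carried over from the old comment is that clearing the old mm's cpumask (1a5) is not synchronized with remote senders; flush_tlb_func() tolerates stray IPIs by checking the mm first.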
@@ -106,174 +85,62 @@ EXPORT_SYMBOL_GPL(leave_mm);
  * runs in kernel space, the cpu could load tlb entries for user space
  * pages.
  *
- * The good news is that cpu mmu_state is local to each cpu, no
+ * The good news is that cpu_tlbstate is local to each cpu, no
  * write/read ordering problems.
  */
 
 /*
- * TLB flush IPI:
- *
+ * TLB flush funcation:
  * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 * 2) Leave the mm if we are in the lazy tlb mode.
- *
- * Interrupts are disabled.
- */
-
-/*
- * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop
- * but still used for documentation purpose but the usage is slightly
- * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
- * entry calls in with the first parameter in %eax. Maybe define
- * intrlinkage?
 */
-#ifdef CONFIG_X86_64
-asmlinkage
-#endif
-void smp_invalidate_interrupt(struct pt_regs *regs)
+static void flush_tlb_func(void *info)
 {
-	unsigned int cpu;
-	unsigned int sender;
-	union smp_flush_state *f;
-
-	cpu = smp_processor_id();
-	/*
-	 * orig_rax contains the negated interrupt vector.
-	 * Use that to determine where the sender put the data.
-	 */
-	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
-	f = &flush_state[sender];
-
-	if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
-		goto out;
-	/*
-	 * This was a BUG() but until someone can quote me the
-	 * line from the intel manual that guarantees an IPI to
-	 * multiple CPUs is retried _only_ on the erroring CPUs
-	 * its staying as a return
-	 *
-	 * BUG();
-	 */
-
-	if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) {
-		if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
-			if (f->flush_va == TLB_FLUSH_ALL)
-				local_flush_tlb();
-			else
-				__flush_tlb_one(f->flush_va);
-		} else
-			leave_mm(cpu);
-	}
-out:
-	ack_APIC_irq();
-	smp_mb__before_clear_bit();
-	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
-	smp_mb__after_clear_bit();
-	inc_irq_stat(irq_tlb_count);
-}
+	struct flush_tlb_info *f = info;
 
-static void flush_tlb_others_ipi(const struct cpumask *cpumask,
-				 struct mm_struct *mm, unsigned long va)
-{
-	unsigned int sender;
-	union smp_flush_state *f;
-
-	/* Caller has disabled preemption */
-	sender = this_cpu_read(tlb_vector_offset);
-	f = &flush_state[sender];
-
-	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
-		raw_spin_lock(&f->tlbstate_lock);
-
-	f->flush_mm = mm;
-	f->flush_va = va;
-	if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
-		/*
-		 * We have to send the IPI only to
-		 * CPUs affected.
-		 */
-		apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
-			      INVALIDATE_TLB_VECTOR_START + sender);
-
-		while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
-			cpu_relax();
-	}
+	if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+		return;
+
+	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+		if (f->flush_end == TLB_FLUSH_ALL || !cpu_has_invlpg)
+			local_flush_tlb();
+		else if (!f->flush_end)
+			__flush_tlb_single(f->flush_start);
+		else {
+			unsigned long addr;
+			addr = f->flush_start;
+			while (addr < f->flush_end) {
+				__flush_tlb_single(addr);
+				addr += PAGE_SIZE;
+			}
+		}
+	} else
+		leave_mm(smp_processor_id());
 
-	f->flush_mm = NULL;
-	f->flush_va = 0;
-	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
-		raw_spin_unlock(&f->tlbstate_lock);
 }
 
 void native_flush_tlb_others(const struct cpumask *cpumask,
-				 struct mm_struct *mm, unsigned long va)
+				 struct mm_struct *mm, unsigned long start,
+				 unsigned long end)
 {
+	struct flush_tlb_info info;
+	info.flush_mm = mm;
+	info.flush_start = start;
+	info.flush_end = end;
+
 	if (is_uv_system()) {
 		unsigned int cpu;
 
 		cpu = smp_processor_id();
-		cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
+		cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
 		if (cpumask)
-			flush_tlb_others_ipi(cpumask, mm, va);
+			smp_call_function_many(cpumask, flush_tlb_func,
								&info, 1);
 		return;
 	}
-	flush_tlb_others_ipi(cpumask, mm, va);
+	smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
 }
 
-static void __cpuinit calculate_tlb_offset(void)
-{
-	int cpu, node, nr_node_vecs, idx = 0;
-	/*
-	 * we are changing tlb_vector_offset for each CPU in runtime, but this
-	 * will not cause inconsistency, as the write is atomic under X86. we
-	 * might see more lock contentions in a short time, but after all CPU's
-	 * tlb_vector_offset are changed, everything should go normal
-	 *
-	 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
-	 * waste some vectors.
-	 **/
-	if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
-		nr_node_vecs = 1;
-	else
-		nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
-
-	for_each_online_node(node) {
-		int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
-			nr_node_vecs;
-		int cpu_offset = 0;
-		for_each_cpu(cpu, cpumask_of_node(node)) {
-			per_cpu(tlb_vector_offset, cpu) = node_offset +
-				cpu_offset;
-			cpu_offset++;
-			cpu_offset = cpu_offset % nr_node_vecs;
-		}
-		idx++;
-	}
-}
-
-static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
-		unsigned long action, void *hcpu)
-{
-	switch (action & 0xf) {
-	case CPU_ONLINE:
-	case CPU_DEAD:
-		calculate_tlb_offset();
-	}
-	return NOTIFY_OK;
-}
-
-static int __cpuinit init_smp_flush(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(flush_state); i++)
-		raw_spin_lock_init(&flush_state[i].tlbstate_lock);
-
-	calculate_tlb_offset();
-	hotcpu_notifier(tlb_cpuhp_notify, 0);
-	return 0;
-}
-core_initcall(init_smp_flush);
-
 void flush_tlb_current_task(void)
 {
 	struct mm_struct *mm = current->mm;
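
As a reading aid for the hunk above: flush_tlb_func() distinguishes three encodings of (flush_start, flush_end). The calls below are hypothetical (the wrapper flush_examples does not exist in the patch), but each shape matches how the rest of the diff actually invokes flush_tlb_others():

	/* Sketch: the three (start, end) encodings flush_tlb_func() handles. */
	void flush_examples(struct mm_struct *mm, unsigned long addr)
	{
		/* whole address space of mm */
		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);

		/* one page: end == 0 selects the single __flush_tlb_single() case */
		flush_tlb_others(mm_cpumask(mm), mm, addr, 0UL);

		/* a bounded range, flushed page by page with invlpg */
		flush_tlb_others(mm_cpumask(mm), mm, addr, addr + 16 * PAGE_SIZE);
	}

CPUs without INVLPG simply fall back to a full local flush regardless of the encoding, as the !cpu_has_invlpg test shows.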
@@ -282,27 +149,91 @@ void flush_tlb_current_task(void)
 
 	local_flush_tlb();
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
+		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
 	preempt_enable();
 }
 
-void flush_tlb_mm(struct mm_struct *mm)
+/*
+ * It can find out the THP large page, or
+ * HUGETLB page in tlb_flush when THP disabled
+ */
+static inline unsigned long has_large_page(struct mm_struct *mm,
+				 unsigned long start, unsigned long end)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	unsigned long addr = ALIGN(start, HPAGE_SIZE);
+	for (; addr < end; addr += HPAGE_SIZE) {
+		pgd = pgd_offset(mm, addr);
+		if (likely(!pgd_none(*pgd))) {
+			pud = pud_offset(pgd, addr);
+			if (likely(!pud_none(*pud))) {
+				pmd = pmd_offset(pud, addr);
+				if (likely(!pmd_none(*pmd)))
+					if (pmd_large(*pmd))
+						return addr;
+			}
+		}
+	}
+	return 0;
+}
+
+void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+				unsigned long end, unsigned long vmflag)
 {
+	unsigned long addr;
+	unsigned act_entries, tlb_entries = 0;
+
 	preempt_disable();
+	if (current->active_mm != mm)
+		goto flush_all;
 
-	if (current->active_mm == mm) {
-		if (current->mm)
+	if (!current->mm) {
+		leave_mm(smp_processor_id());
+		goto flush_all;
+	}
+
+	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
+					|| vmflag == VM_HUGETLB) {
+		local_flush_tlb();
+		goto flush_all;
+	}
+
+	/* In modern CPU, last level tlb used for both data/ins */
+	if (vmflag & VM_EXEC)
+		tlb_entries = tlb_lli_4k[ENTRIES];
+	else
+		tlb_entries = tlb_lld_4k[ENTRIES];
+	/* Assume all of TLB entries was occupied by this task */
+	act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+
+	/* tlb_flushall_shift is on balance point, details in commit log */
+	if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
+		local_flush_tlb();
+	else {
+		if (has_large_page(mm, start, end)) {
 			local_flush_tlb();
-		else
-			leave_mm(smp_processor_id());
+			goto flush_all;
+		}
+		/* flush range by one by one 'invlpg' */
+		for (addr = start; addr < end; addr += PAGE_SIZE)
+			__flush_tlb_single(addr);
+
+		if (cpumask_any_but(mm_cpumask(mm),
+				smp_processor_id()) < nr_cpu_ids)
+			flush_tlb_others(mm_cpumask(mm), mm, start, end);
+		preempt_enable();
+		return;
 	}
-	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL);
 
+flush_all:
+	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
 	preempt_enable();
 }
 
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 {
 	struct mm_struct *mm = vma->vm_mm;
 
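
To make the balance test in flush_tlb_mm_range() above concrete, here is a worked example in comment form; the numbers are assumed for illustration, since both the last-level TLB size and tlb_flushall_shift are CPU dependent (and the shift is tunable via the debugfs knob added at the end of this diff):

	/* Worked example of the balance test (illustrative values only):
	 *   tlb_lld_4k[ENTRIES] = 512   last-level DTLB entries
	 *   mm->total_vm        = 4096  pages, so act_entries = 512
	 *   tlb_flushall_shift  = 5     assumed
	 *
	 *   threshold = act_entries >> 5 = 16 pages (64 KB with 4 KB pages)
	 *
	 * A 10-page flush therefore takes the per-page invlpg loop and sends a
	 * (start, end) range to the other CPUs; a 2 MB flush exceeds the
	 * threshold, so it takes local_flush_tlb() plus a TLB_FLUSH_ALL IPI.
	 */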
@@ -310,13 +241,13 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
 
 	if (current->active_mm == mm) {
 		if (current->mm)
-			__flush_tlb_one(va);
+			__flush_tlb_one(start);
 		else
 			leave_mm(smp_processor_id());
 	}
 
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(mm_cpumask(mm), mm, va);
+		flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
 
 	preempt_enable();
 }
@@ -332,3 +263,83 @@ void flush_tlb_all(void)
 {
 	on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
+
+static void do_kernel_range_flush(void *info)
+{
+	struct flush_tlb_info *f = info;
+	unsigned long addr;
+
+	/* flush range by one by one 'invlpg' */
+	for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
+		__flush_tlb_single(addr);
+}
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+	unsigned act_entries;
+	struct flush_tlb_info info;
+
+	/* In modern CPU, last level tlb used for both data/ins */
+	act_entries = tlb_lld_4k[ENTRIES];
+
+	/* Balance as user space task's flush, a bit conservative */
+	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
+		(end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
+
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+	else {
+		info.flush_start = start;
+		info.flush_end = end;
+		on_each_cpu(do_kernel_range_flush, &info, 1);
+	}
+}
+
+#ifdef CONFIG_DEBUG_TLBFLUSH
+static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
+			     size_t count, loff_t *ppos)
+{
+	char buf[32];
+	unsigned int len;
+
+	len = sprintf(buf, "%hd\n", tlb_flushall_shift);
+	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t tlbflush_write_file(struct file *file,
+		 const char __user *user_buf, size_t count, loff_t *ppos)
+{
+	char buf[32];
+	ssize_t len;
+	s8 shift;
+
+	len = min(count, sizeof(buf) - 1);
+	if (copy_from_user(buf, user_buf, len))
+		return -EFAULT;
+
+	buf[len] = '\0';
+	if (kstrtos8(buf, 0, &shift))
+		return -EINVAL;
+
+	if (shift > 64)
+		return -EINVAL;
+
+	tlb_flushall_shift = shift;
+	return count;
+}
+
+static const struct file_operations fops_tlbflush = {
+	.read = tlbflush_read_file,
+	.write = tlbflush_write_file,
+	.llseek = default_llseek,
+};
+
+static int __cpuinit create_tlb_flushall_shift(void)
+{
+	if (cpu_has_invlpg) {
+		debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
+			arch_debugfs_dir, NULL, &fops_tlbflush);
+	}
+	return 0;
+}
+late_initcall(create_tlb_flushall_shift);
+#endif
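
With CONFIG_DEBUG_TLBFLUSH enabled and a CPU that has INVLPG, the late initcall above exposes the shift as a read/write debugfs file under arch_debugfs_dir. A usage sketch in comment form; the mount point shown is the conventional debugfs location, not something this patch sets up, so treat the exact path as an assumption:

	/* Usage sketch (assumes debugfs mounted at /sys/kernel/debug):
	 *
	 *   reading  x86/tlb_flushall_shift  returns the current shift, e.g. "5"
	 *   writing "-1" disables range flushing (always do a full TLB flush)
	 *   writing a larger shift lowers the threshold act_entries >> shift
	 *
	 * Input is parsed with kstrtos8(), and values above 64 are rejected
	 * with -EINVAL, as the write handler above shows.
	 */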