1 files changed, 707 insertions, 0 deletions
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
new file mode 100644
index 000000000000..2d35d8502029
--- /dev/null
+++ b/arch/x86/kernel/smp_32.c
@@ -0,0 +1,707 @@
+/*
+ *      Intel SMP support routines.
+ *
+ *      (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ *      (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ *      This code is released under the GNU General Public License version 2 or
+ *      later.
+ */
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <mach_apic.h>
+/*
+ *      Some notes on x86 processor bugs affecting SMP operation:
+ *
+ *      Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+ *      The Linux implications for SMP are handled as follows:
+ *
+ *      Pentium III / [Xeon]
+ *              None of the E1AP-E3AP errata are visible to the user.
+ *
+ *      E1AP.   see PII A1AP
+ *      E2AP.   see PII A2AP
+ *      E3AP.   see PII A3AP
+ *
+ *      Pentium II / [Xeon]
+ *              None of the A1AP-A3AP errata are visible to the user.
+ *
+ *      A1AP.   see PPro 1AP
+ *      A2AP.   see PPro 2AP
+ *      A3AP.   see PPro 7AP
+ *
+ *      Pentium Pro
+ *              None of 1AP-9AP errata are visible to the normal user,
+ *      except occasional delivery of 'spurious interrupt' as trap #15.
+ *      This is very rare and a non-problem.
+ *
+ *      1AP.    Linux maps APIC as non-cacheable
+ *      2AP.    worked around in hardware
+ *      3AP.    fixed in C0 and above steppings microcode update.
+ *              Linux does not use excessive STARTUP_IPIs.
+ *      4AP.    worked around in hardware
+ *      5AP.    symmetric IO mode (normal Linux operation) not affected.
+ *              'noapic' mode has vector 0xf filled out properly.
+ *      6AP.    'noapic' mode might be affected - fixed in later steppings
+ *      7AP.    We do not assume writes to the LVT deasserting IRQs
+ *      8AP.    We do not enable low power mode (deep sleep) during MP bootup
+ *      9AP.    We do not use mixed mode
+ *
+ *      Pentium
+ *              There is a marginal case where REP MOVS on 100MHz SMP
+ *      machines with B stepping processors can fail. XXX should provide
+ *      an L1cache=Writethrough or L1cache=off option.
+ *
+ *              B stepping CPUs may hang. There are hardware work arounds
+ *      for this. We warn about it in case your board doesn't have the work
+ *      arounds. Basically that's so I can tell anyone with a B stepping
+ *      CPU and SMP problems "tough".
+ *
+ *      Specific items [From Pentium Processor Specification Update]
+ *
+ *      1AP.    Linux doesn't use remote read
+ *      2AP.    Linux doesn't trust APIC errors
+ *      3AP.    We work around this
+ *      4AP.    Linux never generated 3 interrupts of the same priority
+ *              to cause a lost local interrupt.
+ *      5AP.    Remote read is never used
+ *      6AP.    not affected - worked around in hardware
+ *      7AP.    not affected - worked around in hardware
+ *      8AP.    worked around in hardware - we get explicit CS errors if not
+ *      9AP.    only 'noapic' mode affected. Might generate spurious
+ *              interrupts, we log only the first one and count the
+ *              rest silently.
+ *      10AP.   not affected - worked around in hardware
+ *      11AP.   Linux reads the APIC between writes to avoid this, as per
+ *              the documentation. Make sure you preserve this as it affects
+ *              the C stepping chips too.
+ *      12AP.   not affected - worked around in hardware
+ *      13AP.   not affected - worked around in hardware
+ *      14AP.   we always deassert INIT during bootup
+ *      15AP.   not affected - worked around in hardware
+ *      16AP.   not affected - worked around in hardware
+ *      17AP.   not affected - worked around in hardware
+ *      18AP.   not affected - worked around in hardware
+ *      19AP.   not affected - worked around in BIOS
+ *
+ *      If this sounds worrying believe me these bugs are either ___RARE___,
+ *      or are signal timing bugs worked around in hardware and there's
+ *      about nothing of note with C stepping upwards.
+ */
+/*
+ * Per-CPU TLB state, cacheline aligned.  The initializer supplies &init_mm
+ * and 0 for the first two members of struct tlb_state; the rest of this file
+ * reads the members as .active_mm and .state.
+ * NOTE(review): struct tlb_state is declared outside this file - confirm
+ * that the positional initializer matches its member order.
+ */
+DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
+/*
+ * the following functions deal with sending IPIs between CPUs.
+ *
+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+ */
+/*
+ * Build the value written to APIC_ICR: the caller's destination shorthand,
+ * logical destination mode, and either NMI delivery (when vector is
+ * NMI_VECTOR) or fixed delivery of the given vector.
+ */
+static inline int __prepare_ICR (unsigned int shortcut, int vector)
+{
+        unsigned int delivery;
+        if (vector == NMI_VECTOR)
+                delivery = APIC_DM_NMI;
+        else
+                delivery = APIC_DM_FIXED | vector;
+        return shortcut | APIC_DEST_LOGICAL | delivery;
+}
+/* Build the value written to APIC_ICR2 by placing 'mask' in the destination field. */
+static inline int __prepare_ICR2 (unsigned int mask)
+{
+        int icr2 = SET_APIC_DEST_FIELD(mask);
+        return icr2;
+}
+/*
+ * Send 'vector' using an APIC destination shorthand.  Only APIC_ICR is
+ * written; the target field (ICR2) is left untouched.
+ */
+void __send_IPI_shortcut(unsigned int shortcut, int vector)
+{
+        /*
+         * Subtle. In the case of the 'never do double writes' workaround
+         * we have to lock out interrupts to be safe.  As we don't care
+         * about the value read we use an atomic rmw access to avoid costly
+         * cli/sti.  Otherwise we use an even cheaper single atomic write
+         * to the APIC.
+         */
+        unsigned int icr_low;
+        /* Wait until the ICR is idle before reusing it. */
+        apic_wait_icr_idle();
+        /* A shorthand needs no destination, so only the low word is built. */
+        icr_low = __prepare_ICR(shortcut, vector);
+        /* The write to APIC_ICR is what fires the IPI. */
+        apic_write_around(APIC_ICR, icr_low);
+}
+/* Send 'vector' to the calling CPU itself via the APIC_DEST_SELF shorthand. */
+void fastcall send_IPI_self(int vector)
+{
+        __send_IPI_shortcut(APIC_DEST_SELF, vector);
+}
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+static inline void __send_IPI_dest_field(unsigned long mask, int vector)
+{
+        unsigned long icr2_val;
+        unsigned long icr_val;
+        /*
+         * Wait for the ICR to go idle.  NMIs go through the 'safe'
+         * variant of the wait.
+         */
+        if (likely(vector != NMI_VECTOR))
+                apic_wait_icr_idle();
+        else
+                safe_apic_wait_icr_idle();
+        /* Program the destination first ... */
+        icr2_val = __prepare_ICR2(mask);
+        apic_write_around(APIC_ICR2, icr2_val);
+        /*
+         * ... then the command word, with no shorthand.  The write to
+         * APIC_ICR is what fires the IPI, so it must come last.
+         */
+        icr_val = __prepare_ICR(0, vector);
+        apic_write_around(APIC_ICR, icr_val);
+}
+/*
+ * This is only used on smaller machines.
+ */
+/*
+ * Send 'vector' to the CPUs in 'cpumask' with a single IPI.  Only the first
+ * word of the cpumask is used as the destination bitmask.  A warning is
+ * raised if that word names a CPU absent from cpu_online_map.
+ */
+void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+{
+        unsigned long irq_state;
+        unsigned long dest_bits = cpus_addr(cpumask)[0];
+        /* Interrupts stay off across the check and the APIC writes. */
+        local_irq_save(irq_state);
+        WARN_ON(dest_bits & ~cpus_addr(cpu_online_map)[0]);
+        __send_IPI_dest_field(dest_bits, vector);
+        local_irq_restore(irq_state);
+}
+/*
+ * Send 'vector' to every CPU set in 'mask', one IPI per CPU, with local
+ * interrupts disabled for the whole sequence.
+ */
+void send_IPI_mask_sequence(cpumask_t mask, int vector)
+{
+        unsigned int target;
+        unsigned long irq_state;
+        /*
+         * Hack. The clustered APIC addressing mode doesn't allow us to send
+         * to an arbitrary mask, so I do a unicast to each CPU instead. This
+         * should be modified to do 1 message per cluster ID - mbligh
+         */
+        local_irq_save(irq_state);
+        for (target = 0; target < NR_CPUS; ++target) {
+                if (!cpu_isset(target, mask))
+                        continue;
+                __send_IPI_dest_field(cpu_to_logical_apicid(target), vector);
+        }
+        local_irq_restore(irq_state);
+}
+#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
+/*
+ *      Smarter SMP flushing macros. 
+ *              c/o Linus Torvalds.
+ *
+ *      These mean you can really definitely utterly forget about
+ *      writing to user space from interrupts. (It's not allowed anyway).
+ *
+ *      Optimizations Manfred Spraul <manfred@colorfullife.com>
+ */
+/*
+ * Shared state of the flush request in flight.  native_flush_tlb_others()
+ * fills these in while holding tlbstate_lock; smp_invalidate_interrupt()
+ * reads them on the target CPUs.
+ */
+/* CPUs that still have to process the request; each target clears its own bit. */
+static cpumask_t flush_cpumask;
+/* The mm being flushed; reset to NULL once the request completes. */
+static struct mm_struct * flush_mm;
+/* Address to flush, or TLB_FLUSH_ALL; reset to 0 once the request completes. */
+static unsigned long flush_va;
+/* Held by native_flush_tlb_others() for the whole lifetime of a request. */
+static DEFINE_SPINLOCK(tlbstate_lock);
+/*
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+ *
+ * We need to reload %cr3 since the page tables may be going
+ * away from under us..
+ */
+void leave_mm(unsigned long cpu)
+{
+        /* Must not be called while the CPU's tlb state is TLBSTATE_OK. */
+        BUG_ON(per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK);
+        /* Drop this CPU from the mm's mask, then switch to swapper_pg_dir. */
+        cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
+        load_cr3(swapper_pg_dir);
+}
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ *      Stop ipi delivery for the old mm. This is not synchronized with
+ *      the other cpus, but smp_invalidate_interrupt ignores flush ipis
+ *      for the wrong mm, and in the worst case we perform a superfluous
+ *      tlb flush.
+ * 1a2) set cpu_tlbstate to TLBSTATE_OK
+ *      Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ *      was in lazy tlb mode.
+ * 1a3) update cpu_tlbstate[].active_mm
+ *      Now cpu0 accepts tlb flushes for the new mm.
+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ *      Now the other cpus will send tlb flush ipis.
+ * 1a5) change cr3.
+ * 1b) thread switch without mm change
+ *      cpu_tlbstate[].active_mm is correct, cpu0 already handles
+ *      flush ipis.
+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ *      Atomically set the bit [other cpus will start sending flush ipis],
+ *      and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ *   runs in kernel space, the cpu could load tlb entries for user space
+ *   pages.
+ *
+ * The good news is that cpu_tlbstate is local to each cpu, no
+ * write/read ordering problems.
+ */
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ */
+/*
+ * Handler for the TLB invalidate IPI.  Acts only if this CPU's bit is set in
+ * flush_cpumask; reads the request from flush_mm and flush_va and clears the
+ * bit when done.  @regs is not used.
+ */
+fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
+{
+        unsigned long cpu;
+        cpu = get_cpu();
+        /*
+         * Not a target of the request in flight: nothing to do.
+         * NOTE(review): this early exit skips ack_APIC_irq() below -
+         * confirm that is intended.
+         */
+        if (!cpu_isset(cpu, flush_cpumask))
+                goto out;
+                /*
+                 * This was a BUG() but until someone can quote me the
+                 * line from the intel manual that guarantees an IPI to
+                 * multiple CPUs is retried _only_ on the erroring CPUs
+                 * it's staying as a return
+                 *
+                 * BUG();
+                 */
+                 
+        /* Flushes for an mm this CPU is not using are ignored. */
+        if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
+                if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
+                        if (flush_va == TLB_FLUSH_ALL)
+                                local_flush_tlb();
+                        else
+                                __flush_tlb_one(flush_va);
+                } else
+                        /* State is not TLBSTATE_OK: leave the mm instead of flushing. */
+                        leave_mm(cpu);
+        }
+        ack_APIC_irq();
+        /*
+         * Clearing our bit is what native_flush_tlb_others() spins on; the
+         * barriers bracket the clear.
+         */
+        smp_mb__before_clear_bit();
+        cpu_clear(cpu, flush_cpumask);
+        smp_mb__after_clear_bit();
+out:
+        put_cpu_no_resched();
+}
+/*
+ * Ask the CPUs in *cpumaskp to flush TLB entries for @mm and wait until all
+ * of them have done so.
+ * @cpumaskp: target CPUs; must be non-empty and must not contain the caller.
+ * @mm:       the mm being flushed; must not be NULL.
+ * @va:       stored in flush_va, which the handler compares against
+ *            TLB_FLUSH_ALL to choose a full or single-address flush.
+ */
+void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
+                             unsigned long va)
+{
+        /* Work on a local copy so the caller's mask is never modified. */
+        cpumask_t cpumask = *cpumaskp;
+        /*
+         * A couple of (to be removed) sanity checks:
+         *
+         * - current CPU must not be in mask
+         * - mask must exist :)
+         */
+        BUG_ON(cpus_empty(cpumask));
+        BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+        BUG_ON(!mm);
+#ifdef CONFIG_HOTPLUG_CPU
+        /* If a CPU which we ran on has gone down, OK. */
+        cpus_and(cpumask, cpumask, cpu_online_map);
+        if (unlikely(cpus_empty(cpumask)))
+                return;
+#endif
+        /*
+         * i'm not happy about this global shared spinlock in the
+         * MM hot path, but we'll see how contended it is.
+         * AK: x86-64 has a faster method that could be ported.
+         */
+        spin_lock(&tlbstate_lock);
+        
+        /* Publish the request before the IPI is sent. */
+        flush_mm = mm;
+        flush_va = va;
+        cpus_or(flush_cpumask, cpumask, flush_cpumask);
+        /*
+         * We have to send the IPI only to
+         * CPUs affected.
+         */
+        send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+        /* Each target clears its bit in smp_invalidate_interrupt(). */
+        while (!cpus_empty(flush_cpumask))
+                /* nothing. lockup detection does not belong here */
+                cpu_relax();
+        /* Reset the request state before releasing the lock. */
+        flush_mm = NULL;
+        flush_va = 0;
+        spin_unlock(&tlbstate_lock);
+}
+        
+/*
+ * Flush the whole TLB for current->mm: locally first, then on every other
+ * CPU listed in the mm's cpu_vm_mask.
+ */
+void flush_tlb_current_task(void)
+{
+        struct mm_struct *mm = current->mm;
+        cpumask_t others;
+        preempt_disable();
+        /* Everyone in the mm's mask except ourselves. */
+        others = mm->cpu_vm_mask;
+        cpu_clear(smp_processor_id(), others);
+        local_flush_tlb();
+        if (!cpus_empty(others))
+                flush_tlb_others(others, mm, TLB_FLUSH_ALL);
+        preempt_enable();
+}
+/*
+ * Flush the whole TLB for @mm on every CPU in its cpu_vm_mask.  If this CPU
+ * has @mm as active_mm it flushes locally when current->mm is set, and
+ * leaves the mm otherwise.
+ */
+void flush_tlb_mm (struct mm_struct * mm)
+{
+        cpumask_t others;
+        preempt_disable();
+        /* Everyone in the mm's mask except ourselves. */
+        others = mm->cpu_vm_mask;
+        cpu_clear(smp_processor_id(), others);
+        if (current->active_mm == mm) {
+                if (!current->mm)
+                        leave_mm(smp_processor_id());
+                else
+                        local_flush_tlb();
+        }
+        if (!cpus_empty(others))
+                flush_tlb_others(others, mm, TLB_FLUSH_ALL);
+        preempt_enable();
+}
+/*
+ * Flush the TLB entry for address @va in @vma's mm on every CPU in that
+ * mm's cpu_vm_mask.  If this CPU has the mm as active_mm it flushes the one
+ * entry locally when current->mm is set, and leaves the mm otherwise.
+ */
+void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
+{
+        struct mm_struct *mm = vma->vm_mm;
+        cpumask_t others;
+        preempt_disable();
+        /* Everyone in the mm's mask except ourselves. */
+        others = mm->cpu_vm_mask;
+        cpu_clear(smp_processor_id(), others);
+        if (current->active_mm == mm) {
+                if (!current->mm)
+                        leave_mm(smp_processor_id());
+                else
+                        __flush_tlb_one(va);
+        }
+        if (!cpus_empty(others))
+                flush_tlb_others(others, mm, va);
+        preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_page);
+/*
+ * Per-CPU worker for flush_tlb_all(): flush everything on this CPU, and
+ * leave the mm as well if the CPU is in TLBSTATE_LAZY.  @info is unused.
+ */
+static void do_flush_tlb_all(void* info)
+{
+        unsigned long this_cpu = smp_processor_id();
+        __flush_tlb_all();
+        if (per_cpu(cpu_tlbstate, this_cpu).state != TLBSTATE_LAZY)
+                return;
+        leave_mm(this_cpu);
+}
+/*
+ * Run do_flush_tlb_all() through on_each_cpu() with no info pointer.
+ * NOTE(review): the two trailing 1 arguments are presumably the retry and
+ * wait flags of on_each_cpu() - confirm against its declaration.
+ */
+void flush_tlb_all(void)
+{
+        on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
+}
+/*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+static void native_smp_send_reschedule(int cpu)
+{
+        /* Warn about an offline target, but still send the IPI. */
+        WARN_ON(cpu_is_offline(cpu));
+        send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+}
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ */
+/*
+ * Taken by native_smp_call_function_mask() around a whole cross-call, and
+ * by lock_ipi_call_lock()/unlock_ipi_call_lock().
+ */
+static DEFINE_SPINLOCK(call_lock);
+/* One cross-CPU call request; lives on the initiating CPU's stack. */
+struct call_data_struct {
+        /* Function to run on each target CPU, and its argument. */
+        void (*func) (void *info);
+        void *info;
+        /* Incremented by each target once it has copied func/info/wait. */
+        atomic_t started;
+        /* Incremented by each target after func returns; only when wait is set. */
+        atomic_t finished;
+        /* Non-zero if the initiator waits for func to finish everywhere. */
+        int wait;
+};
+/* Take call_lock with spin_lock_irq(); pair with unlock_ipi_call_lock(). */
+void lock_ipi_call_lock(void)
+{
+        spin_lock_irq(&call_lock);
+}
+/* Release call_lock with spin_unlock_irq(); pairs with lock_ipi_call_lock(). */
+void unlock_ipi_call_lock(void)
+{
+        spin_unlock_irq(&call_lock);
+}
+/*
+ * The request in flight.  The initiator points this at its on-stack
+ * call_data_struct before sending the IPI; smp_call_function_interrupt()
+ * reads it on the targets.
+ */
+static struct call_data_struct *call_data;
+/*
+ * Run @func(@info) on every other online CPU.  This helper takes no lock
+ * itself.  It returns at once when there is no other online CPU.
+ * Otherwise it spins until every target has picked up the request and,
+ * if @wait is set, until every target has finished @func.
+ * @nonatomic is not used.
+ */
+static void __smp_call_function(void (*func) (void *info), void *info,
+                                int nonatomic, int wait)
+{
+        struct call_data_struct data;
+        int others = num_online_cpus() - 1;
+        if (others == 0)
+                return;
+        data.func = func;
+        data.info = info;
+        data.wait = wait;
+        atomic_set(&data.started, 0);
+        if (wait)
+                atomic_set(&data.finished, 0);
+        /* Publish the request; the barrier precedes the IPI. */
+        call_data = &data;
+        mb();
+        /* Send a message to all other CPUs and wait for them to respond */
+        send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+        /* 'data' is on our stack: stay until every target has copied it. */
+        while (atomic_read(&data.started) != others)
+                cpu_relax();
+        if (!wait)
+                return;
+        while (atomic_read(&data.finished) != others)
+                cpu_relax();
+}
+/**
+ * smp_call_function_mask(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on.  Must not include the current cpu.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ *
+ * NOTE(review): the body only ever returns 0, and it removes the current
+ * cpu and any offline cpus from @mask itself - confirm the wording above
+ * against the callers' expectations.
+ */
+static int
+native_smp_call_function_mask(cpumask_t mask,
+                              void (*func)(void *), void *info,
+                              int wait)
+{
+        struct call_data_struct data;
+        cpumask_t allbutself;
+        int cpus;
+        /* Can deadlock when called with interrupts disabled */
+        WARN_ON(irqs_disabled());
+        /* Holding any lock stops cpus from going down. */
+        spin_lock(&call_lock);
+        /* Restrict the request to online CPUs other than ourselves. */
+        allbutself = cpu_online_map;
+        cpu_clear(smp_processor_id(), allbutself);
+        cpus_and(mask, mask, allbutself);
+        cpus = cpus_weight(mask);
+        /* Nobody left to call: succeed without sending anything. */
+        if (!cpus) {
+                spin_unlock(&call_lock);
+                return 0;
+        }
+        data.func = func;
+        data.info = info;
+        atomic_set(&data.started, 0);
+        data.wait = wait;
+        if (wait)
+                atomic_set(&data.finished, 0);
+        /* Publish the on-stack request; the barrier precedes the IPI. */
+        call_data = &data;
+        mb();
+        /* Send a message to other CPUs */
+        if (cpus_equal(mask, allbutself))
+                send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+        else
+                send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+        /* Wait for response */
+        while (atomic_read(&data.started) != cpus)
+                cpu_relax();
+        if (wait)
+                while (atomic_read(&data.finished) != cpus)
+                        cpu_relax();
+        spin_unlock(&call_lock);
+        return 0;
+}
+/*
+ * Park the calling CPU for good: interrupts off, removed from
+ * cpu_online_map, local APIC disabled, then halt (or spin) forever.
+ * Never returns.  @dummy is unused.
+ */
+static void stop_this_cpu (void * dummy)
+{
+        local_irq_disable();
+        /* Remove this CPU from the online map. */
+        cpu_clear(smp_processor_id(), cpu_online_map);
+        disable_local_APIC();
+        if (cpu_data[smp_processor_id()].hlt_works_ok) {
+                while (1)
+                        halt();
+        }
+        /* hlt_works_ok is clear for this CPU: busy-loop instead. */
+        while (1)
+                ;
+}
+/*
+ * this function calls the 'stop' function on all other CPUs in the system.
+ */
+static void native_smp_send_stop(void)
+{
+        /* Don't deadlock on the call lock in panic */
+        /* nolock is set when the trylock failed, i.e. we do not own call_lock. */
+        int nolock = !spin_trylock(&call_lock);
+        unsigned long flags;
+        local_irq_save(flags);
+        /* wait == 0: only wait for the targets to pick up the request. */
+        __smp_call_function(stop_this_cpu, NULL, 0, 0);
+        /* Unlock only if the trylock above actually took the lock. */
+        if (!nolock)
+                spin_unlock(&call_lock);
+        disable_local_APIC();
+        local_irq_restore(flags);
+}
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
+{
+        /* The handler's only action is to acknowledge the interrupt; @regs is unused. */
+        ack_APIC_irq();
+}
+/*
+ * Handler for the call-function IPI: run the function published through
+ * call_data on this CPU and report progress to the initiator through the
+ * started and finished counters.  @regs is unused.
+ */
+fastcall void smp_call_function_interrupt(struct pt_regs *regs)
+{
+        /*
+         * Copy everything we need out of *call_data first; once 'started'
+         * is incremented the structure may no longer be valid.
+         */
+        void (*func) (void *info) = call_data->func;
+        void *info = call_data->info;
+        int wait = call_data->wait;
+        ack_APIC_irq();
+        /*
+         * Notify initiating CPU that I've grabbed the data and am
+         * about to execute the function
+         */
+        mb();
+        atomic_inc(&call_data->started);
+        /*
+         * At this point the info structure may be out of scope unless wait==1
+         */
+        irq_enter();
+        (*func)(info);
+        irq_exit();
+        /* With wait set the initiator is still spinning, so call_data is still valid. */
+        if (wait) {
+                mb();
+                atomic_inc(&call_data->finished);
+        }
+}
+/*
+ * Return the lowest index i below NR_CPUS whose x86_cpu_to_apicid[i]
+ * equals @apic_id, or -1 if there is none.
+ */
+static int convert_apicid_to_cpu(int apic_id)
+{
+        int cpu = 0;
+        while (cpu < NR_CPUS) {
+                if (x86_cpu_to_apicid[cpu] == apic_id)
+                        return cpu;
+                cpu++;
+        }
+        return -1;
+}
+/*
+ * Derive the current CPU number from hard_smp_processor_id().  Returns 0
+ * when the boot CPU lacks X86_FEATURE_APIC, when the APIC id is BAD_APICID,
+ * or when the id does not map to any CPU.
+ */
+int safe_smp_processor_id(void)
+{
+        int apicid, cpuid;
+        if (!boot_cpu_has(X86_FEATURE_APIC))
+                return 0;
+        apicid = hard_smp_processor_id();
+        if (apicid == BAD_APICID)
+                return 0;
+        cpuid = convert_apicid_to_cpu(apicid);
+        if (cpuid < 0)
+                return 0;
+        return cpuid;
+}
+/*
+ * SMP operations table filled with the native implementations.  The last
+ * three handlers are defined in this file; the first four are defined
+ * elsewhere.
+ */
+struct smp_ops smp_ops = {
+        .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
+        .smp_prepare_cpus = native_smp_prepare_cpus,
+        .cpu_up = native_cpu_up,
+        .smp_cpus_done = native_smp_cpus_done,
+        .smp_send_stop = native_smp_send_stop,
+        .smp_send_reschedule = native_smp_send_reschedule,
+        .smp_call_function_mask = native_smp_call_function_mask,
+};

diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c new file mode 100644 index 000000000000..2d35d8502029 --- /dev/null +++ b/arch/x86/kernel/smp_32.c
@@ -0,0 +1,707 @@
	1	/*
	2	* Intel SMP support routines.
	3	*
	4	* (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
	5	* (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
	6	*
	7	* This code is released under the GNU General Public License version 2 or
	8	* later.
	9	*/
	10
	11	#include <linux/init.h>
	12
	13	#include <linux/mm.h>
	14	#include <linux/delay.h>
	15	#include <linux/spinlock.h>
	16	#include <linux/kernel_stat.h>
	17	#include <linux/mc146818rtc.h>
	18	#include <linux/cache.h>
	19	#include <linux/interrupt.h>
	20	#include <linux/cpu.h>
	21	#include <linux/module.h>
	22
	23	#include <asm/mtrr.h>
	24	#include <asm/tlbflush.h>
	25	#include <asm/mmu_context.h>
	26	#include <mach_apic.h>
	27
	28	/*
	29	* Some notes on x86 processor bugs affecting SMP operation:
	30	*
	31	* Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
	32	* The Linux implications for SMP are handled as follows:
	33	*
	34	* Pentium III / [Xeon]
	35	* None of the E1AP-E3AP errata are visible to the user.
	36	*
	37	* E1AP. see PII A1AP
	38	* E2AP. see PII A2AP
	39	* E3AP. see PII A3AP
	40	*
	41	* Pentium II / [Xeon]
	42	* None of the A1AP-A3AP errata are visible to the user.
	43	*
	44	* A1AP. see PPro 1AP
	45	* A2AP. see PPro 2AP
	46	* A3AP. see PPro 7AP
	47	*
	48	* Pentium Pro
	49	* None of 1AP-9AP errata are visible to the normal user,
	50	* except occasional delivery of 'spurious interrupt' as trap #15.
	51	* This is very rare and a non-problem.
	52	*
	53	* 1AP. Linux maps APIC as non-cacheable
	54	* 2AP. worked around in hardware
	55	* 3AP. fixed in C0 and above steppings microcode update.
	56	* Linux does not use excessive STARTUP_IPIs.
	57	* 4AP. worked around in hardware
	58	* 5AP. symmetric IO mode (normal Linux operation) not affected.
	59	* 'noapic' mode has vector 0xf filled out properly.
	60	* 6AP. 'noapic' mode might be affected - fixed in later steppings
	61	* 7AP. We do not assume writes to the LVT deassering IRQs
	62	* 8AP. We do not enable low power mode (deep sleep) during MP bootup
	63	* 9AP. We do not use mixed mode
	64	*
	65	* Pentium
	66	* There is a marginal case where REP MOVS on 100MHz SMP
	67	* machines with B stepping processors can fail. XXX should provide
	68	* an L1cache=Writethrough or L1cache=off option.
	69	*
	70	* B stepping CPUs may hang. There are hardware work arounds
	71	* for this. We warn about it in case your board doesn't have the work
	72	* arounds. Basically thats so I can tell anyone with a B stepping
	73	* CPU and SMP problems "tough".
	74	*
	75	* Specific items [From Pentium Processor Specification Update]
	76	*
	77	* 1AP. Linux doesn't use remote read
	78	* 2AP. Linux doesn't trust APIC errors
	79	* 3AP. We work around this
	80	* 4AP. Linux never generated 3 interrupts of the same priority
	81	* to cause a lost local interrupt.
	82	* 5AP. Remote read is never used
	83	* 6AP. not affected - worked around in hardware
	84	* 7AP. not affected - worked around in hardware
	85	* 8AP. worked around in hardware - we get explicit CS errors if not
	86	* 9AP. only 'noapic' mode affected. Might generate spurious
	87	* interrupts, we log only the first one and count the
	88	* rest silently.
	89	* 10AP. not affected - worked around in hardware
	90	* 11AP. Linux reads the APIC between writes to avoid this, as per
	91	* the documentation. Make sure you preserve this as it affects
	92	* the C stepping chips too.
	93	* 12AP. not affected - worked around in hardware
	94	* 13AP. not affected - worked around in hardware
	95	* 14AP. we always deassert INIT during bootup
	96	* 15AP. not affected - worked around in hardware
	97	* 16AP. not affected - worked around in hardware
	98	* 17AP. not affected - worked around in hardware
	99	* 18AP. not affected - worked around in hardware
	100	* 19AP. not affected - worked around in BIOS
	101	*
	102	* If this sounds worrying believe me these bugs are either ___RARE___,
	103	* or are signal timing bugs worked around in hardware and there's
	104	* about nothing of note with C stepping upwards.
	105	*/
	106
	107	DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
	108
	109	/*
	110	* the following functions deal with sending IPIs between CPUs.
	111	*
	112	* We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
	113	*/
	114
	115	static inline int __prepare_ICR (unsigned int shortcut, int vector)
	116	{
	117	unsigned int icr = shortcut | APIC_DEST_LOGICAL;
	118
	119	switch (vector) {
	120	default:
	121	icr |= APIC_DM_FIXED | vector;
	122	break;
	123	case NMI_VECTOR:
	124	icr |= APIC_DM_NMI;
	125	break;
	126	}
	127	return icr;
	128	}
	129
	130	static inline int __prepare_ICR2 (unsigned int mask)
	131	{
	132	return SET_APIC_DEST_FIELD(mask);
	133	}
	134
	135	void __send_IPI_shortcut(unsigned int shortcut, int vector)
	136	{
	137	/*
	138	* Subtle. In the case of the 'never do double writes' workaround
	139	* we have to lock out interrupts to be safe. As we don't care
	140	* of the value read we use an atomic rmw access to avoid costly
	141	* cli/sti. Otherwise we use an even cheaper single atomic write
	142	* to the APIC.
	143	*/
	144	unsigned int cfg;
	145
	146	/*
	147	* Wait for idle.
	148	*/
	149	apic_wait_icr_idle();
	150
	151	/*
	152	* No need to touch the target chip field
	153	*/
	154	cfg = __prepare_ICR(shortcut, vector);
	155
	156	/*
	157	* Send the IPI. The write to APIC_ICR fires this off.
	158	*/
	159	apic_write_around(APIC_ICR, cfg);
	160	}
	161
	162	void fastcall send_IPI_self(int vector)
	163	{
	164	__send_IPI_shortcut(APIC_DEST_SELF, vector);
	165	}
	166
	167	/*
	168	* This is used to send an IPI with no shorthand notation (the destination is
	169	* specified in bits 56 to 63 of the ICR).
	170	*/
	171	static inline void __send_IPI_dest_field(unsigned long mask, int vector)
	172	{
	173	unsigned long cfg;
	174
	175	/*
	176	* Wait for idle.
	177	*/
	178	if (unlikely(vector == NMI_VECTOR))
	179	safe_apic_wait_icr_idle();
	180	else
	181	apic_wait_icr_idle();
	182
	183	/*
	184	* prepare target chip field
	185	*/
	186	cfg = __prepare_ICR2(mask);
	187	apic_write_around(APIC_ICR2, cfg);
	188
	189	/*
	190	* program the ICR
	191	*/
	192	cfg = __prepare_ICR(0, vector);
	193
	194	/*
	195	* Send the IPI. The write to APIC_ICR fires this off.
	196	*/
	197	apic_write_around(APIC_ICR, cfg);
	198	}
	199
	200	/*
	201	* This is only used on smaller machines.
	202	*/
	203	void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
	204	{
	205	unsigned long mask = cpus_addr(cpumask)[0];
	206	unsigned long flags;
	207
	208	local_irq_save(flags);
	209	WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
	210	__send_IPI_dest_field(mask, vector);
	211	local_irq_restore(flags);
	212	}
	213
	214	void send_IPI_mask_sequence(cpumask_t mask, int vector)
	215	{
	216	unsigned long flags;
	217	unsigned int query_cpu;
	218
	219	/*
	220	* Hack. The clustered APIC addressing mode doesn't allow us to send
	221	* to an arbitrary mask, so I do a unicasts to each CPU instead. This
	222	* should be modified to do 1 message per cluster ID - mbligh
	223	*/
	224
	225	local_irq_save(flags);
	226	for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
	227	if (cpu_isset(query_cpu, mask)) {
	228	__send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
	229	vector);
	230	}
	231	}
	232	local_irq_restore(flags);
	233	}
	234
	235	#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
	236
	237	/*
	238	* Smarter SMP flushing macros.
	239	* c/o Linus Torvalds.
	240	*
	241	* These mean you can really definitely utterly forget about
	242	* writing to user space from interrupts. (Its not allowed anyway).
	243	*
	244	* Optimizations Manfred Spraul <manfred@colorfullife.com>
	245	*/
	246
	247	static cpumask_t flush_cpumask;
	248	static struct mm_struct * flush_mm;
	249	static unsigned long flush_va;
	250	static DEFINE_SPINLOCK(tlbstate_lock);
	251
	252	/*
	253	* We cannot call mmdrop() because we are in interrupt context,
	254	* instead update mm->cpu_vm_mask.
	255	*
	256	* We need to reload %cr3 since the page tables may be going
	257	* away from under us..
	258	*/
	259	void leave_mm(unsigned long cpu)
	260	{
	261	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
	262	BUG();
	263	cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
	264	load_cr3(swapper_pg_dir);
	265	}
	266
	267	/*
	268	*
	269	* The flush IPI assumes that a thread switch happens in this order:
	270	* [cpu0: the cpu that switches]
	271	* 1) switch_mm() either 1a) or 1b)
	272	* 1a) thread switch to a different mm
	273	* 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
	274	* Stop ipi delivery for the old mm. This is not synchronized with
	275	* the other cpus, but smp_invalidate_interrupt ignore flush ipis
	276	* for the wrong mm, and in the worst case we perform a superflous
	277	* tlb flush.
	278	* 1a2) set cpu_tlbstate to TLBSTATE_OK
	279	* Now the smp_invalidate_interrupt won't call leave_mm if cpu0
	280	* was in lazy tlb mode.
	281	* 1a3) update cpu_tlbstate[].active_mm
	282	* Now cpu0 accepts tlb flushes for the new mm.
	283	* 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
	284	* Now the other cpus will send tlb flush ipis.
	285	* 1a4) change cr3.
	286	* 1b) thread switch without mm change
	287	* cpu_tlbstate[].active_mm is correct, cpu0 already handles
	288	* flush ipis.
	289	* 1b1) set cpu_tlbstate to TLBSTATE_OK
	290	* 1b2) test_and_set the cpu bit in cpu_vm_mask.
	291	* Atomically set the bit [other cpus will start sending flush ipis],
	292	* and test the bit.
	293	* 1b3) if the bit was 0: leave_mm was called, flush the tlb.
	294	* 2) switch %%esp, ie current
	295	*
	296	* The interrupt must handle 2 special cases:
	297	* - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
	298	* - the cpu performs speculative tlb reads, i.e. even if the cpu only
	299	* runs in kernel space, the cpu could load tlb entries for user space
	300	* pages.
	301	*
	302	* The good news is that cpu_tlbstate is local to each cpu, no
	303	* write/read ordering problems.
	304	*/
	305
	306	/*
	307	* TLB flush IPI:
	308	*
	309	* 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
	310	* 2) Leave the mm if we are in the lazy tlb mode.
	311	*/
	312
	313	fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
	314	{
	315	unsigned long cpu;
	316
	317	cpu = get_cpu();
	318
	319	if (!cpu_isset(cpu, flush_cpumask))
	320	goto out;
	321	/*
	322	* This was a BUG() but until someone can quote me the
	323	* line from the intel manual that guarantees an IPI to
	324	* multiple CPUs is retried _only_ on the erroring CPUs
	325	* its staying as a return
	326	*
	327	* BUG();
	328	*/
	329
	330	if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
	331	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
	332	if (flush_va == TLB_FLUSH_ALL)
	333	local_flush_tlb();
	334	else
	335	__flush_tlb_one(flush_va);
	336	} else
	337	leave_mm(cpu);
	338	}
	339	ack_APIC_irq();
	340	smp_mb__before_clear_bit();
	341	cpu_clear(cpu, flush_cpumask);
	342	smp_mb__after_clear_bit();
	343	out:
	344	put_cpu_no_resched();
	345	}
	346
	347	void native_flush_tlb_others(const cpumask_t cpumaskp, struct mm_struct mm,
	348	unsigned long va)
	349	{
	350	cpumask_t cpumask = *cpumaskp;
	351
	352	/*
	353	* A couple of (to be removed) sanity checks:
	354	*
	355	* - current CPU must not be in mask
	356	* - mask must exist :)
	357	*/
	358	BUG_ON(cpus_empty(cpumask));
	359	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
	360	BUG_ON(!mm);
	361
	362	#ifdef CONFIG_HOTPLUG_CPU
	363	/* If a CPU which we ran on has gone down, OK. */
	364	cpus_and(cpumask, cpumask, cpu_online_map);
	365	if (unlikely(cpus_empty(cpumask)))
	366	return;
	367	#endif
	368
	369	/*
	370	* i'm not happy about this global shared spinlock in the
	371	* MM hot path, but we'll see how contended it is.
	372	* AK: x86-64 has a faster method that could be ported.
	373	*/
	374	spin_lock(&tlbstate_lock);
	375
	376	flush_mm = mm;
	377	flush_va = va;
	378	cpus_or(flush_cpumask, cpumask, flush_cpumask);
	379	/*
	380	* We have to send the IPI only to
	381	* CPUs affected.
	382	*/
	383	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
	384
	385	while (!cpus_empty(flush_cpumask))
	386	/* nothing. lockup detection does not belong here */
	387	cpu_relax();
	388
	389	flush_mm = NULL;
	390	flush_va = 0;
	391	spin_unlock(&tlbstate_lock);
	392	}
	393
	394	void flush_tlb_current_task(void)
	395	{
	396	struct mm_struct *mm = current->mm;
	397	cpumask_t cpu_mask;
	398
	399	preempt_disable();
	400	cpu_mask = mm->cpu_vm_mask;
	401	cpu_clear(smp_processor_id(), cpu_mask);
	402
	403	local_flush_tlb();
	404	if (!cpus_empty(cpu_mask))
	405	flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
	406	preempt_enable();
	407	}
	408
	409	void flush_tlb_mm (struct mm_struct * mm)
	410	{
	411	cpumask_t cpu_mask;
	412
	413	preempt_disable();
	414	cpu_mask = mm->cpu_vm_mask;
	415	cpu_clear(smp_processor_id(), cpu_mask);
	416
	417	if (current->active_mm == mm) {
	418	if (current->mm)
	419	local_flush_tlb();
	420	else
	421	leave_mm(smp_processor_id());
	422	}
	423	if (!cpus_empty(cpu_mask))
	424	flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
	425
	426	preempt_enable();
	427	}
	428
	429	void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
	430	{
	431	struct mm_struct *mm = vma->vm_mm;
	432	cpumask_t cpu_mask;
	433
	434	preempt_disable();
	435	cpu_mask = mm->cpu_vm_mask;
	436	cpu_clear(smp_processor_id(), cpu_mask);
	437
	438	if (current->active_mm == mm) {
	439	if(current->mm)
	440	__flush_tlb_one(va);
	441	else
	442	leave_mm(smp_processor_id());
	443	}
	444
	445	if (!cpus_empty(cpu_mask))
	446	flush_tlb_others(cpu_mask, mm, va);
	447
	448	preempt_enable();
	449	}
	450	EXPORT_SYMBOL(flush_tlb_page);
	451
	452	static void do_flush_tlb_all(void* info)
	453	{
	454	unsigned long cpu = smp_processor_id();
	455
	456	__flush_tlb_all();
	457	if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
	458	leave_mm(cpu);
	459	}
	460
	461	void flush_tlb_all(void)
	462	{
	463	on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
	464	}
	465
	466	/*
	467	* this function sends a 'reschedule' IPI to another CPU.
	468	* it goes straight through and wastes no time serializing
	469	* anything. Worst case is that we lose a reschedule ...
	470	*/
	471	static void native_smp_send_reschedule(int cpu)
	472	{
	473	WARN_ON(cpu_is_offline(cpu));
	474	send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
	475	}
	476
	477	/*
	478	* Structure and data for smp_call_function(). This is designed to minimise
	479	* static memory requirements. It also looks cleaner.
	480	*/
	481	static DEFINE_SPINLOCK(call_lock);
	482
	483	struct call_data_struct {
	484	void (func) (void info);
	485	void *info;
	486	atomic_t started;
	487	atomic_t finished;
	488	int wait;
	489	};
	490
	491	void lock_ipi_call_lock(void)
	492	{
	493	spin_lock_irq(&call_lock);
	494	}
	495
	496	void unlock_ipi_call_lock(void)
	497	{
	498	spin_unlock_irq(&call_lock);
	499	}
	500
	/*
	 * The request currently being delivered: set by the initiating cpu
	 * before the call-function IPI is sent, and read by
	 * smp_call_function_interrupt() on the receiving cpus.
	 */
	static struct call_data_struct *call_data;
	502
	503	static void __smp_call_function(void (func) (void info), void *info,
	504	int nonatomic, int wait)
	505	{
	506	struct call_data_struct data;
	507	int cpus = num_online_cpus() - 1;
	508
	509	if (!cpus)
	510	return;
	511
	512	data.func = func;
	513	data.info = info;
	514	atomic_set(&data.started, 0);
	515	data.wait = wait;
	516	if (wait)
	517	atomic_set(&data.finished, 0);
	518
	519	call_data = &data;
	520	mb();
	521
	522	/* Send a message to all other CPUs and wait for them to respond */
	523	send_IPI_allbutself(CALL_FUNCTION_VECTOR);
	524
	525	/* Wait for response */
	526	while (atomic_read(&data.started) != cpus)
	527	cpu_relax();
	528
	529	if (wait)
	530	while (atomic_read(&data.finished) != cpus)
	531	cpu_relax();
	532	}
	533
	534
	535	/**
	536	* smp_call_function_mask(): Run a function on a set of other CPUs.
	537	* @mask: The set of cpus to run on. Must not include the current cpu.
	538	* @func: The function to run. This must be fast and non-blocking.
	539	* @info: An arbitrary pointer to pass to the function.
	540	* @wait: If true, wait (atomically) until function has completed on other CPUs.
	541	*
	542	* Returns 0 on success, else a negative status code.
	543	*
	544	* If @wait is true, then returns once @func has returned; otherwise
	545	* it returns just before the target cpu calls @func.
	546	*
	547	* You must not call this function with disabled interrupts or from a
	548	* hardware interrupt handler or from a bottom half handler.
	549	*/
	550	static int
	551	native_smp_call_function_mask(cpumask_t mask,
	552	void (func)(void ), void *info,
	553	int wait)
	554	{
	555	struct call_data_struct data;
	556	cpumask_t allbutself;
	557	int cpus;
	558
	559	/* Can deadlock when called with interrupts disabled */
	560	WARN_ON(irqs_disabled());
	561
	562	/* Holding any lock stops cpus from going down. */
	563	spin_lock(&call_lock);
	564
	565	allbutself = cpu_online_map;
	566	cpu_clear(smp_processor_id(), allbutself);
	567
	568	cpus_and(mask, mask, allbutself);
	569	cpus = cpus_weight(mask);
	570
	571	if (!cpus) {
	572	spin_unlock(&call_lock);
	573	return 0;
	574	}
	575
	576	data.func = func;
	577	data.info = info;
	578	atomic_set(&data.started, 0);
	579	data.wait = wait;
	580	if (wait)
	581	atomic_set(&data.finished, 0);
	582
	583	call_data = &data;
	584	mb();
	585
	586	/* Send a message to other CPUs */
	587	if (cpus_equal(mask, allbutself))
	588	send_IPI_allbutself(CALL_FUNCTION_VECTOR);
	589	else
	590	send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
	591
	592	/* Wait for response */
	593	while (atomic_read(&data.started) != cpus)
	594	cpu_relax();
	595
	596	if (wait)
	597	while (atomic_read(&data.finished) != cpus)
	598	cpu_relax();
	599	spin_unlock(&call_lock);
	600
	601	return 0;
	602	}
	603
	604	static void stop_this_cpu (void * dummy)
	605	{
	606	local_irq_disable();
	607	/*
	608	* Remove this CPU:
	609	*/
	610	cpu_clear(smp_processor_id(), cpu_online_map);
	611	disable_local_APIC();
	612	if (cpu_data[smp_processor_id()].hlt_works_ok)
	613	for(;;) halt();
	614	for (;;);
	615	}
	616
	617	/*
	618	* this function calls the 'stop' function on all other CPUs in the system.
	619	*/
	620
	621	static void native_smp_send_stop(void)
	622	{
	623	/* Don't deadlock on the call lock in panic */
	624	int nolock = !spin_trylock(&call_lock);
	625	unsigned long flags;
	626
	627	local_irq_save(flags);
	628	__smp_call_function(stop_this_cpu, NULL, 0, 0);
	629	if (!nolock)
	630	spin_unlock(&call_lock);
	631	disable_local_APIC();
	632	local_irq_restore(flags);
	633	}
	634
	635	/*
	636	* Reschedule call back. Nothing to do,
	637	* all the work is done automatically when
	638	* we return from the interrupt.
	639	*/
	640	fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
	641	{
	642	ack_APIC_irq();
	643	}
	644
	645	fastcall void smp_call_function_interrupt(struct pt_regs *regs)
	646	{
	647	void (func) (void info) = call_data->func;
	648	void *info = call_data->info;
	649	int wait = call_data->wait;
	650
	651	ack_APIC_irq();
	652	/*
	653	* Notify initiating CPU that I've grabbed the data and am
	654	* about to execute the function
	655	*/
	656	mb();
	657	atomic_inc(&call_data->started);
	658	/*
	659	* At this point the info structure may be out of scope unless wait==1
	660	*/
	661	irq_enter();
	662	(*func)(info);
	663	irq_exit();
	664
	665	if (wait) {
	666	mb();
	667	atomic_inc(&call_data->finished);
	668	}
	669	}
	670
	671	static int convert_apicid_to_cpu(int apic_id)
	672	{
	673	int i;
	674
	675	for (i = 0; i < NR_CPUS; i++) {
	676	if (x86_cpu_to_apicid[i] == apic_id)
	677	return i;
	678	}
	679	return -1;
	680	}
	681
	682	int safe_smp_processor_id(void)
	683	{
	684	int apicid, cpuid;
	685
	686	if (!boot_cpu_has(X86_FEATURE_APIC))
	687	return 0;
	688
	689	apicid = hard_smp_processor_id();
	690	if (apicid == BAD_APICID)
	691	return 0;
	692
	693	cpuid = convert_apicid_to_cpu(apicid);
	694
	695	return cpuid >= 0 ? cpuid : 0;
	696	}
	697
	698	struct smp_ops smp_ops = {
	699	.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
	700	.smp_prepare_cpus = native_smp_prepare_cpus,
	701	.cpu_up = native_cpu_up,
	702	.smp_cpus_done = native_smp_cpus_done,
	703
	704	.smp_send_stop = native_smp_send_stop,
	705	.smp_send_reschedule = native_smp_send_reschedule,
	706	.smp_call_function_mask = native_smp_call_function_mask,
	707	};