-rw-r--r--  Documentation/cputopology.txt | 6
-rw-r--r--  arch/alpha/kernel/irq.c | 2
-rw-r--r--  arch/arm/kernel/irq.c | 18
-rw-r--r--  arch/arm/kernel/vmlinux.lds.S | 1
-rw-r--r--  arch/arm/oprofile/op_model_mpcore.c | 2
-rw-r--r--  arch/blackfin/kernel/irqchip.c | 5
-rw-r--r--  arch/ia64/include/asm/percpu.h | 4
-rw-r--r--  arch/ia64/include/asm/topology.h | 2
-rw-r--r--  arch/ia64/kernel/iosapic.c | 2
-rw-r--r--  arch/ia64/kernel/irq.c | 4
-rw-r--r--  arch/ia64/kernel/irq_ia64.c | 12
-rw-r--r--  arch/ia64/kernel/msi_ia64.c | 4
-rw-r--r--  arch/ia64/kernel/vmlinux.lds.S | 1
-rw-r--r--  arch/ia64/sn/kernel/msi_sn.c | 2
-rw-r--r--  arch/mips/include/asm/irq.h | 2
-rw-r--r--  arch/mips/kernel/irq-gic.c | 2
-rw-r--r--  arch/mips/kernel/smtc.c | 6
-rw-r--r--  arch/mips/mti-malta/malta-smtc.c | 5
-rw-r--r--  arch/mips/sgi-ip22/ip22-int.c | 2
-rw-r--r--  arch/mips/sgi-ip22/ip22-time.c | 2
-rw-r--r--  arch/mips/sibyte/bcm1480/smp.c | 3
-rw-r--r--  arch/mips/sibyte/sb1250/smp.c | 3
-rw-r--r--  arch/mn10300/kernel/mn10300-watchdog.c | 3
-rw-r--r--  arch/parisc/kernel/irq.c | 8
-rw-r--r--  arch/powerpc/kernel/irq.c | 2
-rw-r--r--  arch/powerpc/kernel/vmlinux.lds.S | 1
-rw-r--r--  arch/powerpc/platforms/pseries/xics.c | 5
-rw-r--r--  arch/powerpc/sysdev/mpic.c | 3
-rw-r--r--  arch/sparc/kernel/irq_64.c | 5
-rw-r--r--  arch/sparc/kernel/time_64.c | 2
-rw-r--r--  arch/x86/Kconfig | 32
-rw-r--r--  arch/x86/Kconfig.cpu | 10
-rw-r--r--  arch/x86/Kconfig.debug | 1
-rw-r--r--  arch/x86/Makefile | 2
-rw-r--r--  arch/x86/ia32/ia32entry.S | 8
-rw-r--r--  arch/x86/include/asm/apicnum.h | 12
-rw-r--r--  arch/x86/include/asm/cpu.h | 21
-rw-r--r--  arch/x86/include/asm/cpumask.h | 32
-rw-r--r--  arch/x86/include/asm/current.h | 24
-rw-r--r--  arch/x86/include/asm/genapic_32.h | 7
-rw-r--r--  arch/x86/include/asm/genapic_64.h | 6
-rw-r--r--  arch/x86/include/asm/hardirq.h | 49
-rw-r--r--  arch/x86/include/asm/hardirq_32.h | 30
-rw-r--r--  arch/x86/include/asm/hardirq_64.h | 25
-rw-r--r--  arch/x86/include/asm/io_apic.h | 26
-rw-r--r--  arch/x86/include/asm/irq_regs.h | 36
-rw-r--r--  arch/x86/include/asm/irq_regs_32.h | 31
-rw-r--r--  arch/x86/include/asm/irq_regs_64.h | 1
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 49
-rw-r--r--  arch/x86/include/asm/mach-default/entry_arch.h | 18
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 63
-rw-r--r--  arch/x86/include/asm/mmu_context_32.h | 55
-rw-r--r--  arch/x86/include/asm/mmu_context_64.h | 54
-rw-r--r--  arch/x86/include/asm/mpspec_def.h | 23
-rw-r--r--  arch/x86/include/asm/page.h | 3
-rw-r--r--  arch/x86/include/asm/page_64.h | 4
-rw-r--r--  arch/x86/include/asm/paravirt.h | 461
-rw-r--r--  arch/x86/include/asm/pda.h | 137
-rw-r--r--  arch/x86/include/asm/percpu.h | 169
-rw-r--r--  arch/x86/include/asm/pgtable.h | 38
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 1
-rw-r--r--  arch/x86/include/asm/processor.h | 22
-rw-r--r--  arch/x86/include/asm/setup.h | 1
-rw-r--r--  arch/x86/include/asm/smp.h | 50
-rw-r--r--  arch/x86/include/asm/stackprotector.h | 38
-rw-r--r--  arch/x86/include/asm/system.h | 23
-rw-r--r--  arch/x86/include/asm/thread_info.h | 20
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 17
-rw-r--r--  arch/x86/include/asm/topology.h | 31
-rw-r--r--  arch/x86/include/asm/trampoline.h | 1
-rw-r--r--  arch/x86/include/asm/uv/uv.h | 33
-rw-r--r--  arch/x86/include/asm/uv/uv_bau.h | 1
-rw-r--r--  arch/x86/kernel/Makefile | 14
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 96
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 1
-rw-r--r--  arch/x86/kernel/apic.c | 40
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 11
-rw-r--r--  arch/x86/kernel/cpu/common.c | 141
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 63
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 21
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 1
-rw-r--r--  arch/x86/kernel/crash.c | 2
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 35
-rw-r--r--  arch/x86/kernel/efi.c | 2
-rw-r--r--  arch/x86/kernel/efi_64.c | 1
-rw-r--r--  arch/x86/kernel/entry_32.S | 6
-rw-r--r--  arch/x86/kernel/entry_64.S | 45
-rw-r--r--  arch/x86/kernel/genapic_64.c | 2
-rw-r--r--  arch/x86/kernel/genx2apic_uv_x.c | 1
-rw-r--r--  arch/x86/kernel/head64.c | 23
-rw-r--r--  arch/x86/kernel/head_32.S | 6
-rw-r--r--  arch/x86/kernel/head_64.S | 19
-rw-r--r--  arch/x86/kernel/io_apic.c | 165
-rw-r--r--  arch/x86/kernel/irq.c | 6
-rw-r--r--  arch/x86/kernel/irq_32.c | 2
-rw-r--r--  arch/x86/kernel/irq_64.c | 9
-rw-r--r--  arch/x86/kernel/irqinit_32.c | 11
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 10
-rw-r--r--  arch/x86/kernel/module_32.c | 6
-rw-r--r--  arch/x86/kernel/module_64.c | 32
-rw-r--r--  arch/x86/kernel/mpparse.c | 142
-rw-r--r--  arch/x86/kernel/msr.c | 2
-rw-r--r--  arch/x86/kernel/nmi.c | 10
-rw-r--r--  arch/x86/kernel/paravirt.c | 55
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c | 12
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c | 15
-rw-r--r--  arch/x86/kernel/process_32.c | 6
-rw-r--r--  arch/x86/kernel/process_64.c | 43
-rw-r--r--  arch/x86/kernel/reboot.c | 1
-rw-r--r--  arch/x86/kernel/setup.c | 2
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 412
-rw-r--r--  arch/x86/kernel/smpboot.c | 76
-rw-r--r--  arch/x86/kernel/smpcommon.c | 30
-rw-r--r--  arch/x86/kernel/tlb_32.c | 256
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 70
-rw-r--r--  arch/x86/kernel/traps.c | 1
-rw-r--r--  arch/x86/kernel/vmi_32.c | 9
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S | 9
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S | 35
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 12
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 2
-rw-r--r--  arch/x86/lguest/boot.c | 13
-rw-r--r--  arch/x86/mach-voyager/setup.c | 1
-rw-r--r--  arch/x86/mach-voyager/voyager_smp.c | 9
-rw-r--r--  arch/x86/mm/Makefile | 2
-rw-r--r--  arch/x86/mm/fault.c | 446
-rw-r--r--  arch/x86/mm/init_32.c | 1
-rw-r--r--  arch/x86/mm/numa_64.c | 217
-rw-r--r--  arch/x86/mm/srat_64.c | 1
-rw-r--r--  arch/x86/mm/tlb.c (renamed from arch/x86/kernel/tlb_64.c) | 122
-rw-r--r--  arch/x86/xen/Makefile | 3
-rw-r--r--  arch/x86/xen/enlighten.c | 772
-rw-r--r--  arch/x86/xen/irq.c | 22
-rw-r--r--  arch/x86/xen/mmu.c | 753
-rw-r--r--  arch/x86/xen/mmu.h | 3
-rw-r--r--  arch/x86/xen/multicalls.h | 2
-rw-r--r--  arch/x86/xen/smp.c | 41
-rw-r--r--  arch/x86/xen/suspend.c | 1
-rw-r--r--  arch/x86/xen/xen-asm.S | 142
-rw-r--r--  arch/x86/xen/xen-asm.h | 12
-rw-r--r--  arch/x86/xen/xen-asm_32.S | 343
-rw-r--r--  arch/x86/xen/xen-asm_64.S | 252
-rw-r--r--  arch/x86/xen/xen-ops.h | 10
-rw-r--r--  drivers/base/cpu.c | 2
-rw-r--r--  drivers/base/topology.c | 33
-rw-r--r--  drivers/firmware/dcdbas.c | 12
-rw-r--r--  drivers/misc/Kconfig | 4
-rw-r--r--  drivers/misc/sgi-gru/gru.h | 2
-rw-r--r--  drivers/misc/sgi-xp/xp.h | 2
-rw-r--r--  drivers/misc/sgi-xp/xpc_main.c | 2
-rw-r--r--  drivers/net/sfc/efx.c | 17
-rw-r--r--  drivers/oprofile/buffer_sync.c | 22
-rw-r--r--  drivers/oprofile/buffer_sync.h | 4
-rw-r--r--  drivers/oprofile/oprof.c | 9
-rw-r--r--  drivers/pci/intr_remapping.c | 1
-rw-r--r--  drivers/xen/events.c | 26
-rw-r--r--  drivers/xen/manage.c | 2
-rw-r--r--  include/asm-generic/percpu.h | 52
-rw-r--r--  include/asm-generic/sections.h | 2
-rw-r--r--  include/asm-generic/vmlinux.lds.h | 55
-rw-r--r--  include/linux/interrupt.h | 1
-rw-r--r--  include/linux/irq.h | 86
-rw-r--r--  include/linux/irqnr.h | 1
-rw-r--r--  include/linux/magic.h | 1
-rw-r--r--  include/linux/percpu.h | 47
-rw-r--r--  include/linux/sched.h | 16
-rw-r--r--  include/linux/stackprotector.h | 16
-rw-r--r--  include/linux/topology.h | 6
-rw-r--r--  init/main.c | 7
-rw-r--r--  kernel/exit.c | 5
-rw-r--r--  kernel/fork.c | 5
-rw-r--r--  kernel/irq/chip.c | 5
-rw-r--r--  kernel/irq/handle.c | 57
-rw-r--r--  kernel/irq/internals.h | 7
-rw-r--r--  kernel/irq/manage.c | 12
-rw-r--r--  kernel/irq/migration.c | 12
-rw-r--r--  kernel/irq/numa_migrate.c | 19
-rw-r--r--  kernel/irq/proc.c | 4
-rw-r--r--  kernel/panic.c | 12
-rw-r--r--  kernel/sched.c | 7
-rw-r--r--  kernel/sched_rt.c | 32
-rw-r--r--  kernel/softirq.c | 5
-rw-r--r--  scripts/mod/modpost.c | 5
183 files changed, 3882 insertions(+), 3656 deletions(-)
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index 45932ec21cee..b41f3e58aefa 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -18,11 +18,11 @@ For an architecture to support this feature, it must define some of
18these macros in include/asm-XXX/topology.h: 18these macros in include/asm-XXX/topology.h:
19#define topology_physical_package_id(cpu) 19#define topology_physical_package_id(cpu)
20#define topology_core_id(cpu) 20#define topology_core_id(cpu)
21#define topology_thread_siblings(cpu) 21#define topology_thread_cpumask(cpu)
22#define topology_core_siblings(cpu) 22#define topology_core_cpumask(cpu)
23 23
24The type of **_id is int. 24The type of **_id is int.
25The type of siblings is cpumask_t. 25The type of siblings is (const) struct cpumask *.
26 26
27To be consistent on all architectures, include/linux/topology.h 27To be consistent on all architectures, include/linux/topology.h
28provides default definitions for any of the above macros that are 28provides default definitions for any of the above macros that are
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index 703731accda6..7bc7489223f3 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -55,7 +55,7 @@ int irq_select_affinity(unsigned int irq)
55 cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0); 55 cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
56 last_cpu = cpu; 56 last_cpu = cpu;
57 57
58 irq_desc[irq].affinity = cpumask_of_cpu(cpu); 58 cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
59 irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu)); 59 irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu));
60 return 0; 60 return 0;
61} 61}
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 363db186cb93..45eacb5a2ecd 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -104,6 +104,11 @@ static struct irq_desc bad_irq_desc = {
104 .lock = __SPIN_LOCK_UNLOCKED(bad_irq_desc.lock), 104 .lock = __SPIN_LOCK_UNLOCKED(bad_irq_desc.lock),
105}; 105};
106 106
107#ifdef CONFIG_CPUMASK_OFFSTACK
108/* We are not allocating bad_irq_desc.affinity or .pending_mask */
109#error "ARM architecture does not support CONFIG_CPUMASK_OFFSTACK."
110#endif
111
107/* 112/*
108 * do_IRQ handles all hardware IRQ's. Decoded IRQs should not 113 * do_IRQ handles all hardware IRQ's. Decoded IRQs should not
109 * come via this function. Instead, they should provide their 114 * come via this function. Instead, they should provide their
@@ -161,7 +166,7 @@ void __init init_IRQ(void)
161 irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE; 166 irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE;
162 167
163#ifdef CONFIG_SMP 168#ifdef CONFIG_SMP
164 bad_irq_desc.affinity = CPU_MASK_ALL; 169 cpumask_setall(bad_irq_desc.affinity);
165 bad_irq_desc.cpu = smp_processor_id(); 170 bad_irq_desc.cpu = smp_processor_id();
166#endif 171#endif
167 init_arch_irq(); 172 init_arch_irq();
@@ -191,15 +196,16 @@ void migrate_irqs(void)
191 struct irq_desc *desc = irq_desc + i; 196 struct irq_desc *desc = irq_desc + i;
192 197
193 if (desc->cpu == cpu) { 198 if (desc->cpu == cpu) {
194 unsigned int newcpu = any_online_cpu(desc->affinity); 199 unsigned int newcpu = cpumask_any_and(desc->affinity,
195 200 cpu_online_mask);
196 if (newcpu == NR_CPUS) { 201 if (newcpu >= nr_cpu_ids) {
197 if (printk_ratelimit()) 202 if (printk_ratelimit())
198 printk(KERN_INFO "IRQ%u no longer affine to CPU%u\n", 203 printk(KERN_INFO "IRQ%u no longer affine to CPU%u\n",
199 i, cpu); 204 i, cpu);
200 205
201 cpus_setall(desc->affinity); 206 cpumask_setall(desc->affinity);
202 newcpu = any_online_cpu(desc->affinity); 207 newcpu = cpumask_any_and(desc->affinity,
208 cpu_online_mask);
203 } 209 }
204 210
205 route_irq(desc, i, newcpu); 211 route_irq(desc, i, newcpu);
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 00216071eaf7..85598f7da407 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -65,6 +65,7 @@ SECTIONS
65#endif 65#endif
66 . = ALIGN(4096); 66 . = ALIGN(4096);
67 __per_cpu_start = .; 67 __per_cpu_start = .;
68 *(.data.percpu.page_aligned)
68 *(.data.percpu) 69 *(.data.percpu)
69 *(.data.percpu.shared_aligned) 70 *(.data.percpu.shared_aligned)
70 __per_cpu_end = .; 71 __per_cpu_end = .;
diff --git a/arch/arm/oprofile/op_model_mpcore.c b/arch/arm/oprofile/op_model_mpcore.c
index 6d6bd5899240..853d42bb8682 100644
--- a/arch/arm/oprofile/op_model_mpcore.c
+++ b/arch/arm/oprofile/op_model_mpcore.c
@@ -263,7 +263,7 @@ static void em_route_irq(int irq, unsigned int cpu)
263 const struct cpumask *mask = cpumask_of(cpu); 263 const struct cpumask *mask = cpumask_of(cpu);
264 264
265 spin_lock_irq(&desc->lock); 265 spin_lock_irq(&desc->lock);
266 desc->affinity = *mask; 266 cpumask_copy(desc->affinity, mask);
267 desc->chip->set_affinity(irq, mask); 267 desc->chip->set_affinity(irq, mask);
268 spin_unlock_irq(&desc->lock); 268 spin_unlock_irq(&desc->lock);
269} 269}
diff --git a/arch/blackfin/kernel/irqchip.c b/arch/blackfin/kernel/irqchip.c
index 75724eee6494..23e9aa080710 100644
--- a/arch/blackfin/kernel/irqchip.c
+++ b/arch/blackfin/kernel/irqchip.c
@@ -70,6 +70,11 @@ static struct irq_desc bad_irq_desc = {
70#endif 70#endif
71}; 71};
72 72
73#ifdef CONFIG_CPUMASK_OFFSTACK
74/* We are not allocating a variable-sized bad_irq_desc.affinity */
75#error "Blackfin architecture does not support CONFIG_CPUMASK_OFFSTACK."
76#endif
77
73int show_interrupts(struct seq_file *p, void *v) 78int show_interrupts(struct seq_file *p, void *v)
74{ 79{
75 int i = *(loff_t *) v, j; 80 int i = *(loff_t *) v, j;
diff --git a/arch/ia64/include/asm/percpu.h b/arch/ia64/include/asm/percpu.h
index 77f30b664b4e..30cf46534dd2 100644
--- a/arch/ia64/include/asm/percpu.h
+++ b/arch/ia64/include/asm/percpu.h
@@ -27,12 +27,12 @@ extern void *per_cpu_init(void);
27 27
28#else /* ! SMP */ 28#else /* ! SMP */
29 29
30#define PER_CPU_ATTRIBUTES __attribute__((__section__(".data.percpu")))
31
32#define per_cpu_init() (__phys_per_cpu_start) 30#define per_cpu_init() (__phys_per_cpu_start)
33 31
34#endif /* SMP */ 32#endif /* SMP */
35 33
34#define PER_CPU_BASE_SECTION ".data.percpu"
35
36/* 36/*
37 * Be extremely careful when taking the address of this variable! Due to virtual 37 * Be extremely careful when taking the address of this variable! Due to virtual
38 * remapping, it is different from the canonical address returned by __get_cpu_var(var)! 38 * remapping, it is different from the canonical address returned by __get_cpu_var(var)!
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 32f3af1641c5..3193f4417e16 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -84,7 +84,7 @@ void build_cpu_to_node_map(void);
84 .child = NULL, \ 84 .child = NULL, \
85 .groups = NULL, \ 85 .groups = NULL, \
86 .min_interval = 8, \ 86 .min_interval = 8, \
87 .max_interval = 8*(min(num_online_cpus(), 32)), \ 87 .max_interval = 8*(min(num_online_cpus(), 32U)), \
88 .busy_factor = 64, \ 88 .busy_factor = 64, \
89 .imbalance_pct = 125, \ 89 .imbalance_pct = 125, \
90 .cache_nice_tries = 2, \ 90 .cache_nice_tries = 2, \
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 5cfd3d91001a..006ad366a454 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -880,7 +880,7 @@ iosapic_unregister_intr (unsigned int gsi)
880 if (iosapic_intr_info[irq].count == 0) { 880 if (iosapic_intr_info[irq].count == 0) {
881#ifdef CONFIG_SMP 881#ifdef CONFIG_SMP
882 /* Clear affinity */ 882 /* Clear affinity */
883 cpus_setall(idesc->affinity); 883 cpumask_setall(idesc->affinity);
884#endif 884#endif
885 /* Clear the interrupt information */ 885 /* Clear the interrupt information */
886 iosapic_intr_info[irq].dest = 0; 886 iosapic_intr_info[irq].dest = 0;
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index a58f64ca9f0e..226233a6fa19 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -103,7 +103,7 @@ static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
103void set_irq_affinity_info (unsigned int irq, int hwid, int redir) 103void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
104{ 104{
105 if (irq < NR_IRQS) { 105 if (irq < NR_IRQS) {
106 cpumask_copy(&irq_desc[irq].affinity, 106 cpumask_copy(irq_desc[irq].affinity,
107 cpumask_of(cpu_logical_id(hwid))); 107 cpumask_of(cpu_logical_id(hwid)));
108 irq_redir[irq] = (char) (redir & 0xff); 108 irq_redir[irq] = (char) (redir & 0xff);
109 } 109 }
@@ -148,7 +148,7 @@ static void migrate_irqs(void)
148 if (desc->status == IRQ_PER_CPU) 148 if (desc->status == IRQ_PER_CPU)
149 continue; 149 continue;
150 150
151 if (cpumask_any_and(&irq_desc[irq].affinity, cpu_online_mask) 151 if (cpumask_any_and(irq_desc[irq].affinity, cpu_online_mask)
152 >= nr_cpu_ids) { 152 >= nr_cpu_ids) {
153 /* 153 /*
154 * Save it for phase 2 processing 154 * Save it for phase 2 processing
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 28d3d483db92..927ad027820c 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -493,11 +493,13 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
493 saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); 493 saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
494 ia64_srlz_d(); 494 ia64_srlz_d();
495 while (vector != IA64_SPURIOUS_INT_VECTOR) { 495 while (vector != IA64_SPURIOUS_INT_VECTOR) {
496 struct irq_desc *desc = irq_to_desc(vector);
497
496 if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) { 498 if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
497 smp_local_flush_tlb(); 499 smp_local_flush_tlb();
498 kstat_this_cpu.irqs[vector]++; 500 kstat_incr_irqs_this_cpu(vector, desc);
499 } else if (unlikely(IS_RESCHEDULE(vector))) 501 } else if (unlikely(IS_RESCHEDULE(vector)))
500 kstat_this_cpu.irqs[vector]++; 502 kstat_incr_irqs_this_cpu(vector, desc);
501 else { 503 else {
502 int irq = local_vector_to_irq(vector); 504 int irq = local_vector_to_irq(vector);
503 505
@@ -551,11 +553,13 @@ void ia64_process_pending_intr(void)
551 * Perform normal interrupt style processing 553 * Perform normal interrupt style processing
552 */ 554 */
553 while (vector != IA64_SPURIOUS_INT_VECTOR) { 555 while (vector != IA64_SPURIOUS_INT_VECTOR) {
556 struct irq_desc *desc = irq_to_desc(vector);
557
554 if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) { 558 if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
555 smp_local_flush_tlb(); 559 smp_local_flush_tlb();
556 kstat_this_cpu.irqs[vector]++; 560 kstat_incr_irqs_this_cpu(vector, desc);
557 } else if (unlikely(IS_RESCHEDULE(vector))) 561 } else if (unlikely(IS_RESCHEDULE(vector)))
558 kstat_this_cpu.irqs[vector]++; 562 kstat_incr_irqs_this_cpu(vector, desc);
559 else { 563 else {
560 struct pt_regs *old_regs = set_irq_regs(NULL); 564 struct pt_regs *old_regs = set_irq_regs(NULL);
561 int irq = local_vector_to_irq(vector); 565 int irq = local_vector_to_irq(vector);
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 890339339035..dcb6b7c51ea7 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -75,7 +75,7 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
75 msg.data = data; 75 msg.data = data;
76 76
77 write_msi_msg(irq, &msg); 77 write_msi_msg(irq, &msg);
78 irq_desc[irq].affinity = cpumask_of_cpu(cpu); 78 cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
79} 79}
80#endif /* CONFIG_SMP */ 80#endif /* CONFIG_SMP */
81 81
@@ -187,7 +187,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
187 msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu)); 187 msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
188 188
189 dmar_msi_write(irq, &msg); 189 dmar_msi_write(irq, &msg);
190 irq_desc[irq].affinity = *mask; 190 cpumask_copy(irq_desc[irq].affinity, mask);
191} 191}
192#endif /* CONFIG_SMP */ 192#endif /* CONFIG_SMP */
193 193
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 10a7d47e8510..f45e4e508eca 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -219,6 +219,7 @@ SECTIONS
219 .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET) 219 .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET)
220 { 220 {
221 __per_cpu_start = .; 221 __per_cpu_start = .;
222 *(.data.percpu.page_aligned)
222 *(.data.percpu) 223 *(.data.percpu)
223 *(.data.percpu.shared_aligned) 224 *(.data.percpu.shared_aligned)
224 __per_cpu_end = .; 225 __per_cpu_end = .;
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index ca553b0429ce..81e428943d73 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -205,7 +205,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
205 msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff); 205 msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff);
206 206
207 write_msi_msg(irq, &msg); 207 write_msi_msg(irq, &msg);
208 irq_desc[irq].affinity = *cpu_mask; 208 cpumask_copy(irq_desc[irq].affinity, cpu_mask);
209} 209}
210#endif /* CONFIG_SMP */ 210#endif /* CONFIG_SMP */
211 211
diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h
index abc62aa744ac..3214ade02d10 100644
--- a/arch/mips/include/asm/irq.h
+++ b/arch/mips/include/asm/irq.h
@@ -66,7 +66,7 @@ extern void smtc_forward_irq(unsigned int irq);
66 */ 66 */
67#define IRQ_AFFINITY_HOOK(irq) \ 67#define IRQ_AFFINITY_HOOK(irq) \
68do { \ 68do { \
69 if (!cpu_isset(smp_processor_id(), irq_desc[irq].affinity)) { \ 69 if (!cpumask_test_cpu(smp_processor_id(), irq_desc[irq].affinity)) {\
70 smtc_forward_irq(irq); \ 70 smtc_forward_irq(irq); \
71 irq_exit(); \ 71 irq_exit(); \
72 return; \ 72 return; \
diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c
index 494a49a317e9..87deb8f6c458 100644
--- a/arch/mips/kernel/irq-gic.c
+++ b/arch/mips/kernel/irq-gic.c
@@ -187,7 +187,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
187 set_bit(irq, pcpu_masks[first_cpu(tmp)].pcpu_mask); 187 set_bit(irq, pcpu_masks[first_cpu(tmp)].pcpu_mask);
188 188
189 } 189 }
190 irq_desc[irq].affinity = *cpumask; 190 cpumask_copy(irq_desc[irq].affinity, cpumask);
191 spin_unlock_irqrestore(&gic_lock, flags); 191 spin_unlock_irqrestore(&gic_lock, flags);
192 192
193} 193}
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index b6cca01ff82b..5f5af7d4c890 100644
--- a/arch/mips/kernel/smtc.c
+++ b/arch/mips/kernel/smtc.c
@@ -686,7 +686,7 @@ void smtc_forward_irq(unsigned int irq)
686 * and efficiency, we just pick the easiest one to find. 686 * and efficiency, we just pick the easiest one to find.
687 */ 687 */
688 688
689 target = first_cpu(irq_desc[irq].affinity); 689 target = cpumask_first(irq_desc[irq].affinity);
690 690
691 /* 691 /*
692 * We depend on the platform code to have correctly processed 692 * We depend on the platform code to have correctly processed
@@ -921,11 +921,13 @@ void ipi_decode(struct smtc_ipi *pipi)
921 struct clock_event_device *cd; 921 struct clock_event_device *cd;
922 void *arg_copy = pipi->arg; 922 void *arg_copy = pipi->arg;
923 int type_copy = pipi->type; 923 int type_copy = pipi->type;
924 int irq = MIPS_CPU_IRQ_BASE + 1;
925
924 smtc_ipi_nq(&freeIPIq, pipi); 926 smtc_ipi_nq(&freeIPIq, pipi);
925 switch (type_copy) { 927 switch (type_copy) {
926 case SMTC_CLOCK_TICK: 928 case SMTC_CLOCK_TICK:
927 irq_enter(); 929 irq_enter();
928 kstat_this_cpu.irqs[MIPS_CPU_IRQ_BASE + 1]++; 930 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
929 cd = &per_cpu(mips_clockevent_device, cpu); 931 cd = &per_cpu(mips_clockevent_device, cpu);
930 cd->event_handler(cd); 932 cd->event_handler(cd);
931 irq_exit(); 933 irq_exit();
diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c
index aabd7274507b..5ba31888fefb 100644
--- a/arch/mips/mti-malta/malta-smtc.c
+++ b/arch/mips/mti-malta/malta-smtc.c
@@ -116,7 +116,7 @@ struct plat_smp_ops msmtc_smp_ops = {
116 116
117void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) 117void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
118{ 118{
119 cpumask_t tmask = *affinity; 119 cpumask_t tmask;
120 int cpu = 0; 120 int cpu = 0;
121 void smtc_set_irq_affinity(unsigned int irq, cpumask_t aff); 121 void smtc_set_irq_affinity(unsigned int irq, cpumask_t aff);
122 122
@@ -139,11 +139,12 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
139 * be made to forward to an offline "CPU". 139 * be made to forward to an offline "CPU".
140 */ 140 */
141 141
142 cpumask_copy(&tmask, affinity);
142 for_each_cpu(cpu, affinity) { 143 for_each_cpu(cpu, affinity) {
143 if ((cpu_data[cpu].vpe_id != 0) || !cpu_online(cpu)) 144 if ((cpu_data[cpu].vpe_id != 0) || !cpu_online(cpu))
144 cpu_clear(cpu, tmask); 145 cpu_clear(cpu, tmask);
145 } 146 }
146 irq_desc[irq].affinity = tmask; 147 cpumask_copy(irq_desc[irq].affinity, &tmask);
147 148
148 if (cpus_empty(tmask)) 149 if (cpus_empty(tmask))
149 /* 150 /*
diff --git a/arch/mips/sgi-ip22/ip22-int.c b/arch/mips/sgi-ip22/ip22-int.c
index f8b18af141a1..0ecd5fe9486e 100644
--- a/arch/mips/sgi-ip22/ip22-int.c
+++ b/arch/mips/sgi-ip22/ip22-int.c
@@ -155,7 +155,7 @@ static void indy_buserror_irq(void)
155 int irq = SGI_BUSERR_IRQ; 155 int irq = SGI_BUSERR_IRQ;
156 156
157 irq_enter(); 157 irq_enter();
158 kstat_this_cpu.irqs[irq]++; 158 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
159 ip22_be_interrupt(irq); 159 ip22_be_interrupt(irq);
160 irq_exit(); 160 irq_exit();
161} 161}
diff --git a/arch/mips/sgi-ip22/ip22-time.c b/arch/mips/sgi-ip22/ip22-time.c
index 3dcb27ec0c53..c8f7d2328b24 100644
--- a/arch/mips/sgi-ip22/ip22-time.c
+++ b/arch/mips/sgi-ip22/ip22-time.c
@@ -122,7 +122,7 @@ void indy_8254timer_irq(void)
122 char c; 122 char c;
123 123
124 irq_enter(); 124 irq_enter();
125 kstat_this_cpu.irqs[irq]++; 125 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
126 printk(KERN_ALERT "Oops, got 8254 interrupt.\n"); 126 printk(KERN_ALERT "Oops, got 8254 interrupt.\n");
127 ArcRead(0, &c, 1, &cnt); 127 ArcRead(0, &c, 1, &cnt);
128 ArcEnterInteractiveMode(); 128 ArcEnterInteractiveMode();
diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c
index dddfda8e8294..314691648c97 100644
--- a/arch/mips/sibyte/bcm1480/smp.c
+++ b/arch/mips/sibyte/bcm1480/smp.c
@@ -178,9 +178,10 @@ struct plat_smp_ops bcm1480_smp_ops = {
178void bcm1480_mailbox_interrupt(void) 178void bcm1480_mailbox_interrupt(void)
179{ 179{
180 int cpu = smp_processor_id(); 180 int cpu = smp_processor_id();
181 int irq = K_BCM1480_INT_MBOX_0_0;
181 unsigned int action; 182 unsigned int action;
182 183
183 kstat_this_cpu.irqs[K_BCM1480_INT_MBOX_0_0]++; 184 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
184 /* Load the mailbox register to figure out what we're supposed to do */ 185 /* Load the mailbox register to figure out what we're supposed to do */
185 action = (__raw_readq(mailbox_0_regs[cpu]) >> 48) & 0xffff; 186 action = (__raw_readq(mailbox_0_regs[cpu]) >> 48) & 0xffff;
186 187
diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c
index 5950a288a7da..cad14003b84f 100644
--- a/arch/mips/sibyte/sb1250/smp.c
+++ b/arch/mips/sibyte/sb1250/smp.c
@@ -166,9 +166,10 @@ struct plat_smp_ops sb_smp_ops = {
166void sb1250_mailbox_interrupt(void) 166void sb1250_mailbox_interrupt(void)
167{ 167{
168 int cpu = smp_processor_id(); 168 int cpu = smp_processor_id();
169 int irq = K_INT_MBOX_0;
169 unsigned int action; 170 unsigned int action;
170 171
171 kstat_this_cpu.irqs[K_INT_MBOX_0]++; 172 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
172 /* Load the mailbox register to figure out what we're supposed to do */ 173 /* Load the mailbox register to figure out what we're supposed to do */
173 action = (____raw_readq(mailbox_regs[cpu]) >> 48) & 0xffff; 174 action = (____raw_readq(mailbox_regs[cpu]) >> 48) & 0xffff;
174 175
diff --git a/arch/mn10300/kernel/mn10300-watchdog.c b/arch/mn10300/kernel/mn10300-watchdog.c
index 10811e981d20..2e370d88a87a 100644
--- a/arch/mn10300/kernel/mn10300-watchdog.c
+++ b/arch/mn10300/kernel/mn10300-watchdog.c
@@ -130,6 +130,7 @@ void watchdog_interrupt(struct pt_regs *regs, enum exception_code excep)
130 * the stack NMI-atomically, it's safe to use smp_processor_id(). 130 * the stack NMI-atomically, it's safe to use smp_processor_id().
131 */ 131 */
132 int sum, cpu = smp_processor_id(); 132 int sum, cpu = smp_processor_id();
133 int irq = NMIIRQ;
133 u8 wdt, tmp; 134 u8 wdt, tmp;
134 135
135 wdt = WDCTR & ~WDCTR_WDCNE; 136 wdt = WDCTR & ~WDCTR_WDCNE;
@@ -138,7 +139,7 @@ void watchdog_interrupt(struct pt_regs *regs, enum exception_code excep)
138 NMICR = NMICR_WDIF; 139 NMICR = NMICR_WDIF;
139 140
140 nmi_count(cpu)++; 141 nmi_count(cpu)++;
141 kstat_this_cpu.irqs[NMIIRQ]++; 142 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
142 sum = irq_stat[cpu].__irq_count; 143 sum = irq_stat[cpu].__irq_count;
143 144
144 if (last_irq_sums[cpu] == sum) { 145 if (last_irq_sums[cpu] == sum) {
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index ac2c822928c7..49482806863f 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -120,7 +120,7 @@ int cpu_check_affinity(unsigned int irq, cpumask_t *dest)
120 if (CHECK_IRQ_PER_CPU(irq)) { 120 if (CHECK_IRQ_PER_CPU(irq)) {
121 /* Bad linux design decision. The mask has already 121 /* Bad linux design decision. The mask has already
122 * been set; we must reset it */ 122 * been set; we must reset it */
123 irq_desc[irq].affinity = CPU_MASK_ALL; 123 cpumask_setall(irq_desc[irq].affinity);
124 return -EINVAL; 124 return -EINVAL;
125 } 125 }
126 126
@@ -136,7 +136,7 @@ static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
136 if (cpu_check_affinity(irq, dest)) 136 if (cpu_check_affinity(irq, dest))
137 return; 137 return;
138 138
139 irq_desc[irq].affinity = *dest; 139 cpumask_copy(irq_desc[irq].affinity, dest);
140} 140}
141#endif 141#endif
142 142
@@ -295,7 +295,7 @@ int txn_alloc_irq(unsigned int bits_wide)
295unsigned long txn_affinity_addr(unsigned int irq, int cpu) 295unsigned long txn_affinity_addr(unsigned int irq, int cpu)
296{ 296{
297#ifdef CONFIG_SMP 297#ifdef CONFIG_SMP
298 irq_desc[irq].affinity = cpumask_of_cpu(cpu); 298 cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
299#endif 299#endif
300 300
301 return per_cpu(cpu_data, cpu).txn_addr; 301 return per_cpu(cpu_data, cpu).txn_addr;
@@ -352,7 +352,7 @@ void do_cpu_irq_mask(struct pt_regs *regs)
352 irq = eirr_to_irq(eirr_val); 352 irq = eirr_to_irq(eirr_val);
353 353
354#ifdef CONFIG_SMP 354#ifdef CONFIG_SMP
355 dest = irq_desc[irq].affinity; 355 cpumask_copy(&dest, irq_desc[irq].affinity);
356 if (CHECK_IRQ_PER_CPU(irq_desc[irq].status) && 356 if (CHECK_IRQ_PER_CPU(irq_desc[irq].status) &&
357 !cpu_isset(smp_processor_id(), dest)) { 357 !cpu_isset(smp_processor_id(), dest)) {
358 int cpu = first_cpu(dest); 358 int cpu = first_cpu(dest);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 23b8b5e36f98..ad1e5ac721d8 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -231,7 +231,7 @@ void fixup_irqs(cpumask_t map)
231 if (irq_desc[irq].status & IRQ_PER_CPU) 231 if (irq_desc[irq].status & IRQ_PER_CPU)
232 continue; 232 continue;
233 233
234 cpus_and(mask, irq_desc[irq].affinity, map); 234 cpumask_and(&mask, irq_desc[irq].affinity, &map);
235 if (any_online_cpu(mask) == NR_CPUS) { 235 if (any_online_cpu(mask) == NR_CPUS) {
236 printk("Breaking affinity for irq %i\n", irq); 236 printk("Breaking affinity for irq %i\n", irq);
237 mask = map; 237 mask = map;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 161b9b9691f0..295ccc5e86b1 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -184,6 +184,7 @@ SECTIONS
184 . = ALIGN(PAGE_SIZE); 184 . = ALIGN(PAGE_SIZE);
185 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { 185 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
186 __per_cpu_start = .; 186 __per_cpu_start = .;
187 *(.data.percpu.page_aligned)
187 *(.data.percpu) 188 *(.data.percpu)
188 *(.data.percpu.shared_aligned) 189 *(.data.percpu.shared_aligned)
189 __per_cpu_end = .; 190 __per_cpu_end = .;
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 84e058f1e1cc..80b513449f4c 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -153,9 +153,10 @@ static int get_irq_server(unsigned int virq, unsigned int strict_check)
153{ 153{
154 int server; 154 int server;
155 /* For the moment only implement delivery to all cpus or one cpu */ 155 /* For the moment only implement delivery to all cpus or one cpu */
156 cpumask_t cpumask = irq_desc[virq].affinity; 156 cpumask_t cpumask;
157 cpumask_t tmp = CPU_MASK_NONE; 157 cpumask_t tmp = CPU_MASK_NONE;
158 158
159 cpumask_copy(&cpumask, irq_desc[virq].affinity);
159 if (!distribute_irqs) 160 if (!distribute_irqs)
160 return default_server; 161 return default_server;
161 162
@@ -869,7 +870,7 @@ void xics_migrate_irqs_away(void)
869 virq, cpu); 870 virq, cpu);
870 871
871 /* Reset affinity to all cpus */ 872 /* Reset affinity to all cpus */
872 irq_desc[virq].affinity = CPU_MASK_ALL; 873 cpumask_setall(irq_desc[virq].affinity);
873 desc->chip->set_affinity(virq, cpu_all_mask); 874 desc->chip->set_affinity(virq, cpu_all_mask);
874unlock: 875unlock:
875 spin_unlock_irqrestore(&desc->lock, flags); 876 spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index a35297dbac28..532e205303a2 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -566,9 +566,10 @@ static void __init mpic_scan_ht_pics(struct mpic *mpic)
566#ifdef CONFIG_SMP 566#ifdef CONFIG_SMP
567static int irq_choose_cpu(unsigned int virt_irq) 567static int irq_choose_cpu(unsigned int virt_irq)
568{ 568{
569 cpumask_t mask = irq_desc[virt_irq].affinity; 569 cpumask_t mask;
570 int cpuid; 570 int cpuid;
571 571
572 cpumask_copy(&mask, irq_desc[virt_irq].affinity);
572 if (cpus_equal(mask, CPU_MASK_ALL)) { 573 if (cpus_equal(mask, CPU_MASK_ALL)) {
573 static int irq_rover; 574 static int irq_rover;
574 static DEFINE_SPINLOCK(irq_rover_lock); 575 static DEFINE_SPINLOCK(irq_rover_lock);
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index e289376198eb..3d2c6baae96b 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -252,9 +252,10 @@ struct irq_handler_data {
252#ifdef CONFIG_SMP 252#ifdef CONFIG_SMP
253static int irq_choose_cpu(unsigned int virt_irq) 253static int irq_choose_cpu(unsigned int virt_irq)
254{ 254{
255 cpumask_t mask = irq_desc[virt_irq].affinity; 255 cpumask_t mask;
256 int cpuid; 256 int cpuid;
257 257
258 cpumask_copy(&mask, irq_desc[virt_irq].affinity);
258 if (cpus_equal(mask, CPU_MASK_ALL)) { 259 if (cpus_equal(mask, CPU_MASK_ALL)) {
259 static int irq_rover; 260 static int irq_rover;
260 static DEFINE_SPINLOCK(irq_rover_lock); 261 static DEFINE_SPINLOCK(irq_rover_lock);
@@ -796,7 +797,7 @@ void fixup_irqs(void)
796 !(irq_desc[irq].status & IRQ_PER_CPU)) { 797 !(irq_desc[irq].status & IRQ_PER_CPU)) {
797 if (irq_desc[irq].chip->set_affinity) 798 if (irq_desc[irq].chip->set_affinity)
798 irq_desc[irq].chip->set_affinity(irq, 799 irq_desc[irq].chip->set_affinity(irq,
799 &irq_desc[irq].affinity); 800 irq_desc[irq].affinity);
800 } 801 }
801 spin_unlock_irqrestore(&irq_desc[irq].lock, flags); 802 spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
802 } 803 }
diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
index 2db3c2229b95..db310aa00183 100644
--- a/arch/sparc/kernel/time_64.c
+++ b/arch/sparc/kernel/time_64.c
@@ -729,7 +729,7 @@ void timer_interrupt(int irq, struct pt_regs *regs)
729 729
730 irq_enter(); 730 irq_enter();
731 731
732 kstat_this_cpu.irqs[0]++; 732 kstat_incr_irqs_this_cpu(0, irq_to_desc(0));
733 733
734 if (unlikely(!evt->event_handler)) { 734 if (unlikely(!evt->event_handler)) {
735 printk(KERN_WARNING 735 printk(KERN_WARNING
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 73f7fe8fd4d1..d6218e6c9824 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -133,7 +133,7 @@ config ARCH_HAS_CACHE_LINE_SIZE
133 def_bool y 133 def_bool y
134 134
135config HAVE_SETUP_PER_CPU_AREA 135config HAVE_SETUP_PER_CPU_AREA
136 def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER) 136 def_bool y
137 137
138config HAVE_CPUMASK_OF_CPU_MAP 138config HAVE_CPUMASK_OF_CPU_MAP
139 def_bool X86_64_SMP 139 def_bool X86_64_SMP
@@ -391,6 +391,13 @@ config X86_RDC321X
391 as R-8610-(G). 391 as R-8610-(G).
392 If you don't have one of these chips, you should say N here. 392 If you don't have one of these chips, you should say N here.
393 393
394config X86_UV
395 bool "SGI Ultraviolet"
396 depends on X86_64
397 help
398 This option is needed in order to support SGI Ultraviolet systems.
399 If you don't have one of these, you should say N here.
400
394config SCHED_OMIT_FRAME_POINTER 401config SCHED_OMIT_FRAME_POINTER
395 def_bool y 402 def_bool y
396 prompt "Single-depth WCHAN output" 403 prompt "Single-depth WCHAN output"
@@ -1340,13 +1347,17 @@ config SECCOMP
1340 1347
1341 If unsure, say Y. Only embedded should say N here. 1348 If unsure, say Y. Only embedded should say N here.
1342 1349
1350config CC_STACKPROTECTOR_ALL
1351 bool
1352
1343config CC_STACKPROTECTOR 1353config CC_STACKPROTECTOR
1344 bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" 1354 bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
1345 depends on X86_64 && EXPERIMENTAL && BROKEN 1355 depends on X86_64
1356 select CC_STACKPROTECTOR_ALL
1346 help 1357 help
1347 This option turns on the -fstack-protector GCC feature. This 1358 This option turns on the -fstack-protector GCC feature. This
1348 feature puts, at the beginning of critical functions, a canary 1359 feature puts, at the beginning of functions, a canary value on
1349 value on the stack just before the return address, and validates 1360 the stack just before the return address, and validates
1350 the value just before actually returning. Stack based buffer 1361 the value just before actually returning. Stack based buffer
1351 overflows (that need to overwrite this return address) now also 1362 overflows (that need to overwrite this return address) now also
1352 overwrite the canary, which gets detected and the attack is then 1363 overwrite the canary, which gets detected and the attack is then
@@ -1354,15 +1365,8 @@ config CC_STACKPROTECTOR
1354 1365
1355 This feature requires gcc version 4.2 or above, or a distribution 1366 This feature requires gcc version 4.2 or above, or a distribution
1356 gcc with the feature backported. Older versions are automatically 1367 gcc with the feature backported. Older versions are automatically
1357 detected and for those versions, this configuration option is ignored. 1368 detected and for those versions, this configuration option is
1358 1369 ignored. (and a warning is printed during bootup)
1359config CC_STACKPROTECTOR_ALL
1360 bool "Use stack-protector for all functions"
1361 depends on CC_STACKPROTECTOR
1362 help
1363 Normally, GCC only inserts the canary value protection for
1364 functions that use large-ish on-stack buffers. By enabling
1365 this option, GCC will be asked to do this for ALL functions.
1366 1370
1367source kernel/Kconfig.hz 1371source kernel/Kconfig.hz
1368 1372
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 8078955845ae..8eb50ba9161e 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -292,25 +292,23 @@ config X86_CPU
292# Define implied options from the CPU selection here 292# Define implied options from the CPU selection here
293config X86_L1_CACHE_BYTES 293config X86_L1_CACHE_BYTES
294 int 294 int
295 default "128" if GENERIC_CPU || MPSC 295 default "128" if MPSC
296 default "64" if MK8 || MCORE2 296 default "64" if GENERIC_CPU || MK8 || MCORE2 || X86_32
297 depends on X86_64
298 297
299config X86_INTERNODE_CACHE_BYTES 298config X86_INTERNODE_CACHE_BYTES
300 int 299 int
301 default "4096" if X86_VSMP 300 default "4096" if X86_VSMP
302 default X86_L1_CACHE_BYTES if !X86_VSMP 301 default X86_L1_CACHE_BYTES if !X86_VSMP
303 depends on X86_64
304 302
305config X86_CMPXCHG 303config X86_CMPXCHG
306 def_bool X86_64 || (X86_32 && !M386) 304 def_bool X86_64 || (X86_32 && !M386)
307 305
308config X86_L1_CACHE_SHIFT 306config X86_L1_CACHE_SHIFT
309 int 307 int
310 default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC 308 default "7" if MPENTIUM4 || MPSC
311 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 309 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
312 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 310 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
313 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 311 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 || X86_GENERIC || GENERIC_CPU
314 312
315config X86_XADD 313config X86_XADD
316 def_bool y 314 def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 10d6cc3fd052..28f111461ca8 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -117,6 +117,7 @@ config DEBUG_RODATA
117config DEBUG_RODATA_TEST 117config DEBUG_RODATA_TEST
118 bool "Testcase for the DEBUG_RODATA feature" 118 bool "Testcase for the DEBUG_RODATA feature"
119 depends on DEBUG_RODATA 119 depends on DEBUG_RODATA
120 default y
120 help 121 help
121 This option enables a testcase for the DEBUG_RODATA 122 This option enables a testcase for the DEBUG_RODATA
122 feature as well as for the change_page_attr() infrastructure. 123 feature as well as for the change_page_attr() infrastructure.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index d1a47adb5aec..cacee981d166 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -73,7 +73,7 @@ else
73 73
74 stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh 74 stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh
75 stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \ 75 stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \
76 "$(CC)" -fstack-protector ) 76 "$(CC)" "-fstack-protector -DGCC_HAS_SP" )
77 stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \ 77 stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \
78 "$(CC)" -fstack-protector-all ) 78 "$(CC)" -fstack-protector-all )
79 79
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 5a0d76dc56a4..097a6b64c24d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -112,8 +112,8 @@ ENTRY(ia32_sysenter_target)
112 CFI_DEF_CFA rsp,0 112 CFI_DEF_CFA rsp,0
113 CFI_REGISTER rsp,rbp 113 CFI_REGISTER rsp,rbp
114 SWAPGS_UNSAFE_STACK 114 SWAPGS_UNSAFE_STACK
115 movq %gs:pda_kernelstack, %rsp 115 movq PER_CPU_VAR(kernel_stack), %rsp
116 addq $(PDA_STACKOFFSET),%rsp 116 addq $(KERNEL_STACK_OFFSET),%rsp
117 /* 117 /*
118 * No need to follow this irqs on/off section: the syscall 118 * No need to follow this irqs on/off section: the syscall
119 * disabled irqs, here we enable it straight after entry: 119 * disabled irqs, here we enable it straight after entry:
@@ -273,13 +273,13 @@ ENDPROC(ia32_sysenter_target)
273ENTRY(ia32_cstar_target) 273ENTRY(ia32_cstar_target)
274 CFI_STARTPROC32 simple 274 CFI_STARTPROC32 simple
275 CFI_SIGNAL_FRAME 275 CFI_SIGNAL_FRAME
276 CFI_DEF_CFA rsp,PDA_STACKOFFSET 276 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
277 CFI_REGISTER rip,rcx 277 CFI_REGISTER rip,rcx
278 /*CFI_REGISTER rflags,r11*/ 278 /*CFI_REGISTER rflags,r11*/
279 SWAPGS_UNSAFE_STACK 279 SWAPGS_UNSAFE_STACK
280 movl %esp,%r8d 280 movl %esp,%r8d
281 CFI_REGISTER rsp,r8 281 CFI_REGISTER rsp,r8
282 movq %gs:pda_kernelstack,%rsp 282 movq PER_CPU_VAR(kernel_stack),%rsp
283 /* 283 /*
284 * No need to follow this irqs on/off section: the syscall 284 * No need to follow this irqs on/off section: the syscall
285 * disabled irqs and here we enable it straight after entry: 285 * disabled irqs and here we enable it straight after entry:
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
new file mode 100644
index 000000000000..82f613c607ce
--- /dev/null
+++ b/arch/x86/include/asm/apicnum.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_APICNUM_H
2#define _ASM_X86_APICNUM_H
3
4/* define MAX_IO_APICS */
5#ifdef CONFIG_X86_32
6# define MAX_IO_APICS 64
7#else
8# define MAX_IO_APICS 128
9# define MAX_LOCAL_APIC 32768
10#endif
11
12#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index bae482df6039..f03b23e32864 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -7,6 +7,20 @@
7#include <linux/nodemask.h> 7#include <linux/nodemask.h>
8#include <linux/percpu.h> 8#include <linux/percpu.h>
9 9
10#ifdef CONFIG_SMP
11
12extern void prefill_possible_map(void);
13
14#else /* CONFIG_SMP */
15
16static inline void prefill_possible_map(void) {}
17
18#define cpu_physical_id(cpu) boot_cpu_physical_apicid
19#define safe_smp_processor_id() 0
20#define stack_smp_processor_id() 0
21
22#endif /* CONFIG_SMP */
23
10struct x86_cpu { 24struct x86_cpu {
11 struct cpu cpu; 25 struct cpu cpu;
12}; 26};
@@ -17,4 +31,11 @@ extern void arch_unregister_cpu(int);
17#endif 31#endif
18 32
19DECLARE_PER_CPU(int, cpu_state); 33DECLARE_PER_CPU(int, cpu_state);
34
35#ifdef CONFIG_X86_HAS_BOOT_CPU_ID
36extern unsigned char boot_cpu_id;
37#else
38#define boot_cpu_id 0
39#endif
40
20#endif /* _ASM_X86_CPU_H */ 41#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
new file mode 100644
index 000000000000..a7f3c75f8ad7
--- /dev/null
+++ b/arch/x86/include/asm/cpumask.h
@@ -0,0 +1,32 @@
1#ifndef _ASM_X86_CPUMASK_H
2#define _ASM_X86_CPUMASK_H
3#ifndef __ASSEMBLY__
4#include <linux/cpumask.h>
5
6#ifdef CONFIG_X86_64
7
8extern cpumask_var_t cpu_callin_mask;
9extern cpumask_var_t cpu_callout_mask;
10extern cpumask_var_t cpu_initialized_mask;
11extern cpumask_var_t cpu_sibling_setup_mask;
12
13extern void setup_cpu_local_masks(void);
14
15#else /* CONFIG_X86_32 */
16
17extern cpumask_t cpu_callin_map;
18extern cpumask_t cpu_callout_map;
19extern cpumask_t cpu_initialized;
20extern cpumask_t cpu_sibling_setup_map;
21
22#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map)
23#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map)
24#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized)
25#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
26
27static inline void setup_cpu_local_masks(void) { }
28
29#endif /* CONFIG_X86_32 */
30
31#endif /* __ASSEMBLY__ */
32#endif /* _ASM_X86_CPUMASK_H */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 0930b4f8d672..c68c361697e1 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -1,39 +1,21 @@
1#ifndef _ASM_X86_CURRENT_H 1#ifndef _ASM_X86_CURRENT_H
2#define _ASM_X86_CURRENT_H 2#define _ASM_X86_CURRENT_H
3 3
4#ifdef CONFIG_X86_32
5#include <linux/compiler.h> 4#include <linux/compiler.h>
6#include <asm/percpu.h> 5#include <asm/percpu.h>
7 6
7#ifndef __ASSEMBLY__
8struct task_struct; 8struct task_struct;
9 9
10DECLARE_PER_CPU(struct task_struct *, current_task); 10DECLARE_PER_CPU(struct task_struct *, current_task);
11static __always_inline struct task_struct *get_current(void)
12{
13 return x86_read_percpu(current_task);
14}
15
16#else /* X86_32 */
17
18#ifndef __ASSEMBLY__
19#include <asm/pda.h>
20
21struct task_struct;
22 11
23static __always_inline struct task_struct *get_current(void) 12static __always_inline struct task_struct *get_current(void)
24{ 13{
25 return read_pda(pcurrent); 14 return percpu_read(current_task);
26} 15}
27 16
28#else /* __ASSEMBLY__ */ 17#define current get_current()
29
30#include <asm/asm-offsets.h>
31#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg
32 18
33#endif /* __ASSEMBLY__ */ 19#endif /* __ASSEMBLY__ */
34 20
35#endif /* X86_32 */
36
37#define current get_current()
38
39#endif /* _ASM_X86_CURRENT_H */ 21#endif /* _ASM_X86_CURRENT_H */
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
index 2c05b737ee22..4334502d3664 100644
--- a/arch/x86/include/asm/genapic_32.h
+++ b/arch/x86/include/asm/genapic_32.h
@@ -138,11 +138,4 @@ struct genapic {
138extern struct genapic *genapic; 138extern struct genapic *genapic;
139extern void es7000_update_genapic_to_cluster(void); 139extern void es7000_update_genapic_to_cluster(void);
140 140
141enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
142#define get_uv_system_type() UV_NONE
143#define is_uv_system() 0
144#define uv_wakeup_secondary(a, b) 1
145#define uv_system_init() do {} while (0)
146
147
148#endif /* _ASM_X86_GENAPIC_32_H */ 141#endif /* _ASM_X86_GENAPIC_32_H */
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
index adf32fb56aa6..7bb092c59055 100644
--- a/arch/x86/include/asm/genapic_64.h
+++ b/arch/x86/include/asm/genapic_64.h
@@ -51,15 +51,9 @@ extern struct genapic apic_x2apic_phys;
51extern int acpi_madt_oem_check(char *, char *); 51extern int acpi_madt_oem_check(char *, char *);
52 52
53extern void apic_send_IPI_self(int vector); 53extern void apic_send_IPI_self(int vector);
54enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
55extern enum uv_system_type get_uv_system_type(void);
56extern int is_uv_system(void);
57 54
58extern struct genapic apic_x2apic_uv_x; 55extern struct genapic apic_x2apic_uv_x;
59DECLARE_PER_CPU(int, x2apic_extra_bits); 56DECLARE_PER_CPU(int, x2apic_extra_bits);
60extern void uv_cpu_init(void);
61extern void uv_system_init(void);
62extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
63 57
64extern void setup_apic_routing(void); 58extern void setup_apic_routing(void);
65 59
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 000787df66e6..176f058e7159 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -1,11 +1,52 @@
1#ifdef CONFIG_X86_32 1#ifndef _ASM_X86_HARDIRQ_H
2# include "hardirq_32.h" 2#define _ASM_X86_HARDIRQ_H
3#else 3
4# include "hardirq_64.h" 4#include <linux/threads.h>
5#include <linux/irq.h>
6
7typedef struct {
8 unsigned int __softirq_pending;
9 unsigned int __nmi_count; /* arch dependent */
10 unsigned int irq0_irqs;
11#ifdef CONFIG_X86_LOCAL_APIC
12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count;
14#endif
15#ifdef CONFIG_SMP
16 unsigned int irq_resched_count;
17 unsigned int irq_call_count;
18 unsigned int irq_tlb_count;
19#endif
20#ifdef CONFIG_X86_MCE
21 unsigned int irq_thermal_count;
22# ifdef CONFIG_X86_64
23 unsigned int irq_threshold_count;
24# endif
5#endif 25#endif
26} ____cacheline_aligned irq_cpustat_t;
27
28DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
29
30/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
31#define MAX_HARDIRQS_PER_CPU NR_VECTORS
32
33#define __ARCH_IRQ_STAT
34
35#define inc_irq_stat(member) percpu_add(irq_stat.member, 1)
36
37#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
38
39#define __ARCH_SET_SOFTIRQ_PENDING
40
41#define set_softirq_pending(x) percpu_write(irq_stat.__softirq_pending, (x))
42#define or_softirq_pending(x) percpu_or(irq_stat.__softirq_pending, (x))
43
44extern void ack_bad_irq(unsigned int irq);
6 45
7extern u64 arch_irq_stat_cpu(unsigned int cpu); 46extern u64 arch_irq_stat_cpu(unsigned int cpu);
8#define arch_irq_stat_cpu arch_irq_stat_cpu 47#define arch_irq_stat_cpu arch_irq_stat_cpu
9 48
10extern u64 arch_irq_stat(void); 49extern u64 arch_irq_stat(void);
11#define arch_irq_stat arch_irq_stat 50#define arch_irq_stat arch_irq_stat
51
52#endif /* _ASM_X86_HARDIRQ_H */
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
deleted file mode 100644
index cf7954d1405f..000000000000
--- a/arch/x86/include/asm/hardirq_32.h
+++ /dev/null
@@ -1,30 +0,0 @@
1#ifndef _ASM_X86_HARDIRQ_32_H
2#define _ASM_X86_HARDIRQ_32_H
3
4#include <linux/threads.h>
5#include <linux/irq.h>
6
7typedef struct {
8 unsigned int __softirq_pending;
9 unsigned long idle_timestamp;
10 unsigned int __nmi_count; /* arch dependent */
11 unsigned int apic_timer_irqs; /* arch dependent */
12 unsigned int irq0_irqs;
13 unsigned int irq_resched_count;
14 unsigned int irq_call_count;
15 unsigned int irq_tlb_count;
16 unsigned int irq_thermal_count;
17 unsigned int irq_spurious_count;
18} ____cacheline_aligned irq_cpustat_t;
19
20DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
21
22#define __ARCH_IRQ_STAT
23#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member)
24
25#define inc_irq_stat(member) (__get_cpu_var(irq_stat).member++)
26
27void ack_bad_irq(unsigned int irq);
28#include <linux/irq_cpustat.h>
29
30#endif /* _ASM_X86_HARDIRQ_32_H */
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
deleted file mode 100644
index b5a6b5d56704..000000000000
--- a/arch/x86/include/asm/hardirq_64.h
+++ /dev/null
@@ -1,25 +0,0 @@
1#ifndef _ASM_X86_HARDIRQ_64_H
2#define _ASM_X86_HARDIRQ_64_H
3
4#include <linux/threads.h>
5#include <linux/irq.h>
6#include <asm/pda.h>
7#include <asm/apic.h>
8
9/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
10#define MAX_HARDIRQS_PER_CPU NR_VECTORS
11
12#define __ARCH_IRQ_STAT 1
13
14#define inc_irq_stat(member) add_pda(member, 1)
15
16#define local_softirq_pending() read_pda(__softirq_pending)
17
18#define __ARCH_SET_SOFTIRQ_PENDING 1
19
20#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
21#define or_softirq_pending(x) or_pda(__softirq_pending, (x))
22
23extern void ack_bad_irq(unsigned int irq);
24
25#endif /* _ASM_X86_HARDIRQ_64_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 7a1f44ac1f17..08ec793aa043 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -114,38 +114,16 @@ struct IR_IO_APIC_route_entry {
114extern int nr_ioapics; 114extern int nr_ioapics;
115extern int nr_ioapic_registers[MAX_IO_APICS]; 115extern int nr_ioapic_registers[MAX_IO_APICS];
116 116
117/*
118 * MP-BIOS irq configuration table structures:
119 */
120
121#define MP_MAX_IOAPIC_PIN 127 117#define MP_MAX_IOAPIC_PIN 127
122 118
123struct mp_config_ioapic {
124 unsigned long mp_apicaddr;
125 unsigned int mp_apicid;
126 unsigned char mp_type;
127 unsigned char mp_apicver;
128 unsigned char mp_flags;
129};
130
131struct mp_config_intsrc {
132 unsigned int mp_dstapic;
133 unsigned char mp_type;
134 unsigned char mp_irqtype;
135 unsigned short mp_irqflag;
136 unsigned char mp_srcbus;
137 unsigned char mp_srcbusirq;
138 unsigned char mp_dstirq;
139};
140
141/* I/O APIC entries */ 119/* I/O APIC entries */
142extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; 120extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
143 121
144/* # of MP IRQ source entries */ 122/* # of MP IRQ source entries */
145extern int mp_irq_entries; 123extern int mp_irq_entries;
146 124
147/* MP IRQ source entries */ 125/* MP IRQ source entries */
148extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 126extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
149 127
150/* non-0 if default (table-less) MP configuration */ 128/* non-0 if default (table-less) MP configuration */
151extern int mpc_default_type; 129extern int mpc_default_type;
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
index 89c898ab298b..77843225b7ea 100644
--- a/arch/x86/include/asm/irq_regs.h
+++ b/arch/x86/include/asm/irq_regs.h
@@ -1,5 +1,31 @@
1#ifdef CONFIG_X86_32 1/*
2# include "irq_regs_32.h" 2 * Per-cpu current frame pointer - the location of the last exception frame on
3#else 3 * the stack, stored in the per-cpu area.
4# include "irq_regs_64.h" 4 *
5#endif 5 * Jeremy Fitzhardinge <jeremy@goop.org>
6 */
7#ifndef _ASM_X86_IRQ_REGS_H
8#define _ASM_X86_IRQ_REGS_H
9
10#include <asm/percpu.h>
11
12#define ARCH_HAS_OWN_IRQ_REGS
13
14DECLARE_PER_CPU(struct pt_regs *, irq_regs);
15
16static inline struct pt_regs *get_irq_regs(void)
17{
18 return percpu_read(irq_regs);
19}
20
21static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
22{
23 struct pt_regs *old_regs;
24
25 old_regs = get_irq_regs();
26 percpu_write(irq_regs, new_regs);
27
28 return old_regs;
29}
30
31#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
deleted file mode 100644
index 86afd7473457..000000000000
--- a/arch/x86/include/asm/irq_regs_32.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/*
2 * Per-cpu current frame pointer - the location of the last exception frame on
3 * the stack, stored in the per-cpu area.
4 *
5 * Jeremy Fitzhardinge <jeremy@goop.org>
6 */
7#ifndef _ASM_X86_IRQ_REGS_32_H
8#define _ASM_X86_IRQ_REGS_32_H
9
10#include <asm/percpu.h>
11
12#define ARCH_HAS_OWN_IRQ_REGS
13
14DECLARE_PER_CPU(struct pt_regs *, irq_regs);
15
16static inline struct pt_regs *get_irq_regs(void)
17{
18 return x86_read_percpu(irq_regs);
19}
20
21static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
22{
23 struct pt_regs *old_regs;
24
25 old_regs = get_irq_regs();
26 x86_write_percpu(irq_regs, new_regs);
27
28 return old_regs;
29}
30
31#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_regs_64.h b/arch/x86/include/asm/irq_regs_64.h
deleted file mode 100644
index 3dd9c0b70270..000000000000
--- a/arch/x86/include/asm/irq_regs_64.h
+++ /dev/null
@@ -1 +0,0 @@
1#include <asm-generic/irq_regs.h>
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index f7ff65032b9d..9a83a10a5d51 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -49,31 +49,33 @@
49 * some of the following vectors are 'rare', they are merged 49 * some of the following vectors are 'rare', they are merged
50 * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. 50 * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
51 * TLB, reschedule and local APIC vectors are performance-critical. 51 * TLB, reschedule and local APIC vectors are performance-critical.
52 *
53 * Vectors 0xf0-0xfa are free (reserved for future Linux use).
54 */ 52 */
55#ifdef CONFIG_X86_32 53#ifdef CONFIG_X86_32
56 54
57# define SPURIOUS_APIC_VECTOR 0xff 55# define SPURIOUS_APIC_VECTOR 0xff
58# define ERROR_APIC_VECTOR 0xfe 56# define ERROR_APIC_VECTOR 0xfe
59# define INVALIDATE_TLB_VECTOR 0xfd 57# define RESCHEDULE_VECTOR 0xfd
60# define RESCHEDULE_VECTOR 0xfc 58# define CALL_FUNCTION_VECTOR 0xfc
61# define CALL_FUNCTION_VECTOR 0xfb 59# define CALL_FUNCTION_SINGLE_VECTOR 0xfb
62# define CALL_FUNCTION_SINGLE_VECTOR 0xfa 60# define THERMAL_APIC_VECTOR 0xfa
63# define THERMAL_APIC_VECTOR 0xf0 61/* 0xf8 - 0xf9 : free */
62# define INVALIDATE_TLB_VECTOR_END 0xf7
63# define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
64
65# define NUM_INVALIDATE_TLB_VECTORS 8
64 66
65#else 67#else
66 68
67#define SPURIOUS_APIC_VECTOR 0xff 69# define SPURIOUS_APIC_VECTOR 0xff
68#define ERROR_APIC_VECTOR 0xfe 70# define ERROR_APIC_VECTOR 0xfe
69#define RESCHEDULE_VECTOR 0xfd 71# define RESCHEDULE_VECTOR 0xfd
70#define CALL_FUNCTION_VECTOR 0xfc 72# define CALL_FUNCTION_VECTOR 0xfc
71#define CALL_FUNCTION_SINGLE_VECTOR 0xfb 73# define CALL_FUNCTION_SINGLE_VECTOR 0xfb
72#define THERMAL_APIC_VECTOR 0xfa 74# define THERMAL_APIC_VECTOR 0xfa
73#define THRESHOLD_APIC_VECTOR 0xf9 75# define THRESHOLD_APIC_VECTOR 0xf9
74#define UV_BAU_MESSAGE 0xf8 76# define UV_BAU_MESSAGE 0xf8
75#define INVALIDATE_TLB_VECTOR_END 0xf7 77# define INVALIDATE_TLB_VECTOR_END 0xf7
76#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */ 78# define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
77 79
78#define NUM_INVALIDATE_TLB_VECTORS 8 80#define NUM_INVALIDATE_TLB_VECTORS 8
79 81
@@ -105,6 +107,8 @@
105 107
106#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) 108#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
107 109
110#include <asm/apicnum.h> /* need MAX_IO_APICS */
111
108#ifndef CONFIG_SPARSE_IRQ 112#ifndef CONFIG_SPARSE_IRQ
109# if NR_CPUS < MAX_IO_APICS 113# if NR_CPUS < MAX_IO_APICS
110# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) 114# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
@@ -112,11 +116,12 @@
112# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) 116# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
113# endif 117# endif
114#else 118#else
115# if (8 * NR_CPUS) > (32 * MAX_IO_APICS) 119
116# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS)) 120# define NR_IRQS \
117# else 121 ((8 * NR_CPUS) > (32 * MAX_IO_APICS) ? \
118# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) 122 (NR_VECTORS + (8 * NR_CPUS)) : \
119# endif 123 (NR_VECTORS + (32 * MAX_IO_APICS))) \
124
120#endif 125#endif
121 126
122#elif defined(CONFIG_X86_VOYAGER) 127#elif defined(CONFIG_X86_VOYAGER)
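
Under CONFIG_SPARSE_IRQ the two preprocessor branches collapse into a single conditional expression, so NR_IRQS no longer needs a separate #if to compare NR_CPUS against MAX_IO_APICS. A worked example of the arithmetic, with values assumed purely for illustration:

/* worked example (assumed values):
 *   NR_CPUS = 64, MAX_IO_APICS = 128
 *   8 * NR_CPUS       = 512
 *   32 * MAX_IO_APICS = 4096   <- larger, so this branch is taken
 *   NR_IRQS           = NR_VECTORS + 4096
 */
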
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8e31dd..6fa399ad1de2 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -11,10 +11,26 @@
11 */ 11 */
12#ifdef CONFIG_X86_SMP 12#ifdef CONFIG_X86_SMP
13BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) 13BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
14BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
15BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) 14BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
16BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) 15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
17BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17
18BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
19 smp_invalidate_interrupt)
20BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1,
21 smp_invalidate_interrupt)
22BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2,
23 smp_invalidate_interrupt)
24BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3,
25 smp_invalidate_interrupt)
26BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4,
27 smp_invalidate_interrupt)
28BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5,
29 smp_invalidate_interrupt)
30BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6,
31 smp_invalidate_interrupt)
32BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
33 smp_invalidate_interrupt)
18#endif 34#endif
19 35
20/* 36/*
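
On 32-bit SMP the single invalidate_interrupt stub is replaced by eight BUILD_INTERRUPT3 stubs, one per vector in the INVALIDATE_TLB_VECTOR_START..INVALIDATE_TLB_VECTOR_END range, all landing in smp_invalidate_interrupt. Together with the irq_vectors.h change above this lets several TLB-flush IPIs be in flight at once. A hedged sketch of how a sender might choose its slot (the real selection lives in the TLB-flush code, not in this hunk):

/* illustration: spread concurrent flush requests across the 8 vectors */
static int sketch_pick_tlb_vector(void)
{
	unsigned int slot = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;

	return INVALIDATE_TLB_VECTOR_START + slot;
}
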
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8aeeb3fd73db..52948df9cd1d 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -21,11 +21,54 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
21int init_new_context(struct task_struct *tsk, struct mm_struct *mm); 21int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
22void destroy_context(struct mm_struct *mm); 22void destroy_context(struct mm_struct *mm);
23 23
24#ifdef CONFIG_X86_32 24
25# include "mmu_context_32.h" 25static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
26#else 26{
27# include "mmu_context_64.h" 27#ifdef CONFIG_SMP
28 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
29 percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
30#endif
31}
32
33static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
34 struct task_struct *tsk)
35{
36 unsigned cpu = smp_processor_id();
37
38 if (likely(prev != next)) {
39 /* stop flush ipis for the previous mm */
40 cpu_clear(cpu, prev->cpu_vm_mask);
41#ifdef CONFIG_SMP
42 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
43 percpu_write(cpu_tlbstate.active_mm, next);
28#endif 44#endif
45 cpu_set(cpu, next->cpu_vm_mask);
46
47 /* Re-load page tables */
48 load_cr3(next->pgd);
49
50 /*
51 * load the LDT, if the LDT is different:
52 */
53 if (unlikely(prev->context.ldt != next->context.ldt))
54 load_LDT_nolock(&next->context);
55 }
56#ifdef CONFIG_SMP
57 else {
58 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
59 BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
60
61 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
62 /* We were in lazy tlb mode and leave_mm disabled
63 * tlb flush IPI delivery. We must reload CR3
64 * to make sure to use no freed page tables.
65 */
66 load_cr3(next->pgd);
67 load_LDT_nolock(&next->context);
68 }
69 }
70#endif
71}
29 72
30#define activate_mm(prev, next) \ 73#define activate_mm(prev, next) \
31do { \ 74do { \
@@ -33,5 +76,17 @@ do { \
33 switch_mm((prev), (next), NULL); \ 76 switch_mm((prev), (next), NULL); \
34} while (0); 77} while (0);
35 78
79#ifdef CONFIG_X86_32
80#define deactivate_mm(tsk, mm) \
81do { \
82 loadsegment(gs, 0); \
83} while (0)
84#else
85#define deactivate_mm(tsk, mm) \
86do { \
87 load_gs_index(0); \
88 loadsegment(fs, 0); \
89} while (0)
90#endif
36 91
37#endif /* _ASM_X86_MMU_CONTEXT_H */ 92#endif /* _ASM_X86_MMU_CONTEXT_H */
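
The 32-bit and 64-bit switch_mm()/enter_lazy_tlb() bodies were nearly identical, so they merge into mmu_context.h on top of percpu_read()/percpu_write() of cpu_tlbstate, removing the last PDA users here (the old 64-bit code kept mmu_state and active_mm in the PDA). For context, a hedged sketch of the leave_mm() counterpart the comments refer to, assuming it also operates on cpu_tlbstate; it is not part of this diff:

/* sketch only -- the real leave_mm() lives in the TLB-flush code */
static void sketch_leave_mm(int cpu)
{
	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
		BUG();				/* caller must be lazy here */
	cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
	load_cr3(swapper_pg_dir);		/* stop referencing the old page tables */
}
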
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
deleted file mode 100644
index 7e98ce1d2c0e..000000000000
--- a/arch/x86/include/asm/mmu_context_32.h
+++ /dev/null
@@ -1,55 +0,0 @@
1#ifndef _ASM_X86_MMU_CONTEXT_32_H
2#define _ASM_X86_MMU_CONTEXT_32_H
3
4static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
5{
6#ifdef CONFIG_SMP
7 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
8 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
9#endif
10}
11
12static inline void switch_mm(struct mm_struct *prev,
13 struct mm_struct *next,
14 struct task_struct *tsk)
15{
16 int cpu = smp_processor_id();
17
18 if (likely(prev != next)) {
19 /* stop flush ipis for the previous mm */
20 cpu_clear(cpu, prev->cpu_vm_mask);
21#ifdef CONFIG_SMP
22 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
23 x86_write_percpu(cpu_tlbstate.active_mm, next);
24#endif
25 cpu_set(cpu, next->cpu_vm_mask);
26
27 /* Re-load page tables */
28 load_cr3(next->pgd);
29
30 /*
31 * load the LDT, if the LDT is different:
32 */
33 if (unlikely(prev->context.ldt != next->context.ldt))
34 load_LDT_nolock(&next->context);
35 }
36#ifdef CONFIG_SMP
37 else {
38 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
39 BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
40
41 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
42 /* We were in lazy tlb mode and leave_mm disabled
43 * tlb flush IPI delivery. We must reload %cr3.
44 */
45 load_cr3(next->pgd);
46 load_LDT_nolock(&next->context);
47 }
48 }
49#endif
50}
51
52#define deactivate_mm(tsk, mm) \
53 asm("movl %0,%%gs": :"r" (0));
54
55#endif /* _ASM_X86_MMU_CONTEXT_32_H */
diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h
deleted file mode 100644
index 677d36e9540a..000000000000
--- a/arch/x86/include/asm/mmu_context_64.h
+++ /dev/null
@@ -1,54 +0,0 @@
1#ifndef _ASM_X86_MMU_CONTEXT_64_H
2#define _ASM_X86_MMU_CONTEXT_64_H
3
4#include <asm/pda.h>
5
6static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
7{
8#ifdef CONFIG_SMP
9 if (read_pda(mmu_state) == TLBSTATE_OK)
10 write_pda(mmu_state, TLBSTATE_LAZY);
11#endif
12}
13
14static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
15 struct task_struct *tsk)
16{
17 unsigned cpu = smp_processor_id();
18 if (likely(prev != next)) {
19 /* stop flush ipis for the previous mm */
20 cpu_clear(cpu, prev->cpu_vm_mask);
21#ifdef CONFIG_SMP
22 write_pda(mmu_state, TLBSTATE_OK);
23 write_pda(active_mm, next);
24#endif
25 cpu_set(cpu, next->cpu_vm_mask);
26 load_cr3(next->pgd);
27
28 if (unlikely(next->context.ldt != prev->context.ldt))
29 load_LDT_nolock(&next->context);
30 }
31#ifdef CONFIG_SMP
32 else {
33 write_pda(mmu_state, TLBSTATE_OK);
34 if (read_pda(active_mm) != next)
35 BUG();
36 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
37 /* We were in lazy tlb mode and leave_mm disabled
38 * tlb flush IPI delivery. We must reload CR3
39 * to make sure to use no freed page tables.
40 */
41 load_cr3(next->pgd);
42 load_LDT_nolock(&next->context);
43 }
44 }
45#endif
46}
47
48#define deactivate_mm(tsk, mm) \
49do { \
50 load_gs_index(0); \
51 asm volatile("movl %0,%%fs"::"r"(0)); \
52} while (0)
53
54#endif /* _ASM_X86_MMU_CONTEXT_64_H */
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index 59568bc4767f..4a7f96d7c188 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -24,17 +24,18 @@
24# endif 24# endif
25#endif 25#endif
26 26
27struct intel_mp_floating { 27/* Intel MP Floating Pointer Structure */
28 char mpf_signature[4]; /* "_MP_" */ 28struct mpf_intel {
29 unsigned int mpf_physptr; /* Configuration table address */ 29 char signature[4]; /* "_MP_" */
30 unsigned char mpf_length; /* Our length (paragraphs) */ 30 unsigned int physptr; /* Configuration table address */
31 unsigned char mpf_specification;/* Specification version */ 31 unsigned char length; /* Our length (paragraphs) */
32 unsigned char mpf_checksum; /* Checksum (makes sum 0) */ 32 unsigned char specification; /* Specification version */
33 unsigned char mpf_feature1; /* Standard or configuration ? */ 33 unsigned char checksum; /* Checksum (makes sum 0) */
34 unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */ 34 unsigned char feature1; /* Standard or configuration ? */
35 unsigned char mpf_feature3; /* Unused (0) */ 35 unsigned char feature2; /* Bit7 set for IMCR|PIC */
36 unsigned char mpf_feature4; /* Unused (0) */ 36 unsigned char feature3; /* Unused (0) */
37 unsigned char mpf_feature5; /* Unused (0) */ 37 unsigned char feature4; /* Unused (0) */
38 unsigned char feature5; /* Unused (0) */
38}; 39};
39 40
40#define MPC_SIGNATURE "PCMP" 41#define MPC_SIGNATURE "PCMP"
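
struct intel_mp_floating becomes struct mpf_intel and the redundant mpf_ field prefixes are dropped. This is the 16-byte MP floating pointer structure the BIOS leaves in low memory; its checksum byte makes the bytes of the structure sum to zero. A hedged validation sketch (the helper name is made up; the real scan and checksum code lives in mpparse.c):

/* illustration: the structure must sum to 0 and start with "_MP_" */
static int sketch_mpf_valid(const struct mpf_intel *mpf)
{
	const unsigned char *p = (const unsigned char *)mpf;
	unsigned char sum = 0;
	unsigned int i;

	for (i = 0; i < sizeof(*mpf); i++)	/* 16 bytes */
		sum += p[i];

	return sum == 0 && memcmp(mpf->signature, "_MP_", 4) == 0;
}
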
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index e9873a2e8695..6b9810859daf 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -147,7 +147,7 @@ static inline pteval_t native_pte_val(pte_t pte)
147 return pte.pte; 147 return pte.pte;
148} 148}
149 149
150static inline pteval_t native_pte_flags(pte_t pte) 150static inline pteval_t pte_flags(pte_t pte)
151{ 151{
152 return native_pte_val(pte) & PTE_FLAGS_MASK; 152 return native_pte_val(pte) & PTE_FLAGS_MASK;
153} 153}
@@ -173,7 +173,6 @@ static inline pteval_t native_pte_flags(pte_t pte)
173#endif 173#endif
174 174
175#define pte_val(x) native_pte_val(x) 175#define pte_val(x) native_pte_val(x)
176#define pte_flags(x) native_pte_flags(x)
177#define __pte(x) native_make_pte(x) 176#define __pte(x) native_make_pte(x)
178 177
179#endif /* CONFIG_PARAVIRT */ 178#endif /* CONFIG_PARAVIRT */
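
pte_flags() stops being a paravirt hook: masking off the PFN bits never needs hypervisor involvement, so native_pte_flags() is renamed to pte_flags() and used directly even under CONFIG_PARAVIRT (pv_mmu_ops.pte_flags is dropped in the paravirt.h hunk below). A small illustrative use, assuming the usual page-flag macro names:

/* illustration: flag tests only look at the non-PFN bits */
static inline int sketch_pte_present(pte_t pte)
{
	return pte_flags(pte) & (_PAGE_PRESENT | _PAGE_PROTNONE);
}
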
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 5ebca29f44f0..e27fdbe5f9e4 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -13,8 +13,8 @@
13#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) 13#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
14#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) 14#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
15 15
16#define IRQSTACK_ORDER 2 16#define IRQ_STACK_ORDER 2
17#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER) 17#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
18 18
19#define STACKFAULT_STACK 1 19#define STACKFAULT_STACK 1
20#define DOUBLEFAULT_STACK 2 20#define DOUBLEFAULT_STACK 2
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ba3e2ff6aedc..c85e7475e171 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -12,21 +12,38 @@
12#define CLBR_EAX (1 << 0) 12#define CLBR_EAX (1 << 0)
13#define CLBR_ECX (1 << 1) 13#define CLBR_ECX (1 << 1)
14#define CLBR_EDX (1 << 2) 14#define CLBR_EDX (1 << 2)
15#define CLBR_EDI (1 << 3)
15 16
16#ifdef CONFIG_X86_64 17#ifdef CONFIG_X86_32
17#define CLBR_RSI (1 << 3) 18/* CLBR_ANY should match all regs platform has. For i386, that's just it */
18#define CLBR_RDI (1 << 4) 19#define CLBR_ANY ((1 << 4) - 1)
20
21#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
22#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
23#define CLBR_SCRATCH (0)
24#else
25#define CLBR_RAX CLBR_EAX
26#define CLBR_RCX CLBR_ECX
27#define CLBR_RDX CLBR_EDX
28#define CLBR_RDI CLBR_EDI
29#define CLBR_RSI (1 << 4)
19#define CLBR_R8 (1 << 5) 30#define CLBR_R8 (1 << 5)
20#define CLBR_R9 (1 << 6) 31#define CLBR_R9 (1 << 6)
21#define CLBR_R10 (1 << 7) 32#define CLBR_R10 (1 << 7)
22#define CLBR_R11 (1 << 8) 33#define CLBR_R11 (1 << 8)
34
23#define CLBR_ANY ((1 << 9) - 1) 35#define CLBR_ANY ((1 << 9) - 1)
36
37#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
38 CLBR_RCX | CLBR_R8 | CLBR_R9)
39#define CLBR_RET_REG (CLBR_RAX)
40#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11)
41
24#include <asm/desc_defs.h> 42#include <asm/desc_defs.h>
25#else
26/* CLBR_ANY should match all regs platform has. For i386, that's just it */
27#define CLBR_ANY ((1 << 3) - 1)
28#endif /* X86_64 */ 43#endif /* X86_64 */
29 44
45#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
46
30#ifndef __ASSEMBLY__ 47#ifndef __ASSEMBLY__
31#include <linux/types.h> 48#include <linux/types.h>
32#include <linux/cpumask.h> 49#include <linux/cpumask.h>
@@ -40,6 +57,14 @@ struct tss_struct;
40struct mm_struct; 57struct mm_struct;
41struct desc_struct; 58struct desc_struct;
42 59
60/*
61 * Wrapper type for pointers to code which uses the non-standard
62 * calling convention. See PV_CALL_SAVE_REGS_THUNK below.
63 */
64struct paravirt_callee_save {
65 void *func;
66};
67
43/* general info */ 68/* general info */
44struct pv_info { 69struct pv_info {
45 unsigned int kernel_rpl; 70 unsigned int kernel_rpl;
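
The clobber masks are reworked so both word sizes share the same low bits (EAX/ECX/EDX/EDI) with the extra 64-bit registers stacked above them, and the derived sets CLBR_ARG_REGS, CLBR_RET_REG, CLBR_SCRATCH and CLBR_CALLEE_SAVE describe the calling convention once instead of hard-coding register lists at every site; struct paravirt_callee_save then tags the ops that use the register-preserving convention. Worked out from the 64-bit values above:

/* worked example (64-bit):
 *   CLBR_ARG_REGS = RDI|RSI|RDX|RCX|R8|R9
 *   CLBR_SCRATCH  = R10|R11
 *   CLBR_RET_REG  = RAX
 *   CLBR_CALLEE_SAVE = (ARG_REGS | SCRATCH) & ~RET_REG
 *                    = RDI|RSI|RDX|RCX|R8|R9|R10|R11   (everything but RAX)
 */
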
@@ -189,11 +214,15 @@ struct pv_irq_ops {
189 * expected to use X86_EFLAGS_IF; all other bits 214 * expected to use X86_EFLAGS_IF; all other bits
190 * returned from save_fl are undefined, and may be ignored by 215 * returned from save_fl are undefined, and may be ignored by
191 * restore_fl. 216 * restore_fl.
217 *
 218 * NOTE: Callers of these functions expect the callee to preserve
219 * more registers than the standard C calling convention.
192 */ 220 */
193 unsigned long (*save_fl)(void); 221 struct paravirt_callee_save save_fl;
194 void (*restore_fl)(unsigned long); 222 struct paravirt_callee_save restore_fl;
195 void (*irq_disable)(void); 223 struct paravirt_callee_save irq_disable;
196 void (*irq_enable)(void); 224 struct paravirt_callee_save irq_enable;
225
197 void (*safe_halt)(void); 226 void (*safe_halt)(void);
198 void (*halt)(void); 227 void (*halt)(void);
199 228
@@ -244,7 +273,8 @@ struct pv_mmu_ops {
244 void (*flush_tlb_user)(void); 273 void (*flush_tlb_user)(void);
245 void (*flush_tlb_kernel)(void); 274 void (*flush_tlb_kernel)(void);
246 void (*flush_tlb_single)(unsigned long addr); 275 void (*flush_tlb_single)(unsigned long addr);
247 void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm, 276 void (*flush_tlb_others)(const struct cpumask *cpus,
277 struct mm_struct *mm,
248 unsigned long va); 278 unsigned long va);
249 279
250 /* Hooks for allocating and freeing a pagetable top-level */ 280 /* Hooks for allocating and freeing a pagetable top-level */
@@ -278,12 +308,11 @@ struct pv_mmu_ops {
278 void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, 308 void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
279 pte_t *ptep, pte_t pte); 309 pte_t *ptep, pte_t pte);
280 310
281 pteval_t (*pte_val)(pte_t); 311 struct paravirt_callee_save pte_val;
282 pteval_t (*pte_flags)(pte_t); 312 struct paravirt_callee_save make_pte;
283 pte_t (*make_pte)(pteval_t pte);
284 313
285 pgdval_t (*pgd_val)(pgd_t); 314 struct paravirt_callee_save pgd_val;
286 pgd_t (*make_pgd)(pgdval_t pgd); 315 struct paravirt_callee_save make_pgd;
287 316
288#if PAGETABLE_LEVELS >= 3 317#if PAGETABLE_LEVELS >= 3
289#ifdef CONFIG_X86_PAE 318#ifdef CONFIG_X86_PAE
@@ -298,12 +327,12 @@ struct pv_mmu_ops {
298 327
299 void (*set_pud)(pud_t *pudp, pud_t pudval); 328 void (*set_pud)(pud_t *pudp, pud_t pudval);
300 329
301 pmdval_t (*pmd_val)(pmd_t); 330 struct paravirt_callee_save pmd_val;
302 pmd_t (*make_pmd)(pmdval_t pmd); 331 struct paravirt_callee_save make_pmd;
303 332
304#if PAGETABLE_LEVELS == 4 333#if PAGETABLE_LEVELS == 4
305 pudval_t (*pud_val)(pud_t); 334 struct paravirt_callee_save pud_val;
306 pud_t (*make_pud)(pudval_t pud); 335 struct paravirt_callee_save make_pud;
307 336
308 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); 337 void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
309#endif /* PAGETABLE_LEVELS == 4 */ 338#endif /* PAGETABLE_LEVELS == 4 */
@@ -388,6 +417,8 @@ extern struct pv_lock_ops pv_lock_ops;
388 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") 417 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
389 418
390unsigned paravirt_patch_nop(void); 419unsigned paravirt_patch_nop(void);
420unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
421unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
391unsigned paravirt_patch_ignore(unsigned len); 422unsigned paravirt_patch_ignore(unsigned len);
392unsigned paravirt_patch_call(void *insnbuf, 423unsigned paravirt_patch_call(void *insnbuf,
393 const void *target, u16 tgt_clobbers, 424 const void *target, u16 tgt_clobbers,
@@ -479,25 +510,45 @@ int paravirt_disable_iospace(void);
479 * makes sure the incoming and outgoing types are always correct. 510 * makes sure the incoming and outgoing types are always correct.
480 */ 511 */
481#ifdef CONFIG_X86_32 512#ifdef CONFIG_X86_32
482#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx 513#define PVOP_VCALL_ARGS \
514 unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx
483#define PVOP_CALL_ARGS PVOP_VCALL_ARGS 515#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
516
517#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
518#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x))
519#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x))
520
484#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ 521#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
485 "=c" (__ecx) 522 "=c" (__ecx)
486#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS 523#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
524
525#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx)
526#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
527
487#define EXTRA_CLOBBERS 528#define EXTRA_CLOBBERS
488#define VEXTRA_CLOBBERS 529#define VEXTRA_CLOBBERS
489#else 530#else /* CONFIG_X86_64 */
490#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx 531#define PVOP_VCALL_ARGS \
532 unsigned long __edi = __edi, __esi = __esi, \
533 __edx = __edx, __ecx = __ecx
491#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax 534#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
535
536#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
537#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
538#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x))
539#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x))
540
492#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ 541#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
493 "=S" (__esi), "=d" (__edx), \ 542 "=S" (__esi), "=d" (__edx), \
494 "=c" (__ecx) 543 "=c" (__ecx)
495
496#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) 544#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
497 545
546#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
547#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
548
498#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" 549#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
499#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" 550#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
500#endif 551#endif /* CONFIG_X86_32 */
501 552
502#ifdef CONFIG_PARAVIRT_DEBUG 553#ifdef CONFIG_PARAVIRT_DEBUG
503#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) 554#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
@@ -505,10 +556,11 @@ int paravirt_disable_iospace(void);
505#define PVOP_TEST_NULL(op) ((void)op) 556#define PVOP_TEST_NULL(op) ((void)op)
506#endif 557#endif
507 558
508#define __PVOP_CALL(rettype, op, pre, post, ...) \ 559#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \
560 pre, post, ...) \
509 ({ \ 561 ({ \
510 rettype __ret; \ 562 rettype __ret; \
511 PVOP_CALL_ARGS; \ 563 PVOP_CALL_ARGS; \
512 PVOP_TEST_NULL(op); \ 564 PVOP_TEST_NULL(op); \
513 /* This is 32-bit specific, but is okay in 64-bit */ \ 565 /* This is 32-bit specific, but is okay in 64-bit */ \
514 /* since this condition will never hold */ \ 566 /* since this condition will never hold */ \
@@ -516,70 +568,113 @@ int paravirt_disable_iospace(void);
516 asm volatile(pre \ 568 asm volatile(pre \
517 paravirt_alt(PARAVIRT_CALL) \ 569 paravirt_alt(PARAVIRT_CALL) \
518 post \ 570 post \
519 : PVOP_CALL_CLOBBERS \ 571 : call_clbr \
520 : paravirt_type(op), \ 572 : paravirt_type(op), \
521 paravirt_clobber(CLBR_ANY), \ 573 paravirt_clobber(clbr), \
522 ##__VA_ARGS__ \ 574 ##__VA_ARGS__ \
523 : "memory", "cc" EXTRA_CLOBBERS); \ 575 : "memory", "cc" extra_clbr); \
524 __ret = (rettype)((((u64)__edx) << 32) | __eax); \ 576 __ret = (rettype)((((u64)__edx) << 32) | __eax); \
525 } else { \ 577 } else { \
526 asm volatile(pre \ 578 asm volatile(pre \
527 paravirt_alt(PARAVIRT_CALL) \ 579 paravirt_alt(PARAVIRT_CALL) \
528 post \ 580 post \
529 : PVOP_CALL_CLOBBERS \ 581 : call_clbr \
530 : paravirt_type(op), \ 582 : paravirt_type(op), \
531 paravirt_clobber(CLBR_ANY), \ 583 paravirt_clobber(clbr), \
532 ##__VA_ARGS__ \ 584 ##__VA_ARGS__ \
533 : "memory", "cc" EXTRA_CLOBBERS); \ 585 : "memory", "cc" extra_clbr); \
534 __ret = (rettype)__eax; \ 586 __ret = (rettype)__eax; \
535 } \ 587 } \
536 __ret; \ 588 __ret; \
537 }) 589 })
538#define __PVOP_VCALL(op, pre, post, ...) \ 590
591#define __PVOP_CALL(rettype, op, pre, post, ...) \
592 ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \
593 EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
594
595#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \
596 ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
597 PVOP_CALLEE_CLOBBERS, , \
598 pre, post, ##__VA_ARGS__)
599
600
601#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \
539 ({ \ 602 ({ \
540 PVOP_VCALL_ARGS; \ 603 PVOP_VCALL_ARGS; \
541 PVOP_TEST_NULL(op); \ 604 PVOP_TEST_NULL(op); \
542 asm volatile(pre \ 605 asm volatile(pre \
543 paravirt_alt(PARAVIRT_CALL) \ 606 paravirt_alt(PARAVIRT_CALL) \
544 post \ 607 post \
545 : PVOP_VCALL_CLOBBERS \ 608 : call_clbr \
546 : paravirt_type(op), \ 609 : paravirt_type(op), \
547 paravirt_clobber(CLBR_ANY), \ 610 paravirt_clobber(clbr), \
548 ##__VA_ARGS__ \ 611 ##__VA_ARGS__ \
549 : "memory", "cc" VEXTRA_CLOBBERS); \ 612 : "memory", "cc" extra_clbr); \
550 }) 613 })
551 614
615#define __PVOP_VCALL(op, pre, post, ...) \
616 ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
617 VEXTRA_CLOBBERS, \
618 pre, post, ##__VA_ARGS__)
619
620#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \
621 ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
622 PVOP_VCALLEE_CLOBBERS, , \
623 pre, post, ##__VA_ARGS__)
624
625
626
552#define PVOP_CALL0(rettype, op) \ 627#define PVOP_CALL0(rettype, op) \
553 __PVOP_CALL(rettype, op, "", "") 628 __PVOP_CALL(rettype, op, "", "")
554#define PVOP_VCALL0(op) \ 629#define PVOP_VCALL0(op) \
555 __PVOP_VCALL(op, "", "") 630 __PVOP_VCALL(op, "", "")
556 631
632#define PVOP_CALLEE0(rettype, op) \
633 __PVOP_CALLEESAVE(rettype, op, "", "")
634#define PVOP_VCALLEE0(op) \
635 __PVOP_VCALLEESAVE(op, "", "")
636
637
557#define PVOP_CALL1(rettype, op, arg1) \ 638#define PVOP_CALL1(rettype, op, arg1) \
558 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1))) 639 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
559#define PVOP_VCALL1(op, arg1) \ 640#define PVOP_VCALL1(op, arg1) \
560 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1))) 641 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
642
643#define PVOP_CALLEE1(rettype, op, arg1) \
644 __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
645#define PVOP_VCALLEE1(op, arg1) \
646 __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
647
561 648
562#define PVOP_CALL2(rettype, op, arg1, arg2) \ 649#define PVOP_CALL2(rettype, op, arg1, arg2) \
563 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ 650 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
564 "1" ((unsigned long)(arg2))) 651 PVOP_CALL_ARG2(arg2))
565#define PVOP_VCALL2(op, arg1, arg2) \ 652#define PVOP_VCALL2(op, arg1, arg2) \
566 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ 653 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
567 "1" ((unsigned long)(arg2))) 654 PVOP_CALL_ARG2(arg2))
655
656#define PVOP_CALLEE2(rettype, op, arg1, arg2) \
657 __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
658 PVOP_CALL_ARG2(arg2))
659#define PVOP_VCALLEE2(op, arg1, arg2) \
660 __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \
661 PVOP_CALL_ARG2(arg2))
662
568 663
569#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ 664#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
570 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ 665 __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
571 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) 666 PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
572#define PVOP_VCALL3(op, arg1, arg2, arg3) \ 667#define PVOP_VCALL3(op, arg1, arg2, arg3) \
573 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ 668 __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
574 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) 669 PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
575 670
576/* This is the only difference in x86_64. We can make it much simpler */ 671/* This is the only difference in x86_64. We can make it much simpler */
577#ifdef CONFIG_X86_32 672#ifdef CONFIG_X86_32
578#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ 673#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
579 __PVOP_CALL(rettype, op, \ 674 __PVOP_CALL(rettype, op, \
580 "push %[_arg4];", "lea 4(%%esp),%%esp;", \ 675 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
581 "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ 676 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
582 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) 677 PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
583#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ 678#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
584 __PVOP_VCALL(op, \ 679 __PVOP_VCALL(op, \
585 "push %[_arg4];", "lea 4(%%esp),%%esp;", \ 680 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
@@ -587,13 +682,13 @@ int paravirt_disable_iospace(void);
587 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) 682 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
588#else 683#else
589#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ 684#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
590 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ 685 __PVOP_CALL(rettype, op, "", "", \
591 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ 686 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
592 "3"((unsigned long)(arg4))) 687 PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
593#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ 688#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
594 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ 689 __PVOP_VCALL(op, "", "", \
595 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ 690 PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
596 "3"((unsigned long)(arg4))) 691 PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
597#endif 692#endif
598 693
599static inline int paravirt_enabled(void) 694static inline int paravirt_enabled(void)
@@ -984,10 +1079,11 @@ static inline void __flush_tlb_single(unsigned long addr)
984 PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr); 1079 PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
985} 1080}
986 1081
987static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 1082static inline void flush_tlb_others(const struct cpumask *cpumask,
1083 struct mm_struct *mm,
988 unsigned long va) 1084 unsigned long va)
989{ 1085{
990 PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va); 1086 PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va);
991} 1087}
992 1088
993static inline int paravirt_pgd_alloc(struct mm_struct *mm) 1089static inline int paravirt_pgd_alloc(struct mm_struct *mm)
@@ -1059,13 +1155,13 @@ static inline pte_t __pte(pteval_t val)
1059 pteval_t ret; 1155 pteval_t ret;
1060 1156
1061 if (sizeof(pteval_t) > sizeof(long)) 1157 if (sizeof(pteval_t) > sizeof(long))
1062 ret = PVOP_CALL2(pteval_t, 1158 ret = PVOP_CALLEE2(pteval_t,
1063 pv_mmu_ops.make_pte, 1159 pv_mmu_ops.make_pte,
1064 val, (u64)val >> 32); 1160 val, (u64)val >> 32);
1065 else 1161 else
1066 ret = PVOP_CALL1(pteval_t, 1162 ret = PVOP_CALLEE1(pteval_t,
1067 pv_mmu_ops.make_pte, 1163 pv_mmu_ops.make_pte,
1068 val); 1164 val);
1069 1165
1070 return (pte_t) { .pte = ret }; 1166 return (pte_t) { .pte = ret };
1071} 1167}
@@ -1075,29 +1171,12 @@ static inline pteval_t pte_val(pte_t pte)
1075 pteval_t ret; 1171 pteval_t ret;
1076 1172
1077 if (sizeof(pteval_t) > sizeof(long)) 1173 if (sizeof(pteval_t) > sizeof(long))
1078 ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val, 1174 ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val,
1079 pte.pte, (u64)pte.pte >> 32); 1175 pte.pte, (u64)pte.pte >> 32);
1080 else
1081 ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
1082 pte.pte);
1083
1084 return ret;
1085}
1086
1087static inline pteval_t pte_flags(pte_t pte)
1088{
1089 pteval_t ret;
1090
1091 if (sizeof(pteval_t) > sizeof(long))
1092 ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags,
1093 pte.pte, (u64)pte.pte >> 32);
1094 else 1176 else
1095 ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags, 1177 ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val,
1096 pte.pte); 1178 pte.pte);
1097 1179
1098#ifdef CONFIG_PARAVIRT_DEBUG
1099 BUG_ON(ret & PTE_PFN_MASK);
1100#endif
1101 return ret; 1180 return ret;
1102} 1181}
1103 1182
@@ -1106,11 +1185,11 @@ static inline pgd_t __pgd(pgdval_t val)
1106 pgdval_t ret; 1185 pgdval_t ret;
1107 1186
1108 if (sizeof(pgdval_t) > sizeof(long)) 1187 if (sizeof(pgdval_t) > sizeof(long))
1109 ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd, 1188 ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd,
1110 val, (u64)val >> 32); 1189 val, (u64)val >> 32);
1111 else 1190 else
1112 ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd, 1191 ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd,
1113 val); 1192 val);
1114 1193
1115 return (pgd_t) { ret }; 1194 return (pgd_t) { ret };
1116} 1195}
@@ -1120,11 +1199,11 @@ static inline pgdval_t pgd_val(pgd_t pgd)
1120 pgdval_t ret; 1199 pgdval_t ret;
1121 1200
1122 if (sizeof(pgdval_t) > sizeof(long)) 1201 if (sizeof(pgdval_t) > sizeof(long))
1123 ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val, 1202 ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val,
1124 pgd.pgd, (u64)pgd.pgd >> 32); 1203 pgd.pgd, (u64)pgd.pgd >> 32);
1125 else 1204 else
1126 ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val, 1205 ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val,
1127 pgd.pgd); 1206 pgd.pgd);
1128 1207
1129 return ret; 1208 return ret;
1130} 1209}
@@ -1188,11 +1267,11 @@ static inline pmd_t __pmd(pmdval_t val)
1188 pmdval_t ret; 1267 pmdval_t ret;
1189 1268
1190 if (sizeof(pmdval_t) > sizeof(long)) 1269 if (sizeof(pmdval_t) > sizeof(long))
1191 ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd, 1270 ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd,
1192 val, (u64)val >> 32); 1271 val, (u64)val >> 32);
1193 else 1272 else
1194 ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd, 1273 ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd,
1195 val); 1274 val);
1196 1275
1197 return (pmd_t) { ret }; 1276 return (pmd_t) { ret };
1198} 1277}
@@ -1202,11 +1281,11 @@ static inline pmdval_t pmd_val(pmd_t pmd)
1202 pmdval_t ret; 1281 pmdval_t ret;
1203 1282
1204 if (sizeof(pmdval_t) > sizeof(long)) 1283 if (sizeof(pmdval_t) > sizeof(long))
1205 ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val, 1284 ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val,
1206 pmd.pmd, (u64)pmd.pmd >> 32); 1285 pmd.pmd, (u64)pmd.pmd >> 32);
1207 else 1286 else
1208 ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val, 1287 ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val,
1209 pmd.pmd); 1288 pmd.pmd);
1210 1289
1211 return ret; 1290 return ret;
1212} 1291}
@@ -1228,11 +1307,11 @@ static inline pud_t __pud(pudval_t val)
1228 pudval_t ret; 1307 pudval_t ret;
1229 1308
1230 if (sizeof(pudval_t) > sizeof(long)) 1309 if (sizeof(pudval_t) > sizeof(long))
1231 ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud, 1310 ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud,
1232 val, (u64)val >> 32); 1311 val, (u64)val >> 32);
1233 else 1312 else
1234 ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud, 1313 ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud,
1235 val); 1314 val);
1236 1315
1237 return (pud_t) { ret }; 1316 return (pud_t) { ret };
1238} 1317}
@@ -1242,11 +1321,11 @@ static inline pudval_t pud_val(pud_t pud)
1242 pudval_t ret; 1321 pudval_t ret;
1243 1322
1244 if (sizeof(pudval_t) > sizeof(long)) 1323 if (sizeof(pudval_t) > sizeof(long))
1245 ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val, 1324 ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val,
1246 pud.pud, (u64)pud.pud >> 32); 1325 pud.pud, (u64)pud.pud >> 32);
1247 else 1326 else
1248 ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val, 1327 ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val,
1249 pud.pud); 1328 pud.pud);
1250 1329
1251 return ret; 1330 return ret;
1252} 1331}
@@ -1387,6 +1466,9 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
1387} 1466}
1388 1467
1389void _paravirt_nop(void); 1468void _paravirt_nop(void);
1469u32 _paravirt_ident_32(u32);
1470u64 _paravirt_ident_64(u64);
1471
1390#define paravirt_nop ((void *)_paravirt_nop) 1472#define paravirt_nop ((void *)_paravirt_nop)
1391 1473
1392void paravirt_use_bytelocks(void); 1474void paravirt_use_bytelocks(void);
@@ -1438,12 +1520,37 @@ extern struct paravirt_patch_site __parainstructions[],
1438 __parainstructions_end[]; 1520 __parainstructions_end[];
1439 1521
1440#ifdef CONFIG_X86_32 1522#ifdef CONFIG_X86_32
1441#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;" 1523#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
1442#define PV_RESTORE_REGS "popl %%edx; popl %%ecx" 1524#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
1525
1526/* save and restore all caller-save registers, except return value */
1527#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;"
1528#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;"
1529
1443#define PV_FLAGS_ARG "0" 1530#define PV_FLAGS_ARG "0"
1444#define PV_EXTRA_CLOBBERS 1531#define PV_EXTRA_CLOBBERS
1445#define PV_VEXTRA_CLOBBERS 1532#define PV_VEXTRA_CLOBBERS
1446#else 1533#else
1534/* save and restore all caller-save registers, except return value */
1535#define PV_SAVE_ALL_CALLER_REGS \
1536 "push %rcx;" \
1537 "push %rdx;" \
1538 "push %rsi;" \
1539 "push %rdi;" \
1540 "push %r8;" \
1541 "push %r9;" \
1542 "push %r10;" \
1543 "push %r11;"
1544#define PV_RESTORE_ALL_CALLER_REGS \
1545 "pop %r11;" \
1546 "pop %r10;" \
1547 "pop %r9;" \
1548 "pop %r8;" \
1549 "pop %rdi;" \
1550 "pop %rsi;" \
1551 "pop %rdx;" \
1552 "pop %rcx;"
1553
1447/* We save some registers, but all of them, that's too much. We clobber all 1554/* We save some registers, but all of them, that's too much. We clobber all
1448 * caller saved registers but the argument parameter */ 1555 * caller saved registers but the argument parameter */
1449#define PV_SAVE_REGS "pushq %%rdi;" 1556#define PV_SAVE_REGS "pushq %%rdi;"
@@ -1453,52 +1560,76 @@ extern struct paravirt_patch_site __parainstructions[],
1453#define PV_FLAGS_ARG "D" 1560#define PV_FLAGS_ARG "D"
1454#endif 1561#endif
1455 1562
1563/*
1564 * Generate a thunk around a function which saves all caller-save
1565 * registers except for the return value. This allows C functions to
1566 * be called from assembler code where fewer than normal registers are
1567 * available. It may also help code generation around calls from C
1568 * code if the common case doesn't use many registers.
1569 *
1570 * When a callee is wrapped in a thunk, the caller can assume that all
1571 * arg regs and all scratch registers are preserved across the
1572 * call. The return value in rax/eax will not be saved, even for void
1573 * functions.
1574 */
1575#define PV_CALLEE_SAVE_REGS_THUNK(func) \
1576 extern typeof(func) __raw_callee_save_##func; \
1577 static void *__##func##__ __used = func; \
1578 \
1579 asm(".pushsection .text;" \
1580 "__raw_callee_save_" #func ": " \
1581 PV_SAVE_ALL_CALLER_REGS \
1582 "call " #func ";" \
1583 PV_RESTORE_ALL_CALLER_REGS \
1584 "ret;" \
1585 ".popsection")
1586
1587/* Get a reference to a callee-save function */
1588#define PV_CALLEE_SAVE(func) \
1589 ((struct paravirt_callee_save) { __raw_callee_save_##func })
1590
1591/* Promise that "func" already uses the right calling convention */
1592#define __PV_IS_CALLEE_SAVE(func) \
1593 ((struct paravirt_callee_save) { func })
1594
1456static inline unsigned long __raw_local_save_flags(void) 1595static inline unsigned long __raw_local_save_flags(void)
1457{ 1596{
1458 unsigned long f; 1597 unsigned long f;
1459 1598
1460 asm volatile(paravirt_alt(PV_SAVE_REGS 1599 asm volatile(paravirt_alt(PARAVIRT_CALL)
1461 PARAVIRT_CALL
1462 PV_RESTORE_REGS)
1463 : "=a"(f) 1600 : "=a"(f)
1464 : paravirt_type(pv_irq_ops.save_fl), 1601 : paravirt_type(pv_irq_ops.save_fl),
1465 paravirt_clobber(CLBR_EAX) 1602 paravirt_clobber(CLBR_EAX)
1466 : "memory", "cc" PV_VEXTRA_CLOBBERS); 1603 : "memory", "cc");
1467 return f; 1604 return f;
1468} 1605}
1469 1606
1470static inline void raw_local_irq_restore(unsigned long f) 1607static inline void raw_local_irq_restore(unsigned long f)
1471{ 1608{
1472 asm volatile(paravirt_alt(PV_SAVE_REGS 1609 asm volatile(paravirt_alt(PARAVIRT_CALL)
1473 PARAVIRT_CALL
1474 PV_RESTORE_REGS)
1475 : "=a"(f) 1610 : "=a"(f)
1476 : PV_FLAGS_ARG(f), 1611 : PV_FLAGS_ARG(f),
1477 paravirt_type(pv_irq_ops.restore_fl), 1612 paravirt_type(pv_irq_ops.restore_fl),
1478 paravirt_clobber(CLBR_EAX) 1613 paravirt_clobber(CLBR_EAX)
1479 : "memory", "cc" PV_EXTRA_CLOBBERS); 1614 : "memory", "cc");
1480} 1615}
1481 1616
1482static inline void raw_local_irq_disable(void) 1617static inline void raw_local_irq_disable(void)
1483{ 1618{
1484 asm volatile(paravirt_alt(PV_SAVE_REGS 1619 asm volatile(paravirt_alt(PARAVIRT_CALL)
1485 PARAVIRT_CALL
1486 PV_RESTORE_REGS)
1487 : 1620 :
1488 : paravirt_type(pv_irq_ops.irq_disable), 1621 : paravirt_type(pv_irq_ops.irq_disable),
1489 paravirt_clobber(CLBR_EAX) 1622 paravirt_clobber(CLBR_EAX)
1490 : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); 1623 : "memory", "eax", "cc");
1491} 1624}
1492 1625
1493static inline void raw_local_irq_enable(void) 1626static inline void raw_local_irq_enable(void)
1494{ 1627{
1495 asm volatile(paravirt_alt(PV_SAVE_REGS 1628 asm volatile(paravirt_alt(PARAVIRT_CALL)
1496 PARAVIRT_CALL
1497 PV_RESTORE_REGS)
1498 : 1629 :
1499 : paravirt_type(pv_irq_ops.irq_enable), 1630 : paravirt_type(pv_irq_ops.irq_enable),
1500 paravirt_clobber(CLBR_EAX) 1631 paravirt_clobber(CLBR_EAX)
1501 : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); 1632 : "memory", "eax", "cc");
1502} 1633}
1503 1634
1504static inline unsigned long __raw_local_irq_save(void) 1635static inline unsigned long __raw_local_irq_save(void)
@@ -1541,33 +1672,49 @@ static inline unsigned long __raw_local_irq_save(void)
1541 .popsection 1672 .popsection
1542 1673
1543 1674
1675#define COND_PUSH(set, mask, reg) \
1676 .if ((~(set)) & mask); push %reg; .endif
1677#define COND_POP(set, mask, reg) \
1678 .if ((~(set)) & mask); pop %reg; .endif
1679
1544#ifdef CONFIG_X86_64 1680#ifdef CONFIG_X86_64
1545#define PV_SAVE_REGS \ 1681
1546 push %rax; \ 1682#define PV_SAVE_REGS(set) \
1547 push %rcx; \ 1683 COND_PUSH(set, CLBR_RAX, rax); \
1548 push %rdx; \ 1684 COND_PUSH(set, CLBR_RCX, rcx); \
1549 push %rsi; \ 1685 COND_PUSH(set, CLBR_RDX, rdx); \
1550 push %rdi; \ 1686 COND_PUSH(set, CLBR_RSI, rsi); \
1551 push %r8; \ 1687 COND_PUSH(set, CLBR_RDI, rdi); \
1552 push %r9; \ 1688 COND_PUSH(set, CLBR_R8, r8); \
1553 push %r10; \ 1689 COND_PUSH(set, CLBR_R9, r9); \
1554 push %r11 1690 COND_PUSH(set, CLBR_R10, r10); \
1555#define PV_RESTORE_REGS \ 1691 COND_PUSH(set, CLBR_R11, r11)
1556 pop %r11; \ 1692#define PV_RESTORE_REGS(set) \
1557 pop %r10; \ 1693 COND_POP(set, CLBR_R11, r11); \
1558 pop %r9; \ 1694 COND_POP(set, CLBR_R10, r10); \
1559 pop %r8; \ 1695 COND_POP(set, CLBR_R9, r9); \
1560 pop %rdi; \ 1696 COND_POP(set, CLBR_R8, r8); \
1561 pop %rsi; \ 1697 COND_POP(set, CLBR_RDI, rdi); \
1562 pop %rdx; \ 1698 COND_POP(set, CLBR_RSI, rsi); \
1563 pop %rcx; \ 1699 COND_POP(set, CLBR_RDX, rdx); \
1564 pop %rax 1700 COND_POP(set, CLBR_RCX, rcx); \
1701 COND_POP(set, CLBR_RAX, rax)
1702
1565#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8) 1703#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
1566#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8) 1704#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
1567#define PARA_INDIRECT(addr) *addr(%rip) 1705#define PARA_INDIRECT(addr) *addr(%rip)
1568#else 1706#else
1569#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx 1707#define PV_SAVE_REGS(set) \
1570#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax 1708 COND_PUSH(set, CLBR_EAX, eax); \
1709 COND_PUSH(set, CLBR_EDI, edi); \
1710 COND_PUSH(set, CLBR_ECX, ecx); \
1711 COND_PUSH(set, CLBR_EDX, edx)
1712#define PV_RESTORE_REGS(set) \
1713 COND_POP(set, CLBR_EDX, edx); \
1714 COND_POP(set, CLBR_ECX, ecx); \
1715 COND_POP(set, CLBR_EDI, edi); \
1716 COND_POP(set, CLBR_EAX, eax)
1717
1571#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) 1718#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
1572#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4) 1719#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
1573#define PARA_INDIRECT(addr) *%cs:addr 1720#define PARA_INDIRECT(addr) *%cs:addr
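
On the assembler side, PV_SAVE_REGS/PV_RESTORE_REGS now take the call site's clobber set and push/pop only the registers not already in it: COND_PUSH emits the push when (~set) & mask is non-zero, so sites that declare a register clobbered stop paying to preserve it. A worked expansion under that rule, for illustration:

/* worked expansion (illustrative): PV_SAVE_REGS(CLBR_RDI) on 64-bit pushes
 *   rax, rcx, rdx, rsi, r8, r9, r10, r11
 * and skips rdi, since rdi is in the set and may be clobbered anyway;
 * PV_SAVE_REGS(CLBR_ANY) pushes nothing at all.
 */
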
@@ -1579,15 +1726,15 @@ static inline unsigned long __raw_local_irq_save(void)
1579 1726
1580#define DISABLE_INTERRUPTS(clobbers) \ 1727#define DISABLE_INTERRUPTS(clobbers) \
1581 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ 1728 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
1582 PV_SAVE_REGS; \ 1729 PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
1583 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \ 1730 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \
1584 PV_RESTORE_REGS;) \ 1731 PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
1585 1732
1586#define ENABLE_INTERRUPTS(clobbers) \ 1733#define ENABLE_INTERRUPTS(clobbers) \
1587 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ 1734 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
1588 PV_SAVE_REGS; \ 1735 PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
1589 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ 1736 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
1590 PV_RESTORE_REGS;) 1737 PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
1591 1738
1592#define USERGS_SYSRET32 \ 1739#define USERGS_SYSRET32 \
1593 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \ 1740 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
@@ -1617,11 +1764,15 @@ static inline unsigned long __raw_local_irq_save(void)
1617 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ 1764 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
1618 swapgs) 1765 swapgs)
1619 1766
1767/*
 1768 * Note: swapgs is very special, and in practice is either going to be
1769 * implemented with a single "swapgs" instruction or something very
1770 * special. Either way, we don't need to save any registers for
1771 * it.
1772 */
1620#define SWAPGS \ 1773#define SWAPGS \
1621 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ 1774 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
1622 PV_SAVE_REGS; \ 1775 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs) \
1623 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \
1624 PV_RESTORE_REGS \
1625 ) 1776 )
1626 1777
1627#define GET_CR2_INTO_RCX \ 1778#define GET_CR2_INTO_RCX \
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
deleted file mode 100644
index 2fbfff88df37..000000000000
--- a/arch/x86/include/asm/pda.h
+++ /dev/null
@@ -1,137 +0,0 @@
1#ifndef _ASM_X86_PDA_H
2#define _ASM_X86_PDA_H
3
4#ifndef __ASSEMBLY__
5#include <linux/stddef.h>
6#include <linux/types.h>
7#include <linux/cache.h>
8#include <asm/page.h>
9
10/* Per processor datastructure. %gs points to it while the kernel runs */
11struct x8664_pda {
12 struct task_struct *pcurrent; /* 0 Current process */
13 unsigned long data_offset; /* 8 Per cpu data offset from linker
14 address */
15 unsigned long kernelstack; /* 16 top of kernel stack for current */
16 unsigned long oldrsp; /* 24 user rsp for system call */
17 int irqcount; /* 32 Irq nesting counter. Starts -1 */
18 unsigned int cpunumber; /* 36 Logical CPU number */
19#ifdef CONFIG_CC_STACKPROTECTOR
20 unsigned long stack_canary; /* 40 stack canary value */
21 /* gcc-ABI: this canary MUST be at
22 offset 40!!! */
23#endif
24 char *irqstackptr;
25 short nodenumber; /* number of current node (32k max) */
26 short in_bootmem; /* pda lives in bootmem */
27 unsigned int __softirq_pending;
28 unsigned int __nmi_count; /* number of NMI on this CPUs */
29 short mmu_state;
30 short isidle;
31 struct mm_struct *active_mm;
32 unsigned apic_timer_irqs;
33 unsigned irq0_irqs;
34 unsigned irq_resched_count;
35 unsigned irq_call_count;
36 unsigned irq_tlb_count;
37 unsigned irq_thermal_count;
38 unsigned irq_threshold_count;
39 unsigned irq_spurious_count;
40} ____cacheline_aligned_in_smp;
41
42extern struct x8664_pda **_cpu_pda;
43extern void pda_init(int);
44
45#define cpu_pda(i) (_cpu_pda[i])
46
47/*
48 * There is no fast way to get the base address of the PDA, all the accesses
49 * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
50 */
51extern void __bad_pda_field(void) __attribute__((noreturn));
52
53/*
54 * proxy_pda doesn't actually exist, but tell gcc it is accessed for
55 * all PDA accesses so it gets read/write dependencies right.
56 */
57extern struct x8664_pda _proxy_pda;
58
59#define pda_offset(field) offsetof(struct x8664_pda, field)
60
61#define pda_to_op(op, field, val) \
62do { \
63 typedef typeof(_proxy_pda.field) T__; \
64 if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \
65 switch (sizeof(_proxy_pda.field)) { \
66 case 2: \
67 asm(op "w %1,%%gs:%c2" : \
68 "+m" (_proxy_pda.field) : \
69 "ri" ((T__)val), \
70 "i"(pda_offset(field))); \
71 break; \
72 case 4: \
73 asm(op "l %1,%%gs:%c2" : \
74 "+m" (_proxy_pda.field) : \
75 "ri" ((T__)val), \
76 "i" (pda_offset(field))); \
77 break; \
78 case 8: \
79 asm(op "q %1,%%gs:%c2": \
80 "+m" (_proxy_pda.field) : \
81 "ri" ((T__)val), \
82 "i"(pda_offset(field))); \
83 break; \
84 default: \
85 __bad_pda_field(); \
86 } \
87} while (0)
88
89#define pda_from_op(op, field) \
90({ \
91 typeof(_proxy_pda.field) ret__; \
92 switch (sizeof(_proxy_pda.field)) { \
93 case 2: \
94 asm(op "w %%gs:%c1,%0" : \
95 "=r" (ret__) : \
96 "i" (pda_offset(field)), \
97 "m" (_proxy_pda.field)); \
98 break; \
99 case 4: \
100 asm(op "l %%gs:%c1,%0": \
101 "=r" (ret__): \
102 "i" (pda_offset(field)), \
103 "m" (_proxy_pda.field)); \
104 break; \
105 case 8: \
106 asm(op "q %%gs:%c1,%0": \
107 "=r" (ret__) : \
108 "i" (pda_offset(field)), \
109 "m" (_proxy_pda.field)); \
110 break; \
111 default: \
112 __bad_pda_field(); \
113 } \
114 ret__; \
115})
116
117#define read_pda(field) pda_from_op("mov", field)
118#define write_pda(field, val) pda_to_op("mov", field, val)
119#define add_pda(field, val) pda_to_op("add", field, val)
120#define sub_pda(field, val) pda_to_op("sub", field, val)
121#define or_pda(field, val) pda_to_op("or", field, val)
122
123/* This is not atomic against other CPUs -- CPU preemption needs to be off */
124#define test_and_clear_bit_pda(bit, field) \
125({ \
126 int old__; \
127 asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
128 : "=r" (old__), "+m" (_proxy_pda.field) \
129 : "dIr" (bit), "i" (pda_offset(field)) : "memory");\
130 old__; \
131})
132
133#endif
134
135#define PDA_STACKOFFSET (5*8)
136
137#endif /* _ASM_X86_PDA_H */
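
With every field already migrated to ordinary per-cpu variables elsewhere in the series, the whole x8664_pda apparatus (the _proxy_pda trick, read_pda()/write_pda()/or_pda() and friends) can be deleted. The conversion pattern is visible elsewhere in this patch (mmu_context.h above, percpu.h below); two examples taken from this very diff:

/* conversion pattern, as seen in the mmu_context.h and percpu.h hunks:
 *
 *	write_pda(mmu_state, TLBSTATE_OK);
 *  becomes
 *	percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
 *
 *	#define __my_cpu_offset read_pda(data_offset)
 *  becomes
 *	#define __my_cpu_offset percpu_read(this_cpu_off)
 */
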
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index ece72053ba63..aee103b26d01 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -2,53 +2,12 @@
2#define _ASM_X86_PERCPU_H 2#define _ASM_X86_PERCPU_H
3 3
4#ifdef CONFIG_X86_64 4#ifdef CONFIG_X86_64
5#include <linux/compiler.h> 5#define __percpu_seg gs
6 6#define __percpu_mov_op movq
7/* Same as asm-generic/percpu.h, except that we store the per cpu offset 7#else
8 in the PDA. Longer term the PDA and every per cpu variable 8#define __percpu_seg fs
9 should be just put into a single section and referenced directly 9#define __percpu_mov_op movl
10 from %gs */
11
12#ifdef CONFIG_SMP
13#include <asm/pda.h>
14
15#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
16#define __my_cpu_offset read_pda(data_offset)
17
18#define per_cpu_offset(x) (__per_cpu_offset(x))
19
20#endif 10#endif
21#include <asm-generic/percpu.h>
22
23DECLARE_PER_CPU(struct x8664_pda, pda);
24
25/*
26 * These are supposed to be implemented as a single instruction which
27 * operates on the per-cpu data base segment. x86-64 doesn't have
28 * that yet, so this is a fairly inefficient workaround for the
29 * meantime. The single instruction is atomic with respect to
30 * preemption and interrupts, so we need to explicitly disable
31 * interrupts here to achieve the same effect. However, because it
32 * can be used from within interrupt-disable/enable, we can't actually
33 * disable interrupts; disabling preemption is enough.
34 */
35#define x86_read_percpu(var) \
36 ({ \
37 typeof(per_cpu_var(var)) __tmp; \
38 preempt_disable(); \
39 __tmp = __get_cpu_var(var); \
40 preempt_enable(); \
41 __tmp; \
42 })
43
44#define x86_write_percpu(var, val) \
45 do { \
46 preempt_disable(); \
47 __get_cpu_var(var) = (val); \
48 preempt_enable(); \
49 } while(0)
50
51#else /* CONFIG_X86_64 */
52 11
53#ifdef __ASSEMBLY__ 12#ifdef __ASSEMBLY__
54 13
@@ -65,47 +24,48 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
65 * PER_CPU(cpu_gdt_descr, %ebx) 24 * PER_CPU(cpu_gdt_descr, %ebx)
66 */ 25 */
67#ifdef CONFIG_SMP 26#ifdef CONFIG_SMP
68#define PER_CPU(var, reg) \ 27#define PER_CPU(var, reg) \
69 movl %fs:per_cpu__##this_cpu_off, reg; \ 28 __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \
70 lea per_cpu__##var(reg), reg 29 lea per_cpu__##var(reg), reg
71#define PER_CPU_VAR(var) %fs:per_cpu__##var 30#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var
72#else /* ! SMP */ 31#else /* ! SMP */
73#define PER_CPU(var, reg) \ 32#define PER_CPU(var, reg) \
74 movl $per_cpu__##var, reg 33 __percpu_mov_op $per_cpu__##var, reg
75#define PER_CPU_VAR(var) per_cpu__##var 34#define PER_CPU_VAR(var) per_cpu__##var
76#endif /* SMP */ 35#endif /* SMP */
77 36
37#ifdef CONFIG_X86_64_SMP
38#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
39#else
40#define INIT_PER_CPU_VAR(var) per_cpu__##var
41#endif
42
78#else /* ...!ASSEMBLY */ 43#else /* ...!ASSEMBLY */
79 44
45#include <linux/stringify.h>
46
47#ifdef CONFIG_SMP
48#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
49#define __my_cpu_offset percpu_read(this_cpu_off)
50#else
51#define __percpu_arg(x) "%" #x
52#endif
53
80/* 54/*
81 * PER_CPU finds an address of a per-cpu variable. 55 * Initialized pointers to per-cpu variables needed for the boot
56 * processor need to use these macros to get the proper address
57 * offset from __per_cpu_load on SMP.
82 * 58 *
83 * Args: 59 * There also must be an entry in vmlinux_64.lds.S
84 * var - variable name
85 * cpu - 32bit register containing the current CPU number
86 *
87 * The resulting address is stored in the "cpu" argument.
88 *
89 * Example:
90 * PER_CPU(cpu_gdt_descr, %ebx)
91 */ 60 */
92#ifdef CONFIG_SMP 61#define DECLARE_INIT_PER_CPU(var) \
93 62 extern typeof(per_cpu_var(var)) init_per_cpu_var(var)
94#define __my_cpu_offset x86_read_percpu(this_cpu_off)
95
96/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
97#define __percpu_seg "%%fs:"
98
99#else /* !SMP */
100
101#define __percpu_seg ""
102
103#endif /* SMP */
104
105#include <asm-generic/percpu.h>
106 63
107/* We can use this directly for local CPU (faster). */ 64#ifdef CONFIG_X86_64_SMP
108DECLARE_PER_CPU(unsigned long, this_cpu_off); 65#define init_per_cpu_var(var) init_per_cpu__##var
66#else
67#define init_per_cpu_var(var) per_cpu_var(var)
68#endif
109 69
110/* For arch-specific code, we can use direct single-insn ops (they 70/* For arch-specific code, we can use direct single-insn ops (they
111 * don't give an lvalue though). */ 71 * don't give an lvalue though). */
@@ -120,20 +80,25 @@ do { \
120 } \ 80 } \
121 switch (sizeof(var)) { \ 81 switch (sizeof(var)) { \
122 case 1: \ 82 case 1: \
123 asm(op "b %1,"__percpu_seg"%0" \ 83 asm(op "b %1,"__percpu_arg(0) \
124 : "+m" (var) \ 84 : "+m" (var) \
125 : "ri" ((T__)val)); \ 85 : "ri" ((T__)val)); \
126 break; \ 86 break; \
127 case 2: \ 87 case 2: \
128 asm(op "w %1,"__percpu_seg"%0" \ 88 asm(op "w %1,"__percpu_arg(0) \
129 : "+m" (var) \ 89 : "+m" (var) \
130 : "ri" ((T__)val)); \ 90 : "ri" ((T__)val)); \
131 break; \ 91 break; \
132 case 4: \ 92 case 4: \
133 asm(op "l %1,"__percpu_seg"%0" \ 93 asm(op "l %1,"__percpu_arg(0) \
134 : "+m" (var) \ 94 : "+m" (var) \
135 : "ri" ((T__)val)); \ 95 : "ri" ((T__)val)); \
136 break; \ 96 break; \
97 case 8: \
98 asm(op "q %1,"__percpu_arg(0) \
99 : "+m" (var) \
100 : "re" ((T__)val)); \
101 break; \
137 default: __bad_percpu_size(); \ 102 default: __bad_percpu_size(); \
138 } \ 103 } \
139} while (0) 104} while (0)
@@ -143,17 +108,22 @@ do { \
143 typeof(var) ret__; \ 108 typeof(var) ret__; \
144 switch (sizeof(var)) { \ 109 switch (sizeof(var)) { \
145 case 1: \ 110 case 1: \
146 asm(op "b "__percpu_seg"%1,%0" \ 111 asm(op "b "__percpu_arg(1)",%0" \
147 : "=r" (ret__) \ 112 : "=r" (ret__) \
148 : "m" (var)); \ 113 : "m" (var)); \
149 break; \ 114 break; \
150 case 2: \ 115 case 2: \
151 asm(op "w "__percpu_seg"%1,%0" \ 116 asm(op "w "__percpu_arg(1)",%0" \
152 : "=r" (ret__) \ 117 : "=r" (ret__) \
153 : "m" (var)); \ 118 : "m" (var)); \
154 break; \ 119 break; \
155 case 4: \ 120 case 4: \
156 asm(op "l "__percpu_seg"%1,%0" \ 121 asm(op "l "__percpu_arg(1)",%0" \
122 : "=r" (ret__) \
123 : "m" (var)); \
124 break; \
125 case 8: \
126 asm(op "q "__percpu_arg(1)",%0" \
157 : "=r" (ret__) \ 127 : "=r" (ret__) \
158 : "m" (var)); \ 128 : "m" (var)); \
159 break; \ 129 break; \
@@ -162,13 +132,30 @@ do { \
162 ret__; \ 132 ret__; \
163}) 133})
164 134
165#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var) 135#define percpu_read(var) percpu_from_op("mov", per_cpu__##var)
166#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val) 136#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
167#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val) 137#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
168#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val) 138#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
169#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val) 139#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val)
140#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
141#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)
142
143/* This is not atomic against other CPUs -- CPU preemption needs to be off */
144#define x86_test_and_clear_bit_percpu(bit, var) \
145({ \
146 int old__; \
147 asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
148 : "=r" (old__), "+m" (per_cpu__##var) \
149 : "dIr" (bit)); \
150 old__; \
151})
152
153#include <asm-generic/percpu.h>
154
155/* We can use this directly for local CPU (faster). */
156DECLARE_PER_CPU(unsigned long, this_cpu_off);
157
170#endif /* !__ASSEMBLY__ */ 158#endif /* !__ASSEMBLY__ */
171#endif /* !CONFIG_X86_64 */
172 159
173#ifdef CONFIG_SMP 160#ifdef CONFIG_SMP
174 161
@@ -195,9 +182,9 @@ do { \
195#define early_per_cpu_ptr(_name) (_name##_early_ptr) 182#define early_per_cpu_ptr(_name) (_name##_early_ptr)
196#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) 183#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
197#define early_per_cpu(_name, _cpu) \ 184#define early_per_cpu(_name, _cpu) \
198 (early_per_cpu_ptr(_name) ? \ 185 *(early_per_cpu_ptr(_name) ? \
199 early_per_cpu_ptr(_name)[_cpu] : \ 186 &early_per_cpu_ptr(_name)[_cpu] : \
200 per_cpu(_name, _cpu)) 187 &per_cpu(_name, _cpu))
201 188
202#else /* !CONFIG_SMP */ 189#else /* !CONFIG_SMP */
203#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ 190#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
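For orientation, a minimal usage sketch of the unified accessors introduced above. The per-cpu variable example_count is hypothetical and only illustrates the pattern; the segment prefix is %gs on 64-bit and %fs on 32-bit.

	/* Sketch only -- not part of the patch. */
	DEFINE_PER_CPU(unsigned long, example_count);	/* hypothetical variable */

	static void bump_example_count(void)
	{
		percpu_add(example_count, 1);	/* single add insn on this CPU's copy */
	}

	static unsigned long read_example_count(void)
	{
		return percpu_read(example_count);	/* single segment-prefixed mov */
	}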
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 06bbcbd66e9c..6ceaef08486f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -240,64 +240,78 @@ static inline int pmd_large(pmd_t pte)
240 (_PAGE_PSE | _PAGE_PRESENT); 240 (_PAGE_PSE | _PAGE_PRESENT);
241} 241}
242 242
243static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
244{
245 pteval_t v = native_pte_val(pte);
246
247 return native_make_pte(v | set);
248}
249
250static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
251{
252 pteval_t v = native_pte_val(pte);
253
254 return native_make_pte(v & ~clear);
255}
256
243static inline pte_t pte_mkclean(pte_t pte) 257static inline pte_t pte_mkclean(pte_t pte)
244{ 258{
245 return __pte(pte_val(pte) & ~_PAGE_DIRTY); 259 return pte_clear_flags(pte, _PAGE_DIRTY);
246} 260}
247 261
248static inline pte_t pte_mkold(pte_t pte) 262static inline pte_t pte_mkold(pte_t pte)
249{ 263{
250 return __pte(pte_val(pte) & ~_PAGE_ACCESSED); 264 return pte_clear_flags(pte, _PAGE_ACCESSED);
251} 265}
252 266
253static inline pte_t pte_wrprotect(pte_t pte) 267static inline pte_t pte_wrprotect(pte_t pte)
254{ 268{
255 return __pte(pte_val(pte) & ~_PAGE_RW); 269 return pte_clear_flags(pte, _PAGE_RW);
256} 270}
257 271
258static inline pte_t pte_mkexec(pte_t pte) 272static inline pte_t pte_mkexec(pte_t pte)
259{ 273{
260 return __pte(pte_val(pte) & ~_PAGE_NX); 274 return pte_clear_flags(pte, _PAGE_NX);
261} 275}
262 276
263static inline pte_t pte_mkdirty(pte_t pte) 277static inline pte_t pte_mkdirty(pte_t pte)
264{ 278{
265 return __pte(pte_val(pte) | _PAGE_DIRTY); 279 return pte_set_flags(pte, _PAGE_DIRTY);
266} 280}
267 281
268static inline pte_t pte_mkyoung(pte_t pte) 282static inline pte_t pte_mkyoung(pte_t pte)
269{ 283{
270 return __pte(pte_val(pte) | _PAGE_ACCESSED); 284 return pte_set_flags(pte, _PAGE_ACCESSED);
271} 285}
272 286
273static inline pte_t pte_mkwrite(pte_t pte) 287static inline pte_t pte_mkwrite(pte_t pte)
274{ 288{
275 return __pte(pte_val(pte) | _PAGE_RW); 289 return pte_set_flags(pte, _PAGE_RW);
276} 290}
277 291
278static inline pte_t pte_mkhuge(pte_t pte) 292static inline pte_t pte_mkhuge(pte_t pte)
279{ 293{
280 return __pte(pte_val(pte) | _PAGE_PSE); 294 return pte_set_flags(pte, _PAGE_PSE);
281} 295}
282 296
283static inline pte_t pte_clrhuge(pte_t pte) 297static inline pte_t pte_clrhuge(pte_t pte)
284{ 298{
285 return __pte(pte_val(pte) & ~_PAGE_PSE); 299 return pte_clear_flags(pte, _PAGE_PSE);
286} 300}
287 301
288static inline pte_t pte_mkglobal(pte_t pte) 302static inline pte_t pte_mkglobal(pte_t pte)
289{ 303{
290 return __pte(pte_val(pte) | _PAGE_GLOBAL); 304 return pte_set_flags(pte, _PAGE_GLOBAL);
291} 305}
292 306
293static inline pte_t pte_clrglobal(pte_t pte) 307static inline pte_t pte_clrglobal(pte_t pte)
294{ 308{
295 return __pte(pte_val(pte) & ~_PAGE_GLOBAL); 309 return pte_clear_flags(pte, _PAGE_GLOBAL);
296} 310}
297 311
298static inline pte_t pte_mkspecial(pte_t pte) 312static inline pte_t pte_mkspecial(pte_t pte)
299{ 313{
300 return __pte(pte_val(pte) | _PAGE_SPECIAL); 314 return pte_set_flags(pte, _PAGE_SPECIAL);
301} 315}
302 316
303extern pteval_t __supported_pte_mask; 317extern pteval_t __supported_pte_mask;
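With pte_set_flags()/pte_clear_flags() in place, every pte_mk*/pte_clr* helper reduces to a single call; going through native_pte_val()/native_make_pte() presumably keeps a pure flag tweak from bouncing through the paravirt pte_val/__pte hooks. A sketch of the pattern for any further flag (_PAGE_EXAMPLE is made up):

	/* Illustrative only: the shape every additional pte helper now takes. */
	static inline pte_t pte_mkexample(pte_t pte)
	{
		return pte_set_flags(pte, _PAGE_EXAMPLE);	/* hypothetical flag */
	}

	static inline pte_t pte_clrexample(pte_t pte)
	{
		return pte_clear_flags(pte, _PAGE_EXAMPLE);
	}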
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index ba09289accaa..1df9637dfda3 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -11,7 +11,6 @@
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <linux/bitops.h> 12#include <linux/bitops.h>
13#include <linux/threads.h> 13#include <linux/threads.h>
14#include <asm/pda.h>
15 14
16extern pud_t level3_kernel_pgt[512]; 15extern pud_t level3_kernel_pgt[512];
17extern pud_t level3_ident_pgt[512]; 16extern pud_t level3_ident_pgt[512];
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 091cd8855f2e..373d3f628d24 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -378,6 +378,24 @@ union thread_xstate {
378 378
379#ifdef CONFIG_X86_64 379#ifdef CONFIG_X86_64
380DECLARE_PER_CPU(struct orig_ist, orig_ist); 380DECLARE_PER_CPU(struct orig_ist, orig_ist);
381
382union irq_stack_union {
383 char irq_stack[IRQ_STACK_SIZE];
384 /*
385 * GCC hardcodes the stack canary as %gs:40. Since the
386 * irq_stack is the object at %gs:0, we reserve the bottom
387 * 48 bytes of the irq stack for the canary.
388 */
389 struct {
390 char gs_base[40];
391 unsigned long stack_canary;
392 };
393};
394
395DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
396DECLARE_INIT_PER_CPU(irq_stack_union);
397
398DECLARE_PER_CPU(char *, irq_stack_ptr);
381#endif 399#endif
382 400
383extern void print_cpu_info(struct cpuinfo_x86 *); 401extern void print_cpu_info(struct cpuinfo_x86 *);
@@ -752,9 +770,9 @@ extern int sysenter_setup(void);
752extern struct desc_ptr early_gdt_descr; 770extern struct desc_ptr early_gdt_descr;
753 771
754extern void cpu_set_gdt(int); 772extern void cpu_set_gdt(int);
755extern void switch_to_new_gdt(void); 773extern void switch_to_new_gdt(int);
774extern void load_percpu_segment(int);
756extern void cpu_init(void); 775extern void cpu_init(void);
757extern void init_gdt(int cpu);
758 776
759static inline unsigned long get_debugctlmsr(void) 777static inline unsigned long get_debugctlmsr(void)
760{ 778{
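The gs_base[40] member exists purely to place stack_canary at the %gs:40 slot that gcc's stack protector hardcodes; a compile-time restatement of that constraint (the patch carries the authoritative check in boot_init_stack_canary(), in stackprotector.h below):

	/* Sketch: the canary must sit exactly 40 bytes above the per-cpu %gs base. */
	static inline void irq_stack_union_layout_check(void)
	{
		BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
	}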
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ebe858cdc8a3..536949749bc2 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -100,7 +100,6 @@ extern unsigned long init_pg_tables_start;
100extern unsigned long init_pg_tables_end; 100extern unsigned long init_pg_tables_end;
101 101
102#else 102#else
103void __init x86_64_init_pda(void);
104void __init x86_64_start_kernel(char *real_mode); 103void __init x86_64_start_kernel(char *real_mode);
105void __init x86_64_start_reservations(char *real_mode_data); 104void __init x86_64_start_reservations(char *real_mode_data);
106 105
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 19953df61c52..45ef8a1b9d7c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -15,34 +15,8 @@
15# include <asm/io_apic.h> 15# include <asm/io_apic.h>
16# endif 16# endif
17#endif 17#endif
18#include <asm/pda.h>
19#include <asm/thread_info.h> 18#include <asm/thread_info.h>
20 19#include <asm/cpumask.h>
21#ifdef CONFIG_X86_64
22
23extern cpumask_var_t cpu_callin_mask;
24extern cpumask_var_t cpu_callout_mask;
25extern cpumask_var_t cpu_initialized_mask;
26extern cpumask_var_t cpu_sibling_setup_mask;
27
28#else /* CONFIG_X86_32 */
29
30extern cpumask_t cpu_callin_map;
31extern cpumask_t cpu_callout_map;
32extern cpumask_t cpu_initialized;
33extern cpumask_t cpu_sibling_setup_map;
34
35#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map)
36#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map)
37#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized)
38#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
39
40#endif /* CONFIG_X86_32 */
41
42extern void (*mtrr_hook)(void);
43extern void zap_low_mappings(void);
44
45extern int __cpuinit get_local_pda(int cpu);
46 20
47extern int smp_num_siblings; 21extern int smp_num_siblings;
48extern unsigned int num_processors; 22extern unsigned int num_processors;
@@ -50,9 +24,7 @@ extern unsigned int num_processors;
50DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); 24DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
51DECLARE_PER_CPU(cpumask_t, cpu_core_map); 25DECLARE_PER_CPU(cpumask_t, cpu_core_map);
52DECLARE_PER_CPU(u16, cpu_llc_id); 26DECLARE_PER_CPU(u16, cpu_llc_id);
53#ifdef CONFIG_X86_32
54DECLARE_PER_CPU(int, cpu_number); 27DECLARE_PER_CPU(int, cpu_number);
55#endif
56 28
57static inline struct cpumask *cpu_sibling_mask(int cpu) 29static inline struct cpumask *cpu_sibling_mask(int cpu)
58{ 30{
@@ -167,8 +139,6 @@ void play_dead_common(void);
167void native_send_call_func_ipi(const struct cpumask *mask); 139void native_send_call_func_ipi(const struct cpumask *mask);
168void native_send_call_func_single_ipi(int cpu); 140void native_send_call_func_single_ipi(int cpu);
169 141
170extern void prefill_possible_map(void);
171
172void smp_store_cpu_info(int id); 142void smp_store_cpu_info(int id);
173#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) 143#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
174 144
@@ -177,10 +147,6 @@ static inline int num_booting_cpus(void)
177{ 147{
178 return cpumask_weight(cpu_callout_mask); 148 return cpumask_weight(cpu_callout_mask);
179} 149}
180#else
181static inline void prefill_possible_map(void)
182{
183}
184#endif /* CONFIG_SMP */ 150#endif /* CONFIG_SMP */
185 151
186extern unsigned disabled_cpus __cpuinitdata; 152extern unsigned disabled_cpus __cpuinitdata;
@@ -191,11 +157,11 @@ extern unsigned disabled_cpus __cpuinitdata;
191 * from the initial startup. We map APIC_BASE very early in page_setup(), 157 * from the initial startup. We map APIC_BASE very early in page_setup(),
192 * so this is correct in the x86 case. 158 * so this is correct in the x86 case.
193 */ 159 */
194#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) 160#define raw_smp_processor_id() (percpu_read(cpu_number))
195extern int safe_smp_processor_id(void); 161extern int safe_smp_processor_id(void);
196 162
197#elif defined(CONFIG_X86_64_SMP) 163#elif defined(CONFIG_X86_64_SMP)
198#define raw_smp_processor_id() read_pda(cpunumber) 164#define raw_smp_processor_id() (percpu_read(cpu_number))
199 165
200#define stack_smp_processor_id() \ 166#define stack_smp_processor_id() \
201({ \ 167({ \
@@ -205,10 +171,6 @@ extern int safe_smp_processor_id(void);
205}) 171})
206#define safe_smp_processor_id() smp_processor_id() 172#define safe_smp_processor_id() smp_processor_id()
207 173
208#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
209#define cpu_physical_id(cpu) boot_cpu_physical_apicid
210#define safe_smp_processor_id() 0
211#define stack_smp_processor_id() 0
212#endif 174#endif
213 175
214#ifdef CONFIG_X86_LOCAL_APIC 176#ifdef CONFIG_X86_LOCAL_APIC
@@ -251,11 +213,5 @@ static inline int hard_smp_processor_id(void)
251 213
252#endif /* CONFIG_X86_LOCAL_APIC */ 214#endif /* CONFIG_X86_LOCAL_APIC */
253 215
254#ifdef CONFIG_X86_HAS_BOOT_CPU_ID
255extern unsigned char boot_cpu_id;
256#else
257#define boot_cpu_id 0
258#endif
259
260#endif /* __ASSEMBLY__ */ 216#endif /* __ASSEMBLY__ */
261#endif /* _ASM_X86_SMP_H */ 217#endif /* _ASM_X86_SMP_H */
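Since cpu_number is now an ordinary per-cpu variable on both 32-bit and 64-bit, raw_smp_processor_id() compiles to one segment-prefixed load either way; roughly (sketch only):

	/* Sketch: what the shared definition above boils down to. */
	static inline int this_cpu_id(void)
	{
		return percpu_read(cpu_number);	/* movl %fs:... (32-bit) / %gs:... (64-bit) */
	}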
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
new file mode 100644
index 000000000000..36a700acaf2b
--- /dev/null
+++ b/arch/x86/include/asm/stackprotector.h
@@ -0,0 +1,38 @@
1#ifndef _ASM_STACKPROTECTOR_H
2#define _ASM_STACKPROTECTOR_H 1
3
4#include <asm/tsc.h>
5#include <asm/processor.h>
6
7/*
8 * Initialize the stackprotector canary value.
9 *
10 * NOTE: this must only be called from functions that never return,
11 * and it must always be inlined.
12 */
13static __always_inline void boot_init_stack_canary(void)
14{
15 u64 canary;
16 u64 tsc;
17
18 /*
19 * Build time only check to make sure the stack_canary is at
20 * offset 40 in the pda; this is a gcc ABI requirement
21 */
22 BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
23
24 /*
24 * We use both the random pool and the current TSC as a source
26 * of randomness. The TSC only matters for very early init,
27 * there it already has some randomness on most systems. Later
28 * on during the bootup the random pool has true entropy too.
29 */
30 get_random_bytes(&canary, sizeof(canary));
31 tsc = __native_read_tsc();
32 canary += tsc + (tsc << 32UL);
33
34 current->stack_canary = canary;
35 percpu_write(irq_stack_union.stack_canary, canary);
36}
37
38#endif
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 8e626ea33a1a..2fcc70bc85f3 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -86,27 +86,44 @@ do { \
86 , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ 86 , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
87 "r12", "r13", "r14", "r15" 87 "r12", "r13", "r14", "r15"
88 88
89#ifdef CONFIG_CC_STACKPROTECTOR
90#define __switch_canary \
91 "movq %P[task_canary](%%rsi),%%r8\n\t" \
92 "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
93#define __switch_canary_oparam \
94 , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary))
95#define __switch_canary_iparam \
96 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
97#else /* CC_STACKPROTECTOR */
98#define __switch_canary
99#define __switch_canary_oparam
100#define __switch_canary_iparam
101#endif /* CC_STACKPROTECTOR */
102
89/* Save restore flags to clear handle leaking NT */ 103/* Save restore flags to clear handle leaking NT */
90#define switch_to(prev, next, last) \ 104#define switch_to(prev, next, last) \
91 asm volatile(SAVE_CONTEXT \ 105 asm volatile(SAVE_CONTEXT \
92 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ 106 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
93 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ 107 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
94 "call __switch_to\n\t" \ 108 "call __switch_to\n\t" \
95 ".globl thread_return\n" \ 109 ".globl thread_return\n" \
96 "thread_return:\n\t" \ 110 "thread_return:\n\t" \
97 "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ 111 "movq "__percpu_arg([current_task])",%%rsi\n\t" \
112 __switch_canary \
98 "movq %P[thread_info](%%rsi),%%r8\n\t" \ 113 "movq %P[thread_info](%%rsi),%%r8\n\t" \
99 LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ 114 LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
100 "movq %%rax,%%rdi\n\t" \ 115 "movq %%rax,%%rdi\n\t" \
101 "jc ret_from_fork\n\t" \ 116 "jc ret_from_fork\n\t" \
102 RESTORE_CONTEXT \ 117 RESTORE_CONTEXT \
103 : "=a" (last) \ 118 : "=a" (last) \
119 __switch_canary_oparam \
104 : [next] "S" (next), [prev] "D" (prev), \ 120 : [next] "S" (next), [prev] "D" (prev), \
105 [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ 121 [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
106 [ti_flags] "i" (offsetof(struct thread_info, flags)), \ 122 [ti_flags] "i" (offsetof(struct thread_info, flags)), \
107 [tif_fork] "i" (TIF_FORK), \ 123 [tif_fork] "i" (TIF_FORK), \
108 [thread_info] "i" (offsetof(struct task_struct, stack)), \ 124 [thread_info] "i" (offsetof(struct task_struct, stack)), \
109 [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ 125 [current_task] "m" (per_cpu_var(current_task)) \
126 __switch_canary_iparam \
110 : "memory", "cc" __EXTRA_CLOBBER) 127 : "memory", "cc" __EXTRA_CLOBBER)
111#endif 128#endif
112 129
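A C rendering of what the __switch_canary fragment does, for readability only; the real copy stays in the switch_to() asm so the incoming task's canary is in place before any of its C code runs:

	/* Sketch: propagate the next task's canary into this CPU's %gs:40 slot. */
	static inline void switch_canary_sketch(struct task_struct *next)
	{
		percpu_write(irq_stack_union.stack_canary, next->stack_canary);
	}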
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 98789647baa9..b46f8ca007b5 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -194,25 +194,21 @@ static inline struct thread_info *current_thread_info(void)
194 194
195#else /* X86_32 */ 195#else /* X86_32 */
196 196
197#include <asm/pda.h> 197#include <asm/percpu.h>
198#define KERNEL_STACK_OFFSET (5*8)
198 199
199/* 200/*
200 * macros/functions for gaining access to the thread information structure 201 * macros/functions for gaining access to the thread information structure
201 * preempt_count needs to be 1 initially, until the scheduler is functional. 202 * preempt_count needs to be 1 initially, until the scheduler is functional.
202 */ 203 */
203#ifndef __ASSEMBLY__ 204#ifndef __ASSEMBLY__
204static inline struct thread_info *current_thread_info(void) 205DECLARE_PER_CPU(unsigned long, kernel_stack);
205{
206 struct thread_info *ti;
207 ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE);
208 return ti;
209}
210 206
211/* do not use in interrupt context */ 207static inline struct thread_info *current_thread_info(void)
212static inline struct thread_info *stack_thread_info(void)
213{ 208{
214 struct thread_info *ti; 209 struct thread_info *ti;
215 asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1))); 210 ti = (void *)(percpu_read(kernel_stack) +
211 KERNEL_STACK_OFFSET - THREAD_SIZE);
216 return ti; 212 return ti;
217} 213}
218 214
@@ -220,8 +216,8 @@ static inline struct thread_info *stack_thread_info(void)
220 216
221/* how to get the thread information struct from ASM */ 217/* how to get the thread information struct from ASM */
222#define GET_THREAD_INFO(reg) \ 218#define GET_THREAD_INFO(reg) \
223 movq %gs:pda_kernelstack,reg ; \ 219 movq PER_CPU_VAR(kernel_stack),reg ; \
224 subq $(THREAD_SIZE-PDA_STACKOFFSET),reg 220 subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg
225 221
226#endif 222#endif
227 223
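Spelling out the arithmetic in the new current_thread_info(): kernel_stack is initialized to the top of the task stack minus KERNEL_STACK_OFFSET (see the cpu/common.c hunk below), so adding KERNEL_STACK_OFFSET back and subtracting THREAD_SIZE lands on the thread_info at the bottom of the stack.

	/* Sketch of the address arithmetic (not new code):
	 *
	 *   kernel_stack = stack_top - KERNEL_STACK_OFFSET
	 *   thread_info  = stack_top - THREAD_SIZE
	 *                = kernel_stack + KERNEL_STACK_OFFSET - THREAD_SIZE
	 *
	 * which is exactly what current_thread_info() and GET_THREAD_INFO compute.
	 */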
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0e7bbb549116..d3539f998f88 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -113,7 +113,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
113 __flush_tlb(); 113 __flush_tlb();
114} 114}
115 115
116static inline void native_flush_tlb_others(const cpumask_t *cpumask, 116static inline void native_flush_tlb_others(const struct cpumask *cpumask,
117 struct mm_struct *mm, 117 struct mm_struct *mm,
118 unsigned long va) 118 unsigned long va)
119{ 119{
@@ -142,31 +142,28 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
142 flush_tlb_mm(vma->vm_mm); 142 flush_tlb_mm(vma->vm_mm);
143} 143}
144 144
145void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm, 145void native_flush_tlb_others(const struct cpumask *cpumask,
146 unsigned long va); 146 struct mm_struct *mm, unsigned long va);
147 147
148#define TLBSTATE_OK 1 148#define TLBSTATE_OK 1
149#define TLBSTATE_LAZY 2 149#define TLBSTATE_LAZY 2
150 150
151#ifdef CONFIG_X86_32
152struct tlb_state { 151struct tlb_state {
153 struct mm_struct *active_mm; 152 struct mm_struct *active_mm;
154 int state; 153 int state;
155 char __cacheline_padding[L1_CACHE_BYTES-8];
156}; 154};
157DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); 155DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
158 156
159void reset_lazy_tlbstate(void);
160#else
161static inline void reset_lazy_tlbstate(void) 157static inline void reset_lazy_tlbstate(void)
162{ 158{
159 percpu_write(cpu_tlbstate.state, 0);
160 percpu_write(cpu_tlbstate.active_mm, &init_mm);
163} 161}
164#endif
165 162
166#endif /* SMP */ 163#endif /* SMP */
167 164
168#ifndef CONFIG_PARAVIRT 165#ifndef CONFIG_PARAVIRT
169#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va) 166#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va)
170#endif 167#endif
171 168
172static inline void flush_tlb_kernel_range(unsigned long start, 169static inline void flush_tlb_kernel_range(unsigned long start,
@@ -175,4 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start,
175 flush_tlb_all(); 172 flush_tlb_all();
176} 173}
177 174
175extern void zap_low_mappings(void);
176
178#endif /* _ASM_X86_TLBFLUSH_H */ 177#endif /* _ASM_X86_TLBFLUSH_H */
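reset_lazy_tlbstate() is now a plain inline over the common tlb_state; as a hedged sketch (not taken from this patch), the lazy-TLB transition that mm-switch code performs against the same per-cpu state looks like:

	/* Sketch only: mark this CPU lazy rather than flushing for a kernel thread. */
	static inline void enter_lazy_tlb_sketch(void)
	{
		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
			percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
	}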
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 4e2f2e0aab27..77cfb2cfb386 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -74,6 +74,8 @@ static inline const struct cpumask *cpumask_of_node(int node)
74 return &node_to_cpumask_map[node]; 74 return &node_to_cpumask_map[node];
75} 75}
76 76
77static inline void setup_node_to_cpumask_map(void) { }
78
77#else /* CONFIG_X86_64 */ 79#else /* CONFIG_X86_64 */
78 80
79/* Mappings between node number and cpus on that node. */ 81/* Mappings between node number and cpus on that node. */
@@ -83,7 +85,8 @@ extern cpumask_t *node_to_cpumask_map;
83DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); 85DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
84 86
85/* Returns the number of the current Node. */ 87/* Returns the number of the current Node. */
86#define numa_node_id() read_pda(nodenumber) 88DECLARE_PER_CPU(int, node_number);
89#define numa_node_id() percpu_read(node_number)
87 90
88#ifdef CONFIG_DEBUG_PER_CPU_MAPS 91#ifdef CONFIG_DEBUG_PER_CPU_MAPS
89extern int cpu_to_node(int cpu); 92extern int cpu_to_node(int cpu);
@@ -102,10 +105,7 @@ static inline int cpu_to_node(int cpu)
102/* Same function but used if called before per_cpu areas are setup */ 105/* Same function but used if called before per_cpu areas are setup */
103static inline int early_cpu_to_node(int cpu) 106static inline int early_cpu_to_node(int cpu)
104{ 107{
105 if (early_per_cpu_ptr(x86_cpu_to_node_map)) 108 return early_per_cpu(x86_cpu_to_node_map, cpu);
106 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
107
108 return per_cpu(x86_cpu_to_node_map, cpu);
109} 109}
110 110
111/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ 111/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
@@ -122,6 +122,8 @@ static inline cpumask_t node_to_cpumask(int node)
122 122
123#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 123#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
124 124
125extern void setup_node_to_cpumask_map(void);
126
125/* 127/*
126 * Replace default node_to_cpumask_ptr with optimized version 128 * Replace default node_to_cpumask_ptr with optimized version
127 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" 129 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
@@ -192,9 +194,20 @@ extern int __node_distance(int, int);
192 194
193#else /* !CONFIG_NUMA */ 195#else /* !CONFIG_NUMA */
194 196
195#define numa_node_id() 0 197static inline int numa_node_id(void)
196#define cpu_to_node(cpu) 0 198{
197#define early_cpu_to_node(cpu) 0 199 return 0;
200}
201
202static inline int cpu_to_node(int cpu)
203{
204 return 0;
205}
206
207static inline int early_cpu_to_node(int cpu)
208{
209 return 0;
210}
198 211
199static inline const cpumask_t *cpumask_of_node(int node) 212static inline const cpumask_t *cpumask_of_node(int node)
200{ 213{
@@ -209,6 +222,8 @@ static inline int node_to_first_cpu(int node)
209 return first_cpu(cpu_online_map); 222 return first_cpu(cpu_online_map);
210} 223}
211 224
225static inline void setup_node_to_cpumask_map(void) { }
226
212/* 227/*
213 * Replace default node_to_cpumask_ptr with optimized version 228 * Replace default node_to_cpumask_ptr with optimized version
214 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" 229 * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
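node_number takes over the old nodenumber PDA field; it is kept current from cpu_init(), as the cpu/common.c hunk further down does:

	/* From cpu_init() below: seed this CPU's node_number once NUMA info is known. */
	if (cpu != 0 && percpu_read(node_number) == 0 &&
	    cpu_to_node(cpu) != NUMA_NO_NODE)
		percpu_write(node_number, cpu_to_node(cpu));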
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index 780ba0ab94f9..90f06c25221d 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -13,6 +13,7 @@ extern unsigned char *trampoline_base;
13 13
14extern unsigned long init_rsp; 14extern unsigned long init_rsp;
15extern unsigned long initial_code; 15extern unsigned long initial_code;
16extern unsigned long initial_gs;
16 17
17#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) 18#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
18#define TRAMPOLINE_BASE 0x6000 19#define TRAMPOLINE_BASE 0x6000
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
new file mode 100644
index 000000000000..8ac1d7e312f3
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv.h
@@ -0,0 +1,33 @@
1#ifndef _ASM_X86_UV_UV_H
2#define _ASM_X86_UV_UV_H
3
4enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
5
6#ifdef CONFIG_X86_UV
7
8extern enum uv_system_type get_uv_system_type(void);
9extern int is_uv_system(void);
10extern void uv_cpu_init(void);
11extern void uv_system_init(void);
12extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
13extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
14 struct mm_struct *mm,
15 unsigned long va,
16 unsigned int cpu);
17
18#else /* X86_UV */
19
20static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
21static inline int is_uv_system(void) { return 0; }
22static inline void uv_cpu_init(void) { }
23static inline void uv_system_init(void) { }
24static inline int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
25{ return 1; }
26static inline const struct cpumask *
27uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
28 unsigned long va, unsigned int cpu)
29{ return cpumask; }
30
31#endif /* X86_UV */
32
33#endif /* _ASM_X86_UV_UV_H */
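The !CONFIG_X86_UV stubs let generic TLB-flush code call into UV unconditionally: the UV implementation is expected to hand back NULL when it has done the flush itself, or the mask of CPUs that still need a conventional IPI flush, and the stub simply returns the mask unchanged. A hedged sketch of that call pattern (send_ipi_flush() is a hypothetical placeholder):

	/* Sketch only: how a caller is expected to use the interface above. */
	static void flush_others_sketch(const struct cpumask *cpumask,
					struct mm_struct *mm, unsigned long va)
	{
		const struct cpumask *remaining;

		remaining = uv_flush_tlb_others(cpumask, mm, va, smp_processor_id());
		if (remaining)
			send_ipi_flush(remaining, mm, va);	/* hypothetical fallback */
	}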
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 50423c7b56b2..9b0e61bf7a88 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -325,7 +325,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
325#define cpubit_isset(cpu, bau_local_cpumask) \ 325#define cpubit_isset(cpu, bau_local_cpumask) \
326 test_bit((cpu), (bau_local_cpumask).bits) 326 test_bit((cpu), (bau_local_cpumask).bits)
327 327
328extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
329extern void uv_bau_message_intr1(void); 328extern void uv_bau_message_intr1(void);
330extern void uv_bau_timeout_intr1(void); 329extern void uv_bau_timeout_intr1(void);
331 330
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d364df03c1d6..37fa30bada17 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,11 +23,12 @@ nostackp := $(call cc-option, -fno-stack-protector)
23CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 23CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
24CFLAGS_hpet.o := $(nostackp) 24CFLAGS_hpet.o := $(nostackp)
25CFLAGS_tsc.o := $(nostackp) 25CFLAGS_tsc.o := $(nostackp)
26CFLAGS_paravirt.o := $(nostackp)
26 27
27obj-y := process_$(BITS).o signal.o entry_$(BITS).o 28obj-y := process_$(BITS).o signal.o entry_$(BITS).o
28obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 29obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
29obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o 30obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
30obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o 31obj-y += setup.o i8259.o irqinit_$(BITS).o
31obj-$(CONFIG_X86_VISWS) += visws_quirks.o 32obj-$(CONFIG_X86_VISWS) += visws_quirks.o
32obj-$(CONFIG_X86_32) += probe_roms_32.o 33obj-$(CONFIG_X86_32) += probe_roms_32.o
33obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 34obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -57,9 +58,9 @@ obj-$(CONFIG_PCI) += early-quirks.o
57apm-y := apm_32.o 58apm-y := apm_32.o
58obj-$(CONFIG_APM) += apm.o 59obj-$(CONFIG_APM) += apm.o
59obj-$(CONFIG_X86_SMP) += smp.o 60obj-$(CONFIG_X86_SMP) += smp.o
60obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o 61obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o
61obj-$(CONFIG_X86_32_SMP) += smpcommon.o 62obj-$(CONFIG_SMP) += setup_percpu.o
62obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o 63obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
63obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o 64obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
64obj-$(CONFIG_X86_MPPARSE) += mpparse.o 65obj-$(CONFIG_X86_MPPARSE) += mpparse.o
65obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o 66obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
@@ -114,10 +115,11 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
114### 115###
115# 64 bit specific files 116# 64 bit specific files
116ifeq ($(CONFIG_X86_64),y) 117ifeq ($(CONFIG_X86_64),y)
117 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o 118 obj-y += genapic_64.o genapic_flat_64.o
118 obj-y += bios_uv.o uv_irq.o uv_sysfs.o
119 obj-y += genx2apic_cluster.o 119 obj-y += genx2apic_cluster.o
120 obj-y += genx2apic_phys.o 120 obj-y += genx2apic_phys.o
121 obj-$(CONFIG_X86_UV) += genx2apic_uv_x.o tlb_uv.o
122 obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_sysfs.o
121 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 123 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
122 obj-$(CONFIG_AUDIT) += audit_64.o 124 obj-$(CONFIG_AUDIT) += audit_64.o
123 125
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index d37593c2f438..4cb5964f1499 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -912,8 +912,8 @@ static u8 __init uniq_ioapic_id(u8 id)
912 DECLARE_BITMAP(used, 256); 912 DECLARE_BITMAP(used, 256);
913 bitmap_zero(used, 256); 913 bitmap_zero(used, 256);
914 for (i = 0; i < nr_ioapics; i++) { 914 for (i = 0; i < nr_ioapics; i++) {
915 struct mp_config_ioapic *ia = &mp_ioapics[i]; 915 struct mpc_ioapic *ia = &mp_ioapics[i];
916 __set_bit(ia->mp_apicid, used); 916 __set_bit(ia->apicid, used);
917 } 917 }
918 if (!test_bit(id, used)) 918 if (!test_bit(id, used))
919 return id; 919 return id;
@@ -945,47 +945,47 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
945 945
946 idx = nr_ioapics; 946 idx = nr_ioapics;
947 947
948 mp_ioapics[idx].mp_type = MP_IOAPIC; 948 mp_ioapics[idx].type = MP_IOAPIC;
949 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE; 949 mp_ioapics[idx].flags = MPC_APIC_USABLE;
950 mp_ioapics[idx].mp_apicaddr = address; 950 mp_ioapics[idx].apicaddr = address;
951 951
952 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 952 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
953 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id); 953 mp_ioapics[idx].apicid = uniq_ioapic_id(id);
954#ifdef CONFIG_X86_32 954#ifdef CONFIG_X86_32
955 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx); 955 mp_ioapics[idx].apicver = io_apic_get_version(idx);
956#else 956#else
957 mp_ioapics[idx].mp_apicver = 0; 957 mp_ioapics[idx].apicver = 0;
958#endif 958#endif
959 /* 959 /*
960 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 960 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
961 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 961 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
962 */ 962 */
963 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid; 963 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
964 mp_ioapic_routing[idx].gsi_base = gsi_base; 964 mp_ioapic_routing[idx].gsi_base = gsi_base;
965 mp_ioapic_routing[idx].gsi_end = gsi_base + 965 mp_ioapic_routing[idx].gsi_end = gsi_base +
966 io_apic_get_redir_entries(idx); 966 io_apic_get_redir_entries(idx);
967 967
968 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " 968 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
969 "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid, 969 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
970 mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr, 970 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
971 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); 971 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
972 972
973 nr_ioapics++; 973 nr_ioapics++;
974} 974}
975 975
976static void assign_to_mp_irq(struct mp_config_intsrc *m, 976static void assign_to_mp_irq(struct mpc_intsrc *m,
977 struct mp_config_intsrc *mp_irq) 977 struct mpc_intsrc *mp_irq)
978{ 978{
979 memcpy(mp_irq, m, sizeof(struct mp_config_intsrc)); 979 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
980} 980}
981 981
982static int mp_irq_cmp(struct mp_config_intsrc *mp_irq, 982static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
983 struct mp_config_intsrc *m) 983 struct mpc_intsrc *m)
984{ 984{
985 return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc)); 985 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
986} 986}
987 987
988static void save_mp_irq(struct mp_config_intsrc *m) 988static void save_mp_irq(struct mpc_intsrc *m)
989{ 989{
990 int i; 990 int i;
991 991
@@ -1003,7 +1003,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
1003{ 1003{
1004 int ioapic; 1004 int ioapic;
1005 int pin; 1005 int pin;
1006 struct mp_config_intsrc mp_irq; 1006 struct mpc_intsrc mp_irq;
1007 1007
1008 /* 1008 /*
1009 * Convert 'gsi' to 'ioapic.pin'. 1009 * Convert 'gsi' to 'ioapic.pin'.
@@ -1021,13 +1021,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
1021 if ((bus_irq == 0) && (trigger == 3)) 1021 if ((bus_irq == 0) && (trigger == 3))
1022 trigger = 1; 1022 trigger = 1;
1023 1023
1024 mp_irq.mp_type = MP_INTSRC; 1024 mp_irq.type = MP_INTSRC;
1025 mp_irq.mp_irqtype = mp_INT; 1025 mp_irq.irqtype = mp_INT;
1026 mp_irq.mp_irqflag = (trigger << 2) | polarity; 1026 mp_irq.irqflag = (trigger << 2) | polarity;
1027 mp_irq.mp_srcbus = MP_ISA_BUS; 1027 mp_irq.srcbus = MP_ISA_BUS;
1028 mp_irq.mp_srcbusirq = bus_irq; /* IRQ */ 1028 mp_irq.srcbusirq = bus_irq; /* IRQ */
1029 mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */ 1029 mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
1030 mp_irq.mp_dstirq = pin; /* INTIN# */ 1030 mp_irq.dstirq = pin; /* INTIN# */
1031 1031
1032 save_mp_irq(&mp_irq); 1032 save_mp_irq(&mp_irq);
1033} 1033}
@@ -1037,7 +1037,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1037 int i; 1037 int i;
1038 int ioapic; 1038 int ioapic;
1039 unsigned int dstapic; 1039 unsigned int dstapic;
1040 struct mp_config_intsrc mp_irq; 1040 struct mpc_intsrc mp_irq;
1041 1041
1042#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 1042#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
1043 /* 1043 /*
@@ -1062,7 +1062,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1062 ioapic = mp_find_ioapic(0); 1062 ioapic = mp_find_ioapic(0);
1063 if (ioapic < 0) 1063 if (ioapic < 0)
1064 return; 1064 return;
1065 dstapic = mp_ioapics[ioapic].mp_apicid; 1065 dstapic = mp_ioapics[ioapic].apicid;
1066 1066
1067 /* 1067 /*
1068 * Use the default configuration for the IRQs 0-15. Unless 1068 * Use the default configuration for the IRQs 0-15. Unless
@@ -1072,16 +1072,14 @@ void __init mp_config_acpi_legacy_irqs(void)
1072 int idx; 1072 int idx;
1073 1073
1074 for (idx = 0; idx < mp_irq_entries; idx++) { 1074 for (idx = 0; idx < mp_irq_entries; idx++) {
1075 struct mp_config_intsrc *irq = mp_irqs + idx; 1075 struct mpc_intsrc *irq = mp_irqs + idx;
1076 1076
1077 /* Do we already have a mapping for this ISA IRQ? */ 1077 /* Do we already have a mapping for this ISA IRQ? */
1078 if (irq->mp_srcbus == MP_ISA_BUS 1078 if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i)
1079 && irq->mp_srcbusirq == i)
1080 break; 1079 break;
1081 1080
1082 /* Do we already have a mapping for this IOAPIC pin */ 1081 /* Do we already have a mapping for this IOAPIC pin */
1083 if (irq->mp_dstapic == dstapic && 1082 if (irq->dstapic == dstapic && irq->dstirq == i)
1084 irq->mp_dstirq == i)
1085 break; 1083 break;
1086 } 1084 }
1087 1085
@@ -1090,13 +1088,13 @@ void __init mp_config_acpi_legacy_irqs(void)
1090 continue; /* IRQ already used */ 1088 continue; /* IRQ already used */
1091 } 1089 }
1092 1090
1093 mp_irq.mp_type = MP_INTSRC; 1091 mp_irq.type = MP_INTSRC;
1094 mp_irq.mp_irqflag = 0; /* Conforming */ 1092 mp_irq.irqflag = 0; /* Conforming */
1095 mp_irq.mp_srcbus = MP_ISA_BUS; 1093 mp_irq.srcbus = MP_ISA_BUS;
1096 mp_irq.mp_dstapic = dstapic; 1094 mp_irq.dstapic = dstapic;
1097 mp_irq.mp_irqtype = mp_INT; 1095 mp_irq.irqtype = mp_INT;
1098 mp_irq.mp_srcbusirq = i; /* Identity mapped */ 1096 mp_irq.srcbusirq = i; /* Identity mapped */
1099 mp_irq.mp_dstirq = i; 1097 mp_irq.dstirq = i;
1100 1098
1101 save_mp_irq(&mp_irq); 1099 save_mp_irq(&mp_irq);
1102 } 1100 }
@@ -1207,22 +1205,22 @@ int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
1207 u32 gsi, int triggering, int polarity) 1205 u32 gsi, int triggering, int polarity)
1208{ 1206{
1209#ifdef CONFIG_X86_MPPARSE 1207#ifdef CONFIG_X86_MPPARSE
1210 struct mp_config_intsrc mp_irq; 1208 struct mpc_intsrc mp_irq;
1211 int ioapic; 1209 int ioapic;
1212 1210
1213 if (!acpi_ioapic) 1211 if (!acpi_ioapic)
1214 return 0; 1212 return 0;
1215 1213
1216 /* print the entry should happen on mptable identically */ 1214 /* print the entry should happen on mptable identically */
1217 mp_irq.mp_type = MP_INTSRC; 1215 mp_irq.type = MP_INTSRC;
1218 mp_irq.mp_irqtype = mp_INT; 1216 mp_irq.irqtype = mp_INT;
1219 mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | 1217 mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1220 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); 1218 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1221 mp_irq.mp_srcbus = number; 1219 mp_irq.srcbus = number;
1222 mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); 1220 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1223 ioapic = mp_find_ioapic(gsi); 1221 ioapic = mp_find_ioapic(gsi);
1224 mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id; 1222 mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
1225 mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base; 1223 mp_irq.dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
1226 1224
1227 save_mp_irq(&mp_irq); 1225 save_mp_irq(&mp_irq);
1228#endif 1226#endif
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index a60c1f3bcb87..7c243a2c5115 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,6 +101,7 @@ int acpi_save_state_mem(void)
101 stack_start.sp = temp_stack + sizeof(temp_stack); 101 stack_start.sp = temp_stack + sizeof(temp_stack);
102 early_gdt_descr.address = 102 early_gdt_descr.address =
103 (unsigned long)get_cpu_gdt_table(smp_processor_id()); 103 (unsigned long)get_cpu_gdt_table(smp_processor_id());
104 initial_gs = per_cpu_offset(smp_processor_id());
104#endif 105#endif
105 initial_code = (unsigned long)wakeup_long64; 106 initial_code = (unsigned long)wakeup_long64;
106 saved_magic = 0x123456789abcdef0; 107 saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 115449f869ee..383d827eef89 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -60,6 +60,24 @@
60# error SPURIOUS_APIC_VECTOR definition error 60# error SPURIOUS_APIC_VECTOR definition error
61#endif 61#endif
62 62
63unsigned int num_processors;
64unsigned disabled_cpus __cpuinitdata;
65/* Processor that is doing the boot up */
66unsigned int boot_cpu_physical_apicid = -1U;
67EXPORT_SYMBOL(boot_cpu_physical_apicid);
68unsigned int max_physical_apicid;
69
70/* Bitmask of physically existing CPUs */
71physid_mask_t phys_cpu_present_map;
72
73/*
74 * Map cpu index to physical APIC ID
75 */
76DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
77DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
78EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
79EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
80
63#ifdef CONFIG_X86_32 81#ifdef CONFIG_X86_32
64/* 82/*
65 * Knob to control our willingness to enable the local APIC. 83 * Knob to control our willingness to enable the local APIC.
@@ -1130,6 +1148,13 @@ void __cpuinit setup_local_APIC(void)
1130 unsigned int value; 1148 unsigned int value;
1131 int i, j; 1149 int i, j;
1132 1150
1151 if (disable_apic) {
1152#ifdef CONFIG_X86_IO_APIC
1153 disable_ioapic_setup();
1154#endif
1155 return;
1156 }
1157
1133#ifdef CONFIG_X86_32 1158#ifdef CONFIG_X86_32
1134 /* Pound the ESR really hard over the head with a big hammer - mbligh */ 1159 /* Pound the ESR really hard over the head with a big hammer - mbligh */
1135 if (lapic_is_integrated() && esr_disable) { 1160 if (lapic_is_integrated() && esr_disable) {
@@ -1570,11 +1595,11 @@ int apic_version[MAX_APICS];
1570 1595
1571int __init APIC_init_uniprocessor(void) 1596int __init APIC_init_uniprocessor(void)
1572{ 1597{
1573#ifdef CONFIG_X86_64
1574 if (disable_apic) { 1598 if (disable_apic) {
1575 pr_info("Apic disabled\n"); 1599 pr_info("Apic disabled\n");
1576 return -1; 1600 return -1;
1577 } 1601 }
1602#ifdef CONFIG_X86_64
1578 if (!cpu_has_apic) { 1603 if (!cpu_has_apic) {
1579 disable_apic = 1; 1604 disable_apic = 1;
1580 pr_info("Apic disabled by BIOS\n"); 1605 pr_info("Apic disabled by BIOS\n");
@@ -1877,17 +1902,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
1877#endif 1902#endif
1878 1903
1879#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) 1904#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1880 /* are we being called early in kernel startup? */ 1905 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1881 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1906 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1882 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1883 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1884
1885 cpu_to_apicid[cpu] = apicid;
1886 bios_cpu_apicid[cpu] = apicid;
1887 } else {
1888 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1889 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1890 }
1891#endif 1907#endif
1892 1908
1893 set_cpu_possible(cpu, true); 1909 set_cpu_possible(cpu, true);
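The shorter generic_processor_info() relies on the early_per_cpu() rework earlier in this patch: by taking the address of whichever backing store is currently live and dereferencing it, the macro is now an lvalue, so one assignment covers both the early map and the real per-cpu area.

	/* Sketch: what the assignments above expand to, schematically. */
	early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
	/* i.e.  *(x86_cpu_to_apicid_early_ptr
	 *          ? &x86_cpu_to_apicid_early_map[cpu]
	 *          : &per_cpu(x86_cpu_to_apicid, cpu)) = apicid;
	 */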
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1d41d3f1edbc..8793ab33e2c1 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -11,7 +11,6 @@
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/kbuild.h> 13#include <linux/kbuild.h>
14#include <asm/pda.h>
15#include <asm/processor.h> 14#include <asm/processor.h>
16#include <asm/segment.h> 15#include <asm/segment.h>
17#include <asm/thread_info.h> 16#include <asm/thread_info.h>
@@ -48,16 +47,6 @@ int main(void)
48#endif 47#endif
49 BLANK(); 48 BLANK();
50#undef ENTRY 49#undef ENTRY
51#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
52 ENTRY(kernelstack);
53 ENTRY(oldrsp);
54 ENTRY(pcurrent);
55 ENTRY(irqcount);
56 ENTRY(cpunumber);
57 ENTRY(irqstackptr);
58 ENTRY(data_offset);
59 BLANK();
60#undef ENTRY
61#ifdef CONFIG_PARAVIRT 50#ifdef CONFIG_PARAVIRT
62 BLANK(); 51 BLANK();
63 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); 52 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 83492b1f93b1..41b0de6df873 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -21,14 +21,16 @@
21#include <asm/asm.h> 21#include <asm/asm.h>
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/smp.h> 23#include <asm/smp.h>
24#include <asm/cpu.h>
25#include <asm/cpumask.h>
24#ifdef CONFIG_X86_LOCAL_APIC 26#ifdef CONFIG_X86_LOCAL_APIC
25#include <asm/mpspec.h> 27#include <asm/mpspec.h>
26#include <asm/apic.h> 28#include <asm/apic.h>
27#include <mach_apic.h> 29#include <mach_apic.h>
28#include <asm/genapic.h> 30#include <asm/genapic.h>
31#include <asm/uv/uv.h>
29#endif 32#endif
30 33
31#include <asm/pda.h>
32#include <asm/pgtable.h> 34#include <asm/pgtable.h>
33#include <asm/processor.h> 35#include <asm/processor.h>
34#include <asm/desc.h> 36#include <asm/desc.h>
@@ -50,6 +52,15 @@ cpumask_var_t cpu_initialized_mask;
50/* representing cpus for which sibling maps can be computed */ 52/* representing cpus for which sibling maps can be computed */
51cpumask_var_t cpu_sibling_setup_mask; 53cpumask_var_t cpu_sibling_setup_mask;
52 54
55/* correctly size the local cpu masks */
56void __init setup_cpu_local_masks(void)
57{
58 alloc_bootmem_cpumask_var(&cpu_initialized_mask);
59 alloc_bootmem_cpumask_var(&cpu_callin_mask);
60 alloc_bootmem_cpumask_var(&cpu_callout_mask);
61 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
62}
63
53#else /* CONFIG_X86_32 */ 64#else /* CONFIG_X86_32 */
54 65
55cpumask_t cpu_callin_map; 66cpumask_t cpu_callin_map;
@@ -62,23 +73,23 @@ cpumask_t cpu_sibling_setup_map;
62 73
63static struct cpu_dev *this_cpu __cpuinitdata; 74static struct cpu_dev *this_cpu __cpuinitdata;
64 75
76DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
65#ifdef CONFIG_X86_64 77#ifdef CONFIG_X86_64
66/* We need valid kernel segments for data and code in long mode too 78 /*
67 * IRET will check the segment types kkeil 2000/10/28 79 * We need valid kernel segments for data and code in long mode too
68 * Also sysret mandates a special GDT layout 80 * IRET will check the segment types kkeil 2000/10/28
69 */ 81 * Also sysret mandates a special GDT layout
70/* The TLS descriptors are currently at a different place compared to i386. 82 *
71 Hopefully nobody expects them at a fixed place (Wine?) */ 83 * The TLS descriptors are currently at a different place compared to i386.
72DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 84 * Hopefully nobody expects them at a fixed place (Wine?)
85 */
73 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, 86 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
74 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, 87 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
75 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, 88 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
76 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, 89 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
77 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, 90 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
78 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, 91 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
79} };
80#else 92#else
81DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
82 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 93 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
83 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 94 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
84 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 95 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -110,9 +121,9 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
110 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, 121 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
111 122
112 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 123 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
113 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 124 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
114} };
115#endif 125#endif
126} };
116EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 127EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
117 128
118#ifdef CONFIG_X86_32 129#ifdef CONFIG_X86_32
@@ -242,18 +253,28 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
242 253
243__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 254__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
244 255
256void load_percpu_segment(int cpu)
257{
258#ifdef CONFIG_X86_32
259 loadsegment(fs, __KERNEL_PERCPU);
260#else
261 loadsegment(gs, 0);
262 wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
263#endif
264}
265
245/* Current gdt points %fs at the "master" per-cpu area: after this, 266/* Current gdt points %fs at the "master" per-cpu area: after this,
246 * it's on the real one. */ 267 * it's on the real one. */
247void switch_to_new_gdt(void) 268void switch_to_new_gdt(int cpu)
248{ 269{
249 struct desc_ptr gdt_descr; 270 struct desc_ptr gdt_descr;
250 271
251 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); 272 gdt_descr.address = (long)get_cpu_gdt_table(cpu);
252 gdt_descr.size = GDT_SIZE - 1; 273 gdt_descr.size = GDT_SIZE - 1;
253 load_gdt(&gdt_descr); 274 load_gdt(&gdt_descr);
254#ifdef CONFIG_X86_32 275 /* Reload the per-cpu base */
255 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); 276
256#endif 277 load_percpu_segment(cpu);
257} 278}
258 279
259static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 280static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
@@ -877,54 +898,22 @@ static __init int setup_disablecpuid(char *arg)
877__setup("clearcpuid=", setup_disablecpuid); 898__setup("clearcpuid=", setup_disablecpuid);
878 899
879#ifdef CONFIG_X86_64 900#ifdef CONFIG_X86_64
880struct x8664_pda **_cpu_pda __read_mostly;
881EXPORT_SYMBOL(_cpu_pda);
882
883struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; 901struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
884 902
885static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; 903DEFINE_PER_CPU_FIRST(union irq_stack_union,
904 irq_stack_union) __aligned(PAGE_SIZE);
905DEFINE_PER_CPU(char *, irq_stack_ptr) =
906 init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
886 907
887void __cpuinit pda_init(int cpu) 908DEFINE_PER_CPU(unsigned long, kernel_stack) =
888{ 909 (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
889 struct x8664_pda *pda = cpu_pda(cpu); 910EXPORT_PER_CPU_SYMBOL(kernel_stack);
890 911
891 /* Setup up data that may be needed in __get_free_pages early */ 912DEFINE_PER_CPU(unsigned int, irq_count) = -1;
892 loadsegment(fs, 0);
893 loadsegment(gs, 0);
894 /* Memory clobbers used to order PDA accessed */
895 mb();
896 wrmsrl(MSR_GS_BASE, pda);
897 mb();
898
899 pda->cpunumber = cpu;
900 pda->irqcount = -1;
901 pda->kernelstack = (unsigned long)stack_thread_info() -
902 PDA_STACKOFFSET + THREAD_SIZE;
903 pda->active_mm = &init_mm;
904 pda->mmu_state = 0;
905
906 if (cpu == 0) {
907 /* others are initialized in smpboot.c */
908 pda->pcurrent = &init_task;
909 pda->irqstackptr = boot_cpu_stack;
910 pda->irqstackptr += IRQSTACKSIZE - 64;
911 } else {
912 if (!pda->irqstackptr) {
913 pda->irqstackptr = (char *)
914 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
915 if (!pda->irqstackptr)
916 panic("cannot allocate irqstack for cpu %d",
917 cpu);
918 pda->irqstackptr += IRQSTACKSIZE - 64;
919 }
920 913
921 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) 914static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
922 pda->nodenumber = cpu_to_node(cpu); 915 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
923 } 916 __aligned(PAGE_SIZE);
924}
925
926static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
927 DEBUG_STKSZ] __page_aligned_bss;
928 917
929extern asmlinkage void ignore_sysret(void); 918extern asmlinkage void ignore_sysret(void);
930 919
@@ -982,15 +971,14 @@ void __cpuinit cpu_init(void)
982 struct tss_struct *t = &per_cpu(init_tss, cpu); 971 struct tss_struct *t = &per_cpu(init_tss, cpu);
983 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); 972 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
984 unsigned long v; 973 unsigned long v;
985 char *estacks = NULL;
986 struct task_struct *me; 974 struct task_struct *me;
987 int i; 975 int i;
988 976
989 /* CPU 0 is initialised in head64.c */ 977#ifdef CONFIG_NUMA
990 if (cpu != 0) 978 if (cpu != 0 && percpu_read(node_number) == 0 &&
991 pda_init(cpu); 979 cpu_to_node(cpu) != NUMA_NO_NODE)
992 else 980 percpu_write(node_number, cpu_to_node(cpu));
993 estacks = boot_exception_stacks; 981#endif
994 982
995 me = current; 983 me = current;
996 984
@@ -1006,7 +994,9 @@ void __cpuinit cpu_init(void)
1006 * and set up the GDT descriptor: 994 * and set up the GDT descriptor:
1007 */ 995 */
1008 996
1009 switch_to_new_gdt(); 997 switch_to_new_gdt(cpu);
998 loadsegment(fs, 0);
999
1010 load_idt((const struct desc_ptr *)&idt_descr); 1000 load_idt((const struct desc_ptr *)&idt_descr);
1011 1001
1012 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); 1002 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
@@ -1024,18 +1014,13 @@ void __cpuinit cpu_init(void)
1024 * set up and load the per-CPU TSS 1014 * set up and load the per-CPU TSS
1025 */ 1015 */
1026 if (!orig_ist->ist[0]) { 1016 if (!orig_ist->ist[0]) {
1027 static const unsigned int order[N_EXCEPTION_STACKS] = { 1017 static const unsigned int sizes[N_EXCEPTION_STACKS] = {
1028 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, 1018 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
1029 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER 1019 [DEBUG_STACK - 1] = DEBUG_STKSZ
1030 }; 1020 };
1021 char *estacks = per_cpu(exception_stacks, cpu);
1031 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1022 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1032 if (cpu) { 1023 estacks += sizes[v];
1033 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1034 if (!estacks)
1035 panic("Cannot allocate exception "
1036 "stack %ld %d\n", v, cpu);
1037 }
1038 estacks += PAGE_SIZE << order[v];
1039 orig_ist->ist[v] = t->x86_tss.ist[v] = 1024 orig_ist->ist[v] = t->x86_tss.ist[v] =
1040 (unsigned long)estacks; 1025 (unsigned long)estacks;
1041 } 1026 }
@@ -1114,7 +1099,7 @@ void __cpuinit cpu_init(void)
1114 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 1099 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1115 1100
1116 load_idt(&idt_descr); 1101 load_idt(&idt_descr);
1117 switch_to_new_gdt(); 1102 switch_to_new_gdt(cpu);
1118 1103
1119 /* 1104 /*
1120 * Set up and load the per-CPU TSS and LDT 1105 * Set up and load the per-CPU TSS and LDT
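DEFINE_PER_CPU_FIRST pins irq_stack_union at the very start of each CPU's per-cpu area, which is what lets the single wrmsrl in load_percpu_segment() serve both ordinary %gs-based per-cpu accesses and gcc's hardcoded canary slot; schematically:

	/*
	 * Sketch of the resulting %gs layout on 64-bit (per CPU):
	 *
	 *   MSR_GS_BASE ->  +0   irq_stack_union (irq_stack / gs_base)
	 *                   +40  irq_stack_union.stack_canary   <- gcc's %gs:40
	 *                   ...  remainder of this CPU's per-cpu area
	 *
	 * irq_stack_ptr then points 64 bytes below the top of irq_stack, and
	 * kernel_stack, current_task, etc. live elsewhere in the same area.
	 */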
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index da299eb85fc0..7293508d8f5c 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -147,7 +147,16 @@ struct _cpuid4_info {
147 union _cpuid4_leaf_ecx ecx; 147 union _cpuid4_leaf_ecx ecx;
148 unsigned long size; 148 unsigned long size;
149 unsigned long can_disable; 149 unsigned long can_disable;
150 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ 150 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
151};
152
153/* subset of above _cpuid4_info w/o shared_cpu_map */
154struct _cpuid4_info_regs {
155 union _cpuid4_leaf_eax eax;
156 union _cpuid4_leaf_ebx ebx;
157 union _cpuid4_leaf_ecx ecx;
158 unsigned long size;
159 unsigned long can_disable;
151}; 160};
152 161
153#ifdef CONFIG_PCI 162#ifdef CONFIG_PCI
@@ -278,7 +287,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
278} 287}
279 288
280static void __cpuinit 289static void __cpuinit
281amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) 290amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
282{ 291{
283 if (index < 3) 292 if (index < 3)
284 return; 293 return;
@@ -286,7 +295,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
286} 295}
287 296
288static int 297static int
289__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) 298__cpuinit cpuid4_cache_lookup_regs(int index,
299 struct _cpuid4_info_regs *this_leaf)
290{ 300{
291 union _cpuid4_leaf_eax eax; 301 union _cpuid4_leaf_eax eax;
292 union _cpuid4_leaf_ebx ebx; 302 union _cpuid4_leaf_ebx ebx;
@@ -314,6 +324,15 @@ __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
314 return 0; 324 return 0;
315} 325}
316 326
327static int
328__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
329{
330 struct _cpuid4_info_regs *leaf_regs =
331 (struct _cpuid4_info_regs *)this_leaf;
332
333 return cpuid4_cache_lookup_regs(index, leaf_regs);
334}
335
317static int __cpuinit find_num_cache_leaves(void) 336static int __cpuinit find_num_cache_leaves(void)
318{ 337{
319 unsigned int eax, ebx, ecx, edx; 338 unsigned int eax, ebx, ecx, edx;
@@ -353,11 +372,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
353 * parameters cpuid leaf to find the cache details 372 * parameters cpuid leaf to find the cache details
354 */ 373 */
355 for (i = 0; i < num_cache_leaves; i++) { 374 for (i = 0; i < num_cache_leaves; i++) {
356 struct _cpuid4_info this_leaf; 375 struct _cpuid4_info_regs this_leaf;
357
358 int retval; 376 int retval;
359 377
360 retval = cpuid4_cache_lookup(i, &this_leaf); 378 retval = cpuid4_cache_lookup_regs(i, &this_leaf);
361 if (retval >= 0) { 379 if (retval >= 0) {
362 switch(this_leaf.eax.split.level) { 380 switch(this_leaf.eax.split.level) {
363 case 1: 381 case 1:
@@ -506,17 +524,20 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
506 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; 524 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
507 525
508 if (num_threads_sharing == 1) 526 if (num_threads_sharing == 1)
509 cpu_set(cpu, this_leaf->shared_cpu_map); 527 cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
510 else { 528 else {
511 index_msb = get_count_order(num_threads_sharing); 529 index_msb = get_count_order(num_threads_sharing);
512 530
513 for_each_online_cpu(i) { 531 for_each_online_cpu(i) {
514 if (cpu_data(i).apicid >> index_msb == 532 if (cpu_data(i).apicid >> index_msb ==
515 c->apicid >> index_msb) { 533 c->apicid >> index_msb) {
516 cpu_set(i, this_leaf->shared_cpu_map); 534 cpumask_set_cpu(i,
535 to_cpumask(this_leaf->shared_cpu_map));
517 if (i != cpu && per_cpu(cpuid4_info, i)) { 536 if (i != cpu && per_cpu(cpuid4_info, i)) {
518 sibling_leaf = CPUID4_INFO_IDX(i, index); 537 sibling_leaf =
519 cpu_set(cpu, sibling_leaf->shared_cpu_map); 538 CPUID4_INFO_IDX(i, index);
539 cpumask_set_cpu(cpu, to_cpumask(
540 sibling_leaf->shared_cpu_map));
520 } 541 }
521 } 542 }
522 } 543 }
@@ -528,9 +549,10 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
528 int sibling; 549 int sibling;
529 550
530 this_leaf = CPUID4_INFO_IDX(cpu, index); 551 this_leaf = CPUID4_INFO_IDX(cpu, index);
531 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { 552 for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
532 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 553 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
533 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 554 cpumask_clear_cpu(cpu,
555 to_cpumask(sibling_leaf->shared_cpu_map));
534 } 556 }
535} 557}
536#else 558#else
@@ -635,8 +657,9 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
635 int n = 0; 657 int n = 0;
636 658
637 if (len > 1) { 659 if (len > 1) {
638 cpumask_t *mask = &this_leaf->shared_cpu_map; 660 const struct cpumask *mask;
639 661
662 mask = to_cpumask(this_leaf->shared_cpu_map);
640 n = type? 663 n = type?
641 cpulist_scnprintf(buf, len-2, mask) : 664 cpulist_scnprintf(buf, len-2, mask) :
642 cpumask_scnprintf(buf, len-2, mask); 665 cpumask_scnprintf(buf, len-2, mask);
@@ -699,7 +722,8 @@ static struct pci_dev *get_k8_northbridge(int node)
699 722
700static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) 723static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
701{ 724{
702 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); 725 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
726 int node = cpu_to_node(cpumask_first(mask));
703 struct pci_dev *dev = NULL; 727 struct pci_dev *dev = NULL;
704 ssize_t ret = 0; 728 ssize_t ret = 0;
705 int i; 729 int i;
@@ -733,7 +757,8 @@ static ssize_t
733store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, 757store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
734 size_t count) 758 size_t count)
735{ 759{
736 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); 760 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
761 int node = cpu_to_node(cpumask_first(mask));
737 struct pci_dev *dev = NULL; 762 struct pci_dev *dev = NULL;
738 unsigned int ret, index, val; 763 unsigned int ret, index, val;
739 764
@@ -878,7 +903,7 @@ err_out:
878 return -ENOMEM; 903 return -ENOMEM;
879} 904}
880 905
881static cpumask_t cache_dev_map = CPU_MASK_NONE; 906static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
882 907
883/* Add/Remove cache interface for CPU device */ 908/* Add/Remove cache interface for CPU device */
884static int __cpuinit cache_add_dev(struct sys_device * sys_dev) 909static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
@@ -918,7 +943,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
918 } 943 }
919 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 944 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
920 } 945 }
921 cpu_set(cpu, cache_dev_map); 946 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
922 947
923 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 948 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
924 return 0; 949 return 0;
@@ -931,9 +956,9 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
931 956
932 if (per_cpu(cpuid4_info, cpu) == NULL) 957 if (per_cpu(cpuid4_info, cpu) == NULL)
933 return; 958 return;
934 if (!cpu_isset(cpu, cache_dev_map)) 959 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
935 return; 960 return;
936 cpu_clear(cpu, cache_dev_map); 961 cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
937 962
938 for (i = 0; i < num_cache_leaves; i++) 963 for (i = 0; i < num_cache_leaves; i++)
939 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); 964 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
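Editor's note on the intel_cacheinfo.c hunks above: the embedded cpumask_t values (shared_cpu_map, cache_dev_map) become raw NR_CPUS-bit bitmaps that are viewed through to_cpumask() at every use. A hedged sketch of that pattern; the mark_shared() helper and the trimmed struct are illustrative only:

#include <linux/cpumask.h>

struct _cpuid4_info {
	/* eax/ebx/ecx/size/can_disable fields elided for this sketch */
	DECLARE_BITMAP(shared_cpu_map, NR_CPUS);	/* raw storage, no cpumask type */
};

/* to_cpumask() just reinterprets the unsigned long array as a
 * struct cpumask *, so the pointer-based helpers can operate on it. */
static void mark_shared(struct _cpuid4_info *leaf, unsigned int cpu)
{
	cpumask_set_cpu(cpu, to_cpumask(leaf->shared_cpu_map));
}

The same to_cpumask() view is what feeds cpulist_scnprintf()/cpumask_scnprintf() and cpumask_first() in the show/store hunks.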
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 8ae8c4ff094d..4772e91e8246 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -67,7 +67,7 @@ static struct threshold_block threshold_defaults = {
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject *kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73 73
@@ -481,7 +481,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
481 481
482#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
483 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 483 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
484 i = first_cpu(per_cpu(cpu_core_map, cpu)); 484 i = cpumask_first(&per_cpu(cpu_core_map, cpu));
485 485
486 /* first core not up yet */ 486 /* first core not up yet */
487 if (cpu_data(i).cpu_core_id) 487 if (cpu_data(i).cpu_core_id)
@@ -501,7 +501,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
501 if (err) 501 if (err)
502 goto out; 502 goto out;
503 503
504 b->cpus = per_cpu(cpu_core_map, cpu); 504 cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
505 per_cpu(threshold_banks, cpu)[bank] = b; 505 per_cpu(threshold_banks, cpu)[bank] = b;
506 goto out; 506 goto out;
507 } 507 }
@@ -512,15 +512,20 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
512 err = -ENOMEM; 512 err = -ENOMEM;
513 goto out; 513 goto out;
514 } 514 }
515 if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
516 kfree(b);
517 err = -ENOMEM;
518 goto out;
519 }
515 520
516 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); 521 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
517 if (!b->kobj) 522 if (!b->kobj)
518 goto out_free; 523 goto out_free;
519 524
520#ifndef CONFIG_SMP 525#ifndef CONFIG_SMP
521 b->cpus = CPU_MASK_ALL; 526 cpumask_setall(b->cpus);
522#else 527#else
523 b->cpus = per_cpu(cpu_core_map, cpu); 528 cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
524#endif 529#endif
525 530
526 per_cpu(threshold_banks, cpu)[bank] = b; 531 per_cpu(threshold_banks, cpu)[bank] = b;
@@ -529,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
529 if (err) 534 if (err)
530 goto out_free; 535 goto out_free;
531 536
532 for_each_cpu_mask_nr(i, b->cpus) { 537 for_each_cpu(i, b->cpus) {
533 if (i == cpu) 538 if (i == cpu)
534 continue; 539 continue;
535 540
@@ -545,6 +550,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
545 550
546out_free: 551out_free:
547 per_cpu(threshold_banks, cpu)[bank] = NULL; 552 per_cpu(threshold_banks, cpu)[bank] = NULL;
553 free_cpumask_var(b->cpus);
548 kfree(b); 554 kfree(b);
549out: 555out:
550 return err; 556 return err;
@@ -619,7 +625,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
619#endif 625#endif
620 626
621 /* remove all sibling symlinks before unregistering */ 627 /* remove all sibling symlinks before unregistering */
622 for_each_cpu_mask_nr(i, b->cpus) { 628 for_each_cpu(i, b->cpus) {
623 if (i == cpu) 629 if (i == cpu)
624 continue; 630 continue;
625 631
@@ -632,6 +638,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
632free_out: 638free_out:
633 kobject_del(b->kobj); 639 kobject_del(b->kobj);
634 kobject_put(b->kobj); 640 kobject_put(b->kobj);
641 free_cpumask_var(b->cpus);
635 kfree(b); 642 kfree(b);
636 per_cpu(threshold_banks, cpu)[bank] = NULL; 643 per_cpu(threshold_banks, cpu)[bank] = NULL;
637} 644}
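Editor's note on the mce_amd_64.c hunks above: threshold_bank.cpus changes from an embedded cpumask_t to a cpumask_var_t, which has to be allocated and freed explicitly. A minimal sketch of that lifecycle, assuming the same GFP_KERNEL context as threshold_create_bank(); bank_init() and bank_free() are hypothetical wrappers, not functions from the patch:

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/percpu.h>

struct threshold_bank {
	struct kobject *kobj;
	struct threshold_block *blocks;
	cpumask_var_t cpus;	/* heap-allocated when CONFIG_CPUMASK_OFFSTACK=y */
};

static int bank_init(struct threshold_bank *b, unsigned int cpu)
{
	if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL))
		return -ENOMEM;			/* allocation can fail now */

	cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
	return 0;
}

static void bank_free(struct threshold_bank *b)
{
	free_cpumask_var(b->cpus);		/* must pair with alloc_cpumask_var() */
}

That pairing is exactly what the new free_cpumask_var() calls in the out_free and free_out paths above provide.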
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 4b48f251fd39..5e8c79e748a6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -7,6 +7,7 @@
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/percpu.h> 8#include <linux/percpu.h>
9#include <asm/processor.h> 9#include <asm/processor.h>
10#include <asm/apic.h>
10#include <asm/msr.h> 11#include <asm/msr.h>
11#include <asm/mce.h> 12#include <asm/mce.h>
12#include <asm/hw_irq.h> 13#include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c689d19e35ab..11b93cabdf78 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -24,7 +24,7 @@
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/hpet.h> 25#include <asm/hpet.h>
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <asm/smp.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30 30
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index c302d0707048..d35db5993fd6 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -106,7 +106,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
106 const struct stacktrace_ops *ops, void *data) 106 const struct stacktrace_ops *ops, void *data)
107{ 107{
108 const unsigned cpu = get_cpu(); 108 const unsigned cpu = get_cpu();
109 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; 109 unsigned long *irq_stack_end =
110 (unsigned long *)per_cpu(irq_stack_ptr, cpu);
110 unsigned used = 0; 111 unsigned used = 0;
111 struct thread_info *tinfo; 112 struct thread_info *tinfo;
112 int graph = 0; 113 int graph = 0;
@@ -160,23 +161,23 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
160 stack = (unsigned long *) estack_end[-2]; 161 stack = (unsigned long *) estack_end[-2];
161 continue; 162 continue;
162 } 163 }
163 if (irqstack_end) { 164 if (irq_stack_end) {
164 unsigned long *irqstack; 165 unsigned long *irq_stack;
165 irqstack = irqstack_end - 166 irq_stack = irq_stack_end -
166 (IRQSTACKSIZE - 64) / sizeof(*irqstack); 167 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
167 168
168 if (stack >= irqstack && stack < irqstack_end) { 169 if (stack >= irq_stack && stack < irq_stack_end) {
169 if (ops->stack(data, "IRQ") < 0) 170 if (ops->stack(data, "IRQ") < 0)
170 break; 171 break;
171 bp = print_context_stack(tinfo, stack, bp, 172 bp = print_context_stack(tinfo, stack, bp,
172 ops, data, irqstack_end, &graph); 173 ops, data, irq_stack_end, &graph);
173 /* 174 /*
174 * We link to the next stack (which would be 175 * We link to the next stack (which would be
175 * the process stack normally) the last 176 * the process stack normally) the last
176 * pointer (index -1 to end) in the IRQ stack: 177 * pointer (index -1 to end) in the IRQ stack:
177 */ 178 */
178 stack = (unsigned long *) (irqstack_end[-1]); 179 stack = (unsigned long *) (irq_stack_end[-1]);
179 irqstack_end = NULL; 180 irq_stack_end = NULL;
180 ops->stack(data, "EOI"); 181 ops->stack(data, "EOI");
181 continue; 182 continue;
182 } 183 }
@@ -199,10 +200,10 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
199 unsigned long *stack; 200 unsigned long *stack;
200 int i; 201 int i;
201 const int cpu = smp_processor_id(); 202 const int cpu = smp_processor_id();
202 unsigned long *irqstack_end = 203 unsigned long *irq_stack_end =
203 (unsigned long *) (cpu_pda(cpu)->irqstackptr); 204 (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
204 unsigned long *irqstack = 205 unsigned long *irq_stack =
205 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); 206 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
206 207
207 /* 208 /*
208 * debugging aid: "show_stack(NULL, NULL);" prints the 209 * debugging aid: "show_stack(NULL, NULL);" prints the
@@ -218,9 +219,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
218 219
219 stack = sp; 220 stack = sp;
220 for (i = 0; i < kstack_depth_to_print; i++) { 221 for (i = 0; i < kstack_depth_to_print; i++) {
221 if (stack >= irqstack && stack <= irqstack_end) { 222 if (stack >= irq_stack && stack <= irq_stack_end) {
222 if (stack == irqstack_end) { 223 if (stack == irq_stack_end) {
223 stack = (unsigned long *) (irqstack_end[-1]); 224 stack = (unsigned long *) (irq_stack_end[-1]);
224 printk(" <EOI> "); 225 printk(" <EOI> ");
225 } 226 }
226 } else { 227 } else {
@@ -241,7 +242,7 @@ void show_registers(struct pt_regs *regs)
241 int i; 242 int i;
242 unsigned long sp; 243 unsigned long sp;
243 const int cpu = smp_processor_id(); 244 const int cpu = smp_processor_id();
244 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 245 struct task_struct *cur = current;
245 246
246 sp = regs->sp; 247 sp = regs->sp;
247 printk("CPU %d ", cpu); 248 printk("CPU %d ", cpu);
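Editor's note on the dumpstack_64.c hunks above: the IRQ-stack bounds are now read from a per-CPU variable instead of the per-CPU PDA. A small sketch of that access pattern, assuming irq_stack_ptr is a per-CPU char * pointing just past the stack, as the casts and the "- IRQ_STACK_SIZE" arithmetic in the hunk suggest; irq_stack_bounds() is a made-up helper name:

/* Derive the IRQ stack window for @cpu from per-CPU data rather than
 * cpu_pda(cpu)->irqstackptr. */
static void irq_stack_bounds(int cpu, unsigned long **base, unsigned long **end)
{
	char *top = per_cpu(irq_stack_ptr, cpu);	/* one past the highest slot */

	*end  = (unsigned long *)top;
	*base = (unsigned long *)(top - IRQ_STACK_SIZE);
}

The show_registers() change in the same hunk follows the same theme: pcurrent moves out of the PDA, so the code simply uses current.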
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1119d247fe11..b205272ad394 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -366,10 +366,12 @@ void __init efi_init(void)
366 SMBIOS_TABLE_GUID)) { 366 SMBIOS_TABLE_GUID)) {
367 efi.smbios = config_tables[i].table; 367 efi.smbios = config_tables[i].table;
368 printk(" SMBIOS=0x%lx ", config_tables[i].table); 368 printk(" SMBIOS=0x%lx ", config_tables[i].table);
369#ifdef CONFIG_X86_UV
369 } else if (!efi_guidcmp(config_tables[i].guid, 370 } else if (!efi_guidcmp(config_tables[i].guid,
370 UV_SYSTEM_TABLE_GUID)) { 371 UV_SYSTEM_TABLE_GUID)) {
371 efi.uv_systab = config_tables[i].table; 372 efi.uv_systab = config_tables[i].table;
372 printk(" UVsystab=0x%lx ", config_tables[i].table); 373 printk(" UVsystab=0x%lx ", config_tables[i].table);
374#endif
373 } else if (!efi_guidcmp(config_tables[i].guid, 375 } else if (!efi_guidcmp(config_tables[i].guid,
374 HCDP_TABLE_GUID)) { 376 HCDP_TABLE_GUID)) {
375 efi.hcdp = config_tables[i].table; 377 efi.hcdp = config_tables[i].table;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 652c5287215f..a4ee29127fdf 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -36,6 +36,7 @@
36#include <asm/proto.h> 36#include <asm/proto.h>
37#include <asm/efi.h> 37#include <asm/efi.h>
38#include <asm/cacheflush.h> 38#include <asm/cacheflush.h>
39#include <asm/fixmap.h>
39 40
40static pgd_t save_pgd __initdata; 41static pgd_t save_pgd __initdata;
41static unsigned long efi_flags __initdata; 42static unsigned long efi_flags __initdata;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 46469029e9d3..a0b91aac72a1 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -672,7 +672,7 @@ common_interrupt:
672ENDPROC(common_interrupt) 672ENDPROC(common_interrupt)
673 CFI_ENDPROC 673 CFI_ENDPROC
674 674
675#define BUILD_INTERRUPT(name, nr) \ 675#define BUILD_INTERRUPT3(name, nr, fn) \
676ENTRY(name) \ 676ENTRY(name) \
677 RING0_INT_FRAME; \ 677 RING0_INT_FRAME; \
678 pushl $~(nr); \ 678 pushl $~(nr); \
@@ -680,11 +680,13 @@ ENTRY(name) \
680 SAVE_ALL; \ 680 SAVE_ALL; \
681 TRACE_IRQS_OFF \ 681 TRACE_IRQS_OFF \
682 movl %esp,%eax; \ 682 movl %esp,%eax; \
683 call smp_##name; \ 683 call fn; \
684 jmp ret_from_intr; \ 684 jmp ret_from_intr; \
685 CFI_ENDPROC; \ 685 CFI_ENDPROC; \
686ENDPROC(name) 686ENDPROC(name)
687 687
688#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
689
688/* The include is where all of the SMP etc. interrupts come from */ 690/* The include is where all of the SMP etc. interrupts come from */
689#include "entry_arch.h" 691#include "entry_arch.h"
690 692
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a1346217e43c..586bed677557 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -52,6 +52,7 @@
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/percpu.h>
55 56
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h> 58#include <linux/elf-em.h>
@@ -209,7 +210,7 @@ ENTRY(native_usergs_sysret64)
209 210
210 /* %rsp:at FRAMEEND */ 211 /* %rsp:at FRAMEEND */
211 .macro FIXUP_TOP_OF_STACK tmp offset=0 212 .macro FIXUP_TOP_OF_STACK tmp offset=0
212 movq %gs:pda_oldrsp,\tmp 213 movq PER_CPU_VAR(old_rsp),\tmp
213 movq \tmp,RSP+\offset(%rsp) 214 movq \tmp,RSP+\offset(%rsp)
214 movq $__USER_DS,SS+\offset(%rsp) 215 movq $__USER_DS,SS+\offset(%rsp)
215 movq $__USER_CS,CS+\offset(%rsp) 216 movq $__USER_CS,CS+\offset(%rsp)
@@ -220,7 +221,7 @@ ENTRY(native_usergs_sysret64)
220 221
221 .macro RESTORE_TOP_OF_STACK tmp offset=0 222 .macro RESTORE_TOP_OF_STACK tmp offset=0
222 movq RSP+\offset(%rsp),\tmp 223 movq RSP+\offset(%rsp),\tmp
223 movq \tmp,%gs:pda_oldrsp 224 movq \tmp,PER_CPU_VAR(old_rsp)
224 movq EFLAGS+\offset(%rsp),\tmp 225 movq EFLAGS+\offset(%rsp),\tmp
225 movq \tmp,R11+\offset(%rsp) 226 movq \tmp,R11+\offset(%rsp)
226 .endm 227 .endm
@@ -336,15 +337,15 @@ ENTRY(save_args)
336 je 1f 337 je 1f
337 SWAPGS 338 SWAPGS
338 /* 339 /*
339 * irqcount is used to check if a CPU is already on an interrupt stack 340 * irq_count is used to check if a CPU is already on an interrupt stack
340 * or not. While this is essentially redundant with preempt_count it is 341 * or not. While this is essentially redundant with preempt_count it is
341 * a little cheaper to use a separate counter in the PDA (short of 342 * a little cheaper to use a separate counter in the PDA (short of
342 * moving irq_enter into assembly, which would be too much work) 343 * moving irq_enter into assembly, which would be too much work)
343 */ 344 */
3441: incl %gs:pda_irqcount 3451: incl PER_CPU_VAR(irq_count)
345 jne 2f 346 jne 2f
346 popq_cfi %rax /* move return address... */ 347 popq_cfi %rax /* move return address... */
347 mov %gs:pda_irqstackptr,%rsp 348 mov PER_CPU_VAR(irq_stack_ptr),%rsp
348 EMPTY_FRAME 0 349 EMPTY_FRAME 0
349 pushq_cfi %rbp /* backlink for unwinder */ 350 pushq_cfi %rbp /* backlink for unwinder */
350 pushq_cfi %rax /* ... to the new stack */ 351 pushq_cfi %rax /* ... to the new stack */
@@ -468,7 +469,7 @@ END(ret_from_fork)
468ENTRY(system_call) 469ENTRY(system_call)
469 CFI_STARTPROC simple 470 CFI_STARTPROC simple
470 CFI_SIGNAL_FRAME 471 CFI_SIGNAL_FRAME
471 CFI_DEF_CFA rsp,PDA_STACKOFFSET 472 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
472 CFI_REGISTER rip,rcx 473 CFI_REGISTER rip,rcx
473 /*CFI_REGISTER rflags,r11*/ 474 /*CFI_REGISTER rflags,r11*/
474 SWAPGS_UNSAFE_STACK 475 SWAPGS_UNSAFE_STACK
@@ -479,8 +480,8 @@ ENTRY(system_call)
479 */ 480 */
480ENTRY(system_call_after_swapgs) 481ENTRY(system_call_after_swapgs)
481 482
482 movq %rsp,%gs:pda_oldrsp 483 movq %rsp,PER_CPU_VAR(old_rsp)
483 movq %gs:pda_kernelstack,%rsp 484 movq PER_CPU_VAR(kernel_stack),%rsp
484 /* 485 /*
485 * No need to follow this irqs off/on section - it's straight 486 * No need to follow this irqs off/on section - it's straight
486 * and short: 487 * and short:
@@ -523,7 +524,7 @@ sysret_check:
523 CFI_REGISTER rip,rcx 524 CFI_REGISTER rip,rcx
524 RESTORE_ARGS 0,-ARG_SKIP,1 525 RESTORE_ARGS 0,-ARG_SKIP,1
525 /*CFI_REGISTER rflags,r11*/ 526 /*CFI_REGISTER rflags,r11*/
526 movq %gs:pda_oldrsp, %rsp 527 movq PER_CPU_VAR(old_rsp), %rsp
527 USERGS_SYSRET64 528 USERGS_SYSRET64
528 529
529 CFI_RESTORE_STATE 530 CFI_RESTORE_STATE
@@ -833,11 +834,11 @@ common_interrupt:
833 XCPT_FRAME 834 XCPT_FRAME
834 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 835 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
835 interrupt do_IRQ 836 interrupt do_IRQ
836 /* 0(%rsp): oldrsp-ARGOFFSET */ 837 /* 0(%rsp): old_rsp-ARGOFFSET */
837ret_from_intr: 838ret_from_intr:
838 DISABLE_INTERRUPTS(CLBR_NONE) 839 DISABLE_INTERRUPTS(CLBR_NONE)
839 TRACE_IRQS_OFF 840 TRACE_IRQS_OFF
840 decl %gs:pda_irqcount 841 decl PER_CPU_VAR(irq_count)
841 leaveq 842 leaveq
842 CFI_DEF_CFA_REGISTER rsp 843 CFI_DEF_CFA_REGISTER rsp
843 CFI_ADJUST_CFA_OFFSET -8 844 CFI_ADJUST_CFA_OFFSET -8
@@ -982,8 +983,10 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
982 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt 983 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
983#endif 984#endif
984 985
986#ifdef CONFIG_X86_UV
985apicinterrupt UV_BAU_MESSAGE \ 987apicinterrupt UV_BAU_MESSAGE \
986 uv_bau_message_intr1 uv_bau_message_interrupt 988 uv_bau_message_intr1 uv_bau_message_interrupt
989#endif
987apicinterrupt LOCAL_TIMER_VECTOR \ 990apicinterrupt LOCAL_TIMER_VECTOR \
988 apic_timer_interrupt smp_apic_timer_interrupt 991 apic_timer_interrupt smp_apic_timer_interrupt
989 992
@@ -1073,10 +1076,10 @@ ENTRY(\sym)
1073 TRACE_IRQS_OFF 1076 TRACE_IRQS_OFF
1074 movq %rsp,%rdi /* pt_regs pointer */ 1077 movq %rsp,%rdi /* pt_regs pointer */
1075 xorl %esi,%esi /* no error code */ 1078 xorl %esi,%esi /* no error code */
1076 movq %gs:pda_data_offset, %rbp 1079 PER_CPU(init_tss, %rbp)
1077 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1078 call \do_sym 1081 call \do_sym
1079 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1080 jmp paranoid_exit /* %ebx: no swapgs flag */ 1083 jmp paranoid_exit /* %ebx: no swapgs flag */
1081 CFI_ENDPROC 1084 CFI_ENDPROC
1082END(\sym) 1085END(\sym)
@@ -1138,7 +1141,7 @@ ENTRY(native_load_gs_index)
1138 CFI_STARTPROC 1141 CFI_STARTPROC
1139 pushf 1142 pushf
1140 CFI_ADJUST_CFA_OFFSET 8 1143 CFI_ADJUST_CFA_OFFSET 8
1141 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) 1144 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1142 SWAPGS 1145 SWAPGS
1143gs_change: 1146gs_change:
1144 movl %edi,%gs 1147 movl %edi,%gs
@@ -1260,14 +1263,14 @@ ENTRY(call_softirq)
1260 CFI_REL_OFFSET rbp,0 1263 CFI_REL_OFFSET rbp,0
1261 mov %rsp,%rbp 1264 mov %rsp,%rbp
1262 CFI_DEF_CFA_REGISTER rbp 1265 CFI_DEF_CFA_REGISTER rbp
1263 incl %gs:pda_irqcount 1266 incl PER_CPU_VAR(irq_count)
1264 cmove %gs:pda_irqstackptr,%rsp 1267 cmove PER_CPU_VAR(irq_stack_ptr),%rsp
1265 push %rbp # backlink for old unwinder 1268 push %rbp # backlink for old unwinder
1266 call __do_softirq 1269 call __do_softirq
1267 leaveq 1270 leaveq
1268 CFI_DEF_CFA_REGISTER rsp 1271 CFI_DEF_CFA_REGISTER rsp
1269 CFI_ADJUST_CFA_OFFSET -8 1272 CFI_ADJUST_CFA_OFFSET -8
1270 decl %gs:pda_irqcount 1273 decl PER_CPU_VAR(irq_count)
1271 ret 1274 ret
1272 CFI_ENDPROC 1275 CFI_ENDPROC
1273END(call_softirq) 1276END(call_softirq)
@@ -1297,15 +1300,15 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1297 movq %rdi, %rsp # we don't return, adjust the stack frame 1300 movq %rdi, %rsp # we don't return, adjust the stack frame
1298 CFI_ENDPROC 1301 CFI_ENDPROC
1299 DEFAULT_FRAME 1302 DEFAULT_FRAME
130011: incl %gs:pda_irqcount 130311: incl PER_CPU_VAR(irq_count)
1301 movq %rsp,%rbp 1304 movq %rsp,%rbp
1302 CFI_DEF_CFA_REGISTER rbp 1305 CFI_DEF_CFA_REGISTER rbp
1303 cmovzq %gs:pda_irqstackptr,%rsp 1306 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
1304 pushq %rbp # backlink for old unwinder 1307 pushq %rbp # backlink for old unwinder
1305 call xen_evtchn_do_upcall 1308 call xen_evtchn_do_upcall
1306 popq %rsp 1309 popq %rsp
1307 CFI_DEF_CFA_REGISTER rsp 1310 CFI_DEF_CFA_REGISTER rsp
1308 decl %gs:pda_irqcount 1311 decl PER_CPU_VAR(irq_count)
1309 jmp error_exit 1312 jmp error_exit
1310 CFI_ENDPROC 1313 CFI_ENDPROC
1311END(do_hypervisor_callback) 1314END(do_hypervisor_callback)
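Editor's note on the entry_64.S hunks above: every %gs:pda_* operand becomes PER_CPU_VAR(<name>), so the assembly now addresses ordinary per-CPU variables (irq_count, irq_stack_ptr, old_rsp, kernel_stack). A C-side sketch of the counterpart, under the assumption that irq_count is a plain DEFINE_PER_CPU symbol with the type its uses imply; the on_irq_stack() helper is illustrative only:

/* Assumed declaration matching the symbol the assembly references.
 * The incl/jne pair in save_args implies irq_count starts at -1. */
DEFINE_PER_CPU(unsigned int, irq_count) = -1;

/* From C the same %gs-relative slot is reached with percpu_read(),
 * the accessor the cpu_init() hunk above already uses for node_number. */
static inline int on_irq_stack(void)
{
	return percpu_read(irq_count) != -1;
}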
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 2bced78b0b8e..e656c2721154 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -32,7 +32,9 @@ extern struct genapic apic_x2apic_cluster;
32struct genapic __read_mostly *genapic = &apic_flat; 32struct genapic __read_mostly *genapic = &apic_flat;
33 33
34static struct genapic *apic_probe[] __initdata = { 34static struct genapic *apic_probe[] __initdata = {
35#ifdef CONFIG_X86_UV
35 &apic_x2apic_uv_x, 36 &apic_x2apic_uv_x,
37#endif
36 &apic_x2apic_phys, 38 &apic_x2apic_phys,
37 &apic_x2apic_cluster, 39 &apic_x2apic_cluster,
38 &apic_physflat, 40 &apic_physflat,
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index b193e082f6ce..bfe36249145c 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -25,6 +25,7 @@
25#include <asm/ipi.h> 25#include <asm/ipi.h>
26#include <asm/genapic.h> 26#include <asm/genapic.h>
27#include <asm/pgtable.h> 27#include <asm/pgtable.h>
28#include <asm/uv/uv.h>
28#include <asm/uv/uv_mmrs.h> 29#include <asm/uv/uv_mmrs.h>
29#include <asm/uv/uv_hub.h> 30#include <asm/uv/uv_hub.h>
30#include <asm/uv/bios.h> 31#include <asm/uv/bios.h>
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b9a4d8c4b935..f5b272247690 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,27 +26,6 @@
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27#include <asm/trampoline.h> 27#include <asm/trampoline.h>
28 28
29/* boot cpu pda */
30static struct x8664_pda _boot_cpu_pda;
31
32#ifdef CONFIG_SMP
33/*
34 * We install an empty cpu_pda pointer table to indicate to early users
35 * (numa_set_node) that the cpu_pda pointer table for cpus other than
36 * the boot cpu is not yet setup.
37 */
38static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39#else
40static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
41#endif
42
43void __init x86_64_init_pda(void)
44{
45 _cpu_pda = __cpu_pda;
46 cpu_pda(0) = &_boot_cpu_pda;
47 pda_init(0);
48}
49
50static void __init zap_identity_mappings(void) 29static void __init zap_identity_mappings(void)
51{ 30{
52 pgd_t *pgd = pgd_offset_k(0UL); 31 pgd_t *pgd = pgd_offset_k(0UL);
@@ -112,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
112 if (console_loglevel == 10) 91 if (console_loglevel == 10)
113 early_printk("Kernel alive\n"); 92 early_printk("Kernel alive\n");
114 93
115 x86_64_init_pda();
116
117 x86_64_start_reservations(real_mode_data); 94 x86_64_start_reservations(real_mode_data);
118} 95}
119 96
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index e835b4eea70b..24c0e5cd71e3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -429,12 +429,14 @@ is386: movl $2,%ecx # set MP
429 ljmp $(__KERNEL_CS),$1f 429 ljmp $(__KERNEL_CS),$1f
4301: movl $(__KERNEL_DS),%eax # reload all the segment registers 4301: movl $(__KERNEL_DS),%eax # reload all the segment registers
431 movl %eax,%ss # after changing gdt. 431 movl %eax,%ss # after changing gdt.
432 movl %eax,%fs # gets reset once there's real percpu
433 432
434 movl $(__USER_DS),%eax # DS/ES contains default USER segment 433 movl $(__USER_DS),%eax # DS/ES contains default USER segment
435 movl %eax,%ds 434 movl %eax,%ds
436 movl %eax,%es 435 movl %eax,%es
437 436
437 movl $(__KERNEL_PERCPU), %eax
438 movl %eax,%fs # set this cpu's percpu
439
438 xorl %eax,%eax # Clear GS and LDT 440 xorl %eax,%eax # Clear GS and LDT
439 movl %eax,%gs 441 movl %eax,%gs
440 lldt %ax 442 lldt %ax
@@ -446,8 +448,6 @@ is386: movl $2,%ecx # set MP
446 movb $1, ready 448 movb $1, ready
447 cmpb $0,%cl # the first CPU calls start_kernel 449 cmpb $0,%cl # the first CPU calls start_kernel
448 je 1f 450 je 1f
449 movl $(__KERNEL_PERCPU), %eax
450 movl %eax,%fs # set this cpu's percpu
451 movl (stack_start), %esp 451 movl (stack_start), %esp
4521: 4521:
453#endif /* CONFIG_SMP */ 453#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0e275d495563..2e648e3a5ea4 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,7 @@
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/cache.h> 20#include <asm/cache.h>
21#include <asm/processor-flags.h> 21#include <asm/processor-flags.h>
22#include <asm/percpu.h>
22 23
23#ifdef CONFIG_PARAVIRT 24#ifdef CONFIG_PARAVIRT
24#include <asm/asm-offsets.h> 25#include <asm/asm-offsets.h>
@@ -226,12 +227,15 @@ ENTRY(secondary_startup_64)
226 movl %eax,%fs 227 movl %eax,%fs
227 movl %eax,%gs 228 movl %eax,%gs
228 229
229 /* 230 /* Set up %gs.
230 * Setup up a dummy PDA. this is just for some early bootup code 231 *
231 * that does in_interrupt() 232 * The base of %gs always points to the bottom of the irqstack
232 */ 233 * union. If the stack protector canary is enabled, it is
234 * located at %gs:40. Note that, on SMP, the boot cpu uses
235 * init data section till per cpu areas are set up.
236 */
233 movl $MSR_GS_BASE,%ecx 237 movl $MSR_GS_BASE,%ecx
234 movq $empty_zero_page,%rax 238 movq initial_gs(%rip),%rax
235 movq %rax,%rdx 239 movq %rax,%rdx
236 shrq $32,%rdx 240 shrq $32,%rdx
237 wrmsr 241 wrmsr
@@ -257,6 +261,8 @@ ENTRY(secondary_startup_64)
257 .align 8 261 .align 8
258 ENTRY(initial_code) 262 ENTRY(initial_code)
259 .quad x86_64_start_kernel 263 .quad x86_64_start_kernel
264 ENTRY(initial_gs)
265 .quad INIT_PER_CPU_VAR(irq_stack_union)
260 __FINITDATA 266 __FINITDATA
261 267
262 ENTRY(stack_start) 268 ENTRY(stack_start)
@@ -401,7 +407,8 @@ NEXT_PAGE(level2_spare_pgt)
401 .globl early_gdt_descr 407 .globl early_gdt_descr
402early_gdt_descr: 408early_gdt_descr:
403 .word GDT_ENTRIES*8-1 409 .word GDT_ENTRIES*8-1
404 .quad per_cpu__gdt_page 410early_gdt_descr_base:
411 .quad INIT_PER_CPU_VAR(gdt_page)
405 412
406ENTRY(phys_base) 413ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 414 /* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 9b0c480c383b..0a7f6d6b1206 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -46,6 +46,7 @@
46#include <asm/idle.h> 46#include <asm/idle.h>
47#include <asm/io.h> 47#include <asm/io.h>
48#include <asm/smp.h> 48#include <asm/smp.h>
49#include <asm/cpu.h>
49#include <asm/desc.h> 50#include <asm/desc.h>
50#include <asm/proto.h> 51#include <asm/proto.h>
51#include <asm/acpi.h> 52#include <asm/acpi.h>
@@ -82,11 +83,11 @@ static DEFINE_SPINLOCK(vector_lock);
82int nr_ioapic_registers[MAX_IO_APICS]; 83int nr_ioapic_registers[MAX_IO_APICS];
83 84
84/* I/O APIC entries */ 85/* I/O APIC entries */
85struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; 86struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
86int nr_ioapics; 87int nr_ioapics;
87 88
88/* MP IRQ source entries */ 89/* MP IRQ source entries */
89struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 90struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
90 91
91/* # of MP IRQ source entries */ 92/* # of MP IRQ source entries */
92int mp_irq_entries; 93int mp_irq_entries;
@@ -356,7 +357,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
356 357
357 if (!cfg->move_in_progress) { 358 if (!cfg->move_in_progress) {
358 /* it means that domain is not changed */ 359 /* it means that domain is not changed */
359 if (!cpumask_intersects(&desc->affinity, mask)) 360 if (!cpumask_intersects(desc->affinity, mask))
360 cfg->move_desc_pending = 1; 361 cfg->move_desc_pending = 1;
361 } 362 }
362} 363}
@@ -386,7 +387,7 @@ struct io_apic {
386static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 387static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
387{ 388{
388 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 389 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
389 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); 390 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
390} 391}
391 392
392static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 393static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -579,9 +580,9 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
579 if (assign_irq_vector(irq, cfg, mask)) 580 if (assign_irq_vector(irq, cfg, mask))
580 return BAD_APICID; 581 return BAD_APICID;
581 582
582 cpumask_and(&desc->affinity, cfg->domain, mask); 583 cpumask_and(desc->affinity, cfg->domain, mask);
583 set_extra_move_desc(desc, mask); 584 set_extra_move_desc(desc, mask);
584 return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); 585 return cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
585} 586}
586 587
587static void 588static void
@@ -944,10 +945,10 @@ static int find_irq_entry(int apic, int pin, int type)
944 int i; 945 int i;
945 946
946 for (i = 0; i < mp_irq_entries; i++) 947 for (i = 0; i < mp_irq_entries; i++)
947 if (mp_irqs[i].mp_irqtype == type && 948 if (mp_irqs[i].irqtype == type &&
948 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || 949 (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
949 mp_irqs[i].mp_dstapic == MP_APIC_ALL) && 950 mp_irqs[i].dstapic == MP_APIC_ALL) &&
950 mp_irqs[i].mp_dstirq == pin) 951 mp_irqs[i].dstirq == pin)
951 return i; 952 return i;
952 953
953 return -1; 954 return -1;
@@ -961,13 +962,13 @@ static int __init find_isa_irq_pin(int irq, int type)
961 int i; 962 int i;
962 963
963 for (i = 0; i < mp_irq_entries; i++) { 964 for (i = 0; i < mp_irq_entries; i++) {
964 int lbus = mp_irqs[i].mp_srcbus; 965 int lbus = mp_irqs[i].srcbus;
965 966
966 if (test_bit(lbus, mp_bus_not_pci) && 967 if (test_bit(lbus, mp_bus_not_pci) &&
967 (mp_irqs[i].mp_irqtype == type) && 968 (mp_irqs[i].irqtype == type) &&
968 (mp_irqs[i].mp_srcbusirq == irq)) 969 (mp_irqs[i].srcbusirq == irq))
969 970
970 return mp_irqs[i].mp_dstirq; 971 return mp_irqs[i].dstirq;
971 } 972 }
972 return -1; 973 return -1;
973} 974}
@@ -977,17 +978,17 @@ static int __init find_isa_irq_apic(int irq, int type)
977 int i; 978 int i;
978 979
979 for (i = 0; i < mp_irq_entries; i++) { 980 for (i = 0; i < mp_irq_entries; i++) {
980 int lbus = mp_irqs[i].mp_srcbus; 981 int lbus = mp_irqs[i].srcbus;
981 982
982 if (test_bit(lbus, mp_bus_not_pci) && 983 if (test_bit(lbus, mp_bus_not_pci) &&
983 (mp_irqs[i].mp_irqtype == type) && 984 (mp_irqs[i].irqtype == type) &&
984 (mp_irqs[i].mp_srcbusirq == irq)) 985 (mp_irqs[i].srcbusirq == irq))
985 break; 986 break;
986 } 987 }
987 if (i < mp_irq_entries) { 988 if (i < mp_irq_entries) {
988 int apic; 989 int apic;
989 for(apic = 0; apic < nr_ioapics; apic++) { 990 for(apic = 0; apic < nr_ioapics; apic++) {
990 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) 991 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
991 return apic; 992 return apic;
992 } 993 }
993 } 994 }
@@ -1012,23 +1013,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
1012 return -1; 1013 return -1;
1013 } 1014 }
1014 for (i = 0; i < mp_irq_entries; i++) { 1015 for (i = 0; i < mp_irq_entries; i++) {
1015 int lbus = mp_irqs[i].mp_srcbus; 1016 int lbus = mp_irqs[i].srcbus;
1016 1017
1017 for (apic = 0; apic < nr_ioapics; apic++) 1018 for (apic = 0; apic < nr_ioapics; apic++)
1018 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || 1019 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
1019 mp_irqs[i].mp_dstapic == MP_APIC_ALL) 1020 mp_irqs[i].dstapic == MP_APIC_ALL)
1020 break; 1021 break;
1021 1022
1022 if (!test_bit(lbus, mp_bus_not_pci) && 1023 if (!test_bit(lbus, mp_bus_not_pci) &&
1023 !mp_irqs[i].mp_irqtype && 1024 !mp_irqs[i].irqtype &&
1024 (bus == lbus) && 1025 (bus == lbus) &&
1025 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { 1026 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1026 int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); 1027 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1027 1028
1028 if (!(apic || IO_APIC_IRQ(irq))) 1029 if (!(apic || IO_APIC_IRQ(irq)))
1029 continue; 1030 continue;
1030 1031
1031 if (pin == (mp_irqs[i].mp_srcbusirq & 3)) 1032 if (pin == (mp_irqs[i].srcbusirq & 3))
1032 return irq; 1033 return irq;
1033 /* 1034 /*
1034 * Use the first all-but-pin matching entry as a 1035 * Use the first all-but-pin matching entry as a
@@ -1071,7 +1072,7 @@ static int EISA_ELCR(unsigned int irq)
1071 * EISA conforming in the MP table, that means its trigger type must 1072 * EISA conforming in the MP table, that means its trigger type must
1072 * be read in from the ELCR */ 1073 * be read in from the ELCR */
1073 1074
1074#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) 1075#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
1075#define default_EISA_polarity(idx) default_ISA_polarity(idx) 1076#define default_EISA_polarity(idx) default_ISA_polarity(idx)
1076 1077
1077/* PCI interrupts are always polarity one level triggered, 1078/* PCI interrupts are always polarity one level triggered,
@@ -1088,13 +1089,13 @@ static int EISA_ELCR(unsigned int irq)
1088 1089
1089static int MPBIOS_polarity(int idx) 1090static int MPBIOS_polarity(int idx)
1090{ 1091{
1091 int bus = mp_irqs[idx].mp_srcbus; 1092 int bus = mp_irqs[idx].srcbus;
1092 int polarity; 1093 int polarity;
1093 1094
1094 /* 1095 /*
1095 * Determine IRQ line polarity (high active or low active): 1096 * Determine IRQ line polarity (high active or low active):
1096 */ 1097 */
1097 switch (mp_irqs[idx].mp_irqflag & 3) 1098 switch (mp_irqs[idx].irqflag & 3)
1098 { 1099 {
1099 case 0: /* conforms, ie. bus-type dependent polarity */ 1100 case 0: /* conforms, ie. bus-type dependent polarity */
1100 if (test_bit(bus, mp_bus_not_pci)) 1101 if (test_bit(bus, mp_bus_not_pci))
@@ -1130,13 +1131,13 @@ static int MPBIOS_polarity(int idx)
1130 1131
1131static int MPBIOS_trigger(int idx) 1132static int MPBIOS_trigger(int idx)
1132{ 1133{
1133 int bus = mp_irqs[idx].mp_srcbus; 1134 int bus = mp_irqs[idx].srcbus;
1134 int trigger; 1135 int trigger;
1135 1136
1136 /* 1137 /*
1137 * Determine IRQ trigger mode (edge or level sensitive): 1138 * Determine IRQ trigger mode (edge or level sensitive):
1138 */ 1139 */
1139 switch ((mp_irqs[idx].mp_irqflag>>2) & 3) 1140 switch ((mp_irqs[idx].irqflag>>2) & 3)
1140 { 1141 {
1141 case 0: /* conforms, ie. bus-type dependent */ 1142 case 0: /* conforms, ie. bus-type dependent */
1142 if (test_bit(bus, mp_bus_not_pci)) 1143 if (test_bit(bus, mp_bus_not_pci))
@@ -1214,16 +1215,16 @@ int (*ioapic_renumber_irq)(int ioapic, int irq);
1214static int pin_2_irq(int idx, int apic, int pin) 1215static int pin_2_irq(int idx, int apic, int pin)
1215{ 1216{
1216 int irq, i; 1217 int irq, i;
1217 int bus = mp_irqs[idx].mp_srcbus; 1218 int bus = mp_irqs[idx].srcbus;
1218 1219
1219 /* 1220 /*
1220 * Debugging check, we are in big trouble if this message pops up! 1221 * Debugging check, we are in big trouble if this message pops up!
1221 */ 1222 */
1222 if (mp_irqs[idx].mp_dstirq != pin) 1223 if (mp_irqs[idx].dstirq != pin)
1223 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 1224 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1224 1225
1225 if (test_bit(bus, mp_bus_not_pci)) { 1226 if (test_bit(bus, mp_bus_not_pci)) {
1226 irq = mp_irqs[idx].mp_srcbusirq; 1227 irq = mp_irqs[idx].srcbusirq;
1227 } else { 1228 } else {
1228 /* 1229 /*
1229 * PCI IRQs are mapped in order 1230 * PCI IRQs are mapped in order
@@ -1566,14 +1567,14 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de
1566 apic_printk(APIC_VERBOSE,KERN_DEBUG 1567 apic_printk(APIC_VERBOSE,KERN_DEBUG
1567 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1568 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1568 "IRQ %d Mode:%i Active:%i)\n", 1569 "IRQ %d Mode:%i Active:%i)\n",
1569 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, 1570 apic, mp_ioapics[apic].apicid, pin, cfg->vector,
1570 irq, trigger, polarity); 1571 irq, trigger, polarity);
1571 1572
1572 1573
1573 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, 1574 if (setup_ioapic_entry(mp_ioapics[apic].apicid, irq, &entry,
1574 dest, trigger, polarity, cfg->vector)) { 1575 dest, trigger, polarity, cfg->vector)) {
1575 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1576 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1576 mp_ioapics[apic].mp_apicid, pin); 1577 mp_ioapics[apic].apicid, pin);
1577 __clear_irq_vector(irq, cfg); 1578 __clear_irq_vector(irq, cfg);
1578 return; 1579 return;
1579 } 1580 }
@@ -1604,12 +1605,10 @@ static void __init setup_IO_APIC_irqs(void)
1604 notcon = 1; 1605 notcon = 1;
1605 apic_printk(APIC_VERBOSE, 1606 apic_printk(APIC_VERBOSE,
1606 KERN_DEBUG " %d-%d", 1607 KERN_DEBUG " %d-%d",
1607 mp_ioapics[apic].mp_apicid, 1608 mp_ioapics[apic].apicid, pin);
1608 pin);
1609 } else 1609 } else
1610 apic_printk(APIC_VERBOSE, " %d-%d", 1610 apic_printk(APIC_VERBOSE, " %d-%d",
1611 mp_ioapics[apic].mp_apicid, 1611 mp_ioapics[apic].apicid, pin);
1612 pin);
1613 continue; 1612 continue;
1614 } 1613 }
1615 if (notcon) { 1614 if (notcon) {
@@ -1699,7 +1698,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1699 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1698 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1700 for (i = 0; i < nr_ioapics; i++) 1699 for (i = 0; i < nr_ioapics; i++)
1701 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1700 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1702 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); 1701 mp_ioapics[i].apicid, nr_ioapic_registers[i]);
1703 1702
1704 /* 1703 /*
1705 * We are a bit conservative about what we expect. We have to 1704 * We are a bit conservative about what we expect. We have to
@@ -1719,7 +1718,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1719 spin_unlock_irqrestore(&ioapic_lock, flags); 1718 spin_unlock_irqrestore(&ioapic_lock, flags);
1720 1719
1721 printk("\n"); 1720 printk("\n");
1722 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); 1721 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
1723 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1722 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1724 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1723 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1725 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1724 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -2121,14 +2120,14 @@ static void __init setup_ioapic_ids_from_mpc(void)
2121 reg_00.raw = io_apic_read(apic, 0); 2120 reg_00.raw = io_apic_read(apic, 0);
2122 spin_unlock_irqrestore(&ioapic_lock, flags); 2121 spin_unlock_irqrestore(&ioapic_lock, flags);
2123 2122
2124 old_id = mp_ioapics[apic].mp_apicid; 2123 old_id = mp_ioapics[apic].apicid;
2125 2124
2126 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { 2125 if (mp_ioapics[apic].apicid >= get_physical_broadcast()) {
2127 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 2126 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
2128 apic, mp_ioapics[apic].mp_apicid); 2127 apic, mp_ioapics[apic].apicid);
2129 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 2128 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2130 reg_00.bits.ID); 2129 reg_00.bits.ID);
2131 mp_ioapics[apic].mp_apicid = reg_00.bits.ID; 2130 mp_ioapics[apic].apicid = reg_00.bits.ID;
2132 } 2131 }
2133 2132
2134 /* 2133 /*
@@ -2137,9 +2136,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
2137 * 'stuck on smp_invalidate_needed IPI wait' messages. 2136 * 'stuck on smp_invalidate_needed IPI wait' messages.
2138 */ 2137 */
2139 if (check_apicid_used(phys_id_present_map, 2138 if (check_apicid_used(phys_id_present_map,
2140 mp_ioapics[apic].mp_apicid)) { 2139 mp_ioapics[apic].apicid)) {
2141 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 2140 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2142 apic, mp_ioapics[apic].mp_apicid); 2141 apic, mp_ioapics[apic].apicid);
2143 for (i = 0; i < get_physical_broadcast(); i++) 2142 for (i = 0; i < get_physical_broadcast(); i++)
2144 if (!physid_isset(i, phys_id_present_map)) 2143 if (!physid_isset(i, phys_id_present_map))
2145 break; 2144 break;
@@ -2148,13 +2147,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
2148 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 2147 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2149 i); 2148 i);
2150 physid_set(i, phys_id_present_map); 2149 physid_set(i, phys_id_present_map);
2151 mp_ioapics[apic].mp_apicid = i; 2150 mp_ioapics[apic].apicid = i;
2152 } else { 2151 } else {
2153 physid_mask_t tmp; 2152 physid_mask_t tmp;
2154 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); 2153 tmp = apicid_to_cpu_present(mp_ioapics[apic].apicid);
2155 apic_printk(APIC_VERBOSE, "Setting %d in the " 2154 apic_printk(APIC_VERBOSE, "Setting %d in the "
2156 "phys_id_present_map\n", 2155 "phys_id_present_map\n",
2157 mp_ioapics[apic].mp_apicid); 2156 mp_ioapics[apic].apicid);
2158 physids_or(phys_id_present_map, phys_id_present_map, tmp); 2157 physids_or(phys_id_present_map, phys_id_present_map, tmp);
2159 } 2158 }
2160 2159
@@ -2163,11 +2162,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
2163 * We need to adjust the IRQ routing table 2162 * We need to adjust the IRQ routing table
2164 * if the ID changed. 2163 * if the ID changed.
2165 */ 2164 */
2166 if (old_id != mp_ioapics[apic].mp_apicid) 2165 if (old_id != mp_ioapics[apic].apicid)
2167 for (i = 0; i < mp_irq_entries; i++) 2166 for (i = 0; i < mp_irq_entries; i++)
2168 if (mp_irqs[i].mp_dstapic == old_id) 2167 if (mp_irqs[i].dstapic == old_id)
2169 mp_irqs[i].mp_dstapic 2168 mp_irqs[i].dstapic
2170 = mp_ioapics[apic].mp_apicid; 2169 = mp_ioapics[apic].apicid;
2171 2170
2172 /* 2171 /*
2173 * Read the right value from the MPC table and 2172 * Read the right value from the MPC table and
@@ -2175,9 +2174,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
2175 */ 2174 */
2176 apic_printk(APIC_VERBOSE, KERN_INFO 2175 apic_printk(APIC_VERBOSE, KERN_INFO
2177 "...changing IO-APIC physical APIC ID to %d ...", 2176 "...changing IO-APIC physical APIC ID to %d ...",
2178 mp_ioapics[apic].mp_apicid); 2177 mp_ioapics[apic].apicid);
2179 2178
2180 reg_00.bits.ID = mp_ioapics[apic].mp_apicid; 2179 reg_00.bits.ID = mp_ioapics[apic].apicid;
2181 spin_lock_irqsave(&ioapic_lock, flags); 2180 spin_lock_irqsave(&ioapic_lock, flags);
2182 io_apic_write(apic, 0, reg_00.raw); 2181 io_apic_write(apic, 0, reg_00.raw);
2183 spin_unlock_irqrestore(&ioapic_lock, flags); 2182 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2188,7 +2187,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
2188 spin_lock_irqsave(&ioapic_lock, flags); 2187 spin_lock_irqsave(&ioapic_lock, flags);
2189 reg_00.raw = io_apic_read(apic, 0); 2188 reg_00.raw = io_apic_read(apic, 0);
2190 spin_unlock_irqrestore(&ioapic_lock, flags); 2189 spin_unlock_irqrestore(&ioapic_lock, flags);
2191 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) 2190 if (reg_00.bits.ID != mp_ioapics[apic].apicid)
2192 printk("could not set ID!\n"); 2191 printk("could not set ID!\n");
2193 else 2192 else
2194 apic_printk(APIC_VERBOSE, " ok.\n"); 2193 apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2383,7 +2382,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2383 if (cfg->move_in_progress) 2382 if (cfg->move_in_progress)
2384 send_cleanup_vector(cfg); 2383 send_cleanup_vector(cfg);
2385 2384
2386 cpumask_copy(&desc->affinity, mask); 2385 cpumask_copy(desc->affinity, mask);
2387} 2386}
2388 2387
2389static int migrate_irq_remapped_level_desc(struct irq_desc *desc) 2388static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
@@ -2405,11 +2404,11 @@ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2405 } 2404 }
2406 2405
2407 /* everthing is clear. we have right of way */ 2406 /* everthing is clear. we have right of way */
2408 migrate_ioapic_irq_desc(desc, &desc->pending_mask); 2407 migrate_ioapic_irq_desc(desc, desc->pending_mask);
2409 2408
2410 ret = 0; 2409 ret = 0;
2411 desc->status &= ~IRQ_MOVE_PENDING; 2410 desc->status &= ~IRQ_MOVE_PENDING;
2412 cpumask_clear(&desc->pending_mask); 2411 cpumask_clear(desc->pending_mask);
2413 2412
2414unmask: 2413unmask:
2415 unmask_IO_APIC_irq_desc(desc); 2414 unmask_IO_APIC_irq_desc(desc);
@@ -2434,7 +2433,7 @@ static void ir_irq_migration(struct work_struct *work)
2434 continue; 2433 continue;
2435 } 2434 }
2436 2435
2437 desc->chip->set_affinity(irq, &desc->pending_mask); 2436 desc->chip->set_affinity(irq, desc->pending_mask);
2438 spin_unlock_irqrestore(&desc->lock, flags); 2437 spin_unlock_irqrestore(&desc->lock, flags);
2439 } 2438 }
2440 } 2439 }
@@ -2448,7 +2447,7 @@ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2448{ 2447{
2449 if (desc->status & IRQ_LEVEL) { 2448 if (desc->status & IRQ_LEVEL) {
2450 desc->status |= IRQ_MOVE_PENDING; 2449 desc->status |= IRQ_MOVE_PENDING;
2451 cpumask_copy(&desc->pending_mask, mask); 2450 cpumask_copy(desc->pending_mask, mask);
2452 migrate_irq_remapped_level_desc(desc); 2451 migrate_irq_remapped_level_desc(desc);
2453 return; 2452 return;
2454 } 2453 }
@@ -2516,7 +2515,7 @@ static void irq_complete_move(struct irq_desc **descp)
2516 2515
2517 /* domain has not changed, but affinity did */ 2516 /* domain has not changed, but affinity did */
2518 me = smp_processor_id(); 2517 me = smp_processor_id();
2519 if (cpu_isset(me, desc->affinity)) { 2518 if (cpumask_test_cpu(me, desc->affinity)) {
2520 *descp = desc = move_irq_desc(desc, me); 2519 *descp = desc = move_irq_desc(desc, me);
2521 /* get the new one */ 2520 /* get the new one */
2522 cfg = desc->chip_data; 2521 cfg = desc->chip_data;
@@ -3118,8 +3117,8 @@ static int ioapic_resume(struct sys_device *dev)
3118 3117
3119 spin_lock_irqsave(&ioapic_lock, flags); 3118 spin_lock_irqsave(&ioapic_lock, flags);
3120 reg_00.raw = io_apic_read(dev->id, 0); 3119 reg_00.raw = io_apic_read(dev->id, 0);
3121 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { 3120 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
3122 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; 3121 reg_00.bits.ID = mp_ioapics[dev->id].apicid;
3123 io_apic_write(dev->id, 0, reg_00.raw); 3122 io_apic_write(dev->id, 0, reg_00.raw);
3124 } 3123 }
3125 spin_unlock_irqrestore(&ioapic_lock, flags); 3124 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -3184,7 +3183,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
3184 3183
3185 irq = 0; 3184 irq = 0;
3186 spin_lock_irqsave(&vector_lock, flags); 3185 spin_lock_irqsave(&vector_lock, flags);
3187 for (new = irq_want; new < NR_IRQS; new++) { 3186 for (new = irq_want; new < nr_irqs; new++) {
3188 if (platform_legacy_irq(new)) 3187 if (platform_legacy_irq(new))
3189 continue; 3188 continue;
3190 3189
@@ -3259,6 +3258,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3259 int err; 3258 int err;
3260 unsigned dest; 3259 unsigned dest;
3261 3260
3261 if (disable_apic)
3262 return -ENXIO;
3263
3262 cfg = irq_cfg(irq); 3264 cfg = irq_cfg(irq);
3263 err = assign_irq_vector(irq, cfg, TARGET_CPUS); 3265 err = assign_irq_vector(irq, cfg, TARGET_CPUS);
3264 if (err) 3266 if (err)
@@ -3727,6 +3729,9 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3727 struct irq_cfg *cfg; 3729 struct irq_cfg *cfg;
3728 int err; 3730 int err;
3729 3731
3732 if (disable_apic)
3733 return -ENXIO;
3734
3730 cfg = irq_cfg(irq); 3735 cfg = irq_cfg(irq);
3731 err = assign_irq_vector(irq, cfg, TARGET_CPUS); 3736 err = assign_irq_vector(irq, cfg, TARGET_CPUS);
3732 if (!err) { 3737 if (!err) {
@@ -3761,7 +3766,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3761} 3766}
3762#endif /* CONFIG_HT_IRQ */ 3767#endif /* CONFIG_HT_IRQ */
3763 3768
3764#ifdef CONFIG_X86_64 3769#ifdef CONFIG_X86_UV
3765/* 3770/*
3766 * Re-target the irq to the specified CPU and enable the specified MMR located 3771 * Re-target the irq to the specified CPU and enable the specified MMR located
3767 * on the specified blade to allow the sending of MSIs to the specified CPU. 3772 * on the specified blade to allow the sending of MSIs to the specified CPU.
@@ -3851,6 +3856,22 @@ void __init probe_nr_irqs_gsi(void)
3851 nr_irqs_gsi = nr; 3856 nr_irqs_gsi = nr;
3852} 3857}
3853 3858
3859#ifdef CONFIG_SPARSE_IRQ
3860int __init arch_probe_nr_irqs(void)
3861{
3862 int nr;
3863
3864 nr = ((8 * nr_cpu_ids) > (32 * nr_ioapics) ?
3865 (NR_VECTORS + (8 * nr_cpu_ids)) :
3866 (NR_VECTORS + (32 * nr_ioapics)));
3867
3868 if (nr < nr_irqs && nr > nr_irqs_gsi)
3869 nr_irqs = nr;
3870
3871 return 0;
3872}
3873#endif
3874
3854/* -------------------------------------------------------------------------- 3875/* --------------------------------------------------------------------------
3855 ACPI-based IOAPIC Configuration 3876 ACPI-based IOAPIC Configuration
3856 -------------------------------------------------------------------------- */ 3877 -------------------------------------------------------------------------- */
@@ -3985,8 +4006,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
3985 return -1; 4006 return -1;
3986 4007
3987 for (i = 0; i < mp_irq_entries; i++) 4008 for (i = 0; i < mp_irq_entries; i++)
3988 if (mp_irqs[i].mp_irqtype == mp_INT && 4009 if (mp_irqs[i].irqtype == mp_INT &&
3989 mp_irqs[i].mp_srcbusirq == bus_irq) 4010 mp_irqs[i].srcbusirq == bus_irq)
3990 break; 4011 break;
3991 if (i >= mp_irq_entries) 4012 if (i >= mp_irq_entries)
3992 return -1; 4013 return -1;
@@ -4040,7 +4061,7 @@ void __init setup_ioapic_dest(void)
4040 */ 4061 */
4041 if (desc->status & 4062 if (desc->status &
4042 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 4063 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4043 mask = &desc->affinity; 4064 mask = desc->affinity;
4044 else 4065 else
4045 mask = TARGET_CPUS; 4066 mask = TARGET_CPUS;
4046 4067
@@ -4101,7 +4122,7 @@ void __init ioapic_init_mappings(void)
4101 ioapic_res = ioapic_setup_resources(); 4122 ioapic_res = ioapic_setup_resources();
4102 for (i = 0; i < nr_ioapics; i++) { 4123 for (i = 0; i < nr_ioapics; i++) {
4103 if (smp_found_config) { 4124 if (smp_found_config) {
4104 ioapic_phys = mp_ioapics[i].mp_apicaddr; 4125 ioapic_phys = mp_ioapics[i].apicaddr;
4105#ifdef CONFIG_X86_32 4126#ifdef CONFIG_X86_32
4106 if (!ioapic_phys) { 4127 if (!ioapic_phys) {
4107 printk(KERN_ERR 4128 printk(KERN_ERR
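The arch_probe_nr_irqs() hunk above sizes the sparse-IRQ space from the larger of two per-resource estimates and only ever shrinks the default, never below the GSI range. Below is a stand-alone sketch of that calculation; the arithmetic is copied from the hunk, NR_VECTORS is the usual x86 value, and the sample inputs in main() are made up purely for illustration.

    /* Stand-alone model of the nr_irqs estimate from arch_probe_nr_irqs().
     * The input values below are illustrative, not from real hardware. */
    #include <stdio.h>

    #define NR_VECTORS 256          /* x86 interrupt vector space */

    static int probe_nr_irqs(int nr_cpu_ids, int nr_ioapics,
                             int nr_irqs, int nr_irqs_gsi)
    {
        int nr = (8 * nr_cpu_ids > 32 * nr_ioapics)
                     ? NR_VECTORS + 8 * nr_cpu_ids
                     : NR_VECTORS + 32 * nr_ioapics;

        /* Only shrink the default, and never below the GSI range. */
        if (nr < nr_irqs && nr > nr_irqs_gsi)
            nr_irqs = nr;
        return nr_irqs;
    }

    int main(void)
    {
        /* e.g. 16 CPUs, 2 IO-APICs, generous defaults */
        printf("nr_irqs = %d\n", probe_nr_irqs(16, 2, 1024, 64));
        return 0;
    }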
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3973e2df7f87..8b30d0c2512c 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -36,11 +36,7 @@ void ack_bad_irq(unsigned int irq)
36#endif 36#endif
37} 37}
38 38
39#ifdef CONFIG_X86_32 39#define irq_stats(x) (&per_cpu(irq_stat, x))
40# define irq_stats(x) (&per_cpu(irq_stat, x))
41#else
42# define irq_stats(x) cpu_pda(x)
43#endif
44/* 40/*
45 * /proc/interrupts printing: 41 * /proc/interrupts printing:
46 */ 42 */
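With the 64-bit PDA gone, irq_stats() reduces to a single per_cpu() lookup on both 32- and 64-bit. As a rough user-space analogy (not kernel code), per-CPU interrupt statistics can be modelled as an array of structs indexed by CPU id; the field names loosely follow irq_cpustat_t but the layout here is purely illustrative.

    #include <stdio.h>

    #define NR_CPUS 4

    struct irq_cpustat {
        unsigned int __nmi_count;
        unsigned int apic_timer_irqs;
        unsigned int irq0_irqs;
    };

    static struct irq_cpustat irq_stat[NR_CPUS];

    /* Same shape as the kernel macro: one accessor works for every CPU. */
    #define irq_stats(cpu) (&irq_stat[cpu])

    int main(void)
    {
        irq_stats(1)->apic_timer_irqs++;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            printf("cpu%d: timer=%u nmi=%u\n", cpu,
                   irq_stats(cpu)->apic_timer_irqs,
                   irq_stats(cpu)->__nmi_count);
        return 0;
    }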
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 74b9ff7341e9..e0f29be8ab0b 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -248,7 +248,7 @@ void fixup_irqs(void)
248 if (irq == 2) 248 if (irq == 2)
249 continue; 249 continue;
250 250
251 affinity = &desc->affinity; 251 affinity = desc->affinity;
252 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { 252 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
253 printk("Breaking affinity for irq %i\n", irq); 253 printk("Breaking affinity for irq %i\n", irq);
254 affinity = cpu_all_mask; 254 affinity = cpu_all_mask;
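fixup_irqs() now takes desc->affinity as a pointer rather than the address of an embedded mask; the test itself is unchanged: if the affinity mask no longer intersects the online mask, the IRQ falls back to all CPUs. A minimal bitmask model of that test, using plain unsigned long masks instead of struct cpumask:

    #include <stdio.h>

    /* Mirrors cpumask_any_and(affinity, cpu_online_mask) < nr_cpu_ids:
     * returns 1 when the affinity mask still contains an online CPU. */
    static int affinity_still_valid(unsigned long affinity, unsigned long online)
    {
        return (affinity & online) != 0;
    }

    int main(void)
    {
        unsigned long online   = 0x3;   /* CPUs 0 and 1 online */
        unsigned long affinity = 0x4;   /* IRQ bound to CPU 2 only */

        if (!affinity_still_valid(affinity, online)) {
            printf("breaking affinity, falling back to all CPUs\n");
            affinity = ~0UL;            /* cpu_all_mask analogue */
        }
        printf("affinity now %#lx\n", affinity);
        return 0;
    }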
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 63c88e6ec025..018963aa6ee3 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,6 +18,13 @@
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <asm/io_apic.h> 19#include <asm/io_apic.h>
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/apic.h>
22
23DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
24EXPORT_PER_CPU_SYMBOL(irq_stat);
25
26DEFINE_PER_CPU(struct pt_regs *, irq_regs);
27EXPORT_PER_CPU_SYMBOL(irq_regs);
21 28
22/* 29/*
23 * Probabilistic stack overflow check: 30 * Probabilistic stack overflow check:
@@ -100,7 +107,7 @@ void fixup_irqs(void)
100 /* interrupt's are disabled at this point */ 107 /* interrupt's are disabled at this point */
101 spin_lock(&desc->lock); 108 spin_lock(&desc->lock);
102 109
103 affinity = &desc->affinity; 110 affinity = desc->affinity;
104 if (!irq_has_action(irq) || 111 if (!irq_has_action(irq) ||
105 cpumask_equal(affinity, cpu_online_mask)) { 112 cpumask_equal(affinity, cpu_online_mask)) {
106 spin_unlock(&desc->lock); 113 spin_unlock(&desc->lock);
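On 64-bit, irq_stat and irq_regs stop living in the PDA and become ordinary per-CPU variables; the _SHARED_ALIGNED variant pads each instance out to a cache line so that counters belonging to different CPUs never share a line. A stand-alone illustration of the alignment idea follows; the 64-byte line size is an assumption for the example, not something taken from this patch.

    #include <stdio.h>
    #include <stddef.h>

    #define CACHELINE 64    /* assumed line size, for illustration only */

    struct counters {
        unsigned long nmi_count;
        unsigned long timer_irqs;
    } __attribute__((aligned(CACHELINE)));

    static struct counters percpu_counters[4];   /* one slot per CPU */

    int main(void)
    {
        printf("sizeof(struct counters) = %zu\n", sizeof(struct counters));
        printf("slot stride             = %zu\n",
               (size_t)((char *)&percpu_counters[1] -
                        (char *)&percpu_counters[0]));
        return 0;
    }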
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 10a09c2f1828..22608ebf831b 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -140,8 +140,15 @@ void __init native_init_IRQ(void)
140 */ 140 */
141 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 141 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
142 142
143 /* IPI for invalidation */ 143 /* IPIs for invalidation */
144 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); 144 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
145 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
146 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
147 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
148 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
149 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
150 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
151 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
145 152
146 /* IPI for generic function call */ 153 /* IPI for generic function call */
147 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 154 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
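The irqinit_32.c hunk replaces the single TLB-invalidate IPI with eight vectors starting at INVALIDATE_TLB_VECTOR_START, matching what 64-bit already does; as I read the surrounding code, the point is that each vector selects its own flush-state slot, so several flush requests can be in flight without contending on one slot. The sketch below models only that vector-to-slot mapping; the vector base value is illustrative.

    #include <stdio.h>

    #define INVALIDATE_TLB_VECTOR_START 0xf0   /* illustrative value only */
    #define NUM_INVALIDATE_TLB_VECTORS  8

    struct flush_state {
        unsigned long pending_mask;    /* CPUs still to acknowledge */
    };

    static struct flush_state flush_slots[NUM_INVALIDATE_TLB_VECTORS];

    /* A handler for vector v works on slot (v - START); eight vectors mean
     * eight independent slots and up to eight concurrent flushes. */
    static struct flush_state *slot_for_vector(int vector)
    {
        return &flush_slots[vector - INVALIDATE_TLB_VECTOR_START];
    }

    int main(void)
    {
        int vector = INVALIDATE_TLB_VECTOR_START + 3;

        slot_for_vector(vector)->pending_mask = 0x5;   /* CPUs 0 and 2 */
        printf("vector %#x uses slot %d, pending %#lx\n", vector,
               vector - INVALIDATE_TLB_VECTOR_START,
               slot_for_vector(vector)->pending_mask);
        return 0;
    }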
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index b7f4c929e615..5e9f4fc51385 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -87,9 +87,9 @@
87#include <linux/cpu.h> 87#include <linux/cpu.h>
88#include <linux/firmware.h> 88#include <linux/firmware.h>
89#include <linux/platform_device.h> 89#include <linux/platform_device.h>
90#include <linux/uaccess.h>
90 91
91#include <asm/msr.h> 92#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h> 93#include <asm/processor.h>
94#include <asm/microcode.h> 94#include <asm/microcode.h>
95 95
@@ -196,7 +196,7 @@ static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
196 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; 196 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
197} 197}
198 198
199static inline int 199static inline int
200update_match_revision(struct microcode_header_intel *mc_header, int rev) 200update_match_revision(struct microcode_header_intel *mc_header, int rev)
201{ 201{
202 return (mc_header->rev <= rev) ? 0 : 1; 202 return (mc_header->rev <= rev) ? 0 : 1;
@@ -442,8 +442,8 @@ static int request_microcode_fw(int cpu, struct device *device)
442 return ret; 442 return ret;
443 } 443 }
444 444
445 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, 445 ret = generic_load_microcode(cpu, (void *)firmware->data,
446 &get_ucode_fw); 446 firmware->size, &get_ucode_fw);
447 447
448 release_firmware(firmware); 448 release_firmware(firmware);
449 449
@@ -460,7 +460,7 @@ static int request_microcode_user(int cpu, const void __user *buf, size_t size)
460 /* We should bind the task to the CPU */ 460 /* We should bind the task to the CPU */
461 BUG_ON(cpu != raw_smp_processor_id()); 461 BUG_ON(cpu != raw_smp_processor_id());
462 462
463 return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); 463 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
464} 464}
465 465
466static void microcode_fini_cpu(int cpu) 466static void microcode_fini_cpu(int cpu)
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
index 3db0a5442eb1..0edd819050e7 100644
--- a/arch/x86/kernel/module_32.c
+++ b/arch/x86/kernel/module_32.c
@@ -42,7 +42,7 @@ void module_free(struct module *mod, void *module_region)
42{ 42{
43 vfree(module_region); 43 vfree(module_region);
44 /* FIXME: If module_region == mod->init_region, trim exception 44 /* FIXME: If module_region == mod->init_region, trim exception
45 table entries. */ 45 table entries. */
46} 46}
47 47
48/* We don't need anything special. */ 48/* We don't need anything special. */
@@ -113,13 +113,13 @@ int module_finalize(const Elf_Ehdr *hdr,
113 *para = NULL; 113 *para = NULL;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115 115
116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
117 if (!strcmp(".text", secstrings + s->sh_name)) 117 if (!strcmp(".text", secstrings + s->sh_name))
118 text = s; 118 text = s;
119 if (!strcmp(".altinstructions", secstrings + s->sh_name)) 119 if (!strcmp(".altinstructions", secstrings + s->sh_name))
120 alt = s; 120 alt = s;
121 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
122 locks= s; 122 locks = s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name)) 123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s; 124 para = s;
125 } 125 }
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index 6ba87830d4b1..c23880b90b5c 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -30,14 +30,14 @@
30#include <asm/page.h> 30#include <asm/page.h>
31#include <asm/pgtable.h> 31#include <asm/pgtable.h>
32 32
33#define DEBUGP(fmt...) 33#define DEBUGP(fmt...)
34 34
35#ifndef CONFIG_UML 35#ifndef CONFIG_UML
36void module_free(struct module *mod, void *module_region) 36void module_free(struct module *mod, void *module_region)
37{ 37{
38 vfree(module_region); 38 vfree(module_region);
39 /* FIXME: If module_region == mod->init_region, trim exception 39 /* FIXME: If module_region == mod->init_region, trim exception
40 table entries. */ 40 table entries. */
41} 41}
42 42
43void *module_alloc(unsigned long size) 43void *module_alloc(unsigned long size)
@@ -77,7 +77,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
77 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; 77 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
78 Elf64_Sym *sym; 78 Elf64_Sym *sym;
79 void *loc; 79 void *loc;
80 u64 val; 80 u64 val;
81 81
82 DEBUGP("Applying relocate section %u to %u\n", relsec, 82 DEBUGP("Applying relocate section %u to %u\n", relsec,
83 sechdrs[relsec].sh_info); 83 sechdrs[relsec].sh_info);
@@ -91,11 +91,11 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
91 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr 91 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
92 + ELF64_R_SYM(rel[i].r_info); 92 + ELF64_R_SYM(rel[i].r_info);
93 93
94 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", 94 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
95 (int)ELF64_R_TYPE(rel[i].r_info), 95 (int)ELF64_R_TYPE(rel[i].r_info),
96 sym->st_value, rel[i].r_addend, (u64)loc); 96 sym->st_value, rel[i].r_addend, (u64)loc);
97 97
98 val = sym->st_value + rel[i].r_addend; 98 val = sym->st_value + rel[i].r_addend;
99 99
100 switch (ELF64_R_TYPE(rel[i].r_info)) { 100 switch (ELF64_R_TYPE(rel[i].r_info)) {
101 case R_X86_64_NONE: 101 case R_X86_64_NONE:
@@ -113,16 +113,16 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
113 if ((s64)val != *(s32 *)loc) 113 if ((s64)val != *(s32 *)loc)
114 goto overflow; 114 goto overflow;
115 break; 115 break;
116 case R_X86_64_PC32: 116 case R_X86_64_PC32:
117 val -= (u64)loc; 117 val -= (u64)loc;
118 *(u32 *)loc = val; 118 *(u32 *)loc = val;
119#if 0 119#if 0
120 if ((s64)val != *(s32 *)loc) 120 if ((s64)val != *(s32 *)loc)
121 goto overflow; 121 goto overflow;
122#endif 122#endif
123 break; 123 break;
124 default: 124 default:
125 printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", 125 printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
126 me->name, ELF64_R_TYPE(rel[i].r_info)); 126 me->name, ELF64_R_TYPE(rel[i].r_info));
127 return -ENOEXEC; 127 return -ENOEXEC;
128 } 128 }
@@ -130,7 +130,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
130 return 0; 130 return 0;
131 131
132overflow: 132overflow:
133 printk(KERN_ERR "overflow in relocation type %d val %Lx\n", 133 printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
134 (int)ELF64_R_TYPE(rel[i].r_info), val); 134 (int)ELF64_R_TYPE(rel[i].r_info), val);
135 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", 135 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
136 me->name); 136 me->name);
@@ -143,13 +143,13 @@ int apply_relocate(Elf_Shdr *sechdrs,
143 unsigned int relsec, 143 unsigned int relsec,
144 struct module *me) 144 struct module *me)
145{ 145{
146 printk("non add relocation not supported\n"); 146 printk(KERN_ERR "non add relocation not supported\n");
147 return -ENOSYS; 147 return -ENOSYS;
148} 148}
149 149
150int module_finalize(const Elf_Ehdr *hdr, 150int module_finalize(const Elf_Ehdr *hdr,
151 const Elf_Shdr *sechdrs, 151 const Elf_Shdr *sechdrs,
152 struct module *me) 152 struct module *me)
153{ 153{
154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, 154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
155 *para = NULL; 155 *para = NULL;
@@ -161,7 +161,7 @@ int module_finalize(const Elf_Ehdr *hdr,
161 if (!strcmp(".altinstructions", secstrings + s->sh_name)) 161 if (!strcmp(".altinstructions", secstrings + s->sh_name))
162 alt = s; 162 alt = s;
163 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 163 if (!strcmp(".smp_locks", secstrings + s->sh_name))
164 locks= s; 164 locks = s;
165 if (!strcmp(".parainstructions", secstrings + s->sh_name)) 165 if (!strcmp(".parainstructions", secstrings + s->sh_name))
166 para = s; 166 para = s;
167 } 167 }
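One of the module_64.c cleanups swaps the nonstandard %Lu for %llu when printing a 64-bit relocation type. The same rule applies outside the kernel: a u64 is not necessarily unsigned long long, so either cast to the type the format expects or use the <inttypes.h> macros. A minimal sketch:

    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    int main(void)
    {
        uint64_t rel_type = 11;   /* e.g. an ELF relocation type */

        /* Portable: cast to the type that %llu actually expects. */
        printf("relocation type %llu\n", (unsigned long long)rel_type);

        /* Equally portable, no cast needed. */
        printf("relocation type %" PRIu64 "\n", rel_type);
        return 0;
    }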
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index a649a4ccad43..fa6bb263892e 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -144,11 +144,11 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
144 if (bad_ioapic(m->apicaddr)) 144 if (bad_ioapic(m->apicaddr))
145 return; 145 return;
146 146
147 mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr; 147 mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
148 mp_ioapics[nr_ioapics].mp_apicid = m->apicid; 148 mp_ioapics[nr_ioapics].apicid = m->apicid;
149 mp_ioapics[nr_ioapics].mp_type = m->type; 149 mp_ioapics[nr_ioapics].type = m->type;
150 mp_ioapics[nr_ioapics].mp_apicver = m->apicver; 150 mp_ioapics[nr_ioapics].apicver = m->apicver;
151 mp_ioapics[nr_ioapics].mp_flags = m->flags; 151 mp_ioapics[nr_ioapics].flags = m->flags;
152 nr_ioapics++; 152 nr_ioapics++;
153} 153}
154 154
@@ -160,55 +160,55 @@ static void print_MP_intsrc_info(struct mpc_intsrc *m)
160 m->srcbusirq, m->dstapic, m->dstirq); 160 m->srcbusirq, m->dstapic, m->dstirq);
161} 161}
162 162
163static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) 163static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
164{ 164{
165 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," 165 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
166 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 166 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
167 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, 167 mp_irq->irqtype, mp_irq->irqflag & 3,
168 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, 168 (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
169 mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); 169 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
170} 170}
171 171
172static void __init assign_to_mp_irq(struct mpc_intsrc *m, 172static void __init assign_to_mp_irq(struct mpc_intsrc *m,
173 struct mp_config_intsrc *mp_irq) 173 struct mpc_intsrc *mp_irq)
174{ 174{
175 mp_irq->mp_dstapic = m->dstapic; 175 mp_irq->dstapic = m->dstapic;
176 mp_irq->mp_type = m->type; 176 mp_irq->type = m->type;
177 mp_irq->mp_irqtype = m->irqtype; 177 mp_irq->irqtype = m->irqtype;
178 mp_irq->mp_irqflag = m->irqflag; 178 mp_irq->irqflag = m->irqflag;
179 mp_irq->mp_srcbus = m->srcbus; 179 mp_irq->srcbus = m->srcbus;
180 mp_irq->mp_srcbusirq = m->srcbusirq; 180 mp_irq->srcbusirq = m->srcbusirq;
181 mp_irq->mp_dstirq = m->dstirq; 181 mp_irq->dstirq = m->dstirq;
182} 182}
183 183
184static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, 184static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
185 struct mpc_intsrc *m) 185 struct mpc_intsrc *m)
186{ 186{
187 m->dstapic = mp_irq->mp_dstapic; 187 m->dstapic = mp_irq->dstapic;
188 m->type = mp_irq->mp_type; 188 m->type = mp_irq->type;
189 m->irqtype = mp_irq->mp_irqtype; 189 m->irqtype = mp_irq->irqtype;
190 m->irqflag = mp_irq->mp_irqflag; 190 m->irqflag = mp_irq->irqflag;
191 m->srcbus = mp_irq->mp_srcbus; 191 m->srcbus = mp_irq->srcbus;
192 m->srcbusirq = mp_irq->mp_srcbusirq; 192 m->srcbusirq = mp_irq->srcbusirq;
193 m->dstirq = mp_irq->mp_dstirq; 193 m->dstirq = mp_irq->dstirq;
194} 194}
195 195
196static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, 196static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
197 struct mpc_intsrc *m) 197 struct mpc_intsrc *m)
198{ 198{
199 if (mp_irq->mp_dstapic != m->dstapic) 199 if (mp_irq->dstapic != m->dstapic)
200 return 1; 200 return 1;
201 if (mp_irq->mp_type != m->type) 201 if (mp_irq->type != m->type)
202 return 2; 202 return 2;
203 if (mp_irq->mp_irqtype != m->irqtype) 203 if (mp_irq->irqtype != m->irqtype)
204 return 3; 204 return 3;
205 if (mp_irq->mp_irqflag != m->irqflag) 205 if (mp_irq->irqflag != m->irqflag)
206 return 4; 206 return 4;
207 if (mp_irq->mp_srcbus != m->srcbus) 207 if (mp_irq->srcbus != m->srcbus)
208 return 5; 208 return 5;
209 if (mp_irq->mp_srcbusirq != m->srcbusirq) 209 if (mp_irq->srcbusirq != m->srcbusirq)
210 return 6; 210 return 6;
211 if (mp_irq->mp_dstirq != m->dstirq) 211 if (mp_irq->dstirq != m->dstirq)
212 return 7; 212 return 7;
213 213
214 return 0; 214 return 0;
@@ -417,7 +417,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
417 intsrc.type = MP_INTSRC; 417 intsrc.type = MP_INTSRC;
418 intsrc.irqflag = 0; /* conforming */ 418 intsrc.irqflag = 0; /* conforming */
419 intsrc.srcbus = 0; 419 intsrc.srcbus = 0;
420 intsrc.dstapic = mp_ioapics[0].mp_apicid; 420 intsrc.dstapic = mp_ioapics[0].apicid;
421 421
422 intsrc.irqtype = mp_INT; 422 intsrc.irqtype = mp_INT;
423 423
@@ -570,14 +570,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
570 } 570 }
571} 571}
572 572
573static struct intel_mp_floating *mpf_found; 573static struct mpf_intel *mpf_found;
574 574
575/* 575/*
576 * Scan the memory blocks for an SMP configuration block. 576 * Scan the memory blocks for an SMP configuration block.
577 */ 577 */
578static void __init __get_smp_config(unsigned int early) 578static void __init __get_smp_config(unsigned int early)
579{ 579{
580 struct intel_mp_floating *mpf = mpf_found; 580 struct mpf_intel *mpf = mpf_found;
581 581
582 if (!mpf) 582 if (!mpf)
583 return; 583 return;
@@ -598,9 +598,9 @@ static void __init __get_smp_config(unsigned int early)
598 } 598 }
599 599
600 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 600 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
601 mpf->mpf_specification); 601 mpf->specification);
602#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 602#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
603 if (mpf->mpf_feature2 & (1 << 7)) { 603 if (mpf->feature2 & (1 << 7)) {
604 printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); 604 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
605 pic_mode = 1; 605 pic_mode = 1;
606 } else { 606 } else {
@@ -611,7 +611,7 @@ static void __init __get_smp_config(unsigned int early)
611 /* 611 /*
612 * Now see if we need to read further. 612 * Now see if we need to read further.
613 */ 613 */
614 if (mpf->mpf_feature1 != 0) { 614 if (mpf->feature1 != 0) {
615 if (early) { 615 if (early) {
616 /* 616 /*
617 * local APIC has default address 617 * local APIC has default address
@@ -621,16 +621,16 @@ static void __init __get_smp_config(unsigned int early)
621 } 621 }
622 622
623 printk(KERN_INFO "Default MP configuration #%d\n", 623 printk(KERN_INFO "Default MP configuration #%d\n",
624 mpf->mpf_feature1); 624 mpf->feature1);
625 construct_default_ISA_mptable(mpf->mpf_feature1); 625 construct_default_ISA_mptable(mpf->feature1);
626 626
627 } else if (mpf->mpf_physptr) { 627 } else if (mpf->physptr) {
628 628
629 /* 629 /*
630 * Read the physical hardware table. Anything here will 630 * Read the physical hardware table. Anything here will
631 * override the defaults. 631 * override the defaults.
632 */ 632 */
633 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { 633 if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
634#ifdef CONFIG_X86_LOCAL_APIC 634#ifdef CONFIG_X86_LOCAL_APIC
635 smp_found_config = 0; 635 smp_found_config = 0;
636#endif 636#endif
@@ -688,19 +688,19 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
688 unsigned reserve) 688 unsigned reserve)
689{ 689{
690 unsigned int *bp = phys_to_virt(base); 690 unsigned int *bp = phys_to_virt(base);
691 struct intel_mp_floating *mpf; 691 struct mpf_intel *mpf;
692 692
693 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 693 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
694 bp, length); 694 bp, length);
695 BUILD_BUG_ON(sizeof(*mpf) != 16); 695 BUILD_BUG_ON(sizeof(*mpf) != 16);
696 696
697 while (length > 0) { 697 while (length > 0) {
698 mpf = (struct intel_mp_floating *)bp; 698 mpf = (struct mpf_intel *)bp;
699 if ((*bp == SMP_MAGIC_IDENT) && 699 if ((*bp == SMP_MAGIC_IDENT) &&
700 (mpf->mpf_length == 1) && 700 (mpf->length == 1) &&
701 !mpf_checksum((unsigned char *)bp, 16) && 701 !mpf_checksum((unsigned char *)bp, 16) &&
702 ((mpf->mpf_specification == 1) 702 ((mpf->specification == 1)
703 || (mpf->mpf_specification == 4))) { 703 || (mpf->specification == 4))) {
704#ifdef CONFIG_X86_LOCAL_APIC 704#ifdef CONFIG_X86_LOCAL_APIC
705 smp_found_config = 1; 705 smp_found_config = 1;
706#endif 706#endif
@@ -713,7 +713,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
713 return 1; 713 return 1;
714 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, 714 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
715 BOOTMEM_DEFAULT); 715 BOOTMEM_DEFAULT);
716 if (mpf->mpf_physptr) { 716 if (mpf->physptr) {
717 unsigned long size = PAGE_SIZE; 717 unsigned long size = PAGE_SIZE;
718#ifdef CONFIG_X86_32 718#ifdef CONFIG_X86_32
719 /* 719 /*
@@ -722,14 +722,14 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
722 * the bottom is mapped now. 722 * the bottom is mapped now.
723 * PC-9800's MPC table places on the very last 723 * PC-9800's MPC table places on the very last
724 * of physical memory; so that simply reserving 724 * of physical memory; so that simply reserving
725 * PAGE_SIZE from mpg->mpf_physptr yields BUG() 725 * PAGE_SIZE from mpf->physptr yields BUG()
726 * in reserve_bootmem. 726 * in reserve_bootmem.
727 */ 727 */
728 unsigned long end = max_low_pfn * PAGE_SIZE; 728 unsigned long end = max_low_pfn * PAGE_SIZE;
729 if (mpf->mpf_physptr + size > end) 729 if (mpf->physptr + size > end)
730 size = end - mpf->mpf_physptr; 730 size = end - mpf->physptr;
731#endif 731#endif
732 reserve_bootmem_generic(mpf->mpf_physptr, size, 732 reserve_bootmem_generic(mpf->physptr, size,
733 BOOTMEM_DEFAULT); 733 BOOTMEM_DEFAULT);
734 } 734 }
735 735
@@ -809,15 +809,15 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
809 /* not legacy */ 809 /* not legacy */
810 810
811 for (i = 0; i < mp_irq_entries; i++) { 811 for (i = 0; i < mp_irq_entries; i++) {
812 if (mp_irqs[i].mp_irqtype != mp_INT) 812 if (mp_irqs[i].irqtype != mp_INT)
813 continue; 813 continue;
814 814
815 if (mp_irqs[i].mp_irqflag != 0x0f) 815 if (mp_irqs[i].irqflag != 0x0f)
816 continue; 816 continue;
817 817
818 if (mp_irqs[i].mp_srcbus != m->srcbus) 818 if (mp_irqs[i].srcbus != m->srcbus)
819 continue; 819 continue;
820 if (mp_irqs[i].mp_srcbusirq != m->srcbusirq) 820 if (mp_irqs[i].srcbusirq != m->srcbusirq)
821 continue; 821 continue;
822 if (irq_used[i]) { 822 if (irq_used[i]) {
823 /* already claimed */ 823 /* already claimed */
@@ -922,10 +922,10 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
922 if (irq_used[i]) 922 if (irq_used[i])
923 continue; 923 continue;
924 924
925 if (mp_irqs[i].mp_irqtype != mp_INT) 925 if (mp_irqs[i].irqtype != mp_INT)
926 continue; 926 continue;
927 927
928 if (mp_irqs[i].mp_irqflag != 0x0f) 928 if (mp_irqs[i].irqflag != 0x0f)
929 continue; 929 continue;
930 930
931 if (nr_m_spare > 0) { 931 if (nr_m_spare > 0) {
@@ -1001,7 +1001,7 @@ static int __init update_mp_table(void)
1001{ 1001{
1002 char str[16]; 1002 char str[16];
1003 char oem[10]; 1003 char oem[10];
1004 struct intel_mp_floating *mpf; 1004 struct mpf_intel *mpf;
1005 struct mpc_table *mpc, *mpc_new; 1005 struct mpc_table *mpc, *mpc_new;
1006 1006
1007 if (!enable_update_mptable) 1007 if (!enable_update_mptable)
@@ -1014,19 +1014,19 @@ static int __init update_mp_table(void)
1014 /* 1014 /*
1015 * Now see if we need to go further. 1015 * Now see if we need to go further.
1016 */ 1016 */
1017 if (mpf->mpf_feature1 != 0) 1017 if (mpf->feature1 != 0)
1018 return 0; 1018 return 0;
1019 1019
1020 if (!mpf->mpf_physptr) 1020 if (!mpf->physptr)
1021 return 0; 1021 return 0;
1022 1022
1023 mpc = phys_to_virt(mpf->mpf_physptr); 1023 mpc = phys_to_virt(mpf->physptr);
1024 1024
1025 if (!smp_check_mpc(mpc, oem, str)) 1025 if (!smp_check_mpc(mpc, oem, str))
1026 return 0; 1026 return 0;
1027 1027
1028 printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf)); 1028 printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
1029 printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); 1029 printk(KERN_INFO "physptr: %x\n", mpf->physptr);
1030 1030
1031 if (mpc_new_phys && mpc->length > mpc_new_length) { 1031 if (mpc_new_phys && mpc->length > mpc_new_length) {
1032 mpc_new_phys = 0; 1032 mpc_new_phys = 0;
@@ -1047,23 +1047,23 @@ static int __init update_mp_table(void)
1047 } 1047 }
1048 printk(KERN_INFO "use in-positon replacing\n"); 1048 printk(KERN_INFO "use in-positon replacing\n");
1049 } else { 1049 } else {
1050 mpf->mpf_physptr = mpc_new_phys; 1050 mpf->physptr = mpc_new_phys;
1051 mpc_new = phys_to_virt(mpc_new_phys); 1051 mpc_new = phys_to_virt(mpc_new_phys);
1052 memcpy(mpc_new, mpc, mpc->length); 1052 memcpy(mpc_new, mpc, mpc->length);
1053 mpc = mpc_new; 1053 mpc = mpc_new;
1054 /* check if we can modify that */ 1054 /* check if we can modify that */
1055 if (mpc_new_phys - mpf->mpf_physptr) { 1055 if (mpc_new_phys - mpf->physptr) {
1056 struct intel_mp_floating *mpf_new; 1056 struct mpf_intel *mpf_new;
1057 /* steal 16 bytes from [0, 1k) */ 1057 /* steal 16 bytes from [0, 1k) */
1058 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); 1058 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
1059 mpf_new = phys_to_virt(0x400 - 16); 1059 mpf_new = phys_to_virt(0x400 - 16);
1060 memcpy(mpf_new, mpf, 16); 1060 memcpy(mpf_new, mpf, 16);
1061 mpf = mpf_new; 1061 mpf = mpf_new;
1062 mpf->mpf_physptr = mpc_new_phys; 1062 mpf->physptr = mpc_new_phys;
1063 } 1063 }
1064 mpf->mpf_checksum = 0; 1064 mpf->checksum = 0;
1065 mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16); 1065 mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
1066 printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr); 1066 printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
1067 } 1067 }
1068 1068
1069 /* 1069 /*
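The mpparse.c changes are a mechanical rename (struct intel_mp_floating becomes struct mpf_intel, the mpf_ field prefixes drop), but the hunks also show the checksum convention for the 16-byte floating pointer: the checksum byte is chosen so that all 16 bytes sum to zero, and update_mp_table() regenerates it by starting from zero and subtracting the sum of the rest. The stand-alone sketch below reproduces that arithmetic; placing the checksum in the last byte is just a stand-in position, not the real structure layout.

    #include <stdio.h>

    /* Sum of all bytes must be zero for a valid 16-byte floating pointer. */
    static unsigned char checksum(const unsigned char *p, int len)
    {
        unsigned char sum = 0;

        while (len--)
            sum += *p++;
        return sum;
    }

    int main(void)
    {
        unsigned char mpf[16] = { '_', 'M', 'P', '_' };   /* rest zero */

        /* Regenerate the checksum the way update_mp_table() does. */
        mpf[15] = 0;
        mpf[15] -= checksum(mpf, 16);

        printf("checksum over 16 bytes = %u (0 means valid)\n",
               checksum(mpf, 16));
        return 0;
    }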
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 726266695b2c..3cf3413ec626 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -35,10 +35,10 @@
35#include <linux/device.h> 35#include <linux/device.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/uaccess.h>
38 39
39#include <asm/processor.h> 40#include <asm/processor.h>
40#include <asm/msr.h> 41#include <asm/msr.h>
41#include <asm/uaccess.h>
42#include <asm/system.h> 42#include <asm/system.h>
43 43
44static struct class *msr_class; 44static struct class *msr_class;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 7228979f1e7f..23b6d9e6e4f5 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -61,11 +61,7 @@ static int endflag __initdata;
61 61
62static inline unsigned int get_nmi_count(int cpu) 62static inline unsigned int get_nmi_count(int cpu)
63{ 63{
64#ifdef CONFIG_X86_64 64 return per_cpu(irq_stat, cpu).__nmi_count;
65 return cpu_pda(cpu)->__nmi_count;
66#else
67 return nmi_count(cpu);
68#endif
69} 65}
70 66
71static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
@@ -82,12 +78,8 @@ static inline int mce_in_progress(void)
82 */ 78 */
83static inline unsigned int get_timer_irqs(int cpu) 79static inline unsigned int get_timer_irqs(int cpu)
84{ 80{
85#ifdef CONFIG_X86_64
86 return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
87#else
88 return per_cpu(irq_stat, cpu).apic_timer_irqs + 81 return per_cpu(irq_stat, cpu).apic_timer_irqs +
89 per_cpu(irq_stat, cpu).irq0_irqs; 82 per_cpu(irq_stat, cpu).irq0_irqs;
90#endif
91} 83}
92 84
93#ifdef CONFIG_SMP 85#ifdef CONFIG_SMP
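get_nmi_count() and get_timer_irqs() now read the same per-CPU counters on 32- and 64-bit. These helpers feed the NMI watchdog, which, roughly speaking, flags a CPU when its timer-interrupt count stops advancing between consecutive NMIs; that summary is my paraphrase of the surrounding watchdog code, not text from this patch, and the threshold below is illustrative.

    #include <stdio.h>

    /* One watchdog sample per CPU: last observed timer-IRQ count and how
     * many consecutive NMIs saw no progress. */
    struct watchdog_state {
        unsigned int last_timer_irqs;
        unsigned int stuck_nmis;
    };

    static int check_cpu(struct watchdog_state *w, unsigned int timer_irqs)
    {
        if (timer_irqs == w->last_timer_irqs)
            w->stuck_nmis++;
        else
            w->stuck_nmis = 0;
        w->last_timer_irqs = timer_irqs;
        return w->stuck_nmis > 5;          /* illustrative threshold */
    }

    int main(void)
    {
        struct watchdog_state w = { 0, 0 };
        unsigned int count = 0;            /* timer never advances */

        for (int nmi = 0; nmi < 10; nmi++)
            if (check_cpu(&w, count))
                printf("NMI %d: possible hard lockup\n", nmi);
        return 0;
    }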
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e4c8fb608873..cea11c8e3049 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -44,6 +44,17 @@ void _paravirt_nop(void)
44{ 44{
45} 45}
46 46
47/* identity function, which can be inlined */
48u32 _paravirt_ident_32(u32 x)
49{
50 return x;
51}
52
53u64 _paravirt_ident_64(u64 x)
54{
55 return x;
56}
57
47static void __init default_banner(void) 58static void __init default_banner(void)
48{ 59{
49 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 60 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -138,9 +149,16 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
138 if (opfunc == NULL) 149 if (opfunc == NULL)
139 /* If there's no function, patch it with a ud2a (BUG) */ 150 /* If there's no function, patch it with a ud2a (BUG) */
140 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); 151 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
141 else if (opfunc == paravirt_nop) 152 else if (opfunc == _paravirt_nop)
142 /* If the operation is a nop, then nop the callsite */ 153 /* If the operation is a nop, then nop the callsite */
143 ret = paravirt_patch_nop(); 154 ret = paravirt_patch_nop();
155
156 /* identity functions just return their single argument */
157 else if (opfunc == _paravirt_ident_32)
158 ret = paravirt_patch_ident_32(insnbuf, len);
159 else if (opfunc == _paravirt_ident_64)
160 ret = paravirt_patch_ident_64(insnbuf, len);
161
144 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || 162 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
145 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || 163 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
146 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || 164 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
@@ -292,10 +310,10 @@ struct pv_time_ops pv_time_ops = {
292 310
293struct pv_irq_ops pv_irq_ops = { 311struct pv_irq_ops pv_irq_ops = {
294 .init_IRQ = native_init_IRQ, 312 .init_IRQ = native_init_IRQ,
295 .save_fl = native_save_fl, 313 .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
296 .restore_fl = native_restore_fl, 314 .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
297 .irq_disable = native_irq_disable, 315 .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
298 .irq_enable = native_irq_enable, 316 .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
299 .safe_halt = native_safe_halt, 317 .safe_halt = native_safe_halt,
300 .halt = native_halt, 318 .halt = native_halt,
301#ifdef CONFIG_X86_64 319#ifdef CONFIG_X86_64
@@ -373,6 +391,14 @@ struct pv_apic_ops pv_apic_ops = {
373#endif 391#endif
374}; 392};
375 393
394#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
395/* 32-bit pagetable entries */
396#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
397#else
398/* 64-bit pagetable entries */
399#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
400#endif
401
376struct pv_mmu_ops pv_mmu_ops = { 402struct pv_mmu_ops pv_mmu_ops = {
377#ifndef CONFIG_X86_64 403#ifndef CONFIG_X86_64
378 .pagetable_setup_start = native_pagetable_setup_start, 404 .pagetable_setup_start = native_pagetable_setup_start,
@@ -424,22 +450,23 @@ struct pv_mmu_ops pv_mmu_ops = {
424 .pmd_clear = native_pmd_clear, 450 .pmd_clear = native_pmd_clear,
425#endif 451#endif
426 .set_pud = native_set_pud, 452 .set_pud = native_set_pud,
427 .pmd_val = native_pmd_val, 453
428 .make_pmd = native_make_pmd, 454 .pmd_val = PTE_IDENT,
455 .make_pmd = PTE_IDENT,
429 456
430#if PAGETABLE_LEVELS == 4 457#if PAGETABLE_LEVELS == 4
431 .pud_val = native_pud_val, 458 .pud_val = PTE_IDENT,
432 .make_pud = native_make_pud, 459 .make_pud = PTE_IDENT,
460
433 .set_pgd = native_set_pgd, 461 .set_pgd = native_set_pgd,
434#endif 462#endif
435#endif /* PAGETABLE_LEVELS >= 3 */ 463#endif /* PAGETABLE_LEVELS >= 3 */
436 464
437 .pte_val = native_pte_val, 465 .pte_val = PTE_IDENT,
438 .pte_flags = native_pte_flags, 466 .pgd_val = PTE_IDENT,
439 .pgd_val = native_pgd_val,
440 467
441 .make_pte = native_make_pte, 468 .make_pte = PTE_IDENT,
442 .make_pgd = native_make_pgd, 469 .make_pgd = PTE_IDENT,
443 470
444 .dup_mmap = paravirt_nop, 471 .dup_mmap = paravirt_nop,
445 .exit_mmap = paravirt_nop, 472 .exit_mmap = paravirt_nop,
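On native hardware the pte_val/make_pte family collapses to the new _paravirt_ident_32/64 helpers, and paravirt_patch_default() recognizes those addresses so the call site can be patched down to a register move (handled per-bitness in the next two files). The user-space model below shows only the "recognize the identity op by address" step; the xen_like() function is a made-up stand-in for a real hypervisor hook, assumed purely for contrast.

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t (*pv_op_t)(uint64_t);

    static uint64_t ident_64(uint64_t x)  { return x; }           /* native */
    static uint64_t xen_like(uint64_t x)  { return x ^ 0xfff; }   /* made-up guest op */

    /* Decide, per call site, whether the op can be patched away. */
    static const char *classify(pv_op_t op)
    {
        if (op == ident_64)
            return "identity - patch call site to a mov";
        return "real hook - keep the indirect call";
    }

    int main(void)
    {
        pv_op_t pte_val  = ident_64;   /* what pv_mmu_ops.pte_val becomes natively */
        pv_op_t make_pte = xen_like;   /* stand-in for a hypervisor hook */

        printf("pte_val : %s\n", classify(pte_val));
        printf("make_pte: %s\n", classify(make_pte));
        return 0;
    }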
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 9fe644f4861d..d9f32e6d6ab6 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
12DEF_NATIVE(pv_cpu_ops, clts, "clts"); 12DEF_NATIVE(pv_cpu_ops, clts, "clts");
13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); 13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
14 14
15unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
16{
17 /* arg in %eax, return in %eax */
18 return 0;
19}
20
21unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
22{
23 /* arg in %edx:%eax, return in %edx:%eax */
24 return 0;
25}
26
15unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 27unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
16 unsigned long addr, unsigned len) 28 unsigned long addr, unsigned len)
17{ 29{
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 061d01df9ae6..3f08f34f93eb 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
19DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); 19DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
20DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); 20DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
21 21
22DEF_NATIVE(, mov32, "mov %edi, %eax");
23DEF_NATIVE(, mov64, "mov %rdi, %rax");
24
25unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
26{
27 return paravirt_patch_insns(insnbuf, len,
28 start__mov32, end__mov32);
29}
30
31unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
32{
33 return paravirt_patch_insns(insnbuf, len,
34 start__mov64, end__mov64);
35}
36
22unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 37unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 unsigned long addr, unsigned len) 38 unsigned long addr, unsigned len)
24{ 39{
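On 64-bit the identity patch copies the "mov %rdi, %rax" template over the call site, while the 32-bit versions return 0 bytes because the argument already sits in the return registers; the actual copy goes through paravirt_patch_insns(), which is essentially a length-checked memcpy. A stand-alone model of that copy-if-it-fits step; the template bytes and buffer sizes here are placeholders, not the real instruction encoding.

    #include <stdio.h>
    #include <string.h>

    /* Copy a replacement instruction sequence into the call site if it
     * fits; otherwise leave the site alone and report 0 bytes patched. */
    static unsigned patch_insns(unsigned char *site, unsigned site_len,
                                const unsigned char *tmpl, unsigned tmpl_len)
    {
        if (tmpl_len > site_len)
            return 0;
        memcpy(site, tmpl, tmpl_len);
        return tmpl_len;
    }

    int main(void)
    {
        unsigned char call_site[8] = { 0 };
        /* Placeholder template bytes; the real patch uses the encoding of
         * "mov %rdi,%rax" emitted by DEF_NATIVE(). */
        const unsigned char mov64[] = { 0xAA, 0xBB, 0xCC };

        unsigned n = patch_insns(call_site, sizeof(call_site),
                                 mov64, sizeof(mov64));
        printf("patched %u byte(s)\n", n);
        return 0;
    }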
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a546f55c77b4..1a1ae8edc40c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -66,9 +66,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
66DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 66DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
67EXPORT_PER_CPU_SYMBOL(current_task); 67EXPORT_PER_CPU_SYMBOL(current_task);
68 68
69DEFINE_PER_CPU(int, cpu_number);
70EXPORT_PER_CPU_SYMBOL(cpu_number);
71
72/* 69/*
73 * Return saved PC of a blocked thread. 70 * Return saved PC of a blocked thread.
74 */ 71 */
@@ -111,7 +108,6 @@ void cpu_idle(void)
111 play_dead(); 108 play_dead();
112 109
113 local_irq_disable(); 110 local_irq_disable();
114 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
115 /* Don't trace irqs off for idle */ 111 /* Don't trace irqs off for idle */
116 stop_critical_timings(); 112 stop_critical_timings();
117 pm_idle(); 113 pm_idle();
@@ -591,7 +587,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
591 if (prev->gs | next->gs) 587 if (prev->gs | next->gs)
592 loadsegment(gs, next->gs); 588 loadsegment(gs, next->gs);
593 589
594 x86_write_percpu(current_task, next_p); 590 percpu_write(current_task, next_p);
595 591
596 return prev_p; 592 return prev_p;
597} 593}
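__switch_to() on 32-bit now updates current_task through the generic percpu_write() accessor instead of the old x86_write_percpu(). The toy below models what a per-CPU "current task" pointer buys; real per-CPU access goes through a segment register rather than an array index, so the array here is only an analogy.

    #include <stdio.h>

    #define NR_CPUS 2

    struct task { const char *comm; };

    static struct task *current_task[NR_CPUS];   /* one slot per CPU */

    /* Analogous to percpu_write(current_task, next) on the running CPU. */
    static void switch_to(int cpu, struct task *next)
    {
        current_task[cpu] = next;
    }

    int main(void)
    {
        struct task idle = { "swapper" }, worker = { "kworker" };

        switch_to(0, &idle);
        switch_to(1, &worker);
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            printf("cpu%d runs %s\n", cpu, current_task[cpu]->comm);
        return 0;
    }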
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 416fb9282f4f..c422eebb0c58 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -16,6 +16,7 @@
16 16
17#include <stdarg.h> 17#include <stdarg.h>
18 18
19#include <linux/stackprotector.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/errno.h> 21#include <linux/errno.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
@@ -46,7 +47,6 @@
46#include <asm/processor.h> 47#include <asm/processor.h>
47#include <asm/i387.h> 48#include <asm/i387.h>
48#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
49#include <asm/pda.h>
50#include <asm/prctl.h> 50#include <asm/prctl.h>
51#include <asm/desc.h> 51#include <asm/desc.h>
52#include <asm/proto.h> 52#include <asm/proto.h>
@@ -57,6 +57,12 @@
57 57
58asmlinkage extern void ret_from_fork(void); 58asmlinkage extern void ret_from_fork(void);
59 59
60DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
61EXPORT_PER_CPU_SYMBOL(current_task);
62
63DEFINE_PER_CPU(unsigned long, old_rsp);
64static DEFINE_PER_CPU(unsigned char, is_idle);
65
60unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; 66unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
61 67
62static ATOMIC_NOTIFIER_HEAD(idle_notifier); 68static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -75,13 +81,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
75 81
76void enter_idle(void) 82void enter_idle(void)
77{ 83{
78 write_pda(isidle, 1); 84 percpu_write(is_idle, 1);
79 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); 85 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
80} 86}
81 87
82static void __exit_idle(void) 88static void __exit_idle(void)
83{ 89{
84 if (test_and_clear_bit_pda(0, isidle) == 0) 90 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
85 return; 91 return;
86 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); 92 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
87} 93}
@@ -111,6 +117,17 @@ static inline void play_dead(void)
111void cpu_idle(void) 117void cpu_idle(void)
112{ 118{
113 current_thread_info()->status |= TS_POLLING; 119 current_thread_info()->status |= TS_POLLING;
120
121 /*
122 * If we're the non-boot CPU, nothing set the PDA stack
123 * canary up for us - and if we are the boot CPU we have
124 * a 0 stack canary. This is a good place for updating
125 * it, as we wont ever return from this function (so the
126 * invalid canaries already on the stack wont ever
127 * trigger):
128 */
129 boot_init_stack_canary();
130
114 /* endless idle loop with no priority at all */ 131 /* endless idle loop with no priority at all */
115 while (1) { 132 while (1) {
116 tick_nohz_stop_sched_tick(1); 133 tick_nohz_stop_sched_tick(1);
@@ -392,7 +409,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
392 load_gs_index(0); 409 load_gs_index(0);
393 regs->ip = new_ip; 410 regs->ip = new_ip;
394 regs->sp = new_sp; 411 regs->sp = new_sp;
395 write_pda(oldrsp, new_sp); 412 percpu_write(old_rsp, new_sp);
396 regs->cs = __USER_CS; 413 regs->cs = __USER_CS;
397 regs->ss = __USER_DS; 414 regs->ss = __USER_DS;
398 regs->flags = 0x200; 415 regs->flags = 0x200;
@@ -613,21 +630,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
613 /* 630 /*
614 * Switch the PDA and FPU contexts. 631 * Switch the PDA and FPU contexts.
615 */ 632 */
616 prev->usersp = read_pda(oldrsp); 633 prev->usersp = percpu_read(old_rsp);
617 write_pda(oldrsp, next->usersp); 634 percpu_write(old_rsp, next->usersp);
618 write_pda(pcurrent, next_p); 635 percpu_write(current_task, next_p);
619 636
620 write_pda(kernelstack, 637 percpu_write(kernel_stack,
621 (unsigned long)task_stack_page(next_p) + 638 (unsigned long)task_stack_page(next_p) +
622 THREAD_SIZE - PDA_STACKOFFSET); 639 THREAD_SIZE - KERNEL_STACK_OFFSET);
623#ifdef CONFIG_CC_STACKPROTECTOR
624 write_pda(stack_canary, next_p->stack_canary);
625 /*
626 * Build time only check to make sure the stack_canary is at
627 * offset 40 in the pda; this is a gcc ABI requirement
628 */
629 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
630#endif
631 640
632 /* 641 /*
633 * Now maybe reload the debug registers and handle I/O bitmaps 642 * Now maybe reload the debug registers and handle I/O bitmaps
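The removed BUILD_BUG_ON in __switch_to() enforced at compile time that stack_canary sat at offset 40 in the PDA, because gcc's x86-64 stack protector reads the canary from %gs:40; with the PDA gone, that invariant is maintained elsewhere in this series. The offset-assertion trick itself is easy to reproduce in plain C; the struct below only approximates the old layout and is meant as an illustration, not the kernel's definition.

    #include <stdio.h>
    #include <stddef.h>

    struct pda_like {
        void *pcurrent;             /*  0 */
        unsigned long data_offset;  /*  8 */
        unsigned long kernelstack;  /* 16 */
        unsigned long oldrsp;       /* 24 */
        int irqcount;               /* 32 */
        unsigned int cpunumber;     /* 36 */
        unsigned long stack_canary; /* 40 - gcc reads the canary at %gs:40 */
    };

    /* Fails to compile (negative array size) if the offset ever drifts. */
    typedef char canary_offset_check[
        offsetof(struct pda_like, stack_canary) == 40 ? 1 : -1];

    int main(void)
    {
        printf("stack_canary offset = %zu\n",
               offsetof(struct pda_like, stack_canary));
        return 0;
    }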
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2b46eb41643b..f8536fee5c12 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -14,6 +14,7 @@
14#include <asm/reboot.h> 14#include <asm/reboot.h>
15#include <asm/pci_x86.h> 15#include <asm/pci_x86.h>
16#include <asm/virtext.h> 16#include <asm/virtext.h>
17#include <asm/cpu.h>
17 18
18#ifdef CONFIG_X86_32 19#ifdef CONFIG_X86_32
19# include <linux/dmi.h> 20# include <linux/dmi.h>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ae0d8042cf69..f41c4486c270 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -89,7 +89,7 @@
89 89
90#include <asm/system.h> 90#include <asm/system.h>
91#include <asm/vsyscall.h> 91#include <asm/vsyscall.h>
92#include <asm/smp.h> 92#include <asm/cpu.h>
93#include <asm/desc.h> 93#include <asm/desc.h>
94#include <asm/dma.h> 94#include <asm/dma.h>
95#include <asm/iommu.h> 95#include <asm/iommu.h>
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 01161077a49c..ef91747bbed5 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -13,145 +13,46 @@
13#include <asm/mpspec.h> 13#include <asm/mpspec.h>
14#include <asm/apicdef.h> 14#include <asm/apicdef.h>
15#include <asm/highmem.h> 15#include <asm/highmem.h>
16#include <asm/proto.h>
17#include <asm/cpumask.h>
18#include <asm/cpu.h>
16 19
17#ifdef CONFIG_X86_LOCAL_APIC 20#ifdef CONFIG_DEBUG_PER_CPU_MAPS
18unsigned int num_processors; 21# define DBG(x...) printk(KERN_DEBUG x)
19unsigned disabled_cpus __cpuinitdata;
20/* Processor that is doing the boot up */
21unsigned int boot_cpu_physical_apicid = -1U;
22EXPORT_SYMBOL(boot_cpu_physical_apicid);
23unsigned int max_physical_apicid;
24
25/* Bitmask of physically existing CPUs */
26physid_mask_t phys_cpu_present_map;
27#endif
28
29/* map cpu index to physical APIC ID */
30DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
31DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
32EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
33EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
34
35#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
36#define X86_64_NUMA 1
37
38/* map cpu index to node index */
39DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
40EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
41
42/* which logical CPUs are on which nodes */
43cpumask_t *node_to_cpumask_map;
44EXPORT_SYMBOL(node_to_cpumask_map);
45
46/* setup node_to_cpumask_map */
47static void __init setup_node_to_cpumask_map(void);
48
49#else 22#else
50static inline void setup_node_to_cpumask_map(void) { } 23# define DBG(x...)
51#endif 24#endif
52 25
53#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) 26DEFINE_PER_CPU(int, cpu_number);
54/* 27EXPORT_PER_CPU_SYMBOL(cpu_number);
55 * Copy data used in early init routines from the initial arrays to the
56 * per cpu data areas. These arrays then become expendable and the
57 * *_early_ptr's are zeroed indicating that the static arrays are gone.
58 */
59static void __init setup_per_cpu_maps(void)
60{
61 int cpu;
62 28
63 for_each_possible_cpu(cpu) { 29#ifdef CONFIG_X86_64
64 per_cpu(x86_cpu_to_apicid, cpu) = 30#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
65 early_per_cpu_map(x86_cpu_to_apicid, cpu); 31#else
66 per_cpu(x86_bios_cpu_apicid, cpu) = 32#define BOOT_PERCPU_OFFSET 0
67 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
68#ifdef X86_64_NUMA
69 per_cpu(x86_cpu_to_node_map, cpu) =
70 early_per_cpu_map(x86_cpu_to_node_map, cpu);
71#endif 33#endif
72 }
73 34
74 /* indicate the early static arrays will soon be gone */ 35DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
75 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 36EXPORT_PER_CPU_SYMBOL(this_cpu_off);
76 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
77#ifdef X86_64_NUMA
78 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
79#endif
80}
81 37
82#ifdef CONFIG_X86_32 38unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
83/* 39 [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
84 * Great future not-so-futuristic plan: make i386 and x86_64 do it 40};
85 * the same way
86 */
87unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
88EXPORT_SYMBOL(__per_cpu_offset); 41EXPORT_SYMBOL(__per_cpu_offset);
89static inline void setup_cpu_pda_map(void) { }
90
91#elif !defined(CONFIG_SMP)
92static inline void setup_cpu_pda_map(void) { }
93
94#else /* CONFIG_SMP && CONFIG_X86_64 */
95
96/*
97 * Allocate cpu_pda pointer table and array via alloc_bootmem.
98 */
99static void __init setup_cpu_pda_map(void)
100{
101 char *pda;
102 struct x8664_pda **new_cpu_pda;
103 unsigned long size;
104 int cpu;
105
106 size = roundup(sizeof(struct x8664_pda), cache_line_size());
107
108 /* allocate cpu_pda array and pointer table */
109 {
110 unsigned long tsize = nr_cpu_ids * sizeof(void *);
111 unsigned long asize = size * (nr_cpu_ids - 1);
112 42
113 tsize = roundup(tsize, cache_line_size()); 43static inline void setup_percpu_segment(int cpu)
114 new_cpu_pda = alloc_bootmem(tsize + asize);
115 pda = (char *)new_cpu_pda + tsize;
116 }
117
118 /* initialize pointer table to static pda's */
119 for_each_possible_cpu(cpu) {
120 if (cpu == 0) {
121 /* leave boot cpu pda in place */
122 new_cpu_pda[0] = cpu_pda(0);
123 continue;
124 }
125 new_cpu_pda[cpu] = (struct x8664_pda *)pda;
126 new_cpu_pda[cpu]->in_bootmem = 1;
127 pda += size;
128 }
129
130 /* point to new pointer table */
131 _cpu_pda = new_cpu_pda;
132}
133
134#endif /* CONFIG_SMP && CONFIG_X86_64 */
135
136#ifdef CONFIG_X86_64
137
138/* correctly size the local cpu masks */
139static void __init setup_cpu_local_masks(void)
140{ 44{
141 alloc_bootmem_cpumask_var(&cpu_initialized_mask); 45#ifdef CONFIG_X86_32
142 alloc_bootmem_cpumask_var(&cpu_callin_mask); 46 struct desc_struct gdt;
143 alloc_bootmem_cpumask_var(&cpu_callout_mask);
144 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
145}
146
147#else /* CONFIG_X86_32 */
148 47
149static inline void setup_cpu_local_masks(void) 48 pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
150{ 49 0x2 | DESCTYPE_S, 0x8);
50 gdt.s = 1;
51 write_gdt_entry(get_cpu_gdt_table(cpu),
52 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
53#endif
151} 54}
152 55
153#endif /* CONFIG_X86_32 */
154
155/* 56/*
156 * Great future plan: 57 * Great future plan:
157 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 58 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
@@ -159,18 +60,12 @@ static inline void setup_cpu_local_masks(void)
159 */ 60 */
160void __init setup_per_cpu_areas(void) 61void __init setup_per_cpu_areas(void)
161{ 62{
162 ssize_t size, old_size; 63 ssize_t size;
163 char *ptr; 64 char *ptr;
164 int cpu; 65 int cpu;
165 unsigned long align = 1;
166
167 /* Setup cpu_pda map */
168 setup_cpu_pda_map();
169 66
170 /* Copy section for each CPU (we discard the original) */ 67 /* Copy section for each CPU (we discard the original) */
171 old_size = PERCPU_ENOUGH_ROOM; 68 size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
172 align = max_t(unsigned long, PAGE_SIZE, align);
173 size = roundup(old_size, align);
174 69
175 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 70 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
176 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 71 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
@@ -179,30 +74,67 @@ void __init setup_per_cpu_areas(void)
179 74
180 for_each_possible_cpu(cpu) { 75 for_each_possible_cpu(cpu) {
181#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
182 ptr = __alloc_bootmem(size, align, 77 ptr = alloc_bootmem_pages(size);
183 __pa(MAX_DMA_ADDRESS));
184#else 78#else
185 int node = early_cpu_to_node(cpu); 79 int node = early_cpu_to_node(cpu);
186 if (!node_online(node) || !NODE_DATA(node)) { 80 if (!node_online(node) || !NODE_DATA(node)) {
187 ptr = __alloc_bootmem(size, align, 81 ptr = alloc_bootmem_pages(size);
188 __pa(MAX_DMA_ADDRESS));
189 pr_info("cpu %d has no node %d or node-local memory\n", 82 pr_info("cpu %d has no node %d or node-local memory\n",
190 cpu, node); 83 cpu, node);
191 pr_debug("per cpu data for cpu%d at %016lx\n", 84 pr_debug("per cpu data for cpu%d at %016lx\n",
192 cpu, __pa(ptr)); 85 cpu, __pa(ptr));
193 } else { 86 } else {
194 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, 87 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
195 __pa(MAX_DMA_ADDRESS));
196 pr_debug("per cpu data for cpu%d on node%d at %016lx\n", 88 pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
197 cpu, node, __pa(ptr)); 89 cpu, node, __pa(ptr));
198 } 90 }
199#endif 91#endif
92
93 memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
200 per_cpu_offset(cpu) = ptr - __per_cpu_start; 94 per_cpu_offset(cpu) = ptr - __per_cpu_start;
201 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 95 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
96 per_cpu(cpu_number, cpu) = cpu;
97 setup_percpu_segment(cpu);
98 /*
99 * Copy data used in early init routines from the
100 * initial arrays to the per cpu data areas. These
101 * arrays then become expendable and the *_early_ptr's
102 * are zeroed indicating that the static arrays are
103 * gone.
104 */
105#ifdef CONFIG_X86_LOCAL_APIC
106 per_cpu(x86_cpu_to_apicid, cpu) =
107 early_per_cpu_map(x86_cpu_to_apicid, cpu);
108 per_cpu(x86_bios_cpu_apicid, cpu) =
109 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
110#endif
111#ifdef CONFIG_X86_64
112 per_cpu(irq_stack_ptr, cpu) =
113 per_cpu(irq_stack_union.irq_stack, cpu) +
114 IRQ_STACK_SIZE - 64;
115#ifdef CONFIG_NUMA
116 per_cpu(x86_cpu_to_node_map, cpu) =
117 early_per_cpu_map(x86_cpu_to_node_map, cpu);
118#endif
119#endif
120 /*
121 * Up to this point, the boot CPU has been using .data.init
122 * area. Reload any changed state for the boot CPU.
123 */
124 if (cpu == boot_cpu_id)
125 switch_to_new_gdt(cpu);
126
127 DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
202 } 128 }
203 129
204 /* Setup percpu data maps */ 130 /* indicate the early static arrays will soon be gone */
205 setup_per_cpu_maps(); 131#ifdef CONFIG_X86_LOCAL_APIC
132 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
133 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
134#endif
135#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
136 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
137#endif
206 138
207 /* Setup node to cpumask map */ 139 /* Setup node to cpumask map */
208 setup_node_to_cpumask_map(); 140 setup_node_to_cpumask_map();
@@ -210,199 +142,3 @@ void __init setup_per_cpu_areas(void)
210 /* Setup cpu initialized, callin, callout masks */ 142 /* Setup cpu initialized, callin, callout masks */
211 setup_cpu_local_masks(); 143 setup_cpu_local_masks();
212} 144}
213
214#endif
215
216#ifdef X86_64_NUMA
217
218/*
219 * Allocate node_to_cpumask_map based on number of available nodes
220 * Requires node_possible_map to be valid.
221 *
222 * Note: node_to_cpumask() is not valid until after this is done.
223 */
224static void __init setup_node_to_cpumask_map(void)
225{
226 unsigned int node, num = 0;
227 cpumask_t *map;
228
229 /* setup nr_node_ids if not done yet */
230 if (nr_node_ids == MAX_NUMNODES) {
231 for_each_node_mask(node, node_possible_map)
232 num = node;
233 nr_node_ids = num + 1;
234 }
235
236 /* allocate the map */
237 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
238
239 pr_debug("Node to cpumask map at %p for %d nodes\n",
240 map, nr_node_ids);
241
242 /* node_to_cpumask() will now work */
243 node_to_cpumask_map = map;
244}
245
246void __cpuinit numa_set_node(int cpu, int node)
247{
248 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
249
250 if (cpu_pda(cpu) && node != NUMA_NO_NODE)
251 cpu_pda(cpu)->nodenumber = node;
252
253 if (cpu_to_node_map)
254 cpu_to_node_map[cpu] = node;
255
256 else if (per_cpu_offset(cpu))
257 per_cpu(x86_cpu_to_node_map, cpu) = node;
258
259 else
260 pr_debug("Setting node for non-present cpu %d\n", cpu);
261}
262
263void __cpuinit numa_clear_node(int cpu)
264{
265 numa_set_node(cpu, NUMA_NO_NODE);
266}
267
268#ifndef CONFIG_DEBUG_PER_CPU_MAPS
269
270void __cpuinit numa_add_cpu(int cpu)
271{
272 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
273}
274
275void __cpuinit numa_remove_cpu(int cpu)
276{
277 cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
278}
279
280#else /* CONFIG_DEBUG_PER_CPU_MAPS */
281
282/*
283 * --------- debug versions of the numa functions ---------
284 */
285static void __cpuinit numa_set_cpumask(int cpu, int enable)
286{
287 int node = cpu_to_node(cpu);
288 cpumask_t *mask;
289 char buf[64];
290
291 if (node_to_cpumask_map == NULL) {
292 printk(KERN_ERR "node_to_cpumask_map NULL\n");
293 dump_stack();
294 return;
295 }
296
297 mask = &node_to_cpumask_map[node];
298 if (enable)
299 cpu_set(cpu, *mask);
300 else
301 cpu_clear(cpu, *mask);
302
303 cpulist_scnprintf(buf, sizeof(buf), mask);
304 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
305 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
306}
307
308void __cpuinit numa_add_cpu(int cpu)
309{
310 numa_set_cpumask(cpu, 1);
311}
312
313void __cpuinit numa_remove_cpu(int cpu)
314{
315 numa_set_cpumask(cpu, 0);
316}
317
318int cpu_to_node(int cpu)
319{
320 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
321 printk(KERN_WARNING
322 "cpu_to_node(%d): usage too early!\n", cpu);
323 dump_stack();
324 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
325 }
326 return per_cpu(x86_cpu_to_node_map, cpu);
327}
328EXPORT_SYMBOL(cpu_to_node);
329
330/*
331 * Same function as cpu_to_node() but used if called before the
332 * per_cpu areas are setup.
333 */
334int early_cpu_to_node(int cpu)
335{
336 if (early_per_cpu_ptr(x86_cpu_to_node_map))
337 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
338
339 if (!per_cpu_offset(cpu)) {
340 printk(KERN_WARNING
341 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
342 dump_stack();
343 return NUMA_NO_NODE;
344 }
345 return per_cpu(x86_cpu_to_node_map, cpu);
346}
347
348
349/* empty cpumask */
350static const cpumask_t cpu_mask_none;
351
352/*
353 * Returns a pointer to the bitmask of CPUs on Node 'node'.
354 */
355const cpumask_t *cpumask_of_node(int node)
356{
357 if (node_to_cpumask_map == NULL) {
358 printk(KERN_WARNING
359 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
360 node);
361 dump_stack();
362 return (const cpumask_t *)&cpu_online_map;
363 }
364 if (node >= nr_node_ids) {
365 printk(KERN_WARNING
366 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
367 node, nr_node_ids);
368 dump_stack();
369 return &cpu_mask_none;
370 }
371 return &node_to_cpumask_map[node];
372}
373EXPORT_SYMBOL(cpumask_of_node);
374
375/*
376 * Returns a bitmask of CPUs on Node 'node'.
377 *
378 * Side note: this function creates the returned cpumask on the stack
379 * so with a high NR_CPUS count, excessive stack space is used. The
380 * node_to_cpumask_ptr function should be used whenever possible.
381 */
382cpumask_t node_to_cpumask(int node)
383{
384 if (node_to_cpumask_map == NULL) {
385 printk(KERN_WARNING
386 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
387 dump_stack();
388 return cpu_online_map;
389 }
390 if (node >= nr_node_ids) {
391 printk(KERN_WARNING
392 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
393 node, nr_node_ids);
394 dump_stack();
395 return cpu_mask_none;
396 }
397 return node_to_cpumask_map[node];
398}
399EXPORT_SYMBOL(node_to_cpumask);
400
401/*
402 * --------- end of debug versions of the numa functions ---------
403 */
404
405#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
406
407#endif /* X86_64_NUMA */
408
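The rewritten setup_per_cpu_areas() allocates one page-aligned copy of the per-CPU template for each possible CPU, copies the linked template (__per_cpu_load) into it, and records the delta between the copy and the linked address as that CPU's offset, which this_cpu_off and the %fs/%gs base then mirror. The user-space sketch below performs the same bookkeeping with malloc standing in for the bootmem allocator and a small array standing in for the linked section; the copies are deliberately never freed, like the kernel's, and the pointer arithmetic is for illustration only.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define NR_CPUS 4

    /* Stand-in for the linked per-CPU template (__per_cpu_start..__per_cpu_end). */
    static long percpu_template[2] = { 42, 7 };

    static long per_cpu_offset[NR_CPUS];

    /* Resolve a template address to CPU 'cpu's private copy. */
    #define per_cpu(var, cpu) \
        (*(long *)((char *)&(var) + per_cpu_offset[cpu]))

    int main(void)
    {
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            char *copy = malloc(sizeof(percpu_template));

            memcpy(copy, percpu_template, sizeof(percpu_template));
            per_cpu_offset[cpu] = copy - (char *)percpu_template;
        }

        per_cpu(percpu_template[0], 2) = 1000;   /* touch CPU 2's copy only */
        printf("cpu0: %ld, cpu2: %ld\n",
               per_cpu(percpu_template[0], 0),
               per_cpu(percpu_template[0], 2));
        return 0;
    }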
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bb1a3b1fc87f..612d3c74f6a3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -53,7 +53,6 @@
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h> 55#include <asm/idle.h>
56#include <asm/smp.h>
57#include <asm/trampoline.h> 56#include <asm/trampoline.h>
58#include <asm/cpu.h> 57#include <asm/cpu.h>
59#include <asm/numa.h> 58#include <asm/numa.h>
@@ -63,6 +62,7 @@
63#include <asm/vmi.h> 62#include <asm/vmi.h>
64#include <asm/genapic.h> 63#include <asm/genapic.h>
65#include <asm/setup.h> 64#include <asm/setup.h>
65#include <asm/uv/uv.h>
66#include <linux/mc146818rtc.h> 66#include <linux/mc146818rtc.h>
67 67
68#include <mach_apic.h> 68#include <mach_apic.h>
@@ -745,52 +745,6 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
745 complete(&c_idle->done); 745 complete(&c_idle->done);
746} 746}
747 747
748#ifdef CONFIG_X86_64
749
750/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
751static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
752{
753 if (!after_bootmem)
754 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
755}
756
757/*
758 * Allocate node local memory for the AP pda.
759 *
760 * Must be called after the _cpu_pda pointer table is initialized.
761 */
762int __cpuinit get_local_pda(int cpu)
763{
764 struct x8664_pda *oldpda, *newpda;
765 unsigned long size = sizeof(struct x8664_pda);
766 int node = cpu_to_node(cpu);
767
768 if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
769 return 0;
770
771 oldpda = cpu_pda(cpu);
772 newpda = kmalloc_node(size, GFP_ATOMIC, node);
773 if (!newpda) {
774 printk(KERN_ERR "Could not allocate node local PDA "
775 "for CPU %d on node %d\n", cpu, node);
776
777 if (oldpda)
778 return 0; /* have a usable pda */
779 else
780 return -1;
781 }
782
783 if (oldpda) {
784 memcpy(newpda, oldpda, size);
785 free_bootmem_pda(oldpda);
786 }
787
788 newpda->in_bootmem = 0;
789 cpu_pda(cpu) = newpda;
790 return 0;
791}
792#endif /* CONFIG_X86_64 */
793
794static int __cpuinit do_boot_cpu(int apicid, int cpu) 748static int __cpuinit do_boot_cpu(int apicid, int cpu)
795/* 749/*
796 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 750 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -808,16 +762,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
808 }; 762 };
809 INIT_WORK(&c_idle.work, do_fork_idle); 763 INIT_WORK(&c_idle.work, do_fork_idle);
810 764
811#ifdef CONFIG_X86_64
812 /* Allocate node local memory for AP pdas */
813 if (cpu > 0) {
814 boot_error = get_local_pda(cpu);
815 if (boot_error)
816 goto restore_state;
817 /* if can't get pda memory, can't start cpu */
818 }
819#endif
820
821 alternatives_smp_switch(1); 765 alternatives_smp_switch(1);
822 766
823 c_idle.idle = get_idle_for_cpu(cpu); 767 c_idle.idle = get_idle_for_cpu(cpu);
@@ -847,14 +791,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
847 791
848 set_idle_for_cpu(cpu, c_idle.idle); 792 set_idle_for_cpu(cpu, c_idle.idle);
849do_rest: 793do_rest:
850#ifdef CONFIG_X86_32
851 per_cpu(current_task, cpu) = c_idle.idle; 794 per_cpu(current_task, cpu) = c_idle.idle;
852 init_gdt(cpu); 795#ifdef CONFIG_X86_32
853 /* Stack for startup_32 can be just as for start_secondary onwards */ 796 /* Stack for startup_32 can be just as for start_secondary onwards */
854 irq_ctx_init(cpu); 797 irq_ctx_init(cpu);
855#else 798#else
856 cpu_pda(cpu)->pcurrent = c_idle.idle;
857 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 799 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
800 initial_gs = per_cpu_offset(cpu);
801 per_cpu(kernel_stack, cpu) =
802 (unsigned long)task_stack_page(c_idle.idle) -
803 KERNEL_STACK_OFFSET + THREAD_SIZE;
858#endif 804#endif
859 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 805 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
860 initial_code = (unsigned long)start_secondary; 806 initial_code = (unsigned long)start_secondary;
@@ -931,9 +877,7 @@ do_rest:
931 inquire_remote_apic(apicid); 877 inquire_remote_apic(apicid);
932 } 878 }
933 } 879 }
934#ifdef CONFIG_X86_64 880
935restore_state:
936#endif
937 if (boot_error) { 881 if (boot_error) {
938 /* Try to put things back the way they were before ... */ 882 /* Try to put things back the way they were before ... */
939 numa_remove_cpu(cpu); /* was set by numa_add_cpu */ 883 numa_remove_cpu(cpu); /* was set by numa_add_cpu */
@@ -1125,6 +1069,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1125 printk(KERN_ERR "... forcing use of dummy APIC emulation." 1069 printk(KERN_ERR "... forcing use of dummy APIC emulation."
1126 "(tell your hw vendor)\n"); 1070 "(tell your hw vendor)\n");
1127 smpboot_clear_io_apic(); 1071 smpboot_clear_io_apic();
1072 disable_ioapic_setup();
1128 return -1; 1073 return -1;
1129 } 1074 }
1130 1075
@@ -1240,10 +1185,7 @@ out:
1240void __init native_smp_prepare_boot_cpu(void) 1185void __init native_smp_prepare_boot_cpu(void)
1241{ 1186{
1242 int me = smp_processor_id(); 1187 int me = smp_processor_id();
1243#ifdef CONFIG_X86_32 1188 switch_to_new_gdt(me);
1244 init_gdt(me);
1245#endif
1246 switch_to_new_gdt();
1247 /* already set me in cpu_online_mask in boot_cpu_init() */ 1189 /* already set me in cpu_online_mask in boot_cpu_init() */
1248 cpumask_set_cpu(me, cpu_callout_mask); 1190 cpumask_set_cpu(me, cpu_callout_mask);
1249 per_cpu(cpu_state, me) = CPU_ONLINE; 1191 per_cpu(cpu_state, me) = CPU_ONLINE;
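In the 64-bit do_boot_cpu() path above, the PDA fields are gone: initial_gs is seeded with per_cpu_offset(cpu) and the per-cpu kernel_stack is derived from the idle task's stack page. The small userspace sketch below reproduces only that address arithmetic; the THREAD_SIZE and KERNEL_STACK_OFFSET values are assumptions made for illustration, not values read from a kernel.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define THREAD_SIZE         (16 * 1024) /* assumed stack size        */
#define KERNEL_STACK_OFFSET (5 * 8)     /* assumed gap below the top */

int main(void)
{
        /* stand-in for task_stack_page(c_idle.idle) */
        void *stack_page = aligned_alloc(THREAD_SIZE, THREAD_SIZE);

        if (!stack_page)
                return 1;

        /* same arithmetic as the hunk: top of the stack minus the offset */
        uint64_t kernel_stack = (uint64_t)(uintptr_t)stack_page
                                - KERNEL_STACK_OFFSET + THREAD_SIZE;

        printf("stack page    %p\n", stack_page);
        printf("kernel_stack  0x%llx\n", (unsigned long long)kernel_stack);
        free(stack_page);
        return 0;
}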
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
deleted file mode 100644
index 397e309839dd..000000000000
--- a/arch/x86/kernel/smpcommon.c
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * SMP stuff which is common to all sub-architectures.
3 */
4#include <linux/module.h>
5#include <asm/smp.h>
6
7#ifdef CONFIG_X86_32
8DEFINE_PER_CPU(unsigned long, this_cpu_off);
9EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10
11/*
12 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 * (still using the master per-cpu area), or a CPU doing it for a
14 * secondary which will soon come up.
15 */
16__cpuinit void init_gdt(int cpu)
17{
18 struct desc_struct gdt;
19
20 pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
21 0x2 | DESCTYPE_S, 0x8);
22 gdt.s = 1;
23
24 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
26
27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
28 per_cpu(cpu_number, cpu) = cpu;
29}
30#endif
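The deleted init_gdt() above packed a data-segment descriptor whose base is the CPU's per-cpu offset, so segment-relative accesses land in that CPU's area. A standalone sketch of the usual 8-byte descriptor packing it relied on is below; the field layout is the standard x86 one, while the example base, access and flag values are illustrative only.

#include <stdio.h>
#include <stdint.h>

/* pack base/limit/access/flags into the usual 8-byte GDT descriptor layout */
static uint64_t pack_descriptor(uint32_t base, uint32_t limit,
                                uint8_t access, uint8_t flags)
{
        uint64_t d = 0;

        d |= limit & 0xffff;                            /* limit 15:0   */
        d |= (uint64_t)(base & 0xffffff) << 16;         /* base  23:0   */
        d |= (uint64_t)access << 40;                    /* type/S/DPL/P */
        d |= (uint64_t)((limit >> 16) & 0xf) << 48;     /* limit 19:16  */
        d |= (uint64_t)(flags & 0xf) << 52;             /* AVL/L/D/G    */
        d |= (uint64_t)((base >> 24) & 0xff) << 56;     /* base 31:24   */
        return d;
}

int main(void)
{
        /* example value standing in for __per_cpu_offset[cpu] */
        uint32_t percpu_base = 0x12345678;

        /* 0x92: present, DPL 0, writable data; 0x8: 4K granularity */
        uint64_t entry = pack_descriptor(percpu_base, 0xfffff, 0x92, 0x8);

        printf("per-cpu segment descriptor: 0x%016llx\n",
               (unsigned long long)entry);
        return 0;
}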
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
deleted file mode 100644
index ce5054642247..000000000000
--- a/arch/x86/kernel/tlb_32.c
+++ /dev/null
@@ -1,256 +0,0 @@
1#include <linux/spinlock.h>
2#include <linux/cpu.h>
3#include <linux/interrupt.h>
4
5#include <asm/tlbflush.h>
6
7DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
8 ____cacheline_aligned = { &init_mm, 0, };
9
10/* must come after the send_IPI functions above for inlining */
11#include <mach_ipi.h>
12
13/*
14 * Smarter SMP flushing macros.
15 * c/o Linus Torvalds.
16 *
17 * These mean you can really definitely utterly forget about
18 * writing to user space from interrupts. (Its not allowed anyway).
19 *
20 * Optimizations Manfred Spraul <manfred@colorfullife.com>
21 */
22
23static cpumask_t flush_cpumask;
24static struct mm_struct *flush_mm;
25static unsigned long flush_va;
26static DEFINE_SPINLOCK(tlbstate_lock);
27
28/*
29 * We cannot call mmdrop() because we are in interrupt context,
30 * instead update mm->cpu_vm_mask.
31 *
32 * We need to reload %cr3 since the page tables may be going
33 * away from under us..
34 */
35void leave_mm(int cpu)
36{
37 BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
38 cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
39 load_cr3(swapper_pg_dir);
40}
41EXPORT_SYMBOL_GPL(leave_mm);
42
43/*
44 *
45 * The flush IPI assumes that a thread switch happens in this order:
46 * [cpu0: the cpu that switches]
47 * 1) switch_mm() either 1a) or 1b)
48 * 1a) thread switch to a different mm
49 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
50 * Stop ipi delivery for the old mm. This is not synchronized with
51 * the other cpus, but smp_invalidate_interrupt ignore flush ipis
52 * for the wrong mm, and in the worst case we perform a superfluous
53 * tlb flush.
54 * 1a2) set cpu_tlbstate to TLBSTATE_OK
55 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
56 * was in lazy tlb mode.
57 * 1a3) update cpu_tlbstate[].active_mm
58 * Now cpu0 accepts tlb flushes for the new mm.
59 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
60 * Now the other cpus will send tlb flush ipis.
61 * 1a4) change cr3.
62 * 1b) thread switch without mm change
63 * cpu_tlbstate[].active_mm is correct, cpu0 already handles
64 * flush ipis.
65 * 1b1) set cpu_tlbstate to TLBSTATE_OK
66 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
67 * Atomically set the bit [other cpus will start sending flush ipis],
68 * and test the bit.
69 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
70 * 2) switch %%esp, ie current
71 *
72 * The interrupt must handle 2 special cases:
73 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
74 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
75 * runs in kernel space, the cpu could load tlb entries for user space
76 * pages.
77 *
78 * The good news is that cpu_tlbstate is local to each cpu, no
79 * write/read ordering problems.
80 */
81
82/*
83 * TLB flush IPI:
84 *
85 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
86 * 2) Leave the mm if we are in the lazy tlb mode.
87 */
88
89void smp_invalidate_interrupt(struct pt_regs *regs)
90{
91 unsigned long cpu;
92
93 cpu = get_cpu();
94
95 if (!cpu_isset(cpu, flush_cpumask))
96 goto out;
97 /*
98 * This was a BUG() but until someone can quote me the
99 * line from the intel manual that guarantees an IPI to
100 * multiple CPUs is retried _only_ on the erroring CPUs
101 * its staying as a return
102 *
103 * BUG();
104 */
105
106 if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
107 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
108 if (flush_va == TLB_FLUSH_ALL)
109 local_flush_tlb();
110 else
111 __flush_tlb_one(flush_va);
112 } else
113 leave_mm(cpu);
114 }
115 ack_APIC_irq();
116 smp_mb__before_clear_bit();
117 cpu_clear(cpu, flush_cpumask);
118 smp_mb__after_clear_bit();
119out:
120 put_cpu_no_resched();
121 inc_irq_stat(irq_tlb_count);
122}
123
124void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
125 unsigned long va)
126{
127 cpumask_t cpumask = *cpumaskp;
128
129 /*
130 * A couple of (to be removed) sanity checks:
131 *
132 * - current CPU must not be in mask
133 * - mask must exist :)
134 */
135 BUG_ON(cpus_empty(cpumask));
136 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
137 BUG_ON(!mm);
138
139#ifdef CONFIG_HOTPLUG_CPU
140 /* If a CPU which we ran on has gone down, OK. */
141 cpus_and(cpumask, cpumask, cpu_online_map);
142 if (unlikely(cpus_empty(cpumask)))
143 return;
144#endif
145
146 /*
147 * i'm not happy about this global shared spinlock in the
148 * MM hot path, but we'll see how contended it is.
149 * AK: x86-64 has a faster method that could be ported.
150 */
151 spin_lock(&tlbstate_lock);
152
153 flush_mm = mm;
154 flush_va = va;
155 cpus_or(flush_cpumask, cpumask, flush_cpumask);
156
157 /*
158 * Make the above memory operations globally visible before
159 * sending the IPI.
160 */
161 smp_mb();
162 /*
163 * We have to send the IPI only to
164 * CPUs affected.
165 */
166 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
167
168 while (!cpus_empty(flush_cpumask))
169 /* nothing. lockup detection does not belong here */
170 cpu_relax();
171
172 flush_mm = NULL;
173 flush_va = 0;
174 spin_unlock(&tlbstate_lock);
175}
176
177void flush_tlb_current_task(void)
178{
179 struct mm_struct *mm = current->mm;
180 cpumask_t cpu_mask;
181
182 preempt_disable();
183 cpu_mask = mm->cpu_vm_mask;
184 cpu_clear(smp_processor_id(), cpu_mask);
185
186 local_flush_tlb();
187 if (!cpus_empty(cpu_mask))
188 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
189 preempt_enable();
190}
191
192void flush_tlb_mm(struct mm_struct *mm)
193{
194 cpumask_t cpu_mask;
195
196 preempt_disable();
197 cpu_mask = mm->cpu_vm_mask;
198 cpu_clear(smp_processor_id(), cpu_mask);
199
200 if (current->active_mm == mm) {
201 if (current->mm)
202 local_flush_tlb();
203 else
204 leave_mm(smp_processor_id());
205 }
206 if (!cpus_empty(cpu_mask))
207 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
208
209 preempt_enable();
210}
211
212void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
213{
214 struct mm_struct *mm = vma->vm_mm;
215 cpumask_t cpu_mask;
216
217 preempt_disable();
218 cpu_mask = mm->cpu_vm_mask;
219 cpu_clear(smp_processor_id(), cpu_mask);
220
221 if (current->active_mm == mm) {
222 if (current->mm)
223 __flush_tlb_one(va);
224 else
225 leave_mm(smp_processor_id());
226 }
227
228 if (!cpus_empty(cpu_mask))
229 flush_tlb_others(cpu_mask, mm, va);
230
231 preempt_enable();
232}
233EXPORT_SYMBOL(flush_tlb_page);
234
235static void do_flush_tlb_all(void *info)
236{
237 unsigned long cpu = smp_processor_id();
238
239 __flush_tlb_all();
240 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
241 leave_mm(cpu);
242}
243
244void flush_tlb_all(void)
245{
246 on_each_cpu(do_flush_tlb_all, NULL, 1);
247}
248
249void reset_lazy_tlbstate(void)
250{
251 int cpu = raw_smp_processor_id();
252
253 per_cpu(cpu_tlbstate, cpu).state = 0;
254 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
255}
256
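The deleted tlb_32.c implements the hand-rolled shootdown: the sender publishes flush_mm/flush_va, ORs the target CPUs into flush_cpumask, sends the IPI and spins until every target has cleared its bit. Below is a deliberately simplified pthread model of just that handshake; the single unsigned mask and the names are illustrative, and the real cpumask type, the tlbstate spinlock and lazy-mm handling are omitted.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NCPUS 4

/* one bit per "cpu" that still owes an acknowledgement */
static _Atomic unsigned flush_cpumask;
static unsigned long flush_va;

static void *remote_cpu(void *arg)
{
        unsigned cpu = (unsigned)(uintptr_t)arg;

        /* wait for our bit to be set, i.e. for the "IPI" to arrive */
        while (!(atomic_load(&flush_cpumask) & (1u << cpu)))
                ;
        /* ... a local flush of flush_va would happen here ... */
        atomic_fetch_and(&flush_cpumask, ~(1u << cpu)); /* acknowledge */
        return NULL;
}

int main(void)
{
        pthread_t tid[NCPUS];
        unsigned mask = 0;
        unsigned cpu;

        for (cpu = 1; cpu < NCPUS; cpu++) {
                pthread_create(&tid[cpu], NULL, remote_cpu,
                               (void *)(uintptr_t)cpu);
                mask |= 1u << cpu;
        }

        flush_va = 0x1000;                      /* publish what to flush...  */
        atomic_fetch_or(&flush_cpumask, mask);  /* ...then "send" the IPIs   */

        while (atomic_load(&flush_cpumask))     /* busy-wait for all acks,   */
                ;                               /* as the original does      */

        printf("all remote cpus acknowledged the flush of 0x%lx\n", flush_va);

        for (cpu = 1; cpu < NCPUS; cpu++)
                pthread_join(tid[cpu], NULL);
        return 0;
}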
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 6812b829ed83..f4b2f27d19b9 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -11,6 +11,7 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12 12
13#include <asm/mmu_context.h> 13#include <asm/mmu_context.h>
14#include <asm/uv/uv.h>
14#include <asm/uv/uv_mmrs.h> 15#include <asm/uv/uv_mmrs.h>
15#include <asm/uv/uv_hub.h> 16#include <asm/uv/uv_hub.h>
16#include <asm/uv/uv_bau.h> 17#include <asm/uv/uv_bau.h>
@@ -210,14 +211,15 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
210 * 211 *
211 * Send a broadcast and wait for a broadcast message to complete. 212 * Send a broadcast and wait for a broadcast message to complete.
212 * 213 *
213 * The cpumaskp mask contains the cpus the broadcast was sent to. 214 * The flush_mask contains the cpus the broadcast was sent to.
214 * 215 *
215 * Returns 1 if all remote flushing was done. The mask is zeroed. 216 * Returns NULL if all remote flushing was done. The mask is zeroed.
216 * Returns 0 if some remote flushing remains to be done. The mask is left 217 * Returns @flush_mask if some remote flushing remains to be done. The
217 * unchanged. 218 * mask will have some bits still set.
218 */ 219 */
219int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, 220const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
220 cpumask_t *cpumaskp) 221 struct bau_desc *bau_desc,
222 struct cpumask *flush_mask)
221{ 223{
222 int completion_status = 0; 224 int completion_status = 0;
223 int right_shift; 225 int right_shift;
@@ -257,66 +259,76 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
257 * the cpu's, all of which are still in the mask. 259 * the cpu's, all of which are still in the mask.
258 */ 260 */
259 __get_cpu_var(ptcstats).ptc_i++; 261 __get_cpu_var(ptcstats).ptc_i++;
260 return 0; 262 return flush_mask;
261 } 263 }
262 264
263 /* 265 /*
264 * Success, so clear the remote cpu's from the mask so we don't 266 * Success, so clear the remote cpu's from the mask so we don't
265 * use the IPI method of shootdown on them. 267 * use the IPI method of shootdown on them.
266 */ 268 */
267 for_each_cpu_mask(bit, *cpumaskp) { 269 for_each_cpu(bit, flush_mask) {
268 blade = uv_cpu_to_blade_id(bit); 270 blade = uv_cpu_to_blade_id(bit);
269 if (blade == this_blade) 271 if (blade == this_blade)
270 continue; 272 continue;
271 cpu_clear(bit, *cpumaskp); 273 cpumask_clear_cpu(bit, flush_mask);
272 } 274 }
273 if (!cpus_empty(*cpumaskp)) 275 if (!cpumask_empty(flush_mask))
274 return 0; 276 return flush_mask;
275 return 1; 277 return NULL;
276} 278}
277 279
278/** 280/**
279 * uv_flush_tlb_others - globally purge translation cache of a virtual 281 * uv_flush_tlb_others - globally purge translation cache of a virtual
280 * address or all TLB's 282 * address or all TLB's
281 * @cpumaskp: mask of all cpu's in which the address is to be removed 283 * @cpumask: mask of all cpu's in which the address is to be removed
282 * @mm: mm_struct containing virtual address range 284 * @mm: mm_struct containing virtual address range
283 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) 285 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
286 * @cpu: the current cpu
284 * 287 *
285 * This is the entry point for initiating any UV global TLB shootdown. 288 * This is the entry point for initiating any UV global TLB shootdown.
286 * 289 *
287 * Purges the translation caches of all specified processors of the given 290 * Purges the translation caches of all specified processors of the given
288 * virtual address, or purges all TLB's on specified processors. 291 * virtual address, or purges all TLB's on specified processors.
289 * 292 *
290 * The caller has derived the cpumaskp from the mm_struct and has subtracted 293 * The caller has derived the cpumask from the mm_struct. This function
291 * the local cpu from the mask. This function is called only if there 294 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
292 * are bits set in the mask. (e.g. flush_tlb_page())
293 * 295 *
294 * The cpumaskp is converted into a nodemask of the nodes containing 296 * The cpumask is converted into a nodemask of the nodes containing
295 * the cpus. 297 * the cpus.
296 * 298 *
297 * Returns 1 if all remote flushing was done. 299 * Note that this function should be called with preemption disabled.
298 * Returns 0 if some remote flushing remains to be done. 300 *
301 * Returns NULL if all remote flushing was done.
302 * Returns pointer to cpumask if some remote flushing remains to be
303 * done. The returned pointer is valid till preemption is re-enabled.
299 */ 304 */
300int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, 305const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
301 unsigned long va) 306 struct mm_struct *mm,
307 unsigned long va, unsigned int cpu)
302{ 308{
309 static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
310 struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
303 int i; 311 int i;
304 int bit; 312 int bit;
305 int blade; 313 int blade;
306 int cpu; 314 int uv_cpu;
307 int this_blade; 315 int this_blade;
308 int locals = 0; 316 int locals = 0;
309 struct bau_desc *bau_desc; 317 struct bau_desc *bau_desc;
310 318
311 cpu = uv_blade_processor_id(); 319 WARN_ON(!in_atomic());
320
321 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
322
323 uv_cpu = uv_blade_processor_id();
312 this_blade = uv_numa_blade_id(); 324 this_blade = uv_numa_blade_id();
313 bau_desc = __get_cpu_var(bau_control).descriptor_base; 325 bau_desc = __get_cpu_var(bau_control).descriptor_base;
314 bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu; 326 bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
315 327
316 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 328 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
317 329
318 i = 0; 330 i = 0;
319 for_each_cpu_mask(bit, *cpumaskp) { 331 for_each_cpu(bit, flush_mask) {
320 blade = uv_cpu_to_blade_id(bit); 332 blade = uv_cpu_to_blade_id(bit);
321 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1)); 333 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
322 if (blade == this_blade) { 334 if (blade == this_blade) {
@@ -331,17 +343,17 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
331 * no off_node flushing; return status for local node 343 * no off_node flushing; return status for local node
332 */ 344 */
333 if (locals) 345 if (locals)
334 return 0; 346 return flush_mask;
335 else 347 else
336 return 1; 348 return NULL;
337 } 349 }
338 __get_cpu_var(ptcstats).requestor++; 350 __get_cpu_var(ptcstats).requestor++;
339 __get_cpu_var(ptcstats).ntargeted += i; 351 __get_cpu_var(ptcstats).ntargeted += i;
340 352
341 bau_desc->payload.address = va; 353 bau_desc->payload.address = va;
342 bau_desc->payload.sending_cpu = smp_processor_id(); 354 bau_desc->payload.sending_cpu = cpu;
343 355
344 return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp); 356 return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
345} 357}
346 358
347/* 359/*
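The tlb_uv.c change above switches uv_flush_tlb_others() from a 0/1 return to "NULL means everything was flushed, otherwise here is the mask of CPUs that still need the generic IPI path". A toy model of that convention, with a plain bitmask standing in for the per-cpu cpumask (all names here are invented for illustration):

#include <stdio.h>

typedef unsigned cpumask;       /* toy cpumask: one bit per cpu */

/*
 * Return 0 (the stand-in for NULL) when nothing is left to do, otherwise
 * return the bits that still need the fallback IPI path.
 */
static cpumask model_uv_flush(cpumask requested, unsigned this_cpu,
                              cpumask handled_by_broadcast)
{
        cpumask flush = requested & ~(1u << this_cpu);  /* cpumask_andnot() */

        flush &= ~handled_by_broadcast; /* clear cpus the broadcast covered */
        return flush;
}

int main(void)
{
        cpumask rest = model_uv_flush(0xff, 2, 0x3c);

        if (!rest)
                printf("all remote flushing done\n");
        else
                printf("fall back to IPIs for mask 0x%x\n", rest);
        return 0;
}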
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 98c2d055284b..ed5aee5f3fcc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -59,7 +59,6 @@
59#ifdef CONFIG_X86_64 59#ifdef CONFIG_X86_64
60#include <asm/pgalloc.h> 60#include <asm/pgalloc.h>
61#include <asm/proto.h> 61#include <asm/proto.h>
62#include <asm/pda.h>
63#else 62#else
64#include <asm/processor-flags.h> 63#include <asm/processor-flags.h>
65#include <asm/arch_hooks.h> 64#include <asm/arch_hooks.h>
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 1d3302cc2ddf..eb9e7347928e 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -670,10 +670,11 @@ static inline int __init activate_vmi(void)
670 para_fill(pv_mmu_ops.write_cr2, SetCR2); 670 para_fill(pv_mmu_ops.write_cr2, SetCR2);
671 para_fill(pv_mmu_ops.write_cr3, SetCR3); 671 para_fill(pv_mmu_ops.write_cr3, SetCR3);
672 para_fill(pv_cpu_ops.write_cr4, SetCR4); 672 para_fill(pv_cpu_ops.write_cr4, SetCR4);
673 para_fill(pv_irq_ops.save_fl, GetInterruptMask); 673
674 para_fill(pv_irq_ops.restore_fl, SetInterruptMask); 674 para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
675 para_fill(pv_irq_ops.irq_disable, DisableInterrupts); 675 para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
676 para_fill(pv_irq_ops.irq_enable, EnableInterrupts); 676 para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
677 para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
677 678
678 para_fill(pv_cpu_ops.wbinvd, WBINVD); 679 para_fill(pv_cpu_ops.wbinvd, WBINVD);
679 para_fill(pv_cpu_ops.read_tsc, RDTSC); 680 para_fill(pv_cpu_ops.read_tsc, RDTSC);
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 82c67559dde7..3eba7f7bac05 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -178,14 +178,7 @@ SECTIONS
178 __initramfs_end = .; 178 __initramfs_end = .;
179 } 179 }
180#endif 180#endif
181 . = ALIGN(PAGE_SIZE); 181 PERCPU(PAGE_SIZE)
182 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
183 __per_cpu_start = .;
184 *(.data.percpu.page_aligned)
185 *(.data.percpu)
186 *(.data.percpu.shared_aligned)
187 __per_cpu_end = .;
188 }
189 . = ALIGN(PAGE_SIZE); 182 . = ALIGN(PAGE_SIZE);
190 /* freed after init ends here */ 183 /* freed after init ends here */
191 184
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 1a614c0e6bef..087a7f2c639b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -5,6 +5,7 @@
5#define LOAD_OFFSET __START_KERNEL_map 5#define LOAD_OFFSET __START_KERNEL_map
6 6
7#include <asm-generic/vmlinux.lds.h> 7#include <asm-generic/vmlinux.lds.h>
8#include <asm/asm-offsets.h>
8#include <asm/page.h> 9#include <asm/page.h>
9 10
10#undef i386 /* in case the preprocessor is a 32bit one */ 11#undef i386 /* in case the preprocessor is a 32bit one */
@@ -13,12 +14,15 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
13OUTPUT_ARCH(i386:x86-64) 14OUTPUT_ARCH(i386:x86-64)
14ENTRY(phys_startup_64) 15ENTRY(phys_startup_64)
15jiffies_64 = jiffies; 16jiffies_64 = jiffies;
16_proxy_pda = 1;
17PHDRS { 17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */ 18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */ 19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */ 20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */ 21 data.init PT_LOAD FLAGS(7); /* RWE */
22#ifdef CONFIG_SMP
23 percpu PT_LOAD FLAGS(7); /* RWE */
24#endif
25 data.init2 PT_LOAD FLAGS(7); /* RWE */
22 note PT_NOTE FLAGS(0); /* ___ */ 26 note PT_NOTE FLAGS(0); /* ___ */
23} 27}
24SECTIONS 28SECTIONS
@@ -208,14 +212,28 @@ SECTIONS
208 __initramfs_end = .; 212 __initramfs_end = .;
209#endif 213#endif
210 214
215#ifdef CONFIG_SMP
216 /*
217 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
218 * output PHDR, so the next output section - __data_nosave - should
219 * start another section data.init2. Also, pda should be at the head of
220 * percpu area. Preallocate it and define the percpu offset symbol
221 * so that it can be accessed as a percpu variable.
222 */
223 . = ALIGN(PAGE_SIZE);
224 PERCPU_VADDR(0, :percpu)
225#else
211 PERCPU(PAGE_SIZE) 226 PERCPU(PAGE_SIZE)
227#endif
212 228
213 . = ALIGN(PAGE_SIZE); 229 . = ALIGN(PAGE_SIZE);
214 __init_end = .; 230 __init_end = .;
215 231
216 . = ALIGN(PAGE_SIZE); 232 . = ALIGN(PAGE_SIZE);
217 __nosave_begin = .; 233 __nosave_begin = .;
218 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } 234 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
235 *(.data.nosave)
236 } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
219 . = ALIGN(PAGE_SIZE); 237 . = ALIGN(PAGE_SIZE);
220 __nosave_end = .; 238 __nosave_end = .;
221 239
@@ -239,8 +257,21 @@ SECTIONS
239 DWARF_DEBUG 257 DWARF_DEBUG
240} 258}
241 259
260 /*
261 * Per-cpu symbols which need to be offset from __per_cpu_load
262 * for the boot processor.
263 */
264#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
265INIT_PER_CPU(gdt_page);
266INIT_PER_CPU(irq_stack_union);
267
242/* 268/*
243 * Build-time check on the image size: 269 * Build-time check on the image size:
244 */ 270 */
245ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), 271ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
246 "kernel image bigger than KERNEL_IMAGE_SIZE") 272 "kernel image bigger than KERNEL_IMAGE_SIZE")
273
274#ifdef CONFIG_SMP
275ASSERT((per_cpu__irq_stack_union == 0),
276 "irq_stack_union is not at start of per-cpu area");
277#endif
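With PERCPU_VADDR(0, :percpu) the per-cpu symbols above become zero-based offsets, so the boot processor's copies live at __per_cpu_load plus the symbol value; that is what the INIT_PER_CPU() aliases express. A trivial sketch of that address calculation, with made-up numbers (neither value is taken from a real image):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* made-up addresses, for illustration only */
        uint64_t per_cpu_load = 0xffffffff80a00000ull;  /* __per_cpu_load    */
        uint64_t gdt_page_off = 0x4000;                 /* zero-based symbol */

        /* INIT_PER_CPU(gdt_page): where the boot CPU's copy actually lives */
        uint64_t init_gdt_page = per_cpu_load + gdt_page_off;

        printf("init_per_cpu__gdt_page = 0x%llx\n",
               (unsigned long long)init_gdt_page);
        return 0;
}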
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a688f3bfaec2..c609205df594 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void)
37 flags &= ~X86_EFLAGS_IF; 37 flags &= ~X86_EFLAGS_IF;
38 return flags; 38 return flags;
39} 39}
40PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
40 41
41static void vsmp_restore_fl(unsigned long flags) 42static void vsmp_restore_fl(unsigned long flags)
42{ 43{
@@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags)
46 flags |= X86_EFLAGS_AC; 47 flags |= X86_EFLAGS_AC;
47 native_restore_fl(flags); 48 native_restore_fl(flags);
48} 49}
50PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
49 51
50static void vsmp_irq_disable(void) 52static void vsmp_irq_disable(void)
51{ 53{
@@ -53,6 +55,7 @@ static void vsmp_irq_disable(void)
53 55
54 native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); 56 native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
55} 57}
58PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
56 59
57static void vsmp_irq_enable(void) 60static void vsmp_irq_enable(void)
58{ 61{
@@ -60,6 +63,7 @@ static void vsmp_irq_enable(void)
60 63
61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 64 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
62} 65}
66PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
63 67
64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, 68static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
65 unsigned long addr, unsigned len) 69 unsigned long addr, unsigned len)
@@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void)
90 cap, ctl); 94 cap, ctl);
91 if (cap & ctl & (1 << 4)) { 95 if (cap & ctl & (1 << 4)) {
92 /* Setup irq ops and turn on vSMP IRQ fastpath handling */ 96 /* Setup irq ops and turn on vSMP IRQ fastpath handling */
93 pv_irq_ops.irq_disable = vsmp_irq_disable; 97 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
94 pv_irq_ops.irq_enable = vsmp_irq_enable; 98 pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
95 pv_irq_ops.save_fl = vsmp_save_fl; 99 pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
96 pv_irq_ops.restore_fl = vsmp_restore_fl; 100 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
97 pv_init_ops.patch = vsmp_patch; 101 pv_init_ops.patch = vsmp_patch;
98 102
99 ctl &= ~(1 << 4); 103 ctl &= ~(1 << 4);
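Here, and in the lguest hunk further down, the bare irq-flag functions are wrapped with PV_CALLEE_SAVE_REGS_THUNK() and installed via PV_CALLEE_SAVE(), meaning the pv_irq_ops slots now hold a small structure rather than a plain function pointer. A stripped-down model of that indirection follows; the register-saving asm thunk itself is omitted, and the struct and macro names below are illustrative, not the kernel's.

#include <stdio.h>

/* model of the new slot type: a struct carrying the callable */
struct pv_callee_save {
        unsigned long (*func)(void);
};

/* stands in for PV_CALLEE_SAVE(f); the real macro points at an asm thunk */
#define MODEL_PV_CALLEE_SAVE(f) ((struct pv_callee_save){ .func = (f) })

static unsigned long model_save_fl(void)
{
        return 0x200;   /* pretend the IF flag is set */
}

static struct {
        struct pv_callee_save save_fl;
} model_pv_irq_ops;

int main(void)
{
        model_pv_irq_ops.save_fl = MODEL_PV_CALLEE_SAVE(model_save_fl);

        printf("flags = 0x%lx\n", model_pv_irq_ops.save_fl.func());
        return 0;
}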
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 695e426aa354..3909e3ba5ce3 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);
58EXPORT_SYMBOL(empty_zero_page); 58EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt); 59EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 60EXPORT_SYMBOL(load_gs_index);
61
62EXPORT_SYMBOL(_proxy_pda);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 92f1c6f3e19d..19e33b6cd593 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -173,24 +173,29 @@ static unsigned long save_fl(void)
173{ 173{
174 return lguest_data.irq_enabled; 174 return lguest_data.irq_enabled;
175} 175}
176PV_CALLEE_SAVE_REGS_THUNK(save_fl);
176 177
177/* restore_flags() just sets the flags back to the value given. */ 178/* restore_flags() just sets the flags back to the value given. */
178static void restore_fl(unsigned long flags) 179static void restore_fl(unsigned long flags)
179{ 180{
180 lguest_data.irq_enabled = flags; 181 lguest_data.irq_enabled = flags;
181} 182}
183PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
182 184
183/* Interrupts go off... */ 185/* Interrupts go off... */
184static void irq_disable(void) 186static void irq_disable(void)
185{ 187{
186 lguest_data.irq_enabled = 0; 188 lguest_data.irq_enabled = 0;
187} 189}
190PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
188 191
189/* Interrupts go on... */ 192/* Interrupts go on... */
190static void irq_enable(void) 193static void irq_enable(void)
191{ 194{
192 lguest_data.irq_enabled = X86_EFLAGS_IF; 195 lguest_data.irq_enabled = X86_EFLAGS_IF;
193} 196}
197PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
198
194/*:*/ 199/*:*/
195/*M:003 Note that we don't check for outstanding interrupts when we re-enable 200/*M:003 Note that we don't check for outstanding interrupts when we re-enable
196 * them (or when we unmask an interrupt). This seems to work for the moment, 201 * them (or when we unmask an interrupt). This seems to work for the moment,
@@ -984,10 +989,10 @@ __init void lguest_init(void)
984 989
985 /* interrupt-related operations */ 990 /* interrupt-related operations */
986 pv_irq_ops.init_IRQ = lguest_init_IRQ; 991 pv_irq_ops.init_IRQ = lguest_init_IRQ;
987 pv_irq_ops.save_fl = save_fl; 992 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
988 pv_irq_ops.restore_fl = restore_fl; 993 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
989 pv_irq_ops.irq_disable = irq_disable; 994 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
990 pv_irq_ops.irq_enable = irq_enable; 995 pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
991 pv_irq_ops.safe_halt = lguest_safe_halt; 996 pv_irq_ops.safe_halt = lguest_safe_halt;
992 997
993 /* init-time operations */ 998 /* init-time operations */
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index d914a7996a66..66b7eb57d8e4 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -9,6 +9,7 @@
9#include <asm/e820.h> 9#include <asm/e820.h>
10#include <asm/io.h> 10#include <asm/io.h>
11#include <asm/setup.h> 11#include <asm/setup.h>
12#include <asm/cpu.h>
12 13
13void __init pre_intr_init_hook(void) 14void __init pre_intr_init_hook(void)
14{ 15{
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 7ffcdeec4631..6f5a38c7f900 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -400,7 +400,7 @@ void __init find_smp_config(void)
400 VOYAGER_SUS_IN_CONTROL_PORT); 400 VOYAGER_SUS_IN_CONTROL_PORT);
401 401
402 current_thread_info()->cpu = boot_cpu_id; 402 current_thread_info()->cpu = boot_cpu_id;
403 x86_write_percpu(cpu_number, boot_cpu_id); 403 percpu_write(cpu_number, boot_cpu_id);
404} 404}
405 405
406/* 406/*
@@ -528,7 +528,6 @@ static void __init do_boot_cpu(__u8 cpu)
528 /* init_tasks (in sched.c) is indexed logically */ 528 /* init_tasks (in sched.c) is indexed logically */
529 stack_start.sp = (void *)idle->thread.sp; 529 stack_start.sp = (void *)idle->thread.sp;
530 530
531 init_gdt(cpu);
532 per_cpu(current_task, cpu) = idle; 531 per_cpu(current_task, cpu) = idle;
533 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 532 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
534 irq_ctx_init(cpu); 533 irq_ctx_init(cpu);
@@ -1745,14 +1744,13 @@ static void __init voyager_smp_prepare_cpus(unsigned int max_cpus)
1745 1744
1746static void __cpuinit voyager_smp_prepare_boot_cpu(void) 1745static void __cpuinit voyager_smp_prepare_boot_cpu(void)
1747{ 1746{
1748 init_gdt(smp_processor_id()); 1747 int cpu = smp_processor_id();
1749 switch_to_new_gdt(); 1748 switch_to_new_gdt(cpu);
1750 1749
1751 cpu_online_map = cpumask_of_cpu(smp_processor_id()); 1750 cpu_online_map = cpumask_of_cpu(smp_processor_id());
1752 cpu_callout_map = cpumask_of_cpu(smp_processor_id()); 1751 cpu_callout_map = cpumask_of_cpu(smp_processor_id());
1753 cpu_callin_map = CPU_MASK_NONE; 1752 cpu_callin_map = CPU_MASK_NONE;
1754 cpu_present_map = cpumask_of_cpu(smp_processor_id()); 1753 cpu_present_map = cpumask_of_cpu(smp_processor_id());
1755
1756} 1754}
1757 1755
1758static int __cpuinit voyager_cpu_up(unsigned int cpu) 1756static int __cpuinit voyager_cpu_up(unsigned int cpu)
@@ -1779,7 +1777,6 @@ static void __init voyager_smp_cpus_done(unsigned int max_cpus)
1779void __init smp_setup_processor_id(void) 1777void __init smp_setup_processor_id(void)
1780{ 1778{
1781 current_thread_info()->cpu = hard_smp_processor_id(); 1779 current_thread_info()->cpu = hard_smp_processor_id();
1782 x86_write_percpu(cpu_number, hard_smp_processor_id());
1783} 1780}
1784 1781
1785static void voyager_send_call_func(const struct cpumask *callmask) 1782static void voyager_send_call_func(const struct cpumask *callmask)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index d8cc96a2738f..9f05157220f5 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,8 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o gup.o 2 pat.o pgtable.o gup.o
3 3
4obj-$(CONFIG_X86_SMP) += tlb.o
5
4obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o 6obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
5 7
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c76ef1d701c9..8c3f3113a6ec 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -26,6 +26,7 @@
26#include <linux/kprobes.h> 26#include <linux/kprobes.h>
27#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/kdebug.h> 28#include <linux/kdebug.h>
29#include <linux/magic.h>
29 30
30#include <asm/system.h> 31#include <asm/system.h>
31#include <asm/desc.h> 32#include <asm/desc.h>
@@ -91,8 +92,8 @@ static inline int notify_page_fault(struct pt_regs *regs)
91 * 92 *
92 * Opcode checker based on code by Richard Brunner 93 * Opcode checker based on code by Richard Brunner
93 */ 94 */
94static int is_prefetch(struct pt_regs *regs, unsigned long addr, 95static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
95 unsigned long error_code) 96 unsigned long addr)
96{ 97{
97 unsigned char *instr; 98 unsigned char *instr;
98 int scan_more = 1; 99 int scan_more = 1;
@@ -409,15 +410,15 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
409} 410}
410 411
411#ifdef CONFIG_X86_64 412#ifdef CONFIG_X86_64
412static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, 413static noinline void pgtable_bad(struct pt_regs *regs,
413 unsigned long error_code) 414 unsigned long error_code, unsigned long address)
414{ 415{
415 unsigned long flags = oops_begin(); 416 unsigned long flags = oops_begin();
416 int sig = SIGKILL; 417 int sig = SIGKILL;
417 struct task_struct *tsk; 418 struct task_struct *tsk = current;
418 419
419 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 420 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
420 current->comm, address); 421 tsk->comm, address);
421 dump_pagetable(address); 422 dump_pagetable(address);
422 tsk = current; 423 tsk = current;
423 tsk->thread.cr2 = address; 424 tsk->thread.cr2 = address;
@@ -429,6 +430,196 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
429} 430}
430#endif 431#endif
431 432
433static noinline void no_context(struct pt_regs *regs,
434 unsigned long error_code, unsigned long address)
435{
436 struct task_struct *tsk = current;
437 unsigned long *stackend;
438
439#ifdef CONFIG_X86_64
440 unsigned long flags;
441 int sig;
442#endif
443
444 /* Are we prepared to handle this kernel fault? */
445 if (fixup_exception(regs))
446 return;
447
448 /*
449 * X86_32
450 * Valid to do another page fault here, because if this fault
451 * had been triggered by is_prefetch fixup_exception would have
452 * handled it.
453 *
454 * X86_64
455 * Hall of shame of CPU/BIOS bugs.
456 */
457 if (is_prefetch(regs, error_code, address))
458 return;
459
460 if (is_errata93(regs, address))
461 return;
462
463 /*
464 * Oops. The kernel tried to access some bad page. We'll have to
465 * terminate things with extreme prejudice.
466 */
467#ifdef CONFIG_X86_32
468 bust_spinlocks(1);
469#else
470 flags = oops_begin();
471#endif
472
473 show_fault_oops(regs, error_code, address);
474
475 stackend = end_of_stack(tsk);
476 if (*stackend != STACK_END_MAGIC)
477 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
478
479 tsk->thread.cr2 = address;
480 tsk->thread.trap_no = 14;
481 tsk->thread.error_code = error_code;
482
483#ifdef CONFIG_X86_32
484 die("Oops", regs, error_code);
485 bust_spinlocks(0);
486 do_exit(SIGKILL);
487#else
488 sig = SIGKILL;
489 if (__die("Oops", regs, error_code))
490 sig = 0;
491 /* Executive summary in case the body of the oops scrolled away */
492 printk(KERN_EMERG "CR2: %016lx\n", address);
493 oops_end(flags, regs, sig);
494#endif
495}
496
497static void __bad_area_nosemaphore(struct pt_regs *regs,
498 unsigned long error_code, unsigned long address,
499 int si_code)
500{
501 struct task_struct *tsk = current;
502
503 /* User mode accesses just cause a SIGSEGV */
504 if (error_code & PF_USER) {
505 /*
506 * It's possible to have interrupts off here.
507 */
508 local_irq_enable();
509
510 /*
511 * Valid to do another page fault here because this one came
512 * from user space.
513 */
514 if (is_prefetch(regs, error_code, address))
515 return;
516
517 if (is_errata100(regs, address))
518 return;
519
520 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
521 printk_ratelimit()) {
522 printk(
523 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
524 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
525 tsk->comm, task_pid_nr(tsk), address,
526 (void *) regs->ip, (void *) regs->sp, error_code);
527 print_vma_addr(" in ", regs->ip);
528 printk("\n");
529 }
530
531 tsk->thread.cr2 = address;
532 /* Kernel addresses are always protection faults */
533 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
534 tsk->thread.trap_no = 14;
535 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
536 return;
537 }
538
539 if (is_f00f_bug(regs, address))
540 return;
541
542 no_context(regs, error_code, address);
543}
544
545static noinline void bad_area_nosemaphore(struct pt_regs *regs,
546 unsigned long error_code, unsigned long address)
547{
548 __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
549}
550
551static void __bad_area(struct pt_regs *regs,
552 unsigned long error_code, unsigned long address,
553 int si_code)
554{
555 struct mm_struct *mm = current->mm;
556
557 /*
558 * Something tried to access memory that isn't in our memory map..
559 * Fix it, but check if it's kernel or user first..
560 */
561 up_read(&mm->mmap_sem);
562
563 __bad_area_nosemaphore(regs, error_code, address, si_code);
564}
565
566static noinline void bad_area(struct pt_regs *regs,
567 unsigned long error_code, unsigned long address)
568{
569 __bad_area(regs, error_code, address, SEGV_MAPERR);
570}
571
572static noinline void bad_area_access_error(struct pt_regs *regs,
573 unsigned long error_code, unsigned long address)
574{
575 __bad_area(regs, error_code, address, SEGV_ACCERR);
576}
577
578/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
579static void out_of_memory(struct pt_regs *regs,
580 unsigned long error_code, unsigned long address)
581{
582 /*
583 * We ran out of memory, call the OOM killer, and return the userspace
584 * (which will retry the fault, or kill us if we got oom-killed).
585 */
586 up_read(&current->mm->mmap_sem);
587 pagefault_out_of_memory();
588}
589
590static void do_sigbus(struct pt_regs *regs,
591 unsigned long error_code, unsigned long address)
592{
593 struct task_struct *tsk = current;
594 struct mm_struct *mm = tsk->mm;
595
596 up_read(&mm->mmap_sem);
597
598 /* Kernel mode? Handle exceptions or die */
599 if (!(error_code & PF_USER))
600 no_context(regs, error_code, address);
601#ifdef CONFIG_X86_32
602 /* User space => ok to do another page fault */
603 if (is_prefetch(regs, error_code, address))
604 return;
605#endif
606 tsk->thread.cr2 = address;
607 tsk->thread.error_code = error_code;
608 tsk->thread.trap_no = 14;
609 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
610}
611
612static noinline void mm_fault_error(struct pt_regs *regs,
613 unsigned long error_code, unsigned long address, unsigned int fault)
614{
615 if (fault & VM_FAULT_OOM)
616 out_of_memory(regs, error_code, address);
617 else if (fault & VM_FAULT_SIGBUS)
618 do_sigbus(regs, error_code, address);
619 else
620 BUG();
621}
622
432static int spurious_fault_check(unsigned long error_code, pte_t *pte) 623static int spurious_fault_check(unsigned long error_code, pte_t *pte)
433{ 624{
434 if ((error_code & PF_WRITE) && !pte_write(*pte)) 625 if ((error_code & PF_WRITE) && !pte_write(*pte))
@@ -448,8 +639,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
448 * There are no security implications to leaving a stale TLB when 639 * There are no security implications to leaving a stale TLB when
449 * increasing the permissions on a page. 640 * increasing the permissions on a page.
450 */ 641 */
451static int spurious_fault(unsigned long address, 642static noinline int spurious_fault(unsigned long error_code,
452 unsigned long error_code) 643 unsigned long address)
453{ 644{
454 pgd_t *pgd; 645 pgd_t *pgd;
455 pud_t *pud; 646 pud_t *pud;
@@ -494,7 +685,7 @@ static int spurious_fault(unsigned long address,
494 * 685 *
495 * This assumes no large pages in there. 686 * This assumes no large pages in there.
496 */ 687 */
497static int vmalloc_fault(unsigned long address) 688static noinline int vmalloc_fault(unsigned long address)
498{ 689{
499#ifdef CONFIG_X86_32 690#ifdef CONFIG_X86_32
500 unsigned long pgd_paddr; 691 unsigned long pgd_paddr;
@@ -573,6 +764,25 @@ static int vmalloc_fault(unsigned long address)
573 764
574int show_unhandled_signals = 1; 765int show_unhandled_signals = 1;
575 766
767static inline int access_error(unsigned long error_code, int write,
768 struct vm_area_struct *vma)
769{
770 if (write) {
771 /* write, present and write, not present */
772 if (unlikely(!(vma->vm_flags & VM_WRITE)))
773 return 1;
774 } else if (unlikely(error_code & PF_PROT)) {
775 /* read, present */
776 return 1;
777 } else {
778 /* read, not present */
779 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
780 return 1;
781 }
782
783 return 0;
784}
785
576/* 786/*
577 * This routine handles page faults. It determines the address, 787 * This routine handles page faults. It determines the address,
578 * and the problem, and then passes it off to one of the appropriate 788 * and the problem, and then passes it off to one of the appropriate
@@ -583,16 +793,12 @@ asmlinkage
583#endif 793#endif
584void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) 794void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
585{ 795{
796 unsigned long address;
586 struct task_struct *tsk; 797 struct task_struct *tsk;
587 struct mm_struct *mm; 798 struct mm_struct *mm;
588 struct vm_area_struct *vma; 799 struct vm_area_struct *vma;
589 unsigned long address; 800 int write;
590 int write, si_code;
591 int fault; 801 int fault;
592#ifdef CONFIG_X86_64
593 unsigned long flags;
594 int sig;
595#endif
596 802
597 tsk = current; 803 tsk = current;
598 mm = tsk->mm; 804 mm = tsk->mm;
@@ -601,8 +807,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
601 /* get the address */ 807 /* get the address */
602 address = read_cr2(); 808 address = read_cr2();
603 809
604 si_code = SEGV_MAPERR;
605
606 if (unlikely(kmmio_fault(regs, address))) 810 if (unlikely(kmmio_fault(regs, address)))
607 return; 811 return;
608 812
@@ -629,7 +833,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
629 return; 833 return;
630 834
631 /* Can handle a stale RO->RW TLB */ 835 /* Can handle a stale RO->RW TLB */
632 if (spurious_fault(address, error_code)) 836 if (spurious_fault(error_code, address))
633 return; 837 return;
634 838
635 /* kprobes don't want to hook the spurious faults. */ 839 /* kprobes don't want to hook the spurious faults. */
@@ -639,13 +843,12 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
639 * Don't take the mm semaphore here. If we fixup a prefetch 843 * Don't take the mm semaphore here. If we fixup a prefetch
640 * fault we could otherwise deadlock. 844 * fault we could otherwise deadlock.
641 */ 845 */
642 goto bad_area_nosemaphore; 846 bad_area_nosemaphore(regs, error_code, address);
847 return;
643 } 848 }
644 849
645 /* kprobes don't want to hook the spurious faults. */ 850 if (unlikely(notify_page_fault(regs)))
646 if (notify_page_fault(regs))
647 return; 851 return;
648
649 /* 852 /*
650 * It's safe to allow irq's after cr2 has been saved and the 853 * It's safe to allow irq's after cr2 has been saved and the
651 * vmalloc fault has been handled. 854 * vmalloc fault has been handled.
@@ -661,15 +864,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
661 864
662#ifdef CONFIG_X86_64 865#ifdef CONFIG_X86_64
663 if (unlikely(error_code & PF_RSVD)) 866 if (unlikely(error_code & PF_RSVD))
664 pgtable_bad(address, regs, error_code); 867 pgtable_bad(regs, error_code, address);
665#endif 868#endif
666 869
667 /* 870 /*
668 * If we're in an interrupt, have no user context or are running in an 871 * If we're in an interrupt, have no user context or are running in an
669 * atomic region then we must not take the fault. 872 * atomic region then we must not take the fault.
670 */ 873 */
671 if (unlikely(in_atomic() || !mm)) 874 if (unlikely(in_atomic() || !mm)) {
672 goto bad_area_nosemaphore; 875 bad_area_nosemaphore(regs, error_code, address);
876 return;
877 }
673 878
674 /* 879 /*
675 * When running in the kernel we expect faults to occur only to 880 * When running in the kernel we expect faults to occur only to
@@ -687,20 +892,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
687 * source. If this is invalid we can skip the address space check, 892 * source. If this is invalid we can skip the address space check,
688 * thus avoiding the deadlock. 893 * thus avoiding the deadlock.
689 */ 894 */
690 if (!down_read_trylock(&mm->mmap_sem)) { 895 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
691 if ((error_code & PF_USER) == 0 && 896 if ((error_code & PF_USER) == 0 &&
692 !search_exception_tables(regs->ip)) 897 !search_exception_tables(regs->ip)) {
693 goto bad_area_nosemaphore; 898 bad_area_nosemaphore(regs, error_code, address);
899 return;
900 }
694 down_read(&mm->mmap_sem); 901 down_read(&mm->mmap_sem);
695 } 902 }
696 903
697 vma = find_vma(mm, address); 904 vma = find_vma(mm, address);
698 if (!vma) 905 if (unlikely(!vma)) {
699 goto bad_area; 906 bad_area(regs, error_code, address);
700 if (vma->vm_start <= address) 907 return;
908 }
909 if (likely(vma->vm_start <= address))
701 goto good_area; 910 goto good_area;
702 if (!(vma->vm_flags & VM_GROWSDOWN)) 911 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
703 goto bad_area; 912 bad_area(regs, error_code, address);
913 return;
914 }
704 if (error_code & PF_USER) { 915 if (error_code & PF_USER) {
705 /* 916 /*
706 * Accessing the stack below %sp is always a bug. 917 * Accessing the stack below %sp is always a bug.
@@ -708,31 +919,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
708 * and pusha to work. ("enter $65535,$31" pushes 919 * and pusha to work. ("enter $65535,$31" pushes
709 * 32 pointers and then decrements %sp by 65535.) 920 * 32 pointers and then decrements %sp by 65535.)
710 */ 921 */
711 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) 922 if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
712 goto bad_area; 923 bad_area(regs, error_code, address);
924 return;
925 }
713 } 926 }
714 if (expand_stack(vma, address)) 927 if (unlikely(expand_stack(vma, address))) {
715 goto bad_area; 928 bad_area(regs, error_code, address);
716/* 929 return;
717 * Ok, we have a good vm_area for this memory access, so 930 }
718 * we can handle it.. 931
719 */ 932 /*
933 * Ok, we have a good vm_area for this memory access, so
934 * we can handle it..
935 */
720good_area: 936good_area:
721 si_code = SEGV_ACCERR; 937 write = error_code & PF_WRITE;
722 write = 0; 938 if (unlikely(access_error(error_code, write, vma))) {
723 switch (error_code & (PF_PROT|PF_WRITE)) { 939 bad_area_access_error(regs, error_code, address);
724 default: /* 3: write, present */ 940 return;
725 /* fall through */
726 case PF_WRITE: /* write, not present */
727 if (!(vma->vm_flags & VM_WRITE))
728 goto bad_area;
729 write++;
730 break;
731 case PF_PROT: /* read, present */
732 goto bad_area;
733 case 0: /* read, not present */
734 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
735 goto bad_area;
736 } 941 }
737 942
738 /* 943 /*
@@ -742,11 +947,8 @@ good_area:
742 */ 947 */
743 fault = handle_mm_fault(mm, vma, address, write); 948 fault = handle_mm_fault(mm, vma, address, write);
744 if (unlikely(fault & VM_FAULT_ERROR)) { 949 if (unlikely(fault & VM_FAULT_ERROR)) {
745 if (fault & VM_FAULT_OOM) 950 mm_fault_error(regs, error_code, address, fault);
746 goto out_of_memory; 951 return;
747 else if (fault & VM_FAULT_SIGBUS)
748 goto do_sigbus;
749 BUG();
750 } 952 }
751 if (fault & VM_FAULT_MAJOR) 953 if (fault & VM_FAULT_MAJOR)
752 tsk->maj_flt++; 954 tsk->maj_flt++;
@@ -764,128 +966,6 @@ good_area:
764 } 966 }
765#endif 967#endif
766 up_read(&mm->mmap_sem); 968 up_read(&mm->mmap_sem);
767 return;
768
769/*
770 * Something tried to access memory that isn't in our memory map..
771 * Fix it, but check if it's kernel or user first..
772 */
773bad_area:
774 up_read(&mm->mmap_sem);
775
776bad_area_nosemaphore:
777 /* User mode accesses just cause a SIGSEGV */
778 if (error_code & PF_USER) {
779 /*
780 * It's possible to have interrupts off here.
781 */
782 local_irq_enable();
783
784 /*
785 * Valid to do another page fault here because this one came
786 * from user space.
787 */
788 if (is_prefetch(regs, address, error_code))
789 return;
790
791 if (is_errata100(regs, address))
792 return;
793
794 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
795 printk_ratelimit()) {
796 printk(
797 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
798 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
799 tsk->comm, task_pid_nr(tsk), address,
800 (void *) regs->ip, (void *) regs->sp, error_code);
801 print_vma_addr(" in ", regs->ip);
802 printk("\n");
803 }
804
805 tsk->thread.cr2 = address;
806 /* Kernel addresses are always protection faults */
807 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
808 tsk->thread.trap_no = 14;
809 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
810 return;
811 }
812
813 if (is_f00f_bug(regs, address))
814 return;
815
816no_context:
817 /* Are we prepared to handle this kernel fault? */
818 if (fixup_exception(regs))
819 return;
820
821 /*
822 * X86_32
823 * Valid to do another page fault here, because if this fault
824 * had been triggered by is_prefetch fixup_exception would have
825 * handled it.
826 *
827 * X86_64
828 * Hall of shame of CPU/BIOS bugs.
829 */
830 if (is_prefetch(regs, address, error_code))
831 return;
832
833 if (is_errata93(regs, address))
834 return;
835
836/*
837 * Oops. The kernel tried to access some bad page. We'll have to
838 * terminate things with extreme prejudice.
839 */
840#ifdef CONFIG_X86_32
841 bust_spinlocks(1);
842#else
843 flags = oops_begin();
844#endif
845
846 show_fault_oops(regs, error_code, address);
847
848 tsk->thread.cr2 = address;
849 tsk->thread.trap_no = 14;
850 tsk->thread.error_code = error_code;
851
852#ifdef CONFIG_X86_32
853 die("Oops", regs, error_code);
854 bust_spinlocks(0);
855 do_exit(SIGKILL);
856#else
857 sig = SIGKILL;
858 if (__die("Oops", regs, error_code))
859 sig = 0;
860 /* Executive summary in case the body of the oops scrolled away */
861 printk(KERN_EMERG "CR2: %016lx\n", address);
862 oops_end(flags, regs, sig);
863#endif
864
865out_of_memory:
866 /*
867 * We ran out of memory, call the OOM killer, and return the userspace
868 * (which will retry the fault, or kill us if we got oom-killed).
869 */
870 up_read(&mm->mmap_sem);
871 pagefault_out_of_memory();
872 return;
873
874do_sigbus:
875 up_read(&mm->mmap_sem);
876
877 /* Kernel mode? Handle exceptions or die */
878 if (!(error_code & PF_USER))
879 goto no_context;
880#ifdef CONFIG_X86_32
881 /* User space => ok to do another page fault */
882 if (is_prefetch(regs, address, error_code))
883 return;
884#endif
885 tsk->thread.cr2 = address;
886 tsk->thread.error_code = error_code;
887 tsk->thread.trap_no = 14;
888 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
889} 969}
890 970
891DEFINE_SPINLOCK(pgd_lock); 971DEFINE_SPINLOCK(pgd_lock);
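The refactored fault path above folds the old switch over PF_PROT|PF_WRITE into access_error(). The standalone sketch below reproduces that decision table; the PF_* and VM_* values are restated here only for the example.

#include <stdio.h>
#include <stdbool.h>

#define PF_PROT  0x1    /* fault on a present page */
#define PF_WRITE 0x2    /* fault was a write       */

#define VM_READ  0x1
#define VM_WRITE 0x2
#define VM_EXEC  0x4

/* same decision table as access_error() in the hunk above */
static bool model_access_error(unsigned long error_code, unsigned vm_flags)
{
        if (error_code & PF_WRITE)
                return !(vm_flags & VM_WRITE);  /* write to a read-only vma */
        if (error_code & PF_PROT)
                return true;                    /* read, page present       */
        return !(vm_flags & (VM_READ | VM_EXEC | VM_WRITE)); /* read, absent */
}

int main(void)
{
        printf("write to read-only vma : %d\n",
               model_access_error(PF_WRITE, VM_READ));
        printf("read from rw vma       : %d\n",
               model_access_error(0, VM_READ | VM_WRITE));
        return 0;
}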
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2cef05074413..00263bf07a88 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,7 +49,6 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/smp.h>
53 52
54unsigned int __VMALLOC_RESERVE = 128 << 20; 53unsigned int __VMALLOC_RESERVE = 128 << 20;
55 54
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 71a14f89f89e..deb1c1ab7868 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,6 +20,12 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifdef CONFIG_DEBUG_PER_CPU_MAPS
24# define DBG(x...) printk(KERN_DEBUG x)
25#else
26# define DBG(x...)
27#endif
28
23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 29struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
24EXPORT_SYMBOL(node_data); 30EXPORT_SYMBOL(node_data);
25 31
@@ -33,6 +39,21 @@ int numa_off __initdata;
33static unsigned long __initdata nodemap_addr; 39static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size; 40static unsigned long __initdata nodemap_size;
35 41
42DEFINE_PER_CPU(int, node_number) = 0;
43EXPORT_PER_CPU_SYMBOL(node_number);
44
45/*
46 * Map cpu index to node index
47 */
48DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
49EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
50
51/*
52 * Which logical CPUs are on which nodes
53 */
54cpumask_t *node_to_cpumask_map;
55EXPORT_SYMBOL(node_to_cpumask_map);
56
36/* 57/*
37 * Given a shift value, try to populate memnodemap[] 58 * Given a shift value, try to populate memnodemap[]
38 * Returns : 59 * Returns :
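The hunk above adds two parallel views of the NUMA topology: the early/per-cpu x86_cpu_to_node_map and a node_to_cpumask_map indexed by node. A toy sketch of keeping the two consistent when a CPU joins a node, using plain arrays and a bitmask in place of the kernel types (the helper name is invented):

#include <stdio.h>

#define NR_CPUS   8
#define MAX_NODES 4

static int cpu_to_node_map[NR_CPUS];            /* cpu  -> node */
static unsigned node_to_cpumask_map[MAX_NODES]; /* node -> cpus */

static void model_numa_add_cpu(int cpu, int node)
{
        cpu_to_node_map[cpu] = node;            /* like numa_set_node() */
        node_to_cpumask_map[node] |= 1u << cpu; /* like numa_add_cpu()  */
}

int main(void)
{
        model_numa_add_cpu(0, 0);
        model_numa_add_cpu(1, 0);
        model_numa_add_cpu(2, 1);

        printf("node 0 cpumask: 0x%x\n", node_to_cpumask_map[0]);
        printf("cpu 2 is on node %d\n", cpu_to_node_map[2]);
        return 0;
}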
@@ -640,3 +661,199 @@ void __init init_cpu_to_node(void)
640#endif 661#endif
641 662
642 663
664/*
665 * Allocate node_to_cpumask_map based on number of available nodes
666 * Requires node_possible_map to be valid.
667 *
668 * Note: node_to_cpumask() is not valid until after this is done.
669 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
670 */
671void __init setup_node_to_cpumask_map(void)
672{
673 unsigned int node, num = 0;
674 cpumask_t *map;
675
676 /* setup nr_node_ids if not done yet */
677 if (nr_node_ids == MAX_NUMNODES) {
678 for_each_node_mask(node, node_possible_map)
679 num = node;
680 nr_node_ids = num + 1;
681 }
682
683 /* allocate the map */
684 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
685 DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
686
687 pr_debug("Node to cpumask map at %p for %d nodes\n",
688 map, nr_node_ids);
689
690 /* node_to_cpumask() will now work */
691 node_to_cpumask_map = map;
692}
693
694void __cpuinit numa_set_node(int cpu, int node)
695{
696 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
697
698 /* early setting, no percpu area yet */
699 if (cpu_to_node_map) {
700 cpu_to_node_map[cpu] = node;
701 return;
702 }
703
704#ifdef CONFIG_DEBUG_PER_CPU_MAPS
705 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
706 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
707 dump_stack();
708 return;
709 }
710#endif
711 per_cpu(x86_cpu_to_node_map, cpu) = node;
712
713 if (node != NUMA_NO_NODE)
714 per_cpu(node_number, cpu) = node;
715}
716
717void __cpuinit numa_clear_node(int cpu)
718{
719 numa_set_node(cpu, NUMA_NO_NODE);
720}
721
722#ifndef CONFIG_DEBUG_PER_CPU_MAPS
723
724void __cpuinit numa_add_cpu(int cpu)
725{
726 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
727}
728
729void __cpuinit numa_remove_cpu(int cpu)
730{
731 cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
732}
733
734#else /* CONFIG_DEBUG_PER_CPU_MAPS */
735
736/*
737 * --------- debug versions of the numa functions ---------
738 */
739static void __cpuinit numa_set_cpumask(int cpu, int enable)
740{
741 int node = early_cpu_to_node(cpu);
742 cpumask_t *mask;
743 char buf[64];
744
745 if (node_to_cpumask_map == NULL) {
746 printk(KERN_ERR "node_to_cpumask_map NULL\n");
747 dump_stack();
748 return;
749 }
750
751 mask = &node_to_cpumask_map[node];
752 if (enable)
753 cpu_set(cpu, *mask);
754 else
755 cpu_clear(cpu, *mask);
756
757 cpulist_scnprintf(buf, sizeof(buf), mask);
758 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
759 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
760}
761
762void __cpuinit numa_add_cpu(int cpu)
763{
764 numa_set_cpumask(cpu, 1);
765}
766
767void __cpuinit numa_remove_cpu(int cpu)
768{
769 numa_set_cpumask(cpu, 0);
770}
771
772int cpu_to_node(int cpu)
773{
774 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
775 printk(KERN_WARNING
776 "cpu_to_node(%d): usage too early!\n", cpu);
777 dump_stack();
778 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
779 }
780 return per_cpu(x86_cpu_to_node_map, cpu);
781}
782EXPORT_SYMBOL(cpu_to_node);
783
784/*
785 * Same function as cpu_to_node() but used if called before the
786 * per_cpu areas are set up.
787 */
788int early_cpu_to_node(int cpu)
789{
790 if (early_per_cpu_ptr(x86_cpu_to_node_map))
791 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
792
793 if (!cpu_possible(cpu)) {
794 printk(KERN_WARNING
795 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
796 dump_stack();
797 return NUMA_NO_NODE;
798 }
799 return per_cpu(x86_cpu_to_node_map, cpu);
800}
801
802
803/* empty cpumask */
804static const cpumask_t cpu_mask_none;
805
806/*
807 * Returns a pointer to the bitmask of CPUs on Node 'node'.
808 */
809const cpumask_t *cpumask_of_node(int node)
810{
811 if (node_to_cpumask_map == NULL) {
812 printk(KERN_WARNING
813 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
814 node);
815 dump_stack();
816 return (const cpumask_t *)&cpu_online_map;
817 }
818 if (node >= nr_node_ids) {
819 printk(KERN_WARNING
820 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
821 node, nr_node_ids);
822 dump_stack();
823 return &cpu_mask_none;
824 }
825 return &node_to_cpumask_map[node];
826}
827EXPORT_SYMBOL(cpumask_of_node);
828
829/*
830 * Returns a bitmask of CPUs on Node 'node'.
831 *
832 * Side note: this function creates the returned cpumask on the stack
833 * so with a high NR_CPUS count, excessive stack space is used. The
834 * node_to_cpumask_ptr function should be used whenever possible.
835 */
836cpumask_t node_to_cpumask(int node)
837{
838 if (node_to_cpumask_map == NULL) {
839 printk(KERN_WARNING
840 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
841 dump_stack();
842 return cpu_online_map;
843 }
844 if (node >= nr_node_ids) {
845 printk(KERN_WARNING
846 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
847 node, nr_node_ids);
848 dump_stack();
849 return cpu_mask_none;
850 }
851 return node_to_cpumask_map[node];
852}
853EXPORT_SYMBOL(node_to_cpumask);
854
855/*
856 * --------- end of debug versions of the numa functions ---------
857 */
858
859#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
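
The numa_64.c hunk above builds a two-phase cpu-to-node mapping: numa_set_node() writes into the early_per_cpu array until the per-cpu areas exist and into the real per-cpu variable afterwards, while node_to_cpumask_map only becomes usable once setup_node_to_cpumask_map() has run. A short usage sketch assembled from the functions added above (illustrative; the cpu/node numbers are made up):

	int cpu = 3, node;

	numa_set_node(cpu, 1);		/* record the mapping; works early or late in boot */
	numa_add_cpu(cpu);		/* set the cpu's bit in node_to_cpumask_map[node]   */

	node = early_cpu_to_node(cpu);	/* safe even before the per-cpu areas are set up    */
	if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
		pr_info("cpu %d is on node %d\n", cpu, node);
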
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 09737c8af074..15df1baee100 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -21,6 +21,7 @@
21#include <asm/numa.h> 21#include <asm/numa.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/genapic.h> 23#include <asm/genapic.h>
24#include <asm/uv/uv.h>
24 25
25int acpi_numa __initdata; 26int acpi_numa __initdata;
26 27
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/mm/tlb.c
index f8be6f1d2e48..72a6d4ebe34d 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/mm/tlb.c
@@ -1,22 +1,18 @@
1#include <linux/init.h> 1#include <linux/init.h>
2 2
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <linux/delay.h>
5#include <linux/spinlock.h> 4#include <linux/spinlock.h>
6#include <linux/smp.h> 5#include <linux/smp.h>
7#include <linux/kernel_stat.h>
8#include <linux/mc146818rtc.h>
9#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/module.h>
10 8
11#include <asm/mtrr.h>
12#include <asm/pgalloc.h>
13#include <asm/tlbflush.h> 9#include <asm/tlbflush.h>
14#include <asm/mmu_context.h> 10#include <asm/mmu_context.h>
15#include <asm/proto.h> 11#include <asm/apic.h>
16#include <asm/apicdef.h> 12#include <asm/uv/uv.h>
17#include <asm/idle.h> 13
18#include <asm/uv/uv_hub.h> 14DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
19#include <asm/uv/uv_bau.h> 15 = { &init_mm, 0, };
20 16
21#include <mach_ipi.h> 17#include <mach_ipi.h>
22/* 18/*
@@ -33,7 +29,7 @@
33 * To avoid global state use 8 different call vectors. 29 * To avoid global state use 8 different call vectors.
34 * Each CPU uses a specific vector to trigger flushes on other 30 * Each CPU uses a specific vector to trigger flushes on other
35 * CPUs. Depending on the received vector the target CPUs look into 31 * CPUs. Depending on the received vector the target CPUs look into
36 * the right per cpu variable for the flush data. 32 * the right array slot for the flush data.
37 * 33 *
38 * With more than 8 CPUs they are hashed to the 8 available 34 * With more than 8 CPUs they are hashed to the 8 available
39 * vectors. The limited global vector space forces us to this right now. 35 * vectors. The limited global vector space forces us to this right now.
@@ -43,18 +39,18 @@
43 39
44union smp_flush_state { 40union smp_flush_state {
45 struct { 41 struct {
46 cpumask_t flush_cpumask;
47 struct mm_struct *flush_mm; 42 struct mm_struct *flush_mm;
48 unsigned long flush_va; 43 unsigned long flush_va;
49 spinlock_t tlbstate_lock; 44 spinlock_t tlbstate_lock;
45 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
50 }; 46 };
51 char pad[SMP_CACHE_BYTES]; 47 char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
52} ____cacheline_aligned; 48} ____cacheline_internodealigned_in_smp;
53 49
54/* State is put into the per CPU data section, but padded 50/* State is put into the per CPU data section, but padded
55 to a full cache line because other CPUs can access it and we don't 51 to a full cache line because other CPUs can access it and we don't
56 want false sharing in the per cpu data segment. */ 52 want false sharing in the per cpu data segment. */
57static DEFINE_PER_CPU(union smp_flush_state, flush_state); 53static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
58 54
59/* 55/*
60 * We cannot call mmdrop() because we are in interrupt context, 56 * We cannot call mmdrop() because we are in interrupt context,
@@ -62,9 +58,9 @@ static DEFINE_PER_CPU(union smp_flush_state, flush_state);
62 */ 58 */
63void leave_mm(int cpu) 59void leave_mm(int cpu)
64{ 60{
65 if (read_pda(mmu_state) == TLBSTATE_OK) 61 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
66 BUG(); 62 BUG();
67 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); 63 cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
68 load_cr3(swapper_pg_dir); 64 load_cr3(swapper_pg_dir);
69} 65}
70EXPORT_SYMBOL_GPL(leave_mm); 66EXPORT_SYMBOL_GPL(leave_mm);
@@ -117,10 +113,20 @@ EXPORT_SYMBOL_GPL(leave_mm);
117 * Interrupts are disabled. 113 * Interrupts are disabled.
118 */ 114 */
119 115
120asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) 116/*
117 * FIXME: use of asmlinkage is not consistent. On x86_64 it's a noop
118 * but is still used for documentation purposes; the usage is slightly
119 * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
120 * entry calls in with the first parameter in %eax. Maybe define
121 * intrlinkage?
122 */
123#ifdef CONFIG_X86_64
124asmlinkage
125#endif
126void smp_invalidate_interrupt(struct pt_regs *regs)
121{ 127{
122 int cpu; 128 unsigned int cpu;
123 int sender; 129 unsigned int sender;
124 union smp_flush_state *f; 130 union smp_flush_state *f;
125 131
126 cpu = smp_processor_id(); 132 cpu = smp_processor_id();
@@ -129,9 +135,9 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
129 * Use that to determine where the sender put the data. 135 * Use that to determine where the sender put the data.
130 */ 136 */
131 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; 137 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
132 f = &per_cpu(flush_state, sender); 138 f = &flush_state[sender];
133 139
134 if (!cpu_isset(cpu, f->flush_cpumask)) 140 if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
135 goto out; 141 goto out;
136 /* 142 /*
137 * This was a BUG() but until someone can quote me the 143 * This was a BUG() but until someone can quote me the
@@ -142,8 +148,8 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
142 * BUG(); 148 * BUG();
143 */ 149 */
144 150
145 if (f->flush_mm == read_pda(active_mm)) { 151 if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
146 if (read_pda(mmu_state) == TLBSTATE_OK) { 152 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
147 if (f->flush_va == TLB_FLUSH_ALL) 153 if (f->flush_va == TLB_FLUSH_ALL)
148 local_flush_tlb(); 154 local_flush_tlb();
149 else 155 else
@@ -153,23 +159,21 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
153 } 159 }
154out: 160out:
155 ack_APIC_irq(); 161 ack_APIC_irq();
156 cpu_clear(cpu, f->flush_cpumask); 162 smp_mb__before_clear_bit();
163 cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
164 smp_mb__after_clear_bit();
157 inc_irq_stat(irq_tlb_count); 165 inc_irq_stat(irq_tlb_count);
158} 166}
159 167
160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, 168static void flush_tlb_others_ipi(const struct cpumask *cpumask,
161 unsigned long va) 169 struct mm_struct *mm, unsigned long va)
162{ 170{
163 int sender; 171 unsigned int sender;
164 union smp_flush_state *f; 172 union smp_flush_state *f;
165 cpumask_t cpumask = *cpumaskp;
166
167 if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
168 return;
169 173
170 /* Caller has disabled preemption */ 174 /* Caller has disabled preemption */
171 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 175 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
172 f = &per_cpu(flush_state, sender); 176 f = &flush_state[sender];
173 177
174 /* 178 /*
175 * Could avoid this lock when 179 * Could avoid this lock when
@@ -180,7 +184,8 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
180 184
181 f->flush_mm = mm; 185 f->flush_mm = mm;
182 f->flush_va = va; 186 f->flush_va = va;
183 cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); 187 cpumask_andnot(to_cpumask(f->flush_cpumask),
188 cpumask, cpumask_of(smp_processor_id()));
184 189
185 /* 190 /*
186 * Make the above memory operations globally visible before 191 * Make the above memory operations globally visible before
@@ -191,9 +196,10 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
191 * We have to send the IPI only to 196 * We have to send the IPI only to
192 * CPUs affected. 197 * CPUs affected.
193 */ 198 */
194 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender); 199 send_IPI_mask(to_cpumask(f->flush_cpumask),
200 INVALIDATE_TLB_VECTOR_START + sender);
195 201
196 while (!cpus_empty(f->flush_cpumask)) 202 while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
197 cpu_relax(); 203 cpu_relax();
198 204
199 f->flush_mm = NULL; 205 f->flush_mm = NULL;
@@ -201,12 +207,28 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
201 spin_unlock(&f->tlbstate_lock); 207 spin_unlock(&f->tlbstate_lock);
202} 208}
203 209
210void native_flush_tlb_others(const struct cpumask *cpumask,
211 struct mm_struct *mm, unsigned long va)
212{
213 if (is_uv_system()) {
214 unsigned int cpu;
215
216 cpu = get_cpu();
217 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
218 if (cpumask)
219 flush_tlb_others_ipi(cpumask, mm, va);
220 put_cpu();
221 return;
222 }
223 flush_tlb_others_ipi(cpumask, mm, va);
224}
225
204static int __cpuinit init_smp_flush(void) 226static int __cpuinit init_smp_flush(void)
205{ 227{
206 int i; 228 int i;
207 229
208 for_each_possible_cpu(i) 230 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
209 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); 231 spin_lock_init(&flush_state[i].tlbstate_lock);
210 232
211 return 0; 233 return 0;
212} 234}
@@ -215,25 +237,18 @@ core_initcall(init_smp_flush);
215void flush_tlb_current_task(void) 237void flush_tlb_current_task(void)
216{ 238{
217 struct mm_struct *mm = current->mm; 239 struct mm_struct *mm = current->mm;
218 cpumask_t cpu_mask;
219 240
220 preempt_disable(); 241 preempt_disable();
221 cpu_mask = mm->cpu_vm_mask;
222 cpu_clear(smp_processor_id(), cpu_mask);
223 242
224 local_flush_tlb(); 243 local_flush_tlb();
225 if (!cpus_empty(cpu_mask)) 244 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
226 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); 245 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
227 preempt_enable(); 246 preempt_enable();
228} 247}
229 248
230void flush_tlb_mm(struct mm_struct *mm) 249void flush_tlb_mm(struct mm_struct *mm)
231{ 250{
232 cpumask_t cpu_mask;
233
234 preempt_disable(); 251 preempt_disable();
235 cpu_mask = mm->cpu_vm_mask;
236 cpu_clear(smp_processor_id(), cpu_mask);
237 252
238 if (current->active_mm == mm) { 253 if (current->active_mm == mm) {
239 if (current->mm) 254 if (current->mm)
@@ -241,8 +256,8 @@ void flush_tlb_mm(struct mm_struct *mm)
241 else 256 else
242 leave_mm(smp_processor_id()); 257 leave_mm(smp_processor_id());
243 } 258 }
244 if (!cpus_empty(cpu_mask)) 259 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
245 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); 260 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
246 261
247 preempt_enable(); 262 preempt_enable();
248} 263}
@@ -250,11 +265,8 @@ void flush_tlb_mm(struct mm_struct *mm)
250void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) 265void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
251{ 266{
252 struct mm_struct *mm = vma->vm_mm; 267 struct mm_struct *mm = vma->vm_mm;
253 cpumask_t cpu_mask;
254 268
255 preempt_disable(); 269 preempt_disable();
256 cpu_mask = mm->cpu_vm_mask;
257 cpu_clear(smp_processor_id(), cpu_mask);
258 270
259 if (current->active_mm == mm) { 271 if (current->active_mm == mm) {
260 if (current->mm) 272 if (current->mm)
@@ -263,8 +275,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
263 leave_mm(smp_processor_id()); 275 leave_mm(smp_processor_id());
264 } 276 }
265 277
266 if (!cpus_empty(cpu_mask)) 278 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
267 flush_tlb_others(cpu_mask, mm, va); 279 flush_tlb_others(&mm->cpu_vm_mask, mm, va);
268 280
269 preempt_enable(); 281 preempt_enable();
270} 282}
@@ -274,7 +286,7 @@ static void do_flush_tlb_all(void *info)
274 unsigned long cpu = smp_processor_id(); 286 unsigned long cpu = smp_processor_id();
275 287
276 __flush_tlb_all(); 288 __flush_tlb_all();
277 if (read_pda(mmu_state) == TLBSTATE_LAZY) 289 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
278 leave_mm(cpu); 290 leave_mm(cpu);
279} 291}
280 292
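
The move of tlb_64.c to mm/tlb.c keeps the shootdown protocol described in the comment above: a sender hashes itself onto one of the NUM_INVALIDATE_TLB_VECTORS (8) shared flush_state slots, and the receivers recover that slot from the vector number left in orig_ax. A condensed view of the handshake as it stands after this patch (paraphrased from the hunks above, no new logic):

	/* sender: flush_tlb_others_ipi() */
	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
	f = &flush_state[sender];
	spin_lock(&f->tlbstate_lock);
	f->flush_mm = mm;
	f->flush_va = va;
	cpumask_andnot(to_cpumask(f->flush_cpumask),
		       cpumask, cpumask_of(smp_processor_id()));
	send_IPI_mask(to_cpumask(f->flush_cpumask),
		      INVALIDATE_TLB_VECTOR_START + sender);
	while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
		cpu_relax();		/* each target clears its own bit when done */

	/* receiver: smp_invalidate_interrupt() */
	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
	f = &flush_state[sender];	/* same slot the sender filled */
	/* ... flush the local TLB, then clear this cpu's bit in f->flush_cpumask ... */
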
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 6dcefba7836f..3b767d03fd6a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg
6endif 6endif
7 7
8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
9 time.o xen-asm_$(BITS).o grant-table.o suspend.o 9 time.o xen-asm.o xen-asm_$(BITS).o \
10 grant-table.o suspend.o
10 11
11obj-$(CONFIG_SMP) += smp.o spinlock.o 12obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file 13obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bea215230b20..37230342c2c4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -61,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
61enum xen_domain_type xen_domain_type = XEN_NATIVE; 61enum xen_domain_type xen_domain_type = XEN_NATIVE;
62EXPORT_SYMBOL_GPL(xen_domain_type); 62EXPORT_SYMBOL_GPL(xen_domain_type);
63 63
64/*
65 * Identity map, in addition to plain kernel map. This needs to be
66 * large enough to allocate page table pages to allocate the rest.
67 * Each page can map 2MB.
68 */
69static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
70
71#ifdef CONFIG_X86_64
72/* l3 pud for userspace vsyscall mapping */
73static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
74#endif /* CONFIG_X86_64 */
75
76/*
77 * Note about cr3 (pagetable base) values:
78 *
79 * xen_cr3 contains the current logical cr3 value; it contains the
80 * last set cr3. This may not be the current effective cr3, because
81 * its update may be being lazily deferred. However, a vcpu looking
82 * at its own cr3 can use this value knowing that everything will
83 * be self-consistent.
84 *
85 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
86 * hypercall to set the vcpu cr3 is complete (so it may be a little
87 * out of date, but it will never be set early). If one vcpu is
88 * looking at another vcpu's cr3 value, it should use this variable.
89 */
90DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
91DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
92
93struct start_info *xen_start_info; 64struct start_info *xen_start_info;
94EXPORT_SYMBOL_GPL(xen_start_info); 65EXPORT_SYMBOL_GPL(xen_start_info);
95 66
96struct shared_info xen_dummy_shared_info; 67struct shared_info xen_dummy_shared_info;
97 68
69void *xen_initial_gdt;
70
98/* 71/*
99 * Point at some empty memory to start with. We map the real shared_info 72 * Point at some empty memory to start with. We map the real shared_info
100 * page as soon as fixmap is up and running. 73 * page as soon as fixmap is up and running.
@@ -114,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
114 * 87 *
115 * 0: not available, 1: available 88 * 0: not available, 1: available
116 */ 89 */
117static int have_vcpu_info_placement = 90static int have_vcpu_info_placement = 1;
118#ifdef CONFIG_X86_32
119 1
120#else
121 0
122#endif
123 ;
124
125 91
126static void xen_vcpu_setup(int cpu) 92static void xen_vcpu_setup(int cpu)
127{ 93{
@@ -237,7 +203,7 @@ static unsigned long xen_get_debugreg(int reg)
237 return HYPERVISOR_get_debugreg(reg); 203 return HYPERVISOR_get_debugreg(reg);
238} 204}
239 205
240static void xen_leave_lazy(void) 206void xen_leave_lazy(void)
241{ 207{
242 paravirt_leave_lazy(paravirt_get_lazy_mode()); 208 paravirt_leave_lazy(paravirt_get_lazy_mode());
243 xen_mc_flush(); 209 xen_mc_flush();
@@ -598,83 +564,6 @@ static struct apic_ops xen_basic_apic_ops = {
598 564
599#endif 565#endif
600 566
601static void xen_flush_tlb(void)
602{
603 struct mmuext_op *op;
604 struct multicall_space mcs;
605
606 preempt_disable();
607
608 mcs = xen_mc_entry(sizeof(*op));
609
610 op = mcs.args;
611 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
612 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
613
614 xen_mc_issue(PARAVIRT_LAZY_MMU);
615
616 preempt_enable();
617}
618
619static void xen_flush_tlb_single(unsigned long addr)
620{
621 struct mmuext_op *op;
622 struct multicall_space mcs;
623
624 preempt_disable();
625
626 mcs = xen_mc_entry(sizeof(*op));
627 op = mcs.args;
628 op->cmd = MMUEXT_INVLPG_LOCAL;
629 op->arg1.linear_addr = addr & PAGE_MASK;
630 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
631
632 xen_mc_issue(PARAVIRT_LAZY_MMU);
633
634 preempt_enable();
635}
636
637static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
638 unsigned long va)
639{
640 struct {
641 struct mmuext_op op;
642 cpumask_t mask;
643 } *args;
644 cpumask_t cpumask = *cpus;
645 struct multicall_space mcs;
646
647 /*
648 * A couple of (to be removed) sanity checks:
649 *
650 * - current CPU must not be in mask
651 * - mask must exist :)
652 */
653 BUG_ON(cpus_empty(cpumask));
654 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
655 BUG_ON(!mm);
656
657 /* If a CPU which we ran on has gone down, OK. */
658 cpus_and(cpumask, cpumask, cpu_online_map);
659 if (cpus_empty(cpumask))
660 return;
661
662 mcs = xen_mc_entry(sizeof(*args));
663 args = mcs.args;
664 args->mask = cpumask;
665 args->op.arg2.vcpumask = &args->mask;
666
667 if (va == TLB_FLUSH_ALL) {
668 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
669 } else {
670 args->op.cmd = MMUEXT_INVLPG_MULTI;
671 args->op.arg1.linear_addr = va;
672 }
673
674 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
675
676 xen_mc_issue(PARAVIRT_LAZY_MMU);
677}
678 567
679static void xen_clts(void) 568static void xen_clts(void)
680{ 569{
@@ -700,21 +589,6 @@ static void xen_write_cr0(unsigned long cr0)
700 xen_mc_issue(PARAVIRT_LAZY_CPU); 589 xen_mc_issue(PARAVIRT_LAZY_CPU);
701} 590}
702 591
703static void xen_write_cr2(unsigned long cr2)
704{
705 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
706}
707
708static unsigned long xen_read_cr2(void)
709{
710 return x86_read_percpu(xen_vcpu)->arch.cr2;
711}
712
713static unsigned long xen_read_cr2_direct(void)
714{
715 return x86_read_percpu(xen_vcpu_info.arch.cr2);
716}
717
718static void xen_write_cr4(unsigned long cr4) 592static void xen_write_cr4(unsigned long cr4)
719{ 593{
720 cr4 &= ~X86_CR4_PGE; 594 cr4 &= ~X86_CR4_PGE;
@@ -723,71 +597,6 @@ static void xen_write_cr4(unsigned long cr4)
723 native_write_cr4(cr4); 597 native_write_cr4(cr4);
724} 598}
725 599
726static unsigned long xen_read_cr3(void)
727{
728 return x86_read_percpu(xen_cr3);
729}
730
731static void set_current_cr3(void *v)
732{
733 x86_write_percpu(xen_current_cr3, (unsigned long)v);
734}
735
736static void __xen_write_cr3(bool kernel, unsigned long cr3)
737{
738 struct mmuext_op *op;
739 struct multicall_space mcs;
740 unsigned long mfn;
741
742 if (cr3)
743 mfn = pfn_to_mfn(PFN_DOWN(cr3));
744 else
745 mfn = 0;
746
747 WARN_ON(mfn == 0 && kernel);
748
749 mcs = __xen_mc_entry(sizeof(*op));
750
751 op = mcs.args;
752 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
753 op->arg1.mfn = mfn;
754
755 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
756
757 if (kernel) {
758 x86_write_percpu(xen_cr3, cr3);
759
760 /* Update xen_current_cr3 once the batch has actually
761 been submitted. */
762 xen_mc_callback(set_current_cr3, (void *)cr3);
763 }
764}
765
766static void xen_write_cr3(unsigned long cr3)
767{
768 BUG_ON(preemptible());
769
770 xen_mc_batch(); /* disables interrupts */
771
772 /* Update while interrupts are disabled, so its atomic with
773 respect to ipis */
774 x86_write_percpu(xen_cr3, cr3);
775
776 __xen_write_cr3(true, cr3);
777
778#ifdef CONFIG_X86_64
779 {
780 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
781 if (user_pgd)
782 __xen_write_cr3(false, __pa(user_pgd));
783 else
784 __xen_write_cr3(false, 0);
785 }
786#endif
787
788 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
789}
790
791static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) 600static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
792{ 601{
793 int ret; 602 int ret;
@@ -829,185 +638,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
829 return ret; 638 return ret;
830} 639}
831 640
832/* Early in boot, while setting up the initial pagetable, assume
833 everything is pinned. */
834static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
835{
836#ifdef CONFIG_FLATMEM
837 BUG_ON(mem_map); /* should only be used early */
838#endif
839 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
840}
841
842/* Early release_pte assumes that all pts are pinned, since there's
843 only init_mm and anything attached to that is pinned. */
844static void xen_release_pte_init(unsigned long pfn)
845{
846 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
847}
848
849static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
850{
851 struct mmuext_op op;
852 op.cmd = cmd;
853 op.arg1.mfn = pfn_to_mfn(pfn);
854 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
855 BUG();
856}
857
858/* This needs to make sure the new pte page is pinned iff it's being
859 attached to a pinned pagetable. */
860static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
861{
862 struct page *page = pfn_to_page(pfn);
863
864 if (PagePinned(virt_to_page(mm->pgd))) {
865 SetPagePinned(page);
866
867 vm_unmap_aliases();
868 if (!PageHighMem(page)) {
869 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
870 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
871 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
872 } else {
873 /* make sure there are no stray mappings of
874 this page */
875 kmap_flush_unused();
876 }
877 }
878}
879
880static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
881{
882 xen_alloc_ptpage(mm, pfn, PT_PTE);
883}
884
885static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
886{
887 xen_alloc_ptpage(mm, pfn, PT_PMD);
888}
889
890static int xen_pgd_alloc(struct mm_struct *mm)
891{
892 pgd_t *pgd = mm->pgd;
893 int ret = 0;
894
895 BUG_ON(PagePinned(virt_to_page(pgd)));
896
897#ifdef CONFIG_X86_64
898 {
899 struct page *page = virt_to_page(pgd);
900 pgd_t *user_pgd;
901
902 BUG_ON(page->private != 0);
903
904 ret = -ENOMEM;
905
906 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
907 page->private = (unsigned long)user_pgd;
908
909 if (user_pgd != NULL) {
910 user_pgd[pgd_index(VSYSCALL_START)] =
911 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
912 ret = 0;
913 }
914
915 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
916 }
917#endif
918
919 return ret;
920}
921
922static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
923{
924#ifdef CONFIG_X86_64
925 pgd_t *user_pgd = xen_get_user_pgd(pgd);
926
927 if (user_pgd)
928 free_page((unsigned long)user_pgd);
929#endif
930}
931
932/* This should never happen until we're OK to use struct page */
933static void xen_release_ptpage(unsigned long pfn, unsigned level)
934{
935 struct page *page = pfn_to_page(pfn);
936
937 if (PagePinned(page)) {
938 if (!PageHighMem(page)) {
939 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
940 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
941 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
942 }
943 ClearPagePinned(page);
944 }
945}
946
947static void xen_release_pte(unsigned long pfn)
948{
949 xen_release_ptpage(pfn, PT_PTE);
950}
951
952static void xen_release_pmd(unsigned long pfn)
953{
954 xen_release_ptpage(pfn, PT_PMD);
955}
956
957#if PAGETABLE_LEVELS == 4
958static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
959{
960 xen_alloc_ptpage(mm, pfn, PT_PUD);
961}
962
963static void xen_release_pud(unsigned long pfn)
964{
965 xen_release_ptpage(pfn, PT_PUD);
966}
967#endif
968
969#ifdef CONFIG_HIGHPTE
970static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
971{
972 pgprot_t prot = PAGE_KERNEL;
973
974 if (PagePinned(page))
975 prot = PAGE_KERNEL_RO;
976
977 if (0 && PageHighMem(page))
978 printk("mapping highpte %lx type %d prot %s\n",
979 page_to_pfn(page), type,
980 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
981
982 return kmap_atomic_prot(page, type, prot);
983}
984#endif
985
986#ifdef CONFIG_X86_32
987static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
988{
989 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
990 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
991 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
992 pte_val_ma(pte));
993
994 return pte;
995}
996
997/* Init-time set_pte while constructing initial pagetables, which
998 doesn't allow RO pagetable pages to be remapped RW */
999static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1000{
1001 pte = mask_rw_pte(ptep, pte);
1002
1003 xen_set_pte(ptep, pte);
1004}
1005#endif
1006
1007static __init void xen_pagetable_setup_start(pgd_t *base)
1008{
1009}
1010
1011void xen_setup_shared_info(void) 641void xen_setup_shared_info(void)
1012{ 642{
1013 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 643 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1028,37 +658,6 @@ void xen_setup_shared_info(void)
1028 xen_setup_mfn_list_list(); 658 xen_setup_mfn_list_list();
1029} 659}
1030 660
1031static __init void xen_pagetable_setup_done(pgd_t *base)
1032{
1033 xen_setup_shared_info();
1034}
1035
1036static __init void xen_post_allocator_init(void)
1037{
1038 pv_mmu_ops.set_pte = xen_set_pte;
1039 pv_mmu_ops.set_pmd = xen_set_pmd;
1040 pv_mmu_ops.set_pud = xen_set_pud;
1041#if PAGETABLE_LEVELS == 4
1042 pv_mmu_ops.set_pgd = xen_set_pgd;
1043#endif
1044
1045 /* This will work as long as patching hasn't happened yet
1046 (which it hasn't) */
1047 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1048 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1049 pv_mmu_ops.release_pte = xen_release_pte;
1050 pv_mmu_ops.release_pmd = xen_release_pmd;
1051#if PAGETABLE_LEVELS == 4
1052 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1053 pv_mmu_ops.release_pud = xen_release_pud;
1054#endif
1055
1056#ifdef CONFIG_X86_64
1057 SetPagePinned(virt_to_page(level3_user_vsyscall));
1058#endif
1059 xen_mark_init_mm_pinned();
1060}
1061
1062/* This is called once we have the cpu_possible_map */ 661/* This is called once we have the cpu_possible_map */
1063void xen_setup_vcpu_info_placement(void) 662void xen_setup_vcpu_info_placement(void)
1064{ 663{
@@ -1072,10 +671,10 @@ void xen_setup_vcpu_info_placement(void)
1072 if (have_vcpu_info_placement) { 671 if (have_vcpu_info_placement) {
1073 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 672 printk(KERN_INFO "Xen: using vcpu_info placement\n");
1074 673
1075 pv_irq_ops.save_fl = xen_save_fl_direct; 674 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
1076 pv_irq_ops.restore_fl = xen_restore_fl_direct; 675 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
1077 pv_irq_ops.irq_disable = xen_irq_disable_direct; 676 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
1078 pv_irq_ops.irq_enable = xen_irq_enable_direct; 677 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
1079 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 678 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
1080 } 679 }
1081} 680}
@@ -1133,49 +732,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1133 return ret; 732 return ret;
1134} 733}
1135 734
1136static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1137{
1138 pte_t pte;
1139
1140 phys >>= PAGE_SHIFT;
1141
1142 switch (idx) {
1143 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1144#ifdef CONFIG_X86_F00F_BUG
1145 case FIX_F00F_IDT:
1146#endif
1147#ifdef CONFIG_X86_32
1148 case FIX_WP_TEST:
1149 case FIX_VDSO:
1150# ifdef CONFIG_HIGHMEM
1151 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1152# endif
1153#else
1154 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1155#endif
1156#ifdef CONFIG_X86_LOCAL_APIC
1157 case FIX_APIC_BASE: /* maps dummy local APIC */
1158#endif
1159 pte = pfn_pte(phys, prot);
1160 break;
1161
1162 default:
1163 pte = mfn_pte(phys, prot);
1164 break;
1165 }
1166
1167 __native_set_fixmap(idx, pte);
1168
1169#ifdef CONFIG_X86_64
1170 /* Replicate changes to map the vsyscall page into the user
1171 pagetable vsyscall mapping. */
1172 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1173 unsigned long vaddr = __fix_to_virt(idx);
1174 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1175 }
1176#endif
1177}
1178
1179static const struct pv_info xen_info __initdata = { 735static const struct pv_info xen_info __initdata = {
1180 .paravirt_enabled = 1, 736 .paravirt_enabled = 1,
1181 .shared_kernel_pmd = 0, 737 .shared_kernel_pmd = 0,
@@ -1271,87 +827,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
1271#endif 827#endif
1272}; 828};
1273 829
1274static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1275 .pagetable_setup_start = xen_pagetable_setup_start,
1276 .pagetable_setup_done = xen_pagetable_setup_done,
1277
1278 .read_cr2 = xen_read_cr2,
1279 .write_cr2 = xen_write_cr2,
1280
1281 .read_cr3 = xen_read_cr3,
1282 .write_cr3 = xen_write_cr3,
1283
1284 .flush_tlb_user = xen_flush_tlb,
1285 .flush_tlb_kernel = xen_flush_tlb,
1286 .flush_tlb_single = xen_flush_tlb_single,
1287 .flush_tlb_others = xen_flush_tlb_others,
1288
1289 .pte_update = paravirt_nop,
1290 .pte_update_defer = paravirt_nop,
1291
1292 .pgd_alloc = xen_pgd_alloc,
1293 .pgd_free = xen_pgd_free,
1294
1295 .alloc_pte = xen_alloc_pte_init,
1296 .release_pte = xen_release_pte_init,
1297 .alloc_pmd = xen_alloc_pte_init,
1298 .alloc_pmd_clone = paravirt_nop,
1299 .release_pmd = xen_release_pte_init,
1300
1301#ifdef CONFIG_HIGHPTE
1302 .kmap_atomic_pte = xen_kmap_atomic_pte,
1303#endif
1304
1305#ifdef CONFIG_X86_64
1306 .set_pte = xen_set_pte,
1307#else
1308 .set_pte = xen_set_pte_init,
1309#endif
1310 .set_pte_at = xen_set_pte_at,
1311 .set_pmd = xen_set_pmd_hyper,
1312
1313 .ptep_modify_prot_start = __ptep_modify_prot_start,
1314 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1315
1316 .pte_val = xen_pte_val,
1317 .pte_flags = native_pte_flags,
1318 .pgd_val = xen_pgd_val,
1319
1320 .make_pte = xen_make_pte,
1321 .make_pgd = xen_make_pgd,
1322
1323#ifdef CONFIG_X86_PAE
1324 .set_pte_atomic = xen_set_pte_atomic,
1325 .set_pte_present = xen_set_pte_at,
1326 .pte_clear = xen_pte_clear,
1327 .pmd_clear = xen_pmd_clear,
1328#endif /* CONFIG_X86_PAE */
1329 .set_pud = xen_set_pud_hyper,
1330
1331 .make_pmd = xen_make_pmd,
1332 .pmd_val = xen_pmd_val,
1333
1334#if PAGETABLE_LEVELS == 4
1335 .pud_val = xen_pud_val,
1336 .make_pud = xen_make_pud,
1337 .set_pgd = xen_set_pgd_hyper,
1338
1339 .alloc_pud = xen_alloc_pte_init,
1340 .release_pud = xen_release_pte_init,
1341#endif /* PAGETABLE_LEVELS == 4 */
1342
1343 .activate_mm = xen_activate_mm,
1344 .dup_mmap = xen_dup_mmap,
1345 .exit_mmap = xen_exit_mmap,
1346
1347 .lazy_mode = {
1348 .enter = paravirt_enter_lazy_mmu,
1349 .leave = xen_leave_lazy,
1350 },
1351
1352 .set_fixmap = xen_set_fixmap,
1353};
1354
1355static void xen_reboot(int reason) 830static void xen_reboot(int reason)
1356{ 831{
1357 struct sched_shutdown r = { .reason = reason }; 832 struct sched_shutdown r = { .reason = reason };
@@ -1394,223 +869,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
1394}; 869};
1395 870
1396 871
1397static void __init xen_reserve_top(void)
1398{
1399#ifdef CONFIG_X86_32
1400 unsigned long top = HYPERVISOR_VIRT_START;
1401 struct xen_platform_parameters pp;
1402
1403 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1404 top = pp.virt_start;
1405
1406 reserve_top_address(-top);
1407#endif /* CONFIG_X86_32 */
1408}
1409
1410/*
1411 * Like __va(), but returns address in the kernel mapping (which is
1412 * all we have until the physical memory mapping has been set up).
1413 */
1414static void *__ka(phys_addr_t paddr)
1415{
1416#ifdef CONFIG_X86_64
1417 return (void *)(paddr + __START_KERNEL_map);
1418#else
1419 return __va(paddr);
1420#endif
1421}
1422
1423/* Convert a machine address to physical address */
1424static unsigned long m2p(phys_addr_t maddr)
1425{
1426 phys_addr_t paddr;
1427
1428 maddr &= PTE_PFN_MASK;
1429 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1430
1431 return paddr;
1432}
1433
1434/* Convert a machine address to kernel virtual */
1435static void *m2v(phys_addr_t maddr)
1436{
1437 return __ka(m2p(maddr));
1438}
1439
1440static void set_page_prot(void *addr, pgprot_t prot)
1441{
1442 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1443 pte_t pte = pfn_pte(pfn, prot);
1444
1445 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1446 BUG();
1447}
1448
1449static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1450{
1451 unsigned pmdidx, pteidx;
1452 unsigned ident_pte;
1453 unsigned long pfn;
1454
1455 ident_pte = 0;
1456 pfn = 0;
1457 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1458 pte_t *pte_page;
1459
1460 /* Reuse or allocate a page of ptes */
1461 if (pmd_present(pmd[pmdidx]))
1462 pte_page = m2v(pmd[pmdidx].pmd);
1463 else {
1464 /* Check for free pte pages */
1465 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1466 break;
1467
1468 pte_page = &level1_ident_pgt[ident_pte];
1469 ident_pte += PTRS_PER_PTE;
1470
1471 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1472 }
1473
1474 /* Install mappings */
1475 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1476 pte_t pte;
1477
1478 if (pfn > max_pfn_mapped)
1479 max_pfn_mapped = pfn;
1480
1481 if (!pte_none(pte_page[pteidx]))
1482 continue;
1483
1484 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1485 pte_page[pteidx] = pte;
1486 }
1487 }
1488
1489 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1490 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1491
1492 set_page_prot(pmd, PAGE_KERNEL_RO);
1493}
1494
1495#ifdef CONFIG_X86_64
1496static void convert_pfn_mfn(void *v)
1497{
1498 pte_t *pte = v;
1499 int i;
1500
1501 /* All levels are converted the same way, so just treat them
1502 as ptes. */
1503 for (i = 0; i < PTRS_PER_PTE; i++)
1504 pte[i] = xen_make_pte(pte[i].pte);
1505}
1506
1507/*
1508 * Set up the initial kernel pagetable.
1509 *
1510 * We can construct this by grafting the Xen provided pagetable into
1511 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1512 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1513 * means that only the kernel has a physical mapping to start with -
1514 * but that's enough to get __va working. We need to fill in the rest
1515 * of the physical mapping once some sort of allocator has been set
1516 * up.
1517 */
1518static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1519 unsigned long max_pfn)
1520{
1521 pud_t *l3;
1522 pmd_t *l2;
1523
1524 /* Zap identity mapping */
1525 init_level4_pgt[0] = __pgd(0);
1526
1527 /* Pre-constructed entries are in pfn, so convert to mfn */
1528 convert_pfn_mfn(init_level4_pgt);
1529 convert_pfn_mfn(level3_ident_pgt);
1530 convert_pfn_mfn(level3_kernel_pgt);
1531
1532 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1533 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1534
1535 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1536 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1537
1538 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1539 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1540 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1541
1542 /* Set up identity map */
1543 xen_map_identity_early(level2_ident_pgt, max_pfn);
1544
1545 /* Make pagetable pieces RO */
1546 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1547 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1548 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1549 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1550 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1551 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1552
1553 /* Pin down new L4 */
1554 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1555 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1556
1557 /* Unpin Xen-provided one */
1558 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1559
1560 /* Switch over */
1561 pgd = init_level4_pgt;
1562
1563 /*
1564 * At this stage there can be no user pgd, and no page
1565 * structure to attach it to, so make sure we just set the kernel
1566 * pgd.
1567 */
1568 xen_mc_batch();
1569 __xen_write_cr3(true, __pa(pgd));
1570 xen_mc_issue(PARAVIRT_LAZY_CPU);
1571
1572 reserve_early(__pa(xen_start_info->pt_base),
1573 __pa(xen_start_info->pt_base +
1574 xen_start_info->nr_pt_frames * PAGE_SIZE),
1575 "XEN PAGETABLES");
1576
1577 return pgd;
1578}
1579#else /* !CONFIG_X86_64 */
1580static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1581
1582static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1583 unsigned long max_pfn)
1584{
1585 pmd_t *kernel_pmd;
1586
1587 init_pg_tables_start = __pa(pgd);
1588 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1589 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1590
1591 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1592 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1593
1594 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1595
1596 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1597 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1598 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1599
1600 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1601 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1602 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1603
1604 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1605
1606 xen_write_cr3(__pa(swapper_pg_dir));
1607
1608 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1609
1610 return swapper_pg_dir;
1611}
1612#endif /* CONFIG_X86_64 */
1613
1614/* First C function to be called on Xen boot */ 872/* First C function to be called on Xen boot */
1615asmlinkage void __init xen_start_kernel(void) 873asmlinkage void __init xen_start_kernel(void)
1616{ 874{
@@ -1650,10 +908,18 @@ asmlinkage void __init xen_start_kernel(void)
1650 machine_ops = xen_machine_ops; 908 machine_ops = xen_machine_ops;
1651 909
1652#ifdef CONFIG_X86_64 910#ifdef CONFIG_X86_64
1653 /* Disable until direct per-cpu data access. */ 911 /*
1654 have_vcpu_info_placement = 0; 912 * Setup percpu state. We only need to do this for 64-bit
1655 x86_64_init_pda(); 913 * because 32-bit already has %fs set properly.
914 */
915 load_percpu_segment(0);
1656#endif 916#endif
917 /*
918 * The only reliable way to retain the initial address of the
919 * percpu gdt_page is to remember it here, so we can go and
920 * mark it RW later, when the initial percpu area is freed.
921 */
922 xen_initial_gdt = &per_cpu(gdt_page, 0);
1657 923
1658 xen_smp_init(); 924 xen_smp_init();
1659 925
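
With the 64-bit PDA folded into the per-cpu area, enlighten.c now goes through the common accessors instead of read_pda()/x86_read_percpu(). Roughly (a sketch of the assumed codegen, see the asm/percpu.h entry in the diffstat; exact constraints and symbol prefixes may differ):

	struct vcpu_info *v = percpu_read(xen_vcpu);	/* one %gs-relative load on 64-bit,        */
							/* one %fs-relative load on 32-bit         */
	percpu_write(xen_cr3, cr3);			/* single segment-relative store, no       */
							/* smp_processor_id()/array walk needed    */
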
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index bb042608c602..5a070900ad35 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -39,7 +39,7 @@ static unsigned long xen_save_fl(void)
39 struct vcpu_info *vcpu; 39 struct vcpu_info *vcpu;
40 unsigned long flags; 40 unsigned long flags;
41 41
42 vcpu = x86_read_percpu(xen_vcpu); 42 vcpu = percpu_read(xen_vcpu);
43 43
44 /* flag has opposite sense of mask */ 44 /* flag has opposite sense of mask */
45 flags = !vcpu->evtchn_upcall_mask; 45 flags = !vcpu->evtchn_upcall_mask;
@@ -50,6 +50,7 @@ static unsigned long xen_save_fl(void)
50 */ 50 */
51 return (-flags) & X86_EFLAGS_IF; 51 return (-flags) & X86_EFLAGS_IF;
52} 52}
53PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
53 54
54static void xen_restore_fl(unsigned long flags) 55static void xen_restore_fl(unsigned long flags)
55{ 56{
@@ -62,7 +63,7 @@ static void xen_restore_fl(unsigned long flags)
62 make sure we don't switch CPUs between getting the vcpu 63 make sure we don't switch CPUs between getting the vcpu
63 pointer and updating the mask. */ 64 pointer and updating the mask. */
64 preempt_disable(); 65 preempt_disable();
65 vcpu = x86_read_percpu(xen_vcpu); 66 vcpu = percpu_read(xen_vcpu);
66 vcpu->evtchn_upcall_mask = flags; 67 vcpu->evtchn_upcall_mask = flags;
67 preempt_enable_no_resched(); 68 preempt_enable_no_resched();
68 69
@@ -76,6 +77,7 @@ static void xen_restore_fl(unsigned long flags)
76 xen_force_evtchn_callback(); 77 xen_force_evtchn_callback();
77 } 78 }
78} 79}
80PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
79 81
80static void xen_irq_disable(void) 82static void xen_irq_disable(void)
81{ 83{
@@ -83,9 +85,10 @@ static void xen_irq_disable(void)
83 make sure we don't switch CPUs between getting the vcpu 85 make sure we don't switch CPUs between getting the vcpu
84 pointer and updating the mask. */ 86 pointer and updating the mask. */
85 preempt_disable(); 87 preempt_disable();
86 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; 88 percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
87 preempt_enable_no_resched(); 89 preempt_enable_no_resched();
88} 90}
91PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
89 92
90static void xen_irq_enable(void) 93static void xen_irq_enable(void)
91{ 94{
@@ -96,7 +99,7 @@ static void xen_irq_enable(void)
96 the caller is confused and is trying to re-enable interrupts 99 the caller is confused and is trying to re-enable interrupts
97 on an indeterminate processor. */ 100 on an indeterminate processor. */
98 101
99 vcpu = x86_read_percpu(xen_vcpu); 102 vcpu = percpu_read(xen_vcpu);
100 vcpu->evtchn_upcall_mask = 0; 103 vcpu->evtchn_upcall_mask = 0;
101 104
102 /* Doesn't matter if we get preempted here, because any 105 /* Doesn't matter if we get preempted here, because any
@@ -106,6 +109,7 @@ static void xen_irq_enable(void)
106 if (unlikely(vcpu->evtchn_upcall_pending)) 109 if (unlikely(vcpu->evtchn_upcall_pending))
107 xen_force_evtchn_callback(); 110 xen_force_evtchn_callback();
108} 111}
112PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
109 113
110static void xen_safe_halt(void) 114static void xen_safe_halt(void)
111{ 115{
@@ -124,10 +128,12 @@ static void xen_halt(void)
124 128
125static const struct pv_irq_ops xen_irq_ops __initdata = { 129static const struct pv_irq_ops xen_irq_ops __initdata = {
126 .init_IRQ = __xen_init_IRQ, 130 .init_IRQ = __xen_init_IRQ,
127 .save_fl = xen_save_fl, 131
128 .restore_fl = xen_restore_fl, 132 .save_fl = PV_CALLEE_SAVE(xen_save_fl),
129 .irq_disable = xen_irq_disable, 133 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
130 .irq_enable = xen_irq_enable, 134 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
135 .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
136
131 .safe_halt = xen_safe_halt, 137 .safe_halt = xen_safe_halt,
132 .halt = xen_halt, 138 .halt = xen_halt,
133#ifdef CONFIG_X86_64 139#ifdef CONFIG_X86_64
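
The irq.c hunk switches the pv_irq_ops slots from plain function pointers to callee-save wrappers. The helpers live in asm/paravirt.h (listed in the diffstat, not shown here); their assumed shape, for orientation only:

	struct paravirt_callee_save {
		void *func;
	};

	/* PV_CALLEE_SAVE_REGS_THUNK(op) emits __raw_callee_save_##op, an asm
	 * thunk that preserves the caller-clobbered registers around a call
	 * to op, so a patched-in call site only has to treat the return
	 * register as clobbered. */
	#define PV_CALLEE_SAVE(op) \
		((struct paravirt_callee_save) { __raw_callee_save_##op })

	/* for functions that already follow the convention, e.g. the *_direct
	 * asm variants installed by xen_setup_vcpu_info_placement(): */
	#define __PV_IS_CALLEE_SAVE(func) \
		((struct paravirt_callee_save) { func })
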
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 503c240e26c7..d2e8ed1aff3d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -47,6 +47,7 @@
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/fixmap.h> 48#include <asm/fixmap.h>
49#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
50#include <asm/setup.h>
50#include <asm/paravirt.h> 51#include <asm/paravirt.h>
51#include <asm/linkage.h> 52#include <asm/linkage.h>
52 53
@@ -55,6 +56,8 @@
55 56
56#include <xen/page.h> 57#include <xen/page.h>
57#include <xen/interface/xen.h> 58#include <xen/interface/xen.h>
59#include <xen/interface/version.h>
60#include <xen/hvc-console.h>
58 61
59#include "multicalls.h" 62#include "multicalls.h"
60#include "mmu.h" 63#include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)
114 117
115#endif /* CONFIG_XEN_DEBUG_FS */ 118#endif /* CONFIG_XEN_DEBUG_FS */
116 119
120
121/*
122 * Identity map, in addition to plain kernel map. This needs to be
123 * large enough to allocate page table pages to allocate the rest.
124 * Each page can map 2MB.
125 */
126static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
127
128#ifdef CONFIG_X86_64
129/* l3 pud for userspace vsyscall mapping */
130static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
131#endif /* CONFIG_X86_64 */
132
133/*
134 * Note about cr3 (pagetable base) values:
135 *
136 * xen_cr3 contains the current logical cr3 value; it contains the
137 * last set cr3. This may not be the current effective cr3, because
138 * its update may be being lazily deferred. However, a vcpu looking
139 * at its own cr3 can use this value knowing that everything will
140 * be self-consistent.
141 *
142 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
143 * hypercall to set the vcpu cr3 is complete (so it may be a little
144 * out of date, but it will never be set early). If one vcpu is
145 * looking at another vcpu's cr3 value, it should use this variable.
146 */
147DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
148DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
149
150
117/* 151/*
118 * Just beyond the highest usermode address. STACK_TOP_MAX has a 152 * Just beyond the highest usermode address. STACK_TOP_MAX has a
119 * redzone above it, so round it up to a PGD boundary. 153 * redzone above it, so round it up to a PGD boundary.
@@ -458,28 +492,33 @@ pteval_t xen_pte_val(pte_t pte)
458{ 492{
459 return pte_mfn_to_pfn(pte.pte); 493 return pte_mfn_to_pfn(pte.pte);
460} 494}
495PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
461 496
462pgdval_t xen_pgd_val(pgd_t pgd) 497pgdval_t xen_pgd_val(pgd_t pgd)
463{ 498{
464 return pte_mfn_to_pfn(pgd.pgd); 499 return pte_mfn_to_pfn(pgd.pgd);
465} 500}
501PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
466 502
467pte_t xen_make_pte(pteval_t pte) 503pte_t xen_make_pte(pteval_t pte)
468{ 504{
469 pte = pte_pfn_to_mfn(pte); 505 pte = pte_pfn_to_mfn(pte);
470 return native_make_pte(pte); 506 return native_make_pte(pte);
471} 507}
508PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
472 509
473pgd_t xen_make_pgd(pgdval_t pgd) 510pgd_t xen_make_pgd(pgdval_t pgd)
474{ 511{
475 pgd = pte_pfn_to_mfn(pgd); 512 pgd = pte_pfn_to_mfn(pgd);
476 return native_make_pgd(pgd); 513 return native_make_pgd(pgd);
477} 514}
515PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
478 516
479pmdval_t xen_pmd_val(pmd_t pmd) 517pmdval_t xen_pmd_val(pmd_t pmd)
480{ 518{
481 return pte_mfn_to_pfn(pmd.pmd); 519 return pte_mfn_to_pfn(pmd.pmd);
482} 520}
521PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
483 522
484void xen_set_pud_hyper(pud_t *ptr, pud_t val) 523void xen_set_pud_hyper(pud_t *ptr, pud_t val)
485{ 524{
@@ -556,12 +595,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
556 pmd = pte_pfn_to_mfn(pmd); 595 pmd = pte_pfn_to_mfn(pmd);
557 return native_make_pmd(pmd); 596 return native_make_pmd(pmd);
558} 597}
598PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
559 599
560#if PAGETABLE_LEVELS == 4 600#if PAGETABLE_LEVELS == 4
561pudval_t xen_pud_val(pud_t pud) 601pudval_t xen_pud_val(pud_t pud)
562{ 602{
563 return pte_mfn_to_pfn(pud.pud); 603 return pte_mfn_to_pfn(pud.pud);
564} 604}
605PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
565 606
566pud_t xen_make_pud(pudval_t pud) 607pud_t xen_make_pud(pudval_t pud)
567{ 608{
@@ -569,6 +610,7 @@ pud_t xen_make_pud(pudval_t pud)
569 610
570 return native_make_pud(pud); 611 return native_make_pud(pud);
571} 612}
613PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
572 614
573pgd_t *xen_get_user_pgd(pgd_t *pgd) 615pgd_t *xen_get_user_pgd(pgd_t *pgd)
574{ 616{
@@ -1063,18 +1105,14 @@ static void drop_other_mm_ref(void *info)
1063 struct mm_struct *mm = info; 1105 struct mm_struct *mm = info;
1064 struct mm_struct *active_mm; 1106 struct mm_struct *active_mm;
1065 1107
1066#ifdef CONFIG_X86_64 1108 active_mm = percpu_read(cpu_tlbstate.active_mm);
1067 active_mm = read_pda(active_mm);
1068#else
1069 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
1070#endif
1071 1109
1072 if (active_mm == mm) 1110 if (active_mm == mm)
1073 leave_mm(smp_processor_id()); 1111 leave_mm(smp_processor_id());
1074 1112
1075 /* If this cpu still has a stale cr3 reference, then make sure 1113 /* If this cpu still has a stale cr3 reference, then make sure
1076 it has been flushed. */ 1114 it has been flushed. */
1077 if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) { 1115 if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
1078 load_cr3(swapper_pg_dir); 1116 load_cr3(swapper_pg_dir);
1079 arch_flush_lazy_cpu_mode(); 1117 arch_flush_lazy_cpu_mode();
1080 } 1118 }
@@ -1156,6 +1194,709 @@ void xen_exit_mmap(struct mm_struct *mm)
1156 spin_unlock(&mm->page_table_lock); 1194 spin_unlock(&mm->page_table_lock);
1157} 1195}
1158 1196
1197static __init void xen_pagetable_setup_start(pgd_t *base)
1198{
1199}
1200
1201static __init void xen_pagetable_setup_done(pgd_t *base)
1202{
1203 xen_setup_shared_info();
1204}
1205
1206static void xen_write_cr2(unsigned long cr2)
1207{
1208 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1209}
1210
1211static unsigned long xen_read_cr2(void)
1212{
1213 return percpu_read(xen_vcpu)->arch.cr2;
1214}
1215
1216unsigned long xen_read_cr2_direct(void)
1217{
1218 return percpu_read(xen_vcpu_info.arch.cr2);
1219}
1220
1221static void xen_flush_tlb(void)
1222{
1223 struct mmuext_op *op;
1224 struct multicall_space mcs;
1225
1226 preempt_disable();
1227
1228 mcs = xen_mc_entry(sizeof(*op));
1229
1230 op = mcs.args;
1231 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1232 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1233
1234 xen_mc_issue(PARAVIRT_LAZY_MMU);
1235
1236 preempt_enable();
1237}
1238
1239static void xen_flush_tlb_single(unsigned long addr)
1240{
1241 struct mmuext_op *op;
1242 struct multicall_space mcs;
1243
1244 preempt_disable();
1245
1246 mcs = xen_mc_entry(sizeof(*op));
1247 op = mcs.args;
1248 op->cmd = MMUEXT_INVLPG_LOCAL;
1249 op->arg1.linear_addr = addr & PAGE_MASK;
1250 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1251
1252 xen_mc_issue(PARAVIRT_LAZY_MMU);
1253
1254 preempt_enable();
1255}
1256
1257static void xen_flush_tlb_others(const struct cpumask *cpus,
1258 struct mm_struct *mm, unsigned long va)
1259{
1260 struct {
1261 struct mmuext_op op;
1262 DECLARE_BITMAP(mask, NR_CPUS);
1263 } *args;
1264 struct multicall_space mcs;
1265
1266 BUG_ON(cpumask_empty(cpus));
1267 BUG_ON(!mm);
1268
1269 mcs = xen_mc_entry(sizeof(*args));
1270 args = mcs.args;
1271 args->op.arg2.vcpumask = to_cpumask(args->mask);
1272
1273 /* Remove us, and any offline CPUS. */
1274 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1275 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1276 if (unlikely(cpumask_empty(to_cpumask(args->mask))))
1277 goto issue;
1278
1279 if (va == TLB_FLUSH_ALL) {
1280 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1281 } else {
1282 args->op.cmd = MMUEXT_INVLPG_MULTI;
1283 args->op.arg1.linear_addr = va;
1284 }
1285
1286 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1287
1288issue:
1289 xen_mc_issue(PARAVIRT_LAZY_MMU);
1290}
1291
1292static unsigned long xen_read_cr3(void)
1293{
1294 return percpu_read(xen_cr3);
1295}
1296
1297static void set_current_cr3(void *v)
1298{
1299 percpu_write(xen_current_cr3, (unsigned long)v);
1300}
1301
1302static void __xen_write_cr3(bool kernel, unsigned long cr3)
1303{
1304 struct mmuext_op *op;
1305 struct multicall_space mcs;
1306 unsigned long mfn;
1307
1308 if (cr3)
1309 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1310 else
1311 mfn = 0;
1312
1313 WARN_ON(mfn == 0 && kernel);
1314
1315 mcs = __xen_mc_entry(sizeof(*op));
1316
1317 op = mcs.args;
1318 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1319 op->arg1.mfn = mfn;
1320
1321 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1322
1323 if (kernel) {
1324 percpu_write(xen_cr3, cr3);
1325
1326 /* Update xen_current_cr3 once the batch has actually
1327 been submitted. */
1328 xen_mc_callback(set_current_cr3, (void *)cr3);
1329 }
1330}
1331
1332static void xen_write_cr3(unsigned long cr3)
1333{
1334 BUG_ON(preemptible());
1335
1336 xen_mc_batch(); /* disables interrupts */
1337
1338 /* Update while interrupts are disabled, so its atomic with
1339 respect to ipis */
1340 percpu_write(xen_cr3, cr3);
1341
1342 __xen_write_cr3(true, cr3);
1343
1344#ifdef CONFIG_X86_64
1345 {
1346 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1347 if (user_pgd)
1348 __xen_write_cr3(false, __pa(user_pgd));
1349 else
1350 __xen_write_cr3(false, 0);
1351 }
1352#endif
1353
1354 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1355}
1356
1357static int xen_pgd_alloc(struct mm_struct *mm)
1358{
1359 pgd_t *pgd = mm->pgd;
1360 int ret = 0;
1361
1362 BUG_ON(PagePinned(virt_to_page(pgd)));
1363
1364#ifdef CONFIG_X86_64
1365 {
1366 struct page *page = virt_to_page(pgd);
1367 pgd_t *user_pgd;
1368
1369 BUG_ON(page->private != 0);
1370
1371 ret = -ENOMEM;
1372
1373 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1374 page->private = (unsigned long)user_pgd;
1375
1376 if (user_pgd != NULL) {
1377 user_pgd[pgd_index(VSYSCALL_START)] =
1378 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1379 ret = 0;
1380 }
1381
1382 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1383 }
1384#endif
1385
1386 return ret;
1387}
1388
1389static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1390{
1391#ifdef CONFIG_X86_64
1392 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1393
1394 if (user_pgd)
1395 free_page((unsigned long)user_pgd);
1396#endif
1397}
1398
1399#ifdef CONFIG_HIGHPTE
1400static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
1401{
1402 pgprot_t prot = PAGE_KERNEL;
1403
1404 if (PagePinned(page))
1405 prot = PAGE_KERNEL_RO;
1406
1407 if (0 && PageHighMem(page))
1408 printk("mapping highpte %lx type %d prot %s\n",
1409 page_to_pfn(page), type,
1410 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
1411
1412 return kmap_atomic_prot(page, type, prot);
1413}
1414#endif
1415
1416#ifdef CONFIG_X86_32
1417static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1418{
1419 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1420 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1421 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1422 pte_val_ma(pte));
1423
1424 return pte;
1425}
1426
1427/* Init-time set_pte while constructing initial pagetables, which
1428 doesn't allow RO pagetable pages to be remapped RW */
1429static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1430{
1431 pte = mask_rw_pte(ptep, pte);
1432
1433 xen_set_pte(ptep, pte);
1434}
1435#endif
1436
1437/* Early in boot, while setting up the initial pagetable, assume
1438 everything is pinned. */
1439static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1440{
1441#ifdef CONFIG_FLATMEM
1442 BUG_ON(mem_map); /* should only be used early */
1443#endif
1444 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1445}
1446
1447/* Early release_pte assumes that all pts are pinned, since there's
1448 only init_mm and anything attached to that is pinned. */
1449static void xen_release_pte_init(unsigned long pfn)
1450{
1451 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1452}
1453
1454static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1455{
1456 struct mmuext_op op;
1457 op.cmd = cmd;
1458 op.arg1.mfn = pfn_to_mfn(pfn);
1459 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1460 BUG();
1461}
1462
1463/* This needs to make sure the new pte page is pinned iff it's being
1464 attached to a pinned pagetable. */
1465static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1466{
1467 struct page *page = pfn_to_page(pfn);
1468
1469 if (PagePinned(virt_to_page(mm->pgd))) {
1470 SetPagePinned(page);
1471
1472 vm_unmap_aliases();
1473 if (!PageHighMem(page)) {
1474 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1475 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1476 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1477 } else {
1478 /* make sure there are no stray mappings of
1479 this page */
1480 kmap_flush_unused();
1481 }
1482 }
1483}
1484
1485static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1486{
1487 xen_alloc_ptpage(mm, pfn, PT_PTE);
1488}
1489
1490static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1491{
1492 xen_alloc_ptpage(mm, pfn, PT_PMD);
1493}
1494
1495/* This should never happen until we're OK to use struct page */
1496static void xen_release_ptpage(unsigned long pfn, unsigned level)
1497{
1498 struct page *page = pfn_to_page(pfn);
1499
1500 if (PagePinned(page)) {
1501 if (!PageHighMem(page)) {
1502 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1503 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1504 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1505 }
1506 ClearPagePinned(page);
1507 }
1508}
1509
1510static void xen_release_pte(unsigned long pfn)
1511{
1512 xen_release_ptpage(pfn, PT_PTE);
1513}
1514
1515static void xen_release_pmd(unsigned long pfn)
1516{
1517 xen_release_ptpage(pfn, PT_PMD);
1518}
1519
1520#if PAGETABLE_LEVELS == 4
1521static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1522{
1523 xen_alloc_ptpage(mm, pfn, PT_PUD);
1524}
1525
1526static void xen_release_pud(unsigned long pfn)
1527{
1528 xen_release_ptpage(pfn, PT_PUD);
1529}
1530#endif
1531
1532void __init xen_reserve_top(void)
1533{
1534#ifdef CONFIG_X86_32
1535 unsigned long top = HYPERVISOR_VIRT_START;
1536 struct xen_platform_parameters pp;
1537
1538 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1539 top = pp.virt_start;
1540
1541 reserve_top_address(-top);
1542#endif /* CONFIG_X86_32 */
1543}
1544
1545/*
1546 * Like __va(), but returns the address in the kernel mapping (which is
1547 * all we have until the physical memory mapping has been set up).
1548 */
1549static void *__ka(phys_addr_t paddr)
1550{
1551#ifdef CONFIG_X86_64
1552 return (void *)(paddr + __START_KERNEL_map);
1553#else
1554 return __va(paddr);
1555#endif
1556}
1557
1558/* Convert a machine address to physical address */
1559static unsigned long m2p(phys_addr_t maddr)
1560{
1561 phys_addr_t paddr;
1562
1563 maddr &= PTE_PFN_MASK;
1564 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1565
1566 return paddr;
1567}
1568
1569/* Convert a machine address to kernel virtual */
1570static void *m2v(phys_addr_t maddr)
1571{
1572 return __ka(m2p(maddr));
1573}
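
m2p() and m2v() rely on the split between pseudo-physical frame numbers (pfns, what the guest kernel indexes memory by) and machine frame numbers (mfns, the frames the hypervisor actually handed out); pfn_to_mfn()/mfn_to_pfn() are lookups in the p2m/m2p tables. Here is a stand-alone toy model of that translation, compilable with a plain C compiler; the table contents and sizes are invented purely for illustration.

#include <stdio.h>

#define TOY_PAGE_SHIFT 12UL                       /* 4 KiB pages, as on x86 */

/* Invented pseudo-phys -> machine frame table (the real one is the
 * Xen-provided p2m list); the values are arbitrary. */
static const unsigned long toy_p2m[4] = { 7, 42, 3, 19 };

static unsigned long toy_pfn_to_mfn(unsigned long pfn)
{
	return toy_p2m[pfn];
}

static unsigned long toy_mfn_to_pfn(unsigned long mfn)
{
	unsigned long pfn;

	/* The real kernel keeps a machine->phys table; a linear scan is
	 * enough for this four-entry toy. */
	for (pfn = 0; pfn < 4; pfn++)
		if (toy_p2m[pfn] == mfn)
			return pfn;
	return ~0UL;
}

int main(void)
{
	unsigned long maddr = (toy_p2m[2] << TOY_PAGE_SHIFT) | 0x123;	/* machine addr inside pfn 2 */
	/* Mirrors m2p(): drop the page offset, translate the frame, shift back. */
	unsigned long paddr = toy_mfn_to_pfn(maddr >> TOY_PAGE_SHIFT) << TOY_PAGE_SHIFT;

	printf("maddr 0x%lx -> paddr 0x%lx\n", maddr, paddr);	/* 0x3123 -> 0x2000 */
	return 0;
}
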
1574
1575static void set_page_prot(void *addr, pgprot_t prot)
1576{
1577 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1578 pte_t pte = pfn_pte(pfn, prot);
1579
1580 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1581 BUG();
1582}
1583
1584static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1585{
1586 unsigned pmdidx, pteidx;
1587 unsigned ident_pte;
1588 unsigned long pfn;
1589
1590 ident_pte = 0;
1591 pfn = 0;
1592 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1593 pte_t *pte_page;
1594
1595 /* Reuse or allocate a page of ptes */
1596 if (pmd_present(pmd[pmdidx]))
1597 pte_page = m2v(pmd[pmdidx].pmd);
1598 else {
1599 /* Check for free pte pages */
1600 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1601 break;
1602
1603 pte_page = &level1_ident_pgt[ident_pte];
1604 ident_pte += PTRS_PER_PTE;
1605
1606 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1607 }
1608
1609 /* Install mappings */
1610 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1611 pte_t pte;
1612
1613 if (pfn > max_pfn_mapped)
1614 max_pfn_mapped = pfn;
1615
1616 if (!pte_none(pte_page[pteidx]))
1617 continue;
1618
1619 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1620 pte_page[pteidx] = pte;
1621 }
1622 }
1623
1624 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1625 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1626
1627 set_page_prot(pmd, PAGE_KERNEL_RO);
1628}
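
For scale: each page of ptes installed above holds PTRS_PER_PTE entries, so with 4 KiB pages one pte page identity-maps 512 * 4096 bytes = 2 MiB, which is why the loop advances ident_pte in PTRS_PER_PTE steps and stops when level1_ident_pgt is exhausted. A quick stand-alone check of that arithmetic follows; the number of pte pages backing level1_ident_pgt is not visible in this hunk, so it is left as a parameter here.

#include <stdio.h>

#define EX_PAGE_SIZE     4096UL	/* x86 base page size */
#define EX_PTRS_PER_PTE  512UL	/* pte entries per page on x86-64 */

int main(void)
{
	unsigned long per_pte_page = EX_PTRS_PER_PTE * EX_PAGE_SIZE;
	unsigned long ident_pte_pages = 4;	/* assumption: pages in level1_ident_pgt */

	printf("one pte page maps %lu MiB\n", per_pte_page >> 20);	/* 2 MiB */
	printf("%lu pte pages map %lu MiB\n", ident_pte_pages,
	       (ident_pte_pages * per_pte_page) >> 20);			/* 8 MiB */
	return 0;
}
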
1629
1630#ifdef CONFIG_X86_64
1631static void convert_pfn_mfn(void *v)
1632{
1633 pte_t *pte = v;
1634 int i;
1635
1636 /* All levels are converted the same way, so just treat them
1637 as ptes. */
1638 for (i = 0; i < PTRS_PER_PTE; i++)
1639 pte[i] = xen_make_pte(pte[i].pte);
1640}
1641
1642/*
1643 * Set up the initial kernel pagetable.
1644 *
1645 * We can construct this by grafting the Xen provided pagetable into
1646 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1647 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1648 * means that only the kernel has a physical mapping to start with -
1649 * but that's enough to get __va working. We need to fill in the rest
1650 * of the physical mapping once some sort of allocator has been set
1651 * up.
1652 */
1653__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1654 unsigned long max_pfn)
1655{
1656 pud_t *l3;
1657 pmd_t *l2;
1658
1659 /* Zap identity mapping */
1660 init_level4_pgt[0] = __pgd(0);
1661
1662 /* Pre-constructed entries are in pfn, so convert to mfn */
1663 convert_pfn_mfn(init_level4_pgt);
1664 convert_pfn_mfn(level3_ident_pgt);
1665 convert_pfn_mfn(level3_kernel_pgt);
1666
1667 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1668 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1669
1670 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1671 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1672
1673 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1674 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1675 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1676
1677 /* Set up identity map */
1678 xen_map_identity_early(level2_ident_pgt, max_pfn);
1679
1680 /* Make pagetable pieces RO */
1681 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1682 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1683 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1684 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1685 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1686 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1687
1688 /* Pin down new L4 */
1689 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1690 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1691
1692 /* Unpin Xen-provided one */
1693 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1694
1695 /* Switch over */
1696 pgd = init_level4_pgt;
1697
1698 /*
1699 * At this stage there can be no user pgd, and no page
1700 * structure to attach it to, so make sure we just set the
1701 * kernel pgd.
1702 */
1703 xen_mc_batch();
1704 __xen_write_cr3(true, __pa(pgd));
1705 xen_mc_issue(PARAVIRT_LAZY_CPU);
1706
1707 reserve_early(__pa(xen_start_info->pt_base),
1708 __pa(xen_start_info->pt_base +
1709 xen_start_info->nr_pt_frames * PAGE_SIZE),
1710 "XEN PAGETABLES");
1711
1712 return pgd;
1713}
1714#else /* !CONFIG_X86_64 */
1715static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1716
1717__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1718 unsigned long max_pfn)
1719{
1720 pmd_t *kernel_pmd;
1721
1722 init_pg_tables_start = __pa(pgd);
1723 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1724 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1725
1726 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1727 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1728
1729 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1730
1731 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1732 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1733 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1734
1735 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1736 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1737 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1738
1739 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1740
1741 xen_write_cr3(__pa(swapper_pg_dir));
1742
1743 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1744
1745 return swapper_pg_dir;
1746}
1747#endif /* CONFIG_X86_64 */
1748
1749static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1750{
1751 pte_t pte;
1752
1753 phys >>= PAGE_SHIFT;
1754
1755 switch (idx) {
1756 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1757#ifdef CONFIG_X86_F00F_BUG
1758 case FIX_F00F_IDT:
1759#endif
1760#ifdef CONFIG_X86_32
1761 case FIX_WP_TEST:
1762 case FIX_VDSO:
1763# ifdef CONFIG_HIGHMEM
1764 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1765# endif
1766#else
1767 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1768#endif
1769#ifdef CONFIG_X86_LOCAL_APIC
1770 case FIX_APIC_BASE: /* maps dummy local APIC */
1771#endif
1772 pte = pfn_pte(phys, prot);
1773 break;
1774
1775 default:
1776 pte = mfn_pte(phys, prot);
1777 break;
1778 }
1779
1780 __native_set_fixmap(idx, pte);
1781
1782#ifdef CONFIG_X86_64
1783 /* Replicate changes to map the vsyscall page into the user
1784 pagetable vsyscall mapping. */
1785 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1786 unsigned long vaddr = __fix_to_virt(idx);
1787 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1788 }
1789#endif
1790}
1791
1792__init void xen_post_allocator_init(void)
1793{
1794 pv_mmu_ops.set_pte = xen_set_pte;
1795 pv_mmu_ops.set_pmd = xen_set_pmd;
1796 pv_mmu_ops.set_pud = xen_set_pud;
1797#if PAGETABLE_LEVELS == 4
1798 pv_mmu_ops.set_pgd = xen_set_pgd;
1799#endif
1800
1801 /* This will work as long as patching hasn't happened yet
1802 (which it hasn't) */
1803 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1804 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1805 pv_mmu_ops.release_pte = xen_release_pte;
1806 pv_mmu_ops.release_pmd = xen_release_pmd;
1807#if PAGETABLE_LEVELS == 4
1808 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1809 pv_mmu_ops.release_pud = xen_release_pud;
1810#endif
1811
1812#ifdef CONFIG_X86_64
1813 SetPagePinned(virt_to_page(level3_user_vsyscall));
1814#endif
1815 xen_mark_init_mm_pinned();
1816}
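
The "as long as patching hasn't happened yet" remark matters because paravirt patching can turn the indirect pv_mmu_ops calls into direct or inlined code, after which rewriting the table would no longer affect the already-patched call sites. A self-contained toy of swapping an ops-table entry before any such specialization is sketched below; the structure and function names are invented, not the kernel's.

#include <stdio.h>

/* Toy "ops" table: callers go through these pointers until some later
 * patching step would specialize the call sites. */
struct toy_mmu_ops {
	void (*alloc_pte)(unsigned long pfn);
};

static void early_alloc_pte(unsigned long pfn) { printf("early alloc pfn %lu\n", pfn); }
static void late_alloc_pte(unsigned long pfn)  { printf("late alloc pfn %lu\n", pfn); }

static struct toy_mmu_ops toy_ops = { .alloc_pte = early_alloc_pte };

int main(void)
{
	toy_ops.alloc_pte(1);			/* boot-time behaviour */

	/* "post allocator init": retarget the op before any patching */
	toy_ops.alloc_pte = late_alloc_pte;

	toy_ops.alloc_pte(2);			/* all later callers see the new op */
	return 0;
}
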
1817
1818
1819const struct pv_mmu_ops xen_mmu_ops __initdata = {
1820 .pagetable_setup_start = xen_pagetable_setup_start,
1821 .pagetable_setup_done = xen_pagetable_setup_done,
1822
1823 .read_cr2 = xen_read_cr2,
1824 .write_cr2 = xen_write_cr2,
1825
1826 .read_cr3 = xen_read_cr3,
1827 .write_cr3 = xen_write_cr3,
1828
1829 .flush_tlb_user = xen_flush_tlb,
1830 .flush_tlb_kernel = xen_flush_tlb,
1831 .flush_tlb_single = xen_flush_tlb_single,
1832 .flush_tlb_others = xen_flush_tlb_others,
1833
1834 .pte_update = paravirt_nop,
1835 .pte_update_defer = paravirt_nop,
1836
1837 .pgd_alloc = xen_pgd_alloc,
1838 .pgd_free = xen_pgd_free,
1839
1840 .alloc_pte = xen_alloc_pte_init,
1841 .release_pte = xen_release_pte_init,
1842 .alloc_pmd = xen_alloc_pte_init,
1843 .alloc_pmd_clone = paravirt_nop,
1844 .release_pmd = xen_release_pte_init,
1845
1846#ifdef CONFIG_HIGHPTE
1847 .kmap_atomic_pte = xen_kmap_atomic_pte,
1848#endif
1849
1850#ifdef CONFIG_X86_64
1851 .set_pte = xen_set_pte,
1852#else
1853 .set_pte = xen_set_pte_init,
1854#endif
1855 .set_pte_at = xen_set_pte_at,
1856 .set_pmd = xen_set_pmd_hyper,
1857
1858 .ptep_modify_prot_start = __ptep_modify_prot_start,
1859 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1860
1861 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
1862 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
1863
1864 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
1865 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
1866
1867#ifdef CONFIG_X86_PAE
1868 .set_pte_atomic = xen_set_pte_atomic,
1869 .set_pte_present = xen_set_pte_at,
1870 .pte_clear = xen_pte_clear,
1871 .pmd_clear = xen_pmd_clear,
1872#endif /* CONFIG_X86_PAE */
1873 .set_pud = xen_set_pud_hyper,
1874
1875 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
1876 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
1877
1878#if PAGETABLE_LEVELS == 4
1879 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
1880 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
1881 .set_pgd = xen_set_pgd_hyper,
1882
1883 .alloc_pud = xen_alloc_pte_init,
1884 .release_pud = xen_release_pte_init,
1885#endif /* PAGETABLE_LEVELS == 4 */
1886
1887 .activate_mm = xen_activate_mm,
1888 .dup_mmap = xen_dup_mmap,
1889 .exit_mmap = xen_exit_mmap,
1890
1891 .lazy_mode = {
1892 .enter = paravirt_enter_lazy_mmu,
1893 .leave = xen_leave_lazy,
1894 },
1895
1896 .set_fixmap = xen_set_fixmap,
1897};
1898
1899
1159#ifdef CONFIG_XEN_DEBUG_FS 1900#ifdef CONFIG_XEN_DEBUG_FS
1160 1901
1161static struct dentry *d_mmu_debug; 1902static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 98d71659da5a..24d1b44a337d 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
55 pte_t *ptep, pte_t pte); 55 pte_t *ptep, pte_t pte);
56 56
57unsigned long xen_read_cr2_direct(void);
58
59extern const struct pv_mmu_ops xen_mmu_ops;
57#endif /* _XEN_MMU_H */ 60#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index fa3e10725d98..9e565da5d1f7 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -41,7 +41,7 @@ static inline void xen_mc_issue(unsigned mode)
41 xen_mc_flush(); 41 xen_mc_flush();
42 42
43 /* restore flags saved in xen_mc_batch */ 43 /* restore flags saved in xen_mc_batch */
44 local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); 44 local_irq_restore(percpu_read(xen_mc_irq_flags));
45} 45}
46 46
47/* Set up a callback to be called when the current batch is flushed */ 47/* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c44e2069c7c7..035582ae815d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
50 */ 50 */
51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
52{ 52{
53#ifdef CONFIG_X86_32 53 inc_irq_stat(irq_resched_count);
54 __get_cpu_var(irq_stat).irq_resched_count++;
55#else
56 add_pda(irq_resched_count, 1);
57#endif
58 54
59 return IRQ_HANDLED; 55 return IRQ_HANDLED;
60} 56}
@@ -78,7 +74,7 @@ static __cpuinit void cpu_bringup(void)
78 xen_setup_cpu_clockevents(); 74 xen_setup_cpu_clockevents();
79 75
80 cpu_set(cpu, cpu_online_map); 76 cpu_set(cpu, cpu_online_map);
81 x86_write_percpu(cpu_state, CPU_ONLINE); 77 percpu_write(cpu_state, CPU_ONLINE);
82 wmb(); 78 wmb();
83 79
84 /* We can take interrupts now: we're officially "up". */ 80 /* We can take interrupts now: we're officially "up". */
@@ -174,7 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
174 170
175 /* We've switched to the "real" per-cpu gdt, so make sure the 171 /* We've switched to the "real" per-cpu gdt, so make sure the
176 old memory can be recycled */ 172 old memory can be recycled */
177 make_lowmem_page_readwrite(&per_cpu_var(gdt_page)); 173 make_lowmem_page_readwrite(xen_initial_gdt);
178 174
179 xen_setup_vcpu_info_placement(); 175 xen_setup_vcpu_info_placement();
180} 176}
@@ -239,6 +235,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
239 ctxt->user_regs.ss = __KERNEL_DS; 235 ctxt->user_regs.ss = __KERNEL_DS;
240#ifdef CONFIG_X86_32 236#ifdef CONFIG_X86_32
241 ctxt->user_regs.fs = __KERNEL_PERCPU; 237 ctxt->user_regs.fs = __KERNEL_PERCPU;
238#else
239 ctxt->gs_base_kernel = per_cpu_offset(cpu);
242#endif 240#endif
243 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 241 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
244 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 242 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@@ -283,23 +281,14 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
283 struct task_struct *idle = idle_task(cpu); 281 struct task_struct *idle = idle_task(cpu);
284 int rc; 282 int rc;
285 283
286#ifdef CONFIG_X86_64
287 /* Allocate node local memory for AP pdas */
288 WARN_ON(cpu == 0);
289 if (cpu > 0) {
290 rc = get_local_pda(cpu);
291 if (rc)
292 return rc;
293 }
294#endif
295
296#ifdef CONFIG_X86_32
297 init_gdt(cpu);
298 per_cpu(current_task, cpu) = idle; 284 per_cpu(current_task, cpu) = idle;
285#ifdef CONFIG_X86_32
299 irq_ctx_init(cpu); 286 irq_ctx_init(cpu);
300#else 287#else
301 cpu_pda(cpu)->pcurrent = idle;
302 clear_tsk_thread_flag(idle, TIF_FORK); 288 clear_tsk_thread_flag(idle, TIF_FORK);
289 per_cpu(kernel_stack, cpu) =
290 (unsigned long)task_stack_page(idle) -
291 KERNEL_STACK_OFFSET + THREAD_SIZE;
303#endif 292#endif
304 xen_setup_timer(cpu); 293 xen_setup_timer(cpu);
305 xen_init_lock_cpu(cpu); 294 xen_init_lock_cpu(cpu);
@@ -445,11 +434,7 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
445{ 434{
446 irq_enter(); 435 irq_enter();
447 generic_smp_call_function_interrupt(); 436 generic_smp_call_function_interrupt();
448#ifdef CONFIG_X86_32 437 inc_irq_stat(irq_call_count);
449 __get_cpu_var(irq_stat).irq_call_count++;
450#else
451 add_pda(irq_call_count, 1);
452#endif
453 irq_exit(); 438 irq_exit();
454 439
455 return IRQ_HANDLED; 440 return IRQ_HANDLED;
@@ -459,11 +444,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
459{ 444{
460 irq_enter(); 445 irq_enter();
461 generic_smp_call_function_single_interrupt(); 446 generic_smp_call_function_single_interrupt();
462#ifdef CONFIG_X86_32 447 inc_irq_stat(irq_call_count);
463 __get_cpu_var(irq_stat).irq_call_count++;
464#else
465 add_pda(irq_call_count, 1);
466#endif
467 irq_exit(); 448 irq_exit();
468 449
469 return IRQ_HANDLED; 450 return IRQ_HANDLED;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 212ffe012b76..95be7b434724 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -6,6 +6,7 @@
6 6
7#include <asm/xen/hypercall.h> 7#include <asm/xen/hypercall.h>
8#include <asm/xen/page.h> 8#include <asm/xen/page.h>
9#include <asm/fixmap.h>
9 10
10#include "xen-ops.h" 11#include "xen-ops.h"
11#include "mmu.h" 12#include "mmu.h"
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 000000000000..79d7362ad6d1
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,142 @@
1/*
2 * Asm versions of Xen pv-ops, suitable for either direct use or
3 * inlining. The inline versions are the same as the direct-use
4 * versions, with the pre- and post-amble chopped off.
5 *
6 * This code is encoded for size rather than absolute efficiency, with
7 * a view to being able to inline as much as possible.
8 *
9 * We only bother with direct forms (ie, vcpu in percpu data) of the
10 * operations here; the indirect forms are better handled in C, since
11 * they're generally too large to inline anyway.
12 */
13
14#include <asm/asm-offsets.h>
15#include <asm/percpu.h>
16#include <asm/processor-flags.h>
17
18#include "xen-asm.h"
19
20/*
21 * Enable events. This clears the event mask and tests the pending
22 * event status with one 'and' operation. If there are pending events,
23 * then enter the hypervisor to get them handled.
24 */
25ENTRY(xen_irq_enable_direct)
26 /* Unmask events */
27 movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
28
29 /*
30 * Preempt here doesn't matter because that will deal with any
31 * pending interrupts. The pending check may end up being run
32 * on the wrong CPU, but that doesn't hurt.
33 */
34
35 /* Test for pending */
36 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
37 jz 1f
38
392: call check_events
401:
41ENDPATCH(xen_irq_enable_direct)
42 ret
43 ENDPROC(xen_irq_enable_direct)
44 RELOC(xen_irq_enable_direct, 2b+1)
45
46
47/*
48 * Disabling events is simply a matter of making the event mask
49 * non-zero.
50 */
51ENTRY(xen_irq_disable_direct)
52 movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
53ENDPATCH(xen_irq_disable_direct)
54 ret
55 ENDPROC(xen_irq_disable_direct)
56 RELOC(xen_irq_disable_direct, 0)
57
58/*
59 * (xen_)save_fl is used to get the current interrupt enable status.
60 * Callers expect the status to be in X86_EFLAGS_IF, and other bits
61 * may be set in the return value. We take advantage of this by
62 * making sure that X86_EFLAGS_IF has the right value (and other bits
63 * in that byte are 0), but other bits in the return value are
64 * undefined. We need to toggle the state of the bit, because Xen and
65 * x86 use opposite senses (mask vs enable).
66 */
67ENTRY(xen_save_fl_direct)
68 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
69 setz %ah
70 addb %ah, %ah
71ENDPATCH(xen_save_fl_direct)
72 ret
73 ENDPROC(xen_save_fl_direct)
74 RELOC(xen_save_fl_direct, 0)
75
76
77/*
78 * In principle the caller should be passing us a value returned from
79 * xen_save_fl_direct, but for robustness' sake we test only the
80 * X86_EFLAGS_IF flag rather than the whole byte. After setting the
81 * interrupt mask state, it checks for unmasked pending events and
82 * enters the hypervisor to get them delivered if so.
83 */
84ENTRY(xen_restore_fl_direct)
85#ifdef CONFIG_X86_64
86 testw $X86_EFLAGS_IF, %di
87#else
88 testb $X86_EFLAGS_IF>>8, %ah
89#endif
90 setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
91 /*
92 * Preempt here doesn't matter because that will deal with any
93 * pending interrupts. The pending check may end up being run
94 * on the wrong CPU, but that doesn't hurt.
95 */
96
97 /* check for unmasked and pending */
98 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
99 jz 1f
1002: call check_events
1011:
102ENDPATCH(xen_restore_fl_direct)
103 ret
104 ENDPROC(xen_restore_fl_direct)
105 RELOC(xen_restore_fl_direct, 2b+1)
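
The setz/addb pair in xen_save_fl_direct works because X86_EFLAGS_IF is bit 9, i.e. bit 1 of %ah: with the mask byte zero (events enabled) setz gives %ah = 1 and addb doubles it to 2, so the caller sees 0x200 in the returned flags; a non-zero mask leaves %ah = 0. Conversely, xen_restore_fl_direct sets the mask byte to 1 exactly when the IF bit it tests is clear. Below is a stand-alone C restatement of both directions; the X86_EFLAGS_IF value is the standard x86 definition, and the function names are invented.

#include <assert.h>
#include <stdio.h>

#define EX_X86_EFLAGS_IF 0x200u		/* interrupt-enable flag, bit 9 */

/* Mirror of xen_save_fl_direct: Xen mask byte -> pseudo-EFLAGS. */
static unsigned save_fl_from_mask(unsigned char vcpu_mask)
{
	unsigned char ah = (vcpu_mask == 0);	/* setz %ah */
	ah += ah;				/* addb %ah, %ah -> 2 when enabled */
	return (unsigned)ah << 8;		/* %ah occupies bits 15:8 */
}

/* Mirror of xen_restore_fl_direct: pseudo-EFLAGS -> Xen mask byte. */
static unsigned char mask_from_restore_fl(unsigned flags)
{
	return (flags & EX_X86_EFLAGS_IF) ? 0 : 1;	/* setz on the tested bit */
}

int main(void)
{
	assert(save_fl_from_mask(0) == EX_X86_EFLAGS_IF);	/* enabled -> IF set */
	assert(save_fl_from_mask(1) == 0);			/* masked  -> IF clear */
	assert(mask_from_restore_fl(EX_X86_EFLAGS_IF) == 0);
	assert(mask_from_restore_fl(0) == 1);
	printf("flag-sense round trip OK\n");
	return 0;
}
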
106
107
108/*
109 * Force an event check by making a hypercall, but preserve regs
110 * before making the call.
111 */
112check_events:
113#ifdef CONFIG_X86_32
114 push %eax
115 push %ecx
116 push %edx
117 call xen_force_evtchn_callback
118 pop %edx
119 pop %ecx
120 pop %eax
121#else
122 push %rax
123 push %rcx
124 push %rdx
125 push %rsi
126 push %rdi
127 push %r8
128 push %r9
129 push %r10
130 push %r11
131 call xen_force_evtchn_callback
132 pop %r11
133 pop %r10
134 pop %r9
135 pop %r8
136 pop %rdi
137 pop %rsi
138 pop %rdx
139 pop %rcx
140 pop %rax
141#endif
142 ret
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
new file mode 100644
index 000000000000..465276467a47
--- /dev/null
+++ b/arch/x86/xen/xen-asm.h
@@ -0,0 +1,12 @@
1#ifndef _XEN_XEN_ASM_H
2#define _XEN_XEN_ASM_H
3
4#include <linux/linkage.h>
5
6#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
7#define ENDPATCH(x) .globl x##_end; x##_end=.
8
9/* Pseudo-flag used for virtual NMI, which we don't implement yet */
10#define XEN_EFLAGS_NMI 0x80000000
11
12#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 42786f59d9c0..88e15deb8b82 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -1,117 +1,43 @@
1/* 1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining. 2 * Asm versions of Xen pv-ops, suitable for either direct use or
3 The inline versions are the same as the direct-use versions, with the 3 * inlining. The inline versions are the same as the direct-use
4 pre- and post-amble chopped off. 4 * versions, with the pre- and post-amble chopped off.
5 5 *
6 This code is encoded for size rather than absolute efficiency, 6 * This code is encoded for size rather than absolute efficiency, with
7 with a view to being able to inline as much as possible. 7 * a view to being able to inline as much as possible.
8 8 *
9 We only bother with direct forms (ie, vcpu in pda) of the operations 9 * We only bother with direct forms (ie, vcpu in pda) of the
10 here; the indirect forms are better handled in C, since they're 10 * operations here; the indirect forms are better handled in C, since
11 generally too large to inline anyway. 11 * they're generally too large to inline anyway.
12 */ 12 */
13 13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/thread_info.h> 14#include <asm/thread_info.h>
18#include <asm/percpu.h>
19#include <asm/processor-flags.h> 15#include <asm/processor-flags.h>
20#include <asm/segment.h> 16#include <asm/segment.h>
21 17
22#include <xen/interface/xen.h> 18#include <xen/interface/xen.h>
23 19
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v 20#include "xen-asm.h"
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30/*
31 Enable events. This clears the event mask and tests the pending
32 event status with one and operation. If there are pending
33 events, then enter the hypervisor to get them handled.
34 */
35ENTRY(xen_irq_enable_direct)
36 /* Unmask events */
37 movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
38
39 /* Preempt here doesn't matter because that will deal with
40 any pending interrupts. The pending check may end up being
41 run on the wrong CPU, but that doesn't hurt. */
42
43 /* Test for pending */
44 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
45 jz 1f
46
472: call check_events
481:
49ENDPATCH(xen_irq_enable_direct)
50 ret
51 ENDPROC(xen_irq_enable_direct)
52 RELOC(xen_irq_enable_direct, 2b+1)
53
54
55/*
56 Disabling events is simply a matter of making the event mask
57 non-zero.
58 */
59ENTRY(xen_irq_disable_direct)
60 movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
61ENDPATCH(xen_irq_disable_direct)
62 ret
63 ENDPROC(xen_irq_disable_direct)
64 RELOC(xen_irq_disable_direct, 0)
65 21
66/* 22/*
67 (xen_)save_fl is used to get the current interrupt enable status. 23 * Force an event check by making a hypercall, but preserve regs
68 Callers expect the status to be in X86_EFLAGS_IF, and other bits 24 * before making the call.
69 may be set in the return value. We take advantage of this by
70 making sure that X86_EFLAGS_IF has the right value (and other bits
71 in that byte are 0), but other bits in the return value are
72 undefined. We need to toggle the state of the bit, because
73 Xen and x86 use opposite senses (mask vs enable).
74 */ 25 */
75ENTRY(xen_save_fl_direct) 26check_events:
76 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask 27 push %eax
77 setz %ah 28 push %ecx
78 addb %ah,%ah 29 push %edx
79ENDPATCH(xen_save_fl_direct) 30 call xen_force_evtchn_callback
80 ret 31 pop %edx
81 ENDPROC(xen_save_fl_direct) 32 pop %ecx
82 RELOC(xen_save_fl_direct, 0) 33 pop %eax
83
84
85/*
86 In principle the caller should be passing us a value return
87 from xen_save_fl_direct, but for robustness sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */
93ENTRY(xen_restore_fl_direct)
94 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
96 /* Preempt here doesn't matter because that will deal with
97 any pending interrupts. The pending check may end up being
98 run on the wrong CPU, but that doesn't hurt. */
99
100 /* check for unmasked and pending */
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret 34 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109 35
110/* 36/*
111 We can't use sysexit directly, because we're not running in ring0. 37 * We can't use sysexit directly, because we're not running in ring0.
112 But we can easily fake it up using iret. Assuming xen_sysexit 38 * But we can easily fake it up using iret. Assuming xen_sysexit is
113 is jumped to with a standard stack frame, we can just strip it 39 * jumped to with a standard stack frame, we can just strip it back to
114 back to a standard iret frame and use iret. 40 * a standard iret frame and use iret.
115 */ 41 */
116ENTRY(xen_sysexit) 42ENTRY(xen_sysexit)
117 movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ 43 movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
@@ -122,33 +48,31 @@ ENTRY(xen_sysexit)
122ENDPROC(xen_sysexit) 48ENDPROC(xen_sysexit)
123 49
124/* 50/*
125 This is run where a normal iret would be run, with the same stack setup: 51 * This is run where a normal iret would be run, with the same stack setup:
126 8: eflags 52 * 8: eflags
127 4: cs 53 * 4: cs
128 esp-> 0: eip 54 * esp-> 0: eip
129 55 *
130 This attempts to make sure that any pending events are dealt 56 * This attempts to make sure that any pending events are dealt with
131 with on return to usermode, but there is a small window in 57 * on return to usermode, but there is a small window in which an
132 which an event can happen just before entering usermode. If 58 * event can happen just before entering usermode. If the nested
133 the nested interrupt ends up setting one of the TIF_WORK_MASK 59 * interrupt ends up setting one of the TIF_WORK_MASK pending work
134 pending work flags, they will not be tested again before 60 * flags, they will not be tested again before returning to
135 returning to usermode. This means that a process can end up 61 * usermode. This means that a process can end up with pending work,
136 with pending work, which will be unprocessed until the process 62 * which will be unprocessed until the process enters and leaves the
137 enters and leaves the kernel again, which could be an 63 * kernel again, which could be an unbounded amount of time. This
138 unbounded amount of time. This means that a pending signal or 64 * means that a pending signal or reschedule event could be
139 reschedule event could be indefinitely delayed. 65 * indefinitely delayed.
140 66 *
141 The fix is to notice a nested interrupt in the critical 67 * The fix is to notice a nested interrupt in the critical window, and
142 window, and if one occurs, then fold the nested interrupt into 68 * if one occurs, then fold the nested interrupt into the current
143 the current interrupt stack frame, and re-process it 69 * interrupt stack frame, and re-process it iteratively rather than
144 iteratively rather than recursively. This means that it will 70 * recursively. This means that it will exit via the normal path, and
145 exit via the normal path, and all pending work will be dealt 71 * all pending work will be dealt with appropriately.
146 with appropriately. 72 *
147 73 * Because the nested interrupt handler needs to deal with the current
148 Because the nested interrupt handler needs to deal with the 74 * stack state in whatever form it's in, we keep things simple by only
149 current stack state in whatever form its in, we keep things 75 * using a single register which is pushed/popped on the stack.
150 simple by only using a single register which is pushed/popped
151 on the stack.
152 */ 76 */
153ENTRY(xen_iret) 77ENTRY(xen_iret)
154 /* test eflags for special cases */ 78 /* test eflags for special cases */
@@ -158,13 +82,15 @@ ENTRY(xen_iret)
158 push %eax 82 push %eax
159 ESP_OFFSET=4 # bytes pushed onto stack 83 ESP_OFFSET=4 # bytes pushed onto stack
160 84
161 /* Store vcpu_info pointer for easy access. Do it this 85 /*
162 way to avoid having to reload %fs */ 86 * Store vcpu_info pointer for easy access. Do it this way to
87 * avoid having to reload %fs
88 */
163#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
164 GET_THREAD_INFO(%eax) 90 GET_THREAD_INFO(%eax)
165 movl TI_cpu(%eax),%eax 91 movl TI_cpu(%eax), %eax
166 movl __per_cpu_offset(,%eax,4),%eax 92 movl __per_cpu_offset(,%eax,4), %eax
167 mov per_cpu__xen_vcpu(%eax),%eax 93 mov per_cpu__xen_vcpu(%eax), %eax
168#else 94#else
169 movl per_cpu__xen_vcpu, %eax 95 movl per_cpu__xen_vcpu, %eax
170#endif 96#endif
@@ -172,37 +98,46 @@ ENTRY(xen_iret)
172 /* check IF state we're restoring */ 98 /* check IF state we're restoring */
173 testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) 99 testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
174 100
175 /* Maybe enable events. Once this happens we could get a 101 /*
176 recursive event, so the critical region starts immediately 102 * Maybe enable events. Once this happens we could get a
177 afterwards. However, if that happens we don't end up 103 * recursive event, so the critical region starts immediately
178 resuming the code, so we don't have to be worried about 104 * afterwards. However, if that happens we don't end up
179 being preempted to another CPU. */ 105 * resuming the code, so we don't have to be worried about
106 * being preempted to another CPU.
107 */
180 setz XEN_vcpu_info_mask(%eax) 108 setz XEN_vcpu_info_mask(%eax)
181xen_iret_start_crit: 109xen_iret_start_crit:
182 110
183 /* check for unmasked and pending */ 111 /* check for unmasked and pending */
184 cmpw $0x0001, XEN_vcpu_info_pending(%eax) 112 cmpw $0x0001, XEN_vcpu_info_pending(%eax)
185 113
186 /* If there's something pending, mask events again so we 114 /*
187 can jump back into xen_hypervisor_callback */ 115 * If there's something pending, mask events again so we can
116 * jump back into xen_hypervisor_callback
117 */
188 sete XEN_vcpu_info_mask(%eax) 118 sete XEN_vcpu_info_mask(%eax)
189 119
190 popl %eax 120 popl %eax
191 121
192 /* From this point on the registers are restored and the stack 122 /*
193 updated, so we don't need to worry about it if we're preempted */ 123 * From this point on the registers are restored and the stack
124 * updated, so we don't need to worry about it if we're
125 * preempted
126 */
194iret_restore_end: 127iret_restore_end:
195 128
196 /* Jump to hypervisor_callback after fixing up the stack. 129 /*
197 Events are masked, so jumping out of the critical 130 * Jump to hypervisor_callback after fixing up the stack.
198 region is OK. */ 131 * Events are masked, so jumping out of the critical region is
132 * OK.
133 */
199 je xen_hypervisor_callback 134 je xen_hypervisor_callback
200 135
2011: iret 1361: iret
202xen_iret_end_crit: 137xen_iret_end_crit:
203.section __ex_table,"a" 138.section __ex_table, "a"
204 .align 4 139 .align 4
205 .long 1b,iret_exc 140 .long 1b, iret_exc
206.previous 141.previous
207 142
208hyper_iret: 143hyper_iret:
@@ -212,55 +147,55 @@ hyper_iret:
212 .globl xen_iret_start_crit, xen_iret_end_crit 147 .globl xen_iret_start_crit, xen_iret_end_crit
213 148
214/* 149/*
215 This is called by xen_hypervisor_callback in entry.S when it sees 150 * This is called by xen_hypervisor_callback in entry.S when it sees
216 that the EIP at the time of interrupt was between xen_iret_start_crit 151 * that the EIP at the time of interrupt was between
217 and xen_iret_end_crit. We're passed the EIP in %eax so we can do 152 * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
218 a more refined determination of what to do. 153 * %eax so we can do a more refined determination of what to do.
219 154 *
220 The stack format at this point is: 155 * The stack format at this point is:
221 ---------------- 156 * ----------------
222 ss : (ss/esp may be present if we came from usermode) 157 * ss : (ss/esp may be present if we came from usermode)
223 esp : 158 * esp :
224 eflags } outer exception info 159 * eflags } outer exception info
225 cs } 160 * cs }
226 eip } 161 * eip }
227 ---------------- <- edi (copy dest) 162 * ---------------- <- edi (copy dest)
228 eax : outer eax if it hasn't been restored 163 * eax : outer eax if it hasn't been restored
229 ---------------- 164 * ----------------
230 eflags } nested exception info 165 * eflags } nested exception info
231 cs } (no ss/esp because we're nested 166 * cs } (no ss/esp because we're nested
232 eip } from the same ring) 167 * eip } from the same ring)
233 orig_eax }<- esi (copy src) 168 * orig_eax }<- esi (copy src)
234 - - - - - - - - 169 * - - - - - - - -
235 fs } 170 * fs }
236 es } 171 * es }
237 ds } SAVE_ALL state 172 * ds } SAVE_ALL state
238 eax } 173 * eax }
239 : : 174 * : :
240 ebx }<- esp 175 * ebx }<- esp
241 ---------------- 176 * ----------------
242 177 *
243 In order to deliver the nested exception properly, we need to shift 178 * In order to deliver the nested exception properly, we need to shift
244 everything from the return addr up to the error code so it 179 * everything from the return addr up to the error code so it sits
245 sits just under the outer exception info. This means that when we 180 * just under the outer exception info. This means that when we
246 handle the exception, we do it in the context of the outer exception 181 * handle the exception, we do it in the context of the outer
247 rather than starting a new one. 182 * exception rather than starting a new one.
248 183 *
249 The only caveat is that if the outer eax hasn't been 184 * The only caveat is that if the outer eax hasn't been restored yet
250 restored yet (ie, it's still on stack), we need to insert 185 * (ie, it's still on stack), we need to insert its value into the
251 its value into the SAVE_ALL state before going on, since 186 * SAVE_ALL state before going on, since it's usermode state which we
252 it's usermode state which we eventually need to restore. 187 * eventually need to restore.
253 */ 188 */
254ENTRY(xen_iret_crit_fixup) 189ENTRY(xen_iret_crit_fixup)
255 /* 190 /*
256 Paranoia: Make sure we're really coming from kernel space. 191 * Paranoia: Make sure we're really coming from kernel space.
257 One could imagine a case where userspace jumps into the 192 * One could imagine a case where userspace jumps into the
258 critical range address, but just before the CPU delivers a GP, 193 * critical range address, but just before the CPU delivers a
259 it decides to deliver an interrupt instead. Unlikely? 194 * GP, it decides to deliver an interrupt instead. Unlikely?
260 Definitely. Easy to avoid? Yes. The Intel documents 195 * Definitely. Easy to avoid? Yes. The Intel documents
261 explicitly say that the reported EIP for a bad jump is the 196 * explicitly say that the reported EIP for a bad jump is the
262 jump instruction itself, not the destination, but some virtual 197 * jump instruction itself, not the destination, but some
263 environments get this wrong. 198 * virtual environments get this wrong.
264 */ 199 */
265 movl PT_CS(%esp), %ecx 200 movl PT_CS(%esp), %ecx
266 andl $SEGMENT_RPL_MASK, %ecx 201 andl $SEGMENT_RPL_MASK, %ecx
@@ -270,15 +205,17 @@ ENTRY(xen_iret_crit_fixup)
270 lea PT_ORIG_EAX(%esp), %esi 205 lea PT_ORIG_EAX(%esp), %esi
271 lea PT_EFLAGS(%esp), %edi 206 lea PT_EFLAGS(%esp), %edi
272 207
273 /* If eip is before iret_restore_end then stack 208 /*
274 hasn't been restored yet. */ 209 * If eip is before iret_restore_end then stack
210 * hasn't been restored yet.
211 */
275 cmp $iret_restore_end, %eax 212 cmp $iret_restore_end, %eax
276 jae 1f 213 jae 1f
277 214
278 movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ 215 movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
279 movl %eax, PT_EAX(%esp) 216 movl %eax, PT_EAX(%esp)
280 217
281 lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ 218 lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
282 219
283 /* set up the copy */ 220 /* set up the copy */
2841: std 2211: std
@@ -286,20 +223,6 @@ ENTRY(xen_iret_crit_fixup)
286 rep movsl 223 rep movsl
287 cld 224 cld
288 225
289 lea 4(%edi),%esp /* point esp to new frame */ 226 lea 4(%edi), %esp /* point esp to new frame */
2902: jmp xen_do_upcall 2272: jmp xen_do_upcall
291 228
292
293/*
294 Force an event check by making a hypercall,
295 but preserve regs before making the call.
296 */
297check_events:
298 push %eax
299 push %ecx
300 push %edx
301 call xen_force_evtchn_callback
302 pop %edx
303 pop %ecx
304 pop %eax
305 ret
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 05794c566e87..02f496a8dbaa 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -1,174 +1,45 @@
1/* 1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining. 2 * Asm versions of Xen pv-ops, suitable for either direct use or
3 The inline versions are the same as the direct-use versions, with the 3 * inlining. The inline versions are the same as the direct-use
4 pre- and post-amble chopped off. 4 * versions, with the pre- and post-amble chopped off.
5 5 *
6 This code is encoded for size rather than absolute efficiency, 6 * This code is encoded for size rather than absolute efficiency, with
7 with a view to being able to inline as much as possible. 7 * a view to being able to inline as much as possible.
8 8 *
9 We only bother with direct forms (ie, vcpu in pda) of the operations 9 * We only bother with direct forms (ie, vcpu in pda) of the
10 here; the indirect forms are better handled in C, since they're 10 * operations here; the indirect forms are better handled in C, since
11 generally too large to inline anyway. 11 * they're generally too large to inline anyway.
12 */ 12 */
13 13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h> 14#include <asm/errno.h>
15#include <asm/percpu.h>
16#include <asm/processor-flags.h>
19#include <asm/segment.h> 17#include <asm/segment.h>
20 18
21#include <xen/interface/xen.h> 19#include <xen/interface/xen.h>
22 20
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v 21#include "xen-asm.h"
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 1
30/*
31 x86-64 does not yet support direct access to percpu variables
32 via a segment override, so we just need to make sure this code
33 never gets used
34 */
35#define BUG ud2a
36#define PER_CPU_VAR(var, off) 0xdeadbeef
37#endif
38
39/*
40 Enable events. This clears the event mask and tests the pending
41 event status with one and operation. If there are pending
42 events, then enter the hypervisor to get them handled.
43 */
44ENTRY(xen_irq_enable_direct)
45 BUG
46
47 /* Unmask events */
48 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
49
50 /* Preempt here doesn't matter because that will deal with
51 any pending interrupts. The pending check may end up being
52 run on the wrong CPU, but that doesn't hurt. */
53
54 /* Test for pending */
55 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
56 jz 1f
57
582: call check_events
591:
60ENDPATCH(xen_irq_enable_direct)
61 ret
62 ENDPROC(xen_irq_enable_direct)
63 RELOC(xen_irq_enable_direct, 2b+1)
64
65/*
66 Disabling events is simply a matter of making the event mask
67 non-zero.
68 */
69ENTRY(xen_irq_disable_direct)
70 BUG
71
72 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
73ENDPATCH(xen_irq_disable_direct)
74 ret
75 ENDPROC(xen_irq_disable_direct)
76 RELOC(xen_irq_disable_direct, 0)
77
78/*
79 (xen_)save_fl is used to get the current interrupt enable status.
80 Callers expect the status to be in X86_EFLAGS_IF, and other bits
81 may be set in the return value. We take advantage of this by
82 making sure that X86_EFLAGS_IF has the right value (and other bits
83 in that byte are 0), but other bits in the return value are
84 undefined. We need to toggle the state of the bit, because
85 Xen and x86 use opposite senses (mask vs enable).
86 */
87ENTRY(xen_save_fl_direct)
88 BUG
89
90 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
91 setz %ah
92 addb %ah,%ah
93ENDPATCH(xen_save_fl_direct)
94 ret
95 ENDPROC(xen_save_fl_direct)
96 RELOC(xen_save_fl_direct, 0)
97
98/*
99 In principle the caller should be passing us a value return
100 from xen_save_fl_direct, but for robustness sake we test only
101 the X86_EFLAGS_IF flag rather than the whole byte. After
102 setting the interrupt mask state, it checks for unmasked
103 pending events and enters the hypervisor to get them delivered
104 if so.
105 */
106ENTRY(xen_restore_fl_direct)
107 BUG
108
109 testb $X86_EFLAGS_IF>>8, %ah
110 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
111 /* Preempt here doesn't matter because that will deal with
112 any pending interrupts. The pending check may end up being
113 run on the wrong CPU, but that doesn't hurt. */
114
115 /* check for unmasked and pending */
116 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
117 jz 1f
1182: call check_events
1191:
120ENDPATCH(xen_restore_fl_direct)
121 ret
122 ENDPROC(xen_restore_fl_direct)
123 RELOC(xen_restore_fl_direct, 2b+1)
124
125
126/*
127 Force an event check by making a hypercall,
128 but preserve regs before making the call.
129 */
130check_events:
131 push %rax
132 push %rcx
133 push %rdx
134 push %rsi
135 push %rdi
136 push %r8
137 push %r9
138 push %r10
139 push %r11
140 call xen_force_evtchn_callback
141 pop %r11
142 pop %r10
143 pop %r9
144 pop %r8
145 pop %rdi
146 pop %rsi
147 pop %rdx
148 pop %rcx
149 pop %rax
150 ret
151 22
152ENTRY(xen_adjust_exception_frame) 23ENTRY(xen_adjust_exception_frame)
153 mov 8+0(%rsp),%rcx 24 mov 8+0(%rsp), %rcx
154 mov 8+8(%rsp),%r11 25 mov 8+8(%rsp), %r11
155 ret $16 26 ret $16
156 27
157hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 28hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
158/* 29/*
159 Xen64 iret frame: 30 * Xen64 iret frame:
160 31 *
161 ss 32 * ss
162 rsp 33 * rsp
163 rflags 34 * rflags
164 cs 35 * cs
165 rip <-- standard iret frame 36 * rip <-- standard iret frame
166 37 *
167 flags 38 * flags
168 39 *
169 rcx } 40 * rcx }
170 r11 }<-- pushed by hypercall page 41 * r11 }<-- pushed by hypercall page
171rsp -> rax } 42 * rsp->rax }
172 */ 43 */
173ENTRY(xen_iret) 44ENTRY(xen_iret)
174 pushq $0 45 pushq $0
@@ -177,8 +48,8 @@ ENDPATCH(xen_iret)
177RELOC(xen_iret, 1b+1) 48RELOC(xen_iret, 1b+1)
178 49
179/* 50/*
180 sysexit is not used for 64-bit processes, so it's 51 * sysexit is not used for 64-bit processes, so it's only ever used to
181 only ever used to return to 32-bit compat userspace. 52 * return to 32-bit compat userspace.
182 */ 53 */
183ENTRY(xen_sysexit) 54ENTRY(xen_sysexit)
184 pushq $__USER32_DS 55 pushq $__USER32_DS
@@ -193,13 +64,15 @@ ENDPATCH(xen_sysexit)
193RELOC(xen_sysexit, 1b+1) 64RELOC(xen_sysexit, 1b+1)
194 65
195ENTRY(xen_sysret64) 66ENTRY(xen_sysret64)
196 /* We're already on the usermode stack at this point, but still 67 /*
197 with the kernel gs, so we can easily switch back */ 68 * We're already on the usermode stack at this point, but
198 movq %rsp, %gs:pda_oldrsp 69 * still with the kernel gs, so we can easily switch back
199 movq %gs:pda_kernelstack,%rsp 70 */
71 movq %rsp, PER_CPU_VAR(old_rsp)
72 movq PER_CPU_VAR(kernel_stack), %rsp
200 73
201 pushq $__USER_DS 74 pushq $__USER_DS
202 pushq %gs:pda_oldrsp 75 pushq PER_CPU_VAR(old_rsp)
203 pushq %r11 76 pushq %r11
204 pushq $__USER_CS 77 pushq $__USER_CS
205 pushq %rcx 78 pushq %rcx
@@ -210,13 +83,15 @@ ENDPATCH(xen_sysret64)
210RELOC(xen_sysret64, 1b+1) 83RELOC(xen_sysret64, 1b+1)
211 84
212ENTRY(xen_sysret32) 85ENTRY(xen_sysret32)
213 /* We're already on the usermode stack at this point, but still 86 /*
214 with the kernel gs, so we can easily switch back */ 87 * We're already on the usermode stack at this point, but
215 movq %rsp, %gs:pda_oldrsp 88 * still with the kernel gs, so we can easily switch back
216 movq %gs:pda_kernelstack, %rsp 89 */
90 movq %rsp, PER_CPU_VAR(old_rsp)
91 movq PER_CPU_VAR(kernel_stack), %rsp
217 92
218 pushq $__USER32_DS 93 pushq $__USER32_DS
219 pushq %gs:pda_oldrsp 94 pushq PER_CPU_VAR(old_rsp)
220 pushq %r11 95 pushq %r11
221 pushq $__USER32_CS 96 pushq $__USER32_CS
222 pushq %rcx 97 pushq %rcx
@@ -227,28 +102,27 @@ ENDPATCH(xen_sysret32)
227RELOC(xen_sysret32, 1b+1) 102RELOC(xen_sysret32, 1b+1)
228 103
229/* 104/*
230 Xen handles syscall callbacks much like ordinary exceptions, 105 * Xen handles syscall callbacks much like ordinary exceptions, which
231 which means we have: 106 * means we have:
232 - kernel gs 107 * - kernel gs
233 - kernel rsp 108 * - kernel rsp
234 - an iret-like stack frame on the stack (including rcx and r11): 109 * - an iret-like stack frame on the stack (including rcx and r11):
235 ss 110 * ss
236 rsp 111 * rsp
237 rflags 112 * rflags
238 cs 113 * cs
239 rip 114 * rip
240 r11 115 * r11
241 rsp-> rcx 116 * rsp->rcx
242 117 *
243 In all the entrypoints, we undo all that to make it look 118 * In all the entrypoints, we undo all that to make it look like a
244 like a CPU-generated syscall/sysenter and jump to the normal 119 * CPU-generated syscall/sysenter and jump to the normal entrypoint.
245 entrypoint.
246 */ 120 */
247 121
248.macro undo_xen_syscall 122.macro undo_xen_syscall
249 mov 0*8(%rsp),%rcx 123 mov 0*8(%rsp), %rcx
250 mov 1*8(%rsp),%r11 124 mov 1*8(%rsp), %r11
251 mov 5*8(%rsp),%rsp 125 mov 5*8(%rsp), %rsp
252.endm 126.endm
253 127
254/* Normal 64-bit system call target */ 128/* Normal 64-bit system call target */
@@ -275,7 +149,7 @@ ENDPROC(xen_sysenter_target)
275 149
276ENTRY(xen_syscall32_target) 150ENTRY(xen_syscall32_target)
277ENTRY(xen_sysenter_target) 151ENTRY(xen_sysenter_target)
278 lea 16(%rsp), %rsp /* strip %rcx,%r11 */ 152 lea 16(%rsp), %rsp /* strip %rcx, %r11 */
279 mov $-ENOSYS, %rax 153 mov $-ENOSYS, %rax
280 pushq $VGCF_in_syscall 154 pushq $VGCF_in_syscall
281 jmp hypercall_iret 155 jmp hypercall_iret
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c1f8faf0a2c5..2f5ef2632ea2 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,9 +10,12 @@
10extern const char xen_hypervisor_callback[]; 10extern const char xen_hypervisor_callback[];
11extern const char xen_failsafe_callback[]; 11extern const char xen_failsafe_callback[];
12 12
13extern void *xen_initial_gdt;
14
13struct trap_info; 15struct trap_info;
14void xen_copy_trap_info(struct trap_info *traps); 16void xen_copy_trap_info(struct trap_info *traps);
15 17
18DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
16DECLARE_PER_CPU(unsigned long, xen_cr3); 19DECLARE_PER_CPU(unsigned long, xen_cr3);
17DECLARE_PER_CPU(unsigned long, xen_current_cr3); 20DECLARE_PER_CPU(unsigned long, xen_current_cr3);
18 21
@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_shared_info;
22 25
23void xen_setup_mfn_list_list(void); 26void xen_setup_mfn_list_list(void);
24void xen_setup_shared_info(void); 27void xen_setup_shared_info(void);
28void xen_setup_machphys_mapping(void);
29pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void);
31void xen_reserve_top(void);
32
33void xen_leave_lazy(void);
34void xen_post_allocator_init(void);
25 35
26char * __init xen_memory_setup(void); 36char * __init xen_memory_setup(void);
27void __init xen_arch_setup(void); 37void __init xen_arch_setup(void);
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 719ee5c1c8d9..5b257a57bc57 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -107,7 +107,7 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
107/* 107/*
108 * Print cpu online, possible, present, and system maps 108 * Print cpu online, possible, present, and system maps
109 */ 109 */
110static ssize_t print_cpus_map(char *buf, cpumask_t *map) 110static ssize_t print_cpus_map(char *buf, const struct cpumask *map)
111{ 111{
112 int n = cpulist_scnprintf(buf, PAGE_SIZE-2, map); 112 int n = cpulist_scnprintf(buf, PAGE_SIZE-2, map);
113 113
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index a778fb52b11f..bf6b13206d00 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -31,7 +31,10 @@
31#include <linux/hardirq.h> 31#include <linux/hardirq.h>
32#include <linux/topology.h> 32#include <linux/topology.h>
33 33
34#define define_one_ro(_name) \ 34#define define_one_ro_named(_name, _func) \
35static SYSDEV_ATTR(_name, 0444, _func, NULL)
36
37#define define_one_ro(_name) \
35static SYSDEV_ATTR(_name, 0444, show_##_name, NULL) 38static SYSDEV_ATTR(_name, 0444, show_##_name, NULL)
36 39
37#define define_id_show_func(name) \ 40#define define_id_show_func(name) \
@@ -42,8 +45,8 @@ static ssize_t show_##name(struct sys_device *dev, \
42 return sprintf(buf, "%d\n", topology_##name(cpu)); \ 45 return sprintf(buf, "%d\n", topology_##name(cpu)); \
43} 46}
44 47
45#if defined(topology_thread_siblings) || defined(topology_core_siblings) 48#if defined(topology_thread_cpumask) || defined(topology_core_cpumask)
46static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) 49static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
47{ 50{
48 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; 51 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
49 int n = 0; 52 int n = 0;
@@ -65,7 +68,7 @@ static ssize_t show_##name(struct sys_device *dev, \
65 struct sysdev_attribute *attr, char *buf) \ 68 struct sysdev_attribute *attr, char *buf) \
66{ \ 69{ \
67 unsigned int cpu = dev->id; \ 70 unsigned int cpu = dev->id; \
68 return show_cpumap(0, &(topology_##name(cpu)), buf); \ 71 return show_cpumap(0, topology_##name(cpu), buf); \
69} 72}
70 73
71#define define_siblings_show_list(name) \ 74#define define_siblings_show_list(name) \
@@ -74,7 +77,7 @@ static ssize_t show_##name##_list(struct sys_device *dev, \
74 char *buf) \ 77 char *buf) \
75{ \ 78{ \
76 unsigned int cpu = dev->id; \ 79 unsigned int cpu = dev->id; \
77 return show_cpumap(1, &(topology_##name(cpu)), buf); \ 80 return show_cpumap(1, topology_##name(cpu), buf); \
78} 81}
79 82
80#else 83#else
@@ -82,9 +85,7 @@ static ssize_t show_##name##_list(struct sys_device *dev, \
82static ssize_t show_##name(struct sys_device *dev, \ 85static ssize_t show_##name(struct sys_device *dev, \
83 struct sysdev_attribute *attr, char *buf) \ 86 struct sysdev_attribute *attr, char *buf) \
84{ \ 87{ \
85 unsigned int cpu = dev->id; \ 88 return show_cpumap(0, topology_##name(dev->id), buf); \
86 cpumask_t mask = topology_##name(cpu); \
87 return show_cpumap(0, &mask, buf); \
88} 89}
89 90
90#define define_siblings_show_list(name) \ 91#define define_siblings_show_list(name) \
@@ -92,9 +93,7 @@ static ssize_t show_##name##_list(struct sys_device *dev, \
92 struct sysdev_attribute *attr, \ 93 struct sysdev_attribute *attr, \
93 char *buf) \ 94 char *buf) \
94{ \ 95{ \
95 unsigned int cpu = dev->id; \ 96 return show_cpumap(1, topology_##name(dev->id), buf); \
96 cpumask_t mask = topology_##name(cpu); \
97 return show_cpumap(1, &mask, buf); \
98} 97}
99#endif 98#endif
100 99
@@ -107,13 +106,13 @@ define_one_ro(physical_package_id);
107define_id_show_func(core_id); 106define_id_show_func(core_id);
108define_one_ro(core_id); 107define_one_ro(core_id);
109 108
110define_siblings_show_func(thread_siblings); 109define_siblings_show_func(thread_cpumask);
111define_one_ro(thread_siblings); 110define_one_ro_named(thread_siblings, show_thread_cpumask);
112define_one_ro(thread_siblings_list); 111define_one_ro_named(thread_siblings_list, show_thread_cpumask_list);
113 112
114define_siblings_show_func(core_siblings); 113define_siblings_show_func(core_cpumask);
115define_one_ro(core_siblings); 114define_one_ro_named(core_siblings, show_core_cpumask);
116define_one_ro(core_siblings_list); 115define_one_ro_named(core_siblings_list, show_core_cpumask_list);
117 116
118static struct attribute *default_attrs[] = { 117static struct attribute *default_attrs[] = {
119 &attr_physical_package_id.attr, 118 &attr_physical_package_id.attr,
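
A note on the macro rename above: define_one_ro_named() decouples the sysfs file name from the show function, so the user-visible attributes keep their historical names (thread_siblings, core_siblings) while the show functions follow the new *_cpumask topology macros. For instance, the first new definition expands (per the macro added in this hunk) to:

/* define_one_ro_named(thread_siblings, show_thread_cpumask); becomes: */
static SYSDEV_ATTR(thread_siblings, 0444, show_thread_cpumask, NULL);
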
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c
index 777fba48d2d3..3009e0171e54 100644
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -244,7 +244,7 @@ static ssize_t host_control_on_shutdown_store(struct device *dev,
244 */ 244 */
245int dcdbas_smi_request(struct smi_cmd *smi_cmd) 245int dcdbas_smi_request(struct smi_cmd *smi_cmd)
246{ 246{
247 cpumask_t old_mask; 247 cpumask_var_t old_mask;
248 int ret = 0; 248 int ret = 0;
249 249
250 if (smi_cmd->magic != SMI_CMD_MAGIC) { 250 if (smi_cmd->magic != SMI_CMD_MAGIC) {
@@ -254,8 +254,11 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
254 } 254 }
255 255
256 /* SMI requires CPU 0 */ 256 /* SMI requires CPU 0 */
257 old_mask = current->cpus_allowed; 257 if (!alloc_cpumask_var(&old_mask, GFP_KERNEL))
258 set_cpus_allowed_ptr(current, &cpumask_of_cpu(0)); 258 return -ENOMEM;
259
260 cpumask_copy(old_mask, &current->cpus_allowed);
261 set_cpus_allowed_ptr(current, cpumask_of(0));
259 if (smp_processor_id() != 0) { 262 if (smp_processor_id() != 0) {
260 dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n", 263 dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n",
261 __func__); 264 __func__);
@@ -275,7 +278,8 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
275 ); 278 );
276 279
277out: 280out:
278 set_cpus_allowed_ptr(current, &old_mask); 281 set_cpus_allowed_ptr(current, old_mask);
282 free_cpumask_var(old_mask);
279 return ret; 283 return ret;
280} 284}
281 285
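
The dcdbas change is one instance of the cpumask_var_t save/pin/restore pattern this series applies wherever a task must temporarily run on one CPU. A minimal sketch of that pattern with hypothetical names (error handling reduced to the essentials):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/smp.h>

static int run_on_cpu0(void (*fn)(void *), void *arg)
{
	cpumask_var_t saved;
	int ret = 0;

	if (!alloc_cpumask_var(&saved, GFP_KERNEL))	/* may be off-stack */
		return -ENOMEM;

	cpumask_copy(saved, &current->cpus_allowed);	/* remember old mask */
	set_cpus_allowed_ptr(current, cpumask_of(0));	/* pin to CPU 0 */

	if (smp_processor_id() != 0)
		ret = -EBUSY;				/* migration did not happen */
	else
		fn(arg);

	set_cpus_allowed_ptr(current, saved);		/* restore old mask */
	free_cpumask_var(saved);
	return ret;
}
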
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index c64e6798878a..1c484084ed4f 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -162,7 +162,7 @@ config ENCLOSURE_SERVICES
162config SGI_XP 162config SGI_XP
163 tristate "Support communication between SGI SSIs" 163 tristate "Support communication between SGI SSIs"
164 depends on NET 164 depends on NET
165 depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP 165 depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_UV) && SMP
166 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2 166 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
167 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2 167 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
168 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP 168 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP
@@ -189,7 +189,7 @@ config HP_ILO
189 189
190config SGI_GRU 190config SGI_GRU
191 tristate "SGI GRU driver" 191 tristate "SGI GRU driver"
192 depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP 192 depends on (X86_UV || IA64_SGI_UV || IA64_GENERIC) && SMP
193 default n 193 default n
194 select MMU_NOTIFIER 194 select MMU_NOTIFIER
195 ---help--- 195 ---help---
diff --git a/drivers/misc/sgi-gru/gru.h b/drivers/misc/sgi-gru/gru.h
index f93f03a9e6e9..1b5f579df15f 100644
--- a/drivers/misc/sgi-gru/gru.h
+++ b/drivers/misc/sgi-gru/gru.h
@@ -19,6 +19,8 @@
19#ifndef __GRU_H__ 19#ifndef __GRU_H__
20#define __GRU_H__ 20#define __GRU_H__
21 21
22#include <asm/uv/uv.h>
23
22/* 24/*
23 * GRU architectural definitions 25 * GRU architectural definitions
24 */ 26 */
diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h
index 7b4cbd5e03e9..069ad3a1c2ac 100644
--- a/drivers/misc/sgi-xp/xp.h
+++ b/drivers/misc/sgi-xp/xp.h
@@ -15,6 +15,8 @@
15 15
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18#include <asm/uv/uv.h>
19
18#ifdef CONFIG_IA64 20#ifdef CONFIG_IA64
19#include <asm/system.h> 21#include <asm/system.h>
20#include <asm/sn/arch.h> /* defines is_shub1() and is_shub2() */ 22#include <asm/sn/arch.h> /* defines is_shub1() and is_shub2() */
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
index 89218f7cfaa7..6576170de962 100644
--- a/drivers/misc/sgi-xp/xpc_main.c
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -318,7 +318,7 @@ xpc_hb_checker(void *ignore)
318 318
319 /* this thread was marked active by xpc_hb_init() */ 319 /* this thread was marked active by xpc_hb_init() */
320 320
321 set_cpus_allowed_ptr(current, &cpumask_of_cpu(XPC_HB_CHECK_CPU)); 321 set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU));
322 322
323 /* set our heartbeating to other partitions into motion */ 323 /* set our heartbeating to other partitions into motion */
324 xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ); 324 xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index ab0e09bf154d..847e9bb0098f 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -854,20 +854,27 @@ static void efx_fini_io(struct efx_nic *efx)
854 * interrupts across them. */ 854 * interrupts across them. */
855static int efx_wanted_rx_queues(void) 855static int efx_wanted_rx_queues(void)
856{ 856{
857 cpumask_t core_mask; 857 cpumask_var_t core_mask;
858 int count; 858 int count;
859 int cpu; 859 int cpu;
860 860
861 cpus_clear(core_mask); 861 if (!alloc_cpumask_var(&core_mask, GFP_KERNEL)) {
862 printk(KERN_WARNING
863 "efx.c: allocation failure, irq balancing hobbled\n");
864 return 1;
865 }
866
867 cpumask_clear(core_mask);
862 count = 0; 868 count = 0;
863 for_each_online_cpu(cpu) { 869 for_each_online_cpu(cpu) {
864 if (!cpu_isset(cpu, core_mask)) { 870 if (!cpumask_test_cpu(cpu, core_mask)) {
865 ++count; 871 ++count;
866 cpus_or(core_mask, core_mask, 872 cpumask_or(core_mask, core_mask,
867 topology_core_siblings(cpu)); 873 topology_core_cpumask(cpu));
868 } 874 }
869 } 875 }
870 876
877 free_cpumask_var(core_mask);
871 return count; 878 return count;
872} 879}
873 880
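
The reworked efx_wanted_rx_queues() counts physical cores rather than logical CPUs: the first time an online CPU is seen that is not yet covered, its whole core-sibling mask is OR-ed into the accumulator, so hyperthread siblings are counted only once. The same idiom, reduced to a standalone sketch (illustrative name):

#include <linux/cpumask.h>
#include <linux/topology.h>

static int count_online_cores(void)
{
	cpumask_var_t seen;
	int count = 0, cpu;

	if (!alloc_cpumask_var(&seen, GFP_KERNEL))
		return 1;			/* degrade gracefully */

	cpumask_clear(seen);
	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, seen)) {
			count++;		/* first CPU of a new core */
			cpumask_or(seen, seen, topology_core_cpumask(cpu));
		}
	}

	free_cpumask_var(seen);
	return count;
}
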
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index 9da5a4b81133..c3ea5fa7d05a 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -38,7 +38,7 @@
38 38
39static LIST_HEAD(dying_tasks); 39static LIST_HEAD(dying_tasks);
40static LIST_HEAD(dead_tasks); 40static LIST_HEAD(dead_tasks);
41static cpumask_t marked_cpus = CPU_MASK_NONE; 41static cpumask_var_t marked_cpus;
42static DEFINE_SPINLOCK(task_mortuary); 42static DEFINE_SPINLOCK(task_mortuary);
43static void process_task_mortuary(void); 43static void process_task_mortuary(void);
44 44
@@ -456,10 +456,10 @@ static void mark_done(int cpu)
456{ 456{
457 int i; 457 int i;
458 458
459 cpu_set(cpu, marked_cpus); 459 cpumask_set_cpu(cpu, marked_cpus);
460 460
461 for_each_online_cpu(i) { 461 for_each_online_cpu(i) {
462 if (!cpu_isset(i, marked_cpus)) 462 if (!cpumask_test_cpu(i, marked_cpus))
463 return; 463 return;
464 } 464 }
465 465
@@ -468,7 +468,7 @@ static void mark_done(int cpu)
468 */ 468 */
469 process_task_mortuary(); 469 process_task_mortuary();
470 470
471 cpus_clear(marked_cpus); 471 cpumask_clear(marked_cpus);
472} 472}
473 473
474 474
@@ -565,6 +565,20 @@ void sync_buffer(int cpu)
565 mutex_unlock(&buffer_mutex); 565 mutex_unlock(&buffer_mutex);
566} 566}
567 567
568int __init buffer_sync_init(void)
569{
570 if (!alloc_cpumask_var(&marked_cpus, GFP_KERNEL))
571 return -ENOMEM;
572
573 cpumask_clear(marked_cpus);
574 return 0;
575}
576
577void __exit buffer_sync_cleanup(void)
578{
579 free_cpumask_var(marked_cpus);
580}
581
568/* The function can be used to add a buffer worth of data directly to 582/* The function can be used to add a buffer worth of data directly to
569 * the kernel buffer. The buffer is assumed to be a circular buffer. 583 * the kernel buffer. The buffer is assumed to be a circular buffer.
570 * Take the entries from index start and end at index end, wrapping 584 * Take the entries from index start and end at index end, wrapping
diff --git a/drivers/oprofile/buffer_sync.h b/drivers/oprofile/buffer_sync.h
index 3110732c1835..0ebf5db62679 100644
--- a/drivers/oprofile/buffer_sync.h
+++ b/drivers/oprofile/buffer_sync.h
@@ -19,4 +19,8 @@ void sync_stop(void);
19/* sync the given CPU's buffer */ 19/* sync the given CPU's buffer */
20void sync_buffer(int cpu); 20void sync_buffer(int cpu);
21 21
22/* initialize/destroy the buffer system. */
23int buffer_sync_init(void);
24void buffer_sync_cleanup(void);
25
22#endif /* OPROFILE_BUFFER_SYNC_H */ 26#endif /* OPROFILE_BUFFER_SYNC_H */
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index 3cffce90f82a..ced39f602292 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -183,6 +183,10 @@ static int __init oprofile_init(void)
183{ 183{
184 int err; 184 int err;
185 185
186 err = buffer_sync_init();
187 if (err)
188 return err;
189
186 err = oprofile_arch_init(&oprofile_ops); 190 err = oprofile_arch_init(&oprofile_ops);
187 191
188 if (err < 0 || timer) { 192 if (err < 0 || timer) {
@@ -191,8 +195,10 @@ static int __init oprofile_init(void)
191 } 195 }
192 196
193 err = oprofilefs_register(); 197 err = oprofilefs_register();
194 if (err) 198 if (err) {
195 oprofile_arch_exit(); 199 oprofile_arch_exit();
200 buffer_sync_cleanup();
201 }
196 202
197 return err; 203 return err;
198} 204}
@@ -202,6 +208,7 @@ static void __exit oprofile_exit(void)
202{ 208{
203 oprofilefs_unregister(); 209 oprofilefs_unregister();
204 oprofile_arch_exit(); 210 oprofile_arch_exit();
211 buffer_sync_cleanup();
205} 212}
206 213
207 214
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index f78371b22529..5a57753ea9fc 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -6,6 +6,7 @@
6#include <linux/irq.h> 6#include <linux/irq.h>
7#include <asm/io_apic.h> 7#include <asm/io_apic.h>
8#include <asm/smp.h> 8#include <asm/smp.h>
9#include <asm/cpu.h>
9#include <linux/intel-iommu.h> 10#include <linux/intel-iommu.h>
10#include "intr_remapping.h" 11#include "intr_remapping.h"
11 12
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index eb0dfdeaa949..3141e149d595 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -26,6 +26,7 @@
26#include <linux/irq.h> 26#include <linux/irq.h>
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/bootmem.h>
29 30
30#include <asm/ptrace.h> 31#include <asm/ptrace.h>
31#include <asm/irq.h> 32#include <asm/irq.h>
@@ -75,7 +76,14 @@ enum {
75static int evtchn_to_irq[NR_EVENT_CHANNELS] = { 76static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
76 [0 ... NR_EVENT_CHANNELS-1] = -1 77 [0 ... NR_EVENT_CHANNELS-1] = -1
77}; 78};
78static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; 79struct cpu_evtchn_s {
80 unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
81};
82static struct cpu_evtchn_s *cpu_evtchn_mask_p;
83static inline unsigned long *cpu_evtchn_mask(int cpu)
84{
85 return cpu_evtchn_mask_p[cpu].bits;
86}
79static u8 cpu_evtchn[NR_EVENT_CHANNELS]; 87static u8 cpu_evtchn[NR_EVENT_CHANNELS];
80 88
81/* Reference counts for bindings to IRQs. */ 89/* Reference counts for bindings to IRQs. */
@@ -115,7 +123,7 @@ static inline unsigned long active_evtchns(unsigned int cpu,
115 unsigned int idx) 123 unsigned int idx)
116{ 124{
117 return (sh->evtchn_pending[idx] & 125 return (sh->evtchn_pending[idx] &
118 cpu_evtchn_mask[cpu][idx] & 126 cpu_evtchn_mask(cpu)[idx] &
119 ~sh->evtchn_mask[idx]); 127 ~sh->evtchn_mask[idx]);
120} 128}
121 129
@@ -125,11 +133,11 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
125 133
126 BUG_ON(irq == -1); 134 BUG_ON(irq == -1);
127#ifdef CONFIG_SMP 135#ifdef CONFIG_SMP
128 irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu); 136 cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
129#endif 137#endif
130 138
131 __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); 139 __clear_bit(chn, cpu_evtchn_mask(cpu_evtchn[chn]));
132 __set_bit(chn, cpu_evtchn_mask[cpu]); 140 __set_bit(chn, cpu_evtchn_mask(cpu));
133 141
134 cpu_evtchn[chn] = cpu; 142 cpu_evtchn[chn] = cpu;
135} 143}
@@ -142,12 +150,12 @@ static void init_evtchn_cpu_bindings(void)
142 150
143 /* By default all event channels notify CPU#0. */ 151 /* By default all event channels notify CPU#0. */
144 for_each_irq_desc(i, desc) { 152 for_each_irq_desc(i, desc) {
145 desc->affinity = cpumask_of_cpu(0); 153 cpumask_copy(desc->affinity, cpumask_of(0));
146 } 154 }
147#endif 155#endif
148 156
149 memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); 157 memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
150 memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); 158 memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0)));
151} 159}
152 160
153static inline unsigned int cpu_from_evtchn(unsigned int evtchn) 161static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
@@ -822,6 +830,10 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
822void __init xen_init_IRQ(void) 830void __init xen_init_IRQ(void)
823{ 831{
824 int i; 832 int i;
833 size_t size = nr_cpu_ids * sizeof(struct cpu_evtchn_s);
834
835 cpu_evtchn_mask_p = alloc_bootmem(size);
836 BUG_ON(cpu_evtchn_mask_p == NULL);
825 837
826 init_evtchn_cpu_bindings(); 838 init_evtchn_cpu_bindings();
827 839
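
The cpu_evtchn_mask rework above swaps a static [NR_CPUS] table for a boot-time allocation sized by nr_cpu_ids, hidden behind an inline accessor so callers only change from array indexing to a function call. The same pattern reduced to its skeleton (identifiers and sizes are illustrative, not from the driver):

#include <linux/bootmem.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>

#define EXAMPLE_NR_BITS		1024	/* illustrative table width */

struct example_cpu_bits {
	unsigned long bits[EXAMPLE_NR_BITS / BITS_PER_LONG];
};
static struct example_cpu_bits *example_bits_p;

static inline unsigned long *example_bits(int cpu)
{
	return example_bits_p[cpu].bits;
}

static void __init example_bits_init(void)
{
	/* nr_cpu_ids entries instead of NR_CPUS: nothing wasted on small boxes */
	example_bits_p = alloc_bootmem(nr_cpu_ids * sizeof(struct example_cpu_bits));
	BUG_ON(example_bits_p == NULL);
}
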
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 9b91617b9582..e7e83b65c18f 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -100,7 +100,7 @@ static void do_suspend(void)
100 /* XXX use normal device tree? */ 100 /* XXX use normal device tree? */
101 xenbus_suspend(); 101 xenbus_suspend();
102 102
103 err = stop_machine(xen_suspend, &cancelled, &cpumask_of_cpu(0)); 103 err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
104 if (err) { 104 if (err) {
105 printk(KERN_ERR "failed to start xen_suspend: %d\n", err); 105 printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
106 goto out; 106 goto out;
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index b0e63c672ebd..00f45ff081a6 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
80#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \ 80#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
81 __typeof__(type) per_cpu_var(name) 81 __typeof__(type) per_cpu_var(name)
82 82
83/*
84 * Optional methods for optimized non-lvalue per-cpu variable access.
85 *
 86 * @var can be a percpu variable or a field of one, and its size should
 87 * equal that of char, int or long. percpu_read() evaluates to an lvalue and
88 * all others to void.
89 *
90 * These operations are guaranteed to be atomic w.r.t. preemption.
91 * The generic versions use plain get/put_cpu_var(). Archs are
92 * encouraged to implement single-instruction alternatives which don't
93 * require preemption protection.
94 */
95#ifndef percpu_read
96# define percpu_read(var) \
97 ({ \
98 typeof(per_cpu_var(var)) __tmp_var__; \
99 __tmp_var__ = get_cpu_var(var); \
100 put_cpu_var(var); \
101 __tmp_var__; \
102 })
103#endif
104
105#define __percpu_generic_to_op(var, val, op) \
106do { \
107 get_cpu_var(var) op val; \
108 put_cpu_var(var); \
109} while (0)
110
111#ifndef percpu_write
112# define percpu_write(var, val) __percpu_generic_to_op(var, (val), =)
113#endif
114
115#ifndef percpu_add
116# define percpu_add(var, val) __percpu_generic_to_op(var, (val), +=)
117#endif
118
119#ifndef percpu_sub
120# define percpu_sub(var, val) __percpu_generic_to_op(var, (val), -=)
121#endif
122
123#ifndef percpu_and
124# define percpu_and(var, val) __percpu_generic_to_op(var, (val), &=)
125#endif
126
127#ifndef percpu_or
128# define percpu_or(var, val) __percpu_generic_to_op(var, (val), |=)
129#endif
130
131#ifndef percpu_xor
132# define percpu_xor(var, val) __percpu_generic_to_op(var, (val), ^=)
133#endif
134
83#endif /* _ASM_GENERIC_PERCPU_H_ */ 135#endif /* _ASM_GENERIC_PERCPU_H_ */
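
A short usage sketch for the new optional accessors (hypothetical per-cpu counter; on architectures that provide no optimized versions, these resolve to the get/put_cpu_var() fallbacks defined above):

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_hits);	/* illustrative variable */

static void example_hit(void)
{
	percpu_add(example_hits, 1);		/* preemption-safe increment */
}

static unsigned long example_read_hits(void)
{
	return percpu_read(example_hits);	/* preemption-safe read */
}

static void example_reset_hits(void)
{
	percpu_write(example_hits, 0);
}
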
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 79a7ff925bf8..4ce48e878530 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -9,7 +9,7 @@ extern char __bss_start[], __bss_stop[];
9extern char __init_begin[], __init_end[]; 9extern char __init_begin[], __init_end[];
10extern char _sinittext[], _einittext[]; 10extern char _sinittext[], _einittext[];
11extern char _end[]; 11extern char _end[];
12extern char __per_cpu_start[], __per_cpu_end[]; 12extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
13extern char __kprobes_text_start[], __kprobes_text_end[]; 13extern char __kprobes_text_start[], __kprobes_text_end[];
14extern char __initdata_begin[], __initdata_end[]; 14extern char __initdata_begin[], __initdata_end[];
15extern char __start_rodata[], __end_rodata[]; 15extern char __start_rodata[], __end_rodata[];
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index c61fab1dd2f8..5406e70aba86 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -430,12 +430,59 @@
430 *(.initcall7.init) \ 430 *(.initcall7.init) \
431 *(.initcall7s.init) 431 *(.initcall7s.init)
432 432
433/**
434 * PERCPU_VADDR - define output section for percpu area
435 * @vaddr: explicit base address (optional)
436 * @phdr: destination PHDR (optional)
437 *
 438 * Macro which expands to the output section for the percpu area. If @vaddr
 439 * is not blank, it specifies an explicit base address and all percpu
440 * symbols will be offset from the given address. If blank, @vaddr
441 * always equals @laddr + LOAD_OFFSET.
442 *
443 * @phdr defines the output PHDR to use if not blank. Be warned that
444 * output PHDR is sticky. If @phdr is specified, the next output
445 * section in the linker script will go there too. @phdr should have
446 * a leading colon.
447 *
 448 * Note that this macro defines __per_cpu_load as an absolute symbol.
449 * If there is no need to put the percpu section at a predetermined
450 * address, use PERCPU().
451 */
452#define PERCPU_VADDR(vaddr, phdr) \
453 VMLINUX_SYMBOL(__per_cpu_load) = .; \
454 .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \
455 - LOAD_OFFSET) { \
456 VMLINUX_SYMBOL(__per_cpu_start) = .; \
457 *(.data.percpu.first) \
458 *(.data.percpu.page_aligned) \
459 *(.data.percpu) \
460 *(.data.percpu.shared_aligned) \
461 VMLINUX_SYMBOL(__per_cpu_end) = .; \
462 } phdr \
463 . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
464
465/**
466 * PERCPU - define output section for percpu area, simple version
467 * @align: required alignment
468 *
 469 * Aligns to @align and outputs the output section for the percpu area.
 470 * This macro doesn't manipulate @vaddr or @phdr, and __per_cpu_load and
471 * __per_cpu_start will be identical.
472 *
473 * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except
474 * that __per_cpu_load is defined as a relative symbol against
 475 * .data.percpu, which is required for the relocatable x86_32
476 * configuration.
477 */
433#define PERCPU(align) \ 478#define PERCPU(align) \
434 . = ALIGN(align); \ 479 . = ALIGN(align); \
435 VMLINUX_SYMBOL(__per_cpu_start) = .; \ 480 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \
436 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \ 481 VMLINUX_SYMBOL(__per_cpu_load) = .; \
482 VMLINUX_SYMBOL(__per_cpu_start) = .; \
483 *(.data.percpu.first) \
437 *(.data.percpu.page_aligned) \ 484 *(.data.percpu.page_aligned) \
438 *(.data.percpu) \ 485 *(.data.percpu) \
439 *(.data.percpu.shared_aligned) \ 486 *(.data.percpu.shared_aligned) \
440 } \ 487 VMLINUX_SYMBOL(__per_cpu_end) = .; \
441 VMLINUX_SYMBOL(__per_cpu_end) = .; 488 }
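
How an architecture's vmlinux.lds.S would pick between the two forms (the alignment, address and PHDR name below are placeholders, not taken from any particular arch script):

/* Common case: let the section sit at the default link address. */
PERCPU(PAGE_SIZE)

/* Explicit-address case: base percpu symbols at vaddr 0 and route the
 * section into its own program header (":percpu" must exist in PHDRS). */
PERCPU_VADDR(0, :percpu)
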
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9127f6b51a39..472f11765f60 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -467,6 +467,7 @@ int show_interrupts(struct seq_file *p, void *v);
467struct irq_desc; 467struct irq_desc;
468 468
469extern int early_irq_init(void); 469extern int early_irq_init(void);
470extern int arch_probe_nr_irqs(void);
470extern int arch_early_irq_init(void); 471extern int arch_early_irq_init(void);
471extern int arch_init_chip_data(struct irq_desc *desc, int cpu); 472extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
472 473
diff --git a/include/linux/irq.h b/include/linux/irq.h
index f899b502f186..27a67536511e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -182,11 +182,11 @@ struct irq_desc {
182 unsigned int irqs_unhandled; 182 unsigned int irqs_unhandled;
183 spinlock_t lock; 183 spinlock_t lock;
184#ifdef CONFIG_SMP 184#ifdef CONFIG_SMP
185 cpumask_t affinity; 185 cpumask_var_t affinity;
186 unsigned int cpu; 186 unsigned int cpu;
187#endif
188#ifdef CONFIG_GENERIC_PENDING_IRQ 187#ifdef CONFIG_GENERIC_PENDING_IRQ
189 cpumask_t pending_mask; 188 cpumask_var_t pending_mask;
189#endif
190#endif 190#endif
191#ifdef CONFIG_PROC_FS 191#ifdef CONFIG_PROC_FS
192 struct proc_dir_entry *dir; 192 struct proc_dir_entry *dir;
@@ -422,4 +422,84 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
422 422
423#endif /* !CONFIG_S390 */ 423#endif /* !CONFIG_S390 */
424 424
425#ifdef CONFIG_SMP
426/**
427 * init_alloc_desc_masks - allocate cpumasks for irq_desc
428 * @desc: pointer to irq_desc struct
429 * @cpu: cpu which will be handling the cpumasks
 430 * @boot: true if the allocation must come from bootmem
431 *
432 * Allocates affinity and pending_mask cpumask if required.
433 * Returns true if successful (or not required).
434 * Side effect: affinity has all bits set, pending_mask has all bits clear.
435 */
436static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
437 bool boot)
438{
439 int node;
440
441 if (boot) {
442 alloc_bootmem_cpumask_var(&desc->affinity);
443 cpumask_setall(desc->affinity);
444
445#ifdef CONFIG_GENERIC_PENDING_IRQ
446 alloc_bootmem_cpumask_var(&desc->pending_mask);
447 cpumask_clear(desc->pending_mask);
448#endif
449 return true;
450 }
451
452 node = cpu_to_node(cpu);
453
454 if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
455 return false;
456 cpumask_setall(desc->affinity);
457
458#ifdef CONFIG_GENERIC_PENDING_IRQ
459 if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
460 free_cpumask_var(desc->affinity);
461 return false;
462 }
463 cpumask_clear(desc->pending_mask);
464#endif
465 return true;
466}
467
468/**
469 * init_copy_desc_masks - copy cpumasks for irq_desc
470 * @old_desc: pointer to old irq_desc struct
471 * @new_desc: pointer to new irq_desc struct
472 *
 473 * Ensures affinity and pending_mask are copied to the new irq_desc.
474 * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the
475 * irq_desc struct so the copy is redundant.
476 */
477
478static inline void init_copy_desc_masks(struct irq_desc *old_desc,
479 struct irq_desc *new_desc)
480{
481#ifdef CONFIG_CPUMASKS_OFFSTACK
482 cpumask_copy(new_desc->affinity, old_desc->affinity);
483
484#ifdef CONFIG_GENERIC_PENDING_IRQ
485 cpumask_copy(new_desc->pending_mask, old_desc->pending_mask);
486#endif
487#endif
488}
489
490#else /* !CONFIG_SMP */
491
492static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
493 bool boot)
494{
495 return true;
496}
497
498static inline void init_copy_desc_masks(struct irq_desc *old_desc,
499 struct irq_desc *new_desc)
500{
501}
502
503#endif /* CONFIG_SMP */
504
425#endif /* _LINUX_IRQ_H */ 505#endif /* _LINUX_IRQ_H */
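
The user-visible effect of switching affinity and pending_mask to cpumask_var_t is that they behave as pointers once allocated, so the &desc->affinity idiom used throughout the tree goes away. A small sketch of the new calling convention (assumes the masks were set up by init_alloc_desc_masks()):

#include <linux/irq.h>

#ifdef CONFIG_SMP
static void example_reset_masks(struct irq_desc *desc)
{
	/* previously: cpumask_setall(&desc->affinity); */
	cpumask_setall(desc->affinity);
#ifdef CONFIG_GENERIC_PENDING_IRQ
	/* previously: cpumask_clear(&desc->pending_mask); */
	cpumask_clear(desc->pending_mask);
#endif
}
#endif
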
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 86af92e9e84c..887477bc2ab0 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -20,6 +20,7 @@
20 20
21# define for_each_irq_desc_reverse(irq, desc) \ 21# define for_each_irq_desc_reverse(irq, desc) \
22 for (irq = nr_irqs - 1; irq >= 0; irq--) 22 for (irq = nr_irqs - 1; irq >= 0; irq--)
23
23#else /* CONFIG_GENERIC_HARDIRQS */ 24#else /* CONFIG_GENERIC_HARDIRQS */
24 25
25extern int nr_irqs; 26extern int nr_irqs;
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 0b4df7eba852..5b4e28bcb788 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -49,4 +49,5 @@
49#define FUTEXFS_SUPER_MAGIC 0xBAD1DEA 49#define FUTEXFS_SUPER_MAGIC 0xBAD1DEA
50#define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA 50#define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA
51 51
52#define STACK_END_MAGIC 0x57AC6E9D
52#endif /* __LINUX_MAGIC_H__ */ 53#endif /* __LINUX_MAGIC_H__ */
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 9f2a3751873a..3577ffd90d45 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -8,35 +8,46 @@
8 8
9#include <asm/percpu.h> 9#include <asm/percpu.h>
10 10
11#ifndef PER_CPU_BASE_SECTION
12#ifdef CONFIG_SMP
13#define PER_CPU_BASE_SECTION ".data.percpu"
14#else
15#define PER_CPU_BASE_SECTION ".data"
16#endif
17#endif
18
11#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
12#define DEFINE_PER_CPU(type, name) \
13 __attribute__((__section__(".data.percpu"))) \
14 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
15 20
16#ifdef MODULE 21#ifdef MODULE
17#define SHARED_ALIGNED_SECTION ".data.percpu" 22#define PER_CPU_SHARED_ALIGNED_SECTION ""
18#else 23#else
19#define SHARED_ALIGNED_SECTION ".data.percpu.shared_aligned" 24#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
20#endif 25#endif
26#define PER_CPU_FIRST_SECTION ".first"
21 27
22#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ 28#else
23 __attribute__((__section__(SHARED_ALIGNED_SECTION))) \
24 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name \
25 ____cacheline_aligned_in_smp
26 29
27#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ 30#define PER_CPU_SHARED_ALIGNED_SECTION ""
28 __attribute__((__section__(".data.percpu.page_aligned"))) \ 31#define PER_CPU_FIRST_SECTION ""
32
33#endif
34
35#define DEFINE_PER_CPU_SECTION(type, name, section) \
36 __attribute__((__section__(PER_CPU_BASE_SECTION section))) \
29 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name 37 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
30#else 38
31#define DEFINE_PER_CPU(type, name) \ 39#define DEFINE_PER_CPU(type, name) \
32 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name 40 DEFINE_PER_CPU_SECTION(type, name, "")
33 41
34#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ 42#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
35 DEFINE_PER_CPU(type, name) 43 DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
44 ____cacheline_aligned_in_smp
36 45
37#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ 46#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
38 DEFINE_PER_CPU(type, name) 47 DEFINE_PER_CPU_SECTION(type, name, ".page_aligned")
39#endif 48
49#define DEFINE_PER_CPU_FIRST(type, name) \
50 DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION)
40 51
41#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) 52#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
42#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) 53#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2127e959e0f4..28b3f505f189 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1161,10 +1161,9 @@ struct task_struct {
1161 pid_t pid; 1161 pid_t pid;
1162 pid_t tgid; 1162 pid_t tgid;
1163 1163
1164#ifdef CONFIG_CC_STACKPROTECTOR
1165 /* Canary value for the -fstack-protector gcc feature */ 1164 /* Canary value for the -fstack-protector gcc feature */
1166 unsigned long stack_canary; 1165 unsigned long stack_canary;
1167#endif 1166
1168 /* 1167 /*
1169 * pointers to (original) parent process, youngest child, younger sibling, 1168 * pointers to (original) parent process, youngest child, younger sibling,
1170 * older sibling, respectively. (p->father can be replaced with 1169 * older sibling, respectively. (p->father can be replaced with
@@ -2070,6 +2069,19 @@ static inline int object_is_on_stack(void *obj)
2070 2069
2071extern void thread_info_cache_init(void); 2070extern void thread_info_cache_init(void);
2072 2071
2072#ifdef CONFIG_DEBUG_STACK_USAGE
2073static inline unsigned long stack_not_used(struct task_struct *p)
2074{
2075 unsigned long *n = end_of_stack(p);
2076
2077 do { /* Skip over canary */
2078 n++;
2079 } while (!*n);
2080
2081 return (unsigned long)n - (unsigned long)end_of_stack(p);
2082}
2083#endif
2084
2073/* set thread flags in other task's structures 2085/* set thread flags in other task's structures
2074 * - see asm/thread_info.h for TIF_xxxx flags available 2086 * - see asm/thread_info.h for TIF_xxxx flags available
2075 */ 2087 */
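
stack_not_used() depends on two properties arranged elsewhere in this series: dup_task_struct() writes a sentinel (STACK_END_MAGIC) into the word at end_of_stack(), and the bytes above it stay zero until the stack actually grows over them, so walking past the sentinel and over the zeros measures the headroom. A hypothetical debugging use:

#include <linux/sched.h>

#ifdef CONFIG_DEBUG_STACK_USAGE
static void example_report_headroom(struct task_struct *p)
{
	/* bytes between the end-of-stack sentinel and the deepest-used word */
	unsigned long unused = stack_not_used(p);

	printk(KERN_DEBUG "%s/%d never touched %lu stack bytes\n",
	       p->comm, task_pid_nr(p), unused);
}
#endif
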
diff --git a/include/linux/stackprotector.h b/include/linux/stackprotector.h
new file mode 100644
index 000000000000..6f3e54c704c0
--- /dev/null
+++ b/include/linux/stackprotector.h
@@ -0,0 +1,16 @@
1#ifndef _LINUX_STACKPROTECTOR_H
2#define _LINUX_STACKPROTECTOR_H 1
3
4#include <linux/compiler.h>
5#include <linux/sched.h>
6#include <linux/random.h>
7
8#ifdef CONFIG_CC_STACKPROTECTOR
9# include <asm/stackprotector.h>
10#else
11static inline void boot_init_stack_canary(void)
12{
13}
14#endif
15
16#endif
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e632d29f0544..a16b9e06f2e5 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -193,5 +193,11 @@ int arch_update_cpu_topology(void);
193#ifndef topology_core_siblings 193#ifndef topology_core_siblings
194#define topology_core_siblings(cpu) cpumask_of_cpu(cpu) 194#define topology_core_siblings(cpu) cpumask_of_cpu(cpu)
195#endif 195#endif
196#ifndef topology_thread_cpumask
197#define topology_thread_cpumask(cpu) cpumask_of(cpu)
198#endif
199#ifndef topology_core_cpumask
200#define topology_core_cpumask(cpu) cpumask_of(cpu)
201#endif
196 202
197#endif /* _LINUX_TOPOLOGY_H */ 203#endif /* _LINUX_TOPOLOGY_H */
diff --git a/init/main.c b/init/main.c
index 844209453c02..bfe4fb0c9842 100644
--- a/init/main.c
+++ b/init/main.c
@@ -14,6 +14,7 @@
14#include <linux/proc_fs.h> 14#include <linux/proc_fs.h>
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/stackprotector.h>
17#include <linux/string.h> 18#include <linux/string.h>
18#include <linux/ctype.h> 19#include <linux/ctype.h>
19#include <linux/delay.h> 20#include <linux/delay.h>
@@ -539,6 +540,12 @@ asmlinkage void __init start_kernel(void)
539 */ 540 */
540 lockdep_init(); 541 lockdep_init();
541 debug_objects_early_init(); 542 debug_objects_early_init();
543
544 /*
 545 * Set up the initial canary ASAP:
546 */
547 boot_init_stack_canary();
548
542 cgroup_init_early(); 549 cgroup_init_early();
543 550
544 local_irq_disable(); 551 local_irq_disable();
diff --git a/kernel/exit.c b/kernel/exit.c
index f80dec3f1875..70612c19ac96 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -977,12 +977,9 @@ static void check_stack_usage(void)
977{ 977{
978 static DEFINE_SPINLOCK(low_water_lock); 978 static DEFINE_SPINLOCK(low_water_lock);
979 static int lowest_to_date = THREAD_SIZE; 979 static int lowest_to_date = THREAD_SIZE;
980 unsigned long *n = end_of_stack(current);
981 unsigned long free; 980 unsigned long free;
982 981
983 while (*n == 0) 982 free = stack_not_used(current);
984 n++;
985 free = (unsigned long)n - (unsigned long)end_of_stack(current);
986 983
987 if (free >= lowest_to_date) 984 if (free >= lowest_to_date)
988 return; 985 return;
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d5dbb7a13e2..c078438ba6fc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,6 +61,7 @@
61#include <linux/proc_fs.h> 61#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <trace/sched.h> 63#include <trace/sched.h>
64#include <linux/magic.h>
64 65
65#include <asm/pgtable.h> 66#include <asm/pgtable.h>
66#include <asm/pgalloc.h> 67#include <asm/pgalloc.h>
@@ -212,6 +213,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
212{ 213{
213 struct task_struct *tsk; 214 struct task_struct *tsk;
214 struct thread_info *ti; 215 struct thread_info *ti;
216 unsigned long *stackend;
217
215 int err; 218 int err;
216 219
217 prepare_to_copy(orig); 220 prepare_to_copy(orig);
@@ -237,6 +240,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
237 goto out; 240 goto out;
238 241
239 setup_thread_stack(tsk, orig); 242 setup_thread_stack(tsk, orig);
243 stackend = end_of_stack(tsk);
244 *stackend = STACK_END_MAGIC; /* for overflow detection */
240 245
241#ifdef CONFIG_CC_STACKPROTECTOR 246#ifdef CONFIG_CC_STACKPROTECTOR
242 tsk->stack_canary = get_random_int(); 247 tsk->stack_canary = get_random_int();
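
With the sentinel written at fork time, an architecture's fault or oops path can cheaply detect a runaway kernel stack by checking whether that last word has been overwritten. A hypothetical check, not part of this patch:

#include <linux/magic.h>
#include <linux/sched.h>

static inline bool example_stack_overflowed(struct task_struct *tsk)
{
	unsigned long *stackend = end_of_stack(tsk);

	/* dup_task_struct() stored STACK_END_MAGIC here; anything else
	 * means the thread overran its stack at some point. */
	return *stackend != STACK_END_MAGIC;
}
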
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 7de11bd64dfe..122fef4b0bd3 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq)
46 desc->irq_count = 0; 46 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 47 desc->irqs_unhandled = 0;
48#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
49 cpumask_setall(&desc->affinity); 49 cpumask_setall(desc->affinity);
50#ifdef CONFIG_GENERIC_PENDING_IRQ
51 cpumask_clear(desc->pending_mask);
52#endif
50#endif 53#endif
51 spin_unlock_irqrestore(&desc->lock, flags); 54 spin_unlock_irqrestore(&desc->lock, flags);
52} 55}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3aba8d12f328..f51eaee921b6 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,7 @@
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/rculist.h> 18#include <linux/rculist.h>
19#include <linux/hash.h> 19#include <linux/hash.h>
20#include <linux/bootmem.h>
20 21
21#include "internals.h" 22#include "internals.h"
22 23
@@ -69,6 +70,7 @@ int nr_irqs = NR_IRQS;
69EXPORT_SYMBOL_GPL(nr_irqs); 70EXPORT_SYMBOL_GPL(nr_irqs);
70 71
71#ifdef CONFIG_SPARSE_IRQ 72#ifdef CONFIG_SPARSE_IRQ
73
72static struct irq_desc irq_desc_init = { 74static struct irq_desc irq_desc_init = {
73 .irq = -1, 75 .irq = -1,
74 .status = IRQ_DISABLED, 76 .status = IRQ_DISABLED,
@@ -76,9 +78,6 @@ static struct irq_desc irq_desc_init = {
76 .handle_irq = handle_bad_irq, 78 .handle_irq = handle_bad_irq,
77 .depth = 1, 79 .depth = 1,
78 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 80 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
79#ifdef CONFIG_SMP
80 .affinity = CPU_MASK_ALL
81#endif
82}; 81};
83 82
84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 83void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
@@ -113,6 +112,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
113 printk(KERN_ERR "can not alloc kstat_irqs\n"); 112 printk(KERN_ERR "can not alloc kstat_irqs\n");
114 BUG_ON(1); 113 BUG_ON(1);
115 } 114 }
115 if (!init_alloc_desc_masks(desc, cpu, false)) {
116 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
117 BUG_ON(1);
118 }
116 arch_init_chip_data(desc, cpu); 119 arch_init_chip_data(desc, cpu);
117} 120}
118 121
@@ -121,7 +124,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
121 */ 124 */
122DEFINE_SPINLOCK(sparse_irq_lock); 125DEFINE_SPINLOCK(sparse_irq_lock);
123 126
124struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; 127struct irq_desc **irq_desc_ptrs __read_mostly;
125 128
126static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 129static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
127 [0 ... NR_IRQS_LEGACY-1] = { 130 [0 ... NR_IRQS_LEGACY-1] = {
@@ -131,14 +134,10 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
131 .handle_irq = handle_bad_irq, 134 .handle_irq = handle_bad_irq,
132 .depth = 1, 135 .depth = 1,
133 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 136 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
134#ifdef CONFIG_SMP
135 .affinity = CPU_MASK_ALL
136#endif
137 } 137 }
138}; 138};
139 139
140/* FIXME: use bootmem alloc ...*/ 140static unsigned int *kstat_irqs_legacy;
141static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
142 141
143int __init early_irq_init(void) 142int __init early_irq_init(void)
144{ 143{
@@ -148,18 +147,30 @@ int __init early_irq_init(void)
148 147
149 init_irq_default_affinity(); 148 init_irq_default_affinity();
150 149
150 /* initialize nr_irqs based on nr_cpu_ids */
151 arch_probe_nr_irqs();
152 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
153
151 desc = irq_desc_legacy; 154 desc = irq_desc_legacy;
152 legacy_count = ARRAY_SIZE(irq_desc_legacy); 155 legacy_count = ARRAY_SIZE(irq_desc_legacy);
153 156
157 /* allocate irq_desc_ptrs array based on nr_irqs */
158 irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
159
160 /* allocate based on nr_cpu_ids */
 161 /* FIXME: invert kstat_irqs, and it'd be a per_cpu_alloc'd thing */
162 kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
163 sizeof(int));
164
154 for (i = 0; i < legacy_count; i++) { 165 for (i = 0; i < legacy_count; i++) {
155 desc[i].irq = i; 166 desc[i].irq = i;
156 desc[i].kstat_irqs = kstat_irqs_legacy[i]; 167 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
157 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 168 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
158 169 init_alloc_desc_masks(&desc[i], 0, true);
159 irq_desc_ptrs[i] = desc + i; 170 irq_desc_ptrs[i] = desc + i;
160 } 171 }
161 172
162 for (i = legacy_count; i < NR_IRQS; i++) 173 for (i = legacy_count; i < nr_irqs; i++)
163 irq_desc_ptrs[i] = NULL; 174 irq_desc_ptrs[i] = NULL;
164 175
165 return arch_early_irq_init(); 176 return arch_early_irq_init();
@@ -167,7 +178,10 @@ int __init early_irq_init(void)
167 178
168struct irq_desc *irq_to_desc(unsigned int irq) 179struct irq_desc *irq_to_desc(unsigned int irq)
169{ 180{
170 return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL; 181 if (irq_desc_ptrs && irq < nr_irqs)
182 return irq_desc_ptrs[irq];
183
184 return NULL;
171} 185}
172 186
173struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 187struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
@@ -176,10 +190,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
176 unsigned long flags; 190 unsigned long flags;
177 int node; 191 int node;
178 192
179 if (irq >= NR_IRQS) { 193 if (irq >= nr_irqs) {
180 printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n", 194 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
181 irq, NR_IRQS); 195 irq, nr_irqs);
182 WARN_ON(1);
183 return NULL; 196 return NULL;
184 } 197 }
185 198
@@ -221,9 +234,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
221 .handle_irq = handle_bad_irq, 234 .handle_irq = handle_bad_irq,
222 .depth = 1, 235 .depth = 1,
223 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), 236 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
224#ifdef CONFIG_SMP
225 .affinity = CPU_MASK_ALL
226#endif
227 } 237 }
228}; 238};
229 239
@@ -235,12 +245,15 @@ int __init early_irq_init(void)
235 245
236 init_irq_default_affinity(); 246 init_irq_default_affinity();
237 247
248 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
249
238 desc = irq_desc; 250 desc = irq_desc;
239 count = ARRAY_SIZE(irq_desc); 251 count = ARRAY_SIZE(irq_desc);
240 252
241 for (i = 0; i < count; i++) 253 for (i = 0; i < count; i++) {
242 desc[i].irq = i; 254 desc[i].irq = i;
243 255 init_alloc_desc_masks(&desc[i], 0, true);
256 }
244 return arch_early_irq_init(); 257 return arch_early_irq_init();
245} 258}
246 259
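
Because irq_desc_ptrs is now sized by nr_irqs and populated lazily, irq_to_desc() can legitimately return NULL; code that used to index a fixed irq_desc[NR_IRQS] array should check for that. Sketch:

#include <linux/irq.h>

static void example_touch_irq(unsigned int irq)
{
	struct irq_desc *desc = irq_to_desc(irq);

	if (!desc)		/* not yet allocated, or irq >= nr_irqs */
		return;

	/* desc is safe to dereference from here on */
}
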
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e6d0a43cc125..40416a81a0f5 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
16extern struct lock_class_key irq_desc_lock_class; 16extern struct lock_class_key irq_desc_lock_class;
17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
18extern spinlock_t sparse_irq_lock; 18extern spinlock_t sparse_irq_lock;
19
20#ifdef CONFIG_SPARSE_IRQ
21/* irq_desc_ptrs allocated at boot time */
22extern struct irq_desc **irq_desc_ptrs;
23#else
24/* irq_desc_ptrs is a fixed size array */
19extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; 25extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
26#endif
20 27
21#ifdef CONFIG_PROC_FS 28#ifdef CONFIG_PROC_FS
22extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 29extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 291f03664552..a3a5dc9ef346 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -90,14 +90,14 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
90 90
91#ifdef CONFIG_GENERIC_PENDING_IRQ 91#ifdef CONFIG_GENERIC_PENDING_IRQ
92 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { 92 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
93 cpumask_copy(&desc->affinity, cpumask); 93 cpumask_copy(desc->affinity, cpumask);
94 desc->chip->set_affinity(irq, cpumask); 94 desc->chip->set_affinity(irq, cpumask);
95 } else { 95 } else {
96 desc->status |= IRQ_MOVE_PENDING; 96 desc->status |= IRQ_MOVE_PENDING;
97 cpumask_copy(&desc->pending_mask, cpumask); 97 cpumask_copy(desc->pending_mask, cpumask);
98 } 98 }
99#else 99#else
100 cpumask_copy(&desc->affinity, cpumask); 100 cpumask_copy(desc->affinity, cpumask);
101 desc->chip->set_affinity(irq, cpumask); 101 desc->chip->set_affinity(irq, cpumask);
102#endif 102#endif
103 desc->status |= IRQ_AFFINITY_SET; 103 desc->status |= IRQ_AFFINITY_SET;
@@ -119,16 +119,16 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
119 * one of the targets is online. 119 * one of the targets is online.
120 */ 120 */
121 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 121 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
122 if (cpumask_any_and(&desc->affinity, cpu_online_mask) 122 if (cpumask_any_and(desc->affinity, cpu_online_mask)
123 < nr_cpu_ids) 123 < nr_cpu_ids)
124 goto set_affinity; 124 goto set_affinity;
125 else 125 else
126 desc->status &= ~IRQ_AFFINITY_SET; 126 desc->status &= ~IRQ_AFFINITY_SET;
127 } 127 }
128 128
129 cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity); 129 cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
130set_affinity: 130set_affinity:
131 desc->chip->set_affinity(irq, &desc->affinity); 131 desc->chip->set_affinity(irq, desc->affinity);
132 132
133 return 0; 133 return 0;
134} 134}
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index bd72329e630c..e05ad9be43b7 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -18,7 +18,7 @@ void move_masked_irq(int irq)
18 18
19 desc->status &= ~IRQ_MOVE_PENDING; 19 desc->status &= ~IRQ_MOVE_PENDING;
20 20
21 if (unlikely(cpumask_empty(&desc->pending_mask))) 21 if (unlikely(cpumask_empty(desc->pending_mask)))
22 return; 22 return;
23 23
24 if (!desc->chip->set_affinity) 24 if (!desc->chip->set_affinity)
@@ -38,13 +38,13 @@ void move_masked_irq(int irq)
38 * For correct operation this depends on the caller 38 * For correct operation this depends on the caller
39 * masking the irqs. 39 * masking the irqs.
40 */ 40 */
41 if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) 41 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
42 < nr_cpu_ids)) { 42 < nr_cpu_ids)) {
43 cpumask_and(&desc->affinity, 43 cpumask_and(desc->affinity,
44 &desc->pending_mask, cpu_online_mask); 44 desc->pending_mask, cpu_online_mask);
45 desc->chip->set_affinity(irq, &desc->affinity); 45 desc->chip->set_affinity(irq, desc->affinity);
46 } 46 }
47 cpumask_clear(&desc->pending_mask); 47 cpumask_clear(desc->pending_mask);
48} 48}
49 49
50void move_native_irq(int irq) 50void move_native_irq(int irq)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index acd88356ac76..7f9b80434e32 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -38,15 +38,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
38 old_desc->kstat_irqs = NULL; 38 old_desc->kstat_irqs = NULL;
39} 39}
40 40
41static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, 41static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 struct irq_desc *desc, int cpu) 42 struct irq_desc *desc, int cpu)
43{ 43{
44 memcpy(desc, old_desc, sizeof(struct irq_desc)); 44 memcpy(desc, old_desc, sizeof(struct irq_desc));
45 if (!init_alloc_desc_masks(desc, cpu, false)) {
46 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
47 "for migration.\n", irq);
48 return false;
49 }
45 spin_lock_init(&desc->lock); 50 spin_lock_init(&desc->lock);
46 desc->cpu = cpu; 51 desc->cpu = cpu;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 52 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); 53 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
54 init_copy_desc_masks(old_desc, desc);
49 arch_init_copy_chip_data(old_desc, desc, cpu); 55 arch_init_copy_chip_data(old_desc, desc, cpu);
56 return true;
50} 57}
51 58
52static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) 59static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -76,12 +83,18 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
76 node = cpu_to_node(cpu); 83 node = cpu_to_node(cpu);
77 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 84 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
78 if (!desc) { 85 if (!desc) {
79 printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq); 86 printk(KERN_ERR "irq %d: can not get new irq_desc "
87 "for migration.\n", irq);
88 /* still use old one */
89 desc = old_desc;
90 goto out_unlock;
91 }
92 if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
80 /* still use old one */ 93 /* still use old one */
94 kfree(desc);
81 desc = old_desc; 95 desc = old_desc;
82 goto out_unlock; 96 goto out_unlock;
83 } 97 }
84 init_copy_one_irq_desc(irq, old_desc, desc, cpu);
85 98
86 irq_desc_ptrs[irq] = desc; 99 irq_desc_ptrs[irq] = desc;
87 spin_unlock_irqrestore(&sparse_irq_lock, flags); 100 spin_unlock_irqrestore(&sparse_irq_lock, flags);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index aae3f742bcec..692363dd591f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir;
20static int irq_affinity_proc_show(struct seq_file *m, void *v) 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
21{ 21{
22 struct irq_desc *desc = irq_to_desc((long)m->private); 22 struct irq_desc *desc = irq_to_desc((long)m->private);
23 const struct cpumask *mask = &desc->affinity; 23 const struct cpumask *mask = desc->affinity;
24 24
25#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
26 if (desc->status & IRQ_MOVE_PENDING) 26 if (desc->status & IRQ_MOVE_PENDING)
27 mask = &desc->pending_mask; 27 mask = desc->pending_mask;
28#endif 28#endif
29 seq_cpumask(m, mask); 29 seq_cpumask(m, mask);
30 seq_putc(m, '\n'); 30 seq_putc(m, '\n');
diff --git a/kernel/panic.c b/kernel/panic.c
index 2a2ff36ff44d..33cab3de1763 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -74,6 +74,9 @@ NORET_TYPE void panic(const char * fmt, ...)
74 vsnprintf(buf, sizeof(buf), fmt, args); 74 vsnprintf(buf, sizeof(buf), fmt, args);
75 va_end(args); 75 va_end(args);
76 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 76 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
77#ifdef CONFIG_DEBUG_BUGVERBOSE
78 dump_stack();
79#endif
77 bust_spinlocks(0); 80 bust_spinlocks(0);
78 81
79 /* 82 /*
@@ -355,15 +358,22 @@ EXPORT_SYMBOL(warn_slowpath);
355#endif 358#endif
356 359
357#ifdef CONFIG_CC_STACKPROTECTOR 360#ifdef CONFIG_CC_STACKPROTECTOR
361
362#ifndef GCC_HAS_SP
363#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
364#endif
365
358/* 366/*
359 * Called when gcc's -fstack-protector feature is used, and 367 * Called when gcc's -fstack-protector feature is used, and
360 * gcc detects corruption of the on-stack canary value 368 * gcc detects corruption of the on-stack canary value
361 */ 369 */
362void __stack_chk_fail(void) 370void __stack_chk_fail(void)
363{ 371{
364 panic("stack-protector: Kernel stack is corrupted"); 372 panic("stack-protector: Kernel stack is corrupted in: %p\n",
373 __builtin_return_address(0));
365} 374}
366EXPORT_SYMBOL(__stack_chk_fail); 375EXPORT_SYMBOL(__stack_chk_fail);
376
367#endif 377#endif
368 378
369core_param(panic, panic_timeout, int, 0644); 379core_param(panic, panic_timeout, int, 0644);
diff --git a/kernel/sched.c b/kernel/sched.c
index 8ee437a5ec1d..fc17fd91ab57 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5949,12 +5949,7 @@ void sched_show_task(struct task_struct *p)
5949 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 5949 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5950#endif 5950#endif
5951#ifdef CONFIG_DEBUG_STACK_USAGE 5951#ifdef CONFIG_DEBUG_STACK_USAGE
5952 { 5952 free = stack_not_used(p);
5953 unsigned long *n = end_of_stack(p);
5954 while (!*n)
5955 n++;
5956 free = (unsigned long)n - (unsigned long)end_of_stack(p);
5957 }
5958#endif 5953#endif
5959 printk(KERN_CONT "%5lu %5d %6d\n", free, 5954 printk(KERN_CONT "%5lu %5d %6d\n", free,
5960 task_pid_nr(p), task_pid_nr(p->real_parent)); 5955 task_pid_nr(p), task_pid_nr(p->real_parent));
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bac1061cea2f..da932f4c8524 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -960,12 +960,13 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
960 960
961static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); 961static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
962 962
963static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 963static inline int pick_optimal_cpu(int this_cpu,
964 const struct cpumask *mask)
964{ 965{
965 int first; 966 int first;
966 967
967 /* "this_cpu" is cheaper to preempt than a remote processor */ 968 /* "this_cpu" is cheaper to preempt than a remote processor */
968 if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) 969 if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
969 return this_cpu; 970 return this_cpu;
970 971
971 first = cpumask_first(mask); 972 first = cpumask_first(mask);
@@ -981,6 +982,7 @@ static int find_lowest_rq(struct task_struct *task)
981 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 982 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
982 int this_cpu = smp_processor_id(); 983 int this_cpu = smp_processor_id();
983 int cpu = task_cpu(task); 984 int cpu = task_cpu(task);
985 cpumask_var_t domain_mask;
984 986
985 if (task->rt.nr_cpus_allowed == 1) 987 if (task->rt.nr_cpus_allowed == 1)
986 return -1; /* No other targets possible */ 988 return -1; /* No other targets possible */
@@ -1013,19 +1015,25 @@ static int find_lowest_rq(struct task_struct *task)
1013 if (this_cpu == cpu) 1015 if (this_cpu == cpu)
1014 this_cpu = -1; /* Skip this_cpu opt if the same */ 1016 this_cpu = -1; /* Skip this_cpu opt if the same */
1015 1017
1016 for_each_domain(cpu, sd) { 1018 if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
1017 if (sd->flags & SD_WAKE_AFFINE) { 1019 for_each_domain(cpu, sd) {
1018 cpumask_t domain_mask; 1020 if (sd->flags & SD_WAKE_AFFINE) {
1019 int best_cpu; 1021 int best_cpu;
1020 1022
1021 cpumask_and(&domain_mask, sched_domain_span(sd), 1023 cpumask_and(domain_mask,
1022 lowest_mask); 1024 sched_domain_span(sd),
1025 lowest_mask);
1023 1026
1024 best_cpu = pick_optimal_cpu(this_cpu, 1027 best_cpu = pick_optimal_cpu(this_cpu,
1025 &domain_mask); 1028 domain_mask);
1026 if (best_cpu != -1) 1029
1027 return best_cpu; 1030 if (best_cpu != -1) {
1031 free_cpumask_var(domain_mask);
1032 return best_cpu;
1033 }
1034 }
1028 } 1035 }
1036 free_cpumask_var(domain_mask);
1029 } 1037 }
1030 1038
1031 /* 1039 /*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8d..0365b4899a3d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -795,6 +795,11 @@ int __init __weak early_irq_init(void)
795 return 0; 795 return 0;
796} 796}
797 797
798int __init __weak arch_probe_nr_irqs(void)
799{
800 return 0;
801}
802
798int __init __weak arch_early_irq_init(void) 803int __init __weak arch_early_irq_init(void)
799{ 804{
800 return 0; 805 return 0;
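
The weak arch_probe_nr_irqs() hook gives an architecture a chance to size nr_irqs from runtime topology before early_irq_init() allocates irq_desc_ptrs. A purely hypothetical override (the formula is illustrative, not taken from any arch):

#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/irqnr.h>

int __init arch_probe_nr_irqs(void)
{
	/* scale the irq space with the number of possible CPUs */
	nr_irqs = 64 + 8 * nr_cpu_ids;
	return 0;
}
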
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 88921611b22e..7e62303133dc 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -415,8 +415,9 @@ static int parse_elf(struct elf_info *info, const char *filename)
415 const char *secstrings 415 const char *secstrings
416 = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 416 = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
417 const char *secname; 417 const char *secname;
418 int nobits = sechdrs[i].sh_type == SHT_NOBITS;
418 419
419 if (sechdrs[i].sh_offset > info->size) { 420 if (!nobits && sechdrs[i].sh_offset > info->size) {
420 fatal("%s is truncated. sechdrs[i].sh_offset=%lu > " 421 fatal("%s is truncated. sechdrs[i].sh_offset=%lu > "
421 "sizeof(*hrd)=%zu\n", filename, 422 "sizeof(*hrd)=%zu\n", filename,
422 (unsigned long)sechdrs[i].sh_offset, 423 (unsigned long)sechdrs[i].sh_offset,
@@ -425,6 +426,8 @@ static int parse_elf(struct elf_info *info, const char *filename)
425 } 426 }
426 secname = secstrings + sechdrs[i].sh_name; 427 secname = secstrings + sechdrs[i].sh_name;
427 if (strcmp(secname, ".modinfo") == 0) { 428 if (strcmp(secname, ".modinfo") == 0) {
429 if (nobits)
430 fatal("%s has NOBITS .modinfo\n", filename);
428 info->modinfo = (void *)hdr + sechdrs[i].sh_offset; 431 info->modinfo = (void *)hdr + sechdrs[i].sh_offset;
429 info->modinfo_len = sechdrs[i].sh_size; 432 info->modinfo_len = sechdrs[i].sh_size;
430 } else if (strcmp(secname, "__ksymtab") == 0) 433 } else if (strcmp(secname, "__ksymtab") == 0)