-rw-r--r--Documentation/cputopology.txt6
-rw-r--r--Documentation/perf-counters.txt147
-rw-r--r--arch/alpha/kernel/irq.c2
-rw-r--r--arch/arm/kernel/irq.c18
-rw-r--r--arch/arm/kernel/vmlinux.lds.S1
-rw-r--r--arch/arm/oprofile/op_model_mpcore.c2
-rw-r--r--arch/blackfin/kernel/irqchip.c5
-rw-r--r--arch/ia64/include/asm/topology.h2
-rw-r--r--arch/ia64/kernel/iosapic.c2
-rw-r--r--arch/ia64/kernel/irq.c4
-rw-r--r--arch/ia64/kernel/irq_ia64.c12
-rw-r--r--arch/ia64/kernel/msi_ia64.c4
-rw-r--r--arch/ia64/kernel/vmlinux.lds.S1
-rw-r--r--arch/ia64/sn/kernel/msi_sn.c2
-rw-r--r--arch/mips/include/asm/irq.h2
-rw-r--r--arch/mips/kernel/irq-gic.c2
-rw-r--r--arch/mips/kernel/smtc.c6
-rw-r--r--arch/mips/mti-malta/malta-smtc.c5
-rw-r--r--arch/mips/sgi-ip22/ip22-int.c2
-rw-r--r--arch/mips/sgi-ip22/ip22-time.c2
-rw-r--r--arch/mips/sibyte/bcm1480/smp.c3
-rw-r--r--arch/mips/sibyte/sb1250/smp.c3
-rw-r--r--arch/mn10300/kernel/mn10300-watchdog.c3
-rw-r--r--arch/parisc/kernel/irq.c8
-rw-r--r--arch/powerpc/include/asm/hw_irq.h31
-rw-r--r--arch/powerpc/include/asm/paca.h1
-rw-r--r--arch/powerpc/include/asm/perf_counter.h72
-rw-r--r--arch/powerpc/include/asm/systbl.h1
-rw-r--r--arch/powerpc/include/asm/unistd.h3
-rw-r--r--arch/powerpc/kernel/Makefile1
-rw-r--r--arch/powerpc/kernel/asm-offsets.c1
-rw-r--r--arch/powerpc/kernel/entry_64.S9
-rw-r--r--arch/powerpc/kernel/irq.c12
-rw-r--r--arch/powerpc/kernel/perf_counter.c847
-rw-r--r--arch/powerpc/kernel/power6-pmu.c283
-rw-r--r--arch/powerpc/kernel/ppc970-pmu.c375
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S1
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype1
-rw-r--r--arch/powerpc/platforms/pseries/xics.c5
-rw-r--r--arch/powerpc/sysdev/mpic.c3
-rw-r--r--arch/sparc/kernel/irq_64.c5
-rw-r--r--arch/sparc/kernel/time_64.c2
-rw-r--r--arch/x86/Kconfig31
-rw-r--r--arch/x86/Kconfig.cpu10
-rw-r--r--arch/x86/Kconfig.debug1
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/ia32/ia32entry.S11
-rw-r--r--arch/x86/include/asm/apicnum.h12
-rw-r--r--arch/x86/include/asm/atomic_32.h218
-rw-r--r--arch/x86/include/asm/cpu.h21
-rw-r--r--arch/x86/include/asm/cpumask.h28
-rw-r--r--arch/x86/include/asm/current.h24
-rw-r--r--arch/x86/include/asm/genapic_32.h7
-rw-r--r--arch/x86/include/asm/genapic_64.h6
-rw-r--r--arch/x86/include/asm/hardirq.h50
-rw-r--r--arch/x86/include/asm/hardirq_32.h30
-rw-r--r--arch/x86/include/asm/hardirq_64.h25
-rw-r--r--arch/x86/include/asm/hw_irq.h2
-rw-r--r--arch/x86/include/asm/intel_arch_perfmon.h31
-rw-r--r--arch/x86/include/asm/io_apic.h26
-rw-r--r--arch/x86/include/asm/irq_regs.h36
-rw-r--r--arch/x86/include/asm/irq_regs_32.h31
-rw-r--r--arch/x86/include/asm/irq_regs_64.h1
-rw-r--r--arch/x86/include/asm/irq_vectors.h54
-rw-r--r--arch/x86/include/asm/mach-default/entry_arch.h23
-rw-r--r--arch/x86/include/asm/mmu_context.h63
-rw-r--r--arch/x86/include/asm/mmu_context_32.h55
-rw-r--r--arch/x86/include/asm/mmu_context_64.h54
-rw-r--r--arch/x86/include/asm/mpspec_def.h23
-rw-r--r--arch/x86/include/asm/page_64.h4
-rw-r--r--arch/x86/include/asm/paravirt.h8
-rw-r--r--arch/x86/include/asm/pda.h137
-rw-r--r--arch/x86/include/asm/percpu.h153
-rw-r--r--arch/x86/include/asm/perf_counter.h95
-rw-r--r--arch/x86/include/asm/pgtable_64.h1
-rw-r--r--arch/x86/include/asm/processor.h24
-rw-r--r--arch/x86/include/asm/setup.h1
-rw-r--r--arch/x86/include/asm/smp.h50
-rw-r--r--arch/x86/include/asm/stackprotector.h38
-rw-r--r--arch/x86/include/asm/system.h23
-rw-r--r--arch/x86/include/asm/thread_info.h24
-rw-r--r--arch/x86/include/asm/tlbflush.h17
-rw-r--r--arch/x86/include/asm/topology.h25
-rw-r--r--arch/x86/include/asm/trampoline.h1
-rw-r--r--arch/x86/include/asm/unistd_32.h1
-rw-r--r--arch/x86/include/asm/unistd_64.h3
-rw-r--r--arch/x86/include/asm/uv/uv.h33
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h1
-rw-r--r--arch/x86/kernel/Makefile8
-rw-r--r--arch/x86/kernel/acpi/boot.c96
-rw-r--r--arch/x86/kernel/acpi/sleep.c1
-rw-r--r--arch/x86/kernel/apic.c26
-rw-r--r--arch/x86/kernel/asm-offsets_64.c11
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/common.c116
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c63
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c21
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c1
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c733
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c2
-rw-r--r--arch/x86/kernel/crash.c2
-rw-r--r--arch/x86/kernel/dumpstack_64.c35
-rw-r--r--arch/x86/kernel/efi.c2
-rw-r--r--arch/x86/kernel/efi_64.c1
-rw-r--r--arch/x86/kernel/entry_32.S6
-rw-r--r--arch/x86/kernel/entry_64.S48
-rw-r--r--arch/x86/kernel/genapic_64.c2
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c1
-rw-r--r--arch/x86/kernel/head64.c23
-rw-r--r--arch/x86/kernel/head_32.S6
-rw-r--r--arch/x86/kernel/head_64.S36
-rw-r--r--arch/x86/kernel/io_apic.c165
-rw-r--r--arch/x86/kernel/irq.c11
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irq_64.c9
-rw-r--r--arch/x86/kernel/irqinit_32.c70
-rw-r--r--arch/x86/kernel/irqinit_64.c12
-rw-r--r--arch/x86/kernel/microcode_intel.c10
-rw-r--r--arch/x86/kernel/module_32.c6
-rw-r--r--arch/x86/kernel/module_64.c32
-rw-r--r--arch/x86/kernel/mpparse.c142
-rw-r--r--arch/x86/kernel/msr.c2
-rw-r--r--arch/x86/kernel/nmi.c10
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c43
-rw-r--r--arch/x86/kernel/reboot.c1
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/setup_percpu.c182
-rw-r--r--arch/x86/kernel/signal.c7
-rw-r--r--arch/x86/kernel/smpboot.c70
-rw-r--r--arch/x86/kernel/smpcommon.c10
-rw-r--r--arch/x86/kernel/syscall_table_32.S1
-rw-r--r--arch/x86/kernel/tlb_32.c256
-rw-r--r--arch/x86/kernel/tlb_uv.c68
-rw-r--r--arch/x86/kernel/traps.c16
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S9
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S26
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c2
-rw-r--r--arch/x86/mach-voyager/setup.c1
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c6
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/fault.c446
-rw-r--r--arch/x86/mm/init_32.c1
-rw-r--r--arch/x86/mm/srat_64.c1
-rw-r--r--arch/x86/mm/tlb.c (renamed from arch/x86/kernel/tlb_64.c)122
-rw-r--r--arch/x86/oprofile/nmi_int.c7
-rw-r--r--arch/x86/oprofile/op_model_ppro.c10
-rw-r--r--arch/x86/xen/enlighten.c46
-rw-r--r--arch/x86/xen/irq.c8
-rw-r--r--arch/x86/xen/mmu.c8
-rw-r--r--arch/x86/xen/multicalls.h2
-rw-r--r--arch/x86/xen/smp.c33
-rw-r--r--arch/x86/xen/suspend.c1
-rw-r--r--arch/x86/xen/xen-asm_64.S31
-rw-r--r--drivers/acpi/processor_idle.c8
-rw-r--r--drivers/base/cpu.c2
-rw-r--r--drivers/base/topology.c33
-rw-r--r--drivers/char/sysrq.c2
-rw-r--r--drivers/firmware/dcdbas.c12
-rw-r--r--drivers/misc/Kconfig4
-rw-r--r--drivers/misc/sgi-gru/gru.h2
-rw-r--r--drivers/misc/sgi-xp/xp.h2
-rw-r--r--drivers/misc/sgi-xp/xpc_main.c2
-rw-r--r--drivers/net/sfc/efx.c17
-rw-r--r--drivers/oprofile/buffer_sync.c22
-rw-r--r--drivers/oprofile/buffer_sync.h4
-rw-r--r--drivers/oprofile/oprof.c9
-rw-r--r--drivers/pci/intr_remapping.c1
-rw-r--r--drivers/xen/events.c26
-rw-r--r--drivers/xen/manage.c2
-rw-r--r--fs/exec.c8
-rw-r--r--include/asm-generic/percpu.h52
-rw-r--r--include/asm-generic/sections.h2
-rw-r--r--include/asm-generic/vmlinux.lds.h47
-rw-r--r--include/linux/init_task.h11
-rw-r--r--include/linux/interrupt.h1
-rw-r--r--include/linux/irq.h86
-rw-r--r--include/linux/irqnr.h1
-rw-r--r--include/linux/kernel_stat.h8
-rw-r--r--include/linux/magic.h1
-rw-r--r--include/linux/percpu.h41
-rw-r--r--include/linux/perf_counter.h295
-rw-r--r--include/linux/prctl.h3
-rw-r--r--include/linux/sched.h30
-rw-r--r--include/linux/stackprotector.h16
-rw-r--r--include/linux/syscalls.h8
-rw-r--r--include/linux/topology.h6
-rw-r--r--init/Kconfig30
-rw-r--r--init/main.c7
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/exit.c18
-rw-r--r--kernel/fork.c6
-rw-r--r--kernel/irq/chip.c5
-rw-r--r--kernel/irq/handle.c57
-rw-r--r--kernel/irq/internals.h7
-rw-r--r--kernel/irq/manage.c12
-rw-r--r--kernel/irq/migration.c12
-rw-r--r--kernel/irq/numa_migrate.c19
-rw-r--r--kernel/irq/proc.c4
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/perf_counter.c2199
-rw-r--r--kernel/sched.c100
-rw-r--r--kernel/sched_rt.c32
-rw-r--r--kernel/softirq.c5
-rw-r--r--kernel/sys.c7
-rw-r--r--kernel/sys_ni.c3
206 files changed, 7717 insertions, 2141 deletions
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index 45932ec21cee..b41f3e58aefa 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -18,11 +18,11 @@ For an architecture to support this feature, it must define some of
18these macros in include/asm-XXX/topology.h: 18these macros in include/asm-XXX/topology.h:
19#define topology_physical_package_id(cpu) 19#define topology_physical_package_id(cpu)
20#define topology_core_id(cpu) 20#define topology_core_id(cpu)
21#define topology_thread_siblings(cpu) 21#define topology_thread_cpumask(cpu)
22#define topology_core_siblings(cpu) 22#define topology_core_cpumask(cpu)
23 23
24The type of **_id is int. 24The type of **_id is int.
25The type of siblings is cpumask_t. 25The type of siblings is (const) struct cpumask *.
26 26
27To be consistent on all architectures, include/linux/topology.h 27To be consistent on all architectures, include/linux/topology.h
28provides default definitions for any of the above macros that are 28provides default definitions for any of the above macros that are
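For illustration only (not part of this patch): with the cpumask-pointer form
documented above, kernel code can walk a CPU's siblings directly. A minimal
sketch, assuming the default definitions from include/linux/topology.h;
core_sibling_count() is a hypothetical helper, not something this series adds.

#include <linux/cpumask.h>
#include <linux/topology.h>

/* Sketch: count how many online CPUs share a core with @cpu. */
static unsigned int core_sibling_count(unsigned int cpu)
{
	const struct cpumask *mask = topology_core_cpumask(cpu);
	unsigned int sibling, n = 0;

	for_each_cpu(sibling, mask)
		if (cpu_online(sibling))
			n++;
	return n;
}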
diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt
new file mode 100644
index 000000000000..fddd32189a50
--- /dev/null
+++ b/Documentation/perf-counters.txt
@@ -0,0 +1,147 @@
1
2Performance Counters for Linux
3------------------------------
4
5Performance counters are special hardware registers available on most modern
 6CPUs. These registers count certain types of hardware events, such
 7as instructions executed, cache misses suffered, or branches mis-predicted -
 8without slowing down the kernel or applications. These registers can also
 9trigger interrupts when a threshold number of events has been reached - and can
10thus be used to profile the code that runs on that CPU.
11
12The Linux Performance Counter subsystem provides an abstraction of these
 13hardware capabilities. It provides per-task and per-CPU counters, counter
14groups, and it provides event capabilities on top of those.
15
16Performance counters are accessed via special file descriptors.
17There's one file descriptor per virtual counter used.
18
19The special file descriptor is opened via the perf_counter_open()
20system call:
21
22 int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
23 pid_t pid, int cpu, int group_fd);
24
25The syscall returns the new fd. The fd can be used via the normal
26VFS system calls: read() can be used to read the counter, fcntl()
27can be used to set the blocking mode, etc.
28
29Multiple counters can be kept open at a time, and the counters
30can be poll()ed.
31
32When creating a new counter fd, 'perf_counter_hw_event' is:
33
34/*
35 * Hardware event to monitor via a performance monitoring counter:
36 */
37struct perf_counter_hw_event {
38 s64 type;
39
40 u64 irq_period;
41 u32 record_type;
42
43 u32 disabled : 1, /* off by default */
44 nmi : 1, /* NMI sampling */
45 raw : 1, /* raw event type */
46 __reserved_1 : 29;
47
48 u64 __reserved_2;
49};
50
51/*
52 * Generalized performance counter event types, used by the hw_event.type
53 * parameter of the sys_perf_counter_open() syscall:
54 */
55enum hw_event_types {
56 /*
57 * Common hardware events, generalized by the kernel:
58 */
59 PERF_COUNT_CYCLES = 0,
60 PERF_COUNT_INSTRUCTIONS = 1,
61 PERF_COUNT_CACHE_REFERENCES = 2,
62 PERF_COUNT_CACHE_MISSES = 3,
63 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
64 PERF_COUNT_BRANCH_MISSES = 5,
65
66 /*
67 * Special "software" counters provided by the kernel, even if
68 * the hardware does not support performance counters. These
69 * counters measure various physical and sw events of the
70 * kernel (and allow the profiling of them as well):
71 */
72 PERF_COUNT_CPU_CLOCK = -1,
73 PERF_COUNT_TASK_CLOCK = -2,
74 /*
75 * Future software events:
76 */
77 /* PERF_COUNT_PAGE_FAULTS = -3,
78 PERF_COUNT_CONTEXT_SWITCHES = -4, */
79};
80
 81These are standardized types of events that work uniformly on all CPUs
 82that implement Performance Counters support under Linux. If a CPU is
 83not able to count branch misses, then the system call will return
 84-EINVAL.
85
 86More hw_event_types are supported as well, but they are
 87CPU-specific and are enumerated via /sys on a per-CPU basis. Raw hw event
88types can be passed in under hw_event.type if hw_event.raw is 1.
89For example, to count "External bus cycles while bus lock signal asserted"
90events on Intel Core CPUs, pass in a 0x4064 event type value and set
91hw_event.raw to 1.
92
93'record_type' is the type of data that a read() will provide for the
94counter, and it can be one of:
95
96/*
97 * IRQ-notification data record type:
98 */
99enum perf_counter_record_type {
100 PERF_RECORD_SIMPLE = 0,
101 PERF_RECORD_IRQ = 1,
102 PERF_RECORD_GROUP = 2,
103};
104
 105A "simple" counter is one that counts hardware events and allows
106them to be read out into a u64 count value. (read() returns 8 on
107a successful read of a simple counter.)
108
 109An "irq" counter is one that will also provide IRQ context information:
110the IP of the interrupted context. In this case read() will return
111the 8-byte counter value, plus the Instruction Pointer address of the
112interrupted context.
113
 114The parameter 'irq_period' is the number of events before waking up
 115a read() that is blocked on a counter fd. A value of zero means a non-blocking
116counter.
117
118The 'pid' parameter allows the counter to be specific to a task:
119
120 pid == 0: if the pid parameter is zero, the counter is attached to the
121 current task.
122
123 pid > 0: the counter is attached to a specific task (if the current task
124 has sufficient privilege to do so)
125
126 pid < 0: all tasks are counted (per cpu counters)
127
 128The 'cpu' parameter allows a counter to be made specific to a given
129CPU:
130
131 cpu >= 0: the counter is restricted to a specific CPU
132 cpu == -1: the counter counts on all CPUs
133
134(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
135
 136A 'pid > 0' and 'cpu == -1' counter is a per-task counter that counts
 137events of that task and 'follows' that task to whatever CPU the task
 138gets scheduled to. Per-task counters can be created by any user, for
139their own tasks.
140
 141A 'pid == -1' and 'cpu == x' counter is a per-CPU counter that counts
 142all events on CPU-x. Per-CPU counters need CAP_SYS_ADMIN privilege.
143
 144Group counters are created by passing in the fd of another counter as group_fd.
 145Groups are scheduled as a unit and can be used with PERF_RECORD_GROUP
146to record multi-dimensional timestamps.
147
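For illustration only (not part of this patch): a minimal user-space sketch of
the API described above. It assumes the powerpc syscall number added later in
this series (319), hand-copies the perf_counter_hw_event layout, and assumes
that group_fd == -1 requests a standalone counter (no group); adjust both for
the architecture and headers actually in use.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_perf_counter_open
#define __NR_perf_counter_open 319	/* powerpc value from this series */
#endif

/* Local copy of the ABI struct described above (illustrative only). */
struct perf_counter_hw_event {
	int64_t		type;		/* e.g. PERF_COUNT_INSTRUCTIONS */
	uint64_t	irq_period;
	uint32_t	record_type;
	uint32_t	disabled     : 1,
			nmi          : 1,
			raw          : 1,
			__reserved_1 : 29;
	uint64_t	__reserved_2;
};

int main(void)
{
	struct perf_counter_hw_event hw_event = {
		.type = 1,		/* PERF_COUNT_INSTRUCTIONS */
	};
	uint64_t count;
	int fd;

	/* pid == 0, cpu == -1: count this task on whatever CPU it runs on. */
	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	/* ... run the workload to be measured ... */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("instructions: %llu\n", (unsigned long long)count);
	return 0;
}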
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index 703731accda6..7bc7489223f3 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -55,7 +55,7 @@ int irq_select_affinity(unsigned int irq)
55 cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0); 55 cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
56 last_cpu = cpu; 56 last_cpu = cpu;
57 57
58 irq_desc[irq].affinity = cpumask_of_cpu(cpu); 58 cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
59 irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu)); 59 irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu));
60 return 0; 60 return 0;
61} 61}
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 363db186cb93..45eacb5a2ecd 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -104,6 +104,11 @@ static struct irq_desc bad_irq_desc = {
104 .lock = __SPIN_LOCK_UNLOCKED(bad_irq_desc.lock), 104 .lock = __SPIN_LOCK_UNLOCKED(bad_irq_desc.lock),
105}; 105};
106 106
107#ifdef CONFIG_CPUMASK_OFFSTACK
108/* We are not allocating bad_irq_desc.affinity or .pending_mask */
109#error "ARM architecture does not support CONFIG_CPUMASK_OFFSTACK."
110#endif
111
107/* 112/*
108 * do_IRQ handles all hardware IRQ's. Decoded IRQs should not 113 * do_IRQ handles all hardware IRQ's. Decoded IRQs should not
109 * come via this function. Instead, they should provide their 114 * come via this function. Instead, they should provide their
@@ -161,7 +166,7 @@ void __init init_IRQ(void)
161 irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE; 166 irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE;
162 167
163#ifdef CONFIG_SMP 168#ifdef CONFIG_SMP
164 bad_irq_desc.affinity = CPU_MASK_ALL; 169 cpumask_setall(bad_irq_desc.affinity);
165 bad_irq_desc.cpu = smp_processor_id(); 170 bad_irq_desc.cpu = smp_processor_id();
166#endif 171#endif
167 init_arch_irq(); 172 init_arch_irq();
@@ -191,15 +196,16 @@ void migrate_irqs(void)
191 struct irq_desc *desc = irq_desc + i; 196 struct irq_desc *desc = irq_desc + i;
192 197
193 if (desc->cpu == cpu) { 198 if (desc->cpu == cpu) {
194 unsigned int newcpu = any_online_cpu(desc->affinity); 199 unsigned int newcpu = cpumask_any_and(desc->affinity,
195 200 cpu_online_mask);
196 if (newcpu == NR_CPUS) { 201 if (newcpu >= nr_cpu_ids) {
197 if (printk_ratelimit()) 202 if (printk_ratelimit())
198 printk(KERN_INFO "IRQ%u no longer affine to CPU%u\n", 203 printk(KERN_INFO "IRQ%u no longer affine to CPU%u\n",
199 i, cpu); 204 i, cpu);
200 205
201 cpus_setall(desc->affinity); 206 cpumask_setall(desc->affinity);
202 newcpu = any_online_cpu(desc->affinity); 207 newcpu = cpumask_any_and(desc->affinity,
208 cpu_online_mask);
203 } 209 }
204 210
205 route_irq(desc, i, newcpu); 211 route_irq(desc, i, newcpu);
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 00216071eaf7..85598f7da407 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -65,6 +65,7 @@ SECTIONS
65#endif 65#endif
66 . = ALIGN(4096); 66 . = ALIGN(4096);
67 __per_cpu_start = .; 67 __per_cpu_start = .;
68 *(.data.percpu.page_aligned)
68 *(.data.percpu) 69 *(.data.percpu)
69 *(.data.percpu.shared_aligned) 70 *(.data.percpu.shared_aligned)
70 __per_cpu_end = .; 71 __per_cpu_end = .;
diff --git a/arch/arm/oprofile/op_model_mpcore.c b/arch/arm/oprofile/op_model_mpcore.c
index 6d6bd5899240..853d42bb8682 100644
--- a/arch/arm/oprofile/op_model_mpcore.c
+++ b/arch/arm/oprofile/op_model_mpcore.c
@@ -263,7 +263,7 @@ static void em_route_irq(int irq, unsigned int cpu)
263 const struct cpumask *mask = cpumask_of(cpu); 263 const struct cpumask *mask = cpumask_of(cpu);
264 264
265 spin_lock_irq(&desc->lock); 265 spin_lock_irq(&desc->lock);
266 desc->affinity = *mask; 266 cpumask_copy(desc->affinity, mask);
267 desc->chip->set_affinity(irq, mask); 267 desc->chip->set_affinity(irq, mask);
268 spin_unlock_irq(&desc->lock); 268 spin_unlock_irq(&desc->lock);
269} 269}
diff --git a/arch/blackfin/kernel/irqchip.c b/arch/blackfin/kernel/irqchip.c
index 75724eee6494..23e9aa080710 100644
--- a/arch/blackfin/kernel/irqchip.c
+++ b/arch/blackfin/kernel/irqchip.c
@@ -70,6 +70,11 @@ static struct irq_desc bad_irq_desc = {
70#endif 70#endif
71}; 71};
72 72
73#ifdef CONFIG_CPUMASK_OFFSTACK
74/* We are not allocating a variable-sized bad_irq_desc.affinity */
75#error "Blackfin architecture does not support CONFIG_CPUMASK_OFFSTACK."
76#endif
77
73int show_interrupts(struct seq_file *p, void *v) 78int show_interrupts(struct seq_file *p, void *v)
74{ 79{
75 int i = *(loff_t *) v, j; 80 int i = *(loff_t *) v, j;
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 32f3af1641c5..3193f4417e16 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -84,7 +84,7 @@ void build_cpu_to_node_map(void);
84 .child = NULL, \ 84 .child = NULL, \
85 .groups = NULL, \ 85 .groups = NULL, \
86 .min_interval = 8, \ 86 .min_interval = 8, \
87 .max_interval = 8*(min(num_online_cpus(), 32)), \ 87 .max_interval = 8*(min(num_online_cpus(), 32U)), \
88 .busy_factor = 64, \ 88 .busy_factor = 64, \
89 .imbalance_pct = 125, \ 89 .imbalance_pct = 125, \
90 .cache_nice_tries = 2, \ 90 .cache_nice_tries = 2, \
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 5cfd3d91001a..006ad366a454 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -880,7 +880,7 @@ iosapic_unregister_intr (unsigned int gsi)
880 if (iosapic_intr_info[irq].count == 0) { 880 if (iosapic_intr_info[irq].count == 0) {
881#ifdef CONFIG_SMP 881#ifdef CONFIG_SMP
882 /* Clear affinity */ 882 /* Clear affinity */
883 cpus_setall(idesc->affinity); 883 cpumask_setall(idesc->affinity);
884#endif 884#endif
885 /* Clear the interrupt information */ 885 /* Clear the interrupt information */
886 iosapic_intr_info[irq].dest = 0; 886 iosapic_intr_info[irq].dest = 0;
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index a58f64ca9f0e..226233a6fa19 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -103,7 +103,7 @@ static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
103void set_irq_affinity_info (unsigned int irq, int hwid, int redir) 103void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
104{ 104{
105 if (irq < NR_IRQS) { 105 if (irq < NR_IRQS) {
106 cpumask_copy(&irq_desc[irq].affinity, 106 cpumask_copy(irq_desc[irq].affinity,
107 cpumask_of(cpu_logical_id(hwid))); 107 cpumask_of(cpu_logical_id(hwid)));
108 irq_redir[irq] = (char) (redir & 0xff); 108 irq_redir[irq] = (char) (redir & 0xff);
109 } 109 }
@@ -148,7 +148,7 @@ static void migrate_irqs(void)
148 if (desc->status == IRQ_PER_CPU) 148 if (desc->status == IRQ_PER_CPU)
149 continue; 149 continue;
150 150
151 if (cpumask_any_and(&irq_desc[irq].affinity, cpu_online_mask) 151 if (cpumask_any_and(irq_desc[irq].affinity, cpu_online_mask)
152 >= nr_cpu_ids) { 152 >= nr_cpu_ids) {
153 /* 153 /*
154 * Save it for phase 2 processing 154 * Save it for phase 2 processing
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 28d3d483db92..927ad027820c 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -493,11 +493,13 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
493 saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); 493 saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
494 ia64_srlz_d(); 494 ia64_srlz_d();
495 while (vector != IA64_SPURIOUS_INT_VECTOR) { 495 while (vector != IA64_SPURIOUS_INT_VECTOR) {
496 struct irq_desc *desc = irq_to_desc(vector);
497
496 if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) { 498 if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
497 smp_local_flush_tlb(); 499 smp_local_flush_tlb();
498 kstat_this_cpu.irqs[vector]++; 500 kstat_incr_irqs_this_cpu(vector, desc);
499 } else if (unlikely(IS_RESCHEDULE(vector))) 501 } else if (unlikely(IS_RESCHEDULE(vector)))
500 kstat_this_cpu.irqs[vector]++; 502 kstat_incr_irqs_this_cpu(vector, desc);
501 else { 503 else {
502 int irq = local_vector_to_irq(vector); 504 int irq = local_vector_to_irq(vector);
503 505
@@ -551,11 +553,13 @@ void ia64_process_pending_intr(void)
551 * Perform normal interrupt style processing 553 * Perform normal interrupt style processing
552 */ 554 */
553 while (vector != IA64_SPURIOUS_INT_VECTOR) { 555 while (vector != IA64_SPURIOUS_INT_VECTOR) {
556 struct irq_desc *desc = irq_to_desc(vector);
557
554 if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) { 558 if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
555 smp_local_flush_tlb(); 559 smp_local_flush_tlb();
556 kstat_this_cpu.irqs[vector]++; 560 kstat_incr_irqs_this_cpu(vector, desc);
557 } else if (unlikely(IS_RESCHEDULE(vector))) 561 } else if (unlikely(IS_RESCHEDULE(vector)))
558 kstat_this_cpu.irqs[vector]++; 562 kstat_incr_irqs_this_cpu(vector, desc);
559 else { 563 else {
560 struct pt_regs *old_regs = set_irq_regs(NULL); 564 struct pt_regs *old_regs = set_irq_regs(NULL);
561 int irq = local_vector_to_irq(vector); 565 int irq = local_vector_to_irq(vector);
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 890339339035..dcb6b7c51ea7 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -75,7 +75,7 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
75 msg.data = data; 75 msg.data = data;
76 76
77 write_msi_msg(irq, &msg); 77 write_msi_msg(irq, &msg);
78 irq_desc[irq].affinity = cpumask_of_cpu(cpu); 78 cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
79} 79}
80#endif /* CONFIG_SMP */ 80#endif /* CONFIG_SMP */
81 81
@@ -187,7 +187,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
187 msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu)); 187 msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
188 188
189 dmar_msi_write(irq, &msg); 189 dmar_msi_write(irq, &msg);
190 irq_desc[irq].affinity = *mask; 190 cpumask_copy(irq_desc[irq].affinity, mask);
191} 191}
192#endif /* CONFIG_SMP */ 192#endif /* CONFIG_SMP */
193 193
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 10a7d47e8510..f45e4e508eca 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -219,6 +219,7 @@ SECTIONS
219 .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET) 219 .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET)
220 { 220 {
221 __per_cpu_start = .; 221 __per_cpu_start = .;
222 *(.data.percpu.page_aligned)
222 *(.data.percpu) 223 *(.data.percpu)
223 *(.data.percpu.shared_aligned) 224 *(.data.percpu.shared_aligned)
224 __per_cpu_end = .; 225 __per_cpu_end = .;
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index ca553b0429ce..81e428943d73 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -205,7 +205,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
205 msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff); 205 msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff);
206 206
207 write_msi_msg(irq, &msg); 207 write_msi_msg(irq, &msg);
208 irq_desc[irq].affinity = *cpu_mask; 208 cpumask_copy(irq_desc[irq].affinity, cpu_mask);
209} 209}
210#endif /* CONFIG_SMP */ 210#endif /* CONFIG_SMP */
211 211
diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h
index abc62aa744ac..3214ade02d10 100644
--- a/arch/mips/include/asm/irq.h
+++ b/arch/mips/include/asm/irq.h
@@ -66,7 +66,7 @@ extern void smtc_forward_irq(unsigned int irq);
66 */ 66 */
67#define IRQ_AFFINITY_HOOK(irq) \ 67#define IRQ_AFFINITY_HOOK(irq) \
68do { \ 68do { \
69 if (!cpu_isset(smp_processor_id(), irq_desc[irq].affinity)) { \ 69 if (!cpumask_test_cpu(smp_processor_id(), irq_desc[irq].affinity)) {\
70 smtc_forward_irq(irq); \ 70 smtc_forward_irq(irq); \
71 irq_exit(); \ 71 irq_exit(); \
72 return; \ 72 return; \
diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c
index 494a49a317e9..87deb8f6c458 100644
--- a/arch/mips/kernel/irq-gic.c
+++ b/arch/mips/kernel/irq-gic.c
@@ -187,7 +187,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
187 set_bit(irq, pcpu_masks[first_cpu(tmp)].pcpu_mask); 187 set_bit(irq, pcpu_masks[first_cpu(tmp)].pcpu_mask);
188 188
189 } 189 }
190 irq_desc[irq].affinity = *cpumask; 190 cpumask_copy(irq_desc[irq].affinity, cpumask);
191 spin_unlock_irqrestore(&gic_lock, flags); 191 spin_unlock_irqrestore(&gic_lock, flags);
192 192
193} 193}
diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c
index b6cca01ff82b..5f5af7d4c890 100644
--- a/arch/mips/kernel/smtc.c
+++ b/arch/mips/kernel/smtc.c
@@ -686,7 +686,7 @@ void smtc_forward_irq(unsigned int irq)
686 * and efficiency, we just pick the easiest one to find. 686 * and efficiency, we just pick the easiest one to find.
687 */ 687 */
688 688
689 target = first_cpu(irq_desc[irq].affinity); 689 target = cpumask_first(irq_desc[irq].affinity);
690 690
691 /* 691 /*
692 * We depend on the platform code to have correctly processed 692 * We depend on the platform code to have correctly processed
@@ -921,11 +921,13 @@ void ipi_decode(struct smtc_ipi *pipi)
921 struct clock_event_device *cd; 921 struct clock_event_device *cd;
922 void *arg_copy = pipi->arg; 922 void *arg_copy = pipi->arg;
923 int type_copy = pipi->type; 923 int type_copy = pipi->type;
924 int irq = MIPS_CPU_IRQ_BASE + 1;
925
924 smtc_ipi_nq(&freeIPIq, pipi); 926 smtc_ipi_nq(&freeIPIq, pipi);
925 switch (type_copy) { 927 switch (type_copy) {
926 case SMTC_CLOCK_TICK: 928 case SMTC_CLOCK_TICK:
927 irq_enter(); 929 irq_enter();
928 kstat_this_cpu.irqs[MIPS_CPU_IRQ_BASE + 1]++; 930 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
929 cd = &per_cpu(mips_clockevent_device, cpu); 931 cd = &per_cpu(mips_clockevent_device, cpu);
930 cd->event_handler(cd); 932 cd->event_handler(cd);
931 irq_exit(); 933 irq_exit();
diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c
index aabd7274507b..5ba31888fefb 100644
--- a/arch/mips/mti-malta/malta-smtc.c
+++ b/arch/mips/mti-malta/malta-smtc.c
@@ -116,7 +116,7 @@ struct plat_smp_ops msmtc_smp_ops = {
116 116
117void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity) 117void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
118{ 118{
119 cpumask_t tmask = *affinity; 119 cpumask_t tmask;
120 int cpu = 0; 120 int cpu = 0;
121 void smtc_set_irq_affinity(unsigned int irq, cpumask_t aff); 121 void smtc_set_irq_affinity(unsigned int irq, cpumask_t aff);
122 122
@@ -139,11 +139,12 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
139 * be made to forward to an offline "CPU". 139 * be made to forward to an offline "CPU".
140 */ 140 */
141 141
142 cpumask_copy(&tmask, affinity);
142 for_each_cpu(cpu, affinity) { 143 for_each_cpu(cpu, affinity) {
143 if ((cpu_data[cpu].vpe_id != 0) || !cpu_online(cpu)) 144 if ((cpu_data[cpu].vpe_id != 0) || !cpu_online(cpu))
144 cpu_clear(cpu, tmask); 145 cpu_clear(cpu, tmask);
145 } 146 }
146 irq_desc[irq].affinity = tmask; 147 cpumask_copy(irq_desc[irq].affinity, &tmask);
147 148
148 if (cpus_empty(tmask)) 149 if (cpus_empty(tmask))
149 /* 150 /*
diff --git a/arch/mips/sgi-ip22/ip22-int.c b/arch/mips/sgi-ip22/ip22-int.c
index f8b18af141a1..0ecd5fe9486e 100644
--- a/arch/mips/sgi-ip22/ip22-int.c
+++ b/arch/mips/sgi-ip22/ip22-int.c
@@ -155,7 +155,7 @@ static void indy_buserror_irq(void)
155 int irq = SGI_BUSERR_IRQ; 155 int irq = SGI_BUSERR_IRQ;
156 156
157 irq_enter(); 157 irq_enter();
158 kstat_this_cpu.irqs[irq]++; 158 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
159 ip22_be_interrupt(irq); 159 ip22_be_interrupt(irq);
160 irq_exit(); 160 irq_exit();
161} 161}
diff --git a/arch/mips/sgi-ip22/ip22-time.c b/arch/mips/sgi-ip22/ip22-time.c
index 3dcb27ec0c53..c8f7d2328b24 100644
--- a/arch/mips/sgi-ip22/ip22-time.c
+++ b/arch/mips/sgi-ip22/ip22-time.c
@@ -122,7 +122,7 @@ void indy_8254timer_irq(void)
122 char c; 122 char c;
123 123
124 irq_enter(); 124 irq_enter();
125 kstat_this_cpu.irqs[irq]++; 125 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
126 printk(KERN_ALERT "Oops, got 8254 interrupt.\n"); 126 printk(KERN_ALERT "Oops, got 8254 interrupt.\n");
127 ArcRead(0, &c, 1, &cnt); 127 ArcRead(0, &c, 1, &cnt);
128 ArcEnterInteractiveMode(); 128 ArcEnterInteractiveMode();
diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c
index dddfda8e8294..314691648c97 100644
--- a/arch/mips/sibyte/bcm1480/smp.c
+++ b/arch/mips/sibyte/bcm1480/smp.c
@@ -178,9 +178,10 @@ struct plat_smp_ops bcm1480_smp_ops = {
178void bcm1480_mailbox_interrupt(void) 178void bcm1480_mailbox_interrupt(void)
179{ 179{
180 int cpu = smp_processor_id(); 180 int cpu = smp_processor_id();
181 int irq = K_BCM1480_INT_MBOX_0_0;
181 unsigned int action; 182 unsigned int action;
182 183
183 kstat_this_cpu.irqs[K_BCM1480_INT_MBOX_0_0]++; 184 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
184 /* Load the mailbox register to figure out what we're supposed to do */ 185 /* Load the mailbox register to figure out what we're supposed to do */
185 action = (__raw_readq(mailbox_0_regs[cpu]) >> 48) & 0xffff; 186 action = (__raw_readq(mailbox_0_regs[cpu]) >> 48) & 0xffff;
186 187
diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c
index 5950a288a7da..cad14003b84f 100644
--- a/arch/mips/sibyte/sb1250/smp.c
+++ b/arch/mips/sibyte/sb1250/smp.c
@@ -166,9 +166,10 @@ struct plat_smp_ops sb_smp_ops = {
166void sb1250_mailbox_interrupt(void) 166void sb1250_mailbox_interrupt(void)
167{ 167{
168 int cpu = smp_processor_id(); 168 int cpu = smp_processor_id();
169 int irq = K_INT_MBOX_0;
169 unsigned int action; 170 unsigned int action;
170 171
171 kstat_this_cpu.irqs[K_INT_MBOX_0]++; 172 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
172 /* Load the mailbox register to figure out what we're supposed to do */ 173 /* Load the mailbox register to figure out what we're supposed to do */
173 action = (____raw_readq(mailbox_regs[cpu]) >> 48) & 0xffff; 174 action = (____raw_readq(mailbox_regs[cpu]) >> 48) & 0xffff;
174 175
diff --git a/arch/mn10300/kernel/mn10300-watchdog.c b/arch/mn10300/kernel/mn10300-watchdog.c
index 10811e981d20..2e370d88a87a 100644
--- a/arch/mn10300/kernel/mn10300-watchdog.c
+++ b/arch/mn10300/kernel/mn10300-watchdog.c
@@ -130,6 +130,7 @@ void watchdog_interrupt(struct pt_regs *regs, enum exception_code excep)
130 * the stack NMI-atomically, it's safe to use smp_processor_id(). 130 * the stack NMI-atomically, it's safe to use smp_processor_id().
131 */ 131 */
132 int sum, cpu = smp_processor_id(); 132 int sum, cpu = smp_processor_id();
133 int irq = NMIIRQ;
133 u8 wdt, tmp; 134 u8 wdt, tmp;
134 135
135 wdt = WDCTR & ~WDCTR_WDCNE; 136 wdt = WDCTR & ~WDCTR_WDCNE;
@@ -138,7 +139,7 @@ void watchdog_interrupt(struct pt_regs *regs, enum exception_code excep)
138 NMICR = NMICR_WDIF; 139 NMICR = NMICR_WDIF;
139 140
140 nmi_count(cpu)++; 141 nmi_count(cpu)++;
141 kstat_this_cpu.irqs[NMIIRQ]++; 142 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
142 sum = irq_stat[cpu].__irq_count; 143 sum = irq_stat[cpu].__irq_count;
143 144
144 if (last_irq_sums[cpu] == sum) { 145 if (last_irq_sums[cpu] == sum) {
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index ac2c822928c7..49482806863f 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -120,7 +120,7 @@ int cpu_check_affinity(unsigned int irq, cpumask_t *dest)
120 if (CHECK_IRQ_PER_CPU(irq)) { 120 if (CHECK_IRQ_PER_CPU(irq)) {
121 /* Bad linux design decision. The mask has already 121 /* Bad linux design decision. The mask has already
122 * been set; we must reset it */ 122 * been set; we must reset it */
123 irq_desc[irq].affinity = CPU_MASK_ALL; 123 cpumask_setall(irq_desc[irq].affinity);
124 return -EINVAL; 124 return -EINVAL;
125 } 125 }
126 126
@@ -136,7 +136,7 @@ static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
136 if (cpu_check_affinity(irq, dest)) 136 if (cpu_check_affinity(irq, dest))
137 return; 137 return;
138 138
139 irq_desc[irq].affinity = *dest; 139 cpumask_copy(irq_desc[irq].affinity, dest);
140} 140}
141#endif 141#endif
142 142
@@ -295,7 +295,7 @@ int txn_alloc_irq(unsigned int bits_wide)
295unsigned long txn_affinity_addr(unsigned int irq, int cpu) 295unsigned long txn_affinity_addr(unsigned int irq, int cpu)
296{ 296{
297#ifdef CONFIG_SMP 297#ifdef CONFIG_SMP
298 irq_desc[irq].affinity = cpumask_of_cpu(cpu); 298 cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
299#endif 299#endif
300 300
301 return per_cpu(cpu_data, cpu).txn_addr; 301 return per_cpu(cpu_data, cpu).txn_addr;
@@ -352,7 +352,7 @@ void do_cpu_irq_mask(struct pt_regs *regs)
352 irq = eirr_to_irq(eirr_val); 352 irq = eirr_to_irq(eirr_val);
353 353
354#ifdef CONFIG_SMP 354#ifdef CONFIG_SMP
355 dest = irq_desc[irq].affinity; 355 cpumask_copy(&dest, irq_desc[irq].affinity);
356 if (CHECK_IRQ_PER_CPU(irq_desc[irq].status) && 356 if (CHECK_IRQ_PER_CPU(irq_desc[irq].status) &&
357 !cpu_isset(smp_processor_id(), dest)) { 357 !cpu_isset(smp_processor_id(), dest)) {
358 int cpu = first_cpu(dest); 358 int cpu = first_cpu(dest);
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index f75a5fc64d2e..e10f151c3db6 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,5 +131,36 @@ static inline int irqs_disabled_flags(unsigned long flags)
131 */ 131 */
132struct hw_interrupt_type; 132struct hw_interrupt_type;
133 133
134#ifdef CONFIG_PERF_COUNTERS
135static inline unsigned long get_perf_counter_pending(void)
136{
137 unsigned long x;
138
139 asm volatile("lbz %0,%1(13)"
140 : "=r" (x)
141 : "i" (offsetof(struct paca_struct, perf_counter_pending)));
142 return x;
143}
144
145static inline void set_perf_counter_pending(int x)
146{
147 asm volatile("stb %0,%1(13)" : :
148 "r" (x),
149 "i" (offsetof(struct paca_struct, perf_counter_pending)));
150}
151
152extern void perf_counter_do_pending(void);
153
154#else
155
156static inline unsigned long get_perf_counter_pending(void)
157{
158 return 0;
159}
160
161static inline void set_perf_counter_pending(int x) {}
162static inline void perf_counter_do_pending(void) {}
163#endif /* CONFIG_PERF_COUNTERS */
164
134#endif /* __KERNEL__ */ 165#endif /* __KERNEL__ */
135#endif /* _ASM_POWERPC_HW_IRQ_H */ 166#endif /* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 082b3aedf145..6ef055723019 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,6 +99,7 @@ struct paca_struct {
99 u8 soft_enabled; /* irq soft-enable flag */ 99 u8 soft_enabled; /* irq soft-enable flag */
100 u8 hard_enabled; /* set if irqs are enabled in MSR */ 100 u8 hard_enabled; /* set if irqs are enabled in MSR */
101 u8 io_sync; /* writel() needs spin_unlock sync */ 101 u8 io_sync; /* writel() needs spin_unlock sync */
102 u8 perf_counter_pending; /* PM interrupt while soft-disabled */
102 103
103 /* Stuff for accurate time accounting */ 104 /* Stuff for accurate time accounting */
104 u64 user_time; /* accumulated usermode TB ticks */ 105 u64 user_time; /* accumulated usermode TB ticks */
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
new file mode 100644
index 000000000000..9d7ff6d7fb56
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -0,0 +1,72 @@
1/*
2 * Performance counter support - PowerPC-specific definitions.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/types.h>
12
13#define MAX_HWCOUNTERS 8
14#define MAX_EVENT_ALTERNATIVES 8
15
16/*
17 * This struct provides the constants and functions needed to
18 * describe the PMU on a particular POWER-family CPU.
19 */
20struct power_pmu {
21 int n_counter;
22 int max_alternatives;
23 u64 add_fields;
24 u64 test_adder;
25 int (*compute_mmcr)(unsigned int events[], int n_ev,
26 unsigned int hwc[], u64 mmcr[]);
27 int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
28 int (*get_alternatives)(unsigned int event, unsigned int alt[]);
29 void (*disable_pmc)(unsigned int pmc, u64 mmcr[]);
30 int n_generic;
31 int *generic_events;
32};
33
34extern struct power_pmu *ppmu;
35
36/*
37 * The power_pmu.get_constraint function returns a 64-bit value and
38 * a 64-bit mask that express the constraints between this event and
39 * other events.
40 *
41 * The value and mask are divided up into (non-overlapping) bitfields
42 * of three different types:
43 *
44 * Select field: this expresses the constraint that some set of bits
45 * in MMCR* needs to be set to a specific value for this event. For a
46 * select field, the mask contains 1s in every bit of the field, and
47 * the value contains a unique value for each possible setting of the
48 * MMCR* bits. The constraint checking code will ensure that two events
49 * that set the same field in their masks have the same value in their
50 * value dwords.
51 *
52 * Add field: this expresses the constraint that there can be at most
53 * N events in a particular class. A field of k bits can be used for
54 * N <= 2^(k-1) - 1. The mask has the most significant bit of the field
55 * set (and the other bits 0), and the value has only the least significant
56 * bit of the field set. In addition, the 'add_fields' and 'test_adder'
57 * in the struct power_pmu for this processor come into play. The
58 * add_fields value contains 1 in the LSB of the field, and the
59 * test_adder contains 2^(k-1) - 1 - N in the field.
60 *
61 * NAND field: this expresses the constraint that you may not have events
62 * in all of a set of classes. (For example, on PPC970, you can't select
63 * events from the FPU, ISU and IDU simultaneously, although any two are
64 * possible.) For N classes, the field is N+1 bits wide, and each class
65 * is assigned one bit from the least-significant N bits. The mask has
66 * only the most-significant bit set, and the value has only the bit
67 * for the event's class set. The test_adder has the least significant
68 * bit set in the field.
69 *
70 * If an event is not subject to the constraint expressed by a particular
71 * field, then it will have 0 in both the mask and value for that field.
72 */
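For illustration only (not part of this patch): a self-contained worked example
of the "add field" encoding described above, using the same accumulation and
test that power_check_constraints() applies later in this diff
(arch/powerpc/kernel/perf_counter.c). The field placement and the limit N are
invented purely for the example.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/*
	 * One 3-bit add field at bit 0, allowing at most N = 2 events:
	 * test_adder = 2^(3-1) - 1 - N = 1, add_fields has the LSB set,
	 * and each event contributes mask = 0x4 (field MSB), value = 0x1.
	 */
	uint64_t addf = 0x1, tadd = 0x1;
	uint64_t emask = 0x4, evalue = 0x1;
	uint64_t value = 0, mask = 0, nv;
	int i;

	for (i = 1; i <= 3; i++) {
		/* Same accumulation and check as power_check_constraints(). */
		nv = (value | evalue) + (value & evalue & addf);
		if ((((nv + tadd) ^ value) & mask) != 0 ||
		    (((nv + tadd) ^ evalue) & emask) != 0) {
			printf("event %d rejected\n", i);  /* the third event */
			break;
		}
		value = nv;
		mask |= emask;
		printf("event %d accepted, value field = %llu\n",
		       i, (unsigned long long)value);
	}
	return 0;
}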
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 72353f6070a4..4c8095f6bec0 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,3 +322,4 @@ SYSCALL_SPU(epoll_create1)
322SYSCALL_SPU(dup3) 322SYSCALL_SPU(dup3)
323SYSCALL_SPU(pipe2) 323SYSCALL_SPU(pipe2)
324SYSCALL(inotify_init1) 324SYSCALL(inotify_init1)
325SYSCALL(perf_counter_open)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index e07d0c76ed77..7cef5afe89d8 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,10 +341,11 @@
341#define __NR_dup3 316 341#define __NR_dup3 316
342#define __NR_pipe2 317 342#define __NR_pipe2 317
343#define __NR_inotify_init1 318 343#define __NR_inotify_init1 318
344#define __NR_perf_counter_open 319
344 345
345#ifdef __KERNEL__ 346#ifdef __KERNEL__
346 347
347#define __NR_syscalls 319 348#define __NR_syscalls 320
348 349
349#define __NR__exit __NR_exit 350#define __NR__exit __NR_exit
350#define NR_syscalls __NR_syscalls 351#define NR_syscalls __NR_syscalls
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 8d1a419df35d..7c941ec3b23e 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_AUDIT) += audit.o
94obj64-$(CONFIG_AUDIT) += compat_audit.o 94obj64-$(CONFIG_AUDIT) += compat_audit.o
95 95
96obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 96obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
97obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o power6-pmu.o
97 98
98obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o 99obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
99 100
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 19ee491e9e23..3734973f7394 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -131,6 +131,7 @@ int main(void)
131 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); 131 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
132 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); 132 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
133 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); 133 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
134 DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
134 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); 135 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
135 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); 136 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
136 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); 137 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 383ed6eb0085..f30b4e553c53 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
5262: 5262:
527 TRACE_AND_RESTORE_IRQ(r5); 527 TRACE_AND_RESTORE_IRQ(r5);
528 528
529#ifdef CONFIG_PERF_COUNTERS
530 /* check paca->perf_counter_pending if we're enabling ints */
531 lbz r3,PACAPERFPEND(r13)
532 and. r3,r3,r5
533 beq 27f
534 bl .perf_counter_do_pending
53527:
536#endif /* CONFIG_PERF_COUNTERS */
537
529 /* extract EE bit and use it to restore paca->hard_enabled */ 538 /* extract EE bit and use it to restore paca->hard_enabled */
530 ld r3,_MSR(r1) 539 ld r3,_MSR(r1)
531 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ 540 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 23b8b5e36f98..7f8e6a92c5a1 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -104,6 +104,13 @@ static inline notrace void set_soft_enabled(unsigned long enable)
104 : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); 104 : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
105} 105}
106 106
107#ifdef CONFIG_PERF_COUNTERS
108notrace void __weak perf_counter_do_pending(void)
109{
110 set_perf_counter_pending(0);
111}
112#endif
113
107notrace void raw_local_irq_restore(unsigned long en) 114notrace void raw_local_irq_restore(unsigned long en)
108{ 115{
109 /* 116 /*
@@ -135,6 +142,9 @@ notrace void raw_local_irq_restore(unsigned long en)
135 iseries_handle_interrupts(); 142 iseries_handle_interrupts();
136 } 143 }
137 144
145 if (get_perf_counter_pending())
146 perf_counter_do_pending();
147
138 /* 148 /*
139 * if (get_paca()->hard_enabled) return; 149 * if (get_paca()->hard_enabled) return;
140 * But again we need to take care that gcc gets hard_enabled directly 150 * But again we need to take care that gcc gets hard_enabled directly
@@ -231,7 +241,7 @@ void fixup_irqs(cpumask_t map)
231 if (irq_desc[irq].status & IRQ_PER_CPU) 241 if (irq_desc[irq].status & IRQ_PER_CPU)
232 continue; 242 continue;
233 243
234 cpus_and(mask, irq_desc[irq].affinity, map); 244 cpumask_and(&mask, irq_desc[irq].affinity, &map);
235 if (any_online_cpu(mask) == NR_CPUS) { 245 if (any_online_cpu(mask) == NR_CPUS) {
236 printk("Breaking affinity for irq %i\n", irq); 246 printk("Breaking affinity for irq %i\n", irq);
237 mask = map; 247 mask = map;
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
new file mode 100644
index 000000000000..bd6ba85beb54
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,847 @@
1/*
2 * Performance counter support - powerpc architecture code
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/sched.h>
13#include <linux/perf_counter.h>
14#include <linux/percpu.h>
15#include <linux/hardirq.h>
16#include <asm/reg.h>
17#include <asm/pmc.h>
18#include <asm/machdep.h>
19#include <asm/firmware.h>
20
21struct cpu_hw_counters {
22 int n_counters;
23 int n_percpu;
24 int disabled;
25 int n_added;
26 struct perf_counter *counter[MAX_HWCOUNTERS];
27 unsigned int events[MAX_HWCOUNTERS];
28 u64 mmcr[3];
29 u8 pmcs_enabled;
30};
31DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
32
33struct power_pmu *ppmu;
34
35void perf_counter_print_debug(void)
36{
37}
38
39/*
40 * Read one performance monitor counter (PMC).
41 */
42static unsigned long read_pmc(int idx)
43{
44 unsigned long val;
45
46 switch (idx) {
47 case 1:
48 val = mfspr(SPRN_PMC1);
49 break;
50 case 2:
51 val = mfspr(SPRN_PMC2);
52 break;
53 case 3:
54 val = mfspr(SPRN_PMC3);
55 break;
56 case 4:
57 val = mfspr(SPRN_PMC4);
58 break;
59 case 5:
60 val = mfspr(SPRN_PMC5);
61 break;
62 case 6:
63 val = mfspr(SPRN_PMC6);
64 break;
65 case 7:
66 val = mfspr(SPRN_PMC7);
67 break;
68 case 8:
69 val = mfspr(SPRN_PMC8);
70 break;
71 default:
72 printk(KERN_ERR "oops trying to read PMC%d\n", idx);
73 val = 0;
74 }
75 return val;
76}
77
78/*
79 * Write one PMC.
80 */
81static void write_pmc(int idx, unsigned long val)
82{
83 switch (idx) {
84 case 1:
85 mtspr(SPRN_PMC1, val);
86 break;
87 case 2:
88 mtspr(SPRN_PMC2, val);
89 break;
90 case 3:
91 mtspr(SPRN_PMC3, val);
92 break;
93 case 4:
94 mtspr(SPRN_PMC4, val);
95 break;
96 case 5:
97 mtspr(SPRN_PMC5, val);
98 break;
99 case 6:
100 mtspr(SPRN_PMC6, val);
101 break;
102 case 7:
103 mtspr(SPRN_PMC7, val);
104 break;
105 case 8:
106 mtspr(SPRN_PMC8, val);
107 break;
108 default:
109 printk(KERN_ERR "oops trying to write PMC%d\n", idx);
110 }
111}
112
113/*
114 * Check if a set of events can all go on the PMU at once.
115 * If they can't, this will look at alternative codes for the events
116 * and see if any combination of alternative codes is feasible.
117 * The feasible set is returned in event[].
118 */
119static int power_check_constraints(unsigned int event[], int n_ev)
120{
121 u64 mask, value, nv;
122 unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
123 u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
124 u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
125 u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
126 int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
127 int i, j;
128 u64 addf = ppmu->add_fields;
129 u64 tadd = ppmu->test_adder;
130
131 if (n_ev > ppmu->n_counter)
132 return -1;
133
134 /* First see if the events will go on as-is */
135 for (i = 0; i < n_ev; ++i) {
136 alternatives[i][0] = event[i];
137 if (ppmu->get_constraint(event[i], &amasks[i][0],
138 &avalues[i][0]))
139 return -1;
140 choice[i] = 0;
141 }
142 value = mask = 0;
143 for (i = 0; i < n_ev; ++i) {
144 nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
145 if ((((nv + tadd) ^ value) & mask) != 0 ||
146 (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
147 break;
148 value = nv;
149 mask |= amasks[i][0];
150 }
151 if (i == n_ev)
152 return 0; /* all OK */
153
154 /* doesn't work, gather alternatives... */
155 if (!ppmu->get_alternatives)
156 return -1;
157 for (i = 0; i < n_ev; ++i) {
158 n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
159 for (j = 1; j < n_alt[i]; ++j)
160 ppmu->get_constraint(alternatives[i][j],
161 &amasks[i][j], &avalues[i][j]);
162 }
163
164 /* enumerate all possibilities and see if any will work */
165 i = 0;
166 j = -1;
167 value = mask = nv = 0;
168 while (i < n_ev) {
169 if (j >= 0) {
170 /* we're backtracking, restore context */
171 value = svalues[i];
172 mask = smasks[i];
173 j = choice[i];
174 }
175 /*
176 * See if any alternative k for event i,
177 * where k > j, will satisfy the constraints.
178 */
179 while (++j < n_alt[i]) {
180 nv = (value | avalues[i][j]) +
181 (value & avalues[i][j] & addf);
182 if ((((nv + tadd) ^ value) & mask) == 0 &&
183 (((nv + tadd) ^ avalues[i][j])
184 & amasks[i][j]) == 0)
185 break;
186 }
187 if (j >= n_alt[i]) {
188 /*
189 * No feasible alternative, backtrack
190 * to event i-1 and continue enumerating its
191 * alternatives from where we got up to.
192 */
193 if (--i < 0)
194 return -1;
195 } else {
196 /*
197 * Found a feasible alternative for event i,
198 * remember where we got up to with this event,
199 * go on to the next event, and start with
200 * the first alternative for it.
201 */
202 choice[i] = j;
203 svalues[i] = value;
204 smasks[i] = mask;
205 value = nv;
206 mask |= amasks[i][j];
207 ++i;
208 j = -1;
209 }
210 }
211
212 /* OK, we have a feasible combination, tell the caller the solution */
213 for (i = 0; i < n_ev; ++i)
214 event[i] = alternatives[i][choice[i]];
215 return 0;
216}
217
218/*
219 * Check if newly-added counters have consistent settings for
220 * exclude_{user,kernel,hv} with each other and any previously
221 * added counters.
222 */
223static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
224{
225 int eu, ek, eh;
226 int i, n;
227 struct perf_counter *counter;
228
229 n = n_prev + n_new;
230 if (n <= 1)
231 return 0;
232
233 eu = ctrs[0]->hw_event.exclude_user;
234 ek = ctrs[0]->hw_event.exclude_kernel;
235 eh = ctrs[0]->hw_event.exclude_hv;
236 if (n_prev == 0)
237 n_prev = 1;
238 for (i = n_prev; i < n; ++i) {
239 counter = ctrs[i];
240 if (counter->hw_event.exclude_user != eu ||
241 counter->hw_event.exclude_kernel != ek ||
242 counter->hw_event.exclude_hv != eh)
243 return -EAGAIN;
244 }
245 return 0;
246}
247
248static void power_perf_read(struct perf_counter *counter)
249{
250 long val, delta, prev;
251
252 if (!counter->hw.idx)
253 return;
254 /*
255 * Performance monitor interrupts come even when interrupts
256 * are soft-disabled, as long as interrupts are hard-enabled.
257 * Therefore we treat them like NMIs.
258 */
259 do {
260 prev = atomic64_read(&counter->hw.prev_count);
261 barrier();
262 val = read_pmc(counter->hw.idx);
263 } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
264
265 /* The counters are only 32 bits wide */
266 delta = (val - prev) & 0xfffffffful;
267 atomic64_add(delta, &counter->count);
268 atomic64_sub(delta, &counter->hw.period_left);
269}
270
271/*
272 * Disable all counters to prevent PMU interrupts and to allow
273 * counters to be added or removed.
274 */
275u64 hw_perf_save_disable(void)
276{
277 struct cpu_hw_counters *cpuhw;
278 unsigned long ret;
279 unsigned long flags;
280
281 local_irq_save(flags);
282 cpuhw = &__get_cpu_var(cpu_hw_counters);
283
284 ret = cpuhw->disabled;
285 if (!ret) {
286 cpuhw->disabled = 1;
287 cpuhw->n_added = 0;
288
289 /*
290 * Check if we ever enabled the PMU on this cpu.
291 */
292 if (!cpuhw->pmcs_enabled) {
293 if (ppc_md.enable_pmcs)
294 ppc_md.enable_pmcs();
295 cpuhw->pmcs_enabled = 1;
296 }
297
298 /*
299 * Set the 'freeze counters' bit.
300 * The barrier is to make sure the mtspr has been
301 * executed and the PMU has frozen the counters
302 * before we return.
303 */
304 mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
305 mb();
306 }
307 local_irq_restore(flags);
308 return ret;
309}
310
311/*
312 * Re-enable all counters if disable == 0.
313 * If we were previously disabled and counters were added, then
314 * put the new config on the PMU.
315 */
316void hw_perf_restore(u64 disable)
317{
318 struct perf_counter *counter;
319 struct cpu_hw_counters *cpuhw;
320 unsigned long flags;
321 long i;
322 unsigned long val;
323 s64 left;
324 unsigned int hwc_index[MAX_HWCOUNTERS];
325
326 if (disable)
327 return;
328 local_irq_save(flags);
329 cpuhw = &__get_cpu_var(cpu_hw_counters);
330 cpuhw->disabled = 0;
331
332 /*
333 * If we didn't change anything, or only removed counters,
334 * no need to recalculate MMCR* settings and reset the PMCs.
335 * Just reenable the PMU with the current MMCR* settings
336 * (possibly updated for removal of counters).
337 */
338 if (!cpuhw->n_added) {
339 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
340 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
341 mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
342 if (cpuhw->n_counters == 0)
343 get_lppaca()->pmcregs_in_use = 0;
344 goto out;
345 }
346
347 /*
348 * Compute MMCR* values for the new set of counters
349 */
350 if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
351 cpuhw->mmcr)) {
352 /* shouldn't ever get here */
353 printk(KERN_ERR "oops compute_mmcr failed\n");
354 goto out;
355 }
356
357 /*
358 * Add in MMCR0 freeze bits corresponding to the
359 * hw_event.exclude_* bits for the first counter.
360 * We have already checked that all counters have the
361 * same values for these bits as the first counter.
362 */
363 counter = cpuhw->counter[0];
364 if (counter->hw_event.exclude_user)
365 cpuhw->mmcr[0] |= MMCR0_FCP;
366 if (counter->hw_event.exclude_kernel)
367 cpuhw->mmcr[0] |= MMCR0_FCS;
368 if (counter->hw_event.exclude_hv)
369 cpuhw->mmcr[0] |= MMCR0_FCHV;
370
371 /*
372 * Write the new configuration to MMCR* with the freeze
373 * bit set and set the hardware counters to their initial values.
374 * Then unfreeze the counters.
375 */
376 get_lppaca()->pmcregs_in_use = 1;
377 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
378 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
379 mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
380 | MMCR0_FC);
381
382 /*
383 * Read off any pre-existing counters that need to move
384 * to another PMC.
385 */
386 for (i = 0; i < cpuhw->n_counters; ++i) {
387 counter = cpuhw->counter[i];
388 if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
389 power_perf_read(counter);
390 write_pmc(counter->hw.idx, 0);
391 counter->hw.idx = 0;
392 }
393 }
394
395 /*
396 * Initialize the PMCs for all the new and moved counters.
397 */
398 for (i = 0; i < cpuhw->n_counters; ++i) {
399 counter = cpuhw->counter[i];
400 if (counter->hw.idx)
401 continue;
402 val = 0;
403 if (counter->hw_event.irq_period) {
404 left = atomic64_read(&counter->hw.period_left);
405 if (left < 0x80000000L)
406 val = 0x80000000L - left;
407 }
408 atomic64_set(&counter->hw.prev_count, val);
409 counter->hw.idx = hwc_index[i] + 1;
410 write_pmc(counter->hw.idx, val);
411 }
412 mb();
413 cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
414 mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
415
416 out:
417 local_irq_restore(flags);
418}
419
420static int collect_events(struct perf_counter *group, int max_count,
421 struct perf_counter *ctrs[], unsigned int *events)
422{
423 int n = 0;
424 struct perf_counter *counter;
425
426 if (!is_software_counter(group)) {
427 if (n >= max_count)
428 return -1;
429 ctrs[n] = group;
430 events[n++] = group->hw.config;
431 }
432 list_for_each_entry(counter, &group->sibling_list, list_entry) {
433 if (!is_software_counter(counter) &&
434 counter->state != PERF_COUNTER_STATE_OFF) {
435 if (n >= max_count)
436 return -1;
437 ctrs[n] = counter;
438 events[n++] = counter->hw.config;
439 }
440 }
441 return n;
442}
443
444static void counter_sched_in(struct perf_counter *counter, int cpu)
445{
446 counter->state = PERF_COUNTER_STATE_ACTIVE;
447 counter->oncpu = cpu;
448 if (is_software_counter(counter))
449 counter->hw_ops->enable(counter);
450}
451
452/*
453 * Called to enable a whole group of counters.
454 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
455 * Assumes the caller has disabled interrupts and has
456 * frozen the PMU with hw_perf_save_disable.
457 */
458int hw_perf_group_sched_in(struct perf_counter *group_leader,
459 struct perf_cpu_context *cpuctx,
460 struct perf_counter_context *ctx, int cpu)
461{
462 struct cpu_hw_counters *cpuhw;
463 long i, n, n0;
464 struct perf_counter *sub;
465
466 cpuhw = &__get_cpu_var(cpu_hw_counters);
467 n0 = cpuhw->n_counters;
468 n = collect_events(group_leader, ppmu->n_counter - n0,
469 &cpuhw->counter[n0], &cpuhw->events[n0]);
470 if (n < 0)
471 return -EAGAIN;
472 if (check_excludes(cpuhw->counter, n0, n))
473 return -EAGAIN;
474 if (power_check_constraints(cpuhw->events, n + n0))
475 return -EAGAIN;
476 cpuhw->n_counters = n0 + n;
477 cpuhw->n_added += n;
478
479 /*
480 * OK, this group can go on; update counter states etc.,
481 * and enable any software counters
482 */
483 for (i = n0; i < n0 + n; ++i)
484 cpuhw->counter[i]->hw.config = cpuhw->events[i];
485 cpuctx->active_oncpu += n;
486 n = 1;
487 counter_sched_in(group_leader, cpu);
488 list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
489 if (sub->state != PERF_COUNTER_STATE_OFF) {
490 counter_sched_in(sub, cpu);
491 ++n;
492 }
493 }
494 ctx->nr_active += n;
495
496 return 1;
497}
498
499/*
500 * Add a counter to the PMU.
501 * We disable and then re-enable the PMU; if it was not already
502 * frozen by the caller, hw_perf_restore does the actual work of
503 * reconfiguring the PMU for the new counter.
504 */
505static int power_perf_enable(struct perf_counter *counter)
506{
507 struct cpu_hw_counters *cpuhw;
508 unsigned long flags;
509 u64 pmudis;
510 int n0;
511 int ret = -EAGAIN;
512
513 local_irq_save(flags);
514 pmudis = hw_perf_save_disable();
515
516 /*
517 * Add the counter to the list (if there is room)
518 * and check whether the total set is still feasible.
519 */
520 cpuhw = &__get_cpu_var(cpu_hw_counters);
521 n0 = cpuhw->n_counters;
522 if (n0 >= ppmu->n_counter)
523 goto out;
524 cpuhw->counter[n0] = counter;
525 cpuhw->events[n0] = counter->hw.config;
526 if (check_excludes(cpuhw->counter, n0, 1))
527 goto out;
528 if (power_check_constraints(cpuhw->events, n0 + 1))
529 goto out;
530
531 counter->hw.config = cpuhw->events[n0];
532 ++cpuhw->n_counters;
533 ++cpuhw->n_added;
534
535 ret = 0;
536 out:
537 hw_perf_restore(pmudis);
538 local_irq_restore(flags);
539 return ret;
540}
541
542/*
543 * Remove a counter from the PMU.
544 */
545static void power_perf_disable(struct perf_counter *counter)
546{
547 struct cpu_hw_counters *cpuhw;
548 long i;
549 u64 pmudis;
550 unsigned long flags;
551
552 local_irq_save(flags);
553 pmudis = hw_perf_save_disable();
554
555 power_perf_read(counter);
556
557 cpuhw = &__get_cpu_var(cpu_hw_counters);
558 for (i = 0; i < cpuhw->n_counters; ++i) {
559 if (counter == cpuhw->counter[i]) {
560 while (++i < cpuhw->n_counters)
561 cpuhw->counter[i-1] = cpuhw->counter[i];
562 --cpuhw->n_counters;
563 ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
564 write_pmc(counter->hw.idx, 0);
565 counter->hw.idx = 0;
566 break;
567 }
568 }
569 if (cpuhw->n_counters == 0) {
570 /* disable exceptions if no counters are running */
571 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
572 }
573
574 hw_perf_restore(pmudis);
575 local_irq_restore(flags);
576}
577
578struct hw_perf_counter_ops power_perf_ops = {
579 .enable = power_perf_enable,
580 .disable = power_perf_disable,
581 .read = power_perf_read
582};
583
584const struct hw_perf_counter_ops *
585hw_perf_counter_init(struct perf_counter *counter)
586{
587 unsigned long ev;
588 struct perf_counter *ctrs[MAX_HWCOUNTERS];
589 unsigned int events[MAX_HWCOUNTERS];
590 int n;
591
592 if (!ppmu)
593 return NULL;
594 if ((s64)counter->hw_event.irq_period < 0)
595 return NULL;
596 ev = counter->hw_event.type;
597 if (!counter->hw_event.raw) {
598 if (ev >= ppmu->n_generic ||
599 ppmu->generic_events[ev] == 0)
600 return NULL;
601 ev = ppmu->generic_events[ev];
602 }
603 counter->hw.config_base = ev;
604 counter->hw.idx = 0;
605
606 /*
607 * If we are not running on a hypervisor, force the
608 * exclude_hv bit to 0 so that we don't care what
609 * the user set it to. This also means that we don't
610 * set the MMCR0_FCHV bit, which unconditionally freezes
611 * the counters on the PPC970 variants used in Apple G5
612 * machines (since MSR.HV is always 1 on those machines).
613 */
614 if (!firmware_has_feature(FW_FEATURE_LPAR))
615 counter->hw_event.exclude_hv = 0;
616
617 /*
618 * If this is in a group, check if it can go on with all the
619 * other hardware counters in the group. We assume the counter
620 * hasn't been linked into its leader's sibling list at this point.
621 */
622 n = 0;
623 if (counter->group_leader != counter) {
624 n = collect_events(counter->group_leader, ppmu->n_counter - 1,
625 ctrs, events);
626 if (n < 0)
627 return NULL;
628 }
629 events[n] = ev;
630 if (check_excludes(ctrs, n, 1))
631 return NULL;
632 if (power_check_constraints(events, n + 1))
633 return NULL;
634
635 counter->hw.config = events[n];
636 atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
637 return &power_perf_ops;
638}
639
640/*
641 * Handle wakeups.
642 */
643void perf_counter_do_pending(void)
644{
645 int i;
646 struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
647 struct perf_counter *counter;
648
649 set_perf_counter_pending(0);
650 for (i = 0; i < cpuhw->n_counters; ++i) {
651 counter = cpuhw->counter[i];
652 if (counter && counter->wakeup_pending) {
653 counter->wakeup_pending = 0;
654 wake_up(&counter->waitq);
655 }
656 }
657}
658
659/*
660 * Record data for an irq counter.
661 * This function was lifted from the x86 code; maybe it should
662 * go in the core?
663 */
664static void perf_store_irq_data(struct perf_counter *counter, u64 data)
665{
666 struct perf_data *irqdata = counter->irqdata;
667
668 if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
669 irqdata->overrun++;
670 } else {
671 u64 *p = (u64 *) &irqdata->data[irqdata->len];
672
673 *p = data;
674 irqdata->len += sizeof(u64);
675 }
676}
677
678/*
679 * Record all the values of the counters in a group
680 */
681static void perf_handle_group(struct perf_counter *counter)
682{
683 struct perf_counter *leader, *sub;
684
685 leader = counter->group_leader;
686 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
687 if (sub != counter)
688 sub->hw_ops->read(sub);
689 perf_store_irq_data(counter, sub->hw_event.type);
690 perf_store_irq_data(counter, atomic64_read(&sub->count));
691 }
692}
693
694/*
695 * A counter has overflowed; update its count and record
696 * things if requested. Note that interrupts are hard-disabled
697 * here so there is no possibility of being interrupted.
698 */
699static void record_and_restart(struct perf_counter *counter, long val,
700 struct pt_regs *regs)
701{
702 s64 prev, delta, left;
703 int record = 0;
704
705 /* we don't have to worry about interrupts here */
706 prev = atomic64_read(&counter->hw.prev_count);
707 delta = (val - prev) & 0xfffffffful;
708 atomic64_add(delta, &counter->count);
709
710 /*
711 * See if the total period for this counter has expired,
712 * and update for the next period.
713 */
714 val = 0;
715 left = atomic64_read(&counter->hw.period_left) - delta;
716 if (counter->hw_event.irq_period) {
717 if (left <= 0) {
718 left += counter->hw_event.irq_period;
719 if (left <= 0)
720 left = counter->hw_event.irq_period;
721 record = 1;
722 }
723 if (left < 0x80000000L)
724 val = 0x80000000L - left;
725 }
726 write_pmc(counter->hw.idx, val);
727 atomic64_set(&counter->hw.prev_count, val);
728 atomic64_set(&counter->hw.period_left, left);
729
730 /*
731 * Finally record data if requested.
732 */
733 if (record) {
734 switch (counter->hw_event.record_type) {
735 case PERF_RECORD_SIMPLE:
736 break;
737 case PERF_RECORD_IRQ:
738 perf_store_irq_data(counter, instruction_pointer(regs));
739 counter->wakeup_pending = 1;
740 break;
741 case PERF_RECORD_GROUP:
742 perf_handle_group(counter);
743 counter->wakeup_pending = 1;
744 break;
745 }
746 }
747}
748
749/*
750 * Performance monitor interrupt stuff
751 */
752static void perf_counter_interrupt(struct pt_regs *regs)
753{
754 int i;
755 struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
756 struct perf_counter *counter;
757 long val;
758 int need_wakeup = 0, found = 0;
759
760 for (i = 0; i < cpuhw->n_counters; ++i) {
761 counter = cpuhw->counter[i];
762 val = read_pmc(counter->hw.idx);
763 if ((int)val < 0) {
764 /* counter has overflowed */
765 found = 1;
766 record_and_restart(counter, val, regs);
767 if (counter->wakeup_pending)
768 need_wakeup = 1;
769 }
770 }
771
772 /*
773 * In case we didn't find and reset the counter that caused
774 * the interrupt, scan all counters and reset any that are
775 * negative, to avoid getting continual interrupts.
776 * Any that we processed in the previous loop will not be negative.
777 */
778 if (!found) {
779 for (i = 0; i < ppmu->n_counter; ++i) {
780 val = read_pmc(i + 1);
781 if ((int)val < 0)
782 write_pmc(i + 1, 0);
783 }
784 }
785
786 /*
787 * Reset MMCR0 to its normal value. This will set PMXE and
788 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
789 * and thus allow interrupts to occur again.
790 * XXX might want to use MSR.PM to keep the counters frozen until
791 * we get back out of this interrupt.
792 */
793 mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
794
795 /*
796 * If we need a wakeup, check whether interrupts were soft-enabled
797 * when we took the interrupt. If they were, we can wake stuff up
798 * immediately; otherwise we'll have to set a flag and do the
799 * wakeup when interrupts get soft-enabled.
800 */
801 if (need_wakeup) {
802 if (regs->softe) {
803 irq_enter();
804 perf_counter_do_pending();
805 irq_exit();
806 } else {
807 set_perf_counter_pending(1);
808 }
809 }
810}
811
812void hw_perf_counter_setup(int cpu)
813{
814 struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
815
816 memset(cpuhw, 0, sizeof(*cpuhw));
817 cpuhw->mmcr[0] = MMCR0_FC;
818}
819
820extern struct power_pmu ppc970_pmu;
821extern struct power_pmu power6_pmu;
822
823static int init_perf_counters(void)
824{
825 unsigned long pvr;
826
827 if (reserve_pmc_hardware(perf_counter_interrupt)) {
828 printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
829 return -EBUSY;
830 }
831
832 /* XXX should get this from cputable */
833 pvr = mfspr(SPRN_PVR);
834 switch (PVR_VER(pvr)) {
835 case PV_970:
836 case PV_970FX:
837 case PV_970MP:
838 ppmu = &ppc970_pmu;
839 break;
840 case 0x3e:
841 ppmu = &power6_pmu;
842 break;
843 }
844 return 0;
845}
846
847arch_initcall(init_perf_counters);
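The PMCs managed above are only 32 bits wide: power_perf_read() folds each hardware reading into the 64-bit software count by masking the difference to 32 bits, and record_and_restart()/hw_perf_restore() re-arm a sampling counter by programming it to 0x80000000 - left (where left is the number of events remaining) so that bit 31 becomes set after that many more events. Below is a minimal host-side sketch of that arithmetic, offered purely as an illustration; a plain variable stands in for the hardware PMC register and nothing here is kernel code.

#include <stdint.h>
#include <stdio.h>

/* Plain variable standing in for the 32-bit hardware PMC register. */
static uint32_t pmc;

/* Fold a new 32-bit reading into a 64-bit software count (cf. power_perf_read). */
static void fold_delta(uint64_t *count, uint32_t *prev, uint32_t val)
{
	*count += (uint64_t)((val - *prev) & 0xffffffffUL);
	*prev = val;
}

int main(void)
{
	uint64_t count = 0;
	uint64_t left = 1000;			/* events until the next sample */
	uint32_t prev;

	/* Re-arm: start close enough to 2^31 that bit 31 flips after 'left' events. */
	pmc = prev = (uint32_t)(0x80000000UL - left);

	pmc += 1000;				/* pretend 1000 events occurred */
	if ((int32_t)pmc < 0)			/* bit 31 set: the counter "overflowed" */
		printf("overflow reached after %llu events\n",
		       (unsigned long long)left);

	fold_delta(&count, &prev, pmc);
	printf("accumulated count = %llu\n", (unsigned long long)count);
	return 0;
}

Starting a sampling counter at 0x80000000 - left means "has it overflowed?" reduces to "is bit 31 set?", which is exactly the (int)val < 0 test perf_counter_interrupt() applies.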
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
new file mode 100644
index 000000000000..b1f61f3c97bb
--- /dev/null
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -0,0 +1,283 @@
1/*
2 * Performance counter support for POWER6 processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER6
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0x7
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* Unit event comes from (TTMxSEL encoding) */
22#define PM_UNIT_MSK 0xf
23#define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH)
24#define PM_LLAV 0x8000 /* Load lookahead match value */
25#define PM_LLA 0x4000 /* Load lookahead match enable */
26#define PM_BYTE_SH 12 /* Byte of event bus to use */
27#define PM_BYTE_MSK 3
28#define PM_SUBUNIT_SH 8 /* Subunit event comes from (NEST_SEL enc.) */
29#define PM_SUBUNIT_MSK 7
30#define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
31#define PM_PMCSEL_MSK 0xff /* PMCxSEL value */
32#define PM_BUSEVENT_MSK 0xf3700
33
34/*
35 * Bits in MMCR1 for POWER6
36 */
37#define MMCR1_TTM0SEL_SH 60
38#define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4)
39#define MMCR1_TTMSEL_MSK 0xf
40#define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
41#define MMCR1_NESTSEL_SH 45
42#define MMCR1_NESTSEL_MSK 0x7
43#define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
44#define MMCR1_PMC1_LLA ((u64)1 << 44)
45#define MMCR1_PMC1_LLA_VALUE ((u64)1 << 39)
46#define MMCR1_PMC1_ADDR_SEL ((u64)1 << 35)
47#define MMCR1_PMC1SEL_SH 24
48#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
49#define MMCR1_PMCSEL_MSK 0xff
50
51/*
52 * Assign PMC numbers and compute MMCR1 value for a set of events
53 */
54static int p6_compute_mmcr(unsigned int event[], int n_ev,
55 unsigned int hwc[], u64 mmcr[])
56{
57 u64 mmcr1 = 0;
58 int i;
59 unsigned int pmc, ev, b, u, s, psel;
60 unsigned int ttmset = 0;
61 unsigned int pmc_inuse = 0;
62
63 if (n_ev > 4)
64 return -1;
65 for (i = 0; i < n_ev; ++i) {
66 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
67 if (pmc) {
68 if (pmc_inuse & (1 << (pmc - 1)))
69 return -1; /* collision! */
70 pmc_inuse |= 1 << (pmc - 1);
71 }
72 }
73 for (i = 0; i < n_ev; ++i) {
74 ev = event[i];
75 pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
76 if (pmc) {
77 --pmc;
78 } else {
79 /* can go on any PMC; find a free one */
80 for (pmc = 0; pmc < 4; ++pmc)
81 if (!(pmc_inuse & (1 << pmc)))
82 break;
83 pmc_inuse |= 1 << pmc;
84 }
85 hwc[i] = pmc;
86 psel = ev & PM_PMCSEL_MSK;
87 if (ev & PM_BUSEVENT_MSK) {
88 /* this event uses the event bus */
89 b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
90 u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
91 /* check for conflict on this byte of event bus */
92 if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
93 return -1;
94 mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
95 ttmset |= 1 << b;
96 if (u == 5) {
97 /* Nest events have a further mux */
98 s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
99 if ((ttmset & 0x10) &&
100 MMCR1_NESTSEL(mmcr1) != s)
101 return -1;
102 ttmset |= 0x10;
103 mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
104 }
105 if (0x30 <= psel && psel <= 0x3d) {
106 /* these need the PMCx_ADDR_SEL bits */
107 if (b >= 2)
108 mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
109 }
110 /* bus select values are different for PMC3/4 */
111 if (pmc >= 2 && (psel & 0x90) == 0x80)
112 psel ^= 0x20;
113 }
114 if (ev & PM_LLA) {
115 mmcr1 |= MMCR1_PMC1_LLA >> pmc;
116 if (ev & PM_LLAV)
117 mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
118 }
119 mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
120 }
121 mmcr[0] = 0;
122 if (pmc_inuse & 1)
123 mmcr[0] = MMCR0_PMC1CE;
124 if (pmc_inuse & 0xe)
125 mmcr[0] |= MMCR0_PMCjCE;
126 mmcr[1] = mmcr1;
127 mmcr[2] = 0;
128 return 0;
129}
130
131/*
132 * Layout of constraint bits:
133 *
134 * 0-1 add field: number of uses of PMC1 (max 1)
135 * 2-3, 4-5, 6-7: ditto for PMC2, 3, 4
136 * 8-10 select field: nest (subunit) event selector
137 * 16-19 select field: unit on byte 0 of event bus
138 * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
139 */
140static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
141{
142 int pmc, byte, sh;
143 unsigned int mask = 0, value = 0;
144
145 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
146 if (pmc) {
147 if (pmc > 4)
148 return -1;
149 sh = (pmc - 1) * 2;
150 mask |= 2 << sh;
151 value |= 1 << sh;
152 }
153 if (event & PM_BUSEVENT_MSK) {
154 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
155 sh = byte * 4;
156 mask |= PM_UNIT_MSKS << sh;
157 value |= (event & PM_UNIT_MSKS) << sh;
158 if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
159 mask |= PM_SUBUNIT_MSKS;
160 value |= event & PM_SUBUNIT_MSKS;
161 }
162 }
163 *maskp = mask;
164 *valp = value;
165 return 0;
166}
167
168#define MAX_ALT 4 /* at most 4 alternatives for any event */
169
170static const unsigned int event_alternatives[][MAX_ALT] = {
171 { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */
172 { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
173 { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */
174 { 0x10000a, 0x2000f4 }, /* PM_RUN_CYC */
175 { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */
176 { 0x10000e, 0x400010 }, /* PM_PURR */
177 { 0x100010, 0x4000f8 }, /* PM_FLUSH */
178 { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */
179 { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */
180 { 0x100054, 0x2000f0 }, /* PM_ST_FIN */
181 { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */
182 { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */
183 { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */
184 { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */
185 { 0x20000e, 0x400007 }, /* PM_LSU_DERAT_MISS */
186 { 0x200012, 0x300012 }, /* PM_INST_DISP */
187 { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */
188 { 0x2000f8, 0x300010 }, /* PM_EXT_INT */
189 { 0x2000fe, 0x300056 }, /* PM_DATA_FROM_L2MISS */
190 { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */
191 { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */
192 { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */
193 { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */
194};
195
196/*
197 * This could be made more efficient with a binary search on
198 * a presorted list, if necessary
199 */
200static int find_alternatives_list(unsigned int event)
201{
202 int i, j;
203 unsigned int alt;
204
205 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
206 if (event < event_alternatives[i][0])
207 return -1;
208 for (j = 0; j < MAX_ALT; ++j) {
209 alt = event_alternatives[i][j];
210 if (!alt || event < alt)
211 break;
212 if (event == alt)
213 return i;
214 }
215 }
216 return -1;
217}
218
219static int p6_get_alternatives(unsigned int event, unsigned int alt[])
220{
221 int i, j;
222 unsigned int aevent, psel, pmc;
223 unsigned int nalt = 1;
224
225 alt[0] = event;
226
227 /* check the alternatives table */
228 i = find_alternatives_list(event);
229 if (i >= 0) {
230 /* copy out alternatives from list */
231 for (j = 0; j < MAX_ALT; ++j) {
232 aevent = event_alternatives[i][j];
233 if (!aevent)
234 break;
235 if (aevent != event)
236 alt[nalt++] = aevent;
237 }
238
239 } else {
240 /* Check for alternative ways of computing sum events */
241 /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
242 psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */
243 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
244 if (pmc && (psel == 0x32 || psel == 0x34))
245 alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
246 ((5 - pmc) << PM_PMC_SH);
247
248 /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
249 if (pmc && (psel == 0x38 || psel == 0x3a))
250 alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
251 ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
252 }
253
254 return nalt;
255}
256
257static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
258{
259 /* Set PMCxSEL to 0 to disable PMCx */
260 mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
261}
262
263static int power6_generic_events[] = {
264 [PERF_COUNT_CPU_CYCLES] = 0x1e,
265 [PERF_COUNT_INSTRUCTIONS] = 2,
266 [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */
267 [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */
268 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */
269 [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */
270};
271
272struct power_pmu power6_pmu = {
273 .n_counter = 4,
274 .max_alternatives = MAX_ALT,
275 .add_fields = 0x55,
276 .test_adder = 0,
277 .compute_mmcr = p6_compute_mmcr,
278 .get_constraint = p6_get_constraint,
279 .get_alternatives = p6_get_alternatives,
280 .disable_pmc = p6_disable_pmc,
281 .n_generic = ARRAY_SIZE(power6_generic_events),
282 .generic_events = power6_generic_events,
283};
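The PM_* shifts and masks at the top of power6-pmu.c completely describe how a raw POWER6 event code is packed, and p6_compute_mmcr()/p6_get_constraint() are just different consumers of those fields. Here is a small host-side sketch that only pulls the fields apart, using 0x280030 (the LD_REF_L1 value from power6_generic_events) as input; it illustrates the encoding and is not kernel code.

#include <stdio.h>

/* Field layout copied from power6-pmu.c above. */
#define PM_PMC_SH	20
#define PM_PMC_MSK	0x7
#define PM_UNIT_SH	16
#define PM_UNIT_MSK	0xf
#define PM_BYTE_SH	12
#define PM_BYTE_MSK	3
#define PM_PMCSEL_MSK	0xff
#define PM_BUSEVENT_MSK	0xf3700

int main(void)
{
	unsigned int ev = 0x280030;	/* LD_REF_L1 from power6_generic_events */

	printf("pmc  = %u\n", (ev >> PM_PMC_SH) & PM_PMC_MSK);
	printf("unit = %u\n", (ev >> PM_UNIT_SH) & PM_UNIT_MSK);
	printf("byte = %u\n", (ev >> PM_BYTE_SH) & PM_BYTE_MSK);
	printf("psel = 0x%x\n", ev & PM_PMCSEL_MSK);
	printf("bus event? %s\n", (ev & PM_BUSEVENT_MSK) ? "yes" : "no");
	return 0;
}

For 0x280030 this decodes to PMC 2, unit 8, byte 0, PMCSEL 0x30, and the PM_BUSEVENT_MSK test marks it as a bus event, so p6_compute_mmcr() would route it through the TTMxSEL byte-lane logic.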
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
new file mode 100644
index 000000000000..c3256580be1a
--- /dev/null
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -0,0 +1,375 @@
1/*
2 * Performance counter support for PPC970-family processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/string.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for PPC970
17 */
18#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
21#define PM_UNIT_MSK 0xf
22#define PM_BYTE_SH 4 /* Byte number of event bus to use */
23#define PM_BYTE_MSK 3
24#define PM_PMCSEL_MSK 0xf
25
26/* Values in PM_UNIT field */
27#define PM_NONE 0
28#define PM_FPU 1
29#define PM_VPU 2
30#define PM_ISU 3
31#define PM_IFU 4
32#define PM_IDU 5
33#define PM_STS 6
34#define PM_LSU0 7
35#define PM_LSU1U 8
36#define PM_LSU1L 9
37#define PM_LASTUNIT 9
38
39/*
40 * Bits in MMCR0 for PPC970
41 */
42#define MMCR0_PMC1SEL_SH 8
43#define MMCR0_PMC2SEL_SH 1
44#define MMCR_PMCSEL_MSK 0x1f
45
46/*
47 * Bits in MMCR1 for PPC970
48 */
49#define MMCR1_TTM0SEL_SH 62
50#define MMCR1_TTM1SEL_SH 59
51#define MMCR1_TTM3SEL_SH 53
52#define MMCR1_TTMSEL_MSK 3
53#define MMCR1_TD_CP_DBG0SEL_SH 50
54#define MMCR1_TD_CP_DBG1SEL_SH 48
55#define MMCR1_TD_CP_DBG2SEL_SH 46
56#define MMCR1_TD_CP_DBG3SEL_SH 44
57#define MMCR1_PMC1_ADDER_SEL_SH 39
58#define MMCR1_PMC2_ADDER_SEL_SH 38
59#define MMCR1_PMC6_ADDER_SEL_SH 37
60#define MMCR1_PMC5_ADDER_SEL_SH 36
61#define MMCR1_PMC8_ADDER_SEL_SH 35
62#define MMCR1_PMC7_ADDER_SEL_SH 34
63#define MMCR1_PMC3_ADDER_SEL_SH 33
64#define MMCR1_PMC4_ADDER_SEL_SH 32
65#define MMCR1_PMC3SEL_SH 27
66#define MMCR1_PMC4SEL_SH 22
67#define MMCR1_PMC5SEL_SH 17
68#define MMCR1_PMC6SEL_SH 12
69#define MMCR1_PMC7SEL_SH 7
70#define MMCR1_PMC8SEL_SH 2
71
72static short mmcr1_adder_bits[8] = {
73 MMCR1_PMC1_ADDER_SEL_SH,
74 MMCR1_PMC2_ADDER_SEL_SH,
75 MMCR1_PMC3_ADDER_SEL_SH,
76 MMCR1_PMC4_ADDER_SEL_SH,
77 MMCR1_PMC5_ADDER_SEL_SH,
78 MMCR1_PMC6_ADDER_SEL_SH,
79 MMCR1_PMC7_ADDER_SEL_SH,
80 MMCR1_PMC8_ADDER_SEL_SH
81};
82
83/*
84 * Bits in MMCRA
85 */
86
87/*
88 * Layout of constraint bits:
89 * 6666555555555544444444443333333333222222222211111111110000000000
90 * 3210987654321098765432109876543210987654321098765432109876543210
91 * <><>[ >[ >[ >< >< >< >< ><><><><><><><><>
92 * T0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
93 *
94 * T0 - TTM0 constraint
95 * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
96 *
97 * T1 - TTM1 constraint
98 * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
99 *
100 * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
101 * 43: UC3 error 0x0800_0000_0000
102 * 42: FPU|IFU|VPU events needed 0x0400_0000_0000
103 * 41: ISU events needed 0x0200_0000_0000
104 * 40: IDU|STS events needed 0x0100_0000_0000
105 *
106 * PS1
107 * 39: PS1 error 0x0080_0000_0000
108 * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
109 *
110 * PS2
111 * 35: PS2 error 0x0008_0000_0000
112 * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
113 *
114 * B0
115 * 28-31: Byte 0 event source 0xf000_0000
116 * Encoding as for the event code
117 *
118 * B1, B2, B3
119 * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
120 *
121 * P1
122 * 15: P1 error 0x8000
123 * 14-15: Count of events needing PMC1
124 *
125 * P2..P8
126 * 0-13: Count of events needing PMC2..PMC8
127 */
128
129/* Masks and values for using events from the various units */
130static u64 unit_cons[PM_LASTUNIT+1][2] = {
131 [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull },
132 [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull },
133 [PM_ISU] = { 0x080000000000ull, 0x020000000000ull },
134 [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull },
135 [PM_IDU] = { 0x380000000000ull, 0x010000000000ull },
136 [PM_STS] = { 0x380000000000ull, 0x310000000000ull },
137};
138
139static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
140{
141 int pmc, byte, unit, sh;
142 u64 mask = 0, value = 0;
143 int grp = -1;
144
145 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
146 if (pmc) {
147 if (pmc > 8)
148 return -1;
149 sh = (pmc - 1) * 2;
150 mask |= 2 << sh;
151 value |= 1 << sh;
152 grp = ((pmc - 1) >> 1) & 1;
153 }
154 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
155 if (unit) {
156 if (unit > PM_LASTUNIT)
157 return -1;
158 mask |= unit_cons[unit][0];
159 value |= unit_cons[unit][1];
160 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
161 /*
162 * Bus events on bytes 0 and 2 can be counted
163 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
164 */
165 if (!pmc)
166 grp = byte & 1;
167 /* Set byte lane select field */
168 mask |= 0xfULL << (28 - 4 * byte);
169 value |= (u64)unit << (28 - 4 * byte);
170 }
171 if (grp == 0) {
172 /* increment PMC1/2/5/6 field */
173 mask |= 0x8000000000ull;
174 value |= 0x1000000000ull;
175 } else if (grp == 1) {
176 /* increment PMC3/4/7/8 field */
177 mask |= 0x800000000ull;
178 value |= 0x100000000ull;
179 }
180 *maskp = mask;
181 *valp = value;
182 return 0;
183}
184
185static int p970_get_alternatives(unsigned int event, unsigned int alt[])
186{
187 alt[0] = event;
188
189 /* 2 alternatives for LSU empty */
190 if (event == 0x2002 || event == 0x3002) {
191 alt[1] = event ^ 0x1000;
192 return 2;
193 }
194
195 return 1;
196}
197
198static int p970_compute_mmcr(unsigned int event[], int n_ev,
199 unsigned int hwc[], u64 mmcr[])
200{
201 u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
202 unsigned int pmc, unit, byte, psel;
203 unsigned int ttm, grp;
204 unsigned int pmc_inuse = 0;
205 unsigned int pmc_grp_use[2];
206 unsigned char busbyte[4];
207 unsigned char unituse[16];
208 unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
209 unsigned char ttmuse[2];
210 unsigned char pmcsel[8];
211 int i;
212
213 if (n_ev > 8)
214 return -1;
215
216 /* First pass to count resource use */
217 pmc_grp_use[0] = pmc_grp_use[1] = 0;
218 memset(busbyte, 0, sizeof(busbyte));
219 memset(unituse, 0, sizeof(unituse));
220 for (i = 0; i < n_ev; ++i) {
221 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
222 if (pmc) {
223 if (pmc_inuse & (1 << (pmc - 1)))
224 return -1;
225 pmc_inuse |= 1 << (pmc - 1);
226 /* count 1/2/5/6 vs 3/4/7/8 use */
227 ++pmc_grp_use[((pmc - 1) >> 1) & 1];
228 }
229 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
230 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
231 if (unit) {
232 if (unit > PM_LASTUNIT)
233 return -1;
234 if (!pmc)
235 ++pmc_grp_use[byte & 1];
236 if (busbyte[byte] && busbyte[byte] != unit)
237 return -1;
238 busbyte[byte] = unit;
239 unituse[unit] = 1;
240 }
241 }
242 if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
243 return -1;
244
245 /*
246 * Assign resources and set multiplexer selects.
247 *
248 * PM_ISU can go either on TTM0 or TTM1, but that's the only
249 * choice we have to deal with.
250 */
251 if (unituse[PM_ISU] &
252 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
253 unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */
254 /* Set TTM[01]SEL fields. */
255 ttmuse[0] = ttmuse[1] = 0;
256 for (i = PM_FPU; i <= PM_STS; ++i) {
257 if (!unituse[i])
258 continue;
259 ttm = unitmap[i];
260 ++ttmuse[(ttm >> 2) & 1];
261 mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
262 }
263 /* Check only one unit per TTMx */
264 if (ttmuse[0] > 1 || ttmuse[1] > 1)
265 return -1;
266
267 /* Set byte lane select fields and TTM3SEL. */
268 for (byte = 0; byte < 4; ++byte) {
269 unit = busbyte[byte];
270 if (!unit)
271 continue;
272 if (unit <= PM_STS)
273 ttm = (unitmap[unit] >> 2) & 1;
274 else if (unit == PM_LSU0)
275 ttm = 2;
276 else {
277 ttm = 3;
278 if (unit == PM_LSU1L && byte >= 2)
279 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
280 }
281 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
282 }
283
284 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
285 memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */
286 for (i = 0; i < n_ev; ++i) {
287 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
288 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
289 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
290 psel = event[i] & PM_PMCSEL_MSK;
291 if (!pmc) {
292 /* Bus event or any-PMC direct event */
293 if (unit)
294 psel |= 0x10 | ((byte & 2) << 2);
295 else
296 psel |= 8;
297 for (pmc = 0; pmc < 8; ++pmc) {
298 if (pmc_inuse & (1 << pmc))
299 continue;
300 grp = (pmc >> 1) & 1;
301 if (unit) {
302 if (grp == (byte & 1))
303 break;
304 } else if (pmc_grp_use[grp] < 4) {
305 ++pmc_grp_use[grp];
306 break;
307 }
308 }
309 pmc_inuse |= 1 << pmc;
310 } else {
311 /* Direct event */
312 --pmc;
313 if (psel == 0 && (byte & 2))
314 /* add events on higher-numbered bus */
315 mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
316 }
317 pmcsel[pmc] = psel;
318 hwc[i] = pmc;
319 }
320 for (pmc = 0; pmc < 2; ++pmc)
321 mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
322 for (; pmc < 8; ++pmc)
323 mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
324 if (pmc_inuse & 1)
325 mmcr0 |= MMCR0_PMC1CE;
326 if (pmc_inuse & 0xfe)
327 mmcr0 |= MMCR0_PMCjCE;
328
329 mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
330
331 /* Return MMCRx values */
332 mmcr[0] = mmcr0;
333 mmcr[1] = mmcr1;
334 mmcr[2] = mmcra;
335 return 0;
336}
337
338static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
339{
340 int shift, i;
341
342 if (pmc <= 1) {
343 shift = MMCR0_PMC1SEL_SH - 7 * pmc;
344 i = 0;
345 } else {
346 shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
347 i = 1;
348 }
349 /*
350 * Setting the PMCxSEL field to 0x08 disables PMC x.
351 */
352 mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
353}
354
355static int ppc970_generic_events[] = {
356 [PERF_COUNT_CPU_CYCLES] = 7,
357 [PERF_COUNT_INSTRUCTIONS] = 1,
358 [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */
359 [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */
360 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */
361 [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */
362};
363
364struct power_pmu ppc970_pmu = {
365 .n_counter = 8,
366 .max_alternatives = 2,
367 .add_fields = 0x001100005555ull,
368 .test_adder = 0x013300000000ull,
369 .compute_mmcr = p970_compute_mmcr,
370 .get_constraint = p970_get_constraint,
371 .get_alternatives = p970_get_alternatives,
372 .disable_pmc = p970_disable_pmc,
373 .n_generic = ARRAY_SIZE(ppc970_generic_events),
374 .generic_events = ppc970_generic_events,
375};
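Both p970_get_constraint() above and p6_get_constraint() in power6-pmu.c encode "this event needs PMC n" as a 2-bit field: the value word gets 1 << (2*(n-1)) and the mask word gets the bit just above it. The handy property is that adding the value words of two events claiming the same PMC carries into the masked bit. The sketch below demonstrates only that encoding property; how the constraints of a whole group are actually combined is up to power_check_constraints(), which is outside the hunks shown here, so treat this as an assumption-laden illustration rather than the real algorithm.

#include <stdio.h>
#include <stdint.h>

/* Per-PMC 2-bit constraint field, as in p6_get_constraint()/p970_get_constraint(). */
static void pmc_constraint(int pmc, uint64_t *maskp, uint64_t *valp)
{
	int sh = (pmc - 1) * 2;

	*maskp = 2ULL << sh;		/* "limit reached" bit for this PMC */
	*valp  = 1ULL << sh;		/* one use of this PMC */
}

int main(void)
{
	uint64_t m1, v1, m2, v2;

	pmc_constraint(1, &m1, &v1);	/* first event wants PMC1 */
	pmc_constraint(1, &m2, &v2);	/* second event also wants PMC1 */

	/* Adding the value words carries into a masked bit: a clash. */
	if ((v1 + v2) & (m1 | m2))
		printf("PMC1 double-booked\n");

	pmc_constraint(2, &m2, &v2);	/* move the second event to PMC2 */
	if (!((v1 + v2) & (m1 | m2)))
		printf("PMC1 + PMC2 fits\n");
	return 0;
}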
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 161b9b9691f0..295ccc5e86b1 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -184,6 +184,7 @@ SECTIONS
184 . = ALIGN(PAGE_SIZE); 184 . = ALIGN(PAGE_SIZE);
185 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { 185 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
186 __per_cpu_start = .; 186 __per_cpu_start = .;
187 *(.data.percpu.page_aligned)
187 *(.data.percpu) 188 *(.data.percpu)
188 *(.data.percpu.shared_aligned) 189 *(.data.percpu.shared_aligned)
189 __per_cpu_end = .; 190 __per_cpu_end = .;
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index e868b5c50723..dc0f3c933518 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
1config PPC64 1config PPC64
2 bool "64-bit kernel" 2 bool "64-bit kernel"
3 default n 3 default n
4 select HAVE_PERF_COUNTERS
4 help 5 help
5 This option selects whether a 32-bit or a 64-bit kernel 6 This option selects whether a 32-bit or a 64-bit kernel
6 will be built. 7 will be built.
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 84e058f1e1cc..80b513449f4c 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -153,9 +153,10 @@ static int get_irq_server(unsigned int virq, unsigned int strict_check)
153{ 153{
154 int server; 154 int server;
155 /* For the moment only implement delivery to all cpus or one cpu */ 155 /* For the moment only implement delivery to all cpus or one cpu */
156 cpumask_t cpumask = irq_desc[virq].affinity; 156 cpumask_t cpumask;
157 cpumask_t tmp = CPU_MASK_NONE; 157 cpumask_t tmp = CPU_MASK_NONE;
158 158
159 cpumask_copy(&cpumask, irq_desc[virq].affinity);
159 if (!distribute_irqs) 160 if (!distribute_irqs)
160 return default_server; 161 return default_server;
161 162
@@ -869,7 +870,7 @@ void xics_migrate_irqs_away(void)
869 virq, cpu); 870 virq, cpu);
870 871
871 /* Reset affinity to all cpus */ 872 /* Reset affinity to all cpus */
872 irq_desc[virq].affinity = CPU_MASK_ALL; 873 cpumask_setall(irq_desc[virq].affinity);
873 desc->chip->set_affinity(virq, cpu_all_mask); 874 desc->chip->set_affinity(virq, cpu_all_mask);
874unlock: 875unlock:
875 spin_unlock_irqrestore(&desc->lock, flags); 876 spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index a35297dbac28..532e205303a2 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -566,9 +566,10 @@ static void __init mpic_scan_ht_pics(struct mpic *mpic)
566#ifdef CONFIG_SMP 566#ifdef CONFIG_SMP
567static int irq_choose_cpu(unsigned int virt_irq) 567static int irq_choose_cpu(unsigned int virt_irq)
568{ 568{
569 cpumask_t mask = irq_desc[virt_irq].affinity; 569 cpumask_t mask;
570 int cpuid; 570 int cpuid;
571 571
572 cpumask_copy(&mask, irq_desc[virt_irq].affinity);
572 if (cpus_equal(mask, CPU_MASK_ALL)) { 573 if (cpus_equal(mask, CPU_MASK_ALL)) {
573 static int irq_rover; 574 static int irq_rover;
574 static DEFINE_SPINLOCK(irq_rover_lock); 575 static DEFINE_SPINLOCK(irq_rover_lock);
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index e289376198eb..3d2c6baae96b 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -252,9 +252,10 @@ struct irq_handler_data {
252#ifdef CONFIG_SMP 252#ifdef CONFIG_SMP
253static int irq_choose_cpu(unsigned int virt_irq) 253static int irq_choose_cpu(unsigned int virt_irq)
254{ 254{
255 cpumask_t mask = irq_desc[virt_irq].affinity; 255 cpumask_t mask;
256 int cpuid; 256 int cpuid;
257 257
258 cpumask_copy(&mask, irq_desc[virt_irq].affinity);
258 if (cpus_equal(mask, CPU_MASK_ALL)) { 259 if (cpus_equal(mask, CPU_MASK_ALL)) {
259 static int irq_rover; 260 static int irq_rover;
260 static DEFINE_SPINLOCK(irq_rover_lock); 261 static DEFINE_SPINLOCK(irq_rover_lock);
@@ -796,7 +797,7 @@ void fixup_irqs(void)
796 !(irq_desc[irq].status & IRQ_PER_CPU)) { 797 !(irq_desc[irq].status & IRQ_PER_CPU)) {
797 if (irq_desc[irq].chip->set_affinity) 798 if (irq_desc[irq].chip->set_affinity)
798 irq_desc[irq].chip->set_affinity(irq, 799 irq_desc[irq].chip->set_affinity(irq,
799 &irq_desc[irq].affinity); 800 irq_desc[irq].affinity);
800 } 801 }
801 spin_unlock_irqrestore(&irq_desc[irq].lock, flags); 802 spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
802 } 803 }
diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
index 2db3c2229b95..db310aa00183 100644
--- a/arch/sparc/kernel/time_64.c
+++ b/arch/sparc/kernel/time_64.c
@@ -729,7 +729,7 @@ void timer_interrupt(int irq, struct pt_regs *regs)
729 729
730 irq_enter(); 730 irq_enter();
731 731
732 kstat_this_cpu.irqs[0]++; 732 kstat_incr_irqs_this_cpu(0, irq_to_desc(0));
733 733
734 if (unlikely(!evt->event_handler)) { 734 if (unlikely(!evt->event_handler)) {
735 printk(KERN_WARNING 735 printk(KERN_WARNING
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9c39095b33fc..97d3bd17b7df 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -391,6 +391,13 @@ config X86_RDC321X
391 as R-8610-(G). 391 as R-8610-(G).
392 If you don't have one of these chips, you should say N here. 392 If you don't have one of these chips, you should say N here.
393 393
394config X86_UV
395 bool "SGI Ultraviolet"
396 depends on X86_64
397 help
398 This option is needed in order to support SGI Ultraviolet systems.
399 If you don't have one of these, you should say N here.
400
394config SCHED_OMIT_FRAME_POINTER 401config SCHED_OMIT_FRAME_POINTER
395 def_bool y 402 def_bool y
396 prompt "Single-depth WCHAN output" 403 prompt "Single-depth WCHAN output"
@@ -685,6 +692,7 @@ config X86_UP_IOAPIC
685config X86_LOCAL_APIC 692config X86_LOCAL_APIC
686 def_bool y 693 def_bool y
687 depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) 694 depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
695 select HAVE_PERF_COUNTERS if (!M386 && !M486)
688 696
689config X86_IO_APIC 697config X86_IO_APIC
690 def_bool y 698 def_bool y
@@ -1340,13 +1348,17 @@ config SECCOMP
1340 1348
1341 If unsure, say Y. Only embedded should say N here. 1349 If unsure, say Y. Only embedded should say N here.
1342 1350
1351config CC_STACKPROTECTOR_ALL
1352 bool
1353
1343config CC_STACKPROTECTOR 1354config CC_STACKPROTECTOR
1344 bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" 1355 bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
1345 depends on X86_64 && EXPERIMENTAL && BROKEN 1356 depends on X86_64
1357 select CC_STACKPROTECTOR_ALL
1346 help 1358 help
1347 This option turns on the -fstack-protector GCC feature. This 1359 This option turns on the -fstack-protector GCC feature. This
1348 feature puts, at the beginning of critical functions, a canary 1360 feature puts, at the beginning of functions, a canary value on
1349 value on the stack just before the return address, and validates 1361 the stack just before the return address, and validates
1350 the value just before actually returning. Stack based buffer 1362 the value just before actually returning. Stack based buffer
1351 overflows (that need to overwrite this return address) now also 1363 overflows (that need to overwrite this return address) now also
1352 overwrite the canary, which gets detected and the attack is then 1364 overwrite the canary, which gets detected and the attack is then
@@ -1354,15 +1366,8 @@ config CC_STACKPROTECTOR
1354 1366
1355 This feature requires gcc version 4.2 or above, or a distribution 1367 This feature requires gcc version 4.2 or above, or a distribution
1356 gcc with the feature backported. Older versions are automatically 1368 gcc with the feature backported. Older versions are automatically
1357 detected and for those versions, this configuration option is ignored. 1369 detected and for those versions, this configuration option is
1358 1370 ignored. (and a warning is printed during bootup)
1359config CC_STACKPROTECTOR_ALL
1360 bool "Use stack-protector for all functions"
1361 depends on CC_STACKPROTECTOR
1362 help
1363 Normally, GCC only inserts the canary value protection for
1364 functions that use large-ish on-stack buffers. By enabling
1365 this option, GCC will be asked to do this for ALL functions.
1366 1371
1367source kernel/Kconfig.hz 1372source kernel/Kconfig.hz
1368 1373
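The CC_STACKPROTECTOR help text above describes the canary mechanism in prose. The fragment below writes the same idea out by hand in ordinary C, purely as an illustration: stack_guard and stack_smash_abort are made-up stand-ins for gcc's __stack_chk_guard and __stack_chk_fail, and the kernel's real canary lives in per-task/per-cpu storage rather than in a global like this.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned long stack_guard = 0xdeadbeefUL;	/* stand-in for the per-task canary */

static void stack_smash_abort(void)
{
	fprintf(stderr, "stack smashing detected\n");
	abort();
}

static void copy_name(const char *src)
{
	unsigned long canary = stack_guard;	/* prologue: stash the canary in the frame */
	char buf[16];

	strncpy(buf, src, sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';
	printf("copied: %s\n", buf);

	if (canary != stack_guard)		/* epilogue: verify before returning */
		stack_smash_abort();
}

int main(void)
{
	copy_name("hello");
	return 0;
}

With the compiler-inserted canary sitting between on-stack buffers and the return address, an overflow that reaches the return address has to trample the canary first, which is what turns the overwrite into a detectable event instead of a silent control-flow hijack.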
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index c98d52e82966..085fef4d8660 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -294,25 +294,23 @@ config X86_CPU
294# Define implied options from the CPU selection here 294# Define implied options from the CPU selection here
295config X86_L1_CACHE_BYTES 295config X86_L1_CACHE_BYTES
296 int 296 int
297 default "128" if GENERIC_CPU || MPSC 297 default "128" if MPSC
298 default "64" if MK8 || MCORE2 298 default "64" if GENERIC_CPU || MK8 || MCORE2 || X86_32
299 depends on X86_64
300 299
301config X86_INTERNODE_CACHE_BYTES 300config X86_INTERNODE_CACHE_BYTES
302 int 301 int
303 default "4096" if X86_VSMP 302 default "4096" if X86_VSMP
304 default X86_L1_CACHE_BYTES if !X86_VSMP 303 default X86_L1_CACHE_BYTES if !X86_VSMP
305 depends on X86_64
306 304
307config X86_CMPXCHG 305config X86_CMPXCHG
308 def_bool X86_64 || (X86_32 && !M386) 306 def_bool X86_64 || (X86_32 && !M386)
309 307
310config X86_L1_CACHE_SHIFT 308config X86_L1_CACHE_SHIFT
311 int 309 int
312 default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC 310 default "7" if MPENTIUM4 || MPSC
313 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 311 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
314 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 312 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
315 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 313 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 || X86_GENERIC || GENERIC_CPU
316 314
317config X86_XADD 315config X86_XADD
318 def_bool y 316 def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 10d6cc3fd052..28f111461ca8 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -117,6 +117,7 @@ config DEBUG_RODATA
117config DEBUG_RODATA_TEST 117config DEBUG_RODATA_TEST
118 bool "Testcase for the DEBUG_RODATA feature" 118 bool "Testcase for the DEBUG_RODATA feature"
119 depends on DEBUG_RODATA 119 depends on DEBUG_RODATA
120 default y
120 help 121 help
121 This option enables a testcase for the DEBUG_RODATA 122 This option enables a testcase for the DEBUG_RODATA
122 feature as well as for the change_page_attr() infrastructure. 123 feature as well as for the change_page_attr() infrastructure.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index d1a47adb5aec..cacee981d166 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -73,7 +73,7 @@ else
73 73
74 stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh 74 stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh
75 stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \ 75 stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \
76 "$(CC)" -fstack-protector ) 76 "$(CC)" "-fstack-protector -DGCC_HAS_SP" )
77 stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \ 77 stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \
78 "$(CC)" -fstack-protector-all ) 78 "$(CC)" -fstack-protector-all )
79 79
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 5a0d76dc56a4..e4baa06bbceb 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -112,8 +112,8 @@ ENTRY(ia32_sysenter_target)
112 CFI_DEF_CFA rsp,0 112 CFI_DEF_CFA rsp,0
113 CFI_REGISTER rsp,rbp 113 CFI_REGISTER rsp,rbp
114 SWAPGS_UNSAFE_STACK 114 SWAPGS_UNSAFE_STACK
115 movq %gs:pda_kernelstack, %rsp 115 movq PER_CPU_VAR(kernel_stack), %rsp
116 addq $(PDA_STACKOFFSET),%rsp 116 addq $(KERNEL_STACK_OFFSET),%rsp
117 /* 117 /*
118 * No need to follow this irqs on/off section: the syscall 118 * No need to follow this irqs on/off section: the syscall
119 * disabled irqs, here we enable it straight after entry: 119 * disabled irqs, here we enable it straight after entry:
@@ -273,13 +273,13 @@ ENDPROC(ia32_sysenter_target)
273ENTRY(ia32_cstar_target) 273ENTRY(ia32_cstar_target)
274 CFI_STARTPROC32 simple 274 CFI_STARTPROC32 simple
275 CFI_SIGNAL_FRAME 275 CFI_SIGNAL_FRAME
276 CFI_DEF_CFA rsp,PDA_STACKOFFSET 276 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
277 CFI_REGISTER rip,rcx 277 CFI_REGISTER rip,rcx
278 /*CFI_REGISTER rflags,r11*/ 278 /*CFI_REGISTER rflags,r11*/
279 SWAPGS_UNSAFE_STACK 279 SWAPGS_UNSAFE_STACK
280 movl %esp,%r8d 280 movl %esp,%r8d
281 CFI_REGISTER rsp,r8 281 CFI_REGISTER rsp,r8
282 movq %gs:pda_kernelstack,%rsp 282 movq PER_CPU_VAR(kernel_stack),%rsp
283 /* 283 /*
284 * No need to follow this irqs on/off section: the syscall 284 * No need to follow this irqs on/off section: the syscall
285 * disabled irqs and here we enable it straight after entry: 285 * disabled irqs and here we enable it straight after entry:
@@ -825,7 +825,8 @@ ia32_sys_call_table:
825 .quad compat_sys_signalfd4 825 .quad compat_sys_signalfd4
826 .quad sys_eventfd2 826 .quad sys_eventfd2
827 .quad sys_epoll_create1 827 .quad sys_epoll_create1
828 .quad sys_dup3 /* 330 */ 828 .quad sys_dup3 /* 330 */
829 .quad sys_pipe2 829 .quad sys_pipe2
830 .quad sys_inotify_init1 830 .quad sys_inotify_init1
831 .quad sys_perf_counter_open
831ia32_syscall_end: 832ia32_syscall_end:
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
new file mode 100644
index 000000000000..82f613c607ce
--- /dev/null
+++ b/arch/x86/include/asm/apicnum.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_APICNUM_H
2#define _ASM_X86_APICNUM_H
3
4/* define MAX_IO_APICS */
5#ifdef CONFIG_X86_32
6# define MAX_IO_APICS 64
7#else
8# define MAX_IO_APICS 128
9# define MAX_LOCAL_APIC 32768
10#endif
11
12#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba4229..977250ed8b89 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,223 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
247#define smp_mb__before_atomic_inc() barrier() 247#define smp_mb__before_atomic_inc() barrier()
248#define smp_mb__after_atomic_inc() barrier() 248#define smp_mb__after_atomic_inc() barrier()
249 249
250/* A 64-bit atomic type */
251
252typedef struct {
253 unsigned long long counter;
254} atomic64_t;
255
256#define ATOMIC64_INIT(val) { (val) }
257
258/**
259 * atomic64_read - read atomic64 variable
260 * @v: pointer of type atomic64_t
261 *
262 * Atomically reads the value of @v.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292
293/**
294 * atomic64_set - set atomic64 variable
295 * @ptr: pointer to type atomic64_t
296 * @new_val: value to assign
297 *
298 * Atomically sets the value of @ptr to @new_val.
299 */
300static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
301{
302 unsigned long long old_val;
303
304 do {
305 old_val = __atomic64_read(ptr); /* racy read is fine: the cmpxchg below retries */
306 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
307}
308
309/**
310 * atomic64_read - read atomic64 variable
311 * @ptr: pointer to type atomic64_t
312 *
313 * Atomically reads the value of @ptr and returns it.
314 */
315static inline unsigned long long atomic64_read(atomic64_t *ptr)
316{
317 unsigned long long curr_val;
318
319 do {
320 curr_val = __atomic64_read(ptr);
321 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
322
323 return curr_val;
324}
325
326/**
327 * atomic64_add_return - add and return
328 * @delta: integer value to add
329 * @ptr: pointer to type atomic64_t
330 *
331 * Atomically adds @delta to @ptr and returns @delta + *@ptr
332 */
333static inline unsigned long long
334atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
335{
336 unsigned long long old_val, new_val;
337
338 do {
339 old_val = __atomic64_read(ptr); /* racy read is fine: the cmpxchg below retries */
340 new_val = old_val + delta;
341
342 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
343
344 return new_val;
345}
346
347static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
348{
349 return atomic64_add_return(-delta, ptr);
350}
351
352static inline long atomic64_inc_return(atomic64_t *ptr)
353{
354 return atomic64_add_return(1, ptr);
355}
356
357static inline long atomic64_dec_return(atomic64_t *ptr)
358{
359 return atomic64_sub_return(1, ptr);
360}
361
362/**
363 * atomic64_add - add integer to atomic64 variable
364 * @delta: integer value to add
365 * @ptr: pointer to type atomic64_t
366 *
367 * Atomically adds @delta to @ptr.
368 */
369static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
370{
371 atomic64_add_return(delta, ptr);
372}
373
374/**
375 * atomic64_sub - subtract integer from atomic64 variable
376 * @delta: integer value to subtract
377 * @ptr: pointer to type atomic64_t
378 *
379 * Atomically subtracts @delta from @ptr.
380 */
381static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
382{
383 atomic64_add(-delta, ptr);
384}
385
386/**
387 * atomic64_sub_and_test - subtract value from variable and test result
388 * @delta: integer value to subtract
389 * @ptr: pointer to type atomic64_t
390 *
391 * Atomically subtracts @delta from @ptr and returns
392 * true if the result is zero, or false for all
393 * other cases.
394 */
395static inline int
396atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
397{
398 unsigned long long old_val = atomic64_sub_return(delta, ptr);
399
400 return old_val == 0;
401}
402
403/**
404 * atomic64_inc - increment atomic64 variable
405 * @ptr: pointer to type atomic64_t
406 *
407 * Atomically increments @ptr by 1.
408 */
409static inline void atomic64_inc(atomic64_t *ptr)
410{
411 atomic64_add(1, ptr);
412}
413
414/**
415 * atomic64_dec - decrement atomic64 variable
416 * @ptr: pointer to type atomic64_t
417 *
418 * Atomically decrements @ptr by 1.
419 */
420static inline void atomic64_dec(atomic64_t *ptr)
421{
422 atomic64_sub(1, ptr);
423}
424
425/**
426 * atomic64_dec_and_test - decrement and test
427 * @ptr: pointer to type atomic64_t
428 *
429 * Atomically decrements @ptr by 1 and
430 * returns true if the result is 0, or false for all other
431 * cases.
432 */
433static inline int atomic64_dec_and_test(atomic64_t *ptr)
434{
435 return atomic64_sub_and_test(1, ptr);
436}
437
438/**
439 * atomic64_inc_and_test - increment and test
440 * @ptr: pointer to type atomic64_t
441 *
442 * Atomically increments @ptr by 1
443 * and returns true if the result is zero, or false for all
444 * other cases.
445 */
446static inline int atomic64_inc_and_test(atomic64_t *ptr)
447{
448 return atomic64_sub_and_test(-1, ptr);
449}
450
451/**
452 * atomic64_add_negative - add and test if negative
453 * @delta: integer value to add
454 * @ptr: pointer to type atomic64_t
455 *
456 * Atomically adds @delta to @ptr and returns true
457 * if the result is negative, or false when
458 * result is greater than or equal to zero.
459 */
460static inline int
461atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
462{
463 long long old_val = atomic64_add_return(delta, ptr);
464
465 return old_val < 0;
466}
467
250#include <asm-generic/atomic.h> 468#include <asm-generic/atomic.h>
251#endif /* _ASM_X86_ATOMIC_32_H */ 469#endif /* _ASM_X86_ATOMIC_32_H */
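Every atomic64_* helper added above reduces to a compare-and-exchange retry loop around cmpxchg8b. Below is the same loop shape in portable C11 atomics, since the kernel-internal helpers obviously cannot be compiled outside the kernel; this only illustrates the pattern and is not a drop-in replacement.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t counter;

/* Same shape as atomic64_add_return(): read, compute, try to publish, retry on races. */
static uint64_t add_return(uint64_t delta)
{
	uint64_t old_val, new_val;

	do {
		old_val = atomic_load(&counter);
		new_val = old_val + delta;
	} while (!atomic_compare_exchange_weak(&counter, &old_val, new_val));

	return new_val;
}

int main(void)
{
	printf("counter = %llu\n", (unsigned long long)add_return(42));
	return 0;
}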
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index bae482df6039..f03b23e32864 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -7,6 +7,20 @@
7#include <linux/nodemask.h> 7#include <linux/nodemask.h>
8#include <linux/percpu.h> 8#include <linux/percpu.h>
9 9
10#ifdef CONFIG_SMP
11
12extern void prefill_possible_map(void);
13
14#else /* CONFIG_SMP */
15
16static inline void prefill_possible_map(void) {}
17
18#define cpu_physical_id(cpu) boot_cpu_physical_apicid
19#define safe_smp_processor_id() 0
20#define stack_smp_processor_id() 0
21
22#endif /* CONFIG_SMP */
23
10struct x86_cpu { 24struct x86_cpu {
11 struct cpu cpu; 25 struct cpu cpu;
12}; 26};
@@ -17,4 +31,11 @@ extern void arch_unregister_cpu(int);
17#endif 31#endif
18 32
19DECLARE_PER_CPU(int, cpu_state); 33DECLARE_PER_CPU(int, cpu_state);
34
35#ifdef CONFIG_X86_HAS_BOOT_CPU_ID
36extern unsigned char boot_cpu_id;
37#else
38#define boot_cpu_id 0
39#endif
40
20#endif /* _ASM_X86_CPU_H */ 41#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
new file mode 100644
index 000000000000..26c6dad90479
--- /dev/null
+++ b/arch/x86/include/asm/cpumask.h
@@ -0,0 +1,28 @@
1#ifndef _ASM_X86_CPUMASK_H
2#define _ASM_X86_CPUMASK_H
3#ifndef __ASSEMBLY__
4#include <linux/cpumask.h>
5
6#ifdef CONFIG_X86_64
7
8extern cpumask_var_t cpu_callin_mask;
9extern cpumask_var_t cpu_callout_mask;
10extern cpumask_var_t cpu_initialized_mask;
11extern cpumask_var_t cpu_sibling_setup_mask;
12
13#else /* CONFIG_X86_32 */
14
15extern cpumask_t cpu_callin_map;
16extern cpumask_t cpu_callout_map;
17extern cpumask_t cpu_initialized;
18extern cpumask_t cpu_sibling_setup_map;
19
20#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map)
21#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map)
22#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized)
23#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
24
25#endif /* CONFIG_X86_32 */
26
27#endif /* __ASSEMBLY__ */
28#endif /* _ASM_X86_CPUMASK_H */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 0930b4f8d672..c68c361697e1 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -1,39 +1,21 @@
1#ifndef _ASM_X86_CURRENT_H 1#ifndef _ASM_X86_CURRENT_H
2#define _ASM_X86_CURRENT_H 2#define _ASM_X86_CURRENT_H
3 3
4#ifdef CONFIG_X86_32
5#include <linux/compiler.h> 4#include <linux/compiler.h>
6#include <asm/percpu.h> 5#include <asm/percpu.h>
7 6
7#ifndef __ASSEMBLY__
8struct task_struct; 8struct task_struct;
9 9
10DECLARE_PER_CPU(struct task_struct *, current_task); 10DECLARE_PER_CPU(struct task_struct *, current_task);
11static __always_inline struct task_struct *get_current(void)
12{
13 return x86_read_percpu(current_task);
14}
15
16#else /* X86_32 */
17
18#ifndef __ASSEMBLY__
19#include <asm/pda.h>
20
21struct task_struct;
22 11
23static __always_inline struct task_struct *get_current(void) 12static __always_inline struct task_struct *get_current(void)
24{ 13{
25 return read_pda(pcurrent); 14 return percpu_read(current_task);
26} 15}
27 16
28#else /* __ASSEMBLY__ */ 17#define current get_current()
29
30#include <asm/asm-offsets.h>
31#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg
32 18
33#endif /* __ASSEMBLY__ */ 19#endif /* __ASSEMBLY__ */
34 20
35#endif /* X86_32 */
36
37#define current get_current()
38
39#endif /* _ASM_X86_CURRENT_H */ 21#endif /* _ASM_X86_CURRENT_H */
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
index 2c05b737ee22..4334502d3664 100644
--- a/arch/x86/include/asm/genapic_32.h
+++ b/arch/x86/include/asm/genapic_32.h
@@ -138,11 +138,4 @@ struct genapic {
138extern struct genapic *genapic; 138extern struct genapic *genapic;
139extern void es7000_update_genapic_to_cluster(void); 139extern void es7000_update_genapic_to_cluster(void);
140 140
141enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
142#define get_uv_system_type() UV_NONE
143#define is_uv_system() 0
144#define uv_wakeup_secondary(a, b) 1
145#define uv_system_init() do {} while (0)
146
147
148#endif /* _ASM_X86_GENAPIC_32_H */ 141#endif /* _ASM_X86_GENAPIC_32_H */
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
index adf32fb56aa6..7bb092c59055 100644
--- a/arch/x86/include/asm/genapic_64.h
+++ b/arch/x86/include/asm/genapic_64.h
@@ -51,15 +51,9 @@ extern struct genapic apic_x2apic_phys;
51extern int acpi_madt_oem_check(char *, char *); 51extern int acpi_madt_oem_check(char *, char *);
52 52
53extern void apic_send_IPI_self(int vector); 53extern void apic_send_IPI_self(int vector);
54enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
55extern enum uv_system_type get_uv_system_type(void);
56extern int is_uv_system(void);
57 54
58extern struct genapic apic_x2apic_uv_x; 55extern struct genapic apic_x2apic_uv_x;
59DECLARE_PER_CPU(int, x2apic_extra_bits); 56DECLARE_PER_CPU(int, x2apic_extra_bits);
60extern void uv_cpu_init(void);
61extern void uv_system_init(void);
62extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
63 57
64extern void setup_apic_routing(void); 58extern void setup_apic_routing(void);
65 59
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 000787df66e6..46ebed797e4f 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -1,11 +1,53 @@
1#ifdef CONFIG_X86_32 1#ifndef _ASM_X86_HARDIRQ_H
2# include "hardirq_32.h" 2#define _ASM_X86_HARDIRQ_H
3#else 3
4# include "hardirq_64.h" 4#include <linux/threads.h>
5#include <linux/irq.h>
6
7typedef struct {
8 unsigned int __softirq_pending;
9 unsigned int __nmi_count; /* arch dependent */
10 unsigned int irq0_irqs;
11#ifdef CONFIG_X86_LOCAL_APIC
12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count;
14#endif
15 unsigned int apic_perf_irqs;
16#ifdef CONFIG_SMP
17 unsigned int irq_resched_count;
18 unsigned int irq_call_count;
19 unsigned int irq_tlb_count;
20#endif
21#ifdef CONFIG_X86_MCE
22 unsigned int irq_thermal_count;
23# ifdef CONFIG_X86_64
24 unsigned int irq_threshold_count;
25# endif
5#endif 26#endif
27} ____cacheline_aligned irq_cpustat_t;
28
29DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
30
31/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
32#define MAX_HARDIRQS_PER_CPU NR_VECTORS
33
34#define __ARCH_IRQ_STAT
35
36#define inc_irq_stat(member) percpu_add(irq_stat.member, 1)
37
38#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
39
40#define __ARCH_SET_SOFTIRQ_PENDING
41
42#define set_softirq_pending(x) percpu_write(irq_stat.__softirq_pending, (x))
43#define or_softirq_pending(x) percpu_or(irq_stat.__softirq_pending, (x))
44
45extern void ack_bad_irq(unsigned int irq);
6 46
7extern u64 arch_irq_stat_cpu(unsigned int cpu); 47extern u64 arch_irq_stat_cpu(unsigned int cpu);
8#define arch_irq_stat_cpu arch_irq_stat_cpu 48#define arch_irq_stat_cpu arch_irq_stat_cpu
9 49
10extern u64 arch_irq_stat(void); 50extern u64 arch_irq_stat(void);
11#define arch_irq_stat arch_irq_stat 51#define arch_irq_stat arch_irq_stat
52
53#endif /* _ASM_X86_HARDIRQ_H */
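The 32- and 64-bit irq_cpustat_t layouts are merged here, and the statistics helpers compile to single segment-relative instructions instead of PDA ops or __get_cpu_var() increments. Illustrative fragment (the function name is invented):

	static void demo_account_timer_tick(void)
	{
		/* percpu_add(irq_stat.irq0_irqs, 1): one add on the local CPU's copy */
		inc_irq_stat(irq0_irqs);
	}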
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
deleted file mode 100644
index cf7954d1405f..000000000000
--- a/arch/x86/include/asm/hardirq_32.h
+++ /dev/null
@@ -1,30 +0,0 @@
1#ifndef _ASM_X86_HARDIRQ_32_H
2#define _ASM_X86_HARDIRQ_32_H
3
4#include <linux/threads.h>
5#include <linux/irq.h>
6
7typedef struct {
8 unsigned int __softirq_pending;
9 unsigned long idle_timestamp;
10 unsigned int __nmi_count; /* arch dependent */
11 unsigned int apic_timer_irqs; /* arch dependent */
12 unsigned int irq0_irqs;
13 unsigned int irq_resched_count;
14 unsigned int irq_call_count;
15 unsigned int irq_tlb_count;
16 unsigned int irq_thermal_count;
17 unsigned int irq_spurious_count;
18} ____cacheline_aligned irq_cpustat_t;
19
20DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
21
22#define __ARCH_IRQ_STAT
23#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member)
24
25#define inc_irq_stat(member) (__get_cpu_var(irq_stat).member++)
26
27void ack_bad_irq(unsigned int irq);
28#include <linux/irq_cpustat.h>
29
30#endif /* _ASM_X86_HARDIRQ_32_H */
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
deleted file mode 100644
index b5a6b5d56704..000000000000
--- a/arch/x86/include/asm/hardirq_64.h
+++ /dev/null
@@ -1,25 +0,0 @@
1#ifndef _ASM_X86_HARDIRQ_64_H
2#define _ASM_X86_HARDIRQ_64_H
3
4#include <linux/threads.h>
5#include <linux/irq.h>
6#include <asm/pda.h>
7#include <asm/apic.h>
8
9/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
10#define MAX_HARDIRQS_PER_CPU NR_VECTORS
11
12#define __ARCH_IRQ_STAT 1
13
14#define inc_irq_stat(member) add_pda(member, 1)
15
16#define local_softirq_pending() read_pda(__softirq_pending)
17
18#define __ARCH_SET_SOFTIRQ_PENDING 1
19
20#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
21#define or_softirq_pending(x) or_pda(__softirq_pending, (x))
22
23extern void ack_bad_irq(unsigned int irq);
24
25#endif /* _ASM_X86_HARDIRQ_64_H */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 8de644b6b959..aa93e53b85ee 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,8 @@
30/* Interrupt handlers registered during init_IRQ */ 30/* Interrupt handlers registered during init_IRQ */
31extern void apic_timer_interrupt(void); 31extern void apic_timer_interrupt(void);
32extern void error_interrupt(void); 32extern void error_interrupt(void);
33extern void perf_counter_interrupt(void);
34
33extern void spurious_interrupt(void); 35extern void spurious_interrupt(void);
34extern void thermal_interrupt(void); 36extern void thermal_interrupt(void);
35extern void reschedule_interrupt(void); 37extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2e..000000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 7a1f44ac1f17..08ec793aa043 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -114,38 +114,16 @@ struct IR_IO_APIC_route_entry {
114extern int nr_ioapics; 114extern int nr_ioapics;
115extern int nr_ioapic_registers[MAX_IO_APICS]; 115extern int nr_ioapic_registers[MAX_IO_APICS];
116 116
117/*
118 * MP-BIOS irq configuration table structures:
119 */
120
121#define MP_MAX_IOAPIC_PIN 127 117#define MP_MAX_IOAPIC_PIN 127
122 118
123struct mp_config_ioapic {
124 unsigned long mp_apicaddr;
125 unsigned int mp_apicid;
126 unsigned char mp_type;
127 unsigned char mp_apicver;
128 unsigned char mp_flags;
129};
130
131struct mp_config_intsrc {
132 unsigned int mp_dstapic;
133 unsigned char mp_type;
134 unsigned char mp_irqtype;
135 unsigned short mp_irqflag;
136 unsigned char mp_srcbus;
137 unsigned char mp_srcbusirq;
138 unsigned char mp_dstirq;
139};
140
141/* I/O APIC entries */ 119/* I/O APIC entries */
142extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; 120extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
143 121
144/* # of MP IRQ source entries */ 122/* # of MP IRQ source entries */
145extern int mp_irq_entries; 123extern int mp_irq_entries;
146 124
147/* MP IRQ source entries */ 125/* MP IRQ source entries */
148extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 126extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
149 127
150/* non-0 if default (table-less) MP configuration */ 128/* non-0 if default (table-less) MP configuration */
151extern int mpc_default_type; 129extern int mpc_default_type;
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
index 89c898ab298b..77843225b7ea 100644
--- a/arch/x86/include/asm/irq_regs.h
+++ b/arch/x86/include/asm/irq_regs.h
@@ -1,5 +1,31 @@
1#ifdef CONFIG_X86_32 1/*
2# include "irq_regs_32.h" 2 * Per-cpu current frame pointer - the location of the last exception frame on
3#else 3 * the stack, stored in the per-cpu area.
4# include "irq_regs_64.h" 4 *
5#endif 5 * Jeremy Fitzhardinge <jeremy@goop.org>
6 */
7#ifndef _ASM_X86_IRQ_REGS_H
8#define _ASM_X86_IRQ_REGS_H
9
10#include <asm/percpu.h>
11
12#define ARCH_HAS_OWN_IRQ_REGS
13
14DECLARE_PER_CPU(struct pt_regs *, irq_regs);
15
16static inline struct pt_regs *get_irq_regs(void)
17{
18 return percpu_read(irq_regs);
19}
20
21static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
22{
23 struct pt_regs *old_regs;
24
25 old_regs = get_irq_regs();
26 percpu_write(irq_regs, new_regs);
27
28 return old_regs;
29}
30
 31#endif /* _ASM_X86_IRQ_REGS_H */
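get_irq_regs()/set_irq_regs() now live in one file and use the fs/gs-relative percpu accessors on both architectures. The usual calling pattern in an interrupt entry is sketched below (the handler name is hypothetical):

	void demo_interrupt(struct pt_regs *regs)
	{
		struct pt_regs *old_regs = set_irq_regs(regs);

		/* handler body: get_irq_regs() now returns the interrupted context */

		set_irq_regs(old_regs);
	}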
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
deleted file mode 100644
index 86afd7473457..000000000000
--- a/arch/x86/include/asm/irq_regs_32.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/*
2 * Per-cpu current frame pointer - the location of the last exception frame on
3 * the stack, stored in the per-cpu area.
4 *
5 * Jeremy Fitzhardinge <jeremy@goop.org>
6 */
7#ifndef _ASM_X86_IRQ_REGS_32_H
8#define _ASM_X86_IRQ_REGS_32_H
9
10#include <asm/percpu.h>
11
12#define ARCH_HAS_OWN_IRQ_REGS
13
14DECLARE_PER_CPU(struct pt_regs *, irq_regs);
15
16static inline struct pt_regs *get_irq_regs(void)
17{
18 return x86_read_percpu(irq_regs);
19}
20
21static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
22{
23 struct pt_regs *old_regs;
24
25 old_regs = get_irq_regs();
26 x86_write_percpu(irq_regs, new_regs);
27
28 return old_regs;
29}
30
31#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_regs_64.h b/arch/x86/include/asm/irq_regs_64.h
deleted file mode 100644
index 3dd9c0b70270..000000000000
--- a/arch/x86/include/asm/irq_regs_64.h
+++ /dev/null
@@ -1 +0,0 @@
1#include <asm-generic/irq_regs.h>
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index f7ff65032b9d..0e2220bb3142 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -49,31 +49,33 @@
49 * some of the following vectors are 'rare', they are merged 49 * some of the following vectors are 'rare', they are merged
50 * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. 50 * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
51 * TLB, reschedule and local APIC vectors are performance-critical. 51 * TLB, reschedule and local APIC vectors are performance-critical.
52 *
53 * Vectors 0xf0-0xfa are free (reserved for future Linux use).
54 */ 52 */
55#ifdef CONFIG_X86_32 53#ifdef CONFIG_X86_32
56 54
57# define SPURIOUS_APIC_VECTOR 0xff 55# define SPURIOUS_APIC_VECTOR 0xff
58# define ERROR_APIC_VECTOR 0xfe 56# define ERROR_APIC_VECTOR 0xfe
59# define INVALIDATE_TLB_VECTOR 0xfd 57# define RESCHEDULE_VECTOR 0xfd
60# define RESCHEDULE_VECTOR 0xfc 58# define CALL_FUNCTION_VECTOR 0xfc
61# define CALL_FUNCTION_VECTOR 0xfb 59# define CALL_FUNCTION_SINGLE_VECTOR 0xfb
62# define CALL_FUNCTION_SINGLE_VECTOR 0xfa 60# define THERMAL_APIC_VECTOR 0xfa
63# define THERMAL_APIC_VECTOR 0xf0 61/* 0xf8 - 0xf9 : free */
62# define INVALIDATE_TLB_VECTOR_END 0xf7
63# define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
64
65# define NUM_INVALIDATE_TLB_VECTORS 8
64 66
65#else 67#else
66 68
67#define SPURIOUS_APIC_VECTOR 0xff 69# define SPURIOUS_APIC_VECTOR 0xff
68#define ERROR_APIC_VECTOR 0xfe 70# define ERROR_APIC_VECTOR 0xfe
69#define RESCHEDULE_VECTOR 0xfd 71# define RESCHEDULE_VECTOR 0xfd
70#define CALL_FUNCTION_VECTOR 0xfc 72# define CALL_FUNCTION_VECTOR 0xfc
71#define CALL_FUNCTION_SINGLE_VECTOR 0xfb 73# define CALL_FUNCTION_SINGLE_VECTOR 0xfb
72#define THERMAL_APIC_VECTOR 0xfa 74# define THERMAL_APIC_VECTOR 0xfa
73#define THRESHOLD_APIC_VECTOR 0xf9 75# define THRESHOLD_APIC_VECTOR 0xf9
74#define UV_BAU_MESSAGE 0xf8 76# define UV_BAU_MESSAGE 0xf8
75#define INVALIDATE_TLB_VECTOR_END 0xf7 77# define INVALIDATE_TLB_VECTOR_END 0xf7
76#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */ 78# define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
77 79
78#define NUM_INVALIDATE_TLB_VECTORS 8 80#define NUM_INVALIDATE_TLB_VECTORS 8
79 81
@@ -87,6 +89,11 @@
87#define LOCAL_TIMER_VECTOR 0xef 89#define LOCAL_TIMER_VECTOR 0xef
88 90
89/* 91/*
92 * Performance monitoring interrupt vector:
93 */
94#define LOCAL_PERF_VECTOR 0xee
95
96/*
90 * First APIC vector available to drivers: (vectors 0x30-0xee) we 97 * First APIC vector available to drivers: (vectors 0x30-0xee) we
91 * start at 0x31(0x41) to spread out vectors evenly between priority 98 * start at 0x31(0x41) to spread out vectors evenly between priority
92 * levels. (0x80 is the syscall vector) 99 * levels. (0x80 is the syscall vector)
@@ -105,6 +112,8 @@
105 112
106#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) 113#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
107 114
115#include <asm/apicnum.h> /* need MAX_IO_APICS */
116
108#ifndef CONFIG_SPARSE_IRQ 117#ifndef CONFIG_SPARSE_IRQ
109# if NR_CPUS < MAX_IO_APICS 118# if NR_CPUS < MAX_IO_APICS
110# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) 119# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
@@ -112,11 +121,12 @@
112# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) 121# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
113# endif 122# endif
114#else 123#else
115# if (8 * NR_CPUS) > (32 * MAX_IO_APICS) 124
116# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS)) 125# define NR_IRQS \
117# else 126 ((8 * NR_CPUS) > (32 * MAX_IO_APICS) ? \
118# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) 127 (NR_VECTORS + (8 * NR_CPUS)) : \
119# endif 128 (NR_VECTORS + (32 * MAX_IO_APICS))) \
129
120#endif 130#endif
121 131
122#elif defined(CONFIG_X86_VOYAGER) 132#elif defined(CONFIG_X86_VOYAGER)
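On 32-bit, vectors 0xf0-0xf7 are repurposed for eight per-sender TLB-flush IPIs, matching the existing 64-bit layout, and the sparse-IRQ NR_IRQS calculation is folded into one ternary expression. A standalone illustration of that formula (the sample constants are assumptions for the demo, not values taken from this patch):

	#include <stdio.h>

	#define NR_VECTORS	256
	#define MAX_IO_APICS	128

	#define NR_IRQS_FOR(nr_cpus)					\
		((8 * (nr_cpus)) > (32 * MAX_IO_APICS) ?		\
			(NR_VECTORS + (8 * (nr_cpus))) :		\
			(NR_VECTORS + (32 * MAX_IO_APICS)))

	int main(void)
	{
		printf("NR_CPUS=8    -> NR_IRQS=%d\n", NR_IRQS_FOR(8));	/* 256 + 4096  */
		printf("NR_CPUS=4096 -> NR_IRQS=%d\n", NR_IRQS_FOR(4096));	/* 256 + 32768 */
		return 0;
	}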
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8e31dd..b87b077cc231 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -11,10 +11,26 @@
11 */ 11 */
12#ifdef CONFIG_X86_SMP 12#ifdef CONFIG_X86_SMP
13BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) 13BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
14BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
15BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) 14BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
16BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) 15BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
17BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 16BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
17
18BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0,
19 smp_invalidate_interrupt)
20BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1,
21 smp_invalidate_interrupt)
22BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2,
23 smp_invalidate_interrupt)
24BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3,
25 smp_invalidate_interrupt)
26BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4,
27 smp_invalidate_interrupt)
28BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5,
29 smp_invalidate_interrupt)
30BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6,
31 smp_invalidate_interrupt)
32BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
33 smp_invalidate_interrupt)
18#endif 34#endif
19 35
20/* 36/*
@@ -25,10 +41,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
25 * a much simpler SMP time architecture: 41 * a much simpler SMP time architecture:
26 */ 42 */
27#ifdef CONFIG_X86_LOCAL_APIC 43#ifdef CONFIG_X86_LOCAL_APIC
44
28BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) 45BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
29BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) 46BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
30BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 47BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
31 48
49#ifdef CONFIG_PERF_COUNTERS
50BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
51#endif
52
32#ifdef CONFIG_X86_MCE_P4THERMAL 53#ifdef CONFIG_X86_MCE_P4THERMAL
33BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) 54BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
34#endif 55#endif
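The single invalidate_interrupt stub is replaced by eight BUILD_INTERRUPT3 stubs, one per vector in the f0-f7 range, all landing in smp_invalidate_interrupt. On the send side the vector is typically picked from the sender's CPU number so concurrent flushes spread across slots; a rough sketch under that assumption (the helper name, and the availability of a send_IPI_mask() primitive, are assumptions rather than part of this patch):

	static void demo_send_tlb_flush(const struct cpumask *mask)
	{
		unsigned int sender;

		/* spread senders over the 8 vectors to reduce contention */
		sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
		send_IPI_mask(mask, INVALIDATE_TLB_VECTOR_START + sender);
	}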
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8aeeb3fd73db..52948df9cd1d 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -21,11 +21,54 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
21int init_new_context(struct task_struct *tsk, struct mm_struct *mm); 21int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
22void destroy_context(struct mm_struct *mm); 22void destroy_context(struct mm_struct *mm);
23 23
24#ifdef CONFIG_X86_32 24
25# include "mmu_context_32.h" 25static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
26#else 26{
27# include "mmu_context_64.h" 27#ifdef CONFIG_SMP
28 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
29 percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
30#endif
31}
32
33static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
34 struct task_struct *tsk)
35{
36 unsigned cpu = smp_processor_id();
37
38 if (likely(prev != next)) {
39 /* stop flush ipis for the previous mm */
40 cpu_clear(cpu, prev->cpu_vm_mask);
41#ifdef CONFIG_SMP
42 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
43 percpu_write(cpu_tlbstate.active_mm, next);
28#endif 44#endif
45 cpu_set(cpu, next->cpu_vm_mask);
46
47 /* Re-load page tables */
48 load_cr3(next->pgd);
49
50 /*
51 * load the LDT, if the LDT is different:
52 */
53 if (unlikely(prev->context.ldt != next->context.ldt))
54 load_LDT_nolock(&next->context);
55 }
56#ifdef CONFIG_SMP
57 else {
58 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
59 BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
60
61 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
62 /* We were in lazy tlb mode and leave_mm disabled
63 * tlb flush IPI delivery. We must reload CR3
64 * to make sure to use no freed page tables.
65 */
66 load_cr3(next->pgd);
67 load_LDT_nolock(&next->context);
68 }
69 }
70#endif
71}
29 72
30#define activate_mm(prev, next) \ 73#define activate_mm(prev, next) \
31do { \ 74do { \
@@ -33,5 +76,17 @@ do { \
33 switch_mm((prev), (next), NULL); \ 76 switch_mm((prev), (next), NULL); \
34} while (0); 77} while (0);
35 78
79#ifdef CONFIG_X86_32
80#define deactivate_mm(tsk, mm) \
81do { \
82 loadsegment(gs, 0); \
83} while (0)
84#else
85#define deactivate_mm(tsk, mm) \
86do { \
87 load_gs_index(0); \
88 loadsegment(fs, 0); \
89} while (0)
90#endif
36 91
37#endif /* _ASM_X86_MMU_CONTEXT_H */ 92#endif /* _ASM_X86_MMU_CONTEXT_H */
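switch_mm()/enter_lazy_tlb() are now shared between 32- and 64-bit, with the TLB state kept in the cpu_tlbstate per-cpu structure instead of the PDA. The lazy-TLB idea: a CPU marked TLBSTATE_LAZY may drop out of an mm's cpu_vm_mask so it stops receiving flush IPIs, and switch_mm() reloads CR3 when that mm becomes active again. A hedged sketch of the opt-out side (mirrors the kernel's leave_mm(); the name here is illustrative):

	static void demo_leave_mm(int cpu)
	{
		struct mm_struct *active_mm = percpu_read(cpu_tlbstate.active_mm);

		BUG_ON(percpu_read(cpu_tlbstate.state) == TLBSTATE_OK);
		cpu_clear(cpu, active_mm->cpu_vm_mask);	/* stop flush IPIs for this mm */
		load_cr3(swapper_pg_dir);		/* run on the kernel page tables */
	}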
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
deleted file mode 100644
index 7e98ce1d2c0e..000000000000
--- a/arch/x86/include/asm/mmu_context_32.h
+++ /dev/null
@@ -1,55 +0,0 @@
1#ifndef _ASM_X86_MMU_CONTEXT_32_H
2#define _ASM_X86_MMU_CONTEXT_32_H
3
4static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
5{
6#ifdef CONFIG_SMP
7 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
8 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
9#endif
10}
11
12static inline void switch_mm(struct mm_struct *prev,
13 struct mm_struct *next,
14 struct task_struct *tsk)
15{
16 int cpu = smp_processor_id();
17
18 if (likely(prev != next)) {
19 /* stop flush ipis for the previous mm */
20 cpu_clear(cpu, prev->cpu_vm_mask);
21#ifdef CONFIG_SMP
22 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
23 x86_write_percpu(cpu_tlbstate.active_mm, next);
24#endif
25 cpu_set(cpu, next->cpu_vm_mask);
26
27 /* Re-load page tables */
28 load_cr3(next->pgd);
29
30 /*
31 * load the LDT, if the LDT is different:
32 */
33 if (unlikely(prev->context.ldt != next->context.ldt))
34 load_LDT_nolock(&next->context);
35 }
36#ifdef CONFIG_SMP
37 else {
38 x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
39 BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
40
41 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
42 /* We were in lazy tlb mode and leave_mm disabled
43 * tlb flush IPI delivery. We must reload %cr3.
44 */
45 load_cr3(next->pgd);
46 load_LDT_nolock(&next->context);
47 }
48 }
49#endif
50}
51
52#define deactivate_mm(tsk, mm) \
53 asm("movl %0,%%gs": :"r" (0));
54
55#endif /* _ASM_X86_MMU_CONTEXT_32_H */
diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h
deleted file mode 100644
index 677d36e9540a..000000000000
--- a/arch/x86/include/asm/mmu_context_64.h
+++ /dev/null
@@ -1,54 +0,0 @@
1#ifndef _ASM_X86_MMU_CONTEXT_64_H
2#define _ASM_X86_MMU_CONTEXT_64_H
3
4#include <asm/pda.h>
5
6static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
7{
8#ifdef CONFIG_SMP
9 if (read_pda(mmu_state) == TLBSTATE_OK)
10 write_pda(mmu_state, TLBSTATE_LAZY);
11#endif
12}
13
14static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
15 struct task_struct *tsk)
16{
17 unsigned cpu = smp_processor_id();
18 if (likely(prev != next)) {
19 /* stop flush ipis for the previous mm */
20 cpu_clear(cpu, prev->cpu_vm_mask);
21#ifdef CONFIG_SMP
22 write_pda(mmu_state, TLBSTATE_OK);
23 write_pda(active_mm, next);
24#endif
25 cpu_set(cpu, next->cpu_vm_mask);
26 load_cr3(next->pgd);
27
28 if (unlikely(next->context.ldt != prev->context.ldt))
29 load_LDT_nolock(&next->context);
30 }
31#ifdef CONFIG_SMP
32 else {
33 write_pda(mmu_state, TLBSTATE_OK);
34 if (read_pda(active_mm) != next)
35 BUG();
36 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
37 /* We were in lazy tlb mode and leave_mm disabled
38 * tlb flush IPI delivery. We must reload CR3
39 * to make sure to use no freed page tables.
40 */
41 load_cr3(next->pgd);
42 load_LDT_nolock(&next->context);
43 }
44 }
45#endif
46}
47
48#define deactivate_mm(tsk, mm) \
49do { \
50 load_gs_index(0); \
51 asm volatile("movl %0,%%fs"::"r"(0)); \
52} while (0)
53
54#endif /* _ASM_X86_MMU_CONTEXT_64_H */
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index 59568bc4767f..4a7f96d7c188 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -24,17 +24,18 @@
24# endif 24# endif
25#endif 25#endif
26 26
27struct intel_mp_floating { 27/* Intel MP Floating Pointer Structure */
28 char mpf_signature[4]; /* "_MP_" */ 28struct mpf_intel {
29 unsigned int mpf_physptr; /* Configuration table address */ 29 char signature[4]; /* "_MP_" */
30 unsigned char mpf_length; /* Our length (paragraphs) */ 30 unsigned int physptr; /* Configuration table address */
31 unsigned char mpf_specification;/* Specification version */ 31 unsigned char length; /* Our length (paragraphs) */
32 unsigned char mpf_checksum; /* Checksum (makes sum 0) */ 32 unsigned char specification; /* Specification version */
33 unsigned char mpf_feature1; /* Standard or configuration ? */ 33 unsigned char checksum; /* Checksum (makes sum 0) */
34 unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */ 34 unsigned char feature1; /* Standard or configuration ? */
35 unsigned char mpf_feature3; /* Unused (0) */ 35 unsigned char feature2; /* Bit7 set for IMCR|PIC */
36 unsigned char mpf_feature4; /* Unused (0) */ 36 unsigned char feature3; /* Unused (0) */
37 unsigned char mpf_feature5; /* Unused (0) */ 37 unsigned char feature4; /* Unused (0) */
38 unsigned char feature5; /* Unused (0) */
38}; 39};
39 40
40#define MPC_SIGNATURE "PCMP" 41#define MPC_SIGNATURE "PCMP"
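This hunk is a pure rename (struct intel_mp_floating becomes mpf_intel and the mpf_ field prefixes are dropped); the layout and the "checksum makes the byte sum zero" rule are unchanged. A standalone, user-space illustration of that checksum rule (offsets follow the structure above; everything else is demo scaffolding):

	#include <stdio.h>

	/* all 16 bytes of the floating pointer structure must sum to 0 mod 256 */
	static int mpf_checksum_ok(const unsigned char *p, int len)
	{
		unsigned char sum = 0;

		while (len--)
			sum += *p++;
		return sum == 0;
	}

	int main(void)
	{
		unsigned char mpf[16] = { '_', 'M', 'P', '_', 0x78, 0x56, 0x34, 0x12, 1, 4 };
		unsigned char sum = 0;
		int i;

		for (i = 0; i < 16; i++)
			if (i != 10)		/* offset 10 holds the checksum byte */
				sum += mpf[i];
		mpf[10] = (unsigned char)(0x100 - sum);

		printf("checksum ok: %d\n", mpf_checksum_ok(mpf, 16));	/* prints 1 */
		return 0;
	}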
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 5ebca29f44f0..e27fdbe5f9e4 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -13,8 +13,8 @@
13#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) 13#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
14#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) 14#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
15 15
16#define IRQSTACK_ORDER 2 16#define IRQ_STACK_ORDER 2
17#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER) 17#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
18 18
19#define STACKFAULT_STACK 1 19#define STACKFAULT_STACK 1
20#define DOUBLEFAULT_STACK 2 20#define DOUBLEFAULT_STACK 2
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index c09a14127584..ccd59f00fd5c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -244,7 +244,8 @@ struct pv_mmu_ops {
244 void (*flush_tlb_user)(void); 244 void (*flush_tlb_user)(void);
245 void (*flush_tlb_kernel)(void); 245 void (*flush_tlb_kernel)(void);
246 void (*flush_tlb_single)(unsigned long addr); 246 void (*flush_tlb_single)(unsigned long addr);
247 void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm, 247 void (*flush_tlb_others)(const struct cpumask *cpus,
248 struct mm_struct *mm,
248 unsigned long va); 249 unsigned long va);
249 250
250 /* Hooks for allocating and freeing a pagetable top-level */ 251 /* Hooks for allocating and freeing a pagetable top-level */
@@ -984,10 +985,11 @@ static inline void __flush_tlb_single(unsigned long addr)
984 PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr); 985 PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
985} 986}
986 987
987static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 988static inline void flush_tlb_others(const struct cpumask *cpumask,
989 struct mm_struct *mm,
988 unsigned long va) 990 unsigned long va)
989{ 991{
990 PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va); 992 PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va);
991} 993}
992 994
993static inline int paravirt_pgd_alloc(struct mm_struct *mm) 995static inline int paravirt_pgd_alloc(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
deleted file mode 100644
index 2fbfff88df37..000000000000
--- a/arch/x86/include/asm/pda.h
+++ /dev/null
@@ -1,137 +0,0 @@
1#ifndef _ASM_X86_PDA_H
2#define _ASM_X86_PDA_H
3
4#ifndef __ASSEMBLY__
5#include <linux/stddef.h>
6#include <linux/types.h>
7#include <linux/cache.h>
8#include <asm/page.h>
9
10/* Per processor datastructure. %gs points to it while the kernel runs */
11struct x8664_pda {
12 struct task_struct *pcurrent; /* 0 Current process */
13 unsigned long data_offset; /* 8 Per cpu data offset from linker
14 address */
15 unsigned long kernelstack; /* 16 top of kernel stack for current */
16 unsigned long oldrsp; /* 24 user rsp for system call */
17 int irqcount; /* 32 Irq nesting counter. Starts -1 */
18 unsigned int cpunumber; /* 36 Logical CPU number */
19#ifdef CONFIG_CC_STACKPROTECTOR
20 unsigned long stack_canary; /* 40 stack canary value */
21 /* gcc-ABI: this canary MUST be at
22 offset 40!!! */
23#endif
24 char *irqstackptr;
25 short nodenumber; /* number of current node (32k max) */
26 short in_bootmem; /* pda lives in bootmem */
27 unsigned int __softirq_pending;
28 unsigned int __nmi_count; /* number of NMI on this CPUs */
29 short mmu_state;
30 short isidle;
31 struct mm_struct *active_mm;
32 unsigned apic_timer_irqs;
33 unsigned irq0_irqs;
34 unsigned irq_resched_count;
35 unsigned irq_call_count;
36 unsigned irq_tlb_count;
37 unsigned irq_thermal_count;
38 unsigned irq_threshold_count;
39 unsigned irq_spurious_count;
40} ____cacheline_aligned_in_smp;
41
42extern struct x8664_pda **_cpu_pda;
43extern void pda_init(int);
44
45#define cpu_pda(i) (_cpu_pda[i])
46
47/*
48 * There is no fast way to get the base address of the PDA, all the accesses
49 * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
50 */
51extern void __bad_pda_field(void) __attribute__((noreturn));
52
53/*
54 * proxy_pda doesn't actually exist, but tell gcc it is accessed for
55 * all PDA accesses so it gets read/write dependencies right.
56 */
57extern struct x8664_pda _proxy_pda;
58
59#define pda_offset(field) offsetof(struct x8664_pda, field)
60
61#define pda_to_op(op, field, val) \
62do { \
63 typedef typeof(_proxy_pda.field) T__; \
64 if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \
65 switch (sizeof(_proxy_pda.field)) { \
66 case 2: \
67 asm(op "w %1,%%gs:%c2" : \
68 "+m" (_proxy_pda.field) : \
69 "ri" ((T__)val), \
70 "i"(pda_offset(field))); \
71 break; \
72 case 4: \
73 asm(op "l %1,%%gs:%c2" : \
74 "+m" (_proxy_pda.field) : \
75 "ri" ((T__)val), \
76 "i" (pda_offset(field))); \
77 break; \
78 case 8: \
79 asm(op "q %1,%%gs:%c2": \
80 "+m" (_proxy_pda.field) : \
81 "ri" ((T__)val), \
82 "i"(pda_offset(field))); \
83 break; \
84 default: \
85 __bad_pda_field(); \
86 } \
87} while (0)
88
89#define pda_from_op(op, field) \
90({ \
91 typeof(_proxy_pda.field) ret__; \
92 switch (sizeof(_proxy_pda.field)) { \
93 case 2: \
94 asm(op "w %%gs:%c1,%0" : \
95 "=r" (ret__) : \
96 "i" (pda_offset(field)), \
97 "m" (_proxy_pda.field)); \
98 break; \
99 case 4: \
100 asm(op "l %%gs:%c1,%0": \
101 "=r" (ret__): \
102 "i" (pda_offset(field)), \
103 "m" (_proxy_pda.field)); \
104 break; \
105 case 8: \
106 asm(op "q %%gs:%c1,%0": \
107 "=r" (ret__) : \
108 "i" (pda_offset(field)), \
109 "m" (_proxy_pda.field)); \
110 break; \
111 default: \
112 __bad_pda_field(); \
113 } \
114 ret__; \
115})
116
117#define read_pda(field) pda_from_op("mov", field)
118#define write_pda(field, val) pda_to_op("mov", field, val)
119#define add_pda(field, val) pda_to_op("add", field, val)
120#define sub_pda(field, val) pda_to_op("sub", field, val)
121#define or_pda(field, val) pda_to_op("or", field, val)
122
123/* This is not atomic against other CPUs -- CPU preemption needs to be off */
124#define test_and_clear_bit_pda(bit, field) \
125({ \
126 int old__; \
127 asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
128 : "=r" (old__), "+m" (_proxy_pda.field) \
129 : "dIr" (bit), "i" (pda_offset(field)) : "memory");\
130 old__; \
131})
132
133#endif
134
135#define PDA_STACKOFFSET (5*8)
136
137#endif /* _ASM_X86_PDA_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index ece72053ba63..0b64af4f13ac 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -2,53 +2,12 @@
2#define _ASM_X86_PERCPU_H 2#define _ASM_X86_PERCPU_H
3 3
4#ifdef CONFIG_X86_64 4#ifdef CONFIG_X86_64
5#include <linux/compiler.h> 5#define __percpu_seg gs
6 6#define __percpu_mov_op movq
7/* Same as asm-generic/percpu.h, except that we store the per cpu offset 7#else
8 in the PDA. Longer term the PDA and every per cpu variable 8#define __percpu_seg fs
9 should be just put into a single section and referenced directly 9#define __percpu_mov_op movl
10 from %gs */
11
12#ifdef CONFIG_SMP
13#include <asm/pda.h>
14
15#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
16#define __my_cpu_offset read_pda(data_offset)
17
18#define per_cpu_offset(x) (__per_cpu_offset(x))
19
20#endif 10#endif
21#include <asm-generic/percpu.h>
22
23DECLARE_PER_CPU(struct x8664_pda, pda);
24
25/*
26 * These are supposed to be implemented as a single instruction which
27 * operates on the per-cpu data base segment. x86-64 doesn't have
28 * that yet, so this is a fairly inefficient workaround for the
29 * meantime. The single instruction is atomic with respect to
30 * preemption and interrupts, so we need to explicitly disable
31 * interrupts here to achieve the same effect. However, because it
32 * can be used from within interrupt-disable/enable, we can't actually
33 * disable interrupts; disabling preemption is enough.
34 */
35#define x86_read_percpu(var) \
36 ({ \
37 typeof(per_cpu_var(var)) __tmp; \
38 preempt_disable(); \
39 __tmp = __get_cpu_var(var); \
40 preempt_enable(); \
41 __tmp; \
42 })
43
44#define x86_write_percpu(var, val) \
45 do { \
46 preempt_disable(); \
47 __get_cpu_var(var) = (val); \
48 preempt_enable(); \
49 } while(0)
50
51#else /* CONFIG_X86_64 */
52 11
53#ifdef __ASSEMBLY__ 12#ifdef __ASSEMBLY__
54 13
@@ -65,47 +24,26 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
65 * PER_CPU(cpu_gdt_descr, %ebx) 24 * PER_CPU(cpu_gdt_descr, %ebx)
66 */ 25 */
67#ifdef CONFIG_SMP 26#ifdef CONFIG_SMP
68#define PER_CPU(var, reg) \ 27#define PER_CPU(var, reg) \
69 movl %fs:per_cpu__##this_cpu_off, reg; \ 28 __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \
70 lea per_cpu__##var(reg), reg 29 lea per_cpu__##var(reg), reg
71#define PER_CPU_VAR(var) %fs:per_cpu__##var 30#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var
72#else /* ! SMP */ 31#else /* ! SMP */
73#define PER_CPU(var, reg) \ 32#define PER_CPU(var, reg) \
74 movl $per_cpu__##var, reg 33 __percpu_mov_op $per_cpu__##var, reg
75#define PER_CPU_VAR(var) per_cpu__##var 34#define PER_CPU_VAR(var) per_cpu__##var
76#endif /* SMP */ 35#endif /* SMP */
77 36
78#else /* ...!ASSEMBLY */ 37#else /* ...!ASSEMBLY */
79 38
80/* 39#include <linux/stringify.h>
81 * PER_CPU finds an address of a per-cpu variable.
82 *
83 * Args:
84 * var - variable name
85 * cpu - 32bit register containing the current CPU number
86 *
87 * The resulting address is stored in the "cpu" argument.
88 *
89 * Example:
90 * PER_CPU(cpu_gdt_descr, %ebx)
91 */
92#ifdef CONFIG_SMP
93
94#define __my_cpu_offset x86_read_percpu(this_cpu_off)
95 40
96/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */ 41#ifdef CONFIG_SMP
97#define __percpu_seg "%%fs:" 42#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
98 43#define __my_cpu_offset percpu_read(this_cpu_off)
99#else /* !SMP */ 44#else
100 45#define __percpu_arg(x) "%" #x
101#define __percpu_seg "" 46#endif
102
103#endif /* SMP */
104
105#include <asm-generic/percpu.h>
106
107/* We can use this directly for local CPU (faster). */
108DECLARE_PER_CPU(unsigned long, this_cpu_off);
109 47
110/* For arch-specific code, we can use direct single-insn ops (they 48/* For arch-specific code, we can use direct single-insn ops (they
111 * don't give an lvalue though). */ 49 * don't give an lvalue though). */
@@ -120,20 +58,25 @@ do { \
120 } \ 58 } \
121 switch (sizeof(var)) { \ 59 switch (sizeof(var)) { \
122 case 1: \ 60 case 1: \
123 asm(op "b %1,"__percpu_seg"%0" \ 61 asm(op "b %1,"__percpu_arg(0) \
124 : "+m" (var) \ 62 : "+m" (var) \
125 : "ri" ((T__)val)); \ 63 : "ri" ((T__)val)); \
126 break; \ 64 break; \
127 case 2: \ 65 case 2: \
128 asm(op "w %1,"__percpu_seg"%0" \ 66 asm(op "w %1,"__percpu_arg(0) \
129 : "+m" (var) \ 67 : "+m" (var) \
130 : "ri" ((T__)val)); \ 68 : "ri" ((T__)val)); \
131 break; \ 69 break; \
132 case 4: \ 70 case 4: \
133 asm(op "l %1,"__percpu_seg"%0" \ 71 asm(op "l %1,"__percpu_arg(0) \
134 : "+m" (var) \ 72 : "+m" (var) \
135 : "ri" ((T__)val)); \ 73 : "ri" ((T__)val)); \
136 break; \ 74 break; \
75 case 8: \
76 asm(op "q %1,"__percpu_arg(0) \
77 : "+m" (var) \
78 : "re" ((T__)val)); \
79 break; \
137 default: __bad_percpu_size(); \ 80 default: __bad_percpu_size(); \
138 } \ 81 } \
139} while (0) 82} while (0)
@@ -143,17 +86,22 @@ do { \
143 typeof(var) ret__; \ 86 typeof(var) ret__; \
144 switch (sizeof(var)) { \ 87 switch (sizeof(var)) { \
145 case 1: \ 88 case 1: \
146 asm(op "b "__percpu_seg"%1,%0" \ 89 asm(op "b "__percpu_arg(1)",%0" \
147 : "=r" (ret__) \ 90 : "=r" (ret__) \
148 : "m" (var)); \ 91 : "m" (var)); \
149 break; \ 92 break; \
150 case 2: \ 93 case 2: \
151 asm(op "w "__percpu_seg"%1,%0" \ 94 asm(op "w "__percpu_arg(1)",%0" \
152 : "=r" (ret__) \ 95 : "=r" (ret__) \
153 : "m" (var)); \ 96 : "m" (var)); \
154 break; \ 97 break; \
155 case 4: \ 98 case 4: \
156 asm(op "l "__percpu_seg"%1,%0" \ 99 asm(op "l "__percpu_arg(1)",%0" \
100 : "=r" (ret__) \
101 : "m" (var)); \
102 break; \
103 case 8: \
104 asm(op "q "__percpu_arg(1)",%0" \
157 : "=r" (ret__) \ 105 : "=r" (ret__) \
158 : "m" (var)); \ 106 : "m" (var)); \
159 break; \ 107 break; \
@@ -162,13 +110,30 @@ do { \
162 ret__; \ 110 ret__; \
163}) 111})
164 112
165#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var) 113#define percpu_read(var) percpu_from_op("mov", per_cpu__##var)
166#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val) 114#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
167#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val) 115#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
168#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val) 116#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
169#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val) 117#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val)
118#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val)
119#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val)
120
121/* This is not atomic against other CPUs -- CPU preemption needs to be off */
122#define x86_test_and_clear_bit_percpu(bit, var) \
123({ \
124 int old__; \
125 asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
126 : "=r" (old__), "+m" (per_cpu__##var) \
127 : "dIr" (bit)); \
128 old__; \
129})
130
131#include <asm-generic/percpu.h>
132
133/* We can use this directly for local CPU (faster). */
134DECLARE_PER_CPU(unsigned long, this_cpu_off);
135
170#endif /* !__ASSEMBLY__ */ 136#endif /* !__ASSEMBLY__ */
171#endif /* !CONFIG_X86_64 */
172 137
173#ifdef CONFIG_SMP 138#ifdef CONFIG_SMP
174 139
@@ -195,9 +160,9 @@ do { \
195#define early_per_cpu_ptr(_name) (_name##_early_ptr) 160#define early_per_cpu_ptr(_name) (_name##_early_ptr)
196#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) 161#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
197#define early_per_cpu(_name, _cpu) \ 162#define early_per_cpu(_name, _cpu) \
198 (early_per_cpu_ptr(_name) ? \ 163 *(early_per_cpu_ptr(_name) ? \
199 early_per_cpu_ptr(_name)[_cpu] : \ 164 &early_per_cpu_ptr(_name)[_cpu] : \
200 per_cpu(_name, _cpu)) 165 &per_cpu(_name, _cpu))
201 166
202#else /* !CONFIG_SMP */ 167#else /* !CONFIG_SMP */
203#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ 168#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
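The key change in this file: one set of percpu_read/write/add/or/... accessors covers both architectures, each compiling to a single %fs- or %gs-relative instruction (the new case 8 handles 64-bit quantities). A minimal sketch of a consumer, with a hypothetical counter variable:

	#include <linux/percpu.h>

	DEFINE_PER_CPU(unsigned long, demo_events);	/* hypothetical variable */

	static inline void demo_count_event(void)
	{
		/* emits e.g. "addq $1,%gs:per_cpu__demo_events" on 64-bit SMP */
		percpu_add(demo_events, 1);
	}

The early_per_cpu() hunk is subtler: by choosing between two addresses and dereferencing the result, the macro now yields an lvalue, so it can be written through as well as read, both before and after the per-cpu areas are set up.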
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 000000000000..2e08ed736647
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,95 @@
1#ifndef _ASM_X86_PERF_COUNTER_H
2#define _ASM_X86_PERF_COUNTER_H
3
4/*
5 * Performance counter hw details:
6 */
7
8#define X86_PMC_MAX_GENERIC 8
9#define X86_PMC_MAX_FIXED 3
10
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64
14
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
16#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
17
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
25
26/*
27 * Includes eventsel and unit mask as well:
28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff
30
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36
37#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
38
39/*
40 * Intel "Architectural Performance Monitoring" CPUID
41 * detection/enumeration details:
42 */
43union cpuid10_eax {
44 struct {
45 unsigned int version_id:8;
46 unsigned int num_counters:8;
47 unsigned int bit_width:8;
48 unsigned int mask_length:8;
49 } split;
50 unsigned int full;
51};
52
53union cpuid10_edx {
54 struct {
55 unsigned int num_counters_fixed:4;
56 unsigned int reserved:28;
57 } split;
58 unsigned int full;
59};
60
61
62/*
63 * Fixed-purpose performance counters:
64 */
65
66/*
67 * All 3 fixed-mode PMCs are configured via this single MSR:
68 */
69#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
70
71/*
72 * The counts are available in three separate MSRs:
73 */
74
75/* Instr_Retired.Any: */
76#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
77#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
78
79/* CPU_CLK_Unhalted.Core: */
80#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
81#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
82
83/* CPU_CLK_Unhalted.Ref: */
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86
87#ifdef CONFIG_PERF_COUNTERS
88extern void init_hw_perf_counters(void);
89extern void perf_counters_lapic_init(int nmi);
90#else
91static inline void init_hw_perf_counters(void) { }
92static inline void perf_counters_lapic_init(int nmi) { }
93#endif
94
95#endif /* _ASM_X86_PERF_COUNTER_H */
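The old intel_arch_perfmon.h constants move here, with the fixed-counter MSRs and the CPUID leaf 0xA enumeration added. A standalone user-space sketch of how that leaf decodes with the unions above (GCC inline asm, x86 only; purely illustrative):

	#include <stdio.h>

	union cpuid10_eax {
		struct {
			unsigned int version_id:8;
			unsigned int num_counters:8;
			unsigned int bit_width:8;
			unsigned int mask_length:8;
		} split;
		unsigned int full;
	};

	union cpuid10_edx {
		struct {
			unsigned int num_counters_fixed:4;
			unsigned int reserved:28;
		} split;
		unsigned int full;
	};

	int main(void)
	{
		union cpuid10_eax eax;
		union cpuid10_edx edx;
		unsigned int ebx, ecx;

		asm volatile("cpuid"
			     : "=a" (eax.full), "=b" (ebx), "=c" (ecx), "=d" (edx.full)
			     : "a" (0xa), "c" (0));

		printf("version %u: %u generic counters of %u bits, %u fixed counters\n",
		       eax.split.version_id, eax.split.num_counters,
		       eax.split.bit_width, edx.split.num_counters_fixed);
		return 0;
	}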
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index ba09289accaa..1df9637dfda3 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -11,7 +11,6 @@
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <linux/bitops.h> 12#include <linux/bitops.h>
13#include <linux/threads.h> 13#include <linux/threads.h>
14#include <asm/pda.h>
15 14
16extern pud_t level3_kernel_pgt[512]; 15extern pud_t level3_kernel_pgt[512];
17extern pud_t level3_ident_pgt[512]; 16extern pud_t level3_ident_pgt[512];
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3bfd5235a9eb..c15766a2969f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -378,6 +378,30 @@ union thread_xstate {
378 378
379#ifdef CONFIG_X86_64 379#ifdef CONFIG_X86_64
380DECLARE_PER_CPU(struct orig_ist, orig_ist); 380DECLARE_PER_CPU(struct orig_ist, orig_ist);
381
382union irq_stack_union {
383 char irq_stack[IRQ_STACK_SIZE];
384 /*
385 * GCC hardcodes the stack canary as %gs:40. Since the
386 * irq_stack is the object at %gs:0, we reserve the bottom
387 * 48 bytes of the irq stack for the canary.
388 */
389 struct {
390 char gs_base[40];
391 unsigned long stack_canary;
392 };
393};
394
395DECLARE_PER_CPU(union irq_stack_union, irq_stack_union);
396DECLARE_PER_CPU(char *, irq_stack_ptr);
397
398static inline void load_gs_base(int cpu)
399{
400 /* Memory clobbers used to order pda/percpu accesses */
401 mb();
402 wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
403 mb();
404}
381#endif 405#endif
382 406
383extern void print_cpu_info(struct cpuinfo_x86 *); 407extern void print_cpu_info(struct cpuinfo_x86 *);
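irq_stack_union exists mainly to satisfy gcc's x86-64 stack-protector ABI, which reads the canary from %gs:40: the first 48 bytes of the per-cpu IRQ stack object are reserved (40 bytes of padding plus the 8-byte canary) so the canary lands exactly at that offset. A standalone layout check (user-space mock-up; IRQ_STACK_SIZE is hard-coded for the demo):

	#include <stddef.h>
	#include <stdio.h>

	#define IRQ_STACK_SIZE (4096 << 2)	/* PAGE_SIZE << IRQ_STACK_ORDER */

	union irq_stack_union {
		char irq_stack[IRQ_STACK_SIZE];
		struct {
			char gs_base[40];
			unsigned long stack_canary;
		};
	};

	int main(void)
	{
		/* must print 40, the offset gcc hard-codes for %gs-based canaries */
		printf("stack_canary offset = %zu\n",
		       offsetof(union irq_stack_union, stack_canary));
		return 0;
	}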
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ebe858cdc8a3..536949749bc2 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -100,7 +100,6 @@ extern unsigned long init_pg_tables_start;
100extern unsigned long init_pg_tables_end; 100extern unsigned long init_pg_tables_end;
101 101
102#else 102#else
103void __init x86_64_init_pda(void);
104void __init x86_64_start_kernel(char *real_mode); 103void __init x86_64_start_kernel(char *real_mode);
105void __init x86_64_start_reservations(char *real_mode_data); 104void __init x86_64_start_reservations(char *real_mode_data);
106 105
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 19953df61c52..45ef8a1b9d7c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -15,34 +15,8 @@
15# include <asm/io_apic.h> 15# include <asm/io_apic.h>
16# endif 16# endif
17#endif 17#endif
18#include <asm/pda.h>
19#include <asm/thread_info.h> 18#include <asm/thread_info.h>
20 19#include <asm/cpumask.h>
21#ifdef CONFIG_X86_64
22
23extern cpumask_var_t cpu_callin_mask;
24extern cpumask_var_t cpu_callout_mask;
25extern cpumask_var_t cpu_initialized_mask;
26extern cpumask_var_t cpu_sibling_setup_mask;
27
28#else /* CONFIG_X86_32 */
29
30extern cpumask_t cpu_callin_map;
31extern cpumask_t cpu_callout_map;
32extern cpumask_t cpu_initialized;
33extern cpumask_t cpu_sibling_setup_map;
34
35#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map)
36#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map)
37#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized)
38#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map)
39
40#endif /* CONFIG_X86_32 */
41
42extern void (*mtrr_hook)(void);
43extern void zap_low_mappings(void);
44
45extern int __cpuinit get_local_pda(int cpu);
46 20
47extern int smp_num_siblings; 21extern int smp_num_siblings;
48extern unsigned int num_processors; 22extern unsigned int num_processors;
@@ -50,9 +24,7 @@ extern unsigned int num_processors;
50DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); 24DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
51DECLARE_PER_CPU(cpumask_t, cpu_core_map); 25DECLARE_PER_CPU(cpumask_t, cpu_core_map);
52DECLARE_PER_CPU(u16, cpu_llc_id); 26DECLARE_PER_CPU(u16, cpu_llc_id);
53#ifdef CONFIG_X86_32
54DECLARE_PER_CPU(int, cpu_number); 27DECLARE_PER_CPU(int, cpu_number);
55#endif
56 28
57static inline struct cpumask *cpu_sibling_mask(int cpu) 29static inline struct cpumask *cpu_sibling_mask(int cpu)
58{ 30{
@@ -167,8 +139,6 @@ void play_dead_common(void);
167void native_send_call_func_ipi(const struct cpumask *mask); 139void native_send_call_func_ipi(const struct cpumask *mask);
168void native_send_call_func_single_ipi(int cpu); 140void native_send_call_func_single_ipi(int cpu);
169 141
170extern void prefill_possible_map(void);
171
172void smp_store_cpu_info(int id); 142void smp_store_cpu_info(int id);
173#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) 143#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
174 144
@@ -177,10 +147,6 @@ static inline int num_booting_cpus(void)
177{ 147{
178 return cpumask_weight(cpu_callout_mask); 148 return cpumask_weight(cpu_callout_mask);
179} 149}
180#else
181static inline void prefill_possible_map(void)
182{
183}
184#endif /* CONFIG_SMP */ 150#endif /* CONFIG_SMP */
185 151
186extern unsigned disabled_cpus __cpuinitdata; 152extern unsigned disabled_cpus __cpuinitdata;
@@ -191,11 +157,11 @@ extern unsigned disabled_cpus __cpuinitdata;
191 * from the initial startup. We map APIC_BASE very early in page_setup(), 157 * from the initial startup. We map APIC_BASE very early in page_setup(),
192 * so this is correct in the x86 case. 158 * so this is correct in the x86 case.
193 */ 159 */
194#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) 160#define raw_smp_processor_id() (percpu_read(cpu_number))
195extern int safe_smp_processor_id(void); 161extern int safe_smp_processor_id(void);
196 162
197#elif defined(CONFIG_X86_64_SMP) 163#elif defined(CONFIG_X86_64_SMP)
198#define raw_smp_processor_id() read_pda(cpunumber) 164#define raw_smp_processor_id() (percpu_read(cpu_number))
199 165
200#define stack_smp_processor_id() \ 166#define stack_smp_processor_id() \
201({ \ 167({ \
@@ -205,10 +171,6 @@ extern int safe_smp_processor_id(void);
205}) 171})
206#define safe_smp_processor_id() smp_processor_id() 172#define safe_smp_processor_id() smp_processor_id()
207 173
208#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
209#define cpu_physical_id(cpu) boot_cpu_physical_apicid
210#define safe_smp_processor_id() 0
211#define stack_smp_processor_id() 0
212#endif 174#endif
213 175
214#ifdef CONFIG_X86_LOCAL_APIC 176#ifdef CONFIG_X86_LOCAL_APIC
@@ -251,11 +213,5 @@ static inline int hard_smp_processor_id(void)
251 213
252#endif /* CONFIG_X86_LOCAL_APIC */ 214#endif /* CONFIG_X86_LOCAL_APIC */
253 215
254#ifdef CONFIG_X86_HAS_BOOT_CPU_ID
255extern unsigned char boot_cpu_id;
256#else
257#define boot_cpu_id 0
258#endif
259
260#endif /* __ASSEMBLY__ */ 216#endif /* __ASSEMBLY__ */
261#endif /* _ASM_X86_SMP_H */ 217#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
new file mode 100644
index 000000000000..36a700acaf2b
--- /dev/null
+++ b/arch/x86/include/asm/stackprotector.h
@@ -0,0 +1,38 @@
1#ifndef _ASM_STACKPROTECTOR_H
2#define _ASM_STACKPROTECTOR_H 1
3
4#include <asm/tsc.h>
5#include <asm/processor.h>
6
7/*
8 * Initialize the stackprotector canary value.
9 *
10 * NOTE: this must only be called from functions that never return,
11 * and it must always be inlined.
12 */
13static __always_inline void boot_init_stack_canary(void)
14{
15 u64 canary;
16 u64 tsc;
17
18 /*
19 * Build time only check to make sure the stack_canary is at
20 * offset 40 in the pda; this is a gcc ABI requirement
21 */
22 BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
23
24 /*
25 * We both use the random pool and the current TSC as a source
26 * of randomness. The TSC only matters for very early init,
27 * there it already has some randomness on most systems. Later
28 * on during the bootup the random pool has true entropy too.
29 */
30 get_random_bytes(&canary, sizeof(canary));
31 tsc = __native_read_tsc();
32 canary += tsc + (tsc << 32UL);
33
34 current->stack_canary = canary;
35 percpu_write(irq_stack_union.stack_canary, canary);
36}
37
38#endif
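boot_init_stack_canary() mixes the random pool with the TSC and stores the result both in the task and in the %gs:40 slot checked above. A standalone illustration of the TSC mixing step (user-space, x86 only; the seed constant stands in for get_random_bytes()):

	#include <stdint.h>
	#include <stdio.h>

	static inline uint64_t rdtsc64(void)
	{
		uint32_t lo, hi;

		asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
		return ((uint64_t)hi << 32) | lo;
	}

	int main(void)
	{
		uint64_t canary = 0x0123456789abcdefULL;	/* stand-in for random bytes */
		uint64_t tsc = rdtsc64();

		/* fold the TSC into both halves, as in the function above */
		canary += tsc + (tsc << 32);
		printf("canary = 0x%016llx\n", (unsigned long long)canary);
		return 0;
	}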
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 8e626ea33a1a..2fcc70bc85f3 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -86,27 +86,44 @@ do { \
86 , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ 86 , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
87 "r12", "r13", "r14", "r15" 87 "r12", "r13", "r14", "r15"
88 88
89#ifdef CONFIG_CC_STACKPROTECTOR
90#define __switch_canary \
91 "movq %P[task_canary](%%rsi),%%r8\n\t" \
92 "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
93#define __switch_canary_oparam \
94 , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary))
95#define __switch_canary_iparam \
96 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
97#else /* CC_STACKPROTECTOR */
98#define __switch_canary
99#define __switch_canary_oparam
100#define __switch_canary_iparam
101#endif /* CC_STACKPROTECTOR */
102
89/* Save restore flags to clear handle leaking NT */ 103/* Save restore flags to clear handle leaking NT */
90#define switch_to(prev, next, last) \ 104#define switch_to(prev, next, last) \
91 asm volatile(SAVE_CONTEXT \ 105 asm volatile(SAVE_CONTEXT \
92 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ 106 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
93 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ 107 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
94 "call __switch_to\n\t" \ 108 "call __switch_to\n\t" \
95 ".globl thread_return\n" \ 109 ".globl thread_return\n" \
96 "thread_return:\n\t" \ 110 "thread_return:\n\t" \
97 "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ 111 "movq "__percpu_arg([current_task])",%%rsi\n\t" \
112 __switch_canary \
98 "movq %P[thread_info](%%rsi),%%r8\n\t" \ 113 "movq %P[thread_info](%%rsi),%%r8\n\t" \
99 LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ 114 LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
100 "movq %%rax,%%rdi\n\t" \ 115 "movq %%rax,%%rdi\n\t" \
101 "jc ret_from_fork\n\t" \ 116 "jc ret_from_fork\n\t" \
102 RESTORE_CONTEXT \ 117 RESTORE_CONTEXT \
103 : "=a" (last) \ 118 : "=a" (last) \
119 __switch_canary_oparam \
104 : [next] "S" (next), [prev] "D" (prev), \ 120 : [next] "S" (next), [prev] "D" (prev), \
105 [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ 121 [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
106 [ti_flags] "i" (offsetof(struct thread_info, flags)), \ 122 [ti_flags] "i" (offsetof(struct thread_info, flags)), \
107 [tif_fork] "i" (TIF_FORK), \ 123 [tif_fork] "i" (TIF_FORK), \
108 [thread_info] "i" (offsetof(struct task_struct, stack)), \ 124 [thread_info] "i" (offsetof(struct task_struct, stack)), \
109 [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ 125 [current_task] "m" (per_cpu_var(current_task)) \
126 __switch_canary_iparam \
110 : "memory", "cc" __EXTRA_CLOBBER) 127 : "memory", "cc" __EXTRA_CLOBBER)
111#endif 128#endif
112 129
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 98789647baa9..f38488989db7 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -82,6 +82,7 @@ struct thread_info {
82#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 82#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
83#define TIF_SECCOMP 8 /* secure computing */ 83#define TIF_SECCOMP 8 /* secure computing */
84#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 84#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
85#define TIF_PERF_COUNTERS 11 /* notify perf counter work */
85#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 86#define TIF_NOTSC 16 /* TSC is not accessible in userland */
86#define TIF_IA32 17 /* 32bit process */ 87#define TIF_IA32 17 /* 32bit process */
87#define TIF_FORK 18 /* ret_from_fork */ 88#define TIF_FORK 18 /* ret_from_fork */
@@ -104,6 +105,7 @@ struct thread_info {
104#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 105#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
105#define _TIF_SECCOMP (1 << TIF_SECCOMP) 106#define _TIF_SECCOMP (1 << TIF_SECCOMP)
106#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) 107#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
108#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS)
107#define _TIF_NOTSC (1 << TIF_NOTSC) 109#define _TIF_NOTSC (1 << TIF_NOTSC)
108#define _TIF_IA32 (1 << TIF_IA32) 110#define _TIF_IA32 (1 << TIF_IA32)
109#define _TIF_FORK (1 << TIF_FORK) 111#define _TIF_FORK (1 << TIF_FORK)
@@ -135,7 +137,7 @@ struct thread_info {
135 137
136/* Only used for 64 bit */ 138/* Only used for 64 bit */
137#define _TIF_DO_NOTIFY_MASK \ 139#define _TIF_DO_NOTIFY_MASK \
138 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) 140 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
139 141
140/* flags to check in __switch_to() */ 142/* flags to check in __switch_to() */
141#define _TIF_WORK_CTXSW \ 143#define _TIF_WORK_CTXSW \
@@ -194,25 +196,21 @@ static inline struct thread_info *current_thread_info(void)
194 196
195#else /* X86_32 */ 197#else /* X86_32 */
196 198
197#include <asm/pda.h> 199#include <asm/percpu.h>
200#define KERNEL_STACK_OFFSET (5*8)
198 201
199/* 202/*
200 * macros/functions for gaining access to the thread information structure 203 * macros/functions for gaining access to the thread information structure
201 * preempt_count needs to be 1 initially, until the scheduler is functional. 204 * preempt_count needs to be 1 initially, until the scheduler is functional.
202 */ 205 */
203#ifndef __ASSEMBLY__ 206#ifndef __ASSEMBLY__
204static inline struct thread_info *current_thread_info(void) 207DECLARE_PER_CPU(unsigned long, kernel_stack);
205{
206 struct thread_info *ti;
207 ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE);
208 return ti;
209}
210 208
211/* do not use in interrupt context */ 209static inline struct thread_info *current_thread_info(void)
212static inline struct thread_info *stack_thread_info(void)
213{ 210{
214 struct thread_info *ti; 211 struct thread_info *ti;
215 asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1))); 212 ti = (void *)(percpu_read(kernel_stack) +
213 KERNEL_STACK_OFFSET - THREAD_SIZE);
216 return ti; 214 return ti;
217} 215}
218 216
@@ -220,8 +218,8 @@ static inline struct thread_info *stack_thread_info(void)
220 218
221/* how to get the thread information struct from ASM */ 219/* how to get the thread information struct from ASM */
222#define GET_THREAD_INFO(reg) \ 220#define GET_THREAD_INFO(reg) \
223 movq %gs:pda_kernelstack,reg ; \ 221 movq PER_CPU_VAR(kernel_stack),reg ; \
224 subq $(THREAD_SIZE-PDA_STACKOFFSET),reg 222 subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg
225 223
226#endif 224#endif
227 225
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0e7bbb549116..d3539f998f88 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -113,7 +113,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
113 __flush_tlb(); 113 __flush_tlb();
114} 114}
115 115
116static inline void native_flush_tlb_others(const cpumask_t *cpumask, 116static inline void native_flush_tlb_others(const struct cpumask *cpumask,
117 struct mm_struct *mm, 117 struct mm_struct *mm,
118 unsigned long va) 118 unsigned long va)
119{ 119{
@@ -142,31 +142,28 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
142 flush_tlb_mm(vma->vm_mm); 142 flush_tlb_mm(vma->vm_mm);
143} 143}
144 144
145void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm, 145void native_flush_tlb_others(const struct cpumask *cpumask,
146 unsigned long va); 146 struct mm_struct *mm, unsigned long va);
147 147
148#define TLBSTATE_OK 1 148#define TLBSTATE_OK 1
149#define TLBSTATE_LAZY 2 149#define TLBSTATE_LAZY 2
150 150
151#ifdef CONFIG_X86_32
152struct tlb_state { 151struct tlb_state {
153 struct mm_struct *active_mm; 152 struct mm_struct *active_mm;
154 int state; 153 int state;
155 char __cacheline_padding[L1_CACHE_BYTES-8];
156}; 154};
157DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); 155DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
158 156
159void reset_lazy_tlbstate(void);
160#else
161static inline void reset_lazy_tlbstate(void) 157static inline void reset_lazy_tlbstate(void)
162{ 158{
159 percpu_write(cpu_tlbstate.state, 0);
160 percpu_write(cpu_tlbstate.active_mm, &init_mm);
163} 161}
164#endif
165 162
166#endif /* SMP */ 163#endif /* SMP */
167 164
168#ifndef CONFIG_PARAVIRT 165#ifndef CONFIG_PARAVIRT
169#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va) 166#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va)
170#endif 167#endif
171 168
172static inline void flush_tlb_kernel_range(unsigned long start, 169static inline void flush_tlb_kernel_range(unsigned long start,
@@ -175,4 +172,6 @@ static inline void flush_tlb_kernel_range(unsigned long start,
175 flush_tlb_all(); 172 flush_tlb_all();
176} 173}
177 174
175extern void zap_low_mappings(void);
176
178#endif /* _ASM_X86_TLBFLUSH_H */ 177#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 4e2f2e0aab27..10022ed3a4b6 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -83,7 +83,8 @@ extern cpumask_t *node_to_cpumask_map;
83DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); 83DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
84 84
85/* Returns the number of the current Node. */ 85/* Returns the number of the current Node. */
86#define numa_node_id() read_pda(nodenumber) 86DECLARE_PER_CPU(int, node_number);
87#define numa_node_id() percpu_read(node_number)
87 88
88#ifdef CONFIG_DEBUG_PER_CPU_MAPS 89#ifdef CONFIG_DEBUG_PER_CPU_MAPS
89extern int cpu_to_node(int cpu); 90extern int cpu_to_node(int cpu);
@@ -102,10 +103,7 @@ static inline int cpu_to_node(int cpu)
102/* Same function but used if called before per_cpu areas are setup */ 103/* Same function but used if called before per_cpu areas are setup */
103static inline int early_cpu_to_node(int cpu) 104static inline int early_cpu_to_node(int cpu)
104{ 105{
105 if (early_per_cpu_ptr(x86_cpu_to_node_map)) 106 return early_per_cpu(x86_cpu_to_node_map, cpu);
106 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
107
108 return per_cpu(x86_cpu_to_node_map, cpu);
109} 107}
110 108
111/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ 109/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
@@ -192,9 +190,20 @@ extern int __node_distance(int, int);
192 190
193#else /* !CONFIG_NUMA */ 191#else /* !CONFIG_NUMA */
194 192
195#define numa_node_id() 0 193static inline int numa_node_id(void)
196#define cpu_to_node(cpu) 0 194{
197#define early_cpu_to_node(cpu) 0 195 return 0;
196}
197
198static inline int cpu_to_node(int cpu)
199{
200 return 0;
201}
202
203static inline int early_cpu_to_node(int cpu)
204{
205 return 0;
206}
198 207
199static inline const cpumask_t *cpumask_of_node(int node) 208static inline const cpumask_t *cpumask_of_node(int node)
200{ 209{
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index 780ba0ab94f9..90f06c25221d 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -13,6 +13,7 @@ extern unsigned char *trampoline_base;
13 13
14extern unsigned long init_rsp; 14extern unsigned long init_rsp;
15extern unsigned long initial_code; 15extern unsigned long initial_code;
16extern unsigned long initial_gs;
16 17
17#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) 18#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
18#define TRAMPOLINE_BASE 0x6000 19#define TRAMPOLINE_BASE 0x6000
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78430a4..7e47658b0a6f 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
338#define __NR_dup3 330 338#define __NR_dup3 330
339#define __NR_pipe2 331 339#define __NR_pipe2 331
340#define __NR_inotify_init1 332 340#define __NR_inotify_init1 332
341#define __NR_perf_counter_open 333
341 342
342#ifdef __KERNEL__ 343#ifdef __KERNEL__
343 344
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2e415e6666f..53025feaf88d 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,7 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
653__SYSCALL(__NR_pipe2, sys_pipe2) 653__SYSCALL(__NR_pipe2, sys_pipe2)
654#define __NR_inotify_init1 294 654#define __NR_inotify_init1 294
655__SYSCALL(__NR_inotify_init1, sys_inotify_init1) 655__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
656 656#define __NR_perf_counter_open 295
657__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
657 658
658#ifndef __NO_STUBS 659#ifndef __NO_STUBS
659#define __ARCH_WANT_OLD_READDIR 660#define __ARCH_WANT_OLD_READDIR
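[ Editor's illustration, not part of the patch: the newly reserved syscall numbers (333 on i386, 295 on x86-64) can be probed from userspace roughly as below. The five-argument order (hw_event pointer, pid, cpu, group_fd, flags) is assumed from the perf-counters documentation added by this series; passing a NULL event descriptor is only meant to distinguish a wired-up entry point (EFAULT/EINVAL) from ENOSYS. ]

#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_perf_counter_open
# ifdef __x86_64__
#  define __NR_perf_counter_open 295
# else
#  define __NR_perf_counter_open 333
# endif
#endif

int main(void)
{
        /* hw_event = NULL, pid = 0 (self), cpu = -1, group_fd = -1, flags = 0 */
        long ret = syscall(__NR_perf_counter_open, NULL, 0, -1, -1, 0UL);

        if (ret < 0 && errno == ENOSYS)
                printf("perf counters not supported by this kernel\n");
        else
                printf("sys_perf_counter_open present: ret=%ld errno=%d\n",
                       ret, errno);
        return 0;
}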
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
new file mode 100644
index 000000000000..8ac1d7e312f3
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv.h
@@ -0,0 +1,33 @@
1#ifndef _ASM_X86_UV_UV_H
2#define _ASM_X86_UV_UV_H
3
4enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
5
6#ifdef CONFIG_X86_UV
7
8extern enum uv_system_type get_uv_system_type(void);
9extern int is_uv_system(void);
10extern void uv_cpu_init(void);
11extern void uv_system_init(void);
12extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
13extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
14 struct mm_struct *mm,
15 unsigned long va,
16 unsigned int cpu);
17
18#else /* X86_UV */
19
20static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
21static inline int is_uv_system(void) { return 0; }
22static inline void uv_cpu_init(void) { }
23static inline void uv_system_init(void) { }
24static inline int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
25{ return 1; }
26static inline const struct cpumask *
27uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
28 unsigned long va, unsigned int cpu)
29{ return cpumask; }
30
31#endif /* X86_UV */
32
33#endif /* _ASM_X86_UV_UV_H */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 50423c7b56b2..9b0e61bf7a88 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -325,7 +325,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
325#define cpubit_isset(cpu, bau_local_cpumask) \ 325#define cpubit_isset(cpu, bau_local_cpumask) \
326 test_bit((cpu), (bau_local_cpumask).bits) 326 test_bit((cpu), (bau_local_cpumask).bits)
327 327
328extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
329extern void uv_bau_message_intr1(void); 328extern void uv_bau_message_intr1(void);
330extern void uv_bau_timeout_intr1(void); 329extern void uv_bau_timeout_intr1(void);
331 330
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d364df03c1d6..a99437c965cc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,6 +23,7 @@ nostackp := $(call cc-option, -fno-stack-protector)
23CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 23CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
24CFLAGS_hpet.o := $(nostackp) 24CFLAGS_hpet.o := $(nostackp)
25CFLAGS_tsc.o := $(nostackp) 25CFLAGS_tsc.o := $(nostackp)
26CFLAGS_paravirt.o := $(nostackp)
26 27
27obj-y := process_$(BITS).o signal.o entry_$(BITS).o 28obj-y := process_$(BITS).o signal.o entry_$(BITS).o
28obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 29obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
@@ -57,7 +58,7 @@ obj-$(CONFIG_PCI) += early-quirks.o
57apm-y := apm_32.o 58apm-y := apm_32.o
58obj-$(CONFIG_APM) += apm.o 59obj-$(CONFIG_APM) += apm.o
59obj-$(CONFIG_X86_SMP) += smp.o 60obj-$(CONFIG_X86_SMP) += smp.o
60obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o 61obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o
61obj-$(CONFIG_X86_32_SMP) += smpcommon.o 62obj-$(CONFIG_X86_32_SMP) += smpcommon.o
62obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o 63obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
63obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o 64obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
@@ -114,10 +115,11 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
114### 115###
115# 64 bit specific files 116# 64 bit specific files
116ifeq ($(CONFIG_X86_64),y) 117ifeq ($(CONFIG_X86_64),y)
117 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o 118 obj-y += genapic_64.o genapic_flat_64.o
118 obj-y += bios_uv.o uv_irq.o uv_sysfs.o
119 obj-y += genx2apic_cluster.o 119 obj-y += genx2apic_cluster.o
120 obj-y += genx2apic_phys.o 120 obj-y += genx2apic_phys.o
121 obj-$(CONFIG_X86_UV) += genx2apic_uv_x.o tlb_uv.o
122 obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_sysfs.o
121 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 123 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
122 obj-$(CONFIG_AUDIT) += audit_64.o 124 obj-$(CONFIG_AUDIT) += audit_64.o
123 125
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7678f10c4568..c193ec3c695e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -912,8 +912,8 @@ static u8 __init uniq_ioapic_id(u8 id)
912 DECLARE_BITMAP(used, 256); 912 DECLARE_BITMAP(used, 256);
913 bitmap_zero(used, 256); 913 bitmap_zero(used, 256);
914 for (i = 0; i < nr_ioapics; i++) { 914 for (i = 0; i < nr_ioapics; i++) {
915 struct mp_config_ioapic *ia = &mp_ioapics[i]; 915 struct mpc_ioapic *ia = &mp_ioapics[i];
916 __set_bit(ia->mp_apicid, used); 916 __set_bit(ia->apicid, used);
917 } 917 }
918 if (!test_bit(id, used)) 918 if (!test_bit(id, used))
919 return id; 919 return id;
@@ -945,29 +945,29 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
945 945
946 idx = nr_ioapics; 946 idx = nr_ioapics;
947 947
948 mp_ioapics[idx].mp_type = MP_IOAPIC; 948 mp_ioapics[idx].type = MP_IOAPIC;
949 mp_ioapics[idx].mp_flags = MPC_APIC_USABLE; 949 mp_ioapics[idx].flags = MPC_APIC_USABLE;
950 mp_ioapics[idx].mp_apicaddr = address; 950 mp_ioapics[idx].apicaddr = address;
951 951
952 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 952 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
953 mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id); 953 mp_ioapics[idx].apicid = uniq_ioapic_id(id);
954#ifdef CONFIG_X86_32 954#ifdef CONFIG_X86_32
955 mp_ioapics[idx].mp_apicver = io_apic_get_version(idx); 955 mp_ioapics[idx].apicver = io_apic_get_version(idx);
956#else 956#else
957 mp_ioapics[idx].mp_apicver = 0; 957 mp_ioapics[idx].apicver = 0;
958#endif 958#endif
959 /* 959 /*
960 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 960 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
961 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 961 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
962 */ 962 */
963 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid; 963 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
964 mp_ioapic_routing[idx].gsi_base = gsi_base; 964 mp_ioapic_routing[idx].gsi_base = gsi_base;
965 mp_ioapic_routing[idx].gsi_end = gsi_base + 965 mp_ioapic_routing[idx].gsi_end = gsi_base +
966 io_apic_get_redir_entries(idx); 966 io_apic_get_redir_entries(idx);
967 967
968 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " 968 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
969 "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid, 969 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
970 mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr, 970 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
971 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); 971 mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
972 972
973 nr_ioapics++; 973 nr_ioapics++;
@@ -996,19 +996,19 @@ int __init acpi_probe_gsi(void)
996 return max_gsi + 1; 996 return max_gsi + 1;
997} 997}
998 998
999static void assign_to_mp_irq(struct mp_config_intsrc *m, 999static void assign_to_mp_irq(struct mpc_intsrc *m,
1000 struct mp_config_intsrc *mp_irq) 1000 struct mpc_intsrc *mp_irq)
1001{ 1001{
1002 memcpy(mp_irq, m, sizeof(struct mp_config_intsrc)); 1002 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
1003} 1003}
1004 1004
1005static int mp_irq_cmp(struct mp_config_intsrc *mp_irq, 1005static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
1006 struct mp_config_intsrc *m) 1006 struct mpc_intsrc *m)
1007{ 1007{
1008 return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc)); 1008 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
1009} 1009}
1010 1010
1011static void save_mp_irq(struct mp_config_intsrc *m) 1011static void save_mp_irq(struct mpc_intsrc *m)
1012{ 1012{
1013 int i; 1013 int i;
1014 1014
@@ -1026,7 +1026,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
1026{ 1026{
1027 int ioapic; 1027 int ioapic;
1028 int pin; 1028 int pin;
1029 struct mp_config_intsrc mp_irq; 1029 struct mpc_intsrc mp_irq;
1030 1030
1031 /* 1031 /*
1032 * Convert 'gsi' to 'ioapic.pin'. 1032 * Convert 'gsi' to 'ioapic.pin'.
@@ -1044,13 +1044,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
1044 if ((bus_irq == 0) && (trigger == 3)) 1044 if ((bus_irq == 0) && (trigger == 3))
1045 trigger = 1; 1045 trigger = 1;
1046 1046
1047 mp_irq.mp_type = MP_INTSRC; 1047 mp_irq.type = MP_INTSRC;
1048 mp_irq.mp_irqtype = mp_INT; 1048 mp_irq.irqtype = mp_INT;
1049 mp_irq.mp_irqflag = (trigger << 2) | polarity; 1049 mp_irq.irqflag = (trigger << 2) | polarity;
1050 mp_irq.mp_srcbus = MP_ISA_BUS; 1050 mp_irq.srcbus = MP_ISA_BUS;
1051 mp_irq.mp_srcbusirq = bus_irq; /* IRQ */ 1051 mp_irq.srcbusirq = bus_irq; /* IRQ */
1052 mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */ 1052 mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
1053 mp_irq.mp_dstirq = pin; /* INTIN# */ 1053 mp_irq.dstirq = pin; /* INTIN# */
1054 1054
1055 save_mp_irq(&mp_irq); 1055 save_mp_irq(&mp_irq);
1056} 1056}
@@ -1060,7 +1060,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1060 int i; 1060 int i;
1061 int ioapic; 1061 int ioapic;
1062 unsigned int dstapic; 1062 unsigned int dstapic;
1063 struct mp_config_intsrc mp_irq; 1063 struct mpc_intsrc mp_irq;
1064 1064
1065#if defined (CONFIG_MCA) || defined (CONFIG_EISA) 1065#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
1066 /* 1066 /*
@@ -1085,7 +1085,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1085 ioapic = mp_find_ioapic(0); 1085 ioapic = mp_find_ioapic(0);
1086 if (ioapic < 0) 1086 if (ioapic < 0)
1087 return; 1087 return;
1088 dstapic = mp_ioapics[ioapic].mp_apicid; 1088 dstapic = mp_ioapics[ioapic].apicid;
1089 1089
1090 /* 1090 /*
1091 * Use the default configuration for the IRQs 0-15. Unless 1091 * Use the default configuration for the IRQs 0-15. Unless
@@ -1095,16 +1095,14 @@ void __init mp_config_acpi_legacy_irqs(void)
1095 int idx; 1095 int idx;
1096 1096
1097 for (idx = 0; idx < mp_irq_entries; idx++) { 1097 for (idx = 0; idx < mp_irq_entries; idx++) {
1098 struct mp_config_intsrc *irq = mp_irqs + idx; 1098 struct mpc_intsrc *irq = mp_irqs + idx;
1099 1099
1100 /* Do we already have a mapping for this ISA IRQ? */ 1100 /* Do we already have a mapping for this ISA IRQ? */
1101 if (irq->mp_srcbus == MP_ISA_BUS 1101 if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i)
1102 && irq->mp_srcbusirq == i)
1103 break; 1102 break;
1104 1103
1105 /* Do we already have a mapping for this IOAPIC pin */ 1104 /* Do we already have a mapping for this IOAPIC pin */
1106 if (irq->mp_dstapic == dstapic && 1105 if (irq->dstapic == dstapic && irq->dstirq == i)
1107 irq->mp_dstirq == i)
1108 break; 1106 break;
1109 } 1107 }
1110 1108
@@ -1113,13 +1111,13 @@ void __init mp_config_acpi_legacy_irqs(void)
1113 continue; /* IRQ already used */ 1111 continue; /* IRQ already used */
1114 } 1112 }
1115 1113
1116 mp_irq.mp_type = MP_INTSRC; 1114 mp_irq.type = MP_INTSRC;
1117 mp_irq.mp_irqflag = 0; /* Conforming */ 1115 mp_irq.irqflag = 0; /* Conforming */
1118 mp_irq.mp_srcbus = MP_ISA_BUS; 1116 mp_irq.srcbus = MP_ISA_BUS;
1119 mp_irq.mp_dstapic = dstapic; 1117 mp_irq.dstapic = dstapic;
1120 mp_irq.mp_irqtype = mp_INT; 1118 mp_irq.irqtype = mp_INT;
1121 mp_irq.mp_srcbusirq = i; /* Identity mapped */ 1119 mp_irq.srcbusirq = i; /* Identity mapped */
1122 mp_irq.mp_dstirq = i; 1120 mp_irq.dstirq = i;
1123 1121
1124 save_mp_irq(&mp_irq); 1122 save_mp_irq(&mp_irq);
1125 } 1123 }
@@ -1230,22 +1228,22 @@ int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
1230 u32 gsi, int triggering, int polarity) 1228 u32 gsi, int triggering, int polarity)
1231{ 1229{
1232#ifdef CONFIG_X86_MPPARSE 1230#ifdef CONFIG_X86_MPPARSE
1233 struct mp_config_intsrc mp_irq; 1231 struct mpc_intsrc mp_irq;
1234 int ioapic; 1232 int ioapic;
1235 1233
1236 if (!acpi_ioapic) 1234 if (!acpi_ioapic)
1237 return 0; 1235 return 0;
1238 1236
1239 /* print the entry should happen on mptable identically */ 1237 /* print the entry should happen on mptable identically */
1240 mp_irq.mp_type = MP_INTSRC; 1238 mp_irq.type = MP_INTSRC;
1241 mp_irq.mp_irqtype = mp_INT; 1239 mp_irq.irqtype = mp_INT;
1242 mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | 1240 mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
1243 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); 1241 (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
1244 mp_irq.mp_srcbus = number; 1242 mp_irq.srcbus = number;
1245 mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); 1243 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1246 ioapic = mp_find_ioapic(gsi); 1244 ioapic = mp_find_ioapic(gsi);
1247 mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id; 1245 mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
1248 mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base; 1246 mp_irq.dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
1249 1247
1250 save_mp_irq(&mp_irq); 1248 save_mp_irq(&mp_irq);
1251#endif 1249#endif
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index a60c1f3bcb87..7c243a2c5115 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,6 +101,7 @@ int acpi_save_state_mem(void)
101 stack_start.sp = temp_stack + sizeof(temp_stack); 101 stack_start.sp = temp_stack + sizeof(temp_stack);
102 early_gdt_descr.address = 102 early_gdt_descr.address =
103 (unsigned long)get_cpu_gdt_table(smp_processor_id()); 103 (unsigned long)get_cpu_gdt_table(smp_processor_id());
104 initial_gs = per_cpu_offset(smp_processor_id());
104#endif 105#endif
105 initial_code = (unsigned long)wakeup_long64; 106 initial_code = (unsigned long)wakeup_long64;
106 saved_magic = 0x123456789abcdef0; 107 saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 115449f869ee..abfa0b641aea 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -35,6 +35,7 @@
35#include <linux/nmi.h> 35#include <linux/nmi.h>
36#include <linux/timex.h> 36#include <linux/timex.h>
37 37
38#include <asm/perf_counter.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <asm/mtrr.h> 40#include <asm/mtrr.h>
40#include <asm/mpspec.h> 41#include <asm/mpspec.h>
@@ -780,6 +781,8 @@ static void local_apic_timer_interrupt(void)
780 inc_irq_stat(apic_timer_irqs); 781 inc_irq_stat(apic_timer_irqs);
781 782
782 evt->event_handler(evt); 783 evt->event_handler(evt);
784
785 perf_counter_unthrottle();
783} 786}
784 787
785/* 788/*
@@ -1130,6 +1133,13 @@ void __cpuinit setup_local_APIC(void)
1130 unsigned int value; 1133 unsigned int value;
1131 int i, j; 1134 int i, j;
1132 1135
1136 if (disable_apic) {
1137#ifdef CONFIG_X86_IO_APIC
1138 disable_ioapic_setup();
1139#endif
1140 return;
1141 }
1142
1133#ifdef CONFIG_X86_32 1143#ifdef CONFIG_X86_32
1134 /* Pound the ESR really hard over the head with a big hammer - mbligh */ 1144 /* Pound the ESR really hard over the head with a big hammer - mbligh */
1135 if (lapic_is_integrated() && esr_disable) { 1145 if (lapic_is_integrated() && esr_disable) {
@@ -1139,6 +1149,7 @@ void __cpuinit setup_local_APIC(void)
1139 apic_write(APIC_ESR, 0); 1149 apic_write(APIC_ESR, 0);
1140 } 1150 }
1141#endif 1151#endif
1152 perf_counters_lapic_init(0);
1142 1153
1143 preempt_disable(); 1154 preempt_disable();
1144 1155
@@ -1570,11 +1581,11 @@ int apic_version[MAX_APICS];
1570 1581
1571int __init APIC_init_uniprocessor(void) 1582int __init APIC_init_uniprocessor(void)
1572{ 1583{
1573#ifdef CONFIG_X86_64
1574 if (disable_apic) { 1584 if (disable_apic) {
1575 pr_info("Apic disabled\n"); 1585 pr_info("Apic disabled\n");
1576 return -1; 1586 return -1;
1577 } 1587 }
1588#ifdef CONFIG_X86_64
1578 if (!cpu_has_apic) { 1589 if (!cpu_has_apic) {
1579 disable_apic = 1; 1590 disable_apic = 1;
1580 pr_info("Apic disabled by BIOS\n"); 1591 pr_info("Apic disabled by BIOS\n");
@@ -1877,17 +1888,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
1877#endif 1888#endif
1878 1889
1879#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) 1890#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1880 /* are we being called early in kernel startup? */ 1891 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1881 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1892 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1882 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1883 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1884
1885 cpu_to_apicid[cpu] = apicid;
1886 bios_cpu_apicid[cpu] = apicid;
1887 } else {
1888 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1889 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1890 }
1891#endif 1893#endif
1892 1894
1893 set_cpu_possible(cpu, true); 1895 set_cpu_possible(cpu, true);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1d41d3f1edbc..8793ab33e2c1 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -11,7 +11,6 @@
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/kbuild.h> 13#include <linux/kbuild.h>
14#include <asm/pda.h>
15#include <asm/processor.h> 14#include <asm/processor.h>
16#include <asm/segment.h> 15#include <asm/segment.h>
17#include <asm/thread_info.h> 16#include <asm/thread_info.h>
@@ -48,16 +47,6 @@ int main(void)
48#endif 47#endif
49 BLANK(); 48 BLANK();
50#undef ENTRY 49#undef ENTRY
51#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
52 ENTRY(kernelstack);
53 ENTRY(oldrsp);
54 ENTRY(pcurrent);
55 ENTRY(irqcount);
56 ENTRY(cpunumber);
57 ENTRY(irqstackptr);
58 ENTRY(data_offset);
59 BLANK();
60#undef ENTRY
61#ifdef CONFIG_PARAVIRT 50#ifdef CONFIG_PARAVIRT
62 BLANK(); 51 BLANK();
63 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); 52 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82db7f45e2de..c3813306e0b4 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -22,11 +22,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
22obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 22obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
23obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 23obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
24 24
25obj-$(CONFIG_X86_MCE) += mcheck/ 25obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
26obj-$(CONFIG_MTRR) += mtrr/
27obj-$(CONFIG_CPU_FREQ) += cpufreq/
28 26
29obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 27obj-$(CONFIG_X86_MCE) += mcheck/
28obj-$(CONFIG_MTRR) += mtrr/
29obj-$(CONFIG_CPU_FREQ) += cpufreq/
30
31obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
30 32
31quiet_cmd_mkcapflags = MKCAP $@ 33quiet_cmd_mkcapflags = MKCAP $@
32 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 34 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 83492b1f93b1..6fd316689c47 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -17,18 +17,21 @@
17#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
18#include <asm/mtrr.h> 18#include <asm/mtrr.h>
19#include <asm/mce.h> 19#include <asm/mce.h>
20#include <asm/perf_counter.h>
20#include <asm/pat.h> 21#include <asm/pat.h>
21#include <asm/asm.h> 22#include <asm/asm.h>
22#include <asm/numa.h> 23#include <asm/numa.h>
23#include <asm/smp.h> 24#include <asm/smp.h>
25#include <asm/cpu.h>
26#include <asm/cpumask.h>
24#ifdef CONFIG_X86_LOCAL_APIC 27#ifdef CONFIG_X86_LOCAL_APIC
25#include <asm/mpspec.h> 28#include <asm/mpspec.h>
26#include <asm/apic.h> 29#include <asm/apic.h>
27#include <mach_apic.h> 30#include <mach_apic.h>
28#include <asm/genapic.h> 31#include <asm/genapic.h>
32#include <asm/uv/uv.h>
29#endif 33#endif
30 34
31#include <asm/pda.h>
32#include <asm/pgtable.h> 35#include <asm/pgtable.h>
33#include <asm/processor.h> 36#include <asm/processor.h>
34#include <asm/desc.h> 37#include <asm/desc.h>
@@ -62,23 +65,23 @@ cpumask_t cpu_sibling_setup_map;
62 65
63static struct cpu_dev *this_cpu __cpuinitdata; 66static struct cpu_dev *this_cpu __cpuinitdata;
64 67
68DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
65#ifdef CONFIG_X86_64 69#ifdef CONFIG_X86_64
66/* We need valid kernel segments for data and code in long mode too 70 /*
67 * IRET will check the segment types kkeil 2000/10/28 71 * We need valid kernel segments for data and code in long mode too
68 * Also sysret mandates a special GDT layout 72 * IRET will check the segment types kkeil 2000/10/28
69 */ 73 * Also sysret mandates a special GDT layout
70/* The TLS descriptors are currently at a different place compared to i386. 74 *
71 Hopefully nobody expects them at a fixed place (Wine?) */ 75 * The TLS descriptors are currently at a different place compared to i386.
72DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 76 * Hopefully nobody expects them at a fixed place (Wine?)
77 */
73 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, 78 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
74 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, 79 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
75 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, 80 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
76 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, 81 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
77 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, 82 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
78 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, 83 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
79} };
80#else 84#else
81DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
82 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 85 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
83 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 86 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
84 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 87 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -110,9 +113,9 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
110 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, 113 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
111 114
112 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 115 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
113 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 116 [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
114} };
115#endif 117#endif
118} };
116EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 119EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
117 120
118#ifdef CONFIG_X86_32 121#ifdef CONFIG_X86_32
@@ -772,6 +775,7 @@ void __init identify_boot_cpu(void)
772#else 775#else
773 vgetcpu_set_mode(); 776 vgetcpu_set_mode();
774#endif 777#endif
778 init_hw_perf_counters();
775} 779}
776 780
777void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 781void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -877,54 +881,26 @@ static __init int setup_disablecpuid(char *arg)
877__setup("clearcpuid=", setup_disablecpuid); 881__setup("clearcpuid=", setup_disablecpuid);
878 882
879#ifdef CONFIG_X86_64 883#ifdef CONFIG_X86_64
880struct x8664_pda **_cpu_pda __read_mostly;
881EXPORT_SYMBOL(_cpu_pda);
882
883struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; 884struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
884 885
885static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; 886DEFINE_PER_CPU_FIRST(union irq_stack_union,
886 887 irq_stack_union) __aligned(PAGE_SIZE);
887void __cpuinit pda_init(int cpu) 888#ifdef CONFIG_SMP
888{ 889DEFINE_PER_CPU(char *, irq_stack_ptr); /* will be set during per cpu init */
889 struct x8664_pda *pda = cpu_pda(cpu); 890#else
891DEFINE_PER_CPU(char *, irq_stack_ptr) =
892 per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
893#endif
890 894
891 /* Setup up data that may be needed in __get_free_pages early */ 895DEFINE_PER_CPU(unsigned long, kernel_stack) =
892 loadsegment(fs, 0); 896 (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
893 loadsegment(gs, 0); 897EXPORT_PER_CPU_SYMBOL(kernel_stack);
894 /* Memory clobbers used to order PDA accessed */
895 mb();
896 wrmsrl(MSR_GS_BASE, pda);
897 mb();
898
899 pda->cpunumber = cpu;
900 pda->irqcount = -1;
901 pda->kernelstack = (unsigned long)stack_thread_info() -
902 PDA_STACKOFFSET + THREAD_SIZE;
903 pda->active_mm = &init_mm;
904 pda->mmu_state = 0;
905
906 if (cpu == 0) {
907 /* others are initialized in smpboot.c */
908 pda->pcurrent = &init_task;
909 pda->irqstackptr = boot_cpu_stack;
910 pda->irqstackptr += IRQSTACKSIZE - 64;
911 } else {
912 if (!pda->irqstackptr) {
913 pda->irqstackptr = (char *)
914 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
915 if (!pda->irqstackptr)
916 panic("cannot allocate irqstack for cpu %d",
917 cpu);
918 pda->irqstackptr += IRQSTACKSIZE - 64;
919 }
920 898
921 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) 899DEFINE_PER_CPU(unsigned int, irq_count) = -1;
922 pda->nodenumber = cpu_to_node(cpu);
923 }
924}
925 900
926static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 901static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
927 DEBUG_STKSZ] __page_aligned_bss; 902 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
903 __aligned(PAGE_SIZE);
928 904
929extern asmlinkage void ignore_sysret(void); 905extern asmlinkage void ignore_sysret(void);
930 906
@@ -982,15 +958,18 @@ void __cpuinit cpu_init(void)
982 struct tss_struct *t = &per_cpu(init_tss, cpu); 958 struct tss_struct *t = &per_cpu(init_tss, cpu);
983 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); 959 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
984 unsigned long v; 960 unsigned long v;
985 char *estacks = NULL;
986 struct task_struct *me; 961 struct task_struct *me;
987 int i; 962 int i;
988 963
989 /* CPU 0 is initialised in head64.c */ 964 loadsegment(fs, 0);
990 if (cpu != 0) 965 loadsegment(gs, 0);
991 pda_init(cpu); 966 load_gs_base(cpu);
992 else 967
993 estacks = boot_exception_stacks; 968#ifdef CONFIG_NUMA
969 if (cpu != 0 && percpu_read(node_number) == 0 &&
970 cpu_to_node(cpu) != NUMA_NO_NODE)
971 percpu_write(node_number, cpu_to_node(cpu));
972#endif
994 973
995 me = current; 974 me = current;
996 975
@@ -1024,18 +1003,13 @@ void __cpuinit cpu_init(void)
1024 * set up and load the per-CPU TSS 1003 * set up and load the per-CPU TSS
1025 */ 1004 */
1026 if (!orig_ist->ist[0]) { 1005 if (!orig_ist->ist[0]) {
1027 static const unsigned int order[N_EXCEPTION_STACKS] = { 1006 static const unsigned int sizes[N_EXCEPTION_STACKS] = {
1028 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, 1007 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
1029 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER 1008 [DEBUG_STACK - 1] = DEBUG_STKSZ
1030 }; 1009 };
1010 char *estacks = per_cpu(exception_stacks, cpu);
1031 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1011 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1032 if (cpu) { 1012 estacks += sizes[v];
1033 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1034 if (!estacks)
1035 panic("Cannot allocate exception "
1036 "stack %ld %d\n", v, cpu);
1037 }
1038 estacks += PAGE_SIZE << order[v];
1039 orig_ist->ist[v] = t->x86_tss.ist[v] = 1013 orig_ist->ist[v] = t->x86_tss.ist[v] =
1040 (unsigned long)estacks; 1014 (unsigned long)estacks;
1041 } 1015 }
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index da299eb85fc0..7293508d8f5c 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -147,7 +147,16 @@ struct _cpuid4_info {
147 union _cpuid4_leaf_ecx ecx; 147 union _cpuid4_leaf_ecx ecx;
148 unsigned long size; 148 unsigned long size;
149 unsigned long can_disable; 149 unsigned long can_disable;
150 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ 150 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
151};
152
153/* subset of above _cpuid4_info w/o shared_cpu_map */
154struct _cpuid4_info_regs {
155 union _cpuid4_leaf_eax eax;
156 union _cpuid4_leaf_ebx ebx;
157 union _cpuid4_leaf_ecx ecx;
158 unsigned long size;
159 unsigned long can_disable;
151}; 160};
152 161
153#ifdef CONFIG_PCI 162#ifdef CONFIG_PCI
@@ -278,7 +287,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
278} 287}
279 288
280static void __cpuinit 289static void __cpuinit
281amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) 290amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
282{ 291{
283 if (index < 3) 292 if (index < 3)
284 return; 293 return;
@@ -286,7 +295,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
286} 295}
287 296
288static int 297static int
289__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) 298__cpuinit cpuid4_cache_lookup_regs(int index,
299 struct _cpuid4_info_regs *this_leaf)
290{ 300{
291 union _cpuid4_leaf_eax eax; 301 union _cpuid4_leaf_eax eax;
292 union _cpuid4_leaf_ebx ebx; 302 union _cpuid4_leaf_ebx ebx;
@@ -314,6 +324,15 @@ __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
314 return 0; 324 return 0;
315} 325}
316 326
327static int
328__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
329{
330 struct _cpuid4_info_regs *leaf_regs =
331 (struct _cpuid4_info_regs *)this_leaf;
332
333 return cpuid4_cache_lookup_regs(index, leaf_regs);
334}
335
317static int __cpuinit find_num_cache_leaves(void) 336static int __cpuinit find_num_cache_leaves(void)
318{ 337{
319 unsigned int eax, ebx, ecx, edx; 338 unsigned int eax, ebx, ecx, edx;
@@ -353,11 +372,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
353 * parameters cpuid leaf to find the cache details 372 * parameters cpuid leaf to find the cache details
354 */ 373 */
355 for (i = 0; i < num_cache_leaves; i++) { 374 for (i = 0; i < num_cache_leaves; i++) {
356 struct _cpuid4_info this_leaf; 375 struct _cpuid4_info_regs this_leaf;
357
358 int retval; 376 int retval;
359 377
360 retval = cpuid4_cache_lookup(i, &this_leaf); 378 retval = cpuid4_cache_lookup_regs(i, &this_leaf);
361 if (retval >= 0) { 379 if (retval >= 0) {
362 switch(this_leaf.eax.split.level) { 380 switch(this_leaf.eax.split.level) {
363 case 1: 381 case 1:
@@ -506,17 +524,20 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
506 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; 524 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
507 525
508 if (num_threads_sharing == 1) 526 if (num_threads_sharing == 1)
509 cpu_set(cpu, this_leaf->shared_cpu_map); 527 cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
510 else { 528 else {
511 index_msb = get_count_order(num_threads_sharing); 529 index_msb = get_count_order(num_threads_sharing);
512 530
513 for_each_online_cpu(i) { 531 for_each_online_cpu(i) {
514 if (cpu_data(i).apicid >> index_msb == 532 if (cpu_data(i).apicid >> index_msb ==
515 c->apicid >> index_msb) { 533 c->apicid >> index_msb) {
516 cpu_set(i, this_leaf->shared_cpu_map); 534 cpumask_set_cpu(i,
535 to_cpumask(this_leaf->shared_cpu_map));
517 if (i != cpu && per_cpu(cpuid4_info, i)) { 536 if (i != cpu && per_cpu(cpuid4_info, i)) {
518 sibling_leaf = CPUID4_INFO_IDX(i, index); 537 sibling_leaf =
519 cpu_set(cpu, sibling_leaf->shared_cpu_map); 538 CPUID4_INFO_IDX(i, index);
539 cpumask_set_cpu(cpu, to_cpumask(
540 sibling_leaf->shared_cpu_map));
520 } 541 }
521 } 542 }
522 } 543 }
@@ -528,9 +549,10 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
528 int sibling; 549 int sibling;
529 550
530 this_leaf = CPUID4_INFO_IDX(cpu, index); 551 this_leaf = CPUID4_INFO_IDX(cpu, index);
531 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { 552 for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
532 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 553 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
533 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 554 cpumask_clear_cpu(cpu,
555 to_cpumask(sibling_leaf->shared_cpu_map));
534 } 556 }
535} 557}
536#else 558#else
@@ -635,8 +657,9 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
635 int n = 0; 657 int n = 0;
636 658
637 if (len > 1) { 659 if (len > 1) {
638 cpumask_t *mask = &this_leaf->shared_cpu_map; 660 const struct cpumask *mask;
639 661
662 mask = to_cpumask(this_leaf->shared_cpu_map);
640 n = type? 663 n = type?
641 cpulist_scnprintf(buf, len-2, mask) : 664 cpulist_scnprintf(buf, len-2, mask) :
642 cpumask_scnprintf(buf, len-2, mask); 665 cpumask_scnprintf(buf, len-2, mask);
@@ -699,7 +722,8 @@ static struct pci_dev *get_k8_northbridge(int node)
699 722
700static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) 723static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
701{ 724{
702 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); 725 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
726 int node = cpu_to_node(cpumask_first(mask));
703 struct pci_dev *dev = NULL; 727 struct pci_dev *dev = NULL;
704 ssize_t ret = 0; 728 ssize_t ret = 0;
705 int i; 729 int i;
@@ -733,7 +757,8 @@ static ssize_t
733store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, 757store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
734 size_t count) 758 size_t count)
735{ 759{
736 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); 760 const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
761 int node = cpu_to_node(cpumask_first(mask));
737 struct pci_dev *dev = NULL; 762 struct pci_dev *dev = NULL;
738 unsigned int ret, index, val; 763 unsigned int ret, index, val;
739 764
@@ -878,7 +903,7 @@ err_out:
878 return -ENOMEM; 903 return -ENOMEM;
879} 904}
880 905
881static cpumask_t cache_dev_map = CPU_MASK_NONE; 906static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
882 907
883/* Add/Remove cache interface for CPU device */ 908/* Add/Remove cache interface for CPU device */
884static int __cpuinit cache_add_dev(struct sys_device * sys_dev) 909static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
@@ -918,7 +943,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
918 } 943 }
919 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 944 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
920 } 945 }
921 cpu_set(cpu, cache_dev_map); 946 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
922 947
923 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 948 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
924 return 0; 949 return 0;
@@ -931,9 +956,9 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
931 956
932 if (per_cpu(cpuid4_info, cpu) == NULL) 957 if (per_cpu(cpuid4_info, cpu) == NULL)
933 return; 958 return;
934 if (!cpu_isset(cpu, cache_dev_map)) 959 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
935 return; 960 return;
936 cpu_clear(cpu, cache_dev_map); 961 cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
937 962
938 for (i = 0; i < num_cache_leaves; i++) 963 for (i = 0; i < num_cache_leaves; i++)
939 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); 964 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 8ae8c4ff094d..4772e91e8246 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -67,7 +67,7 @@ static struct threshold_block threshold_defaults = {
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject *kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
73 73
@@ -481,7 +481,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
481 481
482#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
483 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 483 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
484 i = first_cpu(per_cpu(cpu_core_map, cpu)); 484 i = cpumask_first(&per_cpu(cpu_core_map, cpu));
485 485
486 /* first core not up yet */ 486 /* first core not up yet */
487 if (cpu_data(i).cpu_core_id) 487 if (cpu_data(i).cpu_core_id)
@@ -501,7 +501,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
501 if (err) 501 if (err)
502 goto out; 502 goto out;
503 503
504 b->cpus = per_cpu(cpu_core_map, cpu); 504 cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
505 per_cpu(threshold_banks, cpu)[bank] = b; 505 per_cpu(threshold_banks, cpu)[bank] = b;
506 goto out; 506 goto out;
507 } 507 }
@@ -512,15 +512,20 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
512 err = -ENOMEM; 512 err = -ENOMEM;
513 goto out; 513 goto out;
514 } 514 }
515 if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
516 kfree(b);
517 err = -ENOMEM;
518 goto out;
519 }
515 520
516 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); 521 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
517 if (!b->kobj) 522 if (!b->kobj)
518 goto out_free; 523 goto out_free;
519 524
520#ifndef CONFIG_SMP 525#ifndef CONFIG_SMP
521 b->cpus = CPU_MASK_ALL; 526 cpumask_setall(b->cpus);
522#else 527#else
523 b->cpus = per_cpu(cpu_core_map, cpu); 528 cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
524#endif 529#endif
525 530
526 per_cpu(threshold_banks, cpu)[bank] = b; 531 per_cpu(threshold_banks, cpu)[bank] = b;
@@ -529,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
529 if (err) 534 if (err)
530 goto out_free; 535 goto out_free;
531 536
532 for_each_cpu_mask_nr(i, b->cpus) { 537 for_each_cpu(i, b->cpus) {
533 if (i == cpu) 538 if (i == cpu)
534 continue; 539 continue;
535 540
@@ -545,6 +550,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
545 550
546out_free: 551out_free:
547 per_cpu(threshold_banks, cpu)[bank] = NULL; 552 per_cpu(threshold_banks, cpu)[bank] = NULL;
553 free_cpumask_var(b->cpus);
548 kfree(b); 554 kfree(b);
549out: 555out:
550 return err; 556 return err;
@@ -619,7 +625,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
619#endif 625#endif
620 626
621 /* remove all sibling symlinks before unregistering */ 627 /* remove all sibling symlinks before unregistering */
622 for_each_cpu_mask_nr(i, b->cpus) { 628 for_each_cpu(i, b->cpus) {
623 if (i == cpu) 629 if (i == cpu)
624 continue; 630 continue;
625 631
@@ -632,6 +638,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
632free_out: 638free_out:
633 kobject_del(b->kobj); 639 kobject_del(b->kobj);
634 kobject_put(b->kobj); 640 kobject_put(b->kobj);
641 free_cpumask_var(b->cpus);
635 kfree(b); 642 kfree(b);
636 per_cpu(threshold_banks, cpu)[bank] = NULL; 643 per_cpu(threshold_banks, cpu)[bank] = NULL;
637} 644}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 4b48f251fd39..5e8c79e748a6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -7,6 +7,7 @@
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/percpu.h> 8#include <linux/percpu.h>
9#include <asm/processor.h> 9#include <asm/processor.h>
10#include <asm/apic.h>
10#include <asm/msr.h> 11#include <asm/msr.h>
11#include <asm/mce.h> 12#include <asm/mce.h>
12#include <asm/hw_irq.h> 13#include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..383d4c6423a1
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,733 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 * For licensing details see kernel-base/COPYING
8 */
9
10#include <linux/perf_counter.h>
11#include <linux/capability.h>
12#include <linux/notifier.h>
13#include <linux/hardirq.h>
14#include <linux/kprobes.h>
15#include <linux/module.h>
16#include <linux/kdebug.h>
17#include <linux/sched.h>
18
19#include <asm/perf_counter.h>
20#include <asm/apic.h>
21
22static bool perf_counters_initialized __read_mostly;
23
24/*
25 * Number of (generic) HW counters:
26 */
27static int nr_counters_generic __read_mostly;
28static u64 perf_counter_mask __read_mostly;
29static u64 counter_value_mask __read_mostly;
30
31static int nr_counters_fixed __read_mostly;
32
33struct cpu_hw_counters {
34 struct perf_counter *counters[X86_PMC_IDX_MAX];
35 unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
36 unsigned long interrupts;
37 u64 global_enable;
38};
39
40/*
41 * Intel PerfMon v3. Used on Core2 and later.
42 */
43static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
44
45static const int intel_perfmon_event_map[] =
46{
47 [PERF_COUNT_CPU_CYCLES] = 0x003c,
48 [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
49 [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e,
50 [PERF_COUNT_CACHE_MISSES] = 0x412e,
51 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
52 [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
53 [PERF_COUNT_BUS_CYCLES] = 0x013c,
54};
55
56static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
57
58/*
59 * Propagate counter elapsed time into the generic counter.
60 * Can only be executed on the CPU where the counter is active.
61 * Returns the delta events processed.
62 */
63static void
64x86_perf_counter_update(struct perf_counter *counter,
65 struct hw_perf_counter *hwc, int idx)
66{
67 u64 prev_raw_count, new_raw_count, delta;
68
69 /*
70 * Careful: an NMI might modify the previous counter value.
71 *
72 * Our tactic to handle this is to first atomically read and
73 * exchange a new raw count - then add that new-prev delta
74 * count to the generic counter atomically:
75 */
76again:
77 prev_raw_count = atomic64_read(&hwc->prev_count);
78 rdmsrl(hwc->counter_base + idx, new_raw_count);
79
80 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
81 new_raw_count) != prev_raw_count)
82 goto again;
83
84 /*
85 * Now we have the new raw value and have updated the prev
86 * timestamp already. We can now calculate the elapsed delta
87 * (counter-)time and add that to the generic counter.
88 *
89 * Careful, not all hw sign-extends above the physical width
90 * of the count, so we do that by clipping the delta to 32 bits:
91 */
92 delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
93
94 atomic64_add(delta, &counter->count);
95 atomic64_sub(delta, &hwc->period_left);
96}
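/*
 * [ Editor's sketch, not part of the patch: the read/cmpxchg/retry idiom in
 *   x86_perf_counter_update() above, modelled in userspace with GCC atomic
 *   builtins. A raw counter value that may also be sampled from NMI context
 *   is folded into a 64-bit software total by exchanging prev_count first
 *   and only then adding the delta, clipped to 32 bits as described above. ]
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t prev_count;     /* last raw value already accounted for */
static uint64_t total;          /* accumulated event count */

static void counter_update(uint64_t new_raw)
{
        uint64_t prev = __atomic_load_n(&prev_count, __ATOMIC_RELAXED);

        /* retry until we are the one who swaps prev_count to new_raw */
        while (!__atomic_compare_exchange_n(&prev_count, &prev, new_raw, 0,
                                            __ATOMIC_RELAXED, __ATOMIC_RELAXED))
                ;

        /* clip to 32 bits: hardware need not sign-extend above its width */
        uint64_t delta = (uint64_t)(uint32_t)((int32_t)new_raw - (int32_t)prev);

        __atomic_add_fetch(&total, delta, __ATOMIC_RELAXED);
}

int main(void)
{
        counter_update(100);    /* raw counter read: 100 events so far */
        counter_update(250);    /* later read: 150 more */
        printf("total = %llu\n", (unsigned long long)total);
        return 0;
}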
97
98/*
99 * Set up the hardware configuration for a given hw_event_type
100 */
101static int __hw_perf_counter_init(struct perf_counter *counter)
102{
103 struct perf_counter_hw_event *hw_event = &counter->hw_event;
104 struct hw_perf_counter *hwc = &counter->hw;
105
106 if (unlikely(!perf_counters_initialized))
107 return -EINVAL;
108
109 /*
110 * Generate PMC IRQs:
111 * (keep 'enabled' bit clear for now)
112 */
113 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
114
115 /*
116 * Count user and OS events unless requested not to.
117 */
118 if (!hw_event->exclude_user)
119 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
120 if (!hw_event->exclude_kernel)
121 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
122
123 /*
124 * If privileged enough, allow NMI events:
125 */
126 hwc->nmi = 0;
127 if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
128 hwc->nmi = 1;
129
130 hwc->irq_period = hw_event->irq_period;
131 /*
132 * Intel PMCs cannot be accessed sanely above 32 bit width,
133 * so we install an artificial 1<<31 period regardless of
134 * the generic counter period:
135 */
136 if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
137 hwc->irq_period = 0x7FFFFFFF;
138
139 atomic64_set(&hwc->period_left, hwc->irq_period);
140
141 /*
142 * Raw event types provide the config in the event structure
143 */
144 if (hw_event->raw) {
145 hwc->config |= hw_event->type;
146 } else {
147 if (hw_event->type >= max_intel_perfmon_events)
148 return -EINVAL;
149 /*
150 * The generic map:
151 */
152 hwc->config |= intel_perfmon_event_map[hw_event->type];
153 }
154 counter->wakeup_pending = 0;
155
156 return 0;
157}
158
159u64 hw_perf_save_disable(void)
160{
161 u64 ctrl;
162
163 if (unlikely(!perf_counters_initialized))
164 return 0;
165
166 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
167 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
168
169 return ctrl;
170}
171EXPORT_SYMBOL_GPL(hw_perf_save_disable);
172
173void hw_perf_restore(u64 ctrl)
174{
175 if (unlikely(!perf_counters_initialized))
176 return;
177
178 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
179}
180EXPORT_SYMBOL_GPL(hw_perf_restore);
181
182static inline void
183__pmc_fixed_disable(struct perf_counter *counter,
184 struct hw_perf_counter *hwc, unsigned int __idx)
185{
186 int idx = __idx - X86_PMC_IDX_FIXED;
187 u64 ctrl_val, mask;
188 int err;
189
190 mask = 0xfULL << (idx * 4);
191
192 rdmsrl(hwc->config_base, ctrl_val);
193 ctrl_val &= ~mask;
194 err = checking_wrmsrl(hwc->config_base, ctrl_val);
195}
196
197static inline void
198__pmc_generic_disable(struct perf_counter *counter,
199 struct hw_perf_counter *hwc, unsigned int idx)
200{
201 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
202 __pmc_fixed_disable(counter, hwc, idx);
203 else
204 wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
205}
206
207static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
208
209/*
210 * Set the next IRQ period, based on the hwc->period_left value.
211 * To be called with the counter disabled in hw:
212 */
213static void
214__hw_perf_counter_set_period(struct perf_counter *counter,
215 struct hw_perf_counter *hwc, int idx)
216{
217 s64 left = atomic64_read(&hwc->period_left);
218 s32 period = hwc->irq_period;
219 int err;
220
221 /*
222 * If we are way outside a reasonable range then just skip forward:
223 */
224 if (unlikely(left <= -period)) {
225 left = period;
226 atomic64_set(&hwc->period_left, left);
227 }
228
229 if (unlikely(left <= 0)) {
230 left += period;
231 atomic64_set(&hwc->period_left, left);
232 }
233
234 per_cpu(prev_left[idx], smp_processor_id()) = left;
235
236 /*
237 * The hw counter starts counting from this counter offset,
238 * mark it to be able to extract future deltas:
239 */
240 atomic64_set(&hwc->prev_count, (u64)-left);
241
242 err = checking_wrmsrl(hwc->counter_base + idx,
243 (u64)(-left) & counter_value_mask);
244}
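/*
 * [ Editor's sketch, not part of the patch: the period arithmetic in
 *   __hw_perf_counter_set_period() above, worked through numerically.
 *   With irq_period = 10000 and period_left = 10000, the counter is
 *   programmed to -10000 (masked to the assumed counter width), so it
 *   overflows and raises the interrupt after exactly 10000 increments. ]
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const int64_t period = 10000;                   /* hwc->irq_period */
        const uint64_t value_mask = (1ULL << 48) - 1;   /* assumed 48-bit PMC */
        int64_t left = 10000;                           /* hwc->period_left */

        if (left <= -period)            /* way behind: just skip forward */
                left = period;
        if (left <= 0)                  /* overshoot from the last interval */
                left += period;

        uint64_t programmed = (uint64_t)(-left) & value_mask;

        printf("program counter to %#llx, overflow after %lld events\n",
               (unsigned long long)programmed, (long long)left);
        return 0;
}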
245
246static inline void
247__pmc_fixed_enable(struct perf_counter *counter,
248 struct hw_perf_counter *hwc, unsigned int __idx)
249{
250 int idx = __idx - X86_PMC_IDX_FIXED;
251 u64 ctrl_val, bits, mask;
252 int err;
253
254 /*
255 * Enable IRQ generation (0x8),
256 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
257 * if requested:
258 */
259 bits = 0x8ULL;
260 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
261 bits |= 0x2;
262 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
263 bits |= 0x1;
264 bits <<= (idx * 4);
265 mask = 0xfULL << (idx * 4);
266
267 rdmsrl(hwc->config_base, ctrl_val);
268 ctrl_val &= ~mask;
269 ctrl_val |= bits;
270 err = checking_wrmsrl(hwc->config_base, ctrl_val);
271}
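/*
 * [ Editor's sketch, not part of the patch: composing the 4-bit control
 *   nibble for fixed counter 1 as __pmc_fixed_enable() above does.
 *   PMI (0x8) + ring-3 (0x2) + ring-0 (0x1) gives 0xb, shifted into
 *   bits 4-7 of FIXED_CTR_CTRL while the old nibble is masked out. ]
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int idx = 1;                            /* second fixed counter */
        uint64_t ctrl_val = 0x3;                /* pre-existing value (example) */
        uint64_t bits = 0x8 | 0x2 | 0x1;        /* PMI + user + kernel counting */
        uint64_t mask = 0xfULL << (idx * 4);

        ctrl_val = (ctrl_val & ~mask) | (bits << (idx * 4));
        printf("FIXED_CTR_CTRL = %#llx\n", (unsigned long long)ctrl_val);
        return 0;
}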
272
273static void
274__pmc_generic_enable(struct perf_counter *counter,
275 struct hw_perf_counter *hwc, int idx)
276{
277 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
278 __pmc_fixed_enable(counter, hwc, idx);
279 else
280 wrmsr(hwc->config_base + idx,
281 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
282}
283
284static int
285fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
286{
287 unsigned int event;
288
289 if (unlikely(hwc->nmi))
290 return -1;
291
292 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
293
294 if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS]))
295 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
296 if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES]))
297 return X86_PMC_IDX_FIXED_CPU_CYCLES;
298 if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES]))
299 return X86_PMC_IDX_FIXED_BUS_CYCLES;
300
301 return -1;
302}
303
304/*
305 * Find a PMC slot for the freshly enabled / scheduled in counter:
306 */
307static int pmc_generic_enable(struct perf_counter *counter)
308{
309 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
310 struct hw_perf_counter *hwc = &counter->hw;
311 int idx;
312
313 idx = fixed_mode_idx(counter, hwc);
314 if (idx >= 0) {
315 /*
316 * Try to get the fixed counter, if that is already taken
317 * then try to get a generic counter:
318 */
319 if (test_and_set_bit(idx, cpuc->used))
320 goto try_generic;
321
322 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
323 /*
324 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
325 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
326 */
327 hwc->counter_base =
328 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
329 hwc->idx = idx;
330 } else {
331 idx = hwc->idx;
332 /* Try to get the previous generic counter again */
333 if (test_and_set_bit(idx, cpuc->used)) {
334try_generic:
335 idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
336 if (idx == nr_counters_generic)
337 return -EAGAIN;
338
339 set_bit(idx, cpuc->used);
340 hwc->idx = idx;
341 }
342 hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
343 hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
344 }
345
346 perf_counters_lapic_init(hwc->nmi);
347
348 __pmc_generic_disable(counter, hwc, idx);
349
350 cpuc->counters[idx] = counter;
351 /*
352 * Make it visible before enabling the hw:
353 */
354 smp_wmb();
355
356 __hw_perf_counter_set_period(counter, hwc, idx);
357 __pmc_generic_enable(counter, hwc, idx);
358
359 return 0;
360}
361
362void perf_counter_print_debug(void)
363{
364 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
365 struct cpu_hw_counters *cpuc;
366 int cpu, idx;
367
368 if (!nr_counters_generic)
369 return;
370
371 local_irq_disable();
372
373 cpu = smp_processor_id();
374 cpuc = &per_cpu(cpu_hw_counters, cpu);
375
376 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
377 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
378 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
379 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
380
381 printk(KERN_INFO "\n");
382 printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl);
383 printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status);
384 printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow);
385 printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed);
386 printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used);
387
388 for (idx = 0; idx < nr_counters_generic; idx++) {
389 rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
390 rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count);
391
392 prev_left = per_cpu(prev_left[idx], cpu);
393
394 printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n",
395 cpu, idx, pmc_ctrl);
396 printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n",
397 cpu, idx, pmc_count);
398 printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n",
399 cpu, idx, prev_left);
400 }
401 for (idx = 0; idx < nr_counters_fixed; idx++) {
402 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
403
404 printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n",
405 cpu, idx, pmc_count);
406 }
407 local_irq_enable();
408}
409
410static void pmc_generic_disable(struct perf_counter *counter)
411{
412 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
413 struct hw_perf_counter *hwc = &counter->hw;
414 unsigned int idx = hwc->idx;
415
416 __pmc_generic_disable(counter, hwc, idx);
417
418 clear_bit(idx, cpuc->used);
419 cpuc->counters[idx] = NULL;
420 /*
421 * Make sure the cleared pointer becomes visible before we
422 * (potentially) free the counter:
423 */
424 smp_wmb();
425
426 /*
427 * Drain the remaining delta count out of a counter
428 * that we are disabling:
429 */
430 x86_perf_counter_update(counter, hwc, idx);
431}
432
433static void perf_store_irq_data(struct perf_counter *counter, u64 data)
434{
435 struct perf_data *irqdata = counter->irqdata;
436
437 if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
438 irqdata->overrun++;
439 } else {
440 u64 *p = (u64 *) &irqdata->data[irqdata->len];
441
442 *p = data;
443 irqdata->len += sizeof(u64);
444 }
445}
446
447/*
448 * Save and restart an expired counter. Called by NMI contexts,
449 * so it has to be careful about preempting normal counter ops:
450 */
451static void perf_save_and_restart(struct perf_counter *counter)
452{
453 struct hw_perf_counter *hwc = &counter->hw;
454 int idx = hwc->idx;
455
456 x86_perf_counter_update(counter, hwc, idx);
457 __hw_perf_counter_set_period(counter, hwc, idx);
458
459 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
460 __pmc_generic_enable(counter, hwc, idx);
461}
462
463static void
464perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
465{
466 struct perf_counter *counter, *group_leader = sibling->group_leader;
467
468 /*
469 * Store sibling timestamps (if any):
470 */
471 list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
472
473 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
474 perf_store_irq_data(sibling, counter->hw_event.type);
475 perf_store_irq_data(sibling, atomic64_read(&counter->count));
476 }
477}
478
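/*
 * For PERF_RECORD_GROUP the overflowing counter's irqdata buffer ends up
 * holding pairs of u64 words, appended by perf_store_irq_data() above:
 *
 *	{ sibling0->hw_event.type, sibling0 count,
 *	  sibling1->hw_event.type, sibling1 count, ... }
 *
 * A consumer of the buffer is assumed to walk it in steps of 2*sizeof(u64);
 * entries that no longer fit are dropped and accounted in irqdata->overrun.
 */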
479/*
480 * Maximum interrupt frequency of 100KHz per CPU
481 */
482#define PERFMON_MAX_INTERRUPTS (100000/HZ)
483
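/*
 * PERFMON_MAX_INTERRUPTS is a per-tick budget: with HZ == 1000 it allows
 * 100000 / 1000 == 100 counter PMIs per CPU between two unthrottle points,
 * i.e. roughly 100 KHz sustained.  Once __smp_perf_counter_interrupt() has
 * seen that many interrupts it stops re-enabling the global control MSR,
 * and perf_counter_unthrottle() (expected to run from the regular tick)
 * re-arms it and resets the count.
 */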
484/*
485 * This handler is triggered by the local APIC, so the APIC IRQ handling
486 * rules apply:
487 */
488static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
489{
490 int bit, cpu = smp_processor_id();
491 u64 ack, status;
492 struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
493
494 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
495
496 /* Disable counters globally */
497 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
498 ack_APIC_irq();
499
500 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
501 if (!status)
502 goto out;
503
504again:
505 inc_irq_stat(apic_perf_irqs);
506 ack = status;
507 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
508 struct perf_counter *counter = cpuc->counters[bit];
509
510 clear_bit(bit, (unsigned long *) &status);
511 if (!counter)
512 continue;
513
514 perf_save_and_restart(counter);
515
516 switch (counter->hw_event.record_type) {
517 case PERF_RECORD_SIMPLE:
518 continue;
519 case PERF_RECORD_IRQ:
520 perf_store_irq_data(counter, instruction_pointer(regs));
521 break;
522 case PERF_RECORD_GROUP:
523 perf_handle_group(counter, &status, &ack);
524 break;
525 }
526 /*
527 * From NMI context we cannot call into the scheduler to
528 * do a task wakeup - but we mark these counters as
529 * wakeup_pending and initiate a wakeup callback:
530 */
531 if (nmi) {
532 counter->wakeup_pending = 1;
533 set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
534 } else {
535 wake_up(&counter->waitq);
536 }
537 }
538
539 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
540
541 /*
542 * Repeat if there is more work to be done:
543 */
544 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
545 if (status)
546 goto again;
547out:
548 /*
549 * Restore - do not reenable when global enable is off or throttled:
550 */
551 if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
552 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
553}
554
555void perf_counter_unthrottle(void)
556{
557 struct cpu_hw_counters *cpuc;
558 u64 global_enable;
559
560 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
561 return;
562
563 if (unlikely(!perf_counters_initialized))
564 return;
565
566 cpuc = &per_cpu(cpu_hw_counters, smp_processor_id());
567 if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
568 if (printk_ratelimit())
569 printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
570 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
571 }
572 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable);
573 if (unlikely(cpuc->global_enable && !global_enable))
574 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
575 cpuc->interrupts = 0;
576}
577
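/*
 * The trailing rdmsrl()/compare pair above covers the case where an NMI
 * saved a non-zero cpuc->global_enable but left MSR_CORE_PERF_GLOBAL_CTRL
 * cleared (e.g. because the throttle limit was hit inside the handler):
 * if the hardware copy reads back as zero while the saved copy says the
 * counters should be running, the saved value is written back before the
 * per-CPU interrupt budget is reset.
 */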
578void smp_perf_counter_interrupt(struct pt_regs *regs)
579{
580 irq_enter();
581 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
582 __smp_perf_counter_interrupt(regs, 0);
583
584 irq_exit();
585}
586
587/*
588 * This handler is triggered by NMI contexts:
589 */
590void perf_counter_notify(struct pt_regs *regs)
591{
592 struct cpu_hw_counters *cpuc;
593 unsigned long flags;
594 int bit, cpu;
595
596 local_irq_save(flags);
597 cpu = smp_processor_id();
598 cpuc = &per_cpu(cpu_hw_counters, cpu);
599
600 for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
601 struct perf_counter *counter = cpuc->counters[bit];
602
603 if (!counter)
604 continue;
605
606 if (counter->wakeup_pending) {
607 counter->wakeup_pending = 0;
608 wake_up(&counter->waitq);
609 }
610 }
611
612 local_irq_restore(flags);
613}
614
615void perf_counters_lapic_init(int nmi)
616{
617 u32 apic_val;
618
619 if (!perf_counters_initialized)
620 return;
621 /*
622 * Enable the performance counter vector in the APIC LVT:
623 */
624 apic_val = apic_read(APIC_LVTERR);
625
626 apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
627 if (nmi)
628 apic_write(APIC_LVTPC, APIC_DM_NMI);
629 else
630 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
631 apic_write(APIC_LVTERR, apic_val);
632}
633
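/*
 * Note the sequence above: the error LVT entry is masked while LVTPC is
 * reprogrammed and then restored.  The delivery mode chosen for LVTPC
 * decides which entry point fires on counter overflow: APIC_DM_NMI routes
 * overflows through the NMI die-notifier (perf_counter_nmi_handler below),
 * while LOCAL_PERF_VECTOR delivers a regular local interrupt handled by
 * smp_perf_counter_interrupt().
 */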
634static int __kprobes
635perf_counter_nmi_handler(struct notifier_block *self,
636 unsigned long cmd, void *__args)
637{
638 struct die_args *args = __args;
639 struct pt_regs *regs;
640
641 if (likely(cmd != DIE_NMI_IPI))
642 return NOTIFY_DONE;
643
644 regs = args->regs;
645
646 apic_write(APIC_LVTPC, APIC_DM_NMI);
647 __smp_perf_counter_interrupt(regs, 1);
648
649 return NOTIFY_STOP;
650}
651
652static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
653 .notifier_call = perf_counter_nmi_handler,
654 .next = NULL,
655 .priority = 1
656};
657
658void __init init_hw_perf_counters(void)
659{
660 union cpuid10_eax eax;
661 unsigned int ebx;
662 unsigned int unused;
663 union cpuid10_edx edx;
664
665 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
666 return;
667
668 /*
669 * Check whether the Architectural PerfMon supports
670 * the Branch Misses Retired event:
671 */
672 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
673 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
674 return;
675
676 printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
677
678 printk(KERN_INFO "... version: %d\n", eax.split.version_id);
679 printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters);
680 nr_counters_generic = eax.split.num_counters;
681 if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
682 nr_counters_generic = X86_PMC_MAX_GENERIC;
683 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
684 nr_counters_generic, X86_PMC_MAX_GENERIC);
685 }
686 perf_counter_mask = (1 << nr_counters_generic) - 1;
687 perf_max_counters = nr_counters_generic;
688
689 printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width);
690 counter_value_mask = (1ULL << eax.split.bit_width) - 1;
691 printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask);
692
693 printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length);
694
695 nr_counters_fixed = edx.split.num_counters_fixed;
696 if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
697 nr_counters_fixed = X86_PMC_MAX_FIXED;
698 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
699 nr_counters_fixed, X86_PMC_MAX_FIXED);
700 }
701 printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed);
702
703 perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
704
705 printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask);
706 perf_counters_initialized = true;
707
708 perf_counters_lapic_init(0);
709 register_die_notifier(&perf_counter_nmi_notifier);
710}
711
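/*
 * Shape of perf_counter_mask after the setup above, for a hypothetical CPU
 * reporting 2 generic and 3 fixed counters, assuming X86_PMC_IDX_FIXED is 32:
 *
 *	generic part:  (1 <<  2) - 1                       = 0x0000000000000003
 *	fixed part:   ((1LL << 3) - 1) << X86_PMC_IDX_FIXED = 0x0000000700000000
 *	perf_counter_mask                                   = 0x0000000700000003
 *
 * The same bit layout is used by MSR_CORE_PERF_GLOBAL_STATUS/CTRL, which is
 * why the fixed counters start at bit 32.
 */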
712static void pmc_generic_read(struct perf_counter *counter)
713{
714 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
715}
716
717static const struct hw_perf_counter_ops x86_perf_counter_ops = {
718 .enable = pmc_generic_enable,
719 .disable = pmc_generic_disable,
720 .read = pmc_generic_read,
721};
722
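/*
 * These ops are the hook points the generic perf_counter core calls when a
 * counter is scheduled onto or off this CPU: ->enable claims a PMC slot
 * (returning -EAGAIN when none is free), ->disable releases the slot and
 * folds the remaining hardware delta into the 64-bit software count, and
 * ->read just refreshes that count.  The exact scheduling policy applied on
 * -EAGAIN is assumed to live in the generic code, not in this file.
 */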
723const struct hw_perf_counter_ops *
724hw_perf_counter_init(struct perf_counter *counter)
725{
726 int err;
727
728 err = __hw_perf_counter_init(counter);
729 if (err)
730 return NULL;
731
732 return &x86_perf_counter_ops;
733}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9abd48b22674..d6f5b9fbde32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c689d19e35ab..11b93cabdf78 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -24,7 +24,7 @@
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/hpet.h> 25#include <asm/hpet.h>
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <asm/smp.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30 30
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index c302d0707048..d35db5993fd6 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -106,7 +106,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
106 const struct stacktrace_ops *ops, void *data) 106 const struct stacktrace_ops *ops, void *data)
107{ 107{
108 const unsigned cpu = get_cpu(); 108 const unsigned cpu = get_cpu();
109 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; 109 unsigned long *irq_stack_end =
110 (unsigned long *)per_cpu(irq_stack_ptr, cpu);
110 unsigned used = 0; 111 unsigned used = 0;
111 struct thread_info *tinfo; 112 struct thread_info *tinfo;
112 int graph = 0; 113 int graph = 0;
@@ -160,23 +161,23 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
160 stack = (unsigned long *) estack_end[-2]; 161 stack = (unsigned long *) estack_end[-2];
161 continue; 162 continue;
162 } 163 }
163 if (irqstack_end) { 164 if (irq_stack_end) {
164 unsigned long *irqstack; 165 unsigned long *irq_stack;
165 irqstack = irqstack_end - 166 irq_stack = irq_stack_end -
166 (IRQSTACKSIZE - 64) / sizeof(*irqstack); 167 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
167 168
168 if (stack >= irqstack && stack < irqstack_end) { 169 if (stack >= irq_stack && stack < irq_stack_end) {
169 if (ops->stack(data, "IRQ") < 0) 170 if (ops->stack(data, "IRQ") < 0)
170 break; 171 break;
171 bp = print_context_stack(tinfo, stack, bp, 172 bp = print_context_stack(tinfo, stack, bp,
172 ops, data, irqstack_end, &graph); 173 ops, data, irq_stack_end, &graph);
173 /* 174 /*
174 * We link to the next stack (which would be 175 * We link to the next stack (which would be
175 * the process stack normally) the last 176 * the process stack normally) the last
176 * pointer (index -1 to end) in the IRQ stack: 177 * pointer (index -1 to end) in the IRQ stack:
177 */ 178 */
178 stack = (unsigned long *) (irqstack_end[-1]); 179 stack = (unsigned long *) (irq_stack_end[-1]);
179 irqstack_end = NULL; 180 irq_stack_end = NULL;
180 ops->stack(data, "EOI"); 181 ops->stack(data, "EOI");
181 continue; 182 continue;
182 } 183 }
@@ -199,10 +200,10 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
199 unsigned long *stack; 200 unsigned long *stack;
200 int i; 201 int i;
201 const int cpu = smp_processor_id(); 202 const int cpu = smp_processor_id();
202 unsigned long *irqstack_end = 203 unsigned long *irq_stack_end =
203 (unsigned long *) (cpu_pda(cpu)->irqstackptr); 204 (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
204 unsigned long *irqstack = 205 unsigned long *irq_stack =
205 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); 206 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
206 207
207 /* 208 /*
208 * debugging aid: "show_stack(NULL, NULL);" prints the 209 * debugging aid: "show_stack(NULL, NULL);" prints the
@@ -218,9 +219,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
218 219
219 stack = sp; 220 stack = sp;
220 for (i = 0; i < kstack_depth_to_print; i++) { 221 for (i = 0; i < kstack_depth_to_print; i++) {
221 if (stack >= irqstack && stack <= irqstack_end) { 222 if (stack >= irq_stack && stack <= irq_stack_end) {
222 if (stack == irqstack_end) { 223 if (stack == irq_stack_end) {
223 stack = (unsigned long *) (irqstack_end[-1]); 224 stack = (unsigned long *) (irq_stack_end[-1]);
224 printk(" <EOI> "); 225 printk(" <EOI> ");
225 } 226 }
226 } else { 227 } else {
@@ -241,7 +242,7 @@ void show_registers(struct pt_regs *regs)
241 int i; 242 int i;
242 unsigned long sp; 243 unsigned long sp;
243 const int cpu = smp_processor_id(); 244 const int cpu = smp_processor_id();
244 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 245 struct task_struct *cur = current;
245 246
246 sp = regs->sp; 247 sp = regs->sp;
247 printk("CPU %d ", cpu); 248 printk("CPU %d ", cpu);
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1119d247fe11..b205272ad394 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -366,10 +366,12 @@ void __init efi_init(void)
366 SMBIOS_TABLE_GUID)) { 366 SMBIOS_TABLE_GUID)) {
367 efi.smbios = config_tables[i].table; 367 efi.smbios = config_tables[i].table;
368 printk(" SMBIOS=0x%lx ", config_tables[i].table); 368 printk(" SMBIOS=0x%lx ", config_tables[i].table);
369#ifdef CONFIG_X86_UV
369 } else if (!efi_guidcmp(config_tables[i].guid, 370 } else if (!efi_guidcmp(config_tables[i].guid,
370 UV_SYSTEM_TABLE_GUID)) { 371 UV_SYSTEM_TABLE_GUID)) {
371 efi.uv_systab = config_tables[i].table; 372 efi.uv_systab = config_tables[i].table;
372 printk(" UVsystab=0x%lx ", config_tables[i].table); 373 printk(" UVsystab=0x%lx ", config_tables[i].table);
374#endif
373 } else if (!efi_guidcmp(config_tables[i].guid, 375 } else if (!efi_guidcmp(config_tables[i].guid,
374 HCDP_TABLE_GUID)) { 376 HCDP_TABLE_GUID)) {
375 efi.hcdp = config_tables[i].table; 377 efi.hcdp = config_tables[i].table;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 652c5287215f..a4ee29127fdf 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -36,6 +36,7 @@
36#include <asm/proto.h> 36#include <asm/proto.h>
37#include <asm/efi.h> 37#include <asm/efi.h>
38#include <asm/cacheflush.h> 38#include <asm/cacheflush.h>
39#include <asm/fixmap.h>
39 40
40static pgd_t save_pgd __initdata; 41static pgd_t save_pgd __initdata;
41static unsigned long efi_flags __initdata; 42static unsigned long efi_flags __initdata;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 46469029e9d3..a0b91aac72a1 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -672,7 +672,7 @@ common_interrupt:
672ENDPROC(common_interrupt) 672ENDPROC(common_interrupt)
673 CFI_ENDPROC 673 CFI_ENDPROC
674 674
675#define BUILD_INTERRUPT(name, nr) \ 675#define BUILD_INTERRUPT3(name, nr, fn) \
676ENTRY(name) \ 676ENTRY(name) \
677 RING0_INT_FRAME; \ 677 RING0_INT_FRAME; \
678 pushl $~(nr); \ 678 pushl $~(nr); \
@@ -680,11 +680,13 @@ ENTRY(name) \
680 SAVE_ALL; \ 680 SAVE_ALL; \
681 TRACE_IRQS_OFF \ 681 TRACE_IRQS_OFF \
682 movl %esp,%eax; \ 682 movl %esp,%eax; \
683 call smp_##name; \ 683 call fn; \
684 jmp ret_from_intr; \ 684 jmp ret_from_intr; \
685 CFI_ENDPROC; \ 685 CFI_ENDPROC; \
686ENDPROC(name) 686ENDPROC(name)
687 687
688#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
689
688/* The include is where all of the SMP etc. interrupts come from */ 690/* The include is where all of the SMP etc. interrupts come from */
689#include "entry_arch.h" 691#include "entry_arch.h"
690 692
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a1346217e43c..8f8f61a1fce8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -52,6 +52,7 @@
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/percpu.h>
55 56
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h> 58#include <linux/elf-em.h>
@@ -209,7 +210,7 @@ ENTRY(native_usergs_sysret64)
209 210
210 /* %rsp:at FRAMEEND */ 211 /* %rsp:at FRAMEEND */
211 .macro FIXUP_TOP_OF_STACK tmp offset=0 212 .macro FIXUP_TOP_OF_STACK tmp offset=0
212 movq %gs:pda_oldrsp,\tmp 213 movq PER_CPU_VAR(old_rsp),\tmp
213 movq \tmp,RSP+\offset(%rsp) 214 movq \tmp,RSP+\offset(%rsp)
214 movq $__USER_DS,SS+\offset(%rsp) 215 movq $__USER_DS,SS+\offset(%rsp)
215 movq $__USER_CS,CS+\offset(%rsp) 216 movq $__USER_CS,CS+\offset(%rsp)
@@ -220,7 +221,7 @@ ENTRY(native_usergs_sysret64)
220 221
221 .macro RESTORE_TOP_OF_STACK tmp offset=0 222 .macro RESTORE_TOP_OF_STACK tmp offset=0
222 movq RSP+\offset(%rsp),\tmp 223 movq RSP+\offset(%rsp),\tmp
223 movq \tmp,%gs:pda_oldrsp 224 movq \tmp,PER_CPU_VAR(old_rsp)
224 movq EFLAGS+\offset(%rsp),\tmp 225 movq EFLAGS+\offset(%rsp),\tmp
225 movq \tmp,R11+\offset(%rsp) 226 movq \tmp,R11+\offset(%rsp)
226 .endm 227 .endm
@@ -336,15 +337,15 @@ ENTRY(save_args)
336 je 1f 337 je 1f
337 SWAPGS 338 SWAPGS
338 /* 339 /*
339 * irqcount is used to check if a CPU is already on an interrupt stack 340 * irq_count is used to check if a CPU is already on an interrupt stack
340 * or not. While this is essentially redundant with preempt_count it is 341 * or not. While this is essentially redundant with preempt_count it is
341 * a little cheaper to use a separate counter in the PDA (short of 342 * a little cheaper to use a separate counter in the PDA (short of
342 * moving irq_enter into assembly, which would be too much work) 343 * moving irq_enter into assembly, which would be too much work)
343 */ 344 */
3441: incl %gs:pda_irqcount 3451: incl PER_CPU_VAR(irq_count)
345 jne 2f 346 jne 2f
346 popq_cfi %rax /* move return address... */ 347 popq_cfi %rax /* move return address... */
347 mov %gs:pda_irqstackptr,%rsp 348 mov PER_CPU_VAR(irq_stack_ptr),%rsp
348 EMPTY_FRAME 0 349 EMPTY_FRAME 0
349 pushq_cfi %rbp /* backlink for unwinder */ 350 pushq_cfi %rbp /* backlink for unwinder */
350 pushq_cfi %rax /* ... to the new stack */ 351 pushq_cfi %rax /* ... to the new stack */
@@ -468,7 +469,7 @@ END(ret_from_fork)
468ENTRY(system_call) 469ENTRY(system_call)
469 CFI_STARTPROC simple 470 CFI_STARTPROC simple
470 CFI_SIGNAL_FRAME 471 CFI_SIGNAL_FRAME
471 CFI_DEF_CFA rsp,PDA_STACKOFFSET 472 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
472 CFI_REGISTER rip,rcx 473 CFI_REGISTER rip,rcx
473 /*CFI_REGISTER rflags,r11*/ 474 /*CFI_REGISTER rflags,r11*/
474 SWAPGS_UNSAFE_STACK 475 SWAPGS_UNSAFE_STACK
@@ -479,8 +480,8 @@ ENTRY(system_call)
479 */ 480 */
480ENTRY(system_call_after_swapgs) 481ENTRY(system_call_after_swapgs)
481 482
482 movq %rsp,%gs:pda_oldrsp 483 movq %rsp,PER_CPU_VAR(old_rsp)
483 movq %gs:pda_kernelstack,%rsp 484 movq PER_CPU_VAR(kernel_stack),%rsp
484 /* 485 /*
485 * No need to follow this irqs off/on section - it's straight 486 * No need to follow this irqs off/on section - it's straight
486 * and short: 487 * and short:
@@ -523,7 +524,7 @@ sysret_check:
523 CFI_REGISTER rip,rcx 524 CFI_REGISTER rip,rcx
524 RESTORE_ARGS 0,-ARG_SKIP,1 525 RESTORE_ARGS 0,-ARG_SKIP,1
525 /*CFI_REGISTER rflags,r11*/ 526 /*CFI_REGISTER rflags,r11*/
526 movq %gs:pda_oldrsp, %rsp 527 movq PER_CPU_VAR(old_rsp), %rsp
527 USERGS_SYSRET64 528 USERGS_SYSRET64
528 529
529 CFI_RESTORE_STATE 530 CFI_RESTORE_STATE
@@ -833,11 +834,11 @@ common_interrupt:
833 XCPT_FRAME 834 XCPT_FRAME
834 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 835 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
835 interrupt do_IRQ 836 interrupt do_IRQ
836 /* 0(%rsp): oldrsp-ARGOFFSET */ 837 /* 0(%rsp): old_rsp-ARGOFFSET */
837ret_from_intr: 838ret_from_intr:
838 DISABLE_INTERRUPTS(CLBR_NONE) 839 DISABLE_INTERRUPTS(CLBR_NONE)
839 TRACE_IRQS_OFF 840 TRACE_IRQS_OFF
840 decl %gs:pda_irqcount 841 decl PER_CPU_VAR(irq_count)
841 leaveq 842 leaveq
842 CFI_DEF_CFA_REGISTER rsp 843 CFI_DEF_CFA_REGISTER rsp
843 CFI_ADJUST_CFA_OFFSET -8 844 CFI_ADJUST_CFA_OFFSET -8
@@ -982,8 +983,10 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
982 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt 983 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
983#endif 984#endif
984 985
986#ifdef CONFIG_X86_UV
985apicinterrupt UV_BAU_MESSAGE \ 987apicinterrupt UV_BAU_MESSAGE \
986 uv_bau_message_intr1 uv_bau_message_interrupt 988 uv_bau_message_intr1 uv_bau_message_interrupt
989#endif
987apicinterrupt LOCAL_TIMER_VECTOR \ 990apicinterrupt LOCAL_TIMER_VECTOR \
988 apic_timer_interrupt smp_apic_timer_interrupt 991 apic_timer_interrupt smp_apic_timer_interrupt
989 992
@@ -1025,6 +1028,11 @@ apicinterrupt ERROR_APIC_VECTOR \
1025apicinterrupt SPURIOUS_APIC_VECTOR \ 1028apicinterrupt SPURIOUS_APIC_VECTOR \
1026 spurious_interrupt smp_spurious_interrupt 1029 spurious_interrupt smp_spurious_interrupt
1027 1030
1031#ifdef CONFIG_PERF_COUNTERS
1032apicinterrupt LOCAL_PERF_VECTOR \
1033 perf_counter_interrupt smp_perf_counter_interrupt
1034#endif
1035
1028/* 1036/*
1029 * Exception entry points. 1037 * Exception entry points.
1030 */ 1038 */
@@ -1073,10 +1081,10 @@ ENTRY(\sym)
1073 TRACE_IRQS_OFF 1081 TRACE_IRQS_OFF
1074 movq %rsp,%rdi /* pt_regs pointer */ 1082 movq %rsp,%rdi /* pt_regs pointer */
1075 xorl %esi,%esi /* no error code */ 1083 xorl %esi,%esi /* no error code */
1076 movq %gs:pda_data_offset, %rbp 1084 PER_CPU(init_tss, %rbp)
1077 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 1085 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1078 call \do_sym 1086 call \do_sym
1079 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 1087 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1080 jmp paranoid_exit /* %ebx: no swapgs flag */ 1088 jmp paranoid_exit /* %ebx: no swapgs flag */
1081 CFI_ENDPROC 1089 CFI_ENDPROC
1082END(\sym) 1090END(\sym)
@@ -1260,14 +1268,14 @@ ENTRY(call_softirq)
1260 CFI_REL_OFFSET rbp,0 1268 CFI_REL_OFFSET rbp,0
1261 mov %rsp,%rbp 1269 mov %rsp,%rbp
1262 CFI_DEF_CFA_REGISTER rbp 1270 CFI_DEF_CFA_REGISTER rbp
1263 incl %gs:pda_irqcount 1271 incl PER_CPU_VAR(irq_count)
1264 cmove %gs:pda_irqstackptr,%rsp 1272 cmove PER_CPU_VAR(irq_stack_ptr),%rsp
1265 push %rbp # backlink for old unwinder 1273 push %rbp # backlink for old unwinder
1266 call __do_softirq 1274 call __do_softirq
1267 leaveq 1275 leaveq
1268 CFI_DEF_CFA_REGISTER rsp 1276 CFI_DEF_CFA_REGISTER rsp
1269 CFI_ADJUST_CFA_OFFSET -8 1277 CFI_ADJUST_CFA_OFFSET -8
1270 decl %gs:pda_irqcount 1278 decl PER_CPU_VAR(irq_count)
1271 ret 1279 ret
1272 CFI_ENDPROC 1280 CFI_ENDPROC
1273END(call_softirq) 1281END(call_softirq)
@@ -1297,15 +1305,15 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1297 movq %rdi, %rsp # we don't return, adjust the stack frame 1305 movq %rdi, %rsp # we don't return, adjust the stack frame
1298 CFI_ENDPROC 1306 CFI_ENDPROC
1299 DEFAULT_FRAME 1307 DEFAULT_FRAME
130011: incl %gs:pda_irqcount 130811: incl PER_CPU_VAR(irq_count)
1301 movq %rsp,%rbp 1309 movq %rsp,%rbp
1302 CFI_DEF_CFA_REGISTER rbp 1310 CFI_DEF_CFA_REGISTER rbp
1303 cmovzq %gs:pda_irqstackptr,%rsp 1311 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
1304 pushq %rbp # backlink for old unwinder 1312 pushq %rbp # backlink for old unwinder
1305 call xen_evtchn_do_upcall 1313 call xen_evtchn_do_upcall
1306 popq %rsp 1314 popq %rsp
1307 CFI_DEF_CFA_REGISTER rsp 1315 CFI_DEF_CFA_REGISTER rsp
1308 decl %gs:pda_irqcount 1316 decl PER_CPU_VAR(irq_count)
1309 jmp error_exit 1317 jmp error_exit
1310 CFI_ENDPROC 1318 CFI_ENDPROC
1311END(do_hypervisor_callback) 1319END(do_hypervisor_callback)
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 2bced78b0b8e..e656c2721154 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -32,7 +32,9 @@ extern struct genapic apic_x2apic_cluster;
32struct genapic __read_mostly *genapic = &apic_flat; 32struct genapic __read_mostly *genapic = &apic_flat;
33 33
34static struct genapic *apic_probe[] __initdata = { 34static struct genapic *apic_probe[] __initdata = {
35#ifdef CONFIG_X86_UV
35 &apic_x2apic_uv_x, 36 &apic_x2apic_uv_x,
37#endif
36 &apic_x2apic_phys, 38 &apic_x2apic_phys,
37 &apic_x2apic_cluster, 39 &apic_x2apic_cluster,
38 &apic_physflat, 40 &apic_physflat,
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index b193e082f6ce..bfe36249145c 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -25,6 +25,7 @@
25#include <asm/ipi.h> 25#include <asm/ipi.h>
26#include <asm/genapic.h> 26#include <asm/genapic.h>
27#include <asm/pgtable.h> 27#include <asm/pgtable.h>
28#include <asm/uv/uv.h>
28#include <asm/uv/uv_mmrs.h> 29#include <asm/uv/uv_mmrs.h>
29#include <asm/uv/uv_hub.h> 30#include <asm/uv/uv_hub.h>
30#include <asm/uv/bios.h> 31#include <asm/uv/bios.h>
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b9a4d8c4b935..f5b272247690 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,27 +26,6 @@
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27#include <asm/trampoline.h> 27#include <asm/trampoline.h>
28 28
29/* boot cpu pda */
30static struct x8664_pda _boot_cpu_pda;
31
32#ifdef CONFIG_SMP
33/*
34 * We install an empty cpu_pda pointer table to indicate to early users
35 * (numa_set_node) that the cpu_pda pointer table for cpus other than
36 * the boot cpu is not yet setup.
37 */
38static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39#else
40static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
41#endif
42
43void __init x86_64_init_pda(void)
44{
45 _cpu_pda = __cpu_pda;
46 cpu_pda(0) = &_boot_cpu_pda;
47 pda_init(0);
48}
49
50static void __init zap_identity_mappings(void) 29static void __init zap_identity_mappings(void)
51{ 30{
52 pgd_t *pgd = pgd_offset_k(0UL); 31 pgd_t *pgd = pgd_offset_k(0UL);
@@ -112,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
112 if (console_loglevel == 10) 91 if (console_loglevel == 10)
113 early_printk("Kernel alive\n"); 92 early_printk("Kernel alive\n");
114 93
115 x86_64_init_pda();
116
117 x86_64_start_reservations(real_mode_data); 94 x86_64_start_reservations(real_mode_data);
118} 95}
119 96
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index e835b4eea70b..24c0e5cd71e3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -429,12 +429,14 @@ is386: movl $2,%ecx # set MP
429 ljmp $(__KERNEL_CS),$1f 429 ljmp $(__KERNEL_CS),$1f
4301: movl $(__KERNEL_DS),%eax # reload all the segment registers 4301: movl $(__KERNEL_DS),%eax # reload all the segment registers
431 movl %eax,%ss # after changing gdt. 431 movl %eax,%ss # after changing gdt.
432 movl %eax,%fs # gets reset once there's real percpu
433 432
434 movl $(__USER_DS),%eax # DS/ES contains default USER segment 433 movl $(__USER_DS),%eax # DS/ES contains default USER segment
435 movl %eax,%ds 434 movl %eax,%ds
436 movl %eax,%es 435 movl %eax,%es
437 436
437 movl $(__KERNEL_PERCPU), %eax
438 movl %eax,%fs # set this cpu's percpu
439
438 xorl %eax,%eax # Clear GS and LDT 440 xorl %eax,%eax # Clear GS and LDT
439 movl %eax,%gs 441 movl %eax,%gs
440 lldt %ax 442 lldt %ax
@@ -446,8 +448,6 @@ is386: movl $2,%ecx # set MP
446 movb $1, ready 448 movb $1, ready
447 cmpb $0,%cl # the first CPU calls start_kernel 449 cmpb $0,%cl # the first CPU calls start_kernel
448 je 1f 450 je 1f
449 movl $(__KERNEL_PERCPU), %eax
450 movl %eax,%fs # set this cpu's percpu
451 movl (stack_start), %esp 451 movl (stack_start), %esp
4521: 4521:
453#endif /* CONFIG_SMP */ 453#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0e275d495563..a0a2b5ca9b7d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,7 @@
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/cache.h> 20#include <asm/cache.h>
21#include <asm/processor-flags.h> 21#include <asm/processor-flags.h>
22#include <asm/percpu.h>
22 23
23#ifdef CONFIG_PARAVIRT 24#ifdef CONFIG_PARAVIRT
24#include <asm/asm-offsets.h> 25#include <asm/asm-offsets.h>
@@ -204,6 +205,19 @@ ENTRY(secondary_startup_64)
204 pushq $0 205 pushq $0
205 popfq 206 popfq
206 207
208#ifdef CONFIG_SMP
209 /*
210 * Fix up static pointers that need __per_cpu_load added. The assembler
211 * is unable to do this directly. This is only needed for the boot cpu.
212 * These values are set up with the correct base addresses by C code for
213 * secondary cpus.
214 */
215 movq initial_gs(%rip), %rax
216 cmpl $0, per_cpu__cpu_number(%rax)
217 jne 1f
218 addq %rax, early_gdt_descr_base(%rip)
2191:
220#endif
207 /* 221 /*
208 * We must switch to a new descriptor in kernel space for the GDT 222 * We must switch to a new descriptor in kernel space for the GDT
209 * because soon the kernel won't have access anymore to the userspace 223 * because soon the kernel won't have access anymore to the userspace
@@ -226,12 +240,15 @@ ENTRY(secondary_startup_64)
226 movl %eax,%fs 240 movl %eax,%fs
227 movl %eax,%gs 241 movl %eax,%gs
228 242
229 /* 243 /* Set up %gs.
230 * Setup up a dummy PDA. this is just for some early bootup code 244 *
231 * that does in_interrupt() 245 * The base of %gs always points to the bottom of the irqstack
232 */ 246 * union. If the stack protector canary is enabled, it is
247 * located at %gs:40. Note that, on SMP, the boot cpu uses
248 * init data section till per cpu areas are set up.
249 */
233 movl $MSR_GS_BASE,%ecx 250 movl $MSR_GS_BASE,%ecx
234 movq $empty_zero_page,%rax 251 movq initial_gs(%rip),%rax
235 movq %rax,%rdx 252 movq %rax,%rdx
236 shrq $32,%rdx 253 shrq $32,%rdx
237 wrmsr 254 wrmsr
@@ -257,6 +274,12 @@ ENTRY(secondary_startup_64)
257 .align 8 274 .align 8
258 ENTRY(initial_code) 275 ENTRY(initial_code)
259 .quad x86_64_start_kernel 276 .quad x86_64_start_kernel
277 ENTRY(initial_gs)
278#ifdef CONFIG_SMP
279 .quad __per_cpu_load
280#else
281 .quad PER_CPU_VAR(irq_stack_union)
282#endif
260 __FINITDATA 283 __FINITDATA
261 284
262 ENTRY(stack_start) 285 ENTRY(stack_start)
@@ -401,7 +424,8 @@ NEXT_PAGE(level2_spare_pgt)
401 .globl early_gdt_descr 424 .globl early_gdt_descr
402early_gdt_descr: 425early_gdt_descr:
403 .word GDT_ENTRIES*8-1 426 .word GDT_ENTRIES*8-1
404 .quad per_cpu__gdt_page 427early_gdt_descr_base:
428 .quad per_cpu__gdt_page
405 429
406ENTRY(phys_base) 430ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 431 /* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index bc7ac4da90d7..f61d945620b3 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -46,6 +46,7 @@
46#include <asm/idle.h> 46#include <asm/idle.h>
47#include <asm/io.h> 47#include <asm/io.h>
48#include <asm/smp.h> 48#include <asm/smp.h>
49#include <asm/cpu.h>
49#include <asm/desc.h> 50#include <asm/desc.h>
50#include <asm/proto.h> 51#include <asm/proto.h>
51#include <asm/acpi.h> 52#include <asm/acpi.h>
@@ -82,11 +83,11 @@ static DEFINE_SPINLOCK(vector_lock);
82int nr_ioapic_registers[MAX_IO_APICS]; 83int nr_ioapic_registers[MAX_IO_APICS];
83 84
84/* I/O APIC entries */ 85/* I/O APIC entries */
85struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; 86struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
86int nr_ioapics; 87int nr_ioapics;
87 88
88/* MP IRQ source entries */ 89/* MP IRQ source entries */
89struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; 90struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
90 91
91/* # of MP IRQ source entries */ 92/* # of MP IRQ source entries */
92int mp_irq_entries; 93int mp_irq_entries;
@@ -356,7 +357,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
356 357
357 if (!cfg->move_in_progress) { 358 if (!cfg->move_in_progress) {
358 /* it means that domain is not changed */ 359 /* it means that domain is not changed */
359 if (!cpumask_intersects(&desc->affinity, mask)) 360 if (!cpumask_intersects(desc->affinity, mask))
360 cfg->move_desc_pending = 1; 361 cfg->move_desc_pending = 1;
361 } 362 }
362} 363}
@@ -386,7 +387,7 @@ struct io_apic {
386static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 387static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
387{ 388{
388 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 389 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
389 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); 390 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
390} 391}
391 392
392static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) 393static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -579,9 +580,9 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
579 if (assign_irq_vector(irq, cfg, mask)) 580 if (assign_irq_vector(irq, cfg, mask))
580 return BAD_APICID; 581 return BAD_APICID;
581 582
582 cpumask_and(&desc->affinity, cfg->domain, mask); 583 cpumask_and(desc->affinity, cfg->domain, mask);
583 set_extra_move_desc(desc, mask); 584 set_extra_move_desc(desc, mask);
584 return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); 585 return cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
585} 586}
586 587
587static void 588static void
@@ -944,10 +945,10 @@ static int find_irq_entry(int apic, int pin, int type)
944 int i; 945 int i;
945 946
946 for (i = 0; i < mp_irq_entries; i++) 947 for (i = 0; i < mp_irq_entries; i++)
947 if (mp_irqs[i].mp_irqtype == type && 948 if (mp_irqs[i].irqtype == type &&
948 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || 949 (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
949 mp_irqs[i].mp_dstapic == MP_APIC_ALL) && 950 mp_irqs[i].dstapic == MP_APIC_ALL) &&
950 mp_irqs[i].mp_dstirq == pin) 951 mp_irqs[i].dstirq == pin)
951 return i; 952 return i;
952 953
953 return -1; 954 return -1;
@@ -961,13 +962,13 @@ static int __init find_isa_irq_pin(int irq, int type)
961 int i; 962 int i;
962 963
963 for (i = 0; i < mp_irq_entries; i++) { 964 for (i = 0; i < mp_irq_entries; i++) {
964 int lbus = mp_irqs[i].mp_srcbus; 965 int lbus = mp_irqs[i].srcbus;
965 966
966 if (test_bit(lbus, mp_bus_not_pci) && 967 if (test_bit(lbus, mp_bus_not_pci) &&
967 (mp_irqs[i].mp_irqtype == type) && 968 (mp_irqs[i].irqtype == type) &&
968 (mp_irqs[i].mp_srcbusirq == irq)) 969 (mp_irqs[i].srcbusirq == irq))
969 970
970 return mp_irqs[i].mp_dstirq; 971 return mp_irqs[i].dstirq;
971 } 972 }
972 return -1; 973 return -1;
973} 974}
@@ -977,17 +978,17 @@ static int __init find_isa_irq_apic(int irq, int type)
977 int i; 978 int i;
978 979
979 for (i = 0; i < mp_irq_entries; i++) { 980 for (i = 0; i < mp_irq_entries; i++) {
980 int lbus = mp_irqs[i].mp_srcbus; 981 int lbus = mp_irqs[i].srcbus;
981 982
982 if (test_bit(lbus, mp_bus_not_pci) && 983 if (test_bit(lbus, mp_bus_not_pci) &&
983 (mp_irqs[i].mp_irqtype == type) && 984 (mp_irqs[i].irqtype == type) &&
984 (mp_irqs[i].mp_srcbusirq == irq)) 985 (mp_irqs[i].srcbusirq == irq))
985 break; 986 break;
986 } 987 }
987 if (i < mp_irq_entries) { 988 if (i < mp_irq_entries) {
988 int apic; 989 int apic;
989 for(apic = 0; apic < nr_ioapics; apic++) { 990 for(apic = 0; apic < nr_ioapics; apic++) {
990 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) 991 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
991 return apic; 992 return apic;
992 } 993 }
993 } 994 }
@@ -1012,23 +1013,23 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
1012 return -1; 1013 return -1;
1013 } 1014 }
1014 for (i = 0; i < mp_irq_entries; i++) { 1015 for (i = 0; i < mp_irq_entries; i++) {
1015 int lbus = mp_irqs[i].mp_srcbus; 1016 int lbus = mp_irqs[i].srcbus;
1016 1017
1017 for (apic = 0; apic < nr_ioapics; apic++) 1018 for (apic = 0; apic < nr_ioapics; apic++)
1018 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || 1019 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
1019 mp_irqs[i].mp_dstapic == MP_APIC_ALL) 1020 mp_irqs[i].dstapic == MP_APIC_ALL)
1020 break; 1021 break;
1021 1022
1022 if (!test_bit(lbus, mp_bus_not_pci) && 1023 if (!test_bit(lbus, mp_bus_not_pci) &&
1023 !mp_irqs[i].mp_irqtype && 1024 !mp_irqs[i].irqtype &&
1024 (bus == lbus) && 1025 (bus == lbus) &&
1025 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { 1026 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1026 int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); 1027 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1027 1028
1028 if (!(apic || IO_APIC_IRQ(irq))) 1029 if (!(apic || IO_APIC_IRQ(irq)))
1029 continue; 1030 continue;
1030 1031
1031 if (pin == (mp_irqs[i].mp_srcbusirq & 3)) 1032 if (pin == (mp_irqs[i].srcbusirq & 3))
1032 return irq; 1033 return irq;
1033 /* 1034 /*
1034 * Use the first all-but-pin matching entry as a 1035 * Use the first all-but-pin matching entry as a
@@ -1071,7 +1072,7 @@ static int EISA_ELCR(unsigned int irq)
1071 * EISA conforming in the MP table, that means its trigger type must 1072 * EISA conforming in the MP table, that means its trigger type must
1072 * be read in from the ELCR */ 1073 * be read in from the ELCR */
1073 1074
1074#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) 1075#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
1075#define default_EISA_polarity(idx) default_ISA_polarity(idx) 1076#define default_EISA_polarity(idx) default_ISA_polarity(idx)
1076 1077
1077/* PCI interrupts are always polarity one level triggered, 1078/* PCI interrupts are always polarity one level triggered,
@@ -1088,13 +1089,13 @@ static int EISA_ELCR(unsigned int irq)
1088 1089
1089static int MPBIOS_polarity(int idx) 1090static int MPBIOS_polarity(int idx)
1090{ 1091{
1091 int bus = mp_irqs[idx].mp_srcbus; 1092 int bus = mp_irqs[idx].srcbus;
1092 int polarity; 1093 int polarity;
1093 1094
1094 /* 1095 /*
1095 * Determine IRQ line polarity (high active or low active): 1096 * Determine IRQ line polarity (high active or low active):
1096 */ 1097 */
1097 switch (mp_irqs[idx].mp_irqflag & 3) 1098 switch (mp_irqs[idx].irqflag & 3)
1098 { 1099 {
1099 case 0: /* conforms, ie. bus-type dependent polarity */ 1100 case 0: /* conforms, ie. bus-type dependent polarity */
1100 if (test_bit(bus, mp_bus_not_pci)) 1101 if (test_bit(bus, mp_bus_not_pci))
@@ -1130,13 +1131,13 @@ static int MPBIOS_polarity(int idx)
1130 1131
1131static int MPBIOS_trigger(int idx) 1132static int MPBIOS_trigger(int idx)
1132{ 1133{
1133 int bus = mp_irqs[idx].mp_srcbus; 1134 int bus = mp_irqs[idx].srcbus;
1134 int trigger; 1135 int trigger;
1135 1136
1136 /* 1137 /*
1137 * Determine IRQ trigger mode (edge or level sensitive): 1138 * Determine IRQ trigger mode (edge or level sensitive):
1138 */ 1139 */
1139 switch ((mp_irqs[idx].mp_irqflag>>2) & 3) 1140 switch ((mp_irqs[idx].irqflag>>2) & 3)
1140 { 1141 {
1141 case 0: /* conforms, ie. bus-type dependent */ 1142 case 0: /* conforms, ie. bus-type dependent */
1142 if (test_bit(bus, mp_bus_not_pci)) 1143 if (test_bit(bus, mp_bus_not_pci))
@@ -1214,16 +1215,16 @@ int (*ioapic_renumber_irq)(int ioapic, int irq);
1214static int pin_2_irq(int idx, int apic, int pin) 1215static int pin_2_irq(int idx, int apic, int pin)
1215{ 1216{
1216 int irq, i; 1217 int irq, i;
1217 int bus = mp_irqs[idx].mp_srcbus; 1218 int bus = mp_irqs[idx].srcbus;
1218 1219
1219 /* 1220 /*
1220 * Debugging check, we are in big trouble if this message pops up! 1221 * Debugging check, we are in big trouble if this message pops up!
1221 */ 1222 */
1222 if (mp_irqs[idx].mp_dstirq != pin) 1223 if (mp_irqs[idx].dstirq != pin)
1223 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); 1224 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1224 1225
1225 if (test_bit(bus, mp_bus_not_pci)) { 1226 if (test_bit(bus, mp_bus_not_pci)) {
1226 irq = mp_irqs[idx].mp_srcbusirq; 1227 irq = mp_irqs[idx].srcbusirq;
1227 } else { 1228 } else {
1228 /* 1229 /*
1229 * PCI IRQs are mapped in order 1230 * PCI IRQs are mapped in order
@@ -1566,14 +1567,14 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de
1566 apic_printk(APIC_VERBOSE,KERN_DEBUG 1567 apic_printk(APIC_VERBOSE,KERN_DEBUG
1567 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1568 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1568 "IRQ %d Mode:%i Active:%i)\n", 1569 "IRQ %d Mode:%i Active:%i)\n",
1569 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, 1570 apic, mp_ioapics[apic].apicid, pin, cfg->vector,
1570 irq, trigger, polarity); 1571 irq, trigger, polarity);
1571 1572
1572 1573
1573 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, 1574 if (setup_ioapic_entry(mp_ioapics[apic].apicid, irq, &entry,
1574 dest, trigger, polarity, cfg->vector)) { 1575 dest, trigger, polarity, cfg->vector)) {
1575 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1576 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1576 mp_ioapics[apic].mp_apicid, pin); 1577 mp_ioapics[apic].apicid, pin);
1577 __clear_irq_vector(irq, cfg); 1578 __clear_irq_vector(irq, cfg);
1578 return; 1579 return;
1579 } 1580 }
@@ -1604,12 +1605,10 @@ static void __init setup_IO_APIC_irqs(void)
1604 notcon = 1; 1605 notcon = 1;
1605 apic_printk(APIC_VERBOSE, 1606 apic_printk(APIC_VERBOSE,
1606 KERN_DEBUG " %d-%d", 1607 KERN_DEBUG " %d-%d",
1607 mp_ioapics[apic].mp_apicid, 1608 mp_ioapics[apic].apicid, pin);
1608 pin);
1609 } else 1609 } else
1610 apic_printk(APIC_VERBOSE, " %d-%d", 1610 apic_printk(APIC_VERBOSE, " %d-%d",
1611 mp_ioapics[apic].mp_apicid, 1611 mp_ioapics[apic].apicid, pin);
1612 pin);
1613 continue; 1612 continue;
1614 } 1613 }
1615 if (notcon) { 1614 if (notcon) {
@@ -1699,7 +1698,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1699 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1698 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1700 for (i = 0; i < nr_ioapics; i++) 1699 for (i = 0; i < nr_ioapics; i++)
1701 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1700 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1702 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); 1701 mp_ioapics[i].apicid, nr_ioapic_registers[i]);
1703 1702
1704 /* 1703 /*
1705 * We are a bit conservative about what we expect. We have to 1704 * We are a bit conservative about what we expect. We have to
@@ -1719,7 +1718,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1719 spin_unlock_irqrestore(&ioapic_lock, flags); 1718 spin_unlock_irqrestore(&ioapic_lock, flags);
1720 1719
1721 printk("\n"); 1720 printk("\n");
1722 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); 1721 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
1723 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1722 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1724 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1723 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1725 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1724 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -2121,14 +2120,14 @@ static void __init setup_ioapic_ids_from_mpc(void)
2121 reg_00.raw = io_apic_read(apic, 0); 2120 reg_00.raw = io_apic_read(apic, 0);
2122 spin_unlock_irqrestore(&ioapic_lock, flags); 2121 spin_unlock_irqrestore(&ioapic_lock, flags);
2123 2122
2124 old_id = mp_ioapics[apic].mp_apicid; 2123 old_id = mp_ioapics[apic].apicid;
2125 2124
2126 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { 2125 if (mp_ioapics[apic].apicid >= get_physical_broadcast()) {
2127 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 2126 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
2128 apic, mp_ioapics[apic].mp_apicid); 2127 apic, mp_ioapics[apic].apicid);
2129 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 2128 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2130 reg_00.bits.ID); 2129 reg_00.bits.ID);
2131 mp_ioapics[apic].mp_apicid = reg_00.bits.ID; 2130 mp_ioapics[apic].apicid = reg_00.bits.ID;
2132 } 2131 }
2133 2132
2134 /* 2133 /*
@@ -2137,9 +2136,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
2137 * 'stuck on smp_invalidate_needed IPI wait' messages. 2136 * 'stuck on smp_invalidate_needed IPI wait' messages.
2138 */ 2137 */
2139 if (check_apicid_used(phys_id_present_map, 2138 if (check_apicid_used(phys_id_present_map,
2140 mp_ioapics[apic].mp_apicid)) { 2139 mp_ioapics[apic].apicid)) {
2141 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 2140 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2142 apic, mp_ioapics[apic].mp_apicid); 2141 apic, mp_ioapics[apic].apicid);
2143 for (i = 0; i < get_physical_broadcast(); i++) 2142 for (i = 0; i < get_physical_broadcast(); i++)
2144 if (!physid_isset(i, phys_id_present_map)) 2143 if (!physid_isset(i, phys_id_present_map))
2145 break; 2144 break;
@@ -2148,13 +2147,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
2148 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 2147 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2149 i); 2148 i);
2150 physid_set(i, phys_id_present_map); 2149 physid_set(i, phys_id_present_map);
2151 mp_ioapics[apic].mp_apicid = i; 2150 mp_ioapics[apic].apicid = i;
2152 } else { 2151 } else {
2153 physid_mask_t tmp; 2152 physid_mask_t tmp;
2154 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); 2153 tmp = apicid_to_cpu_present(mp_ioapics[apic].apicid);
2155 apic_printk(APIC_VERBOSE, "Setting %d in the " 2154 apic_printk(APIC_VERBOSE, "Setting %d in the "
2156 "phys_id_present_map\n", 2155 "phys_id_present_map\n",
2157 mp_ioapics[apic].mp_apicid); 2156 mp_ioapics[apic].apicid);
2158 physids_or(phys_id_present_map, phys_id_present_map, tmp); 2157 physids_or(phys_id_present_map, phys_id_present_map, tmp);
2159 } 2158 }
2160 2159
@@ -2163,11 +2162,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
2163 * We need to adjust the IRQ routing table 2162 * We need to adjust the IRQ routing table
2164 * if the ID changed. 2163 * if the ID changed.
2165 */ 2164 */
2166 if (old_id != mp_ioapics[apic].mp_apicid) 2165 if (old_id != mp_ioapics[apic].apicid)
2167 for (i = 0; i < mp_irq_entries; i++) 2166 for (i = 0; i < mp_irq_entries; i++)
2168 if (mp_irqs[i].mp_dstapic == old_id) 2167 if (mp_irqs[i].dstapic == old_id)
2169 mp_irqs[i].mp_dstapic 2168 mp_irqs[i].dstapic
2170 = mp_ioapics[apic].mp_apicid; 2169 = mp_ioapics[apic].apicid;
2171 2170
2172 /* 2171 /*
2173 * Read the right value from the MPC table and 2172 * Read the right value from the MPC table and
@@ -2175,9 +2174,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
2175 */ 2174 */
2176 apic_printk(APIC_VERBOSE, KERN_INFO 2175 apic_printk(APIC_VERBOSE, KERN_INFO
2177 "...changing IO-APIC physical APIC ID to %d ...", 2176 "...changing IO-APIC physical APIC ID to %d ...",
2178 mp_ioapics[apic].mp_apicid); 2177 mp_ioapics[apic].apicid);
2179 2178
2180 reg_00.bits.ID = mp_ioapics[apic].mp_apicid; 2179 reg_00.bits.ID = mp_ioapics[apic].apicid;
2181 spin_lock_irqsave(&ioapic_lock, flags); 2180 spin_lock_irqsave(&ioapic_lock, flags);
2182 io_apic_write(apic, 0, reg_00.raw); 2181 io_apic_write(apic, 0, reg_00.raw);
2183 spin_unlock_irqrestore(&ioapic_lock, flags); 2182 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2188,7 +2187,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
2188 spin_lock_irqsave(&ioapic_lock, flags); 2187 spin_lock_irqsave(&ioapic_lock, flags);
2189 reg_00.raw = io_apic_read(apic, 0); 2188 reg_00.raw = io_apic_read(apic, 0);
2190 spin_unlock_irqrestore(&ioapic_lock, flags); 2189 spin_unlock_irqrestore(&ioapic_lock, flags);
2191 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) 2190 if (reg_00.bits.ID != mp_ioapics[apic].apicid)
2192 printk("could not set ID!\n"); 2191 printk("could not set ID!\n");
2193 else 2192 else
2194 apic_printk(APIC_VERBOSE, " ok.\n"); 2193 apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2383,7 +2382,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2383 if (cfg->move_in_progress) 2382 if (cfg->move_in_progress)
2384 send_cleanup_vector(cfg); 2383 send_cleanup_vector(cfg);
2385 2384
2386 cpumask_copy(&desc->affinity, mask); 2385 cpumask_copy(desc->affinity, mask);
2387} 2386}
2388 2387
2389static int migrate_irq_remapped_level_desc(struct irq_desc *desc) 2388static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
@@ -2405,11 +2404,11 @@ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2405 } 2404 }
2406 2405
2407 /* everthing is clear. we have right of way */ 2406 /* everthing is clear. we have right of way */
2408 migrate_ioapic_irq_desc(desc, &desc->pending_mask); 2407 migrate_ioapic_irq_desc(desc, desc->pending_mask);
2409 2408
2410 ret = 0; 2409 ret = 0;
2411 desc->status &= ~IRQ_MOVE_PENDING; 2410 desc->status &= ~IRQ_MOVE_PENDING;
2412 cpumask_clear(&desc->pending_mask); 2411 cpumask_clear(desc->pending_mask);
2413 2412
2414unmask: 2413unmask:
2415 unmask_IO_APIC_irq_desc(desc); 2414 unmask_IO_APIC_irq_desc(desc);
@@ -2434,7 +2433,7 @@ static void ir_irq_migration(struct work_struct *work)
2434 continue; 2433 continue;
2435 } 2434 }
2436 2435
2437 desc->chip->set_affinity(irq, &desc->pending_mask); 2436 desc->chip->set_affinity(irq, desc->pending_mask);
2438 spin_unlock_irqrestore(&desc->lock, flags); 2437 spin_unlock_irqrestore(&desc->lock, flags);
2439 } 2438 }
2440 } 2439 }
@@ -2448,7 +2447,7 @@ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2448{ 2447{
2449 if (desc->status & IRQ_LEVEL) { 2448 if (desc->status & IRQ_LEVEL) {
2450 desc->status |= IRQ_MOVE_PENDING; 2449 desc->status |= IRQ_MOVE_PENDING;
2451 cpumask_copy(&desc->pending_mask, mask); 2450 cpumask_copy(desc->pending_mask, mask);
2452 migrate_irq_remapped_level_desc(desc); 2451 migrate_irq_remapped_level_desc(desc);
2453 return; 2452 return;
2454 } 2453 }
@@ -2516,7 +2515,7 @@ static void irq_complete_move(struct irq_desc **descp)
2516 2515
2517 /* domain has not changed, but affinity did */ 2516 /* domain has not changed, but affinity did */
2518 me = smp_processor_id(); 2517 me = smp_processor_id();
2519 if (cpu_isset(me, desc->affinity)) { 2518 if (cpumask_test_cpu(me, desc->affinity)) {
2520 *descp = desc = move_irq_desc(desc, me); 2519 *descp = desc = move_irq_desc(desc, me);
2521 /* get the new one */ 2520 /* get the new one */
2522 cfg = desc->chip_data; 2521 cfg = desc->chip_data;
@@ -3118,8 +3117,8 @@ static int ioapic_resume(struct sys_device *dev)
3118 3117
3119 spin_lock_irqsave(&ioapic_lock, flags); 3118 spin_lock_irqsave(&ioapic_lock, flags);
3120 reg_00.raw = io_apic_read(dev->id, 0); 3119 reg_00.raw = io_apic_read(dev->id, 0);
3121 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { 3120 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
3122 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; 3121 reg_00.bits.ID = mp_ioapics[dev->id].apicid;
3123 io_apic_write(dev->id, 0, reg_00.raw); 3122 io_apic_write(dev->id, 0, reg_00.raw);
3124 } 3123 }
3125 spin_unlock_irqrestore(&ioapic_lock, flags); 3124 spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -3184,7 +3183,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
3184 3183
3185 irq = 0; 3184 irq = 0;
3186 spin_lock_irqsave(&vector_lock, flags); 3185 spin_lock_irqsave(&vector_lock, flags);
3187 for (new = irq_want; new < NR_IRQS; new++) { 3186 for (new = irq_want; new < nr_irqs; new++) {
3188 if (platform_legacy_irq(new)) 3187 if (platform_legacy_irq(new))
3189 continue; 3188 continue;
3190 3189
@@ -3259,6 +3258,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3259 int err; 3258 int err;
3260 unsigned dest; 3259 unsigned dest;
3261 3260
3261 if (disable_apic)
3262 return -ENXIO;
3263
3262 cfg = irq_cfg(irq); 3264 cfg = irq_cfg(irq);
3263 err = assign_irq_vector(irq, cfg, TARGET_CPUS); 3265 err = assign_irq_vector(irq, cfg, TARGET_CPUS);
3264 if (err) 3266 if (err)
@@ -3727,6 +3729,9 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3727 struct irq_cfg *cfg; 3729 struct irq_cfg *cfg;
3728 int err; 3730 int err;
3729 3731
3732 if (disable_apic)
3733 return -ENXIO;
3734
3730 cfg = irq_cfg(irq); 3735 cfg = irq_cfg(irq);
3731 err = assign_irq_vector(irq, cfg, TARGET_CPUS); 3736 err = assign_irq_vector(irq, cfg, TARGET_CPUS);
3732 if (!err) { 3737 if (!err) {
@@ -3761,7 +3766,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3761} 3766}
3762#endif /* CONFIG_HT_IRQ */ 3767#endif /* CONFIG_HT_IRQ */
3763 3768
3764#ifdef CONFIG_X86_64 3769#ifdef CONFIG_X86_UV
3765/* 3770/*
3766 * Re-target the irq to the specified CPU and enable the specified MMR located 3771 * Re-target the irq to the specified CPU and enable the specified MMR located
3767 * on the specified blade to allow the sending of MSIs to the specified CPU. 3772 * on the specified blade to allow the sending of MSIs to the specified CPU.
@@ -3861,6 +3866,22 @@ void __init probe_nr_irqs_gsi(void)
3861 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); 3866 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
3862} 3867}
3863 3868
3869#ifdef CONFIG_SPARSE_IRQ
3870int __init arch_probe_nr_irqs(void)
3871{
3872 int nr;
3873
3874 nr = ((8 * nr_cpu_ids) > (32 * nr_ioapics) ?
3875 (NR_VECTORS + (8 * nr_cpu_ids)) :
3876 (NR_VECTORS + (32 * nr_ioapics)));
3877
3878 if (nr < nr_irqs && nr > nr_irqs_gsi)
3879 nr_irqs = nr;
3880
3881 return 0;
3882}
3883#endif
3884
3864/* -------------------------------------------------------------------------- 3885/* --------------------------------------------------------------------------
3865 ACPI-based IOAPIC Configuration 3886 ACPI-based IOAPIC Configuration
3866 -------------------------------------------------------------------------- */ 3887 -------------------------------------------------------------------------- */
@@ -3995,8 +4016,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
3995 return -1; 4016 return -1;
3996 4017
3997 for (i = 0; i < mp_irq_entries; i++) 4018 for (i = 0; i < mp_irq_entries; i++)
3998 if (mp_irqs[i].mp_irqtype == mp_INT && 4019 if (mp_irqs[i].irqtype == mp_INT &&
3999 mp_irqs[i].mp_srcbusirq == bus_irq) 4020 mp_irqs[i].srcbusirq == bus_irq)
4000 break; 4021 break;
4001 if (i >= mp_irq_entries) 4022 if (i >= mp_irq_entries)
4002 return -1; 4023 return -1;
@@ -4050,7 +4071,7 @@ void __init setup_ioapic_dest(void)
4050 */ 4071 */
4051 if (desc->status & 4072 if (desc->status &
4052 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 4073 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4053 mask = &desc->affinity; 4074 mask = desc->affinity;
4054 else 4075 else
4055 mask = TARGET_CPUS; 4076 mask = TARGET_CPUS;
4056 4077
@@ -4111,7 +4132,7 @@ void __init ioapic_init_mappings(void)
4111 ioapic_res = ioapic_setup_resources(); 4132 ioapic_res = ioapic_setup_resources();
4112 for (i = 0; i < nr_ioapics; i++) { 4133 for (i = 0; i < nr_ioapics; i++) {
4113 if (smp_found_config) { 4134 if (smp_found_config) {
4114 ioapic_phys = mp_ioapics[i].mp_apicaddr; 4135 ioapic_phys = mp_ioapics[i].apicaddr;
4115#ifdef CONFIG_X86_32 4136#ifdef CONFIG_X86_32
4116 if (!ioapic_phys) { 4137 if (!ioapic_phys) {
4117 printk(KERN_ERR 4138 printk(KERN_ERR
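
The arch_probe_nr_irqs() addition above sizes the sparse-irq pool from whichever is larger: 8 vectors per possible CPU or 32 per I/O APIC, stacked on top of NR_VECTORS, and only replaces nr_irqs when the result is below the current value and above nr_irqs_gsi. A minimal userspace sketch of that arithmetic, assuming NR_VECTORS is 256 and using made-up CPU/IO-APIC counts purely for illustration:

#include <stdio.h>

#define NR_VECTORS 256                  /* assumed x86 value, for illustration only */

/* mirrors the sizing heuristic in arch_probe_nr_irqs() above */
static int probe_nr_irqs(int nr_cpu_ids, int nr_ioapics)
{
        return (8 * nr_cpu_ids) > (32 * nr_ioapics) ?
                NR_VECTORS + 8 * nr_cpu_ids :
                NR_VECTORS + 32 * nr_ioapics;
}

int main(void)
{
        /* sample box: 16 possible CPUs, 2 I/O APICs (made-up numbers) */
        printf("suggested nr_irqs: %d\n", probe_nr_irqs(16, 2));
        return 0;
}
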
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3973e2df7f87..a6bca1d33a8a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -36,11 +36,7 @@ void ack_bad_irq(unsigned int irq)
36#endif 36#endif
37} 37}
38 38
39#ifdef CONFIG_X86_32 39#define irq_stats(x) (&per_cpu(irq_stat, x))
40# define irq_stats(x) (&per_cpu(irq_stat, x))
41#else
42# define irq_stats(x) cpu_pda(x)
43#endif
44/* 40/*
45 * /proc/interrupts printing: 41 * /proc/interrupts printing:
46 */ 42 */
@@ -57,6 +53,10 @@ static int show_other_interrupts(struct seq_file *p)
57 for_each_online_cpu(j) 53 for_each_online_cpu(j)
58 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); 54 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
59 seq_printf(p, " Local timer interrupts\n"); 55 seq_printf(p, " Local timer interrupts\n");
56 seq_printf(p, "CNT: ");
57 for_each_online_cpu(j)
58 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
59 seq_printf(p, " Performance counter interrupts\n");
60#endif 60#endif
61#ifdef CONFIG_SMP 61#ifdef CONFIG_SMP
62 seq_printf(p, "RES: "); 62 seq_printf(p, "RES: ");
@@ -164,6 +164,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
164 164
165#ifdef CONFIG_X86_LOCAL_APIC 165#ifdef CONFIG_X86_LOCAL_APIC
166 sum += irq_stats(cpu)->apic_timer_irqs; 166 sum += irq_stats(cpu)->apic_timer_irqs;
167 sum += irq_stats(cpu)->apic_perf_irqs;
167#endif 168#endif
168#ifdef CONFIG_SMP 169#ifdef CONFIG_SMP
169 sum += irq_stats(cpu)->irq_resched_count; 170 sum += irq_stats(cpu)->irq_resched_count;
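
With the show_other_interrupts() and arch_irq_stat_cpu() hunks above, /proc/interrupts grows a per-CPU "CNT:" row for performance counter interrupts. A small userspace reader for just that row; it assumes the label starts in column 0, as the seq_printf() above emits it:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[4096];
        FILE *f = fopen("/proc/interrupts", "r");

        if (!f) {
                perror("/proc/interrupts");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                /* the row added by show_other_interrupts() above */
                if (!strncmp(line, "CNT:", 4))
                        fputs(line, stdout);
        }
        fclose(f);
        return 0;
}
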
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 74b9ff7341e9..e0f29be8ab0b 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -248,7 +248,7 @@ void fixup_irqs(void)
248 if (irq == 2) 248 if (irq == 2)
249 continue; 249 continue;
250 250
251 affinity = &desc->affinity; 251 affinity = desc->affinity;
252 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { 252 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
253 printk("Breaking affinity for irq %i\n", irq); 253 printk("Breaking affinity for irq %i\n", irq);
254 affinity = cpu_all_mask; 254 affinity = cpu_all_mask;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 63c88e6ec025..018963aa6ee3 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,6 +18,13 @@
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <asm/io_apic.h> 19#include <asm/io_apic.h>
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/apic.h>
22
23DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
24EXPORT_PER_CPU_SYMBOL(irq_stat);
25
26DEFINE_PER_CPU(struct pt_regs *, irq_regs);
27EXPORT_PER_CPU_SYMBOL(irq_regs);
21 28
22/* 29/*
23 * Probabilistic stack overflow check: 30 * Probabilistic stack overflow check:
@@ -100,7 +107,7 @@ void fixup_irqs(void)
100 /* interrupts are disabled at this point */ 107 /* interrupts are disabled at this point */
101 spin_lock(&desc->lock); 108 spin_lock(&desc->lock);
102 109
103 affinity = &desc->affinity; 110 affinity = desc->affinity;
104 if (!irq_has_action(irq) || 111 if (!irq_has_action(irq) ||
105 cpumask_equal(affinity, cpu_online_mask)) { 112 cpumask_equal(affinity, cpu_online_mask)) {
106 spin_unlock(&desc->lock); 113 spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 10a09c2f1828..f6ff71cdaba8 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -111,28 +111,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
111 return 0; 111 return 0;
112} 112}
113 113
114/* Overridden in paravirt.c */ 114static void __init smp_intr_init(void)
115void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
116
117void __init native_init_IRQ(void)
118{ 115{
119 int i;
120
121 /* all the set up before the call gates are initialised */
122 pre_intr_init_hook();
123
124 /*
125 * Cover the whole vector space, no vector can escape
126 * us. (some of these will be overridden and become
127 * 'special' SMP interrupts)
128 */
129 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
130 /* SYSCALL_VECTOR was reserved in trap_init. */
131 if (i != SYSCALL_VECTOR)
132 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
133 }
134
135
136#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) 116#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
137 /* 117 /*
138 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 118 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -140,8 +120,15 @@ void __init native_init_IRQ(void)
140 */ 120 */
141 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 121 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
142 122
143 /* IPI for invalidation */ 123 /* IPIs for invalidation */
144 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); 124 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
125 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
126 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
127 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
128 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
129 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
130 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
131 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
145 132
146 /* IPI for generic function call */ 133 /* IPI for generic function call */
147 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 134 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -154,6 +141,11 @@ void __init native_init_IRQ(void)
154 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 141 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
155 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); 142 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
156#endif 143#endif
144}
145
146static void __init apic_intr_init(void)
147{
148 smp_intr_init();
157 149
158#ifdef CONFIG_X86_LOCAL_APIC 150#ifdef CONFIG_X86_LOCAL_APIC
159 /* self generated IPI for local APIC timer */ 151 /* self generated IPI for local APIC timer */
@@ -162,12 +154,40 @@ void __init native_init_IRQ(void)
162 /* IPI vectors for APIC spurious and error interrupts */ 154 /* IPI vectors for APIC spurious and error interrupts */
163 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 155 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
164 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 156 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
165#endif 157# ifdef CONFIG_PERF_COUNTERS
158 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
159# endif
166 160
167#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 161# ifdef CONFIG_X86_MCE_P4THERMAL
168 /* thermal monitor LVT interrupt */ 162 /* thermal monitor LVT interrupt */
169 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 163 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
164# endif
170#endif 165#endif
166}
167
168/* Overridden in paravirt.c */
169void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
170
171void __init native_init_IRQ(void)
172{
173 int i;
174
175 /* all the set up before the call gates are initialised */
176 pre_intr_init_hook();
177
178 apic_intr_init();
179
180 /*
181 * Cover the whole vector space, no vector can escape
182 * us. (some of these will be overridden and become
183 * 'special' SMP interrupts)
184 */
185 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
186 int vector = FIRST_EXTERNAL_VECTOR + i;
187 /* SYSCALL_VECTOR was reserved in trap_init. */
188 if (!test_bit(vector, used_vectors))
189 set_intr_gate(vector, interrupt[i]);
190 }
171 191
172 /* setup after call gates are initialised (usually add in 192 /* setup after call gates are initialised (usually add in
173 * the architecture specific gates) 193 * the architecture specific gates)
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f30..16e1fc687504 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -150,6 +150,11 @@ static void __init apic_intr_init(void)
150 /* IPI vectors for APIC spurious and error interrupts */ 150 /* IPI vectors for APIC spurious and error interrupts */
151 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 151 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
152 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 152 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
153
154 /* Performance monitoring interrupt: */
155#ifdef CONFIG_PERF_COUNTERS
156 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
157#endif
153} 158}
154 159
155void __init native_init_IRQ(void) 160void __init native_init_IRQ(void)
@@ -157,6 +162,9 @@ void __init native_init_IRQ(void)
157 int i; 162 int i;
158 163
159 init_ISA_irqs(); 164 init_ISA_irqs();
165
166 apic_intr_init();
167
160 /* 168 /*
161 * Cover the whole vector space, no vector can escape 169 * Cover the whole vector space, no vector can escape
162 * us. (some of these will be overridden and become 170 * us. (some of these will be overridden and become
@@ -164,12 +172,10 @@ void __init native_init_IRQ(void)
164 */ 172 */
165 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { 173 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
166 int vector = FIRST_EXTERNAL_VECTOR + i; 174 int vector = FIRST_EXTERNAL_VECTOR + i;
167 if (vector != IA32_SYSCALL_VECTOR) 175 if (!test_bit(vector, used_vectors))
168 set_intr_gate(vector, interrupt[i]); 176 set_intr_gate(vector, interrupt[i]);
169 } 177 }
170 178
171 apic_intr_init();
172
173 if (!acpi_ioapic) 179 if (!acpi_ioapic)
174 setup_irq(2, &irq2); 180 setup_irq(2, &irq2);
175} 181}
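
Both the 32-bit and 64-bit native_init_IRQ() now walk the external vector range and install a default gate only where the used_vectors bitmap has no claim, instead of special-casing a single syscall vector. A userspace sketch of that selection loop over a plain bitmap; FIRST_EXTERNAL_VECTOR (0x20) and the pre-claimed vectors below are assumed sample values, not taken from this patch:

#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR 0x20      /* assumed x86 value */
#define NR_VECTORS            256

static unsigned char used[NR_VECTORS / 8];

static void set_used(int v)  { used[v / 8] |= 1u << (v % 8); }
static int  test_used(int v) { return used[v / 8] & (1u << (v % 8)); }

int main(void)
{
        int i, installed = 0;

        /* pretend trap_init()/apic_intr_init() already claimed two vectors */
        set_used(0x80);
        set_used(0xef);

        for (i = 0; i < NR_VECTORS - FIRST_EXTERNAL_VECTOR; i++) {
                int vector = FIRST_EXTERNAL_VECTOR + i;

                if (!test_used(vector))
                        installed++;    /* kernel calls set_intr_gate(vector, interrupt[i]) here */
        }
        printf("%d of %d external vectors get the default gate\n",
               installed, NR_VECTORS - FIRST_EXTERNAL_VECTOR);
        return 0;
}
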
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index b7f4c929e615..5e9f4fc51385 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -87,9 +87,9 @@
87#include <linux/cpu.h> 87#include <linux/cpu.h>
88#include <linux/firmware.h> 88#include <linux/firmware.h>
89#include <linux/platform_device.h> 89#include <linux/platform_device.h>
90#include <linux/uaccess.h>
90 91
91#include <asm/msr.h> 92#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h> 93#include <asm/processor.h>
94#include <asm/microcode.h> 94#include <asm/microcode.h>
95 95
@@ -196,7 +196,7 @@ static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
196 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; 196 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
197} 197}
198 198
199static inline int 199static inline int
200update_match_revision(struct microcode_header_intel *mc_header, int rev) 200update_match_revision(struct microcode_header_intel *mc_header, int rev)
201{ 201{
202 return (mc_header->rev <= rev) ? 0 : 1; 202 return (mc_header->rev <= rev) ? 0 : 1;
@@ -442,8 +442,8 @@ static int request_microcode_fw(int cpu, struct device *device)
442 return ret; 442 return ret;
443 } 443 }
444 444
445 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, 445 ret = generic_load_microcode(cpu, (void *)firmware->data,
446 &get_ucode_fw); 446 firmware->size, &get_ucode_fw);
447 447
448 release_firmware(firmware); 448 release_firmware(firmware);
449 449
@@ -460,7 +460,7 @@ static int request_microcode_user(int cpu, const void __user *buf, size_t size)
460 /* We should bind the task to the CPU */ 460 /* We should bind the task to the CPU */
461 BUG_ON(cpu != raw_smp_processor_id()); 461 BUG_ON(cpu != raw_smp_processor_id());
462 462
463 return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); 463 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
464} 464}
465 465
466static void microcode_fini_cpu(int cpu) 466static void microcode_fini_cpu(int cpu)
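
The microcode_intel.c hunks are style cleanups; the update policy itself is unchanged: update_match_revision() only accepts a blob whose header revision is strictly newer than the revision already loaded. A standalone restatement with plain ints (the real function takes a struct microcode_header_intel pointer):

#include <stdio.h>

/* only strictly newer revisions are applied */
static int update_match_revision(int header_rev, int current_rev)
{
        return (header_rev <= current_rev) ? 0 : 1;
}

int main(void)
{
        printf("same rev -> %d, newer rev -> %d\n",
               update_match_revision(0x17, 0x17),
               update_match_revision(0x18, 0x17));
        return 0;
}
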
diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c
index 3db0a5442eb1..0edd819050e7 100644
--- a/arch/x86/kernel/module_32.c
+++ b/arch/x86/kernel/module_32.c
@@ -42,7 +42,7 @@ void module_free(struct module *mod, void *module_region)
42{ 42{
43 vfree(module_region); 43 vfree(module_region);
44 /* FIXME: If module_region == mod->init_region, trim exception 44 /* FIXME: If module_region == mod->init_region, trim exception
45 table entries. */ 45 table entries. */
46} 46}
47 47
48/* We don't need anything special. */ 48/* We don't need anything special. */
@@ -113,13 +113,13 @@ int module_finalize(const Elf_Ehdr *hdr,
113 *para = NULL; 113 *para = NULL;
114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
115 115
116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
117 if (!strcmp(".text", secstrings + s->sh_name)) 117 if (!strcmp(".text", secstrings + s->sh_name))
118 text = s; 118 text = s;
119 if (!strcmp(".altinstructions", secstrings + s->sh_name)) 119 if (!strcmp(".altinstructions", secstrings + s->sh_name))
120 alt = s; 120 alt = s;
121 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
122 locks= s; 122 locks = s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name)) 123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s; 124 para = s;
125 } 125 }
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index 6ba87830d4b1..c23880b90b5c 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -30,14 +30,14 @@
30#include <asm/page.h> 30#include <asm/page.h>
31#include <asm/pgtable.h> 31#include <asm/pgtable.h>
32 32
33#define DEBUGP(fmt...) 33#define DEBUGP(fmt...)
34 34
35#ifndef CONFIG_UML 35#ifndef CONFIG_UML
36void module_free(struct module *mod, void *module_region) 36void module_free(struct module *mod, void *module_region)
37{ 37{
38 vfree(module_region); 38 vfree(module_region);
39 /* FIXME: If module_region == mod->init_region, trim exception 39 /* FIXME: If module_region == mod->init_region, trim exception
40 table entries. */ 40 table entries. */
41} 41}
42 42
43void *module_alloc(unsigned long size) 43void *module_alloc(unsigned long size)
@@ -77,7 +77,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
77 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; 77 Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
78 Elf64_Sym *sym; 78 Elf64_Sym *sym;
79 void *loc; 79 void *loc;
80 u64 val; 80 u64 val;
81 81
82 DEBUGP("Applying relocate section %u to %u\n", relsec, 82 DEBUGP("Applying relocate section %u to %u\n", relsec,
83 sechdrs[relsec].sh_info); 83 sechdrs[relsec].sh_info);
@@ -91,11 +91,11 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
91 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr 91 sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
92 + ELF64_R_SYM(rel[i].r_info); 92 + ELF64_R_SYM(rel[i].r_info);
93 93
94 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", 94 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
95 (int)ELF64_R_TYPE(rel[i].r_info), 95 (int)ELF64_R_TYPE(rel[i].r_info),
96 sym->st_value, rel[i].r_addend, (u64)loc); 96 sym->st_value, rel[i].r_addend, (u64)loc);
97 97
98 val = sym->st_value + rel[i].r_addend; 98 val = sym->st_value + rel[i].r_addend;
99 99
100 switch (ELF64_R_TYPE(rel[i].r_info)) { 100 switch (ELF64_R_TYPE(rel[i].r_info)) {
101 case R_X86_64_NONE: 101 case R_X86_64_NONE:
@@ -113,16 +113,16 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
113 if ((s64)val != *(s32 *)loc) 113 if ((s64)val != *(s32 *)loc)
114 goto overflow; 114 goto overflow;
115 break; 115 break;
116 case R_X86_64_PC32: 116 case R_X86_64_PC32:
117 val -= (u64)loc; 117 val -= (u64)loc;
118 *(u32 *)loc = val; 118 *(u32 *)loc = val;
119#if 0 119#if 0
120 if ((s64)val != *(s32 *)loc) 120 if ((s64)val != *(s32 *)loc)
121 goto overflow; 121 goto overflow;
122#endif 122#endif
123 break; 123 break;
124 default: 124 default:
125 printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", 125 printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
126 me->name, ELF64_R_TYPE(rel[i].r_info)); 126 me->name, ELF64_R_TYPE(rel[i].r_info));
127 return -ENOEXEC; 127 return -ENOEXEC;
128 } 128 }
@@ -130,7 +130,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
130 return 0; 130 return 0;
131 131
132overflow: 132overflow:
133 printk(KERN_ERR "overflow in relocation type %d val %Lx\n", 133 printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
134 (int)ELF64_R_TYPE(rel[i].r_info), val); 134 (int)ELF64_R_TYPE(rel[i].r_info), val);
135 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", 135 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
136 me->name); 136 me->name);
@@ -143,13 +143,13 @@ int apply_relocate(Elf_Shdr *sechdrs,
143 unsigned int relsec, 143 unsigned int relsec,
144 struct module *me) 144 struct module *me)
145{ 145{
146 printk("non add relocation not supported\n"); 146 printk(KERN_ERR "non add relocation not supported\n");
147 return -ENOSYS; 147 return -ENOSYS;
148} 148}
149 149
150int module_finalize(const Elf_Ehdr *hdr, 150int module_finalize(const Elf_Ehdr *hdr,
151 const Elf_Shdr *sechdrs, 151 const Elf_Shdr *sechdrs,
152 struct module *me) 152 struct module *me)
153{ 153{
154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, 154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
155 *para = NULL; 155 *para = NULL;
@@ -161,7 +161,7 @@ int module_finalize(const Elf_Ehdr *hdr,
161 if (!strcmp(".altinstructions", secstrings + s->sh_name)) 161 if (!strcmp(".altinstructions", secstrings + s->sh_name))
162 alt = s; 162 alt = s;
163 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 163 if (!strcmp(".smp_locks", secstrings + s->sh_name))
164 locks= s; 164 locks = s;
165 if (!strcmp(".parainstructions", secstrings + s->sh_name)) 165 if (!strcmp(".parainstructions", secstrings + s->sh_name))
166 para = s; 166 para = s;
167 } 167 }
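
Behind the printk and whitespace fixes, the relocation cases in apply_relocate_add() come down to two arithmetic checks: R_X86_64_32S stores sym + addend and must survive a signed 32-bit truncation (which is what -mcmodel=kernel guarantees), while R_X86_64_PC32 stores the displacement from the relocation site. A userspace illustration with made-up sample addresses:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t sym = 0xffffffff81000000ULL;   /* made-up kernel-range symbol */
        int64_t  add = 0x10;
        uint64_t loc = 0xffffffffa0000123ULL;   /* made-up relocation site */
        uint64_t val = sym + add;

        /* R_X86_64_32S: value must round-trip through a signed 32-bit field */
        int32_t s32 = (int32_t)val;
        printf("32S fits: %s\n",
               ((int64_t)val == (int64_t)s32) ? "yes" : "no (overflow)");

        /* R_X86_64_PC32: store the PC-relative displacement */
        printf("PC32 stores 0x%08x\n", (uint32_t)(val - loc));
        return 0;
}
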
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index a649a4ccad43..fa6bb263892e 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -144,11 +144,11 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
144 if (bad_ioapic(m->apicaddr)) 144 if (bad_ioapic(m->apicaddr))
145 return; 145 return;
146 146
147 mp_ioapics[nr_ioapics].mp_apicaddr = m->apicaddr; 147 mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
148 mp_ioapics[nr_ioapics].mp_apicid = m->apicid; 148 mp_ioapics[nr_ioapics].apicid = m->apicid;
149 mp_ioapics[nr_ioapics].mp_type = m->type; 149 mp_ioapics[nr_ioapics].type = m->type;
150 mp_ioapics[nr_ioapics].mp_apicver = m->apicver; 150 mp_ioapics[nr_ioapics].apicver = m->apicver;
151 mp_ioapics[nr_ioapics].mp_flags = m->flags; 151 mp_ioapics[nr_ioapics].flags = m->flags;
152 nr_ioapics++; 152 nr_ioapics++;
153} 153}
154 154
@@ -160,55 +160,55 @@ static void print_MP_intsrc_info(struct mpc_intsrc *m)
160 m->srcbusirq, m->dstapic, m->dstirq); 160 m->srcbusirq, m->dstapic, m->dstirq);
161} 161}
162 162
163static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) 163static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
164{ 164{
165 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," 165 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
166 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 166 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
167 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, 167 mp_irq->irqtype, mp_irq->irqflag & 3,
168 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, 168 (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
169 mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq); 169 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
170} 170}
171 171
172static void __init assign_to_mp_irq(struct mpc_intsrc *m, 172static void __init assign_to_mp_irq(struct mpc_intsrc *m,
173 struct mp_config_intsrc *mp_irq) 173 struct mpc_intsrc *mp_irq)
174{ 174{
175 mp_irq->mp_dstapic = m->dstapic; 175 mp_irq->dstapic = m->dstapic;
176 mp_irq->mp_type = m->type; 176 mp_irq->type = m->type;
177 mp_irq->mp_irqtype = m->irqtype; 177 mp_irq->irqtype = m->irqtype;
178 mp_irq->mp_irqflag = m->irqflag; 178 mp_irq->irqflag = m->irqflag;
179 mp_irq->mp_srcbus = m->srcbus; 179 mp_irq->srcbus = m->srcbus;
180 mp_irq->mp_srcbusirq = m->srcbusirq; 180 mp_irq->srcbusirq = m->srcbusirq;
181 mp_irq->mp_dstirq = m->dstirq; 181 mp_irq->dstirq = m->dstirq;
182} 182}
183 183
184static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq, 184static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
185 struct mpc_intsrc *m) 185 struct mpc_intsrc *m)
186{ 186{
187 m->dstapic = mp_irq->mp_dstapic; 187 m->dstapic = mp_irq->dstapic;
188 m->type = mp_irq->mp_type; 188 m->type = mp_irq->type;
189 m->irqtype = mp_irq->mp_irqtype; 189 m->irqtype = mp_irq->irqtype;
190 m->irqflag = mp_irq->mp_irqflag; 190 m->irqflag = mp_irq->irqflag;
191 m->srcbus = mp_irq->mp_srcbus; 191 m->srcbus = mp_irq->srcbus;
192 m->srcbusirq = mp_irq->mp_srcbusirq; 192 m->srcbusirq = mp_irq->srcbusirq;
193 m->dstirq = mp_irq->mp_dstirq; 193 m->dstirq = mp_irq->dstirq;
194} 194}
195 195
196static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq, 196static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
197 struct mpc_intsrc *m) 197 struct mpc_intsrc *m)
198{ 198{
199 if (mp_irq->mp_dstapic != m->dstapic) 199 if (mp_irq->dstapic != m->dstapic)
200 return 1; 200 return 1;
201 if (mp_irq->mp_type != m->type) 201 if (mp_irq->type != m->type)
202 return 2; 202 return 2;
203 if (mp_irq->mp_irqtype != m->irqtype) 203 if (mp_irq->irqtype != m->irqtype)
204 return 3; 204 return 3;
205 if (mp_irq->mp_irqflag != m->irqflag) 205 if (mp_irq->irqflag != m->irqflag)
206 return 4; 206 return 4;
207 if (mp_irq->mp_srcbus != m->srcbus) 207 if (mp_irq->srcbus != m->srcbus)
208 return 5; 208 return 5;
209 if (mp_irq->mp_srcbusirq != m->srcbusirq) 209 if (mp_irq->srcbusirq != m->srcbusirq)
210 return 6; 210 return 6;
211 if (mp_irq->mp_dstirq != m->dstirq) 211 if (mp_irq->dstirq != m->dstirq)
212 return 7; 212 return 7;
213 213
214 return 0; 214 return 0;
@@ -417,7 +417,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
417 intsrc.type = MP_INTSRC; 417 intsrc.type = MP_INTSRC;
418 intsrc.irqflag = 0; /* conforming */ 418 intsrc.irqflag = 0; /* conforming */
419 intsrc.srcbus = 0; 419 intsrc.srcbus = 0;
420 intsrc.dstapic = mp_ioapics[0].mp_apicid; 420 intsrc.dstapic = mp_ioapics[0].apicid;
421 421
422 intsrc.irqtype = mp_INT; 422 intsrc.irqtype = mp_INT;
423 423
@@ -570,14 +570,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
570 } 570 }
571} 571}
572 572
573static struct intel_mp_floating *mpf_found; 573static struct mpf_intel *mpf_found;
574 574
575/* 575/*
576 * Scan the memory blocks for an SMP configuration block. 576 * Scan the memory blocks for an SMP configuration block.
577 */ 577 */
578static void __init __get_smp_config(unsigned int early) 578static void __init __get_smp_config(unsigned int early)
579{ 579{
580 struct intel_mp_floating *mpf = mpf_found; 580 struct mpf_intel *mpf = mpf_found;
581 581
582 if (!mpf) 582 if (!mpf)
583 return; 583 return;
@@ -598,9 +598,9 @@ static void __init __get_smp_config(unsigned int early)
598 } 598 }
599 599
600 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 600 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
601 mpf->mpf_specification); 601 mpf->specification);
602#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) 602#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
603 if (mpf->mpf_feature2 & (1 << 7)) { 603 if (mpf->feature2 & (1 << 7)) {
604 printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); 604 printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
605 pic_mode = 1; 605 pic_mode = 1;
606 } else { 606 } else {
@@ -611,7 +611,7 @@ static void __init __get_smp_config(unsigned int early)
611 /* 611 /*
612 * Now see if we need to read further. 612 * Now see if we need to read further.
613 */ 613 */
614 if (mpf->mpf_feature1 != 0) { 614 if (mpf->feature1 != 0) {
615 if (early) { 615 if (early) {
616 /* 616 /*
617 * local APIC has default address 617 * local APIC has default address
@@ -621,16 +621,16 @@ static void __init __get_smp_config(unsigned int early)
621 } 621 }
622 622
623 printk(KERN_INFO "Default MP configuration #%d\n", 623 printk(KERN_INFO "Default MP configuration #%d\n",
624 mpf->mpf_feature1); 624 mpf->feature1);
625 construct_default_ISA_mptable(mpf->mpf_feature1); 625 construct_default_ISA_mptable(mpf->feature1);
626 626
627 } else if (mpf->mpf_physptr) { 627 } else if (mpf->physptr) {
628 628
629 /* 629 /*
630 * Read the physical hardware table. Anything here will 630 * Read the physical hardware table. Anything here will
631 * override the defaults. 631 * override the defaults.
632 */ 632 */
633 if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) { 633 if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
634#ifdef CONFIG_X86_LOCAL_APIC 634#ifdef CONFIG_X86_LOCAL_APIC
635 smp_found_config = 0; 635 smp_found_config = 0;
636#endif 636#endif
@@ -688,19 +688,19 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
688 unsigned reserve) 688 unsigned reserve)
689{ 689{
690 unsigned int *bp = phys_to_virt(base); 690 unsigned int *bp = phys_to_virt(base);
691 struct intel_mp_floating *mpf; 691 struct mpf_intel *mpf;
692 692
693 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 693 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
694 bp, length); 694 bp, length);
695 BUILD_BUG_ON(sizeof(*mpf) != 16); 695 BUILD_BUG_ON(sizeof(*mpf) != 16);
696 696
697 while (length > 0) { 697 while (length > 0) {
698 mpf = (struct intel_mp_floating *)bp; 698 mpf = (struct mpf_intel *)bp;
699 if ((*bp == SMP_MAGIC_IDENT) && 699 if ((*bp == SMP_MAGIC_IDENT) &&
700 (mpf->mpf_length == 1) && 700 (mpf->length == 1) &&
701 !mpf_checksum((unsigned char *)bp, 16) && 701 !mpf_checksum((unsigned char *)bp, 16) &&
702 ((mpf->mpf_specification == 1) 702 ((mpf->specification == 1)
703 || (mpf->mpf_specification == 4))) { 703 || (mpf->specification == 4))) {
704#ifdef CONFIG_X86_LOCAL_APIC 704#ifdef CONFIG_X86_LOCAL_APIC
705 smp_found_config = 1; 705 smp_found_config = 1;
706#endif 706#endif
@@ -713,7 +713,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
713 return 1; 713 return 1;
714 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, 714 reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
715 BOOTMEM_DEFAULT); 715 BOOTMEM_DEFAULT);
716 if (mpf->mpf_physptr) { 716 if (mpf->physptr) {
717 unsigned long size = PAGE_SIZE; 717 unsigned long size = PAGE_SIZE;
718#ifdef CONFIG_X86_32 718#ifdef CONFIG_X86_32
719 /* 719 /*
@@ -722,14 +722,14 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
722 * the bottom is mapped now. 722 * the bottom is mapped now.
723 * PC-9800's MPC table places on the very last 723 * PC-9800's MPC table places on the very last
724 * of physical memory; so that simply reserving 724 * of physical memory; so that simply reserving
725 * PAGE_SIZE from mpg->mpf_physptr yields BUG() 725 * PAGE_SIZE from mpf->physptr yields BUG()
726 * in reserve_bootmem. 726 * in reserve_bootmem.
727 */ 727 */
728 unsigned long end = max_low_pfn * PAGE_SIZE; 728 unsigned long end = max_low_pfn * PAGE_SIZE;
729 if (mpf->mpf_physptr + size > end) 729 if (mpf->physptr + size > end)
730 size = end - mpf->mpf_physptr; 730 size = end - mpf->physptr;
731#endif 731#endif
732 reserve_bootmem_generic(mpf->mpf_physptr, size, 732 reserve_bootmem_generic(mpf->physptr, size,
733 BOOTMEM_DEFAULT); 733 BOOTMEM_DEFAULT);
734 } 734 }
735 735
@@ -809,15 +809,15 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
809 /* not legacy */ 809 /* not legacy */
810 810
811 for (i = 0; i < mp_irq_entries; i++) { 811 for (i = 0; i < mp_irq_entries; i++) {
812 if (mp_irqs[i].mp_irqtype != mp_INT) 812 if (mp_irqs[i].irqtype != mp_INT)
813 continue; 813 continue;
814 814
815 if (mp_irqs[i].mp_irqflag != 0x0f) 815 if (mp_irqs[i].irqflag != 0x0f)
816 continue; 816 continue;
817 817
818 if (mp_irqs[i].mp_srcbus != m->srcbus) 818 if (mp_irqs[i].srcbus != m->srcbus)
819 continue; 819 continue;
820 if (mp_irqs[i].mp_srcbusirq != m->srcbusirq) 820 if (mp_irqs[i].srcbusirq != m->srcbusirq)
821 continue; 821 continue;
822 if (irq_used[i]) { 822 if (irq_used[i]) {
823 /* already claimed */ 823 /* already claimed */
@@ -922,10 +922,10 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
922 if (irq_used[i]) 922 if (irq_used[i])
923 continue; 923 continue;
924 924
925 if (mp_irqs[i].mp_irqtype != mp_INT) 925 if (mp_irqs[i].irqtype != mp_INT)
926 continue; 926 continue;
927 927
928 if (mp_irqs[i].mp_irqflag != 0x0f) 928 if (mp_irqs[i].irqflag != 0x0f)
929 continue; 929 continue;
930 930
931 if (nr_m_spare > 0) { 931 if (nr_m_spare > 0) {
@@ -1001,7 +1001,7 @@ static int __init update_mp_table(void)
1001{ 1001{
1002 char str[16]; 1002 char str[16];
1003 char oem[10]; 1003 char oem[10];
1004 struct intel_mp_floating *mpf; 1004 struct mpf_intel *mpf;
1005 struct mpc_table *mpc, *mpc_new; 1005 struct mpc_table *mpc, *mpc_new;
1006 1006
1007 if (!enable_update_mptable) 1007 if (!enable_update_mptable)
@@ -1014,19 +1014,19 @@ static int __init update_mp_table(void)
1014 /* 1014 /*
1015 * Now see if we need to go further. 1015 * Now see if we need to go further.
1016 */ 1016 */
1017 if (mpf->mpf_feature1 != 0) 1017 if (mpf->feature1 != 0)
1018 return 0; 1018 return 0;
1019 1019
1020 if (!mpf->mpf_physptr) 1020 if (!mpf->physptr)
1021 return 0; 1021 return 0;
1022 1022
1023 mpc = phys_to_virt(mpf->mpf_physptr); 1023 mpc = phys_to_virt(mpf->physptr);
1024 1024
1025 if (!smp_check_mpc(mpc, oem, str)) 1025 if (!smp_check_mpc(mpc, oem, str))
1026 return 0; 1026 return 0;
1027 1027
1028 printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf)); 1028 printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
1029 printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr); 1029 printk(KERN_INFO "physptr: %x\n", mpf->physptr);
1030 1030
1031 if (mpc_new_phys && mpc->length > mpc_new_length) { 1031 if (mpc_new_phys && mpc->length > mpc_new_length) {
1032 mpc_new_phys = 0; 1032 mpc_new_phys = 0;
@@ -1047,23 +1047,23 @@ static int __init update_mp_table(void)
1047 } 1047 }
1048 printk(KERN_INFO "use in-positon replacing\n"); 1048 printk(KERN_INFO "use in-positon replacing\n");
1049 } else { 1049 } else {
1050 mpf->mpf_physptr = mpc_new_phys; 1050 mpf->physptr = mpc_new_phys;
1051 mpc_new = phys_to_virt(mpc_new_phys); 1051 mpc_new = phys_to_virt(mpc_new_phys);
1052 memcpy(mpc_new, mpc, mpc->length); 1052 memcpy(mpc_new, mpc, mpc->length);
1053 mpc = mpc_new; 1053 mpc = mpc_new;
1054 /* check if we can modify that */ 1054 /* check if we can modify that */
1055 if (mpc_new_phys - mpf->mpf_physptr) { 1055 if (mpc_new_phys - mpf->physptr) {
1056 struct intel_mp_floating *mpf_new; 1056 struct mpf_intel *mpf_new;
1057 /* steal 16 bytes from [0, 1k) */ 1057 /* steal 16 bytes from [0, 1k) */
1058 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); 1058 printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
1059 mpf_new = phys_to_virt(0x400 - 16); 1059 mpf_new = phys_to_virt(0x400 - 16);
1060 memcpy(mpf_new, mpf, 16); 1060 memcpy(mpf_new, mpf, 16);
1061 mpf = mpf_new; 1061 mpf = mpf_new;
1062 mpf->mpf_physptr = mpc_new_phys; 1062 mpf->physptr = mpc_new_phys;
1063 } 1063 }
1064 mpf->mpf_checksum = 0; 1064 mpf->checksum = 0;
1065 mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16); 1065 mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
1066 printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr); 1066 printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
1067 } 1067 }
1068 1068
1069 /* 1069 /*
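
The mpparse.c changes rename intel_mp_floating to mpf_intel and drop the mpf_ field prefixes, but the on-disk contract is untouched: the floating pointer structure is 16 bytes, and its checksum byte is chosen so that the byte sum of the whole structure is zero, which is what the scan in smp_scan_config() checks and what update_mp_table() re-establishes after rewriting physptr. A standalone sketch of that scheme; the struct below is a simplified stand-in, not the real mpf_intel layout:

#include <stdio.h>
#include <string.h>

/* simplified 16-byte stand-in for the MP floating pointer structure */
struct mpf_demo {
        char          signature[4];     /* "_MP_" */
        unsigned int  physptr;
        unsigned char length;           /* in 16-byte units, i.e. 1 */
        unsigned char specification;    /* 1 or 4 */
        unsigned char checksum;
        unsigned char feature1, feature2, pad[3];
};

static unsigned char byte_sum(const void *p, int len)
{
        const unsigned char *b = p;
        unsigned char sum = 0;

        while (len--)
                sum += *b++;
        return sum;
}

int main(void)
{
        struct mpf_demo mpf;

        memset(&mpf, 0, sizeof(mpf));
        memcpy(mpf.signature, "_MP_", 4);
        mpf.length = 1;
        mpf.specification = 4;

        /* same trick as update_mp_table(): make the byte sum come out to zero */
        mpf.checksum = 0;
        mpf.checksum -= byte_sum(&mpf, sizeof(mpf));

        printf("sizeof = %zu, byte sum = %u (0 means valid)\n",
               sizeof(mpf), byte_sum(&mpf, sizeof(mpf)));
        return 0;
}
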
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 726266695b2c..3cf3413ec626 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -35,10 +35,10 @@
35#include <linux/device.h> 35#include <linux/device.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/uaccess.h>
38 39
39#include <asm/processor.h> 40#include <asm/processor.h>
40#include <asm/msr.h> 41#include <asm/msr.h>
41#include <asm/uaccess.h>
42#include <asm/system.h> 42#include <asm/system.h>
43 43
44static struct class *msr_class; 44static struct class *msr_class;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 7228979f1e7f..23b6d9e6e4f5 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -61,11 +61,7 @@ static int endflag __initdata;
61 61
62static inline unsigned int get_nmi_count(int cpu) 62static inline unsigned int get_nmi_count(int cpu)
63{ 63{
64#ifdef CONFIG_X86_64 64 return per_cpu(irq_stat, cpu).__nmi_count;
65 return cpu_pda(cpu)->__nmi_count;
66#else
67 return nmi_count(cpu);
68#endif
69} 65}
70 66
71static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
@@ -82,12 +78,8 @@ static inline int mce_in_progress(void)
82 */ 78 */
83static inline unsigned int get_timer_irqs(int cpu) 79static inline unsigned int get_timer_irqs(int cpu)
84{ 80{
85#ifdef CONFIG_X86_64
86 return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
87#else
88 return per_cpu(irq_stat, cpu).apic_timer_irqs + 81 return per_cpu(irq_stat, cpu).apic_timer_irqs +
89 per_cpu(irq_stat, cpu).irq0_irqs; 82 per_cpu(irq_stat, cpu).irq0_irqs;
90#endif
91} 83}
92 84
93#ifdef CONFIG_SMP 85#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a546f55c77b4..1a1ae8edc40c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -66,9 +66,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
66DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 66DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
67EXPORT_PER_CPU_SYMBOL(current_task); 67EXPORT_PER_CPU_SYMBOL(current_task);
68 68
69DEFINE_PER_CPU(int, cpu_number);
70EXPORT_PER_CPU_SYMBOL(cpu_number);
71
72/* 69/*
73 * Return saved PC of a blocked thread. 70 * Return saved PC of a blocked thread.
74 */ 71 */
@@ -111,7 +108,6 @@ void cpu_idle(void)
111 play_dead(); 108 play_dead();
112 109
113 local_irq_disable(); 110 local_irq_disable();
114 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
115 /* Don't trace irqs off for idle */ 111 /* Don't trace irqs off for idle */
116 stop_critical_timings(); 112 stop_critical_timings();
117 pm_idle(); 113 pm_idle();
@@ -591,7 +587,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
591 if (prev->gs | next->gs) 587 if (prev->gs | next->gs)
592 loadsegment(gs, next->gs); 588 loadsegment(gs, next->gs);
593 589
594 x86_write_percpu(current_task, next_p); 590 percpu_write(current_task, next_p);
595 591
596 return prev_p; 592 return prev_p;
597} 593}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 85b4cb5c1980..8eb169e45584 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -16,6 +16,7 @@
16 16
17#include <stdarg.h> 17#include <stdarg.h>
18 18
19#include <linux/stackprotector.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/errno.h> 21#include <linux/errno.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
@@ -47,7 +48,6 @@
47#include <asm/processor.h> 48#include <asm/processor.h>
48#include <asm/i387.h> 49#include <asm/i387.h>
49#include <asm/mmu_context.h> 50#include <asm/mmu_context.h>
50#include <asm/pda.h>
51#include <asm/prctl.h> 51#include <asm/prctl.h>
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/proto.h> 53#include <asm/proto.h>
@@ -58,6 +58,12 @@
58 58
59asmlinkage extern void ret_from_fork(void); 59asmlinkage extern void ret_from_fork(void);
60 60
61DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
62EXPORT_PER_CPU_SYMBOL(current_task);
63
64DEFINE_PER_CPU(unsigned long, old_rsp);
65static DEFINE_PER_CPU(unsigned char, is_idle);
66
61unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; 67unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
62 68
63static ATOMIC_NOTIFIER_HEAD(idle_notifier); 69static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -76,13 +82,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
76 82
77void enter_idle(void) 83void enter_idle(void)
78{ 84{
79 write_pda(isidle, 1); 85 percpu_write(is_idle, 1);
80 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); 86 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
81} 87}
82 88
83static void __exit_idle(void) 89static void __exit_idle(void)
84{ 90{
85 if (test_and_clear_bit_pda(0, isidle) == 0) 91 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
86 return; 92 return;
87 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); 93 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
88} 94}
@@ -112,6 +118,17 @@ static inline void play_dead(void)
112void cpu_idle(void) 118void cpu_idle(void)
113{ 119{
114 current_thread_info()->status |= TS_POLLING; 120 current_thread_info()->status |= TS_POLLING;
121
122 /*
123 * If we're the non-boot CPU, nothing set the PDA stack
124 * canary up for us - and if we are the boot CPU we have
125 * a 0 stack canary. This is a good place for updating
126 * it, as we won't ever return from this function (so the
127 * invalid canaries already on the stack won't ever
128 * trigger):
129 */
130 boot_init_stack_canary();
131
115 /* endless idle loop with no priority at all */ 132 /* endless idle loop with no priority at all */
116 while (1) { 133 while (1) {
117 tick_nohz_stop_sched_tick(1); 134 tick_nohz_stop_sched_tick(1);
@@ -397,7 +414,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
397 load_gs_index(0); 414 load_gs_index(0);
398 regs->ip = new_ip; 415 regs->ip = new_ip;
399 regs->sp = new_sp; 416 regs->sp = new_sp;
400 write_pda(oldrsp, new_sp); 417 percpu_write(old_rsp, new_sp);
401 regs->cs = __USER_CS; 418 regs->cs = __USER_CS;
402 regs->ss = __USER_DS; 419 regs->ss = __USER_DS;
403 regs->flags = 0x200; 420 regs->flags = 0x200;
@@ -618,21 +635,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
618 /* 635 /*
619 * Switch the PDA and FPU contexts. 636 * Switch the PDA and FPU contexts.
620 */ 637 */
621 prev->usersp = read_pda(oldrsp); 638 prev->usersp = percpu_read(old_rsp);
622 write_pda(oldrsp, next->usersp); 639 percpu_write(old_rsp, next->usersp);
623 write_pda(pcurrent, next_p); 640 percpu_write(current_task, next_p);
624 641
625 write_pda(kernelstack, 642 percpu_write(kernel_stack,
626 (unsigned long)task_stack_page(next_p) + 643 (unsigned long)task_stack_page(next_p) +
627 THREAD_SIZE - PDA_STACKOFFSET); 644 THREAD_SIZE - KERNEL_STACK_OFFSET);
628#ifdef CONFIG_CC_STACKPROTECTOR
629 write_pda(stack_canary, next_p->stack_canary);
630 /*
631 * Build time only check to make sure the stack_canary is at
632 * offset 40 in the pda; this is a gcc ABI requirement
633 */
634 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
635#endif
636 645
637 /* 646 /*
638 * Now maybe reload the debug registers and handle I/O bitmaps 647 * Now maybe reload the debug registers and handle I/O bitmaps
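
The process_64.c hunks convert the context-switch path from PDA accessors to the per-CPU accessors this series introduces: each former pda field becomes a DEFINE_PER_CPU variable, read_pda()/write_pda() become percpu_read()/percpu_write(), and the gcc-ABI stack-canary special case drops out of __switch_to(). A condensed kernel-style sketch of the pattern, not a complete function, just the shape of the conversion as it appears in the hunks above:

/* before: fields lived in struct x8664_pda          after: ordinary per-CPU variables */
DEFINE_PER_CPU(unsigned long, old_rsp);                          /* was pda->oldrsp    */
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; /* was pda->pcurrent  */

/* inside __switch_to(): */
prev->usersp = percpu_read(old_rsp);                 /* was read_pda(oldrsp)           */
percpu_write(old_rsp, next->usersp);                 /* was write_pda(oldrsp, ...)     */
percpu_write(current_task, next_p);                  /* was write_pda(pcurrent, ...)   */
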
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2b46eb41643b..f8536fee5c12 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -14,6 +14,7 @@
14#include <asm/reboot.h> 14#include <asm/reboot.h>
15#include <asm/pci_x86.h> 15#include <asm/pci_x86.h>
16#include <asm/virtext.h> 16#include <asm/virtext.h>
17#include <asm/cpu.h>
17 18
18#ifdef CONFIG_X86_32 19#ifdef CONFIG_X86_32
19# include <linux/dmi.h> 20# include <linux/dmi.h>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c461f6d69074..d5d6693b706d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -89,7 +89,7 @@
89 89
90#include <asm/system.h> 90#include <asm/system.h>
91#include <asm/vsyscall.h> 91#include <asm/vsyscall.h>
92#include <asm/smp.h> 92#include <asm/cpu.h>
93#include <asm/desc.h> 93#include <asm/desc.h>
94#include <asm/dma.h> 94#include <asm/dma.h>
95#include <asm/iommu.h> 95#include <asm/iommu.h>
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 01161077a49c..e553803cd2db 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -13,6 +13,23 @@
13#include <asm/mpspec.h> 13#include <asm/mpspec.h>
14#include <asm/apicdef.h> 14#include <asm/apicdef.h>
15#include <asm/highmem.h> 15#include <asm/highmem.h>
16#include <asm/proto.h>
17#include <asm/cpumask.h>
18
19#ifdef CONFIG_DEBUG_PER_CPU_MAPS
20# define DBG(x...) printk(KERN_DEBUG x)
21#else
22# define DBG(x...)
23#endif
24
25/*
26 * Could be inside CONFIG_HAVE_SETUP_PER_CPU_AREA with other stuff but
27 * voyager wants cpu_number too.
28 */
29#ifdef CONFIG_SMP
30DEFINE_PER_CPU(int, cpu_number);
31EXPORT_PER_CPU_SYMBOL(cpu_number);
32#endif
16 33
17#ifdef CONFIG_X86_LOCAL_APIC 34#ifdef CONFIG_X86_LOCAL_APIC
18unsigned int num_processors; 35unsigned int num_processors;
@@ -26,31 +43,60 @@ unsigned int max_physical_apicid;
26physid_mask_t phys_cpu_present_map; 43physid_mask_t phys_cpu_present_map;
27#endif 44#endif
28 45
29/* map cpu index to physical APIC ID */ 46/*
47 * Map cpu index to physical APIC ID
48 */
30DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); 49DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
31DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); 50DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
32EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); 51EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
33EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); 52EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
34 53
35#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 54#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
36#define X86_64_NUMA 1 55#define X86_64_NUMA 1 /* (used later) */
56DEFINE_PER_CPU(int, node_number) = 0;
57EXPORT_PER_CPU_SYMBOL(node_number);
37 58
38/* map cpu index to node index */ 59/*
60 * Map cpu index to node index
61 */
39DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); 62DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
40EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); 63EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
41 64
42/* which logical CPUs are on which nodes */ 65/*
66 * Which logical CPUs are on which nodes
67 */
43cpumask_t *node_to_cpumask_map; 68cpumask_t *node_to_cpumask_map;
44EXPORT_SYMBOL(node_to_cpumask_map); 69EXPORT_SYMBOL(node_to_cpumask_map);
45 70
46/* setup node_to_cpumask_map */ 71/*
72 * Setup node_to_cpumask_map
73 */
47static void __init setup_node_to_cpumask_map(void); 74static void __init setup_node_to_cpumask_map(void);
48 75
49#else 76#else
50static inline void setup_node_to_cpumask_map(void) { } 77static inline void setup_node_to_cpumask_map(void) { }
51#endif 78#endif
52 79
53#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) 80#ifdef CONFIG_X86_64
81
82/* correctly size the local cpu masks */
83static void __init setup_cpu_local_masks(void)
84{
85 alloc_bootmem_cpumask_var(&cpu_initialized_mask);
86 alloc_bootmem_cpumask_var(&cpu_callin_mask);
87 alloc_bootmem_cpumask_var(&cpu_callout_mask);
88 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
89}
90
91#else /* CONFIG_X86_32 */
92
93static inline void setup_cpu_local_masks(void)
94{
95}
96
97#endif /* CONFIG_X86_32 */
98
99#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
54/* 100/*
55 * Copy data used in early init routines from the initial arrays to the 101 * Copy data used in early init routines from the initial arrays to the
56 * per cpu data areas. These arrays then become expendable and the 102 * per cpu data areas. These arrays then become expendable and the
@@ -79,78 +125,14 @@ static void __init setup_per_cpu_maps(void)
79#endif 125#endif
80} 126}
81 127
82#ifdef CONFIG_X86_32 128#ifdef CONFIG_X86_64
83/* 129unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
84 * Great future not-so-futuristic plan: make i386 and x86_64 do it 130 [0] = (unsigned long)__per_cpu_load,
85 * the same way 131};
86 */ 132#else
87unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 133unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
134#endif
88EXPORT_SYMBOL(__per_cpu_offset); 135EXPORT_SYMBOL(__per_cpu_offset);
89static inline void setup_cpu_pda_map(void) { }
90
91#elif !defined(CONFIG_SMP)
92static inline void setup_cpu_pda_map(void) { }
93
94#else /* CONFIG_SMP && CONFIG_X86_64 */
95
96/*
97 * Allocate cpu_pda pointer table and array via alloc_bootmem.
98 */
99static void __init setup_cpu_pda_map(void)
100{
101 char *pda;
102 struct x8664_pda **new_cpu_pda;
103 unsigned long size;
104 int cpu;
105
106 size = roundup(sizeof(struct x8664_pda), cache_line_size());
107
108 /* allocate cpu_pda array and pointer table */
109 {
110 unsigned long tsize = nr_cpu_ids * sizeof(void *);
111 unsigned long asize = size * (nr_cpu_ids - 1);
112
113 tsize = roundup(tsize, cache_line_size());
114 new_cpu_pda = alloc_bootmem(tsize + asize);
115 pda = (char *)new_cpu_pda + tsize;
116 }
117
118 /* initialize pointer table to static pda's */
119 for_each_possible_cpu(cpu) {
120 if (cpu == 0) {
121 /* leave boot cpu pda in place */
122 new_cpu_pda[0] = cpu_pda(0);
123 continue;
124 }
125 new_cpu_pda[cpu] = (struct x8664_pda *)pda;
126 new_cpu_pda[cpu]->in_bootmem = 1;
127 pda += size;
128 }
129
130 /* point to new pointer table */
131 _cpu_pda = new_cpu_pda;
132}
133
134#endif /* CONFIG_SMP && CONFIG_X86_64 */
135
136#ifdef CONFIG_X86_64
137
138/* correctly size the local cpu masks */
139static void __init setup_cpu_local_masks(void)
140{
141 alloc_bootmem_cpumask_var(&cpu_initialized_mask);
142 alloc_bootmem_cpumask_var(&cpu_callin_mask);
143 alloc_bootmem_cpumask_var(&cpu_callout_mask);
144 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
145}
146
147#else /* CONFIG_X86_32 */
148
149static inline void setup_cpu_local_masks(void)
150{
151}
152
153#endif /* CONFIG_X86_32 */
154 136
155/* 137/*
156 * Great future plan: 138 * Great future plan:
@@ -164,9 +146,6 @@ void __init setup_per_cpu_areas(void)
164 int cpu; 146 int cpu;
165 unsigned long align = 1; 147 unsigned long align = 1;
166 148
167 /* Setup cpu_pda map */
168 setup_cpu_pda_map();
169
170 /* Copy section for each CPU (we discard the original) */ 149 /* Copy section for each CPU (we discard the original) */
171 old_size = PERCPU_ENOUGH_ROOM; 150 old_size = PERCPU_ENOUGH_ROOM;
172 align = max_t(unsigned long, PAGE_SIZE, align); 151 align = max_t(unsigned long, PAGE_SIZE, align);
@@ -197,8 +176,23 @@ void __init setup_per_cpu_areas(void)
197 cpu, node, __pa(ptr)); 176 cpu, node, __pa(ptr));
198 } 177 }
199#endif 178#endif
179
180 memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
200 per_cpu_offset(cpu) = ptr - __per_cpu_start; 181 per_cpu_offset(cpu) = ptr - __per_cpu_start;
201 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 182 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
183 per_cpu(cpu_number, cpu) = cpu;
184#ifdef CONFIG_X86_64
185 per_cpu(irq_stack_ptr, cpu) =
186 per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64;
187 /*
188 * Up to this point, CPU0 has been using .data.init
189 * area. Reload %gs offset for CPU0.
190 */
191 if (cpu == 0)
192 load_gs_base(cpu);
193#endif
194
195 DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
202 } 196 }
203 197
204 /* Setup percpu data maps */ 198 /* Setup percpu data maps */
@@ -220,6 +214,7 @@ void __init setup_per_cpu_areas(void)
220 * Requires node_possible_map to be valid. 214 * Requires node_possible_map to be valid.
221 * 215 *
222 * Note: node_to_cpumask() is not valid until after this is done. 216 * Note: node_to_cpumask() is not valid until after this is done.
217 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
223 */ 218 */
224static void __init setup_node_to_cpumask_map(void) 219static void __init setup_node_to_cpumask_map(void)
225{ 220{
@@ -235,6 +230,7 @@ static void __init setup_node_to_cpumask_map(void)
235 230
236 /* allocate the map */ 231 /* allocate the map */
237 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); 232 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
233 DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
238 234
239 pr_debug("Node to cpumask map at %p for %d nodes\n", 235 pr_debug("Node to cpumask map at %p for %d nodes\n",
240 map, nr_node_ids); 236 map, nr_node_ids);
@@ -247,17 +243,23 @@ void __cpuinit numa_set_node(int cpu, int node)
247{ 243{
248 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 244 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
249 245
250 if (cpu_pda(cpu) && node != NUMA_NO_NODE) 246 /* early setting, no percpu area yet */
251 cpu_pda(cpu)->nodenumber = node; 247 if (cpu_to_node_map) {
252
253 if (cpu_to_node_map)
254 cpu_to_node_map[cpu] = node; 248 cpu_to_node_map[cpu] = node;
249 return;
250 }
255 251
256 else if (per_cpu_offset(cpu)) 252#ifdef CONFIG_DEBUG_PER_CPU_MAPS
257 per_cpu(x86_cpu_to_node_map, cpu) = node; 253 if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
254 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
255 dump_stack();
256 return;
257 }
258#endif
259 per_cpu(x86_cpu_to_node_map, cpu) = node;
258 260
259 else 261 if (node != NUMA_NO_NODE)
260 pr_debug("Setting node for non-present cpu %d\n", cpu); 262 per_cpu(node_number, cpu) = node;
261} 263}
262 264
263void __cpuinit numa_clear_node(int cpu) 265void __cpuinit numa_clear_node(int cpu)
@@ -274,7 +276,7 @@ void __cpuinit numa_add_cpu(int cpu)
274 276
275void __cpuinit numa_remove_cpu(int cpu) 277void __cpuinit numa_remove_cpu(int cpu)
276{ 278{
277 cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]); 279 cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
278} 280}
279 281
280#else /* CONFIG_DEBUG_PER_CPU_MAPS */ 282#else /* CONFIG_DEBUG_PER_CPU_MAPS */
@@ -284,7 +286,7 @@ void __cpuinit numa_remove_cpu(int cpu)
284 */ 286 */
285static void __cpuinit numa_set_cpumask(int cpu, int enable) 287static void __cpuinit numa_set_cpumask(int cpu, int enable)
286{ 288{
287 int node = cpu_to_node(cpu); 289 int node = early_cpu_to_node(cpu);
288 cpumask_t *mask; 290 cpumask_t *mask;
289 char buf[64]; 291 char buf[64];
290 292
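
setup_per_cpu_areas() now copies the __per_cpu_load image into each CPU's chunk and immediately seeds this_cpu_off and cpu_number inside the fresh copy, so every CPU addresses its own data as "section address plus per-CPU offset". A userspace analogy of that layout trick, assuming a toy two-field template in place of the real per-CPU section:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct pcpu_template {                          /* stand-in for the per-CPU section image */
        int  cpu_number;
        long this_cpu_off;
};

static struct pcpu_template template_blob;      /* the "__per_cpu_load" image */
static intptr_t per_cpu_offset[4];               /* one offset per fake CPU */

#define per_cpu(field, cpu) \
        (((struct pcpu_template *)((intptr_t)&template_blob + \
                                   per_cpu_offset[cpu]))->field)

int main(void)
{
        int cpu;

        for (cpu = 0; cpu < 4; cpu++) {
                void *copy = malloc(sizeof(template_blob));     /* demo only, never freed */

                memcpy(copy, &template_blob, sizeof(template_blob));
                per_cpu_offset[cpu] = (intptr_t)copy - (intptr_t)&template_blob;
                per_cpu(cpu_number, cpu)   = cpu;
                per_cpu(this_cpu_off, cpu) = per_cpu_offset[cpu];
        }
        for (cpu = 0; cpu < 4; cpu++)
                printf("cpu %d: offset %+ld, cpu_number %d\n",
                       cpu, (long)per_cpu_offset[cpu], per_cpu(cpu_number, cpu));
        return 0;
}
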
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index df0587f24c54..0bc73d67acfb 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,7 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9 9#include <linux/perf_counter.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
@@ -893,6 +893,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
893 tracehook_notify_resume(regs); 893 tracehook_notify_resume(regs);
894 } 894 }
895 895
896 if (thread_info_flags & _TIF_PERF_COUNTERS) {
897 clear_thread_flag(TIF_PERF_COUNTERS);
898 perf_counter_notify(regs);
899 }
900
896#ifdef CONFIG_X86_32 901#ifdef CONFIG_X86_32
897 clear_thread_flag(TIF_IRET); 902 clear_thread_flag(TIF_IRET);
898#endif /* CONFIG_X86_32 */ 903#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bb1a3b1fc87f..def770b57b5a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -53,7 +53,6 @@
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h> 55#include <asm/idle.h>
56#include <asm/smp.h>
57#include <asm/trampoline.h> 56#include <asm/trampoline.h>
58#include <asm/cpu.h> 57#include <asm/cpu.h>
59#include <asm/numa.h> 58#include <asm/numa.h>
@@ -63,6 +62,7 @@
63#include <asm/vmi.h> 62#include <asm/vmi.h>
64#include <asm/genapic.h> 63#include <asm/genapic.h>
65#include <asm/setup.h> 64#include <asm/setup.h>
65#include <asm/uv/uv.h>
66#include <linux/mc146818rtc.h> 66#include <linux/mc146818rtc.h>
67 67
68#include <mach_apic.h> 68#include <mach_apic.h>
@@ -745,52 +745,6 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
745 complete(&c_idle->done); 745 complete(&c_idle->done);
746} 746}
747 747
748#ifdef CONFIG_X86_64
749
750/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
751static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
752{
753 if (!after_bootmem)
754 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
755}
756
757/*
758 * Allocate node local memory for the AP pda.
759 *
760 * Must be called after the _cpu_pda pointer table is initialized.
761 */
762int __cpuinit get_local_pda(int cpu)
763{
764 struct x8664_pda *oldpda, *newpda;
765 unsigned long size = sizeof(struct x8664_pda);
766 int node = cpu_to_node(cpu);
767
768 if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
769 return 0;
770
771 oldpda = cpu_pda(cpu);
772 newpda = kmalloc_node(size, GFP_ATOMIC, node);
773 if (!newpda) {
774 printk(KERN_ERR "Could not allocate node local PDA "
775 "for CPU %d on node %d\n", cpu, node);
776
777 if (oldpda)
778 return 0; /* have a usable pda */
779 else
780 return -1;
781 }
782
783 if (oldpda) {
784 memcpy(newpda, oldpda, size);
785 free_bootmem_pda(oldpda);
786 }
787
788 newpda->in_bootmem = 0;
789 cpu_pda(cpu) = newpda;
790 return 0;
791}
792#endif /* CONFIG_X86_64 */
793
794static int __cpuinit do_boot_cpu(int apicid, int cpu) 748static int __cpuinit do_boot_cpu(int apicid, int cpu)
795/* 749/*
796 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 750 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -808,16 +762,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
808 }; 762 };
809 INIT_WORK(&c_idle.work, do_fork_idle); 763 INIT_WORK(&c_idle.work, do_fork_idle);
810 764
811#ifdef CONFIG_X86_64
812 /* Allocate node local memory for AP pdas */
813 if (cpu > 0) {
814 boot_error = get_local_pda(cpu);
815 if (boot_error)
816 goto restore_state;
817 /* if can't get pda memory, can't start cpu */
818 }
819#endif
820
821 alternatives_smp_switch(1); 765 alternatives_smp_switch(1);
822 766
823 c_idle.idle = get_idle_for_cpu(cpu); 767 c_idle.idle = get_idle_for_cpu(cpu);
@@ -847,14 +791,17 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
847 791
848 set_idle_for_cpu(cpu, c_idle.idle); 792 set_idle_for_cpu(cpu, c_idle.idle);
849do_rest: 793do_rest:
850#ifdef CONFIG_X86_32
851 per_cpu(current_task, cpu) = c_idle.idle; 794 per_cpu(current_task, cpu) = c_idle.idle;
795#ifdef CONFIG_X86_32
852 init_gdt(cpu); 796 init_gdt(cpu);
853 /* Stack for startup_32 can be just as for start_secondary onwards */ 797 /* Stack for startup_32 can be just as for start_secondary onwards */
854 irq_ctx_init(cpu); 798 irq_ctx_init(cpu);
855#else 799#else
856 cpu_pda(cpu)->pcurrent = c_idle.idle;
857 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 800 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
801 initial_gs = per_cpu_offset(cpu);
802 per_cpu(kernel_stack, cpu) =
803 (unsigned long)task_stack_page(c_idle.idle) -
804 KERNEL_STACK_OFFSET + THREAD_SIZE;
858#endif 805#endif
859 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 806 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
860 initial_code = (unsigned long)start_secondary; 807 initial_code = (unsigned long)start_secondary;
@@ -931,9 +878,7 @@ do_rest:
931 inquire_remote_apic(apicid); 878 inquire_remote_apic(apicid);
932 } 879 }
933 } 880 }
934#ifdef CONFIG_X86_64 881
935restore_state:
936#endif
937 if (boot_error) { 882 if (boot_error) {
938 /* Try to put things back the way they were before ... */ 883 /* Try to put things back the way they were before ... */
939 numa_remove_cpu(cpu); /* was set by numa_add_cpu */ 884 numa_remove_cpu(cpu); /* was set by numa_add_cpu */
@@ -1125,6 +1070,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1125 printk(KERN_ERR "... forcing use of dummy APIC emulation." 1070 printk(KERN_ERR "... forcing use of dummy APIC emulation."
1126 "(tell your hw vendor)\n"); 1071 "(tell your hw vendor)\n");
1127 smpboot_clear_io_apic(); 1072 smpboot_clear_io_apic();
1073 disable_ioapic_setup();
1128 return -1; 1074 return -1;
1129 } 1075 }
1130 1076
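
With the PDA allocation gone, the 64-bit bring-up path above no longer needs node-local memory before starting an AP (get_local_pda(), free_bootmem_pda() and the restore_state label are all removed); the state an AP needs is now seeded through ordinary per-cpu variables. A condensed sketch of that sequence, identifiers taken from the hunk (the 32-bit branch keeps init_gdt() and irq_ctx_init()):

        /* Sketch: 64-bit AP setup in do_boot_cpu(), condensed from the hunk above. */
        per_cpu(current_task, cpu) = c_idle.idle;        /* common to 32- and 64-bit */
        clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
        initial_gs = per_cpu_offset(cpu);                /* GS base = this AP's per-cpu area */
        per_cpu(kernel_stack, cpu) =
                (unsigned long)task_stack_page(c_idle.idle) -
                KERNEL_STACK_OFFSET + THREAD_SIZE;       /* top of the idle task's stack */
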
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 397e309839dd..add36b4e37c9 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -3,11 +3,16 @@
3 */ 3 */
4#include <linux/module.h> 4#include <linux/module.h>
5#include <asm/smp.h> 5#include <asm/smp.h>
6#include <asm/sections.h>
6 7
7#ifdef CONFIG_X86_32 8#ifdef CONFIG_X86_64
9DEFINE_PER_CPU(unsigned long, this_cpu_off) = (unsigned long)__per_cpu_load;
10#else
8DEFINE_PER_CPU(unsigned long, this_cpu_off); 11DEFINE_PER_CPU(unsigned long, this_cpu_off);
12#endif
9EXPORT_PER_CPU_SYMBOL(this_cpu_off); 13EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10 14
15#ifdef CONFIG_X86_32
11/* 16/*
12 * Initialize the CPU's GDT. This is either the boot CPU doing itself 17 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 * (still using the master per-cpu area), or a CPU doing it for a 18 * (still using the master per-cpu area), or a CPU doing it for a
@@ -23,8 +28,5 @@ __cpuinit void init_gdt(int cpu)
23 28
24 write_gdt_entry(get_cpu_gdt_table(cpu), 29 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); 30 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
26
27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
28 per_cpu(cpu_number, cpu) = cpu;
29} 31}
30#endif 32#endif
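
this_cpu_off is the offset generic code adds to a per-cpu symbol's address to reach the running CPU's copy. The boot CPU runs before the real per-cpu areas are allocated, so on 64-bit — where the linker-script changes later in this diff make per-cpu symbols zero-based — it has to start out pointing at the initial area built into the image. A sketch of the intent behind the new initializer (the zero-based assumption comes from the vmlinux_64.lds.S hunk below):

        #ifdef CONFIG_X86_64
        /* zero-based symbols: the boot CPU's copies live in the image at __per_cpu_load */
        DEFINE_PER_CPU(unsigned long, this_cpu_off) = (unsigned long)__per_cpu_load;
        #else
        /* 32-bit keeps absolute symbols; the offset is filled in during CPU setup */
        DEFINE_PER_CPU(unsigned long, this_cpu_off);
        #endif

init_gdt() correspondingly stops writing this_cpu_off and cpu_number itself; callers that still need that, such as the voyager hunks further down, now set per_cpu(this_cpu_off, cpu) explicitly.
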
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index e2e86a08f31d..0c4d601bc853 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
332 .long sys_dup3 /* 330 */ 332 .long sys_dup3 /* 330 */
333 .long sys_pipe2 333 .long sys_pipe2
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_perf_counter_open
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
deleted file mode 100644
index ce5054642247..000000000000
--- a/arch/x86/kernel/tlb_32.c
+++ /dev/null
@@ -1,256 +0,0 @@
1#include <linux/spinlock.h>
2#include <linux/cpu.h>
3#include <linux/interrupt.h>
4
5#include <asm/tlbflush.h>
6
7DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
8 ____cacheline_aligned = { &init_mm, 0, };
9
10/* must come after the send_IPI functions above for inlining */
11#include <mach_ipi.h>
12
13/*
14 * Smarter SMP flushing macros.
15 * c/o Linus Torvalds.
16 *
17 * These mean you can really definitely utterly forget about
18 * writing to user space from interrupts. (Its not allowed anyway).
19 *
20 * Optimizations Manfred Spraul <manfred@colorfullife.com>
21 */
22
23static cpumask_t flush_cpumask;
24static struct mm_struct *flush_mm;
25static unsigned long flush_va;
26static DEFINE_SPINLOCK(tlbstate_lock);
27
28/*
29 * We cannot call mmdrop() because we are in interrupt context,
30 * instead update mm->cpu_vm_mask.
31 *
32 * We need to reload %cr3 since the page tables may be going
33 * away from under us..
34 */
35void leave_mm(int cpu)
36{
37 BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
38 cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
39 load_cr3(swapper_pg_dir);
40}
41EXPORT_SYMBOL_GPL(leave_mm);
42
43/*
44 *
45 * The flush IPI assumes that a thread switch happens in this order:
46 * [cpu0: the cpu that switches]
47 * 1) switch_mm() either 1a) or 1b)
48 * 1a) thread switch to a different mm
49 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
50 * Stop ipi delivery for the old mm. This is not synchronized with
51 * the other cpus, but smp_invalidate_interrupt ignore flush ipis
52 * for the wrong mm, and in the worst case we perform a superfluous
53 * tlb flush.
54 * 1a2) set cpu_tlbstate to TLBSTATE_OK
55 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
56 * was in lazy tlb mode.
57 * 1a3) update cpu_tlbstate[].active_mm
58 * Now cpu0 accepts tlb flushes for the new mm.
59 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
60 * Now the other cpus will send tlb flush ipis.
61 * 1a4) change cr3.
62 * 1b) thread switch without mm change
63 * cpu_tlbstate[].active_mm is correct, cpu0 already handles
64 * flush ipis.
65 * 1b1) set cpu_tlbstate to TLBSTATE_OK
66 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
67 * Atomically set the bit [other cpus will start sending flush ipis],
68 * and test the bit.
69 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
70 * 2) switch %%esp, ie current
71 *
72 * The interrupt must handle 2 special cases:
73 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
74 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
75 * runs in kernel space, the cpu could load tlb entries for user space
76 * pages.
77 *
78 * The good news is that cpu_tlbstate is local to each cpu, no
79 * write/read ordering problems.
80 */
81
82/*
83 * TLB flush IPI:
84 *
85 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
86 * 2) Leave the mm if we are in the lazy tlb mode.
87 */
88
89void smp_invalidate_interrupt(struct pt_regs *regs)
90{
91 unsigned long cpu;
92
93 cpu = get_cpu();
94
95 if (!cpu_isset(cpu, flush_cpumask))
96 goto out;
97 /*
98 * This was a BUG() but until someone can quote me the
99 * line from the intel manual that guarantees an IPI to
100 * multiple CPUs is retried _only_ on the erroring CPUs
101 * its staying as a return
102 *
103 * BUG();
104 */
105
106 if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
107 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
108 if (flush_va == TLB_FLUSH_ALL)
109 local_flush_tlb();
110 else
111 __flush_tlb_one(flush_va);
112 } else
113 leave_mm(cpu);
114 }
115 ack_APIC_irq();
116 smp_mb__before_clear_bit();
117 cpu_clear(cpu, flush_cpumask);
118 smp_mb__after_clear_bit();
119out:
120 put_cpu_no_resched();
121 inc_irq_stat(irq_tlb_count);
122}
123
124void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
125 unsigned long va)
126{
127 cpumask_t cpumask = *cpumaskp;
128
129 /*
130 * A couple of (to be removed) sanity checks:
131 *
132 * - current CPU must not be in mask
133 * - mask must exist :)
134 */
135 BUG_ON(cpus_empty(cpumask));
136 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
137 BUG_ON(!mm);
138
139#ifdef CONFIG_HOTPLUG_CPU
140 /* If a CPU which we ran on has gone down, OK. */
141 cpus_and(cpumask, cpumask, cpu_online_map);
142 if (unlikely(cpus_empty(cpumask)))
143 return;
144#endif
145
146 /*
147 * i'm not happy about this global shared spinlock in the
148 * MM hot path, but we'll see how contended it is.
149 * AK: x86-64 has a faster method that could be ported.
150 */
151 spin_lock(&tlbstate_lock);
152
153 flush_mm = mm;
154 flush_va = va;
155 cpus_or(flush_cpumask, cpumask, flush_cpumask);
156
157 /*
158 * Make the above memory operations globally visible before
159 * sending the IPI.
160 */
161 smp_mb();
162 /*
163 * We have to send the IPI only to
164 * CPUs affected.
165 */
166 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
167
168 while (!cpus_empty(flush_cpumask))
169 /* nothing. lockup detection does not belong here */
170 cpu_relax();
171
172 flush_mm = NULL;
173 flush_va = 0;
174 spin_unlock(&tlbstate_lock);
175}
176
177void flush_tlb_current_task(void)
178{
179 struct mm_struct *mm = current->mm;
180 cpumask_t cpu_mask;
181
182 preempt_disable();
183 cpu_mask = mm->cpu_vm_mask;
184 cpu_clear(smp_processor_id(), cpu_mask);
185
186 local_flush_tlb();
187 if (!cpus_empty(cpu_mask))
188 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
189 preempt_enable();
190}
191
192void flush_tlb_mm(struct mm_struct *mm)
193{
194 cpumask_t cpu_mask;
195
196 preempt_disable();
197 cpu_mask = mm->cpu_vm_mask;
198 cpu_clear(smp_processor_id(), cpu_mask);
199
200 if (current->active_mm == mm) {
201 if (current->mm)
202 local_flush_tlb();
203 else
204 leave_mm(smp_processor_id());
205 }
206 if (!cpus_empty(cpu_mask))
207 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
208
209 preempt_enable();
210}
211
212void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
213{
214 struct mm_struct *mm = vma->vm_mm;
215 cpumask_t cpu_mask;
216
217 preempt_disable();
218 cpu_mask = mm->cpu_vm_mask;
219 cpu_clear(smp_processor_id(), cpu_mask);
220
221 if (current->active_mm == mm) {
222 if (current->mm)
223 __flush_tlb_one(va);
224 else
225 leave_mm(smp_processor_id());
226 }
227
228 if (!cpus_empty(cpu_mask))
229 flush_tlb_others(cpu_mask, mm, va);
230
231 preempt_enable();
232}
233EXPORT_SYMBOL(flush_tlb_page);
234
235static void do_flush_tlb_all(void *info)
236{
237 unsigned long cpu = smp_processor_id();
238
239 __flush_tlb_all();
240 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
241 leave_mm(cpu);
242}
243
244void flush_tlb_all(void)
245{
246 on_each_cpu(do_flush_tlb_all, NULL, 1);
247}
248
249void reset_lazy_tlbstate(void)
250{
251 int cpu = raw_smp_processor_id();
252
253 per_cpu(cpu_tlbstate, cpu).state = 0;
254 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
255}
256
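
The file removed above was the 32-bit-only flush implementation; its replacement is the unified arch/x86/mm/tlb.c further down in this diff. The point worth keeping in mind from the deleted code is that it kept a single global request slot, so every remote flush in the system was serialized on one spinlock. A condensed sketch of that scheme (sanity checks, statistics and CPU-hotplug handling omitted):

        /* One global (flush_mm, flush_va, flush_cpumask) slot, one lock: only a
         * single flush request can be in flight at a time, system-wide.        */
        spin_lock(&tlbstate_lock);
        flush_mm = mm;
        flush_va = va;
        cpus_or(flush_cpumask, cpumask, flush_cpumask);
        smp_mb();                                  /* publish the request before the IPI */
        send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
        while (!cpus_empty(flush_cpumask))         /* each target clears its own bit in  */
                cpu_relax();                       /* smp_invalidate_interrupt()         */
        flush_mm = NULL;
        flush_va = 0;
        spin_unlock(&tlbstate_lock);
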
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 6812b829ed83..89fce1b6d01f 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -11,6 +11,7 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12 12
13#include <asm/mmu_context.h> 13#include <asm/mmu_context.h>
14#include <asm/uv/uv.h>
14#include <asm/uv/uv_mmrs.h> 15#include <asm/uv/uv_mmrs.h>
15#include <asm/uv/uv_hub.h> 16#include <asm/uv/uv_hub.h>
16#include <asm/uv/uv_bau.h> 17#include <asm/uv/uv_bau.h>
@@ -210,14 +211,15 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
210 * 211 *
211 * Send a broadcast and wait for a broadcast message to complete. 212 * Send a broadcast and wait for a broadcast message to complete.
212 * 213 *
213 * The cpumaskp mask contains the cpus the broadcast was sent to. 214 * The flush_mask contains the cpus the broadcast was sent to.
214 * 215 *
215 * Returns 1 if all remote flushing was done. The mask is zeroed. 216 * Returns NULL if all remote flushing was done. The mask is zeroed.
216 * Returns 0 if some remote flushing remains to be done. The mask is left 217 * Returns @flush_mask if some remote flushing remains to be done. The
217 * unchanged. 218 * mask will have some bits still set.
218 */ 219 */
219int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, 220const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
220 cpumask_t *cpumaskp) 221 struct bau_desc *bau_desc,
222 struct cpumask *flush_mask)
221{ 223{
222 int completion_status = 0; 224 int completion_status = 0;
223 int right_shift; 225 int right_shift;
@@ -264,59 +266,69 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
264 * Success, so clear the remote cpu's from the mask so we don't 266 * Success, so clear the remote cpu's from the mask so we don't
265 * use the IPI method of shootdown on them. 267 * use the IPI method of shootdown on them.
266 */ 268 */
267 for_each_cpu_mask(bit, *cpumaskp) { 269 for_each_cpu(bit, flush_mask) {
268 blade = uv_cpu_to_blade_id(bit); 270 blade = uv_cpu_to_blade_id(bit);
269 if (blade == this_blade) 271 if (blade == this_blade)
270 continue; 272 continue;
271 cpu_clear(bit, *cpumaskp); 273 cpumask_clear_cpu(bit, flush_mask);
272 } 274 }
273 if (!cpus_empty(*cpumaskp)) 275 if (!cpumask_empty(flush_mask))
274 return 0; 276 return flush_mask;
275 return 1; 277 return NULL;
276} 278}
277 279
278/** 280/**
279 * uv_flush_tlb_others - globally purge translation cache of a virtual 281 * uv_flush_tlb_others - globally purge translation cache of a virtual
280 * address or all TLB's 282 * address or all TLB's
281 * @cpumaskp: mask of all cpu's in which the address is to be removed 283 * @cpumask: mask of all cpu's in which the address is to be removed
282 * @mm: mm_struct containing virtual address range 284 * @mm: mm_struct containing virtual address range
283 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) 285 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
286 * @cpu: the current cpu
284 * 287 *
285 * This is the entry point for initiating any UV global TLB shootdown. 288 * This is the entry point for initiating any UV global TLB shootdown.
286 * 289 *
287 * Purges the translation caches of all specified processors of the given 290 * Purges the translation caches of all specified processors of the given
288 * virtual address, or purges all TLB's on specified processors. 291 * virtual address, or purges all TLB's on specified processors.
289 * 292 *
290 * The caller has derived the cpumaskp from the mm_struct and has subtracted 293 * The caller has derived the cpumask from the mm_struct. This function
291 * the local cpu from the mask. This function is called only if there 294 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
292 * are bits set in the mask. (e.g. flush_tlb_page())
293 * 295 *
294 * The cpumaskp is converted into a nodemask of the nodes containing 296 * The cpumask is converted into a nodemask of the nodes containing
295 * the cpus. 297 * the cpus.
296 * 298 *
297 * Returns 1 if all remote flushing was done. 299 * Note that this function should be called with preemption disabled.
298 * Returns 0 if some remote flushing remains to be done. 300 *
301 * Returns NULL if all remote flushing was done.
302 * Returns pointer to cpumask if some remote flushing remains to be
303 * done. The returned pointer is valid till preemption is re-enabled.
299 */ 304 */
300int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, 305const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
301 unsigned long va) 306 struct mm_struct *mm,
307 unsigned long va, unsigned int cpu)
302{ 308{
309 static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
310 struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
303 int i; 311 int i;
304 int bit; 312 int bit;
305 int blade; 313 int blade;
306 int cpu; 314 int uv_cpu;
307 int this_blade; 315 int this_blade;
308 int locals = 0; 316 int locals = 0;
309 struct bau_desc *bau_desc; 317 struct bau_desc *bau_desc;
310 318
311 cpu = uv_blade_processor_id(); 319 WARN_ON(!in_atomic());
320
321 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
322
323 uv_cpu = uv_blade_processor_id();
312 this_blade = uv_numa_blade_id(); 324 this_blade = uv_numa_blade_id();
313 bau_desc = __get_cpu_var(bau_control).descriptor_base; 325 bau_desc = __get_cpu_var(bau_control).descriptor_base;
314 bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu; 326 bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
315 327
316 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 328 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
317 329
318 i = 0; 330 i = 0;
319 for_each_cpu_mask(bit, *cpumaskp) { 331 for_each_cpu(bit, flush_mask) {
320 blade = uv_cpu_to_blade_id(bit); 332 blade = uv_cpu_to_blade_id(bit);
321 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1)); 333 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
322 if (blade == this_blade) { 334 if (blade == this_blade) {
@@ -331,17 +343,17 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
331 * no off_node flushing; return status for local node 343 * no off_node flushing; return status for local node
332 */ 344 */
333 if (locals) 345 if (locals)
334 return 0; 346 return flush_mask;
335 else 347 else
336 return 1; 348 return NULL;
337 } 349 }
338 __get_cpu_var(ptcstats).requestor++; 350 __get_cpu_var(ptcstats).requestor++;
339 __get_cpu_var(ptcstats).ntargeted += i; 351 __get_cpu_var(ptcstats).ntargeted += i;
340 352
341 bau_desc->payload.address = va; 353 bau_desc->payload.address = va;
342 bau_desc->payload.sending_cpu = smp_processor_id(); 354 bau_desc->payload.sending_cpu = cpu;
343 355
344 return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp); 356 return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
345} 357}
346 358
347/* 359/*
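
The interface change above inverts the return convention: instead of a 1/0 "all done" flag, uv_flush_tlb_others() now hands back the cpumask of CPUs that still need a conventional IPI flush, or NULL when the BAU broadcast covered everyone. The returned mask is per-cpu scratch space, hence the note that it is only valid while preemption stays disabled. The caller-side pattern, as it appears in the new native_flush_tlb_others() later in this diff:

        cpu = get_cpu();                 /* pin: the returned mask is per-cpu scratch */
        cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
        if (cpumask)                     /* NULL => nothing left to flush remotely    */
                flush_tlb_others_ipi(cpumask, mm, va);
        put_cpu();
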
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7932338d7cb3..17483fe98e9c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -59,7 +59,6 @@
59#ifdef CONFIG_X86_64 59#ifdef CONFIG_X86_64
60#include <asm/pgalloc.h> 60#include <asm/pgalloc.h>
61#include <asm/proto.h> 61#include <asm/proto.h>
62#include <asm/pda.h>
63#else 62#else
64#include <asm/processor-flags.h> 63#include <asm/processor-flags.h>
65#include <asm/arch_hooks.h> 64#include <asm/arch_hooks.h>
@@ -983,8 +982,13 @@ void __init trap_init(void)
983#endif 982#endif
984 set_intr_gate(19, &simd_coprocessor_error); 983 set_intr_gate(19, &simd_coprocessor_error);
985 984
985 /* Reserve all the builtin and the syscall vector: */
986 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
987 set_bit(i, used_vectors);
988
986#ifdef CONFIG_IA32_EMULATION 989#ifdef CONFIG_IA32_EMULATION
987 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 990 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
991 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
988#endif 992#endif
989 993
990#ifdef CONFIG_X86_32 994#ifdef CONFIG_X86_32
@@ -1001,17 +1005,9 @@ void __init trap_init(void)
1001 } 1005 }
1002 1006
1003 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 1007 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
1004#endif
1005
1006 /* Reserve all the builtin and the syscall vector: */
1007 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
1008 set_bit(i, used_vectors);
1009
1010#ifdef CONFIG_X86_64
1011 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
1012#else
1013 set_bit(SYSCALL_VECTOR, used_vectors); 1008 set_bit(SYSCALL_VECTOR, used_vectors);
1014#endif 1009#endif
1010
1015 /* 1011 /*
1016 * Should be a barrier for any external CPU state: 1012 * Should be a barrier for any external CPU state:
1017 */ 1013 */
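
The trap_init() change above hoists the used_vectors reservation ahead of the architecture-specific syscall gates, so each gate marks its vector right where it is installed rather than in a separate #ifdef block at the end of the function. The resulting order, condensed from the two hunks:

        for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)       /* all CPU-internal vectors */
                set_bit(i, used_vectors);
        #ifdef CONFIG_IA32_EMULATION
        set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
        set_bit(IA32_SYSCALL_VECTOR, used_vectors);       /* reserved where installed */
        #endif
        #ifdef CONFIG_X86_32
        set_system_trap_gate(SYSCALL_VECTOR, &system_call);
        set_bit(SYSCALL_VECTOR, used_vectors);
        #endif
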
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 82c67559dde7..3eba7f7bac05 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -178,14 +178,7 @@ SECTIONS
178 __initramfs_end = .; 178 __initramfs_end = .;
179 } 179 }
180#endif 180#endif
181 . = ALIGN(PAGE_SIZE); 181 PERCPU(PAGE_SIZE)
182 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
183 __per_cpu_start = .;
184 *(.data.percpu.page_aligned)
185 *(.data.percpu)
186 *(.data.percpu.shared_aligned)
187 __per_cpu_end = .;
188 }
189 . = ALIGN(PAGE_SIZE); 182 . = ALIGN(PAGE_SIZE);
190 /* freed after init ends here */ 183 /* freed after init ends here */
191 184
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 1a614c0e6bef..c9740996430a 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -5,6 +5,7 @@
5#define LOAD_OFFSET __START_KERNEL_map 5#define LOAD_OFFSET __START_KERNEL_map
6 6
7#include <asm-generic/vmlinux.lds.h> 7#include <asm-generic/vmlinux.lds.h>
8#include <asm/asm-offsets.h>
8#include <asm/page.h> 9#include <asm/page.h>
9 10
10#undef i386 /* in case the preprocessor is a 32bit one */ 11#undef i386 /* in case the preprocessor is a 32bit one */
@@ -13,12 +14,14 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
13OUTPUT_ARCH(i386:x86-64) 14OUTPUT_ARCH(i386:x86-64)
14ENTRY(phys_startup_64) 15ENTRY(phys_startup_64)
15jiffies_64 = jiffies; 16jiffies_64 = jiffies;
16_proxy_pda = 1;
17PHDRS { 17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */ 18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */ 19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */ 20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */ 21 data.init PT_LOAD FLAGS(7); /* RWE */
22#ifdef CONFIG_SMP
23 percpu PT_LOAD FLAGS(7); /* RWE */
24#endif
22 note PT_NOTE FLAGS(0); /* ___ */ 25 note PT_NOTE FLAGS(0); /* ___ */
23} 26}
24SECTIONS 27SECTIONS
@@ -208,14 +211,28 @@ SECTIONS
208 __initramfs_end = .; 211 __initramfs_end = .;
209#endif 212#endif
210 213
214#ifdef CONFIG_SMP
215 /*
216 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
217 * output PHDR, so the next output section - __data_nosave - should
218 * switch it back to data.init. Also, pda should be at the head of
219 * percpu area. Preallocate it and define the percpu offset symbol
220 * so that it can be accessed as a percpu variable.
221 */
222 . = ALIGN(PAGE_SIZE);
223 PERCPU_VADDR(0, :percpu)
224#else
211 PERCPU(PAGE_SIZE) 225 PERCPU(PAGE_SIZE)
226#endif
212 227
213 . = ALIGN(PAGE_SIZE); 228 . = ALIGN(PAGE_SIZE);
214 __init_end = .; 229 __init_end = .;
215 230
216 . = ALIGN(PAGE_SIZE); 231 . = ALIGN(PAGE_SIZE);
217 __nosave_begin = .; 232 __nosave_begin = .;
218 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } 233 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
234 *(.data.nosave)
235 } :data.init /* switch back to data.init, see PERCPU_VADDR() above */
219 . = ALIGN(PAGE_SIZE); 236 . = ALIGN(PAGE_SIZE);
220 __nosave_end = .; 237 __nosave_end = .;
221 238
@@ -244,3 +261,8 @@ SECTIONS
244 */ 261 */
245ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), 262ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
246 "kernel image bigger than KERNEL_IMAGE_SIZE") 263 "kernel image bigger than KERNEL_IMAGE_SIZE")
264
265#ifdef CONFIG_SMP
266ASSERT((per_cpu__irq_stack_union == 0),
267 "irq_stack_union is not at start of per-cpu area");
268#endif
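
On SMP the layout above makes per-cpu symbols zero-based (PERCPU_VADDR(0, :percpu)) and the ASSERT requires irq_stack_union to be the very first per-cpu object, i.e. to sit at offset 0 of every CPU's area. With a zero-based layout, another CPU's copy of a variable is reached simply by adding that CPU's base from __per_cpu_offset[] — the table the voyager hunks below index directly. An addressing sketch; sketch_per_cpu_ptr() is a hypothetical name used only for this note:

        /* Sketch only: zero-based per-cpu addressing.  __per_cpu_offset[cpu] is the
         * real base table; sketch_per_cpu_ptr() is a made-up helper for illustration. */
        #define sketch_per_cpu_ptr(var, cpu) \
                ((typeof(&(var)))((unsigned long)&(var) + __per_cpu_offset[cpu]))
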
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 695e426aa354..3909e3ba5ce3 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);
58EXPORT_SYMBOL(empty_zero_page); 58EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt); 59EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 60EXPORT_SYMBOL(load_gs_index);
61
62EXPORT_SYMBOL(_proxy_pda);
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index d914a7996a66..66b7eb57d8e4 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -9,6 +9,7 @@
9#include <asm/e820.h> 9#include <asm/e820.h>
10#include <asm/io.h> 10#include <asm/io.h>
11#include <asm/setup.h> 11#include <asm/setup.h>
12#include <asm/cpu.h>
12 13
13void __init pre_intr_init_hook(void) 14void __init pre_intr_init_hook(void)
14{ 15{
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 7ffcdeec4631..2c74aec4efc1 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -400,7 +400,7 @@ void __init find_smp_config(void)
400 VOYAGER_SUS_IN_CONTROL_PORT); 400 VOYAGER_SUS_IN_CONTROL_PORT);
401 401
402 current_thread_info()->cpu = boot_cpu_id; 402 current_thread_info()->cpu = boot_cpu_id;
403 x86_write_percpu(cpu_number, boot_cpu_id); 403 percpu_write(cpu_number, boot_cpu_id);
404} 404}
405 405
406/* 406/*
@@ -529,6 +529,7 @@ static void __init do_boot_cpu(__u8 cpu)
529 stack_start.sp = (void *)idle->thread.sp; 529 stack_start.sp = (void *)idle->thread.sp;
530 530
531 init_gdt(cpu); 531 init_gdt(cpu);
532 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
532 per_cpu(current_task, cpu) = idle; 533 per_cpu(current_task, cpu) = idle;
533 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 534 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
534 irq_ctx_init(cpu); 535 irq_ctx_init(cpu);
@@ -1746,6 +1747,7 @@ static void __init voyager_smp_prepare_cpus(unsigned int max_cpus)
1746static void __cpuinit voyager_smp_prepare_boot_cpu(void) 1747static void __cpuinit voyager_smp_prepare_boot_cpu(void)
1747{ 1748{
1748 init_gdt(smp_processor_id()); 1749 init_gdt(smp_processor_id());
1750 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
1749 switch_to_new_gdt(); 1751 switch_to_new_gdt();
1750 1752
1751 cpu_online_map = cpumask_of_cpu(smp_processor_id()); 1753 cpu_online_map = cpumask_of_cpu(smp_processor_id());
@@ -1779,7 +1781,7 @@ static void __init voyager_smp_cpus_done(unsigned int max_cpus)
1779void __init smp_setup_processor_id(void) 1781void __init smp_setup_processor_id(void)
1780{ 1782{
1781 current_thread_info()->cpu = hard_smp_processor_id(); 1783 current_thread_info()->cpu = hard_smp_processor_id();
1782 x86_write_percpu(cpu_number, hard_smp_processor_id()); 1784 percpu_write(cpu_number, hard_smp_processor_id());
1783} 1785}
1784 1786
1785static void voyager_send_call_func(const struct cpumask *callmask) 1787static void voyager_send_call_func(const struct cpumask *callmask)
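
Two mechanical substitutions run through the voyager hunks above: the old x86_read_percpu()/x86_write_percpu() accessors become percpu_read()/percpu_write(), and since init_gdt() (see the smpcommon.c hunk earlier) no longer seeds this_cpu_off, its callers now write per_cpu(this_cpu_off, cpu) themselves right after calling it. Usage of the renamed accessors as it appears elsewhere in this diff:

        percpu_write(cpu_number, hard_smp_processor_id());     /* store to this CPU's copy */
        if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)    /* load from this CPU's copy */
                BUG();
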
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index d8cc96a2738f..9f05157220f5 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,8 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o gup.o 2 pat.o pgtable.o gup.o
3 3
4obj-$(CONFIG_X86_SMP) += tlb.o
5
4obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o 6obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
5 7
6obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o 8obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c76ef1d701c9..8c3f3113a6ec 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -26,6 +26,7 @@
26#include <linux/kprobes.h> 26#include <linux/kprobes.h>
27#include <linux/uaccess.h> 27#include <linux/uaccess.h>
28#include <linux/kdebug.h> 28#include <linux/kdebug.h>
29#include <linux/magic.h>
29 30
30#include <asm/system.h> 31#include <asm/system.h>
31#include <asm/desc.h> 32#include <asm/desc.h>
@@ -91,8 +92,8 @@ static inline int notify_page_fault(struct pt_regs *regs)
91 * 92 *
92 * Opcode checker based on code by Richard Brunner 93 * Opcode checker based on code by Richard Brunner
93 */ 94 */
94static int is_prefetch(struct pt_regs *regs, unsigned long addr, 95static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
95 unsigned long error_code) 96 unsigned long addr)
96{ 97{
97 unsigned char *instr; 98 unsigned char *instr;
98 int scan_more = 1; 99 int scan_more = 1;
@@ -409,15 +410,15 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
409} 410}
410 411
411#ifdef CONFIG_X86_64 412#ifdef CONFIG_X86_64
412static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, 413static noinline void pgtable_bad(struct pt_regs *regs,
413 unsigned long error_code) 414 unsigned long error_code, unsigned long address)
414{ 415{
415 unsigned long flags = oops_begin(); 416 unsigned long flags = oops_begin();
416 int sig = SIGKILL; 417 int sig = SIGKILL;
417 struct task_struct *tsk; 418 struct task_struct *tsk = current;
418 419
419 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 420 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
420 current->comm, address); 421 tsk->comm, address);
421 dump_pagetable(address); 422 dump_pagetable(address);
422 tsk = current; 423 tsk = current;
423 tsk->thread.cr2 = address; 424 tsk->thread.cr2 = address;
@@ -429,6 +430,196 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
429} 430}
430#endif 431#endif
431 432
433static noinline void no_context(struct pt_regs *regs,
434 unsigned long error_code, unsigned long address)
435{
436 struct task_struct *tsk = current;
437 unsigned long *stackend;
438
439#ifdef CONFIG_X86_64
440 unsigned long flags;
441 int sig;
442#endif
443
444 /* Are we prepared to handle this kernel fault? */
445 if (fixup_exception(regs))
446 return;
447
448 /*
449 * X86_32
450 * Valid to do another page fault here, because if this fault
451 * had been triggered by is_prefetch fixup_exception would have
452 * handled it.
453 *
454 * X86_64
455 * Hall of shame of CPU/BIOS bugs.
456 */
457 if (is_prefetch(regs, error_code, address))
458 return;
459
460 if (is_errata93(regs, address))
461 return;
462
463 /*
464 * Oops. The kernel tried to access some bad page. We'll have to
465 * terminate things with extreme prejudice.
466 */
467#ifdef CONFIG_X86_32
468 bust_spinlocks(1);
469#else
470 flags = oops_begin();
471#endif
472
473 show_fault_oops(regs, error_code, address);
474
475 stackend = end_of_stack(tsk);
476 if (*stackend != STACK_END_MAGIC)
477 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
478
479 tsk->thread.cr2 = address;
480 tsk->thread.trap_no = 14;
481 tsk->thread.error_code = error_code;
482
483#ifdef CONFIG_X86_32
484 die("Oops", regs, error_code);
485 bust_spinlocks(0);
486 do_exit(SIGKILL);
487#else
488 sig = SIGKILL;
489 if (__die("Oops", regs, error_code))
490 sig = 0;
491 /* Executive summary in case the body of the oops scrolled away */
492 printk(KERN_EMERG "CR2: %016lx\n", address);
493 oops_end(flags, regs, sig);
494#endif
495}
496
497static void __bad_area_nosemaphore(struct pt_regs *regs,
498 unsigned long error_code, unsigned long address,
499 int si_code)
500{
501 struct task_struct *tsk = current;
502
503 /* User mode accesses just cause a SIGSEGV */
504 if (error_code & PF_USER) {
505 /*
506 * It's possible to have interrupts off here.
507 */
508 local_irq_enable();
509
510 /*
511 * Valid to do another page fault here because this one came
512 * from user space.
513 */
514 if (is_prefetch(regs, error_code, address))
515 return;
516
517 if (is_errata100(regs, address))
518 return;
519
520 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
521 printk_ratelimit()) {
522 printk(
523 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
524 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
525 tsk->comm, task_pid_nr(tsk), address,
526 (void *) regs->ip, (void *) regs->sp, error_code);
527 print_vma_addr(" in ", regs->ip);
528 printk("\n");
529 }
530
531 tsk->thread.cr2 = address;
532 /* Kernel addresses are always protection faults */
533 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
534 tsk->thread.trap_no = 14;
535 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
536 return;
537 }
538
539 if (is_f00f_bug(regs, address))
540 return;
541
542 no_context(regs, error_code, address);
543}
544
545static noinline void bad_area_nosemaphore(struct pt_regs *regs,
546 unsigned long error_code, unsigned long address)
547{
548 __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
549}
550
551static void __bad_area(struct pt_regs *regs,
552 unsigned long error_code, unsigned long address,
553 int si_code)
554{
555 struct mm_struct *mm = current->mm;
556
557 /*
558 * Something tried to access memory that isn't in our memory map..
559 * Fix it, but check if it's kernel or user first..
560 */
561 up_read(&mm->mmap_sem);
562
563 __bad_area_nosemaphore(regs, error_code, address, si_code);
564}
565
566static noinline void bad_area(struct pt_regs *regs,
567 unsigned long error_code, unsigned long address)
568{
569 __bad_area(regs, error_code, address, SEGV_MAPERR);
570}
571
572static noinline void bad_area_access_error(struct pt_regs *regs,
573 unsigned long error_code, unsigned long address)
574{
575 __bad_area(regs, error_code, address, SEGV_ACCERR);
576}
577
578/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
579static void out_of_memory(struct pt_regs *regs,
580 unsigned long error_code, unsigned long address)
581{
582 /*
583 * We ran out of memory, call the OOM killer, and return the userspace
584 * (which will retry the fault, or kill us if we got oom-killed).
585 */
586 up_read(&current->mm->mmap_sem);
587 pagefault_out_of_memory();
588}
589
590static void do_sigbus(struct pt_regs *regs,
591 unsigned long error_code, unsigned long address)
592{
593 struct task_struct *tsk = current;
594 struct mm_struct *mm = tsk->mm;
595
596 up_read(&mm->mmap_sem);
597
598 /* Kernel mode? Handle exceptions or die */
599 if (!(error_code & PF_USER))
600 no_context(regs, error_code, address);
601#ifdef CONFIG_X86_32
602 /* User space => ok to do another page fault */
603 if (is_prefetch(regs, error_code, address))
604 return;
605#endif
606 tsk->thread.cr2 = address;
607 tsk->thread.error_code = error_code;
608 tsk->thread.trap_no = 14;
609 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
610}
611
612static noinline void mm_fault_error(struct pt_regs *regs,
613 unsigned long error_code, unsigned long address, unsigned int fault)
614{
615 if (fault & VM_FAULT_OOM)
616 out_of_memory(regs, error_code, address);
617 else if (fault & VM_FAULT_SIGBUS)
618 do_sigbus(regs, error_code, address);
619 else
620 BUG();
621}
622
432static int spurious_fault_check(unsigned long error_code, pte_t *pte) 623static int spurious_fault_check(unsigned long error_code, pte_t *pte)
433{ 624{
434 if ((error_code & PF_WRITE) && !pte_write(*pte)) 625 if ((error_code & PF_WRITE) && !pte_write(*pte))
@@ -448,8 +639,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
448 * There are no security implications to leaving a stale TLB when 639 * There are no security implications to leaving a stale TLB when
449 * increasing the permissions on a page. 640 * increasing the permissions on a page.
450 */ 641 */
451static int spurious_fault(unsigned long address, 642static noinline int spurious_fault(unsigned long error_code,
452 unsigned long error_code) 643 unsigned long address)
453{ 644{
454 pgd_t *pgd; 645 pgd_t *pgd;
455 pud_t *pud; 646 pud_t *pud;
@@ -494,7 +685,7 @@ static int spurious_fault(unsigned long address,
494 * 685 *
495 * This assumes no large pages in there. 686 * This assumes no large pages in there.
496 */ 687 */
497static int vmalloc_fault(unsigned long address) 688static noinline int vmalloc_fault(unsigned long address)
498{ 689{
499#ifdef CONFIG_X86_32 690#ifdef CONFIG_X86_32
500 unsigned long pgd_paddr; 691 unsigned long pgd_paddr;
@@ -573,6 +764,25 @@ static int vmalloc_fault(unsigned long address)
573 764
574int show_unhandled_signals = 1; 765int show_unhandled_signals = 1;
575 766
767static inline int access_error(unsigned long error_code, int write,
768 struct vm_area_struct *vma)
769{
770 if (write) {
771 /* write, present and write, not present */
772 if (unlikely(!(vma->vm_flags & VM_WRITE)))
773 return 1;
774 } else if (unlikely(error_code & PF_PROT)) {
775 /* read, present */
776 return 1;
777 } else {
778 /* read, not present */
779 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
780 return 1;
781 }
782
783 return 0;
784}
785
576/* 786/*
577 * This routine handles page faults. It determines the address, 787 * This routine handles page faults. It determines the address,
578 * and the problem, and then passes it off to one of the appropriate 788 * and the problem, and then passes it off to one of the appropriate
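
access_error() centralizes the read/write permission check that do_page_fault() used to perform with a switch on (PF_PROT|PF_WRITE); the mapping between the two, and the caller-side use added further down:

        /* Decision table encoded by access_error() above (same as the switch it
         * replaces, removed later in this diff):
         *
         *   PF_PROT|PF_WRITE : write, present      -> vma must allow VM_WRITE
         *   PF_WRITE         : write, not present  -> vma must allow VM_WRITE
         *   PF_PROT          : read,  present      -> always a fault (SEGV_ACCERR)
         *   0                : read,  not present  -> vma must allow READ|EXEC|WRITE
         */
        write = error_code & PF_WRITE;
        if (unlikely(access_error(error_code, write, vma))) {
                bad_area_access_error(regs, error_code, address);
                return;
        }
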
@@ -583,16 +793,12 @@ asmlinkage
583#endif 793#endif
584void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) 794void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
585{ 795{
796 unsigned long address;
586 struct task_struct *tsk; 797 struct task_struct *tsk;
587 struct mm_struct *mm; 798 struct mm_struct *mm;
588 struct vm_area_struct *vma; 799 struct vm_area_struct *vma;
589 unsigned long address; 800 int write;
590 int write, si_code;
591 int fault; 801 int fault;
592#ifdef CONFIG_X86_64
593 unsigned long flags;
594 int sig;
595#endif
596 802
597 tsk = current; 803 tsk = current;
598 mm = tsk->mm; 804 mm = tsk->mm;
@@ -601,8 +807,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
601 /* get the address */ 807 /* get the address */
602 address = read_cr2(); 808 address = read_cr2();
603 809
604 si_code = SEGV_MAPERR;
605
606 if (unlikely(kmmio_fault(regs, address))) 810 if (unlikely(kmmio_fault(regs, address)))
607 return; 811 return;
608 812
@@ -629,7 +833,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
629 return; 833 return;
630 834
631 /* Can handle a stale RO->RW TLB */ 835 /* Can handle a stale RO->RW TLB */
632 if (spurious_fault(address, error_code)) 836 if (spurious_fault(error_code, address))
633 return; 837 return;
634 838
635 /* kprobes don't want to hook the spurious faults. */ 839 /* kprobes don't want to hook the spurious faults. */
@@ -639,13 +843,12 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
639 * Don't take the mm semaphore here. If we fixup a prefetch 843 * Don't take the mm semaphore here. If we fixup a prefetch
640 * fault we could otherwise deadlock. 844 * fault we could otherwise deadlock.
641 */ 845 */
642 goto bad_area_nosemaphore; 846 bad_area_nosemaphore(regs, error_code, address);
847 return;
643 } 848 }
644 849
645 /* kprobes don't want to hook the spurious faults. */ 850 if (unlikely(notify_page_fault(regs)))
646 if (notify_page_fault(regs))
647 return; 851 return;
648
649 /* 852 /*
650 * It's safe to allow irq's after cr2 has been saved and the 853 * It's safe to allow irq's after cr2 has been saved and the
651 * vmalloc fault has been handled. 854 * vmalloc fault has been handled.
@@ -661,15 +864,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
661 864
662#ifdef CONFIG_X86_64 865#ifdef CONFIG_X86_64
663 if (unlikely(error_code & PF_RSVD)) 866 if (unlikely(error_code & PF_RSVD))
664 pgtable_bad(address, regs, error_code); 867 pgtable_bad(regs, error_code, address);
665#endif 868#endif
666 869
667 /* 870 /*
668 * If we're in an interrupt, have no user context or are running in an 871 * If we're in an interrupt, have no user context or are running in an
669 * atomic region then we must not take the fault. 872 * atomic region then we must not take the fault.
670 */ 873 */
671 if (unlikely(in_atomic() || !mm)) 874 if (unlikely(in_atomic() || !mm)) {
672 goto bad_area_nosemaphore; 875 bad_area_nosemaphore(regs, error_code, address);
876 return;
877 }
673 878
674 /* 879 /*
675 * When running in the kernel we expect faults to occur only to 880 * When running in the kernel we expect faults to occur only to
@@ -687,20 +892,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
687 * source. If this is invalid we can skip the address space check, 892 * source. If this is invalid we can skip the address space check,
688 * thus avoiding the deadlock. 893 * thus avoiding the deadlock.
689 */ 894 */
690 if (!down_read_trylock(&mm->mmap_sem)) { 895 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
691 if ((error_code & PF_USER) == 0 && 896 if ((error_code & PF_USER) == 0 &&
692 !search_exception_tables(regs->ip)) 897 !search_exception_tables(regs->ip)) {
693 goto bad_area_nosemaphore; 898 bad_area_nosemaphore(regs, error_code, address);
899 return;
900 }
694 down_read(&mm->mmap_sem); 901 down_read(&mm->mmap_sem);
695 } 902 }
696 903
697 vma = find_vma(mm, address); 904 vma = find_vma(mm, address);
698 if (!vma) 905 if (unlikely(!vma)) {
699 goto bad_area; 906 bad_area(regs, error_code, address);
700 if (vma->vm_start <= address) 907 return;
908 }
909 if (likely(vma->vm_start <= address))
701 goto good_area; 910 goto good_area;
702 if (!(vma->vm_flags & VM_GROWSDOWN)) 911 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
703 goto bad_area; 912 bad_area(regs, error_code, address);
913 return;
914 }
704 if (error_code & PF_USER) { 915 if (error_code & PF_USER) {
705 /* 916 /*
706 * Accessing the stack below %sp is always a bug. 917 * Accessing the stack below %sp is always a bug.
@@ -708,31 +919,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
708 * and pusha to work. ("enter $65535,$31" pushes 919 * and pusha to work. ("enter $65535,$31" pushes
709 * 32 pointers and then decrements %sp by 65535.) 920 * 32 pointers and then decrements %sp by 65535.)
710 */ 921 */
711 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) 922 if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
712 goto bad_area; 923 bad_area(regs, error_code, address);
924 return;
925 }
713 } 926 }
714 if (expand_stack(vma, address)) 927 if (unlikely(expand_stack(vma, address))) {
715 goto bad_area; 928 bad_area(regs, error_code, address);
716/* 929 return;
717 * Ok, we have a good vm_area for this memory access, so 930 }
718 * we can handle it.. 931
719 */ 932 /*
933 * Ok, we have a good vm_area for this memory access, so
934 * we can handle it..
935 */
720good_area: 936good_area:
721 si_code = SEGV_ACCERR; 937 write = error_code & PF_WRITE;
722 write = 0; 938 if (unlikely(access_error(error_code, write, vma))) {
723 switch (error_code & (PF_PROT|PF_WRITE)) { 939 bad_area_access_error(regs, error_code, address);
724 default: /* 3: write, present */ 940 return;
725 /* fall through */
726 case PF_WRITE: /* write, not present */
727 if (!(vma->vm_flags & VM_WRITE))
728 goto bad_area;
729 write++;
730 break;
731 case PF_PROT: /* read, present */
732 goto bad_area;
733 case 0: /* read, not present */
734 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
735 goto bad_area;
736 } 941 }
737 942
738 /* 943 /*
@@ -742,11 +947,8 @@ good_area:
742 */ 947 */
743 fault = handle_mm_fault(mm, vma, address, write); 948 fault = handle_mm_fault(mm, vma, address, write);
744 if (unlikely(fault & VM_FAULT_ERROR)) { 949 if (unlikely(fault & VM_FAULT_ERROR)) {
745 if (fault & VM_FAULT_OOM) 950 mm_fault_error(regs, error_code, address, fault);
746 goto out_of_memory; 951 return;
747 else if (fault & VM_FAULT_SIGBUS)
748 goto do_sigbus;
749 BUG();
750 } 952 }
751 if (fault & VM_FAULT_MAJOR) 953 if (fault & VM_FAULT_MAJOR)
752 tsk->maj_flt++; 954 tsk->maj_flt++;
@@ -764,128 +966,6 @@ good_area:
764 } 966 }
765#endif 967#endif
766 up_read(&mm->mmap_sem); 968 up_read(&mm->mmap_sem);
767 return;
768
769/*
770 * Something tried to access memory that isn't in our memory map..
771 * Fix it, but check if it's kernel or user first..
772 */
773bad_area:
774 up_read(&mm->mmap_sem);
775
776bad_area_nosemaphore:
777 /* User mode accesses just cause a SIGSEGV */
778 if (error_code & PF_USER) {
779 /*
780 * It's possible to have interrupts off here.
781 */
782 local_irq_enable();
783
784 /*
785 * Valid to do another page fault here because this one came
786 * from user space.
787 */
788 if (is_prefetch(regs, address, error_code))
789 return;
790
791 if (is_errata100(regs, address))
792 return;
793
794 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
795 printk_ratelimit()) {
796 printk(
797 "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
798 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
799 tsk->comm, task_pid_nr(tsk), address,
800 (void *) regs->ip, (void *) regs->sp, error_code);
801 print_vma_addr(" in ", regs->ip);
802 printk("\n");
803 }
804
805 tsk->thread.cr2 = address;
806 /* Kernel addresses are always protection faults */
807 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
808 tsk->thread.trap_no = 14;
809 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
810 return;
811 }
812
813 if (is_f00f_bug(regs, address))
814 return;
815
816no_context:
817 /* Are we prepared to handle this kernel fault? */
818 if (fixup_exception(regs))
819 return;
820
821 /*
822 * X86_32
823 * Valid to do another page fault here, because if this fault
824 * had been triggered by is_prefetch fixup_exception would have
825 * handled it.
826 *
827 * X86_64
828 * Hall of shame of CPU/BIOS bugs.
829 */
830 if (is_prefetch(regs, address, error_code))
831 return;
832
833 if (is_errata93(regs, address))
834 return;
835
836/*
837 * Oops. The kernel tried to access some bad page. We'll have to
838 * terminate things with extreme prejudice.
839 */
840#ifdef CONFIG_X86_32
841 bust_spinlocks(1);
842#else
843 flags = oops_begin();
844#endif
845
846 show_fault_oops(regs, error_code, address);
847
848 tsk->thread.cr2 = address;
849 tsk->thread.trap_no = 14;
850 tsk->thread.error_code = error_code;
851
852#ifdef CONFIG_X86_32
853 die("Oops", regs, error_code);
854 bust_spinlocks(0);
855 do_exit(SIGKILL);
856#else
857 sig = SIGKILL;
858 if (__die("Oops", regs, error_code))
859 sig = 0;
860 /* Executive summary in case the body of the oops scrolled away */
861 printk(KERN_EMERG "CR2: %016lx\n", address);
862 oops_end(flags, regs, sig);
863#endif
864
865out_of_memory:
866 /*
867 * We ran out of memory, call the OOM killer, and return the userspace
868 * (which will retry the fault, or kill us if we got oom-killed).
869 */
870 up_read(&mm->mmap_sem);
871 pagefault_out_of_memory();
872 return;
873
874do_sigbus:
875 up_read(&mm->mmap_sem);
876
877 /* Kernel mode? Handle exceptions or die */
878 if (!(error_code & PF_USER))
879 goto no_context;
880#ifdef CONFIG_X86_32
881 /* User space => ok to do another page fault */
882 if (is_prefetch(regs, address, error_code))
883 return;
884#endif
885 tsk->thread.cr2 = address;
886 tsk->thread.error_code = error_code;
887 tsk->thread.trap_no = 14;
888 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
889} 969}
890 970
891DEFINE_SPINLOCK(pgd_lock); 971DEFINE_SPINLOCK(pgd_lock);
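
The bulk of the fault.c change is the conversion of the goto-label error paths into noinline helpers, so the main body of do_page_fault() reads straight through and every error site reports and returns. The resulting structure, condensed from the hunks above:

        /* Error handling after the conversion (sketch of the call graph):
         *
         *   bad_area()              -> __bad_area(SEGV_MAPERR): up_read(), then nosemaphore path
         *   bad_area_access_error() -> __bad_area(SEGV_ACCERR)
         *   bad_area_nosemaphore()  -> __bad_area_nosemaphore(SEGV_MAPERR)
         *   mm_fault_error()        -> out_of_memory() or do_sigbus()
         *   no_context()            -> fixup_exception(), is_prefetch(), is_errata93() or oops
         */
        fault = handle_mm_fault(mm, vma, address, write);
        if (unlikely(fault & VM_FAULT_ERROR)) {
                mm_fault_error(regs, error_code, address, fault);
                return;
        }
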
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2cef05074413..00263bf07a88 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,7 +49,6 @@
49#include <asm/paravirt.h> 49#include <asm/paravirt.h>
50#include <asm/setup.h> 50#include <asm/setup.h>
51#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
52#include <asm/smp.h>
53 52
54unsigned int __VMALLOC_RESERVE = 128 << 20; 53unsigned int __VMALLOC_RESERVE = 128 << 20;
55 54
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 09737c8af074..15df1baee100 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -21,6 +21,7 @@
21#include <asm/numa.h> 21#include <asm/numa.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/genapic.h> 23#include <asm/genapic.h>
24#include <asm/uv/uv.h>
24 25
25int acpi_numa __initdata; 26int acpi_numa __initdata;
26 27
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/mm/tlb.c
index f8be6f1d2e48..72a6d4ebe34d 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/mm/tlb.c
@@ -1,22 +1,18 @@
1#include <linux/init.h> 1#include <linux/init.h>
2 2
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <linux/delay.h>
5#include <linux/spinlock.h> 4#include <linux/spinlock.h>
6#include <linux/smp.h> 5#include <linux/smp.h>
7#include <linux/kernel_stat.h>
8#include <linux/mc146818rtc.h>
9#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/module.h>
10 8
11#include <asm/mtrr.h>
12#include <asm/pgalloc.h>
13#include <asm/tlbflush.h> 9#include <asm/tlbflush.h>
14#include <asm/mmu_context.h> 10#include <asm/mmu_context.h>
15#include <asm/proto.h> 11#include <asm/apic.h>
16#include <asm/apicdef.h> 12#include <asm/uv/uv.h>
17#include <asm/idle.h> 13
18#include <asm/uv/uv_hub.h> 14DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
19#include <asm/uv/uv_bau.h> 15 = { &init_mm, 0, };
20 16
21#include <mach_ipi.h> 17#include <mach_ipi.h>
22/* 18/*
@@ -33,7 +29,7 @@
33 * To avoid global state use 8 different call vectors. 29 * To avoid global state use 8 different call vectors.
34 * Each CPU uses a specific vector to trigger flushes on other 30 * Each CPU uses a specific vector to trigger flushes on other
35 * CPUs. Depending on the received vector the target CPUs look into 31 * CPUs. Depending on the received vector the target CPUs look into
36 * the right per cpu variable for the flush data. 32 * the right array slot for the flush data.
37 * 33 *
38 * With more than 8 CPUs they are hashed to the 8 available 34 * With more than 8 CPUs they are hashed to the 8 available
39 * vectors. The limited global vector space forces us to this right now. 35 * vectors. The limited global vector space forces us to this right now.
@@ -43,18 +39,18 @@
43 39
44union smp_flush_state { 40union smp_flush_state {
45 struct { 41 struct {
46 cpumask_t flush_cpumask;
47 struct mm_struct *flush_mm; 42 struct mm_struct *flush_mm;
48 unsigned long flush_va; 43 unsigned long flush_va;
49 spinlock_t tlbstate_lock; 44 spinlock_t tlbstate_lock;
45 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
50 }; 46 };
51 char pad[SMP_CACHE_BYTES]; 47 char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
52} ____cacheline_aligned; 48} ____cacheline_internodealigned_in_smp;
53 49
54/* State is put into the per CPU data section, but padded 50/* State is put into the per CPU data section, but padded
55 to a full cache line because other CPUs can access it and we don't 51 to a full cache line because other CPUs can access it and we don't
56 want false sharing in the per cpu data segment. */ 52 want false sharing in the per cpu data segment. */
57static DEFINE_PER_CPU(union smp_flush_state, flush_state); 53static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
58 54
59/* 55/*
60 * We cannot call mmdrop() because we are in interrupt context, 56 * We cannot call mmdrop() because we are in interrupt context,
@@ -62,9 +58,9 @@ static DEFINE_PER_CPU(union smp_flush_state, flush_state);
62 */ 58 */
63void leave_mm(int cpu) 59void leave_mm(int cpu)
64{ 60{
65 if (read_pda(mmu_state) == TLBSTATE_OK) 61 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
66 BUG(); 62 BUG();
67 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); 63 cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
68 load_cr3(swapper_pg_dir); 64 load_cr3(swapper_pg_dir);
69} 65}
70EXPORT_SYMBOL_GPL(leave_mm); 66EXPORT_SYMBOL_GPL(leave_mm);
@@ -117,10 +113,20 @@ EXPORT_SYMBOL_GPL(leave_mm);
117 * Interrupts are disabled. 113 * Interrupts are disabled.
118 */ 114 */
119 115
120asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) 116/*
117 * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop
118 * but still used for documentation purpose but the usage is slightly
119 * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt
120 * entry calls in with the first parameter in %eax. Maybe define
121 * intrlinkage?
122 */
123#ifdef CONFIG_X86_64
124asmlinkage
125#endif
126void smp_invalidate_interrupt(struct pt_regs *regs)
121{ 127{
122 int cpu; 128 unsigned int cpu;
123 int sender; 129 unsigned int sender;
124 union smp_flush_state *f; 130 union smp_flush_state *f;
125 131
126 cpu = smp_processor_id(); 132 cpu = smp_processor_id();
@@ -129,9 +135,9 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
129 * Use that to determine where the sender put the data. 135 * Use that to determine where the sender put the data.
130 */ 136 */
131 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; 137 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
132 f = &per_cpu(flush_state, sender); 138 f = &flush_state[sender];
133 139
134 if (!cpu_isset(cpu, f->flush_cpumask)) 140 if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
135 goto out; 141 goto out;
136 /* 142 /*
137 * This was a BUG() but until someone can quote me the 143 * This was a BUG() but until someone can quote me the
@@ -142,8 +148,8 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
142 * BUG(); 148 * BUG();
143 */ 149 */
144 150
145 if (f->flush_mm == read_pda(active_mm)) { 151 if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
146 if (read_pda(mmu_state) == TLBSTATE_OK) { 152 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
147 if (f->flush_va == TLB_FLUSH_ALL) 153 if (f->flush_va == TLB_FLUSH_ALL)
148 local_flush_tlb(); 154 local_flush_tlb();
149 else 155 else
@@ -153,23 +159,21 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
153 } 159 }
154out: 160out:
155 ack_APIC_irq(); 161 ack_APIC_irq();
156 cpu_clear(cpu, f->flush_cpumask); 162 smp_mb__before_clear_bit();
163 cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
164 smp_mb__after_clear_bit();
157 inc_irq_stat(irq_tlb_count); 165 inc_irq_stat(irq_tlb_count);
158} 166}
159 167
160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, 168static void flush_tlb_others_ipi(const struct cpumask *cpumask,
161 unsigned long va) 169 struct mm_struct *mm, unsigned long va)
162{ 170{
163 int sender; 171 unsigned int sender;
164 union smp_flush_state *f; 172 union smp_flush_state *f;
165 cpumask_t cpumask = *cpumaskp;
166
167 if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
168 return;
169 173
170 /* Caller has disabled preemption */ 174 /* Caller has disabled preemption */
171 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 175 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
172 f = &per_cpu(flush_state, sender); 176 f = &flush_state[sender];
173 177
174 /* 178 /*
175 * Could avoid this lock when 179 * Could avoid this lock when
@@ -180,7 +184,8 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
180 184
181 f->flush_mm = mm; 185 f->flush_mm = mm;
182 f->flush_va = va; 186 f->flush_va = va;
183 cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); 187 cpumask_andnot(to_cpumask(f->flush_cpumask),
188 cpumask, cpumask_of(smp_processor_id()));
184 189
185 /* 190 /*
186 * Make the above memory operations globally visible before 191 * Make the above memory operations globally visible before
@@ -191,9 +196,10 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
191 * We have to send the IPI only to 196 * We have to send the IPI only to
192 * CPUs affected. 197 * CPUs affected.
193 */ 198 */
194 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender); 199 send_IPI_mask(to_cpumask(f->flush_cpumask),
200 INVALIDATE_TLB_VECTOR_START + sender);
195 201
196 while (!cpus_empty(f->flush_cpumask)) 202 while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
197 cpu_relax(); 203 cpu_relax();
198 204
199 f->flush_mm = NULL; 205 f->flush_mm = NULL;
@@ -201,12 +207,28 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
201 spin_unlock(&f->tlbstate_lock); 207 spin_unlock(&f->tlbstate_lock);
202} 208}
203 209
210void native_flush_tlb_others(const struct cpumask *cpumask,
211 struct mm_struct *mm, unsigned long va)
212{
213 if (is_uv_system()) {
214 unsigned int cpu;
215
216 cpu = get_cpu();
217 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
218 if (cpumask)
219 flush_tlb_others_ipi(cpumask, mm, va);
220 put_cpu();
221 return;
222 }
223 flush_tlb_others_ipi(cpumask, mm, va);
224}
225
204static int __cpuinit init_smp_flush(void) 226static int __cpuinit init_smp_flush(void)
205{ 227{
206 int i; 228 int i;
207 229
208 for_each_possible_cpu(i) 230 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
209 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); 231 spin_lock_init(&flush_state[i].tlbstate_lock);
210 232
211 return 0; 233 return 0;
212} 234}
@@ -215,25 +237,18 @@ core_initcall(init_smp_flush);
215void flush_tlb_current_task(void) 237void flush_tlb_current_task(void)
216{ 238{
217 struct mm_struct *mm = current->mm; 239 struct mm_struct *mm = current->mm;
218 cpumask_t cpu_mask;
219 240
220 preempt_disable(); 241 preempt_disable();
221 cpu_mask = mm->cpu_vm_mask;
222 cpu_clear(smp_processor_id(), cpu_mask);
223 242
224 local_flush_tlb(); 243 local_flush_tlb();
225 if (!cpus_empty(cpu_mask)) 244 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
226 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); 245 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
227 preempt_enable(); 246 preempt_enable();
228} 247}
229 248
230void flush_tlb_mm(struct mm_struct *mm) 249void flush_tlb_mm(struct mm_struct *mm)
231{ 250{
232 cpumask_t cpu_mask;
233
234 preempt_disable(); 251 preempt_disable();
235 cpu_mask = mm->cpu_vm_mask;
236 cpu_clear(smp_processor_id(), cpu_mask);
237 252
238 if (current->active_mm == mm) { 253 if (current->active_mm == mm) {
239 if (current->mm) 254 if (current->mm)
@@ -241,8 +256,8 @@ void flush_tlb_mm(struct mm_struct *mm)
241 else 256 else
242 leave_mm(smp_processor_id()); 257 leave_mm(smp_processor_id());
243 } 258 }
244 if (!cpus_empty(cpu_mask)) 259 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
245 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); 260 flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
246 261
247 preempt_enable(); 262 preempt_enable();
248} 263}
@@ -250,11 +265,8 @@ void flush_tlb_mm(struct mm_struct *mm)
250void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) 265void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
251{ 266{
252 struct mm_struct *mm = vma->vm_mm; 267 struct mm_struct *mm = vma->vm_mm;
253 cpumask_t cpu_mask;
254 268
255 preempt_disable(); 269 preempt_disable();
256 cpu_mask = mm->cpu_vm_mask;
257 cpu_clear(smp_processor_id(), cpu_mask);
258 270
259 if (current->active_mm == mm) { 271 if (current->active_mm == mm) {
260 if (current->mm) 272 if (current->mm)
@@ -263,8 +275,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
263 leave_mm(smp_processor_id()); 275 leave_mm(smp_processor_id());
264 } 276 }
265 277
266 if (!cpus_empty(cpu_mask)) 278 if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
267 flush_tlb_others(cpu_mask, mm, va); 279 flush_tlb_others(&mm->cpu_vm_mask, mm, va);
268 280
269 preempt_enable(); 281 preempt_enable();
270} 282}
@@ -274,7 +286,7 @@ static void do_flush_tlb_all(void *info)
274 unsigned long cpu = smp_processor_id(); 286 unsigned long cpu = smp_processor_id();
275 287
276 __flush_tlb_all(); 288 __flush_tlb_all();
277 if (read_pda(mmu_state) == TLBSTATE_LAZY) 289 if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
278 leave_mm(cpu); 290 leave_mm(cpu);
279} 291}
280 292
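
The TLB-flush hunks above keep replacing the old "copy cpu_vm_mask, clear ourselves, test for emptiness" sequence with a single cpumask_any_but() test on the mm's mask. A minimal sketch of that idiom, not part of the patch (the helper name is illustrative):

	#include <linux/cpumask.h>
	#include <linux/smp.h>

	/*
	 * Sketch: true when @mask contains at least one CPU besides @cpu.
	 * cpumask_any_but() returns >= nr_cpu_ids when no such CPU exists,
	 * so no on-stack copy of the mask is needed.
	 */
	static inline bool any_other_cpu(const struct cpumask *mask, unsigned int cpu)
	{
		return cpumask_any_but(mask, cpu) < nr_cpu_ids;
	}
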
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864ad49a7..c638685136e1 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
40 40
41 switch (val) { 41 switch (val) {
42 case DIE_NMI: 42 case DIE_NMI:
43 if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) 43 case DIE_NMI_IPI:
44 ret = NOTIFY_STOP; 44 model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
45 ret = NOTIFY_STOP;
45 break; 46 break;
46 default: 47 default:
47 break; 48 break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
134static struct notifier_block profile_exceptions_nb = { 135static struct notifier_block profile_exceptions_nb = {
135 .notifier_call = profile_exceptions_notify, 136 .notifier_call = profile_exceptions_notify,
136 .next = NULL, 137 .next = NULL,
137 .priority = 0 138 .priority = 2
138}; 139};
139 140
140static int nmi_setup(void) 141static int nmi_setup(void)
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index e9f80c744cf3..85eb6268374f 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/apic.h> 19#include <asm/apic.h>
20#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h> 21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
@@ -126,6 +126,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
126 u64 val; 126 u64 val;
127 int i; 127 int i;
128 128
129 /*
130 * This can happen if perf counters are in use when
131 * we steal the die notifier NMI.
132 */
133 if (unlikely(!reset_value))
134 goto out;
135
129 for (i = 0 ; i < num_counters; ++i) { 136 for (i = 0 ; i < num_counters; ++i) {
130 if (!reset_value[i]) 137 if (!reset_value[i])
131 continue; 138 continue;
@@ -136,6 +143,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
136 } 143 }
137 } 144 }
138 145
146out:
139 /* Only P6 based Pentium M need to re-unmask the apic vector but it 147 /* Only P6 based Pentium M need to re-unmask the apic vector but it
140 * doesn't hurt other P6 variant */ 148 * doesn't hurt other P6 variant */
141 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); 149 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bea215230b20..bef941f61451 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -634,35 +634,27 @@ static void xen_flush_tlb_single(unsigned long addr)
634 preempt_enable(); 634 preempt_enable();
635} 635}
636 636
637static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, 637static void xen_flush_tlb_others(const struct cpumask *cpus,
638 unsigned long va) 638 struct mm_struct *mm, unsigned long va)
639{ 639{
640 struct { 640 struct {
641 struct mmuext_op op; 641 struct mmuext_op op;
642 cpumask_t mask; 642 DECLARE_BITMAP(mask, NR_CPUS);
643 } *args; 643 } *args;
644 cpumask_t cpumask = *cpus;
645 struct multicall_space mcs; 644 struct multicall_space mcs;
646 645
647 /* 646 BUG_ON(cpumask_empty(cpus));
648 * A couple of (to be removed) sanity checks:
649 *
650 * - current CPU must not be in mask
651 * - mask must exist :)
652 */
653 BUG_ON(cpus_empty(cpumask));
654 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
655 BUG_ON(!mm); 647 BUG_ON(!mm);
656 648
657 /* If a CPU which we ran on has gone down, OK. */
658 cpus_and(cpumask, cpumask, cpu_online_map);
659 if (cpus_empty(cpumask))
660 return;
661
662 mcs = xen_mc_entry(sizeof(*args)); 649 mcs = xen_mc_entry(sizeof(*args));
663 args = mcs.args; 650 args = mcs.args;
664 args->mask = cpumask; 651 args->op.arg2.vcpumask = to_cpumask(args->mask);
665 args->op.arg2.vcpumask = &args->mask; 652
653 /* Remove us, and any offline CPUS. */
654 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
655 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
656 if (unlikely(cpumask_empty(to_cpumask(args->mask))))
657 goto issue;
666 658
667 if (va == TLB_FLUSH_ALL) { 659 if (va == TLB_FLUSH_ALL) {
668 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 660 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
@@ -673,6 +665,7 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
673 665
674 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 666 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
675 667
668issue:
676 xen_mc_issue(PARAVIRT_LAZY_MMU); 669 xen_mc_issue(PARAVIRT_LAZY_MMU);
677} 670}
678 671
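
Before issuing the multicall, the reworked xen_flush_tlb_others() narrows the target mask in place: AND it with cpu_online_mask, then clear the local CPU. A hedged sketch of that step in isolation, not from the patch (helper and parameter names are illustrative, and the caller is assumed to run with preemption disabled):

	#include <linux/cpumask.h>
	#include <linux/smp.h>

	/* Sketch: reduce @src to online CPUs other than the calling CPU. */
	static void online_others(struct cpumask *dst, const struct cpumask *src)
	{
		cpumask_and(dst, src, cpu_online_mask);
		cpumask_clear_cpu(smp_processor_id(), dst);
	}
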
@@ -702,17 +695,17 @@ static void xen_write_cr0(unsigned long cr0)
702 695
703static void xen_write_cr2(unsigned long cr2) 696static void xen_write_cr2(unsigned long cr2)
704{ 697{
705 x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; 698 percpu_read(xen_vcpu)->arch.cr2 = cr2;
706} 699}
707 700
708static unsigned long xen_read_cr2(void) 701static unsigned long xen_read_cr2(void)
709{ 702{
710 return x86_read_percpu(xen_vcpu)->arch.cr2; 703 return percpu_read(xen_vcpu)->arch.cr2;
711} 704}
712 705
713static unsigned long xen_read_cr2_direct(void) 706static unsigned long xen_read_cr2_direct(void)
714{ 707{
715 return x86_read_percpu(xen_vcpu_info.arch.cr2); 708 return percpu_read(xen_vcpu_info.arch.cr2);
716} 709}
717 710
718static void xen_write_cr4(unsigned long cr4) 711static void xen_write_cr4(unsigned long cr4)
@@ -725,12 +718,12 @@ static void xen_write_cr4(unsigned long cr4)
725 718
726static unsigned long xen_read_cr3(void) 719static unsigned long xen_read_cr3(void)
727{ 720{
728 return x86_read_percpu(xen_cr3); 721 return percpu_read(xen_cr3);
729} 722}
730 723
731static void set_current_cr3(void *v) 724static void set_current_cr3(void *v)
732{ 725{
733 x86_write_percpu(xen_current_cr3, (unsigned long)v); 726 percpu_write(xen_current_cr3, (unsigned long)v);
734} 727}
735 728
736static void __xen_write_cr3(bool kernel, unsigned long cr3) 729static void __xen_write_cr3(bool kernel, unsigned long cr3)
@@ -755,7 +748,7 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
755 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 748 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
756 749
757 if (kernel) { 750 if (kernel) {
758 x86_write_percpu(xen_cr3, cr3); 751 percpu_write(xen_cr3, cr3);
759 752
760 /* Update xen_current_cr3 once the batch has actually 753 /* Update xen_current_cr3 once the batch has actually
761 been submitted. */ 754 been submitted. */
@@ -771,7 +764,7 @@ static void xen_write_cr3(unsigned long cr3)
771 764
772 /* Update while interrupts are disabled, so its atomic with 765 /* Update while interrupts are disabled, so its atomic with
773 respect to ipis */ 766 respect to ipis */
774 x86_write_percpu(xen_cr3, cr3); 767 percpu_write(xen_cr3, cr3);
775 768
776 __xen_write_cr3(true, cr3); 769 __xen_write_cr3(true, cr3);
777 770
@@ -1652,7 +1645,6 @@ asmlinkage void __init xen_start_kernel(void)
1652#ifdef CONFIG_X86_64 1645#ifdef CONFIG_X86_64
1653 /* Disable until direct per-cpu data access. */ 1646 /* Disable until direct per-cpu data access. */
1654 have_vcpu_info_placement = 0; 1647 have_vcpu_info_placement = 0;
1655 x86_64_init_pda();
1656#endif 1648#endif
1657 1649
1658 xen_smp_init(); 1650 xen_smp_init();
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index bb042608c602..2e8271431e1a 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -39,7 +39,7 @@ static unsigned long xen_save_fl(void)
39 struct vcpu_info *vcpu; 39 struct vcpu_info *vcpu;
40 unsigned long flags; 40 unsigned long flags;
41 41
42 vcpu = x86_read_percpu(xen_vcpu); 42 vcpu = percpu_read(xen_vcpu);
43 43
44 /* flag has opposite sense of mask */ 44 /* flag has opposite sense of mask */
45 flags = !vcpu->evtchn_upcall_mask; 45 flags = !vcpu->evtchn_upcall_mask;
@@ -62,7 +62,7 @@ static void xen_restore_fl(unsigned long flags)
62 make sure we don't switch CPUs between getting the vcpu 62 make sure we don't switch CPUs between getting the vcpu
63 pointer and updating the mask. */ 63 pointer and updating the mask. */
64 preempt_disable(); 64 preempt_disable();
65 vcpu = x86_read_percpu(xen_vcpu); 65 vcpu = percpu_read(xen_vcpu);
66 vcpu->evtchn_upcall_mask = flags; 66 vcpu->evtchn_upcall_mask = flags;
67 preempt_enable_no_resched(); 67 preempt_enable_no_resched();
68 68
@@ -83,7 +83,7 @@ static void xen_irq_disable(void)
83 make sure we don't switch CPUs between getting the vcpu 83 make sure we don't switch CPUs between getting the vcpu
84 pointer and updating the mask. */ 84 pointer and updating the mask. */
85 preempt_disable(); 85 preempt_disable();
86 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; 86 percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
87 preempt_enable_no_resched(); 87 preempt_enable_no_resched();
88} 88}
89 89
@@ -96,7 +96,7 @@ static void xen_irq_enable(void)
96 the caller is confused and is trying to re-enable interrupts 96 the caller is confused and is trying to re-enable interrupts
97 on an indeterminate processor. */ 97 on an indeterminate processor. */
98 98
99 vcpu = x86_read_percpu(xen_vcpu); 99 vcpu = percpu_read(xen_vcpu);
100 vcpu->evtchn_upcall_mask = 0; 100 vcpu->evtchn_upcall_mask = 0;
101 101
102 /* Doesn't matter if we get preempted here, because any 102 /* Doesn't matter if we get preempted here, because any
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 503c240e26c7..98cb9869eb24 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1063,18 +1063,14 @@ static void drop_other_mm_ref(void *info)
1063 struct mm_struct *mm = info; 1063 struct mm_struct *mm = info;
1064 struct mm_struct *active_mm; 1064 struct mm_struct *active_mm;
1065 1065
1066#ifdef CONFIG_X86_64 1066 active_mm = percpu_read(cpu_tlbstate.active_mm);
1067 active_mm = read_pda(active_mm);
1068#else
1069 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
1070#endif
1071 1067
1072 if (active_mm == mm) 1068 if (active_mm == mm)
1073 leave_mm(smp_processor_id()); 1069 leave_mm(smp_processor_id());
1074 1070
1075 /* If this cpu still has a stale cr3 reference, then make sure 1071 /* If this cpu still has a stale cr3 reference, then make sure
1076 it has been flushed. */ 1072 it has been flushed. */
1077 if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) { 1073 if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
1078 load_cr3(swapper_pg_dir); 1074 load_cr3(swapper_pg_dir);
1079 arch_flush_lazy_cpu_mode(); 1075 arch_flush_lazy_cpu_mode();
1080 } 1076 }
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index fa3e10725d98..9e565da5d1f7 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -41,7 +41,7 @@ static inline void xen_mc_issue(unsigned mode)
41 xen_mc_flush(); 41 xen_mc_flush();
42 42
43 /* restore flags saved in xen_mc_batch */ 43 /* restore flags saved in xen_mc_batch */
44 local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); 44 local_irq_restore(percpu_read(xen_mc_irq_flags));
45} 45}
46 46
47/* Set up a callback to be called when the current batch is flushed */ 47/* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c44e2069c7c7..72c2eb9b64cd 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -50,11 +50,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
50 */ 50 */
51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
52{ 52{
53#ifdef CONFIG_X86_32 53 inc_irq_stat(irq_resched_count);
54 __get_cpu_var(irq_stat).irq_resched_count++;
55#else
56 add_pda(irq_resched_count, 1);
57#endif
58 54
59 return IRQ_HANDLED; 55 return IRQ_HANDLED;
60} 56}
@@ -78,7 +74,7 @@ static __cpuinit void cpu_bringup(void)
78 xen_setup_cpu_clockevents(); 74 xen_setup_cpu_clockevents();
79 75
80 cpu_set(cpu, cpu_online_map); 76 cpu_set(cpu, cpu_online_map);
81 x86_write_percpu(cpu_state, CPU_ONLINE); 77 percpu_write(cpu_state, CPU_ONLINE);
82 wmb(); 78 wmb();
83 79
84 /* We can take interrupts now: we're officially "up". */ 80 /* We can take interrupts now: we're officially "up". */
@@ -283,22 +279,11 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
283 struct task_struct *idle = idle_task(cpu); 279 struct task_struct *idle = idle_task(cpu);
284 int rc; 280 int rc;
285 281
286#ifdef CONFIG_X86_64 282 per_cpu(current_task, cpu) = idle;
287 /* Allocate node local memory for AP pdas */
288 WARN_ON(cpu == 0);
289 if (cpu > 0) {
290 rc = get_local_pda(cpu);
291 if (rc)
292 return rc;
293 }
294#endif
295
296#ifdef CONFIG_X86_32 283#ifdef CONFIG_X86_32
297 init_gdt(cpu); 284 init_gdt(cpu);
298 per_cpu(current_task, cpu) = idle;
299 irq_ctx_init(cpu); 285 irq_ctx_init(cpu);
300#else 286#else
301 cpu_pda(cpu)->pcurrent = idle;
302 clear_tsk_thread_flag(idle, TIF_FORK); 287 clear_tsk_thread_flag(idle, TIF_FORK);
303#endif 288#endif
304 xen_setup_timer(cpu); 289 xen_setup_timer(cpu);
@@ -445,11 +430,7 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
445{ 430{
446 irq_enter(); 431 irq_enter();
447 generic_smp_call_function_interrupt(); 432 generic_smp_call_function_interrupt();
448#ifdef CONFIG_X86_32 433 inc_irq_stat(irq_call_count);
449 __get_cpu_var(irq_stat).irq_call_count++;
450#else
451 add_pda(irq_call_count, 1);
452#endif
453 irq_exit(); 434 irq_exit();
454 435
455 return IRQ_HANDLED; 436 return IRQ_HANDLED;
@@ -459,11 +440,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
459{ 440{
460 irq_enter(); 441 irq_enter();
461 generic_smp_call_function_single_interrupt(); 442 generic_smp_call_function_single_interrupt();
462#ifdef CONFIG_X86_32 443 inc_irq_stat(irq_call_count);
463 __get_cpu_var(irq_stat).irq_call_count++;
464#else
465 add_pda(irq_call_count, 1);
466#endif
467 irq_exit(); 444 irq_exit();
468 445
469 return IRQ_HANDLED; 446 return IRQ_HANDLED;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 212ffe012b76..95be7b434724 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -6,6 +6,7 @@
6 6
7#include <asm/xen/hypercall.h> 7#include <asm/xen/hypercall.h>
8#include <asm/xen/page.h> 8#include <asm/xen/page.h>
9#include <asm/fixmap.h>
9 10
10#include "xen-ops.h" 11#include "xen-ops.h"
11#include "mmu.h" 12#include "mmu.h"
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 05794c566e87..d6fc51f4ce85 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -17,6 +17,7 @@
17#include <asm/processor-flags.h> 17#include <asm/processor-flags.h>
18#include <asm/errno.h> 18#include <asm/errno.h>
19#include <asm/segment.h> 19#include <asm/segment.h>
20#include <asm/percpu.h>
20 21
21#include <xen/interface/xen.h> 22#include <xen/interface/xen.h>
22 23
@@ -28,12 +29,10 @@
28 29
29#if 1 30#if 1
30/* 31/*
31 x86-64 does not yet support direct access to percpu variables 32 FIXME: x86_64 now can support direct access to percpu variables
32 via a segment override, so we just need to make sure this code 33 via a segment override. Update xen accordingly.
33 never gets used
34 */ 34 */
35#define BUG ud2a 35#define BUG ud2a
36#define PER_CPU_VAR(var, off) 0xdeadbeef
37#endif 36#endif
38 37
39/* 38/*
@@ -45,14 +44,14 @@ ENTRY(xen_irq_enable_direct)
45 BUG 44 BUG
46 45
47 /* Unmask events */ 46 /* Unmask events */
48 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 47 movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
49 48
50 /* Preempt here doesn't matter because that will deal with 49 /* Preempt here doesn't matter because that will deal with
51 any pending interrupts. The pending check may end up being 50 any pending interrupts. The pending check may end up being
52 run on the wrong CPU, but that doesn't hurt. */ 51 run on the wrong CPU, but that doesn't hurt. */
53 52
54 /* Test for pending */ 53 /* Test for pending */
55 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending) 54 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
56 jz 1f 55 jz 1f
57 56
582: call check_events 572: call check_events
@@ -69,7 +68,7 @@ ENDPATCH(xen_irq_enable_direct)
69ENTRY(xen_irq_disable_direct) 68ENTRY(xen_irq_disable_direct)
70 BUG 69 BUG
71 70
72 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 71 movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
73ENDPATCH(xen_irq_disable_direct) 72ENDPATCH(xen_irq_disable_direct)
74 ret 73 ret
75 ENDPROC(xen_irq_disable_direct) 74 ENDPROC(xen_irq_disable_direct)
@@ -87,7 +86,7 @@ ENDPATCH(xen_irq_disable_direct)
87ENTRY(xen_save_fl_direct) 86ENTRY(xen_save_fl_direct)
88 BUG 87 BUG
89 88
90 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 89 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
91 setz %ah 90 setz %ah
92 addb %ah,%ah 91 addb %ah,%ah
93ENDPATCH(xen_save_fl_direct) 92ENDPATCH(xen_save_fl_direct)
@@ -107,13 +106,13 @@ ENTRY(xen_restore_fl_direct)
107 BUG 106 BUG
108 107
109 testb $X86_EFLAGS_IF>>8, %ah 108 testb $X86_EFLAGS_IF>>8, %ah
110 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 109 setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
111 /* Preempt here doesn't matter because that will deal with 110 /* Preempt here doesn't matter because that will deal with
112 any pending interrupts. The pending check may end up being 111 any pending interrupts. The pending check may end up being
113 run on the wrong CPU, but that doesn't hurt. */ 112 run on the wrong CPU, but that doesn't hurt. */
114 113
115 /* check for unmasked and pending */ 114 /* check for unmasked and pending */
116 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending) 115 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
117 jz 1f 116 jz 1f
1182: call check_events 1172: call check_events
1191: 1181:
@@ -195,11 +194,11 @@ RELOC(xen_sysexit, 1b+1)
195ENTRY(xen_sysret64) 194ENTRY(xen_sysret64)
196 /* We're already on the usermode stack at this point, but still 195 /* We're already on the usermode stack at this point, but still
197 with the kernel gs, so we can easily switch back */ 196 with the kernel gs, so we can easily switch back */
198 movq %rsp, %gs:pda_oldrsp 197 movq %rsp, PER_CPU_VAR(old_rsp)
199 movq %gs:pda_kernelstack,%rsp 198 movq PER_CPU_VAR(kernel_stack),%rsp
200 199
201 pushq $__USER_DS 200 pushq $__USER_DS
202 pushq %gs:pda_oldrsp 201 pushq PER_CPU_VAR(old_rsp)
203 pushq %r11 202 pushq %r11
204 pushq $__USER_CS 203 pushq $__USER_CS
205 pushq %rcx 204 pushq %rcx
@@ -212,11 +211,11 @@ RELOC(xen_sysret64, 1b+1)
212ENTRY(xen_sysret32) 211ENTRY(xen_sysret32)
213 /* We're already on the usermode stack at this point, but still 212 /* We're already on the usermode stack at this point, but still
214 with the kernel gs, so we can easily switch back */ 213 with the kernel gs, so we can easily switch back */
215 movq %rsp, %gs:pda_oldrsp 214 movq %rsp, PER_CPU_VAR(old_rsp)
216 movq %gs:pda_kernelstack, %rsp 215 movq PER_CPU_VAR(kernel_stack), %rsp
217 216
218 pushq $__USER32_DS 217 pushq $__USER32_DS
219 pushq %gs:pda_oldrsp 218 pushq PER_CPU_VAR(old_rsp)
220 pushq %r11 219 pushq %r11
221 pushq $__USER32_CS 220 pushq $__USER32_CS
222 pushq %rcx 221 pushq %rcx
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 7bc22a471fe3..259f6e806314 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -824,8 +824,14 @@ static int acpi_idle_bm_check(void)
824 */ 824 */
825static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) 825static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
826{ 826{
827 u64 perf_flags;
828
827 /* Don't trace irqs off for idle */ 831 /* Don't trace irqs off for idle */
828 stop_critical_timings(); 832 stop_critical_timings();
833 perf_flags = hw_perf_save_disable();
829 if (cx->entry_method == ACPI_CSTATE_FFH) { 835 if (cx->entry_method == ACPI_CSTATE_FFH) {
830 /* Call into architectural FFH based C-state */ 836 /* Call into architectural FFH based C-state */
831 acpi_processor_ffh_cstate_enter(cx); 837 acpi_processor_ffh_cstate_enter(cx);
@@ -840,6 +846,8 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
840 gets asserted in time to freeze execution properly. */ 846 gets asserted in time to freeze execution properly. */
841 unused = inl(acpi_gbl_FADT.xpm_timer_block.address); 847 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
842 } 848 }
849 hw_perf_restore(perf_flags);
843 start_critical_timings(); 851 start_critical_timings();
844} 852}
845 853
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 719ee5c1c8d9..5b257a57bc57 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -107,7 +107,7 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
107/* 107/*
108 * Print cpu online, possible, present, and system maps 108 * Print cpu online, possible, present, and system maps
109 */ 109 */
110static ssize_t print_cpus_map(char *buf, cpumask_t *map) 110static ssize_t print_cpus_map(char *buf, const struct cpumask *map)
111{ 111{
112 int n = cpulist_scnprintf(buf, PAGE_SIZE-2, map); 112 int n = cpulist_scnprintf(buf, PAGE_SIZE-2, map);
113 113
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index a778fb52b11f..bf6b13206d00 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -31,7 +31,10 @@
31#include <linux/hardirq.h> 31#include <linux/hardirq.h>
32#include <linux/topology.h> 32#include <linux/topology.h>
33 33
34#define define_one_ro(_name) \ 34#define define_one_ro_named(_name, _func) \
35static SYSDEV_ATTR(_name, 0444, _func, NULL)
36
37#define define_one_ro(_name) \
35static SYSDEV_ATTR(_name, 0444, show_##_name, NULL) 38static SYSDEV_ATTR(_name, 0444, show_##_name, NULL)
36 39
37#define define_id_show_func(name) \ 40#define define_id_show_func(name) \
@@ -42,8 +45,8 @@ static ssize_t show_##name(struct sys_device *dev, \
42 return sprintf(buf, "%d\n", topology_##name(cpu)); \ 45 return sprintf(buf, "%d\n", topology_##name(cpu)); \
43} 46}
44 47
45#if defined(topology_thread_siblings) || defined(topology_core_siblings) 48#if defined(topology_thread_cpumask) || defined(topology_core_cpumask)
46static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) 49static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
47{ 50{
48 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; 51 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
49 int n = 0; 52 int n = 0;
@@ -65,7 +68,7 @@ static ssize_t show_##name(struct sys_device *dev, \
65 struct sysdev_attribute *attr, char *buf) \ 68 struct sysdev_attribute *attr, char *buf) \
66{ \ 69{ \
67 unsigned int cpu = dev->id; \ 70 unsigned int cpu = dev->id; \
68 return show_cpumap(0, &(topology_##name(cpu)), buf); \ 71 return show_cpumap(0, topology_##name(cpu), buf); \
69} 72}
70 73
71#define define_siblings_show_list(name) \ 74#define define_siblings_show_list(name) \
@@ -74,7 +77,7 @@ static ssize_t show_##name##_list(struct sys_device *dev, \
74 char *buf) \ 77 char *buf) \
75{ \ 78{ \
76 unsigned int cpu = dev->id; \ 79 unsigned int cpu = dev->id; \
77 return show_cpumap(1, &(topology_##name(cpu)), buf); \ 80 return show_cpumap(1, topology_##name(cpu), buf); \
78} 81}
79 82
80#else 83#else
@@ -82,9 +85,7 @@ static ssize_t show_##name##_list(struct sys_device *dev, \
82static ssize_t show_##name(struct sys_device *dev, \ 85static ssize_t show_##name(struct sys_device *dev, \
83 struct sysdev_attribute *attr, char *buf) \ 86 struct sysdev_attribute *attr, char *buf) \
84{ \ 87{ \
85 unsigned int cpu = dev->id; \ 88 return show_cpumap(0, topology_##name(dev->id), buf); \
86 cpumask_t mask = topology_##name(cpu); \
87 return show_cpumap(0, &mask, buf); \
88} 89}
89 90
90#define define_siblings_show_list(name) \ 91#define define_siblings_show_list(name) \
@@ -92,9 +93,7 @@ static ssize_t show_##name##_list(struct sys_device *dev, \
92 struct sysdev_attribute *attr, \ 93 struct sysdev_attribute *attr, \
93 char *buf) \ 94 char *buf) \
94{ \ 95{ \
95 unsigned int cpu = dev->id; \ 96 return show_cpumap(1, topology_##name(dev->id), buf); \
96 cpumask_t mask = topology_##name(cpu); \
97 return show_cpumap(1, &mask, buf); \
98} 97}
99#endif 98#endif
100 99
@@ -107,13 +106,13 @@ define_one_ro(physical_package_id);
107define_id_show_func(core_id); 106define_id_show_func(core_id);
108define_one_ro(core_id); 107define_one_ro(core_id);
109 108
110define_siblings_show_func(thread_siblings); 109define_siblings_show_func(thread_cpumask);
111define_one_ro(thread_siblings); 110define_one_ro_named(thread_siblings, show_thread_cpumask);
112define_one_ro(thread_siblings_list); 111define_one_ro_named(thread_siblings_list, show_thread_cpumask_list);
113 112
114define_siblings_show_func(core_siblings); 113define_siblings_show_func(core_cpumask);
115define_one_ro(core_siblings); 114define_one_ro_named(core_siblings, show_core_cpumask);
116define_one_ro(core_siblings_list); 115define_one_ro_named(core_siblings_list, show_core_cpumask_list);
117 116
118static struct attribute *default_attrs[] = { 117static struct attribute *default_attrs[] = {
119 &attr_physical_package_id.attr, 118 &attr_physical_package_id.attr,
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 33a9351c896d..fa71b84f217b 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
25#include <linux/kbd_kern.h> 25#include <linux/kbd_kern.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/quotaops.h> 27#include <linux/quotaops.h>
28#include <linux/perf_counter.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/suspend.h> 31#include <linux/suspend.h>
@@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
244 struct pt_regs *regs = get_irq_regs(); 245 struct pt_regs *regs = get_irq_regs();
245 if (regs) 246 if (regs)
246 show_regs(regs); 247 show_regs(regs);
248 perf_counter_print_debug();
247} 249}
248static struct sysrq_key_op sysrq_showregs_op = { 250static struct sysrq_key_op sysrq_showregs_op = {
249 .handler = sysrq_handle_showregs, 251 .handler = sysrq_handle_showregs,
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c
index 777fba48d2d3..3009e0171e54 100644
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -244,7 +244,7 @@ static ssize_t host_control_on_shutdown_store(struct device *dev,
244 */ 244 */
245int dcdbas_smi_request(struct smi_cmd *smi_cmd) 245int dcdbas_smi_request(struct smi_cmd *smi_cmd)
246{ 246{
247 cpumask_t old_mask; 247 cpumask_var_t old_mask;
248 int ret = 0; 248 int ret = 0;
249 249
250 if (smi_cmd->magic != SMI_CMD_MAGIC) { 250 if (smi_cmd->magic != SMI_CMD_MAGIC) {
@@ -254,8 +254,11 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
254 } 254 }
255 255
256 /* SMI requires CPU 0 */ 256 /* SMI requires CPU 0 */
257 old_mask = current->cpus_allowed; 257 if (!alloc_cpumask_var(&old_mask, GFP_KERNEL))
258 set_cpus_allowed_ptr(current, &cpumask_of_cpu(0)); 258 return -ENOMEM;
259
260 cpumask_copy(old_mask, &current->cpus_allowed);
261 set_cpus_allowed_ptr(current, cpumask_of(0));
259 if (smp_processor_id() != 0) { 262 if (smp_processor_id() != 0) {
260 dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n", 263 dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n",
261 __func__); 264 __func__);
@@ -275,7 +278,8 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
275 ); 278 );
276 279
277out: 280out:
278 set_cpus_allowed_ptr(current, &old_mask); 281 set_cpus_allowed_ptr(current, old_mask);
282 free_cpumask_var(old_mask);
279 return ret; 283 return ret;
280} 284}
281 285
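
The dcdbas change above is the standard conversion for code that temporarily pins itself to one CPU: the saved affinity moves from an on-stack cpumask_t to an allocated cpumask_var_t. A generic sketch of the same pattern, not from the patch (run_on_cpu0() and its callback are illustrative names):

	#include <linux/cpumask.h>
	#include <linux/sched.h>
	#include <linux/gfp.h>

	static int run_on_cpu0(void (*fn)(void *), void *arg)
	{
		cpumask_var_t old_mask;
		int ret = 0;

		if (!alloc_cpumask_var(&old_mask, GFP_KERNEL))
			return -ENOMEM;

		cpumask_copy(old_mask, &current->cpus_allowed);
		set_cpus_allowed_ptr(current, cpumask_of(0));
		if (smp_processor_id() != 0)	/* same check the driver makes */
			ret = -EBUSY;
		else
			fn(arg);

		set_cpus_allowed_ptr(current, old_mask);
		free_cpumask_var(old_mask);
		return ret;
	}
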
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index c64e6798878a..1c484084ed4f 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -162,7 +162,7 @@ config ENCLOSURE_SERVICES
162config SGI_XP 162config SGI_XP
163 tristate "Support communication between SGI SSIs" 163 tristate "Support communication between SGI SSIs"
164 depends on NET 164 depends on NET
165 depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_64) && SMP 165 depends on (IA64_GENERIC || IA64_SGI_SN2 || IA64_SGI_UV || X86_UV) && SMP
166 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2 166 select IA64_UNCACHED_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
167 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2 167 select GENERIC_ALLOCATOR if IA64_GENERIC || IA64_SGI_SN2
168 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP 168 select SGI_GRU if (IA64_GENERIC || IA64_SGI_UV || X86_64) && SMP
@@ -189,7 +189,7 @@ config HP_ILO
189 189
190config SGI_GRU 190config SGI_GRU
191 tristate "SGI GRU driver" 191 tristate "SGI GRU driver"
192 depends on (X86_64 || IA64_SGI_UV || IA64_GENERIC) && SMP 192 depends on (X86_UV || IA64_SGI_UV || IA64_GENERIC) && SMP
193 default n 193 default n
194 select MMU_NOTIFIER 194 select MMU_NOTIFIER
195 ---help--- 195 ---help---
diff --git a/drivers/misc/sgi-gru/gru.h b/drivers/misc/sgi-gru/gru.h
index f93f03a9e6e9..1b5f579df15f 100644
--- a/drivers/misc/sgi-gru/gru.h
+++ b/drivers/misc/sgi-gru/gru.h
@@ -19,6 +19,8 @@
19#ifndef __GRU_H__ 19#ifndef __GRU_H__
20#define __GRU_H__ 20#define __GRU_H__
21 21
22#include <asm/uv/uv.h>
23
22/* 24/*
23 * GRU architectural definitions 25 * GRU architectural definitions
24 */ 26 */
diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h
index 7b4cbd5e03e9..069ad3a1c2ac 100644
--- a/drivers/misc/sgi-xp/xp.h
+++ b/drivers/misc/sgi-xp/xp.h
@@ -15,6 +15,8 @@
15 15
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18#include <asm/uv/uv.h>
19
18#ifdef CONFIG_IA64 20#ifdef CONFIG_IA64
19#include <asm/system.h> 21#include <asm/system.h>
20#include <asm/sn/arch.h> /* defines is_shub1() and is_shub2() */ 22#include <asm/sn/arch.h> /* defines is_shub1() and is_shub2() */
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
index 89218f7cfaa7..6576170de962 100644
--- a/drivers/misc/sgi-xp/xpc_main.c
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -318,7 +318,7 @@ xpc_hb_checker(void *ignore)
318 318
319 /* this thread was marked active by xpc_hb_init() */ 319 /* this thread was marked active by xpc_hb_init() */
320 320
321 set_cpus_allowed_ptr(current, &cpumask_of_cpu(XPC_HB_CHECK_CPU)); 321 set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU));
322 322
323 /* set our heartbeating to other partitions into motion */ 323 /* set our heartbeating to other partitions into motion */
324 xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ); 324 xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index ab0e09bf154d..847e9bb0098f 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -854,20 +854,27 @@ static void efx_fini_io(struct efx_nic *efx)
854 * interrupts across them. */ 854 * interrupts across them. */
855static int efx_wanted_rx_queues(void) 855static int efx_wanted_rx_queues(void)
856{ 856{
857 cpumask_t core_mask; 857 cpumask_var_t core_mask;
858 int count; 858 int count;
859 int cpu; 859 int cpu;
860 860
861 cpus_clear(core_mask); 861 if (!alloc_cpumask_var(&core_mask, GFP_KERNEL)) {
862 printk(KERN_WARNING
863 "efx.c: allocation failure, irq balancing hobbled\n");
864 return 1;
865 }
866
867 cpumask_clear(core_mask);
862 count = 0; 868 count = 0;
863 for_each_online_cpu(cpu) { 869 for_each_online_cpu(cpu) {
864 if (!cpu_isset(cpu, core_mask)) { 870 if (!cpumask_test_cpu(cpu, core_mask)) {
865 ++count; 871 ++count;
866 cpus_or(core_mask, core_mask, 872 cpumask_or(core_mask, core_mask,
867 topology_core_siblings(cpu)); 873 topology_core_cpumask(cpu));
868 } 874 }
869 } 875 }
870 876
877 free_cpumask_var(core_mask);
871 return count; 878 return count;
872} 879}
873 880
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c
index 9da5a4b81133..c3ea5fa7d05a 100644
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -38,7 +38,7 @@
38 38
39static LIST_HEAD(dying_tasks); 39static LIST_HEAD(dying_tasks);
40static LIST_HEAD(dead_tasks); 40static LIST_HEAD(dead_tasks);
41static cpumask_t marked_cpus = CPU_MASK_NONE; 41static cpumask_var_t marked_cpus;
42static DEFINE_SPINLOCK(task_mortuary); 42static DEFINE_SPINLOCK(task_mortuary);
43static void process_task_mortuary(void); 43static void process_task_mortuary(void);
44 44
@@ -456,10 +456,10 @@ static void mark_done(int cpu)
456{ 456{
457 int i; 457 int i;
458 458
459 cpu_set(cpu, marked_cpus); 459 cpumask_set_cpu(cpu, marked_cpus);
460 460
461 for_each_online_cpu(i) { 461 for_each_online_cpu(i) {
462 if (!cpu_isset(i, marked_cpus)) 462 if (!cpumask_test_cpu(i, marked_cpus))
463 return; 463 return;
464 } 464 }
465 465
@@ -468,7 +468,7 @@ static void mark_done(int cpu)
468 */ 468 */
469 process_task_mortuary(); 469 process_task_mortuary();
470 470
471 cpus_clear(marked_cpus); 471 cpumask_clear(marked_cpus);
472} 472}
473 473
474 474
@@ -565,6 +565,20 @@ void sync_buffer(int cpu)
565 mutex_unlock(&buffer_mutex); 565 mutex_unlock(&buffer_mutex);
566} 566}
567 567
568int __init buffer_sync_init(void)
569{
570 if (!alloc_cpumask_var(&marked_cpus, GFP_KERNEL))
571 return -ENOMEM;
572
573 cpumask_clear(marked_cpus);
574 return 0;
575}
576
577void __exit buffer_sync_cleanup(void)
578{
579 free_cpumask_var(marked_cpus);
580}
581
568/* The function can be used to add a buffer worth of data directly to 582/* The function can be used to add a buffer worth of data directly to
569 * the kernel buffer. The buffer is assumed to be a circular buffer. 583 * the kernel buffer. The buffer is assumed to be a circular buffer.
570 * Take the entries from index start and end at index end, wrapping 584 * Take the entries from index start and end at index end, wrapping
diff --git a/drivers/oprofile/buffer_sync.h b/drivers/oprofile/buffer_sync.h
index 3110732c1835..0ebf5db62679 100644
--- a/drivers/oprofile/buffer_sync.h
+++ b/drivers/oprofile/buffer_sync.h
@@ -19,4 +19,8 @@ void sync_stop(void);
19/* sync the given CPU's buffer */ 19/* sync the given CPU's buffer */
20void sync_buffer(int cpu); 20void sync_buffer(int cpu);
21 21
22/* initialize/destroy the buffer system. */
23int buffer_sync_init(void);
24void buffer_sync_cleanup(void);
25
22#endif /* OPROFILE_BUFFER_SYNC_H */ 26#endif /* OPROFILE_BUFFER_SYNC_H */
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index 3cffce90f82a..ced39f602292 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -183,6 +183,10 @@ static int __init oprofile_init(void)
183{ 183{
184 int err; 184 int err;
185 185
186 err = buffer_sync_init();
187 if (err)
188 return err;
189
186 err = oprofile_arch_init(&oprofile_ops); 190 err = oprofile_arch_init(&oprofile_ops);
187 191
188 if (err < 0 || timer) { 192 if (err < 0 || timer) {
@@ -191,8 +195,10 @@ static int __init oprofile_init(void)
191 } 195 }
192 196
193 err = oprofilefs_register(); 197 err = oprofilefs_register();
194 if (err) 198 if (err) {
195 oprofile_arch_exit(); 199 oprofile_arch_exit();
200 buffer_sync_cleanup();
201 }
196 202
197 return err; 203 return err;
198} 204}
@@ -202,6 +208,7 @@ static void __exit oprofile_exit(void)
202{ 208{
203 oprofilefs_unregister(); 209 oprofilefs_unregister();
204 oprofile_arch_exit(); 210 oprofile_arch_exit();
211 buffer_sync_cleanup();
205} 212}
206 213
207 214
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index f78371b22529..5a57753ea9fc 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -6,6 +6,7 @@
6#include <linux/irq.h> 6#include <linux/irq.h>
7#include <asm/io_apic.h> 7#include <asm/io_apic.h>
8#include <asm/smp.h> 8#include <asm/smp.h>
9#include <asm/cpu.h>
9#include <linux/intel-iommu.h> 10#include <linux/intel-iommu.h>
10#include "intr_remapping.h" 11#include "intr_remapping.h"
11 12
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index eb0dfdeaa949..3141e149d595 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -26,6 +26,7 @@
26#include <linux/irq.h> 26#include <linux/irq.h>
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/bootmem.h>
29 30
30#include <asm/ptrace.h> 31#include <asm/ptrace.h>
31#include <asm/irq.h> 32#include <asm/irq.h>
@@ -75,7 +76,14 @@ enum {
75static int evtchn_to_irq[NR_EVENT_CHANNELS] = { 76static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
76 [0 ... NR_EVENT_CHANNELS-1] = -1 77 [0 ... NR_EVENT_CHANNELS-1] = -1
77}; 78};
78static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; 79struct cpu_evtchn_s {
80 unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
81};
82static struct cpu_evtchn_s *cpu_evtchn_mask_p;
83static inline unsigned long *cpu_evtchn_mask(int cpu)
84{
85 return cpu_evtchn_mask_p[cpu].bits;
86}
79static u8 cpu_evtchn[NR_EVENT_CHANNELS]; 87static u8 cpu_evtchn[NR_EVENT_CHANNELS];
80 88
81/* Reference counts for bindings to IRQs. */ 89/* Reference counts for bindings to IRQs. */
@@ -115,7 +123,7 @@ static inline unsigned long active_evtchns(unsigned int cpu,
115 unsigned int idx) 123 unsigned int idx)
116{ 124{
117 return (sh->evtchn_pending[idx] & 125 return (sh->evtchn_pending[idx] &
118 cpu_evtchn_mask[cpu][idx] & 126 cpu_evtchn_mask(cpu)[idx] &
119 ~sh->evtchn_mask[idx]); 127 ~sh->evtchn_mask[idx]);
120} 128}
121 129
@@ -125,11 +133,11 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
125 133
126 BUG_ON(irq == -1); 134 BUG_ON(irq == -1);
127#ifdef CONFIG_SMP 135#ifdef CONFIG_SMP
128 irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu); 136 cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
129#endif 137#endif
130 138
131 __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); 139 __clear_bit(chn, cpu_evtchn_mask(cpu_evtchn[chn]));
132 __set_bit(chn, cpu_evtchn_mask[cpu]); 140 __set_bit(chn, cpu_evtchn_mask(cpu));
133 141
134 cpu_evtchn[chn] = cpu; 142 cpu_evtchn[chn] = cpu;
135} 143}
@@ -142,12 +150,12 @@ static void init_evtchn_cpu_bindings(void)
142 150
143 /* By default all event channels notify CPU#0. */ 151 /* By default all event channels notify CPU#0. */
144 for_each_irq_desc(i, desc) { 152 for_each_irq_desc(i, desc) {
145 desc->affinity = cpumask_of_cpu(0); 153 cpumask_copy(desc->affinity, cpumask_of(0));
146 } 154 }
147#endif 155#endif
148 156
149 memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); 157 memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
150 memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); 158 memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0)));
151} 159}
152 160
153static inline unsigned int cpu_from_evtchn(unsigned int evtchn) 161static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
@@ -822,6 +830,10 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
822void __init xen_init_IRQ(void) 830void __init xen_init_IRQ(void)
823{ 831{
824 int i; 832 int i;
833 size_t size = nr_cpu_ids * sizeof(struct cpu_evtchn_s);
834
835 cpu_evtchn_mask_p = alloc_bootmem(size);
836 BUG_ON(cpu_evtchn_mask_p == NULL);
825 837
826 init_evtchn_cpu_bindings(); 838 init_evtchn_cpu_bindings();
827 839
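
The events.c hunks replace a static cpu_evtchn_mask[NR_CPUS] table with a pointer sized by nr_cpu_ids and allocated from bootmem in xen_init_IRQ(). A hedged sketch of the same conversion applied to a hypothetical table, not from the patch:

	#include <linux/bootmem.h>
	#include <linux/cpumask.h>

	struct my_entry {			/* illustrative element type */
		unsigned long bits[BITS_TO_LONGS(1024)];
	};
	static struct my_entry *my_table;	/* was: struct my_entry my_table[NR_CPUS] */

	static void __init my_table_init(void)
	{
		my_table = alloc_bootmem(nr_cpu_ids * sizeof(*my_table));
		BUG_ON(my_table == NULL);	/* mirrors the check in the patch */
	}
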
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 9b91617b9582..e7e83b65c18f 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -100,7 +100,7 @@ static void do_suspend(void)
100 /* XXX use normal device tree? */ 100 /* XXX use normal device tree? */
101 xenbus_suspend(); 101 xenbus_suspend();
102 102
103 err = stop_machine(xen_suspend, &cancelled, &cpumask_of_cpu(0)); 103 err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
104 if (err) { 104 if (err) {
105 printk(KERN_ERR "failed to start xen_suspend: %d\n", err); 105 printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
106 goto out; 106 goto out;
diff --git a/fs/exec.c b/fs/exec.c
index 929b58004b7e..af1600cfa8c9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h>
36#include <linux/highmem.h> 37#include <linux/highmem.h>
37#include <linux/spinlock.h> 38#include <linux/spinlock.h>
38#include <linux/key.h> 39#include <linux/key.h>
@@ -1010,6 +1011,13 @@ int flush_old_exec(struct linux_binprm * bprm)
1010 1011
1011 current->personality &= ~bprm->per_clear; 1012 current->personality &= ~bprm->per_clear;
1012 1013
1014 /*
1015 * Flush performance counters when crossing a
1016 * security domain:
1017 */
1018 if (!get_dumpable(current->mm))
1019 perf_counter_exit_task(current);
1020
1013 /* An exec changes our domain. We are no longer part of the thread 1021 /* An exec changes our domain. We are no longer part of the thread
1014 group */ 1022 group */
1015 1023
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index b0e63c672ebd..00f45ff081a6 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
80#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \ 80#define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
81 __typeof__(type) per_cpu_var(name) 81 __typeof__(type) per_cpu_var(name)
82 82
83/*
84 * Optional methods for optimized non-lvalue per-cpu variable access.
85 *
 86 * @var can be a percpu variable or a field of it, and its size should
 87 * be that of a char, int or long. percpu_read() evaluates to an lvalue;
 88 * all others evaluate to void.
89 *
90 * These operations are guaranteed to be atomic w.r.t. preemption.
91 * The generic versions use plain get/put_cpu_var(). Archs are
92 * encouraged to implement single-instruction alternatives which don't
93 * require preemption protection.
94 */
95#ifndef percpu_read
96# define percpu_read(var) \
97 ({ \
98 typeof(per_cpu_var(var)) __tmp_var__; \
99 __tmp_var__ = get_cpu_var(var); \
100 put_cpu_var(var); \
101 __tmp_var__; \
102 })
103#endif
104
105#define __percpu_generic_to_op(var, val, op) \
106do { \
107 get_cpu_var(var) op val; \
108 put_cpu_var(var); \
109} while (0)
110
111#ifndef percpu_write
112# define percpu_write(var, val) __percpu_generic_to_op(var, (val), =)
113#endif
114
115#ifndef percpu_add
116# define percpu_add(var, val) __percpu_generic_to_op(var, (val), +=)
117#endif
118
119#ifndef percpu_sub
120# define percpu_sub(var, val) __percpu_generic_to_op(var, (val), -=)
121#endif
122
123#ifndef percpu_and
124# define percpu_and(var, val) __percpu_generic_to_op(var, (val), &=)
125#endif
126
127#ifndef percpu_or
128# define percpu_or(var, val) __percpu_generic_to_op(var, (val), |=)
129#endif
130
131#ifndef percpu_xor
132# define percpu_xor(var, val) __percpu_generic_to_op(var, (val), ^=)
133#endif
134
83#endif /* _ASM_GENERIC_PERCPU_H_ */ 135#endif /* _ASM_GENERIC_PERCPU_H_ */
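
These generic accessors let common code read and update per-cpu variables without open-coding get_cpu_var()/put_cpu_var(), while leaving architectures free to provide single-instruction versions. A short usage sketch, not from the patch (the counter name is illustrative):

	#include <linux/percpu.h>

	DEFINE_PER_CPU(unsigned long, nr_foo_events);

	static void count_foo_event(void)
	{
		percpu_add(nr_foo_events, 1);	/* atomic w.r.t. preemption */
	}

	static unsigned long read_foo_events(void)
	{
		return percpu_read(nr_foo_events);
	}
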
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 79a7ff925bf8..4ce48e878530 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -9,7 +9,7 @@ extern char __bss_start[], __bss_stop[];
9extern char __init_begin[], __init_end[]; 9extern char __init_begin[], __init_end[];
10extern char _sinittext[], _einittext[]; 10extern char _sinittext[], _einittext[];
11extern char _end[]; 11extern char _end[];
12extern char __per_cpu_start[], __per_cpu_end[]; 12extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
13extern char __kprobes_text_start[], __kprobes_text_end[]; 13extern char __kprobes_text_start[], __kprobes_text_end[];
14extern char __initdata_begin[], __initdata_end[]; 14extern char __initdata_begin[], __initdata_end[];
15extern char __start_rodata[], __end_rodata[]; 15extern char __start_rodata[], __end_rodata[];
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index c61fab1dd2f8..53e21f36a802 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -430,12 +430,47 @@
430 *(.initcall7.init) \ 430 *(.initcall7.init) \
431 *(.initcall7s.init) 431 *(.initcall7s.init)
432 432
433#define PERCPU(align) \ 433/**
434 . = ALIGN(align); \ 434 * PERCPU_VADDR - define output section for percpu area
435 VMLINUX_SYMBOL(__per_cpu_start) = .; \ 435 * @vaddr: explicit base address (optional)
436 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \ 436 * @phdr: destination PHDR (optional)
437 *
438 * Macro which expands to output section for percpu area. If @vaddr
439 * is not blank, it specifies explicit base address and all percpu
440 * symbols will be offset from the given address. If blank, @vaddr
441 * always equals @laddr + LOAD_OFFSET.
442 *
443 * @phdr defines the output PHDR to use if not blank. Be warned that
444 * output PHDR is sticky. If @phdr is specified, the next output
445 * section in the linker script will go there too. @phdr should have
446 * a leading colon.
447 *
448 * This macro defines three symbols, __per_cpu_load, __per_cpu_start
449 * and __per_cpu_end. The first one is the vaddr of loaded percpu
450 * init data. __per_cpu_start equals @vaddr and __per_cpu_end is the
451 * end offset.
452 */
453#define PERCPU_VADDR(vaddr, phdr) \
454 VMLINUX_SYMBOL(__per_cpu_load) = .; \
455 .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \
456 - LOAD_OFFSET) { \
457 VMLINUX_SYMBOL(__per_cpu_start) = .; \
458 *(.data.percpu.first) \
437 *(.data.percpu.page_aligned) \ 459 *(.data.percpu.page_aligned) \
438 *(.data.percpu) \ 460 *(.data.percpu) \
439 *(.data.percpu.shared_aligned) \ 461 *(.data.percpu.shared_aligned) \
440 } \ 462 VMLINUX_SYMBOL(__per_cpu_end) = .; \
441 VMLINUX_SYMBOL(__per_cpu_end) = .; 463 } phdr \
464 . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
465
466/**
467 * PERCPU - define output section for percpu area, simple version
468 * @align: required alignment
469 *
470 * Aligns to @align and outputs the output section for the percpu area.
471 * This macro doesn't manipulate @vaddr or @phdr; __per_cpu_load and
472 * __per_cpu_start will be identical.
473 */
474#define PERCPU(align) \
475 . = ALIGN(align); \
476 PERCPU_VADDR( , )
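
Together with the __per_cpu_load export added to asm-generic/sections.h above, PERCPU_VADDR() lets setup code distinguish the load address of the .data.percpu init image from the (possibly zero-based) virtual addresses the symbols are linked at. A hedged sketch of how per-cpu setup code can use the three symbols, not from the patch (the helper and its destination pointer are illustrative):

	#include <linux/string.h>
	#include <asm-generic/sections.h>

	/* Copy the per-cpu init image into one CPU's area; the size is
	   end - start even when __per_cpu_start is an offset rather than
	   a real address. */
	static void __init copy_percpu_init_data(void *dst)
	{
		memcpy(dst, __per_cpu_load, __per_cpu_end - __per_cpu_start);
	}
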
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e752d973fa21..2ee96942a9d6 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -120,6 +120,16 @@ extern struct group_info init_groups;
120 120
121extern struct cred init_cred; 121extern struct cred init_cred;
122 122
123#ifdef CONFIG_PERF_COUNTERS
124# define INIT_PERF_COUNTERS(tsk) \
125 .perf_counter_ctx.counter_list = \
126 LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \
127 .perf_counter_ctx.lock = \
128 __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
129#else
130# define INIT_PERF_COUNTERS(tsk)
131#endif
132
123/* 133/*
124 * INIT_TASK is used to set up the first task table, touch at 134 * INIT_TASK is used to set up the first task table, touch at
125 * your own risk!. Base=0, limit=0x1fffff (=2MB) 135 * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -184,6 +194,7 @@ extern struct cred init_cred;
184 INIT_IDS \ 194 INIT_IDS \
185 INIT_TRACE_IRQFLAGS \ 195 INIT_TRACE_IRQFLAGS \
186 INIT_LOCKDEP \ 196 INIT_LOCKDEP \
197 INIT_PERF_COUNTERS(tsk) \
187} 198}
188 199
189 200
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9127f6b51a39..472f11765f60 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -467,6 +467,7 @@ int show_interrupts(struct seq_file *p, void *v);
467struct irq_desc; 467struct irq_desc;
468 468
469extern int early_irq_init(void); 469extern int early_irq_init(void);
470extern int arch_probe_nr_irqs(void);
470extern int arch_early_irq_init(void); 471extern int arch_early_irq_init(void);
471extern int arch_init_chip_data(struct irq_desc *desc, int cpu); 472extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
472 473
diff --git a/include/linux/irq.h b/include/linux/irq.h
index f899b502f186..27a67536511e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -182,11 +182,11 @@ struct irq_desc {
182 unsigned int irqs_unhandled; 182 unsigned int irqs_unhandled;
183 spinlock_t lock; 183 spinlock_t lock;
184#ifdef CONFIG_SMP 184#ifdef CONFIG_SMP
185 cpumask_t affinity; 185 cpumask_var_t affinity;
186 unsigned int cpu; 186 unsigned int cpu;
187#endif
188#ifdef CONFIG_GENERIC_PENDING_IRQ 187#ifdef CONFIG_GENERIC_PENDING_IRQ
189 cpumask_t pending_mask; 188 cpumask_var_t pending_mask;
189#endif
190#endif 190#endif
191#ifdef CONFIG_PROC_FS 191#ifdef CONFIG_PROC_FS
192 struct proc_dir_entry *dir; 192 struct proc_dir_entry *dir;
@@ -422,4 +422,84 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
422 422
423#endif /* !CONFIG_S390 */ 423#endif /* !CONFIG_S390 */
424 424
425#ifdef CONFIG_SMP
426/**
427 * init_alloc_desc_masks - allocate cpumasks for irq_desc
428 * @desc: pointer to irq_desc struct
429 * @cpu: cpu which will be handling the cpumasks
430 * @boot: true if need bootmem
431 *
432 * Allocates affinity and pending_mask cpumask if required.
433 * Returns true if successful (or not required).
434 * Side effect: affinity has all bits set, pending_mask has all bits clear.
435 */
436static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
437 bool boot)
438{
439 int node;
440
441 if (boot) {
442 alloc_bootmem_cpumask_var(&desc->affinity);
443 cpumask_setall(desc->affinity);
444
445#ifdef CONFIG_GENERIC_PENDING_IRQ
446 alloc_bootmem_cpumask_var(&desc->pending_mask);
447 cpumask_clear(desc->pending_mask);
448#endif
449 return true;
450 }
451
452 node = cpu_to_node(cpu);
453
454 if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
455 return false;
456 cpumask_setall(desc->affinity);
457
458#ifdef CONFIG_GENERIC_PENDING_IRQ
459 if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
460 free_cpumask_var(desc->affinity);
461 return false;
462 }
463 cpumask_clear(desc->pending_mask);
464#endif
465 return true;
466}
467
468/**
469 * init_copy_desc_masks - copy cpumasks for irq_desc
470 * @old_desc: pointer to old irq_desc struct
471 * @new_desc: pointer to new irq_desc struct
472 *
473 * Ensures affinity and pending_mask are copied to the new irq_desc.
474 * If !CONFIG_CPUMASK_OFFSTACK the cpumasks are embedded in the
475 * irq_desc struct, so the copy is redundant.
476 */
477
478static inline void init_copy_desc_masks(struct irq_desc *old_desc,
479 struct irq_desc *new_desc)
480{
481#ifdef CONFIG_CPUMASK_OFFSTACK
482 cpumask_copy(new_desc->affinity, old_desc->affinity);
483
484#ifdef CONFIG_GENERIC_PENDING_IRQ
485 cpumask_copy(new_desc->pending_mask, old_desc->pending_mask);
486#endif
487#endif
488}
489
490#else /* !CONFIG_SMP */
491
492static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
493 bool boot)
494{
495 return true;
496}
497
498static inline void init_copy_desc_masks(struct irq_desc *old_desc,
499 struct irq_desc *new_desc)
500{
501}
502
503#endif /* CONFIG_SMP */
504
425#endif /* _LINUX_IRQ_H */ 505#endif /* _LINUX_IRQ_H */
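
With affinity and pending_mask now cpumask_var_t, every place that creates or moves an irq_desc is expected to go through the two helpers added above. A hedged sketch of the call pattern, not from the patch (setup_desc() and move_desc() are illustrative names):

	#include <linux/irq.h>

	static bool setup_desc(struct irq_desc *desc, int cpu, bool boot)
	{
		/* affinity comes back with all bits set, pending_mask cleared */
		return init_alloc_desc_masks(desc, cpu, boot);
	}

	static bool move_desc(struct irq_desc *old_desc, struct irq_desc *new_desc,
			      int cpu)
	{
		if (!init_alloc_desc_masks(new_desc, cpu, false))
			return false;		/* illustrative OOM handling */
		init_copy_desc_masks(old_desc, new_desc);
		return true;
	}
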
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 86af92e9e84c..887477bc2ab0 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -20,6 +20,7 @@
20 20
21# define for_each_irq_desc_reverse(irq, desc) \ 21# define for_each_irq_desc_reverse(irq, desc) \
22 for (irq = nr_irqs - 1; irq >= 0; irq--) 22 for (irq = nr_irqs - 1; irq >= 0; irq--)
23
23#else /* CONFIG_GENERIC_HARDIRQS */ 24#else /* CONFIG_GENERIC_HARDIRQS */
24 25
25extern int nr_irqs; 26extern int nr_irqs;
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 570d20413119..ecfa66817634 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -78,7 +78,15 @@ static inline unsigned int kstat_irqs(unsigned int irq)
78 return sum; 78 return sum;
79} 79}
80 80
81
82/*
83 * Lock/unlock the current runqueue - to extract task statistics:
84 */
85extern void curr_rq_lock_irq_save(unsigned long *flags);
86extern void curr_rq_unlock_irq_restore(unsigned long *flags);
87extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
81extern unsigned long long task_delta_exec(struct task_struct *); 88extern unsigned long long task_delta_exec(struct task_struct *);
89
82extern void account_user_time(struct task_struct *, cputime_t, cputime_t); 90extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
83extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); 91extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
84extern void account_steal_time(cputime_t); 92extern void account_steal_time(cputime_t);
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 0b4df7eba852..5b4e28bcb788 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -49,4 +49,5 @@
49#define FUTEXFS_SUPER_MAGIC 0xBAD1DEA 49#define FUTEXFS_SUPER_MAGIC 0xBAD1DEA
50#define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA 50#define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA
51 51
52#define STACK_END_MAGIC 0x57AC6E9D
52#endif /* __LINUX_MAGIC_H__ */ 53#endif /* __LINUX_MAGIC_H__ */
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 9f2a3751873a..0e24202b5a4e 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -9,34 +9,39 @@
9#include <asm/percpu.h> 9#include <asm/percpu.h>
10 10
11#ifdef CONFIG_SMP 11#ifdef CONFIG_SMP
12#define DEFINE_PER_CPU(type, name) \ 12#define PER_CPU_BASE_SECTION ".data.percpu"
13 __attribute__((__section__(".data.percpu"))) \
14 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
15 13
16#ifdef MODULE 14#ifdef MODULE
17#define SHARED_ALIGNED_SECTION ".data.percpu" 15#define PER_CPU_SHARED_ALIGNED_SECTION ""
18#else 16#else
19#define SHARED_ALIGNED_SECTION ".data.percpu.shared_aligned" 17#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
20#endif 18#endif
19#define PER_CPU_FIRST_SECTION ".first"
21 20
22#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ 21#else
23 __attribute__((__section__(SHARED_ALIGNED_SECTION))) \ 22
24 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name \ 23#define PER_CPU_BASE_SECTION ".data"
25 ____cacheline_aligned_in_smp 24#define PER_CPU_SHARED_ALIGNED_SECTION ""
25#define PER_CPU_FIRST_SECTION ""
26
27#endif
26 28
27#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ 29#define DEFINE_PER_CPU_SECTION(type, name, section) \
28 __attribute__((__section__(".data.percpu.page_aligned"))) \ 30 __attribute__((__section__(PER_CPU_BASE_SECTION section))) \
29 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name 31 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
30#else 32
31#define DEFINE_PER_CPU(type, name) \ 33#define DEFINE_PER_CPU(type, name) \
32 PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name 34 DEFINE_PER_CPU_SECTION(type, name, "")
33 35
34#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \ 36#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name) \
35 DEFINE_PER_CPU(type, name) 37 DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
38 ____cacheline_aligned_in_smp
36 39
37#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ 40#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
38 DEFINE_PER_CPU(type, name) 41 DEFINE_PER_CPU_SECTION(type, name, ".page_aligned")
39#endif 42
43#define DEFINE_PER_CPU_FIRST(type, name) \
44 DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION)
40 45
41#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) 46#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
42#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) 47#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 000000000000..c83f51d6e359
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,295 @@
1/*
2 * Performance counters:
3 *
4 * Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
6 *
7 * Data type definitions, declarations, prototypes.
8 *
9 * Started by: Thomas Gleixner and Ingo Molnar
10 *
11 * For licensing details see kernel-base/COPYING
12 */
13#ifndef _LINUX_PERF_COUNTER_H
14#define _LINUX_PERF_COUNTER_H
15
16#include <asm/atomic.h>
17#include <asm/ioctl.h>
18
19#ifdef CONFIG_PERF_COUNTERS
20# include <asm/perf_counter.h>
21#endif
22
23#include <linux/list.h>
24#include <linux/mutex.h>
25#include <linux/rculist.h>
26#include <linux/rcupdate.h>
27#include <linux/spinlock.h>
28
29struct task_struct;
30
31/*
32 * User-space ABI bits:
33 */
34
35/*
36 * Generalized performance counter event types, used by the hw_event.type
37 * parameter of the sys_perf_counter_open() syscall:
38 */
39enum hw_event_types {
40 /*
41 * Common hardware events, generalized by the kernel:
42 */
43 PERF_COUNT_CPU_CYCLES = 0,
44 PERF_COUNT_INSTRUCTIONS = 1,
45 PERF_COUNT_CACHE_REFERENCES = 2,
46 PERF_COUNT_CACHE_MISSES = 3,
47 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
48 PERF_COUNT_BRANCH_MISSES = 5,
49 PERF_COUNT_BUS_CYCLES = 6,
50
51 PERF_HW_EVENTS_MAX = 7,
52
53 /*
54 * Special "software" counters provided by the kernel, even if
55 * the hardware does not support performance counters. These
56	 * counters measure various physical and software events of the
57	 * kernel (and allow profiling of them as well):
58 */
59 PERF_COUNT_CPU_CLOCK = -1,
60 PERF_COUNT_TASK_CLOCK = -2,
61 PERF_COUNT_PAGE_FAULTS = -3,
62 PERF_COUNT_CONTEXT_SWITCHES = -4,
63 PERF_COUNT_CPU_MIGRATIONS = -5,
64
65 PERF_SW_EVENTS_MIN = -6,
66};
67
68/*
69 * IRQ-notification data record type:
70 */
71enum perf_counter_record_type {
72 PERF_RECORD_SIMPLE = 0,
73 PERF_RECORD_IRQ = 1,
74 PERF_RECORD_GROUP = 2,
75};
76
77/*
78 * Hardware event to monitor via a performance monitoring counter:
79 */
80struct perf_counter_hw_event {
81 s64 type;
82
83 u64 irq_period;
84 u32 record_type;
85
86 u32 disabled : 1, /* off by default */
87 nmi : 1, /* NMI sampling */
88 raw : 1, /* raw event type */
89 inherit : 1, /* children inherit it */
90 pinned : 1, /* must always be on PMU */
91 exclusive : 1, /* only group on PMU */
92 exclude_user : 1, /* don't count user */
93 exclude_kernel : 1, /* ditto kernel */
94 exclude_hv : 1, /* ditto hypervisor */
95
96 __reserved_1 : 23;
97
98 u64 __reserved_2;
99};
100
101/*
102 * Ioctls that can be done on a perf counter fd:
103 */
104#define PERF_COUNTER_IOC_ENABLE _IO('$', 0)
105#define PERF_COUNTER_IOC_DISABLE _IO('$', 1)
106
107/*
108 * Kernel-internal data types:
109 */
110
111/**
112 * struct hw_perf_counter - performance counter hardware details:
113 */
114struct hw_perf_counter {
115#ifdef CONFIG_PERF_COUNTERS
116 u64 config;
117 unsigned long config_base;
118 unsigned long counter_base;
119 int nmi;
120 unsigned int idx;
121 atomic64_t prev_count;
122 u64 irq_period;
123 atomic64_t period_left;
124#endif
125};
126
127/*
128 * Hardcoded buffer length limit for now, for IRQ-fed events:
129 */
130#define PERF_DATA_BUFLEN 2048
131
132/**
133 * struct perf_data - performance counter IRQ data sampling ...
134 */
135struct perf_data {
136 int len;
137 int rd_idx;
138 int overrun;
139 u8 data[PERF_DATA_BUFLEN];
140};
141
142struct perf_counter;
143
144/**
145 * struct hw_perf_counter_ops - performance counter hw ops
146 */
147struct hw_perf_counter_ops {
148 int (*enable) (struct perf_counter *counter);
149 void (*disable) (struct perf_counter *counter);
150 void (*read) (struct perf_counter *counter);
151};
152
153/**
154 * enum perf_counter_active_state - the states of a counter
155 */
156enum perf_counter_active_state {
157 PERF_COUNTER_STATE_ERROR = -2,
158 PERF_COUNTER_STATE_OFF = -1,
159 PERF_COUNTER_STATE_INACTIVE = 0,
160 PERF_COUNTER_STATE_ACTIVE = 1,
161};
162
163struct file;
164
165/**
166 * struct perf_counter - performance counter kernel representation:
167 */
168struct perf_counter {
169#ifdef CONFIG_PERF_COUNTERS
170 struct list_head list_entry;
171 struct list_head sibling_list;
172 struct perf_counter *group_leader;
173 const struct hw_perf_counter_ops *hw_ops;
174
175 enum perf_counter_active_state state;
176 atomic64_t count;
177
178 struct perf_counter_hw_event hw_event;
179 struct hw_perf_counter hw;
180
181 struct perf_counter_context *ctx;
182 struct task_struct *task;
183 struct file *filp;
184
185 struct perf_counter *parent;
186 struct list_head child_list;
187
188 /*
189 * Protect attach/detach and child_list:
190 */
191 struct mutex mutex;
192
193 int oncpu;
194 int cpu;
195
196 /* read() / irq related data */
197 wait_queue_head_t waitq;
198 /* optional: for NMIs */
199 int wakeup_pending;
200 struct perf_data *irqdata;
201 struct perf_data *usrdata;
202 struct perf_data data[2];
203#endif
204};
205
206/**
207 * struct perf_counter_context - counter context structure
208 *
209 * Used as a container for task counters and CPU counters as well:
210 */
211struct perf_counter_context {
212#ifdef CONFIG_PERF_COUNTERS
213 /*
214 * Protect the states of the counters in the list,
215 * nr_active, and the list:
216 */
217 spinlock_t lock;
218 /*
219 * Protect the list of counters. Locking either mutex or lock
220 * is sufficient to ensure the list doesn't change; to change
221 * the list you need to lock both the mutex and the spinlock.
222 */
223 struct mutex mutex;
224
225 struct list_head counter_list;
226 int nr_counters;
227 int nr_active;
228 int is_active;
229 struct task_struct *task;
230#endif
231};
232
233/**
234 * struct perf_counter_cpu_context - per cpu counter context structure
235 */
236struct perf_cpu_context {
237 struct perf_counter_context ctx;
238 struct perf_counter_context *task_ctx;
239 int active_oncpu;
240 int max_pertask;
241 int exclusive;
242};
243
244/*
245 * Set by architecture code:
246 */
247extern int perf_max_counters;
248
249#ifdef CONFIG_PERF_COUNTERS
250extern const struct hw_perf_counter_ops *
251hw_perf_counter_init(struct perf_counter *counter);
252
253extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
254extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
255extern void perf_counter_task_tick(struct task_struct *task, int cpu);
256extern void perf_counter_init_task(struct task_struct *child);
257extern void perf_counter_exit_task(struct task_struct *child);
258extern void perf_counter_notify(struct pt_regs *regs);
259extern void perf_counter_print_debug(void);
260extern void perf_counter_unthrottle(void);
261extern u64 hw_perf_save_disable(void);
262extern void hw_perf_restore(u64 ctrl);
263extern int perf_counter_task_disable(void);
264extern int perf_counter_task_enable(void);
265extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
266 struct perf_cpu_context *cpuctx,
267 struct perf_counter_context *ctx, int cpu);
268
269/*
270 * Return 1 for a software counter, 0 for a hardware counter
271 */
272static inline int is_software_counter(struct perf_counter *counter)
273{
274 return !counter->hw_event.raw && counter->hw_event.type < 0;
275}
276
277#else
278static inline void
279perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
280static inline void
281perf_counter_task_sched_out(struct task_struct *task, int cpu) { }
282static inline void
283perf_counter_task_tick(struct task_struct *task, int cpu) { }
284static inline void perf_counter_init_task(struct task_struct *child) { }
285static inline void perf_counter_exit_task(struct task_struct *child) { }
286static inline void perf_counter_notify(struct pt_regs *regs) { }
287static inline void perf_counter_print_debug(void) { }
288static inline void perf_counter_unthrottle(void) { }
289static inline void hw_perf_restore(u64 ctrl) { }
290static inline u64 hw_perf_save_disable(void) { return 0; }
291static inline int perf_counter_task_disable(void) { return -EINVAL; }
292static inline int perf_counter_task_enable(void) { return -EINVAL; }
293#endif
294
295#endif /* _LINUX_PERF_COUNTER_H */
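The header above is the user-space ABI: a caller fills in struct perf_counter_hw_event and passes it to sys_perf_counter_open(), then read()s the u64 counter value from the returned fd. A minimal user-space sketch under those assumptions; the syscall number and the user-space copy of the struct are placeholders (the real number is assigned per architecture elsewhere in this series, and the bit-field layout is only mirrored approximately):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct perf_counter_hw_event {          /* mirrors the header above */
        int64_t         type;
        uint64_t        irq_period;
        uint32_t        record_type;
        uint32_t        disabled        :  1,
                        nmi             :  1,
                        raw             :  1,
                        inherit         :  1,
                        pinned          :  1,
                        exclusive       :  1,
                        exclude_user    :  1,
                        exclude_kernel  :  1,
                        exclude_hv      :  1,
                        __reserved_1    : 23;
        uint64_t        __reserved_2;
};

#ifndef __NR_perf_counter_open
#define __NR_perf_counter_open 999      /* placeholder, arch-specific */
#endif

int main(void)
{
        struct perf_counter_hw_event hw_event;
        uint64_t count;
        long fd;

        memset(&hw_event, 0, sizeof(hw_event));
        hw_event.type = 1;              /* PERF_COUNT_INSTRUCTIONS */

        /* pid 0 = current task, cpu -1 = any CPU, group_fd -1 = new group */
        fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);
        if (fd < 0)
                return 1;

        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("instructions: %llu\n", (unsigned long long)count);

        close(fd);
        return 0;
}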
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 48d887e3c6e7..b00df4c79c63 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
85#define PR_SET_TIMERSLACK 29 85#define PR_SET_TIMERSLACK 29
86#define PR_GET_TIMERSLACK 30 86#define PR_GET_TIMERSLACK 30
87 87
88#define PR_TASK_PERF_COUNTERS_DISABLE 31
89#define PR_TASK_PERF_COUNTERS_ENABLE 32
90
88#endif /* _LINUX_PRCTL_H */ 91#endif /* _LINUX_PRCTL_H */
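The two new prctl() commands let a task turn all of its attached counters off and back on, e.g. to exclude a setup phase from measurement. A short sketch assuming the values from the hunk above (the helper and its callbacks are invented for illustration):

#include <sys/prctl.h>

#ifndef PR_TASK_PERF_COUNTERS_DISABLE
#define PR_TASK_PERF_COUNTERS_DISABLE   31
#define PR_TASK_PERF_COUNTERS_ENABLE    32
#endif

static void run_measured_region(void (*setup)(void), void (*work)(void))
{
        prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
        setup();                /* not counted */
        prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
        work();                 /* counted */
}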
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8981e52c714f..726d27044778 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,6 +71,7 @@ struct sched_param {
71#include <linux/fs_struct.h> 71#include <linux/fs_struct.h>
72#include <linux/compiler.h> 72#include <linux/compiler.h>
73#include <linux/completion.h> 73#include <linux/completion.h>
74#include <linux/perf_counter.h>
74#include <linux/pid.h> 75#include <linux/pid.h>
75#include <linux/percpu.h> 76#include <linux/percpu.h>
76#include <linux/topology.h> 77#include <linux/topology.h>
@@ -136,6 +137,8 @@ extern unsigned long nr_running(void);
136extern unsigned long nr_uninterruptible(void); 137extern unsigned long nr_uninterruptible(void);
137extern unsigned long nr_active(void); 138extern unsigned long nr_active(void);
138extern unsigned long nr_iowait(void); 139extern unsigned long nr_iowait(void);
140extern u64 cpu_nr_switches(int cpu);
141extern u64 cpu_nr_migrations(int cpu);
139 142
140struct seq_file; 143struct seq_file;
141struct cfs_rq; 144struct cfs_rq;
@@ -1052,6 +1055,8 @@ struct sched_entity {
1052 u64 last_wakeup; 1055 u64 last_wakeup;
1053 u64 avg_overlap; 1056 u64 avg_overlap;
1054 1057
1058 u64 nr_migrations;
1059
1055#ifdef CONFIG_SCHEDSTATS 1060#ifdef CONFIG_SCHEDSTATS
1056 u64 wait_start; 1061 u64 wait_start;
1057 u64 wait_max; 1062 u64 wait_max;
@@ -1067,7 +1072,6 @@ struct sched_entity {
1067 u64 exec_max; 1072 u64 exec_max;
1068 u64 slice_max; 1073 u64 slice_max;
1069 1074
1070 u64 nr_migrations;
1071 u64 nr_migrations_cold; 1075 u64 nr_migrations_cold;
1072 u64 nr_failed_migrations_affine; 1076 u64 nr_failed_migrations_affine;
1073 u64 nr_failed_migrations_running; 1077 u64 nr_failed_migrations_running;
@@ -1178,10 +1182,9 @@ struct task_struct {
1178 pid_t pid; 1182 pid_t pid;
1179 pid_t tgid; 1183 pid_t tgid;
1180 1184
1181#ifdef CONFIG_CC_STACKPROTECTOR
1182 /* Canary value for the -fstack-protector gcc feature */ 1185 /* Canary value for the -fstack-protector gcc feature */
1183 unsigned long stack_canary; 1186 unsigned long stack_canary;
1184#endif 1187
1185 /* 1188 /*
1186 * pointers to (original) parent process, youngest child, younger sibling, 1189 * pointers to (original) parent process, youngest child, younger sibling,
1187 * older sibling, respectively. (p->father can be replaced with 1190 * older sibling, respectively. (p->father can be replaced with
@@ -1370,6 +1373,7 @@ struct task_struct {
1370 struct list_head pi_state_list; 1373 struct list_head pi_state_list;
1371 struct futex_pi_state *pi_state_cache; 1374 struct futex_pi_state *pi_state_cache;
1372#endif 1375#endif
1376 struct perf_counter_context perf_counter_ctx;
1373#ifdef CONFIG_NUMA 1377#ifdef CONFIG_NUMA
1374 struct mempolicy *mempolicy; 1378 struct mempolicy *mempolicy;
1375 short il_next; 1379 short il_next;
@@ -2087,6 +2091,19 @@ static inline int object_is_on_stack(void *obj)
2087 2091
2088extern void thread_info_cache_init(void); 2092extern void thread_info_cache_init(void);
2089 2093
2094#ifdef CONFIG_DEBUG_STACK_USAGE
2095static inline unsigned long stack_not_used(struct task_struct *p)
2096{
2097 unsigned long *n = end_of_stack(p);
2098
2099 do { /* Skip over canary */
2100 n++;
2101 } while (!*n);
2102
2103 return (unsigned long)n - (unsigned long)end_of_stack(p);
2104}
2105#endif
2106
2090/* set thread flags in other task's structures 2107/* set thread flags in other task's structures
2091 * - see asm/thread_info.h for TIF_xxxx flags available 2108 * - see asm/thread_info.h for TIF_xxxx flags available
2092 */ 2109 */
@@ -2336,6 +2353,13 @@ static inline void inc_syscw(struct task_struct *tsk)
2336#define TASK_SIZE_OF(tsk) TASK_SIZE 2353#define TASK_SIZE_OF(tsk) TASK_SIZE
2337#endif 2354#endif
2338 2355
2356/*
2357 * Call the function if the target task is executing on a CPU right now:
2358 */
2359extern void task_oncpu_function_call(struct task_struct *p,
2360 void (*func) (void *info), void *info);
2361
2362
2339#ifdef CONFIG_MM_OWNER 2363#ifdef CONFIG_MM_OWNER
2340extern void mm_update_next_owner(struct mm_struct *mm); 2364extern void mm_update_next_owner(struct mm_struct *mm);
2341extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2365extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/stackprotector.h b/include/linux/stackprotector.h
new file mode 100644
index 000000000000..6f3e54c704c0
--- /dev/null
+++ b/include/linux/stackprotector.h
@@ -0,0 +1,16 @@
1#ifndef _LINUX_STACKPROTECTOR_H
2#define _LINUX_STACKPROTECTOR_H 1
3
4#include <linux/compiler.h>
5#include <linux/sched.h>
6#include <linux/random.h>
7
8#ifdef CONFIG_CC_STACKPROTECTOR
9# include <asm/stackprotector.h>
10#else
11static inline void boot_init_stack_canary(void)
12{
13}
14#endif
15
16#endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f9f900cfd066..88255d3261a4 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -55,6 +55,7 @@ struct compat_timeval;
55struct robust_list_head; 55struct robust_list_head;
56struct getcpu_cache; 56struct getcpu_cache;
57struct old_linux_dirent; 57struct old_linux_dirent;
58struct perf_counter_hw_event;
58 59
59#include <linux/types.h> 60#include <linux/types.h>
60#include <linux/aio_abi.h> 61#include <linux/aio_abi.h>
@@ -694,4 +695,11 @@ asmlinkage long sys_pipe(int __user *);
694 695
695int kernel_execve(const char *filename, char *const argv[], char *const envp[]); 696int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
696 697
698
699asmlinkage int sys_perf_counter_open(
700
701 struct perf_counter_hw_event *hw_event_uptr __user,
702 pid_t pid,
703 int cpu,
704 int group_fd);
697#endif 705#endif
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e632d29f0544..a16b9e06f2e5 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -193,5 +193,11 @@ int arch_update_cpu_topology(void);
193#ifndef topology_core_siblings 193#ifndef topology_core_siblings
194#define topology_core_siblings(cpu) cpumask_of_cpu(cpu) 194#define topology_core_siblings(cpu) cpumask_of_cpu(cpu)
195#endif 195#endif
196#ifndef topology_thread_cpumask
197#define topology_thread_cpumask(cpu) cpumask_of(cpu)
198#endif
199#ifndef topology_core_cpumask
200#define topology_core_cpumask(cpu) cpumask_of(cpu)
201#endif
196 202
197#endif /* _LINUX_TOPOLOGY_H */ 203#endif /* _LINUX_TOPOLOGY_H */
diff --git a/init/Kconfig b/init/Kconfig
index f068071fcc5d..5a3ad5c20e2b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -869,6 +869,36 @@ config AIO
869 by some high performance threaded applications. Disabling 869 by some high performance threaded applications. Disabling
870 this option saves about 7k. 870 this option saves about 7k.
871 871
872config HAVE_PERF_COUNTERS
873 bool
874
875menu "Performance Counters"
876
877config PERF_COUNTERS
878 bool "Kernel Performance Counters"
879 depends on HAVE_PERF_COUNTERS
880 default y
881 select ANON_INODES
882 help
883 Enable kernel support for performance counter hardware.
884
885 Performance counters are special hardware registers available
886 on most modern CPUs. These registers count the number of certain
887 types of hw events: such as instructions executed, cachemisses
888 suffered, or branches mis-predicted - without slowing down the
889 kernel or applications. These registers can also trigger interrupts
890 when a threshold number of events have passed - and can thus be
891 used to profile the code that runs on that CPU.
892
893 The Linux Performance Counter subsystem provides an abstraction of
894 these hardware capabilities, available via a system call. It
895 provides per task and per CPU counters, and it provides event
896 capabilities on top of those.
897
898 Say Y if unsure.
899
900endmenu
901
872config VM_EVENT_COUNTERS 902config VM_EVENT_COUNTERS
873 default y 903 default y
874 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED 904 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/init/main.c b/init/main.c
index 844209453c02..bfe4fb0c9842 100644
--- a/init/main.c
+++ b/init/main.c
@@ -14,6 +14,7 @@
14#include <linux/proc_fs.h> 14#include <linux/proc_fs.h>
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/stackprotector.h>
17#include <linux/string.h> 18#include <linux/string.h>
18#include <linux/ctype.h> 19#include <linux/ctype.h>
19#include <linux/delay.h> 20#include <linux/delay.h>
@@ -539,6 +540,12 @@ asmlinkage void __init start_kernel(void)
539 */ 540 */
540 lockdep_init(); 541 lockdep_init();
541 debug_objects_early_init(); 542 debug_objects_early_init();
543
544 /*
544	 * Set up the initial canary ASAP:
546 */
547 boot_init_stack_canary();
548
542 cgroup_init_early(); 549 cgroup_init_early();
543 550
544 local_irq_disable(); 551 local_irq_disable();
diff --git a/kernel/Makefile b/kernel/Makefile
index 170a9213c1b6..5537554ed808 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -92,6 +92,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
92obj-$(CONFIG_FUNCTION_TRACER) += trace/ 92obj-$(CONFIG_FUNCTION_TRACER) += trace/
93obj-$(CONFIG_TRACING) += trace/ 93obj-$(CONFIG_TRACING) += trace/
94obj-$(CONFIG_SMP) += sched_cpupri.o 94obj-$(CONFIG_SMP) += sched_cpupri.o
95obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
95 96
96ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 97ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
97# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 98# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index efd30ccf3858..f52c24eb8a8f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -162,6 +162,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
162{ 162{
163 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 163 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
164 164
165#ifdef CONFIG_PERF_COUNTERS
166 WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
167#endif
165 trace_sched_process_free(tsk); 168 trace_sched_process_free(tsk);
166 put_task_struct(tsk); 169 put_task_struct(tsk);
167} 170}
@@ -980,12 +983,9 @@ static void check_stack_usage(void)
980{ 983{
981 static DEFINE_SPINLOCK(low_water_lock); 984 static DEFINE_SPINLOCK(low_water_lock);
982 static int lowest_to_date = THREAD_SIZE; 985 static int lowest_to_date = THREAD_SIZE;
983 unsigned long *n = end_of_stack(current);
984 unsigned long free; 986 unsigned long free;
985 987
986 while (*n == 0) 988 free = stack_not_used(current);
987 n++;
988 free = (unsigned long)n - (unsigned long)end_of_stack(current);
989 989
990 if (free >= lowest_to_date) 990 if (free >= lowest_to_date)
991 return; 991 return;
@@ -1096,10 +1096,6 @@ NORET_TYPE void do_exit(long code)
1096 tsk->mempolicy = NULL; 1096 tsk->mempolicy = NULL;
1097#endif 1097#endif
1098#ifdef CONFIG_FUTEX 1098#ifdef CONFIG_FUTEX
1099 /*
1100 * This must happen late, after the PID is not
1101 * hashed anymore:
1102 */
1103 if (unlikely(!list_empty(&tsk->pi_state_list))) 1099 if (unlikely(!list_empty(&tsk->pi_state_list)))
1104 exit_pi_state_list(tsk); 1100 exit_pi_state_list(tsk);
1105 if (unlikely(current->pi_state_cache)) 1101 if (unlikely(current->pi_state_cache))
@@ -1366,6 +1362,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
1366 */ 1362 */
1367 read_unlock(&tasklist_lock); 1363 read_unlock(&tasklist_lock);
1368 1364
1365 /*
1366 * Flush inherited counters to the parent - before the parent
1367 * gets woken up by child-exit notifications.
1368 */
1369 perf_counter_exit_task(p);
1370
1369 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1371 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1370 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1372 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1371 ? p->signal->group_exit_code : p->exit_code; 1373 ? p->signal->group_exit_code : p->exit_code;
diff --git a/kernel/fork.c b/kernel/fork.c
index a66fbde20715..4640a3e0085e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,6 +61,7 @@
61#include <linux/proc_fs.h> 61#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <trace/sched.h> 63#include <trace/sched.h>
64#include <linux/magic.h>
64 65
65#include <asm/pgtable.h> 66#include <asm/pgtable.h>
66#include <asm/pgalloc.h> 67#include <asm/pgalloc.h>
@@ -212,6 +213,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
212{ 213{
213 struct task_struct *tsk; 214 struct task_struct *tsk;
214 struct thread_info *ti; 215 struct thread_info *ti;
216 unsigned long *stackend;
217
215 int err; 218 int err;
216 219
217 prepare_to_copy(orig); 220 prepare_to_copy(orig);
@@ -237,6 +240,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
237 goto out; 240 goto out;
238 241
239 setup_thread_stack(tsk, orig); 242 setup_thread_stack(tsk, orig);
243 stackend = end_of_stack(tsk);
244 *stackend = STACK_END_MAGIC; /* for overflow detection */
240 245
241#ifdef CONFIG_CC_STACKPROTECTOR 246#ifdef CONFIG_CC_STACKPROTECTOR
242 tsk->stack_canary = get_random_int(); 247 tsk->stack_canary = get_random_int();
@@ -984,6 +989,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
984 goto fork_out; 989 goto fork_out;
985 990
986 rt_mutex_init_task(p); 991 rt_mutex_init_task(p);
992 perf_counter_init_task(p);
987 993
988#ifdef CONFIG_PROVE_LOCKING 994#ifdef CONFIG_PROVE_LOCKING
989 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 995 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 7de11bd64dfe..122fef4b0bd3 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq)
46 desc->irq_count = 0; 46 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 47 desc->irqs_unhandled = 0;
48#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
49 cpumask_setall(&desc->affinity); 49 cpumask_setall(desc->affinity);
50#ifdef CONFIG_GENERIC_PENDING_IRQ
51 cpumask_clear(desc->pending_mask);
52#endif
50#endif 53#endif
51 spin_unlock_irqrestore(&desc->lock, flags); 54 spin_unlock_irqrestore(&desc->lock, flags);
52} 55}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3aba8d12f328..f51eaee921b6 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,7 @@
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/rculist.h> 18#include <linux/rculist.h>
19#include <linux/hash.h> 19#include <linux/hash.h>
20#include <linux/bootmem.h>
20 21
21#include "internals.h" 22#include "internals.h"
22 23
@@ -69,6 +70,7 @@ int nr_irqs = NR_IRQS;
69EXPORT_SYMBOL_GPL(nr_irqs); 70EXPORT_SYMBOL_GPL(nr_irqs);
70 71
71#ifdef CONFIG_SPARSE_IRQ 72#ifdef CONFIG_SPARSE_IRQ
73
72static struct irq_desc irq_desc_init = { 74static struct irq_desc irq_desc_init = {
73 .irq = -1, 75 .irq = -1,
74 .status = IRQ_DISABLED, 76 .status = IRQ_DISABLED,
@@ -76,9 +78,6 @@ static struct irq_desc irq_desc_init = {
76 .handle_irq = handle_bad_irq, 78 .handle_irq = handle_bad_irq,
77 .depth = 1, 79 .depth = 1,
78 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 80 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
79#ifdef CONFIG_SMP
80 .affinity = CPU_MASK_ALL
81#endif
82}; 81};
83 82
84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 83void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
@@ -113,6 +112,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
113 printk(KERN_ERR "can not alloc kstat_irqs\n"); 112 printk(KERN_ERR "can not alloc kstat_irqs\n");
114 BUG_ON(1); 113 BUG_ON(1);
115 } 114 }
115 if (!init_alloc_desc_masks(desc, cpu, false)) {
116 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
117 BUG_ON(1);
118 }
116 arch_init_chip_data(desc, cpu); 119 arch_init_chip_data(desc, cpu);
117} 120}
118 121
@@ -121,7 +124,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
121 */ 124 */
122DEFINE_SPINLOCK(sparse_irq_lock); 125DEFINE_SPINLOCK(sparse_irq_lock);
123 126
124struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; 127struct irq_desc **irq_desc_ptrs __read_mostly;
125 128
126static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { 129static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
127 [0 ... NR_IRQS_LEGACY-1] = { 130 [0 ... NR_IRQS_LEGACY-1] = {
@@ -131,14 +134,10 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
131 .handle_irq = handle_bad_irq, 134 .handle_irq = handle_bad_irq,
132 .depth = 1, 135 .depth = 1,
133 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 136 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
134#ifdef CONFIG_SMP
135 .affinity = CPU_MASK_ALL
136#endif
137 } 137 }
138}; 138};
139 139
140/* FIXME: use bootmem alloc ...*/ 140static unsigned int *kstat_irqs_legacy;
141static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
142 141
143int __init early_irq_init(void) 142int __init early_irq_init(void)
144{ 143{
@@ -148,18 +147,30 @@ int __init early_irq_init(void)
148 147
149 init_irq_default_affinity(); 148 init_irq_default_affinity();
150 149
150 /* initialize nr_irqs based on nr_cpu_ids */
151 arch_probe_nr_irqs();
152 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
153
151 desc = irq_desc_legacy; 154 desc = irq_desc_legacy;
152 legacy_count = ARRAY_SIZE(irq_desc_legacy); 155 legacy_count = ARRAY_SIZE(irq_desc_legacy);
153 156
157 /* allocate irq_desc_ptrs array based on nr_irqs */
158 irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
159
160 /* allocate based on nr_cpu_ids */
161	/* FIXME: invert kstat_irqs, and it'd be a per_cpu_alloc'd thing */
162 kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
163 sizeof(int));
164
154 for (i = 0; i < legacy_count; i++) { 165 for (i = 0; i < legacy_count; i++) {
155 desc[i].irq = i; 166 desc[i].irq = i;
156 desc[i].kstat_irqs = kstat_irqs_legacy[i]; 167 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
157 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 168 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
158 169 init_alloc_desc_masks(&desc[i], 0, true);
159 irq_desc_ptrs[i] = desc + i; 170 irq_desc_ptrs[i] = desc + i;
160 } 171 }
161 172
162 for (i = legacy_count; i < NR_IRQS; i++) 173 for (i = legacy_count; i < nr_irqs; i++)
163 irq_desc_ptrs[i] = NULL; 174 irq_desc_ptrs[i] = NULL;
164 175
165 return arch_early_irq_init(); 176 return arch_early_irq_init();
@@ -167,7 +178,10 @@ int __init early_irq_init(void)
167 178
168struct irq_desc *irq_to_desc(unsigned int irq) 179struct irq_desc *irq_to_desc(unsigned int irq)
169{ 180{
170 return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL; 181 if (irq_desc_ptrs && irq < nr_irqs)
182 return irq_desc_ptrs[irq];
183
184 return NULL;
171} 185}
172 186
173struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 187struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
@@ -176,10 +190,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
176 unsigned long flags; 190 unsigned long flags;
177 int node; 191 int node;
178 192
179 if (irq >= NR_IRQS) { 193 if (irq >= nr_irqs) {
180 printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n", 194 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
181 irq, NR_IRQS); 195 irq, nr_irqs);
182 WARN_ON(1);
183 return NULL; 196 return NULL;
184 } 197 }
185 198
@@ -221,9 +234,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
221 .handle_irq = handle_bad_irq, 234 .handle_irq = handle_bad_irq,
222 .depth = 1, 235 .depth = 1,
223 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), 236 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
224#ifdef CONFIG_SMP
225 .affinity = CPU_MASK_ALL
226#endif
227 } 237 }
228}; 238};
229 239
@@ -235,12 +245,15 @@ int __init early_irq_init(void)
235 245
236 init_irq_default_affinity(); 246 init_irq_default_affinity();
237 247
248 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
249
238 desc = irq_desc; 250 desc = irq_desc;
239 count = ARRAY_SIZE(irq_desc); 251 count = ARRAY_SIZE(irq_desc);
240 252
241 for (i = 0; i < count; i++) 253 for (i = 0; i < count; i++) {
242 desc[i].irq = i; 254 desc[i].irq = i;
243 255 init_alloc_desc_masks(&desc[i], 0, true);
256 }
244 return arch_early_irq_init(); 257 return arch_early_irq_init();
245} 258}
246 259
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e6d0a43cc125..40416a81a0f5 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
16extern struct lock_class_key irq_desc_lock_class; 16extern struct lock_class_key irq_desc_lock_class;
17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
18extern spinlock_t sparse_irq_lock; 18extern spinlock_t sparse_irq_lock;
19
20#ifdef CONFIG_SPARSE_IRQ
21/* irq_desc_ptrs allocated at boot time */
22extern struct irq_desc **irq_desc_ptrs;
23#else
24/* irq_desc_ptrs is a fixed size array */
19extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; 25extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
26#endif
20 27
21#ifdef CONFIG_PROC_FS 28#ifdef CONFIG_PROC_FS
22extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 29extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 291f03664552..a3a5dc9ef346 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -90,14 +90,14 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
90 90
91#ifdef CONFIG_GENERIC_PENDING_IRQ 91#ifdef CONFIG_GENERIC_PENDING_IRQ
92 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { 92 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
93 cpumask_copy(&desc->affinity, cpumask); 93 cpumask_copy(desc->affinity, cpumask);
94 desc->chip->set_affinity(irq, cpumask); 94 desc->chip->set_affinity(irq, cpumask);
95 } else { 95 } else {
96 desc->status |= IRQ_MOVE_PENDING; 96 desc->status |= IRQ_MOVE_PENDING;
97 cpumask_copy(&desc->pending_mask, cpumask); 97 cpumask_copy(desc->pending_mask, cpumask);
98 } 98 }
99#else 99#else
100 cpumask_copy(&desc->affinity, cpumask); 100 cpumask_copy(desc->affinity, cpumask);
101 desc->chip->set_affinity(irq, cpumask); 101 desc->chip->set_affinity(irq, cpumask);
102#endif 102#endif
103 desc->status |= IRQ_AFFINITY_SET; 103 desc->status |= IRQ_AFFINITY_SET;
@@ -119,16 +119,16 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
119 * one of the targets is online. 119 * one of the targets is online.
120 */ 120 */
121 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 121 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
122 if (cpumask_any_and(&desc->affinity, cpu_online_mask) 122 if (cpumask_any_and(desc->affinity, cpu_online_mask)
123 < nr_cpu_ids) 123 < nr_cpu_ids)
124 goto set_affinity; 124 goto set_affinity;
125 else 125 else
126 desc->status &= ~IRQ_AFFINITY_SET; 126 desc->status &= ~IRQ_AFFINITY_SET;
127 } 127 }
128 128
129 cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity); 129 cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
130set_affinity: 130set_affinity:
131 desc->chip->set_affinity(irq, &desc->affinity); 131 desc->chip->set_affinity(irq, desc->affinity);
132 132
133 return 0; 133 return 0;
134} 134}
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index bd72329e630c..e05ad9be43b7 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -18,7 +18,7 @@ void move_masked_irq(int irq)
18 18
19 desc->status &= ~IRQ_MOVE_PENDING; 19 desc->status &= ~IRQ_MOVE_PENDING;
20 20
21 if (unlikely(cpumask_empty(&desc->pending_mask))) 21 if (unlikely(cpumask_empty(desc->pending_mask)))
22 return; 22 return;
23 23
24 if (!desc->chip->set_affinity) 24 if (!desc->chip->set_affinity)
@@ -38,13 +38,13 @@ void move_masked_irq(int irq)
38 * For correct operation this depends on the caller 38 * For correct operation this depends on the caller
39 * masking the irqs. 39 * masking the irqs.
40 */ 40 */
41 if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) 41 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
42 < nr_cpu_ids)) { 42 < nr_cpu_ids)) {
43 cpumask_and(&desc->affinity, 43 cpumask_and(desc->affinity,
44 &desc->pending_mask, cpu_online_mask); 44 desc->pending_mask, cpu_online_mask);
45 desc->chip->set_affinity(irq, &desc->affinity); 45 desc->chip->set_affinity(irq, desc->affinity);
46 } 46 }
47 cpumask_clear(&desc->pending_mask); 47 cpumask_clear(desc->pending_mask);
48} 48}
49 49
50void move_native_irq(int irq) 50void move_native_irq(int irq)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index acd88356ac76..7f9b80434e32 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -38,15 +38,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
38 old_desc->kstat_irqs = NULL; 38 old_desc->kstat_irqs = NULL;
39} 39}
40 40
41static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, 41static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 struct irq_desc *desc, int cpu) 42 struct irq_desc *desc, int cpu)
43{ 43{
44 memcpy(desc, old_desc, sizeof(struct irq_desc)); 44 memcpy(desc, old_desc, sizeof(struct irq_desc));
45 if (!init_alloc_desc_masks(desc, cpu, false)) {
46 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
47 "for migration.\n", irq);
48 return false;
49 }
45 spin_lock_init(&desc->lock); 50 spin_lock_init(&desc->lock);
46 desc->cpu = cpu; 51 desc->cpu = cpu;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 52 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); 53 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
54 init_copy_desc_masks(old_desc, desc);
49 arch_init_copy_chip_data(old_desc, desc, cpu); 55 arch_init_copy_chip_data(old_desc, desc, cpu);
56 return true;
50} 57}
51 58
52static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) 59static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -76,12 +83,18 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
76 node = cpu_to_node(cpu); 83 node = cpu_to_node(cpu);
77 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 84 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
78 if (!desc) { 85 if (!desc) {
79 printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq); 86 printk(KERN_ERR "irq %d: can not get new irq_desc "
87 "for migration.\n", irq);
88 /* still use old one */
89 desc = old_desc;
90 goto out_unlock;
91 }
92 if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
80 /* still use old one */ 93 /* still use old one */
94 kfree(desc);
81 desc = old_desc; 95 desc = old_desc;
82 goto out_unlock; 96 goto out_unlock;
83 } 97 }
84 init_copy_one_irq_desc(irq, old_desc, desc, cpu);
85 98
86 irq_desc_ptrs[irq] = desc; 99 irq_desc_ptrs[irq] = desc;
87 spin_unlock_irqrestore(&sparse_irq_lock, flags); 100 spin_unlock_irqrestore(&sparse_irq_lock, flags);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index aae3f742bcec..692363dd591f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir;
20static int irq_affinity_proc_show(struct seq_file *m, void *v) 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
21{ 21{
22 struct irq_desc *desc = irq_to_desc((long)m->private); 22 struct irq_desc *desc = irq_to_desc((long)m->private);
23 const struct cpumask *mask = &desc->affinity; 23 const struct cpumask *mask = desc->affinity;
24 24
25#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
26 if (desc->status & IRQ_MOVE_PENDING) 26 if (desc->status & IRQ_MOVE_PENDING)
27 mask = &desc->pending_mask; 27 mask = desc->pending_mask;
28#endif 28#endif
29 seq_cpumask(m, mask); 29 seq_cpumask(m, mask);
30 seq_putc(m, '\n'); 30 seq_putc(m, '\n');
diff --git a/kernel/panic.c b/kernel/panic.c
index 2a2ff36ff44d..33cab3de1763 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -74,6 +74,9 @@ NORET_TYPE void panic(const char * fmt, ...)
74 vsnprintf(buf, sizeof(buf), fmt, args); 74 vsnprintf(buf, sizeof(buf), fmt, args);
75 va_end(args); 75 va_end(args);
76 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 76 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
77#ifdef CONFIG_DEBUG_BUGVERBOSE
78 dump_stack();
79#endif
77 bust_spinlocks(0); 80 bust_spinlocks(0);
78 81
79 /* 82 /*
@@ -355,15 +358,22 @@ EXPORT_SYMBOL(warn_slowpath);
355#endif 358#endif
356 359
357#ifdef CONFIG_CC_STACKPROTECTOR 360#ifdef CONFIG_CC_STACKPROTECTOR
361
362#ifndef GCC_HAS_SP
363#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
364#endif
365
358/* 366/*
359 * Called when gcc's -fstack-protector feature is used, and 367 * Called when gcc's -fstack-protector feature is used, and
360 * gcc detects corruption of the on-stack canary value 368 * gcc detects corruption of the on-stack canary value
361 */ 369 */
362void __stack_chk_fail(void) 370void __stack_chk_fail(void)
363{ 371{
364 panic("stack-protector: Kernel stack is corrupted"); 372 panic("stack-protector: Kernel stack is corrupted in: %p\n",
373 __builtin_return_address(0));
365} 374}
366EXPORT_SYMBOL(__stack_chk_fail); 375EXPORT_SYMBOL(__stack_chk_fail);
376
367#endif 377#endif
368 378
369core_param(panic, panic_timeout, int, 0644); 379core_param(panic, panic_timeout, int, 0644);
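With CONFIG_CC_STACKPROTECTOR, gcc instruments functions with on-stack buffers: a canary is placed above the buffer and checked in the epilogue, and a mismatch calls the __stack_chk_fail() above, which now panics with the caller's address. A sketch of the kind of function that gets instrumented (illustration only; overflowing 'buf' with an overlong string is what would corrupt the canary):

#include <string.h>

void copy_name(const char *src)
{
        char buf[16];

        strcpy(buf, src);       /* an overlong 'src' smashes the canary */
}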
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..fcefb0a726f3
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,2199 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 * For licensing details see kernel-base/COPYING
8 */
9
10#include <linux/fs.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/file.h>
14#include <linux/poll.h>
15#include <linux/sysfs.h>
16#include <linux/ptrace.h>
17#include <linux/percpu.h>
18#include <linux/uaccess.h>
19#include <linux/syscalls.h>
20#include <linux/anon_inodes.h>
21#include <linux/kernel_stat.h>
22#include <linux/perf_counter.h>
23#include <linux/mm.h>
24#include <linux/vmstat.h>
25
26/*
27 * Each CPU has a list of per CPU counters:
28 */
29DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
30
31int perf_max_counters __read_mostly = 1;
32static int perf_reserved_percpu __read_mostly;
33static int perf_overcommit __read_mostly = 1;
34
35/*
36 * Mutex for (sysadmin-configurable) counter reservations:
37 */
38static DEFINE_MUTEX(perf_resource_mutex);
39
40/*
41 * Architecture provided APIs - weak aliases:
42 */
43extern __weak const struct hw_perf_counter_ops *
44hw_perf_counter_init(struct perf_counter *counter)
45{
46 return NULL;
47}
48
49u64 __weak hw_perf_save_disable(void) { return 0; }
50void __weak hw_perf_restore(u64 ctrl) { barrier(); }
51void __weak hw_perf_counter_setup(int cpu) { barrier(); }
52int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
53 struct perf_cpu_context *cpuctx,
54 struct perf_counter_context *ctx, int cpu)
55{
56 return 0;
57}
58
59void __weak perf_counter_print_debug(void) { }
60
61static void
62list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
63{
64 struct perf_counter *group_leader = counter->group_leader;
65
66 /*
67 * Depending on whether it is a standalone or sibling counter,
68 * add it straight to the context's counter list, or to the group
69 * leader's sibling list:
70 */
71 if (counter->group_leader == counter)
72 list_add_tail(&counter->list_entry, &ctx->counter_list);
73 else
74 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
75}
76
77static void
78list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
79{
80 struct perf_counter *sibling, *tmp;
81
82 list_del_init(&counter->list_entry);
83
84 /*
85 * If this was a group counter with sibling counters then
86 * upgrade the siblings to singleton counters by adding them
87 * to the context list directly:
88 */
89 list_for_each_entry_safe(sibling, tmp,
90 &counter->sibling_list, list_entry) {
91
92 list_del_init(&sibling->list_entry);
93 list_add_tail(&sibling->list_entry, &ctx->counter_list);
94 sibling->group_leader = sibling;
95 }
96}
97
98static void
99counter_sched_out(struct perf_counter *counter,
100 struct perf_cpu_context *cpuctx,
101 struct perf_counter_context *ctx)
102{
103 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
104 return;
105
106 counter->state = PERF_COUNTER_STATE_INACTIVE;
107 counter->hw_ops->disable(counter);
108 counter->oncpu = -1;
109
110 if (!is_software_counter(counter))
111 cpuctx->active_oncpu--;
112 ctx->nr_active--;
113 if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
114 cpuctx->exclusive = 0;
115}
116
117static void
118group_sched_out(struct perf_counter *group_counter,
119 struct perf_cpu_context *cpuctx,
120 struct perf_counter_context *ctx)
121{
122 struct perf_counter *counter;
123
124 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
125 return;
126
127 counter_sched_out(group_counter, cpuctx, ctx);
128
129 /*
130 * Schedule out siblings (if any):
131 */
132 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
133 counter_sched_out(counter, cpuctx, ctx);
134
135 if (group_counter->hw_event.exclusive)
136 cpuctx->exclusive = 0;
137}
138
139/*
140 * Cross CPU call to remove a performance counter
141 *
142 * We disable the counter on the hardware level first. After that we
143 * remove it from the context list.
144 */
145static void __perf_counter_remove_from_context(void *info)
146{
147 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
148 struct perf_counter *counter = info;
149 struct perf_counter_context *ctx = counter->ctx;
150 unsigned long flags;
151 u64 perf_flags;
152
153 /*
154 * If this is a task context, we need to check whether it is
155 * the current task context of this cpu. If not it has been
156 * scheduled out before the smp call arrived.
157 */
158 if (ctx->task && cpuctx->task_ctx != ctx)
159 return;
160
161 curr_rq_lock_irq_save(&flags);
162 spin_lock(&ctx->lock);
163
164 counter_sched_out(counter, cpuctx, ctx);
165
166 counter->task = NULL;
167 ctx->nr_counters--;
168
169 /*
170 * Protect the list operation against NMI by disabling the
171 * counters on a global level. NOP for non NMI based counters.
172 */
173 perf_flags = hw_perf_save_disable();
174 list_del_counter(counter, ctx);
175 hw_perf_restore(perf_flags);
176
177 if (!ctx->task) {
178 /*
179 * Allow more per task counters with respect to the
180 * reservation:
181 */
182 cpuctx->max_pertask =
183 min(perf_max_counters - ctx->nr_counters,
184 perf_max_counters - perf_reserved_percpu);
185 }
186
187 spin_unlock(&ctx->lock);
188 curr_rq_unlock_irq_restore(&flags);
189}
190
191
192/*
193 * Remove the counter from a task's (or a CPU's) list of counters.
194 *
195 * Must be called with counter->mutex and ctx->mutex held.
196 *
197 * CPU counters are removed with a smp call. For task counters we only
198 * call when the task is on a CPU.
199 */
200static void perf_counter_remove_from_context(struct perf_counter *counter)
201{
202 struct perf_counter_context *ctx = counter->ctx;
203 struct task_struct *task = ctx->task;
204
205 if (!task) {
206 /*
207 * Per cpu counters are removed via an smp call and
208	 * the removal is always successful.
209 */
210 smp_call_function_single(counter->cpu,
211 __perf_counter_remove_from_context,
212 counter, 1);
213 return;
214 }
215
216retry:
217 task_oncpu_function_call(task, __perf_counter_remove_from_context,
218 counter);
219
220 spin_lock_irq(&ctx->lock);
221 /*
222 * If the context is active we need to retry the smp call.
223 */
224 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
225 spin_unlock_irq(&ctx->lock);
226 goto retry;
227 }
228
229 /*
230	 * The lock prevents this context from being scheduled in, so we
231	 * can remove the counter safely if the call above did not
232 * succeed.
233 */
234 if (!list_empty(&counter->list_entry)) {
235 ctx->nr_counters--;
236 list_del_counter(counter, ctx);
237 counter->task = NULL;
238 }
239 spin_unlock_irq(&ctx->lock);
240}
241
242/*
243 * Cross CPU call to disable a performance counter
244 */
245static void __perf_counter_disable(void *info)
246{
247 struct perf_counter *counter = info;
248 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
249 struct perf_counter_context *ctx = counter->ctx;
250 unsigned long flags;
251
252 /*
253 * If this is a per-task counter, need to check whether this
254 * counter's task is the current task on this cpu.
255 */
256 if (ctx->task && cpuctx->task_ctx != ctx)
257 return;
258
259 curr_rq_lock_irq_save(&flags);
260 spin_lock(&ctx->lock);
261
262 /*
263 * If the counter is on, turn it off.
264 * If it is in error state, leave it in error state.
265 */
266 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
267 if (counter == counter->group_leader)
268 group_sched_out(counter, cpuctx, ctx);
269 else
270 counter_sched_out(counter, cpuctx, ctx);
271 counter->state = PERF_COUNTER_STATE_OFF;
272 }
273
274 spin_unlock(&ctx->lock);
275 curr_rq_unlock_irq_restore(&flags);
276}
277
278/*
279 * Disable a counter.
280 */
281static void perf_counter_disable(struct perf_counter *counter)
282{
283 struct perf_counter_context *ctx = counter->ctx;
284 struct task_struct *task = ctx->task;
285
286 if (!task) {
287 /*
288 * Disable the counter on the cpu that it's on
289 */
290 smp_call_function_single(counter->cpu, __perf_counter_disable,
291 counter, 1);
292 return;
293 }
294
295 retry:
296 task_oncpu_function_call(task, __perf_counter_disable, counter);
297
298 spin_lock_irq(&ctx->lock);
299 /*
300 * If the counter is still active, we need to retry the cross-call.
301 */
302 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
303 spin_unlock_irq(&ctx->lock);
304 goto retry;
305 }
306
307 /*
308 * Since we have the lock this context can't be scheduled
309 * in, so we can change the state safely.
310 */
311 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
312 counter->state = PERF_COUNTER_STATE_OFF;
313
314 spin_unlock_irq(&ctx->lock);
315}
316
317/*
318 * Disable a counter and all its children.
319 */
320static void perf_counter_disable_family(struct perf_counter *counter)
321{
322 struct perf_counter *child;
323
324 perf_counter_disable(counter);
325
326 /*
327 * Lock the mutex to protect the list of children
328 */
329 mutex_lock(&counter->mutex);
330 list_for_each_entry(child, &counter->child_list, child_list)
331 perf_counter_disable(child);
332 mutex_unlock(&counter->mutex);
333}
334
335static int
336counter_sched_in(struct perf_counter *counter,
337 struct perf_cpu_context *cpuctx,
338 struct perf_counter_context *ctx,
339 int cpu)
340{
341 if (counter->state <= PERF_COUNTER_STATE_OFF)
342 return 0;
343
344 counter->state = PERF_COUNTER_STATE_ACTIVE;
345 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
346 /*
347 * The new state must be visible before we turn it on in the hardware:
348 */
349 smp_wmb();
350
351 if (counter->hw_ops->enable(counter)) {
352 counter->state = PERF_COUNTER_STATE_INACTIVE;
353 counter->oncpu = -1;
354 return -EAGAIN;
355 }
356
357 if (!is_software_counter(counter))
358 cpuctx->active_oncpu++;
359 ctx->nr_active++;
360
361 if (counter->hw_event.exclusive)
362 cpuctx->exclusive = 1;
363
364 return 0;
365}
366
367/*
368 * Return 1 for a group consisting entirely of software counters,
369 * 0 if the group contains any hardware counters.
370 */
371static int is_software_only_group(struct perf_counter *leader)
372{
373 struct perf_counter *counter;
374
375 if (!is_software_counter(leader))
376 return 0;
377 list_for_each_entry(counter, &leader->sibling_list, list_entry)
378 if (!is_software_counter(counter))
379 return 0;
380 return 1;
381}
382
383/*
384 * Work out whether we can put this counter group on the CPU now.
385 */
386static int group_can_go_on(struct perf_counter *counter,
387 struct perf_cpu_context *cpuctx,
388 int can_add_hw)
389{
390 /*
391 * Groups consisting entirely of software counters can always go on.
392 */
393 if (is_software_only_group(counter))
394 return 1;
395 /*
396 * If an exclusive group is already on, no other hardware
397 * counters can go on.
398 */
399 if (cpuctx->exclusive)
400 return 0;
401 /*
402 * If this group is exclusive and there are already
403 * counters on the CPU, it can't go on.
404 */
405 if (counter->hw_event.exclusive && cpuctx->active_oncpu)
406 return 0;
407 /*
408 * Otherwise, try to add it if all previous groups were able
409 * to go on.
410 */
411 return can_add_hw;
412}
413
414/*
415 * Cross CPU call to install and enable a performance counter
416 */
417static void __perf_install_in_context(void *info)
418{
419 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
420 struct perf_counter *counter = info;
421 struct perf_counter_context *ctx = counter->ctx;
422 struct perf_counter *leader = counter->group_leader;
423 int cpu = smp_processor_id();
424 unsigned long flags;
425 u64 perf_flags;
426 int err;
427
428 /*
429 * If this is a task context, we need to check whether it is
430 * the current task context of this cpu. If not it has been
431 * scheduled out before the smp call arrived.
432 */
433 if (ctx->task && cpuctx->task_ctx != ctx)
434 return;
435
436 curr_rq_lock_irq_save(&flags);
437 spin_lock(&ctx->lock);
438
439 /*
440 * Protect the list operation against NMI by disabling the
441 * counters on a global level. NOP for non NMI based counters.
442 */
443 perf_flags = hw_perf_save_disable();
444
445 list_add_counter(counter, ctx);
446 ctx->nr_counters++;
447
448 /*
449 * Don't put the counter on if it is disabled or if
450 * it is in a group and the group isn't on.
451 */
452 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
453 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
454 goto unlock;
455
456 /*
457 * An exclusive counter can't go on if there are already active
458 * hardware counters, and no hardware counter can go on if there
459 * is already an exclusive counter on.
460 */
461 if (!group_can_go_on(counter, cpuctx, 1))
462 err = -EEXIST;
463 else
464 err = counter_sched_in(counter, cpuctx, ctx, cpu);
465
466 if (err) {
467 /*
468 * This counter couldn't go on. If it is in a group
469 * then we have to pull the whole group off.
470 * If the counter group is pinned then put it in error state.
471 */
472 if (leader != counter)
473 group_sched_out(leader, cpuctx, ctx);
474 if (leader->hw_event.pinned)
475 leader->state = PERF_COUNTER_STATE_ERROR;
476 }
477
478 if (!err && !ctx->task && cpuctx->max_pertask)
479 cpuctx->max_pertask--;
480
481 unlock:
482 hw_perf_restore(perf_flags);
483
484 spin_unlock(&ctx->lock);
485 curr_rq_unlock_irq_restore(&flags);
486}
487
488/*
489 * Attach a performance counter to a context
490 *
491 * First we add the counter to the list with the hardware enable bit
492 * in counter->hw_config cleared.
493 *
494 * If the counter is attached to a task which is on a CPU we use a smp
495 * call to enable it in the task context. The task might have been
496 * scheduled away, but we check this in the smp call again.
497 *
498 * Must be called with ctx->mutex held.
499 */
500static void
501perf_install_in_context(struct perf_counter_context *ctx,
502 struct perf_counter *counter,
503 int cpu)
504{
505 struct task_struct *task = ctx->task;
506
507 if (!task) {
508 /*
509 * Per cpu counters are installed via an smp call and
510	 * the install is always successful.
511 */
512 smp_call_function_single(cpu, __perf_install_in_context,
513 counter, 1);
514 return;
515 }
516
517 counter->task = task;
518retry:
519 task_oncpu_function_call(task, __perf_install_in_context,
520 counter);
521
522 spin_lock_irq(&ctx->lock);
523 /*
524 * we need to retry the smp call.
525 */
526 if (ctx->is_active && list_empty(&counter->list_entry)) {
527 spin_unlock_irq(&ctx->lock);
528 goto retry;
529 }
530
531 /*
532	 * The lock prevents this context from being scheduled in, so we
533	 * can add the counter safely if the call above did not
534 * succeed.
535 */
536 if (list_empty(&counter->list_entry)) {
537 list_add_counter(counter, ctx);
538 ctx->nr_counters++;
539 }
540 spin_unlock_irq(&ctx->lock);
541}
542
543/*
544 * Cross CPU call to enable a performance counter
545 */
546static void __perf_counter_enable(void *info)
547{
548 struct perf_counter *counter = info;
549 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
550 struct perf_counter_context *ctx = counter->ctx;
551 struct perf_counter *leader = counter->group_leader;
552 unsigned long flags;
553 int err;
554
555 /*
556	 * If this is a per-task counter, we need to check whether this
557 * counter's task is the current task on this cpu.
558 */
559 if (ctx->task && cpuctx->task_ctx != ctx)
560 return;
561
562 curr_rq_lock_irq_save(&flags);
563 spin_lock(&ctx->lock);
564
565 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
566 goto unlock;
567 counter->state = PERF_COUNTER_STATE_INACTIVE;
568
569 /*
570 * If the counter is in a group and isn't the group leader,
571 * then don't put it on unless the group is on.
572 */
573 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
574 goto unlock;
575
576 if (!group_can_go_on(counter, cpuctx, 1))
577 err = -EEXIST;
578 else
579 err = counter_sched_in(counter, cpuctx, ctx,
580 smp_processor_id());
581
582 if (err) {
583 /*
584 * If this counter can't go on and it's part of a
585 * group, then the whole group has to come off.
586 */
587 if (leader != counter)
588 group_sched_out(leader, cpuctx, ctx);
589 if (leader->hw_event.pinned)
590 leader->state = PERF_COUNTER_STATE_ERROR;
591 }
592
593 unlock:
594 spin_unlock(&ctx->lock);
595 curr_rq_unlock_irq_restore(&flags);
596}
597
598/*
599 * Enable a counter.
600 */
601static void perf_counter_enable(struct perf_counter *counter)
602{
603 struct perf_counter_context *ctx = counter->ctx;
604 struct task_struct *task = ctx->task;
605
606 if (!task) {
607 /*
608 * Enable the counter on the cpu that it's on
609 */
610 smp_call_function_single(counter->cpu, __perf_counter_enable,
611 counter, 1);
612 return;
613 }
614
615 spin_lock_irq(&ctx->lock);
616 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
617 goto out;
618
619 /*
620 * If the counter is in error state, clear that first.
621 * That way, if we see the counter in error state below, we
622 * know that it has gone back into error state, as distinct
623 * from the task having been scheduled away before the
624 * cross-call arrived.
625 */
626 if (counter->state == PERF_COUNTER_STATE_ERROR)
627 counter->state = PERF_COUNTER_STATE_OFF;
628
629 retry:
630 spin_unlock_irq(&ctx->lock);
631 task_oncpu_function_call(task, __perf_counter_enable, counter);
632
633 spin_lock_irq(&ctx->lock);
634
635 /*
636 * If the context is active and the counter is still off,
637 * we need to retry the cross-call.
638 */
639 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
640 goto retry;
641
642 /*
643 * Since we have the lock this context can't be scheduled
644 * in, so we can change the state safely.
645 */
646 if (counter->state == PERF_COUNTER_STATE_OFF)
647 counter->state = PERF_COUNTER_STATE_INACTIVE;
648 out:
649 spin_unlock_irq(&ctx->lock);
650}
651
652/*
653 * Enable a counter and all its children.
654 */
655static void perf_counter_enable_family(struct perf_counter *counter)
656{
657 struct perf_counter *child;
658
659 perf_counter_enable(counter);
660
661 /*
662 * Lock the mutex to protect the list of children
663 */
664 mutex_lock(&counter->mutex);
665 list_for_each_entry(child, &counter->child_list, child_list)
666 perf_counter_enable(child);
667 mutex_unlock(&counter->mutex);
668}
669
670void __perf_counter_sched_out(struct perf_counter_context *ctx,
671 struct perf_cpu_context *cpuctx)
672{
673 struct perf_counter *counter;
674 u64 flags;
675
676 spin_lock(&ctx->lock);
677 ctx->is_active = 0;
678 if (likely(!ctx->nr_counters))
679 goto out;
680
681 flags = hw_perf_save_disable();
682 if (ctx->nr_active) {
683 list_for_each_entry(counter, &ctx->counter_list, list_entry)
684 group_sched_out(counter, cpuctx, ctx);
685 }
686 hw_perf_restore(flags);
687 out:
688 spin_unlock(&ctx->lock);
689}
690
691/*
692 * Called from scheduler to remove the counters of the current task,
693 * with interrupts disabled.
694 *
695 * We stop each counter and update the counter value in counter->count.
696 *
697 * This does not protect us against NMI, but disable()
698 * sets the disabled bit in the control field of counter _before_
699 * accessing the counter control register. If an NMI hits, then it will
700 * not restart the counter.
701 */
702void perf_counter_task_sched_out(struct task_struct *task, int cpu)
703{
704 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
705 struct perf_counter_context *ctx = &task->perf_counter_ctx;
706
707 if (likely(!cpuctx->task_ctx))
708 return;
709
710 __perf_counter_sched_out(ctx, cpuctx);
711
712 cpuctx->task_ctx = NULL;
713}
714
715static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
716{
717 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
718}
719
720static int
721group_sched_in(struct perf_counter *group_counter,
722 struct perf_cpu_context *cpuctx,
723 struct perf_counter_context *ctx,
724 int cpu)
725{
726 struct perf_counter *counter, *partial_group;
727 int ret;
728
729 if (group_counter->state == PERF_COUNTER_STATE_OFF)
730 return 0;
731
732 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
733 if (ret)
734 return ret < 0 ? ret : 0;
735
736 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
737 return -EAGAIN;
738
739 /*
740 * Schedule in siblings as one group (if any):
741 */
742 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
743 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
744 partial_group = counter;
745 goto group_error;
746 }
747 }
748
749 return 0;
750
751group_error:
752 /*
753 * Groups can be scheduled in as one unit only, so undo any
754 * partial group before returning:
755 */
756 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
757 if (counter == partial_group)
758 break;
759 counter_sched_out(counter, cpuctx, ctx);
760 }
761 counter_sched_out(group_counter, cpuctx, ctx);
762
763 return -EAGAIN;
764}
765
766static void
767__perf_counter_sched_in(struct perf_counter_context *ctx,
768 struct perf_cpu_context *cpuctx, int cpu)
769{
770 struct perf_counter *counter;
771 u64 flags;
772 int can_add_hw = 1;
773
774 spin_lock(&ctx->lock);
775 ctx->is_active = 1;
776 if (likely(!ctx->nr_counters))
777 goto out;
778
779 flags = hw_perf_save_disable();
780
781 /*
782 * First go through the list and put on any pinned groups
783 * in order to give them the best chance of going on.
784 */
785 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
786 if (counter->state <= PERF_COUNTER_STATE_OFF ||
787 !counter->hw_event.pinned)
788 continue;
789 if (counter->cpu != -1 && counter->cpu != cpu)
790 continue;
791
792 if (group_can_go_on(counter, cpuctx, 1))
793 group_sched_in(counter, cpuctx, ctx, cpu);
794
795 /*
796 * If this pinned group hasn't been scheduled,
797 * put it in error state.
798 */
799 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
800 counter->state = PERF_COUNTER_STATE_ERROR;
801 }
802
803 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
804 /*
805 * Ignore counters in OFF or ERROR state, and
806 * ignore pinned counters since we did them already.
807 */
808 if (counter->state <= PERF_COUNTER_STATE_OFF ||
809 counter->hw_event.pinned)
810 continue;
811
812 /*
813 * Listen to the 'cpu' scheduling filter constraint
814 * of counters:
815 */
816 if (counter->cpu != -1 && counter->cpu != cpu)
817 continue;
818
819 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
820 if (group_sched_in(counter, cpuctx, ctx, cpu))
821 can_add_hw = 0;
822 }
823 }
824 hw_perf_restore(flags);
825 out:
826 spin_unlock(&ctx->lock);
827}
828
829/*
830 * Called from scheduler to add the counters of the current task
831 * with interrupts disabled.
832 *
833 * We restore the counter value and then enable it.
834 *
835 * This does not protect us against NMI, but enable()
836 * sets the enabled bit in the control field of counter _before_
837 * accessing the counter control register. If an NMI hits, then it will
838 * keep the counter running.
839 */
840void perf_counter_task_sched_in(struct task_struct *task, int cpu)
841{
842 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
843 struct perf_counter_context *ctx = &task->perf_counter_ctx;
844
845 __perf_counter_sched_in(ctx, cpuctx, cpu);
846 cpuctx->task_ctx = ctx;
847}
848
849static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
850{
851 struct perf_counter_context *ctx = &cpuctx->ctx;
852
853 __perf_counter_sched_in(ctx, cpuctx, cpu);
854}
855
856int perf_counter_task_disable(void)
857{
858 struct task_struct *curr = current;
859 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
860 struct perf_counter *counter;
861 unsigned long flags;
862 u64 perf_flags;
863 int cpu;
864
865 if (likely(!ctx->nr_counters))
866 return 0;
867
868 curr_rq_lock_irq_save(&flags);
869 cpu = smp_processor_id();
870
871 /* force the update of the task clock: */
872 __task_delta_exec(curr, 1);
873
874 perf_counter_task_sched_out(curr, cpu);
875
876 spin_lock(&ctx->lock);
877
878 /*
879 * Disable all the counters:
880 */
881 perf_flags = hw_perf_save_disable();
882
883 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
884 if (counter->state != PERF_COUNTER_STATE_ERROR)
885 counter->state = PERF_COUNTER_STATE_OFF;
886 }
887
888 hw_perf_restore(perf_flags);
889
890 spin_unlock(&ctx->lock);
891
892 curr_rq_unlock_irq_restore(&flags);
893
894 return 0;
895}
896
897int perf_counter_task_enable(void)
898{
899 struct task_struct *curr = current;
900 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
901 struct perf_counter *counter;
902 unsigned long flags;
903 u64 perf_flags;
904 int cpu;
905
906 if (likely(!ctx->nr_counters))
907 return 0;
908
909 curr_rq_lock_irq_save(&flags);
910 cpu = smp_processor_id();
911
912 /* force the update of the task clock: */
913 __task_delta_exec(curr, 1);
914
915 perf_counter_task_sched_out(curr, cpu);
916
917 spin_lock(&ctx->lock);
918
919 /*
920 * Disable the hardware counters while we update the state:
921 */
922 perf_flags = hw_perf_save_disable();
923
924 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
925 if (counter->state > PERF_COUNTER_STATE_OFF)
926 continue;
927 counter->state = PERF_COUNTER_STATE_INACTIVE;
928 counter->hw_event.disabled = 0;
929 }
930 hw_perf_restore(perf_flags);
931
932 spin_unlock(&ctx->lock);
933
934 perf_counter_task_sched_in(curr, cpu);
935
936 curr_rq_unlock_irq_restore(&flags);
937
938 return 0;
939}
940
941/*
942 * Round-robin a context's counters:
943 */
944static void rotate_ctx(struct perf_counter_context *ctx)
945{
946 struct perf_counter *counter;
947 u64 perf_flags;
948
949 if (!ctx->nr_counters)
950 return;
951
952 spin_lock(&ctx->lock);
953 /*
954 * Rotate the first entry last (works just fine for group counters too):
955 */
956 perf_flags = hw_perf_save_disable();
957 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
958 list_del(&counter->list_entry);
959 list_add_tail(&counter->list_entry, &ctx->counter_list);
960 break;
961 }
962 hw_perf_restore(perf_flags);
963
964 spin_unlock(&ctx->lock);
965}
966
967void perf_counter_task_tick(struct task_struct *curr, int cpu)
968{
969 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
970 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
971 const int rotate_percpu = 0;
972
973 if (rotate_percpu)
974 perf_counter_cpu_sched_out(cpuctx);
975 perf_counter_task_sched_out(curr, cpu);
976
977 if (rotate_percpu)
978 rotate_ctx(&cpuctx->ctx);
979 rotate_ctx(ctx);
980
981 if (rotate_percpu)
982 perf_counter_cpu_sched_in(cpuctx, cpu);
983 perf_counter_task_sched_in(curr, cpu);
984}
985
986/*
987 * Cross CPU call to read the hardware counter
988 */
989static void __read(void *info)
990{
991 struct perf_counter *counter = info;
992 unsigned long flags;
993
994 curr_rq_lock_irq_save(&flags);
995 counter->hw_ops->read(counter);
996 curr_rq_unlock_irq_restore(&flags);
997}
998
999static u64 perf_counter_read(struct perf_counter *counter)
1000{
1001 /*
1002 * If the counter is enabled and currently active on a CPU, update the
1003 * value in the counter structure:
1004 */
1005 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1006 smp_call_function_single(counter->oncpu,
1007 __read, counter, 1);
1008 }
1009
1010 return atomic64_read(&counter->count);
1011}
1012
1013/*
1014 * Cross CPU call to switch performance data pointers
1015 */
1016static void __perf_switch_irq_data(void *info)
1017{
1018 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1019 struct perf_counter *counter = info;
1020 struct perf_counter_context *ctx = counter->ctx;
1021 struct perf_data *oldirqdata = counter->irqdata;
1022
1023 /*
1024 * If this is a task context, we need to check whether it is
1025 * the current task context of this cpu. If not, it has been
1026 * scheduled out before the smp call arrived.
1027 */
1028 if (ctx->task) {
1029 if (cpuctx->task_ctx != ctx)
1030 return;
1031 spin_lock(&ctx->lock);
1032 }
1033
1034 /* Switch the pointer in an NMI-safe way */
1035 atomic_long_set((atomic_long_t *)&counter->irqdata,
1036 (unsigned long) counter->usrdata);
1037 counter->usrdata = oldirqdata;
1038
1039 if (ctx->task)
1040 spin_unlock(&ctx->lock);
1041}
1042
1043static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
1044{
1045 struct perf_counter_context *ctx = counter->ctx;
1046 struct perf_data *oldirqdata = counter->irqdata;
1047 struct task_struct *task = ctx->task;
1048
1049 if (!task) {
1050 smp_call_function_single(counter->cpu,
1051 __perf_switch_irq_data,
1052 counter, 1);
1053 return counter->usrdata;
1054 }
1055
1056retry:
1057 spin_lock_irq(&ctx->lock);
1058 if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
1059 counter->irqdata = counter->usrdata;
1060 counter->usrdata = oldirqdata;
1061 spin_unlock_irq(&ctx->lock);
1062 return oldirqdata;
1063 }
1064 spin_unlock_irq(&ctx->lock);
1065 task_oncpu_function_call(task, __perf_switch_irq_data, counter);
1066 /* Might have failed, because task was scheduled out */
1067 if (counter->irqdata == oldirqdata)
1068 goto retry;
1069
1070 return counter->usrdata;
1071}
1072
1073static void put_context(struct perf_counter_context *ctx)
1074{
1075 if (ctx->task)
1076 put_task_struct(ctx->task);
1077}
1078
1079static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1080{
1081 struct perf_cpu_context *cpuctx;
1082 struct perf_counter_context *ctx;
1083 struct task_struct *task;
1084
1085 /*
1086 * If cpu is not a wildcard then this is a percpu counter:
1087 */
1088 if (cpu != -1) {
1089 /* Must be root to operate on a CPU counter: */
1090 if (!capable(CAP_SYS_ADMIN))
1091 return ERR_PTR(-EACCES);
1092
1093 if (cpu < 0 || cpu > num_possible_cpus())
1094 return ERR_PTR(-EINVAL);
1095
1096 /*
1097 * We could be clever and allow attaching a counter to an
1098 * offline CPU and activate it when the CPU comes up, but
1099 * that's for later.
1100 */
1101 if (!cpu_isset(cpu, cpu_online_map))
1102 return ERR_PTR(-ENODEV);
1103
1104 cpuctx = &per_cpu(perf_cpu_context, cpu);
1105 ctx = &cpuctx->ctx;
1106
1107 return ctx;
1108 }
1109
1110 rcu_read_lock();
1111 if (!pid)
1112 task = current;
1113 else
1114 task = find_task_by_vpid(pid);
1115 if (task)
1116 get_task_struct(task);
1117 rcu_read_unlock();
1118
1119 if (!task)
1120 return ERR_PTR(-ESRCH);
1121
1122 ctx = &task->perf_counter_ctx;
1123 ctx->task = task;
1124
1125 /* Reuse ptrace permission checks for now. */
1126 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1127 put_context(ctx);
1128 return ERR_PTR(-EACCES);
1129 }
1130
1131 return ctx;
1132}
1133
1134/*
1135 * Called when the last reference to the file is gone.
1136 */
1137static int perf_release(struct inode *inode, struct file *file)
1138{
1139 struct perf_counter *counter = file->private_data;
1140 struct perf_counter_context *ctx = counter->ctx;
1141
1142 file->private_data = NULL;
1143
1144 mutex_lock(&ctx->mutex);
1145 mutex_lock(&counter->mutex);
1146
1147 perf_counter_remove_from_context(counter);
1148
1149 mutex_unlock(&counter->mutex);
1150 mutex_unlock(&ctx->mutex);
1151
1152 kfree(counter);
1153 put_context(ctx);
1154
1155 return 0;
1156}
1157
1158/*
1159 * Read the performance counter - simple non-blocking version for now
1160 */
1161static ssize_t
1162perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1163{
1164 u64 cntval;
1165
1166 if (count != sizeof(cntval))
1167 return -EINVAL;
1168
1169 /*
1170 * Return end-of-file for a read on a counter that is in
1171 * error state (i.e. because it was pinned but it couldn't be
1172 * scheduled on to the CPU at some point).
1173 */
1174 if (counter->state == PERF_COUNTER_STATE_ERROR)
1175 return 0;
1176
1177 mutex_lock(&counter->mutex);
1178 cntval = perf_counter_read(counter);
1179 mutex_unlock(&counter->mutex);
1180
1181 return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
1182}
1183
1184static ssize_t
1185perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
1186{
1187 if (!usrdata->len)
1188 return 0;
1189
1190 count = min(count, (size_t)usrdata->len);
1191 if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
1192 return -EFAULT;
1193
1194 /* Adjust the counters */
1195 usrdata->len -= count;
1196 if (!usrdata->len)
1197 usrdata->rd_idx = 0;
1198 else
1199 usrdata->rd_idx += count;
1200
1201 return count;
1202}
1203
1204static ssize_t
1205perf_read_irq_data(struct perf_counter *counter,
1206 char __user *buf,
1207 size_t count,
1208 int nonblocking)
1209{
1210 struct perf_data *irqdata, *usrdata;
1211 DECLARE_WAITQUEUE(wait, current);
1212 ssize_t res, res2;
1213
1214 irqdata = counter->irqdata;
1215 usrdata = counter->usrdata;
1216
1217 if (usrdata->len + irqdata->len >= count)
1218 goto read_pending;
1219
1220 if (nonblocking)
1221 return -EAGAIN;
1222
1223 spin_lock_irq(&counter->waitq.lock);
1224 __add_wait_queue(&counter->waitq, &wait);
1225 for (;;) {
1226 set_current_state(TASK_INTERRUPTIBLE);
1227 if (usrdata->len + irqdata->len >= count)
1228 break;
1229
1230 if (signal_pending(current))
1231 break;
1232
1233 if (counter->state == PERF_COUNTER_STATE_ERROR)
1234 break;
1235
1236 spin_unlock_irq(&counter->waitq.lock);
1237 schedule();
1238 spin_lock_irq(&counter->waitq.lock);
1239 }
1240 __remove_wait_queue(&counter->waitq, &wait);
1241 __set_current_state(TASK_RUNNING);
1242 spin_unlock_irq(&counter->waitq.lock);
1243
1244 if (usrdata->len + irqdata->len < count &&
1245 counter->state != PERF_COUNTER_STATE_ERROR)
1246 return -ERESTARTSYS;
1247read_pending:
1248 mutex_lock(&counter->mutex);
1249
1250 /* Drain pending data first: */
1251 res = perf_copy_usrdata(usrdata, buf, count);
1252 if (res < 0 || res == count)
1253 goto out;
1254
1255 /* Switch irq buffer: */
1256 usrdata = perf_switch_irq_data(counter);
1257 res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
1258 if (res2 < 0) {
1259 if (!res)
1260 res = -EFAULT;
1261 } else {
1262 res += res2;
1263 }
1264out:
1265 mutex_unlock(&counter->mutex);
1266
1267 return res;
1268}
1269
1270static ssize_t
1271perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1272{
1273 struct perf_counter *counter = file->private_data;
1274
1275 switch (counter->hw_event.record_type) {
1276 case PERF_RECORD_SIMPLE:
1277 return perf_read_hw(counter, buf, count);
1278
1279 case PERF_RECORD_IRQ:
1280 case PERF_RECORD_GROUP:
1281 return perf_read_irq_data(counter, buf, count,
1282 file->f_flags & O_NONBLOCK);
1283 }
1284 return -EINVAL;
1285}
1286
1287static unsigned int perf_poll(struct file *file, poll_table *wait)
1288{
1289 struct perf_counter *counter = file->private_data;
1290 unsigned int events = 0;
1291 unsigned long flags;
1292
1293 poll_wait(file, &counter->waitq, wait);
1294
1295 spin_lock_irqsave(&counter->waitq.lock, flags);
1296 if (counter->usrdata->len || counter->irqdata->len)
1297 events |= POLLIN;
1298 spin_unlock_irqrestore(&counter->waitq.lock, flags);
1299
1300 return events;
1301}
1302
1303static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1304{
1305 struct perf_counter *counter = file->private_data;
1306 int err = 0;
1307
1308 switch (cmd) {
1309 case PERF_COUNTER_IOC_ENABLE:
1310 perf_counter_enable_family(counter);
1311 break;
1312 case PERF_COUNTER_IOC_DISABLE:
1313 perf_counter_disable_family(counter);
1314 break;
1315 default:
1316 err = -ENOTTY;
1317 }
1318 return err;
1319}
1320
1321static const struct file_operations perf_fops = {
1322 .release = perf_release,
1323 .read = perf_read,
1324 .poll = perf_poll,
1325 .unlocked_ioctl = perf_ioctl,
1326 .compat_ioctl = perf_ioctl,
1327};
1328
1329static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
1330{
1331 int cpu = raw_smp_processor_id();
1332
1333 atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
1334 return 0;
1335}
1336
1337static void cpu_clock_perf_counter_update(struct perf_counter *counter)
1338{
1339 int cpu = raw_smp_processor_id();
1340 s64 prev;
1341 u64 now;
1342
1343 now = cpu_clock(cpu);
1344 prev = atomic64_read(&counter->hw.prev_count);
1345 atomic64_set(&counter->hw.prev_count, now);
1346 atomic64_add(now - prev, &counter->count);
1347}
1348
1349static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
1350{
1351 cpu_clock_perf_counter_update(counter);
1352}
1353
1354static void cpu_clock_perf_counter_read(struct perf_counter *counter)
1355{
1356 cpu_clock_perf_counter_update(counter);
1357}
1358
1359static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
1360 .enable = cpu_clock_perf_counter_enable,
1361 .disable = cpu_clock_perf_counter_disable,
1362 .read = cpu_clock_perf_counter_read,
1363};
1364
1365/*
1366 * Called from within the scheduler:
1367 */
1368static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
1369{
1370 struct task_struct *curr = counter->task;
1371 u64 delta;
1372
1373 delta = __task_delta_exec(curr, update);
1374
1375 return curr->se.sum_exec_runtime + delta;
1376}
1377
1378static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
1379{
1380 u64 prev;
1381 s64 delta;
1382
1383 prev = atomic64_read(&counter->hw.prev_count);
1384
1385 atomic64_set(&counter->hw.prev_count, now);
1386
1387 delta = now - prev;
1388
1389 atomic64_add(delta, &counter->count);
1390}
1391
1392static void task_clock_perf_counter_read(struct perf_counter *counter)
1393{
1394 u64 now = task_clock_perf_counter_val(counter, 1);
1395
1396 task_clock_perf_counter_update(counter, now);
1397}
1398
1399static int task_clock_perf_counter_enable(struct perf_counter *counter)
1400{
1401 u64 now = task_clock_perf_counter_val(counter, 0);
1402
1403 atomic64_set(&counter->hw.prev_count, now);
1404
1405 return 0;
1406}
1407
1408static void task_clock_perf_counter_disable(struct perf_counter *counter)
1409{
1410 u64 now = task_clock_perf_counter_val(counter, 0);
1411
1412 task_clock_perf_counter_update(counter, now);
1413}
1414
1415static const struct hw_perf_counter_ops perf_ops_task_clock = {
1416 .enable = task_clock_perf_counter_enable,
1417 .disable = task_clock_perf_counter_disable,
1418 .read = task_clock_perf_counter_read,
1419};
1420
1421#ifdef CONFIG_VM_EVENT_COUNTERS
1422#define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT]
1423#else
1424#define cpu_page_faults() 0
1425#endif
1426
1427static u64 get_page_faults(struct perf_counter *counter)
1428{
1429 struct task_struct *curr = counter->ctx->task;
1430
1431 if (curr)
1432 return curr->maj_flt + curr->min_flt;
1433 return cpu_page_faults();
1434}
1435
1436static void page_faults_perf_counter_update(struct perf_counter *counter)
1437{
1438 u64 prev, now;
1439 s64 delta;
1440
1441 prev = atomic64_read(&counter->hw.prev_count);
1442 now = get_page_faults(counter);
1443
1444 atomic64_set(&counter->hw.prev_count, now);
1445
1446 delta = now - prev;
1447
1448 atomic64_add(delta, &counter->count);
1449}
1450
1451static void page_faults_perf_counter_read(struct perf_counter *counter)
1452{
1453 page_faults_perf_counter_update(counter);
1454}
1455
1456static int page_faults_perf_counter_enable(struct perf_counter *counter)
1457{
1458 atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
1459 return 0;
1460}
1461
1462static void page_faults_perf_counter_disable(struct perf_counter *counter)
1463{
1464 page_faults_perf_counter_update(counter);
1465}
1466
1467static const struct hw_perf_counter_ops perf_ops_page_faults = {
1468 .enable = page_faults_perf_counter_enable,
1469 .disable = page_faults_perf_counter_disable,
1470 .read = page_faults_perf_counter_read,
1471};
1472
1473static u64 get_context_switches(struct perf_counter *counter)
1474{
1475 struct task_struct *curr = counter->ctx->task;
1476
1477 if (curr)
1478 return curr->nvcsw + curr->nivcsw;
1479 return cpu_nr_switches(smp_processor_id());
1480}
1481
1482static void context_switches_perf_counter_update(struct perf_counter *counter)
1483{
1484 u64 prev, now;
1485 s64 delta;
1486
1487 prev = atomic64_read(&counter->hw.prev_count);
1488 now = get_context_switches(counter);
1489
1490 atomic64_set(&counter->hw.prev_count, now);
1491
1492 delta = now - prev;
1493
1494 atomic64_add(delta, &counter->count);
1495}
1496
1497static void context_switches_perf_counter_read(struct perf_counter *counter)
1498{
1499 context_switches_perf_counter_update(counter);
1500}
1501
1502static int context_switches_perf_counter_enable(struct perf_counter *counter)
1503{
1504 atomic64_set(&counter->hw.prev_count, get_context_switches(counter));
1505 return 0;
1506}
1507
1508static void context_switches_perf_counter_disable(struct perf_counter *counter)
1509{
1510 context_switches_perf_counter_update(counter);
1511}
1512
1513static const struct hw_perf_counter_ops perf_ops_context_switches = {
1514 .enable = context_switches_perf_counter_enable,
1515 .disable = context_switches_perf_counter_disable,
1516 .read = context_switches_perf_counter_read,
1517};
1518
1519static inline u64 get_cpu_migrations(struct perf_counter *counter)
1520{
1521 struct task_struct *curr = counter->ctx->task;
1522
1523 if (curr)
1524 return curr->se.nr_migrations;
1525 return cpu_nr_migrations(smp_processor_id());
1526}
1527
1528static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1529{
1530 u64 prev, now;
1531 s64 delta;
1532
1533 prev = atomic64_read(&counter->hw.prev_count);
1534 now = get_cpu_migrations(counter);
1535
1536 atomic64_set(&counter->hw.prev_count, now);
1537
1538 delta = now - prev;
1539
1540 atomic64_add(delta, &counter->count);
1541}
1542
1543static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1544{
1545 cpu_migrations_perf_counter_update(counter);
1546}
1547
1548static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1549{
1550 atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter));
1551 return 0;
1552}
1553
1554static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1555{
1556 cpu_migrations_perf_counter_update(counter);
1557}
1558
1559static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1560 .enable = cpu_migrations_perf_counter_enable,
1561 .disable = cpu_migrations_perf_counter_disable,
1562 .read = cpu_migrations_perf_counter_read,
1563};
1564
1565static const struct hw_perf_counter_ops *
1566sw_perf_counter_init(struct perf_counter *counter)
1567{
1568 const struct hw_perf_counter_ops *hw_ops = NULL;
1569
1570 /*
1571 * Software counters (currently) can't in general distinguish
1572 * between user, kernel and hypervisor events.
1573 * However, context switches and cpu migrations are considered
1574 * to be kernel events, and page faults are never hypervisor
1575 * events.
1576 */
1577 switch (counter->hw_event.type) {
1578 case PERF_COUNT_CPU_CLOCK:
1579 if (!(counter->hw_event.exclude_user ||
1580 counter->hw_event.exclude_kernel ||
1581 counter->hw_event.exclude_hv))
1582 hw_ops = &perf_ops_cpu_clock;
1583 break;
1584 case PERF_COUNT_TASK_CLOCK:
1585 if (counter->hw_event.exclude_user ||
1586 counter->hw_event.exclude_kernel ||
1587 counter->hw_event.exclude_hv)
1588 break;
1589 /*
1590 * If the user instantiates this as a per-cpu counter,
1591 * use the cpu_clock counter instead.
1592 */
1593 if (counter->ctx->task)
1594 hw_ops = &perf_ops_task_clock;
1595 else
1596 hw_ops = &perf_ops_cpu_clock;
1597 break;
1598 case PERF_COUNT_PAGE_FAULTS:
1599 if (!(counter->hw_event.exclude_user ||
1600 counter->hw_event.exclude_kernel))
1601 hw_ops = &perf_ops_page_faults;
1602 break;
1603 case PERF_COUNT_CONTEXT_SWITCHES:
1604 if (!counter->hw_event.exclude_kernel)
1605 hw_ops = &perf_ops_context_switches;
1606 break;
1607 case PERF_COUNT_CPU_MIGRATIONS:
1608 if (!counter->hw_event.exclude_kernel)
1609 hw_ops = &perf_ops_cpu_migrations;
1610 break;
1611 default:
1612 break;
1613 }
1614 return hw_ops;
1615}
1616
1617/*
1618 * Allocate and initialize a counter structure
1619 */
1620static struct perf_counter *
1621perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1622 int cpu,
1623 struct perf_counter_context *ctx,
1624 struct perf_counter *group_leader,
1625 gfp_t gfpflags)
1626{
1627 const struct hw_perf_counter_ops *hw_ops;
1628 struct perf_counter *counter;
1629
1630 counter = kzalloc(sizeof(*counter), gfpflags);
1631 if (!counter)
1632 return NULL;
1633
1634 /*
1635 * Single counters are their own group leaders, with an
1636 * empty sibling list:
1637 */
1638 if (!group_leader)
1639 group_leader = counter;
1640
1641 mutex_init(&counter->mutex);
1642 INIT_LIST_HEAD(&counter->list_entry);
1643 INIT_LIST_HEAD(&counter->sibling_list);
1644 init_waitqueue_head(&counter->waitq);
1645
1646 INIT_LIST_HEAD(&counter->child_list);
1647
1648 counter->irqdata = &counter->data[0];
1649 counter->usrdata = &counter->data[1];
1650 counter->cpu = cpu;
1651 counter->hw_event = *hw_event;
1652 counter->wakeup_pending = 0;
1653 counter->group_leader = group_leader;
1654 counter->hw_ops = NULL;
1655 counter->ctx = ctx;
1656
1657 counter->state = PERF_COUNTER_STATE_INACTIVE;
1658 if (hw_event->disabled)
1659 counter->state = PERF_COUNTER_STATE_OFF;
1660
1661 hw_ops = NULL;
1662 if (!hw_event->raw && hw_event->type < 0)
1663 hw_ops = sw_perf_counter_init(counter);
1664 else
1665 hw_ops = hw_perf_counter_init(counter);
1666
1667 if (!hw_ops) {
1668 kfree(counter);
1669 return NULL;
1670 }
1671 counter->hw_ops = hw_ops;
1672
1673 return counter;
1674}
1675
1676/**
1677 * sys_perf_counter_open - open a performance counter, associate it with a task/cpu
1678 *
1679 * @hw_event_uptr: event type attributes for monitoring/sampling
1680 * @pid: target pid
1681 * @cpu: target cpu
1682 * @group_fd: group leader counter fd
1683 */
1684asmlinkage int
1685sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
1686 pid_t pid, int cpu, int group_fd)
1687{
1688 struct perf_counter *counter, *group_leader;
1689 struct perf_counter_hw_event hw_event;
1690 struct perf_counter_context *ctx;
1691 struct file *counter_file = NULL;
1692 struct file *group_file = NULL;
1693 int fput_needed = 0;
1694 int fput_needed2 = 0;
1695 int ret;
1696
1697 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1698 return -EFAULT;
1699
1700 /*
1701 * Get the target context (task or percpu):
1702 */
1703 ctx = find_get_context(pid, cpu);
1704 if (IS_ERR(ctx))
1705 return PTR_ERR(ctx);
1706
1707 /*
1708 * Look up the group leader (we will attach this counter to it):
1709 */
1710 group_leader = NULL;
1711 if (group_fd != -1) {
1712 ret = -EINVAL;
1713 group_file = fget_light(group_fd, &fput_needed);
1714 if (!group_file)
1715 goto err_put_context;
1716 if (group_file->f_op != &perf_fops)
1717 goto err_put_context;
1718
1719 group_leader = group_file->private_data;
1720 /*
1721 * Do not allow a recursive hierarchy (this new sibling
1722 * becoming part of another group-sibling):
1723 */
1724 if (group_leader->group_leader != group_leader)
1725 goto err_put_context;
1726 /*
1727 * Do not allow attaching to a group in a different
1728 * task or CPU context:
1729 */
1730 if (group_leader->ctx != ctx)
1731 goto err_put_context;
1732 /*
1733 * Only a group leader can be exclusive or pinned
1734 */
1735 if (hw_event.exclusive || hw_event.pinned)
1736 goto err_put_context;
1737 }
1738
1739 ret = -EINVAL;
1740 counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
1741 GFP_KERNEL);
1742 if (!counter)
1743 goto err_put_context;
1744
1745 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1746 if (ret < 0)
1747 goto err_free_put_context;
1748
1749 counter_file = fget_light(ret, &fput_needed2);
1750 if (!counter_file)
1751 goto err_free_put_context;
1752
1753 counter->filp = counter_file;
1754 mutex_lock(&ctx->mutex);
1755 perf_install_in_context(ctx, counter, cpu);
1756 mutex_unlock(&ctx->mutex);
1757
1758 fput_light(counter_file, fput_needed2);
1759
1760out_fput:
1761 fput_light(group_file, fput_needed);
1762
1763 return ret;
1764
1765err_free_put_context:
1766 kfree(counter);
1767
1768err_put_context:
1769 put_context(ctx);
1770
1771 goto out_fput;
1772}
1773
1774/*
1775 * Initialize the perf_counter context in a task_struct:
1776 */
1777static void
1778__perf_counter_init_context(struct perf_counter_context *ctx,
1779 struct task_struct *task)
1780{
1781 memset(ctx, 0, sizeof(*ctx));
1782 spin_lock_init(&ctx->lock);
1783 mutex_init(&ctx->mutex);
1784 INIT_LIST_HEAD(&ctx->counter_list);
1785 ctx->task = task;
1786}
1787
1788/*
1789 * inherit a counter from parent task to child task:
1790 */
1791static struct perf_counter *
1792inherit_counter(struct perf_counter *parent_counter,
1793 struct task_struct *parent,
1794 struct perf_counter_context *parent_ctx,
1795 struct task_struct *child,
1796 struct perf_counter *group_leader,
1797 struct perf_counter_context *child_ctx)
1798{
1799 struct perf_counter *child_counter;
1800
1801 /*
1802 * Instead of creating recursive hierarchies of counters,
1803 * we link inherited counters back to the original parent,
1804 * which is guaranteed to have a filp; we use that filp as the
1805 * reference count:
1806 */
1807 if (parent_counter->parent)
1808 parent_counter = parent_counter->parent;
1809
1810 child_counter = perf_counter_alloc(&parent_counter->hw_event,
1811 parent_counter->cpu, child_ctx,
1812 group_leader, GFP_KERNEL);
1813 if (!child_counter)
1814 return NULL;
1815
1816 /*
1817 * Link it up in the child's context:
1818 */
1819 child_counter->task = child;
1820 list_add_counter(child_counter, child_ctx);
1821 child_ctx->nr_counters++;
1822
1823 child_counter->parent = parent_counter;
1824 /*
1825 * inherit into child's child as well:
1826 */
1827 child_counter->hw_event.inherit = 1;
1828
1829 /*
1830 * Get a reference to the parent filp - we will fput it
1831 * when the child counter exits. This is safe to do because
1832 * we are in the parent and we know that the filp still
1833 * exists and has a nonzero count:
1834 */
1835 atomic_long_inc(&parent_counter->filp->f_count);
1836
1837 /*
1838 * Link this into the parent counter's child list
1839 */
1840 mutex_lock(&parent_counter->mutex);
1841 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
1842
1843 /*
1844 * Make the child state follow the state of the parent counter,
1845 * not its hw_event.disabled bit. We hold the parent's mutex,
1846 * so we won't race with perf_counter_{en,dis}able_family.
1847 */
1848 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
1849 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
1850 else
1851 child_counter->state = PERF_COUNTER_STATE_OFF;
1852
1853 mutex_unlock(&parent_counter->mutex);
1854
1855 return child_counter;
1856}
1857
1858static int inherit_group(struct perf_counter *parent_counter,
1859 struct task_struct *parent,
1860 struct perf_counter_context *parent_ctx,
1861 struct task_struct *child,
1862 struct perf_counter_context *child_ctx)
1863{
1864 struct perf_counter *leader;
1865 struct perf_counter *sub;
1866
1867 leader = inherit_counter(parent_counter, parent, parent_ctx,
1868 child, NULL, child_ctx);
1869 if (!leader)
1870 return -ENOMEM;
1871 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
1872 if (!inherit_counter(sub, parent, parent_ctx,
1873 child, leader, child_ctx))
1874 return -ENOMEM;
1875 }
1876 return 0;
1877}
1878
1879static void sync_child_counter(struct perf_counter *child_counter,
1880 struct perf_counter *parent_counter)
1881{
1882 u64 parent_val, child_val;
1883
1884 parent_val = atomic64_read(&parent_counter->count);
1885 child_val = atomic64_read(&child_counter->count);
1886
1887 /*
1888 * Add back the child's count to the parent's count:
1889 */
1890 atomic64_add(child_val, &parent_counter->count);
1891
1892 /*
1893 * Remove this counter from the parent's list
1894 */
1895 mutex_lock(&parent_counter->mutex);
1896 list_del_init(&child_counter->child_list);
1897 mutex_unlock(&parent_counter->mutex);
1898
1899 /*
1900 * Release the parent counter, if this was the last
1901 * reference to it.
1902 */
1903 fput(parent_counter->filp);
1904}
1905
1906static void
1907__perf_counter_exit_task(struct task_struct *child,
1908 struct perf_counter *child_counter,
1909 struct perf_counter_context *child_ctx)
1910{
1911 struct perf_counter *parent_counter;
1912 struct perf_counter *sub, *tmp;
1913
1914 /*
1915 * If we do not self-reap then we have to wait for the
1916 * child task to unschedule (which is guaranteed to happen),
1917 * so that its counter is at its final count. (This
1918 * condition triggers rarely - child tasks usually get
1919 * off their CPU before the parent has a chance to
1920 * get this far into the reaping action)
1921 */
1922 if (child != current) {
1923 wait_task_inactive(child, 0);
1924 list_del_init(&child_counter->list_entry);
1925 } else {
1926 struct perf_cpu_context *cpuctx;
1927 unsigned long flags;
1928 u64 perf_flags;
1929
1930 /*
1931 * Disable and unlink this counter.
1932 *
1933 * Be careful about zapping the list - IRQ/NMI context
1934 * could still be processing it:
1935 */
1936 curr_rq_lock_irq_save(&flags);
1937 perf_flags = hw_perf_save_disable();
1938
1939 cpuctx = &__get_cpu_var(perf_cpu_context);
1940
1941 group_sched_out(child_counter, cpuctx, child_ctx);
1942
1943 list_del_init(&child_counter->list_entry);
1944
1945 child_ctx->nr_counters--;
1946
1947 hw_perf_restore(perf_flags);
1948 curr_rq_unlock_irq_restore(&flags);
1949 }
1950
1951 parent_counter = child_counter->parent;
1952 /*
1953 * It can happen that the parent exits first, and has counters
1954 * that are still around due to the child reference. These
1955 * counters need to be zapped - but otherwise linger.
1956 */
1957 if (parent_counter) {
1958 sync_child_counter(child_counter, parent_counter);
1959 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
1960 list_entry) {
1961 if (sub->parent) {
1962 sync_child_counter(sub, sub->parent);
1963 kfree(sub);
1964 }
1965 }
1966 kfree(child_counter);
1967 }
1968}
1969
1970/*
1971 * When a child task exits, feed back counter values to parent counters.
1972 *
1973 * Note: we may be running in child context, but the PID is not hashed
1974 * anymore so new counters will not be added.
1975 */
1976void perf_counter_exit_task(struct task_struct *child)
1977{
1978 struct perf_counter *child_counter, *tmp;
1979 struct perf_counter_context *child_ctx;
1980
1981 child_ctx = &child->perf_counter_ctx;
1982
1983 if (likely(!child_ctx->nr_counters))
1984 return;
1985
1986 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
1987 list_entry)
1988 __perf_counter_exit_task(child, child_counter, child_ctx);
1989}
1990
1991/*
1992 * Initialize the perf_counter context in task_struct
1993 */
1994void perf_counter_init_task(struct task_struct *child)
1995{
1996 struct perf_counter_context *child_ctx, *parent_ctx;
1997 struct perf_counter *counter;
1998 struct task_struct *parent = current;
1999
2000 child_ctx = &child->perf_counter_ctx;
2001 parent_ctx = &parent->perf_counter_ctx;
2002
2003 __perf_counter_init_context(child_ctx, child);
2004
2005 /*
2006 * This is executed from the parent task context, so inherit
2007 * counters that have been marked for cloning:
2008 */
2009
2010 if (likely(!parent_ctx->nr_counters))
2011 return;
2012
2013 /*
2014 * Lock the parent list. No need to lock the child - not PID
2015 * hashed yet and not running, so nobody can access it.
2016 */
2017 mutex_lock(&parent_ctx->mutex);
2018
2019 /*
2020 * We don't have to disable NMIs - we are only looking at
2021 * the list, not manipulating it:
2022 */
2023 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
2024 if (!counter->hw_event.inherit)
2025 continue;
2026
2027 if (inherit_group(counter, parent,
2028 parent_ctx, child, child_ctx))
2029 break;
2030 }
2031
2032 mutex_unlock(&parent_ctx->mutex);
2033}
2034
2035static void __cpuinit perf_counter_init_cpu(int cpu)
2036{
2037 struct perf_cpu_context *cpuctx;
2038
2039 cpuctx = &per_cpu(perf_cpu_context, cpu);
2040 __perf_counter_init_context(&cpuctx->ctx, NULL);
2041
2042 mutex_lock(&perf_resource_mutex);
2043 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
2044 mutex_unlock(&perf_resource_mutex);
2045
2046 hw_perf_counter_setup(cpu);
2047}
2048
2049#ifdef CONFIG_HOTPLUG_CPU
2050static void __perf_counter_exit_cpu(void *info)
2051{
2052 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
2053 struct perf_counter_context *ctx = &cpuctx->ctx;
2054 struct perf_counter *counter, *tmp;
2055
2056 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
2057 __perf_counter_remove_from_context(counter);
2058}
2059static void perf_counter_exit_cpu(int cpu)
2060{
2061 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
2062 struct perf_counter_context *ctx = &cpuctx->ctx;
2063
2064 mutex_lock(&ctx->mutex);
2065 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
2066 mutex_unlock(&ctx->mutex);
2067}
2068#else
2069static inline void perf_counter_exit_cpu(int cpu) { }
2070#endif
2071
2072static int __cpuinit
2073perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
2074{
2075 unsigned int cpu = (long)hcpu;
2076
2077 switch (action) {
2078
2079 case CPU_UP_PREPARE:
2080 case CPU_UP_PREPARE_FROZEN:
2081 perf_counter_init_cpu(cpu);
2082 break;
2083
2084 case CPU_DOWN_PREPARE:
2085 case CPU_DOWN_PREPARE_FROZEN:
2086 perf_counter_exit_cpu(cpu);
2087 break;
2088
2089 default:
2090 break;
2091 }
2092
2093 return NOTIFY_OK;
2094}
2095
2096static struct notifier_block __cpuinitdata perf_cpu_nb = {
2097 .notifier_call = perf_cpu_notify,
2098};
2099
2100static int __init perf_counter_init(void)
2101{
2102 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
2103 (void *)(long)smp_processor_id());
2104 register_cpu_notifier(&perf_cpu_nb);
2105
2106 return 0;
2107}
2108early_initcall(perf_counter_init);
2109
2110static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
2111{
2112 return sprintf(buf, "%d\n", perf_reserved_percpu);
2113}
2114
2115static ssize_t
2116perf_set_reserve_percpu(struct sysdev_class *class,
2117 const char *buf,
2118 size_t count)
2119{
2120 struct perf_cpu_context *cpuctx;
2121 unsigned long val;
2122 int err, cpu, mpt;
2123
2124 err = strict_strtoul(buf, 10, &val);
2125 if (err)
2126 return err;
2127 if (val > perf_max_counters)
2128 return -EINVAL;
2129
2130 mutex_lock(&perf_resource_mutex);
2131 perf_reserved_percpu = val;
2132 for_each_online_cpu(cpu) {
2133 cpuctx = &per_cpu(perf_cpu_context, cpu);
2134 spin_lock_irq(&cpuctx->ctx.lock);
2135 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
2136 perf_max_counters - perf_reserved_percpu);
2137 cpuctx->max_pertask = mpt;
2138 spin_unlock_irq(&cpuctx->ctx.lock);
2139 }
2140 mutex_unlock(&perf_resource_mutex);
2141
2142 return count;
2143}
2144
2145static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
2146{
2147 return sprintf(buf, "%d\n", perf_overcommit);
2148}
2149
2150static ssize_t
2151perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
2152{
2153 unsigned long val;
2154 int err;
2155
2156 err = strict_strtoul(buf, 10, &val);
2157 if (err)
2158 return err;
2159 if (val > 1)
2160 return -EINVAL;
2161
2162 mutex_lock(&perf_resource_mutex);
2163 perf_overcommit = val;
2164 mutex_unlock(&perf_resource_mutex);
2165
2166 return count;
2167}
2168
2169static SYSDEV_CLASS_ATTR(
2170 reserve_percpu,
2171 0644,
2172 perf_show_reserve_percpu,
2173 perf_set_reserve_percpu
2174 );
2175
2176static SYSDEV_CLASS_ATTR(
2177 overcommit,
2178 0644,
2179 perf_show_overcommit,
2180 perf_set_overcommit
2181 );
2182
2183static struct attribute *perfclass_attrs[] = {
2184 &attr_reserve_percpu.attr,
2185 &attr_overcommit.attr,
2186 NULL
2187};
2188
2189static struct attribute_group perfclass_attr_group = {
2190 .attrs = perfclass_attrs,
2191 .name = "perf_counters",
2192};
2193
2194static int __init perf_counter_sysfs_init(void)
2195{
2196 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
2197 &perfclass_attr_group);
2198}
2199device_initcall(perf_counter_sysfs_init);
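
For orientation, the pieces above (sys_perf_counter_open(), the read() path for PERF_RECORD_SIMPLE counters, and the enable/disable ioctls) combine roughly as in the userspace sketch below. This is an illustration only, not part of the patch: it assumes the perf_counter_hw_event layout and the PERF_COUNT_*, PERF_RECORD_* and PERF_COUNTER_IOC_* definitions from <linux/perf_counter.h> in this series, plus an architecture-specific __NR_perf_counter_open syscall number.

/*
 * Illustrative userspace sketch - not part of the patch above.
 * The header path and __NR_perf_counter_open are assumptions.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

int main(void)
{
	struct perf_counter_hw_event hw_event;
	uint64_t count;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type		= PERF_COUNT_TASK_CLOCK;	/* software counter */
	hw_event.record_type	= PERF_RECORD_SIMPLE;		/* plain u64 read() */
	hw_event.disabled	= 1;				/* start in OFF state */

	/* pid 0: current task, cpu -1: any cpu, group_fd -1: own group leader */
	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
	/* ... workload to be measured ... */
	ioctl(fd, PERF_COUNTER_IOC_DISABLE);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("task clock: %llu ns\n", (unsigned long long)count);

	close(fd);
	return 0;
}
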
diff --git a/kernel/sched.c b/kernel/sched.c
index c1d0ed360088..83b68ff6df80 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -558,6 +558,7 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
+	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -668,7 +669,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
 	rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -979,6 +980,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	}
 }
 
+void curr_rq_lock_irq_save(unsigned long *flags)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	local_irq_save(*flags);
+	rq = cpu_rq(smp_processor_id());
+	spin_lock(&rq->lock);
+}
+
+void curr_rq_unlock_irq_restore(unsigned long *flags)
+	__releases(rq->lock)
+{
+	struct rq *rq;
+
+	rq = cpu_rq(smp_processor_id());
+	spin_unlock(&rq->lock);
+	local_irq_restore(*flags);
+}
+
 void task_rq_unlock_wait(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
@@ -1885,12 +1906,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		p->se.sleep_start -= clock_offset;
 	if (p->se.block_start)
 		p->se.block_start -= clock_offset;
+#endif
 	if (old_cpu != new_cpu) {
-		schedstat_inc(p, se.nr_migrations);
+		p->se.nr_migrations++;
+		new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
-	}
 #endif
+	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;
 
@@ -2242,6 +2266,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:		the task to evaluate
+ * @func:	the function to be called
+ * @info:	the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+			      void (*func) (void *info), void *info)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if (task_curr(p))
+		smp_call_function_single(cpu, func, info, 1);
+	preempt_enable();
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2384,6 +2429,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
+	p->se.nr_migrations		= 0;
 	p->se.last_wakeup		= 0;
 	p->se.avg_overlap		= 0;
 
@@ -2604,6 +2650,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+	perf_counter_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
 	if (current->sched_class->post_schedule)
@@ -2766,6 +2813,21 @@ unsigned long nr_active(void)
 }
 
 /*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_switches(cpu) - number of context switches on that cpu
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_switches(int cpu)
+{
+	return cpu_rq(cpu)->nr_switches;
+}
+
+u64 cpu_nr_migrations(int cpu)
+{
+	return cpu_rq(cpu)->nr_migrations_in;
+}
+
+/*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
  */
@@ -4137,6 +4199,29 @@ EXPORT_PER_CPU_SYMBOL(kstat);
  * Return any ns on the sched_clock that have not yet been banked in
  * @p in case that task is currently running.
  */
+unsigned long long __task_delta_exec(struct task_struct *p, int update)
+{
+	s64 delta_exec;
+	struct rq *rq;
+
+	rq = task_rq(p);
+	WARN_ON_ONCE(!runqueue_is_locked());
+	WARN_ON_ONCE(!task_current(rq, p));
+
+	if (update)
+		update_rq_clock(rq);
+
+	delta_exec = rq->clock - p->se.exec_start;
+
+	WARN_ON_ONCE(delta_exec < 0);
+
+	return delta_exec;
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
+ */
 unsigned long long task_delta_exec(struct task_struct *p)
 {
 	unsigned long flags;
@@ -4396,6 +4481,7 @@ void scheduler_tick(void)
 	update_rq_clock(rq);
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
+	perf_counter_task_tick(curr, cpu);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -4591,6 +4677,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
+		perf_counter_task_sched_out(prev, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
@@ -5944,12 +6031,7 @@ void sched_show_task(struct task_struct *p)
 	printk(KERN_CONT " %016lx ", thread_saved_pc(p));
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
-	{
-		unsigned long *n = end_of_stack(p);
-		while (!*n)
-			n++;
-		free = (unsigned long)n - (unsigned long)end_of_stack(p);
-	}
+	free = stack_not_used(p);
 #endif
 	printk(KERN_CONT "%5lu %5d %6d\n", free,
 		task_pid_nr(p), task_pid_nr(p->real_parent));
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bac1061cea2f..da932f4c8524 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -960,12 +960,13 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
-static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+static inline int pick_optimal_cpu(int this_cpu,
+				   const struct cpumask *mask)
 {
 	int first;
 
 	/* "this_cpu" is cheaper to preempt than a remote processor */
-	if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+	if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
 		return this_cpu;
 
 	first = cpumask_first(mask);
@@ -981,6 +982,7 @@ static int find_lowest_rq(struct task_struct *task)
 	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
+	cpumask_var_t domain_mask;
 
 	if (task->rt.nr_cpus_allowed == 1)
 		return -1; /* No other targets possible */
@@ -1013,19 +1015,25 @@ static int find_lowest_rq(struct task_struct *task)
 	if (this_cpu == cpu)
 		this_cpu = -1; /* Skip this_cpu opt if the same */
 
-	for_each_domain(cpu, sd) {
-		if (sd->flags & SD_WAKE_AFFINE) {
-			cpumask_t domain_mask;
-			int best_cpu;
-
-			cpumask_and(&domain_mask, sched_domain_span(sd),
-				    lowest_mask);
-
-			best_cpu = pick_optimal_cpu(this_cpu,
-						    &domain_mask);
-			if (best_cpu != -1)
-				return best_cpu;
+	if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
+		for_each_domain(cpu, sd) {
+			if (sd->flags & SD_WAKE_AFFINE) {
+				int best_cpu;
+
+				cpumask_and(domain_mask,
+					    sched_domain_span(sd),
+					    lowest_mask);
+
+				best_cpu = pick_optimal_cpu(this_cpu,
+							    domain_mask);
+
+				if (best_cpu != -1) {
+					free_cpumask_var(domain_mask);
+					return best_cpu;
+				}
+			}
 		}
+		free_cpumask_var(domain_mask);
 	}
 
 	/*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8d..0365b4899a3d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -795,6 +795,11 @@ int __init __weak early_irq_init(void)
 	return 0;
 }
 
+int __init __weak arch_probe_nr_irqs(void)
+{
+	return 0;
+}
+
 int __init __weak arch_early_irq_init(void)
 {
 	return 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index f145c415bc16..c5e7dec4966e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
 #include <linux/prctl.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/perf_counter.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
 #include <linux/kexec.h>
@@ -1791,6 +1792,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_SET_TSC:
 		error = SET_TSC_CTL(arg2);
 		break;
+	case PR_TASK_PERF_COUNTERS_DISABLE:
+		error = perf_counter_task_disable();
+		break;
+	case PR_TASK_PERF_COUNTERS_ENABLE:
+		error = perf_counter_task_enable();
+		break;
 	case PR_GET_TIMERSLACK:
 		error = current->timer_slack_ns;
 		break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad2967387..68320f6b07b5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);