author    Rusty Russell <rusty@rustcorp.com.au>  2008-12-13 06:25:51 -0500
committer Rusty Russell <rusty@rustcorp.com.au>  2008-12-13 06:25:51 -0500
commit    968ea6d80e395cf11a51143cfa1b9a14ada676df (patch)
tree      dc2acec8c9bdced33afe1e273ee5e0b0b93d2703 /arch/x86/kernel
parent    7be7585393d311866653564fbcd10a3232773c0b (diff)
parent    8299608f140ae321e4eb5d1306184265d2b9511e (diff)

Merge ../linux-2.6-x86

Conflicts:
	arch/x86/kernel/io_apic.c
	kernel/sched.c
	kernel/sched_stats.h
Diffstat (limited to 'arch/x86/kernel')
 -rw-r--r--  arch/x86/kernel/Makefile                    |    3
 -rw-r--r--  arch/x86/kernel/acpi/boot.c                 |   11
 -rw-r--r--  arch/x86/kernel/apm_32.c                    |    4
 -rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c  |    4
 -rw-r--r--  arch/x86/kernel/cpu/intel.c                 |    3
 -rw-r--r--  arch/x86/kernel/ds.c                        |  692
 -rw-r--r--  arch/x86/kernel/dumpstack.c                 |  351
 -rw-r--r--  arch/x86/kernel/dumpstack.h                 |   39
 -rw-r--r--  arch/x86/kernel/dumpstack_32.c              |  307
 -rw-r--r--  arch/x86/kernel/dumpstack_64.c              |  289
 -rw-r--r--  arch/x86/kernel/entry_32.S                  |   51
 -rw-r--r--  arch/x86/kernel/entry_64.S                  |   83
 -rw-r--r--  arch/x86/kernel/es7000_32.c                 |   62
 -rw-r--r--  arch/x86/kernel/ftrace.c                    |  390
 -rw-r--r--  arch/x86/kernel/genapic_64.c                |    4
 -rw-r--r--  arch/x86/kernel/io_apic.c                   |  631
 -rw-r--r--  arch/x86/kernel/irq.c                       |    3
 -rw-r--r--  arch/x86/kernel/irq_32.c                    |    2
 -rw-r--r--  arch/x86/kernel/irq_64.c                    |    2
 -rw-r--r--  arch/x86/kernel/irqinit_32.c                |    3
 -rw-r--r--  arch/x86/kernel/irqinit_64.c                |    3
 -rw-r--r--  arch/x86/kernel/mpparse.c                   |   25
 -rw-r--r--  arch/x86/kernel/numaq_32.c                  |   10
 -rw-r--r--  arch/x86/kernel/process.c                   |   32
 -rw-r--r--  arch/x86/kernel/process_32.c                |    4
 -rw-r--r--  arch/x86/kernel/process_64.c                |    4
 -rw-r--r--  arch/x86/kernel/ptrace.c                    |   98
 -rw-r--r--  arch/x86/kernel/reboot.c                    |   31
 -rw-r--r--  arch/x86/kernel/setup.c                     |   17
 -rw-r--r--  arch/x86/kernel/smp.c                       |   13
 -rw-r--r--  arch/x86/kernel/smpboot.c                   |   17
 -rw-r--r--  arch/x86/kernel/stacktrace.c                |   64
 -rw-r--r--  arch/x86/kernel/vsyscall_64.c               |    3
 33 files changed, 1921 insertions, 1334 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b62a7667828e..1cad9318d217 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,7 @@ CFLAGS_tsc.o := $(nostackp)
 
 obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y += time_$(BITS).o ioport.o ldt.o
+obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
 obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
 obj-$(CONFIG_X86_VISWS) += visws_quirks.o
 obj-$(CONFIG_X86_32) += probe_roms_32.o
@@ -65,6 +65,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC) += io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4c51a2f8fd31..65d0b72777ea 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1360,6 +1360,17 @@ static void __init acpi_process_madt(void)
 			disable_acpi();
 		}
 	}
+
+	/*
+	 * ACPI supports both logical (e.g. Hyper-Threading) and physical
+	 * processors, where MPS only supports physical.
+	 */
+	if (acpi_lapic && acpi_ioapic)
+		printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+		       "information\n");
+	else if (acpi_lapic)
+		printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+		       "configuration information\n");
 #endif
 	return;
 }
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5145a6e72bbb..3a26525a3f31 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -391,11 +391,7 @@ static int power_off;
 #else
 static int power_off = 1;
 #endif
-#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
-static int realmode_power_off = 1;
-#else
 static int realmode_power_off;
-#endif
 #ifdef CONFIG_APM_ALLOW_INTS
 static int allow_ints = 1;
 #else
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8e48c5d4467d..88ea02dcb622 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <linux/acpi.h>
 #include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 	unsigned int next_perf_state = 0; /* Index into perf table */
 	unsigned int i;
 	int result = 0;
+	struct power_trace it;
 
 	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 
@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		}
 	}
 
+	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+
 	switch (data->cpu_feature) {
 	case SYSTEM_INTEL_MSR_CAPABLE:
 		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index cce0b6118d55..816f27f289b1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -307,12 +307,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_P4);
 	if (c->x86 == 6)
 		set_cpu_cap(c, X86_FEATURE_P3);
+#endif
 
 	if (cpu_has_bts)
 		ptrace_bts_init_intel(c);
 
-#endif
-
 	detect_extended_topology(c);
 	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
 		/*
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index a2d1176c38ee..19a8c2c0389f 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -7,13 +7,12 @@
7 * 7 *
8 * It manages: 8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS 9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional) 10 * - buffer overflow handling (to be done)
11 * - buffer overflow handling
12 * - buffer access 11 * - buffer access
13 * 12 *
14 * It assumes: 13 * It assumes:
15 * - get_task_struct on all parameter tasks 14 * - get_task_struct on all traced tasks
16 * - current is allowed to trace parameter tasks 15 * - current is allowed to trace tasks
17 * 16 *
18 * 17 *
19 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2008 Intel Corporation.
@@ -28,6 +27,7 @@
28#include <linux/slab.h> 27#include <linux/slab.h>
29#include <linux/sched.h> 28#include <linux/sched.h>
30#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/kernel.h>
31 31
32 32
33/* 33/*
@@ -44,6 +44,33 @@ struct ds_configuration {
44}; 44};
45static struct ds_configuration ds_cfg; 45static struct ds_configuration ds_cfg;
46 46
47/*
48 * A BTS or PEBS tracer.
49 *
50 * This holds the configuration of the tracer and serves as a handle
51 * to identify tracers.
52 */
53struct ds_tracer {
54 /* the DS context (partially) owned by this tracer */
55 struct ds_context *context;
56 /* the buffer provided on ds_request() and its size in bytes */
57 void *buffer;
58 size_t size;
59};
60
61struct bts_tracer {
62 /* the common DS part */
63 struct ds_tracer ds;
64 /* buffer overflow notification function */
65 bts_ovfl_callback_t ovfl;
66};
67
68struct pebs_tracer {
69 /* the common DS part */
70 struct ds_tracer ds;
71 /* buffer overflow notification function */
72 pebs_ovfl_callback_t ovfl;
73};
47 74
48/* 75/*
49 * Debug Store (DS) save area configuration (see Intel64 and IA32 76 * Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -107,34 +134,13 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
107 (*(unsigned long *)base) = value; 134 (*(unsigned long *)base) = value;
108} 135}
109 136
137#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
110 138
111/*
112 * Locking is done only for allocating BTS or PEBS resources and for
113 * guarding context and buffer memory allocation.
114 *
115 * Most functions require the current task to own the ds context part
116 * they are going to access. All the locking is done when validating
117 * access to the context.
118 */
119static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
120 139
121/* 140/*
122 * Validate that the current task is allowed to access the BTS/PEBS 141 * Locking is done only for allocating BTS or PEBS resources.
123 * buffer of the parameter task.
124 *
125 * Returns 0, if access is granted; -Eerrno, otherwise.
126 */ 142 */
127static inline int ds_validate_access(struct ds_context *context, 143static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
128 enum ds_qualifier qual)
129{
130 if (!context)
131 return -EPERM;
132
133 if (context->owner[qual] == current)
134 return 0;
135
136 return -EPERM;
137}
138 144
139 145
140/* 146/*
@@ -183,51 +189,13 @@ static inline int check_tracer(struct task_struct *task)
183 * 189 *
184 * Contexts are use-counted. They are allocated on first access and 190 * Contexts are use-counted. They are allocated on first access and
185 * deallocated when the last user puts the context. 191 * deallocated when the last user puts the context.
186 *
187 * We distinguish between an allocating and a non-allocating get of a
188 * context:
189 * - the allocating get is used for requesting BTS/PEBS resources. It
190 * requires the caller to hold the global ds_lock.
191 * - the non-allocating get is used for all other cases. A
192 * non-existing context indicates an error. It acquires and releases
193 * the ds_lock itself for obtaining the context.
194 *
195 * A context and its DS configuration are allocated and deallocated
196 * together. A context always has a DS configuration of the
197 * appropriate size.
198 */ 192 */
199static DEFINE_PER_CPU(struct ds_context *, system_context); 193static DEFINE_PER_CPU(struct ds_context *, system_context);
200 194
201#define this_system_context per_cpu(system_context, smp_processor_id()) 195#define this_system_context per_cpu(system_context, smp_processor_id())
202 196
203/*
204 * Returns the pointer to the parameter task's context or to the
205 * system-wide context, if task is NULL.
206 *
207 * Increases the use count of the returned context, if not NULL.
208 */
209static inline struct ds_context *ds_get_context(struct task_struct *task) 197static inline struct ds_context *ds_get_context(struct task_struct *task)
210{ 198{
211 struct ds_context *context;
212 unsigned long irq;
213
214 spin_lock_irqsave(&ds_lock, irq);
215
216 context = (task ? task->thread.ds_ctx : this_system_context);
217 if (context)
218 context->count++;
219
220 spin_unlock_irqrestore(&ds_lock, irq);
221
222 return context;
223}
224
225/*
226 * Same as ds_get_context, but allocates the context and it's DS
227 * structure, if necessary; returns NULL; if out of memory.
228 */
229static inline struct ds_context *ds_alloc_context(struct task_struct *task)
230{
231 struct ds_context **p_context = 199 struct ds_context **p_context =
232 (task ? &task->thread.ds_ctx : &this_system_context); 200 (task ? &task->thread.ds_ctx : &this_system_context);
233 struct ds_context *context = *p_context; 201 struct ds_context *context = *p_context;
@@ -238,16 +206,9 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)
238 if (!context) 206 if (!context)
239 return NULL; 207 return NULL;
240 208
241 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
242 if (!context->ds) {
243 kfree(context);
244 return NULL;
245 }
246
247 spin_lock_irqsave(&ds_lock, irq); 209 spin_lock_irqsave(&ds_lock, irq);
248 210
249 if (*p_context) { 211 if (*p_context) {
250 kfree(context->ds);
251 kfree(context); 212 kfree(context);
252 213
253 context = *p_context; 214 context = *p_context;
@@ -272,10 +233,6 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)
272 return context; 233 return context;
273} 234}
274 235
275/*
276 * Decreases the use count of the parameter context, if not NULL.
277 * Deallocates the context, if the use count reaches zero.
278 */
279static inline void ds_put_context(struct ds_context *context) 236static inline void ds_put_context(struct ds_context *context)
280{ 237{
281 unsigned long irq; 238 unsigned long irq;
@@ -296,13 +253,6 @@ static inline void ds_put_context(struct ds_context *context)
296 if (!context->task || (context->task == current)) 253 if (!context->task || (context->task == current))
297 wrmsrl(MSR_IA32_DS_AREA, 0); 254 wrmsrl(MSR_IA32_DS_AREA, 0);
298 255
299 put_tracer(context->task);
300
301 /* free any leftover buffers from tracers that did not
302 * deallocate them properly. */
303 kfree(context->buffer[ds_bts]);
304 kfree(context->buffer[ds_pebs]);
305 kfree(context->ds);
306 kfree(context); 256 kfree(context);
307 out: 257 out:
308 spin_unlock_irqrestore(&ds_lock, irq); 258 spin_unlock_irqrestore(&ds_lock, irq);
@@ -312,345 +262,342 @@ static inline void ds_put_context(struct ds_context *context)
312/* 262/*
313 * Handle a buffer overflow 263 * Handle a buffer overflow
314 * 264 *
315 * task: the task whose buffers are overflowing;
316 * NULL for a buffer overflow on the current cpu
317 * context: the ds context 265 * context: the ds context
318 * qual: the buffer type 266 * qual: the buffer type
319 */ 267 */
320static void ds_overflow(struct task_struct *task, struct ds_context *context, 268static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
321 enum ds_qualifier qual) 269{
322{ 270 switch (qual) {
323 if (!context) 271 case ds_bts: {
324 return; 272 struct bts_tracer *tracer =
325 273 container_of(context->owner[qual],
326 if (context->callback[qual]) 274 struct bts_tracer, ds);
327 (*context->callback[qual])(task); 275 if (tracer->ovfl)
328 276 tracer->ovfl(tracer);
329 /* todo: do some more overflow handling */ 277 }
278 break;
279 case ds_pebs: {
280 struct pebs_tracer *tracer =
281 container_of(context->owner[qual],
282 struct pebs_tracer, ds);
283 if (tracer->ovfl)
284 tracer->ovfl(tracer);
285 }
286 break;
287 }
330} 288}
331 289
332 290
333/* 291static void ds_install_ds_config(struct ds_context *context,
334 * Allocate a non-pageable buffer of the parameter size. 292 enum ds_qualifier qual,
335 * Checks the memory and the locked memory rlimit. 293 void *base, size_t size, size_t ith)
336 *
337 * Returns the buffer, if successful;
338 * NULL, if out of memory or rlimit exceeded.
339 *
340 * size: the requested buffer size in bytes
341 * pages (out): if not NULL, contains the number of pages reserved
342 */
343static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
344{ 294{
345 unsigned long rlim, vm, pgsz; 295 unsigned long buffer, adj;
346 void *buffer;
347
348 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
349
350 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
351 vm = current->mm->total_vm + pgsz;
352 if (rlim < vm)
353 return NULL;
354 296
355 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 297 /* adjust the buffer address and size to meet alignment
356 vm = current->mm->locked_vm + pgsz; 298 * constraints:
357 if (rlim < vm) 299 * - buffer is double-word aligned
358 return NULL; 300 * - size is multiple of record size
301 *
302 * We checked the size at the very beginning; we have enough
303 * space to do the adjustment.
304 */
305 buffer = (unsigned long)base;
359 306
360 buffer = kzalloc(size, GFP_KERNEL); 307 adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
361 if (!buffer) 308 buffer += adj;
362 return NULL; 309 size -= adj;
363 310
364 current->mm->total_vm += pgsz; 311 size /= ds_cfg.sizeof_rec[qual];
365 current->mm->locked_vm += pgsz; 312 size *= ds_cfg.sizeof_rec[qual];
366 313
367 if (pages) 314 ds_set(context->ds, qual, ds_buffer_base, buffer);
368 *pages = pgsz; 315 ds_set(context->ds, qual, ds_index, buffer);
316 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
369 317
370 return buffer; 318 /* The value for 'no threshold' is -1, which will set the
319 * threshold outside of the buffer, just like we want it.
320 */
321 ds_set(context->ds, qual,
322 ds_interrupt_threshold, buffer + size - ith);
371} 323}
372 324
373static int ds_request(struct task_struct *task, void *base, size_t size, 325static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
374 ds_ovfl_callback_t ovfl, enum ds_qualifier qual) 326 struct task_struct *task,
327 void *base, size_t size, size_t th)
375{ 328{
376 struct ds_context *context; 329 struct ds_context *context;
377 unsigned long buffer, adj;
378 const unsigned long alignment = (1 << 3);
379 unsigned long irq; 330 unsigned long irq;
380 int error = 0; 331 int error;
381 332
333 error = -EOPNOTSUPP;
382 if (!ds_cfg.sizeof_ds) 334 if (!ds_cfg.sizeof_ds)
383 return -EOPNOTSUPP; 335 goto out;
336
337 error = -EINVAL;
338 if (!base)
339 goto out;
384 340
385 /* we require some space to do alignment adjustments below */ 341 /* we require some space to do alignment adjustments below */
386 if (size < (alignment + ds_cfg.sizeof_rec[qual])) 342 error = -EINVAL;
387 return -EINVAL; 343 if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
344 goto out;
388 345
389 /* buffer overflow notification is not yet implemented */ 346 if (th != (size_t)-1) {
390 if (ovfl) 347 th *= ds_cfg.sizeof_rec[qual];
391 return -EOPNOTSUPP; 348
349 error = -EINVAL;
350 if (size <= th)
351 goto out;
352 }
392 353
354 tracer->buffer = base;
355 tracer->size = size;
393 356
394 context = ds_alloc_context(task); 357 error = -ENOMEM;
358 context = ds_get_context(task);
395 if (!context) 359 if (!context)
396 return -ENOMEM; 360 goto out;
361 tracer->context = context;
362
397 363
398 spin_lock_irqsave(&ds_lock, irq); 364 spin_lock_irqsave(&ds_lock, irq);
399 365
400 error = -EPERM; 366 error = -EPERM;
401 if (!check_tracer(task)) 367 if (!check_tracer(task))
402 goto out_unlock; 368 goto out_unlock;
403
404 get_tracer(task); 369 get_tracer(task);
405 370
406 error = -EALREADY;
407 if (context->owner[qual] == current)
408 goto out_put_tracer;
409 error = -EPERM; 371 error = -EPERM;
410 if (context->owner[qual] != NULL) 372 if (context->owner[qual])
411 goto out_put_tracer; 373 goto out_put_tracer;
412 context->owner[qual] = current; 374 context->owner[qual] = tracer;
413 375
414 spin_unlock_irqrestore(&ds_lock, irq); 376 spin_unlock_irqrestore(&ds_lock, irq);
415 377
416 378
417 error = -ENOMEM; 379 ds_install_ds_config(context, qual, base, size, th);
418 if (!base) {
419 base = ds_allocate_buffer(size, &context->pages[qual]);
420 if (!base)
421 goto out_release;
422
423 context->buffer[qual] = base;
424 }
425 error = 0;
426 380
427 context->callback[qual] = ovfl; 381 return 0;
428
429 /* adjust the buffer address and size to meet alignment
430 * constraints:
431 * - buffer is double-word aligned
432 * - size is multiple of record size
433 *
434 * We checked the size at the very beginning; we have enough
435 * space to do the adjustment.
436 */
437 buffer = (unsigned long)base;
438
439 adj = ALIGN(buffer, alignment) - buffer;
440 buffer += adj;
441 size -= adj;
442
443 size /= ds_cfg.sizeof_rec[qual];
444 size *= ds_cfg.sizeof_rec[qual];
445
446 ds_set(context->ds, qual, ds_buffer_base, buffer);
447 ds_set(context->ds, qual, ds_index, buffer);
448 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
449
450 if (ovfl) {
451 /* todo: select a suitable interrupt threshold */
452 } else
453 ds_set(context->ds, qual,
454 ds_interrupt_threshold, buffer + size + 1);
455
456 /* we keep the context until ds_release */
457 return error;
458
459 out_release:
460 context->owner[qual] = NULL;
461 ds_put_context(context);
462 put_tracer(task);
463 return error;
464 382
465 out_put_tracer: 383 out_put_tracer:
466 spin_unlock_irqrestore(&ds_lock, irq);
467 ds_put_context(context);
468 put_tracer(task); 384 put_tracer(task);
469 return error;
470
471 out_unlock: 385 out_unlock:
472 spin_unlock_irqrestore(&ds_lock, irq); 386 spin_unlock_irqrestore(&ds_lock, irq);
473 ds_put_context(context); 387 ds_put_context(context);
388 tracer->context = NULL;
389 out:
474 return error; 390 return error;
475} 391}
476 392
477int ds_request_bts(struct task_struct *task, void *base, size_t size, 393struct bts_tracer *ds_request_bts(struct task_struct *task,
478 ds_ovfl_callback_t ovfl) 394 void *base, size_t size,
395 bts_ovfl_callback_t ovfl, size_t th)
479{ 396{
480 return ds_request(task, base, size, ovfl, ds_bts); 397 struct bts_tracer *tracer;
481} 398 int error;
482 399
483int ds_request_pebs(struct task_struct *task, void *base, size_t size, 400 /* buffer overflow notification is not yet implemented */
484 ds_ovfl_callback_t ovfl) 401 error = -EOPNOTSUPP;
485{ 402 if (ovfl)
486 return ds_request(task, base, size, ovfl, ds_pebs); 403 goto out;
404
405 error = -ENOMEM;
406 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
407 if (!tracer)
408 goto out;
409 tracer->ovfl = ovfl;
410
411 error = ds_request(&tracer->ds, ds_bts, task, base, size, th);
412 if (error < 0)
413 goto out_tracer;
414
415 return tracer;
416
417 out_tracer:
418 kfree(tracer);
419 out:
420 return ERR_PTR(error);
487} 421}
488 422
489static int ds_release(struct task_struct *task, enum ds_qualifier qual) 423struct pebs_tracer *ds_request_pebs(struct task_struct *task,
424 void *base, size_t size,
425 pebs_ovfl_callback_t ovfl, size_t th)
490{ 426{
491 struct ds_context *context; 427 struct pebs_tracer *tracer;
492 int error; 428 int error;
493 429
494 context = ds_get_context(task); 430 /* buffer overflow notification is not yet implemented */
495 error = ds_validate_access(context, qual); 431 error = -EOPNOTSUPP;
496 if (error < 0) 432 if (ovfl)
497 goto out; 433 goto out;
498 434
499 kfree(context->buffer[qual]); 435 error = -ENOMEM;
500 context->buffer[qual] = NULL; 436 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
437 if (!tracer)
438 goto out;
439 tracer->ovfl = ovfl;
501 440
502 current->mm->total_vm -= context->pages[qual]; 441 error = ds_request(&tracer->ds, ds_pebs, task, base, size, th);
503 current->mm->locked_vm -= context->pages[qual]; 442 if (error < 0)
504 context->pages[qual] = 0; 443 goto out_tracer;
505 context->owner[qual] = NULL;
506 444
507 /* 445 return tracer;
508 * we put the context twice: 446
509 * once for the ds_get_context 447 out_tracer:
510 * once for the corresponding ds_request 448 kfree(tracer);
511 */
512 ds_put_context(context);
513 out: 449 out:
514 ds_put_context(context); 450 return ERR_PTR(error);
515 return error;
516} 451}
517 452
518int ds_release_bts(struct task_struct *task) 453static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual)
519{ 454{
520 return ds_release(task, ds_bts); 455 BUG_ON(tracer->context->owner[qual] != tracer);
456 tracer->context->owner[qual] = NULL;
457
458 put_tracer(tracer->context->task);
459 ds_put_context(tracer->context);
521} 460}
522 461
523int ds_release_pebs(struct task_struct *task) 462int ds_release_bts(struct bts_tracer *tracer)
524{ 463{
525 return ds_release(task, ds_pebs); 464 if (!tracer)
465 return -EINVAL;
466
467 ds_release(&tracer->ds, ds_bts);
468 kfree(tracer);
469
470 return 0;
526} 471}
527 472
528static int ds_get_index(struct task_struct *task, size_t *pos, 473int ds_release_pebs(struct pebs_tracer *tracer)
529 enum ds_qualifier qual)
530{ 474{
531 struct ds_context *context; 475 if (!tracer)
532 unsigned long base, index; 476 return -EINVAL;
533 int error;
534 477
535 context = ds_get_context(task); 478 ds_release(&tracer->ds, ds_pebs);
536 error = ds_validate_access(context, qual); 479 kfree(tracer);
537 if (error < 0) 480
538 goto out; 481 return 0;
482}
483
484static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual)
485{
486 unsigned long base, index;
539 487
540 base = ds_get(context->ds, qual, ds_buffer_base); 488 base = ds_get(context->ds, qual, ds_buffer_base);
541 index = ds_get(context->ds, qual, ds_index); 489 index = ds_get(context->ds, qual, ds_index);
542 490
543 error = ((index - base) / ds_cfg.sizeof_rec[qual]); 491 return (index - base) / ds_cfg.sizeof_rec[qual];
544 if (pos)
545 *pos = error;
546 out:
547 ds_put_context(context);
548 return error;
549} 492}
550 493
551int ds_get_bts_index(struct task_struct *task, size_t *pos) 494int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos)
552{ 495{
553 return ds_get_index(task, pos, ds_bts); 496 if (!tracer)
497 return -EINVAL;
498
499 if (!pos)
500 return -EINVAL;
501
502 *pos = ds_get_index(tracer->ds.context, ds_bts);
503
504 return 0;
554} 505}
555 506
556int ds_get_pebs_index(struct task_struct *task, size_t *pos) 507int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos)
557{ 508{
558 return ds_get_index(task, pos, ds_pebs); 509 if (!tracer)
510 return -EINVAL;
511
512 if (!pos)
513 return -EINVAL;
514
515 *pos = ds_get_index(tracer->ds.context, ds_pebs);
516
517 return 0;
559} 518}
560 519
561static int ds_get_end(struct task_struct *task, size_t *pos, 520static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual)
562 enum ds_qualifier qual)
563{ 521{
564 struct ds_context *context; 522 unsigned long base, max;
565 unsigned long base, end;
566 int error;
567
568 context = ds_get_context(task);
569 error = ds_validate_access(context, qual);
570 if (error < 0)
571 goto out;
572 523
573 base = ds_get(context->ds, qual, ds_buffer_base); 524 base = ds_get(context->ds, qual, ds_buffer_base);
574 end = ds_get(context->ds, qual, ds_absolute_maximum); 525 max = ds_get(context->ds, qual, ds_absolute_maximum);
575 526
576 error = ((end - base) / ds_cfg.sizeof_rec[qual]); 527 return (max - base) / ds_cfg.sizeof_rec[qual];
577 if (pos)
578 *pos = error;
579 out:
580 ds_put_context(context);
581 return error;
582} 528}
583 529
584int ds_get_bts_end(struct task_struct *task, size_t *pos) 530int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos)
585{ 531{
586 return ds_get_end(task, pos, ds_bts); 532 if (!tracer)
533 return -EINVAL;
534
535 if (!pos)
536 return -EINVAL;
537
538 *pos = ds_get_end(tracer->ds.context, ds_bts);
539
540 return 0;
587} 541}
588 542
589int ds_get_pebs_end(struct task_struct *task, size_t *pos) 543int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos)
590{ 544{
591 return ds_get_end(task, pos, ds_pebs); 545 if (!tracer)
546 return -EINVAL;
547
548 if (!pos)
549 return -EINVAL;
550
551 *pos = ds_get_end(tracer->ds.context, ds_pebs);
552
553 return 0;
592} 554}
593 555
594static int ds_access(struct task_struct *task, size_t index, 556static int ds_access(struct ds_context *context, enum ds_qualifier qual,
595 const void **record, enum ds_qualifier qual) 557 size_t index, const void **record)
596{ 558{
597 struct ds_context *context;
598 unsigned long base, idx; 559 unsigned long base, idx;
599 int error;
600 560
601 if (!record) 561 if (!record)
602 return -EINVAL; 562 return -EINVAL;
603 563
604 context = ds_get_context(task);
605 error = ds_validate_access(context, qual);
606 if (error < 0)
607 goto out;
608
609 base = ds_get(context->ds, qual, ds_buffer_base); 564 base = ds_get(context->ds, qual, ds_buffer_base);
610 idx = base + (index * ds_cfg.sizeof_rec[qual]); 565 idx = base + (index * ds_cfg.sizeof_rec[qual]);
611 566
612 error = -EINVAL;
613 if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) 567 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
614 goto out; 568 return -EINVAL;
615 569
616 *record = (const void *)idx; 570 *record = (const void *)idx;
617 error = ds_cfg.sizeof_rec[qual]; 571
618 out: 572 return ds_cfg.sizeof_rec[qual];
619 ds_put_context(context);
620 return error;
621} 573}
622 574
623int ds_access_bts(struct task_struct *task, size_t index, const void **record) 575int ds_access_bts(struct bts_tracer *tracer, size_t index,
576 const void **record)
624{ 577{
625 return ds_access(task, index, record, ds_bts); 578 if (!tracer)
579 return -EINVAL;
580
581 return ds_access(tracer->ds.context, ds_bts, index, record);
626} 582}
627 583
628int ds_access_pebs(struct task_struct *task, size_t index, const void **record) 584int ds_access_pebs(struct pebs_tracer *tracer, size_t index,
585 const void **record)
629{ 586{
630 return ds_access(task, index, record, ds_pebs); 587 if (!tracer)
588 return -EINVAL;
589
590 return ds_access(tracer->ds.context, ds_pebs, index, record);
631} 591}
632 592
633static int ds_write(struct task_struct *task, const void *record, size_t size, 593static int ds_write(struct ds_context *context, enum ds_qualifier qual,
634 enum ds_qualifier qual, int force) 594 const void *record, size_t size)
635{ 595{
636 struct ds_context *context; 596 int bytes_written = 0;
637 int error;
638 597
639 if (!record) 598 if (!record)
640 return -EINVAL; 599 return -EINVAL;
641 600
642 error = -EPERM;
643 context = ds_get_context(task);
644 if (!context)
645 goto out;
646
647 if (!force) {
648 error = ds_validate_access(context, qual);
649 if (error < 0)
650 goto out;
651 }
652
653 error = 0;
654 while (size) { 601 while (size) {
655 unsigned long base, index, end, write_end, int_th; 602 unsigned long base, index, end, write_end, int_th;
656 unsigned long write_size, adj_write_size; 603 unsigned long write_size, adj_write_size;
@@ -678,14 +625,14 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,
678 write_end = end; 625 write_end = end;
679 626
680 if (write_end <= index) 627 if (write_end <= index)
681 goto out; 628 break;
682 629
683 write_size = min((unsigned long) size, write_end - index); 630 write_size = min((unsigned long) size, write_end - index);
684 memcpy((void *)index, record, write_size); 631 memcpy((void *)index, record, write_size);
685 632
686 record = (const char *)record + write_size; 633 record = (const char *)record + write_size;
687 size -= write_size; 634 size -= write_size;
688 error += write_size; 635 bytes_written += write_size;
689 636
690 adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; 637 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
691 adj_write_size *= ds_cfg.sizeof_rec[qual]; 638 adj_write_size *= ds_cfg.sizeof_rec[qual];
@@ -700,47 +647,32 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,
700 ds_set(context->ds, qual, ds_index, index); 647 ds_set(context->ds, qual, ds_index, index);
701 648
702 if (index >= int_th) 649 if (index >= int_th)
703 ds_overflow(task, context, qual); 650 ds_overflow(context, qual);
704 } 651 }
705 652
706 out: 653 return bytes_written;
707 ds_put_context(context);
708 return error;
709} 654}
710 655
711int ds_write_bts(struct task_struct *task, const void *record, size_t size) 656int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size)
712{ 657{
713 return ds_write(task, record, size, ds_bts, /* force = */ 0); 658 if (!tracer)
714} 659 return -EINVAL;
715 660
716int ds_write_pebs(struct task_struct *task, const void *record, size_t size) 661 return ds_write(tracer->ds.context, ds_bts, record, size);
717{
718 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
719} 662}
720 663
721int ds_unchecked_write_bts(struct task_struct *task, 664int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size)
722 const void *record, size_t size)
723{ 665{
724 return ds_write(task, record, size, ds_bts, /* force = */ 1); 666 if (!tracer)
725} 667 return -EINVAL;
726 668
727int ds_unchecked_write_pebs(struct task_struct *task, 669 return ds_write(tracer->ds.context, ds_pebs, record, size);
728 const void *record, size_t size)
729{
730 return ds_write(task, record, size, ds_pebs, /* force = */ 1);
731} 670}
732 671
733static int ds_reset_or_clear(struct task_struct *task, 672static void ds_reset_or_clear(struct ds_context *context,
734 enum ds_qualifier qual, int clear) 673 enum ds_qualifier qual, int clear)
735{ 674{
736 struct ds_context *context;
737 unsigned long base, end; 675 unsigned long base, end;
738 int error;
739
740 context = ds_get_context(task);
741 error = ds_validate_access(context, qual);
742 if (error < 0)
743 goto out;
744 676
745 base = ds_get(context->ds, qual, ds_buffer_base); 677 base = ds_get(context->ds, qual, ds_buffer_base);
746 end = ds_get(context->ds, qual, ds_absolute_maximum); 678 end = ds_get(context->ds, qual, ds_absolute_maximum);
@@ -749,70 +681,69 @@ static int ds_reset_or_clear(struct task_struct *task,
749 memset((void *)base, 0, end - base); 681 memset((void *)base, 0, end - base);
750 682
751 ds_set(context->ds, qual, ds_index, base); 683 ds_set(context->ds, qual, ds_index, base);
752
753 error = 0;
754 out:
755 ds_put_context(context);
756 return error;
757} 684}
758 685
759int ds_reset_bts(struct task_struct *task) 686int ds_reset_bts(struct bts_tracer *tracer)
760{ 687{
761 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); 688 if (!tracer)
689 return -EINVAL;
690
691 ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0);
692
693 return 0;
762} 694}
763 695
764int ds_reset_pebs(struct task_struct *task) 696int ds_reset_pebs(struct pebs_tracer *tracer)
765{ 697{
766 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); 698 if (!tracer)
699 return -EINVAL;
700
701 ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0);
702
703 return 0;
767} 704}
768 705
769int ds_clear_bts(struct task_struct *task) 706int ds_clear_bts(struct bts_tracer *tracer)
770{ 707{
771 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); 708 if (!tracer)
709 return -EINVAL;
710
711 ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1);
712
713 return 0;
772} 714}
773 715
774int ds_clear_pebs(struct task_struct *task) 716int ds_clear_pebs(struct pebs_tracer *tracer)
775{ 717{
776 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); 718 if (!tracer)
719 return -EINVAL;
720
721 ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1);
722
723 return 0;
777} 724}
778 725
779int ds_get_pebs_reset(struct task_struct *task, u64 *value) 726int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value)
780{ 727{
781 struct ds_context *context; 728 if (!tracer)
782 int error; 729 return -EINVAL;
783 730
784 if (!value) 731 if (!value)
785 return -EINVAL; 732 return -EINVAL;
786 733
787 context = ds_get_context(task); 734 *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
788 error = ds_validate_access(context, ds_pebs);
789 if (error < 0)
790 goto out;
791 735
792 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); 736 return 0;
793
794 error = 0;
795 out:
796 ds_put_context(context);
797 return error;
798} 737}
799 738
800int ds_set_pebs_reset(struct task_struct *task, u64 value) 739int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
801{ 740{
802 struct ds_context *context; 741 if (!tracer)
803 int error; 742 return -EINVAL;
804
805 context = ds_get_context(task);
806 error = ds_validate_access(context, ds_pebs);
807 if (error < 0)
808 goto out;
809 743
810 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; 744 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
811 745
812 error = 0; 746 return 0;
813 out:
814 ds_put_context(context);
815 return error;
816} 747}
817 748
818static const struct ds_configuration ds_cfg_var = { 749static const struct ds_configuration ds_cfg_var = {
@@ -840,6 +771,10 @@ static inline void
840ds_configure(const struct ds_configuration *cfg) 771ds_configure(const struct ds_configuration *cfg)
841{ 772{
842 ds_cfg = *cfg; 773 ds_cfg = *cfg;
774
775 printk(KERN_INFO "DS available\n");
776
777 BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds);
843} 778}
844 779
845void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) 780void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -847,17 +782,16 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
847 switch (c->x86) { 782 switch (c->x86) {
848 case 0x6: 783 case 0x6:
849 switch (c->x86_model) { 784 switch (c->x86_model) {
785 case 0 ... 0xC:
786 /* sorry, don't know about them */
787 break;
850 case 0xD: 788 case 0xD:
851 case 0xE: /* Pentium M */ 789 case 0xE: /* Pentium M */
852 ds_configure(&ds_cfg_var); 790 ds_configure(&ds_cfg_var);
853 break; 791 break;
854 case 0xF: /* Core2 */ 792 default: /* Core2, Atom, ... */
855 case 0x1C: /* Atom */
856 ds_configure(&ds_cfg_64); 793 ds_configure(&ds_cfg_64);
857 break; 794 break;
858 default:
859 /* sorry, don't know about them */
860 break;
861 } 795 }
862 break; 796 break;
863 case 0xF: 797 case 0xF:
@@ -884,6 +818,8 @@ void ds_free(struct ds_context *context)
884 * is dying. There should not be any user of that context left 818 * is dying. There should not be any user of that context left
885 * to disturb us, anymore. */ 819 * to disturb us, anymore. */
886 unsigned long leftovers = context->count; 820 unsigned long leftovers = context->count;
887 while (leftovers--) 821 while (leftovers--) {
822 put_tracer(context->task);
888 ds_put_context(context); 823 ds_put_context(context);
824 }
889} 825}
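
The ds.c rework above replaces the task-keyed ds_request_*()/ds_release_*() entry points with handle-based ones: the caller now supplies the buffer, ds_request_bts() returns a struct bts_tracer * (or an ERR_PTR on failure), and every later call takes that handle instead of a task_struct. A hypothetical caller-side sketch of the new flow (the helper name is made up and this is not part of the patch), assuming the prototypes are exported through <asm/ds.h> and using a NULL overflow callback with the "no threshold" value (size_t)-1, as the hunk requires while overflow notification is still unimplemented:

/*
 * Minimal, illustrative consumer of the reworked ds.c API; not part of
 * this patch. Error handling is kept to the bare minimum.
 */
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <asm/ds.h>		/* assumed home of the ds_*() prototypes */

static int bts_snapshot(struct task_struct *task)
{
	struct bts_tracer *tracer;
	void *buffer;
	size_t end, i;
	int error = 0;

	/* The caller owns the buffer now; ds_request_bts() no longer allocates. */
	buffer = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buffer)
		return -ENOMEM;

	/* No overflow callback, no interrupt threshold ((size_t)-1). */
	tracer = ds_request_bts(task, buffer, PAGE_SIZE, NULL, (size_t)-1);
	if (IS_ERR(tracer)) {
		error = PTR_ERR(tracer);
		goto out_free;
	}

	if (!ds_get_bts_end(tracer, &end)) {
		for (i = 0; i < end; i++) {
			const void *record;

			/* ds_access_bts() returns the record size or -EINVAL. */
			if (ds_access_bts(tracer, i, &record) < 0)
				break;
			/* record points at one raw BTS record inside buffer */
		}
	}

	ds_release_bts(tracer);
out_free:
	kfree(buffer);
	return error;
}
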
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644
index 000000000000..6b1f6f6f8661
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.c
@@ -0,0 +1,351 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#include "dumpstack.h"
21
22int panic_on_unrecovered_nmi;
23unsigned int code_bytes = 64;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static int die_counter;
26
27void printk_address(unsigned long address, int reliable)
28{
29 printk(" [<%p>] %s%pS\n", (void *) address,
30 reliable ? "" : "? ", (void *) address);
31}
32
33#ifdef CONFIG_FUNCTION_GRAPH_TRACER
34static void
35print_ftrace_graph_addr(unsigned long addr, void *data,
36 const struct stacktrace_ops *ops,
37 struct thread_info *tinfo, int *graph)
38{
39 struct task_struct *task = tinfo->task;
40 unsigned long ret_addr;
41 int index = task->curr_ret_stack;
42
43 if (addr != (unsigned long)return_to_handler)
44 return;
45
46 if (!task->ret_stack || index < *graph)
47 return;
48
49 index -= *graph;
50 ret_addr = task->ret_stack[index].ret;
51
52 ops->address(data, ret_addr, 1);
53
54 (*graph)++;
55}
56#else
57static inline void
58print_ftrace_graph_addr(unsigned long addr, void *data,
59 const struct stacktrace_ops *ops,
60 struct thread_info *tinfo, int *graph)
61{ }
62#endif
63
64/*
65 * x86-64 can have up to three kernel stacks:
66 * process stack
67 * interrupt stack
68 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
69 */
70
71static inline int valid_stack_ptr(struct thread_info *tinfo,
72 void *p, unsigned int size, void *end)
73{
74 void *t = tinfo;
75 if (end) {
76 if (p < end && p >= (end-THREAD_SIZE))
77 return 1;
78 else
79 return 0;
80 }
81 return p > t && p < t + THREAD_SIZE - size;
82}
83
84unsigned long
85print_context_stack(struct thread_info *tinfo,
86 unsigned long *stack, unsigned long bp,
87 const struct stacktrace_ops *ops, void *data,
88 unsigned long *end, int *graph)
89{
90 struct stack_frame *frame = (struct stack_frame *)bp;
91
92 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
93 unsigned long addr;
94
95 addr = *stack;
96 if (__kernel_text_address(addr)) {
97 if ((unsigned long) stack == bp + sizeof(long)) {
98 ops->address(data, addr, 1);
99 frame = frame->next_frame;
100 bp = (unsigned long) frame;
101 } else {
102 ops->address(data, addr, bp == 0);
103 }
104 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
105 }
106 stack++;
107 }
108 return bp;
109}
110
111
112static void
113print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
114{
115 printk(data);
116 print_symbol(msg, symbol);
117 printk("\n");
118}
119
120static void print_trace_warning(void *data, char *msg)
121{
122 printk("%s%s\n", (char *)data, msg);
123}
124
125static int print_trace_stack(void *data, char *name)
126{
127 printk("%s <%s> ", (char *)data, name);
128 return 0;
129}
130
131/*
132 * Print one address/symbol entries per line.
133 */
134static void print_trace_address(void *data, unsigned long addr, int reliable)
135{
136 touch_nmi_watchdog();
137 printk(data);
138 printk_address(addr, reliable);
139}
140
141static const struct stacktrace_ops print_trace_ops = {
142 .warning = print_trace_warning,
143 .warning_symbol = print_trace_warning_symbol,
144 .stack = print_trace_stack,
145 .address = print_trace_address,
146};
147
148void
149show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
150 unsigned long *stack, unsigned long bp, char *log_lvl)
151{
152 printk("%sCall Trace:\n", log_lvl);
153 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
154}
155
156void show_trace(struct task_struct *task, struct pt_regs *regs,
157 unsigned long *stack, unsigned long bp)
158{
159 show_trace_log_lvl(task, regs, stack, bp, "");
160}
161
162void show_stack(struct task_struct *task, unsigned long *sp)
163{
164 show_stack_log_lvl(task, NULL, sp, 0, "");
165}
166
167/*
168 * The architecture-independent dump_stack generator
169 */
170void dump_stack(void)
171{
172 unsigned long bp = 0;
173 unsigned long stack;
174
175#ifdef CONFIG_FRAME_POINTER
176 if (!bp)
177 get_bp(bp);
178#endif
179
180 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
181 current->pid, current->comm, print_tainted(),
182 init_utsname()->release,
183 (int)strcspn(init_utsname()->version, " "),
184 init_utsname()->version);
185 show_trace(NULL, NULL, &stack, bp);
186}
187EXPORT_SYMBOL(dump_stack);
188
189static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
190static int die_owner = -1;
191static unsigned int die_nest_count;
192
193unsigned __kprobes long oops_begin(void)
194{
195 int cpu;
196 unsigned long flags;
197
198 oops_enter();
199
200 /* racy, but better than risking deadlock. */
201 raw_local_irq_save(flags);
202 cpu = smp_processor_id();
203 if (!__raw_spin_trylock(&die_lock)) {
204 if (cpu == die_owner)
205 /* nested oops. should stop eventually */;
206 else
207 __raw_spin_lock(&die_lock);
208 }
209 die_nest_count++;
210 die_owner = cpu;
211 console_verbose();
212 bust_spinlocks(1);
213 return flags;
214}
215
216void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
217{
218 if (regs && kexec_should_crash(current))
219 crash_kexec(regs);
220
221 bust_spinlocks(0);
222 die_owner = -1;
223 add_taint(TAINT_DIE);
224 die_nest_count--;
225 if (!die_nest_count)
226 /* Nest count reaches zero, release the lock. */
227 __raw_spin_unlock(&die_lock);
228 raw_local_irq_restore(flags);
229 oops_exit();
230
231 if (!signr)
232 return;
233 if (in_interrupt())
234 panic("Fatal exception in interrupt");
235 if (panic_on_oops)
236 panic("Fatal exception");
237 do_exit(signr);
238}
239
240int __kprobes __die(const char *str, struct pt_regs *regs, long err)
241{
242#ifdef CONFIG_X86_32
243 unsigned short ss;
244 unsigned long sp;
245#endif
246 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
247#ifdef CONFIG_PREEMPT
248 printk("PREEMPT ");
249#endif
250#ifdef CONFIG_SMP
251 printk("SMP ");
252#endif
253#ifdef CONFIG_DEBUG_PAGEALLOC
254 printk("DEBUG_PAGEALLOC");
255#endif
256 printk("\n");
257 sysfs_printk_last_file();
258 if (notify_die(DIE_OOPS, str, regs, err,
259 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
260 return 1;
261
262 show_registers(regs);
263#ifdef CONFIG_X86_32
264 sp = (unsigned long) (&regs->sp);
265 savesegment(ss, ss);
266 if (user_mode(regs)) {
267 sp = regs->sp;
268 ss = regs->ss & 0xffff;
269 }
270 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
271 print_symbol("%s", regs->ip);
272 printk(" SS:ESP %04x:%08lx\n", ss, sp);
273#else
274 /* Executive summary in case the oops scrolled away */
275 printk(KERN_ALERT "RIP ");
276 printk_address(regs->ip, 1);
277 printk(" RSP <%016lx>\n", regs->sp);
278#endif
279 return 0;
280}
281
282/*
283 * This is gone through when something in the kernel has done something bad
284 * and is about to be terminated:
285 */
286void die(const char *str, struct pt_regs *regs, long err)
287{
288 unsigned long flags = oops_begin();
289 int sig = SIGSEGV;
290
291 if (!user_mode_vm(regs))
292 report_bug(regs->ip, regs);
293
294 if (__die(str, regs, err))
295 sig = 0;
296 oops_end(flags, regs, sig);
297}
298
299void notrace __kprobes
300die_nmi(char *str, struct pt_regs *regs, int do_panic)
301{
302 unsigned long flags;
303
304 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
305 return;
306
307 /*
308 * We are in trouble anyway, lets at least try
309 * to get a message out.
310 */
311 flags = oops_begin();
312 printk(KERN_EMERG "%s", str);
313 printk(" on CPU%d, ip %08lx, registers:\n",
314 smp_processor_id(), regs->ip);
315 show_registers(regs);
316 oops_end(flags, regs, 0);
317 if (do_panic || panic_on_oops)
318 panic("Non maskable interrupt");
319 nmi_exit();
320 local_irq_enable();
321 do_exit(SIGBUS);
322}
323
324static int __init oops_setup(char *s)
325{
326 if (!s)
327 return -EINVAL;
328 if (!strcmp(s, "panic"))
329 panic_on_oops = 1;
330 return 0;
331}
332early_param("oops", oops_setup);
333
334static int __init kstack_setup(char *s)
335{
336 if (!s)
337 return -EINVAL;
338 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
339 return 0;
340}
341early_param("kstack", kstack_setup);
342
343static int __init code_bytes_setup(char *s)
344{
345 code_bytes = simple_strtoul(s, NULL, 0);
346 if (code_bytes > 8192)
347 code_bytes = 8192;
348
349 return 1;
350}
351__setup("code_bytes=", code_bytes_setup);
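
The new dumpstack.c pulls the stack-walk plumbing that dumpstack_32.c and dumpstack_64.c used to duplicate into one place and drives it through the struct stacktrace_ops callbacks shown in print_trace_ops above. A hypothetical example of another consumer (made-up helper names, not part of the patch) that plugs its own ops into dump_trace() to count reliably-identified return addresses on the current stack; the callback signatures mirror the print_trace_* handlers, and dump_trace() is assumed to be declared in <asm/stacktrace.h> alongside struct stacktrace_ops:

/* Illustrative dump_trace() consumer; not part of this patch. */
#include <linux/kernel.h>
#include <asm/stacktrace.h>

static void count_warning(void *data, char *msg) { }
static void count_warning_symbol(void *data, char *msg, unsigned long symbol) { }

static int count_stack(void *data, char *name)
{
	return 0;		/* keep walking across stack switches */
}

static void count_address(void *data, unsigned long addr, int reliable)
{
	if (reliable)
		(*(unsigned int *)data)++;
}

static const struct stacktrace_ops count_trace_ops = {
	.warning	= count_warning,
	.warning_symbol	= count_warning_symbol,
	.stack		= count_stack,
	.address	= count_address,
};

static unsigned int count_reliable_frames(void)
{
	unsigned int frames = 0;
	unsigned long stack;

	/* NULL task/regs means "current", as in dump_stack() above. */
	dump_trace(NULL, NULL, &stack, 0, &count_trace_ops, &frames);
	return frames;
}
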
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
new file mode 100644
index 000000000000..da87590b8698
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+
+#ifndef DUMPSTACK_H
+#define DUMPSTACK_H
+
+#ifdef CONFIG_X86_32
+#define STACKSLOTS_PER_LINE 8
+#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
+#else
+#define STACKSLOTS_PER_LINE 4
+#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
+#endif
+
+extern unsigned long
+print_context_stack(struct thread_info *tinfo,
+		unsigned long *stack, unsigned long bp,
+		const struct stacktrace_ops *ops, void *data,
+		unsigned long *end, int *graph);
+
+extern void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp, char *log_lvl);
+
+extern void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *sp, unsigned long bp, char *log_lvl);
+
+extern unsigned int code_bytes;
+extern int kstack_depth_to_print;
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+	struct stack_frame *next_frame;
+	unsigned long return_address;
+};
+#endif
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index b3614752197b..d593cd1f58dc 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,69 +17,14 @@
17 17
18#include <asm/stacktrace.h> 18#include <asm/stacktrace.h>
19 19
20#define STACKSLOTS_PER_LINE 8 20#include "dumpstack.h"
21#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static inline int valid_stack_ptr(struct thread_info *tinfo,
35 void *p, unsigned int size, void *end)
36{
37 void *t = tinfo;
38 if (end) {
39 if (p < end && p >= (end-THREAD_SIZE))
40 return 1;
41 else
42 return 0;
43 }
44 return p > t && p < t + THREAD_SIZE - size;
45}
46
47/* The form of the top of the frame on the stack */
48struct stack_frame {
49 struct stack_frame *next_frame;
50 unsigned long return_address;
51};
52
53static inline unsigned long
54print_context_stack(struct thread_info *tinfo,
55 unsigned long *stack, unsigned long bp,
56 const struct stacktrace_ops *ops, void *data,
57 unsigned long *end)
58{
59 struct stack_frame *frame = (struct stack_frame *)bp;
60
61 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
62 unsigned long addr;
63
64 addr = *stack;
65 if (__kernel_text_address(addr)) {
66 if ((unsigned long) stack == bp + sizeof(long)) {
67 ops->address(data, addr, 1);
68 frame = frame->next_frame;
69 bp = (unsigned long) frame;
70 } else {
71 ops->address(data, addr, bp == 0);
72 }
73 }
74 stack++;
75 }
76 return bp;
77}
78 21
79void dump_trace(struct task_struct *task, struct pt_regs *regs, 22void dump_trace(struct task_struct *task, struct pt_regs *regs,
80 unsigned long *stack, unsigned long bp, 23 unsigned long *stack, unsigned long bp,
81 const struct stacktrace_ops *ops, void *data) 24 const struct stacktrace_ops *ops, void *data)
82{ 25{
26 int graph = 0;
27
83 if (!task) 28 if (!task)
84 task = current; 29 task = current;
85 30
@@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
107 52
108 context = (struct thread_info *) 53 context = (struct thread_info *)
109 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 54 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
110 bp = print_context_stack(context, stack, bp, ops, data, NULL); 55 bp = print_context_stack(context, stack, bp, ops,
56 data, NULL, &graph);
111 57
112 stack = (unsigned long *)context->previous_esp; 58 stack = (unsigned long *)context->previous_esp;
113 if (!stack) 59 if (!stack)
@@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
119} 65}
120EXPORT_SYMBOL(dump_trace); 66EXPORT_SYMBOL(dump_trace);
121 67
122static void 68void
123print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
124{
125 printk(data);
126 print_symbol(msg, symbol);
127 printk("\n");
128}
129
130static void print_trace_warning(void *data, char *msg)
131{
132 printk("%s%s\n", (char *)data, msg);
133}
134
135static int print_trace_stack(void *data, char *name)
136{
137 printk("%s <%s> ", (char *)data, name);
138 return 0;
139}
140
141/*
142 * Print one address/symbol entries per line.
143 */
144static void print_trace_address(void *data, unsigned long addr, int reliable)
145{
146 touch_nmi_watchdog();
147 printk(data);
148 printk_address(addr, reliable);
149}
150
151static const struct stacktrace_ops print_trace_ops = {
152 .warning = print_trace_warning,
153 .warning_symbol = print_trace_warning_symbol,
154 .stack = print_trace_stack,
155 .address = print_trace_address,
156};
157
158static void
159show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
160 unsigned long *stack, unsigned long bp, char *log_lvl)
161{
162 printk("%sCall Trace:\n", log_lvl);
163 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
164}
165
166void show_trace(struct task_struct *task, struct pt_regs *regs,
167 unsigned long *stack, unsigned long bp)
168{
169 show_trace_log_lvl(task, regs, stack, bp, "");
170}
171
172static void
173show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 69show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
174 unsigned long *sp, unsigned long bp, char *log_lvl) 70 unsigned long *sp, unsigned long bp, char *log_lvl)
175{ 71{
@@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
196 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 92 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
197} 93}
198 94
199void show_stack(struct task_struct *task, unsigned long *sp)
200{
201 show_stack_log_lvl(task, NULL, sp, 0, "");
202}
203
204/*
205 * The architecture-independent dump_stack generator
206 */
207void dump_stack(void)
208{
209 unsigned long bp = 0;
210 unsigned long stack;
211
212#ifdef CONFIG_FRAME_POINTER
213 if (!bp)
214 get_bp(bp);
215#endif
216
217 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
218 current->pid, current->comm, print_tainted(),
219 init_utsname()->release,
220 (int)strcspn(init_utsname()->version, " "),
221 init_utsname()->version);
222 show_trace(NULL, NULL, &stack, bp);
223}
224
225EXPORT_SYMBOL(dump_stack);
226 95
227void show_registers(struct pt_regs *regs) 96void show_registers(struct pt_regs *regs)
228{ 97{
@@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip)
283 return ud2 == 0x0b0f; 152 return ud2 == 0x0b0f;
284} 153}
285 154
286static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
287static int die_owner = -1;
288static unsigned int die_nest_count;
289
290unsigned __kprobes long oops_begin(void)
291{
292 unsigned long flags;
293
294 oops_enter();
295
296 if (die_owner != raw_smp_processor_id()) {
297 console_verbose();
298 raw_local_irq_save(flags);
299 __raw_spin_lock(&die_lock);
300 die_owner = smp_processor_id();
301 die_nest_count = 0;
302 bust_spinlocks(1);
303 } else {
304 raw_local_irq_save(flags);
305 }
306 die_nest_count++;
307 return flags;
308}
309
310void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
311{
312 bust_spinlocks(0);
313 die_owner = -1;
314 add_taint(TAINT_DIE);
315 __raw_spin_unlock(&die_lock);
316 raw_local_irq_restore(flags);
317
318 if (!regs)
319 return;
320
321 if (kexec_should_crash(current))
322 crash_kexec(regs);
323 if (in_interrupt())
324 panic("Fatal exception in interrupt");
325 if (panic_on_oops)
326 panic("Fatal exception");
327 oops_exit();
328 do_exit(signr);
329}
330
331int __kprobes __die(const char *str, struct pt_regs *regs, long err)
332{
333 unsigned short ss;
334 unsigned long sp;
335
336 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
337#ifdef CONFIG_PREEMPT
338 printk("PREEMPT ");
339#endif
340#ifdef CONFIG_SMP
341 printk("SMP ");
342#endif
343#ifdef CONFIG_DEBUG_PAGEALLOC
344 printk("DEBUG_PAGEALLOC");
345#endif
346 printk("\n");
347 sysfs_printk_last_file();
348 if (notify_die(DIE_OOPS, str, regs, err,
349 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
350 return 1;
351
352 show_registers(regs);
353 /* Executive summary in case the oops scrolled away */
354 sp = (unsigned long) (&regs->sp);
355 savesegment(ss, ss);
356 if (user_mode(regs)) {
357 sp = regs->sp;
358 ss = regs->ss & 0xffff;
359 }
360 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
361 print_symbol("%s", regs->ip);
362 printk(" SS:ESP %04x:%08lx\n", ss, sp);
363 return 0;
364}
365
366/*
367 * This is gone through when something in the kernel has done something bad
368 * and is about to be terminated:
369 */
370void die(const char *str, struct pt_regs *regs, long err)
371{
372 unsigned long flags = oops_begin();
373
374 if (die_nest_count < 3) {
375 report_bug(regs->ip, regs);
376
377 if (__die(str, regs, err))
378 regs = NULL;
379 } else {
380 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
381 }
382
383 oops_end(flags, regs, SIGSEGV);
384}
385
386static DEFINE_SPINLOCK(nmi_print_lock);
387
388void notrace __kprobes
389die_nmi(char *str, struct pt_regs *regs, int do_panic)
390{
391 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
392 return;
393
394 spin_lock(&nmi_print_lock);
395 /*
396 * We are in trouble anyway, lets at least try
397 * to get a message out:
398 */
399 bust_spinlocks(1);
400 printk(KERN_EMERG "%s", str);
401 printk(" on CPU%d, ip %08lx, registers:\n",
402 smp_processor_id(), regs->ip);
403 show_registers(regs);
404 if (do_panic)
405 panic("Non maskable interrupt");
406 console_silent();
407 spin_unlock(&nmi_print_lock);
408
409 /*
410 * If we are in the kernel we are probably nested up pretty badly
411 * and might as well get out now while we still can:
412 */
413 if (!user_mode_vm(regs)) {
414 current->thread.trap_no = 2;
415 crash_kexec(regs);
416 }
417
418 bust_spinlocks(0);
419 do_exit(SIGSEGV);
420}
421
422static int __init oops_setup(char *s)
423{
424 if (!s)
425 return -EINVAL;
426 if (!strcmp(s, "panic"))
427 panic_on_oops = 1;
428 return 0;
429}
430early_param("oops", oops_setup);
431
432static int __init kstack_setup(char *s)
433{
434 if (!s)
435 return -EINVAL;
436 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
437 return 0;
438}
439early_param("kstack", kstack_setup);
440
441static int __init code_bytes_setup(char *s)
442{
443 code_bytes = simple_strtoul(s, NULL, 0);
444 if (code_bytes > 8192)
445 code_bytes = 8192;
446
447 return 1;
448}
449__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 96a5db7da8a7..c302d0707048 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -17,19 +17,7 @@
17 17
18#include <asm/stacktrace.h> 18#include <asm/stacktrace.h>
19 19
20#define STACKSLOTS_PER_LINE 4 20#include "dumpstack.h"
21#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33 21
34static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 22static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
35 unsigned *usedp, char **idp) 23 unsigned *usedp, char **idp)
@@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
113 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 101 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
114 */ 102 */
115 103
116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size, void *end)
118{
119 void *t = tinfo;
120 if (end) {
121 if (p < end && p >= (end-THREAD_SIZE))
122 return 1;
123 else
124 return 0;
125 }
126 return p > t && p < t + THREAD_SIZE - size;
127}
128
129/* The form of the top of the frame on the stack */
130struct stack_frame {
131 struct stack_frame *next_frame;
132 unsigned long return_address;
133};
134
135static inline unsigned long
136print_context_stack(struct thread_info *tinfo,
137 unsigned long *stack, unsigned long bp,
138 const struct stacktrace_ops *ops, void *data,
139 unsigned long *end)
140{
141 struct stack_frame *frame = (struct stack_frame *)bp;
142
143 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
144 unsigned long addr;
145
146 addr = *stack;
147 if (__kernel_text_address(addr)) {
148 if ((unsigned long) stack == bp + sizeof(long)) {
149 ops->address(data, addr, 1);
150 frame = frame->next_frame;
151 bp = (unsigned long) frame;
152 } else {
153 ops->address(data, addr, bp == 0);
154 }
155 }
156 stack++;
157 }
158 return bp;
159}
160
161void dump_trace(struct task_struct *task, struct pt_regs *regs, 104void dump_trace(struct task_struct *task, struct pt_regs *regs,
162 unsigned long *stack, unsigned long bp, 105 unsigned long *stack, unsigned long bp,
163 const struct stacktrace_ops *ops, void *data) 106 const struct stacktrace_ops *ops, void *data)
@@ -166,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
166 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; 109 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
167 unsigned used = 0; 110 unsigned used = 0;
168 struct thread_info *tinfo; 111 struct thread_info *tinfo;
112 int graph = 0;
169 113
170 if (!task) 114 if (!task)
171 task = current; 115 task = current;
@@ -206,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
206 break; 150 break;
207 151
208 bp = print_context_stack(tinfo, stack, bp, ops, 152 bp = print_context_stack(tinfo, stack, bp, ops,
209 data, estack_end); 153 data, estack_end, &graph);
210 ops->stack(data, "<EOE>"); 154 ops->stack(data, "<EOE>");
211 /* 155 /*
212 * We link to the next stack via the 156 * We link to the next stack via the
@@ -225,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
225 if (ops->stack(data, "IRQ") < 0) 169 if (ops->stack(data, "IRQ") < 0)
226 break; 170 break;
227 bp = print_context_stack(tinfo, stack, bp, 171 bp = print_context_stack(tinfo, stack, bp,
228 ops, data, irqstack_end); 172 ops, data, irqstack_end, &graph);
229 /* 173 /*
230 * We link to the next stack (which would be 174 * We link to the next stack (which would be
231 * the process stack normally) the last 175 * the process stack normally) the last
@@ -243,62 +187,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
243 /* 187 /*
244 * This handles the process stack: 188 * This handles the process stack:
245 */ 189 */
246 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); 190 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
247 put_cpu(); 191 put_cpu();
248} 192}
249EXPORT_SYMBOL(dump_trace); 193EXPORT_SYMBOL(dump_trace);
250 194
251static void 195void
252print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
253{
254 printk(data);
255 print_symbol(msg, symbol);
256 printk("\n");
257}
258
259static void print_trace_warning(void *data, char *msg)
260{
261 printk("%s%s\n", (char *)data, msg);
262}
263
264static int print_trace_stack(void *data, char *name)
265{
266 printk("%s <%s> ", (char *)data, name);
267 return 0;
268}
269
270/*
271 * Print one address/symbol entries per line.
272 */
273static void print_trace_address(void *data, unsigned long addr, int reliable)
274{
275 touch_nmi_watchdog();
276 printk(data);
277 printk_address(addr, reliable);
278}
279
280static const struct stacktrace_ops print_trace_ops = {
281 .warning = print_trace_warning,
282 .warning_symbol = print_trace_warning_symbol,
283 .stack = print_trace_stack,
284 .address = print_trace_address,
285};
286
287static void
288show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
289 unsigned long *stack, unsigned long bp, char *log_lvl)
290{
291 printk("%sCall Trace:\n", log_lvl);
292 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
293}
294
295void show_trace(struct task_struct *task, struct pt_regs *regs,
296 unsigned long *stack, unsigned long bp)
297{
298 show_trace_log_lvl(task, regs, stack, bp, "");
299}
300
301static void
302show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 196show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
303 unsigned long *sp, unsigned long bp, char *log_lvl) 197 unsigned long *sp, unsigned long bp, char *log_lvl)
304{ 198{
@@ -342,33 +236,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
342 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 236 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
343} 237}
344 238
345void show_stack(struct task_struct *task, unsigned long *sp)
346{
347 show_stack_log_lvl(task, NULL, sp, 0, "");
348}
349
350/*
351 * The architecture-independent dump_stack generator
352 */
353void dump_stack(void)
354{
355 unsigned long bp = 0;
356 unsigned long stack;
357
358#ifdef CONFIG_FRAME_POINTER
359 if (!bp)
360 get_bp(bp);
361#endif
362
363 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
364 current->pid, current->comm, print_tainted(),
365 init_utsname()->release,
366 (int)strcspn(init_utsname()->version, " "),
367 init_utsname()->version);
368 show_trace(NULL, NULL, &stack, bp);
369}
370EXPORT_SYMBOL(dump_stack);
371
372void show_registers(struct pt_regs *regs) 239void show_registers(struct pt_regs *regs)
373{ 240{
374 int i; 241 int i;
@@ -429,147 +296,3 @@ int is_valid_bugaddr(unsigned long ip)
429 return ud2 == 0x0b0f; 296 return ud2 == 0x0b0f;
430} 297}
431 298
432static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
433static int die_owner = -1;
434static unsigned int die_nest_count;
435
436unsigned __kprobes long oops_begin(void)
437{
438 int cpu;
439 unsigned long flags;
440
441 oops_enter();
442
443 /* racy, but better than risking deadlock. */
444 raw_local_irq_save(flags);
445 cpu = smp_processor_id();
446 if (!__raw_spin_trylock(&die_lock)) {
447 if (cpu == die_owner)
448 /* nested oops. should stop eventually */;
449 else
450 __raw_spin_lock(&die_lock);
451 }
452 die_nest_count++;
453 die_owner = cpu;
454 console_verbose();
455 bust_spinlocks(1);
456 return flags;
457}
458
459void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
460{
461 die_owner = -1;
462 bust_spinlocks(0);
463 die_nest_count--;
464 if (!die_nest_count)
465 /* Nest count reaches zero, release the lock. */
466 __raw_spin_unlock(&die_lock);
467 raw_local_irq_restore(flags);
468 if (!regs) {
469 oops_exit();
470 return;
471 }
472 if (in_interrupt())
473 panic("Fatal exception in interrupt");
474 if (panic_on_oops)
475 panic("Fatal exception");
476 oops_exit();
477 do_exit(signr);
478}
479
480int __kprobes __die(const char *str, struct pt_regs *regs, long err)
481{
482 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
483#ifdef CONFIG_PREEMPT
484 printk("PREEMPT ");
485#endif
486#ifdef CONFIG_SMP
487 printk("SMP ");
488#endif
489#ifdef CONFIG_DEBUG_PAGEALLOC
490 printk("DEBUG_PAGEALLOC");
491#endif
492 printk("\n");
493 sysfs_printk_last_file();
494 if (notify_die(DIE_OOPS, str, regs, err,
495 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
496 return 1;
497
498 show_registers(regs);
499 add_taint(TAINT_DIE);
500 /* Executive summary in case the oops scrolled away */
501 printk(KERN_ALERT "RIP ");
502 printk_address(regs->ip, 1);
503 printk(" RSP <%016lx>\n", regs->sp);
504 if (kexec_should_crash(current))
505 crash_kexec(regs);
506 return 0;
507}
508
509void die(const char *str, struct pt_regs *regs, long err)
510{
511 unsigned long flags = oops_begin();
512
513 if (!user_mode(regs))
514 report_bug(regs->ip, regs);
515
516 if (__die(str, regs, err))
517 regs = NULL;
518 oops_end(flags, regs, SIGSEGV);
519}
520
521notrace __kprobes void
522die_nmi(char *str, struct pt_regs *regs, int do_panic)
523{
524 unsigned long flags;
525
526 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
527 return;
528
529 flags = oops_begin();
530 /*
531 * We are in trouble anyway, let's at least try
532 * to get a message out.
533 */
534 printk(KERN_EMERG "%s", str);
535 printk(" on CPU%d, ip %08lx, registers:\n",
536 smp_processor_id(), regs->ip);
537 show_registers(regs);
538 if (kexec_should_crash(current))
539 crash_kexec(regs);
540 if (do_panic || panic_on_oops)
541 panic("Non maskable interrupt");
542 oops_end(flags, NULL, SIGBUS);
543 nmi_exit();
544 local_irq_enable();
545 do_exit(SIGBUS);
546}
547
548static int __init oops_setup(char *s)
549{
550 if (!s)
551 return -EINVAL;
552 if (!strcmp(s, "panic"))
553 panic_on_oops = 1;
554 return 0;
555}
556early_param("oops", oops_setup);
557
558static int __init kstack_setup(char *s)
559{
560 if (!s)
561 return -EINVAL;
562 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
563 return 0;
564}
565early_param("kstack", kstack_setup);
566
567static int __init code_bytes_setup(char *s)
568{
569 code_bytes = simple_strtoul(s, NULL, 0);
570 if (code_bytes > 8192)
571 code_bytes = 8192;
572
573 return 1;
574}
575__setup("code_bytes=", code_bytes_setup);
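
With this merge, both dumpstack_32.c and dumpstack_64.c drop their private copies of the stack-walking and oops helpers (print_context_stack, show_trace, dump_stack, oops_begin/oops_end, and the oops=/kstack=/code_bytes= parameters) and pick them up from the new shared dumpstack.c and dumpstack.h. The header itself is not shown in these hunks; judging from the updated call sites, which now pass an extra &graph argument for the function graph tracer, the shared walker presumably looks roughly like:

/* Presumed shared prototype, inferred from the call sites above;
 * the actual dumpstack.h contents are not part of this hunk. */
struct thread_info;
struct stacktrace_ops;

extern unsigned long
print_context_stack(struct thread_info *tinfo,
		unsigned long *stack, unsigned long bp,
		const struct stacktrace_ops *ops, void *data,
		unsigned long *end, int *graph);
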
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 28b597ef9ca1..43ceb3f454bf 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1157,6 +1157,9 @@ ENTRY(mcount)
1157END(mcount) 1157END(mcount)
1158 1158
1159ENTRY(ftrace_caller) 1159ENTRY(ftrace_caller)
1160 cmpl $0, function_trace_stop
1161 jne ftrace_stub
1162
1160 pushl %eax 1163 pushl %eax
1161 pushl %ecx 1164 pushl %ecx
1162 pushl %edx 1165 pushl %edx
@@ -1171,6 +1174,11 @@ ftrace_call:
1171 popl %edx 1174 popl %edx
1172 popl %ecx 1175 popl %ecx
1173 popl %eax 1176 popl %eax
1177#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1178.globl ftrace_graph_call
1179ftrace_graph_call:
1180 jmp ftrace_stub
1181#endif
1174 1182
1175.globl ftrace_stub 1183.globl ftrace_stub
1176ftrace_stub: 1184ftrace_stub:
@@ -1180,8 +1188,18 @@ END(ftrace_caller)
1180#else /* ! CONFIG_DYNAMIC_FTRACE */ 1188#else /* ! CONFIG_DYNAMIC_FTRACE */
1181 1189
1182ENTRY(mcount) 1190ENTRY(mcount)
1191 cmpl $0, function_trace_stop
1192 jne ftrace_stub
1193
1183 cmpl $ftrace_stub, ftrace_trace_function 1194 cmpl $ftrace_stub, ftrace_trace_function
1184 jnz trace 1195 jnz trace
1196#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1197 cmpl $ftrace_stub, ftrace_graph_return
1198 jnz ftrace_graph_caller
1199
1200 cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
1201 jnz ftrace_graph_caller
1202#endif
1185.globl ftrace_stub 1203.globl ftrace_stub
1186ftrace_stub: 1204ftrace_stub:
1187 ret 1205 ret
@@ -1200,12 +1218,43 @@ trace:
1200 popl %edx 1218 popl %edx
1201 popl %ecx 1219 popl %ecx
1202 popl %eax 1220 popl %eax
1203
1204 jmp ftrace_stub 1221 jmp ftrace_stub
1205END(mcount) 1222END(mcount)
1206#endif /* CONFIG_DYNAMIC_FTRACE */ 1223#endif /* CONFIG_DYNAMIC_FTRACE */
1207#endif /* CONFIG_FUNCTION_TRACER */ 1224#endif /* CONFIG_FUNCTION_TRACER */
1208 1225
1226#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1227ENTRY(ftrace_graph_caller)
1228 cmpl $0, function_trace_stop
1229 jne ftrace_stub
1230
1231 pushl %eax
1232 pushl %ecx
1233 pushl %edx
1234 movl 0xc(%esp), %edx
1235 lea 0x4(%ebp), %eax
1236 subl $MCOUNT_INSN_SIZE, %edx
1237 call prepare_ftrace_return
1238 popl %edx
1239 popl %ecx
1240 popl %eax
1241 ret
1242END(ftrace_graph_caller)
1243
1244.globl return_to_handler
1245return_to_handler:
1246 pushl $0
1247 pushl %eax
1248 pushl %ecx
1249 pushl %edx
1250 call ftrace_return_to_handler
1251 movl %eax, 0xc(%esp)
1252 popl %edx
1253 popl %ecx
1254 popl %eax
1255 ret
1256#endif
1257
1209.section .rodata,"a" 1258.section .rodata,"a"
1210#include "syscall_table_32.S" 1259#include "syscall_table_32.S"
1211 1260
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b86f332c96a6..54e0bbdccb99 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -68,6 +68,8 @@ ENTRY(mcount)
68END(mcount) 68END(mcount)
69 69
70ENTRY(ftrace_caller) 70ENTRY(ftrace_caller)
71 cmpl $0, function_trace_stop
72 jne ftrace_stub
71 73
72 /* taken from glibc */ 74 /* taken from glibc */
73 subq $0x38, %rsp 75 subq $0x38, %rsp
@@ -96,6 +98,12 @@ ftrace_call:
96 movq (%rsp), %rax 98 movq (%rsp), %rax
97 addq $0x38, %rsp 99 addq $0x38, %rsp
98 100
101#ifdef CONFIG_FUNCTION_GRAPH_TRACER
102.globl ftrace_graph_call
103ftrace_graph_call:
104 jmp ftrace_stub
105#endif
106
99.globl ftrace_stub 107.globl ftrace_stub
100ftrace_stub: 108ftrace_stub:
101 retq 109 retq
@@ -103,8 +111,20 @@ END(ftrace_caller)
103 111
104#else /* ! CONFIG_DYNAMIC_FTRACE */ 112#else /* ! CONFIG_DYNAMIC_FTRACE */
105ENTRY(mcount) 113ENTRY(mcount)
114 cmpl $0, function_trace_stop
115 jne ftrace_stub
116
106 cmpq $ftrace_stub, ftrace_trace_function 117 cmpq $ftrace_stub, ftrace_trace_function
107 jnz trace 118 jnz trace
119
120#ifdef CONFIG_FUNCTION_GRAPH_TRACER
121 cmpq $ftrace_stub, ftrace_graph_return
122 jnz ftrace_graph_caller
123
124 cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
125 jnz ftrace_graph_caller
126#endif
127
108.globl ftrace_stub 128.globl ftrace_stub
109ftrace_stub: 129ftrace_stub:
110 retq 130 retq
@@ -140,6 +160,69 @@ END(mcount)
140#endif /* CONFIG_DYNAMIC_FTRACE */ 160#endif /* CONFIG_DYNAMIC_FTRACE */
141#endif /* CONFIG_FUNCTION_TRACER */ 161#endif /* CONFIG_FUNCTION_TRACER */
142 162
163#ifdef CONFIG_FUNCTION_GRAPH_TRACER
164ENTRY(ftrace_graph_caller)
165 cmpl $0, function_trace_stop
166 jne ftrace_stub
167
168 subq $0x38, %rsp
169 movq %rax, (%rsp)
170 movq %rcx, 8(%rsp)
171 movq %rdx, 16(%rsp)
172 movq %rsi, 24(%rsp)
173 movq %rdi, 32(%rsp)
174 movq %r8, 40(%rsp)
175 movq %r9, 48(%rsp)
176
177 leaq 8(%rbp), %rdi
178 movq 0x38(%rsp), %rsi
179 subq $MCOUNT_INSN_SIZE, %rsi
180
181 call prepare_ftrace_return
182
183 movq 48(%rsp), %r9
184 movq 40(%rsp), %r8
185 movq 32(%rsp), %rdi
186 movq 24(%rsp), %rsi
187 movq 16(%rsp), %rdx
188 movq 8(%rsp), %rcx
189 movq (%rsp), %rax
190 addq $0x38, %rsp
191 retq
192END(ftrace_graph_caller)
193
194
195.globl return_to_handler
196return_to_handler:
197 subq $80, %rsp
198
199 movq %rax, (%rsp)
200 movq %rcx, 8(%rsp)
201 movq %rdx, 16(%rsp)
202 movq %rsi, 24(%rsp)
203 movq %rdi, 32(%rsp)
204 movq %r8, 40(%rsp)
205 movq %r9, 48(%rsp)
206 movq %r10, 56(%rsp)
207 movq %r11, 64(%rsp)
208
209 call ftrace_return_to_handler
210
211 movq %rax, 72(%rsp)
212 movq 64(%rsp), %r11
213 movq 56(%rsp), %r10
214 movq 48(%rsp), %r9
215 movq 40(%rsp), %r8
216 movq 32(%rsp), %rdi
217 movq 24(%rsp), %rsi
218 movq 16(%rsp), %rdx
219 movq 8(%rsp), %rcx
220 movq (%rsp), %rax
221 addq $72, %rsp
222 retq
223#endif
224
225
143#ifndef CONFIG_PREEMPT 226#ifndef CONFIG_PREEMPT
144#define retint_kernel retint_restore_args 227#define retint_kernel retint_restore_args
145#endif 228#endif
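
Both the 32-bit and 64-bit graph callers above hand prepare_ftrace_return() two values: the address of the stack slot holding the traced function's return address (lea 0x4(%ebp) on 32-bit, leaq 8(%rbp) on 64-bit) and the call site derived from mcount's own return address minus MCOUNT_INSN_SIZE. A rough C-level illustration of the slot computation, assuming frame pointers are in use (illustration only, not part of the patch):

/* Illustration only: with frame pointers, the traced function's return
 * address sits one word above the saved frame pointer, so this is the
 * slot that prepare_ftrace_return() later overwrites with the address
 * of return_to_handler. */
static unsigned long *return_address_slot(unsigned long frame_pointer)
{
	return (unsigned long *)(frame_pointer + sizeof(long));
}
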
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
index 0aa2c443d600..53699c931ad4 100644
--- a/arch/x86/kernel/es7000_32.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -38,8 +38,11 @@
38#include <asm/io.h> 38#include <asm/io.h>
39#include <asm/nmi.h> 39#include <asm/nmi.h>
40#include <asm/smp.h> 40#include <asm/smp.h>
41#include <asm/atomic.h>
41#include <asm/apicdef.h> 42#include <asm/apicdef.h>
42#include <mach_mpparse.h> 43#include <mach_mpparse.h>
44#include <asm/genapic.h>
45#include <asm/setup.h>
43 46
44/* 47/*
45 * ES7000 chipsets 48 * ES7000 chipsets
@@ -161,6 +164,43 @@ es7000_rename_gsi(int ioapic, int gsi)
161 return gsi; 164 return gsi;
162} 165}
163 166
167static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
168{
169 unsigned long vect = 0, psaival = 0;
170
171 if (psai == NULL)
172 return -1;
173
174 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
175 psaival = (0x1000000 | vect | cpu);
176
177 while (*psai & 0x1000000)
178 ;
179
180 *psai = psaival;
181
182 return 0;
183}
184
185static void noop_wait_for_deassert(atomic_t *deassert_not_used)
186{
187}
188
189static int __init es7000_update_genapic(void)
190{
191 genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
192
193 /* MPENTIUMIII */
194 if (boot_cpu_data.x86 == 6 &&
195 (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) {
196 es7000_update_genapic_to_cluster();
197 genapic->wait_for_init_deassert = noop_wait_for_deassert;
198 genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
199 }
200
201 return 0;
202}
203
164void __init 204void __init
165setup_unisys(void) 205setup_unisys(void)
166{ 206{
@@ -176,6 +216,8 @@ setup_unisys(void)
176 else 216 else
177 es7000_plat = ES7000_CLASSIC; 217 es7000_plat = ES7000_CLASSIC;
178 ioapic_renumber_irq = es7000_rename_gsi; 218 ioapic_renumber_irq = es7000_rename_gsi;
219
220 x86_quirks->update_genapic = es7000_update_genapic;
179} 221}
180 222
181/* 223/*
@@ -317,26 +359,6 @@ es7000_mip_write(struct mip_reg *mip_reg)
317 return status; 359 return status;
318} 360}
319 361
320int
321es7000_start_cpu(int cpu, unsigned long eip)
322{
323 unsigned long vect = 0, psaival = 0;
324
325 if (psai == NULL)
326 return -1;
327
328 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
329 psaival = (0x1000000 | vect | cpu);
330
331 while (*psai & 0x1000000)
332 ;
333
334 *psai = psaival;
335
336 return 0;
337
338}
339
340void __init 362void __init
341es7000_sw_apic(void) 363es7000_sw_apic(void)
342{ 364{
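
The MIP wakeup path added above busy-waits until bit 24 of the mailbox word at *psai clears, then posts a new request word. For illustration, the word it writes can be spelled out as follows (mip_wakeup_word() is a hypothetical helper, not part of the patch):

/* Hypothetical helper: bit 24 marks the request as pending, bits 16 and
 * up carry the 4 KB page number of the physical trampoline entry point,
 * and the low bits carry the target CPU number. */
static unsigned long mip_wakeup_word(unsigned long eip_phys, int cpu)
{
	return 0x1000000UL | ((eip_phys / 0x1000) << 16) | cpu;
}
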
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 50ea0ac8c9bf..1b43086b097a 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -14,14 +14,17 @@
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/list.h> 19#include <linux/list.h>
19 20
20#include <asm/ftrace.h> 21#include <asm/ftrace.h>
22#include <linux/ftrace.h>
21#include <asm/nops.h> 23#include <asm/nops.h>
24#include <asm/nmi.h>
22 25
23 26
24static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; 27#ifdef CONFIG_DYNAMIC_FTRACE
25 28
26union ftrace_code_union { 29union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE]; 30 char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
31 } __attribute__((packed)); 34 } __attribute__((packed));
32}; 35};
33 36
34
35static int ftrace_calc_offset(long ip, long addr) 37static int ftrace_calc_offset(long ip, long addr)
36{ 38{
37 return (int)(addr - ip); 39 return (int)(addr - ip);
38} 40}
39 41
40unsigned char *ftrace_nop_replace(void) 42static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
41{
42 return ftrace_nop;
43}
44
45unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
46{ 43{
47 static union ftrace_code_union calc; 44 static union ftrace_code_union calc;
48 45
@@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
56 return calc.code; 53 return calc.code;
57} 54}
58 55
59int 56/*
57 * Modifying code must take extra care. On an SMP machine, if
58 * the code being modified is also being executed on another CPU,
59 * that CPU will have undefined results and possibly take a GPF.
60 * We use kstop_machine to stop other CPUs from executing code.
61 * But this does not stop NMIs from happening. We still need
62 * to protect against that. We separate out the modification of
63 * the code to take care of this.
64 *
65 * Two buffers are added: An IP buffer and a "code" buffer.
66 *
67 * 1) Put the instruction pointer into the IP buffer
68 * and the new code into the "code" buffer.
69 * 2) Set a flag that says we are modifying code
70 * 3) Wait for any running NMIs to finish.
71 * 4) Write the code
72 * 5) clear the flag.
73 * 6) Wait for any running NMIs to finish.
74 *
75 * If an NMI is executed, the first thing it does is to call
76 * "ftrace_nmi_enter". This will check if the flag is set to write
77 * and if it is, it will write what is in the IP and "code" buffers.
78 *
79 * The trick is, it does not matter if everyone is writing the same
80 * content to the code location. Also, if a CPU is executing code
81 * it is OK to write to that code location if the contents being written
82 * are the same as what exists.
83 */
84
85static atomic_t in_nmi = ATOMIC_INIT(0);
86static int mod_code_status; /* holds return value of text write */
87static int mod_code_write; /* set when NMI should do the write */
88static void *mod_code_ip; /* holds the IP to write to */
89static void *mod_code_newcode; /* holds the text to write to the IP */
90
91static unsigned nmi_wait_count;
92static atomic_t nmi_update_count = ATOMIC_INIT(0);
93
94int ftrace_arch_read_dyn_info(char *buf, int size)
95{
96 int r;
97
98 r = snprintf(buf, size, "%u %u",
99 nmi_wait_count,
100 atomic_read(&nmi_update_count));
101 return r;
102}
103
104static void ftrace_mod_code(void)
105{
106 /*
107 * Yes, more than one CPU can be writing to mod_code_status.
108 * (and the code itself)
109 * But if one were to fail, then they all should, and if one were
110 * to succeed, then they all should.
111 */
112 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
113 MCOUNT_INSN_SIZE);
114}
115
116void ftrace_nmi_enter(void)
117{
118 atomic_inc(&in_nmi);
119 /* Must have in_nmi seen before reading write flag */
120 smp_mb();
121 if (mod_code_write) {
122 ftrace_mod_code();
123 atomic_inc(&nmi_update_count);
124 }
125}
126
127void ftrace_nmi_exit(void)
128{
129 /* Finish all executions before clearing in_nmi */
130 smp_wmb();
131 atomic_dec(&in_nmi);
132}
133
134static void wait_for_nmi(void)
135{
136 int waited = 0;
137
138 while (atomic_read(&in_nmi)) {
139 waited = 1;
140 cpu_relax();
141 }
142
143 if (waited)
144 nmi_wait_count++;
145}
146
147static int
148do_ftrace_mod_code(unsigned long ip, void *new_code)
149{
150 mod_code_ip = (void *)ip;
151 mod_code_newcode = new_code;
152
153 /* The buffers need to be visible before we let NMIs write them */
154 smp_wmb();
155
156 mod_code_write = 1;
157
158 /* Make sure write bit is visible before we wait on NMIs */
159 smp_mb();
160
161 wait_for_nmi();
162
163 /* Make sure all running NMIs have finished before we write the code */
164 smp_mb();
165
166 ftrace_mod_code();
167
168 /* Make sure the write happens before clearing the bit */
169 smp_wmb();
170
171 mod_code_write = 0;
172
173 /* make sure NMIs see the cleared bit */
174 smp_mb();
175
176 wait_for_nmi();
177
178 return mod_code_status;
179}
180
181
182
183
184static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
185
186static unsigned char *ftrace_nop_replace(void)
187{
188 return ftrace_nop;
189}
190
191static int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code, 192ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code) 193 unsigned char *new_code)
62{ 194{
@@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
81 return -EINVAL; 213 return -EINVAL;
82 214
83 /* replace the text with the new text */ 215 /* replace the text with the new text */
84 if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) 216 if (do_ftrace_mod_code(ip, new_code))
85 return -EPERM; 217 return -EPERM;
86 218
87 sync_core(); 219 sync_core();
@@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
89 return 0; 221 return 0;
90} 222}
91 223
224int ftrace_make_nop(struct module *mod,
225 struct dyn_ftrace *rec, unsigned long addr)
226{
227 unsigned char *new, *old;
228 unsigned long ip = rec->ip;
229
230 old = ftrace_call_replace(ip, addr);
231 new = ftrace_nop_replace();
232
233 return ftrace_modify_code(rec->ip, old, new);
234}
235
236int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
237{
238 unsigned char *new, *old;
239 unsigned long ip = rec->ip;
240
241 old = ftrace_nop_replace();
242 new = ftrace_call_replace(ip, addr);
243
244 return ftrace_modify_code(rec->ip, old, new);
245}
246
92int ftrace_update_ftrace_func(ftrace_func_t func) 247int ftrace_update_ftrace_func(ftrace_func_t func)
93{ 248{
94 unsigned long ip = (unsigned long)(&ftrace_call); 249 unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data)
165 320
166 return 0; 321 return 0;
167} 322}
323#endif
324
325#ifdef CONFIG_FUNCTION_GRAPH_TRACER
326
327#ifdef CONFIG_DYNAMIC_FTRACE
328extern void ftrace_graph_call(void);
329
330static int ftrace_mod_jmp(unsigned long ip,
331 int old_offset, int new_offset)
332{
333 unsigned char code[MCOUNT_INSN_SIZE];
334
335 if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
336 return -EFAULT;
337
338 if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
339 return -EINVAL;
340
341 *(int *)(&code[1]) = new_offset;
342
343 if (do_ftrace_mod_code(ip, &code))
344 return -EPERM;
345
346 return 0;
347}
348
349int ftrace_enable_ftrace_graph_caller(void)
350{
351 unsigned long ip = (unsigned long)(&ftrace_graph_call);
352 int old_offset, new_offset;
353
354 old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
355 new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
356
357 return ftrace_mod_jmp(ip, old_offset, new_offset);
358}
359
360int ftrace_disable_ftrace_graph_caller(void)
361{
362 unsigned long ip = (unsigned long)(&ftrace_graph_call);
363 int old_offset, new_offset;
364
365 old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
366 new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
367
368 return ftrace_mod_jmp(ip, old_offset, new_offset);
369}
370
371#else /* CONFIG_DYNAMIC_FTRACE */
372
373/*
374 * These functions are picked from those used on
375 * this page for dynamic ftrace. They have been
376 * simplified to ignore all traces in NMI context.
377 */
378static atomic_t in_nmi;
379
380void ftrace_nmi_enter(void)
381{
382 atomic_inc(&in_nmi);
383}
384
385void ftrace_nmi_exit(void)
386{
387 atomic_dec(&in_nmi);
388}
389
390#endif /* !CONFIG_DYNAMIC_FTRACE */
391
392/* Add a function return address to the trace stack on thread info. */
393static int push_return_trace(unsigned long ret, unsigned long long time,
394 unsigned long func, int *depth)
395{
396 int index;
397
398 if (!current->ret_stack)
399 return -EBUSY;
400
401 /* The return trace stack is full */
402 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
403 atomic_inc(&current->trace_overrun);
404 return -EBUSY;
405 }
406
407 index = ++current->curr_ret_stack;
408 barrier();
409 current->ret_stack[index].ret = ret;
410 current->ret_stack[index].func = func;
411 current->ret_stack[index].calltime = time;
412 *depth = index;
413
414 return 0;
415}
416
417/* Retrieve a function return address from the trace stack on thread info. */
418static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
419{
420 int index;
421
422 index = current->curr_ret_stack;
423
424 if (unlikely(index < 0)) {
425 ftrace_graph_stop();
426 WARN_ON(1);
427 /* Might as well panic, otherwise we have nowhere to go */
428 *ret = (unsigned long)panic;
429 return;
430 }
431
432 *ret = current->ret_stack[index].ret;
433 trace->func = current->ret_stack[index].func;
434 trace->calltime = current->ret_stack[index].calltime;
435 trace->overrun = atomic_read(&current->trace_overrun);
436 trace->depth = index;
437 barrier();
438 current->curr_ret_stack--;
439
440}
441
442/*
443 * Send the trace to the ring-buffer.
444 * @return the original return address.
445 */
446unsigned long ftrace_return_to_handler(void)
447{
448 struct ftrace_graph_ret trace;
449 unsigned long ret;
450
451 pop_return_trace(&trace, &ret);
452 trace.rettime = cpu_clock(raw_smp_processor_id());
453 ftrace_graph_return(&trace);
454
455 if (unlikely(!ret)) {
456 ftrace_graph_stop();
457 WARN_ON(1);
458 /* Might as well panic. What else to do? */
459 ret = (unsigned long)panic;
460 }
461
462 return ret;
463}
464
465/*
466 * Hook the return address and push it onto the stack of return addresses
467 * in current thread info.
468 */
469void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
470{
471 unsigned long old;
472 unsigned long long calltime;
473 int faulted;
474 struct ftrace_graph_ent trace;
475 unsigned long return_hooker = (unsigned long)
476 &return_to_handler;
477
478 /* NMIs are currently unsupported */
479 if (unlikely(atomic_read(&in_nmi)))
480 return;
481
482 if (unlikely(atomic_read(&current->tracing_graph_pause)))
483 return;
484
485 /*
486 * Protect against a fault, even if it shouldn't
487 * happen. This tool is too intrusive to
488 * ignore such a protection.
489 */
490 asm volatile(
491 "1: " _ASM_MOV " (%[parent_old]), %[old]\n"
492 "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n"
493 " movl $0, %[faulted]\n"
494
495 ".section .fixup, \"ax\"\n"
496 "3: movl $1, %[faulted]\n"
497 ".previous\n"
498
499 _ASM_EXTABLE(1b, 3b)
500 _ASM_EXTABLE(2b, 3b)
501
502 : [parent_replaced] "=r" (parent), [old] "=r" (old),
503 [faulted] "=r" (faulted)
504 : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
505 : "memory"
506 );
507
508 if (unlikely(faulted)) {
509 ftrace_graph_stop();
510 WARN_ON(1);
511 return;
512 }
513
514 if (unlikely(!__kernel_text_address(old))) {
515 ftrace_graph_stop();
516 *parent = old;
517 WARN_ON(1);
518 return;
519 }
520
521 calltime = cpu_clock(raw_smp_processor_id());
522
523 if (push_return_trace(old, calltime,
524 self_addr, &trace.depth) == -EBUSY) {
525 *parent = old;
526 return;
527 }
528
529 trace.func = self_addr;
530
531 /* Only trace if the calling function expects to */
532 if (!ftrace_graph_entry(&trace)) {
533 current->curr_ret_stack--;
534 *parent = old;
535 }
536}
537#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
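
Both ftrace_call_replace() and the new ftrace_mod_jmp() above operate on the same 5-byte x86 instruction format: one opcode byte (0xe8 for call rel32, 0xe9 for jmp rel32) followed by a 32-bit displacement measured from the end of the instruction, which is why ftrace_enable_ftrace_graph_caller() and ftrace_disable_ftrace_graph_caller() compute their offsets as target - (ip + MCOUNT_INSN_SIZE). A minimal, self-contained sketch of that encoding (illustration only, not part of the patch):

/* Build a 5-byte call/jmp with a rel32 displacement, the same layout the
 * patching code above reads and writes. */
#define SKETCH_INSN_SIZE 5			/* matches MCOUNT_INSN_SIZE on x86 */

static void build_rel32_insn(unsigned char insn[SKETCH_INSN_SIZE],
			     unsigned char opcode,
			     unsigned long ip, unsigned long target)
{
	int rel = (int)(target - (ip + SKETCH_INSN_SIZE));

	insn[0] = opcode;			/* 0xe8 = call, 0xe9 = jmp */
	*(int *)&insn[1] = rel;			/* little-endian displacement */
}
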
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 6c9bfc9e1e95..2bced78b0b8e 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -21,6 +21,7 @@
21#include <asm/smp.h> 21#include <asm/smp.h>
22#include <asm/ipi.h> 22#include <asm/ipi.h>
23#include <asm/genapic.h> 23#include <asm/genapic.h>
24#include <asm/setup.h>
24 25
25extern struct genapic apic_flat; 26extern struct genapic apic_flat;
26extern struct genapic apic_physflat; 27extern struct genapic apic_physflat;
@@ -53,6 +54,9 @@ void __init setup_apic_routing(void)
53 genapic = &apic_physflat; 54 genapic = &apic_physflat;
54 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); 55 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
55 } 56 }
57
58 if (x86_quirks->update_genapic)
59 x86_quirks->update_genapic();
56} 60}
57 61
58/* Same for both flat and physical. */ 62/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 1184210e6d0c..d7f0993b8056 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -108,8 +108,33 @@ static int __init parse_noapic(char *str)
108early_param("noapic", parse_noapic); 108early_param("noapic", parse_noapic);
109 109
110struct irq_pin_list; 110struct irq_pin_list;
111
112/*
113 * This is performance-critical, we want to do it O(1)
114 *
115 * the indexing order of this array favors 1:1 mappings
116 * between pins and IRQs.
117 */
118
119struct irq_pin_list {
120 int apic, pin;
121 struct irq_pin_list *next;
122};
123
124static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
125{
126 struct irq_pin_list *pin;
127 int node;
128
129 node = cpu_to_node(cpu);
130
131 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
132 printk(KERN_DEBUG " alloc irq_2_pin on cpu %d node %d\n", cpu, node);
133
134 return pin;
135}
136
111struct irq_cfg { 137struct irq_cfg {
112 unsigned int irq;
113 struct irq_pin_list *irq_2_pin; 138 struct irq_pin_list *irq_2_pin;
114 cpumask_t domain; 139 cpumask_t domain;
115 cpumask_t old_domain; 140 cpumask_t old_domain;
@@ -119,81 +144,95 @@ struct irq_cfg {
119}; 144};
120 145
121/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 146/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
147#ifdef CONFIG_SPARSE_IRQ
148static struct irq_cfg irq_cfgx[] = {
149#else
122static struct irq_cfg irq_cfgx[NR_IRQS] = { 150static struct irq_cfg irq_cfgx[NR_IRQS] = {
123 [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, 151#endif
124 [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, 152 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
125 [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, 153 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
126 [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, 154 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
127 [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, 155 [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
128 [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, 156 [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
129 [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, 157 [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
130 [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, 158 [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
131 [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, 159 [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
132 [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, 160 [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
133 [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, 161 [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
134 [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, 162 [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
135 [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, 163 [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
136 [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, 164 [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
137 [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, 165 [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
138 [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, 166 [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
167 [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
139}; 168};
140 169
141#define for_each_irq_cfg(irq, cfg) \ 170void __init arch_early_irq_init(void)
142 for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
143
144static struct irq_cfg *irq_cfg(unsigned int irq)
145{ 171{
146 return irq < nr_irqs ? irq_cfgx + irq : NULL; 172 struct irq_cfg *cfg;
173 struct irq_desc *desc;
174 int count;
175 int i;
176
177 cfg = irq_cfgx;
178 count = ARRAY_SIZE(irq_cfgx);
179
180 for (i = 0; i < count; i++) {
181 desc = irq_to_desc(i);
182 desc->chip_data = &cfg[i];
183 }
147} 184}
148 185
149static struct irq_cfg *irq_cfg_alloc(unsigned int irq) 186#ifdef CONFIG_SPARSE_IRQ
187static struct irq_cfg *irq_cfg(unsigned int irq)
150{ 188{
151 return irq_cfg(irq); 189 struct irq_cfg *cfg = NULL;
190 struct irq_desc *desc;
191
192 desc = irq_to_desc(irq);
193 if (desc)
194 cfg = desc->chip_data;
195
196 return cfg;
152} 197}
153 198
154/* 199static struct irq_cfg *get_one_free_irq_cfg(int cpu)
155 * Rough estimation of how many shared IRQs there are, can be changed 200{
156 * anytime. 201 struct irq_cfg *cfg;
157 */ 202 int node;
158#define MAX_PLUS_SHARED_IRQS NR_IRQS
159#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
160 203
161/* 204 node = cpu_to_node(cpu);
162 * This is performance-critical, we want to do it O(1)
163 *
164 * the indexing order of this array favors 1:1 mappings
165 * between pins and IRQs.
166 */
167 205
168struct irq_pin_list { 206 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
169 int apic, pin; 207 printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node);
170 struct irq_pin_list *next;
171};
172 208
173static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; 209 return cfg;
174static struct irq_pin_list *irq_2_pin_ptr; 210}
175 211
176static void __init irq_2_pin_init(void) 212void arch_init_chip_data(struct irq_desc *desc, int cpu)
177{ 213{
178 struct irq_pin_list *pin = irq_2_pin_head; 214 struct irq_cfg *cfg;
179 int i;
180
181 for (i = 1; i < PIN_MAP_SIZE; i++)
182 pin[i-1].next = &pin[i];
183 215
184 irq_2_pin_ptr = &pin[0]; 216 cfg = desc->chip_data;
217 if (!cfg) {
218 desc->chip_data = get_one_free_irq_cfg(cpu);
219 if (!desc->chip_data) {
220 printk(KERN_ERR "can not alloc irq_cfg\n");
221 BUG_ON(1);
222 }
223 }
185} 224}
186 225
187static struct irq_pin_list *get_one_free_irq_2_pin(void) 226#else
227static struct irq_cfg *irq_cfg(unsigned int irq)
188{ 228{
189 struct irq_pin_list *pin = irq_2_pin_ptr; 229 return irq < nr_irqs ? irq_cfgx + irq : NULL;
230}
190 231
191 if (!pin) 232#endif
192 panic("can not get more irq_2_pin\n");
193 233
194 irq_2_pin_ptr = pin->next; 234static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
195 pin->next = NULL; 235{
196 return pin;
197} 236}
198 237
199struct io_apic { 238struct io_apic {
@@ -237,11 +276,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
237 writel(value, &io_apic->data); 276 writel(value, &io_apic->data);
238} 277}
239 278
240static bool io_apic_level_ack_pending(unsigned int irq) 279static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
241{ 280{
242 struct irq_pin_list *entry; 281 struct irq_pin_list *entry;
243 unsigned long flags; 282 unsigned long flags;
244 struct irq_cfg *cfg = irq_cfg(irq);
245 283
246 spin_lock_irqsave(&ioapic_lock, flags); 284 spin_lock_irqsave(&ioapic_lock, flags);
247 entry = cfg->irq_2_pin; 285 entry = cfg->irq_2_pin;
@@ -323,13 +361,12 @@ static void ioapic_mask_entry(int apic, int pin)
323} 361}
324 362
325#ifdef CONFIG_SMP 363#ifdef CONFIG_SMP
326static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) 364static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
327{ 365{
328 int apic, pin; 366 int apic, pin;
329 struct irq_cfg *cfg;
330 struct irq_pin_list *entry; 367 struct irq_pin_list *entry;
368 u8 vector = cfg->vector;
331 369
332 cfg = irq_cfg(irq);
333 entry = cfg->irq_2_pin; 370 entry = cfg->irq_2_pin;
334 for (;;) { 371 for (;;) {
335 unsigned int reg; 372 unsigned int reg;
@@ -359,24 +396,27 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
359 } 396 }
360} 397}
361 398
362static int assign_irq_vector(int irq, cpumask_t mask); 399static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
363 400
364static void set_ioapic_affinity_irq(unsigned int irq, 401static void set_ioapic_affinity_irq_desc(struct irq_desc *desc,
365 const struct cpumask *mask) 402 const struct cpumask *mask)
366{ 403{
367 struct irq_cfg *cfg; 404 struct irq_cfg *cfg;
368 unsigned long flags; 405 unsigned long flags;
369 unsigned int dest; 406 unsigned int dest;
370 cpumask_t tmp; 407 cpumask_t tmp;
371 struct irq_desc *desc; 408 unsigned int irq;
372 409
373 if (!cpumask_intersects(mask, cpu_online_mask)) 410 if (!cpumask_intersects(mask, cpu_online_mask))
374 return; 411 return;
375 412
376 cfg = irq_cfg(irq); 413 irq = desc->irq;
377 if (assign_irq_vector(irq, *mask)) 414 cfg = desc->chip_data;
415 if (assign_irq_vector(irq, cfg, *mask))
378 return; 416 return;
379 417
418 set_extra_move_desc(desc, *mask);
419
380 cpumask_and(&tmp, &cfg->domain, mask); 420 cpumask_and(&tmp, &cfg->domain, mask);
381 dest = cpu_mask_to_apicid(tmp); 421 dest = cpu_mask_to_apicid(tmp);
382 /* 422 /*
@@ -384,12 +424,21 @@ static void set_ioapic_affinity_irq(unsigned int irq,
384 */ 424 */
385 dest = SET_APIC_LOGICAL_ID(dest); 425 dest = SET_APIC_LOGICAL_ID(dest);
386 426
387 desc = irq_to_desc(irq);
388 spin_lock_irqsave(&ioapic_lock, flags); 427 spin_lock_irqsave(&ioapic_lock, flags);
389 __target_IO_APIC_irq(irq, dest, cfg->vector); 428 __target_IO_APIC_irq(irq, dest, cfg);
390 cpumask_copy(&desc->affinity, mask); 429 cpumask_copy(&desc->affinity, mask);
391 spin_unlock_irqrestore(&ioapic_lock, flags); 430 spin_unlock_irqrestore(&ioapic_lock, flags);
392} 431}
432
433static void set_ioapic_affinity_irq(unsigned int irq,
434 const struct cpumask *mask)
435{
436 struct irq_desc *desc;
437
438 desc = irq_to_desc(irq);
439
440 set_ioapic_affinity_irq_desc(desc, mask);
441}
393#endif /* CONFIG_SMP */ 442#endif /* CONFIG_SMP */
394 443
395/* 444/*
@@ -397,16 +446,18 @@ static void set_ioapic_affinity_irq(unsigned int irq,
397 * shared ISA-space IRQs, so we have to support them. We are super 446 * shared ISA-space IRQs, so we have to support them. We are super
398 * fast in the common case, and fast for shared ISA-space IRQs. 447 * fast in the common case, and fast for shared ISA-space IRQs.
399 */ 448 */
400static void add_pin_to_irq(unsigned int irq, int apic, int pin) 449static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
401{ 450{
402 struct irq_cfg *cfg;
403 struct irq_pin_list *entry; 451 struct irq_pin_list *entry;
404 452
405 /* first time to refer irq_cfg, so with new */
406 cfg = irq_cfg_alloc(irq);
407 entry = cfg->irq_2_pin; 453 entry = cfg->irq_2_pin;
408 if (!entry) { 454 if (!entry) {
409 entry = get_one_free_irq_2_pin(); 455 entry = get_one_free_irq_2_pin(cpu);
456 if (!entry) {
457 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
458 apic, pin);
459 return;
460 }
410 cfg->irq_2_pin = entry; 461 cfg->irq_2_pin = entry;
411 entry->apic = apic; 462 entry->apic = apic;
412 entry->pin = pin; 463 entry->pin = pin;
@@ -421,7 +472,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
421 entry = entry->next; 472 entry = entry->next;
422 } 473 }
423 474
424 entry->next = get_one_free_irq_2_pin(); 475 entry->next = get_one_free_irq_2_pin(cpu);
425 entry = entry->next; 476 entry = entry->next;
426 entry->apic = apic; 477 entry->apic = apic;
427 entry->pin = pin; 478 entry->pin = pin;
@@ -430,11 +481,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
430/* 481/*
431 * Reroute an IRQ to a different pin. 482 * Reroute an IRQ to a different pin.
432 */ 483 */
433static void __init replace_pin_at_irq(unsigned int irq, 484static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
434 int oldapic, int oldpin, 485 int oldapic, int oldpin,
435 int newapic, int newpin) 486 int newapic, int newpin)
436{ 487{
437 struct irq_cfg *cfg = irq_cfg(irq);
438 struct irq_pin_list *entry = cfg->irq_2_pin; 488 struct irq_pin_list *entry = cfg->irq_2_pin;
439 int replaced = 0; 489 int replaced = 0;
440 490
@@ -451,18 +501,16 @@ static void __init replace_pin_at_irq(unsigned int irq,
451 501
452 /* why? call replace before add? */ 502 /* why? call replace before add? */
453 if (!replaced) 503 if (!replaced)
454 add_pin_to_irq(irq, newapic, newpin); 504 add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
455} 505}
456 506
457static inline void io_apic_modify_irq(unsigned int irq, 507static inline void io_apic_modify_irq(struct irq_cfg *cfg,
458 int mask_and, int mask_or, 508 int mask_and, int mask_or,
459 void (*final)(struct irq_pin_list *entry)) 509 void (*final)(struct irq_pin_list *entry))
460{ 510{
461 int pin; 511 int pin;
462 struct irq_cfg *cfg;
463 struct irq_pin_list *entry; 512 struct irq_pin_list *entry;
464 513
465 cfg = irq_cfg(irq);
466 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { 514 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
467 unsigned int reg; 515 unsigned int reg;
468 pin = entry->pin; 516 pin = entry->pin;
@@ -475,9 +523,9 @@ static inline void io_apic_modify_irq(unsigned int irq,
475 } 523 }
476} 524}
477 525
478static void __unmask_IO_APIC_irq(unsigned int irq) 526static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
479{ 527{
480 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); 528 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
481} 529}
482 530
483#ifdef CONFIG_X86_64 531#ifdef CONFIG_X86_64
@@ -492,47 +540,64 @@ void io_apic_sync(struct irq_pin_list *entry)
492 readl(&io_apic->data); 540 readl(&io_apic->data);
493} 541}
494 542
495static void __mask_IO_APIC_irq(unsigned int irq) 543static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
496{ 544{
497 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 545 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
498} 546}
499#else /* CONFIG_X86_32 */ 547#else /* CONFIG_X86_32 */
500static void __mask_IO_APIC_irq(unsigned int irq) 548static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
501{ 549{
502 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); 550 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
503} 551}
504 552
505static void __mask_and_edge_IO_APIC_irq(unsigned int irq) 553static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
506{ 554{
507 io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, 555 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
508 IO_APIC_REDIR_MASKED, NULL); 556 IO_APIC_REDIR_MASKED, NULL);
509} 557}
510 558
511static void __unmask_and_level_IO_APIC_irq(unsigned int irq) 559static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
512{ 560{
513 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 561 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
514 IO_APIC_REDIR_LEVEL_TRIGGER, NULL); 562 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
515} 563}
516#endif /* CONFIG_X86_32 */ 564#endif /* CONFIG_X86_32 */
517 565
518static void mask_IO_APIC_irq (unsigned int irq) 566static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
519{ 567{
568 struct irq_cfg *cfg = desc->chip_data;
520 unsigned long flags; 569 unsigned long flags;
521 570
571 BUG_ON(!cfg);
572
522 spin_lock_irqsave(&ioapic_lock, flags); 573 spin_lock_irqsave(&ioapic_lock, flags);
523 __mask_IO_APIC_irq(irq); 574 __mask_IO_APIC_irq(cfg);
524 spin_unlock_irqrestore(&ioapic_lock, flags); 575 spin_unlock_irqrestore(&ioapic_lock, flags);
525} 576}
526 577
527static void unmask_IO_APIC_irq (unsigned int irq) 578static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
528{ 579{
580 struct irq_cfg *cfg = desc->chip_data;
529 unsigned long flags; 581 unsigned long flags;
530 582
531 spin_lock_irqsave(&ioapic_lock, flags); 583 spin_lock_irqsave(&ioapic_lock, flags);
532 __unmask_IO_APIC_irq(irq); 584 __unmask_IO_APIC_irq(cfg);
533 spin_unlock_irqrestore(&ioapic_lock, flags); 585 spin_unlock_irqrestore(&ioapic_lock, flags);
534} 586}
535 587
588static void mask_IO_APIC_irq(unsigned int irq)
589{
590 struct irq_desc *desc = irq_to_desc(irq);
591
592 mask_IO_APIC_irq_desc(desc);
593}
594static void unmask_IO_APIC_irq(unsigned int irq)
595{
596 struct irq_desc *desc = irq_to_desc(irq);
597
598 unmask_IO_APIC_irq_desc(desc);
599}
600
536static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 601static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
537{ 602{
538 struct IO_APIC_route_entry entry; 603 struct IO_APIC_route_entry entry;
@@ -809,7 +874,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
809 */ 874 */
810static int EISA_ELCR(unsigned int irq) 875static int EISA_ELCR(unsigned int irq)
811{ 876{
812 if (irq < 16) { 877 if (irq < NR_IRQS_LEGACY) {
813 unsigned int port = 0x4d0 + (irq >> 3); 878 unsigned int port = 0x4d0 + (irq >> 3);
814 return (inb(port) >> (irq & 7)) & 1; 879 return (inb(port) >> (irq & 7)) & 1;
815 } 880 }
@@ -1034,7 +1099,7 @@ void unlock_vector_lock(void)
1034 spin_unlock(&vector_lock); 1099 spin_unlock(&vector_lock);
1035} 1100}
1036 1101
1037static int __assign_irq_vector(int irq, cpumask_t mask) 1102static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
1038{ 1103{
1039 /* 1104 /*
1040 * NOTE! The local APIC isn't very good at handling 1105 * NOTE! The local APIC isn't very good at handling
@@ -1050,16 +1115,13 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
1050 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1115 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
1051 unsigned int old_vector; 1116 unsigned int old_vector;
1052 int cpu; 1117 int cpu;
1053 struct irq_cfg *cfg;
1054 1118
1055 cfg = irq_cfg(irq); 1119 if ((cfg->move_in_progress) || cfg->move_cleanup_count)
1120 return -EBUSY;
1056 1121
1057 /* Only try and allocate irqs on cpus that are present */ 1122 /* Only try and allocate irqs on cpus that are present */
1058 cpus_and(mask, mask, cpu_online_map); 1123 cpus_and(mask, mask, cpu_online_map);
1059 1124
1060 if ((cfg->move_in_progress) || cfg->move_cleanup_count)
1061 return -EBUSY;
1062
1063 old_vector = cfg->vector; 1125 old_vector = cfg->vector;
1064 if (old_vector) { 1126 if (old_vector) {
1065 cpumask_t tmp; 1127 cpumask_t tmp;
@@ -1113,24 +1175,22 @@ next:
1113 return -ENOSPC; 1175 return -ENOSPC;
1114} 1176}
1115 1177
1116static int assign_irq_vector(int irq, cpumask_t mask) 1178static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
1117{ 1179{
1118 int err; 1180 int err;
1119 unsigned long flags; 1181 unsigned long flags;
1120 1182
1121 spin_lock_irqsave(&vector_lock, flags); 1183 spin_lock_irqsave(&vector_lock, flags);
1122 err = __assign_irq_vector(irq, mask); 1184 err = __assign_irq_vector(irq, cfg, mask);
1123 spin_unlock_irqrestore(&vector_lock, flags); 1185 spin_unlock_irqrestore(&vector_lock, flags);
1124 return err; 1186 return err;
1125} 1187}
1126 1188
1127static void __clear_irq_vector(int irq) 1189static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1128{ 1190{
1129 struct irq_cfg *cfg;
1130 cpumask_t mask; 1191 cpumask_t mask;
1131 int cpu, vector; 1192 int cpu, vector;
1132 1193
1133 cfg = irq_cfg(irq);
1134 BUG_ON(!cfg->vector); 1194 BUG_ON(!cfg->vector);
1135 1195
1136 vector = cfg->vector; 1196 vector = cfg->vector;
@@ -1162,9 +1222,13 @@ void __setup_vector_irq(int cpu)
1162 /* This function must be called with vector_lock held */ 1222 /* This function must be called with vector_lock held */
1163 int irq, vector; 1223 int irq, vector;
1164 struct irq_cfg *cfg; 1224 struct irq_cfg *cfg;
1225 struct irq_desc *desc;
1165 1226
1166 /* Mark the inuse vectors */ 1227 /* Mark the inuse vectors */
1167 for_each_irq_cfg(irq, cfg) { 1228 for_each_irq_desc(irq, desc) {
1229 if (!desc)
1230 continue;
1231 cfg = desc->chip_data;
1168 if (!cpu_isset(cpu, cfg->domain)) 1232 if (!cpu_isset(cpu, cfg->domain))
1169 continue; 1233 continue;
1170 vector = cfg->vector; 1234 vector = cfg->vector;
@@ -1215,11 +1279,8 @@ static inline int IO_APIC_irq_trigger(int irq)
1215} 1279}
1216#endif 1280#endif
1217 1281
1218static void ioapic_register_intr(int irq, unsigned long trigger) 1282static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
1219{ 1283{
1220 struct irq_desc *desc;
1221
1222 desc = irq_to_desc(irq);
1223 1284
1224 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1285 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1225 trigger == IOAPIC_LEVEL) 1286 trigger == IOAPIC_LEVEL)
@@ -1311,7 +1372,7 @@ static int setup_ioapic_entry(int apic, int irq,
1311 return 0; 1372 return 0;
1312} 1373}
1313 1374
1314static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, 1375static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
1315 int trigger, int polarity) 1376 int trigger, int polarity)
1316{ 1377{
1317 struct irq_cfg *cfg; 1378 struct irq_cfg *cfg;
@@ -1321,10 +1382,10 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
1321 if (!IO_APIC_IRQ(irq)) 1382 if (!IO_APIC_IRQ(irq))
1322 return; 1383 return;
1323 1384
1324 cfg = irq_cfg(irq); 1385 cfg = desc->chip_data;
1325 1386
1326 mask = TARGET_CPUS; 1387 mask = TARGET_CPUS;
1327 if (assign_irq_vector(irq, mask)) 1388 if (assign_irq_vector(irq, cfg, mask))
1328 return; 1389 return;
1329 1390
1330 cpus_and(mask, cfg->domain, mask); 1391 cpus_and(mask, cfg->domain, mask);
@@ -1341,12 +1402,12 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
1341 cfg->vector)) { 1402 cfg->vector)) {
1342 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1403 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1343 mp_ioapics[apic].mp_apicid, pin); 1404 mp_ioapics[apic].mp_apicid, pin);
1344 __clear_irq_vector(irq); 1405 __clear_irq_vector(irq, cfg);
1345 return; 1406 return;
1346 } 1407 }
1347 1408
1348 ioapic_register_intr(irq, trigger); 1409 ioapic_register_intr(irq, desc, trigger);
1349 if (irq < 16) 1410 if (irq < NR_IRQS_LEGACY)
1350 disable_8259A_irq(irq); 1411 disable_8259A_irq(irq);
1351 1412
1352 ioapic_write_entry(apic, pin, entry); 1413 ioapic_write_entry(apic, pin, entry);
@@ -1356,6 +1417,9 @@ static void __init setup_IO_APIC_irqs(void)
1356{ 1417{
1357 int apic, pin, idx, irq; 1418 int apic, pin, idx, irq;
1358 int notcon = 0; 1419 int notcon = 0;
1420 struct irq_desc *desc;
1421 struct irq_cfg *cfg;
1422 int cpu = boot_cpu_id;
1359 1423
1360 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1424 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1361 1425
@@ -1387,9 +1451,15 @@ static void __init setup_IO_APIC_irqs(void)
1387 if (multi_timer_check(apic, irq)) 1451 if (multi_timer_check(apic, irq))
1388 continue; 1452 continue;
1389#endif 1453#endif
1390 add_pin_to_irq(irq, apic, pin); 1454 desc = irq_to_desc_alloc_cpu(irq, cpu);
1455 if (!desc) {
1456 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1457 continue;
1458 }
1459 cfg = desc->chip_data;
1460 add_pin_to_irq_cpu(cfg, cpu, apic, pin);
1391 1461
1392 setup_IO_APIC_irq(apic, pin, irq, 1462 setup_IO_APIC_irq(apic, pin, irq, desc,
1393 irq_trigger(idx), irq_polarity(idx)); 1463 irq_trigger(idx), irq_polarity(idx));
1394 } 1464 }
1395 } 1465 }
@@ -1448,6 +1518,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1448 union IO_APIC_reg_03 reg_03; 1518 union IO_APIC_reg_03 reg_03;
1449 unsigned long flags; 1519 unsigned long flags;
1450 struct irq_cfg *cfg; 1520 struct irq_cfg *cfg;
1521 struct irq_desc *desc;
1451 unsigned int irq; 1522 unsigned int irq;
1452 1523
1453 if (apic_verbosity == APIC_QUIET) 1524 if (apic_verbosity == APIC_QUIET)
@@ -1537,8 +1608,13 @@ __apicdebuginit(void) print_IO_APIC(void)
1537 } 1608 }
1538 } 1609 }
1539 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1610 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1540 for_each_irq_cfg(irq, cfg) { 1611 for_each_irq_desc(irq, desc) {
1541 struct irq_pin_list *entry = cfg->irq_2_pin; 1612 struct irq_pin_list *entry;
1613
1614 if (!desc)
1615 continue;
1616 cfg = desc->chip_data;
1617 entry = cfg->irq_2_pin;
1542 if (!entry) 1618 if (!entry)
1543 continue; 1619 continue;
1544 printk(KERN_DEBUG "IRQ%d ", irq); 1620 printk(KERN_DEBUG "IRQ%d ", irq);
@@ -2022,14 +2098,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2022{ 2098{
2023 int was_pending = 0; 2099 int was_pending = 0;
2024 unsigned long flags; 2100 unsigned long flags;
2101 struct irq_cfg *cfg;
2025 2102
2026 spin_lock_irqsave(&ioapic_lock, flags); 2103 spin_lock_irqsave(&ioapic_lock, flags);
2027 if (irq < 16) { 2104 if (irq < NR_IRQS_LEGACY) {
2028 disable_8259A_irq(irq); 2105 disable_8259A_irq(irq);
2029 if (i8259A_irq_pending(irq)) 2106 if (i8259A_irq_pending(irq))
2030 was_pending = 1; 2107 was_pending = 1;
2031 } 2108 }
2032 __unmask_IO_APIC_irq(irq); 2109 cfg = irq_cfg(irq);
2110 __unmask_IO_APIC_irq(cfg);
2033 spin_unlock_irqrestore(&ioapic_lock, flags); 2111 spin_unlock_irqrestore(&ioapic_lock, flags);
2034 2112
2035 return was_pending; 2113 return was_pending;
@@ -2092,35 +2170,37 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2092 * as simple as edge triggered migration and we can do the irq migration 2170 * as simple as edge triggered migration and we can do the irq migration
2093 * with a simple atomic update to IO-APIC RTE. 2171 * with a simple atomic update to IO-APIC RTE.
2094 */ 2172 */
2095static void migrate_ioapic_irq(int irq, cpumask_t mask) 2173static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
2096{ 2174{
2097 struct irq_cfg *cfg; 2175 struct irq_cfg *cfg;
2098 struct irq_desc *desc;
2099 cpumask_t tmp, cleanup_mask; 2176 cpumask_t tmp, cleanup_mask;
2100 struct irte irte; 2177 struct irte irte;
2101 int modify_ioapic_rte; 2178 int modify_ioapic_rte;
2102 unsigned int dest; 2179 unsigned int dest;
2103 unsigned long flags; 2180 unsigned long flags;
2181 unsigned int irq;
2104 2182
2105 cpus_and(tmp, mask, cpu_online_map); 2183 cpus_and(tmp, mask, cpu_online_map);
2106 if (cpus_empty(tmp)) 2184 if (cpus_empty(tmp))
2107 return; 2185 return;
2108 2186
2187 irq = desc->irq;
2109 if (get_irte(irq, &irte)) 2188 if (get_irte(irq, &irte))
2110 return; 2189 return;
2111 2190
2112 if (assign_irq_vector(irq, mask)) 2191 cfg = desc->chip_data;
2192 if (assign_irq_vector(irq, cfg, mask))
2113 return; 2193 return;
2114 2194
2115 cfg = irq_cfg(irq); 2195 set_extra_move_desc(desc, mask);
2196
2116 cpus_and(tmp, cfg->domain, mask); 2197 cpus_and(tmp, cfg->domain, mask);
2117 dest = cpu_mask_to_apicid(tmp); 2198 dest = cpu_mask_to_apicid(tmp);
2118 2199
2119 desc = irq_to_desc(irq);
2120 modify_ioapic_rte = desc->status & IRQ_LEVEL; 2200 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2121 if (modify_ioapic_rte) { 2201 if (modify_ioapic_rte) {
2122 spin_lock_irqsave(&ioapic_lock, flags); 2202 spin_lock_irqsave(&ioapic_lock, flags);
2123 __target_IO_APIC_irq(irq, dest, cfg->vector); 2203 __target_IO_APIC_irq(irq, dest, cfg);
2124 spin_unlock_irqrestore(&ioapic_lock, flags); 2204 spin_unlock_irqrestore(&ioapic_lock, flags);
2125 } 2205 }
2126 2206
@@ -2142,14 +2222,14 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
2142 desc->affinity = mask; 2222 desc->affinity = mask;
2143} 2223}
2144 2224
2145static int migrate_irq_remapped_level(int irq) 2225static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2146{ 2226{
2147 int ret = -1; 2227 int ret = -1;
2148 struct irq_desc *desc = irq_to_desc(irq); 2228 struct irq_cfg *cfg = desc->chip_data;
2149 2229
2150 mask_IO_APIC_irq(irq); 2230 mask_IO_APIC_irq_desc(desc);
2151 2231
2152 if (io_apic_level_ack_pending(irq)) { 2232 if (io_apic_level_ack_pending(cfg)) {
2153 /* 2233 /*
2154 * Interrupt in progress. Migrating irq now will change the 2234 * Interrupt in progress. Migrating irq now will change the
2155 * vector information in the IO-APIC RTE and that will confuse 2235 * vector information in the IO-APIC RTE and that will confuse
@@ -2161,14 +2241,15 @@ static int migrate_irq_remapped_level(int irq)
2161 } 2241 }
2162 2242
2163 /* everthing is clear. we have right of way */ 2243 /* everthing is clear. we have right of way */
2164 migrate_ioapic_irq(irq, desc->pending_mask); 2244 migrate_ioapic_irq_desc(desc, desc->pending_mask);
2165 2245
2166 ret = 0; 2246 ret = 0;
2167 desc->status &= ~IRQ_MOVE_PENDING; 2247 desc->status &= ~IRQ_MOVE_PENDING;
2168 cpus_clear(desc->pending_mask); 2248 cpus_clear(desc->pending_mask);
2169 2249
2170unmask: 2250unmask:
2171 unmask_IO_APIC_irq(irq); 2251 unmask_IO_APIC_irq_desc(desc);
2252
2172 return ret; 2253 return ret;
2173} 2254}
2174 2255
@@ -2178,6 +2259,9 @@ static void ir_irq_migration(struct work_struct *work)
2178 struct irq_desc *desc; 2259 struct irq_desc *desc;
2179 2260
2180 for_each_irq_desc(irq, desc) { 2261 for_each_irq_desc(irq, desc) {
2262 if (!desc)
2263 continue;
2264
2181 if (desc->status & IRQ_MOVE_PENDING) { 2265 if (desc->status & IRQ_MOVE_PENDING) {
2182 unsigned long flags; 2266 unsigned long flags;
2183 2267
@@ -2198,19 +2282,24 @@ static void ir_irq_migration(struct work_struct *work)
2198/* 2282/*
2199 * Migrates the IRQ destination in the process context. 2283 * Migrates the IRQ destination in the process context.
2200 */ 2284 */
2201static void set_ir_ioapic_affinity_irq(unsigned int irq, 2285static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2202 const struct cpumask *mask) 2286 const struct cpumask *mask)
2203{ 2287{
2204 struct irq_desc *desc = irq_to_desc(irq);
2205
2206 if (desc->status & IRQ_LEVEL) { 2288 if (desc->status & IRQ_LEVEL) {
2207 desc->status |= IRQ_MOVE_PENDING; 2289 desc->status |= IRQ_MOVE_PENDING;
2208 cpumask_copy(&desc->pending_mask, mask); 2290 cpumask_copy(&desc->pending_mask, mask);
2209 migrate_irq_remapped_level(irq); 2291 migrate_irq_remapped_level_desc(desc);
2210 return; 2292 return;
2211 } 2293 }
2212 2294
2213 migrate_ioapic_irq(irq, *mask); 2295 migrate_ioapic_irq_desc(desc, mask);
2296}
2297static void set_ir_ioapic_affinity_irq(unsigned int irq,
2298 const struct cpumask *mask)
2299{
2300 struct irq_desc *desc = irq_to_desc(irq);
2301
2302 set_ir_ioapic_affinity_irq_desc(desc, mask);
2214} 2303}
2215#endif 2304#endif
2216 2305
@@ -2230,6 +2319,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2230 struct irq_cfg *cfg; 2319 struct irq_cfg *cfg;
2231 irq = __get_cpu_var(vector_irq)[vector]; 2320 irq = __get_cpu_var(vector_irq)[vector];
2232 2321
2322 if (irq == -1)
2323 continue;
2324
2233 desc = irq_to_desc(irq); 2325 desc = irq_to_desc(irq);
2234 if (!desc) 2326 if (!desc)
2235 continue; 2327 continue;
@@ -2251,9 +2343,10 @@ unlock:
2251 irq_exit(); 2343 irq_exit();
2252} 2344}
2253 2345
2254static void irq_complete_move(unsigned int irq) 2346static void irq_complete_move(struct irq_desc **descp)
2255{ 2347{
2256 struct irq_cfg *cfg = irq_cfg(irq); 2348 struct irq_desc *desc = *descp;
2349 struct irq_cfg *cfg = desc->chip_data;
2257 unsigned vector, me; 2350 unsigned vector, me;
2258 2351
2259 if (likely(!cfg->move_in_progress)) 2352 if (likely(!cfg->move_in_progress))
@@ -2271,8 +2364,9 @@ static void irq_complete_move(unsigned int irq)
2271 } 2364 }
2272} 2365}
2273#else 2366#else
2274static inline void irq_complete_move(unsigned int irq) {} 2367static inline void irq_complete_move(struct irq_desc **descp) {}
2275#endif 2368#endif
2369
2276#ifdef CONFIG_INTR_REMAP 2370#ifdef CONFIG_INTR_REMAP
2277static void ack_x2apic_level(unsigned int irq) 2371static void ack_x2apic_level(unsigned int irq)
2278{ 2372{
@@ -2283,11 +2377,14 @@ static void ack_x2apic_edge(unsigned int irq)
2283{ 2377{
2284 ack_x2APIC_irq(); 2378 ack_x2APIC_irq();
2285} 2379}
2380
2286#endif 2381#endif
2287 2382
2288static void ack_apic_edge(unsigned int irq) 2383static void ack_apic_edge(unsigned int irq)
2289{ 2384{
2290 irq_complete_move(irq); 2385 struct irq_desc *desc = irq_to_desc(irq);
2386
2387 irq_complete_move(&desc);
2291 move_native_irq(irq); 2388 move_native_irq(irq);
2292 ack_APIC_irq(); 2389 ack_APIC_irq();
2293} 2390}
@@ -2296,18 +2393,21 @@ atomic_t irq_mis_count;
2296 2393
2297static void ack_apic_level(unsigned int irq) 2394static void ack_apic_level(unsigned int irq)
2298{ 2395{
2396 struct irq_desc *desc = irq_to_desc(irq);
2397
2299#ifdef CONFIG_X86_32 2398#ifdef CONFIG_X86_32
2300 unsigned long v; 2399 unsigned long v;
2301 int i; 2400 int i;
2302#endif 2401#endif
2402 struct irq_cfg *cfg;
2303 int do_unmask_irq = 0; 2403 int do_unmask_irq = 0;
2304 2404
2305 irq_complete_move(irq); 2405 irq_complete_move(&desc);
2306#ifdef CONFIG_GENERIC_PENDING_IRQ 2406#ifdef CONFIG_GENERIC_PENDING_IRQ
2307 /* If we are moving the irq we need to mask it */ 2407 /* If we are moving the irq we need to mask it */
2308 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { 2408 if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
2309 do_unmask_irq = 1; 2409 do_unmask_irq = 1;
2310 mask_IO_APIC_irq(irq); 2410 mask_IO_APIC_irq_desc(desc);
2311 } 2411 }
2312#endif 2412#endif
2313 2413
@@ -2331,7 +2431,8 @@ static void ack_apic_level(unsigned int irq)
2331 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2431 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2332 * The idea is from Manfred Spraul. --macro 2432 * The idea is from Manfred Spraul. --macro
2333 */ 2433 */
2334 i = irq_cfg(irq)->vector; 2434 cfg = desc->chip_data;
2435 i = cfg->vector;
2335 2436
2336 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); 2437 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2337#endif 2438#endif
@@ -2370,17 +2471,18 @@ static void ack_apic_level(unsigned int irq)
2370 * accurate and is causing problems then it is a hardware bug 2471 * accurate and is causing problems then it is a hardware bug
2371 * and you can go talk to the chipset vendor about it. 2472 * and you can go talk to the chipset vendor about it.
2372 */ 2473 */
2373 if (!io_apic_level_ack_pending(irq)) 2474 cfg = desc->chip_data;
2475 if (!io_apic_level_ack_pending(cfg))
2374 move_masked_irq(irq); 2476 move_masked_irq(irq);
2375 unmask_IO_APIC_irq(irq); 2477 unmask_IO_APIC_irq_desc(desc);
2376 } 2478 }
2377 2479
2378#ifdef CONFIG_X86_32 2480#ifdef CONFIG_X86_32
2379 if (!(v & (1 << (i & 0x1f)))) { 2481 if (!(v & (1 << (i & 0x1f)))) {
2380 atomic_inc(&irq_mis_count); 2482 atomic_inc(&irq_mis_count);
2381 spin_lock(&ioapic_lock); 2483 spin_lock(&ioapic_lock);
2382 __mask_and_edge_IO_APIC_irq(irq); 2484 __mask_and_edge_IO_APIC_irq(cfg);
2383 __unmask_and_level_IO_APIC_irq(irq); 2485 __unmask_and_level_IO_APIC_irq(cfg);
2384 spin_unlock(&ioapic_lock); 2486 spin_unlock(&ioapic_lock);
2385 } 2487 }
2386#endif 2488#endif
@@ -2431,20 +2533,22 @@ static inline void init_IO_APIC_traps(void)
2431 * Also, we've got to be careful not to trash gate 2533 * Also, we've got to be careful not to trash gate
2432 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2534 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2433 */ 2535 */
2434 for_each_irq_cfg(irq, cfg) { 2536 for_each_irq_desc(irq, desc) {
2435 if (IO_APIC_IRQ(irq) && !cfg->vector) { 2537 if (!desc)
2538 continue;
2539
2540 cfg = desc->chip_data;
2541 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
2436 /* 2542 /*
2437 * Hmm.. We don't have an entry for this, 2543 * Hmm.. We don't have an entry for this,
2438 * so default to an old-fashioned 8259 2544 * so default to an old-fashioned 8259
2439 * interrupt if we can.. 2545 * interrupt if we can..
2440 */ 2546 */
2441 if (irq < 16) 2547 if (irq < NR_IRQS_LEGACY)
2442 make_8259A_irq(irq); 2548 make_8259A_irq(irq);
2443 else { 2549 else
2444 desc = irq_to_desc(irq);
2445 /* Strange. Oh, well.. */ 2550 /* Strange. Oh, well.. */
2446 desc->chip = &no_irq_chip; 2551 desc->chip = &no_irq_chip;
2447 }
2448 } 2552 }
2449 } 2553 }
2450} 2554}
@@ -2469,7 +2573,7 @@ static void unmask_lapic_irq(unsigned int irq)
2469 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2573 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2470} 2574}
2471 2575
2472static void ack_lapic_irq (unsigned int irq) 2576static void ack_lapic_irq(unsigned int irq)
2473{ 2577{
2474 ack_APIC_irq(); 2578 ack_APIC_irq();
2475} 2579}
@@ -2481,11 +2585,8 @@ static struct irq_chip lapic_chip __read_mostly = {
2481 .ack = ack_lapic_irq, 2585 .ack = ack_lapic_irq,
2482}; 2586};
2483 2587
2484static void lapic_register_intr(int irq) 2588static void lapic_register_intr(int irq, struct irq_desc *desc)
2485{ 2589{
2486 struct irq_desc *desc;
2487
2488 desc = irq_to_desc(irq);
2489 desc->status &= ~IRQ_LEVEL; 2590 desc->status &= ~IRQ_LEVEL;
2490 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2591 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2491 "edge"); 2592 "edge");
@@ -2589,7 +2690,9 @@ int timer_through_8259 __initdata;
2589 */ 2690 */
2590static inline void __init check_timer(void) 2691static inline void __init check_timer(void)
2591{ 2692{
2592 struct irq_cfg *cfg = irq_cfg(0); 2693 struct irq_desc *desc = irq_to_desc(0);
2694 struct irq_cfg *cfg = desc->chip_data;
2695 int cpu = boot_cpu_id;
2593 int apic1, pin1, apic2, pin2; 2696 int apic1, pin1, apic2, pin2;
2594 unsigned long flags; 2697 unsigned long flags;
2595 unsigned int ver; 2698 unsigned int ver;
@@ -2604,7 +2707,7 @@ static inline void __init check_timer(void)
2604 * get/set the timer IRQ vector: 2707 * get/set the timer IRQ vector:
2605 */ 2708 */
2606 disable_8259A_irq(0); 2709 disable_8259A_irq(0);
2607 assign_irq_vector(0, TARGET_CPUS); 2710 assign_irq_vector(0, cfg, TARGET_CPUS);
2608 2711
2609 /* 2712 /*
2610 * As IRQ0 is to be enabled in the 8259A, the virtual 2713 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2655,10 +2758,10 @@ static inline void __init check_timer(void)
2655 * Ok, does IRQ0 through the IOAPIC work? 2758 * Ok, does IRQ0 through the IOAPIC work?
2656 */ 2759 */
2657 if (no_pin1) { 2760 if (no_pin1) {
2658 add_pin_to_irq(0, apic1, pin1); 2761 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
2659 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2762 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2660 } 2763 }
2661 unmask_IO_APIC_irq(0); 2764 unmask_IO_APIC_irq_desc(desc);
2662 if (timer_irq_works()) { 2765 if (timer_irq_works()) {
2663 if (nmi_watchdog == NMI_IO_APIC) { 2766 if (nmi_watchdog == NMI_IO_APIC) {
2664 setup_nmi(); 2767 setup_nmi();
@@ -2684,9 +2787,9 @@ static inline void __init check_timer(void)
2684 /* 2787 /*
2685 * legacy devices should be connected to IO APIC #0 2788 * legacy devices should be connected to IO APIC #0
2686 */ 2789 */
2687 replace_pin_at_irq(0, apic1, pin1, apic2, pin2); 2790 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
2688 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2791 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2689 unmask_IO_APIC_irq(0); 2792 unmask_IO_APIC_irq_desc(desc);
2690 enable_8259A_irq(0); 2793 enable_8259A_irq(0);
2691 if (timer_irq_works()) { 2794 if (timer_irq_works()) {
2692 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 2795 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2718,7 +2821,7 @@ static inline void __init check_timer(void)
2718 apic_printk(APIC_QUIET, KERN_INFO 2821 apic_printk(APIC_QUIET, KERN_INFO
2719 "...trying to set up timer as Virtual Wire IRQ...\n"); 2822 "...trying to set up timer as Virtual Wire IRQ...\n");
2720 2823
2721 lapic_register_intr(0); 2824 lapic_register_intr(0, desc);
2722 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 2825 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
2723 enable_8259A_irq(0); 2826 enable_8259A_irq(0);
2724 2827
@@ -2903,22 +3006,26 @@ unsigned int create_irq_nr(unsigned int irq_want)
2903 unsigned int irq; 3006 unsigned int irq;
2904 unsigned int new; 3007 unsigned int new;
2905 unsigned long flags; 3008 unsigned long flags;
2906 struct irq_cfg *cfg_new; 3009 struct irq_cfg *cfg_new = NULL;
2907 3010 int cpu = boot_cpu_id;
2908 irq_want = nr_irqs - 1; 3011 struct irq_desc *desc_new = NULL;
2909 3012
2910 irq = 0; 3013 irq = 0;
2911 spin_lock_irqsave(&vector_lock, flags); 3014 spin_lock_irqsave(&vector_lock, flags);
2912 for (new = irq_want; new > 0; new--) { 3015 for (new = irq_want; new < NR_IRQS; new++) {
2913 if (platform_legacy_irq(new)) 3016 if (platform_legacy_irq(new))
2914 continue; 3017 continue;
2915 cfg_new = irq_cfg(new); 3018
2916 if (cfg_new && cfg_new->vector != 0) 3019 desc_new = irq_to_desc_alloc_cpu(new, cpu);
3020 if (!desc_new) {
3021 printk(KERN_INFO "can not get irq_desc for %d\n", new);
3022 continue;
3023 }
3024 cfg_new = desc_new->chip_data;
3025
3026 if (cfg_new->vector != 0)
2917 continue; 3027 continue;
2918 /* check if need to create one */ 3028 if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
2919 if (!cfg_new)
2920 cfg_new = irq_cfg_alloc(new);
2921 if (__assign_irq_vector(new, TARGET_CPUS) == 0)
2922 irq = new; 3029 irq = new;
2923 break; 3030 break;
2924 } 3031 }
@@ -2926,15 +3033,21 @@ unsigned int create_irq_nr(unsigned int irq_want)
2926 3033
2927 if (irq > 0) { 3034 if (irq > 0) {
2928 dynamic_irq_init(irq); 3035 dynamic_irq_init(irq);
3036 /* restore it, in case dynamic_irq_init clear it */
3037 if (desc_new)
3038 desc_new->chip_data = cfg_new;
2929 } 3039 }
2930 return irq; 3040 return irq;
2931} 3041}
2932 3042
3043static int nr_irqs_gsi = NR_IRQS_LEGACY;
2933int create_irq(void) 3044int create_irq(void)
2934{ 3045{
3046 unsigned int irq_want;
2935 int irq; 3047 int irq;
2936 3048
2937 irq = create_irq_nr(nr_irqs - 1); 3049 irq_want = nr_irqs_gsi;
3050 irq = create_irq_nr(irq_want);
2938 3051
2939 if (irq == 0) 3052 if (irq == 0)
2940 irq = -1; 3053 irq = -1;
@@ -2945,14 +3058,22 @@ int create_irq(void)
2945void destroy_irq(unsigned int irq) 3058void destroy_irq(unsigned int irq)
2946{ 3059{
2947 unsigned long flags; 3060 unsigned long flags;
3061 struct irq_cfg *cfg;
3062 struct irq_desc *desc;
2948 3063
3064 /* store it, in case dynamic_irq_cleanup clear it */
3065 desc = irq_to_desc(irq);
3066 cfg = desc->chip_data;
2949 dynamic_irq_cleanup(irq); 3067 dynamic_irq_cleanup(irq);
3068 /* connect back irq_cfg */
3069 if (desc)
3070 desc->chip_data = cfg;
2950 3071
2951#ifdef CONFIG_INTR_REMAP 3072#ifdef CONFIG_INTR_REMAP
2952 free_irte(irq); 3073 free_irte(irq);
2953#endif 3074#endif
2954 spin_lock_irqsave(&vector_lock, flags); 3075 spin_lock_irqsave(&vector_lock, flags);
2955 __clear_irq_vector(irq); 3076 __clear_irq_vector(irq, cfg);
2956 spin_unlock_irqrestore(&vector_lock, flags); 3077 spin_unlock_irqrestore(&vector_lock, flags);
2957} 3078}
2958 3079
@@ -2967,12 +3088,12 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2967 unsigned dest; 3088 unsigned dest;
2968 cpumask_t tmp; 3089 cpumask_t tmp;
2969 3090
3091 cfg = irq_cfg(irq);
2970 tmp = TARGET_CPUS; 3092 tmp = TARGET_CPUS;
2971 err = assign_irq_vector(irq, tmp); 3093 err = assign_irq_vector(irq, cfg, tmp);
2972 if (err) 3094 if (err)
2973 return err; 3095 return err;
2974 3096
2975 cfg = irq_cfg(irq);
2976 cpus_and(tmp, cfg->domain, tmp); 3097 cpus_and(tmp, cfg->domain, tmp);
2977 dest = cpu_mask_to_apicid(tmp); 3098 dest = cpu_mask_to_apicid(tmp);
2978 3099
@@ -3030,34 +3151,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3030#ifdef CONFIG_SMP 3151#ifdef CONFIG_SMP
3031static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3152static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3032{ 3153{
3154 struct irq_desc *desc = irq_to_desc(irq);
3033 struct irq_cfg *cfg; 3155 struct irq_cfg *cfg;
3034 struct msi_msg msg; 3156 struct msi_msg msg;
3035 unsigned int dest; 3157 unsigned int dest;
3036 cpumask_t tmp; 3158 cpumask_t tmp;
3037 struct irq_desc *desc;
3038 3159
3039 if (!cpumask_intersects(mask, cpu_online_mask)) 3160 if (!cpumask_intersects(mask, cpu_online_mask))
3040 return; 3161 return;
3041 3162
3042 if (assign_irq_vector(irq, *mask)) 3163 cfg = desc->chip_data;
3164 if (assign_irq_vector(irq, cfg, *mask))
3043 return; 3165 return;
3044 3166
3045 cfg = irq_cfg(irq); 3167 set_extra_move_desc(desc, *mask);
3168
3046 cpumask_and(&tmp, &cfg->domain, mask); 3169 cpumask_and(&tmp, &cfg->domain, mask);
3047 dest = cpu_mask_to_apicid(tmp); 3170 dest = cpu_mask_to_apicid(tmp);
3048 3171
3049 read_msi_msg(irq, &msg); 3172 read_msi_msg_desc(desc, &msg);
3050 3173
3051 msg.data &= ~MSI_DATA_VECTOR_MASK; 3174 msg.data &= ~MSI_DATA_VECTOR_MASK;
3052 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3175 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3053 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3176 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3054 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3177 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3055 3178
3056 write_msi_msg(irq, &msg); 3179 write_msi_msg_desc(desc, &msg);
3057 desc = irq_to_desc(irq);
3058 cpumask_copy(&desc->affinity, mask); 3180 cpumask_copy(&desc->affinity, mask);
3059} 3181}
3060
3061#ifdef CONFIG_INTR_REMAP 3182#ifdef CONFIG_INTR_REMAP
3062/* 3183/*
3063 * Migrate the MSI irq to another cpumask. This migration is 3184 * Migrate the MSI irq to another cpumask. This migration is
@@ -3066,11 +3187,11 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3066static void ir_set_msi_irq_affinity(unsigned int irq, 3187static void ir_set_msi_irq_affinity(unsigned int irq,
3067 const struct cpumask *mask) 3188 const struct cpumask *mask)
3068{ 3189{
3190 struct irq_desc *desc = irq_to_desc(irq);
3069 struct irq_cfg *cfg; 3191 struct irq_cfg *cfg;
3070 unsigned int dest; 3192 unsigned int dest;
3071 cpumask_t tmp, cleanup_mask; 3193 cpumask_t tmp, cleanup_mask;
3072 struct irte irte; 3194 struct irte irte;
3073 struct irq_desc *desc;
3074 3195
3075 if (!cpumask_intersects(mask, cpu_online_mask)) 3196 if (!cpumask_intersects(mask, cpu_online_mask))
3076 return; 3197 return;
@@ -3078,10 +3199,12 @@ static void ir_set_msi_irq_affinity(unsigned int irq,
3078 if (get_irte(irq, &irte)) 3199 if (get_irte(irq, &irte))
3079 return; 3200 return;
3080 3201
3081 if (assign_irq_vector(irq, *mask)) 3202 cfg = desc->chip_data;
3203 if (assign_irq_vector(irq, cfg, *mask))
3082 return; 3204 return;
3083 3205
3084 cfg = irq_cfg(irq); 3206 set_extra_move_desc(desc, mask);
3207
3085 cpumask_and(&tmp, &cfg->domain, mask); 3208 cpumask_and(&tmp, &cfg->domain, mask);
3086 dest = cpu_mask_to_apicid(tmp); 3209 dest = cpu_mask_to_apicid(tmp);
3087 3210
@@ -3105,9 +3228,9 @@ static void ir_set_msi_irq_affinity(unsigned int irq,
3105 cfg->move_in_progress = 0; 3228 cfg->move_in_progress = 0;
3106 } 3229 }
3107 3230
3108 desc = irq_to_desc(irq);
3109 cpumask_copy(&desc->affinity, mask); 3231 cpumask_copy(&desc->affinity, mask);
3110} 3232}
3233
3111#endif 3234#endif
3112#endif /* CONFIG_SMP */ 3235#endif /* CONFIG_SMP */
3113 3236
@@ -3166,7 +3289,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3166} 3289}
3167#endif 3290#endif
3168 3291
3169static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) 3292static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3170{ 3293{
3171 int ret; 3294 int ret;
3172 struct msi_msg msg; 3295 struct msi_msg msg;
@@ -3175,7 +3298,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
3175 if (ret < 0) 3298 if (ret < 0)
3176 return ret; 3299 return ret;
3177 3300
3178 set_irq_msi(irq, desc); 3301 set_irq_msi(irq, msidesc);
3179 write_msi_msg(irq, &msg); 3302 write_msi_msg(irq, &msg);
3180 3303
3181#ifdef CONFIG_INTR_REMAP 3304#ifdef CONFIG_INTR_REMAP
@@ -3195,26 +3318,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
3195 return 0; 3318 return 0;
3196} 3319}
3197 3320
3198static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) 3321int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
3199{
3200 unsigned int irq;
3201
3202 irq = dev->bus->number;
3203 irq <<= 8;
3204 irq |= dev->devfn;
3205 irq <<= 12;
3206
3207 return irq;
3208}
3209
3210int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
3211{ 3322{
3212 unsigned int irq; 3323 unsigned int irq;
3213 int ret; 3324 int ret;
3214 unsigned int irq_want; 3325 unsigned int irq_want;
3215 3326
3216 irq_want = build_irq_for_pci_dev(dev) + 0x100; 3327 irq_want = nr_irqs_gsi;
3217
3218 irq = create_irq_nr(irq_want); 3328 irq = create_irq_nr(irq_want);
3219 if (irq == 0) 3329 if (irq == 0)
3220 return -1; 3330 return -1;
@@ -3228,7 +3338,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
3228 goto error; 3338 goto error;
3229no_ir: 3339no_ir:
3230#endif 3340#endif
3231 ret = setup_msi_irq(dev, desc, irq); 3341 ret = setup_msi_irq(dev, msidesc, irq);
3232 if (ret < 0) { 3342 if (ret < 0) {
3233 destroy_irq(irq); 3343 destroy_irq(irq);
3234 return ret; 3344 return ret;
@@ -3246,7 +3356,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3246{ 3356{
3247 unsigned int irq; 3357 unsigned int irq;
3248 int ret, sub_handle; 3358 int ret, sub_handle;
3249 struct msi_desc *desc; 3359 struct msi_desc *msidesc;
3250 unsigned int irq_want; 3360 unsigned int irq_want;
3251 3361
3252#ifdef CONFIG_INTR_REMAP 3362#ifdef CONFIG_INTR_REMAP
@@ -3254,10 +3364,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3254 int index = 0; 3364 int index = 0;
3255#endif 3365#endif
3256 3366
3257 irq_want = build_irq_for_pci_dev(dev) + 0x100; 3367 irq_want = nr_irqs_gsi;
3258 sub_handle = 0; 3368 sub_handle = 0;
3259 list_for_each_entry(desc, &dev->msi_list, list) { 3369 list_for_each_entry(msidesc, &dev->msi_list, list) {
3260 irq = create_irq_nr(irq_want--); 3370 irq = create_irq_nr(irq_want);
3371 irq_want++;
3261 if (irq == 0) 3372 if (irq == 0)
3262 return -1; 3373 return -1;
3263#ifdef CONFIG_INTR_REMAP 3374#ifdef CONFIG_INTR_REMAP
@@ -3289,7 +3400,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3289 } 3400 }
3290no_ir: 3401no_ir:
3291#endif 3402#endif
3292 ret = setup_msi_irq(dev, desc, irq); 3403 ret = setup_msi_irq(dev, msidesc, irq);
3293 if (ret < 0) 3404 if (ret < 0)
3294 goto error; 3405 goto error;
3295 sub_handle++; 3406 sub_handle++;
@@ -3310,19 +3421,21 @@ void arch_teardown_msi_irq(unsigned int irq)
3310#ifdef CONFIG_SMP 3421#ifdef CONFIG_SMP
3311static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3422static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3312{ 3423{
3424 struct irq_desc *desc = irq_to_desc(irq);
3313 struct irq_cfg *cfg; 3425 struct irq_cfg *cfg;
3314 struct msi_msg msg; 3426 struct msi_msg msg;
3315 unsigned int dest; 3427 unsigned int dest;
3316 cpumask_t tmp; 3428 cpumask_t tmp;
3317 struct irq_desc *desc;
3318 3429
3319 if (!cpumask_intersects(mask, cpu_online_mask)) 3430 if (!cpumask_intersects(mask, cpu_online_mask))
3320 return; 3431 return;
3321 3432
3322 if (assign_irq_vector(irq, *mask)) 3433 cfg = desc->chip_data;
3434 if (assign_irq_vector(irq, cfg, *mask))
3323 return; 3435 return;
3324 3436
3325 cfg = irq_cfg(irq); 3437 set_extra_move_desc(desc, *mask);
3438
3326 cpumask_and(&tmp, &cfg->domain, mask); 3439 cpumask_and(&tmp, &cfg->domain, mask);
3327 dest = cpu_mask_to_apicid(tmp); 3440 dest = cpu_mask_to_apicid(tmp);
3328 3441
@@ -3334,9 +3447,9 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3334 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3447 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3335 3448
3336 dmar_msi_write(irq, &msg); 3449 dmar_msi_write(irq, &msg);
3337 desc = irq_to_desc(irq);
3338 cpumask_copy(&desc->affinity, mask); 3450 cpumask_copy(&desc->affinity, mask);
3339} 3451}
3452
3340#endif /* CONFIG_SMP */ 3453#endif /* CONFIG_SMP */
3341 3454
3342struct irq_chip dmar_msi_type = { 3455struct irq_chip dmar_msi_type = {
@@ -3370,8 +3483,8 @@ int arch_setup_dmar_msi(unsigned int irq)
3370#ifdef CONFIG_SMP 3483#ifdef CONFIG_SMP
3371static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3484static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3372{ 3485{
3486 struct irq_desc *desc = irq_to_desc(irq);
3373 struct irq_cfg *cfg; 3487 struct irq_cfg *cfg;
3374 struct irq_desc *desc;
3375 struct msi_msg msg; 3488 struct msi_msg msg;
3376 unsigned int dest; 3489 unsigned int dest;
3377 cpumask_t tmp; 3490 cpumask_t tmp;
@@ -3379,10 +3492,12 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3379 if (!cpumask_intersects(mask, cpu_online_mask)) 3492 if (!cpumask_intersects(mask, cpu_online_mask))
3380 return; 3493 return;
3381 3494
3382 if (assign_irq_vector(irq, *mask)) 3495 cfg = desc->chip_data;
3496 if (assign_irq_vector(irq, cfg, *mask))
3383 return; 3497 return;
3384 3498
3385 cfg = irq_cfg(irq); 3499 set_extra_move_desc(desc, *mask);
3500
3386 cpumask_and(&tmp, &cfg->domain, mask); 3501 cpumask_and(&tmp, &cfg->domain, mask);
3387 dest = cpu_mask_to_apicid(tmp); 3502 dest = cpu_mask_to_apicid(tmp);
3388 3503
@@ -3394,9 +3509,9 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3394 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3509 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3395 3510
3396 hpet_msi_write(irq, &msg); 3511 hpet_msi_write(irq, &msg);
3397 desc = irq_to_desc(irq);
3398 cpumask_copy(&desc->affinity, mask); 3512 cpumask_copy(&desc->affinity, mask);
3399} 3513}
3514
3400#endif /* CONFIG_SMP */ 3515#endif /* CONFIG_SMP */
3401 3516
3402struct irq_chip hpet_msi_type = { 3517struct irq_chip hpet_msi_type = {
@@ -3451,25 +3566,27 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3451 3566
3452static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) 3567static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3453{ 3568{
3569 struct irq_desc *desc = irq_to_desc(irq);
3454 struct irq_cfg *cfg; 3570 struct irq_cfg *cfg;
3455 unsigned int dest; 3571 unsigned int dest;
3456 cpumask_t tmp; 3572 cpumask_t tmp;
3457 struct irq_desc *desc;
3458 3573
3459 if (!cpumask_intersects(mask, cpu_online_mask)) 3574 if (!cpumask_intersects(mask, cpu_online_mask))
3460 return; 3575 return;
3461 3576
3462 if (assign_irq_vector(irq, *mask)) 3577 cfg = desc->chip_data;
3578 if (assign_irq_vector(irq, cfg, *mask))
3463 return; 3579 return;
3464 3580
3465 cfg = irq_cfg(irq); 3581 set_extra_move_desc(desc, *mask);
3582
3466 cpumask_and(&tmp, &cfg->domain, mask); 3583 cpumask_and(&tmp, &cfg->domain, mask);
3467 dest = cpu_mask_to_apicid(tmp); 3584 dest = cpu_mask_to_apicid(tmp);
3468 3585
3469 target_ht_irq(irq, dest, cfg->vector); 3586 target_ht_irq(irq, dest, cfg->vector);
3470 desc = irq_to_desc(irq);
3471 cpumask_copy(&desc->affinity, mask); 3587 cpumask_copy(&desc->affinity, mask);
3472} 3588}
3589
3473#endif 3590#endif
3474 3591
3475static struct irq_chip ht_irq_chip = { 3592static struct irq_chip ht_irq_chip = {
@@ -3489,13 +3606,13 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3489 int err; 3606 int err;
3490 cpumask_t tmp; 3607 cpumask_t tmp;
3491 3608
3609 cfg = irq_cfg(irq);
3492 tmp = TARGET_CPUS; 3610 tmp = TARGET_CPUS;
3493 err = assign_irq_vector(irq, tmp); 3611 err = assign_irq_vector(irq, cfg, tmp);
3494 if (!err) { 3612 if (!err) {
3495 struct ht_irq_msg msg; 3613 struct ht_irq_msg msg;
3496 unsigned dest; 3614 unsigned dest;
3497 3615
3498 cfg = irq_cfg(irq);
3499 cpus_and(tmp, cfg->domain, tmp); 3616 cpus_and(tmp, cfg->domain, tmp);
3500 dest = cpu_mask_to_apicid(tmp); 3617 dest = cpu_mask_to_apicid(tmp);
3501 3618
@@ -3541,7 +3658,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3541 unsigned long flags; 3658 unsigned long flags;
3542 int err; 3659 int err;
3543 3660
3544 err = assign_irq_vector(irq, *eligible_cpu); 3661 cfg = irq_cfg(irq);
3662
3663 err = assign_irq_vector(irq, cfg, *eligible_cpu);
3545 if (err != 0) 3664 if (err != 0)
3546 return err; 3665 return err;
3547 3666
@@ -3550,8 +3669,6 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3550 irq_name); 3669 irq_name);
3551 spin_unlock_irqrestore(&vector_lock, flags); 3670 spin_unlock_irqrestore(&vector_lock, flags);
3552 3671
3553 cfg = irq_cfg(irq);
3554
3555 mmr_value = 0; 3672 mmr_value = 0;
3556 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3673 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3557 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); 3674 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3603,9 +3720,16 @@ int __init io_apic_get_redir_entries (int ioapic)
3603 return reg_01.bits.entries; 3720 return reg_01.bits.entries;
3604} 3721}
3605 3722
3606int __init probe_nr_irqs(void) 3723void __init probe_nr_irqs_gsi(void)
3607{ 3724{
3608 return NR_IRQS; 3725 int idx;
3726 int nr = 0;
3727
3728 for (idx = 0; idx < nr_ioapics; idx++)
3729 nr += io_apic_get_redir_entries(idx) + 1;
3730
3731 if (nr > nr_irqs_gsi)
3732 nr_irqs_gsi = nr;
3609} 3733}
3610 3734
3611/* -------------------------------------------------------------------------- 3735/* --------------------------------------------------------------------------
@@ -3704,19 +3828,31 @@ int __init io_apic_get_version(int ioapic)
3704 3828
3705int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) 3829int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
3706{ 3830{
3831 struct irq_desc *desc;
3832 struct irq_cfg *cfg;
3833 int cpu = boot_cpu_id;
3834
3707 if (!IO_APIC_IRQ(irq)) { 3835 if (!IO_APIC_IRQ(irq)) {
3708 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", 3836 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3709 ioapic); 3837 ioapic);
3710 return -EINVAL; 3838 return -EINVAL;
3711 } 3839 }
3712 3840
3841 desc = irq_to_desc_alloc_cpu(irq, cpu);
3842 if (!desc) {
3843 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3844 return 0;
3845 }
3846
3713 /* 3847 /*
3714 * IRQs < 16 are already in the irq_2_pin[] map 3848 * IRQs < 16 are already in the irq_2_pin[] map
3715 */ 3849 */
3716 if (irq >= 16) 3850 if (irq >= NR_IRQS_LEGACY) {
3717 add_pin_to_irq(irq, ioapic, pin); 3851 cfg = desc->chip_data;
3852 add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
3853 }
3718 3854
3719 setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); 3855 setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
3720 3856
3721 return 0; 3857 return 0;
3722} 3858}
@@ -3770,9 +3906,10 @@ void __init setup_ioapic_dest(void)
3770 * when you have too many devices, because at that time only boot 3906 * when you have too many devices, because at that time only boot
3771 * cpu is online. 3907 * cpu is online.
3772 */ 3908 */
3773 cfg = irq_cfg(irq); 3909 desc = irq_to_desc(irq);
3910 cfg = desc->chip_data;
3774 if (!cfg->vector) { 3911 if (!cfg->vector) {
3775 setup_IO_APIC_irq(ioapic, pin, irq, 3912 setup_IO_APIC_irq(ioapic, pin, irq, desc,
3776 irq_trigger(irq_entry), 3913 irq_trigger(irq_entry),
3777 irq_polarity(irq_entry)); 3914 irq_polarity(irq_entry));
3778 continue; 3915 continue;
@@ -3782,7 +3919,6 @@ void __init setup_ioapic_dest(void)
3782 /* 3919 /*
3783 * Honour affinities which have been set in early boot 3920 * Honour affinities which have been set in early boot
3784 */ 3921 */
3785 desc = irq_to_desc(irq);
3786 if (desc->status & 3922 if (desc->status &
3787 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 3923 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
3788 mask = desc->affinity; 3924 mask = desc->affinity;
@@ -3791,10 +3927,10 @@ void __init setup_ioapic_dest(void)
3791 3927
3792#ifdef CONFIG_INTR_REMAP 3928#ifdef CONFIG_INTR_REMAP
3793 if (intr_remapping_enabled) 3929 if (intr_remapping_enabled)
3794 set_ir_ioapic_affinity_irq(irq, &mask); 3930 set_ir_ioapic_affinity_irq_desc(desc, &mask);
3795 else 3931 else
3796#endif 3932#endif
3797 set_ioapic_affinity_irq(irq, &mask); 3933 set_ioapic_affinity_irq_desc(desc, &mask);
3798 } 3934 }
3799 3935
3800 } 3936 }
@@ -3843,7 +3979,6 @@ void __init ioapic_init_mappings(void)
3843 struct resource *ioapic_res; 3979 struct resource *ioapic_res;
3844 int i; 3980 int i;
3845 3981
3846 irq_2_pin_init();
3847 ioapic_res = ioapic_setup_resources(); 3982 ioapic_res = ioapic_setup_resources();
3848 for (i = 0; i < nr_ioapics; i++) { 3983 for (i = 0; i < nr_ioapics; i++) {
3849 if (smp_found_config) { 3984 if (smp_found_config) {
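
The io_apic.c hunks above converge on one pattern: each IRQ's irq_cfg now lives in its descriptor as desc->chip_data, descriptors are allocated on demand with irq_to_desc_alloc_cpu(), and the new *_desc() helpers take the descriptor directly so hot paths look it up once instead of calling irq_to_desc()/irq_cfg() repeatedly. A minimal sketch of that calling convention, assuming only the interfaces visible in the diff (example_setup_irq() itself is hypothetical, not part of the patch):

static int example_setup_irq(unsigned int irq, int cpu)
{
	struct irq_desc *desc;
	struct irq_cfg *cfg;

	/* allocate (or find) the descriptor on the requesting cpu's node */
	desc = irq_to_desc_alloc_cpu(irq, cpu);
	if (!desc)
		return -ENOMEM;

	/* the per-IRQ vector/domain state now hangs off the descriptor */
	cfg = desc->chip_data;
	if (assign_irq_vector(irq, cfg, TARGET_CPUS))
		return -ENOSPC;

	/* later helpers receive desc (and cfg) directly, avoiding re-lookup */
	return 0;
}
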
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f649..3f1d9d18df67 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -118,6 +118,9 @@ int show_interrupts(struct seq_file *p, void *v)
118 } 118 }
119 119
120 desc = irq_to_desc(i); 120 desc = irq_to_desc(i);
121 if (!desc)
122 return 0;
123
121 spin_lock_irqsave(&desc->lock, flags); 124 spin_lock_irqsave(&desc->lock, flags);
122#ifndef CONFIG_SMP 125#ifndef CONFIG_SMP
123 any_count = kstat_irqs(i); 126 any_count = kstat_irqs(i);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 87870a49be4e..9cf9cbbf7a02 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -242,6 +242,8 @@ void fixup_irqs(cpumask_t map)
242 for_each_irq_desc(irq, desc) { 242 for_each_irq_desc(irq, desc) {
243 cpumask_t mask; 243 cpumask_t mask;
244 244
245 if (!desc)
246 continue;
245 if (irq == 2) 247 if (irq == 2)
246 continue; 248 continue;
247 249
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 7d37f847544d..27f2307b0a34 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -94,6 +94,8 @@ void fixup_irqs(cpumask_t map)
94 int break_affinity = 0; 94 int break_affinity = 0;
95 int set_affinity = 1; 95 int set_affinity = 1;
96 96
97 if (!desc)
98 continue;
97 if (irq == 2) 99 if (irq == 2)
98 continue; 100 continue;
99 101
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 845aa9803e80..6a92f47c52e7 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
68 /* 68 /*
69 * 16 old-style INTA-cycle interrupts: 69 * 16 old-style INTA-cycle interrupts:
70 */ 70 */
71 for (i = 0; i < 16; i++) { 71 for (i = 0; i < NR_IRQS_LEGACY; i++) {
72 /* first time call this irq_desc */
73 struct irq_desc *desc = irq_to_desc(i); 72 struct irq_desc *desc = irq_to_desc(i);
74 73
75 desc->status = IRQ_DISABLED; 74 desc->status = IRQ_DISABLED;
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ff0235391285..40c1e62ec785 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -142,8 +142,7 @@ void __init init_ISA_irqs(void)
142 init_bsp_APIC(); 142 init_bsp_APIC();
143 init_8259A(0); 143 init_8259A(0);
144 144
145 for (i = 0; i < 16; i++) { 145 for (i = 0; i < NR_IRQS_LEGACY; i++) {
146 /* first time call this irq_desc */
147 struct irq_desc *desc = irq_to_desc(i); 146 struct irq_desc *desc = irq_to_desc(i);
148 147
149 desc->status = IRQ_DISABLED; 148 desc->status = IRQ_DISABLED;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 0f4c1fd5a1f4..45e3b69808ba 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -586,26 +586,23 @@ static void __init __get_smp_config(unsigned int early)
586{ 586{
587 struct intel_mp_floating *mpf = mpf_found; 587 struct intel_mp_floating *mpf = mpf_found;
588 588
589 if (x86_quirks->mach_get_smp_config) { 589 if (!mpf)
590 if (x86_quirks->mach_get_smp_config(early)) 590 return;
591 return; 591
592 }
593 if (acpi_lapic && early) 592 if (acpi_lapic && early)
594 return; 593 return;
594
595 /* 595 /*
596 * ACPI supports both logical (e.g. Hyper-Threading) and physical 596 * MPS doesn't support hyperthreading, aka only have
597 * processors, where MPS only supports physical. 597 * thread 0 apic id in MPS table
598 */ 598 */
599 if (acpi_lapic && acpi_ioapic) { 599 if (acpi_lapic && acpi_ioapic)
600 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
601 "information\n");
602 return; 600 return;
603 } else if (acpi_lapic)
604 printk(KERN_INFO "Using ACPI for processor (LAPIC) "
605 "configuration information\n");
606 601
607 if (!mpf) 602 if (x86_quirks->mach_get_smp_config) {
608 return; 603 if (x86_quirks->mach_get_smp_config(early))
604 return;
605 }
609 606
610 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 607 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
611 mpf->mpf_specification); 608 mpf->mpf_specification);
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 4caff39078e0..0deea37a53cf 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,7 +31,7 @@
31#include <asm/numaq.h> 31#include <asm/numaq.h>
32#include <asm/topology.h> 32#include <asm/topology.h>
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h> 34#include <asm/genapic.h>
35#include <asm/e820.h> 35#include <asm/e820.h>
36#include <asm/setup.h> 36#include <asm/setup.h>
37 37
@@ -235,6 +235,13 @@ static int __init numaq_setup_ioapic_ids(void)
235 return 1; 235 return 1;
236} 236}
237 237
238static int __init numaq_update_genapic(void)
239{
240 genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
241
242 return 0;
243}
244
238static struct x86_quirks numaq_x86_quirks __initdata = { 245static struct x86_quirks numaq_x86_quirks __initdata = {
239 .arch_pre_time_init = numaq_pre_time_init, 246 .arch_pre_time_init = numaq_pre_time_init,
240 .arch_time_init = NULL, 247 .arch_time_init = NULL,
@@ -250,6 +257,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
250 .mpc_oem_pci_bus = mpc_oem_pci_bus, 257 .mpc_oem_pci_bus = mpc_oem_pci_bus,
251 .smp_read_mpc_oem = smp_read_mpc_oem, 258 .smp_read_mpc_oem = smp_read_mpc_oem,
252 .setup_ioapic_ids = numaq_setup_ioapic_ids, 259 .setup_ioapic_ids = numaq_setup_ioapic_ids,
260 .update_genapic = numaq_update_genapic,
253}; 261};
254 262
255void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, 263void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c622772744d8..95d811a9594f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,7 +7,9 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/pm.h> 8#include <linux/pm.h>
9#include <linux/clockchips.h> 9#include <linux/clockchips.h>
10#include <linux/ftrace.h>
10#include <asm/system.h> 11#include <asm/system.h>
12#include <asm/apic.h>
11 13
12unsigned long idle_halt; 14unsigned long idle_halt;
13EXPORT_SYMBOL(idle_halt); 15EXPORT_SYMBOL(idle_halt);
@@ -100,6 +102,9 @@ static inline int hlt_use_halt(void)
100void default_idle(void) 102void default_idle(void)
101{ 103{
102 if (hlt_use_halt()) { 104 if (hlt_use_halt()) {
105 struct power_trace it;
106
107 trace_power_start(&it, POWER_CSTATE, 1);
103 current_thread_info()->status &= ~TS_POLLING; 108 current_thread_info()->status &= ~TS_POLLING;
104 /* 109 /*
105 * TS_POLLING-cleared state must be visible before we 110 * TS_POLLING-cleared state must be visible before we
@@ -112,6 +117,7 @@ void default_idle(void)
112 else 117 else
113 local_irq_enable(); 118 local_irq_enable();
114 current_thread_info()->status |= TS_POLLING; 119 current_thread_info()->status |= TS_POLLING;
120 trace_power_end(&it);
115 } else { 121 } else {
116 local_irq_enable(); 122 local_irq_enable();
117 /* loop is done by the caller */ 123 /* loop is done by the caller */
@@ -122,6 +128,21 @@ void default_idle(void)
122EXPORT_SYMBOL(default_idle); 128EXPORT_SYMBOL(default_idle);
123#endif 129#endif
124 130
131void stop_this_cpu(void *dummy)
132{
133 local_irq_disable();
134 /*
135 * Remove this CPU:
136 */
137 cpu_clear(smp_processor_id(), cpu_online_map);
138 disable_local_APIC();
139
140 for (;;) {
141 if (hlt_works(smp_processor_id()))
142 halt();
143 }
144}
145
125static void do_nothing(void *unused) 146static void do_nothing(void *unused)
126{ 147{
127} 148}
@@ -154,24 +175,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
154 */ 175 */
155void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 176void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
156{ 177{
178 struct power_trace it;
179
180 trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
157 if (!need_resched()) { 181 if (!need_resched()) {
158 __monitor((void *)&current_thread_info()->flags, 0, 0); 182 __monitor((void *)&current_thread_info()->flags, 0, 0);
159 smp_mb(); 183 smp_mb();
160 if (!need_resched()) 184 if (!need_resched())
161 __mwait(ax, cx); 185 __mwait(ax, cx);
162 } 186 }
187 trace_power_end(&it);
163} 188}
164 189
165/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 190/* Default MONITOR/MWAIT with no hints, used for default C1 state */
166static void mwait_idle(void) 191static void mwait_idle(void)
167{ 192{
193 struct power_trace it;
168 if (!need_resched()) { 194 if (!need_resched()) {
195 trace_power_start(&it, POWER_CSTATE, 1);
169 __monitor((void *)&current_thread_info()->flags, 0, 0); 196 __monitor((void *)&current_thread_info()->flags, 0, 0);
170 smp_mb(); 197 smp_mb();
171 if (!need_resched()) 198 if (!need_resched())
172 __sti_mwait(0, 0); 199 __sti_mwait(0, 0);
173 else 200 else
174 local_irq_enable(); 201 local_irq_enable();
202 trace_power_end(&it);
175 } else 203 } else
176 local_irq_enable(); 204 local_irq_enable();
177} 205}
@@ -183,9 +211,13 @@ static void mwait_idle(void)
183 */ 211 */
184static void poll_idle(void) 212static void poll_idle(void)
185{ 213{
214 struct power_trace it;
215
216 trace_power_start(&it, POWER_CSTATE, 0);
186 local_irq_enable(); 217 local_irq_enable();
187 while (!need_resched()) 218 while (!need_resched())
188 cpu_relax(); 219 cpu_relax();
220 trace_power_end(&it);
189} 221}
190 222
191/* 223/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d45..24c2276aa453 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h> 40#include <linux/dmi.h>
41#include <linux/ftrace.h>
41 42
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
@@ -548,7 +549,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
548 * the task-switch, and shows up in ret_from_fork in entry.S, 549 * the task-switch, and shows up in ret_from_fork in entry.S,
549 * for example. 550 * for example.
550 */ 551 */
551struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 552__notrace_funcgraph struct task_struct *
553__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
552{ 554{
553 struct thread_struct *prev = &prev_p->thread, 555 struct thread_struct *prev = &prev_p->thread,
554 *next = &next_p->thread; 556 *next = &next_p->thread;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c958120fb1b6..fbb321d53d34 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -39,6 +39,7 @@
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/uaccess.h> 40#include <linux/uaccess.h>
41#include <linux/io.h> 41#include <linux/io.h>
42#include <linux/ftrace.h>
42 43
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
44#include <asm/system.h> 45#include <asm/system.h>
@@ -551,8 +552,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
551 * - could test fs/gs bitsliced 552 * - could test fs/gs bitsliced
552 * 553 *
553 * Kprobes not supported here. Set the probe on schedule instead. 554 * Kprobes not supported here. Set the probe on schedule instead.
555 * Function graph tracer not supported too.
554 */ 556 */
555struct task_struct * 557__notrace_funcgraph struct task_struct *
556__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 558__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
557{ 559{
558 struct thread_struct *prev = &prev_p->thread; 560 struct thread_struct *prev = &prev_p->thread;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a6d8c12e10d..2c8ec1ba75e6 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -668,14 +668,14 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
668 size_t bts_index, bts_end; 668 size_t bts_index, bts_end;
669 int error; 669 int error;
670 670
671 error = ds_get_bts_end(child, &bts_end); 671 error = ds_get_bts_end(child->bts, &bts_end);
672 if (error < 0) 672 if (error < 0)
673 return error; 673 return error;
674 674
675 if (bts_end <= index) 675 if (bts_end <= index)
676 return -EINVAL; 676 return -EINVAL;
677 677
678 error = ds_get_bts_index(child, &bts_index); 678 error = ds_get_bts_index(child->bts, &bts_index);
679 if (error < 0) 679 if (error < 0)
680 return error; 680 return error;
681 681
@@ -684,7 +684,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
684 if (bts_end <= bts_index) 684 if (bts_end <= bts_index)
685 bts_index -= bts_end; 685 bts_index -= bts_end;
686 686
687 error = ds_access_bts(child, bts_index, &bts_record); 687 error = ds_access_bts(child->bts, bts_index, &bts_record);
688 if (error < 0) 688 if (error < 0)
689 return error; 689 return error;
690 690
@@ -705,14 +705,14 @@ static int ptrace_bts_drain(struct task_struct *child,
705 size_t end, i; 705 size_t end, i;
706 int error; 706 int error;
707 707
708 error = ds_get_bts_index(child, &end); 708 error = ds_get_bts_index(child->bts, &end);
709 if (error < 0) 709 if (error < 0)
710 return error; 710 return error;
711 711
712 if (size < (end * sizeof(struct bts_struct))) 712 if (size < (end * sizeof(struct bts_struct)))
713 return -EIO; 713 return -EIO;
714 714
715 error = ds_access_bts(child, 0, (const void **)&raw); 715 error = ds_access_bts(child->bts, 0, (const void **)&raw);
716 if (error < 0) 716 if (error < 0)
717 return error; 717 return error;
718 718
@@ -723,18 +723,13 @@ static int ptrace_bts_drain(struct task_struct *child,
723 return -EFAULT; 723 return -EFAULT;
724 } 724 }
725 725
726 error = ds_clear_bts(child); 726 error = ds_clear_bts(child->bts);
727 if (error < 0) 727 if (error < 0)
728 return error; 728 return error;
729 729
730 return end; 730 return end;
731} 731}
732 732
733static void ptrace_bts_ovfl(struct task_struct *child)
734{
735 send_sig(child->thread.bts_ovfl_signal, child, 0);
736}
737
738static int ptrace_bts_config(struct task_struct *child, 733static int ptrace_bts_config(struct task_struct *child,
739 long cfg_size, 734 long cfg_size,
740 const struct ptrace_bts_config __user *ucfg) 735 const struct ptrace_bts_config __user *ucfg)
@@ -760,23 +755,45 @@ static int ptrace_bts_config(struct task_struct *child,
760 goto errout; 755 goto errout;
761 756
762 if (cfg.flags & PTRACE_BTS_O_ALLOC) { 757 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
763 ds_ovfl_callback_t ovfl = NULL; 758 bts_ovfl_callback_t ovfl = NULL;
764 unsigned int sig = 0; 759 unsigned int sig = 0;
765 760
766 /* we ignore the error in case we were not tracing child */ 761 error = -EINVAL;
767 (void)ds_release_bts(child); 762 if (cfg.size < (10 * bts_cfg.sizeof_bts))
763 goto errout;
768 764
769 if (cfg.flags & PTRACE_BTS_O_SIGNAL) { 765 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
770 if (!cfg.signal) 766 if (!cfg.signal)
771 goto errout; 767 goto errout;
772 768
769 error = -EOPNOTSUPP;
770 goto errout;
771
773 sig = cfg.signal; 772 sig = cfg.signal;
774 ovfl = ptrace_bts_ovfl;
775 } 773 }
776 774
777 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); 775 if (child->bts) {
778 if (error < 0) 776 (void)ds_release_bts(child->bts);
777 kfree(child->bts_buffer);
778
779 child->bts = NULL;
780 child->bts_buffer = NULL;
781 }
782
783 error = -ENOMEM;
784 child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL);
785 if (!child->bts_buffer)
786 goto errout;
787
788 child->bts = ds_request_bts(child, child->bts_buffer, cfg.size,
789 ovfl, /* th = */ (size_t)-1);
790 if (IS_ERR(child->bts)) {
791 error = PTR_ERR(child->bts);
792 kfree(child->bts_buffer);
793 child->bts = NULL;
794 child->bts_buffer = NULL;
779 goto errout; 795 goto errout;
796 }
780 797
781 child->thread.bts_ovfl_signal = sig; 798 child->thread.bts_ovfl_signal = sig;
782 } 799 }
@@ -823,15 +840,15 @@ static int ptrace_bts_status(struct task_struct *child,
823 if (cfg_size < sizeof(cfg)) 840 if (cfg_size < sizeof(cfg))
824 return -EIO; 841 return -EIO;
825 842
826 error = ds_get_bts_end(child, &end); 843 error = ds_get_bts_end(child->bts, &end);
827 if (error < 0) 844 if (error < 0)
828 return error; 845 return error;
829 846
830 error = ds_access_bts(child, /* index = */ 0, &base); 847 error = ds_access_bts(child->bts, /* index = */ 0, &base);
831 if (error < 0) 848 if (error < 0)
832 return error; 849 return error;
833 850
834 error = ds_access_bts(child, /* index = */ end, &max); 851 error = ds_access_bts(child->bts, /* index = */ end, &max);
835 if (error < 0) 852 if (error < 0)
836 return error; 853 return error;
837 854
@@ -884,10 +901,7 @@ static int ptrace_bts_write_record(struct task_struct *child,
884 return -EINVAL; 901 return -EINVAL;
885 } 902 }
886 903
887 /* The writing task will be the switched-to task on a context 904 return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts);
888 * switch. It needs to write into the switched-from task's BTS
889 * buffer. */
890 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
891} 905}
892 906
893void ptrace_bts_take_timestamp(struct task_struct *tsk, 907void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -929,17 +943,16 @@ void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
929 switch (c->x86) { 943 switch (c->x86) {
930 case 0x6: 944 case 0x6:
931 switch (c->x86_model) { 945 switch (c->x86_model) {
946 case 0 ... 0xC:
947 /* sorry, don't know about them */
948 break;
932 case 0xD: 949 case 0xD:
933 case 0xE: /* Pentium M */ 950 case 0xE: /* Pentium M */
934 bts_configure(&bts_cfg_pentium_m); 951 bts_configure(&bts_cfg_pentium_m);
935 break; 952 break;
936 case 0xF: /* Core2 */ 953 default: /* Core2, Atom, ... */
937 case 0x1C: /* Atom */
938 bts_configure(&bts_cfg_core2); 954 bts_configure(&bts_cfg_core2);
939 break; 955 break;
940 default:
941 /* sorry, don't know about them */
942 break;
943 } 956 }
944 break; 957 break;
945 case 0xF: 958 case 0xF:
@@ -973,13 +986,17 @@ void ptrace_disable(struct task_struct *child)
973 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 986 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
974#endif 987#endif
975#ifdef CONFIG_X86_PTRACE_BTS 988#ifdef CONFIG_X86_PTRACE_BTS
976 (void)ds_release_bts(child); 989 if (child->bts) {
990 (void)ds_release_bts(child->bts);
991 kfree(child->bts_buffer);
992 child->bts_buffer = NULL;
977 993
978 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; 994 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
979 if (!child->thread.debugctlmsr) 995 if (!child->thread.debugctlmsr)
980 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 996 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
981 997
982 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 998 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
999 }
983#endif /* CONFIG_X86_PTRACE_BTS */ 1000#endif /* CONFIG_X86_PTRACE_BTS */
984} 1001}
985 1002
@@ -1111,9 +1128,16 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1111 (child, data, (struct ptrace_bts_config __user *)addr); 1128 (child, data, (struct ptrace_bts_config __user *)addr);
1112 break; 1129 break;
1113 1130
1114 case PTRACE_BTS_SIZE: 1131 case PTRACE_BTS_SIZE: {
1115 ret = ds_get_bts_index(child, /* pos = */ NULL); 1132 size_t size;
1133
1134 ret = ds_get_bts_index(child->bts, &size);
1135 if (ret == 0) {
1136 BUG_ON(size != (int) size);
1137 ret = (int) size;
1138 }
1116 break; 1139 break;
1140 }
1117 1141
1118 case PTRACE_BTS_GET: 1142 case PTRACE_BTS_GET:
1119 ret = ptrace_bts_read_record 1143 ret = ptrace_bts_read_record
@@ -1121,7 +1145,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1121 break; 1145 break;
1122 1146
1123 case PTRACE_BTS_CLEAR: 1147 case PTRACE_BTS_CLEAR:
1124 ret = ds_clear_bts(child); 1148 ret = ds_clear_bts(child->bts);
1125 break; 1149 break;
1126 1150
1127 case PTRACE_BTS_DRAIN: 1151 case PTRACE_BTS_DRAIN:
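Annotation: with this rework each traced child owns a DS/BTS tracer handle (child->bts) plus a kernel-allocated buffer (child->bts_buffer), both torn down in ptrace_disable(), and PTRACE_BTS_SIZE is now answered through ds_get_bts_index() with a size_t out-parameter. A hypothetical debugger-side probe is sketched below; the PTRACE_BTS_* requests existed only in kernels of this era built with CONFIG_X86_PTRACE_BTS and were removed later, so the whole block is guarded on the headers actually providing them.

#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#ifdef __has_include
# if __has_include(<asm/ptrace-abi.h>)
#  include <asm/ptrace-abi.h>   /* PTRACE_BTS_* lived here in this era */
# endif
#endif

int main(int argc, char **argv)
{
        if (argc < 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
#ifdef PTRACE_BTS_SIZE
        pid_t pid = (pid_t)atoi(argv[1]);

        if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
                perror("PTRACE_ATTACH");
                return 1;
        }
        waitpid(pid, NULL, 0);

        /* Returns the number of buffered branch records; fails with errno set
         * if BTS was never configured for the tracee via PTRACE_BTS_CONFIG. */
        long n = ptrace(PTRACE_BTS_SIZE, pid, NULL, NULL);
        if (n < 0)
                perror("PTRACE_BTS_SIZE");
        else
                printf("BTS records buffered for %d: %ld\n", (int)pid, n);

        ptrace(PTRACE_DETACH, pid, NULL, NULL);
        return 0;
#else
        fprintf(stderr, "PTRACE_BTS_* not provided by these kernel headers\n");
        return 1;
#endif
}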
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index cc5a2545dd41..0e3dbc7b2bdb 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,7 +36,10 @@ int reboot_force;
36static int reboot_cpu = -1; 36static int reboot_cpu = -1;
37#endif 37#endif
38 38
39/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] 39/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
40bool port_cf9_safe = false;
41
42/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
40 warm Don't set the cold reboot flag 43 warm Don't set the cold reboot flag
41 cold Set the cold reboot flag 44 cold Set the cold reboot flag
42 bios Reboot by jumping through the BIOS (only for X86_32) 45 bios Reboot by jumping through the BIOS (only for X86_32)
@@ -45,6 +48,7 @@ static int reboot_cpu = -1;
45 kbd Use the keyboard controller. cold reset (default) 48 kbd Use the keyboard controller. cold reset (default)
46 acpi Use the RESET_REG in the FADT 49 acpi Use the RESET_REG in the FADT
47 efi Use efi reset_system runtime service 50 efi Use efi reset_system runtime service
51 pci Use the so-called "PCI reset register", CF9
48 force Avoid anything that could hang. 52 force Avoid anything that could hang.
49 */ 53 */
50static int __init reboot_setup(char *str) 54static int __init reboot_setup(char *str)
@@ -79,6 +83,7 @@ static int __init reboot_setup(char *str)
79 case 'k': 83 case 'k':
80 case 't': 84 case 't':
81 case 'e': 85 case 'e':
86 case 'p':
82 reboot_type = *str; 87 reboot_type = *str;
83 break; 88 break;
84 89
@@ -404,12 +409,27 @@ static void native_machine_emergency_restart(void)
404 reboot_type = BOOT_KBD; 409 reboot_type = BOOT_KBD;
405 break; 410 break;
406 411
407
408 case BOOT_EFI: 412 case BOOT_EFI:
409 if (efi_enabled) 413 if (efi_enabled)
410 efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, 414 efi.reset_system(reboot_mode ?
415 EFI_RESET_WARM :
416 EFI_RESET_COLD,
411 EFI_SUCCESS, 0, NULL); 417 EFI_SUCCESS, 0, NULL);
418 reboot_type = BOOT_KBD;
419 break;
420
421 case BOOT_CF9:
422 port_cf9_safe = true;
423 /* fall through */
412 424
425 case BOOT_CF9_COND:
426 if (port_cf9_safe) {
427 u8 cf9 = inb(0xcf9) & ~6;
428 outb(cf9|2, 0xcf9); /* Request hard reset */
429 udelay(50);
430 outb(cf9|6, 0xcf9); /* Actually do the reset */
431 udelay(50);
432 }
413 reboot_type = BOOT_KBD; 433 reboot_type = BOOT_KBD;
414 break; 434 break;
415 } 435 }
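Annotation: the new BOOT_CF9/BOOT_CF9_COND branch resets the machine through the PCI reset control register at I/O port 0xcf9: read the register, clear bits 1 and 2, set bit 1 to request a hard reset, wait, then set bit 2 to trigger it. The userspace sketch below mirrors that sequence with ioperm()/inb()/outb() from <sys/io.h>; it needs root, and on chipsets that honour CF9 it really will hard-reset the box, so it is shown purely to document the register protocol.

#include <stdio.h>
#include <unistd.h>
#include <sys/io.h>

int main(void)
{
        if (ioperm(0xcf9, 1, 1) != 0) {
                perror("ioperm");               /* requires root / CAP_SYS_RAWIO */
                return 1;
        }

        unsigned char cf9 = inb(0xcf9) & ~6;    /* keep other bits, clear 1 and 2 */

        outb(cf9 | 2, 0xcf9);                   /* request hard reset */
        usleep(50);                             /* ~udelay(50) */
        outb(cf9 | 6, 0xcf9);                   /* actually do the reset */
        usleep(50);

        return 0;                               /* normally never reached */
}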
@@ -470,6 +490,11 @@ static void native_machine_restart(char *__unused)
470 490
471static void native_machine_halt(void) 491static void native_machine_halt(void)
472{ 492{
493 /* stop other cpus and apics */
494 machine_shutdown();
495
496 /* stop this cpu */
497 stop_this_cpu(NULL);
473} 498}
474 499
475static void native_machine_power_off(void) 500static void native_machine_power_off(void)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9d5674f7b6cc..b9018955a04f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -583,7 +583,20 @@ static int __init setup_elfcorehdr(char *arg)
583early_param("elfcorehdr", setup_elfcorehdr); 583early_param("elfcorehdr", setup_elfcorehdr);
584#endif 584#endif
585 585
586static struct x86_quirks default_x86_quirks __initdata; 586static int __init default_update_genapic(void)
587{
588#ifdef CONFIG_X86_SMP
589# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
590 genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
591# endif
592#endif
593
594 return 0;
595}
596
597static struct x86_quirks default_x86_quirks __initdata = {
598 .update_genapic = default_update_genapic,
599};
587 600
588struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; 601struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
589 602
@@ -1082,7 +1095,7 @@ void __init setup_arch(char **cmdline_p)
1082 ioapic_init_mappings(); 1095 ioapic_init_mappings();
1083 1096
1084 /* need to wait for io_apic is mapped */ 1097 /* need to wait for io_apic is mapped */
1085 nr_irqs = probe_nr_irqs(); 1098 probe_nr_irqs_gsi();
1086 1099
1087 kvm_guest_init(); 1100 kvm_guest_init();
1088 1101
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18f9b19f5f8f..3f92b134ab90 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -140,19 +140,6 @@ void native_send_call_func_ipi(cpumask_t mask)
140 send_IPI_mask(mask, CALL_FUNCTION_VECTOR); 140 send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
141} 141}
142 142
143static void stop_this_cpu(void *dummy)
144{
145 local_irq_disable();
146 /*
147 * Remove this CPU:
148 */
149 cpu_clear(smp_processor_id(), cpu_online_map);
150 disable_local_APIC();
151 if (hlt_works(smp_processor_id()))
152 for (;;) halt();
153 for (;;);
154}
155
156/* 143/*
157 * this function calls the 'stop' function on all other CPUs in the system. 144 * this function calls the 'stop' function on all other CPUs in the system.
158 */ 145 */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 468c2f9d47ae..9d58134e0231 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,6 +62,7 @@
62#include <asm/mtrr.h> 62#include <asm/mtrr.h>
63#include <asm/vmi.h> 63#include <asm/vmi.h>
64#include <asm/genapic.h> 64#include <asm/genapic.h>
65#include <asm/setup.h>
65#include <linux/mc146818rtc.h> 66#include <linux/mc146818rtc.h>
66 67
67#include <mach_apic.h> 68#include <mach_apic.h>
@@ -530,7 +531,7 @@ static void impress_friends(void)
530 pr_debug("Before bogocount - setting activated=1.\n"); 531 pr_debug("Before bogocount - setting activated=1.\n");
531} 532}
532 533
533static inline void __inquire_remote_apic(int apicid) 534void __inquire_remote_apic(int apicid)
534{ 535{
535 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 536 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
536 char *names[] = { "ID", "VERSION", "SPIV" }; 537 char *names[] = { "ID", "VERSION", "SPIV" };
@@ -569,14 +570,13 @@ static inline void __inquire_remote_apic(int apicid)
569 } 570 }
570} 571}
571 572
572#ifdef WAKE_SECONDARY_VIA_NMI
573/* 573/*
574 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal 574 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
575 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this 575 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
576 * won't ... remember to clear down the APIC, etc later. 576 * won't ... remember to clear down the APIC, etc later.
577 */ 577 */
578static int __devinit 578int __devinit
579wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) 579wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
580{ 580{
581 unsigned long send_status, accept_status = 0; 581 unsigned long send_status, accept_status = 0;
582 int maxlvt; 582 int maxlvt;
@@ -593,7 +593,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
593 * Give the other CPU some time to accept the IPI. 593 * Give the other CPU some time to accept the IPI.
594 */ 594 */
595 udelay(200); 595 udelay(200);
596 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 596 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
597 maxlvt = lapic_get_maxlvt(); 597 maxlvt = lapic_get_maxlvt();
598 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 598 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
599 apic_write(APIC_ESR, 0); 599 apic_write(APIC_ESR, 0);
@@ -608,11 +608,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
608 608
609 return (send_status | accept_status); 609 return (send_status | accept_status);
610} 610}
611#endif /* WAKE_SECONDARY_VIA_NMI */
612 611
613#ifdef WAKE_SECONDARY_VIA_INIT 612int __devinit
614static int __devinit 613wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
615wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
616{ 614{
617 unsigned long send_status, accept_status = 0; 615 unsigned long send_status, accept_status = 0;
618 int maxlvt, num_starts, j; 616 int maxlvt, num_starts, j;
@@ -731,7 +729,6 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
731 729
732 return (send_status | accept_status); 730 return (send_status | accept_status);
733} 731}
734#endif /* WAKE_SECONDARY_VIA_INIT */
735 732
736struct create_idle { 733struct create_idle {
737 struct work_struct work; 734 struct work_struct work;
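Annotation: the WAKE_SECONDARY_VIA_NMI / WAKE_SECONDARY_VIA_INIT compile-time switches are gone; both wakeup routines are now non-static and the choice is made at run time, with setup.c's default_update_genapic() quirk pointing genapic->wakeup_cpu at wakeup_secondary_cpu_via_init and subarchitectures free to install their own. A self-contained sketch of that hook-and-override pattern follows; the types are stand-ins (the real ones are struct genapic and struct x86_quirks) and the printed output is illustrative only.

#include <stdio.h>

struct fake_genapic {
        int (*wakeup_cpu)(int apicid, unsigned long start_eip);
};

struct fake_quirks {
        int (*update_genapic)(struct fake_genapic *g);
};

static int wakeup_cpu_via_init(int apicid, unsigned long start_eip)
{
        printf("INIT/SIPI wakeup: apicid=%d start_eip=%#lx\n", apicid, start_eip);
        return 0;
}

static int wakeup_cpu_via_nmi(int apicid, unsigned long start_eip)
{
        printf("NMI wakeup: apicid=%d start_eip=%#lx\n", apicid, start_eip);
        return 0;
}

/* mirrors default_update_genapic(): the generic default picks INIT/SIPI */
static int default_update(struct fake_genapic *g)
{
        g->wakeup_cpu = wakeup_cpu_via_init;
        return 0;
}

/* a NUMAQ/ES7000-style platform would install a quirk choosing NMI instead */
static int numaq_update(struct fake_genapic *g)
{
        g->wakeup_cpu = wakeup_cpu_via_nmi;
        return 0;
}

int main(int argc, char **argv)
{
        struct fake_genapic genapic = { 0 };
        struct fake_quirks quirks = { .update_genapic = default_update };

        if (argc > 1)                   /* pretend platform detection fired */
                quirks.update_genapic = numaq_update;

        quirks.update_genapic(&genapic);
        return genapic.wakeup_cpu(1, 0x9f000UL);
}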
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index a03e7f6d90c3..10786af95545 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/stacktrace.h> 7#include <linux/stacktrace.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/uaccess.h>
9#include <asm/stacktrace.h> 10#include <asm/stacktrace.h>
10 11
11static void save_stack_warning(void *data, char *msg) 12static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
83 trace->entries[trace->nr_entries++] = ULONG_MAX; 84 trace->entries[trace->nr_entries++] = ULONG_MAX;
84} 85}
85EXPORT_SYMBOL_GPL(save_stack_trace_tsk); 86EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
87
88/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
89
90struct stack_frame {
91 const void __user *next_fp;
92 unsigned long ret_addr;
93};
94
95static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
96{
97 int ret;
98
99 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
100 return 0;
101
102 ret = 1;
103 pagefault_disable();
104 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
105 ret = 0;
106 pagefault_enable();
107
108 return ret;
109}
110
111static inline void __save_stack_trace_user(struct stack_trace *trace)
112{
113 const struct pt_regs *regs = task_pt_regs(current);
114 const void __user *fp = (const void __user *)regs->bp;
115
116 if (trace->nr_entries < trace->max_entries)
117 trace->entries[trace->nr_entries++] = regs->ip;
118
119 while (trace->nr_entries < trace->max_entries) {
120 struct stack_frame frame;
121
122 frame.next_fp = NULL;
123 frame.ret_addr = 0;
124 if (!copy_stack_frame(fp, &frame))
125 break;
126 if ((unsigned long)fp < regs->sp)
127 break;
128 if (frame.ret_addr) {
129 trace->entries[trace->nr_entries++] =
130 frame.ret_addr;
131 }
132 if (fp == frame.next_fp)
133 break;
134 fp = frame.next_fp;
135 }
136}
137
138void save_stack_trace_user(struct stack_trace *trace)
139{
140 /*
141 * Trace user stack if we are not a kernel thread
142 */
143 if (current->mm) {
144 __save_stack_trace_user(trace);
145 }
146 if (trace->nr_entries < trace->max_entries)
147 trace->entries[trace->nr_entries++] = ULONG_MAX;
148}
149
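Annotation: save_stack_trace_user() walks the traced task's user stack by following the saved frame-pointer chain, copying each struct stack_frame with a pagefault-disabled __copy_from_user_inatomic() and stopping on NULL, non-advancing, or below-stack-pointer frames. The userspace analogue below walks its own chain with the same two-word frame layout; it assumes x86-64 and code built with frame pointers (compile with -fno-omit-frame-pointer or -O0), otherwise the walk simply ends early.

#include <stdio.h>

struct stack_frame {
        const struct stack_frame *next_fp;      /* caller's saved frame pointer */
        unsigned long ret_addr;                 /* return address pushed by call */
};

static void __attribute__((noinline)) dump_stack_chain(void)
{
        const struct stack_frame *fp = __builtin_frame_address(0);
        int depth = 0;

        while (fp && depth < 16) {
                if (fp->ret_addr == 0)
                        break;                  /* end of usable chain */
                printf("#%d  ret=%#lx  fp=%p\n",
                       depth, fp->ret_addr, (const void *)fp);
                /* like the kernel walker: refuse frames that do not advance */
                if (fp->next_fp <= fp)
                        break;
                fp = fp->next_fp;
                depth++;
        }
}

static void __attribute__((noinline)) level2(void) { dump_stack_chain(); }
static void __attribute__((noinline)) level1(void) { level2(); }

int main(void)
{
        level1();
        return 0;
}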
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 0b8b6690a86d..6f3d3d4cd973 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,9 @@
17 * want per guest time just set the kernel.vsyscall64 sysctl to 0. 17 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
18 */ 18 */
19 19
20/* Disable profiling for userspace code: */
21#define DISABLE_BRANCH_PROFILING
22
20#include <linux/time.h> 23#include <linux/time.h>
21#include <linux/init.h> 24#include <linux/init.h>
22#include <linux/kernel.h> 25#include <linux/kernel.h>
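Annotation: DISABLE_BRANCH_PROFILING has to appear before any #include because the instrumented flavour of likely()/unlikely() is selected at preprocessing time, and the vsyscall page executes in userspace, so it must not be built with calls back into kernel tracing. The toy below is not the kernel's branch profiler; it only shows why opting out has to happen before the macros are defined.

#include <stdio.h>

/* #define DISABLE_BRANCH_PROFILING   <-- uncomment to mimic the vsyscall case */

#ifndef DISABLE_BRANCH_PROFILING
static unsigned long branch_hits;               /* pretend profiling state */
# define likely(x)      (branch_hits++, __builtin_expect(!!(x), 1))
#else
# define likely(x)      __builtin_expect(!!(x), 1)
#endif

int main(void)
{
        int total = 0;

        for (int i = 0; i < 1000; i++)
                if (likely(i >= 0))
                        total++;

#ifndef DISABLE_BRANCH_PROFILING
        printf("profiled %lu branch evaluations, total %d\n", branch_hits, total);
#else
        printf("branch profiling disabled, total %d\n", total);
#endif
        return 0;
}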