author     Linus Torvalds <torvalds@linux-foundation.org>  2012-01-15 14:26:35 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-01-15 14:26:35 -0500
commit     83c2f912b43c3a7babbb6cb7ae2a5276c1ed2a3e
tree       eaa7f50dea154d9f19721db69c7adde64d48848f /arch
parent     f0ed5b9a28536b8be2f578a9450cfa42ab31ccf8
parent     172d1b0b73256551f100fc00c69e356d047103f5
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (39 commits)
perf tools: Fix compile error on x86_64 Ubuntu
perf report: Fix --stdio output alignment when --showcpuutilization used
perf annotate: Get rid of field_sep check
perf annotate: Fix usage string
perf kmem: Fix a memory leak
perf kmem: Add missing closedir() calls
perf top: Add error message for EMFILE
perf test: Change type of '-v' option to INCR
perf script: Add missing closedir() calls
tracing: Fix compile error when static ftrace is enabled
recordmcount: Fix handling of elf64 big-endian objects.
perf tools: Add const.h to MANIFEST to make perf-tar-src-pkg work again
perf tools: Add support for guest/host-only profiling
perf kvm: Do guest-only counting by default
perf top: Don't update total_period on process_sample
perf hists: Stop using 'self' for struct hist_entry
perf hists: Rename total_session to total_period
x86: Add counter when debug stack is used with interrupts enabled
x86: Allow NMIs to hit breakpoints in i386
x86: Keep current stack in NMI breakpoints
...
Diffstat (limited to 'arch')
-rw-r--r--  arch/x86/include/asm/debugreg.h |  22
-rw-r--r--  arch/x86/include/asm/desc.h     |  12
-rw-r--r--  arch/x86/kernel/cpu/common.c    |  24
-rw-r--r--  arch/x86/kernel/entry_64.S      | 218
-rw-r--r--  arch/x86/kernel/head_64.S       |   4
-rw-r--r--  arch/x86/kernel/nmi.c           | 102
-rw-r--r--  arch/x86/kernel/traps.c         |  20
7 files changed, 369 insertions, 33 deletions
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 078ad0caefc..b903d5ea394 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump);
 
 extern void hw_breakpoint_restore(void);
 
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(int, debug_stack_usage);
+static inline void debug_stack_usage_inc(void)
+{
+	__get_cpu_var(debug_stack_usage)++;
+}
+static inline void debug_stack_usage_dec(void)
+{
+	__get_cpu_var(debug_stack_usage)--;
+}
+int is_debug_stack(unsigned long addr);
+void debug_stack_set_zero(void);
+void debug_stack_reset(void);
+#else /* !X86_64 */
+static inline int is_debug_stack(unsigned long addr) { return 0; }
+static inline void debug_stack_set_zero(void) { }
+static inline void debug_stack_reset(void) { }
+static inline void debug_stack_usage_inc(void) { }
+static inline void debug_stack_usage_dec(void) { }
+#endif /* X86_64 */
+
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 41935fadfdf..e95822d683f 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 
 extern struct desc_ptr idt_descr;
 extern gate_desc idt_table[];
+extern struct desc_ptr nmi_idt_descr;
+extern gate_desc nmi_idt_table[];
 
 struct gdt_page {
 	struct desc_struct gdt[GDT_ENTRIES];
@@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
 	desc->limit = (limit >> 16) & 0xf;
 }
 
+#ifdef CONFIG_X86_64
+static inline void set_nmi_gate(int gate, void *addr)
+{
+	gate_desc s;
+
+	pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
+	write_idt_entry(nmi_idt_table, gate, &s);
+}
+#endif
+
 static inline void _set_gate(int gate, unsigned type, void *addr,
 			     unsigned dpl, unsigned ist, unsigned seg)
 {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 850f2963a42..d43cad74f16 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1021,6 +1021,8 @@ __setup("clearcpuid=", setup_disablecpuid);
 
 #ifdef CONFIG_X86_64
 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
+struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
+				    (unsigned long) nmi_idt_table };
 
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
 		     irq_stack_union) __aligned(PAGE_SIZE);
@@ -1085,6 +1087,26 @@ unsigned long kernel_eflags;
  */
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
+static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+DEFINE_PER_CPU(int, debug_stack_usage);
+
+int is_debug_stack(unsigned long addr)
+{
+	return __get_cpu_var(debug_stack_usage) ||
+		(addr <= __get_cpu_var(debug_stack_addr) &&
+		 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
+}
+
+void debug_stack_set_zero(void)
+{
+	load_idt((const struct desc_ptr *)&nmi_idt_descr);
+}
+
+void debug_stack_reset(void)
+{
+	load_idt((const struct desc_ptr *)&idt_descr);
+}
+
 #else /* CONFIG_X86_64 */
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
@@ -1212,6 +1234,8 @@ void __cpuinit cpu_init(void)
 			estacks += exception_stack_sizes[v];
 			oist->ist[v] = t->x86_tss.ist[v] =
 					(unsigned long)estacks;
+			if (v == DEBUG_STACK-1)
+				per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
 		}
 	}
 
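For orientation, the check added above can be read as: the debug stack counts as "in use" when the usage counter is non-zero, or when the interrupted stack pointer falls within DEBUG_STKSZ below the recorded stack top. A minimal user-space sketch of that logic, for illustration only (is_debug_stack_model, the DEBUG_STKSZ value, and the plain globals are stand-ins for the kernel's per-CPU implementation):

#include <stdbool.h>
#include <stdio.h>

#define DEBUG_STKSZ 8192UL              /* stand-in for the kernel constant */

static unsigned long debug_stack_addr;  /* top of the (per-CPU) debug stack */
static int debug_stack_usage;           /* bumped around int3/do_debug handling */

/* Mirrors the range-or-counter check of is_debug_stack() above. */
static bool is_debug_stack_model(unsigned long sp)
{
	return debug_stack_usage ||
		(sp <= debug_stack_addr &&
		 sp > debug_stack_addr - DEBUG_STKSZ);
}

int main(void)
{
	debug_stack_addr = 0x20000;
	printf("%d\n", is_debug_stack_model(0x1f000)); /* inside the range -> 1 */
	printf("%d\n", is_debug_stack_model(0x30000)); /* outside, counter 0 -> 0 */
	return 0;
}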
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a20e1cb9dc8..940ba711fc2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1480,62 +1480,214 @@ ENTRY(error_exit)
 	CFI_ENDPROC
 END(error_exit)
 
+/*
+ * Test if a given stack is an NMI stack or not.
+ */
+	.macro test_in_nmi reg stack nmi_ret normal_ret
+	cmpq %\reg, \stack
+	ja \normal_ret
+	subq $EXCEPTION_STKSZ, %\reg
+	cmpq %\reg, \stack
+	jb \normal_ret
+	jmp \nmi_ret
+	.endm
 
 	/* runs on exception stack */
ENTRY(nmi)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	pushq_cfi $-1
+	/*
+	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
+	 * the iretq it performs will take us out of NMI context.
+	 * This means that we can have nested NMIs where the next
+	 * NMI is using the top of the stack of the previous NMI. We
+	 * can't let it execute because the nested NMI will corrupt the
+	 * stack of the previous NMI. NMI handlers are not re-entrant
+	 * anyway.
+	 *
+	 * To handle this case we do the following:
+	 *  Check a special location on the stack that contains
+	 *  a variable that is set when NMIs are executing.
+	 *  The interrupted task's stack is also checked to see if it
+	 *  is an NMI stack.
+	 *  If the variable is not set and the stack is not the NMI
+	 *  stack then:
+	 *    o Set the special variable on the stack
+	 *    o Copy the interrupt frame into a "saved" location on the stack
+	 *    o Copy the interrupt frame into a "copy" location on the stack
+	 *    o Continue processing the NMI
+	 *  If the variable is set or the previous stack is the NMI stack:
+	 *    o Modify the "copy" location to jump to repeat_nmi
+	 *    o return back to the first NMI
+	 *
+	 * Now on exit of the first NMI, we first clear the stack variable.
+	 * The NMI stack will tell any nested NMIs at that point that it is
+	 * nested. Then we pop the stack normally with iret, and if there was
+	 * a nested NMI that updated the copy interrupt stack frame, a
+	 * jump will be made to the repeat_nmi code that will handle the second
+	 * NMI.
+	 */
+
+	/* Use %rdx as our temp variable throughout */
+	pushq_cfi %rdx
+
+	/*
+	 * Check the special variable on the stack to see if NMIs are
+	 * executing.
+	 */
+	cmp $1, -8(%rsp)
+	je nested_nmi
+
+	/*
+	 * Now test if the previous stack was an NMI stack.
+	 * We need the double check. We check the NMI stack to satisfy the
+	 * race when the first NMI clears the variable before returning.
+	 * We check the variable because the first NMI could be in a
+	 * breakpoint routine using a breakpoint stack.
+	 */
+	lea 6*8(%rsp), %rdx
+	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+
+nested_nmi:
+	/*
+	 * Do nothing if we interrupted the fixup in repeat_nmi.
+	 * It's about to repeat the NMI handler, so we are fine
+	 * with ignoring this one.
+	 */
+	movq $repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja 1f
+	movq $end_repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja nested_nmi_out
+
+1:
+	/* Set up the interrupted NMI's stack to jump to repeat_nmi */
+	leaq -6*8(%rsp), %rdx
+	movq %rdx, %rsp
+	CFI_ADJUST_CFA_OFFSET 6*8
+	pushq_cfi $__KERNEL_DS
+	pushq_cfi %rdx
+	pushfq_cfi
+	pushq_cfi $__KERNEL_CS
+	pushq_cfi $repeat_nmi
+
+	/* Put stack back */
+	addq $(11*8), %rsp
+	CFI_ADJUST_CFA_OFFSET -11*8
+
+nested_nmi_out:
+	popq_cfi %rdx
+
+	/* No need to check faults here */
+	INTERRUPT_RETURN
+
+first_nmi:
+	/*
+	 * Because nested NMIs will use the pushed location that we
+	 * stored in rdx, we must keep that space available.
+	 * Here's what our stack frame will look like:
+	 * +-------------------------+
+	 * | original SS             |
+	 * | original Return RSP     |
+	 * | original RFLAGS         |
+	 * | original CS             |
+	 * | original RIP            |
+	 * +-------------------------+
+	 * | temp storage for rdx    |
+	 * +-------------------------+
+	 * | NMI executing variable  |
+	 * +-------------------------+
+	 * | Saved SS                |
+	 * | Saved Return RSP        |
+	 * | Saved RFLAGS            |
+	 * | Saved CS                |
+	 * | Saved RIP               |
+	 * +-------------------------+
+	 * | copied SS               |
+	 * | copied Return RSP       |
+	 * | copied RFLAGS           |
+	 * | copied CS               |
+	 * | copied RIP              |
+	 * +-------------------------+
+	 * | pt_regs                 |
+	 * +-------------------------+
+	 *
+	 * The saved RIP is used to fix up the copied RIP that a nested
+	 * NMI may zero out. The original stack frame and the temp storage
+	 * are also used by nested NMIs and cannot be trusted on exit.
+	 */
+	/* Set the NMI executing variable on the stack. */
+	pushq_cfi $1
+
+	/* Copy the stack frame to the Saved frame */
+	.rept 5
+	pushq_cfi 6*8(%rsp)
+	.endr
+
+	/* Make another copy, this one may be modified by nested NMIs */
+	.rept 5
+	pushq_cfi 4*8(%rsp)
+	.endr
+
+	/* Do not pop rdx, nested NMIs will corrupt it */
+	movq 11*8(%rsp), %rdx
+
+	/*
+	 * Everything below this point can be preempted by a nested
+	 * NMI if the first NMI took an exception. Repeated NMIs
+	 * caused by an exception and nested NMI will start here, and
+	 * can still be preempted by another NMI.
+	 */
+restart_nmi:
+	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+	/*
+	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+	 * as we should not be calling schedule in NMI context, even
+	 * with normal interrupts enabled. An NMI should not be
+	 * setting NEED_RESCHED or anything that normal interrupts and
+	 * exceptions might do.
+	 */
 	call save_paranoid
 	DEFAULT_FRAME 0
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
 	movq %rsp,%rdi
 	movq $-1,%rsi
 	call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
-	/* paranoidexit; without TRACE_IRQS_OFF */
-	/* ebx: no swapgs flag */
-	DISABLE_INTERRUPTS(CLBR_NONE)
 	testl %ebx,%ebx			/* swapgs needed? */
 	jnz nmi_restore
-	testl $3,CS(%rsp)
-	jnz nmi_userspace
nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
nmi_restore:
 	RESTORE_ALL 8
+	/* Clear the NMI executing stack variable */
+	movq $0, 10*8(%rsp)
 	jmp irq_return
-nmi_userspace:
-	GET_THREAD_INFO(%rcx)
-	movl TI_flags(%rcx),%ebx
-	andl $_TIF_WORK_MASK,%ebx
-	jz nmi_swapgs
-	movq %rsp,%rdi			/* &pt_regs */
-	call sync_regs
-	movq %rax,%rsp			/* switch stack for scheduling */
-	testl $_TIF_NEED_RESCHED,%ebx
-	jnz nmi_schedule
-	movl %ebx,%edx			/* arg3: thread flags */
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	xorl %esi,%esi			/* arg2: oldset */
-	movq %rsp,%rdi			/* arg1: &pt_regs */
-	call do_notify_resume
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	jmp nmi_userspace
-nmi_schedule:
-	ENABLE_INTERRUPTS(CLBR_ANY)
-	call schedule
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	jmp nmi_userspace
-	CFI_ENDPROC
-#else
-	jmp paranoid_exit
 	CFI_ENDPROC
-#endif
END(nmi)
 
+/*
+ * If an NMI hit an iret because of an exception or breakpoint,
+ * it can lose its NMI context, and a nested NMI may come in.
+ * In that case, the nested NMI will change the preempted NMI's
+ * stack to jump to here when it does the final iret.
+ */
+repeat_nmi:
+	INTR_FRAME
+	/* Update the stack variable to say we are still in NMI */
+	movq $1, 5*8(%rsp)
+
+	/* copy the saved stack back to copy stack */
+	.rept 5
+	pushq_cfi 4*8(%rsp)
+	.endr
+
+	jmp restart_nmi
+	CFI_ENDPROC
+end_repeat_nmi:
+
ENTRY(ignore_sysret)
 	CFI_STARTPROC
 	mov $-ENOSYS,%eax
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e11e39478a4..40f4eb3766d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -417,6 +417,10 @@ ENTRY(phys_base)
ENTRY(idt_table)
 	.skip IDT_ENTRIES * 16
 
+	.align L1_CACHE_BYTES
+ENTRY(nmi_idt_table)
+	.skip IDT_ENTRIES * 16
+
 	__PAGE_ALIGNED_BSS
 	.align PAGE_SIZE
ENTRY(empty_zero_page)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e88f37b58dd..47acaf31916 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -405,9 +405,108 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		unknown_nmi_error(reason, regs);
 }
 
+/*
+ * An NMI can hit a breakpoint, which will cause it to lose its
+ * NMI context with the CPU when the breakpoint handler does an iret.
+ */
+#ifdef CONFIG_X86_32
+/*
+ * For i386, NMIs use the same stack as the kernel, and we can
+ * add a workaround to the iret problem in C. Simply have 3 states
+ * the NMI can be in.
+ *
+ *  1) not running
+ *  2) executing
+ *  3) latched
+ *
+ * When no NMI is in progress, it is in the "not running" state.
+ * When an NMI comes in, it goes into the "executing" state.
+ * Normally, if another NMI is triggered, it does not interrupt
+ * the running NMI and the HW will simply latch it so that when
+ * the first NMI finishes, it will restart the second NMI.
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ *  when one is running, are ignored. Only one NMI is restarted.)
+ *
+ * If an NMI hits a breakpoint that executes an iret, another
+ * NMI can preempt it. We do not want to allow this new NMI
+ * to run, but we want to execute it when the first one finishes.
+ * We set the state to "latched", and the first NMI will perform
+ * a cmpxchg on the state, and if it doesn't successfully
+ * reset the state to "not running" it will restart the next
+ * NMI.
+ */
+enum nmi_states {
+	NMI_NOT_RUNNING,
+	NMI_EXECUTING,
+	NMI_LATCHED,
+};
+static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+
+#define nmi_nesting_preprocess(regs)					\
+	do {								\
+		if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) {	\
+			__get_cpu_var(nmi_state) = NMI_LATCHED;		\
+			return;						\
+		}							\
+	nmi_restart:							\
+		__get_cpu_var(nmi_state) = NMI_EXECUTING;		\
+	} while (0)
+
+#define nmi_nesting_postprocess()					\
+	do {								\
+		if (cmpxchg(&__get_cpu_var(nmi_state),			\
+		    NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING)	\
+			goto nmi_restart;				\
+	} while (0)
+#else /* x86_64 */
+/*
+ * In x86_64 things are a bit more difficult. This has the same problem
+ * where an NMI hitting a breakpoint that calls iret will remove the
+ * NMI context, allowing a nested NMI to enter. What makes this more
+ * difficult is that both NMIs and breakpoints have their own stack.
+ * When a new NMI or breakpoint is executed, the stack is set to a fixed
+ * point. If an NMI is nested, it will have its stack set at that same
+ * fixed address that the first NMI had, and will start corrupting the
+ * stack. This is handled in entry_64.S, but the same problem exists with
+ * the breakpoint stack.
+ *
+ * If a breakpoint is being processed while the debug stack is in use,
+ * and an NMI comes in and also hits a breakpoint, the stack pointer
+ * will be set to the same fixed address as the breakpoint that was
+ * interrupted, causing that stack to be corrupted. To handle this case,
+ * check if the stack that was interrupted is the debug stack, and if
+ * so, change the IDT so that new breakpoints will use the current stack
+ * and not switch to the fixed address. On return of the NMI, switch back
+ * to the original IDT.
+ */
+static DEFINE_PER_CPU(int, update_debug_stack);
+
+static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+{
+	/*
+	 * If we interrupted a breakpoint, it is possible that
+	 * the nmi handler will have breakpoints too. We need to
+	 * change the IDT such that breakpoints that happen here
+	 * continue to use the NMI stack.
+	 */
+	if (unlikely(is_debug_stack(regs->sp))) {
+		debug_stack_set_zero();
+		__get_cpu_var(update_debug_stack) = 1;
+	}
+}
+
+static inline void nmi_nesting_postprocess(void)
+{
+	if (unlikely(__get_cpu_var(update_debug_stack)))
+		debug_stack_reset();
+}
+#endif
+
 dotraplinkage notrace __kprobes void
 do_nmi(struct pt_regs *regs, long error_code)
 {
+	nmi_nesting_preprocess(regs);
+
 	nmi_enter();
 
 	inc_irq_stat(__nmi_count);
@@ -416,6 +515,9 @@ do_nmi(struct pt_regs *regs, long error_code)
 	default_do_nmi(regs);
 
 	nmi_exit();
+
+	/* On i386, may loop back to preprocess */
+	nmi_nesting_postprocess();
 }
 
 void stop_nmi(void)
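The i386 state machine above is written as macros so that the return statement and the nmi_restart label land inside do_nmi() itself. As a rough illustration only, here is the same control flow written as ordinary stand-alone C (do_nmi_model, handle_nmi, and the use of C11 atomics on a single global are stand-ins for the kernel's per-CPU code, not part of the patch):

#include <stdatomic.h>
#include <stdio.h>

enum nmi_state { NMI_NOT_RUNNING, NMI_EXECUTING, NMI_LATCHED };
static _Atomic int state = NMI_NOT_RUNNING;

static void handle_nmi(void)
{
	puts("handling one NMI");
}

/* Model of do_nmi() with the nesting pre/postprocess macros expanded. */
static void do_nmi_model(void)
{
	if (atomic_load(&state) != NMI_NOT_RUNNING) {
		/* Nested NMI: latch it and let the first one rerun the handler. */
		atomic_store(&state, NMI_LATCHED);
		return;
	}
restart:
	atomic_store(&state, NMI_EXECUTING);
	handle_nmi();

	/* If a nested NMI latched in the meantime, go around again. */
	int expected = NMI_EXECUTING;
	if (!atomic_compare_exchange_strong(&state, &expected, NMI_NOT_RUNNING))
		goto restart;
}

int main(void)
{
	do_nmi_model();
	return 0;
}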
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index fa1191fb679..482ec3af206 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -311,9 +311,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 			== NOTIFY_STOP)
 		return;
 
+	/*
+	 * Let others (NMI) know that the debug stack is in use
+	 * as we may switch to the interrupt stack.
+	 */
+	debug_stack_usage_inc();
 	preempt_conditional_sti(regs);
 	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
 	preempt_conditional_cli(regs);
+	debug_stack_usage_dec();
 }
 
 #ifdef CONFIG_X86_64
@@ -406,6 +412,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 							SIGTRAP) == NOTIFY_STOP)
 		return;
 
+	/*
+	 * Let others (NMI) know that the debug stack is in use
+	 * as we may switch to the interrupt stack.
+	 */
+	debug_stack_usage_inc();
+
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
@@ -413,6 +425,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 		handle_vm86_trap((struct kernel_vm86_regs *) regs,
 				error_code, 1);
 		preempt_conditional_cli(regs);
+		debug_stack_usage_dec();
 		return;
 	}
 
@@ -432,6 +445,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
 		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
+	debug_stack_usage_dec();
 
 	return;
 }
@@ -718,4 +732,10 @@ void __init trap_init(void)
 	cpu_init();
 
 	x86_init.irqs.trap_init();
+
+#ifdef CONFIG_X86_64
+	memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
+	set_nmi_gate(1, &debug);
+	set_nmi_gate(3, &int3);
+#endif
 }