diff options
author | Ingo Molnar <mingo@elte.hu> | 2012-01-07 07:25:49 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2012-01-07 07:25:49 -0500 |
commit | 03f70388c39cef5dfdc70ce5473ec31577a18e6b (patch) | |
tree | 4d097d0ae955baf9d5dbeac1fd376d637fd15a6d /arch | |
parent | 9e183426bfb52bb44bf3c443d6587e4d02478603 (diff) | |
parent | 42181186ad4db986fcaa40ca95c6e407e9e79372 (diff) |
Merge branch 'tip/x86/core-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace into perf/core
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/include/asm/debugreg.h | 22 | ||||
-rw-r--r-- | arch/x86/include/asm/desc.h | 12 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 24 | ||||
-rw-r--r-- | arch/x86/kernel/entry_64.S | 218 | ||||
-rw-r--r-- | arch/x86/kernel/head_64.S | 4 | ||||
-rw-r--r-- | arch/x86/kernel/nmi.c | 102 | ||||
-rw-r--r-- | arch/x86/kernel/traps.c | 20 |
7 files changed, 369 insertions, 33 deletions
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 078ad0caefc..b903d5ea394 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h | |||
@@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump); | |||
101 | 101 | ||
102 | extern void hw_breakpoint_restore(void); | 102 | extern void hw_breakpoint_restore(void); |
103 | 103 | ||
104 | #ifdef CONFIG_X86_64 | ||
105 | DECLARE_PER_CPU(int, debug_stack_usage); | ||
106 | static inline void debug_stack_usage_inc(void) | ||
107 | { | ||
108 | __get_cpu_var(debug_stack_usage)++; | ||
109 | } | ||
110 | static inline void debug_stack_usage_dec(void) | ||
111 | { | ||
112 | __get_cpu_var(debug_stack_usage)--; | ||
113 | } | ||
114 | int is_debug_stack(unsigned long addr); | ||
115 | void debug_stack_set_zero(void); | ||
116 | void debug_stack_reset(void); | ||
117 | #else /* !X86_64 */ | ||
118 | static inline int is_debug_stack(unsigned long addr) { return 0; } | ||
119 | static inline void debug_stack_set_zero(void) { } | ||
120 | static inline void debug_stack_reset(void) { } | ||
121 | static inline void debug_stack_usage_inc(void) { } | ||
122 | static inline void debug_stack_usage_dec(void) { } | ||
123 | #endif /* X86_64 */ | ||
124 | |||
125 | |||
104 | #endif /* __KERNEL__ */ | 126 | #endif /* __KERNEL__ */ |
105 | 127 | ||
106 | #endif /* _ASM_X86_DEBUGREG_H */ | 128 | #endif /* _ASM_X86_DEBUGREG_H */ |
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 41935fadfdf..e95822d683f 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h | |||
@@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in | |||
35 | 35 | ||
36 | extern struct desc_ptr idt_descr; | 36 | extern struct desc_ptr idt_descr; |
37 | extern gate_desc idt_table[]; | 37 | extern gate_desc idt_table[]; |
38 | extern struct desc_ptr nmi_idt_descr; | ||
39 | extern gate_desc nmi_idt_table[]; | ||
38 | 40 | ||
39 | struct gdt_page { | 41 | struct gdt_page { |
40 | struct desc_struct gdt[GDT_ENTRIES]; | 42 | struct desc_struct gdt[GDT_ENTRIES]; |
@@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) | |||
307 | desc->limit = (limit >> 16) & 0xf; | 309 | desc->limit = (limit >> 16) & 0xf; |
308 | } | 310 | } |
309 | 311 | ||
312 | #ifdef CONFIG_X86_64 | ||
313 | static inline void set_nmi_gate(int gate, void *addr) | ||
314 | { | ||
315 | gate_desc s; | ||
316 | |||
317 | pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); | ||
318 | write_idt_entry(nmi_idt_table, gate, &s); | ||
319 | } | ||
320 | #endif | ||
321 | |||
310 | static inline void _set_gate(int gate, unsigned type, void *addr, | 322 | static inline void _set_gate(int gate, unsigned type, void *addr, |
311 | unsigned dpl, unsigned ist, unsigned seg) | 323 | unsigned dpl, unsigned ist, unsigned seg) |
312 | { | 324 | { |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa003b13a83..266e4649b1d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -1026,6 +1026,8 @@ __setup("clearcpuid=", setup_disablecpuid); | |||
1026 | 1026 | ||
1027 | #ifdef CONFIG_X86_64 | 1027 | #ifdef CONFIG_X86_64 |
1028 | struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; | 1028 | struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; |
1029 | struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, | ||
1030 | (unsigned long) nmi_idt_table }; | ||
1029 | 1031 | ||
1030 | DEFINE_PER_CPU_FIRST(union irq_stack_union, | 1032 | DEFINE_PER_CPU_FIRST(union irq_stack_union, |
1031 | irq_stack_union) __aligned(PAGE_SIZE); | 1033 | irq_stack_union) __aligned(PAGE_SIZE); |
@@ -1090,6 +1092,26 @@ unsigned long kernel_eflags; | |||
1090 | */ | 1092 | */ |
1091 | DEFINE_PER_CPU(struct orig_ist, orig_ist); | 1093 | DEFINE_PER_CPU(struct orig_ist, orig_ist); |
1092 | 1094 | ||
1095 | static DEFINE_PER_CPU(unsigned long, debug_stack_addr); | ||
1096 | DEFINE_PER_CPU(int, debug_stack_usage); | ||
1097 | |||
1098 | int is_debug_stack(unsigned long addr) | ||
1099 | { | ||
1100 | return __get_cpu_var(debug_stack_usage) || | ||
1101 | (addr <= __get_cpu_var(debug_stack_addr) && | ||
1102 | addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); | ||
1103 | } | ||
1104 | |||
1105 | void debug_stack_set_zero(void) | ||
1106 | { | ||
1107 | load_idt((const struct desc_ptr *)&nmi_idt_descr); | ||
1108 | } | ||
1109 | |||
1110 | void debug_stack_reset(void) | ||
1111 | { | ||
1112 | load_idt((const struct desc_ptr *)&idt_descr); | ||
1113 | } | ||
1114 | |||
1093 | #else /* CONFIG_X86_64 */ | 1115 | #else /* CONFIG_X86_64 */ |
1094 | 1116 | ||
1095 | DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; | 1117 | DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; |
@@ -1208,6 +1230,8 @@ void __cpuinit cpu_init(void) | |||
1208 | estacks += exception_stack_sizes[v]; | 1230 | estacks += exception_stack_sizes[v]; |
1209 | oist->ist[v] = t->x86_tss.ist[v] = | 1231 | oist->ist[v] = t->x86_tss.ist[v] = |
1210 | (unsigned long)estacks; | 1232 | (unsigned long)estacks; |
1233 | if (v == DEBUG_STACK-1) | ||
1234 | per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; | ||
1211 | } | 1235 | } |
1212 | } | 1236 | } |
1213 | 1237 | ||
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index faf8d5e74b0..b62aa298df7 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -1475,62 +1475,214 @@ ENTRY(error_exit) | |||
1475 | CFI_ENDPROC | 1475 | CFI_ENDPROC |
1476 | END(error_exit) | 1476 | END(error_exit) |
1477 | 1477 | ||
1478 | /* | ||
1479 | * Test if a given stack is an NMI stack or not. | ||
1480 | */ | ||
1481 | .macro test_in_nmi reg stack nmi_ret normal_ret | ||
1482 | cmpq %\reg, \stack | ||
1483 | ja \normal_ret | ||
1484 | subq $EXCEPTION_STKSZ, %\reg | ||
1485 | cmpq %\reg, \stack | ||
1486 | jb \normal_ret | ||
1487 | jmp \nmi_ret | ||
1488 | .endm | ||
1478 | 1489 | ||
1479 | /* runs on exception stack */ | 1490 | /* runs on exception stack */ |
1480 | ENTRY(nmi) | 1491 | ENTRY(nmi) |
1481 | INTR_FRAME | 1492 | INTR_FRAME |
1482 | PARAVIRT_ADJUST_EXCEPTION_FRAME | 1493 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
1483 | pushq_cfi $-1 | 1494 | /* |
1495 | * We allow breakpoints in NMIs. If a breakpoint occurs, then | ||
1496 | * the iretq it performs will take us out of NMI context. | ||
1497 | * This means that we can have nested NMIs where the next | ||
1498 | * NMI is using the top of the stack of the previous NMI. We | ||
1499 | * can't let it execute because the nested NMI will corrupt the | ||
1500 | * stack of the previous NMI. NMI handlers are not re-entrant | ||
1501 | * anyway. | ||
1502 | * | ||
1503 | * To handle this case we do the following: | ||
1504 | * Check a special location on the stack that contains | ||
1505 | * a variable that is set when NMIs are executing. | ||
1506 | * The interrupted task's stack is also checked to see if it | ||
1507 | * is an NMI stack. | ||
1508 | * If the variable is not set and the stack is not the NMI | ||
1509 | * stack then: | ||
1510 | * o Set the special variable on the stack | ||
1511 | * o Copy the interrupt frame into a "saved" location on the stack | ||
1512 | * o Copy the interrupt frame into a "copy" location on the stack | ||
1513 | * o Continue processing the NMI | ||
1514 | * If the variable is set or the previous stack is the NMI stack: | ||
1515 | * o Modify the "copy" location to jump to repeat_nmi | ||
1516 | * o return back to the first NMI | ||
1517 | * | ||
1518 | * Now on exit of the first NMI, we first clear the stack variable. | ||
1519 | * The NMI stack will tell any nested NMIs at that point that it is | ||
1520 | * nested. Then we pop the stack normally with iret, and if there was | ||
1521 | * a nested NMI that updated the copy interrupt stack frame, a | ||
1522 | * jump will be made to the repeat_nmi code that will handle the second | ||
1523 | * NMI. | ||
1524 | */ | ||
1525 | |||
1526 | /* Use %rdx as our temp variable throughout */ | ||
1527 | pushq_cfi %rdx | ||
1528 | |||
1529 | /* | ||
1530 | * Check the special variable on the stack to see if NMIs are | ||
1531 | * executing. | ||
1532 | */ | ||
1533 | cmp $1, -8(%rsp) | ||
1534 | je nested_nmi | ||
1535 | |||
1536 | /* | ||
1537 | * Now test if the previous stack was an NMI stack. | ||
1538 | * We need the double check. We check the NMI stack to satisfy the | ||
1539 | * race when the first NMI clears the variable before returning. | ||
1540 | * We check the variable because the first NMI could be in a | ||
1541 | * breakpoint routine using a breakpoint stack. | ||
1542 | */ | ||
1543 | lea 6*8(%rsp), %rdx | ||
1544 | test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi | ||
1545 | |||
1546 | nested_nmi: | ||
1547 | /* | ||
1548 | * Do nothing if we interrupted the fixup in repeat_nmi. | ||
1549 | * It's about to repeat the NMI handler, so we are fine | ||
1550 | * with ignoring this one. | ||
1551 | */ | ||
1552 | movq $repeat_nmi, %rdx | ||
1553 | cmpq 8(%rsp), %rdx | ||
1554 | ja 1f | ||
1555 | movq $end_repeat_nmi, %rdx | ||
1556 | cmpq 8(%rsp), %rdx | ||
1557 | ja nested_nmi_out | ||
1558 | |||
1559 | 1: | ||
1560 | /* Set up the interrupted NMIs stack to jump to repeat_nmi */ | ||
1561 | leaq -6*8(%rsp), %rdx | ||
1562 | movq %rdx, %rsp | ||
1563 | CFI_ADJUST_CFA_OFFSET 6*8 | ||
1564 | pushq_cfi $__KERNEL_DS | ||
1565 | pushq_cfi %rdx | ||
1566 | pushfq_cfi | ||
1567 | pushq_cfi $__KERNEL_CS | ||
1568 | pushq_cfi $repeat_nmi | ||
1569 | |||
1570 | /* Put stack back */ | ||
1571 | addq $(11*8), %rsp | ||
1572 | CFI_ADJUST_CFA_OFFSET -11*8 | ||
1573 | |||
1574 | nested_nmi_out: | ||
1575 | popq_cfi %rdx | ||
1576 | |||
1577 | /* No need to check faults here */ | ||
1578 | INTERRUPT_RETURN | ||
1579 | |||
1580 | first_nmi: | ||
1581 | /* | ||
1582 | * Because nested NMIs will use the pushed location that we | ||
1583 | * stored in rdx, we must keep that space available. | ||
1584 | * Here's what our stack frame will look like: | ||
1585 | * +-------------------------+ | ||
1586 | * | original SS | | ||
1587 | * | original Return RSP | | ||
1588 | * | original RFLAGS | | ||
1589 | * | original CS | | ||
1590 | * | original RIP | | ||
1591 | * +-------------------------+ | ||
1592 | * | temp storage for rdx | | ||
1593 | * +-------------------------+ | ||
1594 | * | NMI executing variable | | ||
1595 | * +-------------------------+ | ||
1596 | * | Saved SS | | ||
1597 | * | Saved Return RSP | | ||
1598 | * | Saved RFLAGS | | ||
1599 | * | Saved CS | | ||
1600 | * | Saved RIP | | ||
1601 | * +-------------------------+ | ||
1602 | * | copied SS | | ||
1603 | * | copied Return RSP | | ||
1604 | * | copied RFLAGS | | ||
1605 | * | copied CS | | ||
1606 | * | copied RIP | | ||
1607 | * +-------------------------+ | ||
1608 | * | pt_regs | | ||
1609 | * +-------------------------+ | ||
1610 | * | ||
1611 | * The saved RIP is used to fix up the copied RIP that a nested | ||
1612 | * NMI may zero out. The original stack frame and the temp storage | ||
1613 | * is also used by nested NMIs and can not be trusted on exit. | ||
1614 | */ | ||
1615 | /* Set the NMI executing variable on the stack. */ | ||
1616 | pushq_cfi $1 | ||
1617 | |||
1618 | /* Copy the stack frame to the Saved frame */ | ||
1619 | .rept 5 | ||
1620 | pushq_cfi 6*8(%rsp) | ||
1621 | .endr | ||
1622 | |||
1623 | /* Make another copy, this one may be modified by nested NMIs */ | ||
1624 | .rept 5 | ||
1625 | pushq_cfi 4*8(%rsp) | ||
1626 | .endr | ||
1627 | |||
1628 | /* Do not pop rdx, nested NMIs will corrupt it */ | ||
1629 | movq 11*8(%rsp), %rdx | ||
1630 | |||
1631 | /* | ||
1632 | * Everything below this point can be preempted by a nested | ||
1633 | * NMI if the first NMI took an exception. Repeated NMIs | ||
1634 | * caused by an exception and nested NMI will start here, and | ||
1635 | * can still be preempted by another NMI. | ||
1636 | */ | ||
1637 | restart_nmi: | ||
1638 | pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ | ||
1484 | subq $ORIG_RAX-R15, %rsp | 1639 | subq $ORIG_RAX-R15, %rsp |
1485 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 | 1640 | CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 |
1641 | /* | ||
1642 | * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit | ||
1643 | * as we should not be calling schedule in NMI context. | ||
1644 | * Even with normal interrupts enabled. An NMI should not be | ||
1645 | * setting NEED_RESCHED or anything that normal interrupts and | ||
1646 | * exceptions might do. | ||
1647 | */ | ||
1486 | call save_paranoid | 1648 | call save_paranoid |
1487 | DEFAULT_FRAME 0 | 1649 | DEFAULT_FRAME 0 |
1488 | /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ | 1650 | /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ |
1489 | movq %rsp,%rdi | 1651 | movq %rsp,%rdi |
1490 | movq $-1,%rsi | 1652 | movq $-1,%rsi |
1491 | call do_nmi | 1653 | call do_nmi |
1492 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1493 | /* paranoidexit; without TRACE_IRQS_OFF */ | ||
1494 | /* ebx: no swapgs flag */ | ||
1495 | DISABLE_INTERRUPTS(CLBR_NONE) | ||
1496 | testl %ebx,%ebx /* swapgs needed? */ | 1654 | testl %ebx,%ebx /* swapgs needed? */ |
1497 | jnz nmi_restore | 1655 | jnz nmi_restore |
1498 | testl $3,CS(%rsp) | ||
1499 | jnz nmi_userspace | ||
1500 | nmi_swapgs: | 1656 | nmi_swapgs: |
1501 | SWAPGS_UNSAFE_STACK | 1657 | SWAPGS_UNSAFE_STACK |
1502 | nmi_restore: | 1658 | nmi_restore: |
1503 | RESTORE_ALL 8 | 1659 | RESTORE_ALL 8 |
1660 | /* Clear the NMI executing stack variable */ | ||
1661 | movq $0, 10*8(%rsp) | ||
1504 | jmp irq_return | 1662 | jmp irq_return |
1505 | nmi_userspace: | ||
1506 | GET_THREAD_INFO(%rcx) | ||
1507 | movl TI_flags(%rcx),%ebx | ||
1508 | andl $_TIF_WORK_MASK,%ebx | ||
1509 | jz nmi_swapgs | ||
1510 | movq %rsp,%rdi /* &pt_regs */ | ||
1511 | call sync_regs | ||
1512 | movq %rax,%rsp /* switch stack for scheduling */ | ||
1513 | testl $_TIF_NEED_RESCHED,%ebx | ||
1514 | jnz nmi_schedule | ||
1515 | movl %ebx,%edx /* arg3: thread flags */ | ||
1516 | ENABLE_INTERRUPTS(CLBR_NONE) | ||
1517 | xorl %esi,%esi /* arg2: oldset */ | ||
1518 | movq %rsp,%rdi /* arg1: &pt_regs */ | ||
1519 | call do_notify_resume | ||
1520 | DISABLE_INTERRUPTS(CLBR_NONE) | ||
1521 | jmp nmi_userspace | ||
1522 | nmi_schedule: | ||
1523 | ENABLE_INTERRUPTS(CLBR_ANY) | ||
1524 | call schedule | ||
1525 | DISABLE_INTERRUPTS(CLBR_ANY) | ||
1526 | jmp nmi_userspace | ||
1527 | CFI_ENDPROC | ||
1528 | #else | ||
1529 | jmp paranoid_exit | ||
1530 | CFI_ENDPROC | 1663 | CFI_ENDPROC |
1531 | #endif | ||
1532 | END(nmi) | 1664 | END(nmi) |
1533 | 1665 | ||
1666 | /* | ||
1667 | * If an NMI hit an iret because of an exception or breakpoint, | ||
1668 | * it can lose its NMI context, and a nested NMI may come in. | ||
1669 | * In that case, the nested NMI will change the preempted NMI's | ||
1670 | * stack to jump to here when it does the final iret. | ||
1671 | */ | ||
1672 | repeat_nmi: | ||
1673 | INTR_FRAME | ||
1674 | /* Update the stack variable to say we are still in NMI */ | ||
1675 | movq $1, 5*8(%rsp) | ||
1676 | |||
1677 | /* copy the saved stack back to copy stack */ | ||
1678 | .rept 5 | ||
1679 | pushq_cfi 4*8(%rsp) | ||
1680 | .endr | ||
1681 | |||
1682 | jmp restart_nmi | ||
1683 | CFI_ENDPROC | ||
1684 | end_repeat_nmi: | ||
1685 | |||
1534 | ENTRY(ignore_sysret) | 1686 | ENTRY(ignore_sysret) |
1535 | CFI_STARTPROC | 1687 | CFI_STARTPROC |
1536 | mov $-ENOSYS,%eax | 1688 | mov $-ENOSYS,%eax |
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e11e39478a4..40f4eb3766d 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S | |||
@@ -417,6 +417,10 @@ ENTRY(phys_base) | |||
417 | ENTRY(idt_table) | 417 | ENTRY(idt_table) |
418 | .skip IDT_ENTRIES * 16 | 418 | .skip IDT_ENTRIES * 16 |
419 | 419 | ||
420 | .align L1_CACHE_BYTES | ||
421 | ENTRY(nmi_idt_table) | ||
422 | .skip IDT_ENTRIES * 16 | ||
423 | |||
420 | __PAGE_ALIGNED_BSS | 424 | __PAGE_ALIGNED_BSS |
421 | .align PAGE_SIZE | 425 | .align PAGE_SIZE |
422 | ENTRY(empty_zero_page) | 426 | ENTRY(empty_zero_page) |
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e88f37b58dd..47acaf31916 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c | |||
@@ -405,9 +405,108 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) | |||
405 | unknown_nmi_error(reason, regs); | 405 | unknown_nmi_error(reason, regs); |
406 | } | 406 | } |
407 | 407 | ||
408 | /* | ||
409 | * An NMI can hit a breakpoint, which will cause it to lose its | ||
410 | * NMI context with the CPU when the breakpoint does an iret. | ||
411 | */ | ||
412 | #ifdef CONFIG_X86_32 | ||
413 | /* | ||
414 | * For i386, NMIs use the same stack as the kernel, and we can | ||
415 | * add a workaround to the iret problem in C. Simply have 3 states | ||
416 | * the NMI can be in. | ||
417 | * | ||
418 | * 1) not running | ||
419 | * 2) executing | ||
420 | * 3) latched | ||
421 | * | ||
422 | * When no NMI is in progress, it is in the "not running" state. | ||
423 | * When an NMI comes in, it goes into the "executing" state. | ||
424 | * Normally, if another NMI is triggered, it does not interrupt | ||
425 | * the running NMI and the HW will simply latch it so that when | ||
426 | * the first NMI finishes, it will restart the second NMI. | ||
427 | * (Note, the latch is binary, thus multiple NMIs triggering, | ||
428 | * when one is running, are ignored. Only one NMI is restarted.) | ||
429 | * | ||
430 | * If an NMI hits a breakpoint that executes an iret, another | ||
431 | * NMI can preempt it. We do not want to allow this new NMI | ||
432 | * to run, but we want to execute it when the first one finishes. | ||
433 | * We set the state to "latched", and the first NMI will perform | ||
434 | * a cmpxchg on the state, and if it doesn't successfully | ||
435 | * reset the state to "not running" it will restart the next | ||
436 | * NMI. | ||
437 | */ | ||
438 | enum nmi_states { | ||
439 | NMI_NOT_RUNNING, | ||
440 | NMI_EXECUTING, | ||
441 | NMI_LATCHED, | ||
442 | }; | ||
443 | static DEFINE_PER_CPU(enum nmi_states, nmi_state); | ||
444 | |||
445 | #define nmi_nesting_preprocess(regs) \ | ||
446 | do { \ | ||
447 | if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ | ||
448 | __get_cpu_var(nmi_state) = NMI_LATCHED; \ | ||
449 | return; \ | ||
450 | } \ | ||
451 | nmi_restart: \ | ||
452 | __get_cpu_var(nmi_state) = NMI_EXECUTING; \ | ||
453 | } while (0) | ||
454 | |||
455 | #define nmi_nesting_postprocess() \ | ||
456 | do { \ | ||
457 | if (cmpxchg(&__get_cpu_var(nmi_state), \ | ||
458 | NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ | ||
459 | goto nmi_restart; \ | ||
460 | } while (0) | ||
461 | #else /* x86_64 */ | ||
462 | /* | ||
463 | * In x86_64 things are a bit more difficult. This has the same problem | ||
464 | * where an NMI hitting a breakpoint that calls iret will remove the | ||
465 | * NMI context, allowing a nested NMI to enter. What makes this more | ||
466 | * difficult is that both NMIs and breakpoints have their own stack. | ||
467 | * When a new NMI or breakpoint is executed, the stack is set to a fixed | ||
468 | * point. If an NMI is nested, it will have its stack set at that same | ||
469 | * fixed address that the first NMI had, and will start corrupting the | ||
470 | * stack. This is handled in entry_64.S, but the same problem exists with | ||
471 | * the breakpoint stack. | ||
472 | * | ||
473 | * If a breakpoint is being processed, and the debug stack is being used, | ||
474 | * if an NMI comes in and also hits a breakpoint, the stack pointer | ||
475 | * will be set to the same fixed address as the breakpoint that was | ||
476 | * interrupted, causing that stack to be corrupted. To handle this case, | ||
477 | * check if the stack that was interrupted is the debug stack, and if | ||
478 | * so, change the IDT so that new breakpoints will use the current stack | ||
479 | * and not switch to the fixed address. On return of the NMI, switch back | ||
480 | * to the original IDT. | ||
481 | */ | ||
482 | static DEFINE_PER_CPU(int, update_debug_stack); | ||
483 | |||
484 | static inline void nmi_nesting_preprocess(struct pt_regs *regs) | ||
485 | { | ||
486 | /* | ||
487 | * If we interrupted a breakpoint, it is possible that | ||
488 | * the nmi handler will have breakpoints too. We need to | ||
489 | * change the IDT such that breakpoints that happen here | ||
490 | * continue to use the NMI stack. | ||
491 | */ | ||
492 | if (unlikely(is_debug_stack(regs->sp))) { | ||
493 | debug_stack_set_zero(); | ||
494 | __get_cpu_var(update_debug_stack) = 1; | ||
495 | } | ||
496 | } | ||
497 | |||
498 | static inline void nmi_nesting_postprocess(void) | ||
499 | { | ||
500 | if (unlikely(__get_cpu_var(update_debug_stack))) | ||
501 | debug_stack_reset(); | ||
502 | } | ||
503 | #endif | ||
504 | |||
408 | dotraplinkage notrace __kprobes void | 505 | dotraplinkage notrace __kprobes void |
409 | do_nmi(struct pt_regs *regs, long error_code) | 506 | do_nmi(struct pt_regs *regs, long error_code) |
410 | { | 507 | { |
508 | nmi_nesting_preprocess(regs); | ||
509 | |||
411 | nmi_enter(); | 510 | nmi_enter(); |
412 | 511 | ||
413 | inc_irq_stat(__nmi_count); | 512 | inc_irq_stat(__nmi_count); |
@@ -416,6 +515,9 @@ do_nmi(struct pt_regs *regs, long error_code) | |||
416 | default_do_nmi(regs); | 515 | default_do_nmi(regs); |
417 | 516 | ||
418 | nmi_exit(); | 517 | nmi_exit(); |
518 | |||
519 | /* On i386, may loop back to preprocess */ | ||
520 | nmi_nesting_postprocess(); | ||
419 | } | 521 | } |
420 | 522 | ||
421 | void stop_nmi(void) | 523 | void stop_nmi(void) |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a8e3eb83466..0072b38e3ea 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -316,9 +316,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) | |||
316 | return; | 316 | return; |
317 | #endif | 317 | #endif |
318 | 318 | ||
319 | /* | ||
320 | * Let others (NMI) know that the debug stack is in use | ||
321 | * as we may switch to the interrupt stack. | ||
322 | */ | ||
323 | debug_stack_usage_inc(); | ||
319 | preempt_conditional_sti(regs); | 324 | preempt_conditional_sti(regs); |
320 | do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); | 325 | do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); |
321 | preempt_conditional_cli(regs); | 326 | preempt_conditional_cli(regs); |
327 | debug_stack_usage_dec(); | ||
322 | } | 328 | } |
323 | 329 | ||
324 | #ifdef CONFIG_X86_64 | 330 | #ifdef CONFIG_X86_64 |
@@ -411,6 +417,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
411 | SIGTRAP) == NOTIFY_STOP) | 417 | SIGTRAP) == NOTIFY_STOP) |
412 | return; | 418 | return; |
413 | 419 | ||
420 | /* | ||
421 | * Let others (NMI) know that the debug stack is in use | ||
422 | * as we may switch to the interrupt stack. | ||
423 | */ | ||
424 | debug_stack_usage_inc(); | ||
425 | |||
414 | /* It's safe to allow irq's after DR6 has been saved */ | 426 | /* It's safe to allow irq's after DR6 has been saved */ |
415 | preempt_conditional_sti(regs); | 427 | preempt_conditional_sti(regs); |
416 | 428 | ||
@@ -418,6 +430,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
418 | handle_vm86_trap((struct kernel_vm86_regs *) regs, | 430 | handle_vm86_trap((struct kernel_vm86_regs *) regs, |
419 | error_code, 1); | 431 | error_code, 1); |
420 | preempt_conditional_cli(regs); | 432 | preempt_conditional_cli(regs); |
433 | debug_stack_usage_dec(); | ||
421 | return; | 434 | return; |
422 | } | 435 | } |
423 | 436 | ||
@@ -437,6 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) | |||
437 | if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) | 450 | if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) |
438 | send_sigtrap(tsk, regs, error_code, si_code); | 451 | send_sigtrap(tsk, regs, error_code, si_code); |
439 | preempt_conditional_cli(regs); | 452 | preempt_conditional_cli(regs); |
453 | debug_stack_usage_dec(); | ||
440 | 454 | ||
441 | return; | 455 | return; |
442 | } | 456 | } |
@@ -723,4 +737,10 @@ void __init trap_init(void) | |||
723 | cpu_init(); | 737 | cpu_init(); |
724 | 738 | ||
725 | x86_init.irqs.trap_init(); | 739 | x86_init.irqs.trap_init(); |
740 | |||
741 | #ifdef CONFIG_X86_64 | ||
742 | memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); | ||
743 | set_nmi_gate(1, &debug); | ||
744 | set_nmi_gate(3, &int3); | ||
745 | #endif | ||
726 | } | 746 | } |