 arch/Kconfig                     |   4
 arch/x86/Kconfig                 |   1
 arch/x86/Kconfig.debug           |  12
 arch/x86/entry/entry_64.S        | 299
 arch/x86/include/asm/fpu/types.h |  72
 arch/x86/include/asm/processor.h |  10
 arch/x86/kernel/fpu/init.c       |  40
 arch/x86/kernel/nmi.c            | 123
 arch/x86/kernel/process.c        |   2
 fs/proc/kcore.c                  |   4
 include/linux/sched.h            |  16
 kernel/fork.c                    |   7
 12 files changed, 376 insertions(+), 214 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index bec6666a3cc4..8a8ea7110de8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -221,6 +221,10 @@ config ARCH_TASK_STRUCT_ALLOCATOR
 config ARCH_THREAD_INFO_ALLOCATOR
 	bool
 
+# Select if arch wants to size task_struct dynamically via arch_task_struct_size:
+config ARCH_WANTS_DYNAMIC_TASK_STRUCT
+	bool
+
 config HAVE_REGS_AND_STACK_ACCESS_API
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3dbb7e7909ca..b3a1a5d77d92 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
 	select ARCH_USE_CMPXCHG_LOCKREF if X86_64
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_IPC_PARSE_VERSION if X86_32
 	select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index a15893d17c55..d8c0d3266173 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -297,6 +297,18 @@ config OPTIMIZE_INLINING
 
 	  If unsure, say N.
 
+config DEBUG_ENTRY
+	bool "Debug low-level entry code"
+	depends on DEBUG_KERNEL
+	---help---
+	  This option enables sanity checks in x86's low-level entry code.
+	  Some of these sanity checks may slow down kernel entries and
+	  exits or otherwise impact performance.
+
+	  This is currently used to help test NMI code.
+
+	  If unsure, say N.
+
 config DEBUG_NMI_SELFTEST
 	bool "NMI Selftest"
 	depends on DEBUG_KERNEL && X86_LOCAL_APIC
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3bb2c4302df1..8cb3e438f21e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1237,11 +1237,12 @@ ENTRY(nmi)
	 * If the variable is not set and the stack is not the NMI
	 * stack then:
	 *  o Set the special variable on the stack
-	 *  o Copy the interrupt frame into a "saved" location on the stack
-	 *  o Copy the interrupt frame into a "copy" location on the stack
+	 *  o Copy the interrupt frame into an "outermost" location on the
+	 *    stack
+	 *  o Copy the interrupt frame into an "iret" location on the stack
	 *  o Continue processing the NMI
	 * If the variable is set or the previous stack is the NMI stack:
-	 *  o Modify the "copy" location to jump to the repeate_nmi
+	 *  o Modify the "iret" location to jump to the repeat_nmi
	 *  o return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable
@@ -1250,31 +1251,151 @@ ENTRY(nmi)
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
+	 *
+	 * However, espfix prevents us from directly returning to userspace
+	 * with a single IRET instruction.  Similarly, IRET to user mode
+	 * can fault.  We therefore handle NMIs from user space like
+	 * other IST entries.
	 */

	/* Use %rdx as our temp variable throughout */
	pushq %rdx

+	testb $3, CS-RIP+8(%rsp)
+	jz .Lnmi_from_kernel
+
+	/*
+	 * NMI from user mode.  We need to run on the thread stack, but we
+	 * can't go through the normal entry paths: NMIs are masked, and
+	 * we don't want to enable interrupts, because then we'll end
+	 * up in an awkward situation in which IRQs are on but NMIs
+	 * are off.
+	 */
+
+	SWAPGS
+	cld
+	movq %rsp, %rdx
+	movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+	pushq 5*8(%rdx)		/* pt_regs->ss */
+	pushq 4*8(%rdx)		/* pt_regs->rsp */
+	pushq 3*8(%rdx)		/* pt_regs->flags */
+	pushq 2*8(%rdx)		/* pt_regs->cs */
+	pushq 1*8(%rdx)		/* pt_regs->rip */
+	pushq $-1		/* pt_regs->orig_ax */
+	pushq %rdi		/* pt_regs->di */
+	pushq %rsi		/* pt_regs->si */
+	pushq (%rdx)		/* pt_regs->dx */
+	pushq %rcx		/* pt_regs->cx */
+	pushq %rax		/* pt_regs->ax */
+	pushq %r8		/* pt_regs->r8 */
+	pushq %r9		/* pt_regs->r9 */
+	pushq %r10		/* pt_regs->r10 */
+	pushq %r11		/* pt_regs->r11 */
+	pushq %rbx		/* pt_regs->rbx */
+	pushq %rbp		/* pt_regs->rbp */
+	pushq %r12		/* pt_regs->r12 */
+	pushq %r13		/* pt_regs->r13 */
+	pushq %r14		/* pt_regs->r14 */
+	pushq %r15		/* pt_regs->r15 */
+
+	/*
+	 * At this point we no longer need to worry about stack damage
+	 * due to nesting -- we're on the normal thread stack and we're
+	 * done with the NMI stack.
+	 */
+
+	movq %rsp, %rdi
+	movq $-1, %rsi
+	call do_nmi
+
+	/*
+	 * Return back to user mode.  We must *not* do the normal exit
+	 * work, because we don't want to enable interrupts.  Fortunately,
+	 * do_nmi doesn't modify pt_regs.
+	 */
+	SWAPGS
+	jmp restore_c_regs_and_iret
+
+.Lnmi_from_kernel:
+	/*
+	 * Here's what our stack frame will look like:
+	 * +---------------------------------------------------------+
+	 * | original SS                                              |
+	 * | original Return RSP                                      |
+	 * | original RFLAGS                                          |
+	 * | original CS                                              |
+	 * | original RIP                                             |
+	 * +---------------------------------------------------------+
+	 * | temp storage for rdx                                     |
+	 * +---------------------------------------------------------+
+	 * | "NMI executing" variable                                 |
+	 * +---------------------------------------------------------+
+	 * | iret SS          } Copied from "outermost" frame         |
+	 * | iret Return RSP  } on each loop iteration; overwritten   |
+	 * | iret RFLAGS      } by a nested NMI to force another      |
+	 * | iret CS          } iteration if needed.                  |
+	 * | iret RIP         }                                       |
+	 * +---------------------------------------------------------+
+	 * | outermost SS          } initialized in first_nmi;        |
+	 * | outermost Return RSP  } will not be changed before       |
+	 * | outermost RFLAGS      } NMI processing is done.          |
+	 * | outermost CS          } Copied to "iret" frame on each   |
+	 * | outermost RIP         } iteration.                       |
+	 * +---------------------------------------------------------+
+	 * | pt_regs                                                  |
+	 * +---------------------------------------------------------+
+	 *
+	 * The "original" frame is used by hardware.  Before re-enabling
+	 * NMIs, we need to be done with it, and we need to leave enough
+	 * space for the asm code here.
+	 *
+	 * We return by executing IRET while RSP points to the "iret" frame.
+	 * That will either return for real or it will loop back into NMI
+	 * processing.
+	 *
+	 * The "outermost" frame is copied to the "iret" frame on each
+	 * iteration of the loop, so each iteration starts with the "iret"
+	 * frame pointing to the final return target.
+	 */
+
	/*
-	 * If %cs was not the kernel segment, then the NMI triggered in user
-	 * space, which means it is definitely not nested.
+	 * Determine whether we're a nested NMI.
+	 *
+	 * If we interrupted kernel code between repeat_nmi and
+	 * end_repeat_nmi, then we are a nested NMI.  We must not
+	 * modify the "iret" frame because it's being written by
+	 * the outer NMI.  That's okay; the outer NMI handler is
+	 * about to call do_nmi anyway, so we can just
+	 * resume the outer NMI.
	 */
-	cmpl $__KERNEL_CS, 16(%rsp)
-	jne first_nmi
+
+	movq $repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja 1f
+	movq $end_repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja nested_nmi_out
+1:

	/*
-	 * Check the special variable on the stack to see if NMIs are
-	 * executing.
+	 * Now check "NMI executing".  If it's set, then we're nested.
+	 * This will not detect if we interrupted an outer NMI just
+	 * before IRET.
	 */
	cmpl $1, -8(%rsp)
	je nested_nmi

	/*
-	 * Now test if the previous stack was an NMI stack.
-	 * We need the double check. We check the NMI stack to satisfy the
-	 * race when the first NMI clears the variable before returning.
-	 * We check the variable because the first NMI could be in a
-	 * breakpoint routine using a breakpoint stack.
+	 * Now test if the previous stack was an NMI stack.  This covers
+	 * the case where we interrupt an outer NMI after it clears
+	 * "NMI executing" but before IRET.  We need to be careful, though:
+	 * there is one case in which RSP could point to the NMI stack
+	 * despite there being no NMI active: naughty userspace controls
+	 * RSP at the very beginning of the SYSCALL targets.  We can
+	 * pull a fast one on naughty userspace, though: we program
+	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
+	 * if it controls the kernel's RSP.  We set DF before we clear
+	 * "NMI executing".
	 */
	lea 6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1286,25 +1407,20 @@ ENTRY(nmi)
	cmpq %rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb first_nmi
-	/* Ah, it is within the NMI stack, treat it as nested */
+
+	/* Ah, it is within the NMI stack. */
+
+	testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+	jz first_nmi	/* RSP was user controlled. */
+
+	/* This is a nested NMI. */

nested_nmi:
	/*
-	 * Do nothing if we interrupted the fixup in repeat_nmi.
-	 * It's about to repeat the NMI handler, so we are fine
-	 * with ignoring this one.
+	 * Modify the "iret" frame to point to repeat_nmi, forcing another
+	 * iteration of NMI handling.
	 */
-	movq $repeat_nmi, %rdx
-	cmpq 8(%rsp), %rdx
-	ja 1f
-	movq $end_repeat_nmi, %rdx
-	cmpq 8(%rsp), %rdx
-	ja nested_nmi_out
-
-1:
-	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
-	leaq -1*8(%rsp), %rdx
-	movq %rdx, %rsp
+	subq $8, %rsp
	leaq -10*8(%rsp), %rdx
	pushq $__KERNEL_DS
	pushq %rdx
@@ -1318,61 +1434,42 @@ nested_nmi:
nested_nmi_out:
	popq %rdx

-	/* No need to check faults here */
+	/* We are returning to kernel mode, so this cannot result in a fault. */
	INTERRUPT_RETURN

first_nmi:
-	/*
-	 * Because nested NMIs will use the pushed location that we
-	 * stored in rdx, we must keep that space available.
-	 * Here's what our stack frame will look like:
-	 * +-------------------------+
-	 * | original SS             |
-	 * | original Return RSP     |
-	 * | original RFLAGS         |
-	 * | original CS             |
-	 * | original RIP            |
-	 * +-------------------------+
-	 * | temp storage for rdx    |
-	 * +-------------------------+
-	 * | NMI executing variable  |
-	 * +-------------------------+
-	 * | copied SS               |
-	 * | copied Return RSP       |
-	 * | copied RFLAGS           |
-	 * | copied CS               |
-	 * | copied RIP              |
-	 * +-------------------------+
-	 * | Saved SS                |
-	 * | Saved Return RSP        |
-	 * | Saved RFLAGS            |
-	 * | Saved CS                |
-	 * | Saved RIP               |
-	 * +-------------------------+
-	 * | pt_regs                 |
-	 * +-------------------------+
-	 *
-	 * The saved stack frame is used to fix up the copied stack frame
-	 * that a nested NMI may change to make the interrupted NMI iret jump
-	 * to the repeat_nmi. The original stack frame and the temp storage
-	 * is also used by nested NMIs and can not be trusted on exit.
-	 */
-	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+	/* Restore rdx. */
	movq (%rsp), %rdx

-	/* Set the NMI executing variable on the stack. */
-	pushq $1
+	/* Make room for "NMI executing". */
+	pushq $0

-	/* Leave room for the "copied" frame */
+	/* Leave room for the "iret" frame */
	subq $(5*8), %rsp

-	/* Copy the stack frame to the Saved frame */
+	/* Copy the "original" frame to the "outermost" frame */
	.rept 5
	pushq 11*8(%rsp)
	.endr

	/* Everything up to here is safe from nested NMIs */

+#ifdef CONFIG_DEBUG_ENTRY
+	/*
+	 * For ease of testing, unmask NMIs right away.  Disabled by
+	 * default because IRET is very expensive.
+	 */
+	pushq $0		/* SS */
+	pushq %rsp		/* RSP (minus 8 because of the previous push) */
+	addq $8, (%rsp)		/* Fix up RSP */
+	pushfq			/* RFLAGS */
+	pushq $__KERNEL_CS	/* CS */
+	pushq $1f		/* RIP */
+	INTERRUPT_RETURN	/* continues at repeat_nmi below */
+1:
+#endif
+
+repeat_nmi:
	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
@@ -1381,16 +1478,20 @@ first_nmi:
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
+	 *
+	 * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
+	 * we're repeating an NMI, gsbase has the same value that it had on
+	 * the first iteration.  paranoid_entry will load the kernel
+	 * gsbase if needed before we call do_nmi.  "NMI executing"
+	 * is zero.
	 */
-repeat_nmi:
+	movq $1, 10*8(%rsp)		/* Set "NMI executing". */
+
	/*
-	 * Update the stack variable to say we are still in NMI (the update
-	 * is benign for the non-repeat case, where 1 was pushed just above
-	 * to this very stack slot).
+	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
+	 * here must not modify the "iret" frame while we're writing to
+	 * it or it will end up containing garbage.
	 */
-	movq $1, 10*8(%rsp)
-
-	/* Make another copy, this one may be modified by nested NMIs */
	addq $(10*8), %rsp
	.rept 5
	pushq -6*8(%rsp)
@@ -1399,9 +1500,9 @@ repeat_nmi:
end_repeat_nmi:

	/*
-	 * Everything below this point can be preempted by a nested
-	 * NMI if the first NMI took an exception and reset our iret stack
-	 * so that we repeat another NMI.
+	 * Everything below this point can be preempted by a nested NMI.
+	 * If this happens, then the inner NMI will change the "iret"
+	 * frame to point back to repeat_nmi.
	 */
	pushq $-1		/* ORIG_RAX: no syscall to restart */
	ALLOC_PT_GPREGS_ON_STACK
@@ -1415,28 +1516,11 @@ end_repeat_nmi:
	 */
	call paranoid_entry

-	/*
-	 * Save off the CR2 register. If we take a page fault in the NMI then
-	 * it could corrupt the CR2 value. If the NMI preempts a page fault
-	 * handler before it was able to read the CR2 register, and then the
-	 * NMI itself takes a page fault, the page fault that was preempted
-	 * will read the information from the NMI page fault and not the
-	 * origin fault. Save it off and restore it if it changes.
-	 * Use the r12 callee-saved register.
-	 */
-	movq %cr2, %r12
-
	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq %rsp, %rdi
	movq $-1, %rsi
	call do_nmi

-	/* Did the NMI take a page fault? Restore cr2 if it did */
-	movq %cr2, %rcx
-	cmpq %rcx, %r12
-	je 1f
-	movq %r12, %cr2
-1:
	testl %ebx, %ebx	/* swapgs needed? */
	jnz nmi_restore
nmi_swapgs:
@@ -1444,11 +1528,26 @@ nmi_swapgs:
nmi_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
-	/* Pop the extra iret frame at once */
+
+	/* Point RSP at the "iret" frame. */
	REMOVE_PT_GPREGS_FROM_STACK 6*8

-	/* Clear the NMI executing stack variable */
-	movq $0, 5*8(%rsp)
+	/*
+	 * Clear "NMI executing".  Set DF first so that we can easily
+	 * distinguish the remaining code between here and IRET from
+	 * the SYSCALL entry and exit paths.  On a native kernel, we
+	 * could just inspect RIP, but, on paravirt kernels,
+	 * INTERRUPT_RETURN can translate into a jump into a
+	 * hypercall page.
+	 */
+	std
+	movq $0, 5*8(%rsp)		/* clear "NMI executing" */
+
+	/*
+	 * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+	 * stack in a single instruction.  We are returning to kernel
+	 * mode, so this cannot result in a fault.
+	 */
	INTERRUPT_RETURN
END(nmi)

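For readers tracing the n*8(%rsp) offsets used above, the stack area described by the new comment block can be pictured as a C struct laid out from lower to higher addresses. This is an illustrative sketch only: no such struct exists in the kernel, the names are invented here, and the asm manipulates these slots purely through raw offsets.

    /* Illustrative sketch, not kernel code. */
    struct hw_frame {			/* hardware iret frame, RIP at the lowest address */
    	unsigned long rip, cs, rflags, rsp, ss;
    };

    struct nmi_stack_area {
    	/* struct pt_regs sits below this area */
    	struct hw_frame outermost;	/* written once in first_nmi */
    	struct hw_frame iret;		/* re-copied from 'outermost' each pass; a nested
    					 * NMI points its RIP at repeat_nmi to force a rerun */
    	unsigned long nmi_executing;	/* the "NMI executing" variable */
    	unsigned long saved_rdx;	/* temp storage for rdx */
    	struct hw_frame original;	/* frame pushed by the CPU on NMI delivery */
    };

For example, the movq $0, 5*8(%rsp) in nmi_restore clears the nmi_executing slot as seen with RSP pointing at iret.rip.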
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 0637826292de..c49c5173158e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -189,6 +189,7 @@ union fpregs_state {
	struct fxregs_state		fxsave;
	struct swregs_state		soft;
	struct xregs_state		xsave;
+	u8 __padding[PAGE_SIZE];
};

/*
@@ -198,40 +199,6 @@ union fpregs_state {
 */
struct fpu {
	/*
-	 * @state:
-	 *
-	 * In-memory copy of all FPU registers that we save/restore
-	 * over context switches. If the task is using the FPU then
-	 * the registers in the FPU are more recent than this state
-	 * copy. If the task context-switches away then they get
-	 * saved here and represent the FPU state.
-	 *
-	 * After context switches there may be a (short) time period
-	 * during which the in-FPU hardware registers are unchanged
-	 * and still perfectly match this state, if the tasks
-	 * scheduled afterwards are not using the FPU.
-	 *
-	 * This is the 'lazy restore' window of optimization, which
-	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
-	 *
-	 * We detect whether a subsequent task uses the FPU via setting
-	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
-	 *
-	 * During this window, if the task gets scheduled again, we
-	 * might be able to skip having to do a restore from this
-	 * memory buffer to the hardware registers - at the cost of
-	 * incurring the overhead of #NM fault traps.
-	 *
-	 * Note that on modern CPUs that support the XSAVEOPT (or other
-	 * optimized XSAVE instructions), we don't use #NM traps anymore,
-	 * as the hardware can track whether FPU registers need saving
-	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
-	 * logic, which unconditionally saves/restores all FPU state
-	 * across context switches. (if FPU state exists.)
-	 */
-	union fpregs_state		state;
-
-	/*
	 * @last_cpu:
	 *
	 * Records the last CPU on which this context was loaded into
@@ -288,6 +255,43 @@ struct fpu {
	 * deal with bursty apps that only use the FPU for a short time:
	 */
	unsigned char			counter;
+	/*
+	 * @state:
+	 *
+	 * In-memory copy of all FPU registers that we save/restore
+	 * over context switches. If the task is using the FPU then
+	 * the registers in the FPU are more recent than this state
+	 * copy. If the task context-switches away then they get
+	 * saved here and represent the FPU state.
+	 *
+	 * After context switches there may be a (short) time period
+	 * during which the in-FPU hardware registers are unchanged
+	 * and still perfectly match this state, if the tasks
+	 * scheduled afterwards are not using the FPU.
+	 *
+	 * This is the 'lazy restore' window of optimization, which
+	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+	 *
+	 * We detect whether a subsequent task uses the FPU via setting
+	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+	 *
+	 * During this window, if the task gets scheduled again, we
+	 * might be able to skip having to do a restore from this
+	 * memory buffer to the hardware registers - at the cost of
+	 * incurring the overhead of #NM fault traps.
+	 *
+	 * Note that on modern CPUs that support the XSAVEOPT (or other
+	 * optimized XSAVE instructions), we don't use #NM traps anymore,
+	 * as the hardware can track whether FPU registers need saving
+	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+	 * logic, which unconditionally saves/restores all FPU state
+	 * across context switches. (if FPU state exists.)
+	 */
+	union fpregs_state		state;
+	/*
+	 * WARNING: 'state' is dynamically-sized.  Do not put
+	 * anything after it here.
+	 */
};

#endif /* _ASM_X86_FPU_H */
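The WARNING comment is the crux of this hunk: once the architecture sizes task_struct at runtime, only part of the newly padded union is actually backed by the task allocation. A rough user-space sketch of the rule, with invented stand-in types (xstate_size here models the runtime-detected XSAVE area size):

    #include <stddef.h>

    /* Stand-ins for the kernel types; sizes are illustrative only. */
    union fpregs_state_like { unsigned char __padding[4096]; };

    struct fpu_like {
    	unsigned int last_cpu;
    	unsigned char fpregs_active;
    	unsigned char counter;
    	union fpregs_state_like state;	/* must stay last */
    };

    /* The allocation really ends here, which may be well short of
     * sizeof(struct fpu_like); a member placed after 'state' could land
     * beyond the allocated bytes or inside a larger XSAVE area. */
    static size_t fpu_alloc_size(size_t xstate_size)
    {
    	return offsetof(struct fpu_like, state) + xstate_size;
    }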
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519df0d5..944f1785ed0d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -390,9 +390,6 @@ struct thread_struct {
#endif
	unsigned long		gs;

-	/* Floating point and extended processor state */
-	struct fpu		fpu;
-
	/* Save middle states of ptrace breakpoints */
	struct perf_event	*ptrace_bps[HBP_NUM];
	/* Debug status used for traps, single steps, etc... */
@@ -418,6 +415,13 @@ struct thread_struct {
	unsigned long		iopl;
	/* Max allowed port in the bitmap, in bytes: */
	unsigned		io_bitmap_max;
+
+	/* Floating point and extended processor state */
+	struct fpu		fpu;
+	/*
+	 * WARNING: 'fpu' is dynamically-sized.  It *MUST* be at
+	 * the end.
+	 */
};

/*
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 32826791e675..0b39173dd971 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -4,6 +4,8 @@
#include <asm/fpu/internal.h>
#include <asm/tlbflush.h>

+#include <linux/sched.h>
+
/*
 * Initialize the TS bit in CR0 according to the style of context-switches
 * we are using:
@@ -136,6 +138,43 @@ static void __init fpu__init_system_generic(void)
unsigned int xstate_size;
EXPORT_SYMBOL_GPL(xstate_size);

+/* Enforce that 'MEMBER' is the last field of 'TYPE': */
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
+	BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
+
+/*
+ * We append the 'struct fpu' to the task_struct:
+ */
+static void __init fpu__init_task_struct_size(void)
+{
+	int task_size = sizeof(struct task_struct);
+
+	/*
+	 * Subtract off the static size of the register state.
+	 * It potentially has a bunch of padding.
+	 */
+	task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+
+	/*
+	 * Add back the dynamically-calculated register state
+	 * size.
+	 */
+	task_size += xstate_size;
+
+	/*
+	 * We dynamically size 'struct fpu', so we require that
+	 * it be at the end of 'thread_struct' and that
+	 * 'thread_struct' be at the end of 'task_struct'.  If
+	 * you hit a compile error here, check the structure to
+	 * see if something got added to the end.
+	 */
+	CHECK_MEMBER_AT_END_OF(struct fpu, state);
+	CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
+	CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
+
+	arch_task_struct_size = task_size;
+}
+
/*
 * Set up the xstate_size based on the legacy FPU context size.
 *
@@ -287,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
	fpu__init_system_generic();
	fpu__init_system_xstate_size_legacy();
	fpu__init_system_xstate();
+	fpu__init_task_struct_size();

	fpu__init_system_ctx_switch();
}
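CHECK_MEMBER_AT_END_OF() leans on offsetofend(), which the kernel headers of this era already provide; it expands to roughly offsetof() plus the member's size. A user-space demonstration of the same check follows; the names here are invented, and the equality test is valid because the kernel structs involved have no trailing padding after the checked member:

    #include <assert.h>
    #include <stddef.h>

    #define offsetofend_demo(TYPE, MEMBER) \
    	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

    struct demo { int a; char tail[16]; };	/* 'tail' is the last member */

    int main(void)
    {
    	/* Mirrors BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER)). */
    	assert(sizeof(struct demo) == offsetofend_demo(struct demo, tail));
    	return 0;
    }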
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index c3e985d1751c..d05bd2e2ee91 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_regs *regs)
NOKPROBE_SYMBOL(default_do_nmi);

/*
- * NMIs can hit breakpoints which will cause it to lose its
- * NMI context with the CPU when the breakpoint does an iret.
- */
-#ifdef CONFIG_X86_32
-/*
- * For i386, NMIs use the same stack as the kernel, and we can
- * add a workaround to the iret problem in C (preventing nested
- * NMIs if an NMI takes a trap). Simply have 3 states the NMI
- * can be in:
+ * NMIs can page fault or hit breakpoints which will cause it to lose
+ * its NMI context with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
+ * NMI processing.  On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
 *
 *  1) not running
 *  2) executing
@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi);
 * (Note, the latch is binary, thus multiple NMIs triggering,
 *  when one is running, are ignored. Only one NMI is restarted.)
 *
- * If an NMI hits a breakpoint that executes an iret, another
- * NMI can preempt it. We do not want to allow this new NMI
- * to run, but we want to execute it when the first one finishes.
- * We set the state to "latched", and the exit of the first NMI will
- * perform a dec_return, if the result is zero (NOT_RUNNING), then
- * it will simply exit the NMI handler. If not, the dec_return
- * would have set the state to NMI_EXECUTING (what we want it to
- * be when we are running). In this case, we simply jump back
- * to rerun the NMI handler again, and restart the 'latched' NMI.
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes. We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
 *
 * No trap (breakpoint or page fault) should be hit before nmi_restart,
 * thus there is no race between the first check of state for NOT_RUNNING
@@ -461,49 +460,36 @@ enum nmi_states {
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);

-#define nmi_nesting_preprocess(regs) \
-	do { \
-		if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
-			this_cpu_write(nmi_state, NMI_LATCHED); \
-			return; \
-		} \
-		this_cpu_write(nmi_state, NMI_EXECUTING); \
-		this_cpu_write(nmi_cr2, read_cr2()); \
-	} while (0); \
-	nmi_restart:
-
-#define nmi_nesting_postprocess() \
-	do { \
-		if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
-			write_cr2(this_cpu_read(nmi_cr2)); \
-		if (this_cpu_dec_return(nmi_state)) \
-			goto nmi_restart; \
-	} while (0)
-#else /* x86_64 */
+#ifdef CONFIG_X86_64
/*
- * In x86_64 things are a bit more difficult. This has the same problem
- * where an NMI hitting a breakpoint that calls iret will remove the
- * NMI context, allowing a nested NMI to enter. What makes this more
- * difficult is that both NMIs and breakpoints have their own stack.
- * When a new NMI or breakpoint is executed, the stack is set to a fixed
- * point. If an NMI is nested, it will have its stack set at that same
- * fixed address that the first NMI had, and will start corrupting the
- * stack. This is handled in entry_64.S, but the same problem exists with
- * the breakpoint stack.
+ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint.  Without
+ * some care, the inner breakpoint will clobber the outer breakpoint's
+ * stack.
 *
- * If a breakpoint is being processed, and the debug stack is being used,
- * if an NMI comes in and also hits a breakpoint, the stack pointer
- * will be set to the same fixed address as the breakpoint that was
- * interrupted, causing that stack to be corrupted. To handle this case,
- * check if the stack that was interrupted is the debug stack, and if
- * so, change the IDT so that new breakpoints will use the current stack
- * and not switch to the fixed address. On return of the NMI, switch back
- * to the original IDT.
+ * If a breakpoint is being processed, and the debug stack is being
+ * used, if an NMI comes in and also hits a breakpoint, the stack
+ * pointer will be set to the same fixed address as the breakpoint that
+ * was interrupted, causing that stack to be corrupted. To handle this
+ * case, check if the stack that was interrupted is the debug stack, and
+ * if so, change the IDT so that new breakpoints will use the current
+ * stack and not switch to the fixed address. On return of the NMI,
+ * switch back to the original IDT.
 */
static DEFINE_PER_CPU(int, update_debug_stack);
+#endif

-static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+dotraplinkage notrace void
+do_nmi(struct pt_regs *regs, long error_code)
{
+	if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+		this_cpu_write(nmi_state, NMI_LATCHED);
+		return;
+	}
+	this_cpu_write(nmi_state, NMI_EXECUTING);
+	this_cpu_write(nmi_cr2, read_cr2());
+nmi_restart:
+
+#ifdef CONFIG_X86_64
	/*
	 * If we interrupted a breakpoint, it is possible that
	 * the nmi handler will have breakpoints too. We need to
@@ -514,22 +500,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
		debug_stack_set_zero();
		this_cpu_write(update_debug_stack, 1);
	}
-}
-
-static inline void nmi_nesting_postprocess(void)
-{
-	if (unlikely(this_cpu_read(update_debug_stack))) {
-		debug_stack_reset();
-		this_cpu_write(update_debug_stack, 0);
-	}
-}
#endif

-dotraplinkage notrace void
-do_nmi(struct pt_regs *regs, long error_code)
-{
-	nmi_nesting_preprocess(regs);
-
	nmi_enter();

	inc_irq_stat(__nmi_count);
@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_code)

	nmi_exit();

-	/* On i386, may loop back to preprocess */
-	nmi_nesting_postprocess();
+#ifdef CONFIG_X86_64
+	if (unlikely(this_cpu_read(update_debug_stack))) {
+		debug_stack_reset();
+		this_cpu_write(update_debug_stack, 0);
+	}
+#endif
+
+	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+		write_cr2(this_cpu_read(nmi_cr2));
+	if (this_cpu_dec_return(nmi_state))
+		goto nmi_restart;
}
NOKPROBE_SYMBOL(do_nmi);

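The latch logic that moves into do_nmi() relies on the numeric values of the state enum declared a few lines above this hunk, which the patch leaves untouched; it looks roughly like:

    enum nmi_states {
    	NMI_NOT_RUNNING = 0,
    	NMI_EXECUTING,
    	NMI_LATCHED,
    };

Because NMI_EXECUTING is 1 and NMI_LATCHED is 2, this_cpu_dec_return(nmi_state) yields 0 when no further NMI was latched (so do_nmi() simply returns) and 1, i.e. NMI_EXECUTING, when one was, which is exactly the state wanted while jumping back to nmi_restart.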
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9cad694ed7c4..397688beed4b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
-	*dst = *src;
+	memcpy(dst, src, arch_task_struct_size);

	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}
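The one-line switch to memcpy() matters because structure assignment copies only sizeof(struct task_struct) bytes and would silently drop the XSAVE area that now lives past the end of the struct. A user-space analogy of the same trap, with invented types:

    #include <stdlib.h>
    #include <string.h>

    struct obj {
    	int hdr;
    	char tail[];		/* dynamically sized, like the FPU state */
    };

    static struct obj *dup_obj(const struct obj *src, size_t full_size)
    {
    	struct obj *dst = malloc(full_size);

    	if (dst)
    		memcpy(dst, src, full_size);	/* '*dst = *src' would copy
    						 * only sizeof(struct obj) */
    	return dst;
    }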
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 91a4e6426321..92e6726f6e37 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
			roundup(sizeof(CORE_STR), 4)) +
			roundup(sizeof(struct elf_prstatus), 4) +
			roundup(sizeof(struct elf_prpsinfo), 4) +
-			roundup(sizeof(struct task_struct), 4);
+			roundup(arch_task_struct_size, 4);
	*elf_buflen = PAGE_ALIGN(*elf_buflen);
	return size + *elf_buflen;
}
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
	/* set up the task structure */
	notes[2].name	= CORE_STR;
	notes[2].type	= NT_TASKSTRUCT;
-	notes[2].datasz	= sizeof(struct task_struct);
+	notes[2].datasz	= arch_task_struct_size;
	notes[2].data	= current;

	nhdr->p_filesz	+= notesize(&notes[2]);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ae21f1591615..04b5ada460b4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1522,8 +1522,6 @@ struct task_struct {
/* hung task detection */
	unsigned long last_switch_count;
#endif
-/* CPU-specific state of this task */
-	struct thread_struct thread;
/* filesystem information */
	struct fs_struct *fs;
/* open file information */
@@ -1778,8 +1776,22 @@ struct task_struct {
	unsigned long task_state_change;
#endif
	int pagefault_disabled;
+/* CPU-specific state of this task */
+	struct thread_struct thread;
+	/*
+	 * WARNING: on x86, 'thread_struct' contains a variable-sized
+	 * structure.  It *MUST* be at the end of 'task_struct'.
+	 *
+	 * Do not put anything below here!
+	 */
};

+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+extern int arch_task_struct_size __read_mostly;
+#else
+# define arch_task_struct_size (sizeof(struct task_struct))
+#endif
+
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)

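The #else fallback is what lets generic code such as fork.c and kcore.c use arch_task_struct_size unconditionally: architectures that do not select the option get a plain compile-time constant. A sketch of the pattern with hypothetical names (CONFIG_ARCH_WANTS_DYNAMIC_WIDGET and 'widget' stand in for the real option and task_struct):

    #include <stdio.h>

    struct widget { int x; };

    #ifdef CONFIG_ARCH_WANTS_DYNAMIC_WIDGET
    extern int arch_widget_size;		/* set by the architecture during early boot */
    #else
    # define arch_widget_size ((int)sizeof(struct widget))
    #endif

    int main(void)
    {
    	/* Callers never need an #ifdef of their own. */
    	printf("allocating %d bytes per widget\n", arch_widget_size);
    	return 0;
    }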
diff --git a/kernel/fork.c b/kernel/fork.c
index 1bfefc6f96a4..dbd9b8d7b7cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -287,6 +287,11 @@ static void set_max_threads(unsigned int max_threads_suggested)
	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}

+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+/* Initialized by the architecture: */
+int arch_task_struct_size __read_mostly;
+#endif
+
void __init fork_init(void)
{
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
@@ -295,7 +300,7 @@ void __init fork_init(void)
#endif
	/* create a slab on which task_structs can be allocated */
	task_struct_cachep =
-		kmem_cache_create("task_struct", sizeof(struct task_struct),
+		kmem_cache_create("task_struct", arch_task_struct_size,
			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
#endif

