1 files changed, 199 insertions, 100 deletions
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3bb2c4302df1..8cb3e438f21e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1237,11 +1237,12 @@ ENTRY(nmi)
         *  If the variable is not set and the stack is not the NMI
         *  stack then:
         *    o Set the special variable on the stack
-         *    o Copy the interrupt frame into a "saved" location on the stack
+         *    o Copy the interrupt frame into an "outermost" location on the
-         *    o Copy the interrupt frame into a "copy" location on the stack
+         *      stack
+         *    o Copy the interrupt frame into an "iret" location on the stack
         *    o Continue processing the NMI
         *  If the variable is set or the previous stack is the NMI stack:
-         *    o Modify the "copy" location to jump to the repeate_nmi
+         *    o Modify the "iret" location to jump to the repeat_nmi
         *    o return back to the first NMI
         *
         * Now on exit of the first NMI, we first clear the stack variable
@@ -1250,31 +1251,151 @@ ENTRY(nmi)
         * a nested NMI that updated the copy interrupt stack frame, a
         * jump will be made to the repeat_nmi code that will handle the second
         * NMI.
+         *
+         * However, espfix prevents us from directly returning to userspace
+         * with a single IRET instruction.  Similarly, IRET to user mode
+         * can fault.  We therefore handle NMIs from user space like
+         * other IST entries.
         */
        /* Use %rdx as our temp variable throughout */
        pushq   %rdx
+        testb   $3, CS-RIP+8(%rsp)
+        jz      .Lnmi_from_kernel
+        /*
+         * NMI from user mode.  We need to run on the thread stack, but we
+         * can't go through the normal entry paths: NMIs are masked, and
+         * we don't want to enable interrupts, because then we'll end
+         * up in an awkward situation in which IRQs are on but NMIs
+         * are off.
+         */
+        SWAPGS
+        cld
+        movq    %rsp, %rdx
+        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+        pushq   5*8(%rdx)       /* pt_regs->ss */
+        pushq   4*8(%rdx)       /* pt_regs->rsp */
+        pushq   3*8(%rdx)       /* pt_regs->flags */
+        pushq   2*8(%rdx)       /* pt_regs->cs */
+        pushq   1*8(%rdx)       /* pt_regs->rip */
+        pushq   $-1             /* pt_regs->orig_ax */
+        pushq   %rdi            /* pt_regs->di */
+        pushq   %rsi            /* pt_regs->si */
+        pushq   (%rdx)          /* pt_regs->dx */
+        pushq   %rcx            /* pt_regs->cx */
+        pushq   %rax            /* pt_regs->ax */
+        pushq   %r8             /* pt_regs->r8 */
+        pushq   %r9             /* pt_regs->r9 */
+        pushq   %r10            /* pt_regs->r10 */
+        pushq   %r11            /* pt_regs->r11 */
+        pushq   %rbx            /* pt_regs->rbx */
+        pushq   %rbp            /* pt_regs->rbp */
+        pushq   %r12            /* pt_regs->r12 */
+        pushq   %r13            /* pt_regs->r13 */
+        pushq   %r14            /* pt_regs->r14 */
+        pushq   %r15            /* pt_regs->r15 */
+        /*
+         * At this point we no longer need to worry about stack damage
+         * due to nesting -- we're on the normal thread stack and we're
+         * done with the NMI stack.
+         */
+        movq    %rsp, %rdi
+        movq    $-1, %rsi
+        call    do_nmi
+        /*
+         * Return back to user mode.  We must *not* do the normal exit
+         * work, because we don't want to enable interrupts.  Fortunately,
+         * do_nmi doesn't modify pt_regs.
+         */
+        SWAPGS
+        jmp     restore_c_regs_and_iret
+.Lnmi_from_kernel:
+        /*
+         * Here's what our stack frame will look like:
+         * +---------------------------------------------------------+
+         * | original SS                                             |
+         * | original Return RSP                                     |
+         * | original RFLAGS                                         |
+         * | original CS                                             |
+         * | original RIP                                            |
+         * +---------------------------------------------------------+
+         * | temp storage for rdx                                    |
+         * +---------------------------------------------------------+
+         * | "NMI executing" variable                                |
+         * +---------------------------------------------------------+
+         * | iret SS          } Copied from "outermost" frame        |
+         * | iret Return RSP  } on each loop iteration; overwritten  |
+         * | iret RFLAGS      } by a nested NMI to force another     |
+         * | iret CS          } iteration if needed.                 |
+         * | iret RIP         }                                      |
+         * +---------------------------------------------------------+
+         * | outermost SS          } initialized in first_nmi;       |
+         * | outermost Return RSP  } will not be changed before      |
+         * | outermost RFLAGS      } NMI processing is done.         |
+         * | outermost CS          } Copied to "iret" frame on each  |
+         * | outermost RIP         } iteration.                      |
+         * +---------------------------------------------------------+
+         * | pt_regs                                                 |
+         * +---------------------------------------------------------+
+         *
+         * The "original" frame is used by hardware.  Before re-enabling
+         * NMIs, we need to be done with it, and we need to leave enough
+         * space for the asm code here.
+         *
+         * We return by executing IRET while RSP points to the "iret" frame.
+         * That will either return for real or it will loop back into NMI
+         * processing.
+         *
+         * The "outermost" frame is copied to the "iret" frame on each
+         * iteration of the loop, so each iteration starts with the "iret"
+         * frame pointing to the final return target.
+         */
        /*
-         * If %cs was not the kernel segment, then the NMI triggered in user
+         * Determine whether we're a nested NMI.
-         * space, which means it is definitely not nested.
+         *
+         * If we interrupted kernel code between repeat_nmi and
+         * end_repeat_nmi, then we are a nested NMI.  We must not
+         * modify the "iret" frame because it's being written by
+         * the outer NMI.  That's okay; the outer NMI handler is
+         * about to call do_nmi anyway, so we can just
+         * resume the outer NMI.
         */
-        cmpl    $__KERNEL_CS, 16(%rsp)
-        jne     first_nmi
+        movq    $repeat_nmi, %rdx
+        cmpq    8(%rsp), %rdx
+        ja      1f
+        movq    $end_repeat_nmi, %rdx
+        cmpq    8(%rsp), %rdx
+        ja      nested_nmi_out
+1:
        /*
-         * Check the special variable on the stack to see if NMIs are
+         * Now check "NMI executing".  If it's set, then we're nested.
-         * executing.
+         * This will not detect if we interrupted an outer NMI just
+         * before IRET.
         */
        cmpl    $1, -8(%rsp)
        je      nested_nmi
        /*
-         * Now test if the previous stack was an NMI stack.
+         * Now test if the previous stack was an NMI stack.  This covers
-         * We need the double check. We check the NMI stack to satisfy the
+         * the case where we interrupt an outer NMI after it clears
-         * race when the first NMI clears the variable before returning.
+         * "NMI executing" but before IRET.  We need to be careful, though:
-         * We check the variable because the first NMI could be in a
+         * there is one case in which RSP could point to the NMI stack
-         * breakpoint routine using a breakpoint stack.
+         * despite there being no NMI active: naughty userspace controls
+         * RSP at the very beginning of the SYSCALL targets.  We can
+         * pull a fast one on naughty userspace, though: we program
+         * SYSCALL to mask DF, so userspace cannot cause DF to be set
+         * if it controls the kernel's RSP.  We set DF before we clear
+         * "NMI executing".
         */
        lea     6*8(%rsp), %rdx
        /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1286,25 +1407,20 @@ ENTRY(nmi)
        cmpq    %rdx, 4*8(%rsp)
        /* If it is below the NMI stack, it is a normal NMI */
        jb      first_nmi
-        /* Ah, it is within the NMI stack, treat it as nested */
+        /* Ah, it is within the NMI stack. */
+        testb   $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+        jz      first_nmi       /* RSP was user controlled. */
+        /* This is a nested NMI. */
 nested_nmi:
        /*
-         * Do nothing if we interrupted the fixup in repeat_nmi.
+         * Modify the "iret" frame to point to repeat_nmi, forcing another
-         * It's about to repeat the NMI handler, so we are fine
+         * iteration of NMI handling.
-         * with ignoring this one.
         */
-        movq    $repeat_nmi, %rdx
+        subq    $8, %rsp
-        cmpq    8(%rsp), %rdx
-        ja      1f
-        movq    $end_repeat_nmi, %rdx
-        cmpq    8(%rsp), %rdx
-        ja      nested_nmi_out
-1:
-        /* Set up the interrupted NMIs stack to jump to repeat_nmi */
-        leaq    -1*8(%rsp), %rdx
-        movq    %rdx, %rsp
        leaq    -10*8(%rsp), %rdx
        pushq   $__KERNEL_DS
        pushq   %rdx
@@ -1318,61 +1434,42 @@ nested_nmi:
 nested_nmi_out:
        popq    %rdx
-        /* No need to check faults here */
+        /* We are returning to kernel mode, so this cannot result in a fault. */
        INTERRUPT_RETURN
 first_nmi:
-        /*
+        /* Restore rdx. */
-         * Because nested NMIs will use the pushed location that we
-         * stored in rdx, we must keep that space available.
-         * Here's what our stack frame will look like:
-         * +-------------------------+
-         * | original SS             |
-         * | original Return RSP     |
-         * | original RFLAGS         |
-         * | original CS             |
-         * | original RIP            |
-         * +-------------------------+
-         * | temp storage for rdx    |
-         * +-------------------------+
-         * | NMI executing variable  |
-         * +-------------------------+
-         * | copied SS               |
-         * | copied Return RSP       |
-         * | copied RFLAGS           |
-         * | copied CS               |
-         * | copied RIP              |
-         * +-------------------------+
-         * | Saved SS                |
-         * | Saved Return RSP        |
-         * | Saved RFLAGS            |
-         * | Saved CS                |
-         * | Saved RIP               |
-         * +-------------------------+
-         * | pt_regs                 |
-         * +-------------------------+
-         *
-         * The saved stack frame is used to fix up the copied stack frame
-         * that a nested NMI may change to make the interrupted NMI iret jump
-         * to the repeat_nmi. The original stack frame and the temp storage
-         * is also used by nested NMIs and can not be trusted on exit.
-         */
-        /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
        movq    (%rsp), %rdx
-        /* Set the NMI executing variable on the stack. */
+        /* Make room for "NMI executing". */
-        pushq   $1
+        pushq   $0
-        /* Leave room for the "copied" frame */
+        /* Leave room for the "iret" frame */
        subq    $(5*8), %rsp
-        /* Copy the stack frame to the Saved frame */
+        /* Copy the "original" frame to the "outermost" frame */
        .rept 5
        pushq   11*8(%rsp)
        .endr
        /* Everything up to here is safe from nested NMIs */
+#ifdef CONFIG_DEBUG_ENTRY
+        /*
+         * For ease of testing, unmask NMIs right away.  Disabled by
+         * default because IRET is very expensive.
+         */
+        pushq   $0              /* SS */
+        pushq   %rsp            /* RSP (minus 8 because of the previous push) */
+        addq    $8, (%rsp)      /* Fix up RSP */
+        pushfq                  /* RFLAGS */
+        pushq   $__KERNEL_CS    /* CS */
+        pushq   $1f             /* RIP */
+        INTERRUPT_RETURN        /* continues at repeat_nmi below */
+1:
+#endif
+repeat_nmi:
        /*
         * If there was a nested NMI, the first NMI's iret will return
         * here. But NMIs are still enabled and we can take another
@@ -1381,16 +1478,20 @@ first_nmi:
         * it will just return, as we are about to repeat an NMI anyway.
         * This makes it safe to copy to the stack frame that a nested
         * NMI will update.
+         *
+         * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
+         * we're repeating an NMI, gsbase has the same value that it had on
+         * the first iteration.  paranoid_entry will load the kernel
+         * gsbase if needed before we call do_nmi.  "NMI executing"
+         * is zero.
         */
-repeat_nmi:
+        movq    $1, 10*8(%rsp)          /* Set "NMI executing". */
        /*
-         * Update the stack variable to say we are still in NMI (the update
+         * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
-         * is benign for the non-repeat case, where 1 was pushed just above
+         * here must not modify the "iret" frame while we're writing to
-         * to this very stack slot).
+         * it or it will end up containing garbage.
         */
-        movq    $1, 10*8(%rsp)
-        /* Make another copy, this one may be modified by nested NMIs */
        addq    $(10*8), %rsp
        .rept 5
        pushq   -6*8(%rsp)
@@ -1399,9 +1500,9 @@ repeat_nmi:
 end_repeat_nmi:
        /*
-         * Everything below this point can be preempted by a nested
+         * Everything below this point can be preempted by a nested NMI.
-         * NMI if the first NMI took an exception and reset our iret stack
+         * If this happens, then the inner NMI will change the "iret"
-         * so that we repeat another NMI.
+         * frame to point back to repeat_nmi.
         */
        pushq   $-1                             /* ORIG_RAX: no syscall to restart */
        ALLOC_PT_GPREGS_ON_STACK
@@ -1415,28 +1516,11 @@ end_repeat_nmi:
         */
        call    paranoid_entry
-        /*
-         * Save off the CR2 register. If we take a page fault in the NMI then
-         * it could corrupt the CR2 value. If the NMI preempts a page fault
-         * handler before it was able to read the CR2 register, and then the
-         * NMI itself takes a page fault, the page fault that was preempted
-         * will read the information from the NMI page fault and not the
-         * origin fault. Save it off and restore it if it changes.
-         * Use the r12 callee-saved register.
-         */
-        movq    %cr2, %r12
        /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
        movq    %rsp, %rdi
        movq    $-1, %rsi
        call    do_nmi
-        /* Did the NMI take a page fault? Restore cr2 if it did */
-        movq    %cr2, %rcx
-        cmpq    %rcx, %r12
-        je      1f
-        movq    %r12, %cr2
-1:
        testl   %ebx, %ebx                      /* swapgs needed? */
        jnz     nmi_restore
 nmi_swapgs:
@@ -1444,11 +1528,26 @@ nmi_swapgs:
 nmi_restore:
        RESTORE_EXTRA_REGS
        RESTORE_C_REGS
-        /* Pop the extra iret frame at once */
+        /* Point RSP at the "iret" frame. */
        REMOVE_PT_GPREGS_FROM_STACK 6*8
-        /* Clear the NMI executing stack variable */
+        /*
-        movq    $0, 5*8(%rsp)
+         * Clear "NMI executing".  Set DF first so that we can easily
+         * distinguish the remaining code between here and IRET from
+         * the SYSCALL entry and exit paths.  On a native kernel, we
+         * could just inspect RIP, but, on paravirt kernels,
+         * INTERRUPT_RETURN can translate into a jump into a
+         * hypercall page.
+         */
+        std
+        movq    $0, 5*8(%rsp)           /* clear "NMI executing" */
+        /*
+         * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+         * stack in a single instruction.  We are returning to kernel
+         * mode, so this cannot result in a fault.
+         */
        INTERRUPT_RETURN
 END(nmi)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3bb2c4302df1..8cb3e438f21e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S
@@ -1237,11 +1237,12 @@ ENTRY(nmi)
1237	* If the variable is not set and the stack is not the NMI	1237	* If the variable is not set and the stack is not the NMI
1238	* stack then:	1238	* stack then:
1239	* o Set the special variable on the stack	1239	* o Set the special variable on the stack
1240	* o Copy the interrupt frame into a "saved" location on the stack	1240	* o Copy the interrupt frame into an "outermost" location on the
1241	* o Copy the interrupt frame into a "copy" location on the stack	1241	* stack
		1242	* o Copy the interrupt frame into an "iret" location on the stack
1242	* o Continue processing the NMI	1243	* o Continue processing the NMI
1243	* If the variable is set or the previous stack is the NMI stack:	1244	* If the variable is set or the previous stack is the NMI stack:
1244	* o Modify the "copy" location to jump to the repeate_nmi	1245	* o Modify the "iret" location to jump to the repeat_nmi
1245	* o return back to the first NMI	1246	* o return back to the first NMI
1246	*	1247	*
1247	* Now on exit of the first NMI, we first clear the stack variable	1248	* Now on exit of the first NMI, we first clear the stack variable
@@ -1250,31 +1251,151 @@ ENTRY(nmi)
1250	* a nested NMI that updated the copy interrupt stack frame, a	1251	* a nested NMI that updated the copy interrupt stack frame, a
1251	* jump will be made to the repeat_nmi code that will handle the second	1252	* jump will be made to the repeat_nmi code that will handle the second
1252	* NMI.	1253	* NMI.
		1254	*
		1255	* However, espfix prevents us from directly returning to userspace
		1256	* with a single IRET instruction. Similarly, IRET to user mode
		1257	* can fault. We therefore handle NMIs from user space like
		1258	* other IST entries.
1253	*/	1259	*/
1254		1260
1255	/* Use %rdx as our temp variable throughout */	1261	/* Use %rdx as our temp variable throughout */
1256	pushq %rdx	1262	pushq %rdx
1257		1263
		1264	testb $3, CS-RIP+8(%rsp)
		1265	jz .Lnmi_from_kernel
		1266
		1267	/*
		1268	* NMI from user mode. We need to run on the thread stack, but we
		1269	* can't go through the normal entry paths: NMIs are masked, and
		1270	* we don't want to enable interrupts, because then we'll end
		1271	* up in an awkward situation in which IRQs are on but NMIs
		1272	* are off.
		1273	*/
		1274
		1275	SWAPGS
		1276	cld
		1277	movq %rsp, %rdx
		1278	movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
		1279	pushq 5*8(%rdx) /* pt_regs->ss */
		1280	pushq 4*8(%rdx) /* pt_regs->rsp */
		1281	pushq 3*8(%rdx) /* pt_regs->flags */
		1282	pushq 2*8(%rdx) /* pt_regs->cs */
		1283	pushq 1*8(%rdx) /* pt_regs->rip */
		1284	pushq $-1 /* pt_regs->orig_ax */
		1285	pushq %rdi /* pt_regs->di */
		1286	pushq %rsi /* pt_regs->si */
		1287	pushq (%rdx) /* pt_regs->dx */
		1288	pushq %rcx /* pt_regs->cx */
		1289	pushq %rax /* pt_regs->ax */
		1290	pushq %r8 /* pt_regs->r8 */
		1291	pushq %r9 /* pt_regs->r9 */
		1292	pushq %r10 /* pt_regs->r10 */
		1293	pushq %r11 /* pt_regs->r11 */
		1294	pushq %rbx /* pt_regs->rbx */
		1295	pushq %rbp /* pt_regs->rbp */
		1296	pushq %r12 /* pt_regs->r12 */
		1297	pushq %r13 /* pt_regs->r13 */
		1298	pushq %r14 /* pt_regs->r14 */
		1299	pushq %r15 /* pt_regs->r15 */
		1300
		1301	/*
		1302	* At this point we no longer need to worry about stack damage
		1303	* due to nesting -- we're on the normal thread stack and we're
		1304	* done with the NMI stack.
		1305	*/
		1306
		1307	movq %rsp, %rdi
		1308	movq $-1, %rsi
		1309	call do_nmi
		1310
		1311	/*
		1312	* Return back to user mode. We must *not* do the normal exit
		1313	* work, because we don't want to enable interrupts. Fortunately,
		1314	* do_nmi doesn't modify pt_regs.
		1315	*/
		1316	SWAPGS
		1317	jmp restore_c_regs_and_iret
		1318
		1319	.Lnmi_from_kernel:
		1320	/*
		1321	* Here's what our stack frame will look like:
		1322	* +---------------------------------------------------------+
		1323	* | original SS |
		1324	* | original Return RSP |
		1325	* | original RFLAGS |
		1326	* | original CS |
		1327	* | original RIP |
		1328	* +---------------------------------------------------------+
		1329	* | temp storage for rdx |
		1330	* +---------------------------------------------------------+
		1331	* | "NMI executing" variable |
		1332	* +---------------------------------------------------------+
		1333	* | iret SS } Copied from "outermost" frame |
		1334	* | iret Return RSP } on each loop iteration; overwritten |
		1335	* | iret RFLAGS } by a nested NMI to force another |
		1336	* | iret CS } iteration if needed. |
		1337	* | iret RIP } |
		1338	* +---------------------------------------------------------+
		1339	* | outermost SS } initialized in first_nmi; |
		1340	* | outermost Return RSP } will not be changed before |
		1341	* | outermost RFLAGS } NMI processing is done. |
		1342	* | outermost CS } Copied to "iret" frame on each |
		1343	* | outermost RIP } iteration. |
		1344	* +---------------------------------------------------------+
		1345	* | pt_regs |
		1346	* +---------------------------------------------------------+
		1347	*
		1348	* The "original" frame is used by hardware. Before re-enabling
		1349	* NMIs, we need to be done with it, and we need to leave enough
		1350	* space for the asm code here.
		1351	*
		1352	* We return by executing IRET while RSP points to the "iret" frame.
		1353	* That will either return for real or it will loop back into NMI
		1354	* processing.
		1355	*
		1356	* The "outermost" frame is copied to the "iret" frame on each
		1357	* iteration of the loop, so each iteration starts with the "iret"
		1358	* frame pointing to the final return target.
		1359	*/
		1360
1258	/*	1361	/*
1259	* If %cs was not the kernel segment, then the NMI triggered in user	1362	* Determine whether we're a nested NMI.
1260	* space, which means it is definitely not nested.	1363	*
		1364	* If we interrupted kernel code between repeat_nmi and
		1365	* end_repeat_nmi, then we are a nested NMI. We must not
		1366	* modify the "iret" frame because it's being written by
		1367	* the outer NMI. That's okay; the outer NMI handler is
		1368	* about to call do_nmi anyway, so we can just
		1369	* resume the outer NMI.
1261	*/	1370	*/
1262	cmpl $__KERNEL_CS, 16(%rsp)	1371
1263	jne first_nmi	1372	movq $repeat_nmi, %rdx
		1373	cmpq 8(%rsp), %rdx
		1374	ja 1f
		1375	movq $end_repeat_nmi, %rdx
		1376	cmpq 8(%rsp), %rdx
		1377	ja nested_nmi_out
		1378	1:
1264		1379
1265	/*	1380	/*
1266	* Check the special variable on the stack to see if NMIs are	1381	* Now check "NMI executing". If it's set, then we're nested.
1267	* executing.	1382	* This will not detect if we interrupted an outer NMI just
		1383	* before IRET.
1268	*/	1384	*/
1269	cmpl $1, -8(%rsp)	1385	cmpl $1, -8(%rsp)
1270	je nested_nmi	1386	je nested_nmi
1271		1387
1272	/*	1388	/*
1273	* Now test if the previous stack was an NMI stack.	1389	* Now test if the previous stack was an NMI stack. This covers
1274	* We need the double check. We check the NMI stack to satisfy the	1390	* the case where we interrupt an outer NMI after it clears
1275	* race when the first NMI clears the variable before returning.	1391	* "NMI executing" but before IRET. We need to be careful, though:
1276	* We check the variable because the first NMI could be in a	1392	* there is one case in which RSP could point to the NMI stack
1277	* breakpoint routine using a breakpoint stack.	1393	* despite there being no NMI active: naughty userspace controls
		1394	* RSP at the very beginning of the SYSCALL targets. We can
		1395	* pull a fast one on naughty userspace, though: we program
		1396	* SYSCALL to mask DF, so userspace cannot cause DF to be set
		1397	* if it controls the kernel's RSP. We set DF before we clear
		1398	* "NMI executing".
1278	*/	1399	*/
1279	lea 6*8(%rsp), %rdx	1400	lea 6*8(%rsp), %rdx
1280	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */	1401	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1286,25 +1407,20 @@ ENTRY(nmi)
1286	cmpq %rdx, 4*8(%rsp)	1407	cmpq %rdx, 4*8(%rsp)
1287	/* If it is below the NMI stack, it is a normal NMI */	1408	/* If it is below the NMI stack, it is a normal NMI */
1288	jb first_nmi	1409	jb first_nmi
1289	/* Ah, it is within the NMI stack, treat it as nested */	1410
		1411	/* Ah, it is within the NMI stack. */
		1412
		1413	testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
		1414	jz first_nmi /* RSP was user controlled. */
		1415
		1416	/* This is a nested NMI. */
1290		1417
1291	nested_nmi:	1418	nested_nmi:
1292	/*	1419	/*
1293	* Do nothing if we interrupted the fixup in repeat_nmi.	1420	* Modify the "iret" frame to point to repeat_nmi, forcing another
1294	* It's about to repeat the NMI handler, so we are fine	1421	* iteration of NMI handling.
1295	* with ignoring this one.
1296	*/	1422	*/
1297	movq $repeat_nmi, %rdx	1423	subq $8, %rsp
1298	cmpq 8(%rsp), %rdx
1299	ja 1f
1300	movq $end_repeat_nmi, %rdx
1301	cmpq 8(%rsp), %rdx
1302	ja nested_nmi_out
1303
1304	1:
1305	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
1306	leaq -1*8(%rsp), %rdx
1307	movq %rdx, %rsp
1308	leaq -10*8(%rsp), %rdx	1424	leaq -10*8(%rsp), %rdx
1309	pushq $__KERNEL_DS	1425	pushq $__KERNEL_DS
1310	pushq %rdx	1426	pushq %rdx
@@ -1318,61 +1434,42 @@ nested_nmi:
1318	nested_nmi_out:	1434	nested_nmi_out:
1319	popq %rdx	1435	popq %rdx
1320		1436
1321	/* No need to check faults here */	1437	/* We are returning to kernel mode, so this cannot result in a fault. */
1322	INTERRUPT_RETURN	1438	INTERRUPT_RETURN
1323		1439
1324	first_nmi:	1440	first_nmi:
1325	/*	1441	/* Restore rdx. */
1326	* Because nested NMIs will use the pushed location that we
1327	* stored in rdx, we must keep that space available.
1328	* Here's what our stack frame will look like:
1329	* +-------------------------+
1330	* | original SS |
1331	* | original Return RSP |
1332	* | original RFLAGS |
1333	* | original CS |
1334	* | original RIP |
1335	* +-------------------------+
1336	* | temp storage for rdx |
1337	* +-------------------------+
1338	* | NMI executing variable |
1339	* +-------------------------+
1340	* | copied SS |
1341	* | copied Return RSP |
1342	* | copied RFLAGS |
1343	* | copied CS |
1344	* | copied RIP |
1345	* +-------------------------+
1346	* | Saved SS |
1347	* | Saved Return RSP |
1348	* | Saved RFLAGS |
1349	* | Saved CS |
1350	* | Saved RIP |
1351	* +-------------------------+
1352	* | pt_regs |
1353	* +-------------------------+
1354	*
1355	* The saved stack frame is used to fix up the copied stack frame
1356	* that a nested NMI may change to make the interrupted NMI iret jump
1357	* to the repeat_nmi. The original stack frame and the temp storage
1358	* is also used by nested NMIs and can not be trusted on exit.
1359	*/
1360	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
1361	movq (%rsp), %rdx	1442	movq (%rsp), %rdx
1362		1443
1363	/* Set the NMI executing variable on the stack. */	1444	/* Make room for "NMI executing". */
1364	pushq $1	1445	pushq $0
1365		1446
1366	/* Leave room for the "copied" frame */	1447	/* Leave room for the "iret" frame */
1367	subq $(5*8), %rsp	1448	subq $(5*8), %rsp
1368		1449
1369	/* Copy the stack frame to the Saved frame */	1450	/* Copy the "original" frame to the "outermost" frame */
1370	.rept 5	1451	.rept 5
1371	pushq 11*8(%rsp)	1452	pushq 11*8(%rsp)
1372	.endr	1453	.endr
1373		1454
1374	/* Everything up to here is safe from nested NMIs */	1455	/* Everything up to here is safe from nested NMIs */
1375		1456
		1457	#ifdef CONFIG_DEBUG_ENTRY
		1458	/*
		1459	* For ease of testing, unmask NMIs right away. Disabled by
		1460	* default because IRET is very expensive.
		1461	*/
		1462	pushq $0 /* SS */
		1463	pushq %rsp /* RSP (minus 8 because of the previous push) */
		1464	addq $8, (%rsp) /* Fix up RSP */
		1465	pushfq /* RFLAGS */
		1466	pushq $__KERNEL_CS /* CS */
		1467	pushq $1f /* RIP */
		1468	INTERRUPT_RETURN /* continues at repeat_nmi below */
		1469	1:
		1470	#endif
		1471
		1472	repeat_nmi:
1376	/*	1473	/*
1377	* If there was a nested NMI, the first NMI's iret will return	1474	* If there was a nested NMI, the first NMI's iret will return
1378	* here. But NMIs are still enabled and we can take another	1475	* here. But NMIs are still enabled and we can take another
@@ -1381,16 +1478,20 @@ first_nmi:
1381	* it will just return, as we are about to repeat an NMI anyway.	1478	* it will just return, as we are about to repeat an NMI anyway.
1382	* This makes it safe to copy to the stack frame that a nested	1479	* This makes it safe to copy to the stack frame that a nested
1383	* NMI will update.	1480	* NMI will update.
		1481	*
		1482	* RSP is pointing to "outermost RIP". gsbase is unknown, but, if
		1483	* we're repeating an NMI, gsbase has the same value that it had on
		1484	* the first iteration. paranoid_entry will load the kernel
		1485	* gsbase if needed before we call do_nmi. "NMI executing"
		1486	* is zero.
1384	*/	1487	*/
1385	repeat_nmi:	1488	movq $1, 10*8(%rsp) /* Set "NMI executing". */
		1489
1386	/*	1490	/*
1387	* Update the stack variable to say we are still in NMI (the update	1491	* Copy the "outermost" frame to the "iret" frame. NMIs that nest
1388	* is benign for the non-repeat case, where 1 was pushed just above	1492	* here must not modify the "iret" frame while we're writing to
1389	* to this very stack slot).	1493	* it or it will end up containing garbage.
1390	*/	1494	*/
1391	movq $1, 10*8(%rsp)
1392
1393	/* Make another copy, this one may be modified by nested NMIs */
1394	addq $(10*8), %rsp	1495	addq $(10*8), %rsp
1395	.rept 5	1496	.rept 5
1396	pushq -6*8(%rsp)	1497	pushq -6*8(%rsp)
@@ -1399,9 +1500,9 @@ repeat_nmi:
1399	end_repeat_nmi:	1500	end_repeat_nmi:
1400		1501
1401	/*	1502	/*
1402	* Everything below this point can be preempted by a nested	1503	* Everything below this point can be preempted by a nested NMI.
1403	* NMI if the first NMI took an exception and reset our iret stack	1504	* If this happens, then the inner NMI will change the "iret"
1404	* so that we repeat another NMI.	1505	* frame to point back to repeat_nmi.
1405	*/	1506	*/
1406	pushq $-1 /* ORIG_RAX: no syscall to restart */	1507	pushq $-1 /* ORIG_RAX: no syscall to restart */
1407	ALLOC_PT_GPREGS_ON_STACK	1508	ALLOC_PT_GPREGS_ON_STACK
@@ -1415,28 +1516,11 @@ end_repeat_nmi:
1415	*/	1516	*/
1416	call paranoid_entry	1517	call paranoid_entry
1417		1518
1418	/*
1419	* Save off the CR2 register. If we take a page fault in the NMI then
1420	* it could corrupt the CR2 value. If the NMI preempts a page fault
1421	* handler before it was able to read the CR2 register, and then the
1422	* NMI itself takes a page fault, the page fault that was preempted
1423	* will read the information from the NMI page fault and not the
1424	* origin fault. Save it off and restore it if it changes.
1425	* Use the r12 callee-saved register.
1426	*/
1427	movq %cr2, %r12
1428
1429	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */	1519	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1430	movq %rsp, %rdi	1520	movq %rsp, %rdi
1431	movq $-1, %rsi	1521	movq $-1, %rsi
1432	call do_nmi	1522	call do_nmi
1433		1523
1434	/* Did the NMI take a page fault? Restore cr2 if it did */
1435	movq %cr2, %rcx
1436	cmpq %rcx, %r12
1437	je 1f
1438	movq %r12, %cr2
1439	1:
1440	testl %ebx, %ebx /* swapgs needed? */	1524	testl %ebx, %ebx /* swapgs needed? */
1441	jnz nmi_restore	1525	jnz nmi_restore
1442	nmi_swapgs:	1526	nmi_swapgs:
@@ -1444,11 +1528,26 @@ nmi_swapgs:
1444	nmi_restore:	1528	nmi_restore:
1445	RESTORE_EXTRA_REGS	1529	RESTORE_EXTRA_REGS
1446	RESTORE_C_REGS	1530	RESTORE_C_REGS
1447	/* Pop the extra iret frame at once */	1531
		1532	/* Point RSP at the "iret" frame. */
1448	REMOVE_PT_GPREGS_FROM_STACK 6*8	1533	REMOVE_PT_GPREGS_FROM_STACK 6*8
1449		1534
1450	/* Clear the NMI executing stack variable */	1535	/*
1451	movq $0, 5*8(%rsp)	1536	* Clear "NMI executing". Set DF first so that we can easily
		1537	* distinguish the remaining code between here and IRET from
		1538	* the SYSCALL entry and exit paths. On a native kernel, we
		1539	* could just inspect RIP, but, on paravirt kernels,
		1540	* INTERRUPT_RETURN can translate into a jump into a
		1541	* hypercall page.
		1542	*/
		1543	std
		1544	movq $0, 58(%rsp) / clear "NMI executing" */
		1545
		1546	/*
		1547	* INTERRUPT_RETURN reads the "iret" frame and exits the NMI
		1548	* stack in a single instruction. We are returning to kernel
		1549	* mode, so this cannot result in a fault.
		1550	*/
1452	INTERRUPT_RETURN	1551	INTERRUPT_RETURN
1453	END(nmi)	1552	END(nmi)
1454		1553