aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/entry/entry_64.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/entry/entry_64.S')
-rw-r--r-- arch/x86/entry/entry_64.S 299
1 files changed, 199 insertions, 100 deletions
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3bb2c4302df1..8cb3e438f21e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1237,11 +1237,12 @@ ENTRY(nmi)
1237 * If the variable is not set and the stack is not the NMI 1237 * If the variable is not set and the stack is not the NMI
1238 * stack then: 1238 * stack then:
1239 * o Set the special variable on the stack 1239 * o Set the special variable on the stack
1240 * o Copy the interrupt frame into a "saved" location on the stack 1240 * o Copy the interrupt frame into an "outermost" location on the
1241 * o Copy the interrupt frame into a "copy" location on the stack 1241 * stack
1242 * o Copy the interrupt frame into an "iret" location on the stack
1242 * o Continue processing the NMI 1243 * o Continue processing the NMI
1243 * If the variable is set or the previous stack is the NMI stack: 1244 * If the variable is set or the previous stack is the NMI stack:
1244 * o Modify the "copy" location to jump to the repeate_nmi 1245 * o Modify the "iret" location to jump to the repeat_nmi
1245 * o return back to the first NMI 1246 * o return back to the first NMI
1246 * 1247 *
1247 * Now on exit of the first NMI, we first clear the stack variable 1248 * Now on exit of the first NMI, we first clear the stack variable
@@ -1250,31 +1251,151 @@ ENTRY(nmi)
1250 * a nested NMI that updated the copy interrupt stack frame, a 1251 * a nested NMI that updated the copy interrupt stack frame, a
1251 * jump will be made to the repeat_nmi code that will handle the second 1252 * jump will be made to the repeat_nmi code that will handle the second
1252 * NMI. 1253 * NMI.
1254 *
1255 * However, espfix prevents us from directly returning to userspace
1256 * with a single IRET instruction. Similarly, IRET to user mode
1257 * can fault. We therefore handle NMIs from user space like
1258 * other IST entries.
1253 */ 1259 */
1254 1260
1255 /* Use %rdx as our temp variable throughout */ 1261 /* Use %rdx as our temp variable throughout */
1256 pushq %rdx 1262 pushq %rdx
1257 1263
1264 testb $3, CS-RIP+8(%rsp)
1265 jz .Lnmi_from_kernel
1266
1267 /*
1268 * NMI from user mode. We need to run on the thread stack, but we
1269 * can't go through the normal entry paths: NMIs are masked, and
1270 * we don't want to enable interrupts, because then we'll end
1271 * up in an awkward situation in which IRQs are on but NMIs
1272 * are off.
1273 */
1274
1275 SWAPGS
1276 cld
1277 movq %rsp, %rdx
1278 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
1279 pushq 5*8(%rdx) /* pt_regs->ss */
1280 pushq 4*8(%rdx) /* pt_regs->rsp */
1281 pushq 3*8(%rdx) /* pt_regs->flags */
1282 pushq 2*8(%rdx) /* pt_regs->cs */
1283 pushq 1*8(%rdx) /* pt_regs->rip */
1284 pushq $-1 /* pt_regs->orig_ax */
1285 pushq %rdi /* pt_regs->di */
1286 pushq %rsi /* pt_regs->si */
1287 pushq (%rdx) /* pt_regs->dx */
1288 pushq %rcx /* pt_regs->cx */
1289 pushq %rax /* pt_regs->ax */
1290 pushq %r8 /* pt_regs->r8 */
1291 pushq %r9 /* pt_regs->r9 */
1292 pushq %r10 /* pt_regs->r10 */
1293 pushq %r11 /* pt_regs->r11 */
1294 pushq %rbx /* pt_regs->rbx */
1295 pushq %rbp /* pt_regs->rbp */
1296 pushq %r12 /* pt_regs->r12 */
1297 pushq %r13 /* pt_regs->r13 */
1298 pushq %r14 /* pt_regs->r14 */
1299 pushq %r15 /* pt_regs->r15 */
1300
1301 /*
1302 * At this point we no longer need to worry about stack damage
1303 * due to nesting -- we're on the normal thread stack and we're
1304 * done with the NMI stack.
1305 */
1306
1307 movq %rsp, %rdi
1308 movq $-1, %rsi
1309 call do_nmi
1310
1311 /*
1312 * Return back to user mode. We must *not* do the normal exit
1313 * work, because we don't want to enable interrupts. Fortunately,
1314 * do_nmi doesn't modify pt_regs.
1315 */
1316 SWAPGS
1317 jmp restore_c_regs_and_iret
1318
1319 .Lnmi_from_kernel:
1320 /*
1321 * Here's what our stack frame will look like:
1322 * +---------------------------------------------------------+
1323 * | original SS |
1324 * | original Return RSP |
1325 * | original RFLAGS |
1326 * | original CS |
1327 * | original RIP |
1328 * +---------------------------------------------------------+
1329 * | temp storage for rdx |
1330 * +---------------------------------------------------------+
1331 * | "NMI executing" variable |
1332 * +---------------------------------------------------------+
1333 * | iret SS } Copied from "outermost" frame |
1334 * | iret Return RSP } on each loop iteration; overwritten |
1335 * | iret RFLAGS } by a nested NMI to force another |
1336 * | iret CS } iteration if needed. |
1337 * | iret RIP } |
1338 * +---------------------------------------------------------+
1339 * | outermost SS } initialized in first_nmi; |
1340 * | outermost Return RSP } will not be changed before |
1341 * | outermost RFLAGS } NMI processing is done. |
1342 * | outermost CS } Copied to "iret" frame on each |
1343 * | outermost RIP } iteration. |
1344 * +---------------------------------------------------------+
1345 * | pt_regs |
1346 * +---------------------------------------------------------+
1347 *
1348 * The "original" frame is used by hardware. Before re-enabling
1349 * NMIs, we need to be done with it, and we need to leave enough
1350 * space for the asm code here.
1351 *
1352 * We return by executing IRET while RSP points to the "iret" frame.
1353 * That will either return for real or it will loop back into NMI
1354 * processing.
1355 *
1356 * The "outermost" frame is copied to the "iret" frame on each
1357 * iteration of the loop, so each iteration starts with the "iret"
1358 * frame pointing to the final return target.
1359 */
1360
1258 /* 1361 /*
1259 * If %cs was not the kernel segment, then the NMI triggered in user 1362 * Determine whether we're a nested NMI.
1260 * space, which means it is definitely not nested. 1363 *
1364 * If we interrupted kernel code between repeat_nmi and
1365 * end_repeat_nmi, then we are a nested NMI. We must not
1366 * modify the "iret" frame because it's being written by
1367 * the outer NMI. That's okay; the outer NMI handler is
1368 * about to call do_nmi anyway, so we can just
1369 * resume the outer NMI.
1261 */ 1370 */
1262 cmpl $__KERNEL_CS, 16(%rsp) 1371
1263 jne first_nmi 1372 movq $repeat_nmi, %rdx
1373 cmpq 8(%rsp), %rdx
1374 ja 1f
1375 movq $end_repeat_nmi, %rdx
1376 cmpq 8(%rsp), %rdx
1377 ja nested_nmi_out
1378 1:
1264 1379
1265 /* 1380 /*
1266 * Check the special variable on the stack to see if NMIs are 1381 * Now check "NMI executing". If it's set, then we're nested.
1267 * executing. 1382 * This will not detect if we interrupted an outer NMI just
1383 * before IRET.
1268 */ 1384 */
1269 cmpl $1, -8(%rsp) 1385 cmpl $1, -8(%rsp)
1270 je nested_nmi 1386 je nested_nmi
1271 1387
1272 /* 1388 /*
1273 * Now test if the previous stack was an NMI stack. 1389 * Now test if the previous stack was an NMI stack. This covers
1274 * We need the double check. We check the NMI stack to satisfy the 1390 * the case where we interrupt an outer NMI after it clears
1275 * race when the first NMI clears the variable before returning. 1391 * "NMI executing" but before IRET. We need to be careful, though:
1276 * We check the variable because the first NMI could be in a 1392 * there is one case in which RSP could point to the NMI stack
1277 * breakpoint routine using a breakpoint stack. 1393 * despite there being no NMI active: naughty userspace controls
1394 * RSP at the very beginning of the SYSCALL targets. We can
1395 * pull a fast one on naughty userspace, though: we program
1396 * SYSCALL to mask DF, so userspace cannot cause DF to be set
1397 * if it controls the kernel's RSP. We set DF before we clear
1398 * "NMI executing".
1278 */ 1399 */
1279 lea 6*8(%rsp), %rdx 1400 lea 6*8(%rsp), %rdx
1280 /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ 1401 /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1286,25 +1407,20 @@ ENTRY(nmi)
1286 cmpq %rdx, 4*8(%rsp) 1407 cmpq %rdx, 4*8(%rsp)
1287 /* If it is below the NMI stack, it is a normal NMI */ 1408 /* If it is below the NMI stack, it is a normal NMI */
1288 jb first_nmi 1409 jb first_nmi
1289 /* Ah, it is within the NMI stack, treat it as nested */ 1410
1411 /* Ah, it is within the NMI stack. */
1412
1413 testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
1414 jz first_nmi /* RSP was user controlled. */
1415
1416 /* This is a nested NMI. */
1290 1417
1291 nested_nmi: 1418 nested_nmi:
1292 /* 1419 /*
1293 * Do nothing if we interrupted the fixup in repeat_nmi. 1420 * Modify the "iret" frame to point to repeat_nmi, forcing another
1294 * It's about to repeat the NMI handler, so we are fine 1421 * iteration of NMI handling.
1295 * with ignoring this one.
1296 */ 1422 */
1297 movq $repeat_nmi, %rdx 1423 subq $8, %rsp
1298 cmpq 8(%rsp), %rdx
1299 ja 1f
1300 movq $end_repeat_nmi, %rdx
1301 cmpq 8(%rsp), %rdx
1302 ja nested_nmi_out
1303
1304 1:
1305 /* Set up the interrupted NMIs stack to jump to repeat_nmi */
1306 leaq -1*8(%rsp), %rdx
1307 movq %rdx, %rsp
1308 leaq -10*8(%rsp), %rdx 1424 leaq -10*8(%rsp), %rdx
1309 pushq $__KERNEL_DS 1425 pushq $__KERNEL_DS
1310 pushq %rdx 1426 pushq %rdx
@@ -1318,61 +1434,42 @@ nested_nmi:
1318 nested_nmi_out: 1434 nested_nmi_out:
1319 popq %rdx 1435 popq %rdx
1320 1436
1321 /* No need to check faults here */ 1437 /* We are returning to kernel mode, so this cannot result in a fault. */
1322 INTERRUPT_RETURN 1438 INTERRUPT_RETURN
1323 1439
1324 first_nmi: 1440 first_nmi:
1325 /* 1441 /* Restore rdx. */
1326 * Because nested NMIs will use the pushed location that we
1327 * stored in rdx, we must keep that space available.
1328 * Here's what our stack frame will look like:
1329 * +-------------------------+
1330 * | original SS |
1331 * | original Return RSP |
1332 * | original RFLAGS |
1333 * | original CS |
1334 * | original RIP |
1335 * +-------------------------+
1336 * | temp storage for rdx |
1337 * +-------------------------+
1338 * | NMI executing variable |
1339 * +-------------------------+
1340 * | copied SS |
1341 * | copied Return RSP |
1342 * | copied RFLAGS |
1343 * | copied CS |
1344 * | copied RIP |
1345 * +-------------------------+
1346 * | Saved SS |
1347 * | Saved Return RSP |
1348 * | Saved RFLAGS |
1349 * | Saved CS |
1350 * | Saved RIP |
1351 * +-------------------------+
1352 * | pt_regs |
1353 * +-------------------------+
1354 *
1355 * The saved stack frame is used to fix up the copied stack frame
1356 * that a nested NMI may change to make the interrupted NMI iret jump
1357 * to the repeat_nmi. The original stack frame and the temp storage
1358 * is also used by nested NMIs and can not be trusted on exit.
1359 */
1360 /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
1361 movq (%rsp), %rdx 1442 movq (%rsp), %rdx
1362 1443
1363 /* Set the NMI executing variable on the stack. */ 1444 /* Make room for "NMI executing". */
1364 pushq $1 1445 pushq $0
1365 1446
1366 /* Leave room for the "copied" frame */ 1447 /* Leave room for the "iret" frame */
1367 subq $(5*8), %rsp 1448 subq $(5*8), %rsp
1368 1449
1369 /* Copy the stack frame to the Saved frame */ 1450 /* Copy the "original" frame to the "outermost" frame */
1370 .rept 5 1451 .rept 5
1371 pushq 11*8(%rsp) 1452 pushq 11*8(%rsp)
1372 .endr 1453 .endr
1373 1454
1374 /* Everything up to here is safe from nested NMIs */ 1455 /* Everything up to here is safe from nested NMIs */
1375 1456
1457 #ifdef CONFIG_DEBUG_ENTRY
1458 /*
1459 * For ease of testing, unmask NMIs right away. Disabled by
1460 * default because IRET is very expensive.
1461 */
1462 pushq $0 /* SS */
1463 pushq %rsp /* RSP (minus 8 because of the previous push) */
1464 addq $8, (%rsp) /* Fix up RSP */
1465 pushfq /* RFLAGS */
1466 pushq $__KERNEL_CS /* CS */
1467 pushq $1f /* RIP */
1468 INTERRUPT_RETURN /* continues at repeat_nmi below */
1469 1:
1470 #endif
1471
1472 repeat_nmi:
1376 /* 1473 /*
1377 * If there was a nested NMI, the first NMI's iret will return 1474 * If there was a nested NMI, the first NMI's iret will return
1378 * here. But NMIs are still enabled and we can take another 1475 * here. But NMIs are still enabled and we can take another
@@ -1381,16 +1478,20 @@ first_nmi:
1381 * it will just return, as we are about to repeat an NMI anyway. 1478 * it will just return, as we are about to repeat an NMI anyway.
1382 * This makes it safe to copy to the stack frame that a nested 1479 * This makes it safe to copy to the stack frame that a nested
1383 * NMI will update. 1480 * NMI will update.
1481 *
1482 * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
1483 * we're repeating an NMI, gsbase has the same value that it had on
1484 * the first iteration. paranoid_entry will load the kernel
1485 * gsbase if needed before we call do_nmi. "NMI executing"
1486 * is zero.
1384 */ 1487 */
1385 repeat_nmi: 1488 movq $1, 10*8(%rsp) /* Set "NMI executing". */
1489
1386 /* 1490 /*
1387 * Update the stack variable to say we are still in NMI (the update 1491 * Copy the "outermost" frame to the "iret" frame. NMIs that nest
1388 * is benign for the non-repeat case, where 1 was pushed just above 1492 * here must not modify the "iret" frame while we're writing to
1389 * to this very stack slot). 1493 * it or it will end up containing garbage.
1390 */ 1494 */
1391 movq $1, 10*8(%rsp)
1392
1393 /* Make another copy, this one may be modified by nested NMIs */
1394 addq $(10*8), %rsp 1495 addq $(10*8), %rsp
1395 .rept 5 1496 .rept 5
1396 pushq -6*8(%rsp) 1497 pushq -6*8(%rsp)
@@ -1399,9 +1500,9 @@ repeat_nmi:
1399 end_repeat_nmi: 1500 end_repeat_nmi:
1400 1501
1401 /* 1502 /*
1402 * Everything below this point can be preempted by a nested 1503 * Everything below this point can be preempted by a nested NMI.
1403 * NMI if the first NMI took an exception and reset our iret stack 1504 * If this happens, then the inner NMI will change the "iret"
1404 * so that we repeat another NMI. 1505 * frame to point back to repeat_nmi.
1405 */ 1506 */
1406 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1507 pushq $-1 /* ORIG_RAX: no syscall to restart */
1407 ALLOC_PT_GPREGS_ON_STACK 1508 ALLOC_PT_GPREGS_ON_STACK
@@ -1415,28 +1516,11 @@ end_repeat_nmi:
1415 */ 1516 */
1416 call paranoid_entry 1517 call paranoid_entry
1417 1518
1418 /*
1419 * Save off the CR2 register. If we take a page fault in the NMI then
1420 * it could corrupt the CR2 value. If the NMI preempts a page fault
1421 * handler before it was able to read the CR2 register, and then the
1422 * NMI itself takes a page fault, the page fault that was preempted
1423 * will read the information from the NMI page fault and not the
1424 * origin fault. Save it off and restore it if it changes.
1425 * Use the r12 callee-saved register.
1426 */
1427 movq %cr2, %r12
1428
1429 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1519 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1430 movq %rsp, %rdi 1520 movq %rsp, %rdi
1431 movq $-1, %rsi 1521 movq $-1, %rsi
1432 call do_nmi 1522 call do_nmi
1433 1523
1434 /* Did the NMI take a page fault? Restore cr2 if it did */
1435 movq %cr2, %rcx
1436 cmpq %rcx, %r12
1437 je 1f
1438 movq %r12, %cr2
1439 1:
1440 testl %ebx, %ebx /* swapgs needed? */ 1524 testl %ebx, %ebx /* swapgs needed? */
1441 jnz nmi_restore 1525 jnz nmi_restore
1442 nmi_swapgs: 1526 nmi_swapgs:
@@ -1444,11 +1528,26 @@ nmi_swapgs:
1444 nmi_restore: 1528 nmi_restore:
1445 RESTORE_EXTRA_REGS 1529 RESTORE_EXTRA_REGS
1446 RESTORE_C_REGS 1530 RESTORE_C_REGS
1447 /* Pop the extra iret frame at once */ 1531
1532 /* Point RSP at the "iret" frame. */
1448 REMOVE_PT_GPREGS_FROM_STACK 6*8 1533 REMOVE_PT_GPREGS_FROM_STACK 6*8
1449 1534
1450 /* Clear the NMI executing stack variable */ 1535 /*
1451 movq $0, 5*8(%rsp) 1536 * Clear "NMI executing". Set DF first so that we can easily
1537 * distinguish the remaining code between here and IRET from
1538 * the SYSCALL entry and exit paths. On a native kernel, we
1539 * could just inspect RIP, but, on paravirt kernels,
1540 * INTERRUPT_RETURN can translate into a jump into a
1541 * hypercall page.
1542 */
1543 std
1544 movq $0, 5*8(%rsp) /* clear "NMI executing" */
1545
1546 /*
1547 * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
1548 * stack in a single instruction. We are returning to kernel
1549 * mode, so this cannot result in a fault.
1550 */
1452 INTERRUPT_RETURN 1551 INTERRUPT_RETURN
1453 END(nmi) 1552 END(nmi)
1454 1553