author	Linus Torvalds <torvalds@linux-foundation.org>	2015-07-18 13:49:57 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-07-18 13:49:57 -0400
commit	0e1dbccd8f60c1c83d86d693bb87363b7de2319c (patch)
tree	a77e5bb1ab755edef9db644672fe7cd3a0dc0f73
parent	dae57fb64ef636a225d3e6c0ddbbc8e32674dd81 (diff)
parent	5aaeb5c01c5b6c0be7b7aadbf3ace9f3a4458c3d (diff)
Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fixes from Ingo Molnar:
 "Two families of fixes:

   - Fix an FPU context related boot crash on newer x86 hardware with
     larger context sizes than what most people test.  To fix this
     without ugly kludges or extensive reverts we had to touch the core
     task allocator, to allow x86 to determine the task size
     dynamically, at boot time.

     I've tested it on a number of x86 platforms, and I cross-built it
     to a handful of architectures:

                                      (warns)               (warns)
       testing   x86-64:  -git: pass (    0),  -tip: pass (    0)
       testing   x86-32:  -git: pass (    0),  -tip: pass (    0)
       testing      arm:  -git: pass ( 1359),  -tip: pass ( 1359)
       testing     cris:  -git: pass ( 1031),  -tip: pass ( 1031)
       testing     m32r:  -git: pass ( 1135),  -tip: pass ( 1135)
       testing     m68k:  -git: pass ( 1471),  -tip: pass ( 1471)
       testing     mips:  -git: pass ( 1162),  -tip: pass ( 1162)
       testing  mn10300:  -git: pass ( 1058),  -tip: pass ( 1058)
       testing   parisc:  -git: pass ( 1846),  -tip: pass ( 1846)
       testing    sparc:  -git: pass ( 1185),  -tip: pass ( 1185)

     ... so I hope the cross-arch impact is 'none', as intended.

     (by Dave Hansen)

   - Fix various NMI handling related bugs unearthed by the big asm code
     rewrite and generally make the NMI code more robust and more
     maintainable while at it.  These changes are a bit late in the
     cycle, I hope they are still acceptable.

     (by Andy Lutomirski)"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/fpu, sched: Introduce CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT and use it on x86
  x86/fpu, sched: Dynamically allocate 'struct fpu'
  x86/entry/64, x86/nmi/64: Add CONFIG_DEBUG_ENTRY NMI testing code
  x86/nmi/64: Make the "NMI executing" variable more consistent
  x86/nmi/64: Minor asm simplification
  x86/nmi/64: Use DF to avoid userspace RSP confusing nested NMI detection
  x86/nmi/64: Reorder nested NMI checks
  x86/nmi/64: Improve nested NMI comments
  x86/nmi/64: Switch stacks on userspace NMI entry
  x86/nmi/64: Remove asm code that saves CR2
  x86/nmi: Enable nested do_nmi() handling for 64-bit kernels
-rw-r--r--	arch/Kconfig	4
-rw-r--r--	arch/x86/Kconfig	1
-rw-r--r--	arch/x86/Kconfig.debug	12
-rw-r--r--	arch/x86/entry/entry_64.S	299
-rw-r--r--	arch/x86/include/asm/fpu/types.h	72
-rw-r--r--	arch/x86/include/asm/processor.h	10
-rw-r--r--	arch/x86/kernel/fpu/init.c	40
-rw-r--r--	arch/x86/kernel/nmi.c	123
-rw-r--r--	arch/x86/kernel/process.c	2
-rw-r--r--	fs/proc/kcore.c	4
-rw-r--r--	include/linux/sched.h	16
-rw-r--r--	kernel/fork.c	7
12 files changed, 376 insertions(+), 214 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index bec6666a3cc4..8a8ea7110de8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -221,6 +221,10 @@ config ARCH_TASK_STRUCT_ALLOCATOR
221config ARCH_THREAD_INFO_ALLOCATOR 221config ARCH_THREAD_INFO_ALLOCATOR
222 bool 222 bool
223 223
224# Select if arch wants to size task_struct dynamically via arch_task_struct_size:
225config ARCH_WANTS_DYNAMIC_TASK_STRUCT
226 bool
227
224config HAVE_REGS_AND_STACK_ACCESS_API 228config HAVE_REGS_AND_STACK_ACCESS_API
225 bool 229 bool
226 help 230 help
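
The comment above states the contract for the new option. As a minimal sketch of how an architecture opts in (the helper names below are invented for illustration; x86's real version is fpu__init_task_struct_size(), shown in the fpu/init.c hunk further down): the arch selects ARCH_WANTS_DYNAMIC_TASK_STRUCT in its Kconfig and then writes arch_task_struct_size during early boot, before the task_struct slab cache is created.

/*
 * Sketch of an architecture opting in to dynamic task_struct sizing.
 * Assumes "select ARCH_WANTS_DYNAMIC_TASK_STRUCT" in the arch Kconfig,
 * which makes kernel/fork.c define 'int arch_task_struct_size'.
 */
#include <linux/init.h>
#include <linux/sched.h>

extern int arch_probe_extra_state_size(void);	/* hypothetical helper */

void __init arch_example_task_size_init(void)
{
	int size = sizeof(struct task_struct);

	/* Grow the static layout by whatever boot-time probing found. */
	size += arch_probe_extra_state_size();

	/* Must happen before fork_init() creates the task_struct slab. */
	arch_task_struct_size = size;
}
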
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3dbb7e7909ca..b3a1a5d77d92 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
41 select ARCH_USE_CMPXCHG_LOCKREF if X86_64 41 select ARCH_USE_CMPXCHG_LOCKREF if X86_64
42 select ARCH_USE_QUEUED_RWLOCKS 42 select ARCH_USE_QUEUED_RWLOCKS
43 select ARCH_USE_QUEUED_SPINLOCKS 43 select ARCH_USE_QUEUED_SPINLOCKS
44 select ARCH_WANTS_DYNAMIC_TASK_STRUCT
44 select ARCH_WANT_FRAME_POINTERS 45 select ARCH_WANT_FRAME_POINTERS
45 select ARCH_WANT_IPC_PARSE_VERSION if X86_32 46 select ARCH_WANT_IPC_PARSE_VERSION if X86_32
46 select ARCH_WANT_OPTIONAL_GPIOLIB 47 select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index a15893d17c55..d8c0d3266173 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -297,6 +297,18 @@ config OPTIMIZE_INLINING
297 297
298 If unsure, say N. 298 If unsure, say N.
299 299
300config DEBUG_ENTRY
301 bool "Debug low-level entry code"
302 depends on DEBUG_KERNEL
303 ---help---
304 This option enables sanity checks in x86's low-level entry code.
305 Some of these sanity checks may slow down kernel entries and
306 exits or otherwise impact performance.
307
308 This is currently used to help test NMI code.
309
310 If unsure, say N.
311
300config DEBUG_NMI_SELFTEST 312config DEBUG_NMI_SELFTEST
301 bool "NMI Selftest" 313 bool "NMI Selftest"
302 depends on DEBUG_KERNEL && X86_LOCAL_APIC 314 depends on DEBUG_KERNEL && X86_LOCAL_APIC
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3bb2c4302df1..8cb3e438f21e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1237,11 +1237,12 @@ ENTRY(nmi)
1237 * If the variable is not set and the stack is not the NMI 1237 * If the variable is not set and the stack is not the NMI
1238 * stack then: 1238 * stack then:
1239 * o Set the special variable on the stack 1239 * o Set the special variable on the stack
1240 * o Copy the interrupt frame into a "saved" location on the stack 1240 * o Copy the interrupt frame into an "outermost" location on the
1241 * o Copy the interrupt frame into a "copy" location on the stack 1241 * stack
1242 * o Copy the interrupt frame into an "iret" location on the stack
1242 * o Continue processing the NMI 1243 * o Continue processing the NMI
1243 * If the variable is set or the previous stack is the NMI stack: 1244 * If the variable is set or the previous stack is the NMI stack:
1244 * o Modify the "copy" location to jump to the repeate_nmi 1245 * o Modify the "iret" location to jump to the repeat_nmi
1245 * o return back to the first NMI 1246 * o return back to the first NMI
1246 * 1247 *
1247 * Now on exit of the first NMI, we first clear the stack variable 1248 * Now on exit of the first NMI, we first clear the stack variable
@@ -1250,31 +1251,151 @@ ENTRY(nmi)
1250 * a nested NMI that updated the copy interrupt stack frame, a 1251 * a nested NMI that updated the copy interrupt stack frame, a
1251 * jump will be made to the repeat_nmi code that will handle the second 1252 * jump will be made to the repeat_nmi code that will handle the second
1252 * NMI. 1253 * NMI.
1254 *
1255 * However, espfix prevents us from directly returning to userspace
1256 * with a single IRET instruction. Similarly, IRET to user mode
1257 * can fault. We therefore handle NMIs from user space like
1258 * other IST entries.
1253 */ 1259 */
1254 1260
1255 /* Use %rdx as our temp variable throughout */ 1261 /* Use %rdx as our temp variable throughout */
1256 pushq %rdx 1262 pushq %rdx
1257 1263
1264 testb $3, CS-RIP+8(%rsp)
1265 jz .Lnmi_from_kernel
1266
1267 /*
1268 * NMI from user mode. We need to run on the thread stack, but we
1269 * can't go through the normal entry paths: NMIs are masked, and
1270 * we don't want to enable interrupts, because then we'll end
1271 * up in an awkward situation in which IRQs are on but NMIs
1272 * are off.
1273 */
1274
1275 SWAPGS
1276 cld
1277 movq %rsp, %rdx
1278 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
1279 pushq 5*8(%rdx) /* pt_regs->ss */
1280 pushq 4*8(%rdx) /* pt_regs->rsp */
1281 pushq 3*8(%rdx) /* pt_regs->flags */
1282 pushq 2*8(%rdx) /* pt_regs->cs */
1283 pushq 1*8(%rdx) /* pt_regs->rip */
1284 pushq $-1 /* pt_regs->orig_ax */
1285 pushq %rdi /* pt_regs->di */
1286 pushq %rsi /* pt_regs->si */
1287 pushq (%rdx) /* pt_regs->dx */
1288 pushq %rcx /* pt_regs->cx */
1289 pushq %rax /* pt_regs->ax */
1290 pushq %r8 /* pt_regs->r8 */
1291 pushq %r9 /* pt_regs->r9 */
1292 pushq %r10 /* pt_regs->r10 */
1293 pushq %r11 /* pt_regs->r11 */
1294 pushq %rbx /* pt_regs->rbx */
1295 pushq %rbp /* pt_regs->rbp */
1296 pushq %r12 /* pt_regs->r12 */
1297 pushq %r13 /* pt_regs->r13 */
1298 pushq %r14 /* pt_regs->r14 */
1299 pushq %r15 /* pt_regs->r15 */
1300
1301 /*
1302 * At this point we no longer need to worry about stack damage
1303 * due to nesting -- we're on the normal thread stack and we're
1304 * done with the NMI stack.
1305 */
1306
1307 movq %rsp, %rdi
1308 movq $-1, %rsi
1309 call do_nmi
1310
1311 /*
1312 * Return back to user mode. We must *not* do the normal exit
1313 * work, because we don't want to enable interrupts. Fortunately,
1314 * do_nmi doesn't modify pt_regs.
1315 */
1316 SWAPGS
1317 jmp restore_c_regs_and_iret
1318
1319.Lnmi_from_kernel:
1320 /*
1321 * Here's what our stack frame will look like:
1322 * +---------------------------------------------------------+
1323 * | original SS |
1324 * | original Return RSP |
1325 * | original RFLAGS |
1326 * | original CS |
1327 * | original RIP |
1328 * +---------------------------------------------------------+
1329 * | temp storage for rdx |
1330 * +---------------------------------------------------------+
1331 * | "NMI executing" variable |
1332 * +---------------------------------------------------------+
1333 * | iret SS } Copied from "outermost" frame |
1334 * | iret Return RSP } on each loop iteration; overwritten |
1335 * | iret RFLAGS } by a nested NMI to force another |
1336 * | iret CS } iteration if needed. |
1337 * | iret RIP } |
1338 * +---------------------------------------------------------+
1339 * | outermost SS } initialized in first_nmi; |
1340 * | outermost Return RSP } will not be changed before |
1341 * | outermost RFLAGS } NMI processing is done. |
1342 * | outermost CS } Copied to "iret" frame on each |
1343 * | outermost RIP } iteration. |
1344 * +---------------------------------------------------------+
1345 * | pt_regs |
1346 * +---------------------------------------------------------+
1347 *
1348 * The "original" frame is used by hardware. Before re-enabling
1349 * NMIs, we need to be done with it, and we need to leave enough
1350 * space for the asm code here.
1351 *
1352 * We return by executing IRET while RSP points to the "iret" frame.
1353 * That will either return for real or it will loop back into NMI
1354 * processing.
1355 *
1356 * The "outermost" frame is copied to the "iret" frame on each
1357 * iteration of the loop, so each iteration starts with the "iret"
1358 * frame pointing to the final return target.
1359 */
1360
1258 /* 1361 /*
1259 * If %cs was not the kernel segment, then the NMI triggered in user 1362 * Determine whether we're a nested NMI.
1260 * space, which means it is definitely not nested. 1363 *
1364 * If we interrupted kernel code between repeat_nmi and
1365 * end_repeat_nmi, then we are a nested NMI. We must not
1366 * modify the "iret" frame because it's being written by
1367 * the outer NMI. That's okay; the outer NMI handler is
1368 * about to about to call do_nmi anyway, so we can just
1369 * resume the outer NMI.
1261 */ 1370 */
1262 cmpl $__KERNEL_CS, 16(%rsp) 1371
1263 jne first_nmi 1372 movq $repeat_nmi, %rdx
1373 cmpq 8(%rsp), %rdx
1374 ja 1f
1375 movq $end_repeat_nmi, %rdx
1376 cmpq 8(%rsp), %rdx
1377 ja nested_nmi_out
13781:
1264 1379
1265 /* 1380 /*
1266 * Check the special variable on the stack to see if NMIs are 1381 * Now check "NMI executing". If it's set, then we're nested.
1267 * executing. 1382 * This will not detect if we interrupted an outer NMI just
1383 * before IRET.
1268 */ 1384 */
1269 cmpl $1, -8(%rsp) 1385 cmpl $1, -8(%rsp)
1270 je nested_nmi 1386 je nested_nmi
1271 1387
1272 /* 1388 /*
1273 * Now test if the previous stack was an NMI stack. 1389 * Now test if the previous stack was an NMI stack. This covers
1274 * We need the double check. We check the NMI stack to satisfy the 1390 * the case where we interrupt an outer NMI after it clears
1275 * race when the first NMI clears the variable before returning. 1391 * "NMI executing" but before IRET. We need to be careful, though:
1276 * We check the variable because the first NMI could be in a 1392 * there is one case in which RSP could point to the NMI stack
1277 * breakpoint routine using a breakpoint stack. 1393 * despite there being no NMI active: naughty userspace controls
1394 * RSP at the very beginning of the SYSCALL targets. We can
1395 * pull a fast one on naughty userspace, though: we program
1396 * SYSCALL to mask DF, so userspace cannot cause DF to be set
1397 * if it controls the kernel's RSP. We set DF before we clear
1398 * "NMI executing".
1278 */ 1399 */
1279 lea 6*8(%rsp), %rdx 1400 lea 6*8(%rsp), %rdx
1280 /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ 1401 /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1286,25 +1407,20 @@ ENTRY(nmi)
1286 cmpq %rdx, 4*8(%rsp) 1407 cmpq %rdx, 4*8(%rsp)
1287 /* If it is below the NMI stack, it is a normal NMI */ 1408 /* If it is below the NMI stack, it is a normal NMI */
1288 jb first_nmi 1409 jb first_nmi
1289 /* Ah, it is within the NMI stack, treat it as nested */ 1410
1411 /* Ah, it is within the NMI stack. */
1412
1413 testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
1414 jz first_nmi /* RSP was user controlled. */
1415
1416 /* This is a nested NMI. */
1290 1417
1291nested_nmi: 1418nested_nmi:
1292 /* 1419 /*
1293 * Do nothing if we interrupted the fixup in repeat_nmi. 1420 * Modify the "iret" frame to point to repeat_nmi, forcing another
1294 * It's about to repeat the NMI handler, so we are fine 1421 * iteration of NMI handling.
1295 * with ignoring this one.
1296 */ 1422 */
1297 movq $repeat_nmi, %rdx 1423 subq $8, %rsp
1298 cmpq 8(%rsp), %rdx
1299 ja 1f
1300 movq $end_repeat_nmi, %rdx
1301 cmpq 8(%rsp), %rdx
1302 ja nested_nmi_out
1303
13041:
1305 /* Set up the interrupted NMIs stack to jump to repeat_nmi */
1306 leaq -1*8(%rsp), %rdx
1307 movq %rdx, %rsp
1308 leaq -10*8(%rsp), %rdx 1424 leaq -10*8(%rsp), %rdx
1309 pushq $__KERNEL_DS 1425 pushq $__KERNEL_DS
1310 pushq %rdx 1426 pushq %rdx
@@ -1318,61 +1434,42 @@ nested_nmi:
1318nested_nmi_out: 1434nested_nmi_out:
1319 popq %rdx 1435 popq %rdx
1320 1436
1321 /* No need to check faults here */ 1437 /* We are returning to kernel mode, so this cannot result in a fault. */
1322 INTERRUPT_RETURN 1438 INTERRUPT_RETURN
1323 1439
1324first_nmi: 1440first_nmi:
1325 /* 1441 /* Restore rdx. */
1326 * Because nested NMIs will use the pushed location that we
1327 * stored in rdx, we must keep that space available.
1328 * Here's what our stack frame will look like:
1329 * +-------------------------+
1330 * | original SS |
1331 * | original Return RSP |
1332 * | original RFLAGS |
1333 * | original CS |
1334 * | original RIP |
1335 * +-------------------------+
1336 * | temp storage for rdx |
1337 * +-------------------------+
1338 * | NMI executing variable |
1339 * +-------------------------+
1340 * | copied SS |
1341 * | copied Return RSP |
1342 * | copied RFLAGS |
1343 * | copied CS |
1344 * | copied RIP |
1345 * +-------------------------+
1346 * | Saved SS |
1347 * | Saved Return RSP |
1348 * | Saved RFLAGS |
1349 * | Saved CS |
1350 * | Saved RIP |
1351 * +-------------------------+
1352 * | pt_regs |
1353 * +-------------------------+
1354 *
1355 * The saved stack frame is used to fix up the copied stack frame
1356 * that a nested NMI may change to make the interrupted NMI iret jump
1357 * to the repeat_nmi. The original stack frame and the temp storage
1358 * is also used by nested NMIs and can not be trusted on exit.
1359 */
1360 /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
1361 movq (%rsp), %rdx 1442 movq (%rsp), %rdx
1362 1443
1363 /* Set the NMI executing variable on the stack. */ 1444 /* Make room for "NMI executing". */
1364 pushq $1 1445 pushq $0
1365 1446
1366 /* Leave room for the "copied" frame */ 1447 /* Leave room for the "iret" frame */
1367 subq $(5*8), %rsp 1448 subq $(5*8), %rsp
1368 1449
1369 /* Copy the stack frame to the Saved frame */ 1450 /* Copy the "original" frame to the "outermost" frame */
1370 .rept 5 1451 .rept 5
1371 pushq 11*8(%rsp) 1452 pushq 11*8(%rsp)
1372 .endr 1453 .endr
1373 1454
1374 /* Everything up to here is safe from nested NMIs */ 1455 /* Everything up to here is safe from nested NMIs */
1375 1456
1457#ifdef CONFIG_DEBUG_ENTRY
1458 /*
1459 * For ease of testing, unmask NMIs right away. Disabled by
1460 * default because IRET is very expensive.
1461 */
1462 pushq $0 /* SS */
1463 pushq %rsp /* RSP (minus 8 because of the previous push) */
1464 addq $8, (%rsp) /* Fix up RSP */
1465 pushfq /* RFLAGS */
1466 pushq $__KERNEL_CS /* CS */
1467 pushq $1f /* RIP */
1468 INTERRUPT_RETURN /* continues at repeat_nmi below */
14691:
1470#endif
1471
1472repeat_nmi:
1376 /* 1473 /*
1377 * If there was a nested NMI, the first NMI's iret will return 1474 * If there was a nested NMI, the first NMI's iret will return
1378 * here. But NMIs are still enabled and we can take another 1475 * here. But NMIs are still enabled and we can take another
@@ -1381,16 +1478,20 @@ first_nmi:
1381 * it will just return, as we are about to repeat an NMI anyway. 1478 * it will just return, as we are about to repeat an NMI anyway.
1382 * This makes it safe to copy to the stack frame that a nested 1479 * This makes it safe to copy to the stack frame that a nested
1383 * NMI will update. 1480 * NMI will update.
1481 *
1482 * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
1483 * we're repeating an NMI, gsbase has the same value that it had on
1484 * the first iteration. paranoid_entry will load the kernel
1485 * gsbase if needed before we call do_nmi. "NMI executing"
1486 * is zero.
1384 */ 1487 */
1385repeat_nmi: 1488 movq $1, 10*8(%rsp) /* Set "NMI executing". */
1489
1386 /* 1490 /*
1387 * Update the stack variable to say we are still in NMI (the update 1491 * Copy the "outermost" frame to the "iret" frame. NMIs that nest
1388 * is benign for the non-repeat case, where 1 was pushed just above 1492 * here must not modify the "iret" frame while we're writing to
1389 * to this very stack slot). 1493 * it or it will end up containing garbage.
1390 */ 1494 */
1391 movq $1, 10*8(%rsp)
1392
1393 /* Make another copy, this one may be modified by nested NMIs */
1394 addq $(10*8), %rsp 1495 addq $(10*8), %rsp
1395 .rept 5 1496 .rept 5
1396 pushq -6*8(%rsp) 1497 pushq -6*8(%rsp)
@@ -1399,9 +1500,9 @@ repeat_nmi:
1399end_repeat_nmi: 1500end_repeat_nmi:
1400 1501
1401 /* 1502 /*
1402 * Everything below this point can be preempted by a nested 1503 * Everything below this point can be preempted by a nested NMI.
1403 * NMI if the first NMI took an exception and reset our iret stack 1504 * If this happens, then the inner NMI will change the "iret"
1404 * so that we repeat another NMI. 1505 * frame to point back to repeat_nmi.
1405 */ 1506 */
1406 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1507 pushq $-1 /* ORIG_RAX: no syscall to restart */
1407 ALLOC_PT_GPREGS_ON_STACK 1508 ALLOC_PT_GPREGS_ON_STACK
@@ -1415,28 +1516,11 @@ end_repeat_nmi:
1415 */ 1516 */
1416 call paranoid_entry 1517 call paranoid_entry
1417 1518
1418 /*
1419 * Save off the CR2 register. If we take a page fault in the NMI then
1420 * it could corrupt the CR2 value. If the NMI preempts a page fault
1421 * handler before it was able to read the CR2 register, and then the
1422 * NMI itself takes a page fault, the page fault that was preempted
1423 * will read the information from the NMI page fault and not the
1424 * origin fault. Save it off and restore it if it changes.
1425 * Use the r12 callee-saved register.
1426 */
1427 movq %cr2, %r12
1428
1429 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1519 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1430 movq %rsp, %rdi 1520 movq %rsp, %rdi
1431 movq $-1, %rsi 1521 movq $-1, %rsi
1432 call do_nmi 1522 call do_nmi
1433 1523
1434 /* Did the NMI take a page fault? Restore cr2 if it did */
1435 movq %cr2, %rcx
1436 cmpq %rcx, %r12
1437 je 1f
1438 movq %r12, %cr2
14391:
1440 testl %ebx, %ebx /* swapgs needed? */ 1524 testl %ebx, %ebx /* swapgs needed? */
1441 jnz nmi_restore 1525 jnz nmi_restore
1442nmi_swapgs: 1526nmi_swapgs:
@@ -1444,11 +1528,26 @@ nmi_swapgs:
1444nmi_restore: 1528nmi_restore:
1445 RESTORE_EXTRA_REGS 1529 RESTORE_EXTRA_REGS
1446 RESTORE_C_REGS 1530 RESTORE_C_REGS
1447 /* Pop the extra iret frame at once */ 1531
1532 /* Point RSP at the "iret" frame. */
1448 REMOVE_PT_GPREGS_FROM_STACK 6*8 1533 REMOVE_PT_GPREGS_FROM_STACK 6*8
1449 1534
1450 /* Clear the NMI executing stack variable */ 1535 /*
1451 movq $0, 5*8(%rsp) 1536 * Clear "NMI executing". Set DF first so that we can easily
1537 * distinguish the remaining code between here and IRET from
1538 * the SYSCALL entry and exit paths. On a native kernel, we
1539 * could just inspect RIP, but, on paravirt kernels,
1540 * INTERRUPT_RETURN can translate into a jump into a
1541 * hypercall page.
1542 */
1543 std
1544 movq $0, 5*8(%rsp) /* clear "NMI executing" */
1545
1546 /*
1547 * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
1548 * stack in a single instruction. We are returning to kernel
1549 * mode, so this cannot result in a fault.
1550 */
1452 INTERRUPT_RETURN 1551 INTERRUPT_RETURN
1453END(nmi) 1552END(nmi)
1454 1553
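
The stack-frame diagram and comments above are easier to digest as a rough C model of the control flow. This is purely illustrative -- the real logic has to stay in asm because it runs before any usable C environment exists, and all names below are invented for the sketch:

/* Rough, self-contained C model of the nested-NMI bookkeeping. */
#include <stdbool.h>

struct iret_frame { unsigned long rip, cs, rflags, rsp, ss; };

static struct iret_frame outermost_frame; /* fixed for the whole episode    */
static struct iret_frame iret_frame;      /* what IRET actually consumes    */
static int nmi_executing;                 /* the "NMI executing" variable   */
static bool repeat_requested;             /* models the rewritten iret frame */

/* A nested NMI never runs the handler itself; it only asks for a repeat. */
static void nested_nmi(void)
{
	if (nmi_executing)
		repeat_requested = true;  /* asm: point the "iret" frame at repeat_nmi */
}

static void handle_nmi(void)              /* stands in for do_nmi()         */
{
	/* ...a nested NMI may fire anywhere during this call... */
}

static void outermost_nmi(const struct iret_frame *original)
{
	outermost_frame = *original;          /* copied once, never changed    */

	do {
		repeat_requested = false;
		nmi_executing = 1;
		iret_frame = outermost_frame; /* reset the return target       */
		handle_nmi();
		nmi_executing = 0;
	} while (repeat_requested);           /* a nested NMI forced another lap */

	/* iret_frame == outermost_frame: IRET back to the interrupted code. */
}
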
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 0637826292de..c49c5173158e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -189,6 +189,7 @@ union fpregs_state {
189 struct fxregs_state fxsave; 189 struct fxregs_state fxsave;
190 struct swregs_state soft; 190 struct swregs_state soft;
191 struct xregs_state xsave; 191 struct xregs_state xsave;
192 u8 __padding[PAGE_SIZE];
192}; 193};
193 194
194/* 195/*
@@ -198,40 +199,6 @@ union fpregs_state {
198 */ 199 */
199struct fpu { 200struct fpu {
200 /* 201 /*
201 * @state:
202 *
203 * In-memory copy of all FPU registers that we save/restore
204 * over context switches. If the task is using the FPU then
205 * the registers in the FPU are more recent than this state
206 * copy. If the task context-switches away then they get
207 * saved here and represent the FPU state.
208 *
209 * After context switches there may be a (short) time period
210 * during which the in-FPU hardware registers are unchanged
211 * and still perfectly match this state, if the tasks
212 * scheduled afterwards are not using the FPU.
213 *
214 * This is the 'lazy restore' window of optimization, which
215 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
216 *
217 * We detect whether a subsequent task uses the FPU via setting
218 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
219 *
220 * During this window, if the task gets scheduled again, we
221 * might be able to skip having to do a restore from this
222 * memory buffer to the hardware registers - at the cost of
223 * incurring the overhead of #NM fault traps.
224 *
225 * Note that on modern CPUs that support the XSAVEOPT (or other
226 * optimized XSAVE instructions), we don't use #NM traps anymore,
227 * as the hardware can track whether FPU registers need saving
228 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
229 * logic, which unconditionally saves/restores all FPU state
230 * across context switches. (if FPU state exists.)
231 */
232 union fpregs_state state;
233
234 /*
235 * @last_cpu: 202 * @last_cpu:
236 * 203 *
237 * Records the last CPU on which this context was loaded into 204 * Records the last CPU on which this context was loaded into
@@ -288,6 +255,43 @@ struct fpu {
288 * deal with bursty apps that only use the FPU for a short time: 255 * deal with bursty apps that only use the FPU for a short time:
289 */ 256 */
290 unsigned char counter; 257 unsigned char counter;
258 /*
259 * @state:
260 *
261 * In-memory copy of all FPU registers that we save/restore
262 * over context switches. If the task is using the FPU then
263 * the registers in the FPU are more recent than this state
264 * copy. If the task context-switches away then they get
265 * saved here and represent the FPU state.
266 *
267 * After context switches there may be a (short) time period
268 * during which the in-FPU hardware registers are unchanged
269 * and still perfectly match this state, if the tasks
270 * scheduled afterwards are not using the FPU.
271 *
272 * This is the 'lazy restore' window of optimization, which
273 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
274 *
275 * We detect whether a subsequent task uses the FPU via setting
276 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
277 *
278 * During this window, if the task gets scheduled again, we
279 * might be able to skip having to do a restore from this
280 * memory buffer to the hardware registers - at the cost of
281 * incurring the overhead of #NM fault traps.
282 *
283 * Note that on modern CPUs that support the XSAVEOPT (or other
284 * optimized XSAVE instructions), we don't use #NM traps anymore,
285 * as the hardware can track whether FPU registers need saving
286 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
287 * logic, which unconditionally saves/restores all FPU state
288 * across context switches. (if FPU state exists.)
289 */
290 union fpregs_state state;
291 /*
292 * WARNING: 'state' is dynamically-sized. Do not put
293 * anything after it here.
294 */
291}; 295};
292 296
293#endif /* _ASM_X86_FPU_H */ 297#endif /* _ASM_X86_FPU_H */
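
The WARNING added above exists because 'state' now has a boot-time size: the allocation reserves xstate_size bytes for it (see fpu__init_task_struct_size() below), which generally differs from the compile-time sizeof. A schematic of why nothing may follow it (field names are illustrative):

/*
 * Schematic only.  The compiler lays the struct out using the declared
 * size of 'state', but the containing task_struct is allocated using the
 * boot-time xstate_size instead, so the live register image either runs
 * past the declared member or the allocation stops short of it.  Any
 * field declared after 'state' would therefore be overwritten by the
 * XSAVE image or land outside the allocation.
 */
struct fpu_like {
	unsigned int		last_cpu;
	unsigned char		fpregs_active;
	/* ...fixed-size bookkeeping... */

	union fpregs_state	state;	/* real size decided at boot */
	/* nothing may go here */
};
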
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519df0d5..944f1785ed0d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -390,9 +390,6 @@ struct thread_struct {
390#endif 390#endif
391 unsigned long gs; 391 unsigned long gs;
392 392
393 /* Floating point and extended processor state */
394 struct fpu fpu;
395
396 /* Save middle states of ptrace breakpoints */ 393 /* Save middle states of ptrace breakpoints */
397 struct perf_event *ptrace_bps[HBP_NUM]; 394 struct perf_event *ptrace_bps[HBP_NUM];
398 /* Debug status used for traps, single steps, etc... */ 395 /* Debug status used for traps, single steps, etc... */
@@ -418,6 +415,13 @@ struct thread_struct {
418 unsigned long iopl; 415 unsigned long iopl;
419 /* Max allowed port in the bitmap, in bytes: */ 416 /* Max allowed port in the bitmap, in bytes: */
420 unsigned io_bitmap_max; 417 unsigned io_bitmap_max;
418
419 /* Floating point and extended processor state */
420 struct fpu fpu;
421 /*
422 * WARNING: 'fpu' is dynamically-sized. It *MUST* be at
423 * the end.
424 */
421}; 425};
422 426
423/* 427/*
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 32826791e675..0b39173dd971 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -4,6 +4,8 @@
4#include <asm/fpu/internal.h> 4#include <asm/fpu/internal.h>
5#include <asm/tlbflush.h> 5#include <asm/tlbflush.h>
6 6
7#include <linux/sched.h>
8
7/* 9/*
8 * Initialize the TS bit in CR0 according to the style of context-switches 10 * Initialize the TS bit in CR0 according to the style of context-switches
9 * we are using: 11 * we are using:
@@ -136,6 +138,43 @@ static void __init fpu__init_system_generic(void)
136unsigned int xstate_size; 138unsigned int xstate_size;
137EXPORT_SYMBOL_GPL(xstate_size); 139EXPORT_SYMBOL_GPL(xstate_size);
138 140
141/* Enforce that 'MEMBER' is the last field of 'TYPE': */
142#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
143 BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
144
145/*
146 * We append the 'struct fpu' to the task_struct:
147 */
148static void __init fpu__init_task_struct_size(void)
149{
150 int task_size = sizeof(struct task_struct);
151
152 /*
153 * Subtract off the static size of the register state.
154 * It potentially has a bunch of padding.
155 */
156 task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
157
158 /*
159 * Add back the dynamically-calculated register state
160 * size.
161 */
162 task_size += xstate_size;
163
164 /*
165 * We dynamically size 'struct fpu', so we require that
166 * it be at the end of 'thread_struct' and that
167 * 'thread_struct' be at the end of 'task_struct'. If
168 * you hit a compile error here, check the structure to
169 * see if something got added to the end.
170 */
171 CHECK_MEMBER_AT_END_OF(struct fpu, state);
172 CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
173 CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
174
175 arch_task_struct_size = task_size;
176}
177
139/* 178/*
140 * Set up the xstate_size based on the legacy FPU context size. 179 * Set up the xstate_size based on the legacy FPU context size.
141 * 180 *
@@ -287,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
287 fpu__init_system_generic(); 326 fpu__init_system_generic();
288 fpu__init_system_xstate_size_legacy(); 327 fpu__init_system_xstate_size_legacy();
289 fpu__init_system_xstate(); 328 fpu__init_system_xstate();
329 fpu__init_task_struct_size();
290 330
291 fpu__init_system_ctx_switch(); 331 fpu__init_system_ctx_switch();
292} 332}
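
The CHECK_MEMBER_AT_END_OF() trick above relies on a struct's size equalling the end offset of its last member (provided the last member leaves no trailing padding). A minimal stand-alone illustration of the same idea, compilable outside the kernel with C11:

#include <assert.h>
#include <stddef.h>

/* Same definition the kernel uses: offset of the first byte past MEMBER. */
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
	static_assert(sizeof(TYPE) == offsetofend(TYPE, MEMBER), \
		      #MEMBER " must be the last member of " #TYPE)

struct example {
	int	nr;
	long	tail[4];	/* the member that must stay last */
};

CHECK_MEMBER_AT_END_OF(struct example, tail);	/* OK */
/* Declaring any field after 'tail' turns this into a compile error. */
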
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index c3e985d1751c..d05bd2e2ee91 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_regs *regs)
408NOKPROBE_SYMBOL(default_do_nmi); 408NOKPROBE_SYMBOL(default_do_nmi);
409 409
410/* 410/*
411 * NMIs can hit breakpoints which will cause it to lose its 411 * NMIs can page fault or hit breakpoints which will cause it to lose
412 * NMI context with the CPU when the breakpoint does an iret. 412 * its NMI context with the CPU when the breakpoint or page fault does an IRET.
413 */ 413 *
414#ifdef CONFIG_X86_32 414 * As a result, NMIs can nest if NMIs get unmasked due an IRET during
415/* 415 * NMI processing. On x86_64, the asm glue protects us from nested NMIs
416 * For i386, NMIs use the same stack as the kernel, and we can 416 * if the outer NMI came from kernel mode, but we can still nest if the
417 * add a workaround to the iret problem in C (preventing nested 417 * outer NMI came from user mode.
418 * NMIs if an NMI takes a trap). Simply have 3 states the NMI 418 *
419 * can be in: 419 * To handle these nested NMIs, we have three states:
420 * 420 *
421 * 1) not running 421 * 1) not running
422 * 2) executing 422 * 2) executing
@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi);
430 * (Note, the latch is binary, thus multiple NMIs triggering, 430 * (Note, the latch is binary, thus multiple NMIs triggering,
431 * when one is running, are ignored. Only one NMI is restarted.) 431 * when one is running, are ignored. Only one NMI is restarted.)
432 * 432 *
433 * If an NMI hits a breakpoint that executes an iret, another 433 * If an NMI executes an iret, another NMI can preempt it. We do not
434 * NMI can preempt it. We do not want to allow this new NMI 434 * want to allow this new NMI to run, but we want to execute it when the
435 * to run, but we want to execute it when the first one finishes. 435 * first one finishes. We set the state to "latched", and the exit of
436 * We set the state to "latched", and the exit of the first NMI will 436 * the first NMI will perform a dec_return, if the result is zero
437 * perform a dec_return, if the result is zero (NOT_RUNNING), then 437 * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
438 * it will simply exit the NMI handler. If not, the dec_return 438 * dec_return would have set the state to NMI_EXECUTING (what we want it
439 * would have set the state to NMI_EXECUTING (what we want it to 439 * to be when we are running). In this case, we simply jump back to
440 * be when we are running). In this case, we simply jump back 440 * rerun the NMI handler again, and restart the 'latched' NMI.
441 * to rerun the NMI handler again, and restart the 'latched' NMI.
442 * 441 *
443 * No trap (breakpoint or page fault) should be hit before nmi_restart, 442 * No trap (breakpoint or page fault) should be hit before nmi_restart,
444 * thus there is no race between the first check of state for NOT_RUNNING 443 * thus there is no race between the first check of state for NOT_RUNNING
@@ -461,49 +460,36 @@ enum nmi_states {
461static DEFINE_PER_CPU(enum nmi_states, nmi_state); 460static DEFINE_PER_CPU(enum nmi_states, nmi_state);
462static DEFINE_PER_CPU(unsigned long, nmi_cr2); 461static DEFINE_PER_CPU(unsigned long, nmi_cr2);
463 462
464#define nmi_nesting_preprocess(regs) \ 463#ifdef CONFIG_X86_64
465 do { \
466 if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
467 this_cpu_write(nmi_state, NMI_LATCHED); \
468 return; \
469 } \
470 this_cpu_write(nmi_state, NMI_EXECUTING); \
471 this_cpu_write(nmi_cr2, read_cr2()); \
472 } while (0); \
473 nmi_restart:
474
475#define nmi_nesting_postprocess() \
476 do { \
477 if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
478 write_cr2(this_cpu_read(nmi_cr2)); \
479 if (this_cpu_dec_return(nmi_state)) \
480 goto nmi_restart; \
481 } while (0)
482#else /* x86_64 */
483/* 464/*
484 * In x86_64 things are a bit more difficult. This has the same problem 465 * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
485 * where an NMI hitting a breakpoint that calls iret will remove the 466 * some care, the inner breakpoint will clobber the outer breakpoint's
486 * NMI context, allowing a nested NMI to enter. What makes this more 467 * stack.
487 * difficult is that both NMIs and breakpoints have their own stack.
488 * When a new NMI or breakpoint is executed, the stack is set to a fixed
489 * point. If an NMI is nested, it will have its stack set at that same
490 * fixed address that the first NMI had, and will start corrupting the
491 * stack. This is handled in entry_64.S, but the same problem exists with
492 * the breakpoint stack.
493 * 468 *
494 * If a breakpoint is being processed, and the debug stack is being used, 469 * If a breakpoint is being processed, and the debug stack is being
495 * if an NMI comes in and also hits a breakpoint, the stack pointer 470 * used, if an NMI comes in and also hits a breakpoint, the stack
496 * will be set to the same fixed address as the breakpoint that was 471 * pointer will be set to the same fixed address as the breakpoint that
497 * interrupted, causing that stack to be corrupted. To handle this case, 472 * was interrupted, causing that stack to be corrupted. To handle this
498 * check if the stack that was interrupted is the debug stack, and if 473 * case, check if the stack that was interrupted is the debug stack, and
499 * so, change the IDT so that new breakpoints will use the current stack 474 * if so, change the IDT so that new breakpoints will use the current
500 * and not switch to the fixed address. On return of the NMI, switch back 475 * stack and not switch to the fixed address. On return of the NMI,
501 * to the original IDT. 476 * switch back to the original IDT.
502 */ 477 */
503static DEFINE_PER_CPU(int, update_debug_stack); 478static DEFINE_PER_CPU(int, update_debug_stack);
479#endif
504 480
505static inline void nmi_nesting_preprocess(struct pt_regs *regs) 481dotraplinkage notrace void
482do_nmi(struct pt_regs *regs, long error_code)
506{ 483{
484 if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
485 this_cpu_write(nmi_state, NMI_LATCHED);
486 return;
487 }
488 this_cpu_write(nmi_state, NMI_EXECUTING);
489 this_cpu_write(nmi_cr2, read_cr2());
490nmi_restart:
491
492#ifdef CONFIG_X86_64
507 /* 493 /*
508 * If we interrupted a breakpoint, it is possible that 494 * If we interrupted a breakpoint, it is possible that
509 * the nmi handler will have breakpoints too. We need to 495 * the nmi handler will have breakpoints too. We need to
@@ -514,22 +500,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
514 debug_stack_set_zero(); 500 debug_stack_set_zero();
515 this_cpu_write(update_debug_stack, 1); 501 this_cpu_write(update_debug_stack, 1);
516 } 502 }
517}
518
519static inline void nmi_nesting_postprocess(void)
520{
521 if (unlikely(this_cpu_read(update_debug_stack))) {
522 debug_stack_reset();
523 this_cpu_write(update_debug_stack, 0);
524 }
525}
526#endif 503#endif
527 504
528dotraplinkage notrace void
529do_nmi(struct pt_regs *regs, long error_code)
530{
531 nmi_nesting_preprocess(regs);
532
533 nmi_enter(); 505 nmi_enter();
534 506
535 inc_irq_stat(__nmi_count); 507 inc_irq_stat(__nmi_count);
@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_code)
539 511
540 nmi_exit(); 512 nmi_exit();
541 513
542 /* On i386, may loop back to preprocess */ 514#ifdef CONFIG_X86_64
543 nmi_nesting_postprocess(); 515 if (unlikely(this_cpu_read(update_debug_stack))) {
516 debug_stack_reset();
517 this_cpu_write(update_debug_stack, 0);
518 }
519#endif
520
521 if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
522 write_cr2(this_cpu_read(nmi_cr2));
523 if (this_cpu_dec_return(nmi_state))
524 goto nmi_restart;
544} 525}
545NOKPROBE_SYMBOL(do_nmi); 526NOKPROBE_SYMBOL(do_nmi);
546 527
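
The state machine that the rewritten do_nmi() implements can be reduced to a few lines. A stripped-down model, with the per-CPU state collapsed to a plain variable and the real handler body stubbed out (illustrative only):

enum nmi_states { NMI_NOT_RUNNING = 0, NMI_EXECUTING, NMI_LATCHED };

static enum nmi_states nmi_state;

static void handle_one_nmi(void)
{
	/* default_do_nmi() and friends in the real code */
}

static void model_do_nmi(void)
{
	if (nmi_state != NMI_NOT_RUNNING) {
		/* Nested entry: note that one more NMI is pending. */
		nmi_state = NMI_LATCHED;
		return;
	}
	nmi_state = NMI_EXECUTING;

restart:
	handle_one_nmi();

	/*
	 * The decrement mirrors this_cpu_dec_return(nmi_state):
	 *   EXECUTING(1) - 1 == NOT_RUNNING(0) -> done;
	 *   LATCHED(2)   - 1 == EXECUTING(1)   -> an NMI arrived while we
	 *   were running, so go around once more on its behalf.
	 */
	if (--nmi_state)
		goto restart;
}
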
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9cad694ed7c4..397688beed4b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
81 */ 81 */
82int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 82int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
83{ 83{
84 *dst = *src; 84 memcpy(dst, src, arch_task_struct_size);
85 85
86 return fpu__copy(&dst->thread.fpu, &src->thread.fpu); 86 return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
87} 87}
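
The switch from a struct assignment to memcpy() above matters because task_struct slab objects are now arch_task_struct_size bytes, which need not equal sizeof(struct task_struct): a plain '*dst = *src' copies the compile-time size, so it would either truncate the boot-time-sized FPU state or write past the end of the allocation. Schematic contrast (not the real arch_dup_task_struct()):

#include <linux/sched.h>
#include <linux/string.h>

static void dup_task_wrong(struct task_struct *dst, struct task_struct *src)
{
	*dst = *src;	/* copies sizeof(struct task_struct) bytes: wrong size */
}

static void dup_task_right(struct task_struct *dst, struct task_struct *src)
{
	/* copies exactly what was allocated, variable FPU tail included */
	memcpy(dst, src, arch_task_struct_size);
}
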
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 91a4e6426321..92e6726f6e37 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
92 roundup(sizeof(CORE_STR), 4)) + 92 roundup(sizeof(CORE_STR), 4)) +
93 roundup(sizeof(struct elf_prstatus), 4) + 93 roundup(sizeof(struct elf_prstatus), 4) +
94 roundup(sizeof(struct elf_prpsinfo), 4) + 94 roundup(sizeof(struct elf_prpsinfo), 4) +
95 roundup(sizeof(struct task_struct), 4); 95 roundup(arch_task_struct_size, 4);
96 *elf_buflen = PAGE_ALIGN(*elf_buflen); 96 *elf_buflen = PAGE_ALIGN(*elf_buflen);
97 return size + *elf_buflen; 97 return size + *elf_buflen;
98} 98}
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
415 /* set up the task structure */ 415 /* set up the task structure */
416 notes[2].name = CORE_STR; 416 notes[2].name = CORE_STR;
417 notes[2].type = NT_TASKSTRUCT; 417 notes[2].type = NT_TASKSTRUCT;
418 notes[2].datasz = sizeof(struct task_struct); 418 notes[2].datasz = arch_task_struct_size;
419 notes[2].data = current; 419 notes[2].data = current;
420 420
421 nhdr->p_filesz += notesize(&notes[2]); 421 nhdr->p_filesz += notesize(&notes[2]);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ae21f1591615..04b5ada460b4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1522,8 +1522,6 @@ struct task_struct {
1522/* hung task detection */ 1522/* hung task detection */
1523 unsigned long last_switch_count; 1523 unsigned long last_switch_count;
1524#endif 1524#endif
1525/* CPU-specific state of this task */
1526 struct thread_struct thread;
1527/* filesystem information */ 1525/* filesystem information */
1528 struct fs_struct *fs; 1526 struct fs_struct *fs;
1529/* open file information */ 1527/* open file information */
@@ -1778,8 +1776,22 @@ struct task_struct {
1778 unsigned long task_state_change; 1776 unsigned long task_state_change;
1779#endif 1777#endif
1780 int pagefault_disabled; 1778 int pagefault_disabled;
1779/* CPU-specific state of this task */
1780 struct thread_struct thread;
1781/*
1782 * WARNING: on x86, 'thread_struct' contains a variable-sized
1783 * structure. It *MUST* be at the end of 'task_struct'.
1784 *
1785 * Do not put anything below here!
1786 */
1781}; 1787};
1782 1788
1789#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
1790extern int arch_task_struct_size __read_mostly;
1791#else
1792# define arch_task_struct_size (sizeof(struct task_struct))
1793#endif
1794
1783/* Future-safe accessor for struct task_struct's cpus_allowed. */ 1795/* Future-safe accessor for struct task_struct's cpus_allowed. */
1784#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) 1796#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
1785 1797
diff --git a/kernel/fork.c b/kernel/fork.c
index 1bfefc6f96a4..dbd9b8d7b7cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -287,6 +287,11 @@ static void set_max_threads(unsigned int max_threads_suggested)
287 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); 287 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
288} 288}
289 289
290#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
291/* Initialized by the architecture: */
292int arch_task_struct_size __read_mostly;
293#endif
294
290void __init fork_init(void) 295void __init fork_init(void)
291{ 296{
292#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 297#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
@@ -295,7 +300,7 @@ void __init fork_init(void)
295#endif 300#endif
296 /* create a slab on which task_structs can be allocated */ 301 /* create a slab on which task_structs can be allocated */
297 task_struct_cachep = 302 task_struct_cachep =
298 kmem_cache_create("task_struct", sizeof(struct task_struct), 303 kmem_cache_create("task_struct", arch_task_struct_size,
299 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); 304 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
300#endif 305#endif
301 306
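
One ordering constraint is implicit in the hunk above: arch_task_struct_size has to be populated before fork_init() runs, because kmem_cache_create() latches the object size when the cache is created. A schematic of the boot sequence this relies on -- not the literal start_kernel() code; on x86 the value is filled in by fpu__init_task_struct_size() during early CPU/FPU setup (see the fpu/init.c hunk), while every other architecture keeps the sizeof() fallback from <linux/sched.h>:

#include <linux/init.h>

/* Schematic excerpt of the boot ordering, with prototypes declared locally. */
extern void __init setup_arch(char **cmdline_p);
extern void __init fork_init(void);

static void __init example_boot_order(void)
{
	char *command_line;

	/*
	 * Arch setup runs first; on x86 this path reaches
	 * fpu__init_system() -> fpu__init_task_struct_size(), which
	 * writes arch_task_struct_size.
	 */
	setup_arch(&command_line);

	/* ...much later in start_kernel()... */

	/* Latches arch_task_struct_size into the "task_struct" slab: */
	fork_init();
}
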