 arch/Kconfig                     |   4
 arch/x86/Kconfig                 |   1
 arch/x86/Kconfig.debug           |  12
 arch/x86/entry/entry_64.S        | 299
 arch/x86/include/asm/fpu/types.h |  72
 arch/x86/include/asm/processor.h |  10
 arch/x86/kernel/fpu/init.c       |  40
 arch/x86/kernel/nmi.c            | 123
 arch/x86/kernel/process.c        |   2
 fs/proc/kcore.c                  |   4
 include/linux/sched.h            |  16
 kernel/fork.c                    |   7
 12 files changed, 376 insertions(+), 214 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index bec6666a3cc4..8a8ea7110de8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -221,6 +221,10 @@ config ARCH_TASK_STRUCT_ALLOCATOR
 config ARCH_THREAD_INFO_ALLOCATOR
 	bool
 
+# Select if arch wants to size task_struct dynamically via arch_task_struct_size:
+config ARCH_WANTS_DYNAMIC_TASK_STRUCT
+	bool
+
 config HAVE_REGS_AND_STACK_ACCESS_API
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3dbb7e7909ca..b3a1a5d77d92 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
 	select ARCH_USE_CMPXCHG_LOCKREF if X86_64
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_IPC_PARSE_VERSION if X86_32
 	select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index a15893d17c55..d8c0d3266173 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -297,6 +297,18 @@ config OPTIMIZE_INLINING
 
 	  If unsure, say N.
 
+config DEBUG_ENTRY
+	bool "Debug low-level entry code"
+	depends on DEBUG_KERNEL
+	---help---
+	  This option enables sanity checks in x86's low-level entry code.
+	  Some of these sanity checks may slow down kernel entries and
+	  exits or otherwise impact performance.
+
+	  This is currently used to help test NMI code.
+
+	  If unsure, say N.
+
 config DEBUG_NMI_SELFTEST
 	bool "NMI Selftest"
 	depends on DEBUG_KERNEL && X86_LOCAL_APIC
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3bb2c4302df1..8cb3e438f21e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1237,11 +1237,12 @@ ENTRY(nmi)
	 * If the variable is not set and the stack is not the NMI
	 * stack then:
	 *  o Set the special variable on the stack
-	 *  o Copy the interrupt frame into a "saved" location on the stack
-	 *  o Copy the interrupt frame into a "copy" location on the stack
+	 *  o Copy the interrupt frame into an "outermost" location on the
+	 *    stack
+	 *  o Copy the interrupt frame into an "iret" location on the stack
	 *  o Continue processing the NMI
	 * If the variable is set or the previous stack is the NMI stack:
-	 *  o Modify the "copy" location to jump to the repeate_nmi
+	 *  o Modify the "iret" location to jump to the repeat_nmi
	 *  o return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable
@@ -1250,31 +1251,151 @@ ENTRY(nmi)
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
+	 *
+	 * However, espfix prevents us from directly returning to userspace
+	 * with a single IRET instruction.  Similarly, IRET to user mode
+	 * can fault.  We therefore handle NMIs from user space like
+	 * other IST entries.
	 */

	/* Use %rdx as our temp variable throughout */
	pushq %rdx

+	testb $3, CS-RIP+8(%rsp)
+	jz .Lnmi_from_kernel
+
+	/*
+	 * NMI from user mode.  We need to run on the thread stack, but we
+	 * can't go through the normal entry paths: NMIs are masked, and
+	 * we don't want to enable interrupts, because then we'll end
+	 * up in an awkward situation in which IRQs are on but NMIs
+	 * are off.
+	 */
+
+	SWAPGS
+	cld
+	movq %rsp, %rdx
+	movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+	pushq 5*8(%rdx)		/* pt_regs->ss */
+	pushq 4*8(%rdx)		/* pt_regs->rsp */
+	pushq 3*8(%rdx)		/* pt_regs->flags */
+	pushq 2*8(%rdx)		/* pt_regs->cs */
+	pushq 1*8(%rdx)		/* pt_regs->rip */
+	pushq $-1		/* pt_regs->orig_ax */
+	pushq %rdi		/* pt_regs->di */
+	pushq %rsi		/* pt_regs->si */
+	pushq (%rdx)		/* pt_regs->dx */
+	pushq %rcx		/* pt_regs->cx */
+	pushq %rax		/* pt_regs->ax */
+	pushq %r8		/* pt_regs->r8 */
+	pushq %r9		/* pt_regs->r9 */
+	pushq %r10		/* pt_regs->r10 */
+	pushq %r11		/* pt_regs->r11 */
+	pushq %rbx		/* pt_regs->rbx */
+	pushq %rbp		/* pt_regs->rbp */
+	pushq %r12		/* pt_regs->r12 */
+	pushq %r13		/* pt_regs->r13 */
+	pushq %r14		/* pt_regs->r14 */
+	pushq %r15		/* pt_regs->r15 */
+
+	/*
+	 * At this point we no longer need to worry about stack damage
+	 * due to nesting -- we're on the normal thread stack and we're
+	 * done with the NMI stack.
+	 */
+
+	movq %rsp, %rdi
+	movq $-1, %rsi
+	call do_nmi
+
+	/*
+	 * Return back to user mode.  We must *not* do the normal exit
+	 * work, because we don't want to enable interrupts.  Fortunately,
+	 * do_nmi doesn't modify pt_regs.
+	 */
+	SWAPGS
+	jmp restore_c_regs_and_iret
+
+.Lnmi_from_kernel:
+	/*
+	 * Here's what our stack frame will look like:
+	 * +---------------------------------------------------------+
+	 * | original SS                                              |
+	 * | original Return RSP                                      |
+	 * | original RFLAGS                                          |
+	 * | original CS                                              |
+	 * | original RIP                                             |
+	 * +---------------------------------------------------------+
+	 * | temp storage for rdx                                     |
+	 * +---------------------------------------------------------+
+	 * | "NMI executing" variable                                 |
+	 * +---------------------------------------------------------+
+	 * | iret SS          } Copied from "outermost" frame         |
+	 * | iret Return RSP  } on each loop iteration; overwritten   |
+	 * | iret RFLAGS      } by a nested NMI to force another      |
+	 * | iret CS          } iteration if needed.                  |
+	 * | iret RIP         }                                       |
+	 * +---------------------------------------------------------+
+	 * | outermost SS          } initialized in first_nmi;        |
+	 * | outermost Return RSP  } will not be changed before       |
+	 * | outermost RFLAGS      } NMI processing is done.          |
+	 * | outermost CS          } Copied to "iret" frame on each   |
+	 * | outermost RIP         } iteration.                       |
+	 * +---------------------------------------------------------+
+	 * | pt_regs                                                  |
+	 * +---------------------------------------------------------+
+	 *
+	 * The "original" frame is used by hardware.  Before re-enabling
+	 * NMIs, we need to be done with it, and we need to leave enough
+	 * space for the asm code here.
+	 *
+	 * We return by executing IRET while RSP points to the "iret" frame.
+	 * That will either return for real or it will loop back into NMI
+	 * processing.
+	 *
+	 * The "outermost" frame is copied to the "iret" frame on each
+	 * iteration of the loop, so each iteration starts with the "iret"
+	 * frame pointing to the final return target.
+	 */
+
	/*
-	 * If %cs was not the kernel segment, then the NMI triggered in user
-	 * space, which means it is definitely not nested.
+	 * Determine whether we're a nested NMI.
+	 *
+	 * If we interrupted kernel code between repeat_nmi and
+	 * end_repeat_nmi, then we are a nested NMI.  We must not
+	 * modify the "iret" frame because it's being written by
+	 * the outer NMI.  That's okay; the outer NMI handler is
+	 * about to call do_nmi anyway, so we can just
+	 * resume the outer NMI.
	 */
-	cmpl $__KERNEL_CS, 16(%rsp)
-	jne first_nmi
+
+	movq $repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja 1f
+	movq $end_repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja nested_nmi_out
+1:

	/*
-	 * Check the special variable on the stack to see if NMIs are
-	 * executing.
+	 * Now check "NMI executing".  If it's set, then we're nested.
+	 * This will not detect if we interrupted an outer NMI just
+	 * before IRET.
	 */
	cmpl $1, -8(%rsp)
	je nested_nmi

	/*
-	 * Now test if the previous stack was an NMI stack.
-	 * We need the double check. We check the NMI stack to satisfy the
-	 * race when the first NMI clears the variable before returning.
-	 * We check the variable because the first NMI could be in a
-	 * breakpoint routine using a breakpoint stack.
+	 * Now test if the previous stack was an NMI stack.  This covers
+	 * the case where we interrupt an outer NMI after it clears
+	 * "NMI executing" but before IRET.  We need to be careful, though:
+	 * there is one case in which RSP could point to the NMI stack
+	 * despite there being no NMI active: naughty userspace controls
+	 * RSP at the very beginning of the SYSCALL targets.  We can
+	 * pull a fast one on naughty userspace, though: we program
+	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
+	 * if it controls the kernel's RSP.  We set DF before we clear
+	 * "NMI executing".
	 */
	lea 6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1286,25 +1407,20 @@ ENTRY(nmi)
	cmpq %rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb first_nmi
-	/* Ah, it is within the NMI stack, treat it as nested */
+
+	/* Ah, it is within the NMI stack. */
+
+	testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+	jz first_nmi	/* RSP was user controlled. */
+
+	/* This is a nested NMI. */

nested_nmi:
	/*
-	 * Do nothing if we interrupted the fixup in repeat_nmi.
-	 * It's about to repeat the NMI handler, so we are fine
-	 * with ignoring this one.
+	 * Modify the "iret" frame to point to repeat_nmi, forcing another
+	 * iteration of NMI handling.
	 */
-	movq $repeat_nmi, %rdx
-	cmpq 8(%rsp), %rdx
-	ja 1f
-	movq $end_repeat_nmi, %rdx
-	cmpq 8(%rsp), %rdx
-	ja nested_nmi_out
-
-1:
-	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
-	leaq -1*8(%rsp), %rdx
-	movq %rdx, %rsp
+	subq $8, %rsp
	leaq -10*8(%rsp), %rdx
	pushq $__KERNEL_DS
	pushq %rdx
@@ -1318,61 +1434,42 @@ nested_nmi:
nested_nmi_out:
	popq %rdx

-	/* No need to check faults here */
+	/* We are returning to kernel mode, so this cannot result in a fault. */
	INTERRUPT_RETURN

first_nmi:
-	/*
-	 * Because nested NMIs will use the pushed location that we
-	 * stored in rdx, we must keep that space available.
-	 * Here's what our stack frame will look like:
-	 * +-------------------------+
-	 * | original SS             |
-	 * | original Return RSP     |
-	 * | original RFLAGS         |
-	 * | original CS             |
-	 * | original RIP            |
-	 * +-------------------------+
-	 * | temp storage for rdx    |
-	 * +-------------------------+
-	 * | NMI executing variable  |
-	 * +-------------------------+
-	 * | copied SS               |
-	 * | copied Return RSP       |
-	 * | copied RFLAGS           |
-	 * | copied CS               |
-	 * | copied RIP              |
-	 * +-------------------------+
-	 * | Saved SS                |
-	 * | Saved Return RSP        |
-	 * | Saved RFLAGS            |
-	 * | Saved CS                |
-	 * | Saved RIP               |
-	 * +-------------------------+
-	 * | pt_regs                 |
-	 * +-------------------------+
-	 *
-	 * The saved stack frame is used to fix up the copied stack frame
-	 * that a nested NMI may change to make the interrupted NMI iret jump
-	 * to the repeat_nmi. The original stack frame and the temp storage
-	 * is also used by nested NMIs and can not be trusted on exit.
-	 */
-	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+	/* Restore rdx. */
	movq (%rsp), %rdx

-	/* Set the NMI executing variable on the stack. */
-	pushq $1
+	/* Make room for "NMI executing". */
+	pushq $0

-	/* Leave room for the "copied" frame */
+	/* Leave room for the "iret" frame */
	subq $(5*8), %rsp

-	/* Copy the stack frame to the Saved frame */
+	/* Copy the "original" frame to the "outermost" frame */
	.rept 5
	pushq 11*8(%rsp)
	.endr

	/* Everything up to here is safe from nested NMIs */

+#ifdef CONFIG_DEBUG_ENTRY
+	/*
+	 * For ease of testing, unmask NMIs right away.  Disabled by
+	 * default because IRET is very expensive.
+	 */
+	pushq $0		/* SS */
+	pushq %rsp		/* RSP (minus 8 because of the previous push) */
+	addq $8, (%rsp)		/* Fix up RSP */
+	pushfq			/* RFLAGS */
+	pushq $__KERNEL_CS	/* CS */
+	pushq $1f		/* RIP */
+	INTERRUPT_RETURN	/* continues at repeat_nmi below */
+1:
+#endif
+
+repeat_nmi:
	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
@@ -1381,16 +1478,20 @@ first_nmi:
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
+	 *
+	 * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
+	 * we're repeating an NMI, gsbase has the same value that it had on
+	 * the first iteration.  paranoid_entry will load the kernel
+	 * gsbase if needed before we call do_nmi.  "NMI executing"
+	 * is zero.
	 */
-repeat_nmi:
+	movq $1, 10*8(%rsp)		/* Set "NMI executing". */
+
	/*
-	 * Update the stack variable to say we are still in NMI (the update
-	 * is benign for the non-repeat case, where 1 was pushed just above
-	 * to this very stack slot).
+	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
+	 * here must not modify the "iret" frame while we're writing to
+	 * it or it will end up containing garbage.
	 */
-	movq $1, 10*8(%rsp)
-
-	/* Make another copy, this one may be modified by nested NMIs */
	addq $(10*8), %rsp
	.rept 5
	pushq -6*8(%rsp)
@@ -1399,9 +1500,9 @@ repeat_nmi:
end_repeat_nmi:

	/*
-	 * Everything below this point can be preempted by a nested
-	 * NMI if the first NMI took an exception and reset our iret stack
-	 * so that we repeat another NMI.
+	 * Everything below this point can be preempted by a nested NMI.
+	 * If this happens, then the inner NMI will change the "iret"
+	 * frame to point back to repeat_nmi.
	 */
	pushq $-1		/* ORIG_RAX: no syscall to restart */
	ALLOC_PT_GPREGS_ON_STACK
@@ -1415,28 +1516,11 @@ end_repeat_nmi:
	 */
	call paranoid_entry

-	/*
-	 * Save off the CR2 register. If we take a page fault in the NMI then
-	 * it could corrupt the CR2 value. If the NMI preempts a page fault
-	 * handler before it was able to read the CR2 register, and then the
-	 * NMI itself takes a page fault, the page fault that was preempted
-	 * will read the information from the NMI page fault and not the
-	 * origin fault. Save it off and restore it if it changes.
-	 * Use the r12 callee-saved register.
-	 */
-	movq %cr2, %r12
-
	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq %rsp, %rdi
	movq $-1, %rsi
	call do_nmi

-	/* Did the NMI take a page fault? Restore cr2 if it did */
-	movq %cr2, %rcx
-	cmpq %rcx, %r12
-	je 1f
-	movq %r12, %cr2
-1:
	testl %ebx, %ebx	/* swapgs needed? */
	jnz nmi_restore
nmi_swapgs:
@@ -1444,11 +1528,26 @@ nmi_swapgs:
nmi_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
-	/* Pop the extra iret frame at once */
+
+	/* Point RSP at the "iret" frame. */
	REMOVE_PT_GPREGS_FROM_STACK 6*8

-	/* Clear the NMI executing stack variable */
-	movq $0, 5*8(%rsp)
+	/*
+	 * Clear "NMI executing".  Set DF first so that we can easily
+	 * distinguish the remaining code between here and IRET from
+	 * the SYSCALL entry and exit paths.  On a native kernel, we
+	 * could just inspect RIP, but, on paravirt kernels,
+	 * INTERRUPT_RETURN can translate into a jump into a
+	 * hypercall page.
+	 */
+	std
+	movq $0, 5*8(%rsp)		/* clear "NMI executing" */
+
+	/*
+	 * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+	 * stack in a single instruction.  We are returning to kernel
+	 * mode, so this cannot result in a fault.
+	 */
	INTERRUPT_RETURN
END(nmi)

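For readers tracing the n*8(%rsp) offsets used above, the stack area described by the new comment block can be pictured as a C struct laid out from lower to higher addresses. This is an illustrative sketch only: no such struct exists in the kernel, the names are invented here, and the asm manipulates these slots purely through raw offsets.

    /* Illustrative sketch, not kernel code. */
    struct hw_frame {			/* hardware iret frame, RIP at the lowest address */
    	unsigned long rip, cs, rflags, rsp, ss;
    };

    struct nmi_stack_area {
    	/* struct pt_regs sits below this area */
    	struct hw_frame outermost;	/* written once in first_nmi */
    	struct hw_frame iret;		/* re-copied from 'outermost' each pass; a nested
    					 * NMI points its RIP at repeat_nmi to force a rerun */
    	unsigned long nmi_executing;	/* the "NMI executing" variable */
    	unsigned long saved_rdx;	/* temp storage for rdx */
    	struct hw_frame original;	/* frame pushed by the CPU on NMI delivery */
    };

For example, the movq $0, 5*8(%rsp) in nmi_restore clears the nmi_executing slot as seen with RSP pointing at iret.rip.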
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 0637826292de..c49c5173158e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -189,6 +189,7 @@ union fpregs_state {
	struct fxregs_state		fxsave;
	struct swregs_state		soft;
	struct xregs_state		xsave;
+	u8 __padding[PAGE_SIZE];
};

/*
@@ -198,40 +199,6 @@ union fpregs_state {
 */
struct fpu {
	/*
-	 * @state:
-	 *
-	 * In-memory copy of all FPU registers that we save/restore
-	 * over context switches. If the task is using the FPU then
-	 * the registers in the FPU are more recent than this state
-	 * copy. If the task context-switches away then they get
-	 * saved here and represent the FPU state.
-	 *
-	 * After context switches there may be a (short) time period
-	 * during which the in-FPU hardware registers are unchanged
-	 * and still perfectly match this state, if the tasks
-	 * scheduled afterwards are not using the FPU.
-	 *
-	 * This is the 'lazy restore' window of optimization, which
-	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
-	 *
-	 * We detect whether a subsequent task uses the FPU via setting
-	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
-	 *
-	 * During this window, if the task gets scheduled again, we
-	 * might be able to skip having to do a restore from this
-	 * memory buffer to the hardware registers - at the cost of
-	 * incurring the overhead of #NM fault traps.
-	 *
-	 * Note that on modern CPUs that support the XSAVEOPT (or other
-	 * optimized XSAVE instructions), we don't use #NM traps anymore,
-	 * as the hardware can track whether FPU registers need saving
-	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
-	 * logic, which unconditionally saves/restores all FPU state
-	 * across context switches. (if FPU state exists.)
-	 */
-	union fpregs_state		state;
-
-	/*
	 * @last_cpu:
	 *
	 * Records the last CPU on which this context was loaded into
@@ -288,6 +255,43 @@ struct fpu {
	 * deal with bursty apps that only use the FPU for a short time:
	 */
	unsigned char			counter;
+	/*
+	 * @state:
+	 *
+	 * In-memory copy of all FPU registers that we save/restore
+	 * over context switches. If the task is using the FPU then
+	 * the registers in the FPU are more recent than this state
+	 * copy. If the task context-switches away then they get
+	 * saved here and represent the FPU state.
+	 *
+	 * After context switches there may be a (short) time period
+	 * during which the in-FPU hardware registers are unchanged
+	 * and still perfectly match this state, if the tasks
+	 * scheduled afterwards are not using the FPU.
+	 *
+	 * This is the 'lazy restore' window of optimization, which
+	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+	 *
+	 * We detect whether a subsequent task uses the FPU via setting
+	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+	 *
+	 * During this window, if the task gets scheduled again, we
+	 * might be able to skip having to do a restore from this
+	 * memory buffer to the hardware registers - at the cost of
+	 * incurring the overhead of #NM fault traps.
+	 *
+	 * Note that on modern CPUs that support the XSAVEOPT (or other
+	 * optimized XSAVE instructions), we don't use #NM traps anymore,
+	 * as the hardware can track whether FPU registers need saving
+	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+	 * logic, which unconditionally saves/restores all FPU state
+	 * across context switches. (if FPU state exists.)
+	 */
+	union fpregs_state		state;
+	/*
+	 * WARNING: 'state' is dynamically-sized.  Do not put
+	 * anything after it here.
+	 */
};

#endif /* _ASM_X86_FPU_H */
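The WARNING comment is the crux of this hunk: once the architecture sizes task_struct at runtime, only part of the newly padded union is actually backed by the task allocation. A rough user-space sketch of the rule, with invented stand-in types (xstate_size here models the runtime-detected XSAVE area size):

    #include <stddef.h>

    /* Stand-ins for the kernel types; sizes are illustrative only. */
    union fpregs_state_like { unsigned char __padding[4096]; };

    struct fpu_like {
    	unsigned int last_cpu;
    	unsigned char fpregs_active;
    	unsigned char counter;
    	union fpregs_state_like state;	/* must stay last */
    };

    /* The allocation really ends here, which may be well short of
     * sizeof(struct fpu_like); a member placed after 'state' could land
     * beyond the allocated bytes or inside a larger XSAVE area. */
    static size_t fpu_alloc_size(size_t xstate_size)
    {
    	return offsetof(struct fpu_like, state) + xstate_size;
    }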
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519df0d5..944f1785ed0d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -390,9 +390,6 @@ struct thread_struct {
#endif
	unsigned long		gs;

-	/* Floating point and extended processor state */
-	struct fpu		fpu;
-
	/* Save middle states of ptrace breakpoints */
	struct perf_event	*ptrace_bps[HBP_NUM];
	/* Debug status used for traps, single steps, etc... */
@@ -418,6 +415,13 @@ struct thread_struct {
	unsigned long		iopl;
	/* Max allowed port in the bitmap, in bytes: */
	unsigned		io_bitmap_max;
+
+	/* Floating point and extended processor state */
+	struct fpu		fpu;
+	/*
+	 * WARNING: 'fpu' is dynamically-sized.  It *MUST* be at
+	 * the end.
+	 */
};

/*
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 32826791e675..0b39173dd971 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -4,6 +4,8 @@
#include <asm/fpu/internal.h>
#include <asm/tlbflush.h>

+#include <linux/sched.h>
+
/*
 * Initialize the TS bit in CR0 according to the style of context-switches
 * we are using:
@@ -136,6 +138,43 @@ static void __init fpu__init_system_generic(void)
unsigned int xstate_size;
EXPORT_SYMBOL_GPL(xstate_size);

+/* Enforce that 'MEMBER' is the last field of 'TYPE': */
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
+	BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
+
+/*
+ * We append the 'struct fpu' to the task_struct:
+ */
+static void __init fpu__init_task_struct_size(void)
+{
+	int task_size = sizeof(struct task_struct);
+
+	/*
+	 * Subtract off the static size of the register state.
+	 * It potentially has a bunch of padding.
+	 */
+	task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+
+	/*
+	 * Add back the dynamically-calculated register state
+	 * size.
+	 */
+	task_size += xstate_size;
+
+	/*
+	 * We dynamically size 'struct fpu', so we require that
+	 * it be at the end of 'thread_struct' and that
+	 * 'thread_struct' be at the end of 'task_struct'.  If
+	 * you hit a compile error here, check the structure to
+	 * see if something got added to the end.
+	 */
+	CHECK_MEMBER_AT_END_OF(struct fpu, state);
+	CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
+	CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
+
+	arch_task_struct_size = task_size;
+}
+
/*
 * Set up the xstate_size based on the legacy FPU context size.
 *
@@ -287,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
	fpu__init_system_generic();
	fpu__init_system_xstate_size_legacy();
	fpu__init_system_xstate();
+	fpu__init_task_struct_size();

	fpu__init_system_ctx_switch();
}
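CHECK_MEMBER_AT_END_OF() leans on offsetofend(), which the kernel headers of this era already provide; it expands to roughly offsetof() plus the member's size. A user-space demonstration of the same check follows; the names here are invented, and the equality test is valid because the kernel structs involved have no trailing padding after the checked member:

    #include <assert.h>
    #include <stddef.h>

    #define offsetofend_demo(TYPE, MEMBER) \
    	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

    struct demo { int a; char tail[16]; };	/* 'tail' is the last member */

    int main(void)
    {
    	/* Mirrors BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER)). */
    	assert(sizeof(struct demo) == offsetofend_demo(struct demo, tail));
    	return 0;
    }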
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index c3e985d1751c..d05bd2e2ee91 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_regs *regs)
NOKPROBE_SYMBOL(default_do_nmi);

/*
- * NMIs can hit breakpoints which will cause it to lose its
- * NMI context with the CPU when the breakpoint does an iret.
- */
-#ifdef CONFIG_X86_32
-/*
- * For i386, NMIs use the same stack as the kernel, and we can
- * add a workaround to the iret problem in C (preventing nested
- * NMIs if an NMI takes a trap). Simply have 3 states the NMI
- * can be in:
+ * NMIs can page fault or hit breakpoints which will cause it to lose
+ * its NMI context with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
+ * NMI processing.  On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
 *
 *  1) not running
 *  2) executing
@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi);
 * (Note, the latch is binary, thus multiple NMIs triggering,
 *  when one is running, are ignored. Only one NMI is restarted.)
 *
- * If an NMI hits a breakpoint that executes an iret, another
- * NMI can preempt it. We do not want to allow this new NMI
- * to run, but we want to execute it when the first one finishes.
- * We set the state to "latched", and the exit of the first NMI will
- * perform a dec_return, if the result is zero (NOT_RUNNING), then
- * it will simply exit the NMI handler. If not, the dec_return
- * would have set the state to NMI_EXECUTING (what we want it to
- * be when we are running). In this case, we simply jump back
- * to rerun the NMI handler again, and restart the 'latched' NMI.
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes. We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
 *
 * No trap (breakpoint or page fault) should be hit before nmi_restart,
 * thus there is no race between the first check of state for NOT_RUNNING
@@ -461,49 +460,36 @@ enum nmi_states {
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);

-#define nmi_nesting_preprocess(regs) \
-	do { \
-		if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
-			this_cpu_write(nmi_state, NMI_LATCHED); \
-			return; \
-		} \
-		this_cpu_write(nmi_state, NMI_EXECUTING); \
-		this_cpu_write(nmi_cr2, read_cr2()); \
-	} while (0); \
-	nmi_restart:
-
-#define nmi_nesting_postprocess() \
-	do { \
-		if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
-			write_cr2(this_cpu_read(nmi_cr2)); \
-		if (this_cpu_dec_return(nmi_state)) \
-			goto nmi_restart; \
-	} while (0)
-#else /* x86_64 */
+#ifdef CONFIG_X86_64
/*
- * In x86_64 things are a bit more difficult. This has the same problem
- * where an NMI hitting a breakpoint that calls iret will remove the
- * NMI context, allowing a nested NMI to enter. What makes this more
- * difficult is that both NMIs and breakpoints have their own stack.
- * When a new NMI or breakpoint is executed, the stack is set to a fixed
- * point. If an NMI is nested, it will have its stack set at that same
- * fixed address that the first NMI had, and will start corrupting the
- * stack. This is handled in entry_64.S, but the same problem exists with
- * the breakpoint stack.
+ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint.  Without
+ * some care, the inner breakpoint will clobber the outer breakpoint's
+ * stack.
 *
- * If a breakpoint is being processed, and the debug stack is being used,
- * if an NMI comes in and also hits a breakpoint, the stack pointer
- * will be set to the same fixed address as the breakpoint that was
- * interrupted, causing that stack to be corrupted. To handle this case,
- * check if the stack that was interrupted is the debug stack, and if
- * so, change the IDT so that new breakpoints will use the current stack
- * and not switch to the fixed address. On return of the NMI, switch back
- * to the original IDT.
+ * If a breakpoint is being processed, and the debug stack is being
+ * used, if an NMI comes in and also hits a breakpoint, the stack
+ * pointer will be set to the same fixed address as the breakpoint that
+ * was interrupted, causing that stack to be corrupted. To handle this
+ * case, check if the stack that was interrupted is the debug stack, and
+ * if so, change the IDT so that new breakpoints will use the current
+ * stack and not switch to the fixed address. On return of the NMI,
+ * switch back to the original IDT.
 */
static DEFINE_PER_CPU(int, update_debug_stack);
+#endif

-static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+dotraplinkage notrace void
+do_nmi(struct pt_regs *regs, long error_code)
{
+	if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+		this_cpu_write(nmi_state, NMI_LATCHED);
+		return;
+	}
+	this_cpu_write(nmi_state, NMI_EXECUTING);
+	this_cpu_write(nmi_cr2, read_cr2());
+nmi_restart:
+
+#ifdef CONFIG_X86_64
	/*
	 * If we interrupted a breakpoint, it is possible that
	 * the nmi handler will have breakpoints too. We need to
@@ -514,22 +500,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
		debug_stack_set_zero();
		this_cpu_write(update_debug_stack, 1);
	}
-}
-
-static inline void nmi_nesting_postprocess(void)
-{
-	if (unlikely(this_cpu_read(update_debug_stack))) {
-		debug_stack_reset();
-		this_cpu_write(update_debug_stack, 0);
-	}
-}
#endif

-dotraplinkage notrace void
-do_nmi(struct pt_regs *regs, long error_code)
-{
-	nmi_nesting_preprocess(regs);
-
	nmi_enter();

	inc_irq_stat(__nmi_count);
@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_code)

	nmi_exit();

-	/* On i386, may loop back to preprocess */
-	nmi_nesting_postprocess();
+#ifdef CONFIG_X86_64
+	if (unlikely(this_cpu_read(update_debug_stack))) {
+		debug_stack_reset();
+		this_cpu_write(update_debug_stack, 0);
+	}
+#endif
+
+	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+		write_cr2(this_cpu_read(nmi_cr2));
+	if (this_cpu_dec_return(nmi_state))
+		goto nmi_restart;
}
NOKPROBE_SYMBOL(do_nmi);

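The latch logic that moves into do_nmi() relies on the numeric values of the state enum declared a few lines above this hunk, which the patch leaves untouched; it looks roughly like:

    enum nmi_states {
    	NMI_NOT_RUNNING = 0,
    	NMI_EXECUTING,
    	NMI_LATCHED,
    };

Because NMI_EXECUTING is 1 and NMI_LATCHED is 2, this_cpu_dec_return(nmi_state) yields 0 when no further NMI was latched (so do_nmi() simply returns) and 1, i.e. NMI_EXECUTING, when one was, which is exactly the state wanted while jumping back to nmi_restart.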
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9cad694ed7c4..397688beed4b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
-	*dst = *src;
+	memcpy(dst, src, arch_task_struct_size);

	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}
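The one-line switch to memcpy() matters because structure assignment copies only sizeof(struct task_struct) bytes and would silently drop the XSAVE area that now lives past the end of the struct. A user-space analogy of the same trap, with invented types:

    #include <stdlib.h>
    #include <string.h>

    struct obj {
    	int hdr;
    	char tail[];		/* dynamically sized, like the FPU state */
    };

    static struct obj *dup_obj(const struct obj *src, size_t full_size)
    {
    	struct obj *dst = malloc(full_size);

    	if (dst)
    		memcpy(dst, src, full_size);	/* '*dst = *src' would copy
    						 * only sizeof(struct obj) */
    	return dst;
    }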
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 91a4e6426321..92e6726f6e37 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
			roundup(sizeof(CORE_STR), 4)) +
			roundup(sizeof(struct elf_prstatus), 4) +
			roundup(sizeof(struct elf_prpsinfo), 4) +
-			roundup(sizeof(struct task_struct), 4);
+			roundup(arch_task_struct_size, 4);
	*elf_buflen = PAGE_ALIGN(*elf_buflen);
	return size + *elf_buflen;
}
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
	/* set up the task structure */
	notes[2].name	= CORE_STR;
	notes[2].type	= NT_TASKSTRUCT;
-	notes[2].datasz	= sizeof(struct task_struct);
+	notes[2].datasz	= arch_task_struct_size;
	notes[2].data	= current;

	nhdr->p_filesz	+= notesize(&notes[2]);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ae21f1591615..04b5ada460b4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1522,8 +1522,6 @@ struct task_struct {
/* hung task detection */
	unsigned long last_switch_count;
#endif
-/* CPU-specific state of this task */
-	struct thread_struct thread;
/* filesystem information */
	struct fs_struct *fs;
/* open file information */
@@ -1778,8 +1776,22 @@ struct task_struct {
	unsigned long task_state_change;
#endif
	int pagefault_disabled;
+/* CPU-specific state of this task */
+	struct thread_struct thread;
+	/*
+	 * WARNING: on x86, 'thread_struct' contains a variable-sized
+	 * structure.  It *MUST* be at the end of 'task_struct'.
+	 *
+	 * Do not put anything below here!
+	 */
};

+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+extern int arch_task_struct_size __read_mostly;
+#else
+# define arch_task_struct_size (sizeof(struct task_struct))
+#endif
+
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)

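The #else fallback is what lets generic code such as fork.c and kcore.c use arch_task_struct_size unconditionally: architectures that do not select the option get a plain compile-time constant. A sketch of the pattern with hypothetical names (CONFIG_ARCH_WANTS_DYNAMIC_WIDGET and 'widget' stand in for the real option and task_struct):

    #include <stdio.h>

    struct widget { int x; };

    #ifdef CONFIG_ARCH_WANTS_DYNAMIC_WIDGET
    extern int arch_widget_size;		/* set by the architecture during early boot */
    #else
    # define arch_widget_size ((int)sizeof(struct widget))
    #endif

    int main(void)
    {
    	/* Callers never need an #ifdef of their own. */
    	printf("allocating %d bytes per widget\n", arch_widget_size);
    	return 0;
    }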
diff --git a/kernel/fork.c b/kernel/fork.c
index 1bfefc6f96a4..dbd9b8d7b7cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -287,6 +287,11 @@ static void set_max_threads(unsigned int max_threads_suggested)
	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}

+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+/* Initialized by the architecture: */
+int arch_task_struct_size __read_mostly;
+#endif
+
void __init fork_init(void)
{
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
@@ -295,7 +300,7 @@ void __init fork_init(void)
#endif
	/* create a slab on which task_structs can be allocated */
	task_struct_cachep =
-		kmem_cache_create("task_struct", sizeof(struct task_struct),
+		kmem_cache_create("task_struct", arch_task_struct_size,
			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
#endif

