author	Borislav Petkov <bp@suse.de>	2018-03-14 14:36:15 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2018-03-16 15:55:51 -0400
commit	bb8c13d61a629276a162c1d2b1a20a815cbcfbb7
tree	a222394a2e4e3b3b8346df9c1713d0fb17e56bf3
parent	2613f36ed965d0e5a595a1d931fd3b480e82d6fd
x86/microcode: Fix CPU synchronization routine
Emanuel reported an issue with a hang during microcode update because my
dumb idea to use one atomic synchronization variable for both rendezvous
- before and after update - was simply bollocks:

  microcode: microcode_reload_late: late_cpus: 4
  microcode: __reload_late: cpu 2 entered
  microcode: __reload_late: cpu 1 entered
  microcode: __reload_late: cpu 3 entered
  microcode: __reload_late: cpu 0 entered
  microcode: __reload_late: cpu 1 left
  microcode: Timeout while waiting for CPUs rendezvous, remaining: 1

CPU1 above would finish, leave and the others will still spin waiting for
it to join.

So do two synchronization atomics instead, which makes the code a lot
more straightforward.

Also, since the update is serialized and it also takes quite some time per
microcode engine, increase the exit timeout by the number of CPUs on the
system.

That's ok because the moment all CPUs are done, that timeout will be cut
short.

Furthermore, panic when some of the CPUs timeout when returning from a
microcode update: we can't allow a system with not all cores updated.

Also, as an optimization, do not do the exit sync if microcode wasn't
updated.

Reported-by: Emanuel Czirai <xftroxgpx@protonmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Emanuel Czirai <xftroxgpx@protonmail.com>
Tested-by: Ashok Raj <ashok.raj@intel.com>
Tested-by: Tom Lendacky <thomas.lendacky@amd.com>
Link: https://lkml.kernel.org/r/20180314183615.17629-2-bp@alien8.de
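To see the shape of the fix outside the kernel, here is a minimal userspace
sketch of the two-counter rendezvous, using C11 atomics and pthreads in place
of the kernel's atomic_t, stop_machine() and per-CPU context; wait_for_all(),
worker() and NTHREADS are illustrative names only, not kernel code. With the
old single-counter scheme, a CPU that finished updating incremented the shared
counter back up while stragglers could still be testing the entry condition,
which matches the hang in the log above; giving each phase its own counter
that only ever grows toward the CPU count removes that interference.

/* Minimal userspace sketch of the two-counter rendezvous (not kernel code).
 * One counter gates entry into the update, a second gates the exit,
 * mirroring late_cpus_in/late_cpus_out in the patch below.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NTHREADS 4	/* stand-in for num_online_cpus() */

static atomic_int cpus_in;	/* threads that reached the entry rendezvous */
static atomic_int cpus_out;	/* threads that finished the update */

/* Analogue of __wait_for_cpus(): join, then spin until everyone has joined.
 * The kernel version also enforces a timeout; omitted here for brevity. */
static void wait_for_all(atomic_int *t)
{
	atomic_fetch_add(t, 1);
	while (atomic_load(t) < NTHREADS)
		;	/* spin; the kernel uses ndelay() + touch_nmi_watchdog() */
}

static void *worker(void *arg)
{
	long id = (long)arg;

	wait_for_all(&cpus_in);			/* entry rendezvous */
	printf("thread %ld: updating\n", id);	/* stand-in for the update */
	wait_for_all(&cpus_out);		/* exit rendezvous */
	return NULL;
}

int main(void)
{
	pthread_t th[NTHREADS];

	for (long i = 0; i < NTHREADS; i++)
		pthread_create(&th[i], NULL, worker, (void *)i);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(th[i], NULL);
	puts("all threads done");
	return 0;
}

Build with something like cc -std=c11 -pthread rendezvous.c. Since each
counter only ever moves toward NTHREADS, a fast thread arriving at the exit
rendezvous can no longer perturb threads still waiting at the entry.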
-rw-r--r--	arch/x86/kernel/cpu/microcode/core.c	68
1 file changed, 41 insertions(+), 27 deletions(-)
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 9f0fe5bb450d..10c4fc2c91f8 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -517,7 +517,29 @@ static int check_online_cpus(void)
 	return -EINVAL;
 }
 
-static atomic_t late_cpus;
+static atomic_t late_cpus_in;
+static atomic_t late_cpus_out;
+
+static int __wait_for_cpus(atomic_t *t, long long timeout)
+{
+	int all_cpus = num_online_cpus();
+
+	atomic_inc(t);
+
+	while (atomic_read(t) < all_cpus) {
+		if (timeout < SPINUNIT) {
+			pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
+				all_cpus - atomic_read(t));
+			return 1;
+		}
+
+		ndelay(SPINUNIT);
+		timeout -= SPINUNIT;
+
+		touch_nmi_watchdog();
+	}
+	return 0;
+}
 
 /*
  * Returns:
@@ -527,30 +549,16 @@ static atomic_t late_cpus;
  */
 static int __reload_late(void *info)
 {
-	unsigned int timeout = NSEC_PER_SEC;
-	int all_cpus = num_online_cpus();
 	int cpu = smp_processor_id();
 	enum ucode_state err;
 	int ret = 0;
 
-	atomic_dec(&late_cpus);
-
 	/*
 	 * Wait for all CPUs to arrive. A load will not be attempted unless all
 	 * CPUs show up.
 	 * */
-	while (atomic_read(&late_cpus)) {
-		if (timeout < SPINUNIT) {
-			pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
-				atomic_read(&late_cpus));
-			return -1;
-		}
-
-		ndelay(SPINUNIT);
-		timeout -= SPINUNIT;
-
-		touch_nmi_watchdog();
-	}
+	if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
+		return -1;
 
 	spin_lock(&update_lock);
 	apply_microcode_local(&err);
@@ -558,15 +566,22 @@ static int __reload_late(void *info)
 
 	if (err > UCODE_NFOUND) {
 		pr_warn("Error reloading microcode on CPU %d\n", cpu);
-		ret = -1;
-	} else if (err == UCODE_UPDATED) {
+		return -1;
+	/* siblings return UCODE_OK because their engine got updated already */
+	} else if (err == UCODE_UPDATED || err == UCODE_OK) {
 		ret = 1;
+	} else {
+		return ret;
 	}
 
-	atomic_inc(&late_cpus);
-
-	while (atomic_read(&late_cpus) != all_cpus)
-		cpu_relax();
+	/*
+	 * Increase the wait timeout to a safe value here since we're
+	 * serializing the microcode update and that could take a while on a
+	 * large number of CPUs. And that is fine as the *actual* timeout will
+	 * be determined by the last CPU finished updating and thus cut short.
+	 */
+	if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
+		panic("Timeout during microcode update!\n");
 
 	return ret;
 }
@@ -579,12 +594,11 @@ static int microcode_reload_late(void)
 {
 	int ret;
 
-	atomic_set(&late_cpus, num_online_cpus());
+	atomic_set(&late_cpus_in,  0);
+	atomic_set(&late_cpus_out, 0);
 
 	ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
-	if (ret < 0)
-		return ret;
-	else if (ret > 0)
+	if (ret > 0)
 		microcode_check();
 
 	return ret;
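For the timeout side of the patch: a timeout-bounded variant of wait_for_all()
from the sketch above, loosely modelled on __wait_for_cpus(), might look as
follows. The SPINUNIT value is an assumption for the sketch, and nanosleep()
stands in for the kernel's ndelay()/touch_nmi_watchdog() loop; note the kernel
panics when the exit rendezvous times out, whereas this sketch only reports it.

/* Timeout-bounded rendezvous, loosely modelled on __wait_for_cpus() above.
 * Returns 0 once all threads arrived, 1 on timeout -- the same convention
 * as the kernel helper. Drop-in replacement for wait_for_all() in the
 * earlier sketch (reuses its NTHREADS and atomic_int counters). */
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

#define SPINUNIT 100LL	/* ns per spin step; assumed value for this sketch */

static int wait_for_all_timeout(atomic_int *t, long long timeout_ns)
{
	atomic_fetch_add(t, 1);

	while (atomic_load(t) < NTHREADS) {
		if (timeout_ns < SPINUNIT) {
			fprintf(stderr, "timeout, remaining: %d\n",
				NTHREADS - atomic_load(t));
			return 1;
		}
		nanosleep(&(struct timespec){ .tv_nsec = SPINUNIT }, NULL);
		timeout_ns -= SPINUNIT;
	}
	return 0;
}

Budgets of one second for the entry wait and one second per thread for the
exit wait mirror the patch's NSEC_PER_SEC and NSEC_PER_SEC * num_online_cpus():
the updates are serialized under update_lock, so the exit budget must grow
with the CPU count, and the wait is cut short the moment the last CPU finishes.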