author	Borislav Petkov <bp@suse.de>	2018-03-14 14:36:15 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2018-03-16 15:55:51 -0400
commit	bb8c13d61a629276a162c1d2b1a20a815cbcfbb7
tree	a222394a2e4e3b3b8346df9c1713d0fb17e56bf3
parent	2613f36ed965d0e5a595a1d931fd3b480e82d6fd
x86/microcode: Fix CPU synchronization routine
Emanuel reported an issue with a hang during microcode update because my
dumb idea to use one atomic synchronization variable for both rendezvous
- before and after update - was simply bollocks:

  microcode: microcode_reload_late: late_cpus: 4
  microcode: __reload_late: cpu 2 entered
  microcode: __reload_late: cpu 1 entered
  microcode: __reload_late: cpu 3 entered
  microcode: __reload_late: cpu 0 entered
  microcode: __reload_late: cpu 1 left
  microcode: Timeout while waiting for CPUs rendezvous, remaining: 1

CPU1 above would finish, leave and the others will still spin waiting for
it to join.

So do two synchronization atomics instead, which makes the code a lot
more straightforward.

Also, since the update is serialized and it also takes quite some time per
microcode engine, increase the exit timeout by the number of CPUs on the
system.

That's ok because the moment all CPUs are done, that timeout will be cut
short.

Furthermore, panic when some of the CPUs timeout when returning from a
microcode update: we can't allow a system with not all cores updated.

Also, as an optimization, do not do the exit sync if microcode wasn't
updated.

Reported-by: Emanuel Czirai <xftroxgpx@protonmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Emanuel Czirai <xftroxgpx@protonmail.com>
Tested-by: Ashok Raj <ashok.raj@intel.com>
Tested-by: Tom Lendacky <thomas.lendacky@amd.com>
Link: https://lkml.kernel.org/r/20180314183615.17629-2-bp@alien8.de
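To see the shape of the fix outside the kernel, here is a minimal userspace
sketch of the two-counter rendezvous, using C11 atomics and pthreads in place
of the kernel's atomic_t, stop_machine() and per-CPU context; wait_for_all(),
worker() and NTHREADS are illustrative names only, not kernel code. With the
old single-counter scheme, a CPU that finished updating incremented the shared
counter back up while stragglers could still be testing the entry condition,
which matches the hang in the log above; giving each phase its own counter
that only ever grows toward the CPU count removes that interference.

/* Minimal userspace sketch of the two-counter rendezvous (not kernel code).
 * One counter gates entry into the update, a second gates the exit,
 * mirroring late_cpus_in/late_cpus_out in the patch below.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NTHREADS 4	/* stand-in for num_online_cpus() */

static atomic_int cpus_in;	/* threads that reached the entry rendezvous */
static atomic_int cpus_out;	/* threads that finished the update */

/* Analogue of __wait_for_cpus(): join, then spin until everyone has joined.
 * The kernel version also enforces a timeout; omitted here for brevity. */
static void wait_for_all(atomic_int *t)
{
	atomic_fetch_add(t, 1);
	while (atomic_load(t) < NTHREADS)
		;	/* spin; the kernel uses ndelay() + touch_nmi_watchdog() */
}

static void *worker(void *arg)
{
	long id = (long)arg;

	wait_for_all(&cpus_in);			/* entry rendezvous */
	printf("thread %ld: updating\n", id);	/* stand-in for the update */
	wait_for_all(&cpus_out);		/* exit rendezvous */
	return NULL;
}

int main(void)
{
	pthread_t th[NTHREADS];

	for (long i = 0; i < NTHREADS; i++)
		pthread_create(&th[i], NULL, worker, (void *)i);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(th[i], NULL);
	puts("all threads done");
	return 0;
}

Build with something like cc -std=c11 -pthread rendezvous.c. Since each
counter only ever moves toward NTHREADS, a fast thread arriving at the exit
rendezvous can no longer perturb threads still waiting at the entry.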
-rw-r--r--	arch/x86/kernel/cpu/microcode/core.c	68
1 file changed, 41 insertions(+), 27 deletions(-)
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 9f0fe5bb450d..10c4fc2c91f8 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -517,7 +517,29 @@ static int check_online_cpus(void)
 	return -EINVAL;
 }
 
-static atomic_t late_cpus;
+static atomic_t late_cpus_in;
+static atomic_t late_cpus_out;
+
+static int __wait_for_cpus(atomic_t *t, long long timeout)
+{
+	int all_cpus = num_online_cpus();
+
+	atomic_inc(t);
+
+	while (atomic_read(t) < all_cpus) {
+		if (timeout < SPINUNIT) {
+			pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
+				all_cpus - atomic_read(t));
+			return 1;
+		}
+
+		ndelay(SPINUNIT);
+		timeout -= SPINUNIT;
+
+		touch_nmi_watchdog();
+	}
+	return 0;
+}
 
 /*
  * Returns:
@@ -527,30 +549,16 @@ static atomic_t late_cpus;
  */
 static int __reload_late(void *info)
 {
-	unsigned int timeout = NSEC_PER_SEC;
-	int all_cpus = num_online_cpus();
 	int cpu = smp_processor_id();
 	enum ucode_state err;
 	int ret = 0;
 
-	atomic_dec(&late_cpus);
-
 	/*
 	 * Wait for all CPUs to arrive. A load will not be attempted unless all
 	 * CPUs show up.
 	 * */
-	while (atomic_read(&late_cpus)) {
-		if (timeout < SPINUNIT) {
-			pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
-				atomic_read(&late_cpus));
-			return -1;
-		}
-
-		ndelay(SPINUNIT);
-		timeout -= SPINUNIT;
-
-		touch_nmi_watchdog();
-	}
+	if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
+		return -1;
 
 	spin_lock(&update_lock);
 	apply_microcode_local(&err);
@@ -558,15 +566,22 @@ static int __reload_late(void *info)
 
 	if (err > UCODE_NFOUND) {
 		pr_warn("Error reloading microcode on CPU %d\n", cpu);
-		ret = -1;
-	} else if (err == UCODE_UPDATED) {
+		return -1;
+	/* siblings return UCODE_OK because their engine got updated already */
+	} else if (err == UCODE_UPDATED || err == UCODE_OK) {
 		ret = 1;
+	} else {
+		return ret;
 	}
 
-	atomic_inc(&late_cpus);
-
-	while (atomic_read(&late_cpus) != all_cpus)
-		cpu_relax();
+	/*
+	 * Increase the wait timeout to a safe value here since we're
+	 * serializing the microcode update and that could take a while on a
+	 * large number of CPUs. And that is fine as the *actual* timeout will
+	 * be determined by the last CPU finished updating and thus cut short.
+	 */
+	if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
+		panic("Timeout during microcode update!\n");
 
 	return ret;
 }
@@ -579,12 +594,11 @@ static int microcode_reload_late(void)
 {
 	int ret;
 
-	atomic_set(&late_cpus, num_online_cpus());
+	atomic_set(&late_cpus_in,  0);
+	atomic_set(&late_cpus_out, 0);
 
 	ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
-	if (ret < 0)
-		return ret;
-	else if (ret > 0)
+	if (ret > 0)
 		microcode_check();
 
 	return ret;
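For the timeout side of the patch: a timeout-bounded variant of wait_for_all()
from the sketch above, loosely modelled on __wait_for_cpus(), might look as
follows. The SPINUNIT value is an assumption for the sketch, and nanosleep()
stands in for the kernel's ndelay()/touch_nmi_watchdog() loop; note the kernel
panics when the exit rendezvous times out, whereas this sketch only reports it.

/* Timeout-bounded rendezvous, loosely modelled on __wait_for_cpus() above.
 * Returns 0 once all threads arrived, 1 on timeout -- the same convention
 * as the kernel helper. Drop-in replacement for wait_for_all() in the
 * earlier sketch (reuses its NTHREADS and atomic_int counters). */
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

#define SPINUNIT 100LL	/* ns per spin step; assumed value for this sketch */

static int wait_for_all_timeout(atomic_int *t, long long timeout_ns)
{
	atomic_fetch_add(t, 1);

	while (atomic_load(t) < NTHREADS) {
		if (timeout_ns < SPINUNIT) {
			fprintf(stderr, "timeout, remaining: %d\n",
				NTHREADS - atomic_load(t));
			return 1;
		}
		nanosleep(&(struct timespec){ .tv_nsec = SPINUNIT }, NULL);
		timeout_ns -= SPINUNIT;
	}
	return 0;
}

Budgets of one second for the entry wait and one second per thread for the
exit wait mirror the patch's NSEC_PER_SEC and NSEC_PER_SEC * num_online_cpus():
the updates are serialized under update_lock, so the exit budget must grow
with the CPU count, and the wait is cut short the moment the last CPU finishes.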