path: root/kernel/smpboot.c
author	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2015-02-25 13:34:39 -0500
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2015-03-11 16:20:25 -0400
commit	8038dad7e888581266c76df15d70ca457a3c5910 (patch)
tree	a921a15c300418540c71a410a1caf558d6ba8a80 /kernel/smpboot.c
parent	c517d838eb7d07bbe9507871fab3931deccff539 (diff)
smpboot: Add common code for notification from dying CPU
RCU ignores offlined CPUs, so they cannot safely run RCU read-side code. (They -can- use SRCU, but not RCU.) This means that any use of RCU during or after the call to arch_cpu_idle_dead() is unsafe. Unfortunately, commit 2ed53c0d6cc99 added a complete() call, which will contain RCU read-side critical sections if there is a task waiting to be awakened. Which, as it turns out, there almost never is. In my qemu/KVM testing, the to-be-awakened task is not yet asleep more than 99.5% of the time. In current mainline, failure is even harder to reproduce, requiring a virtualized environment that delays the outgoing CPU by at least three jiffies between the time it exits its stop_machine() task at CPU_DYING time and the time it calls arch_cpu_idle_dead() from the idle loop. However, this problem really can occur, especially in virtualized environments, and therefore really does need to be fixed.

This suggests moving back to the polling loop, but using a much shorter wait, with gentle exponential backoff instead of the old 100-millisecond wait. Most of the time, the loop will exit without waiting at all, and almost all of the remaining uses will wait only five microseconds. If the outgoing CPU is preempted, the loop will wait one jiffy, then increase the wait by a factor of 11/10ths, rounding up. As before, there is a five-second timeout.

This commit therefore provides common-code infrastructure to do the dying-to-surviving CPU handoff in a safe manner. This code also provides an indication at CPU-online time of whether the CPU to be onlined previously timed out on offline. The new cpu_check_up_prepare() function returns -EBUSY if this CPU previously took more than five seconds to go offline, or -EAGAIN if it has not yet managed to go offline. The rationale for -EAGAIN is that it might still be preempted, so an additional wait might well find it correctly offlined. Architecture-specific code can decide how to handle these conditions. Systems in which CPUs take themselves completely offline might respond to an -EBUSY return as if it were a zero (success) return. Systems in which the surviving CPU must take some action might take it at this time, or might simply mark the other CPU as unusable.

Note that architectures that take the easy way out and simply pass the -EBUSY and -EAGAIN upwards will change the sysfs API.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: <linux-api@vger.kernel.org>
Cc: <linux-arch@vger.kernel.org>
[ paulmck: Fixed state machine for architectures that don't check earlier CPU-hotplug results as suggested by James Hogan. ]
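For illustration only (not part of this patch), here is a minimal sketch of how architecture-specific hotplug code might consume the new helpers. The my_arch_*() names are hypothetical placeholders; only cpu_check_up_prepare(), cpu_set_state_online(), and cpu_wait_death() come from this commit, and they are assumed to be declared in <linux/cpu.h> by the rest of the patch.

/* Hypothetical arch glue -- illustrative sketch, not kernel code. */
#include <linux/cpu.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/smp.h>

/* Placeholder arch hook, assumed to be implemented elsewhere. */
int my_arch_kick_secondary(unsigned int cpu, struct task_struct *idle);

int my_arch_cpu_up(unsigned int cpu, struct task_struct *idle)
{
	int ret;

	/* Bail out (or let the caller retry) if the previous offline never completed. */
	ret = cpu_check_up_prepare(cpu);
	if (ret)
		return ret;	/* -EBUSY, -EAGAIN, or -EIO */

	return my_arch_kick_secondary(cpu, idle);	/* arch-specific startup */
}

void my_arch_secondary_start(void)
{
	/* The incoming CPU marks itself online, resetting the state machine. */
	cpu_set_state_online(smp_processor_id());
}

void my_arch_cpu_die(unsigned int cpu)
{
	/* The surviving CPU waits up to five seconds for the dying CPU. */
	if (!cpu_wait_death(cpu, 5))
		pr_err("CPU %u did not die\n", cpu);
}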
Diffstat (limited to 'kernel/smpboot.c')
-rw-r--r--	kernel/smpboot.c	156
1 files changed, 156 insertions, 0 deletions
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 40190f28db35..c697f73d82d6 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -4,6 +4,7 @@
 #include <linux/cpu.h>
 #include <linux/err.h>
 #include <linux/smp.h>
+#include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/slab.h>
@@ -314,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
 	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
+
+static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
+
+/*
+ * Called to poll specified CPU's state, for example, when waiting for
+ * a CPU to come online.
+ */
+int cpu_report_state(int cpu)
+{
+	return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+}
+
+/*
+ * If CPU has died properly, set its state to CPU_UP_PREPARE and
+ * return success.  Otherwise, return -EBUSY if the CPU died after
+ * cpu_wait_death() timed out.  And yet otherwise again, return -EAGAIN
+ * if cpu_wait_death() timed out and the CPU still hasn't gotten around
+ * to dying.  In the latter two cases, the CPU might not be set up
+ * properly, but it is up to the arch-specific code to decide.
+ * Finally, -EIO indicates an unanticipated problem.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+int cpu_check_up_prepare(int cpu)
+{
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+		atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+		return 0;
+	}
+
+	switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
+
+	case CPU_POST_DEAD:
+
+		/* The CPU died properly, so just start it up again. */
+		atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+		return 0;
+
+	case CPU_DEAD_FROZEN:
+
+		/*
+		 * Timeout during CPU death, so let caller know.
+		 * The outgoing CPU completed its processing, but after
+		 * cpu_wait_death() timed out and reported the error. The
+		 * caller is free to proceed, in which case the state
+		 * will be reset properly by cpu_set_state_online().
+		 * Proceeding despite this -EBUSY return makes sense
+		 * for systems where the outgoing CPUs take themselves
+		 * offline, with no post-death manipulation required from
+		 * a surviving CPU.
+		 */
+		return -EBUSY;
+
+	case CPU_BROKEN:
+
+		/*
+		 * The most likely reason we got here is that there was
+		 * a timeout during CPU death, and the outgoing CPU never
+		 * did complete its processing.  This could happen on
+		 * a virtualized system if the outgoing VCPU gets preempted
+		 * for more than five seconds, and the user attempts to
+		 * immediately online that same CPU.  Trying again later
+		 * might return -EBUSY above, hence -EAGAIN.
+		 */
+		return -EAGAIN;
+
+	default:
+
+		/* Should not happen.  Famous last words. */
+		return -EIO;
+	}
+}
+
+/*
+ * Mark the specified CPU online.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+void cpu_set_state_online(int cpu)
+{
+	(void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Wait for the specified CPU to exit the idle loop and die.
+ */
+bool cpu_wait_death(unsigned int cpu, int seconds)
+{
+	int jf_left = seconds * HZ;
+	int oldstate;
+	bool ret = true;
+	int sleep_jf = 1;
+
+	might_sleep();
+
+	/* The outgoing CPU will normally get done quite quickly. */
+	if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
+		goto update_state;
+	udelay(5);
+
+	/* But if the outgoing CPU dawdles, wait increasingly long times. */
+	while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
+		schedule_timeout_uninterruptible(sleep_jf);
+		jf_left -= sleep_jf;
+		if (jf_left <= 0)
+			break;
+		sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
+	}
+update_state:
+	oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+	if (oldstate == CPU_DEAD) {
+		/* Outgoing CPU died normally, update state. */
+		smp_mb(); /* atomic_read() before update. */
+		atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
+	} else {
+		/* Outgoing CPU still hasn't died, set state accordingly. */
+		if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+				   oldstate, CPU_BROKEN) != oldstate)
+			goto update_state;
+		ret = false;
+	}
+	return ret;
+}
+
+/*
+ * Called by the outgoing CPU to report its successful death.  Return
+ * false if this report follows the surviving CPU's timing out.
+ *
+ * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
+ * timed out.  This approach allows architectures to omit calls to
+ * cpu_check_up_prepare() and cpu_set_state_online() without defeating
+ * the next cpu_wait_death()'s polling loop.
+ */
+bool cpu_report_death(void)
+{
+	int oldstate;
+	int newstate;
+	int cpu = smp_processor_id();
+
+	do {
+		oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+		if (oldstate != CPU_BROKEN)
+			newstate = CPU_DEAD;
+		else
+			newstate = CPU_DEAD_FROZEN;
+	} while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+				oldstate, newstate) != oldstate);
+	return newstate == CPU_DEAD;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
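For completeness, an equally hypothetical sketch of the dying-CPU side of this handshake: the outgoing CPU reports its own death through the common code instead of calling complete(), then halts. Only cpu_report_death() is from this commit; my_arch_halt() and my_arch_play_dead() are placeholders.

/* Placeholder arch hook, assumed to be implemented elsewhere. */
void my_arch_halt(void);

void my_arch_play_dead(void)
{
	/*
	 * Report death to the surviving CPU.  A false return means the
	 * survivor already timed out in cpu_wait_death().  Either way,
	 * RCU read-side code must not be used from this point on.
	 */
	(void)cpu_report_death();

	my_arch_halt();		/* arch-specific halt; does not return */
}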