aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRafael J. Wysocki <rafael.j.wysocki@intel.com>2016-07-13 21:55:23 -0400
committerRafael J. Wysocki <rafael.j.wysocki@intel.com>2016-07-15 16:42:48 -0400
commit406f992e4a372dafbe3c2cff7efbb2002a5c8ebd (patch)
treee56632536cd2de2b13c258a760fa832b653961af
parent4c0b6c10fbaf0c82efe2a7ba6c236c633d4f2ed7 (diff)
x86 / hibernate: Use hlt_play_dead() when resuming from hibernation
On Intel hardware, native_play_dead() uses mwait_play_dead() by default and only falls back to the other methods if that fails. That also happens during resume from hibernation, when the restore (boot) kernel runs disable_nonboot_cpus() to take all of the CPUs except for the boot one offline.

However, that is problematic, because the address passed to __monitor() in mwait_play_dead() is likely to be written to in the last phase of hibernate image restoration and that causes the "dead" CPU to start executing instructions again. Unfortunately, the page containing the address in that CPU's instruction pointer may not be valid any more at that point.

First, that page may have been overwritten with image kernel memory contents already, so the instructions the CPU attempts to execute may simply be invalid. Second, the page tables previously used by that CPU may have been overwritten by image kernel memory contents, so the address in its instruction pointer is impossible to resolve then.

A report from Varun Koyyalagunta and investigation carried out by Chen Yu show that the latter sometimes happens in practice.

To prevent it from happening, temporarily change the smp_ops.play_dead pointer during resume from hibernation so that it points to a special "play dead" routine which uses hlt_play_dead() and avoids the inadvertent "revivals" of "dead" CPUs this way.

A slightly unpleasant consequence of this change is that if the system is hibernated with one or more CPUs offline, it will generally draw more power after resume than it did before hibernation, because the physical state entered by CPUs via hlt_play_dead() is higher-power than the mwait_play_dead() one in the majority of cases. It is possible to work around this, but it is unclear how much of a problem that's going to be in practice, so the workaround will be implemented later if it turns out to be necessary.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=106371
Reported-by: Varun Koyyalagunta <cpudebug@centtech.com>
Original-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--arch/x86/include/asm/smp.h1
-rw-r--r--arch/x86/kernel/smpboot.c2
-rw-r--r--arch/x86/power/cpu.c30
-rw-r--r--kernel/power/hibernate.c7
-rw-r--r--kernel/power/power.h2
5 files changed, 40 insertions, 2 deletions
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 66b057306f40..7427ca895a27 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -135,6 +135,7 @@ int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
135int native_cpu_disable(void); 135int native_cpu_disable(void);
136int common_cpu_die(unsigned int cpu); 136int common_cpu_die(unsigned int cpu);
137void native_cpu_die(unsigned int cpu); 137void native_cpu_die(unsigned int cpu);
138void hlt_play_dead(void);
138void native_play_dead(void); 139void native_play_dead(void);
139void play_dead_common(void); 140void play_dead_common(void);
140void wbinvd_on_cpu(int cpu); 141void wbinvd_on_cpu(int cpu);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fafe8b923cac..8264dfad9cf8 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1622,7 +1622,7 @@ static inline void mwait_play_dead(void)
1622 } 1622 }
1623} 1623}
1624 1624
1625static inline void hlt_play_dead(void) 1625void hlt_play_dead(void)
1626{ 1626{
1627 if (__this_cpu_read(cpu_info.x86) >= 4) 1627 if (__this_cpu_read(cpu_info.x86) >= 4)
1628 wbinvd(); 1628 wbinvd();
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index d5f64996394a..b12c26e2e309 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -12,6 +12,7 @@
12#include <linux/export.h> 12#include <linux/export.h>
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/perf_event.h> 14#include <linux/perf_event.h>
15#include <linux/tboot.h>
15 16
16#include <asm/pgtable.h> 17#include <asm/pgtable.h>
17#include <asm/proto.h> 18#include <asm/proto.h>
@@ -266,6 +267,35 @@ void notrace restore_processor_state(void)
266EXPORT_SYMBOL(restore_processor_state); 267EXPORT_SYMBOL(restore_processor_state);
267#endif 268#endif
268 269
270#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HOTPLUG_CPU)
271static void resume_play_dead(void)
272{
273 play_dead_common();
274 tboot_shutdown(TB_SHUTDOWN_WFS);
275 hlt_play_dead();
276}
277
278int hibernate_resume_nonboot_cpu_disable(void)
279{
280 void (*play_dead)(void) = smp_ops.play_dead;
281 int ret;
282
283 /*
284 * Ensure that MONITOR/MWAIT will not be used in the "play dead" loop
285 * during hibernate image restoration, because it is likely that the
286 * monitored address will be actually written to at that time and then
287 * the "dead" CPU will attempt to execute instructions again, but the
288 * address in its instruction pointer may not be possible to resolve
289 * any more at that point (the page tables used by it previously may
290 * have been overwritten by hibernate image data).
291 */
292 smp_ops.play_dead = resume_play_dead;
293 ret = disable_nonboot_cpus();
294 smp_ops.play_dead = play_dead;
295 return ret;
296}
297#endif
298
269/* 299/*
270 * When bsp_check() is called in hibernate and suspend, cpu hotplug 300 * When bsp_check() is called in hibernate and suspend, cpu hotplug
271 * is disabled already. So it's unnessary to handle race condition between 301 * is disabled already. So it's unnessary to handle race condition between
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 51441d87f0b6..5f3523e18e46 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -409,6 +409,11 @@ int hibernation_snapshot(int platform_mode)
409 goto Close; 409 goto Close;
410} 410}
411 411
412int __weak hibernate_resume_nonboot_cpu_disable(void)
413{
414 return disable_nonboot_cpus();
415}
416
412/** 417/**
413 * resume_target_kernel - Restore system state from a hibernation image. 418 * resume_target_kernel - Restore system state from a hibernation image.
414 * @platform_mode: Whether or not to use the platform driver. 419 * @platform_mode: Whether or not to use the platform driver.
@@ -433,7 +438,7 @@ static int resume_target_kernel(bool platform_mode)
433 if (error) 438 if (error)
434 goto Cleanup; 439 goto Cleanup;
435 440
436 error = disable_nonboot_cpus(); 441 error = hibernate_resume_nonboot_cpu_disable();
437 if (error) 442 if (error)
438 goto Enable_cpus; 443 goto Enable_cpus;
439 444
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 064963e89194..242d8b827dd5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,6 +38,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
38} 38}
39#endif /* CONFIG_ARCH_HIBERNATION_HEADER */ 39#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
40 40
41extern int hibernate_resume_nonboot_cpu_disable(void);
42
41/* 43/*
42 * Keep some memory free so that I/O operations can succeed without paging 44 * Keep some memory free so that I/O operations can succeed without paging
43 * [Might this be more than 4 MB?] 45 * [Might this be more than 4 MB?]