aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-power29
-rw-r--r--Documentation/kernel-parameters.txt7
-rw-r--r--Documentation/power/devices.txt34
-rw-r--r--Documentation/power/runtime_pm.txt17
-rw-r--r--Documentation/power/states.txt87
-rw-r--r--Documentation/power/swsusp.txt5
-rw-r--r--drivers/base/power/main.c66
-rw-r--r--drivers/base/power/wakeup.c6
-rw-r--r--drivers/cpuidle/cpuidle.c55
-rw-r--r--drivers/cpuidle/governors/menu.c17
-rw-r--r--include/linux/cpuidle.h7
-rw-r--r--include/linux/pm.h36
-rw-r--r--include/linux/pm_runtime.h6
-rw-r--r--kernel/power/hibernate.c27
-rw-r--r--kernel/power/main.c33
-rw-r--r--kernel/power/power.h9
-rw-r--r--kernel/power/suspend.c96
-rw-r--r--kernel/power/suspend_test.c24
-rw-r--r--kernel/sched/idle.c20
19 files changed, 400 insertions, 181 deletions
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index 64c9276e9421..f4551816329e 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -7,19 +7,30 @@ Description:
7 subsystem. 7 subsystem.
8 8
9What: /sys/power/state 9What: /sys/power/state
10Date: August 2006 10Date: May 2014
11Contact: Rafael J. Wysocki <rjw@rjwysocki.net> 11Contact: Rafael J. Wysocki <rjw@rjwysocki.net>
12Description: 12Description:
13 The /sys/power/state file controls the system power state. 13 The /sys/power/state file controls system sleep states.
14 Reading from this file returns what states are supported, 14 Reading from this file returns the available sleep state
15 which is hard-coded to 'freeze' (Low-Power Idle), 'standby' 15 labels, which may be "mem", "standby", "freeze" and "disk"
16 (Power-On Suspend), 'mem' (Suspend-to-RAM), and 'disk' 16 (hibernation). The meanings of the first three labels depend on
17 (Suspend-to-Disk). 17 the relative_sleep_states command line argument as follows:
18 1) relative_sleep_states = 1
19 "mem", "standby", "freeze" represent non-hibernation sleep
20 states from the deepest ("mem", always present) to the
21 shallowest ("freeze"). "standby" and "freeze" may or may
22 not be present depending on the capabilities of the
23 platform. "freeze" can only be present if "standby" is
24 present.
25 2) relative_sleep_states = 0 (default)
26 "mem" - "suspend-to-RAM", present if supported.
27 "standby" - "power-on suspend", present if supported.
28 "freeze" - "suspend-to-idle", always present.
18 29
19 Writing to this file one of these strings causes the system to 30 Writing to this file one of these strings causes the system to
20 transition into that state. Please see the file 31 transition into the corresponding state, if available. See
21 Documentation/power/states.txt for a description of each of 32 Documentation/power/states.txt for a description of what
22 these states. 33 "suspend-to-RAM", "power-on suspend" and "suspend-to-idle" mean.
23 34
24What: /sys/power/disk 35What: /sys/power/disk
25Date: September 2006 36Date: September 2006
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 43842177b771..e19a88b63eeb 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2889,6 +2889,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2889 [KNL, SMP] Set scheduler's default relax_domain_level. 2889 [KNL, SMP] Set scheduler's default relax_domain_level.
2890 See Documentation/cgroups/cpusets.txt. 2890 See Documentation/cgroups/cpusets.txt.
2891 2891
2892 relative_sleep_states=
2893 [SUSPEND] Use sleep state labeling where the deepest
2894 state available other than hibernation is always "mem".
2895 Format: { "0" | "1" }
2896 0 -- Traditional sleep state labels.
2897 1 -- Relative sleep state labels.
2898
2892 reserve= [KNL,BUGS] Force the kernel to ignore some iomem area 2899 reserve= [KNL,BUGS] Force the kernel to ignore some iomem area
2893 2900
2894 reservetop= [X86-32] 2901 reservetop= [X86-32]
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index 47d46dff70f7..d172bce0fd49 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -2,6 +2,7 @@ Device Power Management
2 2
3Copyright (c) 2010-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. 3Copyright (c) 2010-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
4Copyright (c) 2010 Alan Stern <stern@rowland.harvard.edu> 4Copyright (c) 2010 Alan Stern <stern@rowland.harvard.edu>
5Copyright (c) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
5 6
6 7
7Most of the code in Linux is device drivers, so most of the Linux power 8Most of the code in Linux is device drivers, so most of the Linux power
@@ -326,6 +327,20 @@ the phases are:
326 driver in some way for the upcoming system power transition, but it 327 driver in some way for the upcoming system power transition, but it
327 should not put the device into a low-power state. 328 should not put the device into a low-power state.
328 329
330 For devices supporting runtime power management, the return value of the
331 prepare callback can be used to indicate to the PM core that it may
332 safely leave the device in runtime suspend (if runtime-suspended
333 already), provided that all of the device's descendants are also left in
334 runtime suspend. Namely, if the prepare callback returns a positive
335 number and that happens for all of the descendants of the device too,
336 and all of them (including the device itself) are runtime-suspended, the
337 PM core will skip the suspend, suspend_late and suspend_noirq suspend
338 phases as well as the resume_noirq, resume_early and resume phases of
339 the following system resume for all of these devices. In that case,
340 the complete callback will be called directly after the prepare callback
341 and is entirely responsible for bringing the device back to the
342 functional state as appropriate.
343
329 2. The suspend methods should quiesce the device to stop it from performing 344 2. The suspend methods should quiesce the device to stop it from performing
330 I/O. They also may save the device registers and put it into the 345 I/O. They also may save the device registers and put it into the
331 appropriate low-power state, depending on the bus type the device is on, 346 appropriate low-power state, depending on the bus type the device is on,
@@ -400,12 +415,23 @@ When resuming from freeze, standby or memory sleep, the phases are:
400 the resume callbacks occur; it's not necessary to wait until the 415 the resume callbacks occur; it's not necessary to wait until the
401 complete phase. 416 complete phase.
402 417
418 Moreover, if the preceding prepare callback returned a positive number,
419 the device may have been left in runtime suspend throughout the whole
420 system suspend and resume (the suspend, suspend_late, suspend_noirq
421 phases of system suspend and the resume_noirq, resume_early, resume
422 phases of system resume may have been skipped for it). In that case,
423 the complete callback is entirely responsible for bringing the device
424 back to the functional state after system suspend if necessary. [For
425 example, it may need to queue up a runtime resume request for the device
426 for this purpose.] To check if that is the case, the complete callback
427 can consult the device's power.direct_complete flag. Namely, if that
428 flag is set when the complete callback is being run, it has been called
429 directly after the preceding prepare and special action may be required
430 to make the device work correctly afterward.
431
403At the end of these phases, drivers should be as functional as they were before 432At the end of these phases, drivers should be as functional as they were before
404suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are 433suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are
405gated on. Even if the device was in a low-power state before the system sleep 434gated on.
406because of runtime power management, afterwards it should be back in its
407full-power state. There are multiple reasons why it's best to do this; they are
408discussed in more detail in Documentation/power/runtime_pm.txt.
409 435
410However, the details here may again be platform-specific. For example, 436However, the details here may again be platform-specific. For example,
411some systems support multiple "run" states, and the mode in effect at 437some systems support multiple "run" states, and the mode in effect at
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
index 5f96daf8566a..e1bee8a4aaac 100644
--- a/Documentation/power/runtime_pm.txt
+++ b/Documentation/power/runtime_pm.txt
@@ -2,6 +2,7 @@ Runtime Power Management Framework for I/O Devices
2 2
3(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. 3(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
4(C) 2010 Alan Stern <stern@rowland.harvard.edu> 4(C) 2010 Alan Stern <stern@rowland.harvard.edu>
5(C) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
5 6
61. Introduction 71. Introduction
7 8
@@ -444,6 +445,10 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
444 bool pm_runtime_status_suspended(struct device *dev); 445 bool pm_runtime_status_suspended(struct device *dev);
445 - return true if the device's runtime PM status is 'suspended' 446 - return true if the device's runtime PM status is 'suspended'
446 447
448 bool pm_runtime_suspended_if_enabled(struct device *dev);
449 - return true if the device's runtime PM status is 'suspended' and its
450 'power.disable_depth' field is equal to 1
451
447 void pm_runtime_allow(struct device *dev); 452 void pm_runtime_allow(struct device *dev);
448 - set the power.runtime_auto flag for the device and decrease its usage 453 - set the power.runtime_auto flag for the device and decrease its usage
449 counter (used by the /sys/devices/.../power/control interface to 454 counter (used by the /sys/devices/.../power/control interface to
@@ -644,6 +649,18 @@ place (in particular, if the system is not waking up from hibernation), it may
644be more efficient to leave the devices that had been suspended before the system 649be more efficient to leave the devices that had been suspended before the system
645suspend began in the suspended state. 650suspend began in the suspended state.
646 651
652To this end, the PM core provides a mechanism allowing some coordination between
653different levels of device hierarchy. Namely, if a system suspend .prepare()
654callback returns a positive number for a device, that indicates to the PM core
655that the device appears to be runtime-suspended and its state is fine, so it
656may be left in runtime suspend provided that all of its descendants are also
657left in runtime suspend. If that happens, the PM core will not execute any
658system suspend and resume callbacks for all of those devices, except for the
659complete callback, which is then entirely responsible for handling the device
660as appropriate. This only applies to system suspend transitions that are not
661related to hibernation (see Documentation/power/devices.txt for more
662information).
663
647The PM core does its best to reduce the probability of race conditions between 664The PM core does its best to reduce the probability of race conditions between
648the runtime PM and system suspend/resume (and hibernation) callbacks by carrying 665the runtime PM and system suspend/resume (and hibernation) callbacks by carrying
649out the following operations: 666out the following operations:
diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt
index 442d43df9b25..50f3ef9177c1 100644
--- a/Documentation/power/states.txt
+++ b/Documentation/power/states.txt
@@ -1,62 +1,87 @@
1System Power Management Sleep States
1 2
2System Power Management States 3(C) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
3 4
5The kernel supports up to four system sleep states generically, although three
6of them depend on the platform support code to implement the low-level details
7for each state.
4 8
5The kernel supports four power management states generically, though 9The states are represented by strings that can be read or written to the
6one is generic and the other three are dependent on platform support 10/sys/power/state file. Those strings may be "mem", "standby", "freeze" and
7code to implement the low-level details for each state. 11"disk", where the last one always represents hibernation (Suspend-To-Disk) and
8This file describes each state, what they are 12the meaning of the remaining ones depends on the relative_sleep_states command
9commonly called, what ACPI state they map to, and what string to write 13line argument.
10to /sys/power/state to enter that state
11 14
12state: Freeze / Low-Power Idle 15For relative_sleep_states=1, the strings "mem", "standby" and "freeze" label the
16available non-hibernation sleep states from the deepest to the shallowest,
17respectively. In that case, "mem" is always present in /sys/power/state,
18because there is at least one non-hibernation sleep state in every system. If
19the given system supports two non-hibernation sleep states, "standby" is present
20in /sys/power/state in addition to "mem". If the system supports three
21non-hibernation sleep states, "freeze" will be present in /sys/power/state in
22addition to "mem" and "standby".
23
24For relative_sleep_states=0, which is the default, the following descriptions
25apply.
26
27state: Suspend-To-Idle
13ACPI state: S0 28ACPI state: S0
14String: "freeze" 29Label: "freeze"
15 30
16This state is a generic, pure software, light-weight, low-power state. 31This state is a generic, pure software, light-weight, system sleep state.
17It allows more energy to be saved relative to idle by freezing user 32It allows more energy to be saved relative to runtime idle by freezing user
18space and putting all I/O devices into low-power states (possibly 33space and putting all I/O devices into low-power states (possibly
19lower-power than available at run time), such that the processors can 34lower-power than available at run time), such that the processors can
20spend more time in their idle states. 35spend more time in their idle states.
21This state can be used for platforms without Standby/Suspend-to-RAM 36
37This state can be used for platforms without Power-On Suspend/Suspend-to-RAM
22support, or it can be used in addition to Suspend-to-RAM (memory sleep) 38support, or it can be used in addition to Suspend-to-RAM (memory sleep)
23to provide reduced resume latency. 39to provide reduced resume latency. It is always supported.
24 40
25 41
26State: Standby / Power-On Suspend 42State: Standby / Power-On Suspend
27ACPI State: S1 43ACPI State: S1
28String: "standby" 44Label: "standby"
29 45
30This state offers minimal, though real, power savings, while providing 46This state, if supported, offers moderate, though real, power savings, while
31a very low-latency transition back to a working system. No operating 47providing a relatively low-latency transition back to a working system. No
32state is lost (the CPU retains power), so the system easily starts up 48operating state is lost (the CPU retains power), so the system easily starts up
33again where it left off. 49again where it left off.
34 50
35We try to put devices in a low-power state equivalent to D1, which 51In addition to freezing user space and putting all I/O devices into low-power
36also offers low power savings, but low resume latency. Not all devices 52states, which is done for Suspend-To-Idle too, nonboot CPUs are taken offline
37support D1, and those that don't are left on. 53and all low-level system functions are suspended during transitions into this
54state. For this reason, it should allow more energy to be saved relative to
55Suspend-To-Idle, but the resume latency will generally be greater than for that
56state.
38 57
39 58
40State: Suspend-to-RAM 59State: Suspend-to-RAM
41ACPI State: S3 60ACPI State: S3
42String: "mem" 61Label: "mem"
43 62
44This state offers significant power savings as everything in the 63This state, if supported, offers significant power savings as everything in the
45system is put into a low-power state, except for memory, which is 64system is put into a low-power state, except for memory, which should be placed
46placed in self-refresh mode to retain its contents. 65into the self-refresh mode to retain its contents. All of the steps carried out
66when entering Power-On Suspend are also carried out during transitions to STR.
67Additional operations may take place depending on the platform capabilities. In
68particular, on ACPI systems the kernel passes control to the BIOS (platform
69firmware) as the last step during STR transitions and that usually results in
70powering down some more low-level components that aren't directly controlled by
71the kernel.
47 72
48System and device state is saved and kept in memory. All devices are 73System and device state is saved and kept in memory. All devices are suspended
49suspended and put into D3. In many cases, all peripheral buses lose 74and put into low-power states. In many cases, all peripheral buses lose power
50power when entering STR, so devices must be able to handle the 75when entering STR, so devices must be able to handle the transition back to the
51transition back to the On state. 76"on" state.
52 77
53For at least ACPI, STR requires some minimal boot-strapping code to 78For at least ACPI, STR requires some minimal boot-strapping code to resume the
54resume the system from STR. This may be true on other platforms. 79system from it. This may be the case on other platforms too.
55 80
56 81
57State: Suspend-to-disk 82State: Suspend-to-disk
58ACPI State: S4 83ACPI State: S4
59String: "disk" 84Label: "disk"
60 85
61This state offers the greatest power savings, and can be used even in 86This state offers the greatest power savings, and can be used even in
62the absence of low-level platform support for power management. This 87the absence of low-level platform support for power management. This
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index 079160e22bcc..f732a8321e8a 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -220,7 +220,10 @@ Q: After resuming, system is paging heavily, leading to very bad interactivity.
220 220
221A: Try running 221A: Try running
222 222
223cat `cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u` > /dev/null 223cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file
224do
225 test -f "$file" && cat "$file" > /dev/null
226done
224 227
225after resume. swapoff -a; swapon -a may also be useful. 228after resume. swapoff -a; swapon -a may also be useful.
226 229
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 86d5e4fb5b98..343ffad59377 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -479,7 +479,7 @@ static int device_resume_noirq(struct device *dev, pm_message_t state, bool asyn
479 TRACE_DEVICE(dev); 479 TRACE_DEVICE(dev);
480 TRACE_RESUME(0); 480 TRACE_RESUME(0);
481 481
482 if (dev->power.syscore) 482 if (dev->power.syscore || dev->power.direct_complete)
483 goto Out; 483 goto Out;
484 484
485 if (!dev->power.is_noirq_suspended) 485 if (!dev->power.is_noirq_suspended)
@@ -605,7 +605,7 @@ static int device_resume_early(struct device *dev, pm_message_t state, bool asyn
605 TRACE_DEVICE(dev); 605 TRACE_DEVICE(dev);
606 TRACE_RESUME(0); 606 TRACE_RESUME(0);
607 607
608 if (dev->power.syscore) 608 if (dev->power.syscore || dev->power.direct_complete)
609 goto Out; 609 goto Out;
610 610
611 if (!dev->power.is_late_suspended) 611 if (!dev->power.is_late_suspended)
@@ -735,6 +735,12 @@ static int device_resume(struct device *dev, pm_message_t state, bool async)
735 if (dev->power.syscore) 735 if (dev->power.syscore)
736 goto Complete; 736 goto Complete;
737 737
738 if (dev->power.direct_complete) {
739 /* Match the pm_runtime_disable() in __device_suspend(). */
740 pm_runtime_enable(dev);
741 goto Complete;
742 }
743
738 dpm_wait(dev->parent, async); 744 dpm_wait(dev->parent, async);
739 dpm_watchdog_set(&wd, dev); 745 dpm_watchdog_set(&wd, dev);
740 device_lock(dev); 746 device_lock(dev);
@@ -1007,7 +1013,7 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a
1007 goto Complete; 1013 goto Complete;
1008 } 1014 }
1009 1015
1010 if (dev->power.syscore) 1016 if (dev->power.syscore || dev->power.direct_complete)
1011 goto Complete; 1017 goto Complete;
1012 1018
1013 dpm_wait_for_children(dev, async); 1019 dpm_wait_for_children(dev, async);
@@ -1146,7 +1152,7 @@ static int __device_suspend_late(struct device *dev, pm_message_t state, bool as
1146 goto Complete; 1152 goto Complete;
1147 } 1153 }
1148 1154
1149 if (dev->power.syscore) 1155 if (dev->power.syscore || dev->power.direct_complete)
1150 goto Complete; 1156 goto Complete;
1151 1157
1152 dpm_wait_for_children(dev, async); 1158 dpm_wait_for_children(dev, async);
@@ -1332,6 +1338,17 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
1332 if (dev->power.syscore) 1338 if (dev->power.syscore)
1333 goto Complete; 1339 goto Complete;
1334 1340
1341 if (dev->power.direct_complete) {
1342 if (pm_runtime_status_suspended(dev)) {
1343 pm_runtime_disable(dev);
1344 if (pm_runtime_suspended_if_enabled(dev))
1345 goto Complete;
1346
1347 pm_runtime_enable(dev);
1348 }
1349 dev->power.direct_complete = false;
1350 }
1351
1335 dpm_watchdog_set(&wd, dev); 1352 dpm_watchdog_set(&wd, dev);
1336 device_lock(dev); 1353 device_lock(dev);
1337 1354
@@ -1382,10 +1399,19 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
1382 1399
1383 End: 1400 End:
1384 if (!error) { 1401 if (!error) {
1402 struct device *parent = dev->parent;
1403
1385 dev->power.is_suspended = true; 1404 dev->power.is_suspended = true;
1386 if (dev->power.wakeup_path 1405 if (parent) {
1387 && dev->parent && !dev->parent->power.ignore_children) 1406 spin_lock_irq(&parent->power.lock);
1388 dev->parent->power.wakeup_path = true; 1407
1408 dev->parent->power.direct_complete = false;
1409 if (dev->power.wakeup_path
1410 && !dev->parent->power.ignore_children)
1411 dev->parent->power.wakeup_path = true;
1412
1413 spin_unlock_irq(&parent->power.lock);
1414 }
1389 } 1415 }
1390 1416
1391 device_unlock(dev); 1417 device_unlock(dev);
@@ -1487,7 +1513,7 @@ static int device_prepare(struct device *dev, pm_message_t state)
1487{ 1513{
1488 int (*callback)(struct device *) = NULL; 1514 int (*callback)(struct device *) = NULL;
1489 char *info = NULL; 1515 char *info = NULL;
1490 int error = 0; 1516 int ret = 0;
1491 1517
1492 if (dev->power.syscore) 1518 if (dev->power.syscore)
1493 return 0; 1519 return 0;
@@ -1523,17 +1549,27 @@ static int device_prepare(struct device *dev, pm_message_t state)
1523 callback = dev->driver->pm->prepare; 1549 callback = dev->driver->pm->prepare;
1524 } 1550 }
1525 1551
1526 if (callback) { 1552 if (callback)
1527 error = callback(dev); 1553 ret = callback(dev);
1528 suspend_report_result(callback, error);
1529 }
1530 1554
1531 device_unlock(dev); 1555 device_unlock(dev);
1532 1556
1533 if (error) 1557 if (ret < 0) {
1558 suspend_report_result(callback, ret);
1534 pm_runtime_put(dev); 1559 pm_runtime_put(dev);
1535 1560 return ret;
1536 return error; 1561 }
1562 /*
1563 * A positive return value from ->prepare() means "this device appears
1564 * to be runtime-suspended and its state is fine, so if it really is
1565 * runtime-suspended, you can leave it in that state provided that you
1566 * will do the same thing with all of its descendants". This only
1567 * applies to suspend transitions, however.
1568 */
1569 spin_lock_irq(&dev->power.lock);
1570 dev->power.direct_complete = ret > 0 && state.event == PM_EVENT_SUSPEND;
1571 spin_unlock_irq(&dev->power.lock);
1572 return 0;
1537} 1573}
1538 1574
1539/** 1575/**
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 2d56f4113ae7..eb1bd2ecad8b 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -318,10 +318,16 @@ int device_init_wakeup(struct device *dev, bool enable)
318{ 318{
319 int ret = 0; 319 int ret = 0;
320 320
321 if (!dev)
322 return -EINVAL;
323
321 if (enable) { 324 if (enable) {
322 device_set_wakeup_capable(dev, true); 325 device_set_wakeup_capable(dev, true);
323 ret = device_wakeup_enable(dev); 326 ret = device_wakeup_enable(dev);
324 } else { 327 } else {
328 if (dev->power.can_wakeup)
329 device_wakeup_disable(dev);
330
325 device_set_wakeup_capable(dev, false); 331 device_set_wakeup_capable(dev, false);
326 } 332 }
327 333
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 8236746e46bb..cb7019977c50 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -32,6 +32,7 @@ LIST_HEAD(cpuidle_detected_devices);
32static int enabled_devices; 32static int enabled_devices;
33static int off __read_mostly; 33static int off __read_mostly;
34static int initialized __read_mostly; 34static int initialized __read_mostly;
35static bool use_deepest_state __read_mostly;
35 36
36int cpuidle_disabled(void) 37int cpuidle_disabled(void)
37{ 38{
@@ -65,23 +66,42 @@ int cpuidle_play_dead(void)
65} 66}
66 67
67/** 68/**
68 * cpuidle_enabled - check if the cpuidle framework is ready 69 * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode.
69 * @dev: cpuidle device for this cpu 70 * @enable: Whether to enable or disable the feature.
70 * @drv: cpuidle driver for this cpu 71 *
72 * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and
73 * always use the state with the greatest exit latency (out of the states that
74 * are not disabled).
71 * 75 *
72 * Return 0 on success, otherwise: 76 * This function can only be called after cpuidle_pause() to avoid races.
73 * -NODEV : the cpuidle framework is not available
74 * -EBUSY : the cpuidle framework is not initialized
75 */ 77 */
76int cpuidle_enabled(struct cpuidle_driver *drv, struct cpuidle_device *dev) 78void cpuidle_use_deepest_state(bool enable)
77{ 79{
78 if (off || !initialized) 80 use_deepest_state = enable;
79 return -ENODEV; 81}
80 82
81 if (!drv || !dev || !dev->enabled) 83/**
82 return -EBUSY; 84 * cpuidle_find_deepest_state - Find the state of the greatest exit latency.
85 * @drv: cpuidle driver for a given CPU.
86 * @dev: cpuidle device for a given CPU.
87 */
88static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
89 struct cpuidle_device *dev)
90{
91 unsigned int latency_req = 0;
92 int i, ret = CPUIDLE_DRIVER_STATE_START - 1;
83 93
84 return 0; 94 for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
95 struct cpuidle_state *s = &drv->states[i];
96 struct cpuidle_state_usage *su = &dev->states_usage[i];
97
98 if (s->disabled || su->disable || s->exit_latency <= latency_req)
99 continue;
100
101 latency_req = s->exit_latency;
102 ret = i;
103 }
104 return ret;
85} 105}
86 106
87/** 107/**
@@ -138,6 +158,15 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
138 */ 158 */
139int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) 159int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
140{ 160{
161 if (off || !initialized)
162 return -ENODEV;
163
164 if (!drv || !dev || !dev->enabled)
165 return -EBUSY;
166
167 if (unlikely(use_deepest_state))
168 return cpuidle_find_deepest_state(drv, dev);
169
141 return cpuidle_curr_governor->select(drv, dev); 170 return cpuidle_curr_governor->select(drv, dev);
142} 171}
143 172
@@ -169,7 +198,7 @@ int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev,
169 */ 198 */
170void cpuidle_reflect(struct cpuidle_device *dev, int index) 199void cpuidle_reflect(struct cpuidle_device *dev, int index)
171{ 200{
172 if (cpuidle_curr_governor->reflect) 201 if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state))
173 cpuidle_curr_governor->reflect(dev, index); 202 cpuidle_curr_governor->reflect(dev, index);
174} 203}
175 204
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 71b523293354..c4f80c15a48d 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -296,7 +296,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
296 data->needs_update = 0; 296 data->needs_update = 0;
297 } 297 }
298 298
299 data->last_state_idx = 0; 299 data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
300 300
301 /* Special case when user has set very strict latency requirement */ 301 /* Special case when user has set very strict latency requirement */
302 if (unlikely(latency_req == 0)) 302 if (unlikely(latency_req == 0))
@@ -311,13 +311,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
311 data->bucket = which_bucket(data->next_timer_us); 311 data->bucket = which_bucket(data->next_timer_us);
312 312
313 /* 313 /*
314 * if the correction factor is 0 (eg first time init or cpu hotplug
315 * etc), we actually want to start out with a unity factor.
316 */
317 if (data->correction_factor[data->bucket] == 0)
318 data->correction_factor[data->bucket] = RESOLUTION * DECAY;
319
320 /*
321 * Force the result of multiplication to be 64 bits even if both 314 * Force the result of multiplication to be 64 bits even if both
322 * operands are 32 bits. 315 * operands are 32 bits.
323 * Make sure to round up for half microseconds. 316 * Make sure to round up for half microseconds.
@@ -466,9 +459,17 @@ static int menu_enable_device(struct cpuidle_driver *drv,
466 struct cpuidle_device *dev) 459 struct cpuidle_device *dev)
467{ 460{
468 struct menu_device *data = &per_cpu(menu_devices, dev->cpu); 461 struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
462 int i;
469 463
470 memset(data, 0, sizeof(struct menu_device)); 464 memset(data, 0, sizeof(struct menu_device));
471 465
466 /*
467 * The memset() above leaves every correction factor at 0 (eg first time
468 * init or cpu hotplug etc), but we actually want to start out with a unity factor.
469 */
470 for(i = 0; i < BUCKETS; i++)
471 data->correction_factor[i] = RESOLUTION * DECAY;
472
472 return 0; 473 return 0;
473} 474}
474 475
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index b0238cba440b..c51a436135c4 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -120,8 +120,6 @@ struct cpuidle_driver {
120#ifdef CONFIG_CPU_IDLE 120#ifdef CONFIG_CPU_IDLE
121extern void disable_cpuidle(void); 121extern void disable_cpuidle(void);
122 122
123extern int cpuidle_enabled(struct cpuidle_driver *drv,
124 struct cpuidle_device *dev);
125extern int cpuidle_select(struct cpuidle_driver *drv, 123extern int cpuidle_select(struct cpuidle_driver *drv,
126 struct cpuidle_device *dev); 124 struct cpuidle_device *dev);
127extern int cpuidle_enter(struct cpuidle_driver *drv, 125extern int cpuidle_enter(struct cpuidle_driver *drv,
@@ -145,13 +143,11 @@ extern void cpuidle_resume(void);
145extern int cpuidle_enable_device(struct cpuidle_device *dev); 143extern int cpuidle_enable_device(struct cpuidle_device *dev);
146extern void cpuidle_disable_device(struct cpuidle_device *dev); 144extern void cpuidle_disable_device(struct cpuidle_device *dev);
147extern int cpuidle_play_dead(void); 145extern int cpuidle_play_dead(void);
146extern void cpuidle_use_deepest_state(bool enable);
148 147
149extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); 148extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
150#else 149#else
151static inline void disable_cpuidle(void) { } 150static inline void disable_cpuidle(void) { }
152static inline int cpuidle_enabled(struct cpuidle_driver *drv,
153 struct cpuidle_device *dev)
154{return -ENODEV; }
155static inline int cpuidle_select(struct cpuidle_driver *drv, 151static inline int cpuidle_select(struct cpuidle_driver *drv,
156 struct cpuidle_device *dev) 152 struct cpuidle_device *dev)
157{return -ENODEV; } 153{return -ENODEV; }
@@ -180,6 +176,7 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev)
180{return -ENODEV; } 176{return -ENODEV; }
181static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } 177static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
182static inline int cpuidle_play_dead(void) {return -ENODEV; } 178static inline int cpuidle_play_dead(void) {return -ENODEV; }
179static inline void cpuidle_use_deepest_state(bool enable) {}
183static inline struct cpuidle_driver *cpuidle_get_cpu_driver( 180static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
184 struct cpuidle_device *dev) {return NULL; } 181 struct cpuidle_device *dev) {return NULL; }
185#endif 182#endif
diff --git a/include/linux/pm.h b/include/linux/pm.h
index d915d0345fa1..72c0fe098a27 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -93,13 +93,23 @@ typedef struct pm_message {
93 * been registered) to recover from the race condition. 93 * been registered) to recover from the race condition.
94 * This method is executed for all kinds of suspend transitions and is 94 * This method is executed for all kinds of suspend transitions and is
95 * followed by one of the suspend callbacks: @suspend(), @freeze(), or 95 * followed by one of the suspend callbacks: @suspend(), @freeze(), or
96 * @poweroff(). The PM core executes subsystem-level @prepare() for all 96 * @poweroff(). If the transition is a suspend to memory or standby (that
97 * devices before starting to invoke suspend callbacks for any of them, so 97 * is, not related to hibernation), the return value of @prepare() may be
98 * generally devices may be assumed to be functional or to respond to 98 * used to indicate to the PM core to leave the device in runtime suspend
99 * runtime resume requests while @prepare() is being executed. However, 99 * if applicable. Namely, if @prepare() returns a positive number, the PM
100 * device drivers may NOT assume anything about the availability of user 100 * core will understand that as a declaration that the device appears to be
101 * space at that time and it is NOT valid to request firmware from within 101 * runtime-suspended and it may be left in that state during the entire
102 * @prepare() (it's too late to do that). It also is NOT valid to allocate 102 * transition and during the subsequent resume if all of its descendants
103 * are left in runtime suspend too. If that happens, @complete() will be
104 * executed directly after @prepare() and it must ensure the proper
105 * functioning of the device after the system resume.
106 * The PM core executes subsystem-level @prepare() for all devices before
107 * starting to invoke suspend callbacks for any of them, so generally
108 * devices may be assumed to be functional or to respond to runtime resume
109 * requests while @prepare() is being executed. However, device drivers
110 * may NOT assume anything about the availability of user space at that
111 * time and it is NOT valid to request firmware from within @prepare()
112 * (it's too late to do that). It also is NOT valid to allocate
103 * substantial amounts of memory from @prepare() in the GFP_KERNEL mode. 113 * substantial amounts of memory from @prepare() in the GFP_KERNEL mode.
104 * [To work around these limitations, drivers may register suspend and 114 * [To work around these limitations, drivers may register suspend and
105 * hibernation notifiers to be executed before the freezing of tasks.] 115 * hibernation notifiers to be executed before the freezing of tasks.]
@@ -112,7 +122,16 @@ typedef struct pm_message {
112 * of the other devices that the PM core has unsuccessfully attempted to 122 * of the other devices that the PM core has unsuccessfully attempted to
113 * suspend earlier). 123 * suspend earlier).
114 * The PM core executes subsystem-level @complete() after it has executed 124 * The PM core executes subsystem-level @complete() after it has executed
115 * the appropriate resume callbacks for all devices. 125 * the appropriate resume callbacks for all devices. If the corresponding
126 * @prepare() at the beginning of the suspend transition returned a
127 * positive number and the device was left in runtime suspend (without
128 * executing any suspend and resume callbacks for it), @complete() will be
129 * the only callback executed for the device during resume. In that case,
130 * @complete() must be prepared to do whatever is necessary to ensure the
131 * proper functioning of the device after the system resume. To this end,
132 * @complete() can check the power.direct_complete flag of the device to
133 * learn whether (unset) or not (set) the previous suspend and resume
134 * callbacks have been executed for it.
116 * 135 *
117 * @suspend: Executed before putting the system into a sleep state in which the 136 * @suspend: Executed before putting the system into a sleep state in which the
118 * contents of main memory are preserved. The exact action to perform 137 * contents of main memory are preserved. The exact action to perform
@@ -546,6 +565,7 @@ struct dev_pm_info {
546 bool is_late_suspended:1; 565 bool is_late_suspended:1;
547 bool ignore_children:1; 566 bool ignore_children:1;
548 bool early_init:1; /* Owned by the PM core */ 567 bool early_init:1; /* Owned by the PM core */
568 bool direct_complete:1; /* Owned by the PM core */
549 spinlock_t lock; 569 spinlock_t lock;
550#ifdef CONFIG_PM_SLEEP 570#ifdef CONFIG_PM_SLEEP
551 struct list_head entry; 571 struct list_head entry;
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 2a5897a4afbc..43fd6716f662 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -101,6 +101,11 @@ static inline bool pm_runtime_status_suspended(struct device *dev)
101 return dev->power.runtime_status == RPM_SUSPENDED; 101 return dev->power.runtime_status == RPM_SUSPENDED;
102} 102}
103 103
104static inline bool pm_runtime_suspended_if_enabled(struct device *dev)
105{
106 return pm_runtime_status_suspended(dev) && dev->power.disable_depth == 1;
107}
108
104static inline bool pm_runtime_enabled(struct device *dev) 109static inline bool pm_runtime_enabled(struct device *dev)
105{ 110{
106 return !dev->power.disable_depth; 111 return !dev->power.disable_depth;
@@ -150,6 +155,7 @@ static inline void device_set_run_wake(struct device *dev, bool enable) {}
150static inline bool pm_runtime_suspended(struct device *dev) { return false; } 155static inline bool pm_runtime_suspended(struct device *dev) { return false; }
151static inline bool pm_runtime_active(struct device *dev) { return true; } 156static inline bool pm_runtime_active(struct device *dev) { return true; }
152static inline bool pm_runtime_status_suspended(struct device *dev) { return false; } 157static inline bool pm_runtime_status_suspended(struct device *dev) { return false; }
158static inline bool pm_runtime_suspended_if_enabled(struct device *dev) { return false; }
153static inline bool pm_runtime_enabled(struct device *dev) { return false; } 159static inline bool pm_runtime_enabled(struct device *dev) { return false; }
154 160
155static inline void pm_runtime_no_callbacks(struct device *dev) {} 161static inline void pm_runtime_no_callbacks(struct device *dev) {}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index f4f2073711d3..df88d55dc436 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -35,7 +35,7 @@
35static int nocompress; 35static int nocompress;
36static int noresume; 36static int noresume;
37static int resume_wait; 37static int resume_wait;
38static int resume_delay; 38static unsigned int resume_delay;
39static char resume_file[256] = CONFIG_PM_STD_PARTITION; 39static char resume_file[256] = CONFIG_PM_STD_PARTITION;
40dev_t swsusp_resume_device; 40dev_t swsusp_resume_device;
41sector_t swsusp_resume_block; 41sector_t swsusp_resume_block;
@@ -228,19 +228,23 @@ static void platform_recover(int platform_mode)
228void swsusp_show_speed(struct timeval *start, struct timeval *stop, 228void swsusp_show_speed(struct timeval *start, struct timeval *stop,
229 unsigned nr_pages, char *msg) 229 unsigned nr_pages, char *msg)
230{ 230{
231 s64 elapsed_centisecs64; 231 u64 elapsed_centisecs64;
232 int centisecs; 232 unsigned int centisecs;
233 int k; 233 unsigned int k;
234 int kps; 234 unsigned int kps;
235 235
236 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); 236 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
237 /*
238 * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
239 * it is obvious enough for what went wrong.
240 */
237 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); 241 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
238 centisecs = elapsed_centisecs64; 242 centisecs = elapsed_centisecs64;
239 if (centisecs == 0) 243 if (centisecs == 0)
240 centisecs = 1; /* avoid div-by-zero */ 244 centisecs = 1; /* avoid div-by-zero */
241 k = nr_pages * (PAGE_SIZE / 1024); 245 k = nr_pages * (PAGE_SIZE / 1024);
242 kps = (k * 100) / centisecs; 246 kps = (k * 100) / centisecs;
243 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", 247 printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
244 msg, k, 248 msg, k,
245 centisecs / 100, centisecs % 100, 249 centisecs / 100, centisecs % 100,
246 kps / 1000, (kps % 1000) / 10); 250 kps / 1000, (kps % 1000) / 10);
@@ -595,7 +599,8 @@ static void power_down(void)
595 case HIBERNATION_PLATFORM: 599 case HIBERNATION_PLATFORM:
596 hibernation_platform_enter(); 600 hibernation_platform_enter();
597 case HIBERNATION_SHUTDOWN: 601 case HIBERNATION_SHUTDOWN:
598 kernel_power_off(); 602 if (pm_power_off)
603 kernel_power_off();
599 break; 604 break;
600#ifdef CONFIG_SUSPEND 605#ifdef CONFIG_SUSPEND
601 case HIBERNATION_SUSPEND: 606 case HIBERNATION_SUSPEND:
@@ -623,7 +628,8 @@ static void power_down(void)
623 * corruption after resume. 628 * corruption after resume.
624 */ 629 */
625 printk(KERN_CRIT "PM: Please power down manually\n"); 630 printk(KERN_CRIT "PM: Please power down manually\n");
626 while(1); 631 while (1)
632 cpu_relax();
627} 633}
628 634
629/** 635/**
@@ -1109,7 +1115,10 @@ static int __init resumewait_setup(char *str)
1109 1115
1110static int __init resumedelay_setup(char *str) 1116static int __init resumedelay_setup(char *str)
1111{ 1117{
1112 resume_delay = simple_strtoul(str, NULL, 0); 1118 int rc = kstrtouint(str, 0, &resume_delay);
1119
1120 if (rc)
1121 return rc;
1113 return 1; 1122 return 1;
1114} 1123}
1115 1124
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6271bc4073ef..573410d6647e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -279,26 +279,26 @@ static inline void pm_print_times_init(void) {}
279struct kobject *power_kobj; 279struct kobject *power_kobj;
280 280
281/** 281/**
282 * state - control system power state. 282 * state - control system sleep states.
283 * 283 *
284 * show() returns what states are supported, which is hard-coded to 284 * show() returns available sleep state labels, which may be "mem", "standby",
285 * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), 285 * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a
286 * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). 286 * description of what they mean.
287 * 287 *
288 * store() accepts one of those strings, translates it into the 288 * store() accepts one of those strings, translates it into the proper
289 * proper enumerated value, and initiates a suspend transition. 289 * enumerated value, and initiates a suspend transition.
290 */ 290 */
291static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 291static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
292 char *buf) 292 char *buf)
293{ 293{
294 char *s = buf; 294 char *s = buf;
295#ifdef CONFIG_SUSPEND 295#ifdef CONFIG_SUSPEND
296 int i; 296 suspend_state_t i;
297
298 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
299 if (pm_states[i].state)
300 s += sprintf(s,"%s ", pm_states[i].label);
297 301
298 for (i = 0; i < PM_SUSPEND_MAX; i++) {
299 if (pm_states[i] && valid_state(i))
300 s += sprintf(s,"%s ", pm_states[i]);
301 }
302#endif 302#endif
303#ifdef CONFIG_HIBERNATION 303#ifdef CONFIG_HIBERNATION
304 s += sprintf(s, "%s\n", "disk"); 304 s += sprintf(s, "%s\n", "disk");
@@ -314,7 +314,7 @@ static suspend_state_t decode_state(const char *buf, size_t n)
314{ 314{
315#ifdef CONFIG_SUSPEND 315#ifdef CONFIG_SUSPEND
316 suspend_state_t state = PM_SUSPEND_MIN; 316 suspend_state_t state = PM_SUSPEND_MIN;
317 const char * const *s; 317 struct pm_sleep_state *s;
318#endif 318#endif
319 char *p; 319 char *p;
320 int len; 320 int len;
@@ -328,8 +328,9 @@ static suspend_state_t decode_state(const char *buf, size_t n)
328 328
329#ifdef CONFIG_SUSPEND 329#ifdef CONFIG_SUSPEND
330 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) 330 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
331 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 331 if (s->state && len == strlen(s->label)
332 return state; 332 && !strncmp(buf, s->label, len))
333 return s->state;
333#endif 334#endif
334 335
335 return PM_SUSPEND_ON; 336 return PM_SUSPEND_ON;
@@ -447,8 +448,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
447 448
448#ifdef CONFIG_SUSPEND 449#ifdef CONFIG_SUSPEND
449 if (state < PM_SUSPEND_MAX) 450 if (state < PM_SUSPEND_MAX)
450 return sprintf(buf, "%s\n", valid_state(state) ? 451 return sprintf(buf, "%s\n", pm_states[state].state ?
451 pm_states[state] : "error"); 452 pm_states[state].label : "error");
452#endif 453#endif
453#ifdef CONFIG_HIBERNATION 454#ifdef CONFIG_HIBERNATION
454 return sprintf(buf, "disk\n"); 455 return sprintf(buf, "disk\n");
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 15f37ea08719..c60f13b5270a 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,17 +178,20 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
178 unsigned int, char *); 178 unsigned int, char *);
179 179
180#ifdef CONFIG_SUSPEND 180#ifdef CONFIG_SUSPEND
181struct pm_sleep_state {
182 const char *label;
183 suspend_state_t state;
184};
185
181/* kernel/power/suspend.c */ 186/* kernel/power/suspend.c */
182extern const char *const pm_states[]; 187extern struct pm_sleep_state pm_states[];
183 188
184extern bool valid_state(suspend_state_t state);
185extern int suspend_devices_and_enter(suspend_state_t state); 189extern int suspend_devices_and_enter(suspend_state_t state);
186#else /* !CONFIG_SUSPEND */ 190#else /* !CONFIG_SUSPEND */
187static inline int suspend_devices_and_enter(suspend_state_t state) 191static inline int suspend_devices_and_enter(suspend_state_t state)
188{ 192{
189 return -ENOSYS; 193 return -ENOSYS;
190} 194}
191static inline bool valid_state(suspend_state_t state) { return false; }
192#endif /* !CONFIG_SUSPEND */ 195#endif /* !CONFIG_SUSPEND */
193 196
194#ifdef CONFIG_PM_TEST_SUSPEND 197#ifdef CONFIG_PM_TEST_SUSPEND
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8233cd4047d7..338a6f147974 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,10 +31,10 @@
31 31
32#include "power.h" 32#include "power.h"
33 33
34const char *const pm_states[PM_SUSPEND_MAX] = { 34struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
35 [PM_SUSPEND_FREEZE] = "freeze", 35 [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE },
36 [PM_SUSPEND_STANDBY] = "standby", 36 [PM_SUSPEND_STANDBY] = { .label = "standby", },
37 [PM_SUSPEND_MEM] = "mem", 37 [PM_SUSPEND_MEM] = { .label = "mem", },
38}; 38};
39 39
40static const struct platform_suspend_ops *suspend_ops; 40static const struct platform_suspend_ops *suspend_ops;
@@ -54,9 +54,11 @@ static void freeze_begin(void)
54 54
55static void freeze_enter(void) 55static void freeze_enter(void)
56{ 56{
57 cpuidle_use_deepest_state(true);
57 cpuidle_resume(); 58 cpuidle_resume();
58 wait_event(suspend_freeze_wait_head, suspend_freeze_wake); 59 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
59 cpuidle_pause(); 60 cpuidle_pause();
61 cpuidle_use_deepest_state(false);
60} 62}
61 63
62void freeze_wake(void) 64void freeze_wake(void)
@@ -66,42 +68,62 @@ void freeze_wake(void)
66} 68}
67EXPORT_SYMBOL_GPL(freeze_wake); 69EXPORT_SYMBOL_GPL(freeze_wake);
68 70
71static bool valid_state(suspend_state_t state)
72{
73 /*
74 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
75 * support and need to be valid to the low level
76 * implementation, no valid callback implies that none are valid.
77 */
78 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
79}
80
81/*
82 * If this is set, the "mem" label always corresponds to the deepest sleep state
83 * available, the "standby" label corresponds to the second deepest sleep state
84 * available (if any), and the "freeze" label corresponds to the remaining
85 * available sleep state (if there is one).
86 */
87static bool relative_states;
88
89static int __init sleep_states_setup(char *str)
90{
91 relative_states = !strncmp(str, "1", 1);
92 if (relative_states) {
93 pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
94 pm_states[PM_SUSPEND_FREEZE].state = 0;
95 }
96 return 1;
97}
98
99__setup("relative_sleep_states=", sleep_states_setup);
100
69/** 101/**
70 * suspend_set_ops - Set the global suspend method table. 102 * suspend_set_ops - Set the global suspend method table.
71 * @ops: Suspend operations to use. 103 * @ops: Suspend operations to use.
72 */ 104 */
73void suspend_set_ops(const struct platform_suspend_ops *ops) 105void suspend_set_ops(const struct platform_suspend_ops *ops)
74{ 106{
107 suspend_state_t i;
108 int j = PM_SUSPEND_MAX - 1;
109
75 lock_system_sleep(); 110 lock_system_sleep();
111
76 suspend_ops = ops; 112 suspend_ops = ops;
113 for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
114 if (valid_state(i))
115 pm_states[j--].state = i;
116 else if (!relative_states)
117 pm_states[j--].state = 0;
118
119 pm_states[j--].state = PM_SUSPEND_FREEZE;
120 while (j >= PM_SUSPEND_MIN)
121 pm_states[j--].state = 0;
122
77 unlock_system_sleep(); 123 unlock_system_sleep();
78} 124}
79EXPORT_SYMBOL_GPL(suspend_set_ops); 125EXPORT_SYMBOL_GPL(suspend_set_ops);
80 126
81bool valid_state(suspend_state_t state)
82{
83 if (state == PM_SUSPEND_FREEZE) {
84#ifdef CONFIG_PM_DEBUG
85 if (pm_test_level != TEST_NONE &&
86 pm_test_level != TEST_FREEZER &&
87 pm_test_level != TEST_DEVICES &&
88 pm_test_level != TEST_PLATFORM) {
89 printk(KERN_WARNING "Unsupported pm_test mode for "
90 "freeze state, please choose "
91 "none/freezer/devices/platform.\n");
92 return false;
93 }
94#endif
95 return true;
96 }
97 /*
98 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
99 * support and need to be valid to the lowlevel
100 * implementation, no valid callback implies that none are valid.
101 */
102 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
103}
104
105/** 127/**
106 * suspend_valid_only_mem - Generic memory-only valid callback. 128 * suspend_valid_only_mem - Generic memory-only valid callback.
107 * 129 *
@@ -328,9 +350,17 @@ static int enter_state(suspend_state_t state)
328{ 350{
329 int error; 351 int error;
330 352
331 if (!valid_state(state)) 353 if (state == PM_SUSPEND_FREEZE) {
332 return -ENODEV; 354#ifdef CONFIG_PM_DEBUG
333 355 if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
356 pr_warning("PM: Unsupported test mode for freeze state,"
357 "please choose none/freezer/devices/platform.\n");
358 return -EAGAIN;
359 }
360#endif
361 } else if (!valid_state(state)) {
362 return -EINVAL;
363 }
334 if (!mutex_trylock(&pm_mutex)) 364 if (!mutex_trylock(&pm_mutex))
335 return -EBUSY; 365 return -EBUSY;
336 366
@@ -341,7 +371,7 @@ static int enter_state(suspend_state_t state)
341 sys_sync(); 371 sys_sync();
342 printk("done.\n"); 372 printk("done.\n");
343 373
344 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 374 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
345 error = suspend_prepare(state); 375 error = suspend_prepare(state);
346 if (error) 376 if (error)
347 goto Unlock; 377 goto Unlock;
@@ -349,7 +379,7 @@ static int enter_state(suspend_state_t state)
349 if (suspend_test(TEST_FREEZER)) 379 if (suspend_test(TEST_FREEZER))
350 goto Finish; 380 goto Finish;
351 381
352 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 382 pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
353 pm_restrict_gfp_mask(); 383 pm_restrict_gfp_mask();
354 error = suspend_devices_and_enter(state); 384 error = suspend_devices_and_enter(state);
355 pm_restore_gfp_mask(); 385 pm_restore_gfp_mask();
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 9b2a1d58558d..269b097e78ea 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
92 } 92 }
93 93
94 if (state == PM_SUSPEND_MEM) { 94 if (state == PM_SUSPEND_MEM) {
95 printk(info_test, pm_states[state]); 95 printk(info_test, pm_states[state].label);
96 status = pm_suspend(state); 96 status = pm_suspend(state);
97 if (status == -ENODEV) 97 if (status == -ENODEV)
98 state = PM_SUSPEND_STANDBY; 98 state = PM_SUSPEND_STANDBY;
99 } 99 }
100 if (state == PM_SUSPEND_STANDBY) { 100 if (state == PM_SUSPEND_STANDBY) {
101 printk(info_test, pm_states[state]); 101 printk(info_test, pm_states[state].label);
102 status = pm_suspend(state); 102 status = pm_suspend(state);
103 } 103 }
104 if (status < 0) 104 if (status < 0)
@@ -136,18 +136,16 @@ static char warn_bad_state[] __initdata =
136 136
137static int __init setup_test_suspend(char *value) 137static int __init setup_test_suspend(char *value)
138{ 138{
139 unsigned i; 139 suspend_state_t i;
140 140
141 /* "=mem" ==> "mem" */ 141 /* "=mem" ==> "mem" */
142 value++; 142 value++;
143 for (i = 0; i < PM_SUSPEND_MAX; i++) { 143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
144 if (!pm_states[i]) 144 if (!strcmp(pm_states[i].label, value)) {
145 continue; 145 test_state = pm_states[i].state;
146 if (strcmp(pm_states[i], value) != 0) 146 return 0;
147 continue; 147 }
148 test_state = (__force suspend_state_t) i; 148
149 return 0;
150 }
151 printk(warn_bad_state, value); 149 printk(warn_bad_state, value);
152 return 0; 150 return 0;
153} 151}
@@ -164,8 +162,8 @@ static int __init test_suspend(void)
164 /* PM is initialized by now; is that state testable? */ 162 /* PM is initialized by now; is that state testable? */
165 if (test_state == PM_SUSPEND_ON) 163 if (test_state == PM_SUSPEND_ON)
166 goto done; 164 goto done;
167 if (!valid_state(test_state)) { 165 if (!pm_states[test_state].state) {
168 printk(warn_bad_state, pm_states[test_state]); 166 printk(warn_bad_state, pm_states[test_state].label);
169 goto done; 167 goto done;
170 } 168 }
171 169
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f4390a079c7..a8f12247ce7c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -101,19 +101,13 @@ static int cpuidle_idle_call(void)
101 rcu_idle_enter(); 101 rcu_idle_enter();
102 102
103 /* 103 /*
104 * Check if the cpuidle framework is ready, otherwise fallback 104 * Ask the cpuidle framework to choose a convenient idle state.
105 * to the default arch specific idle method 105 * Fall back to the default arch specific idle method on errors.
106 */ 106 */
107 ret = cpuidle_enabled(drv, dev); 107 next_state = cpuidle_select(drv, dev);
108
109 if (!ret) {
110 /*
111 * Ask the governor to choose an idle state it thinks
112 * it is convenient to go to. There is *always* a
113 * convenient idle state
114 */
115 next_state = cpuidle_select(drv, dev);
116 108
109 ret = next_state;
110 if (ret >= 0) {
117 /* 111 /*
118 * The idle task must be scheduled, it is pointless to 112 * The idle task must be scheduled, it is pointless to
119 * go to idle, just update no idle residency and get 113 * go to idle, just update no idle residency and get
@@ -140,7 +134,7 @@ static int cpuidle_idle_call(void)
140 CLOCK_EVT_NOTIFY_BROADCAST_ENTER, 134 CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
141 &dev->cpu); 135 &dev->cpu);
142 136
143 if (!ret) { 137 if (ret >= 0) {
144 trace_cpu_idle_rcuidle(next_state, dev->cpu); 138 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145 139
146 /* 140 /*
@@ -175,7 +169,7 @@ static int cpuidle_idle_call(void)
175 * We can't use the cpuidle framework, let's use the default 169 * We can't use the cpuidle framework, let's use the default
176 * idle routine 170 * idle routine
177 */ 171 */
178 if (ret) 172 if (ret < 0)
179 arch_cpu_idle(); 173 arch_cpu_idle();
180 174
181 __current_set_polling(); 175 __current_set_polling();