authorLinus Torvalds <torvalds@linux-foundation.org>2010-05-20 12:03:55 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-05-20 12:03:55 -0400
commit46ee9645094ad1eb5b4888882ecaa1fb87dcd2a3 (patch)
treed0a48e993568b6a2415cfc21fc06eaa2fd886429
parentfa5312d9e87e7222c6c384c4e930dc149bc1178d (diff)
parent25f3a5a2854dce8b8413fd24cc9d5b9e3632be54 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/suspend-2.6
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/suspend-2.6:
  PM: PM QOS update fix
  Freezer / cgroup freezer: Update stale locking comments
  PM / platform_bus: Allow runtime PM by default
  i2c: Fix bus-level power management callbacks
  PM QOS update
  PM / Hibernate: Fix block_io.c printk warning
  PM / Hibernate: Group swap ops
  PM / Hibernate: Move the first_sector out of swsusp_write
  PM / Hibernate: Separate block_io
  PM / Hibernate: Snapshot cleanup
  FS / libfs: Implement simple_write_to_buffer
  PM / Hibernate: document open(/dev/snapshot) side effects
  PM / Runtime: Add sysfs debug files
  PM: Improve device power management document
  PM: Update device power management document
  PM: Allow runtime_suspend methods to call pm_schedule_suspend()
  PM: pm_wakeup - switch to using bool
-rw-r--r--  Documentation/power/devices.txt | 847
-rw-r--r--  Documentation/power/pm_qos_interface.txt | 48
-rw-r--r--  Documentation/power/userland-swsusp.txt | 4
-rw-r--r--  drivers/acpi/processor_idle.c | 2
-rw-r--r--  drivers/base/platform.c | 6
-rw-r--r--  drivers/base/power/runtime.c | 10
-rw-r--r--  drivers/base/power/sysfs.c | 65
-rw-r--r--  drivers/cpuidle/governors/ladder.c | 2
-rw-r--r--  drivers/cpuidle/governors/menu.c | 2
-rw-r--r--  drivers/i2c/i2c-core.c | 166
-rw-r--r--  drivers/net/e1000e/netdev.c | 22
-rw-r--r--  drivers/net/igbvf/netdev.c | 6
-rw-r--r--  drivers/net/wireless/ipw2x00/ipw2100.c | 11
-rw-r--r--  fs/libfs.c | 35
-rw-r--r--  include/linux/fs.h | 2
-rw-r--r--  include/linux/netdevice.h | 4
-rw-r--r--  include/linux/pm_qos_params.h | 14
-rw-r--r--  include/linux/pm_runtime.h | 7
-rw-r--r--  include/linux/pm_wakeup.h | 38
-rw-r--r--  include/sound/pcm.h | 3
-rw-r--r--  kernel/cgroup_freezer.c | 21
-rw-r--r--  kernel/pm_qos_params.c | 218
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/block_io.c | 103
-rw-r--r--  kernel/power/power.h | 27
-rw-r--r--  kernel/power/snapshot.c | 145
-rw-r--r--  kernel/power/swap.c | 333
-rw-r--r--  kernel/power/user.c | 37
-rw-r--r--  net/mac80211/mlme.c | 2
-rw-r--r--  sound/core/pcm.c | 3
-rw-r--r--  sound/core/pcm_native.c | 14
31 files changed, 1251 insertions(+), 949 deletions(-)
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index c9abbd86bc18..57080cd74575 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -1,7 +1,13 @@
Device Power Management

Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
Copyright (c) 2010 Alan Stern <stern@rowland.harvard.edu>


Most of the code in Linux is device drivers, so most of the Linux power
management (PM) code is also driver-specific.  Most drivers will do very
little; others, especially for platforms with small batteries (like cell
phones), will do a lot.

This writeup gives an overview of how drivers interact with system-wide
power management goals, emphasizing the models and interfaces that are
@@ -15,9 +21,10 @@ Drivers will use one or both of these models to put devices into low-power
states:

    System Sleep model:
        Drivers can enter low-power states as part of entering system-wide
        low-power states like "suspend" (also known as "suspend-to-RAM"), or
        (mostly for systems with disks) "hibernation" (also known as
        "suspend-to-disk").

        This is something that device, bus, and class drivers collaborate on
        by implementing various role-specific suspend and resume methods to
@@ -25,33 +32,41 @@ states:
        them without loss of data.

        Some drivers can manage hardware wakeup events, which make the system
        leave the low-power state.  This feature may be enabled or disabled
        using the relevant /sys/devices/.../power/wakeup file (for Ethernet
        drivers the ioctl interface used by ethtool may also be used for this
        purpose); enabling it may cost some power usage, but let the whole
        system enter low-power states more often.

    Runtime Power Management model:
        Devices may also be put into low-power states while the system is
        running, independently of other power management activity in principle.
        However, devices are not generally independent of each other (for
        example, a parent device cannot be suspended unless all of its child
        devices have been suspended).  Moreover, depending on the bus type the
        device is on, it may be necessary to carry out some bus-specific
        operations on the device for this purpose.  Devices put into low power
        states at run time may require special handling during system-wide power
        transitions (suspend or hibernation).

        For these reasons not only the device driver itself, but also the
        appropriate subsystem (bus type, device type or device class) driver and
        the PM core are involved in runtime power management.  As in the system
        sleep power management case, they need to collaborate by implementing
        various role-specific suspend and resume methods, so that the hardware
        is cleanly powered down and reactivated without data or service loss.

There's not a lot to be said about those low-power states except that they are
very system-specific, and often device-specific.  Also, that if enough devices
have been put into low-power states (at runtime), the effect may be very similar
to entering some system-wide low-power state (system sleep) ... and that
synergies exist, so that several drivers using runtime PM might put the system
into a state where even deeper power saving options are available.

Most suspended devices will have quiesced all I/O: no more DMA or IRQs (except
for wakeup events), no more data read or written, and requests from upstream
drivers are no longer accepted.  A given bus or platform may have different
requirements though.

Examples of hardware wakeup events include an alarm from a real time clock,
network wake-on-LAN packets, keyboard or mouse activity, and media insertion
@@ -60,129 +75,152 @@ or removal (for PCMCIA, MMC/SD, USB, and so on).

Interfaces for Entering System Sleep States
===========================================
There are programming interfaces provided for subsystems (bus type, device type,
device class) and device drivers to allow them to participate in the power
management of devices they are concerned with.  These interfaces cover both
system sleep and runtime power management.


Device Power Management Operations
----------------------------------
Device power management operations, at the subsystem level as well as at the
device driver level, are implemented by defining and populating objects of type
struct dev_pm_ops:

struct dev_pm_ops {
        int (*prepare)(struct device *dev);
        void (*complete)(struct device *dev);
        int (*suspend)(struct device *dev);
        int (*resume)(struct device *dev);
        int (*freeze)(struct device *dev);
        int (*thaw)(struct device *dev);
        int (*poweroff)(struct device *dev);
        int (*restore)(struct device *dev);
        int (*suspend_noirq)(struct device *dev);
        int (*resume_noirq)(struct device *dev);
        int (*freeze_noirq)(struct device *dev);
        int (*thaw_noirq)(struct device *dev);
        int (*poweroff_noirq)(struct device *dev);
        int (*restore_noirq)(struct device *dev);
        int (*runtime_suspend)(struct device *dev);
        int (*runtime_resume)(struct device *dev);
        int (*runtime_idle)(struct device *dev);
};

This structure is defined in include/linux/pm.h and the methods included in it
are also described in that file.  Their roles will be explained in what follows.
For now, it should be sufficient to remember that the last three methods are
specific to runtime power management while the remaining ones are used during
system-wide power transitions.
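
As an illustration, a simple driver might fill in only the system sleep methods
and point its driver's pm field at the structure.  This is a minimal sketch; the
foo_* names and the platform_driver used here are hypothetical, not taken from
this document:

static int foo_suspend(struct device *dev)
{
        /* quiesce the hardware and save whatever state is needed */
        return 0;
}

static int foo_resume(struct device *dev)
{
        /* restore state and reactivate the hardware */
        return 0;
}

static const struct dev_pm_ops foo_pm_ops = {
        .suspend = foo_suspend,
        .resume  = foo_resume,
};

static struct platform_driver foo_driver = {
        .driver = {
                .name = "foo",
                .pm   = &foo_pm_ops,
        },
        /* .probe and .remove omitted in this sketch */
};

Callbacks left NULL are treated as "nothing to do" for the corresponding phase.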

There also is a deprecated "old" or "legacy" interface for power management
operations available at least for some subsystems.  This approach does not use
struct dev_pm_ops objects and it is suitable only for implementing system sleep
power management methods.  Therefore it is not described in this document, so
please refer directly to the source code for more information about it.


Subsystem-Level Methods
-----------------------
The core methods to suspend and resume devices reside in struct dev_pm_ops
pointed to by the pm member of struct bus_type, struct device_type and
struct class.  They are mostly of interest to the people writing infrastructure
for buses, like PCI or USB, or device type and device class drivers.

Bus drivers implement these methods as appropriate for the hardware and the
drivers using it; PCI works differently from USB, and so on.  Not many people
write subsystem-level drivers; most driver code is a "device driver" that builds
on top of bus-specific framework code.

For more information on these driver calls, see the description later;
they are called in phases for every device, respecting the parent-child
sequencing in the driver model tree.


/sys/devices/.../power/wakeup files
-----------------------------------
All devices in the driver model have two flags to control handling of wakeup
events (hardware signals that can force the device and/or system out of a low
power state).  These flags are initialized by bus or device driver code using
device_set_wakeup_capable() and device_set_wakeup_enable(), defined in
include/linux/pm_wakeup.h.

The "can_wakeup" flag just records whether the device (and its driver) can
physically support wakeup events.  The device_set_wakeup_capable() routine
affects this flag.  The "should_wakeup" flag controls whether the device should
try to use its wakeup mechanism.  device_set_wakeup_enable() affects this flag;
for the most part drivers should not change its value.  The initial value of
should_wakeup is supposed to be false for the majority of devices; the major
exceptions are power buttons, keyboards, and Ethernet adapters whose WoL
(wake-on-LAN) feature has been set up with ethtool.

Whether or not a device is capable of issuing wakeup events is a hardware
matter, and the kernel is responsible for keeping track of it.  By contrast,
whether or not a wakeup-capable device should issue wakeup events is a policy
decision, and it is managed by user space through a sysfs attribute: the
power/wakeup file.  User space can write the strings "enabled" or "disabled" to
set or clear the should_wakeup flag, respectively.  Reads from the file will
return the corresponding string if can_wakeup is true, but if can_wakeup is
false then reads will return an empty string, to indicate that the device
doesn't support wakeup events.  (But even though the file appears empty, writes
will still affect the should_wakeup flag.)

The device_may_wakeup() routine returns true only if both flags are set.
Drivers should check this routine when putting devices in a low-power state
during a system sleep transition, to see whether or not to enable the devices'
wakeup mechanisms.  However for runtime power management, wakeup events should
be enabled whenever the device and driver both support them, regardless of the
should_wakeup flag.
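
To make the two flags concrete, here is a hedged sketch of driver code using
these calls (the bar_* names and the IRQ variable are illustrative, not taken
from this document):

static int bar_wakeup_irq;      /* hypothetical wakeup-capable interrupt */

static int bar_probe(struct device *dev)
{
        /* the hardware can physically signal wakeup, so set can_wakeup */
        device_set_wakeup_capable(dev, true);
        return 0;
}

static int bar_suspend(struct device *dev)
{
        /* ... quiesce the device here ... */

        /* arm the wakeup mechanism only if user space policy allows it */
        if (device_may_wakeup(dev))
                enable_irq_wake(bar_wakeup_irq);
        return 0;
}

The matching resume callback would call disable_irq_wake() to balance the
enable.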


/sys/devices/.../power/control files
------------------------------------
Each device in the driver model has a flag to control whether it is subject to
runtime power management.  This flag, called runtime_auto, is initialized by the
bus type (or generally subsystem) code using pm_runtime_allow() or
pm_runtime_forbid(); the default is to allow runtime power management.

The setting can be adjusted by user space by writing either "on" or "auto" to
the device's power/control sysfs file.  Writing "auto" calls pm_runtime_allow(),
setting the flag and allowing the device to be runtime power-managed by its
driver.  Writing "on" calls pm_runtime_forbid(), clearing the flag, returning
the device to full power if it was in a low-power state, and preventing the
device from being runtime power-managed.  User space can check the current value
of the runtime_auto flag by reading the file.

The device's runtime_auto flag has no effect on the handling of system-wide
power transitions.  In particular, the device can (and in the majority of cases
should and will) be put into a low-power state during a system-wide transition
to a sleep state even though its runtime_auto flag is clear.

For more information about the runtime power management framework, refer to
Documentation/power/runtime_pm.txt.
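
In kernel terms, those two writes map onto pm_runtime_allow() and
pm_runtime_forbid().  The sketch below shows how a subsystem might pick the
initial policy for a newly added device; the baz_* name is hypothetical:

static void baz_choose_runtime_pm_policy(struct device *dev, bool allow)
{
        if (allow)
                pm_runtime_allow(dev);          /* same effect as writing "auto" */
        else
                pm_runtime_forbid(dev);         /* same effect as writing "on" */
}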


Calling Drivers to Enter and Leave System Sleep States
======================================================
When the system goes into a sleep state, each device's driver is asked to
suspend the device by putting it into a state compatible with the target
system state.  That's usually some version of "off", but the details are
system-specific.  Also, wakeup-enabled devices will usually stay partly
functional in order to wake the system.

When the system leaves that low-power state, the device's driver is asked to
resume it by returning it to full power.  The suspend and resume operations
always go together, and both are multi-phase operations.

For simple drivers, suspend might quiesce the device using class code
and then turn its hardware as "off" as possible during suspend_noirq.  The
matching resume calls would then completely reinitialize the hardware
before reactivating its class I/O queues.

More power-aware drivers might prepare the devices for triggering system wakeup
events.


Call Sequence Guarantees
------------------------
To ensure that bridges and similar links needing to talk to a device are
available when the device is suspended or resumed, the device tree is
walked in a bottom-up order to suspend devices.  A top-down order is
used to resume those devices.
@@ -194,67 +232,310 @@ its parent; and can't be removed or suspended after that parent.
194The policy is that the device tree should match hardware bus topology. 232The policy is that the device tree should match hardware bus topology.
195(Or at least the control bus, for devices which use multiple busses.) 233(Or at least the control bus, for devices which use multiple busses.)
196In particular, this means that a device registration may fail if the parent of 234In particular, this means that a device registration may fail if the parent of
197the device is suspending (ie. has been chosen by the PM core as the next 235the device is suspending (i.e. has been chosen by the PM core as the next
198device to suspend) or has already suspended, as well as after all of the other 236device to suspend) or has already suspended, as well as after all of the other
199devices have been suspended. Device drivers must be prepared to cope with such 237devices have been suspended. Device drivers must be prepared to cope with such
200situations. 238situations.
201 239
202 240
203Suspending Devices 241System Power Management Phases
204------------------ 242------------------------------
205Suspending a given device is done in several phases. Suspending the 243Suspending or resuming the system is done in several phases. Different phases
206system always includes every phase, executing calls for every device 244are used for standby or memory sleep states ("suspend-to-RAM") and the
207before the next phase begins. Not all busses or classes support all 245hibernation state ("suspend-to-disk"). Each phase involves executing callbacks
208these callbacks; and not all drivers use all the callbacks. 246for every device before the next phase begins. Not all busses or classes
247support all these callbacks and not all drivers use all the callbacks. The
248various phases always run after tasks have been frozen and before they are
249unfrozen. Furthermore, the *_noirq phases run at a time when IRQ handlers have
250been disabled (except for those marked with the IRQ_WAKEUP flag).
209 251
210The phases are seen by driver notifications issued in this order: 252Most phases use bus, type, and class callbacks (that is, methods defined in
253dev->bus->pm, dev->type->pm, and dev->class->pm). The prepare and complete
254phases are exceptions; they use only bus callbacks. When multiple callbacks
255are used in a phase, they are invoked in the order: <class, type, bus> during
256power-down transitions and in the opposite order during power-up transitions.
257For example, during the suspend phase the PM core invokes
211 258
212 1 class.suspend(dev, message) is called after tasks are frozen, for 259 dev->class->pm.suspend(dev);
213 devices associated with a class that has such a method. This 260 dev->type->pm.suspend(dev);
214 method may sleep. 261 dev->bus->pm.suspend(dev);
215 262
216 Since I/O activity usually comes from such higher layers, this is 263before moving on to the next device, whereas during the resume phase the core
217 a good place to quiesce all drivers of a given type (and keep such 264invokes
218 code out of those drivers).
219 265
220 2 bus.suspend(dev, message) is called next. This method may sleep, 266 dev->bus->pm.resume(dev);
221 and is often morphed into a device driver call with bus-specific 267 dev->type->pm.resume(dev);
222 parameters and/or rules. 268 dev->class->pm.resume(dev);
223 269
224 This call should handle parts of device suspend logic that require 270These callbacks may in turn invoke device- or driver-specific methods stored in
225 sleeping. It probably does work to quiesce the device which hasn't 271dev->driver->pm, but they don't have to.
226 been abstracted into class.suspend().
227 272
228The pm_message_t parameter is currently used to refine those semantics
229(described later).
230 273
231At the end of those phases, drivers should normally have stopped all I/O 274Entering System Suspend
232transactions (DMA, IRQs), saved enough state that they can re-initialize 275-----------------------
233or restore previous state (as needed by the hardware), and placed the 276When the system goes into the standby or memory sleep state, the phases are:
234device into a low-power state. On many platforms they will also use 277
235clk_disable() to gate off one or more clock sources; sometimes they will 278 prepare, suspend, suspend_noirq.
236also switch off power supplies, or reduce voltages. Drivers which have 279
237runtime PM support may already have performed some or all of the steps 280 1. The prepare phase is meant to prevent races by preventing new devices
238needed to prepare for the upcoming system sleep state. 281 from being registered; the PM core would never know that all the
282 children of a device had been suspended if new children could be
283 registered at will. (By contrast, devices may be unregistered at any
284 time.) Unlike the other suspend-related phases, during the prepare
285 phase the device tree is traversed top-down.
286
287 The prepare phase uses only a bus callback. After the callback method
288 returns, no new children may be registered below the device. The method
289 may also prepare the device or driver in some way for the upcoming
290 system power transition, but it should not put the device into a
291 low-power state.
292
293 2. The suspend methods should quiesce the device to stop it from performing
294 I/O. They also may save the device registers and put it into the
295 appropriate low-power state, depending on the bus type the device is on,
296 and they may enable wakeup events.
297
298 3. The suspend_noirq phase occurs after IRQ handlers have been disabled,
299 which means that the driver's interrupt handler will not be called while
300 the callback method is running. The methods should save the values of
301 the device's registers that weren't saved previously and finally put the
302 device into the appropriate low-power state.
303
304 The majority of subsystems and device drivers need not implement this
305 callback. However, bus types allowing devices to share interrupt
306 vectors, like PCI, generally need it; otherwise a driver might encounter
307 an error during the suspend phase by fielding a shared interrupt
308 generated by some other device after its own device had been set to low
309 power.
310
311At the end of these phases, drivers should have stopped all I/O transactions
312(DMA, IRQs), saved enough state that they can re-initialize or restore previous
313state (as needed by the hardware), and placed the device into a low-power state.
314On many platforms they will gate off one or more clock sources; sometimes they
315will also switch off power supplies or reduce voltages. (Drivers supporting
316runtime PM may already have performed some or all of these steps.)
317
318If device_may_wakeup(dev) returns true, the device should be prepared for
319generating hardware wakeup signals to trigger a system wakeup event when the
320system is in the sleep state. For example, enable_irq_wake() might identify
321GPIO signals hooked up to a switch or other external hardware, and
322pci_enable_wake() does something similar for the PCI PME signal.
323
324If any of these callbacks returns an error, the system won't enter the desired
325low-power state. Instead the PM core will unwind its actions by resuming all
326the devices that were suspended.
327
328
329Leaving System Suspend
330----------------------
331When resuming from standby or memory sleep, the phases are:
332
333 resume_noirq, resume, complete.
334
335 1. The resume_noirq callback methods should perform any actions needed
336 before the driver's interrupt handlers are invoked. This generally
337 means undoing the actions of the suspend_noirq phase. If the bus type
338 permits devices to share interrupt vectors, like PCI, the method should
339 bring the device and its driver into a state in which the driver can
340 recognize if the device is the source of incoming interrupts, if any,
341 and handle them correctly.
342
343 For example, the PCI bus type's ->pm.resume_noirq() puts the device into
344 the full-power state (D0 in the PCI terminology) and restores the
345 standard configuration registers of the device. Then it calls the
346 device driver's ->pm.resume_noirq() method to perform device-specific
347 actions.
348
 349 2. The resume methods should bring the device back to its operating
350 state, so that it can perform normal I/O. This generally involves
351 undoing the actions of the suspend phase.
352
353 3. The complete phase uses only a bus callback. The method should undo the
354 actions of the prepare phase. Note, however, that new children may be
355 registered below the device as soon as the resume callbacks occur; it's
356 not necessary to wait until the complete phase.
357
358At the end of these phases, drivers should be as functional as they were before
359suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are
360gated on. Even if the device was in a low-power state before the system sleep
361because of runtime power management, afterwards it should be back in its
362full-power state. There are multiple reasons why it's best to do this; they are
363discussed in more detail in Documentation/power/runtime_pm.txt.
239 364
240When any driver sees that its device_can_wakeup(dev), it should make sure 365However, the details here may again be platform-specific. For example,
241to use the relevant hardware signals to trigger a system wakeup event. 366some systems support multiple "run" states, and the mode in effect at
242For example, enable_irq_wake() might identify GPIO signals hooked up to 367the end of resume might not be the one which preceded suspension.
243a switch or other external hardware, and pci_enable_wake() does something 368That means availability of certain clocks or power supplies changed,
244similar for PCI's PME# signal. 369which could easily affect how a driver works.
370
371Drivers need to be able to handle hardware which has been reset since the
372suspend methods were called, for example by complete reinitialization.
373This may be the hardest part, and the one most protected by NDA'd documents
374and chip errata. It's simplest if the hardware state hasn't changed since
 375 the suspend was carried out, but that can't be guaranteed (in fact, it usually
376is not the case).
377
378Drivers must also be prepared to notice that the device has been removed
379while the system was powered down, whenever that's physically possible.
380PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses
381where common Linux platforms will see such removal. Details of how drivers
382will notice and handle such removals are currently bus-specific, and often
383involve a separate thread.
384
385These callbacks may return an error value, but the PM core will ignore such
386errors since there's nothing it can do about them other than printing them in
387the system log.
388
389
390Entering Hibernation
391--------------------
392Hibernating the system is more complicated than putting it into the standby or
393memory sleep state, because it involves creating and saving a system image.
394Therefore there are more phases for hibernation, with a different set of
395callbacks. These phases always run after tasks have been frozen and memory has
396been freed.
397
398The general procedure for hibernation is to quiesce all devices (freeze), create
399an image of the system memory while everything is stable, reactivate all
400devices (thaw), write the image to permanent storage, and finally shut down the
401system (poweroff). The phases used to accomplish this are:
402
403 prepare, freeze, freeze_noirq, thaw_noirq, thaw, complete,
404 prepare, poweroff, poweroff_noirq
405
406 1. The prepare phase is discussed in the "Entering System Suspend" section
407 above.
408
409 2. The freeze methods should quiesce the device so that it doesn't generate
410 IRQs or DMA, and they may need to save the values of device registers.
411 However the device does not have to be put in a low-power state, and to
412 save time it's best not to do so. Also, the device should not be
413 prepared to generate wakeup events.
414
415 3. The freeze_noirq phase is analogous to the suspend_noirq phase discussed
416 above, except again that the device should not be put in a low-power
417 state and should not be allowed to generate wakeup events.
418
419At this point the system image is created. All devices should be inactive and
420the contents of memory should remain undisturbed while this happens, so that the
421image forms an atomic snapshot of the system state.
422
423 4. The thaw_noirq phase is analogous to the resume_noirq phase discussed
424 above. The main difference is that its methods can assume the device is
425 in the same state as at the end of the freeze_noirq phase.
426
427 5. The thaw phase is analogous to the resume phase discussed above. Its
428 methods should bring the device back to an operating state, so that it
429 can be used for saving the image if necessary.
430
431 6. The complete phase is discussed in the "Leaving System Suspend" section
432 above.
433
434At this point the system image is saved, and the devices then need to be
435prepared for the upcoming system shutdown. This is much like suspending them
436before putting the system into the standby or memory sleep state, and the phases
437are similar.
438
439 7. The prepare phase is discussed above.
440
441 8. The poweroff phase is analogous to the suspend phase.
442
443 9. The poweroff_noirq phase is analogous to the suspend_noirq phase.
444
445The poweroff and poweroff_noirq callbacks should do essentially the same things
446as the suspend and suspend_noirq callbacks. The only notable difference is that
447they need not store the device register values, because the registers should
448already have been stored during the freeze or freeze_noirq phases.
449
450
451Leaving Hibernation
452-------------------
453Resuming from hibernation is, again, more complicated than resuming from a sleep
454state in which the contents of main memory are preserved, because it requires
455a system image to be loaded into memory and the pre-hibernation memory contents
456to be restored before control can be passed back to the image kernel.
457
458Although in principle, the image might be loaded into memory and the
459pre-hibernation memory contents restored by the boot loader, in practice this
460can't be done because boot loaders aren't smart enough and there is no
461established protocol for passing the necessary information. So instead, the
462boot loader loads a fresh instance of the kernel, called the boot kernel, into
463memory and passes control to it in the usual way. Then the boot kernel reads
464the system image, restores the pre-hibernation memory contents, and passes
465control to the image kernel. Thus two different kernels are involved in
466resuming from hibernation. In fact, the boot kernel may be completely different
467from the image kernel: a different configuration and even a different version.
468This has important consequences for device drivers and their subsystems.
469
470To be able to load the system image into memory, the boot kernel needs to
471include at least a subset of device drivers allowing it to access the storage
472medium containing the image, although it doesn't need to include all of the
473drivers present in the image kernel. After the image has been loaded, the
474devices managed by the boot kernel need to be prepared for passing control back
475to the image kernel. This is very similar to the initial steps involved in
476creating a system image, and it is accomplished in the same way, using prepare,
477freeze, and freeze_noirq phases. However the devices affected by these phases
478are only those having drivers in the boot kernel; other devices will still be in
479whatever state the boot loader left them.
480
481Should the restoration of the pre-hibernation memory contents fail, the boot
482kernel would go through the "thawing" procedure described above, using the
483thaw_noirq, thaw, and complete phases, and then continue running normally. This
484happens only rarely. Most often the pre-hibernation memory contents are
485restored successfully and control is passed to the image kernel, which then
486becomes responsible for bringing the system back to the working state.
487
488To achieve this, the image kernel must restore the devices' pre-hibernation
489functionality. The operation is much like waking up from the memory sleep
490state, although it involves different phases:
491
492 restore_noirq, restore, complete
493
494 1. The restore_noirq phase is analogous to the resume_noirq phase.
495
496 2. The restore phase is analogous to the resume phase.
497
498 3. The complete phase is discussed above.
499
500The main difference from resume[_noirq] is that restore[_noirq] must assume the
501device has been accessed and reconfigured by the boot loader or the boot kernel.
502Consequently the state of the device may be different from the state remembered
503from the freeze and freeze_noirq phases. The device may even need to be reset
504and completely re-initialized. In many cases this difference doesn't matter, so
 505the resume[_noirq] and restore[_noirq] method pointers can be set to the same
506routines. Nevertheless, different callback pointers are used in case there is a
507situation where it actually matters.
245 508
246If a driver (or bus, or class) fails it suspend method, the system won't
247enter the desired low power state; it will resume all the devices it's
248suspended so far.
249 509
250Note that drivers may need to perform different actions based on the target 510System Devices
251system lowpower/sleep state. At this writing, there are only platform 511--------------
252specific APIs through which drivers could determine those target states. 512System devices (sysdevs) follow a slightly different API, which can be found in
513
514 include/linux/sysdev.h
515 drivers/base/sys.c
516
517System devices will be suspended with interrupts disabled, and after all other
518devices have been suspended. On resume, they will be resumed before any other
519devices, and also with interrupts disabled. These things occur in special
520"sysdev_driver" phases, which affect only system devices.
521
522Thus, after the suspend_noirq (or freeze_noirq or poweroff_noirq) phase, when
523the non-boot CPUs are all offline and IRQs are disabled on the remaining online
524CPU, then a sysdev_driver.suspend phase is carried out, and the system enters a
525sleep state (or a system image is created). During resume (or after the image
526has been created or loaded) a sysdev_driver.resume phase is carried out, IRQs
527are enabled on the only online CPU, the non-boot CPUs are enabled, and the
528resume_noirq (or thaw_noirq or restore_noirq) phase begins.
529
530Code to actually enter and exit the system-wide low power state sometimes
531involves hardware details that are only known to the boot firmware, and
532may leave a CPU running software (from SRAM or flash memory) that monitors
533the system and manages its wakeup sequence.


Device Low Power (suspend) States
---------------------------------
Device low-power states aren't standard.  One device might only handle
"on" and "off", while another might support a dozen different versions of
"on" (how many engines are active?), plus a state that gets back to "on"
faster than from a full "off".
@@ -265,7 +546,7 @@ PCI device may not perform DMA or issue IRQs, and any wakeup events it
issues would be issued through the PME# bus signal.  Plus, there are
several PCI-standard device states, some of which are optional.

In contrast, integrated system-on-chip processors often use IRQs as the
wakeup event sources (so drivers would call enable_irq_wake) and might
be able to treat DMA completion as a wakeup event (sometimes DMA can stay
active too, it'd only be the CPU and some peripherals that sleep).
@@ -284,120 +565,17 @@ ways; the aforementioned LCD might be active in one product's "standby",
284but a different product using the same SOC might work differently. 565but a different product using the same SOC might work differently.
285 566
286 567
287Meaning of pm_message_t.event 568Power Management Notifiers
288----------------------------- 569--------------------------
289Parameters to suspend calls include the device affected and a message of 570There are some operations that cannot be carried out by the power management
290type pm_message_t, which has one field: the event. If driver does not 571callbacks discussed above, because the callbacks occur too late or too early.
291recognize the event code, suspend calls may abort the request and return 572To handle these cases, subsystems and device drivers may register power
292a negative errno. However, most drivers will be fine if they implement 573management notifiers that are called before tasks are frozen and after they have
293PM_EVENT_SUSPEND semantics for all messages. 574been thawed. Generally speaking, the PM notifiers are suitable for performing
575actions that either require user space to be available, or at least won't
576interfere with user space.
294 577
295The event codes are used to refine the goal of suspending the device, and 578For details refer to Documentation/power/notifiers.txt.
296mostly matter when creating or resuming system memory image snapshots, as
297used with suspend-to-disk:
298
299 PM_EVENT_SUSPEND -- quiesce the driver and put hardware into a low-power
300 state. When used with system sleep states like "suspend-to-RAM" or
301 "standby", the upcoming resume() call will often be able to rely on
302 state kept in hardware, or issue system wakeup events.
303
304 PM_EVENT_HIBERNATE -- Put hardware into a low-power state and enable wakeup
305 events as appropriate. It is only used with hibernation
306 (suspend-to-disk) and few devices are able to wake up the system from
307 this state; most are completely powered off.
308
309 PM_EVENT_FREEZE -- quiesce the driver, but don't necessarily change into
310 any low power mode. A system snapshot is about to be taken, often
311 followed by a call to the driver's resume() method. Neither wakeup
312 events nor DMA are allowed.
313
314 PM_EVENT_PRETHAW -- quiesce the driver, knowing that the upcoming resume()
315 will restore a suspend-to-disk snapshot from a different kernel image.
316 Drivers that are smart enough to look at their hardware state during
317 resume() processing need that state to be correct ... a PRETHAW could
318 be used to invalidate that state (by resetting the device), like a
319 shutdown() invocation would before a kexec() or system halt. Other
320 drivers might handle this the same way as PM_EVENT_FREEZE. Neither
321 wakeup events nor DMA are allowed.
322
323To enter "standby" (ACPI S1) or "Suspend to RAM" (STR, ACPI S3) states, or
324the similarly named APM states, only PM_EVENT_SUSPEND is used; the other event
325codes are used for hibernation ("Suspend to Disk", STD, ACPI S4).
326
327There's also PM_EVENT_ON, a value which never appears as a suspend event
328but is sometimes used to record the "not suspended" device state.
329
330
331Resuming Devices
332----------------
333Resuming is done in multiple phases, much like suspending, with all
334devices processing each phase's calls before the next phase begins.
335
336The phases are seen by driver notifications issued in this order:
337
338 1 bus.resume(dev) reverses the effects of bus.suspend(). This may
339 be morphed into a device driver call with bus-specific parameters;
340 implementations may sleep.
341
342 2 class.resume(dev) is called for devices associated with a class
343 that has such a method. Implementations may sleep.
344
345 This reverses the effects of class.suspend(), and would usually
346 reactivate the device's I/O queue.
347
348At the end of those phases, drivers should normally be as functional as
349they were before suspending: I/O can be performed using DMA and IRQs, and
350the relevant clocks are gated on. The device need not be "fully on"; it
351might be in a runtime lowpower/suspend state that acts as if it were.
352
353However, the details here may again be platform-specific. For example,
354some systems support multiple "run" states, and the mode in effect at
355the end of resume() might not be the one which preceded suspension.
356That means availability of certain clocks or power supplies changed,
357which could easily affect how a driver works.
358
359
360Drivers need to be able to handle hardware which has been reset since the
361suspend methods were called, for example by complete reinitialization.
362This may be the hardest part, and the one most protected by NDA'd documents
363and chip errata. It's simplest if the hardware state hasn't changed since
364the suspend() was called, but that can't always be guaranteed.
365
366Drivers must also be prepared to notice that the device has been removed
367while the system was powered off, whenever that's physically possible.
368PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses
369where common Linux platforms will see such removal. Details of how drivers
370will notice and handle such removals are currently bus-specific, and often
371involve a separate thread.
372
373
374Note that the bus-specific runtime PM wakeup mechanism can exist, and might
375be defined to share some of the same driver code as for system wakeup. For
376example, a bus-specific device driver's resume() method might be used there,
377so it wouldn't only be called from bus.resume() during system-wide wakeup.
378See bus-specific information about how runtime wakeup events are handled.
379
380
381System Devices
382--------------
383System devices follow a slightly different API, which can be found in
384
385 include/linux/sysdev.h
386 drivers/base/sys.c
387
388System devices will only be suspended with interrupts disabled, and after
389all other devices have been suspended. On resume, they will be resumed
390before any other devices, and also with interrupts disabled.
391
392That is, IRQs are disabled, the suspend_late() phase begins, then the
393sysdev_driver.suspend() phase, and the system enters a sleep state. Then
394the sysdev_driver.resume() phase begins, followed by the resume_early()
395phase, after which IRQs are enabled.
396
397Code to actually enter and exit the system-wide low power state sometimes
398involves hardware details that are only known to the boot firmware, and
399may leave a CPU running software (from SRAM or flash memory) that monitors
400the system and manages its wakeup sequence.
401 579
402 580
403Runtime Power Management 581Runtime Power Management
@@ -407,82 +585,23 @@ running. This feature is useful for devices that are not being used, and
407can offer significant power savings on a running system. These devices 585can offer significant power savings on a running system. These devices
408often support a range of runtime power states, which might use names such 586often support a range of runtime power states, which might use names such
409as "off", "sleep", "idle", "active", and so on. Those states will in some 587as "off", "sleep", "idle", "active", and so on. Those states will in some
410cases (like PCI) be partially constrained by a bus the device uses, and will 588cases (like PCI) be partially constrained by the bus the device uses, and will
411usually include hardware states that are also used in system sleep states. 589usually include hardware states that are also used in system sleep states.
412 590
413However, note that if a driver puts a device into a runtime low power state 591A system-wide power transition can be started while some devices are in low
414and the system then goes into a system-wide sleep state, it normally ought 592power states due to runtime power management. The system sleep PM callbacks
415to resume into that runtime low power state rather than "full on". Such 593should recognize such situations and react to them appropriately, but the
416distinctions would be part of the driver-internal state machine for that 594necessary actions are subsystem-specific.
417hardware; the whole point of runtime power management is to be sure that 595
418drivers are decoupled in that way from the state machine governing phases 596In some cases the decision may be made at the subsystem level while in other
419of the system-wide power/sleep state transitions. 597cases the device driver may be left to decide. In some cases it may be
420 598desirable to leave a suspended device in that state during a system-wide power
421 599transition, but in other cases the device must be put back into the full-power
422Power Saving Techniques 600state temporarily, for example so that its system wakeup capability can be
423----------------------- 601disabled. This all depends on the hardware and the design of the subsystem and
424Normally runtime power management is handled by the drivers without specific 602device driver in question.
425userspace or kernel intervention, by device-aware use of techniques like: 603
426 604During system-wide resume from a sleep state it's best to put devices into the
427 Using information provided by other system layers 605full-power state, as explained in Documentation/power/runtime_pm.txt. Refer to
428 - stay deeply "off" except between open() and close() 606that document for more information regarding this particular issue as well as
429 - if transceiver/PHY indicates "nobody connected", stay "off" 607for information on the device runtime power management framework in general.
430 - application protocols may include power commands or hints
431
432 Using fewer CPU cycles
433 - using DMA instead of PIO
434 - removing timers, or making them lower frequency
435 - shortening "hot" code paths
436 - eliminating cache misses
437 - (sometimes) offloading work to device firmware
438
439 Reducing other resource costs
440 - gating off unused clocks in software (or hardware)
441 - switching off unused power supplies
442 - eliminating (or delaying/merging) IRQs
443 - tuning DMA to use word and/or burst modes
444
445 Using device-specific low power states
446 - using lower voltages
447 - avoiding needless DMA transfers
448
449Read your hardware documentation carefully to see the opportunities that
450may be available. If you can, measure the actual power usage and check
451it against the budget established for your project.
452
453
454Examples: USB hosts, system timer, system CPU
455----------------------------------------------
456USB host controllers make interesting, if complex, examples. In many cases
457these have no work to do: no USB devices are connected, or all of them are
458in the USB "suspend" state. Linux host controller drivers can then disable
459periodic DMA transfers that would otherwise be a constant power drain on the
460memory subsystem, and enter a suspend state. In power-aware controllers,
461entering that suspend state may disable the clock used with USB signaling,
462saving a certain amount of power.
463
464The controller will be woken from that state (with an IRQ) by changes to the
465signal state on the data lines of a given port, for example by an existing
466peripheral requesting "remote wakeup" or by plugging a new peripheral. The
467same wakeup mechanism usually works from "standby" sleep states, and on some
468systems also from "suspend to RAM" (or even "suspend to disk") states.
469(Except that ACPI may be involved instead of normal IRQs, on some hardware.)
470
471System devices like timers and CPUs may have special roles in the platform
472power management scheme. For example, system timers using a "dynamic tick"
473approach don't just save CPU cycles (by eliminating needless timer IRQs),
474but they may also open the door to using lower power CPU "idle" states that
475cost more than a jiffie to enter and exit. On x86 systems these are states
476like "C3"; note that periodic DMA transfers from a USB host controller will
477also prevent entry to a C3 state, much like a periodic timer IRQ.
478
479That kind of runtime mechanism interaction is common. "System On Chip" (SOC)
480processors often have low power idle modes that can't be entered unless
481certain medium-speed clocks (often 12 or 48 MHz) are gated off. When the
482drivers gate those clocks effectively, then the system idle task may be able
483to use the lower power idle modes and thereby increase battery life.
484
485If the CPU can have a "cpufreq" driver, there also may be opportunities
486to shift to lower voltage settings and reduce the power cost of executing
487a given number of instructions. (Without voltage adjustment, it's rare
488for cpufreq to save much power; the cost-per-instruction must go down.)
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
index c40866e8b957..bfed898a03fc 100644
--- a/Documentation/power/pm_qos_interface.txt
+++ b/Documentation/power/pm_qos_interface.txt
@@ -18,44 +18,46 @@ and pm_qos_params.h. This is done because having the available parameters
18being runtime configurable or changeable from a driver was seen as too easy to 18being runtime configurable or changeable from a driver was seen as too easy to
19abuse. 19abuse.
20 20
21For each parameter a list of performance requirements is maintained along with 21For each parameter a list of performance requests is maintained along with
22an aggregated target value. The aggregated target value is updated with 22an aggregated target value. The aggregated target value is updated with
23changes to the requirement list or elements of the list. Typically the 23changes to the request list or elements of the list. Typically the
24aggregated target value is simply the max or min of the requirement values held 24aggregated target value is simply the max or min of the request values held
25in the parameter list elements. 25in the parameter list elements.
26 26
27From kernel mode the use of this interface is simple: 27From kernel mode the use of this interface is simple:
28pm_qos_add_requirement(param_id, name, target_value):
29Will insert a named element in the list for that identified PM_QOS parameter
30with the target value. Upon change to this list the new target is recomputed
31and any registered notifiers are called only if the target value is now
32different.
33 28
34pm_qos_update_requirement(param_id, name, new_target_value): 29handle = pm_qos_add_request(param_class, target_value):
35Will search the list identified by the param_id for the named list element and 30Will insert an element into the list for that identified PM_QOS class with the
36then update its target value, calling the notification tree if the aggregated 31target value. Upon change to this list the new target is recomputed and any
37target is changed. with that name is already registered. 32registered notifiers are called only if the target value is now different.
33Clients of pm_qos need to save the returned handle.
38 34
39pm_qos_remove_requirement(param_id, name): 35void pm_qos_update_request(handle, new_target_value):
40Will search the identified list for the named element and remove it, after 36Will update the list element pointed to by the handle with the new target value
41removal it will update the aggregate target and call the notification tree if 37and recompute the new aggregated target, calling the notification tree if the
42the target was changed as a result of removing the named requirement. 38target is changed.
39
40void pm_qos_remove_request(handle):
41Will remove the element. After removal it will update the aggregate target and
42call the notification tree if the target was changed as a result of removing
43the request.
43 44
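A minimal in-kernel sketch of the handle-based calls described above; the foo_* names are invented, but pm_qos_add_request(), pm_qos_update_request() and pm_qos_remove_request() are exactly the interfaces introduced here.

    #include <linux/errno.h>
    #include <linux/pm_qos_params.h>

    static struct pm_qos_request_list *foo_qos_req;

    static int foo_start(void)
    {
        /* Register with no real constraint first. */
        foo_qos_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
                                         PM_QOS_DEFAULT_VALUE);
        if (!foo_qos_req)
            return -ENOMEM;

        /* Later, when latency matters, tighten the request to 55 usec. */
        pm_qos_update_request(foo_qos_req, 55);
        return 0;
    }

    static void foo_stop(void)
    {
        /* Drop the request and free the handle. */
        pm_qos_remove_request(foo_qos_req);
        foo_qos_req = NULL;
    }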
44 45
45From user mode: 46From user mode:
46Only processes can register a pm_qos requirement. To provide for automatic 47Only processes can register a pm_qos request. To provide for automatic
47cleanup for process the interface requires the process to register its 48cleanup of a process, the interface requires the process to register its
48parameter requirements in the following way: 49parameter requests in the following way:
49 50
50To register the default pm_qos target for the specific parameter, the process 51To register the default pm_qos target for the specific parameter, the process
51must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] 52must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
52 53
53As long as the device node is held open that process has a registered 54As long as the device node is held open that process has a registered
54requirement on the parameter. The name of the requirement is "process_<PID>" 55request on the parameter.
55derived from the current->pid from within the open system call.
56 56
57To change the requested target value the process needs to write a s32 value to 57To change the requested target value the process needs to write an s32 value to
58the open device node. This translates to a pm_qos_update_requirement call. 58the open device node. Alternatively the user mode program could write a hex
59string for the value using the 10-character format, e.g. "0x12345678". This
60translates to a pm_qos_update_request call.
59 61
60To remove the user mode request for a target value simply close the device 62To remove the user mode request for a target value simply close the device
61node. 63node.
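A hedged user-space sketch of the procedure just described: open the class node, write either a raw s32 or the 10-character hex string, and keep the file descriptor open for as long as the request should stay in force.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int32_t latency_us = 20;    /* requested cpu_dma_latency, in usec */
        int fd;

        /* open() itself registers a default request for this process. */
        fd = open("/dev/cpu_dma_latency", O_WRONLY);
        if (fd < 0) {
            perror("open /dev/cpu_dma_latency");
            return 1;
        }

        /* Writing a binary s32 updates the request. */
        if (write(fd, &latency_us, sizeof(latency_us)) != sizeof(latency_us)) {
            perror("write");
            close(fd);
            return 1;
        }

        sleep(60);    /* the request stays active while the fd is open */

        close(fd);    /* closing the node removes the request */
        return 0;
    }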
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
index b967cd9137d6..81680f9f5909 100644
--- a/Documentation/power/userland-swsusp.txt
+++ b/Documentation/power/userland-swsusp.txt
@@ -24,6 +24,10 @@ assumed to be in the resume mode. The device cannot be open for simultaneous
24reading and writing. It is also impossible to have the device open more than 24reading and writing. It is also impossible to have the device open more than
25once at a time. 25once at a time.
26 26
27Even opening the device has side effects. Data structures are
28allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
29called.
30
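A hedged illustration of the note above: even this trivial program, which issues no ioctl at all, allocates the snapshot data structures and runs the PM_HIBERNATION_PREPARE chain at open() time, with the matching cleanup happening at close().

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* Opening read-only already has the side effects described above. */
        int fd = open("/dev/snapshot", O_RDONLY);

        if (fd < 0) {
            perror("open /dev/snapshot");
            return 1;
        }
        close(fd);    /* frees the structures and runs the POST notifiers */
        return 0;
    }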
27The ioctl() commands recognized by the device are: 31The ioctl() commands recognized by the device are:
28 32
29SNAPSHOT_FREEZE - freeze user space processes (the current process is 33SNAPSHOT_FREEZE - freeze user space processes (the current process is
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 5939e7f7d8e9..c3817e1f32c7 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -698,7 +698,7 @@ static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset)
698 "max_cstate: C%d\n" 698 "max_cstate: C%d\n"
699 "maximum allowed latency: %d usec\n", 699 "maximum allowed latency: %d usec\n",
700 pr->power.state ? pr->power.state - pr->power.states : 0, 700 pr->power.state ? pr->power.state - pr->power.states : 0,
701 max_cstate, pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)); 701 max_cstate, pm_qos_request(PM_QOS_CPU_DMA_LATENCY));
702 702
703 seq_puts(seq, "states:\n"); 703 seq_puts(seq, "states:\n");
704 704
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 765bcf0df3bb..ada6397c23a5 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -967,17 +967,17 @@ static int platform_pm_restore_noirq(struct device *dev)
967 967
968int __weak platform_pm_runtime_suspend(struct device *dev) 968int __weak platform_pm_runtime_suspend(struct device *dev)
969{ 969{
970 return -ENOSYS; 970 return pm_generic_runtime_suspend(dev);
971}; 971};
972 972
973int __weak platform_pm_runtime_resume(struct device *dev) 973int __weak platform_pm_runtime_resume(struct device *dev)
974{ 974{
975 return -ENOSYS; 975 return pm_generic_runtime_resume(dev);
976}; 976};
977 977
978int __weak platform_pm_runtime_idle(struct device *dev) 978int __weak platform_pm_runtime_idle(struct device *dev)
979{ 979{
980 return -ENOSYS; 980 return pm_generic_runtime_idle(dev);
981}; 981};
982 982
983#else /* !CONFIG_PM_RUNTIME */ 983#else /* !CONFIG_PM_RUNTIME */
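With this change a platform driver no longer gets -ENOSYS from the bus when runtime PM is used; the weak callbacks above fall through to pm_generic_runtime_*(), which invoke the driver's own dev_pm_ops methods. A hedged sketch (foo_* names invented) of what such a driver might provide:

    #include <linux/module.h>
    #include <linux/platform_device.h>
    #include <linux/pm_runtime.h>

    static int foo_runtime_suspend(struct device *dev)
    {
        /* ... put the controller into its low-power state ... */
        return 0;
    }

    static int foo_runtime_resume(struct device *dev)
    {
        /* ... restore context and re-enable the controller ... */
        return 0;
    }

    static const struct dev_pm_ops foo_pm_ops = {
        .runtime_suspend = foo_runtime_suspend,
        .runtime_resume  = foo_runtime_resume,
    };

    static int foo_probe(struct platform_device *pdev)
    {
        pm_runtime_enable(&pdev->dev);    /* let the core invoke the ops */
        return 0;
    }

    static int foo_remove(struct platform_device *pdev)
    {
        pm_runtime_disable(&pdev->dev);
        return 0;
    }

    static struct platform_driver foo_driver = {
        .probe  = foo_probe,
        .remove = foo_remove,
        .driver = {
            .name  = "foo",
            .owner = THIS_MODULE,
            .pm    = &foo_pm_ops,
        },
    };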
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 626dd147b75f..b0ec0e9f27e9 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -229,14 +229,16 @@ int __pm_runtime_suspend(struct device *dev, bool from_wq)
229 229
230 if (retval) { 230 if (retval) {
231 dev->power.runtime_status = RPM_ACTIVE; 231 dev->power.runtime_status = RPM_ACTIVE;
232 pm_runtime_cancel_pending(dev);
233
234 if (retval == -EAGAIN || retval == -EBUSY) { 232 if (retval == -EAGAIN || retval == -EBUSY) {
235 notify = true; 233 if (dev->power.timer_expires == 0)
234 notify = true;
236 dev->power.runtime_error = 0; 235 dev->power.runtime_error = 0;
236 } else {
237 pm_runtime_cancel_pending(dev);
237 } 238 }
238 } else { 239 } else {
239 dev->power.runtime_status = RPM_SUSPENDED; 240 dev->power.runtime_status = RPM_SUSPENDED;
241 pm_runtime_deactivate_timer(dev);
240 242
241 if (dev->parent) { 243 if (dev->parent) {
242 parent = dev->parent; 244 parent = dev->parent;
@@ -659,8 +661,6 @@ int pm_schedule_suspend(struct device *dev, unsigned int delay)
659 661
660 if (dev->power.runtime_status == RPM_SUSPENDED) 662 if (dev->power.runtime_status == RPM_SUSPENDED)
661 retval = 1; 663 retval = 1;
662 else if (dev->power.runtime_status == RPM_SUSPENDING)
663 retval = -EINPROGRESS;
664 else if (atomic_read(&dev->power.usage_count) > 0 664 else if (atomic_read(&dev->power.usage_count) > 0
665 || dev->power.disable_depth > 0) 665 || dev->power.disable_depth > 0)
666 retval = -EAGAIN; 666 retval = -EAGAIN;
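One effect of the runtime.c changes above is that a ->runtime_suspend() callback may now schedule its own retry: the -EBUSY/-EAGAIN return no longer cancels a timer armed from inside the callback. A hedged sketch of that pattern, with invented foo_* names and retry interval:

    #include <linux/device.h>
    #include <linux/errno.h>
    #include <linux/pm_runtime.h>

    /* Hypothetical per-device state for the sketch. */
    struct foo_dev {
        bool busy;    /* set while a transfer is in flight */
    };

    #define FOO_RETRY_DELAY_MS    100    /* made-up retry interval */

    static int foo_runtime_suspend(struct device *dev)
    {
        struct foo_dev *foo = dev_get_drvdata(dev);

        if (foo->busy) {
            /* Not idle yet: arm a retry and report it; the timer we
             * schedule here now survives the error return. */
            pm_schedule_suspend(dev, FOO_RETRY_DELAY_MS);
            return -EBUSY;
        }

        /* ... really power the device down ... */
        return 0;
    }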
diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index 86fd9373447e..a4c33bc51257 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -5,6 +5,7 @@
5#include <linux/device.h> 5#include <linux/device.h>
6#include <linux/string.h> 6#include <linux/string.h>
7#include <linux/pm_runtime.h> 7#include <linux/pm_runtime.h>
8#include <asm/atomic.h>
8#include "power.h" 9#include "power.h"
9 10
10/* 11/*
@@ -143,7 +144,59 @@ wake_store(struct device * dev, struct device_attribute *attr,
143 144
144static DEVICE_ATTR(wakeup, 0644, wake_show, wake_store); 145static DEVICE_ATTR(wakeup, 0644, wake_show, wake_store);
145 146
146#ifdef CONFIG_PM_SLEEP_ADVANCED_DEBUG 147#ifdef CONFIG_PM_ADVANCED_DEBUG
148#ifdef CONFIG_PM_RUNTIME
149
150static ssize_t rtpm_usagecount_show(struct device *dev,
151 struct device_attribute *attr, char *buf)
152{
153 return sprintf(buf, "%d\n", atomic_read(&dev->power.usage_count));
154}
155
156static ssize_t rtpm_children_show(struct device *dev,
157 struct device_attribute *attr, char *buf)
158{
159 return sprintf(buf, "%d\n", dev->power.ignore_children ?
160 0 : atomic_read(&dev->power.child_count));
161}
162
163static ssize_t rtpm_enabled_show(struct device *dev,
164 struct device_attribute *attr, char *buf)
165{
166 if ((dev->power.disable_depth) && (dev->power.runtime_auto == false))
167 return sprintf(buf, "disabled & forbidden\n");
168 else if (dev->power.disable_depth)
169 return sprintf(buf, "disabled\n");
170 else if (dev->power.runtime_auto == false)
171 return sprintf(buf, "forbidden\n");
172 return sprintf(buf, "enabled\n");
173}
174
175static ssize_t rtpm_status_show(struct device *dev,
176 struct device_attribute *attr, char *buf)
177{
178 if (dev->power.runtime_error)
179 return sprintf(buf, "error\n");
180 switch (dev->power.runtime_status) {
181 case RPM_SUSPENDED:
182 return sprintf(buf, "suspended\n");
183 case RPM_SUSPENDING:
184 return sprintf(buf, "suspending\n");
185 case RPM_RESUMING:
186 return sprintf(buf, "resuming\n");
187 case RPM_ACTIVE:
188 return sprintf(buf, "active\n");
189 }
190 return -EIO;
191}
192
193static DEVICE_ATTR(runtime_usage, 0444, rtpm_usagecount_show, NULL);
194static DEVICE_ATTR(runtime_active_kids, 0444, rtpm_children_show, NULL);
195static DEVICE_ATTR(runtime_status, 0444, rtpm_status_show, NULL);
196static DEVICE_ATTR(runtime_enabled, 0444, rtpm_enabled_show, NULL);
197
198#endif
199
147static ssize_t async_show(struct device *dev, struct device_attribute *attr, 200static ssize_t async_show(struct device *dev, struct device_attribute *attr,
148 char *buf) 201 char *buf)
149{ 202{
@@ -170,15 +223,21 @@ static ssize_t async_store(struct device *dev, struct device_attribute *attr,
170} 223}
171 224
172static DEVICE_ATTR(async, 0644, async_show, async_store); 225static DEVICE_ATTR(async, 0644, async_show, async_store);
173#endif /* CONFIG_PM_SLEEP_ADVANCED_DEBUG */ 226#endif /* CONFIG_PM_ADVANCED_DEBUG */
174 227
175static struct attribute * power_attrs[] = { 228static struct attribute * power_attrs[] = {
176#ifdef CONFIG_PM_RUNTIME 229#ifdef CONFIG_PM_RUNTIME
177 &dev_attr_control.attr, 230 &dev_attr_control.attr,
178#endif 231#endif
179 &dev_attr_wakeup.attr, 232 &dev_attr_wakeup.attr,
180#ifdef CONFIG_PM_SLEEP_ADVANCED_DEBUG 233#ifdef CONFIG_PM_ADVANCED_DEBUG
181 &dev_attr_async.attr, 234 &dev_attr_async.attr,
235#ifdef CONFIG_PM_RUNTIME
236 &dev_attr_runtime_usage.attr,
237 &dev_attr_runtime_active_kids.attr,
238 &dev_attr_runtime_status.attr,
239 &dev_attr_runtime_enabled.attr,
240#endif
182#endif 241#endif
183 NULL, 242 NULL,
184}; 243};
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 1c1ceb4f218f..12c98900dcf8 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -67,7 +67,7 @@ static int ladder_select_state(struct cpuidle_device *dev)
67 struct ladder_device *ldev = &__get_cpu_var(ladder_devices); 67 struct ladder_device *ldev = &__get_cpu_var(ladder_devices);
68 struct ladder_device_state *last_state; 68 struct ladder_device_state *last_state;
69 int last_residency, last_idx = ldev->last_state_idx; 69 int last_residency, last_idx = ldev->last_state_idx;
70 int latency_req = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY); 70 int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
71 71
72 /* Special case when user has set very strict latency requirement */ 72 /* Special case when user has set very strict latency requirement */
73 if (unlikely(latency_req == 0)) { 73 if (unlikely(latency_req == 0)) {
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index f8e57c6303f2..b81ad9c731ae 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -182,7 +182,7 @@ static u64 div_round64(u64 dividend, u32 divisor)
182static int menu_select(struct cpuidle_device *dev) 182static int menu_select(struct cpuidle_device *dev)
183{ 183{
184 struct menu_device *data = &__get_cpu_var(menu_devices); 184 struct menu_device *data = &__get_cpu_var(menu_devices);
185 int latency_req = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY); 185 int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
186 int i; 186 int i;
187 int multiplier; 187 int multiplier;
188 188
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index c2258a51fe0c..7c469a62c3c1 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -159,107 +159,131 @@ static void i2c_device_shutdown(struct device *dev)
159 driver->shutdown(client); 159 driver->shutdown(client);
160} 160}
161 161
162#ifdef CONFIG_SUSPEND 162#ifdef CONFIG_PM_SLEEP
163static int i2c_device_pm_suspend(struct device *dev) 163static int i2c_legacy_suspend(struct device *dev, pm_message_t mesg)
164{ 164{
165 const struct dev_pm_ops *pm; 165 struct i2c_client *client = i2c_verify_client(dev);
166 struct i2c_driver *driver;
166 167
167 if (!dev->driver) 168 if (!client || !dev->driver)
168 return 0; 169 return 0;
169 pm = dev->driver->pm; 170 driver = to_i2c_driver(dev->driver);
170 if (!pm || !pm->suspend) 171 if (!driver->suspend)
171 return 0; 172 return 0;
172 return pm->suspend(dev); 173 return driver->suspend(client, mesg);
173} 174}
174 175
175static int i2c_device_pm_resume(struct device *dev) 176static int i2c_legacy_resume(struct device *dev)
176{ 177{
177 const struct dev_pm_ops *pm; 178 struct i2c_client *client = i2c_verify_client(dev);
179 struct i2c_driver *driver;
178 180
179 if (!dev->driver) 181 if (!client || !dev->driver)
180 return 0; 182 return 0;
181 pm = dev->driver->pm; 183 driver = to_i2c_driver(dev->driver);
182 if (!pm || !pm->resume) 184 if (!driver->resume)
183 return 0; 185 return 0;
184 return pm->resume(dev); 186 return driver->resume(client);
185} 187}
186#else
187#define i2c_device_pm_suspend NULL
188#define i2c_device_pm_resume NULL
189#endif
190 188
191#ifdef CONFIG_PM_RUNTIME 189static int i2c_device_pm_suspend(struct device *dev)
192static int i2c_device_runtime_suspend(struct device *dev)
193{ 190{
194 const struct dev_pm_ops *pm; 191 const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
195 192
196 if (!dev->driver) 193 if (pm_runtime_suspended(dev))
197 return 0;
198 pm = dev->driver->pm;
199 if (!pm || !pm->runtime_suspend)
200 return 0; 194 return 0;
201 return pm->runtime_suspend(dev);
202}
203 195
204static int i2c_device_runtime_resume(struct device *dev) 196 if (pm)
205{ 197 return pm->suspend ? pm->suspend(dev) : 0;
206 const struct dev_pm_ops *pm;
207 198
208 if (!dev->driver) 199 return i2c_legacy_suspend(dev, PMSG_SUSPEND);
209 return 0;
210 pm = dev->driver->pm;
211 if (!pm || !pm->runtime_resume)
212 return 0;
213 return pm->runtime_resume(dev);
214} 200}
215 201
216static int i2c_device_runtime_idle(struct device *dev) 202static int i2c_device_pm_resume(struct device *dev)
217{ 203{
218 const struct dev_pm_ops *pm = NULL; 204 const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
219 int ret; 205 int ret;
220 206
221 if (dev->driver) 207 if (pm)
222 pm = dev->driver->pm; 208 ret = pm->resume ? pm->resume(dev) : 0;
223 if (pm && pm->runtime_idle) { 209 else
224 ret = pm->runtime_idle(dev); 210 ret = i2c_legacy_resume(dev);
225 if (ret) 211
226 return ret; 212 if (!ret) {
213 pm_runtime_disable(dev);
214 pm_runtime_set_active(dev);
215 pm_runtime_enable(dev);
227 } 216 }
228 217
229 return pm_runtime_suspend(dev); 218 return ret;
230} 219}
231#else
232#define i2c_device_runtime_suspend NULL
233#define i2c_device_runtime_resume NULL
234#define i2c_device_runtime_idle NULL
235#endif
236 220
237static int i2c_device_suspend(struct device *dev, pm_message_t mesg) 221static int i2c_device_pm_freeze(struct device *dev)
238{ 222{
239 struct i2c_client *client = i2c_verify_client(dev); 223 const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
240 struct i2c_driver *driver;
241 224
242 if (!client || !dev->driver) 225 if (pm_runtime_suspended(dev))
243 return 0; 226 return 0;
244 driver = to_i2c_driver(dev->driver); 227
245 if (!driver->suspend) 228 if (pm)
246 return 0; 229 return pm->freeze ? pm->freeze(dev) : 0;
247 return driver->suspend(client, mesg); 230
231 return i2c_legacy_suspend(dev, PMSG_FREEZE);
248} 232}
249 233
250static int i2c_device_resume(struct device *dev) 234static int i2c_device_pm_thaw(struct device *dev)
251{ 235{
252 struct i2c_client *client = i2c_verify_client(dev); 236 const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
253 struct i2c_driver *driver;
254 237
255 if (!client || !dev->driver) 238 if (pm_runtime_suspended(dev))
256 return 0; 239 return 0;
257 driver = to_i2c_driver(dev->driver); 240
258 if (!driver->resume) 241 if (pm)
242 return pm->thaw ? pm->thaw(dev) : 0;
243
244 return i2c_legacy_resume(dev);
245}
246
247static int i2c_device_pm_poweroff(struct device *dev)
248{
249 const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
250
251 if (pm_runtime_suspended(dev))
259 return 0; 252 return 0;
260 return driver->resume(client); 253
254 if (pm)
255 return pm->poweroff ? pm->poweroff(dev) : 0;
256
257 return i2c_legacy_suspend(dev, PMSG_HIBERNATE);
261} 258}
262 259
260static int i2c_device_pm_restore(struct device *dev)
261{
262 const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
263 int ret;
264
265 if (pm)
266 ret = pm->restore ? pm->restore(dev) : 0;
267 else
268 ret = i2c_legacy_resume(dev);
269
270 if (!ret) {
271 pm_runtime_disable(dev);
272 pm_runtime_set_active(dev);
273 pm_runtime_enable(dev);
274 }
275
276 return ret;
277}
278#else /* !CONFIG_PM_SLEEP */
279#define i2c_device_pm_suspend NULL
280#define i2c_device_pm_resume NULL
281#define i2c_device_pm_freeze NULL
282#define i2c_device_pm_thaw NULL
283#define i2c_device_pm_poweroff NULL
284#define i2c_device_pm_restore NULL
285#endif /* !CONFIG_PM_SLEEP */
286
263static void i2c_client_dev_release(struct device *dev) 287static void i2c_client_dev_release(struct device *dev)
264{ 288{
265 kfree(to_i2c_client(dev)); 289 kfree(to_i2c_client(dev));
@@ -301,9 +325,15 @@ static const struct attribute_group *i2c_dev_attr_groups[] = {
301static const struct dev_pm_ops i2c_device_pm_ops = { 325static const struct dev_pm_ops i2c_device_pm_ops = {
302 .suspend = i2c_device_pm_suspend, 326 .suspend = i2c_device_pm_suspend,
303 .resume = i2c_device_pm_resume, 327 .resume = i2c_device_pm_resume,
304 .runtime_suspend = i2c_device_runtime_suspend, 328 .freeze = i2c_device_pm_freeze,
305 .runtime_resume = i2c_device_runtime_resume, 329 .thaw = i2c_device_pm_thaw,
306 .runtime_idle = i2c_device_runtime_idle, 330 .poweroff = i2c_device_pm_poweroff,
331 .restore = i2c_device_pm_restore,
332 SET_RUNTIME_PM_OPS(
333 pm_generic_runtime_suspend,
334 pm_generic_runtime_resume,
335 pm_generic_runtime_idle
336 )
307}; 337};
308 338
309struct bus_type i2c_bus_type = { 339struct bus_type i2c_bus_type = {
@@ -312,8 +342,6 @@ struct bus_type i2c_bus_type = {
312 .probe = i2c_device_probe, 342 .probe = i2c_device_probe,
313 .remove = i2c_device_remove, 343 .remove = i2c_device_remove,
314 .shutdown = i2c_device_shutdown, 344 .shutdown = i2c_device_shutdown,
315 .suspend = i2c_device_suspend,
316 .resume = i2c_device_resume,
317 .pm = &i2c_device_pm_ops, 345 .pm = &i2c_device_pm_ops,
318}; 346};
319EXPORT_SYMBOL_GPL(i2c_bus_type); 347EXPORT_SYMBOL_GPL(i2c_bus_type);
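With the bus-level callbacks above, an I2C driver may either keep legacy ->suspend()/->resume() methods in struct i2c_driver or supply dev_pm_ops; the bus dispatches to whichever exists. A hedged sketch of the dev_pm_ops form (foo_* names invented):

    #include <linux/i2c.h>
    #include <linux/module.h>
    #include <linux/pm.h>

    static int foo_suspend(struct device *dev)
    {
        struct i2c_client *client = to_i2c_client(dev);

        dev_dbg(&client->dev, "suspending\n");
        /* ... quiesce the chip ... */
        return 0;
    }

    static int foo_resume(struct device *dev)
    {
        struct i2c_client *client = to_i2c_client(dev);

        dev_dbg(&client->dev, "resuming\n");
        /* ... reprogram the chip ... */
        return 0;
    }

    static const struct dev_pm_ops foo_pm_ops = {
        .suspend = foo_suspend,
        .resume  = foo_resume,
    };

    static struct i2c_driver foo_driver = {
        .driver = {
            .name  = "foo",
            .owner = THIS_MODULE,
            .pm    = &foo_pm_ops,
        },
        /* .probe, .remove and .id_table as usual */
    };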
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index dbf81788bb40..d5d55c6a373f 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -2524,12 +2524,12 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
2524 * excessive C-state transition latencies result in 2524 * excessive C-state transition latencies result in
2525 * dropped transactions. 2525 * dropped transactions.
2526 */ 2526 */
2527 pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, 2527 pm_qos_update_request(
2528 adapter->netdev->name, 55); 2528 adapter->netdev->pm_qos_req, 55);
2529 } else { 2529 } else {
2530 pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, 2530 pm_qos_update_request(
2531 adapter->netdev->name, 2531 adapter->netdev->pm_qos_req,
2532 PM_QOS_DEFAULT_VALUE); 2532 PM_QOS_DEFAULT_VALUE);
2533 } 2533 }
2534 } 2534 }
2535 2535
@@ -2824,8 +2824,8 @@ int e1000e_up(struct e1000_adapter *adapter)
2824 2824
2825 /* DMA latency requirement to workaround early-receive/jumbo issue */ 2825 /* DMA latency requirement to workaround early-receive/jumbo issue */
2826 if (adapter->flags & FLAG_HAS_ERT) 2826 if (adapter->flags & FLAG_HAS_ERT)
2827 pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, 2827 adapter->netdev->pm_qos_req =
2828 adapter->netdev->name, 2828 pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
2829 PM_QOS_DEFAULT_VALUE); 2829 PM_QOS_DEFAULT_VALUE);
2830 2830
2831 /* hardware has been reset, we need to reload some things */ 2831 /* hardware has been reset, we need to reload some things */
@@ -2887,9 +2887,11 @@ void e1000e_down(struct e1000_adapter *adapter)
2887 e1000_clean_tx_ring(adapter); 2887 e1000_clean_tx_ring(adapter);
2888 e1000_clean_rx_ring(adapter); 2888 e1000_clean_rx_ring(adapter);
2889 2889
2890 if (adapter->flags & FLAG_HAS_ERT) 2890 if (adapter->flags & FLAG_HAS_ERT) {
2891 pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, 2891 pm_qos_remove_request(
2892 adapter->netdev->name); 2892 adapter->netdev->pm_qos_req);
2893 adapter->netdev->pm_qos_req = NULL;
2894 }
2893 2895
2894 /* 2896 /*
2895 * TODO: for power management, we could drop the link and 2897 * TODO: for power management, we could drop the link and
diff --git a/drivers/net/igbvf/netdev.c b/drivers/net/igbvf/netdev.c
index 1b1edad1eb5e..f16e981812a9 100644
--- a/drivers/net/igbvf/netdev.c
+++ b/drivers/net/igbvf/netdev.c
@@ -48,6 +48,7 @@
48#define DRV_VERSION "1.0.0-k0" 48#define DRV_VERSION "1.0.0-k0"
49char igbvf_driver_name[] = "igbvf"; 49char igbvf_driver_name[] = "igbvf";
50const char igbvf_driver_version[] = DRV_VERSION; 50const char igbvf_driver_version[] = DRV_VERSION;
51struct pm_qos_request_list *igbvf_driver_pm_qos_req;
51static const char igbvf_driver_string[] = 52static const char igbvf_driver_string[] =
52 "Intel(R) Virtual Function Network Driver"; 53 "Intel(R) Virtual Function Network Driver";
53static const char igbvf_copyright[] = "Copyright (c) 2009 Intel Corporation."; 54static const char igbvf_copyright[] = "Copyright (c) 2009 Intel Corporation.";
@@ -2899,7 +2900,7 @@ static int __init igbvf_init_module(void)
2899 printk(KERN_INFO "%s\n", igbvf_copyright); 2900 printk(KERN_INFO "%s\n", igbvf_copyright);
2900 2901
2901 ret = pci_register_driver(&igbvf_driver); 2902 ret = pci_register_driver(&igbvf_driver);
2902 pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, igbvf_driver_name, 2903 igbvf_driver_pm_qos_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
2903 PM_QOS_DEFAULT_VALUE); 2904 PM_QOS_DEFAULT_VALUE);
2904 2905
2905 return ret; 2906 return ret;
@@ -2915,7 +2916,8 @@ module_init(igbvf_init_module);
2915static void __exit igbvf_exit_module(void) 2916static void __exit igbvf_exit_module(void)
2916{ 2917{
2917 pci_unregister_driver(&igbvf_driver); 2918 pci_unregister_driver(&igbvf_driver);
2918 pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, igbvf_driver_name); 2919 pm_qos_remove_request(igbvf_driver_pm_qos_req);
2920 igbvf_driver_pm_qos_req = NULL;
2919} 2921}
2920module_exit(igbvf_exit_module); 2922module_exit(igbvf_exit_module);
2921 2923
diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c
index 9b72c45a7748..2b05fe5e994c 100644
--- a/drivers/net/wireless/ipw2x00/ipw2100.c
+++ b/drivers/net/wireless/ipw2x00/ipw2100.c
@@ -174,6 +174,8 @@ that only one external action is invoked at a time.
174#define DRV_DESCRIPTION "Intel(R) PRO/Wireless 2100 Network Driver" 174#define DRV_DESCRIPTION "Intel(R) PRO/Wireless 2100 Network Driver"
175#define DRV_COPYRIGHT "Copyright(c) 2003-2006 Intel Corporation" 175#define DRV_COPYRIGHT "Copyright(c) 2003-2006 Intel Corporation"
176 176
177struct pm_qos_request_list *ipw2100_pm_qos_req;
178
177/* Debugging stuff */ 179/* Debugging stuff */
178#ifdef CONFIG_IPW2100_DEBUG 180#ifdef CONFIG_IPW2100_DEBUG
179#define IPW2100_RX_DEBUG /* Reception debugging */ 181#define IPW2100_RX_DEBUG /* Reception debugging */
@@ -1739,7 +1741,7 @@ static int ipw2100_up(struct ipw2100_priv *priv, int deferred)
1739 /* the ipw2100 hardware really doesn't want power management delays 1741 /* the ipw2100 hardware really doesn't want power management delays
1740 * longer than 175usec 1742 * longer than 175usec
1741 */ 1743 */
1742 pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100", 175); 1744 pm_qos_update_request(ipw2100_pm_qos_req, 175);
1743 1745
1744 /* If the interrupt is enabled, turn it off... */ 1746 /* If the interrupt is enabled, turn it off... */
1745 spin_lock_irqsave(&priv->low_lock, flags); 1747 spin_lock_irqsave(&priv->low_lock, flags);
@@ -1887,8 +1889,7 @@ static void ipw2100_down(struct ipw2100_priv *priv)
1887 ipw2100_disable_interrupts(priv); 1889 ipw2100_disable_interrupts(priv);
1888 spin_unlock_irqrestore(&priv->low_lock, flags); 1890 spin_unlock_irqrestore(&priv->low_lock, flags);
1889 1891
1890 pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100", 1892 pm_qos_update_request(ipw2100_pm_qos_req, PM_QOS_DEFAULT_VALUE);
1891 PM_QOS_DEFAULT_VALUE);
1892 1893
1893 /* We have to signal any supplicant if we are disassociating */ 1894 /* We have to signal any supplicant if we are disassociating */
1894 if (associated) 1895 if (associated)
@@ -6669,7 +6670,7 @@ static int __init ipw2100_init(void)
6669 if (ret) 6670 if (ret)
6670 goto out; 6671 goto out;
6671 6672
6672 pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100", 6673 ipw2100_pm_qos_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
6673 PM_QOS_DEFAULT_VALUE); 6674 PM_QOS_DEFAULT_VALUE);
6674#ifdef CONFIG_IPW2100_DEBUG 6675#ifdef CONFIG_IPW2100_DEBUG
6675 ipw2100_debug_level = debug; 6676 ipw2100_debug_level = debug;
@@ -6692,7 +6693,7 @@ static void __exit ipw2100_exit(void)
6692 &driver_attr_debug_level); 6693 &driver_attr_debug_level);
6693#endif 6694#endif
6694 pci_unregister_driver(&ipw2100_pci_driver); 6695 pci_unregister_driver(&ipw2100_pci_driver);
6695 pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100"); 6696 pm_qos_remove_request(ipw2100_pm_qos_req);
6696} 6697}
6697 6698
6698module_init(ipw2100_init); 6699module_init(ipw2100_init);
diff --git a/fs/libfs.c b/fs/libfs.c
index ea9a6cc9b35c..232bea425b09 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -547,6 +547,40 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
547} 547}
548 548
549/** 549/**
550 * simple_write_to_buffer - copy data from user space to the buffer
551 * @to: the buffer to write to
552 * @available: the size of the buffer
553 * @ppos: the current position in the buffer
554 * @from: the user space buffer to read from
555 * @count: the maximum number of bytes to read
556 *
557 * The simple_write_to_buffer() function reads up to @count bytes from the user
558 * space address starting at @from into the buffer @to at offset @ppos.
559 *
560 * On success, the number of bytes written is returned and the offset @ppos is
561 * advanced by this number, or negative value is returned on error.
562 **/
563ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
564 const void __user *from, size_t count)
565{
566 loff_t pos = *ppos;
567 size_t res;
568
569 if (pos < 0)
570 return -EINVAL;
571 if (pos >= available || !count)
572 return 0;
573 if (count > available - pos)
574 count = available - pos;
575 res = copy_from_user(to + pos, from, count);
576 if (res == count)
577 return -EFAULT;
578 count -= res;
579 *ppos = pos + count;
580 return count;
581}
582
583/**
550 * memory_read_from_buffer - copy data from the buffer 584 * memory_read_from_buffer - copy data from the buffer
551 * @to: the kernel space buffer to read to 585 * @to: the kernel space buffer to read to
552 * @count: the maximum number of bytes to read 586 * @count: the maximum number of bytes to read
@@ -864,6 +898,7 @@ EXPORT_SYMBOL(simple_statfs);
864EXPORT_SYMBOL(simple_sync_file); 898EXPORT_SYMBOL(simple_sync_file);
865EXPORT_SYMBOL(simple_unlink); 899EXPORT_SYMBOL(simple_unlink);
866EXPORT_SYMBOL(simple_read_from_buffer); 900EXPORT_SYMBOL(simple_read_from_buffer);
901EXPORT_SYMBOL(simple_write_to_buffer);
867EXPORT_SYMBOL(memory_read_from_buffer); 902EXPORT_SYMBOL(memory_read_from_buffer);
868EXPORT_SYMBOL(simple_transaction_set); 903EXPORT_SYMBOL(simple_transaction_set);
869EXPORT_SYMBOL(simple_transaction_get); 904EXPORT_SYMBOL(simple_transaction_get);
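A hedged sketch of how the new helper might be used from a ->write() file operation; the foo_buf buffer and foo_write name are invented for illustration.

    #include <linux/fs.h>

    static char foo_buf[64];    /* hypothetical backing buffer */

    static ssize_t foo_write(struct file *file, const char __user *user_buf,
                             size_t count, loff_t *ppos)
    {
        /* Copies at most sizeof(foo_buf) - *ppos bytes from user space,
         * advances *ppos and returns the number of bytes written (or a
         * negative error code). */
        return simple_write_to_buffer(foo_buf, sizeof(foo_buf), ppos,
                                      user_buf, count);
    }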
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 44f35aea2f1f..948bd2bfb1de 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2362,6 +2362,8 @@ extern void simple_release_fs(struct vfsmount **mount, int *count);
2362 2362
2363extern ssize_t simple_read_from_buffer(void __user *to, size_t count, 2363extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
2364 loff_t *ppos, const void *from, size_t available); 2364 loff_t *ppos, const void *from, size_t available);
2365extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
2366 const void __user *from, size_t count);
2365 2367
2366extern int simple_fsync(struct file *, struct dentry *, int); 2368extern int simple_fsync(struct file *, struct dentry *, int);
2367 2369
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fa8b47637997..3857517f1ca5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -31,6 +31,7 @@
31#include <linux/if_link.h> 31#include <linux/if_link.h>
32 32
33#ifdef __KERNEL__ 33#ifdef __KERNEL__
34#include <linux/pm_qos_params.h>
34#include <linux/timer.h> 35#include <linux/timer.h>
35#include <linux/delay.h> 36#include <linux/delay.h>
36#include <linux/mm.h> 37#include <linux/mm.h>
@@ -711,6 +712,9 @@ struct net_device {
711 * the interface. 712 * the interface.
712 */ 713 */
713 char name[IFNAMSIZ]; 714 char name[IFNAMSIZ];
715
716 struct pm_qos_request_list *pm_qos_req;
717
714 /* device name hash chain */ 718 /* device name hash chain */
715 struct hlist_node name_hlist; 719 struct hlist_node name_hlist;
716 /* snmp alias */ 720 /* snmp alias */
diff --git a/include/linux/pm_qos_params.h b/include/linux/pm_qos_params.h
index d74f75ed1e47..8ba440e5eb7f 100644
--- a/include/linux/pm_qos_params.h
+++ b/include/linux/pm_qos_params.h
@@ -14,12 +14,14 @@
14#define PM_QOS_NUM_CLASSES 4 14#define PM_QOS_NUM_CLASSES 4
15#define PM_QOS_DEFAULT_VALUE -1 15#define PM_QOS_DEFAULT_VALUE -1
16 16
17int pm_qos_add_requirement(int qos, char *name, s32 value); 17struct pm_qos_request_list;
18int pm_qos_update_requirement(int qos, char *name, s32 new_value);
19void pm_qos_remove_requirement(int qos, char *name);
20 18
21int pm_qos_requirement(int qos); 19struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value);
20void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
21 s32 new_value);
22void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req);
22 23
23int pm_qos_add_notifier(int qos, struct notifier_block *notifier); 24int pm_qos_request(int pm_qos_class);
24int pm_qos_remove_notifier(int qos, struct notifier_block *notifier); 25int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier);
26int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier);
25 27
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index b776db737244..6e81888c6222 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -30,6 +30,9 @@ extern void pm_runtime_enable(struct device *dev);
30extern void __pm_runtime_disable(struct device *dev, bool check_resume); 30extern void __pm_runtime_disable(struct device *dev, bool check_resume);
31extern void pm_runtime_allow(struct device *dev); 31extern void pm_runtime_allow(struct device *dev);
32extern void pm_runtime_forbid(struct device *dev); 32extern void pm_runtime_forbid(struct device *dev);
33extern int pm_generic_runtime_idle(struct device *dev);
34extern int pm_generic_runtime_suspend(struct device *dev);
35extern int pm_generic_runtime_resume(struct device *dev);
33 36
34static inline bool pm_children_suspended(struct device *dev) 37static inline bool pm_children_suspended(struct device *dev)
35{ 38{
@@ -96,6 +99,10 @@ static inline bool device_run_wake(struct device *dev) { return false; }
96static inline void device_set_run_wake(struct device *dev, bool enable) {} 99static inline void device_set_run_wake(struct device *dev, bool enable) {}
97static inline bool pm_runtime_suspended(struct device *dev) { return false; } 100static inline bool pm_runtime_suspended(struct device *dev) { return false; }
98 101
102static inline int pm_generic_runtime_idle(struct device *dev) { return 0; }
103static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; }
104static inline int pm_generic_runtime_resume(struct device *dev) { return 0; }
105
99#endif /* !CONFIG_PM_RUNTIME */ 106#endif /* !CONFIG_PM_RUNTIME */
100 107
101static inline int pm_runtime_get(struct device *dev) 108static inline int pm_runtime_get(struct device *dev)
diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 0aae7776185e..22d64c18056c 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -25,32 +25,34 @@
25# error "please don't include this file directly" 25# error "please don't include this file directly"
26#endif 26#endif
27 27
28#include <linux/types.h>
29
28#ifdef CONFIG_PM 30#ifdef CONFIG_PM
29 31
30/* changes to device_may_wakeup take effect on the next pm state change. 32/* changes to device_may_wakeup take effect on the next pm state change.
31 * by default, devices should wakeup if they can. 33 * by default, devices should wakeup if they can.
32 */ 34 */
33static inline void device_init_wakeup(struct device *dev, int val) 35static inline void device_init_wakeup(struct device *dev, bool val)
34{ 36{
35 dev->power.can_wakeup = dev->power.should_wakeup = !!val; 37 dev->power.can_wakeup = dev->power.should_wakeup = val;
36} 38}
37 39
38static inline void device_set_wakeup_capable(struct device *dev, int val) 40static inline void device_set_wakeup_capable(struct device *dev, bool capable)
39{ 41{
40 dev->power.can_wakeup = !!val; 42 dev->power.can_wakeup = capable;
41} 43}
42 44
43static inline int device_can_wakeup(struct device *dev) 45static inline bool device_can_wakeup(struct device *dev)
44{ 46{
45 return dev->power.can_wakeup; 47 return dev->power.can_wakeup;
46} 48}
47 49
48static inline void device_set_wakeup_enable(struct device *dev, int val) 50static inline void device_set_wakeup_enable(struct device *dev, bool enable)
49{ 51{
50 dev->power.should_wakeup = !!val; 52 dev->power.should_wakeup = enable;
51} 53}
52 54
53static inline int device_may_wakeup(struct device *dev) 55static inline bool device_may_wakeup(struct device *dev)
54{ 56{
55 return dev->power.can_wakeup && dev->power.should_wakeup; 57 return dev->power.can_wakeup && dev->power.should_wakeup;
56} 58}
@@ -58,20 +60,28 @@ static inline int device_may_wakeup(struct device *dev)
58#else /* !CONFIG_PM */ 60#else /* !CONFIG_PM */
59 61
60/* For some reason the next two routines work even without CONFIG_PM */ 62/* For some reason the next two routines work even without CONFIG_PM */
61static inline void device_init_wakeup(struct device *dev, int val) 63static inline void device_init_wakeup(struct device *dev, bool val)
62{ 64{
63 dev->power.can_wakeup = !!val; 65 dev->power.can_wakeup = val;
64} 66}
65 67
66static inline void device_set_wakeup_capable(struct device *dev, int val) { } 68static inline void device_set_wakeup_capable(struct device *dev, bool capable)
69{
70}
67 71
68static inline int device_can_wakeup(struct device *dev) 72static inline bool device_can_wakeup(struct device *dev)
69{ 73{
70 return dev->power.can_wakeup; 74 return dev->power.can_wakeup;
71} 75}
72 76
73#define device_set_wakeup_enable(dev, val) do {} while (0) 77static inline void device_set_wakeup_enable(struct device *dev, bool enable)
74#define device_may_wakeup(dev) 0 78{
79}
80
81static inline bool device_may_wakeup(struct device *dev)
82{
83 return false;
84}
75 85
76#endif /* !CONFIG_PM */ 86#endif /* !CONFIG_PM */
77 87
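These helpers now take and return bool; the usual driver pattern, sketched here with invented foo_* names, is to declare wakeup capability at probe time and honour the user's sysfs setting in the suspend path (device_init_wakeup() and friends come in via linux/device.h):

    #include <linux/device.h>

    static int foo_probe_wakeup(struct device *dev)
    {
        /* The hardware can wake the system; default to allowing it. */
        device_init_wakeup(dev, true);
        return 0;
    }

    static int foo_suspend(struct device *dev)
    {
        if (device_may_wakeup(dev)) {
            /* ... arm the wakeup interrupt / enable remote wakeup ... */
        } else {
            /* ... power the device fully off ... */
        }
        return 0;
    }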
diff --git a/include/sound/pcm.h b/include/sound/pcm.h
index 8b611a561985..dd76cdede64d 100644
--- a/include/sound/pcm.h
+++ b/include/sound/pcm.h
@@ -29,6 +29,7 @@
29#include <linux/poll.h> 29#include <linux/poll.h>
30#include <linux/mm.h> 30#include <linux/mm.h>
31#include <linux/bitops.h> 31#include <linux/bitops.h>
32#include <linux/pm_qos_params.h>
32 33
33#define snd_pcm_substream_chip(substream) ((substream)->private_data) 34#define snd_pcm_substream_chip(substream) ((substream)->private_data)
34#define snd_pcm_chip(pcm) ((pcm)->private_data) 35#define snd_pcm_chip(pcm) ((pcm)->private_data)
@@ -365,7 +366,7 @@ struct snd_pcm_substream {
365 int number; 366 int number;
366 char name[32]; /* substream name */ 367 char name[32]; /* substream name */
367 int stream; /* stream (direction) */ 368 int stream; /* stream (direction) */
368 char latency_id[20]; /* latency identifier */ 369 struct pm_qos_request_list *latency_pm_qos_req; /* pm_qos request */
369 size_t buffer_bytes_max; /* limit ring buffer size */ 370 size_t buffer_bytes_max; /* limit ring buffer size */
370 struct snd_dma_buffer dma_buffer; 371 struct snd_dma_buffer dma_buffer;
371 unsigned int dma_buf_id; 372 unsigned int dma_buf_id;
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e5c0244962b0..ce71ed53e88f 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -89,10 +89,10 @@ struct cgroup_subsys freezer_subsys;
89 89
90/* Locks taken and their ordering 90/* Locks taken and their ordering
91 * ------------------------------ 91 * ------------------------------
92 * css_set_lock
93 * cgroup_mutex (AKA cgroup_lock) 92 * cgroup_mutex (AKA cgroup_lock)
94 * task->alloc_lock (AKA task_lock)
95 * freezer->lock 93 * freezer->lock
94 * css_set_lock
95 * task->alloc_lock (AKA task_lock)
96 * task->sighand->siglock 96 * task->sighand->siglock
97 * 97 *
98 * cgroup code forces css_set_lock to be taken before task->alloc_lock 98 * cgroup code forces css_set_lock to be taken before task->alloc_lock
@@ -100,33 +100,38 @@ struct cgroup_subsys freezer_subsys;
100 * freezer_create(), freezer_destroy(): 100 * freezer_create(), freezer_destroy():
101 * cgroup_mutex [ by cgroup core ] 101 * cgroup_mutex [ by cgroup core ]
102 * 102 *
103 * can_attach(): 103 * freezer_can_attach():
104 * cgroup_mutex 104 * cgroup_mutex (held by caller of can_attach)
105 * 105 *
106 * cgroup_frozen(): 106 * cgroup_freezing_or_frozen():
107 * task->alloc_lock (to get task's cgroup) 107 * task->alloc_lock (to get task's cgroup)
108 * 108 *
109 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 109 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
110 * task->alloc_lock (to get task's cgroup)
111 * freezer->lock 110 * freezer->lock
112 * sighand->siglock (if the cgroup is freezing) 111 * sighand->siglock (if the cgroup is freezing)
113 * 112 *
114 * freezer_read(): 113 * freezer_read():
115 * cgroup_mutex 114 * cgroup_mutex
116 * freezer->lock 115 * freezer->lock
116 * write_lock css_set_lock (cgroup iterator start)
117 * task->alloc_lock
117 * read_lock css_set_lock (cgroup iterator start) 118 * read_lock css_set_lock (cgroup iterator start)
118 * 119 *
119 * freezer_write() (freeze): 120 * freezer_write() (freeze):
120 * cgroup_mutex 121 * cgroup_mutex
121 * freezer->lock 122 * freezer->lock
123 * write_lock css_set_lock (cgroup iterator start)
124 * task->alloc_lock
122 * read_lock css_set_lock (cgroup iterator start) 125 * read_lock css_set_lock (cgroup iterator start)
123 * sighand->siglock 126 * sighand->siglock (fake signal delivery inside freeze_task())
124 * 127 *
125 * freezer_write() (unfreeze): 128 * freezer_write() (unfreeze):
126 * cgroup_mutex 129 * cgroup_mutex
127 * freezer->lock 130 * freezer->lock
131 * write_lock css_set_lock (cgroup iterator start)
132 * task->alloc_lock
128 * read_lock css_set_lock (cgroup iterator start) 133 * read_lock css_set_lock (cgroup iterator start)
129 * task->alloc_lock (to prevent races with freeze_task()) 134 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
130 * sighand->siglock 135 * sighand->siglock
131 */ 136 */
132static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 137static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 3db49b9ca374..f42d3f737a33 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -2,7 +2,7 @@
2 * This module exposes the interface to kernel space for specifying 2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of: 3 * QoS dependencies. It provides infrastructure for registration of:
4 * 4 *
5 * Dependents on a QoS value : register requirements 5 * Dependents on a QoS value : register requests
6 * Watchers of QoS value : get notified when target QoS value changes 6 * Watchers of QoS value : get notified when target QoS value changes
7 * 7 *
8 * This QoS design is best effort based. Dependents register their QoS needs. 8 * This QoS design is best effort based. Dependents register their QoS needs.
@@ -14,19 +14,21 @@
14 * timeout: usec <-- currently not used. 14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec) 15 * throughput: kbs (kilo byte / sec)
16 * 16 *
17 * There are lists of pm_qos_objects each one wrapping requirements, notifiers 17 * There are lists of pm_qos_objects each one wrapping requests, notifiers
18 * 18 *
19 * User mode requirements on a QOS parameter register themselves to the 19 * User mode requests on a QOS parameter register themselves to the
20 * subsystem by opening the device node /dev/... and writing there request to 20 * subsystem by opening the device node /dev/... and writing there request to
21 * the node. As long as the process holds a file handle open to the node the 21 * the node. As long as the process holds a file handle open to the node the
22 * client continues to be accounted for. Upon file release the usermode 22 * client continues to be accounted for. Upon file release the usermode
23 * requirement is removed and a new qos target is computed. This way when the 23 * request is removed and a new qos target is computed. This way when the
24 * requirement that the application has is cleaned up when closes the file 24 * request that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up. 25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 * 26 *
27 * Mark Gross <mgross@linux.intel.com> 27 * Mark Gross <mgross@linux.intel.com>
28 */ 28 */
29 29
30/*#define DEBUG*/
31
30#include <linux/pm_qos_params.h> 32#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 33#include <linux/sched.h>
32#include <linux/spinlock.h> 34#include <linux/spinlock.h>
@@ -42,25 +44,25 @@
42#include <linux/uaccess.h> 44#include <linux/uaccess.h>
43 45
44/* 46/*
45 * locking rule: all changes to requirements or notifiers lists 47 * locking rule: all changes to requests or notifiers lists
46 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
47 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
48 */ 50 */
49struct requirement_list { 51struct pm_qos_request_list {
50 struct list_head list; 52 struct list_head list;
51 union { 53 union {
52 s32 value; 54 s32 value;
53 s32 usec; 55 s32 usec;
54 s32 kbps; 56 s32 kbps;
55 }; 57 };
56 char *name; 58 int pm_qos_class;
57}; 59};
58 60
59static s32 max_compare(s32 v1, s32 v2); 61static s32 max_compare(s32 v1, s32 v2);
60static s32 min_compare(s32 v1, s32 v2); 62static s32 min_compare(s32 v1, s32 v2);
61 63
62struct pm_qos_object { 64struct pm_qos_object {
63 struct requirement_list requirements; 65 struct pm_qos_request_list requests;
64 struct blocking_notifier_head *notifiers; 66 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 67 struct miscdevice pm_qos_power_miscdev;
66 char *name; 68 char *name;
@@ -72,7 +74,7 @@ struct pm_qos_object {
72static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
73static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
74static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
75 .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, 77 .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)},
76 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
77 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
78 .default_value = 2000 * USEC_PER_SEC, 80 .default_value = 2000 * USEC_PER_SEC,
@@ -82,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
82 84
83static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
84static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
85 .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, 87 .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)},
86 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
87 .name = "network_latency", 89 .name = "network_latency",
88 .default_value = 2000 * USEC_PER_SEC, 90 .default_value = 2000 * USEC_PER_SEC,
@@ -93,8 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
93 95
94static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
95static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
96 .requirements = 98 .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)},
97 {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
98 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
99 .name = "network_throughput", 100 .name = "network_throughput",
100 .default_value = 0, 101 .default_value = 0,
@@ -135,31 +136,34 @@ static s32 min_compare(s32 v1, s32 v2)
135} 136}
136 137
137 138
138static void update_target(int target) 139static void update_target(int pm_qos_class)
139{ 140{
140 s32 extreme_value; 141 s32 extreme_value;
141 struct requirement_list *node; 142 struct pm_qos_request_list *node;
142 unsigned long flags; 143 unsigned long flags;
143 int call_notifier = 0; 144 int call_notifier = 0;
144 145
145 spin_lock_irqsave(&pm_qos_lock, flags); 146 spin_lock_irqsave(&pm_qos_lock, flags);
146 extreme_value = pm_qos_array[target]->default_value; 147 extreme_value = pm_qos_array[pm_qos_class]->default_value;
147 list_for_each_entry(node, 148 list_for_each_entry(node,
148 &pm_qos_array[target]->requirements.list, list) { 149 &pm_qos_array[pm_qos_class]->requests.list, list) {
149 extreme_value = pm_qos_array[target]->comparitor( 150 extreme_value = pm_qos_array[pm_qos_class]->comparitor(
150 extreme_value, node->value); 151 extreme_value, node->value);
151 } 152 }
152 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { 153 if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) !=
154 extreme_value) {
153 call_notifier = 1; 155 call_notifier = 1;
154 atomic_set(&pm_qos_array[target]->target_value, extreme_value); 156 atomic_set(&pm_qos_array[pm_qos_class]->target_value,
155 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 157 extreme_value);
156 atomic_read(&pm_qos_array[target]->target_value)); 158 pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class,
159 atomic_read(&pm_qos_array[pm_qos_class]->target_value));
157 } 160 }
158 spin_unlock_irqrestore(&pm_qos_lock, flags); 161 spin_unlock_irqrestore(&pm_qos_lock, flags);
159 162
160 if (call_notifier) 163 if (call_notifier)
161 blocking_notifier_call_chain(pm_qos_array[target]->notifiers, 164 blocking_notifier_call_chain(
162 (unsigned long) extreme_value, NULL); 165 pm_qos_array[pm_qos_class]->notifiers,
166 (unsigned long) extreme_value, NULL);
163} 167}
164 168
165static int register_pm_qos_misc(struct pm_qos_object *qos) 169static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -185,125 +189,112 @@ static int find_pm_qos_object_by_minor(int minor)
185} 189}
186 190
187/** 191/**
188 * pm_qos_requirement - returns current system wide qos expectation 192 * pm_qos_request - returns current system wide qos expectation
189 * @pm_qos_class: identification of which qos value is requested 193 * @pm_qos_class: identification of which qos value is requested
190 * 194 *
191 * This function returns the current target value in an atomic manner. 195 * This function returns the current target value in an atomic manner.
192 */ 196 */
193int pm_qos_requirement(int pm_qos_class) 197int pm_qos_request(int pm_qos_class)
194{ 198{
195 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 199 return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
196} 200}
197EXPORT_SYMBOL_GPL(pm_qos_requirement); 201EXPORT_SYMBOL_GPL(pm_qos_request);
198 202
199/** 203/**
200 * pm_qos_add_requirement - inserts new qos request into the list 204 * pm_qos_add_request - inserts new qos request into the list
201 * @pm_qos_class: identifies which list of qos request to us 205 * @pm_qos_class: identifies which list of qos request to us
202 * @name: identifies the request
203 * @value: defines the qos request 206 * @value: defines the qos request
204 * 207 *
205 * This function inserts a new entry in the pm_qos_class list of requested qos 208 * This function inserts a new entry in the pm_qos_class list of requested qos
206 * performance characteristics. It recomputes the aggregate QoS expectations 209 * performance characteristics. It recomputes the aggregate QoS expectations
207 * for the pm_qos_class of parameters. 210 * for the pm_qos_class of parameters, and returns the pm_qos_request list
211 * element as a handle for use in updating and removal. Caller needs to save
212 * this handle for later use.
208 */ 213 */
209int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) 214struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
210{ 215{
211 struct requirement_list *dep; 216 struct pm_qos_request_list *dep;
212 unsigned long flags; 217 unsigned long flags;
213 218
214 dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); 219 dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
215 if (dep) { 220 if (dep) {
216 if (value == PM_QOS_DEFAULT_VALUE) 221 if (value == PM_QOS_DEFAULT_VALUE)
217 dep->value = pm_qos_array[pm_qos_class]->default_value; 222 dep->value = pm_qos_array[pm_qos_class]->default_value;
218 else 223 else
219 dep->value = value; 224 dep->value = value;
220 dep->name = kstrdup(name, GFP_KERNEL); 225 dep->pm_qos_class = pm_qos_class;
221 if (!dep->name)
222 goto cleanup;
223 226
224 spin_lock_irqsave(&pm_qos_lock, flags); 227 spin_lock_irqsave(&pm_qos_lock, flags);
225 list_add(&dep->list, 228 list_add(&dep->list,
226 &pm_qos_array[pm_qos_class]->requirements.list); 229 &pm_qos_array[pm_qos_class]->requests.list);
227 spin_unlock_irqrestore(&pm_qos_lock, flags); 230 spin_unlock_irqrestore(&pm_qos_lock, flags);
228 update_target(pm_qos_class); 231 update_target(pm_qos_class);
229
230 return 0;
231 } 232 }
232 233
233cleanup: 234 return dep;
234 kfree(dep);
235 return -ENOMEM;
236} 235}
237EXPORT_SYMBOL_GPL(pm_qos_add_requirement); 236EXPORT_SYMBOL_GPL(pm_qos_add_request);
238 237
239/** 238/**
240 * pm_qos_update_requirement - modifies an existing qos request 239 * pm_qos_update_request - modifies an existing qos request
241 * @pm_qos_class: identifies which list of qos request to us 240 * @pm_qos_req : handle to list element holding a pm_qos request to use
242 * @name: identifies the request
243 * @value: defines the qos request 241 * @value: defines the qos request
244 * 242 *
245 * Updates an existing qos requirement for the pm_qos_class of parameters along 243 * Updates an existing qos request for the pm_qos_class of parameters along
246 * with updating the target pm_qos_class value. 244 * with updating the target pm_qos_class value.
247 * 245 *
248 * If the named request isn't in the list then no change is made. 246 * Attempts are made to make this code callable on hot code paths.
249 */ 247 */
250int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) 248void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
249 s32 new_value)
251{ 250{
252 unsigned long flags; 251 unsigned long flags;
253 struct requirement_list *node;
254 int pending_update = 0; 252 int pending_update = 0;
253 s32 temp;
255 254
256 spin_lock_irqsave(&pm_qos_lock, flags); 255 if (pm_qos_req) { /*guard against callers passing in null */
257 list_for_each_entry(node, 256 spin_lock_irqsave(&pm_qos_lock, flags);
258 &pm_qos_array[pm_qos_class]->requirements.list, list) { 257 if (new_value == PM_QOS_DEFAULT_VALUE)
259 if (strcmp(node->name, name) == 0) { 258 temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
260 if (new_value == PM_QOS_DEFAULT_VALUE) 259 else
261 node->value = 260 temp = new_value;
262 pm_qos_array[pm_qos_class]->default_value; 261
263 else 262 if (temp != pm_qos_req->value) {
264 node->value = new_value;
265 pending_update = 1; 263 pending_update = 1;
266 break; 264 pm_qos_req->value = temp;
267 } 265 }
266 spin_unlock_irqrestore(&pm_qos_lock, flags);
267 if (pending_update)
268 update_target(pm_qos_req->pm_qos_class);
268 } 269 }
269 spin_unlock_irqrestore(&pm_qos_lock, flags);
270 if (pending_update)
271 update_target(pm_qos_class);
272
273 return 0;
274} 270}
275EXPORT_SYMBOL_GPL(pm_qos_update_requirement); 271EXPORT_SYMBOL_GPL(pm_qos_update_request);
276 272
277/** 273/**
278 * pm_qos_remove_requirement - modifies an existing qos request 274 * pm_qos_remove_request - modifies an existing qos request
279 * @pm_qos_class: identifies which list of qos request to us 275 * @pm_qos_req: handle to request list element
280 * @name: identifies the request
281 * 276 *
282 * Will remove named qos request from pm_qos_class list of parameters and 277 * Will remove pm qos request from the list of requests and
283 * recompute the current target value for the pm_qos_class. 278 * recompute the current target value for the pm_qos_class. Call this
279 * on slow code paths.
284 */ 280 */
285void pm_qos_remove_requirement(int pm_qos_class, char *name) 281void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
286{ 282{
287 unsigned long flags; 283 unsigned long flags;
288 struct requirement_list *node; 284 int qos_class;
289 int pending_update = 0;
290 285
286 if (pm_qos_req == NULL)
287 return;
288 /* silent return to keep pcm code cleaner */
289
290 qos_class = pm_qos_req->pm_qos_class;
291 spin_lock_irqsave(&pm_qos_lock, flags); 291 spin_lock_irqsave(&pm_qos_lock, flags);
292 list_for_each_entry(node, 292 list_del(&pm_qos_req->list);
293 &pm_qos_array[pm_qos_class]->requirements.list, list) { 293 kfree(pm_qos_req);
294 if (strcmp(node->name, name) == 0) {
295 kfree(node->name);
296 list_del(&node->list);
297 kfree(node);
298 pending_update = 1;
299 break;
300 }
301 }
302 spin_unlock_irqrestore(&pm_qos_lock, flags); 294 spin_unlock_irqrestore(&pm_qos_lock, flags);
303 if (pending_update) 295 update_target(qos_class);
304 update_target(pm_qos_class);
305} 296}
306EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); 297EXPORT_SYMBOL_GPL(pm_qos_remove_request);
307 298
308/** 299/**
309 * pm_qos_add_notifier - sets notification entry for changes to target value 300 * pm_qos_add_notifier - sets notification entry for changes to target value
@@ -313,7 +304,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
313 * will register the notifier into a notification chain that gets called 304 * will register the notifier into a notification chain that gets called
314 * upon changes to the pm_qos_class target value. 305 * upon changes to the pm_qos_class target value.
315 */ 306 */
316 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) 307int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
317{ 308{
318 int retval; 309 int retval;
319 310
@@ -343,21 +334,16 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
343} 334}
344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 335EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
345 336
346#define PID_NAME_LEN 32
347
348static int pm_qos_power_open(struct inode *inode, struct file *filp) 337static int pm_qos_power_open(struct inode *inode, struct file *filp)
349{ 338{
350 int ret;
351 long pm_qos_class; 339 long pm_qos_class;
352 char name[PID_NAME_LEN];
353 340
354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 341 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
355 if (pm_qos_class >= 0) { 342 if (pm_qos_class >= 0) {
356 filp->private_data = (void *)pm_qos_class; 343 filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 344 PM_QOS_DEFAULT_VALUE);
358 ret = pm_qos_add_requirement(pm_qos_class, name, 345
359 PM_QOS_DEFAULT_VALUE); 346 if (filp->private_data)
360 if (ret >= 0)
361 return 0; 347 return 0;
362 } 348 }
363 return -EPERM; 349 return -EPERM;
@@ -365,32 +351,40 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
365 351
366static int pm_qos_power_release(struct inode *inode, struct file *filp) 352static int pm_qos_power_release(struct inode *inode, struct file *filp)
367{ 353{
368 int pm_qos_class; 354 struct pm_qos_request_list *req;
369 char name[PID_NAME_LEN];
370 355
371 pm_qos_class = (long)filp->private_data; 356 req = (struct pm_qos_request_list *)filp->private_data;
372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 357 pm_qos_remove_request(req);
373 pm_qos_remove_requirement(pm_qos_class, name);
374 358
375 return 0; 359 return 0;
376} 360}
377 361
362
378static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 363static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
379 size_t count, loff_t *f_pos) 364 size_t count, loff_t *f_pos)
380{ 365{
381 s32 value; 366 s32 value;
382 int pm_qos_class; 367 int x;
383 char name[PID_NAME_LEN]; 368 char ascii_value[11];
384 369 struct pm_qos_request_list *pm_qos_req;
385 pm_qos_class = (long)filp->private_data; 370
386 if (count != sizeof(s32)) 371 if (count == sizeof(s32)) {
372 if (copy_from_user(&value, buf, sizeof(s32)))
373 return -EFAULT;
374 } else if (count == 11) { /* len('0x12345678/0') */
375 if (copy_from_user(ascii_value, buf, 11))
376 return -EFAULT;
377 x = sscanf(ascii_value, "%x", &value);
378 if (x != 1)
379 return -EINVAL;
380 pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value);
381 } else
387 return -EINVAL; 382 return -EINVAL;
388 if (copy_from_user(&value, buf, sizeof(s32)))
389 return -EFAULT;
390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
391 pm_qos_update_requirement(pm_qos_class, name, value);
392 383
393 return sizeof(s32); 384 pm_qos_req = (struct pm_qos_request_list *)filp->private_data;
385 pm_qos_update_request(pm_qos_req, value);
386
387 return count;
394} 388}
395 389
396 390
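[Annotation] The hunks above replace the name-keyed pm_qos "requirements" with handle-based "requests": pm_qos_add_request() returns a struct pm_qos_request_list pointer, pm_qos_update_request() takes that handle (and is meant to be cheap enough for hot paths), and pm_qos_remove_request() both unlinks and frees it. A minimal driver-side sketch of the new calling convention, using only the signatures visible in this diff; the foo_* names are illustrative and the header name is assumed to be <linux/pm_qos_params.h>:

	#include <linux/pm_qos_params.h>

	static struct pm_qos_request_list *foo_latency_req;	/* hypothetical driver state */

	static int foo_start(void)
	{
		/* Ask for a 20 us CPU/DMA latency bound; the handle replaces the old name string. */
		foo_latency_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY, 20);
		if (!foo_latency_req)
			return -ENOMEM;
		return 0;
	}

	static void foo_retune(void)
	{
		/* Callable on hot paths per the kerneldoc above; only the value changes. */
		pm_qos_update_request(foo_latency_req, 50);
	}

	static void foo_stop(void)
	{
		/* Removing the request also frees the handle, so drop the stale pointer. */
		pm_qos_remove_request(foo_latency_req);
		foo_latency_req = NULL;
	}

Note that, unlike the old pm_qos_add_requirement(), failure is signalled by a NULL handle rather than an error code, which is why pm_qos_power_open() above can store the returned pointer directly in filp->private_data.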
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 43191815f874..524e058dcf06 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 13obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
13 14
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
new file mode 100644
index 000000000000..97024fd40cd5
--- /dev/null
+++ b/kernel/power/block_io.c
@@ -0,0 +1,103 @@
1/*
2 * This file provides functions for block I/O operations on swap/file.
3 *
4 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
5 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#include <linux/bio.h>
11#include <linux/kernel.h>
12#include <linux/pagemap.h>
13#include <linux/swap.h>
14
15#include "power.h"
16
17/**
18 * submit - submit BIO request.
19 * @rw: READ or WRITE.
20 * @off physical offset of page.
21 * @page: page we're reading or writing.
22 * @bio_chain: list of pending biod (for async reading)
23 *
24 * Straight from the textbook - allocate and initialize the bio.
25 * If we're reading, make sure the page is marked as dirty.
26 * Then submit it and, if @bio_chain == NULL, wait.
27 */
28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain)
30{
31 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
32 struct bio *bio;
33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
35 bio->bi_sector = sector;
36 bio->bi_bdev = bdev;
37 bio->bi_end_io = end_swap_bio_read;
38
39 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
40 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
41 (unsigned long long)sector);
42 bio_put(bio);
43 return -EFAULT;
44 }
45
46 lock_page(page);
47 bio_get(bio);
48
49 if (bio_chain == NULL) {
50 submit_bio(bio_rw, bio);
51 wait_on_page_locked(page);
52 if (rw == READ)
53 bio_set_pages_dirty(bio);
54 bio_put(bio);
55 } else {
56 if (rw == READ)
57 get_page(page); /* These pages are freed later */
58 bio->bi_private = *bio_chain;
59 *bio_chain = bio;
60 submit_bio(bio_rw, bio);
61 }
62 return 0;
63}
64
65int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
66{
67 return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
68 virt_to_page(addr), bio_chain);
69}
70
71int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
72{
73 return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
74 virt_to_page(addr), bio_chain);
75}
76
77int hib_wait_on_bio_chain(struct bio **bio_chain)
78{
79 struct bio *bio;
80 struct bio *next_bio;
81 int ret = 0;
82
83 if (bio_chain == NULL)
84 return 0;
85
86 bio = *bio_chain;
87 if (bio == NULL)
88 return 0;
89 while (bio) {
90 struct page *page;
91
92 next_bio = bio->bi_private;
93 page = bio->bi_io_vec[0].bv_page;
94 wait_on_page_locked(page);
95 if (!PageUptodate(page) || PageError(page))
96 ret = -EIO;
97 put_page(page);
98 bio_put(bio);
99 bio = next_bio;
100 }
101 *bio_chain = NULL;
102 return ret;
103}
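[Annotation] The new block_io.c gathers the hibernation bio helpers in one place: hib_bio_read_page() and hib_bio_write_page() submit a single page against hib_resume_bdev, either synchronously (bio_chain == NULL) or queued onto a caller-owned chain, and hib_wait_on_bio_chain() reaps that chain. A hedged sketch of a caller doing chained asynchronous reads; example_read_pages() and its arguments are illustrative, and each destination must be a page-sized, virt_to_page()-able buffer as in the callers further down:

	#include <linux/bio.h>

	#include "power.h"	/* hib_bio_read_page(), hib_wait_on_bio_chain() */

	/* Hypothetical example: read 'nr' consecutive image pages starting at 'start'. */
	static int example_read_pages(pgoff_t start, void **bufs, unsigned int nr)
	{
		struct bio *bio_chain = NULL;	/* NULL head; submit() links new bios in front */
		unsigned int i;
		int error = 0;

		for (i = 0; i < nr && !error; i++)
			error = hib_bio_read_page(start + i, bufs[i], &bio_chain);

		/* Always drain the chain; it returns -EIO if any page failed to become uptodate. */
		if (hib_wait_on_bio_chain(&bio_chain) && !error)
			error = -EIO;

		return error;
	}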
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46c5a26630a3..006270fe382d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void);
97 */ 97 */
98 98
99struct snapshot_handle { 99struct snapshot_handle {
100 loff_t offset; /* number of the last byte ready for reading
101 * or writing in the sequence
102 */
103 unsigned int cur; /* number of the block of PAGE_SIZE bytes the 100 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
104 * next operation will refer to (ie. current) 101 * next operation will refer to (ie. current)
105 */ 102 */
106 unsigned int cur_offset; /* offset with respect to the current
107 * block (for the next operation)
108 */
109 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
110 * was the current one previously
111 */
112 void *buffer; /* address of the block to read from 103 void *buffer; /* address of the block to read from
113 * or write to 104 * or write to
114 */ 105 */
115 unsigned int buf_offset; /* location to read from or write to,
116 * given as a displacement from 'buffer'
117 */
118 int sync_read; /* Set to one to notify the caller of 106 int sync_read; /* Set to one to notify the caller of
119 * snapshot_write_next() that it may 107 * snapshot_write_next() that it may
120 * need to call wait_on_bio_chain() 108 * need to call wait_on_bio_chain()
@@ -125,12 +113,12 @@ struct snapshot_handle {
125 * snapshot_read_next()/snapshot_write_next() is allowed to 113 * snapshot_read_next()/snapshot_write_next() is allowed to
126 * read/write data after the function returns 114 * read/write data after the function returns
127 */ 115 */
128#define data_of(handle) ((handle).buffer + (handle).buf_offset) 116#define data_of(handle) ((handle).buffer)
129 117
130extern unsigned int snapshot_additional_pages(struct zone *zone); 118extern unsigned int snapshot_additional_pages(struct zone *zone);
131extern unsigned long snapshot_get_image_size(void); 119extern unsigned long snapshot_get_image_size(void);
132extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 120extern int snapshot_read_next(struct snapshot_handle *handle);
133extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 121extern int snapshot_write_next(struct snapshot_handle *handle);
134extern void snapshot_write_finalize(struct snapshot_handle *handle); 122extern void snapshot_write_finalize(struct snapshot_handle *handle);
135extern int snapshot_image_loaded(struct snapshot_handle *handle); 123extern int snapshot_image_loaded(struct snapshot_handle *handle);
136 124
@@ -154,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p);
154extern int swsusp_write(unsigned int flags); 142extern int swsusp_write(unsigned int flags);
155extern void swsusp_close(fmode_t); 143extern void swsusp_close(fmode_t);
156 144
145/* kernel/power/block_io.c */
146extern struct block_device *hib_resume_bdev;
147
148extern int hib_bio_read_page(pgoff_t page_off, void *addr,
149 struct bio **bio_chain);
150extern int hib_bio_write_page(pgoff_t page_off, void *addr,
151 struct bio **bio_chain);
152extern int hib_wait_on_bio_chain(struct bio **bio_chain);
153
157struct timeval; 154struct timeval;
158/* kernel/power/swsusp.c */ 155/* kernel/power/swsusp.c */
159extern void swsusp_show_speed(struct timeval *, struct timeval *, 156extern void swsusp_show_speed(struct timeval *, struct timeval *,
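[Annotation] With the offset bookkeeping removed from struct snapshot_handle, data_of() now simply aliases handle->buffer, and every successful snapshot_read_next()/snapshot_write_next() call deals in exactly one page. A sketch of the resulting save-side caller pattern, condensed from save_image() further down; consume_page() is a hypothetical stand-in for whatever the caller does with the data:

	struct snapshot_handle snapshot;
	int ret;

	memset(&snapshot, 0, sizeof(struct snapshot_handle));
	for (;;) {
		ret = snapshot_read_next(&snapshot);	/* PAGE_SIZE, 0 at end, <0 on error */
		if (ret <= 0)
			break;
		/* data_of(snapshot) now points at PAGE_SIZE bytes ready to be consumed. */
		consume_page(data_of(snapshot));	/* hypothetical consumer */
	}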
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index be861c26dda7..25ce010e9f8b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1604,14 +1604,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1604 * snapshot_handle structure. The structure gets updated and a pointer 1604 * snapshot_handle structure. The structure gets updated and a pointer
1605 * to it should be passed to this function every next time. 1605 * to it should be passed to this function every next time.
1606 * 1606 *
1607 * The @count parameter should contain the number of bytes the caller
1608 * wants to read from the snapshot. It must not be zero.
1609 *
1610 * On success the function returns a positive number. Then, the caller 1607 * On success the function returns a positive number. Then, the caller
1611 * is allowed to read up to the returned number of bytes from the memory 1608 * is allowed to read up to the returned number of bytes from the memory
1612 * location computed by the data_of() macro. The number returned 1609 * location computed by the data_of() macro.
1613 * may be smaller than @count, but this only happens if the read would
1614 * cross a page boundary otherwise.
1615 * 1610 *
1616 * The function returns 0 to indicate the end of data stream condition, 1611 * The function returns 0 to indicate the end of data stream condition,
1617 * and a negative number is returned on error. In such cases the 1612 * and a negative number is returned on error. In such cases the
@@ -1619,7 +1614,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1619 * any more. 1614 * any more.
1620 */ 1615 */
1621 1616
1622int snapshot_read_next(struct snapshot_handle *handle, size_t count) 1617int snapshot_read_next(struct snapshot_handle *handle)
1623{ 1618{
1624 if (handle->cur > nr_meta_pages + nr_copy_pages) 1619 if (handle->cur > nr_meta_pages + nr_copy_pages)
1625 return 0; 1620 return 0;
@@ -1630,7 +1625,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1630 if (!buffer) 1625 if (!buffer)
1631 return -ENOMEM; 1626 return -ENOMEM;
1632 } 1627 }
1633 if (!handle->offset) { 1628 if (!handle->cur) {
1634 int error; 1629 int error;
1635 1630
1636 error = init_header((struct swsusp_info *)buffer); 1631 error = init_header((struct swsusp_info *)buffer);
@@ -1639,42 +1634,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1639 handle->buffer = buffer; 1634 handle->buffer = buffer;
1640 memory_bm_position_reset(&orig_bm); 1635 memory_bm_position_reset(&orig_bm);
1641 memory_bm_position_reset(&copy_bm); 1636 memory_bm_position_reset(&copy_bm);
1642 } 1637 } else if (handle->cur <= nr_meta_pages) {
1643 if (handle->prev < handle->cur) { 1638 memset(buffer, 0, PAGE_SIZE);
1644 if (handle->cur <= nr_meta_pages) { 1639 pack_pfns(buffer, &orig_bm);
1645 memset(buffer, 0, PAGE_SIZE); 1640 } else {
1646 pack_pfns(buffer, &orig_bm); 1641 struct page *page;
1647 } else {
1648 struct page *page;
1649 1642
1650 page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); 1643 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1651 if (PageHighMem(page)) { 1644 if (PageHighMem(page)) {
1652 /* Highmem pages are copied to the buffer, 1645 /* Highmem pages are copied to the buffer,
1653 * because we can't return with a kmapped 1646 * because we can't return with a kmapped
1654 * highmem page (we may not be called again). 1647 * highmem page (we may not be called again).
1655 */ 1648 */
1656 void *kaddr; 1649 void *kaddr;
1657 1650
1658 kaddr = kmap_atomic(page, KM_USER0); 1651 kaddr = kmap_atomic(page, KM_USER0);
1659 memcpy(buffer, kaddr, PAGE_SIZE); 1652 memcpy(buffer, kaddr, PAGE_SIZE);
1660 kunmap_atomic(kaddr, KM_USER0); 1653 kunmap_atomic(kaddr, KM_USER0);
1661 handle->buffer = buffer; 1654 handle->buffer = buffer;
1662 } else { 1655 } else {
1663 handle->buffer = page_address(page); 1656 handle->buffer = page_address(page);
1664 }
1665 } 1657 }
1666 handle->prev = handle->cur;
1667 }
1668 handle->buf_offset = handle->cur_offset;
1669 if (handle->cur_offset + count >= PAGE_SIZE) {
1670 count = PAGE_SIZE - handle->cur_offset;
1671 handle->cur_offset = 0;
1672 handle->cur++;
1673 } else {
1674 handle->cur_offset += count;
1675 } 1658 }
1676 handle->offset += count; 1659 handle->cur++;
1677 return count; 1660 return PAGE_SIZE;
1678} 1661}
1679 1662
1680/** 1663/**
@@ -2133,14 +2116,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2133 * snapshot_handle structure. The structure gets updated and a pointer 2116 * snapshot_handle structure. The structure gets updated and a pointer
2134 * to it should be passed to this function every next time. 2117 * to it should be passed to this function every next time.
2135 * 2118 *
2136 * The @count parameter should contain the number of bytes the caller
2137 * wants to write to the image. It must not be zero.
2138 *
2139 * On success the function returns a positive number. Then, the caller 2119 * On success the function returns a positive number. Then, the caller
2140 * is allowed to write up to the returned number of bytes to the memory 2120 * is allowed to write up to the returned number of bytes to the memory
2141 * location computed by the data_of() macro. The number returned 2121 * location computed by the data_of() macro.
2142 * may be smaller than @count, but this only happens if the write would
2143 * cross a page boundary otherwise.
2144 * 2122 *
2145 * The function returns 0 to indicate the "end of file" condition, 2123 * The function returns 0 to indicate the "end of file" condition,
2146 * and a negative number is returned on error. In such cases the 2124 * and a negative number is returned on error. In such cases the
@@ -2148,16 +2126,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2148 * any more. 2126 * any more.
2149 */ 2127 */
2150 2128
2151int snapshot_write_next(struct snapshot_handle *handle, size_t count) 2129int snapshot_write_next(struct snapshot_handle *handle)
2152{ 2130{
2153 static struct chain_allocator ca; 2131 static struct chain_allocator ca;
2154 int error = 0; 2132 int error = 0;
2155 2133
2156 /* Check if we have already loaded the entire image */ 2134 /* Check if we have already loaded the entire image */
2157 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 2135 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
2158 return 0; 2136 return 0;
2159 2137
2160 if (handle->offset == 0) { 2138 handle->sync_read = 1;
2139
2140 if (!handle->cur) {
2161 if (!buffer) 2141 if (!buffer)
2162 /* This makes the buffer be freed by swsusp_free() */ 2142 /* This makes the buffer be freed by swsusp_free() */
2163 buffer = get_image_page(GFP_ATOMIC, PG_ANY); 2143 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
@@ -2166,56 +2146,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
2166 return -ENOMEM; 2146 return -ENOMEM;
2167 2147
2168 handle->buffer = buffer; 2148 handle->buffer = buffer;
2169 } 2149 } else if (handle->cur == 1) {
2170 handle->sync_read = 1; 2150 error = load_header(buffer);
2171 if (handle->prev < handle->cur) { 2151 if (error)
2172 if (handle->prev == 0) { 2152 return error;
2173 error = load_header(buffer);
2174 if (error)
2175 return error;
2176 2153
2177 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY); 2154 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
2178 if (error) 2155 if (error)
2179 return error; 2156 return error;
2157
2158 } else if (handle->cur <= nr_meta_pages + 1) {
2159 error = unpack_orig_pfns(buffer, &copy_bm);
2160 if (error)
2161 return error;
2180 2162
2181 } else if (handle->prev <= nr_meta_pages) { 2163 if (handle->cur == nr_meta_pages + 1) {
2182 error = unpack_orig_pfns(buffer, &copy_bm); 2164 error = prepare_image(&orig_bm, &copy_bm);
2183 if (error) 2165 if (error)
2184 return error; 2166 return error;
2185 2167
2186 if (handle->prev == nr_meta_pages) { 2168 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2187 error = prepare_image(&orig_bm, &copy_bm); 2169 memory_bm_position_reset(&orig_bm);
2188 if (error) 2170 restore_pblist = NULL;
2189 return error;
2190
2191 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2192 memory_bm_position_reset(&orig_bm);
2193 restore_pblist = NULL;
2194 handle->buffer = get_buffer(&orig_bm, &ca);
2195 handle->sync_read = 0;
2196 if (IS_ERR(handle->buffer))
2197 return PTR_ERR(handle->buffer);
2198 }
2199 } else {
2200 copy_last_highmem_page();
2201 handle->buffer = get_buffer(&orig_bm, &ca); 2171 handle->buffer = get_buffer(&orig_bm, &ca);
2172 handle->sync_read = 0;
2202 if (IS_ERR(handle->buffer)) 2173 if (IS_ERR(handle->buffer))
2203 return PTR_ERR(handle->buffer); 2174 return PTR_ERR(handle->buffer);
2204 if (handle->buffer != buffer)
2205 handle->sync_read = 0;
2206 } 2175 }
2207 handle->prev = handle->cur;
2208 }
2209 handle->buf_offset = handle->cur_offset;
2210 if (handle->cur_offset + count >= PAGE_SIZE) {
2211 count = PAGE_SIZE - handle->cur_offset;
2212 handle->cur_offset = 0;
2213 handle->cur++;
2214 } else { 2176 } else {
2215 handle->cur_offset += count; 2177 copy_last_highmem_page();
2178 handle->buffer = get_buffer(&orig_bm, &ca);
2179 if (IS_ERR(handle->buffer))
2180 return PTR_ERR(handle->buffer);
2181 if (handle->buffer != buffer)
2182 handle->sync_read = 0;
2216 } 2183 }
2217 handle->offset += count; 2184 handle->cur++;
2218 return count; 2185 return PAGE_SIZE;
2219} 2186}
2220 2187
2221/** 2188/**
@@ -2230,7 +2197,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
2230{ 2197{
2231 copy_last_highmem_page(); 2198 copy_last_highmem_page();
2232 /* Free only if we have loaded the image entirely */ 2199 /* Free only if we have loaded the image entirely */
2233 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { 2200 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2234 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2201 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
2235 free_highmem_data(); 2202 free_highmem_data();
2236 } 2203 }
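[Annotation] The restore side follows the same page-at-a-time contract, but keyed off handle->cur instead of the old prev/offset pair: cur == 0 hands out the header buffer, cur == 1 parses it, the next nr_meta_pages calls unpack the pfn bitmap, and later calls receive image data directly into the pages selected by get_buffer(). A hedged sketch of a write-side caller; fetch_page() and drain_io() are illustrative placeholders for the data source and for waiting on outstanding I/O (compare load_image() in swap.c below):

	struct snapshot_handle snapshot;
	int error;

	memset(&snapshot, 0, sizeof(struct snapshot_handle));
	for (;;) {
		error = snapshot_write_next(&snapshot);	/* PAGE_SIZE, 0 at end, <0 on error */
		if (error <= 0)
			break;
		error = fetch_page(data_of(snapshot));	/* hypothetical: fill the handed-out page */
		if (error)
			break;
		/* sync_read stays set while metadata is being parsed; don't run ahead of it. */
		if (snapshot.sync_read) {
			error = drain_io();		/* hypothetical wait for queued I/O */
			if (error)
				break;
		}
	}
	if (!error)
		snapshot_write_finalize(&snapshot);	/* finish restore bookkeeping, see above */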
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 66824d71983a..b0bb21778391 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -29,6 +29,40 @@
29 29
30#define SWSUSP_SIG "S1SUSPEND" 30#define SWSUSP_SIG "S1SUSPEND"
31 31
32/*
33 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page
35 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
36 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member.
38 *
39 * The swap map is created during suspend. The swap map pages are
40 * allocated and populated one at a time, so we only need one memory
41 * page to set up the entire structure.
42 *
43 * During resume we also only need to use one swap_map_page structure
44 * at a time.
45 */
46
47#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
48
49struct swap_map_page {
50 sector_t entries[MAP_PAGE_ENTRIES];
51 sector_t next_swap;
52};
53
54/**
55 * The swap_map_handle structure is used for handling swap in
56 * a file-alike way
57 */
58
59struct swap_map_handle {
60 struct swap_map_page *cur;
61 sector_t cur_swap;
62 sector_t first_sector;
63 unsigned int k;
64};
65
32struct swsusp_header { 66struct swsusp_header {
33 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 67 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
34 sector_t image; 68 sector_t image;
@@ -145,110 +179,24 @@ int swsusp_swap_in_use(void)
145 */ 179 */
146 180
147static unsigned short root_swap = 0xffff; 181static unsigned short root_swap = 0xffff;
148static struct block_device *resume_bdev; 182struct block_device *hib_resume_bdev;
149
150/**
151 * submit - submit BIO request.
152 * @rw: READ or WRITE.
153 * @off physical offset of page.
154 * @page: page we're reading or writing.
155 * @bio_chain: list of pending biod (for async reading)
156 *
157 * Straight from the textbook - allocate and initialize the bio.
158 * If we're reading, make sure the page is marked as dirty.
159 * Then submit it and, if @bio_chain == NULL, wait.
160 */
161static int submit(int rw, pgoff_t page_off, struct page *page,
162 struct bio **bio_chain)
163{
164 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
165 struct bio *bio;
166
167 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
168 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
169 bio->bi_bdev = resume_bdev;
170 bio->bi_end_io = end_swap_bio_read;
171
172 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
173 printk(KERN_ERR "PM: Adding page to bio failed at %ld\n",
174 page_off);
175 bio_put(bio);
176 return -EFAULT;
177 }
178
179 lock_page(page);
180 bio_get(bio);
181
182 if (bio_chain == NULL) {
183 submit_bio(bio_rw, bio);
184 wait_on_page_locked(page);
185 if (rw == READ)
186 bio_set_pages_dirty(bio);
187 bio_put(bio);
188 } else {
189 if (rw == READ)
190 get_page(page); /* These pages are freed later */
191 bio->bi_private = *bio_chain;
192 *bio_chain = bio;
193 submit_bio(bio_rw, bio);
194 }
195 return 0;
196}
197
198static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
199{
200 return submit(READ, page_off, virt_to_page(addr), bio_chain);
201}
202
203static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
204{
205 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
206}
207
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235 183
236/* 184/*
237 * Saving part 185 * Saving part
238 */ 186 */
239 187
240static int mark_swapfiles(sector_t start, unsigned int flags) 188static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
241{ 189{
242 int error; 190 int error;
243 191
244 bio_read_page(swsusp_resume_block, swsusp_header, NULL); 192 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
245 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
246 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
247 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
248 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
249 swsusp_header->image = start; 197 swsusp_header->image = handle->first_sector;
250 swsusp_header->flags = flags; 198 swsusp_header->flags = flags;
251 error = bio_write_page(swsusp_resume_block, 199 error = hib_bio_write_page(swsusp_resume_block,
252 swsusp_header, NULL); 200 swsusp_header, NULL);
253 } else { 201 } else {
254 printk(KERN_ERR "PM: Swap header not found!\n"); 202 printk(KERN_ERR "PM: Swap header not found!\n");
@@ -260,25 +208,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags)
260/** 208/**
261 * swsusp_swap_check - check if the resume device is a swap device 209 * swsusp_swap_check - check if the resume device is a swap device
262 * and get its index (if so) 210 * and get its index (if so)
211 *
212 * This is called before saving image
263 */ 213 */
264 214static int swsusp_swap_check(void)
265static int swsusp_swap_check(void) /* This is called before saving image */
266{ 215{
267 int res; 216 int res;
268 217
269 res = swap_type_of(swsusp_resume_device, swsusp_resume_block, 218 res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
270 &resume_bdev); 219 &hib_resume_bdev);
271 if (res < 0) 220 if (res < 0)
272 return res; 221 return res;
273 222
274 root_swap = res; 223 root_swap = res;
275 res = blkdev_get(resume_bdev, FMODE_WRITE); 224 res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
276 if (res) 225 if (res)
277 return res; 226 return res;
278 227
279 res = set_blocksize(resume_bdev, PAGE_SIZE); 228 res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
280 if (res < 0) 229 if (res < 0)
281 blkdev_put(resume_bdev, FMODE_WRITE); 230 blkdev_put(hib_resume_bdev, FMODE_WRITE);
282 231
283 return res; 232 return res;
284} 233}
@@ -309,42 +258,9 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
309 } else { 258 } else {
310 src = buf; 259 src = buf;
311 } 260 }
312 return bio_write_page(offset, src, bio_chain); 261 return hib_bio_write_page(offset, src, bio_chain);
313} 262}
314 263
315/*
316 * The swap map is a data structure used for keeping track of each page
317 * written to a swap partition. It consists of many swap_map_page
318 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
319 * These structures are stored on the swap and linked together with the
320 * help of the .next_swap member.
321 *
322 * The swap map is created during suspend. The swap map pages are
323 * allocated and populated one at a time, so we only need one memory
324 * page to set up the entire structure.
325 *
326 * During resume we also only need to use one swap_map_page structure
327 * at a time.
328 */
329
330#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
331
332struct swap_map_page {
333 sector_t entries[MAP_PAGE_ENTRIES];
334 sector_t next_swap;
335};
336
337/**
338 * The swap_map_handle structure is used for handling swap in
339 * a file-alike way
340 */
341
342struct swap_map_handle {
343 struct swap_map_page *cur;
344 sector_t cur_swap;
345 unsigned int k;
346};
347
348static void release_swap_writer(struct swap_map_handle *handle) 264static void release_swap_writer(struct swap_map_handle *handle)
349{ 265{
350 if (handle->cur) 266 if (handle->cur)
@@ -354,16 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle)
354 270
355static int get_swap_writer(struct swap_map_handle *handle) 271static int get_swap_writer(struct swap_map_handle *handle)
356{ 272{
273 int ret;
274
275 ret = swsusp_swap_check();
276 if (ret) {
277 if (ret != -ENOSPC)
278 printk(KERN_ERR "PM: Cannot find swap device, try "
279 "swapon -a.\n");
280 return ret;
281 }
357 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 282 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
358 if (!handle->cur) 283 if (!handle->cur) {
359 return -ENOMEM; 284 ret = -ENOMEM;
285 goto err_close;
286 }
360 handle->cur_swap = alloc_swapdev_block(root_swap); 287 handle->cur_swap = alloc_swapdev_block(root_swap);
361 if (!handle->cur_swap) { 288 if (!handle->cur_swap) {
362 release_swap_writer(handle); 289 ret = -ENOSPC;
363 return -ENOSPC; 290 goto err_rel;
364 } 291 }
365 handle->k = 0; 292 handle->k = 0;
293 handle->first_sector = handle->cur_swap;
366 return 0; 294 return 0;
295err_rel:
296 release_swap_writer(handle);
297err_close:
298 swsusp_close(FMODE_WRITE);
299 return ret;
367} 300}
368 301
369static int swap_write_page(struct swap_map_handle *handle, void *buf, 302static int swap_write_page(struct swap_map_handle *handle, void *buf,
@@ -380,7 +313,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
380 return error; 313 return error;
381 handle->cur->entries[handle->k++] = offset; 314 handle->cur->entries[handle->k++] = offset;
382 if (handle->k >= MAP_PAGE_ENTRIES) { 315 if (handle->k >= MAP_PAGE_ENTRIES) {
383 error = wait_on_bio_chain(bio_chain); 316 error = hib_wait_on_bio_chain(bio_chain);
384 if (error) 317 if (error)
385 goto out; 318 goto out;
386 offset = alloc_swapdev_block(root_swap); 319 offset = alloc_swapdev_block(root_swap);
@@ -406,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle)
406 return -EINVAL; 339 return -EINVAL;
407} 340}
408 341
342static int swap_writer_finish(struct swap_map_handle *handle,
343 unsigned int flags, int error)
344{
345 if (!error) {
346 flush_swap_writer(handle);
347 printk(KERN_INFO "PM: S");
348 error = mark_swapfiles(handle, flags);
349 printk("|\n");
350 }
351
352 if (error)
353 free_all_swap_pages(root_swap);
354 release_swap_writer(handle);
355 swsusp_close(FMODE_WRITE);
356
357 return error;
358}
359
409/** 360/**
410 * save_image - save the suspend image data 361 * save_image - save the suspend image data
411 */ 362 */
@@ -431,7 +382,7 @@ static int save_image(struct swap_map_handle *handle,
431 bio = NULL; 382 bio = NULL;
432 do_gettimeofday(&start); 383 do_gettimeofday(&start);
433 while (1) { 384 while (1) {
434 ret = snapshot_read_next(snapshot, PAGE_SIZE); 385 ret = snapshot_read_next(snapshot);
435 if (ret <= 0) 386 if (ret <= 0)
436 break; 387 break;
437 ret = swap_write_page(handle, data_of(*snapshot), &bio); 388 ret = swap_write_page(handle, data_of(*snapshot), &bio);
@@ -441,7 +392,7 @@ static int save_image(struct swap_map_handle *handle,
441 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 392 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
442 nr_pages++; 393 nr_pages++;
443 } 394 }
444 err2 = wait_on_bio_chain(&bio); 395 err2 = hib_wait_on_bio_chain(&bio);
445 do_gettimeofday(&stop); 396 do_gettimeofday(&stop);
446 if (!ret) 397 if (!ret)
447 ret = err2; 398 ret = err2;
@@ -483,50 +434,34 @@ int swsusp_write(unsigned int flags)
483 struct swap_map_handle handle; 434 struct swap_map_handle handle;
484 struct snapshot_handle snapshot; 435 struct snapshot_handle snapshot;
485 struct swsusp_info *header; 436 struct swsusp_info *header;
437 unsigned long pages;
486 int error; 438 int error;
487 439
488 error = swsusp_swap_check(); 440 pages = snapshot_get_image_size();
441 error = get_swap_writer(&handle);
489 if (error) { 442 if (error) {
490 printk(KERN_ERR "PM: Cannot find swap device, try " 443 printk(KERN_ERR "PM: Cannot get swap writer\n");
491 "swapon -a.\n");
492 return error; 444 return error;
493 } 445 }
446 if (!enough_swap(pages)) {
447 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC;
449 goto out_finish;
450 }
494 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 451 memset(&snapshot, 0, sizeof(struct snapshot_handle));
495 error = snapshot_read_next(&snapshot, PAGE_SIZE); 452 error = snapshot_read_next(&snapshot);
496 if (error < PAGE_SIZE) { 453 if (error < PAGE_SIZE) {
497 if (error >= 0) 454 if (error >= 0)
498 error = -EFAULT; 455 error = -EFAULT;
499 456
500 goto out; 457 goto out_finish;
501 } 458 }
502 header = (struct swsusp_info *)data_of(snapshot); 459 header = (struct swsusp_info *)data_of(snapshot);
503 if (!enough_swap(header->pages)) { 460 error = swap_write_page(&handle, header, NULL);
504 printk(KERN_ERR "PM: Not enough free swap\n"); 461 if (!error)
505 error = -ENOSPC; 462 error = save_image(&handle, &snapshot, pages - 1);
506 goto out; 463out_finish:
507 } 464 error = swap_writer_finish(&handle, flags, error);
508 error = get_swap_writer(&handle);
509 if (!error) {
510 sector_t start = handle.cur_swap;
511
512 error = swap_write_page(&handle, header, NULL);
513 if (!error)
514 error = save_image(&handle, &snapshot,
515 header->pages - 1);
516
517 if (!error) {
518 flush_swap_writer(&handle);
519 printk(KERN_INFO "PM: S");
520 error = mark_swapfiles(start, flags);
521 printk("|\n");
522 }
523 }
524 if (error)
525 free_all_swap_pages(root_swap);
526
527 release_swap_writer(&handle);
528 out:
529 swsusp_close(FMODE_WRITE);
530 return error; 465 return error;
531} 466}
532 467
@@ -542,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle)
542 handle->cur = NULL; 477 handle->cur = NULL;
543} 478}
544 479
545static int get_swap_reader(struct swap_map_handle *handle, sector_t start) 480static int get_swap_reader(struct swap_map_handle *handle,
481 unsigned int *flags_p)
546{ 482{
547 int error; 483 int error;
548 484
549 if (!start) 485 *flags_p = swsusp_header->flags;
486
487 if (!swsusp_header->image) /* how can this happen? */
550 return -EINVAL; 488 return -EINVAL;
551 489
552 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 490 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
553 if (!handle->cur) 491 if (!handle->cur)
554 return -ENOMEM; 492 return -ENOMEM;
555 493
556 error = bio_read_page(start, handle->cur, NULL); 494 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
557 if (error) { 495 if (error) {
558 release_swap_reader(handle); 496 release_swap_reader(handle);
559 return error; 497 return error;
@@ -573,21 +511,28 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
573 offset = handle->cur->entries[handle->k]; 511 offset = handle->cur->entries[handle->k];
574 if (!offset) 512 if (!offset)
575 return -EFAULT; 513 return -EFAULT;
576 error = bio_read_page(offset, buf, bio_chain); 514 error = hib_bio_read_page(offset, buf, bio_chain);
577 if (error) 515 if (error)
578 return error; 516 return error;
579 if (++handle->k >= MAP_PAGE_ENTRIES) { 517 if (++handle->k >= MAP_PAGE_ENTRIES) {
580 error = wait_on_bio_chain(bio_chain); 518 error = hib_wait_on_bio_chain(bio_chain);
581 handle->k = 0; 519 handle->k = 0;
582 offset = handle->cur->next_swap; 520 offset = handle->cur->next_swap;
583 if (!offset) 521 if (!offset)
584 release_swap_reader(handle); 522 release_swap_reader(handle);
585 else if (!error) 523 else if (!error)
586 error = bio_read_page(offset, handle->cur, NULL); 524 error = hib_bio_read_page(offset, handle->cur, NULL);
587 } 525 }
588 return error; 526 return error;
589} 527}
590 528
529static int swap_reader_finish(struct swap_map_handle *handle)
530{
531 release_swap_reader(handle);
532
533 return 0;
534}
535
591/** 536/**
592 * load_image - load the image using the swap map handle 537 * load_image - load the image using the swap map handle
593 * @handle and the snapshot handle @snapshot 538 * @handle and the snapshot handle @snapshot
@@ -615,21 +560,21 @@ static int load_image(struct swap_map_handle *handle,
615 bio = NULL; 560 bio = NULL;
616 do_gettimeofday(&start); 561 do_gettimeofday(&start);
617 for ( ; ; ) { 562 for ( ; ; ) {
618 error = snapshot_write_next(snapshot, PAGE_SIZE); 563 error = snapshot_write_next(snapshot);
619 if (error <= 0) 564 if (error <= 0)
620 break; 565 break;
621 error = swap_read_page(handle, data_of(*snapshot), &bio); 566 error = swap_read_page(handle, data_of(*snapshot), &bio);
622 if (error) 567 if (error)
623 break; 568 break;
624 if (snapshot->sync_read) 569 if (snapshot->sync_read)
625 error = wait_on_bio_chain(&bio); 570 error = hib_wait_on_bio_chain(&bio);
626 if (error) 571 if (error)
627 break; 572 break;
628 if (!(nr_pages % m)) 573 if (!(nr_pages % m))
629 printk("\b\b\b\b%3d%%", nr_pages / m); 574 printk("\b\b\b\b%3d%%", nr_pages / m);
630 nr_pages++; 575 nr_pages++;
631 } 576 }
632 err2 = wait_on_bio_chain(&bio); 577 err2 = hib_wait_on_bio_chain(&bio);
633 do_gettimeofday(&stop); 578 do_gettimeofday(&stop);
634 if (!error) 579 if (!error)
635 error = err2; 580 error = err2;
@@ -657,20 +602,20 @@ int swsusp_read(unsigned int *flags_p)
657 struct snapshot_handle snapshot; 602 struct snapshot_handle snapshot;
658 struct swsusp_info *header; 603 struct swsusp_info *header;
659 604
660 *flags_p = swsusp_header->flags;
661
662 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 605 memset(&snapshot, 0, sizeof(struct snapshot_handle));
663 error = snapshot_write_next(&snapshot, PAGE_SIZE); 606 error = snapshot_write_next(&snapshot);
664 if (error < PAGE_SIZE) 607 if (error < PAGE_SIZE)
665 return error < 0 ? error : -EFAULT; 608 return error < 0 ? error : -EFAULT;
666 header = (struct swsusp_info *)data_of(snapshot); 609 header = (struct swsusp_info *)data_of(snapshot);
667 error = get_swap_reader(&handle, swsusp_header->image); 610 error = get_swap_reader(&handle, flags_p);
611 if (error)
612 goto end;
668 if (!error) 613 if (!error)
669 error = swap_read_page(&handle, header, NULL); 614 error = swap_read_page(&handle, header, NULL);
670 if (!error) 615 if (!error)
671 error = load_image(&handle, &snapshot, header->pages - 1); 616 error = load_image(&handle, &snapshot, header->pages - 1);
672 release_swap_reader(&handle); 617 swap_reader_finish(&handle);
673 618end:
674 if (!error) 619 if (!error)
675 pr_debug("PM: Image successfully loaded\n"); 620 pr_debug("PM: Image successfully loaded\n");
676 else 621 else
@@ -686,11 +631,11 @@ int swsusp_check(void)
686{ 631{
687 int error; 632 int error;
688 633
689 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
690 if (!IS_ERR(resume_bdev)) { 635 if (!IS_ERR(hib_resume_bdev)) {
691 set_blocksize(resume_bdev, PAGE_SIZE); 636 set_blocksize(hib_resume_bdev, PAGE_SIZE);
692 memset(swsusp_header, 0, PAGE_SIZE); 637 memset(swsusp_header, 0, PAGE_SIZE);
693 error = bio_read_page(swsusp_resume_block, 638 error = hib_bio_read_page(swsusp_resume_block,
694 swsusp_header, NULL); 639 swsusp_header, NULL);
695 if (error) 640 if (error)
696 goto put; 641 goto put;
@@ -698,7 +643,7 @@ int swsusp_check(void)
698 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
699 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
700 /* Reset swap signature now */ 645 /* Reset swap signature now */
701 error = bio_write_page(swsusp_resume_block, 646 error = hib_bio_write_page(swsusp_resume_block,
702 swsusp_header, NULL); 647 swsusp_header, NULL);
703 } else { 648 } else {
704 error = -EINVAL; 649 error = -EINVAL;
@@ -706,11 +651,11 @@ int swsusp_check(void)
706 651
707put: 652put:
708 if (error) 653 if (error)
709 blkdev_put(resume_bdev, FMODE_READ); 654 blkdev_put(hib_resume_bdev, FMODE_READ);
710 else 655 else
711 pr_debug("PM: Signature found, resuming\n"); 656 pr_debug("PM: Signature found, resuming\n");
712 } else { 657 } else {
713 error = PTR_ERR(resume_bdev); 658 error = PTR_ERR(hib_resume_bdev);
714 } 659 }
715 660
716 if (error) 661 if (error)
@@ -725,12 +670,12 @@ put:
725 670
726void swsusp_close(fmode_t mode) 671void swsusp_close(fmode_t mode)
727{ 672{
728 if (IS_ERR(resume_bdev)) { 673 if (IS_ERR(hib_resume_bdev)) {
729 pr_debug("PM: Image device not initialised\n"); 674 pr_debug("PM: Image device not initialised\n");
730 return; 675 return;
731 } 676 }
732 677
733 blkdev_put(resume_bdev, mode); 678 blkdev_put(hib_resume_bdev, mode);
734} 679}
735 680
736static int swsusp_header_init(void) 681static int swsusp_header_init(void)
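[Annotation] The "Group swap ops" rework pairs setup and teardown: get_swap_writer() now performs swsusp_swap_check() itself and records the header sector in handle->first_sector, while swap_writer_finish() flushes, stamps the swap signature via mark_swapfiles(), releases the handle and closes the device (freeing all swap pages instead when an error is passed in). A condensed sketch of the resulting write-side lifecycle; next_image_page() is a hypothetical producer standing in for the snapshot loop shown earlier:

	static int example_write_image(unsigned int flags)
	{
		struct swap_map_handle handle;
		struct bio *bio = NULL;
		void *page;
		int error;

		error = get_swap_writer(&handle);	/* opens the device, allocates the first map page */
		if (error)
			return error;

		while (!error && (page = next_image_page()) != NULL)	/* hypothetical producer */
			error = swap_write_page(&handle, page, &bio);

		if (!error)
			error = hib_wait_on_bio_chain(&bio);

		/* On success: flush + mark_swapfiles(); on failure: free_all_swap_pages(); always close. */
		return swap_writer_finish(&handle, flags, error);
	}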
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a8c96212bc1b..e819e17877ca 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
151{ 151{
152 struct snapshot_data *data; 152 struct snapshot_data *data;
153 ssize_t res; 153 ssize_t res;
154 loff_t pg_offp = *offp & ~PAGE_MASK;
154 155
155 mutex_lock(&pm_mutex); 156 mutex_lock(&pm_mutex);
156 157
@@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
159 res = -ENODATA; 160 res = -ENODATA;
160 goto Unlock; 161 goto Unlock;
161 } 162 }
162 res = snapshot_read_next(&data->handle, count); 163 if (!pg_offp) { /* on page boundary? */
163 if (res > 0) { 164 res = snapshot_read_next(&data->handle);
164 if (copy_to_user(buf, data_of(data->handle), res)) 165 if (res <= 0)
165 res = -EFAULT; 166 goto Unlock;
166 else 167 } else {
167 *offp = data->handle.offset; 168 res = PAGE_SIZE - pg_offp;
168 } 169 }
169 170
171 res = simple_read_from_buffer(buf, count, &pg_offp,
172 data_of(data->handle), res);
173 if (res > 0)
174 *offp += res;
175
170 Unlock: 176 Unlock:
171 mutex_unlock(&pm_mutex); 177 mutex_unlock(&pm_mutex);
172 178
@@ -178,18 +184,25 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
178{ 184{
179 struct snapshot_data *data; 185 struct snapshot_data *data;
180 ssize_t res; 186 ssize_t res;
187 loff_t pg_offp = *offp & ~PAGE_MASK;
181 188
182 mutex_lock(&pm_mutex); 189 mutex_lock(&pm_mutex);
183 190
184 data = filp->private_data; 191 data = filp->private_data;
185 res = snapshot_write_next(&data->handle, count); 192
186 if (res > 0) { 193 if (!pg_offp) {
187 if (copy_from_user(data_of(data->handle), buf, res)) 194 res = snapshot_write_next(&data->handle);
188 res = -EFAULT; 195 if (res <= 0)
189 else 196 goto unlock;
190 *offp = data->handle.offset; 197 } else {
198 res = PAGE_SIZE - pg_offp;
191 } 199 }
192 200
201 res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
202 buf, count);
203 if (res > 0)
204 *offp += res;
205unlock:
193 mutex_unlock(&pm_mutex); 206 mutex_unlock(&pm_mutex);
194 207
195 return res; 208 return res;
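[Annotation] snapshot_read()/snapshot_write() above now handle sub-page granularity by pairing the page-at-a-time snapshot calls with the libfs buffer helpers: snapshot_*_next() is invoked only on a page boundary, and the remainder of the current page is shuttled with simple_read_from_buffer()/simple_write_to_buffer() against the page-relative offset pg_offp. The write-side helper is new in this series; the sketch below shows the semantics implied by the call site above (destination buffer, its size, page-relative position, user pointer, byte count) and is an illustration of the expected behaviour, not the libfs source:

	#include <linux/fs.h>
	#include <linux/uaccess.h>

	ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
				       const void __user *from, size_t count)
	{
		loff_t pos = *ppos;

		if (pos < 0)
			return -EINVAL;
		if (pos >= available || !count)
			return 0;
		if (count > available - pos)
			count = available - pos;	/* never run past the buffer */
		if (copy_from_user(to + pos, from, count))
			return -EFAULT;
		*ppos = pos + count;			/* the caller adds the result to *offp */
		return count;
	}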
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 875c8dec940a..88f95e7bab49 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -495,7 +495,7 @@ void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency)
495 s32 beaconint_us; 495 s32 beaconint_us;
496 496
497 if (latency < 0) 497 if (latency < 0)
498 latency = pm_qos_requirement(PM_QOS_NETWORK_LATENCY); 498 latency = pm_qos_request(PM_QOS_NETWORK_LATENCY);
499 499
500 beaconint_us = ieee80211_tu_to_usec( 500 beaconint_us = ieee80211_tu_to_usec(
501 found->vif.bss_conf.beacon_int); 501 found->vif.bss_conf.beacon_int);
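[Annotation] On the consumer side the rename is mechanical: pm_qos_requirement() becomes pm_qos_request() and still returns the aggregated target value for the given class. A one-line sketch of a reader, with the threshold and the powersave hook being illustrative:

	s32 latency = pm_qos_request(PM_QOS_NETWORK_LATENCY);	/* aggregated bound, in microseconds */

	if (latency < LATENCY_THRESHOLD_US)	/* hypothetical threshold */
		skip_powersave();		/* hypothetical, mirrors the mac80211 check above */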
diff --git a/sound/core/pcm.c b/sound/core/pcm.c
index 0d428d0896db..cbe815dfbdc8 100644
--- a/sound/core/pcm.c
+++ b/sound/core/pcm.c
@@ -648,9 +648,6 @@ int snd_pcm_new_stream(struct snd_pcm *pcm, int stream, int substream_count)
648 substream->number = idx; 648 substream->number = idx;
649 substream->stream = stream; 649 substream->stream = stream;
650 sprintf(substream->name, "subdevice #%i", idx); 650 sprintf(substream->name, "subdevice #%i", idx);
651 snprintf(substream->latency_id, sizeof(substream->latency_id),
652 "ALSA-PCM%d-%d%c%d", pcm->card->number, pcm->device,
653 (stream ? 'c' : 'p'), idx);
654 substream->buffer_bytes_max = UINT_MAX; 651 substream->buffer_bytes_max = UINT_MAX;
655 if (prev == NULL) 652 if (prev == NULL)
656 pstr->substream = substream; 653 pstr->substream = substream;
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 20b5982c996b..192dd407512f 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -484,11 +484,13 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream,
484 snd_pcm_timer_resolution_change(substream); 484 snd_pcm_timer_resolution_change(substream);
485 runtime->status->state = SNDRV_PCM_STATE_SETUP; 485 runtime->status->state = SNDRV_PCM_STATE_SETUP;
486 486
487 pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, 487 if (substream->latency_pm_qos_req) {
488 substream->latency_id); 488 pm_qos_remove_request(substream->latency_pm_qos_req);
489 substream->latency_pm_qos_req = NULL;
490 }
489 if ((usecs = period_to_usecs(runtime)) >= 0) 491 if ((usecs = period_to_usecs(runtime)) >= 0)
490 pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, 492 substream->latency_pm_qos_req = pm_qos_add_request(
491 substream->latency_id, usecs); 493 PM_QOS_CPU_DMA_LATENCY, usecs);
492 return 0; 494 return 0;
493 _error: 495 _error:
494 /* hardware might be unuseable from this time, 496 /* hardware might be unuseable from this time,
@@ -543,8 +545,8 @@ static int snd_pcm_hw_free(struct snd_pcm_substream *substream)
543 if (substream->ops->hw_free) 545 if (substream->ops->hw_free)
544 result = substream->ops->hw_free(substream); 546 result = substream->ops->hw_free(substream);
545 runtime->status->state = SNDRV_PCM_STATE_OPEN; 547 runtime->status->state = SNDRV_PCM_STATE_OPEN;
546 pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, 548 pm_qos_remove_request(substream->latency_pm_qos_req);
547 substream->latency_id); 549 substream->latency_pm_qos_req = NULL;
548 return result; 550 return result;
549} 551}
550 552