aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/DocBook/libata.tmpl49
-rw-r--r--Documentation/DocBook/sh.tmpl10
-rw-r--r--Documentation/HOWTO2
-rw-r--r--Documentation/RCU/stallwarn.txt94
-rw-r--r--Documentation/RCU/torture.txt10
-rw-r--r--Documentation/RCU/trace.txt35
-rw-r--r--Documentation/arm/00-INDEX2
-rw-r--r--Documentation/arm/SPEAr/overview.txt60
-rw-r--r--Documentation/cgroups/cgroups.txt3
-rw-r--r--Documentation/credentials.txt14
-rw-r--r--Documentation/feature-removal-schedule.txt66
-rw-r--r--Documentation/filesystems/nfs/nfs41-server.txt2
-rw-r--r--Documentation/filesystems/proc.txt7
-rw-r--r--Documentation/i2c/writing-clients5
-rw-r--r--Documentation/input/elantech.txt8
-rw-r--r--Documentation/intel_txt.txt16
-rw-r--r--Documentation/kernel-parameters.txt20
-rw-r--r--Documentation/kprobes.txt10
-rw-r--r--Documentation/pcmcia/driver-changes.txt13
-rw-r--r--Documentation/power/devices.txt847
-rw-r--r--Documentation/power/pm_qos_interface.txt48
-rw-r--r--Documentation/power/userland-swsusp.txt4
-rw-r--r--Documentation/rbtree.txt58
-rw-r--r--Documentation/scheduler/sched-design-CFS.txt54
-rw-r--r--Documentation/scheduler/sched-rt-group.txt20
-rw-r--r--Documentation/spi/spidev_test.c2
-rw-r--r--Documentation/stable_kernel_rules.txt9
-rw-r--r--Documentation/trace/events.txt3
-rw-r--r--Documentation/trace/ftrace.txt50
-rw-r--r--Documentation/trace/kprobetrace.txt4
30 files changed, 917 insertions, 608 deletions
diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl
index 261b57bc6f08..265c08c96fcd 100644
--- a/Documentation/DocBook/libata.tmpl
+++ b/Documentation/DocBook/libata.tmpl
@@ -107,10 +107,6 @@ void (*dev_config) (struct ata_port *, struct ata_device *);
107 issue of SET FEATURES - XFER MODE, and prior to operation. 107 issue of SET FEATURES - XFER MODE, and prior to operation.
108 </para> 108 </para>
109 <para> 109 <para>
110 Called by ata_device_add() after ata_dev_identify() determines
111 a device is present.
112 </para>
113 <para>
114 This entry may be specified as NULL in ata_port_operations. 110 This entry may be specified as NULL in ata_port_operations.
115 </para> 111 </para>
116 112
@@ -154,8 +150,8 @@ unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned in
154 150
155 <sect2><title>Taskfile read/write</title> 151 <sect2><title>Taskfile read/write</title>
156 <programlisting> 152 <programlisting>
157void (*tf_load) (struct ata_port *ap, struct ata_taskfile *tf); 153void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf);
158void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf); 154void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
159 </programlisting> 155 </programlisting>
160 156
161 <para> 157 <para>
@@ -164,36 +160,35 @@ void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
164 hardware registers / DMA buffers, to obtain the current set of 160 hardware registers / DMA buffers, to obtain the current set of
165 taskfile register values. 161 taskfile register values.
166 Most drivers for taskfile-based hardware (PIO or MMIO) use 162 Most drivers for taskfile-based hardware (PIO or MMIO) use
167 ata_tf_load() and ata_tf_read() for these hooks. 163 ata_sff_tf_load() and ata_sff_tf_read() for these hooks.
168 </para> 164 </para>
169 165
170 </sect2> 166 </sect2>
171 167
172 <sect2><title>PIO data read/write</title> 168 <sect2><title>PIO data read/write</title>
173 <programlisting> 169 <programlisting>
174void (*data_xfer) (struct ata_device *, unsigned char *, unsigned int, int); 170void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int);
175 </programlisting> 171 </programlisting>
176 172
177 <para> 173 <para>
178All bmdma-style drivers must implement this hook. This is the low-level 174All bmdma-style drivers must implement this hook. This is the low-level
179operation that actually copies the data bytes during a PIO data 175operation that actually copies the data bytes during a PIO data
180transfer. 176transfer.
181Typically the driver 177Typically the driver will choose one of ata_sff_data_xfer_noirq(),
182will choose one of ata_pio_data_xfer_noirq(), ata_pio_data_xfer(), or 178ata_sff_data_xfer(), or ata_sff_data_xfer32().
183ata_mmio_data_xfer().
184 </para> 179 </para>
185 180
186 </sect2> 181 </sect2>
187 182
188 <sect2><title>ATA command execute</title> 183 <sect2><title>ATA command execute</title>
189 <programlisting> 184 <programlisting>
190void (*exec_command)(struct ata_port *ap, struct ata_taskfile *tf); 185void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf);
191 </programlisting> 186 </programlisting>
192 187
193 <para> 188 <para>
194 causes an ATA command, previously loaded with 189 causes an ATA command, previously loaded with
195 ->tf_load(), to be initiated in hardware. 190 ->tf_load(), to be initiated in hardware.
196 Most drivers for taskfile-based hardware use ata_exec_command() 191 Most drivers for taskfile-based hardware use ata_sff_exec_command()
197 for this hook. 192 for this hook.
198 </para> 193 </para>
199 194
@@ -218,8 +213,8 @@ command.
218 213
219 <sect2><title>Read specific ATA shadow registers</title> 214 <sect2><title>Read specific ATA shadow registers</title>
220 <programlisting> 215 <programlisting>
221u8 (*check_status)(struct ata_port *ap); 216u8 (*sff_check_status)(struct ata_port *ap);
222u8 (*check_altstatus)(struct ata_port *ap); 217u8 (*sff_check_altstatus)(struct ata_port *ap);
223 </programlisting> 218 </programlisting>
224 219
225 <para> 220 <para>
@@ -227,20 +222,14 @@ u8 (*check_altstatus)(struct ata_port *ap);
227 hardware. On some hardware, reading the Status register has 222 hardware. On some hardware, reading the Status register has
228 the side effect of clearing the interrupt condition. 223 the side effect of clearing the interrupt condition.
229 Most drivers for taskfile-based hardware use 224 Most drivers for taskfile-based hardware use
230 ata_check_status() for this hook. 225 ata_sff_check_status() for this hook.
231 </para>
232 <para>
233 Note that because this is called from ata_device_add(), at
234 least a dummy function that clears device interrupts must be
235 provided for all drivers, even if the controller doesn't
236 actually have a taskfile status register.
237 </para> 226 </para>
238 227
239 </sect2> 228 </sect2>
240 229
241 <sect2><title>Select ATA device on bus</title> 230 <sect2><title>Select ATA device on bus</title>
242 <programlisting> 231 <programlisting>
243void (*dev_select)(struct ata_port *ap, unsigned int device); 232void (*sff_dev_select)(struct ata_port *ap, unsigned int device);
244 </programlisting> 233 </programlisting>
245 234
246 <para> 235 <para>
@@ -251,9 +240,7 @@ void (*dev_select)(struct ata_port *ap, unsigned int device);
251 </para> 240 </para>
252 <para> 241 <para>
253 Most drivers for taskfile-based hardware use 242 Most drivers for taskfile-based hardware use
254 ata_std_dev_select() for this hook. Controllers which do not 243 ata_sff_dev_select() for this hook.
255 support second drives on a port (such as SATA contollers) will
256 use ata_noop_dev_select().
257 </para> 244 </para>
258 245
259 </sect2> 246 </sect2>
@@ -441,13 +428,13 @@ void (*irq_clear) (struct ata_port *);
441 to struct ata_host_set. 428 to struct ata_host_set.
442 </para> 429 </para>
443 <para> 430 <para>
444 Most legacy IDE drivers use ata_interrupt() for the 431 Most legacy IDE drivers use ata_sff_interrupt() for the
445 irq_handler hook, which scans all ports in the host_set, 432 irq_handler hook, which scans all ports in the host_set,
446 determines which queued command was active (if any), and calls 433 determines which queued command was active (if any), and calls
447 ata_host_intr(ap,qc). 434 ata_sff_host_intr(ap,qc).
448 </para> 435 </para>
449 <para> 436 <para>
450 Most legacy IDE drivers use ata_bmdma_irq_clear() for the 437 Most legacy IDE drivers use ata_sff_irq_clear() for the
451 irq_clear() hook, which simply clears the interrupt and error 438 irq_clear() hook, which simply clears the interrupt and error
452 flags in the DMA status register. 439 flags in the DMA status register.
453 </para> 440 </para>
@@ -496,10 +483,6 @@ void (*host_stop) (struct ata_host_set *host_set);
496 data from port at this time. 483 data from port at this time.
497 </para> 484 </para>
498 <para> 485 <para>
499 Many drivers use ata_port_stop() as this hook, which frees the
500 PRD table.
501 </para>
502 <para>
503 ->host_stop() is called after all ->port_stop() calls 486 ->host_stop() is called after all ->port_stop() calls
504have completed. The hook must finalize hardware shutdown, release DMA 487have completed. The hook must finalize hardware shutdown, release DMA
505and other resources, etc. 488and other resources, etc.
diff --git a/Documentation/DocBook/sh.tmpl b/Documentation/DocBook/sh.tmpl
index 0c3dc4c69dd1..d858d92cf6d9 100644
--- a/Documentation/DocBook/sh.tmpl
+++ b/Documentation/DocBook/sh.tmpl
@@ -19,13 +19,17 @@
19 </authorgroup> 19 </authorgroup>
20 20
21 <copyright> 21 <copyright>
22 <year>2008</year> 22 <year>2008-2010</year>
23 <holder>Paul Mundt</holder> 23 <holder>Paul Mundt</holder>
24 </copyright> 24 </copyright>
25 <copyright> 25 <copyright>
26 <year>2008</year> 26 <year>2008-2010</year>
27 <holder>Renesas Technology Corp.</holder> 27 <holder>Renesas Technology Corp.</holder>
28 </copyright> 28 </copyright>
29 <copyright>
30 <year>2010</year>
31 <holder>Renesas Electronics Corp.</holder>
32 </copyright>
29 33
30 <legalnotice> 34 <legalnotice>
31 <para> 35 <para>
@@ -77,7 +81,7 @@
77 </chapter> 81 </chapter>
78 <chapter id="clk"> 82 <chapter id="clk">
79 <title>Clock Framework Extensions</title> 83 <title>Clock Framework Extensions</title>
80!Iarch/sh/include/asm/clock.h 84!Iinclude/linux/sh_clk.h
81 </chapter> 85 </chapter>
82 <chapter id="mach"> 86 <chapter id="mach">
83 <title>Machine Specific Interfaces</title> 87 <title>Machine Specific Interfaces</title>
diff --git a/Documentation/HOWTO b/Documentation/HOWTO
index f5395af88a41..40ada93b820a 100644
--- a/Documentation/HOWTO
+++ b/Documentation/HOWTO
@@ -234,7 +234,7 @@ process is as follows:
234 Linus, usually the patches that have already been included in the 234 Linus, usually the patches that have already been included in the
235 -next kernel for a few weeks. The preferred way to submit big changes 235 -next kernel for a few weeks. The preferred way to submit big changes
236 is using git (the kernel's source management tool, more information 236 is using git (the kernel's source management tool, more information
237 can be found at http://git.or.cz/) but plain patches are also just 237 can be found at http://git-scm.com/) but plain patches are also just
238 fine. 238 fine.
239 - After two weeks a -rc1 kernel is released it is now possible to push 239 - After two weeks a -rc1 kernel is released it is now possible to push
240 only patches that do not include new features that could affect the 240 only patches that do not include new features that could affect the
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 1423d2570d78..44c6dcc93d6d 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -3,35 +3,79 @@ Using RCU's CPU Stall Detector
3The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables 3The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables
4RCU's CPU stall detector, which detects conditions that unduly delay 4RCU's CPU stall detector, which detects conditions that unduly delay
5RCU grace periods. The stall detector's idea of what constitutes 5RCU grace periods. The stall detector's idea of what constitutes
6"unduly delayed" is controlled by a pair of C preprocessor macros: 6"unduly delayed" is controlled by a set of C preprocessor macros:
7 7
8RCU_SECONDS_TILL_STALL_CHECK 8RCU_SECONDS_TILL_STALL_CHECK
9 9
10 This macro defines the period of time that RCU will wait from 10 This macro defines the period of time that RCU will wait from
11 the beginning of a grace period until it issues an RCU CPU 11 the beginning of a grace period until it issues an RCU CPU
12 stall warning. It is normally ten seconds. 12 stall warning. This time period is normally ten seconds.
13 13
14RCU_SECONDS_TILL_STALL_RECHECK 14RCU_SECONDS_TILL_STALL_RECHECK
15 15
16 This macro defines the period of time that RCU will wait after 16 This macro defines the period of time that RCU will wait after
17 issuing a stall warning until it issues another stall warning. 17 issuing a stall warning until it issues another stall warning
18 It is normally set to thirty seconds. 18 for the same stall. This time period is normally set to thirty
19 seconds.
19 20
20RCU_STALL_RAT_DELAY 21RCU_STALL_RAT_DELAY
21 22
22 The CPU stall detector tries to make the offending CPU rat on itself, 23 The CPU stall detector tries to make the offending CPU print its
23 as this often gives better-quality stack traces. However, if 24 own warnings, as this often gives better-quality stack traces.
24 the offending CPU does not detect its own stall in the number 25 However, if the offending CPU does not detect its own stall in
25 of jiffies specified by RCU_STALL_RAT_DELAY, then other CPUs will 26 the number of jiffies specified by RCU_STALL_RAT_DELAY, then
26 complain. This is normally set to two jiffies. 27 some other CPU will complain. This delay is normally set to
28 two jiffies.
27 29
28The following problems can result in an RCU CPU stall warning: 30When a CPU detects that it is stalling, it will print a message similar
31to the following:
32
33INFO: rcu_sched_state detected stall on CPU 5 (t=2500 jiffies)
34
35This message indicates that CPU 5 detected that it was causing a stall,
36and that the stall was affecting RCU-sched. This message will normally be
37followed by a stack dump of the offending CPU. On TREE_RCU kernel builds,
38RCU and RCU-sched are implemented by the same underlying mechanism,
39while on TREE_PREEMPT_RCU kernel builds, RCU is instead implemented
40by rcu_preempt_state.
41
42On the other hand, if the offending CPU fails to print out a stall-warning
43message quickly enough, some other CPU will print a message similar to
44the following:
45
46INFO: rcu_bh_state detected stalls on CPUs/tasks: { 3 5 } (detected by 2, 2502 jiffies)
47
48This message indicates that CPU 2 detected that CPUs 3 and 5 were both
49causing stalls, and that the stall was affecting RCU-bh. This message
50will normally be followed by stack dumps for each CPU. Please note that
51TREE_PREEMPT_RCU builds can be stalled by tasks as well as by CPUs,
52and that the tasks will be indicated by PID, for example, "P3421".
53It is even possible for a rcu_preempt_state stall to be caused by both
54CPUs -and- tasks, in which case the offending CPUs and tasks will all
55be called out in the list.
56
57Finally, if the grace period ends just as the stall warning starts
58printing, there will be a spurious stall-warning message:
59
60INFO: rcu_bh_state detected stalls on CPUs/tasks: { } (detected by 4, 2502 jiffies)
61
62This is rare, but does happen from time to time in real life.
63
64So your kernel printed an RCU CPU stall warning. The next question is
65"What caused it?" The following problems can result in RCU CPU stall
66warnings:
29 67
30o A CPU looping in an RCU read-side critical section. 68o A CPU looping in an RCU read-side critical section.
31 69
32o A CPU looping with interrupts disabled. 70o A CPU looping with interrupts disabled. This condition can
71 result in RCU-sched and RCU-bh stalls.
33 72
34o A CPU looping with preemption disabled. 73o A CPU looping with preemption disabled. This condition can
74 result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
75 stalls.
76
77o A CPU looping with bottom halves disabled. This condition can
78 result in RCU-sched and RCU-bh stalls.
35 79
36o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel 80o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
37 without invoking schedule(). 81 without invoking schedule().
@@ -39,20 +83,24 @@ o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
39o A bug in the RCU implementation. 83o A bug in the RCU implementation.
40 84
41o A hardware failure. This is quite unlikely, but has occurred 85o A hardware failure. This is quite unlikely, but has occurred
42 at least once in a former life. A CPU failed in a running system, 86 at least once in real life. A CPU failed in a running system,
43 becoming unresponsive, but not causing an immediate crash. 87 becoming unresponsive, but not causing an immediate crash.
44 This resulted in a series of RCU CPU stall warnings, eventually 88 This resulted in a series of RCU CPU stall warnings, eventually
45 leading the realization that the CPU had failed. 89 leading the realization that the CPU had failed.
46 90
47The RCU, RCU-sched, and RCU-bh implementations have CPU stall warning. 91The RCU, RCU-sched, and RCU-bh implementations have CPU stall
48SRCU does not do so directly, but its calls to synchronize_sched() will 92warning. SRCU does not have its own CPU stall warnings, but its
49result in RCU-sched detecting any CPU stalls that might be occurring. 93calls to synchronize_sched() will result in RCU-sched detecting
50 94RCU-sched-related CPU stalls. Please note that RCU only detects
51To diagnose the cause of the stall, inspect the stack traces. The offending 95CPU stalls when there is a grace period in progress. No grace period,
52function will usually be near the top of the stack. If you have a series 96no CPU stall warnings.
53of stall warnings from a single extended stall, comparing the stack traces 97
54can often help determine where the stall is occurring, which will usually 98To diagnose the cause of the stall, inspect the stack traces.
55be in the function nearest the top of the stack that stays the same from 99The offending function will usually be near the top of the stack.
56trace to trace. 100If you have a series of stall warnings from a single extended stall,
101comparing the stack traces can often help determine where the stall
102is occurring, which will usually be in the function nearest the top of
103that portion of the stack which remains the same from trace to trace.
104If you can reliably trigger the stall, ftrace can be quite helpful.
57 105
58RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE. 106RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE.
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 0e50bc2aa1e2..5d9016795fd8 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following:
182 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 182 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0
183 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 183 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0
184 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 184 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
185 state: -1 / 0:0 3:0 4:0
186
187As before, the first four lines are similar to those for RCU.
188The last line shows the task-migration state. The first number is
189-1 if synchronize_sched_expedited() is idle, -2 if in the process of
190posting wakeups to the migration kthreads, and N when waiting on CPU N.
191Each of the colon-separated fields following the "/" is a CPU:state pair.
192Valid states are "0" for idle, "1" for waiting for quiescent state,
193"2" for passed through quiescent state, and "3" when a race with a
194CPU-hotplug event forces use of the synchronize_sched() primitive.
195 185
196 186
197USAGE 187USAGE
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 8608fd85e921..efd8cc95c06b 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -256,23 +256,23 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
256The output of "cat rcu/rcu_pending" looks as follows: 256The output of "cat rcu/rcu_pending" looks as follows:
257 257
258rcu_sched: 258rcu_sched:
259 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 259 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
260 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 260 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
261 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 261 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
262 3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 262 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
263 4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 263 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
264 5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 264 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
265 6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 265 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
266 7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 266 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
267rcu_bh: 267rcu_bh:
268 0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 268 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
269 1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 269 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
270 2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 270 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
271 3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 271 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
272 4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 272 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
273 5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 273 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
274 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 274 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
275 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 275 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
276 276
277As always, this is once again split into "rcu_sched" and "rcu_bh" 277As always, this is once again split into "rcu_sched" and "rcu_bh"
278portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional 278portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
@@ -284,6 +284,9 @@ o "np" is the number of times that __rcu_pending() has been invoked
284o "qsp" is the number of times that the RCU was waiting for a 284o "qsp" is the number of times that the RCU was waiting for a
285 quiescent state from this CPU. 285 quiescent state from this CPU.
286 286
287o "rpq" is the number of times that the CPU had passed through
288 a quiescent state, but not yet reported it to RCU.
289
287o "cbr" is the number of times that this CPU had RCU callbacks 290o "cbr" is the number of times that this CPU had RCU callbacks
288 that had passed through a grace period, and were thus ready 291 that had passed through a grace period, and were thus ready
289 to be invoked. 292 to be invoked.
diff --git a/Documentation/arm/00-INDEX b/Documentation/arm/00-INDEX
index 82e418d648d0..7f5fc3ba9c91 100644
--- a/Documentation/arm/00-INDEX
+++ b/Documentation/arm/00-INDEX
@@ -20,6 +20,8 @@ Samsung-S3C24XX
20 - S3C24XX ARM Linux Overview 20 - S3C24XX ARM Linux Overview
21Sharp-LH 21Sharp-LH
22 - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC) 22 - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC)
23SPEAr
24 - ST SPEAr platform Linux Overview
23VFP/ 25VFP/
24 - Release notes for Linux Kernel Vector Floating Point support code 26 - Release notes for Linux Kernel Vector Floating Point support code
25empeg/ 27empeg/
diff --git a/Documentation/arm/SPEAr/overview.txt b/Documentation/arm/SPEAr/overview.txt
new file mode 100644
index 000000000000..253a35c6f782
--- /dev/null
+++ b/Documentation/arm/SPEAr/overview.txt
@@ -0,0 +1,60 @@
1 SPEAr ARM Linux Overview
2 ==========================
3
4Introduction
5------------
6
7 SPEAr (Structured Processor Enhanced Architecture).
8 weblink : http://www.st.com/spear
9
10 The ST Microelectronics SPEAr range of ARM9/CortexA9 System-on-Chip CPUs are
11 supported by the 'spear' platform of ARM Linux. Currently SPEAr300,
12 SPEAr310, SPEAr320 and SPEAr600 SOCs are supported. Support for the SPEAr13XX
13 series is in progress.
14
15 Hierarchy in SPEAr is as follows:
16
17 SPEAr (Platform)
18 - SPEAr3XX (3XX SOC series, based on ARM9)
19 - SPEAr300 (SOC)
20 - SPEAr300_EVB (Evaluation Board)
21 - SPEAr310 (SOC)
22 - SPEAr310_EVB (Evaluation Board)
23 - SPEAr320 (SOC)
24 - SPEAr320_EVB (Evaluation Board)
25 - SPEAr6XX (6XX SOC series, based on ARM9)
26 - SPEAr600 (SOC)
27 - SPEAr600_EVB (Evaluation Board)
28 - SPEAr13XX (13XX SOC series, based on ARM CORTEXA9)
29 - SPEAr1300 (SOC)
30
31 Configuration
32 -------------
33
34 A generic configuration is provided for each machine, and can be used as the
35 default by
36 make spear600_defconfig
37 make spear300_defconfig
38 make spear310_defconfig
39 make spear320_defconfig
40
41 Layout
42 ------
43
44 The common files for multiple machine families (SPEAr3XX, SPEAr6XX and
45 SPEAr13XX) are located in the platform code contained in arch/arm/plat-spear
46 with headers in plat/.
47
48 Each machine series have a directory with name arch/arm/mach-spear followed by
49 series name. Like mach-spear3xx, mach-spear6xx and mach-spear13xx.
50
51 Common file for machines of spear3xx family is mach-spear3xx/spear3xx.c and for
52 spear6xx is mach-spear6xx/spear6xx.c. mach-spear* also contain soc/machine
53 specific files, like spear300.c, spear310.c, spear320.c and spear600.c.
54 mach-spear* also contains board specific files for each machine type.
55
56
57 Document Author
58 ---------------
59
60 Viresh Kumar, (c) 2010 ST Microelectronics
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 5eb279a48fa4..57444c2609fc 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -235,8 +235,7 @@ containing the following files describing that cgroup:
235 - cgroup.procs: list of tgids in the cgroup. This list is not 235 - cgroup.procs: list of tgids in the cgroup. This list is not
236 guaranteed to be sorted or free of duplicate tgids, and userspace 236 guaranteed to be sorted or free of duplicate tgids, and userspace
237 should sort/uniquify the list if this property is required. 237 should sort/uniquify the list if this property is required.
238 Writing a tgid into this file moves all threads with that tgid into 238 This is a read-only file, for now.
239 this cgroup.
240 - notify_on_release flag: run the release agent on exit? 239 - notify_on_release flag: run the release agent on exit?
241 - release_agent: the path to use for release notifications (this file 240 - release_agent: the path to use for release notifications (this file
242 exists in the top cgroup only) 241 exists in the top cgroup only)
diff --git a/Documentation/credentials.txt b/Documentation/credentials.txt
index df03169782ea..a2db35287003 100644
--- a/Documentation/credentials.txt
+++ b/Documentation/credentials.txt
@@ -408,9 +408,6 @@ This should be used inside the RCU read lock, as in the following example:
408 ... 408 ...
409 } 409 }
410 410
411A function need not get RCU read lock to use __task_cred() if it is holding a
412spinlock at the time as this implicitly holds the RCU read lock.
413
414Should it be necessary to hold another task's credentials for a long period of 411Should it be necessary to hold another task's credentials for a long period of
415time, and possibly to sleep whilst doing so, then the caller should get a 412time, and possibly to sleep whilst doing so, then the caller should get a
416reference on them using: 413reference on them using:
@@ -426,17 +423,16 @@ credentials, hiding the RCU magic from the caller:
426 uid_t task_uid(task) Task's real UID 423 uid_t task_uid(task) Task's real UID
427 uid_t task_euid(task) Task's effective UID 424 uid_t task_euid(task) Task's effective UID
428 425
429If the caller is holding a spinlock or the RCU read lock at the time anyway, 426If the caller is holding the RCU read lock at the time anyway, then:
430then:
431 427
432 __task_cred(task)->uid 428 __task_cred(task)->uid
433 __task_cred(task)->euid 429 __task_cred(task)->euid
434 430
435should be used instead. Similarly, if multiple aspects of a task's credentials 431should be used instead. Similarly, if multiple aspects of a task's credentials
436need to be accessed, RCU read lock or a spinlock should be used, __task_cred() 432need to be accessed, RCU read lock should be used, __task_cred() called, the
437called, the result stored in a temporary pointer and then the credential 433result stored in a temporary pointer and then the credential aspects called
438aspects called from that before dropping the lock. This prevents the 434from that before dropping the lock. This prevents the potentially expensive
439potentially expensive RCU magic from being invoked multiple times. 435RCU magic from being invoked multiple times.
440 436
441Should some other single aspect of another task's credentials need to be 437Should some other single aspect of another task's credentials need to be
442accessed, then this can be used: 438accessed, then this can be used:
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index ed511af0f79a..e7965f4a385a 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -520,29 +520,6 @@ Who: Hans de Goede <hdegoede@redhat.com>
520 520
521---------------------------- 521----------------------------
522 522
523What: corgikbd, spitzkbd, tosakbd driver
524When: 2.6.35
525Files: drivers/input/keyboard/{corgi,spitz,tosa}kbd.c
526Why: We now have a generic GPIO based matrix keyboard driver that
527 are fully capable of handling all the keys on these devices.
528 The original drivers manipulate the GPIO registers directly
529 and so are difficult to maintain.
530Who: Eric Miao <eric.y.miao@gmail.com>
531
532----------------------------
533
534What: corgi_ssp and corgi_ts driver
535When: 2.6.35
536Files: arch/arm/mach-pxa/corgi_ssp.c, drivers/input/touchscreen/corgi_ts.c
537Why: The corgi touchscreen is now deprecated in favour of the generic
538 ads7846.c driver. The noise reduction technique used in corgi_ts.c,
539 that's to wait till vsync before ADC sampling, is also integrated into
540 ads7846 driver now. Provided that the original driver is not generic
541 and is difficult to maintain, it will be removed later.
542Who: Eric Miao <eric.y.miao@gmail.com>
543
544----------------------------
545
546What: capifs 523What: capifs
547When: February 2011 524When: February 2011
548Files: drivers/isdn/capi/capifs.* 525Files: drivers/isdn/capi/capifs.*
@@ -564,6 +541,16 @@ Who: Avi Kivity <avi@redhat.com>
564 541
565---------------------------- 542----------------------------
566 543
544What: xtime, wall_to_monotonic
545When: 2.6.36+
546Files: kernel/time/timekeeping.c include/linux/time.h
547Why: Cleaning up timekeeping internal values. Please use
548 existing timekeeping accessor functions to access
549 the equivalent functionality.
550Who: John Stultz <johnstul@us.ibm.com>
551
552----------------------------
553
567What: KVM kernel-allocated memory slots 554What: KVM kernel-allocated memory slots
568When: July 2010 555When: July 2010
569Why: Since 2.6.25, kvm supports user-allocated memory slots, which are 556Why: Since 2.6.25, kvm supports user-allocated memory slots, which are
@@ -589,3 +576,36 @@ Why: Useful in 2003, implementation is a hack.
589 Generally invoked by accident today. 576 Generally invoked by accident today.
590 Seen as doing more harm than good. 577 Seen as doing more harm than good.
591Who: Len Brown <len.brown@intel.com> 578Who: Len Brown <len.brown@intel.com>
579
580----------------------------
581
582What: video4linux /dev/vtx teletext API support
583When: 2.6.35
584Files: drivers/media/video/saa5246a.c drivers/media/video/saa5249.c
585 include/linux/videotext.h
586Why: The vtx device nodes have been superseded by vbi device nodes
587 for many years. No applications exist that use the vtx support.
588 Of the two i2c drivers that actually support this API the saa5249
589 has been impossible to use for a year now and no known hardware
590 that supports this device exists. The saa5246a is theoretically
591 supported by the old mxb boards, but it never actually worked.
592
593 In summary: there is no hardware that can use this API and there
594 are no applications actually implementing this API.
595
596 The vtx support still reserves minors 192-223 and we would really
597 like to reuse those for upcoming new functionality. In the unlikely
598 event that new hardware appears that wants to use the functionality
599 provided by the vtx API, then that functionality should be build
600 around the sliced VBI API instead.
601Who: Hans Verkuil <hverkuil@xs4all.nl>
602
603----------------------------
604
605What: IRQF_DISABLED
606When: 2.6.36
607Why: The flag is a NOOP as we run interrupt handlers with interrupts disabled
608Who: Thomas Gleixner <tglx@linutronix.de>
609
610----------------------------
611
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index 6a53a84afc72..04884914a1c8 100644
--- a/Documentation/filesystems/nfs/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | | Section 18.17 |
137 | READ | REQ | | Section 18.22 | 137 | READ | REQ | | Section 18.22 |
138 | READDIR | REQ | | Section 18.23 | 138 | READDIR | REQ | | Section 18.23 |
139 | READLINK | OPT | | Section 18.24 | 139 | READLINK | OPT | | Section 18.24 |
140NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | 140 | RECLAIM_COMPLETE | REQ | | Section 18.51 |
141 | RELEASE_LOCKOWNER | MNI | | N/A | 141 | RELEASE_LOCKOWNER | MNI | | N/A |
142 | REMOVE | REQ | | Section 18.25 | 142 | REMOVE | REQ | | Section 18.25 |
143 | RENAME | REQ | | Section 18.26 | 143 | RENAME | REQ | | Section 18.26 |
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index f6b1b5fca1df..9fb6cbe70bde 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -316,7 +316,7 @@ address perms offset dev inode pathname
31608049000-0804a000 rw-p 00001000 03:00 8312 /opt/test 31608049000-0804a000 rw-p 00001000 03:00 8312 /opt/test
3170804a000-0806b000 rw-p 00000000 00:00 0 [heap] 3170804a000-0806b000 rw-p 00000000 00:00 0 [heap]
318a7cb1000-a7cb2000 ---p 00000000 00:00 0 318a7cb1000-a7cb2000 ---p 00000000 00:00 0
319a7cb2000-a7eb2000 rw-p 00000000 00:00 0 [threadstack:001ff4b4] 319a7cb2000-a7eb2000 rw-p 00000000 00:00 0
320a7eb2000-a7eb3000 ---p 00000000 00:00 0 320a7eb2000-a7eb3000 ---p 00000000 00:00 0
321a7eb3000-a7ed5000 rw-p 00000000 00:00 0 321a7eb3000-a7ed5000 rw-p 00000000 00:00 0
322a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 322a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
@@ -352,7 +352,6 @@ is not associated with a file:
352 [stack] = the stack of the main process 352 [stack] = the stack of the main process
353 [vdso] = the "virtual dynamic shared object", 353 [vdso] = the "virtual dynamic shared object",
354 the kernel system call handler 354 the kernel system call handler
355 [threadstack:xxxxxxxx] = the stack of the thread, xxxxxxxx is the stack size
356 355
357 or if empty, the mapping is anonymous. 356 or if empty, the mapping is anonymous.
358 357
@@ -566,6 +565,10 @@ The default_smp_affinity mask applies to all non-active IRQs, which are the
566IRQs which have not yet been allocated/activated, and hence which lack a 565IRQs which have not yet been allocated/activated, and hence which lack a
567/proc/irq/[0-9]* directory. 566/proc/irq/[0-9]* directory.
568 567
568The node file on an SMP system shows the node to which the device using the IRQ
569reports itself as being attached. This hardware locality information does not
570include information about any possible driver locality preference.
571
569prof_cpu_mask specifies which CPUs are to be profiled by the system wide 572prof_cpu_mask specifies which CPUs are to be profiled by the system wide
570profiler. Default value is ffffffff (all cpus). 573profiler. Default value is ffffffff (all cpus).
571 574
diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients
index 3219ee0dbfef..5ebf5af1d716 100644
--- a/Documentation/i2c/writing-clients
+++ b/Documentation/i2c/writing-clients
@@ -74,6 +74,11 @@ structure at all. You should use this to keep device-specific data.
74 /* retrieve the value */ 74 /* retrieve the value */
75 void *i2c_get_clientdata(const struct i2c_client *client); 75 void *i2c_get_clientdata(const struct i2c_client *client);
76 76
77Note that starting with kernel 2.6.34, you don't have to set the `data' field
78to NULL in remove() or if probe() failed anymore. The i2c-core does this
79automatically on these occasions. Those are also the only times the core will
80touch this field.
81
77 82
78Accessing the client 83Accessing the client
79==================== 84====================
diff --git a/Documentation/input/elantech.txt b/Documentation/input/elantech.txt
index a10c3b6ba7c4..56941ae1f5db 100644
--- a/Documentation/input/elantech.txt
+++ b/Documentation/input/elantech.txt
@@ -333,14 +333,14 @@ byte 0:
333byte 1: 333byte 1:
334 334
335 bit 7 6 5 4 3 2 1 0 335 bit 7 6 5 4 3 2 1 0
336 x15 x14 x13 x12 x11 x10 x9 x8 336 . . . . . x10 x9 x8
337 337
338byte 2: 338byte 2:
339 339
340 bit 7 6 5 4 3 2 1 0 340 bit 7 6 5 4 3 2 1 0
341 x7 x6 x5 x4 x4 x2 x1 x0 341 x7 x6 x5 x4 x4 x2 x1 x0
342 342
343 x15..x0 = absolute x value (horizontal) 343 x10..x0 = absolute x value (horizontal)
344 344
345byte 3: 345byte 3:
346 346
@@ -350,14 +350,14 @@ byte 3:
350byte 4: 350byte 4:
351 351
352 bit 7 6 5 4 3 2 1 0 352 bit 7 6 5 4 3 2 1 0
353 y15 y14 y13 y12 y11 y10 y8 y8 353 . . . . . . y9 y8
354 354
355byte 5: 355byte 5:
356 356
357 bit 7 6 5 4 3 2 1 0 357 bit 7 6 5 4 3 2 1 0
358 y7 y6 y5 y4 y3 y2 y1 y0 358 y7 y6 y5 y4 y3 y2 y1 y0
359 359
360 y15..y0 = absolute y value (vertical) 360 y9..y0 = absolute y value (vertical)
361 361
362 362
3634.2.2 Two finger touch 3634.2.2 Two finger touch
diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt
index 1423bcc7c507..5dc59b04a71f 100644
--- a/Documentation/intel_txt.txt
+++ b/Documentation/intel_txt.txt
@@ -161,13 +161,15 @@ o In order to put a system into any of the sleep states after a TXT
161 has been restored, it will restore the TPM PCRs and then 161 has been restored, it will restore the TPM PCRs and then
162 transfer control back to the kernel's S3 resume vector. 162 transfer control back to the kernel's S3 resume vector.
163 In order to preserve system integrity across S3, the kernel 163 In order to preserve system integrity across S3, the kernel
164 provides tboot with a set of memory ranges (kernel 164 provides tboot with a set of memory ranges (RAM and RESERVED_KERN
165 code/data/bss, S3 resume code, and AP trampoline) that tboot 165 in the e820 table, but not any memory that BIOS might alter over
166 will calculate a MAC (message authentication code) over and then 166 the S3 transition) that tboot will calculate a MAC (message
167 seal with the TPM. On resume and once the measured environment 167 authentication code) over and then seal with the TPM. On resume
168 has been re-established, tboot will re-calculate the MAC and 168 and once the measured environment has been re-established, tboot
169 verify it against the sealed value. Tboot's policy determines 169 will re-calculate the MAC and verify it against the sealed value.
170 what happens if the verification fails. 170 Tboot's policy determines what happens if the verification fails.
171 Note that the c/s 194 of tboot which has the new MAC code supports
172 this.
171 173
172That's pretty much it for TXT support. 174That's pretty much it for TXT support.
173 175
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e2202e93b148..b9b0d7989f4e 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -99,6 +99,7 @@ parameter is applicable:
99 SWSUSP Software suspend (hibernation) is enabled. 99 SWSUSP Software suspend (hibernation) is enabled.
100 SUSPEND System suspend states are enabled. 100 SUSPEND System suspend states are enabled.
101 FTRACE Function tracing enabled. 101 FTRACE Function tracing enabled.
102 TPM TPM drivers are enabled.
102 TS Appropriate touchscreen support is enabled. 103 TS Appropriate touchscreen support is enabled.
103 UMS USB Mass Storage support is enabled. 104 UMS USB Mass Storage support is enabled.
104 USB USB support is enabled. 105 USB USB support is enabled.
@@ -324,6 +325,8 @@ and is between 256 and 4096 characters. It is defined in the file
324 they are unmapped. Otherwise they are 325 they are unmapped. Otherwise they are
325 flushed before they will be reused, which 326 flushed before they will be reused, which
326 is a lot of faster 327 is a lot of faster
328 off - do not initialize any AMD IOMMU found in
329 the system
327 330
328 amijoy.map= [HW,JOY] Amiga joystick support 331 amijoy.map= [HW,JOY] Amiga joystick support
329 Map of devices attached to JOY0DAT and JOY1DAT 332 Map of devices attached to JOY0DAT and JOY1DAT
@@ -784,8 +787,12 @@ and is between 256 and 4096 characters. It is defined in the file
784 as early as possible in order to facilitate early 787 as early as possible in order to facilitate early
785 boot debugging. 788 boot debugging.
786 789
787 ftrace_dump_on_oops 790 ftrace_dump_on_oops[=orig_cpu]
788 [FTRACE] will dump the trace buffers on oops. 791 [FTRACE] will dump the trace buffers on oops.
792 If no parameter is passed, ftrace will dump
793 buffers of all CPUs, but if you pass orig_cpu, it will
794 dump only the buffer of the CPU that triggered the
795 oops.
789 796
790 ftrace_filter=[function-list] 797 ftrace_filter=[function-list]
791 [FTRACE] Limit the functions traced by the function 798 [FTRACE] Limit the functions traced by the function
@@ -1194,7 +1201,7 @@ and is between 256 and 4096 characters. It is defined in the file
1194 1201
1195 libata.force= [LIBATA] Force configurations. The format is comma 1202 libata.force= [LIBATA] Force configurations. The format is comma
1196 separated list of "[ID:]VAL" where ID is 1203 separated list of "[ID:]VAL" where ID is
1197 PORT[:DEVICE]. PORT and DEVICE are decimal numbers 1204 PORT[.DEVICE]. PORT and DEVICE are decimal numbers
1198 matching port, link or device. Basically, it matches 1205 matching port, link or device. Basically, it matches
1199 the ATA ID string printed on console by libata. If 1206 the ATA ID string printed on console by libata. If
1200 the whole ID part is omitted, the last PORT and DEVICE 1207 the whole ID part is omitted, the last PORT and DEVICE
@@ -2610,6 +2617,15 @@ and is between 256 and 4096 characters. It is defined in the file
2610 2617
2611 tp720= [HW,PS2] 2618 tp720= [HW,PS2]
2612 2619
2620 tpm_suspend_pcr=[HW,TPM]
2621 Format: integer pcr id
2622 Specify that at suspend time, the tpm driver
2623 should extend the specified pcr with zeros,
2624 as a workaround for some chips which fail to
2625 flush the last written pcr on TPM_SaveState.
2626 This will guarantee that all the other pcrs
2627 are saved.
2628
2613 trace_buf_size=nn[KMG] 2629 trace_buf_size=nn[KMG]
2614 [FTRACE] will set tracing buffer size. 2630 [FTRACE] will set tracing buffer size.
2615 2631
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 51ec634ac04b..6653017680dd 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -165,8 +165,8 @@ the user entry_handler invocation is also skipped.
165 165
1661.4 How Does Jump Optimization Work? 1661.4 How Does Jump Optimization Work?
167 167
168If you configured your kernel with CONFIG_OPTPROBES=y (currently 168If your kernel is built with CONFIG_OPTPROBES=y (currently this flag
169this option is supported on x86/x86-64, non-preemptive kernel) and 169is automatically set 'y' on x86/x86-64, non-preemptive kernel) and
170the "debug.kprobes_optimization" kernel parameter is set to 1 (see 170the "debug.kprobes_optimization" kernel parameter is set to 1 (see
171sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump 171sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump
172instruction instead of a breakpoint instruction at each probepoint. 172instruction instead of a breakpoint instruction at each probepoint.
@@ -271,8 +271,6 @@ tweak the kernel's execution path, you need to suppress optimization,
271using one of the following techniques: 271using one of the following techniques:
272- Specify an empty function for the kprobe's post_handler or break_handler. 272- Specify an empty function for the kprobe's post_handler or break_handler.
273 or 273 or
274- Config CONFIG_OPTPROBES=n.
275 or
276- Execute 'sysctl -w debug.kprobes_optimization=n' 274- Execute 'sysctl -w debug.kprobes_optimization=n'
277 275
2782. Architectures Supported 2762. Architectures Supported
@@ -307,10 +305,6 @@ it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO),
307so you can use "objdump -d -l vmlinux" to see the source-to-object 305so you can use "objdump -d -l vmlinux" to see the source-to-object
308code mapping. 306code mapping.
309 307
310If you want to reduce probing overhead, set "Kprobes jump optimization
311support" (CONFIG_OPTPROBES) to "y". You can find this option under the
312"Kprobes" line.
313
3144. API Reference 3084. API Reference
315 309
316The Kprobes API includes a "register" function and an "unregister" 310The Kprobes API includes a "register" function and an "unregister"
diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt
index 446f43b309df..61bc4e943116 100644
--- a/Documentation/pcmcia/driver-changes.txt
+++ b/Documentation/pcmcia/driver-changes.txt
@@ -1,4 +1,17 @@
1This file details changes in 2.6 which affect PCMCIA card driver authors: 1This file details changes in 2.6 which affect PCMCIA card driver authors:
2* No dev_node_t (as of 2.6.35)
3 There is no more need to fill out a "dev_node_t" structure.
4
5* New IRQ request rules (as of 2.6.35)
6 Instead of the old pcmcia_request_irq() interface, drivers may now
7 choose between:
8 - calling request_irq/free_irq directly. Use the IRQ from *p_dev->irq.
9 - use pcmcia_request_irq(p_dev, handler_t); the PCMCIA core will
10 clean up automatically on calls to pcmcia_disable_device() or
11 device ejection.
12 - drivers still not capable of IRQF_SHARED (or not telling us so) may
13 use the deprecated pcmcia_request_exclusive_irq() for the time
14 being; they might receive a shared IRQ nonetheless.
2 15
3* no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33) 16* no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33)
4 Instead of the cs_error() callback or the CS_CHECK() macro, please use 17 Instead of the cs_error() callback or the CS_CHECK() macro, please use
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index c9abbd86bc18..57080cd74575 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -1,7 +1,13 @@
1Device Power Management
2
3Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
4Copyright (c) 2010 Alan Stern <stern@rowland.harvard.edu>
5
6
1Most of the code in Linux is device drivers, so most of the Linux power 7Most of the code in Linux is device drivers, so most of the Linux power
2management code is also driver-specific. Most drivers will do very little; 8management (PM) code is also driver-specific. Most drivers will do very
3others, especially for platforms with small batteries (like cell phones), 9little; others, especially for platforms with small batteries (like cell
4will do a lot. 10phones), will do a lot.
5 11
6This writeup gives an overview of how drivers interact with system-wide 12This writeup gives an overview of how drivers interact with system-wide
7power management goals, emphasizing the models and interfaces that are 13power management goals, emphasizing the models and interfaces that are
@@ -15,9 +21,10 @@ Drivers will use one or both of these models to put devices into low-power
15states: 21states:
16 22
17 System Sleep model: 23 System Sleep model:
18 Drivers can enter low power states as part of entering system-wide 24 Drivers can enter low-power states as part of entering system-wide
19 low-power states like "suspend-to-ram", or (mostly for systems with 25 low-power states like "suspend" (also known as "suspend-to-RAM"), or
20 disks) "hibernate" (suspend-to-disk). 26 (mostly for systems with disks) "hibernation" (also known as
27 "suspend-to-disk").
21 28
22 This is something that device, bus, and class drivers collaborate on 29 This is something that device, bus, and class drivers collaborate on
23 by implementing various role-specific suspend and resume methods to 30 by implementing various role-specific suspend and resume methods to
@@ -25,33 +32,41 @@ states:
25 them without loss of data. 32 them without loss of data.
26 33
27 Some drivers can manage hardware wakeup events, which make the system 34 Some drivers can manage hardware wakeup events, which make the system
28 leave that low-power state. This feature may be disabled using the 35 leave the low-power state. This feature may be enabled or disabled
29 relevant /sys/devices/.../power/wakeup file; enabling it may cost some 36 using the relevant /sys/devices/.../power/wakeup file (for Ethernet
30 power usage, but let the whole system enter low power states more often. 37 drivers the ioctl interface used by ethtool may also be used for this
38 purpose); enabling it may cost some power usage, but let the whole
39 system enter low-power states more often.
31 40
32 Runtime Power Management model: 41 Runtime Power Management model:
33 Drivers may also enter low power states while the system is running, 42 Devices may also be put into low-power states while the system is
34 independently of other power management activity. Upstream drivers 43 running, independently of other power management activity in principle.
35 will normally not know (or care) if the device is in some low power 44 However, devices are not generally independent of each other (for
36 state when issuing requests; the driver will auto-resume anything 45 example, a parent device cannot be suspended unless all of its child
37 that's needed when it gets a request. 46 devices have been suspended). Moreover, depending on the bus type the
38 47 device is on, it may be necessary to carry out some bus-specific
39 This doesn't have, or need much infrastructure; it's just something you 48 operations on the device for this purpose. Devices put into low power
40 should do when writing your drivers. For example, clk_disable() unused 49 states at run time may require special handling during system-wide power
41 clocks as part of minimizing power drain for currently-unused hardware. 50 transitions (suspend or hibernation).
42 Of course, sometimes clusters of drivers will collaborate with each 51
43 other, which could involve task-specific power management. 52 For these reasons not only the device driver itself, but also the
44 53 appropriate subsystem (bus type, device type or device class) driver and
45There's not a lot to be said about those low power states except that they 54 the PM core are involved in runtime power management. As in the system
46are very system-specific, and often device-specific. Also, that if enough 55 sleep power management case, they need to collaborate by implementing
47drivers put themselves into low power states (at "runtime"), the effect may be 56 various role-specific suspend and resume methods, so that the hardware
48the same as entering some system-wide low-power state (system sleep) ... and 57 is cleanly powered down and reactivated without data or service loss.
49that synergies exist, so that several drivers using runtime pm might put the 58
50system into a state where even deeper power saving options are available. 59There's not a lot to be said about those low-power states except that they are
51 60very system-specific, and often device-specific. Also, that if enough devices
52Most suspended devices will have quiesced all I/O: no more DMA or irqs, no 61have been put into low-power states (at runtime), the effect may be very similar
53more data read or written, and requests from upstream drivers are no longer 62to entering some system-wide low-power state (system sleep) ... and that
54accepted. A given bus or platform may have different requirements though. 63synergies exist, so that several drivers using runtime PM might put the system
64into a state where even deeper power saving options are available.
65
66Most suspended devices will have quiesced all I/O: no more DMA or IRQs (except
67for wakeup events), no more data read or written, and requests from upstream
68drivers are no longer accepted. A given bus or platform may have different
69requirements though.
55 70
56Examples of hardware wakeup events include an alarm from a real time clock, 71Examples of hardware wakeup events include an alarm from a real time clock,
57network wake-on-LAN packets, keyboard or mouse activity, and media insertion 72network wake-on-LAN packets, keyboard or mouse activity, and media insertion
@@ -60,129 +75,152 @@ or removal (for PCMCIA, MMC/SD, USB, and so on).
60 75
61Interfaces for Entering System Sleep States 76Interfaces for Entering System Sleep States
62=========================================== 77===========================================
63Most of the programming interfaces a device driver needs to know about 78There are programming interfaces provided for subsystems (bus type, device type,
64relate to that first model: entering a system-wide low power state, 79device class) and device drivers to allow them to participate in the power
65rather than just minimizing power consumption by one device. 80management of devices they are concerned with. These interfaces cover both
66 81system sleep and runtime power management.
67 82
68Bus Driver Methods 83
69------------------ 84Device Power Management Operations
70The core methods to suspend and resume devices reside in struct bus_type. 85----------------------------------
71These are mostly of interest to people writing infrastructure for busses 86Device power management operations, at the subsystem level as well as at the
72like PCI or USB, or because they define the primitives that device drivers 87device driver level, are implemented by defining and populating objects of type
73may need to apply in domain-specific ways to their devices: 88struct dev_pm_ops:
74 89
75struct bus_type { 90struct dev_pm_ops {
76 ... 91 int (*prepare)(struct device *dev);
77 int (*suspend)(struct device *dev, pm_message_t state); 92 void (*complete)(struct device *dev);
78 int (*resume)(struct device *dev); 93 int (*suspend)(struct device *dev);
94 int (*resume)(struct device *dev);
95 int (*freeze)(struct device *dev);
96 int (*thaw)(struct device *dev);
97 int (*poweroff)(struct device *dev);
98 int (*restore)(struct device *dev);
99 int (*suspend_noirq)(struct device *dev);
100 int (*resume_noirq)(struct device *dev);
101 int (*freeze_noirq)(struct device *dev);
102 int (*thaw_noirq)(struct device *dev);
103 int (*poweroff_noirq)(struct device *dev);
104 int (*restore_noirq)(struct device *dev);
105 int (*runtime_suspend)(struct device *dev);
106 int (*runtime_resume)(struct device *dev);
107 int (*runtime_idle)(struct device *dev);
79}; 108};
80 109
81Bus drivers implement those methods as appropriate for the hardware and 110This structure is defined in include/linux/pm.h and the methods included in it
82the drivers using it; PCI works differently from USB, and so on. Not many 111are also described in that file. Their roles will be explained in what follows.
83people write bus drivers; most driver code is a "device driver" that 112For now, it should be sufficient to remember that the last three methods are
84builds on top of bus-specific framework code. 113specific to runtime power management while the remaining ones are used during
114system-wide power transitions.
85 115
86For more information on these driver calls, see the description later; 116There also is a deprecated "old" or "legacy" interface for power management
87they are called in phases for every device, respecting the parent-child 117operations available at least for some subsystems. This approach does not use
88sequencing in the driver model tree. Note that as this is being written, 118struct dev_pm_ops objects and it is suitable only for implementing system sleep
89only the suspend() and resume() are widely available; not many bus drivers 119power management methods. Therefore it is not described in this document, so
90leverage all of those phases, or pass them down to lower driver levels. 120please refer directly to the source code for more information about it.
91 121
92 122
93/sys/devices/.../power/wakeup files 123Subsystem-Level Methods
94----------------------------------- 124-----------------------
95All devices in the driver model have two flags to control handling of 125The core methods to suspend and resume devices reside in struct dev_pm_ops
96wakeup events, which are hardware signals that can force the device and/or 126pointed to by the pm member of struct bus_type, struct device_type and
97system out of a low power state. These are initialized by bus or device 127struct class. They are mostly of interest to the people writing infrastructure
98driver code using device_init_wakeup(dev,can_wakeup). 128for buses, like PCI or USB, or device type and device class drivers.
99 129
100The "can_wakeup" flag just records whether the device (and its driver) can 130Bus drivers implement these methods as appropriate for the hardware and the
101physically support wakeup events. When that flag is clear, the sysfs 131drivers using it; PCI works differently from USB, and so on. Not many people
102"wakeup" file is empty, and device_may_wakeup() returns false. 132write subsystem-level drivers; most driver code is a "device driver" that builds
133on top of bus-specific framework code.
103 134
104For devices that can issue wakeup events, a separate flag controls whether 135For more information on these driver calls, see the description later;
105that device should try to use its wakeup mechanism. The initial value of 136they are called in phases for every device, respecting the parent-child
106device_may_wakeup() will be true, so that the device's "wakeup" file holds 137sequencing in the driver model tree.
107the value "enabled". Userspace can change that to "disabled" so that
108device_may_wakeup() returns false; or change it back to "enabled" (so that
109it returns true again).
110 138
111 139
112EXAMPLE: PCI Device Driver Methods 140/sys/devices/.../power/wakeup files
113----------------------------------- 141-----------------------------------
114PCI framework software calls these methods when the PCI device driver bound 142All devices in the driver model have two flags to control handling of wakeup
115to a device device has provided them: 143events (hardware signals that can force the device and/or system out of a low
116 144power state). These flags are initialized by bus or device driver code using
117struct pci_driver { 145device_set_wakeup_capable() and device_set_wakeup_enable(), defined in
118 ... 146include/linux/pm_wakeup.h.
119 int (*suspend)(struct pci_device *pdev, pm_message_t state);
120 int (*suspend_late)(struct pci_device *pdev, pm_message_t state);
121 147
122 int (*resume_early)(struct pci_device *pdev); 148The "can_wakeup" flag just records whether the device (and its driver) can
123 int (*resume)(struct pci_device *pdev); 149physically support wakeup events. The device_set_wakeup_capable() routine
124}; 150affects this flag. The "should_wakeup" flag controls whether the device should
125 151try to use its wakeup mechanism. device_set_wakeup_enable() affects this flag;
126Drivers will implement those methods, and call PCI-specific procedures 152for the most part drivers should not change its value. The initial value of
127like pci_set_power_state(), pci_enable_wake(), pci_save_state(), and 153should_wakeup is supposed to be false for the majority of devices; the major
128pci_restore_state() to manage PCI-specific mechanisms. (PCI config space 154exceptions are power buttons, keyboards, and Ethernet adapters whose WoL
129could be saved during driver probe, if it weren't for the fact that some 155(wake-on-LAN) feature has been set up with ethtool.
130systems rely on userspace tweaking using setpci.) Devices are suspended 156
131before their bridges enter low power states, and likewise bridges resume 157Whether or not a device is capable of issuing wakeup events is a hardware
132before their devices. 158matter, and the kernel is responsible for keeping track of it. By contrast,
133 159whether or not a wakeup-capable device should issue wakeup events is a policy
134 160decision, and it is managed by user space through a sysfs attribute: the
135Upper Layers of Driver Stacks 161power/wakeup file. User space can write the strings "enabled" or "disabled" to
136----------------------------- 162set or clear the should_wakeup flag, respectively. Reads from the file will
137Device drivers generally have at least two interfaces, and the methods 163return the corresponding string if can_wakeup is true, but if can_wakeup is
138sketched above are the ones which apply to the lower level (nearer PCI, USB, 164false then reads will return an empty string, to indicate that the device
139or other bus hardware). The network and block layers are examples of upper 165doesn't support wakeup events. (But even though the file appears empty, writes
140level interfaces, as is a character device talking to userspace. 166will still affect the should_wakeup flag.)
141 167
142Power management requests normally need to flow through those upper levels, 168The device_may_wakeup() routine returns true only if both flags are set.
143which often use domain-oriented requests like "blank that screen". In 169Drivers should check this routine when putting devices in a low-power state
144some cases those upper levels will have power management intelligence that 170during a system sleep transition, to see whether or not to enable the devices'
145relates to end-user activity, or other devices that work in cooperation. 171wakeup mechanisms. However for runtime power management, wakeup events should
146 172be enabled whenever the device and driver both support them, regardless of the
147When those interfaces are structured using class interfaces, there is a 173should_wakeup flag.
148standard way to have the upper layer stop issuing requests to a given 174
149class device (and restart later): 175
150 176/sys/devices/.../power/control files
151struct class { 177------------------------------------
152 ... 178Each device in the driver model has a flag to control whether it is subject to
153 int (*suspend)(struct device *dev, pm_message_t state); 179runtime power management. This flag, called runtime_auto, is initialized by the
154 int (*resume)(struct device *dev); 180bus type (or generally subsystem) code using pm_runtime_allow() or
155}; 181pm_runtime_forbid(); the default is to allow runtime power management.
156 182
157Those calls are issued in specific phases of the process by which the 183The setting can be adjusted by user space by writing either "on" or "auto" to
158system enters a low power "suspend" state, or resumes from it. 184the device's power/control sysfs file. Writing "auto" calls pm_runtime_allow(),
159 185setting the flag and allowing the device to be runtime power-managed by its
160 186driver. Writing "on" calls pm_runtime_forbid(), clearing the flag, returning
161Calling Drivers to Enter System Sleep States 187the device to full power if it was in a low-power state, and preventing the
162============================================ 188device from being runtime power-managed. User space can check the current value
163When the system enters a low power state, each device's driver is asked 189of the runtime_auto flag by reading the file.
164to suspend the device by putting it into state compatible with the target 190
191The device's runtime_auto flag has no effect on the handling of system-wide
192power transitions. In particular, the device can (and in the majority of cases
193should and will) be put into a low-power state during a system-wide transition
194to a sleep state even though its runtime_auto flag is clear.
195
196For more information about the runtime power management framework, refer to
197Documentation/power/runtime_pm.txt.
198
199
200Calling Drivers to Enter and Leave System Sleep States
201======================================================
202When the system goes into a sleep state, each device's driver is asked to
203suspend the device by putting it into a state compatible with the target
165system state. That's usually some version of "off", but the details are 204system state. That's usually some version of "off", but the details are
166system-specific. Also, wakeup-enabled devices will usually stay partly 205system-specific. Also, wakeup-enabled devices will usually stay partly
167functional in order to wake the system. 206functional in order to wake the system.
168 207
169When the system leaves that low power state, the device's driver is asked 208When the system leaves that low-power state, the device's driver is asked to
170to resume it. The suspend and resume operations always go together, and 209resume it by returning it to full power. The suspend and resume operations
171both are multi-phase operations. 210always go together, and both are multi-phase operations.
172 211
173For simple drivers, suspend might quiesce the device using the class code 212For simple drivers, suspend might quiesce the device using class code
174and then turn its hardware as "off" as possible with late_suspend. The 213and then turn its hardware as "off" as possible during suspend_noirq. The
175matching resume calls would then completely reinitialize the hardware 214matching resume calls would then completely reinitialize the hardware
176before reactivating its class I/O queues. 215before reactivating its class I/O queues.
177 216
178More power-aware drivers drivers will use more than one device low power 217More power-aware drivers might prepare the devices for triggering system wakeup
179state, either at runtime or during system sleep states, and might trigger 218events.
180system wakeup events.
181 219
182 220
183Call Sequence Guarantees 221Call Sequence Guarantees
184------------------------ 222------------------------
185To ensure that bridges and similar links needed to talk to a device are 223To ensure that bridges and similar links needing to talk to a device are
186available when the device is suspended or resumed, the device tree is 224available when the device is suspended or resumed, the device tree is
187walked in a bottom-up order to suspend devices. A top-down order is 225walked in a bottom-up order to suspend devices. A top-down order is
188used to resume those devices. 226used to resume those devices.
@@ -194,67 +232,310 @@ its parent; and can't be removed or suspended after that parent.
194The policy is that the device tree should match hardware bus topology. 232The policy is that the device tree should match hardware bus topology.
195(Or at least the control bus, for devices which use multiple busses.) 233(Or at least the control bus, for devices which use multiple busses.)
196In particular, this means that a device registration may fail if the parent of 234In particular, this means that a device registration may fail if the parent of
197the device is suspending (ie. has been chosen by the PM core as the next 235the device is suspending (i.e. has been chosen by the PM core as the next
198device to suspend) or has already suspended, as well as after all of the other 236device to suspend) or has already suspended, as well as after all of the other
199devices have been suspended. Device drivers must be prepared to cope with such 237devices have been suspended. Device drivers must be prepared to cope with such
200situations. 238situations.
201 239
202 240
203Suspending Devices 241System Power Management Phases
204------------------ 242------------------------------
205Suspending a given device is done in several phases. Suspending the 243Suspending or resuming the system is done in several phases. Different phases
206system always includes every phase, executing calls for every device 244are used for standby or memory sleep states ("suspend-to-RAM") and the
207before the next phase begins. Not all busses or classes support all 245hibernation state ("suspend-to-disk"). Each phase involves executing callbacks
208these callbacks; and not all drivers use all the callbacks. 246for every device before the next phase begins. Not all busses or classes
247support all these callbacks and not all drivers use all the callbacks. The
248various phases always run after tasks have been frozen and before they are
249unfrozen. Furthermore, the *_noirq phases run at a time when IRQ handlers have
250been disabled (except for those marked with the IRQ_WAKEUP flag).
209 251
210The phases are seen by driver notifications issued in this order: 252Most phases use bus, type, and class callbacks (that is, methods defined in
253dev->bus->pm, dev->type->pm, and dev->class->pm). The prepare and complete
254phases are exceptions; they use only bus callbacks. When multiple callbacks
255are used in a phase, they are invoked in the order: <class, type, bus> during
256power-down transitions and in the opposite order during power-up transitions.
257For example, during the suspend phase the PM core invokes
211 258
212 1 class.suspend(dev, message) is called after tasks are frozen, for 259 dev->class->pm.suspend(dev);
213 devices associated with a class that has such a method. This 260 dev->type->pm.suspend(dev);
214 method may sleep. 261 dev->bus->pm.suspend(dev);
215 262
216 Since I/O activity usually comes from such higher layers, this is 263before moving on to the next device, whereas during the resume phase the core
217 a good place to quiesce all drivers of a given type (and keep such 264invokes
218 code out of those drivers).
219 265
220 2 bus.suspend(dev, message) is called next. This method may sleep, 266 dev->bus->pm.resume(dev);
221 and is often morphed into a device driver call with bus-specific 267 dev->type->pm.resume(dev);
222 parameters and/or rules. 268 dev->class->pm.resume(dev);
223 269
224 This call should handle parts of device suspend logic that require 270These callbacks may in turn invoke device- or driver-specific methods stored in
225 sleeping. It probably does work to quiesce the device which hasn't 271dev->driver->pm, but they don't have to.
226 been abstracted into class.suspend().
227 272
228The pm_message_t parameter is currently used to refine those semantics
229(described later).
230 273
231At the end of those phases, drivers should normally have stopped all I/O 274Entering System Suspend
232transactions (DMA, IRQs), saved enough state that they can re-initialize 275-----------------------
233or restore previous state (as needed by the hardware), and placed the 276When the system goes into the standby or memory sleep state, the phases are:
234device into a low-power state. On many platforms they will also use 277
235clk_disable() to gate off one or more clock sources; sometimes they will 278 prepare, suspend, suspend_noirq.
236also switch off power supplies, or reduce voltages. Drivers which have 279
237runtime PM support may already have performed some or all of the steps 280 1. The prepare phase is meant to prevent races by preventing new devices
238needed to prepare for the upcoming system sleep state. 281 from being registered; the PM core would never know that all the
282 children of a device had been suspended if new children could be
283 registered at will. (By contrast, devices may be unregistered at any
284 time.) Unlike the other suspend-related phases, during the prepare
285 phase the device tree is traversed top-down.
286
287 The prepare phase uses only a bus callback. After the callback method
288 returns, no new children may be registered below the device. The method
289 may also prepare the device or driver in some way for the upcoming
290 system power transition, but it should not put the device into a
291 low-power state.
292
293 2. The suspend methods should quiesce the device to stop it from performing
294 I/O. They also may save the device registers and put it into the
295 appropriate low-power state, depending on the bus type the device is on,
296 and they may enable wakeup events.
297
298 3. The suspend_noirq phase occurs after IRQ handlers have been disabled,
299 which means that the driver's interrupt handler will not be called while
300 the callback method is running. The methods should save the values of
301 the device's registers that weren't saved previously and finally put the
302 device into the appropriate low-power state.
303
304 The majority of subsystems and device drivers need not implement this
305 callback. However, bus types allowing devices to share interrupt
306 vectors, like PCI, generally need it; otherwise a driver might encounter
307 an error during the suspend phase by fielding a shared interrupt
308 generated by some other device after its own device had been set to low
309 power.
310
311At the end of these phases, drivers should have stopped all I/O transactions
312(DMA, IRQs), saved enough state that they can re-initialize or restore previous
313state (as needed by the hardware), and placed the device into a low-power state.
314On many platforms they will gate off one or more clock sources; sometimes they
315will also switch off power supplies or reduce voltages. (Drivers supporting
316runtime PM may already have performed some or all of these steps.)
317
318If device_may_wakeup(dev) returns true, the device should be prepared for
319generating hardware wakeup signals to trigger a system wakeup event when the
320system is in the sleep state. For example, enable_irq_wake() might identify
321GPIO signals hooked up to a switch or other external hardware, and
322pci_enable_wake() does something similar for the PCI PME signal.
323
324If any of these callbacks returns an error, the system won't enter the desired
325low-power state. Instead the PM core will unwind its actions by resuming all
326the devices that were suspended.
327
328
329Leaving System Suspend
330----------------------
331When resuming from standby or memory sleep, the phases are:
332
333 resume_noirq, resume, complete.
334
335 1. The resume_noirq callback methods should perform any actions needed
336 before the driver's interrupt handlers are invoked. This generally
337 means undoing the actions of the suspend_noirq phase. If the bus type
338 permits devices to share interrupt vectors, like PCI, the method should
339 bring the device and its driver into a state in which the driver can
340 recognize if the device is the source of incoming interrupts, if any,
341 and handle them correctly.
342
343 For example, the PCI bus type's ->pm.resume_noirq() puts the device into
344 the full-power state (D0 in the PCI terminology) and restores the
345 standard configuration registers of the device. Then it calls the
346 device driver's ->pm.resume_noirq() method to perform device-specific
347 actions.
348
349 2. The resume methods should bring the the device back to its operating
350 state, so that it can perform normal I/O. This generally involves
351 undoing the actions of the suspend phase.
352
353 3. The complete phase uses only a bus callback. The method should undo the
354 actions of the prepare phase. Note, however, that new children may be
355 registered below the device as soon as the resume callbacks occur; it's
356 not necessary to wait until the complete phase.
357
358At the end of these phases, drivers should be as functional as they were before
359suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are
360gated on. Even if the device was in a low-power state before the system sleep
361because of runtime power management, afterwards it should be back in its
362full-power state. There are multiple reasons why it's best to do this; they are
363discussed in more detail in Documentation/power/runtime_pm.txt.
239 364
240When any driver sees that its device_can_wakeup(dev), it should make sure 365However, the details here may again be platform-specific. For example,
241to use the relevant hardware signals to trigger a system wakeup event. 366some systems support multiple "run" states, and the mode in effect at
242For example, enable_irq_wake() might identify GPIO signals hooked up to 367the end of resume might not be the one which preceded suspension.
243a switch or other external hardware, and pci_enable_wake() does something 368That means availability of certain clocks or power supplies changed,
244similar for PCI's PME# signal. 369which could easily affect how a driver works.
370
371Drivers need to be able to handle hardware which has been reset since the
372suspend methods were called, for example by complete reinitialization.
373This may be the hardest part, and the one most protected by NDA'd documents
374and chip errata. It's simplest if the hardware state hasn't changed since
375the suspend was carried out, but that can't be guaranteed (in fact, it ususally
376is not the case).
377
378Drivers must also be prepared to notice that the device has been removed
379while the system was powered down, whenever that's physically possible.
380PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses
381where common Linux platforms will see such removal. Details of how drivers
382will notice and handle such removals are currently bus-specific, and often
383involve a separate thread.
384
385These callbacks may return an error value, but the PM core will ignore such
386errors since there's nothing it can do about them other than printing them in
387the system log.
388
389
390Entering Hibernation
391--------------------
392Hibernating the system is more complicated than putting it into the standby or
393memory sleep state, because it involves creating and saving a system image.
394Therefore there are more phases for hibernation, with a different set of
395callbacks. These phases always run after tasks have been frozen and memory has
396been freed.
397
398The general procedure for hibernation is to quiesce all devices (freeze), create
399an image of the system memory while everything is stable, reactivate all
400devices (thaw), write the image to permanent storage, and finally shut down the
401system (poweroff). The phases used to accomplish this are:
402
403 prepare, freeze, freeze_noirq, thaw_noirq, thaw, complete,
404 prepare, poweroff, poweroff_noirq
405
406 1. The prepare phase is discussed in the "Entering System Suspend" section
407 above.
408
409 2. The freeze methods should quiesce the device so that it doesn't generate
410 IRQs or DMA, and they may need to save the values of device registers.
411 However the device does not have to be put in a low-power state, and to
412 save time it's best not to do so. Also, the device should not be
413 prepared to generate wakeup events.
414
415 3. The freeze_noirq phase is analogous to the suspend_noirq phase discussed
416 above, except again that the device should not be put in a low-power
417 state and should not be allowed to generate wakeup events.
418
419At this point the system image is created. All devices should be inactive and
420the contents of memory should remain undisturbed while this happens, so that the
421image forms an atomic snapshot of the system state.
422
423 4. The thaw_noirq phase is analogous to the resume_noirq phase discussed
424 above. The main difference is that its methods can assume the device is
425 in the same state as at the end of the freeze_noirq phase.
426
427 5. The thaw phase is analogous to the resume phase discussed above. Its
428 methods should bring the device back to an operating state, so that it
429 can be used for saving the image if necessary.
430
431 6. The complete phase is discussed in the "Leaving System Suspend" section
432 above.
433
434At this point the system image is saved, and the devices then need to be
435prepared for the upcoming system shutdown. This is much like suspending them
436before putting the system into the standby or memory sleep state, and the phases
437are similar.
438
439 7. The prepare phase is discussed above.
440
441 8. The poweroff phase is analogous to the suspend phase.
442
443 9. The poweroff_noirq phase is analogous to the suspend_noirq phase.
444
445The poweroff and poweroff_noirq callbacks should do essentially the same things
446as the suspend and suspend_noirq callbacks. The only notable difference is that
447they need not store the device register values, because the registers should
448already have been stored during the freeze or freeze_noirq phases.
449
450
451Leaving Hibernation
452-------------------
453Resuming from hibernation is, again, more complicated than resuming from a sleep
454state in which the contents of main memory are preserved, because it requires
455a system image to be loaded into memory and the pre-hibernation memory contents
456to be restored before control can be passed back to the image kernel.
457
458Although in principle, the image might be loaded into memory and the
459pre-hibernation memory contents restored by the boot loader, in practice this
460can't be done because boot loaders aren't smart enough and there is no
461established protocol for passing the necessary information. So instead, the
462boot loader loads a fresh instance of the kernel, called the boot kernel, into
463memory and passes control to it in the usual way. Then the boot kernel reads
464the system image, restores the pre-hibernation memory contents, and passes
465control to the image kernel. Thus two different kernels are involved in
466resuming from hibernation. In fact, the boot kernel may be completely different
467from the image kernel: a different configuration and even a different version.
468This has important consequences for device drivers and their subsystems.
469
470To be able to load the system image into memory, the boot kernel needs to
471include at least a subset of device drivers allowing it to access the storage
472medium containing the image, although it doesn't need to include all of the
473drivers present in the image kernel. After the image has been loaded, the
474devices managed by the boot kernel need to be prepared for passing control back
475to the image kernel. This is very similar to the initial steps involved in
476creating a system image, and it is accomplished in the same way, using prepare,
477freeze, and freeze_noirq phases. However the devices affected by these phases
478are only those having drivers in the boot kernel; other devices will still be in
479whatever state the boot loader left them.
480
481Should the restoration of the pre-hibernation memory contents fail, the boot
482kernel would go through the "thawing" procedure described above, using the
483thaw_noirq, thaw, and complete phases, and then continue running normally. This
484happens only rarely. Most often the pre-hibernation memory contents are
485restored successfully and control is passed to the image kernel, which then
486becomes responsible for bringing the system back to the working state.
487
488To achieve this, the image kernel must restore the devices' pre-hibernation
489functionality. The operation is much like waking up from the memory sleep
490state, although it involves different phases:
491
492 restore_noirq, restore, complete
493
494 1. The restore_noirq phase is analogous to the resume_noirq phase.
495
496 2. The restore phase is analogous to the resume phase.
497
498 3. The complete phase is discussed above.
499
500The main difference from resume[_noirq] is that restore[_noirq] must assume the
501device has been accessed and reconfigured by the boot loader or the boot kernel.
502Consequently the state of the device may be different from the state remembered
503from the freeze and freeze_noirq phases. The device may even need to be reset
504and completely re-initialized. In many cases this difference doesn't matter, so
505the resume[_noirq] and restore[_norq] method pointers can be set to the same
506routines. Nevertheless, different callback pointers are used in case there is a
507situation where it actually matters.
245 508
246If a driver (or bus, or class) fails it suspend method, the system won't
247enter the desired low power state; it will resume all the devices it's
248suspended so far.
249 509
250Note that drivers may need to perform different actions based on the target 510System Devices
251system lowpower/sleep state. At this writing, there are only platform 511--------------
252specific APIs through which drivers could determine those target states. 512System devices (sysdevs) follow a slightly different API, which can be found in
513
514 include/linux/sysdev.h
515 drivers/base/sys.c
516
517System devices will be suspended with interrupts disabled, and after all other
518devices have been suspended. On resume, they will be resumed before any other
519devices, and also with interrupts disabled. These things occur in special
520"sysdev_driver" phases, which affect only system devices.
521
522Thus, after the suspend_noirq (or freeze_noirq or poweroff_noirq) phase, when
523the non-boot CPUs are all offline and IRQs are disabled on the remaining online
524CPU, then a sysdev_driver.suspend phase is carried out, and the system enters a
525sleep state (or a system image is created). During resume (or after the image
526has been created or loaded) a sysdev_driver.resume phase is carried out, IRQs
527are enabled on the only online CPU, the non-boot CPUs are enabled, and the
528resume_noirq (or thaw_noirq or restore_noirq) phase begins.
529
530Code to actually enter and exit the system-wide low power state sometimes
531involves hardware details that are only known to the boot firmware, and
532may leave a CPU running software (from SRAM or flash memory) that monitors
533the system and manages its wakeup sequence.
253 534
254 535
255Device Low Power (suspend) States 536Device Low Power (suspend) States
256--------------------------------- 537---------------------------------
257Device low-power states aren't very standard. One device might only handle 538Device low-power states aren't standard. One device might only handle
258"on" and "off, while another might support a dozen different versions of 539"on" and "off, while another might support a dozen different versions of
259"on" (how many engines are active?), plus a state that gets back to "on" 540"on" (how many engines are active?), plus a state that gets back to "on"
260faster than from a full "off". 541faster than from a full "off".
@@ -265,7 +546,7 @@ PCI device may not perform DMA or issue IRQs, and any wakeup events it
265issues would be issued through the PME# bus signal. Plus, there are 546issues would be issued through the PME# bus signal. Plus, there are
266several PCI-standard device states, some of which are optional. 547several PCI-standard device states, some of which are optional.
267 548
268In contrast, integrated system-on-chip processors often use irqs as the 549In contrast, integrated system-on-chip processors often use IRQs as the
269wakeup event sources (so drivers would call enable_irq_wake) and might 550wakeup event sources (so drivers would call enable_irq_wake) and might
270be able to treat DMA completion as a wakeup event (sometimes DMA can stay 551be able to treat DMA completion as a wakeup event (sometimes DMA can stay
271active too, it'd only be the CPU and some peripherals that sleep). 552active too, it'd only be the CPU and some peripherals that sleep).
@@ -284,120 +565,17 @@ ways; the aforementioned LCD might be active in one product's "standby",
284but a different product using the same SOC might work differently. 565but a different product using the same SOC might work differently.
285 566
286 567
287Meaning of pm_message_t.event 568Power Management Notifiers
288----------------------------- 569--------------------------
289Parameters to suspend calls include the device affected and a message of 570There are some operations that cannot be carried out by the power management
290type pm_message_t, which has one field: the event. If driver does not 571callbacks discussed above, because the callbacks occur too late or too early.
291recognize the event code, suspend calls may abort the request and return 572To handle these cases, subsystems and device drivers may register power
292a negative errno. However, most drivers will be fine if they implement 573management notifiers that are called before tasks are frozen and after they have
293PM_EVENT_SUSPEND semantics for all messages. 574been thawed. Generally speaking, the PM notifiers are suitable for performing
575actions that either require user space to be available, or at least won't
576interfere with user space.
294 577
295The event codes are used to refine the goal of suspending the device, and 578For details refer to Documentation/power/notifiers.txt.
296mostly matter when creating or resuming system memory image snapshots, as
297used with suspend-to-disk:
298
299 PM_EVENT_SUSPEND -- quiesce the driver and put hardware into a low-power
300 state. When used with system sleep states like "suspend-to-RAM" or
301 "standby", the upcoming resume() call will often be able to rely on
302 state kept in hardware, or issue system wakeup events.
303
304 PM_EVENT_HIBERNATE -- Put hardware into a low-power state and enable wakeup
305 events as appropriate. It is only used with hibernation
306 (suspend-to-disk) and few devices are able to wake up the system from
307 this state; most are completely powered off.
308
309 PM_EVENT_FREEZE -- quiesce the driver, but don't necessarily change into
310 any low power mode. A system snapshot is about to be taken, often
311 followed by a call to the driver's resume() method. Neither wakeup
312 events nor DMA are allowed.
313
314 PM_EVENT_PRETHAW -- quiesce the driver, knowing that the upcoming resume()
315 will restore a suspend-to-disk snapshot from a different kernel image.
316 Drivers that are smart enough to look at their hardware state during
317 resume() processing need that state to be correct ... a PRETHAW could
318 be used to invalidate that state (by resetting the device), like a
319 shutdown() invocation would before a kexec() or system halt. Other
320 drivers might handle this the same way as PM_EVENT_FREEZE. Neither
321 wakeup events nor DMA are allowed.
322
323To enter "standby" (ACPI S1) or "Suspend to RAM" (STR, ACPI S3) states, or
324the similarly named APM states, only PM_EVENT_SUSPEND is used; the other event
325codes are used for hibernation ("Suspend to Disk", STD, ACPI S4).
326
327There's also PM_EVENT_ON, a value which never appears as a suspend event
328but is sometimes used to record the "not suspended" device state.
329
330
331Resuming Devices
332----------------
333Resuming is done in multiple phases, much like suspending, with all
334devices processing each phase's calls before the next phase begins.
335
336The phases are seen by driver notifications issued in this order:
337
338 1 bus.resume(dev) reverses the effects of bus.suspend(). This may
339 be morphed into a device driver call with bus-specific parameters;
340 implementations may sleep.
341
342 2 class.resume(dev) is called for devices associated with a class
343 that has such a method. Implementations may sleep.
344
345 This reverses the effects of class.suspend(), and would usually
346 reactivate the device's I/O queue.
347
348At the end of those phases, drivers should normally be as functional as
349they were before suspending: I/O can be performed using DMA and IRQs, and
350the relevant clocks are gated on. The device need not be "fully on"; it
351might be in a runtime lowpower/suspend state that acts as if it were.
352
353However, the details here may again be platform-specific. For example,
354some systems support multiple "run" states, and the mode in effect at
355the end of resume() might not be the one which preceded suspension.
356That means availability of certain clocks or power supplies changed,
357which could easily affect how a driver works.
358
359
360Drivers need to be able to handle hardware which has been reset since the
361suspend methods were called, for example by complete reinitialization.
362This may be the hardest part, and the one most protected by NDA'd documents
363and chip errata. It's simplest if the hardware state hasn't changed since
364the suspend() was called, but that can't always be guaranteed.
365
366Drivers must also be prepared to notice that the device has been removed
367while the system was powered off, whenever that's physically possible.
368PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses
369where common Linux platforms will see such removal. Details of how drivers
370will notice and handle such removals are currently bus-specific, and often
371involve a separate thread.
372
373
374Note that the bus-specific runtime PM wakeup mechanism can exist, and might
375be defined to share some of the same driver code as for system wakeup. For
376example, a bus-specific device driver's resume() method might be used there,
377so it wouldn't only be called from bus.resume() during system-wide wakeup.
378See bus-specific information about how runtime wakeup events are handled.
379
380
381System Devices
382--------------
383System devices follow a slightly different API, which can be found in
384
385 include/linux/sysdev.h
386 drivers/base/sys.c
387
388System devices will only be suspended with interrupts disabled, and after
389all other devices have been suspended. On resume, they will be resumed
390before any other devices, and also with interrupts disabled.
391
392That is, IRQs are disabled, the suspend_late() phase begins, then the
393sysdev_driver.suspend() phase, and the system enters a sleep state. Then
394the sysdev_driver.resume() phase begins, followed by the resume_early()
395phase, after which IRQs are enabled.
396
397Code to actually enter and exit the system-wide low power state sometimes
398involves hardware details that are only known to the boot firmware, and
399may leave a CPU running software (from SRAM or flash memory) that monitors
400the system and manages its wakeup sequence.
401 579
402 580
403Runtime Power Management 581Runtime Power Management
@@ -407,82 +585,23 @@ running. This feature is useful for devices that are not being used, and
407can offer significant power savings on a running system. These devices 585can offer significant power savings on a running system. These devices
408often support a range of runtime power states, which might use names such 586often support a range of runtime power states, which might use names such
409as "off", "sleep", "idle", "active", and so on. Those states will in some 587as "off", "sleep", "idle", "active", and so on. Those states will in some
410cases (like PCI) be partially constrained by a bus the device uses, and will 588cases (like PCI) be partially constrained by the bus the device uses, and will
411usually include hardware states that are also used in system sleep states. 589usually include hardware states that are also used in system sleep states.
412 590
413However, note that if a driver puts a device into a runtime low power state 591A system-wide power transition can be started while some devices are in low
414and the system then goes into a system-wide sleep state, it normally ought 592power states due to runtime power management. The system sleep PM callbacks
415to resume into that runtime low power state rather than "full on". Such 593should recognize such situations and react to them appropriately, but the
416distinctions would be part of the driver-internal state machine for that 594necessary actions are subsystem-specific.
417hardware; the whole point of runtime power management is to be sure that 595
418drivers are decoupled in that way from the state machine governing phases 596In some cases the decision may be made at the subsystem level while in other
419of the system-wide power/sleep state transitions. 597cases the device driver may be left to decide. In some cases it may be
420 598desirable to leave a suspended device in that state during a system-wide power
421 599transition, but in other cases the device must be put back into the full-power
422Power Saving Techniques 600state temporarily, for example so that its system wakeup capability can be
423----------------------- 601disabled. This all depends on the hardware and the design of the subsystem and
424Normally runtime power management is handled by the drivers without specific 602device driver in question.
425userspace or kernel intervention, by device-aware use of techniques like: 603
426 604During system-wide resume from a sleep state it's best to put devices into the
427 Using information provided by other system layers 605full-power state, as explained in Documentation/power/runtime_pm.txt. Refer to
428 - stay deeply "off" except between open() and close() 606that document for more information regarding this particular issue as well as
429 - if transceiver/PHY indicates "nobody connected", stay "off" 607for information on the device runtime power management framework in general.
430 - application protocols may include power commands or hints
431
432 Using fewer CPU cycles
433 - using DMA instead of PIO
434 - removing timers, or making them lower frequency
435 - shortening "hot" code paths
436 - eliminating cache misses
437 - (sometimes) offloading work to device firmware
438
439 Reducing other resource costs
440 - gating off unused clocks in software (or hardware)
441 - switching off unused power supplies
442 - eliminating (or delaying/merging) IRQs
443 - tuning DMA to use word and/or burst modes
444
445 Using device-specific low power states
446 - using lower voltages
447 - avoiding needless DMA transfers
448
449Read your hardware documentation carefully to see the opportunities that
450may be available. If you can, measure the actual power usage and check
451it against the budget established for your project.
452
453
454Examples: USB hosts, system timer, system CPU
455----------------------------------------------
456USB host controllers make interesting, if complex, examples. In many cases
457these have no work to do: no USB devices are connected, or all of them are
458in the USB "suspend" state. Linux host controller drivers can then disable
459periodic DMA transfers that would otherwise be a constant power drain on the
460memory subsystem, and enter a suspend state. In power-aware controllers,
461entering that suspend state may disable the clock used with USB signaling,
462saving a certain amount of power.
463
464The controller will be woken from that state (with an IRQ) by changes to the
465signal state on the data lines of a given port, for example by an existing
466peripheral requesting "remote wakeup" or by plugging a new peripheral. The
467same wakeup mechanism usually works from "standby" sleep states, and on some
468systems also from "suspend to RAM" (or even "suspend to disk") states.
469(Except that ACPI may be involved instead of normal IRQs, on some hardware.)
470
471System devices like timers and CPUs may have special roles in the platform
472power management scheme. For example, system timers using a "dynamic tick"
473approach don't just save CPU cycles (by eliminating needless timer IRQs),
474but they may also open the door to using lower power CPU "idle" states that
475cost more than a jiffie to enter and exit. On x86 systems these are states
476like "C3"; note that periodic DMA transfers from a USB host controller will
477also prevent entry to a C3 state, much like a periodic timer IRQ.
478
479That kind of runtime mechanism interaction is common. "System On Chip" (SOC)
480processors often have low power idle modes that can't be entered unless
481certain medium-speed clocks (often 12 or 48 MHz) are gated off. When the
482drivers gate those clocks effectively, then the system idle task may be able
483to use the lower power idle modes and thereby increase battery life.
484
485If the CPU can have a "cpufreq" driver, there also may be opportunities
486to shift to lower voltage settings and reduce the power cost of executing
487a given number of instructions. (Without voltage adjustment, it's rare
488for cpufreq to save much power; the cost-per-instruction must go down.)
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
index c40866e8b957..bfed898a03fc 100644
--- a/Documentation/power/pm_qos_interface.txt
+++ b/Documentation/power/pm_qos_interface.txt
@@ -18,44 +18,46 @@ and pm_qos_params.h. This is done because having the available parameters
18being runtime configurable or changeable from a driver was seen as too easy to 18being runtime configurable or changeable from a driver was seen as too easy to
19abuse. 19abuse.
20 20
21For each parameter a list of performance requirements is maintained along with 21For each parameter a list of performance requests is maintained along with
22an aggregated target value. The aggregated target value is updated with 22an aggregated target value. The aggregated target value is updated with
23changes to the requirement list or elements of the list. Typically the 23changes to the request list or elements of the list. Typically the
24aggregated target value is simply the max or min of the requirement values held 24aggregated target value is simply the max or min of the request values held
25in the parameter list elements. 25in the parameter list elements.
26 26
27From kernel mode the use of this interface is simple: 27From kernel mode the use of this interface is simple:
28pm_qos_add_requirement(param_id, name, target_value):
29Will insert a named element in the list for that identified PM_QOS parameter
30with the target value. Upon change to this list the new target is recomputed
31and any registered notifiers are called only if the target value is now
32different.
33 28
34pm_qos_update_requirement(param_id, name, new_target_value): 29handle = pm_qos_add_request(param_class, target_value):
35Will search the list identified by the param_id for the named list element and 30Will insert an element into the list for that identified PM_QOS class with the
36then update its target value, calling the notification tree if the aggregated 31target value. Upon change to this list the new target is recomputed and any
37target is changed. with that name is already registered. 32registered notifiers are called only if the target value is now different.
33Clients of pm_qos need to save the returned handle.
38 34
39pm_qos_remove_requirement(param_id, name): 35void pm_qos_update_request(handle, new_target_value):
40Will search the identified list for the named element and remove it, after 36Will update the list element pointed to by the handle with the new target value
41removal it will update the aggregate target and call the notification tree if 37and recompute the new aggregated target, calling the notification tree if the
42the target was changed as a result of removing the named requirement. 38target is changed.
39
40void pm_qos_remove_request(handle):
41Will remove the element. After removal it will update the aggregate target and
42call the notification tree if the target was changed as a result of removing
43the request.
43 44
44 45
45From user mode: 46From user mode:
46Only processes can register a pm_qos requirement. To provide for automatic 47Only processes can register a pm_qos request. To provide for automatic
47cleanup for process the interface requires the process to register its 48cleanup of a process, the interface requires the process to register its
48parameter requirements in the following way: 49parameter requests in the following way:
49 50
50To register the default pm_qos target for the specific parameter, the process 51To register the default pm_qos target for the specific parameter, the process
51must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] 52must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
52 53
53As long as the device node is held open that process has a registered 54As long as the device node is held open that process has a registered
54requirement on the parameter. The name of the requirement is "process_<PID>" 55request on the parameter.
55derived from the current->pid from within the open system call.
56 56
57To change the requested target value the process needs to write a s32 value to 57To change the requested target value the process needs to write an s32 value to
58the open device node. This translates to a pm_qos_update_requirement call. 58the open device node. Alternatively the user mode program could write a hex
59string for the value using 10 char long format e.g. "0x12345678". This
60translates to a pm_qos_update_request call.
59 61
60To remove the user mode request for a target value simply close the device 62To remove the user mode request for a target value simply close the device
61node. 63node.
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
index b967cd9137d6..81680f9f5909 100644
--- a/Documentation/power/userland-swsusp.txt
+++ b/Documentation/power/userland-swsusp.txt
@@ -24,6 +24,10 @@ assumed to be in the resume mode. The device cannot be open for simultaneous
24reading and writing. It is also impossible to have the device open more than 24reading and writing. It is also impossible to have the device open more than
25once at a time. 25once at a time.
26 26
27Even opening the device has side effects. Data structures are
28allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
29called.
30
27The ioctl() commands recognized by the device are: 31The ioctl() commands recognized by the device are:
28 32
29SNAPSHOT_FREEZE - freeze user space processes (the current process is 33SNAPSHOT_FREEZE - freeze user space processes (the current process is
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index aae8355d3166..221f38be98f4 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -190,3 +190,61 @@ Example:
190 for (node = rb_first(&mytree); node; node = rb_next(node)) 190 for (node = rb_first(&mytree); node; node = rb_next(node))
191 printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); 191 printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
192 192
193Support for Augmented rbtrees
194-----------------------------
195
196Augmented rbtree is an rbtree with "some" additional data stored in each node.
197This data can be used to augment some new functionality to rbtree.
198Augmented rbtree is an optional feature built on top of basic rbtree
199infrastructure. rbtree user who wants this feature will have an augment
200callback function in rb_root initialized.
201
202This callback function will be called from rbtree core routines whenever
203a node has a change in one or both of its children. It is the responsibility
204of the callback function to recalculate the additional data that is in the
205rb node using new children information. Note that if this new additional
206data affects the parent node's additional data, then callback function has
207to handle it and do the recursive updates.
208
209
210Interval tree is an example of augmented rb tree. Reference -
211"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein.
212More details about interval trees:
213
214Classical rbtree has a single key and it cannot be directly used to store
215interval ranges like [lo:hi] and do a quick lookup for any overlap with a new
216lo:hi or to find whether there is an exact match for a new lo:hi.
217
218However, rbtree can be augmented to store such interval ranges in a structured
219way making it possible to do efficient lookup and exact match.
220
221This "extra information" stored in each node is the maximum hi
222(max_hi) value among all the nodes that are its descendents. This
223information can be maintained at each node just be looking at the node
224and its immediate children. And this will be used in O(log n) lookup
225for lowest match (lowest start address among all possible matches)
226with something like:
227
228find_lowest_match(lo, hi, node)
229{
230 lowest_match = NULL;
231 while (node) {
232 if (max_hi(node->left) > lo) {
233 // Lowest overlap if any must be on left side
234 node = node->left;
235 } else if (overlap(lo, hi, node)) {
236 lowest_match = node;
237 break;
238 } else if (lo > node->lo) {
239 // Lowest overlap if any must be on right side
240 node = node->right;
241 } else {
242 break;
243 }
244 }
245 return lowest_match;
246}
247
248Finding exact match will be to first find lowest match and then to follow
249successor nodes looking for exact match, until the start of a node is beyond
250the hi value we are looking for.
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 6f33593e59e2..8239ebbcddce 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -211,7 +211,7 @@ provide fair CPU time to each such task group. For example, it may be
211desirable to first provide fair CPU time to each user on the system and then to 211desirable to first provide fair CPU time to each user on the system and then to
212each task belonging to a user. 212each task belonging to a user.
213 213
214CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be 214CONFIG_CGROUP_SCHED strives to achieve exactly that. It lets tasks to be
215grouped and divides CPU time fairly among such groups. 215grouped and divides CPU time fairly among such groups.
216 216
217CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and 217CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
@@ -220,38 +220,11 @@ SCHED_RR) tasks.
220CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and 220CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
221SCHED_BATCH) tasks. 221SCHED_BATCH) tasks.
222 222
223At present, there are two (mutually exclusive) mechanisms to group tasks for 223 These options need CONFIG_CGROUPS to be defined, and let the administrator
224CPU bandwidth control purposes:
225
226 - Based on user id (CONFIG_USER_SCHED)
227
228 With this option, tasks are grouped according to their user id.
229
230 - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
231
232 This options needs CONFIG_CGROUPS to be defined, and lets the administrator
233 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See 224 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
234 Documentation/cgroups/cgroups.txt for more information about this filesystem. 225 Documentation/cgroups/cgroups.txt for more information about this filesystem.
235 226
236Only one of these options to group tasks can be chosen and not both. 227When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
237
238When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
239user and a "cpu_share" file is added in that directory.
240
241 # cd /sys/kernel/uids
242 # cat 512/cpu_share # Display user 512's CPU share
243 1024
244 # echo 2048 > 512/cpu_share # Modify user 512's CPU share
245 # cat 512/cpu_share # Display user 512's CPU share
246 2048
247 #
248
249CPU bandwidth between two users is divided in the ratio of their CPU shares.
250For example: if you would like user "root" to get twice the bandwidth of user
251"guest," then set the cpu_share for both the users such that "root"'s cpu_share
252is twice "guest"'s cpu_share.
253
254When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
255group created using the pseudo filesystem. See example steps below to create 228group created using the pseudo filesystem. See example steps below to create
256task groups and modify their CPU share using the "cgroups" pseudo filesystem. 229task groups and modify their CPU share using the "cgroups" pseudo filesystem.
257 230
@@ -273,24 +246,3 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem.
273 246
274 # #Launch gmplayer (or your favourite movie player) 247 # #Launch gmplayer (or your favourite movie player)
275 # echo <movie_player_pid> > multimedia/tasks 248 # echo <movie_player_pid> > multimedia/tasks
276
2778. Implementation note: user namespaces
278
279User namespaces are intended to be hierarchical. But they are currently
280only partially implemented. Each of those has ramifications for CFS.
281
282First, since user namespaces are hierarchical, the /sys/kernel/uids
283presentation is inadequate. Eventually we will likely want to use sysfs
284tagging to provide private views of /sys/kernel/uids within each user
285namespace.
286
287Second, the hierarchical nature is intended to support completely
288unprivileged use of user namespaces. So if using user groups, then
289we want the users in a user namespace to be children of the user
290who created it.
291
292That is currently unimplemented. So instead, every user in a new
293user namespace will receive 1024 shares just like any user in the
294initial user namespace. Note that at the moment creation of a new
295user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and
296CAP_SETGID.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 86eabe6c3419..605b0d40329d 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -126,23 +126,12 @@ priority!
1262.3 Basis for grouping tasks 1262.3 Basis for grouping tasks
127---------------------------- 127----------------------------
128 128
129There are two compile-time settings for allocating CPU bandwidth. These are 129Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real
130configured using the "Basis for grouping tasks" multiple choice menu under 130CPU bandwidth to task groups.
131General setup > Group CPU Scheduler:
132
133a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" = "user id")
134
135This lets you use the virtual files under
136"/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control he CPU time reserved for
137each user .
138
139The other option is:
140
141.o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups")
142 131
143This uses the /cgroup virtual file system and 132This uses the /cgroup virtual file system and
144"/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each 133"/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each
145control group instead. 134control group.
146 135
147For more information on working with control groups, you should read 136For more information on working with control groups, you should read
148Documentation/cgroups/cgroups.txt as well. 137Documentation/cgroups/cgroups.txt as well.
@@ -161,8 +150,7 @@ For now, this can be simplified to just the following (but see Future plans):
161=============== 150===============
162 151
163There is work in progress to make the scheduling period for each group 152There is work in progress to make the scheduling period for each group
164("/sys/kernel/uids/<uid>/cpu_rt_period_us" or 153("/cgroup/<cgroup>/cpu.rt_period_us") configurable as well.
165"/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well.
166 154
167The constraint on the period is that a subgroup must have a smaller or 155The constraint on the period is that a subgroup must have a smaller or
168equal period to its parent. But realistically its not very useful _yet_ 156equal period to its parent. But realistically its not very useful _yet_
diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c
index 10abd3773e49..16feda901469 100644
--- a/Documentation/spi/spidev_test.c
+++ b/Documentation/spi/spidev_test.c
@@ -58,7 +58,7 @@ static void transfer(int fd)
58 }; 58 };
59 59
60 ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr); 60 ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr);
61 if (ret == 1) 61 if (ret < 1)
62 pabort("can't send spi message"); 62 pabort("can't send spi message");
63 63
64 for (ret = 0; ret < ARRAY_SIZE(tx); ret++) { 64 for (ret = 0; ret < ARRAY_SIZE(tx); ret++) {
diff --git a/Documentation/stable_kernel_rules.txt b/Documentation/stable_kernel_rules.txt
index 5effa5bd993b..e213f45cf9d7 100644
--- a/Documentation/stable_kernel_rules.txt
+++ b/Documentation/stable_kernel_rules.txt
@@ -18,16 +18,15 @@ Rules on what kind of patches are accepted, and which ones are not, into the
18 - It cannot contain any "trivial" fixes in it (spelling changes, 18 - It cannot contain any "trivial" fixes in it (spelling changes,
19 whitespace cleanups, etc). 19 whitespace cleanups, etc).
20 - It must follow the Documentation/SubmittingPatches rules. 20 - It must follow the Documentation/SubmittingPatches rules.
21 - It or an equivalent fix must already exist in Linus' tree. Quote the 21 - It or an equivalent fix must already exist in Linus' tree (upstream).
22 respective commit ID in Linus' tree in your patch submission to -stable.
23 22
24 23
25Procedure for submitting patches to the -stable tree: 24Procedure for submitting patches to the -stable tree:
26 25
27 - Send the patch, after verifying that it follows the above rules, to 26 - Send the patch, after verifying that it follows the above rules, to
28 stable@kernel.org. 27 stable@kernel.org. You must note the upstream commit ID in the changelog
29 - To have the patch automatically included in the stable tree, add the 28 of your submission.
30 the tag 29 - To have the patch automatically included in the stable tree, add the tag
31 Cc: stable@kernel.org 30 Cc: stable@kernel.org
32 in the sign-off area. Once the patch is merged it will be applied to 31 in the sign-off area. Once the patch is merged it will be applied to
33 the stable tree without anything else needing to be done by the author 32 the stable tree without anything else needing to be done by the author
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index b22000dbc57d..09bd8e902989 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -90,7 +90,8 @@ In order to facilitate early boot debugging, use boot option:
90 90
91 trace_event=[event-list] 91 trace_event=[event-list]
92 92
93The format of this boot option is the same as described in section 2.1. 93event-list is a comma separated list of events. See section 2.1 for event
94format.
94 95
953. Defining an event-enabled tracepoint 963. Defining an event-enabled tracepoint
96======================================= 97=======================================
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 03485bfbd797..557c1edeccaf 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -155,6 +155,9 @@ of ftrace. Here is a list of some of the key files:
155 to be traced. Echoing names of functions into this file 155 to be traced. Echoing names of functions into this file
156 will limit the trace to only those functions. 156 will limit the trace to only those functions.
157 157
158 This interface also allows for commands to be used. See the
159 "Filter commands" section for more details.
160
158 set_ftrace_notrace: 161 set_ftrace_notrace:
159 162
160 This has an effect opposite to that of 163 This has an effect opposite to that of
@@ -1337,12 +1340,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
1337can either use the sysctl function or set it via the proc system 1340can either use the sysctl function or set it via the proc system
1338interface. 1341interface.
1339 1342
1340 sysctl kernel.ftrace_dump_on_oops=1 1343 sysctl kernel.ftrace_dump_on_oops=n
1341 1344
1342or 1345or
1343 1346
1344 echo 1 > /proc/sys/kernel/ftrace_dump_on_oops 1347 echo n > /proc/sys/kernel/ftrace_dump_on_oops
1345 1348
1349If n = 1, ftrace will dump buffers of all CPUs, if n = 2 ftrace will
1350only dump the buffer of the CPU that triggered the oops.
1346 1351
1347Here's an example of such a dump after a null pointer 1352Here's an example of such a dump after a null pointer
1348dereference in a kernel module: 1353dereference in a kernel module:
@@ -1822,6 +1827,47 @@ this special filter via:
1822 echo > set_graph_function 1827 echo > set_graph_function
1823 1828
1824 1829
1830Filter commands
1831---------------
1832
1833A few commands are supported by the set_ftrace_filter interface.
1834Trace commands have the following format:
1835
1836<function>:<command>:<parameter>
1837
1838The following commands are supported:
1839
1840- mod
1841 This command enables function filtering per module. The
1842 parameter defines the module. For example, if only the write*
1843 functions in the ext3 module are desired, run:
1844
1845 echo 'write*:mod:ext3' > set_ftrace_filter
1846
1847 This command interacts with the filter in the same way as
1848 filtering based on function names. Thus, adding more functions
1849 in a different module is accomplished by appending (>>) to the
1850 filter file. Remove specific module functions by prepending
1851 '!':
1852
1853 echo '!writeback*:mod:ext3' >> set_ftrace_filter
1854
1855- traceon/traceoff
1856 These commands turn tracing on and off when the specified
1857 functions are hit. The parameter determines how many times the
1858 tracing system is turned on and off. If unspecified, there is
1859 no limit. For example, to disable tracing when a schedule bug
1860 is hit the first 5 times, run:
1861
1862 echo '__schedule_bug:traceoff:5' > set_ftrace_filter
1863
1864 These commands are cumulative whether or not they are appended
1865 to set_ftrace_filter. To remove a command, prepend it by '!'
1866 and drop the parameter:
1867
1868 echo '!__schedule_bug:traceoff' > set_ftrace_filter
1869
1870
1825trace_pipe 1871trace_pipe
1826---------- 1872----------
1827 1873
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index a9100b28eb84..ec94748ae65b 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -40,7 +40,9 @@ Synopsis of kprobe_events
40 $stack : Fetch stack address. 40 $stack : Fetch stack address.
41 $retval : Fetch return value.(*) 41 $retval : Fetch return value.(*)
42 +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) 42 +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
43 NAME=FETCHARG: Set NAME as the argument name of FETCHARG. 43 NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
44 FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
45 (u8/u16/u32/u64/s8/s16/s32/s64) are supported.
44 46
45 (*) only for return probe. 47 (*) only for return probe.
46 (**) this is useful for fetching a field of data structures. 48 (**) this is useful for fetching a field of data structures.