diff options
Diffstat (limited to 'Documentation')
30 files changed, 917 insertions, 608 deletions
diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl index 261b57bc6f08..265c08c96fcd 100644 --- a/Documentation/DocBook/libata.tmpl +++ b/Documentation/DocBook/libata.tmpl | |||
@@ -107,10 +107,6 @@ void (*dev_config) (struct ata_port *, struct ata_device *); | |||
107 | issue of SET FEATURES - XFER MODE, and prior to operation. | 107 | issue of SET FEATURES - XFER MODE, and prior to operation. |
108 | </para> | 108 | </para> |
109 | <para> | 109 | <para> |
110 | Called by ata_device_add() after ata_dev_identify() determines | ||
111 | a device is present. | ||
112 | </para> | ||
113 | <para> | ||
114 | This entry may be specified as NULL in ata_port_operations. | 110 | This entry may be specified as NULL in ata_port_operations. |
115 | </para> | 111 | </para> |
116 | 112 | ||
@@ -154,8 +150,8 @@ unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned in | |||
154 | 150 | ||
155 | <sect2><title>Taskfile read/write</title> | 151 | <sect2><title>Taskfile read/write</title> |
156 | <programlisting> | 152 | <programlisting> |
157 | void (*tf_load) (struct ata_port *ap, struct ata_taskfile *tf); | 153 | void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf); |
158 | void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf); | 154 | void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf); |
159 | </programlisting> | 155 | </programlisting> |
160 | 156 | ||
161 | <para> | 157 | <para> |
@@ -164,36 +160,35 @@ void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf); | |||
164 | hardware registers / DMA buffers, to obtain the current set of | 160 | hardware registers / DMA buffers, to obtain the current set of |
165 | taskfile register values. | 161 | taskfile register values. |
166 | Most drivers for taskfile-based hardware (PIO or MMIO) use | 162 | Most drivers for taskfile-based hardware (PIO or MMIO) use |
167 | ata_tf_load() and ata_tf_read() for these hooks. | 163 | ata_sff_tf_load() and ata_sff_tf_read() for these hooks. |
168 | </para> | 164 | </para> |
169 | 165 | ||
170 | </sect2> | 166 | </sect2> |
171 | 167 | ||
172 | <sect2><title>PIO data read/write</title> | 168 | <sect2><title>PIO data read/write</title> |
173 | <programlisting> | 169 | <programlisting> |
174 | void (*data_xfer) (struct ata_device *, unsigned char *, unsigned int, int); | 170 | void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int); |
175 | </programlisting> | 171 | </programlisting> |
176 | 172 | ||
177 | <para> | 173 | <para> |
178 | All bmdma-style drivers must implement this hook. This is the low-level | 174 | All bmdma-style drivers must implement this hook. This is the low-level |
179 | operation that actually copies the data bytes during a PIO data | 175 | operation that actually copies the data bytes during a PIO data |
180 | transfer. | 176 | transfer. |
181 | Typically the driver | 177 | Typically the driver will choose one of ata_sff_data_xfer_noirq(), |
182 | will choose one of ata_pio_data_xfer_noirq(), ata_pio_data_xfer(), or | 178 | ata_sff_data_xfer(), or ata_sff_data_xfer32(). |
183 | ata_mmio_data_xfer(). | ||
184 | </para> | 179 | </para> |
185 | 180 | ||
186 | </sect2> | 181 | </sect2> |
187 | 182 | ||
188 | <sect2><title>ATA command execute</title> | 183 | <sect2><title>ATA command execute</title> |
189 | <programlisting> | 184 | <programlisting> |
190 | void (*exec_command)(struct ata_port *ap, struct ata_taskfile *tf); | 185 | void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf); |
191 | </programlisting> | 186 | </programlisting> |
192 | 187 | ||
193 | <para> | 188 | <para> |
194 | causes an ATA command, previously loaded with | 189 | causes an ATA command, previously loaded with |
195 | ->tf_load(), to be initiated in hardware. | 190 | ->tf_load(), to be initiated in hardware. |
196 | Most drivers for taskfile-based hardware use ata_exec_command() | 191 | Most drivers for taskfile-based hardware use ata_sff_exec_command() |
197 | for this hook. | 192 | for this hook. |
198 | </para> | 193 | </para> |
199 | 194 | ||
@@ -218,8 +213,8 @@ command. | |||
218 | 213 | ||
219 | <sect2><title>Read specific ATA shadow registers</title> | 214 | <sect2><title>Read specific ATA shadow registers</title> |
220 | <programlisting> | 215 | <programlisting> |
221 | u8 (*check_status)(struct ata_port *ap); | 216 | u8 (*sff_check_status)(struct ata_port *ap); |
222 | u8 (*check_altstatus)(struct ata_port *ap); | 217 | u8 (*sff_check_altstatus)(struct ata_port *ap); |
223 | </programlisting> | 218 | </programlisting> |
224 | 219 | ||
225 | <para> | 220 | <para> |
@@ -227,20 +222,14 @@ u8 (*check_altstatus)(struct ata_port *ap); | |||
227 | hardware. On some hardware, reading the Status register has | 222 | hardware. On some hardware, reading the Status register has |
228 | the side effect of clearing the interrupt condition. | 223 | the side effect of clearing the interrupt condition. |
229 | Most drivers for taskfile-based hardware use | 224 | Most drivers for taskfile-based hardware use |
230 | ata_check_status() for this hook. | 225 | ata_sff_check_status() for this hook. |
231 | </para> | ||
232 | <para> | ||
233 | Note that because this is called from ata_device_add(), at | ||
234 | least a dummy function that clears device interrupts must be | ||
235 | provided for all drivers, even if the controller doesn't | ||
236 | actually have a taskfile status register. | ||
237 | </para> | 226 | </para> |
238 | 227 | ||
239 | </sect2> | 228 | </sect2> |
240 | 229 | ||
241 | <sect2><title>Select ATA device on bus</title> | 230 | <sect2><title>Select ATA device on bus</title> |
242 | <programlisting> | 231 | <programlisting> |
243 | void (*dev_select)(struct ata_port *ap, unsigned int device); | 232 | void (*sff_dev_select)(struct ata_port *ap, unsigned int device); |
244 | </programlisting> | 233 | </programlisting> |
245 | 234 | ||
246 | <para> | 235 | <para> |
@@ -251,9 +240,7 @@ void (*dev_select)(struct ata_port *ap, unsigned int device); | |||
251 | </para> | 240 | </para> |
252 | <para> | 241 | <para> |
253 | Most drivers for taskfile-based hardware use | 242 | Most drivers for taskfile-based hardware use |
254 | ata_std_dev_select() for this hook. Controllers which do not | 243 | ata_sff_dev_select() for this hook. |
255 | support second drives on a port (such as SATA controllers) will | ||
256 | use ata_noop_dev_select(). | ||
257 | </para> | 244 | </para> |
258 | 245 | ||
259 | </sect2> | 246 | </sect2> |
@@ -441,13 +428,13 @@ void (*irq_clear) (struct ata_port *); | |||
441 | to struct ata_host_set. | 428 | to struct ata_host_set. |
442 | </para> | 429 | </para> |
443 | <para> | 430 | <para> |
444 | Most legacy IDE drivers use ata_interrupt() for the | 431 | Most legacy IDE drivers use ata_sff_interrupt() for the |
445 | irq_handler hook, which scans all ports in the host_set, | 432 | irq_handler hook, which scans all ports in the host_set, |
446 | determines which queued command was active (if any), and calls | 433 | determines which queued command was active (if any), and calls |
447 | ata_host_intr(ap,qc). | 434 | ata_sff_host_intr(ap,qc). |
448 | </para> | 435 | </para> |
449 | <para> | 436 | <para> |
450 | Most legacy IDE drivers use ata_bmdma_irq_clear() for the | 437 | Most legacy IDE drivers use ata_sff_irq_clear() for the |
451 | irq_clear() hook, which simply clears the interrupt and error | 438 | irq_clear() hook, which simply clears the interrupt and error |
452 | flags in the DMA status register. | 439 | flags in the DMA status register. |
453 | </para> | 440 | </para> |
@@ -496,10 +483,6 @@ void (*host_stop) (struct ata_host_set *host_set); | |||
496 | data from port at this time. | 483 | data from port at this time. |
497 | </para> | 484 | </para> |
498 | <para> | 485 | <para> |
499 | Many drivers use ata_port_stop() as this hook, which frees the | ||
500 | PRD table. | ||
501 | </para> | ||
502 | <para> | ||
503 | ->host_stop() is called after all ->port_stop() calls | 486 | ->host_stop() is called after all ->port_stop() calls |
504 | have completed. The hook must finalize hardware shutdown, release DMA | 487 | have completed. The hook must finalize hardware shutdown, release DMA |
505 | and other resources, etc. | 488 | and other resources, etc. |
diff --git a/Documentation/DocBook/sh.tmpl b/Documentation/DocBook/sh.tmpl index 0c3dc4c69dd1..d858d92cf6d9 100644 --- a/Documentation/DocBook/sh.tmpl +++ b/Documentation/DocBook/sh.tmpl | |||
@@ -19,13 +19,17 @@ | |||
19 | </authorgroup> | 19 | </authorgroup> |
20 | 20 | ||
21 | <copyright> | 21 | <copyright> |
22 | <year>2008</year> | 22 | <year>2008-2010</year> |
23 | <holder>Paul Mundt</holder> | 23 | <holder>Paul Mundt</holder> |
24 | </copyright> | 24 | </copyright> |
25 | <copyright> | 25 | <copyright> |
26 | <year>2008</year> | 26 | <year>2008-2010</year> |
27 | <holder>Renesas Technology Corp.</holder> | 27 | <holder>Renesas Technology Corp.</holder> |
28 | </copyright> | 28 | </copyright> |
29 | <copyright> | ||
30 | <year>2010</year> | ||
31 | <holder>Renesas Electronics Corp.</holder> | ||
32 | </copyright> | ||
29 | 33 | ||
30 | <legalnotice> | 34 | <legalnotice> |
31 | <para> | 35 | <para> |
@@ -77,7 +81,7 @@ | |||
77 | </chapter> | 81 | </chapter> |
78 | <chapter id="clk"> | 82 | <chapter id="clk"> |
79 | <title>Clock Framework Extensions</title> | 83 | <title>Clock Framework Extensions</title> |
80 | !Iarch/sh/include/asm/clock.h | 84 | !Iinclude/linux/sh_clk.h |
81 | </chapter> | 85 | </chapter> |
82 | <chapter id="mach"> | 86 | <chapter id="mach"> |
83 | <title>Machine Specific Interfaces</title> | 87 | <title>Machine Specific Interfaces</title> |
diff --git a/Documentation/HOWTO b/Documentation/HOWTO index f5395af88a41..40ada93b820a 100644 --- a/Documentation/HOWTO +++ b/Documentation/HOWTO | |||
@@ -234,7 +234,7 @@ process is as follows: | |||
234 | Linus, usually the patches that have already been included in the | 234 | Linus, usually the patches that have already been included in the |
235 | -next kernel for a few weeks. The preferred way to submit big changes | 235 | -next kernel for a few weeks. The preferred way to submit big changes |
236 | is using git (the kernel's source management tool, more information | 236 | is using git (the kernel's source management tool, more information |
237 | can be found at http://git.or.cz/) but plain patches are also just | 237 | can be found at http://git-scm.com/) but plain patches are also just |
238 | fine. | 238 | fine. |
239 | - After two weeks, a -rc1 kernel is released and it is now possible to push | 239 | - After two weeks, a -rc1 kernel is released and it is now possible to push |
240 | only patches that do not include new features that could affect the | 240 | only patches that do not include new features that could affect the |
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 1423d2570d78..44c6dcc93d6d 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt | |||
@@ -3,35 +3,79 @@ Using RCU's CPU Stall Detector | |||
3 | The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables | 3 | The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables |
4 | RCU's CPU stall detector, which detects conditions that unduly delay | 4 | RCU's CPU stall detector, which detects conditions that unduly delay |
5 | RCU grace periods. The stall detector's idea of what constitutes | 5 | RCU grace periods. The stall detector's idea of what constitutes |
6 | "unduly delayed" is controlled by a pair of C preprocessor macros: | 6 | "unduly delayed" is controlled by a set of C preprocessor macros: |
7 | 7 | ||
8 | RCU_SECONDS_TILL_STALL_CHECK | 8 | RCU_SECONDS_TILL_STALL_CHECK |
9 | 9 | ||
10 | This macro defines the period of time that RCU will wait from | 10 | This macro defines the period of time that RCU will wait from |
11 | the beginning of a grace period until it issues an RCU CPU | 11 | the beginning of a grace period until it issues an RCU CPU |
12 | stall warning. It is normally ten seconds. | 12 | stall warning. This time period is normally ten seconds. |
13 | 13 | ||
14 | RCU_SECONDS_TILL_STALL_RECHECK | 14 | RCU_SECONDS_TILL_STALL_RECHECK |
15 | 15 | ||
16 | This macro defines the period of time that RCU will wait after | 16 | This macro defines the period of time that RCU will wait after |
17 | issuing a stall warning until it issues another stall warning. | 17 | issuing a stall warning until it issues another stall warning |
18 | It is normally set to thirty seconds. | 18 | for the same stall. This time period is normally set to thirty |
19 | seconds. | ||
19 | 20 | ||
20 | RCU_STALL_RAT_DELAY | 21 | RCU_STALL_RAT_DELAY |
21 | 22 | ||
22 | The CPU stall detector tries to make the offending CPU rat on itself, | 23 | The CPU stall detector tries to make the offending CPU print its |
23 | as this often gives better-quality stack traces. However, if | 24 | own warnings, as this often gives better-quality stack traces. |
24 | the offending CPU does not detect its own stall in the number | 25 | However, if the offending CPU does not detect its own stall in |
25 | of jiffies specified by RCU_STALL_RAT_DELAY, then other CPUs will | 26 | the number of jiffies specified by RCU_STALL_RAT_DELAY, then |
26 | complain. This is normally set to two jiffies. | 27 | some other CPU will complain. This delay is normally set to |
28 | two jiffies. | ||
27 | 29 | ||
28 | The following problems can result in an RCU CPU stall warning: | 30 | When a CPU detects that it is stalling, it will print a message similar |
31 | to the following: | ||
32 | |||
33 | INFO: rcu_sched_state detected stall on CPU 5 (t=2500 jiffies) | ||
34 | |||
35 | This message indicates that CPU 5 detected that it was causing a stall, | ||
36 | and that the stall was affecting RCU-sched. This message will normally be | ||
37 | followed by a stack dump of the offending CPU. On TREE_RCU kernel builds, | ||
38 | RCU and RCU-sched are implemented by the same underlying mechanism, | ||
39 | while on TREE_PREEMPT_RCU kernel builds, RCU is instead implemented | ||
40 | by rcu_preempt_state. | ||
41 | |||
42 | On the other hand, if the offending CPU fails to print out a stall-warning | ||
43 | message quickly enough, some other CPU will print a message similar to | ||
44 | the following: | ||
45 | |||
46 | INFO: rcu_bh_state detected stalls on CPUs/tasks: { 3 5 } (detected by 2, 2502 jiffies) | ||
47 | |||
48 | This message indicates that CPU 2 detected that CPUs 3 and 5 were both | ||
49 | causing stalls, and that the stall was affecting RCU-bh. This message | ||
50 | will normally be followed by stack dumps for each CPU. Please note that | ||
51 | TREE_PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, | ||
52 | and that the tasks will be indicated by PID, for example, "P3421". | ||
53 | It is even possible for a rcu_preempt_state stall to be caused by both | ||
54 | CPUs -and- tasks, in which case the offending CPUs and tasks will all | ||
55 | be called out in the list. | ||
56 | |||
57 | Finally, if the grace period ends just as the stall warning starts | ||
58 | printing, there will be a spurious stall-warning message: | ||
59 | |||
60 | INFO: rcu_bh_state detected stalls on CPUs/tasks: { } (detected by 4, 2502 jiffies) | ||
61 | |||
62 | This is rare, but does happen from time to time in real life. | ||
63 | |||
64 | So your kernel printed an RCU CPU stall warning. The next question is | ||
65 | "What caused it?" The following problems can result in RCU CPU stall | ||
66 | warnings: | ||
29 | 67 | ||
30 | o A CPU looping in an RCU read-side critical section. | 68 | o A CPU looping in an RCU read-side critical section. |
31 | 69 | ||
32 | o A CPU looping with interrupts disabled. | 70 | o A CPU looping with interrupts disabled. This condition can |
71 | result in RCU-sched and RCU-bh stalls. | ||
33 | 72 | ||
34 | o A CPU looping with preemption disabled. | 73 | o A CPU looping with preemption disabled. This condition can |
74 | result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh | ||
75 | stalls. | ||
76 | |||
77 | o A CPU looping with bottom halves disabled. This condition can | ||
78 | result in RCU-sched and RCU-bh stalls. | ||
35 | 79 | ||
36 | o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel | 80 | o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel |
37 | without invoking schedule(). | 81 | without invoking schedule(). |
@@ -39,20 +83,24 @@ o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel | |||
39 | o A bug in the RCU implementation. | 83 | o A bug in the RCU implementation. |
40 | 84 | ||
41 | o A hardware failure. This is quite unlikely, but has occurred | 85 | o A hardware failure. This is quite unlikely, but has occurred |
42 | at least once in a former life. A CPU failed in a running system, | 86 | at least once in real life. A CPU failed in a running system, |
43 | becoming unresponsive, but not causing an immediate crash. | 87 | becoming unresponsive, but not causing an immediate crash. |
44 | This resulted in a series of RCU CPU stall warnings, eventually | 88 | This resulted in a series of RCU CPU stall warnings, eventually |
45 | leading to the realization that the CPU had failed. | 89 | leading to the realization that the CPU had failed. |
46 | 90 | ||
47 | The RCU, RCU-sched, and RCU-bh implementations have CPU stall warning. | 91 | The RCU, RCU-sched, and RCU-bh implementations have CPU stall |
48 | SRCU does not do so directly, but its calls to synchronize_sched() will | 92 | warnings. SRCU does not have its own CPU stall warnings, but its |
49 | result in RCU-sched detecting any CPU stalls that might be occurring. | 93 | calls to synchronize_sched() will result in RCU-sched detecting |
50 | 94 | RCU-sched-related CPU stalls. Please note that RCU only detects | |
51 | To diagnose the cause of the stall, inspect the stack traces. The offending | 95 | CPU stalls when there is a grace period in progress. No grace period, |
52 | function will usually be near the top of the stack. If you have a series | 96 | no CPU stall warnings. |
53 | of stall warnings from a single extended stall, comparing the stack traces | 97 | |
54 | can often help determine where the stall is occurring, which will usually | 98 | To diagnose the cause of the stall, inspect the stack traces. |
55 | be in the function nearest the top of the stack that stays the same from | 99 | The offending function will usually be near the top of the stack. |
56 | trace to trace. | 100 | If you have a series of stall warnings from a single extended stall, |
101 | comparing the stack traces can often help determine where the stall | ||
102 | is occurring, which will usually be in the function nearest the top of | ||
103 | that portion of the stack which remains the same from trace to trace. | ||
104 | If you can reliably trigger the stall, ftrace can be quite helpful. | ||
57 | 105 | ||
58 | RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE. | 106 | RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE. |
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 0e50bc2aa1e2..5d9016795fd8 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt | |||
@@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following: | |||
182 | sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 | 182 | sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 |
183 | sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 | 183 | sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 |
184 | sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 | 184 | sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 |
185 | state: -1 / 0:0 3:0 4:0 | ||
186 | |||
187 | As before, the first four lines are similar to those for RCU. | ||
188 | The last line shows the task-migration state. The first number is | ||
189 | -1 if synchronize_sched_expedited() is idle, -2 if in the process of | ||
190 | posting wakeups to the migration kthreads, and N when waiting on CPU N. | ||
191 | Each of the colon-separated fields following the "/" is a CPU:state pair. | ||
192 | Valid states are "0" for idle, "1" for waiting for quiescent state, | ||
193 | "2" for passed through quiescent state, and "3" when a race with a | ||
194 | CPU-hotplug event forces use of the synchronize_sched() primitive. | ||
195 | 185 | ||
196 | 186 | ||
197 | USAGE | 187 | USAGE |
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 8608fd85e921..efd8cc95c06b 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt | |||
@@ -256,23 +256,23 @@ o Each element of the form "1/1 0:127 ^0" represents one struct | |||
256 | The output of "cat rcu/rcu_pending" looks as follows: | 256 | The output of "cat rcu/rcu_pending" looks as follows: |
257 | 257 | ||
258 | rcu_sched: | 258 | rcu_sched: |
259 | 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 | 259 | 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 |
260 | 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 | 260 | 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 |
261 | 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 | 261 | 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 |
262 | 3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 | 262 | 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 |
263 | 4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 | 263 | 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 |
264 | 5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 | 264 | 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 |
265 | 6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 | 265 | 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 |
266 | 7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 | 266 | 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 |
267 | rcu_bh: | 267 | rcu_bh: |
268 | 0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 | 268 | 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 |
269 | 1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 | 269 | 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 |
270 | 2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 | 270 | 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 |
271 | 3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 | 271 | 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 |
272 | 4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 | 272 | 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 |
273 | 5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 | 273 | 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 |
274 | 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 | 274 | 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 |
275 | 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 | 275 | 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 |
276 | 276 | ||
277 | As always, this is once again split into "rcu_sched" and "rcu_bh" | 277 | As always, this is once again split into "rcu_sched" and "rcu_bh" |
278 | portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional | 278 | portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional |
@@ -284,6 +284,9 @@ o "np" is the number of times that __rcu_pending() has been invoked | |||
284 | o "qsp" is the number of times that the RCU was waiting for a | 284 | o "qsp" is the number of times that the RCU was waiting for a |
285 | quiescent state from this CPU. | 285 | quiescent state from this CPU. |
286 | 286 | ||
287 | o "rpq" is the number of times that the CPU had passed through | ||
288 | a quiescent state, but not yet reported it to RCU. | ||
289 | |||
287 | o "cbr" is the number of times that this CPU had RCU callbacks | 290 | o "cbr" is the number of times that this CPU had RCU callbacks |
288 | that had passed through a grace period, and were thus ready | 291 | that had passed through a grace period, and were thus ready |
289 | to be invoked. | 292 | to be invoked. |
diff --git a/Documentation/arm/00-INDEX b/Documentation/arm/00-INDEX index 82e418d648d0..7f5fc3ba9c91 100644 --- a/Documentation/arm/00-INDEX +++ b/Documentation/arm/00-INDEX | |||
@@ -20,6 +20,8 @@ Samsung-S3C24XX | |||
20 | - S3C24XX ARM Linux Overview | 20 | - S3C24XX ARM Linux Overview |
21 | Sharp-LH | 21 | Sharp-LH |
22 | - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC) | 22 | - Linux on Sharp LH79524 and LH7A40X System On a Chip (SOC) |
23 | SPEAr | ||
24 | - ST SPEAr platform Linux Overview | ||
23 | VFP/ | 25 | VFP/ |
24 | - Release notes for Linux Kernel Vector Floating Point support code | 26 | - Release notes for Linux Kernel Vector Floating Point support code |
25 | empeg/ | 27 | empeg/ |
diff --git a/Documentation/arm/SPEAr/overview.txt b/Documentation/arm/SPEAr/overview.txt new file mode 100644 index 000000000000..253a35c6f782 --- /dev/null +++ b/Documentation/arm/SPEAr/overview.txt | |||
@@ -0,0 +1,60 @@ | |||
1 | SPEAr ARM Linux Overview | ||
2 | ========================== | ||
3 | |||
4 | Introduction | ||
5 | ------------ | ||
6 | |||
7 | SPEAr (Structured Processor Enhanced Architecture). | ||
8 | weblink : http://www.st.com/spear | ||
9 | |||
10 | The ST Microelectronics SPEAr range of ARM9/CortexA9 System-on-Chip CPUs are | ||
11 | supported by the 'spear' platform of ARM Linux. Currently SPEAr300, | ||
12 | SPEAr310, SPEAr320 and SPEAr600 SOCs are supported. Support for the SPEAr13XX | ||
13 | series is in progress. | ||
14 | |||
15 | Hierarchy in SPEAr is as follows: | ||
16 | |||
17 | SPEAr (Platform) | ||
18 | - SPEAr3XX (3XX SOC series, based on ARM9) | ||
19 | - SPEAr300 (SOC) | ||
20 | - SPEAr300_EVB (Evaluation Board) | ||
21 | - SPEAr310 (SOC) | ||
22 | - SPEAr310_EVB (Evaluation Board) | ||
23 | - SPEAr320 (SOC) | ||
24 | - SPEAr320_EVB (Evaluation Board) | ||
25 | - SPEAr6XX (6XX SOC series, based on ARM9) | ||
26 | - SPEAr600 (SOC) | ||
27 | - SPEAr600_EVB (Evaluation Board) | ||
28 | - SPEAr13XX (13XX SOC series, based on ARM CORTEXA9) | ||
29 | - SPEAr1300 (SOC) | ||
30 | |||
31 | Configuration | ||
32 | ------------- | ||
33 | |||
34 | A generic configuration is provided for each machine, and can be used as the | ||
35 | default by | ||
36 | make spear600_defconfig | ||
37 | make spear300_defconfig | ||
38 | make spear310_defconfig | ||
39 | make spear320_defconfig | ||
40 | |||
41 | Layout | ||
42 | ------ | ||
43 | |||
44 | The common files for multiple machine families (SPEAr3XX, SPEAr6XX and | ||
45 | SPEAr13XX) are located in the platform code contained in arch/arm/plat-spear | ||
46 | with headers in plat/. | ||
47 | |||
48 | Each machine series has a directory named arch/arm/mach-spear followed by the | ||
49 | series name, for example mach-spear3xx, mach-spear6xx and mach-spear13xx. | ||
50 | |||
51 | The common file for machines of the spear3xx family is mach-spear3xx/spear3xx.c and for | ||
52 | spear6xx it is mach-spear6xx/spear6xx.c. mach-spear* also contains soc/machine | ||
53 | specific files, like spear300.c, spear310.c, spear320.c and spear600.c. | ||
54 | mach-spear* also contains board specific files for each machine type. | ||
55 | |||
56 | |||
57 | Document Author | ||
58 | --------------- | ||
59 | |||
60 | Viresh Kumar, (c) 2010 ST Microelectronics | ||
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 5eb279a48fa4..57444c2609fc 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt | |||
@@ -235,8 +235,7 @@ containing the following files describing that cgroup: | |||
235 | - cgroup.procs: list of tgids in the cgroup. This list is not | 235 | - cgroup.procs: list of tgids in the cgroup. This list is not |
236 | guaranteed to be sorted or free of duplicate tgids, and userspace | 236 | guaranteed to be sorted or free of duplicate tgids, and userspace |
237 | should sort/uniquify the list if this property is required. | 237 | should sort/uniquify the list if this property is required. |
238 | Writing a tgid into this file moves all threads with that tgid into | 238 | This is a read-only file, for now. |
239 | this cgroup. | ||
240 | - notify_on_release flag: run the release agent on exit? | 239 | - notify_on_release flag: run the release agent on exit? |
241 | - release_agent: the path to use for release notifications (this file | 240 | - release_agent: the path to use for release notifications (this file |
242 | exists in the top cgroup only) | 241 | exists in the top cgroup only) |
diff --git a/Documentation/credentials.txt b/Documentation/credentials.txt index df03169782ea..a2db35287003 100644 --- a/Documentation/credentials.txt +++ b/Documentation/credentials.txt | |||
@@ -408,9 +408,6 @@ This should be used inside the RCU read lock, as in the following example: | |||
408 | ... | 408 | ... |
409 | } | 409 | } |
410 | 410 | ||
411 | A function need not get RCU read lock to use __task_cred() if it is holding a | ||
412 | spinlock at the time as this implicitly holds the RCU read lock. | ||
413 | |||
414 | Should it be necessary to hold another task's credentials for a long period of | 411 | Should it be necessary to hold another task's credentials for a long period of |
415 | time, and possibly to sleep whilst doing so, then the caller should get a | 412 | time, and possibly to sleep whilst doing so, then the caller should get a |
416 | reference on them using: | 413 | reference on them using: |
@@ -426,17 +423,16 @@ credentials, hiding the RCU magic from the caller: | |||
426 | uid_t task_uid(task) Task's real UID | 423 | uid_t task_uid(task) Task's real UID |
427 | uid_t task_euid(task) Task's effective UID | 424 | uid_t task_euid(task) Task's effective UID |
428 | 425 | ||
429 | If the caller is holding a spinlock or the RCU read lock at the time anyway, | 426 | If the caller is holding the RCU read lock at the time anyway, then: |
430 | then: | ||
431 | 427 | ||
432 | __task_cred(task)->uid | 428 | __task_cred(task)->uid |
433 | __task_cred(task)->euid | 429 | __task_cred(task)->euid |
434 | 430 | ||
435 | should be used instead. Similarly, if multiple aspects of a task's credentials | 431 | should be used instead. Similarly, if multiple aspects of a task's credentials |
436 | need to be accessed, RCU read lock or a spinlock should be used, __task_cred() | 432 | need to be accessed, RCU read lock should be used, __task_cred() called, the |
437 | called, the result stored in a temporary pointer and then the credential | 433 | result stored in a temporary pointer and then the credential aspects called |
438 | aspects called from that before dropping the lock. This prevents the | 434 | from that before dropping the lock. This prevents the potentially expensive |
439 | potentially expensive RCU magic from being invoked multiple times. | 435 | RCU magic from being invoked multiple times. |
440 | 436 | ||
441 | Should some other single aspect of another task's credentials need to be | 437 | Should some other single aspect of another task's credentials need to be |
442 | accessed, then this can be used: | 438 | accessed, then this can be used: |
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index ed511af0f79a..e7965f4a385a 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -520,29 +520,6 @@ Who: Hans de Goede <hdegoede@redhat.com> | |||
520 | 520 | ||
521 | ---------------------------- | 521 | ---------------------------- |
522 | 522 | ||
523 | What: corgikbd, spitzkbd, tosakbd driver | ||
524 | When: 2.6.35 | ||
525 | Files: drivers/input/keyboard/{corgi,spitz,tosa}kbd.c | ||
526 | Why: We now have a generic GPIO based matrix keyboard driver that | ||
527 | are fully capable of handling all the keys on these devices. | ||
528 | The original drivers manipulate the GPIO registers directly | ||
529 | and so are difficult to maintain. | ||
530 | Who: Eric Miao <eric.y.miao@gmail.com> | ||
531 | |||
532 | ---------------------------- | ||
533 | |||
534 | What: corgi_ssp and corgi_ts driver | ||
535 | When: 2.6.35 | ||
536 | Files: arch/arm/mach-pxa/corgi_ssp.c, drivers/input/touchscreen/corgi_ts.c | ||
537 | Why: The corgi touchscreen is now deprecated in favour of the generic | ||
538 | ads7846.c driver. The noise reduction technique used in corgi_ts.c, | ||
539 | that's to wait till vsync before ADC sampling, is also integrated into | ||
540 | ads7846 driver now. Provided that the original driver is not generic | ||
541 | and is difficult to maintain, it will be removed later. | ||
542 | Who: Eric Miao <eric.y.miao@gmail.com> | ||
543 | |||
544 | ---------------------------- | ||
545 | |||
546 | What: capifs | 523 | What: capifs |
547 | When: February 2011 | 524 | When: February 2011 |
548 | Files: drivers/isdn/capi/capifs.* | 525 | Files: drivers/isdn/capi/capifs.* |
@@ -564,6 +541,16 @@ Who: Avi Kivity <avi@redhat.com> | |||
564 | 541 | ||
565 | ---------------------------- | 542 | ---------------------------- |
566 | 543 | ||
544 | What: xtime, wall_to_monotonic | ||
545 | When: 2.6.36+ | ||
546 | Files: kernel/time/timekeeping.c include/linux/time.h | ||
547 | Why: Cleaning up timekeeping internal values. Please use | ||
548 | existing timekeeping accessor functions to access | ||
549 | the equivalent functionality. | ||
550 | Who: John Stultz <johnstul@us.ibm.com> | ||
551 | |||
552 | ---------------------------- | ||
553 | |||
567 | What: KVM kernel-allocated memory slots | 554 | What: KVM kernel-allocated memory slots |
568 | When: July 2010 | 555 | When: July 2010 |
569 | Why: Since 2.6.25, kvm supports user-allocated memory slots, which are | 556 | Why: Since 2.6.25, kvm supports user-allocated memory slots, which are |
@@ -589,3 +576,36 @@ Why: Useful in 2003, implementation is a hack. | |||
589 | Generally invoked by accident today. | 576 | Generally invoked by accident today. |
590 | Seen as doing more harm than good. | 577 | Seen as doing more harm than good. |
591 | Who: Len Brown <len.brown@intel.com> | 578 | Who: Len Brown <len.brown@intel.com> |
579 | |||
580 | ---------------------------- | ||
581 | |||
582 | What: video4linux /dev/vtx teletext API support | ||
583 | When: 2.6.35 | ||
584 | Files: drivers/media/video/saa5246a.c drivers/media/video/saa5249.c | ||
585 | include/linux/videotext.h | ||
586 | Why: The vtx device nodes have been superseded by vbi device nodes | ||
587 | for many years. No applications exist that use the vtx support. | ||
588 | Of the two i2c drivers that actually support this API the saa5249 | ||
589 | has been impossible to use for a year now and no known hardware | ||
590 | that supports this device exists. The saa5246a is theoretically | ||
591 | supported by the old mxb boards, but it never actually worked. | ||
592 | |||
593 | In summary: there is no hardware that can use this API and there | ||
594 | are no applications actually implementing this API. | ||
595 | |||
596 | The vtx support still reserves minors 192-223 and we would really | ||
597 | like to reuse those for upcoming new functionality. In the unlikely | ||
598 | event that new hardware appears that wants to use the functionality | ||
599 | provided by the vtx API, then that functionality should be built | ||
600 | around the sliced VBI API instead. | ||
601 | Who: Hans Verkuil <hverkuil@xs4all.nl> | ||
602 | |||
603 | ---------------------------- | ||
604 | |||
605 | What: IRQF_DISABLED | ||
606 | When: 2.6.36 | ||
607 | Why: The flag is a NOOP as we run interrupt handlers with interrupts disabled | ||
608 | Who: Thomas Gleixner <tglx@linutronix.de> | ||
609 | |||
610 | ---------------------------- | ||
611 | |||
diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt index 6a53a84afc72..04884914a1c8 100644 --- a/Documentation/filesystems/nfs/nfs41-server.txt +++ b/Documentation/filesystems/nfs/nfs41-server.txt | |||
@@ -137,7 +137,7 @@ NS*| OPENATTR | OPT | | Section 18.17 | | |||
137 | | READ | REQ | | Section 18.22 | | 137 | | READ | REQ | | Section 18.22 | |
138 | | READDIR | REQ | | Section 18.23 | | 138 | | READDIR | REQ | | Section 18.23 | |
139 | | READLINK | OPT | | Section 18.24 | | 139 | | READLINK | OPT | | Section 18.24 | |
140 | NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | | 140 | | RECLAIM_COMPLETE | REQ | | Section 18.51 | |
141 | | RELEASE_LOCKOWNER | MNI | | N/A | | 141 | | RELEASE_LOCKOWNER | MNI | | N/A | |
142 | | REMOVE | REQ | | Section 18.25 | | 142 | | REMOVE | REQ | | Section 18.25 | |
143 | | RENAME | REQ | | Section 18.26 | | 143 | | RENAME | REQ | | Section 18.26 | |
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index f6b1b5fca1df..9fb6cbe70bde 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -316,7 +316,7 @@ address perms offset dev inode pathname | |||
316 | 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test | 316 | 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test |
317 | 0804a000-0806b000 rw-p 00000000 00:00 0 [heap] | 317 | 0804a000-0806b000 rw-p 00000000 00:00 0 [heap] |
318 | a7cb1000-a7cb2000 ---p 00000000 00:00 0 | 318 | a7cb1000-a7cb2000 ---p 00000000 00:00 0 |
319 | a7cb2000-a7eb2000 rw-p 00000000 00:00 0 [threadstack:001ff4b4] | 319 | a7cb2000-a7eb2000 rw-p 00000000 00:00 0 |
320 | a7eb2000-a7eb3000 ---p 00000000 00:00 0 | 320 | a7eb2000-a7eb3000 ---p 00000000 00:00 0 |
321 | a7eb3000-a7ed5000 rw-p 00000000 00:00 0 | 321 | a7eb3000-a7ed5000 rw-p 00000000 00:00 0 |
322 | a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 | 322 | a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 |
@@ -352,7 +352,6 @@ is not associated with a file: | |||
352 | [stack] = the stack of the main process | 352 | [stack] = the stack of the main process |
353 | [vdso] = the "virtual dynamic shared object", | 353 | [vdso] = the "virtual dynamic shared object", |
354 | the kernel system call handler | 354 | the kernel system call handler |
355 | [threadstack:xxxxxxxx] = the stack of the thread, xxxxxxxx is the stack size | ||
356 | 355 | ||
357 | or if empty, the mapping is anonymous. | 356 | or if empty, the mapping is anonymous. |
358 | 357 | ||
@@ -566,6 +565,10 @@ The default_smp_affinity mask applies to all non-active IRQs, which are the | |||
566 | IRQs which have not yet been allocated/activated, and hence which lack a | 565 | IRQs which have not yet been allocated/activated, and hence which lack a |
567 | /proc/irq/[0-9]* directory. | 566 | /proc/irq/[0-9]* directory. |
568 | 567 | ||
568 | The node file on an SMP system shows the node to which the device using the IRQ | ||
569 | reports itself as being attached. This hardware locality information does not | ||
570 | include information about any possible driver locality preference. | ||
571 | |||
569 | prof_cpu_mask specifies which CPUs are to be profiled by the system wide | 572 | prof_cpu_mask specifies which CPUs are to be profiled by the system wide |
570 | profiler. Default value is ffffffff (all cpus). | 573 | profiler. Default value is ffffffff (all cpus). |
571 | 574 | ||
diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients index 3219ee0dbfef..5ebf5af1d716 100644 --- a/Documentation/i2c/writing-clients +++ b/Documentation/i2c/writing-clients | |||
@@ -74,6 +74,11 @@ structure at all. You should use this to keep device-specific data. | |||
74 | /* retrieve the value */ | 74 | /* retrieve the value */ |
75 | void *i2c_get_clientdata(const struct i2c_client *client); | 75 | void *i2c_get_clientdata(const struct i2c_client *client); |
76 | 76 | ||
77 | Note that starting with kernel 2.6.34, you don't have to set the `data' field | ||
78 | to NULL in remove() or if probe() failed anymore. The i2c-core does this | ||
79 | automatically on these occasions. Those are also the only times the core will | ||
80 | touch this field. | ||
81 | |||
77 | 82 | ||
78 | Accessing the client | 83 | Accessing the client |
79 | ==================== | 84 | ==================== |
diff --git a/Documentation/input/elantech.txt b/Documentation/input/elantech.txt index a10c3b6ba7c4..56941ae1f5db 100644 --- a/Documentation/input/elantech.txt +++ b/Documentation/input/elantech.txt | |||
@@ -333,14 +333,14 @@ byte 0: | |||
333 | byte 1: | 333 | byte 1: |
334 | 334 | ||
335 | bit 7 6 5 4 3 2 1 0 | 335 | bit 7 6 5 4 3 2 1 0 |
336 | x15 x14 x13 x12 x11 x10 x9 x8 | 336 | . . . . . x10 x9 x8 |
337 | 337 | ||
338 | byte 2: | 338 | byte 2: |
339 | 339 | ||
340 | bit 7 6 5 4 3 2 1 0 | 340 | bit 7 6 5 4 3 2 1 0 |
341 | x7 x6 x5 x4 x3 x2 x1 x0 | 341 | x7 x6 x5 x4 x3 x2 x1 x0 |
342 | 342 | ||
343 | x15..x0 = absolute x value (horizontal) | 343 | x10..x0 = absolute x value (horizontal) |
344 | 344 | ||
345 | byte 3: | 345 | byte 3: |
346 | 346 | ||
@@ -350,14 +350,14 @@ byte 3: | |||
350 | byte 4: | 350 | byte 4: |
351 | 351 | ||
352 | bit 7 6 5 4 3 2 1 0 | 352 | bit 7 6 5 4 3 2 1 0 |
353 | y15 y14 y13 y12 y11 y10 y8 y8 | 353 | . . . . . . y9 y8 |
354 | 354 | ||
355 | byte 5: | 355 | byte 5: |
356 | 356 | ||
357 | bit 7 6 5 4 3 2 1 0 | 357 | bit 7 6 5 4 3 2 1 0 |
358 | y7 y6 y5 y4 y3 y2 y1 y0 | 358 | y7 y6 y5 y4 y3 y2 y1 y0 |
359 | 359 | ||
360 | y15..y0 = absolute y value (vertical) | 360 | y9..y0 = absolute y value (vertical) |
361 | 361 | ||
362 | 362 | ||
363 | 4.2.2 Two finger touch | 363 | 4.2.2 Two finger touch |
diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt index 1423bcc7c507..5dc59b04a71f 100644 --- a/Documentation/intel_txt.txt +++ b/Documentation/intel_txt.txt | |||
@@ -161,13 +161,15 @@ o In order to put a system into any of the sleep states after a TXT | |||
161 | has been restored, it will restore the TPM PCRs and then | 161 | has been restored, it will restore the TPM PCRs and then |
162 | transfer control back to the kernel's S3 resume vector. | 162 | transfer control back to the kernel's S3 resume vector. |
163 | In order to preserve system integrity across S3, the kernel | 163 | In order to preserve system integrity across S3, the kernel |
164 | provides tboot with a set of memory ranges (kernel | 164 | provides tboot with a set of memory ranges (RAM and RESERVED_KERN |
165 | code/data/bss, S3 resume code, and AP trampoline) that tboot | 165 | in the e820 table, but not any memory that BIOS might alter over |
166 | will calculate a MAC (message authentication code) over and then | 166 | the S3 transition) that tboot will calculate a MAC (message |
167 | seal with the TPM. On resume and once the measured environment | 167 | authentication code) over and then seal with the TPM. On resume |
168 | has been re-established, tboot will re-calculate the MAC and | 168 | and once the measured environment has been re-established, tboot |
169 | verify it against the sealed value. Tboot's policy determines | 169 | will re-calculate the MAC and verify it against the sealed value. |
170 | what happens if the verification fails. | 170 | Tboot's policy determines what happens if the verification fails. |
171 | Note that the c/s 194 of tboot which has the new MAC code supports | ||
172 | this. | ||
171 | 173 | ||
172 | That's pretty much it for TXT support. | 174 | That's pretty much it for TXT support. |
173 | 175 | ||
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e2202e93b148..b9b0d7989f4e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -99,6 +99,7 @@ parameter is applicable: | |||
99 | SWSUSP Software suspend (hibernation) is enabled. | 99 | SWSUSP Software suspend (hibernation) is enabled. |
100 | SUSPEND System suspend states are enabled. | 100 | SUSPEND System suspend states are enabled. |
101 | FTRACE Function tracing enabled. | 101 | FTRACE Function tracing enabled. |
102 | TPM TPM drivers are enabled. | ||
102 | TS Appropriate touchscreen support is enabled. | 103 | TS Appropriate touchscreen support is enabled. |
103 | UMS USB Mass Storage support is enabled. | 104 | UMS USB Mass Storage support is enabled. |
104 | USB USB support is enabled. | 105 | USB USB support is enabled. |
@@ -324,6 +325,8 @@ and is between 256 and 4096 characters. It is defined in the file | |||
324 | they are unmapped. Otherwise they are | 325 | they are unmapped. Otherwise they are |
325 | flushed before they will be reused, which | 326 | flushed before they will be reused, which |
326 | is a lot faster | 327 | is a lot faster |
328 | off - do not initialize any AMD IOMMU found in | ||
329 | the system | ||
327 | 330 | ||
328 | amijoy.map= [HW,JOY] Amiga joystick support | 331 | amijoy.map= [HW,JOY] Amiga joystick support |
329 | Map of devices attached to JOY0DAT and JOY1DAT | 332 | Map of devices attached to JOY0DAT and JOY1DAT |
@@ -784,8 +787,12 @@ and is between 256 and 4096 characters. It is defined in the file | |||
784 | as early as possible in order to facilitate early | 787 | as early as possible in order to facilitate early |
785 | boot debugging. | 788 | boot debugging. |
786 | 789 | ||
787 | ftrace_dump_on_oops | 790 | ftrace_dump_on_oops[=orig_cpu] |
788 | [FTRACE] will dump the trace buffers on oops. | 791 | [FTRACE] will dump the trace buffers on oops. |
792 | If no parameter is passed, ftrace will dump | ||
793 | buffers of all CPUs, but if you pass orig_cpu, it will | ||
794 | dump only the buffer of the CPU that triggered the | ||
795 | oops. | ||
789 | 796 | ||
790 | ftrace_filter=[function-list] | 797 | ftrace_filter=[function-list] |
791 | [FTRACE] Limit the functions traced by the function | 798 | [FTRACE] Limit the functions traced by the function |
@@ -1194,7 +1201,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1194 | 1201 | ||
1195 | libata.force= [LIBATA] Force configurations. The format is comma | 1202 | libata.force= [LIBATA] Force configurations. The format is comma |
1196 | separated list of "[ID:]VAL" where ID is | 1203 | separated list of "[ID:]VAL" where ID is |
1197 | PORT[:DEVICE]. PORT and DEVICE are decimal numbers | 1204 | PORT[.DEVICE]. PORT and DEVICE are decimal numbers |
1198 | matching port, link or device. Basically, it matches | 1205 | matching port, link or device. Basically, it matches |
1199 | the ATA ID string printed on console by libata. If | 1206 | the ATA ID string printed on console by libata. If |
1200 | the whole ID part is omitted, the last PORT and DEVICE | 1207 | the whole ID part is omitted, the last PORT and DEVICE |
@@ -2610,6 +2617,15 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2610 | 2617 | ||
2611 | tp720= [HW,PS2] | 2618 | tp720= [HW,PS2] |
2612 | 2619 | ||
2620 | tpm_suspend_pcr=[HW,TPM] | ||
2621 | Format: integer pcr id | ||
2622 | Specify that at suspend time, the tpm driver | ||
2623 | should extend the specified pcr with zeros, | ||
2624 | as a workaround for some chips which fail to | ||
2625 | flush the last written pcr on TPM_SaveState. | ||
2626 | This will guarantee that all the other pcrs | ||
2627 | are saved. | ||
2628 | |||
2613 | trace_buf_size=nn[KMG] | 2629 | trace_buf_size=nn[KMG] |
2614 | [FTRACE] will set tracing buffer size. | 2630 | [FTRACE] will set tracing buffer size. |
2615 | 2631 | ||
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 51ec634ac04b..6653017680dd 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt | |||
@@ -165,8 +165,8 @@ the user entry_handler invocation is also skipped. | |||
165 | 165 | ||
166 | 1.4 How Does Jump Optimization Work? | 166 | 1.4 How Does Jump Optimization Work? |
167 | 167 | ||
168 | If you configured your kernel with CONFIG_OPTPROBES=y (currently | 168 | If your kernel is built with CONFIG_OPTPROBES=y (currently this flag |
169 | this option is supported on x86/x86-64, non-preemptive kernel) and | 169 | is automatically set 'y' on x86/x86-64, non-preemptive kernel) and |
170 | the "debug.kprobes_optimization" kernel parameter is set to 1 (see | 170 | the "debug.kprobes_optimization" kernel parameter is set to 1 (see |
171 | sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump | 171 | sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump |
172 | instruction instead of a breakpoint instruction at each probepoint. | 172 | instruction instead of a breakpoint instruction at each probepoint. |
@@ -271,8 +271,6 @@ tweak the kernel's execution path, you need to suppress optimization, | |||
271 | using one of the following techniques: | 271 | using one of the following techniques: |
272 | - Specify an empty function for the kprobe's post_handler or break_handler. | 272 | - Specify an empty function for the kprobe's post_handler or break_handler. |
273 | or | 273 | or |
274 | - Config CONFIG_OPTPROBES=n. | ||
275 | or | ||
276 | - Execute 'sysctl -w debug.kprobes_optimization=n' | 274 | - Execute 'sysctl -w debug.kprobes_optimization=n' |
277 | 275 | ||
278 | 2. Architectures Supported | 276 | 2. Architectures Supported |
@@ -307,10 +305,6 @@ it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO), | |||
307 | so you can use "objdump -d -l vmlinux" to see the source-to-object | 305 | so you can use "objdump -d -l vmlinux" to see the source-to-object |
308 | code mapping. | 306 | code mapping. |
309 | 307 | ||
310 | If you want to reduce probing overhead, set "Kprobes jump optimization | ||
311 | support" (CONFIG_OPTPROBES) to "y". You can find this option under the | ||
312 | "Kprobes" line. | ||
313 | |||
314 | 4. API Reference | 308 | 4. API Reference |
315 | 309 | ||
316 | The Kprobes API includes a "register" function and an "unregister" | 310 | The Kprobes API includes a "register" function and an "unregister" |
diff --git a/Documentation/pcmcia/driver-changes.txt b/Documentation/pcmcia/driver-changes.txt index 446f43b309df..61bc4e943116 100644 --- a/Documentation/pcmcia/driver-changes.txt +++ b/Documentation/pcmcia/driver-changes.txt | |||
@@ -1,4 +1,17 @@ | |||
1 | This file details changes in 2.6 which affect PCMCIA card driver authors: | 1 | This file details changes in 2.6 which affect PCMCIA card driver authors: |
2 | * No dev_node_t (as of 2.6.35) | ||
3 | There is no more need to fill out a "dev_node_t" structure. | ||
4 | |||
5 | * New IRQ request rules (as of 2.6.35) | ||
6 | Instead of the old pcmcia_request_irq() interface, drivers may now | ||
7 | choose between: | ||
8 | - calling request_irq/free_irq directly. Use the IRQ from *p_dev->irq. | ||
9 | - use pcmcia_request_irq(p_dev, handler_t); the PCMCIA core will | ||
10 | clean up automatically on calls to pcmcia_disable_device() or | ||
11 | device ejection. | ||
12 | - drivers still not capable of IRQF_SHARED (or not telling us so) may | ||
13 | use the deprecated pcmcia_request_exclusive_irq() for the time | ||
14 | being; they might receive a shared IRQ nonetheless. | ||
2 | 15 | ||
3 | * no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33) | 16 | * no cs_error / CS_CHECK / CONFIG_PCMCIA_DEBUG (as of 2.6.33) |
4 | Instead of the cs_error() callback or the CS_CHECK() macro, please use | 17 | Instead of the cs_error() callback or the CS_CHECK() macro, please use |
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index c9abbd86bc18..57080cd74575 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt | |||
@@ -1,7 +1,13 @@ | |||
1 | Device Power Management | ||
2 | |||
3 | Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. | ||
4 | Copyright (c) 2010 Alan Stern <stern@rowland.harvard.edu> | ||
5 | |||
6 | |||
1 | Most of the code in Linux is device drivers, so most of the Linux power | 7 | Most of the code in Linux is device drivers, so most of the Linux power |
2 | management code is also driver-specific. Most drivers will do very little; | 8 | management (PM) code is also driver-specific. Most drivers will do very |
3 | others, especially for platforms with small batteries (like cell phones), | 9 | little; others, especially for platforms with small batteries (like cell |
4 | will do a lot. | 10 | phones), will do a lot. |
5 | 11 | ||
6 | This writeup gives an overview of how drivers interact with system-wide | 12 | This writeup gives an overview of how drivers interact with system-wide |
7 | power management goals, emphasizing the models and interfaces that are | 13 | power management goals, emphasizing the models and interfaces that are |
@@ -15,9 +21,10 @@ Drivers will use one or both of these models to put devices into low-power | |||
15 | states: | 21 | states: |
16 | 22 | ||
17 | System Sleep model: | 23 | System Sleep model: |
18 | Drivers can enter low power states as part of entering system-wide | 24 | Drivers can enter low-power states as part of entering system-wide |
19 | low-power states like "suspend-to-ram", or (mostly for systems with | 25 | low-power states like "suspend" (also known as "suspend-to-RAM"), or |
20 | disks) "hibernate" (suspend-to-disk). | 26 | (mostly for systems with disks) "hibernation" (also known as |
27 | "suspend-to-disk"). | ||
21 | 28 | ||
22 | This is something that device, bus, and class drivers collaborate on | 29 | This is something that device, bus, and class drivers collaborate on |
23 | by implementing various role-specific suspend and resume methods to | 30 | by implementing various role-specific suspend and resume methods to |
@@ -25,33 +32,41 @@ states: | |||
25 | them without loss of data. | 32 | them without loss of data. |
26 | 33 | ||
27 | Some drivers can manage hardware wakeup events, which make the system | 34 | Some drivers can manage hardware wakeup events, which make the system |
28 | leave that low-power state. This feature may be disabled using the | 35 | leave the low-power state. This feature may be enabled or disabled |
29 | relevant /sys/devices/.../power/wakeup file; enabling it may cost some | 36 | using the relevant /sys/devices/.../power/wakeup file (for Ethernet |
30 | power usage, but let the whole system enter low power states more often. | 37 | drivers the ioctl interface used by ethtool may also be used for this |
38 | purpose); enabling it may cost some power usage, but let the whole | ||
39 | system enter low-power states more often. | ||
31 | 40 | ||
32 | Runtime Power Management model: | 41 | Runtime Power Management model: |
33 | Drivers may also enter low power states while the system is running, | 42 | Devices may also be put into low-power states while the system is |
34 | independently of other power management activity. Upstream drivers | 43 | running, independently of other power management activity in principle. |
35 | will normally not know (or care) if the device is in some low power | 44 | However, devices are not generally independent of each other (for |
36 | state when issuing requests; the driver will auto-resume anything | 45 | example, a parent device cannot be suspended unless all of its child |
37 | that's needed when it gets a request. | 46 | devices have been suspended). Moreover, depending on the bus type the |
38 | 47 | device is on, it may be necessary to carry out some bus-specific | |
39 | This doesn't have, or need much infrastructure; it's just something you | 48 | operations on the device for this purpose. Devices put into low power |
40 | should do when writing your drivers. For example, clk_disable() unused | 49 | states at run time may require special handling during system-wide power |
41 | clocks as part of minimizing power drain for currently-unused hardware. | 50 | transitions (suspend or hibernation). |
42 | Of course, sometimes clusters of drivers will collaborate with each | 51 | |
43 | other, which could involve task-specific power management. | 52 | For these reasons not only the device driver itself, but also the |
44 | 53 | appropriate subsystem (bus type, device type or device class) driver and | |
45 | There's not a lot to be said about those low power states except that they | 54 | the PM core are involved in runtime power management. As in the system |
46 | are very system-specific, and often device-specific. Also, that if enough | 55 | sleep power management case, they need to collaborate by implementing |
47 | drivers put themselves into low power states (at "runtime"), the effect may be | 56 | various role-specific suspend and resume methods, so that the hardware |
48 | the same as entering some system-wide low-power state (system sleep) ... and | 57 | is cleanly powered down and reactivated without data or service loss. |
49 | that synergies exist, so that several drivers using runtime pm might put the | 58 | |
50 | system into a state where even deeper power saving options are available. | 59 | There's not a lot to be said about those low-power states except that they are |
51 | 60 | very system-specific, and often device-specific. Also, that if enough devices | |
52 | Most suspended devices will have quiesced all I/O: no more DMA or irqs, no | 61 | have been put into low-power states (at runtime), the effect may be very similar |
53 | more data read or written, and requests from upstream drivers are no longer | 62 | to entering some system-wide low-power state (system sleep) ... and that |
54 | accepted. A given bus or platform may have different requirements though. | 63 | synergies exist, so that several drivers using runtime PM might put the system |
64 | into a state where even deeper power saving options are available. | ||
65 | |||
66 | Most suspended devices will have quiesced all I/O: no more DMA or IRQs (except | ||
67 | for wakeup events), no more data read or written, and requests from upstream | ||
68 | drivers are no longer accepted. A given bus or platform may have different | ||
69 | requirements though. | ||
55 | 70 | ||
56 | Examples of hardware wakeup events include an alarm from a real time clock, | 71 | Examples of hardware wakeup events include an alarm from a real time clock, |
57 | network wake-on-LAN packets, keyboard or mouse activity, and media insertion | 72 | network wake-on-LAN packets, keyboard or mouse activity, and media insertion |
@@ -60,129 +75,152 @@ or removal (for PCMCIA, MMC/SD, USB, and so on). | |||
60 | 75 | ||
61 | Interfaces for Entering System Sleep States | 76 | Interfaces for Entering System Sleep States |
62 | =========================================== | 77 | =========================================== |
63 | Most of the programming interfaces a device driver needs to know about | 78 | There are programming interfaces provided for subsystems (bus type, device type, |
64 | relate to that first model: entering a system-wide low power state, | 79 | device class) and device drivers to allow them to participate in the power |
65 | rather than just minimizing power consumption by one device. | 80 | management of devices they are concerned with. These interfaces cover both |
66 | 81 | system sleep and runtime power management. | |
67 | 82 | ||
68 | Bus Driver Methods | 83 | |
69 | ------------------ | 84 | Device Power Management Operations |
70 | The core methods to suspend and resume devices reside in struct bus_type. | 85 | ---------------------------------- |
71 | These are mostly of interest to people writing infrastructure for busses | 86 | Device power management operations, at the subsystem level as well as at the |
72 | like PCI or USB, or because they define the primitives that device drivers | 87 | device driver level, are implemented by defining and populating objects of type |
73 | may need to apply in domain-specific ways to their devices: | 88 | struct dev_pm_ops: |
74 | 89 | ||
75 | struct bus_type { | 90 | struct dev_pm_ops { |
76 | ... | 91 | int (*prepare)(struct device *dev); |
77 | int (*suspend)(struct device *dev, pm_message_t state); | 92 | void (*complete)(struct device *dev); |
78 | int (*resume)(struct device *dev); | 93 | int (*suspend)(struct device *dev); |
94 | int (*resume)(struct device *dev); | ||
95 | int (*freeze)(struct device *dev); | ||
96 | int (*thaw)(struct device *dev); | ||
97 | int (*poweroff)(struct device *dev); | ||
98 | int (*restore)(struct device *dev); | ||
99 | int (*suspend_noirq)(struct device *dev); | ||
100 | int (*resume_noirq)(struct device *dev); | ||
101 | int (*freeze_noirq)(struct device *dev); | ||
102 | int (*thaw_noirq)(struct device *dev); | ||
103 | int (*poweroff_noirq)(struct device *dev); | ||
104 | int (*restore_noirq)(struct device *dev); | ||
105 | int (*runtime_suspend)(struct device *dev); | ||
106 | int (*runtime_resume)(struct device *dev); | ||
107 | int (*runtime_idle)(struct device *dev); | ||
79 | }; | 108 | }; |
80 | 109 | ||
81 | Bus drivers implement those methods as appropriate for the hardware and | 110 | This structure is defined in include/linux/pm.h and the methods included in it |
82 | the drivers using it; PCI works differently from USB, and so on. Not many | 111 | are also described in that file. Their roles will be explained in what follows. |
83 | people write bus drivers; most driver code is a "device driver" that | 112 | For now, it should be sufficient to remember that the last three methods are |
84 | builds on top of bus-specific framework code. | 113 | specific to runtime power management while the remaining ones are used during |
114 | system-wide power transitions. | ||
85 | 115 | ||
86 | For more information on these driver calls, see the description later; | 116 | There also is a deprecated "old" or "legacy" interface for power management |
87 | they are called in phases for every device, respecting the parent-child | 117 | operations available at least for some subsystems. This approach does not use |
88 | sequencing in the driver model tree. Note that as this is being written, | 118 | struct dev_pm_ops objects and it is suitable only for implementing system sleep |
89 | only the suspend() and resume() are widely available; not many bus drivers | 119 | power management methods. Therefore it is not described in this document, so |
90 | leverage all of those phases, or pass them down to lower driver levels. | 120 | please refer directly to the source code for more information about it. |
91 | 121 | ||
92 | 122 | ||
93 | /sys/devices/.../power/wakeup files | 123 | Subsystem-Level Methods |
94 | ----------------------------------- | 124 | ----------------------- |
95 | All devices in the driver model have two flags to control handling of | 125 | The core methods to suspend and resume devices reside in struct dev_pm_ops |
96 | wakeup events, which are hardware signals that can force the device and/or | 126 | pointed to by the pm member of struct bus_type, struct device_type and |
97 | system out of a low power state. These are initialized by bus or device | 127 | struct class. They are mostly of interest to the people writing infrastructure |
98 | driver code using device_init_wakeup(dev,can_wakeup). | 128 | for buses, like PCI or USB, or device type and device class drivers. |
99 | 129 | ||
100 | The "can_wakeup" flag just records whether the device (and its driver) can | 130 | Bus drivers implement these methods as appropriate for the hardware and the |
101 | physically support wakeup events. When that flag is clear, the sysfs | 131 | drivers using it; PCI works differently from USB, and so on. Not many people |
102 | "wakeup" file is empty, and device_may_wakeup() returns false. | 132 | write subsystem-level drivers; most driver code is a "device driver" that builds |
133 | on top of bus-specific framework code. | ||
103 | 134 | ||
104 | For devices that can issue wakeup events, a separate flag controls whether | 135 | For more information on these driver calls, see the description later; |
105 | that device should try to use its wakeup mechanism. The initial value of | 136 | they are called in phases for every device, respecting the parent-child |
106 | device_may_wakeup() will be true, so that the device's "wakeup" file holds | 137 | sequencing in the driver model tree. |
107 | the value "enabled". Userspace can change that to "disabled" so that | ||
108 | device_may_wakeup() returns false; or change it back to "enabled" (so that | ||
109 | it returns true again). | ||
110 | 138 | ||
111 | 139 | ||
112 | EXAMPLE: PCI Device Driver Methods | 140 | /sys/devices/.../power/wakeup files |
113 | ----------------------------------- | 141 | ----------------------------------- |
114 | PCI framework software calls these methods when the PCI device driver bound | 142 | All devices in the driver model have two flags to control handling of wakeup |
115 | to a device device has provided them: | 143 | events (hardware signals that can force the device and/or system out of a low |
116 | 144 | power state). These flags are initialized by bus or device driver code using | |
117 | struct pci_driver { | 145 | device_set_wakeup_capable() and device_set_wakeup_enable(), defined in |
118 | ... | 146 | include/linux/pm_wakeup.h. |
119 | int (*suspend)(struct pci_device *pdev, pm_message_t state); | ||
120 | int (*suspend_late)(struct pci_device *pdev, pm_message_t state); | ||
121 | 147 | ||
122 | int (*resume_early)(struct pci_device *pdev); | 148 | The "can_wakeup" flag just records whether the device (and its driver) can |
123 | int (*resume)(struct pci_device *pdev); | 149 | physically support wakeup events. The device_set_wakeup_capable() routine |
124 | }; | 150 | affects this flag. The "should_wakeup" flag controls whether the device should |
125 | 151 | try to use its wakeup mechanism. device_set_wakeup_enable() affects this flag; | |
126 | Drivers will implement those methods, and call PCI-specific procedures | 152 | for the most part drivers should not change its value. The initial value of |
127 | like pci_set_power_state(), pci_enable_wake(), pci_save_state(), and | 153 | should_wakeup is supposed to be false for the majority of devices; the major |
128 | pci_restore_state() to manage PCI-specific mechanisms. (PCI config space | 154 | exceptions are power buttons, keyboards, and Ethernet adapters whose WoL |
129 | could be saved during driver probe, if it weren't for the fact that some | 155 | (wake-on-LAN) feature has been set up with ethtool. |
130 | systems rely on userspace tweaking using setpci.) Devices are suspended | 156 | |
131 | before their bridges enter low power states, and likewise bridges resume | 157 | Whether or not a device is capable of issuing wakeup events is a hardware |
132 | before their devices. | 158 | matter, and the kernel is responsible for keeping track of it. By contrast, |
133 | 159 | whether or not a wakeup-capable device should issue wakeup events is a policy | |
134 | 160 | decision, and it is managed by user space through a sysfs attribute: the | |
135 | Upper Layers of Driver Stacks | 161 | power/wakeup file. User space can write the strings "enabled" or "disabled" to |
136 | ----------------------------- | 162 | set or clear the should_wakeup flag, respectively. Reads from the file will |
137 | Device drivers generally have at least two interfaces, and the methods | 163 | return the corresponding string if can_wakeup is true, but if can_wakeup is |
138 | sketched above are the ones which apply to the lower level (nearer PCI, USB, | 164 | false then reads will return an empty string, to indicate that the device |
139 | or other bus hardware). The network and block layers are examples of upper | 165 | doesn't support wakeup events. (But even though the file appears empty, writes |
140 | level interfaces, as is a character device talking to userspace. | 166 | will still affect the should_wakeup flag.) |
141 | 167 | ||
142 | Power management requests normally need to flow through those upper levels, | 168 | The device_may_wakeup() routine returns true only if both flags are set. |
143 | which often use domain-oriented requests like "blank that screen". In | 169 | Drivers should check this routine when putting devices in a low-power state |
144 | some cases those upper levels will have power management intelligence that | 170 | during a system sleep transition, to see whether or not to enable the devices' |
145 | relates to end-user activity, or other devices that work in cooperation. | 171 | wakeup mechanisms. However for runtime power management, wakeup events should |
146 | 172 | be enabled whenever the device and driver both support them, regardless of the | |
147 | When those interfaces are structured using class interfaces, there is a | 173 | should_wakeup flag. |
148 | standard way to have the upper layer stop issuing requests to a given | 174 | |
149 | class device (and restart later): | 175 | |
150 | 176 | /sys/devices/.../power/control files | |
151 | struct class { | 177 | ------------------------------------ |
152 | ... | 178 | Each device in the driver model has a flag to control whether it is subject to |
153 | int (*suspend)(struct device *dev, pm_message_t state); | 179 | runtime power management. This flag, called runtime_auto, is initialized by the |
154 | int (*resume)(struct device *dev); | 180 | bus type (or generally subsystem) code using pm_runtime_allow() or |
155 | }; | 181 | pm_runtime_forbid(); the default is to allow runtime power management. |
156 | 182 | ||
157 | Those calls are issued in specific phases of the process by which the | 183 | The setting can be adjusted by user space by writing either "on" or "auto" to |
158 | system enters a low power "suspend" state, or resumes from it. | 184 | the device's power/control sysfs file. Writing "auto" calls pm_runtime_allow(), |
159 | 185 | setting the flag and allowing the device to be runtime power-managed by its | |
160 | 186 | driver. Writing "on" calls pm_runtime_forbid(), clearing the flag, returning | |
161 | Calling Drivers to Enter System Sleep States | 187 | the device to full power if it was in a low-power state, and preventing the |
162 | ============================================ | 188 | device from being runtime power-managed. User space can check the current value |
163 | When the system enters a low power state, each device's driver is asked | 189 | of the runtime_auto flag by reading the file. |
164 | to suspend the device by putting it into state compatible with the target | 190 | |
191 | The device's runtime_auto flag has no effect on the handling of system-wide | ||
192 | power transitions. In particular, the device can (and in the majority of cases | ||
193 | should and will) be put into a low-power state during a system-wide transition | ||
194 | to a sleep state even though its runtime_auto flag is clear. | ||
195 | |||
196 | For more information about the runtime power management framework, refer to | ||
197 | Documentation/power/runtime_pm.txt. | ||
198 | |||
199 | |||
200 | Calling Drivers to Enter and Leave System Sleep States | ||
201 | ====================================================== | ||
202 | When the system goes into a sleep state, each device's driver is asked to | ||
203 | suspend the device by putting it into a state compatible with the target | ||
165 | system state. That's usually some version of "off", but the details are | 204 | system state. That's usually some version of "off", but the details are |
166 | system-specific. Also, wakeup-enabled devices will usually stay partly | 205 | system-specific. Also, wakeup-enabled devices will usually stay partly |
167 | functional in order to wake the system. | 206 | functional in order to wake the system. |
168 | 207 | ||
169 | When the system leaves that low power state, the device's driver is asked | 208 | When the system leaves that low-power state, the device's driver is asked to |
170 | to resume it. The suspend and resume operations always go together, and | 209 | resume it by returning it to full power. The suspend and resume operations |
171 | both are multi-phase operations. | 210 | always go together, and both are multi-phase operations. |
172 | 211 | ||
173 | For simple drivers, suspend might quiesce the device using the class code | 212 | For simple drivers, suspend might quiesce the device using class code |
174 | and then turn its hardware as "off" as possible with late_suspend. The | 213 | and then turn its hardware as "off" as possible during suspend_noirq. The |
175 | matching resume calls would then completely reinitialize the hardware | 214 | matching resume calls would then completely reinitialize the hardware |
176 | before reactivating its class I/O queues. | 215 | before reactivating its class I/O queues. |
177 | 216 | ||
178 | More power-aware drivers drivers will use more than one device low power | 217 | More power-aware drivers might prepare the devices for triggering system wakeup |
179 | state, either at runtime or during system sleep states, and might trigger | 218 | events. |
180 | system wakeup events. | ||
181 | 219 | ||
182 | 220 | ||
183 | Call Sequence Guarantees | 221 | Call Sequence Guarantees |
184 | ------------------------ | 222 | ------------------------ |
185 | To ensure that bridges and similar links needed to talk to a device are | 223 | To ensure that bridges and similar links needing to talk to a device are |
186 | available when the device is suspended or resumed, the device tree is | 224 | available when the device is suspended or resumed, the device tree is |
187 | walked in a bottom-up order to suspend devices. A top-down order is | 225 | walked in a bottom-up order to suspend devices. A top-down order is |
188 | used to resume those devices. | 226 | used to resume those devices. |
@@ -194,67 +232,310 @@ its parent; and can't be removed or suspended after that parent. | |||
194 | The policy is that the device tree should match hardware bus topology. | 232 | The policy is that the device tree should match hardware bus topology. |
195 | (Or at least the control bus, for devices which use multiple busses.) | 233 | (Or at least the control bus, for devices which use multiple busses.) |
196 | In particular, this means that a device registration may fail if the parent of | 234 | In particular, this means that a device registration may fail if the parent of |
197 | the device is suspending (ie. has been chosen by the PM core as the next | 235 | the device is suspending (i.e. has been chosen by the PM core as the next |
198 | device to suspend) or has already suspended, as well as after all of the other | 236 | device to suspend) or has already suspended, as well as after all of the other |
199 | devices have been suspended. Device drivers must be prepared to cope with such | 237 | devices have been suspended. Device drivers must be prepared to cope with such |
200 | situations. | 238 | situations. |
201 | 239 | ||
202 | 240 | ||
203 | Suspending Devices | 241 | System Power Management Phases |
204 | ------------------ | 242 | ------------------------------ |
205 | Suspending a given device is done in several phases. Suspending the | 243 | Suspending or resuming the system is done in several phases. Different phases |
206 | system always includes every phase, executing calls for every device | 244 | are used for standby or memory sleep states ("suspend-to-RAM") and the |
207 | before the next phase begins. Not all busses or classes support all | 245 | hibernation state ("suspend-to-disk"). Each phase involves executing callbacks |
208 | these callbacks; and not all drivers use all the callbacks. | 246 | for every device before the next phase begins. Not all busses or classes |
247 | support all these callbacks and not all drivers use all the callbacks. The | ||
248 | various phases always run after tasks have been frozen and before they are | ||
249 | unfrozen. Furthermore, the *_noirq phases run at a time when IRQ handlers have | ||
250 | been disabled (except for those marked with the IRQ_WAKEUP flag). | ||
209 | 251 | ||
210 | The phases are seen by driver notifications issued in this order: | 252 | Most phases use bus, type, and class callbacks (that is, methods defined in |
253 | dev->bus->pm, dev->type->pm, and dev->class->pm). The prepare and complete | ||
254 | phases are exceptions; they use only bus callbacks. When multiple callbacks | ||
255 | are used in a phase, they are invoked in the order: <class, type, bus> during | ||
256 | power-down transitions and in the opposite order during power-up transitions. | ||
257 | For example, during the suspend phase the PM core invokes | ||
211 | 258 | ||
212 | 1 class.suspend(dev, message) is called after tasks are frozen, for | 259 | dev->class->pm.suspend(dev); |
213 | devices associated with a class that has such a method. This | 260 | dev->type->pm.suspend(dev); |
214 | method may sleep. | 261 | dev->bus->pm.suspend(dev); |
215 | 262 | ||
216 | Since I/O activity usually comes from such higher layers, this is | 263 | before moving on to the next device, whereas during the resume phase the core |
217 | a good place to quiesce all drivers of a given type (and keep such | 264 | invokes |
218 | code out of those drivers). | ||
219 | 265 | ||
220 | 2 bus.suspend(dev, message) is called next. This method may sleep, | 266 | dev->bus->pm.resume(dev); |
221 | and is often morphed into a device driver call with bus-specific | 267 | dev->type->pm.resume(dev); |
222 | parameters and/or rules. | 268 | dev->class->pm.resume(dev); |
223 | 269 | ||
224 | This call should handle parts of device suspend logic that require | 270 | These callbacks may in turn invoke device- or driver-specific methods stored in |
225 | sleeping. It probably does work to quiesce the device which hasn't | 271 | dev->driver->pm, but they don't have to. |
226 | been abstracted into class.suspend(). | ||
227 | 272 | ||
228 | The pm_message_t parameter is currently used to refine those semantics | ||
229 | (described later). | ||
230 | 273 | ||
231 | At the end of those phases, drivers should normally have stopped all I/O | 274 | Entering System Suspend |
232 | transactions (DMA, IRQs), saved enough state that they can re-initialize | 275 | ----------------------- |
233 | or restore previous state (as needed by the hardware), and placed the | 276 | When the system goes into the standby or memory sleep state, the phases are: |
234 | device into a low-power state. On many platforms they will also use | 277 | |
235 | clk_disable() to gate off one or more clock sources; sometimes they will | 278 | prepare, suspend, suspend_noirq. |
236 | also switch off power supplies, or reduce voltages. Drivers which have | 279 | |
237 | runtime PM support may already have performed some or all of the steps | 280 | 1. The prepare phase is meant to prevent races by preventing new devices |
238 | needed to prepare for the upcoming system sleep state. | 281 | from being registered; the PM core would never know that all the |
282 | children of a device had been suspended if new children could be | ||
283 | registered at will. (By contrast, devices may be unregistered at any | ||
284 | time.) Unlike the other suspend-related phases, during the prepare | ||
285 | phase the device tree is traversed top-down. | ||
286 | |||
287 | The prepare phase uses only a bus callback. After the callback method | ||
288 | returns, no new children may be registered below the device. The method | ||
289 | may also prepare the device or driver in some way for the upcoming | ||
290 | system power transition, but it should not put the device into a | ||
291 | low-power state. | ||
292 | |||
293 | 2. The suspend methods should quiesce the device to stop it from performing | ||
294 | I/O. They also may save the device registers and put it into the | ||
295 | appropriate low-power state, depending on the bus type the device is on, | ||
296 | and they may enable wakeup events. | ||
297 | |||
298 | 3. The suspend_noirq phase occurs after IRQ handlers have been disabled, | ||
299 | which means that the driver's interrupt handler will not be called while | ||
300 | the callback method is running. The methods should save the values of | ||
301 | the device's registers that weren't saved previously and finally put the | ||
302 | device into the appropriate low-power state. | ||
303 | |||
304 | The majority of subsystems and device drivers need not implement this | ||
305 | callback. However, bus types allowing devices to share interrupt | ||
306 | vectors, like PCI, generally need it; otherwise a driver might encounter | ||
307 | an error during the suspend phase by fielding a shared interrupt | ||
308 | generated by some other device after its own device had been set to low | ||
309 | power. | ||
310 | |||
311 | At the end of these phases, drivers should have stopped all I/O transactions | ||
312 | (DMA, IRQs), saved enough state that they can re-initialize or restore previous | ||
313 | state (as needed by the hardware), and placed the device into a low-power state. | ||
314 | On many platforms they will gate off one or more clock sources; sometimes they | ||
315 | will also switch off power supplies or reduce voltages. (Drivers supporting | ||
316 | runtime PM may already have performed some or all of these steps.) | ||
317 | |||
318 | If device_may_wakeup(dev) returns true, the device should be prepared for | ||
319 | generating hardware wakeup signals to trigger a system wakeup event when the | ||
320 | system is in the sleep state. For example, enable_irq_wake() might identify | ||
321 | GPIO signals hooked up to a switch or other external hardware, and | ||
322 | pci_enable_wake() does something similar for the PCI PME signal. | ||
323 | |||
324 | If any of these callbacks returns an error, the system won't enter the desired | ||
325 | low-power state. Instead the PM core will unwind its actions by resuming all | ||
326 | the devices that were suspended. | ||
327 | |||
328 | |||
329 | Leaving System Suspend | ||
330 | ---------------------- | ||
331 | When resuming from standby or memory sleep, the phases are: | ||
332 | |||
333 | resume_noirq, resume, complete. | ||
334 | |||
335 | 1. The resume_noirq callback methods should perform any actions needed | ||
336 | before the driver's interrupt handlers are invoked. This generally | ||
337 | means undoing the actions of the suspend_noirq phase. If the bus type | ||
338 | permits devices to share interrupt vectors, like PCI, the method should | ||
339 | bring the device and its driver into a state in which the driver can | ||
340 | recognize if the device is the source of incoming interrupts, if any, | ||
341 | and handle them correctly. | ||
342 | |||
343 | For example, the PCI bus type's ->pm.resume_noirq() puts the device into | ||
344 | the full-power state (D0 in the PCI terminology) and restores the | ||
345 | standard configuration registers of the device. Then it calls the | ||
346 | device driver's ->pm.resume_noirq() method to perform device-specific | ||
347 | actions. | ||
348 | |||
349 | 2. The resume methods should bring the device back to its operating | ||
350 | state, so that it can perform normal I/O. This generally involves | ||
351 | undoing the actions of the suspend phase. | ||
352 | |||
353 | 3. The complete phase uses only a bus callback. The method should undo the | ||
354 | actions of the prepare phase. Note, however, that new children may be | ||
355 | registered below the device as soon as the resume callbacks occur; it's | ||
356 | not necessary to wait until the complete phase. | ||
357 | |||
358 | At the end of these phases, drivers should be as functional as they were before | ||
359 | suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are | ||
360 | gated on. Even if the device was in a low-power state before the system sleep | ||
361 | because of runtime power management, afterwards it should be back in its | ||
362 | full-power state. There are multiple reasons why it's best to do this; they are | ||
363 | discussed in more detail in Documentation/power/runtime_pm.txt. | ||
239 | 364 | ||
240 | When any driver sees that its device_can_wakeup(dev), it should make sure | 365 | However, the details here may again be platform-specific. For example, |
241 | to use the relevant hardware signals to trigger a system wakeup event. | 366 | some systems support multiple "run" states, and the mode in effect at |
242 | For example, enable_irq_wake() might identify GPIO signals hooked up to | 367 | the end of resume might not be the one which preceded suspension. |
243 | a switch or other external hardware, and pci_enable_wake() does something | 368 | That means availability of certain clocks or power supplies changed, |
244 | similar for PCI's PME# signal. | 369 | which could easily affect how a driver works. |
370 | |||
371 | Drivers need to be able to handle hardware which has been reset since the | ||
372 | suspend methods were called, for example by complete reinitialization. | ||
373 | This may be the hardest part, and the one most protected by NDA'd documents | ||
374 | and chip errata. It's simplest if the hardware state hasn't changed since | ||
375 | the suspend was carried out, but that can't be guaranteed (in fact, it usually | ||
376 | is not the case). | ||
377 | |||
378 | Drivers must also be prepared to notice that the device has been removed | ||
379 | while the system was powered down, whenever that's physically possible. | ||
380 | PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses | ||
381 | where common Linux platforms will see such removal. Details of how drivers | ||
382 | will notice and handle such removals are currently bus-specific, and often | ||
383 | involve a separate thread. | ||
384 | |||
385 | These callbacks may return an error value, but the PM core will ignore such | ||
386 | errors since there's nothing it can do about them other than printing them in | ||
387 | the system log. | ||
388 | |||
389 | |||
390 | Entering Hibernation | ||
391 | -------------------- | ||
392 | Hibernating the system is more complicated than putting it into the standby or | ||
393 | memory sleep state, because it involves creating and saving a system image. | ||
394 | Therefore there are more phases for hibernation, with a different set of | ||
395 | callbacks. These phases always run after tasks have been frozen and memory has | ||
396 | been freed. | ||
397 | |||
398 | The general procedure for hibernation is to quiesce all devices (freeze), create | ||
399 | an image of the system memory while everything is stable, reactivate all | ||
400 | devices (thaw), write the image to permanent storage, and finally shut down the | ||
401 | system (poweroff). The phases used to accomplish this are: | ||
402 | |||
403 | prepare, freeze, freeze_noirq, thaw_noirq, thaw, complete, | ||
404 | prepare, poweroff, poweroff_noirq | ||
405 | |||
406 | 1. The prepare phase is discussed in the "Entering System Suspend" section | ||
407 | above. | ||
408 | |||
409 | 2. The freeze methods should quiesce the device so that it doesn't generate | ||
410 | IRQs or DMA, and they may need to save the values of device registers. | ||
411 | However the device does not have to be put in a low-power state, and to | ||
412 | save time it's best not to do so. Also, the device should not be | ||
413 | prepared to generate wakeup events. | ||
414 | |||
415 | 3. The freeze_noirq phase is analogous to the suspend_noirq phase discussed | ||
416 | above, except again that the device should not be put in a low-power | ||
417 | state and should not be allowed to generate wakeup events. | ||
418 | |||
419 | At this point the system image is created. All devices should be inactive and | ||
420 | the contents of memory should remain undisturbed while this happens, so that the | ||
421 | image forms an atomic snapshot of the system state. | ||
422 | |||
423 | 4. The thaw_noirq phase is analogous to the resume_noirq phase discussed | ||
424 | above. The main difference is that its methods can assume the device is | ||
425 | in the same state as at the end of the freeze_noirq phase. | ||
426 | |||
427 | 5. The thaw phase is analogous to the resume phase discussed above. Its | ||
428 | methods should bring the device back to an operating state, so that it | ||
429 | can be used for saving the image if necessary. | ||
430 | |||
431 | 6. The complete phase is discussed in the "Leaving System Suspend" section | ||
432 | above. | ||
433 | |||
434 | At this point the system image is saved, and the devices then need to be | ||
435 | prepared for the upcoming system shutdown. This is much like suspending them | ||
436 | before putting the system into the standby or memory sleep state, and the phases | ||
437 | are similar. | ||
438 | |||
439 | 7. The prepare phase is discussed above. | ||
440 | |||
441 | 8. The poweroff phase is analogous to the suspend phase. | ||
442 | |||
443 | 9. The poweroff_noirq phase is analogous to the suspend_noirq phase. | ||
444 | |||
445 | The poweroff and poweroff_noirq callbacks should do essentially the same things | ||
446 | as the suspend and suspend_noirq callbacks. The only notable difference is that | ||
447 | they need not store the device register values, because the registers should | ||
448 | already have been stored during the freeze or freeze_noirq phases. | ||
449 | |||
450 | |||
451 | Leaving Hibernation | ||
452 | ------------------- | ||
453 | Resuming from hibernation is, again, more complicated than resuming from a sleep | ||
454 | state in which the contents of main memory are preserved, because it requires | ||
455 | a system image to be loaded into memory and the pre-hibernation memory contents | ||
456 | to be restored before control can be passed back to the image kernel. | ||
457 | |||
458 | Although in principle, the image might be loaded into memory and the | ||
459 | pre-hibernation memory contents restored by the boot loader, in practice this | ||
460 | can't be done because boot loaders aren't smart enough and there is no | ||
461 | established protocol for passing the necessary information. So instead, the | ||
462 | boot loader loads a fresh instance of the kernel, called the boot kernel, into | ||
463 | memory and passes control to it in the usual way. Then the boot kernel reads | ||
464 | the system image, restores the pre-hibernation memory contents, and passes | ||
465 | control to the image kernel. Thus two different kernels are involved in | ||
466 | resuming from hibernation. In fact, the boot kernel may be completely different | ||
467 | from the image kernel: a different configuration and even a different version. | ||
468 | This has important consequences for device drivers and their subsystems. | ||
469 | |||
470 | To be able to load the system image into memory, the boot kernel needs to | ||
471 | include at least a subset of device drivers allowing it to access the storage | ||
472 | medium containing the image, although it doesn't need to include all of the | ||
473 | drivers present in the image kernel. After the image has been loaded, the | ||
474 | devices managed by the boot kernel need to be prepared for passing control back | ||
475 | to the image kernel. This is very similar to the initial steps involved in | ||
476 | creating a system image, and it is accomplished in the same way, using prepare, | ||
477 | freeze, and freeze_noirq phases. However, the devices affected by these phases | ||
478 | are only those having drivers in the boot kernel; other devices will still be in | ||
479 | whatever state the boot loader left them. | ||
480 | |||
481 | Should the restoration of the pre-hibernation memory contents fail, the boot | ||
482 | kernel would go through the "thawing" procedure described above, using the | ||
483 | thaw_noirq, thaw, and complete phases, and then continue running normally. This | ||
484 | happens only rarely. Most often the pre-hibernation memory contents are | ||
485 | restored successfully and control is passed to the image kernel, which then | ||
486 | becomes responsible for bringing the system back to the working state. | ||
487 | |||
488 | To achieve this, the image kernel must restore the devices' pre-hibernation | ||
489 | functionality. The operation is much like waking up from the memory sleep | ||
490 | state, although it involves different phases: | ||
491 | |||
492 | restore_noirq, restore, complete | ||
493 | |||
494 | 1. The restore_noirq phase is analogous to the resume_noirq phase. | ||
495 | |||
496 | 2. The restore phase is analogous to the resume phase. | ||
497 | |||
498 | 3. The complete phase is discussed above. | ||
499 | |||
500 | The main difference from resume[_noirq] is that restore[_noirq] must assume the | ||
501 | device has been accessed and reconfigured by the boot loader or the boot kernel. | ||
502 | Consequently the state of the device may be different from the state remembered | ||
503 | from the freeze and freeze_noirq phases. The device may even need to be reset | ||
504 | and completely re-initialized. In many cases this difference doesn't matter, so | ||
505 | the resume[_noirq] and restore[_noirq] method pointers can be set to the same | ||
506 | routines. Nevertheless, different callback pointers are used in case there is a | ||
507 | situation where it actually matters. | ||
245 | 508 | ||
246 | If a driver (or bus, or class) fails its suspend method, the system won't | ||
247 | enter the desired low power state; it will resume all the devices it's | ||
248 | suspended so far. | ||
249 | 509 | ||
250 | Note that drivers may need to perform different actions based on the target | 510 | System Devices |
251 | system lowpower/sleep state. At this writing, there are only platform | 511 | -------------- |
252 | specific APIs through which drivers could determine those target states. | 512 | System devices (sysdevs) follow a slightly different API, which can be found in |
513 | |||
514 | include/linux/sysdev.h | ||
515 | drivers/base/sys.c | ||
516 | |||
517 | System devices will be suspended with interrupts disabled, and after all other | ||
518 | devices have been suspended. On resume, they will be resumed before any other | ||
519 | devices, and also with interrupts disabled. These things occur in special | ||
520 | "sysdev_driver" phases, which affect only system devices. | ||
521 | |||
522 | Thus, after the suspend_noirq (or freeze_noirq or poweroff_noirq) phase, when | ||
523 | the non-boot CPUs are all offline and IRQs are disabled on the remaining online | ||
524 | CPU, then a sysdev_driver.suspend phase is carried out, and the system enters a | ||
525 | sleep state (or a system image is created). During resume (or after the image | ||
526 | has been created or loaded) a sysdev_driver.resume phase is carried out, IRQs | ||
527 | are enabled on the only online CPU, the non-boot CPUs are enabled, and the | ||
528 | resume_noirq (or thaw_noirq or restore_noirq) phase begins. | ||
529 | |||
530 | Code to actually enter and exit the system-wide low power state sometimes | ||
531 | involves hardware details that are only known to the boot firmware, and | ||
532 | may leave a CPU running software (from SRAM or flash memory) that monitors | ||
533 | the system and manages its wakeup sequence. | ||
253 | 534 | ||
254 | 535 | ||
255 | Device Low Power (suspend) States | 536 | Device Low Power (suspend) States |
256 | --------------------------------- | 537 | --------------------------------- |
257 | Device low-power states aren't very standard. One device might only handle | 538 | Device low-power states aren't standard. One device might only handle |
258 | "on" and "off", while another might support a dozen different versions of | 539 | "on" and "off", while another might support a dozen different versions of |
259 | "on" (how many engines are active?), plus a state that gets back to "on" | 540 | "on" (how many engines are active?), plus a state that gets back to "on" |
260 | faster than from a full "off". | 541 | faster than from a full "off". |
@@ -265,7 +546,7 @@ PCI device may not perform DMA or issue IRQs, and any wakeup events it | |||
265 | issues would be issued through the PME# bus signal. Plus, there are | 546 | issues would be issued through the PME# bus signal. Plus, there are |
266 | several PCI-standard device states, some of which are optional. | 547 | several PCI-standard device states, some of which are optional. |
267 | 548 | ||
268 | In contrast, integrated system-on-chip processors often use irqs as the | 549 | In contrast, integrated system-on-chip processors often use IRQs as the |
269 | wakeup event sources (so drivers would call enable_irq_wake) and might | 550 | wakeup event sources (so drivers would call enable_irq_wake) and might |
270 | be able to treat DMA completion as a wakeup event (sometimes DMA can stay | 551 | be able to treat DMA completion as a wakeup event (sometimes DMA can stay |
271 | active too, it'd only be the CPU and some peripherals that sleep). | 552 | active too, it'd only be the CPU and some peripherals that sleep). |
@@ -284,120 +565,17 @@ ways; the aforementioned LCD might be active in one product's "standby", | |||
284 | but a different product using the same SOC might work differently. | 565 | but a different product using the same SOC might work differently. |
285 | 566 | ||
286 | 567 | ||
287 | Meaning of pm_message_t.event | 568 | Power Management Notifiers |
288 | ----------------------------- | 569 | -------------------------- |
289 | Parameters to suspend calls include the device affected and a message of | 570 | There are some operations that cannot be carried out by the power management |
290 | type pm_message_t, which has one field: the event. If driver does not | 571 | callbacks discussed above, because the callbacks occur too late or too early. |
291 | recognize the event code, suspend calls may abort the request and return | 572 | To handle these cases, subsystems and device drivers may register power |
292 | a negative errno. However, most drivers will be fine if they implement | 573 | management notifiers that are called before tasks are frozen and after they have |
293 | PM_EVENT_SUSPEND semantics for all messages. | 574 | been thawed. Generally speaking, the PM notifiers are suitable for performing |
575 | actions that either require user space to be available, or at least won't | ||
576 | interfere with user space. | ||
294 | 577 | ||
295 | The event codes are used to refine the goal of suspending the device, and | 578 | For details refer to Documentation/power/notifiers.txt. |
296 | mostly matter when creating or resuming system memory image snapshots, as | ||
297 | used with suspend-to-disk: | ||
298 | |||
299 | PM_EVENT_SUSPEND -- quiesce the driver and put hardware into a low-power | ||
300 | state. When used with system sleep states like "suspend-to-RAM" or | ||
301 | "standby", the upcoming resume() call will often be able to rely on | ||
302 | state kept in hardware, or issue system wakeup events. | ||
303 | |||
304 | PM_EVENT_HIBERNATE -- Put hardware into a low-power state and enable wakeup | ||
305 | events as appropriate. It is only used with hibernation | ||
306 | (suspend-to-disk) and few devices are able to wake up the system from | ||
307 | this state; most are completely powered off. | ||
308 | |||
309 | PM_EVENT_FREEZE -- quiesce the driver, but don't necessarily change into | ||
310 | any low power mode. A system snapshot is about to be taken, often | ||
311 | followed by a call to the driver's resume() method. Neither wakeup | ||
312 | events nor DMA are allowed. | ||
313 | |||
314 | PM_EVENT_PRETHAW -- quiesce the driver, knowing that the upcoming resume() | ||
315 | will restore a suspend-to-disk snapshot from a different kernel image. | ||
316 | Drivers that are smart enough to look at their hardware state during | ||
317 | resume() processing need that state to be correct ... a PRETHAW could | ||
318 | be used to invalidate that state (by resetting the device), like a | ||
319 | shutdown() invocation would before a kexec() or system halt. Other | ||
320 | drivers might handle this the same way as PM_EVENT_FREEZE. Neither | ||
321 | wakeup events nor DMA are allowed. | ||
322 | |||
323 | To enter "standby" (ACPI S1) or "Suspend to RAM" (STR, ACPI S3) states, or | ||
324 | the similarly named APM states, only PM_EVENT_SUSPEND is used; the other event | ||
325 | codes are used for hibernation ("Suspend to Disk", STD, ACPI S4). | ||
326 | |||
327 | There's also PM_EVENT_ON, a value which never appears as a suspend event | ||
328 | but is sometimes used to record the "not suspended" device state. | ||
329 | |||
330 | |||
331 | Resuming Devices | ||
332 | ---------------- | ||
333 | Resuming is done in multiple phases, much like suspending, with all | ||
334 | devices processing each phase's calls before the next phase begins. | ||
335 | |||
336 | The phases are seen by driver notifications issued in this order: | ||
337 | |||
338 | 1 bus.resume(dev) reverses the effects of bus.suspend(). This may | ||
339 | be morphed into a device driver call with bus-specific parameters; | ||
340 | implementations may sleep. | ||
341 | |||
342 | 2 class.resume(dev) is called for devices associated with a class | ||
343 | that has such a method. Implementations may sleep. | ||
344 | |||
345 | This reverses the effects of class.suspend(), and would usually | ||
346 | reactivate the device's I/O queue. | ||
347 | |||
348 | At the end of those phases, drivers should normally be as functional as | ||
349 | they were before suspending: I/O can be performed using DMA and IRQs, and | ||
350 | the relevant clocks are gated on. The device need not be "fully on"; it | ||
351 | might be in a runtime lowpower/suspend state that acts as if it were. | ||
352 | |||
353 | However, the details here may again be platform-specific. For example, | ||
354 | some systems support multiple "run" states, and the mode in effect at | ||
355 | the end of resume() might not be the one which preceded suspension. | ||
356 | That means availability of certain clocks or power supplies changed, | ||
357 | which could easily affect how a driver works. | ||
358 | |||
359 | |||
360 | Drivers need to be able to handle hardware which has been reset since the | ||
361 | suspend methods were called, for example by complete reinitialization. | ||
362 | This may be the hardest part, and the one most protected by NDA'd documents | ||
363 | and chip errata. It's simplest if the hardware state hasn't changed since | ||
364 | the suspend() was called, but that can't always be guaranteed. | ||
365 | |||
366 | Drivers must also be prepared to notice that the device has been removed | ||
367 | while the system was powered off, whenever that's physically possible. | ||
368 | PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses | ||
369 | where common Linux platforms will see such removal. Details of how drivers | ||
370 | will notice and handle such removals are currently bus-specific, and often | ||
371 | involve a separate thread. | ||
372 | |||
373 | |||
374 | Note that the bus-specific runtime PM wakeup mechanism can exist, and might | ||
375 | be defined to share some of the same driver code as for system wakeup. For | ||
376 | example, a bus-specific device driver's resume() method might be used there, | ||
377 | so it wouldn't only be called from bus.resume() during system-wide wakeup. | ||
378 | See bus-specific information about how runtime wakeup events are handled. | ||
379 | |||
380 | |||
381 | System Devices | ||
382 | -------------- | ||
383 | System devices follow a slightly different API, which can be found in | ||
384 | |||
385 | include/linux/sysdev.h | ||
386 | drivers/base/sys.c | ||
387 | |||
388 | System devices will only be suspended with interrupts disabled, and after | ||
389 | all other devices have been suspended. On resume, they will be resumed | ||
390 | before any other devices, and also with interrupts disabled. | ||
391 | |||
392 | That is, IRQs are disabled, the suspend_late() phase begins, then the | ||
393 | sysdev_driver.suspend() phase, and the system enters a sleep state. Then | ||
394 | the sysdev_driver.resume() phase begins, followed by the resume_early() | ||
395 | phase, after which IRQs are enabled. | ||
396 | |||
397 | Code to actually enter and exit the system-wide low power state sometimes | ||
398 | involves hardware details that are only known to the boot firmware, and | ||
399 | may leave a CPU running software (from SRAM or flash memory) that monitors | ||
400 | the system and manages its wakeup sequence. | ||
401 | 579 | ||
402 | 580 | ||
403 | Runtime Power Management | 581 | Runtime Power Management |
@@ -407,82 +585,23 @@ running. This feature is useful for devices that are not being used, and | |||
407 | can offer significant power savings on a running system. These devices | 585 | can offer significant power savings on a running system. These devices |
408 | often support a range of runtime power states, which might use names such | 586 | often support a range of runtime power states, which might use names such |
409 | as "off", "sleep", "idle", "active", and so on. Those states will in some | 587 | as "off", "sleep", "idle", "active", and so on. Those states will in some |
410 | cases (like PCI) be partially constrained by a bus the device uses, and will | 588 | cases (like PCI) be partially constrained by the bus the device uses, and will |
411 | usually include hardware states that are also used in system sleep states. | 589 | usually include hardware states that are also used in system sleep states. |
412 | 590 | ||
413 | However, note that if a driver puts a device into a runtime low power state | 591 | A system-wide power transition can be started while some devices are in low |
414 | and the system then goes into a system-wide sleep state, it normally ought | 592 | power states due to runtime power management. The system sleep PM callbacks |
415 | to resume into that runtime low power state rather than "full on". Such | 593 | should recognize such situations and react to them appropriately, but the |
416 | distinctions would be part of the driver-internal state machine for that | 594 | necessary actions are subsystem-specific. |
417 | hardware; the whole point of runtime power management is to be sure that | 595 | |
418 | drivers are decoupled in that way from the state machine governing phases | 596 | In some cases the decision may be made at the subsystem level while in other |
419 | of the system-wide power/sleep state transitions. | 597 | cases the device driver may be left to decide. In some cases it may be |
420 | 598 | desirable to leave a suspended device in that state during a system-wide power | |
421 | 599 | transition, but in other cases the device must be put back into the full-power | |
422 | Power Saving Techniques | 600 | state temporarily, for example so that its system wakeup capability can be |
423 | ----------------------- | 601 | disabled. This all depends on the hardware and the design of the subsystem and |
424 | Normally runtime power management is handled by the drivers without specific | 602 | device driver in question. |
425 | userspace or kernel intervention, by device-aware use of techniques like: | 603 | |
426 | 604 | During system-wide resume from a sleep state it's best to put devices into the | |
427 | Using information provided by other system layers | 605 | full-power state, as explained in Documentation/power/runtime_pm.txt. Refer to |
428 | - stay deeply "off" except between open() and close() | 606 | that document for more information regarding this particular issue as well as |
429 | - if transceiver/PHY indicates "nobody connected", stay "off" | 607 | for information on the device runtime power management framework in general. |
430 | - application protocols may include power commands or hints | ||
431 | |||
432 | Using fewer CPU cycles | ||
433 | - using DMA instead of PIO | ||
434 | - removing timers, or making them lower frequency | ||
435 | - shortening "hot" code paths | ||
436 | - eliminating cache misses | ||
437 | - (sometimes) offloading work to device firmware | ||
438 | |||
439 | Reducing other resource costs | ||
440 | - gating off unused clocks in software (or hardware) | ||
441 | - switching off unused power supplies | ||
442 | - eliminating (or delaying/merging) IRQs | ||
443 | - tuning DMA to use word and/or burst modes | ||
444 | |||
445 | Using device-specific low power states | ||
446 | - using lower voltages | ||
447 | - avoiding needless DMA transfers | ||
448 | |||
449 | Read your hardware documentation carefully to see the opportunities that | ||
450 | may be available. If you can, measure the actual power usage and check | ||
451 | it against the budget established for your project. | ||
452 | |||
453 | |||
454 | Examples: USB hosts, system timer, system CPU | ||
455 | ---------------------------------------------- | ||
456 | USB host controllers make interesting, if complex, examples. In many cases | ||
457 | these have no work to do: no USB devices are connected, or all of them are | ||
458 | in the USB "suspend" state. Linux host controller drivers can then disable | ||
459 | periodic DMA transfers that would otherwise be a constant power drain on the | ||
460 | memory subsystem, and enter a suspend state. In power-aware controllers, | ||
461 | entering that suspend state may disable the clock used with USB signaling, | ||
462 | saving a certain amount of power. | ||
463 | |||
464 | The controller will be woken from that state (with an IRQ) by changes to the | ||
465 | signal state on the data lines of a given port, for example by an existing | ||
466 | peripheral requesting "remote wakeup" or by plugging a new peripheral. The | ||
467 | same wakeup mechanism usually works from "standby" sleep states, and on some | ||
468 | systems also from "suspend to RAM" (or even "suspend to disk") states. | ||
469 | (Except that ACPI may be involved instead of normal IRQs, on some hardware.) | ||
470 | |||
471 | System devices like timers and CPUs may have special roles in the platform | ||
472 | power management scheme. For example, system timers using a "dynamic tick" | ||
473 | approach don't just save CPU cycles (by eliminating needless timer IRQs), | ||
474 | but they may also open the door to using lower power CPU "idle" states that | ||
475 | cost more than a jiffie to enter and exit. On x86 systems these are states | ||
476 | like "C3"; note that periodic DMA transfers from a USB host controller will | ||
477 | also prevent entry to a C3 state, much like a periodic timer IRQ. | ||
478 | |||
479 | That kind of runtime mechanism interaction is common. "System On Chip" (SOC) | ||
480 | processors often have low power idle modes that can't be entered unless | ||
481 | certain medium-speed clocks (often 12 or 48 MHz) are gated off. When the | ||
482 | drivers gate those clocks effectively, then the system idle task may be able | ||
483 | to use the lower power idle modes and thereby increase battery life. | ||
484 | |||
485 | If the CPU can have a "cpufreq" driver, there also may be opportunities | ||
486 | to shift to lower voltage settings and reduce the power cost of executing | ||
487 | a given number of instructions. (Without voltage adjustment, it's rare | ||
488 | for cpufreq to save much power; the cost-per-instruction must go down.) | ||
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index c40866e8b957..bfed898a03fc 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt | |||
@@ -18,44 +18,46 @@ and pm_qos_params.h. This is done because having the available parameters | |||
18 | being runtime configurable or changeable from a driver was seen as too easy to | 18 | being runtime configurable or changeable from a driver was seen as too easy to |
19 | abuse. | 19 | abuse. |
20 | 20 | ||
21 | For each parameter a list of performance requirements is maintained along with | 21 | For each parameter a list of performance requests is maintained along with |
22 | an aggregated target value. The aggregated target value is updated with | 22 | an aggregated target value. The aggregated target value is updated with |
23 | changes to the requirement list or elements of the list. Typically the | 23 | changes to the request list or elements of the list. Typically the |
24 | aggregated target value is simply the max or min of the requirement values held | 24 | aggregated target value is simply the max or min of the request values held |
25 | in the parameter list elements. | 25 | in the parameter list elements. |
26 | 26 | ||
27 | From kernel mode the use of this interface is simple: | 27 | From kernel mode the use of this interface is simple: |
28 | pm_qos_add_requirement(param_id, name, target_value): | ||
29 | Will insert a named element in the list for that identified PM_QOS parameter | ||
30 | with the target value. Upon change to this list the new target is recomputed | ||
31 | and any registered notifiers are called only if the target value is now | ||
32 | different. | ||
33 | 28 | ||
34 | pm_qos_update_requirement(param_id, name, new_target_value): | 29 | handle = pm_qos_add_request(param_class, target_value): |
35 | Will search the list identified by the param_id for the named list element and | 30 | Will insert an element into the list for that identified PM_QOS class with the |
36 | then update its target value, calling the notification tree if the aggregated | 31 | target value. Upon change to this list the new target is recomputed and any |
37 | target is changed. with that name is already registered. | 32 | registered notifiers are called only if the target value is now different. |
33 | Clients of pm_qos need to save the returned handle. | ||
38 | 34 | ||
39 | pm_qos_remove_requirement(param_id, name): | 35 | void pm_qos_update_request(handle, new_target_value): |
40 | Will search the identified list for the named element and remove it, after | 36 | Will update the list element pointed to by the handle with the new target value |
41 | removal it will update the aggregate target and call the notification tree if | 37 | and recompute the new aggregated target, calling the notification tree if the |
42 | the target was changed as a result of removing the named requirement. | 38 | target is changed. |
39 | |||
40 | void pm_qos_remove_request(handle): | ||
41 | Will remove the element. After removal it will update the aggregate target and | ||
42 | call the notification tree if the target was changed as a result of removing | ||
43 | the request. | ||
43 | 44 | ||
44 | 45 | ||
45 | From user mode: | 46 | From user mode: |
46 | Only processes can register a pm_qos requirement. To provide for automatic | 47 | Only processes can register a pm_qos request. To provide for automatic |
47 | cleanup for process the interface requires the process to register its | 48 | cleanup of a process, the interface requires the process to register its |
48 | parameter requirements in the following way: | 49 | parameter requests in the following way: |
49 | 50 | ||
50 | To register the default pm_qos target for the specific parameter, the process | 51 | To register the default pm_qos target for the specific parameter, the process |
51 | must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] | 52 | must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] |
52 | 53 | ||
53 | As long as the device node is held open that process has a registered | 54 | As long as the device node is held open that process has a registered |
54 | requirement on the parameter. The name of the requirement is "process_<PID>" | 55 | request on the parameter. |
55 | derived from the current->pid from within the open system call. | ||
56 | 56 | ||
57 | To change the requested target value the process needs to write a s32 value to | 57 | To change the requested target value the process needs to write an s32 value to |
58 | the open device node. This translates to a pm_qos_update_requirement call. | 58 | the open device node. Alternatively the user mode program could write a hex |
59 | string for the value using a 10-character format, e.g. "0x12345678". This | ||
60 | translates to a pm_qos_update_request call. | ||
59 | 61 | ||
60 | To remove the user mode request for a target value simply close the device | 62 | To remove the user mode request for a target value simply close the device |
61 | node. | 63 | node. |
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt index b967cd9137d6..81680f9f5909 100644 --- a/Documentation/power/userland-swsusp.txt +++ b/Documentation/power/userland-swsusp.txt | |||
@@ -24,6 +24,10 @@ assumed to be in the resume mode. The device cannot be open for simultaneous | |||
24 | reading and writing. It is also impossible to have the device open more than | 24 | reading and writing. It is also impossible to have the device open more than |
25 | once at a time. | 25 | once at a time. |
26 | 26 | ||
27 | Even opening the device has side effects. Data structures are | ||
28 | allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are | ||
29 | called. | ||
30 | |||
27 | The ioctl() commands recognized by the device are: | 31 | The ioctl() commands recognized by the device are: |
28 | 32 | ||
29 | SNAPSHOT_FREEZE - freeze user space processes (the current process is | 33 | SNAPSHOT_FREEZE - freeze user space processes (the current process is |
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt index aae8355d3166..221f38be98f4 100644 --- a/Documentation/rbtree.txt +++ b/Documentation/rbtree.txt | |||
@@ -190,3 +190,61 @@ Example: | |||
190 | for (node = rb_first(&mytree); node; node = rb_next(node)) | 190 | for (node = rb_first(&mytree); node; node = rb_next(node)) |
191 | printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); | 191 | printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); |
192 | 192 | ||
193 | Support for Augmented rbtrees | ||
194 | ----------------------------- | ||
195 | |||
196 | Augmented rbtree is an rbtree with "some" additional data stored in each node. | ||
197 | This data can be used to add some new functionality to the rbtree. | ||
198 | Augmented rbtree is an optional feature built on top of basic rbtree | ||
199 | infrastructure. An rbtree user who wants this feature initializes an | ||
200 | augment callback function in rb_root. | ||
201 | |||
202 | This callback function will be called from rbtree core routines whenever | ||
203 | a node has a change in one or both of its children. It is the responsibility | ||
204 | of the callback function to recalculate the additional data that is in the | ||
205 | rb node using new children information. Note that if this new additional | ||
206 | data affects the parent node's additional data, then callback function has | ||
207 | to handle it and do the recursive updates. | ||
208 | |||
209 | |||
210 | Interval tree is an example of augmented rb tree. Reference - | ||
211 | "Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. | ||
212 | More details about interval trees: | ||
213 | |||
214 | Classical rbtree has a single key and it cannot be directly used to store | ||
215 | interval ranges like [lo:hi] and do a quick lookup for any overlap with a new | ||
216 | lo:hi or to find whether there is an exact match for a new lo:hi. | ||
217 | |||
218 | However, rbtree can be augmented to store such interval ranges in a structured | ||
219 | way making it possible to do efficient lookup and exact match. | ||
220 | |||
221 | This "extra information" stored in each node is the maximum hi | ||
222 | (max_hi) value among all the nodes that are its descendants. This | ||
223 | information can be maintained at each node just by looking at the node | ||
224 | and its immediate children. And this will be used in O(log n) lookup | ||
225 | for lowest match (lowest start address among all possible matches) | ||
226 | with something like: | ||
227 | |||
228 | find_lowest_match(lo, hi, node) | ||
229 | { | ||
230 | lowest_match = NULL; | ||
231 | while (node) { | ||
232 | if (max_hi(node->left) > lo) { | ||
233 | // Lowest overlap if any must be on left side | ||
234 | node = node->left; | ||
235 | } else if (overlap(lo, hi, node)) { | ||
236 | lowest_match = node; | ||
237 | break; | ||
238 | } else if (lo > node->lo) { | ||
239 | // Lowest overlap if any must be on right side | ||
240 | node = node->right; | ||
241 | } else { | ||
242 | break; | ||
243 | } | ||
244 | } | ||
245 | return lowest_match; | ||
246 | } | ||
247 | |||
248 | Finding exact match will be to first find lowest match and then to follow | ||
249 | successor nodes looking for exact match, until the start of a node is beyond | ||
250 | the hi value we are looking for. | ||
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index 6f33593e59e2..8239ebbcddce 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt | |||
@@ -211,7 +211,7 @@ provide fair CPU time to each such task group. For example, it may be | |||
211 | desirable to first provide fair CPU time to each user on the system and then to | 211 | desirable to first provide fair CPU time to each user on the system and then to |
212 | each task belonging to a user. | 212 | each task belonging to a user. |
213 | 213 | ||
214 | CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be | 214 | CONFIG_CGROUP_SCHED strives to achieve exactly that. It lets tasks to be |
215 | grouped and divides CPU time fairly among such groups. | 215 | grouped and divides CPU time fairly among such groups. |
216 | 216 | ||
217 | CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and | 217 | CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and |
@@ -220,38 +220,11 @@ SCHED_RR) tasks. | |||
220 | CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and | 220 | CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and |
221 | SCHED_BATCH) tasks. | 221 | SCHED_BATCH) tasks. |
222 | 222 | ||
223 | At present, there are two (mutually exclusive) mechanisms to group tasks for | 223 | These options need CONFIG_CGROUPS to be defined, and let the administrator |
224 | CPU bandwidth control purposes: | ||
225 | |||
226 | - Based on user id (CONFIG_USER_SCHED) | ||
227 | |||
228 | With this option, tasks are grouped according to their user id. | ||
229 | |||
230 | - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED) | ||
231 | |||
232 | This options needs CONFIG_CGROUPS to be defined, and lets the administrator | ||
233 | create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See | 224 | create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See |
234 | Documentation/cgroups/cgroups.txt for more information about this filesystem. | 225 | Documentation/cgroups/cgroups.txt for more information about this filesystem. |
235 | 226 | ||
236 | Only one of these options to group tasks can be chosen and not both. | 227 | When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each |
237 | |||
238 | When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new | ||
239 | user and a "cpu_share" file is added in that directory. | ||
240 | |||
241 | # cd /sys/kernel/uids | ||
242 | # cat 512/cpu_share # Display user 512's CPU share | ||
243 | 1024 | ||
244 | # echo 2048 > 512/cpu_share # Modify user 512's CPU share | ||
245 | # cat 512/cpu_share # Display user 512's CPU share | ||
246 | 2048 | ||
247 | # | ||
248 | |||
249 | CPU bandwidth between two users is divided in the ratio of their CPU shares. | ||
250 | For example: if you would like user "root" to get twice the bandwidth of user | ||
251 | "guest," then set the cpu_share for both the users such that "root"'s cpu_share | ||
252 | is twice "guest"'s cpu_share. | ||
253 | |||
254 | When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each | ||
255 | group created using the pseudo filesystem. See example steps below to create | 228 | group created using the pseudo filesystem. See example steps below to create |
256 | task groups and modify their CPU share using the "cgroups" pseudo filesystem. | 229 | task groups and modify their CPU share using the "cgroups" pseudo filesystem. |
257 | 230 | ||
@@ -273,24 +246,3 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem. | |||
273 | 246 | ||
274 | # #Launch gmplayer (or your favourite movie player) | 247 | # #Launch gmplayer (or your favourite movie player) |
275 | # echo <movie_player_pid> > multimedia/tasks | 248 | # echo <movie_player_pid> > multimedia/tasks |
276 | |||
277 | 8. Implementation note: user namespaces | ||
278 | |||
279 | User namespaces are intended to be hierarchical. But they are currently | ||
280 | only partially implemented. Each of those has ramifications for CFS. | ||
281 | |||
282 | First, since user namespaces are hierarchical, the /sys/kernel/uids | ||
283 | presentation is inadequate. Eventually we will likely want to use sysfs | ||
284 | tagging to provide private views of /sys/kernel/uids within each user | ||
285 | namespace. | ||
286 | |||
287 | Second, the hierarchical nature is intended to support completely | ||
288 | unprivileged use of user namespaces. So if using user groups, then | ||
289 | we want the users in a user namespace to be children of the user | ||
290 | who created it. | ||
291 | |||
292 | That is currently unimplemented. So instead, every user in a new | ||
293 | user namespace will receive 1024 shares just like any user in the | ||
294 | initial user namespace. Note that at the moment creation of a new | ||
295 | user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and | ||
296 | CAP_SETGID. | ||
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 86eabe6c3419..605b0d40329d 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt | |||
@@ -126,23 +126,12 @@ priority! | |||
126 | 2.3 Basis for grouping tasks | 126 | 2.3 Basis for grouping tasks |
127 | ---------------------------- | 127 | ---------------------------- |
128 | 128 | ||
129 | There are two compile-time settings for allocating CPU bandwidth. These are | 129 | Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real |
130 | configured using the "Basis for grouping tasks" multiple choice menu under | 130 | CPU bandwidth to task groups. |
131 | General setup > Group CPU Scheduler: | ||
132 | |||
133 | a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" = "user id") | ||
134 | |||
135 | This lets you use the virtual files under | ||
136 | "/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control he CPU time reserved for | ||
137 | each user . | ||
138 | |||
139 | The other option is: | ||
140 | |||
141 | .o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups") | ||
142 | 131 | ||
143 | This uses the /cgroup virtual file system and | 132 | This uses the /cgroup virtual file system and |
144 | "/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each | 133 | "/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each |
145 | control group instead. | 134 | control group. |
146 | 135 | ||
147 | For more information on working with control groups, you should read | 136 | For more information on working with control groups, you should read |
148 | Documentation/cgroups/cgroups.txt as well. | 137 | Documentation/cgroups/cgroups.txt as well. |
@@ -161,8 +150,7 @@ For now, this can be simplified to just the following (but see Future plans): | |||
161 | =============== | 150 | =============== |
162 | 151 | ||
163 | There is work in progress to make the scheduling period for each group | 152 | There is work in progress to make the scheduling period for each group |
164 | ("/sys/kernel/uids/<uid>/cpu_rt_period_us" or | 153 | ("/cgroup/<cgroup>/cpu.rt_period_us") configurable as well. |
165 | "/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well. | ||
166 | 154 | ||
167 | The constraint on the period is that a subgroup must have a smaller or | 155 | The constraint on the period is that a subgroup must have a smaller or |
168 | equal period to its parent. But realistically it's not very useful _yet_ | 156 | equal period to its parent. But realistically it's not very useful _yet_ |
diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c index 10abd3773e49..16feda901469 100644 --- a/Documentation/spi/spidev_test.c +++ b/Documentation/spi/spidev_test.c | |||
@@ -58,7 +58,7 @@ static void transfer(int fd) | |||
58 | }; | 58 | }; |
59 | 59 | ||
60 | ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr); | 60 | ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr); |
61 | if (ret == 1) | 61 | if (ret < 1) |
62 | pabort("can't send spi message"); | 62 | pabort("can't send spi message"); |
63 | 63 | ||
64 | for (ret = 0; ret < ARRAY_SIZE(tx); ret++) { | 64 | for (ret = 0; ret < ARRAY_SIZE(tx); ret++) { |
diff --git a/Documentation/stable_kernel_rules.txt b/Documentation/stable_kernel_rules.txt index 5effa5bd993b..e213f45cf9d7 100644 --- a/Documentation/stable_kernel_rules.txt +++ b/Documentation/stable_kernel_rules.txt | |||
@@ -18,16 +18,15 @@ Rules on what kind of patches are accepted, and which ones are not, into the | |||
18 | - It cannot contain any "trivial" fixes in it (spelling changes, | 18 | - It cannot contain any "trivial" fixes in it (spelling changes, |
19 | whitespace cleanups, etc). | 19 | whitespace cleanups, etc). |
20 | - It must follow the Documentation/SubmittingPatches rules. | 20 | - It must follow the Documentation/SubmittingPatches rules. |
21 | - It or an equivalent fix must already exist in Linus' tree. Quote the | 21 | - It or an equivalent fix must already exist in Linus' tree (upstream). |
22 | respective commit ID in Linus' tree in your patch submission to -stable. | ||
23 | 22 | ||
24 | 23 | ||
25 | Procedure for submitting patches to the -stable tree: | 24 | Procedure for submitting patches to the -stable tree: |
26 | 25 | ||
27 | - Send the patch, after verifying that it follows the above rules, to | 26 | - Send the patch, after verifying that it follows the above rules, to |
28 | stable@kernel.org. | 27 | stable@kernel.org. You must note the upstream commit ID in the changelog |
29 | - To have the patch automatically included in the stable tree, add the | 28 | of your submission. |
30 | the tag | 29 | - To have the patch automatically included in the stable tree, add the tag |
31 | Cc: stable@kernel.org | 30 | Cc: stable@kernel.org |
32 | in the sign-off area. Once the patch is merged it will be applied to | 31 | in the sign-off area. Once the patch is merged it will be applied to |
33 | the stable tree without anything else needing to be done by the author | 32 | the stable tree without anything else needing to be done by the author |
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index b22000dbc57d..09bd8e902989 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt | |||
@@ -90,7 +90,8 @@ In order to facilitate early boot debugging, use boot option: | |||
90 | 90 | ||
91 | trace_event=[event-list] | 91 | trace_event=[event-list] |
92 | 92 | ||
93 | The format of this boot option is the same as described in section 2.1. | 93 | event-list is a comma separated list of events. See section 2.1 for event |
94 | format. | ||
94 | 95 | ||
95 | 3. Defining an event-enabled tracepoint | 96 | 3. Defining an event-enabled tracepoint |
96 | ======================================= | 97 | ======================================= |
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 03485bfbd797..557c1edeccaf 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt | |||
@@ -155,6 +155,9 @@ of ftrace. Here is a list of some of the key files: | |||
155 | to be traced. Echoing names of functions into this file | 155 | to be traced. Echoing names of functions into this file |
156 | will limit the trace to only those functions. | 156 | will limit the trace to only those functions. |
157 | 157 | ||
158 | This interface also allows for commands to be used. See the | ||
159 | "Filter commands" section for more details. | ||
160 | |||
158 | set_ftrace_notrace: | 161 | set_ftrace_notrace: |
159 | 162 | ||
160 | This has an effect opposite to that of | 163 | This has an effect opposite to that of |
@@ -1337,12 +1340,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one | |||
1337 | can either use the sysctl function or set it via the proc system | 1340 | can either use the sysctl function or set it via the proc system |
1338 | interface. | 1341 | interface. |
1339 | 1342 | ||
1340 | sysctl kernel.ftrace_dump_on_oops=1 | 1343 | sysctl kernel.ftrace_dump_on_oops=n |
1341 | 1344 | ||
1342 | or | 1345 | or |
1343 | 1346 | ||
1344 | echo 1 > /proc/sys/kernel/ftrace_dump_on_oops | 1347 | echo n > /proc/sys/kernel/ftrace_dump_on_oops |
1345 | 1348 | ||
1349 | If n = 1, ftrace will dump the buffers of all CPUs; if n = 2, ftrace will | ||
1350 | only dump the buffer of the CPU that triggered the oops. | ||
1346 | 1351 | ||
1347 | Here's an example of such a dump after a null pointer | 1352 | Here's an example of such a dump after a null pointer |
1348 | dereference in a kernel module: | 1353 | dereference in a kernel module: |
@@ -1822,6 +1827,47 @@ this special filter via: | |||
1822 | echo > set_graph_function | 1827 | echo > set_graph_function |
1823 | 1828 | ||
1824 | 1829 | ||
1830 | Filter commands | ||
1831 | --------------- | ||
1832 | |||
1833 | A few commands are supported by the set_ftrace_filter interface. | ||
1834 | Trace commands have the following format: | ||
1835 | |||
1836 | <function>:<command>:<parameter> | ||
1837 | |||
1838 | The following commands are supported: | ||
1839 | |||
1840 | - mod | ||
1841 | This command enables function filtering per module. The | ||
1842 | parameter defines the module. For example, if only the write* | ||
1843 | functions in the ext3 module are desired, run: | ||
1844 | |||
1845 | echo 'write*:mod:ext3' > set_ftrace_filter | ||
1846 | |||
1847 | This command interacts with the filter in the same way as | ||
1848 | filtering based on function names. Thus, adding more functions | ||
1849 | in a different module is accomplished by appending (>>) to the | ||
1850 | filter file. Remove specific module functions by prepending | ||
1851 | '!': | ||
1852 | |||
1853 | echo '!writeback*:mod:ext3' >> set_ftrace_filter | ||
1854 | |||
1855 | - traceon/traceoff | ||
1856 | These commands turn tracing on and off when the specified | ||
1857 | functions are hit. The parameter determines how many times the | ||
1858 | tracing system is turned on and off. If unspecified, there is | ||
1859 | no limit. For example, to disable tracing when a schedule bug | ||
1860 | is hit the first 5 times, run: | ||
1861 | |||
1862 | echo '__schedule_bug:traceoff:5' > set_ftrace_filter | ||
1863 | |||
1864 | These commands are cumulative whether or not they are appended | ||
1865 | to set_ftrace_filter. To remove a command, prepend it with '!' | ||
1866 | and drop the parameter: | ||
1867 | |||
1868 | echo '!__schedule_bug:traceoff' > set_ftrace_filter | ||
1869 | |||
1870 | |||
1825 | trace_pipe | 1871 | trace_pipe |
1826 | ---------- | 1872 | ---------- |
1827 | 1873 | ||
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index a9100b28eb84..ec94748ae65b 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt | |||
@@ -40,7 +40,9 @@ Synopsis of kprobe_events | |||
40 | $stack : Fetch stack address. | 40 | $stack : Fetch stack address. |
41 | $retval : Fetch return value.(*) | 41 | $retval : Fetch return value.(*) |
42 | +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) | 42 | +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) |
43 | NAME=FETCHARG: Set NAME as the argument name of FETCHARG. | 43 | NAME=FETCHARG : Set NAME as the argument name of FETCHARG. |
44 | FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types | ||
45 | (u8/u16/u32/u64/s8/s16/s32/s64) are supported. | ||
44 | 46 | ||
45 | (*) only for return probe. | 47 | (*) only for return probe. |
46 | (**) this is useful for fetching a field of data structures. | 48 | (**) this is useful for fetching a field of data structures. |