diff options
Diffstat (limited to 'Documentation')
23 files changed, 1577 insertions, 101 deletions
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index 9f711d2df91b..d2b85237c76e 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt | |||
@@ -743,3 +743,80 @@ Revised: | |||
743 | RCU, realtime RCU, sleepable RCU, performance. | 743 | RCU, realtime RCU, sleepable RCU, performance. |
744 | " | 744 | " |
745 | } | 745 | } |
746 | |||
747 | @article{PaulEMcKenney2008RCUOSR | ||
748 | ,author="Paul E. McKenney and Jonathan Walpole" | ||
749 | ,title="Introducing technology into the {Linux} kernel: a case study" | ||
750 | ,Year="2008" | ||
751 | ,journal="SIGOPS Oper. Syst. Rev." | ||
752 | ,volume="42" | ||
753 | ,number="5" | ||
754 | ,pages="4--17" | ||
755 | ,issn="0163-5980" | ||
756 | ,doi={http://doi.acm.org/10.1145/1400097.1400099} | ||
757 | ,publisher="ACM" | ||
758 | ,address="New York, NY, USA" | ||
759 | ,annotation={ | ||
760 | Linux changed RCU to a far greater degree than RCU has changed Linux. | ||
761 | } | ||
762 | } | ||
763 | |||
764 | @unpublished{PaulEMcKenney2008HierarchicalRCU | ||
765 | ,Author="Paul E. McKenney" | ||
766 | ,Title="Hierarchical {RCU}" | ||
767 | ,month="November" | ||
768 | ,day="3" | ||
769 | ,year="2008" | ||
770 | ,note="Available: | ||
771 | \url{http://lwn.net/Articles/305782/} | ||
772 | [Viewed November 6, 2008]" | ||
773 | ,annotation=" | ||
774 | RCU with combining-tree-based grace-period detection, | ||
775 | permitting it to handle thousands of CPUs. | ||
776 | " | ||
777 | } | ||
778 | |||
779 | @conference{PaulEMcKenney2009MaliciousURCU | ||
780 | ,Author="Paul E. McKenney" | ||
781 | ,Title="Using a Malicious User-Level {RCU} to Torture {RCU}-Based Algorithms" | ||
782 | ,Booktitle="linux.conf.au 2009" | ||
783 | ,month="January" | ||
784 | ,year="2009" | ||
785 | ,address="Hobart, Australia" | ||
786 | ,note="Available: | ||
787 | \url{http://www.rdrop.com/users/paulmck/RCU/urcutorture.2009.01.22a.pdf} | ||
788 | [Viewed February 2, 2009]" | ||
789 | ,annotation=" | ||
790 | Realtime RCU and torture-testing RCU uses. | ||
791 | " | ||
792 | } | ||
793 | |||
794 | @unpublished{MathieuDesnoyers2009URCU | ||
795 | ,Author="Mathieu Desnoyers" | ||
796 | ,Title="[{RFC} git tree] Userspace {RCU} (urcu) for {Linux}" | ||
797 | ,month="February" | ||
798 | ,day="5" | ||
799 | ,year="2009" | ||
800 | ,note="Available: | ||
801 | \url{http://lkml.org/lkml/2009/2/5/572} | ||
802 | \url{git://lttng.org/userspace-rcu.git} | ||
803 | [Viewed February 20, 2009]" | ||
804 | ,annotation=" | ||
805 | Mathieu Desnoyers's user-space RCU implementation. | ||
806 | git://lttng.org/userspace-rcu.git | ||
807 | " | ||
808 | } | ||
809 | |||
810 | @unpublished{PaulEMcKenney2009BloatWatchRCU | ||
811 | ,Author="Paul E. McKenney" | ||
812 | ,Title="{RCU}: The {Bloatwatch} Edition" | ||
813 | ,month="March" | ||
814 | ,day="17" | ||
815 | ,year="2009" | ||
816 | ,note="Available: | ||
817 | \url{http://lwn.net/Articles/323929/} | ||
818 | [Viewed March 20, 2009]" | ||
819 | ,annotation=" | ||
820 | Uniprocessor assumptions allow simplified RCU implementation. | ||
821 | " | ||
822 | } | ||
diff --git a/Documentation/RCU/UP.txt b/Documentation/RCU/UP.txt index aab4a9ec3931..90ec5341ee98 100644 --- a/Documentation/RCU/UP.txt +++ b/Documentation/RCU/UP.txt | |||
@@ -2,14 +2,13 @@ RCU on Uniprocessor Systems | |||
2 | 2 | ||
3 | 3 | ||
4 | A common misconception is that, on UP systems, the call_rcu() primitive | 4 | A common misconception is that, on UP systems, the call_rcu() primitive |
5 | may immediately invoke its function, and that the synchronize_rcu() | 5 | may immediately invoke its function. The basis of this misconception |
6 | primitive may return immediately. The basis of this misconception | ||
7 | is that since there is only one CPU, it should not be necessary to | 6 | is that since there is only one CPU, it should not be necessary to |
8 | wait for anything else to get done, since there are no other CPUs for | 7 | wait for anything else to get done, since there are no other CPUs for |
9 | anything else to be happening on. Although this approach will -sort- -of- | 8 | anything else to be happening on. Although this approach will -sort- -of- |
10 | work a surprising amount of the time, it is a very bad idea in general. | 9 | work a surprising amount of the time, it is a very bad idea in general. |
11 | This document presents three examples that demonstrate exactly how bad an | 10 | This document presents three examples that demonstrate exactly how bad |
12 | idea this is. | 11 | an idea this is. |
13 | 12 | ||
14 | 13 | ||
15 | Example 1: softirq Suicide | 14 | Example 1: softirq Suicide |
@@ -82,11 +81,18 @@ Quick Quiz #2: What locking restriction must RCU callbacks respect? | |||
82 | 81 | ||
83 | Summary | 82 | Summary |
84 | 83 | ||
85 | Permitting call_rcu() to immediately invoke its arguments or permitting | 84 | Permitting call_rcu() to immediately invoke its arguments breaks RCU, |
86 | synchronize_rcu() to immediately return breaks RCU, even on a UP system. | 85 | even on a UP system. So do not do it! Even on a UP system, the RCU |
87 | So do not do it! Even on a UP system, the RCU infrastructure -must- | 86 | infrastructure -must- respect grace periods, and -must- invoke callbacks |
88 | respect grace periods, and -must- invoke callbacks from a known environment | 87 | from a known environment in which no locks are held. |
89 | in which no locks are held. | 88 | |
89 | It -is- safe for synchronize_sched() and synchronize_rcu_bh() to return | ||
90 | immediately on an UP system. It is also safe for synchronize_rcu() | ||
91 | to return immediately on UP systems, except when running preemptable | ||
92 | RCU. | ||
93 | |||
94 | Quick Quiz #3: Why can't synchronize_rcu() return immediately on | ||
95 | UP systems running preemptable RCU? | ||
90 | 96 | ||
91 | 97 | ||
92 | Answer to Quick Quiz #1: | 98 | Answer to Quick Quiz #1: |
@@ -117,3 +123,13 @@ Answer to Quick Quiz #2: | |||
117 | callbacks acquire locks directly. However, a great many RCU | 123 | callbacks acquire locks directly. However, a great many RCU |
118 | callbacks do acquire locks -indirectly-, for example, via | 124 | callbacks do acquire locks -indirectly-, for example, via |
119 | the kfree() primitive. | 125 | the kfree() primitive. |
126 | |||
127 | Answer to Quick Quiz #3: | ||
128 | Why can't synchronize_rcu() return immediately on UP systems | ||
129 | running preemptable RCU? | ||
130 | |||
131 | Because some other task might have been preempted in the middle | ||
132 | of an RCU read-side critical section. If synchronize_rcu() | ||
133 | simply immediately returned, it would prematurely signal the | ||
134 | end of the grace period, which would come as a nasty shock to | ||
135 | that other thread when it started running again. | ||
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index accfe2f5247d..51525a30e8b4 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt | |||
@@ -11,7 +11,10 @@ over a rather long period of time, but improvements are always welcome! | |||
11 | structure is updated more than about 10% of the time, then | 11 | structure is updated more than about 10% of the time, then |
12 | you should strongly consider some other approach, unless | 12 | you should strongly consider some other approach, unless |
13 | detailed performance measurements show that RCU is nonetheless | 13 | detailed performance measurements show that RCU is nonetheless |
14 | the right tool for the job. | 14 | the right tool for the job. Yes, you might think of RCU |
15 | as simply cutting overhead off of the readers and imposing it | ||
16 | on the writers. That is exactly why normal uses of RCU will | ||
17 | do much more reading than updating. | ||
15 | 18 | ||
16 | Another exception is where performance is not an issue, and RCU | 19 | Another exception is where performance is not an issue, and RCU |
17 | provides a simpler implementation. An example of this situation | 20 | provides a simpler implementation. An example of this situation |
@@ -240,10 +243,11 @@ over a rather long period of time, but improvements are always welcome! | |||
240 | instead need to use synchronize_irq() or synchronize_sched(). | 243 | instead need to use synchronize_irq() or synchronize_sched(). |
241 | 244 | ||
242 | 12. Any lock acquired by an RCU callback must be acquired elsewhere | 245 | 12. Any lock acquired by an RCU callback must be acquired elsewhere |
243 | with irq disabled, e.g., via spin_lock_irqsave(). Failing to | 246 | with softirq disabled, e.g., via spin_lock_irqsave(), |
244 | disable irq on a given acquisition of that lock will result in | 247 | spin_lock_bh(), etc. Failing to disable irq on a given |
245 | deadlock as soon as the RCU callback happens to interrupt that | 248 | acquisition of that lock will result in deadlock as soon as the |
246 | acquisition's critical section. | 249 | RCU callback happens to interrupt that acquisition's critical |
250 | section. | ||
247 | 251 | ||
248 | 13. RCU callbacks can be and are executed in parallel. In many cases, | 252 | 13. RCU callbacks can be and are executed in parallel. In many cases, |
249 | the callback code simply wrappers around kfree(), so that this | 253 | the callback code simply wrappers around kfree(), so that this |
@@ -310,3 +314,9 @@ over a rather long period of time, but improvements are always welcome! | |||
310 | Because these primitives only wait for pre-existing readers, | 314 | Because these primitives only wait for pre-existing readers, |
311 | it is the caller's responsibility to guarantee safety to | 315 | it is the caller's responsibility to guarantee safety to |
312 | any subsequent readers. | 316 | any subsequent readers. |
317 | |||
318 | 16. The various RCU read-side primitives do -not- contain memory | ||
319 | barriers. The CPU (and in some cases, the compiler) is free | ||
320 | to reorder code into and out of RCU read-side critical sections. | ||
321 | It is the responsibility of the RCU update-side primitives to | ||
322 | deal with this. | ||
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt index 7aa2002ade77..2a23523ce471 100644 --- a/Documentation/RCU/rcu.txt +++ b/Documentation/RCU/rcu.txt | |||
@@ -36,7 +36,7 @@ o How can the updater tell when a grace period has completed | |||
36 | executed in user mode, or executed in the idle loop, we can | 36 | executed in user mode, or executed in the idle loop, we can |
37 | safely free up that item. | 37 | safely free up that item. |
38 | 38 | ||
39 | Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the | 39 | Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the |
40 | same effect, but require that the readers manipulate CPU-local | 40 | same effect, but require that the readers manipulate CPU-local |
41 | counters. These counters allow limited types of blocking | 41 | counters. These counters allow limited types of blocking |
42 | within RCU read-side critical sections. SRCU also uses | 42 | within RCU read-side critical sections. SRCU also uses |
@@ -79,10 +79,10 @@ o I hear that RCU is patented? What is with that? | |||
79 | o I hear that RCU needs work in order to support realtime kernels? | 79 | o I hear that RCU needs work in order to support realtime kernels? |
80 | 80 | ||
81 | This work is largely completed. Realtime-friendly RCU can be | 81 | This work is largely completed. Realtime-friendly RCU can be |
82 | enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter. | 82 | enabled via the CONFIG_TREE_PREEMPT_RCU kernel configuration |
83 | However, work is in progress for enabling priority boosting of | 83 | parameter. However, work is in progress for enabling priority |
84 | preempted RCU read-side critical sections. This is needed if you | 84 | boosting of preempted RCU read-side critical sections. This is |
85 | have CPU-bound realtime threads. | 85 | needed if you have CPU-bound realtime threads. |
86 | 86 | ||
87 | o Where can I find more information on RCU? | 87 | o Where can I find more information on RCU? |
88 | 88 | ||
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt index 909602d409bb..e439a0edee22 100644 --- a/Documentation/RCU/rcubarrier.txt +++ b/Documentation/RCU/rcubarrier.txt | |||
@@ -170,6 +170,13 @@ module invokes call_rcu() from timers, you will need to first cancel all | |||
170 | the timers, and only then invoke rcu_barrier() to wait for any remaining | 170 | the timers, and only then invoke rcu_barrier() to wait for any remaining |
171 | RCU callbacks to complete. | 171 | RCU callbacks to complete. |
172 | 172 | ||
173 | Of course, if you module uses call_rcu_bh(), you will need to invoke | ||
174 | rcu_barrier_bh() before unloading. Similarly, if your module uses | ||
175 | call_rcu_sched(), you will need to invoke rcu_barrier_sched() before | ||
176 | unloading. If your module uses call_rcu(), call_rcu_bh(), -and- | ||
177 | call_rcu_sched(), then you will need to invoke each of rcu_barrier(), | ||
178 | rcu_barrier_bh(), and rcu_barrier_sched(). | ||
179 | |||
173 | 180 | ||
174 | Implementing rcu_barrier() | 181 | Implementing rcu_barrier() |
175 | 182 | ||
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index a342b6e1cc10..9dba3bb90e60 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt | |||
@@ -76,8 +76,10 @@ torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API, | |||
76 | "rcu_sync" for rcu_read_lock() with synchronous reclamation, | 76 | "rcu_sync" for rcu_read_lock() with synchronous reclamation, |
77 | "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for | 77 | "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for |
78 | rcu_read_lock_bh() with synchronous reclamation, "srcu" for | 78 | rcu_read_lock_bh() with synchronous reclamation, "srcu" for |
79 | the "srcu_read_lock()" API, and "sched" for the use of | 79 | the "srcu_read_lock()" API, "sched" for the use of |
80 | preempt_disable() together with synchronize_sched(). | 80 | preempt_disable() together with synchronize_sched(), |
81 | and "sched_expedited" for the use of preempt_disable() | ||
82 | with synchronize_sched_expedited(). | ||
81 | 83 | ||
82 | verbose Enable debug printk()s. Default is disabled. | 84 | verbose Enable debug printk()s. Default is disabled. |
83 | 85 | ||
@@ -162,6 +164,23 @@ of the "old" and "current" counters for the corresponding CPU. The | |||
162 | "idx" value maps the "old" and "current" values to the underlying array, | 164 | "idx" value maps the "old" and "current" values to the underlying array, |
163 | and is useful for debugging. | 165 | and is useful for debugging. |
164 | 166 | ||
167 | Similarly, sched_expedited RCU provides the following: | ||
168 | |||
169 | sched_expedited-torture: rtc: d0000000016c1880 ver: 1090796 tfle: 0 rta: 1090796 rtaf: 0 rtf: 1090787 rtmbe: 0 nt: 27713319 | ||
170 | sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 | ||
171 | sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 | ||
172 | sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 | ||
173 | state: -1 / 0:0 3:0 4:0 | ||
174 | |||
175 | As before, the first four lines are similar to those for RCU. | ||
176 | The last line shows the task-migration state. The first number is | ||
177 | -1 if synchronize_sched_expedited() is idle, -2 if in the process of | ||
178 | posting wakeups to the migration kthreads, and N when waiting on CPU N. | ||
179 | Each of the colon-separated fields following the "/" is a CPU:state pair. | ||
180 | Valid states are "0" for idle, "1" for waiting for quiescent state, | ||
181 | "2" for passed through quiescent state, and "3" when a race with a | ||
182 | CPU-hotplug event forces use of the synchronize_sched() primitive. | ||
183 | |||
165 | 184 | ||
166 | USAGE | 185 | USAGE |
167 | 186 | ||
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 02cced183b2d..187bbf10c923 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt | |||
@@ -191,8 +191,7 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy). | |||
191 | 191 | ||
192 | The output of "cat rcu/rcudata" looks as follows: | 192 | The output of "cat rcu/rcudata" looks as follows: |
193 | 193 | ||
194 | rcu: | 194 | rcu_sched: |
195 | rcu: | ||
196 | 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 | 195 | 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 |
197 | 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 | 196 | 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 |
198 | 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 | 197 | 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 |
@@ -306,7 +305,7 @@ comma-separated-variable spreadsheet format. | |||
306 | 305 | ||
307 | The output of "cat rcu/rcugp" looks as follows: | 306 | The output of "cat rcu/rcugp" looks as follows: |
308 | 307 | ||
309 | rcu: completed=33062 gpnum=33063 | 308 | rcu_sched: completed=33062 gpnum=33063 |
310 | rcu_bh: completed=464 gpnum=464 | 309 | rcu_bh: completed=464 gpnum=464 |
311 | 310 | ||
312 | Again, this output is for both "rcu" and "rcu_bh". The fields are | 311 | Again, this output is for both "rcu" and "rcu_bh". The fields are |
@@ -413,7 +412,7 @@ o Each element of the form "1/1 0:127 ^0" represents one struct | |||
413 | 412 | ||
414 | The output of "cat rcu/rcu_pending" looks as follows: | 413 | The output of "cat rcu/rcu_pending" looks as follows: |
415 | 414 | ||
416 | rcu: | 415 | rcu_sched: |
417 | 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 | 416 | 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 |
418 | 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 | 417 | 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 |
419 | 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 | 418 | 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 |
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 96170824a717..e41a7fecf0d3 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt | |||
@@ -136,10 +136,10 @@ rcu_read_lock() | |||
136 | Used by a reader to inform the reclaimer that the reader is | 136 | Used by a reader to inform the reclaimer that the reader is |
137 | entering an RCU read-side critical section. It is illegal | 137 | entering an RCU read-side critical section. It is illegal |
138 | to block while in an RCU read-side critical section, though | 138 | to block while in an RCU read-side critical section, though |
139 | kernels built with CONFIG_PREEMPT_RCU can preempt RCU read-side | 139 | kernels built with CONFIG_TREE_PREEMPT_RCU can preempt RCU |
140 | critical sections. Any RCU-protected data structure accessed | 140 | read-side critical sections. Any RCU-protected data structure |
141 | during an RCU read-side critical section is guaranteed to remain | 141 | accessed during an RCU read-side critical section is guaranteed to |
142 | unreclaimed for the full duration of that critical section. | 142 | remain unreclaimed for the full duration of that critical section. |
143 | Reference counts may be used in conjunction with RCU to maintain | 143 | Reference counts may be used in conjunction with RCU to maintain |
144 | longer-term references to data structures. | 144 | longer-term references to data structures. |
145 | 145 | ||
@@ -785,6 +785,7 @@ RCU pointer/list traversal: | |||
785 | rcu_dereference | 785 | rcu_dereference |
786 | list_for_each_entry_rcu | 786 | list_for_each_entry_rcu |
787 | hlist_for_each_entry_rcu | 787 | hlist_for_each_entry_rcu |
788 | hlist_nulls_for_each_entry_rcu | ||
788 | 789 | ||
789 | list_for_each_continue_rcu (to be deprecated in favor of new | 790 | list_for_each_continue_rcu (to be deprecated in favor of new |
790 | list_for_each_entry_continue_rcu) | 791 | list_for_each_entry_continue_rcu) |
@@ -807,19 +808,23 @@ RCU: Critical sections Grace period Barrier | |||
807 | 808 | ||
808 | rcu_read_lock synchronize_net rcu_barrier | 809 | rcu_read_lock synchronize_net rcu_barrier |
809 | rcu_read_unlock synchronize_rcu | 810 | rcu_read_unlock synchronize_rcu |
811 | synchronize_rcu_expedited | ||
810 | call_rcu | 812 | call_rcu |
811 | 813 | ||
812 | 814 | ||
813 | bh: Critical sections Grace period Barrier | 815 | bh: Critical sections Grace period Barrier |
814 | 816 | ||
815 | rcu_read_lock_bh call_rcu_bh rcu_barrier_bh | 817 | rcu_read_lock_bh call_rcu_bh rcu_barrier_bh |
816 | rcu_read_unlock_bh | 818 | rcu_read_unlock_bh synchronize_rcu_bh |
819 | synchronize_rcu_bh_expedited | ||
817 | 820 | ||
818 | 821 | ||
819 | sched: Critical sections Grace period Barrier | 822 | sched: Critical sections Grace period Barrier |
820 | 823 | ||
821 | [preempt_disable] synchronize_sched rcu_barrier_sched | 824 | rcu_read_lock_sched synchronize_sched rcu_barrier_sched |
822 | [and friends] call_rcu_sched | 825 | rcu_read_unlock_sched call_rcu_sched |
826 | [preempt_disable] synchronize_sched_expedited | ||
827 | [and friends] | ||
823 | 828 | ||
824 | 829 | ||
825 | SRCU: Critical sections Grace period Barrier | 830 | SRCU: Critical sections Grace period Barrier |
@@ -827,6 +832,9 @@ SRCU: Critical sections Grace period Barrier | |||
827 | srcu_read_lock synchronize_srcu N/A | 832 | srcu_read_lock synchronize_srcu N/A |
828 | srcu_read_unlock | 833 | srcu_read_unlock |
829 | 834 | ||
835 | SRCU: Initialization/cleanup | ||
836 | init_srcu_struct | ||
837 | cleanup_srcu_struct | ||
830 | 838 | ||
831 | See the comment headers in the source code (or the docbook generated | 839 | See the comment headers in the source code (or the docbook generated |
832 | from them) for more information. | 840 | from them) for more information. |
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 09e031c55887..bb3a53cdfbc3 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -206,24 +206,6 @@ Who: Len Brown <len.brown@intel.com> | |||
206 | 206 | ||
207 | --------------------------- | 207 | --------------------------- |
208 | 208 | ||
209 | What: libata spindown skipping and warning | ||
210 | When: Dec 2008 | ||
211 | Why: Some halt(8) implementations synchronize caches for and spin | ||
212 | down libata disks because libata didn't use to spin down disk on | ||
213 | system halt (only synchronized caches). | ||
214 | Spin down on system halt is now implemented. sysfs node | ||
215 | /sys/class/scsi_disk/h:c:i:l/manage_start_stop is present if | ||
216 | spin down support is available. | ||
217 | Because issuing spin down command to an already spun down disk | ||
218 | makes some disks spin up just to spin down again, libata tracks | ||
219 | device spindown status to skip the extra spindown command and | ||
220 | warn about it. | ||
221 | This is to give userspace tools the time to get updated and will | ||
222 | be removed after userspace is reasonably updated. | ||
223 | Who: Tejun Heo <htejun@gmail.com> | ||
224 | |||
225 | --------------------------- | ||
226 | |||
227 | What: i386/x86_64 bzImage symlinks | 209 | What: i386/x86_64 bzImage symlinks |
228 | When: April 2010 | 210 | When: April 2010 |
229 | 211 | ||
@@ -394,15 +376,6 @@ Who: Thomas Gleixner <tglx@linutronix.de> | |||
394 | 376 | ||
395 | ----------------------------- | 377 | ----------------------------- |
396 | 378 | ||
397 | What: obsolete generic irq defines and typedefs | ||
398 | When: 2.6.30 | ||
399 | Why: The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t) | ||
400 | have been kept around for migration reasons. After more than two years | ||
401 | it's time to remove them finally | ||
402 | Who: Thomas Gleixner <tglx@linutronix.de> | ||
403 | |||
404 | --------------------------- | ||
405 | |||
406 | What: fakephp and associated sysfs files in /sys/bus/pci/slots/ | 379 | What: fakephp and associated sysfs files in /sys/bus/pci/slots/ |
407 | When: 2011 | 380 | When: 2011 |
408 | Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to | 381 | Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to |
@@ -468,3 +441,27 @@ Why: cpu_policy_rwsem has a new cleaner definition making it local to | |||
468 | cpufreq core and contained inside cpufreq.c. Other dependent | 441 | cpufreq core and contained inside cpufreq.c. Other dependent |
469 | drivers should not use it in order to safely avoid lockdep issues. | 442 | drivers should not use it in order to safely avoid lockdep issues. |
470 | Who: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> | 443 | Who: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> |
444 | |||
445 | ---------------------------- | ||
446 | |||
447 | What: sound-slot/service-* module aliases and related clutters in | ||
448 | sound/sound_core.c | ||
449 | When: August 2010 | ||
450 | Why: OSS sound_core grabs all legacy minors (0-255) of SOUND_MAJOR | ||
451 | (14) and requests modules using custom sound-slot/service-* | ||
452 | module aliases. The only benefit of doing this is allowing | ||
453 | use of custom module aliases which might as well be considered | ||
454 | a bug at this point. This preemptive claiming prevents | ||
455 | alternative OSS implementations. | ||
456 | |||
457 | Till the feature is removed, the kernel will be requesting | ||
458 | both sound-slot/service-* and the standard char-major-* module | ||
459 | aliases and allow turning off the pre-claiming selectively via | ||
460 | CONFIG_SOUND_OSS_CORE_PRECLAIM and soundcore.preclaim_oss | ||
461 | kernel parameter. | ||
462 | |||
463 | After the transition phase is complete, both the custom module | ||
464 | aliases and switches to disable it will go away. This removal | ||
465 | will also allow making ALSA OSS emulation independent of | ||
466 | sound_core. The dependency will be broken then too. | ||
467 | Who: Tejun Heo <tj@kernel.org> | ||
diff --git a/Documentation/filesystems/nfs.txt b/Documentation/filesystems/nfs.txt new file mode 100644 index 000000000000..f50f26ce6cd0 --- /dev/null +++ b/Documentation/filesystems/nfs.txt | |||
@@ -0,0 +1,98 @@ | |||
1 | |||
2 | The NFS client | ||
3 | ============== | ||
4 | |||
5 | The NFS version 2 protocol was first documented in RFC1094 (March 1989). | ||
6 | Since then two more major releases of NFS have been published, with NFSv3 | ||
7 | being documented in RFC1813 (June 1995), and NFSv4 in RFC3530 (April | ||
8 | 2003). | ||
9 | |||
10 | The Linux NFS client currently supports all the above published versions, | ||
11 | and work is in progress on adding support for minor version 1 of the NFSv4 | ||
12 | protocol. | ||
13 | |||
14 | The purpose of this document is to provide information on some of the | ||
15 | upcall interfaces that are used in order to provide the NFS client with | ||
16 | some of the information that it requires in order to fully comply with | ||
17 | the NFS spec. | ||
18 | |||
19 | The DNS resolver | ||
20 | ================ | ||
21 | |||
22 | NFSv4 allows for one server to refer the NFS client to data that has been | ||
23 | migrated onto another server by means of the special "fs_locations" | ||
24 | attribute. See | ||
25 | http://tools.ietf.org/html/rfc3530#section-6 | ||
26 | and | ||
27 | http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00 | ||
28 | |||
29 | The fs_locations information can take the form of either an ip address and | ||
30 | a path, or a DNS hostname and a path. The latter requires the NFS client to | ||
31 | do a DNS lookup in order to mount the new volume, and hence the need for an | ||
32 | upcall to allow userland to provide this service. | ||
33 | |||
34 | Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual | ||
35 | /var/lib/nfs/rpc_pipefs, the upcall consists of the following steps: | ||
36 | |||
37 | (1) The process checks the dns_resolve cache to see if it contains a | ||
38 | valid entry. If so, it returns that entry and exits. | ||
39 | |||
40 | (2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent' | ||
41 | (may be changed using the 'nfs.cache_getent' kernel boot parameter) | ||
42 | is run, with two arguments: | ||
43 | - the cache name, "dns_resolve" | ||
44 | - the hostname to resolve | ||
45 | |||
46 | (3) After looking up the corresponding ip address, the helper script | ||
47 | writes the result into the rpc_pipefs pseudo-file | ||
48 | '/var/lib/nfs/rpc_pipefs/cache/dns_resolve/channel' | ||
49 | in the following (text) format: | ||
50 | |||
51 | "<ip address> <hostname> <ttl>\n" | ||
52 | |||
53 | Where <ip address> is in the usual IPv4 (123.456.78.90) or IPv6 | ||
54 | (ffee:ddcc:bbaa:9988:7766:5544:3322:1100, ffee::1100, ...) format. | ||
55 | <hostname> is identical to the second argument of the helper | ||
56 | script, and <ttl> is the 'time to live' of this cache entry (in | ||
57 | units of seconds). | ||
58 | |||
59 | Note: If <ip address> is invalid, say the string "0", then a negative | ||
60 | entry is created, which will cause the kernel to treat the hostname | ||
61 | as having no valid DNS translation. | ||
62 | |||
63 | |||
64 | |||
65 | |||
66 | A basic sample /sbin/nfs_cache_getent | ||
67 | ===================================== | ||
68 | |||
69 | #!/bin/bash | ||
70 | # | ||
71 | ttl=600 | ||
72 | # | ||
73 | cut=/usr/bin/cut | ||
74 | getent=/usr/bin/getent | ||
75 | rpc_pipefs=/var/lib/nfs/rpc_pipefs | ||
76 | # | ||
77 | die() | ||
78 | { | ||
79 | echo "Usage: $0 cache_name entry_name" | ||
80 | exit 1 | ||
81 | } | ||
82 | |||
83 | [ $# -lt 2 ] && die | ||
84 | cachename="$1" | ||
85 | cache_path=${rpc_pipefs}/cache/${cachename}/channel | ||
86 | |||
87 | case "${cachename}" in | ||
88 | dns_resolve) | ||
89 | name="$2" | ||
90 | result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )" | ||
91 | [ -z "${result}" ] && result="0" | ||
92 | ;; | ||
93 | *) | ||
94 | die | ||
95 | ;; | ||
96 | esac | ||
97 | echo "${result} ${name} ${ttl}" >${cache_path} | ||
98 | |||
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index dbea4f95fc85..1c058b552e93 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt | |||
@@ -121,6 +121,7 @@ Code Seq# Include File Comments | |||
121 | 'c' 00-7F linux/comstats.h conflict! | 121 | 'c' 00-7F linux/comstats.h conflict! |
122 | 'c' 00-7F linux/coda.h conflict! | 122 | 'c' 00-7F linux/coda.h conflict! |
123 | 'c' 80-9F arch/s390/include/asm/chsc.h | 123 | 'c' 80-9F arch/s390/include/asm/chsc.h |
124 | 'c' A0-AF arch/x86/include/asm/msr.h | ||
124 | 'd' 00-FF linux/char/drm/drm/h conflict! | 125 | 'd' 00-FF linux/char/drm/drm/h conflict! |
125 | 'd' F0-FF linux/digi1.h | 126 | 'd' F0-FF linux/digi1.h |
126 | 'e' all linux/digi1.h conflict! | 127 | 'e' all linux/digi1.h conflict! |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7936b801fe6a..5d4427d17281 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1503,6 +1503,14 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1503 | [NFS] set the TCP port on which the NFSv4 callback | 1503 | [NFS] set the TCP port on which the NFSv4 callback |
1504 | channel should listen. | 1504 | channel should listen. |
1505 | 1505 | ||
1506 | nfs.cache_getent= | ||
1507 | [NFS] sets the pathname to the program which is used | ||
1508 | to update the NFS client cache entries. | ||
1509 | |||
1510 | nfs.cache_getent_timeout= | ||
1511 | [NFS] sets the timeout after which an attempt to | ||
1512 | update a cache entry is deemed to have failed. | ||
1513 | |||
1506 | nfs.idmap_cache_timeout= | 1514 | nfs.idmap_cache_timeout= |
1507 | [NFS] set the maximum lifetime for idmapper cache | 1515 | [NFS] set the maximum lifetime for idmapper cache |
1508 | entries. | 1516 | entries. |
@@ -2395,6 +2403,18 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2395 | stifb= [HW] | 2403 | stifb= [HW] |
2396 | Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]] | 2404 | Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]] |
2397 | 2405 | ||
2406 | sunrpc.min_resvport= | ||
2407 | sunrpc.max_resvport= | ||
2408 | [NFS,SUNRPC] | ||
2409 | SunRPC servers often require that client requests | ||
2410 | originate from a privileged port (i.e. a port in the | ||
2411 | range 0 < portnr < 1024). | ||
2412 | An administrator who wishes to reserve some of these | ||
2413 | ports for other uses may adjust the range that the | ||
2414 | kernel's sunrpc client considers to be privileged | ||
2415 | using these two parameters to set the minimum and | ||
2416 | maximum port values. | ||
2417 | |||
2398 | sunrpc.pool_mode= | 2418 | sunrpc.pool_mode= |
2399 | [NFS] | 2419 | [NFS] |
2400 | Control how the NFS server code allocates CPUs to | 2420 | Control how the NFS server code allocates CPUs to |
@@ -2411,6 +2431,15 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2411 | pernode one pool for each NUMA node (equivalent | 2431 | pernode one pool for each NUMA node (equivalent |
2412 | to global on non-NUMA machines) | 2432 | to global on non-NUMA machines) |
2413 | 2433 | ||
2434 | sunrpc.tcp_slot_table_entries= | ||
2435 | sunrpc.udp_slot_table_entries= | ||
2436 | [NFS,SUNRPC] | ||
2437 | Sets the upper limit on the number of simultaneous | ||
2438 | RPC calls that can be sent from the client to a | ||
2439 | server. Increasing these values may allow you to | ||
2440 | improve throughput, but will also increase the | ||
2441 | amount of memory reserved for use by the client. | ||
2442 | |||
2414 | swiotlb= [IA-64] Number of I/O TLB slabs | 2443 | swiotlb= [IA-64] Number of I/O TLB slabs |
2415 | 2444 | ||
2416 | switches= [HW,M68k] | 2445 | switches= [HW,M68k] |
@@ -2480,6 +2509,11 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2480 | trace_buf_size=nn[KMG] | 2509 | trace_buf_size=nn[KMG] |
2481 | [FTRACE] will set tracing buffer size. | 2510 | [FTRACE] will set tracing buffer size. |
2482 | 2511 | ||
2512 | trace_event=[event-list] | ||
2513 | [FTRACE] Set and start specified trace events in order | ||
2514 | to facilitate early boot debugging. | ||
2515 | See also Documentation/trace/events.txt | ||
2516 | |||
2483 | trix= [HW,OSS] MediaTrix AudioTrix Pro | 2517 | trix= [HW,OSS] MediaTrix AudioTrix Pro |
2484 | Format: | 2518 | Format: |
2485 | <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq> | 2519 | <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq> |
diff --git a/Documentation/keys.txt b/Documentation/keys.txt index b56aacc1fff8..e4dbbdb1bd96 100644 --- a/Documentation/keys.txt +++ b/Documentation/keys.txt | |||
@@ -26,7 +26,7 @@ This document has the following sections: | |||
26 | - Notes on accessing payload contents | 26 | - Notes on accessing payload contents |
27 | - Defining a key type | 27 | - Defining a key type |
28 | - Request-key callback service | 28 | - Request-key callback service |
29 | - Key access filesystem | 29 | - Garbage collection |
30 | 30 | ||
31 | 31 | ||
32 | ============ | 32 | ============ |
@@ -113,6 +113,9 @@ Each key has a number of attributes: | |||
113 | 113 | ||
114 | (*) Dead. The key's type was unregistered, and so the key is now useless. | 114 | (*) Dead. The key's type was unregistered, and so the key is now useless. |
115 | 115 | ||
116 | Keys in the last three states are subject to garbage collection. See the | ||
117 | section on "Garbage collection". | ||
118 | |||
116 | 119 | ||
117 | ==================== | 120 | ==================== |
118 | KEY SERVICE OVERVIEW | 121 | KEY SERVICE OVERVIEW |
@@ -754,6 +757,26 @@ The keyctl syscall functions are: | |||
754 | successful. | 757 | successful. |
755 | 758 | ||
756 | 759 | ||
760 | (*) Install the calling process's session keyring on its parent. | ||
761 | |||
762 | long keyctl(KEYCTL_SESSION_TO_PARENT); | ||
763 | |||
764 | This functions attempts to install the calling process's session keyring | ||
765 | on to the calling process's parent, replacing the parent's current session | ||
766 | keyring. | ||
767 | |||
768 | The calling process must have the same ownership as its parent, the | ||
769 | keyring must have the same ownership as the calling process, the calling | ||
770 | process must have LINK permission on the keyring and the active LSM module | ||
771 | mustn't deny permission, otherwise error EPERM will be returned. | ||
772 | |||
773 | Error ENOMEM will be returned if there was insufficient memory to complete | ||
774 | the operation, otherwise 0 will be returned to indicate success. | ||
775 | |||
776 | The keyring will be replaced next time the parent process leaves the | ||
777 | kernel and resumes executing userspace. | ||
778 | |||
779 | |||
757 | =============== | 780 | =============== |
758 | KERNEL SERVICES | 781 | KERNEL SERVICES |
759 | =============== | 782 | =============== |
@@ -1231,3 +1254,17 @@ by executing: | |||
1231 | 1254 | ||
1232 | In this case, the program isn't required to actually attach the key to a ring; | 1255 | In this case, the program isn't required to actually attach the key to a ring; |
1233 | the rings are provided for reference. | 1256 | the rings are provided for reference. |
1257 | |||
1258 | |||
1259 | ================== | ||
1260 | GARBAGE COLLECTION | ||
1261 | ================== | ||
1262 | |||
1263 | Dead keys (for which the type has been removed) will be automatically unlinked | ||
1264 | from those keyrings that point to them and deleted as soon as possible by a | ||
1265 | background garbage collector. | ||
1266 | |||
1267 | Similarly, revoked and expired keys will be garbage collected, but only after a | ||
1268 | certain amount of time has passed. This time is set as a number of seconds in: | ||
1269 | |||
1270 | /proc/sys/kernel/keys/gc_delay | ||
diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt index 89068030b01b..34f6638aa5ac 100644 --- a/Documentation/kmemleak.txt +++ b/Documentation/kmemleak.txt | |||
@@ -27,6 +27,13 @@ To trigger an intermediate memory scan: | |||
27 | 27 | ||
28 | # echo scan > /sys/kernel/debug/kmemleak | 28 | # echo scan > /sys/kernel/debug/kmemleak |
29 | 29 | ||
30 | To clear the list of all current possible memory leaks: | ||
31 | |||
32 | # echo clear > /sys/kernel/debug/kmemleak | ||
33 | |||
34 | New leaks will then come up upon reading /sys/kernel/debug/kmemleak | ||
35 | again. | ||
36 | |||
30 | Note that the orphan objects are listed in the order they were allocated | 37 | Note that the orphan objects are listed in the order they were allocated |
31 | and one object at the beginning of the list may cause other subsequent | 38 | and one object at the beginning of the list may cause other subsequent |
32 | objects to be reported as orphan. | 39 | objects to be reported as orphan. |
@@ -42,6 +49,9 @@ Memory scanning parameters can be modified at run-time by writing to the | |||
42 | scan=<secs> - set the automatic memory scanning period in seconds | 49 | scan=<secs> - set the automatic memory scanning period in seconds |
43 | (default 600, 0 to stop the automatic scanning) | 50 | (default 600, 0 to stop the automatic scanning) |
44 | scan - trigger a memory scan | 51 | scan - trigger a memory scan |
52 | clear - clear list of current memory leak suspects, done by | ||
53 | marking all current reported unreferenced objects grey | ||
54 | dump=<addr> - dump information about the object found at <addr> | ||
45 | 55 | ||
46 | Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on | 56 | Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on |
47 | the kernel command line. | 57 | the kernel command line. |
@@ -86,6 +96,27 @@ avoid this, kmemleak can also store the number of values pointing to an | |||
86 | address inside the block address range that need to be found so that the | 96 | address inside the block address range that need to be found so that the |
87 | block is not considered a leak. One example is __vmalloc(). | 97 | block is not considered a leak. One example is __vmalloc(). |
88 | 98 | ||
99 | Testing specific sections with kmemleak | ||
100 | --------------------------------------- | ||
101 | |||
102 | Upon initial bootup your /sys/kernel/debug/kmemleak output page may be | ||
103 | quite extensive. This can also be the case if you have very buggy code | ||
104 | when doing development. To work around these situations you can use the | ||
105 | 'clear' command to clear all reported unreferenced objects from the | ||
106 | /sys/kernel/debug/kmemleak output. By issuing a 'scan' after a 'clear' | ||
107 | you can find new unreferenced objects; this should help with testing | ||
108 | specific sections of code. | ||
109 | |||
110 | To test a critical section on demand with a clean kmemleak do: | ||
111 | |||
112 | # echo clear > /sys/kernel/debug/kmemleak | ||
113 | ... test your kernel or modules ... | ||
114 | # echo scan > /sys/kernel/debug/kmemleak | ||
115 | |||
116 | Then as usual to get your report with: | ||
117 | |||
118 | # cat /sys/kernel/debug/kmemleak | ||
119 | |||
89 | Kmemleak API | 120 | Kmemleak API |
90 | ------------ | 121 | ------------ |
91 | 122 | ||
diff --git a/Documentation/s390/s390dbf.txt b/Documentation/s390/s390dbf.txt index 2d10053dd97e..ae66f9b90a25 100644 --- a/Documentation/s390/s390dbf.txt +++ b/Documentation/s390/s390dbf.txt | |||
@@ -495,6 +495,13 @@ and for each vararg a long value. So e.g. for a debug entry with a format | |||
495 | string plus two varargs one would need to allocate a (3 * sizeof(long)) | 495 | string plus two varargs one would need to allocate a (3 * sizeof(long)) |
496 | byte data area in the debug_register() function. | 496 | byte data area in the debug_register() function. |
497 | 497 | ||
498 | IMPORTANT: Using "%s" in sprintf event functions is dangerous. You can only | ||
499 | use "%s" in the sprintf event functions, if the memory for the passed string is | ||
500 | available as long as the debug feature exists. The reason behind this is that | ||
501 | due to performance considerations only a pointer to the string is stored in | ||
502 | the debug feature. If you log a string that is freed afterwards, you will get | ||
503 | an OOPS when inspecting the debug feature, because then the debug feature will | ||
504 | access the already freed memory. | ||
498 | 505 | ||
499 | NOTE: If using the sprintf view do NOT use other event/exception functions | 506 | NOTE: If using the sprintf view do NOT use other event/exception functions |
500 | than the sprintf-event and -exception functions. | 507 | than the sprintf-event and -exception functions. |
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt index 4252697a95d6..1c8eb4518ce0 100644 --- a/Documentation/sound/alsa/ALSA-Configuration.txt +++ b/Documentation/sound/alsa/ALSA-Configuration.txt | |||
@@ -60,6 +60,12 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
60 | slots - Reserve the slot index for the given driver. | 60 | slots - Reserve the slot index for the given driver. |
61 | This option takes multiple strings. | 61 | This option takes multiple strings. |
62 | See "Module Autoloading Support" section for details. | 62 | See "Module Autoloading Support" section for details. |
63 | debug - Specifies the debug message level | ||
64 | (0 = disable debug prints, 1 = normal debug messages, | ||
65 | 2 = verbose debug messages) | ||
66 | This option appears only when CONFIG_SND_DEBUG=y. | ||
67 | This option can be dynamically changed via sysfs | ||
68 | /sys/modules/snd/parameters/debug file. | ||
63 | 69 | ||
64 | Module snd-pcm-oss | 70 | Module snd-pcm-oss |
65 | ------------------ | 71 | ------------------ |
@@ -513,6 +519,26 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
513 | or input, but you may use this module for any application which | 519 | or input, but you may use this module for any application which |
514 | requires a sound card (like RealPlayer). | 520 | requires a sound card (like RealPlayer). |
515 | 521 | ||
522 | pcm_devs - Number of PCM devices assigned to each card | ||
523 | (default = 1, up to 4) | ||
524 | pcm_substreams - Number of PCM substreams assigned to each PCM | ||
525 | (default = 8, up to 16) | ||
526 | hrtimer - Use hrtimer (=1, default) or system timer (=0) | ||
527 | fake_buffer - Fake buffer allocations (default = 1) | ||
528 | |||
529 | When multiple PCM devices are created, snd-dummy gives different | ||
530 | behavior to each PCM device: | ||
531 | 0 = interleaved with mmap support | ||
532 | 1 = non-interleaved with mmap support | ||
533 | 2 = interleaved without mmap | ||
534 | 3 = non-interleaved without mmap | ||
535 | |||
536 | As default, snd-dummy drivers doesn't allocate the real buffers | ||
537 | but either ignores read/write or mmap a single dummy page to all | ||
538 | buffer pages, in order to save the resouces. If your apps need | ||
539 | the read/ written buffer data to be consistent, pass fake_buffer=0 | ||
540 | option. | ||
541 | |||
516 | The power-management is supported. | 542 | The power-management is supported. |
517 | 543 | ||
518 | Module snd-echo3g | 544 | Module snd-echo3g |
@@ -768,6 +794,10 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
768 | bdl_pos_adj - Specifies the DMA IRQ timing delay in samples. | 794 | bdl_pos_adj - Specifies the DMA IRQ timing delay in samples. |
769 | Passing -1 will make the driver to choose the appropriate | 795 | Passing -1 will make the driver to choose the appropriate |
770 | value based on the controller chip. | 796 | value based on the controller chip. |
797 | patch - Specifies the early "patch" files to modify the HD-audio | ||
798 | setup before initializing the codecs. This option is | ||
799 | available only when CONFIG_SND_HDA_PATCH_LOADER=y is set. | ||
800 | See HD-Audio.txt for details. | ||
771 | 801 | ||
772 | [Single (global) options] | 802 | [Single (global) options] |
773 | single_cmd - Use single immediate commands to communicate with | 803 | single_cmd - Use single immediate commands to communicate with |
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt index 939a3dd58148..97eebd63bedc 100644 --- a/Documentation/sound/alsa/HD-Audio-Models.txt +++ b/Documentation/sound/alsa/HD-Audio-Models.txt | |||
@@ -114,8 +114,8 @@ ALC662/663/272 | |||
114 | samsung-nc10 Samsung NC10 mini notebook | 114 | samsung-nc10 Samsung NC10 mini notebook |
115 | auto auto-config reading BIOS (default) | 115 | auto auto-config reading BIOS (default) |
116 | 116 | ||
117 | ALC882/885 | 117 | ALC882/883/885/888/889 |
118 | ========== | 118 | ====================== |
119 | 3stack-dig 3-jack with SPDIF I/O | 119 | 3stack-dig 3-jack with SPDIF I/O |
120 | 6stack-dig 6-jack digital with SPDIF I/O | 120 | 6stack-dig 6-jack digital with SPDIF I/O |
121 | arima Arima W820Di1 | 121 | arima Arima W820Di1 |
@@ -127,12 +127,8 @@ ALC882/885 | |||
127 | mbp3 Macbook Pro rev3 | 127 | mbp3 Macbook Pro rev3 |
128 | imac24 iMac 24'' with jack detection | 128 | imac24 iMac 24'' with jack detection |
129 | w2jc ASUS W2JC | 129 | w2jc ASUS W2JC |
130 | auto auto-config reading BIOS (default) | 130 | 3stack-2ch-dig 3-jack with SPDIF I/O (ALC883) |
131 | 131 | alc883-6stack-dig 6-jack digital with SPDIF I/O (ALC883) | |
132 | ALC883/888 | ||
133 | ========== | ||
134 | 3stack-dig 3-jack with SPDIF I/O | ||
135 | 6stack-dig 6-jack digital with SPDIF I/O | ||
136 | 3stack-6ch 3-jack 6-channel | 132 | 3stack-6ch 3-jack 6-channel |
137 | 3stack-6ch-dig 3-jack 6-channel with SPDIF I/O | 133 | 3stack-6ch-dig 3-jack 6-channel with SPDIF I/O |
138 | 6stack-dig-demo 6-jack digital for Intel demo board | 134 | 6stack-dig-demo 6-jack digital for Intel demo board |
@@ -140,6 +136,7 @@ ALC883/888 | |||
140 | acer-aspire Acer Aspire 9810 | 136 | acer-aspire Acer Aspire 9810 |
141 | acer-aspire-4930g Acer Aspire 4930G | 137 | acer-aspire-4930g Acer Aspire 4930G |
142 | acer-aspire-6530g Acer Aspire 6530G | 138 | acer-aspire-6530g Acer Aspire 6530G |
139 | acer-aspire-7730g Acer Aspire 7730G | ||
143 | acer-aspire-8930g Acer Aspire 8930G | 140 | acer-aspire-8930g Acer Aspire 8930G |
144 | medion Medion Laptops | 141 | medion Medion Laptops |
145 | medion-md2 Medion MD2 | 142 | medion-md2 Medion MD2 |
@@ -155,10 +152,13 @@ ALC883/888 | |||
155 | 3stack-hp HP machines with 3stack (Lucknow, Samba boards) | 152 | 3stack-hp HP machines with 3stack (Lucknow, Samba boards) |
156 | 6stack-dell Dell machines with 6stack (Inspiron 530) | 153 | 6stack-dell Dell machines with 6stack (Inspiron 530) |
157 | mitac Mitac 8252D | 154 | mitac Mitac 8252D |
155 | clevo-m540r Clevo M540R (6ch + digital) | ||
158 | clevo-m720 Clevo M720 laptop series | 156 | clevo-m720 Clevo M720 laptop series |
159 | fujitsu-pi2515 Fujitsu AMILO Pi2515 | 157 | fujitsu-pi2515 Fujitsu AMILO Pi2515 |
160 | fujitsu-xa3530 Fujitsu AMILO XA3530 | 158 | fujitsu-xa3530 Fujitsu AMILO XA3530 |
161 | 3stack-6ch-intel Intel DG33* boards | 159 | 3stack-6ch-intel Intel DG33* boards |
160 | intel-alc889a Intel IbexPeak with ALC889A | ||
161 | intel-x58 Intel DX58 with ALC889 | ||
162 | asus-p5q ASUS P5Q-EM boards | 162 | asus-p5q ASUS P5Q-EM boards |
163 | mb31 MacBook 3,1 | 163 | mb31 MacBook 3,1 |
164 | sony-vaio-tt Sony VAIO TT | 164 | sony-vaio-tt Sony VAIO TT |
@@ -229,7 +229,7 @@ AD1984 | |||
229 | ====== | 229 | ====== |
230 | basic default configuration | 230 | basic default configuration |
231 | thinkpad Lenovo Thinkpad T61/X61 | 231 | thinkpad Lenovo Thinkpad T61/X61 |
232 | dell Dell T3400 | 232 | dell_desktop Dell T3400 |
233 | 233 | ||
234 | AD1986A | 234 | AD1986A |
235 | ======= | 235 | ======= |
@@ -258,6 +258,7 @@ Conexant 5045 | |||
258 | laptop-micsense Laptop with Mic sense (old model fujitsu) | 258 | laptop-micsense Laptop with Mic sense (old model fujitsu) |
259 | laptop-hpmicsense Laptop with HP and Mic senses | 259 | laptop-hpmicsense Laptop with HP and Mic senses |
260 | benq Benq R55E | 260 | benq Benq R55E |
261 | laptop-hp530 HP 530 laptop | ||
261 | test for testing/debugging purpose, almost all controls | 262 | test for testing/debugging purpose, almost all controls |
262 | can be adjusted. Appearing only when compiled with | 263 | can be adjusted. Appearing only when compiled with |
263 | $CONFIG_SND_DEBUG=y | 264 | $CONFIG_SND_DEBUG=y |
@@ -278,9 +279,16 @@ Conexant 5051 | |||
278 | hp-dv6736 HP dv6736 | 279 | hp-dv6736 HP dv6736 |
279 | lenovo-x200 Lenovo X200 laptop | 280 | lenovo-x200 Lenovo X200 laptop |
280 | 281 | ||
282 | Conexant 5066 | ||
283 | ============= | ||
284 | laptop Basic Laptop config (default) | ||
285 | dell-laptop Dell laptops | ||
286 | olpc-xo-1_5 OLPC XO 1.5 | ||
287 | |||
281 | STAC9200 | 288 | STAC9200 |
282 | ======== | 289 | ======== |
283 | ref Reference board | 290 | ref Reference board |
291 | oqo OQO Model 2 | ||
284 | dell-d21 Dell (unknown) | 292 | dell-d21 Dell (unknown) |
285 | dell-d22 Dell (unknown) | 293 | dell-d22 Dell (unknown) |
286 | dell-d23 Dell (unknown) | 294 | dell-d23 Dell (unknown) |
@@ -368,10 +376,12 @@ STAC92HD73* | |||
368 | =========== | 376 | =========== |
369 | ref Reference board | 377 | ref Reference board |
370 | no-jd BIOS setup but without jack-detection | 378 | no-jd BIOS setup but without jack-detection |
379 | intel Intel DG45* mobos | ||
371 | dell-m6-amic Dell desktops/laptops with analog mics | 380 | dell-m6-amic Dell desktops/laptops with analog mics |
372 | dell-m6-dmic Dell desktops/laptops with digital mics | 381 | dell-m6-dmic Dell desktops/laptops with digital mics |
373 | dell-m6 Dell desktops/laptops with both type of mics | 382 | dell-m6 Dell desktops/laptops with both type of mics |
374 | dell-eq Dell desktops/laptops | 383 | dell-eq Dell desktops/laptops |
384 | alienware Alienware M17x | ||
375 | auto BIOS setup (default) | 385 | auto BIOS setup (default) |
376 | 386 | ||
377 | STAC92HD83* | 387 | STAC92HD83* |
@@ -385,3 +395,8 @@ STAC9872 | |||
385 | ======== | 395 | ======== |
386 | vaio VAIO laptop without SPDIF | 396 | vaio VAIO laptop without SPDIF |
387 | auto BIOS setup (default) | 397 | auto BIOS setup (default) |
398 | |||
399 | Cirrus Logic CS4206/4207 | ||
400 | ======================== | ||
401 | mbp55 MacBook Pro 5,5 | ||
402 | auto BIOS setup (default) | ||
diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt index 71ac995b1915..7b8a5f947d1d 100644 --- a/Documentation/sound/alsa/HD-Audio.txt +++ b/Documentation/sound/alsa/HD-Audio.txt | |||
@@ -139,6 +139,10 @@ The driver checks PCI SSID and looks through the static configuration | |||
139 | table until any matching entry is found. If you have a new machine, | 139 | table until any matching entry is found. If you have a new machine, |
140 | you may see a message like below: | 140 | you may see a message like below: |
141 | ------------------------------------------------------------------------ | 141 | ------------------------------------------------------------------------ |
142 | hda_codec: ALC880: BIOS auto-probing. | ||
143 | ------------------------------------------------------------------------ | ||
144 | Meanwhile, in the earlier versions, you would see a message like: | ||
145 | ------------------------------------------------------------------------ | ||
142 | hda_codec: Unknown model for ALC880, trying auto-probe from BIOS... | 146 | hda_codec: Unknown model for ALC880, trying auto-probe from BIOS... |
143 | ------------------------------------------------------------------------ | 147 | ------------------------------------------------------------------------ |
144 | Even if you see such a message, DON'T PANIC. Take a deep breath and | 148 | Even if you see such a message, DON'T PANIC. Take a deep breath and |
@@ -403,6 +407,66 @@ re-configure based on that state, run like below: | |||
403 | ------------------------------------------------------------------------ | 407 | ------------------------------------------------------------------------ |
404 | 408 | ||
405 | 409 | ||
410 | Early Patching | ||
411 | ~~~~~~~~~~~~~~ | ||
412 | When CONFIG_SND_HDA_PATCH_LOADER=y is set, you can pass a "patch" as a | ||
413 | firmware file for modifying the HD-audio setup before initializing the | ||
414 | codec. This can work basically like the reconfiguration via sysfs in | ||
415 | the above, but it does it before the first codec configuration. | ||
416 | |||
417 | A patch file is a plain text file which looks like below: | ||
418 | |||
419 | ------------------------------------------------------------------------ | ||
420 | [codec] | ||
421 | 0x12345678 0xabcd1234 2 | ||
422 | |||
423 | [model] | ||
424 | auto | ||
425 | |||
426 | [pincfg] | ||
427 | 0x12 0x411111f0 | ||
428 | |||
429 | [verb] | ||
430 | 0x20 0x500 0x03 | ||
431 | 0x20 0x400 0xff | ||
432 | |||
433 | [hint] | ||
434 | hp_detect = yes | ||
435 | ------------------------------------------------------------------------ | ||
436 | |||
437 | The file needs to have a line `[codec]`. The next line should contain | ||
438 | three numbers indicating the codec vendor-id (0x12345678 in the | ||
439 | example), the codec subsystem-id (0xabcd1234) and the address (2) of | ||
440 | the codec. The rest patch entries are applied to this specified codec | ||
441 | until another codec entry is given. | ||
442 | |||
443 | The `[model]` line allows to change the model name of the each codec. | ||
444 | In the example above, it will be changed to model=auto. | ||
445 | Note that this overrides the module option. | ||
446 | |||
447 | After the `[pincfg]` line, the contents are parsed as the initial | ||
448 | default pin-configurations just like `user_pin_configs` sysfs above. | ||
449 | The values can be shown in user_pin_configs sysfs file, too. | ||
450 | |||
451 | Similarly, the lines after `[verb]` are parsed as `init_verbs` | ||
452 | sysfs entries, and the lines after `[hint]` are parsed as `hints` | ||
453 | sysfs entries, respectively. | ||
454 | |||
455 | The hd-audio driver reads the file via request_firmware(). Thus, | ||
456 | a patch file has to be located on the appropriate firmware path, | ||
457 | typically, /lib/firmware. For example, when you pass the option | ||
458 | `patch=hda-init.fw`, the file /lib/firmware/hda-init-fw must be | ||
459 | present. | ||
460 | |||
461 | The patch module option is specific to each card instance, and you | ||
462 | need to give one file name for each instance, separated by commas. | ||
463 | For example, if you have two cards, one for an on-board analog and one | ||
464 | for an HDMI video board, you may pass patch option like below: | ||
465 | ------------------------------------------------------------------------ | ||
466 | options snd-hda-intel patch=on-board-patch,hdmi-patch | ||
467 | ------------------------------------------------------------------------ | ||
468 | |||
469 | |||
406 | Power-Saving | 470 | Power-Saving |
407 | ~~~~~~~~~~~~ | 471 | ~~~~~~~~~~~~ |
408 | The power-saving is a kind of auto-suspend of the device. When the | 472 | The power-saving is a kind of auto-suspend of the device. When the |
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 322a00bb99d9..2dbff53369d0 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
@@ -19,6 +19,7 @@ Currently, these files might (depending on your configuration) | |||
19 | show up in /proc/sys/kernel: | 19 | show up in /proc/sys/kernel: |
20 | - acpi_video_flags | 20 | - acpi_video_flags |
21 | - acct | 21 | - acct |
22 | - callhome [ S390 only ] | ||
22 | - auto_msgmni | 23 | - auto_msgmni |
23 | - core_pattern | 24 | - core_pattern |
24 | - core_uses_pid | 25 | - core_uses_pid |
@@ -91,6 +92,21 @@ valid for 30 seconds. | |||
91 | 92 | ||
92 | ============================================================== | 93 | ============================================================== |
93 | 94 | ||
95 | callhome: | ||
96 | |||
97 | Controls the kernel's callhome behavior in case of a kernel panic. | ||
98 | |||
99 | The s390 hardware allows an operating system to send a notification | ||
100 | to a service organization (callhome) in case of an operating system panic. | ||
101 | |||
102 | When the value in this file is 0 (which is the default behavior) | ||
103 | nothing happens in case of a kernel panic. If this value is set to "1" | ||
104 | the complete kernel oops message is send to the IBM customer service | ||
105 | organization in case the mainframe the Linux operating system is running | ||
106 | on has a service contract with IBM. | ||
107 | |||
108 | ============================================================== | ||
109 | |||
94 | core_pattern: | 110 | core_pattern: |
95 | 111 | ||
96 | core_pattern is used to specify a core dumpfile pattern name. | 112 | core_pattern is used to specify a core dumpfile pattern name. |
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index f157d7594ea7..2bcc8d4dea29 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt | |||
@@ -83,6 +83,15 @@ When reading one of these enable files, there are four results: | |||
83 | X - there is a mixture of events enabled and disabled | 83 | X - there is a mixture of events enabled and disabled |
84 | ? - this file does not affect any event | 84 | ? - this file does not affect any event |
85 | 85 | ||
86 | 2.3 Boot option | ||
87 | --------------- | ||
88 | |||
89 | In order to facilitate early boot debugging, use boot option: | ||
90 | |||
91 | trace_event=[event-list] | ||
92 | |||
93 | The format of this boot option is the same as described in section 2.1. | ||
94 | |||
86 | 3. Defining an event-enabled tracepoint | 95 | 3. Defining an event-enabled tracepoint |
87 | ======================================= | 96 | ======================================= |
88 | 97 | ||
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index a39b3c749de5..355d0f1f8c50 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt | |||
@@ -85,26 +85,19 @@ of ftrace. Here is a list of some of the key files: | |||
85 | This file holds the output of the trace in a human | 85 | This file holds the output of the trace in a human |
86 | readable format (described below). | 86 | readable format (described below). |
87 | 87 | ||
88 | latency_trace: | ||
89 | |||
90 | This file shows the same trace but the information | ||
91 | is organized more to display possible latencies | ||
92 | in the system (described below). | ||
93 | |||
94 | trace_pipe: | 88 | trace_pipe: |
95 | 89 | ||
96 | The output is the same as the "trace" file but this | 90 | The output is the same as the "trace" file but this |
97 | file is meant to be streamed with live tracing. | 91 | file is meant to be streamed with live tracing. |
98 | Reads from this file will block until new data | 92 | Reads from this file will block until new data is |
99 | is retrieved. Unlike the "trace" and "latency_trace" | 93 | retrieved. Unlike the "trace" file, this file is a |
100 | files, this file is a consumer. This means reading | 94 | consumer. This means reading from this file causes |
101 | from this file causes sequential reads to display | 95 | sequential reads to display more current data. Once |
102 | more current data. Once data is read from this | 96 | data is read from this file, it is consumed, and |
103 | file, it is consumed, and will not be read | 97 | will not be read again with a sequential read. The |
104 | again with a sequential read. The "trace" and | 98 | "trace" file is static, and if the tracer is not |
105 | "latency_trace" files are static, and if the | 99 | adding more data,they will display the same |
106 | tracer is not adding more data, they will display | 100 | information every time they are read. |
107 | the same information every time they are read. | ||
108 | 101 | ||
109 | trace_options: | 102 | trace_options: |
110 | 103 | ||
@@ -117,10 +110,10 @@ of ftrace. Here is a list of some of the key files: | |||
117 | Some of the tracers record the max latency. | 110 | Some of the tracers record the max latency. |
118 | For example, the time interrupts are disabled. | 111 | For example, the time interrupts are disabled. |
119 | This time is saved in this file. The max trace | 112 | This time is saved in this file. The max trace |
120 | will also be stored, and displayed by either | 113 | will also be stored, and displayed by "trace". |
121 | "trace" or "latency_trace". A new max trace will | 114 | A new max trace will only be recorded if the |
122 | only be recorded if the latency is greater than | 115 | latency is greater than the value in this |
123 | the value in this file. (in microseconds) | 116 | file. (in microseconds) |
124 | 117 | ||
125 | buffer_size_kb: | 118 | buffer_size_kb: |
126 | 119 | ||
@@ -210,7 +203,7 @@ Here is the list of current tracers that may be configured. | |||
210 | the trace with the longest max latency. | 203 | the trace with the longest max latency. |
211 | See tracing_max_latency. When a new max is recorded, | 204 | See tracing_max_latency. When a new max is recorded, |
212 | it replaces the old trace. It is best to view this | 205 | it replaces the old trace. It is best to view this |
213 | trace via the latency_trace file. | 206 | trace with the latency-format option enabled. |
214 | 207 | ||
215 | "preemptoff" | 208 | "preemptoff" |
216 | 209 | ||
@@ -307,8 +300,8 @@ the lowest priority thread (pid 0). | |||
307 | Latency trace format | 300 | Latency trace format |
308 | -------------------- | 301 | -------------------- |
309 | 302 | ||
310 | For traces that display latency times, the latency_trace file | 303 | When the latency-format option is enabled, the trace file gives |
311 | gives somewhat more information to see why a latency happened. | 304 | somewhat more information to see why a latency happened. |
312 | Here is a typical trace. | 305 | Here is a typical trace. |
313 | 306 | ||
314 | # tracer: irqsoff | 307 | # tracer: irqsoff |
@@ -380,9 +373,10 @@ explains which is which. | |||
380 | 373 | ||
381 | The above is mostly meaningful for kernel developers. | 374 | The above is mostly meaningful for kernel developers. |
382 | 375 | ||
383 | time: This differs from the trace file output. The trace file output | 376 | time: When the latency-format option is enabled, the trace file |
384 | includes an absolute timestamp. The timestamp used by the | 377 | output includes a timestamp relative to the start of the |
385 | latency_trace file is relative to the start of the trace. | 378 | trace. This differs from the output when latency-format |
379 | is disabled, which includes an absolute timestamp. | ||
386 | 380 | ||
387 | delay: This is just to help catch your eye a bit better. And | 381 | delay: This is just to help catch your eye a bit better. And |
388 | needs to be fixed to be only relative to the same CPU. | 382 | needs to be fixed to be only relative to the same CPU. |
@@ -440,7 +434,8 @@ Here are the available options: | |||
440 | sym-addr: | 434 | sym-addr: |
441 | bash-4000 [01] 1477.606694: simple_strtoul <c0339346> | 435 | bash-4000 [01] 1477.606694: simple_strtoul <c0339346> |
442 | 436 | ||
443 | verbose - This deals with the latency_trace file. | 437 | verbose - This deals with the trace file when the |
438 | latency-format option is enabled. | ||
444 | 439 | ||
445 | bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ | 440 | bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ |
446 | (+0.000ms): simple_strtoul (strict_strtoul) | 441 | (+0.000ms): simple_strtoul (strict_strtoul) |
@@ -472,7 +467,7 @@ Here are the available options: | |||
472 | the app is no longer running | 467 | the app is no longer running |
473 | 468 | ||
474 | The lookup is performed when you read | 469 | The lookup is performed when you read |
475 | trace,trace_pipe,latency_trace. Example: | 470 | trace,trace_pipe. Example: |
476 | 471 | ||
477 | a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 | 472 | a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 |
478 | x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] | 473 | x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] |
@@ -481,6 +476,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] | |||
481 | every scheduling event. Will add overhead if | 476 | every scheduling event. Will add overhead if |
482 | there's a lot of tasks running at once. | 477 | there's a lot of tasks running at once. |
483 | 478 | ||
479 | latency-format - This option changes the trace. When | ||
480 | it is enabled, the trace displays | ||
481 | additional information about the | ||
482 | latencies, as described in "Latency | ||
483 | trace format". | ||
484 | 484 | ||
485 | sched_switch | 485 | sched_switch |
486 | ------------ | 486 | ------------ |
@@ -596,12 +596,13 @@ To reset the maximum, echo 0 into tracing_max_latency. Here is | |||
596 | an example: | 596 | an example: |
597 | 597 | ||
598 | # echo irqsoff > current_tracer | 598 | # echo irqsoff > current_tracer |
599 | # echo latency-format > trace_options | ||
599 | # echo 0 > tracing_max_latency | 600 | # echo 0 > tracing_max_latency |
600 | # echo 1 > tracing_enabled | 601 | # echo 1 > tracing_enabled |
601 | # ls -ltr | 602 | # ls -ltr |
602 | [...] | 603 | [...] |
603 | # echo 0 > tracing_enabled | 604 | # echo 0 > tracing_enabled |
604 | # cat latency_trace | 605 | # cat trace |
605 | # tracer: irqsoff | 606 | # tracer: irqsoff |
606 | # | 607 | # |
607 | irqsoff latency trace v1.1.5 on 2.6.26 | 608 | irqsoff latency trace v1.1.5 on 2.6.26 |
@@ -703,12 +704,13 @@ which preemption was disabled. The control of preemptoff tracer | |||
703 | is much like the irqsoff tracer. | 704 | is much like the irqsoff tracer. |
704 | 705 | ||
705 | # echo preemptoff > current_tracer | 706 | # echo preemptoff > current_tracer |
707 | # echo latency-format > trace_options | ||
706 | # echo 0 > tracing_max_latency | 708 | # echo 0 > tracing_max_latency |
707 | # echo 1 > tracing_enabled | 709 | # echo 1 > tracing_enabled |
708 | # ls -ltr | 710 | # ls -ltr |
709 | [...] | 711 | [...] |
710 | # echo 0 > tracing_enabled | 712 | # echo 0 > tracing_enabled |
711 | # cat latency_trace | 713 | # cat trace |
712 | # tracer: preemptoff | 714 | # tracer: preemptoff |
713 | # | 715 | # |
714 | preemptoff latency trace v1.1.5 on 2.6.26-rc8 | 716 | preemptoff latency trace v1.1.5 on 2.6.26-rc8 |
@@ -850,12 +852,13 @@ Again, using this trace is much like the irqsoff and preemptoff | |||
850 | tracers. | 852 | tracers. |
851 | 853 | ||
852 | # echo preemptirqsoff > current_tracer | 854 | # echo preemptirqsoff > current_tracer |
855 | # echo latency-format > trace_options | ||
853 | # echo 0 > tracing_max_latency | 856 | # echo 0 > tracing_max_latency |
854 | # echo 1 > tracing_enabled | 857 | # echo 1 > tracing_enabled |
855 | # ls -ltr | 858 | # ls -ltr |
856 | [...] | 859 | [...] |
857 | # echo 0 > tracing_enabled | 860 | # echo 0 > tracing_enabled |
858 | # cat latency_trace | 861 | # cat trace |
859 | # tracer: preemptirqsoff | 862 | # tracer: preemptirqsoff |
860 | # | 863 | # |
861 | preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 | 864 | preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 |
@@ -1012,11 +1015,12 @@ Instead of performing an 'ls', we will run 'sleep 1' under | |||
1012 | 'chrt' which changes the priority of the task. | 1015 | 'chrt' which changes the priority of the task. |
1013 | 1016 | ||
1014 | # echo wakeup > current_tracer | 1017 | # echo wakeup > current_tracer |
1018 | # echo latency-format > trace_options | ||
1015 | # echo 0 > tracing_max_latency | 1019 | # echo 0 > tracing_max_latency |
1016 | # echo 1 > tracing_enabled | 1020 | # echo 1 > tracing_enabled |
1017 | # chrt -f 5 sleep 1 | 1021 | # chrt -f 5 sleep 1 |
1018 | # echo 0 > tracing_enabled | 1022 | # echo 0 > tracing_enabled |
1019 | # cat latency_trace | 1023 | # cat trace |
1020 | # tracer: wakeup | 1024 | # tracer: wakeup |
1021 | # | 1025 | # |
1022 | wakeup latency trace v1.1.5 on 2.6.26-rc8 | 1026 | wakeup latency trace v1.1.5 on 2.6.26-rc8 |
diff --git a/Documentation/trace/function-graph-fold.vim b/Documentation/trace/function-graph-fold.vim new file mode 100644 index 000000000000..0544b504c8b0 --- /dev/null +++ b/Documentation/trace/function-graph-fold.vim | |||
@@ -0,0 +1,42 @@ | |||
1 | " Enable folding for ftrace function_graph traces. | ||
2 | " | ||
3 | " To use, :source this file while viewing a function_graph trace, or use vim's | ||
4 | " -S option to load from the command-line together with a trace. You can then | ||
5 | " use the usual vim fold commands, such as "za", to open and close nested | ||
6 | " functions. While closed, a fold will show the total time taken for a call, | ||
7 | " as would normally appear on the line with the closing brace. Folded | ||
8 | " functions will not include finish_task_switch(), so folding should remain | ||
9 | " relatively sane even through a context switch. | ||
10 | " | ||
11 | " Note that this will almost certainly only work well with a | ||
12 | " single-CPU trace (e.g. trace-cmd report --cpu 1). | ||
13 | |||
14 | function! FunctionGraphFoldExpr(lnum) | ||
15 | let line = getline(a:lnum) | ||
16 | if line[-1:] == '{' | ||
17 | if line =~ 'finish_task_switch() {$' | ||
18 | return '>1' | ||
19 | endif | ||
20 | return 'a1' | ||
21 | elseif line[-1:] == '}' | ||
22 | return 's1' | ||
23 | else | ||
24 | return '=' | ||
25 | endif | ||
26 | endfunction | ||
27 | |||
28 | function! FunctionGraphFoldText() | ||
29 | let s = split(getline(v:foldstart), '|', 1) | ||
30 | if getline(v:foldend+1) =~ 'finish_task_switch() {$' | ||
31 | let s[2] = ' task switch ' | ||
32 | else | ||
33 | let e = split(getline(v:foldend), '|', 1) | ||
34 | let s[2] = e[2] | ||
35 | endif | ||
36 | return join(s, '|') | ||
37 | endfunction | ||
38 | |||
39 | setlocal foldexpr=FunctionGraphFoldExpr(v:lnum) | ||
40 | setlocal foldtext=FunctionGraphFoldText() | ||
41 | setlocal foldcolumn=12 | ||
42 | setlocal foldmethod=expr | ||
diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt new file mode 100644 index 000000000000..5b1d23d604c5 --- /dev/null +++ b/Documentation/trace/ring-buffer-design.txt | |||
@@ -0,0 +1,955 @@ | |||
1 | Lockless Ring Buffer Design | ||
2 | =========================== | ||
3 | |||
4 | Copyright 2009 Red Hat Inc. | ||
5 | Author: Steven Rostedt <srostedt@redhat.com> | ||
6 | License: The GNU Free Documentation License, Version 1.2 | ||
7 | (dual licensed under the GPL v2) | ||
8 | Reviewers: Mathieu Desnoyers, Huang Ying, Hidetoshi Seto, | ||
9 | and Frederic Weisbecker. | ||
10 | |||
11 | |||
12 | Written for: 2.6.31 | ||
13 | |||
14 | Terminology used in this Document | ||
15 | --------------------------------- | ||
16 | |||
17 | tail - where new writes happen in the ring buffer. | ||
18 | |||
19 | head - where new reads happen in the ring buffer. | ||
20 | |||
21 | producer - the task that writes into the ring buffer (same as writer) | ||
22 | |||
23 | writer - same as producer | ||
24 | |||
25 | consumer - the task that reads from the buffer (same as reader) | ||
26 | |||
27 | reader - same as consumer. | ||
28 | |||
29 | reader_page - A page outside the ring buffer used solely (for the most part) | ||
30 | by the reader. | ||
31 | |||
32 | head_page - a pointer to the page that the reader will use next | ||
33 | |||
34 | tail_page - a pointer to the page that will be written to next | ||
35 | |||
36 | commit_page - a pointer to the page with the last finished non nested write. | ||
37 | |||
38 | cmpxchg - hardware assisted atomic transaction that performs the following: | ||
39 | |||
40 | A = B iff previous A == C | ||
41 | |||
42 | R = cmpxchg(A, C, B) is saying that we replace A with B if and only if | ||
43 | current A is equal to C, and we put the old (current) A into R | ||
44 | |||
45 | R gets the previous A regardless if A is updated with B or not. | ||
46 | |||
47 | To see if the update was successful a compare of R == C may be used. | ||
48 | |||
49 | The Generic Ring Buffer | ||
50 | ----------------------- | ||
51 | |||
52 | The ring buffer can be used in either an overwrite mode or in | ||
53 | producer/consumer mode. | ||
54 | |||
55 | Producer/consumer mode is where the producer were to fill up the | ||
56 | buffer before the consumer could free up anything, the producer | ||
57 | will stop writing to the buffer. This will lose most recent events. | ||
58 | |||
59 | Overwrite mode is where the produce were to fill up the buffer | ||
60 | before the consumer could free up anything, the producer will | ||
61 | overwrite the older data. This will lose the oldest events. | ||
62 | |||
63 | No two writers can write at the same time (on the same per cpu buffer), | ||
64 | but a writer may interrupt another writer, but it must finish writing | ||
65 | before the previous writer may continue. This is very important to the | ||
66 | algorithm. The writers act like a "stack". The way interrupts works | ||
67 | enforces this behavior. | ||
68 | |||
69 | |||
70 | writer1 start | ||
71 | <preempted> writer2 start | ||
72 | <preempted> writer3 start | ||
73 | writer3 finishes | ||
74 | writer2 finishes | ||
75 | writer1 finishes | ||
76 | |||
77 | This is very much like a writer being preempted by an interrupt and | ||
78 | the interrupt doing a write as well. | ||
79 | |||
80 | Readers can happen at any time. But no two readers may run at the | ||
81 | same time, nor can a reader preempt/interrupt another reader. A reader | ||
82 | can not preempt/interrupt a writer, but it may read/consume from the | ||
83 | buffer at the same time as a writer is writing, but the reader must be | ||
84 | on another processor to do so. A reader may read on its own processor | ||
85 | and can be preempted by a writer. | ||
86 | |||
87 | A writer can preempt a reader, but a reader can not preempt a writer. | ||
88 | But a reader can read the buffer at the same time (on another processor) | ||
89 | as a writer. | ||
90 | |||
91 | The ring buffer is made up of a list of pages held together by a link list. | ||
92 | |||
93 | At initialization a reader page is allocated for the reader that is not | ||
94 | part of the ring buffer. | ||
95 | |||
96 | The head_page, tail_page and commit_page are all initialized to point | ||
97 | to the same page. | ||
98 | |||
99 | The reader page is initialized to have its next pointer pointing to | ||
100 | the head page, and its previous pointer pointing to a page before | ||
101 | the head page. | ||
102 | |||
103 | The reader has its own page to use. At start up time, this page is | ||
104 | allocated but is not attached to the list. When the reader wants | ||
105 | to read from the buffer, if its page is empty (like it is on start up) | ||
106 | it will swap its page with the head_page. The old reader page will | ||
107 | become part of the ring buffer and the head_page will be removed. | ||
108 | The page after the inserted page (old reader_page) will become the | ||
109 | new head page. | ||
110 | |||
111 | Once the new page is given to the reader, the reader could do what | ||
112 | it wants with it, as long as a writer has left that page. | ||
113 | |||
114 | A sample of how the reader page is swapped: Note this does not | ||
115 | show the head page in the buffer, it is for demonstrating a swap | ||
116 | only. | ||
117 | |||
118 | +------+ | ||
119 | |reader| RING BUFFER | ||
120 | |page | | ||
121 | +------+ | ||
122 | +---+ +---+ +---+ | ||
123 | | |-->| |-->| | | ||
124 | | |<--| |<--| | | ||
125 | +---+ +---+ +---+ | ||
126 | ^ | ^ | | ||
127 | | +-------------+ | | ||
128 | +-----------------+ | ||
129 | |||
130 | |||
131 | +------+ | ||
132 | |reader| RING BUFFER | ||
133 | |page |-------------------+ | ||
134 | +------+ v | ||
135 | | +---+ +---+ +---+ | ||
136 | | | |-->| |-->| | | ||
137 | | | |<--| |<--| |<-+ | ||
138 | | +---+ +---+ +---+ | | ||
139 | | ^ | ^ | | | ||
140 | | | +-------------+ | | | ||
141 | | +-----------------+ | | ||
142 | +------------------------------------+ | ||
143 | |||
144 | +------+ | ||
145 | |reader| RING BUFFER | ||
146 | |page |-------------------+ | ||
147 | +------+ <---------------+ v | ||
148 | | ^ +---+ +---+ +---+ | ||
149 | | | | |-->| |-->| | | ||
150 | | | | | | |<--| |<-+ | ||
151 | | | +---+ +---+ +---+ | | ||
152 | | | | ^ | | | ||
153 | | | +-------------+ | | | ||
154 | | +-----------------------------+ | | ||
155 | +------------------------------------+ | ||
156 | |||
157 | +------+ | ||
158 | |buffer| RING BUFFER | ||
159 | |page |-------------------+ | ||
160 | +------+ <---------------+ v | ||
161 | | ^ +---+ +---+ +---+ | ||
162 | | | | | | |-->| | | ||
163 | | | New | | | |<--| |<-+ | ||
164 | | | Reader +---+ +---+ +---+ | | ||
165 | | | page ----^ | | | ||
166 | | | | | | ||
167 | | +-----------------------------+ | | ||
168 | +------------------------------------+ | ||
169 | |||
170 | |||
171 | |||
172 | It is possible that the page swapped is the commit page and the tail page, | ||
173 | if what is in the ring buffer is less than what is held in a buffer page. | ||
174 | |||
175 | |||
176 | reader page commit page tail page | ||
177 | | | | | ||
178 | v | | | ||
179 | +---+ | | | ||
180 | | |<----------+ | | ||
181 | | |<------------------------+ | ||
182 | | |------+ | ||
183 | +---+ | | ||
184 | | | ||
185 | v | ||
186 | +---+ +---+ +---+ +---+ | ||
187 | <---| |--->| |--->| |--->| |---> | ||
188 | --->| |<---| |<---| |<---| |<--- | ||
189 | +---+ +---+ +---+ +---+ | ||
190 | |||
191 | This case is still valid for this algorithm. | ||
192 | When the writer leaves the page, it simply goes into the ring buffer | ||
193 | since the reader page still points to the next location in the ring | ||
194 | buffer. | ||
195 | |||
196 | |||
197 | The main pointers: | ||
198 | |||
199 | reader page - The page used solely by the reader and is not part | ||
200 | of the ring buffer (may be swapped in) | ||
201 | |||
202 | head page - the next page in the ring buffer that will be swapped | ||
203 | with the reader page. | ||
204 | |||
205 | tail page - the page where the next write will take place. | ||
206 | |||
207 | commit page - the page that last finished a write. | ||
208 | |||
209 | The commit page only is updated by the outer most writer in the | ||
210 | writer stack. A writer that preempts another writer will not move the | ||
211 | commit page. | ||
212 | |||
213 | When data is written into the ring buffer, a position is reserved | ||
214 | in the ring buffer and passed back to the writer. When the writer | ||
215 | is finished writing data into that position, it commits the write. | ||
216 | |||
217 | Another write (or a read) may take place at anytime during this | ||
218 | transaction. If another write happens it must finish before continuing | ||
219 | with the previous write. | ||
220 | |||
221 | |||
222 | Write reserve: | ||
223 | |||
224 | Buffer page | ||
225 | +---------+ | ||
226 | |written | | ||
227 | +---------+ <--- given back to writer (current commit) | ||
228 | |reserved | | ||
229 | +---------+ <--- tail pointer | ||
230 | | empty | | ||
231 | +---------+ | ||
232 | |||
233 | Write commit: | ||
234 | |||
235 | Buffer page | ||
236 | +---------+ | ||
237 | |written | | ||
238 | +---------+ | ||
239 | |written | | ||
240 | +---------+ <--- next positon for write (current commit) | ||
241 | | empty | | ||
242 | +---------+ | ||
243 | |||
244 | |||
245 | If a write happens after the first reserve: | ||
246 | |||
247 | Buffer page | ||
248 | +---------+ | ||
249 | |written | | ||
250 | +---------+ <-- current commit | ||
251 | |reserved | | ||
252 | +---------+ <--- given back to second writer | ||
253 | |reserved | | ||
254 | +---------+ <--- tail pointer | ||
255 | |||
256 | After second writer commits: | ||
257 | |||
258 | |||
259 | Buffer page | ||
260 | +---------+ | ||
261 | |written | | ||
262 | +---------+ <--(last full commit) | ||
263 | |reserved | | ||
264 | +---------+ | ||
265 | |pending | | ||
266 | |commit | | ||
267 | +---------+ <--- tail pointer | ||
268 | |||
269 | When the first writer commits: | ||
270 | |||
271 | Buffer page | ||
272 | +---------+ | ||
273 | |written | | ||
274 | +---------+ | ||
275 | |written | | ||
276 | +---------+ | ||
277 | |written | | ||
278 | +---------+ <--(last full commit and tail pointer) | ||
279 | |||
280 | |||
281 | The commit pointer points to the last write location that was | ||
282 | committed without preempting another write. When a write that | ||
283 | preempted another write is committed, it only becomes a pending commit | ||
284 | and will not be a full commit till all writes have been committed. | ||
285 | |||
286 | The commit page points to the page that has the last full commit. | ||
287 | The tail page points to the page with the last write (before | ||
288 | committing). | ||
289 | |||
290 | The tail page is always equal to or after the commit page. It may | ||
291 | be several pages ahead. If the tail page catches up to the commit | ||
292 | page then no more writes may take place (regardless of the mode | ||
293 | of the ring buffer: overwrite and produce/consumer). | ||
294 | |||
295 | The order of pages are: | ||
296 | |||
297 | head page | ||
298 | commit page | ||
299 | tail page | ||
300 | |||
301 | Possible scenario: | ||
302 | tail page | ||
303 | head page commit page | | ||
304 | | | | | ||
305 | v v v | ||
306 | +---+ +---+ +---+ +---+ | ||
307 | <---| |--->| |--->| |--->| |---> | ||
308 | --->| |<---| |<---| |<---| |<--- | ||
309 | +---+ +---+ +---+ +---+ | ||
310 | |||
311 | There is a special case that the head page is after either the commit page | ||
312 | and possibly the tail page. That is when the commit (and tail) page has been | ||
313 | swapped with the reader page. This is because the head page is always | ||
314 | part of the ring buffer, but the reader page is not. When ever there | ||
315 | has been less than a full page that has been committed inside the ring buffer, | ||
316 | and a reader swaps out a page, it will be swapping out the commit page. | ||
317 | |||
318 | |||
319 | reader page commit page tail page | ||
320 | | | | | ||
321 | v | | | ||
322 | +---+ | | | ||
323 | | |<----------+ | | ||
324 | | |<------------------------+ | ||
325 | | |------+ | ||
326 | +---+ | | ||
327 | | | ||
328 | v | ||
329 | +---+ +---+ +---+ +---+ | ||
330 | <---| |--->| |--->| |--->| |---> | ||
331 | --->| |<---| |<---| |<---| |<--- | ||
332 | +---+ +---+ +---+ +---+ | ||
333 | ^ | ||
334 | | | ||
335 | head page | ||
336 | |||
337 | |||
338 | In this case, the head page will not move when the tail and commit | ||
339 | move back into the ring buffer. | ||
340 | |||
341 | The reader can not swap a page into the ring buffer if the commit page | ||
342 | is still on that page. If the read meets the last commit (real commit | ||
343 | not pending or reserved), then there is nothing more to read. | ||
344 | The buffer is considered empty until another full commit finishes. | ||
345 | |||
346 | When the tail meets the head page, if the buffer is in overwrite mode, | ||
347 | the head page will be pushed ahead one. If the buffer is in producer/consumer | ||
348 | mode, the write will fail. | ||
349 | |||
350 | Overwrite mode: | ||
351 | |||
352 | tail page | ||
353 | | | ||
354 | v | ||
355 | +---+ +---+ +---+ +---+ | ||
356 | <---| |--->| |--->| |--->| |---> | ||
357 | --->| |<---| |<---| |<---| |<--- | ||
358 | +---+ +---+ +---+ +---+ | ||
359 | ^ | ||
360 | | | ||
361 | head page | ||
362 | |||
363 | |||
364 | tail page | ||
365 | | | ||
366 | v | ||
367 | +---+ +---+ +---+ +---+ | ||
368 | <---| |--->| |--->| |--->| |---> | ||
369 | --->| |<---| |<---| |<---| |<--- | ||
370 | +---+ +---+ +---+ +---+ | ||
371 | ^ | ||
372 | | | ||
373 | head page | ||
374 | |||
375 | |||
376 | tail page | ||
377 | | | ||
378 | v | ||
379 | +---+ +---+ +---+ +---+ | ||
380 | <---| |--->| |--->| |--->| |---> | ||
381 | --->| |<---| |<---| |<---| |<--- | ||
382 | +---+ +---+ +---+ +---+ | ||
383 | ^ | ||
384 | | | ||
385 | head page | ||
386 | |||
387 | Note, the reader page will still point to the previous head page. | ||
388 | But when a swap takes place, it will use the most recent head page. | ||
389 | |||
390 | |||
391 | Making the Ring Buffer Lockless: | ||
392 | -------------------------------- | ||
393 | |||
394 | The main idea behind the lockless algorithm is to combine the moving | ||
395 | of the head_page pointer with the swapping of pages with the reader. | ||
396 | State flags are placed inside the pointer to the page. To do this, | ||
397 | each page must be aligned in memory by 4 bytes. This will allow the 2 | ||
398 | least significant bits of the address to be used as flags. Since | ||
399 | they will always be zero for the address. To get the address, | ||
400 | simply mask out the flags. | ||
401 | |||
402 | MASK = ~3 | ||
403 | |||
404 | address & MASK | ||
405 | |||
406 | Two flags will be kept by these two bits: | ||
407 | |||
408 | HEADER - the page being pointed to is a head page | ||
409 | |||
410 | UPDATE - the page being pointed to is being updated by a writer | ||
411 | and was or is about to be a head page. | ||
412 | |||
413 | |||
414 | reader page | ||
415 | | | ||
416 | v | ||
417 | +---+ | ||
418 | | |------+ | ||
419 | +---+ | | ||
420 | | | ||
421 | v | ||
422 | +---+ +---+ +---+ +---+ | ||
423 | <---| |--->| |-H->| |--->| |---> | ||
424 | --->| |<---| |<---| |<---| |<--- | ||
425 | +---+ +---+ +---+ +---+ | ||
426 | |||
427 | |||
428 | The above pointer "-H->" would have the HEADER flag set. That is | ||
429 | the next page is the next page to be swapped out by the reader. | ||
430 | This pointer means the next page is the head page. | ||
431 | |||
432 | When the tail page meets the head pointer, it will use cmpxchg to | ||
433 | change the pointer to the UPDATE state: | ||
434 | |||
435 | |||
436 | tail page | ||
437 | | | ||
438 | v | ||
439 | +---+ +---+ +---+ +---+ | ||
440 | <---| |--->| |-H->| |--->| |---> | ||
441 | --->| |<---| |<---| |<---| |<--- | ||
442 | +---+ +---+ +---+ +---+ | ||
443 | |||
444 | tail page | ||
445 | | | ||
446 | v | ||
447 | +---+ +---+ +---+ +---+ | ||
448 | <---| |--->| |-U->| |--->| |---> | ||
449 | --->| |<---| |<---| |<---| |<--- | ||
450 | +---+ +---+ +---+ +---+ | ||
451 | |||
452 | "-U->" represents a pointer in the UPDATE state. | ||
453 | |||
454 | Any access to the reader will need to take some sort of lock to serialize | ||
455 | the readers. But the writers will never take a lock to write to the | ||
456 | ring buffer. This means we only need to worry about a single reader, | ||
457 | and writes only preempt in "stack" formation. | ||
458 | |||
459 | When the reader tries to swap the page with the ring buffer, it | ||
460 | will also use cmpxchg. If the flag bit in the pointer to the | ||
461 | head page does not have the HEADER flag set, the compare will fail | ||
462 | and the reader will need to look for the new head page and try again. | ||
463 | Note, the flag UPDATE and HEADER are never set at the same time. | ||
464 | |||
465 | The reader swaps the reader page as follows: | ||
466 | |||
467 | +------+ | ||
468 | |reader| RING BUFFER | ||
469 | |page | | ||
470 | +------+ | ||
471 | +---+ +---+ +---+ | ||
472 | | |--->| |--->| | | ||
473 | | |<---| |<---| | | ||
474 | +---+ +---+ +---+ | ||
475 | ^ | ^ | | ||
476 | | +---------------+ | | ||
477 | +-----H-------------+ | ||
478 | |||
479 | The reader sets the reader page next pointer as HEADER to the page after | ||
480 | the head page. | ||
481 | |||
482 | |||
483 | +------+ | ||
484 | |reader| RING BUFFER | ||
485 | |page |-------H-----------+ | ||
486 | +------+ v | ||
487 | | +---+ +---+ +---+ | ||
488 | | | |--->| |--->| | | ||
489 | | | |<---| |<---| |<-+ | ||
490 | | +---+ +---+ +---+ | | ||
491 | | ^ | ^ | | | ||
492 | | | +---------------+ | | | ||
493 | | +-----H-------------+ | | ||
494 | +--------------------------------------+ | ||
495 | |||
496 | It does a cmpxchg with the pointer to the previous head page to make it | ||
497 | point to the reader page. Note that the new pointer does not have the HEADER | ||
498 | flag set. This action atomically moves the head page forward. | ||
499 | |||
500 | +------+ | ||
501 | |reader| RING BUFFER | ||
502 | |page |-------H-----------+ | ||
503 | +------+ v | ||
504 | | ^ +---+ +---+ +---+ | ||
505 | | | | |-->| |-->| | | ||
506 | | | | |<--| |<--| |<-+ | ||
507 | | | +---+ +---+ +---+ | | ||
508 | | | | ^ | | | ||
509 | | | +-------------+ | | | ||
510 | | +-----------------------------+ | | ||
511 | +------------------------------------+ | ||
512 | |||
513 | After the new head page is set, the previous pointer of the head page is | ||
514 | updated to the reader page. | ||
515 | |||
516 | +------+ | ||
517 | |reader| RING BUFFER | ||
518 | |page |-------H-----------+ | ||
519 | +------+ <---------------+ v | ||
520 | | ^ +---+ +---+ +---+ | ||
521 | | | | |-->| |-->| | | ||
522 | | | | | | |<--| |<-+ | ||
523 | | | +---+ +---+ +---+ | | ||
524 | | | | ^ | | | ||
525 | | | +-------------+ | | | ||
526 | | +-----------------------------+ | | ||
527 | +------------------------------------+ | ||
528 | |||
529 | +------+ | ||
530 | |buffer| RING BUFFER | ||
531 | |page |-------H-----------+ <--- New head page | ||
532 | +------+ <---------------+ v | ||
533 | | ^ +---+ +---+ +---+ | ||
534 | | | | | | |-->| | | ||
535 | | | New | | | |<--| |<-+ | ||
536 | | | Reader +---+ +---+ +---+ | | ||
537 | | | page ----^ | | | ||
538 | | | | | | ||
539 | | +-----------------------------+ | | ||
540 | +------------------------------------+ | ||
541 | |||
542 | Another important point. The page that the reader page points back to | ||
543 | by its previous pointer (the one that now points to the new head page) | ||
544 | never points back to the reader page. That is because the reader page is | ||
545 | not part of the ring buffer. Traversing the ring buffer via the next pointers | ||
546 | will always stay in the ring buffer. Traversing the ring buffer via the | ||
547 | prev pointers may not. | ||
548 | |||
549 | Note, the way to determine a reader page is simply by examining the previous | ||
550 | pointer of the page. If the next pointer of the previous page does not | ||
551 | point back to the original page, then the original page is a reader page: | ||
552 | |||
553 | |||
554 | +--------+ | ||
555 | | reader | next +----+ | ||
556 | | page |-------->| |<====== (buffer page) | ||
557 | +--------+ +----+ | ||
558 | | | ^ | ||
559 | | v | next | ||
560 | prev | +----+ | ||
561 | +------------->| | | ||
562 | +----+ | ||
563 | |||
564 | The way the head page moves forward: | ||
565 | |||
566 | When the tail page meets the head page and the buffer is in overwrite mode | ||
567 | and more writes take place, the head page must be moved forward before the | ||
568 | writer may move the tail page. The way this is done is that the writer | ||
569 | performs a cmpxchg to convert the pointer to the head page from the HEADER | ||
570 | flag to have the UPDATE flag set. Once this is done, the reader will | ||
571 | not be able to swap the head page from the buffer, nor will it be able to | ||
572 | move the head page, until the writer is finished with the move. | ||
573 | |||
574 | This eliminates any races that the reader can have on the writer. The reader | ||
575 | must spin, and this is why the reader can not preempt the writer. | ||
576 | |||
577 | tail page | ||
578 | | | ||
579 | v | ||
580 | +---+ +---+ +---+ +---+ | ||
581 | <---| |--->| |-H->| |--->| |---> | ||
582 | --->| |<---| |<---| |<---| |<--- | ||
583 | +---+ +---+ +---+ +---+ | ||
584 | |||
585 | tail page | ||
586 | | | ||
587 | v | ||
588 | +---+ +---+ +---+ +---+ | ||
589 | <---| |--->| |-U->| |--->| |---> | ||
590 | --->| |<---| |<---| |<---| |<--- | ||
591 | +---+ +---+ +---+ +---+ | ||
592 | |||
593 | The following page will be made into the new head page. | ||
594 | |||
595 | tail page | ||
596 | | | ||
597 | v | ||
598 | +---+ +---+ +---+ +---+ | ||
599 | <---| |--->| |-U->| |-H->| |---> | ||
600 | --->| |<---| |<---| |<---| |<--- | ||
601 | +---+ +---+ +---+ +---+ | ||
602 | |||
603 | After the new head page has been set, we can set the old head page | ||
604 | pointer back to NORMAL. | ||
605 | |||
606 | tail page | ||
607 | | | ||
608 | v | ||
609 | +---+ +---+ +---+ +---+ | ||
610 | <---| |--->| |--->| |-H->| |---> | ||
611 | --->| |<---| |<---| |<---| |<--- | ||
612 | +---+ +---+ +---+ +---+ | ||
613 | |||
614 | After the head page has been moved, the tail page may now move forward. | ||
615 | |||
616 | tail page | ||
617 | | | ||
618 | v | ||
619 | +---+ +---+ +---+ +---+ | ||
620 | <---| |--->| |--->| |-H->| |---> | ||
621 | --->| |<---| |<---| |<---| |<--- | ||
622 | +---+ +---+ +---+ +---+ | ||
623 | |||
624 | |||
625 | The above are the trivial updates. Now for the more complex scenarios. | ||
626 | |||
627 | |||
628 | As stated before, if enough writes preempt the first write, the | ||
629 | tail page may make it all the way around the buffer and meet the commit | ||
630 | page. At this time, we must start dropping writes (usually with some kind | ||
631 | of warning to the user). But what happens if the commit was still on the | ||
632 | reader page? The commit page is not part of the ring buffer. The tail page | ||
633 | must account for this. | ||
634 | |||
635 | |||
636 | reader page commit page | ||
637 | | | | ||
638 | v | | ||
639 | +---+ | | ||
640 | | |<----------+ | ||
641 | | | | ||
642 | | |------+ | ||
643 | +---+ | | ||
644 | | | ||
645 | v | ||
646 | +---+ +---+ +---+ +---+ | ||
647 | <---| |--->| |-H->| |--->| |---> | ||
648 | --->| |<---| |<---| |<---| |<--- | ||
649 | +---+ +---+ +---+ +---+ | ||
650 | ^ | ||
651 | | | ||
652 | tail page | ||
653 | |||
654 | If the tail page were to simply push the head page forward, the commit when | ||
655 | leaving the reader page would not be pointing to the correct page. | ||
656 | |||
657 | The solution to this is to test if the commit page is on the reader page | ||
658 | before pushing the head page. If it is, then it can be assumed that the | ||
659 | tail page wrapped the buffer, and we must drop new writes. | ||
660 | |||
661 | This is not a race condition, because the commit page can only be moved | ||
662 | by the outter most writer (the writer that was preempted). | ||
663 | This means that the commit will not move while a writer is moving the | ||
664 | tail page. The reader can not swap the reader page if it is also being | ||
665 | used as the commit page. The reader can simply check that the commit | ||
666 | is off the reader page. Once the commit page leaves the reader page | ||
667 | it will never go back on it unless a reader does another swap with the | ||
668 | buffer page that is also the commit page. | ||
669 | |||
670 | |||
671 | Nested writes | ||
672 | ------------- | ||
673 | |||
674 | In the pushing forward of the tail page we must first push forward | ||
675 | the head page if the head page is the next page. If the head page | ||
676 | is not the next page, the tail page is simply updated with a cmpxchg. | ||
677 | |||
678 | Only writers move the tail page. This must be done atomically to protect | ||
679 | against nested writers. | ||
680 | |||
681 | temp_page = tail_page | ||
682 | next_page = temp_page->next | ||
683 | cmpxchg(tail_page, temp_page, next_page) | ||
684 | |||
685 | The above will update the tail page if it is still pointing to the expected | ||
686 | page. If this fails, a nested write pushed it forward, the the current write | ||
687 | does not need to push it. | ||
688 | |||
689 | |||
690 | temp page | ||
691 | | | ||
692 | v | ||
693 | tail page | ||
694 | | | ||
695 | v | ||
696 | +---+ +---+ +---+ +---+ | ||
697 | <---| |--->| |--->| |--->| |---> | ||
698 | --->| |<---| |<---| |<---| |<--- | ||
699 | +---+ +---+ +---+ +---+ | ||
700 | |||
701 | Nested write comes in and moves the tail page forward: | ||
702 | |||
703 | tail page (moved by nested writer) | ||
704 | temp page | | ||
705 | | | | ||
706 | v v | ||
707 | +---+ +---+ +---+ +---+ | ||
708 | <---| |--->| |--->| |--->| |---> | ||
709 | --->| |<---| |<---| |<---| |<--- | ||
710 | +---+ +---+ +---+ +---+ | ||
711 | |||
712 | The above would fail the cmpxchg, but since the tail page has already | ||
713 | been moved forward, the writer will just try again to reserve storage | ||
714 | on the new tail page. | ||
715 | |||
716 | But the moving of the head page is a bit more complex. | ||
717 | |||
718 | tail page | ||
719 | | | ||
720 | v | ||
721 | +---+ +---+ +---+ +---+ | ||
722 | <---| |--->| |-H->| |--->| |---> | ||
723 | --->| |<---| |<---| |<---| |<--- | ||
724 | +---+ +---+ +---+ +---+ | ||
725 | |||
726 | The write converts the head page pointer to UPDATE. | ||
727 | |||
728 | tail page | ||
729 | | | ||
730 | v | ||
731 | +---+ +---+ +---+ +---+ | ||
732 | <---| |--->| |-U->| |--->| |---> | ||
733 | --->| |<---| |<---| |<---| |<--- | ||
734 | +---+ +---+ +---+ +---+ | ||
735 | |||
736 | But if a nested writer preempts here. It will see that the next | ||
737 | page is a head page, but it is also nested. It will detect that | ||
738 | it is nested and will save that information. The detection is the | ||
739 | fact that it sees the UPDATE flag instead of a HEADER or NORMAL | ||
740 | pointer. | ||
741 | |||
742 | The nested writer will set the new head page pointer. | ||
743 | |||
744 | tail page | ||
745 | | | ||
746 | v | ||
747 | +---+ +---+ +---+ +---+ | ||
748 | <---| |--->| |-U->| |-H->| |---> | ||
749 | --->| |<---| |<---| |<---| |<--- | ||
750 | +---+ +---+ +---+ +---+ | ||
751 | |||
752 | But it will not reset the update back to normal. Only the writer | ||
753 | that converted a pointer from HEAD to UPDATE will convert it back | ||
754 | to NORMAL. | ||
755 | |||
756 | tail page | ||
757 | | | ||
758 | v | ||
759 | +---+ +---+ +---+ +---+ | ||
760 | <---| |--->| |-U->| |-H->| |---> | ||
761 | --->| |<---| |<---| |<---| |<--- | ||
762 | +---+ +---+ +---+ +---+ | ||
763 | |||
764 | After the nested writer finishes, the outer most writer will convert | ||
765 | the UPDATE pointer to NORMAL. | ||
766 | |||
767 | |||
768 | tail page | ||
769 | | | ||
770 | v | ||
771 | +---+ +---+ +---+ +---+ | ||
772 | <---| |--->| |--->| |-H->| |---> | ||
773 | --->| |<---| |<---| |<---| |<--- | ||
774 | +---+ +---+ +---+ +---+ | ||
775 | |||
776 | |||
777 | It can be even more complex if several nested writes came in and moved | ||
778 | the tail page ahead several pages: | ||
779 | |||
780 | |||
781 | (first writer) | ||
782 | |||
783 | tail page | ||
784 | | | ||
785 | v | ||
786 | +---+ +---+ +---+ +---+ | ||
787 | <---| |--->| |-H->| |--->| |---> | ||
788 | --->| |<---| |<---| |<---| |<--- | ||
789 | +---+ +---+ +---+ +---+ | ||
790 | |||
791 | The write converts the head page pointer to UPDATE. | ||
792 | |||
793 | tail page | ||
794 | | | ||
795 | v | ||
796 | +---+ +---+ +---+ +---+ | ||
797 | <---| |--->| |-U->| |--->| |---> | ||
798 | --->| |<---| |<---| |<---| |<--- | ||
799 | +---+ +---+ +---+ +---+ | ||
800 | |||
801 | Next writer comes in, and sees the update and sets up the new | ||
802 | head page. | ||
803 | |||
804 | (second writer) | ||
805 | |||
806 | tail page | ||
807 | | | ||
808 | v | ||
809 | +---+ +---+ +---+ +---+ | ||
810 | <---| |--->| |-U->| |-H->| |---> | ||
811 | --->| |<---| |<---| |<---| |<--- | ||
812 | +---+ +---+ +---+ +---+ | ||
813 | |||
814 | The nested writer moves the tail page forward. But does not set the old | ||
815 | update page to NORMAL because it is not the outer most writer. | ||
816 | |||
817 | tail page | ||
818 | | | ||
819 | v | ||
820 | +---+ +---+ +---+ +---+ | ||
821 | <---| |--->| |-U->| |-H->| |---> | ||
822 | --->| |<---| |<---| |<---| |<--- | ||
823 | +---+ +---+ +---+ +---+ | ||
824 | |||
825 | Another writer preempts and sees the page after the tail page is a head page. | ||
826 | It changes it from HEAD to UPDATE. | ||
827 | |||
828 | (third writer) | ||
829 | |||
830 | tail page | ||
831 | | | ||
832 | v | ||
833 | +---+ +---+ +---+ +---+ | ||
834 | <---| |--->| |-U->| |-U->| |---> | ||
835 | --->| |<---| |<---| |<---| |<--- | ||
836 | +---+ +---+ +---+ +---+ | ||
837 | |||
838 | The writer will move the head page forward: | ||
839 | |||
840 | |||
841 | (third writer) | ||
842 | |||
843 | tail page | ||
844 | | | ||
845 | v | ||
846 | +---+ +---+ +---+ +---+ | ||
847 | <---| |--->| |-U->| |-U->| |-H-> | ||
848 | --->| |<---| |<---| |<---| |<--- | ||
849 | +---+ +---+ +---+ +---+ | ||
850 | |||
851 | But now that the third writer did change the HEAD flag to UPDATE it | ||
852 | will convert it to normal: | ||
853 | |||
854 | |||
855 | (third writer) | ||
856 | |||
857 | tail page | ||
858 | | | ||
859 | v | ||
860 | +---+ +---+ +---+ +---+ | ||
861 | <---| |--->| |-U->| |--->| |-H-> | ||
862 | --->| |<---| |<---| |<---| |<--- | ||
863 | +---+ +---+ +---+ +---+ | ||
864 | |||
865 | |||
866 | Then it will move the tail page, and return back to the second writer. | ||
867 | |||
868 | |||
869 | (second writer) | ||
870 | |||
871 | tail page | ||
872 | | | ||
873 | v | ||
874 | +---+ +---+ +---+ +---+ | ||
875 | <---| |--->| |-U->| |--->| |-H-> | ||
876 | --->| |<---| |<---| |<---| |<--- | ||
877 | +---+ +---+ +---+ +---+ | ||
878 | |||
879 | |||
880 | The second writer will fail to move the tail page because it was already | ||
881 | moved, so it will try again and add its data to the new tail page. | ||
882 | It will return to the first writer. | ||
883 | |||
884 | |||
885 | (first writer) | ||
886 | |||
887 | tail page | ||
888 | | | ||
889 | v | ||
890 | +---+ +---+ +---+ +---+ | ||
891 | <---| |--->| |-U->| |--->| |-H-> | ||
892 | --->| |<---| |<---| |<---| |<--- | ||
893 | +---+ +---+ +---+ +---+ | ||
894 | |||
895 | The first writer can not know atomically test if the tail page moved | ||
896 | while it updates the HEAD page. It will then update the head page to | ||
897 | what it thinks is the new head page. | ||
898 | |||
899 | |||
900 | (first writer) | ||
901 | |||
902 | tail page | ||
903 | | | ||
904 | v | ||
905 | +---+ +---+ +---+ +---+ | ||
906 | <---| |--->| |-U->| |-H->| |-H-> | ||
907 | --->| |<---| |<---| |<---| |<--- | ||
908 | +---+ +---+ +---+ +---+ | ||
909 | |||
910 | Since the cmpxchg returns the old value of the pointer the first writer | ||
911 | will see it succeeded in updating the pointer from NORMAL to HEAD. | ||
912 | But as we can see, this is not good enough. It must also check to see | ||
913 | if the tail page is either where it use to be or on the next page: | ||
914 | |||
915 | |||
916 | (first writer) | ||
917 | |||
918 | A B tail page | ||
919 | | | | | ||
920 | v v v | ||
921 | +---+ +---+ +---+ +---+ | ||
922 | <---| |--->| |-U->| |-H->| |-H-> | ||
923 | --->| |<---| |<---| |<---| |<--- | ||
924 | +---+ +---+ +---+ +---+ | ||
925 | |||
926 | If tail page != A and tail page does not equal B, then it must reset the | ||
927 | pointer back to NORMAL. The fact that it only needs to worry about | ||
928 | nested writers, it only needs to check this after setting the HEAD page. | ||
929 | |||
930 | |||
931 | (first writer) | ||
932 | |||
933 | A B tail page | ||
934 | | | | | ||
935 | v v v | ||
936 | +---+ +---+ +---+ +---+ | ||
937 | <---| |--->| |-U->| |--->| |-H-> | ||
938 | --->| |<---| |<---| |<---| |<--- | ||
939 | +---+ +---+ +---+ +---+ | ||
940 | |||
941 | Now the writer can update the head page. This is also why the head page must | ||
942 | remain in UPDATE and only reset by the outer most writer. This prevents | ||
943 | the reader from seeing the incorrect head page. | ||
944 | |||
945 | |||
946 | (first writer) | ||
947 | |||
948 | A B tail page | ||
949 | | | | | ||
950 | v v v | ||
951 | +---+ +---+ +---+ +---+ | ||
952 | <---| |--->| |--->| |--->| |-H-> | ||
953 | --->| |<---| |<---| |<---| |<--- | ||
954 | +---+ +---+ +---+ +---+ | ||
955 | |||