diff options
Diffstat (limited to 'Documentation')
35 files changed, 1939 insertions, 251 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index d05737aaa84b..06b982affe76 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX | |||
@@ -82,6 +82,8 @@ block/ | |||
82 | - info on the Block I/O (BIO) layer. | 82 | - info on the Block I/O (BIO) layer. |
83 | blockdev/ | 83 | blockdev/ |
84 | - info on block devices & drivers | 84 | - info on block devices & drivers |
85 | btmrvl.txt | ||
86 | - info on Marvell Bluetooth driver usage. | ||
85 | cachetlb.txt | 87 | cachetlb.txt |
86 | - describes the cache/TLB flushing interfaces Linux uses. | 88 | - describes the cache/TLB flushing interfaces Linux uses. |
87 | cdrom/ | 89 | cdrom/ |
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index 9f711d2df91b..d2b85237c76e 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt | |||
@@ -743,3 +743,80 @@ Revised: | |||
743 | RCU, realtime RCU, sleepable RCU, performance. | 743 | RCU, realtime RCU, sleepable RCU, performance. |
744 | " | 744 | " |
745 | } | 745 | } |
746 | |||
747 | @article{PaulEMcKenney2008RCUOSR | ||
748 | ,author="Paul E. McKenney and Jonathan Walpole" | ||
749 | ,title="Introducing technology into the {Linux} kernel: a case study" | ||
750 | ,Year="2008" | ||
751 | ,journal="SIGOPS Oper. Syst. Rev." | ||
752 | ,volume="42" | ||
753 | ,number="5" | ||
754 | ,pages="4--17" | ||
755 | ,issn="0163-5980" | ||
756 | ,doi={http://doi.acm.org/10.1145/1400097.1400099} | ||
757 | ,publisher="ACM" | ||
758 | ,address="New York, NY, USA" | ||
759 | ,annotation={ | ||
760 | Linux changed RCU to a far greater degree than RCU has changed Linux. | ||
761 | } | ||
762 | } | ||
763 | |||
764 | @unpublished{PaulEMcKenney2008HierarchicalRCU | ||
765 | ,Author="Paul E. McKenney" | ||
766 | ,Title="Hierarchical {RCU}" | ||
767 | ,month="November" | ||
768 | ,day="3" | ||
769 | ,year="2008" | ||
770 | ,note="Available: | ||
771 | \url{http://lwn.net/Articles/305782/} | ||
772 | [Viewed November 6, 2008]" | ||
773 | ,annotation=" | ||
774 | RCU with combining-tree-based grace-period detection, | ||
775 | permitting it to handle thousands of CPUs. | ||
776 | " | ||
777 | } | ||
778 | |||
779 | @conference{PaulEMcKenney2009MaliciousURCU | ||
780 | ,Author="Paul E. McKenney" | ||
781 | ,Title="Using a Malicious User-Level {RCU} to Torture {RCU}-Based Algorithms" | ||
782 | ,Booktitle="linux.conf.au 2009" | ||
783 | ,month="January" | ||
784 | ,year="2009" | ||
785 | ,address="Hobart, Australia" | ||
786 | ,note="Available: | ||
787 | \url{http://www.rdrop.com/users/paulmck/RCU/urcutorture.2009.01.22a.pdf} | ||
788 | [Viewed February 2, 2009]" | ||
789 | ,annotation=" | ||
790 | Realtime RCU and torture-testing RCU uses. | ||
791 | " | ||
792 | } | ||
793 | |||
794 | @unpublished{MathieuDesnoyers2009URCU | ||
795 | ,Author="Mathieu Desnoyers" | ||
796 | ,Title="[{RFC} git tree] Userspace {RCU} (urcu) for {Linux}" | ||
797 | ,month="February" | ||
798 | ,day="5" | ||
799 | ,year="2009" | ||
800 | ,note="Available: | ||
801 | \url{http://lkml.org/lkml/2009/2/5/572} | ||
802 | \url{git://lttng.org/userspace-rcu.git} | ||
803 | [Viewed February 20, 2009]" | ||
804 | ,annotation=" | ||
805 | Mathieu Desnoyers's user-space RCU implementation. | ||
806 | git://lttng.org/userspace-rcu.git | ||
807 | " | ||
808 | } | ||
809 | |||
810 | @unpublished{PaulEMcKenney2009BloatWatchRCU | ||
811 | ,Author="Paul E. McKenney" | ||
812 | ,Title="{RCU}: The {Bloatwatch} Edition" | ||
813 | ,month="March" | ||
814 | ,day="17" | ||
815 | ,year="2009" | ||
816 | ,note="Available: | ||
817 | \url{http://lwn.net/Articles/323929/} | ||
818 | [Viewed March 20, 2009]" | ||
819 | ,annotation=" | ||
820 | Uniprocessor assumptions allow simplified RCU implementation. | ||
821 | " | ||
822 | } | ||
diff --git a/Documentation/RCU/UP.txt b/Documentation/RCU/UP.txt index aab4a9ec3931..90ec5341ee98 100644 --- a/Documentation/RCU/UP.txt +++ b/Documentation/RCU/UP.txt | |||
@@ -2,14 +2,13 @@ RCU on Uniprocessor Systems | |||
2 | 2 | ||
3 | 3 | ||
4 | A common misconception is that, on UP systems, the call_rcu() primitive | 4 | A common misconception is that, on UP systems, the call_rcu() primitive |
5 | may immediately invoke its function, and that the synchronize_rcu() | 5 | may immediately invoke its function. The basis of this misconception |
6 | primitive may return immediately. The basis of this misconception | ||
7 | is that since there is only one CPU, it should not be necessary to | 6 | is that since there is only one CPU, it should not be necessary to |
8 | wait for anything else to get done, since there are no other CPUs for | 7 | wait for anything else to get done, since there are no other CPUs for |
9 | anything else to be happening on. Although this approach will -sort- -of- | 8 | anything else to be happening on. Although this approach will -sort- -of- |
10 | work a surprising amount of the time, it is a very bad idea in general. | 9 | work a surprising amount of the time, it is a very bad idea in general. |
11 | This document presents three examples that demonstrate exactly how bad an | 10 | This document presents three examples that demonstrate exactly how bad |
12 | idea this is. | 11 | an idea this is. |
13 | 12 | ||
14 | 13 | ||
15 | Example 1: softirq Suicide | 14 | Example 1: softirq Suicide |
@@ -82,11 +81,18 @@ Quick Quiz #2: What locking restriction must RCU callbacks respect? | |||
82 | 81 | ||
83 | Summary | 82 | Summary |
84 | 83 | ||
85 | Permitting call_rcu() to immediately invoke its arguments or permitting | 84 | Permitting call_rcu() to immediately invoke its arguments breaks RCU, |
86 | synchronize_rcu() to immediately return breaks RCU, even on a UP system. | 85 | even on a UP system. So do not do it! Even on a UP system, the RCU |
87 | So do not do it! Even on a UP system, the RCU infrastructure -must- | 86 | infrastructure -must- respect grace periods, and -must- invoke callbacks |
88 | respect grace periods, and -must- invoke callbacks from a known environment | 87 | from a known environment in which no locks are held. |
89 | in which no locks are held. | 88 | |
89 | It -is- safe for synchronize_sched() and synchronize_rcu_bh() to return | ||
90 | immediately on an UP system. It is also safe for synchronize_rcu() | ||
91 | to return immediately on UP systems, except when running preemptable | ||
92 | RCU. | ||
93 | |||
94 | Quick Quiz #3: Why can't synchronize_rcu() return immediately on | ||
95 | UP systems running preemptable RCU? | ||
90 | 96 | ||
91 | 97 | ||
92 | Answer to Quick Quiz #1: | 98 | Answer to Quick Quiz #1: |
@@ -117,3 +123,13 @@ Answer to Quick Quiz #2: | |||
117 | callbacks acquire locks directly. However, a great many RCU | 123 | callbacks acquire locks directly. However, a great many RCU |
118 | callbacks do acquire locks -indirectly-, for example, via | 124 | callbacks do acquire locks -indirectly-, for example, via |
119 | the kfree() primitive. | 125 | the kfree() primitive. |
126 | |||
127 | Answer to Quick Quiz #3: | ||
128 | Why can't synchronize_rcu() return immediately on UP systems | ||
129 | running preemptable RCU? | ||
130 | |||
131 | Because some other task might have been preempted in the middle | ||
132 | of an RCU read-side critical section. If synchronize_rcu() | ||
133 | simply immediately returned, it would prematurely signal the | ||
134 | end of the grace period, which would come as a nasty shock to | ||
135 | that other thread when it started running again. | ||
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index accfe2f5247d..51525a30e8b4 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt | |||
@@ -11,7 +11,10 @@ over a rather long period of time, but improvements are always welcome! | |||
11 | structure is updated more than about 10% of the time, then | 11 | structure is updated more than about 10% of the time, then |
12 | you should strongly consider some other approach, unless | 12 | you should strongly consider some other approach, unless |
13 | detailed performance measurements show that RCU is nonetheless | 13 | detailed performance measurements show that RCU is nonetheless |
14 | the right tool for the job. | 14 | the right tool for the job. Yes, you might think of RCU |
15 | as simply cutting overhead off of the readers and imposing it | ||
16 | on the writers. That is exactly why normal uses of RCU will | ||
17 | do much more reading than updating. | ||
15 | 18 | ||
16 | Another exception is where performance is not an issue, and RCU | 19 | Another exception is where performance is not an issue, and RCU |
17 | provides a simpler implementation. An example of this situation | 20 | provides a simpler implementation. An example of this situation |
@@ -240,10 +243,11 @@ over a rather long period of time, but improvements are always welcome! | |||
240 | instead need to use synchronize_irq() or synchronize_sched(). | 243 | instead need to use synchronize_irq() or synchronize_sched(). |
241 | 244 | ||
242 | 12. Any lock acquired by an RCU callback must be acquired elsewhere | 245 | 12. Any lock acquired by an RCU callback must be acquired elsewhere |
243 | with irq disabled, e.g., via spin_lock_irqsave(). Failing to | 246 | with softirq disabled, e.g., via spin_lock_irqsave(), |
244 | disable irq on a given acquisition of that lock will result in | 247 | spin_lock_bh(), etc. Failing to disable irq on a given |
245 | deadlock as soon as the RCU callback happens to interrupt that | 248 | acquisition of that lock will result in deadlock as soon as the |
246 | acquisition's critical section. | 249 | RCU callback happens to interrupt that acquisition's critical |
250 | section. | ||
247 | 251 | ||
248 | 13. RCU callbacks can be and are executed in parallel. In many cases, | 252 | 13. RCU callbacks can be and are executed in parallel. In many cases, |
249 | the callback code simply wrappers around kfree(), so that this | 253 | the callback code simply wrappers around kfree(), so that this |
@@ -310,3 +314,9 @@ over a rather long period of time, but improvements are always welcome! | |||
310 | Because these primitives only wait for pre-existing readers, | 314 | Because these primitives only wait for pre-existing readers, |
311 | it is the caller's responsibility to guarantee safety to | 315 | it is the caller's responsibility to guarantee safety to |
312 | any subsequent readers. | 316 | any subsequent readers. |
317 | |||
318 | 16. The various RCU read-side primitives do -not- contain memory | ||
319 | barriers. The CPU (and in some cases, the compiler) is free | ||
320 | to reorder code into and out of RCU read-side critical sections. | ||
321 | It is the responsibility of the RCU update-side primitives to | ||
322 | deal with this. | ||
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt index 7aa2002ade77..2a23523ce471 100644 --- a/Documentation/RCU/rcu.txt +++ b/Documentation/RCU/rcu.txt | |||
@@ -36,7 +36,7 @@ o How can the updater tell when a grace period has completed | |||
36 | executed in user mode, or executed in the idle loop, we can | 36 | executed in user mode, or executed in the idle loop, we can |
37 | safely free up that item. | 37 | safely free up that item. |
38 | 38 | ||
39 | Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the | 39 | Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the |
40 | same effect, but require that the readers manipulate CPU-local | 40 | same effect, but require that the readers manipulate CPU-local |
41 | counters. These counters allow limited types of blocking | 41 | counters. These counters allow limited types of blocking |
42 | within RCU read-side critical sections. SRCU also uses | 42 | within RCU read-side critical sections. SRCU also uses |
@@ -79,10 +79,10 @@ o I hear that RCU is patented? What is with that? | |||
79 | o I hear that RCU needs work in order to support realtime kernels? | 79 | o I hear that RCU needs work in order to support realtime kernels? |
80 | 80 | ||
81 | This work is largely completed. Realtime-friendly RCU can be | 81 | This work is largely completed. Realtime-friendly RCU can be |
82 | enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter. | 82 | enabled via the CONFIG_TREE_PREEMPT_RCU kernel configuration |
83 | However, work is in progress for enabling priority boosting of | 83 | parameter. However, work is in progress for enabling priority |
84 | preempted RCU read-side critical sections. This is needed if you | 84 | boosting of preempted RCU read-side critical sections. This is |
85 | have CPU-bound realtime threads. | 85 | needed if you have CPU-bound realtime threads. |
86 | 86 | ||
87 | o Where can I find more information on RCU? | 87 | o Where can I find more information on RCU? |
88 | 88 | ||
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt index 909602d409bb..e439a0edee22 100644 --- a/Documentation/RCU/rcubarrier.txt +++ b/Documentation/RCU/rcubarrier.txt | |||
@@ -170,6 +170,13 @@ module invokes call_rcu() from timers, you will need to first cancel all | |||
170 | the timers, and only then invoke rcu_barrier() to wait for any remaining | 170 | the timers, and only then invoke rcu_barrier() to wait for any remaining |
171 | RCU callbacks to complete. | 171 | RCU callbacks to complete. |
172 | 172 | ||
173 | Of course, if you module uses call_rcu_bh(), you will need to invoke | ||
174 | rcu_barrier_bh() before unloading. Similarly, if your module uses | ||
175 | call_rcu_sched(), you will need to invoke rcu_barrier_sched() before | ||
176 | unloading. If your module uses call_rcu(), call_rcu_bh(), -and- | ||
177 | call_rcu_sched(), then you will need to invoke each of rcu_barrier(), | ||
178 | rcu_barrier_bh(), and rcu_barrier_sched(). | ||
179 | |||
173 | 180 | ||
174 | Implementing rcu_barrier() | 181 | Implementing rcu_barrier() |
175 | 182 | ||
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index a342b6e1cc10..9dba3bb90e60 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt | |||
@@ -76,8 +76,10 @@ torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API, | |||
76 | "rcu_sync" for rcu_read_lock() with synchronous reclamation, | 76 | "rcu_sync" for rcu_read_lock() with synchronous reclamation, |
77 | "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for | 77 | "rcu_bh" for the rcu_read_lock_bh() API, "rcu_bh_sync" for |
78 | rcu_read_lock_bh() with synchronous reclamation, "srcu" for | 78 | rcu_read_lock_bh() with synchronous reclamation, "srcu" for |
79 | the "srcu_read_lock()" API, and "sched" for the use of | 79 | the "srcu_read_lock()" API, "sched" for the use of |
80 | preempt_disable() together with synchronize_sched(). | 80 | preempt_disable() together with synchronize_sched(), |
81 | and "sched_expedited" for the use of preempt_disable() | ||
82 | with synchronize_sched_expedited(). | ||
81 | 83 | ||
82 | verbose Enable debug printk()s. Default is disabled. | 84 | verbose Enable debug printk()s. Default is disabled. |
83 | 85 | ||
@@ -162,6 +164,23 @@ of the "old" and "current" counters for the corresponding CPU. The | |||
162 | "idx" value maps the "old" and "current" values to the underlying array, | 164 | "idx" value maps the "old" and "current" values to the underlying array, |
163 | and is useful for debugging. | 165 | and is useful for debugging. |
164 | 166 | ||
167 | Similarly, sched_expedited RCU provides the following: | ||
168 | |||
169 | sched_expedited-torture: rtc: d0000000016c1880 ver: 1090796 tfle: 0 rta: 1090796 rtaf: 0 rtf: 1090787 rtmbe: 0 nt: 27713319 | ||
170 | sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 | ||
171 | sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 | ||
172 | sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 | ||
173 | state: -1 / 0:0 3:0 4:0 | ||
174 | |||
175 | As before, the first four lines are similar to those for RCU. | ||
176 | The last line shows the task-migration state. The first number is | ||
177 | -1 if synchronize_sched_expedited() is idle, -2 if in the process of | ||
178 | posting wakeups to the migration kthreads, and N when waiting on CPU N. | ||
179 | Each of the colon-separated fields following the "/" is a CPU:state pair. | ||
180 | Valid states are "0" for idle, "1" for waiting for quiescent state, | ||
181 | "2" for passed through quiescent state, and "3" when a race with a | ||
182 | CPU-hotplug event forces use of the synchronize_sched() primitive. | ||
183 | |||
165 | 184 | ||
166 | USAGE | 185 | USAGE |
167 | 186 | ||
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 02cced183b2d..187bbf10c923 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt | |||
@@ -191,8 +191,7 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy). | |||
191 | 191 | ||
192 | The output of "cat rcu/rcudata" looks as follows: | 192 | The output of "cat rcu/rcudata" looks as follows: |
193 | 193 | ||
194 | rcu: | 194 | rcu_sched: |
195 | rcu: | ||
196 | 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 | 195 | 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 |
197 | 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 | 196 | 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 |
198 | 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 | 197 | 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 |
@@ -306,7 +305,7 @@ comma-separated-variable spreadsheet format. | |||
306 | 305 | ||
307 | The output of "cat rcu/rcugp" looks as follows: | 306 | The output of "cat rcu/rcugp" looks as follows: |
308 | 307 | ||
309 | rcu: completed=33062 gpnum=33063 | 308 | rcu_sched: completed=33062 gpnum=33063 |
310 | rcu_bh: completed=464 gpnum=464 | 309 | rcu_bh: completed=464 gpnum=464 |
311 | 310 | ||
312 | Again, this output is for both "rcu" and "rcu_bh". The fields are | 311 | Again, this output is for both "rcu" and "rcu_bh". The fields are |
@@ -413,7 +412,7 @@ o Each element of the form "1/1 0:127 ^0" represents one struct | |||
413 | 412 | ||
414 | The output of "cat rcu/rcu_pending" looks as follows: | 413 | The output of "cat rcu/rcu_pending" looks as follows: |
415 | 414 | ||
416 | rcu: | 415 | rcu_sched: |
417 | 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 | 416 | 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 |
418 | 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 | 417 | 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 |
419 | 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 | 418 | 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 |
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 96170824a717..e41a7fecf0d3 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt | |||
@@ -136,10 +136,10 @@ rcu_read_lock() | |||
136 | Used by a reader to inform the reclaimer that the reader is | 136 | Used by a reader to inform the reclaimer that the reader is |
137 | entering an RCU read-side critical section. It is illegal | 137 | entering an RCU read-side critical section. It is illegal |
138 | to block while in an RCU read-side critical section, though | 138 | to block while in an RCU read-side critical section, though |
139 | kernels built with CONFIG_PREEMPT_RCU can preempt RCU read-side | 139 | kernels built with CONFIG_TREE_PREEMPT_RCU can preempt RCU |
140 | critical sections. Any RCU-protected data structure accessed | 140 | read-side critical sections. Any RCU-protected data structure |
141 | during an RCU read-side critical section is guaranteed to remain | 141 | accessed during an RCU read-side critical section is guaranteed to |
142 | unreclaimed for the full duration of that critical section. | 142 | remain unreclaimed for the full duration of that critical section. |
143 | Reference counts may be used in conjunction with RCU to maintain | 143 | Reference counts may be used in conjunction with RCU to maintain |
144 | longer-term references to data structures. | 144 | longer-term references to data structures. |
145 | 145 | ||
@@ -785,6 +785,7 @@ RCU pointer/list traversal: | |||
785 | rcu_dereference | 785 | rcu_dereference |
786 | list_for_each_entry_rcu | 786 | list_for_each_entry_rcu |
787 | hlist_for_each_entry_rcu | 787 | hlist_for_each_entry_rcu |
788 | hlist_nulls_for_each_entry_rcu | ||
788 | 789 | ||
789 | list_for_each_continue_rcu (to be deprecated in favor of new | 790 | list_for_each_continue_rcu (to be deprecated in favor of new |
790 | list_for_each_entry_continue_rcu) | 791 | list_for_each_entry_continue_rcu) |
@@ -807,19 +808,23 @@ RCU: Critical sections Grace period Barrier | |||
807 | 808 | ||
808 | rcu_read_lock synchronize_net rcu_barrier | 809 | rcu_read_lock synchronize_net rcu_barrier |
809 | rcu_read_unlock synchronize_rcu | 810 | rcu_read_unlock synchronize_rcu |
811 | synchronize_rcu_expedited | ||
810 | call_rcu | 812 | call_rcu |
811 | 813 | ||
812 | 814 | ||
813 | bh: Critical sections Grace period Barrier | 815 | bh: Critical sections Grace period Barrier |
814 | 816 | ||
815 | rcu_read_lock_bh call_rcu_bh rcu_barrier_bh | 817 | rcu_read_lock_bh call_rcu_bh rcu_barrier_bh |
816 | rcu_read_unlock_bh | 818 | rcu_read_unlock_bh synchronize_rcu_bh |
819 | synchronize_rcu_bh_expedited | ||
817 | 820 | ||
818 | 821 | ||
819 | sched: Critical sections Grace period Barrier | 822 | sched: Critical sections Grace period Barrier |
820 | 823 | ||
821 | [preempt_disable] synchronize_sched rcu_barrier_sched | 824 | rcu_read_lock_sched synchronize_sched rcu_barrier_sched |
822 | [and friends] call_rcu_sched | 825 | rcu_read_unlock_sched call_rcu_sched |
826 | [preempt_disable] synchronize_sched_expedited | ||
827 | [and friends] | ||
823 | 828 | ||
824 | 829 | ||
825 | SRCU: Critical sections Grace period Barrier | 830 | SRCU: Critical sections Grace period Barrier |
@@ -827,6 +832,9 @@ SRCU: Critical sections Grace period Barrier | |||
827 | srcu_read_lock synchronize_srcu N/A | 832 | srcu_read_lock synchronize_srcu N/A |
828 | srcu_read_unlock | 833 | srcu_read_unlock |
829 | 834 | ||
835 | SRCU: Initialization/cleanup | ||
836 | init_srcu_struct | ||
837 | cleanup_srcu_struct | ||
830 | 838 | ||
831 | See the comment headers in the source code (or the docbook generated | 839 | See the comment headers in the source code (or the docbook generated |
832 | from them) for more information. | 840 | from them) for more information. |
diff --git a/Documentation/btmrvl.txt b/Documentation/btmrvl.txt new file mode 100644 index 000000000000..34916a46c099 --- /dev/null +++ b/Documentation/btmrvl.txt | |||
@@ -0,0 +1,119 @@ | |||
1 | ======================================================================= | ||
2 | README for btmrvl driver | ||
3 | ======================================================================= | ||
4 | |||
5 | |||
6 | All commands are used via debugfs interface. | ||
7 | |||
8 | ===================== | ||
9 | Set/get driver configurations: | ||
10 | |||
11 | Path: /debug/btmrvl/config/ | ||
12 | |||
13 | gpiogap=[n] | ||
14 | hscfgcmd | ||
15 | These commands are used to configure the host sleep parameters. | ||
16 | bit 8:0 -- Gap | ||
17 | bit 16:8 -- GPIO | ||
18 | |||
19 | where GPIO is the pin number of GPIO used to wake up the host. | ||
20 | It could be any valid GPIO pin# (e.g. 0-7) or 0xff (SDIO interface | ||
21 | wakeup will be used instead). | ||
22 | |||
23 | where Gap is the gap in milli seconds between wakeup signal and | ||
24 | wakeup event, or 0xff for special host sleep setting. | ||
25 | |||
26 | Usage: | ||
27 | # Use SDIO interface to wake up the host and set GAP to 0x80: | ||
28 | echo 0xff80 > /debug/btmrvl/config/gpiogap | ||
29 | echo 1 > /debug/btmrvl/config/hscfgcmd | ||
30 | |||
31 | # Use GPIO pin #3 to wake up the host and set GAP to 0xff: | ||
32 | echo 0x03ff > /debug/btmrvl/config/gpiogap | ||
33 | echo 1 > /debug/btmrvl/config/hscfgcmd | ||
34 | |||
35 | psmode=[n] | ||
36 | pscmd | ||
37 | These commands are used to enable/disable auto sleep mode | ||
38 | |||
39 | where the option is: | ||
40 | 1 -- Enable auto sleep mode | ||
41 | 0 -- Disable auto sleep mode | ||
42 | |||
43 | Usage: | ||
44 | # Enable auto sleep mode | ||
45 | echo 1 > /debug/btmrvl/config/psmode | ||
46 | echo 1 > /debug/btmrvl/config/pscmd | ||
47 | |||
48 | # Disable auto sleep mode | ||
49 | echo 0 > /debug/btmrvl/config/psmode | ||
50 | echo 1 > /debug/btmrvl/config/pscmd | ||
51 | |||
52 | |||
53 | hsmode=[n] | ||
54 | hscmd | ||
55 | These commands are used to enable host sleep or wake up firmware | ||
56 | |||
57 | where the option is: | ||
58 | 1 -- Enable host sleep | ||
59 | 0 -- Wake up firmware | ||
60 | |||
61 | Usage: | ||
62 | # Enable host sleep | ||
63 | echo 1 > /debug/btmrvl/config/hsmode | ||
64 | echo 1 > /debug/btmrvl/config/hscmd | ||
65 | |||
66 | # Wake up firmware | ||
67 | echo 0 > /debug/btmrvl/config/hsmode | ||
68 | echo 1 > /debug/btmrvl/config/hscmd | ||
69 | |||
70 | |||
71 | ====================== | ||
72 | Get driver status: | ||
73 | |||
74 | Path: /debug/btmrvl/status/ | ||
75 | |||
76 | Usage: | ||
77 | cat /debug/btmrvl/status/<args> | ||
78 | |||
79 | where the args are: | ||
80 | |||
81 | curpsmode | ||
82 | This command displays current auto sleep status. | ||
83 | |||
84 | psstate | ||
85 | This command display the power save state. | ||
86 | |||
87 | hsstate | ||
88 | This command display the host sleep state. | ||
89 | |||
90 | txdnldrdy | ||
91 | This command displays the value of Tx download ready flag. | ||
92 | |||
93 | |||
94 | ===================== | ||
95 | |||
96 | Use hcitool to issue raw hci command, refer to hcitool manual | ||
97 | |||
98 | Usage: Hcitool cmd <ogf> <ocf> [Parameters] | ||
99 | |||
100 | Interface Control Command | ||
101 | hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00 --Enable All interface | ||
102 | hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01 --Enable Wlan interface | ||
103 | hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02 --Enable BT interface | ||
104 | hcitool cmd 0x3f 0x5b 0xf5 0x00 0x00 --Disable All interface | ||
105 | hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01 --Disable Wlan interface | ||
106 | hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02 --Disable BT interface | ||
107 | |||
108 | ======================================================================= | ||
109 | |||
110 | |||
111 | SD8688 firmware: | ||
112 | |||
113 | /lib/firmware/sd8688_helper.bin | ||
114 | /lib/firmware/sd8688.bin | ||
115 | |||
116 | |||
117 | The images can be downloaded from: | ||
118 | |||
119 | git.infradead.org/users/dwmw2/linux-firmware.git/libertas/ | ||
diff --git a/Documentation/connector/Makefile b/Documentation/connector/Makefile index 8df1a7285a06..d98e4df98e24 100644 --- a/Documentation/connector/Makefile +++ b/Documentation/connector/Makefile | |||
@@ -9,3 +9,8 @@ hostprogs-y := ucon | |||
9 | always := $(hostprogs-y) | 9 | always := $(hostprogs-y) |
10 | 10 | ||
11 | HOSTCFLAGS_ucon.o += -I$(objtree)/usr/include | 11 | HOSTCFLAGS_ucon.o += -I$(objtree)/usr/include |
12 | |||
13 | all: modules | ||
14 | |||
15 | modules clean: | ||
16 | $(MAKE) -C ../.. SUBDIRS=$(PWD) $@ | ||
diff --git a/Documentation/connector/cn_test.c b/Documentation/connector/cn_test.c index 6a5be5d5c8e4..1711adc33373 100644 --- a/Documentation/connector/cn_test.c +++ b/Documentation/connector/cn_test.c | |||
@@ -19,6 +19,8 @@ | |||
19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
20 | */ | 20 | */ |
21 | 21 | ||
22 | #define pr_fmt(fmt) "cn_test: " fmt | ||
23 | |||
22 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
23 | #include <linux/module.h> | 25 | #include <linux/module.h> |
24 | #include <linux/moduleparam.h> | 26 | #include <linux/moduleparam.h> |
@@ -27,18 +29,17 @@ | |||
27 | 29 | ||
28 | #include <linux/connector.h> | 30 | #include <linux/connector.h> |
29 | 31 | ||
30 | static struct cb_id cn_test_id = { 0x123, 0x456 }; | 32 | static struct cb_id cn_test_id = { CN_NETLINK_USERS + 3, 0x456 }; |
31 | static char cn_test_name[] = "cn_test"; | 33 | static char cn_test_name[] = "cn_test"; |
32 | static struct sock *nls; | 34 | static struct sock *nls; |
33 | static struct timer_list cn_test_timer; | 35 | static struct timer_list cn_test_timer; |
34 | 36 | ||
35 | void cn_test_callback(void *data) | 37 | static void cn_test_callback(struct cn_msg *msg) |
36 | { | 38 | { |
37 | struct cn_msg *msg = (struct cn_msg *)data; | 39 | pr_info("%s: %lu: idx=%x, val=%x, seq=%u, ack=%u, len=%d: %s.\n", |
38 | 40 | __func__, jiffies, msg->id.idx, msg->id.val, | |
39 | printk("%s: %lu: idx=%x, val=%x, seq=%u, ack=%u, len=%d: %s.\n", | 41 | msg->seq, msg->ack, msg->len, |
40 | __func__, jiffies, msg->id.idx, msg->id.val, | 42 | msg->len ? (char *)msg->data : ""); |
41 | msg->seq, msg->ack, msg->len, (char *)msg->data); | ||
42 | } | 43 | } |
43 | 44 | ||
44 | /* | 45 | /* |
@@ -63,9 +64,7 @@ static int cn_test_want_notify(void) | |||
63 | 64 | ||
64 | skb = alloc_skb(size, GFP_ATOMIC); | 65 | skb = alloc_skb(size, GFP_ATOMIC); |
65 | if (!skb) { | 66 | if (!skb) { |
66 | printk(KERN_ERR "Failed to allocate new skb with size=%u.\n", | 67 | pr_err("failed to allocate new skb with size=%u\n", size); |
67 | size); | ||
68 | |||
69 | return -ENOMEM; | 68 | return -ENOMEM; |
70 | } | 69 | } |
71 | 70 | ||
@@ -114,12 +113,12 @@ static int cn_test_want_notify(void) | |||
114 | //netlink_broadcast(nls, skb, 0, ctl->group, GFP_ATOMIC); | 113 | //netlink_broadcast(nls, skb, 0, ctl->group, GFP_ATOMIC); |
115 | netlink_unicast(nls, skb, 0, 0); | 114 | netlink_unicast(nls, skb, 0, 0); |
116 | 115 | ||
117 | printk(KERN_INFO "Request was sent. Group=0x%x.\n", ctl->group); | 116 | pr_info("request was sent: group=0x%x\n", ctl->group); |
118 | 117 | ||
119 | return 0; | 118 | return 0; |
120 | 119 | ||
121 | nlmsg_failure: | 120 | nlmsg_failure: |
122 | printk(KERN_ERR "Failed to send %u.%u\n", msg->seq, msg->ack); | 121 | pr_err("failed to send %u.%u\n", msg->seq, msg->ack); |
123 | kfree_skb(skb); | 122 | kfree_skb(skb); |
124 | return -EINVAL; | 123 | return -EINVAL; |
125 | } | 124 | } |
@@ -131,6 +130,8 @@ static void cn_test_timer_func(unsigned long __data) | |||
131 | struct cn_msg *m; | 130 | struct cn_msg *m; |
132 | char data[32]; | 131 | char data[32]; |
133 | 132 | ||
133 | pr_debug("%s: timer fired with data %lu\n", __func__, __data); | ||
134 | |||
134 | m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC); | 135 | m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC); |
135 | if (m) { | 136 | if (m) { |
136 | 137 | ||
@@ -150,7 +151,7 @@ static void cn_test_timer_func(unsigned long __data) | |||
150 | 151 | ||
151 | cn_test_timer_counter++; | 152 | cn_test_timer_counter++; |
152 | 153 | ||
153 | mod_timer(&cn_test_timer, jiffies + HZ); | 154 | mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000)); |
154 | } | 155 | } |
155 | 156 | ||
156 | static int cn_test_init(void) | 157 | static int cn_test_init(void) |
@@ -168,8 +169,10 @@ static int cn_test_init(void) | |||
168 | } | 169 | } |
169 | 170 | ||
170 | setup_timer(&cn_test_timer, cn_test_timer_func, 0); | 171 | setup_timer(&cn_test_timer, cn_test_timer_func, 0); |
171 | cn_test_timer.expires = jiffies + HZ; | 172 | mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000)); |
172 | add_timer(&cn_test_timer); | 173 | |
174 | pr_info("initialized with id={%u.%u}\n", | ||
175 | cn_test_id.idx, cn_test_id.val); | ||
173 | 176 | ||
174 | return 0; | 177 | return 0; |
175 | 178 | ||
diff --git a/Documentation/connector/connector.txt b/Documentation/connector/connector.txt index ad6e0ba7b38c..81e6bf6ead57 100644 --- a/Documentation/connector/connector.txt +++ b/Documentation/connector/connector.txt | |||
@@ -5,10 +5,10 @@ Kernel Connector. | |||
5 | Kernel connector - new netlink based userspace <-> kernel space easy | 5 | Kernel connector - new netlink based userspace <-> kernel space easy |
6 | to use communication module. | 6 | to use communication module. |
7 | 7 | ||
8 | Connector driver adds possibility to connect various agents using | 8 | The Connector driver makes it easy to connect various agents using a |
9 | netlink based network. One must register callback and | 9 | netlink based network. One must register a callback and an identifier. |
10 | identifier. When driver receives special netlink message with | 10 | When the driver receives a special netlink message with the appropriate |
11 | appropriate identifier, appropriate callback will be called. | 11 | identifier, the appropriate callback will be called. |
12 | 12 | ||
13 | From the userspace point of view it's quite straightforward: | 13 | From the userspace point of view it's quite straightforward: |
14 | 14 | ||
@@ -17,10 +17,10 @@ From the userspace point of view it's quite straightforward: | |||
17 | send(); | 17 | send(); |
18 | recv(); | 18 | recv(); |
19 | 19 | ||
20 | But if kernelspace want to use full power of such connections, driver | 20 | But if kernelspace wants to use the full power of such connections, the |
21 | writer must create special sockets, must know about struct sk_buff | 21 | driver writer must create special sockets, must know about struct sk_buff |
22 | handling... Connector allows any kernelspace agents to use netlink | 22 | handling, etc... The Connector driver allows any kernelspace agents to use |
23 | based networking for inter-process communication in a significantly | 23 | netlink based networking for inter-process communication in a significantly |
24 | easier way: | 24 | easier way: |
25 | 25 | ||
26 | int cn_add_callback(struct cb_id *id, char *name, void (*callback) (void *)); | 26 | int cn_add_callback(struct cb_id *id, char *name, void (*callback) (void *)); |
@@ -32,15 +32,15 @@ struct cb_id | |||
32 | __u32 val; | 32 | __u32 val; |
33 | }; | 33 | }; |
34 | 34 | ||
35 | idx and val are unique identifiers which must be registered in | 35 | idx and val are unique identifiers which must be registered in the |
36 | connector.h for in-kernel usage. void (*callback) (void *) - is a | 36 | connector.h header for in-kernel usage. void (*callback) (void *) is a |
37 | callback function which will be called when message with above idx.val | 37 | callback function which will be called when a message with above idx.val |
38 | will be received by connector core. Argument for that function must | 38 | is received by the connector core. The argument for that function must |
39 | be dereferenced to struct cn_msg *. | 39 | be dereferenced to struct cn_msg *. |
40 | 40 | ||
41 | struct cn_msg | 41 | struct cn_msg |
42 | { | 42 | { |
43 | struct cb_id id; | 43 | struct cb_id id; |
44 | 44 | ||
45 | __u32 seq; | 45 | __u32 seq; |
46 | __u32 ack; | 46 | __u32 ack; |
@@ -55,92 +55,95 @@ Connector interfaces. | |||
55 | 55 | ||
56 | int cn_add_callback(struct cb_id *id, char *name, void (*callback) (void *)); | 56 | int cn_add_callback(struct cb_id *id, char *name, void (*callback) (void *)); |
57 | 57 | ||
58 | Registers new callback with connector core. | 58 | Registers new callback with connector core. |
59 | 59 | ||
60 | struct cb_id *id - unique connector's user identifier. | 60 | struct cb_id *id - unique connector's user identifier. |
61 | It must be registered in connector.h for legal in-kernel users. | 61 | It must be registered in connector.h for legal in-kernel users. |
62 | char *name - connector's callback symbolic name. | 62 | char *name - connector's callback symbolic name. |
63 | void (*callback) (void *) - connector's callback. | 63 | void (*callback) (void *) - connector's callback. |
64 | Argument must be dereferenced to struct cn_msg *. | 64 | Argument must be dereferenced to struct cn_msg *. |
65 | 65 | ||
66 | |||
66 | void cn_del_callback(struct cb_id *id); | 67 | void cn_del_callback(struct cb_id *id); |
67 | 68 | ||
68 | Unregisters new callback with connector core. | 69 | Unregisters new callback with connector core. |
70 | |||
71 | struct cb_id *id - unique connector's user identifier. | ||
69 | 72 | ||
70 | struct cb_id *id - unique connector's user identifier. | ||
71 | 73 | ||
72 | int cn_netlink_send(struct cn_msg *msg, u32 __groups, int gfp_mask); | 74 | int cn_netlink_send(struct cn_msg *msg, u32 __groups, int gfp_mask); |
73 | 75 | ||
74 | Sends message to the specified groups. It can be safely called from | 76 | Sends message to the specified groups. It can be safely called from |
75 | softirq context, but may silently fail under strong memory pressure. | 77 | softirq context, but may silently fail under strong memory pressure. |
76 | If there are no listeners for given group -ESRCH can be returned. | 78 | If there are no listeners for given group -ESRCH can be returned. |
77 | 79 | ||
78 | struct cn_msg * - message header(with attached data). | 80 | struct cn_msg * - message header(with attached data). |
79 | u32 __group - destination group. | 81 | u32 __group - destination group. |
80 | If __group is zero, then appropriate group will | 82 | If __group is zero, then appropriate group will |
81 | be searched through all registered connector users, | 83 | be searched through all registered connector users, |
82 | and message will be delivered to the group which was | 84 | and message will be delivered to the group which was |
83 | created for user with the same ID as in msg. | 85 | created for user with the same ID as in msg. |
84 | If __group is not zero, then message will be delivered | 86 | If __group is not zero, then message will be delivered |
85 | to the specified group. | 87 | to the specified group. |
86 | int gfp_mask - GFP mask. | 88 | int gfp_mask - GFP mask. |
87 | 89 | ||
88 | Note: When registering new callback user, connector core assigns | 90 | Note: When registering new callback user, connector core assigns |
89 | netlink group to the user which is equal to it's id.idx. | 91 | netlink group to the user which is equal to it's id.idx. |
90 | 92 | ||
91 | /*****************************************/ | 93 | /*****************************************/ |
92 | Protocol description. | 94 | Protocol description. |
93 | /*****************************************/ | 95 | /*****************************************/ |
94 | 96 | ||
95 | Current offers transport layer with fixed header. Recommended | 97 | The current framework offers a transport layer with fixed headers. The |
96 | protocol which uses such header is following: | 98 | recommended protocol which uses such a header is as following: |
97 | 99 | ||
98 | msg->seq and msg->ack are used to determine message genealogy. When | 100 | msg->seq and msg->ack are used to determine message genealogy. When |
99 | someone sends message it puts there locally unique sequence and random | 101 | someone sends a message, they use a locally unique sequence and random |
100 | acknowledge numbers. Sequence number may be copied into | 102 | acknowledge number. The sequence number may be copied into |
101 | nlmsghdr->nlmsg_seq too. | 103 | nlmsghdr->nlmsg_seq too. |
102 | 104 | ||
103 | Sequence number is incremented with each message to be sent. | 105 | The sequence number is incremented with each message sent. |
104 | 106 | ||
105 | If we expect reply to our message, then sequence number in received | 107 | If you expect a reply to the message, then the sequence number in the |
106 | message MUST be the same as in original message, and acknowledge | 108 | received message MUST be the same as in the original message, and the |
107 | number MUST be the same + 1. | 109 | acknowledge number MUST be the same + 1. |
108 | 110 | ||
109 | If we receive message and it's sequence number is not equal to one we | 111 | If we receive a message and its sequence number is not equal to one we |
110 | are expecting, then it is new message. If we receive message and it's | 112 | are expecting, then it is a new message. If we receive a message and |
111 | sequence number is the same as one we are expecting, but it's | 113 | its sequence number is the same as one we are expecting, but its |
112 | acknowledge is not equal acknowledge number in original message + 1, | 114 | acknowledge is not equal to the acknowledge number in the original |
113 | then it is new message. | 115 | message + 1, then it is a new message. |
114 | 116 | ||
115 | Obviously, protocol header contains above id. | 117 | Obviously, the protocol header contains the above id. |
116 | 118 | ||
117 | connector allows event notification in the following form: kernel | 119 | The connector allows event notification in the following form: kernel |
118 | driver or userspace process can ask connector to notify it when | 120 | driver or userspace process can ask connector to notify it when |
119 | selected id's will be turned on or off(registered or unregistered it's | 121 | selected ids will be turned on or off (registered or unregistered its |
120 | callback). It is done by sending special command to connector | 122 | callback). It is done by sending a special command to the connector |
121 | driver(it also registers itself with id={-1, -1}). | 123 | driver (it also registers itself with id={-1, -1}). |
122 | 124 | ||
123 | As example of usage Documentation/connector now contains cn_test.c - | 125 | As example of this usage can be found in the cn_test.c module which |
124 | testing module which uses connector to request notification and to | 126 | uses the connector to request notification and to send messages. |
125 | send messages. | ||
126 | 127 | ||
127 | /*****************************************/ | 128 | /*****************************************/ |
128 | Reliability. | 129 | Reliability. |
129 | /*****************************************/ | 130 | /*****************************************/ |
130 | 131 | ||
131 | Netlink itself is not reliable protocol, that means that messages can | 132 | Netlink itself is not a reliable protocol. That means that messages can |
132 | be lost due to memory pressure or process' receiving queue overflowed, | 133 | be lost due to memory pressure or process' receiving queue overflowed, |
133 | so caller is warned must be prepared. That is why struct cn_msg [main | 134 | so caller is warned that it must be prepared. That is why the struct |
134 | connector's message header] contains u32 seq and u32 ack fields. | 135 | cn_msg [main connector's message header] contains u32 seq and u32 ack |
136 | fields. | ||
135 | 137 | ||
136 | /*****************************************/ | 138 | /*****************************************/ |
137 | Userspace usage. | 139 | Userspace usage. |
138 | /*****************************************/ | 140 | /*****************************************/ |
141 | |||
139 | 2.6.14 has a new netlink socket implementation, which by default does not | 142 | 2.6.14 has a new netlink socket implementation, which by default does not |
140 | allow to send data to netlink groups other than 1. | 143 | allow people to send data to netlink groups other than 1. |
141 | So, if to use netlink socket (for example using connector) | 144 | So, if you wish to use a netlink socket (for example using connector) |
142 | with different group number userspace application must subscribe to | 145 | with a different group number, the userspace application must subscribe to |
143 | that group. It can be achieved by following pseudocode: | 146 | that group first. It can be achieved by the following pseudocode: |
144 | 147 | ||
145 | s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); | 148 | s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); |
146 | 149 | ||
@@ -160,8 +163,8 @@ if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) { | |||
160 | } | 163 | } |
161 | 164 | ||
162 | Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket | 165 | Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket |
163 | option. To drop multicast subscription one should call above socket option | 166 | option. To drop a multicast subscription, one should call the above socket |
164 | with NETLINK_DROP_MEMBERSHIP parameter which is defined as 0. | 167 | option with the NETLINK_DROP_MEMBERSHIP parameter which is defined as 0. |
165 | 168 | ||
166 | 2.6.14 netlink code only allows to select a group which is less or equal to | 169 | 2.6.14 netlink code only allows to select a group which is less or equal to |
167 | the maximum group number, which is used at netlink_kernel_create() time. | 170 | the maximum group number, which is used at netlink_kernel_create() time. |
diff --git a/Documentation/connector/ucon.c b/Documentation/connector/ucon.c index c5092ad0ce4b..4848db8c71ff 100644 --- a/Documentation/connector/ucon.c +++ b/Documentation/connector/ucon.c | |||
@@ -30,18 +30,24 @@ | |||
30 | 30 | ||
31 | #include <arpa/inet.h> | 31 | #include <arpa/inet.h> |
32 | 32 | ||
33 | #include <stdbool.h> | ||
33 | #include <stdio.h> | 34 | #include <stdio.h> |
34 | #include <stdlib.h> | 35 | #include <stdlib.h> |
35 | #include <unistd.h> | 36 | #include <unistd.h> |
36 | #include <string.h> | 37 | #include <string.h> |
37 | #include <errno.h> | 38 | #include <errno.h> |
38 | #include <time.h> | 39 | #include <time.h> |
40 | #include <getopt.h> | ||
39 | 41 | ||
40 | #include <linux/connector.h> | 42 | #include <linux/connector.h> |
41 | 43 | ||
42 | #define DEBUG | 44 | #define DEBUG |
43 | #define NETLINK_CONNECTOR 11 | 45 | #define NETLINK_CONNECTOR 11 |
44 | 46 | ||
47 | /* Hopefully your userspace connector.h matches this kernel */ | ||
48 | #define CN_TEST_IDX CN_NETLINK_USERS + 3 | ||
49 | #define CN_TEST_VAL 0x456 | ||
50 | |||
45 | #ifdef DEBUG | 51 | #ifdef DEBUG |
46 | #define ulog(f, a...) fprintf(stdout, f, ##a) | 52 | #define ulog(f, a...) fprintf(stdout, f, ##a) |
47 | #else | 53 | #else |
@@ -83,6 +89,25 @@ static int netlink_send(int s, struct cn_msg *msg) | |||
83 | return err; | 89 | return err; |
84 | } | 90 | } |
85 | 91 | ||
92 | static void usage(void) | ||
93 | { | ||
94 | printf( | ||
95 | "Usage: ucon [options] [output file]\n" | ||
96 | "\n" | ||
97 | "\t-h\tthis help screen\n" | ||
98 | "\t-s\tsend buffers to the test module\n" | ||
99 | "\n" | ||
100 | "The default behavior of ucon is to subscribe to the test module\n" | ||
101 | "and wait for state messages. Any ones received are dumped to the\n" | ||
102 | "specified output file (or stdout). The test module is assumed to\n" | ||
103 | "have an id of {%u.%u}\n" | ||
104 | "\n" | ||
105 | "If you get no output, then verify the cn_test module id matches\n" | ||
106 | "the expected id above.\n" | ||
107 | , CN_TEST_IDX, CN_TEST_VAL | ||
108 | ); | ||
109 | } | ||
110 | |||
86 | int main(int argc, char *argv[]) | 111 | int main(int argc, char *argv[]) |
87 | { | 112 | { |
88 | int s; | 113 | int s; |
@@ -94,17 +119,34 @@ int main(int argc, char *argv[]) | |||
94 | FILE *out; | 119 | FILE *out; |
95 | time_t tm; | 120 | time_t tm; |
96 | struct pollfd pfd; | 121 | struct pollfd pfd; |
122 | bool send_msgs = false; | ||
97 | 123 | ||
98 | if (argc < 2) | 124 | while ((s = getopt(argc, argv, "hs")) != -1) { |
99 | out = stdout; | 125 | switch (s) { |
100 | else { | 126 | case 's': |
101 | out = fopen(argv[1], "a+"); | 127 | send_msgs = true; |
128 | break; | ||
129 | |||
130 | case 'h': | ||
131 | usage(); | ||
132 | return 0; | ||
133 | |||
134 | default: | ||
135 | /* getopt() outputs an error for us */ | ||
136 | usage(); | ||
137 | return 1; | ||
138 | } | ||
139 | } | ||
140 | |||
141 | if (argc != optind) { | ||
142 | out = fopen(argv[optind], "a+"); | ||
102 | if (!out) { | 143 | if (!out) { |
103 | ulog("Unable to open %s for writing: %s\n", | 144 | ulog("Unable to open %s for writing: %s\n", |
104 | argv[1], strerror(errno)); | 145 | argv[1], strerror(errno)); |
105 | out = stdout; | 146 | out = stdout; |
106 | } | 147 | } |
107 | } | 148 | } else |
149 | out = stdout; | ||
108 | 150 | ||
109 | memset(buf, 0, sizeof(buf)); | 151 | memset(buf, 0, sizeof(buf)); |
110 | 152 | ||
@@ -115,9 +157,11 @@ int main(int argc, char *argv[]) | |||
115 | } | 157 | } |
116 | 158 | ||
117 | l_local.nl_family = AF_NETLINK; | 159 | l_local.nl_family = AF_NETLINK; |
118 | l_local.nl_groups = 0x123; /* bitmask of requested groups */ | 160 | l_local.nl_groups = -1; /* bitmask of requested groups */ |
119 | l_local.nl_pid = 0; | 161 | l_local.nl_pid = 0; |
120 | 162 | ||
163 | ulog("subscribing to %u.%u\n", CN_TEST_IDX, CN_TEST_VAL); | ||
164 | |||
121 | if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) { | 165 | if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) { |
122 | perror("bind"); | 166 | perror("bind"); |
123 | close(s); | 167 | close(s); |
@@ -130,15 +174,15 @@ int main(int argc, char *argv[]) | |||
130 | setsockopt(s, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on)); | 174 | setsockopt(s, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on)); |
131 | } | 175 | } |
132 | #endif | 176 | #endif |
133 | if (0) { | 177 | if (send_msgs) { |
134 | int i, j; | 178 | int i, j; |
135 | 179 | ||
136 | memset(buf, 0, sizeof(buf)); | 180 | memset(buf, 0, sizeof(buf)); |
137 | 181 | ||
138 | data = (struct cn_msg *)buf; | 182 | data = (struct cn_msg *)buf; |
139 | 183 | ||
140 | data->id.idx = 0x123; | 184 | data->id.idx = CN_TEST_IDX; |
141 | data->id.val = 0x456; | 185 | data->id.val = CN_TEST_VAL; |
142 | data->seq = seq++; | 186 | data->seq = seq++; |
143 | data->ack = 0; | 187 | data->ack = 0; |
144 | data->len = 0; | 188 | data->len = 0; |
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 09e031c55887..503d21216d58 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -6,6 +6,35 @@ be removed from this file. | |||
6 | 6 | ||
7 | --------------------------- | 7 | --------------------------- |
8 | 8 | ||
9 | What: PRISM54 | ||
10 | When: 2.6.34 | ||
11 | |||
12 | Why: prism54 FullMAC PCI / Cardbus devices used to be supported only by the | ||
13 | prism54 wireless driver. After Intersil stopped selling these | ||
14 | devices in preference for the newer more flexible SoftMAC devices | ||
15 | a SoftMAC device driver was required and prism54 did not support | ||
16 | them. The p54pci driver now exists and has been present in the kernel for | ||
17 | a while. This driver supports both SoftMAC devices and FullMAC devices. | ||
18 | The main difference between these devices was the amount of memory which | ||
19 | could be used for the firmware. The SoftMAC devices support a smaller | ||
20 | amount of memory. Because of this the SoftMAC firmware fits into FullMAC | ||
21 | devices's memory. p54pci supports not only PCI / Cardbus but also USB | ||
22 | and SPI. Since p54pci supports all devices prism54 supports | ||
23 | you will have a conflict. I'm not quite sure how distributions are | ||
24 | handling this conflict right now. prism54 was kept around due to | ||
25 | claims users may experience issues when using the SoftMAC driver. | ||
26 | Time has passed users have not reported issues. If you use prism54 | ||
27 | and for whatever reason you cannot use p54pci please let us know! | ||
28 | E-mail us at: linux-wireless@vger.kernel.org | ||
29 | |||
30 | For more information see the p54 wiki page: | ||
31 | |||
32 | http://wireless.kernel.org/en/users/Drivers/p54 | ||
33 | |||
34 | Who: Luis R. Rodriguez <lrodriguez@atheros.com> | ||
35 | |||
36 | --------------------------- | ||
37 | |||
9 | What: IRQF_SAMPLE_RANDOM | 38 | What: IRQF_SAMPLE_RANDOM |
10 | Check: IRQF_SAMPLE_RANDOM | 39 | Check: IRQF_SAMPLE_RANDOM |
11 | When: July 2009 | 40 | When: July 2009 |
@@ -206,24 +235,6 @@ Who: Len Brown <len.brown@intel.com> | |||
206 | 235 | ||
207 | --------------------------- | 236 | --------------------------- |
208 | 237 | ||
209 | What: libata spindown skipping and warning | ||
210 | When: Dec 2008 | ||
211 | Why: Some halt(8) implementations synchronize caches for and spin | ||
212 | down libata disks because libata didn't use to spin down disk on | ||
213 | system halt (only synchronized caches). | ||
214 | Spin down on system halt is now implemented. sysfs node | ||
215 | /sys/class/scsi_disk/h:c:i:l/manage_start_stop is present if | ||
216 | spin down support is available. | ||
217 | Because issuing spin down command to an already spun down disk | ||
218 | makes some disks spin up just to spin down again, libata tracks | ||
219 | device spindown status to skip the extra spindown command and | ||
220 | warn about it. | ||
221 | This is to give userspace tools the time to get updated and will | ||
222 | be removed after userspace is reasonably updated. | ||
223 | Who: Tejun Heo <htejun@gmail.com> | ||
224 | |||
225 | --------------------------- | ||
226 | |||
227 | What: i386/x86_64 bzImage symlinks | 238 | What: i386/x86_64 bzImage symlinks |
228 | When: April 2010 | 239 | When: April 2010 |
229 | 240 | ||
@@ -235,31 +246,6 @@ Who: Thomas Gleixner <tglx@linutronix.de> | |||
235 | --------------------------- | 246 | --------------------------- |
236 | 247 | ||
237 | What (Why): | 248 | What (Why): |
238 | - include/linux/netfilter_ipv4/ipt_TOS.h ipt_tos.h header files | ||
239 | (superseded by xt_TOS/xt_tos target & match) | ||
240 | |||
241 | - "forwarding" header files like ipt_mac.h in | ||
242 | include/linux/netfilter_ipv4/ and include/linux/netfilter_ipv6/ | ||
243 | |||
244 | - xt_CONNMARK match revision 0 | ||
245 | (superseded by xt_CONNMARK match revision 1) | ||
246 | |||
247 | - xt_MARK target revisions 0 and 1 | ||
248 | (superseded by xt_MARK match revision 2) | ||
249 | |||
250 | - xt_connmark match revision 0 | ||
251 | (superseded by xt_connmark match revision 1) | ||
252 | |||
253 | - xt_conntrack match revision 0 | ||
254 | (superseded by xt_conntrack match revision 1) | ||
255 | |||
256 | - xt_iprange match revision 0, | ||
257 | include/linux/netfilter_ipv4/ipt_iprange.h | ||
258 | (superseded by xt_iprange match revision 1) | ||
259 | |||
260 | - xt_mark match revision 0 | ||
261 | (superseded by xt_mark match revision 1) | ||
262 | |||
263 | - xt_recent: the old ipt_recent proc dir | 249 | - xt_recent: the old ipt_recent proc dir |
264 | (superseded by /proc/net/xt_recent) | 250 | (superseded by /proc/net/xt_recent) |
265 | 251 | ||
@@ -394,15 +380,6 @@ Who: Thomas Gleixner <tglx@linutronix.de> | |||
394 | 380 | ||
395 | ----------------------------- | 381 | ----------------------------- |
396 | 382 | ||
397 | What: obsolete generic irq defines and typedefs | ||
398 | When: 2.6.30 | ||
399 | Why: The defines and typedefs (hw_interrupt_type, no_irq_type, irq_desc_t) | ||
400 | have been kept around for migration reasons. After more than two years | ||
401 | it's time to remove them finally | ||
402 | Who: Thomas Gleixner <tglx@linutronix.de> | ||
403 | |||
404 | --------------------------- | ||
405 | |||
406 | What: fakephp and associated sysfs files in /sys/bus/pci/slots/ | 383 | What: fakephp and associated sysfs files in /sys/bus/pci/slots/ |
407 | When: 2011 | 384 | When: 2011 |
408 | Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to | 385 | Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to |
@@ -468,3 +445,27 @@ Why: cpu_policy_rwsem has a new cleaner definition making it local to | |||
468 | cpufreq core and contained inside cpufreq.c. Other dependent | 445 | cpufreq core and contained inside cpufreq.c. Other dependent |
469 | drivers should not use it in order to safely avoid lockdep issues. | 446 | drivers should not use it in order to safely avoid lockdep issues. |
470 | Who: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> | 447 | Who: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> |
448 | |||
449 | ---------------------------- | ||
450 | |||
451 | What: sound-slot/service-* module aliases and related clutters in | ||
452 | sound/sound_core.c | ||
453 | When: August 2010 | ||
454 | Why: OSS sound_core grabs all legacy minors (0-255) of SOUND_MAJOR | ||
455 | (14) and requests modules using custom sound-slot/service-* | ||
456 | module aliases. The only benefit of doing this is allowing | ||
457 | use of custom module aliases which might as well be considered | ||
458 | a bug at this point. This preemptive claiming prevents | ||
459 | alternative OSS implementations. | ||
460 | |||
461 | Till the feature is removed, the kernel will be requesting | ||
462 | both sound-slot/service-* and the standard char-major-* module | ||
463 | aliases and allow turning off the pre-claiming selectively via | ||
464 | CONFIG_SOUND_OSS_CORE_PRECLAIM and soundcore.preclaim_oss | ||
465 | kernel parameter. | ||
466 | |||
467 | After the transition phase is complete, both the custom module | ||
468 | aliases and switches to disable it will go away. This removal | ||
469 | will also allow making ALSA OSS emulation independent of | ||
470 | sound_core. The dependency will be broken then too. | ||
471 | Who: Tejun Heo <tj@kernel.org> | ||
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt index bf8080640eba..6208f55c44c3 100644 --- a/Documentation/filesystems/9p.txt +++ b/Documentation/filesystems/9p.txt | |||
@@ -123,6 +123,9 @@ available from the same CVS repository. | |||
123 | There are user and developer mailing lists available through the v9fs project | 123 | There are user and developer mailing lists available through the v9fs project |
124 | on sourceforge (http://sourceforge.net/projects/v9fs). | 124 | on sourceforge (http://sourceforge.net/projects/v9fs). |
125 | 125 | ||
126 | A stand-alone version of the module (which should build for any 2.6 kernel) | ||
127 | is available via (http://github.com/ericvh/9p-sac/tree/master) | ||
128 | |||
126 | News and other information is maintained on SWiK (http://swik.net/v9fs). | 129 | News and other information is maintained on SWiK (http://swik.net/v9fs). |
127 | 130 | ||
128 | Bug reports may be issued through the kernel.org bugzilla | 131 | Bug reports may be issued through the kernel.org bugzilla |
diff --git a/Documentation/filesystems/afs.txt b/Documentation/filesystems/afs.txt index 12ad6c7f4e50..ffef91c4e0d6 100644 --- a/Documentation/filesystems/afs.txt +++ b/Documentation/filesystems/afs.txt | |||
@@ -23,15 +23,13 @@ it does support include: | |||
23 | 23 | ||
24 | (*) Security (currently only AFS kaserver and KerberosIV tickets). | 24 | (*) Security (currently only AFS kaserver and KerberosIV tickets). |
25 | 25 | ||
26 | (*) File reading. | 26 | (*) File reading and writing. |
27 | 27 | ||
28 | (*) Automounting. | 28 | (*) Automounting. |
29 | 29 | ||
30 | It does not yet support the following AFS features: | 30 | (*) Local caching (via fscache). |
31 | |||
32 | (*) Write support. | ||
33 | 31 | ||
34 | (*) Local caching. | 32 | It does not yet support the following AFS features: |
35 | 33 | ||
36 | (*) pioctl() system call. | 34 | (*) pioctl() system call. |
37 | 35 | ||
@@ -56,7 +54,7 @@ They permit the debugging messages to be turned on dynamically by manipulating | |||
56 | the masks in the following files: | 54 | the masks in the following files: |
57 | 55 | ||
58 | /sys/module/af_rxrpc/parameters/debug | 56 | /sys/module/af_rxrpc/parameters/debug |
59 | /sys/module/afs/parameters/debug | 57 | /sys/module/kafs/parameters/debug |
60 | 58 | ||
61 | 59 | ||
62 | ===== | 60 | ===== |
@@ -66,9 +64,9 @@ USAGE | |||
66 | When inserting the driver modules the root cell must be specified along with a | 64 | When inserting the driver modules the root cell must be specified along with a |
67 | list of volume location server IP addresses: | 65 | list of volume location server IP addresses: |
68 | 66 | ||
69 | insmod af_rxrpc.o | 67 | modprobe af_rxrpc |
70 | insmod rxkad.o | 68 | modprobe rxkad |
71 | insmod kafs.o rootcell=cambridge.redhat.com:172.16.18.73:172.16.18.91 | 69 | modprobe kafs rootcell=cambridge.redhat.com:172.16.18.73:172.16.18.91 |
72 | 70 | ||
73 | The first module is the AF_RXRPC network protocol driver. This provides the | 71 | The first module is the AF_RXRPC network protocol driver. This provides the |
74 | RxRPC remote operation protocol and may also be accessed from userspace. See: | 72 | RxRPC remote operation protocol and may also be accessed from userspace. See: |
@@ -81,7 +79,7 @@ is the actual filesystem driver for the AFS filesystem. | |||
81 | Once the module has been loaded, more modules can be added by the following | 79 | Once the module has been loaded, more modules can be added by the following |
82 | procedure: | 80 | procedure: |
83 | 81 | ||
84 | echo add grand.central.org 18.7.14.88:128.2.191.224 >/proc/fs/afs/cells | 82 | echo add grand.central.org 18.9.48.14:128.2.203.61:130.237.48.87 >/proc/fs/afs/cells |
85 | 83 | ||
86 | Where the parameters to the "add" command are the name of a cell and a list of | 84 | Where the parameters to the "add" command are the name of a cell and a list of |
87 | volume location servers within that cell, with the latter separated by colons. | 85 | volume location servers within that cell, with the latter separated by colons. |
@@ -101,7 +99,7 @@ The name of the volume can be suffixes with ".backup" or ".readonly" to | |||
101 | specify connection to only volumes of those types. | 99 | specify connection to only volumes of those types. |
102 | 100 | ||
103 | The name of the cell is optional, and if not given during a mount, then the | 101 | The name of the cell is optional, and if not given during a mount, then the |
104 | named volume will be looked up in the cell specified during insmod. | 102 | named volume will be looked up in the cell specified during modprobe. |
105 | 103 | ||
106 | Additional cells can be added through /proc (see later section). | 104 | Additional cells can be added through /proc (see later section). |
107 | 105 | ||
@@ -163,14 +161,14 @@ THE CELL DATABASE | |||
163 | 161 | ||
164 | The filesystem maintains an internal database of all the cells it knows and the | 162 | The filesystem maintains an internal database of all the cells it knows and the |
165 | IP addresses of the volume location servers for those cells. The cell to which | 163 | IP addresses of the volume location servers for those cells. The cell to which |
166 | the system belongs is added to the database when insmod is performed by the | 164 | the system belongs is added to the database when modprobe is performed by the |
167 | "rootcell=" argument or, if compiled in, using a "kafs.rootcell=" argument on | 165 | "rootcell=" argument or, if compiled in, using a "kafs.rootcell=" argument on |
168 | the kernel command line. | 166 | the kernel command line. |
169 | 167 | ||
170 | Further cells can be added by commands similar to the following: | 168 | Further cells can be added by commands similar to the following: |
171 | 169 | ||
172 | echo add CELLNAME VLADDR[:VLADDR][:VLADDR]... >/proc/fs/afs/cells | 170 | echo add CELLNAME VLADDR[:VLADDR][:VLADDR]... >/proc/fs/afs/cells |
173 | echo add grand.central.org 18.7.14.88:128.2.191.224 >/proc/fs/afs/cells | 171 | echo add grand.central.org 18.9.48.14:128.2.203.61:130.237.48.87 >/proc/fs/afs/cells |
174 | 172 | ||
175 | No other cell database operations are available at this time. | 173 | No other cell database operations are available at this time. |
176 | 174 | ||
@@ -233,7 +231,7 @@ insmod /tmp/kafs.o rootcell=cambridge.redhat.com:172.16.18.91 | |||
233 | mount -t afs \%root.afs. /afs | 231 | mount -t afs \%root.afs. /afs |
234 | mount -t afs \%cambridge.redhat.com:root.cell. /afs/cambridge.redhat.com/ | 232 | mount -t afs \%cambridge.redhat.com:root.cell. /afs/cambridge.redhat.com/ |
235 | 233 | ||
236 | echo add grand.central.org 18.7.14.88:128.2.191.224 > /proc/fs/afs/cells | 234 | echo add grand.central.org 18.9.48.14:128.2.203.61:130.237.48.87 > /proc/fs/afs/cells |
237 | mount -t afs "#grand.central.org:root.cell." /afs/grand.central.org/ | 235 | mount -t afs "#grand.central.org:root.cell." /afs/grand.central.org/ |
238 | mount -t afs "#grand.central.org:root.archive." /afs/grand.central.org/archive | 236 | mount -t afs "#grand.central.org:root.archive." /afs/grand.central.org/archive |
239 | mount -t afs "#grand.central.org:root.contrib." /afs/grand.central.org/contrib | 237 | mount -t afs "#grand.central.org:root.contrib." /afs/grand.central.org/contrib |
diff --git a/Documentation/filesystems/nfs.txt b/Documentation/filesystems/nfs.txt new file mode 100644 index 000000000000..f50f26ce6cd0 --- /dev/null +++ b/Documentation/filesystems/nfs.txt | |||
@@ -0,0 +1,98 @@ | |||
1 | |||
2 | The NFS client | ||
3 | ============== | ||
4 | |||
5 | The NFS version 2 protocol was first documented in RFC1094 (March 1989). | ||
6 | Since then two more major releases of NFS have been published, with NFSv3 | ||
7 | being documented in RFC1813 (June 1995), and NFSv4 in RFC3530 (April | ||
8 | 2003). | ||
9 | |||
10 | The Linux NFS client currently supports all the above published versions, | ||
11 | and work is in progress on adding support for minor version 1 of the NFSv4 | ||
12 | protocol. | ||
13 | |||
14 | The purpose of this document is to provide information on some of the | ||
15 | upcall interfaces that are used in order to provide the NFS client with | ||
16 | some of the information that it requires in order to fully comply with | ||
17 | the NFS spec. | ||
18 | |||
19 | The DNS resolver | ||
20 | ================ | ||
21 | |||
22 | NFSv4 allows for one server to refer the NFS client to data that has been | ||
23 | migrated onto another server by means of the special "fs_locations" | ||
24 | attribute. See | ||
25 | http://tools.ietf.org/html/rfc3530#section-6 | ||
26 | and | ||
27 | http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00 | ||
28 | |||
29 | The fs_locations information can take the form of either an ip address and | ||
30 | a path, or a DNS hostname and a path. The latter requires the NFS client to | ||
31 | do a DNS lookup in order to mount the new volume, and hence the need for an | ||
32 | upcall to allow userland to provide this service. | ||
33 | |||
34 | Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual | ||
35 | /var/lib/nfs/rpc_pipefs, the upcall consists of the following steps: | ||
36 | |||
37 | (1) The process checks the dns_resolve cache to see if it contains a | ||
38 | valid entry. If so, it returns that entry and exits. | ||
39 | |||
40 | (2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent' | ||
41 | (may be changed using the 'nfs.cache_getent' kernel boot parameter) | ||
42 | is run, with two arguments: | ||
43 | - the cache name, "dns_resolve" | ||
44 | - the hostname to resolve | ||
45 | |||
46 | (3) After looking up the corresponding ip address, the helper script | ||
47 | writes the result into the rpc_pipefs pseudo-file | ||
48 | '/var/lib/nfs/rpc_pipefs/cache/dns_resolve/channel' | ||
49 | in the following (text) format: | ||
50 | |||
51 | "<ip address> <hostname> <ttl>\n" | ||
52 | |||
53 | Where <ip address> is in the usual IPv4 (123.456.78.90) or IPv6 | ||
54 | (ffee:ddcc:bbaa:9988:7766:5544:3322:1100, ffee::1100, ...) format. | ||
55 | <hostname> is identical to the second argument of the helper | ||
56 | script, and <ttl> is the 'time to live' of this cache entry (in | ||
57 | units of seconds). | ||
58 | |||
59 | Note: If <ip address> is invalid, say the string "0", then a negative | ||
60 | entry is created, which will cause the kernel to treat the hostname | ||
61 | as having no valid DNS translation. | ||
62 | |||
63 | |||
64 | |||
65 | |||
66 | A basic sample /sbin/nfs_cache_getent | ||
67 | ===================================== | ||
68 | |||
69 | #!/bin/bash | ||
70 | # | ||
71 | ttl=600 | ||
72 | # | ||
73 | cut=/usr/bin/cut | ||
74 | getent=/usr/bin/getent | ||
75 | rpc_pipefs=/var/lib/nfs/rpc_pipefs | ||
76 | # | ||
77 | die() | ||
78 | { | ||
79 | echo "Usage: $0 cache_name entry_name" | ||
80 | exit 1 | ||
81 | } | ||
82 | |||
83 | [ $# -lt 2 ] && die | ||
84 | cachename="$1" | ||
85 | cache_path=${rpc_pipefs}/cache/${cachename}/channel | ||
86 | |||
87 | case "${cachename}" in | ||
88 | dns_resolve) | ||
89 | name="$2" | ||
90 | result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )" | ||
91 | [ -z "${result}" ] && result="0" | ||
92 | ;; | ||
93 | *) | ||
94 | die | ||
95 | ;; | ||
96 | esac | ||
97 | echo "${result} ${name} ${ttl}" >${cache_path} | ||
98 | |||
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fad18f9456e4..ffead13f9443 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -1167,13 +1167,11 @@ CHAPTER 3: PER-PROCESS PARAMETERS | |||
1167 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score | 1167 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score |
1168 | ------------------------------------------------------ | 1168 | ------------------------------------------------------ |
1169 | 1169 | ||
1170 | This file can be used to adjust the score used to select which processes should | 1170 | This file can be used to adjust the score used to select which processes |
1171 | be killed in an out-of-memory situation. The oom_adj value is a characteristic | 1171 | should be killed in an out-of-memory situation. Giving it a high score will |
1172 | of the task's mm, so all threads that share an mm with pid will have the same | 1172 | increase the likelihood of this process being killed by the oom-killer. Valid |
1173 | oom_adj value. A high value will increase the likelihood of this process being | 1173 | values are in the range -16 to +15, plus the special value -17, which disables |
1174 | killed by the oom-killer. Valid values are in the range -16 to +15 as | 1174 | oom-killing altogether for this process. |
1175 | explained below and a special value of -17, which disables oom-killing | ||
1176 | altogether for threads sharing pid's mm. | ||
1177 | 1175 | ||
1178 | The process to be killed in an out-of-memory situation is selected among all others | 1176 | The process to be killed in an out-of-memory situation is selected among all others |
1179 | based on its badness score. This value equals the original memory size of the process | 1177 | based on its badness score. This value equals the original memory size of the process |
@@ -1187,9 +1185,6 @@ the parent's score if they do not share the same memory. Thus forking servers | |||
1187 | are the prime candidates to be killed. Having only one 'hungry' child will make | 1185 | are the prime candidates to be killed. Having only one 'hungry' child will make |
1188 | parent less preferable than the child. | 1186 | parent less preferable than the child. |
1189 | 1187 | ||
1190 | /proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from | ||
1191 | oom-killing already. | ||
1192 | |||
1193 | /proc/<pid>/oom_score shows process' current badness score. | 1188 | /proc/<pid>/oom_score shows process' current badness score. |
1194 | 1189 | ||
1195 | The following heuristics are then applied: | 1190 | The following heuristics are then applied: |
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index dbea4f95fc85..1c058b552e93 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt | |||
@@ -121,6 +121,7 @@ Code Seq# Include File Comments | |||
121 | 'c' 00-7F linux/comstats.h conflict! | 121 | 'c' 00-7F linux/comstats.h conflict! |
122 | 'c' 00-7F linux/coda.h conflict! | 122 | 'c' 00-7F linux/coda.h conflict! |
123 | 'c' 80-9F arch/s390/include/asm/chsc.h | 123 | 'c' 80-9F arch/s390/include/asm/chsc.h |
124 | 'c' A0-AF arch/x86/include/asm/msr.h | ||
124 | 'd' 00-FF linux/char/drm/drm/h conflict! | 125 | 'd' 00-FF linux/char/drm/drm/h conflict! |
125 | 'd' F0-FF linux/digi1.h | 126 | 'd' F0-FF linux/digi1.h |
126 | 'e' all linux/digi1.h conflict! | 127 | 'e' all linux/digi1.h conflict! |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7936b801fe6a..cb3a169e372a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1503,6 +1503,14 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1503 | [NFS] set the TCP port on which the NFSv4 callback | 1503 | [NFS] set the TCP port on which the NFSv4 callback |
1504 | channel should listen. | 1504 | channel should listen. |
1505 | 1505 | ||
1506 | nfs.cache_getent= | ||
1507 | [NFS] sets the pathname to the program which is used | ||
1508 | to update the NFS client cache entries. | ||
1509 | |||
1510 | nfs.cache_getent_timeout= | ||
1511 | [NFS] sets the timeout after which an attempt to | ||
1512 | update a cache entry is deemed to have failed. | ||
1513 | |||
1506 | nfs.idmap_cache_timeout= | 1514 | nfs.idmap_cache_timeout= |
1507 | [NFS] set the maximum lifetime for idmapper cache | 1515 | [NFS] set the maximum lifetime for idmapper cache |
1508 | entries. | 1516 | entries. |
@@ -1535,6 +1543,11 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1535 | symbolic names: lapic and ioapic | 1543 | symbolic names: lapic and ioapic |
1536 | Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic | 1544 | Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic |
1537 | 1545 | ||
1546 | netpoll.carrier_timeout= | ||
1547 | [NET] Specifies amount of time (in seconds) that | ||
1548 | netpoll should wait for a carrier. By default netpoll | ||
1549 | waits 4 seconds. | ||
1550 | |||
1538 | no387 [BUGS=X86-32] Tells the kernel to use the 387 maths | 1551 | no387 [BUGS=X86-32] Tells the kernel to use the 387 maths |
1539 | emulation library even if a 387 maths coprocessor | 1552 | emulation library even if a 387 maths coprocessor |
1540 | is present. | 1553 | is present. |
@@ -2395,6 +2408,18 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2395 | stifb= [HW] | 2408 | stifb= [HW] |
2396 | Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]] | 2409 | Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]] |
2397 | 2410 | ||
2411 | sunrpc.min_resvport= | ||
2412 | sunrpc.max_resvport= | ||
2413 | [NFS,SUNRPC] | ||
2414 | SunRPC servers often require that client requests | ||
2415 | originate from a privileged port (i.e. a port in the | ||
2416 | range 0 < portnr < 1024). | ||
2417 | An administrator who wishes to reserve some of these | ||
2418 | ports for other uses may adjust the range that the | ||
2419 | kernel's sunrpc client considers to be privileged | ||
2420 | using these two parameters to set the minimum and | ||
2421 | maximum port values. | ||
2422 | |||
2398 | sunrpc.pool_mode= | 2423 | sunrpc.pool_mode= |
2399 | [NFS] | 2424 | [NFS] |
2400 | Control how the NFS server code allocates CPUs to | 2425 | Control how the NFS server code allocates CPUs to |
@@ -2411,6 +2436,15 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2411 | pernode one pool for each NUMA node (equivalent | 2436 | pernode one pool for each NUMA node (equivalent |
2412 | to global on non-NUMA machines) | 2437 | to global on non-NUMA machines) |
2413 | 2438 | ||
2439 | sunrpc.tcp_slot_table_entries= | ||
2440 | sunrpc.udp_slot_table_entries= | ||
2441 | [NFS,SUNRPC] | ||
2442 | Sets the upper limit on the number of simultaneous | ||
2443 | RPC calls that can be sent from the client to a | ||
2444 | server. Increasing these values may allow you to | ||
2445 | improve throughput, but will also increase the | ||
2446 | amount of memory reserved for use by the client. | ||
2447 | |||
2414 | swiotlb= [IA-64] Number of I/O TLB slabs | 2448 | swiotlb= [IA-64] Number of I/O TLB slabs |
2415 | 2449 | ||
2416 | switches= [HW,M68k] | 2450 | switches= [HW,M68k] |
@@ -2480,6 +2514,11 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2480 | trace_buf_size=nn[KMG] | 2514 | trace_buf_size=nn[KMG] |
2481 | [FTRACE] will set tracing buffer size. | 2515 | [FTRACE] will set tracing buffer size. |
2482 | 2516 | ||
2517 | trace_event=[event-list] | ||
2518 | [FTRACE] Set and start specified trace events in order | ||
2519 | to facilitate early boot debugging. | ||
2520 | See also Documentation/trace/events.txt | ||
2521 | |||
2483 | trix= [HW,OSS] MediaTrix AudioTrix Pro | 2522 | trix= [HW,OSS] MediaTrix AudioTrix Pro |
2484 | Format: | 2523 | Format: |
2485 | <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq> | 2524 | <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq> |
diff --git a/Documentation/keys.txt b/Documentation/keys.txt index b56aacc1fff8..e4dbbdb1bd96 100644 --- a/Documentation/keys.txt +++ b/Documentation/keys.txt | |||
@@ -26,7 +26,7 @@ This document has the following sections: | |||
26 | - Notes on accessing payload contents | 26 | - Notes on accessing payload contents |
27 | - Defining a key type | 27 | - Defining a key type |
28 | - Request-key callback service | 28 | - Request-key callback service |
29 | - Key access filesystem | 29 | - Garbage collection |
30 | 30 | ||
31 | 31 | ||
32 | ============ | 32 | ============ |
@@ -113,6 +113,9 @@ Each key has a number of attributes: | |||
113 | 113 | ||
114 | (*) Dead. The key's type was unregistered, and so the key is now useless. | 114 | (*) Dead. The key's type was unregistered, and so the key is now useless. |
115 | 115 | ||
116 | Keys in the last three states are subject to garbage collection. See the | ||
117 | section on "Garbage collection". | ||
118 | |||
116 | 119 | ||
117 | ==================== | 120 | ==================== |
118 | KEY SERVICE OVERVIEW | 121 | KEY SERVICE OVERVIEW |
@@ -754,6 +757,26 @@ The keyctl syscall functions are: | |||
754 | successful. | 757 | successful. |
755 | 758 | ||
756 | 759 | ||
760 | (*) Install the calling process's session keyring on its parent. | ||
761 | |||
762 | long keyctl(KEYCTL_SESSION_TO_PARENT); | ||
763 | |||
764 | This functions attempts to install the calling process's session keyring | ||
765 | on to the calling process's parent, replacing the parent's current session | ||
766 | keyring. | ||
767 | |||
768 | The calling process must have the same ownership as its parent, the | ||
769 | keyring must have the same ownership as the calling process, the calling | ||
770 | process must have LINK permission on the keyring and the active LSM module | ||
771 | mustn't deny permission, otherwise error EPERM will be returned. | ||
772 | |||
773 | Error ENOMEM will be returned if there was insufficient memory to complete | ||
774 | the operation, otherwise 0 will be returned to indicate success. | ||
775 | |||
776 | The keyring will be replaced next time the parent process leaves the | ||
777 | kernel and resumes executing userspace. | ||
778 | |||
779 | |||
757 | =============== | 780 | =============== |
758 | KERNEL SERVICES | 781 | KERNEL SERVICES |
759 | =============== | 782 | =============== |
@@ -1231,3 +1254,17 @@ by executing: | |||
1231 | 1254 | ||
1232 | In this case, the program isn't required to actually attach the key to a ring; | 1255 | In this case, the program isn't required to actually attach the key to a ring; |
1233 | the rings are provided for reference. | 1256 | the rings are provided for reference. |
1257 | |||
1258 | |||
1259 | ================== | ||
1260 | GARBAGE COLLECTION | ||
1261 | ================== | ||
1262 | |||
1263 | Dead keys (for which the type has been removed) will be automatically unlinked | ||
1264 | from those keyrings that point to them and deleted as soon as possible by a | ||
1265 | background garbage collector. | ||
1266 | |||
1267 | Similarly, revoked and expired keys will be garbage collected, but only after a | ||
1268 | certain amount of time has passed. This time is set as a number of seconds in: | ||
1269 | |||
1270 | /proc/sys/kernel/keys/gc_delay | ||
diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt index 89068030b01b..34f6638aa5ac 100644 --- a/Documentation/kmemleak.txt +++ b/Documentation/kmemleak.txt | |||
@@ -27,6 +27,13 @@ To trigger an intermediate memory scan: | |||
27 | 27 | ||
28 | # echo scan > /sys/kernel/debug/kmemleak | 28 | # echo scan > /sys/kernel/debug/kmemleak |
29 | 29 | ||
30 | To clear the list of all current possible memory leaks: | ||
31 | |||
32 | # echo clear > /sys/kernel/debug/kmemleak | ||
33 | |||
34 | New leaks will then come up upon reading /sys/kernel/debug/kmemleak | ||
35 | again. | ||
36 | |||
30 | Note that the orphan objects are listed in the order they were allocated | 37 | Note that the orphan objects are listed in the order they were allocated |
31 | and one object at the beginning of the list may cause other subsequent | 38 | and one object at the beginning of the list may cause other subsequent |
32 | objects to be reported as orphan. | 39 | objects to be reported as orphan. |
@@ -42,6 +49,9 @@ Memory scanning parameters can be modified at run-time by writing to the | |||
42 | scan=<secs> - set the automatic memory scanning period in seconds | 49 | scan=<secs> - set the automatic memory scanning period in seconds |
43 | (default 600, 0 to stop the automatic scanning) | 50 | (default 600, 0 to stop the automatic scanning) |
44 | scan - trigger a memory scan | 51 | scan - trigger a memory scan |
52 | clear - clear list of current memory leak suspects, done by | ||
53 | marking all current reported unreferenced objects grey | ||
54 | dump=<addr> - dump information about the object found at <addr> | ||
45 | 55 | ||
46 | Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on | 56 | Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on |
47 | the kernel command line. | 57 | the kernel command line. |
@@ -86,6 +96,27 @@ avoid this, kmemleak can also store the number of values pointing to an | |||
86 | address inside the block address range that need to be found so that the | 96 | address inside the block address range that need to be found so that the |
87 | block is not considered a leak. One example is __vmalloc(). | 97 | block is not considered a leak. One example is __vmalloc(). |
88 | 98 | ||
99 | Testing specific sections with kmemleak | ||
100 | --------------------------------------- | ||
101 | |||
102 | Upon initial bootup your /sys/kernel/debug/kmemleak output page may be | ||
103 | quite extensive. This can also be the case if you have very buggy code | ||
104 | when doing development. To work around these situations you can use the | ||
105 | 'clear' command to clear all reported unreferenced objects from the | ||
106 | /sys/kernel/debug/kmemleak output. By issuing a 'scan' after a 'clear' | ||
107 | you can find new unreferenced objects; this should help with testing | ||
108 | specific sections of code. | ||
109 | |||
110 | To test a critical section on demand with a clean kmemleak do: | ||
111 | |||
112 | # echo clear > /sys/kernel/debug/kmemleak | ||
113 | ... test your kernel or modules ... | ||
114 | # echo scan > /sys/kernel/debug/kmemleak | ||
115 | |||
116 | Then as usual to get your report with: | ||
117 | |||
118 | # cat /sys/kernel/debug/kmemleak | ||
119 | |||
89 | Kmemleak API | 120 | Kmemleak API |
90 | ------------ | 121 | ------------ |
91 | 122 | ||
diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX index 1634c6dcecae..50189bf07d53 100644 --- a/Documentation/networking/00-INDEX +++ b/Documentation/networking/00-INDEX | |||
@@ -60,6 +60,8 @@ framerelay.txt | |||
60 | - info on using Frame Relay/Data Link Connection Identifier (DLCI). | 60 | - info on using Frame Relay/Data Link Connection Identifier (DLCI). |
61 | generic_netlink.txt | 61 | generic_netlink.txt |
62 | - info on Generic Netlink | 62 | - info on Generic Netlink |
63 | ieee802154.txt | ||
64 | - Linux IEEE 802.15.4 implementation, API and drivers | ||
63 | ip-sysctl.txt | 65 | ip-sysctl.txt |
64 | - /proc/sys/net/ipv4/* variables | 66 | - /proc/sys/net/ipv4/* variables |
65 | ip_dynaddr.txt | 67 | ip_dynaddr.txt |
diff --git a/Documentation/networking/ieee802154.txt b/Documentation/networking/ieee802154.txt index a0280ad2edc9..23c995e64032 100644 --- a/Documentation/networking/ieee802154.txt +++ b/Documentation/networking/ieee802154.txt | |||
@@ -22,7 +22,7 @@ int sd = socket(PF_IEEE802154, SOCK_DGRAM, 0); | |||
22 | ..... | 22 | ..... |
23 | 23 | ||
24 | The address family, socket addresses etc. are defined in the | 24 | The address family, socket addresses etc. are defined in the |
25 | include/net/ieee802154/af_ieee802154.h header or in the special header | 25 | include/net/af_ieee802154.h header or in the special header |
26 | in our userspace package (see either linux-zigbee sourceforge download page | 26 | in our userspace package (see either linux-zigbee sourceforge download page |
27 | or git tree at git://linux-zigbee.git.sourceforge.net/gitroot/linux-zigbee). | 27 | or git tree at git://linux-zigbee.git.sourceforge.net/gitroot/linux-zigbee). |
28 | 28 | ||
@@ -33,7 +33,7 @@ MLME - MAC Level Management | |||
33 | ============================ | 33 | ============================ |
34 | 34 | ||
35 | Most of IEEE 802.15.4 MLME interfaces are directly mapped on netlink commands. | 35 | Most of IEEE 802.15.4 MLME interfaces are directly mapped on netlink commands. |
36 | See the include/net/ieee802154/nl802154.h header. Our userspace tools package | 36 | See the include/net/nl802154.h header. Our userspace tools package |
37 | (see above) provides CLI configuration utility for radio interfaces and simple | 37 | (see above) provides CLI configuration utility for radio interfaces and simple |
38 | coordinator for IEEE 802.15.4 networks as an example users of MLME protocol. | 38 | coordinator for IEEE 802.15.4 networks as an example users of MLME protocol. |
39 | 39 | ||
@@ -54,10 +54,14 @@ Those types of devices require different approach to be hooked into Linux kernel | |||
54 | HardMAC | 54 | HardMAC |
55 | ======= | 55 | ======= |
56 | 56 | ||
57 | See the header include/net/ieee802154/netdevice.h. You have to implement Linux | 57 | See the header include/net/ieee802154_netdev.h. You have to implement Linux |
58 | net_device, with .type = ARPHRD_IEEE802154. Data is exchanged with socket family | 58 | net_device, with .type = ARPHRD_IEEE802154. Data is exchanged with socket family |
59 | code via plain sk_buffs. The control block of sk_buffs will contain additional | 59 | code via plain sk_buffs. On skb reception skb->cb must contain additional |
60 | info as described in the struct ieee802154_mac_cb. | 60 | info as described in the struct ieee802154_mac_cb. During packet transmission |
61 | the skb->cb is used to provide additional data to device's header_ops->create | ||
62 | function. Be aware, that this data can be overriden later (when socket code | ||
63 | submits skb to qdisc), so if you need something from that cb later, you should | ||
64 | store info in the skb->data on your own. | ||
61 | 65 | ||
62 | To hook the MLME interface you have to populate the ml_priv field of your | 66 | To hook the MLME interface you have to populate the ml_priv field of your |
63 | net_device with a pointer to struct ieee802154_mlme_ops instance. All fields are | 67 | net_device with a pointer to struct ieee802154_mlme_ops instance. All fields are |
@@ -69,8 +73,8 @@ We provide an example of simple HardMAC driver at drivers/ieee802154/fakehard.c | |||
69 | SoftMAC | 73 | SoftMAC |
70 | ======= | 74 | ======= |
71 | 75 | ||
72 | We are going to provide intermediate layer impelementing IEEE 802.15.4 MAC | 76 | We are going to provide intermediate layer implementing IEEE 802.15.4 MAC |
73 | in software. This is currently WIP. | 77 | in software. This is currently WIP. |
74 | 78 | ||
75 | See header include/net/ieee802154/mac802154.h and several drivers in | 79 | See header include/net/mac802154.h and several drivers in drivers/ieee802154/. |
76 | drivers/ieee802154/ | 80 | |
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 8be76235fe67..fbe427a6580c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt | |||
@@ -311,9 +311,12 @@ tcp_no_metrics_save - BOOLEAN | |||
311 | connections. | 311 | connections. |
312 | 312 | ||
313 | tcp_orphan_retries - INTEGER | 313 | tcp_orphan_retries - INTEGER |
314 | How may times to retry before killing TCP connection, closed | 314 | This value influences the timeout of a locally closed TCP connection, |
315 | by our side. Default value 7 corresponds to ~50sec-16min | 315 | when RTO retransmissions remain unacknowledged. |
316 | depending on RTO. If you machine is loaded WEB server, | 316 | See tcp_retries2 for more details. |
317 | |||
318 | The default value is 7. | ||
319 | If your machine is a loaded WEB server, | ||
317 | you should think about lowering this value, such sockets | 320 | you should think about lowering this value, such sockets |
318 | may consume significant resources. Cf. tcp_max_orphans. | 321 | may consume significant resources. Cf. tcp_max_orphans. |
319 | 322 | ||
@@ -327,16 +330,28 @@ tcp_retrans_collapse - BOOLEAN | |||
327 | certain TCP stacks. | 330 | certain TCP stacks. |
328 | 331 | ||
329 | tcp_retries1 - INTEGER | 332 | tcp_retries1 - INTEGER |
330 | How many times to retry before deciding that something is wrong | 333 | This value influences the time, after which TCP decides, that |
331 | and it is necessary to report this suspicion to network layer. | 334 | something is wrong due to unacknowledged RTO retransmissions, |
332 | Minimal RFC value is 3, it is default, which corresponds | 335 | and reports this suspicion to the network layer. |
333 | to ~3sec-8min depending on RTO. | 336 | See tcp_retries2 for more details. |
337 | |||
338 | RFC 1122 recommends at least 3 retransmissions, which is the | ||
339 | default. | ||
334 | 340 | ||
335 | tcp_retries2 - INTEGER | 341 | tcp_retries2 - INTEGER |
336 | How may times to retry before killing alive TCP connection. | 342 | This value influences the timeout of an alive TCP connection, |
337 | RFC1122 says that the limit should be longer than 100 sec. | 343 | when RTO retransmissions remain unacknowledged. |
338 | It is too small number. Default value 15 corresponds to ~13-30min | 344 | Given a value of N, a hypothetical TCP connection following |
339 | depending on RTO. | 345 | exponential backoff with an initial RTO of TCP_RTO_MIN would |
346 | retransmit N times before killing the connection at the (N+1)th RTO. | ||
347 | |||
348 | The default value of 15 yields a hypothetical timeout of 924.6 | ||
349 | seconds and is a lower bound for the effective timeout. | ||
350 | TCP will effectively time out at the first RTO which exceeds the | ||
351 | hypothetical timeout. | ||
352 | |||
353 | RFC 1122 recommends at least 100 seconds for the timeout, | ||
354 | which corresponds to a value of at least 8. | ||
340 | 355 | ||
341 | tcp_rfc1337 - BOOLEAN | 356 | tcp_rfc1337 - BOOLEAN |
342 | If set, the TCP stack behaves conforming to RFC1337. If unset, | 357 | If set, the TCP stack behaves conforming to RFC1337. If unset, |
@@ -1282,6 +1297,16 @@ sctp_rmem - vector of 3 INTEGERs: min, default, max | |||
1282 | sctp_wmem - vector of 3 INTEGERs: min, default, max | 1297 | sctp_wmem - vector of 3 INTEGERs: min, default, max |
1283 | See tcp_wmem for a description. | 1298 | See tcp_wmem for a description. |
1284 | 1299 | ||
1300 | addr_scope_policy - INTEGER | ||
1301 | Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00 | ||
1302 | |||
1303 | 0 - Disable IPv4 address scoping | ||
1304 | 1 - Enable IPv4 address scoping | ||
1305 | 2 - Follow draft but allow IPv4 private addresses | ||
1306 | 3 - Follow draft but allow IPv4 link local addresses | ||
1307 | |||
1308 | Default: 1 | ||
1309 | |||
1285 | 1310 | ||
1286 | /proc/sys/net/core/* | 1311 | /proc/sys/net/core/* |
1287 | dev_weight - INTEGER | 1312 | dev_weight - INTEGER |
diff --git a/Documentation/s390/s390dbf.txt b/Documentation/s390/s390dbf.txt index 2d10053dd97e..ae66f9b90a25 100644 --- a/Documentation/s390/s390dbf.txt +++ b/Documentation/s390/s390dbf.txt | |||
@@ -495,6 +495,13 @@ and for each vararg a long value. So e.g. for a debug entry with a format | |||
495 | string plus two varargs one would need to allocate a (3 * sizeof(long)) | 495 | string plus two varargs one would need to allocate a (3 * sizeof(long)) |
496 | byte data area in the debug_register() function. | 496 | byte data area in the debug_register() function. |
497 | 497 | ||
498 | IMPORTANT: Using "%s" in sprintf event functions is dangerous. You can only | ||
499 | use "%s" in the sprintf event functions, if the memory for the passed string is | ||
500 | available as long as the debug feature exists. The reason behind this is that | ||
501 | due to performance considerations only a pointer to the string is stored in | ||
502 | the debug feature. If you log a string that is freed afterwards, you will get | ||
503 | an OOPS when inspecting the debug feature, because then the debug feature will | ||
504 | access the already freed memory. | ||
498 | 505 | ||
499 | NOTE: If using the sprintf view do NOT use other event/exception functions | 506 | NOTE: If using the sprintf view do NOT use other event/exception functions |
500 | than the sprintf-event and -exception functions. | 507 | than the sprintf-event and -exception functions. |
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt index 4252697a95d6..1c8eb4518ce0 100644 --- a/Documentation/sound/alsa/ALSA-Configuration.txt +++ b/Documentation/sound/alsa/ALSA-Configuration.txt | |||
@@ -60,6 +60,12 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
60 | slots - Reserve the slot index for the given driver. | 60 | slots - Reserve the slot index for the given driver. |
61 | This option takes multiple strings. | 61 | This option takes multiple strings. |
62 | See "Module Autoloading Support" section for details. | 62 | See "Module Autoloading Support" section for details. |
63 | debug - Specifies the debug message level | ||
64 | (0 = disable debug prints, 1 = normal debug messages, | ||
65 | 2 = verbose debug messages) | ||
66 | This option appears only when CONFIG_SND_DEBUG=y. | ||
67 | This option can be dynamically changed via sysfs | ||
68 | /sys/modules/snd/parameters/debug file. | ||
63 | 69 | ||
64 | Module snd-pcm-oss | 70 | Module snd-pcm-oss |
65 | ------------------ | 71 | ------------------ |
@@ -513,6 +519,26 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
513 | or input, but you may use this module for any application which | 519 | or input, but you may use this module for any application which |
514 | requires a sound card (like RealPlayer). | 520 | requires a sound card (like RealPlayer). |
515 | 521 | ||
522 | pcm_devs - Number of PCM devices assigned to each card | ||
523 | (default = 1, up to 4) | ||
524 | pcm_substreams - Number of PCM substreams assigned to each PCM | ||
525 | (default = 8, up to 16) | ||
526 | hrtimer - Use hrtimer (=1, default) or system timer (=0) | ||
527 | fake_buffer - Fake buffer allocations (default = 1) | ||
528 | |||
529 | When multiple PCM devices are created, snd-dummy gives different | ||
530 | behavior to each PCM device: | ||
531 | 0 = interleaved with mmap support | ||
532 | 1 = non-interleaved with mmap support | ||
533 | 2 = interleaved without mmap | ||
534 | 3 = non-interleaved without mmap | ||
535 | |||
536 | As default, snd-dummy drivers doesn't allocate the real buffers | ||
537 | but either ignores read/write or mmap a single dummy page to all | ||
538 | buffer pages, in order to save the resouces. If your apps need | ||
539 | the read/ written buffer data to be consistent, pass fake_buffer=0 | ||
540 | option. | ||
541 | |||
516 | The power-management is supported. | 542 | The power-management is supported. |
517 | 543 | ||
518 | Module snd-echo3g | 544 | Module snd-echo3g |
@@ -768,6 +794,10 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed. | |||
768 | bdl_pos_adj - Specifies the DMA IRQ timing delay in samples. | 794 | bdl_pos_adj - Specifies the DMA IRQ timing delay in samples. |
769 | Passing -1 will make the driver to choose the appropriate | 795 | Passing -1 will make the driver to choose the appropriate |
770 | value based on the controller chip. | 796 | value based on the controller chip. |
797 | patch - Specifies the early "patch" files to modify the HD-audio | ||
798 | setup before initializing the codecs. This option is | ||
799 | available only when CONFIG_SND_HDA_PATCH_LOADER=y is set. | ||
800 | See HD-Audio.txt for details. | ||
771 | 801 | ||
772 | [Single (global) options] | 802 | [Single (global) options] |
773 | single_cmd - Use single immediate commands to communicate with | 803 | single_cmd - Use single immediate commands to communicate with |
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt index 939a3dd58148..97eebd63bedc 100644 --- a/Documentation/sound/alsa/HD-Audio-Models.txt +++ b/Documentation/sound/alsa/HD-Audio-Models.txt | |||
@@ -114,8 +114,8 @@ ALC662/663/272 | |||
114 | samsung-nc10 Samsung NC10 mini notebook | 114 | samsung-nc10 Samsung NC10 mini notebook |
115 | auto auto-config reading BIOS (default) | 115 | auto auto-config reading BIOS (default) |
116 | 116 | ||
117 | ALC882/885 | 117 | ALC882/883/885/888/889 |
118 | ========== | 118 | ====================== |
119 | 3stack-dig 3-jack with SPDIF I/O | 119 | 3stack-dig 3-jack with SPDIF I/O |
120 | 6stack-dig 6-jack digital with SPDIF I/O | 120 | 6stack-dig 6-jack digital with SPDIF I/O |
121 | arima Arima W820Di1 | 121 | arima Arima W820Di1 |
@@ -127,12 +127,8 @@ ALC882/885 | |||
127 | mbp3 Macbook Pro rev3 | 127 | mbp3 Macbook Pro rev3 |
128 | imac24 iMac 24'' with jack detection | 128 | imac24 iMac 24'' with jack detection |
129 | w2jc ASUS W2JC | 129 | w2jc ASUS W2JC |
130 | auto auto-config reading BIOS (default) | 130 | 3stack-2ch-dig 3-jack with SPDIF I/O (ALC883) |
131 | 131 | alc883-6stack-dig 6-jack digital with SPDIF I/O (ALC883) | |
132 | ALC883/888 | ||
133 | ========== | ||
134 | 3stack-dig 3-jack with SPDIF I/O | ||
135 | 6stack-dig 6-jack digital with SPDIF I/O | ||
136 | 3stack-6ch 3-jack 6-channel | 132 | 3stack-6ch 3-jack 6-channel |
137 | 3stack-6ch-dig 3-jack 6-channel with SPDIF I/O | 133 | 3stack-6ch-dig 3-jack 6-channel with SPDIF I/O |
138 | 6stack-dig-demo 6-jack digital for Intel demo board | 134 | 6stack-dig-demo 6-jack digital for Intel demo board |
@@ -140,6 +136,7 @@ ALC883/888 | |||
140 | acer-aspire Acer Aspire 9810 | 136 | acer-aspire Acer Aspire 9810 |
141 | acer-aspire-4930g Acer Aspire 4930G | 137 | acer-aspire-4930g Acer Aspire 4930G |
142 | acer-aspire-6530g Acer Aspire 6530G | 138 | acer-aspire-6530g Acer Aspire 6530G |
139 | acer-aspire-7730g Acer Aspire 7730G | ||
143 | acer-aspire-8930g Acer Aspire 8930G | 140 | acer-aspire-8930g Acer Aspire 8930G |
144 | medion Medion Laptops | 141 | medion Medion Laptops |
145 | medion-md2 Medion MD2 | 142 | medion-md2 Medion MD2 |
@@ -155,10 +152,13 @@ ALC883/888 | |||
155 | 3stack-hp HP machines with 3stack (Lucknow, Samba boards) | 152 | 3stack-hp HP machines with 3stack (Lucknow, Samba boards) |
156 | 6stack-dell Dell machines with 6stack (Inspiron 530) | 153 | 6stack-dell Dell machines with 6stack (Inspiron 530) |
157 | mitac Mitac 8252D | 154 | mitac Mitac 8252D |
155 | clevo-m540r Clevo M540R (6ch + digital) | ||
158 | clevo-m720 Clevo M720 laptop series | 156 | clevo-m720 Clevo M720 laptop series |
159 | fujitsu-pi2515 Fujitsu AMILO Pi2515 | 157 | fujitsu-pi2515 Fujitsu AMILO Pi2515 |
160 | fujitsu-xa3530 Fujitsu AMILO XA3530 | 158 | fujitsu-xa3530 Fujitsu AMILO XA3530 |
161 | 3stack-6ch-intel Intel DG33* boards | 159 | 3stack-6ch-intel Intel DG33* boards |
160 | intel-alc889a Intel IbexPeak with ALC889A | ||
161 | intel-x58 Intel DX58 with ALC889 | ||
162 | asus-p5q ASUS P5Q-EM boards | 162 | asus-p5q ASUS P5Q-EM boards |
163 | mb31 MacBook 3,1 | 163 | mb31 MacBook 3,1 |
164 | sony-vaio-tt Sony VAIO TT | 164 | sony-vaio-tt Sony VAIO TT |
@@ -229,7 +229,7 @@ AD1984 | |||
229 | ====== | 229 | ====== |
230 | basic default configuration | 230 | basic default configuration |
231 | thinkpad Lenovo Thinkpad T61/X61 | 231 | thinkpad Lenovo Thinkpad T61/X61 |
232 | dell Dell T3400 | 232 | dell_desktop Dell T3400 |
233 | 233 | ||
234 | AD1986A | 234 | AD1986A |
235 | ======= | 235 | ======= |
@@ -258,6 +258,7 @@ Conexant 5045 | |||
258 | laptop-micsense Laptop with Mic sense (old model fujitsu) | 258 | laptop-micsense Laptop with Mic sense (old model fujitsu) |
259 | laptop-hpmicsense Laptop with HP and Mic senses | 259 | laptop-hpmicsense Laptop with HP and Mic senses |
260 | benq Benq R55E | 260 | benq Benq R55E |
261 | laptop-hp530 HP 530 laptop | ||
261 | test for testing/debugging purpose, almost all controls | 262 | test for testing/debugging purpose, almost all controls |
262 | can be adjusted. Appearing only when compiled with | 263 | can be adjusted. Appearing only when compiled with |
263 | $CONFIG_SND_DEBUG=y | 264 | $CONFIG_SND_DEBUG=y |
@@ -278,9 +279,16 @@ Conexant 5051 | |||
278 | hp-dv6736 HP dv6736 | 279 | hp-dv6736 HP dv6736 |
279 | lenovo-x200 Lenovo X200 laptop | 280 | lenovo-x200 Lenovo X200 laptop |
280 | 281 | ||
282 | Conexant 5066 | ||
283 | ============= | ||
284 | laptop Basic Laptop config (default) | ||
285 | dell-laptop Dell laptops | ||
286 | olpc-xo-1_5 OLPC XO 1.5 | ||
287 | |||
281 | STAC9200 | 288 | STAC9200 |
282 | ======== | 289 | ======== |
283 | ref Reference board | 290 | ref Reference board |
291 | oqo OQO Model 2 | ||
284 | dell-d21 Dell (unknown) | 292 | dell-d21 Dell (unknown) |
285 | dell-d22 Dell (unknown) | 293 | dell-d22 Dell (unknown) |
286 | dell-d23 Dell (unknown) | 294 | dell-d23 Dell (unknown) |
@@ -368,10 +376,12 @@ STAC92HD73* | |||
368 | =========== | 376 | =========== |
369 | ref Reference board | 377 | ref Reference board |
370 | no-jd BIOS setup but without jack-detection | 378 | no-jd BIOS setup but without jack-detection |
379 | intel Intel DG45* mobos | ||
371 | dell-m6-amic Dell desktops/laptops with analog mics | 380 | dell-m6-amic Dell desktops/laptops with analog mics |
372 | dell-m6-dmic Dell desktops/laptops with digital mics | 381 | dell-m6-dmic Dell desktops/laptops with digital mics |
373 | dell-m6 Dell desktops/laptops with both type of mics | 382 | dell-m6 Dell desktops/laptops with both type of mics |
374 | dell-eq Dell desktops/laptops | 383 | dell-eq Dell desktops/laptops |
384 | alienware Alienware M17x | ||
375 | auto BIOS setup (default) | 385 | auto BIOS setup (default) |
376 | 386 | ||
377 | STAC92HD83* | 387 | STAC92HD83* |
@@ -385,3 +395,8 @@ STAC9872 | |||
385 | ======== | 395 | ======== |
386 | vaio VAIO laptop without SPDIF | 396 | vaio VAIO laptop without SPDIF |
387 | auto BIOS setup (default) | 397 | auto BIOS setup (default) |
398 | |||
399 | Cirrus Logic CS4206/4207 | ||
400 | ======================== | ||
401 | mbp55 MacBook Pro 5,5 | ||
402 | auto BIOS setup (default) | ||
diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt index 71ac995b1915..7b8a5f947d1d 100644 --- a/Documentation/sound/alsa/HD-Audio.txt +++ b/Documentation/sound/alsa/HD-Audio.txt | |||
@@ -139,6 +139,10 @@ The driver checks PCI SSID and looks through the static configuration | |||
139 | table until any matching entry is found. If you have a new machine, | 139 | table until any matching entry is found. If you have a new machine, |
140 | you may see a message like below: | 140 | you may see a message like below: |
141 | ------------------------------------------------------------------------ | 141 | ------------------------------------------------------------------------ |
142 | hda_codec: ALC880: BIOS auto-probing. | ||
143 | ------------------------------------------------------------------------ | ||
144 | Meanwhile, in the earlier versions, you would see a message like: | ||
145 | ------------------------------------------------------------------------ | ||
142 | hda_codec: Unknown model for ALC880, trying auto-probe from BIOS... | 146 | hda_codec: Unknown model for ALC880, trying auto-probe from BIOS... |
143 | ------------------------------------------------------------------------ | 147 | ------------------------------------------------------------------------ |
144 | Even if you see such a message, DON'T PANIC. Take a deep breath and | 148 | Even if you see such a message, DON'T PANIC. Take a deep breath and |
@@ -403,6 +407,66 @@ re-configure based on that state, run like below: | |||
403 | ------------------------------------------------------------------------ | 407 | ------------------------------------------------------------------------ |
404 | 408 | ||
405 | 409 | ||
410 | Early Patching | ||
411 | ~~~~~~~~~~~~~~ | ||
412 | When CONFIG_SND_HDA_PATCH_LOADER=y is set, you can pass a "patch" as a | ||
413 | firmware file for modifying the HD-audio setup before initializing the | ||
414 | codec. This can work basically like the reconfiguration via sysfs in | ||
415 | the above, but it does it before the first codec configuration. | ||
416 | |||
417 | A patch file is a plain text file which looks like below: | ||
418 | |||
419 | ------------------------------------------------------------------------ | ||
420 | [codec] | ||
421 | 0x12345678 0xabcd1234 2 | ||
422 | |||
423 | [model] | ||
424 | auto | ||
425 | |||
426 | [pincfg] | ||
427 | 0x12 0x411111f0 | ||
428 | |||
429 | [verb] | ||
430 | 0x20 0x500 0x03 | ||
431 | 0x20 0x400 0xff | ||
432 | |||
433 | [hint] | ||
434 | hp_detect = yes | ||
435 | ------------------------------------------------------------------------ | ||
436 | |||
437 | The file needs to have a line `[codec]`. The next line should contain | ||
438 | three numbers indicating the codec vendor-id (0x12345678 in the | ||
439 | example), the codec subsystem-id (0xabcd1234) and the address (2) of | ||
440 | the codec. The rest patch entries are applied to this specified codec | ||
441 | until another codec entry is given. | ||
442 | |||
443 | The `[model]` line allows to change the model name of the each codec. | ||
444 | In the example above, it will be changed to model=auto. | ||
445 | Note that this overrides the module option. | ||
446 | |||
447 | After the `[pincfg]` line, the contents are parsed as the initial | ||
448 | default pin-configurations just like `user_pin_configs` sysfs above. | ||
449 | The values can be shown in user_pin_configs sysfs file, too. | ||
450 | |||
451 | Similarly, the lines after `[verb]` are parsed as `init_verbs` | ||
452 | sysfs entries, and the lines after `[hint]` are parsed as `hints` | ||
453 | sysfs entries, respectively. | ||
454 | |||
455 | The hd-audio driver reads the file via request_firmware(). Thus, | ||
456 | a patch file has to be located on the appropriate firmware path, | ||
457 | typically, /lib/firmware. For example, when you pass the option | ||
458 | `patch=hda-init.fw`, the file /lib/firmware/hda-init-fw must be | ||
459 | present. | ||
460 | |||
461 | The patch module option is specific to each card instance, and you | ||
462 | need to give one file name for each instance, separated by commas. | ||
463 | For example, if you have two cards, one for an on-board analog and one | ||
464 | for an HDMI video board, you may pass patch option like below: | ||
465 | ------------------------------------------------------------------------ | ||
466 | options snd-hda-intel patch=on-board-patch,hdmi-patch | ||
467 | ------------------------------------------------------------------------ | ||
468 | |||
469 | |||
406 | Power-Saving | 470 | Power-Saving |
407 | ~~~~~~~~~~~~ | 471 | ~~~~~~~~~~~~ |
408 | The power-saving is a kind of auto-suspend of the device. When the | 472 | The power-saving is a kind of auto-suspend of the device. When the |
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 322a00bb99d9..2dbff53369d0 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
@@ -19,6 +19,7 @@ Currently, these files might (depending on your configuration) | |||
19 | show up in /proc/sys/kernel: | 19 | show up in /proc/sys/kernel: |
20 | - acpi_video_flags | 20 | - acpi_video_flags |
21 | - acct | 21 | - acct |
22 | - callhome [ S390 only ] | ||
22 | - auto_msgmni | 23 | - auto_msgmni |
23 | - core_pattern | 24 | - core_pattern |
24 | - core_uses_pid | 25 | - core_uses_pid |
@@ -91,6 +92,21 @@ valid for 30 seconds. | |||
91 | 92 | ||
92 | ============================================================== | 93 | ============================================================== |
93 | 94 | ||
95 | callhome: | ||
96 | |||
97 | Controls the kernel's callhome behavior in case of a kernel panic. | ||
98 | |||
99 | The s390 hardware allows an operating system to send a notification | ||
100 | to a service organization (callhome) in case of an operating system panic. | ||
101 | |||
102 | When the value in this file is 0 (which is the default behavior) | ||
103 | nothing happens in case of a kernel panic. If this value is set to "1" | ||
104 | the complete kernel oops message is send to the IBM customer service | ||
105 | organization in case the mainframe the Linux operating system is running | ||
106 | on has a service contract with IBM. | ||
107 | |||
108 | ============================================================== | ||
109 | |||
94 | core_pattern: | 110 | core_pattern: |
95 | 111 | ||
96 | core_pattern is used to specify a core dumpfile pattern name. | 112 | core_pattern is used to specify a core dumpfile pattern name. |
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index f157d7594ea7..2bcc8d4dea29 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt | |||
@@ -83,6 +83,15 @@ When reading one of these enable files, there are four results: | |||
83 | X - there is a mixture of events enabled and disabled | 83 | X - there is a mixture of events enabled and disabled |
84 | ? - this file does not affect any event | 84 | ? - this file does not affect any event |
85 | 85 | ||
86 | 2.3 Boot option | ||
87 | --------------- | ||
88 | |||
89 | In order to facilitate early boot debugging, use boot option: | ||
90 | |||
91 | trace_event=[event-list] | ||
92 | |||
93 | The format of this boot option is the same as described in section 2.1. | ||
94 | |||
86 | 3. Defining an event-enabled tracepoint | 95 | 3. Defining an event-enabled tracepoint |
87 | ======================================= | 96 | ======================================= |
88 | 97 | ||
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index a39b3c749de5..355d0f1f8c50 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt | |||
@@ -85,26 +85,19 @@ of ftrace. Here is a list of some of the key files: | |||
85 | This file holds the output of the trace in a human | 85 | This file holds the output of the trace in a human |
86 | readable format (described below). | 86 | readable format (described below). |
87 | 87 | ||
88 | latency_trace: | ||
89 | |||
90 | This file shows the same trace but the information | ||
91 | is organized more to display possible latencies | ||
92 | in the system (described below). | ||
93 | |||
94 | trace_pipe: | 88 | trace_pipe: |
95 | 89 | ||
96 | The output is the same as the "trace" file but this | 90 | The output is the same as the "trace" file but this |
97 | file is meant to be streamed with live tracing. | 91 | file is meant to be streamed with live tracing. |
98 | Reads from this file will block until new data | 92 | Reads from this file will block until new data is |
99 | is retrieved. Unlike the "trace" and "latency_trace" | 93 | retrieved. Unlike the "trace" file, this file is a |
100 | files, this file is a consumer. This means reading | 94 | consumer. This means reading from this file causes |
101 | from this file causes sequential reads to display | 95 | sequential reads to display more current data. Once |
102 | more current data. Once data is read from this | 96 | data is read from this file, it is consumed, and |
103 | file, it is consumed, and will not be read | 97 | will not be read again with a sequential read. The |
104 | again with a sequential read. The "trace" and | 98 | "trace" file is static, and if the tracer is not |
105 | "latency_trace" files are static, and if the | 99 | adding more data,they will display the same |
106 | tracer is not adding more data, they will display | 100 | information every time they are read. |
107 | the same information every time they are read. | ||
108 | 101 | ||
109 | trace_options: | 102 | trace_options: |
110 | 103 | ||
@@ -117,10 +110,10 @@ of ftrace. Here is a list of some of the key files: | |||
117 | Some of the tracers record the max latency. | 110 | Some of the tracers record the max latency. |
118 | For example, the time interrupts are disabled. | 111 | For example, the time interrupts are disabled. |
119 | This time is saved in this file. The max trace | 112 | This time is saved in this file. The max trace |
120 | will also be stored, and displayed by either | 113 | will also be stored, and displayed by "trace". |
121 | "trace" or "latency_trace". A new max trace will | 114 | A new max trace will only be recorded if the |
122 | only be recorded if the latency is greater than | 115 | latency is greater than the value in this |
123 | the value in this file. (in microseconds) | 116 | file. (in microseconds) |
124 | 117 | ||
125 | buffer_size_kb: | 118 | buffer_size_kb: |
126 | 119 | ||
@@ -210,7 +203,7 @@ Here is the list of current tracers that may be configured. | |||
210 | the trace with the longest max latency. | 203 | the trace with the longest max latency. |
211 | See tracing_max_latency. When a new max is recorded, | 204 | See tracing_max_latency. When a new max is recorded, |
212 | it replaces the old trace. It is best to view this | 205 | it replaces the old trace. It is best to view this |
213 | trace via the latency_trace file. | 206 | trace with the latency-format option enabled. |
214 | 207 | ||
215 | "preemptoff" | 208 | "preemptoff" |
216 | 209 | ||
@@ -307,8 +300,8 @@ the lowest priority thread (pid 0). | |||
307 | Latency trace format | 300 | Latency trace format |
308 | -------------------- | 301 | -------------------- |
309 | 302 | ||
310 | For traces that display latency times, the latency_trace file | 303 | When the latency-format option is enabled, the trace file gives |
311 | gives somewhat more information to see why a latency happened. | 304 | somewhat more information to see why a latency happened. |
312 | Here is a typical trace. | 305 | Here is a typical trace. |
313 | 306 | ||
314 | # tracer: irqsoff | 307 | # tracer: irqsoff |
@@ -380,9 +373,10 @@ explains which is which. | |||
380 | 373 | ||
381 | The above is mostly meaningful for kernel developers. | 374 | The above is mostly meaningful for kernel developers. |
382 | 375 | ||
383 | time: This differs from the trace file output. The trace file output | 376 | time: When the latency-format option is enabled, the trace file |
384 | includes an absolute timestamp. The timestamp used by the | 377 | output includes a timestamp relative to the start of the |
385 | latency_trace file is relative to the start of the trace. | 378 | trace. This differs from the output when latency-format |
379 | is disabled, which includes an absolute timestamp. | ||
386 | 380 | ||
387 | delay: This is just to help catch your eye a bit better. And | 381 | delay: This is just to help catch your eye a bit better. And |
388 | needs to be fixed to be only relative to the same CPU. | 382 | needs to be fixed to be only relative to the same CPU. |
@@ -440,7 +434,8 @@ Here are the available options: | |||
440 | sym-addr: | 434 | sym-addr: |
441 | bash-4000 [01] 1477.606694: simple_strtoul <c0339346> | 435 | bash-4000 [01] 1477.606694: simple_strtoul <c0339346> |
442 | 436 | ||
443 | verbose - This deals with the latency_trace file. | 437 | verbose - This deals with the trace file when the |
438 | latency-format option is enabled. | ||
444 | 439 | ||
445 | bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ | 440 | bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ |
446 | (+0.000ms): simple_strtoul (strict_strtoul) | 441 | (+0.000ms): simple_strtoul (strict_strtoul) |
@@ -472,7 +467,7 @@ Here are the available options: | |||
472 | the app is no longer running | 467 | the app is no longer running |
473 | 468 | ||
474 | The lookup is performed when you read | 469 | The lookup is performed when you read |
475 | trace,trace_pipe,latency_trace. Example: | 470 | trace,trace_pipe. Example: |
476 | 471 | ||
477 | a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 | 472 | a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 |
478 | x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] | 473 | x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] |
@@ -481,6 +476,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] | |||
481 | every scheduling event. Will add overhead if | 476 | every scheduling event. Will add overhead if |
482 | there's a lot of tasks running at once. | 477 | there's a lot of tasks running at once. |
483 | 478 | ||
479 | latency-format - This option changes the trace. When | ||
480 | it is enabled, the trace displays | ||
481 | additional information about the | ||
482 | latencies, as described in "Latency | ||
483 | trace format". | ||
484 | 484 | ||
485 | sched_switch | 485 | sched_switch |
486 | ------------ | 486 | ------------ |
@@ -596,12 +596,13 @@ To reset the maximum, echo 0 into tracing_max_latency. Here is | |||
596 | an example: | 596 | an example: |
597 | 597 | ||
598 | # echo irqsoff > current_tracer | 598 | # echo irqsoff > current_tracer |
599 | # echo latency-format > trace_options | ||
599 | # echo 0 > tracing_max_latency | 600 | # echo 0 > tracing_max_latency |
600 | # echo 1 > tracing_enabled | 601 | # echo 1 > tracing_enabled |
601 | # ls -ltr | 602 | # ls -ltr |
602 | [...] | 603 | [...] |
603 | # echo 0 > tracing_enabled | 604 | # echo 0 > tracing_enabled |
604 | # cat latency_trace | 605 | # cat trace |
605 | # tracer: irqsoff | 606 | # tracer: irqsoff |
606 | # | 607 | # |
607 | irqsoff latency trace v1.1.5 on 2.6.26 | 608 | irqsoff latency trace v1.1.5 on 2.6.26 |
@@ -703,12 +704,13 @@ which preemption was disabled. The control of preemptoff tracer | |||
703 | is much like the irqsoff tracer. | 704 | is much like the irqsoff tracer. |
704 | 705 | ||
705 | # echo preemptoff > current_tracer | 706 | # echo preemptoff > current_tracer |
707 | # echo latency-format > trace_options | ||
706 | # echo 0 > tracing_max_latency | 708 | # echo 0 > tracing_max_latency |
707 | # echo 1 > tracing_enabled | 709 | # echo 1 > tracing_enabled |
708 | # ls -ltr | 710 | # ls -ltr |
709 | [...] | 711 | [...] |
710 | # echo 0 > tracing_enabled | 712 | # echo 0 > tracing_enabled |
711 | # cat latency_trace | 713 | # cat trace |
712 | # tracer: preemptoff | 714 | # tracer: preemptoff |
713 | # | 715 | # |
714 | preemptoff latency trace v1.1.5 on 2.6.26-rc8 | 716 | preemptoff latency trace v1.1.5 on 2.6.26-rc8 |
@@ -850,12 +852,13 @@ Again, using this trace is much like the irqsoff and preemptoff | |||
850 | tracers. | 852 | tracers. |
851 | 853 | ||
852 | # echo preemptirqsoff > current_tracer | 854 | # echo preemptirqsoff > current_tracer |
855 | # echo latency-format > trace_options | ||
853 | # echo 0 > tracing_max_latency | 856 | # echo 0 > tracing_max_latency |
854 | # echo 1 > tracing_enabled | 857 | # echo 1 > tracing_enabled |
855 | # ls -ltr | 858 | # ls -ltr |
856 | [...] | 859 | [...] |
857 | # echo 0 > tracing_enabled | 860 | # echo 0 > tracing_enabled |
858 | # cat latency_trace | 861 | # cat trace |
859 | # tracer: preemptirqsoff | 862 | # tracer: preemptirqsoff |
860 | # | 863 | # |
861 | preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 | 864 | preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 |
@@ -1012,11 +1015,12 @@ Instead of performing an 'ls', we will run 'sleep 1' under | |||
1012 | 'chrt' which changes the priority of the task. | 1015 | 'chrt' which changes the priority of the task. |
1013 | 1016 | ||
1014 | # echo wakeup > current_tracer | 1017 | # echo wakeup > current_tracer |
1018 | # echo latency-format > trace_options | ||
1015 | # echo 0 > tracing_max_latency | 1019 | # echo 0 > tracing_max_latency |
1016 | # echo 1 > tracing_enabled | 1020 | # echo 1 > tracing_enabled |
1017 | # chrt -f 5 sleep 1 | 1021 | # chrt -f 5 sleep 1 |
1018 | # echo 0 > tracing_enabled | 1022 | # echo 0 > tracing_enabled |
1019 | # cat latency_trace | 1023 | # cat trace |
1020 | # tracer: wakeup | 1024 | # tracer: wakeup |
1021 | # | 1025 | # |
1022 | wakeup latency trace v1.1.5 on 2.6.26-rc8 | 1026 | wakeup latency trace v1.1.5 on 2.6.26-rc8 |
diff --git a/Documentation/trace/function-graph-fold.vim b/Documentation/trace/function-graph-fold.vim new file mode 100644 index 000000000000..0544b504c8b0 --- /dev/null +++ b/Documentation/trace/function-graph-fold.vim | |||
@@ -0,0 +1,42 @@ | |||
1 | " Enable folding for ftrace function_graph traces. | ||
2 | " | ||
3 | " To use, :source this file while viewing a function_graph trace, or use vim's | ||
4 | " -S option to load from the command-line together with a trace. You can then | ||
5 | " use the usual vim fold commands, such as "za", to open and close nested | ||
6 | " functions. While closed, a fold will show the total time taken for a call, | ||
7 | " as would normally appear on the line with the closing brace. Folded | ||
8 | " functions will not include finish_task_switch(), so folding should remain | ||
9 | " relatively sane even through a context switch. | ||
10 | " | ||
11 | " Note that this will almost certainly only work well with a | ||
12 | " single-CPU trace (e.g. trace-cmd report --cpu 1). | ||
13 | |||
14 | function! FunctionGraphFoldExpr(lnum) | ||
15 | let line = getline(a:lnum) | ||
16 | if line[-1:] == '{' | ||
17 | if line =~ 'finish_task_switch() {$' | ||
18 | return '>1' | ||
19 | endif | ||
20 | return 'a1' | ||
21 | elseif line[-1:] == '}' | ||
22 | return 's1' | ||
23 | else | ||
24 | return '=' | ||
25 | endif | ||
26 | endfunction | ||
27 | |||
28 | function! FunctionGraphFoldText() | ||
29 | let s = split(getline(v:foldstart), '|', 1) | ||
30 | if getline(v:foldend+1) =~ 'finish_task_switch() {$' | ||
31 | let s[2] = ' task switch ' | ||
32 | else | ||
33 | let e = split(getline(v:foldend), '|', 1) | ||
34 | let s[2] = e[2] | ||
35 | endif | ||
36 | return join(s, '|') | ||
37 | endfunction | ||
38 | |||
39 | setlocal foldexpr=FunctionGraphFoldExpr(v:lnum) | ||
40 | setlocal foldtext=FunctionGraphFoldText() | ||
41 | setlocal foldcolumn=12 | ||
42 | setlocal foldmethod=expr | ||
diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt new file mode 100644 index 000000000000..5b1d23d604c5 --- /dev/null +++ b/Documentation/trace/ring-buffer-design.txt | |||
@@ -0,0 +1,955 @@ | |||
1 | Lockless Ring Buffer Design | ||
2 | =========================== | ||
3 | |||
4 | Copyright 2009 Red Hat Inc. | ||
5 | Author: Steven Rostedt <srostedt@redhat.com> | ||
6 | License: The GNU Free Documentation License, Version 1.2 | ||
7 | (dual licensed under the GPL v2) | ||
8 | Reviewers: Mathieu Desnoyers, Huang Ying, Hidetoshi Seto, | ||
9 | and Frederic Weisbecker. | ||
10 | |||
11 | |||
12 | Written for: 2.6.31 | ||
13 | |||
14 | Terminology used in this Document | ||
15 | --------------------------------- | ||
16 | |||
17 | tail - where new writes happen in the ring buffer. | ||
18 | |||
19 | head - where new reads happen in the ring buffer. | ||
20 | |||
21 | producer - the task that writes into the ring buffer (same as writer) | ||
22 | |||
23 | writer - same as producer | ||
24 | |||
25 | consumer - the task that reads from the buffer (same as reader) | ||
26 | |||
27 | reader - same as consumer. | ||
28 | |||
29 | reader_page - A page outside the ring buffer used solely (for the most part) | ||
30 | by the reader. | ||
31 | |||
32 | head_page - a pointer to the page that the reader will use next | ||
33 | |||
34 | tail_page - a pointer to the page that will be written to next | ||
35 | |||
36 | commit_page - a pointer to the page with the last finished non nested write. | ||
37 | |||
38 | cmpxchg - hardware assisted atomic transaction that performs the following: | ||
39 | |||
40 | A = B iff previous A == C | ||
41 | |||
42 | R = cmpxchg(A, C, B) is saying that we replace A with B if and only if | ||
43 | current A is equal to C, and we put the old (current) A into R | ||
44 | |||
45 | R gets the previous A regardless if A is updated with B or not. | ||
46 | |||
47 | To see if the update was successful a compare of R == C may be used. | ||
48 | |||
49 | The Generic Ring Buffer | ||
50 | ----------------------- | ||
51 | |||
52 | The ring buffer can be used in either an overwrite mode or in | ||
53 | producer/consumer mode. | ||
54 | |||
55 | Producer/consumer mode is where the producer were to fill up the | ||
56 | buffer before the consumer could free up anything, the producer | ||
57 | will stop writing to the buffer. This will lose most recent events. | ||
58 | |||
59 | Overwrite mode is where the produce were to fill up the buffer | ||
60 | before the consumer could free up anything, the producer will | ||
61 | overwrite the older data. This will lose the oldest events. | ||
62 | |||
63 | No two writers can write at the same time (on the same per cpu buffer), | ||
64 | but a writer may interrupt another writer, but it must finish writing | ||
65 | before the previous writer may continue. This is very important to the | ||
66 | algorithm. The writers act like a "stack". The way interrupts works | ||
67 | enforces this behavior. | ||
68 | |||
69 | |||
70 | writer1 start | ||
71 | <preempted> writer2 start | ||
72 | <preempted> writer3 start | ||
73 | writer3 finishes | ||
74 | writer2 finishes | ||
75 | writer1 finishes | ||
76 | |||
77 | This is very much like a writer being preempted by an interrupt and | ||
78 | the interrupt doing a write as well. | ||
79 | |||
80 | Readers can happen at any time. But no two readers may run at the | ||
81 | same time, nor can a reader preempt/interrupt another reader. A reader | ||
82 | can not preempt/interrupt a writer, but it may read/consume from the | ||
83 | buffer at the same time as a writer is writing, but the reader must be | ||
84 | on another processor to do so. A reader may read on its own processor | ||
85 | and can be preempted by a writer. | ||
86 | |||
87 | A writer can preempt a reader, but a reader can not preempt a writer. | ||
88 | But a reader can read the buffer at the same time (on another processor) | ||
89 | as a writer. | ||
90 | |||
91 | The ring buffer is made up of a list of pages held together by a link list. | ||
92 | |||
93 | At initialization a reader page is allocated for the reader that is not | ||
94 | part of the ring buffer. | ||
95 | |||
96 | The head_page, tail_page and commit_page are all initialized to point | ||
97 | to the same page. | ||
98 | |||
99 | The reader page is initialized to have its next pointer pointing to | ||
100 | the head page, and its previous pointer pointing to a page before | ||
101 | the head page. | ||
102 | |||
103 | The reader has its own page to use. At start up time, this page is | ||
104 | allocated but is not attached to the list. When the reader wants | ||
105 | to read from the buffer, if its page is empty (like it is on start up) | ||
106 | it will swap its page with the head_page. The old reader page will | ||
107 | become part of the ring buffer and the head_page will be removed. | ||
108 | The page after the inserted page (old reader_page) will become the | ||
109 | new head page. | ||
110 | |||
111 | Once the new page is given to the reader, the reader could do what | ||
112 | it wants with it, as long as a writer has left that page. | ||
113 | |||
114 | A sample of how the reader page is swapped: Note this does not | ||
115 | show the head page in the buffer, it is for demonstrating a swap | ||
116 | only. | ||
117 | |||
118 | +------+ | ||
119 | |reader| RING BUFFER | ||
120 | |page | | ||
121 | +------+ | ||
122 | +---+ +---+ +---+ | ||
123 | | |-->| |-->| | | ||
124 | | |<--| |<--| | | ||
125 | +---+ +---+ +---+ | ||
126 | ^ | ^ | | ||
127 | | +-------------+ | | ||
128 | +-----------------+ | ||
129 | |||
130 | |||
131 | +------+ | ||
132 | |reader| RING BUFFER | ||
133 | |page |-------------------+ | ||
134 | +------+ v | ||
135 | | +---+ +---+ +---+ | ||
136 | | | |-->| |-->| | | ||
137 | | | |<--| |<--| |<-+ | ||
138 | | +---+ +---+ +---+ | | ||
139 | | ^ | ^ | | | ||
140 | | | +-------------+ | | | ||
141 | | +-----------------+ | | ||
142 | +------------------------------------+ | ||
143 | |||
144 | +------+ | ||
145 | |reader| RING BUFFER | ||
146 | |page |-------------------+ | ||
147 | +------+ <---------------+ v | ||
148 | | ^ +---+ +---+ +---+ | ||
149 | | | | |-->| |-->| | | ||
150 | | | | | | |<--| |<-+ | ||
151 | | | +---+ +---+ +---+ | | ||
152 | | | | ^ | | | ||
153 | | | +-------------+ | | | ||
154 | | +-----------------------------+ | | ||
155 | +------------------------------------+ | ||
156 | |||
157 | +------+ | ||
158 | |buffer| RING BUFFER | ||
159 | |page |-------------------+ | ||
160 | +------+ <---------------+ v | ||
161 | | ^ +---+ +---+ +---+ | ||
162 | | | | | | |-->| | | ||
163 | | | New | | | |<--| |<-+ | ||
164 | | | Reader +---+ +---+ +---+ | | ||
165 | | | page ----^ | | | ||
166 | | | | | | ||
167 | | +-----------------------------+ | | ||
168 | +------------------------------------+ | ||
169 | |||
170 | |||
171 | |||
172 | It is possible that the page swapped is the commit page and the tail page, | ||
173 | if what is in the ring buffer is less than what is held in a buffer page. | ||
174 | |||
175 | |||
176 | reader page commit page tail page | ||
177 | | | | | ||
178 | v | | | ||
179 | +---+ | | | ||
180 | | |<----------+ | | ||
181 | | |<------------------------+ | ||
182 | | |------+ | ||
183 | +---+ | | ||
184 | | | ||
185 | v | ||
186 | +---+ +---+ +---+ +---+ | ||
187 | <---| |--->| |--->| |--->| |---> | ||
188 | --->| |<---| |<---| |<---| |<--- | ||
189 | +---+ +---+ +---+ +---+ | ||
190 | |||
191 | This case is still valid for this algorithm. | ||
192 | When the writer leaves the page, it simply goes into the ring buffer | ||
193 | since the reader page still points to the next location in the ring | ||
194 | buffer. | ||
195 | |||
196 | |||
197 | The main pointers: | ||
198 | |||
199 | reader page - The page used solely by the reader and is not part | ||
200 | of the ring buffer (may be swapped in) | ||
201 | |||
202 | head page - the next page in the ring buffer that will be swapped | ||
203 | with the reader page. | ||
204 | |||
205 | tail page - the page where the next write will take place. | ||
206 | |||
207 | commit page - the page that last finished a write. | ||
208 | |||
209 | The commit page only is updated by the outer most writer in the | ||
210 | writer stack. A writer that preempts another writer will not move the | ||
211 | commit page. | ||
212 | |||
213 | When data is written into the ring buffer, a position is reserved | ||
214 | in the ring buffer and passed back to the writer. When the writer | ||
215 | is finished writing data into that position, it commits the write. | ||
216 | |||
217 | Another write (or a read) may take place at anytime during this | ||
218 | transaction. If another write happens it must finish before continuing | ||
219 | with the previous write. | ||
220 | |||
221 | |||
222 | Write reserve: | ||
223 | |||
224 | Buffer page | ||
225 | +---------+ | ||
226 | |written | | ||
227 | +---------+ <--- given back to writer (current commit) | ||
228 | |reserved | | ||
229 | +---------+ <--- tail pointer | ||
230 | | empty | | ||
231 | +---------+ | ||
232 | |||
233 | Write commit: | ||
234 | |||
235 | Buffer page | ||
236 | +---------+ | ||
237 | |written | | ||
238 | +---------+ | ||
239 | |written | | ||
240 | +---------+ <--- next positon for write (current commit) | ||
241 | | empty | | ||
242 | +---------+ | ||
243 | |||
244 | |||
245 | If a write happens after the first reserve: | ||
246 | |||
247 | Buffer page | ||
248 | +---------+ | ||
249 | |written | | ||
250 | +---------+ <-- current commit | ||
251 | |reserved | | ||
252 | +---------+ <--- given back to second writer | ||
253 | |reserved | | ||
254 | +---------+ <--- tail pointer | ||
255 | |||
256 | After second writer commits: | ||
257 | |||
258 | |||
259 | Buffer page | ||
260 | +---------+ | ||
261 | |written | | ||
262 | +---------+ <--(last full commit) | ||
263 | |reserved | | ||
264 | +---------+ | ||
265 | |pending | | ||
266 | |commit | | ||
267 | +---------+ <--- tail pointer | ||
268 | |||
269 | When the first writer commits: | ||
270 | |||
271 | Buffer page | ||
272 | +---------+ | ||
273 | |written | | ||
274 | +---------+ | ||
275 | |written | | ||
276 | +---------+ | ||
277 | |written | | ||
278 | +---------+ <--(last full commit and tail pointer) | ||
279 | |||
280 | |||
281 | The commit pointer points to the last write location that was | ||
282 | committed without preempting another write. When a write that | ||
283 | preempted another write is committed, it only becomes a pending commit | ||
284 | and will not be a full commit till all writes have been committed. | ||
285 | |||
286 | The commit page points to the page that has the last full commit. | ||
287 | The tail page points to the page with the last write (before | ||
288 | committing). | ||
289 | |||
290 | The tail page is always equal to or after the commit page. It may | ||
291 | be several pages ahead. If the tail page catches up to the commit | ||
292 | page then no more writes may take place (regardless of the mode | ||
293 | of the ring buffer: overwrite and produce/consumer). | ||
294 | |||
295 | The order of pages are: | ||
296 | |||
297 | head page | ||
298 | commit page | ||
299 | tail page | ||
300 | |||
301 | Possible scenario: | ||
302 | tail page | ||
303 | head page commit page | | ||
304 | | | | | ||
305 | v v v | ||
306 | +---+ +---+ +---+ +---+ | ||
307 | <---| |--->| |--->| |--->| |---> | ||
308 | --->| |<---| |<---| |<---| |<--- | ||
309 | +---+ +---+ +---+ +---+ | ||
310 | |||
311 | There is a special case that the head page is after either the commit page | ||
312 | and possibly the tail page. That is when the commit (and tail) page has been | ||
313 | swapped with the reader page. This is because the head page is always | ||
314 | part of the ring buffer, but the reader page is not. When ever there | ||
315 | has been less than a full page that has been committed inside the ring buffer, | ||
316 | and a reader swaps out a page, it will be swapping out the commit page. | ||
317 | |||
318 | |||
319 | reader page commit page tail page | ||
320 | | | | | ||
321 | v | | | ||
322 | +---+ | | | ||
323 | | |<----------+ | | ||
324 | | |<------------------------+ | ||
325 | | |------+ | ||
326 | +---+ | | ||
327 | | | ||
328 | v | ||
329 | +---+ +---+ +---+ +---+ | ||
330 | <---| |--->| |--->| |--->| |---> | ||
331 | --->| |<---| |<---| |<---| |<--- | ||
332 | +---+ +---+ +---+ +---+ | ||
333 | ^ | ||
334 | | | ||
335 | head page | ||
336 | |||
337 | |||
338 | In this case, the head page will not move when the tail and commit | ||
339 | move back into the ring buffer. | ||
340 | |||
341 | The reader can not swap a page into the ring buffer if the commit page | ||
342 | is still on that page. If the read meets the last commit (real commit | ||
343 | not pending or reserved), then there is nothing more to read. | ||
344 | The buffer is considered empty until another full commit finishes. | ||
345 | |||
346 | When the tail meets the head page, if the buffer is in overwrite mode, | ||
347 | the head page will be pushed ahead one. If the buffer is in producer/consumer | ||
348 | mode, the write will fail. | ||
349 | |||
350 | Overwrite mode: | ||
351 | |||
352 | tail page | ||
353 | | | ||
354 | v | ||
355 | +---+ +---+ +---+ +---+ | ||
356 | <---| |--->| |--->| |--->| |---> | ||
357 | --->| |<---| |<---| |<---| |<--- | ||
358 | +---+ +---+ +---+ +---+ | ||
359 | ^ | ||
360 | | | ||
361 | head page | ||
362 | |||
363 | |||
364 | tail page | ||
365 | | | ||
366 | v | ||
367 | +---+ +---+ +---+ +---+ | ||
368 | <---| |--->| |--->| |--->| |---> | ||
369 | --->| |<---| |<---| |<---| |<--- | ||
370 | +---+ +---+ +---+ +---+ | ||
371 | ^ | ||
372 | | | ||
373 | head page | ||
374 | |||
375 | |||
376 | tail page | ||
377 | | | ||
378 | v | ||
379 | +---+ +---+ +---+ +---+ | ||
380 | <---| |--->| |--->| |--->| |---> | ||
381 | --->| |<---| |<---| |<---| |<--- | ||
382 | +---+ +---+ +---+ +---+ | ||
383 | ^ | ||
384 | | | ||
385 | head page | ||
386 | |||
387 | Note, the reader page will still point to the previous head page. | ||
388 | But when a swap takes place, it will use the most recent head page. | ||
389 | |||
390 | |||
391 | Making the Ring Buffer Lockless: | ||
392 | -------------------------------- | ||
393 | |||
394 | The main idea behind the lockless algorithm is to combine the moving | ||
395 | of the head_page pointer with the swapping of pages with the reader. | ||
396 | State flags are placed inside the pointer to the page. To do this, | ||
397 | each page must be aligned in memory by 4 bytes. This will allow the 2 | ||
398 | least significant bits of the address to be used as flags. Since | ||
399 | they will always be zero for the address. To get the address, | ||
400 | simply mask out the flags. | ||
401 | |||
402 | MASK = ~3 | ||
403 | |||
404 | address & MASK | ||
405 | |||
406 | Two flags will be kept by these two bits: | ||
407 | |||
408 | HEADER - the page being pointed to is a head page | ||
409 | |||
410 | UPDATE - the page being pointed to is being updated by a writer | ||
411 | and was or is about to be a head page. | ||
412 | |||
413 | |||
414 | reader page | ||
415 | | | ||
416 | v | ||
417 | +---+ | ||
418 | | |------+ | ||
419 | +---+ | | ||
420 | | | ||
421 | v | ||
422 | +---+ +---+ +---+ +---+ | ||
423 | <---| |--->| |-H->| |--->| |---> | ||
424 | --->| |<---| |<---| |<---| |<--- | ||
425 | +---+ +---+ +---+ +---+ | ||
426 | |||
427 | |||
428 | The above pointer "-H->" would have the HEADER flag set. That is | ||
429 | the next page is the next page to be swapped out by the reader. | ||
430 | This pointer means the next page is the head page. | ||
431 | |||
432 | When the tail page meets the head pointer, it will use cmpxchg to | ||
433 | change the pointer to the UPDATE state: | ||
434 | |||
435 | |||
436 | tail page | ||
437 | | | ||
438 | v | ||
439 | +---+ +---+ +---+ +---+ | ||
440 | <---| |--->| |-H->| |--->| |---> | ||
441 | --->| |<---| |<---| |<---| |<--- | ||
442 | +---+ +---+ +---+ +---+ | ||
443 | |||
444 | tail page | ||
445 | | | ||
446 | v | ||
447 | +---+ +---+ +---+ +---+ | ||
448 | <---| |--->| |-U->| |--->| |---> | ||
449 | --->| |<---| |<---| |<---| |<--- | ||
450 | +---+ +---+ +---+ +---+ | ||
451 | |||
452 | "-U->" represents a pointer in the UPDATE state. | ||
453 | |||
454 | Any access to the reader will need to take some sort of lock to serialize | ||
455 | the readers. But the writers will never take a lock to write to the | ||
456 | ring buffer. This means we only need to worry about a single reader, | ||
457 | and writes only preempt in "stack" formation. | ||
458 | |||
459 | When the reader tries to swap the page with the ring buffer, it | ||
460 | will also use cmpxchg. If the flag bit in the pointer to the | ||
461 | head page does not have the HEADER flag set, the compare will fail | ||
462 | and the reader will need to look for the new head page and try again. | ||
463 | Note, the flag UPDATE and HEADER are never set at the same time. | ||
464 | |||
465 | The reader swaps the reader page as follows: | ||
466 | |||
467 | +------+ | ||
468 | |reader| RING BUFFER | ||
469 | |page | | ||
470 | +------+ | ||
471 | +---+ +---+ +---+ | ||
472 | | |--->| |--->| | | ||
473 | | |<---| |<---| | | ||
474 | +---+ +---+ +---+ | ||
475 | ^ | ^ | | ||
476 | | +---------------+ | | ||
477 | +-----H-------------+ | ||
478 | |||
479 | The reader sets the reader page next pointer as HEADER to the page after | ||
480 | the head page. | ||
481 | |||
482 | |||
483 | +------+ | ||
484 | |reader| RING BUFFER | ||
485 | |page |-------H-----------+ | ||
486 | +------+ v | ||
487 | | +---+ +---+ +---+ | ||
488 | | | |--->| |--->| | | ||
489 | | | |<---| |<---| |<-+ | ||
490 | | +---+ +---+ +---+ | | ||
491 | | ^ | ^ | | | ||
492 | | | +---------------+ | | | ||
493 | | +-----H-------------+ | | ||
494 | +--------------------------------------+ | ||
495 | |||
496 | It does a cmpxchg with the pointer to the previous head page to make it | ||
497 | point to the reader page. Note that the new pointer does not have the HEADER | ||
498 | flag set. This action atomically moves the head page forward. | ||
499 | |||
500 | +------+ | ||
501 | |reader| RING BUFFER | ||
502 | |page |-------H-----------+ | ||
503 | +------+ v | ||
504 | | ^ +---+ +---+ +---+ | ||
505 | | | | |-->| |-->| | | ||
506 | | | | |<--| |<--| |<-+ | ||
507 | | | +---+ +---+ +---+ | | ||
508 | | | | ^ | | | ||
509 | | | +-------------+ | | | ||
510 | | +-----------------------------+ | | ||
511 | +------------------------------------+ | ||
512 | |||
513 | After the new head page is set, the previous pointer of the head page is | ||
514 | updated to the reader page. | ||
515 | |||
516 | +------+ | ||
517 | |reader| RING BUFFER | ||
518 | |page |-------H-----------+ | ||
519 | +------+ <---------------+ v | ||
520 | | ^ +---+ +---+ +---+ | ||
521 | | | | |-->| |-->| | | ||
522 | | | | | | |<--| |<-+ | ||
523 | | | +---+ +---+ +---+ | | ||
524 | | | | ^ | | | ||
525 | | | +-------------+ | | | ||
526 | | +-----------------------------+ | | ||
527 | +------------------------------------+ | ||
528 | |||
529 | +------+ | ||
530 | |buffer| RING BUFFER | ||
531 | |page |-------H-----------+ <--- New head page | ||
532 | +------+ <---------------+ v | ||
533 | | ^ +---+ +---+ +---+ | ||
534 | | | | | | |-->| | | ||
535 | | | New | | | |<--| |<-+ | ||
536 | | | Reader +---+ +---+ +---+ | | ||
537 | | | page ----^ | | | ||
538 | | | | | | ||
539 | | +-----------------------------+ | | ||
540 | +------------------------------------+ | ||
541 | |||
542 | Another important point. The page that the reader page points back to | ||
543 | by its previous pointer (the one that now points to the new head page) | ||
544 | never points back to the reader page. That is because the reader page is | ||
545 | not part of the ring buffer. Traversing the ring buffer via the next pointers | ||
546 | will always stay in the ring buffer. Traversing the ring buffer via the | ||
547 | prev pointers may not. | ||
548 | |||
549 | Note, the way to determine a reader page is simply by examining the previous | ||
550 | pointer of the page. If the next pointer of the previous page does not | ||
551 | point back to the original page, then the original page is a reader page: | ||
552 | |||
553 | |||
554 | +--------+ | ||
555 | | reader | next +----+ | ||
556 | | page |-------->| |<====== (buffer page) | ||
557 | +--------+ +----+ | ||
558 | | | ^ | ||
559 | | v | next | ||
560 | prev | +----+ | ||
561 | +------------->| | | ||
562 | +----+ | ||
563 | |||
564 | The way the head page moves forward: | ||
565 | |||
566 | When the tail page meets the head page and the buffer is in overwrite mode | ||
567 | and more writes take place, the head page must be moved forward before the | ||
568 | writer may move the tail page. The way this is done is that the writer | ||
569 | performs a cmpxchg to convert the pointer to the head page from the HEADER | ||
570 | flag to have the UPDATE flag set. Once this is done, the reader will | ||
571 | not be able to swap the head page from the buffer, nor will it be able to | ||
572 | move the head page, until the writer is finished with the move. | ||
573 | |||
574 | This eliminates any races that the reader can have on the writer. The reader | ||
575 | must spin, and this is why the reader can not preempt the writer. | ||
576 | |||
577 | tail page | ||
578 | | | ||
579 | v | ||
580 | +---+ +---+ +---+ +---+ | ||
581 | <---| |--->| |-H->| |--->| |---> | ||
582 | --->| |<---| |<---| |<---| |<--- | ||
583 | +---+ +---+ +---+ +---+ | ||
584 | |||
585 | tail page | ||
586 | | | ||
587 | v | ||
588 | +---+ +---+ +---+ +---+ | ||
589 | <---| |--->| |-U->| |--->| |---> | ||
590 | --->| |<---| |<---| |<---| |<--- | ||
591 | +---+ +---+ +---+ +---+ | ||
592 | |||
593 | The following page will be made into the new head page. | ||
594 | |||
595 | tail page | ||
596 | | | ||
597 | v | ||
598 | +---+ +---+ +---+ +---+ | ||
599 | <---| |--->| |-U->| |-H->| |---> | ||
600 | --->| |<---| |<---| |<---| |<--- | ||
601 | +---+ +---+ +---+ +---+ | ||
602 | |||
603 | After the new head page has been set, we can set the old head page | ||
604 | pointer back to NORMAL. | ||
605 | |||
606 | tail page | ||
607 | | | ||
608 | v | ||
609 | +---+ +---+ +---+ +---+ | ||
610 | <---| |--->| |--->| |-H->| |---> | ||
611 | --->| |<---| |<---| |<---| |<--- | ||
612 | +---+ +---+ +---+ +---+ | ||
613 | |||
614 | After the head page has been moved, the tail page may now move forward. | ||
615 | |||
616 | tail page | ||
617 | | | ||
618 | v | ||
619 | +---+ +---+ +---+ +---+ | ||
620 | <---| |--->| |--->| |-H->| |---> | ||
621 | --->| |<---| |<---| |<---| |<--- | ||
622 | +---+ +---+ +---+ +---+ | ||
623 | |||
624 | |||
625 | The above are the trivial updates. Now for the more complex scenarios. | ||
626 | |||
627 | |||
628 | As stated before, if enough writes preempt the first write, the | ||
629 | tail page may make it all the way around the buffer and meet the commit | ||
630 | page. At this time, we must start dropping writes (usually with some kind | ||
631 | of warning to the user). But what happens if the commit was still on the | ||
632 | reader page? The commit page is not part of the ring buffer. The tail page | ||
633 | must account for this. | ||
634 | |||
635 | |||
636 | reader page commit page | ||
637 | | | | ||
638 | v | | ||
639 | +---+ | | ||
640 | | |<----------+ | ||
641 | | | | ||
642 | | |------+ | ||
643 | +---+ | | ||
644 | | | ||
645 | v | ||
646 | +---+ +---+ +---+ +---+ | ||
647 | <---| |--->| |-H->| |--->| |---> | ||
648 | --->| |<---| |<---| |<---| |<--- | ||
649 | +---+ +---+ +---+ +---+ | ||
650 | ^ | ||
651 | | | ||
652 | tail page | ||
653 | |||
654 | If the tail page were to simply push the head page forward, the commit when | ||
655 | leaving the reader page would not be pointing to the correct page. | ||
656 | |||
657 | The solution to this is to test if the commit page is on the reader page | ||
658 | before pushing the head page. If it is, then it can be assumed that the | ||
659 | tail page wrapped the buffer, and we must drop new writes. | ||
660 | |||
661 | This is not a race condition, because the commit page can only be moved | ||
662 | by the outter most writer (the writer that was preempted). | ||
663 | This means that the commit will not move while a writer is moving the | ||
664 | tail page. The reader can not swap the reader page if it is also being | ||
665 | used as the commit page. The reader can simply check that the commit | ||
666 | is off the reader page. Once the commit page leaves the reader page | ||
667 | it will never go back on it unless a reader does another swap with the | ||
668 | buffer page that is also the commit page. | ||
669 | |||
670 | |||
671 | Nested writes | ||
672 | ------------- | ||
673 | |||
674 | In the pushing forward of the tail page we must first push forward | ||
675 | the head page if the head page is the next page. If the head page | ||
676 | is not the next page, the tail page is simply updated with a cmpxchg. | ||
677 | |||
678 | Only writers move the tail page. This must be done atomically to protect | ||
679 | against nested writers. | ||
680 | |||
681 | temp_page = tail_page | ||
682 | next_page = temp_page->next | ||
683 | cmpxchg(tail_page, temp_page, next_page) | ||
684 | |||
685 | The above will update the tail page if it is still pointing to the expected | ||
686 | page. If this fails, a nested write pushed it forward, the the current write | ||
687 | does not need to push it. | ||
688 | |||
689 | |||
690 | temp page | ||
691 | | | ||
692 | v | ||
693 | tail page | ||
694 | | | ||
695 | v | ||
696 | +---+ +---+ +---+ +---+ | ||
697 | <---| |--->| |--->| |--->| |---> | ||
698 | --->| |<---| |<---| |<---| |<--- | ||
699 | +---+ +---+ +---+ +---+ | ||
700 | |||
701 | Nested write comes in and moves the tail page forward: | ||
702 | |||
703 | tail page (moved by nested writer) | ||
704 | temp page | | ||
705 | | | | ||
706 | v v | ||
707 | +---+ +---+ +---+ +---+ | ||
708 | <---| |--->| |--->| |--->| |---> | ||
709 | --->| |<---| |<---| |<---| |<--- | ||
710 | +---+ +---+ +---+ +---+ | ||
711 | |||
712 | The above would fail the cmpxchg, but since the tail page has already | ||
713 | been moved forward, the writer will just try again to reserve storage | ||
714 | on the new tail page. | ||
715 | |||
716 | But the moving of the head page is a bit more complex. | ||
717 | |||
718 | tail page | ||
719 | | | ||
720 | v | ||
721 | +---+ +---+ +---+ +---+ | ||
722 | <---| |--->| |-H->| |--->| |---> | ||
723 | --->| |<---| |<---| |<---| |<--- | ||
724 | +---+ +---+ +---+ +---+ | ||
725 | |||
726 | The write converts the head page pointer to UPDATE. | ||
727 | |||
728 | tail page | ||
729 | | | ||
730 | v | ||
731 | +---+ +---+ +---+ +---+ | ||
732 | <---| |--->| |-U->| |--->| |---> | ||
733 | --->| |<---| |<---| |<---| |<--- | ||
734 | +---+ +---+ +---+ +---+ | ||
735 | |||
736 | But if a nested writer preempts here. It will see that the next | ||
737 | page is a head page, but it is also nested. It will detect that | ||
738 | it is nested and will save that information. The detection is the | ||
739 | fact that it sees the UPDATE flag instead of a HEADER or NORMAL | ||
740 | pointer. | ||
741 | |||
742 | The nested writer will set the new head page pointer. | ||
743 | |||
744 | tail page | ||
745 | | | ||
746 | v | ||
747 | +---+ +---+ +---+ +---+ | ||
748 | <---| |--->| |-U->| |-H->| |---> | ||
749 | --->| |<---| |<---| |<---| |<--- | ||
750 | +---+ +---+ +---+ +---+ | ||
751 | |||
752 | But it will not reset the update back to normal. Only the writer | ||
753 | that converted a pointer from HEAD to UPDATE will convert it back | ||
754 | to NORMAL. | ||
755 | |||
756 | tail page | ||
757 | | | ||
758 | v | ||
759 | +---+ +---+ +---+ +---+ | ||
760 | <---| |--->| |-U->| |-H->| |---> | ||
761 | --->| |<---| |<---| |<---| |<--- | ||
762 | +---+ +---+ +---+ +---+ | ||
763 | |||
764 | After the nested writer finishes, the outer most writer will convert | ||
765 | the UPDATE pointer to NORMAL. | ||
766 | |||
767 | |||
768 | tail page | ||
769 | | | ||
770 | v | ||
771 | +---+ +---+ +---+ +---+ | ||
772 | <---| |--->| |--->| |-H->| |---> | ||
773 | --->| |<---| |<---| |<---| |<--- | ||
774 | +---+ +---+ +---+ +---+ | ||
775 | |||
776 | |||
777 | It can be even more complex if several nested writes came in and moved | ||
778 | the tail page ahead several pages: | ||
779 | |||
780 | |||
781 | (first writer) | ||
782 | |||
783 | tail page | ||
784 | | | ||
785 | v | ||
786 | +---+ +---+ +---+ +---+ | ||
787 | <---| |--->| |-H->| |--->| |---> | ||
788 | --->| |<---| |<---| |<---| |<--- | ||
789 | +---+ +---+ +---+ +---+ | ||
790 | |||
791 | The write converts the head page pointer to UPDATE. | ||
792 | |||
793 | tail page | ||
794 | | | ||
795 | v | ||
796 | +---+ +---+ +---+ +---+ | ||
797 | <---| |--->| |-U->| |--->| |---> | ||
798 | --->| |<---| |<---| |<---| |<--- | ||
799 | +---+ +---+ +---+ +---+ | ||
800 | |||
801 | Next writer comes in, and sees the update and sets up the new | ||
802 | head page. | ||
803 | |||
804 | (second writer) | ||
805 | |||
806 | tail page | ||
807 | | | ||
808 | v | ||
809 | +---+ +---+ +---+ +---+ | ||
810 | <---| |--->| |-U->| |-H->| |---> | ||
811 | --->| |<---| |<---| |<---| |<--- | ||
812 | +---+ +---+ +---+ +---+ | ||
813 | |||
814 | The nested writer moves the tail page forward. But does not set the old | ||
815 | update page to NORMAL because it is not the outer most writer. | ||
816 | |||
817 | tail page | ||
818 | | | ||
819 | v | ||
820 | +---+ +---+ +---+ +---+ | ||
821 | <---| |--->| |-U->| |-H->| |---> | ||
822 | --->| |<---| |<---| |<---| |<--- | ||
823 | +---+ +---+ +---+ +---+ | ||
824 | |||
825 | Another writer preempts and sees the page after the tail page is a head page. | ||
826 | It changes it from HEAD to UPDATE. | ||
827 | |||
828 | (third writer) | ||
829 | |||
830 | tail page | ||
831 | | | ||
832 | v | ||
833 | +---+ +---+ +---+ +---+ | ||
834 | <---| |--->| |-U->| |-U->| |---> | ||
835 | --->| |<---| |<---| |<---| |<--- | ||
836 | +---+ +---+ +---+ +---+ | ||
837 | |||
838 | The writer will move the head page forward: | ||
839 | |||
840 | |||
841 | (third writer) | ||
842 | |||
843 | tail page | ||
844 | | | ||
845 | v | ||
846 | +---+ +---+ +---+ +---+ | ||
847 | <---| |--->| |-U->| |-U->| |-H-> | ||
848 | --->| |<---| |<---| |<---| |<--- | ||
849 | +---+ +---+ +---+ +---+ | ||
850 | |||
851 | But now that the third writer did change the HEAD flag to UPDATE it | ||
852 | will convert it to normal: | ||
853 | |||
854 | |||
855 | (third writer) | ||
856 | |||
857 | tail page | ||
858 | | | ||
859 | v | ||
860 | +---+ +---+ +---+ +---+ | ||
861 | <---| |--->| |-U->| |--->| |-H-> | ||
862 | --->| |<---| |<---| |<---| |<--- | ||
863 | +---+ +---+ +---+ +---+ | ||
864 | |||
865 | |||
866 | Then it will move the tail page, and return back to the second writer. | ||
867 | |||
868 | |||
869 | (second writer) | ||
870 | |||
871 | tail page | ||
872 | | | ||
873 | v | ||
874 | +---+ +---+ +---+ +---+ | ||
875 | <---| |--->| |-U->| |--->| |-H-> | ||
876 | --->| |<---| |<---| |<---| |<--- | ||
877 | +---+ +---+ +---+ +---+ | ||
878 | |||
879 | |||
880 | The second writer will fail to move the tail page because it was already | ||
881 | moved, so it will try again and add its data to the new tail page. | ||
882 | It will return to the first writer. | ||
883 | |||
884 | |||
885 | (first writer) | ||
886 | |||
887 | tail page | ||
888 | | | ||
889 | v | ||
890 | +---+ +---+ +---+ +---+ | ||
891 | <---| |--->| |-U->| |--->| |-H-> | ||
892 | --->| |<---| |<---| |<---| |<--- | ||
893 | +---+ +---+ +---+ +---+ | ||
894 | |||
895 | The first writer can not know atomically test if the tail page moved | ||
896 | while it updates the HEAD page. It will then update the head page to | ||
897 | what it thinks is the new head page. | ||
898 | |||
899 | |||
900 | (first writer) | ||
901 | |||
902 | tail page | ||
903 | | | ||
904 | v | ||
905 | +---+ +---+ +---+ +---+ | ||
906 | <---| |--->| |-U->| |-H->| |-H-> | ||
907 | --->| |<---| |<---| |<---| |<--- | ||
908 | +---+ +---+ +---+ +---+ | ||
909 | |||
910 | Since the cmpxchg returns the old value of the pointer the first writer | ||
911 | will see it succeeded in updating the pointer from NORMAL to HEAD. | ||
912 | But as we can see, this is not good enough. It must also check to see | ||
913 | if the tail page is either where it use to be or on the next page: | ||
914 | |||
915 | |||
916 | (first writer) | ||
917 | |||
918 | A B tail page | ||
919 | | | | | ||
920 | v v v | ||
921 | +---+ +---+ +---+ +---+ | ||
922 | <---| |--->| |-U->| |-H->| |-H-> | ||
923 | --->| |<---| |<---| |<---| |<--- | ||
924 | +---+ +---+ +---+ +---+ | ||
925 | |||
926 | If tail page != A and tail page does not equal B, then it must reset the | ||
927 | pointer back to NORMAL. The fact that it only needs to worry about | ||
928 | nested writers, it only needs to check this after setting the HEAD page. | ||
929 | |||
930 | |||
931 | (first writer) | ||
932 | |||
933 | A B tail page | ||
934 | | | | | ||
935 | v v v | ||
936 | +---+ +---+ +---+ +---+ | ||
937 | <---| |--->| |-U->| |--->| |-H-> | ||
938 | --->| |<---| |<---| |<---| |<--- | ||
939 | +---+ +---+ +---+ +---+ | ||
940 | |||
941 | Now the writer can update the head page. This is also why the head page must | ||
942 | remain in UPDATE and only reset by the outer most writer. This prevents | ||
943 | the reader from seeing the incorrect head page. | ||
944 | |||
945 | |||
946 | (first writer) | ||
947 | |||
948 | A B tail page | ||
949 | | | | | ||
950 | v v v | ||
951 | +---+ +---+ +---+ +---+ | ||
952 | <---| |--->| |--->| |--->| |-H-> | ||
953 | --->| |<---| |<---| |<---| |<--- | ||
954 | +---+ +---+ +---+ +---+ | ||
955 | |||