aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/RCU
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/RCU')
-rw-r--r--Documentation/RCU/RTFP.txt2
-rw-r--r--Documentation/RCU/checklist.txt17
-rw-r--r--Documentation/RCU/listRCU.txt2
-rw-r--r--Documentation/RCU/rcuref.txt61
-rw-r--r--Documentation/RCU/trace.txt396
-rw-r--r--Documentation/RCU/whatisRCU.txt17
6 files changed, 299 insertions, 196 deletions
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index 7c1dfb19fc40..7f40c72a9c51 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -186,7 +186,7 @@ Bibtex Entries
186 186
187@article{Kung80 187@article{Kung80
188,author="H. T. Kung and Q. Lehman" 188,author="H. T. Kung and Q. Lehman"
189,title="Concurrent Maintenance of Binary Search Trees" 189,title="Concurrent Manipulation of Binary Search Trees"
190,Year="1980" 190,Year="1980"
191,Month="September" 191,Month="September"
192,journal="ACM Transactions on Database Systems" 192,journal="ACM Transactions on Database Systems"
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index cdb20d41a44a..31ef8fe07f82 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -271,15 +271,14 @@ over a rather long period of time, but improvements are always welcome!
271 The same cautions apply to call_rcu_bh() and call_rcu_sched(). 271 The same cautions apply to call_rcu_bh() and call_rcu_sched().
272 272
2739. All RCU list-traversal primitives, which include 2739. All RCU list-traversal primitives, which include
274 rcu_dereference(), list_for_each_entry_rcu(), 274 rcu_dereference(), list_for_each_entry_rcu(), and
275 list_for_each_continue_rcu(), and list_for_each_safe_rcu(), 275 list_for_each_safe_rcu(), must be either within an RCU read-side
276 must be either within an RCU read-side critical section or 276 critical section or must be protected by appropriate update-side
277 must be protected by appropriate update-side locks. RCU 277 locks. RCU read-side critical sections are delimited by
278 read-side critical sections are delimited by rcu_read_lock() 278 rcu_read_lock() and rcu_read_unlock(), or by similar primitives
279 and rcu_read_unlock(), or by similar primitives such as 279 such as rcu_read_lock_bh() and rcu_read_unlock_bh(), in which
280 rcu_read_lock_bh() and rcu_read_unlock_bh(), in which case 280 case the matching rcu_dereference() primitive must be used in
281 the matching rcu_dereference() primitive must be used in order 281 order to keep lockdep happy, in this case, rcu_dereference_bh().
282 to keep lockdep happy, in this case, rcu_dereference_bh().
283 282
284 The reason that it is permissible to use RCU list-traversal 283 The reason that it is permissible to use RCU list-traversal
285 primitives when the update-side lock is held is that doing so 284 primitives when the update-side lock is held is that doing so
diff --git a/Documentation/RCU/listRCU.txt b/Documentation/RCU/listRCU.txt
index 4349c1487e91..adb5a3782846 100644
--- a/Documentation/RCU/listRCU.txt
+++ b/Documentation/RCU/listRCU.txt
@@ -205,7 +205,7 @@ RCU ("read-copy update") its name. The RCU code is as follows:
205 audit_copy_rule(&ne->rule, &e->rule); 205 audit_copy_rule(&ne->rule, &e->rule);
206 ne->rule.action = newaction; 206 ne->rule.action = newaction;
207 ne->rule.file_count = newfield_count; 207 ne->rule.file_count = newfield_count;
208 list_replace_rcu(e, ne); 208 list_replace_rcu(&e->list, &ne->list);
209 call_rcu(&e->rcu, audit_free_rule); 209 call_rcu(&e->rcu, audit_free_rule);
210 return 0; 210 return 0;
211 } 211 }
diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt
index 4202ad093130..141d531aa14b 100644
--- a/Documentation/RCU/rcuref.txt
+++ b/Documentation/RCU/rcuref.txt
@@ -20,7 +20,7 @@ release_referenced() delete()
20{ { 20{ {
21 ... write_lock(&list_lock); 21 ... write_lock(&list_lock);
22 atomic_dec(&el->rc, relfunc) ... 22 atomic_dec(&el->rc, relfunc) ...
23 ... delete_element 23 ... remove_element
24} write_unlock(&list_lock); 24} write_unlock(&list_lock);
25 ... 25 ...
26 if (atomic_dec_and_test(&el->rc)) 26 if (atomic_dec_and_test(&el->rc))
@@ -52,7 +52,7 @@ release_referenced() delete()
52{ { 52{ {
53 ... spin_lock(&list_lock); 53 ... spin_lock(&list_lock);
54 if (atomic_dec_and_test(&el->rc)) ... 54 if (atomic_dec_and_test(&el->rc)) ...
55 call_rcu(&el->head, el_free); delete_element 55 call_rcu(&el->head, el_free); remove_element
56 ... spin_unlock(&list_lock); 56 ... spin_unlock(&list_lock);
57} ... 57} ...
58 if (atomic_dec_and_test(&el->rc)) 58 if (atomic_dec_and_test(&el->rc))
@@ -64,3 +64,60 @@ Sometimes, a reference to the element needs to be obtained in the
64update (write) stream. In such cases, atomic_inc_not_zero() might be 64update (write) stream. In such cases, atomic_inc_not_zero() might be
65overkill, since we hold the update-side spinlock. One might instead 65overkill, since we hold the update-side spinlock. One might instead
66use atomic_inc() in such cases. 66use atomic_inc() in such cases.
67
68It is not always convenient to deal with "FAIL" in the
69search_and_reference() code path. In such cases, the
70atomic_dec_and_test() may be moved from delete() to el_free()
71as follows:
72
731. 2.
74add() search_and_reference()
75{ {
76 alloc_object rcu_read_lock();
77 ... search_for_element
78 atomic_set(&el->rc, 1); atomic_inc(&el->rc);
79 spin_lock(&list_lock); ...
80
81 add_element rcu_read_unlock();
82 ... }
83 spin_unlock(&list_lock); 4.
84} delete()
853. {
86release_referenced() spin_lock(&list_lock);
87{ ...
88 ... remove_element
89 if (atomic_dec_and_test(&el->rc)) spin_unlock(&list_lock);
90 kfree(el); ...
91 ... call_rcu(&el->head, el_free);
92} ...
935. }
94void el_free(struct rcu_head *rhp)
95{
96 release_referenced();
97}
98
99The key point is that the initial reference added by add() is not removed
100until after a grace period has elapsed following removal. This means that
101search_and_reference() cannot find this element, which means that the value
102of el->rc cannot increase. Thus, once it reaches zero, there are no
103readers that can or ever will be able to reference the element. The
104element can therefore safely be freed. This in turn guarantees that if
105any reader finds the element, that reader may safely acquire a reference
106without checking the value of the reference counter.
107
108In cases where delete() can sleep, synchronize_rcu() can be called from
109delete(), so that el_free() can be subsumed into delete as follows:
110
1114.
112delete()
113{
114 spin_lock(&list_lock);
115 ...
116 remove_element
117 spin_unlock(&list_lock);
118 ...
119 synchronize_rcu();
120 if (atomic_dec_and_test(&el->rc))
121 kfree(el);
122 ...
123}
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 672d19083252..c776968f4463 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -10,51 +10,63 @@ for rcutree and next for rcutiny.
10 10
11CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats 11CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
12 12
13These implementations of RCU provides several debugfs files under the 13These implementations of RCU provide several debugfs directories under the
14top-level directory "rcu": 14top-level directory "rcu":
15 15
16rcu/rcudata: 16rcu/rcu_bh
17rcu/rcu_preempt
18rcu/rcu_sched
19
20Each directory contains files for the corresponding flavor of RCU.
21Note that rcu/rcu_preempt is only present for CONFIG_TREE_PREEMPT_RCU.
22For CONFIG_TREE_RCU, the RCU flavor maps onto the RCU-sched flavor,
23so that activity for both appears in rcu/rcu_sched.
24
25In addition, the following file appears in the top-level directory:
26rcu/rcutorture. This file displays rcutorture test progress. The output
27of "cat rcu/rcutorture" looks as follows:
28
29rcutorture test sequence: 0 (test in progress)
30rcutorture update version number: 615
31
32The first line shows the number of rcutorture tests that have completed
33since boot. If a test is currently running, the "(test in progress)"
34string will appear as shown above. The second line shows the number of
35update cycles that the current test has started, or zero if there is
36no test in progress.
37
38
39Within each flavor directory (rcu/rcu_bh, rcu/rcu_sched, and possibly
40also rcu/rcu_preempt) the following files will be present:
41
42rcudata:
17 Displays fields in struct rcu_data. 43 Displays fields in struct rcu_data.
18rcu/rcudata.csv: 44rcuexp:
19 Comma-separated values spreadsheet version of rcudata. 45 Displays statistics for expedited grace periods.
20rcu/rcugp: 46rcugp:
21 Displays grace-period counters. 47 Displays grace-period counters.
22rcu/rcuhier: 48rcuhier:
23 Displays the struct rcu_node hierarchy. 49 Displays the struct rcu_node hierarchy.
24rcu/rcu_pending: 50rcu_pending:
25 Displays counts of the reasons rcu_pending() decided that RCU had 51 Displays counts of the reasons rcu_pending() decided that RCU had
26 work to do. 52 work to do.
27rcu/rcutorture: 53rcuboost:
28 Displays rcutorture test progress.
29rcu/rcuboost:
30 Displays RCU boosting statistics. Only present if 54 Displays RCU boosting statistics. Only present if
31 CONFIG_RCU_BOOST=y. 55 CONFIG_RCU_BOOST=y.
32 56
33The output of "cat rcu/rcudata" looks as follows: 57The output of "cat rcu/rcu_preempt/rcudata" looks as follows:
34 58
35rcu_sched: 59 0!c=30455 g=30456 pq=1 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716
36 0 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=545/1/0 df=50 of=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0 60 1!c=30719 g=30720 pq=1 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982
37 1 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=967/1/0 df=58 of=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0 61 2!c=30150 g=30151 pq=1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458
38 2 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1081/1/0 df=175 of=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0 62 3 c=31249 g=31250 pq=1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622
39 3 c=20942 g=20943 pq=1 pgp=20942 qp=1 dt=1846/0/0 df=404 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0 63 4!c=29502 g=29503 pq=1 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521
40 4 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=369/1/0 df=83 of=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0 64 5 c=31201 g=31202 pq=1 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698
41 5 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=381/1/0 df=64 of=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0 65 6!c=30253 g=30254 pq=1 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353
42 6 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1037/1/0 df=183 of=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0 66 7 c=31178 g=31178 pq=1 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969
43 7 c=20897 g=20897 pq=1 pgp=20896 qp=0 dt=1572/0/0 df=382 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0 67
44rcu_bh: 68This file has one line per CPU, or eight for this 8-CPU system.
45 0 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=545/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0 69The fields are as follows:
46 1 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=967/1/0 df=3 of=0 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0
47 2 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1081/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0
48 3 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1846/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0
49 4 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=369/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0
50 5 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=381/1/0 df=4 of=0 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0
51 6 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1037/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0
52 7 c=1474 g=1474 pq=1 pgp=1473 qp=0 dt=1572/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0
53
54The first section lists the rcu_data structures for rcu_sched, the second
55for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an
56additional section for rcu_preempt. Each section has one line per CPU,
57or eight for this 8-CPU system. The fields are as follows:
58 70
59o The number at the beginning of each line is the CPU number. 71o The number at the beginning of each line is the CPU number.
60 CPU numbers followed by an exclamation mark are offline, 72 CPU numbers followed by an exclamation mark are offline,
@@ -64,11 +76,13 @@ o The number at the beginning of each line is the CPU number.
64 substantially larger than the number of actual CPUs. 76 substantially larger than the number of actual CPUs.
65 77
66o "c" is the count of grace periods that this CPU believes have 78o "c" is the count of grace periods that this CPU believes have
67 completed. Offlined CPUs and CPUs in dynticks idle mode may 79 completed. Offlined CPUs and CPUs in dynticks idle mode may lag
68 lag quite a ways behind, for example, CPU 6 under "rcu_sched" 80 quite a ways behind, for example, CPU 4 under "rcu_sched" above,
69 above, which has been offline through not quite 40,000 RCU grace 81 which has been offline through 16 RCU grace periods. It is not
70 periods. It is not unusual to see CPUs lagging by thousands of 82 unusual to see offline CPUs lagging by thousands of grace periods.
71 grace periods. 83 Note that although the grace-period number is an unsigned long,
84 it is printed out as a signed long to allow more human-friendly
85 representation near boot time.
72 86
73o "g" is the count of grace periods that this CPU believes have 87o "g" is the count of grace periods that this CPU believes have
74 started. Again, offlined CPUs and CPUs in dynticks idle mode 88 started. Again, offlined CPUs and CPUs in dynticks idle mode
@@ -84,30 +98,25 @@ o "pq" indicates that this CPU has passed through a quiescent state
84 CPU has not yet reported that fact, (2) some other CPU has not 98 CPU has not yet reported that fact, (2) some other CPU has not
85 yet reported for this grace period, or (3) both. 99 yet reported for this grace period, or (3) both.
86 100
87o "pgp" indicates which grace period the last-observed quiescent
88 state for this CPU corresponds to. This is important for handling
89 the race between CPU 0 reporting an extended dynticks-idle
90 quiescent state for CPU 1 and CPU 1 suddenly waking up and
91 reporting its own quiescent state. If CPU 1 was the last CPU
92 for the current grace period, then the CPU that loses this race
93 will attempt to incorrectly mark CPU 1 as having checked in for
94 the next grace period!
95
96o "qp" indicates that RCU still expects a quiescent state from 101o "qp" indicates that RCU still expects a quiescent state from
97 this CPU. Offlined CPUs and CPUs in dyntick idle mode might 102 this CPU. Offlined CPUs and CPUs in dyntick idle mode might
98 well have qp=1, which is OK: RCU is still ignoring them. 103 well have qp=1, which is OK: RCU is still ignoring them.
99 104
100o "dt" is the current value of the dyntick counter that is incremented 105o "dt" is the current value of the dyntick counter that is incremented
101 when entering or leaving dynticks idle state, either by the 106 when entering or leaving idle, either due to a context switch or
102 scheduler or by irq. This number is even if the CPU is in 107 due to an interrupt. This number is even if the CPU is in idle
103 dyntick idle mode and odd otherwise. The number after the first 108 from RCU's viewpoint and odd otherwise. The number after the
104 "/" is the interrupt nesting depth when in dyntick-idle state, 109 first "/" is the interrupt nesting depth when in idle state,
105 or one greater than the interrupt-nesting depth otherwise. 110 or a large number added to the interrupt-nesting depth when
106 The number after the second "/" is the NMI nesting depth. 111 running a non-idle task. Some architectures do not accurately
112 count interrupt nesting when running in non-idle kernel context,
113 which can result in interesting anomalies such as negative
114 interrupt-nesting levels. The number after the second "/"
115 is the NMI nesting depth.
107 116
108o "df" is the number of times that some other CPU has forced a 117o "df" is the number of times that some other CPU has forced a
109 quiescent state on behalf of this CPU due to this CPU being in 118 quiescent state on behalf of this CPU due to this CPU being in
110 dynticks-idle state. 119 idle state.
111 120
112o "of" is the number of times that some other CPU has forced a 121o "of" is the number of times that some other CPU has forced a
113 quiescent state on behalf of this CPU due to this CPU being 122 quiescent state on behalf of this CPU due to this CPU being
@@ -120,9 +129,13 @@ o "of" is the number of times that some other CPU has forced a
120 error, so it makes sense to err conservatively. 129 error, so it makes sense to err conservatively.
121 130
122o "ql" is the number of RCU callbacks currently residing on 131o "ql" is the number of RCU callbacks currently residing on
123 this CPU. This is the total number of callbacks, regardless 132 this CPU. The first number is the number of "lazy" callbacks
124 of what state they are in (new, waiting for grace period to 133 that are known to RCU to only be freeing memory, and the number
125 start, waiting for grace period to end, ready to invoke). 134 after the "/" is the total number of callbacks, lazy or not.
135 These counters count callbacks regardless of what phase of
136 grace-period processing that they are in (new, waiting for
137 grace period to start, waiting for grace period to end, ready
138 to invoke).
126 139
127o "qs" gives an indication of the state of the callback queue 140o "qs" gives an indication of the state of the callback queue
128 with four characters: 141 with four characters:
@@ -150,6 +163,43 @@ o "qs" gives an indication of the state of the callback queue
150 If there are no callbacks in a given one of the above states, 163 If there are no callbacks in a given one of the above states,
151 the corresponding character is replaced by ".". 164 the corresponding character is replaced by ".".
152 165
166o "b" is the batch limit for this CPU. If more than this number
167 of RCU callbacks is ready to invoke, then the remainder will
168 be deferred.
169
170o "ci" is the number of RCU callbacks that have been invoked for
171 this CPU. Note that ci+nci+ql is the number of callbacks that have
172 been registered in the absence of CPU-hotplug activity.
173
174o "nci" is the number of RCU callbacks that have been offloaded from
175 this CPU. This will always be zero unless the kernel was built
176 with CONFIG_RCU_NOCB_CPU=y and the "rcu_nocbs=" kernel boot
177 parameter was specified.
178
179o "co" is the number of RCU callbacks that have been orphaned due to
180 this CPU going offline. These orphaned callbacks have been moved
181 to an arbitrarily chosen online CPU.
182
183o "ca" is the number of RCU callbacks that have been adopted by this
184 CPU due to other CPUs going offline. Note that ci+co-ca+ql is
185 the number of RCU callbacks registered on this CPU.
186
187
188Kernels compiled with CONFIG_RCU_BOOST=y display the following from
189/debug/rcu/rcu_preempt/rcudata:
190
191 0!c=12865 g=12866 pq=1 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871
192 1 c=14407 g=14408 pq=1 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485
193 2 c=14407 g=14408 pq=1 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490
194 3 c=14407 g=14408 pq=1 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290
195 4 c=14405 g=14406 pq=1 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114
196 5!c=14168 g=14169 pq=1 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722
197 6 c=14404 g=14405 pq=1 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811
198 7 c=14407 g=14408 pq=1 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042
199
200This is similar to the output discussed above, but contains the following
201additional fields:
202
153o "kt" is the per-CPU kernel-thread state. The digit preceding 203o "kt" is the per-CPU kernel-thread state. The digit preceding
154 the first slash is zero if there is no work pending and 1 204 the first slash is zero if there is no work pending and 1
155 otherwise. The character between the first pair of slashes is 205 otherwise. The character between the first pair of slashes is
@@ -184,35 +234,51 @@ o "ktl" is the low-order 16 bits (in hexadecimal) of the count of
184 234
185 This field is displayed only for CONFIG_RCU_BOOST kernels. 235 This field is displayed only for CONFIG_RCU_BOOST kernels.
186 236
187o "b" is the batch limit for this CPU. If more than this number
188 of RCU callbacks is ready to invoke, then the remainder will
189 be deferred.
190 237
191o "ci" is the number of RCU callbacks that have been invoked for 238The output of "cat rcu/rcu_preempt/rcuexp" looks as follows:
192 this CPU. Note that ci+ql is the number of callbacks that have
193 been registered in absence of CPU-hotplug activity.
194 239
195o "co" is the number of RCU callbacks that have been orphaned due to 240s=21872 d=21872 w=0 tf=0 wd1=0 wd2=0 n=0 sc=21872 dt=21872 dl=0 dx=21872
196 this CPU going offline. These orphaned callbacks have been moved 241
197 to an arbitrarily chosen online CPU. 242These fields are as follows:
243
244o "s" is the starting sequence number.
198 245
199o "ca" is the number of RCU callbacks that have been adopted due to 246o "d" is the ending sequence number. When the starting and ending
200 other CPUs going offline. Note that ci+co-ca+ql is the number of 247 numbers differ, there is an expedited grace period in progress.
201 RCU callbacks registered on this CPU.
202 248
203There is also an rcu/rcudata.csv file with the same information in 249o "w" is the number of times that the sequence numbers have been
204comma-separated-variable spreadsheet format. 250 in danger of wrapping.
205 251
252o "tf" is the number of times that contention has resulted in a
253 failure to begin an expedited grace period.
206 254
207The output of "cat rcu/rcugp" looks as follows: 255o "wd1" and "wd2" are the number of times that an attempt to
256 start an expedited grace period found that someone else had
257 completed an expedited grace period that satisfies the
258 attempted request. "Our work is done."
208 259
209rcu_sched: completed=33062 gpnum=33063 260o "n" is the number of times that contention was so great that
210rcu_bh: completed=464 gpnum=464 261 the request was demoted from an expedited grace period to
262 a normal grace period.
263
264o "sc" is the number of times that the attempt to start a
265 new expedited grace period succeeded.
266
267o "dt" is the number of times that we attempted to update
268 the "d" counter.
269
270o "dl" is the number of times that we failed to update the "d"
271 counter.
272
273o "dx" is the number of times that we succeeded in updating
274 the "d" counter.
211 275
212Again, this output is for both "rcu_sched" and "rcu_bh". Note that 276
213kernels built with CONFIG_TREE_PREEMPT_RCU will have an additional 277The output of "cat rcu/rcu_preempt/rcugp" looks as follows:
214"rcu_preempt" line. The fields are taken from the rcu_state structure, 278
215and are as follows: 279completed=31249 gpnum=31250 age=1 max=18
280
281These fields are taken from the rcu_state structure, and are as follows:
216 282
217o "completed" is the number of grace periods that have completed. 283o "completed" is the number of grace periods that have completed.
218 It is comparable to the "c" field from rcu/rcudata in that a 284 It is comparable to the "c" field from rcu/rcudata in that a
@@ -220,44 +286,42 @@ o "completed" is the number of grace periods that have completed.
220 that the corresponding RCU grace period has completed. 286 that the corresponding RCU grace period has completed.
221 287
222o "gpnum" is the number of grace periods that have started. It is 288o "gpnum" is the number of grace periods that have started. It is
223 comparable to the "g" field from rcu/rcudata in that a CPU 289 similarly comparable to the "g" field from rcu/rcudata in that
224 whose "g" field matches the value of "gpnum" is aware that the 290 a CPU whose "g" field matches the value of "gpnum" is aware that
225 corresponding RCU grace period has started. 291 the corresponding RCU grace period has started.
292
293 If these two fields are equal, then there is no grace period
294 in progress, in other words, RCU is idle. On the other hand,
295 if the two fields differ (as they do above), then an RCU grace
296 period is in progress.
226 297
227 If these two fields are equal (as they are for "rcu_bh" above), 298o "age" is the number of jiffies that the current grace period
228 then there is no grace period in progress, in other words, RCU 299 has extended for, or zero if there is no grace period currently
229 is idle. On the other hand, if the two fields differ (as they 300 in effect.
230 do for "rcu_sched" above), then an RCU grace period is in progress.
231 301
302o "max" is the age in jiffies of the longest-duration grace period
303 thus far.
232 304
233The output of "cat rcu/rcuhier" looks as follows, with very long lines: 305The output of "cat rcu/rcu_preempt/rcuhier" looks as follows:
234 306
235c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 307c=14407 g=14408 s=0 jfq=2 j=c863 nfqs=12040/nfqsng=0(12040) fqlh=1051 oqlen=0/0
2361/1 ..>. 0:127 ^0 3083/3 ..>. 0:7 ^0
2373/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3 309e/e ..>. 0:3 ^0 d/d ..>. 4:7 ^1
2383/3f ..>. 0:5 ^0 2/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3
239rcu_bh:
240c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
2410/1 ..>. 0:127 ^0
2420/3 ..>. 0:35 ^0 0/0 ..>. 36:71 ^1 0/0 ..>. 72:107 ^2 0/0 ..>. 108:127 ^3
2430/3f ..>. 0:5 ^0 0/3 ..>. 6:11 ^1 0/0 ..>. 12:17 ^2 0/0 ..>. 18:23 ^3 0/0 ..>. 24:29 ^4 0/0 ..>. 30:35 ^5 0/0 ..>. 36:41 ^0 0/0 ..>. 42:47 ^1 0/0 ..>. 48:53 ^2 0/0 ..>. 54:59 ^3 0/0 ..>. 60:65 ^4 0/0 ..>. 66:71 ^5 0/0 ..>. 72:77 ^0 0/0 ..>. 78:83 ^1 0/0 ..>. 84:89 ^2 0/0 ..>. 90:95 ^3 0/0 ..>. 96:101 ^4 0/0 ..>. 102:107 ^5 0/0 ..>. 108:113 ^0 0/0 ..>. 114:119 ^1 0/0 ..>. 120:125 ^2 0/0 ..>. 126:127 ^3
244 310
245This is once again split into "rcu_sched" and "rcu_bh" portions, 311The fields are as follows:
246and CONFIG_TREE_PREEMPT_RCU kernels will again have an additional
247"rcu_preempt" section. The fields are as follows:
248 312
249o "c" is exactly the same as "completed" under rcu/rcugp. 313o "c" is exactly the same as "completed" under rcu/rcu_preempt/rcugp.
250 314
251o "g" is exactly the same as "gpnum" under rcu/rcugp. 315o "g" is exactly the same as "gpnum" under rcu/rcu_preempt/rcugp.
252 316
253o "s" is the "signaled" state that drives force_quiescent_state()'s 317o "s" is the current state of the force_quiescent_state()
254 state machine. 318 state machine.
255 319
256o "jfq" is the number of jiffies remaining for this grace period 320o "jfq" is the number of jiffies remaining for this grace period
257 before force_quiescent_state() is invoked to help push things 321 before force_quiescent_state() is invoked to help push things
258 along. Note that CPUs in dyntick-idle mode throughout the grace 322 along. Note that CPUs in idle mode throughout the grace period
259 period will not report on their own, but rather must be checked by 323 will not report on their own, but rather must be checked by some
260 some other CPU via force_quiescent_state(). 324 other CPU via force_quiescent_state().
261 325
262o "j" is the low-order four hex digits of the jiffies counter. 326o "j" is the low-order four hex digits of the jiffies counter.
263 Yes, Paul did run into a number of problems that turned out to 327 Yes, Paul did run into a number of problems that turned out to
@@ -268,7 +332,8 @@ o "nfqs" is the number of calls to force_quiescent_state() since
268 332
269o "nfqsng" is the number of useless calls to force_quiescent_state(), 333o "nfqsng" is the number of useless calls to force_quiescent_state(),
270 where there wasn't actually a grace period active. This can 334 where there wasn't actually a grace period active. This can
271 happen due to races. The number in parentheses is the difference 335 no longer happen due to grace-period processing being pushed
336 into a kthread. The number in parentheses is the difference
272 between "nfqs" and "nfqsng", or the number of times that 337 between "nfqs" and "nfqsng", or the number of times that
273 force_quiescent_state() actually did some real work. 338 force_quiescent_state() actually did some real work.
274 339
@@ -276,28 +341,27 @@ o "fqlh" is the number of calls to force_quiescent_state() that
276 exited immediately (without even being counted in nfqs above) 341 exited immediately (without even being counted in nfqs above)
277 due to contention on ->fqslock. 342 due to contention on ->fqslock.
278 343
279o Each element of the form "1/1 0:127 ^0" represents one struct 344o Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node
280 rcu_node. Each line represents one level of the hierarchy, from 345 structure. Each line represents one level of the hierarchy,
281 root to leaves. It is best to think of the rcu_data structures 346 from root to leaves. It is best to think of the rcu_data
282 as forming yet another level after the leaves. Note that there 347 structures as forming yet another level after the leaves.
283 might be either one, two, or three levels of rcu_node structures, 348 Note that there might be either one, two, three, or even four
284 depending on the relationship between CONFIG_RCU_FANOUT and 349 levels of rcu_node structures, depending on the relationship
285 CONFIG_NR_CPUS. 350 between CONFIG_RCU_FANOUT, CONFIG_RCU_FANOUT_LEAF (possibly
351 adjusted using the rcu_fanout_leaf kernel boot parameter), and
352 CONFIG_NR_CPUS (possibly adjusted using the nr_cpu_ids count of
353 possible CPUs for the booting hardware).
286 354
287 o The numbers separated by the "/" are the qsmask followed 355 o The numbers separated by the "/" are the qsmask followed
288 by the qsmaskinit. The qsmask will have one bit 356 by the qsmaskinit. The qsmask will have one bit
289 set for each entity in the next lower level that 357 set for each entity in the next lower level that has
290 has not yet checked in for the current grace period. 358 not yet checked in for the current grace period ("e"
359 indicating CPUs 5, 6, and 7 in the example above).
291 The qsmaskinit will have one bit for each entity that is 360 The qsmaskinit will have one bit for each entity that is
292 currently expected to check in during each grace period. 361 currently expected to check in during each grace period.
293 The value of qsmaskinit is assigned to that of qsmask 362 The value of qsmaskinit is assigned to that of qsmask
294 at the beginning of each grace period. 363 at the beginning of each grace period.
295 364
296 For example, for "rcu_sched", the qsmask of the first
297 entry of the lowest level is 0x14, meaning that we
298 are still waiting for CPUs 2 and 4 to check in for the
299 current grace period.
300
301 o The characters separated by the ">" indicate the state 365 o The characters separated by the ">" indicate the state
302 of the blocked-tasks lists. A "G" preceding the ">" 366 of the blocked-tasks lists. A "G" preceding the ">"
303 indicates that at least one task blocked in an RCU 367 indicates that at least one task blocked in an RCU
@@ -312,48 +376,39 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
312 A "." character appears if the corresponding condition 376 A "." character appears if the corresponding condition
313 does not hold, so that "..>." indicates that no tasks 377 does not hold, so that "..>." indicates that no tasks
314 are blocked. In contrast, "GE>T" indicates maximal 378 are blocked. In contrast, "GE>T" indicates maximal
315 inconvenience from blocked tasks. 379 inconvenience from blocked tasks. CONFIG_TREE_RCU
380 builds of the kernel will always show "..>.".
316 381
317 o The numbers separated by the ":" are the range of CPUs 382 o The numbers separated by the ":" are the range of CPUs
318 served by this struct rcu_node. This can be helpful 383 served by this struct rcu_node. This can be helpful
319 in working out how the hierarchy is wired together. 384 in working out how the hierarchy is wired together.
320 385
321 For example, the first entry at the lowest level shows 386 For example, the example rcu_node structure shown above
322 "0:5", indicating that it covers CPUs 0 through 5. 387 has "0:7", indicating that it covers CPUs 0 through 7.
323 388
324 o The number after the "^" indicates the bit in the 389 o The number after the "^" indicates the bit in the
325 next higher level rcu_node structure that this 390 next higher level rcu_node structure that this rcu_node
326 rcu_node structure corresponds to. 391 structure corresponds to. For example, the "d/d ..>. 4:7
327 392 ^1" has a "1" in this position, indicating that it
328 For example, the first entry at the lowest level shows 393 corresponds to the "1" bit in the "3" shown in the
329 "^0", indicating that it corresponds to bit zero in 394 "3/3 ..>. 0:7 ^0" entry on the next level up.
330 the first entry at the middle level. 395
331 396
332 397The output of "cat rcu/rcu_sched/rcu_pending" looks as follows:
333The output of "cat rcu/rcu_pending" looks as follows: 398
334 399 0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903
335rcu_sched: 400 1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113
336 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nn=146741 401 2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889
337 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nn=155792 402 3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469
338 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nn=136629 403 4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042
339 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nn=137723 404 5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422
340 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nn=123110 405 6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699
341 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nn=137456 406 7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147
342 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nn=120834 407
343 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nn=144888 408The fields are as follows:
344rcu_bh: 409
345 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nn=145314 410o The leading number is the CPU number, with "!" indicating
346 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nn=143180 411 an offline CPU.
347 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nn=117936
348 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nn=134863
349 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nn=110671
350 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nn=133235
351 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nn=110921
352 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nn=118542
353
354As always, this is once again split into "rcu_sched" and "rcu_bh"
355portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
356"rcu_preempt" section. The fields are as follows:
357 412
358o "np" is the number of times that __rcu_pending() has been invoked 413o "np" is the number of times that __rcu_pending() has been invoked
359 for the corresponding flavor of RCU. 414 for the corresponding flavor of RCU.
@@ -377,38 +432,23 @@ o "gpc" is the number of times that an old grace period had
377o "gps" is the number of times that a new grace period had started, 432o "gps" is the number of times that a new grace period had started,
378 but this CPU was not yet aware of it. 433 but this CPU was not yet aware of it.
379 434
380o "nn" is the number of times that this CPU needed nothing. Alert 435o "nn" is the number of times that this CPU needed nothing.
381 readers will note that the rcu "nn" number for a given CPU very
382 closely matches the rcu_bh "np" number for that same CPU. This
383 is due to short-circuit evaluation in rcu_pending().
384
385
386The output of "cat rcu/rcutorture" looks as follows:
387
388rcutorture test sequence: 0 (test in progress)
389rcutorture update version number: 615
390
391The first line shows the number of rcutorture tests that have completed
392since boot. If a test is currently running, the "(test in progress)"
393string will appear as shown above. The second line shows the number of
394update cycles that the current test has started, or zero if there is
395no test in progress.
396 436
397 437
398The output of "cat rcu/rcuboost" looks as follows: 438The output of "cat rcu/rcuboost" looks as follows:
399 439
4000:5 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f 4400:3 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
401 balk: nt=0 egt=989 bt=0 nb=0 ny=0 nos=16 441 balk: nt=0 egt=4695 bt=0 nb=0 ny=56 nos=0
4026:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=2f95 bt=300f 4424:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
403 balk: nt=0 egt=225 bt=0 nb=0 ny=0 nos=6 443 balk: nt=0 egt=6541 bt=0 nb=0 ny=126 nos=0
404 444
405This information is output only for rcu_preempt. Each two-line entry 445This information is output only for rcu_preempt. Each two-line entry
406corresponds to a leaf rcu_node structure. The fields are as follows: 446corresponds to a leaf rcu_node structure. The fields are as follows:
407 447
408o "n:m" is the CPU-number range for the corresponding two-line 448o "n:m" is the CPU-number range for the corresponding two-line
409 entry. In the sample output above, the first entry covers 449 entry. In the sample output above, the first entry covers
410 CPUs zero through five and the second entry covers CPUs 6 450 CPUs zero through three and the second entry covers CPUs four
411 and 7. 451 through seven.
412 452
413o "tasks=TNEB" gives the state of the various segments of the 453o "tasks=TNEB" gives the state of the various segments of the
414 rnp->blocked_tasks list: 454 rnp->blocked_tasks list:
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index bf0f6de2aa00..0cc7820967f4 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -499,6 +499,8 @@ The foo_reclaim() function might appear as follows:
499 { 499 {
500 struct foo *fp = container_of(rp, struct foo, rcu); 500 struct foo *fp = container_of(rp, struct foo, rcu);
501 501
502 foo_cleanup(fp->a);
503
502 kfree(fp); 504 kfree(fp);
503 } 505 }
504 506
@@ -521,6 +523,12 @@ o Use call_rcu() -after- removing a data element from an
521 read-side critical sections that might be referencing that 523 read-side critical sections that might be referencing that
522 data item. 524 data item.
523 525
526If the callback for call_rcu() is not doing anything more than calling
527kfree() on the structure, you can use kfree_rcu() instead of call_rcu()
528to avoid having to write your own callback:
529
530 kfree_rcu(old_fp, rcu);
531
524Again, see checklist.txt for additional rules governing the use of RCU. 532Again, see checklist.txt for additional rules governing the use of RCU.
525 533
526 534
@@ -773,8 +781,8 @@ a single atomic update, converting to RCU will require special care.
773 781
774Also, the presence of synchronize_rcu() means that the RCU version of 782Also, the presence of synchronize_rcu() means that the RCU version of
775delete() can now block. If this is a problem, there is a callback-based 783delete() can now block. If this is a problem, there is a callback-based
776mechanism that never blocks, namely call_rcu(), that can be used in 784mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can
777place of synchronize_rcu(). 785be used in place of synchronize_rcu().
778 786
779 787
7807. FULL LIST OF RCU APIs 7887. FULL LIST OF RCU APIs
@@ -789,9 +797,7 @@ RCU list traversal:
789 list_for_each_entry_rcu 797 list_for_each_entry_rcu
790 hlist_for_each_entry_rcu 798 hlist_for_each_entry_rcu
791 hlist_nulls_for_each_entry_rcu 799 hlist_nulls_for_each_entry_rcu
792 800 list_for_each_entry_continue_rcu
793 list_for_each_continue_rcu (to be deprecated in favor of new
794 list_for_each_entry_continue_rcu)
795 801
796RCU pointer/list update: 802RCU pointer/list update:
797 803
@@ -813,6 +819,7 @@ RCU: Critical sections Grace period Barrier
813 rcu_read_unlock synchronize_rcu 819 rcu_read_unlock synchronize_rcu
814 rcu_dereference synchronize_rcu_expedited 820 rcu_dereference synchronize_rcu_expedited
815 call_rcu 821 call_rcu
822 kfree_rcu
816 823
817 824
818bh: Critical sections Grace period Barrier 825bh: Critical sections Grace period Barrier