commit:    58d30c36d472b75e8e9962d6a640be19d9389128
tree:      ce161b15e844d081f527f02a4f74ffd1171b2b14
parent:    94836ecf1e7378b64d37624fbb81fe48fbd4c772
parent:    f2094107ac82bf867184efd77cee30b6a98e2e20
author:    Ingo Molnar <mingo@kernel.org>    2017-04-23 05:12:44 -0400
committer: Ingo Molnar <mingo@kernel.org>    2017-04-23 05:12:44 -0400

Merge branch 'for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/rcu

Pull RCU updates from Paul E. McKenney:

 - Documentation updates.

 - Miscellaneous fixes.

 - Parallelize SRCU callback handling (plus overlapping patches).

Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  Documentation/RCU/00-INDEX | 2
-rw-r--r--  Documentation/RCU/Design/Data-Structures/Data-Structures.html | 233
-rw-r--r--  Documentation/RCU/Design/Data-Structures/nxtlist.svg | 34
-rw-r--r--  Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html | 47
-rw-r--r--  Documentation/RCU/Design/Requirements/Requirements.html | 195
-rw-r--r--  Documentation/RCU/rcu_dereference.txt | 9
-rw-r--r--  Documentation/RCU/rculist_nulls.txt | 6
-rw-r--r--  Documentation/RCU/stallwarn.txt | 190
-rw-r--r--  Documentation/RCU/whatisRCU.txt | 32
-rw-r--r--  Documentation/memory-barriers.txt | 2
-rw-r--r--  arch/Kconfig | 3
-rw-r--r--  arch/powerpc/Kconfig | 1
-rw-r--r--  drivers/gpu/drm/i915/i915_gem.c | 2
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_request.h | 2
-rw-r--r--  drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c | 2
-rw-r--r--  fs/jbd2/journal.c | 2
-rw-r--r--  fs/signalfd.c | 2
-rw-r--r--  include/linux/dma-fence.h | 4
-rw-r--r--  include/linux/kvm_host.h | 4
-rw-r--r--  include/linux/rcu_node_tree.h | 99
-rw-r--r--  include/linux/rcu_segcblist.h | 712
-rw-r--r--  include/linux/rculist.h | 3
-rw-r--r--  include/linux/rcupdate.h | 17
-rw-r--r--  include/linux/rcutiny.h | 24
-rw-r--r--  include/linux/rcutree.h | 5
-rw-r--r--  include/linux/slab.h | 6
-rw-r--r--  include/linux/srcu.h | 84
-rw-r--r--  include/linux/srcuclassic.h | 101
-rw-r--r--  include/linux/srcutiny.h | 81
-rw-r--r--  include/linux/srcutree.h | 139
-rw-r--r--  include/linux/types.h | 2
-rw-r--r--  include/net/sock.h | 2
-rw-r--r--  init/Kconfig | 39
-rw-r--r--  kernel/fork.c | 4
-rw-r--r--  kernel/locking/lockdep.c | 86
-rw-r--r--  kernel/locking/rtmutex-debug.c | 9
-rw-r--r--  kernel/rcu/Makefile | 4
-rw-r--r--  kernel/rcu/rcu.h | 153
-rw-r--r--  kernel/rcu/rcutorture.c | 35
-rw-r--r--  kernel/rcu/srcu.c | 12
-rw-r--r--  kernel/rcu/srcutiny.c | 215
-rw-r--r--  kernel/rcu/srcutree.c | 996
-rw-r--r--  kernel/rcu/tiny.c | 20
-rw-r--r--  kernel/rcu/tiny_plugin.h | 13
-rw-r--r--  kernel/rcu/tree.c | 696
-rw-r--r--  kernel/rcu/tree.h | 164
-rw-r--r--  kernel/rcu/tree_exp.h | 25
-rw-r--r--  kernel/rcu/tree_plugin.h | 64
-rw-r--r--  kernel/rcu/tree_trace.c | 26
-rw-r--r--  kernel/rcu/update.c | 53
-rw-r--r--  kernel/sched/core.c | 2
-rw-r--r--  kernel/signal.c | 2
-rw-r--r--  mm/kasan/kasan.c | 6
-rw-r--r--  mm/kmemcheck.c | 2
-rw-r--r--  mm/mmu_notifier.c | 14
-rw-r--r--  mm/rmap.c | 4
-rw-r--r--  mm/slab.c | 6
-rw-r--r--  mm/slab.h | 4
-rw-r--r--  mm/slab_common.c | 6
-rw-r--r--  mm/slob.c | 6
-rw-r--r--  mm/slub.c | 12
-rw-r--r--  net/dccp/ipv4.c | 2
-rw-r--r--  net/dccp/ipv6.c | 2
-rw-r--r--  net/ipv4/tcp_ipv4.c | 2
-rw-r--r--  net/ipv6/tcp_ipv6.c | 2
-rw-r--r--  net/llc/af_llc.c | 2
-rw-r--r--  net/llc/llc_conn.c | 4
-rw-r--r--  net/llc/llc_sap.c | 2
-rw-r--r--  net/netfilter/nf_conntrack_core.c | 8
-rw-r--r--  net/smc/af_smc.c | 2
-rwxr-xr-x  tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2
71 files changed, 3637 insertions(+), 1116 deletions(-)
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index f773a264ae02..1672573b037a 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -17,7 +17,7 @@ rcu_dereference.txt
17rcubarrier.txt 17rcubarrier.txt
18 - RCU and Unloadable Modules 18 - RCU and Unloadable Modules
19rculist_nulls.txt 19rculist_nulls.txt
20 - RCU list primitives for use with SLAB_DESTROY_BY_RCU 20 - RCU list primitives for use with SLAB_TYPESAFE_BY_RCU
21rcuref.txt 21rcuref.txt
22 - Reference-count design for elements of lists/arrays protected by RCU 22 - Reference-count design for elements of lists/arrays protected by RCU
23rcu.txt 23rcu.txt
diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
index d583c653a703..38d6d800761f 100644
--- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
@@ -19,6 +19,8 @@ to each other.
19 The <tt>rcu_state</tt> Structure</a> 19 The <tt>rcu_state</tt> Structure</a>
20<li> <a href="#The rcu_node Structure"> 20<li> <a href="#The rcu_node Structure">
21 The <tt>rcu_node</tt> Structure</a> 21 The <tt>rcu_node</tt> Structure</a>
22<li> <a href="#The rcu_segcblist Structure">
23 The <tt>rcu_segcblist</tt> Structure</a>
22<li> <a href="#The rcu_data Structure"> 24<li> <a href="#The rcu_data Structure">
23 The <tt>rcu_data</tt> Structure</a> 25 The <tt>rcu_data</tt> Structure</a>
24<li> <a href="#The rcu_dynticks Structure"> 26<li> <a href="#The rcu_dynticks Structure">
@@ -841,6 +843,134 @@ for lockdep lock-class names.
841Finally, lines&nbsp;64-66 produce an error if the maximum number of 843Finally, lines&nbsp;64-66 produce an error if the maximum number of
842CPUs is too large for the specified fanout. 844CPUs is too large for the specified fanout.
843 845
846<h3><a name="The rcu_segcblist Structure">
847The <tt>rcu_segcblist</tt> Structure</a></h3>
848
849The <tt>rcu_segcblist</tt> structure maintains a segmented list of
850callbacks as follows:
851
852<pre>
853 1 #define RCU_DONE_TAIL 0
854 2 #define RCU_WAIT_TAIL 1
855 3 #define RCU_NEXT_READY_TAIL 2
856 4 #define RCU_NEXT_TAIL 3
857 5 #define RCU_CBLIST_NSEGS 4
858 6
859 7 struct rcu_segcblist {
860 8 struct rcu_head *head;
861 9 struct rcu_head **tails[RCU_CBLIST_NSEGS];
86210 unsigned long gp_seq[RCU_CBLIST_NSEGS];
86311 long len;
86412 long len_lazy;
86513 };
866</pre>
867
868<p>
869The segments are as follows:
870
871<ol>
872<li> <tt>RCU_DONE_TAIL</tt>: Callbacks whose grace periods have elapsed.
873 These callbacks are ready to be invoked.
874<li> <tt>RCU_WAIT_TAIL</tt>: Callbacks that are waiting for the
875 current grace period.
876 Note that different CPUs can have different ideas about which
877 grace period is current, hence the <tt>-&gt;gp_seq</tt> field.
878<li> <tt>RCU_NEXT_READY_TAIL</tt>: Callbacks waiting for the next
879 grace period to start.
880<li> <tt>RCU_NEXT_TAIL</tt>: Callbacks that have not yet been
881 associated with a grace period.
882</ol>
883
884<p>
885The <tt>-&gt;head</tt> pointer references the first callback or
886is <tt>NULL</tt> if the list contains no callbacks (which is
887<i>not</i> the same as being empty).
888Each element of the <tt>-&gt;tails[]</tt> array references the
889<tt>-&gt;next</tt> pointer of the last callback in the corresponding
890segment of the list, or the list's <tt>-&gt;head</tt> pointer if
891that segment and all previous segments are empty.
892If the corresponding segment is empty but some previous segment is
893not empty, then the array element is identical to its predecessor.
894Older callbacks are closer to the head of the list, and new callbacks
895are added at the tail.
896This relationship between the <tt>-&gt;head</tt> pointer, the
897<tt>-&gt;tails[]</tt> array, and the callbacks is shown in this
898diagram:
899
900</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
901
902</p><p>In this figure, the <tt>-&gt;head</tt> pointer references the
903first
904RCU callback in the list.
905The <tt>-&gt;tails[RCU_DONE_TAIL]</tt> array element references
906the <tt>-&gt;head</tt> pointer itself, indicating that none
907of the callbacks is ready to invoke.
908The <tt>-&gt;tails[RCU_WAIT_TAIL]</tt> array element references callback
909CB&nbsp;2's <tt>-&gt;next</tt> pointer, which indicates that
910CB&nbsp;1 and CB&nbsp;2 are both waiting on the current grace period,
911give or take possible disagreements about exactly which grace period
912is the current one.
913The <tt>-&gt;tails[RCU_NEXT_READY_TAIL]</tt> array element
914references the same RCU callback that <tt>-&gt;tails[RCU_WAIT_TAIL]</tt>
915does, which indicates that there are no callbacks waiting on the next
916RCU grace period.
917The <tt>-&gt;tails[RCU_NEXT_TAIL]</tt> array element references
918CB&nbsp;4's <tt>-&gt;next</tt> pointer, indicating that all the
919remaining RCU callbacks have not yet been assigned to an RCU grace
920period.
921Note that the <tt>-&gt;tails[RCU_NEXT_TAIL]</tt> array element
922always references the last RCU callback's <tt>-&gt;next</tt> pointer
923unless the callback list is empty, in which case it references
924the <tt>-&gt;head</tt> pointer.
925
926<p>
927There is one additional important special case for the
928<tt>-&gt;tails[RCU_NEXT_TAIL]</tt> array element: It can be <tt>NULL</tt>
929when this list is <i>disabled</i>.
930Lists are disabled when the corresponding CPU is offline or when
931the corresponding CPU's callbacks are offloaded to a kthread,
932both of which are described elsewhere.
933
934</p><p>CPUs advance their callbacks from the
935<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
936<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
937as grace periods advance.
938
939</p><p>The <tt>-&gt;gp_seq[]</tt> array records grace-period
940numbers corresponding to the list segments.
941This is what allows different CPUs to have different ideas as to
942which is the current grace period while still avoiding premature
943invocation of their callbacks.
944In particular, this allows CPUs that go idle for extended periods
945to determine which of their callbacks are ready to be invoked after
946reawakening.
947
948</p><p>The <tt>-&gt;len</tt> counter contains the number of
949callbacks in <tt>-&gt;head</tt>, and the
950<tt>-&gt;len_lazy</tt> contains the number of those callbacks that
951are known to only free memory, and whose invocation can therefore
952be safely deferred.
953
954<p><b>Important note</b>: It is the <tt>-&gt;len</tt> field that
955determines whether or not there are callbacks associated with
956this <tt>rcu_segcblist</tt> structure, <i>not</i> the <tt>-&gt;head</tt>
957pointer.
958The reason for this is that all the ready-to-invoke callbacks
959(that is, those in the <tt>RCU_DONE_TAIL</tt> segment) are extracted
960all at once at callback-invocation time.
961If callback invocation must be postponed, for example, because a
962high-priority process just woke up on this CPU, then the remaining
963callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment.
964Either way, the <tt>-&gt;len</tt> and <tt>-&gt;len_lazy</tt> counts
965are adjusted after the corresponding callbacks have been invoked, and so
966again it is the <tt>-&gt;len</tt> count that accurately reflects whether
967or not there are callbacks associated with this <tt>rcu_segcblist</tt>
968structure.
969Of course, off-CPU sampling of the <tt>-&gt;len</tt> count requires
970the use of appropriate synchronization, for example, memory barriers.
971This synchronization can be a bit subtle, particularly in the case
972of <tt>rcu_barrier()</tt>.
973
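
As an illustrative aside rather than part of this patch, the ->head/->tails[] invariants described above can be exercised in a small stand-alone C sketch of a segmented callback list. Every name below is invented for illustration; this is not the kernel's rcu_segcblist API, and the grace-period bookkeeping is deliberately simplified.

#include <stddef.h>
#include <stdio.h>

#define SEG_DONE_TAIL       0  /* callbacks whose grace period has elapsed */
#define SEG_WAIT_TAIL       1  /* waiting for the current grace period     */
#define SEG_NEXT_READY_TAIL 2  /* waiting for the next grace period        */
#define SEG_NEXT_TAIL       3  /* not yet associated with a grace period   */
#define SEG_NSEGS           4

struct cb {                    /* stand-in for struct rcu_head */
	struct cb *next;
	int id;
};

struct seg_cblist {
	struct cb *head;
	struct cb **tails[SEG_NSEGS];
	unsigned long gp_seq[SEG_NSEGS];
	long len;
};

static void seg_cblist_init(struct seg_cblist *l)
{
	int i;

	l->head = NULL;
	l->len = 0;
	for (i = 0; i < SEG_NSEGS; i++) {
		l->tails[i] = &l->head;  /* empty segments reference ->head */
		l->gp_seq[i] = 0;
	}
}

/* New callbacks always enter the not-yet-assigned SEG_NEXT_TAIL segment. */
static void seg_cblist_enqueue(struct seg_cblist *l, struct cb *c)
{
	c->next = NULL;
	*l->tails[SEG_NEXT_TAIL] = c;
	l->tails[SEG_NEXT_TAIL] = &c->next;
	l->len++;
}

/*
 * Simplified advance step: once grace period "gp" has elapsed, slide the
 * done tail forward over every non-empty segment whose recorded grace
 * period is not after "gp".  (The kernel also re-points the emptied
 * segments; that detail is omitted here.)
 */
static void seg_cblist_advance(struct seg_cblist *l, unsigned long gp)
{
	int i;

	for (i = SEG_WAIT_TAIL; i < SEG_NEXT_TAIL; i++) {
		if (l->tails[i] == l->tails[SEG_DONE_TAIL])
			continue;                 /* segment is empty  */
		if (l->gp_seq[i] > gp)
			break;                    /* still waiting     */
		l->tails[SEG_DONE_TAIL] = l->tails[i];
	}
}

int main(void)
{
	struct seg_cblist l;
	struct cb c1 = { .next = NULL, .id = 1 };
	struct cb c2 = { .next = NULL, .id = 2 };

	seg_cblist_init(&l);
	seg_cblist_enqueue(&l, &c1);
	seg_cblist_enqueue(&l, &c2);
	seg_cblist_advance(&l, 1);
	printf("%ld callbacks queued, first id %d\n", l.len, l.head->id);
	return 0;
}
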
844<h3><a name="The rcu_data Structure"> 974<h3><a name="The rcu_data Structure">
845The <tt>rcu_data</tt> Structure</a></h3> 975The <tt>rcu_data</tt> Structure</a></h3>
846 976
@@ -983,62 +1113,18 @@ choice.
983as follows: 1113as follows:
984 1114
985<pre> 1115<pre>
986 1 struct rcu_head *nxtlist; 1116 1 struct rcu_segcblist cblist;
987 2 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 1117 2 long qlen_last_fqs_check;
988 3 unsigned long nxtcompleted[RCU_NEXT_SIZE]; 1118 3 unsigned long n_cbs_invoked;
989 4 long qlen_lazy; 1119 4 unsigned long n_nocbs_invoked;
990 5 long qlen; 1120 5 unsigned long n_cbs_orphaned;
991 6 long qlen_last_fqs_check; 1121 6 unsigned long n_cbs_adopted;
992 7 unsigned long n_force_qs_snap; 1122 7 unsigned long n_force_qs_snap;
993 8 unsigned long n_cbs_invoked; 1123 8 long blimit;
994 9 unsigned long n_cbs_orphaned;
99510 unsigned long n_cbs_adopted;
99611 long blimit;
997</pre> 1124</pre>
998 1125
999<p>The <tt>-&gt;nxtlist</tt> pointer and the 1126<p>The <tt>-&gt;cblist</tt> structure is the segmented callback list
1000<tt>-&gt;nxttail[]</tt> array form a four-segment list with 1127described earlier.
1001older callbacks near the head and newer ones near the tail.
1002Each segment contains callbacks with the corresponding relationship
1003to the current grace period.
1004The pointer out of the end of each of the four segments is referenced
1005by the element of the <tt>-&gt;nxttail[]</tt> array indexed by
1006<tt>RCU_DONE_TAIL</tt> (for callbacks handled by a prior grace period),
1007<tt>RCU_WAIT_TAIL</tt> (for callbacks waiting on the current grace period),
1008<tt>RCU_NEXT_READY_TAIL</tt> (for callbacks that will wait on the next
1009grace period), and
1010<tt>RCU_NEXT_TAIL</tt> (for callbacks that are not yet associated
1011with a specific grace period)
1012respectively, as shown in the following figure.
1013
1014</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
1015
1016</p><p>In this figure, the <tt>-&gt;nxtlist</tt> pointer references the
1017first
1018RCU callback in the list.
1019The <tt>-&gt;nxttail[RCU_DONE_TAIL]</tt> array element references
1020the <tt>-&gt;nxtlist</tt> pointer itself, indicating that none
1021of the callbacks is ready to invoke.
1022The <tt>-&gt;nxttail[RCU_WAIT_TAIL]</tt> array element references callback
1023CB&nbsp;2's <tt>-&gt;next</tt> pointer, which indicates that
1024CB&nbsp;1 and CB&nbsp;2 are both waiting on the current grace period.
1025The <tt>-&gt;nxttail[RCU_NEXT_READY_TAIL]</tt> array element
1026references the same RCU callback that <tt>-&gt;nxttail[RCU_WAIT_TAIL]</tt>
1027does, which indicates that there are no callbacks waiting on the next
1028RCU grace period.
1029The <tt>-&gt;nxttail[RCU_NEXT_TAIL]</tt> array element references
1030CB&nbsp;4's <tt>-&gt;next</tt> pointer, indicating that all the
1031remaining RCU callbacks have not yet been assigned to an RCU grace
1032period.
1033Note that the <tt>-&gt;nxttail[RCU_NEXT_TAIL]</tt> array element
1034always references the last RCU callback's <tt>-&gt;next</tt> pointer
1035unless the callback list is empty, in which case it references
1036the <tt>-&gt;nxtlist</tt> pointer.
1037
1038</p><p>CPUs advance their callbacks from the
1039<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
1040<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
1041as grace periods advance.
1042The CPU advances the callbacks in its <tt>rcu_data</tt> structure 1128The CPU advances the callbacks in its <tt>rcu_data</tt> structure
1043whenever it notices that another RCU grace period has completed. 1129whenever it notices that another RCU grace period has completed.
1044The CPU detects the completion of an RCU grace period by noticing 1130The CPU detects the completion of an RCU grace period by noticing
@@ -1049,16 +1135,7 @@ Recall that each <tt>rcu_node</tt> structure's
1049<tt>-&gt;completed</tt> field is updated at the end of each 1135<tt>-&gt;completed</tt> field is updated at the end of each
1050grace period. 1136grace period.
1051 1137
1052</p><p>The <tt>-&gt;nxtcompleted[]</tt> array records grace-period 1138<p>
1053numbers corresponding to the list segments.
1054This allows CPUs that go idle for extended periods to determine
1055which of their callbacks are ready to be invoked after reawakening.
1056
1057</p><p>The <tt>-&gt;qlen</tt> counter contains the number of
1058callbacks in <tt>-&gt;nxtlist</tt>, and the
1059<tt>-&gt;qlen_lazy</tt> contains the number of those callbacks that
1060are known to only free memory, and whose invocation can therefore
1061be safely deferred.
1062The <tt>-&gt;qlen_last_fqs_check</tt> and 1139The <tt>-&gt;qlen_last_fqs_check</tt> and
1063<tt>-&gt;n_force_qs_snap</tt> coordinate the forcing of quiescent 1140<tt>-&gt;n_force_qs_snap</tt> coordinate the forcing of quiescent
1064states from <tt>call_rcu()</tt> and friends when callback 1141states from <tt>call_rcu()</tt> and friends when callback
@@ -1069,6 +1146,10 @@ lists grow excessively long.
1069fields count the number of callbacks invoked, 1146fields count the number of callbacks invoked,
1070sent to other CPUs when this CPU goes offline, 1147sent to other CPUs when this CPU goes offline,
1071and received from other CPUs when those other CPUs go offline. 1148and received from other CPUs when those other CPUs go offline.
1149The <tt>-&gt;n_nocbs_invoked</tt> is used when the CPU's callbacks
1150are offloaded to a kthread.
1151
1152<p>
1072Finally, the <tt>-&gt;blimit</tt> counter is the maximum number of 1153Finally, the <tt>-&gt;blimit</tt> counter is the maximum number of
1073RCU callbacks that may be invoked at a given time. 1154RCU callbacks that may be invoked at a given time.
1074 1155
@@ -1104,6 +1185,9 @@ Its fields are as follows:
1104 1 int dynticks_nesting; 1185 1 int dynticks_nesting;
1105 2 int dynticks_nmi_nesting; 1186 2 int dynticks_nmi_nesting;
1106 3 atomic_t dynticks; 1187 3 atomic_t dynticks;
1188 4 bool rcu_need_heavy_qs;
1189 5 unsigned long rcu_qs_ctr;
1190 6 bool rcu_urgent_qs;
1107</pre> 1191</pre>
1108 1192
1109<p>The <tt>-&gt;dynticks_nesting</tt> field counts the 1193<p>The <tt>-&gt;dynticks_nesting</tt> field counts the
@@ -1117,11 +1201,32 @@ NMIs are counted by the <tt>-&gt;dynticks_nmi_nesting</tt>
1117field, except that NMIs that interrupt non-dyntick-idle execution 1201field, except that NMIs that interrupt non-dyntick-idle execution
1118are not counted. 1202are not counted.
1119 1203
1120</p><p>Finally, the <tt>-&gt;dynticks</tt> field counts the corresponding 1204</p><p>The <tt>-&gt;dynticks</tt> field counts the corresponding
1121CPU's transitions to and from dyntick-idle mode, so that this counter 1205CPU's transitions to and from dyntick-idle mode, so that this counter
1122has an even value when the CPU is in dyntick-idle mode and an odd 1206has an even value when the CPU is in dyntick-idle mode and an odd
1123value otherwise. 1207value otherwise.
1124 1208
1209</p><p>The <tt>-&gt;rcu_need_heavy_qs</tt> field is used
1210to record the fact that the RCU core code would really like to
1211see a quiescent state from the corresponding CPU, so much so that
1212it is willing to call for heavy-weight dyntick-counter operations.
1213This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
1214code, which provide a momentary idle sojourn in response.
1215
1216</p><p>The <tt>-&gt;rcu_qs_ctr</tt> field is used to record
1217quiescent states from <tt>cond_resched()</tt>.
1218Because <tt>cond_resched()</tt> can execute quite frequently, this
1219must be quite lightweight, as in a non-atomic increment of this
1220per-CPU field.
1221
1222</p><p>Finally, the <tt>-&gt;rcu_urgent_qs</tt> field is used to record
1223the fact that the RCU core code would really like to see a quiescent
1224state from the corresponding CPU, with the various other fields indicating
1225just how badly RCU wants this quiescent state.
1226This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
1227code, which, if nothing else, non-atomically increments <tt>-&gt;rcu_qs_ctr</tt>
1228in response.
1229
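
By way of illustration (not from this patch), the division of labor among these three fields can be sketched in stand-alone C; the structure and function names below are simplified stand-ins rather than the kernel's actual per-CPU implementation.

#include <stdbool.h>

struct dynticks_hints {
	bool rcu_urgent_qs;		/* RCU wants a quiescent state soon          */
	bool rcu_need_heavy_qs;		/* ...badly enough for heavy-weight work     */
	unsigned long rcu_qs_ctr;	/* cheap count of reported quiescent states  */
};

static struct dynticks_hints this_cpu_hints;	/* one instance per CPU in reality */

static void momentary_idle_sojourn(void)
{
	/* stand-in for the heavy-weight dyntick-counter manipulation */
}

/* Called from scheduling points such as cond_resched(). */
static void note_context_switch_sketch(void)
{
	struct dynticks_hints *h = &this_cpu_hints;

	if (!h->rcu_urgent_qs)
		return;			/* common case: stay lightweight             */
	h->rcu_urgent_qs = false;
	h->rcu_qs_ctr++;		/* non-atomic, per-CPU quiescent-state report */
	if (h->rcu_need_heavy_qs) {
		h->rcu_need_heavy_qs = false;
		momentary_idle_sojourn();
	}
}
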
1125<table> 1230<table>
1126<tr><th>&nbsp;</th></tr> 1231<tr><th>&nbsp;</th></tr>
1127<tr><th align="left">Quick Quiz:</th></tr> 1232<tr><th align="left">Quick Quiz:</th></tr>
diff --git a/Documentation/RCU/Design/Data-Structures/nxtlist.svg b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
index abc4cc73a097..0223e79c38e0 100644
--- a/Documentation/RCU/Design/Data-Structures/nxtlist.svg
+++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
@@ -19,7 +19,7 @@
19 id="svg2" 19 id="svg2"
20 version="1.1" 20 version="1.1"
21 inkscape:version="0.48.4 r9939" 21 inkscape:version="0.48.4 r9939"
22 sodipodi:docname="nxtlist.fig"> 22 sodipodi:docname="segcblist.svg">
23 <metadata 23 <metadata
24 id="metadata94"> 24 id="metadata94">
25 <rdf:RDF> 25 <rdf:RDF>
@@ -28,7 +28,7 @@
28 <dc:format>image/svg+xml</dc:format> 28 <dc:format>image/svg+xml</dc:format>
29 <dc:type 29 <dc:type
30 rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> 30 rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
31 <dc:title></dc:title> 31 <dc:title />
32 </cc:Work> 32 </cc:Work>
33 </rdf:RDF> 33 </rdf:RDF>
34 </metadata> 34 </metadata>
@@ -241,61 +241,51 @@
241 xml:space="preserve" 241 xml:space="preserve"
242 x="225" 242 x="225"
243 y="675" 243 y="675"
244 fill="#000000"
245 font-family="Courier"
246 font-style="normal" 244 font-style="normal"
247 font-weight="bold" 245 font-weight="bold"
248 font-size="324" 246 font-size="324"
249 text-anchor="start" 247 id="text64"
250 id="text64">nxtlist</text> 248 style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;head</text>
251 <!-- Text --> 249 <!-- Text -->
252 <text 250 <text
253 xml:space="preserve" 251 xml:space="preserve"
254 x="225" 252 x="225"
255 y="1800" 253 y="1800"
256 fill="#000000"
257 font-family="Courier"
258 font-style="normal" 254 font-style="normal"
259 font-weight="bold" 255 font-weight="bold"
260 font-size="324" 256 font-size="324"
261 text-anchor="start" 257 id="text66"
262 id="text66">nxttail[RCU_DONE_TAIL]</text> 258 style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_DONE_TAIL]</text>
263 <!-- Text --> 259 <!-- Text -->
264 <text 260 <text
265 xml:space="preserve" 261 xml:space="preserve"
266 x="225" 262 x="225"
267 y="2925" 263 y="2925"
268 fill="#000000"
269 font-family="Courier"
270 font-style="normal" 264 font-style="normal"
271 font-weight="bold" 265 font-weight="bold"
272 font-size="324" 266 font-size="324"
273 text-anchor="start" 267 id="text68"
274 id="text68">nxttail[RCU_WAIT_TAIL]</text> 268 style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_WAIT_TAIL]</text>
275 <!-- Text --> 269 <!-- Text -->
276 <text 270 <text
277 xml:space="preserve" 271 xml:space="preserve"
278 x="225" 272 x="225"
279 y="4050" 273 y="4050"
280 fill="#000000"
281 font-family="Courier"
282 font-style="normal" 274 font-style="normal"
283 font-weight="bold" 275 font-weight="bold"
284 font-size="324" 276 font-size="324"
285 text-anchor="start" 277 id="text70"
286 id="text70">nxttail[RCU_NEXT_READY_TAIL]</text> 278 style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_NEXT_READY_TAIL]</text>
287 <!-- Text --> 279 <!-- Text -->
288 <text 280 <text
289 xml:space="preserve" 281 xml:space="preserve"
290 x="225" 282 x="225"
291 y="5175" 283 y="5175"
292 fill="#000000"
293 font-family="Courier"
294 font-style="normal" 284 font-style="normal"
295 font-weight="bold" 285 font-weight="bold"
296 font-size="324" 286 font-size="324"
297 text-anchor="start" 287 id="text72"
298 id="text72">nxttail[RCU_NEXT_TAIL]</text> 288 style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_NEXT_TAIL]</text>
299 <!-- Text --> 289 <!-- Text -->
300 <text 290 <text
301 xml:space="preserve" 291 xml:space="preserve"
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
index 7a3194c5559a..e5d0bbd0230b 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
@@ -284,6 +284,7 @@ Expedited Grace Period Refinements</a></h2>
284 Funnel locking and wait/wakeup</a>. 284 Funnel locking and wait/wakeup</a>.
285<li> <a href="#Use of Workqueues">Use of Workqueues</a>. 285<li> <a href="#Use of Workqueues">Use of Workqueues</a>.
286<li> <a href="#Stall Warnings">Stall warnings</a>. 286<li> <a href="#Stall Warnings">Stall warnings</a>.
287<li> <a href="#Mid-Boot Operation">Mid-boot operation</a>.
287</ol> 288</ol>
288 289
289<h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3> 290<h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3>
@@ -524,7 +525,7 @@ their grace periods and carrying out their wakeups.
524In earlier implementations, the task requesting the expedited 525In earlier implementations, the task requesting the expedited
525grace period also drove it to completion. 526grace period also drove it to completion.
526This straightforward approach had the disadvantage of needing to 527This straightforward approach had the disadvantage of needing to
527account for signals sent to user tasks, 528account for POSIX signals sent to user tasks,
528so more recent implemementations use the Linux kernel's 529so more recent implemementations use the Linux kernel's
529<a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>. 530<a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>.
530 531
@@ -533,8 +534,8 @@ The requesting task still does counter snapshotting and funnel-lock
533processing, but the task reaching the top of the funnel lock 534processing, but the task reaching the top of the funnel lock
534does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt> 535does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt>
535so that a workqueue kthread does the actual grace-period processing. 536so that a workqueue kthread does the actual grace-period processing.
536Because workqueue kthreads do not accept signals, grace-period-wait 537Because workqueue kthreads do not accept POSIX signals, grace-period-wait
537processing need not allow for signals. 538processing need not allow for POSIX signals.
538 539
539In addition, this approach allows wakeups for the previous expedited 540In addition, this approach allows wakeups for the previous expedited
540grace period to be overlapped with processing for the next expedited 541grace period to be overlapped with processing for the next expedited
@@ -586,6 +587,46 @@ blocking the current grace period are printed.
586Each stall warning results in another pass through the loop, but the 587Each stall warning results in another pass through the loop, but the
587second and subsequent passes use longer stall times. 588second and subsequent passes use longer stall times.
588 589
590<h3><a name="Mid-Boot Operation">Mid-boot operation</a></h3>
591
592<p>
593The use of workqueues has the advantage that the expedited
594grace-period code need not worry about POSIX signals.
595Unfortunately, it has the
596corresponding disadvantage that workqueues cannot be used until
597they are initialized, which does not happen until some time after
598the scheduler spawns the first task.
599Given that there are parts of the kernel that really do want to
600execute grace periods during this mid-boot &ldquo;dead zone&rdquo;,
601expedited grace periods must do something else during this time.
602
603<p>
604What they do is to fall back to the old practice of requiring that the
605requesting task drive the expedited grace period, as was the case
606before the use of workqueues.
607However, the requesting task is only required to drive the grace period
608during the mid-boot dead zone.
609Before mid-boot, a synchronous grace period is a no-op.
610Some time after mid-boot, workqueues are used.
611
612<p>
613Non-expedited non-SRCU synchronous grace periods must also operate
614normally during mid-boot.
615This is handled by causing non-expedited grace periods to take the
616expedited code path during mid-boot.
617
618<p>
619The current code assumes that there are no POSIX signals during
620the mid-boot dead zone.
621However, if an overwhelming need for POSIX signals somehow arises,
622appropriate adjustments can be made to the expedited stall-warning code.
623One such adjustment would reinstate the pre-workqueue stall-warning
624checks, but only during the mid-boot dead zone.
625
626<p>
627With this refinement, synchronous grace periods can now be used from
628task context pretty much any time during the life of the kernel.
629
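
As an illustrative aside rather than part of this patch, the three phases described above amount to a simple dispatch; the phase and helper names below are invented for illustration and are not kernel identifiers.

enum boot_phase {
	PHASE_EARLY_BOOT,	/* one task, preemption disabled: GP is a no-op */
	PHASE_MID_BOOT,		/* scheduler up, workqueues not yet: dead zone  */
	PHASE_RUN_TIME		/* workqueues available                         */
};

static void drive_expedited_gp_directly(void) { /* requester does the work  */ }
static void queue_expedited_gp_work(void)     { /* hand off to a workqueue  */ }
static void wait_for_expedited_gp(void)       { /* wait for the kthread     */ }

static void synchronize_expedited_sketch(enum boot_phase phase)
{
	switch (phase) {
	case PHASE_EARLY_BOOT:
		return;				/* nothing can be inside a reader     */
	case PHASE_MID_BOOT:
		drive_expedited_gp_directly();	/* no workqueues, no POSIX signals    */
		return;
	case PHASE_RUN_TIME:
		queue_expedited_gp_work();	/* workqueue kthread ignores signals  */
		wait_for_expedited_gp();
		return;
	}
}
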
589<h3><a name="Summary"> 630<h3><a name="Summary">
590Summary</a></h3> 631Summary</a></h3>
591 632
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index 21593496aca6..f60adf112663 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -659,8 +659,9 @@ systems with more than one CPU:
659 In other words, a given instance of <tt>synchronize_rcu()</tt> 659 In other words, a given instance of <tt>synchronize_rcu()</tt>
660 can avoid waiting on a given RCU read-side critical section only 660 can avoid waiting on a given RCU read-side critical section only
661 if it can prove that <tt>synchronize_rcu()</tt> started first. 661 if it can prove that <tt>synchronize_rcu()</tt> started first.
662 </font>
662 663
663 <p> 664 <p><font color="ffffff">
664 A related question is &ldquo;When <tt>rcu_read_lock()</tt> 665 A related question is &ldquo;When <tt>rcu_read_lock()</tt>
665 doesn't generate any code, why does it matter how it relates 666 doesn't generate any code, why does it matter how it relates
666 to a grace period?&rdquo; 667 to a grace period?&rdquo;
@@ -675,8 +676,9 @@ systems with more than one CPU:
675 within the critical section, in which case none of the accesses 676 within the critical section, in which case none of the accesses
676 within the critical section may observe the effects of any 677 within the critical section may observe the effects of any
677 access following the grace period. 678 access following the grace period.
679 </font>
678 680
679 <p> 681 <p><font color="ffffff">
680 As of late 2016, mathematical models of RCU take this 682 As of late 2016, mathematical models of RCU take this
681 viewpoint, for example, see slides&nbsp;62 and&nbsp;63 683 viewpoint, for example, see slides&nbsp;62 and&nbsp;63
682 of the 684 of the
@@ -1616,8 +1618,8 @@ CPUs should at least make reasonable forward progress.
1616In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> 1618In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
1617is permitted to impose modest degradation of real-time latency 1619is permitted to impose modest degradation of real-time latency
1618on non-idle online CPUs. 1620on non-idle online CPUs.
1619That said, it will likely be necessary to take further steps to reduce this 1621Here, &ldquo;modest&rdquo; means roughly the same latency
1620degradation, hopefully to roughly that of a scheduling-clock interrupt. 1622degradation as a scheduling-clock interrupt.
1621 1623
1622<p> 1624<p>
1623There are a number of situations where even 1625There are a number of situations where even
@@ -1913,12 +1915,9 @@ This requirement is another factor driving batching of grace periods,
1913but it is also the driving force behind the checks for large numbers 1915but it is also the driving force behind the checks for large numbers
1914of queued RCU callbacks in the <tt>call_rcu()</tt> code path. 1916of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
1915Finally, high update rates should not delay RCU read-side critical 1917Finally, high update rates should not delay RCU read-side critical
1916sections, although some read-side delays can occur when using 1918sections, although some small read-side delays can occur when using
1917<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use 1919<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
1918of <tt>try_stop_cpus()</tt>. 1920of <tt>smp_call_function_single()</tt>.
1919(In the future, <tt>synchronize_rcu_expedited()</tt> will be
1920converted to use lighter-weight inter-processor interrupts (IPIs),
1921but this will still disturb readers, though to a much smaller degree.)
1922 1921
1923<p> 1922<p>
1924Although all three of these corner cases were understood in the early 1923Although all three of these corner cases were understood in the early
@@ -2154,7 +2153,8 @@ as will <tt>rcu_assign_pointer()</tt>.
2154<p> 2153<p>
2155Although <tt>call_rcu()</tt> may be invoked at any 2154Although <tt>call_rcu()</tt> may be invoked at any
2156time during boot, callbacks are not guaranteed to be invoked until after 2155time during boot, callbacks are not guaranteed to be invoked until after
2157the scheduler is fully up and running. 2156all of RCU's kthreads have been spawned, which occurs at
2157<tt>early_initcall()</tt> time.
2158This delay in callback invocation is due to the fact that RCU does not 2158This delay in callback invocation is due to the fact that RCU does not
2159invoke callbacks until it is fully initialized, and this full initialization 2159invoke callbacks until it is fully initialized, and this full initialization
2160cannot occur until after the scheduler has initialized itself to the 2160cannot occur until after the scheduler has initialized itself to the
@@ -2167,8 +2167,10 @@ on what operations those callbacks could invoke.
2167Perhaps surprisingly, <tt>synchronize_rcu()</tt>, 2167Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
2168<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> 2168<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
2169(<a href="#Bottom-Half Flavor">discussed below</a>), 2169(<a href="#Bottom-Half Flavor">discussed below</a>),
2170and 2170<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>,
2171<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> 2171<tt>synchronize_rcu_expedited()</tt>,
2172<tt>synchronize_rcu_bh_expedited()</tt>, and
2173<tt>synchronize_sched_expedited()</tt>
2172will all operate normally 2174will all operate normally
2173during very early boot, the reason being that there is only one CPU 2175during very early boot, the reason being that there is only one CPU
2174and preemption is disabled. 2176and preemption is disabled.
@@ -2178,45 +2180,59 @@ state and thus a grace period, so the early-boot implementation can
2178be a no-op. 2180be a no-op.
2179 2181
2180<p> 2182<p>
2181Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> 2183However, once the scheduler has spawned its first kthread, this early
2182continue to operate normally through the remainder of boot, courtesy 2184boot trick fails for <tt>synchronize_rcu()</tt> (as well as for
2183of the fact that preemption is disabled across their RCU read-side 2185<tt>synchronize_rcu_expedited()</tt>) in <tt>CONFIG_PREEMPT=y</tt>
2184critical sections and also courtesy of the fact that there is still 2186kernels.
2185only one CPU. 2187The reason is that an RCU read-side critical section might be preempted,
2186However, once the scheduler starts initializing, preemption is enabled. 2188which means that a subsequent <tt>synchronize_rcu()</tt> really does have
2187There is still only a single CPU, but the fact that preemption is enabled 2189to wait for something, as opposed to simply returning immediately.
2188means that the no-op implementation of <tt>synchronize_rcu()</tt> no 2190Unfortunately, <tt>synchronize_rcu()</tt> can't do this until all of
2189longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. 2191its kthreads are spawned, which doesn't happen until some time during
2190Therefore, as soon as the scheduler starts initializing, the early-boot 2192<tt>early_initcalls()</tt> time.
2191fastpath is disabled. 2193But this is no excuse: RCU is nevertheless required to correctly handle
2192This means that <tt>synchronize_rcu()</tt> switches to its runtime 2194synchronous grace periods during this time period.
2193mode of operation where it posts callbacks, which in turn means that 2195Once all of its kthreads are up and running, RCU starts running
2194any call to <tt>synchronize_rcu()</tt> will block until the corresponding 2196normally.
2195callback is invoked.
2196Unfortunately, the callback cannot be invoked until RCU's runtime
2197grace-period machinery is up and running, which cannot happen until
2198the scheduler has initialized itself sufficiently to allow RCU's
2199kthreads to be spawned.
2200Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
2201initialization can result in deadlock.
2202 2197
2203<table> 2198<table>
2204<tr><th>&nbsp;</th></tr> 2199<tr><th>&nbsp;</th></tr>
2205<tr><th align="left">Quick Quiz:</th></tr> 2200<tr><th align="left">Quick Quiz:</th></tr>
2206<tr><td> 2201<tr><td>
2207 So what happens with <tt>synchronize_rcu()</tt> during 2202 How can RCU possibly handle grace periods before all of its
2208 scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> 2203 kthreads have been spawned???
2209 kernels?
2210</td></tr> 2204</td></tr>
2211<tr><th align="left">Answer:</th></tr> 2205<tr><th align="left">Answer:</th></tr>
2212<tr><td bgcolor="#ffffff"><font color="ffffff"> 2206<tr><td bgcolor="#ffffff"><font color="ffffff">
2213 In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> 2207 Very carefully!
2214 maps directly to <tt>synchronize_sched()</tt>. 2208 </font>
2215 Therefore, <tt>synchronize_rcu()</tt> works normally throughout 2209
2216 boot in <tt>CONFIG_PREEMPT=n</tt> kernels. 2210 <p><font color="ffffff">
2217 However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, 2211 During the &ldquo;dead zone&rdquo; between the time that the
2218 so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> 2212 scheduler spawns the first task and the time that all of RCU's
2219 during scheduler initialization. 2213 kthreads have been spawned, all synchronous grace periods are
2214 handled by the expedited grace-period mechanism.
2215 At runtime, this expedited mechanism relies on workqueues, but
2216 during the dead zone the requesting task itself drives the
2217 desired expedited grace period.
2218 Because dead-zone execution takes place within task context,
2219 everything works.
2220 Once the dead zone ends, expedited grace periods go back to
2221 using workqueues, as is required to avoid problems that would
2222 otherwise occur when a user task received a POSIX signal while
2223 driving an expedited grace period.
2224 </font>
2225
2226 <p><font color="ffffff">
2227 And yes, this does mean that it is unhelpful to send POSIX
2228 signals to random tasks between the time that the scheduler
2229 spawns its first kthread and the time that RCU's kthreads
2230 have all been spawned.
2231 If there ever turns out to be a good reason for sending POSIX
2232 signals during that time, appropriate adjustments will be made.
2233 (If it turns out that POSIX signals are sent during this time for
2234 no good reason, other adjustments will be made, appropriate
2235 or otherwise.)
2220</font></td></tr> 2236</font></td></tr>
2221<tr><td>&nbsp;</td></tr> 2237<tr><td>&nbsp;</td></tr>
2222</table> 2238</table>
@@ -2295,12 +2311,61 @@ situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
2295The need for <tt>rcu_barrier()</tt> for module unloading became 2311The need for <tt>rcu_barrier()</tt> for module unloading became
2296apparent later. 2312apparent later.
2297 2313
2314<p>
2315<b>Important note</b>: The <tt>rcu_barrier()</tt> function is not,
2316repeat, <i>not</i>, obligated to wait for a grace period.
2317It is instead only required to wait for RCU callbacks that have
2318already been posted.
2319Therefore, if there are no RCU callbacks posted anywhere in the system,
2320<tt>rcu_barrier()</tt> is within its rights to return immediately.
2321Even if there are callbacks posted, <tt>rcu_barrier()</tt> does not
2322necessarily need to wait for a grace period.
2323
2324<table>
2325<tr><th>&nbsp;</th></tr>
2326<tr><th align="left">Quick Quiz:</th></tr>
2327<tr><td>
2328 Wait a minute!
2329	Each RCU callback must wait for a grace period to complete,
2330 and <tt>rcu_barrier()</tt> must wait for each pre-existing
2331 callback to be invoked.
2332 Doesn't <tt>rcu_barrier()</tt> therefore need to wait for
2333 a full grace period if there is even one callback posted anywhere
2334 in the system?
2335</td></tr>
2336<tr><th align="left">Answer:</th></tr>
2337<tr><td bgcolor="#ffffff"><font color="ffffff">
2338 Absolutely not!!!
2339 </font>
2340
2341 <p><font color="ffffff">
2342	Yes, each RCU callback must wait for a grace period to complete,
2343 but it might well be partly (or even completely) finished waiting
2344 by the time <tt>rcu_barrier()</tt> is invoked.
2345 In that case, <tt>rcu_barrier()</tt> need only wait for the
2346 remaining portion of the grace period to elapse.
2347 So even if there are quite a few callbacks posted,
2348 <tt>rcu_barrier()</tt> might well return quite quickly.
2349 </font>
2350
2351 <p><font color="ffffff">
2352 So if you need to wait for a grace period as well as for all
2353 pre-existing callbacks, you will need to invoke both
2354 <tt>synchronize_rcu()</tt> and <tt>rcu_barrier()</tt>.
2355 If latency is a concern, you can always use workqueues
2356 to invoke them concurrently.
2357</font></td></tr>
2358<tr><td>&nbsp;</td></tr>
2359</table>
2360
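
As an illustrative aside (not taken from this patch), the workqueue suggestion in the quick-quiz answer above might look like the following kernel-style sketch, which overlaps the grace-period wait with the callback wait; the helper names are invented for illustration.

#include <linux/workqueue.h>
#include <linux/rcupdate.h>
#include <linux/completion.h>

static DECLARE_COMPLETION(gp_done);

static void gp_wait_func(struct work_struct *unused)
{
	synchronize_rcu();	/* wait for a full grace period ...          */
	complete(&gp_done);	/* ... and signal the waiting task below     */
}

static DECLARE_WORK(gp_wait_work, gp_wait_func);

static void wait_for_gp_and_callbacks(void)
{
	schedule_work(&gp_wait_work);	/* grace-period wait runs in parallel */
	rcu_barrier();			/* wait for all pre-posted callbacks  */
	wait_for_completion(&gp_done);	/* then for the grace period itself   */
}
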
2298<h3><a name="Hotplug CPU">Hotplug CPU</a></h3> 2361<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
2299 2362
2300<p> 2363<p>
2301The Linux kernel supports CPU hotplug, which means that CPUs 2364The Linux kernel supports CPU hotplug, which means that CPUs
2302can come and go. 2365can come and go.
2303It is of course illegal to use any RCU API member from an offline CPU. 2366It is of course illegal to use any RCU API member from an offline CPU,
2367with the exception of <a href="#Sleepable RCU">SRCU</a> read-side
2368critical sections.
2304This requirement was present from day one in DYNIX/ptx, but 2369This requirement was present from day one in DYNIX/ptx, but
2305on the other hand, the Linux kernel's CPU-hotplug implementation 2370on the other hand, the Linux kernel's CPU-hotplug implementation
2306is &ldquo;interesting.&rdquo; 2371is &ldquo;interesting.&rdquo;
@@ -2310,19 +2375,18 @@ The Linux-kernel CPU-hotplug implementation has notifiers that
2310are used to allow the various kernel subsystems (including RCU) 2375are used to allow the various kernel subsystems (including RCU)
2311to respond appropriately to a given CPU-hotplug operation. 2376to respond appropriately to a given CPU-hotplug operation.
2312Most RCU operations may be invoked from CPU-hotplug notifiers, 2377Most RCU operations may be invoked from CPU-hotplug notifiers,
2313including even normal synchronous grace-period operations 2378including even synchronous grace-period operations such as
2314such as <tt>synchronize_rcu()</tt>. 2379<tt>synchronize_rcu()</tt> and <tt>synchronize_rcu_expedited()</tt>.
2315However, expedited grace-period operations such as
2316<tt>synchronize_rcu_expedited()</tt> are not supported,
2317due to the fact that current implementations block CPU-hotplug
2318operations, which could result in deadlock.
2319 2380
2320<p> 2381<p>
2321In addition, all-callback-wait operations such as 2382However, all-callback-wait operations such as
2322<tt>rcu_barrier()</tt> are also not supported, due to the 2383<tt>rcu_barrier()</tt> are also not supported, due to the
2323fact that there are phases of CPU-hotplug operations where 2384fact that there are phases of CPU-hotplug operations where
2324the outgoing CPU's callbacks will not be invoked until after 2385the outgoing CPU's callbacks will not be invoked until after
2325the CPU-hotplug operation ends, which could also result in deadlock. 2386the CPU-hotplug operation ends, which could also result in deadlock.
2387Furthermore, <tt>rcu_barrier()</tt> blocks CPU-hotplug operations
2388during its execution, which results in another type of deadlock
2389when invoked from a CPU-hotplug notifier.
2326 2390
2327<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> 2391<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
2328 2392
@@ -2864,6 +2928,27 @@ API, which, in combination with <tt>srcu_read_unlock()</tt>,
2864guarantees a full memory barrier. 2928guarantees a full memory barrier.
2865 2929
2866<p> 2930<p>
2931Also unlike other RCU flavors, SRCU's callbacks-wait function
2932<tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers,
2933though this is not necessarily a good idea.
2934The reason that this is possible is that SRCU is insensitive
2935to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
2936need not exclude CPU-hotplug operations.
2937
2938<p>
2939As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating
2940a locking bottleneck present in prior kernel versions.
2941Although this will allow users to put much heavier stress on
2942<tt>call_srcu()</tt>, it is important to note that SRCU does not
2943yet take any special steps to deal with callback flooding.
2944So if you are posting (say) 10,000 SRCU callbacks per second per CPU,
2945you are probably totally OK, but if you intend to post (say) 1,000,000
2946SRCU callbacks per second per CPU, please run some tests first.
2947SRCU just might need a few adjustments to deal with that sort of load.
2948Of course, your mileage may vary based on the speed of your CPUs and
2949the size of your memory.
2950
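
By way of illustration (not from this patch), a typical call_srcu()/srcu_barrier() pairing looks like the following kernel-style sketch; the srcu_struct, type, and helper names are invented for illustration.

#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>

DEFINE_STATIC_SRCU(my_srcu);

struct my_obj {
	struct rcu_head rh;
	int payload;
};

static void my_obj_free_cb(struct rcu_head *rh)
{
	kfree(container_of(rh, struct my_obj, rh));
}

static void my_obj_retire(struct my_obj *p)
{
	/* Freed only after all pre-existing SRCU readers of my_srcu finish. */
	call_srcu(&my_srcu, &p->rh, my_obj_free_cb);
}

static void my_obj_flush(void)
{
	srcu_barrier(&my_srcu);	/* wait for every callback queued above */
}
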
2951<p>
2867The 2952The
2868<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> 2953<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
2869includes 2954includes
@@ -3021,8 +3106,8 @@ to do some redesign to avoid this scalability problem.
3021 3106
3022<p> 3107<p>
3023RCU disables CPU hotplug in a few places, perhaps most notably in the 3108RCU disables CPU hotplug in a few places, perhaps most notably in the
3024expedited grace-period and <tt>rcu_barrier()</tt> operations. 3109<tt>rcu_barrier()</tt> operations.
3025If there is a strong reason to use expedited grace periods in CPU-hotplug 3110If there is a strong reason to use <tt>rcu_barrier()</tt> in CPU-hotplug
3026notifiers, it will be necessary to avoid disabling CPU hotplug. 3111notifiers, it will be necessary to avoid disabling CPU hotplug.
3027This would introduce some complexity, so there had better be a <i>very</i> 3112This would introduce some complexity, so there had better be a <i>very</i>
3028good reason. 3113good reason.
@@ -3096,9 +3181,5 @@ Andy Lutomirski for their help in rendering
3096this article human readable, and to Michelle Rankin for her support 3181this article human readable, and to Michelle Rankin for her support
3097of this effort. 3182of this effort.
3098Other contributions are acknowledged in the Linux kernel's git archive. 3183Other contributions are acknowledged in the Linux kernel's git archive.
3099The cartoon is copyright (c) 2013 by Melissa Broussard,
3100and is provided
3101under the terms of the Creative Commons Attribution-Share Alike 3.0
3102United States license.
3103 3184
3104</body></html> 3185</body></html>
diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt
index c0bf2441a2ba..b2a613f16d74 100644
--- a/Documentation/RCU/rcu_dereference.txt
+++ b/Documentation/RCU/rcu_dereference.txt
@@ -138,6 +138,15 @@ o Be very careful about comparing pointers obtained from
138 This sort of comparison occurs frequently when scanning 138 This sort of comparison occurs frequently when scanning
139 RCU-protected circular linked lists. 139 RCU-protected circular linked lists.
140 140
141 Note that if checks for being within an RCU read-side
142 critical section are not required and the pointer is never
143 dereferenced, rcu_access_pointer() should be used in place
144 of rcu_dereference(). The rcu_access_pointer() primitive
145 does not require an enclosing read-side critical section,
146 and also omits the smp_read_barrier_depends() included in
147 rcu_dereference(), which in turn should provide a small
148 performance gain in some CPUs (e.g., the DEC Alpha).
149
141 o The comparison is against a pointer that references memory 150 o The comparison is against a pointer that references memory
142 that was initialized "a long time ago." The reason 151 that was initialized "a long time ago." The reason
143 this is safe is that even if misordering occurs, the 152 this is safe is that even if misordering occurs, the
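
As an illustrative aside (not taken from this patch), the rcu_access_pointer() guidance added above can be shown with a short kernel-style sketch; the variable and helper names are invented for illustration.

#include <linux/types.h>
#include <linux/rcupdate.h>

struct foo {
	int a;
};

static struct foo __rcu *global_foo;

/* No dereference, so no read-side critical section is required. */
static bool global_foo_is_set(void)
{
	return rcu_access_pointer(global_foo) != NULL;
}

/* Dereference, so an enclosing rcu_read_lock() section is required. */
static int global_foo_read_a(void)
{
	struct foo *p;
	int a = -1;

	rcu_read_lock();
	p = rcu_dereference(global_foo);
	if (p)
		a = p->a;
	rcu_read_unlock();
	return a;
}
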
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
index 18f9651ff23d..8151f0195f76 100644
--- a/Documentation/RCU/rculist_nulls.txt
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -1,5 +1,5 @@
1Using hlist_nulls to protect read-mostly linked lists and 1Using hlist_nulls to protect read-mostly linked lists and
2objects using SLAB_DESTROY_BY_RCU allocations. 2objects using SLAB_TYPESAFE_BY_RCU allocations.
3 3
4Please read the basics in Documentation/RCU/listRCU.txt 4Please read the basics in Documentation/RCU/listRCU.txt
5 5
@@ -7,7 +7,7 @@ Using special makers (called 'nulls') is a convenient way
7to solve following problem : 7to solve following problem :
8 8
9A typical RCU linked list managing objects which are 9A typical RCU linked list managing objects which are
10allocated with SLAB_DESTROY_BY_RCU kmem_cache can 10allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can
11use following algos : 11use following algos :
12 12
131) Lookup algo 131) Lookup algo
@@ -96,7 +96,7 @@ unlock_chain(); // typically a spin_unlock()
963) Remove algo 963) Remove algo
97-------------- 97--------------
98Nothing special here, we can use a standard RCU hlist deletion. 98Nothing special here, we can use a standard RCU hlist deletion.
99But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused 99But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused
100very very fast (before the end of RCU grace period) 100very very fast (before the end of RCU grace period)
101 101
102if (put_last_reference_on(obj) { 102if (put_last_reference_on(obj) {
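
As an illustrative aside rather than part of this patch, creating a cache with SLAB_TYPESAFE_BY_RCU looks like the following kernel-style sketch; the cache, type, and helper names are invented for illustration, and the lock/refcount fields stand in for the revalidation step that the lookup algorithm above relies on.

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/errno.h>

struct conn {
	spinlock_t lock;	/* used to revalidate the object after lookup */
	refcount_t refcnt;
	int key;
};

static struct kmem_cache *conn_cachep;

static int conn_cache_init(void)
{
	conn_cachep = kmem_cache_create("conn_cache", sizeof(struct conn),
					0, SLAB_TYPESAFE_BY_RCU, NULL);
	return conn_cachep ? 0 : -ENOMEM;
}

static void conn_free(struct conn *c)
{
	/* No RCU delay needed here: the cache guarantees the memory stays a
	 * struct conn, although it may be recycled immediately. */
	kmem_cache_free(conn_cachep, c);
}
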
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index e93d04133fe7..96a3d81837e1 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -1,9 +1,102 @@
1Using RCU's CPU Stall Detector 1Using RCU's CPU Stall Detector
2 2
3The rcu_cpu_stall_suppress module parameter enables RCU's CPU stall 3This document first discusses what sorts of issues RCU's CPU stall
4detector, which detects conditions that unduly delay RCU grace periods. 4detector can locate, and then discusses kernel parameters and Kconfig
5This module parameter enables CPU stall detection by default, but 5options that can be used to fine-tune the detector's operation. Finally,
6may be overridden via boot-time parameter or at runtime via sysfs. 6this document explains the stall detector's "splat" format.
7
8
9What Causes RCU CPU Stall Warnings?
10
11So your kernel printed an RCU CPU stall warning. The next question is
12"What caused it?" The following problems can result in RCU CPU stall
13warnings:
14
15o A CPU looping in an RCU read-side critical section.
16
17o A CPU looping with interrupts disabled.
18
19o A CPU looping with preemption disabled. This condition can
20 result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
21 stalls.
22
23o A CPU looping with bottom halves disabled. This condition can
24 result in RCU-sched and RCU-bh stalls.
25
26o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
27 kernel without invoking schedule(). Note that cond_resched()
28 does not necessarily prevent RCU CPU stall warnings. Therefore,
29 if the looping in the kernel is really expected and desirable
30 behavior, you might need to replace some of the cond_resched()
31 calls with calls to cond_resched_rcu_qs().
32
33o Booting Linux using a console connection that is too slow to
34 keep up with the boot-time console-message rate. For example,
35 a 115Kbaud serial console can be -way- too slow to keep up
36 with boot-time message rates, and will frequently result in
37 RCU CPU stall warning messages. Especially if you have added
38 debug printk()s.
39
40o Anything that prevents RCU's grace-period kthreads from running.
41 This can result in the "All QSes seen" console-log message.
42 This message will include information on when the kthread last
43 ran and how often it should be expected to run.
44
45o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
46 happen to preempt a low-priority task in the middle of an RCU
47 read-side critical section. This is especially damaging if
48 that low-priority task is not permitted to run on any other CPU,
49 in which case the next RCU grace period can never complete, which
50 will eventually cause the system to run out of memory and hang.
51 While the system is in the process of running itself out of
52 memory, you might see stall-warning messages.
53
54o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
55 is running at a higher priority than the RCU softirq threads.
56 This will prevent RCU callbacks from ever being invoked,
57 and in a CONFIG_PREEMPT_RCU kernel will further prevent
58 RCU grace periods from ever completing. Either way, the
59 system will eventually run out of memory and hang. In the
60 CONFIG_PREEMPT_RCU case, you might see stall-warning
61 messages.
62
63o A hardware or software issue shuts off the scheduler-clock
64 interrupt on a CPU that is not in dyntick-idle mode. This
65 problem really has happened, and seems to be most likely to
66 result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
67
68o A bug in the RCU implementation.
69
70o A hardware failure. This is quite unlikely, but has occurred
71 at least once in real life. A CPU failed in a running system,
72 becoming unresponsive, but not causing an immediate crash.
73 This resulted in a series of RCU CPU stall warnings, eventually
74	leading to the realization that the CPU had failed.
75
76The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
77warnings. Note that SRCU does -not- have CPU stall warnings. Please note
78that RCU only detects CPU stalls when there is a grace period in progress.
79No grace period, no CPU stall warnings.
80
81To diagnose the cause of the stall, inspect the stack traces.
82The offending function will usually be near the top of the stack.
83If you have a series of stall warnings from a single extended stall,
84comparing the stack traces can often help determine where the stall
85is occurring, which will usually be in the function nearest the top of
86that portion of the stack which remains the same from trace to trace.
87If you can reliably trigger the stall, ftrace can be quite helpful.
88
89RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
90and with RCU's event tracing. For information on RCU's event tracing,
91see include/trace/events/rcu.h.
92
93
94Fine-Tuning the RCU CPU Stall Detector
95
96The rcupdate.rcu_cpu_stall_suppress module parameter disables RCU's
97CPU stall detector, which detects conditions that unduly delay RCU grace
98periods. This module parameter enables CPU stall detection by default,
99but may be overridden via boot-time parameter or at runtime via sysfs.
7The stall detector's idea of what constitutes "unduly delayed" is 100The stall detector's idea of what constitutes "unduly delayed" is
8controlled by a set of kernel configuration variables and cpp macros: 101controlled by a set of kernel configuration variables and cpp macros:
9 102
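
By way of illustration (not from this patch), the cond_resched_rcu_qs() suggestion in the stall-cause list above typically appears in a long-running loop such as the following kernel-style sketch; the thread function and work helper are invented for illustration, and cond_resched_rcu_qs() is the interface current at the time of this document.

#include <linux/kthread.h>
#include <linux/rcupdate.h>

static void do_a_chunk_of_work(void)
{
	/* hypothetical placeholder: real work, outside any RCU read-side
	 * critical section */
}

static int scrubber_thread(void *unused)
{
	while (!kthread_should_stop()) {
		do_a_chunk_of_work();
		cond_resched_rcu_qs();	/* report a quiescent state and
					   offer to reschedule */
	}
	return 0;
}
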
@@ -56,6 +149,9 @@ rcupdate.rcu_task_stall_timeout
56 And continues with the output of sched_show_task() for each 149 And continues with the output of sched_show_task() for each
57 task stalling the current RCU-tasks grace period. 150 task stalling the current RCU-tasks grace period.
58 151
152
153Interpreting RCU's CPU Stall-Detector "Splats"
154
59For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling, 155For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling,
60it will print a message similar to the following: 156it will print a message similar to the following:
61 157
@@ -178,89 +274,3 @@ grace period is in flight.
178 274
179It is entirely possible to see stall warnings from normal and from 275It is entirely possible to see stall warnings from normal and from
180expedited grace periods at about the same time from the same run. 276expedited grace periods at about the same time from the same run.
181
182
183What Causes RCU CPU Stall Warnings?
184
185So your kernel printed an RCU CPU stall warning. The next question is
186"What caused it?" The following problems can result in RCU CPU stall
187warnings:
188
189o A CPU looping in an RCU read-side critical section.
190
191o A CPU looping with interrupts disabled. This condition can
192 result in RCU-sched and RCU-bh stalls.
193
194o A CPU looping with preemption disabled. This condition can
195 result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
196 stalls.
197
198o A CPU looping with bottom halves disabled. This condition can
199 result in RCU-sched and RCU-bh stalls.
200
201o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
202 kernel without invoking schedule(). Note that cond_resched()
203 does not necessarily prevent RCU CPU stall warnings. Therefore,
204 if the looping in the kernel is really expected and desirable
205 behavior, you might need to replace some of the cond_resched()
206 calls with calls to cond_resched_rcu_qs().
207
208o Booting Linux using a console connection that is too slow to
209 keep up with the boot-time console-message rate. For example,
210 a 115Kbaud serial console can be -way- too slow to keep up
211 with boot-time message rates, and will frequently result in
212 RCU CPU stall warning messages. Especially if you have added
213 debug printk()s.
214
215o Anything that prevents RCU's grace-period kthreads from running.
216 This can result in the "All QSes seen" console-log message.
217 This message will include information on when the kthread last
218 ran and how often it should be expected to run.
219
220o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
221 happen to preempt a low-priority task in the middle of an RCU
222 read-side critical section. This is especially damaging if
223 that low-priority task is not permitted to run on any other CPU,
224 in which case the next RCU grace period can never complete, which
225 will eventually cause the system to run out of memory and hang.
226 While the system is in the process of running itself out of
227 memory, you might see stall-warning messages.
228
229o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
230 is running at a higher priority than the RCU softirq threads.
231 This will prevent RCU callbacks from ever being invoked,
232 and in a CONFIG_PREEMPT_RCU kernel will further prevent
233 RCU grace periods from ever completing. Either way, the
234 system will eventually run out of memory and hang. In the
235 CONFIG_PREEMPT_RCU case, you might see stall-warning
236 messages.
237
238o A hardware or software issue shuts off the scheduler-clock
239 interrupt on a CPU that is not in dyntick-idle mode. This
240 problem really has happened, and seems to be most likely to
241 result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
242
243o A bug in the RCU implementation.
244
245o A hardware failure. This is quite unlikely, but has occurred
246 at least once in real life. A CPU failed in a running system,
247 becoming unresponsive, but not causing an immediate crash.
248 This resulted in a series of RCU CPU stall warnings, eventually
249 leading the realization that the CPU had failed.
250
251The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
252warning. Note that SRCU does -not- have CPU stall warnings. Please note
253that RCU only detects CPU stalls when there is a grace period in progress.
254No grace period, no CPU stall warnings.
255
256To diagnose the cause of the stall, inspect the stack traces.
257The offending function will usually be near the top of the stack.
258If you have a series of stall warnings from a single extended stall,
259comparing the stack traces can often help determine where the stall
260is occurring, which will usually be in the function nearest the top of
261that portion of the stack which remains the same from trace to trace.
262If you can reliably trigger the stall, ftrace can be quite helpful.
263
264RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
265and with RCU's event tracing. For information on RCU's event tracing,
266see include/trace/events/rcu.h.
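As an aside, the first cause listed above (a CPU looping inside an RCU
read-side critical section) can be reproduced with something as small as the
following sketch; the kthread function and its name are purely illustrative
and are not part of this patch:

	/* Illustrative only: a guaranteed way to earn a stall warning. */
	static int stall_demo_kthread(void *unused)
	{
		rcu_read_lock();
		while (!kthread_should_stop())
			cpu_relax();	/* never reports a quiescent state */
		rcu_read_unlock();
		return 0;
	}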
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 5cbd8b2395b8..8ed6c9f6133c 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -562,7 +562,9 @@ This section presents a "toy" RCU implementation that is based on
562 562 familiar locking primitives.  Its overhead makes it a non-starter for
563 563 real-life use, as does its lack of scalability.  It is also unsuitable
564 564 for realtime use, since it allows scheduling latency to "bleed" from
565 one read-side critical section to another.
565 one read-side critical section to another.  It also assumes recursive
566 reader-writer locks:  If you try this with non-recursive locks, and
567 you allow nested rcu_read_lock() calls, you can deadlock.
566 568
567 569 However, it is probably the easiest implementation to relate to, so is
568 570 a good starting point.
@@ -587,20 +589,21 @@ It is extremely simple:
587 589 	write_unlock(&rcu_gp_mutex);
588 590 }
589 591
590 [You can ignore rcu_assign_pointer() and rcu_dereference() without
591 missing much.  But here they are anyway.  And whatever you do, don't
592 forget about them when submitting patches making use of RCU!]
592 [You can ignore rcu_assign_pointer() and rcu_dereference() without missing
593 much.  But here are simplified versions anyway.  And whatever you do,
594 don't forget about them when submitting patches making use of RCU!]
593 595
594 	#define rcu_assign_pointer(p, v) ({ \
595 			smp_wmb(); \
596 			(p) = (v); \
597 		})
596 	#define rcu_assign_pointer(p, v) \
597 	({ \
598 		smp_store_release(&(p), (v)); \
599 	})
598 600
599 	#define rcu_dereference(p) ({ \
600 			typeof(p) _________p1 = p; \
601 			smp_read_barrier_depends(); \
602 			(_________p1); \
603 		})
601 	#define rcu_dereference(p) \
602 	({ \
603 		typeof(p) _________p1 = p; \
604 		smp_read_barrier_depends(); \
605 		(_________p1); \
606 	})
604 607
605 608
606 609 The rcu_read_lock() and rcu_read_unlock() primitive read-acquire
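For readers skimming the diff, a quick sketch of how the two simplified
macros above are meant to be paired; struct foo and gp are hypothetical and
do not appear in the patch:

	struct foo { int a; };
	static struct foo *gp;

	void updater(struct foo *newp)
	{
		newp->a = 42;
		rcu_assign_pointer(gp, newp);	/* publish after initialization */
	}

	int reader(void)
	{
		struct foo *p;
		int ret = -1;

		rcu_read_lock();
		p = rcu_dereference(gp);	/* subscribe */
		if (p)
			ret = p->a;
		rcu_read_unlock();
		return ret;
	}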
@@ -925,7 +928,8 @@ d. Do you need RCU grace periods to complete even in the face
925 928
926 929 e.	Is your workload too update-intensive for normal use of
927 930 	RCU, but inappropriate for other synchronization mechanisms?
928 	If so, consider SLAB_DESTROY_BY_RCU.  But please be careful!
931 	If so, consider SLAB_TYPESAFE_BY_RCU (which was originally
932 	named SLAB_DESTROY_BY_RCU).  But please be careful!
929 933
930 934 f.	Do you need read-side critical sections that are respected
931 935 	even though they are in the middle of the idle loop, during
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index d2b0a8d81258..08329cb857ed 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -768,7 +768,7 @@ equal to zero, in which case the compiler is within its rights to
768 768 transform the above code into the following:
769 769
770 770 	q = READ_ONCE(a);
771 	WRITE_ONCE(b, 1);
771 	WRITE_ONCE(b, 2);
772 772 	do_something_else();
773 773
774 774 Given this transformation, the CPU is not required to respect the ordering
diff --git a/arch/Kconfig b/arch/Kconfig
index cd211a14a88f..adefaf344239 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -320,6 +320,9 @@ config HAVE_CMPXCHG_LOCAL
320 320 config HAVE_CMPXCHG_DOUBLE
321 321 	bool
322 322
323 config ARCH_WEAK_RELEASE_ACQUIRE
324 	bool
325
323 326 config ARCH_WANT_IPC_PARSE_VERSION
324 327 	bool
325 328
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 97a8bc8a095c..7a5c9b764cd2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -99,6 +99,7 @@ config PPC
99 99 	select ARCH_USE_BUILTIN_BSWAP
100 100 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
101 101 	select ARCH_WANT_IPC_PARSE_VERSION
102 	select ARCH_WEAK_RELEASE_ACQUIRE
102 103 	select BINFMT_ELF
103 104 	select BUILDTIME_EXTABLE_SORT
104 105 	select CLONE_BACKWARDS
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fe531f904062..e4dd92b0a71b 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4665,7 +4665,7 @@ i915_gem_load_init(struct drm_i915_private *dev_priv)
4665 4665 	dev_priv->requests = KMEM_CACHE(drm_i915_gem_request,
4666 4666 				  SLAB_HWCACHE_ALIGN |
4667 4667 				  SLAB_RECLAIM_ACCOUNT |
4668 				  SLAB_DESTROY_BY_RCU);
4668 				  SLAB_TYPESAFE_BY_RCU);
4669 4669 	if (!dev_priv->requests)
4670 4670 		goto err_vmas;
4671 4671
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index ea511f06efaf..9ee2750e1dde 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -493,7 +493,7 @@ static inline struct drm_i915_gem_request *
493 493 __i915_gem_active_get_rcu(const struct i915_gem_active *active)
494 494 {
495 495 	/* Performing a lockless retrieval of the active request is super
496 	 * tricky. SLAB_DESTROY_BY_RCU merely guarantees that the backing
496 	 * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing
497 497 	 * slab of request objects will not be freed whilst we hold the
498 498 	 * RCU read lock. It does not guarantee that the request itself
499 499 	 * will not be freed and then *reused*. Viz,
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
index 12647af5a336..e7fb47e84a93 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
@@ -1071,7 +1071,7 @@ int ldlm_init(void)
1071 1071 	ldlm_lock_slab = kmem_cache_create("ldlm_locks",
1072 1072 					   sizeof(struct ldlm_lock), 0,
1073 1073 					   SLAB_HWCACHE_ALIGN |
1074 					   SLAB_DESTROY_BY_RCU, NULL);
1074 					   SLAB_TYPESAFE_BY_RCU, NULL);
1075 1075 	if (!ldlm_lock_slab) {
1076 1076 		kmem_cache_destroy(ldlm_resource_slab);
1077 1077 		return -ENOMEM;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5adc2fb62b0f..92b255e1ba58 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2340,7 +2340,7 @@ static int jbd2_journal_init_journal_head_cache(void)
2340 2340 	jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
2341 2341 				sizeof(struct journal_head),
2342 2342 				0,		/* offset */
2343 				SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU,
2343 				SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
2344 2344 				NULL);		/* ctor */
2345 2345 	retval = 0;
2346 2346 	if (!jbd2_journal_head_cache) {
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 270221fcef42..7e3d71109f51 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -38,7 +38,7 @@ void signalfd_cleanup(struct sighand_struct *sighand)
38 38 	/*
39 39 	 * The lockless check can race with remove_wait_queue() in progress,
40 40 	 * but in this case its caller should run under rcu_read_lock() and
41 	 * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return.
41 	 * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return.
42 42 	 */
43 43 	if (likely(!waitqueue_active(wqh)))
44 44 		return;
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 6048fa404e57..a5195a7d6f77 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -229,7 +229,7 @@ static inline struct dma_fence *dma_fence_get_rcu(struct dma_fence *fence)
229 229  *
230 230  * Function returns NULL if no refcount could be obtained, or the fence.
231 231  * This function handles acquiring a reference to a fence that may be
232  * reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU),
232  * reallocated within the RCU grace period (such as with SLAB_TYPESAFE_BY_RCU),
233 233  * so long as the caller is using RCU on the pointer to the fence.
234 234  *
235 235  * An alternative mechanism is to employ a seqlock to protect a bunch of
@@ -257,7 +257,7 @@ dma_fence_get_rcu_safe(struct dma_fence * __rcu *fencep)
257 257 	 * have successfully acquire a reference to it. If it no
258 258 	 * longer matches, we are holding a reference to some other
259 259 	 * reallocated pointer. This is possible if the allocator
260 	 * is using a freelist like SLAB_DESTROY_BY_RCU where the
260 	 * is using a freelist like SLAB_TYPESAFE_BY_RCU where the
261 261 	 * fence remains valid for the RCU grace period, but it
262 262 	 * may be reallocated. When using such allocators, we are
263 263 	 * responsible for ensuring the reference we get is to
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d0250744507a..d6cfa0992220 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -375,8 +375,6 @@ struct kvm {
375 375 	struct mutex slots_lock;
376 376 	struct mm_struct *mm; /* userspace tied to this vm */
377 377 	struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM];
378 	struct srcu_struct srcu;
379 	struct srcu_struct irq_srcu;
380 378 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
381 379
382 380 	/*
@@ -429,6 +427,8 @@ struct kvm {
429 427 	struct list_head devices;
430 428 	struct dentry *debugfs_dentry;
431 429 	struct kvm_stat_data **debugfs_stat_data;
430 	struct srcu_struct srcu;
431 	struct srcu_struct irq_srcu;
432 432 };
433 433
434 434 #define kvm_err(fmt, ...) \
diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h
new file mode 100644
index 000000000000..4b766b61e1a0
--- /dev/null
+++ b/include/linux/rcu_node_tree.h
@@ -0,0 +1,99 @@
1/*
2 * RCU node combining tree definitions. These are used to compute
3 * global attributes while avoiding common-case global contention. A key
4 * property that these computations rely on is a tournament-style approach
5 * where only one of the tasks contending a lower level in the tree need
6 * advance to the next higher level. If properly configured, this allows
7 * unlimited scalability while maintaining a constant level of contention
8 * on the root node.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, you can access it online at
22 * http://www.gnu.org/licenses/gpl-2.0.html.
23 *
24 * Copyright IBM Corporation, 2017
25 *
26 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
27 */
28
29#ifndef __LINUX_RCU_NODE_TREE_H
30#define __LINUX_RCU_NODE_TREE_H
31
32/*
33 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
34 * CONFIG_RCU_FANOUT_LEAF.
35 * In theory, it should be possible to add more levels straightforwardly.
36 * In practice, this did work well going from three levels to four.
37 * Of course, your mileage may vary.
38 */
39
40#ifdef CONFIG_RCU_FANOUT
41#define RCU_FANOUT CONFIG_RCU_FANOUT
42#else /* #ifdef CONFIG_RCU_FANOUT */
43# ifdef CONFIG_64BIT
44# define RCU_FANOUT 64
45# else
46# define RCU_FANOUT 32
47# endif
48#endif /* #else #ifdef CONFIG_RCU_FANOUT */
49
50#ifdef CONFIG_RCU_FANOUT_LEAF
51#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
52#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
53#define RCU_FANOUT_LEAF 16
54#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
55
56#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
57#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
58#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
59#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
60
61#if NR_CPUS <= RCU_FANOUT_1
62# define RCU_NUM_LVLS 1
63# define NUM_RCU_LVL_0 1
64# define NUM_RCU_NODES NUM_RCU_LVL_0
65# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
66# define RCU_NODE_NAME_INIT { "rcu_node_0" }
67# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
68#elif NR_CPUS <= RCU_FANOUT_2
69# define RCU_NUM_LVLS 2
70# define NUM_RCU_LVL_0 1
71# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
72# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
73# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
74# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
75# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
76#elif NR_CPUS <= RCU_FANOUT_3
77# define RCU_NUM_LVLS 3
78# define NUM_RCU_LVL_0 1
79# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
80# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
81# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
82# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
83# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
84# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
85#elif NR_CPUS <= RCU_FANOUT_4
86# define RCU_NUM_LVLS 4
87# define NUM_RCU_LVL_0 1
88# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
89# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
90# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
91# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
92# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
93# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
94# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
95#else
96# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
97#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
98
99#endif /* __LINUX_RCU_NODE_TREE_H */
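To make the branch selection above concrete, here is a hypothetical worked
example (the CPU count is invented; nothing below is part of the patch):

	/*
	 * 64-bit build, CONFIG_RCU_FANOUT and CONFIG_RCU_FANOUT_LEAF unset,
	 * NR_CPUS = 1024:
	 *
	 *   RCU_FANOUT      = 64
	 *   RCU_FANOUT_LEAF = 16
	 *   RCU_FANOUT_1    = 16      (CPUs covered by one leaf node)
	 *   RCU_FANOUT_2    = 1024    (CPUs covered by a two-level tree)
	 *
	 * NR_CPUS <= RCU_FANOUT_2, so the two-level branch is selected:
	 *
	 *   RCU_NUM_LVLS  = 2
	 *   NUM_RCU_LVL_0 = 1                            (root)
	 *   NUM_RCU_LVL_1 = DIV_ROUND_UP(1024, 16) = 64  (leaves)
	 *   NUM_RCU_NODES = 65
	 */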
diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
new file mode 100644
index 000000000000..ced8f313fd05
--- /dev/null
+++ b/include/linux/rcu_segcblist.h
@@ -0,0 +1,712 @@
1/*
2 * RCU segmented callback lists
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright IBM Corporation, 2017
19 *
20 * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#ifndef __KERNEL_RCU_SEGCBLIST_H
24#define __KERNEL_RCU_SEGCBLIST_H
25
26/* Simple unsegmented callback lists. */
27struct rcu_cblist {
28 struct rcu_head *head;
29 struct rcu_head **tail;
30 long len;
31 long len_lazy;
32};
33
34#define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head }
35
36/* Initialize simple callback list. */
37static inline void rcu_cblist_init(struct rcu_cblist *rclp)
38{
39 rclp->head = NULL;
40 rclp->tail = &rclp->head;
41 rclp->len = 0;
42 rclp->len_lazy = 0;
43}
44
45/* Is simple callback list empty? */
46static inline bool rcu_cblist_empty(struct rcu_cblist *rclp)
47{
48 return !rclp->head;
49}
50
51/* Return number of callbacks in simple callback list. */
52static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
53{
54 return rclp->len;
55}
56
57/* Return number of lazy callbacks in simple callback list. */
58static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp)
59{
60 return rclp->len_lazy;
61}
62
63/*
64 * Debug function to actually count the number of callbacks.
65 * If the number exceeds the limit specified, return -1.
66 */
67static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
68{
69 int cnt = 0;
70 struct rcu_head **rhpp = &rclp->head;
71
72 for (;;) {
73 if (!*rhpp)
74 return cnt;
75 if (++cnt > lim)
76 return -1;
77 rhpp = &(*rhpp)->next;
78 }
79}
80
81/*
82 * Dequeue the oldest rcu_head structure from the specified callback
83 * list. This function assumes that the callback is non-lazy, but
84 * the caller can later invoke rcu_cblist_dequeued_lazy() if it
85 * finds otherwise (and if it cares about laziness). This allows
86 * different users to have different ways of determining laziness.
87 */
88static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
89{
90 struct rcu_head *rhp;
91
92 rhp = rclp->head;
93 if (!rhp)
94 return NULL;
95 rclp->len--;
96 rclp->head = rhp->next;
97 if (!rclp->head)
98 rclp->tail = &rclp->head;
99 return rhp;
100}
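A minimal sketch of the intended calling pattern for the dequeue above; the
drain loop itself is hypothetical and not part of the patch:

	static void drain_cblist(struct rcu_cblist *rclp)
	{
		struct rcu_head *rhp;

		/* Pull callbacks off in order and invoke each one. */
		while ((rhp = rcu_cblist_dequeue(rclp)) != NULL)
			rhp->func(rhp);

		WARN_ON_ONCE(!rcu_cblist_empty(rclp));
		WARN_ON_ONCE(rcu_cblist_n_cbs(rclp) != 0);
	}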
101
102/*
103 * Account for the fact that a previously dequeued callback turned out
104 * to be marked as lazy.
105 */
106static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
107{
108 rclp->len_lazy--;
109}
110
111/*
112 * Interim function to return rcu_cblist head pointer. Longer term, the
113 * rcu_cblist will be used more pervasively, removing the need for this
114 * function.
115 */
116static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
117{
118 return rclp->head;
119}
120
121/*
122 * Interim function to return rcu_cblist head pointer. Longer term, the
123 * rcu_cblist will be used more pervasively, removing the need for this
124 * function.
125 */
126static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
127{
128 WARN_ON_ONCE(rcu_cblist_empty(rclp));
129 return rclp->tail;
130}
131
132/* Complicated segmented callback lists. ;-) */
133
134/*
135 * Index values for segments in rcu_segcblist structure.
136 *
137 * The segments are as follows:
138 *
139 * [head, *tails[RCU_DONE_TAIL]):
140 * Callbacks whose grace period has elapsed, and thus can be invoked.
141 * [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]):
142 * Callbacks waiting for the current GP from the current CPU's viewpoint.
143 * [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]):
144 * Callbacks that arrived before the next GP started, again from
145 * the current CPU's viewpoint. These can be handled by the next GP.
146 * [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]):
147 * Callbacks that might have arrived after the next GP started.
148 * There is some uncertainty as to when a given GP starts and
149 * ends, but a CPU knows the exact times if it is the one starting
150 * or ending the GP. Other CPUs know that the previous GP ends
151 * before the next one starts.
152 *
153 * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also
154 * empty.
155 *
156 * The ->gp_seq[] array contains the grace-period number at which the
157 * corresponding segment of callbacks will be ready to invoke. A given
158 * element of this array is meaningful only when the corresponding segment
159 * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks
160 * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have
161 * not yet been assigned a grace-period number).
162 */
163#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
164#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
165#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
166#define RCU_NEXT_TAIL 3
167#define RCU_CBLIST_NSEGS 4
168
169struct rcu_segcblist {
170 struct rcu_head *head;
171 struct rcu_head **tails[RCU_CBLIST_NSEGS];
172 unsigned long gp_seq[RCU_CBLIST_NSEGS];
173 long len;
174 long len_lazy;
175};
176
177#define RCU_SEGCBLIST_INITIALIZER(n) \
178{ \
179 .head = NULL, \
180 .tails[RCU_DONE_TAIL] = &n.head, \
181 .tails[RCU_WAIT_TAIL] = &n.head, \
182 .tails[RCU_NEXT_READY_TAIL] = &n.head, \
183 .tails[RCU_NEXT_TAIL] = &n.head, \
184}
185
186/*
187 * Initialize an rcu_segcblist structure.
188 */
189static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp)
190{
191 int i;
192
193 BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
194 BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
195 rsclp->head = NULL;
196 for (i = 0; i < RCU_CBLIST_NSEGS; i++)
197 rsclp->tails[i] = &rsclp->head;
198 rsclp->len = 0;
199 rsclp->len_lazy = 0;
200}
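For orientation, the state that rcu_segcblist_init() establishes can be
pictured as follows (an illustrative note, not part of the patch):

	/*
	 * After rcu_segcblist_init(&rs):
	 *
	 *   rs.head == NULL, rs.len == rs.len_lazy == 0
	 *   rs.tails[RCU_DONE_TAIL] == rs.tails[RCU_WAIT_TAIL] ==
	 *   rs.tails[RCU_NEXT_READY_TAIL] == rs.tails[RCU_NEXT_TAIL] == &rs.head
	 *
	 * Every segment is therefore empty, yet rcu_segcblist_is_enabled()
	 * is true because ->tails[RCU_NEXT_TAIL] is non-NULL.
	 */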
201
202/*
203 * Is the specified rcu_segcblist structure empty?
204 *
205 * But careful! The fact that the ->head field is NULL does not
206 * necessarily imply that there are no callbacks associated with
207 * this structure. When callbacks are being invoked, they are
208 * removed as a group. If callback invocation must be preempted,
209 * the remaining callbacks will be added back to the list. Either
210 * way, the counts are updated later.
211 *
212 * So it is often the case that rcu_segcblist_n_cbs() should be used
213 * instead.
214 */
215static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)
216{
217 return !rsclp->head;
218}
219
220/* Return number of callbacks in segmented callback list. */
221static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
222{
223 return READ_ONCE(rsclp->len);
224}
225
226/* Return number of lazy callbacks in segmented callback list. */
227static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
228{
229 return rsclp->len_lazy;
230}
231
232/* Return number of non-lazy callbacks in segmented callback list. */
233static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
234{
235 return rsclp->len - rsclp->len_lazy;
236}
237
238/*
239 * Is the specified rcu_segcblist enabled, for example, not corresponding
240 * to an offline or callback-offloaded CPU?
241 */
242static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
243{
244 return !!rsclp->tails[RCU_NEXT_TAIL];
245}
246
247/*
248 * Disable the specified rcu_segcblist structure, so that callbacks can
249 * no longer be posted to it. This structure must be empty.
250 */
251static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
252{
253 WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
254 WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
255 WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
256 rsclp->tails[RCU_NEXT_TAIL] = NULL;
257}
258
259/*
260 * Is the specified segment of the specified rcu_segcblist structure
261 * empty of callbacks?
262 */
263static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
264{
265 if (seg == RCU_DONE_TAIL)
266 return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
267 return rsclp->tails[seg - 1] == rsclp->tails[seg];
268}
269
270/*
271 * Are all segments following the specified segment of the specified
272 * rcu_segcblist structure empty of callbacks? (The specified
273 * segment might well contain callbacks.)
274 */
275static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
276{
277 return !*rsclp->tails[seg];
278}
279
280/*
281 * Does the specified rcu_segcblist structure contain callbacks that
282 * are ready to be invoked?
283 */
284static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp)
285{
286 return rcu_segcblist_is_enabled(rsclp) &&
287 &rsclp->head != rsclp->tails[RCU_DONE_TAIL];
288}
289
290/*
291 * Does the specified rcu_segcblist structure contain callbacks that
292 * are still pending, that is, not yet ready to be invoked?
293 */
294static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
295{
296 return rcu_segcblist_is_enabled(rsclp) &&
297 !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL);
298}
299
300/*
301 * Dequeue and return the first ready-to-invoke callback. If there
302 * are no ready-to-invoke callbacks, return NULL. Disables interrupts
303 * to avoid interference. Does not protect from interference from other
304 * CPUs or tasks.
305 */
306static inline struct rcu_head *
307rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
308{
309 unsigned long flags;
310 int i;
311 struct rcu_head *rhp;
312
313 local_irq_save(flags);
314 if (!rcu_segcblist_ready_cbs(rsclp)) {
315 local_irq_restore(flags);
316 return NULL;
317 }
318 rhp = rsclp->head;
319 BUG_ON(!rhp);
320 rsclp->head = rhp->next;
321 for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
322 if (rsclp->tails[i] != &rhp->next)
323 break;
324 rsclp->tails[i] = &rsclp->head;
325 }
326 smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
327 WRITE_ONCE(rsclp->len, rsclp->len - 1);
328 local_irq_restore(flags);
329 return rhp;
330}
331
332/*
333 * Account for the fact that a previously dequeued callback turned out
334 * to be marked as lazy.
335 */
336static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
337{
338 unsigned long flags;
339
340 local_irq_save(flags);
341 rsclp->len_lazy--;
342 local_irq_restore(flags);
343}
344
345/*
346 * Return a pointer to the first callback in the specified rcu_segcblist
347 * structure. This is useful for diagnostics.
348 */
349static inline struct rcu_head *
350rcu_segcblist_first_cb(struct rcu_segcblist *rsclp)
351{
352 if (rcu_segcblist_is_enabled(rsclp))
353 return rsclp->head;
354 return NULL;
355}
356
357/*
358 * Return a pointer to the first pending callback in the specified
359 * rcu_segcblist structure. This is useful just after posting a given
360 * callback -- if that callback is the first pending callback, then
361 * you cannot rely on someone else having already started up the required
362 * grace period.
363 */
364static inline struct rcu_head *
365rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
366{
367 if (rcu_segcblist_is_enabled(rsclp))
368 return *rsclp->tails[RCU_DONE_TAIL];
369 return NULL;
370}
371
372/*
373 * Does the specified rcu_segcblist structure contain callbacks that
374 * have not yet been processed beyond having been posted, that is,
375 * does it contain callbacks in its last segment?
376 */
377static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
378{
379 return rcu_segcblist_is_enabled(rsclp) &&
380 !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
381}
382
383/*
384 * Enqueue the specified callback onto the specified rcu_segcblist
385 * structure, updating accounting as needed. Note that the ->len
386 * field may be accessed locklessly, hence the WRITE_ONCE().
387 * The ->len field is used by rcu_barrier() and friends to determine
388 * if it must post a callback on this structure, and it is OK
389 * for rcu_barrier() to sometimes post callbacks needlessly, but
390 * absolutely not OK for it to ever miss posting a callback.
391 */
392static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
393 struct rcu_head *rhp, bool lazy)
394{
395 WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */
396 if (lazy)
397 rsclp->len_lazy++;
398 smp_mb(); /* Ensure counts are updated before callback is enqueued. */
399 rhp->next = NULL;
400 *rsclp->tails[RCU_NEXT_TAIL] = rhp;
401 rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
402}
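A sketch of how a caller might combine the enqueue above with the query
helpers; my_start_gp() is a placeholder for whatever grace-period-starting
machinery the caller uses:

	static void post_callback(struct rcu_segcblist *rsclp,
				  struct rcu_head *rhp, rcu_callback_t func)
	{
		rhp->func = func;
		rcu_segcblist_enqueue(rsclp, rhp, false);

		/*
		 * If this is now the first pending callback, no one else can
		 * be relied on to have started the grace period it needs.
		 */
		if (rcu_segcblist_first_pend_cb(rsclp) == rhp)
			my_start_gp();
	}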
403
404/*
405 * Entrain the specified callback onto the specified rcu_segcblist at
406 * the end of the last non-empty segment. If the entire rcu_segcblist
407 * is empty, make no change, but return false.
408 *
409 * This is intended for use by rcu_barrier()-like primitives, -not-
410 * for normal grace-period use. IMPORTANT: The callback you enqueue
411 * will wait for all prior callbacks, NOT necessarily for a grace
412 * period. You have been warned.
413 */
414static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
415 struct rcu_head *rhp, bool lazy)
416{
417 int i;
418
419 if (rcu_segcblist_n_cbs(rsclp) == 0)
420 return false;
421 WRITE_ONCE(rsclp->len, rsclp->len + 1);
422 if (lazy)
423 rsclp->len_lazy++;
424 smp_mb(); /* Ensure counts are updated before callback is entrained. */
425 rhp->next = NULL;
426 for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
427 if (rsclp->tails[i] != rsclp->tails[i - 1])
428 break;
429 *rsclp->tails[i] = rhp;
430 for (; i <= RCU_NEXT_TAIL; i++)
431 rsclp->tails[i] = &rhp->next;
432 return true;
433}
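The rcu_barrier()-style usage that the comment above warns about might look
roughly like this one-shot sketch (the completion and helper names are
invented):

	static DECLARE_COMPLETION(prior_cbs_done);

	static void prior_cbs_cb(struct rcu_head *rhp)
	{
		complete(&prior_cbs_done);	/* all earlier callbacks have run */
	}

	static void wait_for_prior_cbs(struct rcu_segcblist *rsclp)
	{
		static struct rcu_head rh;

		rh.func = prior_cbs_cb;
		if (rcu_segcblist_entrain(rsclp, &rh, false))
			wait_for_completion(&prior_cbs_done);
		/* A false return means the list was empty: nothing to wait for. */
	}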
434
435/*
436 * Extract only the counts from the specified rcu_segcblist structure,
437 * and place them in the specified rcu_cblist structure. This function
438 * supports both callback orphaning and invocation, hence the separation
439 * of counts and callbacks. (Callbacks ready for invocation must be
440 * orphaned and adopted separately from pending callbacks, but counts
441 * apply to all callbacks. Locking must be used to make sure that
442 * both orphaned-callbacks lists are consistent.)
443 */
444static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
445 struct rcu_cblist *rclp)
446{
447 rclp->len_lazy += rsclp->len_lazy;
448 rclp->len += rsclp->len;
449 rsclp->len_lazy = 0;
450 WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */
451}
452
453/*
454 * Extract only those callbacks ready to be invoked from the specified
455 * rcu_segcblist structure and place them in the specified rcu_cblist
456 * structure.
457 */
458static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
459 struct rcu_cblist *rclp)
460{
461 int i;
462
463 if (!rcu_segcblist_ready_cbs(rsclp))
464 return; /* Nothing to do. */
465 *rclp->tail = rsclp->head;
466 rsclp->head = *rsclp->tails[RCU_DONE_TAIL];
467 *rsclp->tails[RCU_DONE_TAIL] = NULL;
468 rclp->tail = rsclp->tails[RCU_DONE_TAIL];
469 for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
470 if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
471 rsclp->tails[i] = &rsclp->head;
472}
473
474/*
475 * Extract only those callbacks still pending (not yet ready to be
476 * invoked) from the specified rcu_segcblist structure and place them in
477 * the specified rcu_cblist structure. Note that this loses information
478 * about any callbacks that might have been partway done waiting for
479 * their grace period. Too bad! They will have to start over.
480 */
481static inline void
482rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
483 struct rcu_cblist *rclp)
484{
485 int i;
486
487 if (!rcu_segcblist_pend_cbs(rsclp))
488 return; /* Nothing to do. */
489 *rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
490 rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
491 *rsclp->tails[RCU_DONE_TAIL] = NULL;
492 for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
493 rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL];
494}
495
496/*
497 * Move the entire contents of the specified rcu_segcblist structure,
498 * counts, callbacks, and all, to the specified rcu_cblist structure.
499 * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists?
500 * @@@ Memory barrier needed? (Not if only used at boot time...)
501 */
502static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp,
503 struct rcu_cblist *rclp)
504{
505 rcu_segcblist_extract_done_cbs(rsclp, rclp);
506 rcu_segcblist_extract_pend_cbs(rsclp, rclp);
507 rcu_segcblist_extract_count(rsclp, rclp);
508}
509
510/*
511 * Insert counts from the specified rcu_cblist structure in the
512 * specified rcu_segcblist structure.
513 */
514static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
515 struct rcu_cblist *rclp)
516{
517 rsclp->len_lazy += rclp->len_lazy;
518 /* ->len sampled locklessly. */
519 WRITE_ONCE(rsclp->len, rsclp->len + rclp->len);
520 rclp->len_lazy = 0;
521 rclp->len = 0;
522}
523
524/*
525 * Move callbacks from the specified rcu_cblist to the beginning of the
526 * done-callbacks segment of the specified rcu_segcblist.
527 */
528static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
529 struct rcu_cblist *rclp)
530{
531 int i;
532
533 if (!rclp->head)
534 return; /* No callbacks to move. */
535 *rclp->tail = rsclp->head;
536 rsclp->head = rclp->head;
537 for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
538 if (&rsclp->head == rsclp->tails[i])
539 rsclp->tails[i] = rclp->tail;
540 else
541 break;
542 rclp->head = NULL;
543 rclp->tail = &rclp->head;
544}
545
546/*
547 * Move callbacks from the specified rcu_cblist to the end of the
548 * new-callbacks segment of the specified rcu_segcblist.
549 */
550static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
551 struct rcu_cblist *rclp)
552{
553 if (!rclp->head)
554 return; /* Nothing to do. */
555 *rsclp->tails[RCU_NEXT_TAIL] = rclp->head;
556 rsclp->tails[RCU_NEXT_TAIL] = rclp->tail;
557 rclp->head = NULL;
558 rclp->tail = &rclp->head;
559}
560
561/*
562 * Advance the callbacks in the specified rcu_segcblist structure based
563 * on the current value passed in for the grace-period counter.
564 */
565static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp,
566 unsigned long seq)
567{
568 int i, j;
569
570 WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
571 if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
572 return;
573
574 /*
575 * Find all callbacks whose ->gp_seq numbers indicate that they
576 * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
577 */
578 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
579 if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
580 break;
581 rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i];
582 }
583
584 /* If no callbacks moved, nothing more need be done. */
585 if (i == RCU_WAIT_TAIL)
586 return;
587
588 /* Clean up tail pointers that might have been misordered above. */
589 for (j = RCU_WAIT_TAIL; j < i; j++)
590 rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL];
591
592 /*
593 * Callbacks moved, so clean up the misordered ->tails[] pointers
594 * that now point into the middle of the list of ready-to-invoke
595 * callbacks. The overall effect is to copy down the later pointers
596 * into the gap that was created by the now-ready segments.
597 */
598 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
599 if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
600 break; /* No more callbacks. */
601 rsclp->tails[j] = rsclp->tails[i];
602 rsclp->gp_seq[j] = rsclp->gp_seq[i];
603 }
604}
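A hypothetical before-and-after picture may help; the grace-period numbers
below are invented purely for illustration:

	/*
	 * rcu_segcblist_advance(rsclp, 608) with:
	 *
	 *   before:  DONE | WAIT (gp_seq 604) | NEXT_READY (gp_seq 608) | NEXT
	 *
	 * Neither 604 nor 608 is after 608, so both middle segments are
	 * merged into RCU_DONE_TAIL:
	 *
	 *   after:   DONE (old DONE + WAIT + NEXT_READY callbacks) | NEXT
	 *
	 * and the now-vacant ->tails[]/->gp_seq[] slots are copied down
	 * toward RCU_NEXT_TAIL by the clean-up loops above.
	 */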
605
606/*
607 * "Accelerate" callbacks based on more-accurate grace-period information.
608 * The reason for this is that RCU does not synchronize the beginnings and
609 * ends of grace periods, and that callbacks are posted locally. This in
610 * turn means that the callbacks must be labelled conservatively early
611 * on, as getting exact information would degrade both performance and
612 * scalability. When more accurate grace-period information becomes
613 * available, previously posted callbacks can be "accelerated", marking
614 * them to complete at the end of the earlier grace period.
615 *
616 * This function operates on an rcu_segcblist structure, and also the
617 * grace-period sequence number seq at which new callbacks would become
618 * ready to invoke. Returns true if there are callbacks that won't be
619 * ready to invoke until seq, false otherwise.
620 */
621static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp,
622 unsigned long seq)
623{
624 int i;
625
626 WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
627 if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
628 return false;
629
630 /*
631 * Find the segment preceding the oldest segment of callbacks
632 * whose ->gp_seq[] completion is at or after that passed in via
633 * "seq", skipping any empty segments. This oldest segment, along
634 * with any later segments, can be merged in with any newly arrived
635 * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq"
636 * as their ->gp_seq[] grace-period completion sequence number.
637 */
638 for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
639 if (rsclp->tails[i] != rsclp->tails[i - 1] &&
640 ULONG_CMP_LT(rsclp->gp_seq[i], seq))
641 break;
642
643 /*
644 * If all the segments contain callbacks that correspond to
645 * earlier grace-period sequence numbers than "seq", leave.
646 * Assuming that the rcu_segcblist structure has enough
647 * segments in its arrays, this can only happen if some of
648 * the non-done segments contain callbacks that really are
649 * ready to invoke. This situation will get straightened
650 * out by the next call to rcu_segcblist_advance().
651 *
652 * Also advance to the oldest segment of callbacks whose
653 * ->gp_seq[] completion is at or after that passed in via "seq",
654 * skipping any empty segments.
655 */
656 if (++i >= RCU_NEXT_TAIL)
657 return false;
658
659 /*
660 * Merge all later callbacks, including newly arrived callbacks,
661 * into the segment located by the for-loop above. Assign "seq"
662 * as the ->gp_seq[] value in order to correctly handle the case
663 * where there were no pending callbacks in the rcu_segcblist
664 * structure other than in the RCU_NEXT_TAIL segment.
665 */
666 for (; i < RCU_NEXT_TAIL; i++) {
667 rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL];
668 rsclp->gp_seq[i] = seq;
669 }
670 return true;
671}
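The usual pairing of the two functions above might look like the following
sketch; cur_gp_seq and next_gp_seq stand in for whatever grace-period numbers
the caller tracks:

	static bool note_gp_progress(struct rcu_segcblist *rsclp,
				     unsigned long cur_gp_seq,
				     unsigned long next_gp_seq)
	{
		/* Move anything whose grace period has ended into DONE... */
		rcu_segcblist_advance(rsclp, cur_gp_seq);
		/* ...then tag the rest with the earliest GP that can serve them. */
		return rcu_segcblist_accelerate(rsclp, next_gp_seq);
	}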
672
673/*
674 * Scan the specified rcu_segcblist structure for callbacks that need
675 * a grace period later than the one specified by "seq". We don't look
676 * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't
677 * have a grace-period sequence number.
678 */
679static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
680 unsigned long seq)
681{
682 int i;
683
684 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
685 if (rsclp->tails[i - 1] != rsclp->tails[i] &&
686 ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
687 return true;
688 return false;
689}
690
691/*
692 * Interim function to return rcu_segcblist head pointer. Longer term, the
693 * rcu_segcblist will be used more pervasively, removing the need for this
694 * function.
695 */
696static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp)
697{
698 return rsclp->head;
699}
700
701/*
702 * Interim function to return rcu_segcblist head pointer. Longer term, the
703 * rcu_segcblist will be used more pervasively, removing the need for this
704 * function.
705 */
706static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
707{
708 WARN_ON_ONCE(rcu_segcblist_empty(rsclp));
709 return rsclp->tails[RCU_NEXT_TAIL];
710}
711
712#endif /* __KERNEL_RCU_SEGCBLIST_H */
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 4f7a9561b8c4..b1fd8bf85fdc 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -509,7 +509,8 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n,
509 509 {
510 510 	struct hlist_node *i, *last = NULL;
511 511
512 	for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i))
512 	/* Note: write side code, so rcu accessors are not needed. */
513 	for (i = h->first; i; i = i->next)
513 514 		last = i;
514 515
515 516 	if (last) {
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index de88b33c0974..f531b29207da 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -363,15 +363,20 @@ static inline void rcu_init_nohz(void)
363 363 #ifdef CONFIG_TASKS_RCU
364 364 #define TASKS_RCU(x) x
365 365 extern struct srcu_struct tasks_rcu_exit_srcu;
366 #define rcu_note_voluntary_context_switch(t) \
366 #define rcu_note_voluntary_context_switch_lite(t) \
367 367 	do { \
368 		rcu_all_qs(); \
369 368 		if (READ_ONCE((t)->rcu_tasks_holdout)) \
370 369 			WRITE_ONCE((t)->rcu_tasks_holdout, false); \
371 370 	} while (0)
371 #define rcu_note_voluntary_context_switch(t) \
372 	do { \
373 		rcu_all_qs(); \
374 		rcu_note_voluntary_context_switch_lite(t); \
375 	} while (0)
372 376 #else /* #ifdef CONFIG_TASKS_RCU */
373 377 #define TASKS_RCU(x) do { } while (0)
374 #define rcu_note_voluntary_context_switch(t) rcu_all_qs()
378 #define rcu_note_voluntary_context_switch_lite(t) do { } while (0)
379 #define rcu_note_voluntary_context_switch(t) rcu_all_qs()
375 380 #endif /* #else #ifdef CONFIG_TASKS_RCU */
376 381
377 382 /**
@@ -1127,11 +1132,11 @@ do { \
1127 1132  * if the UNLOCK and LOCK are executed by the same CPU or if the
1128 1133  * UNLOCK and LOCK operate on the same lock variable.
1129 1134  */
1130 #ifdef CONFIG_PPC
1135 #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE
1131 1136 #define smp_mb__after_unlock_lock()	smp_mb()  /* Full ordering for lock. */
1132 #else /* #ifdef CONFIG_PPC */
1137 #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
1133 1138 #define smp_mb__after_unlock_lock()	do { } while (0)
1134 #endif /* #else #ifdef CONFIG_PPC */
1139 #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
1135 1140
1136 1141
1137 1142 #endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index b452953e21c8..74d9c3a1feee 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -33,6 +33,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
33 33 	return 0;
34 34 }
35 35
36 static inline bool rcu_eqs_special_set(int cpu)
37 {
38 	return false;  /* Never flag non-existent other CPUs! */
39 }
40
36 41 static inline unsigned long get_state_synchronize_rcu(void)
37 42 {
38 43 	return 0;
@@ -87,10 +92,11 @@ static inline void kfree_call_rcu(struct rcu_head *head,
87 92 	call_rcu(head, func);
88 93 }
89 94
90 static inline void rcu_note_context_switch(void)
91 {
92 	rcu_sched_qs();
93 }
95 #define rcu_note_context_switch(preempt) \
96 	do { \
97 		rcu_sched_qs(); \
98 		rcu_note_voluntary_context_switch_lite(current); \
99 	} while (0)
94 100
95 101 /*
96 102  * Take advantage of the fact that there is only one CPU, which
@@ -212,14 +218,14 @@ static inline void exit_rcu(void)
212 218 {
213 219 }
214 220
215 #ifdef CONFIG_DEBUG_LOCK_ALLOC
221 #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
216 222 extern int rcu_scheduler_active __read_mostly;
217 223 void rcu_scheduler_starting(void);
218 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
224 #else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
219 225 static inline void rcu_scheduler_starting(void)
220 226 {
221 227 }
222 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
228 #endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
223 229
224 230 #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
225 231
@@ -237,6 +243,10 @@ static inline bool rcu_is_watching(void)
237 243
238 244 #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
239 245
246 static inline void rcu_request_urgent_qs_task(struct task_struct *t)
247 {
248 }
249
240 250 static inline void rcu_all_qs(void)
241 251 {
242 252 	barrier(); /* Avoid RCU read-side critical sections leaking across. */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 63a4e4cf40a5..0bacb6b2af69 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,7 +30,7 @@
30 30 #ifndef __LINUX_RCUTREE_H
31 31 #define __LINUX_RCUTREE_H
32 32
33 void rcu_note_context_switch(void);
33 void rcu_note_context_switch(bool preempt);
34 34 int rcu_needs_cpu(u64 basem, u64 *nextevt);
35 35 void rcu_cpu_stall_reset(void);
36 36
@@ -41,7 +41,7 @@ void rcu_cpu_stall_reset(void);
41 41  */
42 42 static inline void rcu_virt_note_context_switch(int cpu)
43 43 {
44 	rcu_note_context_switch();
44 	rcu_note_context_switch(false);
45 45 }
46 46
47 47 void synchronize_rcu_bh(void);
@@ -108,6 +108,7 @@ void rcu_scheduler_starting(void);
108 108 extern int rcu_scheduler_active __read_mostly;
109 109
110 110 bool rcu_is_watching(void);
111 void rcu_request_urgent_qs_task(struct task_struct *t);
111 112
112 113 void rcu_all_qs(void);
113 114
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 3c37a8c51921..04a7f7993e67 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -28,7 +28,7 @@
28 28 #define SLAB_STORE_USER		0x00010000UL	/* DEBUG: Store the last owner for bug hunting */
29 29 #define SLAB_PANIC		0x00040000UL	/* Panic if kmem_cache_create() fails */
30 30 /*
31  * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS!
31  * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
32 32  *
33 33  * This delays freeing the SLAB page by a grace period, it does _NOT_
34 34  * delay object freeing. This means that if you do kmem_cache_free()
@@ -61,8 +61,10 @@
61 61  *
62 62  * rcu_read_lock before reading the address, then rcu_read_unlock after
63 63  * taking the spinlock within the structure expected at that address.
64  *
65  * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
64 66  */
65 #define SLAB_DESTROY_BY_RCU	0x00080000UL	/* Defer freeing slabs to RCU */
67 #define SLAB_TYPESAFE_BY_RCU	0x00080000UL	/* Defer freeing slabs to RCU */
66 68 #define SLAB_MEM_SPREAD		0x00100000UL	/* Spread some memory over cpuset */
67 69 #define SLAB_TRACE		0x00200000UL	/* Trace allocations and frees */
68 70
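The lookup discipline this comment block describes might be sketched as
follows; struct thing, its fields, and find_thing_rcu() are hypothetical
stand-ins, and only the flag itself comes from this patch:

	struct thing {
		spinlock_t	lock;
		int		key;
	};

	/* thing_cache would be created with SLAB_TYPESAFE_BY_RCU set. */

	static struct thing *lookup_and_lock_thing(int key)
	{
		struct thing *t;

		rcu_read_lock();
		t = find_thing_rcu(key);	/* hypothetical RCU-protected lookup */
		if (t) {
			spin_lock(&t->lock);
			if (t->key != key) {
				/* Freed and type-safely reused: caller must retry. */
				spin_unlock(&t->lock);
				t = NULL;
			}
		}
		rcu_read_unlock();
		return t;	/* non-NULL means t->lock is held */
	}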
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index a598cf3ac70c..167ad8831aaf 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -22,7 +22,7 @@
22 * Lai Jiangshan <laijs@cn.fujitsu.com> 22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 * 23 *
24 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
26 * 26 *
27 */ 27 */
28 28
@@ -32,35 +32,9 @@
32 32 #include <linux/mutex.h>
33 33 #include <linux/rcupdate.h>
34 34 #include <linux/workqueue.h>
35 #include <linux/rcu_segcblist.h>
35 36
36 struct srcu_array {
37 struct srcu_struct;
37 unsigned long lock_count[2];
38 unsigned long unlock_count[2];
39};
40
41struct rcu_batch {
42 struct rcu_head *head, **tail;
43};
44
45#define RCU_BATCH_INIT(name) { NULL, &(name.head) }
46
47struct srcu_struct {
48 unsigned long completed;
49 struct srcu_array __percpu *per_cpu_ref;
50 spinlock_t queue_lock; /* protect ->batch_queue, ->running */
51 bool running;
52 /* callbacks just queued */
53 struct rcu_batch batch_queue;
54 /* callbacks try to do the first check_zero */
55 struct rcu_batch batch_check0;
56 /* callbacks done with the first check_zero and the flip */
57 struct rcu_batch batch_check1;
58 struct rcu_batch batch_done;
59 struct delayed_work work;
60#ifdef CONFIG_DEBUG_LOCK_ALLOC
61 struct lockdep_map dep_map;
62#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
63};
64 38
65#ifdef CONFIG_DEBUG_LOCK_ALLOC 39#ifdef CONFIG_DEBUG_LOCK_ALLOC
66 40
@@ -82,46 +56,15 @@ int init_srcu_struct(struct srcu_struct *sp);
82 56 #define __SRCU_DEP_MAP_INIT(srcu_name)
83 57 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
84 58
59 #ifdef CONFIG_TINY_SRCU
60 #include <linux/srcutiny.h>
61 #elif defined(CONFIG_TREE_SRCU)
62 #include <linux/srcutree.h>
63 #elif defined(CONFIG_CLASSIC_SRCU)
64 #include <linux/srcuclassic.h>
65 #else
66 #error "Unknown SRCU implementation specified to kernel configuration"
67 #endif
85 void process_srcu(struct work_struct *work);
86
87 #define __SRCU_STRUCT_INIT(name) \
88 	{ \
89 		.completed = -300, \
90 		.per_cpu_ref = &name##_srcu_array, \
91 		.queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
92 		.running = false, \
93 		.batch_queue = RCU_BATCH_INIT(name.batch_queue), \
94 .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \
95 .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \
96 .batch_done = RCU_BATCH_INIT(name.batch_done), \
97 .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\
98 __SRCU_DEP_MAP_INIT(name) \
99 }
100
101/*
102 * Define and initialize a srcu struct at build time.
103 * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
104 *
105 * Note that although DEFINE_STATIC_SRCU() hides the name from other
106 * files, the per-CPU variable rules nevertheless require that the
107 * chosen name be globally unique. These rules also prohibit use of
108 * DEFINE_STATIC_SRCU() within a function. If these rules are too
109 * restrictive, declare the srcu_struct manually. For example, in
110 * each file:
111 *
112 * static struct srcu_struct my_srcu;
113 *
114 * Then, before the first use of each my_srcu, manually initialize it:
115 *
116 * init_srcu_struct(&my_srcu);
117 *
118 * See include/linux/percpu-defs.h for the rules on per-CPU variables.
119 */
120#define __DEFINE_SRCU(name, is_static) \
121 static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\
122 is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
123#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
124#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
125 68
126/** 69/**
127 * call_srcu() - Queue a callback for invocation after an SRCU grace period 70 * call_srcu() - Queue a callback for invocation after an SRCU grace period
@@ -147,9 +90,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp);
147int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); 90int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
148void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); 91void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
149void synchronize_srcu(struct srcu_struct *sp); 92void synchronize_srcu(struct srcu_struct *sp);
150void synchronize_srcu_expedited(struct srcu_struct *sp);
151unsigned long srcu_batches_completed(struct srcu_struct *sp);
152void srcu_barrier(struct srcu_struct *sp);
153 93
154#ifdef CONFIG_DEBUG_LOCK_ALLOC 94#ifdef CONFIG_DEBUG_LOCK_ALLOC
155 95
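
Whichever flavor the #ifdef chain above selects, the update-side entry points declared in this header behave the same way from the caller's point of view. A minimal sketch of deferred reclamation through call_srcu() (struct foo, foo_srcu, and global_foo are made-up names, not part of this patch):

	struct foo {
		struct rcu_head rh;
		int data;
	};

	static struct srcu_struct foo_srcu;	/* init_srcu_struct() at init time. */
	static struct foo __rcu *global_foo;

	static void foo_reclaim(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct foo, rh));
	}

	/* Publish a new version and free the old one after an SRCU grace period. */
	static void foo_update(struct foo *newp)
	{
		struct foo *oldp = rcu_dereference_protected(global_foo, 1);

		rcu_assign_pointer(global_foo, newp);
		if (oldp)
			call_srcu(&foo_srcu, &oldp->rh, foo_reclaim);
	}
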
diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h
new file mode 100644
index 000000000000..41cf99930f34
--- /dev/null
+++ b/include/linux/srcuclassic.h
@@ -0,0 +1,101 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion,
3 * classic v4.11 variant.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, you can access it online at
17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 *
19 * Copyright (C) IBM Corporation, 2017
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 */
23
24#ifndef _LINUX_SRCU_CLASSIC_H
25#define _LINUX_SRCU_CLASSIC_H
26
27struct srcu_array {
28 unsigned long lock_count[2];
29 unsigned long unlock_count[2];
30};
31
32struct rcu_batch {
33 struct rcu_head *head, **tail;
34};
35
36#define RCU_BATCH_INIT(name) { NULL, &(name.head) }
37
38struct srcu_struct {
39 unsigned long completed;
40 struct srcu_array __percpu *per_cpu_ref;
41 spinlock_t queue_lock; /* protect ->batch_queue, ->running */
42 bool running;
43 /* callbacks just queued */
44 struct rcu_batch batch_queue;
45 /* callbacks try to do the first check_zero */
46 struct rcu_batch batch_check0;
47 /* callbacks done with the first check_zero and the flip */
48 struct rcu_batch batch_check1;
49 struct rcu_batch batch_done;
50 struct delayed_work work;
51#ifdef CONFIG_DEBUG_LOCK_ALLOC
52 struct lockdep_map dep_map;
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54};
55
56void process_srcu(struct work_struct *work);
57
58#define __SRCU_STRUCT_INIT(name) \
59 { \
60 .completed = -300, \
61 .per_cpu_ref = &name##_srcu_array, \
62 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
63 .running = false, \
64 .batch_queue = RCU_BATCH_INIT(name.batch_queue), \
65 .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \
66 .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \
67 .batch_done = RCU_BATCH_INIT(name.batch_done), \
68 .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\
69 __SRCU_DEP_MAP_INIT(name) \
70 }
71
72/*
73 * Define and initialize a srcu struct at build time.
74 * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
75 *
76 * Note that although DEFINE_STATIC_SRCU() hides the name from other
77 * files, the per-CPU variable rules nevertheless require that the
78 * chosen name be globally unique. These rules also prohibit use of
79 * DEFINE_STATIC_SRCU() within a function. If these rules are too
80 * restrictive, declare the srcu_struct manually. For example, in
81 * each file:
82 *
83 * static struct srcu_struct my_srcu;
84 *
85 * Then, before the first use of each my_srcu, manually initialize it:
86 *
87 * init_srcu_struct(&my_srcu);
88 *
89 * See include/linux/percpu-defs.h for the rules on per-CPU variables.
90 */
91#define __DEFINE_SRCU(name, is_static) \
92 static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\
93 is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
94#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
95#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
96
97void synchronize_srcu_expedited(struct srcu_struct *sp);
98void srcu_barrier(struct srcu_struct *sp);
99unsigned long srcu_batches_completed(struct srcu_struct *sp);
100
101#endif
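
The DEFINE_STATIC_SRCU() comment above notes that the per-CPU naming rules can make the macro unusable; a rough sketch of the manual alternative it recommends (foo_init() and foo_exit() are placeholder names):

	static struct srcu_struct my_srcu;

	static int __init foo_init(void)
	{
		return init_srcu_struct(&my_srcu);	/* Run-time initialization. */
	}

	static void __exit foo_exit(void)
	{
		cleanup_srcu_struct(&my_srcu);		/* Pairs with init_srcu_struct(). */
	}
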
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
new file mode 100644
index 000000000000..4f284e4f4d8c
--- /dev/null
+++ b/include/linux/srcutiny.h
@@ -0,0 +1,81 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion,
3 * tiny variant.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, you can access it online at
17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 *
19 * Copyright (C) IBM Corporation, 2017
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 */
23
24#ifndef _LINUX_SRCU_TINY_H
25#define _LINUX_SRCU_TINY_H
26
27#include <linux/swait.h>
28
29struct srcu_struct {
30 int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */
31 struct swait_queue_head srcu_wq;
32 /* Last srcu_read_unlock() wakes GP. */
33 unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */
34 struct rcu_segcblist srcu_cblist;
35 /* Pending SRCU callbacks. */
36 int srcu_idx; /* Current reader array element. */
37 bool srcu_gp_running; /* GP workqueue running? */
38 bool srcu_gp_waiting; /* GP waiting for readers? */
39 struct work_struct srcu_work; /* For driving grace periods. */
40#ifdef CONFIG_DEBUG_LOCK_ALLOC
41 struct lockdep_map dep_map;
42#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
43};
44
45void srcu_drive_gp(struct work_struct *wp);
46
47#define __SRCU_STRUCT_INIT(name) \
48{ \
49 .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \
50 .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \
51 .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \
52 __SRCU_DEP_MAP_INIT(name) \
53}
54
55/*
56 * This odd _STATIC_ arrangement is needed for API compatibility with
57 * Tree SRCU, which needs some per-CPU data.
58 */
59#define DEFINE_SRCU(name) \
60 struct srcu_struct name = __SRCU_STRUCT_INIT(name)
61#define DEFINE_STATIC_SRCU(name) \
62 static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
63
64void synchronize_srcu(struct srcu_struct *sp);
65
66static inline void synchronize_srcu_expedited(struct srcu_struct *sp)
67{
68 synchronize_srcu(sp);
69}
70
71static inline void srcu_barrier(struct srcu_struct *sp)
72{
73 synchronize_srcu(sp);
74}
75
76static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
77{
78 return 0;
79}
80
81#endif
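
Because Tiny SRCU folds synchronize_srcu_expedited() and srcu_barrier() into synchronize_srcu(), callers written against the generic API need no changes. A hedged sketch of that common pattern (my_srcu, reader(), and writer() are illustrative only):

	DEFINE_STATIC_SRCU(my_srcu);

	static void reader(void)
	{
		int idx = srcu_read_lock(&my_srcu);

		/* Read-side critical section; sleeping is permitted here. */
		srcu_read_unlock(&my_srcu, idx);
	}

	static void writer(void)
	{
		/* On Tiny SRCU this also stands in for the expedited variant. */
		synchronize_srcu(&my_srcu);
	}
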
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
new file mode 100644
index 000000000000..0400e211aa44
--- /dev/null
+++ b/include/linux/srcutree.h
@@ -0,0 +1,139 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion,
3 * tree variant.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, you can access it online at
17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 *
19 * Copyright (C) IBM Corporation, 2017
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 */
23
24#ifndef _LINUX_SRCU_TREE_H
25#define _LINUX_SRCU_TREE_H
26
27#include <linux/rcu_node_tree.h>
28#include <linux/completion.h>
29
30struct srcu_node;
31struct srcu_struct;
32
33/*
34 * Per-CPU structure feeding into leaf srcu_node, similar in function
35 * to rcu_node.
36 */
37struct srcu_data {
38 /* Read-side state. */
39 unsigned long srcu_lock_count[2]; /* Locks per CPU. */
40 unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */
41
42 /* Update-side state. */
43 spinlock_t lock ____cacheline_internodealigned_in_smp;
44 struct rcu_segcblist srcu_cblist; /* List of callbacks.*/
45 unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */
46 bool srcu_cblist_invoking; /* Invoking these CBs? */
47 struct delayed_work work; /* Context for CB invoking. */
48 struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */
49 struct srcu_node *mynode; /* Leaf srcu_node. */
50 int cpu;
51 struct srcu_struct *sp;
52};
53
54/*
55 * Node in SRCU combining tree, similar in function to rcu_data.
56 */
57struct srcu_node {
58 spinlock_t lock;
59 unsigned long srcu_have_cbs[4]; /* GP seq for children */
60 /* having CBs, but only */
 61                                             /* if > ->srcu_gp_seq. */
62 struct srcu_node *srcu_parent; /* Next up in tree. */
63 int grplo; /* Least CPU for node. */
64 int grphi; /* Biggest CPU for node. */
65};
66
67/*
68 * Per-SRCU-domain structure, similar in function to rcu_state.
69 */
70struct srcu_struct {
71 struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */
72 struct srcu_node *level[RCU_NUM_LVLS + 1];
73 /* First node at each level. */
74 struct mutex srcu_cb_mutex; /* Serialize CB preparation. */
75 spinlock_t gp_lock; /* protect ->srcu_cblist */
76 struct mutex srcu_gp_mutex; /* Serialize GP work. */
77 unsigned int srcu_idx; /* Current rdr array element. */
78 unsigned long srcu_gp_seq; /* Grace-period seq #. */
79 unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */
80 atomic_t srcu_exp_cnt; /* # ongoing expedited GPs. */
81 struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */
82 unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */
83 struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */
84 struct completion srcu_barrier_completion;
85 /* Awaken barrier rq at end. */
86 atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */
87 /* callback for the barrier */
88 /* operation. */
89 struct delayed_work work;
90#ifdef CONFIG_DEBUG_LOCK_ALLOC
91 struct lockdep_map dep_map;
92#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
93};
94
95/* Values for state variable (bottom bits of ->srcu_gp_seq). */
96#define SRCU_STATE_IDLE 0
97#define SRCU_STATE_SCAN1 1
98#define SRCU_STATE_SCAN2 2
99
100void process_srcu(struct work_struct *work);
101
102#define __SRCU_STRUCT_INIT(name) \
103 { \
104 .sda = &name##_srcu_data, \
105 .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \
106 .srcu_gp_seq_needed = 0 - 1, \
107 __SRCU_DEP_MAP_INIT(name) \
108 }
109
110/*
111 * Define and initialize a srcu struct at build time.
112 * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
113 *
114 * Note that although DEFINE_STATIC_SRCU() hides the name from other
115 * files, the per-CPU variable rules nevertheless require that the
116 * chosen name be globally unique. These rules also prohibit use of
117 * DEFINE_STATIC_SRCU() within a function. If these rules are too
118 * restrictive, declare the srcu_struct manually. For example, in
119 * each file:
120 *
121 * static struct srcu_struct my_srcu;
122 *
123 * Then, before the first use of each my_srcu, manually initialize it:
124 *
125 * init_srcu_struct(&my_srcu);
126 *
127 * See include/linux/percpu-defs.h for the rules on per-CPU variables.
128 */
129#define __DEFINE_SRCU(name, is_static) \
130 static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\
131 is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
132#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
133#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
134
135void synchronize_srcu_expedited(struct srcu_struct *sp);
136void srcu_barrier(struct srcu_struct *sp);
137unsigned long srcu_batches_completed(struct srcu_struct *sp);
138
139#endif
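
The combining tree above only changes how the implementation tracks readers; the reader-side calls themselves are the same as for the Classic and Tiny variants. A sketch of a reader matching the call_srcu() updater shown earlier (foo_srcu and global_foo remain made-up names):

	static int foo_read_data(void)
	{
		int idx, val;
		struct foo *p;

		idx = srcu_read_lock(&foo_srcu);
		p = srcu_dereference(global_foo, &foo_srcu);
		val = p ? p->data : -1;		/* May block while idx is held. */
		srcu_read_unlock(&foo_srcu, idx);
		return val;
	}
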
diff --git a/include/linux/types.h b/include/linux/types.h
index 1e7bd24848fc..258099a4ed82 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -209,7 +209,7 @@ struct ustat {
209 * naturally due ABI requirements, but some architectures (like CRIS) have 209 * naturally due ABI requirements, but some architectures (like CRIS) have
210 * weird ABI and we need to ask it explicitly. 210 * weird ABI and we need to ask it explicitly.
211 * 211 *
212 * The alignment is required to guarantee that bits 0 and 1 of @next will be 212 * The alignment is required to guarantee that bit 0 of @next will be
213 * clear under normal conditions -- as long as we use call_rcu(), 213 * clear under normal conditions -- as long as we use call_rcu(),
214 * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. 214 * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback.
215 * 215 *
diff --git a/include/net/sock.h b/include/net/sock.h
index 03252d53975d..c092f2437546 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -995,7 +995,7 @@ struct smc_hashinfo;
995struct module; 995struct module;
996 996
997/* 997/*
998 * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes 998 * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes
999 * un-modified. Special care is taken when initializing object to zero. 999 * un-modified. Special care is taken when initializing object to zero.
1000 */ 1000 */
1001static inline void sk_prot_clear_nulls(struct sock *sk, int size) 1001static inline void sk_prot_clear_nulls(struct sock *sk, int size)
diff --git a/init/Kconfig b/init/Kconfig
index a92f27da4a27..4119a44e4157 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -526,6 +526,35 @@ config SRCU
526 permits arbitrary sleeping or blocking within RCU read-side critical 526 permits arbitrary sleeping or blocking within RCU read-side critical
527 sections. 527 sections.
528 528
529config CLASSIC_SRCU
530 bool "Use v4.11 classic SRCU implementation"
531 default n
532 depends on RCU_EXPERT && SRCU
533 help
534 This option selects the traditional well-tested classic SRCU
535 implementation from v4.11, as might be desired for enterprise
536 Linux distributions. Without this option, the shiny new
537 Tiny SRCU and Tree SRCU implementations are used instead.
538 At some point, it is hoped that Tiny SRCU and Tree SRCU
539 will accumulate enough test time and confidence to allow
540 Classic SRCU to be dropped entirely.
541
542 Say Y if you need a rock-solid SRCU.
543
 544	  Say N if you would like to help test Tree SRCU.
545
546config TINY_SRCU
547 bool
548 default y if TINY_RCU && !CLASSIC_SRCU
549 help
550 This option selects the single-CPU non-preemptible version of SRCU.
551
552config TREE_SRCU
553 bool
554 default y if !TINY_RCU && !CLASSIC_SRCU
555 help
556 This option selects the full-fledged version of SRCU.
557
529config TASKS_RCU 558config TASKS_RCU
530 bool 559 bool
531 default n 560 default n
@@ -612,11 +641,17 @@ config RCU_FANOUT_LEAF
612 initialization. These systems tend to run CPU-bound, and thus 641 initialization. These systems tend to run CPU-bound, and thus
613 are not helped by synchronized interrupts, and thus tend to 642 are not helped by synchronized interrupts, and thus tend to
614 skew them, which reduces lock contention enough that large 643 skew them, which reduces lock contention enough that large
615 leaf-level fanouts work well. 644 leaf-level fanouts work well. That said, setting leaf-level
645 fanout to a large number will likely cause problematic
646 lock contention on the leaf-level rcu_node structures unless
647 you boot with the skew_tick kernel parameter.
616 648
617 Select a specific number if testing RCU itself. 649 Select a specific number if testing RCU itself.
618 650
619 Select the maximum permissible value for large systems. 651 Select the maximum permissible value for large systems, but
652 please understand that you may also need to set the skew_tick
653 kernel boot parameter to avoid contention on the rcu_node
654 structure's locks.
620 655
621 Take the default if unsure. 656 Take the default if unsure.
622 657
diff --git a/kernel/fork.c b/kernel/fork.c
index 6c463c80e93d..9330ce24f1bb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1313,7 +1313,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
1313 if (atomic_dec_and_test(&sighand->count)) { 1313 if (atomic_dec_and_test(&sighand->count)) {
1314 signalfd_cleanup(sighand); 1314 signalfd_cleanup(sighand);
1315 /* 1315 /*
1316 * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it 1316 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
1317 * without an RCU grace period, see __lock_task_sighand(). 1317 * without an RCU grace period, see __lock_task_sighand().
1318 */ 1318 */
1319 kmem_cache_free(sighand_cachep, sighand); 1319 kmem_cache_free(sighand_cachep, sighand);
@@ -2144,7 +2144,7 @@ void __init proc_caches_init(void)
2144{ 2144{
2145 sighand_cachep = kmem_cache_create("sighand_cache", 2145 sighand_cachep = kmem_cache_create("sighand_cache",
2146 sizeof(struct sighand_struct), 0, 2146 sizeof(struct sighand_struct), 0,
2147 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| 2147 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
2148 SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); 2148 SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
2149 signal_cachep = kmem_cache_create("signal_cache", 2149 signal_cachep = kmem_cache_create("signal_cache",
2150 sizeof(struct signal_struct), 0, 2150 sizeof(struct signal_struct), 0,
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a95e5d1f4a9c..e9d4f85b290c 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1144,10 +1144,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1144 return 0; 1144 return 0;
1145 1145
1146 printk("\n"); 1146 printk("\n");
1147 printk("======================================================\n"); 1147 pr_warn("======================================================\n");
1148 printk("[ INFO: possible circular locking dependency detected ]\n"); 1148 pr_warn("WARNING: possible circular locking dependency detected\n");
1149 print_kernel_ident(); 1149 print_kernel_ident();
1150 printk("-------------------------------------------------------\n"); 1150 pr_warn("------------------------------------------------------\n");
1151 printk("%s/%d is trying to acquire lock:\n", 1151 printk("%s/%d is trying to acquire lock:\n",
1152 curr->comm, task_pid_nr(curr)); 1152 curr->comm, task_pid_nr(curr));
1153 print_lock(check_src); 1153 print_lock(check_src);
@@ -1482,11 +1482,11 @@ print_bad_irq_dependency(struct task_struct *curr,
1482 return 0; 1482 return 0;
1483 1483
1484 printk("\n"); 1484 printk("\n");
1485 printk("======================================================\n"); 1485 pr_warn("=====================================================\n");
1486 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1486 pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
1487 irqclass, irqclass); 1487 irqclass, irqclass);
1488 print_kernel_ident(); 1488 print_kernel_ident();
1489 printk("------------------------------------------------------\n"); 1489 pr_warn("-----------------------------------------------------\n");
1490 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1490 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1491 curr->comm, task_pid_nr(curr), 1491 curr->comm, task_pid_nr(curr),
1492 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1492 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1711,10 +1711,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1711 return 0; 1711 return 0;
1712 1712
1713 printk("\n"); 1713 printk("\n");
1714 printk("=============================================\n"); 1714 pr_warn("============================================\n");
1715 printk("[ INFO: possible recursive locking detected ]\n"); 1715 pr_warn("WARNING: possible recursive locking detected\n");
1716 print_kernel_ident(); 1716 print_kernel_ident();
1717 printk("---------------------------------------------\n"); 1717 pr_warn("--------------------------------------------\n");
1718 printk("%s/%d is trying to acquire lock:\n", 1718 printk("%s/%d is trying to acquire lock:\n",
1719 curr->comm, task_pid_nr(curr)); 1719 curr->comm, task_pid_nr(curr));
1720 print_lock(next); 1720 print_lock(next);
@@ -2061,10 +2061,10 @@ static void print_collision(struct task_struct *curr,
2061 struct lock_chain *chain) 2061 struct lock_chain *chain)
2062{ 2062{
2063 printk("\n"); 2063 printk("\n");
2064 printk("======================\n"); 2064 pr_warn("============================\n");
2065 printk("[chain_key collision ]\n"); 2065 pr_warn("WARNING: chain_key collision\n");
2066 print_kernel_ident(); 2066 print_kernel_ident();
2067 printk("----------------------\n"); 2067 pr_warn("----------------------------\n");
2068 printk("%s/%d: ", current->comm, task_pid_nr(current)); 2068 printk("%s/%d: ", current->comm, task_pid_nr(current));
2069 printk("Hash chain already cached but the contents don't match!\n"); 2069 printk("Hash chain already cached but the contents don't match!\n");
2070 2070
@@ -2360,10 +2360,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2360 return 0; 2360 return 0;
2361 2361
2362 printk("\n"); 2362 printk("\n");
2363 printk("=================================\n"); 2363 pr_warn("================================\n");
2364 printk("[ INFO: inconsistent lock state ]\n"); 2364 pr_warn("WARNING: inconsistent lock state\n");
2365 print_kernel_ident(); 2365 print_kernel_ident();
2366 printk("---------------------------------\n"); 2366 pr_warn("--------------------------------\n");
2367 2367
2368 printk("inconsistent {%s} -> {%s} usage.\n", 2368 printk("inconsistent {%s} -> {%s} usage.\n",
2369 usage_str[prev_bit], usage_str[new_bit]); 2369 usage_str[prev_bit], usage_str[new_bit]);
@@ -2425,10 +2425,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2425 return 0; 2425 return 0;
2426 2426
2427 printk("\n"); 2427 printk("\n");
2428 printk("=========================================================\n"); 2428 pr_warn("========================================================\n");
2429 printk("[ INFO: possible irq lock inversion dependency detected ]\n"); 2429 pr_warn("WARNING: possible irq lock inversion dependency detected\n");
2430 print_kernel_ident(); 2430 print_kernel_ident();
2431 printk("---------------------------------------------------------\n"); 2431 pr_warn("--------------------------------------------------------\n");
2432 printk("%s/%d just changed the state of lock:\n", 2432 printk("%s/%d just changed the state of lock:\n",
2433 curr->comm, task_pid_nr(curr)); 2433 curr->comm, task_pid_nr(curr));
2434 print_lock(this); 2434 print_lock(this);
@@ -3170,10 +3170,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
3170 return 0; 3170 return 0;
3171 3171
3172 printk("\n"); 3172 printk("\n");
3173 printk("==================================\n"); 3173 pr_warn("==================================\n");
3174 printk("[ BUG: Nested lock was not taken ]\n"); 3174 pr_warn("WARNING: Nested lock was not taken\n");
3175 print_kernel_ident(); 3175 print_kernel_ident();
3176 printk("----------------------------------\n"); 3176 pr_warn("----------------------------------\n");
3177 3177
3178 printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); 3178 printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
3179 print_lock(hlock); 3179 print_lock(hlock);
@@ -3383,10 +3383,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3383 return 0; 3383 return 0;
3384 3384
3385 printk("\n"); 3385 printk("\n");
3386 printk("=====================================\n"); 3386 pr_warn("=====================================\n");
3387 printk("[ BUG: bad unlock balance detected! ]\n"); 3387 pr_warn("WARNING: bad unlock balance detected!\n");
3388 print_kernel_ident(); 3388 print_kernel_ident();
3389 printk("-------------------------------------\n"); 3389 pr_warn("-------------------------------------\n");
3390 printk("%s/%d is trying to release lock (", 3390 printk("%s/%d is trying to release lock (",
3391 curr->comm, task_pid_nr(curr)); 3391 curr->comm, task_pid_nr(curr));
3392 print_lockdep_cache(lock); 3392 print_lockdep_cache(lock);
@@ -3880,10 +3880,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3880 return 0; 3880 return 0;
3881 3881
3882 printk("\n"); 3882 printk("\n");
3883 printk("=================================\n"); 3883 pr_warn("=================================\n");
3884 printk("[ BUG: bad contention detected! ]\n"); 3884 pr_warn("WARNING: bad contention detected!\n");
3885 print_kernel_ident(); 3885 print_kernel_ident();
3886 printk("---------------------------------\n"); 3886 pr_warn("---------------------------------\n");
3887 printk("%s/%d is trying to contend lock (", 3887 printk("%s/%d is trying to contend lock (",
3888 curr->comm, task_pid_nr(curr)); 3888 curr->comm, task_pid_nr(curr));
3889 print_lockdep_cache(lock); 3889 print_lockdep_cache(lock);
@@ -4244,10 +4244,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
4244 return; 4244 return;
4245 4245
4246 printk("\n"); 4246 printk("\n");
4247 printk("=========================\n"); 4247 pr_warn("=========================\n");
4248 printk("[ BUG: held lock freed! ]\n"); 4248 pr_warn("WARNING: held lock freed!\n");
4249 print_kernel_ident(); 4249 print_kernel_ident();
4250 printk("-------------------------\n"); 4250 pr_warn("-------------------------\n");
4251 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 4251 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
4252 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 4252 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
4253 print_lock(hlock); 4253 print_lock(hlock);
@@ -4302,11 +4302,11 @@ static void print_held_locks_bug(void)
4302 return; 4302 return;
4303 4303
4304 printk("\n"); 4304 printk("\n");
4305 printk("=====================================\n"); 4305 pr_warn("====================================\n");
4306 printk("[ BUG: %s/%d still has locks held! ]\n", 4306 pr_warn("WARNING: %s/%d still has locks held!\n",
4307 current->comm, task_pid_nr(current)); 4307 current->comm, task_pid_nr(current));
4308 print_kernel_ident(); 4308 print_kernel_ident();
4309 printk("-------------------------------------\n"); 4309 pr_warn("------------------------------------\n");
4310 lockdep_print_held_locks(current); 4310 lockdep_print_held_locks(current);
4311 printk("\nstack backtrace:\n"); 4311 printk("\nstack backtrace:\n");
4312 dump_stack(); 4312 dump_stack();
@@ -4371,7 +4371,7 @@ retry:
4371 } while_each_thread(g, p); 4371 } while_each_thread(g, p);
4372 4372
4373 printk("\n"); 4373 printk("\n");
4374 printk("=============================================\n\n"); 4374 pr_warn("=============================================\n\n");
4375 4375
4376 if (unlock) 4376 if (unlock)
4377 read_unlock(&tasklist_lock); 4377 read_unlock(&tasklist_lock);
@@ -4401,10 +4401,10 @@ asmlinkage __visible void lockdep_sys_exit(void)
4401 if (!debug_locks_off()) 4401 if (!debug_locks_off())
4402 return; 4402 return;
4403 printk("\n"); 4403 printk("\n");
4404 printk("================================================\n"); 4404 pr_warn("================================================\n");
4405 printk("[ BUG: lock held when returning to user space! ]\n"); 4405 pr_warn("WARNING: lock held when returning to user space!\n");
4406 print_kernel_ident(); 4406 print_kernel_ident();
4407 printk("------------------------------------------------\n"); 4407 pr_warn("------------------------------------------------\n");
4408 printk("%s/%d is leaving the kernel with locks still held!\n", 4408 printk("%s/%d is leaving the kernel with locks still held!\n",
4409 curr->comm, curr->pid); 4409 curr->comm, curr->pid);
4410 lockdep_print_held_locks(curr); 4410 lockdep_print_held_locks(curr);
@@ -4421,13 +4421,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4421#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ 4421#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
4422 /* Note: the following can be executed concurrently, so be careful. */ 4422 /* Note: the following can be executed concurrently, so be careful. */
4423 printk("\n"); 4423 printk("\n");
4424 pr_err("===============================\n"); 4424 pr_warn("=============================\n");
4425 pr_err("[ ERR: suspicious RCU usage. ]\n"); 4425 pr_warn("WARNING: suspicious RCU usage\n");
4426 print_kernel_ident(); 4426 print_kernel_ident();
4427 pr_err("-------------------------------\n"); 4427 pr_warn("-----------------------------\n");
4428 pr_err("%s:%d %s!\n", file, line, s); 4428 printk("%s:%d %s!\n", file, line, s);
4429 pr_err("\nother info that might help us debug this:\n\n"); 4429 printk("\nother info that might help us debug this:\n\n");
4430 pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4430 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4431 !rcu_lockdep_current_cpu_online() 4431 !rcu_lockdep_current_cpu_online()
4432 ? "RCU used illegally from offline CPU!\n" 4432 ? "RCU used illegally from offline CPU!\n"
4433 : !rcu_is_watching() 4433 : !rcu_is_watching()
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 97ee9df32e0f..db4f55211b04 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -102,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
102 return; 102 return;
103 } 103 }
104 104
105 printk("\n============================================\n"); 105 pr_warn("\n");
106 printk( "[ BUG: circular locking deadlock detected! ]\n"); 106 pr_warn("============================================\n");
107 printk("%s\n", print_tainted()); 107 pr_warn("WARNING: circular locking deadlock detected!\n");
108 printk( "--------------------------------------------\n"); 108 pr_warn("%s\n", print_tainted());
109 pr_warn("--------------------------------------------\n");
109 printk("%s/%d is deadlocking current task %s/%d\n\n", 110 printk("%s/%d is deadlocking current task %s/%d\n\n",
110 task->comm, task_pid_nr(task), 111 task->comm, task_pid_nr(task),
111 current->comm, task_pid_nr(current)); 112 current->comm, task_pid_nr(current));
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 18dfc485225c..158e6593d58c 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -3,7 +3,9 @@
3KCOV_INSTRUMENT := n 3KCOV_INSTRUMENT := n
4 4
5obj-y += update.o sync.o 5obj-y += update.o sync.o
6obj-$(CONFIG_SRCU) += srcu.o 6obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
7obj-$(CONFIG_TREE_SRCU) += srcutree.o
8obj-$(CONFIG_TINY_SRCU) += srcutiny.o
7obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 9obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
8obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o 10obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
9obj-$(CONFIG_TREE_RCU) += tree.o 11obj-$(CONFIG_TREE_RCU) += tree.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 0d6ff3e471be..73e16ec4054b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -56,6 +56,83 @@
56#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ 56#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
57 DYNTICK_TASK_FLAG) 57 DYNTICK_TASK_FLAG)
58 58
59
60/*
61 * Grace-period counter management.
62 */
63
64#define RCU_SEQ_CTR_SHIFT 2
65#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
66
67/*
68 * Return the counter portion of a sequence number previously returned
69 * by rcu_seq_snap() or rcu_seq_current().
70 */
71static inline unsigned long rcu_seq_ctr(unsigned long s)
72{
73 return s >> RCU_SEQ_CTR_SHIFT;
74}
75
76/*
77 * Return the state portion of a sequence number previously returned
78 * by rcu_seq_snap() or rcu_seq_current().
79 */
80static inline int rcu_seq_state(unsigned long s)
81{
82 return s & RCU_SEQ_STATE_MASK;
83}
84
85/*
86 * Set the state portion of the pointed-to sequence number.
87 * The caller is responsible for preventing conflicting updates.
88 */
89static inline void rcu_seq_set_state(unsigned long *sp, int newstate)
90{
91 WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK);
92 WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate);
93}
94
95/* Adjust sequence number for start of update-side operation. */
96static inline void rcu_seq_start(unsigned long *sp)
97{
98 WRITE_ONCE(*sp, *sp + 1);
99 smp_mb(); /* Ensure update-side operation after counter increment. */
100 WARN_ON_ONCE(rcu_seq_state(*sp) != 1);
101}
102
103/* Adjust sequence number for end of update-side operation. */
104static inline void rcu_seq_end(unsigned long *sp)
105{
106 smp_mb(); /* Ensure update-side operation before counter increment. */
107 WARN_ON_ONCE(!rcu_seq_state(*sp));
108 WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1);
109}
110
111/* Take a snapshot of the update side's sequence number. */
112static inline unsigned long rcu_seq_snap(unsigned long *sp)
113{
114 unsigned long s;
115
116 s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
117 smp_mb(); /* Above access must not bleed into critical section. */
118 return s;
119}
120
 121/* Return the current value of the update side's sequence number, no ordering. */
122static inline unsigned long rcu_seq_current(unsigned long *sp)
123{
124 return READ_ONCE(*sp);
125}
126
127/*
128 * Given a snapshot from rcu_seq_snap(), determine whether or not a
129 * full update-side operation has occurred.
130 */
131static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
132{
133 return ULONG_CMP_GE(READ_ONCE(*sp), s);
134}
135
59/* 136/*
60 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 137 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
61 * by call_rcu() and rcu callback execution, and are therefore not part of the 138 * by call_rcu() and rcu callback execution, and are therefore not part of the
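
The helpers above keep the grace-period phase in the low-order RCU_SEQ_CTR_SHIFT bits and the count in the rest. A rough sketch of the intended snapshot-then-poll usage (example_update() and example_wait() are illustrative, not from the patch):

	static unsigned long example_seq;	/* Update-side sequence counter. */

	static void example_update(void)
	{
		rcu_seq_start(&example_seq);
		/* ... the update-side operation runs here ... */
		rcu_seq_end(&example_seq);
	}

	static void example_wait(void)
	{
		unsigned long s = rcu_seq_snap(&example_seq);

		while (!rcu_seq_done(&example_seq, s))
			schedule_timeout_uninterruptible(1);
		/* A full update-side operation has completed since the snapshot. */
	}
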
@@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
109 186
110 rcu_lock_acquire(&rcu_callback_map); 187 rcu_lock_acquire(&rcu_callback_map);
111 if (__is_kfree_rcu_offset(offset)) { 188 if (__is_kfree_rcu_offset(offset)) {
112 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 189 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);)
113 kfree((void *)head - offset); 190 kfree((void *)head - offset);
114 rcu_lock_release(&rcu_callback_map); 191 rcu_lock_release(&rcu_callback_map);
115 return true; 192 return true;
116 } else { 193 } else {
117 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 194 RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
118 head->func(head); 195 head->func(head);
119 rcu_lock_release(&rcu_callback_map); 196 rcu_lock_release(&rcu_callback_map);
120 return false; 197 return false;
@@ -144,4 +221,76 @@ void rcu_test_sync_prims(void);
144 */ 221 */
145extern void resched_cpu(int cpu); 222extern void resched_cpu(int cpu);
146 223
224#if defined(SRCU) || !defined(TINY_RCU)
225
226#include <linux/rcu_node_tree.h>
227
228extern int rcu_num_lvls;
229extern int num_rcu_lvl[];
230extern int rcu_num_nodes;
231static bool rcu_fanout_exact;
232static int rcu_fanout_leaf;
233
234/*
235 * Compute the per-level fanout, either using the exact fanout specified
236 * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
237 */
238static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
239{
240 int i;
241
242 if (rcu_fanout_exact) {
243 levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
244 for (i = rcu_num_lvls - 2; i >= 0; i--)
245 levelspread[i] = RCU_FANOUT;
246 } else {
247 int ccur;
248 int cprv;
249
250 cprv = nr_cpu_ids;
251 for (i = rcu_num_lvls - 1; i >= 0; i--) {
252 ccur = levelcnt[i];
253 levelspread[i] = (cprv + ccur - 1) / ccur;
254 cprv = ccur;
255 }
256 }
257}
258
259/*
260 * Do a full breadth-first scan of the rcu_node structures for the
261 * specified rcu_state structure.
262 */
263#define rcu_for_each_node_breadth_first(rsp, rnp) \
264 for ((rnp) = &(rsp)->node[0]; \
265 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
266
267/*
268 * Do a breadth-first scan of the non-leaf rcu_node structures for the
269 * specified rcu_state structure. Note that if there is a singleton
270 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
271 */
272#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
273 for ((rnp) = &(rsp)->node[0]; \
274 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
275
276/*
277 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
278 * structure. Note that if there is a singleton rcu_node tree with but
279 * one rcu_node structure, this loop -will- visit the rcu_node structure.
280 * It is still a leaf node, even if it is also the root node.
281 */
282#define rcu_for_each_leaf_node(rsp, rnp) \
283 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
284 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
285
286/*
287 * Iterate over all possible CPUs in a leaf RCU node.
288 */
289#define for_each_leaf_node_possible_cpu(rnp, cpu) \
290 for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
291 cpu <= rnp->grphi; \
292 cpu = cpumask_next((cpu), cpu_possible_mask))
293
294#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
295
147#endif /* __LINUX_RCU_H */ 296#endif /* __LINUX_RCU_H */
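
These traversal macros only rely on the ->node[] and ->level[] layout plus the grplo/grphi bounds, which is why Tree SRCU can share them with Tree RCU. An illustrative walk over a Tree SRCU domain (srcu_count_leaf_cpus() is a made-up helper, not from the patch):

	static void srcu_count_leaf_cpus(struct srcu_struct *sp)
	{
		int cpu;
		struct srcu_node *snp;

		rcu_for_each_leaf_node(sp, snp) {
			int n = 0;

			for_each_leaf_node_possible_cpu(snp, cpu)
				n++;
			pr_info("leaf %d-%d: %d possible CPUs\n",
				snp->grplo, snp->grphi, n);
		}
	}
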
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index cccc417a8135..e9d4527cdd43 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -559,19 +559,34 @@ static void srcu_torture_barrier(void)
559 559
560static void srcu_torture_stats(void) 560static void srcu_torture_stats(void)
561{ 561{
562 int cpu; 562 int __maybe_unused cpu;
563 int idx = srcu_ctlp->completed & 0x1; 563 int idx;
564 564
565 pr_alert("%s%s per-CPU(idx=%d):", 565#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU)
566#ifdef CONFIG_TREE_SRCU
567 idx = srcu_ctlp->srcu_idx & 0x1;
568#else /* #ifdef CONFIG_TREE_SRCU */
569 idx = srcu_ctlp->completed & 0x1;
570#endif /* #else #ifdef CONFIG_TREE_SRCU */
571 pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
566 torture_type, TORTURE_FLAG, idx); 572 torture_type, TORTURE_FLAG, idx);
567 for_each_possible_cpu(cpu) { 573 for_each_possible_cpu(cpu) {
568 unsigned long l0, l1; 574 unsigned long l0, l1;
569 unsigned long u0, u1; 575 unsigned long u0, u1;
570 long c0, c1; 576 long c0, c1;
571 struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); 577#ifdef CONFIG_TREE_SRCU
578 struct srcu_data *counts;
572 579
580 counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
581 u0 = counts->srcu_unlock_count[!idx];
582 u1 = counts->srcu_unlock_count[idx];
583#else /* #ifdef CONFIG_TREE_SRCU */
584 struct srcu_array *counts;
585
586 counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
573 u0 = counts->unlock_count[!idx]; 587 u0 = counts->unlock_count[!idx];
574 u1 = counts->unlock_count[idx]; 588 u1 = counts->unlock_count[idx];
589#endif /* #else #ifdef CONFIG_TREE_SRCU */
575 590
576 /* 591 /*
577 * Make sure that a lock is always counted if the corresponding 592 * Make sure that a lock is always counted if the corresponding
@@ -579,14 +594,26 @@ static void srcu_torture_stats(void)
579 */ 594 */
580 smp_rmb(); 595 smp_rmb();
581 596
597#ifdef CONFIG_TREE_SRCU
598 l0 = counts->srcu_lock_count[!idx];
599 l1 = counts->srcu_lock_count[idx];
600#else /* #ifdef CONFIG_TREE_SRCU */
582 l0 = counts->lock_count[!idx]; 601 l0 = counts->lock_count[!idx];
583 l1 = counts->lock_count[idx]; 602 l1 = counts->lock_count[idx];
603#endif /* #else #ifdef CONFIG_TREE_SRCU */
584 604
585 c0 = l0 - u0; 605 c0 = l0 - u0;
586 c1 = l1 - u1; 606 c1 = l1 - u1;
587 pr_cont(" %d(%ld,%ld)", cpu, c0, c1); 607 pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
588 } 608 }
589 pr_cont("\n"); 609 pr_cont("\n");
610#elif defined(CONFIG_TINY_SRCU)
611 idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
612 pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n",
613 torture_type, TORTURE_FLAG, idx,
614 READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
615 READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
616#endif
590} 617}
591 618
592static void srcu_torture_synchronize_expedited(void) 619static void srcu_torture_synchronize_expedited(void)
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index ef3bcfb15b39..584d8a983883 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -22,7 +22,7 @@
22 * Lai Jiangshan <laijs@cn.fujitsu.com> 22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 * 23 *
24 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
26 * 26 *
27 */ 27 */
28 28
@@ -243,8 +243,14 @@ static bool srcu_readers_active(struct srcu_struct *sp)
243 * cleanup_srcu_struct - deconstruct a sleep-RCU structure 243 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
244 * @sp: structure to clean up. 244 * @sp: structure to clean up.
245 * 245 *
246 * Must invoke this after you are finished using a given srcu_struct that 246 * Must invoke this only after you are finished using a given srcu_struct
247 * was initialized via init_srcu_struct(), else you leak memory. 247 * that was initialized via init_srcu_struct(). This code does some
 248 * probabilistic checking, spotting late uses of srcu_read_lock(),
249 * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
250 * If any such late uses are detected, the per-CPU memory associated with
251 * the srcu_struct is simply leaked and WARN_ON() is invoked. If the
252 * caller frees the srcu_struct itself, a use-after-free crash will likely
253 * ensue, but at least there will be a warning printed.
248 */ 254 */
249void cleanup_srcu_struct(struct srcu_struct *sp) 255void cleanup_srcu_struct(struct srcu_struct *sp)
250{ 256{
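
Given the checks described above, a teardown path should make sure every callback queued by call_srcu() has been invoked before deconstruction. A minimal sketch, reusing the hypothetical foo_srcu from the earlier examples and assuming new readers and call_srcu() invocations have already been stopped:

	static void foo_shutdown(void)
	{
		srcu_barrier(&foo_srcu);	/* Wait for all queued callbacks. */
		cleanup_srcu_struct(&foo_srcu);	/* Now safe to deconstruct. */
	}
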
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
new file mode 100644
index 000000000000..b8293527ee18
--- /dev/null
+++ b/kernel/rcu/srcutiny.c
@@ -0,0 +1,215 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion,
3 * tiny version for non-preemptible single-CPU use.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, you can access it online at
17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 *
19 * Copyright (C) IBM Corporation, 2017
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 */
23
24#include <linux/export.h>
25#include <linux/mutex.h>
26#include <linux/preempt.h>
27#include <linux/rcupdate_wait.h>
28#include <linux/sched.h>
29#include <linux/delay.h>
30#include <linux/srcu.h>
31
32#include <linux/rcu_node_tree.h>
33#include "rcu.h"
34
35static int init_srcu_struct_fields(struct srcu_struct *sp)
36{
37 sp->srcu_lock_nesting[0] = 0;
38 sp->srcu_lock_nesting[1] = 0;
39 init_swait_queue_head(&sp->srcu_wq);
40 sp->srcu_gp_seq = 0;
41 rcu_segcblist_init(&sp->srcu_cblist);
42 sp->srcu_gp_running = false;
43 sp->srcu_gp_waiting = false;
44 sp->srcu_idx = 0;
45 INIT_WORK(&sp->srcu_work, srcu_drive_gp);
46 return 0;
47}
48
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50
51int __init_srcu_struct(struct srcu_struct *sp, const char *name,
52 struct lock_class_key *key)
53{
54 /* Don't re-initialize a lock while it is held. */
55 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
56 lockdep_init_map(&sp->dep_map, name, key, 0);
57 return init_srcu_struct_fields(sp);
58}
59EXPORT_SYMBOL_GPL(__init_srcu_struct);
60
61#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
62
63/*
64 * init_srcu_struct - initialize a sleep-RCU structure
65 * @sp: structure to initialize.
66 *
67 * Must invoke this on a given srcu_struct before passing that srcu_struct
68 * to any other function. Each srcu_struct represents a separate domain
69 * of SRCU protection.
70 */
71int init_srcu_struct(struct srcu_struct *sp)
72{
73 return init_srcu_struct_fields(sp);
74}
75EXPORT_SYMBOL_GPL(init_srcu_struct);
76
77#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
78
79/*
80 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
81 * @sp: structure to clean up.
82 *
83 * Must invoke this after you are finished using a given srcu_struct that
84 * was initialized via init_srcu_struct(), else you leak memory.
85 */
86void cleanup_srcu_struct(struct srcu_struct *sp)
87{
88 WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
89 flush_work(&sp->srcu_work);
90 WARN_ON(rcu_seq_state(sp->srcu_gp_seq));
91 WARN_ON(sp->srcu_gp_running);
92 WARN_ON(sp->srcu_gp_waiting);
93 WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist));
94}
95EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
96
97/*
98 * Counts the new reader in the appropriate per-CPU element of the
99 * srcu_struct. Must be called from process context.
100 * Returns an index that must be passed to the matching srcu_read_unlock().
101 */
102int __srcu_read_lock(struct srcu_struct *sp)
103{
104 int idx;
105
106 idx = READ_ONCE(sp->srcu_idx);
107 WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
108 return idx;
109}
110EXPORT_SYMBOL_GPL(__srcu_read_lock);
111
112/*
113 * Removes the count for the old reader from the appropriate element of
114 * the srcu_struct. Must be called from process context.
115 */
116void __srcu_read_unlock(struct srcu_struct *sp, int idx)
117{
118 int newval = sp->srcu_lock_nesting[idx] - 1;
119
120 WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
121 if (!newval && READ_ONCE(sp->srcu_gp_waiting))
122 swake_up(&sp->srcu_wq);
123}
124EXPORT_SYMBOL_GPL(__srcu_read_unlock);
125
126/*
127 * Workqueue handler to drive one grace period and invoke any callbacks
128 * that become ready as a result. Single-CPU and !PREEMPT operation
129 * means that we get away with murder on synchronization. ;-)
130 */
131void srcu_drive_gp(struct work_struct *wp)
132{
133 int idx;
134 struct rcu_cblist ready_cbs;
135 struct srcu_struct *sp;
136 struct rcu_head *rhp;
137
138 sp = container_of(wp, struct srcu_struct, srcu_work);
139 if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist))
140 return; /* Already running or nothing to do. */
141
142 /* Tag recently arrived callbacks and wait for readers. */
143 WRITE_ONCE(sp->srcu_gp_running, true);
144 rcu_segcblist_accelerate(&sp->srcu_cblist,
145 rcu_seq_snap(&sp->srcu_gp_seq));
146 rcu_seq_start(&sp->srcu_gp_seq);
147 idx = sp->srcu_idx;
148 WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
149 WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
150 swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
151 WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
152 rcu_seq_end(&sp->srcu_gp_seq);
153
154 /* Update callback list based on GP, and invoke ready callbacks. */
155 rcu_segcblist_advance(&sp->srcu_cblist,
156 rcu_seq_current(&sp->srcu_gp_seq));
157 if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) {
158 rcu_cblist_init(&ready_cbs);
159 local_irq_disable();
160 rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs);
161 local_irq_enable();
162 rhp = rcu_cblist_dequeue(&ready_cbs);
163 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
164 local_bh_disable();
165 rhp->func(rhp);
166 local_bh_enable();
167 }
168 local_irq_disable();
169 rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs);
170 local_irq_enable();
171 }
172 WRITE_ONCE(sp->srcu_gp_running, false);
173
174 /*
175 * If more callbacks, reschedule ourselves. This can race with
176 * a call_srcu() at interrupt level, but the ->srcu_gp_running
177 * checks will straighten that out.
178 */
179 if (!rcu_segcblist_empty(&sp->srcu_cblist))
180 schedule_work(&sp->srcu_work);
181}
182EXPORT_SYMBOL_GPL(srcu_drive_gp);
183
184/*
185 * Enqueue an SRCU callback on the specified srcu_struct structure,
186 * initiating grace-period processing if it is not already running.
187 */
188void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
189 rcu_callback_t func)
190{
191 unsigned long flags;
192
193 head->func = func;
194 local_irq_save(flags);
195 rcu_segcblist_enqueue(&sp->srcu_cblist, head, false);
196 local_irq_restore(flags);
197 if (!READ_ONCE(sp->srcu_gp_running))
198 schedule_work(&sp->srcu_work);
199}
200EXPORT_SYMBOL_GPL(call_srcu);
201
202/*
203 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
204 */
205void synchronize_srcu(struct srcu_struct *sp)
206{
207 struct rcu_synchronize rs;
208
209 init_rcu_head_on_stack(&rs.head);
210 init_completion(&rs.completion);
211 call_srcu(sp, &rs.head, wakeme_after_rcu);
212 wait_for_completion(&rs.completion);
213 destroy_rcu_head_on_stack(&rs.head);
214}
215EXPORT_SYMBOL_GPL(synchronize_srcu);
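
As the code above shows, synchronize_srcu() here is just call_srcu() plus an on-stack completion; the same pattern works for a caller that wants to start the grace period, do other work, and only then block. A rough sketch (do_work_then_wait() is a made-up name):

	static void do_work_then_wait(struct srcu_struct *sp)
	{
		struct rcu_synchronize rs;

		init_rcu_head_on_stack(&rs.head);
		init_completion(&rs.completion);
		call_srcu(sp, &rs.head, wakeme_after_rcu);
		/* Overlap other processing with the grace period here. */
		wait_for_completion(&rs.completion);
		destroy_rcu_head_on_stack(&rs.head);
	}
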
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
new file mode 100644
index 000000000000..9ecf0acc18eb
--- /dev/null
+++ b/kernel/rcu/srcutree.c
@@ -0,0 +1,996 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 *
24 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt
26 *
27 */
28
29#include <linux/export.h>
30#include <linux/mutex.h>
31#include <linux/percpu.h>
32#include <linux/preempt.h>
33#include <linux/rcupdate_wait.h>
34#include <linux/sched.h>
35#include <linux/smp.h>
36#include <linux/delay.h>
37#include <linux/srcu.h>
38
39#include "rcu.h"
40
41static void srcu_invoke_callbacks(struct work_struct *work);
42static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
43
44/*
45 * Initialize SRCU combining tree. Note that statically allocated
46 * srcu_struct structures might already have srcu_read_lock() and
47 * srcu_read_unlock() running against them. So if the is_static parameter
48 * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
49 */
50static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
51{
52 int cpu;
53 int i;
54 int level = 0;
55 int levelspread[RCU_NUM_LVLS];
56 struct srcu_data *sdp;
57 struct srcu_node *snp;
58 struct srcu_node *snp_first;
59
60 /* Work out the overall tree geometry. */
61 sp->level[0] = &sp->node[0];
62 for (i = 1; i < rcu_num_lvls; i++)
63 sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1];
64 rcu_init_levelspread(levelspread, num_rcu_lvl);
65
66 /* Each pass through this loop initializes one srcu_node structure. */
67 rcu_for_each_node_breadth_first(sp, snp) {
68 spin_lock_init(&snp->lock);
69 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++)
70 snp->srcu_have_cbs[i] = 0;
71 snp->grplo = -1;
72 snp->grphi = -1;
73 if (snp == &sp->node[0]) {
74 /* Root node, special case. */
75 snp->srcu_parent = NULL;
76 continue;
77 }
78
79 /* Non-root node. */
80 if (snp == sp->level[level + 1])
81 level++;
82 snp->srcu_parent = sp->level[level - 1] +
83 (snp - sp->level[level]) /
84 levelspread[level - 1];
85 }
86
87 /*
88 * Initialize the per-CPU srcu_data array, which feeds into the
89 * leaves of the srcu_node tree.
90 */
91 WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
92 ARRAY_SIZE(sdp->srcu_unlock_count));
93 level = rcu_num_lvls - 1;
94 snp_first = sp->level[level];
95 for_each_possible_cpu(cpu) {
96 sdp = per_cpu_ptr(sp->sda, cpu);
97 spin_lock_init(&sdp->lock);
98 rcu_segcblist_init(&sdp->srcu_cblist);
99 sdp->srcu_cblist_invoking = false;
100 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
101 sdp->mynode = &snp_first[cpu / levelspread[level]];
102 for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
103 if (snp->grplo < 0)
104 snp->grplo = cpu;
105 snp->grphi = cpu;
106 }
107 sdp->cpu = cpu;
108 INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks);
109 sdp->sp = sp;
110 if (is_static)
111 continue;
112
113 /* Dynamically allocated, better be no srcu_read_locks()! */
114 for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
115 sdp->srcu_lock_count[i] = 0;
116 sdp->srcu_unlock_count[i] = 0;
117 }
118 }
119}
120
121/*
122 * Initialize non-compile-time initialized fields, including the
123 * associated srcu_node and srcu_data structures. The is_static
124 * parameter is passed through to init_srcu_struct_nodes(), and
125 * also tells us that ->sda has already been wired up to srcu_data.
126 */
127static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static)
128{
129 mutex_init(&sp->srcu_cb_mutex);
130 mutex_init(&sp->srcu_gp_mutex);
131 sp->srcu_idx = 0;
132 sp->srcu_gp_seq = 0;
133 atomic_set(&sp->srcu_exp_cnt, 0);
134 sp->srcu_barrier_seq = 0;
135 mutex_init(&sp->srcu_barrier_mutex);
136 atomic_set(&sp->srcu_barrier_cpu_cnt, 0);
137 INIT_DELAYED_WORK(&sp->work, process_srcu);
138 if (!is_static)
139 sp->sda = alloc_percpu(struct srcu_data);
140 init_srcu_struct_nodes(sp, is_static);
141 smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */
142 return sp->sda ? 0 : -ENOMEM;
143}
144
145#ifdef CONFIG_DEBUG_LOCK_ALLOC
146
147int __init_srcu_struct(struct srcu_struct *sp, const char *name,
148 struct lock_class_key *key)
149{
150 /* Don't re-initialize a lock while it is held. */
151 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
152 lockdep_init_map(&sp->dep_map, name, key, 0);
153 spin_lock_init(&sp->gp_lock);
154 return init_srcu_struct_fields(sp, false);
155}
156EXPORT_SYMBOL_GPL(__init_srcu_struct);
157
158#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
159
160/**
161 * init_srcu_struct - initialize a sleep-RCU structure
162 * @sp: structure to initialize.
163 *
164 * Must invoke this on a given srcu_struct before passing that srcu_struct
165 * to any other function. Each srcu_struct represents a separate domain
166 * of SRCU protection.
167 */
168int init_srcu_struct(struct srcu_struct *sp)
169{
170 spin_lock_init(&sp->gp_lock);
171 return init_srcu_struct_fields(sp, false);
172}
173EXPORT_SYMBOL_GPL(init_srcu_struct);
174
175#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
176
177/*
178 * First-use initialization of statically allocated srcu_struct
179 * structure. Wiring up the combining tree is more than can be
180 * done with compile-time initialization, so this check is added
181 * to each update-side SRCU primitive. Use ->gp_lock, which -is-
182 * compile-time initialized, to resolve races involving multiple
183 * CPUs trying to garner first-use privileges.
184 */
185static void check_init_srcu_struct(struct srcu_struct *sp)
186{
187 unsigned long flags;
188
189 WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT);
190 /* The smp_load_acquire() pairs with the smp_store_release(). */
191 if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
192 return; /* Already initialized. */
193 spin_lock_irqsave(&sp->gp_lock, flags);
194 if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
195 spin_unlock_irqrestore(&sp->gp_lock, flags);
196 return;
197 }
198 init_srcu_struct_fields(sp, true);
199 spin_unlock_irqrestore(&sp->gp_lock, flags);
200}
201
202/*
203 * Returns approximate total of the readers' ->srcu_lock_count[] values
204 * for the rank of per-CPU counters specified by idx.
205 */
206static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
207{
208 int cpu;
209 unsigned long sum = 0;
210
211 for_each_possible_cpu(cpu) {
212 struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
213
214 sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
215 }
216 return sum;
217}
218
219/*
220 * Returns approximate total of the readers' ->srcu_unlock_count[] values
221 * for the rank of per-CPU counters specified by idx.
222 */
223static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
224{
225 int cpu;
226 unsigned long sum = 0;
227
228 for_each_possible_cpu(cpu) {
229 struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
230
231 sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
232 }
233 return sum;
234}
235
236/*
237 * Return true if the number of pre-existing readers is determined to
238 * be zero.
239 */
240static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
241{
242 unsigned long unlocks;
243
244 unlocks = srcu_readers_unlock_idx(sp, idx);
245
246 /*
247 * Make sure that a lock is always counted if the corresponding
248 * unlock is counted. Needs to be a smp_mb() as the read side may
249 * contain a read from a variable that is written to before the
250 * synchronize_srcu() in the write side. In this case smp_mb()s
251 * A and B act like the store buffering pattern.
252 *
253 * This smp_mb() also pairs with smp_mb() C to prevent accesses
254 * after the synchronize_srcu() from being executed before the
255 * grace period ends.
256 */
257 smp_mb(); /* A */
258
259 /*
260 * If the locks are the same as the unlocks, then there must have
261 * been no readers on this index at some time in between. This does
262 * not mean that there are no more readers, as one could have read
263 * the current index but not have incremented the lock counter yet.
264 *
265 * Possible bug: There is no guarantee that there haven't been
266 * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were
267 * counted, meaning that this could return true even if there are
268 * still active readers. Since there are no memory barriers around
269 * srcu_flip(), the CPU is not required to increment ->srcu_idx
270 * before running srcu_readers_unlock_idx(), which means that there
271 * could be an arbitrarily large number of critical sections that
272 * execute after srcu_readers_unlock_idx() but use the old value
273 * of ->srcu_idx.
274 */
275 return srcu_readers_lock_idx(sp, idx) == unlocks;
276}
277
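The store-buffering claim above can be sketched in plain C, with lock_count standing in for a reader's ->srcu_lock_count[idx] slot and data for a variable written before synchronize_srcu(). Illustrative only, not part of this patch:

static int data;			/* Stands for an SRCU-protected variable. */
static int lock_count;			/* Stands for ->srcu_lock_count[idx].     */

static int reader_side(void)		/* Mirrors __srcu_read_lock() plus a read. */
{
	WRITE_ONCE(lock_count, 1);	/* Count the new reader.                   */
	smp_mb();			/* B: pairs with A below.                   */
	return READ_ONCE(data);
}

static int updater_side(void)		/* Mirrors the pre-GP write plus the scan. */
{
	WRITE_ONCE(data, 1);		/* Write preceding synchronize_srcu().     */
	smp_mb();			/* A: pairs with B above.                   */
	return READ_ONCE(lock_count);
}

/* With both full barriers, reader_side() and updater_side() cannot both
 * return 0: if the updater's scan misses the reader's lock count, the
 * reader is guaranteed to see the updater's pre-grace-period write.      */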
278/**
279 * srcu_readers_active - returns true if there are readers, and false
280 * otherwise
281 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
282 *
283 * Note that this is not an atomic primitive, and can therefore suffer
284 * severe errors when invoked on an active srcu_struct. That said, it
285 * can be useful as an error check at cleanup time.
286 */
287static bool srcu_readers_active(struct srcu_struct *sp)
288{
289 int cpu;
290 unsigned long sum = 0;
291
292 for_each_possible_cpu(cpu) {
293 struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
294
295 sum += READ_ONCE(cpuc->srcu_lock_count[0]);
296 sum += READ_ONCE(cpuc->srcu_lock_count[1]);
297 sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
298 sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
299 }
300 return sum;
301}
302
303#define SRCU_INTERVAL 1
304
305/**
306 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
307 * @sp: structure to clean up.
308 *
309 * Must invoke this after you are finished using a given srcu_struct that
310 * was initialized via init_srcu_struct(), else you leak memory.
311 */
312void cleanup_srcu_struct(struct srcu_struct *sp)
313{
314 int cpu;
315
316 WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt));
317 if (WARN_ON(srcu_readers_active(sp)))
318 return; /* Leakage unless caller handles error. */
319 flush_delayed_work(&sp->work);
320 for_each_possible_cpu(cpu)
321 flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
322 if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
323 WARN_ON(srcu_readers_active(sp))) {
324 pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
325 return; /* Caller forgot to stop doing call_srcu()? */
326 }
327 free_percpu(sp->sda);
328 sp->sda = NULL;
329}
330EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
331
332/*
333 * Counts the new reader in the appropriate per-CPU element of the
334 * srcu_struct. Must be called from process context.
335 * Returns an index that must be passed to the matching srcu_read_unlock().
336 */
337int __srcu_read_lock(struct srcu_struct *sp)
338{
339 int idx;
340
341 idx = READ_ONCE(sp->srcu_idx) & 0x1;
342 __this_cpu_inc(sp->sda->srcu_lock_count[idx]);
343 smp_mb(); /* B */ /* Avoid leaking the critical section. */
344 return idx;
345}
346EXPORT_SYMBOL_GPL(__srcu_read_lock);
347
348/*
349 * Removes the count for the old reader from the appropriate per-CPU
350 * element of the srcu_struct. Note that this may well be a different
351 * CPU than that which was incremented by the corresponding srcu_read_lock().
352 * Must be called from process context.
353 */
354void __srcu_read_unlock(struct srcu_struct *sp, int idx)
355{
356 smp_mb(); /* C */ /* Avoid leaking the critical section. */
357 this_cpu_inc(sp->sda->srcu_unlock_count[idx]);
358}
359EXPORT_SYMBOL_GPL(__srcu_read_unlock);
360
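These are the slow-path workers behind the srcu_read_lock()/srcu_read_unlock() wrappers in include/linux/srcu.h. A minimal reader-side sketch, assuming a hypothetical my_srcu domain, gp_ptr pointer, and struct foo; illustrative only, not part of this patch:

struct foo {
	int val;
};

static struct foo __rcu *gp_ptr;	/* Hypothetical SRCU-protected pointer. */

static void reader(void)
{
	struct foo *p;
	int idx;

	idx = srcu_read_lock(&my_srcu);		/* Remember the returned index.     */
	p = srcu_dereference(gp_ptr, &my_srcu);	/* Fetch the protected pointer.     */
	if (p)
		pr_info("val=%d\n", p->val);	/* SRCU readers may block or sleep. */
	srcu_read_unlock(&my_srcu, idx);	/* Pass back the very same index.   */
}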
361/*
362 * We use an adaptive strategy for synchronize_srcu() and especially for
363 * synchronize_srcu_expedited(). We spin for a fixed time period
364 * (defined below) to allow SRCU readers to exit their read-side critical
365 * sections. If there are still some readers after a few microseconds,
366 * we repeatedly block for 1-millisecond time periods.
367 */
368#define SRCU_RETRY_CHECK_DELAY 5
369
370/*
371 * Start an SRCU grace period.
372 */
373static void srcu_gp_start(struct srcu_struct *sp)
374{
375 struct srcu_data *sdp = this_cpu_ptr(sp->sda);
376 int state;
377
378 RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock),
379 "Invoked srcu_gp_start() without ->gp_lock!");
380 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
381 rcu_segcblist_advance(&sdp->srcu_cblist,
382 rcu_seq_current(&sp->srcu_gp_seq));
383 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
384 rcu_seq_snap(&sp->srcu_gp_seq));
385 rcu_seq_start(&sp->srcu_gp_seq);
386 state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
387 WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
388}
389
390/*
391 * Track online CPUs to guide callback workqueue placement.
392 */
393DEFINE_PER_CPU(bool, srcu_online);
394
395void srcu_online_cpu(unsigned int cpu)
396{
397 WRITE_ONCE(per_cpu(srcu_online, cpu), true);
398}
399
400void srcu_offline_cpu(unsigned int cpu)
401{
402 WRITE_ONCE(per_cpu(srcu_online, cpu), false);
403}
404
405/*
406 * Place the workqueue handler on the specified CPU if online, otherwise
407 * just run it wherever. This is useful for placing workqueue handlers
408 * that are to invoke the specified CPU's callbacks.
409 */
410static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
411 struct delayed_work *dwork,
412 unsigned long delay)
413{
414 bool ret;
415
416 preempt_disable();
417 if (READ_ONCE(per_cpu(srcu_online, cpu)))
418 ret = queue_delayed_work_on(cpu, wq, dwork, delay);
419 else
420 ret = queue_delayed_work(wq, dwork, delay);
421 preempt_enable();
422 return ret;
423}
424
425/*
426 * Schedule callback invocation for the specified srcu_data structure,
427 * if possible, on the corresponding CPU.
428 */
429static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
430{
431 srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
432 &sdp->work, delay);
433}
434
435/*
436 * Schedule callback invocation for all srcu_data structures associated
437 * with the specified srcu_node structure, if possible, on the corresponding
438 * CPUs.
439 */
440static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp)
441{
442 int cpu;
443
444 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++)
445 srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu),
446 atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL);
447}
448
449/*
450 * Note the end of an SRCU grace period. Initiates callback invocation
451 * and starts a new grace period if needed.
452 *
453 * The ->srcu_cb_mutex acquisition does not protect any data, but
454 * instead prevents more than one grace period from starting while we
455 * are initiating callback invocation. This allows the ->srcu_have_cbs[]
456 * array to have a finite number of elements.
457 */
458static void srcu_gp_end(struct srcu_struct *sp)
459{
460 bool cbs;
461 unsigned long gpseq;
462 int idx;
463 int idxnext;
464 struct srcu_node *snp;
465
466 /* Prevent more than one additional grace period. */
467 mutex_lock(&sp->srcu_cb_mutex);
468
469 /* End the current grace period. */
470 spin_lock_irq(&sp->gp_lock);
471 idx = rcu_seq_state(sp->srcu_gp_seq);
472 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
473 rcu_seq_end(&sp->srcu_gp_seq);
474 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
475 spin_unlock_irq(&sp->gp_lock);
476 mutex_unlock(&sp->srcu_gp_mutex);
477 /* A new grace period can start at this point. But only one. */
478
479 /* Initiate callback invocation as needed. */
480 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
481 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
482 rcu_for_each_node_breadth_first(sp, snp) {
483 spin_lock_irq(&snp->lock);
484 cbs = false;
485 if (snp >= sp->level[rcu_num_lvls - 1])
486 cbs = snp->srcu_have_cbs[idx] == gpseq;
487 snp->srcu_have_cbs[idx] = gpseq;
488 rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
489 spin_unlock_irq(&snp->lock);
490 if (cbs) {
491 smp_mb(); /* GP end before CB invocation. */
492 srcu_schedule_cbs_snp(sp, snp);
493 }
494 }
495
496 /* Callback initiation done, allow grace periods after next. */
497 mutex_unlock(&sp->srcu_cb_mutex);
498
499 /* Start a new grace period if needed. */
500 spin_lock_irq(&sp->gp_lock);
501 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
502 if (!rcu_seq_state(gpseq) &&
503 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
504 srcu_gp_start(sp);
505 spin_unlock_irq(&sp->gp_lock);
506 /* Throttle expedited grace periods: Should be rare! */
507 srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) &&
508 rcu_seq_ctr(gpseq) & 0xf
509 ? 0
510 : SRCU_INTERVAL);
511 } else {
512 spin_unlock_irq(&sp->gp_lock);
513 }
514}
515
516/*
517 * Funnel-locking scheme to scalably mediate many concurrent grace-period
518 * requests. The winner has to do the work of actually starting grace
519 * period s. Losers must either ensure that their desired grace-period
520 * number is recorded on at least their leaf srcu_node structure, or they
521 * must take steps to invoke their own callbacks.
522 */
523static void srcu_funnel_gp_start(struct srcu_struct *sp,
524 struct srcu_data *sdp,
525 unsigned long s)
526{
527 unsigned long flags;
528 int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
529 struct srcu_node *snp = sdp->mynode;
530 unsigned long snp_seq;
531
532 /* Each pass through the loop does one level of the srcu_node tree. */
533 for (; snp != NULL; snp = snp->srcu_parent) {
534 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
535 return; /* GP already done and CBs recorded. */
536 spin_lock_irqsave(&snp->lock, flags);
537 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
538 snp_seq = snp->srcu_have_cbs[idx];
539 spin_unlock_irqrestore(&snp->lock, flags);
540 if (snp == sdp->mynode && snp_seq != s) {
541 smp_mb(); /* CBs after GP! */
542 srcu_schedule_cbs_sdp(sdp, 0);
543 }
544 return;
545 }
546 snp->srcu_have_cbs[idx] = s;
547 spin_unlock_irqrestore(&snp->lock, flags);
548 }
549
550 /* Top of tree, must ensure the grace period will be started. */
551 spin_lock_irqsave(&sp->gp_lock, flags);
552 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
553 /*
554 * Record need for grace period s. Pair with load
555 * acquire setting up for initialization.
556 */
557 smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/
558 }
559
560 /* If grace period not already done and none in progress, start it. */
561 if (!rcu_seq_done(&sp->srcu_gp_seq, s) &&
562 rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
563 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
564 srcu_gp_start(sp);
565 queue_delayed_work(system_power_efficient_wq, &sp->work,
566 atomic_read(&sp->srcu_exp_cnt)
567 ? 0
568 : SRCU_INTERVAL);
569 }
570 spin_unlock_irqrestore(&sp->gp_lock, flags);
571}
572
573/*
574 * Wait until all readers counted by array index idx complete, but
575 * loop an additional time if there is an expedited grace period pending.
576 * The caller must ensure that ->srcu_idx is not changed while checking.
577 */
578static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
579{
580 for (;;) {
581 if (srcu_readers_active_idx_check(sp, idx))
582 return true;
583 if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0)
584 return false;
585 udelay(SRCU_RETRY_CHECK_DELAY);
586 }
587}
588
589/*
590 * Increment the ->srcu_idx counter so that future SRCU readers will
591 * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
592 * us to wait for pre-existing readers in a starvation-free manner.
593 */
594static void srcu_flip(struct srcu_struct *sp)
595{
596 WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
597
598 /*
599 * Ensure that if the updater misses an __srcu_read_unlock()
600 * increment, that task's next __srcu_read_lock() will see the
601 * above counter update. Note that both this memory barrier
602 * and the one in srcu_readers_active_idx_check() provide the
603 * guarantee for __srcu_read_lock().
604 */
605 smp_mb(); /* D */ /* Pairs with C. */
606}
607
608/*
609 * Enqueue an SRCU callback on the srcu_data structure associated with
610 * the current CPU and the specified srcu_struct structure, initiating
611 * grace-period processing if it is not already running.
612 *
613 * Note that all CPUs must agree that the grace period extended beyond
614 * all pre-existing SRCU read-side critical sections. On systems with
615 * more than one CPU, this means that when "func()" is invoked, each CPU
616 * is guaranteed to have executed a full memory barrier since the end of
617 * its last corresponding SRCU read-side critical section whose beginning
618 * preceded the call to call_srcu(). It also means that each CPU executing
619 * an SRCU read-side critical section that continues beyond the start of
620 * "func()" must have executed a memory barrier after the call_rcu()
621 * but before the beginning of that SRCU read-side critical section.
622 * Note that these guarantees include CPUs that are offline, idle, or
623 * executing in user mode, as well as CPUs that are executing in the kernel.
624 *
625 * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
626 * resulting SRCU callback function "func()", then both CPU A and CPU
627 * B are guaranteed to execute a full memory barrier during the time
628 * interval between the call to call_srcu() and the invocation of "func()".
629 * This guarantee applies even if CPU A and CPU B are the same CPU (but
630 * again only if the system has more than one CPU).
631 *
632 * Of course, these guarantees apply only for invocations of call_srcu(),
633 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
634 * srcu_struct structure.
635 */
636void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
637 rcu_callback_t func)
638{
639 unsigned long flags;
640 bool needgp = false;
641 unsigned long s;
642 struct srcu_data *sdp;
643
644 check_init_srcu_struct(sp);
645 rhp->func = func;
646 local_irq_save(flags);
647 sdp = this_cpu_ptr(sp->sda);
648 spin_lock(&sdp->lock);
649 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
650 rcu_segcblist_advance(&sdp->srcu_cblist,
651 rcu_seq_current(&sp->srcu_gp_seq));
652 s = rcu_seq_snap(&sp->srcu_gp_seq);
653 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
654 if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
655 sdp->srcu_gp_seq_needed = s;
656 needgp = true;
657 }
658 spin_unlock_irqrestore(&sdp->lock, flags);
659 if (needgp)
660 srcu_funnel_gp_start(sp, sdp, s);
661}
662EXPORT_SYMBOL_GPL(call_srcu);
663
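A minimal asynchronous-reclaim sketch using call_srcu(), assuming the hypothetical my_srcu domain and a struct foo that embeds its own rcu_head; illustrative only, not part of this patch:

struct foo {
	int val;
	struct rcu_head rh;	/* Embedded so the callback can find the object. */
};

static void foo_reclaim(struct rcu_head *rhp)
{
	struct foo *p = container_of(rhp, struct foo, rh);

	kfree(p);		/* All pre-existing SRCU readers have finished. */
}

static void retire_foo(struct foo *old)
{
	call_srcu(&my_srcu, &old->rh, foo_reclaim);	/* Does not block. */
}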
664/*
665 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
666 */
667static void __synchronize_srcu(struct srcu_struct *sp)
668{
669 struct rcu_synchronize rcu;
670
671 RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
672 lock_is_held(&rcu_bh_lock_map) ||
673 lock_is_held(&rcu_lock_map) ||
674 lock_is_held(&rcu_sched_lock_map),
675 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
676
677 if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
678 return;
679 might_sleep();
680 check_init_srcu_struct(sp);
681 init_completion(&rcu.completion);
682 init_rcu_head_on_stack(&rcu.head);
683 call_srcu(sp, &rcu.head, wakeme_after_rcu);
684 wait_for_completion(&rcu.completion);
685 destroy_rcu_head_on_stack(&rcu.head);
686}
687
688/**
689 * synchronize_srcu_expedited - Brute-force SRCU grace period
690 * @sp: srcu_struct with which to synchronize.
691 *
692 * Wait for an SRCU grace period to elapse, but be more aggressive about
693 * spinning rather than blocking when waiting.
694 *
695 * Note that synchronize_srcu_expedited() has the same deadlock and
696 * memory-ordering properties as does synchronize_srcu().
697 */
698void synchronize_srcu_expedited(struct srcu_struct *sp)
699{
700 bool do_norm = rcu_gp_is_normal();
701
702 check_init_srcu_struct(sp);
703 if (!do_norm) {
704 atomic_inc(&sp->srcu_exp_cnt);
705 smp_mb__after_atomic(); /* increment before GP. */
706 }
707 __synchronize_srcu(sp);
708 if (!do_norm) {
709 smp_mb__before_atomic(); /* GP before decrement. */
710 WARN_ON_ONCE(atomic_dec_return(&sp->srcu_exp_cnt) < 0);
711 }
712}
713EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
714
715/**
716 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
717 * @sp: srcu_struct with which to synchronize.
718 *
719 * Wait for the counts of both index ranks to drain to zero. To avoid
720 * possible starvation of synchronize_srcu(), it first waits for the
721 * count of index ((->srcu_idx & 1) ^ 1) to drain to zero, and then
722 * flips ->srcu_idx and waits for the count of the other index to drain.
723 *
724 * Can block; must be called from process context.
725 *
726 * Note that it is illegal to call synchronize_srcu() from the corresponding
727 * SRCU read-side critical section; doing so will result in deadlock.
728 * However, it is perfectly legal to call synchronize_srcu() on one
729 * srcu_struct from some other srcu_struct's read-side critical section,
730 * as long as the resulting graph of srcu_structs is acyclic.
731 *
732 * There are memory-ordering constraints implied by synchronize_srcu().
733 * On systems with more than one CPU, when synchronize_srcu() returns,
734 * each CPU is guaranteed to have executed a full memory barrier since
735 * the end of its last corresponding SRCU read-side critical section
736 * whose beginning preceded the call to synchronize_srcu(). In addition,
737 * each CPU having an SRCU read-side critical section that extends beyond
738 * the return from synchronize_srcu() is guaranteed to have executed a
739 * full memory barrier after the beginning of synchronize_srcu() and before
740 * the beginning of that SRCU read-side critical section. Note that these
741 * guarantees include CPUs that are offline, idle, or executing in user mode,
742 * as well as CPUs that are executing in the kernel.
743 *
744 * Furthermore, if CPU A invoked synchronize_srcu(), which returned
745 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
746 * to have executed a full memory barrier during the execution of
747 * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
748 * are the same CPU, but again only if the system has more than one CPU.
749 *
750 * Of course, these memory-ordering guarantees apply only when
751 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
752 * passed the same srcu_struct structure.
753 */
754void synchronize_srcu(struct srcu_struct *sp)
755{
756 if (rcu_gp_is_expedited())
757 synchronize_srcu_expedited(sp);
758 else
759 __synchronize_srcu(sp);
760}
761EXPORT_SYMBOL_GPL(synchronize_srcu);
762
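A minimal synchronous-update sketch, assuming the hypothetical my_srcu, gp_ptr, and struct foo from the sketches above plus an updater-side my_lock; illustrative only, not part of this patch:

static DEFINE_SPINLOCK(my_lock);	/* Hypothetical updater-side lock. */

static void update_foo(struct foo *newp)
{
	struct foo *old;

	spin_lock(&my_lock);
	old = rcu_dereference_protected(gp_ptr, lockdep_is_held(&my_lock));
	rcu_assign_pointer(gp_ptr, newp);	/* Publish the new version.       */
	spin_unlock(&my_lock);
	synchronize_srcu(&my_srcu);		/* Wait out pre-existing readers. */
	kfree(old);				/* No reader can still hold it.   */
}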
763/*
764 * Callback function for srcu_barrier() use.
765 */
766static void srcu_barrier_cb(struct rcu_head *rhp)
767{
768 struct srcu_data *sdp;
769 struct srcu_struct *sp;
770
771 sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
772 sp = sdp->sp;
773 if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
774 complete(&sp->srcu_barrier_completion);
775}
776
777/**
778 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
779 * @sp: srcu_struct on which to wait for in-flight callbacks.
780 */
781void srcu_barrier(struct srcu_struct *sp)
782{
783 int cpu;
784 struct srcu_data *sdp;
785 unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq);
786
787 check_init_srcu_struct(sp);
788 mutex_lock(&sp->srcu_barrier_mutex);
789 if (rcu_seq_done(&sp->srcu_barrier_seq, s)) {
790 smp_mb(); /* Force ordering following return. */
791 mutex_unlock(&sp->srcu_barrier_mutex);
792 return; /* Someone else did our work for us. */
793 }
794 rcu_seq_start(&sp->srcu_barrier_seq);
795 init_completion(&sp->srcu_barrier_completion);
796
797 /* Initial count prevents reaching zero until all CBs are posted. */
798 atomic_set(&sp->srcu_barrier_cpu_cnt, 1);
799
800 /*
801 * Each pass through this loop enqueues a callback, but only
802 * on CPUs already having callbacks enqueued. Note that if
803 * a CPU already has callbacks enqueued, it must have already
804 * registered the need for a future grace period, so all we
805 * need do is enqueue a callback that will use the same
806 * grace period as the last callback already in the queue.
807 */
808 for_each_possible_cpu(cpu) {
809 sdp = per_cpu_ptr(sp->sda, cpu);
810 spin_lock_irq(&sdp->lock);
811 atomic_inc(&sp->srcu_barrier_cpu_cnt);
812 sdp->srcu_barrier_head.func = srcu_barrier_cb;
813 if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
814 &sdp->srcu_barrier_head, 0))
815 atomic_dec(&sp->srcu_barrier_cpu_cnt);
816 spin_unlock_irq(&sdp->lock);
817 }
818
819 /* Remove the initial count, at which point reaching zero can happen. */
820 if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
821 complete(&sp->srcu_barrier_completion);
822 wait_for_completion(&sp->srcu_barrier_completion);
823
824 rcu_seq_end(&sp->srcu_barrier_seq);
825 mutex_unlock(&sp->srcu_barrier_mutex);
826}
827EXPORT_SYMBOL_GPL(srcu_barrier);
828
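The resulting shutdown ordering, sketched with the hypothetical struct my_dev from the setup sketch; stop_posting_new_callbacks() is a placeholder for whatever prevents further call_srcu() invocations. Illustrative only, not part of this patch:

static void my_dev_shutdown(struct my_dev *d)
{
	stop_posting_new_callbacks(d);	/* Placeholder: no further call_srcu().  */
	srcu_barrier(&d->srcu);		/* Wait for in-flight callbacks to run.  */
	cleanup_srcu_struct(&d->srcu);	/* Safe only after the barrier above.    */
}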
829/**
830 * srcu_batches_completed - return batches completed.
831 * @sp: srcu_struct on which to report batch completion.
832 *
833 * Report the number of batches, correlated with, but not necessarily
834 * precisely the same as, the number of grace periods that have elapsed.
835 */
836unsigned long srcu_batches_completed(struct srcu_struct *sp)
837{
838 return sp->srcu_idx;
839}
840EXPORT_SYMBOL_GPL(srcu_batches_completed);
841
842/*
843 * Core SRCU state machine. Push state bits of ->srcu_gp_seq
844 * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
845 * completed in that state.
846 */
847static void srcu_advance_state(struct srcu_struct *sp)
848{
849 int idx;
850
851 mutex_lock(&sp->srcu_gp_mutex);
852
853 /*
854 * Because readers might be delayed for an extended period after
855 * fetching ->srcu_idx for their index, at any point in time there
856 * might well be readers using both idx=0 and idx=1. We therefore
857 * need to wait for readers to clear from both index values before
858 * invoking a callback.
859 *
860 * The load-acquire ensures that we see the accesses performed
861 * by the prior grace period.
862 */
863 idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
864 if (idx == SRCU_STATE_IDLE) {
865 spin_lock_irq(&sp->gp_lock);
866 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
867 WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
868 spin_unlock_irq(&sp->gp_lock);
869 mutex_unlock(&sp->srcu_gp_mutex);
870 return;
871 }
872 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
873 if (idx == SRCU_STATE_IDLE)
874 srcu_gp_start(sp);
875 spin_unlock_irq(&sp->gp_lock);
876 if (idx != SRCU_STATE_IDLE) {
877 mutex_unlock(&sp->srcu_gp_mutex);
878 return; /* Someone else started the grace period. */
879 }
880 }
881
882 if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
883 idx = 1 ^ (sp->srcu_idx & 1);
884 if (!try_check_zero(sp, idx, 1)) {
885 mutex_unlock(&sp->srcu_gp_mutex);
886 return; /* readers present, retry later. */
887 }
888 srcu_flip(sp);
889 rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2);
890 }
891
892 if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
893
894 /*
895 * SRCU read-side critical sections are normally short,
896 * so check at least twice in quick succession after a flip.
897 */
898 idx = 1 ^ (sp->srcu_idx & 1);
899 if (!try_check_zero(sp, idx, 2)) {
900 mutex_unlock(&sp->srcu_gp_mutex);
901 return; /* readers present, retry later. */
902 }
903 srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */
904 }
905}
906
907/*
908 * Invoke a limited number of SRCU callbacks that have passed through
909 * their grace period. If there are more to do, SRCU will reschedule
910 * the workqueue. Note that needed memory barriers have been executed
911 * in this task's context by srcu_readers_active_idx_check().
912 */
913static void srcu_invoke_callbacks(struct work_struct *work)
914{
915 bool more;
916 struct rcu_cblist ready_cbs;
917 struct rcu_head *rhp;
918 struct srcu_data *sdp;
919 struct srcu_struct *sp;
920
921 sdp = container_of(work, struct srcu_data, work.work);
922 sp = sdp->sp;
923 rcu_cblist_init(&ready_cbs);
924 spin_lock_irq(&sdp->lock);
925 smp_mb(); /* Old grace periods before callback invocation! */
926 rcu_segcblist_advance(&sdp->srcu_cblist,
927 rcu_seq_current(&sp->srcu_gp_seq));
928 if (sdp->srcu_cblist_invoking ||
929 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
930 spin_unlock_irq(&sdp->lock);
931 return; /* Someone else on the job or nothing to do. */
932 }
933
934 /* We are on the job! Extract and invoke ready callbacks. */
935 sdp->srcu_cblist_invoking = true;
936 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
937 spin_unlock_irq(&sdp->lock);
938 rhp = rcu_cblist_dequeue(&ready_cbs);
939 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
940 local_bh_disable();
941 rhp->func(rhp);
942 local_bh_enable();
943 }
944
945 /*
946 * Update counts, accelerate new callbacks, and if needed,
947 * schedule another round of callback invocation.
948 */
949 spin_lock_irq(&sdp->lock);
950 rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
951 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
952 rcu_seq_snap(&sp->srcu_gp_seq));
953 sdp->srcu_cblist_invoking = false;
954 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
955 spin_unlock_irq(&sdp->lock);
956 if (more)
957 srcu_schedule_cbs_sdp(sdp, 0);
958}
959
960/*
961 * Finished one round of SRCU grace period. Start another if there are
962 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
963 */
964static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
965{
966 bool pushgp = true;
967
968 spin_lock_irq(&sp->gp_lock);
969 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
970 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
971 /* All requests fulfilled, time to go idle. */
972 pushgp = false;
973 }
974 } else if (!rcu_seq_state(sp->srcu_gp_seq)) {
975 /* Outstanding request and no GP. Start one. */
976 srcu_gp_start(sp);
977 }
978 spin_unlock_irq(&sp->gp_lock);
979
980 if (pushgp)
981 queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
982}
983
984/*
985 * This is the work-queue function that handles SRCU grace periods.
986 */
987void process_srcu(struct work_struct *work)
988{
989 struct srcu_struct *sp;
990
991 sp = container_of(work, struct srcu_struct, work.work);
992
993 srcu_advance_state(sp);
994 srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL);
995}
996EXPORT_SYMBOL_GPL(process_srcu);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 6ad330dbbae2..e5385731e391 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
79 */ 79 */
80static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 80static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
81{ 81{
82 RCU_TRACE(reset_cpu_stall_ticks(rcp)); 82 RCU_TRACE(reset_cpu_stall_ticks(rcp);)
83 if (rcp->donetail != rcp->curtail) { 83 if (rcp->donetail != rcp->curtail) {
84 rcp->donetail = rcp->curtail; 84 rcp->donetail = rcp->curtail;
85 return 1; 85 return 1;
@@ -125,7 +125,7 @@ void rcu_bh_qs(void)
125 */ 125 */
126void rcu_check_callbacks(int user) 126void rcu_check_callbacks(int user)
127{ 127{
128 RCU_TRACE(check_cpu_stalls()); 128 RCU_TRACE(check_cpu_stalls();)
129 if (user) 129 if (user)
130 rcu_sched_qs(); 130 rcu_sched_qs();
131 else if (!in_softirq()) 131 else if (!in_softirq())
@@ -143,7 +143,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
143 const char *rn = NULL; 143 const char *rn = NULL;
144 struct rcu_head *next, *list; 144 struct rcu_head *next, *list;
145 unsigned long flags; 145 unsigned long flags;
146 RCU_TRACE(int cb_count = 0); 146 RCU_TRACE(int cb_count = 0;)
147 147
148 /* Move the ready-to-invoke callbacks to a local list. */ 148 /* Move the ready-to-invoke callbacks to a local list. */
149 local_irq_save(flags); 149 local_irq_save(flags);
@@ -152,7 +152,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
152 local_irq_restore(flags); 152 local_irq_restore(flags);
153 return; 153 return;
154 } 154 }
155 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); 155 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);)
156 list = rcp->rcucblist; 156 list = rcp->rcucblist;
157 rcp->rcucblist = *rcp->donetail; 157 rcp->rcucblist = *rcp->donetail;
158 *rcp->donetail = NULL; 158 *rcp->donetail = NULL;
@@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 local_irq_restore(flags); 162 local_irq_restore(flags);
163 163
164 /* Invoke the callbacks on the local list. */ 164 /* Invoke the callbacks on the local list. */
165 RCU_TRACE(rn = rcp->name); 165 RCU_TRACE(rn = rcp->name;)
166 while (list) { 166 while (list) {
167 next = list->next; 167 next = list->next;
168 prefetch(next); 168 prefetch(next);
@@ -171,9 +171,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
171 __rcu_reclaim(rn, list); 171 __rcu_reclaim(rn, list);
172 local_bh_enable(); 172 local_bh_enable();
173 list = next; 173 list = next;
174 RCU_TRACE(cb_count++); 174 RCU_TRACE(cb_count++;)
175 } 175 }
176 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 176 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);)
177 RCU_TRACE(trace_rcu_batch_end(rcp->name, 177 RCU_TRACE(trace_rcu_batch_end(rcp->name,
178 cb_count, 0, need_resched(), 178 cb_count, 0, need_resched(),
179 is_idle_task(current), 179 is_idle_task(current),
@@ -221,7 +221,7 @@ static void __call_rcu(struct rcu_head *head,
221 local_irq_save(flags); 221 local_irq_save(flags);
222 *rcp->curtail = head; 222 *rcp->curtail = head;
223 rcp->curtail = &head->next; 223 rcp->curtail = &head->next;
224 RCU_TRACE(rcp->qlen++); 224 RCU_TRACE(rcp->qlen++;)
225 local_irq_restore(flags); 225 local_irq_restore(flags);
226 226
227 if (unlikely(is_idle_task(current))) { 227 if (unlikely(is_idle_task(current))) {
@@ -254,8 +254,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
254void __init rcu_init(void) 254void __init rcu_init(void)
255{ 255{
256 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 256 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
257 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); 257 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);)
258 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); 258 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);)
259 259
260 rcu_early_boot_tests(); 260 rcu_early_boot_tests();
261} 261}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index c64b827ecbca..371034e77f87 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
52 RCU_TRACE(.name = "rcu_bh") 52 RCU_TRACE(.name = "rcu_bh")
53}; 53};
54 54
55#ifdef CONFIG_DEBUG_LOCK_ALLOC 55#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
56#include <linux/kernel_stat.h> 56#include <linux/kernel_stat.h>
57 57
58int rcu_scheduler_active __read_mostly; 58int rcu_scheduler_active __read_mostly;
@@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
65 * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. 65 * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
66 * The reason for this is that Tiny RCU does not need kthreads, so does 66 * The reason for this is that Tiny RCU does not need kthreads, so does
67 * not have to care about the fact that the scheduler is half-initialized 67 * not have to care about the fact that the scheduler is half-initialized
68 * at a certain phase of the boot process. 68 * at a certain phase of the boot process. Unless SRCU is in the mix.
69 */ 69 */
70void __init rcu_scheduler_starting(void) 70void __init rcu_scheduler_starting(void)
71{ 71{
72 WARN_ON(nr_context_switches() > 0); 72 WARN_ON(nr_context_switches() > 0);
73 rcu_scheduler_active = RCU_SCHEDULER_RUNNING; 73 rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
74 ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
74} 75}
75 76
76#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 77#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
77 78
78#ifdef CONFIG_RCU_TRACE 79#ifdef CONFIG_RCU_TRACE
79 80
@@ -162,8 +163,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
162 163
163static void check_cpu_stalls(void) 164static void check_cpu_stalls(void)
164{ 165{
165 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); 166 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);)
166 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); 167 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);)
167} 168}
168 169
169#endif /* #ifdef CONFIG_RCU_TRACE */ 170#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 50fee7689e71..23aa02587d0f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -97,8 +97,8 @@ struct rcu_state sname##_state = { \
97 .gpnum = 0UL - 300UL, \ 97 .gpnum = 0UL - 300UL, \
98 .completed = 0UL - 300UL, \ 98 .completed = 0UL - 300UL, \
99 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ 99 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
100 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 100 .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
101 .orphan_donetail = &sname##_state.orphan_donelist, \ 101 .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
102 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 102 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
103 .name = RCU_STATE_NAME(sname), \ 103 .name = RCU_STATE_NAME(sname), \
104 .abbr = sabbr, \ 104 .abbr = sabbr, \
@@ -123,7 +123,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
123module_param(rcu_fanout_leaf, int, 0444); 123module_param(rcu_fanout_leaf, int, 0444);
124int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 124int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
125/* Number of rcu_nodes at specified level. */ 125/* Number of rcu_nodes at specified level. */
126static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; 126int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
127int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 127int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
128/* panic() on RCU Stall sysctl. */ 128/* panic() on RCU Stall sysctl. */
129int sysctl_panic_on_rcu_stall __read_mostly; 129int sysctl_panic_on_rcu_stall __read_mostly;
@@ -199,7 +199,7 @@ static const int gp_cleanup_delay;
199 199
200/* 200/*
201 * Number of grace periods between delays, normalized by the duration of 201 * Number of grace periods between delays, normalized by the duration of
202 * the delay. The longer the the delay, the more the grace periods between 202 * the delay. The longer the delay, the more the grace periods between
203 * each delay. The reason for this normalization is that it means that, 203 * each delay. The reason for this normalization is that it means that,
204 * for non-zero delays, the overall slowdown of grace periods is constant 204 * for non-zero delays, the overall slowdown of grace periods is constant
205 * regardless of the duration of the delay. This arrangement balances 205 * regardless of the duration of the delay. This arrangement balances
@@ -272,11 +272,19 @@ void rcu_bh_qs(void)
272 } 272 }
273} 273}
274 274
275static DEFINE_PER_CPU(int, rcu_sched_qs_mask); 275/*
276 * Steal a bit from the bottom of ->dynticks for idle entry/exit
277 * control. Initially this is for TLB flushing.
278 */
279#define RCU_DYNTICK_CTRL_MASK 0x1
280#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
281#ifndef rcu_eqs_special_exit
282#define rcu_eqs_special_exit() do { } while (0)
283#endif
276 284
277static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 285static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
278 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 286 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
279 .dynticks = ATOMIC_INIT(1), 287 .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
280#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 288#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
281 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, 289 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
282 .dynticks_idle = ATOMIC_INIT(1), 290 .dynticks_idle = ATOMIC_INIT(1),
@@ -290,15 +298,20 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
290static void rcu_dynticks_eqs_enter(void) 298static void rcu_dynticks_eqs_enter(void)
291{ 299{
292 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 300 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
293 int special; 301 int seq;
294 302
295 /* 303 /*
296 * CPUs seeing atomic_inc_return() must see prior RCU read-side 304 * CPUs seeing atomic_add_return() must see prior RCU read-side
297 * critical sections, and we also must force ordering with the 305 * critical sections, and we also must force ordering with the
298 * next idle sojourn. 306 * next idle sojourn.
299 */ 307 */
300 special = atomic_inc_return(&rdtp->dynticks); 308 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
301 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); 309 /* Better be in an extended quiescent state! */
310 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
311 (seq & RCU_DYNTICK_CTRL_CTR));
312 /* Better not have special action (TLB flush) pending! */
313 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
314 (seq & RCU_DYNTICK_CTRL_MASK));
302} 315}
303 316
304/* 317/*
@@ -308,15 +321,22 @@ static void rcu_dynticks_eqs_enter(void)
308static void rcu_dynticks_eqs_exit(void) 321static void rcu_dynticks_eqs_exit(void)
309{ 322{
310 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 323 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
311 int special; 324 int seq;
312 325
313 /* 326 /*
314 * CPUs seeing atomic_inc_return() must see prior idle sojourns, 327 * CPUs seeing atomic_add_return() must see prior idle sojourns,
315 * and we also must force ordering with the next RCU read-side 328 * and we also must force ordering with the next RCU read-side
316 * critical section. 329 * critical section.
317 */ 330 */
318 special = atomic_inc_return(&rdtp->dynticks); 331 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
319 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); 332 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
333 !(seq & RCU_DYNTICK_CTRL_CTR));
334 if (seq & RCU_DYNTICK_CTRL_MASK) {
335 atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks);
336 smp_mb__after_atomic(); /* _exit after clearing mask. */
337 /* Prefer duplicate flushes to losing a flush. */
338 rcu_eqs_special_exit();
339 }
320} 340}
321 341
322/* 342/*
@@ -333,9 +353,9 @@ static void rcu_dynticks_eqs_online(void)
333{ 353{
334 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 354 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
335 355
336 if (atomic_read(&rdtp->dynticks) & 0x1) 356 if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR)
337 return; 357 return;
338 atomic_add(0x1, &rdtp->dynticks); 358 atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
339} 359}
340 360
341/* 361/*
@@ -347,7 +367,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
347{ 367{
348 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 368 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
349 369
350 return !(atomic_read(&rdtp->dynticks) & 0x1); 370 return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR);
351} 371}
352 372
353/* 373/*
@@ -358,7 +378,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
358{ 378{
359 int snap = atomic_add_return(0, &rdtp->dynticks); 379 int snap = atomic_add_return(0, &rdtp->dynticks);
360 380
361 return snap; 381 return snap & ~RCU_DYNTICK_CTRL_MASK;
362} 382}
363 383
364/* 384/*
@@ -367,7 +387,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
367 */ 387 */
368static bool rcu_dynticks_in_eqs(int snap) 388static bool rcu_dynticks_in_eqs(int snap)
369{ 389{
370 return !(snap & 0x1); 390 return !(snap & RCU_DYNTICK_CTRL_CTR);
371} 391}
372 392
373/* 393/*
@@ -387,14 +407,34 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap)
387static void rcu_dynticks_momentary_idle(void) 407static void rcu_dynticks_momentary_idle(void)
388{ 408{
389 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 409 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
390 int special = atomic_add_return(2, &rdtp->dynticks); 410 int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
411 &rdtp->dynticks);
391 412
392 /* It is illegal to call this from idle state. */ 413 /* It is illegal to call this from idle state. */
393 WARN_ON_ONCE(!(special & 0x1)); 414 WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
394} 415}
395 416
396DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); 417/*
397EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); 418 * Set the special (bottom) bit of the specified CPU so that it
419 * will take special action (such as flushing its TLB) on the
420 * next exit from an extended quiescent state. Returns true if
421 * the bit was successfully set, or false if the CPU was not in
422 * an extended quiescent state.
423 */
424bool rcu_eqs_special_set(int cpu)
425{
426 int old;
427 int new;
428 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
429
430 do {
431 old = atomic_read(&rdtp->dynticks);
432 if (old & RCU_DYNTICK_CTRL_CTR)
433 return false;
434 new = old | RCU_DYNTICK_CTRL_MASK;
435 } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old);
436 return true;
437}
398 438
399/* 439/*
400 * Let the RCU core know that this CPU has gone through the scheduler, 440 * Let the RCU core know that this CPU has gone through the scheduler,
@@ -403,44 +443,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
403 * memory barriers to let the RCU core know about it, regardless of what 443 * memory barriers to let the RCU core know about it, regardless of what
404 * this CPU might (or might not) do in the near future. 444 * this CPU might (or might not) do in the near future.
405 * 445 *
406 * We inform the RCU core by emulating a zero-duration dyntick-idle 446 * We inform the RCU core by emulating a zero-duration dyntick-idle period.
407 * period, which we in turn do by incrementing the ->dynticks counter
408 * by two.
409 * 447 *
410 * The caller must have disabled interrupts. 448 * The caller must have disabled interrupts.
411 */ 449 */
412static void rcu_momentary_dyntick_idle(void) 450static void rcu_momentary_dyntick_idle(void)
413{ 451{
414 struct rcu_data *rdp; 452 raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false);
415 int resched_mask; 453 rcu_dynticks_momentary_idle();
416 struct rcu_state *rsp;
417
418 /*
419 * Yes, we can lose flag-setting operations. This is OK, because
420 * the flag will be set again after some delay.
421 */
422 resched_mask = raw_cpu_read(rcu_sched_qs_mask);
423 raw_cpu_write(rcu_sched_qs_mask, 0);
424
425 /* Find the flavor that needs a quiescent state. */
426 for_each_rcu_flavor(rsp) {
427 rdp = raw_cpu_ptr(rsp->rda);
428 if (!(resched_mask & rsp->flavor_mask))
429 continue;
430 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
431 if (READ_ONCE(rdp->mynode->completed) !=
432 READ_ONCE(rdp->cond_resched_completed))
433 continue;
434
435 /*
436 * Pretend to be momentarily idle for the quiescent state.
437 * This allows the grace-period kthread to record the
438 * quiescent state, with no need for this CPU to do anything
439 * further.
440 */
441 rcu_dynticks_momentary_idle();
442 break;
443 }
444} 454}
445 455
446/* 456/*
@@ -448,14 +458,22 @@ static void rcu_momentary_dyntick_idle(void)
448 * and requires special handling for preemptible RCU. 458 * and requires special handling for preemptible RCU.
449 * The caller must have disabled interrupts. 459 * The caller must have disabled interrupts.
450 */ 460 */
451void rcu_note_context_switch(void) 461void rcu_note_context_switch(bool preempt)
452{ 462{
453 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 463 barrier(); /* Avoid RCU read-side critical sections leaking down. */
454 trace_rcu_utilization(TPS("Start context switch")); 464 trace_rcu_utilization(TPS("Start context switch"));
455 rcu_sched_qs(); 465 rcu_sched_qs();
456 rcu_preempt_note_context_switch(); 466 rcu_preempt_note_context_switch();
457 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 467 /* Load rcu_urgent_qs before other flags. */
468 if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
469 goto out;
470 this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
471 if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs)))
458 rcu_momentary_dyntick_idle(); 472 rcu_momentary_dyntick_idle();
473 this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
474 if (!preempt)
475 rcu_note_voluntary_context_switch_lite(current);
476out:
459 trace_rcu_utilization(TPS("End context switch")); 477 trace_rcu_utilization(TPS("End context switch"));
460 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 478 barrier(); /* Avoid RCU read-side critical sections leaking up. */
461} 479}
@@ -478,29 +496,26 @@ void rcu_all_qs(void)
478{ 496{
479 unsigned long flags; 497 unsigned long flags;
480 498
499 if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs))
500 return;
501 preempt_disable();
502 /* Load rcu_urgent_qs before other flags. */
503 if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) {
504 preempt_enable();
505 return;
506 }
507 this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
481 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 508 barrier(); /* Avoid RCU read-side critical sections leaking down. */
482 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { 509 if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) {
483 local_irq_save(flags); 510 local_irq_save(flags);
484 rcu_momentary_dyntick_idle(); 511 rcu_momentary_dyntick_idle();
485 local_irq_restore(flags); 512 local_irq_restore(flags);
486 } 513 }
487 if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { 514 if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)))
488 /*
489 * Yes, we just checked a per-CPU variable with preemption
490 * enabled, so we might be migrated to some other CPU at
491 * this point. That is OK because in that case, the
492 * migration will supply the needed quiescent state.
493 * We might end up needlessly disabling preemption and
494 * invoking rcu_sched_qs() on the destination CPU, but
495 * the probability and cost are both quite low, so this
496 * should not be a problem in practice.
497 */
498 preempt_disable();
499 rcu_sched_qs(); 515 rcu_sched_qs();
500 preempt_enable(); 516 this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
501 }
502 this_cpu_inc(rcu_qs_ctr);
503 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 517 barrier(); /* Avoid RCU read-side critical sections leaking up. */
518 preempt_enable();
504} 519}
505EXPORT_SYMBOL_GPL(rcu_all_qs); 520EXPORT_SYMBOL_GPL(rcu_all_qs);
506 521
@@ -713,16 +728,6 @@ void rcutorture_record_progress(unsigned long vernum)
713EXPORT_SYMBOL_GPL(rcutorture_record_progress); 728EXPORT_SYMBOL_GPL(rcutorture_record_progress);
714 729
715/* 730/*
716 * Does the CPU have callbacks ready to be invoked?
717 */
718static int
719cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
720{
721 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
722 rdp->nxttail[RCU_NEXT_TAIL] != NULL;
723}
724
725/*
726 * Return the root node of the specified rcu_state structure. 731 * Return the root node of the specified rcu_state structure.
727 */ 732 */
728static struct rcu_node *rcu_get_root(struct rcu_state *rsp) 733static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
@@ -752,21 +757,17 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
752static bool 757static bool
753cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 758cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
754{ 759{
755 int i;
756
757 if (rcu_gp_in_progress(rsp)) 760 if (rcu_gp_in_progress(rsp))
758 return false; /* No, a grace period is already in progress. */ 761 return false; /* No, a grace period is already in progress. */
759 if (rcu_future_needs_gp(rsp)) 762 if (rcu_future_needs_gp(rsp))
760 return true; /* Yes, a no-CBs CPU needs one. */ 763 return true; /* Yes, a no-CBs CPU needs one. */
761 if (!rdp->nxttail[RCU_NEXT_TAIL]) 764 if (!rcu_segcblist_is_enabled(&rdp->cblist))
762 return false; /* No, this is a no-CBs (or offline) CPU. */ 765 return false; /* No, this is a no-CBs (or offline) CPU. */
763 if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) 766 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
764 return true; /* Yes, CPU has newly registered callbacks. */ 767 return true; /* Yes, CPU has newly registered callbacks. */
765 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) 768 if (rcu_segcblist_future_gp_needed(&rdp->cblist,
766 if (rdp->nxttail[i - 1] != rdp->nxttail[i] && 769 READ_ONCE(rsp->completed)))
767 ULONG_CMP_LT(READ_ONCE(rsp->completed), 770 return true; /* Yes, CBs for future grace period. */
768 rdp->nxtcompleted[i]))
769 return true; /* Yes, CBs for future grace period. */
770 return false; /* No grace period needed. */ 771 return false; /* No grace period needed. */
771} 772}
772 773
@@ -1150,6 +1151,24 @@ bool notrace rcu_is_watching(void)
1150} 1151}
1151EXPORT_SYMBOL_GPL(rcu_is_watching); 1152EXPORT_SYMBOL_GPL(rcu_is_watching);
1152 1153
1154/*
1155 * If a holdout task is actually running, request an urgent quiescent
1156 * state from its CPU. This is unsynchronized, so migrations can cause
1157 * the request to go to the wrong CPU. Which is OK, all that will happen
1158 * is that the CPU's next context switch will be a bit slower and next
1159 * time around this task will generate another request.
1160 */
1161void rcu_request_urgent_qs_task(struct task_struct *t)
1162{
1163 int cpu;
1164
1165 barrier();
1166 cpu = task_cpu(t);
1167 if (!task_curr(t))
1168 return; /* This task is not running on that CPU. */
1169 smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true);
1170}
1171
1153#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 1172#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
1154 1173
1155/* 1174/*
@@ -1235,7 +1254,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1235 bool *isidle, unsigned long *maxj) 1254 bool *isidle, unsigned long *maxj)
1236{ 1255{
1237 unsigned long jtsq; 1256 unsigned long jtsq;
1238 int *rcrmp; 1257 bool *rnhqp;
1258 bool *ruqp;
1239 unsigned long rjtsc; 1259 unsigned long rjtsc;
1240 struct rcu_node *rnp; 1260 struct rcu_node *rnp;
1241 1261
@@ -1271,11 +1291,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1271 * might not be the case for nohz_full CPUs looping in the kernel. 1291 * might not be the case for nohz_full CPUs looping in the kernel.
1272 */ 1292 */
1273 rnp = rdp->mynode; 1293 rnp = rdp->mynode;
1294 ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
1274 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && 1295 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
1275 READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && 1296 READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
1276 READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { 1297 READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
1277 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); 1298 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
1278 return 1; 1299 return 1;
1300 } else {
1301 /* Load rcu_qs_ctr before store to rcu_urgent_qs. */
1302 smp_store_release(ruqp, true);
1279 } 1303 }
1280 1304
1281 /* Check for the CPU being offline. */ 1305 /* Check for the CPU being offline. */
@@ -1292,7 +1316,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1292 * in-kernel CPU-bound tasks cannot advance grace periods. 1316 * in-kernel CPU-bound tasks cannot advance grace periods.
1293 * So if the grace period is old enough, make the CPU pay attention. 1317 * So if the grace period is old enough, make the CPU pay attention.
1294 * Note that the unsynchronized assignments to the per-CPU 1318 * Note that the unsynchronized assignments to the per-CPU
1295 * rcu_sched_qs_mask variable are safe. Yes, setting of 1319 * rcu_need_heavy_qs variable are safe. Yes, setting of
1296 * bits can be lost, but they will be set again on the next 1320 * bits can be lost, but they will be set again on the next
1297 * force-quiescent-state pass. So lost bit sets do not result 1321 * force-quiescent-state pass. So lost bit sets do not result
1298 * in incorrect behavior, merely in a grace period lasting 1322 * in incorrect behavior, merely in a grace period lasting
@@ -1306,16 +1330,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1306 * is set too high, we override with half of the RCU CPU stall 1330 * is set too high, we override with half of the RCU CPU stall
1307 * warning delay. 1331 * warning delay.
1308 */ 1332 */
1309 rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); 1333 rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
1310 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || 1334 if (!READ_ONCE(*rnhqp) &&
1311 time_after(jiffies, rdp->rsp->jiffies_resched)) { 1335 (time_after(jiffies, rdp->rsp->gp_start + jtsq) ||
1312 if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { 1336 time_after(jiffies, rdp->rsp->jiffies_resched))) {
1313 WRITE_ONCE(rdp->cond_resched_completed, 1337 WRITE_ONCE(*rnhqp, true);
1314 READ_ONCE(rdp->mynode->completed)); 1338 /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
1315 smp_mb(); /* ->cond_resched_completed before *rcrmp. */ 1339 smp_store_release(ruqp, true);
1316 WRITE_ONCE(*rcrmp,
1317 READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
1318 }
1319 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ 1340 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
1320 } 1341 }
1321 1342
@@ -1475,7 +1496,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1475 1496
1476 print_cpu_stall_info_end(); 1497 print_cpu_stall_info_end();
1477 for_each_possible_cpu(cpu) 1498 for_each_possible_cpu(cpu)
1478 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1499 totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
1500 cpu)->cblist);
1479 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", 1501 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
1480 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1502 smp_processor_id(), (long)(jiffies - rsp->gp_start),
1481 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1503 (long)rsp->gpnum, (long)rsp->completed, totqlen);
@@ -1529,7 +1551,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
1529 print_cpu_stall_info(rsp, smp_processor_id()); 1551 print_cpu_stall_info(rsp, smp_processor_id());
1530 print_cpu_stall_info_end(); 1552 print_cpu_stall_info_end();
1531 for_each_possible_cpu(cpu) 1553 for_each_possible_cpu(cpu)
1532 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1554 totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
1555 cpu)->cblist);
1533 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1556 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
1534 jiffies - rsp->gp_start, 1557 jiffies - rsp->gp_start,
1535 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1558 (long)rsp->gpnum, (long)rsp->completed, totqlen);
@@ -1632,30 +1655,6 @@ void rcu_cpu_stall_reset(void)
1632} 1655}
1633 1656
1634/* 1657/*
1635 * Initialize the specified rcu_data structure's default callback list
1636 * to empty. The default callback list is the one that is not used by
1637 * no-callbacks CPUs.
1638 */
1639static void init_default_callback_list(struct rcu_data *rdp)
1640{
1641 int i;
1642
1643 rdp->nxtlist = NULL;
1644 for (i = 0; i < RCU_NEXT_SIZE; i++)
1645 rdp->nxttail[i] = &rdp->nxtlist;
1646}
1647
1648/*
1649 * Initialize the specified rcu_data structure's callback list to empty.
1650 */
1651static void init_callback_list(struct rcu_data *rdp)
1652{
1653 if (init_nocb_callback_list(rdp))
1654 return;
1655 init_default_callback_list(rdp);
1656}
1657
1658/*
1659 * Determine the value that ->completed will have at the end of the 1658 * Determine the value that ->completed will have at the end of the
1660 * next subsequent grace period. This is used to tag callbacks so that 1659 * next subsequent grace period. This is used to tag callbacks so that
1661 * a CPU can invoke callbacks in a timely fashion even if that CPU has 1660 * a CPU can invoke callbacks in a timely fashion even if that CPU has
@@ -1709,7 +1708,6 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1709 unsigned long *c_out) 1708 unsigned long *c_out)
1710{ 1709{
1711 unsigned long c; 1710 unsigned long c;
1712 int i;
1713 bool ret = false; 1711 bool ret = false;
1714 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 1712 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1715 1713
@@ -1755,13 +1753,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1755 /* 1753 /*
1756 * Get a new grace-period number. If there really is no grace 1754 * Get a new grace-period number. If there really is no grace
1757 * period in progress, it will be smaller than the one we obtained 1755 * period in progress, it will be smaller than the one we obtained
1758 * earlier. Adjust callbacks as needed. Note that even no-CBs 1756 * earlier. Adjust callbacks as needed.
1759 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
1760 */ 1757 */
1761 c = rcu_cbs_completed(rdp->rsp, rnp_root); 1758 c = rcu_cbs_completed(rdp->rsp, rnp_root);
1762 for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) 1759 if (!rcu_is_nocb_cpu(rdp->cpu))
1763 if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) 1760 (void)rcu_segcblist_accelerate(&rdp->cblist, c);
1764 rdp->nxtcompleted[i] = c;
1765 1761
1766 /* 1762 /*
 1767 * If the need for the required grace period is already 1763 * If the need for the required grace period is already
@@ -1793,9 +1789,7 @@ out:
1793 1789
1794/* 1790/*
1795 * Clean up any old requests for the just-ended grace period. Also return 1791 * Clean up any old requests for the just-ended grace period. Also return
1796 * whether any additional grace periods have been requested. Also invoke 1792 * whether any additional grace periods have been requested.
1797 * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
1798 * waiting for this grace period to complete.
1799 */ 1793 */
1800static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 1794static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1801{ 1795{
@@ -1841,57 +1835,27 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
1841static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1835static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1842 struct rcu_data *rdp) 1836 struct rcu_data *rdp)
1843{ 1837{
1844 unsigned long c; 1838 bool ret = false;
1845 int i;
1846 bool ret;
1847
1848 /* If the CPU has no callbacks, nothing to do. */
1849 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1850 return false;
1851
1852 /*
1853 * Starting from the sublist containing the callbacks most
1854 * recently assigned a ->completed number and working down, find the
1855 * first sublist that is not assignable to an upcoming grace period.
1856 * Such a sublist has something in it (first two tests) and has
1857 * a ->completed number assigned that will complete sooner than
1858 * the ->completed number for newly arrived callbacks (last test).
1859 *
1860 * The key point is that any later sublist can be assigned the
1861 * same ->completed number as the newly arrived callbacks, which
1862 * means that the callbacks in any of these later sublist can be
1863 * grouped into a single sublist, whether or not they have already
1864 * been assigned a ->completed number.
1865 */
1866 c = rcu_cbs_completed(rsp, rnp);
1867 for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
1868 if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
1869 !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
1870 break;
1871 1839
1872 /* 1840 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1873 * If there are no sublist for unassigned callbacks, leave. 1841 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1874 * At the same time, advance "i" one sublist, so that "i" will
1875 * index into the sublist where all the remaining callbacks should
1876 * be grouped into.
1877 */
1878 if (++i >= RCU_NEXT_TAIL)
1879 return false; 1842 return false;
1880 1843
1881 /* 1844 /*
1882 * Assign all subsequent callbacks' ->completed number to the next 1845 * Callbacks are often registered with incomplete grace-period
1883 * full grace period and group them all in the sublist initially 1846 * information. Something about the fact that getting exact
1884 * indexed by "i". 1847 * information requires acquiring a global lock... RCU therefore
1848 * makes a conservative estimate of the grace period number at which
1849 * a given callback will become ready to invoke. The following
1850 * code checks this estimate and improves it when possible, thus
1851 * accelerating callback invocation to an earlier grace-period
1852 * number.
1885 */ 1853 */
1886 for (; i <= RCU_NEXT_TAIL; i++) { 1854 if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp)))
1887 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; 1855 ret = rcu_start_future_gp(rnp, rdp, NULL);
1888 rdp->nxtcompleted[i] = c;
1889 }
1890 /* Record any needed additional grace periods. */
1891 ret = rcu_start_future_gp(rnp, rdp, NULL);
1892 1856
1893 /* Trace depending on how much we were able to accelerate. */ 1857 /* Trace depending on how much we were able to accelerate. */
1894 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1858 if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
1895 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); 1859 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1896 else 1860 else
1897 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); 1861 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
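rcu_segcblist_accelerate() replaces the open-coded sublist walk removed above. The underlying rule can be sketched in isolation: the number c returned by rcu_cbs_completed() is a conservative completion estimate that is safe for every callback already queued, so any segment currently tagged with a later grace period (or the never-tagged NEXT segment) may be pulled forward to c, while segments already due no later than c are left alone. A toy model under those assumptions (the segment macros and toy_* names are illustrative, and the counter wrap handled by ULONG_CMP_*() in the real code is ignored):

#include <stdbool.h>

#define SEG_DONE        0   /* ready to invoke                              */
#define SEG_WAIT        1   /* waiting for the current grace period         */
#define SEG_NEXT_READY  2   /* arrived before the current GP ended          */
#define SEG_NEXT        3   /* may have arrived after the current GP ended  */
#define NSEGS           4

struct toy_segments {
        unsigned long ncbs[NSEGS];      /* callbacks queued in each segment  */
        unsigned long gp_seq[NSEGS];    /* GP number assigned to the segment */
};

/* Returns true if any callbacks were retagged to the earlier number c. */
static bool toy_accelerate(struct toy_segments *s, unsigned long c)
{
        bool moved = false;
        int i;

        for (i = SEG_WAIT; i <= SEG_NEXT; i++) {
                if (!s->ncbs[i])
                        continue;                  /* empty segment */
                if (i != SEG_NEXT && s->gp_seq[i] <= c)
                        continue;                  /* already due no later than c */
                s->gp_seq[i] = c;
                moved = true;
        }
        return moved;
}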
@@ -1911,32 +1875,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1911static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1875static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1912 struct rcu_data *rdp) 1876 struct rcu_data *rdp)
1913{ 1877{
1914 int i, j; 1878 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1915 1879 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1916 /* If the CPU has no callbacks, nothing to do. */
1917 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1918 return false; 1880 return false;
1919 1881
1920 /* 1882 /*
1921 * Find all callbacks whose ->completed numbers indicate that they 1883 * Find all callbacks whose ->completed numbers indicate that they
1922 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. 1884 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1923 */ 1885 */
1924 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { 1886 rcu_segcblist_advance(&rdp->cblist, rnp->completed);
1925 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
1926 break;
1927 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
1928 }
1929 /* Clean up any sublist tail pointers that were misordered above. */
1930 for (j = RCU_WAIT_TAIL; j < i; j++)
1931 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
1932
1933 /* Copy down callbacks to fill in empty sublists. */
1934 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
1935 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
1936 break;
1937 rdp->nxttail[j] = rdp->nxttail[i];
1938 rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
1939 }
1940 1887
1941 /* Classify any remaining callbacks. */ 1888 /* Classify any remaining callbacks. */
1942 return rcu_accelerate_cbs(rsp, rnp, rdp); 1889 return rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1981,7 +1928,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1981 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1928 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1982 need_gp = !!(rnp->qsmask & rdp->grpmask); 1929 need_gp = !!(rnp->qsmask & rdp->grpmask);
1983 rdp->cpu_no_qs.b.norm = need_gp; 1930 rdp->cpu_no_qs.b.norm = need_gp;
1984 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 1931 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
1985 rdp->core_needs_qs = need_gp; 1932 rdp->core_needs_qs = need_gp;
1986 zero_cpu_stall_ticks(rdp); 1933 zero_cpu_stall_ticks(rdp);
1987 WRITE_ONCE(rdp->gpwrap, false); 1934 WRITE_ONCE(rdp->gpwrap, false);
@@ -2579,7 +2526,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2579 * within the current grace period. 2526 * within the current grace period.
2580 */ 2527 */
2581 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ 2528 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
2582 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 2529 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
2583 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2530 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2584 return; 2531 return;
2585 } 2532 }
@@ -2653,13 +2600,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2653 * because _rcu_barrier() excludes CPU-hotplug operations, so it 2600 * because _rcu_barrier() excludes CPU-hotplug operations, so it
2654 * cannot be running now. Thus no memory barrier is required. 2601 * cannot be running now. Thus no memory barrier is required.
2655 */ 2602 */
2656 if (rdp->nxtlist != NULL) { 2603 rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
2657 rsp->qlen_lazy += rdp->qlen_lazy; 2604 rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
2658 rsp->qlen += rdp->qlen;
2659 rdp->n_cbs_orphaned += rdp->qlen;
2660 rdp->qlen_lazy = 0;
2661 WRITE_ONCE(rdp->qlen, 0);
2662 }
2663 2605
2664 /* 2606 /*
2665 * Next, move those callbacks still needing a grace period to 2607 * Next, move those callbacks still needing a grace period to
@@ -2667,31 +2609,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2667 * Some of the callbacks might have gone partway through a grace 2609 * Some of the callbacks might have gone partway through a grace
2668 * period, but that is too bad. They get to start over because we 2610 * period, but that is too bad. They get to start over because we
2669 * cannot assume that grace periods are synchronized across CPUs. 2611 * cannot assume that grace periods are synchronized across CPUs.
2670 * We don't bother updating the ->nxttail[] array yet, instead
2671 * we just reset the whole thing later on.
2672 */ 2612 */
2673 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { 2613 rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
2674 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
2675 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
2676 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
2677 }
2678 2614
2679 /* 2615 /*
2680 * Then move the ready-to-invoke callbacks to the orphanage, 2616 * Then move the ready-to-invoke callbacks to the orphanage,
2681 * where some other CPU will pick them up. These will not be 2617 * where some other CPU will pick them up. These will not be
 2682 * required to pass through another grace period: They are done. 2618 * required to pass through another grace period: They are done.
2683 */ 2619 */
2684 if (rdp->nxtlist != NULL) { 2620 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
2685 *rsp->orphan_donetail = rdp->nxtlist;
2686 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
2687 }
2688 2621
2689 /* 2622 /* Finally, disallow further callbacks on this CPU. */
2690 * Finally, initialize the rcu_data structure's list to empty and 2623 rcu_segcblist_disable(&rdp->cblist);
2691 * disallow further callbacks on this CPU.
2692 */
2693 init_callback_list(rdp);
2694 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2695} 2624}
2696 2625
2697/* 2626/*
@@ -2700,7 +2629,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2700 */ 2629 */
2701static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) 2630static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2702{ 2631{
2703 int i;
2704 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 2632 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2705 2633
2706 /* No-CBs CPUs are handled specially. */ 2634 /* No-CBs CPUs are handled specially. */
@@ -2709,13 +2637,11 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2709 return; 2637 return;
2710 2638
2711 /* Do the accounting first. */ 2639 /* Do the accounting first. */
2712 rdp->qlen_lazy += rsp->qlen_lazy; 2640 rdp->n_cbs_adopted += rcu_cblist_n_cbs(&rsp->orphan_done);
2713 rdp->qlen += rsp->qlen; 2641 if (rcu_cblist_n_lazy_cbs(&rsp->orphan_done) !=
2714 rdp->n_cbs_adopted += rsp->qlen; 2642 rcu_cblist_n_cbs(&rsp->orphan_done))
2715 if (rsp->qlen_lazy != rsp->qlen)
2716 rcu_idle_count_callbacks_posted(); 2643 rcu_idle_count_callbacks_posted();
2717 rsp->qlen_lazy = 0; 2644 rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
2718 rsp->qlen = 0;
2719 2645
2720 /* 2646 /*
2721 * We do not need a memory barrier here because the only way we 2647 * We do not need a memory barrier here because the only way we
@@ -2723,24 +2649,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2723 * we are the task doing the rcu_barrier(). 2649 * we are the task doing the rcu_barrier().
2724 */ 2650 */
2725 2651
2726 /* First adopt the ready-to-invoke callbacks. */ 2652 /* First adopt the ready-to-invoke callbacks, then the done ones. */
2727 if (rsp->orphan_donelist != NULL) { 2653 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
2728 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; 2654 WARN_ON_ONCE(!rcu_cblist_empty(&rsp->orphan_done));
2729 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; 2655 rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
2730 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) 2656 WARN_ON_ONCE(!rcu_cblist_empty(&rsp->orphan_pend));
2731 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) 2657 WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
2732 rdp->nxttail[i] = rsp->orphan_donetail; 2658 !rcu_segcblist_n_cbs(&rdp->cblist));
2733 rsp->orphan_donelist = NULL;
2734 rsp->orphan_donetail = &rsp->orphan_donelist;
2735 }
2736
2737 /* And then adopt the callbacks that still need a grace period. */
2738 if (rsp->orphan_nxtlist != NULL) {
2739 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
2740 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
2741 rsp->orphan_nxtlist = NULL;
2742 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2743 }
2744} 2659}
2745 2660
2746/* 2661/*
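The extract/insert helpers used above hand an entire list, together with its count, from the dying CPU to the orphanage (and later to the adopting CPU) under ->orphan_lock. A stand-alone sketch of the splice such helpers perform (toy_* names are illustrative; the real helpers additionally separate done from still-pending segments and carry lazy counts):

#include <stddef.h>

struct toy_cb {                         /* stand-in for struct rcu_head */
        struct toy_cb *next;
};

struct toy_cblist {                     /* stand-in for struct rcu_cblist */
        struct toy_cb *head;
        struct toy_cb **tail;
        long len;
};

static void toy_cblist_init(struct toy_cblist *l)
{
        l->head = NULL;
        l->tail = &l->head;
        l->len = 0;
}

/* Dying CPU: splice everything from @src onto the shared orphanage @dst. */
static void toy_cblist_extract_all(struct toy_cblist *dst, struct toy_cblist *src)
{
        if (!src->head)
                return;
        *dst->tail = src->head;         /* append the source list          */
        dst->tail = src->tail;          /* destination tail moves to end   */
        dst->len += src->len;           /* the count travels with the list */
        toy_cblist_init(src);           /* source is left empty            */
}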
@@ -2748,14 +2663,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2748 */ 2663 */
2749static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 2664static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2750{ 2665{
2751 RCU_TRACE(unsigned long mask); 2666 RCU_TRACE(unsigned long mask;)
2752 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); 2667 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);)
2753 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); 2668 RCU_TRACE(struct rcu_node *rnp = rdp->mynode;)
2754 2669
2755 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2670 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
2756 return; 2671 return;
2757 2672
2758 RCU_TRACE(mask = rdp->grpmask); 2673 RCU_TRACE(mask = rdp->grpmask;)
2759 trace_rcu_grace_period(rsp->name, 2674 trace_rcu_grace_period(rsp->name,
2760 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 2675 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
2761 TPS("cpuofl")); 2676 TPS("cpuofl"));
@@ -2828,9 +2743,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2828 rcu_adopt_orphan_cbs(rsp, flags); 2743 rcu_adopt_orphan_cbs(rsp, flags);
2829 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); 2744 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
2830 2745
2831 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2746 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
2832 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2747 !rcu_segcblist_empty(&rdp->cblist),
2833 cpu, rdp->qlen, rdp->nxtlist); 2748 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
2749 cpu, rcu_segcblist_n_cbs(&rdp->cblist),
2750 rcu_segcblist_first_cb(&rdp->cblist));
2834} 2751}
2835 2752
2836/* 2753/*
@@ -2840,14 +2757,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2840static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) 2757static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2841{ 2758{
2842 unsigned long flags; 2759 unsigned long flags;
2843 struct rcu_head *next, *list, **tail; 2760 struct rcu_head *rhp;
2844 long bl, count, count_lazy; 2761 struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
2845 int i; 2762 long bl, count;
2846 2763
2847 /* If no callbacks are ready, just return. */ 2764 /* If no callbacks are ready, just return. */
2848 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 2765 if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
2849 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); 2766 trace_rcu_batch_start(rsp->name,
2850 trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), 2767 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2768 rcu_segcblist_n_cbs(&rdp->cblist), 0);
2769 trace_rcu_batch_end(rsp->name, 0,
2770 !rcu_segcblist_empty(&rdp->cblist),
2851 need_resched(), is_idle_task(current), 2771 need_resched(), is_idle_task(current),
2852 rcu_is_callbacks_kthread()); 2772 rcu_is_callbacks_kthread());
2853 return; 2773 return;
@@ -2855,73 +2775,62 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2855 2775
2856 /* 2776 /*
2857 * Extract the list of ready callbacks, disabling to prevent 2777 * Extract the list of ready callbacks, disabling to prevent
2858 * races with call_rcu() from interrupt handlers. 2778 * races with call_rcu() from interrupt handlers. Leave the
2779 * callback counts, as rcu_barrier() needs to be conservative.
2859 */ 2780 */
2860 local_irq_save(flags); 2781 local_irq_save(flags);
2861 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 2782 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
2862 bl = rdp->blimit; 2783 bl = rdp->blimit;
2863 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); 2784 trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2864 list = rdp->nxtlist; 2785 rcu_segcblist_n_cbs(&rdp->cblist), bl);
2865 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 2786 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
2866 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
2867 tail = rdp->nxttail[RCU_DONE_TAIL];
2868 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
2869 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
2870 rdp->nxttail[i] = &rdp->nxtlist;
2871 local_irq_restore(flags); 2787 local_irq_restore(flags);
2872 2788
2873 /* Invoke callbacks. */ 2789 /* Invoke callbacks. */
2874 count = count_lazy = 0; 2790 rhp = rcu_cblist_dequeue(&rcl);
2875 while (list) { 2791 for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
2876 next = list->next; 2792 debug_rcu_head_unqueue(rhp);
2877 prefetch(next); 2793 if (__rcu_reclaim(rsp->name, rhp))
2878 debug_rcu_head_unqueue(list); 2794 rcu_cblist_dequeued_lazy(&rcl);
2879 if (__rcu_reclaim(rsp->name, list)) 2795 /*
2880 count_lazy++; 2796 * Stop only if limit reached and CPU has something to do.
2881 list = next; 2797 * Note: The rcl structure counts down from zero.
2882 /* Stop only if limit reached and CPU has something to do. */ 2798 */
2883 if (++count >= bl && 2799 if (-rcu_cblist_n_cbs(&rcl) >= bl &&
2884 (need_resched() || 2800 (need_resched() ||
2885 (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) 2801 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
2886 break; 2802 break;
2887 } 2803 }
2888 2804
2889 local_irq_save(flags); 2805 local_irq_save(flags);
2890 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), 2806 count = -rcu_cblist_n_cbs(&rcl);
2891 is_idle_task(current), 2807 trace_rcu_batch_end(rsp->name, count, !rcu_cblist_empty(&rcl),
2808 need_resched(), is_idle_task(current),
2892 rcu_is_callbacks_kthread()); 2809 rcu_is_callbacks_kthread());
2893 2810
2894 /* Update count, and requeue any remaining callbacks. */ 2811 /* Update counts and requeue any remaining callbacks. */
2895 if (list != NULL) { 2812 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
2896 *tail = rdp->nxtlist;
2897 rdp->nxtlist = list;
2898 for (i = 0; i < RCU_NEXT_SIZE; i++)
2899 if (&rdp->nxtlist == rdp->nxttail[i])
2900 rdp->nxttail[i] = tail;
2901 else
2902 break;
2903 }
2904 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2813 smp_mb(); /* List handling before counting for rcu_barrier(). */
2905 rdp->qlen_lazy -= count_lazy;
2906 WRITE_ONCE(rdp->qlen, rdp->qlen - count);
2907 rdp->n_cbs_invoked += count; 2814 rdp->n_cbs_invoked += count;
2815 rcu_segcblist_insert_count(&rdp->cblist, &rcl);
2908 2816
2909 /* Reinstate batch limit if we have worked down the excess. */ 2817 /* Reinstate batch limit if we have worked down the excess. */
2910 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 2818 count = rcu_segcblist_n_cbs(&rdp->cblist);
2819 if (rdp->blimit == LONG_MAX && count <= qlowmark)
2911 rdp->blimit = blimit; 2820 rdp->blimit = blimit;
2912 2821
2913 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ 2822 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
2914 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { 2823 if (count == 0 && rdp->qlen_last_fqs_check != 0) {
2915 rdp->qlen_last_fqs_check = 0; 2824 rdp->qlen_last_fqs_check = 0;
2916 rdp->n_force_qs_snap = rsp->n_force_qs; 2825 rdp->n_force_qs_snap = rsp->n_force_qs;
2917 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 2826 } else if (count < rdp->qlen_last_fqs_check - qhimark)
2918 rdp->qlen_last_fqs_check = rdp->qlen; 2827 rdp->qlen_last_fqs_check = count;
2919 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); 2828 WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
2920 2829
2921 local_irq_restore(flags); 2830 local_irq_restore(flags);
2922 2831
2923 /* Re-invoke RCU core processing if there are callbacks remaining. */ 2832 /* Re-invoke RCU core processing if there are callbacks remaining. */
2924 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2833 if (rcu_segcblist_ready_cbs(&rdp->cblist))
2925 invoke_rcu_core(); 2834 invoke_rcu_core();
2926} 2835}
2927 2836
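The "counts down from zero" convention noted above deserves a concrete example: the ready callbacks are spliced onto a local list whose length starts at zero and goes negative as callbacks are dequeued, while the per-CPU count is deliberately left untouched so rcu_barrier() stays conservative; merging the negative residue back at the end subtracts exactly the number of callbacks invoked. A runnable toy of the arithmetic (all names are illustrative):

#include <stdio.h>

int main(void)
{
        long percpu_len = 10;   /* callbacks queued on the CPU, 6 of them done */
        long local_len = 0;     /* local list of extracted done callbacks      */
        int i;

        for (i = 0; i < 6; i++)
                local_len--;    /* toy stand-in for rcu_cblist_dequeue()       */

        printf("invoked %ld callbacks\n", -local_len);          /* prints 6 */

        percpu_len += local_len;        /* toy rcu_segcblist_insert_count()    */
        printf("%ld callbacks still queued\n", percpu_len);     /* prints 4 */
        return 0;
}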
@@ -3087,7 +2996,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
3087 bool needwake; 2996 bool needwake;
3088 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 2997 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
3089 2998
3090 WARN_ON_ONCE(rdp->beenonline == 0); 2999 WARN_ON_ONCE(!rdp->beenonline);
3091 3000
3092 /* Update RCU state based on any recent quiescent states. */ 3001 /* Update RCU state based on any recent quiescent states. */
3093 rcu_check_quiescent_state(rsp, rdp); 3002 rcu_check_quiescent_state(rsp, rdp);
@@ -3105,7 +3014,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
3105 } 3014 }
3106 3015
3107 /* If there are callbacks ready, invoke them. */ 3016 /* If there are callbacks ready, invoke them. */
3108 if (cpu_has_callbacks_ready_to_invoke(rdp)) 3017 if (rcu_segcblist_ready_cbs(&rdp->cblist))
3109 invoke_rcu_callbacks(rsp, rdp); 3018 invoke_rcu_callbacks(rsp, rdp);
3110 3019
3111 /* Do any needed deferred wakeups of rcuo kthreads. */ 3020 /* Do any needed deferred wakeups of rcuo kthreads. */
@@ -3177,7 +3086,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
3177 * invoking force_quiescent_state() if the newly enqueued callback 3086 * invoking force_quiescent_state() if the newly enqueued callback
3178 * is the only one waiting for a grace period to complete. 3087 * is the only one waiting for a grace period to complete.
3179 */ 3088 */
3180 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 3089 if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
3090 rdp->qlen_last_fqs_check + qhimark)) {
3181 3091
3182 /* Are we ignoring a completed grace period? */ 3092 /* Are we ignoring a completed grace period? */
3183 note_gp_changes(rsp, rdp); 3093 note_gp_changes(rsp, rdp);
@@ -3195,10 +3105,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
3195 /* Give the grace period a kick. */ 3105 /* Give the grace period a kick. */
3196 rdp->blimit = LONG_MAX; 3106 rdp->blimit = LONG_MAX;
3197 if (rsp->n_force_qs == rdp->n_force_qs_snap && 3107 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
3198 *rdp->nxttail[RCU_DONE_TAIL] != head) 3108 rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
3199 force_quiescent_state(rsp); 3109 force_quiescent_state(rsp);
3200 rdp->n_force_qs_snap = rsp->n_force_qs; 3110 rdp->n_force_qs_snap = rsp->n_force_qs;
3201 rdp->qlen_last_fqs_check = rdp->qlen; 3111 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
3202 } 3112 }
3203 } 3113 }
3204} 3114}
@@ -3238,7 +3148,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
3238 rdp = this_cpu_ptr(rsp->rda); 3148 rdp = this_cpu_ptr(rsp->rda);
3239 3149
3240 /* Add the callback to our list. */ 3150 /* Add the callback to our list. */
3241 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { 3151 if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
3242 int offline; 3152 int offline;
3243 3153
3244 if (cpu != -1) 3154 if (cpu != -1)
@@ -3257,23 +3167,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
3257 */ 3167 */
3258 BUG_ON(cpu != -1); 3168 BUG_ON(cpu != -1);
3259 WARN_ON_ONCE(!rcu_is_watching()); 3169 WARN_ON_ONCE(!rcu_is_watching());
3260 if (!likely(rdp->nxtlist)) 3170 if (rcu_segcblist_empty(&rdp->cblist))
3261 init_default_callback_list(rdp); 3171 rcu_segcblist_init(&rdp->cblist);
3262 } 3172 }
3263 WRITE_ONCE(rdp->qlen, rdp->qlen + 1); 3173 rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
3264 if (lazy) 3174 if (!lazy)
3265 rdp->qlen_lazy++;
3266 else
3267 rcu_idle_count_callbacks_posted(); 3175 rcu_idle_count_callbacks_posted();
3268 smp_mb(); /* Count before adding callback for rcu_barrier(). */
3269 *rdp->nxttail[RCU_NEXT_TAIL] = head;
3270 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
3271 3176
3272 if (__is_kfree_rcu_offset((unsigned long)func)) 3177 if (__is_kfree_rcu_offset((unsigned long)func))
3273 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 3178 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
3274 rdp->qlen_lazy, rdp->qlen); 3179 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
3180 rcu_segcblist_n_cbs(&rdp->cblist));
3275 else 3181 else
3276 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); 3182 trace_rcu_callback(rsp->name, head,
3183 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
3184 rcu_segcblist_n_cbs(&rdp->cblist));
3277 3185
3278 /* Go handle any RCU core processing required. */ 3186 /* Go handle any RCU core processing required. */
3279 __call_rcu_core(rsp, rdp, head, flags); 3187 __call_rcu_core(rsp, rdp, head, flags);
@@ -3519,41 +3427,6 @@ void cond_synchronize_sched(unsigned long oldstate)
3519} 3427}
3520EXPORT_SYMBOL_GPL(cond_synchronize_sched); 3428EXPORT_SYMBOL_GPL(cond_synchronize_sched);
3521 3429
3522/* Adjust sequence number for start of update-side operation. */
3523static void rcu_seq_start(unsigned long *sp)
3524{
3525 WRITE_ONCE(*sp, *sp + 1);
3526 smp_mb(); /* Ensure update-side operation after counter increment. */
3527 WARN_ON_ONCE(!(*sp & 0x1));
3528}
3529
3530/* Adjust sequence number for end of update-side operation. */
3531static void rcu_seq_end(unsigned long *sp)
3532{
3533 smp_mb(); /* Ensure update-side operation before counter increment. */
3534 WRITE_ONCE(*sp, *sp + 1);
3535 WARN_ON_ONCE(*sp & 0x1);
3536}
3537
3538/* Take a snapshot of the update side's sequence number. */
3539static unsigned long rcu_seq_snap(unsigned long *sp)
3540{
3541 unsigned long s;
3542
3543 s = (READ_ONCE(*sp) + 3) & ~0x1;
3544 smp_mb(); /* Above access must not bleed into critical section. */
3545 return s;
3546}
3547
3548/*
3549 * Given a snapshot from rcu_seq_snap(), determine whether or not a
3550 * full update-side operation has occurred.
3551 */
3552static bool rcu_seq_done(unsigned long *sp, unsigned long s)
3553{
3554 return ULONG_CMP_GE(READ_ONCE(*sp), s);
3555}
3556
3557/* 3430/*
3558 * Check to see if there is any immediate RCU-related work to be done 3431 * Check to see if there is any immediate RCU-related work to be done
3559 * by the current CPU, for the specified type of RCU, returning 1 if so. 3432 * by the current CPU, for the specified type of RCU, returning 1 if so.
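These rcu_seq_*() helpers are not being dropped; elsewhere in this series they move into a shared header so the new SRCU code can reuse them, which is also where the rcu_seq_ctr() calls in tree_exp.h below come from. A stand-alone model of the counter convention they implement, assuming the same semantics as the code being moved (even means idle, odd means an update in flight):

/* Toy sequence counter; the real code adds memory barriers and READ_ONCE(). */
static unsigned long toy_seq;

static void toy_seq_start(void) { toy_seq++; }  /* counter becomes odd  */
static void toy_seq_end(void)   { toy_seq++; }  /* counter becomes even */

/*
 * Snapshot: the counter value at or beyond which a full update-side
 * operation, started no earlier than this call, is guaranteed to have
 * completed; mirrors (READ_ONCE(*sp) + 3) & ~0x1 above.
 */
static unsigned long toy_seq_snap(void)
{
        return (toy_seq + 3) & ~0x1UL;
}

static int toy_seq_done(unsigned long s)
{
        return toy_seq >= s;    /* real code uses ULONG_CMP_GE() for wrap */
}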
@@ -3577,7 +3450,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3577 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3450 /* Is the RCU core waiting for a quiescent state from this CPU? */
3578 if (rcu_scheduler_fully_active && 3451 if (rcu_scheduler_fully_active &&
3579 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && 3452 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
3580 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { 3453 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) {
3581 rdp->n_rp_core_needs_qs++; 3454 rdp->n_rp_core_needs_qs++;
3582 } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { 3455 } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
3583 rdp->n_rp_report_qs++; 3456 rdp->n_rp_report_qs++;
@@ -3585,7 +3458,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3585 } 3458 }
3586 3459
3587 /* Does this CPU have callbacks ready to invoke? */ 3460 /* Does this CPU have callbacks ready to invoke? */
3588 if (cpu_has_callbacks_ready_to_invoke(rdp)) { 3461 if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
3589 rdp->n_rp_cb_ready++; 3462 rdp->n_rp_cb_ready++;
3590 return 1; 3463 return 1;
3591 } 3464 }
@@ -3649,10 +3522,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
3649 3522
3650 for_each_rcu_flavor(rsp) { 3523 for_each_rcu_flavor(rsp) {
3651 rdp = this_cpu_ptr(rsp->rda); 3524 rdp = this_cpu_ptr(rsp->rda);
3652 if (!rdp->nxtlist) 3525 if (rcu_segcblist_empty(&rdp->cblist))
3653 continue; 3526 continue;
3654 hc = true; 3527 hc = true;
3655 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { 3528 if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) {
3656 al = false; 3529 al = false;
3657 break; 3530 break;
3658 } 3531 }
@@ -3761,7 +3634,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3761 __call_rcu(&rdp->barrier_head, 3634 __call_rcu(&rdp->barrier_head,
3762 rcu_barrier_callback, rsp, cpu, 0); 3635 rcu_barrier_callback, rsp, cpu, 0);
3763 } 3636 }
3764 } else if (READ_ONCE(rdp->qlen)) { 3637 } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
3765 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3638 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
3766 rsp->barrier_sequence); 3639 rsp->barrier_sequence);
3767 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 3640 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -3870,8 +3743,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3870 rdp->qlen_last_fqs_check = 0; 3743 rdp->qlen_last_fqs_check = 0;
3871 rdp->n_force_qs_snap = rsp->n_force_qs; 3744 rdp->n_force_qs_snap = rsp->n_force_qs;
3872 rdp->blimit = blimit; 3745 rdp->blimit = blimit;
3873 if (!rdp->nxtlist) 3746 if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
3874 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 3747 !init_nocb_callback_list(rdp))
3748 rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
3875 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3749 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
3876 rcu_sysidle_init_percpu_data(rdp->dynticks); 3750 rcu_sysidle_init_percpu_data(rdp->dynticks);
3877 rcu_dynticks_eqs_online(); 3751 rcu_dynticks_eqs_online();
@@ -3890,12 +3764,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3890 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ 3764 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
3891 rdp->completed = rnp->completed; 3765 rdp->completed = rnp->completed;
3892 rdp->cpu_no_qs.b.norm = true; 3766 rdp->cpu_no_qs.b.norm = true;
3893 rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); 3767 rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
3894 rdp->core_needs_qs = false; 3768 rdp->core_needs_qs = false;
3895 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 3769 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3896 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3770 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3897} 3771}
3898 3772
3773/*
3774 * Invoked early in the CPU-online process, when pretty much all
3775 * services are available. The incoming CPU is not present.
3776 */
3899int rcutree_prepare_cpu(unsigned int cpu) 3777int rcutree_prepare_cpu(unsigned int cpu)
3900{ 3778{
3901 struct rcu_state *rsp; 3779 struct rcu_state *rsp;
@@ -3909,6 +3787,9 @@ int rcutree_prepare_cpu(unsigned int cpu)
3909 return 0; 3787 return 0;
3910} 3788}
3911 3789
3790/*
3791 * Update RCU priority boot kthread affinity for CPU-hotplug changes.
3792 */
3912static void rcutree_affinity_setting(unsigned int cpu, int outgoing) 3793static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
3913{ 3794{
3914 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); 3795 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
@@ -3916,20 +3797,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
3916 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); 3797 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
3917} 3798}
3918 3799
3800/*
3801 * Near the end of the CPU-online process. Pretty much all services
3802 * enabled, and the CPU is now very much alive.
3803 */
3919int rcutree_online_cpu(unsigned int cpu) 3804int rcutree_online_cpu(unsigned int cpu)
3920{ 3805{
3921 sync_sched_exp_online_cleanup(cpu); 3806 sync_sched_exp_online_cleanup(cpu);
3922 rcutree_affinity_setting(cpu, -1); 3807 rcutree_affinity_setting(cpu, -1);
3808 if (IS_ENABLED(CONFIG_TREE_SRCU))
3809 srcu_online_cpu(cpu);
3923 return 0; 3810 return 0;
3924} 3811}
3925 3812
3813/*
3814 * Near the beginning of the process. The CPU is still very much alive
3815 * with pretty much all services enabled.
3816 */
3926int rcutree_offline_cpu(unsigned int cpu) 3817int rcutree_offline_cpu(unsigned int cpu)
3927{ 3818{
3928 rcutree_affinity_setting(cpu, cpu); 3819 rcutree_affinity_setting(cpu, cpu);
3820 if (IS_ENABLED(CONFIG_TREE_SRCU))
3821 srcu_offline_cpu(cpu);
3929 return 0; 3822 return 0;
3930} 3823}
3931 3824
3932 3825/*
3826 * Near the end of the offline process. We do only tracing here.
3827 */
3933int rcutree_dying_cpu(unsigned int cpu) 3828int rcutree_dying_cpu(unsigned int cpu)
3934{ 3829{
3935 struct rcu_state *rsp; 3830 struct rcu_state *rsp;
@@ -3939,6 +3834,9 @@ int rcutree_dying_cpu(unsigned int cpu)
3939 return 0; 3834 return 0;
3940} 3835}
3941 3836
3837/*
3838 * The outgoing CPU is gone and we are running elsewhere.
3839 */
3942int rcutree_dead_cpu(unsigned int cpu) 3840int rcutree_dead_cpu(unsigned int cpu)
3943{ 3841{
3944 struct rcu_state *rsp; 3842 struct rcu_state *rsp;
@@ -3956,6 +3854,10 @@ int rcutree_dead_cpu(unsigned int cpu)
3956 * incoming CPUs are not allowed to use RCU read-side critical sections 3854 * incoming CPUs are not allowed to use RCU read-side critical sections
3957 * until this function is called. Failing to observe this restriction 3855 * until this function is called. Failing to observe this restriction
3958 * will result in lockdep splats. 3856 * will result in lockdep splats.
3857 *
3858 * Note that this function is special in that it is invoked directly
3859 * from the incoming CPU rather than from the cpuhp_step mechanism.
3860 * This is because this function must be invoked at a precise location.
3959 */ 3861 */
3960void rcu_cpu_starting(unsigned int cpu) 3862void rcu_cpu_starting(unsigned int cpu)
3961{ 3863{
@@ -3981,9 +3883,6 @@ void rcu_cpu_starting(unsigned int cpu)
3981 * The CPU is exiting the idle loop into the arch_cpu_idle_dead() 3883 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
3982 * function. We now remove it from the rcu_node tree's ->qsmaskinit 3884 * function. We now remove it from the rcu_node tree's ->qsmaskinit
3983 * bit masks. 3885 * bit masks.
3984 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
3985 * function. We now remove it from the rcu_node tree's ->qsmaskinit
3986 * bit masks.
3987 */ 3886 */
3988static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) 3887static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
3989{ 3888{
@@ -3999,6 +3898,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
3999 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3898 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4000} 3899}
4001 3900
3901/*
 3902 * The outgoing CPU has no further need of RCU, so remove it from
3903 * the list of CPUs that RCU must track.
3904 *
3905 * Note that this function is special in that it is invoked directly
3906 * from the outgoing CPU rather than from the cpuhp_step mechanism.
3907 * This is because this function must be invoked at a precise location.
3908 */
4002void rcu_report_dead(unsigned int cpu) 3909void rcu_report_dead(unsigned int cpu)
4003{ 3910{
4004 struct rcu_state *rsp; 3911 struct rcu_state *rsp;
@@ -4013,6 +3920,10 @@ void rcu_report_dead(unsigned int cpu)
4013} 3920}
4014#endif 3921#endif
4015 3922
3923/*
3924 * On non-huge systems, use expedited RCU grace periods to make suspend
3925 * and hibernation run faster.
3926 */
4016static int rcu_pm_notify(struct notifier_block *self, 3927static int rcu_pm_notify(struct notifier_block *self,
4017 unsigned long action, void *hcpu) 3928 unsigned long action, void *hcpu)
4018{ 3929{
@@ -4083,7 +3994,7 @@ early_initcall(rcu_spawn_gp_kthread);
4083 * task is booting the system, and such primitives are no-ops). After this 3994 * task is booting the system, and such primitives are no-ops). After this
4084 * function is called, any synchronous grace-period primitives are run as 3995 * function is called, any synchronous grace-period primitives are run as
4085 * expedited, with the requesting task driving the grace period forward. 3996 * expedited, with the requesting task driving the grace period forward.
4086 * A later core_initcall() rcu_exp_runtime_mode() will switch to full 3997 * A later core_initcall() rcu_set_runtime_mode() will switch to full
4087 * runtime RCU functionality. 3998 * runtime RCU functionality.
4088 */ 3999 */
4089void rcu_scheduler_starting(void) 4000void rcu_scheduler_starting(void)
@@ -4096,31 +4007,6 @@ void rcu_scheduler_starting(void)
4096} 4007}
4097 4008
4098/* 4009/*
4099 * Compute the per-level fanout, either using the exact fanout specified
4100 * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
4101 */
4102static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
4103{
4104 int i;
4105
4106 if (rcu_fanout_exact) {
4107 levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
4108 for (i = rcu_num_lvls - 2; i >= 0; i--)
4109 levelspread[i] = RCU_FANOUT;
4110 } else {
4111 int ccur;
4112 int cprv;
4113
4114 cprv = nr_cpu_ids;
4115 for (i = rcu_num_lvls - 1; i >= 0; i--) {
4116 ccur = levelcnt[i];
4117 levelspread[i] = (cprv + ccur - 1) / ccur;
4118 cprv = ccur;
4119 }
4120 }
4121}
4122
4123/*
4124 * Helper function for rcu_init() that initializes one rcu_state structure. 4010 * Helper function for rcu_init() that initializes one rcu_state structure.
4125 */ 4011 */
4126static void __init rcu_init_one(struct rcu_state *rsp) 4012static void __init rcu_init_one(struct rcu_state *rsp)
@@ -4129,9 +4015,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
4129 static const char * const fqs[] = RCU_FQS_NAME_INIT; 4015 static const char * const fqs[] = RCU_FQS_NAME_INIT;
4130 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 4016 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
4131 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 4017 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
4132 static u8 fl_mask = 0x1;
4133 4018
4134 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
4135 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 4019 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
4136 int cpustride = 1; 4020 int cpustride = 1;
4137 int i; 4021 int i;
@@ -4146,20 +4030,16 @@ static void __init rcu_init_one(struct rcu_state *rsp)
4146 4030
4147 /* Initialize the level-tracking arrays. */ 4031 /* Initialize the level-tracking arrays. */
4148 4032
4149 for (i = 0; i < rcu_num_lvls; i++)
4150 levelcnt[i] = num_rcu_lvl[i];
4151 for (i = 1; i < rcu_num_lvls; i++) 4033 for (i = 1; i < rcu_num_lvls; i++)
4152 rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; 4034 rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1];
4153 rcu_init_levelspread(levelspread, levelcnt); 4035 rcu_init_levelspread(levelspread, num_rcu_lvl);
4154 rsp->flavor_mask = fl_mask;
4155 fl_mask <<= 1;
4156 4036
4157 /* Initialize the elements themselves, starting from the leaves. */ 4037 /* Initialize the elements themselves, starting from the leaves. */
4158 4038
4159 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4039 for (i = rcu_num_lvls - 1; i >= 0; i--) {
4160 cpustride *= levelspread[i]; 4040 cpustride *= levelspread[i];
4161 rnp = rsp->level[i]; 4041 rnp = rsp->level[i];
4162 for (j = 0; j < levelcnt[i]; j++, rnp++) { 4042 for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
4163 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); 4043 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
4164 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), 4044 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
4165 &rcu_node_class[i], buf[i]); 4045 &rcu_node_class[i], buf[i]);
@@ -4332,6 +4212,8 @@ void __init rcu_init(void)
4332 for_each_online_cpu(cpu) { 4212 for_each_online_cpu(cpu) {
4333 rcutree_prepare_cpu(cpu); 4213 rcutree_prepare_cpu(cpu);
4334 rcu_cpu_starting(cpu); 4214 rcu_cpu_starting(cpu);
4215 if (IS_ENABLED(CONFIG_TREE_SRCU))
4216 srcu_online_cpu(cpu);
4335 } 4217 }
4336} 4218}
4337 4219
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ec62a05bfdb3..0e598ab08fea 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -30,80 +30,8 @@
30#include <linux/seqlock.h> 30#include <linux/seqlock.h>
31#include <linux/swait.h> 31#include <linux/swait.h>
32#include <linux/stop_machine.h> 32#include <linux/stop_machine.h>
33 33#include <linux/rcu_segcblist.h>
34/* 34#include <linux/rcu_node_tree.h>
35 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
36 * CONFIG_RCU_FANOUT_LEAF.
37 * In theory, it should be possible to add more levels straightforwardly.
38 * In practice, this did work well going from three levels to four.
39 * Of course, your mileage may vary.
40 */
41
42#ifdef CONFIG_RCU_FANOUT
43#define RCU_FANOUT CONFIG_RCU_FANOUT
44#else /* #ifdef CONFIG_RCU_FANOUT */
45# ifdef CONFIG_64BIT
46# define RCU_FANOUT 64
47# else
48# define RCU_FANOUT 32
49# endif
50#endif /* #else #ifdef CONFIG_RCU_FANOUT */
51
52#ifdef CONFIG_RCU_FANOUT_LEAF
53#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
54#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
55# ifdef CONFIG_64BIT
56# define RCU_FANOUT_LEAF 64
57# else
58# define RCU_FANOUT_LEAF 32
59# endif
60#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
61
62#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
63#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
64#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
65#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
66
67#if NR_CPUS <= RCU_FANOUT_1
68# define RCU_NUM_LVLS 1
69# define NUM_RCU_LVL_0 1
70# define NUM_RCU_NODES NUM_RCU_LVL_0
71# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
72# define RCU_NODE_NAME_INIT { "rcu_node_0" }
73# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
74#elif NR_CPUS <= RCU_FANOUT_2
75# define RCU_NUM_LVLS 2
76# define NUM_RCU_LVL_0 1
77# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
78# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
79# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
80# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
81# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
82#elif NR_CPUS <= RCU_FANOUT_3
83# define RCU_NUM_LVLS 3
84# define NUM_RCU_LVL_0 1
85# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
86# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
87# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
88# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
89# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
90# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
91#elif NR_CPUS <= RCU_FANOUT_4
92# define RCU_NUM_LVLS 4
93# define NUM_RCU_LVL_0 1
94# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
95# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
96# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
97# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
98# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
99# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
100# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
101#else
102# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
103#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
104
105extern int rcu_num_lvls;
106extern int rcu_num_nodes;
107 35
108/* 36/*
109 * Dynticks per-CPU state. 37 * Dynticks per-CPU state.
@@ -113,6 +41,9 @@ struct rcu_dynticks {
113 /* Process level is worth LLONG_MAX/2. */ 41 /* Process level is worth LLONG_MAX/2. */
114 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 42 int dynticks_nmi_nesting; /* Track NMI nesting level. */
115 atomic_t dynticks; /* Even value for idle, else odd. */ 43 atomic_t dynticks; /* Even value for idle, else odd. */
44 bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
45 unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
46 bool rcu_urgent_qs; /* GP old need light quiescent state. */
116#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 47#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
117 long long dynticks_idle_nesting; 48 long long dynticks_idle_nesting;
118 /* irq/process nesting level from idle. */ 49 /* irq/process nesting level from idle. */
@@ -262,41 +193,6 @@ struct rcu_node {
262#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) 193#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo))
263 194
264/* 195/*
265 * Do a full breadth-first scan of the rcu_node structures for the
266 * specified rcu_state structure.
267 */
268#define rcu_for_each_node_breadth_first(rsp, rnp) \
269 for ((rnp) = &(rsp)->node[0]; \
270 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
271
272/*
273 * Do a breadth-first scan of the non-leaf rcu_node structures for the
274 * specified rcu_state structure. Note that if there is a singleton
275 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
276 */
277#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
278 for ((rnp) = &(rsp)->node[0]; \
279 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
280
281/*
282 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
283 * structure. Note that if there is a singleton rcu_node tree with but
284 * one rcu_node structure, this loop -will- visit the rcu_node structure.
285 * It is still a leaf node, even if it is also the root node.
286 */
287#define rcu_for_each_leaf_node(rsp, rnp) \
288 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
289 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
290
291/*
292 * Iterate over all possible CPUs in a leaf RCU node.
293 */
294#define for_each_leaf_node_possible_cpu(rnp, cpu) \
295 for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
296 cpu <= rnp->grphi; \
297 cpu = cpumask_next((cpu), cpu_possible_mask))
298
299/*
300 * Union to allow "aggregate OR" operation on the need for a quiescent 196 * Union to allow "aggregate OR" operation on the need for a quiescent
301 * state by the normal and expedited grace periods. 197 * state by the normal and expedited grace periods.
302 */ 198 */
@@ -336,34 +232,9 @@ struct rcu_data {
336 /* period it is aware of. */ 232 /* period it is aware of. */
337 233
338 /* 2) batch handling */ 234 /* 2) batch handling */
339 /* 235 struct rcu_segcblist cblist; /* Segmented callback list, with */
340 * If nxtlist is not NULL, it is partitioned as follows. 236 /* different callbacks waiting for */
341 * Any of the partitions might be empty, in which case the 237 /* different grace periods. */
342 * pointer to that partition will be equal to the pointer for
343 * the following partition. When the list is empty, all of
344 * the nxttail elements point to the ->nxtlist pointer itself,
345 * which in that case is NULL.
346 *
347 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
348 * Entries that batch # <= ->completed
349 * The grace period for these entries has completed, and
350 * the other grace-period-completed entries may be moved
351 * here temporarily in rcu_process_callbacks().
352 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
353 * Entries that batch # <= ->completed - 1: waiting for current GP
354 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
355 * Entries known to have arrived before current GP ended
356 * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
357 * Entries that might have arrived after current GP ended
358 * Note that the value of *nxttail[RCU_NEXT_TAIL] will
359 * always be NULL, as this is the end of the list.
360 */
361 struct rcu_head *nxtlist;
362 struct rcu_head **nxttail[RCU_NEXT_SIZE];
363 unsigned long nxtcompleted[RCU_NEXT_SIZE];
364 /* grace periods for sublists. */
365 long qlen_lazy; /* # of lazy queued callbacks */
366 long qlen; /* # of queued callbacks, incl lazy */
367 long qlen_last_fqs_check; 238 long qlen_last_fqs_check;
368 /* qlen at last check for QS forcing */ 239 /* qlen at last check for QS forcing */
369 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 240 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
@@ -482,7 +353,6 @@ struct rcu_state {
482 struct rcu_node *level[RCU_NUM_LVLS + 1]; 353 struct rcu_node *level[RCU_NUM_LVLS + 1];
483 /* Hierarchy levels (+1 to */ 354 /* Hierarchy levels (+1 to */
484 /* shut bogus gcc warning) */ 355 /* shut bogus gcc warning) */
485 u8 flavor_mask; /* bit in flavor mask. */
 486 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ 356 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
487 call_rcu_func_t call; /* call_rcu() flavor. */ 357 call_rcu_func_t call; /* call_rcu() flavor. */
488 int ncpus; /* # CPUs seen so far. */ 358 int ncpus; /* # CPUs seen so far. */
@@ -502,14 +372,11 @@ struct rcu_state {
502 372
503 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; 373 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
504 /* Protect following fields. */ 374 /* Protect following fields. */
505 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 375 struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
506 /* need a grace period. */ 376 /* need a grace period. */
507 struct rcu_head **orphan_nxttail; /* Tail of above. */ 377 struct rcu_cblist orphan_done; /* Orphaned callbacks that */
508 struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
509 /* are ready to invoke. */ 378 /* are ready to invoke. */
510 struct rcu_head **orphan_donetail; /* Tail of above. */ 379 /* (Contains counts.) */
511 long qlen_lazy; /* Number of lazy callbacks. */
512 long qlen; /* Total number of callbacks. */
513 /* End of fields guarded by orphan_lock. */ 380 /* End of fields guarded by orphan_lock. */
514 381
515 struct mutex barrier_mutex; /* Guards barrier fields. */ 382 struct mutex barrier_mutex; /* Guards barrier fields. */
@@ -596,6 +463,7 @@ extern struct rcu_state rcu_preempt_state;
596#endif /* #ifdef CONFIG_PREEMPT_RCU */ 463#endif /* #ifdef CONFIG_PREEMPT_RCU */
597 464
598int rcu_dynticks_snap(struct rcu_dynticks *rdtp); 465int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
466bool rcu_eqs_special_set(int cpu);
599 467
600#ifdef CONFIG_RCU_BOOST 468#ifdef CONFIG_RCU_BOOST
601DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 469DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -673,6 +541,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
673static void rcu_dynticks_task_enter(void); 541static void rcu_dynticks_task_enter(void);
674static void rcu_dynticks_task_exit(void); 542static void rcu_dynticks_task_exit(void);
675 543
544#ifdef CONFIG_SRCU
545void srcu_online_cpu(unsigned int cpu);
546void srcu_offline_cpu(unsigned int cpu);
547#else /* #ifdef CONFIG_SRCU */
548void srcu_online_cpu(unsigned int cpu) { }
549void srcu_offline_cpu(unsigned int cpu) { }
550#endif /* #else #ifdef CONFIG_SRCU */
551
676#endif /* #ifndef RCU_TREE_NONCORE */ 552#endif /* #ifndef RCU_TREE_NONCORE */
677 553
678#ifdef CONFIG_RCU_TRACE 554#ifdef CONFIG_RCU_TRACE
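The long nxtlist/nxttail/nxtcompleted comment deleted above is now embodied by struct rcu_segcblist from the newly added <linux/rcu_segcblist.h>. Conceptually it is still one singly linked list carved into the same four segments by an array of tail pointers, with a grace-period tag per segment and the callback counts kept alongside; roughly the following shape, written here with toy_* names rather than the exact kernel definition:

#define TOY_NSEGS 4                     /* DONE, WAIT, NEXT_READY, NEXT */

struct toy_head {                       /* stand-in for struct rcu_head */
        struct toy_head *next;
        void (*func)(struct toy_head *);
};

struct toy_segcblist {
        struct toy_head *head;              /* all callbacks, oldest first      */
        struct toy_head **tails[TOY_NSEGS]; /* tails[i] closes segment i        */
        unsigned long gp_seq[TOY_NSEGS];    /* GP number segment i waits on     */
        long len;                           /* total callbacks, all segments    */
        long len_lazy;                      /* how many of those are kfree-lazy */
};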
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index a7b639ccd46e..e513b4ab1197 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
292 trace_rcu_exp_funnel_lock(rsp->name, rnp->level, 292 trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
293 rnp->grplo, rnp->grphi, 293 rnp->grplo, rnp->grphi,
294 TPS("wait")); 294 TPS("wait"));
295 wait_event(rnp->exp_wq[(s >> 1) & 0x3], 295 wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
296 sync_exp_work_done(rsp, 296 sync_exp_work_done(rsp,
297 &rdp->exp_workdone2, s)); 297 &rdp->exp_workdone2, s));
298 return true; 298 return true;
@@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data)
331 return; 331 return;
332 } 332 }
333 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); 333 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
334 /* Store .exp before .rcu_urgent_qs. */
335 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
334 resched_cpu(smp_processor_id()); 336 resched_cpu(smp_processor_id());
335} 337}
336 338
@@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
531 rnp->exp_seq_rq = s; 533 rnp->exp_seq_rq = s;
532 spin_unlock(&rnp->exp_lock); 534 spin_unlock(&rnp->exp_lock);
533 } 535 }
534 wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); 536 smp_mb(); /* All above changes before wakeup. */
537 wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]);
535 } 538 }
536 trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); 539 trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
537 mutex_unlock(&rsp->exp_wake_mutex); 540 mutex_unlock(&rsp->exp_wake_mutex);
@@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
609 /* Wait for expedited grace period to complete. */ 612 /* Wait for expedited grace period to complete. */
610 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); 613 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
611 rnp = rcu_get_root(rsp); 614 rnp = rcu_get_root(rsp);
612 wait_event(rnp->exp_wq[(s >> 1) & 0x3], 615 wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
613 sync_exp_work_done(rsp, 616 sync_exp_work_done(rsp, &rdp->exp_workdone0, s));
614 &rdp->exp_workdone0, s)); 617 smp_mb(); /* Workqueue actions happen before return. */
615 618
616 /* Let the next expedited grace period start. */ 619 /* Let the next expedited grace period start. */
617 mutex_unlock(&rsp->exp_mutex); 620 mutex_unlock(&rsp->exp_mutex);
@@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void)
735EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 738EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
736 739
737#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 740#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
738
739/*
740 * Switch to run-time mode once Tree RCU has fully initialized.
741 */
742static int __init rcu_exp_runtime_mode(void)
743{
744 rcu_test_sync_prims();
745 rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
746 rcu_test_sync_prims();
747 return 0;
748}
749core_initcall(rcu_exp_runtime_mode);
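
The tree_exp.h hunks above replace the open-coded (s >> 1) & 0x3 with rcu_seq_ctr(s) & 0x3 when selecting one of the four per-rcu_node expedited wait queues, and add barriers so workqueue-side state is visible before waiters are woken. A rough sketch of the sequence-number convention these hunks rely on, with illustrative names rather than the kernel's helpers:

/*
 * Illustration only: the low bit of an expedited sequence number marks a
 * grace period in progress, so the counter portion is the value with that
 * bit stripped -- at this point in the series the same arithmetic as the
 * old open-coded (s >> 1).
 */
static inline unsigned long example_seq_ctr(unsigned long s)
{
	return s >> 1;			/* counter portion of the sequence */
}

static inline int example_exp_wq_index(unsigned long s)
{
	return example_seq_ctr(s) & 0x3;	/* one of four wait queues */
}

Hiding the extraction behind one helper lets the encoding change later without touching every wait_event()/wake_up_all() site.
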
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a62a8f1caac..7f1d677a2a25 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1350,10 +1350,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1350 */ 1350 */
1351 if ((rdp->completed != rnp->completed || 1351 if ((rdp->completed != rnp->completed ||
1352 unlikely(READ_ONCE(rdp->gpwrap))) && 1352 unlikely(READ_ONCE(rdp->gpwrap))) &&
1353 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1353 rcu_segcblist_pend_cbs(&rdp->cblist))
1354 note_gp_changes(rsp, rdp); 1354 note_gp_changes(rsp, rdp);
1355 1355
1356 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1356 if (rcu_segcblist_ready_cbs(&rdp->cblist))
1357 cbs_ready = true; 1357 cbs_ready = true;
1358 } 1358 }
1359 return cbs_ready; 1359 return cbs_ready;
@@ -1461,7 +1461,7 @@ static void rcu_prepare_for_idle(void)
1461 rdtp->last_accelerate = jiffies; 1461 rdtp->last_accelerate = jiffies;
1462 for_each_rcu_flavor(rsp) { 1462 for_each_rcu_flavor(rsp) {
1463 rdp = this_cpu_ptr(rsp->rda); 1463 rdp = this_cpu_ptr(rsp->rda);
1464 if (!*rdp->nxttail[RCU_DONE_TAIL]) 1464 if (rcu_segcblist_pend_cbs(&rdp->cblist))
1465 continue; 1465 continue;
1466 rnp = rdp->mynode; 1466 rnp = rdp->mynode;
1467 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 1467 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
@@ -1529,7 +1529,7 @@ static void rcu_oom_notify_cpu(void *unused)
1529 1529
1530 for_each_rcu_flavor(rsp) { 1530 for_each_rcu_flavor(rsp) {
1531 rdp = raw_cpu_ptr(rsp->rda); 1531 rdp = raw_cpu_ptr(rsp->rda);
1532 if (rdp->qlen_lazy != 0) { 1532 if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) {
1533 atomic_inc(&oom_callback_count); 1533 atomic_inc(&oom_callback_count);
1534 rsp->call(&rdp->oom_head, rcu_oom_callback); 1534 rsp->call(&rdp->oom_head, rcu_oom_callback);
1535 } 1535 }
@@ -1709,7 +1709,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup);
1709 1709
1710static int __init parse_rcu_nocb_poll(char *arg) 1710static int __init parse_rcu_nocb_poll(char *arg)
1711{ 1711{
1712 rcu_nocb_poll = 1; 1712 rcu_nocb_poll = true;
1713 return 0; 1713 return 0;
1714} 1714}
1715early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 1715early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
@@ -1860,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1860 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1860 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1861 TPS("WakeEmpty")); 1861 TPS("WakeEmpty"));
1862 } else { 1862 } else {
1863 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; 1863 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE);
1864 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
1865 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1864 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1866 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1865 TPS("WakeEmptyIsDeferred")); 1867 TPS("WakeEmptyIsDeferred"));
1866 } 1868 }
@@ -1872,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1872 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1874 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1873 TPS("WakeOvf")); 1875 TPS("WakeOvf"));
1874 } else { 1876 } else {
1875 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; 1877 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE);
1878 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
1879 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1876 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1880 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1877 TPS("WakeOvfIsDeferred")); 1881 TPS("WakeOvfIsDeferred"));
1878 } 1882 }
@@ -1930,30 +1934,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
1930 struct rcu_data *rdp, 1934 struct rcu_data *rdp,
1931 unsigned long flags) 1935 unsigned long flags)
1932{ 1936{
1933 long ql = rsp->qlen; 1937 long ql = rcu_cblist_n_cbs(&rsp->orphan_done);
1934 long qll = rsp->qlen_lazy; 1938 long qll = rcu_cblist_n_lazy_cbs(&rsp->orphan_done);
1935 1939
1936 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 1940 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
1937 if (!rcu_is_nocb_cpu(smp_processor_id())) 1941 if (!rcu_is_nocb_cpu(smp_processor_id()))
1938 return false; 1942 return false;
1939 rsp->qlen = 0;
1940 rsp->qlen_lazy = 0;
1941 1943
1942 /* First, enqueue the donelist, if any. This preserves CB ordering. */ 1944 /* First, enqueue the donelist, if any. This preserves CB ordering. */
1943 if (rsp->orphan_donelist != NULL) { 1945 if (!rcu_cblist_empty(&rsp->orphan_done)) {
1944 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, 1946 __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
1945 rsp->orphan_donetail, ql, qll, flags); 1947 rcu_cblist_tail(&rsp->orphan_done),
1946 ql = qll = 0; 1948 ql, qll, flags);
1947 rsp->orphan_donelist = NULL;
1948 rsp->orphan_donetail = &rsp->orphan_donelist;
1949 } 1949 }
1950 if (rsp->orphan_nxtlist != NULL) { 1950 if (!rcu_cblist_empty(&rsp->orphan_pend)) {
1951 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, 1951 __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
1952 rsp->orphan_nxttail, ql, qll, flags); 1952 rcu_cblist_tail(&rsp->orphan_pend),
1953 ql = qll = 0; 1953 ql, qll, flags);
1954 rsp->orphan_nxtlist = NULL;
1955 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1956 } 1954 }
1955 rcu_cblist_init(&rsp->orphan_done);
1956 rcu_cblist_init(&rsp->orphan_pend);
1957 return true; 1957 return true;
1958} 1958}
1959 1959
@@ -2395,16 +2395,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2395 return false; 2395 return false;
2396 2396
2397 /* If there are early-boot callbacks, move them to nocb lists. */ 2397 /* If there are early-boot callbacks, move them to nocb lists. */
2398 if (rdp->nxtlist) { 2398 if (!rcu_segcblist_empty(&rdp->cblist)) {
2399 rdp->nocb_head = rdp->nxtlist; 2399 rdp->nocb_head = rcu_segcblist_head(&rdp->cblist);
2400 rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; 2400 rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist);
2401 atomic_long_set(&rdp->nocb_q_count, rdp->qlen); 2401 atomic_long_set(&rdp->nocb_q_count,
2402 atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); 2402 rcu_segcblist_n_cbs(&rdp->cblist));
2403 rdp->nxtlist = NULL; 2403 atomic_long_set(&rdp->nocb_q_count_lazy,
2404 rdp->qlen = 0; 2404 rcu_segcblist_n_lazy_cbs(&rdp->cblist));
2405 rdp->qlen_lazy = 0; 2405 rcu_segcblist_init(&rdp->cblist);
2406 } 2406 }
2407 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2407 rcu_segcblist_disable(&rdp->cblist);
2408 return true; 2408 return true;
2409} 2409}
2410 2410
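
Most of the tree_plugin.h hunks above are conversions from the old open-coded ->nxtlist/->nxttail[]/->qlen fields to the rcu_cblist/rcu_segcblist accessors (rcu_segcblist_pend_cbs(), rcu_segcblist_ready_cbs(), rcu_cblist_head(), rcu_cblist_n_cbs(), and so on). As a rough mental model only, assuming nothing beyond what the hunks show, a callback list of this style reduces to a head pointer, a tail pointer-to-pointer, and a count:

#include <stdbool.h>
#include <stddef.h>

/* Toy list, not the kernel's rcu_cblist (see include/linux/rcu_segcblist.h). */
struct toy_head {
	struct toy_head *next;
};

struct toy_cblist {
	struct toy_head *head;
	struct toy_head **tail;		/* always points at the final ->next slot */
	long len;
};

static void toy_cblist_init(struct toy_cblist *clp)
{
	clp->head = NULL;
	clp->tail = &clp->head;
	clp->len = 0;
}

static bool toy_cblist_empty(const struct toy_cblist *clp)
{
	return clp->head == NULL;
}

static void toy_cblist_enqueue(struct toy_cblist *clp, struct toy_head *thp)
{
	thp->next = NULL;
	*clp->tail = thp;		/* append at the tail in O(1) */
	clp->tail = &thp->next;
	clp->len++;
}

The segmented variant partitions one such list with extra tail pointers (RCU_DONE_TAIL, RCU_WAIT_TAIL, RCU_NEXT_READY_TAIL, RCU_NEXT_TAIL, as the tree_trace.c hunk below prints), so callbacks advance between grace-period stages without being relinked.
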
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 8751a748499a..30c5bf89ee58 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -41,11 +41,11 @@
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/debugfs.h> 42#include <linux/debugfs.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/prefetch.h>
44 45
45#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
46#include "tree.h" 47#include "tree.h"
47 48#include "rcu.h"
48DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
49 49
50static int r_open(struct inode *inode, struct file *file, 50static int r_open(struct inode *inode, struct file *file,
51 const struct seq_operations *op) 51 const struct seq_operations *op)
@@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
121 cpu_is_offline(rdp->cpu) ? '!' : ' ', 121 cpu_is_offline(rdp->cpu) ? '!' : ' ',
122 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 122 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
123 rdp->cpu_no_qs.b.norm, 123 rdp->cpu_no_qs.b.norm,
124 rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), 124 rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu),
125 rdp->core_needs_qs); 125 rdp->core_needs_qs);
126 seq_printf(m, " dt=%d/%llx/%d df=%lu", 126 seq_printf(m, " dt=%d/%llx/%d df=%lu",
127 rcu_dynticks_snap(rdp->dynticks), 127 rcu_dynticks_snap(rdp->dynticks),
@@ -130,17 +130,15 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
130 rdp->dynticks_fqs); 130 rdp->dynticks_fqs);
131 seq_printf(m, " of=%lu", rdp->offline_fqs); 131 seq_printf(m, " of=%lu", rdp->offline_fqs);
132 rcu_nocb_q_lengths(rdp, &ql, &qll); 132 rcu_nocb_q_lengths(rdp, &ql, &qll);
133 qll += rdp->qlen_lazy; 133 qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist);
134 ql += rdp->qlen; 134 ql += rcu_segcblist_n_cbs(&rdp->cblist);
135 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", 135 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
136 qll, ql, 136 qll, ql,
137 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 137 ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)],
138 rdp->nxttail[RCU_NEXT_TAIL]], 138 ".R"[!rcu_segcblist_segempty(&rdp->cblist,
139 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 139 RCU_NEXT_READY_TAIL)],
140 rdp->nxttail[RCU_NEXT_READY_TAIL]], 140 ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)],
141 ".W"[rdp->nxttail[RCU_DONE_TAIL] != 141 ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]);
142 rdp->nxttail[RCU_WAIT_TAIL]],
143 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
144#ifdef CONFIG_RCU_BOOST 142#ifdef CONFIG_RCU_BOOST
145 seq_printf(m, " kt=%d/%c ktl=%x", 143 seq_printf(m, " kt=%d/%c ktl=%x",
146 per_cpu(rcu_cpu_has_work, rdp->cpu), 144 per_cpu(rcu_cpu_has_work, rdp->cpu),
@@ -278,7 +276,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
278 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 276 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
279 rsp->n_force_qs, rsp->n_force_qs_ngp, 277 rsp->n_force_qs, rsp->n_force_qs_ngp,
280 rsp->n_force_qs - rsp->n_force_qs_ngp, 278 rsp->n_force_qs - rsp->n_force_qs_ngp,
281 READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); 279 READ_ONCE(rsp->n_force_qs_lh),
280 rcu_cblist_n_lazy_cbs(&rsp->orphan_done),
281 rcu_cblist_n_cbs(&rsp->orphan_done));
282 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { 282 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
283 if (rnp->level != level) { 283 if (rnp->level != level) {
284 seq_puts(m, "\n"); 284 seq_puts(m, "\n");
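
The tree_trace.c hunk keeps the ".N"[cond] trick while switching the condition to rcu_segcblist_segempty(): indexing a two-character string literal with a 0/1 value selects either '.' or the flag letter. A standalone user-space illustration of that idiom, unrelated to the kernel code itself:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	bool next_ready = true;
	bool waiting = false;

	/* true picks index 1 (the letter), false picks index 0 ('.') */
	printf("%c%c\n", ".R"[next_ready], ".W"[waiting]);	/* prints "R." */
	return 0;
}
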
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 55c8530316c7..273e869ca21d 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
124 * non-expedited counterparts? Intended for use within RCU. Note 124 * non-expedited counterparts? Intended for use within RCU. Note
125 * that if the user specifies both rcu_expedited and rcu_normal, then 125 * that if the user specifies both rcu_expedited and rcu_normal, then
126 * rcu_normal wins. (Except during the time period during boot from 126 * rcu_normal wins. (Except during the time period during boot from
127 * when the first task is spawned until the rcu_exp_runtime_mode() 127 * when the first task is spawned until the rcu_set_runtime_mode()
128 * core_initcall() is invoked, at which point everything is expedited.) 128 * core_initcall() is invoked, at which point everything is expedited.)
129 */ 129 */
130bool rcu_gp_is_normal(void) 130bool rcu_gp_is_normal(void)
@@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void)
190 190
191#endif /* #ifndef CONFIG_TINY_RCU */ 191#endif /* #ifndef CONFIG_TINY_RCU */
192 192
193/*
194 * Test each non-SRCU synchronous grace-period wait API. This is
195 * useful just after a change in mode for these primitives, and
196 * during early boot.
197 */
198void rcu_test_sync_prims(void)
199{
200 if (!IS_ENABLED(CONFIG_PROVE_RCU))
201 return;
202 synchronize_rcu();
203 synchronize_rcu_bh();
204 synchronize_sched();
205 synchronize_rcu_expedited();
206 synchronize_rcu_bh_expedited();
207 synchronize_sched_expedited();
208}
209
210#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
211
212/*
213 * Switch to run-time mode once RCU has fully initialized.
214 */
215static int __init rcu_set_runtime_mode(void)
216{
217 rcu_test_sync_prims();
218 rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
219 rcu_test_sync_prims();
220 return 0;
221}
222core_initcall(rcu_set_runtime_mode);
223
224#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
225
193#ifdef CONFIG_PREEMPT_RCU 226#ifdef CONFIG_PREEMPT_RCU
194 227
195/* 228/*
@@ -632,6 +665,7 @@ static void check_holdout_task(struct task_struct *t,
632 put_task_struct(t); 665 put_task_struct(t);
633 return; 666 return;
634 } 667 }
668 rcu_request_urgent_qs_task(t);
635 if (!needreport) 669 if (!needreport)
636 return; 670 return;
637 if (*firstreport) { 671 if (*firstreport) {
@@ -817,23 +851,6 @@ static void rcu_spawn_tasks_kthread(void)
817 851
818#endif /* #ifdef CONFIG_TASKS_RCU */ 852#endif /* #ifdef CONFIG_TASKS_RCU */
819 853
820/*
821 * Test each non-SRCU synchronous grace-period wait API. This is
822 * useful just after a change in mode for these primitives, and
823 * during early boot.
824 */
825void rcu_test_sync_prims(void)
826{
827 if (!IS_ENABLED(CONFIG_PROVE_RCU))
828 return;
829 synchronize_rcu();
830 synchronize_rcu_bh();
831 synchronize_sched();
832 synchronize_rcu_expedited();
833 synchronize_rcu_bh_expedited();
834 synchronize_sched_expedited();
835}
836
837#ifdef CONFIG_PROVE_RCU 854#ifdef CONFIG_PROVE_RCU
838 855
839/* 856/*
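
In update.c, rcu_test_sync_prims() moves above its new caller, and rcu_set_runtime_mode() takes over the core_initcall() that previously lived in tree_exp.h as rcu_exp_runtime_mode(), so the boot-to-runtime switch now also covers the SRCU-only configurations. A placeholder sketch (not the kernel's code) of that one-shot pattern:

#include <linux/init.h>
#include <linux/types.h>

static bool example_runtime_mode;

static void example_test_primitives(void)
{
	/* in the real code: synchronize_rcu(), synchronize_sched(), ... */
}

static int __init example_set_runtime_mode(void)
{
	example_test_primitives();	/* exercise the boot-time paths */
	example_runtime_mode = true;	/* switch to run-time behavior */
	example_test_primitives();	/* exercise the run-time paths */
	return 0;
}
core_initcall(example_set_runtime_mode);
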
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc05a0f1..2adf7b6c04e7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3378,7 +3378,7 @@ static void __sched notrace __schedule(bool preempt)
3378 hrtick_clear(rq); 3378 hrtick_clear(rq);
3379 3379
3380 local_irq_disable(); 3380 local_irq_disable();
3381 rcu_note_context_switch(); 3381 rcu_note_context_switch(preempt);
3382 3382
3383 /* 3383 /*
3384 * Make sure that signal_pending_state()->signal_pending() below 3384 * Make sure that signal_pending_state()->signal_pending() below
diff --git a/kernel/signal.c b/kernel/signal.c
index 7e59ebc2c25e..6df5f72158e4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1237 } 1237 }
1238 /* 1238 /*
1239 * This sighand can be already freed and even reused, but 1239 * This sighand can be already freed and even reused, but
1240 * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which 1240 * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
1241 * initializes ->siglock: this slab can't go away, it has 1241 * initializes ->siglock: this slab can't go away, it has
1242 * the same object type, ->siglock can't be reinitialized. 1242 * the same object type, ->siglock can't be reinitialized.
1243 * 1243 *
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 98b27195e38b..4b20061102f6 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -413,7 +413,7 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
413 *size += sizeof(struct kasan_alloc_meta); 413 *size += sizeof(struct kasan_alloc_meta);
414 414
415 /* Add free meta. */ 415 /* Add free meta. */
416 if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || 416 if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
417 cache->object_size < sizeof(struct kasan_free_meta)) { 417 cache->object_size < sizeof(struct kasan_free_meta)) {
418 cache->kasan_info.free_meta_offset = *size; 418 cache->kasan_info.free_meta_offset = *size;
419 *size += sizeof(struct kasan_free_meta); 419 *size += sizeof(struct kasan_free_meta);
@@ -561,7 +561,7 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object)
561 unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); 561 unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
562 562
563 /* RCU slabs could be legally used after free within the RCU period */ 563 /* RCU slabs could be legally used after free within the RCU period */
564 if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) 564 if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
565 return; 565 return;
566 566
567 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); 567 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
@@ -572,7 +572,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
572 s8 shadow_byte; 572 s8 shadow_byte;
573 573
574 /* RCU slabs could be legally used after free within the RCU period */ 574 /* RCU slabs could be legally used after free within the RCU period */
575 if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) 575 if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
576 return false; 576 return false;
577 577
578 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); 578 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index 5bf191756a4a..2d5959c5f7c5 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -95,7 +95,7 @@ void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
95void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) 95void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
96{ 96{
97 /* TODO: RCU freeing is unsupported for now; hide false positives. */ 97 /* TODO: RCU freeing is unsupported for now; hide false positives. */
98 if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) 98 if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU))
99 kmemcheck_mark_freed(object, size); 99 kmemcheck_mark_freed(object, size);
100} 100}
101 101
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index a7652acd2ab9..54ca54562928 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -21,7 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22 22
23/* global SRCU for all MMs */ 23/* global SRCU for all MMs */
24static struct srcu_struct srcu; 24DEFINE_STATIC_SRCU(srcu);
25 25
26/* 26/*
27 * This function allows mmu_notifier::release callback to delay a call to 27 * This function allows mmu_notifier::release callback to delay a call to
@@ -252,12 +252,6 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
252 252
253 BUG_ON(atomic_read(&mm->mm_users) <= 0); 253 BUG_ON(atomic_read(&mm->mm_users) <= 0);
254 254
255 /*
256 * Verify that mmu_notifier_init() already run and the global srcu is
257 * initialized.
258 */
259 BUG_ON(!srcu.per_cpu_ref);
260
261 ret = -ENOMEM; 255 ret = -ENOMEM;
262 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); 256 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
263 if (unlikely(!mmu_notifier_mm)) 257 if (unlikely(!mmu_notifier_mm))
@@ -406,9 +400,3 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
406 mmdrop(mm); 400 mmdrop(mm);
407} 401}
408EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); 402EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
409
410static int __init mmu_notifier_init(void)
411{
412 return init_srcu_struct(&srcu);
413}
414subsys_initcall(mmu_notifier_init);
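
The mmu_notifier.c change replaces a run-time init_srcu_struct() plus subsys_initcall() with DEFINE_STATIC_SRCU(), which yields an srcu_struct that is usable from the start and makes the BUG_ON() sanity check unnecessary. A minimal usage sketch of a statically defined SRCU domain (illustrative names, standard SRCU API):

#include <linux/srcu.h>

DEFINE_STATIC_SRCU(example_srcu);	/* no init call or initcall needed */

static void example_reader(void)
{
	int idx;

	idx = srcu_read_lock(&example_srcu);
	/* ... access SRCU-protected state; SRCU readers may sleep ... */
	srcu_read_unlock(&example_srcu, idx);
}

static void example_updater(void)
{
	/* ... unpublish the old state ... */
	synchronize_srcu(&example_srcu);	/* wait out pre-existing readers */
	/* ... free the old state ... */
}
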
diff --git a/mm/rmap.c b/mm/rmap.c
index f6838015810f..1b776f793998 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -430,7 +430,7 @@ static void anon_vma_ctor(void *data)
430void __init anon_vma_init(void) 430void __init anon_vma_init(void)
431{ 431{
432 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 432 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
433 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, 433 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
434 anon_vma_ctor); 434 anon_vma_ctor);
435 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, 435 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
436 SLAB_PANIC|SLAB_ACCOUNT); 436 SLAB_PANIC|SLAB_ACCOUNT);
@@ -481,7 +481,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
481 * If this page is still mapped, then its anon_vma cannot have been 481 * If this page is still mapped, then its anon_vma cannot have been
482 * freed. But if it has been unmapped, we have no security against the 482 * freed. But if it has been unmapped, we have no security against the
483 * anon_vma structure being freed and reused (for another anon_vma: 483 * anon_vma structure being freed and reused (for another anon_vma:
484 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() 484 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
485 * above cannot corrupt). 485 * above cannot corrupt).
486 */ 486 */
487 if (!page_mapped(page)) { 487 if (!page_mapped(page)) {
diff --git a/mm/slab.c b/mm/slab.c
index 807d86c76908..93c827864862 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1728,7 +1728,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page)
1728 1728
1729 freelist = page->freelist; 1729 freelist = page->freelist;
1730 slab_destroy_debugcheck(cachep, page); 1730 slab_destroy_debugcheck(cachep, page);
1731 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 1731 if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU))
1732 call_rcu(&page->rcu_head, kmem_rcu_free); 1732 call_rcu(&page->rcu_head, kmem_rcu_free);
1733 else 1733 else
1734 kmem_freepages(cachep, page); 1734 kmem_freepages(cachep, page);
@@ -1924,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
1924 1924
1925 cachep->num = 0; 1925 cachep->num = 0;
1926 1926
1927 if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) 1927 if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU)
1928 return false; 1928 return false;
1929 1929
1930 left = calculate_slab_order(cachep, size, 1930 left = calculate_slab_order(cachep, size,
@@ -2030,7 +2030,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2030 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2030 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2031 2 * sizeof(unsigned long long))) 2031 2 * sizeof(unsigned long long)))
2032 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2032 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2033 if (!(flags & SLAB_DESTROY_BY_RCU)) 2033 if (!(flags & SLAB_TYPESAFE_BY_RCU))
2034 flags |= SLAB_POISON; 2034 flags |= SLAB_POISON;
2035#endif 2035#endif
2036#endif 2036#endif
diff --git a/mm/slab.h b/mm/slab.h
index 65e7c3fcac72..9cfcf099709c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -126,7 +126,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
126 126
127/* Legal flag mask for kmem_cache_create(), for various configurations */ 127/* Legal flag mask for kmem_cache_create(), for various configurations */
128#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ 128#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
129 SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) 129 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
130 130
131#if defined(CONFIG_DEBUG_SLAB) 131#if defined(CONFIG_DEBUG_SLAB)
132#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 132#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
@@ -415,7 +415,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
415 * back there or track user information then we can 415 * back there or track user information then we can
416 * only use the space before that information. 416 * only use the space before that information.
417 */ 417 */
418 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 418 if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER))
419 return s->inuse; 419 return s->inuse;
420 /* 420 /*
421 * Else we can use all the padding etc for the allocation 421 * Else we can use all the padding etc for the allocation
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 09d0e849b07f..01a0fe2eb332 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -39,7 +39,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
39 * Set of flags that will prevent slab merging 39 * Set of flags that will prevent slab merging
40 */ 40 */
41#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 41#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
42 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ 42 SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
43 SLAB_FAILSLAB | SLAB_KASAN) 43 SLAB_FAILSLAB | SLAB_KASAN)
44 44
45#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ 45#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
@@ -500,7 +500,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
500 struct kmem_cache *s, *s2; 500 struct kmem_cache *s, *s2;
501 501
502 /* 502 /*
503 * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the 503 * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
504 * @slab_caches_to_rcu_destroy list. The slab pages are freed 504 * @slab_caches_to_rcu_destroy list. The slab pages are freed
505 * through RCU and and the associated kmem_cache are dereferenced 505 * through RCU and and the associated kmem_cache are dereferenced
506 * while freeing the pages, so the kmem_caches should be freed only 506 * while freeing the pages, so the kmem_caches should be freed only
@@ -537,7 +537,7 @@ static int shutdown_cache(struct kmem_cache *s)
537 memcg_unlink_cache(s); 537 memcg_unlink_cache(s);
538 list_del(&s->list); 538 list_del(&s->list);
539 539
540 if (s->flags & SLAB_DESTROY_BY_RCU) { 540 if (s->flags & SLAB_TYPESAFE_BY_RCU) {
541 list_add_tail(&s->list, &slab_caches_to_rcu_destroy); 541 list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
542 schedule_work(&slab_caches_to_rcu_destroy_work); 542 schedule_work(&slab_caches_to_rcu_destroy_work);
543 } else { 543 } else {
diff --git a/mm/slob.c b/mm/slob.c
index eac04d4357ec..1bae78d71096 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -126,7 +126,7 @@ static inline void clear_slob_page_free(struct page *sp)
126 126
127/* 127/*
128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which 128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
129 * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free 129 * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free
130 * the block using call_rcu. 130 * the block using call_rcu.
131 */ 131 */
132struct slob_rcu { 132struct slob_rcu {
@@ -524,7 +524,7 @@ EXPORT_SYMBOL(ksize);
524 524
525int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) 525int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
526{ 526{
527 if (flags & SLAB_DESTROY_BY_RCU) { 527 if (flags & SLAB_TYPESAFE_BY_RCU) {
528 /* leave room for rcu footer at the end of object */ 528 /* leave room for rcu footer at the end of object */
529 c->size += sizeof(struct slob_rcu); 529 c->size += sizeof(struct slob_rcu);
530 } 530 }
@@ -598,7 +598,7 @@ static void kmem_rcu_free(struct rcu_head *head)
598void kmem_cache_free(struct kmem_cache *c, void *b) 598void kmem_cache_free(struct kmem_cache *c, void *b)
599{ 599{
600 kmemleak_free_recursive(b, c->flags); 600 kmemleak_free_recursive(b, c->flags);
601 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { 601 if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
602 struct slob_rcu *slob_rcu; 602 struct slob_rcu *slob_rcu;
603 slob_rcu = b + (c->size - sizeof(struct slob_rcu)); 603 slob_rcu = b + (c->size - sizeof(struct slob_rcu));
604 slob_rcu->size = c->size; 604 slob_rcu->size = c->size;
diff --git a/mm/slub.c b/mm/slub.c
index 7f4bc7027ed5..57e5156f02be 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1687,7 +1687,7 @@ static void rcu_free_slab(struct rcu_head *h)
1687 1687
1688static void free_slab(struct kmem_cache *s, struct page *page) 1688static void free_slab(struct kmem_cache *s, struct page *page)
1689{ 1689{
1690 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1690 if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
1691 struct rcu_head *head; 1691 struct rcu_head *head;
1692 1692
1693 if (need_reserve_slab_rcu) { 1693 if (need_reserve_slab_rcu) {
@@ -2963,7 +2963,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
2963 * slab_free_freelist_hook() could have put the items into quarantine. 2963 * slab_free_freelist_hook() could have put the items into quarantine.
2964 * If so, no need to free them. 2964 * If so, no need to free them.
2965 */ 2965 */
2966 if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) 2966 if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU))
2967 return; 2967 return;
2968 do_slab_free(s, page, head, tail, cnt, addr); 2968 do_slab_free(s, page, head, tail, cnt, addr);
2969} 2969}
@@ -3433,7 +3433,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3433 * the slab may touch the object after free or before allocation 3433 * the slab may touch the object after free or before allocation
3434 * then we should never poison the object itself. 3434 * then we should never poison the object itself.
3435 */ 3435 */
3436 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 3436 if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
3437 !s->ctor) 3437 !s->ctor)
3438 s->flags |= __OBJECT_POISON; 3438 s->flags |= __OBJECT_POISON;
3439 else 3439 else
@@ -3455,7 +3455,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3455 */ 3455 */
3456 s->inuse = size; 3456 s->inuse = size;
3457 3457
3458 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 3458 if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
3459 s->ctor)) { 3459 s->ctor)) {
3460 /* 3460 /*
3461 * Relocate free pointer after the object if it is not 3461 * Relocate free pointer after the object if it is not
@@ -3537,7 +3537,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3537 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); 3537 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3538 s->reserved = 0; 3538 s->reserved = 0;
3539 3539
3540 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) 3540 if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU))
3541 s->reserved = sizeof(struct rcu_head); 3541 s->reserved = sizeof(struct rcu_head);
3542 3542
3543 if (!calculate_sizes(s, -1)) 3543 if (!calculate_sizes(s, -1))
@@ -5042,7 +5042,7 @@ SLAB_ATTR_RO(cache_dma);
5042 5042
5043static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 5043static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
5044{ 5044{
5045 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 5045 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
5046} 5046}
5047SLAB_ATTR_RO(destroy_by_rcu); 5047SLAB_ATTR_RO(destroy_by_rcu);
5048 5048
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b99168b0fabf..f75482bdee9a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -951,7 +951,7 @@ static struct proto dccp_v4_prot = {
951 .orphan_count = &dccp_orphan_count, 951 .orphan_count = &dccp_orphan_count,
952 .max_header = MAX_DCCP_HEADER, 952 .max_header = MAX_DCCP_HEADER,
953 .obj_size = sizeof(struct dccp_sock), 953 .obj_size = sizeof(struct dccp_sock),
954 .slab_flags = SLAB_DESTROY_BY_RCU, 954 .slab_flags = SLAB_TYPESAFE_BY_RCU,
955 .rsk_prot = &dccp_request_sock_ops, 955 .rsk_prot = &dccp_request_sock_ops,
956 .twsk_prot = &dccp_timewait_sock_ops, 956 .twsk_prot = &dccp_timewait_sock_ops,
957 .h.hashinfo = &dccp_hashinfo, 957 .h.hashinfo = &dccp_hashinfo,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index d9b6a4e403e7..840f14aaa016 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1014,7 +1014,7 @@ static struct proto dccp_v6_prot = {
1014 .orphan_count = &dccp_orphan_count, 1014 .orphan_count = &dccp_orphan_count,
1015 .max_header = MAX_DCCP_HEADER, 1015 .max_header = MAX_DCCP_HEADER,
1016 .obj_size = sizeof(struct dccp6_sock), 1016 .obj_size = sizeof(struct dccp6_sock),
1017 .slab_flags = SLAB_DESTROY_BY_RCU, 1017 .slab_flags = SLAB_TYPESAFE_BY_RCU,
1018 .rsk_prot = &dccp6_request_sock_ops, 1018 .rsk_prot = &dccp6_request_sock_ops,
1019 .twsk_prot = &dccp6_timewait_sock_ops, 1019 .twsk_prot = &dccp6_timewait_sock_ops,
1020 .h.hashinfo = &dccp_hashinfo, 1020 .h.hashinfo = &dccp_hashinfo,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 575e19dcc017..265352e1298b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2402,7 +2402,7 @@ struct proto tcp_prot = {
2402 .sysctl_rmem = sysctl_tcp_rmem, 2402 .sysctl_rmem = sysctl_tcp_rmem,
2403 .max_header = MAX_TCP_HEADER, 2403 .max_header = MAX_TCP_HEADER,
2404 .obj_size = sizeof(struct tcp_sock), 2404 .obj_size = sizeof(struct tcp_sock),
2405 .slab_flags = SLAB_DESTROY_BY_RCU, 2405 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2406 .twsk_prot = &tcp_timewait_sock_ops, 2406 .twsk_prot = &tcp_timewait_sock_ops,
2407 .rsk_prot = &tcp_request_sock_ops, 2407 .rsk_prot = &tcp_request_sock_ops,
2408 .h.hashinfo = &tcp_hashinfo, 2408 .h.hashinfo = &tcp_hashinfo,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 49fa2e8c3fa9..cc01d5fd2e86 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1921,7 +1921,7 @@ struct proto tcpv6_prot = {
1921 .sysctl_rmem = sysctl_tcp_rmem, 1921 .sysctl_rmem = sysctl_tcp_rmem,
1922 .max_header = MAX_TCP_HEADER, 1922 .max_header = MAX_TCP_HEADER,
1923 .obj_size = sizeof(struct tcp6_sock), 1923 .obj_size = sizeof(struct tcp6_sock),
1924 .slab_flags = SLAB_DESTROY_BY_RCU, 1924 .slab_flags = SLAB_TYPESAFE_BY_RCU,
1925 .twsk_prot = &tcp6_timewait_sock_ops, 1925 .twsk_prot = &tcp6_timewait_sock_ops,
1926 .rsk_prot = &tcp6_request_sock_ops, 1926 .rsk_prot = &tcp6_request_sock_ops,
1927 .h.hashinfo = &tcp_hashinfo, 1927 .h.hashinfo = &tcp_hashinfo,
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index cb4fff785cbf..8364fe5b59e4 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -142,7 +142,7 @@ static struct proto llc_proto = {
142 .name = "LLC", 142 .name = "LLC",
143 .owner = THIS_MODULE, 143 .owner = THIS_MODULE,
144 .obj_size = sizeof(struct llc_sock), 144 .obj_size = sizeof(struct llc_sock),
145 .slab_flags = SLAB_DESTROY_BY_RCU, 145 .slab_flags = SLAB_TYPESAFE_BY_RCU,
146}; 146};
147 147
148/** 148/**
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 8bc5a1bd2d45..9b02c13d258b 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -506,7 +506,7 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap,
506again: 506again:
507 sk_nulls_for_each_rcu(rc, node, laddr_hb) { 507 sk_nulls_for_each_rcu(rc, node, laddr_hb) {
508 if (llc_estab_match(sap, daddr, laddr, rc)) { 508 if (llc_estab_match(sap, daddr, laddr, rc)) {
509 /* Extra checks required by SLAB_DESTROY_BY_RCU */ 509 /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
510 if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) 510 if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
511 goto again; 511 goto again;
512 if (unlikely(llc_sk(rc)->sap != sap || 512 if (unlikely(llc_sk(rc)->sap != sap ||
@@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap,
565again: 565again:
566 sk_nulls_for_each_rcu(rc, node, laddr_hb) { 566 sk_nulls_for_each_rcu(rc, node, laddr_hb) {
567 if (llc_listener_match(sap, laddr, rc)) { 567 if (llc_listener_match(sap, laddr, rc)) {
568 /* Extra checks required by SLAB_DESTROY_BY_RCU */ 568 /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
569 if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) 569 if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
570 goto again; 570 goto again;
571 if (unlikely(llc_sk(rc)->sap != sap || 571 if (unlikely(llc_sk(rc)->sap != sap ||
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 5404d0d195cc..63b6ab056370 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap,
328again: 328again:
329 sk_nulls_for_each_rcu(rc, node, laddr_hb) { 329 sk_nulls_for_each_rcu(rc, node, laddr_hb) {
330 if (llc_dgram_match(sap, laddr, rc)) { 330 if (llc_dgram_match(sap, laddr, rc)) {
331 /* Extra checks required by SLAB_DESTROY_BY_RCU */ 331 /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
332 if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) 332 if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
333 goto again; 333 goto again;
334 if (unlikely(llc_sk(rc)->sap != sap || 334 if (unlikely(llc_sk(rc)->sap != sap ||
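
The LLC and conntrack hunks below and above are comment-only renames, but they document the lookup discipline that SLAB_TYPESAFE_BY_RCU (formerly SLAB_DESTROY_BY_RCU) requires: under rcu_read_lock() an object may be freed and reused as another object of the same type, so a lookup must take a reference with atomic_inc_not_zero() and then re-check the key, retrying if either step fails. A schematic of that pattern with hypothetical structure and field names, not the LLC code:

#include <linux/atomic.h>
#include <linux/rculist.h>

struct example_obj {
	atomic_t refcnt;
	int key;
	struct hlist_node node;
};

static struct example_obj *example_lookup(struct hlist_head *head, int key)
{
	struct example_obj *obj;

	rcu_read_lock();
again:
	hlist_for_each_entry_rcu(obj, head, node) {
		if (obj->key != key)
			continue;
		if (!atomic_inc_not_zero(&obj->refcnt))
			goto again;		/* object being freed: restart */
		if (obj->key != key) {		/* reused for another key */
			atomic_dec(&obj->refcnt);	/* simplistic "put" for this sketch */
			goto again;
		}
		rcu_read_unlock();
		return obj;			/* caller now holds a reference */
	}
	rcu_read_unlock();
	return NULL;
}
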
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ffb78e5f7b70..4cf769f9b32a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -918,7 +918,7 @@ static unsigned int early_drop_list(struct net *net,
918 continue; 918 continue;
919 919
920 /* kill only if still in same netns -- might have moved due to 920 /* kill only if still in same netns -- might have moved due to
921 * SLAB_DESTROY_BY_RCU rules. 921 * SLAB_TYPESAFE_BY_RCU rules.
922 * 922 *
923 * We steal the timer reference. If that fails timer has 923 * We steal the timer reference. If that fails timer has
924 * already fired or someone else deleted it. Just drop ref 924 * already fired or someone else deleted it. Just drop ref
@@ -1073,7 +1073,7 @@ __nf_conntrack_alloc(struct net *net,
1073 1073
1074 /* 1074 /*
1075 * Do not use kmem_cache_zalloc(), as this cache uses 1075 * Do not use kmem_cache_zalloc(), as this cache uses
1076 * SLAB_DESTROY_BY_RCU. 1076 * SLAB_TYPESAFE_BY_RCU.
1077 */ 1077 */
1078 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); 1078 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
1079 if (ct == NULL) 1079 if (ct == NULL)
@@ -1118,7 +1118,7 @@ void nf_conntrack_free(struct nf_conn *ct)
1118 struct net *net = nf_ct_net(ct); 1118 struct net *net = nf_ct_net(ct);
1119 1119
1120 /* A freed object has refcnt == 0, that's 1120 /* A freed object has refcnt == 0, that's
1121 * the golden rule for SLAB_DESTROY_BY_RCU 1121 * the golden rule for SLAB_TYPESAFE_BY_RCU
1122 */ 1122 */
1123 NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); 1123 NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
1124 1124
@@ -1882,7 +1882,7 @@ int nf_conntrack_init_start(void)
1882 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 1882 nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
1883 sizeof(struct nf_conn), 1883 sizeof(struct nf_conn),
1884 NFCT_INFOMASK + 1, 1884 NFCT_INFOMASK + 1,
1885 SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 1885 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
1886 if (!nf_conntrack_cachep) 1886 if (!nf_conntrack_cachep)
1887 goto err_cachep; 1887 goto err_cachep;
1888 1888
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 093803786eac..9659adfe534f 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -101,7 +101,7 @@ struct proto smc_proto = {
101 .unhash = smc_unhash_sk, 101 .unhash = smc_unhash_sk,
102 .obj_size = sizeof(struct smc_sock), 102 .obj_size = sizeof(struct smc_sock),
103 .h.smc_hash = &smc_v4_hashinfo, 103 .h.smc_hash = &smc_v4_hashinfo,
104 .slab_flags = SLAB_DESTROY_BY_RCU, 104 .slab_flags = SLAB_TYPESAFE_BY_RCU,
105}; 105};
106EXPORT_SYMBOL_GPL(smc_proto); 106EXPORT_SYMBOL_GPL(smc_proto);
107 107
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index ea6e373edc27..93eede4e8fbe 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -170,7 +170,7 @@ qemu_append="`identify_qemu_append "$QEMU"`"
170# Pull in Kconfig-fragment boot parameters 170# Pull in Kconfig-fragment boot parameters
171boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" 171boot_args="`configfrag_boot_params "$boot_args" "$config_template"`"
172# Generate kernel-version-specific boot parameters 172# Generate kernel-version-specific boot parameters
173boot_args="`per_version_boot_params "$boot_args" $builddir/.config $seconds`" 173boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`"
174 174
175if test -n "$TORTURE_BUILDONLY" 175if test -n "$TORTURE_BUILDONLY"
176then 176then