author     Linus Torvalds <torvalds@linux-foundation.org>    2017-05-10 12:50:55 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2017-05-10 13:30:46 -0400
commit     de4d195308ad589626571dbe5789cebf9695a204 (patch)
tree       77a6bd6946594ea4e7513aaa73009295530960a1
parent     dc9edaab90de9441cc28ac570b23b0d2bdba7879 (diff)
parent     20652ed6e44f4963281b65209b917be86ac6765b (diff)
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RCU updates from Ingo Molnar:
 "The main changes are:

  - Debloat RCU headers

  - Parallelize SRCU callback handling (plus overlapping patches)

  - Improve the performance of Tree SRCU on a CPU-hotplug stress test

  - Documentation updates

  - Miscellaneous fixes"

* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (74 commits)
  rcu: Open-code the rcu_cblist_n_lazy_cbs() function
  rcu: Open-code the rcu_cblist_n_cbs() function
  rcu: Open-code the rcu_cblist_empty() function
  rcu: Separately compile large rcu_segcblist functions
  srcu: Debloat the <linux/rcu_segcblist.h> header
  srcu: Adjust default auto-expediting holdoff
  srcu: Specify auto-expedite holdoff time
  srcu: Expedite first synchronize_srcu() when idle
  srcu: Expedited grace periods with reduced memory contention
  srcu: Make rcutorture writer stalls print SRCU GP state
  srcu: Exact tracking of srcu_data structures containing callbacks
  srcu: Make SRCU be built by default
  srcu: Fix Kconfig botch when SRCU not selected
  rcu: Make non-preemptive schedule be Tasks RCU quiescent state
  srcu: Expedite srcu_schedule_cbs_snp() callback invocation
  srcu: Parallelize callback handling
  kvm: Move srcu_struct fields to end of struct kvm
  rcu: Fix typo in PER_RCU_NODE_PERIOD header comment
  rcu: Use true/false in assignment to bool
  rcu: Use bool value directly
  ...
-rw-r--r--  Documentation/RCU/00-INDEX                                                      2
-rw-r--r--  Documentation/RCU/Design/Data-Structures/Data-Structures.html                233
-rw-r--r--  Documentation/RCU/Design/Data-Structures/nxtlist.svg                           34
-rw-r--r--  Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html  47
-rw-r--r--  Documentation/RCU/Design/Requirements/Requirements.html                       195
-rw-r--r--  Documentation/RCU/rcu_dereference.txt                                           9
-rw-r--r--  Documentation/RCU/rculist_nulls.txt                                             6
-rw-r--r--  Documentation/RCU/stallwarn.txt                                               190
-rw-r--r--  Documentation/RCU/whatisRCU.txt                                                32
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt                                 8
-rw-r--r--  Documentation/memory-barriers.txt                                               2
-rw-r--r--  arch/Kconfig                                                                    3
-rw-r--r--  arch/powerpc/Kconfig                                                            1
-rw-r--r--  drivers/gpu/drm/i915/i915_gem.c                                                 2
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_request.h                                         2
-rw-r--r--  drivers/gpu/drm/i915/selftests/mock_gem_device.c                                2
-rw-r--r--  drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c                                 2
-rw-r--r--  fs/jbd2/journal.c                                                               2
-rw-r--r--  fs/signalfd.c                                                                   2
-rw-r--r--  include/linux/dma-fence.h                                                       4
-rw-r--r--  include/linux/kvm_host.h                                                        4
-rw-r--r--  include/linux/rcu_node_tree.h                                                  99
-rw-r--r--  include/linux/rcu_segcblist.h                                                  90
-rw-r--r--  include/linux/rculist.h                                                         3
-rw-r--r--  include/linux/rcupdate.h                                                       17
-rw-r--r--  include/linux/rcutiny.h                                                        24
-rw-r--r--  include/linux/rcutree.h                                                         5
-rw-r--r--  include/linux/slab.h                                                            6
-rw-r--r--  include/linux/srcu.h                                                           84
-rw-r--r--  include/linux/srcuclassic.h                                                   115
-rw-r--r--  include/linux/srcutiny.h                                                       93
-rw-r--r--  include/linux/srcutree.h                                                      150
-rw-r--r--  include/linux/types.h                                                           2
-rw-r--r--  include/net/sock.h                                                              2
-rw-r--r--  init/Kconfig                                                                   43
-rw-r--r--  kernel/fork.c                                                                   4
-rw-r--r--  kernel/locking/lockdep.c                                                       86
-rw-r--r--  kernel/locking/rtmutex-debug.c                                                  9
-rw-r--r--  kernel/rcu/Makefile                                                             5
-rw-r--r--  kernel/rcu/rcu.h                                                              153
-rw-r--r--  kernel/rcu/rcu_segcblist.c                                                    505
-rw-r--r--  kernel/rcu/rcu_segcblist.h                                                    164
-rw-r--r--  kernel/rcu/rcutorture.c                                                        43
-rw-r--r--  kernel/rcu/srcu.c                                                              12
-rw-r--r--  kernel/rcu/srcutiny.c                                                         216
-rw-r--r--  kernel/rcu/srcutree.c                                                        1155
-rw-r--r--  kernel/rcu/tiny.c                                                              20
-rw-r--r--  kernel/rcu/tiny_plugin.h                                                       13
-rw-r--r--  kernel/rcu/tree.c                                                             710
-rw-r--r--  kernel/rcu/tree.h                                                             163
-rw-r--r--  kernel/rcu/tree_exp.h                                                          25
-rw-r--r--  kernel/rcu/tree_plugin.h                                                       64
-rw-r--r--  kernel/rcu/tree_trace.c                                                        26
-rw-r--r--  kernel/rcu/update.c                                                            53
-rw-r--r--  kernel/sched/core.c                                                             2
-rw-r--r--  kernel/signal.c                                                                 2
-rw-r--r--  mm/kasan/kasan.c                                                                6
-rw-r--r--  mm/kmemcheck.c                                                                  2
-rw-r--r--  mm/mmu_notifier.c                                                              14
-rw-r--r--  mm/rmap.c                                                                       4
-rw-r--r--  mm/slab.c                                                                       6
-rw-r--r--  mm/slab.h                                                                       4
-rw-r--r--  mm/slab_common.c                                                                6
-rw-r--r--  mm/slob.c                                                                       6
-rw-r--r--  mm/slub.c                                                                      12
-rw-r--r--  net/dccp/ipv4.c                                                                 2
-rw-r--r--  net/dccp/ipv6.c                                                                 2
-rw-r--r--  net/ipv4/tcp_ipv4.c                                                             2
-rw-r--r--  net/ipv6/tcp_ipv6.c                                                             2
-rw-r--r--  net/llc/af_llc.c                                                                2
-rw-r--r--  net/llc/llc_conn.c                                                              4
-rw-r--r--  net/llc/llc_sap.c                                                               2
-rw-r--r--  net/netfilter/nf_conntrack_core.c                                               8
-rw-r--r--  net/smc/af_smc.c                                                                2
-rwxr-xr-x  tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh                        2
75 files changed, 3904 insertions, 1129 deletions
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index f773a264ae02..1672573b037a 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -17,7 +17,7 @@ rcu_dereference.txt
17rcubarrier.txt 17rcubarrier.txt
18 - RCU and Unloadable Modules 18 - RCU and Unloadable Modules
19rculist_nulls.txt 19rculist_nulls.txt
20 - RCU list primitives for use with SLAB_DESTROY_BY_RCU 20 - RCU list primitives for use with SLAB_TYPESAFE_BY_RCU
21rcuref.txt 21rcuref.txt
22 - Reference-count design for elements of lists/arrays protected by RCU 22 - Reference-count design for elements of lists/arrays protected by RCU
23rcu.txt 23rcu.txt
diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
index d583c653a703..38d6d800761f 100644
--- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
@@ -19,6 +19,8 @@ to each other.
19 The <tt>rcu_state</tt> Structure</a> 19 The <tt>rcu_state</tt> Structure</a>
20<li> <a href="#The rcu_node Structure"> 20<li> <a href="#The rcu_node Structure">
21 The <tt>rcu_node</tt> Structure</a> 21 The <tt>rcu_node</tt> Structure</a>
22<li> <a href="#The rcu_segcblist Structure">
23 The <tt>rcu_segcblist</tt> Structure</a>
22<li> <a href="#The rcu_data Structure"> 24<li> <a href="#The rcu_data Structure">
23 The <tt>rcu_data</tt> Structure</a> 25 The <tt>rcu_data</tt> Structure</a>
24<li> <a href="#The rcu_dynticks Structure"> 26<li> <a href="#The rcu_dynticks Structure">
@@ -841,6 +843,134 @@ for lockdep lock-class names.
841Finally, lines&nbsp;64-66 produce an error if the maximum number of 843Finally, lines&nbsp;64-66 produce an error if the maximum number of
842CPUs is too large for the specified fanout. 844CPUs is too large for the specified fanout.
843 845
846<h3><a name="The rcu_segcblist Structure">
847The <tt>rcu_segcblist</tt> Structure</a></h3>
848
849The <tt>rcu_segcblist</tt> structure maintains a segmented list of
850callbacks as follows:
851
852<pre>
853 1 #define RCU_DONE_TAIL 0
854 2 #define RCU_WAIT_TAIL 1
855 3 #define RCU_NEXT_READY_TAIL 2
856 4 #define RCU_NEXT_TAIL 3
857 5 #define RCU_CBLIST_NSEGS 4
858 6
859 7 struct rcu_segcblist {
860 8 struct rcu_head *head;
861 9 struct rcu_head **tails[RCU_CBLIST_NSEGS];
86210 unsigned long gp_seq[RCU_CBLIST_NSEGS];
86311 long len;
86412 long len_lazy;
86513 };
866</pre>
867
868<p>
869The segments are as follows:
870
871<ol>
872<li> <tt>RCU_DONE_TAIL</tt>: Callbacks whose grace periods have elapsed.
873 These callbacks are ready to be invoked.
874<li> <tt>RCU_WAIT_TAIL</tt>: Callbacks that are waiting for the
875 current grace period.
876 Note that different CPUs can have different ideas about which
877 grace period is current, hence the <tt>-&gt;gp_seq</tt> field.
878<li> <tt>RCU_NEXT_READY_TAIL</tt>: Callbacks waiting for the next
879 grace period to start.
880<li> <tt>RCU_NEXT_TAIL</tt>: Callbacks that have not yet been
881 associated with a grace period.
882</ol>
883
884<p>
885The <tt>-&gt;head</tt> pointer references the first callback or
886is <tt>NULL</tt> if the list contains no callbacks (which is
887<i>not</i> the same as being empty).
888Each element of the <tt>-&gt;tails[]</tt> array references the
889<tt>-&gt;next</tt> pointer of the last callback in the corresponding
890segment of the list, or the list's <tt>-&gt;head</tt> pointer if
891that segment and all previous segments are empty.
892If the corresponding segment is empty but some previous segment is
893not empty, then the array element is identical to its predecessor.
894Older callbacks are closer to the head of the list, and new callbacks
895are added at the tail.
896This relationship between the <tt>-&gt;head</tt> pointer, the
897<tt>-&gt;tails[]</tt> array, and the callbacks is shown in this
898diagram:
899
900</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
901
902</p><p>In this figure, the <tt>-&gt;head</tt> pointer references the
903first
904RCU callback in the list.
905The <tt>-&gt;tails[RCU_DONE_TAIL]</tt> array element references
906the <tt>-&gt;head</tt> pointer itself, indicating that none
907of the callbacks is ready to invoke.
908The <tt>-&gt;tails[RCU_WAIT_TAIL]</tt> array element references callback
909CB&nbsp;2's <tt>-&gt;next</tt> pointer, which indicates that
910CB&nbsp;1 and CB&nbsp;2 are both waiting on the current grace period,
911give or take possible disagreements about exactly which grace period
912is the current one.
913The <tt>-&gt;tails[RCU_NEXT_READY_TAIL]</tt> array element
914references the same RCU callback that <tt>-&gt;tails[RCU_WAIT_TAIL]</tt>
915does, which indicates that there are no callbacks waiting on the next
916RCU grace period.
917The <tt>-&gt;tails[RCU_NEXT_TAIL]</tt> array element references
918CB&nbsp;4's <tt>-&gt;next</tt> pointer, indicating that all the
919remaining RCU callbacks have not yet been assigned to an RCU grace
920period.
921Note that the <tt>-&gt;tails[RCU_NEXT_TAIL]</tt> array element
922always references the last RCU callback's <tt>-&gt;next</tt> pointer
923unless the callback list is empty, in which case it references
924the <tt>-&gt;head</tt> pointer.
925
926<p>
927There is one additional important special case for the
928<tt>-&gt;tails[RCU_NEXT_TAIL]</tt> array element: It can be <tt>NULL</tt>
929when this list is <i>disabled</i>.
930Lists are disabled when the corresponding CPU is offline or when
931the corresponding CPU's callbacks are offloaded to a kthread,
932both of which are described elsewhere.
933
934</p><p>CPUs advance their callbacks from the
935<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
936<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
937as grace periods advance.
938
939</p><p>The <tt>-&gt;gp_seq[]</tt> array records grace-period
940numbers corresponding to the list segments.
941This is what allows different CPUs to have different ideas as to
942which is the current grace period while still avoiding premature
943invocation of their callbacks.
944In particular, this allows CPUs that go idle for extended periods
945to determine which of their callbacks are ready to be invoked after
946reawakening.
947
948</p><p>The <tt>-&gt;len</tt> counter contains the number of
949callbacks in <tt>-&gt;head</tt>, and the
950<tt>-&gt;len_lazy</tt> contains the number of those callbacks that
951are known to only free memory, and whose invocation can therefore
952be safely deferred.
953
954<p><b>Important note</b>: It is the <tt>-&gt;len</tt> field that
955determines whether or not there are callbacks associated with
956this <tt>rcu_segcblist</tt> structure, <i>not</i> the <tt>-&gt;head</tt>
957pointer.
958The reason for this is that all the ready-to-invoke callbacks
959(that is, those in the <tt>RCU_DONE_TAIL</tt> segment) are extracted
960all at once at callback-invocation time.
961If callback invocation must be postponed, for example, because a
962high-priority process just woke up on this CPU, then the remaining
963callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment.
964Either way, the <tt>-&gt;len</tt> and <tt>-&gt;len_lazy</tt> counts
965are adjusted after the corresponding callbacks have been invoked, and so
966again it is the <tt>-&gt;len</tt> count that accurately reflects whether
967or not there are callbacks associated with this <tt>rcu_segcblist</tt>
968structure.
969Of course, off-CPU sampling of the <tt>-&gt;len</tt> count requires
970the use of appropriate synchronization, for example, memory barriers.
971This synchronization can be a bit subtle, particularly in the case
972of <tt>rcu_barrier()</tt>.
973
844<h3><a name="The rcu_data Structure"> 974<h3><a name="The rcu_data Structure">
845The <tt>rcu_data</tt> Structure</a></h3> 975The <tt>rcu_data</tt> Structure</a></h3>
846 976
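
As an illustrative aside, here is a small stand-alone C sketch of the ->head/->tails[] invariant described in the hunk above. It is built only from the structure quoted there; segcblist_init() and segcblist_enqueue() are invented names, not the in-tree kernel/rcu/rcu_segcblist.c code, which additionally handles memory ordering and segment advancing.

	#include <stddef.h>

	struct rcu_head {
		struct rcu_head *next;
		void (*func)(struct rcu_head *head);
	};

	#define RCU_DONE_TAIL		0
	#define RCU_WAIT_TAIL		1
	#define RCU_NEXT_READY_TAIL	2
	#define RCU_NEXT_TAIL		3
	#define RCU_CBLIST_NSEGS	4

	struct rcu_segcblist {
		struct rcu_head *head;
		struct rcu_head **tails[RCU_CBLIST_NSEGS];
		unsigned long gp_seq[RCU_CBLIST_NSEGS];
		long len;
		long len_lazy;
	};

	/* Empty list: every element of ->tails[] references ->head. */
	static void segcblist_init(struct rcu_segcblist *rsclp)
	{
		int i;

		rsclp->head = NULL;
		for (i = 0; i < RCU_CBLIST_NSEGS; i++)
			rsclp->tails[i] = &rsclp->head;
		rsclp->len = 0;
		rsclp->len_lazy = 0;
	}

	/* Append a callback to the not-yet-assigned (RCU_NEXT_TAIL) segment. */
	static void segcblist_enqueue(struct rcu_segcblist *rsclp,
				      struct rcu_head *rhp, int lazy)
	{
		rsclp->len++;		/* ->len, not ->head, signals non-emptiness */
		if (lazy)
			rsclp->len_lazy++;
		rhp->next = NULL;
		*rsclp->tails[RCU_NEXT_TAIL] = rhp;	/* link after the current last callback */
		rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
	}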
@@ -983,62 +1113,18 @@ choice.
983as follows: 1113as follows:
984 1114
985<pre> 1115<pre>
986 1 struct rcu_head *nxtlist; 1116 1 struct rcu_segcblist cblist;
987 2 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 1117 2 long qlen_last_fqs_check;
988 3 unsigned long nxtcompleted[RCU_NEXT_SIZE]; 1118 3 unsigned long n_cbs_invoked;
989 4 long qlen_lazy; 1119 4 unsigned long n_nocbs_invoked;
990 5 long qlen; 1120 5 unsigned long n_cbs_orphaned;
991 6 long qlen_last_fqs_check; 1121 6 unsigned long n_cbs_adopted;
992 7 unsigned long n_force_qs_snap; 1122 7 unsigned long n_force_qs_snap;
993 8 unsigned long n_cbs_invoked; 1123 8 long blimit;
994 9 unsigned long n_cbs_orphaned;
99510 unsigned long n_cbs_adopted;
99611 long blimit;
997</pre> 1124</pre>
998 1125
999<p>The <tt>-&gt;nxtlist</tt> pointer and the 1126<p>The <tt>-&gt;cblist</tt> structure is the segmented callback list
1000<tt>-&gt;nxttail[]</tt> array form a four-segment list with 1127described earlier.
1001older callbacks near the head and newer ones near the tail.
1002Each segment contains callbacks with the corresponding relationship
1003to the current grace period.
1004The pointer out of the end of each of the four segments is referenced
1005by the element of the <tt>-&gt;nxttail[]</tt> array indexed by
1006<tt>RCU_DONE_TAIL</tt> (for callbacks handled by a prior grace period),
1007<tt>RCU_WAIT_TAIL</tt> (for callbacks waiting on the current grace period),
1008<tt>RCU_NEXT_READY_TAIL</tt> (for callbacks that will wait on the next
1009grace period), and
1010<tt>RCU_NEXT_TAIL</tt> (for callbacks that are not yet associated
1011with a specific grace period)
1012respectively, as shown in the following figure.
1013
1014</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
1015
1016</p><p>In this figure, the <tt>-&gt;nxtlist</tt> pointer references the
1017first
1018RCU callback in the list.
1019The <tt>-&gt;nxttail[RCU_DONE_TAIL]</tt> array element references
1020the <tt>-&gt;nxtlist</tt> pointer itself, indicating that none
1021of the callbacks is ready to invoke.
1022The <tt>-&gt;nxttail[RCU_WAIT_TAIL]</tt> array element references callback
1023CB&nbsp;2's <tt>-&gt;next</tt> pointer, which indicates that
1024CB&nbsp;1 and CB&nbsp;2 are both waiting on the current grace period.
1025The <tt>-&gt;nxttail[RCU_NEXT_READY_TAIL]</tt> array element
1026references the same RCU callback that <tt>-&gt;nxttail[RCU_WAIT_TAIL]</tt>
1027does, which indicates that there are no callbacks waiting on the next
1028RCU grace period.
1029The <tt>-&gt;nxttail[RCU_NEXT_TAIL]</tt> array element references
1030CB&nbsp;4's <tt>-&gt;next</tt> pointer, indicating that all the
1031remaining RCU callbacks have not yet been assigned to an RCU grace
1032period.
1033Note that the <tt>-&gt;nxttail[RCU_NEXT_TAIL]</tt> array element
1034always references the last RCU callback's <tt>-&gt;next</tt> pointer
1035unless the callback list is empty, in which case it references
1036the <tt>-&gt;nxtlist</tt> pointer.
1037
1038</p><p>CPUs advance their callbacks from the
1039<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
1040<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
1041as grace periods advance.
1042The CPU advances the callbacks in its <tt>rcu_data</tt> structure 1128The CPU advances the callbacks in its <tt>rcu_data</tt> structure
1043whenever it notices that another RCU grace period has completed. 1129whenever it notices that another RCU grace period has completed.
1044The CPU detects the completion of an RCU grace period by noticing 1130The CPU detects the completion of an RCU grace period by noticing
@@ -1049,16 +1135,7 @@ Recall that each <tt>rcu_node</tt> structure's
1049<tt>-&gt;completed</tt> field is updated at the end of each 1135<tt>-&gt;completed</tt> field is updated at the end of each
1050grace period. 1136grace period.
1051 1137
1052</p><p>The <tt>-&gt;nxtcompleted[]</tt> array records grace-period 1138<p>
1053numbers corresponding to the list segments.
1054This allows CPUs that go idle for extended periods to determine
1055which of their callbacks are ready to be invoked after reawakening.
1056
1057</p><p>The <tt>-&gt;qlen</tt> counter contains the number of
1058callbacks in <tt>-&gt;nxtlist</tt>, and the
1059<tt>-&gt;qlen_lazy</tt> contains the number of those callbacks that
1060are known to only free memory, and whose invocation can therefore
1061be safely deferred.
1062The <tt>-&gt;qlen_last_fqs_check</tt> and 1139The <tt>-&gt;qlen_last_fqs_check</tt> and
1063<tt>-&gt;n_force_qs_snap</tt> coordinate the forcing of quiescent 1140<tt>-&gt;n_force_qs_snap</tt> coordinate the forcing of quiescent
1064states from <tt>call_rcu()</tt> and friends when callback 1141states from <tt>call_rcu()</tt> and friends when callback
@@ -1069,6 +1146,10 @@ lists grow excessively long.
1069fields count the number of callbacks invoked, 1146fields count the number of callbacks invoked,
1070sent to other CPUs when this CPU goes offline, 1147sent to other CPUs when this CPU goes offline,
1071and received from other CPUs when those other CPUs go offline. 1148and received from other CPUs when those other CPUs go offline.
1149The <tt>-&gt;n_nocbs_invoked</tt> is used when the CPU's callbacks
1150are offloaded to a kthread.
1151
1152<p>
1072Finally, the <tt>-&gt;blimit</tt> counter is the maximum number of 1153Finally, the <tt>-&gt;blimit</tt> counter is the maximum number of
1073RCU callbacks that may be invoked at a given time. 1154RCU callbacks that may be invoked at a given time.
1074 1155
@@ -1104,6 +1185,9 @@ Its fields are as follows:
1104 1 int dynticks_nesting; 1185 1 int dynticks_nesting;
1105 2 int dynticks_nmi_nesting; 1186 2 int dynticks_nmi_nesting;
1106 3 atomic_t dynticks; 1187 3 atomic_t dynticks;
1188 4 bool rcu_need_heavy_qs;
1189 5 unsigned long rcu_qs_ctr;
1190 6 bool rcu_urgent_qs;
1107</pre> 1191</pre>
1108 1192
1109<p>The <tt>-&gt;dynticks_nesting</tt> field counts the 1193<p>The <tt>-&gt;dynticks_nesting</tt> field counts the
@@ -1117,11 +1201,32 @@ NMIs are counted by the <tt>-&gt;dynticks_nmi_nesting</tt>
1117field, except that NMIs that interrupt non-dyntick-idle execution 1201field, except that NMIs that interrupt non-dyntick-idle execution
1118are not counted. 1202are not counted.
1119 1203
1120</p><p>Finally, the <tt>-&gt;dynticks</tt> field counts the corresponding 1204</p><p>The <tt>-&gt;dynticks</tt> field counts the corresponding
1121CPU's transitions to and from dyntick-idle mode, so that this counter 1205CPU's transitions to and from dyntick-idle mode, so that this counter
1122has an even value when the CPU is in dyntick-idle mode and an odd 1206has an even value when the CPU is in dyntick-idle mode and an odd
1123value otherwise. 1207value otherwise.
1124 1208
1209</p><p>The <tt>-&gt;rcu_need_heavy_qs</tt> field is used
1210to record the fact that the RCU core code would really like to
1211see a quiescent state from the corresponding CPU, so much so that
1212it is willing to call for heavy-weight dyntick-counter operations.
1213This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
1214code, which provide a momentary idle sojourn in response.
1215
1216</p><p>The <tt>-&gt;rcu_qs_ctr</tt> field is used to record
1217quiescent states from <tt>cond_resched()</tt>.
1218Because <tt>cond_resched()</tt> can execute quite frequently, this
1219must be quite lightweight, as in a non-atomic increment of this
1220per-CPU field.
1221
1222</p><p>Finally, the <tt>-&gt;rcu_urgent_qs</tt> field is used to record
1223the fact that the RCU core code would really like to see a quiescent
1224state from the corresponding CPU, with the various other fields indicating
1225just how badly RCU wants this quiescent state.
1226This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
1227code, which, if nothing else, non-atomically increment <tt>-&gt;rcu_qs_ctr</tt>
1228in response.
1229
1125<table> 1230<table>
1126<tr><th>&nbsp;</th></tr> 1231<tr><th>&nbsp;</th></tr>
1127<tr><th align="left">Quick Quiz:</th></tr> 1232<tr><th align="left">Quick Quiz:</th></tr>
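
For illustration only, the even/odd convention for ->dynticks described earlier in this hunk can be expressed as toy helpers. These are not the in-tree functions, and the names are invented for this sketch.

	#include <linux/types.h>

	/* Even snapshot: the CPU was in dyntick-idle mode when sampled. */
	static inline bool dynticks_snap_in_idle(unsigned int snap)
	{
		return !(snap & 0x1);
	}

	/*
	 * The counter records transitions to and from dyntick-idle mode, so
	 * any change since the snapshot means the CPU has passed through
	 * dyntick-idle (and hence a quiescent state) in the meantime.
	 */
	static inline bool dynticks_changed_since_snap(unsigned int curr,
						       unsigned int snap)
	{
		return curr != snap;
	}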
diff --git a/Documentation/RCU/Design/Data-Structures/nxtlist.svg b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
index abc4cc73a097..0223e79c38e0 100644
--- a/Documentation/RCU/Design/Data-Structures/nxtlist.svg
+++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
@@ -19,7 +19,7 @@
19 id="svg2" 19 id="svg2"
20 version="1.1" 20 version="1.1"
21 inkscape:version="0.48.4 r9939" 21 inkscape:version="0.48.4 r9939"
22 sodipodi:docname="nxtlist.fig"> 22 sodipodi:docname="segcblist.svg">
23 <metadata 23 <metadata
24 id="metadata94"> 24 id="metadata94">
25 <rdf:RDF> 25 <rdf:RDF>
@@ -28,7 +28,7 @@
28 <dc:format>image/svg+xml</dc:format> 28 <dc:format>image/svg+xml</dc:format>
29 <dc:type 29 <dc:type
30 rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> 30 rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
31 <dc:title></dc:title> 31 <dc:title />
32 </cc:Work> 32 </cc:Work>
33 </rdf:RDF> 33 </rdf:RDF>
34 </metadata> 34 </metadata>
@@ -241,61 +241,51 @@
241 xml:space="preserve" 241 xml:space="preserve"
242 x="225" 242 x="225"
243 y="675" 243 y="675"
244 fill="#000000"
245 font-family="Courier"
246 font-style="normal" 244 font-style="normal"
247 font-weight="bold" 245 font-weight="bold"
248 font-size="324" 246 font-size="324"
249 text-anchor="start" 247 id="text64"
250 id="text64">nxtlist</text> 248 style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;head</text>
251 <!-- Text --> 249 <!-- Text -->
252 <text 250 <text
253 xml:space="preserve" 251 xml:space="preserve"
254 x="225" 252 x="225"
255 y="1800" 253 y="1800"
256 fill="#000000"
257 font-family="Courier"
258 font-style="normal" 254 font-style="normal"
259 font-weight="bold" 255 font-weight="bold"
260 font-size="324" 256 font-size="324"
261 text-anchor="start" 257 id="text66"
262 id="text66">nxttail[RCU_DONE_TAIL]</text> 258 style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_DONE_TAIL]</text>
263 <!-- Text --> 259 <!-- Text -->
264 <text 260 <text
265 xml:space="preserve" 261 xml:space="preserve"
266 x="225" 262 x="225"
267 y="2925" 263 y="2925"
268 fill="#000000"
269 font-family="Courier"
270 font-style="normal" 264 font-style="normal"
271 font-weight="bold" 265 font-weight="bold"
272 font-size="324" 266 font-size="324"
273 text-anchor="start" 267 id="text68"
274 id="text68">nxttail[RCU_WAIT_TAIL]</text> 268 style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_WAIT_TAIL]</text>
275 <!-- Text --> 269 <!-- Text -->
276 <text 270 <text
277 xml:space="preserve" 271 xml:space="preserve"
278 x="225" 272 x="225"
279 y="4050" 273 y="4050"
280 fill="#000000"
281 font-family="Courier"
282 font-style="normal" 274 font-style="normal"
283 font-weight="bold" 275 font-weight="bold"
284 font-size="324" 276 font-size="324"
285 text-anchor="start" 277 id="text70"
286 id="text70">nxttail[RCU_NEXT_READY_TAIL]</text> 278 style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_NEXT_READY_TAIL]</text>
287 <!-- Text --> 279 <!-- Text -->
288 <text 280 <text
289 xml:space="preserve" 281 xml:space="preserve"
290 x="225" 282 x="225"
291 y="5175" 283 y="5175"
292 fill="#000000"
293 font-family="Courier"
294 font-style="normal" 284 font-style="normal"
295 font-weight="bold" 285 font-weight="bold"
296 font-size="324" 286 font-size="324"
297 text-anchor="start" 287 id="text72"
298 id="text72">nxttail[RCU_NEXT_TAIL]</text> 288 style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">-&gt;tails[RCU_NEXT_TAIL]</text>
299 <!-- Text --> 289 <!-- Text -->
300 <text 290 <text
301 xml:space="preserve" 291 xml:space="preserve"
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
index 7a3194c5559a..e5d0bbd0230b 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
@@ -284,6 +284,7 @@ Expedited Grace Period Refinements</a></h2>
284 Funnel locking and wait/wakeup</a>. 284 Funnel locking and wait/wakeup</a>.
285<li> <a href="#Use of Workqueues">Use of Workqueues</a>. 285<li> <a href="#Use of Workqueues">Use of Workqueues</a>.
286<li> <a href="#Stall Warnings">Stall warnings</a>. 286<li> <a href="#Stall Warnings">Stall warnings</a>.
287<li> <a href="#Mid-Boot Operation">Mid-boot operation</a>.
287</ol> 288</ol>
288 289
289<h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3> 290<h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3>
@@ -524,7 +525,7 @@ their grace periods and carrying out their wakeups.
524In earlier implementations, the task requesting the expedited 525In earlier implementations, the task requesting the expedited
525grace period also drove it to completion. 526grace period also drove it to completion.
526This straightforward approach had the disadvantage of needing to 527This straightforward approach had the disadvantage of needing to
527account for signals sent to user tasks, 528account for POSIX signals sent to user tasks,
528so more recent implementations use the Linux kernel's 529so more recent implementations use the Linux kernel's
529<a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>. 530<a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>.
530 531
@@ -533,8 +534,8 @@ The requesting task still does counter snapshotting and funnel-lock
533processing, but the task reaching the top of the funnel lock 534processing, but the task reaching the top of the funnel lock
534does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt> 535does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt>
535so that a workqueue kthread does the actual grace-period processing. 536so that a workqueue kthread does the actual grace-period processing.
536Because workqueue kthreads do not accept signals, grace-period-wait 537Because workqueue kthreads do not accept POSIX signals, grace-period-wait
537processing need not allow for signals. 538processing need not allow for POSIX signals.
538 539
539In addition, this approach allows wakeups for the previous expedited 540In addition, this approach allows wakeups for the previous expedited
540grace period to be overlapped with processing for the next expedited 541grace period to be overlapped with processing for the next expedited
@@ -586,6 +587,46 @@ blocking the current grace period are printed.
586Each stall warning results in another pass through the loop, but the 587Each stall warning results in another pass through the loop, but the
587second and subsequent passes use longer stall times. 588second and subsequent passes use longer stall times.
588 589
590<h3><a name="Mid-Boot Operation">Mid-boot operation</a></h3>
591
592<p>
593The use of workqueues has the advantage that the expedited
594grace-period code need not worry about POSIX signals.
595Unfortunately, it has the
596corresponding disadvantage that workqueues cannot be used until
597they are initialized, which does not happen until some time after
598the scheduler spawns the first task.
599Given that there are parts of the kernel that really do want to
600execute grace periods during this mid-boot &ldquo;dead zone&rdquo;,
601expedited grace periods must do something else during this time.
602
603<p>
604What they do is to fall back to the old practice of requiring that the
605requesting task drive the expedited grace period, as was the case
606before the use of workqueues.
607However, the requesting task is only required to drive the grace period
608during the mid-boot dead zone.
609Before mid-boot, a synchronous grace period is a no-op.
610Some time after mid-boot, workqueues are used.
611
612<p>
613Non-expedited non-SRCU synchronous grace periods must also operate
614normally during mid-boot.
615This is handled by causing non-expedited grace periods to take the
616expedited code path during mid-boot.
617
618<p>
619The current code assumes that there are no POSIX signals during
620the mid-boot dead zone.
621However, if an overwhelming need for POSIX signals somehow arises,
622appropriate adjustments can be made to the expedited stall-warning code.
623One such adjustment would reinstate the pre-workqueue stall-warning
624checks, but only during the mid-boot dead zone.
625
626<p>
627With this refinement, synchronous grace periods can now be used from
628task context pretty much any time during the life of the kernel.
629
589<h3><a name="Summary"> 630<h3><a name="Summary">
590Summary</a></h3> 631Summary</a></h3>
591 632
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index 21593496aca6..f60adf112663 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -659,8 +659,9 @@ systems with more than one CPU:
659 In other words, a given instance of <tt>synchronize_rcu()</tt> 659 In other words, a given instance of <tt>synchronize_rcu()</tt>
660 can avoid waiting on a given RCU read-side critical section only 660 can avoid waiting on a given RCU read-side critical section only
661 if it can prove that <tt>synchronize_rcu()</tt> started first. 661 if it can prove that <tt>synchronize_rcu()</tt> started first.
662 </font>
662 663
663 <p> 664 <p><font color="ffffff">
664 A related question is &ldquo;When <tt>rcu_read_lock()</tt> 665 A related question is &ldquo;When <tt>rcu_read_lock()</tt>
665 doesn't generate any code, why does it matter how it relates 666 doesn't generate any code, why does it matter how it relates
666 to a grace period?&rdquo; 667 to a grace period?&rdquo;
@@ -675,8 +676,9 @@ systems with more than one CPU:
675 within the critical section, in which case none of the accesses 676 within the critical section, in which case none of the accesses
676 within the critical section may observe the effects of any 677 within the critical section may observe the effects of any
677 access following the grace period. 678 access following the grace period.
679 </font>
678 680
679 <p> 681 <p><font color="ffffff">
680 As of late 2016, mathematical models of RCU take this 682 As of late 2016, mathematical models of RCU take this
681 viewpoint, for example, see slides&nbsp;62 and&nbsp;63 683 viewpoint, for example, see slides&nbsp;62 and&nbsp;63
682 of the 684 of the
@@ -1616,8 +1618,8 @@ CPUs should at least make reasonable forward progress.
1616In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> 1618In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
1617is permitted to impose modest degradation of real-time latency 1619is permitted to impose modest degradation of real-time latency
1618on non-idle online CPUs. 1620on non-idle online CPUs.
1619That said, it will likely be necessary to take further steps to reduce this 1621Here, &ldquo;modest&rdquo; means roughly the same latency
1620degradation, hopefully to roughly that of a scheduling-clock interrupt. 1622degradation as a scheduling-clock interrupt.
1621 1623
1622<p> 1624<p>
1623There are a number of situations where even 1625There are a number of situations where even
@@ -1913,12 +1915,9 @@ This requirement is another factor driving batching of grace periods,
1913but it is also the driving force behind the checks for large numbers 1915but it is also the driving force behind the checks for large numbers
1914of queued RCU callbacks in the <tt>call_rcu()</tt> code path. 1916of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
1915Finally, high update rates should not delay RCU read-side critical 1917Finally, high update rates should not delay RCU read-side critical
1916sections, although some read-side delays can occur when using 1918sections, although some small read-side delays can occur when using
1917<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use 1919<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
1918of <tt>try_stop_cpus()</tt>. 1920of <tt>smp_call_function_single()</tt>.
1919(In the future, <tt>synchronize_rcu_expedited()</tt> will be
1920converted to use lighter-weight inter-processor interrupts (IPIs),
1921but this will still disturb readers, though to a much smaller degree.)
1922 1921
1923<p> 1922<p>
1924Although all three of these corner cases were understood in the early 1923Although all three of these corner cases were understood in the early
@@ -2154,7 +2153,8 @@ as will <tt>rcu_assign_pointer()</tt>.
2154<p> 2153<p>
2155Although <tt>call_rcu()</tt> may be invoked at any 2154Although <tt>call_rcu()</tt> may be invoked at any
2156time during boot, callbacks are not guaranteed to be invoked until after 2155time during boot, callbacks are not guaranteed to be invoked until after
2157the scheduler is fully up and running. 2156all of RCU's kthreads have been spawned, which occurs at
2157<tt>early_initcall()</tt> time.
2158This delay in callback invocation is due to the fact that RCU does not 2158This delay in callback invocation is due to the fact that RCU does not
2159invoke callbacks until it is fully initialized, and this full initialization 2159invoke callbacks until it is fully initialized, and this full initialization
2160cannot occur until after the scheduler has initialized itself to the 2160cannot occur until after the scheduler has initialized itself to the
@@ -2167,8 +2167,10 @@ on what operations those callbacks could invoke.
2167Perhaps surprisingly, <tt>synchronize_rcu()</tt>, 2167Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
2168<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> 2168<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
2169(<a href="#Bottom-Half Flavor">discussed below</a>), 2169(<a href="#Bottom-Half Flavor">discussed below</a>),
2170and 2170<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>,
2171<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> 2171<tt>synchronize_rcu_expedited()</tt>,
2172<tt>synchronize_rcu_bh_expedited()</tt>, and
2173<tt>synchronize_sched_expedited()</tt>
2172will all operate normally 2174will all operate normally
2173during very early boot, the reason being that there is only one CPU 2175during very early boot, the reason being that there is only one CPU
2174and preemption is disabled. 2176and preemption is disabled.
@@ -2178,45 +2180,59 @@ state and thus a grace period, so the early-boot implementation can
2178be a no-op. 2180be a no-op.
2179 2181
2180<p> 2182<p>
2181Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> 2183However, once the scheduler has spawned its first kthread, this early
2182continue to operate normally through the remainder of boot, courtesy 2184boot trick fails for <tt>synchronize_rcu()</tt> (as well as for
2183of the fact that preemption is disabled across their RCU read-side 2185<tt>synchronize_rcu_expedited()</tt>) in <tt>CONFIG_PREEMPT=y</tt>
2184critical sections and also courtesy of the fact that there is still 2186kernels.
2185only one CPU. 2187The reason is that an RCU read-side critical section might be preempted,
2186However, once the scheduler starts initializing, preemption is enabled. 2188which means that a subsequent <tt>synchronize_rcu()</tt> really does have
2187There is still only a single CPU, but the fact that preemption is enabled 2189to wait for something, as opposed to simply returning immediately.
2188means that the no-op implementation of <tt>synchronize_rcu()</tt> no 2190Unfortunately, <tt>synchronize_rcu()</tt> can't do this until all of
2189longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. 2191its kthreads are spawned, which doesn't happen until some time during
2190Therefore, as soon as the scheduler starts initializing, the early-boot 2192<tt>early_initcalls()</tt> time.
2191fastpath is disabled. 2193But this is no excuse: RCU is nevertheless required to correctly handle
2192This means that <tt>synchronize_rcu()</tt> switches to its runtime 2194synchronous grace periods during this time period.
2193mode of operation where it posts callbacks, which in turn means that 2195Once all of its kthreads are up and running, RCU starts running
2194any call to <tt>synchronize_rcu()</tt> will block until the corresponding 2196normally.
2195callback is invoked.
2196Unfortunately, the callback cannot be invoked until RCU's runtime
2197grace-period machinery is up and running, which cannot happen until
2198the scheduler has initialized itself sufficiently to allow RCU's
2199kthreads to be spawned.
2200Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
2201initialization can result in deadlock.
2202 2197
2203<table> 2198<table>
2204<tr><th>&nbsp;</th></tr> 2199<tr><th>&nbsp;</th></tr>
2205<tr><th align="left">Quick Quiz:</th></tr> 2200<tr><th align="left">Quick Quiz:</th></tr>
2206<tr><td> 2201<tr><td>
2207 So what happens with <tt>synchronize_rcu()</tt> during 2202 How can RCU possibly handle grace periods before all of its
2208 scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> 2203 kthreads have been spawned???
2209 kernels?
2210</td></tr> 2204</td></tr>
2211<tr><th align="left">Answer:</th></tr> 2205<tr><th align="left">Answer:</th></tr>
2212<tr><td bgcolor="#ffffff"><font color="ffffff"> 2206<tr><td bgcolor="#ffffff"><font color="ffffff">
2213 In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> 2207 Very carefully!
2214 maps directly to <tt>synchronize_sched()</tt>. 2208 </font>
2215 Therefore, <tt>synchronize_rcu()</tt> works normally throughout 2209
2216 boot in <tt>CONFIG_PREEMPT=n</tt> kernels. 2210 <p><font color="ffffff">
2217 However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, 2211 During the &ldquo;dead zone&rdquo; between the time that the
2218 so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> 2212 scheduler spawns the first task and the time that all of RCU's
2219 during scheduler initialization. 2213 kthreads have been spawned, all synchronous grace periods are
2214 handled by the expedited grace-period mechanism.
2215 At runtime, this expedited mechanism relies on workqueues, but
2216 during the dead zone the requesting task itself drives the
2217 desired expedited grace period.
2218 Because dead-zone execution takes place within task context,
2219 everything works.
2220 Once the dead zone ends, expedited grace periods go back to
2221 using workqueues, as is required to avoid problems that would
2222 otherwise occur when a user task received a POSIX signal while
2223 driving an expedited grace period.
2224 </font>
2225
2226 <p><font color="ffffff">
2227 And yes, this does mean that it is unhelpful to send POSIX
2228 signals to random tasks between the time that the scheduler
2229 spawns its first kthread and the time that RCU's kthreads
2230 have all been spawned.
2231 If there ever turns out to be a good reason for sending POSIX
2232 signals during that time, appropriate adjustments will be made.
2233 (If it turns out that POSIX signals are sent during this time for
2234 no good reason, other adjustments will be made, appropriate
2235 or otherwise.)
2220</font></td></tr> 2236</font></td></tr>
2221<tr><td>&nbsp;</td></tr> 2237<tr><td>&nbsp;</td></tr>
2222</table> 2238</table>
@@ -2295,12 +2311,61 @@ situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
2295The need for <tt>rcu_barrier()</tt> for module unloading became 2311The need for <tt>rcu_barrier()</tt> for module unloading became
2296apparent later. 2312apparent later.
2297 2313
2314<p>
2315<b>Important note</b>: The <tt>rcu_barrier()</tt> function is not,
2316repeat, <i>not</i>, obligated to wait for a grace period.
2317It is instead only required to wait for RCU callbacks that have
2318already been posted.
2319Therefore, if there are no RCU callbacks posted anywhere in the system,
2320<tt>rcu_barrier()</tt> is within its rights to return immediately.
2321Even if there are callbacks posted, <tt>rcu_barrier()</tt> does not
2322necessarily need to wait for a grace period.
2323
2324<table>
2325<tr><th>&nbsp;</th></tr>
2326<tr><th align="left">Quick Quiz:</th></tr>
2327<tr><td>
2328 Wait a minute!
2329 Each RCU callbacks must wait for a grace period to complete,
2330 and <tt>rcu_barrier()</tt> must wait for each pre-existing
2331 callback to be invoked.
2332 Doesn't <tt>rcu_barrier()</tt> therefore need to wait for
2333 a full grace period if there is even one callback posted anywhere
2334 in the system?
2335</td></tr>
2336<tr><th align="left">Answer:</th></tr>
2337<tr><td bgcolor="#ffffff"><font color="ffffff">
2338 Absolutely not!!!
2339 </font>
2340
2341 <p><font color="ffffff">
2342 Yes, each RCU callback must wait for a grace period to complete,
2343 but it might well be partly (or even completely) finished waiting
2344 by the time <tt>rcu_barrier()</tt> is invoked.
2345 In that case, <tt>rcu_barrier()</tt> need only wait for the
2346 remaining portion of the grace period to elapse.
2347 So even if there are quite a few callbacks posted,
2348 <tt>rcu_barrier()</tt> might well return quite quickly.
2349 </font>
2350
2351 <p><font color="ffffff">
2352 So if you need to wait for a grace period as well as for all
2353 pre-existing callbacks, you will need to invoke both
2354 <tt>synchronize_rcu()</tt> and <tt>rcu_barrier()</tt>.
2355 If latency is a concern, you can always use workqueues
2356 to invoke them concurrently.
2357</font></td></tr>
2358<tr><td>&nbsp;</td></tr>
2359</table>
2360
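
As a hedged illustration of the combination suggested in the answer above (wait for a grace period and for all pre-existing callbacks, overlapping the two waits in a workqueue), one might write something like the following sketch. wait_for_gp_and_callbacks() and gp_work_func() are invented names, not kernel APIs.

	#include <linux/workqueue.h>
	#include <linux/rcupdate.h>

	static void gp_work_func(struct work_struct *unused)
	{
		synchronize_rcu();		/* wait for a full grace period */
	}

	static void wait_for_gp_and_callbacks(void)
	{
		struct work_struct gp_work;

		INIT_WORK_ONSTACK(&gp_work, gp_work_func);
		schedule_work(&gp_work);	/* grace-period wait runs in a workqueue */
		rcu_barrier();			/* meanwhile, wait for pre-existing callbacks */
		flush_work(&gp_work);		/* both conditions now hold */
		destroy_work_on_stack(&gp_work);
	}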
2298<h3><a name="Hotplug CPU">Hotplug CPU</a></h3> 2361<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
2299 2362
2300<p> 2363<p>
2301The Linux kernel supports CPU hotplug, which means that CPUs 2364The Linux kernel supports CPU hotplug, which means that CPUs
2302can come and go. 2365can come and go.
2303It is of course illegal to use any RCU API member from an offline CPU. 2366It is of course illegal to use any RCU API member from an offline CPU,
2367with the exception of <a href="#Sleepable RCU">SRCU</a> read-side
2368critical sections.
2304This requirement was present from day one in DYNIX/ptx, but 2369This requirement was present from day one in DYNIX/ptx, but
2305on the other hand, the Linux kernel's CPU-hotplug implementation 2370on the other hand, the Linux kernel's CPU-hotplug implementation
2306is &ldquo;interesting.&rdquo; 2371is &ldquo;interesting.&rdquo;
@@ -2310,19 +2375,18 @@ The Linux-kernel CPU-hotplug implementation has notifiers that
2310are used to allow the various kernel subsystems (including RCU) 2375are used to allow the various kernel subsystems (including RCU)
2311to respond appropriately to a given CPU-hotplug operation. 2376to respond appropriately to a given CPU-hotplug operation.
2312Most RCU operations may be invoked from CPU-hotplug notifiers, 2377Most RCU operations may be invoked from CPU-hotplug notifiers,
2313including even normal synchronous grace-period operations 2378including even synchronous grace-period operations such as
2314such as <tt>synchronize_rcu()</tt>. 2379<tt>synchronize_rcu()</tt> and <tt>synchronize_rcu_expedited()</tt>.
2315However, expedited grace-period operations such as
2316<tt>synchronize_rcu_expedited()</tt> are not supported,
2317due to the fact that current implementations block CPU-hotplug
2318operations, which could result in deadlock.
2319 2380
2320<p> 2381<p>
2321In addition, all-callback-wait operations such as 2382However, all-callback-wait operations such as
2322<tt>rcu_barrier()</tt> are also not supported, due to the 2383<tt>rcu_barrier()</tt> are also not supported, due to the
2323fact that there are phases of CPU-hotplug operations where 2384fact that there are phases of CPU-hotplug operations where
2324the outgoing CPU's callbacks will not be invoked until after 2385the outgoing CPU's callbacks will not be invoked until after
2325the CPU-hotplug operation ends, which could also result in deadlock. 2386the CPU-hotplug operation ends, which could also result in deadlock.
2387Furthermore, <tt>rcu_barrier()</tt> blocks CPU-hotplug operations
2388during its execution, which results in another type of deadlock
2389when invoked from a CPU-hotplug notifier.
2326 2390
2327<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> 2391<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
2328 2392
@@ -2864,6 +2928,27 @@ API, which, in combination with <tt>srcu_read_unlock()</tt>,
2864guarantees a full memory barrier. 2928guarantees a full memory barrier.
2865 2929
2866<p> 2930<p>
2931Also unlike other RCU flavors, SRCU's callbacks-wait function
2932<tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers,
2933though this is not necessarily a good idea.
2934The reason that this is possible is that SRCU is insensitive
2935to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
2936need not exclude CPU-hotplug operations.
2937
2938<p>
2939As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating
2940a locking bottleneck present in prior kernel versions.
2941Although this will allow users to put much heavier stress on
2942<tt>call_srcu()</tt>, it is important to note that SRCU does not
2943yet take any special steps to deal with callback flooding.
2944So if you are posting (say) 10,000 SRCU callbacks per second per CPU,
2945you are probably totally OK, but if you intend to post (say) 1,000,000
2946SRCU callbacks per second per CPU, please run some tests first.
2947SRCU just might need a few adjustments to deal with that sort of load.
2948Of course, your mileage may vary based on the speed of your CPUs and
2949the size of your memory.
2950
2951<p>
2867The 2952The
2868<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> 2953<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
2869includes 2954includes
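
To make the call_srcu() discussion in the hunk above concrete, a minimal and hypothetical usage sketch follows; my_srcu, struct foo, and foo_reclaim() are invented for this example.

	#include <linux/srcu.h>
	#include <linux/slab.h>
	#include <linux/kernel.h>

	DEFINE_SRCU(my_srcu);

	struct foo {
		struct rcu_head rh;
		int data;
	};

	static void foo_reclaim(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct foo, rh));
	}

	static void foo_free_after_srcu_gp(struct foo *fp)
	{
		/* foo_reclaim() runs once a my_srcu grace period has elapsed. */
		call_srcu(&my_srcu, &fp->rh, foo_reclaim);
	}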
@@ -3021,8 +3106,8 @@ to do some redesign to avoid this scalability problem.
3021 3106
3022<p> 3107<p>
3023RCU disables CPU hotplug in a few places, perhaps most notably in the 3108RCU disables CPU hotplug in a few places, perhaps most notably in the
3024expedited grace-period and <tt>rcu_barrier()</tt> operations. 3109<tt>rcu_barrier()</tt> operations.
3025If there is a strong reason to use expedited grace periods in CPU-hotplug 3110If there is a strong reason to use <tt>rcu_barrier()</tt> in CPU-hotplug
3026notifiers, it will be necessary to avoid disabling CPU hotplug. 3111notifiers, it will be necessary to avoid disabling CPU hotplug.
3027This would introduce some complexity, so there had better be a <i>very</i> 3112This would introduce some complexity, so there had better be a <i>very</i>
3028good reason. 3113good reason.
@@ -3096,9 +3181,5 @@ Andy Lutomirski for their help in rendering
3096this article human readable, and to Michelle Rankin for her support 3181this article human readable, and to Michelle Rankin for her support
3097of this effort. 3182of this effort.
3098Other contributions are acknowledged in the Linux kernel's git archive. 3183Other contributions are acknowledged in the Linux kernel's git archive.
3099The cartoon is copyright (c) 2013 by Melissa Broussard,
3100and is provided
3101under the terms of the Creative Commons Attribution-Share Alike 3.0
3102United States license.
3103 3184
3104</body></html> 3185</body></html>
diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt
index c0bf2441a2ba..b2a613f16d74 100644
--- a/Documentation/RCU/rcu_dereference.txt
+++ b/Documentation/RCU/rcu_dereference.txt
@@ -138,6 +138,15 @@ o Be very careful about comparing pointers obtained from
138 This sort of comparison occurs frequently when scanning 138 This sort of comparison occurs frequently when scanning
139 RCU-protected circular linked lists. 139 RCU-protected circular linked lists.
140 140
141 Note that if checks for being within an RCU read-side
142 critical section are not required and the pointer is never
143 dereferenced, rcu_access_pointer() should be used in place
144 of rcu_dereference(). The rcu_access_pointer() primitive
145 does not require an enclosing read-side critical section,
146 and also omits the smp_read_barrier_depends() included in
147 rcu_dereference(), which in turn should provide a small
148 performance gain in some CPUs (e.g., the DEC Alpha).
149
141 o The comparison is against a pointer that references memory 150 o The comparison is against a pointer that references memory
142 that was initialized "a long time ago." The reason 151 that was initialized "a long time ago." The reason
143 this is safe is that even if misordering occurs, the 152 this is safe is that even if misordering occurs, the
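
The rcu_access_pointer() advice added above boils down to cases like the following sketch, where the pointer is only tested and never dereferenced; gp and struct foo are placeholder names invented here.

	#include <linux/rcupdate.h>
	#include <linux/types.h>

	struct foo {
		int a;
	};

	static struct foo __rcu *gp;

	static bool gp_is_published(void)
	{
		/* No rcu_read_lock() needed: the pointer is never dereferenced. */
		return rcu_access_pointer(gp) != NULL;
	}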
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
index 18f9651ff23d..8151f0195f76 100644
--- a/Documentation/RCU/rculist_nulls.txt
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -1,5 +1,5 @@
1Using hlist_nulls to protect read-mostly linked lists and 1Using hlist_nulls to protect read-mostly linked lists and
2objects using SLAB_DESTROY_BY_RCU allocations. 2objects using SLAB_TYPESAFE_BY_RCU allocations.
3 3
4Please read the basics in Documentation/RCU/listRCU.txt 4Please read the basics in Documentation/RCU/listRCU.txt
5 5
@@ -7,7 +7,7 @@ Using special makers (called 'nulls') is a convenient way
7to solve following problem : 7to solve following problem :
8 8
9A typical RCU linked list managing objects which are 9A typical RCU linked list managing objects which are
10allocated with SLAB_DESTROY_BY_RCU kmem_cache can 10allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can
11use following algos : 11use following algos :
12 12
131) Lookup algo 131) Lookup algo
@@ -96,7 +96,7 @@ unlock_chain(); // typically a spin_unlock()
963) Remove algo 963) Remove algo
97-------------- 97--------------
98Nothing special here, we can use a standard RCU hlist deletion. 98Nothing special here, we can use a standard RCU hlist deletion.
99But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused 99But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused
100very very fast (before the end of RCU grace period) 100very very fast (before the end of RCU grace period)
101 101
102if (put_last_reference_on(obj) { 102if (put_last_reference_on(obj) {
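
For reference, a cache using the renamed flag is created as in the following hedged sketch ("foo" and its fields are placeholders). Because SLAB_TYPESAFE_BY_RCU allows a freed object to be reused before a grace period ends, lookups must revalidate the object after taking a reference, exactly as the algorithms in this file describe.

	#include <linux/slab.h>
	#include <linux/list_nulls.h>
	#include <linux/atomic.h>
	#include <linux/init.h>
	#include <linux/errno.h>

	struct foo {
		struct hlist_nulls_node node;
		unsigned int key;
		atomic_t refcnt;
	};

	static struct kmem_cache *foo_cache;

	static int __init foo_cache_init(void)
	{
		foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
					      SLAB_TYPESAFE_BY_RCU, NULL);
		return foo_cache ? 0 : -ENOMEM;
	}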
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index e93d04133fe7..96a3d81837e1 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -1,9 +1,102 @@
1Using RCU's CPU Stall Detector 1Using RCU's CPU Stall Detector
2 2
3The rcu_cpu_stall_suppress module parameter enables RCU's CPU stall 3This document first discusses what sorts of issues RCU's CPU stall
4detector, which detects conditions that unduly delay RCU grace periods. 4detector can locate, and then discusses kernel parameters and Kconfig
5This module parameter enables CPU stall detection by default, but 5options that can be used to fine-tune the detector's operation. Finally,
6may be overridden via boot-time parameter or at runtime via sysfs. 6this document explains the stall detector's "splat" format.
7
8
9What Causes RCU CPU Stall Warnings?
10
11So your kernel printed an RCU CPU stall warning. The next question is
12"What caused it?" The following problems can result in RCU CPU stall
13warnings:
14
15o A CPU looping in an RCU read-side critical section.
16
17o A CPU looping with interrupts disabled.
18
19o A CPU looping with preemption disabled. This condition can
20 result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
21 stalls.
22
23o A CPU looping with bottom halves disabled. This condition can
24 result in RCU-sched and RCU-bh stalls.
25
26o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
27 kernel without invoking schedule(). Note that cond_resched()
28 does not necessarily prevent RCU CPU stall warnings. Therefore,
29 if the looping in the kernel is really expected and desirable
30 behavior, you might need to replace some of the cond_resched()
31 calls with calls to cond_resched_rcu_qs().
32
33o Booting Linux using a console connection that is too slow to
34 keep up with the boot-time console-message rate. For example,
35 a 115Kbaud serial console can be -way- too slow to keep up
36 with boot-time message rates, and will frequently result in
37 RCU CPU stall warning messages. Especially if you have added
38 debug printk()s.
39
40o Anything that prevents RCU's grace-period kthreads from running.
41 This can result in the "All QSes seen" console-log message.
42 This message will include information on when the kthread last
43 ran and how often it should be expected to run.
44
45o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
46 happen to preempt a low-priority task in the middle of an RCU
47 read-side critical section. This is especially damaging if
48 that low-priority task is not permitted to run on any other CPU,
49 in which case the next RCU grace period can never complete, which
50 will eventually cause the system to run out of memory and hang.
51 While the system is in the process of running itself out of
52 memory, you might see stall-warning messages.
53
54o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
55 is running at a higher priority than the RCU softirq threads.
56 This will prevent RCU callbacks from ever being invoked,
57 and in a CONFIG_PREEMPT_RCU kernel will further prevent
58 RCU grace periods from ever completing. Either way, the
59 system will eventually run out of memory and hang. In the
60 CONFIG_PREEMPT_RCU case, you might see stall-warning
61 messages.
62
63o A hardware or software issue shuts off the scheduler-clock
64 interrupt on a CPU that is not in dyntick-idle mode. This
65 problem really has happened, and seems to be most likely to
66 result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
67
68o A bug in the RCU implementation.
69
70o A hardware failure. This is quite unlikely, but has occurred
71 at least once in real life. A CPU failed in a running system,
72 becoming unresponsive, but not causing an immediate crash.
73 This resulted in a series of RCU CPU stall warnings, eventually
74 leading to the realization that the CPU had failed.
75
76The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
77warning. Note that SRCU does -not- have CPU stall warnings. Please note
78that RCU only detects CPU stalls when there is a grace period in progress.
79No grace period, no CPU stall warnings.
80
81To diagnose the cause of the stall, inspect the stack traces.
82The offending function will usually be near the top of the stack.
83If you have a series of stall warnings from a single extended stall,
84comparing the stack traces can often help determine where the stall
85is occurring, which will usually be in the function nearest the top of
86that portion of the stack which remains the same from trace to trace.
87If you can reliably trigger the stall, ftrace can be quite helpful.
88
89RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
90and with RCU's event tracing. For information on RCU's event tracing,
91see include/trace/events/rcu.h.
92
93
94Fine-Tuning the RCU CPU Stall Detector
95
96The rcuupdate.rcu_cpu_stall_suppress module parameter disables RCU's
97CPU stall detector, which detects conditions that unduly delay RCU grace
98periods. This module parameter enables CPU stall detection by default,
99but may be overridden via boot-time parameter or at runtime via sysfs.
7The stall detector's idea of what constitutes "unduly delayed" is 100The stall detector's idea of what constitutes "unduly delayed" is
8controlled by a set of kernel configuration variables and cpp macros: 101controlled by a set of kernel configuration variables and cpp macros:
9 102
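
The cond_resched_rcu_qs() advice in the list above amounts to patterns like the following sketch; struct item and process_one_item() are placeholders invented for this example.

	#include <linux/list.h>
	#include <linux/rcupdate.h>
	#include <linux/sched.h>

	struct item {
		struct list_head list;
		int payload;
	};

	static void process_one_item(struct item *it)
	{
		it->payload++;		/* stand-in for real per-item work */
	}

	static void process_many_items(struct list_head *items)
	{
		struct item *it;

		list_for_each_entry(it, items, list) {
			process_one_item(it);
			/*
			 * cond_resched() alone might not report a quiescent
			 * state; cond_resched_rcu_qs() does, so long loops
			 * like this one avoid RCU CPU stall warnings.
			 */
			cond_resched_rcu_qs();
		}
	}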
@@ -56,6 +149,9 @@ rcupdate.rcu_task_stall_timeout
56 And continues with the output of sched_show_task() for each 149 And continues with the output of sched_show_task() for each
57 task stalling the current RCU-tasks grace period. 150 task stalling the current RCU-tasks grace period.
58 151
152
153Interpreting RCU's CPU Stall-Detector "Splats"
154
59For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling, 155For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling,
60it will print a message similar to the following: 156it will print a message similar to the following:
61 157
@@ -178,89 +274,3 @@ grace period is in flight.
178 274
179It is entirely possible to see stall warnings from normal and from 275It is entirely possible to see stall warnings from normal and from
180expedited grace periods at about the same time from the same run. 276expedited grace periods at about the same time from the same run.
181
182
183What Causes RCU CPU Stall Warnings?
184
185So your kernel printed an RCU CPU stall warning. The next question is
186"What caused it?" The following problems can result in RCU CPU stall
187warnings:
188
189o A CPU looping in an RCU read-side critical section.
190
191o A CPU looping with interrupts disabled. This condition can
192 result in RCU-sched and RCU-bh stalls.
193
194o A CPU looping with preemption disabled. This condition can
195 result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
196 stalls.
197
198o A CPU looping with bottom halves disabled. This condition can
199 result in RCU-sched and RCU-bh stalls.
200
201o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
202 kernel without invoking schedule(). Note that cond_resched()
203 does not necessarily prevent RCU CPU stall warnings. Therefore,
204 if the looping in the kernel is really expected and desirable
205 behavior, you might need to replace some of the cond_resched()
206 calls with calls to cond_resched_rcu_qs().
207
208o Booting Linux using a console connection that is too slow to
209 keep up with the boot-time console-message rate. For example,
210 a 115Kbaud serial console can be -way- too slow to keep up
211 with boot-time message rates, and will frequently result in
212 RCU CPU stall warning messages. Especially if you have added
213 debug printk()s.
214
215o Anything that prevents RCU's grace-period kthreads from running.
216 This can result in the "All QSes seen" console-log message.
217 This message will include information on when the kthread last
218 ran and how often it should be expected to run.
219
220o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
221 happen to preempt a low-priority task in the middle of an RCU
222 read-side critical section. This is especially damaging if
223 that low-priority task is not permitted to run on any other CPU,
224 in which case the next RCU grace period can never complete, which
225 will eventually cause the system to run out of memory and hang.
226 While the system is in the process of running itself out of
227 memory, you might see stall-warning messages.
228
229o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
230 is running at a higher priority than the RCU softirq threads.
231 This will prevent RCU callbacks from ever being invoked,
232 and in a CONFIG_PREEMPT_RCU kernel will further prevent
233 RCU grace periods from ever completing. Either way, the
234 system will eventually run out of memory and hang. In the
235 CONFIG_PREEMPT_RCU case, you might see stall-warning
236 messages.
237
238o A hardware or software issue shuts off the scheduler-clock
239 interrupt on a CPU that is not in dyntick-idle mode. This
240 problem really has happened, and seems to be most likely to
241 result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
242
243o A bug in the RCU implementation.
244
245o A hardware failure. This is quite unlikely, but has occurred
246 at least once in real life. A CPU failed in a running system,
247 becoming unresponsive, but not causing an immediate crash.
248 This resulted in a series of RCU CPU stall warnings, eventually
249 leading the realization that the CPU had failed.
250
251The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
252warning. Note that SRCU does -not- have CPU stall warnings. Please note
253that RCU only detects CPU stalls when there is a grace period in progress.
254No grace period, no CPU stall warnings.
255
256To diagnose the cause of the stall, inspect the stack traces.
257The offending function will usually be near the top of the stack.
258If you have a series of stall warnings from a single extended stall,
259comparing the stack traces can often help determine where the stall
260is occurring, which will usually be in the function nearest the top of
261that portion of the stack which remains the same from trace to trace.
262If you can reliably trigger the stall, ftrace can be quite helpful.
263
264RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
265and with RCU's event tracing. For information on RCU's event tracing,
266see include/trace/events/rcu.h.
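The relocated text above recommends cond_resched_rcu_qs() for loops that
legitimately run for a long time in !CONFIG_PREEMPT kernels. A minimal sketch
of that pattern follows; the thread function and scan_one_bucket() are
hypothetical placeholders, and only cond_resched_rcu_qs() itself comes from
the documentation:

    #include <linux/kthread.h>      /* kthread_should_stop() */
    #include <linux/rcupdate.h>     /* cond_resched_rcu_qs() */

    static int example_scan_thread(void *unused)
    {
            while (!kthread_should_stop()) {
                    scan_one_bucket();      /* placeholder for the real work */

                    /*
                     * cond_resched() alone need not report a quiescent
                     * state; cond_resched_rcu_qs() also notes a voluntary
                     * context switch to RCU, including Tasks RCU.
                     */
                    cond_resched_rcu_qs();
            }
            return 0;
    }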
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 5cbd8b2395b8..8ed6c9f6133c 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -562,7 +562,9 @@ This section presents a "toy" RCU implementation that is based on
562familiar locking primitives. Its overhead makes it a non-starter for 562familiar locking primitives. Its overhead makes it a non-starter for
563real-life use, as does its lack of scalability. It is also unsuitable 563real-life use, as does its lack of scalability. It is also unsuitable
564for realtime use, since it allows scheduling latency to "bleed" from 564for realtime use, since it allows scheduling latency to "bleed" from
565one read-side critical section to another. 565one read-side critical section to another. It also assumes recursive
566reader-writer locks: If you try this with non-recursive locks, and
567you allow nested rcu_read_lock() calls, you can deadlock.
566 568
567However, it is probably the easiest implementation to relate to, so is 569However, it is probably the easiest implementation to relate to, so is
568a good starting point. 570a good starting point.
@@ -587,20 +589,21 @@ It is extremely simple:
587 write_unlock(&rcu_gp_mutex); 589 write_unlock(&rcu_gp_mutex);
588 } 590 }
589 591
590[You can ignore rcu_assign_pointer() and rcu_dereference() without 592[You can ignore rcu_assign_pointer() and rcu_dereference() without missing
591missing much. But here they are anyway. And whatever you do, don't 593much. But here are simplified versions anyway. And whatever you do,
592forget about them when submitting patches making use of RCU!] 594don't forget about them when submitting patches making use of RCU!]
593 595
594 #define rcu_assign_pointer(p, v) ({ \ 596 #define rcu_assign_pointer(p, v) \
595 smp_wmb(); \ 597 ({ \
596 (p) = (v); \ 598 smp_store_release(&(p), (v)); \
597 }) 599 })
598 600
599 #define rcu_dereference(p) ({ \ 601 #define rcu_dereference(p) \
600 typeof(p) _________p1 = p; \ 602 ({ \
601 smp_read_barrier_depends(); \ 603 typeof(p) _________p1 = p; \
602 (_________p1); \ 604 smp_read_barrier_depends(); \
603 }) 605 (_________p1); \
606 })
604 607
605 608
606The rcu_read_lock() and rcu_read_unlock() primitive read-acquire 609The rcu_read_lock() and rcu_read_unlock() primitive read-acquire
@@ -925,7 +928,8 @@ d. Do you need RCU grace periods to complete even in the face
925 928
926e. Is your workload too update-intensive for normal use of 929e. Is your workload too update-intensive for normal use of
927 RCU, but inappropriate for other synchronization mechanisms? 930 RCU, but inappropriate for other synchronization mechanisms?
928 If so, consider SLAB_DESTROY_BY_RCU. But please be careful! 931 If so, consider SLAB_TYPESAFE_BY_RCU (which was originally
932 named SLAB_DESTROY_BY_RCU). But please be careful!
929 933
930f. Do you need read-side critical sections that are respected 934f. Do you need read-side critical sections that are respected
931 even though they are in the middle of the idle loop, during 935 even though they are in the middle of the idle loop, during
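As a usage sketch of the simplified rcu_assign_pointer()/rcu_dereference()
pair shown in the hunk above: the structure, the gptr variable, and the two
functions below are illustrative only, not part of the patch.

    struct foo {
            int a;
    };
    static struct foo __rcu *gptr;

    /* Updater: initialize first, then publish. */
    void publish_foo(struct foo *newp)
    {
            newp->a = 42;
            rcu_assign_pointer(gptr, newp); /* smp_store_release() orders the init */
    }

    /* Reader: must run under rcu_read_lock(). */
    int read_foo(void)
    {
            struct foo *p;
            int val = -1;

            rcu_read_lock();
            p = rcu_dereference(gptr);      /* dependency-ordered load */
            if (p)
                    val = p->a;
            rcu_read_unlock();
            return val;
    }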
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 130e7ecaf9a6..4e0654b56aef 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3800,6 +3800,14 @@
3800 spia_pedr= 3800 spia_pedr=
3801 spia_peddr= 3801 spia_peddr=
3802 3802
3803 srcutree.exp_holdoff [KNL]
3804 Specifies how many nanoseconds must elapse
3805 since the end of the last SRCU grace period for
3806 a given srcu_struct until the next normal SRCU
3807 grace period will be considered for automatic
3808 expediting. Set to zero to disable automatic
3809 expediting.
3810
3803 stacktrace [FTRACE] 3811 stacktrace [FTRACE]
3804 Enabled the stack tracer on boot up. 3812 Enabled the stack tracer on boot up.
3805 3813
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index d2b0a8d81258..08329cb857ed 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -768,7 +768,7 @@ equal to zero, in which case the compiler is within its rights to
768transform the above code into the following: 768transform the above code into the following:
769 769
770 q = READ_ONCE(a); 770 q = READ_ONCE(a);
771 WRITE_ONCE(b, 1); 771 WRITE_ONCE(b, 2);
772 do_something_else(); 772 do_something_else();
773 773
774Given this transformation, the CPU is not required to respect the ordering 774Given this transformation, the CPU is not required to respect the ordering
diff --git a/arch/Kconfig b/arch/Kconfig
index dcbd462b68b1..6c00e5b00f8b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -324,6 +324,9 @@ config HAVE_CMPXCHG_LOCAL
324config HAVE_CMPXCHG_DOUBLE 324config HAVE_CMPXCHG_DOUBLE
325 bool 325 bool
326 326
327config ARCH_WEAK_RELEASE_ACQUIRE
328 bool
329
327config ARCH_WANT_IPC_PARSE_VERSION 330config ARCH_WANT_IPC_PARSE_VERSION
328 bool 331 bool
329 332
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d8834e8bfb05..964da1891ea9 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -146,6 +146,7 @@ config PPC
146 select ARCH_USE_BUILTIN_BSWAP 146 select ARCH_USE_BUILTIN_BSWAP
147 select ARCH_USE_CMPXCHG_LOCKREF if PPC64 147 select ARCH_USE_CMPXCHG_LOCKREF if PPC64
148 select ARCH_WANT_IPC_PARSE_VERSION 148 select ARCH_WANT_IPC_PARSE_VERSION
149 select ARCH_WEAK_RELEASE_ACQUIRE
149 select BINFMT_ELF 150 select BINFMT_ELF
150 select BUILDTIME_EXTABLE_SORT 151 select BUILDTIME_EXTABLE_SORT
151 select CLONE_BACKWARDS 152 select CLONE_BACKWARDS
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 532a577ff7a1..b6ac3df18b58 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4789,7 +4789,7 @@ i915_gem_load_init(struct drm_i915_private *dev_priv)
4789 dev_priv->requests = KMEM_CACHE(drm_i915_gem_request, 4789 dev_priv->requests = KMEM_CACHE(drm_i915_gem_request,
4790 SLAB_HWCACHE_ALIGN | 4790 SLAB_HWCACHE_ALIGN |
4791 SLAB_RECLAIM_ACCOUNT | 4791 SLAB_RECLAIM_ACCOUNT |
4792 SLAB_DESTROY_BY_RCU); 4792 SLAB_TYPESAFE_BY_RCU);
4793 if (!dev_priv->requests) 4793 if (!dev_priv->requests)
4794 goto err_vmas; 4794 goto err_vmas;
4795 4795
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index a211c53c813f..129c58bb4805 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -521,7 +521,7 @@ static inline struct drm_i915_gem_request *
521__i915_gem_active_get_rcu(const struct i915_gem_active *active) 521__i915_gem_active_get_rcu(const struct i915_gem_active *active)
522{ 522{
523 /* Performing a lockless retrieval of the active request is super 523 /* Performing a lockless retrieval of the active request is super
524 * tricky. SLAB_DESTROY_BY_RCU merely guarantees that the backing 524 * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing
525 * slab of request objects will not be freed whilst we hold the 525 * slab of request objects will not be freed whilst we hold the
526 * RCU read lock. It does not guarantee that the request itself 526 * RCU read lock. It does not guarantee that the request itself
527 * will not be freed and then *reused*. Viz, 527 * will not be freed and then *reused*. Viz,
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 6a8258eacdcb..9f24c5da3f8d 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -174,7 +174,7 @@ struct drm_i915_private *mock_gem_device(void)
174 i915->requests = KMEM_CACHE(mock_request, 174 i915->requests = KMEM_CACHE(mock_request,
175 SLAB_HWCACHE_ALIGN | 175 SLAB_HWCACHE_ALIGN |
176 SLAB_RECLAIM_ACCOUNT | 176 SLAB_RECLAIM_ACCOUNT |
177 SLAB_DESTROY_BY_RCU); 177 SLAB_TYPESAFE_BY_RCU);
178 if (!i915->requests) 178 if (!i915->requests)
179 goto err_vmas; 179 goto err_vmas;
180 180
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
index 6f9d540a97ce..fff930fc3cff 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c
@@ -1115,7 +1115,7 @@ int ldlm_init(void)
1115 ldlm_lock_slab = kmem_cache_create("ldlm_locks", 1115 ldlm_lock_slab = kmem_cache_create("ldlm_locks",
1116 sizeof(struct ldlm_lock), 0, 1116 sizeof(struct ldlm_lock), 0,
1117 SLAB_HWCACHE_ALIGN | 1117 SLAB_HWCACHE_ALIGN |
1118 SLAB_DESTROY_BY_RCU, NULL); 1118 SLAB_TYPESAFE_BY_RCU, NULL);
1119 if (!ldlm_lock_slab) { 1119 if (!ldlm_lock_slab) {
1120 kmem_cache_destroy(ldlm_resource_slab); 1120 kmem_cache_destroy(ldlm_resource_slab);
1121 return -ENOMEM; 1121 return -ENOMEM;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5a0245e36240..ebad34266bcf 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2363,7 +2363,7 @@ static int jbd2_journal_init_journal_head_cache(void)
2363 jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", 2363 jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
2364 sizeof(struct journal_head), 2364 sizeof(struct journal_head),
2365 0, /* offset */ 2365 0, /* offset */
2366 SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, 2366 SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
2367 NULL); /* ctor */ 2367 NULL); /* ctor */
2368 retval = 0; 2368 retval = 0;
2369 if (!jbd2_journal_head_cache) { 2369 if (!jbd2_journal_head_cache) {
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 270221fcef42..7e3d71109f51 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -38,7 +38,7 @@ void signalfd_cleanup(struct sighand_struct *sighand)
38 /* 38 /*
39 * The lockless check can race with remove_wait_queue() in progress, 39 * The lockless check can race with remove_wait_queue() in progress,
40 * but in this case its caller should run under rcu_read_lock() and 40 * but in this case its caller should run under rcu_read_lock() and
41 * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. 41 * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return.
42 */ 42 */
43 if (likely(!waitqueue_active(wqh))) 43 if (likely(!waitqueue_active(wqh)))
44 return; 44 return;
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index 6048fa404e57..a5195a7d6f77 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -229,7 +229,7 @@ static inline struct dma_fence *dma_fence_get_rcu(struct dma_fence *fence)
229 * 229 *
230 * Function returns NULL if no refcount could be obtained, or the fence. 230 * Function returns NULL if no refcount could be obtained, or the fence.
231 * This function handles acquiring a reference to a fence that may be 231 * This function handles acquiring a reference to a fence that may be
232 * reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU), 232 * reallocated within the RCU grace period (such as with SLAB_TYPESAFE_BY_RCU),
233 * so long as the caller is using RCU on the pointer to the fence. 233 * so long as the caller is using RCU on the pointer to the fence.
234 * 234 *
235 * An alternative mechanism is to employ a seqlock to protect a bunch of 235 * An alternative mechanism is to employ a seqlock to protect a bunch of
@@ -257,7 +257,7 @@ dma_fence_get_rcu_safe(struct dma_fence * __rcu *fencep)
257 * have successfully acquire a reference to it. If it no 257 * have successfully acquire a reference to it. If it no
258 * longer matches, we are holding a reference to some other 258 * longer matches, we are holding a reference to some other
259 * reallocated pointer. This is possible if the allocator 259 * reallocated pointer. This is possible if the allocator
260 * is using a freelist like SLAB_DESTROY_BY_RCU where the 260 * is using a freelist like SLAB_TYPESAFE_BY_RCU where the
261 * fence remains valid for the RCU grace period, but it 261 * fence remains valid for the RCU grace period, but it
262 * may be reallocated. When using such allocators, we are 262 * may be reallocated. When using such allocators, we are
263 * responsible for ensuring the reference we get is to 263 * responsible for ensuring the reference we get is to
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4d629471869b..2b12b2683359 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -384,8 +384,6 @@ struct kvm {
384 struct mutex slots_lock; 384 struct mutex slots_lock;
385 struct mm_struct *mm; /* userspace tied to this vm */ 385 struct mm_struct *mm; /* userspace tied to this vm */
386 struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; 386 struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM];
387 struct srcu_struct srcu;
388 struct srcu_struct irq_srcu;
389 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; 387 struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
390 388
391 /* 389 /*
@@ -438,6 +436,8 @@ struct kvm {
438 struct list_head devices; 436 struct list_head devices;
439 struct dentry *debugfs_dentry; 437 struct dentry *debugfs_dentry;
440 struct kvm_stat_data **debugfs_stat_data; 438 struct kvm_stat_data **debugfs_stat_data;
439 struct srcu_struct srcu;
440 struct srcu_struct irq_srcu;
441}; 441};
442 442
443#define kvm_err(fmt, ...) \ 443#define kvm_err(fmt, ...) \
diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h
new file mode 100644
index 000000000000..4b766b61e1a0
--- /dev/null
+++ b/include/linux/rcu_node_tree.h
@@ -0,0 +1,99 @@
1/*
2 * RCU node combining tree definitions. These are used to compute
3 * global attributes while avoiding common-case global contention. A key
4 * property that these computations rely on is a tournament-style approach
5 * where only one of the tasks contending a lower level in the tree need
6 * advance to the next higher level. If properly configured, this allows
7 * unlimited scalability while maintaining a constant level of contention
8 * on the root node.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, you can access it online at
22 * http://www.gnu.org/licenses/gpl-2.0.html.
23 *
24 * Copyright IBM Corporation, 2017
25 *
26 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
27 */
28
29#ifndef __LINUX_RCU_NODE_TREE_H
30#define __LINUX_RCU_NODE_TREE_H
31
32/*
33 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
34 * CONFIG_RCU_FANOUT_LEAF.
35 * In theory, it should be possible to add more levels straightforwardly.
36 * In practice, this did work well going from three levels to four.
37 * Of course, your mileage may vary.
38 */
39
40#ifdef CONFIG_RCU_FANOUT
41#define RCU_FANOUT CONFIG_RCU_FANOUT
42#else /* #ifdef CONFIG_RCU_FANOUT */
43# ifdef CONFIG_64BIT
44# define RCU_FANOUT 64
45# else
46# define RCU_FANOUT 32
47# endif
48#endif /* #else #ifdef CONFIG_RCU_FANOUT */
49
50#ifdef CONFIG_RCU_FANOUT_LEAF
51#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
52#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
53#define RCU_FANOUT_LEAF 16
54#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
55
56#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
57#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
58#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
59#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
60
61#if NR_CPUS <= RCU_FANOUT_1
62# define RCU_NUM_LVLS 1
63# define NUM_RCU_LVL_0 1
64# define NUM_RCU_NODES NUM_RCU_LVL_0
65# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
66# define RCU_NODE_NAME_INIT { "rcu_node_0" }
67# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
68#elif NR_CPUS <= RCU_FANOUT_2
69# define RCU_NUM_LVLS 2
70# define NUM_RCU_LVL_0 1
71# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
72# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
73# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
74# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
75# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
76#elif NR_CPUS <= RCU_FANOUT_3
77# define RCU_NUM_LVLS 3
78# define NUM_RCU_LVL_0 1
79# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
80# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
81# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
82# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
83# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
84# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
85#elif NR_CPUS <= RCU_FANOUT_4
86# define RCU_NUM_LVLS 4
87# define NUM_RCU_LVL_0 1
88# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
89# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
90# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
91# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
92# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
93# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
94# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
95#else
96# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
97#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
98
99#endif /* __LINUX_RCU_NODE_TREE_H */
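A worked example of the shape selection above may help; the numbers are
chosen purely for illustration and do not correspond to any particular
configuration:

    /* Assume RCU_FANOUT = 64, RCU_FANOUT_LEAF = 16, NR_CPUS = 4096.   */
    /* RCU_FANOUT_1 = 16, RCU_FANOUT_2 = 1024, RCU_FANOUT_3 = 65536.   */
    /* 4096 > RCU_FANOUT_2 but <= RCU_FANOUT_3, so RCU_NUM_LVLS = 3:   */
    /* NUM_RCU_LVL_0 = 1                                               */
    /* NUM_RCU_LVL_1 = DIV_ROUND_UP(4096, 1024) = 4                    */
    /* NUM_RCU_LVL_2 = DIV_ROUND_UP(4096, 16)   = 256                  */
    /* NUM_RCU_NODES = 1 + 4 + 256 = 261                               */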
diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
new file mode 100644
index 000000000000..ba4d2621d9ca
--- /dev/null
+++ b/include/linux/rcu_segcblist.h
@@ -0,0 +1,90 @@
1/*
2 * RCU segmented callback lists
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright IBM Corporation, 2017
19 *
20 * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#ifndef __INCLUDE_LINUX_RCU_SEGCBLIST_H
24#define __INCLUDE_LINUX_RCU_SEGCBLIST_H
25
26/* Simple unsegmented callback lists. */
27struct rcu_cblist {
28 struct rcu_head *head;
29 struct rcu_head **tail;
30 long len;
31 long len_lazy;
32};
33
34#define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head }
35
36/* Complicated segmented callback lists. ;-) */
37
38/*
39 * Index values for segments in rcu_segcblist structure.
40 *
41 * The segments are as follows:
42 *
43 * [head, *tails[RCU_DONE_TAIL]):
44 * Callbacks whose grace period has elapsed, and thus can be invoked.
45 * [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]):
46 * Callbacks waiting for the current GP from the current CPU's viewpoint.
47 * [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]):
48 * Callbacks that arrived before the next GP started, again from
49 * the current CPU's viewpoint. These can be handled by the next GP.
50 * [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]):
51 * Callbacks that might have arrived after the next GP started.
52 * There is some uncertainty as to when a given GP starts and
53 * ends, but a CPU knows the exact times if it is the one starting
54 * or ending the GP. Other CPUs know that the previous GP ends
55 * before the next one starts.
56 *
57 * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also
58 * empty.
59 *
60 * The ->gp_seq[] array contains the grace-period number at which the
61 * corresponding segment of callbacks will be ready to invoke. A given
62 * element of this array is meaningful only when the corresponding segment
63 * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks
64 * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have
65 * not yet been assigned a grace-period number).
66 */
67#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
68#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
69#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
70#define RCU_NEXT_TAIL 3
71#define RCU_CBLIST_NSEGS 4
72
73struct rcu_segcblist {
74 struct rcu_head *head;
75 struct rcu_head **tails[RCU_CBLIST_NSEGS];
76 unsigned long gp_seq[RCU_CBLIST_NSEGS];
77 long len;
78 long len_lazy;
79};
80
81#define RCU_SEGCBLIST_INITIALIZER(n) \
82{ \
83 .head = NULL, \
84 .tails[RCU_DONE_TAIL] = &n.head, \
85 .tails[RCU_WAIT_TAIL] = &n.head, \
86 .tails[RCU_NEXT_READY_TAIL] = &n.head, \
87 .tails[RCU_NEXT_TAIL] = &n.head, \
88}
89
90#endif /* __INCLUDE_LINUX_RCU_SEGCBLIST_H */
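To make the segment layout above concrete, here is a sketch of a statically
initialized (and still empty) segmented list; the example_cbs name is
illustrative:

    static struct rcu_segcblist example_cbs =
            RCU_SEGCBLIST_INITIALIZER(example_cbs);

    /*
     * While empty, ->head is NULL and all four ->tails[] entries point at
     * &example_cbs.head, so each segment is the empty range [head, head).
     * New callbacks are enqueued at *tails[RCU_NEXT_TAIL]; as grace periods
     * are assigned and then complete, the earlier tail pointers advance and
     * callbacks migrate toward the RCU_DONE_TAIL segment, whose callbacks
     * are ready to invoke.
     */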
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 4f7a9561b8c4..b1fd8bf85fdc 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -509,7 +509,8 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n,
509{ 509{
510 struct hlist_node *i, *last = NULL; 510 struct hlist_node *i, *last = NULL;
511 511
512 for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) 512 /* Note: write side code, so rcu accessors are not needed. */
513 for (i = h->first; i; i = i->next)
513 last = i; 514 last = i;
514 515
515 if (last) { 516 if (last) {
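For context, a sketch of a typical hlist_add_tail_rcu() caller: updates are
serialized by a lock (which is why the loop above can use plain pointers),
while lookups traverse the list under rcu_read_lock(). The structure, list
head, and lock below are hypothetical:

    struct item {
            struct hlist_node node;
            int key;
    };
    static HLIST_HEAD(example_head);
    static DEFINE_SPINLOCK(example_lock);

    void add_item(struct item *it)
    {
            spin_lock(&example_lock);       /* serializes all writers */
            hlist_add_tail_rcu(&it->node, &example_head);
            spin_unlock(&example_lock);
    }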
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index dea8f17b2fe3..e1e5d002fdb9 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -368,15 +368,20 @@ static inline void rcu_init_nohz(void)
368#ifdef CONFIG_TASKS_RCU 368#ifdef CONFIG_TASKS_RCU
369#define TASKS_RCU(x) x 369#define TASKS_RCU(x) x
370extern struct srcu_struct tasks_rcu_exit_srcu; 370extern struct srcu_struct tasks_rcu_exit_srcu;
371#define rcu_note_voluntary_context_switch(t) \ 371#define rcu_note_voluntary_context_switch_lite(t) \
372 do { \ 372 do { \
373 rcu_all_qs(); \
374 if (READ_ONCE((t)->rcu_tasks_holdout)) \ 373 if (READ_ONCE((t)->rcu_tasks_holdout)) \
375 WRITE_ONCE((t)->rcu_tasks_holdout, false); \ 374 WRITE_ONCE((t)->rcu_tasks_holdout, false); \
376 } while (0) 375 } while (0)
376#define rcu_note_voluntary_context_switch(t) \
377 do { \
378 rcu_all_qs(); \
379 rcu_note_voluntary_context_switch_lite(t); \
380 } while (0)
377#else /* #ifdef CONFIG_TASKS_RCU */ 381#else /* #ifdef CONFIG_TASKS_RCU */
378#define TASKS_RCU(x) do { } while (0) 382#define TASKS_RCU(x) do { } while (0)
379#define rcu_note_voluntary_context_switch(t) rcu_all_qs() 383#define rcu_note_voluntary_context_switch_lite(t) do { } while (0)
384#define rcu_note_voluntary_context_switch(t) rcu_all_qs()
380#endif /* #else #ifdef CONFIG_TASKS_RCU */ 385#endif /* #else #ifdef CONFIG_TASKS_RCU */
381 386
382/** 387/**
@@ -1132,11 +1137,11 @@ do { \
1132 * if the UNLOCK and LOCK are executed by the same CPU or if the 1137 * if the UNLOCK and LOCK are executed by the same CPU or if the
1133 * UNLOCK and LOCK operate on the same lock variable. 1138 * UNLOCK and LOCK operate on the same lock variable.
1134 */ 1139 */
1135#ifdef CONFIG_PPC 1140#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE
1136#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ 1141#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */
1137#else /* #ifdef CONFIG_PPC */ 1142#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
1138#define smp_mb__after_unlock_lock() do { } while (0) 1143#define smp_mb__after_unlock_lock() do { } while (0)
1139#endif /* #else #ifdef CONFIG_PPC */ 1144#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
1140 1145
1141 1146
1142#endif /* __LINUX_RCUPDATE_H */ 1147#endif /* __LINUX_RCUPDATE_H */
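The hunk above keys smp_mb__after_unlock_lock() off the new
ARCH_WEAK_RELEASE_ACQUIRE option rather than CONFIG_PPC directly. A minimal
sketch of where the barrier goes; the example_rq structure and hand_off()
function are hypothetical:

    struct example_rq {
            spinlock_t lock;
            /* ... */
    };

    static void hand_off(struct example_rq *oldrq, struct example_rq *newrq)
    {
            spin_unlock(&oldrq->lock);
            spin_lock(&newrq->lock);
            smp_mb__after_unlock_lock();    /* smp_mb() only on arches that  */
                                            /* select WEAK_RELEASE_ACQUIRE   */
            /*
             * Accesses preceding the unlock are now fully ordered against
             * accesses following the lock, even on weakly ordered CPUs.
             */
    }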
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index b452953e21c8..74d9c3a1feee 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -33,6 +33,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
33 return 0; 33 return 0;
34} 34}
35 35
36static inline bool rcu_eqs_special_set(int cpu)
37{
38 return false; /* Never flag non-existent other CPUs! */
39}
40
36static inline unsigned long get_state_synchronize_rcu(void) 41static inline unsigned long get_state_synchronize_rcu(void)
37{ 42{
38 return 0; 43 return 0;
@@ -87,10 +92,11 @@ static inline void kfree_call_rcu(struct rcu_head *head,
87 call_rcu(head, func); 92 call_rcu(head, func);
88} 93}
89 94
90static inline void rcu_note_context_switch(void) 95#define rcu_note_context_switch(preempt) \
91{ 96 do { \
92 rcu_sched_qs(); 97 rcu_sched_qs(); \
93} 98 rcu_note_voluntary_context_switch_lite(current); \
99 } while (0)
94 100
95/* 101/*
96 * Take advantage of the fact that there is only one CPU, which 102 * Take advantage of the fact that there is only one CPU, which
@@ -212,14 +218,14 @@ static inline void exit_rcu(void)
212{ 218{
213} 219}
214 220
215#ifdef CONFIG_DEBUG_LOCK_ALLOC 221#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
216extern int rcu_scheduler_active __read_mostly; 222extern int rcu_scheduler_active __read_mostly;
217void rcu_scheduler_starting(void); 223void rcu_scheduler_starting(void);
218#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 224#else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
219static inline void rcu_scheduler_starting(void) 225static inline void rcu_scheduler_starting(void)
220{ 226{
221} 227}
222#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 228#endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
223 229
224#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) 230#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
225 231
@@ -237,6 +243,10 @@ static inline bool rcu_is_watching(void)
237 243
238#endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ 244#endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
239 245
246static inline void rcu_request_urgent_qs_task(struct task_struct *t)
247{
248}
249
240static inline void rcu_all_qs(void) 250static inline void rcu_all_qs(void)
241{ 251{
242 barrier(); /* Avoid RCU read-side critical sections leaking across. */ 252 barrier(); /* Avoid RCU read-side critical sections leaking across. */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 63a4e4cf40a5..0bacb6b2af69 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,7 +30,7 @@
30#ifndef __LINUX_RCUTREE_H 30#ifndef __LINUX_RCUTREE_H
31#define __LINUX_RCUTREE_H 31#define __LINUX_RCUTREE_H
32 32
33void rcu_note_context_switch(void); 33void rcu_note_context_switch(bool preempt);
34int rcu_needs_cpu(u64 basem, u64 *nextevt); 34int rcu_needs_cpu(u64 basem, u64 *nextevt);
35void rcu_cpu_stall_reset(void); 35void rcu_cpu_stall_reset(void);
36 36
@@ -41,7 +41,7 @@ void rcu_cpu_stall_reset(void);
41 */ 41 */
42static inline void rcu_virt_note_context_switch(int cpu) 42static inline void rcu_virt_note_context_switch(int cpu)
43{ 43{
44 rcu_note_context_switch(); 44 rcu_note_context_switch(false);
45} 45}
46 46
47void synchronize_rcu_bh(void); 47void synchronize_rcu_bh(void);
@@ -108,6 +108,7 @@ void rcu_scheduler_starting(void);
108extern int rcu_scheduler_active __read_mostly; 108extern int rcu_scheduler_active __read_mostly;
109 109
110bool rcu_is_watching(void); 110bool rcu_is_watching(void);
111void rcu_request_urgent_qs_task(struct task_struct *t);
111 112
112void rcu_all_qs(void); 113void rcu_all_qs(void);
113 114
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 3c37a8c51921..04a7f7993e67 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -28,7 +28,7 @@
28#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ 28#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */
29#define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ 29#define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */
30/* 30/*
31 * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS! 31 * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
32 * 32 *
33 * This delays freeing the SLAB page by a grace period, it does _NOT_ 33 * This delays freeing the SLAB page by a grace period, it does _NOT_
34 * delay object freeing. This means that if you do kmem_cache_free() 34 * delay object freeing. This means that if you do kmem_cache_free()
@@ -61,8 +61,10 @@
61 * 61 *
62 * rcu_read_lock before reading the address, then rcu_read_unlock after 62 * rcu_read_lock before reading the address, then rcu_read_unlock after
63 * taking the spinlock within the structure expected at that address. 63 * taking the spinlock within the structure expected at that address.
64 *
65 * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
64 */ 66 */
65#define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ 67#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */
66#define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ 68#define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */
67#define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ 69#define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */
68 70
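The slab.h comment above describes the canonical SLAB_TYPESAFE_BY_RCU lookup
discipline: rcu_read_lock() keeps the memory type-stable, then the object's
own lock is taken and its identity re-checked. A hedged sketch, with a
hypothetical cache, structure, and key field:

    struct obj {
            spinlock_t lock;        /* initialized once, in the ctor */
            int key;
    };
    static struct kmem_cache *obj_cache;

    static void obj_ctor(void *p)
    {
            spin_lock_init(&((struct obj *)p)->lock);
    }

    static int __init obj_cache_init(void)
    {
            obj_cache = kmem_cache_create("example_obj", sizeof(struct obj),
                                          0, SLAB_TYPESAFE_BY_RCU, obj_ctor);
            return obj_cache ? 0 : -ENOMEM;
    }

    /* Reader: the object may have been freed and reused, so revalidate. */
    static bool obj_still_matches(struct obj *p, int key)
    {
            bool ok;

            rcu_read_lock();                /* memory cannot change type */
            spin_lock(&p->lock);
            ok = (p->key == key);           /* identity check after locking */
            spin_unlock(&p->lock);
            rcu_read_unlock();
            return ok;
    }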
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index a598cf3ac70c..167ad8831aaf 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -22,7 +22,7 @@
22 * Lai Jiangshan <laijs@cn.fujitsu.com> 22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 * 23 *
24 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
26 * 26 *
27 */ 27 */
28 28
@@ -32,35 +32,9 @@
32#include <linux/mutex.h> 32#include <linux/mutex.h>
33#include <linux/rcupdate.h> 33#include <linux/rcupdate.h>
34#include <linux/workqueue.h> 34#include <linux/workqueue.h>
35#include <linux/rcu_segcblist.h>
35 36
36struct srcu_array { 37struct srcu_struct;
37 unsigned long lock_count[2];
38 unsigned long unlock_count[2];
39};
40
41struct rcu_batch {
42 struct rcu_head *head, **tail;
43};
44
45#define RCU_BATCH_INIT(name) { NULL, &(name.head) }
46
47struct srcu_struct {
48 unsigned long completed;
49 struct srcu_array __percpu *per_cpu_ref;
50 spinlock_t queue_lock; /* protect ->batch_queue, ->running */
51 bool running;
52 /* callbacks just queued */
53 struct rcu_batch batch_queue;
54 /* callbacks try to do the first check_zero */
55 struct rcu_batch batch_check0;
56 /* callbacks done with the first check_zero and the flip */
57 struct rcu_batch batch_check1;
58 struct rcu_batch batch_done;
59 struct delayed_work work;
60#ifdef CONFIG_DEBUG_LOCK_ALLOC
61 struct lockdep_map dep_map;
62#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
63};
64 38
65#ifdef CONFIG_DEBUG_LOCK_ALLOC 39#ifdef CONFIG_DEBUG_LOCK_ALLOC
66 40
@@ -82,46 +56,15 @@ int init_srcu_struct(struct srcu_struct *sp);
82#define __SRCU_DEP_MAP_INIT(srcu_name) 56#define __SRCU_DEP_MAP_INIT(srcu_name)
83#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 57#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
84 58
85void process_srcu(struct work_struct *work); 59#ifdef CONFIG_TINY_SRCU
86 60#include <linux/srcutiny.h>
87#define __SRCU_STRUCT_INIT(name) \ 61#elif defined(CONFIG_TREE_SRCU)
88 { \ 62#include <linux/srcutree.h>
89 .completed = -300, \ 63#elif defined(CONFIG_CLASSIC_SRCU)
90 .per_cpu_ref = &name##_srcu_array, \ 64#include <linux/srcuclassic.h>
91 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ 65#else
92 .running = false, \ 66#error "Unknown SRCU implementation specified to kernel configuration"
93 .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ 67#endif
94 .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \
95 .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \
96 .batch_done = RCU_BATCH_INIT(name.batch_done), \
97 .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\
98 __SRCU_DEP_MAP_INIT(name) \
99 }
100
101/*
102 * Define and initialize a srcu struct at build time.
103 * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
104 *
105 * Note that although DEFINE_STATIC_SRCU() hides the name from other
106 * files, the per-CPU variable rules nevertheless require that the
107 * chosen name be globally unique. These rules also prohibit use of
108 * DEFINE_STATIC_SRCU() within a function. If these rules are too
109 * restrictive, declare the srcu_struct manually. For example, in
110 * each file:
111 *
112 * static struct srcu_struct my_srcu;
113 *
114 * Then, before the first use of each my_srcu, manually initialize it:
115 *
116 * init_srcu_struct(&my_srcu);
117 *
118 * See include/linux/percpu-defs.h for the rules on per-CPU variables.
119 */
120#define __DEFINE_SRCU(name, is_static) \
121 static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\
122 is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
123#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
124#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
125 68
126/** 69/**
127 * call_srcu() - Queue a callback for invocation after an SRCU grace period 70 * call_srcu() - Queue a callback for invocation after an SRCU grace period
@@ -147,9 +90,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp);
147int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); 90int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
148void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); 91void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
149void synchronize_srcu(struct srcu_struct *sp); 92void synchronize_srcu(struct srcu_struct *sp);
150void synchronize_srcu_expedited(struct srcu_struct *sp);
151unsigned long srcu_batches_completed(struct srcu_struct *sp);
152void srcu_barrier(struct srcu_struct *sp);
153 93
154#ifdef CONFIG_DEBUG_LOCK_ALLOC 94#ifdef CONFIG_DEBUG_LOCK_ALLOC
155 95
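Whichever of the three implementations selected below ends up built, the
common reader/updater API declared in this header is used the same way. A
minimal sketch, with hypothetical names:

    DEFINE_STATIC_SRCU(example_srcu);

    void example_reader(void)
    {
            int idx;

            idx = srcu_read_lock(&example_srcu);
            /* Read-side critical section; sleeping is permitted here. */
            srcu_read_unlock(&example_srcu, idx);
    }

    void example_updater(void)
    {
            /* ... unlink the old data ... */
            synchronize_srcu(&example_srcu);  /* wait for pre-existing readers */
            /* ... now safe to free the old data ... */
    }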
diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h
new file mode 100644
index 000000000000..5753f7322262
--- /dev/null
+++ b/include/linux/srcuclassic.h
@@ -0,0 +1,115 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion,
3 * classic v4.11 variant.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, you can access it online at
17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 *
19 * Copyright (C) IBM Corporation, 2017
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 */
23
24#ifndef _LINUX_SRCU_CLASSIC_H
25#define _LINUX_SRCU_CLASSIC_H
26
27struct srcu_array {
28 unsigned long lock_count[2];
29 unsigned long unlock_count[2];
30};
31
32struct rcu_batch {
33 struct rcu_head *head, **tail;
34};
35
36#define RCU_BATCH_INIT(name) { NULL, &(name.head) }
37
38struct srcu_struct {
39 unsigned long completed;
40 struct srcu_array __percpu *per_cpu_ref;
41 spinlock_t queue_lock; /* protect ->batch_queue, ->running */
42 bool running;
43 /* callbacks just queued */
44 struct rcu_batch batch_queue;
45 /* callbacks try to do the first check_zero */
46 struct rcu_batch batch_check0;
47 /* callbacks done with the first check_zero and the flip */
48 struct rcu_batch batch_check1;
49 struct rcu_batch batch_done;
50 struct delayed_work work;
51#ifdef CONFIG_DEBUG_LOCK_ALLOC
52 struct lockdep_map dep_map;
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54};
55
56void process_srcu(struct work_struct *work);
57
58#define __SRCU_STRUCT_INIT(name) \
59 { \
60 .completed = -300, \
61 .per_cpu_ref = &name##_srcu_array, \
62 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
63 .running = false, \
64 .batch_queue = RCU_BATCH_INIT(name.batch_queue), \
65 .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \
66 .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \
67 .batch_done = RCU_BATCH_INIT(name.batch_done), \
68 .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\
69 __SRCU_DEP_MAP_INIT(name) \
70 }
71
72/*
73 * Define and initialize a srcu struct at build time.
74 * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
75 *
76 * Note that although DEFINE_STATIC_SRCU() hides the name from other
77 * files, the per-CPU variable rules nevertheless require that the
78 * chosen name be globally unique. These rules also prohibit use of
79 * DEFINE_STATIC_SRCU() within a function. If these rules are too
80 * restrictive, declare the srcu_struct manually. For example, in
81 * each file:
82 *
83 * static struct srcu_struct my_srcu;
84 *
85 * Then, before the first use of each my_srcu, manually initialize it:
86 *
87 * init_srcu_struct(&my_srcu);
88 *
89 * See include/linux/percpu-defs.h for the rules on per-CPU variables.
90 */
91#define __DEFINE_SRCU(name, is_static) \
92 static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\
93 is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
94#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
95#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
96
97void synchronize_srcu_expedited(struct srcu_struct *sp);
98void srcu_barrier(struct srcu_struct *sp);
99unsigned long srcu_batches_completed(struct srcu_struct *sp);
100
101static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
102 struct srcu_struct *sp, int *flags,
103 unsigned long *gpnum,
104 unsigned long *completed)
105{
106 if (test_type != SRCU_FLAVOR)
107 return;
108 *flags = 0;
109 *completed = sp->completed;
110 *gpnum = *completed;
111	if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check1.head)
112 (*gpnum)++;
113}
114
115#endif
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
new file mode 100644
index 000000000000..42311ee0334f
--- /dev/null
+++ b/include/linux/srcutiny.h
@@ -0,0 +1,93 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion,
3 * tiny variant.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, you can access it online at
17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 *
19 * Copyright (C) IBM Corporation, 2017
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 */
23
24#ifndef _LINUX_SRCU_TINY_H
25#define _LINUX_SRCU_TINY_H
26
27#include <linux/swait.h>
28
29struct srcu_struct {
30 int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */
31 struct swait_queue_head srcu_wq;
32 /* Last srcu_read_unlock() wakes GP. */
33 unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */
34 struct rcu_segcblist srcu_cblist;
35 /* Pending SRCU callbacks. */
36 int srcu_idx; /* Current reader array element. */
37 bool srcu_gp_running; /* GP workqueue running? */
38 bool srcu_gp_waiting; /* GP waiting for readers? */
39 struct work_struct srcu_work; /* For driving grace periods. */
40#ifdef CONFIG_DEBUG_LOCK_ALLOC
41 struct lockdep_map dep_map;
42#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
43};
44
45void srcu_drive_gp(struct work_struct *wp);
46
47#define __SRCU_STRUCT_INIT(name) \
48{ \
49 .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \
50 .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \
51 .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \
52 __SRCU_DEP_MAP_INIT(name) \
53}
54
55/*
56 * This odd _STATIC_ arrangement is needed for API compatibility with
57 * Tree SRCU, which needs some per-CPU data.
58 */
59#define DEFINE_SRCU(name) \
60 struct srcu_struct name = __SRCU_STRUCT_INIT(name)
61#define DEFINE_STATIC_SRCU(name) \
62 static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
63
64void synchronize_srcu(struct srcu_struct *sp);
65
66static inline void synchronize_srcu_expedited(struct srcu_struct *sp)
67{
68 synchronize_srcu(sp);
69}
70
71static inline void srcu_barrier(struct srcu_struct *sp)
72{
73 synchronize_srcu(sp);
74}
75
76static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
77{
78 return 0;
79}
80
81static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
82 struct srcu_struct *sp, int *flags,
83 unsigned long *gpnum,
84 unsigned long *completed)
85{
86 if (test_type != SRCU_FLAVOR)
87 return;
88 *flags = 0;
89 *completed = sp->srcu_gp_seq;
90 *gpnum = *completed;
91}
92
93#endif
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
new file mode 100644
index 000000000000..32e86d85fd11
--- /dev/null
+++ b/include/linux/srcutree.h
@@ -0,0 +1,150 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion,
3 * tree variant.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, you can access it online at
17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 *
19 * Copyright (C) IBM Corporation, 2017
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 */
23
24#ifndef _LINUX_SRCU_TREE_H
25#define _LINUX_SRCU_TREE_H
26
27#include <linux/rcu_node_tree.h>
28#include <linux/completion.h>
29
30struct srcu_node;
31struct srcu_struct;
32
33/*
34 * Per-CPU structure feeding into leaf srcu_node, similar in function
35 * to rcu_node.
36 */
37struct srcu_data {
38 /* Read-side state. */
39 unsigned long srcu_lock_count[2]; /* Locks per CPU. */
40 unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */
41
42 /* Update-side state. */
43 spinlock_t lock ____cacheline_internodealigned_in_smp;
44 struct rcu_segcblist srcu_cblist; /* List of callbacks.*/
45 unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */
46 unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */
47 bool srcu_cblist_invoking; /* Invoking these CBs? */
48 struct delayed_work work; /* Context for CB invoking. */
49 struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */
50 struct srcu_node *mynode; /* Leaf srcu_node. */
51 unsigned long grpmask; /* Mask for leaf srcu_node */
52 /* ->srcu_data_have_cbs[]. */
53 int cpu;
54 struct srcu_struct *sp;
55};
56
57/*
58 * Node in SRCU combining tree, similar in function to rcu_data.
59 */
60struct srcu_node {
61 spinlock_t lock;
62 unsigned long srcu_have_cbs[4]; /* GP seq for children */
63 /* having CBs, but only */
64 * is > ->srcu_gp_seq. */
65 unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs */
66 /* have CBs for given GP? */
67 unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */
68 struct srcu_node *srcu_parent; /* Next up in tree. */
69 int grplo; /* Least CPU for node. */
70 int grphi; /* Biggest CPU for node. */
71};
72
73/*
74 * Per-SRCU-domain structure, similar in function to rcu_state.
75 */
76struct srcu_struct {
77 struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */
78 struct srcu_node *level[RCU_NUM_LVLS + 1];
79 /* First node at each level. */
80 struct mutex srcu_cb_mutex; /* Serialize CB preparation. */
81 spinlock_t gp_lock; /* protect ->srcu_cblist */
82 struct mutex srcu_gp_mutex; /* Serialize GP work. */
83 unsigned int srcu_idx; /* Current rdr array element. */
84 unsigned long srcu_gp_seq; /* Grace-period seq #. */
85 unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */
86 unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */
87 unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */
88 struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */
89 unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */
90 struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */
91 struct completion srcu_barrier_completion;
92 /* Awaken barrier rq at end. */
93 atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */
94 /* callback for the barrier */
95 /* operation. */
96 struct delayed_work work;
97#ifdef CONFIG_DEBUG_LOCK_ALLOC
98 struct lockdep_map dep_map;
99#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
100};
101
102/* Values for state variable (bottom bits of ->srcu_gp_seq). */
103#define SRCU_STATE_IDLE 0
104#define SRCU_STATE_SCAN1 1
105#define SRCU_STATE_SCAN2 2
106
107void process_srcu(struct work_struct *work);
108
109#define __SRCU_STRUCT_INIT(name) \
110 { \
111 .sda = &name##_srcu_data, \
112 .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \
113 .srcu_gp_seq_needed = 0 - 1, \
114 __SRCU_DEP_MAP_INIT(name) \
115 }
116
117/*
118 * Define and initialize a srcu struct at build time.
119 * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
120 *
121 * Note that although DEFINE_STATIC_SRCU() hides the name from other
122 * files, the per-CPU variable rules nevertheless require that the
123 * chosen name be globally unique. These rules also prohibit use of
124 * DEFINE_STATIC_SRCU() within a function. If these rules are too
125 * restrictive, declare the srcu_struct manually. For example, in
126 * each file:
127 *
128 * static struct srcu_struct my_srcu;
129 *
130 * Then, before the first use of each my_srcu, manually initialize it:
131 *
132 * init_srcu_struct(&my_srcu);
133 *
134 * See include/linux/percpu-defs.h for the rules on per-CPU variables.
135 */
136#define __DEFINE_SRCU(name, is_static) \
137 static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\
138 is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
139#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
140#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
141
142void synchronize_srcu_expedited(struct srcu_struct *sp);
143void srcu_barrier(struct srcu_struct *sp);
144unsigned long srcu_batches_completed(struct srcu_struct *sp);
145
146void srcutorture_get_gp_data(enum rcutorture_type test_type,
147 struct srcu_struct *sp, int *flags,
148 unsigned long *gpnum, unsigned long *completed);
149
150#endif
diff --git a/include/linux/types.h b/include/linux/types.h
index 1e7bd24848fc..258099a4ed82 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -209,7 +209,7 @@ struct ustat {
209 * naturally due ABI requirements, but some architectures (like CRIS) have 209 * naturally due ABI requirements, but some architectures (like CRIS) have
210 * weird ABI and we need to ask it explicitly. 210 * weird ABI and we need to ask it explicitly.
211 * 211 *
212 * The alignment is required to guarantee that bits 0 and 1 of @next will be 212 * The alignment is required to guarantee that bit 0 of @next will be
213 * clear under normal conditions -- as long as we use call_rcu(), 213 * clear under normal conditions -- as long as we use call_rcu(),
214 * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. 214 * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback.
215 * 215 *
diff --git a/include/net/sock.h b/include/net/sock.h
index 66349e49d468..f33e3d134e0b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -995,7 +995,7 @@ struct smc_hashinfo;
995struct module; 995struct module;
996 996
997/* 997/*
998 * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes 998 * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes
999 * un-modified. Special care is taken when initializing object to zero. 999 * un-modified. Special care is taken when initializing object to zero.
1000 */ 1000 */
1001static inline void sk_prot_clear_nulls(struct sock *sk, int size) 1001static inline void sk_prot_clear_nulls(struct sock *sk, int size)
diff --git a/init/Kconfig b/init/Kconfig
index a92f27da4a27..1d3475fc9496 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -521,11 +521,41 @@ config RCU_EXPERT
521 521
522config SRCU 522config SRCU
523 bool 523 bool
524 default y
524 help 525 help
525 This option selects the sleepable version of RCU. This version 526 This option selects the sleepable version of RCU. This version
526 permits arbitrary sleeping or blocking within RCU read-side critical 527 permits arbitrary sleeping or blocking within RCU read-side critical
527 sections. 528 sections.
528 529
530config CLASSIC_SRCU
531 bool "Use v4.11 classic SRCU implementation"
532 default n
533 depends on RCU_EXPERT && SRCU
534 help
535 This option selects the traditional well-tested classic SRCU
536 implementation from v4.11, as might be desired for enterprise
537 Linux distributions. Without this option, the shiny new
538 Tiny SRCU and Tree SRCU implementations are used instead.
539 At some point, it is hoped that Tiny SRCU and Tree SRCU
540 will accumulate enough test time and confidence to allow
541 Classic SRCU to be dropped entirely.
542
543 Say Y if you need a rock-solid SRCU.
544
545 Say N if you would like to help test Tree SRCU.
546
547config TINY_SRCU
548 bool
549 default y if SRCU && TINY_RCU && !CLASSIC_SRCU
550 help
551 This option selects the single-CPU non-preemptible version of SRCU.
552
553config TREE_SRCU
554 bool
555 default y if SRCU && !TINY_RCU && !CLASSIC_SRCU
556 help
557 This option selects the full-fledged version of SRCU.
558
529config TASKS_RCU 559config TASKS_RCU
530 bool 560 bool
531 default n 561 default n
@@ -543,6 +573,9 @@ config RCU_STALL_COMMON
543 the tiny variants to disable RCU CPU stall warnings, while 573 the tiny variants to disable RCU CPU stall warnings, while
544 making these warnings mandatory for the tree variants. 574 making these warnings mandatory for the tree variants.
545 575
576config RCU_NEED_SEGCBLIST
577 def_bool ( TREE_RCU || PREEMPT_RCU || TINY_SRCU || TREE_SRCU )
578
546config CONTEXT_TRACKING 579config CONTEXT_TRACKING
547 bool 580 bool
548 581
@@ -612,11 +645,17 @@ config RCU_FANOUT_LEAF
612 initialization. These systems tend to run CPU-bound, and thus 645 initialization. These systems tend to run CPU-bound, and thus
613 are not helped by synchronized interrupts, and thus tend to 646 are not helped by synchronized interrupts, and thus tend to
614 skew them, which reduces lock contention enough that large 647 skew them, which reduces lock contention enough that large
615 leaf-level fanouts work well. 648 leaf-level fanouts work well. That said, setting leaf-level
649 fanout to a large number will likely cause problematic
650 lock contention on the leaf-level rcu_node structures unless
651 you boot with the skew_tick kernel parameter.
616 652
617 Select a specific number if testing RCU itself. 653 Select a specific number if testing RCU itself.
618 654
619 Select the maximum permissible value for large systems. 655 Select the maximum permissible value for large systems, but
656 please understand that you may also need to set the skew_tick
657 kernel boot parameter to avoid contention on the rcu_node
658 structure's locks.
620 659
621 Take the default if unsure. 660 Take the default if unsure.
622 661
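One point worth making explicit about the Kconfig split above: CLASSIC_SRCU, TINY_SRCU, and TREE_SRCU all sit behind the same SRCU API, so callers need no source changes when the backend changes. Below is a minimal sketch of such a caller; struct my_obj, my_obj_ptr, and my_srcu are hypothetical names, and only the srcu_*()/rcu_*() calls are the real API.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct my_obj {                          /* Hypothetical protected object. */
        int val;
};

static struct my_obj __rcu *my_obj_ptr;
DEFINE_STATIC_SRCU(my_srcu);

static int my_read(void)
{
        struct my_obj *p;
        int idx, val = -1;

        idx = srcu_read_lock(&my_srcu);          /* Readers may sleep here. */
        p = srcu_dereference(my_obj_ptr, &my_srcu);
        if (p)
                val = p->val;
        srcu_read_unlock(&my_srcu, idx);
        return val;
}

static void my_update(struct my_obj *newp)       /* Updaters assumed serialized. */
{
        struct my_obj *oldp;

        oldp = rcu_dereference_protected(my_obj_ptr, 1);
        rcu_assign_pointer(my_obj_ptr, newp);
        synchronize_srcu(&my_srcu);              /* Wait for pre-existing readers. */
        kfree(oldp);
}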
diff --git a/kernel/fork.c b/kernel/fork.c
index 08ba696aa561..bfd91b180778 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1337,7 +1337,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
1337 if (atomic_dec_and_test(&sighand->count)) { 1337 if (atomic_dec_and_test(&sighand->count)) {
1338 signalfd_cleanup(sighand); 1338 signalfd_cleanup(sighand);
1339 /* 1339 /*
1340 * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it 1340 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
1341 * without an RCU grace period, see __lock_task_sighand(). 1341 * without an RCU grace period, see __lock_task_sighand().
1342 */ 1342 */
1343 kmem_cache_free(sighand_cachep, sighand); 1343 kmem_cache_free(sighand_cachep, sighand);
@@ -2176,7 +2176,7 @@ void __init proc_caches_init(void)
2176{ 2176{
2177 sighand_cachep = kmem_cache_create("sighand_cache", 2177 sighand_cachep = kmem_cache_create("sighand_cache",
2178 sizeof(struct sighand_struct), 0, 2178 sizeof(struct sighand_struct), 0,
2179 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| 2179 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
2180 SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); 2180 SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
2181 signal_cachep = kmem_cache_create("signal_cache", 2181 signal_cachep = kmem_cache_create("signal_cache",
2182 sizeof(struct signal_struct), 0, 2182 sizeof(struct signal_struct), 0,
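The SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU rename in the hunks above is purely cosmetic, but the new name states the actual guarantee: objects from such a cache may be freed and reused without waiting for a grace period, yet the memory keeps its type while a reader is inside an RCU read-side critical section. That is why lookups must revalidate the object after locking it. A hedged sketch of that pattern follows; my_node and my_hash_find() are hypothetical, and the freeing side is assumed to clear ->in_use under ->lock before kmem_cache_free().

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct my_node {                /* Hypothetical SLAB_TYPESAFE_BY_RCU object. */
        spinlock_t lock;
        int key;
        bool in_use;
};

static struct my_node *my_lookup(int key)
{
        struct my_node *p;

        rcu_read_lock();
        p = my_hash_find(key);  /* Hypothetical lockless hash search. */
        if (p) {
                spin_lock(&p->lock);
                /* Memory may have been recycled for another object: recheck. */
                if (!p->in_use || p->key != key) {
                        spin_unlock(&p->lock);
                        p = NULL;
                }
        }
        rcu_read_unlock();
        return p;               /* NULL, or the object locked and validated. */
}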
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 0a1b3c748478..c0e31bfee25c 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1158,10 +1158,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1158 return 0; 1158 return 0;
1159 1159
1160 printk("\n"); 1160 printk("\n");
1161 printk("======================================================\n"); 1161 pr_warn("======================================================\n");
1162 printk("[ INFO: possible circular locking dependency detected ]\n"); 1162 pr_warn("WARNING: possible circular locking dependency detected\n");
1163 print_kernel_ident(); 1163 print_kernel_ident();
1164 printk("-------------------------------------------------------\n"); 1164 pr_warn("------------------------------------------------------\n");
1165 printk("%s/%d is trying to acquire lock:\n", 1165 printk("%s/%d is trying to acquire lock:\n",
1166 curr->comm, task_pid_nr(curr)); 1166 curr->comm, task_pid_nr(curr));
1167 print_lock(check_src); 1167 print_lock(check_src);
@@ -1496,11 +1496,11 @@ print_bad_irq_dependency(struct task_struct *curr,
1496 return 0; 1496 return 0;
1497 1497
1498 printk("\n"); 1498 printk("\n");
1499 printk("======================================================\n"); 1499 pr_warn("=====================================================\n");
1500 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1500 pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
1501 irqclass, irqclass); 1501 irqclass, irqclass);
1502 print_kernel_ident(); 1502 print_kernel_ident();
1503 printk("------------------------------------------------------\n"); 1503 pr_warn("-----------------------------------------------------\n");
1504 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1504 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1505 curr->comm, task_pid_nr(curr), 1505 curr->comm, task_pid_nr(curr),
1506 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1506 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1725,10 +1725,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1725 return 0; 1725 return 0;
1726 1726
1727 printk("\n"); 1727 printk("\n");
1728 printk("=============================================\n"); 1728 pr_warn("============================================\n");
1729 printk("[ INFO: possible recursive locking detected ]\n"); 1729 pr_warn("WARNING: possible recursive locking detected\n");
1730 print_kernel_ident(); 1730 print_kernel_ident();
1731 printk("---------------------------------------------\n"); 1731 pr_warn("--------------------------------------------\n");
1732 printk("%s/%d is trying to acquire lock:\n", 1732 printk("%s/%d is trying to acquire lock:\n",
1733 curr->comm, task_pid_nr(curr)); 1733 curr->comm, task_pid_nr(curr));
1734 print_lock(next); 1734 print_lock(next);
@@ -2075,10 +2075,10 @@ static void print_collision(struct task_struct *curr,
2075 struct lock_chain *chain) 2075 struct lock_chain *chain)
2076{ 2076{
2077 printk("\n"); 2077 printk("\n");
2078 printk("======================\n"); 2078 pr_warn("============================\n");
2079 printk("[chain_key collision ]\n"); 2079 pr_warn("WARNING: chain_key collision\n");
2080 print_kernel_ident(); 2080 print_kernel_ident();
2081 printk("----------------------\n"); 2081 pr_warn("----------------------------\n");
2082 printk("%s/%d: ", current->comm, task_pid_nr(current)); 2082 printk("%s/%d: ", current->comm, task_pid_nr(current));
2083 printk("Hash chain already cached but the contents don't match!\n"); 2083 printk("Hash chain already cached but the contents don't match!\n");
2084 2084
@@ -2374,10 +2374,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2374 return 0; 2374 return 0;
2375 2375
2376 printk("\n"); 2376 printk("\n");
2377 printk("=================================\n"); 2377 pr_warn("================================\n");
2378 printk("[ INFO: inconsistent lock state ]\n"); 2378 pr_warn("WARNING: inconsistent lock state\n");
2379 print_kernel_ident(); 2379 print_kernel_ident();
2380 printk("---------------------------------\n"); 2380 pr_warn("--------------------------------\n");
2381 2381
2382 printk("inconsistent {%s} -> {%s} usage.\n", 2382 printk("inconsistent {%s} -> {%s} usage.\n",
2383 usage_str[prev_bit], usage_str[new_bit]); 2383 usage_str[prev_bit], usage_str[new_bit]);
@@ -2439,10 +2439,10 @@ print_irq_inversion_bug(struct task_struct *curr,
2439 return 0; 2439 return 0;
2440 2440
2441 printk("\n"); 2441 printk("\n");
2442 printk("=========================================================\n"); 2442 pr_warn("========================================================\n");
2443 printk("[ INFO: possible irq lock inversion dependency detected ]\n"); 2443 pr_warn("WARNING: possible irq lock inversion dependency detected\n");
2444 print_kernel_ident(); 2444 print_kernel_ident();
2445 printk("---------------------------------------------------------\n"); 2445 pr_warn("--------------------------------------------------------\n");
2446 printk("%s/%d just changed the state of lock:\n", 2446 printk("%s/%d just changed the state of lock:\n",
2447 curr->comm, task_pid_nr(curr)); 2447 curr->comm, task_pid_nr(curr));
2448 print_lock(this); 2448 print_lock(this);
@@ -3190,10 +3190,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
3190 return 0; 3190 return 0;
3191 3191
3192 printk("\n"); 3192 printk("\n");
3193 printk("==================================\n"); 3193 pr_warn("==================================\n");
3194 printk("[ BUG: Nested lock was not taken ]\n"); 3194 pr_warn("WARNING: Nested lock was not taken\n");
3195 print_kernel_ident(); 3195 print_kernel_ident();
3196 printk("----------------------------------\n"); 3196 pr_warn("----------------------------------\n");
3197 3197
3198 printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); 3198 printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
3199 print_lock(hlock); 3199 print_lock(hlock);
@@ -3403,10 +3403,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3403 return 0; 3403 return 0;
3404 3404
3405 printk("\n"); 3405 printk("\n");
3406 printk("=====================================\n"); 3406 pr_warn("=====================================\n");
3407 printk("[ BUG: bad unlock balance detected! ]\n"); 3407 pr_warn("WARNING: bad unlock balance detected!\n");
3408 print_kernel_ident(); 3408 print_kernel_ident();
3409 printk("-------------------------------------\n"); 3409 pr_warn("-------------------------------------\n");
3410 printk("%s/%d is trying to release lock (", 3410 printk("%s/%d is trying to release lock (",
3411 curr->comm, task_pid_nr(curr)); 3411 curr->comm, task_pid_nr(curr));
3412 print_lockdep_cache(lock); 3412 print_lockdep_cache(lock);
@@ -3975,10 +3975,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3975 return 0; 3975 return 0;
3976 3976
3977 printk("\n"); 3977 printk("\n");
3978 printk("=================================\n"); 3978 pr_warn("=================================\n");
3979 printk("[ BUG: bad contention detected! ]\n"); 3979 pr_warn("WARNING: bad contention detected!\n");
3980 print_kernel_ident(); 3980 print_kernel_ident();
3981 printk("---------------------------------\n"); 3981 pr_warn("---------------------------------\n");
3982 printk("%s/%d is trying to contend lock (", 3982 printk("%s/%d is trying to contend lock (",
3983 curr->comm, task_pid_nr(curr)); 3983 curr->comm, task_pid_nr(curr));
3984 print_lockdep_cache(lock); 3984 print_lockdep_cache(lock);
@@ -4319,10 +4319,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
4319 return; 4319 return;
4320 4320
4321 printk("\n"); 4321 printk("\n");
4322 printk("=========================\n"); 4322 pr_warn("=========================\n");
4323 printk("[ BUG: held lock freed! ]\n"); 4323 pr_warn("WARNING: held lock freed!\n");
4324 print_kernel_ident(); 4324 print_kernel_ident();
4325 printk("-------------------------\n"); 4325 pr_warn("-------------------------\n");
4326 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 4326 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
4327 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 4327 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
4328 print_lock(hlock); 4328 print_lock(hlock);
@@ -4377,11 +4377,11 @@ static void print_held_locks_bug(void)
4377 return; 4377 return;
4378 4378
4379 printk("\n"); 4379 printk("\n");
4380 printk("=====================================\n"); 4380 pr_warn("====================================\n");
4381 printk("[ BUG: %s/%d still has locks held! ]\n", 4381 pr_warn("WARNING: %s/%d still has locks held!\n",
4382 current->comm, task_pid_nr(current)); 4382 current->comm, task_pid_nr(current));
4383 print_kernel_ident(); 4383 print_kernel_ident();
4384 printk("-------------------------------------\n"); 4384 pr_warn("------------------------------------\n");
4385 lockdep_print_held_locks(current); 4385 lockdep_print_held_locks(current);
4386 printk("\nstack backtrace:\n"); 4386 printk("\nstack backtrace:\n");
4387 dump_stack(); 4387 dump_stack();
@@ -4446,7 +4446,7 @@ retry:
4446 } while_each_thread(g, p); 4446 } while_each_thread(g, p);
4447 4447
4448 printk("\n"); 4448 printk("\n");
4449 printk("=============================================\n\n"); 4449 pr_warn("=============================================\n\n");
4450 4450
4451 if (unlock) 4451 if (unlock)
4452 read_unlock(&tasklist_lock); 4452 read_unlock(&tasklist_lock);
@@ -4476,10 +4476,10 @@ asmlinkage __visible void lockdep_sys_exit(void)
4476 if (!debug_locks_off()) 4476 if (!debug_locks_off())
4477 return; 4477 return;
4478 printk("\n"); 4478 printk("\n");
4479 printk("================================================\n"); 4479 pr_warn("================================================\n");
4480 printk("[ BUG: lock held when returning to user space! ]\n"); 4480 pr_warn("WARNING: lock held when returning to user space!\n");
4481 print_kernel_ident(); 4481 print_kernel_ident();
4482 printk("------------------------------------------------\n"); 4482 pr_warn("------------------------------------------------\n");
4483 printk("%s/%d is leaving the kernel with locks still held!\n", 4483 printk("%s/%d is leaving the kernel with locks still held!\n",
4484 curr->comm, curr->pid); 4484 curr->comm, curr->pid);
4485 lockdep_print_held_locks(curr); 4485 lockdep_print_held_locks(curr);
@@ -4496,13 +4496,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4496#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ 4496#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
4497 /* Note: the following can be executed concurrently, so be careful. */ 4497 /* Note: the following can be executed concurrently, so be careful. */
4498 printk("\n"); 4498 printk("\n");
4499 pr_err("===============================\n"); 4499 pr_warn("=============================\n");
4500 pr_err("[ ERR: suspicious RCU usage. ]\n"); 4500 pr_warn("WARNING: suspicious RCU usage\n");
4501 print_kernel_ident(); 4501 print_kernel_ident();
4502 pr_err("-------------------------------\n"); 4502 pr_warn("-----------------------------\n");
4503 pr_err("%s:%d %s!\n", file, line, s); 4503 printk("%s:%d %s!\n", file, line, s);
4504 pr_err("\nother info that might help us debug this:\n\n"); 4504 printk("\nother info that might help us debug this:\n\n");
4505 pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4505 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4506 !rcu_lockdep_current_cpu_online() 4506 !rcu_lockdep_current_cpu_online()
4507 ? "RCU used illegally from offline CPU!\n" 4507 ? "RCU used illegally from offline CPU!\n"
4508 : !rcu_is_watching() 4508 : !rcu_is_watching()
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 32fe775a2eaf..58e366ad36f4 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -102,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
102 return; 102 return;
103 } 103 }
104 104
105 printk("\n============================================\n"); 105 pr_warn("\n");
106 printk( "[ BUG: circular locking deadlock detected! ]\n"); 106 pr_warn("============================================\n");
107 printk("%s\n", print_tainted()); 107 pr_warn("WARNING: circular locking deadlock detected!\n");
108 printk( "--------------------------------------------\n"); 108 pr_warn("%s\n", print_tainted());
109 pr_warn("--------------------------------------------\n");
109 printk("%s/%d is deadlocking current task %s/%d\n\n", 110 printk("%s/%d is deadlocking current task %s/%d\n\n",
110 task->comm, task_pid_nr(task), 111 task->comm, task_pid_nr(task),
111 current->comm, task_pid_nr(current)); 112 current->comm, task_pid_nr(current));
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 18dfc485225c..23803c7d5180 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -3,10 +3,13 @@
3KCOV_INSTRUMENT := n 3KCOV_INSTRUMENT := n
4 4
5obj-y += update.o sync.o 5obj-y += update.o sync.o
6obj-$(CONFIG_SRCU) += srcu.o 6obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
7obj-$(CONFIG_TREE_SRCU) += srcutree.o
8obj-$(CONFIG_TINY_SRCU) += srcutiny.o
7obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 9obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
8obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o 10obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
9obj-$(CONFIG_TREE_RCU) += tree.o 11obj-$(CONFIG_TREE_RCU) += tree.o
10obj-$(CONFIG_PREEMPT_RCU) += tree.o 12obj-$(CONFIG_PREEMPT_RCU) += tree.o
11obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o 13obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
12obj-$(CONFIG_TINY_RCU) += tiny.o 14obj-$(CONFIG_TINY_RCU) += tiny.o
15obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 0d6ff3e471be..73e16ec4054b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -56,6 +56,83 @@
56#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ 56#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
57 DYNTICK_TASK_FLAG) 57 DYNTICK_TASK_FLAG)
58 58
59
60/*
61 * Grace-period counter management.
62 */
63
64#define RCU_SEQ_CTR_SHIFT 2
65#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
66
67/*
68 * Return the counter portion of a sequence number previously returned
69 * by rcu_seq_snap() or rcu_seq_current().
70 */
71static inline unsigned long rcu_seq_ctr(unsigned long s)
72{
73 return s >> RCU_SEQ_CTR_SHIFT;
74}
75
76/*
77 * Return the state portion of a sequence number previously returned
78 * by rcu_seq_snap() or rcu_seq_current().
79 */
80static inline int rcu_seq_state(unsigned long s)
81{
82 return s & RCU_SEQ_STATE_MASK;
83}
84
85/*
86 * Set the state portion of the pointed-to sequence number.
87 * The caller is responsible for preventing conflicting updates.
88 */
89static inline void rcu_seq_set_state(unsigned long *sp, int newstate)
90{
91 WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK);
92 WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate);
93}
94
95/* Adjust sequence number for start of update-side operation. */
96static inline void rcu_seq_start(unsigned long *sp)
97{
98 WRITE_ONCE(*sp, *sp + 1);
99 smp_mb(); /* Ensure update-side operation after counter increment. */
100 WARN_ON_ONCE(rcu_seq_state(*sp) != 1);
101}
102
103/* Adjust sequence number for end of update-side operation. */
104static inline void rcu_seq_end(unsigned long *sp)
105{
106 smp_mb(); /* Ensure update-side operation before counter increment. */
107 WARN_ON_ONCE(!rcu_seq_state(*sp));
108 WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1);
109}
110
111/* Take a snapshot of the update side's sequence number. */
112static inline unsigned long rcu_seq_snap(unsigned long *sp)
113{
114 unsigned long s;
115
116 s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
117 smp_mb(); /* Above access must not bleed into critical section. */
118 return s;
119}
120
121/* Return the current value of the update side's sequence number, no ordering. */
122static inline unsigned long rcu_seq_current(unsigned long *sp)
123{
124 return READ_ONCE(*sp);
125}
126
127/*
128 * Given a snapshot from rcu_seq_snap(), determine whether or not a
129 * full update-side operation has occurred.
130 */
131static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
132{
133 return ULONG_CMP_GE(READ_ONCE(*sp), s);
134}
135
59/* 136/*
60 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 137 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
61 * by call_rcu() and rcu callback execution, and are therefore not part of the 138 * by call_rcu() and rcu callback execution, and are therefore not part of the
@@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
109 186
110 rcu_lock_acquire(&rcu_callback_map); 187 rcu_lock_acquire(&rcu_callback_map);
111 if (__is_kfree_rcu_offset(offset)) { 188 if (__is_kfree_rcu_offset(offset)) {
112 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 189 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);)
113 kfree((void *)head - offset); 190 kfree((void *)head - offset);
114 rcu_lock_release(&rcu_callback_map); 191 rcu_lock_release(&rcu_callback_map);
115 return true; 192 return true;
116 } else { 193 } else {
117 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 194 RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
118 head->func(head); 195 head->func(head);
119 rcu_lock_release(&rcu_callback_map); 196 rcu_lock_release(&rcu_callback_map);
120 return false; 197 return false;
@@ -144,4 +221,76 @@ void rcu_test_sync_prims(void);
144 */ 221 */
145extern void resched_cpu(int cpu); 222extern void resched_cpu(int cpu);
146 223
224#if defined(SRCU) || !defined(TINY_RCU)
225
226#include <linux/rcu_node_tree.h>
227
228extern int rcu_num_lvls;
229extern int num_rcu_lvl[];
230extern int rcu_num_nodes;
231static bool rcu_fanout_exact;
232static int rcu_fanout_leaf;
233
234/*
235 * Compute the per-level fanout, either using the exact fanout specified
236 * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
237 */
238static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
239{
240 int i;
241
242 if (rcu_fanout_exact) {
243 levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
244 for (i = rcu_num_lvls - 2; i >= 0; i--)
245 levelspread[i] = RCU_FANOUT;
246 } else {
247 int ccur;
248 int cprv;
249
250 cprv = nr_cpu_ids;
251 for (i = rcu_num_lvls - 1; i >= 0; i--) {
252 ccur = levelcnt[i];
253 levelspread[i] = (cprv + ccur - 1) / ccur;
254 cprv = ccur;
255 }
256 }
257}
258
259/*
260 * Do a full breadth-first scan of the rcu_node structures for the
261 * specified rcu_state structure.
262 */
263#define rcu_for_each_node_breadth_first(rsp, rnp) \
264 for ((rnp) = &(rsp)->node[0]; \
265 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
266
267/*
268 * Do a breadth-first scan of the non-leaf rcu_node structures for the
269 * specified rcu_state structure. Note that if there is a singleton
270 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
271 */
272#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
273 for ((rnp) = &(rsp)->node[0]; \
274 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
275
276/*
277 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
278 * structure. Note that if there is a singleton rcu_node tree with but
279 * one rcu_node structure, this loop -will- visit the rcu_node structure.
280 * It is still a leaf node, even if it is also the root node.
281 */
282#define rcu_for_each_leaf_node(rsp, rnp) \
283 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
284 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
285
286/*
287 * Iterate over all possible CPUs in a leaf RCU node.
288 */
289#define for_each_leaf_node_possible_cpu(rnp, cpu) \
290 for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
291 cpu <= rnp->grphi; \
292 cpu = cpumask_next((cpu), cpu_possible_mask))
293
294#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
295
147#endif /* __LINUX_RCU_H */ 296#endif /* __LINUX_RCU_H */
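The rcu_seq_*() helpers added to rcu.h above pack a grace-period count into the upper bits of an unsigned long and a phase into the low RCU_SEQ_CTR_SHIFT bits. A rough sketch of how the two sides fit together is below; my_gp_seq and the polling loop are illustrative only, and some other context is assumed to be driving my_do_one_gp().

static unsigned long my_gp_seq;         /* Hypothetical grace-period sequence. */

/* Grace-period side: bracket one full update-side operation. */
static void my_do_one_gp(void)
{
        rcu_seq_start(&my_gp_seq);      /* State bits now say "in progress". */
        /* ... wait until all pre-existing readers are done ... */
        rcu_seq_end(&my_gp_seq);        /* Counter portion advances by one. */
}

/* Caller side: wait until a full grace period has elapsed since the snapshot. */
static void my_wait_for_gp(void)
{
        unsigned long s = rcu_seq_snap(&my_gp_seq);

        while (!rcu_seq_done(&my_gp_seq, s))
                schedule_timeout_uninterruptible(1);
}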
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
new file mode 100644
index 000000000000..2b62a38b080f
--- /dev/null
+++ b/kernel/rcu/rcu_segcblist.c
@@ -0,0 +1,505 @@
1/*
2 * RCU segmented callback lists, function definitions
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright IBM Corporation, 2017
19 *
20 * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/interrupt.h>
26
27#include "rcu_segcblist.h"
28
29/* Initialize simple callback list. */
30void rcu_cblist_init(struct rcu_cblist *rclp)
31{
32 rclp->head = NULL;
33 rclp->tail = &rclp->head;
34 rclp->len = 0;
35 rclp->len_lazy = 0;
36}
37
38/*
39 * Debug function to actually count the number of callbacks.
40 * If the number exceeds the limit specified, return -1.
41 */
42long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
43{
44 int cnt = 0;
45 struct rcu_head **rhpp = &rclp->head;
46
47 for (;;) {
48 if (!*rhpp)
49 return cnt;
50 if (++cnt > lim)
51 return -1;
52 rhpp = &(*rhpp)->next;
53 }
54}
55
56/*
57 * Dequeue the oldest rcu_head structure from the specified callback
58 * list. This function assumes that the callback is non-lazy, but
59 * the caller can later invoke rcu_cblist_dequeued_lazy() if it
60 * finds otherwise (and if it cares about laziness). This allows
61 * different users to have different ways of determining laziness.
62 */
63struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
64{
65 struct rcu_head *rhp;
66
67 rhp = rclp->head;
68 if (!rhp)
69 return NULL;
70 rclp->len--;
71 rclp->head = rhp->next;
72 if (!rclp->head)
73 rclp->tail = &rclp->head;
74 return rhp;
75}
76
77/*
78 * Initialize an rcu_segcblist structure.
79 */
80void rcu_segcblist_init(struct rcu_segcblist *rsclp)
81{
82 int i;
83
84 BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
85 BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
86 rsclp->head = NULL;
87 for (i = 0; i < RCU_CBLIST_NSEGS; i++)
88 rsclp->tails[i] = &rsclp->head;
89 rsclp->len = 0;
90 rsclp->len_lazy = 0;
91}
92
93/*
94 * Disable the specified rcu_segcblist structure, so that callbacks can
95 * no longer be posted to it. This structure must be empty.
96 */
97void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
98{
99 WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
100 WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
101 WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
102 rsclp->tails[RCU_NEXT_TAIL] = NULL;
103}
104
105/*
106 * Is the specified segment of the specified rcu_segcblist structure
107 * empty of callbacks?
108 */
109bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
110{
111 if (seg == RCU_DONE_TAIL)
112 return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
113 return rsclp->tails[seg - 1] == rsclp->tails[seg];
114}
115
116/*
117 * Does the specified rcu_segcblist structure contain callbacks that
118 * are ready to be invoked?
119 */
120bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp)
121{
122 return rcu_segcblist_is_enabled(rsclp) &&
123 &rsclp->head != rsclp->tails[RCU_DONE_TAIL];
124}
125
126/*
127 * Does the specified rcu_segcblist structure contain callbacks that
128 * are still pending, that is, not yet ready to be invoked?
129 */
130bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
131{
132 return rcu_segcblist_is_enabled(rsclp) &&
133 !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL);
134}
135
136/*
137 * Dequeue and return the first ready-to-invoke callback. If there
138 * are no ready-to-invoke callbacks, return NULL. Disables interrupts
139 * to avoid interference. Does not protect from interference from other
140 * CPUs or tasks.
141 */
142struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
143{
144 unsigned long flags;
145 int i;
146 struct rcu_head *rhp;
147
148 local_irq_save(flags);
149 if (!rcu_segcblist_ready_cbs(rsclp)) {
150 local_irq_restore(flags);
151 return NULL;
152 }
153 rhp = rsclp->head;
154 BUG_ON(!rhp);
155 rsclp->head = rhp->next;
156 for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
157 if (rsclp->tails[i] != &rhp->next)
158 break;
159 rsclp->tails[i] = &rsclp->head;
160 }
161 smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
162 WRITE_ONCE(rsclp->len, rsclp->len - 1);
163 local_irq_restore(flags);
164 return rhp;
165}
166
167/*
168 * Account for the fact that a previously dequeued callback turned out
169 * to be marked as lazy.
170 */
171void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
172{
173 unsigned long flags;
174
175 local_irq_save(flags);
176 rsclp->len_lazy--;
177 local_irq_restore(flags);
178}
179
180/*
181 * Return a pointer to the first callback in the specified rcu_segcblist
182 * structure. This is useful for diagnostics.
183 */
184struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp)
185{
186 if (rcu_segcblist_is_enabled(rsclp))
187 return rsclp->head;
188 return NULL;
189}
190
191/*
192 * Return a pointer to the first pending callback in the specified
193 * rcu_segcblist structure. This is useful just after posting a given
194 * callback -- if that callback is the first pending callback, then
195 * you cannot rely on someone else having already started up the required
196 * grace period.
197 */
198struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
199{
200 if (rcu_segcblist_is_enabled(rsclp))
201 return *rsclp->tails[RCU_DONE_TAIL];
202 return NULL;
203}
204
205/*
206 * Does the specified rcu_segcblist structure contain callbacks that
207 * have not yet been processed beyond having been posted, that is,
208 * does it contain callbacks in its last segment?
209 */
210bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
211{
212 return rcu_segcblist_is_enabled(rsclp) &&
213 !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
214}
215
216/*
217 * Enqueue the specified callback onto the specified rcu_segcblist
218 * structure, updating accounting as needed. Note that the ->len
219 * field may be accessed locklessly, hence the WRITE_ONCE().
220 * The ->len field is used by rcu_barrier() and friends to determine
221 * if it must post a callback on this structure, and it is OK
222 * for rcu_barrier() to sometimes post callbacks needlessly, but
223 * absolutely not OK for it to ever miss posting a callback.
224 */
225void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
226 struct rcu_head *rhp, bool lazy)
227{
228 WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */
229 if (lazy)
230 rsclp->len_lazy++;
231 smp_mb(); /* Ensure counts are updated before callback is enqueued. */
232 rhp->next = NULL;
233 *rsclp->tails[RCU_NEXT_TAIL] = rhp;
234 rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
235}
236
237/*
238 * Entrain the specified callback onto the specified rcu_segcblist at
239 * the end of the last non-empty segment. If the entire rcu_segcblist
240 * is empty, make no change, but return false.
241 *
242 * This is intended for use by rcu_barrier()-like primitives, -not-
243 * for normal grace-period use. IMPORTANT: The callback you enqueue
244 * will wait for all prior callbacks, NOT necessarily for a grace
245 * period. You have been warned.
246 */
247bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
248 struct rcu_head *rhp, bool lazy)
249{
250 int i;
251
252 if (rcu_segcblist_n_cbs(rsclp) == 0)
253 return false;
254 WRITE_ONCE(rsclp->len, rsclp->len + 1);
255 if (lazy)
256 rsclp->len_lazy++;
257 smp_mb(); /* Ensure counts are updated before callback is entrained. */
258 rhp->next = NULL;
259 for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
260 if (rsclp->tails[i] != rsclp->tails[i - 1])
261 break;
262 *rsclp->tails[i] = rhp;
263 for (; i <= RCU_NEXT_TAIL; i++)
264 rsclp->tails[i] = &rhp->next;
265 return true;
266}
267
268/*
269 * Extract only the counts from the specified rcu_segcblist structure,
270 * and place them in the specified rcu_cblist structure. This function
271 * supports both callback orphaning and invocation, hence the separation
272 * of counts and callbacks. (Callbacks ready for invocation must be
273 * orphaned and adopted separately from pending callbacks, but counts
274 * apply to all callbacks. Locking must be used to make sure that
275 * both orphaned-callbacks lists are consistent.)
276 */
277void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
278 struct rcu_cblist *rclp)
279{
280 rclp->len_lazy += rsclp->len_lazy;
281 rclp->len += rsclp->len;
282 rsclp->len_lazy = 0;
283 WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */
284}
285
286/*
287 * Extract only those callbacks ready to be invoked from the specified
288 * rcu_segcblist structure and place them in the specified rcu_cblist
289 * structure.
290 */
291void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
292 struct rcu_cblist *rclp)
293{
294 int i;
295
296 if (!rcu_segcblist_ready_cbs(rsclp))
297 return; /* Nothing to do. */
298 *rclp->tail = rsclp->head;
299 rsclp->head = *rsclp->tails[RCU_DONE_TAIL];
300 *rsclp->tails[RCU_DONE_TAIL] = NULL;
301 rclp->tail = rsclp->tails[RCU_DONE_TAIL];
302 for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
303 if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
304 rsclp->tails[i] = &rsclp->head;
305}
306
307/*
308 * Extract only those callbacks still pending (not yet ready to be
309 * invoked) from the specified rcu_segcblist structure and place them in
310 * the specified rcu_cblist structure. Note that this loses information
311 * about any callbacks that might have been partway done waiting for
312 * their grace period. Too bad! They will have to start over.
313 */
314void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
315 struct rcu_cblist *rclp)
316{
317 int i;
318
319 if (!rcu_segcblist_pend_cbs(rsclp))
320 return; /* Nothing to do. */
321 *rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
322 rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
323 *rsclp->tails[RCU_DONE_TAIL] = NULL;
324 for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
325 rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL];
326}
327
328/*
329 * Insert counts from the specified rcu_cblist structure in the
330 * specified rcu_segcblist structure.
331 */
332void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
333 struct rcu_cblist *rclp)
334{
335 rsclp->len_lazy += rclp->len_lazy;
336 /* ->len sampled locklessly. */
337 WRITE_ONCE(rsclp->len, rsclp->len + rclp->len);
338 rclp->len_lazy = 0;
339 rclp->len = 0;
340}
341
342/*
343 * Move callbacks from the specified rcu_cblist to the beginning of the
344 * done-callbacks segment of the specified rcu_segcblist.
345 */
346void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
347 struct rcu_cblist *rclp)
348{
349 int i;
350
351 if (!rclp->head)
352 return; /* No callbacks to move. */
353 *rclp->tail = rsclp->head;
354 rsclp->head = rclp->head;
355 for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
356 if (&rsclp->head == rsclp->tails[i])
357 rsclp->tails[i] = rclp->tail;
358 else
359 break;
360 rclp->head = NULL;
361 rclp->tail = &rclp->head;
362}
363
364/*
365 * Move callbacks from the specified rcu_cblist to the end of the
366 * new-callbacks segment of the specified rcu_segcblist.
367 */
368void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
369 struct rcu_cblist *rclp)
370{
371 if (!rclp->head)
372 return; /* Nothing to do. */
373 *rsclp->tails[RCU_NEXT_TAIL] = rclp->head;
374 rsclp->tails[RCU_NEXT_TAIL] = rclp->tail;
375 rclp->head = NULL;
376 rclp->tail = &rclp->head;
377}
378
379/*
380 * Advance the callbacks in the specified rcu_segcblist structure based
381 * on the current value passed in for the grace-period counter.
382 */
383void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
384{
385 int i, j;
386
387 WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
388 if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
389 return;
390
391 /*
392 * Find all callbacks whose ->gp_seq numbers indicate that they
393 * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
394 */
395 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
396 if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
397 break;
398 rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i];
399 }
400
401 /* If no callbacks moved, nothing more need be done. */
402 if (i == RCU_WAIT_TAIL)
403 return;
404
405 /* Clean up tail pointers that might have been misordered above. */
406 for (j = RCU_WAIT_TAIL; j < i; j++)
407 rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL];
408
409 /*
410 * Callbacks moved, so clean up the misordered ->tails[] pointers
411 * that now point into the middle of the list of ready-to-invoke
412 * callbacks. The overall effect is to copy down the later pointers
413 * into the gap that was created by the now-ready segments.
414 */
415 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
416 if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
417 break; /* No more callbacks. */
418 rsclp->tails[j] = rsclp->tails[i];
419 rsclp->gp_seq[j] = rsclp->gp_seq[i];
420 }
421}
422
423/*
424 * "Accelerate" callbacks based on more-accurate grace-period information.
425 * The reason for this is that RCU does not synchronize the beginnings and
426 * ends of grace periods, and that callbacks are posted locally. This in
427 * turn means that the callbacks must be labelled conservatively early
428 * on, as getting exact information would degrade both performance and
429 * scalability. When more accurate grace-period information becomes
430 * available, previously posted callbacks can be "accelerated", marking
431 * them to complete at the end of the earlier grace period.
432 *
433 * This function operates on an rcu_segcblist structure, and also the
434 * grace-period sequence number seq at which new callbacks would become
435 * ready to invoke. Returns true if there are callbacks that won't be
436 * ready to invoke until seq, false otherwise.
437 */
438bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
439{
440 int i;
441
442 WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
443 if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
444 return false;
445
446 /*
447 * Find the segment preceding the oldest segment of callbacks
448 * whose ->gp_seq[] completion is at or after that passed in via
449 * "seq", skipping any empty segments. This oldest segment, along
450 * with any later segments, can be merged in with any newly arrived
451 * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq"
452 * as their ->gp_seq[] grace-period completion sequence number.
453 */
454 for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
455 if (rsclp->tails[i] != rsclp->tails[i - 1] &&
456 ULONG_CMP_LT(rsclp->gp_seq[i], seq))
457 break;
458
459 /*
460 * If all the segments contain callbacks that correspond to
461 * earlier grace-period sequence numbers than "seq", leave.
462 * Assuming that the rcu_segcblist structure has enough
463 * segments in its arrays, this can only happen if some of
464 * the non-done segments contain callbacks that really are
465 * ready to invoke. This situation will get straightened
466 * out by the next call to rcu_segcblist_advance().
467 *
468 * Also advance to the oldest segment of callbacks whose
469 * ->gp_seq[] completion is at or after that passed in via "seq",
470 * skipping any empty segments.
471 */
472 if (++i >= RCU_NEXT_TAIL)
473 return false;
474
475 /*
476 * Merge all later callbacks, including newly arrived callbacks,
477 * into the segment located by the for-loop above. Assign "seq"
478 * as the ->gp_seq[] value in order to correctly handle the case
479 * where there were no pending callbacks in the rcu_segcblist
480 * structure other than in the RCU_NEXT_TAIL segment.
481 */
482 for (; i < RCU_NEXT_TAIL; i++) {
483 rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL];
484 rsclp->gp_seq[i] = seq;
485 }
486 return true;
487}
488
489/*
490 * Scan the specified rcu_segcblist structure for callbacks that need
491 * a grace period later than the one specified by "seq". We don't look
492 * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't
493 * have a grace-period sequence number.
494 */
495bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
496 unsigned long seq)
497{
498 int i;
499
500 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
501 if (rsclp->tails[i - 1] != rsclp->tails[i] &&
502 ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
503 return true;
504 return false;
505}
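Taken together, the functions in rcu_segcblist.c implement one callback lifecycle: enqueue into RCU_NEXT_TAIL, accelerate (tag with the grace period that will cover the callback), advance once that grace period ends, then extract and invoke the done callbacks. Here is a hedged sketch of that flow, roughly what srcu_drive_gp() does later in this series, using illustrative my_cbs/my_gp_seq names and assuming no concurrent enqueues (the real code disables interrupts around the extract and insert steps).

static struct rcu_segcblist my_cbs;     /* rcu_segcblist_init() at boot. */
static unsigned long my_gp_seq;         /* Hypothetical rcu_seq counter. */

static void my_post_cb(struct rcu_head *rhp, rcu_callback_t func)
{
        rhp->func = func;
        rcu_segcblist_enqueue(&my_cbs, rhp, false);  /* Lands in RCU_NEXT_TAIL. */
}

static void my_gp_and_invoke(void)
{
        struct rcu_cblist ready;
        struct rcu_head *rhp;

        /* Tag recently queued callbacks with the GP that will cover them. */
        rcu_segcblist_accelerate(&my_cbs, rcu_seq_snap(&my_gp_seq));

        rcu_seq_start(&my_gp_seq);
        /* ... wait for all pre-existing readers ... */
        rcu_seq_end(&my_gp_seq);

        /* Move the now-covered callbacks to RCU_DONE_TAIL and invoke them. */
        rcu_segcblist_advance(&my_cbs, rcu_seq_current(&my_gp_seq));
        rcu_cblist_init(&ready);
        rcu_segcblist_extract_done_cbs(&my_cbs, &ready);
        while ((rhp = rcu_cblist_dequeue(&ready)) != NULL)
                rhp->func(rhp);
        /* Each dequeue drove ready.len negative; this debits my_cbs.len to match. */
        rcu_segcblist_insert_count(&my_cbs, &ready);
}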
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
new file mode 100644
index 000000000000..6e36e36478cd
--- /dev/null
+++ b/kernel/rcu/rcu_segcblist.h
@@ -0,0 +1,164 @@
1/*
2 * RCU segmented callback lists, internal-to-rcu header file
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright IBM Corporation, 2017
19 *
20 * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#include <linux/rcu_segcblist.h>
24
25/*
26 * Account for the fact that a previously dequeued callback turned out
27 * to be marked as lazy.
28 */
29static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
30{
31 rclp->len_lazy--;
32}
33
34/*
35 * Interim function to return rcu_cblist head pointer. Longer term, the
36 * rcu_cblist will be used more pervasively, removing the need for this
37 * function.
38 */
39static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
40{
41 return rclp->head;
42}
43
44/*
45 * Interim function to return rcu_cblist tail pointer. Longer term, the
46 * rcu_cblist will be used more pervasively, removing the need for this
47 * function.
48 */
49static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
50{
51 WARN_ON_ONCE(!rclp->head);
52 return rclp->tail;
53}
54
55void rcu_cblist_init(struct rcu_cblist *rclp);
56long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
57struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
58
59/*
60 * Is the specified rcu_segcblist structure empty?
61 *
62 * But careful! The fact that the ->head field is NULL does not
63 * necessarily imply that there are no callbacks associated with
64 * this structure. When callbacks are being invoked, they are
65 * removed as a group. If callback invocation must be preempted,
66 * the remaining callbacks will be added back to the list. Either
67 * way, the counts are updated later.
68 *
69 * So it is often the case that rcu_segcblist_n_cbs() should be used
70 * instead.
71 */
72static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)
73{
74 return !rsclp->head;
75}
76
77/* Return number of callbacks in segmented callback list. */
78static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
79{
80 return READ_ONCE(rsclp->len);
81}
82
83/* Return number of lazy callbacks in segmented callback list. */
84static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
85{
86 return rsclp->len_lazy;
87}
88
89/* Return number of non-lazy callbacks in segmented callback list. */
90static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
91{
92 return rsclp->len - rsclp->len_lazy;
93}
94
95/*
96 * Is the specified rcu_segcblist enabled, for example, not corresponding
97 * to an offline or callback-offloaded CPU?
98 */
99static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
100{
101 return !!rsclp->tails[RCU_NEXT_TAIL];
102}
103
104/*
105 * Are all segments following the specified segment of the specified
106 * rcu_segcblist structure empty of callbacks? (The specified
107 * segment might well contain callbacks.)
108 */
109static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
110{
111 return !*rsclp->tails[seg];
112}
113
114/*
115 * Interim function to return rcu_segcblist head pointer. Longer term, the
116 * rcu_segcblist will be used more pervasively, removing the need for this
117 * function.
118 */
119static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp)
120{
121 return rsclp->head;
122}
123
124/*
125 * Interim function to return rcu_segcblist tail pointer. Longer term, the
126 * rcu_segcblist will be used more pervasively, removing the need for this
127 * function.
128 */
129static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
130{
131 WARN_ON_ONCE(rcu_segcblist_empty(rsclp));
132 return rsclp->tails[RCU_NEXT_TAIL];
133}
134
135void rcu_segcblist_init(struct rcu_segcblist *rsclp);
136void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
137bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
138bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
139bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
140struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
141void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
142struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
143struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
144bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
145void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
146 struct rcu_head *rhp, bool lazy);
147bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
148 struct rcu_head *rhp, bool lazy);
149void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
150 struct rcu_cblist *rclp);
151void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
152 struct rcu_cblist *rclp);
153void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
154 struct rcu_cblist *rclp);
155void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
156 struct rcu_cblist *rclp);
157void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
158 struct rcu_cblist *rclp);
159void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
160 struct rcu_cblist *rclp);
161void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
162bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
163bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
164 unsigned long seq);
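As the comment above rcu_segcblist_empty() warns, ->head can be NULL while callbacks extracted for invocation are still running, because the counts are updated only afterwards. So "are there callbacks outstanding?" questions, such as the one rcu_barrier() asks, must use the count rather than the head pointer. A small illustrative helper (the name is hypothetical):

/* Hypothetical check used by a barrier-like primitive. */
static bool my_cbs_outstanding(struct rcu_segcblist *rsclp)
{
        /*
         * Not rcu_segcblist_empty(): ->head can be transiently NULL
         * while previously extracted callbacks are still being invoked.
         * ->len is debited only once invocation completes, so a nonzero
         * count reliably means work remains in flight.
         */
        return rcu_segcblist_n_cbs(rsclp) != 0;
}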
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index cccc417a8135..ae6e574d4cf5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -559,19 +559,34 @@ static void srcu_torture_barrier(void)
559 559
560static void srcu_torture_stats(void) 560static void srcu_torture_stats(void)
561{ 561{
562 int cpu; 562 int __maybe_unused cpu;
563 int idx = srcu_ctlp->completed & 0x1; 563 int idx;
564 564
565 pr_alert("%s%s per-CPU(idx=%d):", 565#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU)
566#ifdef CONFIG_TREE_SRCU
567 idx = srcu_ctlp->srcu_idx & 0x1;
568#else /* #ifdef CONFIG_TREE_SRCU */
569 idx = srcu_ctlp->completed & 0x1;
570#endif /* #else #ifdef CONFIG_TREE_SRCU */
571 pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
566 torture_type, TORTURE_FLAG, idx); 572 torture_type, TORTURE_FLAG, idx);
567 for_each_possible_cpu(cpu) { 573 for_each_possible_cpu(cpu) {
568 unsigned long l0, l1; 574 unsigned long l0, l1;
569 unsigned long u0, u1; 575 unsigned long u0, u1;
570 long c0, c1; 576 long c0, c1;
571 struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); 577#ifdef CONFIG_TREE_SRCU
578 struct srcu_data *counts;
572 579
580 counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
581 u0 = counts->srcu_unlock_count[!idx];
582 u1 = counts->srcu_unlock_count[idx];
583#else /* #ifdef CONFIG_TREE_SRCU */
584 struct srcu_array *counts;
585
586 counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
573 u0 = counts->unlock_count[!idx]; 587 u0 = counts->unlock_count[!idx];
574 u1 = counts->unlock_count[idx]; 588 u1 = counts->unlock_count[idx];
589#endif /* #else #ifdef CONFIG_TREE_SRCU */
575 590
576 /* 591 /*
577 * Make sure that a lock is always counted if the corresponding 592 * Make sure that a lock is always counted if the corresponding
@@ -579,14 +594,26 @@ static void srcu_torture_stats(void)
579 */ 594 */
580 smp_rmb(); 595 smp_rmb();
581 596
597#ifdef CONFIG_TREE_SRCU
598 l0 = counts->srcu_lock_count[!idx];
599 l1 = counts->srcu_lock_count[idx];
600#else /* #ifdef CONFIG_TREE_SRCU */
582 l0 = counts->lock_count[!idx]; 601 l0 = counts->lock_count[!idx];
583 l1 = counts->lock_count[idx]; 602 l1 = counts->lock_count[idx];
603#endif /* #else #ifdef CONFIG_TREE_SRCU */
584 604
585 c0 = l0 - u0; 605 c0 = l0 - u0;
586 c1 = l1 - u1; 606 c1 = l1 - u1;
587 pr_cont(" %d(%ld,%ld)", cpu, c0, c1); 607 pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
588 } 608 }
589 pr_cont("\n"); 609 pr_cont("\n");
610#elif defined(CONFIG_TINY_SRCU)
611 idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
612 pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n",
613 torture_type, TORTURE_FLAG, idx,
614 READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
615 READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
616#endif
590} 617}
591 618
592static void srcu_torture_synchronize_expedited(void) 619static void srcu_torture_synchronize_expedited(void)
@@ -1333,12 +1360,14 @@ rcu_torture_stats_print(void)
1333 cur_ops->stats(); 1360 cur_ops->stats();
1334 if (rtcv_snap == rcu_torture_current_version && 1361 if (rtcv_snap == rcu_torture_current_version &&
1335 rcu_torture_current != NULL) { 1362 rcu_torture_current != NULL) {
1336 int __maybe_unused flags; 1363 int __maybe_unused flags = 0;
1337 unsigned long __maybe_unused gpnum; 1364 unsigned long __maybe_unused gpnum = 0;
1338 unsigned long __maybe_unused completed; 1365 unsigned long __maybe_unused completed = 0;
1339 1366
1340 rcutorture_get_gp_data(cur_ops->ttype, 1367 rcutorture_get_gp_data(cur_ops->ttype,
1341 &flags, &gpnum, &completed); 1368 &flags, &gpnum, &completed);
1369 srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
1370 &flags, &gpnum, &completed);
1342 wtp = READ_ONCE(writer_task); 1371 wtp = READ_ONCE(writer_task);
1343 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", 1372 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
1344 rcu_torture_writer_state_getname(), 1373 rcu_torture_writer_state_getname(),
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index ef3bcfb15b39..584d8a983883 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -22,7 +22,7 @@
22 * Lai Jiangshan <laijs@cn.fujitsu.com> 22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 * 23 *
24 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
26 * 26 *
27 */ 27 */
28 28
@@ -243,8 +243,14 @@ static bool srcu_readers_active(struct srcu_struct *sp)
243 * cleanup_srcu_struct - deconstruct a sleep-RCU structure 243 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
244 * @sp: structure to clean up. 244 * @sp: structure to clean up.
245 * 245 *
246 * Must invoke this after you are finished using a given srcu_struct that 246 * Must invoke this only after you are finished using a given srcu_struct
247 * was initialized via init_srcu_struct(), else you leak memory. 247 * that was initialized via init_srcu_struct(). This code does some
248 * probabilistic checking, spotting late uses of srcu_read_lock(),
249 * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
250 * If any such late uses are detected, the per-CPU memory associated with
251 * the srcu_struct is simply leaked and WARN_ON() is invoked. If the
252 * caller frees the srcu_struct itself, a use-after-free crash will likely
253 * ensue, but at least there will be a warning printed.
248 */ 254 */
249void cleanup_srcu_struct(struct srcu_struct *sp) 255void cleanup_srcu_struct(struct srcu_struct *sp)
250{ 256{
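The expanded cleanup_srcu_struct() comment amounts to an ordering rule for teardown: quiesce all users, wait for posted SRCU callbacks, and only then clean up and free. A hedged sketch of a shutdown path that honors it (my_srcu and my_exit are hypothetical):

static struct srcu_struct my_srcu;      /* init_srcu_struct() during setup. */

static void my_exit(void)
{
        /* 1. Ensure nothing can issue new srcu_read_lock()/call_srcu() here. */

        /* 2. Wait for previously posted call_srcu() callbacks to run. */
        srcu_barrier(&my_srcu);

        /* 3. Now the checks in cleanup_srcu_struct() will find it idle. */
        cleanup_srcu_struct(&my_srcu);
}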
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
new file mode 100644
index 000000000000..36e1f82faed1
--- /dev/null
+++ b/kernel/rcu/srcutiny.c
@@ -0,0 +1,216 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion,
3 * tiny version for non-preemptible single-CPU use.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, you can access it online at
17 * http://www.gnu.org/licenses/gpl-2.0.html.
18 *
19 * Copyright (C) IBM Corporation, 2017
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 */
23
24#include <linux/export.h>
25#include <linux/mutex.h>
26#include <linux/preempt.h>
27#include <linux/rcupdate_wait.h>
28#include <linux/sched.h>
29#include <linux/delay.h>
30#include <linux/srcu.h>
31
32#include <linux/rcu_node_tree.h>
33#include "rcu_segcblist.h"
34#include "rcu.h"
35
36static int init_srcu_struct_fields(struct srcu_struct *sp)
37{
38 sp->srcu_lock_nesting[0] = 0;
39 sp->srcu_lock_nesting[1] = 0;
40 init_swait_queue_head(&sp->srcu_wq);
41 sp->srcu_gp_seq = 0;
42 rcu_segcblist_init(&sp->srcu_cblist);
43 sp->srcu_gp_running = false;
44 sp->srcu_gp_waiting = false;
45 sp->srcu_idx = 0;
46 INIT_WORK(&sp->srcu_work, srcu_drive_gp);
47 return 0;
48}
49
50#ifdef CONFIG_DEBUG_LOCK_ALLOC
51
52int __init_srcu_struct(struct srcu_struct *sp, const char *name,
53 struct lock_class_key *key)
54{
55 /* Don't re-initialize a lock while it is held. */
56 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
57 lockdep_init_map(&sp->dep_map, name, key, 0);
58 return init_srcu_struct_fields(sp);
59}
60EXPORT_SYMBOL_GPL(__init_srcu_struct);
61
62#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
63
64/*
65 * init_srcu_struct - initialize a sleep-RCU structure
66 * @sp: structure to initialize.
67 *
68 * Must invoke this on a given srcu_struct before passing that srcu_struct
69 * to any other function. Each srcu_struct represents a separate domain
70 * of SRCU protection.
71 */
72int init_srcu_struct(struct srcu_struct *sp)
73{
74 return init_srcu_struct_fields(sp);
75}
76EXPORT_SYMBOL_GPL(init_srcu_struct);
77
78#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
79
80/*
81 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
82 * @sp: structure to clean up.
83 *
84 * Must invoke this after you are finished using a given srcu_struct that
85 * was initialized via init_srcu_struct(), else you leak memory.
86 */
87void cleanup_srcu_struct(struct srcu_struct *sp)
88{
89 WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
90 flush_work(&sp->srcu_work);
91 WARN_ON(rcu_seq_state(sp->srcu_gp_seq));
92 WARN_ON(sp->srcu_gp_running);
93 WARN_ON(sp->srcu_gp_waiting);
94 WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist));
95}
96EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
97
98/*
99 * Counts the new reader in the appropriate per-CPU element of the
100 * srcu_struct. Must be called from process context.
101 * Returns an index that must be passed to the matching srcu_read_unlock().
102 */
103int __srcu_read_lock(struct srcu_struct *sp)
104{
105 int idx;
106
107 idx = READ_ONCE(sp->srcu_idx);
108 WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
109 return idx;
110}
111EXPORT_SYMBOL_GPL(__srcu_read_lock);
112
113/*
114 * Removes the count for the old reader from the appropriate element of
115 * the srcu_struct. Must be called from process context.
116 */
117void __srcu_read_unlock(struct srcu_struct *sp, int idx)
118{
119 int newval = sp->srcu_lock_nesting[idx] - 1;
120
121 WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
122 if (!newval && READ_ONCE(sp->srcu_gp_waiting))
123 swake_up(&sp->srcu_wq);
124}
125EXPORT_SYMBOL_GPL(__srcu_read_unlock);
126
127/*
128 * Workqueue handler to drive one grace period and invoke any callbacks
129 * that become ready as a result. Single-CPU and !PREEMPT operation
130 * means that we get away with murder on synchronization. ;-)
131 */
132void srcu_drive_gp(struct work_struct *wp)
133{
134 int idx;
135 struct rcu_cblist ready_cbs;
136 struct srcu_struct *sp;
137 struct rcu_head *rhp;
138
139 sp = container_of(wp, struct srcu_struct, srcu_work);
140 if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist))
141 return; /* Already running or nothing to do. */
142
143 /* Tag recently arrived callbacks and wait for readers. */
144 WRITE_ONCE(sp->srcu_gp_running, true);
145 rcu_segcblist_accelerate(&sp->srcu_cblist,
146 rcu_seq_snap(&sp->srcu_gp_seq));
147 rcu_seq_start(&sp->srcu_gp_seq);
148 idx = sp->srcu_idx;
149 WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
150 WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
151 swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
152 WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
153 rcu_seq_end(&sp->srcu_gp_seq);
154
155 /* Update callback list based on GP, and invoke ready callbacks. */
156 rcu_segcblist_advance(&sp->srcu_cblist,
157 rcu_seq_current(&sp->srcu_gp_seq));
158 if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) {
159 rcu_cblist_init(&ready_cbs);
160 local_irq_disable();
161 rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs);
162 local_irq_enable();
163 rhp = rcu_cblist_dequeue(&ready_cbs);
164 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
165 local_bh_disable();
166 rhp->func(rhp);
167 local_bh_enable();
168 }
169 local_irq_disable();
170 rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs);
171 local_irq_enable();
172 }
173 WRITE_ONCE(sp->srcu_gp_running, false);
174
175 /*
176 * If more callbacks, reschedule ourselves. This can race with
177 * a call_srcu() at interrupt level, but the ->srcu_gp_running
178 * checks will straighten that out.
179 */
180 if (!rcu_segcblist_empty(&sp->srcu_cblist))
181 schedule_work(&sp->srcu_work);
182}
183EXPORT_SYMBOL_GPL(srcu_drive_gp);
184
185/*
186 * Enqueue an SRCU callback on the specified srcu_struct structure,
187 * initiating grace-period processing if it is not already running.
188 */
189void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
190 rcu_callback_t func)
191{
192 unsigned long flags;
193
194 head->func = func;
195 local_irq_save(flags);
196 rcu_segcblist_enqueue(&sp->srcu_cblist, head, false);
197 local_irq_restore(flags);
198 if (!READ_ONCE(sp->srcu_gp_running))
199 schedule_work(&sp->srcu_work);
200}
201EXPORT_SYMBOL_GPL(call_srcu);
202
203/*
204 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
205 */
206void synchronize_srcu(struct srcu_struct *sp)
207{
208 struct rcu_synchronize rs;
209
210 init_rcu_head_on_stack(&rs.head);
211 init_completion(&rs.completion);
212 call_srcu(sp, &rs.head, wakeme_after_rcu);
213 wait_for_completion(&rs.completion);
214 destroy_rcu_head_on_stack(&rs.head);
215}
216EXPORT_SYMBOL_GPL(synchronize_srcu);
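/*
 * Illustrative update-side sketch (not part of this patch) matching the
 * reader sketch above: publish a new version, let synchronize_srcu() wait
 * out pre-existing readers, then free the old version.  my_srcu, my_config
 * and my_config_replace() are hypothetical names; rcu_assign_pointer(),
 * synchronize_srcu() and kfree() are the real API.
 */
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/srcu.h>

static DEFINE_MUTEX(my_config_mutex);		/* serializes updaters */

static int my_config_replace(int new_threshold)
{
	struct my_config *newp, *oldp;

	newp = kzalloc(sizeof(*newp), GFP_KERNEL);
	if (!newp)
		return -ENOMEM;
	newp->threshold = new_threshold;

	mutex_lock(&my_config_mutex);
	oldp = rcu_dereference_protected(my_config,
					 lockdep_is_held(&my_config_mutex));
	rcu_assign_pointer(my_config, newp);	/* publish the new version */
	mutex_unlock(&my_config_mutex);

	synchronize_srcu(&my_srcu);		/* wait for pre-existing readers */
	kfree(oldp);				/* now nobody can reference oldp */
	return 0;
}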
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
new file mode 100644
index 000000000000..3ae8474557df
--- /dev/null
+++ b/kernel/rcu/srcutree.c
@@ -0,0 +1,1155 @@
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 *
24 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt
26 *
27 */
28
29#include <linux/export.h>
30#include <linux/mutex.h>
31#include <linux/percpu.h>
32#include <linux/preempt.h>
33#include <linux/rcupdate_wait.h>
34#include <linux/sched.h>
35#include <linux/smp.h>
36#include <linux/delay.h>
37#include <linux/module.h>
38#include <linux/srcu.h>
39
40#include "rcu.h"
41#include "rcu_segcblist.h"
42
43ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */
44module_param(exp_holdoff, ulong, 0444);
45
46static void srcu_invoke_callbacks(struct work_struct *work);
47static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
48
49/*
50 * Initialize SRCU combining tree. Note that statically allocated
51 * srcu_struct structures might already have srcu_read_lock() and
52 * srcu_read_unlock() running against them. So if the is_static parameter
53 * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
54 */
55static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
56{
57 int cpu;
58 int i;
59 int level = 0;
60 int levelspread[RCU_NUM_LVLS];
61 struct srcu_data *sdp;
62 struct srcu_node *snp;
63 struct srcu_node *snp_first;
64
65 /* Work out the overall tree geometry. */
66 sp->level[0] = &sp->node[0];
67 for (i = 1; i < rcu_num_lvls; i++)
68 sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1];
69 rcu_init_levelspread(levelspread, num_rcu_lvl);
70
71 /* Each pass through this loop initializes one srcu_node structure. */
72 rcu_for_each_node_breadth_first(sp, snp) {
73 spin_lock_init(&snp->lock);
74 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
75 ARRAY_SIZE(snp->srcu_data_have_cbs));
76 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
77 snp->srcu_have_cbs[i] = 0;
78 snp->srcu_data_have_cbs[i] = 0;
79 }
80 snp->srcu_gp_seq_needed_exp = 0;
81 snp->grplo = -1;
82 snp->grphi = -1;
83 if (snp == &sp->node[0]) {
84 /* Root node, special case. */
85 snp->srcu_parent = NULL;
86 continue;
87 }
88
89 /* Non-root node. */
90 if (snp == sp->level[level + 1])
91 level++;
92 snp->srcu_parent = sp->level[level - 1] +
93 (snp - sp->level[level]) /
94 levelspread[level - 1];
95 }
96
97 /*
98 * Initialize the per-CPU srcu_data array, which feeds into the
99 * leaves of the srcu_node tree.
100 */
101 WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
102 ARRAY_SIZE(sdp->srcu_unlock_count));
103 level = rcu_num_lvls - 1;
104 snp_first = sp->level[level];
105 for_each_possible_cpu(cpu) {
106 sdp = per_cpu_ptr(sp->sda, cpu);
107 spin_lock_init(&sdp->lock);
108 rcu_segcblist_init(&sdp->srcu_cblist);
109 sdp->srcu_cblist_invoking = false;
110 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
111 sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq;
112 sdp->mynode = &snp_first[cpu / levelspread[level]];
113 for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
114 if (snp->grplo < 0)
115 snp->grplo = cpu;
116 snp->grphi = cpu;
117 }
118 sdp->cpu = cpu;
119 INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks);
120 sdp->sp = sp;
121 sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
122 if (is_static)
123 continue;
124
125 /* Dynamically allocated, better be no srcu_read_locks()! */
126 for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
127 sdp->srcu_lock_count[i] = 0;
128 sdp->srcu_unlock_count[i] = 0;
129 }
130 }
131}
132
133/*
134 * Initialize non-compile-time initialized fields, including the
135 * associated srcu_node and srcu_data structures. The is_static
136 * parameter is passed through to init_srcu_struct_nodes(), and
137 * also tells us that ->sda has already been wired up to srcu_data.
138 */
139static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static)
140{
141 mutex_init(&sp->srcu_cb_mutex);
142 mutex_init(&sp->srcu_gp_mutex);
143 sp->srcu_idx = 0;
144 sp->srcu_gp_seq = 0;
145 sp->srcu_barrier_seq = 0;
146 mutex_init(&sp->srcu_barrier_mutex);
147 atomic_set(&sp->srcu_barrier_cpu_cnt, 0);
148 INIT_DELAYED_WORK(&sp->work, process_srcu);
149 if (!is_static)
150 sp->sda = alloc_percpu(struct srcu_data);
151 init_srcu_struct_nodes(sp, is_static);
152 sp->srcu_gp_seq_needed_exp = 0;
153 sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
154 smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */
155 return sp->sda ? 0 : -ENOMEM;
156}
157
158#ifdef CONFIG_DEBUG_LOCK_ALLOC
159
160int __init_srcu_struct(struct srcu_struct *sp, const char *name,
161 struct lock_class_key *key)
162{
163 /* Don't re-initialize a lock while it is held. */
164 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
165 lockdep_init_map(&sp->dep_map, name, key, 0);
166 spin_lock_init(&sp->gp_lock);
167 return init_srcu_struct_fields(sp, false);
168}
169EXPORT_SYMBOL_GPL(__init_srcu_struct);
170
171#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
172
173/**
174 * init_srcu_struct - initialize a sleep-RCU structure
175 * @sp: structure to initialize.
176 *
177 * Must invoke this on a given srcu_struct before passing that srcu_struct
178 * to any other function. Each srcu_struct represents a separate domain
179 * of SRCU protection.
180 */
181int init_srcu_struct(struct srcu_struct *sp)
182{
183 spin_lock_init(&sp->gp_lock);
184 return init_srcu_struct_fields(sp, false);
185}
186EXPORT_SYMBOL_GPL(init_srcu_struct);
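/*
 * Illustrative sketch (not part of this patch) of dynamic initialization
 * via the init_srcu_struct() above.  struct my_dev and my_dev_create() are
 * hypothetical; statically allocated domains would instead use DEFINE_SRCU()
 * or DEFINE_STATIC_SRCU() and skip this step.
 */
#include <linux/slab.h>
#include <linux/srcu.h>

struct my_dev {
	struct srcu_struct srcu;		/* per-device SRCU domain */
	/* ... other per-device state ... */
};

static struct my_dev *my_dev_create(void)
{
	struct my_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return NULL;
	if (init_srcu_struct(&dev->srcu)) {	/* allocates ->sda and friends */
		kfree(dev);
		return NULL;			/* -ENOMEM from SRCU setup */
	}
	return dev;
}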
187
188#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
189
190/*
191 * First-use initialization of statically allocated srcu_struct
192 * structure. Wiring up the combining tree is more than can be
193 * done with compile-time initialization, so this check is added
194 * to each update-side SRCU primitive. Use ->gp_lock, which -is-
195 * compile-time initialized, to resolve races involving multiple
196 * CPUs trying to garner first-use privileges.
197 */
198static void check_init_srcu_struct(struct srcu_struct *sp)
199{
200 unsigned long flags;
201
202 WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT);
203 /* The smp_load_acquire() pairs with the smp_store_release(). */
204 if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
205 return; /* Already initialized. */
206 spin_lock_irqsave(&sp->gp_lock, flags);
207 if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
208 spin_unlock_irqrestore(&sp->gp_lock, flags);
209 return;
210 }
211 init_srcu_struct_fields(sp, true);
212 spin_unlock_irqrestore(&sp->gp_lock, flags);
213}
214
215/*
216 * Returns approximate total of the readers' ->srcu_lock_count[] values
217 * for the rank of per-CPU counters specified by idx.
218 */
219static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
220{
221 int cpu;
222 unsigned long sum = 0;
223
224 for_each_possible_cpu(cpu) {
225 struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
226
227 sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
228 }
229 return sum;
230}
231
232/*
233 * Returns approximate total of the readers' ->srcu_unlock_count[] values
234 * for the rank of per-CPU counters specified by idx.
235 */
236static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
237{
238 int cpu;
239 unsigned long sum = 0;
240
241 for_each_possible_cpu(cpu) {
242 struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
243
244 sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
245 }
246 return sum;
247}
248
249/*
250 * Return true if the number of pre-existing readers is determined to
251 * be zero.
252 */
253static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
254{
255 unsigned long unlocks;
256
257 unlocks = srcu_readers_unlock_idx(sp, idx);
258
259 /*
260 * Make sure that a lock is always counted if the corresponding
261 * unlock is counted. Needs to be a smp_mb() as the read side may
262 * contain a read from a variable that is written to before the
263 * synchronize_srcu() in the write side. In this case smp_mb()s
264 * A and B act like the store buffering pattern.
265 *
266 * This smp_mb() also pairs with smp_mb() C to prevent accesses
267 * after the synchronize_srcu() from being executed before the
268 * grace period ends.
269 */
270 smp_mb(); /* A */
271
272 /*
273 * If the locks are the same as the unlocks, then there must have
274 * been no readers on this index at some time in between. This does
275 * not mean that there are no more readers, as one could have read
276 * the current index but not have incremented the lock counter yet.
277 *
278 * Possible bug: There is no guarantee that there haven't been
279 * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were
280 * counted, meaning that this could return true even if there are
281 * still active readers. Since there are no memory barriers around
282 * srcu_flip(), the CPU is not required to increment ->srcu_idx
283 * before running srcu_readers_unlock_idx(), which means that there
284 * could be an arbitrarily large number of critical sections that
285 * execute after srcu_readers_unlock_idx() but use the old value
286 * of ->srcu_idx.
287 */
288 return srcu_readers_lock_idx(sp, idx) == unlocks;
289}
290
291/**
292 * srcu_readers_active - returns true if there are readers, and false
293 * otherwise
294 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
295 *
296 * Note that this is not an atomic primitive, and can therefore suffer
297 * severe errors when invoked on an active srcu_struct. That said, it
298 * can be useful as an error check at cleanup time.
299 */
300static bool srcu_readers_active(struct srcu_struct *sp)
301{
302 int cpu;
303 unsigned long sum = 0;
304
305 for_each_possible_cpu(cpu) {
306 struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
307
308 sum += READ_ONCE(cpuc->srcu_lock_count[0]);
309 sum += READ_ONCE(cpuc->srcu_lock_count[1]);
310 sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
311 sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
312 }
313 return sum;
314}
315
316#define SRCU_INTERVAL 1
317
318/*
319 * Return grace-period delay, zero if there are expedited grace
320 * periods pending, SRCU_INTERVAL otherwise.
321 */
322static unsigned long srcu_get_delay(struct srcu_struct *sp)
323{
324 if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq),
325 READ_ONCE(sp->srcu_gp_seq_needed_exp)))
326 return 0;
327 return SRCU_INTERVAL;
328}
329
330/**
331 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
332 * @sp: structure to clean up.
333 *
334 * Must invoke this after you are finished using a given srcu_struct that
335 * was initialized via init_srcu_struct(), else you leak memory.
336 */
337void cleanup_srcu_struct(struct srcu_struct *sp)
338{
339 int cpu;
340
341 if (WARN_ON(!srcu_get_delay(sp)))
342 return; /* Leakage unless caller handles error. */
343 if (WARN_ON(srcu_readers_active(sp)))
344 return; /* Leakage unless caller handles error. */
345 flush_delayed_work(&sp->work);
346 for_each_possible_cpu(cpu)
347 flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
348 if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
349 WARN_ON(srcu_readers_active(sp))) {
350 pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
351 return; /* Caller forgot to stop doing call_srcu()? */
352 }
353 free_percpu(sp->sda);
354 sp->sda = NULL;
355}
356EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
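/*
 * Illustrative teardown sketch (not part of this patch) showing the
 * ordering that keeps the WARN_ON()s in cleanup_srcu_struct() above quiet.
 * my_dev_destroy() pairs with the hypothetical my_dev_create() sketch
 * earlier in this file.
 */
static void my_dev_destroy(struct my_dev *dev)
{
	/* Caller has unpublished dev: no new readers, no new call_srcu(). */
	synchronize_srcu(&dev->srcu);	/* wait for pre-existing readers */
	srcu_barrier(&dev->srcu);	/* wait for in-flight callbacks */
	cleanup_srcu_struct(&dev->srcu);/* now safe to free ->sda */
	kfree(dev);
}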
357
358/*
359 * Counts the new reader in the appropriate per-CPU element of the
360 * srcu_struct. Must be called from process context.
361 * Returns an index that must be passed to the matching srcu_read_unlock().
362 */
363int __srcu_read_lock(struct srcu_struct *sp)
364{
365 int idx;
366
367 idx = READ_ONCE(sp->srcu_idx) & 0x1;
368 __this_cpu_inc(sp->sda->srcu_lock_count[idx]);
369 smp_mb(); /* B */ /* Avoid leaking the critical section. */
370 return idx;
371}
372EXPORT_SYMBOL_GPL(__srcu_read_lock);
373
374/*
375 * Removes the count for the old reader from the appropriate per-CPU
376 * element of the srcu_struct. Note that this may well be a different
377 * CPU than that which was incremented by the corresponding srcu_read_lock().
378 * Must be called from process context.
379 */
380void __srcu_read_unlock(struct srcu_struct *sp, int idx)
381{
382 smp_mb(); /* C */ /* Avoid leaking the critical section. */
383 this_cpu_inc(sp->sda->srcu_unlock_count[idx]);
384}
385EXPORT_SYMBOL_GPL(__srcu_read_unlock);
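/*
 * Illustrative sketch (not part of this patch) of the property that the
 * two read-side functions above preserve: unlike plain RCU, an SRCU
 * read-side critical section may block.  my_sleepable_srcu, my_slow_lock
 * and my_slow_read() are hypothetical; mutex_lock() stands in for any
 * sleeping operation performed under the SRCU read lock.
 */
#include <linux/mutex.h>
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(my_sleepable_srcu);		/* hypothetical SRCU domain */
static DEFINE_MUTEX(my_slow_lock);

static void my_slow_read(void)
{
	int idx;

	idx = srcu_read_lock(&my_sleepable_srcu);
	mutex_lock(&my_slow_lock);		/* sleeping is legal under SRCU */
	/* ... access SRCU-protected data, possibly blocking ... */
	mutex_unlock(&my_slow_lock);
	srcu_read_unlock(&my_sleepable_srcu, idx);
}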
386
387/*
388 * We use an adaptive strategy for synchronize_srcu() and especially for
389 * synchronize_srcu_expedited(). We spin for a fixed time period
390 * (defined below) to allow SRCU readers to exit their read-side critical
391 * sections. If there are still some readers after a few microseconds,
392 * we repeatedly block for 1-millisecond time periods.
393 */
394#define SRCU_RETRY_CHECK_DELAY 5
395
396/*
397 * Start an SRCU grace period.
398 */
399static void srcu_gp_start(struct srcu_struct *sp)
400{
401 struct srcu_data *sdp = this_cpu_ptr(sp->sda);
402 int state;
403
404 RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock),
405 "Invoked srcu_gp_start() without ->gp_lock!");
406 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
407 rcu_segcblist_advance(&sdp->srcu_cblist,
408 rcu_seq_current(&sp->srcu_gp_seq));
409 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
410 rcu_seq_snap(&sp->srcu_gp_seq));
411 smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
412 rcu_seq_start(&sp->srcu_gp_seq);
413 state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
414 WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
415}
416
417/*
418 * Track online CPUs to guide callback workqueue placement.
419 */
420DEFINE_PER_CPU(bool, srcu_online);
421
422void srcu_online_cpu(unsigned int cpu)
423{
424 WRITE_ONCE(per_cpu(srcu_online, cpu), true);
425}
426
427void srcu_offline_cpu(unsigned int cpu)
428{
429 WRITE_ONCE(per_cpu(srcu_online, cpu), false);
430}
431
432/*
433 * Place the workqueue handler on the specified CPU if online, otherwise
434 * just run it wherever. This is useful for placing workqueue handlers
435 * that are to invoke the specified CPU's callbacks.
436 */
437static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
438 struct delayed_work *dwork,
439 unsigned long delay)
440{
441 bool ret;
442
443 preempt_disable();
444 if (READ_ONCE(per_cpu(srcu_online, cpu)))
445 ret = queue_delayed_work_on(cpu, wq, dwork, delay);
446 else
447 ret = queue_delayed_work(wq, dwork, delay);
448 preempt_enable();
449 return ret;
450}
451
452/*
453 * Schedule callback invocation for the specified srcu_data structure,
454 * if possible, on the corresponding CPU.
455 */
456static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
457{
458 srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
459 &sdp->work, delay);
460}
461
462/*
463 * Schedule callback invocation for all srcu_data structures associated
464 * with the specified srcu_node structure that have callbacks for the
465 * just-completed grace period, the one corresponding to idx. If possible,
466 * schedule this invocation on the corresponding CPUs.
467 */
468static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp,
469 unsigned long mask, unsigned long delay)
470{
471 int cpu;
472
473 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
474 if (!(mask & (1 << (cpu - snp->grplo))))
475 continue;
476 srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay);
477 }
478}
479
480/*
481 * Note the end of an SRCU grace period. Initiates callback invocation
482 * and starts a new grace period if needed.
483 *
484 * The ->srcu_cb_mutex acquisition does not protect any data, but
485 * instead prevents more than one grace period from starting while we
486 * are initiating callback invocation. This allows the ->srcu_have_cbs[]
487 * array to have a finite number of elements.
488 */
489static void srcu_gp_end(struct srcu_struct *sp)
490{
491 unsigned long cbdelay;
492 bool cbs;
493 unsigned long gpseq;
494 int idx;
495 int idxnext;
496 unsigned long mask;
497 struct srcu_node *snp;
498
499 /* Prevent more than one additional grace period. */
500 mutex_lock(&sp->srcu_cb_mutex);
501
502 /* End the current grace period. */
503 spin_lock_irq(&sp->gp_lock);
504 idx = rcu_seq_state(sp->srcu_gp_seq);
505 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
506 cbdelay = srcu_get_delay(sp);
507 sp->srcu_last_gp_end = ktime_get_mono_fast_ns();
508 rcu_seq_end(&sp->srcu_gp_seq);
509 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
510 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
511 sp->srcu_gp_seq_needed_exp = gpseq;
512 spin_unlock_irq(&sp->gp_lock);
513 mutex_unlock(&sp->srcu_gp_mutex);
514 /* A new grace period can start at this point. But only one. */
515
516 /* Initiate callback invocation as needed. */
517 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
518 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
519 rcu_for_each_node_breadth_first(sp, snp) {
520 spin_lock_irq(&snp->lock);
521 cbs = false;
522 if (snp >= sp->level[rcu_num_lvls - 1])
523 cbs = snp->srcu_have_cbs[idx] == gpseq;
524 snp->srcu_have_cbs[idx] = gpseq;
525 rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
526 if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
527 snp->srcu_gp_seq_needed_exp = gpseq;
528 mask = snp->srcu_data_have_cbs[idx];
529 snp->srcu_data_have_cbs[idx] = 0;
530 spin_unlock_irq(&snp->lock);
531 if (cbs) {
532 smp_mb(); /* GP end before CB invocation. */
533 srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
534 }
535 }
536
537 /* Callback initiation done, allow grace periods after next. */
538 mutex_unlock(&sp->srcu_cb_mutex);
539
540 /* Start a new grace period if needed. */
541 spin_lock_irq(&sp->gp_lock);
542 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
543 if (!rcu_seq_state(gpseq) &&
544 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
545 srcu_gp_start(sp);
546 spin_unlock_irq(&sp->gp_lock);
547 /* Throttle expedited grace periods: Should be rare! */
548 srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
549 ? 0 : SRCU_INTERVAL);
550 } else {
551 spin_unlock_irq(&sp->gp_lock);
552 }
553}
554
555/*
556 * Funnel-locking scheme to scalably mediate many concurrent expedited
557 * grace-period requests. This function is invoked for the first known
558 * expedited request for a grace period that has already been requested,
559 * but without expediting. To start a completely new grace period,
560 * whether expedited or not, use srcu_funnel_gp_start() instead.
561 */
562static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
563 unsigned long s)
564{
565 unsigned long flags;
566
567 for (; snp != NULL; snp = snp->srcu_parent) {
568 if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
569 ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
570 return;
571 spin_lock_irqsave(&snp->lock, flags);
572 if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
573 spin_unlock_irqrestore(&snp->lock, flags);
574 return;
575 }
576 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
577 spin_unlock_irqrestore(&snp->lock, flags);
578 }
579 spin_lock_irqsave(&sp->gp_lock, flags);
580 if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
581 sp->srcu_gp_seq_needed_exp = s;
582 spin_unlock_irqrestore(&sp->gp_lock, flags);
583}
584
585/*
586 * Funnel-locking scheme to scalably mediate many concurrent grace-period
587 * requests. The winner has to do the work of actually starting grace
588 * period s. Losers must either ensure that their desired grace-period
589 * number is recorded on at least their leaf srcu_node structure, or they
590 * must take steps to invoke their own callbacks.
591 */
592static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
593 unsigned long s, bool do_norm)
594{
595 unsigned long flags;
596 int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
597 struct srcu_node *snp = sdp->mynode;
598 unsigned long snp_seq;
599
600 /* Each pass through the loop does one level of the srcu_node tree. */
601 for (; snp != NULL; snp = snp->srcu_parent) {
602 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
603 return; /* GP already done and CBs recorded. */
604 spin_lock_irqsave(&snp->lock, flags);
605 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
606 snp_seq = snp->srcu_have_cbs[idx];
607 if (snp == sdp->mynode && snp_seq == s)
608 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
609 spin_unlock_irqrestore(&snp->lock, flags);
610 if (snp == sdp->mynode && snp_seq != s) {
611 smp_mb(); /* CBs after GP! */
612 srcu_schedule_cbs_sdp(sdp, do_norm
613 ? SRCU_INTERVAL
614 : 0);
615 return;
616 }
617 if (!do_norm)
618 srcu_funnel_exp_start(sp, snp, s);
619 return;
620 }
621 snp->srcu_have_cbs[idx] = s;
622 if (snp == sdp->mynode)
623 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
624 if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
625 snp->srcu_gp_seq_needed_exp = s;
626 spin_unlock_irqrestore(&snp->lock, flags);
627 }
628
629 /* Top of tree, must ensure the grace period will be started. */
630 spin_lock_irqsave(&sp->gp_lock, flags);
631 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
632 /*
633 * Record need for grace period s. Pair with load
634 * acquire setting up for initialization.
635 */
636 smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/
637 }
638 if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
639 sp->srcu_gp_seq_needed_exp = s;
640
641 /* If grace period not already done and none in progress, start it. */
642 if (!rcu_seq_done(&sp->srcu_gp_seq, s) &&
643 rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
644 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
645 srcu_gp_start(sp);
646 queue_delayed_work(system_power_efficient_wq, &sp->work,
647 srcu_get_delay(sp));
648 }
649 spin_unlock_irqrestore(&sp->gp_lock, flags);
650}
651
652/*
653 * Wait until all readers counted by array index idx complete, but
654 * loop an additional time if there is an expedited grace period pending.
655 * The caller must ensure that ->srcu_idx is not changed while checking.
656 */
657static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
658{
659 for (;;) {
660 if (srcu_readers_active_idx_check(sp, idx))
661 return true;
662 if (--trycount + !srcu_get_delay(sp) <= 0)
663 return false;
664 udelay(SRCU_RETRY_CHECK_DELAY);
665 }
666}
667
668/*
669 * Increment the ->srcu_idx counter so that future SRCU readers will
670 * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
671 * us to wait for pre-existing readers in a starvation-free manner.
672 */
673static void srcu_flip(struct srcu_struct *sp)
674{
675 WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
676
677 /*
678 * Ensure that if the updater misses an __srcu_read_unlock()
679 * increment, that task's next __srcu_read_lock() will see the
680 * above counter update. Note that both this memory barrier
681 * and the one in srcu_readers_active_idx_check() provide the
682 * guarantee for __srcu_read_lock().
683 */
684 smp_mb(); /* D */ /* Pairs with C. */
685}
686
687/*
688 * If SRCU is likely idle, return true, otherwise return false.
689 *
690 * Note that it is OK for several concurrent from-idle requests for a new
691 * grace period to specify expediting, because they will all end
692 * up requesting the same grace period anyhow. So no loss.
693 *
694 * Note also that if any CPU (including the current one) is still invoking
695 * callbacks, this function will nevertheless say "idle". This is not
696 * ideal, but the overhead of checking all CPUs' callback lists is even
697 * less ideal, especially on large systems. Furthermore, the wakeup
698 * can happen before the callback is fully removed, so we have no choice
699 * but to accept this type of error.
700 *
701 * This function is also subject to counter-wrap errors, but let's face
702 * it, if this function was preempted for enough time for the counters
703 * to wrap, it really doesn't matter whether or not we expedite the grace
704 * period. The extra overhead of a needlessly expedited grace period is
705 * negligible when amortized over that time period, and the extra latency
706 * of a needlessly non-expedited grace period is similarly negligible.
707 */
708static bool srcu_might_be_idle(struct srcu_struct *sp)
709{
710 unsigned long curseq;
711 unsigned long flags;
712 struct srcu_data *sdp;
713 unsigned long t;
714
715 /* If the local srcu_data structure has callbacks, not idle. */
716 local_irq_save(flags);
717 sdp = this_cpu_ptr(sp->sda);
718 if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
719 local_irq_restore(flags);
720 return false; /* Callbacks already present, so not idle. */
721 }
722 local_irq_restore(flags);
723
724 /*
725 * No local callbacks, so probabilistically probe global state.
726 * Exact information would require acquiring locks, which would
727 * kill scalability, hence the probabilistic nature of the probe.
728 */
729
730 /* First, see if enough time has passed since the last GP. */
731 t = ktime_get_mono_fast_ns();
732 if (exp_holdoff == 0 ||
733 time_in_range_open(t, sp->srcu_last_gp_end,
734 sp->srcu_last_gp_end + exp_holdoff))
735 return false; /* Too soon after last GP. */
736
737 /* Next, check for probable idleness. */
738 curseq = rcu_seq_current(&sp->srcu_gp_seq);
739 smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
740 if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed)))
741 return false; /* Grace period in progress, so not idle. */
742 smp_mb(); /* Order ->srcu_gp_seq with prior access. */
743 if (curseq != rcu_seq_current(&sp->srcu_gp_seq))
744 return false; /* GP # changed, so not idle. */
745 return true; /* With reasonable probability, idle! */
746}
747
748/*
749 * Enqueue an SRCU callback on the srcu_data structure associated with
750 * the current CPU and the specified srcu_struct structure, initiating
751 * grace-period processing if it is not already running.
752 *
753 * Note that all CPUs must agree that the grace period extended beyond
754 * all pre-existing SRCU read-side critical sections. On systems with
755 * more than one CPU, this means that when "func()" is invoked, each CPU
756 * is guaranteed to have executed a full memory barrier since the end of
757 * its last corresponding SRCU read-side critical section whose beginning
758 * preceded the call to call_srcu(). It also means that each CPU executing
759 * an SRCU read-side critical section that continues beyond the start of
760 * "func()" must have executed a memory barrier after the call_srcu()
761 * but before the beginning of that SRCU read-side critical section.
762 * Note that these guarantees include CPUs that are offline, idle, or
763 * executing in user mode, as well as CPUs that are executing in the kernel.
764 *
765 * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
766 * resulting SRCU callback function "func()", then both CPU A and CPU
767 * B are guaranteed to execute a full memory barrier during the time
768 * interval between the call to call_srcu() and the invocation of "func()".
769 * This guarantee applies even if CPU A and CPU B are the same CPU (but
770 * again only if the system has more than one CPU).
771 *
772 * Of course, these guarantees apply only for invocations of call_srcu(),
773 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
774 * srcu_struct structure.
775 */
776void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
777 rcu_callback_t func, bool do_norm)
778{
779 unsigned long flags;
780 bool needexp = false;
781 bool needgp = false;
782 unsigned long s;
783 struct srcu_data *sdp;
784
785 check_init_srcu_struct(sp);
786 rhp->func = func;
787 local_irq_save(flags);
788 sdp = this_cpu_ptr(sp->sda);
789 spin_lock(&sdp->lock);
790 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
791 rcu_segcblist_advance(&sdp->srcu_cblist,
792 rcu_seq_current(&sp->srcu_gp_seq));
793 s = rcu_seq_snap(&sp->srcu_gp_seq);
794 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
795 if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
796 sdp->srcu_gp_seq_needed = s;
797 needgp = true;
798 }
799 if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
800 sdp->srcu_gp_seq_needed_exp = s;
801 needexp = true;
802 }
803 spin_unlock_irqrestore(&sdp->lock, flags);
804 if (needgp)
805 srcu_funnel_gp_start(sp, sdp, s, do_norm);
806 else if (needexp)
807 srcu_funnel_exp_start(sp, sdp->mynode, s);
808}
809
810void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
811 rcu_callback_t func)
812{
813 __call_srcu(sp, rhp, func, true);
814}
815EXPORT_SYMBOL_GPL(call_srcu);
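/*
 * Illustrative call_srcu() sketch (not part of this patch): embed an
 * rcu_head in the protected object, recover the object with container_of()
 * in the callback, and remember that callbacks run with BH disabled (see
 * srcu_invoke_callbacks() below), so they must not sleep.  my_cb_srcu,
 * struct my_node and the two functions are hypothetical.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(my_cb_srcu);		/* hypothetical SRCU domain */

struct my_node {
	int key;
	struct rcu_head rh;		/* storage for the SRCU callback */
};

static void my_node_free_cb(struct rcu_head *rhp)
{
	struct my_node *np = container_of(rhp, struct my_node, rh);

	kfree(np);			/* no sleeping allowed here */
}

static void my_node_release(struct my_node *np)
{
	/* Caller has already unlinked np from all reader-visible paths. */
	call_srcu(&my_cb_srcu, &np->rh, my_node_free_cb);
}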
816
817/*
818 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
819 */
820static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
821{
822 struct rcu_synchronize rcu;
823
824 RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
825 lock_is_held(&rcu_bh_lock_map) ||
826 lock_is_held(&rcu_lock_map) ||
827 lock_is_held(&rcu_sched_lock_map),
828 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
829
830 if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
831 return;
832 might_sleep();
833 check_init_srcu_struct(sp);
834 init_completion(&rcu.completion);
835 init_rcu_head_on_stack(&rcu.head);
836 __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
837 wait_for_completion(&rcu.completion);
838 destroy_rcu_head_on_stack(&rcu.head);
839}
840
841/**
842 * synchronize_srcu_expedited - Brute-force SRCU grace period
843 * @sp: srcu_struct with which to synchronize.
844 *
845 * Wait for an SRCU grace period to elapse, but be more aggressive about
846 * spinning rather than blocking when waiting.
847 *
848 * Note that synchronize_srcu_expedited() has the same deadlock and
849 * memory-ordering properties as does synchronize_srcu().
850 */
851void synchronize_srcu_expedited(struct srcu_struct *sp)
852{
853 __synchronize_srcu(sp, rcu_gp_is_normal());
854}
855EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
856
857/**
858 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
859 * @sp: srcu_struct with which to synchronize.
860 *
861 * Wait for the counts of both index ranks to drain to zero. To avoid
862 * possible starvation of synchronize_srcu(), it first waits for the
863 * count of index ((->srcu_idx & 1) ^ 1) to drain to zero, and then
864 * flips ->srcu_idx and waits for the count of the other index to drain.
865 *
866 * Can block; must be called from process context.
867 *
868 * Note that it is illegal to call synchronize_srcu() from the corresponding
869 * SRCU read-side critical section; doing so will result in deadlock.
870 * However, it is perfectly legal to call synchronize_srcu() on one
871 * srcu_struct from some other srcu_struct's read-side critical section,
872 * as long as the resulting graph of srcu_structs is acyclic.
873 *
874 * There are memory-ordering constraints implied by synchronize_srcu().
875 * On systems with more than one CPU, when synchronize_srcu() returns,
876 * each CPU is guaranteed to have executed a full memory barrier since
877 * the end of its last corresponding SRCU read-side critical section
878 * whose beginning preceded the call to synchronize_srcu(). In addition,
879 * each CPU having an SRCU read-side critical section that extends beyond
880 * the return from synchronize_srcu() is guaranteed to have executed a
881 * full memory barrier after the beginning of synchronize_srcu() and before
882 * the beginning of that SRCU read-side critical section. Note that these
883 * guarantees include CPUs that are offline, idle, or executing in user mode,
884 * as well as CPUs that are executing in the kernel.
885 *
886 * Furthermore, if CPU A invoked synchronize_srcu(), which returned
887 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
888 * to have executed a full memory barrier during the execution of
889 * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
890 * are the same CPU, but again only if the system has more than one CPU.
891 *
892 * Of course, these memory-ordering guarantees apply only when
893 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
894 * passed the same srcu_struct structure.
895 *
896 * If SRCU is likely idle, expedite the first request. This semantic
897 * was provided by Classic SRCU, and is relied upon by its users, so TREE
898 * SRCU must also provide it. Note that detecting idleness is heuristic
899 * and subject to both false positives and negatives.
900 */
901void synchronize_srcu(struct srcu_struct *sp)
902{
903 if (srcu_might_be_idle(sp) || rcu_gp_is_expedited())
904 synchronize_srcu_expedited(sp);
905 else
906 __synchronize_srcu(sp, true);
907}
908EXPORT_SYMBOL_GPL(synchronize_srcu);
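/*
 * Illustrative sketch (not part of this patch) of the deadlock rule in the
 * header comment above: waiting on the domain you are currently reading
 * would self-deadlock, but waiting on a different srcu_struct from inside
 * a read-side critical section is legal as long as the graph of domains
 * stays acyclic.  srcu_a, srcu_b and my_cross_domain_update() are
 * hypothetical.
 */
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(srcu_a);		/* hypothetical domain A */
DEFINE_STATIC_SRCU(srcu_b);		/* hypothetical domain B */

static void my_cross_domain_update(void)
{
	int idx;

	idx = srcu_read_lock(&srcu_a);
	/* synchronize_srcu(&srcu_a) here would deadlock. */
	synchronize_srcu(&srcu_b);	/* legal: different, acyclic domain */
	srcu_read_unlock(&srcu_a, idx);
}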
909
910/*
911 * Callback function for srcu_barrier() use.
912 */
913static void srcu_barrier_cb(struct rcu_head *rhp)
914{
915 struct srcu_data *sdp;
916 struct srcu_struct *sp;
917
918 sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
919 sp = sdp->sp;
920 if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
921 complete(&sp->srcu_barrier_completion);
922}
923
924/**
925 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
926 * @sp: srcu_struct on which to wait for in-flight callbacks.
927 */
928void srcu_barrier(struct srcu_struct *sp)
929{
930 int cpu;
931 struct srcu_data *sdp;
932 unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq);
933
934 check_init_srcu_struct(sp);
935 mutex_lock(&sp->srcu_barrier_mutex);
936 if (rcu_seq_done(&sp->srcu_barrier_seq, s)) {
937 smp_mb(); /* Force ordering following return. */
938 mutex_unlock(&sp->srcu_barrier_mutex);
939 return; /* Someone else did our work for us. */
940 }
941 rcu_seq_start(&sp->srcu_barrier_seq);
942 init_completion(&sp->srcu_barrier_completion);
943
944 /* Initial count prevents reaching zero until all CBs are posted. */
945 atomic_set(&sp->srcu_barrier_cpu_cnt, 1);
946
947 /*
948 * Each pass through this loop enqueues a callback, but only
949 * on CPUs already having callbacks enqueued. Note that if
950 * a CPU already has callbacks enqueued, it must have already
951 * registered the need for a future grace period, so all we
952 * need do is enqueue a callback that will use the same
953 * grace period as the last callback already in the queue.
954 */
955 for_each_possible_cpu(cpu) {
956 sdp = per_cpu_ptr(sp->sda, cpu);
957 spin_lock_irq(&sdp->lock);
958 atomic_inc(&sp->srcu_barrier_cpu_cnt);
959 sdp->srcu_barrier_head.func = srcu_barrier_cb;
960 if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
961 &sdp->srcu_barrier_head, 0))
962 atomic_dec(&sp->srcu_barrier_cpu_cnt);
963 spin_unlock_irq(&sdp->lock);
964 }
965
966 /* Remove the initial count, at which point reaching zero can happen. */
967 if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
968 complete(&sp->srcu_barrier_completion);
969 wait_for_completion(&sp->srcu_barrier_completion);
970
971 rcu_seq_end(&sp->srcu_barrier_seq);
972 mutex_unlock(&sp->srcu_barrier_mutex);
973}
974EXPORT_SYMBOL_GPL(srcu_barrier);
975
976/**
977 * srcu_batches_completed - return batches completed.
978 * @sp: srcu_struct on which to report batch completion.
979 *
980 * Report the number of batches, correlated with, but not necessarily
981 * precisely the same as, the number of grace periods that have elapsed.
982 */
983unsigned long srcu_batches_completed(struct srcu_struct *sp)
984{
985 return sp->srcu_idx;
986}
987EXPORT_SYMBOL_GPL(srcu_batches_completed);
988
989/*
990 * Core SRCU state machine. Push state bits of ->srcu_gp_seq
991 * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
992 * completed in that state.
993 */
994static void srcu_advance_state(struct srcu_struct *sp)
995{
996 int idx;
997
998 mutex_lock(&sp->srcu_gp_mutex);
999
1000 /*
1001 * Because readers might be delayed for an extended period after
1002 * fetching ->srcu_idx for their index, at any point in time there
1003 * might well be readers using both idx=0 and idx=1. We therefore
1004 * need to wait for readers to clear from both index values before
1005 * invoking a callback.
1006 *
1007 * The load-acquire ensures that we see the accesses performed
1008 * by the prior grace period.
1009 */
1010 idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
1011 if (idx == SRCU_STATE_IDLE) {
1012 spin_lock_irq(&sp->gp_lock);
1013 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
1014 WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
1015 spin_unlock_irq(&sp->gp_lock);
1016 mutex_unlock(&sp->srcu_gp_mutex);
1017 return;
1018 }
1019 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
1020 if (idx == SRCU_STATE_IDLE)
1021 srcu_gp_start(sp);
1022 spin_unlock_irq(&sp->gp_lock);
1023 if (idx != SRCU_STATE_IDLE) {
1024 mutex_unlock(&sp->srcu_gp_mutex);
1025 return; /* Someone else started the grace period. */
1026 }
1027 }
1028
1029 if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
1030 idx = 1 ^ (sp->srcu_idx & 1);
1031 if (!try_check_zero(sp, idx, 1)) {
1032 mutex_unlock(&sp->srcu_gp_mutex);
1033 return; /* readers present, retry later. */
1034 }
1035 srcu_flip(sp);
1036 rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2);
1037 }
1038
1039 if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
1040
1041 /*
1042 * SRCU read-side critical sections are normally short,
1043 * so check at least twice in quick succession after a flip.
1044 */
1045 idx = 1 ^ (sp->srcu_idx & 1);
1046 if (!try_check_zero(sp, idx, 2)) {
1047 mutex_unlock(&sp->srcu_gp_mutex);
1048 return; /* readers present, retry later. */
1049 }
1050 srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */
1051 }
1052}
1053
1054/*
1055 * Invoke a limited number of SRCU callbacks that have passed through
1056 * their grace period. If there are more to do, SRCU will reschedule
1057 * the workqueue. Note that needed memory barriers have been executed
1058 * in this task's context by srcu_readers_active_idx_check().
1059 */
1060static void srcu_invoke_callbacks(struct work_struct *work)
1061{
1062 bool more;
1063 struct rcu_cblist ready_cbs;
1064 struct rcu_head *rhp;
1065 struct srcu_data *sdp;
1066 struct srcu_struct *sp;
1067
1068 sdp = container_of(work, struct srcu_data, work.work);
1069 sp = sdp->sp;
1070 rcu_cblist_init(&ready_cbs);
1071 spin_lock_irq(&sdp->lock);
1072 smp_mb(); /* Old grace periods before callback invocation! */
1073 rcu_segcblist_advance(&sdp->srcu_cblist,
1074 rcu_seq_current(&sp->srcu_gp_seq));
1075 if (sdp->srcu_cblist_invoking ||
1076 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
1077 spin_unlock_irq(&sdp->lock);
1078 return; /* Someone else on the job or nothing to do. */
1079 }
1080
1081 /* We are on the job! Extract and invoke ready callbacks. */
1082 sdp->srcu_cblist_invoking = true;
1083 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
1084 spin_unlock_irq(&sdp->lock);
1085 rhp = rcu_cblist_dequeue(&ready_cbs);
1086 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
1087 local_bh_disable();
1088 rhp->func(rhp);
1089 local_bh_enable();
1090 }
1091
1092 /*
1093 * Update counts, accelerate new callbacks, and if needed,
1094 * schedule another round of callback invocation.
1095 */
1096 spin_lock_irq(&sdp->lock);
1097 rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
1098 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
1099 rcu_seq_snap(&sp->srcu_gp_seq));
1100 sdp->srcu_cblist_invoking = false;
1101 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
1102 spin_unlock_irq(&sdp->lock);
1103 if (more)
1104 srcu_schedule_cbs_sdp(sdp, 0);
1105}
1106
1107/*
1108 * Finished one round of SRCU grace period. Start another if there are
1109 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
1110 */
1111static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
1112{
1113 bool pushgp = true;
1114
1115 spin_lock_irq(&sp->gp_lock);
1116 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
1117 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
1118 /* All requests fulfilled, time to go idle. */
1119 pushgp = false;
1120 }
1121 } else if (!rcu_seq_state(sp->srcu_gp_seq)) {
1122 /* Outstanding request and no GP. Start one. */
1123 srcu_gp_start(sp);
1124 }
1125 spin_unlock_irq(&sp->gp_lock);
1126
1127 if (pushgp)
1128 queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
1129}
1130
1131/*
1132 * This is the work-queue function that handles SRCU grace periods.
1133 */
1134void process_srcu(struct work_struct *work)
1135{
1136 struct srcu_struct *sp;
1137
1138 sp = container_of(work, struct srcu_struct, work.work);
1139
1140 srcu_advance_state(sp);
1141 srcu_reschedule(sp, srcu_get_delay(sp));
1142}
1143EXPORT_SYMBOL_GPL(process_srcu);
1144
1145void srcutorture_get_gp_data(enum rcutorture_type test_type,
1146 struct srcu_struct *sp, int *flags,
1147 unsigned long *gpnum, unsigned long *completed)
1148{
1149 if (test_type != SRCU_FLAVOR)
1150 return;
1151 *flags = 0;
1152 *completed = rcu_seq_ctr(sp->srcu_gp_seq);
1153 *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed);
1154}
1155EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 6ad330dbbae2..e5385731e391 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
79 */ 79 */
80static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 80static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
81{ 81{
82 RCU_TRACE(reset_cpu_stall_ticks(rcp)); 82 RCU_TRACE(reset_cpu_stall_ticks(rcp);)
83 if (rcp->donetail != rcp->curtail) { 83 if (rcp->donetail != rcp->curtail) {
84 rcp->donetail = rcp->curtail; 84 rcp->donetail = rcp->curtail;
85 return 1; 85 return 1;
@@ -125,7 +125,7 @@ void rcu_bh_qs(void)
125 */ 125 */
126void rcu_check_callbacks(int user) 126void rcu_check_callbacks(int user)
127{ 127{
128 RCU_TRACE(check_cpu_stalls()); 128 RCU_TRACE(check_cpu_stalls();)
129 if (user) 129 if (user)
130 rcu_sched_qs(); 130 rcu_sched_qs();
131 else if (!in_softirq()) 131 else if (!in_softirq())
@@ -143,7 +143,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
143 const char *rn = NULL; 143 const char *rn = NULL;
144 struct rcu_head *next, *list; 144 struct rcu_head *next, *list;
145 unsigned long flags; 145 unsigned long flags;
146 RCU_TRACE(int cb_count = 0); 146 RCU_TRACE(int cb_count = 0;)
147 147
148 /* Move the ready-to-invoke callbacks to a local list. */ 148 /* Move the ready-to-invoke callbacks to a local list. */
149 local_irq_save(flags); 149 local_irq_save(flags);
@@ -152,7 +152,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
152 local_irq_restore(flags); 152 local_irq_restore(flags);
153 return; 153 return;
154 } 154 }
155 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); 155 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);)
156 list = rcp->rcucblist; 156 list = rcp->rcucblist;
157 rcp->rcucblist = *rcp->donetail; 157 rcp->rcucblist = *rcp->donetail;
158 *rcp->donetail = NULL; 158 *rcp->donetail = NULL;
@@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 local_irq_restore(flags); 162 local_irq_restore(flags);
163 163
164 /* Invoke the callbacks on the local list. */ 164 /* Invoke the callbacks on the local list. */
165 RCU_TRACE(rn = rcp->name); 165 RCU_TRACE(rn = rcp->name;)
166 while (list) { 166 while (list) {
167 next = list->next; 167 next = list->next;
168 prefetch(next); 168 prefetch(next);
@@ -171,9 +171,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
171 __rcu_reclaim(rn, list); 171 __rcu_reclaim(rn, list);
172 local_bh_enable(); 172 local_bh_enable();
173 list = next; 173 list = next;
174 RCU_TRACE(cb_count++); 174 RCU_TRACE(cb_count++;)
175 } 175 }
176 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 176 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);)
177 RCU_TRACE(trace_rcu_batch_end(rcp->name, 177 RCU_TRACE(trace_rcu_batch_end(rcp->name,
178 cb_count, 0, need_resched(), 178 cb_count, 0, need_resched(),
179 is_idle_task(current), 179 is_idle_task(current),
@@ -221,7 +221,7 @@ static void __call_rcu(struct rcu_head *head,
221 local_irq_save(flags); 221 local_irq_save(flags);
222 *rcp->curtail = head; 222 *rcp->curtail = head;
223 rcp->curtail = &head->next; 223 rcp->curtail = &head->next;
224 RCU_TRACE(rcp->qlen++); 224 RCU_TRACE(rcp->qlen++;)
225 local_irq_restore(flags); 225 local_irq_restore(flags);
226 226
227 if (unlikely(is_idle_task(current))) { 227 if (unlikely(is_idle_task(current))) {
@@ -254,8 +254,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
254void __init rcu_init(void) 254void __init rcu_init(void)
255{ 255{
256 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 256 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
257 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); 257 RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);)
258 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); 258 RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);)
259 259
260 rcu_early_boot_tests(); 260 rcu_early_boot_tests();
261} 261}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index c64b827ecbca..371034e77f87 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
52 RCU_TRACE(.name = "rcu_bh") 52 RCU_TRACE(.name = "rcu_bh")
53}; 53};
54 54
55#ifdef CONFIG_DEBUG_LOCK_ALLOC 55#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
56#include <linux/kernel_stat.h> 56#include <linux/kernel_stat.h>
57 57
58int rcu_scheduler_active __read_mostly; 58int rcu_scheduler_active __read_mostly;
@@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
65 * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. 65 * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
66 * The reason for this is that Tiny RCU does not need kthreads, so does 66 * The reason for this is that Tiny RCU does not need kthreads, so does
67 * not have to care about the fact that the scheduler is half-initialized 67 * not have to care about the fact that the scheduler is half-initialized
68 * at a certain phase of the boot process. 68 * at a certain phase of the boot process. Unless SRCU is in the mix.
69 */ 69 */
70void __init rcu_scheduler_starting(void) 70void __init rcu_scheduler_starting(void)
71{ 71{
72 WARN_ON(nr_context_switches() > 0); 72 WARN_ON(nr_context_switches() > 0);
73 rcu_scheduler_active = RCU_SCHEDULER_RUNNING; 73 rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
74 ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
74} 75}
75 76
76#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 77#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
77 78
78#ifdef CONFIG_RCU_TRACE 79#ifdef CONFIG_RCU_TRACE
79 80
@@ -162,8 +163,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
162 163
163static void check_cpu_stalls(void) 164static void check_cpu_stalls(void)
164{ 165{
165 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); 166 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);)
166 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); 167 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);)
167} 168}
168 169
169#endif /* #ifdef CONFIG_RCU_TRACE */ 170#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a6dcf3bd244f..e354e475e645 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -98,8 +98,8 @@ struct rcu_state sname##_state = { \
98 .gpnum = 0UL - 300UL, \ 98 .gpnum = 0UL - 300UL, \
99 .completed = 0UL - 300UL, \ 99 .completed = 0UL - 300UL, \
100 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ 100 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
101 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 101 .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
102 .orphan_donetail = &sname##_state.orphan_donelist, \ 102 .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
104 .name = RCU_STATE_NAME(sname), \ 104 .name = RCU_STATE_NAME(sname), \
105 .abbr = sabbr, \ 105 .abbr = sabbr, \
@@ -124,7 +124,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
124module_param(rcu_fanout_leaf, int, 0444); 124module_param(rcu_fanout_leaf, int, 0444);
125int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 125int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
126/* Number of rcu_nodes at specified level. */ 126/* Number of rcu_nodes at specified level. */
127static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; 127int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
128int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 128int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
129/* panic() on RCU Stall sysctl. */ 129/* panic() on RCU Stall sysctl. */
130int sysctl_panic_on_rcu_stall __read_mostly; 130int sysctl_panic_on_rcu_stall __read_mostly;
@@ -200,7 +200,7 @@ static const int gp_cleanup_delay;
200 200
201/* 201/*
202 * Number of grace periods between delays, normalized by the duration of 202 * Number of grace periods between delays, normalized by the duration of
203 * the delay. The longer the the delay, the more the grace periods between 203 * the delay. The longer the delay, the more the grace periods between
204 * each delay. The reason for this normalization is that it means that, 204 * each delay. The reason for this normalization is that it means that,
205 * for non-zero delays, the overall slowdown of grace periods is constant 205 * for non-zero delays, the overall slowdown of grace periods is constant
206 * regardless of the duration of the delay. This arrangement balances 206 * regardless of the duration of the delay. This arrangement balances
@@ -273,11 +273,19 @@ void rcu_bh_qs(void)
273 } 273 }
274} 274}
275 275
276static DEFINE_PER_CPU(int, rcu_sched_qs_mask); 276/*
277 * Steal a bit from the bottom of ->dynticks for idle entry/exit
278 * control. Initially this is for TLB flushing.
279 */
280#define RCU_DYNTICK_CTRL_MASK 0x1
281#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
282#ifndef rcu_eqs_special_exit
283#define rcu_eqs_special_exit() do { } while (0)
284#endif
277 285
278static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 286static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
279 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 287 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
280 .dynticks = ATOMIC_INIT(1), 288 .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
281#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 289#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
282 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, 290 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
283 .dynticks_idle = ATOMIC_INIT(1), 291 .dynticks_idle = ATOMIC_INIT(1),
@@ -305,15 +313,20 @@ bool rcu_irq_enter_disabled(void)
305static void rcu_dynticks_eqs_enter(void) 313static void rcu_dynticks_eqs_enter(void)
306{ 314{
307 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 315 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
308 int special; 316 int seq;
309 317
310 /* 318 /*
311 * CPUs seeing atomic_inc_return() must see prior RCU read-side 319 * CPUs seeing atomic_add_return() must see prior RCU read-side
312 * critical sections, and we also must force ordering with the 320 * critical sections, and we also must force ordering with the
313 * next idle sojourn. 321 * next idle sojourn.
314 */ 322 */
315 special = atomic_inc_return(&rdtp->dynticks); 323 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
316 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); 324 /* Better be in an extended quiescent state! */
325 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
326 (seq & RCU_DYNTICK_CTRL_CTR));
327 /* Better not have special action (TLB flush) pending! */
328 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
329 (seq & RCU_DYNTICK_CTRL_MASK));
317} 330}
318 331
319/* 332/*
@@ -323,15 +336,22 @@ static void rcu_dynticks_eqs_enter(void)
323static void rcu_dynticks_eqs_exit(void) 336static void rcu_dynticks_eqs_exit(void)
324{ 337{
325 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 338 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
326 int special; 339 int seq;
327 340
328 /* 341 /*
329 * CPUs seeing atomic_inc_return() must see prior idle sojourns, 342 * CPUs seeing atomic_add_return() must see prior idle sojourns,
330 * and we also must force ordering with the next RCU read-side 343 * and we also must force ordering with the next RCU read-side
331 * critical section. 344 * critical section.
332 */ 345 */
333 special = atomic_inc_return(&rdtp->dynticks); 346 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
334 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); 347 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
348 !(seq & RCU_DYNTICK_CTRL_CTR));
349 if (seq & RCU_DYNTICK_CTRL_MASK) {
350 atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks);
351 smp_mb__after_atomic(); /* _exit after clearing mask. */
352 /* Prefer duplicate flushes to losing a flush. */
353 rcu_eqs_special_exit();
354 }
335} 355}
336 356
337/* 357/*
@@ -348,9 +368,9 @@ static void rcu_dynticks_eqs_online(void)
348{ 368{
349 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 369 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
350 370
351 if (atomic_read(&rdtp->dynticks) & 0x1) 371 if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR)
352 return; 372 return;
353 atomic_add(0x1, &rdtp->dynticks); 373 atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks);
354} 374}
355 375
356/* 376/*
@@ -362,7 +382,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
362{ 382{
363 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 383 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
364 384
365 return !(atomic_read(&rdtp->dynticks) & 0x1); 385 return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR);
366} 386}
367 387
368/* 388/*
@@ -373,7 +393,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
373{ 393{
374 int snap = atomic_add_return(0, &rdtp->dynticks); 394 int snap = atomic_add_return(0, &rdtp->dynticks);
375 395
376 return snap; 396 return snap & ~RCU_DYNTICK_CTRL_MASK;
377} 397}
378 398
379/* 399/*
@@ -382,7 +402,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
382 */ 402 */
383static bool rcu_dynticks_in_eqs(int snap) 403static bool rcu_dynticks_in_eqs(int snap)
384{ 404{
385 return !(snap & 0x1); 405 return !(snap & RCU_DYNTICK_CTRL_CTR);
386} 406}
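Masking the snapshot with ~RCU_DYNTICK_CTRL_MASK means that a later special-action request alone cannot make an idle-looking counter appear to have changed; only real EQS transitions move the bits that grace-period detection compares. A minimal sketch of those comparisons, again assuming the 0x1/0x2 encoding and using made-up helper names:

#include <stdbool.h>
#include <stdio.h>

#define CTRL_MASK 0x1   /* assumed encoding, as above */
#define CTRL_CTR  0x2

static int  snap_of(int dynticks) { return dynticks & ~CTRL_MASK; }
static bool in_eqs(int snap)      { return !(snap & CTRL_CTR); }
static bool counter_moved(int dynticks, int snap) { return snap_of(dynticks) != snap; }

int main(void)
{
	int snap = snap_of(0x6);   /* snapshot of a busy CPU: CTR bit set, flag clear */

	printf("in EQS at snapshot time? %d\n", in_eqs(snap));               /* 0 */
	printf("QS after flag-only change? %d\n", counter_moved(0x7, snap)); /* 0: 0x7 masks back to 0x6 */
	printf("QS after real EQS entry?  %d\n", counter_moved(0x8, snap));  /* 1: the counter advanced */
	return 0;
}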
387 407
388/* 408/*
@@ -402,14 +422,34 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap)
402static void rcu_dynticks_momentary_idle(void) 422static void rcu_dynticks_momentary_idle(void)
403{ 423{
404 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 424 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
405 int special = atomic_add_return(2, &rdtp->dynticks); 425 int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
426 &rdtp->dynticks);
406 427
407 /* It is illegal to call this from idle state. */ 428 /* It is illegal to call this from idle state. */
408 WARN_ON_ONCE(!(special & 0x1)); 429 WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
409} 430}
410 431
411DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); 432/*
412EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); 433 * Set the special (bottom) bit of the specified CPU so that it
434 * will take special action (such as flushing its TLB) on the
435 * next exit from an extended quiescent state. Returns true if
436 * the bit was successfully set, or false if the CPU was not in
437 * an extended quiescent state.
438 */
439bool rcu_eqs_special_set(int cpu)
440{
441 int old;
442 int new;
443 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
444
445 do {
446 old = atomic_read(&rdtp->dynticks);
447 if (old & RCU_DYNTICK_CTRL_CTR)
448 return false;
449 new = old | RCU_DYNTICK_CTRL_MASK;
450 } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old);
451 return true;
452}
413 453
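rcu_eqs_special_set() and the mask-clearing path added to rcu_dynticks_eqs_exit() form a small request/acknowledge protocol: a requester may set the flag only while the target CPU is in an extended quiescent state, and the CPU clears the flag and performs the deferred work (for example a TLB flush) at its next EQS exit, preferring a duplicate flush to a lost one. A hedged user-space sketch of that protocol follows; the encoding constants and do_deferred_work() are placeholders, not kernel interfaces.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define CTRL_MASK 0x1
#define CTRL_CTR  0x2

static atomic_int dynticks;                  /* CTR bit clear: the CPU is idle */

static void do_deferred_work(void)           /* placeholder for, e.g., a TLB flush */
{
	printf("deferred work executed\n");
}

/* Request deferred work; succeeds only while the target is in an EQS. */
static bool eqs_special_set(void)
{
	int old = atomic_load(&dynticks);
	int new;

	do {
		if (old & CTRL_CTR)
			return false;    /* CPU is busy: the caller must use another method */
		new = old | CTRL_MASK;
	} while (!atomic_compare_exchange_weak(&dynticks, &old, new));
	return true;
}

/* EQS exit: leave idle, then honor any pending request exactly here. */
static void eqs_exit(void)
{
	int seq = atomic_fetch_add(&dynticks, CTRL_CTR) + CTRL_CTR;

	if (seq & CTRL_MASK) {
		atomic_fetch_and(&dynticks, ~CTRL_MASK);  /* acknowledge the request */
		do_deferred_work();                       /* duplicates preferred to losses */
	}
}

int main(void)
{
	printf("request accepted: %d\n", eqs_special_set());
	eqs_exit();
	printf("request accepted: %d\n", eqs_special_set());  /* 0: the CPU is no longer idle */
	return 0;
}

The kernel additionally orders the flag clear before the deferred work with smp_mb__after_atomic(); the sketch's default sequentially consistent atomics provide at least that ordering.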
414/* 454/*
415 * Let the RCU core know that this CPU has gone through the scheduler, 455 * Let the RCU core know that this CPU has gone through the scheduler,
@@ -418,44 +458,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
418 * memory barriers to let the RCU core know about it, regardless of what 458 * memory barriers to let the RCU core know about it, regardless of what
419 * this CPU might (or might not) do in the near future. 459 * this CPU might (or might not) do in the near future.
420 * 460 *
421 * We inform the RCU core by emulating a zero-duration dyntick-idle 461 * We inform the RCU core by emulating a zero-duration dyntick-idle period.
422 * period, which we in turn do by incrementing the ->dynticks counter
423 * by two.
424 * 462 *
425 * The caller must have disabled interrupts. 463 * The caller must have disabled interrupts.
426 */ 464 */
427static void rcu_momentary_dyntick_idle(void) 465static void rcu_momentary_dyntick_idle(void)
428{ 466{
429 struct rcu_data *rdp; 467 raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false);
430 int resched_mask; 468 rcu_dynticks_momentary_idle();
431 struct rcu_state *rsp;
432
433 /*
434 * Yes, we can lose flag-setting operations. This is OK, because
435 * the flag will be set again after some delay.
436 */
437 resched_mask = raw_cpu_read(rcu_sched_qs_mask);
438 raw_cpu_write(rcu_sched_qs_mask, 0);
439
440 /* Find the flavor that needs a quiescent state. */
441 for_each_rcu_flavor(rsp) {
442 rdp = raw_cpu_ptr(rsp->rda);
443 if (!(resched_mask & rsp->flavor_mask))
444 continue;
445 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
446 if (READ_ONCE(rdp->mynode->completed) !=
447 READ_ONCE(rdp->cond_resched_completed))
448 continue;
449
450 /*
451 * Pretend to be momentarily idle for the quiescent state.
452 * This allows the grace-period kthread to record the
453 * quiescent state, with no need for this CPU to do anything
454 * further.
455 */
456 rcu_dynticks_momentary_idle();
457 break;
458 }
459} 469}
460 470
461/* 471/*
@@ -463,14 +473,22 @@ static void rcu_momentary_dyntick_idle(void)
463 * and requires special handling for preemptible RCU. 473 * and requires special handling for preemptible RCU.
464 * The caller must have disabled interrupts. 474 * The caller must have disabled interrupts.
465 */ 475 */
466void rcu_note_context_switch(void) 476void rcu_note_context_switch(bool preempt)
467{ 477{
468 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 478 barrier(); /* Avoid RCU read-side critical sections leaking down. */
469 trace_rcu_utilization(TPS("Start context switch")); 479 trace_rcu_utilization(TPS("Start context switch"));
470 rcu_sched_qs(); 480 rcu_sched_qs();
471 rcu_preempt_note_context_switch(); 481 rcu_preempt_note_context_switch();
472 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 482 /* Load rcu_urgent_qs before other flags. */
483 if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
484 goto out;
485 this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
486 if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs)))
473 rcu_momentary_dyntick_idle(); 487 rcu_momentary_dyntick_idle();
488 this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
489 if (!preempt)
490 rcu_note_voluntary_context_switch_lite(current);
491out:
474 trace_rcu_utilization(TPS("End context switch")); 492 trace_rcu_utilization(TPS("End context switch"));
475 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 493 barrier(); /* Avoid RCU read-side critical sections leaking up. */
476} 494}
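The new fast path relies on two per-CPU hints that the series moves into rcu_dynticks: rcu_urgent_qs, a cheap flag polled at every context switch, and rcu_need_heavy_qs, which asks for the expensive momentary dyntick-idle emulation. The requester publishes the heavy flag before store-releasing the urgent flag, and the consumer load-acquires the urgent flag before reading anything else, so observing urgent implies observing heavy. The single-threaded C11 sketch below illustrates only that ordering contract, with simplified names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool urgent_qs;      /* cheap hint: "please report a QS soon"  */
static atomic_bool need_heavy_qs;  /* expensive hint: "emulate dyntick idle" */

/* Force-quiescent-state side: publish the heavy request before the urgent flag. */
static void request_heavy_qs(void)
{
	atomic_store_explicit(&need_heavy_qs, true, memory_order_relaxed);
	atomic_store_explicit(&urgent_qs, true, memory_order_release);
}

/* Context-switch side: acquire-load the urgent flag, then look at the rest. */
static void note_context_switch(void)
{
	if (!atomic_load_explicit(&urgent_qs, memory_order_acquire))
		return;                           /* common case: nothing to do */
	atomic_store_explicit(&urgent_qs, false, memory_order_relaxed);
	if (atomic_load_explicit(&need_heavy_qs, memory_order_relaxed))
		printf("emulating a momentary dyntick-idle period\n");
}

int main(void)
{
	note_context_switch();   /* fast path: no output */
	request_heavy_qs();
	note_context_switch();   /* slow path: the heavy QS is reported */
	return 0;
}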
@@ -493,29 +511,26 @@ void rcu_all_qs(void)
493{ 511{
494 unsigned long flags; 512 unsigned long flags;
495 513
514 if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs))
515 return;
516 preempt_disable();
517 /* Load rcu_urgent_qs before other flags. */
518 if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) {
519 preempt_enable();
520 return;
521 }
522 this_cpu_write(rcu_dynticks.rcu_urgent_qs, false);
496 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 523 barrier(); /* Avoid RCU read-side critical sections leaking down. */
497 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { 524 if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) {
498 local_irq_save(flags); 525 local_irq_save(flags);
499 rcu_momentary_dyntick_idle(); 526 rcu_momentary_dyntick_idle();
500 local_irq_restore(flags); 527 local_irq_restore(flags);
501 } 528 }
502 if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { 529 if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)))
503 /*
504 * Yes, we just checked a per-CPU variable with preemption
505 * enabled, so we might be migrated to some other CPU at
506 * this point. That is OK because in that case, the
507 * migration will supply the needed quiescent state.
508 * We might end up needlessly disabling preemption and
509 * invoking rcu_sched_qs() on the destination CPU, but
510 * the probability and cost are both quite low, so this
511 * should not be a problem in practice.
512 */
513 preempt_disable();
514 rcu_sched_qs(); 530 rcu_sched_qs();
515 preempt_enable(); 531 this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
516 }
517 this_cpu_inc(rcu_qs_ctr);
518 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 532 barrier(); /* Avoid RCU read-side critical sections leaking up. */
533 preempt_enable();
519} 534}
520EXPORT_SYMBOL_GPL(rcu_all_qs); 535EXPORT_SYMBOL_GPL(rcu_all_qs);
521 536
@@ -704,15 +719,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
704 default: 719 default:
705 break; 720 break;
706 } 721 }
707 if (rsp != NULL) { 722 if (rsp == NULL)
708 *flags = READ_ONCE(rsp->gp_flags);
709 *gpnum = READ_ONCE(rsp->gpnum);
710 *completed = READ_ONCE(rsp->completed);
711 return; 723 return;
712 } 724 *flags = READ_ONCE(rsp->gp_flags);
713 *flags = 0; 725 *gpnum = READ_ONCE(rsp->gpnum);
714 *gpnum = 0; 726 *completed = READ_ONCE(rsp->completed);
715 *completed = 0;
716} 727}
717EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); 728EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
718 729
@@ -728,16 +739,6 @@ void rcutorture_record_progress(unsigned long vernum)
728EXPORT_SYMBOL_GPL(rcutorture_record_progress); 739EXPORT_SYMBOL_GPL(rcutorture_record_progress);
729 740
730/* 741/*
731 * Does the CPU have callbacks ready to be invoked?
732 */
733static int
734cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
735{
736 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
737 rdp->nxttail[RCU_NEXT_TAIL] != NULL;
738}
739
740/*
741 * Return the root node of the specified rcu_state structure. 742 * Return the root node of the specified rcu_state structure.
742 */ 743 */
743static struct rcu_node *rcu_get_root(struct rcu_state *rsp) 744static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
@@ -767,21 +768,17 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
767static bool 768static bool
768cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 769cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
769{ 770{
770 int i;
771
772 if (rcu_gp_in_progress(rsp)) 771 if (rcu_gp_in_progress(rsp))
773 return false; /* No, a grace period is already in progress. */ 772 return false; /* No, a grace period is already in progress. */
774 if (rcu_future_needs_gp(rsp)) 773 if (rcu_future_needs_gp(rsp))
775 return true; /* Yes, a no-CBs CPU needs one. */ 774 return true; /* Yes, a no-CBs CPU needs one. */
776 if (!rdp->nxttail[RCU_NEXT_TAIL]) 775 if (!rcu_segcblist_is_enabled(&rdp->cblist))
777 return false; /* No, this is a no-CBs (or offline) CPU. */ 776 return false; /* No, this is a no-CBs (or offline) CPU. */
778 if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) 777 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
779 return true; /* Yes, CPU has newly registered callbacks. */ 778 return true; /* Yes, CPU has newly registered callbacks. */
780 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) 779 if (rcu_segcblist_future_gp_needed(&rdp->cblist,
781 if (rdp->nxttail[i - 1] != rdp->nxttail[i] && 780 READ_ONCE(rsp->completed)))
782 ULONG_CMP_LT(READ_ONCE(rsp->completed), 781 return true; /* Yes, CBs for future grace period. */
783 rdp->nxtcompleted[i]))
784 return true; /* Yes, CBs for future grace period. */
785 return false; /* No grace period needed. */ 782 return false; /* No grace period needed. */
786} 783}
787 784
@@ -1162,6 +1159,24 @@ bool notrace rcu_is_watching(void)
1162} 1159}
1163EXPORT_SYMBOL_GPL(rcu_is_watching); 1160EXPORT_SYMBOL_GPL(rcu_is_watching);
1164 1161
1162/*
1163 * If a holdout task is actually running, request an urgent quiescent
1164 * state from its CPU. This is unsynchronized, so migrations can cause
1165 * the request to go to the wrong CPU. Which is OK, all that will happen
1166 * is that the CPU's next context switch will be a bit slower and next
1167 * time around this task will generate another request.
1168 */
1169void rcu_request_urgent_qs_task(struct task_struct *t)
1170{
1171 int cpu;
1172
1173 barrier();
1174 cpu = task_cpu(t);
1175 if (!task_curr(t))
1176 return; /* This task is not running on that CPU. */
1177 smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true);
1178}
1179
1165#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 1180#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
1166 1181
1167/* 1182/*
@@ -1247,7 +1262,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1247 bool *isidle, unsigned long *maxj) 1262 bool *isidle, unsigned long *maxj)
1248{ 1263{
1249 unsigned long jtsq; 1264 unsigned long jtsq;
1250 int *rcrmp; 1265 bool *rnhqp;
1266 bool *ruqp;
1251 unsigned long rjtsc; 1267 unsigned long rjtsc;
1252 struct rcu_node *rnp; 1268 struct rcu_node *rnp;
1253 1269
@@ -1283,11 +1299,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1283 * might not be the case for nohz_full CPUs looping in the kernel. 1299 * might not be the case for nohz_full CPUs looping in the kernel.
1284 */ 1300 */
1285 rnp = rdp->mynode; 1301 rnp = rdp->mynode;
1302 ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
1286 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && 1303 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
1287 READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && 1304 READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
1288 READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { 1305 READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
1289 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); 1306 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
1290 return 1; 1307 return 1;
1308 } else {
1309 /* Load rcu_qs_ctr before store to rcu_urgent_qs. */
1310 smp_store_release(ruqp, true);
1291 } 1311 }
1292 1312
1293 /* Check for the CPU being offline. */ 1313 /* Check for the CPU being offline. */
@@ -1304,7 +1324,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1304 * in-kernel CPU-bound tasks cannot advance grace periods. 1324 * in-kernel CPU-bound tasks cannot advance grace periods.
1305 * So if the grace period is old enough, make the CPU pay attention. 1325 * So if the grace period is old enough, make the CPU pay attention.
1306 * Note that the unsynchronized assignments to the per-CPU 1326 * Note that the unsynchronized assignments to the per-CPU
1307 * rcu_sched_qs_mask variable are safe. Yes, setting of 1327 * rcu_need_heavy_qs variable are safe. Yes, setting of
1308 * bits can be lost, but they will be set again on the next 1328 * bits can be lost, but they will be set again on the next
1309 * force-quiescent-state pass. So lost bit sets do not result 1329 * force-quiescent-state pass. So lost bit sets do not result
1310 * in incorrect behavior, merely in a grace period lasting 1330 * in incorrect behavior, merely in a grace period lasting
@@ -1318,16 +1338,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
1318 * is set too high, we override with half of the RCU CPU stall 1338 * is set too high, we override with half of the RCU CPU stall
1319 * warning delay. 1339 * warning delay.
1320 */ 1340 */
1321 rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); 1341 rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
1322 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || 1342 if (!READ_ONCE(*rnhqp) &&
1323 time_after(jiffies, rdp->rsp->jiffies_resched)) { 1343 (time_after(jiffies, rdp->rsp->gp_start + jtsq) ||
1324 if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { 1344 time_after(jiffies, rdp->rsp->jiffies_resched))) {
1325 WRITE_ONCE(rdp->cond_resched_completed, 1345 WRITE_ONCE(*rnhqp, true);
1326 READ_ONCE(rdp->mynode->completed)); 1346 /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
1327 smp_mb(); /* ->cond_resched_completed before *rcrmp. */ 1347 smp_store_release(ruqp, true);
1328 WRITE_ONCE(*rcrmp,
1329 READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
1330 }
1331 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ 1348 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
1332 } 1349 }
1333 1350
@@ -1487,7 +1504,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1487 1504
1488 print_cpu_stall_info_end(); 1505 print_cpu_stall_info_end();
1489 for_each_possible_cpu(cpu) 1506 for_each_possible_cpu(cpu)
1490 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1507 totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
1508 cpu)->cblist);
1491 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", 1509 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
1492 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1510 smp_processor_id(), (long)(jiffies - rsp->gp_start),
1493 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1511 (long)rsp->gpnum, (long)rsp->completed, totqlen);
@@ -1541,7 +1559,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
1541 print_cpu_stall_info(rsp, smp_processor_id()); 1559 print_cpu_stall_info(rsp, smp_processor_id());
1542 print_cpu_stall_info_end(); 1560 print_cpu_stall_info_end();
1543 for_each_possible_cpu(cpu) 1561 for_each_possible_cpu(cpu)
1544 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1562 totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
1563 cpu)->cblist);
1545 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1564 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
1546 jiffies - rsp->gp_start, 1565 jiffies - rsp->gp_start,
1547 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1566 (long)rsp->gpnum, (long)rsp->completed, totqlen);
@@ -1644,30 +1663,6 @@ void rcu_cpu_stall_reset(void)
1644} 1663}
1645 1664
1646/* 1665/*
1647 * Initialize the specified rcu_data structure's default callback list
1648 * to empty. The default callback list is the one that is not used by
1649 * no-callbacks CPUs.
1650 */
1651static void init_default_callback_list(struct rcu_data *rdp)
1652{
1653 int i;
1654
1655 rdp->nxtlist = NULL;
1656 for (i = 0; i < RCU_NEXT_SIZE; i++)
1657 rdp->nxttail[i] = &rdp->nxtlist;
1658}
1659
1660/*
1661 * Initialize the specified rcu_data structure's callback list to empty.
1662 */
1663static void init_callback_list(struct rcu_data *rdp)
1664{
1665 if (init_nocb_callback_list(rdp))
1666 return;
1667 init_default_callback_list(rdp);
1668}
1669
1670/*
1671 * Determine the value that ->completed will have at the end of the 1666 * Determine the value that ->completed will have at the end of the
1672 * next subsequent grace period. This is used to tag callbacks so that 1667 * next subsequent grace period. This is used to tag callbacks so that
1673 * a CPU can invoke callbacks in a timely fashion even if that CPU has 1668 * a CPU can invoke callbacks in a timely fashion even if that CPU has
@@ -1721,7 +1716,6 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1721 unsigned long *c_out) 1716 unsigned long *c_out)
1722{ 1717{
1723 unsigned long c; 1718 unsigned long c;
1724 int i;
1725 bool ret = false; 1719 bool ret = false;
1726 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 1720 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1727 1721
@@ -1767,13 +1761,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1767 /* 1761 /*
1768 * Get a new grace-period number. If there really is no grace 1762 * Get a new grace-period number. If there really is no grace
1769 * period in progress, it will be smaller than the one we obtained 1763 * period in progress, it will be smaller than the one we obtained
1770 * earlier. Adjust callbacks as needed. Note that even no-CBs 1764 * earlier. Adjust callbacks as needed.
1771 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
1772 */ 1765 */
1773 c = rcu_cbs_completed(rdp->rsp, rnp_root); 1766 c = rcu_cbs_completed(rdp->rsp, rnp_root);
1774 for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) 1767 if (!rcu_is_nocb_cpu(rdp->cpu))
1775 if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) 1768 (void)rcu_segcblist_accelerate(&rdp->cblist, c);
1776 rdp->nxtcompleted[i] = c;
1777 1769
1778 /* 1770 /*
 1779 * If the need for the required grace period is already
@@ -1805,9 +1797,7 @@ out:
1805 1797
1806/* 1798/*
1807 * Clean up any old requests for the just-ended grace period. Also return 1799 * Clean up any old requests for the just-ended grace period. Also return
1808 * whether any additional grace periods have been requested. Also invoke 1800 * whether any additional grace periods have been requested.
1809 * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
1810 * waiting for this grace period to complete.
1811 */ 1801 */
1812static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 1802static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1813{ 1803{
@@ -1853,57 +1843,27 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
1853static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1843static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1854 struct rcu_data *rdp) 1844 struct rcu_data *rdp)
1855{ 1845{
1856 unsigned long c; 1846 bool ret = false;
1857 int i;
1858 bool ret;
1859
1860 /* If the CPU has no callbacks, nothing to do. */
1861 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1862 return false;
1863
1864 /*
1865 * Starting from the sublist containing the callbacks most
1866 * recently assigned a ->completed number and working down, find the
1867 * first sublist that is not assignable to an upcoming grace period.
1868 * Such a sublist has something in it (first two tests) and has
1869 * a ->completed number assigned that will complete sooner than
1870 * the ->completed number for newly arrived callbacks (last test).
1871 *
1872 * The key point is that any later sublist can be assigned the
1873 * same ->completed number as the newly arrived callbacks, which
1874 * means that the callbacks in any of these later sublist can be
1875 * grouped into a single sublist, whether or not they have already
1876 * been assigned a ->completed number.
1877 */
1878 c = rcu_cbs_completed(rsp, rnp);
1879 for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
1880 if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
1881 !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
1882 break;
1883 1847
1884 /* 1848 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1885 * If there are no sublist for unassigned callbacks, leave. 1849 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1886 * At the same time, advance "i" one sublist, so that "i" will
1887 * index into the sublist where all the remaining callbacks should
1888 * be grouped into.
1889 */
1890 if (++i >= RCU_NEXT_TAIL)
1891 return false; 1850 return false;
1892 1851
1893 /* 1852 /*
1894 * Assign all subsequent callbacks' ->completed number to the next 1853 * Callbacks are often registered with incomplete grace-period
1895 * full grace period and group them all in the sublist initially 1854 * information. Something about the fact that getting exact
1896 * indexed by "i". 1855 * information requires acquiring a global lock... RCU therefore
1856 * makes a conservative estimate of the grace period number at which
1857 * a given callback will become ready to invoke. The following
1858 * code checks this estimate and improves it when possible, thus
1859 * accelerating callback invocation to an earlier grace-period
1860 * number.
1897 */ 1861 */
1898 for (; i <= RCU_NEXT_TAIL; i++) { 1862 if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp)))
1899 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; 1863 ret = rcu_start_future_gp(rnp, rdp, NULL);
1900 rdp->nxtcompleted[i] = c;
1901 }
1902 /* Record any needed additional grace periods. */
1903 ret = rcu_start_future_gp(rnp, rdp, NULL);
1904 1864
1905 /* Trace depending on how much we were able to accelerate. */ 1865 /* Trace depending on how much we were able to accelerate. */
1906 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1866 if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
1907 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); 1867 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1908 else 1868 else
1909 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); 1869 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
@@ -1923,32 +1883,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1923static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1883static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1924 struct rcu_data *rdp) 1884 struct rcu_data *rdp)
1925{ 1885{
1926 int i, j; 1886 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
1927 1887 if (!rcu_segcblist_pend_cbs(&rdp->cblist))
1928 /* If the CPU has no callbacks, nothing to do. */
1929 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1930 return false; 1888 return false;
1931 1889
1932 /* 1890 /*
1933 * Find all callbacks whose ->completed numbers indicate that they 1891 * Find all callbacks whose ->completed numbers indicate that they
1934 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. 1892 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1935 */ 1893 */
1936 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { 1894 rcu_segcblist_advance(&rdp->cblist, rnp->completed);
1937 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
1938 break;
1939 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
1940 }
1941 /* Clean up any sublist tail pointers that were misordered above. */
1942 for (j = RCU_WAIT_TAIL; j < i; j++)
1943 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
1944
1945 /* Copy down callbacks to fill in empty sublists. */
1946 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
1947 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
1948 break;
1949 rdp->nxttail[j] = rdp->nxttail[i];
1950 rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
1951 }
1952 1895
1953 /* Classify any remaining callbacks. */ 1896 /* Classify any remaining callbacks. */
1954 return rcu_accelerate_cbs(rsp, rnp, rdp); 1897 return rcu_accelerate_cbs(rsp, rnp, rdp);
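rcu_segcblist_accelerate() and rcu_segcblist_advance() replace the open-coded ->nxttail[]/->nxtcompleted[] bookkeeping: accelerate tags callbacks that lack a grace-period number with the current conservative estimate, and advance moves callbacks whose number has completed toward invocation. The toy model below captures just that tag-and-advance idea on a flat list with hypothetical names; the real segmented list additionally keeps the segments contiguous and merges their tags.

#include <stdio.h>

struct toy_cb {
	struct toy_cb *next;
	unsigned long gp;   /* grace period after which this callback may run; 0 = unassigned */
};

struct toy_list {
	struct toy_cb *head, **tail;
};

static void toy_enqueue(struct toy_list *l, struct toy_cb *cb)
{
	cb->next = NULL;
	cb->gp = 0;
	*l->tail = cb;
	l->tail = &cb->next;
}

/* "Accelerate": tag every not-yet-assigned callback with grace period @c. */
static void toy_accelerate(struct toy_list *l, unsigned long c)
{
	for (struct toy_cb *cb = l->head; cb; cb = cb->next)
		if (!cb->gp)
			cb->gp = c;
}

/* "Advance": count the callbacks whose tagged grace period has completed. */
static int toy_advance(struct toy_list *l, unsigned long completed)
{
	int ready = 0;

	for (struct toy_cb *cb = l->head; cb; cb = cb->next)
		if (cb->gp && cb->gp <= completed)
			ready++;    /* the kernel would splice these onto its done segment */
	return ready;
}

int main(void)
{
	struct toy_list l;
	struct toy_cb cbs[3];

	l.head = NULL;
	l.tail = &l.head;
	toy_enqueue(&l, &cbs[0]);
	toy_enqueue(&l, &cbs[1]);
	toy_accelerate(&l, 42);      /* both callbacks now wait for GP 42 */
	toy_enqueue(&l, &cbs[2]);    /* arrives later, not yet assigned a GP */
	printf("ready after GP 41: %d\n", toy_advance(&l, 41));   /* 0 */
	printf("ready after GP 42: %d\n", toy_advance(&l, 42));   /* 2 */
	return 0;
}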
@@ -1993,7 +1936,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1993 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1936 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1994 need_gp = !!(rnp->qsmask & rdp->grpmask); 1937 need_gp = !!(rnp->qsmask & rdp->grpmask);
1995 rdp->cpu_no_qs.b.norm = need_gp; 1938 rdp->cpu_no_qs.b.norm = need_gp;
1996 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 1939 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
1997 rdp->core_needs_qs = need_gp; 1940 rdp->core_needs_qs = need_gp;
1998 zero_cpu_stall_ticks(rdp); 1941 zero_cpu_stall_ticks(rdp);
1999 WRITE_ONCE(rdp->gpwrap, false); 1942 WRITE_ONCE(rdp->gpwrap, false);
@@ -2591,7 +2534,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2591 * within the current grace period. 2534 * within the current grace period.
2592 */ 2535 */
2593 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ 2536 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
2594 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 2537 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr);
2595 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2538 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2596 return; 2539 return;
2597 } 2540 }
@@ -2665,13 +2608,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2665 * because _rcu_barrier() excludes CPU-hotplug operations, so it 2608 * because _rcu_barrier() excludes CPU-hotplug operations, so it
2666 * cannot be running now. Thus no memory barrier is required. 2609 * cannot be running now. Thus no memory barrier is required.
2667 */ 2610 */
2668 if (rdp->nxtlist != NULL) { 2611 rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
2669 rsp->qlen_lazy += rdp->qlen_lazy; 2612 rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
2670 rsp->qlen += rdp->qlen;
2671 rdp->n_cbs_orphaned += rdp->qlen;
2672 rdp->qlen_lazy = 0;
2673 WRITE_ONCE(rdp->qlen, 0);
2674 }
2675 2613
2676 /* 2614 /*
2677 * Next, move those callbacks still needing a grace period to 2615 * Next, move those callbacks still needing a grace period to
@@ -2679,31 +2617,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2679 * Some of the callbacks might have gone partway through a grace 2617 * Some of the callbacks might have gone partway through a grace
2680 * period, but that is too bad. They get to start over because we 2618 * period, but that is too bad. They get to start over because we
2681 * cannot assume that grace periods are synchronized across CPUs. 2619 * cannot assume that grace periods are synchronized across CPUs.
2682 * We don't bother updating the ->nxttail[] array yet, instead
2683 * we just reset the whole thing later on.
2684 */ 2620 */
2685 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { 2621 rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
2686 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
2687 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
2688 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
2689 }
2690 2622
2691 /* 2623 /*
2692 * Then move the ready-to-invoke callbacks to the orphanage, 2624 * Then move the ready-to-invoke callbacks to the orphanage,
2693 * where some other CPU will pick them up. These will not be 2625 * where some other CPU will pick them up. These will not be
 2694 * required to pass through another grace period: They are done. 2626 * required to pass through another grace period: They are done.
2695 */ 2627 */
2696 if (rdp->nxtlist != NULL) { 2628 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
2697 *rsp->orphan_donetail = rdp->nxtlist;
2698 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
2699 }
2700 2629
2701 /* 2630 /* Finally, disallow further callbacks on this CPU. */
2702 * Finally, initialize the rcu_data structure's list to empty and 2631 rcu_segcblist_disable(&rdp->cblist);
2703 * disallow further callbacks on this CPU.
2704 */
2705 init_callback_list(rdp);
2706 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2707} 2632}
2708 2633
2709/* 2634/*
@@ -2712,7 +2637,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2712 */ 2637 */
2713static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) 2638static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2714{ 2639{
2715 int i;
2716 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 2640 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2717 2641
2718 /* No-CBs CPUs are handled specially. */ 2642 /* No-CBs CPUs are handled specially. */
@@ -2721,13 +2645,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2721 return; 2645 return;
2722 2646
2723 /* Do the accounting first. */ 2647 /* Do the accounting first. */
2724 rdp->qlen_lazy += rsp->qlen_lazy; 2648 rdp->n_cbs_adopted += rsp->orphan_done.len;
2725 rdp->qlen += rsp->qlen; 2649 if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
2726 rdp->n_cbs_adopted += rsp->qlen;
2727 if (rsp->qlen_lazy != rsp->qlen)
2728 rcu_idle_count_callbacks_posted(); 2650 rcu_idle_count_callbacks_posted();
2729 rsp->qlen_lazy = 0; 2651 rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
2730 rsp->qlen = 0;
2731 2652
2732 /* 2653 /*
2733 * We do not need a memory barrier here because the only way we 2654 * We do not need a memory barrier here because the only way we
@@ -2735,24 +2656,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2735 * we are the task doing the rcu_barrier(). 2656 * we are the task doing the rcu_barrier().
2736 */ 2657 */
2737 2658
2738 /* First adopt the ready-to-invoke callbacks. */ 2659 /* First adopt the ready-to-invoke callbacks, then the done ones. */
2739 if (rsp->orphan_donelist != NULL) { 2660 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
2740 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; 2661 WARN_ON_ONCE(rsp->orphan_done.head);
2741 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; 2662 rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
2742 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) 2663 WARN_ON_ONCE(rsp->orphan_pend.head);
2743 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) 2664 WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
2744 rdp->nxttail[i] = rsp->orphan_donetail; 2665 !rcu_segcblist_n_cbs(&rdp->cblist));
2745 rsp->orphan_donelist = NULL;
2746 rsp->orphan_donetail = &rsp->orphan_donelist;
2747 }
2748
2749 /* And then adopt the callbacks that still need a grace period. */
2750 if (rsp->orphan_nxtlist != NULL) {
2751 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
2752 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
2753 rsp->orphan_nxtlist = NULL;
2754 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2755 }
2756} 2666}
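Orphaning and adoption now reduce to list splices: the dying CPU's counts, pending callbacks, and done callbacks are extracted into rsp->orphan_done and rsp->orphan_pend, and a surviving CPU later splices them onto its own list. The sketch below shows the underlying tail-pointer splice on a plain singly linked list; the toy_* names are made up, and the real helpers also migrate the lazy and length accounting.

#include <stdio.h>

struct toy_cb { struct toy_cb *next; int id; };

struct toy_cblist {
	struct toy_cb *head, **tail;
	long len;
};

static void toy_init(struct toy_cblist *l)
{
	l->head = NULL;
	l->tail = &l->head;
	l->len = 0;
}

static void toy_enqueue(struct toy_cblist *l, struct toy_cb *cb)
{
	cb->next = NULL;
	*l->tail = cb;
	l->tail = &cb->next;
	l->len++;
}

/* Splice every callback of @src onto the end of @dst, leaving @src empty. */
static void toy_extract_all(struct toy_cblist *dst, struct toy_cblist *src)
{
	if (!src->head)
		return;
	*dst->tail = src->head;
	dst->tail = src->tail;
	dst->len += src->len;
	toy_init(src);
}

int main(void)
{
	struct toy_cblist dying, orphanage;
	struct toy_cb cbs[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };

	toy_init(&dying);
	toy_init(&orphanage);
	for (int i = 0; i < 3; i++)
		toy_enqueue(&dying, &cbs[i]);

	toy_extract_all(&orphanage, &dying);   /* the send-to-orphanage step */
	/* a surviving CPU's adopt step would splice &orphanage onto its own list */
	for (struct toy_cb *cb = orphanage.head; cb; cb = cb->next)
		printf("orphaned callback %d\n", cb->id);
	printf("dying CPU now holds %ld callback(s)\n", dying.len);
	return 0;
}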
2757 2667
2758/* 2668/*
@@ -2760,14 +2670,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
2760 */ 2670 */
2761static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 2671static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
2762{ 2672{
2763 RCU_TRACE(unsigned long mask); 2673 RCU_TRACE(unsigned long mask;)
2764 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); 2674 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);)
2765 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); 2675 RCU_TRACE(struct rcu_node *rnp = rdp->mynode;)
2766 2676
2767 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2677 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
2768 return; 2678 return;
2769 2679
2770 RCU_TRACE(mask = rdp->grpmask); 2680 RCU_TRACE(mask = rdp->grpmask;)
2771 trace_rcu_grace_period(rsp->name, 2681 trace_rcu_grace_period(rsp->name,
2772 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 2682 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
2773 TPS("cpuofl")); 2683 TPS("cpuofl"));
@@ -2840,9 +2750,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2840 rcu_adopt_orphan_cbs(rsp, flags); 2750 rcu_adopt_orphan_cbs(rsp, flags);
2841 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); 2751 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
2842 2752
2843 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2753 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
2844 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2754 !rcu_segcblist_empty(&rdp->cblist),
2845 cpu, rdp->qlen, rdp->nxtlist); 2755 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
2756 cpu, rcu_segcblist_n_cbs(&rdp->cblist),
2757 rcu_segcblist_first_cb(&rdp->cblist));
2846} 2758}
2847 2759
2848/* 2760/*
@@ -2852,14 +2764,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2852static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) 2764static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2853{ 2765{
2854 unsigned long flags; 2766 unsigned long flags;
2855 struct rcu_head *next, *list, **tail; 2767 struct rcu_head *rhp;
2856 long bl, count, count_lazy; 2768 struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
2857 int i; 2769 long bl, count;
2858 2770
2859 /* If no callbacks are ready, just return. */ 2771 /* If no callbacks are ready, just return. */
2860 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 2772 if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
2861 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); 2773 trace_rcu_batch_start(rsp->name,
2862 trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), 2774 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2775 rcu_segcblist_n_cbs(&rdp->cblist), 0);
2776 trace_rcu_batch_end(rsp->name, 0,
2777 !rcu_segcblist_empty(&rdp->cblist),
2863 need_resched(), is_idle_task(current), 2778 need_resched(), is_idle_task(current),
2864 rcu_is_callbacks_kthread()); 2779 rcu_is_callbacks_kthread());
2865 return; 2780 return;
@@ -2867,73 +2782,61 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2867 2782
2868 /* 2783 /*
2869 * Extract the list of ready callbacks, disabling to prevent 2784 * Extract the list of ready callbacks, disabling to prevent
2870 * races with call_rcu() from interrupt handlers. 2785 * races with call_rcu() from interrupt handlers. Leave the
2786 * callback counts, as rcu_barrier() needs to be conservative.
2871 */ 2787 */
2872 local_irq_save(flags); 2788 local_irq_save(flags);
2873 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 2789 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
2874 bl = rdp->blimit; 2790 bl = rdp->blimit;
2875 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); 2791 trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2876 list = rdp->nxtlist; 2792 rcu_segcblist_n_cbs(&rdp->cblist), bl);
2877 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 2793 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
2878 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
2879 tail = rdp->nxttail[RCU_DONE_TAIL];
2880 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
2881 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
2882 rdp->nxttail[i] = &rdp->nxtlist;
2883 local_irq_restore(flags); 2794 local_irq_restore(flags);
2884 2795
2885 /* Invoke callbacks. */ 2796 /* Invoke callbacks. */
2886 count = count_lazy = 0; 2797 rhp = rcu_cblist_dequeue(&rcl);
2887 while (list) { 2798 for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
2888 next = list->next; 2799 debug_rcu_head_unqueue(rhp);
2889 prefetch(next); 2800 if (__rcu_reclaim(rsp->name, rhp))
2890 debug_rcu_head_unqueue(list); 2801 rcu_cblist_dequeued_lazy(&rcl);
2891 if (__rcu_reclaim(rsp->name, list)) 2802 /*
2892 count_lazy++; 2803 * Stop only if limit reached and CPU has something to do.
2893 list = next; 2804 * Note: The rcl structure counts down from zero.
2894 /* Stop only if limit reached and CPU has something to do. */ 2805 */
2895 if (++count >= bl && 2806 if (-rcl.len >= bl &&
2896 (need_resched() || 2807 (need_resched() ||
2897 (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) 2808 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
2898 break; 2809 break;
2899 } 2810 }
2900 2811
2901 local_irq_save(flags); 2812 local_irq_save(flags);
2902 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), 2813 count = -rcl.len;
2903 is_idle_task(current), 2814 trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(),
2904 rcu_is_callbacks_kthread()); 2815 is_idle_task(current), rcu_is_callbacks_kthread());
2905 2816
2906 /* Update count, and requeue any remaining callbacks. */ 2817 /* Update counts and requeue any remaining callbacks. */
2907 if (list != NULL) { 2818 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
2908 *tail = rdp->nxtlist;
2909 rdp->nxtlist = list;
2910 for (i = 0; i < RCU_NEXT_SIZE; i++)
2911 if (&rdp->nxtlist == rdp->nxttail[i])
2912 rdp->nxttail[i] = tail;
2913 else
2914 break;
2915 }
2916 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2819 smp_mb(); /* List handling before counting for rcu_barrier(). */
2917 rdp->qlen_lazy -= count_lazy;
2918 WRITE_ONCE(rdp->qlen, rdp->qlen - count);
2919 rdp->n_cbs_invoked += count; 2820 rdp->n_cbs_invoked += count;
2821 rcu_segcblist_insert_count(&rdp->cblist, &rcl);
2920 2822
2921 /* Reinstate batch limit if we have worked down the excess. */ 2823 /* Reinstate batch limit if we have worked down the excess. */
2922 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 2824 count = rcu_segcblist_n_cbs(&rdp->cblist);
2825 if (rdp->blimit == LONG_MAX && count <= qlowmark)
2923 rdp->blimit = blimit; 2826 rdp->blimit = blimit;
2924 2827
2925 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ 2828 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
2926 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { 2829 if (count == 0 && rdp->qlen_last_fqs_check != 0) {
2927 rdp->qlen_last_fqs_check = 0; 2830 rdp->qlen_last_fqs_check = 0;
2928 rdp->n_force_qs_snap = rsp->n_force_qs; 2831 rdp->n_force_qs_snap = rsp->n_force_qs;
2929 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 2832 } else if (count < rdp->qlen_last_fqs_check - qhimark)
2930 rdp->qlen_last_fqs_check = rdp->qlen; 2833 rdp->qlen_last_fqs_check = count;
2931 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); 2834 WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
2932 2835
2933 local_irq_restore(flags); 2836 local_irq_restore(flags);
2934 2837
2935 /* Re-invoke RCU core processing if there are callbacks remaining. */ 2838 /* Re-invoke RCU core processing if there are callbacks remaining. */
2936 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2839 if (rcu_segcblist_ready_cbs(&rdp->cblist))
2937 invoke_rcu_core(); 2840 invoke_rcu_core();
2938} 2841}
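rcu_do_batch() now drains ready callbacks through a local rcu_cblist whose ->len starts at zero and is decremented per dequeue, so -rcl.len is the number invoked so far, and adding rcl.len back into the per-CPU list afterward subtracts exactly that amount. Leaving the main count untouched while callbacks run is what keeps rcu_barrier()'s view conservative. A small sketch of the count-down-from-zero trick, with hypothetical names:

#include <stdio.h>

struct toy_cb { struct toy_cb *next; };

struct toy_cblist {
	struct toy_cb *head, **tail;
	long len;
};

/* Dequeue one callback; on an extracted list ->len counts down from zero. */
static struct toy_cb *toy_dequeue(struct toy_cblist *l)
{
	struct toy_cb *cb = l->head;

	if (!cb)
		return NULL;
	l->len--;
	l->head = cb->next;
	if (!l->head)
		l->tail = &l->head;
	return cb;
}

int main(void)
{
	struct toy_cb cbs[3];
	struct toy_cblist rcl;
	long main_count = 3;         /* the per-CPU list still claims all three */
	long bl = 2;                 /* batch limit */

	cbs[0].next = &cbs[1];
	cbs[1].next = &cbs[2];
	cbs[2].next = NULL;
	rcl.head = &cbs[0];
	rcl.tail = &cbs[2].next;
	rcl.len = 0;                 /* counts invoked callbacks as a negative number */

	for (struct toy_cb *cb = toy_dequeue(&rcl); cb; cb = toy_dequeue(&rcl)) {
		/* ...invoke the callback here... */
		if (-rcl.len >= bl)  /* stop once the batch limit is reached */
			break;
	}
	printf("invoked %ld callback(s)\n", -rcl.len);
	/* leftovers on rcl would be requeued; the count is then fixed up in one step: */
	main_count += rcl.len;
	printf("callbacks still outstanding: %ld\n", main_count);
	return 0;
}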
2939 2842
@@ -3099,7 +3002,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
3099 bool needwake; 3002 bool needwake;
3100 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 3003 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
3101 3004
3102 WARN_ON_ONCE(rdp->beenonline == 0); 3005 WARN_ON_ONCE(!rdp->beenonline);
3103 3006
3104 /* Update RCU state based on any recent quiescent states. */ 3007 /* Update RCU state based on any recent quiescent states. */
3105 rcu_check_quiescent_state(rsp, rdp); 3008 rcu_check_quiescent_state(rsp, rdp);
@@ -3117,7 +3020,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
3117 } 3020 }
3118 3021
3119 /* If there are callbacks ready, invoke them. */ 3022 /* If there are callbacks ready, invoke them. */
3120 if (cpu_has_callbacks_ready_to_invoke(rdp)) 3023 if (rcu_segcblist_ready_cbs(&rdp->cblist))
3121 invoke_rcu_callbacks(rsp, rdp); 3024 invoke_rcu_callbacks(rsp, rdp);
3122 3025
3123 /* Do any needed deferred wakeups of rcuo kthreads. */ 3026 /* Do any needed deferred wakeups of rcuo kthreads. */
@@ -3189,7 +3092,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
3189 * invoking force_quiescent_state() if the newly enqueued callback 3092 * invoking force_quiescent_state() if the newly enqueued callback
3190 * is the only one waiting for a grace period to complete. 3093 * is the only one waiting for a grace period to complete.
3191 */ 3094 */
3192 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 3095 if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
3096 rdp->qlen_last_fqs_check + qhimark)) {
3193 3097
3194 /* Are we ignoring a completed grace period? */ 3098 /* Are we ignoring a completed grace period? */
3195 note_gp_changes(rsp, rdp); 3099 note_gp_changes(rsp, rdp);
@@ -3207,10 +3111,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
3207 /* Give the grace period a kick. */ 3111 /* Give the grace period a kick. */
3208 rdp->blimit = LONG_MAX; 3112 rdp->blimit = LONG_MAX;
3209 if (rsp->n_force_qs == rdp->n_force_qs_snap && 3113 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
3210 *rdp->nxttail[RCU_DONE_TAIL] != head) 3114 rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
3211 force_quiescent_state(rsp); 3115 force_quiescent_state(rsp);
3212 rdp->n_force_qs_snap = rsp->n_force_qs; 3116 rdp->n_force_qs_snap = rsp->n_force_qs;
3213 rdp->qlen_last_fqs_check = rdp->qlen; 3117 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
3214 } 3118 }
3215 } 3119 }
3216} 3120}
@@ -3250,7 +3154,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
3250 rdp = this_cpu_ptr(rsp->rda); 3154 rdp = this_cpu_ptr(rsp->rda);
3251 3155
3252 /* Add the callback to our list. */ 3156 /* Add the callback to our list. */
3253 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { 3157 if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
3254 int offline; 3158 int offline;
3255 3159
3256 if (cpu != -1) 3160 if (cpu != -1)
@@ -3269,23 +3173,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
3269 */ 3173 */
3270 BUG_ON(cpu != -1); 3174 BUG_ON(cpu != -1);
3271 WARN_ON_ONCE(!rcu_is_watching()); 3175 WARN_ON_ONCE(!rcu_is_watching());
3272 if (!likely(rdp->nxtlist)) 3176 if (rcu_segcblist_empty(&rdp->cblist))
3273 init_default_callback_list(rdp); 3177 rcu_segcblist_init(&rdp->cblist);
3274 } 3178 }
3275 WRITE_ONCE(rdp->qlen, rdp->qlen + 1); 3179 rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
3276 if (lazy) 3180 if (!lazy)
3277 rdp->qlen_lazy++;
3278 else
3279 rcu_idle_count_callbacks_posted(); 3181 rcu_idle_count_callbacks_posted();
3280 smp_mb(); /* Count before adding callback for rcu_barrier(). */
3281 *rdp->nxttail[RCU_NEXT_TAIL] = head;
3282 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
3283 3182
3284 if (__is_kfree_rcu_offset((unsigned long)func)) 3183 if (__is_kfree_rcu_offset((unsigned long)func))
3285 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 3184 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
3286 rdp->qlen_lazy, rdp->qlen); 3185 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
3186 rcu_segcblist_n_cbs(&rdp->cblist));
3287 else 3187 else
3288 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); 3188 trace_rcu_callback(rsp->name, head,
3189 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
3190 rcu_segcblist_n_cbs(&rdp->cblist));
3289 3191
3290 /* Go handle any RCU core processing required. */ 3192 /* Go handle any RCU core processing required. */
3291 __call_rcu_core(rsp, rdp, head, flags); 3193 __call_rcu_core(rsp, rdp, head, flags);
@@ -3531,41 +3433,6 @@ void cond_synchronize_sched(unsigned long oldstate)
3531} 3433}
3532EXPORT_SYMBOL_GPL(cond_synchronize_sched); 3434EXPORT_SYMBOL_GPL(cond_synchronize_sched);
3533 3435
3534/* Adjust sequence number for start of update-side operation. */
3535static void rcu_seq_start(unsigned long *sp)
3536{
3537 WRITE_ONCE(*sp, *sp + 1);
3538 smp_mb(); /* Ensure update-side operation after counter increment. */
3539 WARN_ON_ONCE(!(*sp & 0x1));
3540}
3541
3542/* Adjust sequence number for end of update-side operation. */
3543static void rcu_seq_end(unsigned long *sp)
3544{
3545 smp_mb(); /* Ensure update-side operation before counter increment. */
3546 WRITE_ONCE(*sp, *sp + 1);
3547 WARN_ON_ONCE(*sp & 0x1);
3548}
3549
3550/* Take a snapshot of the update side's sequence number. */
3551static unsigned long rcu_seq_snap(unsigned long *sp)
3552{
3553 unsigned long s;
3554
3555 s = (READ_ONCE(*sp) + 3) & ~0x1;
3556 smp_mb(); /* Above access must not bleed into critical section. */
3557 return s;
3558}
3559
3560/*
3561 * Given a snapshot from rcu_seq_snap(), determine whether or not a
3562 * full update-side operation has occurred.
3563 */
3564static bool rcu_seq_done(unsigned long *sp, unsigned long s)
3565{
3566 return ULONG_CMP_GE(READ_ONCE(*sp), s);
3567}
3568
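The rcu_seq_*() helpers deleted here implement a common sequence-counter idiom: an odd value means an update is in flight, an even value means idle, and a snapshot rounds up far enough that only an update beginning after the snapshot can satisfy it. A single-threaded user-space sketch of the same idiom (so the memory barriers are omitted):

#include <stdbool.h>
#include <stdio.h>

static unsigned long seq;        /* even: idle, odd: an update is in flight */

static void seq_start(void) { seq++; }
static void seq_end(void)   { seq++; }

/* The first even value reachable only via an update that starts after now. */
static unsigned long seq_snap(void) { return (seq + 3) & ~0x1UL; }

static bool seq_done(unsigned long snap) { return seq >= snap; }

int main(void)
{
	unsigned long s1 = seq_snap();           /* snapshot taken while idle */

	seq_start();
	unsigned long s2 = seq_snap();           /* snapshot taken mid-update */
	seq_end();
	printf("s1 done? %d\n", seq_done(s1));   /* 1: one full update completed */
	printf("s2 done? %d\n", seq_done(s2));   /* 0: that update was already running */
	seq_start();
	seq_end();
	printf("s2 done? %d\n", seq_done(s2));   /* 1: a full post-snapshot update completed */
	return 0;
}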
3569/* 3436/*
3570 * Check to see if there is any immediate RCU-related work to be done 3437 * Check to see if there is any immediate RCU-related work to be done
3571 * by the current CPU, for the specified type of RCU, returning 1 if so. 3438 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -3589,7 +3456,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3589 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3456 /* Is the RCU core waiting for a quiescent state from this CPU? */
3590 if (rcu_scheduler_fully_active && 3457 if (rcu_scheduler_fully_active &&
3591 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && 3458 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
3592 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { 3459 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) {
3593 rdp->n_rp_core_needs_qs++; 3460 rdp->n_rp_core_needs_qs++;
3594 } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { 3461 } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) {
3595 rdp->n_rp_report_qs++; 3462 rdp->n_rp_report_qs++;
@@ -3597,7 +3464,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3597 } 3464 }
3598 3465
3599 /* Does this CPU have callbacks ready to invoke? */ 3466 /* Does this CPU have callbacks ready to invoke? */
3600 if (cpu_has_callbacks_ready_to_invoke(rdp)) { 3467 if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
3601 rdp->n_rp_cb_ready++; 3468 rdp->n_rp_cb_ready++;
3602 return 1; 3469 return 1;
3603 } 3470 }
@@ -3661,10 +3528,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
3661 3528
3662 for_each_rcu_flavor(rsp) { 3529 for_each_rcu_flavor(rsp) {
3663 rdp = this_cpu_ptr(rsp->rda); 3530 rdp = this_cpu_ptr(rsp->rda);
3664 if (!rdp->nxtlist) 3531 if (rcu_segcblist_empty(&rdp->cblist))
3665 continue; 3532 continue;
3666 hc = true; 3533 hc = true;
3667 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { 3534 if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) {
3668 al = false; 3535 al = false;
3669 break; 3536 break;
3670 } 3537 }
@@ -3773,7 +3640,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3773 __call_rcu(&rdp->barrier_head, 3640 __call_rcu(&rdp->barrier_head,
3774 rcu_barrier_callback, rsp, cpu, 0); 3641 rcu_barrier_callback, rsp, cpu, 0);
3775 } 3642 }
3776 } else if (READ_ONCE(rdp->qlen)) { 3643 } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
3777 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3644 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
3778 rsp->barrier_sequence); 3645 rsp->barrier_sequence);
3779 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 3646 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -3882,8 +3749,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3882 rdp->qlen_last_fqs_check = 0; 3749 rdp->qlen_last_fqs_check = 0;
3883 rdp->n_force_qs_snap = rsp->n_force_qs; 3750 rdp->n_force_qs_snap = rsp->n_force_qs;
3884 rdp->blimit = blimit; 3751 rdp->blimit = blimit;
3885 if (!rdp->nxtlist) 3752 if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
3886 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 3753 !init_nocb_callback_list(rdp))
3754 rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
3887 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3755 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
3888 rcu_sysidle_init_percpu_data(rdp->dynticks); 3756 rcu_sysidle_init_percpu_data(rdp->dynticks);
3889 rcu_dynticks_eqs_online(); 3757 rcu_dynticks_eqs_online();
@@ -3902,12 +3770,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3902 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ 3770 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
3903 rdp->completed = rnp->completed; 3771 rdp->completed = rnp->completed;
3904 rdp->cpu_no_qs.b.norm = true; 3772 rdp->cpu_no_qs.b.norm = true;
3905 rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); 3773 rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
3906 rdp->core_needs_qs = false; 3774 rdp->core_needs_qs = false;
3907 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 3775 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3908 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3776 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3909} 3777}
3910 3778
3779/*
3780 * Invoked early in the CPU-online process, when pretty much all
3781 * services are available. The incoming CPU is not present.
3782 */
3911int rcutree_prepare_cpu(unsigned int cpu) 3783int rcutree_prepare_cpu(unsigned int cpu)
3912{ 3784{
3913 struct rcu_state *rsp; 3785 struct rcu_state *rsp;
@@ -3921,6 +3793,9 @@ int rcutree_prepare_cpu(unsigned int cpu)
3921 return 0; 3793 return 0;
3922} 3794}
3923 3795
3796/*
3797 * Update RCU priority boot kthread affinity for CPU-hotplug changes.
3798 */
3924static void rcutree_affinity_setting(unsigned int cpu, int outgoing) 3799static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
3925{ 3800{
3926 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); 3801 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
@@ -3928,20 +3803,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
3928 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); 3803 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing);
3929} 3804}
3930 3805
3806/*
3807 * Near the end of the CPU-online process. Pretty much all services
3808 * enabled, and the CPU is now very much alive.
3809 */
3931int rcutree_online_cpu(unsigned int cpu) 3810int rcutree_online_cpu(unsigned int cpu)
3932{ 3811{
3933 sync_sched_exp_online_cleanup(cpu); 3812 sync_sched_exp_online_cleanup(cpu);
3934 rcutree_affinity_setting(cpu, -1); 3813 rcutree_affinity_setting(cpu, -1);
3814 if (IS_ENABLED(CONFIG_TREE_SRCU))
3815 srcu_online_cpu(cpu);
3935 return 0; 3816 return 0;
3936} 3817}
3937 3818
3819/*
 3820 * Near the beginning of the CPU-offline process. The CPU is still very much alive
3821 * with pretty much all services enabled.
3822 */
3938int rcutree_offline_cpu(unsigned int cpu) 3823int rcutree_offline_cpu(unsigned int cpu)
3939{ 3824{
3940 rcutree_affinity_setting(cpu, cpu); 3825 rcutree_affinity_setting(cpu, cpu);
3826 if (IS_ENABLED(CONFIG_TREE_SRCU))
3827 srcu_offline_cpu(cpu);
3941 return 0; 3828 return 0;
3942} 3829}
3943 3830
3944 3831/*
3832 * Near the end of the offline process. We do only tracing here.
3833 */
3945int rcutree_dying_cpu(unsigned int cpu) 3834int rcutree_dying_cpu(unsigned int cpu)
3946{ 3835{
3947 struct rcu_state *rsp; 3836 struct rcu_state *rsp;
@@ -3951,6 +3840,9 @@ int rcutree_dying_cpu(unsigned int cpu)
3951 return 0; 3840 return 0;
3952} 3841}
3953 3842
3843/*
3844 * The outgoing CPU is gone and we are running elsewhere.
3845 */
3954int rcutree_dead_cpu(unsigned int cpu) 3846int rcutree_dead_cpu(unsigned int cpu)
3955{ 3847{
3956 struct rcu_state *rsp; 3848 struct rcu_state *rsp;
@@ -3968,6 +3860,10 @@ int rcutree_dead_cpu(unsigned int cpu)
3968 * incoming CPUs are not allowed to use RCU read-side critical sections 3860 * incoming CPUs are not allowed to use RCU read-side critical sections
3969 * until this function is called. Failing to observe this restriction 3861 * until this function is called. Failing to observe this restriction
3970 * will result in lockdep splats. 3862 * will result in lockdep splats.
3863 *
3864 * Note that this function is special in that it is invoked directly
3865 * from the incoming CPU rather than from the cpuhp_step mechanism.
3866 * This is because this function must be invoked at a precise location.
3971 */ 3867 */
3972void rcu_cpu_starting(unsigned int cpu) 3868void rcu_cpu_starting(unsigned int cpu)
3973{ 3869{
@@ -3993,9 +3889,6 @@ void rcu_cpu_starting(unsigned int cpu)
3993 * The CPU is exiting the idle loop into the arch_cpu_idle_dead() 3889 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
3994 * function. We now remove it from the rcu_node tree's ->qsmaskinit 3890 * function. We now remove it from the rcu_node tree's ->qsmaskinit
3995 * bit masks. 3891 * bit masks.
3996 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
3997 * function. We now remove it from the rcu_node tree's ->qsmaskinit
3998 * bit masks.
3999 */ 3892 */
4000static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) 3893static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
4001{ 3894{
@@ -4011,6 +3904,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
4011 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3904 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4012} 3905}
4013 3906
3907/*
 3908 * The outgoing CPU has no further need of RCU, so remove it from
3909 * the list of CPUs that RCU must track.
3910 *
3911 * Note that this function is special in that it is invoked directly
3912 * from the outgoing CPU rather than from the cpuhp_step mechanism.
3913 * This is because this function must be invoked at a precise location.
3914 */
4014void rcu_report_dead(unsigned int cpu) 3915void rcu_report_dead(unsigned int cpu)
4015{ 3916{
4016 struct rcu_state *rsp; 3917 struct rcu_state *rsp;
@@ -4025,6 +3926,10 @@ void rcu_report_dead(unsigned int cpu)
4025} 3926}
4026#endif 3927#endif
4027 3928
3929/*
3930 * On non-huge systems, use expedited RCU grace periods to make suspend
3931 * and hibernation run faster.
3932 */
4028static int rcu_pm_notify(struct notifier_block *self, 3933static int rcu_pm_notify(struct notifier_block *self,
4029 unsigned long action, void *hcpu) 3934 unsigned long action, void *hcpu)
4030{ 3935{
@@ -4095,7 +4000,7 @@ early_initcall(rcu_spawn_gp_kthread);
4095 * task is booting the system, and such primitives are no-ops). After this 4000 * task is booting the system, and such primitives are no-ops). After this
4096 * function is called, any synchronous grace-period primitives are run as 4001 * function is called, any synchronous grace-period primitives are run as
4097 * expedited, with the requesting task driving the grace period forward. 4002 * expedited, with the requesting task driving the grace period forward.
4098 * A later core_initcall() rcu_exp_runtime_mode() will switch to full 4003 * A later core_initcall() rcu_set_runtime_mode() will switch to full
4099 * runtime RCU functionality. 4004 * runtime RCU functionality.
4100 */ 4005 */
4101void rcu_scheduler_starting(void) 4006void rcu_scheduler_starting(void)
@@ -4108,31 +4013,6 @@ void rcu_scheduler_starting(void)
4108} 4013}
4109 4014
4110/* 4015/*
4111 * Compute the per-level fanout, either using the exact fanout specified
4112 * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
4113 */
4114static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
4115{
4116 int i;
4117
4118 if (rcu_fanout_exact) {
4119 levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
4120 for (i = rcu_num_lvls - 2; i >= 0; i--)
4121 levelspread[i] = RCU_FANOUT;
4122 } else {
4123 int ccur;
4124 int cprv;
4125
4126 cprv = nr_cpu_ids;
4127 for (i = rcu_num_lvls - 1; i >= 0; i--) {
4128 ccur = levelcnt[i];
4129 levelspread[i] = (cprv + ccur - 1) / ccur;
4130 cprv = ccur;
4131 }
4132 }
4133}
4134
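The deleted rcu_init_levelspread() computes, for each level of the rcu_node tree, how many children each node receives: either the exact configured fanout or a balanced spread obtained by ceiling division over the per-level node counts. The balanced branch is small enough to show standalone; the node counts and CPU total below are made up for illustration:

#include <stdio.h>

/* Balanced variant of the deleted helper: children per node at each tree level. */
static void toy_levelspread(int *spread, const int *levelcnt, int nlvls, int ncpus)
{
	int cprv = ncpus;

	for (int i = nlvls - 1; i >= 0; i--) {
		spread[i] = (cprv + levelcnt[i] - 1) / levelcnt[i];   /* ceiling division */
		cprv = levelcnt[i];
	}
}

int main(void)
{
	int levelcnt[3] = { 1, 4, 64 };   /* hypothetical three-level tree over 1000 CPUs */
	int spread[3];

	toy_levelspread(spread, levelcnt, 3, 1000);
	for (int i = 0; i < 3; i++)
		printf("level %d: %d node(s), up to %d children each\n",
		       i, levelcnt[i], spread[i]);
	return 0;
}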
4135/*
4136 * Helper function for rcu_init() that initializes one rcu_state structure. 4016 * Helper function for rcu_init() that initializes one rcu_state structure.
4137 */ 4017 */
4138static void __init rcu_init_one(struct rcu_state *rsp) 4018static void __init rcu_init_one(struct rcu_state *rsp)
@@ -4141,9 +4021,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
4141 static const char * const fqs[] = RCU_FQS_NAME_INIT; 4021 static const char * const fqs[] = RCU_FQS_NAME_INIT;
4142 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 4022 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
4143 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 4023 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
4144 static u8 fl_mask = 0x1;
4145 4024
4146 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
4147 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 4025 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
4148 int cpustride = 1; 4026 int cpustride = 1;
4149 int i; 4027 int i;
@@ -4158,20 +4036,16 @@ static void __init rcu_init_one(struct rcu_state *rsp)
4158 4036
4159 /* Initialize the level-tracking arrays. */ 4037 /* Initialize the level-tracking arrays. */
4160 4038
4161 for (i = 0; i < rcu_num_lvls; i++)
4162 levelcnt[i] = num_rcu_lvl[i];
4163 for (i = 1; i < rcu_num_lvls; i++) 4039 for (i = 1; i < rcu_num_lvls; i++)
4164 rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; 4040 rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1];
4165 rcu_init_levelspread(levelspread, levelcnt); 4041 rcu_init_levelspread(levelspread, num_rcu_lvl);
4166 rsp->flavor_mask = fl_mask;
4167 fl_mask <<= 1;
4168 4042
4169 /* Initialize the elements themselves, starting from the leaves. */ 4043 /* Initialize the elements themselves, starting from the leaves. */
4170 4044
4171 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4045 for (i = rcu_num_lvls - 1; i >= 0; i--) {
4172 cpustride *= levelspread[i]; 4046 cpustride *= levelspread[i];
4173 rnp = rsp->level[i]; 4047 rnp = rsp->level[i];
4174 for (j = 0; j < levelcnt[i]; j++, rnp++) { 4048 for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
4175 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); 4049 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
4176 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), 4050 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
4177 &rcu_node_class[i], buf[i]); 4051 &rcu_node_class[i], buf[i]);
@@ -4344,6 +4218,8 @@ void __init rcu_init(void)
4344 for_each_online_cpu(cpu) { 4218 for_each_online_cpu(cpu) {
4345 rcutree_prepare_cpu(cpu); 4219 rcutree_prepare_cpu(cpu);
4346 rcu_cpu_starting(cpu); 4220 rcu_cpu_starting(cpu);
4221 if (IS_ENABLED(CONFIG_TREE_SRCU))
4222 srcu_online_cpu(cpu);
4347 } 4223 }
4348} 4224}
4349 4225
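Note on the tree.c hunk above: the local levelcnt[] copy is folded into direct use of num_rcu_lvl[], and the removed copy of rcu_init_levelspread() shows the balanced-fanout computation that survives elsewhere: each level of the rcu_node tree is sized by ceiling division so children are spread as evenly as possible across the nodes of the level above. A minimal stand-alone sketch of that computation follows; the function name and the example figures are illustrative, not part of the patch.

    /*
     * Sketch of the balanced branch of rcu_init_levelspread();
     * illustrative only, not a drop-in replacement for the kernel helper.
     */
    static void demo_levelspread(int *levelspread, const int *levelcnt,
                                 int num_lvls, int ncpus)
    {
            int cprv = ncpus;       /* units handled by the level below */
            int i;

            for (i = num_lvls - 1; i >= 0; i--) {
                    int ccur = levelcnt[i];         /* nodes at this level */

                    /* Ceiling division spreads children evenly. */
                    levelspread[i] = (cprv + ccur - 1) / ccur;
                    cprv = ccur;
            }
    }

For example, with 96 possible CPUs and levelcnt = {1, 6}, this yields levelspread = {6, 16}: each of the six leaves covers up to 16 CPUs and the root fans out to those six leaves.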
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ec62a05bfdb3..ba38262c3554 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -30,80 +30,9 @@
30#include <linux/seqlock.h> 30#include <linux/seqlock.h>
31#include <linux/swait.h> 31#include <linux/swait.h>
32#include <linux/stop_machine.h> 32#include <linux/stop_machine.h>
33#include <linux/rcu_node_tree.h>
33 34
34/* 35#include "rcu_segcblist.h"
35 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
36 * CONFIG_RCU_FANOUT_LEAF.
37 * In theory, it should be possible to add more levels straightforwardly.
38 * In practice, this did work well going from three levels to four.
39 * Of course, your mileage may vary.
40 */
41
42#ifdef CONFIG_RCU_FANOUT
43#define RCU_FANOUT CONFIG_RCU_FANOUT
44#else /* #ifdef CONFIG_RCU_FANOUT */
45# ifdef CONFIG_64BIT
46# define RCU_FANOUT 64
47# else
48# define RCU_FANOUT 32
49# endif
50#endif /* #else #ifdef CONFIG_RCU_FANOUT */
51
52#ifdef CONFIG_RCU_FANOUT_LEAF
53#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
54#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
55# ifdef CONFIG_64BIT
56# define RCU_FANOUT_LEAF 64
57# else
58# define RCU_FANOUT_LEAF 32
59# endif
60#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
61
62#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
63#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
64#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
65#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
66
67#if NR_CPUS <= RCU_FANOUT_1
68# define RCU_NUM_LVLS 1
69# define NUM_RCU_LVL_0 1
70# define NUM_RCU_NODES NUM_RCU_LVL_0
71# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
72# define RCU_NODE_NAME_INIT { "rcu_node_0" }
73# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
74#elif NR_CPUS <= RCU_FANOUT_2
75# define RCU_NUM_LVLS 2
76# define NUM_RCU_LVL_0 1
77# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
78# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
79# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
80# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
81# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
82#elif NR_CPUS <= RCU_FANOUT_3
83# define RCU_NUM_LVLS 3
84# define NUM_RCU_LVL_0 1
85# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
86# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
87# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
88# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
89# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
90# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
91#elif NR_CPUS <= RCU_FANOUT_4
92# define RCU_NUM_LVLS 4
93# define NUM_RCU_LVL_0 1
94# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
95# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
96# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
97# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
98# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
99# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
100# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
101#else
102# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
103#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
104
105extern int rcu_num_lvls;
106extern int rcu_num_nodes;
107 36
108/* 37/*
109 * Dynticks per-CPU state. 38 * Dynticks per-CPU state.
@@ -113,6 +42,9 @@ struct rcu_dynticks {
113 /* Process level is worth LLONG_MAX/2. */ 42 /* Process level is worth LLONG_MAX/2. */
114 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 43 int dynticks_nmi_nesting; /* Track NMI nesting level. */
115 atomic_t dynticks; /* Even value for idle, else odd. */ 44 atomic_t dynticks; /* Even value for idle, else odd. */
45 bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
46 unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
47 bool rcu_urgent_qs; /* GP old need light quiescent state. */
116#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 48#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
117 long long dynticks_idle_nesting; 49 long long dynticks_idle_nesting;
118 /* irq/process nesting level from idle. */ 50 /* irq/process nesting level from idle. */
@@ -262,41 +194,6 @@ struct rcu_node {
262#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) 194#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo))
263 195
264/* 196/*
265 * Do a full breadth-first scan of the rcu_node structures for the
266 * specified rcu_state structure.
267 */
268#define rcu_for_each_node_breadth_first(rsp, rnp) \
269 for ((rnp) = &(rsp)->node[0]; \
270 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
271
272/*
273 * Do a breadth-first scan of the non-leaf rcu_node structures for the
274 * specified rcu_state structure. Note that if there is a singleton
275 * rcu_node tree with but one rcu_node structure, this loop is a no-op.
276 */
277#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
278 for ((rnp) = &(rsp)->node[0]; \
279 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
280
281/*
282 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
283 * structure. Note that if there is a singleton rcu_node tree with but
284 * one rcu_node structure, this loop -will- visit the rcu_node structure.
285 * It is still a leaf node, even if it is also the root node.
286 */
287#define rcu_for_each_leaf_node(rsp, rnp) \
288 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
289 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
290
291/*
292 * Iterate over all possible CPUs in a leaf RCU node.
293 */
294#define for_each_leaf_node_possible_cpu(rnp, cpu) \
295 for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
296 cpu <= rnp->grphi; \
297 cpu = cpumask_next((cpu), cpu_possible_mask))
298
299/*
300 * Union to allow "aggregate OR" operation on the need for a quiescent 197 * Union to allow "aggregate OR" operation on the need for a quiescent
301 * state by the normal and expedited grace periods. 198 * state by the normal and expedited grace periods.
302 */ 199 */
@@ -336,34 +233,9 @@ struct rcu_data {
336 /* period it is aware of. */ 233 /* period it is aware of. */
337 234
338 /* 2) batch handling */ 235 /* 2) batch handling */
339 /* 236 struct rcu_segcblist cblist; /* Segmented callback list, with */
340 * If nxtlist is not NULL, it is partitioned as follows. 237 /* different callbacks waiting for */
341 * Any of the partitions might be empty, in which case the 238 /* different grace periods. */
342 * pointer to that partition will be equal to the pointer for
343 * the following partition. When the list is empty, all of
344 * the nxttail elements point to the ->nxtlist pointer itself,
345 * which in that case is NULL.
346 *
347 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
348 * Entries that batch # <= ->completed
349 * The grace period for these entries has completed, and
350 * the other grace-period-completed entries may be moved
351 * here temporarily in rcu_process_callbacks().
352 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
353 * Entries that batch # <= ->completed - 1: waiting for current GP
354 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
355 * Entries known to have arrived before current GP ended
356 * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
357 * Entries that might have arrived after current GP ended
358 * Note that the value of *nxttail[RCU_NEXT_TAIL] will
359 * always be NULL, as this is the end of the list.
360 */
361 struct rcu_head *nxtlist;
362 struct rcu_head **nxttail[RCU_NEXT_SIZE];
363 unsigned long nxtcompleted[RCU_NEXT_SIZE];
364 /* grace periods for sublists. */
365 long qlen_lazy; /* # of lazy queued callbacks */
366 long qlen; /* # of queued callbacks, incl lazy */
367 long qlen_last_fqs_check; 239 long qlen_last_fqs_check;
368 /* qlen at last check for QS forcing */ 240 /* qlen at last check for QS forcing */
369 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 241 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
@@ -482,7 +354,6 @@ struct rcu_state {
482 struct rcu_node *level[RCU_NUM_LVLS + 1]; 354 struct rcu_node *level[RCU_NUM_LVLS + 1];
483 /* Hierarchy levels (+1 to */ 355 /* Hierarchy levels (+1 to */
484 /* shut bogus gcc warning) */ 356 /* shut bogus gcc warning) */
485 u8 flavor_mask; /* bit in flavor mask. */
486 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 357 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
487 call_rcu_func_t call; /* call_rcu() flavor. */ 358 call_rcu_func_t call; /* call_rcu() flavor. */
488 int ncpus; /* # CPUs seen so far. */ 359 int ncpus; /* # CPUs seen so far. */
@@ -502,14 +373,11 @@ struct rcu_state {
502 373
503 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; 374 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
504 /* Protect following fields. */ 375 /* Protect following fields. */
505 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 376 struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
506 /* need a grace period. */ 377 /* need a grace period. */
507 struct rcu_head **orphan_nxttail; /* Tail of above. */ 378 struct rcu_cblist orphan_done; /* Orphaned callbacks that */
508 struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
509 /* are ready to invoke. */ 379 /* are ready to invoke. */
510 struct rcu_head **orphan_donetail; /* Tail of above. */ 380 /* (Contains counts.) */
511 long qlen_lazy; /* Number of lazy callbacks. */
512 long qlen; /* Total number of callbacks. */
513 /* End of fields guarded by orphan_lock. */ 381 /* End of fields guarded by orphan_lock. */
514 382
515 struct mutex barrier_mutex; /* Guards barrier fields. */ 383 struct mutex barrier_mutex; /* Guards barrier fields. */
@@ -596,6 +464,7 @@ extern struct rcu_state rcu_preempt_state;
596#endif /* #ifdef CONFIG_PREEMPT_RCU */ 464#endif /* #ifdef CONFIG_PREEMPT_RCU */
597 465
598int rcu_dynticks_snap(struct rcu_dynticks *rdtp); 466int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
467bool rcu_eqs_special_set(int cpu);
599 468
600#ifdef CONFIG_RCU_BOOST 469#ifdef CONFIG_RCU_BOOST
601DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 470DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -673,6 +542,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
673static void rcu_dynticks_task_enter(void); 542static void rcu_dynticks_task_enter(void);
674static void rcu_dynticks_task_exit(void); 543static void rcu_dynticks_task_exit(void);
675 544
545#ifdef CONFIG_SRCU
546void srcu_online_cpu(unsigned int cpu);
547void srcu_offline_cpu(unsigned int cpu);
548#else /* #ifdef CONFIG_SRCU */
549void srcu_online_cpu(unsigned int cpu) { }
550void srcu_offline_cpu(unsigned int cpu) { }
551#endif /* #else #ifdef CONFIG_SRCU */
552
676#endif /* #ifndef RCU_TREE_NONCORE */ 553#endif /* #ifndef RCU_TREE_NONCORE */
677 554
678#ifdef CONFIG_RCU_TRACE 555#ifdef CONFIG_RCU_TRACE
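The long ->nxtlist/->nxttail[] comment removed from struct rcu_data above described four callback segments (done, waiting for the current grace period, ready for the next one, and newly arrived); that bookkeeping now lives behind struct rcu_segcblist and its accessors. A rough sketch of the data structure those segment tails imply, using illustrative names rather than the kernel's:

    struct demo_cb {
            struct demo_cb *next;
            void (*func)(struct demo_cb *cb);
    };

    enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, SEG_CNT };

    struct demo_segcblist {
            struct demo_cb *head;            /* oldest callback, or NULL if empty */
            struct demo_cb **tails[SEG_CNT]; /* last ->next pointer of each segment */
    };

    static void demo_segcblist_init(struct demo_segcblist *l)
    {
            int i;

            l->head = NULL;
            for (i = 0; i < SEG_CNT; i++)
                    l->tails[i] = &l->head; /* all segments empty */
    }

    /* Newly queued callbacks always land in the "next" segment. */
    static void demo_segcblist_enqueue(struct demo_segcblist *l, struct demo_cb *cb)
    {
            cb->next = NULL;
            *l->tails[SEG_NEXT] = cb;
            l->tails[SEG_NEXT] = &cb->next;
    }

With this layout, a segment-empty test reduces to comparing a segment's tail pointer with the one before it (or with &head for the first segment), which is what the rcu_segcblist_segempty() calls in the tree_trace.c hunk further down appear to rely on.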
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index a7b639ccd46e..e513b4ab1197 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
292 trace_rcu_exp_funnel_lock(rsp->name, rnp->level, 292 trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
293 rnp->grplo, rnp->grphi, 293 rnp->grplo, rnp->grphi,
294 TPS("wait")); 294 TPS("wait"));
295 wait_event(rnp->exp_wq[(s >> 1) & 0x3], 295 wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
296 sync_exp_work_done(rsp, 296 sync_exp_work_done(rsp,
297 &rdp->exp_workdone2, s)); 297 &rdp->exp_workdone2, s));
298 return true; 298 return true;
@@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data)
331 return; 331 return;
332 } 332 }
333 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); 333 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
334 /* Store .exp before .rcu_urgent_qs. */
335 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
334 resched_cpu(smp_processor_id()); 336 resched_cpu(smp_processor_id());
335} 337}
336 338
@@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
531 rnp->exp_seq_rq = s; 533 rnp->exp_seq_rq = s;
532 spin_unlock(&rnp->exp_lock); 534 spin_unlock(&rnp->exp_lock);
533 } 535 }
534 wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); 536 smp_mb(); /* All above changes before wakeup. */
537 wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]);
535 } 538 }
536 trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); 539 trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
537 mutex_unlock(&rsp->exp_wake_mutex); 540 mutex_unlock(&rsp->exp_wake_mutex);
@@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
609 /* Wait for expedited grace period to complete. */ 612 /* Wait for expedited grace period to complete. */
610 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); 613 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
611 rnp = rcu_get_root(rsp); 614 rnp = rcu_get_root(rsp);
612 wait_event(rnp->exp_wq[(s >> 1) & 0x3], 615 wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
613 sync_exp_work_done(rsp, 616 sync_exp_work_done(rsp, &rdp->exp_workdone0, s));
614 &rdp->exp_workdone0, s)); 617 smp_mb(); /* Workqueue actions happen before return. */
615 618
616 /* Let the next expedited grace period start. */ 619 /* Let the next expedited grace period start. */
617 mutex_unlock(&rsp->exp_mutex); 620 mutex_unlock(&rsp->exp_mutex);
@@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void)
735EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 738EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
736 739
737#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 740#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
738
739/*
740 * Switch to run-time mode once Tree RCU has fully initialized.
741 */
742static int __init rcu_exp_runtime_mode(void)
743{
744 rcu_test_sync_prims();
745 rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
746 rcu_test_sync_prims();
747 return 0;
748}
749core_initcall(rcu_exp_runtime_mode);
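In the tree_exp.h hunk above, the open-coded "(s >> 1) & 0x3" becomes "rcu_seq_ctr(s) & 0x3" both where waiters pick an expedited wait queue and where rcu_exp_wait_wake() wakes them, so both sides index the same one of the four queues even if the layout of the sequence counter changes. Assuming the usual layout, where the low-order bits of the sequence hold grace-period state and the remaining bits count completed grace periods, the indexing amounts to the sketch below; the constants and names are illustrative, the real helpers live in kernel/rcu/rcu.h.

    #define DEMO_SEQ_CTR_SHIFT      2       /* assumed width of the state bits */

    static inline unsigned long demo_seq_ctr(unsigned long s)
    {
            return s >> DEMO_SEQ_CTR_SHIFT; /* completed-grace-period counter */
    }

    /* Four wait queues are cycled through, one per in-flight expedited GP. */
    static inline int demo_exp_wq_index(unsigned long s)
    {
            return demo_seq_ctr(s) & 0x3;
    }

The point of the change is that the queue index now follows the counter field itself rather than hard-coding the width of the state field.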
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a62a8f1caac..c9a48657512a 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1350,10 +1350,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1350 */ 1350 */
1351 if ((rdp->completed != rnp->completed || 1351 if ((rdp->completed != rnp->completed ||
1352 unlikely(READ_ONCE(rdp->gpwrap))) && 1352 unlikely(READ_ONCE(rdp->gpwrap))) &&
1353 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1353 rcu_segcblist_pend_cbs(&rdp->cblist))
1354 note_gp_changes(rsp, rdp); 1354 note_gp_changes(rsp, rdp);
1355 1355
1356 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1356 if (rcu_segcblist_ready_cbs(&rdp->cblist))
1357 cbs_ready = true; 1357 cbs_ready = true;
1358 } 1358 }
1359 return cbs_ready; 1359 return cbs_ready;
@@ -1461,7 +1461,7 @@ static void rcu_prepare_for_idle(void)
1461 rdtp->last_accelerate = jiffies; 1461 rdtp->last_accelerate = jiffies;
1462 for_each_rcu_flavor(rsp) { 1462 for_each_rcu_flavor(rsp) {
1463 rdp = this_cpu_ptr(rsp->rda); 1463 rdp = this_cpu_ptr(rsp->rda);
1464 if (!*rdp->nxttail[RCU_DONE_TAIL]) 1464 if (rcu_segcblist_pend_cbs(&rdp->cblist))
1465 continue; 1465 continue;
1466 rnp = rdp->mynode; 1466 rnp = rdp->mynode;
1467 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 1467 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
@@ -1529,7 +1529,7 @@ static void rcu_oom_notify_cpu(void *unused)
1529 1529
1530 for_each_rcu_flavor(rsp) { 1530 for_each_rcu_flavor(rsp) {
1531 rdp = raw_cpu_ptr(rsp->rda); 1531 rdp = raw_cpu_ptr(rsp->rda);
1532 if (rdp->qlen_lazy != 0) { 1532 if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) {
1533 atomic_inc(&oom_callback_count); 1533 atomic_inc(&oom_callback_count);
1534 rsp->call(&rdp->oom_head, rcu_oom_callback); 1534 rsp->call(&rdp->oom_head, rcu_oom_callback);
1535 } 1535 }
@@ -1709,7 +1709,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup);
1709 1709
1710static int __init parse_rcu_nocb_poll(char *arg) 1710static int __init parse_rcu_nocb_poll(char *arg)
1711{ 1711{
1712 rcu_nocb_poll = 1; 1712 rcu_nocb_poll = true;
1713 return 0; 1713 return 0;
1714} 1714}
1715early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 1715early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
@@ -1860,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1860 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1860 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1861 TPS("WakeEmpty")); 1861 TPS("WakeEmpty"));
1862 } else { 1862 } else {
1863 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; 1863 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE);
1864 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
1865 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1864 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1866 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1865 TPS("WakeEmptyIsDeferred")); 1867 TPS("WakeEmptyIsDeferred"));
1866 } 1868 }
@@ -1872,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1872 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1874 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1873 TPS("WakeOvf")); 1875 TPS("WakeOvf"));
1874 } else { 1876 } else {
1875 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; 1877 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE);
1878 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
1879 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1876 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1880 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1877 TPS("WakeOvfIsDeferred")); 1881 TPS("WakeOvfIsDeferred"));
1878 } 1882 }
@@ -1930,30 +1934,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
1930 struct rcu_data *rdp, 1934 struct rcu_data *rdp,
1931 unsigned long flags) 1935 unsigned long flags)
1932{ 1936{
1933 long ql = rsp->qlen; 1937 long ql = rsp->orphan_done.len;
1934 long qll = rsp->qlen_lazy; 1938 long qll = rsp->orphan_done.len_lazy;
1935 1939
1936 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 1940 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
1937 if (!rcu_is_nocb_cpu(smp_processor_id())) 1941 if (!rcu_is_nocb_cpu(smp_processor_id()))
1938 return false; 1942 return false;
1939 rsp->qlen = 0;
1940 rsp->qlen_lazy = 0;
1941 1943
1942 /* First, enqueue the donelist, if any. This preserves CB ordering. */ 1944 /* First, enqueue the donelist, if any. This preserves CB ordering. */
1943 if (rsp->orphan_donelist != NULL) { 1945 if (rsp->orphan_done.head) {
1944 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, 1946 __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
1945 rsp->orphan_donetail, ql, qll, flags); 1947 rcu_cblist_tail(&rsp->orphan_done),
1946 ql = qll = 0; 1948 ql, qll, flags);
1947 rsp->orphan_donelist = NULL;
1948 rsp->orphan_donetail = &rsp->orphan_donelist;
1949 } 1949 }
1950 if (rsp->orphan_nxtlist != NULL) { 1950 if (rsp->orphan_pend.head) {
1951 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, 1951 __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
1952 rsp->orphan_nxttail, ql, qll, flags); 1952 rcu_cblist_tail(&rsp->orphan_pend),
1953 ql = qll = 0; 1953 ql, qll, flags);
1954 rsp->orphan_nxtlist = NULL;
1955 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1956 } 1954 }
1955 rcu_cblist_init(&rsp->orphan_done);
1956 rcu_cblist_init(&rsp->orphan_pend);
1957 return true; 1957 return true;
1958} 1958}
1959 1959
@@ -2395,16 +2395,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2395 return false; 2395 return false;
2396 2396
2397 /* If there are early-boot callbacks, move them to nocb lists. */ 2397 /* If there are early-boot callbacks, move them to nocb lists. */
2398 if (rdp->nxtlist) { 2398 if (!rcu_segcblist_empty(&rdp->cblist)) {
2399 rdp->nocb_head = rdp->nxtlist; 2399 rdp->nocb_head = rcu_segcblist_head(&rdp->cblist);
2400 rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; 2400 rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist);
2401 atomic_long_set(&rdp->nocb_q_count, rdp->qlen); 2401 atomic_long_set(&rdp->nocb_q_count,
2402 atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); 2402 rcu_segcblist_n_cbs(&rdp->cblist));
2403 rdp->nxtlist = NULL; 2403 atomic_long_set(&rdp->nocb_q_count_lazy,
2404 rdp->qlen = 0; 2404 rcu_segcblist_n_lazy_cbs(&rdp->cblist));
2405 rdp->qlen_lazy = 0; 2405 rcu_segcblist_init(&rdp->cblist);
2406 } 2406 }
2407 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2407 rcu_segcblist_disable(&rdp->cblist);
2408 return true; 2408 return true;
2409} 2409}
2410 2410
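The rcu_nocb_adopt_orphan_cbs() hunk above replaces the orphan_nxtlist/orphan_donelist pointer pairs and the rsp->qlen/qlen_lazy counters with two rcu_cblist structures (orphan_pend and orphan_done), read through rcu_cblist_head()/rcu_cblist_tail() and reset with rcu_cblist_init(). A minimal sketch of such an unsegmented, counted callback list; field and function names are illustrative, and in kernel context struct rcu_head comes from <linux/types.h>.

    struct demo_cblist {
            struct rcu_head *head;          /* first callback, or NULL if empty */
            struct rcu_head **tail;         /* &last->next, or &head if empty */
            long len;                       /* total callbacks queued */
            long len_lazy;                  /* subset that merely free memory */
    };

    static void demo_cblist_init(struct demo_cblist *l)
    {
            l->head = NULL;
            l->tail = &l->head;
            l->len = 0;
            l->len_lazy = 0;
    }

    static void demo_cblist_enqueue(struct demo_cblist *l, struct rcu_head *rhp,
                                    bool lazy)
    {
            rhp->next = NULL;
            *l->tail = rhp;
            l->tail = &rhp->next;
            l->len++;
            if (lazy)
                    l->len_lazy++;
    }

Keeping the counts inside the list is what lets the hunks read rsp->orphan_done.len and rsp->orphan_done.len_lazy directly instead of the old rsp->qlen and rsp->qlen_lazy fields.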
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 8751a748499a..6cea17a1ea30 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -41,11 +41,11 @@
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/debugfs.h> 42#include <linux/debugfs.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/prefetch.h>
44 45
45#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
46#include "tree.h" 47#include "tree.h"
47 48#include "rcu.h"
48DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
49 49
50static int r_open(struct inode *inode, struct file *file, 50static int r_open(struct inode *inode, struct file *file,
51 const struct seq_operations *op) 51 const struct seq_operations *op)
@@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
121 cpu_is_offline(rdp->cpu) ? '!' : ' ', 121 cpu_is_offline(rdp->cpu) ? '!' : ' ',
122 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 122 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
123 rdp->cpu_no_qs.b.norm, 123 rdp->cpu_no_qs.b.norm,
124 rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), 124 rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu),
125 rdp->core_needs_qs); 125 rdp->core_needs_qs);
126 seq_printf(m, " dt=%d/%llx/%d df=%lu", 126 seq_printf(m, " dt=%d/%llx/%d df=%lu",
127 rcu_dynticks_snap(rdp->dynticks), 127 rcu_dynticks_snap(rdp->dynticks),
@@ -130,17 +130,15 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
130 rdp->dynticks_fqs); 130 rdp->dynticks_fqs);
131 seq_printf(m, " of=%lu", rdp->offline_fqs); 131 seq_printf(m, " of=%lu", rdp->offline_fqs);
132 rcu_nocb_q_lengths(rdp, &ql, &qll); 132 rcu_nocb_q_lengths(rdp, &ql, &qll);
133 qll += rdp->qlen_lazy; 133 qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist);
134 ql += rdp->qlen; 134 ql += rcu_segcblist_n_cbs(&rdp->cblist);
135 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", 135 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
136 qll, ql, 136 qll, ql,
137 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 137 ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)],
138 rdp->nxttail[RCU_NEXT_TAIL]], 138 ".R"[!rcu_segcblist_segempty(&rdp->cblist,
139 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 139 RCU_NEXT_READY_TAIL)],
140 rdp->nxttail[RCU_NEXT_READY_TAIL]], 140 ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)],
141 ".W"[rdp->nxttail[RCU_DONE_TAIL] != 141 ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]);
142 rdp->nxttail[RCU_WAIT_TAIL]],
143 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
144#ifdef CONFIG_RCU_BOOST 142#ifdef CONFIG_RCU_BOOST
145 seq_printf(m, " kt=%d/%c ktl=%x", 143 seq_printf(m, " kt=%d/%c ktl=%x",
146 per_cpu(rcu_cpu_has_work, rdp->cpu), 144 per_cpu(rcu_cpu_has_work, rdp->cpu),
@@ -278,7 +276,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
278 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 276 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
279 rsp->n_force_qs, rsp->n_force_qs_ngp, 277 rsp->n_force_qs, rsp->n_force_qs_ngp,
280 rsp->n_force_qs - rsp->n_force_qs_ngp, 278 rsp->n_force_qs - rsp->n_force_qs_ngp,
281 READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); 279 READ_ONCE(rsp->n_force_qs_lh),
280 rsp->orphan_done.len_lazy,
281 rsp->orphan_done.len);
282 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { 282 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
283 if (rnp->level != level) { 283 if (rnp->level != level) {
284 seq_puts(m, "\n"); 284 seq_puts(m, "\n");
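The print_one_rcu_data() hunk keeps the terse ".X"[condition] idiom while switching its conditions to rcu_segcblist_segempty(). The idiom simply indexes a two-character string literal with a 0/1 value: '.' when the condition is false, the letter when it is true. A tiny user-space illustration, not kernel code:

    #include <stdio.h>

    int main(void)
    {
            int have_next = 1;      /* e.g. !rcu_segcblist_segempty(..., RCU_NEXT_TAIL) */
            int have_done = 0;

            /* "xY"[flag] yields 'x' for 0 and 'Y' for 1. */
            printf("qs=%c%c\n", ".N"[have_next], ".D"[have_done]);
            return 0;               /* prints: qs=N. */
    }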
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 55c8530316c7..273e869ca21d 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
124 * non-expedited counterparts? Intended for use within RCU. Note 124 * non-expedited counterparts? Intended for use within RCU. Note
125 * that if the user specifies both rcu_expedited and rcu_normal, then 125 * that if the user specifies both rcu_expedited and rcu_normal, then
126 * rcu_normal wins. (Except during the time period during boot from 126 * rcu_normal wins. (Except during the time period during boot from
127 * when the first task is spawned until the rcu_exp_runtime_mode() 127 * when the first task is spawned until the rcu_set_runtime_mode()
128 * core_initcall() is invoked, at which point everything is expedited.) 128 * core_initcall() is invoked, at which point everything is expedited.)
129 */ 129 */
130bool rcu_gp_is_normal(void) 130bool rcu_gp_is_normal(void)
@@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void)
190 190
191#endif /* #ifndef CONFIG_TINY_RCU */ 191#endif /* #ifndef CONFIG_TINY_RCU */
192 192
193/*
194 * Test each non-SRCU synchronous grace-period wait API. This is
195 * useful just after a change in mode for these primitives, and
196 * during early boot.
197 */
198void rcu_test_sync_prims(void)
199{
200 if (!IS_ENABLED(CONFIG_PROVE_RCU))
201 return;
202 synchronize_rcu();
203 synchronize_rcu_bh();
204 synchronize_sched();
205 synchronize_rcu_expedited();
206 synchronize_rcu_bh_expedited();
207 synchronize_sched_expedited();
208}
209
210#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
211
212/*
213 * Switch to run-time mode once RCU has fully initialized.
214 */
215static int __init rcu_set_runtime_mode(void)
216{
217 rcu_test_sync_prims();
218 rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
219 rcu_test_sync_prims();
220 return 0;
221}
222core_initcall(rcu_set_runtime_mode);
223
224#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
225
193#ifdef CONFIG_PREEMPT_RCU 226#ifdef CONFIG_PREEMPT_RCU
194 227
195/* 228/*
@@ -632,6 +665,7 @@ static void check_holdout_task(struct task_struct *t,
632 put_task_struct(t); 665 put_task_struct(t);
633 return; 666 return;
634 } 667 }
668 rcu_request_urgent_qs_task(t);
635 if (!needreport) 669 if (!needreport)
636 return; 670 return;
637 if (*firstreport) { 671 if (*firstreport) {
@@ -817,23 +851,6 @@ static void rcu_spawn_tasks_kthread(void)
817 851
818#endif /* #ifdef CONFIG_TASKS_RCU */ 852#endif /* #ifdef CONFIG_TASKS_RCU */
819 853
820/*
821 * Test each non-SRCU synchronous grace-period wait API. This is
822 * useful just after a change in mode for these primitives, and
823 * during early boot.
824 */
825void rcu_test_sync_prims(void)
826{
827 if (!IS_ENABLED(CONFIG_PROVE_RCU))
828 return;
829 synchronize_rcu();
830 synchronize_rcu_bh();
831 synchronize_sched();
832 synchronize_rcu_expedited();
833 synchronize_rcu_bh_expedited();
834 synchronize_sched_expedited();
835}
836
837#ifdef CONFIG_PROVE_RCU 854#ifdef CONFIG_PROVE_RCU
838 855
839/* 856/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c51147a1204c..759f4bd52cd6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3382,7 +3382,7 @@ static void __sched notrace __schedule(bool preempt)
3382 hrtick_clear(rq); 3382 hrtick_clear(rq);
3383 3383
3384 local_irq_disable(); 3384 local_irq_disable();
3385 rcu_note_context_switch(); 3385 rcu_note_context_switch(preempt);
3386 3386
3387 /* 3387 /*
3388 * Make sure that signal_pending_state()->signal_pending() below 3388 * Make sure that signal_pending_state()->signal_pending() below
diff --git a/kernel/signal.c b/kernel/signal.c
index a8c54f384553..ca92bcfeb322 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1237 } 1237 }
1238 /* 1238 /*
1239 * This sighand can be already freed and even reused, but 1239 * This sighand can be already freed and even reused, but
1240 * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which 1240 * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
1241 * initializes ->siglock: this slab can't go away, it has 1241 * initializes ->siglock: this slab can't go away, it has
1242 * the same object type, ->siglock can't be reinitialized. 1242 * the same object type, ->siglock can't be reinitialized.
1243 * 1243 *
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index b10da59cf765..c81549d5c833 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -413,7 +413,7 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
413 *size += sizeof(struct kasan_alloc_meta); 413 *size += sizeof(struct kasan_alloc_meta);
414 414
415 /* Add free meta. */ 415 /* Add free meta. */
416 if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || 416 if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
417 cache->object_size < sizeof(struct kasan_free_meta)) { 417 cache->object_size < sizeof(struct kasan_free_meta)) {
418 cache->kasan_info.free_meta_offset = *size; 418 cache->kasan_info.free_meta_offset = *size;
419 *size += sizeof(struct kasan_free_meta); 419 *size += sizeof(struct kasan_free_meta);
@@ -561,7 +561,7 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object)
561 unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); 561 unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
562 562
563 /* RCU slabs could be legally used after free within the RCU period */ 563 /* RCU slabs could be legally used after free within the RCU period */
564 if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) 564 if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
565 return; 565 return;
566 566
567 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); 567 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
@@ -572,7 +572,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
572 s8 shadow_byte; 572 s8 shadow_byte;
573 573
574 /* RCU slabs could be legally used after free within the RCU period */ 574 /* RCU slabs could be legally used after free within the RCU period */
575 if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) 575 if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
576 return false; 576 return false;
577 577
578 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); 578 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index 5bf191756a4a..2d5959c5f7c5 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -95,7 +95,7 @@ void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
95void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) 95void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
96{ 96{
97 /* TODO: RCU freeing is unsupported for now; hide false positives. */ 97 /* TODO: RCU freeing is unsupported for now; hide false positives. */
98 if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) 98 if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU))
99 kmemcheck_mark_freed(object, size); 99 kmemcheck_mark_freed(object, size);
100} 100}
101 101
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index a7652acd2ab9..54ca54562928 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -21,7 +21,7 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22 22
23/* global SRCU for all MMs */ 23/* global SRCU for all MMs */
24static struct srcu_struct srcu; 24DEFINE_STATIC_SRCU(srcu);
25 25
26/* 26/*
27 * This function allows mmu_notifier::release callback to delay a call to 27 * This function allows mmu_notifier::release callback to delay a call to
@@ -252,12 +252,6 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
252 252
253 BUG_ON(atomic_read(&mm->mm_users) <= 0); 253 BUG_ON(atomic_read(&mm->mm_users) <= 0);
254 254
255 /*
256 * Verify that mmu_notifier_init() already run and the global srcu is
257 * initialized.
258 */
259 BUG_ON(!srcu.per_cpu_ref);
260
261 ret = -ENOMEM; 255 ret = -ENOMEM;
262 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); 256 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
263 if (unlikely(!mmu_notifier_mm)) 257 if (unlikely(!mmu_notifier_mm))
@@ -406,9 +400,3 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
406 mmdrop(mm); 400 mmdrop(mm);
407} 401}
408EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); 402EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
409
410static int __init mmu_notifier_init(void)
411{
412 return init_srcu_struct(&srcu);
413}
414subsys_initcall(mmu_notifier_init);
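The mmu_notifier.c hunk trades a run-time init_srcu_struct() call, the subsys_initcall() that made it, and the BUG_ON() sanity check for DEFINE_STATIC_SRCU(), which emits a compile-time-initialized srcu_struct usable from the very first registration. A hedged sketch of the resulting usage pattern, with illustrative names:

    #include <linux/srcu.h>

    DEFINE_STATIC_SRCU(demo_srcu);          /* no init call needed, valid at boot */

    static void demo_reader(void)
    {
            int idx;

            idx = srcu_read_lock(&demo_srcu);
            /* ... dereference SRCU-protected state here ... */
            srcu_read_unlock(&demo_srcu, idx);
    }

    static void demo_update(void)
    {
            /* ... unpublish the old state ... */
            synchronize_srcu(&demo_srcu);   /* wait out all current readers */
            /* ... old state can now be freed ... */
    }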
diff --git a/mm/rmap.c b/mm/rmap.c
index 3ff241f714eb..d405f0e0ee96 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -430,7 +430,7 @@ static void anon_vma_ctor(void *data)
430void __init anon_vma_init(void) 430void __init anon_vma_init(void)
431{ 431{
432 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 432 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
433 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, 433 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
434 anon_vma_ctor); 434 anon_vma_ctor);
435 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, 435 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
436 SLAB_PANIC|SLAB_ACCOUNT); 436 SLAB_PANIC|SLAB_ACCOUNT);
@@ -481,7 +481,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
481 * If this page is still mapped, then its anon_vma cannot have been 481 * If this page is still mapped, then its anon_vma cannot have been
482 * freed. But if it has been unmapped, we have no security against the 482 * freed. But if it has been unmapped, we have no security against the
483 * anon_vma structure being freed and reused (for another anon_vma: 483 * anon_vma structure being freed and reused (for another anon_vma:
484 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() 484 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
485 * above cannot corrupt). 485 * above cannot corrupt).
486 */ 486 */
487 if (!page_mapped(page)) { 487 if (!page_mapped(page)) {
diff --git a/mm/slab.c b/mm/slab.c
index 1880d482a0cb..2a31ee3c5814 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1728,7 +1728,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page)
1728 1728
1729 freelist = page->freelist; 1729 freelist = page->freelist;
1730 slab_destroy_debugcheck(cachep, page); 1730 slab_destroy_debugcheck(cachep, page);
1731 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 1731 if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU))
1732 call_rcu(&page->rcu_head, kmem_rcu_free); 1732 call_rcu(&page->rcu_head, kmem_rcu_free);
1733 else 1733 else
1734 kmem_freepages(cachep, page); 1734 kmem_freepages(cachep, page);
@@ -1924,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
1924 1924
1925 cachep->num = 0; 1925 cachep->num = 0;
1926 1926
1927 if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) 1927 if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU)
1928 return false; 1928 return false;
1929 1929
1930 left = calculate_slab_order(cachep, size, 1930 left = calculate_slab_order(cachep, size,
@@ -2030,7 +2030,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2030 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2030 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2031 2 * sizeof(unsigned long long))) 2031 2 * sizeof(unsigned long long)))
2032 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2032 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2033 if (!(flags & SLAB_DESTROY_BY_RCU)) 2033 if (!(flags & SLAB_TYPESAFE_BY_RCU))
2034 flags |= SLAB_POISON; 2034 flags |= SLAB_POISON;
2035#endif 2035#endif
2036#endif 2036#endif
diff --git a/mm/slab.h b/mm/slab.h
index 65e7c3fcac72..9cfcf099709c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -126,7 +126,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
126 126
127/* Legal flag mask for kmem_cache_create(), for various configurations */ 127/* Legal flag mask for kmem_cache_create(), for various configurations */
128#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ 128#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
129 SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) 129 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
130 130
131#if defined(CONFIG_DEBUG_SLAB) 131#if defined(CONFIG_DEBUG_SLAB)
132#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 132#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
@@ -415,7 +415,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
415 * back there or track user information then we can 415 * back there or track user information then we can
416 * only use the space before that information. 416 * only use the space before that information.
417 */ 417 */
418 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) 418 if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER))
419 return s->inuse; 419 return s->inuse;
420 /* 420 /*
421 * Else we can use all the padding etc for the allocation 421 * Else we can use all the padding etc for the allocation
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 09d0e849b07f..01a0fe2eb332 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -39,7 +39,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
39 * Set of flags that will prevent slab merging 39 * Set of flags that will prevent slab merging
40 */ 40 */
41#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 41#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
42 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ 42 SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
43 SLAB_FAILSLAB | SLAB_KASAN) 43 SLAB_FAILSLAB | SLAB_KASAN)
44 44
45#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ 45#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
@@ -500,7 +500,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
500 struct kmem_cache *s, *s2; 500 struct kmem_cache *s, *s2;
501 501
502 /* 502 /*
503 * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the 503 * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
504 * @slab_caches_to_rcu_destroy list. The slab pages are freed 504 * @slab_caches_to_rcu_destroy list. The slab pages are freed
504	 * through RCU and the associated kmem_cache are dereferenced	504	 * through RCU and the associated kmem_cache are dereferenced
506 * while freeing the pages, so the kmem_caches should be freed only 506 * while freeing the pages, so the kmem_caches should be freed only
@@ -537,7 +537,7 @@ static int shutdown_cache(struct kmem_cache *s)
537 memcg_unlink_cache(s); 537 memcg_unlink_cache(s);
538 list_del(&s->list); 538 list_del(&s->list);
539 539
540 if (s->flags & SLAB_DESTROY_BY_RCU) { 540 if (s->flags & SLAB_TYPESAFE_BY_RCU) {
541 list_add_tail(&s->list, &slab_caches_to_rcu_destroy); 541 list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
542 schedule_work(&slab_caches_to_rcu_destroy_work); 542 schedule_work(&slab_caches_to_rcu_destroy_work);
543 } else { 543 } else {
diff --git a/mm/slob.c b/mm/slob.c
index eac04d4357ec..1bae78d71096 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -126,7 +126,7 @@ static inline void clear_slob_page_free(struct page *sp)
126 126
127/* 127/*
128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which 128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
129 * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free 129 * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free
130 * the block using call_rcu. 130 * the block using call_rcu.
131 */ 131 */
132struct slob_rcu { 132struct slob_rcu {
@@ -524,7 +524,7 @@ EXPORT_SYMBOL(ksize);
524 524
525int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) 525int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
526{ 526{
527 if (flags & SLAB_DESTROY_BY_RCU) { 527 if (flags & SLAB_TYPESAFE_BY_RCU) {
528 /* leave room for rcu footer at the end of object */ 528 /* leave room for rcu footer at the end of object */
529 c->size += sizeof(struct slob_rcu); 529 c->size += sizeof(struct slob_rcu);
530 } 530 }
@@ -598,7 +598,7 @@ static void kmem_rcu_free(struct rcu_head *head)
598void kmem_cache_free(struct kmem_cache *c, void *b) 598void kmem_cache_free(struct kmem_cache *c, void *b)
599{ 599{
600 kmemleak_free_recursive(b, c->flags); 600 kmemleak_free_recursive(b, c->flags);
601 if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { 601 if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
602 struct slob_rcu *slob_rcu; 602 struct slob_rcu *slob_rcu;
603 slob_rcu = b + (c->size - sizeof(struct slob_rcu)); 603 slob_rcu = b + (c->size - sizeof(struct slob_rcu));
604 slob_rcu->size = c->size; 604 slob_rcu->size = c->size;
diff --git a/mm/slub.c b/mm/slub.c
index 7f4bc7027ed5..57e5156f02be 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1687,7 +1687,7 @@ static void rcu_free_slab(struct rcu_head *h)
1687 1687
1688static void free_slab(struct kmem_cache *s, struct page *page) 1688static void free_slab(struct kmem_cache *s, struct page *page)
1689{ 1689{
1690 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1690 if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
1691 struct rcu_head *head; 1691 struct rcu_head *head;
1692 1692
1693 if (need_reserve_slab_rcu) { 1693 if (need_reserve_slab_rcu) {
@@ -2963,7 +2963,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
2963 * slab_free_freelist_hook() could have put the items into quarantine. 2963 * slab_free_freelist_hook() could have put the items into quarantine.
2964 * If so, no need to free them. 2964 * If so, no need to free them.
2965 */ 2965 */
2966 if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) 2966 if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU))
2967 return; 2967 return;
2968 do_slab_free(s, page, head, tail, cnt, addr); 2968 do_slab_free(s, page, head, tail, cnt, addr);
2969} 2969}
@@ -3433,7 +3433,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3433 * the slab may touch the object after free or before allocation 3433 * the slab may touch the object after free or before allocation
3434 * then we should never poison the object itself. 3434 * then we should never poison the object itself.
3435 */ 3435 */
3436 if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && 3436 if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
3437 !s->ctor) 3437 !s->ctor)
3438 s->flags |= __OBJECT_POISON; 3438 s->flags |= __OBJECT_POISON;
3439 else 3439 else
@@ -3455,7 +3455,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3455 */ 3455 */
3456 s->inuse = size; 3456 s->inuse = size;
3457 3457
3458 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || 3458 if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
3459 s->ctor)) { 3459 s->ctor)) {
3460 /* 3460 /*
3461 * Relocate free pointer after the object if it is not 3461 * Relocate free pointer after the object if it is not
@@ -3537,7 +3537,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3537 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); 3537 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
3538 s->reserved = 0; 3538 s->reserved = 0;
3539 3539
3540 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) 3540 if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU))
3541 s->reserved = sizeof(struct rcu_head); 3541 s->reserved = sizeof(struct rcu_head);
3542 3542
3543 if (!calculate_sizes(s, -1)) 3543 if (!calculate_sizes(s, -1))
@@ -5042,7 +5042,7 @@ SLAB_ATTR_RO(cache_dma);
5042 5042
5043static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 5043static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
5044{ 5044{
5045 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 5045 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
5046} 5046}
5047SLAB_ATTR_RO(destroy_by_rcu); 5047SLAB_ATTR_RO(destroy_by_rcu);
5048 5048
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b99168b0fabf..f75482bdee9a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -951,7 +951,7 @@ static struct proto dccp_v4_prot = {
951 .orphan_count = &dccp_orphan_count, 951 .orphan_count = &dccp_orphan_count,
952 .max_header = MAX_DCCP_HEADER, 952 .max_header = MAX_DCCP_HEADER,
953 .obj_size = sizeof(struct dccp_sock), 953 .obj_size = sizeof(struct dccp_sock),
954 .slab_flags = SLAB_DESTROY_BY_RCU, 954 .slab_flags = SLAB_TYPESAFE_BY_RCU,
955 .rsk_prot = &dccp_request_sock_ops, 955 .rsk_prot = &dccp_request_sock_ops,
956 .twsk_prot = &dccp_timewait_sock_ops, 956 .twsk_prot = &dccp_timewait_sock_ops,
957 .h.hashinfo = &dccp_hashinfo, 957 .h.hashinfo = &dccp_hashinfo,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index d9b6a4e403e7..840f14aaa016 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1014,7 +1014,7 @@ static struct proto dccp_v6_prot = {
1014 .orphan_count = &dccp_orphan_count, 1014 .orphan_count = &dccp_orphan_count,
1015 .max_header = MAX_DCCP_HEADER, 1015 .max_header = MAX_DCCP_HEADER,
1016 .obj_size = sizeof(struct dccp6_sock), 1016 .obj_size = sizeof(struct dccp6_sock),
1017 .slab_flags = SLAB_DESTROY_BY_RCU, 1017 .slab_flags = SLAB_TYPESAFE_BY_RCU,
1018 .rsk_prot = &dccp6_request_sock_ops, 1018 .rsk_prot = &dccp6_request_sock_ops,
1019 .twsk_prot = &dccp6_timewait_sock_ops, 1019 .twsk_prot = &dccp6_timewait_sock_ops,
1020 .h.hashinfo = &dccp_hashinfo, 1020 .h.hashinfo = &dccp_hashinfo,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3a51582bef55..5ab2aac5ca19 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2395,7 +2395,7 @@ struct proto tcp_prot = {
2395 .sysctl_rmem = sysctl_tcp_rmem, 2395 .sysctl_rmem = sysctl_tcp_rmem,
2396 .max_header = MAX_TCP_HEADER, 2396 .max_header = MAX_TCP_HEADER,
2397 .obj_size = sizeof(struct tcp_sock), 2397 .obj_size = sizeof(struct tcp_sock),
2398 .slab_flags = SLAB_DESTROY_BY_RCU, 2398 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2399 .twsk_prot = &tcp_timewait_sock_ops, 2399 .twsk_prot = &tcp_timewait_sock_ops,
2400 .rsk_prot = &tcp_request_sock_ops, 2400 .rsk_prot = &tcp_request_sock_ops,
2401 .h.hashinfo = &tcp_hashinfo, 2401 .h.hashinfo = &tcp_hashinfo,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index aeb9497b5bb7..7a8237acd210 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1917,7 +1917,7 @@ struct proto tcpv6_prot = {
1917 .sysctl_rmem = sysctl_tcp_rmem, 1917 .sysctl_rmem = sysctl_tcp_rmem,
1918 .max_header = MAX_TCP_HEADER, 1918 .max_header = MAX_TCP_HEADER,
1919 .obj_size = sizeof(struct tcp6_sock), 1919 .obj_size = sizeof(struct tcp6_sock),
1920 .slab_flags = SLAB_DESTROY_BY_RCU, 1920 .slab_flags = SLAB_TYPESAFE_BY_RCU,
1921 .twsk_prot = &tcp6_timewait_sock_ops, 1921 .twsk_prot = &tcp6_timewait_sock_ops,
1922 .rsk_prot = &tcp6_request_sock_ops, 1922 .rsk_prot = &tcp6_request_sock_ops,
1923 .h.hashinfo = &tcp_hashinfo, 1923 .h.hashinfo = &tcp_hashinfo,
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index cb4fff785cbf..8364fe5b59e4 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -142,7 +142,7 @@ static struct proto llc_proto = {
142 .name = "LLC", 142 .name = "LLC",
143 .owner = THIS_MODULE, 143 .owner = THIS_MODULE,
144 .obj_size = sizeof(struct llc_sock), 144 .obj_size = sizeof(struct llc_sock),
145 .slab_flags = SLAB_DESTROY_BY_RCU, 145 .slab_flags = SLAB_TYPESAFE_BY_RCU,
146}; 146};
147 147
148/** 148/**
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 8bc5a1bd2d45..9b02c13d258b 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -506,7 +506,7 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap,
506again: 506again:
507 sk_nulls_for_each_rcu(rc, node, laddr_hb) { 507 sk_nulls_for_each_rcu(rc, node, laddr_hb) {
508 if (llc_estab_match(sap, daddr, laddr, rc)) { 508 if (llc_estab_match(sap, daddr, laddr, rc)) {
509 /* Extra checks required by SLAB_DESTROY_BY_RCU */ 509 /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
510 if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) 510 if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
511 goto again; 511 goto again;
512 if (unlikely(llc_sk(rc)->sap != sap || 512 if (unlikely(llc_sk(rc)->sap != sap ||
@@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap,
565again: 565again:
566 sk_nulls_for_each_rcu(rc, node, laddr_hb) { 566 sk_nulls_for_each_rcu(rc, node, laddr_hb) {
567 if (llc_listener_match(sap, laddr, rc)) { 567 if (llc_listener_match(sap, laddr, rc)) {
568 /* Extra checks required by SLAB_DESTROY_BY_RCU */ 568 /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
569 if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) 569 if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
570 goto again; 570 goto again;
571 if (unlikely(llc_sk(rc)->sap != sap || 571 if (unlikely(llc_sk(rc)->sap != sap ||
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 5404d0d195cc..63b6ab056370 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap,
328again: 328again:
329 sk_nulls_for_each_rcu(rc, node, laddr_hb) { 329 sk_nulls_for_each_rcu(rc, node, laddr_hb) {
330 if (llc_dgram_match(sap, laddr, rc)) { 330 if (llc_dgram_match(sap, laddr, rc)) {
331 /* Extra checks required by SLAB_DESTROY_BY_RCU */ 331 /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
332 if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) 332 if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
333 goto again; 333 goto again;
334 if (unlikely(llc_sk(rc)->sap != sap || 334 if (unlikely(llc_sk(rc)->sap != sap ||
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3c8f1ed2f555..e847dbaa0c6b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -911,7 +911,7 @@ static unsigned int early_drop_list(struct net *net,
911 continue; 911 continue;
912 912
913 /* kill only if still in same netns -- might have moved due to 913 /* kill only if still in same netns -- might have moved due to
914 * SLAB_DESTROY_BY_RCU rules. 914 * SLAB_TYPESAFE_BY_RCU rules.
915 * 915 *
916 * We steal the timer reference. If that fails timer has 916 * We steal the timer reference. If that fails timer has
917 * already fired or someone else deleted it. Just drop ref 917 * already fired or someone else deleted it. Just drop ref
@@ -1114,7 +1114,7 @@ __nf_conntrack_alloc(struct net *net,
1114 1114
1115 /* 1115 /*
1116 * Do not use kmem_cache_zalloc(), as this cache uses 1116 * Do not use kmem_cache_zalloc(), as this cache uses
1117 * SLAB_DESTROY_BY_RCU. 1117 * SLAB_TYPESAFE_BY_RCU.
1118 */ 1118 */
1119 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); 1119 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
1120 if (ct == NULL) 1120 if (ct == NULL)
@@ -1159,7 +1159,7 @@ void nf_conntrack_free(struct nf_conn *ct)
1159 struct net *net = nf_ct_net(ct); 1159 struct net *net = nf_ct_net(ct);
1160 1160
1161 /* A freed object has refcnt == 0, that's 1161 /* A freed object has refcnt == 0, that's
1162 * the golden rule for SLAB_DESTROY_BY_RCU 1162 * the golden rule for SLAB_TYPESAFE_BY_RCU
1163 */ 1163 */
1164 NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); 1164 NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
1165 1165
@@ -1929,7 +1929,7 @@ int nf_conntrack_init_start(void)
1929 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 1929 nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
1930 sizeof(struct nf_conn), 1930 sizeof(struct nf_conn),
1931 NFCT_INFOMASK + 1, 1931 NFCT_INFOMASK + 1,
1932 SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 1932 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
1933 if (!nf_conntrack_cachep) 1933 if (!nf_conntrack_cachep)
1934 goto err_cachep; 1934 goto err_cachep;
1935 1935
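Most of the remaining hunks are the mechanical SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU rename. The llc and conntrack comments above ("Extra checks required by SLAB_TYPESAFE_BY_RCU", "A freed object has refcnt == 0, that's the golden rule") refer to the lookup discipline such caches demand: memory found under rcu_read_lock() may already have been freed and reused for another object of the same type, so the reader must take a reference with atomic_inc_not_zero() and then re-check the object's identity. A hedged sketch of that pattern; demo_hash_find() and demo_put() are hypothetical helpers, not kernel APIs.

    /* Sketch of the reader discipline for a SLAB_TYPESAFE_BY_RCU cache. */
    struct demo_obj {
            atomic_t        refcnt;         /* drops to 0 when the object is freed */
            unsigned long   key;
    };

    static struct demo_obj *demo_lookup(unsigned long key)
    {
            struct demo_obj *obj;

            rcu_read_lock();
    again:
            obj = demo_hash_find(key);              /* hypothetical RCU hash lookup */
            if (obj) {
                    if (!atomic_inc_not_zero(&obj->refcnt))
                            goto again;             /* lost the race: object was freed */
                    if (obj->key != key) {          /* memory reused for another object */
                            demo_put(obj);          /* hypothetical reference release */
                            goto again;
                    }
            }
            rcu_read_unlock();
            return obj;                             /* caller now holds a reference */
    }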
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5b6ee21368a6..6793d7348cc8 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -101,7 +101,7 @@ struct proto smc_proto = {
101 .unhash = smc_unhash_sk, 101 .unhash = smc_unhash_sk,
102 .obj_size = sizeof(struct smc_sock), 102 .obj_size = sizeof(struct smc_sock),
103 .h.smc_hash = &smc_v4_hashinfo, 103 .h.smc_hash = &smc_v4_hashinfo,
104 .slab_flags = SLAB_DESTROY_BY_RCU, 104 .slab_flags = SLAB_TYPESAFE_BY_RCU,
105}; 105};
106EXPORT_SYMBOL_GPL(smc_proto); 106EXPORT_SYMBOL_GPL(smc_proto);
107 107
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index ea6e373edc27..93eede4e8fbe 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -170,7 +170,7 @@ qemu_append="`identify_qemu_append "$QEMU"`"
170# Pull in Kconfig-fragment boot parameters 170# Pull in Kconfig-fragment boot parameters
171boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" 171boot_args="`configfrag_boot_params "$boot_args" "$config_template"`"
172# Generate kernel-version-specific boot parameters 172# Generate kernel-version-specific boot parameters
173boot_args="`per_version_boot_params "$boot_args" $builddir/.config $seconds`" 173boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`"
174 174
175if test -n "$TORTURE_BUILDONLY" 175if test -n "$TORTURE_BUILDONLY"
176then 176then