author	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-10 12:50:55 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-05-10 13:30:46 -0400
commit	de4d195308ad589626571dbe5789cebf9695a204 (patch)
tree	77a6bd6946594ea4e7513aaa73009295530960a1
parent	dc9edaab90de9441cc28ac570b23b0d2bdba7879 (diff)
parent	20652ed6e44f4963281b65209b917be86ac6765b (diff)
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU updates from Ingo Molnar:
"The main changes are:
- Debloat RCU headers
- Parallelize SRCU callback handling (plus overlapping patches)
- Improve the performance of Tree SRCU on a CPU-hotplug stress test
- Documentation updates
- Miscellaneous fixes"
* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (74 commits)
rcu: Open-code the rcu_cblist_n_lazy_cbs() function
rcu: Open-code the rcu_cblist_n_cbs() function
rcu: Open-code the rcu_cblist_empty() function
rcu: Separately compile large rcu_segcblist functions
srcu: Debloat the <linux/rcu_segcblist.h> header
srcu: Adjust default auto-expediting holdoff
srcu: Specify auto-expedite holdoff time
srcu: Expedite first synchronize_srcu() when idle
srcu: Expedited grace periods with reduced memory contention
srcu: Make rcutorture writer stalls print SRCU GP state
srcu: Exact tracking of srcu_data structures containing callbacks
srcu: Make SRCU be built by default
srcu: Fix Kconfig botch when SRCU not selected
rcu: Make non-preemptive schedule be Tasks RCU quiescent state
srcu: Expedite srcu_schedule_cbs_snp() callback invocation
srcu: Parallelize callback handling
kvm: Move srcu_struct fields to end of struct kvm
rcu: Fix typo in PER_RCU_NODE_PERIOD header comment
rcu: Use true/false in assignment to bool
rcu: Use bool value directly
...
75 files changed, 3904 insertions, 1129 deletions
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index f773a264ae02..1672573b037a 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -17,7 +17,7 @@ rcu_dereference.txt | |||
17 | rcubarrier.txt | 17 | rcubarrier.txt |
18 | - RCU and Unloadable Modules | 18 | - RCU and Unloadable Modules |
19 | rculist_nulls.txt | 19 | rculist_nulls.txt |
20 | - RCU list primitives for use with SLAB_DESTROY_BY_RCU | 20 | - RCU list primitives for use with SLAB_TYPESAFE_BY_RCU |
21 | rcuref.txt | 21 | rcuref.txt |
22 | - Reference-count design for elements of lists/arrays protected by RCU | 22 | - Reference-count design for elements of lists/arrays protected by RCU |
23 | rcu.txt | 23 | rcu.txt |
diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
index d583c653a703..38d6d800761f 100644
--- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
@@ -19,6 +19,8 @@ to each other. | |||
19 | The <tt>rcu_state</tt> Structure</a> | 19 | The <tt>rcu_state</tt> Structure</a> |
20 | <li> <a href="#The rcu_node Structure"> | 20 | <li> <a href="#The rcu_node Structure"> |
21 | The <tt>rcu_node</tt> Structure</a> | 21 | The <tt>rcu_node</tt> Structure</a> |
22 | <li> <a href="#The rcu_segcblist Structure"> | ||
23 | The <tt>rcu_segcblist</tt> Structure</a> | ||
22 | <li> <a href="#The rcu_data Structure"> | 24 | <li> <a href="#The rcu_data Structure"> |
23 | The <tt>rcu_data</tt> Structure</a> | 25 | The <tt>rcu_data</tt> Structure</a> |
24 | <li> <a href="#The rcu_dynticks Structure"> | 26 | <li> <a href="#The rcu_dynticks Structure"> |
@@ -841,6 +843,134 @@ for lockdep lock-class names. | |||
841 | Finally, lines 64-66 produce an error if the maximum number of | 843 | Finally, lines 64-66 produce an error if the maximum number of |
842 | CPUs is too large for the specified fanout. | 844 | CPUs is too large for the specified fanout. |
843 | 845 | ||
846 | <h3><a name="The rcu_segcblist Structure"> | ||
847 | The <tt>rcu_segcblist</tt> Structure</a></h3> | ||
848 | |||
849 | The <tt>rcu_segcblist</tt> structure maintains a segmented list of | ||
850 | callbacks as follows: | ||
851 | |||
852 | <pre> | ||
853 | 1 #define RCU_DONE_TAIL 0 | ||
854 | 2 #define RCU_WAIT_TAIL 1 | ||
855 | 3 #define RCU_NEXT_READY_TAIL 2 | ||
856 | 4 #define RCU_NEXT_TAIL 3 | ||
857 | 5 #define RCU_CBLIST_NSEGS 4 | ||
858 | 6 | ||
859 | 7 struct rcu_segcblist { | ||
860 | 8 struct rcu_head *head; | ||
861 | 9 struct rcu_head **tails[RCU_CBLIST_NSEGS]; | ||
862 | 10 unsigned long gp_seq[RCU_CBLIST_NSEGS]; | ||
863 | 11 long len; | ||
864 | 12 long len_lazy; | ||
865 | 13 }; | ||
866 | </pre> | ||
867 | |||
868 | <p> | ||
869 | The segments are as follows: | ||
870 | |||
871 | <ol> | ||
872 | <li> <tt>RCU_DONE_TAIL</tt>: Callbacks whose grace periods have elapsed. | ||
873 | These callbacks are ready to be invoked. | ||
874 | <li> <tt>RCU_WAIT_TAIL</tt>: Callbacks that are waiting for the | ||
875 | current grace period. | ||
876 | Note that different CPUs can have different ideas about which | ||
877 | grace period is current, hence the <tt>->gp_seq</tt> field. | ||
878 | <li> <tt>RCU_NEXT_READY_TAIL</tt>: Callbacks waiting for the next | ||
879 | grace period to start. | ||
880 | <li> <tt>RCU_NEXT_TAIL</tt>: Callbacks that have not yet been | ||
881 | associated with a grace period. | ||
882 | </ol> | ||
883 | |||
884 | <p> | ||
885 | The <tt>->head</tt> pointer references the first callback or | ||
886 | is <tt>NULL</tt> if the list contains no callbacks (which is | ||
887 | <i>not</i> the same as being empty). | ||
888 | Each element of the <tt>->tails[]</tt> array references the | ||
889 | <tt>->next</tt> pointer of the last callback in the corresponding | ||
890 | segment of the list, or the list's <tt>->head</tt> pointer if | ||
891 | that segment and all previous segments are empty. | ||
892 | If the corresponding segment is empty but some previous segment is | ||
893 | not empty, then the array element is identical to its predecessor. | ||
894 | Older callbacks are closer to the head of the list, and new callbacks | ||
895 | are added at the tail. | ||
896 | This relationship between the <tt>->head</tt> pointer, the | ||
897 | <tt>->tails[]</tt> array, and the callbacks is shown in this | ||
898 | diagram: | ||
899 | |||
900 | </p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%"> | ||
901 | |||
902 | </p><p>In this figure, the <tt>->head</tt> pointer references the | ||
903 | first | ||
904 | RCU callback in the list. | ||
905 | The <tt>->tails[RCU_DONE_TAIL]</tt> array element references | ||
906 | the <tt>->head</tt> pointer itself, indicating that none | ||
907 | of the callbacks is ready to invoke. | ||
908 | The <tt>->tails[RCU_WAIT_TAIL]</tt> array element references callback | ||
909 | CB 2's <tt>->next</tt> pointer, which indicates that | ||
910 | CB 1 and CB 2 are both waiting on the current grace period, | ||
911 | give or take possible disagreements about exactly which grace period | ||
912 | is the current one. | ||
913 | The <tt>->tails[RCU_NEXT_READY_TAIL]</tt> array element | ||
914 | references the same RCU callback that <tt>->tails[RCU_WAIT_TAIL]</tt> | ||
915 | does, which indicates that there are no callbacks waiting on the next | ||
916 | RCU grace period. | ||
917 | The <tt>->tails[RCU_NEXT_TAIL]</tt> array element references | ||
918 | CB 4's <tt>->next</tt> pointer, indicating that all the | ||
919 | remaining RCU callbacks have not yet been assigned to an RCU grace | ||
920 | period. | ||
921 | Note that the <tt>->tails[RCU_NEXT_TAIL]</tt> array element | ||
922 | always references the last RCU callback's <tt>->next</tt> pointer | ||
923 | unless the callback list is empty, in which case it references | ||
924 | the <tt>->head</tt> pointer. | ||
925 | |||
926 | <p> | ||
927 | There is one additional important special case for the | ||
928 | <tt>->tails[RCU_NEXT_TAIL]</tt> array element: It can be <tt>NULL</tt> | ||
929 | when this list is <i>disabled</i>. | ||
930 | Lists are disabled when the corresponding CPU is offline or when | ||
931 | the corresponding CPU's callbacks are offloaded to a kthread, | ||
932 | both of which are described elsewhere. | ||
933 | |||
934 | </p><p>CPUs advance their callbacks from the | ||
935 | <tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the | ||
936 | <tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments | ||
937 | as grace periods advance. | ||
938 | |||
939 | </p><p>The <tt>->gp_seq[]</tt> array records grace-period | ||
940 | numbers corresponding to the list segments. | ||
941 | This is what allows different CPUs to have different ideas as to | ||
942 | which is the current grace period while still avoiding premature | ||
943 | invocation of their callbacks. | ||
944 | In particular, this allows CPUs that go idle for extended periods | ||
945 | to determine which of their callbacks are ready to be invoked after | ||
946 | reawakening. | ||
947 | |||
948 | </p><p>The <tt>->len</tt> counter contains the number of | ||
949 | callbacks in <tt>->head</tt>, and the | ||
950 | <tt>->len_lazy</tt> contains the number of those callbacks that | ||
951 | are known to only free memory, and whose invocation can therefore | ||
952 | be safely deferred. | ||
953 | |||
954 | <p><b>Important note</b>: It is the <tt>->len</tt> field that | ||
955 | determines whether or not there are callbacks associated with | ||
956 | this <tt>rcu_segcblist</tt> structure, <i>not</i> the <tt>->head</tt> | ||
957 | pointer. | ||
958 | The reason for this is that all the ready-to-invoke callbacks | ||
959 | (that is, those in the <tt>RCU_DONE_TAIL</tt> segment) are extracted | ||
960 | all at once at callback-invocation time. | ||
961 | If callback invocation must be postponed, for example, because a | ||
962 | high-priority process just woke up on this CPU, then the remaining | ||
963 | callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment. | ||
964 | Either way, the <tt>->len</tt> and <tt>->len_lazy</tt> counts | ||
965 | are adjusted after the corresponding callbacks have been invoked, and so | ||
966 | again it is the <tt>->len</tt> count that accurately reflects whether | ||
967 | or not there are callbacks associated with this <tt>rcu_segcblist</tt> | ||
968 | structure. | ||
969 | Of course, off-CPU sampling of the <tt>->len</tt> count requires | ||
970 | the use of appropriate synchronization, for example, memory barriers. | ||
971 | This synchronization can be a bit subtle, particularly in the case | ||
972 | of <tt>rcu_barrier()</tt>. | ||
973 | |||
844 | <h3><a name="The rcu_data Structure"> | 974 | <h3><a name="The rcu_data Structure"> |
845 | The <tt>rcu_data</tt> Structure</a></h3> | 975 | The <tt>rcu_data</tt> Structure</a></h3> |
846 | 976 | ||
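
As a rough illustration of the ->tails[] bookkeeping described in the hunk above, here is a simplified sketch of enqueueing a callback into a four-segment list. The types, names, and helpers are illustrative only and are not the kernel's actual rcu_segcblist implementation:

    /* Simplified sketch, not the kernel's rcu_segcblist code. */
    #define SEG_DONE        0
    #define SEG_WAIT        1
    #define SEG_NEXT_READY  2
    #define SEG_NEXT        3
    #define NSEGS           4

    struct cb {
            struct cb *next;
            void (*func)(struct cb *cb);
    };

    struct segcblist {
            struct cb *head;
            struct cb **tails[NSEGS];
            long len;
    };

    static void segcblist_init(struct segcblist *l)
    {
            int i;

            l->head = NULL;
            for (i = 0; i < NSEGS; i++)
                    l->tails[i] = &l->head; /* all segments empty: every tail references ->head */
            l->len = 0;
    }

    static void segcblist_enqueue(struct segcblist *l, struct cb *cb)
    {
            cb->next = NULL;
            *l->tails[SEG_NEXT] = cb;       /* link after the current last callback (or ->head) */
            l->tails[SEG_NEXT] = &cb->next; /* SEG_NEXT now ends at the new callback */
            l->len++;
    }

Note how an empty list keeps every ->tails[] element pointing at ->head, which matches the invariant described above: an empty segment's tail is identical to its predecessor's.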
@@ -983,62 +1113,18 @@ choice. | |||
983 | as follows: | 1113 | as follows: |
984 | 1114 | ||
985 | <pre> | 1115 | <pre> |
986 | 1 struct rcu_head *nxtlist; | 1116 | 1 struct rcu_segcblist cblist; |
987 | 2 struct rcu_head **nxttail[RCU_NEXT_SIZE]; | 1117 | 2 long qlen_last_fqs_check; |
988 | 3 unsigned long nxtcompleted[RCU_NEXT_SIZE]; | 1118 | 3 unsigned long n_cbs_invoked; |
989 | 4 long qlen_lazy; | 1119 | 4 unsigned long n_nocbs_invoked; |
990 | 5 long qlen; | 1120 | 5 unsigned long n_cbs_orphaned; |
991 | 6 long qlen_last_fqs_check; | 1121 | 6 unsigned long n_cbs_adopted; |
992 | 7 unsigned long n_force_qs_snap; | 1122 | 7 unsigned long n_force_qs_snap; |
993 | 8 unsigned long n_cbs_invoked; | 1123 | 8 long blimit; |
994 | 9 unsigned long n_cbs_orphaned; | ||
995 | 10 unsigned long n_cbs_adopted; | ||
996 | 11 long blimit; | ||
997 | </pre> | 1124 | </pre> |
998 | 1125 | ||
999 | <p>The <tt>->nxtlist</tt> pointer and the | 1126 | <p>The <tt>->cblist</tt> structure is the segmented callback list |
1000 | <tt>->nxttail[]</tt> array form a four-segment list with | 1127 | described earlier. |
1001 | older callbacks near the head and newer ones near the tail. | ||
1002 | Each segment contains callbacks with the corresponding relationship | ||
1003 | to the current grace period. | ||
1004 | The pointer out of the end of each of the four segments is referenced | ||
1005 | by the element of the <tt>->nxttail[]</tt> array indexed by | ||
1006 | <tt>RCU_DONE_TAIL</tt> (for callbacks handled by a prior grace period), | ||
1007 | <tt>RCU_WAIT_TAIL</tt> (for callbacks waiting on the current grace period), | ||
1008 | <tt>RCU_NEXT_READY_TAIL</tt> (for callbacks that will wait on the next | ||
1009 | grace period), and | ||
1010 | <tt>RCU_NEXT_TAIL</tt> (for callbacks that are not yet associated | ||
1011 | with a specific grace period) | ||
1012 | respectively, as shown in the following figure. | ||
1013 | |||
1014 | </p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%"> | ||
1015 | |||
1016 | </p><p>In this figure, the <tt>->nxtlist</tt> pointer references the | ||
1017 | first | ||
1018 | RCU callback in the list. | ||
1019 | The <tt>->nxttail[RCU_DONE_TAIL]</tt> array element references | ||
1020 | the <tt>->nxtlist</tt> pointer itself, indicating that none | ||
1021 | of the callbacks is ready to invoke. | ||
1022 | The <tt>->nxttail[RCU_WAIT_TAIL]</tt> array element references callback | ||
1023 | CB 2's <tt>->next</tt> pointer, which indicates that | ||
1024 | CB 1 and CB 2 are both waiting on the current grace period. | ||
1025 | The <tt>->nxttail[RCU_NEXT_READY_TAIL]</tt> array element | ||
1026 | references the same RCU callback that <tt>->nxttail[RCU_WAIT_TAIL]</tt> | ||
1027 | does, which indicates that there are no callbacks waiting on the next | ||
1028 | RCU grace period. | ||
1029 | The <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element references | ||
1030 | CB 4's <tt>->next</tt> pointer, indicating that all the | ||
1031 | remaining RCU callbacks have not yet been assigned to an RCU grace | ||
1032 | period. | ||
1033 | Note that the <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element | ||
1034 | always references the last RCU callback's <tt>->next</tt> pointer | ||
1035 | unless the callback list is empty, in which case it references | ||
1036 | the <tt>->nxtlist</tt> pointer. | ||
1037 | |||
1038 | </p><p>CPUs advance their callbacks from the | ||
1039 | <tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the | ||
1040 | <tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments | ||
1041 | as grace periods advance. | ||
1042 | The CPU advances the callbacks in its <tt>rcu_data</tt> structure | 1128 | The CPU advances the callbacks in its <tt>rcu_data</tt> structure |
1043 | whenever it notices that another RCU grace period has completed. | 1129 | whenever it notices that another RCU grace period has completed. |
1044 | The CPU detects the completion of an RCU grace period by noticing | 1130 | The CPU detects the completion of an RCU grace period by noticing |
@@ -1049,16 +1135,7 @@ Recall that each <tt>rcu_node</tt> structure's | |||
1049 | <tt>->completed</tt> field is updated at the end of each | 1135 | <tt>->completed</tt> field is updated at the end of each |
1050 | grace period. | 1136 | grace period. |
1051 | 1137 | ||
1052 | </p><p>The <tt>->nxtcompleted[]</tt> array records grace-period | 1138 | <p> |
1053 | numbers corresponding to the list segments. | ||
1054 | This allows CPUs that go idle for extended periods to determine | ||
1055 | which of their callbacks are ready to be invoked after reawakening. | ||
1056 | |||
1057 | </p><p>The <tt>->qlen</tt> counter contains the number of | ||
1058 | callbacks in <tt>->nxtlist</tt>, and the | ||
1059 | <tt>->qlen_lazy</tt> contains the number of those callbacks that | ||
1060 | are known to only free memory, and whose invocation can therefore | ||
1061 | be safely deferred. | ||
1062 | The <tt>->qlen_last_fqs_check</tt> and | 1139 | The <tt>->qlen_last_fqs_check</tt> and |
1063 | <tt>->n_force_qs_snap</tt> coordinate the forcing of quiescent | 1140 | <tt>->n_force_qs_snap</tt> coordinate the forcing of quiescent |
1064 | states from <tt>call_rcu()</tt> and friends when callback | 1141 | states from <tt>call_rcu()</tt> and friends when callback |
@@ -1069,6 +1146,10 @@ lists grow excessively long. | |||
1069 | fields count the number of callbacks invoked, | 1146 | fields count the number of callbacks invoked, |
1070 | sent to other CPUs when this CPU goes offline, | 1147 | sent to other CPUs when this CPU goes offline, |
1071 | and received from other CPUs when those other CPUs go offline. | 1148 | and received from other CPUs when those other CPUs go offline. |
1149 | The <tt>->n_nocbs_invoked</tt> field is used when the CPU's callbacks | ||
1150 | are offloaded to a kthread. | ||
1151 | |||
1152 | <p> | ||
1072 | Finally, the <tt>->blimit</tt> counter is the maximum number of | 1153 | Finally, the <tt>->blimit</tt> counter is the maximum number of |
1073 | RCU callbacks that may be invoked at a given time. | 1154 | RCU callbacks that may be invoked at a given time. |
1074 | 1155 | ||
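
To make the ->blimit idea described above concrete, here is a simplified, hedged sketch of batch-limited callback invocation; the types and names are illustrative rather than the kernel's:

    /* Simplified sketch of batch-limited callback invocation, not kernel code. */
    struct cb {
            struct cb *next;
            void (*func)(struct cb *cb);
    };

    /*
     * Invoke at most 'blimit' callbacks from the ready list, leaving any
     * leftovers on the list for a later pass.  Returns the number invoked
     * so that the caller can adjust its ->len-style counter afterwards.
     */
    static long invoke_ready_cbs(struct cb **ready, long blimit)
    {
            long invoked = 0;

            while (*ready != NULL && invoked < blimit) {
                    struct cb *cb = *ready;

                    *ready = cb->next;      /* unlink before invoking */
                    cb->func(cb);
                    invoked++;
            }
            return invoked;
    }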
@@ -1104,6 +1185,9 @@ Its fields are as follows: | |||
1104 | 1 int dynticks_nesting; | 1185 | 1 int dynticks_nesting; |
1105 | 2 int dynticks_nmi_nesting; | 1186 | 2 int dynticks_nmi_nesting; |
1106 | 3 atomic_t dynticks; | 1187 | 3 atomic_t dynticks; |
1188 | 4 bool rcu_need_heavy_qs; | ||
1189 | 5 unsigned long rcu_qs_ctr; | ||
1190 | 6 bool rcu_urgent_qs; | ||
1107 | </pre> | 1191 | </pre> |
1108 | 1192 | ||
1109 | <p>The <tt>->dynticks_nesting</tt> field counts the | 1193 | <p>The <tt>->dynticks_nesting</tt> field counts the |
@@ -1117,11 +1201,32 @@ NMIs are counted by the <tt>->dynticks_nmi_nesting</tt> | |||
1117 | field, except that NMIs that interrupt non-dyntick-idle execution | 1201 | field, except that NMIs that interrupt non-dyntick-idle execution |
1118 | are not counted. | 1202 | are not counted. |
1119 | 1203 | ||
1120 | </p><p>Finally, the <tt>->dynticks</tt> field counts the corresponding | 1204 | </p><p>The <tt>->dynticks</tt> field counts the corresponding |
1121 | CPU's transitions to and from dyntick-idle mode, so that this counter | 1205 | CPU's transitions to and from dyntick-idle mode, so that this counter |
1122 | has an even value when the CPU is in dyntick-idle mode and an odd | 1206 | has an even value when the CPU is in dyntick-idle mode and an odd |
1123 | value otherwise. | 1207 | value otherwise. |
1124 | 1208 | ||
1209 | </p><p>The <tt>->rcu_need_heavy_qs</tt> field is used | ||
1210 | to record the fact that the RCU core code would really like to | ||
1211 | see a quiescent state from the corresponding CPU, so much so that | ||
1212 | it is willing to call for heavy-weight dyntick-counter operations. | ||
1213 | This flag is checked by RCU's context-switch and <tt>cond_resched()</tt> | ||
1214 | code, which provide a momentary idle sojourn in response. | ||
1215 | |||
1216 | </p><p>The <tt>->rcu_qs_ctr</tt> field is used to record | ||
1217 | quiescent states from <tt>cond_resched()</tt>. | ||
1218 | Because <tt>cond_resched()</tt> can execute quite frequently, this | ||
1219 | must be quite lightweight, as in a non-atomic increment of this | ||
1220 | per-CPU field. | ||
1221 | |||
1222 | </p><p>Finally, the <tt>->rcu_urgent_qs</tt> field is used to record | ||
1223 | the fact that the RCU core code would really like to see a quiescent | ||
1224 | state from the corresponding CPU, with the various other fields indicating | ||
1225 | just how badly RCU wants this quiescent state. | ||
1226 | This flag is checked by RCU's context-switch and <tt>cond_resched()</tt> | ||
1227 | code, which, if nothing else, non-atomically increment <tt>->rcu_qs_ctr</tt> | ||
1228 | in response. | ||
1229 | |||
1125 | <table> | 1230 | <table> |
1126 | <tr><th> </th></tr> | 1231 | <tr><th> </th></tr> |
1127 | <tr><th align="left">Quick Quiz:</th></tr> | 1232 | <tr><th align="left">Quick Quiz:</th></tr> |
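
The even/odd convention for ->dynticks described above can be illustrated with a small standalone C11 sketch (hypothetical names, not the kernel's implementation): the counter is incremented on every idle transition, so an even value means "idle right now" and any change since a snapshot means the CPU has passed through idle in the meantime.

    /* Hypothetical C11 sketch of an even/odd dyntick-style counter. */
    #include <stdatomic.h>
    #include <stdbool.h>

    struct dynticks {
            atomic_long ctr;                /* even: idle, odd: non-idle */
    };

    static void idle_enter(struct dynticks *d)
    {
            atomic_fetch_add(&d->ctr, 1);   /* odd -> even */
    }

    static void idle_exit(struct dynticks *d)
    {
            atomic_fetch_add(&d->ctr, 1);   /* even -> odd */
    }

    static long dynticks_snap(struct dynticks *d)
    {
            return atomic_load(&d->ctr);
    }

    /* Was the CPU idle at the snapshot, or has it been idle since? */
    static bool idle_since_snap(struct dynticks *d, long snap)
    {
            long cur = atomic_load(&d->ctr);

            return (snap & 1) == 0 || cur != snap;
    }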
diff --git a/Documentation/RCU/Design/Data-Structures/nxtlist.svg b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
index abc4cc73a097..0223e79c38e0 100644
--- a/Documentation/RCU/Design/Data-Structures/nxtlist.svg
+++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
@@ -19,7 +19,7 @@ | |||
19 | id="svg2" | 19 | id="svg2" |
20 | version="1.1" | 20 | version="1.1" |
21 | inkscape:version="0.48.4 r9939" | 21 | inkscape:version="0.48.4 r9939" |
22 | sodipodi:docname="nxtlist.fig"> | 22 | sodipodi:docname="segcblist.svg"> |
23 | <metadata | 23 | <metadata |
24 | id="metadata94"> | 24 | id="metadata94"> |
25 | <rdf:RDF> | 25 | <rdf:RDF> |
@@ -28,7 +28,7 @@ | |||
28 | <dc:format>image/svg+xml</dc:format> | 28 | <dc:format>image/svg+xml</dc:format> |
29 | <dc:type | 29 | <dc:type |
30 | rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> | 30 | rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> |
31 | <dc:title></dc:title> | 31 | <dc:title /> |
32 | </cc:Work> | 32 | </cc:Work> |
33 | </rdf:RDF> | 33 | </rdf:RDF> |
34 | </metadata> | 34 | </metadata> |
@@ -241,61 +241,51 @@ | |||
241 | xml:space="preserve" | 241 | xml:space="preserve" |
242 | x="225" | 242 | x="225" |
243 | y="675" | 243 | y="675" |
244 | fill="#000000" | ||
245 | font-family="Courier" | ||
246 | font-style="normal" | 244 | font-style="normal" |
247 | font-weight="bold" | 245 | font-weight="bold" |
248 | font-size="324" | 246 | font-size="324" |
249 | text-anchor="start" | 247 | id="text64" |
250 | id="text64">nxtlist</text> | 248 | style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->head</text> |
251 | <!-- Text --> | 249 | <!-- Text --> |
252 | <text | 250 | <text |
253 | xml:space="preserve" | 251 | xml:space="preserve" |
254 | x="225" | 252 | x="225" |
255 | y="1800" | 253 | y="1800" |
256 | fill="#000000" | ||
257 | font-family="Courier" | ||
258 | font-style="normal" | 254 | font-style="normal" |
259 | font-weight="bold" | 255 | font-weight="bold" |
260 | font-size="324" | 256 | font-size="324" |
261 | text-anchor="start" | 257 | id="text66" |
262 | id="text66">nxttail[RCU_DONE_TAIL]</text> | 258 | style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_DONE_TAIL]</text> |
263 | <!-- Text --> | 259 | <!-- Text --> |
264 | <text | 260 | <text |
265 | xml:space="preserve" | 261 | xml:space="preserve" |
266 | x="225" | 262 | x="225" |
267 | y="2925" | 263 | y="2925" |
268 | fill="#000000" | ||
269 | font-family="Courier" | ||
270 | font-style="normal" | 264 | font-style="normal" |
271 | font-weight="bold" | 265 | font-weight="bold" |
272 | font-size="324" | 266 | font-size="324" |
273 | text-anchor="start" | 267 | id="text68" |
274 | id="text68">nxttail[RCU_WAIT_TAIL]</text> | 268 | style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_WAIT_TAIL]</text> |
275 | <!-- Text --> | 269 | <!-- Text --> |
276 | <text | 270 | <text |
277 | xml:space="preserve" | 271 | xml:space="preserve" |
278 | x="225" | 272 | x="225" |
279 | y="4050" | 273 | y="4050" |
280 | fill="#000000" | ||
281 | font-family="Courier" | ||
282 | font-style="normal" | 274 | font-style="normal" |
283 | font-weight="bold" | 275 | font-weight="bold" |
284 | font-size="324" | 276 | font-size="324" |
285 | text-anchor="start" | 277 | id="text70" |
286 | id="text70">nxttail[RCU_NEXT_READY_TAIL]</text> | 278 | style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_NEXT_READY_TAIL]</text> |
287 | <!-- Text --> | 279 | <!-- Text --> |
288 | <text | 280 | <text |
289 | xml:space="preserve" | 281 | xml:space="preserve" |
290 | x="225" | 282 | x="225" |
291 | y="5175" | 283 | y="5175" |
292 | fill="#000000" | ||
293 | font-family="Courier" | ||
294 | font-style="normal" | 284 | font-style="normal" |
295 | font-weight="bold" | 285 | font-weight="bold" |
296 | font-size="324" | 286 | font-size="324" |
297 | text-anchor="start" | 287 | id="text72" |
298 | id="text72">nxttail[RCU_NEXT_TAIL]</text> | 288 | style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_NEXT_TAIL]</text> |
299 | <!-- Text --> | 289 | <!-- Text --> |
300 | <text | 290 | <text |
301 | xml:space="preserve" | 291 | xml:space="preserve" |
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
index 7a3194c5559a..e5d0bbd0230b 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
@@ -284,6 +284,7 @@ Expedited Grace Period Refinements</a></h2> | |||
284 | Funnel locking and wait/wakeup</a>. | 284 | Funnel locking and wait/wakeup</a>. |
285 | <li> <a href="#Use of Workqueues">Use of Workqueues</a>. | 285 | <li> <a href="#Use of Workqueues">Use of Workqueues</a>. |
286 | <li> <a href="#Stall Warnings">Stall warnings</a>. | 286 | <li> <a href="#Stall Warnings">Stall warnings</a>. |
287 | <li> <a href="#Mid-Boot Operation">Mid-boot operation</a>. | ||
287 | </ol> | 288 | </ol> |
288 | 289 | ||
289 | <h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3> | 290 | <h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3> |
@@ -524,7 +525,7 @@ their grace periods and carrying out their wakeups. | |||
524 | In earlier implementations, the task requesting the expedited | 525 | In earlier implementations, the task requesting the expedited |
525 | grace period also drove it to completion. | 526 | grace period also drove it to completion. |
526 | This straightforward approach had the disadvantage of needing to | 527 | This straightforward approach had the disadvantage of needing to |
527 | account for signals sent to user tasks, | 528 | account for POSIX signals sent to user tasks, |
528 | so more recent implementations use the Linux kernel's | 529 | so more recent implementations use the Linux kernel's |
529 | <a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>. | 530 | <a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>. |
530 | 531 | ||
@@ -533,8 +534,8 @@ The requesting task still does counter snapshotting and funnel-lock | |||
533 | processing, but the task reaching the top of the funnel lock | 534 | processing, but the task reaching the top of the funnel lock |
534 | does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt> | 535 | does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt> |
535 | so that a workqueue kthread does the actual grace-period processing. | 536 | so that a workqueue kthread does the actual grace-period processing. |
536 | Because workqueue kthreads do not accept signals, grace-period-wait | 537 | Because workqueue kthreads do not accept POSIX signals, grace-period-wait |
537 | processing need not allow for signals. | 538 | processing need not allow for POSIX signals. |
538 | 539 | ||
539 | In addition, this approach allows wakeups for the previous expedited | 540 | In addition, this approach allows wakeups for the previous expedited |
540 | grace period to be overlapped with processing for the next expedited | 541 | grace period to be overlapped with processing for the next expedited |
@@ -586,6 +587,46 @@ blocking the current grace period are printed. | |||
586 | Each stall warning results in another pass through the loop, but the | 587 | Each stall warning results in another pass through the loop, but the |
587 | second and subsequent passes use longer stall times. | 588 | second and subsequent passes use longer stall times. |
588 | 589 | ||
590 | <h3><a name="Mid-Boot Operation">Mid-boot operation</a></h3> | ||
591 | |||
592 | <p> | ||
593 | The use of workqueues has the advantage that the expedited | ||
594 | grace-period code need not worry about POSIX signals. | ||
595 | Unfortunately, it has the | ||
596 | corresponding disadvantage that workqueues cannot be used until | ||
597 | they are initialized, which does not happen until some time after | ||
598 | the scheduler spawns the first task. | ||
599 | Given that there are parts of the kernel that really do want to | ||
600 | execute grace periods during this mid-boot “dead zone”, | ||
601 | expedited grace periods must do something else during this time. | ||
602 | |||
603 | <p> | ||
604 | What they do is to fall back to the old practice of requiring that the | ||
605 | requesting task drive the expedited grace period, as was the case | ||
606 | before the use of workqueues. | ||
607 | However, the requesting task is only required to drive the grace period | ||
608 | during the mid-boot dead zone. | ||
609 | Before mid-boot, a synchronous grace period is a no-op. | ||
610 | Some time after mid-boot, workqueues are used. | ||
611 | |||
612 | <p> | ||
613 | Non-expedited non-SRCU synchronous grace periods must also operate | ||
614 | normally during mid-boot. | ||
615 | This is handled by causing non-expedited grace periods to take the | ||
616 | expedited code path during mid-boot. | ||
617 | |||
618 | <p> | ||
619 | The current code assumes that there are no POSIX signals during | ||
620 | the mid-boot dead zone. | ||
621 | However, if an overwhelming need for POSIX signals somehow arises, | ||
622 | appropriate adjustments can be made to the expedited stall-warning code. | ||
623 | One such adjustment would reinstate the pre-workqueue stall-warning | ||
624 | checks, but only during the mid-boot dead zone. | ||
625 | |||
626 | <p> | ||
627 | With this refinement, synchronous grace periods can now be used from | ||
628 | task context pretty much any time during the life of the kernel. | ||
629 | |||
589 | <h3><a name="Summary"> | 630 | <h3><a name="Summary"> |
590 | Summary</a></h3> | 631 | Summary</a></h3> |
591 | 632 | ||
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index 21593496aca6..f60adf112663 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -659,8 +659,9 @@ systems with more than one CPU: | |||
659 | In other words, a given instance of <tt>synchronize_rcu()</tt> | 659 | In other words, a given instance of <tt>synchronize_rcu()</tt> |
660 | can avoid waiting on a given RCU read-side critical section only | 660 | can avoid waiting on a given RCU read-side critical section only |
661 | if it can prove that <tt>synchronize_rcu()</tt> started first. | 661 | if it can prove that <tt>synchronize_rcu()</tt> started first. |
662 | </font> | ||
662 | 663 | ||
663 | <p> | 664 | <p><font color="ffffff"> |
664 | A related question is “When <tt>rcu_read_lock()</tt> | 665 | A related question is “When <tt>rcu_read_lock()</tt> |
665 | doesn't generate any code, why does it matter how it relates | 666 | doesn't generate any code, why does it matter how it relates |
666 | to a grace period?” | 667 | to a grace period?” |
@@ -675,8 +676,9 @@ systems with more than one CPU: | |||
675 | within the critical section, in which case none of the accesses | 676 | within the critical section, in which case none of the accesses |
676 | within the critical section may observe the effects of any | 677 | within the critical section may observe the effects of any |
677 | access following the grace period. | 678 | access following the grace period. |
679 | </font> | ||
678 | 680 | ||
679 | <p> | 681 | <p><font color="ffffff"> |
680 | As of late 2016, mathematical models of RCU take this | 682 | As of late 2016, mathematical models of RCU take this |
681 | viewpoint, for example, see slides 62 and 63 | 683 | viewpoint, for example, see slides 62 and 63 |
682 | of the | 684 | of the |
@@ -1616,8 +1618,8 @@ CPUs should at least make reasonable forward progress. | |||
1616 | In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> | 1618 | In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> |
1617 | is permitted to impose modest degradation of real-time latency | 1619 | is permitted to impose modest degradation of real-time latency |
1618 | on non-idle online CPUs. | 1620 | on non-idle online CPUs. |
1619 | That said, it will likely be necessary to take further steps to reduce this | 1621 | Here, “modest” means roughly the same latency |
1620 | degradation, hopefully to roughly that of a scheduling-clock interrupt. | 1622 | degradation as a scheduling-clock interrupt. |
1621 | 1623 | ||
1622 | <p> | 1624 | <p> |
1623 | There are a number of situations where even | 1625 | There are a number of situations where even |
@@ -1913,12 +1915,9 @@ This requirement is another factor driving batching of grace periods, | |||
1913 | but it is also the driving force behind the checks for large numbers | 1915 | but it is also the driving force behind the checks for large numbers |
1914 | of queued RCU callbacks in the <tt>call_rcu()</tt> code path. | 1916 | of queued RCU callbacks in the <tt>call_rcu()</tt> code path. |
1915 | Finally, high update rates should not delay RCU read-side critical | 1917 | Finally, high update rates should not delay RCU read-side critical |
1916 | sections, although some read-side delays can occur when using | 1918 | sections, although some small read-side delays can occur when using |
1917 | <tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use | 1919 | <tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use |
1918 | of <tt>try_stop_cpus()</tt>. | 1920 | of <tt>smp_call_function_single()</tt>. |
1919 | (In the future, <tt>synchronize_rcu_expedited()</tt> will be | ||
1920 | converted to use lighter-weight inter-processor interrupts (IPIs), | ||
1921 | but this will still disturb readers, though to a much smaller degree.) | ||
1922 | 1921 | ||
1923 | <p> | 1922 | <p> |
1924 | Although all three of these corner cases were understood in the early | 1923 | Although all three of these corner cases were understood in the early |
@@ -2154,7 +2153,8 @@ as will <tt>rcu_assign_pointer()</tt>. | |||
2154 | <p> | 2153 | <p> |
2155 | Although <tt>call_rcu()</tt> may be invoked at any | 2154 | Although <tt>call_rcu()</tt> may be invoked at any |
2156 | time during boot, callbacks are not guaranteed to be invoked until after | 2155 | time during boot, callbacks are not guaranteed to be invoked until after |
2157 | the scheduler is fully up and running. | 2156 | all of RCU's kthreads have been spawned, which occurs at |
2157 | <tt>early_initcall()</tt> time. | ||
2158 | This delay in callback invocation is due to the fact that RCU does not | 2158 | This delay in callback invocation is due to the fact that RCU does not |
2159 | invoke callbacks until it is fully initialized, and this full initialization | 2159 | invoke callbacks until it is fully initialized, and this full initialization |
2160 | cannot occur until after the scheduler has initialized itself to the | 2160 | cannot occur until after the scheduler has initialized itself to the |
@@ -2167,8 +2167,10 @@ on what operations those callbacks could invoke. | |||
2167 | Perhaps surprisingly, <tt>synchronize_rcu()</tt>, | 2167 | Perhaps surprisingly, <tt>synchronize_rcu()</tt>, |
2168 | <a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> | 2168 | <a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> |
2169 | (<a href="#Bottom-Half Flavor">discussed below</a>), | 2169 | (<a href="#Bottom-Half Flavor">discussed below</a>), |
2170 | and | 2170 | <a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>, |
2171 | <a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> | 2171 | <tt>synchronize_rcu_expedited()</tt>, |
2172 | <tt>synchronize_rcu_bh_expedited()</tt>, and | ||
2173 | <tt>synchronize_sched_expedited()</tt> | ||
2172 | will all operate normally | 2174 | will all operate normally |
2173 | during very early boot, the reason being that there is only one CPU | 2175 | during very early boot, the reason being that there is only one CPU |
2174 | and preemption is disabled. | 2176 | and preemption is disabled. |
@@ -2178,45 +2180,59 @@ state and thus a grace period, so the early-boot implementation can | |||
2178 | be a no-op. | 2180 | be a no-op. |
2179 | 2181 | ||
2180 | <p> | 2182 | <p> |
2181 | Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> | 2183 | However, once the scheduler has spawned its first kthread, this early |
2182 | continue to operate normally through the remainder of boot, courtesy | 2184 | boot trick fails for <tt>synchronize_rcu()</tt> (as well as for |
2183 | of the fact that preemption is disabled across their RCU read-side | 2185 | <tt>synchronize_rcu_expedited()</tt>) in <tt>CONFIG_PREEMPT=y</tt> |
2184 | critical sections and also courtesy of the fact that there is still | 2186 | kernels. |
2185 | only one CPU. | 2187 | The reason is that an RCU read-side critical section might be preempted, |
2186 | However, once the scheduler starts initializing, preemption is enabled. | 2188 | which means that a subsequent <tt>synchronize_rcu()</tt> really does have |
2187 | There is still only a single CPU, but the fact that preemption is enabled | 2189 | to wait for something, as opposed to simply returning immediately. |
2188 | means that the no-op implementation of <tt>synchronize_rcu()</tt> no | 2190 | Unfortunately, <tt>synchronize_rcu()</tt> can't do this until all of |
2189 | longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. | 2191 | its kthreads are spawned, which doesn't happen until some time during |
2190 | Therefore, as soon as the scheduler starts initializing, the early-boot | 2192 | <tt>early_initcalls()</tt> time. |
2191 | fastpath is disabled. | 2193 | But this is no excuse: RCU is nevertheless required to correctly handle |
2192 | This means that <tt>synchronize_rcu()</tt> switches to its runtime | 2194 | synchronous grace periods during this time period. |
2193 | mode of operation where it posts callbacks, which in turn means that | 2195 | Once all of its kthreads are up and running, RCU starts running |
2194 | any call to <tt>synchronize_rcu()</tt> will block until the corresponding | 2196 | normally. |
2195 | callback is invoked. | ||
2196 | Unfortunately, the callback cannot be invoked until RCU's runtime | ||
2197 | grace-period machinery is up and running, which cannot happen until | ||
2198 | the scheduler has initialized itself sufficiently to allow RCU's | ||
2199 | kthreads to be spawned. | ||
2200 | Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler | ||
2201 | initialization can result in deadlock. | ||
2202 | 2197 | ||
2203 | <table> | 2198 | <table> |
2204 | <tr><th> </th></tr> | 2199 | <tr><th> </th></tr> |
2205 | <tr><th align="left">Quick Quiz:</th></tr> | 2200 | <tr><th align="left">Quick Quiz:</th></tr> |
2206 | <tr><td> | 2201 | <tr><td> |
2207 | So what happens with <tt>synchronize_rcu()</tt> during | 2202 | How can RCU possibly handle grace periods before all of its |
2208 | scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> | 2203 | kthreads have been spawned??? |
2209 | kernels? | ||
2210 | </td></tr> | 2204 | </td></tr> |
2211 | <tr><th align="left">Answer:</th></tr> | 2205 | <tr><th align="left">Answer:</th></tr> |
2212 | <tr><td bgcolor="#ffffff"><font color="ffffff"> | 2206 | <tr><td bgcolor="#ffffff"><font color="ffffff"> |
2213 | In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> | 2207 | Very carefully! |
2214 | maps directly to <tt>synchronize_sched()</tt>. | 2208 | </font> |
2215 | Therefore, <tt>synchronize_rcu()</tt> works normally throughout | 2209 | |
2216 | boot in <tt>CONFIG_PREEMPT=n</tt> kernels. | 2210 | <p><font color="ffffff"> |
2217 | However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, | 2211 | During the “dead zone” between the time that the |
2218 | so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> | 2212 | scheduler spawns the first task and the time that all of RCU's |
2219 | during scheduler initialization. | 2213 | kthreads have been spawned, all synchronous grace periods are |
2214 | handled by the expedited grace-period mechanism. | ||
2215 | At runtime, this expedited mechanism relies on workqueues, but | ||
2216 | during the dead zone the requesting task itself drives the | ||
2217 | desired expedited grace period. | ||
2218 | Because dead-zone execution takes place within task context, | ||
2219 | everything works. | ||
2220 | Once the dead zone ends, expedited grace periods go back to | ||
2221 | using workqueues, as is required to avoid problems that would | ||
2222 | otherwise occur when a user task received a POSIX signal while | ||
2223 | driving an expedited grace period. | ||
2224 | </font> | ||
2225 | |||
2226 | <p><font color="ffffff"> | ||
2227 | And yes, this does mean that it is unhelpful to send POSIX | ||
2228 | signals to random tasks between the time that the scheduler | ||
2229 | spawns its first kthread and the time that RCU's kthreads | ||
2230 | have all been spawned. | ||
2231 | If there ever turns out to be a good reason for sending POSIX | ||
2232 | signals during that time, appropriate adjustments will be made. | ||
2233 | (If it turns out that POSIX signals are sent during this time for | ||
2234 | no good reason, other adjustments will be made, appropriate | ||
2235 | or otherwise.) | ||
2220 | </font></td></tr> | 2236 | </font></td></tr> |
2221 | <tr><td> </td></tr> | 2237 | <tr><td> </td></tr> |
2222 | </table> | 2238 | </table> |
@@ -2295,12 +2311,61 @@ situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. | |||
2295 | The need for <tt>rcu_barrier()</tt> for module unloading became | 2311 | The need for <tt>rcu_barrier()</tt> for module unloading became |
2296 | apparent later. | 2312 | apparent later. |
2297 | 2313 | ||
2314 | <p> | ||
2315 | <b>Important note</b>: The <tt>rcu_barrier()</tt> function is not, | ||
2316 | repeat, <i>not</i>, obligated to wait for a grace period. | ||
2317 | It is instead only required to wait for RCU callbacks that have | ||
2318 | already been posted. | ||
2319 | Therefore, if there are no RCU callbacks posted anywhere in the system, | ||
2320 | <tt>rcu_barrier()</tt> is within its rights to return immediately. | ||
2321 | Even if there are callbacks posted, <tt>rcu_barrier()</tt> does not | ||
2322 | necessarily need to wait for a grace period. | ||
2323 | |||
2324 | <table> | ||
2325 | <tr><th> </th></tr> | ||
2326 | <tr><th align="left">Quick Quiz:</th></tr> | ||
2327 | <tr><td> | ||
2328 | Wait a minute! | ||
2329 | Each RCU callback must wait for a grace period to complete, | ||
2330 | and <tt>rcu_barrier()</tt> must wait for each pre-existing | ||
2331 | callback to be invoked. | ||
2332 | Doesn't <tt>rcu_barrier()</tt> therefore need to wait for | ||
2333 | a full grace period if there is even one callback posted anywhere | ||
2334 | in the system? | ||
2335 | </td></tr> | ||
2336 | <tr><th align="left">Answer:</th></tr> | ||
2337 | <tr><td bgcolor="#ffffff"><font color="ffffff"> | ||
2338 | Absolutely not!!! | ||
2339 | </font> | ||
2340 | |||
2341 | <p><font color="ffffff"> | ||
2342 | Yes, each RCU callback must wait for a grace period to complete, | ||
2343 | but it might well be partly (or even completely) finished waiting | ||
2344 | by the time <tt>rcu_barrier()</tt> is invoked. | ||
2345 | In that case, <tt>rcu_barrier()</tt> need only wait for the | ||
2346 | remaining portion of the grace period to elapse. | ||
2347 | So even if there are quite a few callbacks posted, | ||
2348 | <tt>rcu_barrier()</tt> might well return quite quickly. | ||
2349 | </font> | ||
2350 | |||
2351 | <p><font color="ffffff"> | ||
2352 | So if you need to wait for a grace period as well as for all | ||
2353 | pre-existing callbacks, you will need to invoke both | ||
2354 | <tt>synchronize_rcu()</tt> and <tt>rcu_barrier()</tt>. | ||
2355 | If latency is a concern, you can always use workqueues | ||
2356 | to invoke them concurrently. | ||
2357 | </font></td></tr> | ||
2358 | <tr><td> </td></tr> | ||
2359 | </table> | ||
2360 | |||
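
A common shape for the "invoke both synchronize_rcu() and rcu_barrier()" advice above is the module-unload pattern sketched below; my_stop_posting_callbacks() is a hypothetical placeholder for whatever mechanism prevents further call_rcu() invocations:

    #include <linux/module.h>
    #include <linux/rcupdate.h>

    /* Sketch only: my_stop_posting_callbacks() is a hypothetical placeholder. */
    static void __exit my_module_exit(void)
    {
            my_stop_posting_callbacks();    /* ensure no new call_rcu() invocations */
            synchronize_rcu();              /* wait for pre-existing readers */
            rcu_barrier();                  /* wait for already-posted callbacks to finish */
    }
    module_exit(my_module_exit);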
2298 | <h3><a name="Hotplug CPU">Hotplug CPU</a></h3> | 2361 | <h3><a name="Hotplug CPU">Hotplug CPU</a></h3> |
2299 | 2362 | ||
2300 | <p> | 2363 | <p> |
2301 | The Linux kernel supports CPU hotplug, which means that CPUs | 2364 | The Linux kernel supports CPU hotplug, which means that CPUs |
2302 | can come and go. | 2365 | can come and go. |
2303 | It is of course illegal to use any RCU API member from an offline CPU. | 2366 | It is of course illegal to use any RCU API member from an offline CPU, |
2367 | with the exception of <a href="#Sleepable RCU">SRCU</a> read-side | ||
2368 | critical sections. | ||
2304 | This requirement was present from day one in DYNIX/ptx, but | 2369 | This requirement was present from day one in DYNIX/ptx, but |
2305 | on the other hand, the Linux kernel's CPU-hotplug implementation | 2370 | on the other hand, the Linux kernel's CPU-hotplug implementation |
2306 | is “interesting.” | 2371 | is “interesting.” |
@@ -2310,19 +2375,18 @@ The Linux-kernel CPU-hotplug implementation has notifiers that | |||
2310 | are used to allow the various kernel subsystems (including RCU) | 2375 | are used to allow the various kernel subsystems (including RCU) |
2311 | to respond appropriately to a given CPU-hotplug operation. | 2376 | to respond appropriately to a given CPU-hotplug operation. |
2312 | Most RCU operations may be invoked from CPU-hotplug notifiers, | 2377 | Most RCU operations may be invoked from CPU-hotplug notifiers, |
2313 | including even normal synchronous grace-period operations | 2378 | including even synchronous grace-period operations such as |
2314 | such as <tt>synchronize_rcu()</tt>. | 2379 | <tt>synchronize_rcu()</tt> and <tt>synchronize_rcu_expedited()</tt>. |
2315 | However, expedited grace-period operations such as | ||
2316 | <tt>synchronize_rcu_expedited()</tt> are not supported, | ||
2317 | due to the fact that current implementations block CPU-hotplug | ||
2318 | operations, which could result in deadlock. | ||
2319 | 2380 | ||
2320 | <p> | 2381 | <p> |
2321 | In addition, all-callback-wait operations such as | 2382 | However, all-callback-wait operations such as |
2322 | <tt>rcu_barrier()</tt> are also not supported, due to the | 2383 | <tt>rcu_barrier()</tt> are also not supported, due to the |
2323 | fact that there are phases of CPU-hotplug operations where | 2384 | fact that there are phases of CPU-hotplug operations where |
2324 | the outgoing CPU's callbacks will not be invoked until after | 2385 | the outgoing CPU's callbacks will not be invoked until after |
2325 | the CPU-hotplug operation ends, which could also result in deadlock. | 2386 | the CPU-hotplug operation ends, which could also result in deadlock. |
2387 | Furthermore, <tt>rcu_barrier()</tt> blocks CPU-hotplug operations | ||
2388 | during its execution, which results in another type of deadlock | ||
2389 | when invoked from a CPU-hotplug notifier. | ||
2326 | 2390 | ||
2327 | <h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> | 2391 | <h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> |
2328 | 2392 | ||
@@ -2864,6 +2928,27 @@ API, which, in combination with <tt>srcu_read_unlock()</tt>, | |||
2864 | guarantees a full memory barrier. | 2928 | guarantees a full memory barrier. |
2865 | 2929 | ||
2866 | <p> | 2930 | <p> |
2931 | Also unlike other RCU flavors, SRCU's callbacks-wait function | ||
2932 | <tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers, | ||
2933 | though this is not necessarily a good idea. | ||
2934 | The reason that this is possible is that SRCU is insensitive | ||
2935 | to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt> | ||
2936 | need not exclude CPU-hotplug operations. | ||
2937 | |||
2938 | <p> | ||
2939 | As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating | ||
2940 | a locking bottleneck present in prior kernel versions. | ||
2941 | Although this will allow users to put much heavier stress on | ||
2942 | <tt>call_srcu()</tt>, it is important to note that SRCU does not | ||
2943 | yet take any special steps to deal with callback flooding. | ||
2944 | So if you are posting (say) 10,000 SRCU callbacks per second per CPU, | ||
2945 | you are probably totally OK, but if you intend to post (say) 1,000,000 | ||
2946 | SRCU callbacks per second per CPU, please run some tests first. | ||
2947 | SRCU just might need a few adjustments to deal with that sort of load. | ||
2948 | Of course, your mileage may vary based on the speed of your CPUs and | ||
2949 | the size of your memory. | ||
2950 | |||
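
For reference, posting an SRCU callback of the kind discussed above looks roughly like the following sketch; the srcu_struct and the freed structure are illustrative and not taken from the document:

    #include <linux/srcu.h>
    #include <linux/slab.h>

    struct blob {
            struct rcu_head rh;
            int data;
    };

    DEFINE_SRCU(my_srcu);                   /* illustrative srcu_struct */

    static void blob_free_cb(struct rcu_head *rh)
    {
            kfree(container_of(rh, struct blob, rh));
    }

    static void blob_retire(struct blob *b)
    {
            /* Free b after a future SRCU grace period for my_srcu elapses. */
            call_srcu(&my_srcu, &b->rh, blob_free_cb);
    }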
2951 | <p> | ||
2867 | The | 2952 | The |
2868 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> | 2953 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> |
2869 | includes | 2954 | includes |
@@ -3021,8 +3106,8 @@ to do some redesign to avoid this scalability problem. | |||
3021 | 3106 | ||
3022 | <p> | 3107 | <p> |
3023 | RCU disables CPU hotplug in a few places, perhaps most notably in the | 3108 | RCU disables CPU hotplug in a few places, perhaps most notably in the |
3024 | expedited grace-period and <tt>rcu_barrier()</tt> operations. | 3109 | <tt>rcu_barrier()</tt> operations. |
3025 | If there is a strong reason to use expedited grace periods in CPU-hotplug | 3110 | If there is a strong reason to use <tt>rcu_barrier()</tt> in CPU-hotplug |
3026 | notifiers, it will be necessary to avoid disabling CPU hotplug. | 3111 | notifiers, it will be necessary to avoid disabling CPU hotplug. |
3027 | This would introduce some complexity, so there had better be a <i>very</i> | 3112 | This would introduce some complexity, so there had better be a <i>very</i> |
3028 | good reason. | 3113 | good reason. |
@@ -3096,9 +3181,5 @@ Andy Lutomirski for their help in rendering | |||
3096 | this article human readable, and to Michelle Rankin for her support | 3181 | this article human readable, and to Michelle Rankin for her support |
3097 | of this effort. | 3182 | of this effort. |
3098 | Other contributions are acknowledged in the Linux kernel's git archive. | 3183 | Other contributions are acknowledged in the Linux kernel's git archive. |
3099 | The cartoon is copyright (c) 2013 by Melissa Broussard, | ||
3100 | and is provided | ||
3101 | under the terms of the Creative Commons Attribution-Share Alike 3.0 | ||
3102 | United States license. | ||
3103 | 3184 | ||
3104 | </body></html> | 3185 | </body></html> |
diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt
index c0bf2441a2ba..b2a613f16d74 100644
--- a/Documentation/RCU/rcu_dereference.txt
+++ b/Documentation/RCU/rcu_dereference.txt
@@ -138,6 +138,15 @@ o Be very careful about comparing pointers obtained from | |||
138 | This sort of comparison occurs frequently when scanning | 138 | This sort of comparison occurs frequently when scanning |
139 | RCU-protected circular linked lists. | 139 | RCU-protected circular linked lists. |
140 | 140 | ||
141 | Note that if checks for being within an RCU read-side | ||
142 | critical section are not required and the pointer is never | ||
143 | dereferenced, rcu_access_pointer() should be used in place | ||
144 | of rcu_dereference(). The rcu_access_pointer() primitive | ||
145 | does not require an enclosing read-side critical section, | ||
146 | and also omits the smp_read_barrier_depends() included in | ||
147 | rcu_dereference(), which in turn should provide a small | ||
148 | performance gain in some CPUs (e.g., the DEC Alpha). | ||
149 | |||
141 | o The comparison is against a pointer that references memory | 150 | o The comparison is against a pointer that references memory |
142 | that was initialized "a long time ago." The reason | 151 | that was initialized "a long time ago." The reason |
143 | this is safe is that even if misordering occurs, the | 152 | this is safe is that even if misordering occurs, the |
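
To illustrate the rcu_access_pointer() note added above, here is a minimal sketch; gp and struct foo are illustrative, not from the document:

    struct foo { int a; };
    struct foo __rcu *gp;

    /* No dereference, so no RCU read-side critical section is needed. */
    static bool gp_is_set(void)
    {
            return rcu_access_pointer(gp) != NULL;
    }

    /* The pointer is dereferenced, so use rcu_dereference() under rcu_read_lock(). */
    static int gp_read_a(void)
    {
            struct foo *p;
            int ret = -1;

            rcu_read_lock();
            p = rcu_dereference(gp);
            if (p)
                    ret = p->a;
            rcu_read_unlock();
            return ret;
    }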
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
index 18f9651ff23d..8151f0195f76 100644
--- a/Documentation/RCU/rculist_nulls.txt
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -1,5 +1,5 @@ | |||
1 | Using hlist_nulls to protect read-mostly linked lists and | 1 | Using hlist_nulls to protect read-mostly linked lists and |
2 | objects using SLAB_DESTROY_BY_RCU allocations. | 2 | objects using SLAB_TYPESAFE_BY_RCU allocations. |
3 | 3 | ||
4 | Please read the basics in Documentation/RCU/listRCU.txt | 4 | Please read the basics in Documentation/RCU/listRCU.txt |
5 | 5 | ||
@@ -7,7 +7,7 @@ Using special markers (called 'nulls') is a convenient way | |||
7 | to solve following problem : | 7 | to solve following problem : |
8 | 8 | ||
9 | A typical RCU linked list managing objects which are | 9 | A typical RCU linked list managing objects which are |
10 | allocated with SLAB_DESTROY_BY_RCU kmem_cache can | 10 | allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can |
11 | use following algos : | 11 | use following algos : |
12 | 12 | ||
13 | 1) Lookup algo | 13 | 1) Lookup algo |
@@ -96,7 +96,7 @@ unlock_chain(); // typically a spin_unlock() | |||
96 | 3) Remove algo | 96 | 3) Remove algo |
97 | -------------- | 97 | -------------- |
98 | Nothing special here, we can use a standard RCU hlist deletion. | 98 | Nothing special here, we can use a standard RCU hlist deletion. |
99 | But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused | 99 | But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused |
100 | very very fast (before the end of RCU grace period) | 100 | very very fast (before the end of RCU grace period) |
101 | 101 | ||
102 | if (put_last_reference_on(obj) { | 102 | if (put_last_reference_on(obj) { |
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index e93d04133fe7..96a3d81837e1 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -1,9 +1,102 @@ | |||
1 | Using RCU's CPU Stall Detector | 1 | Using RCU's CPU Stall Detector |
2 | 2 | ||
3 | The rcu_cpu_stall_suppress module parameter enables RCU's CPU stall | 3 | This document first discusses what sorts of issues RCU's CPU stall |
4 | detector, which detects conditions that unduly delay RCU grace periods. | 4 | detector can locate, and then discusses kernel parameters and Kconfig |
5 | This module parameter enables CPU stall detection by default, but | 5 | options that can be used to fine-tune the detector's operation. Finally, |
6 | may be overridden via boot-time parameter or at runtime via sysfs. | 6 | this document explains the stall detector's "splat" format. |
7 | |||
8 | |||
9 | What Causes RCU CPU Stall Warnings? | ||
10 | |||
11 | So your kernel printed an RCU CPU stall warning. The next question is | ||
12 | "What caused it?" The following problems can result in RCU CPU stall | ||
13 | warnings: | ||
14 | |||
15 | o A CPU looping in an RCU read-side critical section. | ||
16 | |||
17 | o A CPU looping with interrupts disabled. | ||
18 | |||
19 | o A CPU looping with preemption disabled. This condition can | ||
20 | result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh | ||
21 | stalls. | ||
22 | |||
23 | o A CPU looping with bottom halves disabled. This condition can | ||
24 | result in RCU-sched and RCU-bh stalls. | ||
25 | |||
26 | o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the | ||
27 | kernel without invoking schedule(). Note that cond_resched() | ||
28 | does not necessarily prevent RCU CPU stall warnings. Therefore, | ||
29 | if the looping in the kernel is really expected and desirable | ||
30 | behavior, you might need to replace some of the cond_resched() | ||
31 | calls with calls to cond_resched_rcu_qs(). | ||
32 | |||
33 | o Booting Linux using a console connection that is too slow to | ||
34 | keep up with the boot-time console-message rate. For example, | ||
35 | a 115Kbaud serial console can be -way- too slow to keep up | ||
36 | with boot-time message rates, and will frequently result in | ||
37 | RCU CPU stall warning messages. Especially if you have added | ||
38 | debug printk()s. | ||
39 | |||
40 | o Anything that prevents RCU's grace-period kthreads from running. | ||
41 | This can result in the "All QSes seen" console-log message. | ||
42 | This message will include information on when the kthread last | ||
43 | ran and how often it should be expected to run. | ||
44 | |||
45 | o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might | ||
46 | happen to preempt a low-priority task in the middle of an RCU | ||
47 | read-side critical section. This is especially damaging if | ||
48 | that low-priority task is not permitted to run on any other CPU, | ||
49 | in which case the next RCU grace period can never complete, which | ||
50 | will eventually cause the system to run out of memory and hang. | ||
51 | While the system is in the process of running itself out of | ||
52 | memory, you might see stall-warning messages. | ||
53 | |||
54 | o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that | ||
55 | is running at a higher priority than the RCU softirq threads. | ||
56 | This will prevent RCU callbacks from ever being invoked, | ||
57 | and in a CONFIG_PREEMPT_RCU kernel will further prevent | ||
58 | RCU grace periods from ever completing. Either way, the | ||
59 | system will eventually run out of memory and hang. In the | ||
60 | CONFIG_PREEMPT_RCU case, you might see stall-warning | ||
61 | messages. | ||
62 | |||
63 | o A hardware or software issue shuts off the scheduler-clock | ||
64 | interrupt on a CPU that is not in dyntick-idle mode. This | ||
65 | problem really has happened, and seems to be most likely to | ||
66 | result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. | ||
67 | |||
68 | o A bug in the RCU implementation. | ||
69 | |||
70 | o A hardware failure. This is quite unlikely, but has occurred | ||
71 | at least once in real life. A CPU failed in a running system, | ||
72 | becoming unresponsive, but not causing an immediate crash. | ||
73 | This resulted in a series of RCU CPU stall warnings, eventually | ||
74 | leading to the realization that the CPU had failed. | ||
75 | |||
76 | The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall | ||
77 | warnings. Note that SRCU does -not- have CPU stall warnings. Please note | ||
78 | that RCU only detects CPU stalls when there is a grace period in progress. | ||
79 | No grace period, no CPU stall warnings. | ||
80 | |||
81 | To diagnose the cause of the stall, inspect the stack traces. | ||
82 | The offending function will usually be near the top of the stack. | ||
83 | If you have a series of stall warnings from a single extended stall, | ||
84 | comparing the stack traces can often help determine where the stall | ||
85 | is occurring, which will usually be in the function nearest the top of | ||
86 | that portion of the stack which remains the same from trace to trace. | ||
87 | If you can reliably trigger the stall, ftrace can be quite helpful. | ||
88 | |||
89 | RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE | ||
90 | and with RCU's event tracing. For information on RCU's event tracing, | ||
91 | see include/trace/events/rcu.h. | ||
92 | |||
93 | |||
94 | Fine-Tuning the RCU CPU Stall Detector | ||
95 | |||
96 | The rcupdate.rcu_cpu_stall_suppress module parameter disables RCU's | ||
97 | CPU stall detector, which detects conditions that unduly delay RCU grace | ||
98 | periods. This module parameter enables CPU stall detection by default, | ||
99 | but may be overridden via boot-time parameter or at runtime via sysfs. | ||
7 | The stall detector's idea of what constitutes "unduly delayed" is | 100 | The stall detector's idea of what constitutes "unduly delayed" is |
8 | controlled by a set of kernel configuration variables and cpp macros: | 101 | controlled by a set of kernel configuration variables and cpp macros: |
9 | 102 | ||
@@ -56,6 +149,9 @@ rcupdate.rcu_task_stall_timeout | |||
56 | And continues with the output of sched_show_task() for each | 149 | And continues with the output of sched_show_task() for each |
57 | task stalling the current RCU-tasks grace period. | 150 | task stalling the current RCU-tasks grace period. |
58 | 151 | ||
152 | |||
153 | Interpreting RCU's CPU Stall-Detector "Splats" | ||
154 | |||
59 | For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling, | 155 | For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling, |
60 | it will print a message similar to the following: | 156 | it will print a message similar to the following: |
61 | 157 | ||
@@ -178,89 +274,3 @@ grace period is in flight. | |||
178 | 274 | ||
179 | It is entirely possible to see stall warnings from normal and from | 275 | It is entirely possible to see stall warnings from normal and from |
180 | expedited grace periods at about the same time from the same run. | 276 | expedited grace periods at about the same time from the same run. |
181 | |||
182 | |||
183 | What Causes RCU CPU Stall Warnings? | ||
184 | |||
185 | So your kernel printed an RCU CPU stall warning. The next question is | ||
186 | "What caused it?" The following problems can result in RCU CPU stall | ||
187 | warnings: | ||
188 | |||
189 | o A CPU looping in an RCU read-side critical section. | ||
190 | |||
191 | o A CPU looping with interrupts disabled. This condition can | ||
192 | result in RCU-sched and RCU-bh stalls. | ||
193 | |||
194 | o A CPU looping with preemption disabled. This condition can | ||
195 | result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh | ||
196 | stalls. | ||
197 | |||
198 | o A CPU looping with bottom halves disabled. This condition can | ||
199 | result in RCU-sched and RCU-bh stalls. | ||
200 | |||
201 | o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the | ||
202 | kernel without invoking schedule(). Note that cond_resched() | ||
203 | does not necessarily prevent RCU CPU stall warnings. Therefore, | ||
204 | if the looping in the kernel is really expected and desirable | ||
205 | behavior, you might need to replace some of the cond_resched() | ||
206 | calls with calls to cond_resched_rcu_qs(). | ||
207 | |||
208 | o Booting Linux using a console connection that is too slow to | ||
209 | keep up with the boot-time console-message rate. For example, | ||
210 | a 115Kbaud serial console can be -way- too slow to keep up | ||
211 | with boot-time message rates, and will frequently result in | ||
212 | RCU CPU stall warning messages. Especially if you have added | ||
213 | debug printk()s. | ||
214 | |||
215 | o Anything that prevents RCU's grace-period kthreads from running. | ||
216 | This can result in the "All QSes seen" console-log message. | ||
217 | This message will include information on when the kthread last | ||
218 | ran and how often it should be expected to run. | ||
219 | |||
220 | o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might | ||
221 | happen to preempt a low-priority task in the middle of an RCU | ||
222 | read-side critical section. This is especially damaging if | ||
223 | that low-priority task is not permitted to run on any other CPU, | ||
224 | in which case the next RCU grace period can never complete, which | ||
225 | will eventually cause the system to run out of memory and hang. | ||
226 | While the system is in the process of running itself out of | ||
227 | memory, you might see stall-warning messages. | ||
228 | |||
229 | o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that | ||
230 | is running at a higher priority than the RCU softirq threads. | ||
231 | This will prevent RCU callbacks from ever being invoked, | ||
232 | and in a CONFIG_PREEMPT_RCU kernel will further prevent | ||
233 | RCU grace periods from ever completing. Either way, the | ||
234 | system will eventually run out of memory and hang. In the | ||
235 | CONFIG_PREEMPT_RCU case, you might see stall-warning | ||
236 | messages. | ||
237 | |||
238 | o A hardware or software issue shuts off the scheduler-clock | ||
239 | interrupt on a CPU that is not in dyntick-idle mode. This | ||
240 | problem really has happened, and seems to be most likely to | ||
241 | result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. | ||
242 | |||
243 | o A bug in the RCU implementation. | ||
244 | |||
245 | o A hardware failure. This is quite unlikely, but has occurred | ||
246 | at least once in real life. A CPU failed in a running system, | ||
247 | becoming unresponsive, but not causing an immediate crash. | ||
248 | This resulted in a series of RCU CPU stall warnings, eventually | ||
249 | leading the realization that the CPU had failed. | ||
250 | |||
251 | The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall | ||
252 | warning. Note that SRCU does -not- have CPU stall warnings. Please note | ||
253 | that RCU only detects CPU stalls when there is a grace period in progress. | ||
254 | No grace period, no CPU stall warnings. | ||
255 | |||
256 | To diagnose the cause of the stall, inspect the stack traces. | ||
257 | The offending function will usually be near the top of the stack. | ||
258 | If you have a series of stall warnings from a single extended stall, | ||
259 | comparing the stack traces can often help determine where the stall | ||
260 | is occurring, which will usually be in the function nearest the top of | ||
261 | that portion of the stack which remains the same from trace to trace. | ||
262 | If you can reliably trigger the stall, ftrace can be quite helpful. | ||
263 | |||
264 | RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE | ||
265 | and with RCU's event tracing. For information on RCU's event tracing, | ||
266 | see include/trace/events/rcu.h. | ||
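The cond_resched_rcu_qs() advice in the stall-cause list above is easiest to see in a sketch. The kthread and the do_unit_of_work() helper below are hypothetical, not part of this patch; the point is only that a long-running in-kernel loop on a !CONFIG_PREEMPT kernel may need to report a quiescent state explicitly rather than relying on cond_resched() alone.

	#include <linux/kthread.h>
	#include <linux/rcupdate.h>

	extern void do_unit_of_work(void);	/* hypothetical work item */

	static int long_loop_thread(void *arg)
	{
		while (!kthread_should_stop()) {
			do_unit_of_work();
			/*
			 * cond_resched() does not necessarily report a
			 * quiescent state, so use cond_resched_rcu_qs(),
			 * which reschedules if needed and also tells RCU
			 * that this CPU passed through a quiescent state.
			 */
			cond_resched_rcu_qs();
		}
		return 0;
	}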
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 5cbd8b2395b8..8ed6c9f6133c 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt | |||
@@ -562,7 +562,9 @@ This section presents a "toy" RCU implementation that is based on | |||
562 | familiar locking primitives. Its overhead makes it a non-starter for | 562 | familiar locking primitives. Its overhead makes it a non-starter for |
563 | real-life use, as does its lack of scalability. It is also unsuitable | 563 | real-life use, as does its lack of scalability. It is also unsuitable |
564 | for realtime use, since it allows scheduling latency to "bleed" from | 564 | for realtime use, since it allows scheduling latency to "bleed" from |
565 | one read-side critical section to another. | 565 | one read-side critical section to another. It also assumes recursive |
566 | reader-writer locks: If you try this with non-recursive locks, and | ||
567 | you allow nested rcu_read_lock() calls, you can deadlock. | ||
566 | 568 | ||
567 | However, it is probably the easiest implementation to relate to, so is | 569 | However, it is probably the easiest implementation to relate to, so is |
568 | a good starting point. | 570 | a good starting point. |
@@ -587,20 +589,21 @@ It is extremely simple: | |||
587 | write_unlock(&rcu_gp_mutex); | 589 | write_unlock(&rcu_gp_mutex); |
588 | } | 590 | } |
589 | 591 | ||
590 | [You can ignore rcu_assign_pointer() and rcu_dereference() without | 592 | [You can ignore rcu_assign_pointer() and rcu_dereference() without missing |
591 | missing much. But here they are anyway. And whatever you do, don't | 593 | much. But here are simplified versions anyway. And whatever you do, |
592 | forget about them when submitting patches making use of RCU!] | 594 | don't forget about them when submitting patches making use of RCU!] |
593 | 595 | ||
594 | #define rcu_assign_pointer(p, v) ({ \ | 596 | #define rcu_assign_pointer(p, v) \ |
595 | smp_wmb(); \ | 597 | ({ \ |
596 | (p) = (v); \ | 598 | smp_store_release(&(p), (v)); \ |
597 | }) | 599 | }) |
598 | 600 | ||
599 | #define rcu_dereference(p) ({ \ | 601 | #define rcu_dereference(p) \ |
600 | typeof(p) _________p1 = p; \ | 602 | ({ \ |
601 | smp_read_barrier_depends(); \ | 603 | typeof(p) _________p1 = p; \ |
602 | (_________p1); \ | 604 | smp_read_barrier_depends(); \ |
603 | }) | 605 | (_________p1); \ |
606 | }) | ||
604 | 607 | ||
605 | 608 | ||
606 | The rcu_read_lock() and rcu_read_unlock() primitive read-acquire | 609 | The rcu_read_lock() and rcu_read_unlock() primitive read-acquire |
@@ -925,7 +928,8 @@ d. Do you need RCU grace periods to complete even in the face | |||
925 | 928 | ||
926 | e. Is your workload too update-intensive for normal use of | 929 | e. Is your workload too update-intensive for normal use of |
927 | RCU, but inappropriate for other synchronization mechanisms? | 930 | RCU, but inappropriate for other synchronization mechanisms? |
928 | If so, consider SLAB_DESTROY_BY_RCU. But please be careful! | 931 | If so, consider SLAB_TYPESAFE_BY_RCU (which was originally |
932 | named SLAB_DESTROY_BY_RCU). But please be careful! | ||
929 | 933 | ||
930 | f. Do you need read-side critical sections that are respected | 934 | f. Do you need read-side critical sections that are respected |
931 | even though they are in the middle of the idle loop, during | 935 | even though they are in the middle of the idle loop, during |
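The simplified rcu_assign_pointer() and rcu_dereference() shown in the hunk above are easiest to relate to through a publish/subscribe pair built on the toy implementation. The struct foo, the gp pointer, and the function names are illustrative only, in the spirit of the other whatisRCU.txt examples:

	struct foo {
		int a;
	};
	static struct foo __rcu *gp;

	void publish_foo(struct foo *newp)
	{
		newp->a = 42;
		rcu_assign_pointer(gp, newp);	/* order init before publication */
	}

	int read_foo(void)
	{
		struct foo *p;
		int ret = -1;

		rcu_read_lock();
		p = rcu_dereference(gp);	/* dependency-ordered load */
		if (p)
			ret = p->a;
		rcu_read_unlock();
		return ret;
	}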
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 130e7ecaf9a6..4e0654b56aef 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt | |||
@@ -3800,6 +3800,14 @@ | |||
3800 | spia_pedr= | 3800 | spia_pedr= |
3801 | spia_peddr= | 3801 | spia_peddr= |
3802 | 3802 | ||
3803 | srcutree.exp_holdoff [KNL] | ||
3804 | Specifies how many nanoseconds must elapse | ||
3805 | since the end of the last SRCU grace period for | ||
3806 | a given srcu_struct until the next normal SRCU | ||
3807 | grace period will be considered for automatic | ||
3808 | expediting. Set to zero to disable automatic | ||
3809 | expediting. | ||
3810 | |||
3803 | stacktrace [FTRACE] | 3811 | stacktrace [FTRACE] |
3804 | Enabled the stack tracer on boot up. | 3812 | Enabled the stack tracer on boot up. |
3805 | 3813 | ||
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index d2b0a8d81258..08329cb857ed 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt | |||
@@ -768,7 +768,7 @@ equal to zero, in which case the compiler is within its rights to | |||
768 | transform the above code into the following: | 768 | transform the above code into the following: |
769 | 769 | ||
770 | q = READ_ONCE(a); | 770 | q = READ_ONCE(a); |
771 | WRITE_ONCE(b, 1); | 771 | WRITE_ONCE(b, 2); |
772 | do_something_else(); | 772 | do_something_else(); |
773 | 773 | ||
774 | Given this transformation, the CPU is not required to respect the ordering | 774 | Given this transformation, the CPU is not required to respect the ordering |
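For context, the example that this hunk corrects reads roughly as follows (reconstructed from the surrounding memory-barriers.txt text, so treat the exact wording as approximate): with MAX defined to 1, (q % MAX) is always zero, so only the else branch can execute, and since both branches store 2 to b, the compiler may hoist that store above the branch. The transformed snippet in the hunk must therefore store 2, not 1.

	q = READ_ONCE(a);
	if (q % MAX) {
		WRITE_ONCE(b, 2);
		do_something();
	} else {
		WRITE_ONCE(b, 2);
		do_something_else();
	}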
diff --git a/arch/Kconfig b/arch/Kconfig index dcbd462b68b1..6c00e5b00f8b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig | |||
@@ -324,6 +324,9 @@ config HAVE_CMPXCHG_LOCAL | |||
324 | config HAVE_CMPXCHG_DOUBLE | 324 | config HAVE_CMPXCHG_DOUBLE |
325 | bool | 325 | bool |
326 | 326 | ||
327 | config ARCH_WEAK_RELEASE_ACQUIRE | ||
328 | bool | ||
329 | |||
327 | config ARCH_WANT_IPC_PARSE_VERSION | 330 | config ARCH_WANT_IPC_PARSE_VERSION |
328 | bool | 331 | bool |
329 | 332 | ||
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d8834e8bfb05..964da1891ea9 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig | |||
@@ -146,6 +146,7 @@ config PPC | |||
146 | select ARCH_USE_BUILTIN_BSWAP | 146 | select ARCH_USE_BUILTIN_BSWAP |
147 | select ARCH_USE_CMPXCHG_LOCKREF if PPC64 | 147 | select ARCH_USE_CMPXCHG_LOCKREF if PPC64 |
148 | select ARCH_WANT_IPC_PARSE_VERSION | 148 | select ARCH_WANT_IPC_PARSE_VERSION |
149 | select ARCH_WEAK_RELEASE_ACQUIRE | ||
149 | select BINFMT_ELF | 150 | select BINFMT_ELF |
150 | select BUILDTIME_EXTABLE_SORT | 151 | select BUILDTIME_EXTABLE_SORT |
151 | select CLONE_BACKWARDS | 152 | select CLONE_BACKWARDS |
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 532a577ff7a1..b6ac3df18b58 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c | |||
@@ -4789,7 +4789,7 @@ i915_gem_load_init(struct drm_i915_private *dev_priv) | |||
4789 | dev_priv->requests = KMEM_CACHE(drm_i915_gem_request, | 4789 | dev_priv->requests = KMEM_CACHE(drm_i915_gem_request, |
4790 | SLAB_HWCACHE_ALIGN | | 4790 | SLAB_HWCACHE_ALIGN | |
4791 | SLAB_RECLAIM_ACCOUNT | | 4791 | SLAB_RECLAIM_ACCOUNT | |
4792 | SLAB_DESTROY_BY_RCU); | 4792 | SLAB_TYPESAFE_BY_RCU); |
4793 | if (!dev_priv->requests) | 4793 | if (!dev_priv->requests) |
4794 | goto err_vmas; | 4794 | goto err_vmas; |
4795 | 4795 | ||
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index a211c53c813f..129c58bb4805 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h | |||
@@ -521,7 +521,7 @@ static inline struct drm_i915_gem_request * | |||
521 | __i915_gem_active_get_rcu(const struct i915_gem_active *active) | 521 | __i915_gem_active_get_rcu(const struct i915_gem_active *active) |
522 | { | 522 | { |
523 | /* Performing a lockless retrieval of the active request is super | 523 | /* Performing a lockless retrieval of the active request is super |
524 | * tricky. SLAB_DESTROY_BY_RCU merely guarantees that the backing | 524 | * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing |
525 | * slab of request objects will not be freed whilst we hold the | 525 | * slab of request objects will not be freed whilst we hold the |
526 | * RCU read lock. It does not guarantee that the request itself | 526 | * RCU read lock. It does not guarantee that the request itself |
527 | * will not be freed and then *reused*. Viz, | 527 | * will not be freed and then *reused*. Viz, |
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c index 6a8258eacdcb..9f24c5da3f8d 100644 --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c | |||
@@ -174,7 +174,7 @@ struct drm_i915_private *mock_gem_device(void) | |||
174 | i915->requests = KMEM_CACHE(mock_request, | 174 | i915->requests = KMEM_CACHE(mock_request, |
175 | SLAB_HWCACHE_ALIGN | | 175 | SLAB_HWCACHE_ALIGN | |
176 | SLAB_RECLAIM_ACCOUNT | | 176 | SLAB_RECLAIM_ACCOUNT | |
177 | SLAB_DESTROY_BY_RCU); | 177 | SLAB_TYPESAFE_BY_RCU); |
178 | if (!i915->requests) | 178 | if (!i915->requests) |
179 | goto err_vmas; | 179 | goto err_vmas; |
180 | 180 | ||
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c index 6f9d540a97ce..fff930fc3cff 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c | |||
@@ -1115,7 +1115,7 @@ int ldlm_init(void) | |||
1115 | ldlm_lock_slab = kmem_cache_create("ldlm_locks", | 1115 | ldlm_lock_slab = kmem_cache_create("ldlm_locks", |
1116 | sizeof(struct ldlm_lock), 0, | 1116 | sizeof(struct ldlm_lock), 0, |
1117 | SLAB_HWCACHE_ALIGN | | 1117 | SLAB_HWCACHE_ALIGN | |
1118 | SLAB_DESTROY_BY_RCU, NULL); | 1118 | SLAB_TYPESAFE_BY_RCU, NULL); |
1119 | if (!ldlm_lock_slab) { | 1119 | if (!ldlm_lock_slab) { |
1120 | kmem_cache_destroy(ldlm_resource_slab); | 1120 | kmem_cache_destroy(ldlm_resource_slab); |
1121 | return -ENOMEM; | 1121 | return -ENOMEM; |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5a0245e36240..ebad34266bcf 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -2363,7 +2363,7 @@ static int jbd2_journal_init_journal_head_cache(void) | |||
2363 | jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", | 2363 | jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", |
2364 | sizeof(struct journal_head), | 2364 | sizeof(struct journal_head), |
2365 | 0, /* offset */ | 2365 | 0, /* offset */ |
2366 | SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, | 2366 | SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, |
2367 | NULL); /* ctor */ | 2367 | NULL); /* ctor */ |
2368 | retval = 0; | 2368 | retval = 0; |
2369 | if (!jbd2_journal_head_cache) { | 2369 | if (!jbd2_journal_head_cache) { |
diff --git a/fs/signalfd.c b/fs/signalfd.c index 270221fcef42..7e3d71109f51 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c | |||
@@ -38,7 +38,7 @@ void signalfd_cleanup(struct sighand_struct *sighand) | |||
38 | /* | 38 | /* |
39 | * The lockless check can race with remove_wait_queue() in progress, | 39 | * The lockless check can race with remove_wait_queue() in progress, |
40 | * but in this case its caller should run under rcu_read_lock() and | 40 | * but in this case its caller should run under rcu_read_lock() and |
41 | * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. | 41 | * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return. |
42 | */ | 42 | */ |
43 | if (likely(!waitqueue_active(wqh))) | 43 | if (likely(!waitqueue_active(wqh))) |
44 | return; | 44 | return; |
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 6048fa404e57..a5195a7d6f77 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h | |||
@@ -229,7 +229,7 @@ static inline struct dma_fence *dma_fence_get_rcu(struct dma_fence *fence) | |||
229 | * | 229 | * |
230 | * Function returns NULL if no refcount could be obtained, or the fence. | 230 | * Function returns NULL if no refcount could be obtained, or the fence. |
231 | * This function handles acquiring a reference to a fence that may be | 231 | * This function handles acquiring a reference to a fence that may be |
232 | * reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU), | 232 | * reallocated within the RCU grace period (such as with SLAB_TYPESAFE_BY_RCU), |
233 | * so long as the caller is using RCU on the pointer to the fence. | 233 | * so long as the caller is using RCU on the pointer to the fence. |
234 | * | 234 | * |
235 | * An alternative mechanism is to employ a seqlock to protect a bunch of | 235 | * An alternative mechanism is to employ a seqlock to protect a bunch of |
@@ -257,7 +257,7 @@ dma_fence_get_rcu_safe(struct dma_fence * __rcu *fencep) | |||
257 | * have successfully acquire a reference to it. If it no | 257 | * have successfully acquire a reference to it. If it no |
258 | * longer matches, we are holding a reference to some other | 258 | * longer matches, we are holding a reference to some other |
259 | * reallocated pointer. This is possible if the allocator | 259 | * reallocated pointer. This is possible if the allocator |
260 | * is using a freelist like SLAB_DESTROY_BY_RCU where the | 260 | * is using a freelist like SLAB_TYPESAFE_BY_RCU where the |
261 | * fence remains valid for the RCU grace period, but it | 261 | * fence remains valid for the RCU grace period, but it |
262 | * may be reallocated. When using such allocators, we are | 262 | * may be reallocated. When using such allocators, we are |
263 | * responsible for ensuring the reference we get is to | 263 | * responsible for ensuring the reference we get is to |
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4d629471869b..2b12b2683359 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
@@ -384,8 +384,6 @@ struct kvm { | |||
384 | struct mutex slots_lock; | 384 | struct mutex slots_lock; |
385 | struct mm_struct *mm; /* userspace tied to this vm */ | 385 | struct mm_struct *mm; /* userspace tied to this vm */ |
386 | struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; | 386 | struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; |
387 | struct srcu_struct srcu; | ||
388 | struct srcu_struct irq_srcu; | ||
389 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; | 387 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; |
390 | 388 | ||
391 | /* | 389 | /* |
@@ -438,6 +436,8 @@ struct kvm { | |||
438 | struct list_head devices; | 436 | struct list_head devices; |
439 | struct dentry *debugfs_dentry; | 437 | struct dentry *debugfs_dentry; |
440 | struct kvm_stat_data **debugfs_stat_data; | 438 | struct kvm_stat_data **debugfs_stat_data; |
439 | struct srcu_struct srcu; | ||
440 | struct srcu_struct irq_srcu; | ||
441 | }; | 441 | }; |
442 | 442 | ||
443 | #define kvm_err(fmt, ...) \ | 443 | #define kvm_err(fmt, ...) \ |
diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h new file mode 100644 index 000000000000..4b766b61e1a0 --- /dev/null +++ b/include/linux/rcu_node_tree.h | |||
@@ -0,0 +1,99 @@ | |||
1 | /* | ||
2 | * RCU node combining tree definitions. These are used to compute | ||
3 | * global attributes while avoiding common-case global contention. A key | ||
4 | * property that these computations rely on is a tournament-style approach | ||
5 | * where only one of the tasks contending a lower level in the tree need | ||
6 | * advance to the next higher level. If properly configured, this allows | ||
7 | * unlimited scalability while maintaining a constant level of contention | ||
8 | * on the root node. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; either version 2 of the License, or | ||
13 | * (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | * GNU General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program; if not, you can access it online at | ||
22 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
23 | * | ||
24 | * Copyright IBM Corporation, 2017 | ||
25 | * | ||
26 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
27 | */ | ||
28 | |||
29 | #ifndef __LINUX_RCU_NODE_TREE_H | ||
30 | #define __LINUX_RCU_NODE_TREE_H | ||
31 | |||
32 | /* | ||
33 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and | ||
34 | * CONFIG_RCU_FANOUT_LEAF. | ||
35 | * In theory, it should be possible to add more levels straightforwardly. | ||
36 | * In practice, this did work well going from three levels to four. | ||
37 | * Of course, your mileage may vary. | ||
38 | */ | ||
39 | |||
40 | #ifdef CONFIG_RCU_FANOUT | ||
41 | #define RCU_FANOUT CONFIG_RCU_FANOUT | ||
42 | #else /* #ifdef CONFIG_RCU_FANOUT */ | ||
43 | # ifdef CONFIG_64BIT | ||
44 | # define RCU_FANOUT 64 | ||
45 | # else | ||
46 | # define RCU_FANOUT 32 | ||
47 | # endif | ||
48 | #endif /* #else #ifdef CONFIG_RCU_FANOUT */ | ||
49 | |||
50 | #ifdef CONFIG_RCU_FANOUT_LEAF | ||
51 | #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF | ||
52 | #else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ | ||
53 | #define RCU_FANOUT_LEAF 16 | ||
54 | #endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ | ||
55 | |||
56 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) | ||
57 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) | ||
58 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) | ||
59 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) | ||
60 | |||
61 | #if NR_CPUS <= RCU_FANOUT_1 | ||
62 | # define RCU_NUM_LVLS 1 | ||
63 | # define NUM_RCU_LVL_0 1 | ||
64 | # define NUM_RCU_NODES NUM_RCU_LVL_0 | ||
65 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } | ||
66 | # define RCU_NODE_NAME_INIT { "rcu_node_0" } | ||
67 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } | ||
68 | #elif NR_CPUS <= RCU_FANOUT_2 | ||
69 | # define RCU_NUM_LVLS 2 | ||
70 | # define NUM_RCU_LVL_0 1 | ||
71 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
72 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) | ||
73 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } | ||
74 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } | ||
75 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } | ||
76 | #elif NR_CPUS <= RCU_FANOUT_3 | ||
77 | # define RCU_NUM_LVLS 3 | ||
78 | # define NUM_RCU_LVL_0 1 | ||
79 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | ||
80 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
81 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) | ||
82 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } | ||
83 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } | ||
84 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } | ||
85 | #elif NR_CPUS <= RCU_FANOUT_4 | ||
86 | # define RCU_NUM_LVLS 4 | ||
87 | # define NUM_RCU_LVL_0 1 | ||
88 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) | ||
89 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | ||
90 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
91 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) | ||
92 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } | ||
93 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } | ||
94 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } | ||
95 | #else | ||
96 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" | ||
97 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ | ||
98 | |||
99 | #endif /* __LINUX_RCU_NODE_TREE_H */ | ||
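To make the geometry defined above concrete, here is a worked example under an assumed configuration; the numbers are illustrative, not defaults for any particular architecture.

	/*
	 * Assume NR_CPUS = 1024, RCU_FANOUT = 64, RCU_FANOUT_LEAF = 16:
	 *
	 *   RCU_FANOUT_1 = 16
	 *   RCU_FANOUT_2 = 16 * 64 = 1024
	 *
	 * NR_CPUS <= RCU_FANOUT_2, so RCU_NUM_LVLS = 2:
	 *
	 *   NUM_RCU_LVL_0 = 1                            (the root)
	 *   NUM_RCU_LVL_1 = DIV_ROUND_UP(1024, 16) = 64  (the leaves)
	 *   NUM_RCU_NODES = 1 + 64 = 65
	 *
	 * Each leaf covers at most 16 CPUs and the root has 64 children,
	 * so contention on the root stays bounded as the CPU count grows.
	 */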
diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h new file mode 100644 index 000000000000..ba4d2621d9ca --- /dev/null +++ b/include/linux/rcu_segcblist.h | |||
@@ -0,0 +1,90 @@ | |||
1 | /* | ||
2 | * RCU segmented callback lists | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, you can access it online at | ||
16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2017 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
21 | */ | ||
22 | |||
23 | #ifndef __INCLUDE_LINUX_RCU_SEGCBLIST_H | ||
24 | #define __INCLUDE_LINUX_RCU_SEGCBLIST_H | ||
25 | |||
26 | /* Simple unsegmented callback lists. */ | ||
27 | struct rcu_cblist { | ||
28 | struct rcu_head *head; | ||
29 | struct rcu_head **tail; | ||
30 | long len; | ||
31 | long len_lazy; | ||
32 | }; | ||
33 | |||
34 | #define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } | ||
35 | |||
36 | /* Complicated segmented callback lists. ;-) */ | ||
37 | |||
38 | /* | ||
39 | * Index values for segments in rcu_segcblist structure. | ||
40 | * | ||
41 | * The segments are as follows: | ||
42 | * | ||
43 | * [head, *tails[RCU_DONE_TAIL]): | ||
44 | * Callbacks whose grace period has elapsed, and thus can be invoked. | ||
45 | * [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]): | ||
46 | * Callbacks waiting for the current GP from the current CPU's viewpoint. | ||
47 | * [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]): | ||
48 | * Callbacks that arrived before the next GP started, again from | ||
49 | * the current CPU's viewpoint. These can be handled by the next GP. | ||
50 | * [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]): | ||
51 | * Callbacks that might have arrived after the next GP started. | ||
52 | * There is some uncertainty as to when a given GP starts and | ||
53 | * ends, but a CPU knows the exact times if it is the one starting | ||
54 | * or ending the GP. Other CPUs know that the previous GP ends | ||
55 | * before the next one starts. | ||
56 | * | ||
57 | * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also | ||
58 | * empty. | ||
59 | * | ||
60 | * The ->gp_seq[] array contains the grace-period number at which the | ||
61 | * corresponding segment of callbacks will be ready to invoke. A given | ||
62 | * element of this array is meaningful only when the corresponding segment | ||
63 | * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks | ||
64 | * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have | ||
65 | * not yet been assigned a grace-period number). | ||
66 | */ | ||
67 | #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ | ||
68 | #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ | ||
69 | #define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ | ||
70 | #define RCU_NEXT_TAIL 3 | ||
71 | #define RCU_CBLIST_NSEGS 4 | ||
72 | |||
73 | struct rcu_segcblist { | ||
74 | struct rcu_head *head; | ||
75 | struct rcu_head **tails[RCU_CBLIST_NSEGS]; | ||
76 | unsigned long gp_seq[RCU_CBLIST_NSEGS]; | ||
77 | long len; | ||
78 | long len_lazy; | ||
79 | }; | ||
80 | |||
81 | #define RCU_SEGCBLIST_INITIALIZER(n) \ | ||
82 | { \ | ||
83 | .head = NULL, \ | ||
84 | .tails[RCU_DONE_TAIL] = &n.head, \ | ||
85 | .tails[RCU_WAIT_TAIL] = &n.head, \ | ||
86 | .tails[RCU_NEXT_READY_TAIL] = &n.head, \ | ||
87 | .tails[RCU_NEXT_TAIL] = &n.head, \ | ||
88 | } | ||
89 | |||
90 | #endif /* __INCLUDE_LINUX_RCU_SEGCBLIST_H */ | ||
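As a small usage sketch (my_cbs and my_seg_cbs are illustrative names, not from this patch), a statically defined list starts out with every tail pointer referencing ->head, that is, with all segments empty:

	#include <linux/rcu_segcblist.h>

	/* Unsegmented list: NULL head, tail pointing at ->head, counts zero. */
	static struct rcu_cblist my_cbs = RCU_CBLIST_INITIALIZER(my_cbs);

	/* Segmented list: the DONE, WAIT, NEXT_READY, and NEXT tails all
	 * reference ->head, so all four segments are initially empty. */
	static struct rcu_segcblist my_seg_cbs =
		RCU_SEGCBLIST_INITIALIZER(my_seg_cbs);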
diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4f7a9561b8c4..b1fd8bf85fdc 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h | |||
@@ -509,7 +509,8 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, | |||
509 | { | 509 | { |
510 | struct hlist_node *i, *last = NULL; | 510 | struct hlist_node *i, *last = NULL; |
511 | 511 | ||
512 | for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) | 512 | /* Note: write side code, so rcu accessors are not needed. */ |
513 | for (i = h->first; i; i = i->next) | ||
513 | last = i; | 514 | last = i; |
514 | 515 | ||
515 | if (last) { | 516 | if (last) { |
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index dea8f17b2fe3..e1e5d002fdb9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h | |||
@@ -368,15 +368,20 @@ static inline void rcu_init_nohz(void) | |||
368 | #ifdef CONFIG_TASKS_RCU | 368 | #ifdef CONFIG_TASKS_RCU |
369 | #define TASKS_RCU(x) x | 369 | #define TASKS_RCU(x) x |
370 | extern struct srcu_struct tasks_rcu_exit_srcu; | 370 | extern struct srcu_struct tasks_rcu_exit_srcu; |
371 | #define rcu_note_voluntary_context_switch(t) \ | 371 | #define rcu_note_voluntary_context_switch_lite(t) \ |
372 | do { \ | 372 | do { \ |
373 | rcu_all_qs(); \ | ||
374 | if (READ_ONCE((t)->rcu_tasks_holdout)) \ | 373 | if (READ_ONCE((t)->rcu_tasks_holdout)) \ |
375 | WRITE_ONCE((t)->rcu_tasks_holdout, false); \ | 374 | WRITE_ONCE((t)->rcu_tasks_holdout, false); \ |
376 | } while (0) | 375 | } while (0) |
376 | #define rcu_note_voluntary_context_switch(t) \ | ||
377 | do { \ | ||
378 | rcu_all_qs(); \ | ||
379 | rcu_note_voluntary_context_switch_lite(t); \ | ||
380 | } while (0) | ||
377 | #else /* #ifdef CONFIG_TASKS_RCU */ | 381 | #else /* #ifdef CONFIG_TASKS_RCU */ |
378 | #define TASKS_RCU(x) do { } while (0) | 382 | #define TASKS_RCU(x) do { } while (0) |
379 | #define rcu_note_voluntary_context_switch(t) rcu_all_qs() | 383 | #define rcu_note_voluntary_context_switch_lite(t) do { } while (0) |
384 | #define rcu_note_voluntary_context_switch(t) rcu_all_qs() | ||
380 | #endif /* #else #ifdef CONFIG_TASKS_RCU */ | 385 | #endif /* #else #ifdef CONFIG_TASKS_RCU */ |
381 | 386 | ||
382 | /** | 387 | /** |
@@ -1132,11 +1137,11 @@ do { \ | |||
1132 | * if the UNLOCK and LOCK are executed by the same CPU or if the | 1137 | * if the UNLOCK and LOCK are executed by the same CPU or if the |
1133 | * UNLOCK and LOCK operate on the same lock variable. | 1138 | * UNLOCK and LOCK operate on the same lock variable. |
1134 | */ | 1139 | */ |
1135 | #ifdef CONFIG_PPC | 1140 | #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE |
1136 | #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ | 1141 | #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ |
1137 | #else /* #ifdef CONFIG_PPC */ | 1142 | #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ |
1138 | #define smp_mb__after_unlock_lock() do { } while (0) | 1143 | #define smp_mb__after_unlock_lock() do { } while (0) |
1139 | #endif /* #else #ifdef CONFIG_PPC */ | 1144 | #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ |
1140 | 1145 | ||
1141 | 1146 | ||
1142 | #endif /* __LINUX_RCUPDATE_H */ | 1147 | #endif /* __LINUX_RCUPDATE_H */ |
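The smp_mb__after_unlock_lock() hunk above only changes the Kconfig condition under which the macro expands to smp_mb(); the usage pattern is unchanged. A minimal sketch under assumed lock names (this is not code from the patch):

	#include <linux/spinlock.h>
	#include <linux/rcupdate.h>

	static void example_unlock_lock(spinlock_t *a, spinlock_t *b)
	{
		spin_unlock(a);
		spin_lock(b);
		/*
		 * Full ordering on architectures selecting
		 * ARCH_WEAK_RELEASE_ACQUIRE (currently powerpc);
		 * compiles to nothing elsewhere.
		 */
		smp_mb__after_unlock_lock();
	}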
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b452953e21c8..74d9c3a1feee 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h | |||
@@ -33,6 +33,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) | |||
33 | return 0; | 33 | return 0; |
34 | } | 34 | } |
35 | 35 | ||
36 | static inline bool rcu_eqs_special_set(int cpu) | ||
37 | { | ||
38 | return false; /* Never flag non-existent other CPUs! */ | ||
39 | } | ||
40 | |||
36 | static inline unsigned long get_state_synchronize_rcu(void) | 41 | static inline unsigned long get_state_synchronize_rcu(void) |
37 | { | 42 | { |
38 | return 0; | 43 | return 0; |
@@ -87,10 +92,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, | |||
87 | call_rcu(head, func); | 92 | call_rcu(head, func); |
88 | } | 93 | } |
89 | 94 | ||
90 | static inline void rcu_note_context_switch(void) | 95 | #define rcu_note_context_switch(preempt) \ |
91 | { | 96 | do { \ |
92 | rcu_sched_qs(); | 97 | rcu_sched_qs(); \ |
93 | } | 98 | rcu_note_voluntary_context_switch_lite(current); \ |
99 | } while (0) | ||
94 | 100 | ||
95 | /* | 101 | /* |
96 | * Take advantage of the fact that there is only one CPU, which | 102 | * Take advantage of the fact that there is only one CPU, which |
@@ -212,14 +218,14 @@ static inline void exit_rcu(void) | |||
212 | { | 218 | { |
213 | } | 219 | } |
214 | 220 | ||
215 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 221 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) |
216 | extern int rcu_scheduler_active __read_mostly; | 222 | extern int rcu_scheduler_active __read_mostly; |
217 | void rcu_scheduler_starting(void); | 223 | void rcu_scheduler_starting(void); |
218 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 224 | #else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ |
219 | static inline void rcu_scheduler_starting(void) | 225 | static inline void rcu_scheduler_starting(void) |
220 | { | 226 | { |
221 | } | 227 | } |
222 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 228 | #endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ |
223 | 229 | ||
224 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) | 230 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) |
225 | 231 | ||
@@ -237,6 +243,10 @@ static inline bool rcu_is_watching(void) | |||
237 | 243 | ||
238 | #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ | 244 | #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ |
239 | 245 | ||
246 | static inline void rcu_request_urgent_qs_task(struct task_struct *t) | ||
247 | { | ||
248 | } | ||
249 | |||
240 | static inline void rcu_all_qs(void) | 250 | static inline void rcu_all_qs(void) |
241 | { | 251 | { |
242 | barrier(); /* Avoid RCU read-side critical sections leaking across. */ | 252 | barrier(); /* Avoid RCU read-side critical sections leaking across. */ |
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 63a4e4cf40a5..0bacb6b2af69 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h | |||
@@ -30,7 +30,7 @@ | |||
30 | #ifndef __LINUX_RCUTREE_H | 30 | #ifndef __LINUX_RCUTREE_H |
31 | #define __LINUX_RCUTREE_H | 31 | #define __LINUX_RCUTREE_H |
32 | 32 | ||
33 | void rcu_note_context_switch(void); | 33 | void rcu_note_context_switch(bool preempt); |
34 | int rcu_needs_cpu(u64 basem, u64 *nextevt); | 34 | int rcu_needs_cpu(u64 basem, u64 *nextevt); |
35 | void rcu_cpu_stall_reset(void); | 35 | void rcu_cpu_stall_reset(void); |
36 | 36 | ||
@@ -41,7 +41,7 @@ void rcu_cpu_stall_reset(void); | |||
41 | */ | 41 | */ |
42 | static inline void rcu_virt_note_context_switch(int cpu) | 42 | static inline void rcu_virt_note_context_switch(int cpu) |
43 | { | 43 | { |
44 | rcu_note_context_switch(); | 44 | rcu_note_context_switch(false); |
45 | } | 45 | } |
46 | 46 | ||
47 | void synchronize_rcu_bh(void); | 47 | void synchronize_rcu_bh(void); |
@@ -108,6 +108,7 @@ void rcu_scheduler_starting(void); | |||
108 | extern int rcu_scheduler_active __read_mostly; | 108 | extern int rcu_scheduler_active __read_mostly; |
109 | 109 | ||
110 | bool rcu_is_watching(void); | 110 | bool rcu_is_watching(void); |
111 | void rcu_request_urgent_qs_task(struct task_struct *t); | ||
111 | 112 | ||
112 | void rcu_all_qs(void); | 113 | void rcu_all_qs(void); |
113 | 114 | ||
diff --git a/include/linux/slab.h b/include/linux/slab.h index 3c37a8c51921..04a7f7993e67 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -28,7 +28,7 @@ | |||
28 | #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ | 28 | #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ |
29 | #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ | 29 | #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ |
30 | /* | 30 | /* |
31 | * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS! | 31 | * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! |
32 | * | 32 | * |
33 | * This delays freeing the SLAB page by a grace period, it does _NOT_ | 33 | * This delays freeing the SLAB page by a grace period, it does _NOT_ |
34 | * delay object freeing. This means that if you do kmem_cache_free() | 34 | * delay object freeing. This means that if you do kmem_cache_free() |
@@ -61,8 +61,10 @@ | |||
61 | * | 61 | * |
62 | * rcu_read_lock before reading the address, then rcu_read_unlock after | 62 | * rcu_read_lock before reading the address, then rcu_read_unlock after |
63 | * taking the spinlock within the structure expected at that address. | 63 | * taking the spinlock within the structure expected at that address. |
64 | * | ||
65 | * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. | ||
64 | */ | 66 | */ |
65 | #define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ | 67 | #define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ |
66 | #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ | 68 | #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ |
67 | #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ | 69 | #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ |
68 | 70 | ||
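The warning that the slab.h comment refers to boils down to a lookup discipline: with SLAB_TYPESAFE_BY_RCU the slab page outlives the RCU grace period, but an individual object may be freed and immediately reused, so a lockless lookup must revalidate the object's identity under a per-object lock. The cache, structure, and conn_hash_find() helper below are illustrative, a sketch of the pattern rather than code from this patch:

	#include <linux/init.h>
	#include <linux/types.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>
	#include <linux/rcupdate.h>

	struct conn {
		spinlock_t lock;
		u32 key;
		/* ... */
	};

	static struct kmem_cache *conn_cachep;

	extern struct conn *conn_hash_find(u32 key);	/* hypothetical lockless lookup */

	static int __init conn_cache_init(void)
	{
		conn_cachep = kmem_cache_create("conn_cache",
						sizeof(struct conn), 0,
						SLAB_TYPESAFE_BY_RCU, NULL);
		return conn_cachep ? 0 : -ENOMEM;
	}

	/* Returns the object with ->lock held, or NULL if not found or recycled. */
	static struct conn *conn_lookup(u32 key)
	{
		struct conn *c;

		rcu_read_lock();
		c = conn_hash_find(key);
		if (c) {
			spin_lock(&c->lock);
			if (c->key != key) {
				/* Freed and reused for another connection. */
				spin_unlock(&c->lock);
				c = NULL;
			}
		}
		rcu_read_unlock();
		return c;
	}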
diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a598cf3ac70c..167ad8831aaf 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h | |||
@@ -22,7 +22,7 @@ | |||
22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | 22 | * Lai Jiangshan <laijs@cn.fujitsu.com> |
23 | * | 23 | * |
24 | * For detailed explanation of Read-Copy Update mechanism see - | 24 | * For detailed explanation of Read-Copy Update mechanism see - |
25 | * Documentation/RCU/ *.txt | 25 | * Documentation/RCU/ *.txt |
26 | * | 26 | * |
27 | */ | 27 | */ |
28 | 28 | ||
@@ -32,35 +32,9 @@ | |||
32 | #include <linux/mutex.h> | 32 | #include <linux/mutex.h> |
33 | #include <linux/rcupdate.h> | 33 | #include <linux/rcupdate.h> |
34 | #include <linux/workqueue.h> | 34 | #include <linux/workqueue.h> |
35 | #include <linux/rcu_segcblist.h> | ||
35 | 36 | ||
36 | struct srcu_array { | 37 | struct srcu_struct; |
37 | unsigned long lock_count[2]; | ||
38 | unsigned long unlock_count[2]; | ||
39 | }; | ||
40 | |||
41 | struct rcu_batch { | ||
42 | struct rcu_head *head, **tail; | ||
43 | }; | ||
44 | |||
45 | #define RCU_BATCH_INIT(name) { NULL, &(name.head) } | ||
46 | |||
47 | struct srcu_struct { | ||
48 | unsigned long completed; | ||
49 | struct srcu_array __percpu *per_cpu_ref; | ||
50 | spinlock_t queue_lock; /* protect ->batch_queue, ->running */ | ||
51 | bool running; | ||
52 | /* callbacks just queued */ | ||
53 | struct rcu_batch batch_queue; | ||
54 | /* callbacks try to do the first check_zero */ | ||
55 | struct rcu_batch batch_check0; | ||
56 | /* callbacks done with the first check_zero and the flip */ | ||
57 | struct rcu_batch batch_check1; | ||
58 | struct rcu_batch batch_done; | ||
59 | struct delayed_work work; | ||
60 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
61 | struct lockdep_map dep_map; | ||
62 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
63 | }; | ||
64 | 38 | ||
65 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 39 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
66 | 40 | ||
@@ -82,46 +56,15 @@ int init_srcu_struct(struct srcu_struct *sp); | |||
82 | #define __SRCU_DEP_MAP_INIT(srcu_name) | 56 | #define __SRCU_DEP_MAP_INIT(srcu_name) |
83 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 57 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
84 | 58 | ||
85 | void process_srcu(struct work_struct *work); | 59 | #ifdef CONFIG_TINY_SRCU |
86 | 60 | #include <linux/srcutiny.h> | |
87 | #define __SRCU_STRUCT_INIT(name) \ | 61 | #elif defined(CONFIG_TREE_SRCU) |
88 | { \ | 62 | #include <linux/srcutree.h> |
89 | .completed = -300, \ | 63 | #elif defined(CONFIG_CLASSIC_SRCU) |
90 | .per_cpu_ref = &name##_srcu_array, \ | 64 | #include <linux/srcuclassic.h> |
91 | .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ | 65 | #else |
92 | .running = false, \ | 66 | #error "Unknown SRCU implementation specified to kernel configuration" |
93 | .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ | 67 | #endif |
94 | .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ | ||
95 | .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ | ||
96 | .batch_done = RCU_BATCH_INIT(name.batch_done), \ | ||
97 | .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ | ||
98 | __SRCU_DEP_MAP_INIT(name) \ | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * Define and initialize a srcu struct at build time. | ||
103 | * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. | ||
104 | * | ||
105 | * Note that although DEFINE_STATIC_SRCU() hides the name from other | ||
106 | * files, the per-CPU variable rules nevertheless require that the | ||
107 | * chosen name be globally unique. These rules also prohibit use of | ||
108 | * DEFINE_STATIC_SRCU() within a function. If these rules are too | ||
109 | * restrictive, declare the srcu_struct manually. For example, in | ||
110 | * each file: | ||
111 | * | ||
112 | * static struct srcu_struct my_srcu; | ||
113 | * | ||
114 | * Then, before the first use of each my_srcu, manually initialize it: | ||
115 | * | ||
116 | * init_srcu_struct(&my_srcu); | ||
117 | * | ||
118 | * See include/linux/percpu-defs.h for the rules on per-CPU variables. | ||
119 | */ | ||
120 | #define __DEFINE_SRCU(name, is_static) \ | ||
121 | static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ | ||
122 | is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) | ||
123 | #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) | ||
124 | #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) | ||
125 | 68 | ||
126 | /** | 69 | /** |
127 | * call_srcu() - Queue a callback for invocation after an SRCU grace period | 70 | * call_srcu() - Queue a callback for invocation after an SRCU grace period |
@@ -147,9 +90,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp); | |||
147 | int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); | 90 | int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); |
148 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); | 91 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); |
149 | void synchronize_srcu(struct srcu_struct *sp); | 92 | void synchronize_srcu(struct srcu_struct *sp); |
150 | void synchronize_srcu_expedited(struct srcu_struct *sp); | ||
151 | unsigned long srcu_batches_completed(struct srcu_struct *sp); | ||
152 | void srcu_barrier(struct srcu_struct *sp); | ||
153 | 93 | ||
154 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 94 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
155 | 95 | ||
diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h new file mode 100644 index 000000000000..5753f7322262 --- /dev/null +++ b/include/linux/srcuclassic.h | |||
@@ -0,0 +1,115 @@ | |||
1 | /* | ||
2 | * Sleepable Read-Copy Update mechanism for mutual exclusion, | ||
3 | * classic v4.11 variant. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, you can access it online at | ||
17 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2017 | ||
20 | * | ||
21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
22 | */ | ||
23 | |||
24 | #ifndef _LINUX_SRCU_CLASSIC_H | ||
25 | #define _LINUX_SRCU_CLASSIC_H | ||
26 | |||
27 | struct srcu_array { | ||
28 | unsigned long lock_count[2]; | ||
29 | unsigned long unlock_count[2]; | ||
30 | }; | ||
31 | |||
32 | struct rcu_batch { | ||
33 | struct rcu_head *head, **tail; | ||
34 | }; | ||
35 | |||
36 | #define RCU_BATCH_INIT(name) { NULL, &(name.head) } | ||
37 | |||
38 | struct srcu_struct { | ||
39 | unsigned long completed; | ||
40 | struct srcu_array __percpu *per_cpu_ref; | ||
41 | spinlock_t queue_lock; /* protect ->batch_queue, ->running */ | ||
42 | bool running; | ||
43 | /* callbacks just queued */ | ||
44 | struct rcu_batch batch_queue; | ||
45 | /* callbacks try to do the first check_zero */ | ||
46 | struct rcu_batch batch_check0; | ||
47 | /* callbacks done with the first check_zero and the flip */ | ||
48 | struct rcu_batch batch_check1; | ||
49 | struct rcu_batch batch_done; | ||
50 | struct delayed_work work; | ||
51 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
52 | struct lockdep_map dep_map; | ||
53 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
54 | }; | ||
55 | |||
56 | void process_srcu(struct work_struct *work); | ||
57 | |||
58 | #define __SRCU_STRUCT_INIT(name) \ | ||
59 | { \ | ||
60 | .completed = -300, \ | ||
61 | .per_cpu_ref = &name##_srcu_array, \ | ||
62 | .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ | ||
63 | .running = false, \ | ||
64 | .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ | ||
65 | .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ | ||
66 | .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ | ||
67 | .batch_done = RCU_BATCH_INIT(name.batch_done), \ | ||
68 | .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ | ||
69 | __SRCU_DEP_MAP_INIT(name) \ | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * Define and initialize a srcu struct at build time. | ||
74 | * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. | ||
75 | * | ||
76 | * Note that although DEFINE_STATIC_SRCU() hides the name from other | ||
77 | * files, the per-CPU variable rules nevertheless require that the | ||
78 | * chosen name be globally unique. These rules also prohibit use of | ||
79 | * DEFINE_STATIC_SRCU() within a function. If these rules are too | ||
80 | * restrictive, declare the srcu_struct manually. For example, in | ||
81 | * each file: | ||
82 | * | ||
83 | * static struct srcu_struct my_srcu; | ||
84 | * | ||
85 | * Then, before the first use of each my_srcu, manually initialize it: | ||
86 | * | ||
87 | * init_srcu_struct(&my_srcu); | ||
88 | * | ||
89 | * See include/linux/percpu-defs.h for the rules on per-CPU variables. | ||
90 | */ | ||
91 | #define __DEFINE_SRCU(name, is_static) \ | ||
92 | static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ | ||
93 | is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) | ||
94 | #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) | ||
95 | #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) | ||
96 | |||
97 | void synchronize_srcu_expedited(struct srcu_struct *sp); | ||
98 | void srcu_barrier(struct srcu_struct *sp); | ||
99 | unsigned long srcu_batches_completed(struct srcu_struct *sp); | ||
100 | |||
101 | static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, | ||
102 | struct srcu_struct *sp, int *flags, | ||
103 | unsigned long *gpnum, | ||
104 | unsigned long *completed) | ||
105 | { | ||
106 | if (test_type != SRCU_FLAVOR) | ||
107 | return; | ||
108 | *flags = 0; | ||
109 | *completed = sp->completed; | ||
110 | *gpnum = *completed; | ||
111 | if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check1.head) | ||
112 | (*gpnum)++; | ||
113 | } | ||
114 | |||
115 | #endif | ||
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h new file mode 100644 index 000000000000..42311ee0334f --- /dev/null +++ b/include/linux/srcutiny.h | |||
@@ -0,0 +1,93 @@ | |||
1 | /* | ||
2 | * Sleepable Read-Copy Update mechanism for mutual exclusion, | ||
3 | * tiny variant. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, you can access it online at | ||
17 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2017 | ||
20 | * | ||
21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
22 | */ | ||
23 | |||
24 | #ifndef _LINUX_SRCU_TINY_H | ||
25 | #define _LINUX_SRCU_TINY_H | ||
26 | |||
27 | #include <linux/swait.h> | ||
28 | |||
29 | struct srcu_struct { | ||
30 | int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ | ||
31 | struct swait_queue_head srcu_wq; | ||
32 | /* Last srcu_read_unlock() wakes GP. */ | ||
33 | unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */ | ||
34 | struct rcu_segcblist srcu_cblist; | ||
35 | /* Pending SRCU callbacks. */ | ||
36 | int srcu_idx; /* Current reader array element. */ | ||
37 | bool srcu_gp_running; /* GP workqueue running? */ | ||
38 | bool srcu_gp_waiting; /* GP waiting for readers? */ | ||
39 | struct work_struct srcu_work; /* For driving grace periods. */ | ||
40 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
41 | struct lockdep_map dep_map; | ||
42 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
43 | }; | ||
44 | |||
45 | void srcu_drive_gp(struct work_struct *wp); | ||
46 | |||
47 | #define __SRCU_STRUCT_INIT(name) \ | ||
48 | { \ | ||
49 | .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ | ||
50 | .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \ | ||
51 | .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \ | ||
52 | __SRCU_DEP_MAP_INIT(name) \ | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * This odd _STATIC_ arrangement is needed for API compatibility with | ||
57 | * Tree SRCU, which needs some per-CPU data. | ||
58 | */ | ||
59 | #define DEFINE_SRCU(name) \ | ||
60 | struct srcu_struct name = __SRCU_STRUCT_INIT(name) | ||
61 | #define DEFINE_STATIC_SRCU(name) \ | ||
62 | static struct srcu_struct name = __SRCU_STRUCT_INIT(name) | ||
63 | |||
64 | void synchronize_srcu(struct srcu_struct *sp); | ||
65 | |||
66 | static inline void synchronize_srcu_expedited(struct srcu_struct *sp) | ||
67 | { | ||
68 | synchronize_srcu(sp); | ||
69 | } | ||
70 | |||
71 | static inline void srcu_barrier(struct srcu_struct *sp) | ||
72 | { | ||
73 | synchronize_srcu(sp); | ||
74 | } | ||
75 | |||
76 | static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) | ||
77 | { | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, | ||
82 | struct srcu_struct *sp, int *flags, | ||
83 | unsigned long *gpnum, | ||
84 | unsigned long *completed) | ||
85 | { | ||
86 | if (test_type != SRCU_FLAVOR) | ||
87 | return; | ||
88 | *flags = 0; | ||
89 | *completed = sp->srcu_gp_seq; | ||
90 | *gpnum = *completed; | ||
91 | } | ||
92 | |||
93 | #endif | ||
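Tiny SRCU exposes the same API surface as the other SRCU flavors, so callers never see which implementation was compiled in. As a quick orientation, here is a minimal usage sketch of that common API; the my_* names are made up for illustration and are not part of this series, while the helpers used (srcu_read_lock(), srcu_read_unlock(), srcu_dereference(), call_srcu(), synchronize_srcu()) come from <linux/srcu.h>.

/* Illustrative sketch only -- not part of this series. */
#include <linux/srcu.h>
#include <linux/slab.h>

DEFINE_STATIC_SRCU(my_srcu);

struct my_obj {
	struct rcu_head rh;
	int data;
};

static struct my_obj __rcu *my_ptr;

static void my_obj_free(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct my_obj, rh));
}

static int my_read(void)
{
	struct my_obj *p;
	int idx, val = -1;

	idx = srcu_read_lock(&my_srcu);		/* Readers may sleep here. */
	p = srcu_dereference(my_ptr, &my_srcu);
	if (p)
		val = p->data;
	srcu_read_unlock(&my_srcu, idx);
	return val;
}

static void my_replace(struct my_obj *newp)
{
	struct my_obj *old = rcu_dereference_protected(my_ptr, 1);

	rcu_assign_pointer(my_ptr, newp);
	if (old)	/* Or: synchronize_srcu(&my_srcu); kfree(old); */
		call_srcu(&my_srcu, &old->rh, my_obj_free);
}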
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h new file mode 100644 index 000000000000..32e86d85fd11 --- /dev/null +++ b/include/linux/srcutree.h | |||
@@ -0,0 +1,150 @@ | |||
1 | /* | ||
2 | * Sleepable Read-Copy Update mechanism for mutual exclusion, | ||
3 | * tree variant. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, you can access it online at | ||
17 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2017 | ||
20 | * | ||
21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
22 | */ | ||
23 | |||
24 | #ifndef _LINUX_SRCU_TREE_H | ||
25 | #define _LINUX_SRCU_TREE_H | ||
26 | |||
27 | #include <linux/rcu_node_tree.h> | ||
28 | #include <linux/completion.h> | ||
29 | |||
30 | struct srcu_node; | ||
31 | struct srcu_struct; | ||
32 | |||
33 | /* | ||
34 | * Per-CPU structure feeding into leaf srcu_node, similar in function | ||
35 | * to rcu_data. | ||
36 | */ | ||
37 | struct srcu_data { | ||
38 | /* Read-side state. */ | ||
39 | unsigned long srcu_lock_count[2]; /* Locks per CPU. */ | ||
40 | unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ | ||
41 | |||
42 | /* Update-side state. */ | ||
43 | spinlock_t lock ____cacheline_internodealigned_in_smp; | ||
44 | struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ | ||
45 | unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ | ||
46 | unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ | ||
47 | bool srcu_cblist_invoking; /* Invoking these CBs? */ | ||
48 | struct delayed_work work; /* Context for CB invoking. */ | ||
49 | struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ | ||
50 | struct srcu_node *mynode; /* Leaf srcu_node. */ | ||
51 | unsigned long grpmask; /* Mask for leaf srcu_node */ | ||
52 | /* ->srcu_data_have_cbs[]. */ | ||
53 | int cpu; | ||
54 | struct srcu_struct *sp; | ||
55 | }; | ||
56 | |||
57 | /* | ||
58 | * Node in SRCU combining tree, similar in function to rcu_node. | ||
59 | */ | ||
60 | struct srcu_node { | ||
61 | spinlock_t lock; | ||
62 | unsigned long srcu_have_cbs[4]; /* GP seq for children */ | ||
63 | /* having CBs, but only */ | ||
64 | /* is > ->srcu_gp_seq. */ | ||
65 | unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs */ | ||
66 | /* have CBs for given GP? */ | ||
67 | unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ | ||
68 | struct srcu_node *srcu_parent; /* Next up in tree. */ | ||
69 | int grplo; /* Least CPU for node. */ | ||
70 | int grphi; /* Biggest CPU for node. */ | ||
71 | }; | ||
72 | |||
73 | /* | ||
74 | * Per-SRCU-domain structure, similar in function to rcu_state. | ||
75 | */ | ||
76 | struct srcu_struct { | ||
77 | struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ | ||
78 | struct srcu_node *level[RCU_NUM_LVLS + 1]; | ||
79 | /* First node at each level. */ | ||
80 | struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ | ||
81 | spinlock_t gp_lock; /* protect ->srcu_cblist */ | ||
82 | struct mutex srcu_gp_mutex; /* Serialize GP work. */ | ||
83 | unsigned int srcu_idx; /* Current rdr array element. */ | ||
84 | unsigned long srcu_gp_seq; /* Grace-period seq #. */ | ||
85 | unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ | ||
86 | unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ | ||
87 | unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ | ||
88 | struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ | ||
89 | unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ | ||
90 | struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ | ||
91 | struct completion srcu_barrier_completion; | ||
92 | /* Awaken barrier rq at end. */ | ||
93 | atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ | ||
94 | /* callback for the barrier */ | ||
95 | /* operation. */ | ||
96 | struct delayed_work work; | ||
97 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
98 | struct lockdep_map dep_map; | ||
99 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
100 | }; | ||
101 | |||
102 | /* Values for state variable (bottom bits of ->srcu_gp_seq). */ | ||
103 | #define SRCU_STATE_IDLE 0 | ||
104 | #define SRCU_STATE_SCAN1 1 | ||
105 | #define SRCU_STATE_SCAN2 2 | ||
106 | |||
107 | void process_srcu(struct work_struct *work); | ||
108 | |||
109 | #define __SRCU_STRUCT_INIT(name) \ | ||
110 | { \ | ||
111 | .sda = &name##_srcu_data, \ | ||
112 | .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \ | ||
113 | .srcu_gp_seq_needed = 0 - 1, \ | ||
114 | __SRCU_DEP_MAP_INIT(name) \ | ||
115 | } | ||
116 | |||
117 | /* | ||
118 | * Define and initialize a srcu struct at build time. | ||
119 | * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. | ||
120 | * | ||
121 | * Note that although DEFINE_STATIC_SRCU() hides the name from other | ||
122 | * files, the per-CPU variable rules nevertheless require that the | ||
123 | * chosen name be globally unique. These rules also prohibit use of | ||
124 | * DEFINE_STATIC_SRCU() within a function. If these rules are too | ||
125 | * restrictive, declare the srcu_struct manually. For example, in | ||
126 | * each file: | ||
127 | * | ||
128 | * static struct srcu_struct my_srcu; | ||
129 | * | ||
130 | * Then, before the first use of each my_srcu, manually initialize it: | ||
131 | * | ||
132 | * init_srcu_struct(&my_srcu); | ||
133 | * | ||
134 | * See include/linux/percpu-defs.h for the rules on per-CPU variables. | ||
135 | */ | ||
136 | #define __DEFINE_SRCU(name, is_static) \ | ||
137 | static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\ | ||
138 | is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) | ||
139 | #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) | ||
140 | #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) | ||
141 | |||
142 | void synchronize_srcu_expedited(struct srcu_struct *sp); | ||
143 | void srcu_barrier(struct srcu_struct *sp); | ||
144 | unsigned long srcu_batches_completed(struct srcu_struct *sp); | ||
145 | |||
146 | void srcutorture_get_gp_data(enum rcutorture_type test_type, | ||
147 | struct srcu_struct *sp, int *flags, | ||
148 | unsigned long *gpnum, unsigned long *completed); | ||
149 | |||
150 | #endif | ||
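The read-side fast path only increments this CPU's ->srcu_lock_count[idx] and ->srcu_unlock_count[idx]; the grace-period machinery later sums those counters across all CPUs to decide whether the readers for a given index have drained. The following is a simplified, hypothetical sketch of that summation; the real code in kernel/rcu/srcutree.c adds the memory barriers and counter-wrap handling needed for correctness.

/* Simplified sketch of the reader-drain check; illustrative only. */
static bool my_srcu_readers_drained(struct srcu_struct *sp, int idx)
{
	unsigned long locks = 0, unlocks = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct srcu_data *sdp = per_cpu_ptr(sp->sda, cpu);

		unlocks += READ_ONCE(sdp->srcu_unlock_count[idx]);
		locks += READ_ONCE(sdp->srcu_lock_count[idx]);
	}
	/* The real check also orders these sums against ->srcu_idx flips. */
	return locks == unlocks;
}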
diff --git a/include/linux/types.h b/include/linux/types.h index 1e7bd24848fc..258099a4ed82 100644 --- a/include/linux/types.h +++ b/include/linux/types.h | |||
@@ -209,7 +209,7 @@ struct ustat { | |||
209 | * naturally due ABI requirements, but some architectures (like CRIS) have | 209 | * naturally due ABI requirements, but some architectures (like CRIS) have |
210 | * weird ABI and we need to ask it explicitly. | 210 | * weird ABI and we need to ask it explicitly. |
211 | * | 211 | * |
212 | * The alignment is required to guarantee that bits 0 and 1 of @next will be | 212 | * The alignment is required to guarantee that bit 0 of @next will be |
213 | * clear under normal conditions -- as long as we use call_rcu(), | 213 | * clear under normal conditions -- as long as we use call_rcu(), |
214 | * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. | 214 | * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. |
215 | * | 215 | * |
diff --git a/include/net/sock.h b/include/net/sock.h index 66349e49d468..f33e3d134e0b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h | |||
@@ -995,7 +995,7 @@ struct smc_hashinfo; | |||
995 | struct module; | 995 | struct module; |
996 | 996 | ||
997 | /* | 997 | /* |
998 | * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes | 998 | * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes |
999 | * un-modified. Special care is taken when initializing object to zero. | 999 | * un-modified. Special care is taken when initializing object to zero. |
1000 | */ | 1000 | */ |
1001 | static inline void sk_prot_clear_nulls(struct sock *sk, int size) | 1001 | static inline void sk_prot_clear_nulls(struct sock *sk, int size) |
diff --git a/init/Kconfig b/init/Kconfig index a92f27da4a27..1d3475fc9496 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -521,11 +521,41 @@ config RCU_EXPERT | |||
521 | 521 | ||
522 | config SRCU | 522 | config SRCU |
523 | bool | 523 | bool |
524 | default y | ||
524 | help | 525 | help |
525 | This option selects the sleepable version of RCU. This version | 526 | This option selects the sleepable version of RCU. This version |
526 | permits arbitrary sleeping or blocking within RCU read-side critical | 527 | permits arbitrary sleeping or blocking within RCU read-side critical |
527 | sections. | 528 | sections. |
528 | 529 | ||
530 | config CLASSIC_SRCU | ||
531 | bool "Use v4.11 classic SRCU implementation" | ||
532 | default n | ||
533 | depends on RCU_EXPERT && SRCU | ||
534 | help | ||
535 | This option selects the traditional well-tested classic SRCU | ||
536 | implementation from v4.11, as might be desired for enterprise | ||
537 | Linux distributions. Without this option, the shiny new | ||
538 | Tiny SRCU and Tree SRCU implementations are used instead. | ||
539 | At some point, it is hoped that Tiny SRCU and Tree SRCU | ||
540 | will accumulate enough test time and confidence to allow | ||
541 | Classic SRCU to be dropped entirely. | ||
542 | |||
543 | Say Y if you need a rock-solid SRCU. | ||
544 | |||
546 | Say N if you would like to help test Tree SRCU. | ||
546 | |||
547 | config TINY_SRCU | ||
548 | bool | ||
549 | default y if SRCU && TINY_RCU && !CLASSIC_SRCU | ||
550 | help | ||
551 | This option selects the single-CPU non-preemptible version of SRCU. | ||
552 | |||
553 | config TREE_SRCU | ||
554 | bool | ||
555 | default y if SRCU && !TINY_RCU && !CLASSIC_SRCU | ||
556 | help | ||
557 | This option selects the full-fledged version of SRCU. | ||
558 | |||
529 | config TASKS_RCU | 559 | config TASKS_RCU |
530 | bool | 560 | bool |
531 | default n | 561 | default n |
@@ -543,6 +573,9 @@ config RCU_STALL_COMMON | |||
543 | the tiny variants to disable RCU CPU stall warnings, while | 573 | the tiny variants to disable RCU CPU stall warnings, while |
544 | making these warnings mandatory for the tree variants. | 574 | making these warnings mandatory for the tree variants. |
545 | 575 | ||
576 | config RCU_NEED_SEGCBLIST | ||
577 | def_bool ( TREE_RCU || PREEMPT_RCU || TINY_SRCU || TREE_SRCU ) | ||
578 | |||
546 | config CONTEXT_TRACKING | 579 | config CONTEXT_TRACKING |
547 | bool | 580 | bool |
548 | 581 | ||
@@ -612,11 +645,17 @@ config RCU_FANOUT_LEAF | |||
612 | initialization. These systems tend to run CPU-bound, and thus | 645 | initialization. These systems tend to run CPU-bound, and thus |
613 | are not helped by synchronized interrupts, and thus tend to | 646 | are not helped by synchronized interrupts, and thus tend to |
614 | skew them, which reduces lock contention enough that large | 647 | skew them, which reduces lock contention enough that large |
615 | leaf-level fanouts work well. | 648 | leaf-level fanouts work well. That said, setting leaf-level |
649 | fanout to a large number will likely cause problematic | ||
650 | lock contention on the leaf-level rcu_node structures unless | ||
651 | you boot with the skew_tick kernel parameter. | ||
616 | 652 | ||
617 | Select a specific number if testing RCU itself. | 653 | Select a specific number if testing RCU itself. |
618 | 654 | ||
619 | Select the maximum permissible value for large systems. | 655 | Select the maximum permissible value for large systems, but |
656 | please understand that you may also need to set the skew_tick | ||
657 | kernel boot parameter to avoid contention on the rcu_node | ||
658 | structure's locks. | ||
620 | 659 | ||
621 | Take the default if unsure. | 660 | Take the default if unsure. |
622 | 661 | ||
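In practice exactly one implementation is built once SRCU=y: CLASSIC_SRCU if explicitly selected, otherwise TINY_SRCU on TINY_RCU kernels and TREE_SRCU everywhere else. The dispatch presumably happens in <linux/srcu.h> along the following lines; this fragment is reconstructed for illustration, not quoted from the patch.

/* Sketch of the expected header selection in <linux/srcu.h>. */
#ifdef CONFIG_TINY_SRCU
#include <linux/srcutiny.h>
#elif defined(CONFIG_TREE_SRCU)
#include <linux/srcutree.h>
#elif defined(CONFIG_CLASSIC_SRCU)
#include <linux/srcuclassic.h>
#else
#error "Unknown SRCU implementation specified to kernel configuration"
#endif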
diff --git a/kernel/fork.c b/kernel/fork.c index 08ba696aa561..bfd91b180778 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1337,7 +1337,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) | |||
1337 | if (atomic_dec_and_test(&sighand->count)) { | 1337 | if (atomic_dec_and_test(&sighand->count)) { |
1338 | signalfd_cleanup(sighand); | 1338 | signalfd_cleanup(sighand); |
1339 | /* | 1339 | /* |
1340 | * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it | 1340 | * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it |
1341 | * without an RCU grace period, see __lock_task_sighand(). | 1341 | * without an RCU grace period, see __lock_task_sighand(). |
1342 | */ | 1342 | */ |
1343 | kmem_cache_free(sighand_cachep, sighand); | 1343 | kmem_cache_free(sighand_cachep, sighand); |
@@ -2176,7 +2176,7 @@ void __init proc_caches_init(void) | |||
2176 | { | 2176 | { |
2177 | sighand_cachep = kmem_cache_create("sighand_cache", | 2177 | sighand_cachep = kmem_cache_create("sighand_cache", |
2178 | sizeof(struct sighand_struct), 0, | 2178 | sizeof(struct sighand_struct), 0, |
2179 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| | 2179 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| |
2180 | SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); | 2180 | SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); |
2181 | signal_cachep = kmem_cache_create("signal_cache", | 2181 | signal_cachep = kmem_cache_create("signal_cache", |
2182 | sizeof(struct signal_struct), 0, | 2182 | sizeof(struct signal_struct), 0, |
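The rename from SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU is meant to make the semantics clearer: only the slab's underlying pages are RCU-deferred, so an individual object can be freed and immediately reused while an RCU reader still holds a pointer to it, and readers must therefore revalidate the object after locking it. A hedged sketch of the usual pattern follows; the foo_* names and the lookup helper are hypothetical.

/* Illustrative SLAB_TYPESAFE_BY_RCU usage pattern; names are made up. */
static struct kmem_cache *foo_cachep;

struct foo {
	spinlock_t lock;
	int id;			/* Readers revalidate identity under ->lock. */
};

static struct foo *foo_find(int id);	/* Hypothetical lookup helper. */

static void foo_ctor(void *obj)
{
	struct foo *p = obj;

	/* Runs when the object is first created, not on every reuse. */
	spin_lock_init(&p->lock);
}

static int __init foo_cache_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				       SLAB_TYPESAFE_BY_RCU, foo_ctor);
	return foo_cachep ? 0 : -ENOMEM;
}

static struct foo *foo_lookup(int id)
{
	struct foo *p;

	rcu_read_lock();
	p = foo_find(id);
	if (p) {
		spin_lock(&p->lock);
		if (p->id != id) {	/* Object was recycled underneath us. */
			spin_unlock(&p->lock);
			p = NULL;
		}
	}
	rcu_read_unlock();
	return p;			/* Non-NULL means locked and valid. */
}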
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 0a1b3c748478..c0e31bfee25c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -1158,10 +1158,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, | |||
1158 | return 0; | 1158 | return 0; |
1159 | 1159 | ||
1160 | printk("\n"); | 1160 | printk("\n"); |
1161 | printk("======================================================\n"); | 1161 | pr_warn("======================================================\n"); |
1162 | printk("[ INFO: possible circular locking dependency detected ]\n"); | 1162 | pr_warn("WARNING: possible circular locking dependency detected\n"); |
1163 | print_kernel_ident(); | 1163 | print_kernel_ident(); |
1164 | printk("-------------------------------------------------------\n"); | 1164 | pr_warn("------------------------------------------------------\n"); |
1165 | printk("%s/%d is trying to acquire lock:\n", | 1165 | printk("%s/%d is trying to acquire lock:\n", |
1166 | curr->comm, task_pid_nr(curr)); | 1166 | curr->comm, task_pid_nr(curr)); |
1167 | print_lock(check_src); | 1167 | print_lock(check_src); |
@@ -1496,11 +1496,11 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
1496 | return 0; | 1496 | return 0; |
1497 | 1497 | ||
1498 | printk("\n"); | 1498 | printk("\n"); |
1499 | printk("======================================================\n"); | 1499 | pr_warn("=====================================================\n"); |
1500 | printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", | 1500 | pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", |
1501 | irqclass, irqclass); | 1501 | irqclass, irqclass); |
1502 | print_kernel_ident(); | 1502 | print_kernel_ident(); |
1503 | printk("------------------------------------------------------\n"); | 1503 | pr_warn("-----------------------------------------------------\n"); |
1504 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", | 1504 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", |
1505 | curr->comm, task_pid_nr(curr), | 1505 | curr->comm, task_pid_nr(curr), |
1506 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, | 1506 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, |
@@ -1725,10 +1725,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
1725 | return 0; | 1725 | return 0; |
1726 | 1726 | ||
1727 | printk("\n"); | 1727 | printk("\n"); |
1728 | printk("=============================================\n"); | 1728 | pr_warn("============================================\n"); |
1729 | printk("[ INFO: possible recursive locking detected ]\n"); | 1729 | pr_warn("WARNING: possible recursive locking detected\n"); |
1730 | print_kernel_ident(); | 1730 | print_kernel_ident(); |
1731 | printk("---------------------------------------------\n"); | 1731 | pr_warn("--------------------------------------------\n"); |
1732 | printk("%s/%d is trying to acquire lock:\n", | 1732 | printk("%s/%d is trying to acquire lock:\n", |
1733 | curr->comm, task_pid_nr(curr)); | 1733 | curr->comm, task_pid_nr(curr)); |
1734 | print_lock(next); | 1734 | print_lock(next); |
@@ -2075,10 +2075,10 @@ static void print_collision(struct task_struct *curr, | |||
2075 | struct lock_chain *chain) | 2075 | struct lock_chain *chain) |
2076 | { | 2076 | { |
2077 | printk("\n"); | 2077 | printk("\n"); |
2078 | printk("======================\n"); | 2078 | pr_warn("============================\n"); |
2079 | printk("[chain_key collision ]\n"); | 2079 | pr_warn("WARNING: chain_key collision\n"); |
2080 | print_kernel_ident(); | 2080 | print_kernel_ident(); |
2081 | printk("----------------------\n"); | 2081 | pr_warn("----------------------------\n"); |
2082 | printk("%s/%d: ", current->comm, task_pid_nr(current)); | 2082 | printk("%s/%d: ", current->comm, task_pid_nr(current)); |
2083 | printk("Hash chain already cached but the contents don't match!\n"); | 2083 | printk("Hash chain already cached but the contents don't match!\n"); |
2084 | 2084 | ||
@@ -2374,10 +2374,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
2374 | return 0; | 2374 | return 0; |
2375 | 2375 | ||
2376 | printk("\n"); | 2376 | printk("\n"); |
2377 | printk("=================================\n"); | 2377 | pr_warn("================================\n"); |
2378 | printk("[ INFO: inconsistent lock state ]\n"); | 2378 | pr_warn("WARNING: inconsistent lock state\n"); |
2379 | print_kernel_ident(); | 2379 | print_kernel_ident(); |
2380 | printk("---------------------------------\n"); | 2380 | pr_warn("--------------------------------\n"); |
2381 | 2381 | ||
2382 | printk("inconsistent {%s} -> {%s} usage.\n", | 2382 | printk("inconsistent {%s} -> {%s} usage.\n", |
2383 | usage_str[prev_bit], usage_str[new_bit]); | 2383 | usage_str[prev_bit], usage_str[new_bit]); |
@@ -2439,10 +2439,10 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2439 | return 0; | 2439 | return 0; |
2440 | 2440 | ||
2441 | printk("\n"); | 2441 | printk("\n"); |
2442 | printk("=========================================================\n"); | 2442 | pr_warn("========================================================\n"); |
2443 | printk("[ INFO: possible irq lock inversion dependency detected ]\n"); | 2443 | pr_warn("WARNING: possible irq lock inversion dependency detected\n"); |
2444 | print_kernel_ident(); | 2444 | print_kernel_ident(); |
2445 | printk("---------------------------------------------------------\n"); | 2445 | pr_warn("--------------------------------------------------------\n"); |
2446 | printk("%s/%d just changed the state of lock:\n", | 2446 | printk("%s/%d just changed the state of lock:\n", |
2447 | curr->comm, task_pid_nr(curr)); | 2447 | curr->comm, task_pid_nr(curr)); |
2448 | print_lock(this); | 2448 | print_lock(this); |
@@ -3190,10 +3190,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr, | |||
3190 | return 0; | 3190 | return 0; |
3191 | 3191 | ||
3192 | printk("\n"); | 3192 | printk("\n"); |
3193 | printk("==================================\n"); | 3193 | pr_warn("==================================\n"); |
3194 | printk("[ BUG: Nested lock was not taken ]\n"); | 3194 | pr_warn("WARNING: Nested lock was not taken\n"); |
3195 | print_kernel_ident(); | 3195 | print_kernel_ident(); |
3196 | printk("----------------------------------\n"); | 3196 | pr_warn("----------------------------------\n"); |
3197 | 3197 | ||
3198 | printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); | 3198 | printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); |
3199 | print_lock(hlock); | 3199 | print_lock(hlock); |
@@ -3403,10 +3403,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3403 | return 0; | 3403 | return 0; |
3404 | 3404 | ||
3405 | printk("\n"); | 3405 | printk("\n"); |
3406 | printk("=====================================\n"); | 3406 | pr_warn("=====================================\n"); |
3407 | printk("[ BUG: bad unlock balance detected! ]\n"); | 3407 | pr_warn("WARNING: bad unlock balance detected!\n"); |
3408 | print_kernel_ident(); | 3408 | print_kernel_ident(); |
3409 | printk("-------------------------------------\n"); | 3409 | pr_warn("-------------------------------------\n"); |
3410 | printk("%s/%d is trying to release lock (", | 3410 | printk("%s/%d is trying to release lock (", |
3411 | curr->comm, task_pid_nr(curr)); | 3411 | curr->comm, task_pid_nr(curr)); |
3412 | print_lockdep_cache(lock); | 3412 | print_lockdep_cache(lock); |
@@ -3975,10 +3975,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3975 | return 0; | 3975 | return 0; |
3976 | 3976 | ||
3977 | printk("\n"); | 3977 | printk("\n"); |
3978 | printk("=================================\n"); | 3978 | pr_warn("=================================\n"); |
3979 | printk("[ BUG: bad contention detected! ]\n"); | 3979 | pr_warn("WARNING: bad contention detected!\n"); |
3980 | print_kernel_ident(); | 3980 | print_kernel_ident(); |
3981 | printk("---------------------------------\n"); | 3981 | pr_warn("---------------------------------\n"); |
3982 | printk("%s/%d is trying to contend lock (", | 3982 | printk("%s/%d is trying to contend lock (", |
3983 | curr->comm, task_pid_nr(curr)); | 3983 | curr->comm, task_pid_nr(curr)); |
3984 | print_lockdep_cache(lock); | 3984 | print_lockdep_cache(lock); |
@@ -4319,10 +4319,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | |||
4319 | return; | 4319 | return; |
4320 | 4320 | ||
4321 | printk("\n"); | 4321 | printk("\n"); |
4322 | printk("=========================\n"); | 4322 | pr_warn("=========================\n"); |
4323 | printk("[ BUG: held lock freed! ]\n"); | 4323 | pr_warn("WARNING: held lock freed!\n"); |
4324 | print_kernel_ident(); | 4324 | print_kernel_ident(); |
4325 | printk("-------------------------\n"); | 4325 | pr_warn("-------------------------\n"); |
4326 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", | 4326 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", |
4327 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); | 4327 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); |
4328 | print_lock(hlock); | 4328 | print_lock(hlock); |
@@ -4377,11 +4377,11 @@ static void print_held_locks_bug(void) | |||
4377 | return; | 4377 | return; |
4378 | 4378 | ||
4379 | printk("\n"); | 4379 | printk("\n"); |
4380 | printk("=====================================\n"); | 4380 | pr_warn("====================================\n"); |
4381 | printk("[ BUG: %s/%d still has locks held! ]\n", | 4381 | pr_warn("WARNING: %s/%d still has locks held!\n", |
4382 | current->comm, task_pid_nr(current)); | 4382 | current->comm, task_pid_nr(current)); |
4383 | print_kernel_ident(); | 4383 | print_kernel_ident(); |
4384 | printk("-------------------------------------\n"); | 4384 | pr_warn("------------------------------------\n"); |
4385 | lockdep_print_held_locks(current); | 4385 | lockdep_print_held_locks(current); |
4386 | printk("\nstack backtrace:\n"); | 4386 | printk("\nstack backtrace:\n"); |
4387 | dump_stack(); | 4387 | dump_stack(); |
@@ -4446,7 +4446,7 @@ retry: | |||
4446 | } while_each_thread(g, p); | 4446 | } while_each_thread(g, p); |
4447 | 4447 | ||
4448 | printk("\n"); | 4448 | printk("\n"); |
4449 | printk("=============================================\n\n"); | 4449 | pr_warn("=============================================\n\n"); |
4450 | 4450 | ||
4451 | if (unlock) | 4451 | if (unlock) |
4452 | read_unlock(&tasklist_lock); | 4452 | read_unlock(&tasklist_lock); |
@@ -4476,10 +4476,10 @@ asmlinkage __visible void lockdep_sys_exit(void) | |||
4476 | if (!debug_locks_off()) | 4476 | if (!debug_locks_off()) |
4477 | return; | 4477 | return; |
4478 | printk("\n"); | 4478 | printk("\n"); |
4479 | printk("================================================\n"); | 4479 | pr_warn("================================================\n"); |
4480 | printk("[ BUG: lock held when returning to user space! ]\n"); | 4480 | pr_warn("WARNING: lock held when returning to user space!\n"); |
4481 | print_kernel_ident(); | 4481 | print_kernel_ident(); |
4482 | printk("------------------------------------------------\n"); | 4482 | pr_warn("------------------------------------------------\n"); |
4483 | printk("%s/%d is leaving the kernel with locks still held!\n", | 4483 | printk("%s/%d is leaving the kernel with locks still held!\n", |
4484 | curr->comm, curr->pid); | 4484 | curr->comm, curr->pid); |
4485 | lockdep_print_held_locks(curr); | 4485 | lockdep_print_held_locks(curr); |
@@ -4496,13 +4496,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
4496 | #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ | 4496 | #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ |
4497 | /* Note: the following can be executed concurrently, so be careful. */ | 4497 | /* Note: the following can be executed concurrently, so be careful. */ |
4498 | printk("\n"); | 4498 | printk("\n"); |
4499 | pr_err("===============================\n"); | 4499 | pr_warn("=============================\n"); |
4500 | pr_err("[ ERR: suspicious RCU usage. ]\n"); | 4500 | pr_warn("WARNING: suspicious RCU usage\n"); |
4501 | print_kernel_ident(); | 4501 | print_kernel_ident(); |
4502 | pr_err("-------------------------------\n"); | 4502 | pr_warn("-----------------------------\n"); |
4503 | pr_err("%s:%d %s!\n", file, line, s); | 4503 | printk("%s:%d %s!\n", file, line, s); |
4504 | pr_err("\nother info that might help us debug this:\n\n"); | 4504 | printk("\nother info that might help us debug this:\n\n"); |
4505 | pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n", | 4505 | printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", |
4506 | !rcu_lockdep_current_cpu_online() | 4506 | !rcu_lockdep_current_cpu_online() |
4507 | ? "RCU used illegally from offline CPU!\n" | 4507 | ? "RCU used illegally from offline CPU!\n" |
4508 | : !rcu_is_watching() | 4508 | : !rcu_is_watching() |
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 32fe775a2eaf..58e366ad36f4 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c | |||
@@ -102,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | |||
102 | return; | 102 | return; |
103 | } | 103 | } |
104 | 104 | ||
105 | printk("\n============================================\n"); | 105 | pr_warn("\n"); |
106 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | 106 | pr_warn("============================================\n"); |
107 | printk("%s\n", print_tainted()); | 107 | pr_warn("WARNING: circular locking deadlock detected!\n"); |
108 | printk( "--------------------------------------------\n"); | 108 | pr_warn("%s\n", print_tainted()); |
109 | pr_warn("--------------------------------------------\n"); | ||
109 | printk("%s/%d is deadlocking current task %s/%d\n\n", | 110 | printk("%s/%d is deadlocking current task %s/%d\n\n", |
110 | task->comm, task_pid_nr(task), | 111 | task->comm, task_pid_nr(task), |
111 | current->comm, task_pid_nr(current)); | 112 | current->comm, task_pid_nr(current)); |
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 18dfc485225c..23803c7d5180 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile | |||
@@ -3,10 +3,13 @@ | |||
3 | KCOV_INSTRUMENT := n | 3 | KCOV_INSTRUMENT := n |
4 | 4 | ||
5 | obj-y += update.o sync.o | 5 | obj-y += update.o sync.o |
6 | obj-$(CONFIG_SRCU) += srcu.o | 6 | obj-$(CONFIG_CLASSIC_SRCU) += srcu.o |
7 | obj-$(CONFIG_TREE_SRCU) += srcutree.o | ||
8 | obj-$(CONFIG_TINY_SRCU) += srcutiny.o | ||
7 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 9 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
8 | obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o | 10 | obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o |
9 | obj-$(CONFIG_TREE_RCU) += tree.o | 11 | obj-$(CONFIG_TREE_RCU) += tree.o |
10 | obj-$(CONFIG_PREEMPT_RCU) += tree.o | 12 | obj-$(CONFIG_PREEMPT_RCU) += tree.o |
11 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o | 13 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o |
12 | obj-$(CONFIG_TINY_RCU) += tiny.o | 14 | obj-$(CONFIG_TINY_RCU) += tiny.o |
15 | obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o | ||
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0d6ff3e471be..73e16ec4054b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
@@ -56,6 +56,83 @@ | |||
56 | #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ | 56 | #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ |
57 | DYNTICK_TASK_FLAG) | 57 | DYNTICK_TASK_FLAG) |
58 | 58 | ||
59 | |||
60 | /* | ||
61 | * Grace-period counter management. | ||
62 | */ | ||
63 | |||
64 | #define RCU_SEQ_CTR_SHIFT 2 | ||
65 | #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) | ||
66 | |||
67 | /* | ||
68 | * Return the counter portion of a sequence number previously returned | ||
69 | * by rcu_seq_snap() or rcu_seq_current(). | ||
70 | */ | ||
71 | static inline unsigned long rcu_seq_ctr(unsigned long s) | ||
72 | { | ||
73 | return s >> RCU_SEQ_CTR_SHIFT; | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * Return the state portion of a sequence number previously returned | ||
78 | * by rcu_seq_snap() or rcu_seq_current(). | ||
79 | */ | ||
80 | static inline int rcu_seq_state(unsigned long s) | ||
81 | { | ||
82 | return s & RCU_SEQ_STATE_MASK; | ||
83 | } | ||
84 | |||
85 | /* | ||
86 | * Set the state portion of the pointed-to sequence number. | ||
87 | * The caller is responsible for preventing conflicting updates. | ||
88 | */ | ||
89 | static inline void rcu_seq_set_state(unsigned long *sp, int newstate) | ||
90 | { | ||
91 | WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK); | ||
92 | WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate); | ||
93 | } | ||
94 | |||
95 | /* Adjust sequence number for start of update-side operation. */ | ||
96 | static inline void rcu_seq_start(unsigned long *sp) | ||
97 | { | ||
98 | WRITE_ONCE(*sp, *sp + 1); | ||
99 | smp_mb(); /* Ensure update-side operation after counter increment. */ | ||
100 | WARN_ON_ONCE(rcu_seq_state(*sp) != 1); | ||
101 | } | ||
102 | |||
103 | /* Adjust sequence number for end of update-side operation. */ | ||
104 | static inline void rcu_seq_end(unsigned long *sp) | ||
105 | { | ||
106 | smp_mb(); /* Ensure update-side operation before counter increment. */ | ||
107 | WARN_ON_ONCE(!rcu_seq_state(*sp)); | ||
108 | WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); | ||
109 | } | ||
110 | |||
111 | /* Take a snapshot of the update side's sequence number. */ | ||
112 | static inline unsigned long rcu_seq_snap(unsigned long *sp) | ||
113 | { | ||
114 | unsigned long s; | ||
115 | |||
116 | s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK; | ||
117 | smp_mb(); /* Above access must not bleed into critical section. */ | ||
118 | return s; | ||
119 | } | ||
120 | |||
121 | /* Return the current value of the update side's sequence number, no ordering. */ | ||
122 | static inline unsigned long rcu_seq_current(unsigned long *sp) | ||
123 | { | ||
124 | return READ_ONCE(*sp); | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * Given a snapshot from rcu_seq_snap(), determine whether or not a | ||
129 | * full update-side operation has occurred. | ||
130 | */ | ||
131 | static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) | ||
132 | { | ||
133 | return ULONG_CMP_GE(READ_ONCE(*sp), s); | ||
134 | } | ||
135 | |||
59 | /* | 136 | /* |
60 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally | 137 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally |
61 | * by call_rcu() and rcu callback execution, and are therefore not part of the | 138 | * by call_rcu() and rcu callback execution, and are therefore not part of the |
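The rcu_seq_*() helpers above keep a phase in the low RCU_SEQ_CTR_SHIFT bits of the counter and the grace-period count in the remaining bits, which is what lets a snapshot taken before a grace period be compared against the counter afterward. A minimal sketch of the intended usage pattern, with a hypothetical my_gp_seq counter:

/* Illustrative only: typical polling pattern built on rcu_seq_*(). */
static unsigned long my_gp_seq;		/* Hypothetical grace-period counter. */

static void my_gp_begin(void)
{
	rcu_seq_start(&my_gp_seq);	/* Low state bits become nonzero. */
}

static void my_gp_finish(void)
{
	rcu_seq_end(&my_gp_seq);	/* Counter advances, state bits clear. */
}

static bool my_gp_poll(unsigned long *sp)
{
	if (!*sp)
		*sp = rcu_seq_snap(&my_gp_seq);	/* GP that will suffice. */
	return rcu_seq_done(&my_gp_seq, *sp);	/* Has that GP completed? */
}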
@@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) | |||
109 | 186 | ||
110 | rcu_lock_acquire(&rcu_callback_map); | 187 | rcu_lock_acquire(&rcu_callback_map); |
111 | if (__is_kfree_rcu_offset(offset)) { | 188 | if (__is_kfree_rcu_offset(offset)) { |
112 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); | 189 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) |
113 | kfree((void *)head - offset); | 190 | kfree((void *)head - offset); |
114 | rcu_lock_release(&rcu_callback_map); | 191 | rcu_lock_release(&rcu_callback_map); |
115 | return true; | 192 | return true; |
116 | } else { | 193 | } else { |
117 | RCU_TRACE(trace_rcu_invoke_callback(rn, head)); | 194 | RCU_TRACE(trace_rcu_invoke_callback(rn, head);) |
118 | head->func(head); | 195 | head->func(head); |
119 | rcu_lock_release(&rcu_callback_map); | 196 | rcu_lock_release(&rcu_callback_map); |
120 | return false; | 197 | return false; |
@@ -144,4 +221,76 @@ void rcu_test_sync_prims(void); | |||
144 | */ | 221 | */ |
145 | extern void resched_cpu(int cpu); | 222 | extern void resched_cpu(int cpu); |
146 | 223 | ||
224 | #if defined(SRCU) || !defined(TINY_RCU) | ||
225 | |||
226 | #include <linux/rcu_node_tree.h> | ||
227 | |||
228 | extern int rcu_num_lvls; | ||
229 | extern int num_rcu_lvl[]; | ||
230 | extern int rcu_num_nodes; | ||
231 | static bool rcu_fanout_exact; | ||
232 | static int rcu_fanout_leaf; | ||
233 | |||
234 | /* | ||
235 | * Compute the per-level fanout, either using the exact fanout specified | ||
236 | * or balancing the tree, depending on the rcu_fanout_exact boot parameter. | ||
237 | */ | ||
238 | static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) | ||
239 | { | ||
240 | int i; | ||
241 | |||
242 | if (rcu_fanout_exact) { | ||
243 | levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; | ||
244 | for (i = rcu_num_lvls - 2; i >= 0; i--) | ||
245 | levelspread[i] = RCU_FANOUT; | ||
246 | } else { | ||
247 | int ccur; | ||
248 | int cprv; | ||
249 | |||
250 | cprv = nr_cpu_ids; | ||
251 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | ||
252 | ccur = levelcnt[i]; | ||
253 | levelspread[i] = (cprv + ccur - 1) / ccur; | ||
254 | cprv = ccur; | ||
255 | } | ||
256 | } | ||
257 | } | ||
258 | |||
259 | /* | ||
260 | * Do a full breadth-first scan of the rcu_node structures for the | ||
261 | * specified rcu_state structure. | ||
262 | */ | ||
263 | #define rcu_for_each_node_breadth_first(rsp, rnp) \ | ||
264 | for ((rnp) = &(rsp)->node[0]; \ | ||
265 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
266 | |||
267 | /* | ||
268 | * Do a breadth-first scan of the non-leaf rcu_node structures for the | ||
269 | * specified rcu_state structure. Note that if there is a singleton | ||
270 | * rcu_node tree with but one rcu_node structure, this loop is a no-op. | ||
271 | */ | ||
272 | #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ | ||
273 | for ((rnp) = &(rsp)->node[0]; \ | ||
274 | (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) | ||
275 | |||
276 | /* | ||
277 | * Scan the leaves of the rcu_node hierarchy for the specified rcu_state | ||
278 | * structure. Note that if there is a singleton rcu_node tree with but | ||
279 | * one rcu_node structure, this loop -will- visit the rcu_node structure. | ||
280 | * It is still a leaf node, even if it is also the root node. | ||
281 | */ | ||
282 | #define rcu_for_each_leaf_node(rsp, rnp) \ | ||
283 | for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ | ||
284 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
285 | |||
286 | /* | ||
287 | * Iterate over all possible CPUs in a leaf RCU node. | ||
288 | */ | ||
289 | #define for_each_leaf_node_possible_cpu(rnp, cpu) \ | ||
290 | for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ | ||
291 | cpu <= rnp->grphi; \ | ||
292 | cpu = cpumask_next((cpu), cpu_possible_mask)) | ||
293 | |||
294 | #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ | ||
295 | |||
147 | #endif /* __LINUX_RCU_H */ | 296 | #endif /* __LINUX_RCU_H */ |
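The iteration macros above assume the rcu_state/rcu_node layout from kernel/rcu/tree.h (the ->node[] array, the ->level[] pointers, and the per-leaf ->grplo/->grphi CPU ranges). A short, purely illustrative sketch of how code inside the RCU core might walk the tree with them:

/* Illustrative only: walking leaves and their possible CPUs. */
static void my_scan_leaves(struct rcu_state *rsp)
{
	struct rcu_node *rnp;
	int cpu;

	rcu_for_each_leaf_node(rsp, rnp) {
		for_each_leaf_node_possible_cpu(rnp, cpu) {
			/* Per-CPU processing for CPUs below this leaf. */
		}
	}
}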
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c new file mode 100644 index 000000000000..2b62a38b080f --- /dev/null +++ b/kernel/rcu/rcu_segcblist.c | |||
@@ -0,0 +1,505 @@ | |||
1 | /* | ||
2 | * RCU segmented callback lists, function definitions | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, you can access it online at | ||
16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2017 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
21 | */ | ||
22 | |||
23 | #include <linux/types.h> | ||
24 | #include <linux/kernel.h> | ||
25 | #include <linux/interrupt.h> | ||
26 | |||
27 | #include "rcu_segcblist.h" | ||
28 | |||
29 | /* Initialize simple callback list. */ | ||
30 | void rcu_cblist_init(struct rcu_cblist *rclp) | ||
31 | { | ||
32 | rclp->head = NULL; | ||
33 | rclp->tail = &rclp->head; | ||
34 | rclp->len = 0; | ||
35 | rclp->len_lazy = 0; | ||
36 | } | ||
37 | |||
38 | /* | ||
39 | * Debug function to actually count the number of callbacks. | ||
40 | * If the number exceeds the limit specified, return -1. | ||
41 | */ | ||
42 | long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) | ||
43 | { | ||
44 | int cnt = 0; | ||
45 | struct rcu_head **rhpp = &rclp->head; | ||
46 | |||
47 | for (;;) { | ||
48 | if (!*rhpp) | ||
49 | return cnt; | ||
50 | if (++cnt > lim) | ||
51 | return -1; | ||
52 | rhpp = &(*rhpp)->next; | ||
53 | } | ||
54 | } | ||
55 | |||
56 | /* | ||
57 | * Dequeue the oldest rcu_head structure from the specified callback | ||
58 | * list. This function assumes that the callback is non-lazy, but | ||
59 | * the caller can later invoke rcu_cblist_dequeued_lazy() if it | ||
60 | * finds otherwise (and if it cares about laziness). This allows | ||
61 | * different users to have different ways of determining laziness. | ||
62 | */ | ||
63 | struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) | ||
64 | { | ||
65 | struct rcu_head *rhp; | ||
66 | |||
67 | rhp = rclp->head; | ||
68 | if (!rhp) | ||
69 | return NULL; | ||
70 | rclp->len--; | ||
71 | rclp->head = rhp->next; | ||
72 | if (!rclp->head) | ||
73 | rclp->tail = &rclp->head; | ||
74 | return rhp; | ||
75 | } | ||
76 | |||
77 | /* | ||
78 | * Initialize an rcu_segcblist structure. | ||
79 | */ | ||
80 | void rcu_segcblist_init(struct rcu_segcblist *rsclp) | ||
81 | { | ||
82 | int i; | ||
83 | |||
84 | BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); | ||
85 | BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); | ||
86 | rsclp->head = NULL; | ||
87 | for (i = 0; i < RCU_CBLIST_NSEGS; i++) | ||
88 | rsclp->tails[i] = &rsclp->head; | ||
89 | rsclp->len = 0; | ||
90 | rsclp->len_lazy = 0; | ||
91 | } | ||
92 | |||
93 | /* | ||
94 | * Disable the specified rcu_segcblist structure, so that callbacks can | ||
95 | * no longer be posted to it. This structure must be empty. | ||
96 | */ | ||
97 | void rcu_segcblist_disable(struct rcu_segcblist *rsclp) | ||
98 | { | ||
99 | WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); | ||
100 | WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); | ||
101 | WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); | ||
102 | rsclp->tails[RCU_NEXT_TAIL] = NULL; | ||
103 | } | ||
104 | |||
105 | /* | ||
106 | * Is the specified segment of the specified rcu_segcblist structure | ||
107 | * empty of callbacks? | ||
108 | */ | ||
109 | bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) | ||
110 | { | ||
111 | if (seg == RCU_DONE_TAIL) | ||
112 | return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; | ||
113 | return rsclp->tails[seg - 1] == rsclp->tails[seg]; | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | * Does the specified rcu_segcblist structure contain callbacks that | ||
118 | * are ready to be invoked? | ||
119 | */ | ||
120 | bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) | ||
121 | { | ||
122 | return rcu_segcblist_is_enabled(rsclp) && | ||
123 | &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * Does the specified rcu_segcblist structure contain callbacks that | ||
128 | * are still pending, that is, not yet ready to be invoked? | ||
129 | */ | ||
130 | bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) | ||
131 | { | ||
132 | return rcu_segcblist_is_enabled(rsclp) && | ||
133 | !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); | ||
134 | } | ||
135 | |||
136 | /* | ||
137 | * Dequeue and return the first ready-to-invoke callback. If there | ||
138 | * are no ready-to-invoke callbacks, return NULL. Disables interrupts | ||
139 | * to avoid interference. Does not protect from interference from other | ||
140 | * CPUs or tasks. | ||
141 | */ | ||
142 | struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) | ||
143 | { | ||
144 | unsigned long flags; | ||
145 | int i; | ||
146 | struct rcu_head *rhp; | ||
147 | |||
148 | local_irq_save(flags); | ||
149 | if (!rcu_segcblist_ready_cbs(rsclp)) { | ||
150 | local_irq_restore(flags); | ||
151 | return NULL; | ||
152 | } | ||
153 | rhp = rsclp->head; | ||
154 | BUG_ON(!rhp); | ||
155 | rsclp->head = rhp->next; | ||
156 | for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { | ||
157 | if (rsclp->tails[i] != &rhp->next) | ||
158 | break; | ||
159 | rsclp->tails[i] = &rsclp->head; | ||
160 | } | ||
161 | smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ | ||
162 | WRITE_ONCE(rsclp->len, rsclp->len - 1); | ||
163 | local_irq_restore(flags); | ||
164 | return rhp; | ||
165 | } | ||
166 | |||
167 | /* | ||
168 | * Account for the fact that a previously dequeued callback turned out | ||
169 | * to be marked as lazy. | ||
170 | */ | ||
171 | void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) | ||
172 | { | ||
173 | unsigned long flags; | ||
174 | |||
175 | local_irq_save(flags); | ||
176 | rsclp->len_lazy--; | ||
177 | local_irq_restore(flags); | ||
178 | } | ||
179 | |||
180 | /* | ||
181 | * Return a pointer to the first callback in the specified rcu_segcblist | ||
182 | * structure. This is useful for diagnostics. | ||
183 | */ | ||
184 | struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) | ||
185 | { | ||
186 | if (rcu_segcblist_is_enabled(rsclp)) | ||
187 | return rsclp->head; | ||
188 | return NULL; | ||
189 | } | ||
190 | |||
191 | /* | ||
192 | * Return a pointer to the first pending callback in the specified | ||
193 | * rcu_segcblist structure. This is useful just after posting a given | ||
194 | * callback -- if that callback is the first pending callback, then | ||
195 | * you cannot rely on someone else having already started up the required | ||
196 | * grace period. | ||
197 | */ | ||
198 | struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) | ||
199 | { | ||
200 | if (rcu_segcblist_is_enabled(rsclp)) | ||
201 | return *rsclp->tails[RCU_DONE_TAIL]; | ||
202 | return NULL; | ||
203 | } | ||
204 | |||
205 | /* | ||
206 | * Does the specified rcu_segcblist structure contain callbacks that | ||
207 | * have not yet been processed beyond having been posted, that is, | ||
208 | * does it contain callbacks in its last segment? | ||
209 | */ | ||
210 | bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) | ||
211 | { | ||
212 | return rcu_segcblist_is_enabled(rsclp) && | ||
213 | !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); | ||
214 | } | ||
215 | |||
216 | /* | ||
217 | * Enqueue the specified callback onto the specified rcu_segcblist | ||
218 | * structure, updating accounting as needed. Note that the ->len | ||
219 | * field may be accessed locklessly, hence the WRITE_ONCE(). | ||
220 | * The ->len field is used by rcu_barrier() and friends to determine | ||
221 | * if it must post a callback on this structure, and it is OK | ||
222 | * for rcu_barrier() to sometimes post callbacks needlessly, but | ||
223 | * absolutely not OK for it to ever miss posting a callback. | ||
224 | */ | ||
225 | void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, | ||
226 | struct rcu_head *rhp, bool lazy) | ||
227 | { | ||
228 | WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ | ||
229 | if (lazy) | ||
230 | rsclp->len_lazy++; | ||
231 | smp_mb(); /* Ensure counts are updated before callback is enqueued. */ | ||
232 | rhp->next = NULL; | ||
233 | *rsclp->tails[RCU_NEXT_TAIL] = rhp; | ||
234 | rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * Entrain the specified callback onto the specified rcu_segcblist at | ||
239 | * the end of the last non-empty segment. If the entire rcu_segcblist | ||
240 | * is empty, make no change, but return false. | ||
241 | * | ||
242 | * This is intended for use by rcu_barrier()-like primitives, -not- | ||
243 | * for normal grace-period use. IMPORTANT: The callback you enqueue | ||
244 | * will wait for all prior callbacks, NOT necessarily for a grace | ||
245 | * period. You have been warned. | ||
246 | */ | ||
247 | bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, | ||
248 | struct rcu_head *rhp, bool lazy) | ||
249 | { | ||
250 | int i; | ||
251 | |||
252 | if (rcu_segcblist_n_cbs(rsclp) == 0) | ||
253 | return false; | ||
254 | WRITE_ONCE(rsclp->len, rsclp->len + 1); | ||
255 | if (lazy) | ||
256 | rsclp->len_lazy++; | ||
257 | smp_mb(); /* Ensure counts are updated before callback is entrained. */ | ||
258 | rhp->next = NULL; | ||
259 | for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) | ||
260 | if (rsclp->tails[i] != rsclp->tails[i - 1]) | ||
261 | break; | ||
262 | *rsclp->tails[i] = rhp; | ||
263 | for (; i <= RCU_NEXT_TAIL; i++) | ||
264 | rsclp->tails[i] = &rhp->next; | ||
265 | return true; | ||
266 | } | ||
267 | |||
268 | /* | ||
269 | * Extract only the counts from the specified rcu_segcblist structure, | ||
270 | * and place them in the specified rcu_cblist structure. This function | ||
271 | * supports both callback orphaning and invocation, hence the separation | ||
272 | * of counts and callbacks. (Callbacks ready for invocation must be | ||
273 | * orphaned and adopted separately from pending callbacks, but counts | ||
274 | * apply to all callbacks. Locking must be used to make sure that | ||
275 | * both orphaned-callbacks lists are consistent.) | ||
276 | */ | ||
277 | void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, | ||
278 | struct rcu_cblist *rclp) | ||
279 | { | ||
280 | rclp->len_lazy += rsclp->len_lazy; | ||
281 | rclp->len += rsclp->len; | ||
282 | rsclp->len_lazy = 0; | ||
283 | WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * Extract only those callbacks ready to be invoked from the specified | ||
288 | * rcu_segcblist structure and place them in the specified rcu_cblist | ||
289 | * structure. | ||
290 | */ | ||
291 | void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, | ||
292 | struct rcu_cblist *rclp) | ||
293 | { | ||
294 | int i; | ||
295 | |||
296 | if (!rcu_segcblist_ready_cbs(rsclp)) | ||
297 | return; /* Nothing to do. */ | ||
298 | *rclp->tail = rsclp->head; | ||
299 | rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; | ||
300 | *rsclp->tails[RCU_DONE_TAIL] = NULL; | ||
301 | rclp->tail = rsclp->tails[RCU_DONE_TAIL]; | ||
302 | for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) | ||
303 | if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) | ||
304 | rsclp->tails[i] = &rsclp->head; | ||
305 | } | ||
306 | |||
307 | /* | ||
308 | * Extract only those callbacks still pending (not yet ready to be | ||
309 | * invoked) from the specified rcu_segcblist structure and place them in | ||
310 | * the specified rcu_cblist structure. Note that this loses information | ||
311 | * about any callbacks that might have been partway done waiting for | ||
312 | * their grace period. Too bad! They will have to start over. | ||
313 | */ | ||
314 | void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, | ||
315 | struct rcu_cblist *rclp) | ||
316 | { | ||
317 | int i; | ||
318 | |||
319 | if (!rcu_segcblist_pend_cbs(rsclp)) | ||
320 | return; /* Nothing to do. */ | ||
321 | *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; | ||
322 | rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; | ||
323 | *rsclp->tails[RCU_DONE_TAIL] = NULL; | ||
324 | for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) | ||
325 | rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; | ||
326 | } | ||
327 | |||
328 | /* | ||
329 | * Insert counts from the specified rcu_cblist structure in the | ||
330 | * specified rcu_segcblist structure. | ||
331 | */ | ||
332 | void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, | ||
333 | struct rcu_cblist *rclp) | ||
334 | { | ||
335 | rsclp->len_lazy += rclp->len_lazy; | ||
336 | /* ->len sampled locklessly. */ | ||
337 | WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); | ||
338 | rclp->len_lazy = 0; | ||
339 | rclp->len = 0; | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * Move callbacks from the specified rcu_cblist to the beginning of the | ||
344 | * done-callbacks segment of the specified rcu_segcblist. | ||
345 | */ | ||
346 | void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, | ||
347 | struct rcu_cblist *rclp) | ||
348 | { | ||
349 | int i; | ||
350 | |||
351 | if (!rclp->head) | ||
352 | return; /* No callbacks to move. */ | ||
353 | *rclp->tail = rsclp->head; | ||
354 | rsclp->head = rclp->head; | ||
355 | for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) | ||
356 | if (&rsclp->head == rsclp->tails[i]) | ||
357 | rsclp->tails[i] = rclp->tail; | ||
358 | else | ||
359 | break; | ||
360 | rclp->head = NULL; | ||
361 | rclp->tail = &rclp->head; | ||
362 | } | ||
363 | |||
364 | /* | ||
365 | * Move callbacks from the specified rcu_cblist to the end of the | ||
366 | * new-callbacks segment of the specified rcu_segcblist. | ||
367 | */ | ||
368 | void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, | ||
369 | struct rcu_cblist *rclp) | ||
370 | { | ||
371 | if (!rclp->head) | ||
372 | return; /* Nothing to do. */ | ||
373 | *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; | ||
374 | rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; | ||
375 | rclp->head = NULL; | ||
376 | rclp->tail = &rclp->head; | ||
377 | } | ||
378 | |||
379 | /* | ||
380 | * Advance the callbacks in the specified rcu_segcblist structure based | ||
381 | * on the current value passed in for the grace-period counter. | ||
382 | */ | ||
383 | void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) | ||
384 | { | ||
385 | int i, j; | ||
386 | |||
387 | WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); | ||
388 | if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) | ||
389 | return; | ||
390 | |||
391 | /* | ||
392 | * Find all callbacks whose ->gp_seq numbers indicate that they | ||
393 | * are ready to invoke, and put them into the RCU_DONE_TAIL segment. | ||
394 | */ | ||
395 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { | ||
396 | if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) | ||
397 | break; | ||
398 | rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; | ||
399 | } | ||
400 | |||
401 | /* If no callbacks moved, nothing more need be done. */ | ||
402 | if (i == RCU_WAIT_TAIL) | ||
403 | return; | ||
404 | |||
405 | /* Clean up tail pointers that might have been misordered above. */ | ||
406 | for (j = RCU_WAIT_TAIL; j < i; j++) | ||
407 | rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; | ||
408 | |||
409 | /* | ||
410 | * Callbacks moved, so clean up the misordered ->tails[] pointers | ||
411 | * that now point into the middle of the list of ready-to-invoke | ||
412 | * callbacks. The overall effect is to copy down the later pointers | ||
413 | * into the gap that was created by the now-ready segments. | ||
414 | */ | ||
415 | for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { | ||
416 | if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) | ||
417 | break; /* No more callbacks. */ | ||
418 | rsclp->tails[j] = rsclp->tails[i]; | ||
419 | rsclp->gp_seq[j] = rsclp->gp_seq[i]; | ||
420 | } | ||
421 | } | ||
422 | |||
423 | /* | ||
424 | * "Accelerate" callbacks based on more-accurate grace-period information. | ||
425 | * The reason for this is that RCU does not synchronize the beginnings and | ||
426 | * ends of grace periods, and that callbacks are posted locally. This in | ||
427 | * turn means that the callbacks must be labelled conservatively early | ||
428 | * on, as getting exact information would degrade both performance and | ||
429 | * scalability. When more accurate grace-period information becomes | ||
430 | * available, previously posted callbacks can be "accelerated", marking | ||
431 | * them to complete at the end of the earlier grace period. | ||
432 | * | ||
433 | * This function operates on an rcu_segcblist structure, and also the | ||
434 | * grace-period sequence number seq at which new callbacks would become | ||
435 | * ready to invoke. Returns true if there are callbacks that won't be | ||
436 | * ready to invoke until seq, false otherwise. | ||
437 | */ | ||
438 | bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) | ||
439 | { | ||
440 | int i; | ||
441 | |||
442 | WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); | ||
443 | if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) | ||
444 | return false; | ||
445 | |||
446 | /* | ||
447 | * Find the segment preceding the oldest segment of callbacks | ||
448 | * whose ->gp_seq[] completion is at or after that passed in via | ||
449 | * "seq", skipping any empty segments. This oldest segment, along | ||
450 | * with any later segments, can be merged in with any newly arrived | ||
451 | * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" | ||
452 | * as their ->gp_seq[] grace-period completion sequence number. | ||
453 | */ | ||
454 | for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) | ||
455 | if (rsclp->tails[i] != rsclp->tails[i - 1] && | ||
456 | ULONG_CMP_LT(rsclp->gp_seq[i], seq)) | ||
457 | break; | ||
458 | |||
459 | /* | ||
460 | * If all the segments contain callbacks that correspond to | ||
461 | * earlier grace-period sequence numbers than "seq", leave. | ||
462 | * Assuming that the rcu_segcblist structure has enough | ||
463 | * segments in its arrays, this can only happen if some of | ||
464 | * the non-done segments contain callbacks that really are | ||
465 | * ready to invoke. This situation will get straightened | ||
466 | * out by the next call to rcu_segcblist_advance(). | ||
467 | * | ||
468 | * Also advance to the oldest segment of callbacks whose | ||
469 | * ->gp_seq[] completion is at or after that passed in via "seq", | ||
470 | * skipping any empty segments. | ||
471 | */ | ||
472 | if (++i >= RCU_NEXT_TAIL) | ||
473 | return false; | ||
474 | |||
475 | /* | ||
476 | * Merge all later callbacks, including newly arrived callbacks, | ||
477 | * into the segment located by the for-loop above. Assign "seq" | ||
478 | * as the ->gp_seq[] value in order to correctly handle the case | ||
479 | * where there were no pending callbacks in the rcu_segcblist | ||
480 | * structure other than in the RCU_NEXT_TAIL segment. | ||
481 | */ | ||
482 | for (; i < RCU_NEXT_TAIL; i++) { | ||
483 | rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; | ||
484 | rsclp->gp_seq[i] = seq; | ||
485 | } | ||
486 | return true; | ||
487 | } | ||
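The comment above describes acceleration in prose; for concreteness, this is the advance-then-accelerate pairing used by the srcutree.c code later in this series (a sketch only — sp and sdp stand for an srcu_struct and one of its per-CPU srcu_data structures):

	/* Catch the list up with the most recently completed grace period... */
	rcu_segcblist_advance(&sdp->srcu_cblist,
			      rcu_seq_current(&sp->srcu_gp_seq));
	/* ...then tag the remaining callbacks with a snapshot of the next one. */
	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
				       rcu_seq_snap(&sp->srcu_gp_seq));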
488 | |||
489 | /* | ||
490 | * Scan the specified rcu_segcblist structure for callbacks that need | ||
491 | * a grace period later than the one specified by "seq". We don't look | ||
492 | * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't | ||
493 | * have a grace-period sequence number. | ||
494 | */ | ||
495 | bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, | ||
496 | unsigned long seq) | ||
497 | { | ||
498 | int i; | ||
499 | |||
500 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) | ||
501 | if (rsclp->tails[i - 1] != rsclp->tails[i] && | ||
502 | ULONG_CMP_LT(seq, rsclp->gp_seq[i])) | ||
503 | return true; | ||
504 | return false; | ||
505 | } | ||
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h new file mode 100644 index 000000000000..6e36e36478cd --- /dev/null +++ b/kernel/rcu/rcu_segcblist.h | |||
@@ -0,0 +1,164 @@ | |||
1 | /* | ||
2 | * RCU segmented callback lists, internal-to-rcu header file | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, you can access it online at | ||
16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2017 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
21 | */ | ||
22 | |||
23 | #include <linux/rcu_segcblist.h> | ||
24 | |||
25 | /* | ||
26 | * Account for the fact that a previously dequeued callback turned out | ||
27 | * to be marked as lazy. | ||
28 | */ | ||
29 | static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) | ||
30 | { | ||
31 | rclp->len_lazy--; | ||
32 | } | ||
33 | |||
34 | /* | ||
35 | * Interim function to return rcu_cblist head pointer. Longer term, the | ||
36 | * rcu_cblist will be used more pervasively, removing the need for this | ||
37 | * function. | ||
38 | */ | ||
39 | static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) | ||
40 | { | ||
41 | return rclp->head; | ||
42 | } | ||
43 | |||
44 | /* | ||
45 | * Interim function to return rcu_cblist tail pointer. Longer term, the | ||
46 | * rcu_cblist will be used more pervasively, removing the need for this | ||
47 | * function. | ||
48 | */ | ||
49 | static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) | ||
50 | { | ||
51 | WARN_ON_ONCE(!rclp->head); | ||
52 | return rclp->tail; | ||
53 | } | ||
54 | |||
55 | void rcu_cblist_init(struct rcu_cblist *rclp); | ||
56 | long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim); | ||
57 | struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); | ||
58 | |||
59 | /* | ||
60 | * Is the specified rcu_segcblist structure empty? | ||
61 | * | ||
62 | * But careful! The fact that the ->head field is NULL does not | ||
63 | * necessarily imply that there are no callbacks associated with | ||
64 | * this structure. When callbacks are being invoked, they are | ||
65 | * removed as a group. If callback invocation must be preempted, | ||
66 | * the remaining callbacks will be added back to the list. Either | ||
67 | * way, the counts are updated later. | ||
68 | * | ||
69 | * So it is often the case that rcu_segcblist_n_cbs() should be used | ||
70 | * instead. | ||
71 | */ | ||
72 | static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) | ||
73 | { | ||
74 | return !rsclp->head; | ||
75 | } | ||
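As the comment above warns, a NULL ->head can coexist with a nonzero count while callbacks are being invoked, so callers deciding whether work remains would typically consult the count instead (hedged sketch; rsclp is assumed to point at some live rcu_segcblist):

	/* Prefer the count when deciding whether work remains. */
	bool work_pending = rcu_segcblist_n_cbs(rsclp) != 0;
	/* By contrast, !rcu_segcblist_empty(rsclp) can transiently report
	 * "empty" while a batch of callbacks is being invoked. */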
76 | |||
77 | /* Return number of callbacks in segmented callback list. */ | ||
78 | static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) | ||
79 | { | ||
80 | return READ_ONCE(rsclp->len); | ||
81 | } | ||
82 | |||
83 | /* Return number of lazy callbacks in segmented callback list. */ | ||
84 | static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) | ||
85 | { | ||
86 | return rsclp->len_lazy; | ||
87 | } | ||
88 | |||
89 | /* Return number of non-lazy callbacks in segmented callback list. */ | ||
90 | static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) | ||
91 | { | ||
92 | return rsclp->len - rsclp->len_lazy; | ||
93 | } | ||
94 | |||
95 | /* | ||
96 | * Is the specified rcu_segcblist enabled, for example, not corresponding | ||
97 | * to an offline or callback-offloaded CPU? | ||
98 | */ | ||
99 | static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) | ||
100 | { | ||
101 | return !!rsclp->tails[RCU_NEXT_TAIL]; | ||
102 | } | ||
103 | |||
104 | /* | ||
105 | * Are all segments following the specified segment of the specified | ||
106 | * rcu_segcblist structure empty of callbacks? (The specified | ||
107 | * segment might well contain callbacks.) | ||
108 | */ | ||
109 | static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) | ||
110 | { | ||
111 | return !*rsclp->tails[seg]; | ||
112 | } | ||
113 | |||
114 | /* | ||
115 | * Interim function to return rcu_segcblist head pointer. Longer term, the | ||
116 | * rcu_segcblist will be used more pervasively, removing the need for this | ||
117 | * function. | ||
118 | */ | ||
119 | static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) | ||
120 | { | ||
121 | return rsclp->head; | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Interim function to return rcu_segcblist tail pointer. Longer term, the | ||
126 | * rcu_segcblist will be used more pervasively, removing the need for this | ||
127 | * function. | ||
128 | */ | ||
129 | static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) | ||
130 | { | ||
131 | WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); | ||
132 | return rsclp->tails[RCU_NEXT_TAIL]; | ||
133 | } | ||
134 | |||
135 | void rcu_segcblist_init(struct rcu_segcblist *rsclp); | ||
136 | void rcu_segcblist_disable(struct rcu_segcblist *rsclp); | ||
137 | bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg); | ||
138 | bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); | ||
139 | bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); | ||
140 | struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp); | ||
141 | void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp); | ||
142 | struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); | ||
143 | struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); | ||
144 | bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp); | ||
145 | void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, | ||
146 | struct rcu_head *rhp, bool lazy); | ||
147 | bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, | ||
148 | struct rcu_head *rhp, bool lazy); | ||
149 | void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, | ||
150 | struct rcu_cblist *rclp); | ||
151 | void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, | ||
152 | struct rcu_cblist *rclp); | ||
153 | void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, | ||
154 | struct rcu_cblist *rclp); | ||
155 | void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, | ||
156 | struct rcu_cblist *rclp); | ||
157 | void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, | ||
158 | struct rcu_cblist *rclp); | ||
159 | void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, | ||
160 | struct rcu_cblist *rclp); | ||
161 | void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); | ||
162 | bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); | ||
163 | bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, | ||
164 | unsigned long seq); | ||
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index cccc417a8135..ae6e574d4cf5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -559,19 +559,34 @@ static void srcu_torture_barrier(void) | |||
559 | 559 | ||
560 | static void srcu_torture_stats(void) | 560 | static void srcu_torture_stats(void) |
561 | { | 561 | { |
562 | int cpu; | 562 | int __maybe_unused cpu; |
563 | int idx = srcu_ctlp->completed & 0x1; | 563 | int idx; |
564 | 564 | ||
565 | pr_alert("%s%s per-CPU(idx=%d):", | 565 | #if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) |
566 | #ifdef CONFIG_TREE_SRCU | ||
567 | idx = srcu_ctlp->srcu_idx & 0x1; | ||
568 | #else /* #ifdef CONFIG_TREE_SRCU */ | ||
569 | idx = srcu_ctlp->completed & 0x1; | ||
570 | #endif /* #else #ifdef CONFIG_TREE_SRCU */ | ||
571 | pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", | ||
566 | torture_type, TORTURE_FLAG, idx); | 572 | torture_type, TORTURE_FLAG, idx); |
567 | for_each_possible_cpu(cpu) { | 573 | for_each_possible_cpu(cpu) { |
568 | unsigned long l0, l1; | 574 | unsigned long l0, l1; |
569 | unsigned long u0, u1; | 575 | unsigned long u0, u1; |
570 | long c0, c1; | 576 | long c0, c1; |
571 | struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); | 577 | #ifdef CONFIG_TREE_SRCU |
578 | struct srcu_data *counts; | ||
572 | 579 | ||
580 | counts = per_cpu_ptr(srcu_ctlp->sda, cpu); | ||
581 | u0 = counts->srcu_unlock_count[!idx]; | ||
582 | u1 = counts->srcu_unlock_count[idx]; | ||
583 | #else /* #ifdef CONFIG_TREE_SRCU */ | ||
584 | struct srcu_array *counts; | ||
585 | |||
586 | counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); | ||
573 | u0 = counts->unlock_count[!idx]; | 587 | u0 = counts->unlock_count[!idx]; |
574 | u1 = counts->unlock_count[idx]; | 588 | u1 = counts->unlock_count[idx]; |
589 | #endif /* #else #ifdef CONFIG_TREE_SRCU */ | ||
575 | 590 | ||
576 | /* | 591 | /* |
577 | * Make sure that a lock is always counted if the corresponding | 592 | * Make sure that a lock is always counted if the corresponding |
@@ -579,14 +594,26 @@ static void srcu_torture_stats(void) | |||
579 | */ | 594 | */ |
580 | smp_rmb(); | 595 | smp_rmb(); |
581 | 596 | ||
597 | #ifdef CONFIG_TREE_SRCU | ||
598 | l0 = counts->srcu_lock_count[!idx]; | ||
599 | l1 = counts->srcu_lock_count[idx]; | ||
600 | #else /* #ifdef CONFIG_TREE_SRCU */ | ||
582 | l0 = counts->lock_count[!idx]; | 601 | l0 = counts->lock_count[!idx]; |
583 | l1 = counts->lock_count[idx]; | 602 | l1 = counts->lock_count[idx]; |
603 | #endif /* #else #ifdef CONFIG_TREE_SRCU */ | ||
584 | 604 | ||
585 | c0 = l0 - u0; | 605 | c0 = l0 - u0; |
586 | c1 = l1 - u1; | 606 | c1 = l1 - u1; |
587 | pr_cont(" %d(%ld,%ld)", cpu, c0, c1); | 607 | pr_cont(" %d(%ld,%ld)", cpu, c0, c1); |
588 | } | 608 | } |
589 | pr_cont("\n"); | 609 | pr_cont("\n"); |
610 | #elif defined(CONFIG_TINY_SRCU) | ||
611 | idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; | ||
612 | pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", | ||
613 | torture_type, TORTURE_FLAG, idx, | ||
614 | READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), | ||
615 | READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); | ||
616 | #endif | ||
590 | } | 617 | } |
591 | 618 | ||
592 | static void srcu_torture_synchronize_expedited(void) | 619 | static void srcu_torture_synchronize_expedited(void) |
@@ -1333,12 +1360,14 @@ rcu_torture_stats_print(void) | |||
1333 | cur_ops->stats(); | 1360 | cur_ops->stats(); |
1334 | if (rtcv_snap == rcu_torture_current_version && | 1361 | if (rtcv_snap == rcu_torture_current_version && |
1335 | rcu_torture_current != NULL) { | 1362 | rcu_torture_current != NULL) { |
1336 | int __maybe_unused flags; | 1363 | int __maybe_unused flags = 0; |
1337 | unsigned long __maybe_unused gpnum; | 1364 | unsigned long __maybe_unused gpnum = 0; |
1338 | unsigned long __maybe_unused completed; | 1365 | unsigned long __maybe_unused completed = 0; |
1339 | 1366 | ||
1340 | rcutorture_get_gp_data(cur_ops->ttype, | 1367 | rcutorture_get_gp_data(cur_ops->ttype, |
1341 | &flags, &gpnum, &completed); | 1368 | &flags, &gpnum, &completed); |
1369 | srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, | ||
1370 | &flags, &gpnum, &completed); | ||
1342 | wtp = READ_ONCE(writer_task); | 1371 | wtp = READ_ONCE(writer_task); |
1343 | pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", | 1372 | pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", |
1344 | rcu_torture_writer_state_getname(), | 1373 | rcu_torture_writer_state_getname(), |
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index ef3bcfb15b39..584d8a983883 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
@@ -22,7 +22,7 @@ | |||
22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | 22 | * Lai Jiangshan <laijs@cn.fujitsu.com> |
23 | * | 23 | * |
24 | * For detailed explanation of Read-Copy Update mechanism see - | 24 | * For detailed explanation of Read-Copy Update mechanism see - |
25 | * Documentation/RCU/ *.txt | 25 | * Documentation/RCU/ *.txt |
26 | * | 26 | * |
27 | */ | 27 | */ |
28 | 28 | ||
@@ -243,8 +243,14 @@ static bool srcu_readers_active(struct srcu_struct *sp) | |||
243 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure | 243 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure |
244 | * @sp: structure to clean up. | 244 | * @sp: structure to clean up. |
245 | * | 245 | * |
246 | * Must invoke this after you are finished using a given srcu_struct that | 246 | * Must invoke this only after you are finished using a given srcu_struct |
247 | * was initialized via init_srcu_struct(), else you leak memory. | 247 | * that was initialized via init_srcu_struct(). This code does some |
248 | probabilistic checking, spotting late uses of srcu_read_lock(), | ||
249 | * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu(). | ||
250 | * If any such late uses are detected, the per-CPU memory associated with | ||
251 | * the srcu_struct is simply leaked and WARN_ON() is invoked. If the | ||
252 | * caller frees the srcu_struct itself, a use-after-free crash will likely | ||
253 | * ensue, but at least there will be a warning printed. | ||
248 | */ | 254 | */ |
249 | void cleanup_srcu_struct(struct srcu_struct *sp) | 255 | void cleanup_srcu_struct(struct srcu_struct *sp) |
250 | { | 256 | { |
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c new file mode 100644 index 000000000000..36e1f82faed1 --- /dev/null +++ b/kernel/rcu/srcutiny.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * Sleepable Read-Copy Update mechanism for mutual exclusion, | ||
3 | * tiny version for non-preemptible single-CPU use. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, you can access it online at | ||
17 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2017 | ||
20 | * | ||
21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
22 | */ | ||
23 | |||
24 | #include <linux/export.h> | ||
25 | #include <linux/mutex.h> | ||
26 | #include <linux/preempt.h> | ||
27 | #include <linux/rcupdate_wait.h> | ||
28 | #include <linux/sched.h> | ||
29 | #include <linux/delay.h> | ||
30 | #include <linux/srcu.h> | ||
31 | |||
32 | #include <linux/rcu_node_tree.h> | ||
33 | #include "rcu_segcblist.h" | ||
34 | #include "rcu.h" | ||
35 | |||
36 | static int init_srcu_struct_fields(struct srcu_struct *sp) | ||
37 | { | ||
38 | sp->srcu_lock_nesting[0] = 0; | ||
39 | sp->srcu_lock_nesting[1] = 0; | ||
40 | init_swait_queue_head(&sp->srcu_wq); | ||
41 | sp->srcu_gp_seq = 0; | ||
42 | rcu_segcblist_init(&sp->srcu_cblist); | ||
43 | sp->srcu_gp_running = false; | ||
44 | sp->srcu_gp_waiting = false; | ||
45 | sp->srcu_idx = 0; | ||
46 | INIT_WORK(&sp->srcu_work, srcu_drive_gp); | ||
47 | return 0; | ||
48 | } | ||
49 | |||
50 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
51 | |||
52 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | ||
53 | struct lock_class_key *key) | ||
54 | { | ||
55 | /* Don't re-initialize a lock while it is held. */ | ||
56 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | ||
57 | lockdep_init_map(&sp->dep_map, name, key, 0); | ||
58 | return init_srcu_struct_fields(sp); | ||
59 | } | ||
60 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | ||
61 | |||
62 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
63 | |||
64 | /* | ||
65 | * init_srcu_struct - initialize a sleep-RCU structure | ||
66 | * @sp: structure to initialize. | ||
67 | * | ||
68 | * Must invoke this on a given srcu_struct before passing that srcu_struct | ||
69 | * to any other function. Each srcu_struct represents a separate domain | ||
70 | * of SRCU protection. | ||
71 | */ | ||
72 | int init_srcu_struct(struct srcu_struct *sp) | ||
73 | { | ||
74 | return init_srcu_struct_fields(sp); | ||
75 | } | ||
76 | EXPORT_SYMBOL_GPL(init_srcu_struct); | ||
77 | |||
78 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
79 | |||
80 | /* | ||
81 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure | ||
82 | * @sp: structure to clean up. | ||
83 | * | ||
84 | * Must invoke this after you are finished using a given srcu_struct that | ||
85 | * was initialized via init_srcu_struct(), else you leak memory. | ||
86 | */ | ||
87 | void cleanup_srcu_struct(struct srcu_struct *sp) | ||
88 | { | ||
89 | WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); | ||
90 | flush_work(&sp->srcu_work); | ||
91 | WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); | ||
92 | WARN_ON(sp->srcu_gp_running); | ||
93 | WARN_ON(sp->srcu_gp_waiting); | ||
94 | WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); | ||
95 | } | ||
96 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | ||
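For reference, a minimal module-style lifecycle sketch using the init/cleanup pair above; my_srcu, my_mod_init(), and my_mod_exit() are hypothetical names, not part of this patch:

	#include <linux/module.h>
	#include <linux/srcu.h>

	static struct srcu_struct my_srcu;

	static int __init my_mod_init(void)
	{
		return init_srcu_struct(&my_srcu);
	}

	static void __exit my_mod_exit(void)
	{
		/* All readers and callbacks must be finished by this point. */
		cleanup_srcu_struct(&my_srcu);
	}

	module_init(my_mod_init);
	module_exit(my_mod_exit);
	MODULE_LICENSE("GPL");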
97 | |||
98 | /* | ||
99 | * Counts the new reader in the appropriate ->srcu_lock_nesting[] element | ||
100 | * of the srcu_struct. Must be called from process context. | ||
101 | * Returns an index that must be passed to the matching srcu_read_unlock(). | ||
102 | */ | ||
103 | int __srcu_read_lock(struct srcu_struct *sp) | ||
104 | { | ||
105 | int idx; | ||
106 | |||
107 | idx = READ_ONCE(sp->srcu_idx); | ||
108 | WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); | ||
109 | return idx; | ||
110 | } | ||
111 | EXPORT_SYMBOL_GPL(__srcu_read_lock); | ||
112 | |||
113 | /* | ||
114 | * Removes the count for the old reader from the appropriate element of | ||
115 | * the srcu_struct. Must be called from process context. | ||
116 | */ | ||
117 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | ||
118 | { | ||
119 | int newval = sp->srcu_lock_nesting[idx] - 1; | ||
120 | |||
121 | WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); | ||
122 | if (!newval && READ_ONCE(sp->srcu_gp_waiting)) | ||
123 | swake_up(&sp->srcu_wq); | ||
124 | } | ||
125 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | ||
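Callers reach these two functions through srcu_read_lock() and srcu_read_unlock(); a hedged reader-side sketch follows, in which my_srcu, gp, struct foo, and do_something_with() are hypothetical:

	int idx;
	struct foo *p;

	idx = srcu_read_lock(&my_srcu);
	p = srcu_dereference(gp, &my_srcu);
	if (p)
		do_something_with(p);	/* May sleep: this is SRCU. */
	srcu_read_unlock(&my_srcu, idx);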
126 | |||
127 | /* | ||
128 | * Workqueue handler to drive one grace period and invoke any callbacks | ||
129 | * that become ready as a result. Single-CPU and !PREEMPT operation | ||
130 | * means that we get away with murder on synchronization. ;-) | ||
131 | */ | ||
132 | void srcu_drive_gp(struct work_struct *wp) | ||
133 | { | ||
134 | int idx; | ||
135 | struct rcu_cblist ready_cbs; | ||
136 | struct srcu_struct *sp; | ||
137 | struct rcu_head *rhp; | ||
138 | |||
139 | sp = container_of(wp, struct srcu_struct, srcu_work); | ||
140 | if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) | ||
141 | return; /* Already running or nothing to do. */ | ||
142 | |||
143 | /* Tag recently arrived callbacks and wait for readers. */ | ||
144 | WRITE_ONCE(sp->srcu_gp_running, true); | ||
145 | rcu_segcblist_accelerate(&sp->srcu_cblist, | ||
146 | rcu_seq_snap(&sp->srcu_gp_seq)); | ||
147 | rcu_seq_start(&sp->srcu_gp_seq); | ||
148 | idx = sp->srcu_idx; | ||
149 | WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); | ||
150 | WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ | ||
151 | swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); | ||
152 | WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ | ||
153 | rcu_seq_end(&sp->srcu_gp_seq); | ||
154 | |||
155 | /* Update callback list based on GP, and invoke ready callbacks. */ | ||
156 | rcu_segcblist_advance(&sp->srcu_cblist, | ||
157 | rcu_seq_current(&sp->srcu_gp_seq)); | ||
158 | if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { | ||
159 | rcu_cblist_init(&ready_cbs); | ||
160 | local_irq_disable(); | ||
161 | rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); | ||
162 | local_irq_enable(); | ||
163 | rhp = rcu_cblist_dequeue(&ready_cbs); | ||
164 | for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { | ||
165 | local_bh_disable(); | ||
166 | rhp->func(rhp); | ||
167 | local_bh_enable(); | ||
168 | } | ||
169 | local_irq_disable(); | ||
170 | rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); | ||
171 | local_irq_enable(); | ||
172 | } | ||
173 | WRITE_ONCE(sp->srcu_gp_running, false); | ||
174 | |||
175 | /* | ||
176 | * If more callbacks, reschedule ourselves. This can race with | ||
177 | * a call_srcu() at interrupt level, but the ->srcu_gp_running | ||
178 | * checks will straighten that out. | ||
179 | */ | ||
180 | if (!rcu_segcblist_empty(&sp->srcu_cblist)) | ||
181 | schedule_work(&sp->srcu_work); | ||
182 | } | ||
183 | EXPORT_SYMBOL_GPL(srcu_drive_gp); | ||
184 | |||
185 | /* | ||
186 | * Enqueue an SRCU callback on the specified srcu_struct structure, | ||
187 | * initiating grace-period processing if it is not already running. | ||
188 | */ | ||
189 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | ||
190 | rcu_callback_t func) | ||
191 | { | ||
192 | unsigned long flags; | ||
193 | |||
194 | head->func = func; | ||
195 | local_irq_save(flags); | ||
196 | rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); | ||
197 | local_irq_restore(flags); | ||
198 | if (!READ_ONCE(sp->srcu_gp_running)) | ||
199 | schedule_work(&sp->srcu_work); | ||
200 | } | ||
201 | EXPORT_SYMBOL_GPL(call_srcu); | ||
202 | |||
203 | /* | ||
204 | * synchronize_srcu - wait for prior SRCU read-side critical-section completion | ||
205 | */ | ||
206 | void synchronize_srcu(struct srcu_struct *sp) | ||
207 | { | ||
208 | struct rcu_synchronize rs; | ||
209 | |||
210 | init_rcu_head_on_stack(&rs.head); | ||
211 | init_completion(&rs.completion); | ||
212 | call_srcu(sp, &rs.head, wakeme_after_rcu); | ||
213 | wait_for_completion(&rs.completion); | ||
214 | destroy_rcu_head_on_stack(&rs.head); | ||
215 | } | ||
216 | EXPORT_SYMBOL_GPL(synchronize_srcu); | ||
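A typical update-side pairing for synchronize_srcu(), sketched under the assumption of a hypothetical gp pointer published to readers of my_srcu and updated under my_lock:

	spin_lock(&my_lock);
	old = rcu_dereference_protected(gp, lockdep_is_held(&my_lock));
	rcu_assign_pointer(gp, new);
	spin_unlock(&my_lock);
	synchronize_srcu(&my_srcu);	/* Wait for pre-existing readers. */
	kfree(old);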
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c new file mode 100644 index 000000000000..3ae8474557df --- /dev/null +++ b/kernel/rcu/srcutree.c | |||
@@ -0,0 +1,1155 @@ | |||
1 | /* | ||
2 | * Sleepable Read-Copy Update mechanism for mutual exclusion. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, you can access it online at | ||
16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2006 | ||
19 | * Copyright (C) Fujitsu, 2012 | ||
20 | * | ||
21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | ||
23 | * | ||
24 | * For detailed explanation of Read-Copy Update mechanism see - | ||
25 | * Documentation/RCU/ *.txt | ||
26 | * | ||
27 | */ | ||
28 | |||
29 | #include <linux/export.h> | ||
30 | #include <linux/mutex.h> | ||
31 | #include <linux/percpu.h> | ||
32 | #include <linux/preempt.h> | ||
33 | #include <linux/rcupdate_wait.h> | ||
34 | #include <linux/sched.h> | ||
35 | #include <linux/smp.h> | ||
36 | #include <linux/delay.h> | ||
37 | #include <linux/module.h> | ||
38 | #include <linux/srcu.h> | ||
39 | |||
40 | #include "rcu.h" | ||
41 | #include "rcu_segcblist.h" | ||
42 | |||
43 | ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ | ||
44 | module_param(exp_holdoff, ulong, 0444); | ||
45 | |||
46 | static void srcu_invoke_callbacks(struct work_struct *work); | ||
47 | static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); | ||
48 | |||
49 | /* | ||
50 | * Initialize SRCU combining tree. Note that statically allocated | ||
51 | * srcu_struct structures might already have srcu_read_lock() and | ||
52 | * srcu_read_unlock() running against them. So if the is_static parameter | ||
53 | * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. | ||
54 | */ | ||
55 | static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) | ||
56 | { | ||
57 | int cpu; | ||
58 | int i; | ||
59 | int level = 0; | ||
60 | int levelspread[RCU_NUM_LVLS]; | ||
61 | struct srcu_data *sdp; | ||
62 | struct srcu_node *snp; | ||
63 | struct srcu_node *snp_first; | ||
64 | |||
65 | /* Work out the overall tree geometry. */ | ||
66 | sp->level[0] = &sp->node[0]; | ||
67 | for (i = 1; i < rcu_num_lvls; i++) | ||
68 | sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1]; | ||
69 | rcu_init_levelspread(levelspread, num_rcu_lvl); | ||
70 | |||
71 | /* Each pass through this loop initializes one srcu_node structure. */ | ||
72 | rcu_for_each_node_breadth_first(sp, snp) { | ||
73 | spin_lock_init(&snp->lock); | ||
74 | WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != | ||
75 | ARRAY_SIZE(snp->srcu_data_have_cbs)); | ||
76 | for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { | ||
77 | snp->srcu_have_cbs[i] = 0; | ||
78 | snp->srcu_data_have_cbs[i] = 0; | ||
79 | } | ||
80 | snp->srcu_gp_seq_needed_exp = 0; | ||
81 | snp->grplo = -1; | ||
82 | snp->grphi = -1; | ||
83 | if (snp == &sp->node[0]) { | ||
84 | /* Root node, special case. */ | ||
85 | snp->srcu_parent = NULL; | ||
86 | continue; | ||
87 | } | ||
88 | |||
89 | /* Non-root node. */ | ||
90 | if (snp == sp->level[level + 1]) | ||
91 | level++; | ||
92 | snp->srcu_parent = sp->level[level - 1] + | ||
93 | (snp - sp->level[level]) / | ||
94 | levelspread[level - 1]; | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * Initialize the per-CPU srcu_data array, which feeds into the | ||
99 | * leaves of the srcu_node tree. | ||
100 | */ | ||
101 | WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != | ||
102 | ARRAY_SIZE(sdp->srcu_unlock_count)); | ||
103 | level = rcu_num_lvls - 1; | ||
104 | snp_first = sp->level[level]; | ||
105 | for_each_possible_cpu(cpu) { | ||
106 | sdp = per_cpu_ptr(sp->sda, cpu); | ||
107 | spin_lock_init(&sdp->lock); | ||
108 | rcu_segcblist_init(&sdp->srcu_cblist); | ||
109 | sdp->srcu_cblist_invoking = false; | ||
110 | sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; | ||
111 | sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq; | ||
112 | sdp->mynode = &snp_first[cpu / levelspread[level]]; | ||
113 | for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { | ||
114 | if (snp->grplo < 0) | ||
115 | snp->grplo = cpu; | ||
116 | snp->grphi = cpu; | ||
117 | } | ||
118 | sdp->cpu = cpu; | ||
119 | INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); | ||
120 | sdp->sp = sp; | ||
121 | sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); | ||
122 | if (is_static) | ||
123 | continue; | ||
124 | |||
125 | /* Dynamically allocated, better be no srcu_read_locks()! */ | ||
126 | for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) { | ||
127 | sdp->srcu_lock_count[i] = 0; | ||
128 | sdp->srcu_unlock_count[i] = 0; | ||
129 | } | ||
130 | } | ||
131 | } | ||
132 | |||
133 | /* | ||
134 | * Initialize non-compile-time initialized fields, including the | ||
135 | * associated srcu_node and srcu_data structures. The is_static | ||
136 | * parameter is passed through to init_srcu_struct_nodes(), and | ||
137 | * also tells us that ->sda has already been wired up to srcu_data. | ||
138 | */ | ||
139 | static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) | ||
140 | { | ||
141 | mutex_init(&sp->srcu_cb_mutex); | ||
142 | mutex_init(&sp->srcu_gp_mutex); | ||
143 | sp->srcu_idx = 0; | ||
144 | sp->srcu_gp_seq = 0; | ||
145 | sp->srcu_barrier_seq = 0; | ||
146 | mutex_init(&sp->srcu_barrier_mutex); | ||
147 | atomic_set(&sp->srcu_barrier_cpu_cnt, 0); | ||
148 | INIT_DELAYED_WORK(&sp->work, process_srcu); | ||
149 | if (!is_static) | ||
150 | sp->sda = alloc_percpu(struct srcu_data); | ||
151 | init_srcu_struct_nodes(sp, is_static); | ||
152 | sp->srcu_gp_seq_needed_exp = 0; | ||
153 | sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); | ||
154 | smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ | ||
155 | return sp->sda ? 0 : -ENOMEM; | ||
156 | } | ||
157 | |||
158 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
159 | |||
160 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | ||
161 | struct lock_class_key *key) | ||
162 | { | ||
163 | /* Don't re-initialize a lock while it is held. */ | ||
164 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | ||
165 | lockdep_init_map(&sp->dep_map, name, key, 0); | ||
166 | spin_lock_init(&sp->gp_lock); | ||
167 | return init_srcu_struct_fields(sp, false); | ||
168 | } | ||
169 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | ||
170 | |||
171 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
172 | |||
173 | /** | ||
174 | * init_srcu_struct - initialize a sleep-RCU structure | ||
175 | * @sp: structure to initialize. | ||
176 | * | ||
177 | * Must invoke this on a given srcu_struct before passing that srcu_struct | ||
178 | * to any other function. Each srcu_struct represents a separate domain | ||
179 | * of SRCU protection. | ||
180 | */ | ||
181 | int init_srcu_struct(struct srcu_struct *sp) | ||
182 | { | ||
183 | spin_lock_init(&sp->gp_lock); | ||
184 | return init_srcu_struct_fields(sp, false); | ||
185 | } | ||
186 | EXPORT_SYMBOL_GPL(init_srcu_struct); | ||
187 | |||
188 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
189 | |||
190 | /* | ||
191 | * First-use initialization of statically allocated srcu_struct | ||
192 | * structure. Wiring up the combining tree is more than can be | ||
193 | * done with compile-time initialization, so this check is added | ||
194 | * to each update-side SRCU primitive. Use ->gp_lock, which -is- | ||
195 | * compile-time initialized, to resolve races involving multiple | ||
196 | * CPUs trying to garner first-use privileges. | ||
197 | */ | ||
198 | static void check_init_srcu_struct(struct srcu_struct *sp) | ||
199 | { | ||
200 | unsigned long flags; | ||
201 | |||
202 | WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT); | ||
203 | /* The smp_load_acquire() pairs with the smp_store_release(). */ | ||
204 | if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ | ||
205 | return; /* Already initialized. */ | ||
206 | spin_lock_irqsave(&sp->gp_lock, flags); | ||
207 | if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { | ||
208 | spin_unlock_irqrestore(&sp->gp_lock, flags); | ||
209 | return; | ||
210 | } | ||
211 | init_srcu_struct_fields(sp, true); | ||
212 | spin_unlock_irqrestore(&sp->gp_lock, flags); | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * Returns approximate total of the readers' ->srcu_lock_count[] values | ||
217 | * for the rank of per-CPU counters specified by idx. | ||
218 | */ | ||
219 | static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) | ||
220 | { | ||
221 | int cpu; | ||
222 | unsigned long sum = 0; | ||
223 | |||
224 | for_each_possible_cpu(cpu) { | ||
225 | struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); | ||
226 | |||
227 | sum += READ_ONCE(cpuc->srcu_lock_count[idx]); | ||
228 | } | ||
229 | return sum; | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * Returns approximate total of the readers' ->srcu_unlock_count[] values | ||
234 | * for the rank of per-CPU counters specified by idx. | ||
235 | */ | ||
236 | static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) | ||
237 | { | ||
238 | int cpu; | ||
239 | unsigned long sum = 0; | ||
240 | |||
241 | for_each_possible_cpu(cpu) { | ||
242 | struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); | ||
243 | |||
244 | sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); | ||
245 | } | ||
246 | return sum; | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * Return true if the number of pre-existing readers is determined to | ||
251 | * be zero. | ||
252 | */ | ||
253 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | ||
254 | { | ||
255 | unsigned long unlocks; | ||
256 | |||
257 | unlocks = srcu_readers_unlock_idx(sp, idx); | ||
258 | |||
259 | /* | ||
260 | * Make sure that a lock is always counted if the corresponding | ||
261 | * unlock is counted. Needs to be a smp_mb() as the read side may | ||
262 | * contain a read from a variable that is written to before the | ||
263 | * synchronize_srcu() in the write side. In this case smp_mb()s | ||
264 | * A and B act like the store buffering pattern. | ||
265 | * | ||
266 | * This smp_mb() also pairs with smp_mb() C to prevent accesses | ||
267 | * after the synchronize_srcu() from being executed before the | ||
268 | * grace period ends. | ||
269 | */ | ||
270 | smp_mb(); /* A */ | ||
271 | |||
272 | /* | ||
273 | * If the locks are the same as the unlocks, then there must have | ||
274 | * been no readers on this index at some time in between. This does | ||
275 | * not mean that there are no more readers, as one could have read | ||
276 | * the current index but not have incremented the lock counter yet. | ||
277 | * | ||
278 | * Possible bug: There is no guarantee that there haven't been | ||
279 | * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were | ||
280 | * counted, meaning that this could return true even if there are | ||
281 | * still active readers. Since there are no memory barriers around | ||
282 | * srcu_flip(), the CPU is not required to increment ->srcu_idx | ||
283 | * before running srcu_readers_unlock_idx(), which means that there | ||
284 | * could be an arbitrarily large number of critical sections that | ||
285 | * execute after srcu_readers_unlock_idx() but use the old value | ||
286 | * of ->srcu_idx. | ||
287 | */ | ||
288 | return srcu_readers_lock_idx(sp, idx) == unlocks; | ||
289 | } | ||
290 | |||
291 | /** | ||
292 | * srcu_readers_active - returns true if there are readers, and false | ||
293 | * otherwise | ||
294 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | ||
295 | * | ||
296 | * Note that this is not an atomic primitive, and can therefore suffer | ||
297 | * severe errors when invoked on an active srcu_struct. That said, it | ||
298 | * can be useful as an error check at cleanup time. | ||
299 | */ | ||
300 | static bool srcu_readers_active(struct srcu_struct *sp) | ||
301 | { | ||
302 | int cpu; | ||
303 | unsigned long sum = 0; | ||
304 | |||
305 | for_each_possible_cpu(cpu) { | ||
306 | struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); | ||
307 | |||
308 | sum += READ_ONCE(cpuc->srcu_lock_count[0]); | ||
309 | sum += READ_ONCE(cpuc->srcu_lock_count[1]); | ||
310 | sum -= READ_ONCE(cpuc->srcu_unlock_count[0]); | ||
311 | sum -= READ_ONCE(cpuc->srcu_unlock_count[1]); | ||
312 | } | ||
313 | return sum; | ||
314 | } | ||
315 | |||
316 | #define SRCU_INTERVAL 1 | ||
317 | |||
318 | /* | ||
319 | * Return grace-period delay, zero if there are expedited grace | ||
320 | * periods pending, SRCU_INTERVAL otherwise. | ||
321 | */ | ||
322 | static unsigned long srcu_get_delay(struct srcu_struct *sp) | ||
323 | { | ||
324 | if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq), | ||
325 | READ_ONCE(sp->srcu_gp_seq_needed_exp))) | ||
326 | return 0; | ||
327 | return SRCU_INTERVAL; | ||
328 | } | ||
329 | |||
330 | /** | ||
331 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure | ||
332 | * @sp: structure to clean up. | ||
333 | * | ||
334 | * Must invoke this after you are finished using a given srcu_struct that | ||
335 | * was initialized via init_srcu_struct(), else you leak memory. | ||
336 | */ | ||
337 | void cleanup_srcu_struct(struct srcu_struct *sp) | ||
338 | { | ||
339 | int cpu; | ||
340 | |||
341 | if (WARN_ON(!srcu_get_delay(sp))) | ||
342 | return; /* Leakage unless caller handles error. */ | ||
343 | if (WARN_ON(srcu_readers_active(sp))) | ||
344 | return; /* Leakage unless caller handles error. */ | ||
345 | flush_delayed_work(&sp->work); | ||
346 | for_each_possible_cpu(cpu) | ||
347 | flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); | ||
348 | if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || | ||
349 | WARN_ON(srcu_readers_active(sp))) { | ||
350 | pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); | ||
351 | return; /* Caller forgot to stop doing call_srcu()? */ | ||
352 | } | ||
353 | free_percpu(sp->sda); | ||
354 | sp->sda = NULL; | ||
355 | } | ||
356 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | ||
357 | |||
358 | /* | ||
359 | * Counts the new reader in the appropriate per-CPU element of the | ||
360 | * srcu_struct. Must be called from process context. | ||
361 | * Returns an index that must be passed to the matching srcu_read_unlock(). | ||
362 | */ | ||
363 | int __srcu_read_lock(struct srcu_struct *sp) | ||
364 | { | ||
365 | int idx; | ||
366 | |||
367 | idx = READ_ONCE(sp->srcu_idx) & 0x1; | ||
368 | __this_cpu_inc(sp->sda->srcu_lock_count[idx]); | ||
369 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ | ||
370 | return idx; | ||
371 | } | ||
372 | EXPORT_SYMBOL_GPL(__srcu_read_lock); | ||
373 | |||
374 | /* | ||
375 | * Removes the count for the old reader from the appropriate per-CPU | ||
376 | * element of the srcu_struct. Note that this may well be a different | ||
377 | * CPU than that which was incremented by the corresponding srcu_read_lock(). | ||
378 | * Must be called from process context. | ||
379 | */ | ||
380 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | ||
381 | { | ||
382 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ | ||
383 | this_cpu_inc(sp->sda->srcu_unlock_count[idx]); | ||
384 | } | ||
385 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | ||
386 | |||
387 | /* | ||
388 | * We use an adaptive strategy for synchronize_srcu() and especially for | ||
389 | * synchronize_srcu_expedited(). We spin for a fixed time period | ||
390 | * (defined below) to allow SRCU readers to exit their read-side critical | ||
391 | * sections. If there are still some readers after a few microseconds, | ||
392 | * we repeatedly block for 1-millisecond time periods. | ||
393 | */ | ||
394 | #define SRCU_RETRY_CHECK_DELAY 5 | ||
395 | |||
396 | /* | ||
397 | * Start an SRCU grace period. | ||
398 | */ | ||
399 | static void srcu_gp_start(struct srcu_struct *sp) | ||
400 | { | ||
401 | struct srcu_data *sdp = this_cpu_ptr(sp->sda); | ||
402 | int state; | ||
403 | |||
404 | RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), | ||
405 | "Invoked srcu_gp_start() without ->gp_lock!"); | ||
406 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); | ||
407 | rcu_segcblist_advance(&sdp->srcu_cblist, | ||
408 | rcu_seq_current(&sp->srcu_gp_seq)); | ||
409 | (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, | ||
410 | rcu_seq_snap(&sp->srcu_gp_seq)); | ||
411 | smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ | ||
412 | rcu_seq_start(&sp->srcu_gp_seq); | ||
413 | state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); | ||
414 | WARN_ON_ONCE(state != SRCU_STATE_SCAN1); | ||
415 | } | ||
416 | |||
417 | /* | ||
418 | * Track online CPUs to guide callback workqueue placement. | ||
419 | */ | ||
420 | DEFINE_PER_CPU(bool, srcu_online); | ||
421 | |||
422 | void srcu_online_cpu(unsigned int cpu) | ||
423 | { | ||
424 | WRITE_ONCE(per_cpu(srcu_online, cpu), true); | ||
425 | } | ||
426 | |||
427 | void srcu_offline_cpu(unsigned int cpu) | ||
428 | { | ||
429 | WRITE_ONCE(per_cpu(srcu_online, cpu), false); | ||
430 | } | ||
431 | |||
432 | /* | ||
433 | * Place the workqueue handler on the specified CPU if online, otherwise | ||
434 | * just run it wherever. This is useful for placing workqueue handlers | ||
435 | * that are to invoke the specified CPU's callbacks. | ||
436 | */ | ||
437 | static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | ||
438 | struct delayed_work *dwork, | ||
439 | unsigned long delay) | ||
440 | { | ||
441 | bool ret; | ||
442 | |||
443 | preempt_disable(); | ||
444 | if (READ_ONCE(per_cpu(srcu_online, cpu))) | ||
445 | ret = queue_delayed_work_on(cpu, wq, dwork, delay); | ||
446 | else | ||
447 | ret = queue_delayed_work(wq, dwork, delay); | ||
448 | preempt_enable(); | ||
449 | return ret; | ||
450 | } | ||
451 | |||
452 | /* | ||
453 | * Schedule callback invocation for the specified srcu_data structure, | ||
454 | * if possible, on the corresponding CPU. | ||
455 | */ | ||
456 | static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) | ||
457 | { | ||
458 | srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, | ||
459 | &sdp->work, delay); | ||
460 | } | ||
461 | |||
462 | /* | ||
463 | * Schedule callback invocation for all srcu_data structures associated | ||
464 | * with the specified srcu_node structure that have callbacks for the | ||
465 | * just-completed grace period, the one corresponding to idx. If possible, | ||
466 | * schedule this invocation on the corresponding CPUs. | ||
467 | */ | ||
468 | static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, | ||
469 | unsigned long mask, unsigned long delay) | ||
470 | { | ||
471 | int cpu; | ||
472 | |||
473 | for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { | ||
474 | if (!(mask & (1 << (cpu - snp->grplo)))) | ||
475 | continue; | ||
476 | srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay); | ||
477 | } | ||
478 | } | ||
479 | |||
480 | /* | ||
481 | * Note the end of an SRCU grace period. Initiates callback invocation | ||
482 | * and starts a new grace period if needed. | ||
483 | * | ||
484 | * The ->srcu_cb_mutex acquisition does not protect any data, but | ||
485 | * instead prevents more than one grace period from starting while we | ||
486 | * are initiating callback invocation. This allows the ->srcu_have_cbs[] | ||
487 | * array to have a finite number of elements. | ||
488 | */ | ||
489 | static void srcu_gp_end(struct srcu_struct *sp) | ||
490 | { | ||
491 | unsigned long cbdelay; | ||
492 | bool cbs; | ||
493 | unsigned long gpseq; | ||
494 | int idx; | ||
495 | int idxnext; | ||
496 | unsigned long mask; | ||
497 | struct srcu_node *snp; | ||
498 | |||
499 | /* Prevent more than one additional grace period. */ | ||
500 | mutex_lock(&sp->srcu_cb_mutex); | ||
501 | |||
502 | /* End the current grace period. */ | ||
503 | spin_lock_irq(&sp->gp_lock); | ||
504 | idx = rcu_seq_state(sp->srcu_gp_seq); | ||
505 | WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); | ||
506 | cbdelay = srcu_get_delay(sp); | ||
507 | sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); | ||
508 | rcu_seq_end(&sp->srcu_gp_seq); | ||
509 | gpseq = rcu_seq_current(&sp->srcu_gp_seq); | ||
510 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) | ||
511 | sp->srcu_gp_seq_needed_exp = gpseq; | ||
512 | spin_unlock_irq(&sp->gp_lock); | ||
513 | mutex_unlock(&sp->srcu_gp_mutex); | ||
514 | /* A new grace period can start at this point. But only one. */ | ||
515 | |||
516 | /* Initiate callback invocation as needed. */ | ||
517 | idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); | ||
518 | idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); | ||
519 | rcu_for_each_node_breadth_first(sp, snp) { | ||
520 | spin_lock_irq(&snp->lock); | ||
521 | cbs = false; | ||
522 | if (snp >= sp->level[rcu_num_lvls - 1]) | ||
523 | cbs = snp->srcu_have_cbs[idx] == gpseq; | ||
524 | snp->srcu_have_cbs[idx] = gpseq; | ||
525 | rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); | ||
526 | if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) | ||
527 | snp->srcu_gp_seq_needed_exp = gpseq; | ||
528 | mask = snp->srcu_data_have_cbs[idx]; | ||
529 | snp->srcu_data_have_cbs[idx] = 0; | ||
530 | spin_unlock_irq(&snp->lock); | ||
531 | if (cbs) { | ||
532 | smp_mb(); /* GP end before CB invocation. */ | ||
533 | srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); | ||
534 | } | ||
535 | } | ||
536 | |||
537 | /* Callback initiation done, allow grace periods after next. */ | ||
538 | mutex_unlock(&sp->srcu_cb_mutex); | ||
539 | |||
540 | /* Start a new grace period if needed. */ | ||
541 | spin_lock_irq(&sp->gp_lock); | ||
542 | gpseq = rcu_seq_current(&sp->srcu_gp_seq); | ||
543 | if (!rcu_seq_state(gpseq) && | ||
544 | ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { | ||
545 | srcu_gp_start(sp); | ||
546 | spin_unlock_irq(&sp->gp_lock); | ||
547 | /* Throttle expedited grace periods: Should be rare! */ | ||
548 | srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff | ||
549 | ? 0 : SRCU_INTERVAL); | ||
550 | } else { | ||
551 | spin_unlock_irq(&sp->gp_lock); | ||
552 | } | ||
553 | } | ||
554 | |||
555 | /* | ||
556 | * Funnel-locking scheme to scalably mediate many concurrent expedited | ||
557 | * grace-period requests. This function is invoked for the first known | ||
558 | * expedited request for a grace period that has already been requested, | ||
559 | * but without expediting. To start a completely new grace period, | ||
560 | * whether expedited or not, use srcu_funnel_gp_start() instead. | ||
561 | */ | ||
562 | static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, | ||
563 | unsigned long s) | ||
564 | { | ||
565 | unsigned long flags; | ||
566 | |||
567 | for (; snp != NULL; snp = snp->srcu_parent) { | ||
568 | if (rcu_seq_done(&sp->srcu_gp_seq, s) || | ||
569 | ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) | ||
570 | return; | ||
571 | spin_lock_irqsave(&snp->lock, flags); | ||
572 | if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { | ||
573 | spin_unlock_irqrestore(&snp->lock, flags); | ||
574 | return; | ||
575 | } | ||
576 | WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); | ||
577 | spin_unlock_irqrestore(&snp->lock, flags); | ||
578 | } | ||
579 | spin_lock_irqsave(&sp->gp_lock, flags); | ||
580 | if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) | ||
581 | sp->srcu_gp_seq_needed_exp = s; | ||
582 | spin_unlock_irqrestore(&sp->gp_lock, flags); | ||
583 | } | ||
584 | |||
585 | /* | ||
586 | * Funnel-locking scheme to scalably mediate many concurrent grace-period | ||
587 | * requests. The winner has to do the work of actually starting grace | ||
588 | * period s. Losers must either ensure that their desired grace-period | ||
589 | * number is recorded on at least their leaf srcu_node structure, or they | ||
590 | * must take steps to invoke their own callbacks. | ||
591 | */ | ||
592 | static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, | ||
593 | unsigned long s, bool do_norm) | ||
594 | { | ||
595 | unsigned long flags; | ||
596 | int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); | ||
597 | struct srcu_node *snp = sdp->mynode; | ||
598 | unsigned long snp_seq; | ||
599 | |||
600 | /* Each pass through the loop does one level of the srcu_node tree. */ | ||
601 | for (; snp != NULL; snp = snp->srcu_parent) { | ||
602 | if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) | ||
603 | return; /* GP already done and CBs recorded. */ | ||
604 | spin_lock_irqsave(&snp->lock, flags); | ||
605 | if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { | ||
606 | snp_seq = snp->srcu_have_cbs[idx]; | ||
607 | if (snp == sdp->mynode && snp_seq == s) | ||
608 | snp->srcu_data_have_cbs[idx] |= sdp->grpmask; | ||
609 | spin_unlock_irqrestore(&snp->lock, flags); | ||
610 | if (snp == sdp->mynode && snp_seq != s) { | ||
611 | smp_mb(); /* CBs after GP! */ | ||
612 | srcu_schedule_cbs_sdp(sdp, do_norm | ||
613 | ? SRCU_INTERVAL | ||
614 | : 0); | ||
615 | return; | ||
616 | } | ||
617 | if (!do_norm) | ||
618 | srcu_funnel_exp_start(sp, snp, s); | ||
619 | return; | ||
620 | } | ||
621 | snp->srcu_have_cbs[idx] = s; | ||
622 | if (snp == sdp->mynode) | ||
623 | snp->srcu_data_have_cbs[idx] |= sdp->grpmask; | ||
624 | if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) | ||
625 | snp->srcu_gp_seq_needed_exp = s; | ||
626 | spin_unlock_irqrestore(&snp->lock, flags); | ||
627 | } | ||
628 | |||
629 | /* Top of tree, must ensure the grace period will be started. */ | ||
630 | spin_lock_irqsave(&sp->gp_lock, flags); | ||
631 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { | ||
632 | /* | ||
633 | * Record need for grace period s. Pair with load | ||
634 | * acquire setting up for initialization. | ||
635 | */ | ||
636 | smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ | ||
637 | } | ||
638 | if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) | ||
639 | sp->srcu_gp_seq_needed_exp = s; | ||
640 | |||
641 | /* If grace period not already done and none in progress, start it. */ | ||
642 | if (!rcu_seq_done(&sp->srcu_gp_seq, s) && | ||
643 | rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { | ||
644 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); | ||
645 | srcu_gp_start(sp); | ||
646 | queue_delayed_work(system_power_efficient_wq, &sp->work, | ||
647 | srcu_get_delay(sp)); | ||
648 | } | ||
649 | spin_unlock_irqrestore(&sp->gp_lock, flags); | ||
650 | } | ||
651 | |||
652 | /* | ||
653 | * Wait until all readers counted by array index idx complete, but | ||
654 | * loop an additional time if there is an expedited grace period pending. | ||
655 | * The caller must ensure that ->srcu_idx is not changed while checking. | ||
656 | */ | ||
657 | static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) | ||
658 | { | ||
659 | for (;;) { | ||
660 | if (srcu_readers_active_idx_check(sp, idx)) | ||
661 | return true; | ||
662 | if (--trycount + !srcu_get_delay(sp) <= 0) | ||
663 | return false; | ||
664 | udelay(SRCU_RETRY_CHECK_DELAY); | ||
665 | } | ||
666 | } | ||
667 | |||
668 | /* | ||
669 | * Increment the ->srcu_idx counter so that future SRCU readers will | ||
670 | * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows | ||
671 | * us to wait for pre-existing readers in a starvation-free manner. | ||
672 | */ | ||
673 | static void srcu_flip(struct srcu_struct *sp) | ||
674 | { | ||
675 | WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); | ||
676 | |||
677 | /* | ||
678 | * Ensure that if the updater misses an __srcu_read_unlock() | ||
679 | * increment, that task's next __srcu_read_lock() will see the | ||
680 | * above counter update. Note that both this memory barrier | ||
681 | * and the one in srcu_readers_active_idx_check() provide the | ||
682 | * guarantee for __srcu_read_lock(). | ||
683 | */ | ||
684 | smp_mb(); /* D */ /* Pairs with C. */ | ||
685 | } | ||
686 | |||
687 | /* | ||
688 | * If SRCU is likely idle, return true, otherwise return false. | ||
689 | * | ||
690 | * Note that it is OK for several concurrent from-idle requests for a new | ||
691 | * grace period to specify expediting, because they will all end | ||
692 | * up requesting the same grace period anyhow. So no loss. | ||
693 | * | ||
694 | * Note also that if any CPU (including the current one) is still invoking | ||
695 | * callbacks, this function will nevertheless say "idle". This is not | ||
696 | * ideal, but the overhead of checking all CPUs' callback lists is even | ||
697 | * less ideal, especially on large systems. Furthermore, the wakeup | ||
698 | * can happen before the callback is fully removed, so we have no choice | ||
699 | * but to accept this type of error. | ||
700 | * | ||
701 | * This function is also subject to counter-wrap errors, but let's face | ||
702 | * it, if this function was preempted for enough time for the counters | ||
703 | * to wrap, it really doesn't matter whether or not we expedite the grace | ||
704 | * period. The extra overhead of a needlessly expedited grace period is | ||
705 | * negligible when amortized over that time period, and the extra latency | ||
706 | * of a needlessly non-expedited grace period is similarly negligible. | ||
707 | */ | ||
708 | static bool srcu_might_be_idle(struct srcu_struct *sp) | ||
709 | { | ||
710 | unsigned long curseq; | ||
711 | unsigned long flags; | ||
712 | struct srcu_data *sdp; | ||
713 | unsigned long t; | ||
714 | |||
715 | /* If the local srcu_data structure has callbacks, not idle. */ | ||
716 | local_irq_save(flags); | ||
717 | sdp = this_cpu_ptr(sp->sda); | ||
718 | if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { | ||
719 | local_irq_restore(flags); | ||
720 | return false; /* Callbacks already present, so not idle. */ | ||
721 | } | ||
722 | local_irq_restore(flags); | ||
723 | |||
724 | /* | ||
725 | * No local callbacks, so probabilistically probe global state. | ||
726 | * Exact information would require acquiring locks, which would | ||
727 | * kill scalability, hence the probabilistic nature of the probe. | ||
728 | */ | ||
729 | |||
730 | /* First, see if enough time has passed since the last GP. */ | ||
731 | t = ktime_get_mono_fast_ns(); | ||
732 | if (exp_holdoff == 0 || | ||
733 | time_in_range_open(t, sp->srcu_last_gp_end, | ||
734 | sp->srcu_last_gp_end + exp_holdoff)) | ||
735 | return false; /* Too soon after last GP. */ | ||
736 | |||
737 | /* Next, check for probable idleness. */ | ||
738 | curseq = rcu_seq_current(&sp->srcu_gp_seq); | ||
739 | smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ | ||
740 | if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed))) | ||
741 | return false; /* Grace period in progress, so not idle. */ | ||
742 | smp_mb(); /* Order ->srcu_gp_seq with prior access. */ | ||
743 | if (curseq != rcu_seq_current(&sp->srcu_gp_seq)) | ||
744 | return false; /* GP # changed, so not idle. */ | ||
745 | return true; /* With reasonable probability, idle! */ | ||
746 | } | ||
747 | |||
748 | /* | ||
749 | * Enqueue an SRCU callback on the srcu_data structure associated with | ||
750 | * the current CPU and the specified srcu_struct structure, initiating | ||
751 | * grace-period processing if it is not already running. | ||
752 | * | ||
753 | * Note that all CPUs must agree that the grace period extended beyond | ||
754 | * all pre-existing SRCU read-side critical sections. On systems with | ||
755 | * more than one CPU, this means that when "func()" is invoked, each CPU | ||
756 | * is guaranteed to have executed a full memory barrier since the end of | ||
757 | * its last corresponding SRCU read-side critical section whose beginning | ||
758 | * preceded the call to call_srcu(). It also means that each CPU executing | ||
759 | * an SRCU read-side critical section that continues beyond the start of | ||
760 | * "func()" must have executed a memory barrier after the call_rcu() | ||
761 | * but before the beginning of that SRCU read-side critical section. | ||
762 | * Note that these guarantees include CPUs that are offline, idle, or | ||
763 | * executing in user mode, as well as CPUs that are executing in the kernel. | ||
764 | * | ||
765 | * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the | ||
766 | * resulting SRCU callback function "func()", then both CPU A and CPU | ||
767 | * B are guaranteed to execute a full memory barrier during the time | ||
768 | * interval between the call to call_srcu() and the invocation of "func()". | ||
769 | * This guarantee applies even if CPU A and CPU B are the same CPU (but | ||
770 | * again only if the system has more than one CPU). | ||
771 | * | ||
772 | * Of course, these guarantees apply only for invocations of call_srcu(), | ||
773 | * srcu_read_lock(), and srcu_read_unlock() that are all passed the same | ||
774 | * srcu_struct structure. | ||
775 | */ | ||
776 | void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, | ||
777 | rcu_callback_t func, bool do_norm) | ||
778 | { | ||
779 | unsigned long flags; | ||
780 | bool needexp = false; | ||
781 | bool needgp = false; | ||
782 | unsigned long s; | ||
783 | struct srcu_data *sdp; | ||
784 | |||
785 | check_init_srcu_struct(sp); | ||
786 | rhp->func = func; | ||
787 | local_irq_save(flags); | ||
788 | sdp = this_cpu_ptr(sp->sda); | ||
789 | spin_lock(&sdp->lock); | ||
790 | rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); | ||
791 | rcu_segcblist_advance(&sdp->srcu_cblist, | ||
792 | rcu_seq_current(&sp->srcu_gp_seq)); | ||
793 | s = rcu_seq_snap(&sp->srcu_gp_seq); | ||
794 | (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); | ||
795 | if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { | ||
796 | sdp->srcu_gp_seq_needed = s; | ||
797 | needgp = true; | ||
798 | } | ||
799 | if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { | ||
800 | sdp->srcu_gp_seq_needed_exp = s; | ||
801 | needexp = true; | ||
802 | } | ||
803 | spin_unlock_irqrestore(&sdp->lock, flags); | ||
804 | if (needgp) | ||
805 | srcu_funnel_gp_start(sp, sdp, s, do_norm); | ||
806 | else if (needexp) | ||
807 | srcu_funnel_exp_start(sp, sdp->mynode, s); | ||
808 | } | ||
809 | |||
810 | void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, | ||
811 | rcu_callback_t func) | ||
812 | { | ||
813 | __call_srcu(sp, rhp, func, true); | ||
814 | } | ||
815 | EXPORT_SYMBOL_GPL(call_srcu); | ||
816 | |||
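A minimal usage sketch of the call_srcu() path documented above (the struct, srcu domain, and callback names are illustrative, not part of this patch):

    #include <linux/srcu.h>
    #include <linux/slab.h>

    struct foo {
            struct rcu_head rh;
            int data;
    };

    /* Assumed to have been set up elsewhere with init_srcu_struct(). */
    static struct srcu_struct foo_srcu;

    static void foo_reclaim(struct rcu_head *rhp)
    {
            kfree(container_of(rhp, struct foo, rh));
    }

    static void foo_retire(struct foo *p)
    {
            /*
             * Pre-existing srcu_read_lock(&foo_srcu) readers may still
             * reference p; free it only after they have all finished.
             */
            call_srcu(&foo_srcu, &p->rh, foo_reclaim);
    }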
817 | /* | ||
818 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | ||
819 | */ | ||
820 | static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) | ||
821 | { | ||
822 | struct rcu_synchronize rcu; | ||
823 | |||
824 | RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || | ||
825 | lock_is_held(&rcu_bh_lock_map) || | ||
826 | lock_is_held(&rcu_lock_map) || | ||
827 | lock_is_held(&rcu_sched_lock_map), | ||
828 | "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); | ||
829 | |||
830 | if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) | ||
831 | return; | ||
832 | might_sleep(); | ||
833 | check_init_srcu_struct(sp); | ||
834 | init_completion(&rcu.completion); | ||
835 | init_rcu_head_on_stack(&rcu.head); | ||
836 | __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); | ||
837 | wait_for_completion(&rcu.completion); | ||
838 | destroy_rcu_head_on_stack(&rcu.head); | ||
839 | } | ||
840 | |||
841 | /** | ||
842 | * synchronize_srcu_expedited - Brute-force SRCU grace period | ||
843 | * @sp: srcu_struct with which to synchronize. | ||
844 | * | ||
845 | * Wait for an SRCU grace period to elapse, but be more aggressive about | ||
846 | * spinning rather than blocking when waiting. | ||
847 | * | ||
848 | * Note that synchronize_srcu_expedited() has the same deadlock and | ||
849 | * memory-ordering properties as does synchronize_srcu(). | ||
850 | */ | ||
851 | void synchronize_srcu_expedited(struct srcu_struct *sp) | ||
852 | { | ||
853 | __synchronize_srcu(sp, rcu_gp_is_normal()); | ||
854 | } | ||
855 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | ||
856 | |||
857 | /** | ||
858 | * synchronize_srcu - wait for prior SRCU read-side critical-section completion | ||
859 | * @sp: srcu_struct with which to synchronize. | ||
860 | * | ||
861 | * Wait for the counts of both index values to drain to zero. To avoid | ||
862 | * possible starvation of synchronize_srcu(), it first waits for the count | ||
863 | * of index ((->srcu_idx & 1) ^ 1) to drain to zero, and only then | ||
864 | * flips ->srcu_idx and waits for the count of the other index. | ||
865 | * | ||
866 | * Can block; must be called from process context. | ||
867 | * | ||
868 | * Note that it is illegal to call synchronize_srcu() from the corresponding | ||
869 | * SRCU read-side critical section; doing so will result in deadlock. | ||
870 | * However, it is perfectly legal to call synchronize_srcu() on one | ||
871 | * srcu_struct from some other srcu_struct's read-side critical section, | ||
872 | * as long as the resulting graph of srcu_structs is acyclic. | ||
873 | * | ||
874 | * There are memory-ordering constraints implied by synchronize_srcu(). | ||
875 | * On systems with more than one CPU, when synchronize_srcu() returns, | ||
876 | * each CPU is guaranteed to have executed a full memory barrier since | ||
877 | * the end of its last corresponding SRCU read-side critical section | ||
878 | * whose beginning preceded the call to synchronize_srcu(). In addition, | ||
879 | * each CPU having an SRCU read-side critical section that extends beyond | ||
880 | * the return from synchronize_srcu() is guaranteed to have executed a | ||
881 | * full memory barrier after the beginning of synchronize_srcu() and before | ||
882 | * the beginning of that SRCU read-side critical section. Note that these | ||
883 | * guarantees include CPUs that are offline, idle, or executing in user mode, | ||
884 | * as well as CPUs that are executing in the kernel. | ||
885 | * | ||
886 | * Furthermore, if CPU A invoked synchronize_srcu(), which returned | ||
887 | * to its caller on CPU B, then both CPU A and CPU B are guaranteed | ||
888 | * to have executed a full memory barrier during the execution of | ||
889 | * synchronize_srcu(). This guarantee applies even if CPU A and CPU B | ||
890 | * are the same CPU, but again only if the system has more than one CPU. | ||
891 | * | ||
892 | * Of course, these memory-ordering guarantees apply only when | ||
893 | * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are | ||
894 | * passed the same srcu_struct structure. | ||
895 | * | ||
896 | * If SRCU is likely idle, expedite the first request. This semantic | ||
897 | * was provided by Classic SRCU, and is relied upon by its users, so TREE | ||
898 | * SRCU must also provide it. Note that detecting idleness is heuristic | ||
899 | * and subject to both false positives and negatives. | ||
900 | */ | ||
901 | void synchronize_srcu(struct srcu_struct *sp) | ||
902 | { | ||
903 | if (srcu_might_be_idle(sp) || rcu_gp_is_expedited()) | ||
904 | synchronize_srcu_expedited(sp); | ||
905 | else | ||
906 | __synchronize_srcu(sp, true); | ||
907 | } | ||
908 | EXPORT_SYMBOL_GPL(synchronize_srcu); | ||
909 | |||
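The read-side/update-side pairing that these ordering guarantees describe looks roughly like the following sketch (my_srcu, cfg_mutex, cur_cfg, and struct config are illustrative names, not from this patch):

    #include <linux/srcu.h>
    #include <linux/slab.h>
    #include <linux/mutex.h>

    struct config { int threshold; };

    DEFINE_STATIC_SRCU(my_srcu);
    static DEFINE_MUTEX(cfg_mutex);
    static struct config __rcu *cur_cfg;

    /* Reader: may block inside the SRCU read-side critical section. */
    static int read_threshold(void)
    {
            struct config *c;
            int idx, val = 0;

            idx = srcu_read_lock(&my_srcu);
            c = srcu_dereference(cur_cfg, &my_srcu);
            if (c)
                    val = c->threshold;
            srcu_read_unlock(&my_srcu, idx);
            return val;
    }

    /* Updater: publish the new version, wait out old readers, then free. */
    static void update_config(struct config *newc)
    {
            struct config *old;

            mutex_lock(&cfg_mutex);
            old = rcu_dereference_protected(cur_cfg, lockdep_is_held(&cfg_mutex));
            rcu_assign_pointer(cur_cfg, newc);
            mutex_unlock(&cfg_mutex);
            synchronize_srcu(&my_srcu);     /* waits only for pre-existing readers */
            kfree(old);
    }

Here synchronize_srcu() waits only for readers of my_srcu that began before the pointer swap, which is exactly the guarantee spelled out in the comment above.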
910 | /* | ||
911 | * Callback function for srcu_barrier() use. | ||
912 | */ | ||
913 | static void srcu_barrier_cb(struct rcu_head *rhp) | ||
914 | { | ||
915 | struct srcu_data *sdp; | ||
916 | struct srcu_struct *sp; | ||
917 | |||
918 | sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); | ||
919 | sp = sdp->sp; | ||
920 | if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) | ||
921 | complete(&sp->srcu_barrier_completion); | ||
922 | } | ||
923 | |||
924 | /** | ||
925 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | ||
926 | * @sp: srcu_struct on which to wait for in-flight callbacks. | ||
927 | */ | ||
928 | void srcu_barrier(struct srcu_struct *sp) | ||
929 | { | ||
930 | int cpu; | ||
931 | struct srcu_data *sdp; | ||
932 | unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq); | ||
933 | |||
934 | check_init_srcu_struct(sp); | ||
935 | mutex_lock(&sp->srcu_barrier_mutex); | ||
936 | if (rcu_seq_done(&sp->srcu_barrier_seq, s)) { | ||
937 | smp_mb(); /* Force ordering following return. */ | ||
938 | mutex_unlock(&sp->srcu_barrier_mutex); | ||
939 | return; /* Someone else did our work for us. */ | ||
940 | } | ||
941 | rcu_seq_start(&sp->srcu_barrier_seq); | ||
942 | init_completion(&sp->srcu_barrier_completion); | ||
943 | |||
944 | /* Initial count prevents reaching zero until all CBs are posted. */ | ||
945 | atomic_set(&sp->srcu_barrier_cpu_cnt, 1); | ||
946 | |||
947 | /* | ||
948 | * Each pass through this loop enqueues a callback, but only | ||
949 | * on CPUs already having callbacks enqueued. Note that if | ||
950 | * a CPU already has callbacks enqueued, it must have already | ||
951 | * registered the need for a future grace period, so all we | ||
952 | * need do is enqueue a callback that will use the same | ||
953 | * grace period as the last callback already in the queue. | ||
954 | */ | ||
955 | for_each_possible_cpu(cpu) { | ||
956 | sdp = per_cpu_ptr(sp->sda, cpu); | ||
957 | spin_lock_irq(&sdp->lock); | ||
958 | atomic_inc(&sp->srcu_barrier_cpu_cnt); | ||
959 | sdp->srcu_barrier_head.func = srcu_barrier_cb; | ||
960 | if (!rcu_segcblist_entrain(&sdp->srcu_cblist, | ||
961 | &sdp->srcu_barrier_head, 0)) | ||
962 | atomic_dec(&sp->srcu_barrier_cpu_cnt); | ||
963 | spin_unlock_irq(&sdp->lock); | ||
964 | } | ||
965 | |||
966 | /* Remove the initial count, at which point reaching zero can happen. */ | ||
967 | if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) | ||
968 | complete(&sp->srcu_barrier_completion); | ||
969 | wait_for_completion(&sp->srcu_barrier_completion); | ||
970 | |||
971 | rcu_seq_end(&sp->srcu_barrier_seq); | ||
972 | mutex_unlock(&sp->srcu_barrier_mutex); | ||
973 | } | ||
974 | EXPORT_SYMBOL_GPL(srcu_barrier); | ||
975 | |||
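On the teardown side, the usual pattern (a sketch with an illustrative foo_srcu domain, not code from this patch) is to stop posting callbacks and then use srcu_barrier() before destroying the domain:

    #include <linux/srcu.h>

    static struct srcu_struct foo_srcu;     /* assumed initialized and in use elsewhere */

    static void foo_shutdown(void)
    {
            /* No new call_srcu(&foo_srcu, ...) may be posted past this point. */
            srcu_barrier(&foo_srcu);        /* wait for already-queued callbacks to run */
            cleanup_srcu_struct(&foo_srcu);
    }

Note that srcu_barrier() only waits for callbacks queued when it is called; it does not block new call_srcu() invocations, which is why posting must stop first.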
976 | /** | ||
977 | * srcu_batches_completed - return batches completed. | ||
978 | * @sp: srcu_struct on which to report batch completion. | ||
979 | * | ||
980 | * Report the number of batches, correlated with, but not necessarily | ||
981 | * precisely the same as, the number of grace periods that have elapsed. | ||
982 | */ | ||
983 | unsigned long srcu_batches_completed(struct srcu_struct *sp) | ||
984 | { | ||
985 | return sp->srcu_idx; | ||
986 | } | ||
987 | EXPORT_SYMBOL_GPL(srcu_batches_completed); | ||
988 | |||
989 | /* | ||
990 | * Core SRCU state machine. Push state bits of ->srcu_gp_seq | ||
991 | * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has | ||
992 | * completed in that state. | ||
993 | */ | ||
994 | static void srcu_advance_state(struct srcu_struct *sp) | ||
995 | { | ||
996 | int idx; | ||
997 | |||
998 | mutex_lock(&sp->srcu_gp_mutex); | ||
999 | |||
1000 | /* | ||
1001 | * Because readers might be delayed for an extended period after | ||
1002 | * fetching ->srcu_idx for their index, at any point in time there | ||
1003 | * might well be readers using both idx=0 and idx=1. We therefore | ||
1004 | * need to wait for readers to clear from both index values before | ||
1005 | * invoking a callback. | ||
1006 | * | ||
1007 | * The load-acquire ensures that we see the accesses performed | ||
1008 | * by the prior grace period. | ||
1009 | */ | ||
1010 | idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ | ||
1011 | if (idx == SRCU_STATE_IDLE) { | ||
1012 | spin_lock_irq(&sp->gp_lock); | ||
1013 | if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { | ||
1014 | WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); | ||
1015 | spin_unlock_irq(&sp->gp_lock); | ||
1016 | mutex_unlock(&sp->srcu_gp_mutex); | ||
1017 | return; | ||
1018 | } | ||
1019 | idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); | ||
1020 | if (idx == SRCU_STATE_IDLE) | ||
1021 | srcu_gp_start(sp); | ||
1022 | spin_unlock_irq(&sp->gp_lock); | ||
1023 | if (idx != SRCU_STATE_IDLE) { | ||
1024 | mutex_unlock(&sp->srcu_gp_mutex); | ||
1025 | return; /* Someone else started the grace period. */ | ||
1026 | } | ||
1027 | } | ||
1028 | |||
1029 | if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { | ||
1030 | idx = 1 ^ (sp->srcu_idx & 1); | ||
1031 | if (!try_check_zero(sp, idx, 1)) { | ||
1032 | mutex_unlock(&sp->srcu_gp_mutex); | ||
1033 | return; /* readers present, retry later. */ | ||
1034 | } | ||
1035 | srcu_flip(sp); | ||
1036 | rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); | ||
1037 | } | ||
1038 | |||
1039 | if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { | ||
1040 | |||
1041 | /* | ||
1042 | * SRCU read-side critical sections are normally short, | ||
1043 | * so check at least twice in quick succession after a flip. | ||
1044 | */ | ||
1045 | idx = 1 ^ (sp->srcu_idx & 1); | ||
1046 | if (!try_check_zero(sp, idx, 2)) { | ||
1047 | mutex_unlock(&sp->srcu_gp_mutex); | ||
1048 | return; /* readers present, retry later. */ | ||
1049 | } | ||
1050 | srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */ | ||
1051 | } | ||
1052 | } | ||
1053 | |||
1054 | /* | ||
1055 | * Invoke a limited number of SRCU callbacks that have passed through | ||
1056 | * their grace period. If there are more to do, SRCU will reschedule | ||
1057 | * the workqueue. Note that needed memory barriers have been executed | ||
1058 | * in this task's context by srcu_readers_active_idx_check(). | ||
1059 | */ | ||
1060 | static void srcu_invoke_callbacks(struct work_struct *work) | ||
1061 | { | ||
1062 | bool more; | ||
1063 | struct rcu_cblist ready_cbs; | ||
1064 | struct rcu_head *rhp; | ||
1065 | struct srcu_data *sdp; | ||
1066 | struct srcu_struct *sp; | ||
1067 | |||
1068 | sdp = container_of(work, struct srcu_data, work.work); | ||
1069 | sp = sdp->sp; | ||
1070 | rcu_cblist_init(&ready_cbs); | ||
1071 | spin_lock_irq(&sdp->lock); | ||
1072 | smp_mb(); /* Old grace periods before callback invocation! */ | ||
1073 | rcu_segcblist_advance(&sdp->srcu_cblist, | ||
1074 | rcu_seq_current(&sp->srcu_gp_seq)); | ||
1075 | if (sdp->srcu_cblist_invoking || | ||
1076 | !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { | ||
1077 | spin_unlock_irq(&sdp->lock); | ||
1078 | return; /* Someone else on the job or nothing to do. */ | ||
1079 | } | ||
1080 | |||
1081 | /* We are on the job! Extract and invoke ready callbacks. */ | ||
1082 | sdp->srcu_cblist_invoking = true; | ||
1083 | rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); | ||
1084 | spin_unlock_irq(&sdp->lock); | ||
1085 | rhp = rcu_cblist_dequeue(&ready_cbs); | ||
1086 | for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { | ||
1087 | local_bh_disable(); | ||
1088 | rhp->func(rhp); | ||
1089 | local_bh_enable(); | ||
1090 | } | ||
1091 | |||
1092 | /* | ||
1093 | * Update counts, accelerate new callbacks, and if needed, | ||
1094 | * schedule another round of callback invocation. | ||
1095 | */ | ||
1096 | spin_lock_irq(&sdp->lock); | ||
1097 | rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); | ||
1098 | (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, | ||
1099 | rcu_seq_snap(&sp->srcu_gp_seq)); | ||
1100 | sdp->srcu_cblist_invoking = false; | ||
1101 | more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); | ||
1102 | spin_unlock_irq(&sdp->lock); | ||
1103 | if (more) | ||
1104 | srcu_schedule_cbs_sdp(sdp, 0); | ||
1105 | } | ||
1106 | |||
1107 | /* | ||
1108 | * Finished one round of SRCU grace period. Start another if there are | ||
1109 | * more SRCU callbacks queued, otherwise put SRCU into not-running state. | ||
1110 | */ | ||
1111 | static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) | ||
1112 | { | ||
1113 | bool pushgp = true; | ||
1114 | |||
1115 | spin_lock_irq(&sp->gp_lock); | ||
1116 | if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { | ||
1117 | if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { | ||
1118 | /* All requests fulfilled, time to go idle. */ | ||
1119 | pushgp = false; | ||
1120 | } | ||
1121 | } else if (!rcu_seq_state(sp->srcu_gp_seq)) { | ||
1122 | /* Outstanding request and no GP. Start one. */ | ||
1123 | srcu_gp_start(sp); | ||
1124 | } | ||
1125 | spin_unlock_irq(&sp->gp_lock); | ||
1126 | |||
1127 | if (pushgp) | ||
1128 | queue_delayed_work(system_power_efficient_wq, &sp->work, delay); | ||
1129 | } | ||
1130 | |||
1131 | /* | ||
1132 | * This is the work-queue function that handles SRCU grace periods. | ||
1133 | */ | ||
1134 | void process_srcu(struct work_struct *work) | ||
1135 | { | ||
1136 | struct srcu_struct *sp; | ||
1137 | |||
1138 | sp = container_of(work, struct srcu_struct, work.work); | ||
1139 | |||
1140 | srcu_advance_state(sp); | ||
1141 | srcu_reschedule(sp, srcu_get_delay(sp)); | ||
1142 | } | ||
1143 | EXPORT_SYMBOL_GPL(process_srcu); | ||
1144 | |||
1145 | void srcutorture_get_gp_data(enum rcutorture_type test_type, | ||
1146 | struct srcu_struct *sp, int *flags, | ||
1147 | unsigned long *gpnum, unsigned long *completed) | ||
1148 | { | ||
1149 | if (test_type != SRCU_FLAVOR) | ||
1150 | return; | ||
1151 | *flags = 0; | ||
1152 | *completed = rcu_seq_ctr(sp->srcu_gp_seq); | ||
1153 | *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); | ||
1154 | } | ||
1155 | EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); | ||
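For srcu_struct instances embedded in a larger object rather than declared with DEFINE_SRCU()/DEFINE_STATIC_SRCU(), the lifecycle is explicit: init_srcu_struct() sets up the per-CPU srcu_data (->sda) used throughout the code above, and cleanup_srcu_struct() tears it down. A hedged sketch (struct my_dev is illustrative):

    #include <linux/srcu.h>

    struct my_dev {
            struct srcu_struct srcu;
            void *state;
    };

    static int my_dev_init(struct my_dev *d)
    {
            return init_srcu_struct(&d->srcu);      /* may fail with -ENOMEM */
    }

    static void my_dev_free(struct my_dev *d)
    {
            srcu_barrier(&d->srcu);                 /* flush any queued callbacks */
            cleanup_srcu_struct(&d->srcu);
    }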
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 6ad330dbbae2..e5385731e391 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(__rcu_is_watching); | |||
79 | */ | 79 | */ |
80 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 80 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) |
81 | { | 81 | { |
82 | RCU_TRACE(reset_cpu_stall_ticks(rcp)); | 82 | RCU_TRACE(reset_cpu_stall_ticks(rcp);) |
83 | if (rcp->donetail != rcp->curtail) { | 83 | if (rcp->donetail != rcp->curtail) { |
84 | rcp->donetail = rcp->curtail; | 84 | rcp->donetail = rcp->curtail; |
85 | return 1; | 85 | return 1; |
@@ -125,7 +125,7 @@ void rcu_bh_qs(void) | |||
125 | */ | 125 | */ |
126 | void rcu_check_callbacks(int user) | 126 | void rcu_check_callbacks(int user) |
127 | { | 127 | { |
128 | RCU_TRACE(check_cpu_stalls()); | 128 | RCU_TRACE(check_cpu_stalls();) |
129 | if (user) | 129 | if (user) |
130 | rcu_sched_qs(); | 130 | rcu_sched_qs(); |
131 | else if (!in_softirq()) | 131 | else if (!in_softirq()) |
@@ -143,7 +143,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
143 | const char *rn = NULL; | 143 | const char *rn = NULL; |
144 | struct rcu_head *next, *list; | 144 | struct rcu_head *next, *list; |
145 | unsigned long flags; | 145 | unsigned long flags; |
146 | RCU_TRACE(int cb_count = 0); | 146 | RCU_TRACE(int cb_count = 0;) |
147 | 147 | ||
148 | /* Move the ready-to-invoke callbacks to a local list. */ | 148 | /* Move the ready-to-invoke callbacks to a local list. */ |
149 | local_irq_save(flags); | 149 | local_irq_save(flags); |
@@ -152,7 +152,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
152 | local_irq_restore(flags); | 152 | local_irq_restore(flags); |
153 | return; | 153 | return; |
154 | } | 154 | } |
155 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); | 155 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);) |
156 | list = rcp->rcucblist; | 156 | list = rcp->rcucblist; |
157 | rcp->rcucblist = *rcp->donetail; | 157 | rcp->rcucblist = *rcp->donetail; |
158 | *rcp->donetail = NULL; | 158 | *rcp->donetail = NULL; |
@@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
162 | local_irq_restore(flags); | 162 | local_irq_restore(flags); |
163 | 163 | ||
164 | /* Invoke the callbacks on the local list. */ | 164 | /* Invoke the callbacks on the local list. */ |
165 | RCU_TRACE(rn = rcp->name); | 165 | RCU_TRACE(rn = rcp->name;) |
166 | while (list) { | 166 | while (list) { |
167 | next = list->next; | 167 | next = list->next; |
168 | prefetch(next); | 168 | prefetch(next); |
@@ -171,9 +171,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
171 | __rcu_reclaim(rn, list); | 171 | __rcu_reclaim(rn, list); |
172 | local_bh_enable(); | 172 | local_bh_enable(); |
173 | list = next; | 173 | list = next; |
174 | RCU_TRACE(cb_count++); | 174 | RCU_TRACE(cb_count++;) |
175 | } | 175 | } |
176 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | 176 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) |
177 | RCU_TRACE(trace_rcu_batch_end(rcp->name, | 177 | RCU_TRACE(trace_rcu_batch_end(rcp->name, |
178 | cb_count, 0, need_resched(), | 178 | cb_count, 0, need_resched(), |
179 | is_idle_task(current), | 179 | is_idle_task(current), |
@@ -221,7 +221,7 @@ static void __call_rcu(struct rcu_head *head, | |||
221 | local_irq_save(flags); | 221 | local_irq_save(flags); |
222 | *rcp->curtail = head; | 222 | *rcp->curtail = head; |
223 | rcp->curtail = &head->next; | 223 | rcp->curtail = &head->next; |
224 | RCU_TRACE(rcp->qlen++); | 224 | RCU_TRACE(rcp->qlen++;) |
225 | local_irq_restore(flags); | 225 | local_irq_restore(flags); |
226 | 226 | ||
227 | if (unlikely(is_idle_task(current))) { | 227 | if (unlikely(is_idle_task(current))) { |
@@ -254,8 +254,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); | |||
254 | void __init rcu_init(void) | 254 | void __init rcu_init(void) |
255 | { | 255 | { |
256 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 256 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
257 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); | 257 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);) |
258 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); | 258 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);) |
259 | 259 | ||
260 | rcu_early_boot_tests(); | 260 | rcu_early_boot_tests(); |
261 | } | 261 | } |
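The recurring change in this file moves the trailing semicolon inside the RCU_TRACE() argument. A paraphrase of the macro (see kernel/rcu/rcu.h) shows the apparent motivation: with CONFIG_RCU_TRACE=n the argument is compiled out entirely, so keeping the ';' inside means the semicolon disappears along with it, instead of leaving a stray empty statement behind — which is especially awkward when the argument is a local-variable declaration such as "int cb_count = 0":

    /* Paraphrased from kernel/rcu/rcu.h */
    #ifdef CONFIG_RCU_TRACE
    #define RCU_TRACE(stmt) stmt
    #else
    #define RCU_TRACE(stmt)
    #endif

    /*
     * With CONFIG_RCU_TRACE=n:
     *   RCU_TRACE(rcp->qlen++);   expands to ';'   (stray empty statement)
     *   RCU_TRACE(rcp->qlen++;)   expands to nothing
     */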
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index c64b827ecbca..371034e77f87 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h | |||
@@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { | |||
52 | RCU_TRACE(.name = "rcu_bh") | 52 | RCU_TRACE(.name = "rcu_bh") |
53 | }; | 53 | }; |
54 | 54 | ||
55 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 55 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) |
56 | #include <linux/kernel_stat.h> | 56 | #include <linux/kernel_stat.h> |
57 | 57 | ||
58 | int rcu_scheduler_active __read_mostly; | 58 | int rcu_scheduler_active __read_mostly; |
@@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); | |||
65 | * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. | 65 | * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. |
66 | * The reason for this is that Tiny RCU does not need kthreads, so does | 66 | * The reason for this is that Tiny RCU does not need kthreads, so does |
67 | * not have to care about the fact that the scheduler is half-initialized | 67 | * not have to care about the fact that the scheduler is half-initialized |
68 | * at a certain phase of the boot process. | 68 | * at a certain phase of the boot process. Unless SRCU is in the mix. |
69 | */ | 69 | */ |
70 | void __init rcu_scheduler_starting(void) | 70 | void __init rcu_scheduler_starting(void) |
71 | { | 71 | { |
72 | WARN_ON(nr_context_switches() > 0); | 72 | WARN_ON(nr_context_switches() > 0); |
73 | rcu_scheduler_active = RCU_SCHEDULER_RUNNING; | 73 | rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) |
74 | ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; | ||
74 | } | 75 | } |
75 | 76 | ||
76 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 77 | #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ |
77 | 78 | ||
78 | #ifdef CONFIG_RCU_TRACE | 79 | #ifdef CONFIG_RCU_TRACE |
79 | 80 | ||
@@ -162,8 +163,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) | |||
162 | 163 | ||
163 | static void check_cpu_stalls(void) | 164 | static void check_cpu_stalls(void) |
164 | { | 165 | { |
165 | RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); | 166 | RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);) |
166 | RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); | 167 | RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);) |
167 | } | 168 | } |
168 | 169 | ||
169 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 170 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a6dcf3bd244f..e354e475e645 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -98,8 +98,8 @@ struct rcu_state sname##_state = { \ | |||
98 | .gpnum = 0UL - 300UL, \ | 98 | .gpnum = 0UL - 300UL, \ |
99 | .completed = 0UL - 300UL, \ | 99 | .completed = 0UL - 300UL, \ |
100 | .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ | 100 | .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ |
101 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 101 | .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ |
102 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 102 | .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ |
103 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 103 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
104 | .name = RCU_STATE_NAME(sname), \ | 104 | .name = RCU_STATE_NAME(sname), \ |
105 | .abbr = sabbr, \ | 105 | .abbr = sabbr, \ |
@@ -124,7 +124,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF; | |||
124 | module_param(rcu_fanout_leaf, int, 0444); | 124 | module_param(rcu_fanout_leaf, int, 0444); |
125 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; | 125 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; |
126 | /* Number of rcu_nodes at specified level. */ | 126 | /* Number of rcu_nodes at specified level. */ |
127 | static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; | 127 | int num_rcu_lvl[] = NUM_RCU_LVL_INIT; |
128 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ | 128 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ |
129 | /* panic() on RCU Stall sysctl. */ | 129 | /* panic() on RCU Stall sysctl. */ |
130 | int sysctl_panic_on_rcu_stall __read_mostly; | 130 | int sysctl_panic_on_rcu_stall __read_mostly; |
@@ -200,7 +200,7 @@ static const int gp_cleanup_delay; | |||
200 | 200 | ||
201 | /* | 201 | /* |
202 | * Number of grace periods between delays, normalized by the duration of | 202 | * Number of grace periods between delays, normalized by the duration of |
203 | * the delay. The longer the the delay, the more the grace periods between | 203 | * the delay. The longer the delay, the more the grace periods between |
204 | * each delay. The reason for this normalization is that it means that, | 204 | * each delay. The reason for this normalization is that it means that, |
205 | * for non-zero delays, the overall slowdown of grace periods is constant | 205 | * for non-zero delays, the overall slowdown of grace periods is constant |
206 | * regardless of the duration of the delay. This arrangement balances | 206 | * regardless of the duration of the delay. This arrangement balances |
@@ -273,11 +273,19 @@ void rcu_bh_qs(void) | |||
273 | } | 273 | } |
274 | } | 274 | } |
275 | 275 | ||
276 | static DEFINE_PER_CPU(int, rcu_sched_qs_mask); | 276 | /* |
277 | * Steal a bit from the bottom of ->dynticks for idle entry/exit | ||
278 | * control. Initially this is for TLB flushing. | ||
279 | */ | ||
280 | #define RCU_DYNTICK_CTRL_MASK 0x1 | ||
281 | #define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) | ||
282 | #ifndef rcu_eqs_special_exit | ||
283 | #define rcu_eqs_special_exit() do { } while (0) | ||
284 | #endif | ||
277 | 285 | ||
278 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 286 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
279 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 287 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
280 | .dynticks = ATOMIC_INIT(1), | 288 | .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), |
281 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 289 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE |
282 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | 290 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, |
283 | .dynticks_idle = ATOMIC_INIT(1), | 291 | .dynticks_idle = ATOMIC_INIT(1), |
@@ -305,15 +313,20 @@ bool rcu_irq_enter_disabled(void) | |||
305 | static void rcu_dynticks_eqs_enter(void) | 313 | static void rcu_dynticks_eqs_enter(void) |
306 | { | 314 | { |
307 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 315 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
308 | int special; | 316 | int seq; |
309 | 317 | ||
310 | /* | 318 | /* |
311 | * CPUs seeing atomic_inc_return() must see prior RCU read-side | 319 | * CPUs seeing atomic_add_return() must see prior RCU read-side |
312 | * critical sections, and we also must force ordering with the | 320 | * critical sections, and we also must force ordering with the |
313 | * next idle sojourn. | 321 | * next idle sojourn. |
314 | */ | 322 | */ |
315 | special = atomic_inc_return(&rdtp->dynticks); | 323 | seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); |
316 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); | 324 | /* Better be in an extended quiescent state! */ |
325 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && | ||
326 | (seq & RCU_DYNTICK_CTRL_CTR)); | ||
327 | /* Better not have special action (TLB flush) pending! */ | ||
328 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && | ||
329 | (seq & RCU_DYNTICK_CTRL_MASK)); | ||
317 | } | 330 | } |
318 | 331 | ||
319 | /* | 332 | /* |
@@ -323,15 +336,22 @@ static void rcu_dynticks_eqs_enter(void) | |||
323 | static void rcu_dynticks_eqs_exit(void) | 336 | static void rcu_dynticks_eqs_exit(void) |
324 | { | 337 | { |
325 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 338 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
326 | int special; | 339 | int seq; |
327 | 340 | ||
328 | /* | 341 | /* |
329 | * CPUs seeing atomic_inc_return() must see prior idle sojourns, | 342 | * CPUs seeing atomic_add_return() must see prior idle sojourns, |
330 | * and we also must force ordering with the next RCU read-side | 343 | * and we also must force ordering with the next RCU read-side |
331 | * critical section. | 344 | * critical section. |
332 | */ | 345 | */ |
333 | special = atomic_inc_return(&rdtp->dynticks); | 346 | seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); |
334 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); | 347 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && |
348 | !(seq & RCU_DYNTICK_CTRL_CTR)); | ||
349 | if (seq & RCU_DYNTICK_CTRL_MASK) { | ||
350 | atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); | ||
351 | smp_mb__after_atomic(); /* _exit after clearing mask. */ | ||
352 | /* Prefer duplicate flushes to losing a flush. */ | ||
353 | rcu_eqs_special_exit(); | ||
354 | } | ||
335 | } | 355 | } |
336 | 356 | ||
337 | /* | 357 | /* |
@@ -348,9 +368,9 @@ static void rcu_dynticks_eqs_online(void) | |||
348 | { | 368 | { |
349 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 369 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
350 | 370 | ||
351 | if (atomic_read(&rdtp->dynticks) & 0x1) | 371 | if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) |
352 | return; | 372 | return; |
353 | atomic_add(0x1, &rdtp->dynticks); | 373 | atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); |
354 | } | 374 | } |
355 | 375 | ||
356 | /* | 376 | /* |
@@ -362,7 +382,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) | |||
362 | { | 382 | { |
363 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 383 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
364 | 384 | ||
365 | return !(atomic_read(&rdtp->dynticks) & 0x1); | 385 | return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); |
366 | } | 386 | } |
367 | 387 | ||
368 | /* | 388 | /* |
@@ -373,7 +393,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) | |||
373 | { | 393 | { |
374 | int snap = atomic_add_return(0, &rdtp->dynticks); | 394 | int snap = atomic_add_return(0, &rdtp->dynticks); |
375 | 395 | ||
376 | return snap; | 396 | return snap & ~RCU_DYNTICK_CTRL_MASK; |
377 | } | 397 | } |
378 | 398 | ||
379 | /* | 399 | /* |
@@ -382,7 +402,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) | |||
382 | */ | 402 | */ |
383 | static bool rcu_dynticks_in_eqs(int snap) | 403 | static bool rcu_dynticks_in_eqs(int snap) |
384 | { | 404 | { |
385 | return !(snap & 0x1); | 405 | return !(snap & RCU_DYNTICK_CTRL_CTR); |
386 | } | 406 | } |
387 | 407 | ||
388 | /* | 408 | /* |
@@ -402,14 +422,34 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) | |||
402 | static void rcu_dynticks_momentary_idle(void) | 422 | static void rcu_dynticks_momentary_idle(void) |
403 | { | 423 | { |
404 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 424 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
405 | int special = atomic_add_return(2, &rdtp->dynticks); | 425 | int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, |
426 | &rdtp->dynticks); | ||
406 | 427 | ||
407 | /* It is illegal to call this from idle state. */ | 428 | /* It is illegal to call this from idle state. */ |
408 | WARN_ON_ONCE(!(special & 0x1)); | 429 | WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); |
409 | } | 430 | } |
410 | 431 | ||
411 | DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); | 432 | /* |
412 | EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); | 433 | * Set the special (bottom) bit of the specified CPU so that it |
434 | * will take special action (such as flushing its TLB) on the | ||
435 | * next exit from an extended quiescent state. Returns true if | ||
436 | * the bit was successfully set, or false if the CPU was not in | ||
437 | * an extended quiescent state. | ||
438 | */ | ||
439 | bool rcu_eqs_special_set(int cpu) | ||
440 | { | ||
441 | int old; | ||
442 | int new; | ||
443 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
444 | |||
445 | do { | ||
446 | old = atomic_read(&rdtp->dynticks); | ||
447 | if (old & RCU_DYNTICK_CTRL_CTR) | ||
448 | return false; | ||
449 | new = old | RCU_DYNTICK_CTRL_MASK; | ||
450 | } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); | ||
451 | return true; | ||
452 | } | ||
413 | 453 | ||
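The RCU_DYNTICK_CTRL_* changes above re-encode ->dynticks: the low bit becomes a "special action pending" flag and the counter advances in units of RCU_DYNTICK_CTRL_CTR. A stand-alone sketch of that encoding (plain C mirroring the constants in the patch; not kernel code):

    #include <stdio.h>

    #define RCU_DYNTICK_CTRL_MASK 0x1                               /* special-action request bit */
    #define RCU_DYNTICK_CTRL_CTR  (RCU_DYNTICK_CTRL_MASK + 1)       /* counter's least-significant bit */

    static int in_eqs(int snap)
    {
            return !(snap & RCU_DYNTICK_CTRL_CTR);  /* mirrors rcu_dynticks_in_eqs() */
    }

    int main(void)
    {
            int dynticks = RCU_DYNTICK_CTRL_CTR;    /* boot value: not in EQS */

            printf("running:  in_eqs=%d\n", in_eqs(dynticks));
            dynticks += RCU_DYNTICK_CTRL_CTR;       /* eqs_enter(): counter bit clears */
            printf("idle:     in_eqs=%d\n", in_eqs(dynticks));
            dynticks |= RCU_DYNTICK_CTRL_MASK;      /* rcu_eqs_special_set() analogue */
            dynticks += RCU_DYNTICK_CTRL_CTR;       /* eqs_exit(): counter bit sets again */
            printf("exit sees special bit: %d\n", !!(dynticks & RCU_DYNTICK_CTRL_MASK));
            return 0;
    }

In the patched kernel code, rcu_dynticks_eqs_exit() then clears the mask bit and invokes rcu_eqs_special_exit(), which is where a deferred action such as a TLB flush would run.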
414 | /* | 454 | /* |
415 | * Let the RCU core know that this CPU has gone through the scheduler, | 455 | * Let the RCU core know that this CPU has gone through the scheduler, |
@@ -418,44 +458,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); | |||
418 | * memory barriers to let the RCU core know about it, regardless of what | 458 | * memory barriers to let the RCU core know about it, regardless of what |
419 | * this CPU might (or might not) do in the near future. | 459 | * this CPU might (or might not) do in the near future. |
420 | * | 460 | * |
421 | * We inform the RCU core by emulating a zero-duration dyntick-idle | 461 | * We inform the RCU core by emulating a zero-duration dyntick-idle period. |
422 | * period, which we in turn do by incrementing the ->dynticks counter | ||
423 | * by two. | ||
424 | * | 462 | * |
425 | * The caller must have disabled interrupts. | 463 | * The caller must have disabled interrupts. |
426 | */ | 464 | */ |
427 | static void rcu_momentary_dyntick_idle(void) | 465 | static void rcu_momentary_dyntick_idle(void) |
428 | { | 466 | { |
429 | struct rcu_data *rdp; | 467 | raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); |
430 | int resched_mask; | 468 | rcu_dynticks_momentary_idle(); |
431 | struct rcu_state *rsp; | ||
432 | |||
433 | /* | ||
434 | * Yes, we can lose flag-setting operations. This is OK, because | ||
435 | * the flag will be set again after some delay. | ||
436 | */ | ||
437 | resched_mask = raw_cpu_read(rcu_sched_qs_mask); | ||
438 | raw_cpu_write(rcu_sched_qs_mask, 0); | ||
439 | |||
440 | /* Find the flavor that needs a quiescent state. */ | ||
441 | for_each_rcu_flavor(rsp) { | ||
442 | rdp = raw_cpu_ptr(rsp->rda); | ||
443 | if (!(resched_mask & rsp->flavor_mask)) | ||
444 | continue; | ||
445 | smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ | ||
446 | if (READ_ONCE(rdp->mynode->completed) != | ||
447 | READ_ONCE(rdp->cond_resched_completed)) | ||
448 | continue; | ||
449 | |||
450 | /* | ||
451 | * Pretend to be momentarily idle for the quiescent state. | ||
452 | * This allows the grace-period kthread to record the | ||
453 | * quiescent state, with no need for this CPU to do anything | ||
454 | * further. | ||
455 | */ | ||
456 | rcu_dynticks_momentary_idle(); | ||
457 | break; | ||
458 | } | ||
459 | } | 469 | } |
460 | 470 | ||
461 | /* | 471 | /* |
@@ -463,14 +473,22 @@ static void rcu_momentary_dyntick_idle(void) | |||
463 | * and requires special handling for preemptible RCU. | 473 | * and requires special handling for preemptible RCU. |
464 | * The caller must have disabled interrupts. | 474 | * The caller must have disabled interrupts. |
465 | */ | 475 | */ |
466 | void rcu_note_context_switch(void) | 476 | void rcu_note_context_switch(bool preempt) |
467 | { | 477 | { |
468 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | 478 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ |
469 | trace_rcu_utilization(TPS("Start context switch")); | 479 | trace_rcu_utilization(TPS("Start context switch")); |
470 | rcu_sched_qs(); | 480 | rcu_sched_qs(); |
471 | rcu_preempt_note_context_switch(); | 481 | rcu_preempt_note_context_switch(); |
472 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | 482 | /* Load rcu_urgent_qs before other flags. */ |
483 | if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) | ||
484 | goto out; | ||
485 | this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); | ||
486 | if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) | ||
473 | rcu_momentary_dyntick_idle(); | 487 | rcu_momentary_dyntick_idle(); |
488 | this_cpu_inc(rcu_dynticks.rcu_qs_ctr); | ||
489 | if (!preempt) | ||
490 | rcu_note_voluntary_context_switch_lite(current); | ||
491 | out: | ||
474 | trace_rcu_utilization(TPS("End context switch")); | 492 | trace_rcu_utilization(TPS("End context switch")); |
475 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | 493 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ |
476 | } | 494 | } |
@@ -493,29 +511,26 @@ void rcu_all_qs(void) | |||
493 | { | 511 | { |
494 | unsigned long flags; | 512 | unsigned long flags; |
495 | 513 | ||
514 | if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs)) | ||
515 | return; | ||
516 | preempt_disable(); | ||
517 | /* Load rcu_urgent_qs before other flags. */ | ||
518 | if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) { | ||
519 | preempt_enable(); | ||
520 | return; | ||
521 | } | ||
522 | this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); | ||
496 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | 523 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ |
497 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { | 524 | if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { |
498 | local_irq_save(flags); | 525 | local_irq_save(flags); |
499 | rcu_momentary_dyntick_idle(); | 526 | rcu_momentary_dyntick_idle(); |
500 | local_irq_restore(flags); | 527 | local_irq_restore(flags); |
501 | } | 528 | } |
502 | if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { | 529 | if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) |
503 | /* | ||
504 | * Yes, we just checked a per-CPU variable with preemption | ||
505 | * enabled, so we might be migrated to some other CPU at | ||
506 | * this point. That is OK because in that case, the | ||
507 | * migration will supply the needed quiescent state. | ||
508 | * We might end up needlessly disabling preemption and | ||
509 | * invoking rcu_sched_qs() on the destination CPU, but | ||
510 | * the probability and cost are both quite low, so this | ||
511 | * should not be a problem in practice. | ||
512 | */ | ||
513 | preempt_disable(); | ||
514 | rcu_sched_qs(); | 530 | rcu_sched_qs(); |
515 | preempt_enable(); | 531 | this_cpu_inc(rcu_dynticks.rcu_qs_ctr); |
516 | } | ||
517 | this_cpu_inc(rcu_qs_ctr); | ||
518 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | 532 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ |
533 | preempt_enable(); | ||
519 | } | 534 | } |
520 | EXPORT_SYMBOL_GPL(rcu_all_qs); | 535 | EXPORT_SYMBOL_GPL(rcu_all_qs); |
521 | 536 | ||
@@ -704,15 +719,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, | |||
704 | default: | 719 | default: |
705 | break; | 720 | break; |
706 | } | 721 | } |
707 | if (rsp != NULL) { | 722 | if (rsp == NULL) |
708 | *flags = READ_ONCE(rsp->gp_flags); | ||
709 | *gpnum = READ_ONCE(rsp->gpnum); | ||
710 | *completed = READ_ONCE(rsp->completed); | ||
711 | return; | 723 | return; |
712 | } | 724 | *flags = READ_ONCE(rsp->gp_flags); |
713 | *flags = 0; | 725 | *gpnum = READ_ONCE(rsp->gpnum); |
714 | *gpnum = 0; | 726 | *completed = READ_ONCE(rsp->completed); |
715 | *completed = 0; | ||
716 | } | 727 | } |
717 | EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); | 728 | EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); |
718 | 729 | ||
@@ -728,16 +739,6 @@ void rcutorture_record_progress(unsigned long vernum) | |||
728 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | 739 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); |
729 | 740 | ||
730 | /* | 741 | /* |
731 | * Does the CPU have callbacks ready to be invoked? | ||
732 | */ | ||
733 | static int | ||
734 | cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | ||
735 | { | ||
736 | return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && | ||
737 | rdp->nxttail[RCU_NEXT_TAIL] != NULL; | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * Return the root node of the specified rcu_state structure. | 742 | * Return the root node of the specified rcu_state structure. |
742 | */ | 743 | */ |
743 | static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | 744 | static struct rcu_node *rcu_get_root(struct rcu_state *rsp) |
@@ -767,21 +768,17 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) | |||
767 | static bool | 768 | static bool |
768 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | 769 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) |
769 | { | 770 | { |
770 | int i; | ||
771 | |||
772 | if (rcu_gp_in_progress(rsp)) | 771 | if (rcu_gp_in_progress(rsp)) |
773 | return false; /* No, a grace period is already in progress. */ | 772 | return false; /* No, a grace period is already in progress. */ |
774 | if (rcu_future_needs_gp(rsp)) | 773 | if (rcu_future_needs_gp(rsp)) |
775 | return true; /* Yes, a no-CBs CPU needs one. */ | 774 | return true; /* Yes, a no-CBs CPU needs one. */ |
776 | if (!rdp->nxttail[RCU_NEXT_TAIL]) | 775 | if (!rcu_segcblist_is_enabled(&rdp->cblist)) |
777 | return false; /* No, this is a no-CBs (or offline) CPU. */ | 776 | return false; /* No, this is a no-CBs (or offline) CPU. */ |
778 | if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) | 777 | if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) |
779 | return true; /* Yes, CPU has newly registered callbacks. */ | 778 | return true; /* Yes, CPU has newly registered callbacks. */ |
780 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) | 779 | if (rcu_segcblist_future_gp_needed(&rdp->cblist, |
781 | if (rdp->nxttail[i - 1] != rdp->nxttail[i] && | 780 | READ_ONCE(rsp->completed))) |
782 | ULONG_CMP_LT(READ_ONCE(rsp->completed), | 781 | return true; /* Yes, CBs for future grace period. */ |
783 | rdp->nxtcompleted[i])) | ||
784 | return true; /* Yes, CBs for future grace period. */ | ||
785 | return false; /* No grace period needed. */ | 782 | return false; /* No grace period needed. */ |
786 | } | 783 | } |
787 | 784 | ||
@@ -1162,6 +1159,24 @@ bool notrace rcu_is_watching(void) | |||
1162 | } | 1159 | } |
1163 | EXPORT_SYMBOL_GPL(rcu_is_watching); | 1160 | EXPORT_SYMBOL_GPL(rcu_is_watching); |
1164 | 1161 | ||
1162 | /* | ||
1163 | * If a holdout task is actually running, request an urgent quiescent | ||
1164 | * state from its CPU. This is unsynchronized, so migrations can cause | ||
1165 | * the request to go to the wrong CPU. Which is OK, all that will happen | ||
1166 | * is that the CPU's next context switch will be a bit slower and next | ||
1167 | * time around this task will generate another request. | ||
1168 | */ | ||
1169 | void rcu_request_urgent_qs_task(struct task_struct *t) | ||
1170 | { | ||
1171 | int cpu; | ||
1172 | |||
1173 | barrier(); | ||
1174 | cpu = task_cpu(t); | ||
1175 | if (!task_curr(t)) | ||
1176 | return; /* This task is not running on that CPU. */ | ||
1177 | smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true); | ||
1178 | } | ||
1179 | |||
1165 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) | 1180 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) |
1166 | 1181 | ||
1167 | /* | 1182 | /* |
@@ -1247,7 +1262,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
1247 | bool *isidle, unsigned long *maxj) | 1262 | bool *isidle, unsigned long *maxj) |
1248 | { | 1263 | { |
1249 | unsigned long jtsq; | 1264 | unsigned long jtsq; |
1250 | int *rcrmp; | 1265 | bool *rnhqp; |
1266 | bool *ruqp; | ||
1251 | unsigned long rjtsc; | 1267 | unsigned long rjtsc; |
1252 | struct rcu_node *rnp; | 1268 | struct rcu_node *rnp; |
1253 | 1269 | ||
@@ -1283,11 +1299,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
1283 | * might not be the case for nohz_full CPUs looping in the kernel. | 1299 | * might not be the case for nohz_full CPUs looping in the kernel. |
1284 | */ | 1300 | */ |
1285 | rnp = rdp->mynode; | 1301 | rnp = rdp->mynode; |
1302 | ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); | ||
1286 | if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && | 1303 | if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && |
1287 | READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && | 1304 | READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && |
1288 | READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { | 1305 | READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { |
1289 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); | 1306 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); |
1290 | return 1; | 1307 | return 1; |
1308 | } else { | ||
1309 | /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ | ||
1310 | smp_store_release(ruqp, true); | ||
1291 | } | 1311 | } |
1292 | 1312 | ||
1293 | /* Check for the CPU being offline. */ | 1313 | /* Check for the CPU being offline. */ |
@@ -1304,7 +1324,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
1304 | * in-kernel CPU-bound tasks cannot advance grace periods. | 1324 | * in-kernel CPU-bound tasks cannot advance grace periods. |
1305 | * So if the grace period is old enough, make the CPU pay attention. | 1325 | * So if the grace period is old enough, make the CPU pay attention. |
1306 | * Note that the unsynchronized assignments to the per-CPU | 1326 | * Note that the unsynchronized assignments to the per-CPU |
1307 | * rcu_sched_qs_mask variable are safe. Yes, setting of | 1327 | * rcu_need_heavy_qs variable are safe. Yes, setting of |
1308 | * bits can be lost, but they will be set again on the next | 1328 | * bits can be lost, but they will be set again on the next |
1309 | * force-quiescent-state pass. So lost bit sets do not result | 1329 | * force-quiescent-state pass. So lost bit sets do not result |
1310 | * in incorrect behavior, merely in a grace period lasting | 1330 | * in incorrect behavior, merely in a grace period lasting |
@@ -1318,16 +1338,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
1318 | * is set too high, we override with half of the RCU CPU stall | 1338 | * is set too high, we override with half of the RCU CPU stall |
1319 | * warning delay. | 1339 | * warning delay. |
1320 | */ | 1340 | */ |
1321 | rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); | 1341 | rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); |
1322 | if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || | 1342 | if (!READ_ONCE(*rnhqp) && |
1323 | time_after(jiffies, rdp->rsp->jiffies_resched)) { | 1343 | (time_after(jiffies, rdp->rsp->gp_start + jtsq) || |
1324 | if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { | 1344 | time_after(jiffies, rdp->rsp->jiffies_resched))) { |
1325 | WRITE_ONCE(rdp->cond_resched_completed, | 1345 | WRITE_ONCE(*rnhqp, true); |
1326 | READ_ONCE(rdp->mynode->completed)); | 1346 | /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ |
1327 | smp_mb(); /* ->cond_resched_completed before *rcrmp. */ | 1347 | smp_store_release(ruqp, true); |
1328 | WRITE_ONCE(*rcrmp, | ||
1329 | READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); | ||
1330 | } | ||
1331 | rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ | 1348 | rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ |
1332 | } | 1349 | } |
1333 | 1350 | ||
@@ -1487,7 +1504,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | |||
1487 | 1504 | ||
1488 | print_cpu_stall_info_end(); | 1505 | print_cpu_stall_info_end(); |
1489 | for_each_possible_cpu(cpu) | 1506 | for_each_possible_cpu(cpu) |
1490 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | 1507 | totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, |
1508 | cpu)->cblist); | ||
1491 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", | 1509 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", |
1492 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | 1510 | smp_processor_id(), (long)(jiffies - rsp->gp_start), |
1493 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1511 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
@@ -1541,7 +1559,8 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
1541 | print_cpu_stall_info(rsp, smp_processor_id()); | 1559 | print_cpu_stall_info(rsp, smp_processor_id()); |
1542 | print_cpu_stall_info_end(); | 1560 | print_cpu_stall_info_end(); |
1543 | for_each_possible_cpu(cpu) | 1561 | for_each_possible_cpu(cpu) |
1544 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | 1562 | totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, |
1563 | cpu)->cblist); | ||
1545 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", | 1564 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", |
1546 | jiffies - rsp->gp_start, | 1565 | jiffies - rsp->gp_start, |
1547 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1566 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
@@ -1644,30 +1663,6 @@ void rcu_cpu_stall_reset(void) | |||
1644 | } | 1663 | } |
1645 | 1664 | ||
1646 | /* | 1665 | /* |
1647 | * Initialize the specified rcu_data structure's default callback list | ||
1648 | * to empty. The default callback list is the one that is not used by | ||
1649 | * no-callbacks CPUs. | ||
1650 | */ | ||
1651 | static void init_default_callback_list(struct rcu_data *rdp) | ||
1652 | { | ||
1653 | int i; | ||
1654 | |||
1655 | rdp->nxtlist = NULL; | ||
1656 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1657 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1658 | } | ||
1659 | |||
1660 | /* | ||
1661 | * Initialize the specified rcu_data structure's callback list to empty. | ||
1662 | */ | ||
1663 | static void init_callback_list(struct rcu_data *rdp) | ||
1664 | { | ||
1665 | if (init_nocb_callback_list(rdp)) | ||
1666 | return; | ||
1667 | init_default_callback_list(rdp); | ||
1668 | } | ||
1669 | |||
1670 | /* | ||
1671 | * Determine the value that ->completed will have at the end of the | 1666 | * Determine the value that ->completed will have at the end of the |
1672 | * next subsequent grace period. This is used to tag callbacks so that | 1667 | * next subsequent grace period. This is used to tag callbacks so that |
1673 | * a CPU can invoke callbacks in a timely fashion even if that CPU has | 1668 | * a CPU can invoke callbacks in a timely fashion even if that CPU has |
@@ -1721,7 +1716,6 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
1721 | unsigned long *c_out) | 1716 | unsigned long *c_out) |
1722 | { | 1717 | { |
1723 | unsigned long c; | 1718 | unsigned long c; |
1724 | int i; | ||
1725 | bool ret = false; | 1719 | bool ret = false; |
1726 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); | 1720 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); |
1727 | 1721 | ||
@@ -1767,13 +1761,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
1767 | /* | 1761 | /* |
1768 | * Get a new grace-period number. If there really is no grace | 1762 | * Get a new grace-period number. If there really is no grace |
1769 | * period in progress, it will be smaller than the one we obtained | 1763 | * period in progress, it will be smaller than the one we obtained |
1770 | * earlier. Adjust callbacks as needed. Note that even no-CBs | 1764 | * earlier. Adjust callbacks as needed. |
1771 | * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. | ||
1772 | */ | 1765 | */ |
1773 | c = rcu_cbs_completed(rdp->rsp, rnp_root); | 1766 | c = rcu_cbs_completed(rdp->rsp, rnp_root); |
1774 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) | 1767 | if (!rcu_is_nocb_cpu(rdp->cpu)) |
1775 | if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) | 1768 | (void)rcu_segcblist_accelerate(&rdp->cblist, c); |
1776 | rdp->nxtcompleted[i] = c; | ||
1777 | 1769 | ||
1778 | /* | 1770 | /* |
1779 | * If the needed for the required grace period is already | 1771 | * If the needed for the required grace period is already |
@@ -1805,9 +1797,7 @@ out: | |||
1805 | 1797 | ||
1806 | /* | 1798 | /* |
1807 | * Clean up any old requests for the just-ended grace period. Also return | 1799 | * Clean up any old requests for the just-ended grace period. Also return |
1808 | * whether any additional grace periods have been requested. Also invoke | 1800 | * whether any additional grace periods have been requested. |
1809 | * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads | ||
1810 | * waiting for this grace period to complete. | ||
1811 | */ | 1801 | */ |
1812 | static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | 1802 | static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) |
1813 | { | 1803 | { |
@@ -1853,57 +1843,27 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) | |||
1853 | static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1843 | static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
1854 | struct rcu_data *rdp) | 1844 | struct rcu_data *rdp) |
1855 | { | 1845 | { |
1856 | unsigned long c; | 1846 | bool ret = false; |
1857 | int i; | ||
1858 | bool ret; | ||
1859 | |||
1860 | /* If the CPU has no callbacks, nothing to do. */ | ||
1861 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | ||
1862 | return false; | ||
1863 | |||
1864 | /* | ||
1865 | * Starting from the sublist containing the callbacks most | ||
1866 | * recently assigned a ->completed number and working down, find the | ||
1867 | * first sublist that is not assignable to an upcoming grace period. | ||
1868 | * Such a sublist has something in it (first two tests) and has | ||
1869 | * a ->completed number assigned that will complete sooner than | ||
1870 | * the ->completed number for newly arrived callbacks (last test). | ||
1871 | * | ||
1872 | * The key point is that any later sublist can be assigned the | ||
1873 | * same ->completed number as the newly arrived callbacks, which | ||
1874 | * means that the callbacks in any of these later sublist can be | ||
1875 | * grouped into a single sublist, whether or not they have already | ||
1876 | * been assigned a ->completed number. | ||
1877 | */ | ||
1878 | c = rcu_cbs_completed(rsp, rnp); | ||
1879 | for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) | ||
1880 | if (rdp->nxttail[i] != rdp->nxttail[i - 1] && | ||
1881 | !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) | ||
1882 | break; | ||
1883 | 1847 | ||
1884 | /* | 1848 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
1885 | * If there are no sublist for unassigned callbacks, leave. | 1849 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) |
1886 | * At the same time, advance "i" one sublist, so that "i" will | ||
1887 | * index into the sublist where all the remaining callbacks should | ||
1888 | * be grouped into. | ||
1889 | */ | ||
1890 | if (++i >= RCU_NEXT_TAIL) | ||
1891 | return false; | 1850 | return false; |
1892 | 1851 | ||
1893 | /* | 1852 | /* |
1894 | * Assign all subsequent callbacks' ->completed number to the next | 1853 | * Callbacks are often registered with incomplete grace-period |
1895 | * full grace period and group them all in the sublist initially | 1854 | * information. Something about the fact that getting exact |
1896 | * indexed by "i". | 1855 | * information requires acquiring a global lock... RCU therefore |
1856 | * makes a conservative estimate of the grace period number at which | ||
1857 | * a given callback will become ready to invoke. The following | ||
1858 | * code checks this estimate and improves it when possible, thus | ||
1859 | * accelerating callback invocation to an earlier grace-period | ||
1860 | * number. | ||
1897 | */ | 1861 | */ |
1898 | for (; i <= RCU_NEXT_TAIL; i++) { | 1862 | if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) |
1899 | rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; | 1863 | ret = rcu_start_future_gp(rnp, rdp, NULL); |
1900 | rdp->nxtcompleted[i] = c; | ||
1901 | } | ||
1902 | /* Record any needed additional grace periods. */ | ||
1903 | ret = rcu_start_future_gp(rnp, rdp, NULL); | ||
1904 | 1864 | ||
1905 | /* Trace depending on how much we were able to accelerate. */ | 1865 | /* Trace depending on how much we were able to accelerate. */ |
1906 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | 1866 | if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) |
1907 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); | 1867 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); |
1908 | else | 1868 | else |
1909 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); | 1869 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); |
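The rewritten rcu_accelerate_cbs() above delegates the old open-coded ->nxttail[]/->nxtcompleted[] bookkeeping to rcu_segcblist_accelerate(), which tags every callback not yet promised an earlier grace period with the number returned by rcu_cbs_completed(). As an illustration only, here is a minimal userspace sketch of that operation over a hypothetical length-based toy_segcblist; the real structure chains rcu_head pointers through tail pointers and uses ULONG_CMP_LT() to survive counter wrap, both of which are omitted here.

#include <stdbool.h>
#include <stdio.h>

enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, NSEGS };

/* Toy segmented callback list: per-segment callback counts plus the
 * grace-period number each waiting segment has been promised. */
struct toy_segcblist {
	long seg_len[NSEGS];
	unsigned long seg_gp[NSEGS];
};

/* Group all callbacks that can legitimately use grace period "seq"
 * into one segment tagged "seq".  Returns true if anything moved. */
bool toy_accelerate(struct toy_segcblist *scl, unsigned long seq)
{
	int i, j;

	if (!scl->seg_len[SEG_WAIT] && !scl->seg_len[SEG_NEXT_READY] &&
	    !scl->seg_len[SEG_NEXT])
		return false;	/* nothing beyond the DONE segment */

	/* Find the last non-empty segment already promised an earlier GP. */
	for (i = SEG_NEXT_READY; i > SEG_DONE; i--)
		if (scl->seg_len[i] && scl->seg_gp[i] < seq)
			break;
	if (++i >= SEG_NEXT)
		return false;	/* everything already has an earlier GP */

	/* Merge segment i and all later segments, tagging them "seq". */
	for (j = i + 1; j < NSEGS; j++) {
		scl->seg_len[i] += scl->seg_len[j];
		scl->seg_len[j] = 0;
	}
	scl->seg_gp[i] = seq;
	return true;
}

int main(void)
{
	struct toy_segcblist scl = {
		.seg_len = { 0, 2, 0, 5 },	/* 2 waiting on GP 8, 5 unassigned */
		.seg_gp  = { 0, 8, 0, 0 },
	};

	if (toy_accelerate(&scl, 10))
		printf("WAIT: %ld cbs @ GP %lu, NEXT_READY: %ld cbs @ GP %lu\n",
		       scl.seg_len[SEG_WAIT], scl.seg_gp[SEG_WAIT],
		       scl.seg_len[SEG_NEXT_READY], scl.seg_gp[SEG_NEXT_READY]);
	return 0;
}

The five unassigned callbacks end up grouped under grace period 10 while the two already promised grace period 8 are left alone, which is exactly the "improve the conservative estimate when possible" behavior the new comment describes.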
@@ -1923,32 +1883,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1923 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1883 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
1924 | struct rcu_data *rdp) | 1884 | struct rcu_data *rdp) |
1925 | { | 1885 | { |
1926 | int i, j; | 1886 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
1927 | 1887 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) | |
1928 | /* If the CPU has no callbacks, nothing to do. */ | ||
1929 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | ||
1930 | return false; | 1888 | return false; |
1931 | 1889 | ||
1932 | /* | 1890 | /* |
1933 | * Find all callbacks whose ->completed numbers indicate that they | 1891 | * Find all callbacks whose ->completed numbers indicate that they |
1934 | * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. | 1892 | * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. |
1935 | */ | 1893 | */ |
1936 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { | 1894 | rcu_segcblist_advance(&rdp->cblist, rnp->completed); |
1937 | if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) | ||
1938 | break; | ||
1939 | rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; | ||
1940 | } | ||
1941 | /* Clean up any sublist tail pointers that were misordered above. */ | ||
1942 | for (j = RCU_WAIT_TAIL; j < i; j++) | ||
1943 | rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; | ||
1944 | |||
1945 | /* Copy down callbacks to fill in empty sublists. */ | ||
1946 | for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { | ||
1947 | if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) | ||
1948 | break; | ||
1949 | rdp->nxttail[j] = rdp->nxttail[i]; | ||
1950 | rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; | ||
1951 | } | ||
1952 | 1895 | ||
1953 | /* Classify any remaining callbacks. */ | 1896 | /* Classify any remaining callbacks. */ |
1954 | return rcu_accelerate_cbs(rsp, rnp, rdp); | 1897 | return rcu_accelerate_cbs(rsp, rnp, rdp); |
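rcu_advance_cbs() collapses in the same way: rcu_segcblist_advance() promotes every segment whose grace-period number is at or before rnp->completed into the done segment, after which the remaining callbacks are re-accelerated. A rough sketch in the same toy length-based representation, with counter wrap again ignored:

enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, NSEGS };

struct toy_segcblist {
	long seg_len[NSEGS];
	unsigned long seg_gp[NSEGS];
};

/* Make ready to invoke every callback whose grace period has ended. */
void toy_advance(struct toy_segcblist *scl, unsigned long completed)
{
	for (int i = SEG_WAIT; i < SEG_NEXT; i++) {
		if (!scl->seg_len[i])
			continue;		/* empty segment, keep looking */
		if (scl->seg_gp[i] > completed)
			break;			/* this GP has not ended yet */
		scl->seg_len[SEG_DONE] += scl->seg_len[i];
		scl->seg_len[i] = 0;
	}
}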
@@ -1993,7 +1936,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1993 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); | 1936 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
1994 | need_gp = !!(rnp->qsmask & rdp->grpmask); | 1937 | need_gp = !!(rnp->qsmask & rdp->grpmask); |
1995 | rdp->cpu_no_qs.b.norm = need_gp; | 1938 | rdp->cpu_no_qs.b.norm = need_gp; |
1996 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 1939 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); |
1997 | rdp->core_needs_qs = need_gp; | 1940 | rdp->core_needs_qs = need_gp; |
1998 | zero_cpu_stall_ticks(rdp); | 1941 | zero_cpu_stall_ticks(rdp); |
1999 | WRITE_ONCE(rdp->gpwrap, false); | 1942 | WRITE_ONCE(rdp->gpwrap, false); |
@@ -2591,7 +2534,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
2591 | * within the current grace period. | 2534 | * within the current grace period. |
2592 | */ | 2535 | */ |
2593 | rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ | 2536 | rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ |
2594 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 2537 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); |
2595 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 2538 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
2596 | return; | 2539 | return; |
2597 | } | 2540 | } |
@@ -2665,13 +2608,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
2665 | * because _rcu_barrier() excludes CPU-hotplug operations, so it | 2608 | * because _rcu_barrier() excludes CPU-hotplug operations, so it |
2666 | * cannot be running now. Thus no memory barrier is required. | 2609 | * cannot be running now. Thus no memory barrier is required. |
2667 | */ | 2610 | */ |
2668 | if (rdp->nxtlist != NULL) { | 2611 | rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); |
2669 | rsp->qlen_lazy += rdp->qlen_lazy; | 2612 | rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); |
2670 | rsp->qlen += rdp->qlen; | ||
2671 | rdp->n_cbs_orphaned += rdp->qlen; | ||
2672 | rdp->qlen_lazy = 0; | ||
2673 | WRITE_ONCE(rdp->qlen, 0); | ||
2674 | } | ||
2675 | 2613 | ||
2676 | /* | 2614 | /* |
2677 | * Next, move those callbacks still needing a grace period to | 2615 | * Next, move those callbacks still needing a grace period to |
@@ -2679,31 +2617,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
2679 | * Some of the callbacks might have gone partway through a grace | 2617 | * Some of the callbacks might have gone partway through a grace |
2680 | * period, but that is too bad. They get to start over because we | 2618 | * period, but that is too bad. They get to start over because we |
2681 | * cannot assume that grace periods are synchronized across CPUs. | 2619 | * cannot assume that grace periods are synchronized across CPUs. |
2682 | * We don't bother updating the ->nxttail[] array yet, instead | ||
2683 | * we just reset the whole thing later on. | ||
2684 | */ | 2620 | */ |
2685 | if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { | 2621 | rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); |
2686 | *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; | ||
2687 | rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; | ||
2688 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | ||
2689 | } | ||
2690 | 2622 | ||
2691 | /* | 2623 | /* |
2692 | * Then move the ready-to-invoke callbacks to the orphanage, | 2624 | * Then move the ready-to-invoke callbacks to the orphanage, |
2693 | * where some other CPU will pick them up. These will not be | 2625 | * where some other CPU will pick them up. These will not be |
2694 | * required to pass through another grace period: They are done. | 2626 | * required to pass through another grace period: They are done. |
2695 | */ | 2627 | */ |
2696 | if (rdp->nxtlist != NULL) { | 2628 | rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); |
2697 | *rsp->orphan_donetail = rdp->nxtlist; | ||
2698 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; | ||
2699 | } | ||
2700 | 2629 | ||
2701 | /* | 2630 | /* Finally, disallow further callbacks on this CPU. */ |
2702 | * Finally, initialize the rcu_data structure's list to empty and | 2631 | rcu_segcblist_disable(&rdp->cblist); |
2703 | * disallow further callbacks on this CPU. | ||
2704 | */ | ||
2705 | init_callback_list(rdp); | ||
2706 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
2707 | } | 2632 | } |
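The orphanage hand-off above now moves whole segments at a time: the counts go first and ride with rsp->orphan_done (so rcu_barrier() can only ever over-count, never under-count), then the callbacks still needing a grace period, then the ready-to-invoke ones, and finally the list is disabled. A speculative sketch of the splicing with toy stand-ins for rcu_cblist and rcu_segcblist; the toy per-CPU list keeps only two chains where the real one keeps four segments.

#include <stdbool.h>
#include <stddef.h>

struct toy_cb { struct toy_cb *next; };

/* Flat orphanage list: singly linked with a tail pointer for O(1)
 * splicing, plus the count that travels with it.  An empty list has
 * tail == &head. */
struct toy_cblist {
	struct toy_cb *head, **tail;
	long len;
};

struct toy_percpu_cbs {
	struct toy_cblist done;		/* grace period already elapsed */
	struct toy_cblist pend;		/* still waiting for one */
	bool enabled;
};

void toy_send_to_orphanage(struct toy_percpu_cbs *cbs,
			   struct toy_cblist *orphan_pend,
			   struct toy_cblist *orphan_done)
{
	/* Counts first, all of them onto the done-side orphanage. */
	orphan_done->len += cbs->done.len + cbs->pend.len;
	cbs->done.len = cbs->pend.len = 0;

	/* Then the not-yet-ready chain... */
	if (cbs->pend.head) {
		*orphan_pend->tail = cbs->pend.head;
		orphan_pend->tail = cbs->pend.tail;
		cbs->pend.head = NULL;
		cbs->pend.tail = &cbs->pend.head;
	}
	/* ...then the ready chain... */
	if (cbs->done.head) {
		*orphan_done->tail = cbs->done.head;
		orphan_done->tail = cbs->done.tail;
		cbs->done.head = NULL;
		cbs->done.tail = &cbs->done.head;
	}
	/* ...and no further callbacks may be posted on this CPU. */
	cbs->enabled = false;
}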
2708 | 2633 | ||
2709 | /* | 2634 | /* |
@@ -2712,7 +2637,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
2712 | */ | 2637 | */ |
2713 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | 2638 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) |
2714 | { | 2639 | { |
2715 | int i; | ||
2716 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); | 2640 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
2717 | 2641 | ||
2718 | /* No-CBs CPUs are handled specially. */ | 2642 | /* No-CBs CPUs are handled specially. */ |
@@ -2721,13 +2645,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | |||
2721 | return; | 2645 | return; |
2722 | 2646 | ||
2723 | /* Do the accounting first. */ | 2647 | /* Do the accounting first. */ |
2724 | rdp->qlen_lazy += rsp->qlen_lazy; | 2648 | rdp->n_cbs_adopted += rsp->orphan_done.len; |
2725 | rdp->qlen += rsp->qlen; | 2649 | if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) |
2726 | rdp->n_cbs_adopted += rsp->qlen; | ||
2727 | if (rsp->qlen_lazy != rsp->qlen) | ||
2728 | rcu_idle_count_callbacks_posted(); | 2650 | rcu_idle_count_callbacks_posted(); |
2729 | rsp->qlen_lazy = 0; | 2651 | rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); |
2730 | rsp->qlen = 0; | ||
2731 | 2652 | ||
2732 | /* | 2653 | /* |
2733 | * We do not need a memory barrier here because the only way we | 2654 | * We do not need a memory barrier here because the only way we |
@@ -2735,24 +2656,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | |||
2735 | * we are the task doing the rcu_barrier(). | 2656 | * we are the task doing the rcu_barrier(). |
2736 | */ | 2657 | */ |
2737 | 2658 | ||
2738 | /* First adopt the ready-to-invoke callbacks. */ | 2659 | /* First adopt the ready-to-invoke callbacks, then the done ones. */ |
2739 | if (rsp->orphan_donelist != NULL) { | 2660 | rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); |
2740 | *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; | 2661 | WARN_ON_ONCE(rsp->orphan_done.head); |
2741 | *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; | 2662 | rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); |
2742 | for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) | 2663 | WARN_ON_ONCE(rsp->orphan_pend.head); |
2743 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | 2664 | WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != |
2744 | rdp->nxttail[i] = rsp->orphan_donetail; | 2665 | !rcu_segcblist_n_cbs(&rdp->cblist)); |
2745 | rsp->orphan_donelist = NULL; | ||
2746 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
2747 | } | ||
2748 | |||
2749 | /* And then adopt the callbacks that still need a grace period. */ | ||
2750 | if (rsp->orphan_nxtlist != NULL) { | ||
2751 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; | ||
2752 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; | ||
2753 | rsp->orphan_nxtlist = NULL; | ||
2754 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
2755 | } | ||
2756 | } | 2666 | } |
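On the adoption side the interesting part is the accounting: the orphanage counters are folded into the adopting CPU's list, and the idle-path bookkeeping is still poked whenever any of the adopted callbacks were non-lazy. A small sketch of just that step, with toy fields standing in for what rcu_segcblist_insert_count() and the rcu_cblist counters do:

#include <stdbool.h>

struct toy_counts {
	long len;	/* all callbacks */
	long len_lazy;	/* kfree_rcu()-style callbacks only */
};

/* Returns true when the caller should behave as though non-lazy
 * callbacks were just posted (the rcu_idle_count_callbacks_posted()
 * case in the code above). */
bool toy_adopt_counts(struct toy_counts *cpu, struct toy_counts *orphan)
{
	bool saw_nonlazy = orphan->len_lazy != orphan->len;

	cpu->len += orphan->len;
	cpu->len_lazy += orphan->len_lazy;
	orphan->len = 0;
	orphan->len_lazy = 0;
	return saw_nonlazy;
}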
2757 | 2667 | ||
2758 | /* | 2668 | /* |
@@ -2760,14 +2670,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | |||
2760 | */ | 2670 | */ |
2761 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 2671 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
2762 | { | 2672 | { |
2763 | RCU_TRACE(unsigned long mask); | 2673 | RCU_TRACE(unsigned long mask;) |
2764 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); | 2674 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) |
2765 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); | 2675 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) |
2766 | 2676 | ||
2767 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) | 2677 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) |
2768 | return; | 2678 | return; |
2769 | 2679 | ||
2770 | RCU_TRACE(mask = rdp->grpmask); | 2680 | RCU_TRACE(mask = rdp->grpmask;) |
2771 | trace_rcu_grace_period(rsp->name, | 2681 | trace_rcu_grace_period(rsp->name, |
2772 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 2682 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
2773 | TPS("cpuofl")); | 2683 | TPS("cpuofl")); |
@@ -2840,9 +2750,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
2840 | rcu_adopt_orphan_cbs(rsp, flags); | 2750 | rcu_adopt_orphan_cbs(rsp, flags); |
2841 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); | 2751 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); |
2842 | 2752 | ||
2843 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | 2753 | WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || |
2844 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | 2754 | !rcu_segcblist_empty(&rdp->cblist), |
2845 | cpu, rdp->qlen, rdp->nxtlist); | 2755 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", |
2756 | cpu, rcu_segcblist_n_cbs(&rdp->cblist), | ||
2757 | rcu_segcblist_first_cb(&rdp->cblist)); | ||
2846 | } | 2758 | } |
2847 | 2759 | ||
2848 | /* | 2760 | /* |
@@ -2852,14 +2764,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
2852 | static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | 2764 | static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) |
2853 | { | 2765 | { |
2854 | unsigned long flags; | 2766 | unsigned long flags; |
2855 | struct rcu_head *next, *list, **tail; | 2767 | struct rcu_head *rhp; |
2856 | long bl, count, count_lazy; | 2768 | struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); |
2857 | int i; | 2769 | long bl, count; |
2858 | 2770 | ||
2859 | /* If no callbacks are ready, just return. */ | 2771 | /* If no callbacks are ready, just return. */ |
2860 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 2772 | if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { |
2861 | trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); | 2773 | trace_rcu_batch_start(rsp->name, |
2862 | trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), | 2774 | rcu_segcblist_n_lazy_cbs(&rdp->cblist), |
2775 | rcu_segcblist_n_cbs(&rdp->cblist), 0); | ||
2776 | trace_rcu_batch_end(rsp->name, 0, | ||
2777 | !rcu_segcblist_empty(&rdp->cblist), | ||
2863 | need_resched(), is_idle_task(current), | 2778 | need_resched(), is_idle_task(current), |
2864 | rcu_is_callbacks_kthread()); | 2779 | rcu_is_callbacks_kthread()); |
2865 | return; | 2780 | return; |
@@ -2867,73 +2782,61 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2867 | 2782 | ||
2868 | /* | 2783 | /* |
2869 | * Extract the list of ready callbacks, disabling to prevent | 2784 | * Extract the list of ready callbacks, disabling to prevent |
2870 | * races with call_rcu() from interrupt handlers. | 2785 | * races with call_rcu() from interrupt handlers. Leave the |
2786 | * callback counts, as rcu_barrier() needs to be conservative. | ||
2871 | */ | 2787 | */ |
2872 | local_irq_save(flags); | 2788 | local_irq_save(flags); |
2873 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | 2789 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); |
2874 | bl = rdp->blimit; | 2790 | bl = rdp->blimit; |
2875 | trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); | 2791 | trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), |
2876 | list = rdp->nxtlist; | 2792 | rcu_segcblist_n_cbs(&rdp->cblist), bl); |
2877 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | 2793 | rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); |
2878 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | ||
2879 | tail = rdp->nxttail[RCU_DONE_TAIL]; | ||
2880 | for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) | ||
2881 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | ||
2882 | rdp->nxttail[i] = &rdp->nxtlist; | ||
2883 | local_irq_restore(flags); | 2794 | local_irq_restore(flags); |
2884 | 2795 | ||
2885 | /* Invoke callbacks. */ | 2796 | /* Invoke callbacks. */ |
2886 | count = count_lazy = 0; | 2797 | rhp = rcu_cblist_dequeue(&rcl); |
2887 | while (list) { | 2798 | for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { |
2888 | next = list->next; | 2799 | debug_rcu_head_unqueue(rhp); |
2889 | prefetch(next); | 2800 | if (__rcu_reclaim(rsp->name, rhp)) |
2890 | debug_rcu_head_unqueue(list); | 2801 | rcu_cblist_dequeued_lazy(&rcl); |
2891 | if (__rcu_reclaim(rsp->name, list)) | 2802 | /* |
2892 | count_lazy++; | 2803 | * Stop only if limit reached and CPU has something to do. |
2893 | list = next; | 2804 | * Note: The rcl structure counts down from zero. |
2894 | /* Stop only if limit reached and CPU has something to do. */ | 2805 | */ |
2895 | if (++count >= bl && | 2806 | if (-rcl.len >= bl && |
2896 | (need_resched() || | 2807 | (need_resched() || |
2897 | (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) | 2808 | (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) |
2898 | break; | 2809 | break; |
2899 | } | 2810 | } |
2900 | 2811 | ||
2901 | local_irq_save(flags); | 2812 | local_irq_save(flags); |
2902 | trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), | 2813 | count = -rcl.len; |
2903 | is_idle_task(current), | 2814 | trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(), |
2904 | rcu_is_callbacks_kthread()); | 2815 | is_idle_task(current), rcu_is_callbacks_kthread()); |
2905 | 2816 | ||
2906 | /* Update count, and requeue any remaining callbacks. */ | 2817 | /* Update counts and requeue any remaining callbacks. */ |
2907 | if (list != NULL) { | 2818 | rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); |
2908 | *tail = rdp->nxtlist; | ||
2909 | rdp->nxtlist = list; | ||
2910 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
2911 | if (&rdp->nxtlist == rdp->nxttail[i]) | ||
2912 | rdp->nxttail[i] = tail; | ||
2913 | else | ||
2914 | break; | ||
2915 | } | ||
2916 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | 2819 | smp_mb(); /* List handling before counting for rcu_barrier(). */ |
2917 | rdp->qlen_lazy -= count_lazy; | ||
2918 | WRITE_ONCE(rdp->qlen, rdp->qlen - count); | ||
2919 | rdp->n_cbs_invoked += count; | 2820 | rdp->n_cbs_invoked += count; |
2821 | rcu_segcblist_insert_count(&rdp->cblist, &rcl); | ||
2920 | 2822 | ||
2921 | /* Reinstate batch limit if we have worked down the excess. */ | 2823 | /* Reinstate batch limit if we have worked down the excess. */ |
2922 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) | 2824 | count = rcu_segcblist_n_cbs(&rdp->cblist); |
2825 | if (rdp->blimit == LONG_MAX && count <= qlowmark) | ||
2923 | rdp->blimit = blimit; | 2826 | rdp->blimit = blimit; |
2924 | 2827 | ||
2925 | /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ | 2828 | /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ |
2926 | if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { | 2829 | if (count == 0 && rdp->qlen_last_fqs_check != 0) { |
2927 | rdp->qlen_last_fqs_check = 0; | 2830 | rdp->qlen_last_fqs_check = 0; |
2928 | rdp->n_force_qs_snap = rsp->n_force_qs; | 2831 | rdp->n_force_qs_snap = rsp->n_force_qs; |
2929 | } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) | 2832 | } else if (count < rdp->qlen_last_fqs_check - qhimark) |
2930 | rdp->qlen_last_fqs_check = rdp->qlen; | 2833 | rdp->qlen_last_fqs_check = count; |
2931 | WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); | 2834 | WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); |
2932 | 2835 | ||
2933 | local_irq_restore(flags); | 2836 | local_irq_restore(flags); |
2934 | 2837 | ||
2935 | /* Re-invoke RCU core processing if there are callbacks remaining. */ | 2838 | /* Re-invoke RCU core processing if there are callbacks remaining. */ |
2936 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 2839 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) |
2937 | invoke_rcu_core(); | 2840 | invoke_rcu_core(); |
2938 | } | 2841 | } |
2939 | 2842 | ||
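The reworked rcu_do_batch() drains a local rcu_cblist whose ->len deliberately counts down from zero: the real callback counts stay in ->cblist so that rcu_barrier() remains conservative, while the increasingly negative local length records how many callbacks have been invoked and is folded back in afterwards. A compact model of that idiom with toy types, callback invocation elided:

#include <stddef.h>

struct toy_cb { struct toy_cb *next; };

struct toy_cblist {
	struct toy_cb *head, **tail;
	long len;		/* counts DOWN from zero while draining */
};

struct toy_cb *toy_dequeue(struct toy_cblist *rcl)
{
	struct toy_cb *rhp = rcl->head;

	if (!rhp)
		return NULL;
	rcl->len--;		/* goes negative: -len == number dequeued */
	rcl->head = rhp->next;
	if (!rcl->head)
		rcl->tail = &rcl->head;
	return rhp;
}

/* Invocation loop shaped like the one above: stop early once "bl"
 * callbacks have been handled, and report how many that was. */
long toy_do_batch(struct toy_cblist *rcl, long bl)
{
	struct toy_cb *rhp;

	for (rhp = toy_dequeue(rcl); rhp; rhp = toy_dequeue(rcl)) {
		/* ...invoke the callback here... */
		if (-rcl->len >= bl)
			break;
	}
	return -rcl->len;
}

/* Fold the (non-positive) residual count back into the main list's
 * count, which is what rcu_segcblist_insert_count() does above. */
void toy_fold_count(long *main_len, struct toy_cblist *rcl)
{
	*main_len += rcl->len;
	rcl->len = 0;
}

With this convention, count = -rcl.len in the kernel code is the number of callbacks invoked this pass, and adding rcl.len back into the segmented list's count subtracts exactly that many.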
@@ -3099,7 +3002,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
3099 | bool needwake; | 3002 | bool needwake; |
3100 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); | 3003 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
3101 | 3004 | ||
3102 | WARN_ON_ONCE(rdp->beenonline == 0); | 3005 | WARN_ON_ONCE(!rdp->beenonline); |
3103 | 3006 | ||
3104 | /* Update RCU state based on any recent quiescent states. */ | 3007 | /* Update RCU state based on any recent quiescent states. */ |
3105 | rcu_check_quiescent_state(rsp, rdp); | 3008 | rcu_check_quiescent_state(rsp, rdp); |
@@ -3117,7 +3020,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
3117 | } | 3020 | } |
3118 | 3021 | ||
3119 | /* If there are callbacks ready, invoke them. */ | 3022 | /* If there are callbacks ready, invoke them. */ |
3120 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 3023 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) |
3121 | invoke_rcu_callbacks(rsp, rdp); | 3024 | invoke_rcu_callbacks(rsp, rdp); |
3122 | 3025 | ||
3123 | /* Do any needed deferred wakeups of rcuo kthreads. */ | 3026 | /* Do any needed deferred wakeups of rcuo kthreads. */ |
@@ -3189,7 +3092,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
3189 | * invoking force_quiescent_state() if the newly enqueued callback | 3092 | * invoking force_quiescent_state() if the newly enqueued callback |
3190 | * is the only one waiting for a grace period to complete. | 3093 | * is the only one waiting for a grace period to complete. |
3191 | */ | 3094 | */ |
3192 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | 3095 | if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > |
3096 | rdp->qlen_last_fqs_check + qhimark)) { | ||
3193 | 3097 | ||
3194 | /* Are we ignoring a completed grace period? */ | 3098 | /* Are we ignoring a completed grace period? */ |
3195 | note_gp_changes(rsp, rdp); | 3099 | note_gp_changes(rsp, rdp); |
@@ -3207,10 +3111,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
3207 | /* Give the grace period a kick. */ | 3111 | /* Give the grace period a kick. */ |
3208 | rdp->blimit = LONG_MAX; | 3112 | rdp->blimit = LONG_MAX; |
3209 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | 3113 | if (rsp->n_force_qs == rdp->n_force_qs_snap && |
3210 | *rdp->nxttail[RCU_DONE_TAIL] != head) | 3114 | rcu_segcblist_first_pend_cb(&rdp->cblist) != head) |
3211 | force_quiescent_state(rsp); | 3115 | force_quiescent_state(rsp); |
3212 | rdp->n_force_qs_snap = rsp->n_force_qs; | 3116 | rdp->n_force_qs_snap = rsp->n_force_qs; |
3213 | rdp->qlen_last_fqs_check = rdp->qlen; | 3117 | rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); |
3214 | } | 3118 | } |
3215 | } | 3119 | } |
3216 | } | 3120 | } |
@@ -3250,7 +3154,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, | |||
3250 | rdp = this_cpu_ptr(rsp->rda); | 3154 | rdp = this_cpu_ptr(rsp->rda); |
3251 | 3155 | ||
3252 | /* Add the callback to our list. */ | 3156 | /* Add the callback to our list. */ |
3253 | if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { | 3157 | if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { |
3254 | int offline; | 3158 | int offline; |
3255 | 3159 | ||
3256 | if (cpu != -1) | 3160 | if (cpu != -1) |
@@ -3269,23 +3173,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, | |||
3269 | */ | 3173 | */ |
3270 | BUG_ON(cpu != -1); | 3174 | BUG_ON(cpu != -1); |
3271 | WARN_ON_ONCE(!rcu_is_watching()); | 3175 | WARN_ON_ONCE(!rcu_is_watching()); |
3272 | if (!likely(rdp->nxtlist)) | 3176 | if (rcu_segcblist_empty(&rdp->cblist)) |
3273 | init_default_callback_list(rdp); | 3177 | rcu_segcblist_init(&rdp->cblist); |
3274 | } | 3178 | } |
3275 | WRITE_ONCE(rdp->qlen, rdp->qlen + 1); | 3179 | rcu_segcblist_enqueue(&rdp->cblist, head, lazy); |
3276 | if (lazy) | 3180 | if (!lazy) |
3277 | rdp->qlen_lazy++; | ||
3278 | else | ||
3279 | rcu_idle_count_callbacks_posted(); | 3181 | rcu_idle_count_callbacks_posted(); |
3280 | smp_mb(); /* Count before adding callback for rcu_barrier(). */ | ||
3281 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
3282 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
3283 | 3182 | ||
3284 | if (__is_kfree_rcu_offset((unsigned long)func)) | 3183 | if (__is_kfree_rcu_offset((unsigned long)func)) |
3285 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | 3184 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, |
3286 | rdp->qlen_lazy, rdp->qlen); | 3185 | rcu_segcblist_n_lazy_cbs(&rdp->cblist), |
3186 | rcu_segcblist_n_cbs(&rdp->cblist)); | ||
3287 | else | 3187 | else |
3288 | trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); | 3188 | trace_rcu_callback(rsp->name, head, |
3189 | rcu_segcblist_n_lazy_cbs(&rdp->cblist), | ||
3190 | rcu_segcblist_n_cbs(&rdp->cblist)); | ||
3289 | 3191 | ||
3290 | /* Go handle any RCU core processing required. */ | 3192 | /* Go handle any RCU core processing required. */ |
3291 | __call_rcu_core(rsp, rdp, head, flags); | 3193 | __call_rcu_core(rsp, rdp, head, flags); |
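__call_rcu() now funnels every new callback through rcu_segcblist_enqueue(), which appends to the NEXT segment and bumps the total and lazy counts; the count-before-enqueue ordering that used to be an explicit smp_mb() for rcu_barrier()'s benefit is presumably handled inside that helper now. A single-threaded, barrier-free sketch of the append with toy types (tails[] is assumed to start out pointing at &head):

#include <stdbool.h>
#include <stddef.h>

struct toy_cb { struct toy_cb *next; };

enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, NSEGS };

struct toy_segcblist {
	struct toy_cb *head;
	struct toy_cb **tails[NSEGS];	/* tails[i] ends segment i */
	long len, len_lazy;
};

void toy_enqueue(struct toy_segcblist *scl, struct toy_cb *rhp, bool lazy)
{
	scl->len++;			/* counts first, list second */
	if (lazy)
		scl->len_lazy++;
	rhp->next = NULL;
	*scl->tails[SEG_NEXT] = rhp;	/* new arrivals always go last */
	scl->tails[SEG_NEXT] = &rhp->next;
}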
@@ -3531,41 +3433,6 @@ void cond_synchronize_sched(unsigned long oldstate) | |||
3531 | } | 3433 | } |
3532 | EXPORT_SYMBOL_GPL(cond_synchronize_sched); | 3434 | EXPORT_SYMBOL_GPL(cond_synchronize_sched); |
3533 | 3435 | ||
3534 | /* Adjust sequence number for start of update-side operation. */ | ||
3535 | static void rcu_seq_start(unsigned long *sp) | ||
3536 | { | ||
3537 | WRITE_ONCE(*sp, *sp + 1); | ||
3538 | smp_mb(); /* Ensure update-side operation after counter increment. */ | ||
3539 | WARN_ON_ONCE(!(*sp & 0x1)); | ||
3540 | } | ||
3541 | |||
3542 | /* Adjust sequence number for end of update-side operation. */ | ||
3543 | static void rcu_seq_end(unsigned long *sp) | ||
3544 | { | ||
3545 | smp_mb(); /* Ensure update-side operation before counter increment. */ | ||
3546 | WRITE_ONCE(*sp, *sp + 1); | ||
3547 | WARN_ON_ONCE(*sp & 0x1); | ||
3548 | } | ||
3549 | |||
3550 | /* Take a snapshot of the update side's sequence number. */ | ||
3551 | static unsigned long rcu_seq_snap(unsigned long *sp) | ||
3552 | { | ||
3553 | unsigned long s; | ||
3554 | |||
3555 | s = (READ_ONCE(*sp) + 3) & ~0x1; | ||
3556 | smp_mb(); /* Above access must not bleed into critical section. */ | ||
3557 | return s; | ||
3558 | } | ||
3559 | |||
3560 | /* | ||
3561 | * Given a snapshot from rcu_seq_snap(), determine whether or not a | ||
3562 | * full update-side operation has occurred. | ||
3563 | */ | ||
3564 | static bool rcu_seq_done(unsigned long *sp, unsigned long s) | ||
3565 | { | ||
3566 | return ULONG_CMP_GE(READ_ONCE(*sp), s); | ||
3567 | } | ||
3568 | |||
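The rcu_seq_start/end/snap/done helpers removed here implement a sequence counter whose low-order bit means a grace period is in flight; generic versions shared with the new SRCU code take over elsewhere in this series. A self-contained sketch of the protocol, with the memory barriers dropped and plain >= standing in for ULONG_CMP_GE():

#include <stdbool.h>
#include <stdio.h>

static unsigned long gp_seq;	/* even: idle, odd: GP in progress */

static void seq_start(unsigned long *sp) { *sp += 1; }	/* now odd */
static void seq_end(unsigned long *sp)   { *sp += 1; }	/* now even */

/* Earliest counter value guaranteeing that a full grace period has
 * elapsed after this snapshot: skip any GP already in flight. */
static unsigned long seq_snap(unsigned long *sp)
{
	return (*sp + 3) & ~0x1UL;
}

static bool seq_done(unsigned long *sp, unsigned long s)
{
	return *sp >= s;
}

int main(void)
{
	unsigned long s = seq_snap(&gp_seq);	/* gp_seq == 0, so s == 2 */

	seq_start(&gp_seq);
	printf("done yet? %d\n", seq_done(&gp_seq, s));	/* 0 */
	seq_end(&gp_seq);
	printf("done now? %d\n", seq_done(&gp_seq, s));	/* 1 */
	return 0;
}

The counter portion of such a value is (s >> 1); the expedited-grace-period hunks further down use exactly that, rcu_seq_ctr(s) & 0x3, to pick one of four wait queues per rcu_node.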
3569 | /* | 3436 | /* |
3570 | * Check to see if there is any immediate RCU-related work to be done | 3437 | * Check to see if there is any immediate RCU-related work to be done |
3571 | * by the current CPU, for the specified type of RCU, returning 1 if so. | 3438 | * by the current CPU, for the specified type of RCU, returning 1 if so. |
@@ -3589,7 +3456,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
3589 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 3456 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
3590 | if (rcu_scheduler_fully_active && | 3457 | if (rcu_scheduler_fully_active && |
3591 | rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && | 3458 | rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && |
3592 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { | 3459 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { |
3593 | rdp->n_rp_core_needs_qs++; | 3460 | rdp->n_rp_core_needs_qs++; |
3594 | } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { | 3461 | } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { |
3595 | rdp->n_rp_report_qs++; | 3462 | rdp->n_rp_report_qs++; |
@@ -3597,7 +3464,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
3597 | } | 3464 | } |
3598 | 3465 | ||
3599 | /* Does this CPU have callbacks ready to invoke? */ | 3466 | /* Does this CPU have callbacks ready to invoke? */ |
3600 | if (cpu_has_callbacks_ready_to_invoke(rdp)) { | 3467 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) { |
3601 | rdp->n_rp_cb_ready++; | 3468 | rdp->n_rp_cb_ready++; |
3602 | return 1; | 3469 | return 1; |
3603 | } | 3470 | } |
@@ -3661,10 +3528,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) | |||
3661 | 3528 | ||
3662 | for_each_rcu_flavor(rsp) { | 3529 | for_each_rcu_flavor(rsp) { |
3663 | rdp = this_cpu_ptr(rsp->rda); | 3530 | rdp = this_cpu_ptr(rsp->rda); |
3664 | if (!rdp->nxtlist) | 3531 | if (rcu_segcblist_empty(&rdp->cblist)) |
3665 | continue; | 3532 | continue; |
3666 | hc = true; | 3533 | hc = true; |
3667 | if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { | 3534 | if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) { |
3668 | al = false; | 3535 | al = false; |
3669 | break; | 3536 | break; |
3670 | } | 3537 | } |
@@ -3773,7 +3640,7 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
3773 | __call_rcu(&rdp->barrier_head, | 3640 | __call_rcu(&rdp->barrier_head, |
3774 | rcu_barrier_callback, rsp, cpu, 0); | 3641 | rcu_barrier_callback, rsp, cpu, 0); |
3775 | } | 3642 | } |
3776 | } else if (READ_ONCE(rdp->qlen)) { | 3643 | } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { |
3777 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, | 3644 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, |
3778 | rsp->barrier_sequence); | 3645 | rsp->barrier_sequence); |
3779 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); | 3646 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); |
@@ -3882,8 +3749,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3882 | rdp->qlen_last_fqs_check = 0; | 3749 | rdp->qlen_last_fqs_check = 0; |
3883 | rdp->n_force_qs_snap = rsp->n_force_qs; | 3750 | rdp->n_force_qs_snap = rsp->n_force_qs; |
3884 | rdp->blimit = blimit; | 3751 | rdp->blimit = blimit; |
3885 | if (!rdp->nxtlist) | 3752 | if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ |
3886 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | 3753 | !init_nocb_callback_list(rdp)) |
3754 | rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ | ||
3887 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 3755 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
3888 | rcu_sysidle_init_percpu_data(rdp->dynticks); | 3756 | rcu_sysidle_init_percpu_data(rdp->dynticks); |
3889 | rcu_dynticks_eqs_online(); | 3757 | rcu_dynticks_eqs_online(); |
@@ -3902,12 +3770,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3902 | rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ | 3770 | rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ |
3903 | rdp->completed = rnp->completed; | 3771 | rdp->completed = rnp->completed; |
3904 | rdp->cpu_no_qs.b.norm = true; | 3772 | rdp->cpu_no_qs.b.norm = true; |
3905 | rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); | 3773 | rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); |
3906 | rdp->core_needs_qs = false; | 3774 | rdp->core_needs_qs = false; |
3907 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); | 3775 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); |
3908 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 3776 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
3909 | } | 3777 | } |
3910 | 3778 | ||
3779 | /* | ||
3780 | * Invoked early in the CPU-online process, when pretty much all | ||
3781 | * services are available. The incoming CPU is not present. | ||
3782 | */ | ||
3911 | int rcutree_prepare_cpu(unsigned int cpu) | 3783 | int rcutree_prepare_cpu(unsigned int cpu) |
3912 | { | 3784 | { |
3913 | struct rcu_state *rsp; | 3785 | struct rcu_state *rsp; |
@@ -3921,6 +3793,9 @@ int rcutree_prepare_cpu(unsigned int cpu) | |||
3921 | return 0; | 3793 | return 0; |
3922 | } | 3794 | } |
3923 | 3795 | ||
3796 | /* | ||
3797 | * Update RCU priority boot kthread affinity for CPU-hotplug changes. | ||
3798 | */ | ||
3924 | static void rcutree_affinity_setting(unsigned int cpu, int outgoing) | 3799 | static void rcutree_affinity_setting(unsigned int cpu, int outgoing) |
3925 | { | 3800 | { |
3926 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | 3801 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); |
@@ -3928,20 +3803,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) | |||
3928 | rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); | 3803 | rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); |
3929 | } | 3804 | } |
3930 | 3805 | ||
3806 | /* | ||
3807 | * Near the end of the CPU-online process. Pretty much all services | ||
3808 | * enabled, and the CPU is now very much alive. | ||
3809 | */ | ||
3931 | int rcutree_online_cpu(unsigned int cpu) | 3810 | int rcutree_online_cpu(unsigned int cpu) |
3932 | { | 3811 | { |
3933 | sync_sched_exp_online_cleanup(cpu); | 3812 | sync_sched_exp_online_cleanup(cpu); |
3934 | rcutree_affinity_setting(cpu, -1); | 3813 | rcutree_affinity_setting(cpu, -1); |
3814 | if (IS_ENABLED(CONFIG_TREE_SRCU)) | ||
3815 | srcu_online_cpu(cpu); | ||
3935 | return 0; | 3816 | return 0; |
3936 | } | 3817 | } |
3937 | 3818 | ||
3819 | /* | ||
3820 | * Near the beginning of the CPU-offline process. The CPU is still very ||
3821 | * much alive with pretty much all services enabled. ||
3822 | */ | ||
3938 | int rcutree_offline_cpu(unsigned int cpu) | 3823 | int rcutree_offline_cpu(unsigned int cpu) |
3939 | { | 3824 | { |
3940 | rcutree_affinity_setting(cpu, cpu); | 3825 | rcutree_affinity_setting(cpu, cpu); |
3826 | if (IS_ENABLED(CONFIG_TREE_SRCU)) | ||
3827 | srcu_offline_cpu(cpu); | ||
3941 | return 0; | 3828 | return 0; |
3942 | } | 3829 | } |
3943 | 3830 | ||
3944 | 3831 | /* | |
3832 | * Near the end of the offline process. We do only tracing here. | ||
3833 | */ | ||
3945 | int rcutree_dying_cpu(unsigned int cpu) | 3834 | int rcutree_dying_cpu(unsigned int cpu) |
3946 | { | 3835 | { |
3947 | struct rcu_state *rsp; | 3836 | struct rcu_state *rsp; |
@@ -3951,6 +3840,9 @@ int rcutree_dying_cpu(unsigned int cpu) | |||
3951 | return 0; | 3840 | return 0; |
3952 | } | 3841 | } |
3953 | 3842 | ||
3843 | /* | ||
3844 | * The outgoing CPU is gone and we are running elsewhere. | ||
3845 | */ | ||
3954 | int rcutree_dead_cpu(unsigned int cpu) | 3846 | int rcutree_dead_cpu(unsigned int cpu) |
3955 | { | 3847 | { |
3956 | struct rcu_state *rsp; | 3848 | struct rcu_state *rsp; |
@@ -3968,6 +3860,10 @@ int rcutree_dead_cpu(unsigned int cpu) | |||
3968 | * incoming CPUs are not allowed to use RCU read-side critical sections | 3860 | * incoming CPUs are not allowed to use RCU read-side critical sections |
3969 | * until this function is called. Failing to observe this restriction | 3861 | * until this function is called. Failing to observe this restriction |
3970 | * will result in lockdep splats. | 3862 | * will result in lockdep splats. |
3863 | * | ||
3864 | * Note that this function is special in that it is invoked directly | ||
3865 | * from the incoming CPU rather than from the cpuhp_step mechanism. | ||
3866 | * This is because this function must be invoked at a precise location. | ||
3971 | */ | 3867 | */ |
3972 | void rcu_cpu_starting(unsigned int cpu) | 3868 | void rcu_cpu_starting(unsigned int cpu) |
3973 | { | 3869 | { |
@@ -3993,9 +3889,6 @@ void rcu_cpu_starting(unsigned int cpu) | |||
3993 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() | 3889 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() |
3994 | * function. We now remove it from the rcu_node tree's ->qsmaskinit | 3890 | * function. We now remove it from the rcu_node tree's ->qsmaskinit |
3995 | * bit masks. | 3891 | * bit masks. |
3996 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() | ||
3997 | * function. We now remove it from the rcu_node tree's ->qsmaskinit | ||
3998 | * bit masks. | ||
3999 | */ | 3892 | */ |
4000 | static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | 3893 | static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) |
4001 | { | 3894 | { |
@@ -4011,6 +3904,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | |||
4011 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 3904 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
4012 | } | 3905 | } |
4013 | 3906 | ||
3907 | /* | ||
3908 | * The outgoing CPU has no further need of RCU, so remove it from ||
3909 | * the list of CPUs that RCU must track. | ||
3910 | * | ||
3911 | * Note that this function is special in that it is invoked directly | ||
3912 | * from the outgoing CPU rather than from the cpuhp_step mechanism. | ||
3913 | * This is because this function must be invoked at a precise location. | ||
3914 | */ | ||
4014 | void rcu_report_dead(unsigned int cpu) | 3915 | void rcu_report_dead(unsigned int cpu) |
4015 | { | 3916 | { |
4016 | struct rcu_state *rsp; | 3917 | struct rcu_state *rsp; |
@@ -4025,6 +3926,10 @@ void rcu_report_dead(unsigned int cpu) | |||
4025 | } | 3926 | } |
4026 | #endif | 3927 | #endif |
4027 | 3928 | ||
3929 | /* | ||
3930 | * On non-huge systems, use expedited RCU grace periods to make suspend | ||
3931 | * and hibernation run faster. | ||
3932 | */ | ||
4028 | static int rcu_pm_notify(struct notifier_block *self, | 3933 | static int rcu_pm_notify(struct notifier_block *self, |
4029 | unsigned long action, void *hcpu) | 3934 | unsigned long action, void *hcpu) |
4030 | { | 3935 | { |
@@ -4095,7 +4000,7 @@ early_initcall(rcu_spawn_gp_kthread); | |||
4095 | * task is booting the system, and such primitives are no-ops). After this | 4000 | * task is booting the system, and such primitives are no-ops). After this |
4096 | * function is called, any synchronous grace-period primitives are run as | 4001 | * function is called, any synchronous grace-period primitives are run as |
4097 | * expedited, with the requesting task driving the grace period forward. | 4002 | * expedited, with the requesting task driving the grace period forward. |
4098 | * A later core_initcall() rcu_exp_runtime_mode() will switch to full | 4003 | * A later core_initcall() rcu_set_runtime_mode() will switch to full |
4099 | * runtime RCU functionality. | 4004 | * runtime RCU functionality. |
4100 | */ | 4005 | */ |
4101 | void rcu_scheduler_starting(void) | 4006 | void rcu_scheduler_starting(void) |
@@ -4108,31 +4013,6 @@ void rcu_scheduler_starting(void) | |||
4108 | } | 4013 | } |
4109 | 4014 | ||
4110 | /* | 4015 | /* |
4111 | * Compute the per-level fanout, either using the exact fanout specified | ||
4112 | * or balancing the tree, depending on the rcu_fanout_exact boot parameter. | ||
4113 | */ | ||
4114 | static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) | ||
4115 | { | ||
4116 | int i; | ||
4117 | |||
4118 | if (rcu_fanout_exact) { | ||
4119 | levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; | ||
4120 | for (i = rcu_num_lvls - 2; i >= 0; i--) | ||
4121 | levelspread[i] = RCU_FANOUT; | ||
4122 | } else { | ||
4123 | int ccur; | ||
4124 | int cprv; | ||
4125 | |||
4126 | cprv = nr_cpu_ids; | ||
4127 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | ||
4128 | ccur = levelcnt[i]; | ||
4129 | levelspread[i] = (cprv + ccur - 1) / ccur; | ||
4130 | cprv = ccur; | ||
4131 | } | ||
4132 | } | ||
4133 | } | ||
4134 | |||
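rcu_init_levelspread() disappears from tree.c here, apparently in favor of a copy that the new Tree SRCU code can share; its balanced branch sizes each level of the rcu_node tree from the leaves upward. A standalone rendition with a worked example, assuming a hypothetical 96-CPU machine whose leaf level holds six rcu_node structures:

#include <stdio.h>

/* Balanced per-level fanout: each level's spread is the number of
 * children per node needed to cover the level below it. */
static void toy_levelspread(int *spread, const int *levelcnt,
			    int nlvls, int ncpus)
{
	int cprv = ncpus;

	for (int i = nlvls - 1; i >= 0; i--) {
		int ccur = levelcnt[i];

		spread[i] = (cprv + ccur - 1) / ccur;	/* ceiling division */
		cprv = ccur;
	}
}

int main(void)
{
	int levelcnt[2] = { 1, 6 };	/* one root, six leaf rcu_nodes */
	int spread[2];

	toy_levelspread(spread, levelcnt, 2, 96);
	printf("root fanout %d, leaf fanout %d\n", spread[0], spread[1]);
	/* prints: root fanout 6, leaf fanout 16 */
	return 0;
}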
4135 | /* | ||
4136 | * Helper function for rcu_init() that initializes one rcu_state structure. | 4016 | * Helper function for rcu_init() that initializes one rcu_state structure. |
4137 | */ | 4017 | */ |
4138 | static void __init rcu_init_one(struct rcu_state *rsp) | 4018 | static void __init rcu_init_one(struct rcu_state *rsp) |
@@ -4141,9 +4021,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
4141 | static const char * const fqs[] = RCU_FQS_NAME_INIT; | 4021 | static const char * const fqs[] = RCU_FQS_NAME_INIT; |
4142 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 4022 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
4143 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | 4023 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; |
4144 | static u8 fl_mask = 0x1; | ||
4145 | 4024 | ||
4146 | int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ | ||
4147 | int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ | 4025 | int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ |
4148 | int cpustride = 1; | 4026 | int cpustride = 1; |
4149 | int i; | 4027 | int i; |
@@ -4158,20 +4036,16 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
4158 | 4036 | ||
4159 | /* Initialize the level-tracking arrays. */ | 4037 | /* Initialize the level-tracking arrays. */ |
4160 | 4038 | ||
4161 | for (i = 0; i < rcu_num_lvls; i++) | ||
4162 | levelcnt[i] = num_rcu_lvl[i]; | ||
4163 | for (i = 1; i < rcu_num_lvls; i++) | 4039 | for (i = 1; i < rcu_num_lvls; i++) |
4164 | rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; | 4040 | rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1]; |
4165 | rcu_init_levelspread(levelspread, levelcnt); | 4041 | rcu_init_levelspread(levelspread, num_rcu_lvl); |
4166 | rsp->flavor_mask = fl_mask; | ||
4167 | fl_mask <<= 1; | ||
4168 | 4042 | ||
4169 | /* Initialize the elements themselves, starting from the leaves. */ | 4043 | /* Initialize the elements themselves, starting from the leaves. */ |
4170 | 4044 | ||
4171 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | 4045 | for (i = rcu_num_lvls - 1; i >= 0; i--) { |
4172 | cpustride *= levelspread[i]; | 4046 | cpustride *= levelspread[i]; |
4173 | rnp = rsp->level[i]; | 4047 | rnp = rsp->level[i]; |
4174 | for (j = 0; j < levelcnt[i]; j++, rnp++) { | 4048 | for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) { |
4175 | raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); | 4049 | raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); |
4176 | lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), | 4050 | lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), |
4177 | &rcu_node_class[i], buf[i]); | 4051 | &rcu_node_class[i], buf[i]); |
@@ -4344,6 +4218,8 @@ void __init rcu_init(void) | |||
4344 | for_each_online_cpu(cpu) { | 4218 | for_each_online_cpu(cpu) { |
4345 | rcutree_prepare_cpu(cpu); | 4219 | rcutree_prepare_cpu(cpu); |
4346 | rcu_cpu_starting(cpu); | 4220 | rcu_cpu_starting(cpu); |
4221 | if (IS_ENABLED(CONFIG_TREE_SRCU)) | ||
4222 | srcu_online_cpu(cpu); | ||
4347 | } | 4223 | } |
4348 | } | 4224 | } |
4349 | 4225 | ||
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ec62a05bfdb3..ba38262c3554 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -30,80 +30,9 @@ | |||
30 | #include <linux/seqlock.h> | 30 | #include <linux/seqlock.h> |
31 | #include <linux/swait.h> | 31 | #include <linux/swait.h> |
32 | #include <linux/stop_machine.h> | 32 | #include <linux/stop_machine.h> |
33 | #include <linux/rcu_node_tree.h> | ||
33 | 34 | ||
34 | /* | 35 | #include "rcu_segcblist.h" |
35 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and | ||
36 | * CONFIG_RCU_FANOUT_LEAF. | ||
37 | * In theory, it should be possible to add more levels straightforwardly. | ||
38 | * In practice, this did work well going from three levels to four. | ||
39 | * Of course, your mileage may vary. | ||
40 | */ | ||
41 | |||
42 | #ifdef CONFIG_RCU_FANOUT | ||
43 | #define RCU_FANOUT CONFIG_RCU_FANOUT | ||
44 | #else /* #ifdef CONFIG_RCU_FANOUT */ | ||
45 | # ifdef CONFIG_64BIT | ||
46 | # define RCU_FANOUT 64 | ||
47 | # else | ||
48 | # define RCU_FANOUT 32 | ||
49 | # endif | ||
50 | #endif /* #else #ifdef CONFIG_RCU_FANOUT */ | ||
51 | |||
52 | #ifdef CONFIG_RCU_FANOUT_LEAF | ||
53 | #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF | ||
54 | #else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ | ||
55 | # ifdef CONFIG_64BIT | ||
56 | # define RCU_FANOUT_LEAF 64 | ||
57 | # else | ||
58 | # define RCU_FANOUT_LEAF 32 | ||
59 | # endif | ||
60 | #endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ | ||
61 | |||
62 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) | ||
63 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) | ||
64 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) | ||
65 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) | ||
66 | |||
67 | #if NR_CPUS <= RCU_FANOUT_1 | ||
68 | # define RCU_NUM_LVLS 1 | ||
69 | # define NUM_RCU_LVL_0 1 | ||
70 | # define NUM_RCU_NODES NUM_RCU_LVL_0 | ||
71 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } | ||
72 | # define RCU_NODE_NAME_INIT { "rcu_node_0" } | ||
73 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } | ||
74 | #elif NR_CPUS <= RCU_FANOUT_2 | ||
75 | # define RCU_NUM_LVLS 2 | ||
76 | # define NUM_RCU_LVL_0 1 | ||
77 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
78 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) | ||
79 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } | ||
80 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } | ||
81 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } | ||
82 | #elif NR_CPUS <= RCU_FANOUT_3 | ||
83 | # define RCU_NUM_LVLS 3 | ||
84 | # define NUM_RCU_LVL_0 1 | ||
85 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | ||
86 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
87 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) | ||
88 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } | ||
89 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } | ||
90 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } | ||
91 | #elif NR_CPUS <= RCU_FANOUT_4 | ||
92 | # define RCU_NUM_LVLS 4 | ||
93 | # define NUM_RCU_LVL_0 1 | ||
94 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) | ||
95 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | ||
96 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
97 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) | ||
98 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } | ||
99 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } | ||
100 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } | ||
101 | #else | ||
102 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" | ||
103 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ | ||
104 | |||
105 | extern int rcu_num_lvls; | ||
106 | extern int rcu_num_nodes; | ||
107 | 36 | ||
108 | /* | 37 | /* |
109 | * Dynticks per-CPU state. | 38 | * Dynticks per-CPU state. |
@@ -113,6 +42,9 @@ struct rcu_dynticks { | |||
113 | /* Process level is worth LLONG_MAX/2. */ | 42 | /* Process level is worth LLONG_MAX/2. */ |
114 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 43 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
115 | atomic_t dynticks; /* Even value for idle, else odd. */ | 44 | atomic_t dynticks; /* Even value for idle, else odd. */ |
45 | bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ | ||
46 | unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ | ||
47 | bool rcu_urgent_qs; /* GP old need light quiescent state. */ | ||
116 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 48 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE |
117 | long long dynticks_idle_nesting; | 49 | long long dynticks_idle_nesting; |
118 | /* irq/process nesting level from idle. */ | 50 | /* irq/process nesting level from idle. */ |
@@ -262,41 +194,6 @@ struct rcu_node { | |||
262 | #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) | 194 | #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) |
263 | 195 | ||
264 | /* | 196 | /* |
265 | * Do a full breadth-first scan of the rcu_node structures for the | ||
266 | * specified rcu_state structure. | ||
267 | */ | ||
268 | #define rcu_for_each_node_breadth_first(rsp, rnp) \ | ||
269 | for ((rnp) = &(rsp)->node[0]; \ | ||
270 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
271 | |||
272 | /* | ||
273 | * Do a breadth-first scan of the non-leaf rcu_node structures for the | ||
274 | * specified rcu_state structure. Note that if there is a singleton | ||
275 | * rcu_node tree with but one rcu_node structure, this loop is a no-op. | ||
276 | */ | ||
277 | #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ | ||
278 | for ((rnp) = &(rsp)->node[0]; \ | ||
279 | (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) | ||
280 | |||
281 | /* | ||
282 | * Scan the leaves of the rcu_node hierarchy for the specified rcu_state | ||
283 | * structure. Note that if there is a singleton rcu_node tree with but | ||
284 | * one rcu_node structure, this loop -will- visit the rcu_node structure. | ||
285 | * It is still a leaf node, even if it is also the root node. | ||
286 | */ | ||
287 | #define rcu_for_each_leaf_node(rsp, rnp) \ | ||
288 | for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ | ||
289 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
290 | |||
291 | /* | ||
292 | * Iterate over all possible CPUs in a leaf RCU node. | ||
293 | */ | ||
294 | #define for_each_leaf_node_possible_cpu(rnp, cpu) \ | ||
295 | for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ | ||
296 | cpu <= rnp->grphi; \ | ||
297 | cpu = cpumask_next((cpu), cpu_possible_mask)) | ||
298 | |||
299 | /* | ||
300 | * Union to allow "aggregate OR" operation on the need for a quiescent | 197 | * Union to allow "aggregate OR" operation on the need for a quiescent |
301 | * state by the normal and expedited grace periods. | 198 | * state by the normal and expedited grace periods. |
302 | */ | 199 | */ |
@@ -336,34 +233,9 @@ struct rcu_data { | |||
336 | /* period it is aware of. */ | 233 | /* period it is aware of. */ |
337 | 234 | ||
338 | /* 2) batch handling */ | 235 | /* 2) batch handling */ |
339 | /* | 236 | struct rcu_segcblist cblist; /* Segmented callback list, with */ |
340 | * If nxtlist is not NULL, it is partitioned as follows. | 237 | /* different callbacks waiting for */ |
341 | * Any of the partitions might be empty, in which case the | 238 | /* different grace periods. */ |
342 | * pointer to that partition will be equal to the pointer for | ||
343 | * the following partition. When the list is empty, all of | ||
344 | * the nxttail elements point to the ->nxtlist pointer itself, | ||
345 | * which in that case is NULL. | ||
346 | * | ||
347 | * [nxtlist, *nxttail[RCU_DONE_TAIL]): | ||
348 | * Entries that batch # <= ->completed | ||
349 | * The grace period for these entries has completed, and | ||
350 | * the other grace-period-completed entries may be moved | ||
351 | * here temporarily in rcu_process_callbacks(). | ||
352 | * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): | ||
353 | * Entries that batch # <= ->completed - 1: waiting for current GP | ||
354 | * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): | ||
355 | * Entries known to have arrived before current GP ended | ||
356 | * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): | ||
357 | * Entries that might have arrived after current GP ended | ||
358 | * Note that the value of *nxttail[RCU_NEXT_TAIL] will | ||
359 | * always be NULL, as this is the end of the list. | ||
360 | */ | ||
361 | struct rcu_head *nxtlist; | ||
362 | struct rcu_head **nxttail[RCU_NEXT_SIZE]; | ||
363 | unsigned long nxtcompleted[RCU_NEXT_SIZE]; | ||
364 | /* grace periods for sublists. */ | ||
365 | long qlen_lazy; /* # of lazy queued callbacks */ | ||
366 | long qlen; /* # of queued callbacks, incl lazy */ | ||
367 | long qlen_last_fqs_check; | 239 | long qlen_last_fqs_check; |
368 | /* qlen at last check for QS forcing */ | 240 | /* qlen at last check for QS forcing */ |
369 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | 241 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ |
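The long ->nxtlist/->nxttail[] comment deleted just above is, in effect, the specification of the new struct rcu_segcblist: one chain of callbacks cut into DONE/WAIT/NEXT_READY/NEXT segments by tail pointers, where an empty segment shares its tail with its predecessor and a completely empty list has every tail pointing at the head pointer itself. A toy rendition of that invariant and of the two emptiness tests the tree.c changes above rely on (illustrative names, not the kernel API):

#include <stdbool.h>
#include <stddef.h>

struct toy_cb { struct toy_cb *next; };

enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, NSEGS };

struct toy_segcblist {
	struct toy_cb *head;
	struct toy_cb **tails[NSEGS];	/* tails[i] ends segment i */
};

void toy_init(struct toy_segcblist *scl)
{
	scl->head = NULL;
	for (int i = 0; i < NSEGS; i++)
		scl->tails[i] = &scl->head;	/* all segments empty */
}

/* Any callbacks not yet ready to invoke?  This is the question
 * rcu_segcblist_pend_cbs() answers for the real structure. */
bool toy_pend_cbs(struct toy_segcblist *scl)
{
	return *scl->tails[SEG_DONE] != NULL;
}

/* Any callbacks ready to invoke right now?  The DONE segment is
 * non-empty exactly when its tail has moved past the head pointer. */
bool toy_ready_cbs(struct toy_segcblist *scl)
{
	return scl->tails[SEG_DONE] != &scl->head;
}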
@@ -482,7 +354,6 @@ struct rcu_state { | |||
482 | struct rcu_node *level[RCU_NUM_LVLS + 1]; | 354 | struct rcu_node *level[RCU_NUM_LVLS + 1]; |
483 | /* Hierarchy levels (+1 to */ | 355 | /* Hierarchy levels (+1 to */ |
484 | /* shut bogus gcc warning) */ | 356 | /* shut bogus gcc warning) */ |
485 | u8 flavor_mask; /* bit in flavor mask. */ | ||
486 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ | 357 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ |
487 | call_rcu_func_t call; /* call_rcu() flavor. */ | 358 | call_rcu_func_t call; /* call_rcu() flavor. */ |
488 | int ncpus; /* # CPUs seen so far. */ | 359 | int ncpus; /* # CPUs seen so far. */ |
@@ -502,14 +373,11 @@ struct rcu_state { | |||
502 | 373 | ||
503 | raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; | 374 | raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; |
504 | /* Protect following fields. */ | 375 | /* Protect following fields. */ |
505 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | 376 | struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ |
506 | /* need a grace period. */ | 377 | /* need a grace period. */ |
507 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | 378 | struct rcu_cblist orphan_done; /* Orphaned callbacks that */ |
508 | struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ | ||
509 | /* are ready to invoke. */ | 379 | /* are ready to invoke. */ |
510 | struct rcu_head **orphan_donetail; /* Tail of above. */ | 380 | /* (Contains counts.) */ |
511 | long qlen_lazy; /* Number of lazy callbacks. */ | ||
512 | long qlen; /* Total number of callbacks. */ | ||
513 | /* End of fields guarded by orphan_lock. */ | 381 | /* End of fields guarded by orphan_lock. */ |
514 | 382 | ||
515 | struct mutex barrier_mutex; /* Guards barrier fields. */ | 383 | struct mutex barrier_mutex; /* Guards barrier fields. */ |
@@ -596,6 +464,7 @@ extern struct rcu_state rcu_preempt_state; | |||
596 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | 464 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ |
597 | 465 | ||
598 | int rcu_dynticks_snap(struct rcu_dynticks *rdtp); | 466 | int rcu_dynticks_snap(struct rcu_dynticks *rdtp); |
467 | bool rcu_eqs_special_set(int cpu); | ||
599 | 468 | ||
600 | #ifdef CONFIG_RCU_BOOST | 469 | #ifdef CONFIG_RCU_BOOST |
601 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | 470 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); |
@@ -673,6 +542,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp); | |||
673 | static void rcu_dynticks_task_enter(void); | 542 | static void rcu_dynticks_task_enter(void); |
674 | static void rcu_dynticks_task_exit(void); | 543 | static void rcu_dynticks_task_exit(void); |
675 | 544 | ||
545 | #ifdef CONFIG_SRCU | ||
546 | void srcu_online_cpu(unsigned int cpu); | ||
547 | void srcu_offline_cpu(unsigned int cpu); | ||
548 | #else /* #ifdef CONFIG_SRCU */ | ||
549 | void srcu_online_cpu(unsigned int cpu) { } | ||
550 | void srcu_offline_cpu(unsigned int cpu) { } | ||
551 | #endif /* #else #ifdef CONFIG_SRCU */ | ||
552 | |||
676 | #endif /* #ifndef RCU_TREE_NONCORE */ | 553 | #endif /* #ifndef RCU_TREE_NONCORE */ |
677 | 554 | ||
678 | #ifdef CONFIG_RCU_TRACE | 555 | #ifdef CONFIG_RCU_TRACE |
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index a7b639ccd46e..e513b4ab1197 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h | |||
@@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | |||
292 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, | 292 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, |
293 | rnp->grplo, rnp->grphi, | 293 | rnp->grplo, rnp->grphi, |
294 | TPS("wait")); | 294 | TPS("wait")); |
295 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | 295 | wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], |
296 | sync_exp_work_done(rsp, | 296 | sync_exp_work_done(rsp, |
297 | &rdp->exp_workdone2, s)); | 297 | &rdp->exp_workdone2, s)); |
298 | return true; | 298 | return true; |
@@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data) | |||
331 | return; | 331 | return; |
332 | } | 332 | } |
333 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); | 333 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); |
334 | /* Store .exp before .rcu_urgent_qs. */ | ||
335 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); | ||
334 | resched_cpu(smp_processor_id()); | 336 | resched_cpu(smp_processor_id()); |
335 | } | 337 | } |
336 | 338 | ||
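The new smp_store_release() orders the store to .exp before the store to .rcu_urgent_qs, so that any CPU observing the urgent flag is guaranteed to also observe the expedited request it announces. A C11-atomics model of that pairing; the acquire-side reader here is purely illustrative (hypothetical names), the real consumers being the quiescent-state reporting paths that sample rcu_urgent_qs:

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic bool cpu_no_qs_exp;	/* "an expedited QS is wanted" */
static _Atomic bool urgent_qs;		/* "please report a QS soon" */

/* Writer, shaped like the IPI handler above: publish the expedited
 * request, then raise the urgent flag with release semantics. */
void toy_request_exp_qs(void)
{
	atomic_store_explicit(&cpu_no_qs_exp, true, memory_order_relaxed);
	atomic_store_explicit(&urgent_qs, true, memory_order_release);
}

/* Reader: an acquire load of the urgent flag makes the earlier store
 * to cpu_no_qs_exp visible before the second load. */
bool toy_sees_exp_request(void)
{
	if (!atomic_load_explicit(&urgent_qs, memory_order_acquire))
		return false;
	return atomic_load_explicit(&cpu_no_qs_exp, memory_order_relaxed);
}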
@@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | |||
531 | rnp->exp_seq_rq = s; | 533 | rnp->exp_seq_rq = s; |
532 | spin_unlock(&rnp->exp_lock); | 534 | spin_unlock(&rnp->exp_lock); |
533 | } | 535 | } |
534 | wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); | 536 | smp_mb(); /* All above changes before wakeup. */ |
537 | wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]); | ||
535 | } | 538 | } |
536 | trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); | 539 | trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); |
537 | mutex_unlock(&rsp->exp_wake_mutex); | 540 | mutex_unlock(&rsp->exp_wake_mutex); |
@@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, | |||
609 | /* Wait for expedited grace period to complete. */ | 612 | /* Wait for expedited grace period to complete. */ |
610 | rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); | 613 | rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); |
611 | rnp = rcu_get_root(rsp); | 614 | rnp = rcu_get_root(rsp); |
612 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | 615 | wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], |
613 | sync_exp_work_done(rsp, | 616 | sync_exp_work_done(rsp, &rdp->exp_workdone0, s)); |
614 | &rdp->exp_workdone0, s)); | 617 | smp_mb(); /* Workqueue actions happen before return. */ |
615 | 618 | ||
616 | /* Let the next expedited grace period start. */ | 619 | /* Let the next expedited grace period start. */ |
617 | mutex_unlock(&rsp->exp_mutex); | 620 | mutex_unlock(&rsp->exp_mutex); |
@@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void) | |||
735 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 738 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
736 | 739 | ||
737 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | 740 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ |
738 | |||
739 | /* | ||
740 | * Switch to run-time mode once Tree RCU has fully initialized. | ||
741 | */ | ||
742 | static int __init rcu_exp_runtime_mode(void) | ||
743 | { | ||
744 | rcu_test_sync_prims(); | ||
745 | rcu_scheduler_active = RCU_SCHEDULER_RUNNING; | ||
746 | rcu_test_sync_prims(); | ||
747 | return 0; | ||
748 | } | ||
749 | core_initcall(rcu_exp_runtime_mode); | ||
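The tree_exp.h hunks replace the open-coded (s >> 1) & 0x3 with rcu_seq_ctr(s) & 0x3: the expedited sequence number keeps a grace-period-in-progress flag in its low bit and the counter above it, and the counter taken modulo four selects one of the four per-node wait queues. A small standalone sketch of that encoding (helper names follow kernel/rcu/rcu.h of this series; treat the one-bit state field as an assumption):

#include <stdio.h>

/* Sequence-number helpers modeled on kernel/rcu/rcu.h from this series. */
#define RCU_SEQ_CTR_SHIFT	1	/* low bit flags a grace period in progress */
#define RCU_SEQ_STATE_MASK	((1UL << RCU_SEQ_CTR_SHIFT) - 1)

static unsigned long rcu_seq_ctr(unsigned long s)   { return s >> RCU_SEQ_CTR_SHIFT; }
static unsigned long rcu_seq_state(unsigned long s) { return s & RCU_SEQ_STATE_MASK; }

int main(void)
{
	unsigned long s;

	for (s = 0; s < 8; s++)
		printf("seq=%lu ctr=%lu state=%lu exp_wq=%lu\n",
		       s, rcu_seq_ctr(s), rcu_seq_state(s),
		       rcu_seq_ctr(s) & 0x3);	/* one of four wait queues */
	return 0;
}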
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a62a8f1caac..c9a48657512a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -1350,10 +1350,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) | |||
1350 | */ | 1350 | */ |
1351 | if ((rdp->completed != rnp->completed || | 1351 | if ((rdp->completed != rnp->completed || |
1352 | unlikely(READ_ONCE(rdp->gpwrap))) && | 1352 | unlikely(READ_ONCE(rdp->gpwrap))) && |
1353 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) | 1353 | rcu_segcblist_pend_cbs(&rdp->cblist)) |
1354 | note_gp_changes(rsp, rdp); | 1354 | note_gp_changes(rsp, rdp); |
1355 | 1355 | ||
1356 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 1356 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) |
1357 | cbs_ready = true; | 1357 | cbs_ready = true; |
1358 | } | 1358 | } |
1359 | return cbs_ready; | 1359 | return cbs_ready; |
@@ -1461,7 +1461,7 @@ static void rcu_prepare_for_idle(void) | |||
1461 | rdtp->last_accelerate = jiffies; | 1461 | rdtp->last_accelerate = jiffies; |
1462 | for_each_rcu_flavor(rsp) { | 1462 | for_each_rcu_flavor(rsp) { |
1463 | rdp = this_cpu_ptr(rsp->rda); | 1463 | rdp = this_cpu_ptr(rsp->rda); |
1464 | if (!*rdp->nxttail[RCU_DONE_TAIL]) | 1464 | if (rcu_segcblist_pend_cbs(&rdp->cblist)) |
1465 | continue; | 1465 | continue; |
1466 | rnp = rdp->mynode; | 1466 | rnp = rdp->mynode; |
1467 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ | 1467 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ |
@@ -1529,7 +1529,7 @@ static void rcu_oom_notify_cpu(void *unused) | |||
1529 | 1529 | ||
1530 | for_each_rcu_flavor(rsp) { | 1530 | for_each_rcu_flavor(rsp) { |
1531 | rdp = raw_cpu_ptr(rsp->rda); | 1531 | rdp = raw_cpu_ptr(rsp->rda); |
1532 | if (rdp->qlen_lazy != 0) { | 1532 | if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) { |
1533 | atomic_inc(&oom_callback_count); | 1533 | atomic_inc(&oom_callback_count); |
1534 | rsp->call(&rdp->oom_head, rcu_oom_callback); | 1534 | rsp->call(&rdp->oom_head, rcu_oom_callback); |
1535 | } | 1535 | } |
@@ -1709,7 +1709,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup); | |||
1709 | 1709 | ||
1710 | static int __init parse_rcu_nocb_poll(char *arg) | 1710 | static int __init parse_rcu_nocb_poll(char *arg) |
1711 | { | 1711 | { |
1712 | rcu_nocb_poll = 1; | 1712 | rcu_nocb_poll = true; |
1713 | return 0; | 1713 | return 0; |
1714 | } | 1714 | } |
1715 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | 1715 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); |
@@ -1860,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
1860 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1860 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
1861 | TPS("WakeEmpty")); | 1861 | TPS("WakeEmpty")); |
1862 | } else { | 1862 | } else { |
1863 | rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; | 1863 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); |
1864 | /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ | ||
1865 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); | ||
1864 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1866 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
1865 | TPS("WakeEmptyIsDeferred")); | 1867 | TPS("WakeEmptyIsDeferred")); |
1866 | } | 1868 | } |
@@ -1872,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
1872 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1874 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
1873 | TPS("WakeOvf")); | 1875 | TPS("WakeOvf")); |
1874 | } else { | 1876 | } else { |
1875 | rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; | 1877 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); |
1878 | /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ | ||
1879 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); | ||
1876 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1880 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
1877 | TPS("WakeOvfIsDeferred")); | 1881 | TPS("WakeOvfIsDeferred")); |
1878 | } | 1882 | } |
@@ -1930,30 +1934,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
1930 | struct rcu_data *rdp, | 1934 | struct rcu_data *rdp, |
1931 | unsigned long flags) | 1935 | unsigned long flags) |
1932 | { | 1936 | { |
1933 | long ql = rsp->qlen; | 1937 | long ql = rsp->orphan_done.len; |
1934 | long qll = rsp->qlen_lazy; | 1938 | long qll = rsp->orphan_done.len_lazy; |
1935 | 1939 | ||
1936 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ | 1940 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ |
1937 | if (!rcu_is_nocb_cpu(smp_processor_id())) | 1941 | if (!rcu_is_nocb_cpu(smp_processor_id())) |
1938 | return false; | 1942 | return false; |
1939 | rsp->qlen = 0; | ||
1940 | rsp->qlen_lazy = 0; | ||
1941 | 1943 | ||
1942 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ | 1944 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ |
1943 | if (rsp->orphan_donelist != NULL) { | 1945 | if (rsp->orphan_done.head) { |
1944 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, | 1946 | __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), |
1945 | rsp->orphan_donetail, ql, qll, flags); | 1947 | rcu_cblist_tail(&rsp->orphan_done), |
1946 | ql = qll = 0; | 1948 | ql, qll, flags); |
1947 | rsp->orphan_donelist = NULL; | ||
1948 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
1949 | } | 1949 | } |
1950 | if (rsp->orphan_nxtlist != NULL) { | 1950 | if (rsp->orphan_pend.head) { |
1951 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, | 1951 | __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), |
1952 | rsp->orphan_nxttail, ql, qll, flags); | 1952 | rcu_cblist_tail(&rsp->orphan_pend), |
1953 | ql = qll = 0; | 1953 | ql, qll, flags); |
1954 | rsp->orphan_nxtlist = NULL; | ||
1955 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
1956 | } | 1954 | } |
1955 | rcu_cblist_init(&rsp->orphan_done); | ||
1956 | rcu_cblist_init(&rsp->orphan_pend); | ||
1957 | return true; | 1957 | return true; |
1958 | } | 1958 | } |
1959 | 1959 | ||
@@ -2395,16 +2395,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) | |||
2395 | return false; | 2395 | return false; |
2396 | 2396 | ||
2397 | /* If there are early-boot callbacks, move them to nocb lists. */ | 2397 | /* If there are early-boot callbacks, move them to nocb lists. */ |
2398 | if (rdp->nxtlist) { | 2398 | if (!rcu_segcblist_empty(&rdp->cblist)) { |
2399 | rdp->nocb_head = rdp->nxtlist; | 2399 | rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); |
2400 | rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; | 2400 | rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); |
2401 | atomic_long_set(&rdp->nocb_q_count, rdp->qlen); | 2401 | atomic_long_set(&rdp->nocb_q_count, |
2402 | atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); | 2402 | rcu_segcblist_n_cbs(&rdp->cblist)); |
2403 | rdp->nxtlist = NULL; | 2403 | atomic_long_set(&rdp->nocb_q_count_lazy, |
2404 | rdp->qlen = 0; | 2404 | rcu_segcblist_n_lazy_cbs(&rdp->cblist)); |
2405 | rdp->qlen_lazy = 0; | 2405 | rcu_segcblist_init(&rdp->cblist); |
2406 | } | 2406 | } |
2407 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 2407 | rcu_segcblist_disable(&rdp->cblist); |
2408 | return true; | 2408 | return true; |
2409 | } | 2409 | } |
2410 | 2410 | ||
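The tree_plugin.h hunks swap the open-coded rdp->nxtlist / rdp->nxttail[] tests for rcu_segcblist accessors such as rcu_segcblist_empty(), rcu_segcblist_pend_cbs() and rcu_segcblist_ready_cbs(). The structure behind them is a singly linked callback list with one tail pointer per segment (DONE, WAIT, NEXT_READY, NEXT). Below is a heavily simplified userspace sketch of that layout and two of the predicates; the names mirror the kernel's, the bodies are illustrative only:

#include <stddef.h>
#include <stdbool.h>

struct cb { struct cb *next; };			/* stand-in for struct rcu_head */

enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, SEG_CNT };

/* Segmented callback list: one head, one tail pointer per segment. */
struct segcblist {
	struct cb *head;
	struct cb **tails[SEG_CNT];
	long len;
};

static void segcblist_init(struct segcblist *sl)
{
	int i;

	sl->head = NULL;
	for (i = 0; i < SEG_CNT; i++)
		sl->tails[i] = &sl->head;
	sl->len = 0;
}

/* Callbacks whose grace period has ended sit before tails[SEG_DONE]. */
static bool segcblist_ready_cbs(struct segcblist *sl)
{
	return sl->head && &sl->head != sl->tails[SEG_DONE];
}

/* Anything past the DONE segment is still waiting for a grace period. */
static bool segcblist_pend_cbs(struct segcblist *sl)
{
	return *sl->tails[SEG_DONE] != NULL;
}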
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 8751a748499a..6cea17a1ea30 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
@@ -41,11 +41,11 @@ | |||
41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
42 | #include <linux/debugfs.h> | 42 | #include <linux/debugfs.h> |
43 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
44 | #include <linux/prefetch.h> | ||
44 | 45 | ||
45 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
46 | #include "tree.h" | 47 | #include "tree.h" |
47 | 48 | #include "rcu.h" | |
48 | DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); | ||
49 | 49 | ||
50 | static int r_open(struct inode *inode, struct file *file, | 50 | static int r_open(struct inode *inode, struct file *file, |
51 | const struct seq_operations *op) | 51 | const struct seq_operations *op) |
@@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
121 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 121 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
122 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), | 122 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), |
123 | rdp->cpu_no_qs.b.norm, | 123 | rdp->cpu_no_qs.b.norm, |
124 | rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), | 124 | rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu), |
125 | rdp->core_needs_qs); | 125 | rdp->core_needs_qs); |
126 | seq_printf(m, " dt=%d/%llx/%d df=%lu", | 126 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
127 | rcu_dynticks_snap(rdp->dynticks), | 127 | rcu_dynticks_snap(rdp->dynticks), |
@@ -130,17 +130,15 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
130 | rdp->dynticks_fqs); | 130 | rdp->dynticks_fqs); |
131 | seq_printf(m, " of=%lu", rdp->offline_fqs); | 131 | seq_printf(m, " of=%lu", rdp->offline_fqs); |
132 | rcu_nocb_q_lengths(rdp, &ql, &qll); | 132 | rcu_nocb_q_lengths(rdp, &ql, &qll); |
133 | qll += rdp->qlen_lazy; | 133 | qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist); |
134 | ql += rdp->qlen; | 134 | ql += rcu_segcblist_n_cbs(&rdp->cblist); |
135 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", | 135 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", |
136 | qll, ql, | 136 | qll, ql, |
137 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | 137 | ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)], |
138 | rdp->nxttail[RCU_NEXT_TAIL]], | 138 | ".R"[!rcu_segcblist_segempty(&rdp->cblist, |
139 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | 139 | RCU_NEXT_READY_TAIL)], |
140 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | 140 | ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)], |
141 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | 141 | ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]); |
142 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
143 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
144 | #ifdef CONFIG_RCU_BOOST | 142 | #ifdef CONFIG_RCU_BOOST |
145 | seq_printf(m, " kt=%d/%c ktl=%x", | 143 | seq_printf(m, " kt=%d/%c ktl=%x", |
146 | per_cpu(rcu_cpu_has_work, rdp->cpu), | 144 | per_cpu(rcu_cpu_has_work, rdp->cpu), |
@@ -278,7 +276,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
278 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", | 276 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
279 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 277 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
280 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
281 | READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); | 279 | READ_ONCE(rsp->n_force_qs_lh), |
280 | rsp->orphan_done.len_lazy, | ||
281 | rsp->orphan_done.len); | ||
282 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { | 282 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { |
283 | if (rnp->level != level) { | 283 | if (rnp->level != level) { |
284 | seq_puts(m, "\n"); | 284 | seq_puts(m, "\n"); |
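The per-CPU debugfs line above relies on the compact ".N"[!rcu_segcblist_segempty(...)] idiom: indexing a two-character string literal with a 0/1 condition prints '.' when the segment is empty and the letter when it is not. A tiny illustration (the flags and letters here are placeholders, not the tracer's real fields):

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	bool next_ready = true, wait = false;

	/* "xY"[flag] yields 'x' for 0 and 'Y' for 1 -- same trick as ".N"[...] */
	printf("qs=%c%c\n", ".N"[next_ready], ".W"[wait]);	/* prints "qs=N." */
	return 0;
}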
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 55c8530316c7..273e869ca21d 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); | |||
124 | * non-expedited counterparts? Intended for use within RCU. Note | 124 | * non-expedited counterparts? Intended for use within RCU. Note |
125 | * that if the user specifies both rcu_expedited and rcu_normal, then | 125 | * that if the user specifies both rcu_expedited and rcu_normal, then |
126 | * rcu_normal wins. (Except during the time period during boot from | 126 | * rcu_normal wins. (Except during the time period during boot from |
127 | * when the first task is spawned until the rcu_exp_runtime_mode() | 127 | * when the first task is spawned until the rcu_set_runtime_mode() |
128 | * core_initcall() is invoked, at which point everything is expedited.) | 128 | * core_initcall() is invoked, at which point everything is expedited.) |
129 | */ | 129 | */ |
130 | bool rcu_gp_is_normal(void) | 130 | bool rcu_gp_is_normal(void) |
@@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void) | |||
190 | 190 | ||
191 | #endif /* #ifndef CONFIG_TINY_RCU */ | 191 | #endif /* #ifndef CONFIG_TINY_RCU */ |
192 | 192 | ||
193 | /* | ||
194 | * Test each non-SRCU synchronous grace-period wait API. This is | ||
195 | * useful just after a change in mode for these primitives, and | ||
196 | * during early boot. | ||
197 | */ | ||
198 | void rcu_test_sync_prims(void) | ||
199 | { | ||
200 | if (!IS_ENABLED(CONFIG_PROVE_RCU)) | ||
201 | return; | ||
202 | synchronize_rcu(); | ||
203 | synchronize_rcu_bh(); | ||
204 | synchronize_sched(); | ||
205 | synchronize_rcu_expedited(); | ||
206 | synchronize_rcu_bh_expedited(); | ||
207 | synchronize_sched_expedited(); | ||
208 | } | ||
209 | |||
210 | #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) | ||
211 | |||
212 | /* | ||
213 | * Switch to run-time mode once RCU has fully initialized. | ||
214 | */ | ||
215 | static int __init rcu_set_runtime_mode(void) | ||
216 | { | ||
217 | rcu_test_sync_prims(); | ||
218 | rcu_scheduler_active = RCU_SCHEDULER_RUNNING; | ||
219 | rcu_test_sync_prims(); | ||
220 | return 0; | ||
221 | } | ||
222 | core_initcall(rcu_set_runtime_mode); | ||
223 | |||
224 | #endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ | ||
225 | |||
193 | #ifdef CONFIG_PREEMPT_RCU | 226 | #ifdef CONFIG_PREEMPT_RCU |
194 | 227 | ||
195 | /* | 228 | /* |
@@ -632,6 +665,7 @@ static void check_holdout_task(struct task_struct *t, | |||
632 | put_task_struct(t); | 665 | put_task_struct(t); |
633 | return; | 666 | return; |
634 | } | 667 | } |
668 | rcu_request_urgent_qs_task(t); | ||
635 | if (!needreport) | 669 | if (!needreport) |
636 | return; | 670 | return; |
637 | if (*firstreport) { | 671 | if (*firstreport) { |
@@ -817,23 +851,6 @@ static void rcu_spawn_tasks_kthread(void) | |||
817 | 851 | ||
818 | #endif /* #ifdef CONFIG_TASKS_RCU */ | 852 | #endif /* #ifdef CONFIG_TASKS_RCU */ |
819 | 853 | ||
820 | /* | ||
821 | * Test each non-SRCU synchronous grace-period wait API. This is | ||
822 | * useful just after a change in mode for these primitives, and | ||
823 | * during early boot. | ||
824 | */ | ||
825 | void rcu_test_sync_prims(void) | ||
826 | { | ||
827 | if (!IS_ENABLED(CONFIG_PROVE_RCU)) | ||
828 | return; | ||
829 | synchronize_rcu(); | ||
830 | synchronize_rcu_bh(); | ||
831 | synchronize_sched(); | ||
832 | synchronize_rcu_expedited(); | ||
833 | synchronize_rcu_bh_expedited(); | ||
834 | synchronize_sched_expedited(); | ||
835 | } | ||
836 | |||
837 | #ifdef CONFIG_PROVE_RCU | 854 | #ifdef CONFIG_PROVE_RCU |
838 | 855 | ||
839 | /* | 856 | /* |
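The update.c hunk relocates rcu_test_sync_prims() and turns the old rcu_exp_runtime_mode() initcall into rcu_set_runtime_mode(), built whenever Tree RCU or SRCU is present. The shape of the pattern: exercise every synchronous grace-period primitive, flip rcu_scheduler_active to RCU_SCHEDULER_RUNNING, then exercise them again so both the early-boot and run-time paths get covered. A hedged userspace sketch of that "test, switch, test again" structure (the two sync functions are placeholders for the real primitives):

#include <stdio.h>
#include <stdbool.h>

static bool prove_debug = true;	/* stand-in for IS_ENABLED(CONFIG_PROVE_RCU) */
static int scheduler_mode;	/* 0 = early boot, 1 = running */

static void synchronize_a(void) { printf("  sync A (mode %d)\n", scheduler_mode); }
static void synchronize_b(void) { printf("  sync B (mode %d)\n", scheduler_mode); }

/* Exercise each synchronous wait primitive; a no-op unless debugging. */
static void test_sync_prims(void)
{
	if (!prove_debug)
		return;
	synchronize_a();
	synchronize_b();
}

/* Mirrors the shape of rcu_set_runtime_mode(): test, switch mode, test again. */
static int set_runtime_mode(void)
{
	test_sync_prims();	/* early-boot (fast-path) implementations */
	scheduler_mode = 1;
	test_sync_prims();	/* full run-time implementations */
	return 0;
}

int main(void)
{
	puts("switching to run-time mode:");
	return set_runtime_mode();
}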
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c51147a1204c..759f4bd52cd6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -3382,7 +3382,7 @@ static void __sched notrace __schedule(bool preempt) | |||
3382 | hrtick_clear(rq); | 3382 | hrtick_clear(rq); |
3383 | 3383 | ||
3384 | local_irq_disable(); | 3384 | local_irq_disable(); |
3385 | rcu_note_context_switch(); | 3385 | rcu_note_context_switch(preempt); |
3386 | 3386 | ||
3387 | /* | 3387 | /* |
3388 | * Make sure that signal_pending_state()->signal_pending() below | 3388 | * Make sure that signal_pending_state()->signal_pending() below |
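__schedule() now passes its preempt flag to rcu_note_context_switch(), which lets RCU count only non-preemptive (voluntary) context switches as Tasks RCU quiescent states; a task preempted in the middle of a code region has not reached a safe point. A minimal illustrative sketch of that distinction (the function body is schematic, not the kernel's):

#include <stdio.h>
#include <stdbool.h>

/* Voluntary schedule() implies the task reached a safe point;
 * involuntary preemption does not.
 */
static void note_context_switch(bool preempt)
{
	if (!preempt)
		printf("voluntary switch: Tasks RCU quiescent state\n");
	else
		printf("preemption: no Tasks RCU quiescent state\n");
}

int main(void)
{
	note_context_switch(false);	/* schedule()/cond_resched() style */
	note_context_switch(true);	/* involuntary preemption */
	return 0;
}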
diff --git a/kernel/signal.c b/kernel/signal.c index a8c54f384553..ca92bcfeb322 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, | |||
1237 | } | 1237 | } |
1238 | /* | 1238 | /* |
1239 | * This sighand can be already freed and even reused, but | 1239 | * This sighand can be already freed and even reused, but |
1240 | * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which | 1240 | * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which |
1241 | * initializes ->siglock: this slab can't go away, it has | 1241 | * initializes ->siglock: this slab can't go away, it has |
1242 | * the same object type, ->siglock can't be reinitialized. | 1242 | * the same object type, ->siglock can't be reinitialized. |
1243 | * | 1243 | * |
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index b10da59cf765..c81549d5c833 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c | |||
@@ -413,7 +413,7 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, | |||
413 | *size += sizeof(struct kasan_alloc_meta); | 413 | *size += sizeof(struct kasan_alloc_meta); |
414 | 414 | ||
415 | /* Add free meta. */ | 415 | /* Add free meta. */ |
416 | if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || | 416 | if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || |
417 | cache->object_size < sizeof(struct kasan_free_meta)) { | 417 | cache->object_size < sizeof(struct kasan_free_meta)) { |
418 | cache->kasan_info.free_meta_offset = *size; | 418 | cache->kasan_info.free_meta_offset = *size; |
419 | *size += sizeof(struct kasan_free_meta); | 419 | *size += sizeof(struct kasan_free_meta); |
@@ -561,7 +561,7 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) | |||
561 | unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); | 561 | unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); |
562 | 562 | ||
563 | /* RCU slabs could be legally used after free within the RCU period */ | 563 | /* RCU slabs could be legally used after free within the RCU period */ |
564 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) | 564 | if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) |
565 | return; | 565 | return; |
566 | 566 | ||
567 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); | 567 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); |
@@ -572,7 +572,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) | |||
572 | s8 shadow_byte; | 572 | s8 shadow_byte; |
573 | 573 | ||
574 | /* RCU slabs could be legally used after free within the RCU period */ | 574 | /* RCU slabs could be legally used after free within the RCU period */ |
575 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) | 575 | if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) |
576 | return false; | 576 | return false; |
577 | 577 | ||
578 | shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); | 578 | shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); |
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 5bf191756a4a..2d5959c5f7c5 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c | |||
@@ -95,7 +95,7 @@ void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, | |||
95 | void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) | 95 | void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) |
96 | { | 96 | { |
97 | /* TODO: RCU freeing is unsupported for now; hide false positives. */ | 97 | /* TODO: RCU freeing is unsupported for now; hide false positives. */ |
98 | if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) | 98 | if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU)) |
99 | kmemcheck_mark_freed(object, size); | 99 | kmemcheck_mark_freed(object, size); |
100 | } | 100 | } |
101 | 101 | ||
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index a7652acd2ab9..54ca54562928 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -21,7 +21,7 @@ | |||
21 | #include <linux/slab.h> | 21 | #include <linux/slab.h> |
22 | 22 | ||
23 | /* global SRCU for all MMs */ | 23 | /* global SRCU for all MMs */ |
24 | static struct srcu_struct srcu; | 24 | DEFINE_STATIC_SRCU(srcu); |
25 | 25 | ||
26 | /* | 26 | /* |
27 | * This function allows mmu_notifier::release callback to delay a call to | 27 | * This function allows mmu_notifier::release callback to delay a call to |
@@ -252,12 +252,6 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
252 | 252 | ||
253 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | 253 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
254 | 254 | ||
255 | /* | ||
256 | * Verify that mmu_notifier_init() already run and the global srcu is | ||
257 | * initialized. | ||
258 | */ | ||
259 | BUG_ON(!srcu.per_cpu_ref); | ||
260 | |||
261 | ret = -ENOMEM; | 255 | ret = -ENOMEM; |
262 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | 256 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); |
263 | if (unlikely(!mmu_notifier_mm)) | 257 | if (unlikely(!mmu_notifier_mm)) |
@@ -406,9 +400,3 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, | |||
406 | mmdrop(mm); | 400 | mmdrop(mm); |
407 | } | 401 | } |
408 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); | 402 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); |
409 | |||
410 | static int __init mmu_notifier_init(void) | ||
411 | { | ||
412 | return init_srcu_struct(&srcu); | ||
413 | } | ||
414 | subsys_initcall(mmu_notifier_init); | ||
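With Tree SRCU supporting static initialization, mm/mmu_notifier.c can drop its subsys_initcall() and the per_cpu_ref sanity check and declare the domain with DEFINE_STATIC_SRCU(srcu). A hedged sketch of the before/after shape using a generic "domain" object; the macro and fields below are stand-ins, not the kernel's SRCU internals:

#include <stdio.h>

/* Stand-in for an SRCU-like domain that can be initialized statically. */
struct srcu_domain {
	int initialized;
	const char *name;
};

/* Before: a runtime init function that an initcall had to run, and every
 * user checking that it had already run.  After: a static definition that
 * is usable from the very first reference.
 */
#define DEFINE_STATIC_DOMAIN(n) \
	struct srcu_domain n = { .initialized = 1, .name = #n }

DEFINE_STATIC_DOMAIN(srcu);

int main(void)
{
	printf("%s ready at load time: %d\n", srcu.name, srcu.initialized);
	return 0;
}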
@@ -430,7 +430,7 @@ static void anon_vma_ctor(void *data) | |||
430 | void __init anon_vma_init(void) | 430 | void __init anon_vma_init(void) |
431 | { | 431 | { |
432 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | 432 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), |
433 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, | 433 | 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, |
434 | anon_vma_ctor); | 434 | anon_vma_ctor); |
435 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, | 435 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, |
436 | SLAB_PANIC|SLAB_ACCOUNT); | 436 | SLAB_PANIC|SLAB_ACCOUNT); |
@@ -481,7 +481,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) | |||
481 | * If this page is still mapped, then its anon_vma cannot have been | 481 | * If this page is still mapped, then its anon_vma cannot have been |
482 | * freed. But if it has been unmapped, we have no security against the | 482 | * freed. But if it has been unmapped, we have no security against the |
483 | * anon_vma structure being freed and reused (for another anon_vma: | 483 | * anon_vma structure being freed and reused (for another anon_vma: |
484 | * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() | 484 | * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() |
485 | * above cannot corrupt). | 485 | * above cannot corrupt). |
486 | */ | 486 | */ |
487 | if (!page_mapped(page)) { | 487 | if (!page_mapped(page)) { |
@@ -1728,7 +1728,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) | |||
1728 | 1728 | ||
1729 | freelist = page->freelist; | 1729 | freelist = page->freelist; |
1730 | slab_destroy_debugcheck(cachep, page); | 1730 | slab_destroy_debugcheck(cachep, page); |
1731 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) | 1731 | if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU)) |
1732 | call_rcu(&page->rcu_head, kmem_rcu_free); | 1732 | call_rcu(&page->rcu_head, kmem_rcu_free); |
1733 | else | 1733 | else |
1734 | kmem_freepages(cachep, page); | 1734 | kmem_freepages(cachep, page); |
@@ -1924,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, | |||
1924 | 1924 | ||
1925 | cachep->num = 0; | 1925 | cachep->num = 0; |
1926 | 1926 | ||
1927 | if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) | 1927 | if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) |
1928 | return false; | 1928 | return false; |
1929 | 1929 | ||
1930 | left = calculate_slab_order(cachep, size, | 1930 | left = calculate_slab_order(cachep, size, |
@@ -2030,7 +2030,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2030 | if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + | 2030 | if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + |
2031 | 2 * sizeof(unsigned long long))) | 2031 | 2 * sizeof(unsigned long long))) |
2032 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; | 2032 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; |
2033 | if (!(flags & SLAB_DESTROY_BY_RCU)) | 2033 | if (!(flags & SLAB_TYPESAFE_BY_RCU)) |
2034 | flags |= SLAB_POISON; | 2034 | flags |= SLAB_POISON; |
2035 | #endif | 2035 | #endif |
2036 | #endif | 2036 | #endif |
@@ -126,7 +126,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, | |||
126 | 126 | ||
127 | /* Legal flag mask for kmem_cache_create(), for various configurations */ | 127 | /* Legal flag mask for kmem_cache_create(), for various configurations */ |
128 | #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ | 128 | #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ |
129 | SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) | 129 | SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) |
130 | 130 | ||
131 | #if defined(CONFIG_DEBUG_SLAB) | 131 | #if defined(CONFIG_DEBUG_SLAB) |
132 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) | 132 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) |
@@ -415,7 +415,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) | |||
415 | * back there or track user information then we can | 415 | * back there or track user information then we can |
416 | * only use the space before that information. | 416 | * only use the space before that information. |
417 | */ | 417 | */ |
418 | if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) | 418 | if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) |
419 | return s->inuse; | 419 | return s->inuse; |
420 | /* | 420 | /* |
421 | * Else we can use all the padding etc for the allocation | 421 | * Else we can use all the padding etc for the allocation |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 09d0e849b07f..01a0fe2eb332 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -39,7 +39,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, | |||
39 | * Set of flags that will prevent slab merging | 39 | * Set of flags that will prevent slab merging |
40 | */ | 40 | */ |
41 | #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 41 | #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
42 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ | 42 | SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ |
43 | SLAB_FAILSLAB | SLAB_KASAN) | 43 | SLAB_FAILSLAB | SLAB_KASAN) |
44 | 44 | ||
45 | #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ | 45 | #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ |
@@ -500,7 +500,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) | |||
500 | struct kmem_cache *s, *s2; | 500 | struct kmem_cache *s, *s2; |
501 | 501 | ||
502 | /* | 502 | /* |
503 | * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the | 503 | * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the |
504 | * @slab_caches_to_rcu_destroy list. The slab pages are freed | 504 | * @slab_caches_to_rcu_destroy list. The slab pages are freed |
505 | * through RCU and the associated kmem_cache are dereferenced | 505 | * through RCU and the associated kmem_cache are dereferenced |
506 | * while freeing the pages, so the kmem_caches should be freed only | 506 | * while freeing the pages, so the kmem_caches should be freed only |
@@ -537,7 +537,7 @@ static int shutdown_cache(struct kmem_cache *s) | |||
537 | memcg_unlink_cache(s); | 537 | memcg_unlink_cache(s); |
538 | list_del(&s->list); | 538 | list_del(&s->list); |
539 | 539 | ||
540 | if (s->flags & SLAB_DESTROY_BY_RCU) { | 540 | if (s->flags & SLAB_TYPESAFE_BY_RCU) { |
541 | list_add_tail(&s->list, &slab_caches_to_rcu_destroy); | 541 | list_add_tail(&s->list, &slab_caches_to_rcu_destroy); |
542 | schedule_work(&slab_caches_to_rcu_destroy_work); | 542 | schedule_work(&slab_caches_to_rcu_destroy_work); |
543 | } else { | 543 | } else { |
@@ -126,7 +126,7 @@ static inline void clear_slob_page_free(struct page *sp) | |||
126 | 126 | ||
127 | /* | 127 | /* |
128 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which | 128 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which |
129 | * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free | 129 | * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free |
130 | * the block using call_rcu. | 130 | * the block using call_rcu. |
131 | */ | 131 | */ |
132 | struct slob_rcu { | 132 | struct slob_rcu { |
@@ -524,7 +524,7 @@ EXPORT_SYMBOL(ksize); | |||
524 | 524 | ||
525 | int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) | 525 | int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) |
526 | { | 526 | { |
527 | if (flags & SLAB_DESTROY_BY_RCU) { | 527 | if (flags & SLAB_TYPESAFE_BY_RCU) { |
528 | /* leave room for rcu footer at the end of object */ | 528 | /* leave room for rcu footer at the end of object */ |
529 | c->size += sizeof(struct slob_rcu); | 529 | c->size += sizeof(struct slob_rcu); |
530 | } | 530 | } |
@@ -598,7 +598,7 @@ static void kmem_rcu_free(struct rcu_head *head) | |||
598 | void kmem_cache_free(struct kmem_cache *c, void *b) | 598 | void kmem_cache_free(struct kmem_cache *c, void *b) |
599 | { | 599 | { |
600 | kmemleak_free_recursive(b, c->flags); | 600 | kmemleak_free_recursive(b, c->flags); |
601 | if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { | 601 | if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { |
602 | struct slob_rcu *slob_rcu; | 602 | struct slob_rcu *slob_rcu; |
603 | slob_rcu = b + (c->size - sizeof(struct slob_rcu)); | 603 | slob_rcu = b + (c->size - sizeof(struct slob_rcu)); |
604 | slob_rcu->size = c->size; | 604 | slob_rcu->size = c->size; |
@@ -1687,7 +1687,7 @@ static void rcu_free_slab(struct rcu_head *h) | |||
1687 | 1687 | ||
1688 | static void free_slab(struct kmem_cache *s, struct page *page) | 1688 | static void free_slab(struct kmem_cache *s, struct page *page) |
1689 | { | 1689 | { |
1690 | if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { | 1690 | if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { |
1691 | struct rcu_head *head; | 1691 | struct rcu_head *head; |
1692 | 1692 | ||
1693 | if (need_reserve_slab_rcu) { | 1693 | if (need_reserve_slab_rcu) { |
@@ -2963,7 +2963,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, | |||
2963 | * slab_free_freelist_hook() could have put the items into quarantine. | 2963 | * slab_free_freelist_hook() could have put the items into quarantine. |
2964 | * If so, no need to free them. | 2964 | * If so, no need to free them. |
2965 | */ | 2965 | */ |
2966 | if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) | 2966 | if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) |
2967 | return; | 2967 | return; |
2968 | do_slab_free(s, page, head, tail, cnt, addr); | 2968 | do_slab_free(s, page, head, tail, cnt, addr); |
2969 | } | 2969 | } |
@@ -3433,7 +3433,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3433 | * the slab may touch the object after free or before allocation | 3433 | * the slab may touch the object after free or before allocation |
3434 | * then we should never poison the object itself. | 3434 | * then we should never poison the object itself. |
3435 | */ | 3435 | */ |
3436 | if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && | 3436 | if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) && |
3437 | !s->ctor) | 3437 | !s->ctor) |
3438 | s->flags |= __OBJECT_POISON; | 3438 | s->flags |= __OBJECT_POISON; |
3439 | else | 3439 | else |
@@ -3455,7 +3455,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3455 | */ | 3455 | */ |
3456 | s->inuse = size; | 3456 | s->inuse = size; |
3457 | 3457 | ||
3458 | if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || | 3458 | if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || |
3459 | s->ctor)) { | 3459 | s->ctor)) { |
3460 | /* | 3460 | /* |
3461 | * Relocate free pointer after the object if it is not | 3461 | * Relocate free pointer after the object if it is not |
@@ -3537,7 +3537,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) | |||
3537 | s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); | 3537 | s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); |
3538 | s->reserved = 0; | 3538 | s->reserved = 0; |
3539 | 3539 | ||
3540 | if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) | 3540 | if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) |
3541 | s->reserved = sizeof(struct rcu_head); | 3541 | s->reserved = sizeof(struct rcu_head); |
3542 | 3542 | ||
3543 | if (!calculate_sizes(s, -1)) | 3543 | if (!calculate_sizes(s, -1)) |
@@ -5042,7 +5042,7 @@ SLAB_ATTR_RO(cache_dma); | |||
5042 | 5042 | ||
5043 | static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) | 5043 | static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) |
5044 | { | 5044 | { |
5045 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); | 5045 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); |
5046 | } | 5046 | } |
5047 | SLAB_ATTR_RO(destroy_by_rcu); | 5047 | SLAB_ATTR_RO(destroy_by_rcu); |
5048 | 5048 | ||
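The remaining hunks are the mechanical SLAB_DESTROY_BY_RCU -> SLAB_TYPESAFE_BY_RCU rename; the semantics are unchanged. Slab pages of such a cache go back to the page allocator only after a grace period, so a freed slot may be reused immediately, but only for another object of the same type, and a zero reference count is what marks an object as freed (hence the "do not use kmem_cache_zalloc()" comment in the nf_conntrack hunk further down). A userspace teaching model of that type-stable reuse, not the slab allocator itself:

#include <stdio.h>

/* Fixed-type slots that are never repurposed for a different struct:
 * a stale reader still sees memory laid out as struct conn.
 */
struct conn { int refcnt; int id; };

static struct conn pool[4];
static unsigned int free_mask = 0xf;

static struct conn *pool_alloc(int id)
{
	int i;

	for (i = 0; i < 4; i++) {
		if (free_mask & (1U << i)) {
			free_mask &= ~(1U << i);
			pool[i].refcnt = 1;	/* do NOT zero the whole slot */
			pool[i].id = id;
			return &pool[i];
		}
	}
	return NULL;
}

static void pool_free(struct conn *c)
{
	c->refcnt = 0;			/* refcnt==0 is the "freed" marker */
	free_mask |= 1U << (unsigned int)(c - pool);
}

int main(void)
{
	struct conn *a = pool_alloc(1);
	struct conn *b;

	pool_free(a);
	b = pool_alloc(2);		/* may reuse a's slot right away */
	printf("same slot reused: %d\n", a == b);
	return 0;
}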
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b99168b0fabf..f75482bdee9a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c | |||
@@ -951,7 +951,7 @@ static struct proto dccp_v4_prot = { | |||
951 | .orphan_count = &dccp_orphan_count, | 951 | .orphan_count = &dccp_orphan_count, |
952 | .max_header = MAX_DCCP_HEADER, | 952 | .max_header = MAX_DCCP_HEADER, |
953 | .obj_size = sizeof(struct dccp_sock), | 953 | .obj_size = sizeof(struct dccp_sock), |
954 | .slab_flags = SLAB_DESTROY_BY_RCU, | 954 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
955 | .rsk_prot = &dccp_request_sock_ops, | 955 | .rsk_prot = &dccp_request_sock_ops, |
956 | .twsk_prot = &dccp_timewait_sock_ops, | 956 | .twsk_prot = &dccp_timewait_sock_ops, |
957 | .h.hashinfo = &dccp_hashinfo, | 957 | .h.hashinfo = &dccp_hashinfo, |
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d9b6a4e403e7..840f14aaa016 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c | |||
@@ -1014,7 +1014,7 @@ static struct proto dccp_v6_prot = { | |||
1014 | .orphan_count = &dccp_orphan_count, | 1014 | .orphan_count = &dccp_orphan_count, |
1015 | .max_header = MAX_DCCP_HEADER, | 1015 | .max_header = MAX_DCCP_HEADER, |
1016 | .obj_size = sizeof(struct dccp6_sock), | 1016 | .obj_size = sizeof(struct dccp6_sock), |
1017 | .slab_flags = SLAB_DESTROY_BY_RCU, | 1017 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
1018 | .rsk_prot = &dccp6_request_sock_ops, | 1018 | .rsk_prot = &dccp6_request_sock_ops, |
1019 | .twsk_prot = &dccp6_timewait_sock_ops, | 1019 | .twsk_prot = &dccp6_timewait_sock_ops, |
1020 | .h.hashinfo = &dccp_hashinfo, | 1020 | .h.hashinfo = &dccp_hashinfo, |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3a51582bef55..5ab2aac5ca19 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -2395,7 +2395,7 @@ struct proto tcp_prot = { | |||
2395 | .sysctl_rmem = sysctl_tcp_rmem, | 2395 | .sysctl_rmem = sysctl_tcp_rmem, |
2396 | .max_header = MAX_TCP_HEADER, | 2396 | .max_header = MAX_TCP_HEADER, |
2397 | .obj_size = sizeof(struct tcp_sock), | 2397 | .obj_size = sizeof(struct tcp_sock), |
2398 | .slab_flags = SLAB_DESTROY_BY_RCU, | 2398 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
2399 | .twsk_prot = &tcp_timewait_sock_ops, | 2399 | .twsk_prot = &tcp_timewait_sock_ops, |
2400 | .rsk_prot = &tcp_request_sock_ops, | 2400 | .rsk_prot = &tcp_request_sock_ops, |
2401 | .h.hashinfo = &tcp_hashinfo, | 2401 | .h.hashinfo = &tcp_hashinfo, |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index aeb9497b5bb7..7a8237acd210 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -1917,7 +1917,7 @@ struct proto tcpv6_prot = { | |||
1917 | .sysctl_rmem = sysctl_tcp_rmem, | 1917 | .sysctl_rmem = sysctl_tcp_rmem, |
1918 | .max_header = MAX_TCP_HEADER, | 1918 | .max_header = MAX_TCP_HEADER, |
1919 | .obj_size = sizeof(struct tcp6_sock), | 1919 | .obj_size = sizeof(struct tcp6_sock), |
1920 | .slab_flags = SLAB_DESTROY_BY_RCU, | 1920 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
1921 | .twsk_prot = &tcp6_timewait_sock_ops, | 1921 | .twsk_prot = &tcp6_timewait_sock_ops, |
1922 | .rsk_prot = &tcp6_request_sock_ops, | 1922 | .rsk_prot = &tcp6_request_sock_ops, |
1923 | .h.hashinfo = &tcp_hashinfo, | 1923 | .h.hashinfo = &tcp_hashinfo, |
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index cb4fff785cbf..8364fe5b59e4 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c | |||
@@ -142,7 +142,7 @@ static struct proto llc_proto = { | |||
142 | .name = "LLC", | 142 | .name = "LLC", |
143 | .owner = THIS_MODULE, | 143 | .owner = THIS_MODULE, |
144 | .obj_size = sizeof(struct llc_sock), | 144 | .obj_size = sizeof(struct llc_sock), |
145 | .slab_flags = SLAB_DESTROY_BY_RCU, | 145 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
146 | }; | 146 | }; |
147 | 147 | ||
148 | /** | 148 | /** |
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 8bc5a1bd2d45..9b02c13d258b 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c | |||
@@ -506,7 +506,7 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap, | |||
506 | again: | 506 | again: |
507 | sk_nulls_for_each_rcu(rc, node, laddr_hb) { | 507 | sk_nulls_for_each_rcu(rc, node, laddr_hb) { |
508 | if (llc_estab_match(sap, daddr, laddr, rc)) { | 508 | if (llc_estab_match(sap, daddr, laddr, rc)) { |
509 | /* Extra checks required by SLAB_DESTROY_BY_RCU */ | 509 | /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ |
510 | if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) | 510 | if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) |
511 | goto again; | 511 | goto again; |
512 | if (unlikely(llc_sk(rc)->sap != sap || | 512 | if (unlikely(llc_sk(rc)->sap != sap || |
@@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap, | |||
565 | again: | 565 | again: |
566 | sk_nulls_for_each_rcu(rc, node, laddr_hb) { | 566 | sk_nulls_for_each_rcu(rc, node, laddr_hb) { |
567 | if (llc_listener_match(sap, laddr, rc)) { | 567 | if (llc_listener_match(sap, laddr, rc)) { |
568 | /* Extra checks required by SLAB_DESTROY_BY_RCU */ | 568 | /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ |
569 | if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) | 569 | if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) |
570 | goto again; | 570 | goto again; |
571 | if (unlikely(llc_sk(rc)->sap != sap || | 571 | if (unlikely(llc_sk(rc)->sap != sap || |
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 5404d0d195cc..63b6ab056370 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c | |||
@@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap, | |||
328 | again: | 328 | again: |
329 | sk_nulls_for_each_rcu(rc, node, laddr_hb) { | 329 | sk_nulls_for_each_rcu(rc, node, laddr_hb) { |
330 | if (llc_dgram_match(sap, laddr, rc)) { | 330 | if (llc_dgram_match(sap, laddr, rc)) { |
331 | /* Extra checks required by SLAB_DESTROY_BY_RCU */ | 331 | /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ |
332 | if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) | 332 | if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) |
333 | goto again; | 333 | goto again; |
334 | if (unlikely(llc_sk(rc)->sap != sap || | 334 | if (unlikely(llc_sk(rc)->sap != sap || |
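The LLC lookup hunks retain the canonical reader-side pattern for SLAB_TYPESAFE_BY_RCU caches: under rcu_read_lock(), take a reference only via atomic_inc_not_zero(), then re-verify that the object still matches the lookup key, and restart if either step fails, because the slot may have been freed and reused for a different socket in the meantime. A condensed userspace model of that check-then-recheck loop (single-threaded, with plain integers standing in for the atomics and the RCU-protected hash walk):

#include <stdio.h>
#include <stdbool.h>

struct sock_obj { int refcnt; int key; };

/* Stand-in for atomic_inc_not_zero(): only pin objects that are live. */
static bool get_unless_zero(struct sock_obj *s)
{
	if (s->refcnt == 0)
		return false;
	s->refcnt++;
	return true;
}

static void put_ref(struct sock_obj *s) { s->refcnt--; }

static struct sock_obj *lookup(struct sock_obj *table, int n, int key)
{
	int i;

again:
	for (i = 0; i < n; i++) {
		struct sock_obj *s = &table[i];

		if (s->key != key)
			continue;
		if (!get_unless_zero(s))	/* object being freed: retry */
			goto again;
		if (s->key != key) {		/* slot was reused: drop and retry */
			put_ref(s);
			goto again;
		}
		return s;			/* verified, referenced match */
	}
	return NULL;
}

int main(void)
{
	struct sock_obj table[2] = { { 1, 10 }, { 1, 20 } };
	struct sock_obj *s = lookup(table, 2, 20);

	printf("found key=%d refcnt=%d\n", s ? s->key : -1, s ? s->refcnt : 0);
	return 0;
}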
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3c8f1ed2f555..e847dbaa0c6b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c | |||
@@ -911,7 +911,7 @@ static unsigned int early_drop_list(struct net *net, | |||
911 | continue; | 911 | continue; |
912 | 912 | ||
913 | /* kill only if still in same netns -- might have moved due to | 913 | /* kill only if still in same netns -- might have moved due to |
914 | * SLAB_DESTROY_BY_RCU rules. | 914 | * SLAB_TYPESAFE_BY_RCU rules. |
915 | * | 915 | * |
916 | * We steal the timer reference. If that fails timer has | 916 | * We steal the timer reference. If that fails timer has |
917 | * already fired or someone else deleted it. Just drop ref | 917 | * already fired or someone else deleted it. Just drop ref |
@@ -1114,7 +1114,7 @@ __nf_conntrack_alloc(struct net *net, | |||
1114 | 1114 | ||
1115 | /* | 1115 | /* |
1116 | * Do not use kmem_cache_zalloc(), as this cache uses | 1116 | * Do not use kmem_cache_zalloc(), as this cache uses |
1117 | * SLAB_DESTROY_BY_RCU. | 1117 | * SLAB_TYPESAFE_BY_RCU. |
1118 | */ | 1118 | */ |
1119 | ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); | 1119 | ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); |
1120 | if (ct == NULL) | 1120 | if (ct == NULL) |
@@ -1159,7 +1159,7 @@ void nf_conntrack_free(struct nf_conn *ct) | |||
1159 | struct net *net = nf_ct_net(ct); | 1159 | struct net *net = nf_ct_net(ct); |
1160 | 1160 | ||
1161 | /* A freed object has refcnt == 0, that's | 1161 | /* A freed object has refcnt == 0, that's |
1162 | * the golden rule for SLAB_DESTROY_BY_RCU | 1162 | * the golden rule for SLAB_TYPESAFE_BY_RCU |
1163 | */ | 1163 | */ |
1164 | NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); | 1164 | NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); |
1165 | 1165 | ||
@@ -1929,7 +1929,7 @@ int nf_conntrack_init_start(void) | |||
1929 | nf_conntrack_cachep = kmem_cache_create("nf_conntrack", | 1929 | nf_conntrack_cachep = kmem_cache_create("nf_conntrack", |
1930 | sizeof(struct nf_conn), | 1930 | sizeof(struct nf_conn), |
1931 | NFCT_INFOMASK + 1, | 1931 | NFCT_INFOMASK + 1, |
1932 | SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); | 1932 | SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); |
1933 | if (!nf_conntrack_cachep) | 1933 | if (!nf_conntrack_cachep) |
1934 | goto err_cachep; | 1934 | goto err_cachep; |
1935 | 1935 | ||
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5b6ee21368a6..6793d7348cc8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c | |||
@@ -101,7 +101,7 @@ struct proto smc_proto = { | |||
101 | .unhash = smc_unhash_sk, | 101 | .unhash = smc_unhash_sk, |
102 | .obj_size = sizeof(struct smc_sock), | 102 | .obj_size = sizeof(struct smc_sock), |
103 | .h.smc_hash = &smc_v4_hashinfo, | 103 | .h.smc_hash = &smc_v4_hashinfo, |
104 | .slab_flags = SLAB_DESTROY_BY_RCU, | 104 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
105 | }; | 105 | }; |
106 | EXPORT_SYMBOL_GPL(smc_proto); | 106 | EXPORT_SYMBOL_GPL(smc_proto); |
107 | 107 | ||
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index ea6e373edc27..93eede4e8fbe 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | |||
@@ -170,7 +170,7 @@ qemu_append="`identify_qemu_append "$QEMU"`" | |||
170 | # Pull in Kconfig-fragment boot parameters | 170 | # Pull in Kconfig-fragment boot parameters |
171 | boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" | 171 | boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" |
172 | # Generate kernel-version-specific boot parameters | 172 | # Generate kernel-version-specific boot parameters |
173 | boot_args="`per_version_boot_params "$boot_args" $builddir/.config $seconds`" | 173 | boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`" |
174 | 174 | ||
175 | if test -n "$TORTURE_BUILDONLY" | 175 | if test -n "$TORTURE_BUILDONLY" |
176 | then | 176 | then |