author     Ingo Molnar <mingo@kernel.org>   2017-04-23 05:12:44 -0400
committer  Ingo Molnar <mingo@kernel.org>   2017-04-23 05:12:44 -0400
commit     58d30c36d472b75e8e9962d6a640be19d9389128 (patch)
tree       ce161b15e844d081f527f02a4f74ffd1171b2b14
parent     94836ecf1e7378b64d37624fbb81fe48fbd4c772 (diff)
parent     f2094107ac82bf867184efd77cee30b6a98e2e20 (diff)
Merge branch 'for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/rcu
Pull RCU updates from Paul E. McKenney:
- Documentation updates.
- Miscellaneous fixes.
- Parallelize SRCU callback handling (plus overlapping patches).
Signed-off-by: Ingo Molnar <mingo@kernel.org>
71 files changed, 3637 insertions, 1116 deletions
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index f773a264ae02..1672573b037a 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -17,7 +17,7 @@ rcu_dereference.txt | |||
17 | rcubarrier.txt | 17 | rcubarrier.txt |
18 | - RCU and Unloadable Modules | 18 | - RCU and Unloadable Modules |
19 | rculist_nulls.txt | 19 | rculist_nulls.txt |
20 | - RCU list primitives for use with SLAB_DESTROY_BY_RCU | 20 | - RCU list primitives for use with SLAB_TYPESAFE_BY_RCU |
21 | rcuref.txt | 21 | rcuref.txt |
22 | - Reference-count design for elements of lists/arrays protected by RCU | 22 | - Reference-count design for elements of lists/arrays protected by RCU |
23 | rcu.txt | 23 | rcu.txt |
diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
index d583c653a703..38d6d800761f 100644
--- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html
@@ -19,6 +19,8 @@ to each other. | |||
19 | The <tt>rcu_state</tt> Structure</a> | 19 | The <tt>rcu_state</tt> Structure</a> |
20 | <li> <a href="#The rcu_node Structure"> | 20 | <li> <a href="#The rcu_node Structure"> |
21 | The <tt>rcu_node</tt> Structure</a> | 21 | The <tt>rcu_node</tt> Structure</a> |
22 | <li> <a href="#The rcu_segcblist Structure"> | ||
23 | The <tt>rcu_segcblist</tt> Structure</a> | ||
22 | <li> <a href="#The rcu_data Structure"> | 24 | <li> <a href="#The rcu_data Structure"> |
23 | The <tt>rcu_data</tt> Structure</a> | 25 | The <tt>rcu_data</tt> Structure</a> |
24 | <li> <a href="#The rcu_dynticks Structure"> | 26 | <li> <a href="#The rcu_dynticks Structure"> |
@@ -841,6 +843,134 @@ for lockdep lock-class names. | |||
841 | Finally, lines 64-66 produce an error if the maximum number of | 843 | Finally, lines 64-66 produce an error if the maximum number of |
842 | CPUs is too large for the specified fanout. | 844 | CPUs is too large for the specified fanout. |
843 | 845 | ||
846 | <h3><a name="The rcu_segcblist Structure"> | ||
847 | The <tt>rcu_segcblist</tt> Structure</a></h3> | ||
848 | |||
849 | The <tt>rcu_segcblist</tt> structure maintains a segmented list of | ||
850 | callbacks as follows: | ||
851 | |||
852 | <pre> | ||
853 | 1 #define RCU_DONE_TAIL 0 | ||
854 | 2 #define RCU_WAIT_TAIL 1 | ||
855 | 3 #define RCU_NEXT_READY_TAIL 2 | ||
856 | 4 #define RCU_NEXT_TAIL 3 | ||
857 | 5 #define RCU_CBLIST_NSEGS 4 | ||
858 | 6 | ||
859 | 7 struct rcu_segcblist { | ||
860 | 8 struct rcu_head *head; | ||
861 | 9 struct rcu_head **tails[RCU_CBLIST_NSEGS]; | ||
862 | 10 unsigned long gp_seq[RCU_CBLIST_NSEGS]; | ||
863 | 11 long len; | ||
864 | 12 long len_lazy; | ||
865 | 13 }; | ||
866 | </pre> | ||
867 | |||
868 | <p> | ||
869 | The segments are as follows: | ||
870 | |||
871 | <ol> | ||
872 | <li> <tt>RCU_DONE_TAIL</tt>: Callbacks whose grace periods have elapsed. | ||
873 | These callbacks are ready to be invoked. | ||
874 | <li> <tt>RCU_WAIT_TAIL</tt>: Callbacks that are waiting for the | ||
875 | current grace period. | ||
876 | Note that different CPUs can have different ideas about which | ||
877 | grace period is current, hence the <tt>->gp_seq</tt> field. | ||
878 | <li> <tt>RCU_NEXT_READY_TAIL</tt>: Callbacks waiting for the next | ||
879 | grace period to start. | ||
880 | <li> <tt>RCU_NEXT_TAIL</tt>: Callbacks that have not yet been | ||
881 | associated with a grace period. | ||
882 | </ol> | ||
883 | |||
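[Editor's note] To make the segment roles above concrete, here is a minimal sketch of enqueuing a new callback into the RCU_NEXT_TAIL segment, based only on the structure listing shown above. It is not the kernel's rcu_segcblist implementation (which also updates ->gp_seq and uses the appropriate memory-ordering primitives); the helper name is illustrative.

    /* Hedged sketch: append a callback to the RCU_NEXT_TAIL segment. */
    static void segcblist_enqueue_sketch(struct rcu_segcblist *rsclp,
                                         struct rcu_head *rhp, bool lazy)
    {
            rhp->next = NULL;
            *rsclp->tails[RCU_NEXT_TAIL] = rhp;       /* Link after the current last callback. */
            rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; /* The new callback is now the last one. */
            rsclp->len++;
            if (lazy)
                    rsclp->len_lazy++;
    }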
884 | <p> | ||
885 | The <tt>->head</tt> pointer references the first callback or | ||
886 | is <tt>NULL</tt> if the list contains no callbacks (which is | ||
887 | <i>not</i> the same as being empty). | ||
888 | Each element of the <tt>->tails[]</tt> array references the | ||
889 | <tt>->next</tt> pointer of the last callback in the corresponding | ||
890 | segment of the list, or the list's <tt>->head</tt> pointer if | ||
891 | that segment and all previous segments are empty. | ||
892 | If the corresponding segment is empty but some previous segment is | ||
893 | not empty, then the array element is identical to its predecessor. | ||
894 | Older callbacks are closer to the head of the list, and new callbacks | ||
895 | are added at the tail. | ||
896 | This relationship between the <tt>->head</tt> pointer, the | ||
897 | <tt>->tails[]</tt> array, and the callbacks is shown in this | ||
898 | diagram: | ||
899 | |||
900 | </p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%"> | ||
901 | |||
902 | </p><p>In this figure, the <tt>->head</tt> pointer references the | ||
903 | first | ||
904 | RCU callback in the list. | ||
905 | The <tt>->tails[RCU_DONE_TAIL]</tt> array element references | ||
906 | the <tt>->head</tt> pointer itself, indicating that none | ||
907 | of the callbacks is ready to invoke. | ||
908 | The <tt>->tails[RCU_WAIT_TAIL]</tt> array element references callback | ||
909 | CB 2's <tt>->next</tt> pointer, which indicates that | ||
910 | CB 1 and CB 2 are both waiting on the current grace period, | ||
911 | give or take possible disagreements about exactly which grace period | ||
912 | is the current one. | ||
913 | The <tt>->tails[RCU_NEXT_READY_TAIL]</tt> array element | ||
914 | references the same RCU callback that <tt>->tails[RCU_WAIT_TAIL]</tt> | ||
915 | does, which indicates that there are no callbacks waiting on the next | ||
916 | RCU grace period. | ||
917 | The <tt>->tails[RCU_NEXT_TAIL]</tt> array element references | ||
918 | CB 4's <tt>->next</tt> pointer, indicating that all the | ||
919 | remaining RCU callbacks have not yet been assigned to an RCU grace | ||
920 | period. | ||
921 | Note that the <tt>->tails[RCU_NEXT_TAIL]</tt> array element | ||
922 | always references the last RCU callback's <tt>->next</tt> pointer | ||
923 | unless the callback list is empty, in which case it references | ||
924 | the <tt>->head</tt> pointer. | ||
925 | |||
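[Editor's note] The invariant just described (an empty segment's tail pointer equals its predecessor's tail pointer, or ->head for the first segment) can be captured in a small predicate. This is a hedged sketch with an illustrative helper name, not the kernel's own code.

    /* Sketch: test whether a given segment of the list is empty. */
    static bool segcblist_segempty_sketch(struct rcu_segcblist *rsclp, int seg)
    {
            if (seg == RCU_DONE_TAIL)
                    return rsclp->tails[RCU_DONE_TAIL] == &rsclp->head;
            return rsclp->tails[seg] == rsclp->tails[seg - 1];
    }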
926 | <p> | ||
927 | There is one additional important special case for the | ||
928 | <tt>->tails[RCU_NEXT_TAIL]</tt> array element: It can be <tt>NULL</tt> | ||
929 | when this list is <i>disabled</i>. | ||
930 | Lists are disabled when the corresponding CPU is offline or when | ||
931 | the corresponding CPU's callbacks are offloaded to a kthread, | ||
932 | both of which are described elsewhere. | ||
933 | |||
934 | </p><p>CPUs advance their callbacks from the | ||
935 | <tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the | ||
936 | <tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments | ||
937 | as grace periods advance. | ||
938 | |||
939 | </p><p>The <tt>->gp_seq[]</tt> array records grace-period | ||
940 | numbers corresponding to the list segments. | ||
941 | This is what allows different CPUs to have different ideas as to | ||
942 | which is the current grace period while still avoiding premature | ||
943 | invocation of their callbacks. | ||
944 | In particular, this allows CPUs that go idle for extended periods | ||
945 | to determine which of their callbacks are ready to be invoked after | ||
946 | reawakening. | ||
947 | |||
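[Editor's note] As a rough illustration of how ->gp_seq[] is consulted, the following sketch merges into RCU_DONE_TAIL every segment whose recorded grace period has already completed. The real kernel code additionally compacts the remaining segments and is named differently; treat this purely as a sketch. ULONG_CMP_LT() is the kernel's wrap-safe unsigned comparison.

    /* Sketch: promote callbacks whose grace period has already ended. */
    static void segcblist_advance_sketch(struct rcu_segcblist *rsclp,
                                         unsigned long completed)
    {
            int i;

            for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
                    if (ULONG_CMP_LT(completed, rsclp->gp_seq[i]))
                            break;          /* This segment is still waiting. */
                    rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i];
            }
    }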
948 | </p><p>The <tt>->len</tt> counter contains the number of | ||
949 | callbacks in <tt>->head</tt>, and the | ||
950 | <tt>->len_lazy</tt> contains the number of those callbacks that | ||
951 | are known to only free memory, and whose invocation can therefore | ||
952 | be safely deferred. | ||
953 | |||
954 | <p><b>Important note</b>: It is the <tt>->len</tt> field that | ||
955 | determines whether or not there are callbacks associated with | ||
956 | this <tt>rcu_segcblist</tt> structure, <i>not</i> the <tt>->head</tt> | ||
957 | pointer. | ||
958 | The reason for this is that all the ready-to-invoke callbacks | ||
959 | (that is, those in the <tt>RCU_DONE_TAIL</tt> segment) are extracted | ||
960 | all at once at callback-invocation time. | ||
961 | If callback invocation must be postponed, for example, because a | ||
962 | high-priority process just woke up on this CPU, then the remaining | ||
963 | callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment. | ||
964 | Either way, the <tt>->len</tt> and <tt>->len_lazy</tt> counts | ||
965 | are adjusted after the corresponding callbacks have been invoked, and so | ||
966 | again it is the <tt>->len</tt> count that accurately reflects whether | ||
967 | or not there are callbacks associated with this <tt>rcu_segcblist</tt> | ||
968 | structure. | ||
969 | Of course, off-CPU sampling of the <tt>->len</tt> count requires | ||
970 | the use of appropriate synchronization, for example, memory barriers. | ||
971 | This synchronization can be a bit subtle, particularly in the case | ||
972 | of <tt>rcu_barrier()</tt>. | ||
973 | |||
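[Editor's note] In code, the check suggested by this note would consult ->len rather than ->head. A hedged one-line sketch follows; the helper name is illustrative, and READ_ONCE() is used because ->len may be sampled from another CPU, as mentioned above.

    /* Sketch: callbacks are associated with the list iff ->len is nonzero. */
    static bool segcblist_has_cbs_sketch(struct rcu_segcblist *rsclp)
    {
            return READ_ONCE(rsclp->len) != 0;
    }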
844 | <h3><a name="The rcu_data Structure"> | 974 | <h3><a name="The rcu_data Structure"> |
845 | The <tt>rcu_data</tt> Structure</a></h3> | 975 | The <tt>rcu_data</tt> Structure</a></h3> |
846 | 976 | ||
@@ -983,62 +1113,18 @@ choice. | |||
983 | as follows: | 1113 | as follows: |
984 | 1114 | ||
985 | <pre> | 1115 | <pre> |
986 | 1 struct rcu_head *nxtlist; | 1116 | 1 struct rcu_segcblist cblist; |
987 | 2 struct rcu_head **nxttail[RCU_NEXT_SIZE]; | 1117 | 2 long qlen_last_fqs_check; |
988 | 3 unsigned long nxtcompleted[RCU_NEXT_SIZE]; | 1118 | 3 unsigned long n_cbs_invoked; |
989 | 4 long qlen_lazy; | 1119 | 4 unsigned long n_nocbs_invoked; |
990 | 5 long qlen; | 1120 | 5 unsigned long n_cbs_orphaned; |
991 | 6 long qlen_last_fqs_check; | 1121 | 6 unsigned long n_cbs_adopted; |
992 | 7 unsigned long n_force_qs_snap; | 1122 | 7 unsigned long n_force_qs_snap; |
993 | 8 unsigned long n_cbs_invoked; | 1123 | 8 long blimit; |
994 | 9 unsigned long n_cbs_orphaned; | ||
995 | 10 unsigned long n_cbs_adopted; | ||
996 | 11 long blimit; | ||
997 | </pre> | 1124 | </pre> |
998 | 1125 | ||
999 | <p>The <tt>->nxtlist</tt> pointer and the | 1126 | <p>The <tt>->cblist</tt> structure is the segmented callback list |
1000 | <tt>->nxttail[]</tt> array form a four-segment list with | 1127 | described earlier. |
1001 | older callbacks near the head and newer ones near the tail. | ||
1002 | Each segment contains callbacks with the corresponding relationship | ||
1003 | to the current grace period. | ||
1004 | The pointer out of the end of each of the four segments is referenced | ||
1005 | by the element of the <tt>->nxttail[]</tt> array indexed by | ||
1006 | <tt>RCU_DONE_TAIL</tt> (for callbacks handled by a prior grace period), | ||
1007 | <tt>RCU_WAIT_TAIL</tt> (for callbacks waiting on the current grace period), | ||
1008 | <tt>RCU_NEXT_READY_TAIL</tt> (for callbacks that will wait on the next | ||
1009 | grace period), and | ||
1010 | <tt>RCU_NEXT_TAIL</tt> (for callbacks that are not yet associated | ||
1011 | with a specific grace period) | ||
1012 | respectively, as shown in the following figure. | ||
1013 | |||
1014 | </p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%"> | ||
1015 | |||
1016 | </p><p>In this figure, the <tt>->nxtlist</tt> pointer references the | ||
1017 | first | ||
1018 | RCU callback in the list. | ||
1019 | The <tt>->nxttail[RCU_DONE_TAIL]</tt> array element references | ||
1020 | the <tt>->nxtlist</tt> pointer itself, indicating that none | ||
1021 | of the callbacks is ready to invoke. | ||
1022 | The <tt>->nxttail[RCU_WAIT_TAIL]</tt> array element references callback | ||
1023 | CB 2's <tt>->next</tt> pointer, which indicates that | ||
1024 | CB 1 and CB 2 are both waiting on the current grace period. | ||
1025 | The <tt>->nxttail[RCU_NEXT_READY_TAIL]</tt> array element | ||
1026 | references the same RCU callback that <tt>->nxttail[RCU_WAIT_TAIL]</tt> | ||
1027 | does, which indicates that there are no callbacks waiting on the next | ||
1028 | RCU grace period. | ||
1029 | The <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element references | ||
1030 | CB 4's <tt>->next</tt> pointer, indicating that all the | ||
1031 | remaining RCU callbacks have not yet been assigned to an RCU grace | ||
1032 | period. | ||
1033 | Note that the <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element | ||
1034 | always references the last RCU callback's <tt>->next</tt> pointer | ||
1035 | unless the callback list is empty, in which case it references | ||
1036 | the <tt>->nxtlist</tt> pointer. | ||
1037 | |||
1038 | </p><p>CPUs advance their callbacks from the | ||
1039 | <tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the | ||
1040 | <tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments | ||
1041 | as grace periods advance. | ||
1042 | The CPU advances the callbacks in its <tt>rcu_data</tt> structure | 1128 | The CPU advances the callbacks in its <tt>rcu_data</tt> structure |
1043 | whenever it notices that another RCU grace period has completed. | 1129 | whenever it notices that another RCU grace period has completed. |
1044 | The CPU detects the completion of an RCU grace period by noticing | 1130 | The CPU detects the completion of an RCU grace period by noticing |
@@ -1049,16 +1135,7 @@ Recall that each <tt>rcu_node</tt> structure's | |||
1049 | <tt>->completed</tt> field is updated at the end of each | 1135 | <tt>->completed</tt> field is updated at the end of each |
1050 | grace period. | 1136 | grace period. |
1051 | 1137 | ||
1052 | </p><p>The <tt>->nxtcompleted[]</tt> array records grace-period | 1138 | <p> |
1053 | numbers corresponding to the list segments. | ||
1054 | This allows CPUs that go idle for extended periods to determine | ||
1055 | which of their callbacks are ready to be invoked after reawakening. | ||
1056 | |||
1057 | </p><p>The <tt>->qlen</tt> counter contains the number of | ||
1058 | callbacks in <tt>->nxtlist</tt>, and the | ||
1059 | <tt>->qlen_lazy</tt> contains the number of those callbacks that | ||
1060 | are known to only free memory, and whose invocation can therefore | ||
1061 | be safely deferred. | ||
1062 | The <tt>->qlen_last_fqs_check</tt> and | 1139 | The <tt>->qlen_last_fqs_check</tt> and |
1063 | <tt>->n_force_qs_snap</tt> coordinate the forcing of quiescent | 1140 | <tt>->n_force_qs_snap</tt> coordinate the forcing of quiescent |
1064 | states from <tt>call_rcu()</tt> and friends when callback | 1141 | states from <tt>call_rcu()</tt> and friends when callback |
@@ -1069,6 +1146,10 @@ lists grow excessively long. | |||
1069 | fields count the number of callbacks invoked, | 1146 | fields count the number of callbacks invoked, |
1070 | sent to other CPUs when this CPU goes offline, | 1147 | sent to other CPUs when this CPU goes offline, |
1071 | and received from other CPUs when those other CPUs go offline. | 1148 | and received from other CPUs when those other CPUs go offline. |
1149 | The <tt>->n_nocbs_invoked</tt> counter is used when the CPU's callbacks | ||
1150 | are offloaded to a kthread. | ||
1151 | |||
1152 | <p> | ||
1072 | Finally, the <tt>->blimit</tt> counter is the maximum number of | 1153 | Finally, the <tt>->blimit</tt> counter is the maximum number of |
1073 | RCU callbacks that may be invoked at a given time. | 1154 | RCU callbacks that may be invoked at a given time. |
1074 | 1155 | ||
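[Editor's note] The role of ->blimit can be illustrated with a short, hedged sketch of a batch-limited invocation loop. This is not the kernel's rcu_do_batch(); it only shows the batching idea, with leftover callbacks remaining on the caller's list.

    /* Sketch: invoke at most "blimit" ready-to-invoke callbacks. */
    static void invoke_ready_cbs_sketch(struct rcu_head **listp, long blimit)
    {
            struct rcu_head *rhp;
            long count = 0;

            while ((rhp = *listp) != NULL && count < blimit) {
                    *listp = rhp->next;     /* Unlink the callback... */
                    rhp->func(rhp);         /* ...and invoke it. */
                    count++;
            }
    }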
@@ -1104,6 +1185,9 @@ Its fields are as follows: | |||
1104 | 1 int dynticks_nesting; | 1185 | 1 int dynticks_nesting; |
1105 | 2 int dynticks_nmi_nesting; | 1186 | 2 int dynticks_nmi_nesting; |
1106 | 3 atomic_t dynticks; | 1187 | 3 atomic_t dynticks; |
1188 | 4 bool rcu_need_heavy_qs; | ||
1189 | 5 unsigned long rcu_qs_ctr; | ||
1190 | 6 bool rcu_urgent_qs; | ||
1107 | </pre> | 1191 | </pre> |
1108 | 1192 | ||
1109 | <p>The <tt>->dynticks_nesting</tt> field counts the | 1193 | <p>The <tt>->dynticks_nesting</tt> field counts the |
@@ -1117,11 +1201,32 @@ NMIs are counted by the <tt>->dynticks_nmi_nesting</tt> | |||
1117 | field, except that NMIs that interrupt non-dyntick-idle execution | 1201 | field, except that NMIs that interrupt non-dyntick-idle execution |
1118 | are not counted. | 1202 | are not counted. |
1119 | 1203 | ||
1120 | </p><p>Finally, the <tt>->dynticks</tt> field counts the corresponding | 1204 | </p><p>The <tt>->dynticks</tt> field counts the corresponding |
1121 | CPU's transitions to and from dyntick-idle mode, so that this counter | 1205 | CPU's transitions to and from dyntick-idle mode, so that this counter |
1122 | has an even value when the CPU is in dyntick-idle mode and an odd | 1206 | has an even value when the CPU is in dyntick-idle mode and an odd |
1123 | value otherwise. | 1207 | value otherwise. |
1124 | 1208 | ||
1209 | </p><p>The <tt>->rcu_need_heavy_qs</tt> field is used | ||
1210 | to record the fact that the RCU core code would really like to | ||
1211 | see a quiescent state from the corresponding CPU, so much so that | ||
1212 | it is willing to call for heavy-weight dyntick-counter operations. | ||
1213 | This flag is checked by RCU's context-switch and <tt>cond_resched()</tt> | ||
1214 | code, which provide a momentary idle sojourn in response. | ||
1215 | |||
1216 | </p><p>The <tt>->rcu_qs_ctr</tt> field is used to record | ||
1217 | quiescent states from <tt>cond_resched()</tt>. | ||
1218 | Because <tt>cond_resched()</tt> can execute quite frequently, this | ||
1219 | must be quite lightweight, as in a non-atomic increment of this | ||
1220 | per-CPU field. | ||
1221 | |||
1222 | </p><p>Finally, the <tt>->rcu_urgent_qs</tt> field is used to record | ||
1223 | the fact that the RCU core code would really like to see a quiescent | ||
1224 | state from the corresponding CPU, with the various other fields indicating | ||
1225 | just how badly RCU wants this quiescent state. | ||
1226 | This flag is checked by RCU's context-switch and <tt>cond_resched()</tt> | ||
1227 | code, which, if nothing else, non-atomically increments <tt>->rcu_qs_ctr</tt> | ||
1228 | in response. | ||
1229 | |||
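[Editor's note] A hedged sketch of the checks just described, using the field names from the listing above; the helper name and calling context are illustrative, not the kernel's actual hooks.

    /* Sketch: lightweight quiescent-state reporting from cond_resched()-like code. */
    static inline void rcu_note_qs_sketch(struct rcu_dynticks *rdtp)
    {
            if (unlikely(READ_ONCE(rdtp->rcu_urgent_qs))) {
                    WRITE_ONCE(rdtp->rcu_urgent_qs, false);
                    rdtp->rcu_qs_ctr++;     /* Non-atomic increment records the QS cheaply. */
            }
    }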
1125 | <table> | 1230 | <table> |
1126 | <tr><th> </th></tr> | 1231 | <tr><th> </th></tr> |
1127 | <tr><th align="left">Quick Quiz:</th></tr> | 1232 | <tr><th align="left">Quick Quiz:</th></tr> |
diff --git a/Documentation/RCU/Design/Data-Structures/nxtlist.svg b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
index abc4cc73a097..0223e79c38e0 100644
--- a/Documentation/RCU/Design/Data-Structures/nxtlist.svg
+++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg
@@ -19,7 +19,7 @@ | |||
19 | id="svg2" | 19 | id="svg2" |
20 | version="1.1" | 20 | version="1.1" |
21 | inkscape:version="0.48.4 r9939" | 21 | inkscape:version="0.48.4 r9939" |
22 | sodipodi:docname="nxtlist.fig"> | 22 | sodipodi:docname="segcblist.svg"> |
23 | <metadata | 23 | <metadata |
24 | id="metadata94"> | 24 | id="metadata94"> |
25 | <rdf:RDF> | 25 | <rdf:RDF> |
@@ -28,7 +28,7 @@ | |||
28 | <dc:format>image/svg+xml</dc:format> | 28 | <dc:format>image/svg+xml</dc:format> |
29 | <dc:type | 29 | <dc:type |
30 | rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> | 30 | rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> |
31 | <dc:title></dc:title> | 31 | <dc:title /> |
32 | </cc:Work> | 32 | </cc:Work> |
33 | </rdf:RDF> | 33 | </rdf:RDF> |
34 | </metadata> | 34 | </metadata> |
@@ -241,61 +241,51 @@ | |||
241 | xml:space="preserve" | 241 | xml:space="preserve" |
242 | x="225" | 242 | x="225" |
243 | y="675" | 243 | y="675" |
244 | fill="#000000" | ||
245 | font-family="Courier" | ||
246 | font-style="normal" | 244 | font-style="normal" |
247 | font-weight="bold" | 245 | font-weight="bold" |
248 | font-size="324" | 246 | font-size="324" |
249 | text-anchor="start" | 247 | id="text64" |
250 | id="text64">nxtlist</text> | 248 | style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->head</text> |
251 | <!-- Text --> | 249 | <!-- Text --> |
252 | <text | 250 | <text |
253 | xml:space="preserve" | 251 | xml:space="preserve" |
254 | x="225" | 252 | x="225" |
255 | y="1800" | 253 | y="1800" |
256 | fill="#000000" | ||
257 | font-family="Courier" | ||
258 | font-style="normal" | 254 | font-style="normal" |
259 | font-weight="bold" | 255 | font-weight="bold" |
260 | font-size="324" | 256 | font-size="324" |
261 | text-anchor="start" | 257 | id="text66" |
262 | id="text66">nxttail[RCU_DONE_TAIL]</text> | 258 | style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_DONE_TAIL]</text> |
263 | <!-- Text --> | 259 | <!-- Text --> |
264 | <text | 260 | <text |
265 | xml:space="preserve" | 261 | xml:space="preserve" |
266 | x="225" | 262 | x="225" |
267 | y="2925" | 263 | y="2925" |
268 | fill="#000000" | ||
269 | font-family="Courier" | ||
270 | font-style="normal" | 264 | font-style="normal" |
271 | font-weight="bold" | 265 | font-weight="bold" |
272 | font-size="324" | 266 | font-size="324" |
273 | text-anchor="start" | 267 | id="text68" |
274 | id="text68">nxttail[RCU_WAIT_TAIL]</text> | 268 | style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_WAIT_TAIL]</text> |
275 | <!-- Text --> | 269 | <!-- Text --> |
276 | <text | 270 | <text |
277 | xml:space="preserve" | 271 | xml:space="preserve" |
278 | x="225" | 272 | x="225" |
279 | y="4050" | 273 | y="4050" |
280 | fill="#000000" | ||
281 | font-family="Courier" | ||
282 | font-style="normal" | 274 | font-style="normal" |
283 | font-weight="bold" | 275 | font-weight="bold" |
284 | font-size="324" | 276 | font-size="324" |
285 | text-anchor="start" | 277 | id="text70" |
286 | id="text70">nxttail[RCU_NEXT_READY_TAIL]</text> | 278 | style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_NEXT_READY_TAIL]</text> |
287 | <!-- Text --> | 279 | <!-- Text --> |
288 | <text | 280 | <text |
289 | xml:space="preserve" | 281 | xml:space="preserve" |
290 | x="225" | 282 | x="225" |
291 | y="5175" | 283 | y="5175" |
292 | fill="#000000" | ||
293 | font-family="Courier" | ||
294 | font-style="normal" | 284 | font-style="normal" |
295 | font-weight="bold" | 285 | font-weight="bold" |
296 | font-size="324" | 286 | font-size="324" |
297 | text-anchor="start" | 287 | id="text72" |
298 | id="text72">nxttail[RCU_NEXT_TAIL]</text> | 288 | style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_NEXT_TAIL]</text> |
299 | <!-- Text --> | 289 | <!-- Text --> |
300 | <text | 290 | <text |
301 | xml:space="preserve" | 291 | xml:space="preserve" |
diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
index 7a3194c5559a..e5d0bbd0230b 100644
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
@@ -284,6 +284,7 @@ Expedited Grace Period Refinements</a></h2> | |||
284 | Funnel locking and wait/wakeup</a>. | 284 | Funnel locking and wait/wakeup</a>. |
285 | <li> <a href="#Use of Workqueues">Use of Workqueues</a>. | 285 | <li> <a href="#Use of Workqueues">Use of Workqueues</a>. |
286 | <li> <a href="#Stall Warnings">Stall warnings</a>. | 286 | <li> <a href="#Stall Warnings">Stall warnings</a>. |
287 | <li> <a href="#Mid-Boot Operation">Mid-boot operation</a>. | ||
287 | </ol> | 288 | </ol> |
288 | 289 | ||
289 | <h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3> | 290 | <h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3> |
@@ -524,7 +525,7 @@ their grace periods and carrying out their wakeups. | |||
524 | In earlier implementations, the task requesting the expedited | 525 | In earlier implementations, the task requesting the expedited |
525 | grace period also drove it to completion. | 526 | grace period also drove it to completion. |
526 | This straightforward approach had the disadvantage of needing to | 527 | This straightforward approach had the disadvantage of needing to |
527 | account for signals sent to user tasks, | 528 | account for POSIX signals sent to user tasks, |
528 | so more recent implementations use the Linux kernel's | 529 | so more recent implementations use the Linux kernel's |
529 | <a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>. | 530 | <a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>. |
530 | 531 | ||
@@ -533,8 +534,8 @@ The requesting task still does counter snapshotting and funnel-lock | |||
533 | processing, but the task reaching the top of the funnel lock | 534 | processing, but the task reaching the top of the funnel lock |
534 | does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt>) | 535 | does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt>)
535 | so that a workqueue kthread does the actual grace-period processing. | 536 | so that a workqueue kthread does the actual grace-period processing. |
536 | Because workqueue kthreads do not accept signals, grace-period-wait | 537 | Because workqueue kthreads do not accept POSIX signals, grace-period-wait |
537 | processing need not allow for signals. | 538 | processing need not allow for POSIX signals. |
538 | 539 | ||
539 | In addition, this approach allows wakeups for the previous expedited | 540 | In addition, this approach allows wakeups for the previous expedited |
540 | grace period to be overlapped with processing for the next expedited | 541 | grace period to be overlapped with processing for the next expedited |
@@ -586,6 +587,46 @@ blocking the current grace period are printed. | |||
586 | Each stall warning results in another pass through the loop, but the | 587 | Each stall warning results in another pass through the loop, but the |
587 | second and subsequent passes use longer stall times. | 588 | second and subsequent passes use longer stall times. |
588 | 589 | ||
590 | <h3><a name="Mid-Boot Operation">Mid-boot operation</a></h3> | ||
591 | |||
592 | <p> | ||
593 | The use of workqueues has the advantage that the expedited | ||
594 | grace-period code need not worry about POSIX signals. | ||
595 | Unfortunately, it has the | ||
596 | corresponding disadvantage that workqueues cannot be used until | ||
597 | they are initialized, which does not happen until some time after | ||
598 | the scheduler spawns the first task. | ||
599 | Given that there are parts of the kernel that really do want to | ||
600 | execute grace periods during this mid-boot “dead zone”, | ||
601 | expedited grace periods must do something else during this time. | ||
602 | |||
603 | <p> | ||
604 | What they do is to fall back to the old practice of requiring that the | ||
605 | requesting task drive the expedited grace period, as was the case | ||
606 | before the use of workqueues. | ||
607 | However, the requesting task is only required to drive the grace period | ||
608 | during the mid-boot dead zone. | ||
609 | Before mid-boot, a synchronous grace period is a no-op. | ||
610 | Some time after mid-boot, workqueues are used. | ||
611 | |||
612 | <p> | ||
613 | Non-expedited non-SRCU synchronous grace periods must also operate | ||
614 | normally during mid-boot. | ||
615 | This is handled by causing non-expedited grace periods to take the | ||
616 | expedited code path during mid-boot. | ||
617 | |||
618 | <p> | ||
619 | The current code assumes that there are no POSIX signals during | ||
620 | the mid-boot dead zone. | ||
621 | However, if an overwhelming need for POSIX signals somehow arises, | ||
622 | appropriate adjustments can be made to the expedited stall-warning code. | ||
623 | One such adjustment would reinstate the pre-workqueue stall-warning | ||
624 | checks, but only during the mid-boot dead zone. | ||
625 | |||
626 | <p> | ||
627 | With this refinement, synchronous grace periods can now be used from | ||
628 | task context pretty much any time during the life of the kernel. | ||
629 | |||
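[Editor's note] The decision logic described in this section might be sketched as follows. The helpers exp_workqueues_ready(), exp_drive_gp_in_task(), and exp_drive_gp_via_workqueue() are hypothetical stand-ins for the real code paths; rcu_scheduler_active and RCU_SCHEDULER_INACTIVE are the kernel symbols used to detect very early boot.

    /* Hypothetical helpers standing in for the real expedited-GP machinery. */
    static bool exp_workqueues_ready(void);
    static void exp_drive_gp_in_task(void);
    static void exp_drive_gp_via_workqueue(void);

    /* Sketch: choose how to drive an expedited grace period, by boot phase. */
    static void synchronize_rcu_expedited_sketch(void)
    {
            if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
                    return;                       /* Very early boot: a synchronous GP is a no-op. */
            if (!exp_workqueues_ready())
                    exp_drive_gp_in_task();       /* Mid-boot dead zone: requesting task drives the GP. */
            else
                    exp_drive_gp_via_workqueue(); /* Normal operation: a workqueue kthread drives it. */
    }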
589 | <h3><a name="Summary"> | 630 | <h3><a name="Summary"> |
590 | Summary</a></h3> | 631 | Summary</a></h3> |
591 | 632 | ||
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index 21593496aca6..f60adf112663 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -659,8 +659,9 @@ systems with more than one CPU: | |||
659 | In other words, a given instance of <tt>synchronize_rcu()</tt> | 659 | In other words, a given instance of <tt>synchronize_rcu()</tt> |
660 | can avoid waiting on a given RCU read-side critical section only | 660 | can avoid waiting on a given RCU read-side critical section only |
661 | if it can prove that <tt>synchronize_rcu()</tt> started first. | 661 | if it can prove that <tt>synchronize_rcu()</tt> started first. |
662 | </font> | ||
662 | 663 | ||
663 | <p> | 664 | <p><font color="ffffff"> |
664 | A related question is “When <tt>rcu_read_lock()</tt> | 665 | A related question is “When <tt>rcu_read_lock()</tt> |
665 | doesn't generate any code, why does it matter how it relates | 666 | doesn't generate any code, why does it matter how it relates |
666 | to a grace period?” | 667 | to a grace period?” |
@@ -675,8 +676,9 @@ systems with more than one CPU: | |||
675 | within the critical section, in which case none of the accesses | 676 | within the critical section, in which case none of the accesses |
676 | within the critical section may observe the effects of any | 677 | within the critical section may observe the effects of any |
677 | access following the grace period. | 678 | access following the grace period. |
679 | </font> | ||
678 | 680 | ||
679 | <p> | 681 | <p><font color="ffffff"> |
680 | As of late 2016, mathematical models of RCU take this | 682 | As of late 2016, mathematical models of RCU take this |
681 | viewpoint, for example, see slides 62 and 63 | 683 | viewpoint, for example, see slides 62 and 63 |
682 | of the | 684 | of the |
@@ -1616,8 +1618,8 @@ CPUs should at least make reasonable forward progress. | |||
1616 | In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> | 1618 | In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt> |
1617 | is permitted to impose modest degradation of real-time latency | 1619 | is permitted to impose modest degradation of real-time latency |
1618 | on non-idle online CPUs. | 1620 | on non-idle online CPUs. |
1619 | That said, it will likely be necessary to take further steps to reduce this | 1621 | Here, “modest” means roughly the same latency |
1620 | degradation, hopefully to roughly that of a scheduling-clock interrupt. | 1622 | degradation as a scheduling-clock interrupt. |
1621 | 1623 | ||
1622 | <p> | 1624 | <p> |
1623 | There are a number of situations where even | 1625 | There are a number of situations where even |
@@ -1913,12 +1915,9 @@ This requirement is another factor driving batching of grace periods, | |||
1913 | but it is also the driving force behind the checks for large numbers | 1915 | but it is also the driving force behind the checks for large numbers |
1914 | of queued RCU callbacks in the <tt>call_rcu()</tt> code path. | 1916 | of queued RCU callbacks in the <tt>call_rcu()</tt> code path. |
1915 | Finally, high update rates should not delay RCU read-side critical | 1917 | Finally, high update rates should not delay RCU read-side critical |
1916 | sections, although some read-side delays can occur when using | 1918 | sections, although some small read-side delays can occur when using |
1917 | <tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use | 1919 | <tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use |
1918 | of <tt>try_stop_cpus()</tt>. | 1920 | of <tt>smp_call_function_single()</tt>. |
1919 | (In the future, <tt>synchronize_rcu_expedited()</tt> will be | ||
1920 | converted to use lighter-weight inter-processor interrupts (IPIs), | ||
1921 | but this will still disturb readers, though to a much smaller degree.) | ||
1922 | 1921 | ||
1923 | <p> | 1922 | <p> |
1924 | Although all three of these corner cases were understood in the early | 1923 | Although all three of these corner cases were understood in the early |
@@ -2154,7 +2153,8 @@ as will <tt>rcu_assign_pointer()</tt>. | |||
2154 | <p> | 2153 | <p> |
2155 | Although <tt>call_rcu()</tt> may be invoked at any | 2154 | Although <tt>call_rcu()</tt> may be invoked at any |
2156 | time during boot, callbacks are not guaranteed to be invoked until after | 2155 | time during boot, callbacks are not guaranteed to be invoked until after |
2157 | the scheduler is fully up and running. | 2156 | all of RCU's kthreads have been spawned, which occurs at |
2157 | <tt>early_initcall()</tt> time. | ||
2158 | This delay in callback invocation is due to the fact that RCU does not | 2158 | This delay in callback invocation is due to the fact that RCU does not |
2159 | invoke callbacks until it is fully initialized, and this full initialization | 2159 | invoke callbacks until it is fully initialized, and this full initialization |
2160 | cannot occur until after the scheduler has initialized itself to the | 2160 | cannot occur until after the scheduler has initialized itself to the |
@@ -2167,8 +2167,10 @@ on what operations those callbacks could invoke. | |||
2167 | Perhaps surprisingly, <tt>synchronize_rcu()</tt>, | 2167 | Perhaps surprisingly, <tt>synchronize_rcu()</tt>, |
2168 | <a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> | 2168 | <a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a> |
2169 | (<a href="#Bottom-Half Flavor">discussed below</a>), | 2169 | (<a href="#Bottom-Half Flavor">discussed below</a>), |
2170 | and | 2170 | <a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>, |
2171 | <a href="#Sched Flavor"><tt>synchronize_sched()</tt></a> | 2171 | <tt>synchronize_rcu_expedited()</tt>, |
2172 | <tt>synchronize_rcu_bh_expedited()</tt>, and | ||
2173 | <tt>synchronize_sched_expedited()</tt> | ||
2172 | will all operate normally | 2174 | will all operate normally |
2173 | during very early boot, the reason being that there is only one CPU | 2175 | during very early boot, the reason being that there is only one CPU |
2174 | and preemption is disabled. | 2176 | and preemption is disabled. |
@@ -2178,45 +2180,59 @@ state and thus a grace period, so the early-boot implementation can | |||
2178 | be a no-op. | 2180 | be a no-op. |
2179 | 2181 | ||
2180 | <p> | 2182 | <p> |
2181 | Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt> | 2183 | However, once the scheduler has spawned its first kthread, this early |
2182 | continue to operate normally through the remainder of boot, courtesy | 2184 | boot trick fails for <tt>synchronize_rcu()</tt> (as well as for |
2183 | of the fact that preemption is disabled across their RCU read-side | 2185 | <tt>synchronize_rcu_expedited()</tt>) in <tt>CONFIG_PREEMPT=y</tt> |
2184 | critical sections and also courtesy of the fact that there is still | 2186 | kernels. |
2185 | only one CPU. | 2187 | The reason is that an RCU read-side critical section might be preempted, |
2186 | However, once the scheduler starts initializing, preemption is enabled. | 2188 | which means that a subsequent <tt>synchronize_rcu()</tt> really does have |
2187 | There is still only a single CPU, but the fact that preemption is enabled | 2189 | to wait for something, as opposed to simply returning immediately. |
2188 | means that the no-op implementation of <tt>synchronize_rcu()</tt> no | 2190 | Unfortunately, <tt>synchronize_rcu()</tt> can't do this until all of |
2189 | longer works in <tt>CONFIG_PREEMPT=y</tt> kernels. | 2191 | its kthreads are spawned, which doesn't happen until some time during |
2190 | Therefore, as soon as the scheduler starts initializing, the early-boot | 2192 | <tt>early_initcalls()</tt> time. |
2191 | fastpath is disabled. | 2193 | But this is no excuse: RCU is nevertheless required to correctly handle |
2192 | This means that <tt>synchronize_rcu()</tt> switches to its runtime | 2194 | synchronous grace periods during this time period. |
2193 | mode of operation where it posts callbacks, which in turn means that | 2195 | Once all of its kthreads are up and running, RCU starts running |
2194 | any call to <tt>synchronize_rcu()</tt> will block until the corresponding | 2196 | normally. |
2195 | callback is invoked. | ||
2196 | Unfortunately, the callback cannot be invoked until RCU's runtime | ||
2197 | grace-period machinery is up and running, which cannot happen until | ||
2198 | the scheduler has initialized itself sufficiently to allow RCU's | ||
2199 | kthreads to be spawned. | ||
2200 | Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler | ||
2201 | initialization can result in deadlock. | ||
2202 | 2197 | ||
2203 | <table> | 2198 | <table> |
2204 | <tr><th> </th></tr> | 2199 | <tr><th> </th></tr> |
2205 | <tr><th align="left">Quick Quiz:</th></tr> | 2200 | <tr><th align="left">Quick Quiz:</th></tr> |
2206 | <tr><td> | 2201 | <tr><td> |
2207 | So what happens with <tt>synchronize_rcu()</tt> during | 2202 | How can RCU possibly handle grace periods before all of its |
2208 | scheduler initialization for <tt>CONFIG_PREEMPT=n</tt> | 2203 | kthreads have been spawned??? |
2209 | kernels? | ||
2210 | </td></tr> | 2204 | </td></tr> |
2211 | <tr><th align="left">Answer:</th></tr> | 2205 | <tr><th align="left">Answer:</th></tr> |
2212 | <tr><td bgcolor="#ffffff"><font color="ffffff"> | 2206 | <tr><td bgcolor="#ffffff"><font color="ffffff"> |
2213 | In <tt>CONFIG_PREEMPT=n</tt> kernel, <tt>synchronize_rcu()</tt> | 2207 | Very carefully! |
2214 | maps directly to <tt>synchronize_sched()</tt>. | 2208 | </font> |
2215 | Therefore, <tt>synchronize_rcu()</tt> works normally throughout | 2209 | |
2216 | boot in <tt>CONFIG_PREEMPT=n</tt> kernels. | 2210 | <p><font color="ffffff"> |
2217 | However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels, | 2211 | During the “dead zone” between the time that the |
2218 | so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt> | 2212 | scheduler spawns the first task and the time that all of RCU's |
2219 | during scheduler initialization. | 2213 | kthreads have been spawned, all synchronous grace periods are |
2214 | handled by the expedited grace-period mechanism. | ||
2215 | At runtime, this expedited mechanism relies on workqueues, but | ||
2216 | during the dead zone the requesting task itself drives the | ||
2217 | desired expedited grace period. | ||
2218 | Because dead-zone execution takes place within task context, | ||
2219 | everything works. | ||
2220 | Once the dead zone ends, expedited grace periods go back to | ||
2221 | using workqueues, as is required to avoid problems that would | ||
2222 | otherwise occur when a user task received a POSIX signal while | ||
2223 | driving an expedited grace period. | ||
2224 | </font> | ||
2225 | |||
2226 | <p><font color="ffffff"> | ||
2227 | And yes, this does mean that it is unhelpful to send POSIX | ||
2228 | signals to random tasks between the time that the scheduler | ||
2229 | spawns its first kthread and the time that RCU's kthreads | ||
2230 | have all been spawned. | ||
2231 | If there ever turns out to be a good reason for sending POSIX | ||
2232 | signals during that time, appropriate adjustments will be made. | ||
2233 | (If it turns out that POSIX signals are sent during this time for | ||
2234 | no good reason, other adjustments will be made, appropriate | ||
2235 | or otherwise.) | ||
2220 | </font></td></tr> | 2236 | </font></td></tr> |
2221 | <tr><td> </td></tr> | 2237 | <tr><td> </td></tr> |
2222 | </table> | 2238 | </table> |
@@ -2295,12 +2311,61 @@ situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU. | |||
2295 | The need for <tt>rcu_barrier()</tt> for module unloading became | 2311 | The need for <tt>rcu_barrier()</tt> for module unloading became |
2296 | apparent later. | 2312 | apparent later. |
2297 | 2313 | ||
2314 | <p> | ||
2315 | <b>Important note</b>: The <tt>rcu_barrier()</tt> function is not, | ||
2316 | repeat, <i>not</i>, obligated to wait for a grace period. | ||
2317 | It is instead only required to wait for RCU callbacks that have | ||
2318 | already been posted. | ||
2319 | Therefore, if there are no RCU callbacks posted anywhere in the system, | ||
2320 | <tt>rcu_barrier()</tt> is within its rights to return immediately. | ||
2321 | Even if there are callbacks posted, <tt>rcu_barrier()</tt> does not | ||
2322 | necessarily need to wait for a grace period. | ||
2323 | |||
2324 | <table> | ||
2325 | <tr><th> </th></tr> | ||
2326 | <tr><th align="left">Quick Quiz:</th></tr> | ||
2327 | <tr><td> | ||
2328 | Wait a minute! | ||
2329 | Each RCU callback must wait for a grace period to complete, | ||
2330 | and <tt>rcu_barrier()</tt> must wait for each pre-existing | ||
2331 | callback to be invoked. | ||
2332 | Doesn't <tt>rcu_barrier()</tt> therefore need to wait for | ||
2333 | a full grace period if there is even one callback posted anywhere | ||
2334 | in the system? | ||
2335 | </td></tr> | ||
2336 | <tr><th align="left">Answer:</th></tr> | ||
2337 | <tr><td bgcolor="#ffffff"><font color="ffffff"> | ||
2338 | Absolutely not!!! | ||
2339 | </font> | ||
2340 | |||
2341 | <p><font color="ffffff"> | ||
2342 | Yes, each RCU callback must wait for a grace period to complete, | ||
2343 | but it might well be partly (or even completely) finished waiting | ||
2344 | by the time <tt>rcu_barrier()</tt> is invoked. | ||
2345 | In that case, <tt>rcu_barrier()</tt> need only wait for the | ||
2346 | remaining portion of the grace period to elapse. | ||
2347 | So even if there are quite a few callbacks posted, | ||
2348 | <tt>rcu_barrier()</tt> might well return quite quickly. | ||
2349 | </font> | ||
2350 | |||
2351 | <p><font color="ffffff"> | ||
2352 | So if you need to wait for a grace period as well as for all | ||
2353 | pre-existing callbacks, you will need to invoke both | ||
2354 | <tt>synchronize_rcu()</tt> and <tt>rcu_barrier()</tt>. | ||
2355 | If latency is a concern, you can always use workqueues | ||
2356 | to invoke them concurrently. | ||
2357 | </font></td></tr> | ||
2358 | <tr><td> </td></tr> | ||
2359 | </table> | ||
2360 | |||
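[Editor's note] Putting the answer above into code, the pattern for waiting for both a full grace period and all previously posted callbacks is simply to invoke both primitives. This sketch shows the straightforward sequential form; the concurrent, workqueue-based form mentioned above is omitted for brevity.

    /* Wait for a full grace period and for all previously posted callbacks. */
    static void wait_for_gp_and_prior_callbacks(void)
    {
            synchronize_rcu();      /* Waits for a full RCU grace period. */
            rcu_barrier();          /* Waits for all previously queued RCU callbacks. */
    }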
2298 | <h3><a name="Hotplug CPU">Hotplug CPU</a></h3> | 2361 | <h3><a name="Hotplug CPU">Hotplug CPU</a></h3> |
2299 | 2362 | ||
2300 | <p> | 2363 | <p> |
2301 | The Linux kernel supports CPU hotplug, which means that CPUs | 2364 | The Linux kernel supports CPU hotplug, which means that CPUs |
2302 | can come and go. | 2365 | can come and go. |
2303 | It is of course illegal to use any RCU API member from an offline CPU. | 2366 | It is of course illegal to use any RCU API member from an offline CPU, |
2367 | with the exception of <a href="#Sleepable RCU">SRCU</a> read-side | ||
2368 | critical sections. | ||
2304 | This requirement was present from day one in DYNIX/ptx, but | 2369 | This requirement was present from day one in DYNIX/ptx, but |
2305 | on the other hand, the Linux kernel's CPU-hotplug implementation | 2370 | on the other hand, the Linux kernel's CPU-hotplug implementation |
2306 | is “interesting.” | 2371 | is “interesting.” |
@@ -2310,19 +2375,18 @@ The Linux-kernel CPU-hotplug implementation has notifiers that | |||
2310 | are used to allow the various kernel subsystems (including RCU) | 2375 | are used to allow the various kernel subsystems (including RCU) |
2311 | to respond appropriately to a given CPU-hotplug operation. | 2376 | to respond appropriately to a given CPU-hotplug operation. |
2312 | Most RCU operations may be invoked from CPU-hotplug notifiers, | 2377 | Most RCU operations may be invoked from CPU-hotplug notifiers, |
2313 | including even normal synchronous grace-period operations | 2378 | including even synchronous grace-period operations such as |
2314 | such as <tt>synchronize_rcu()</tt>. | 2379 | <tt>synchronize_rcu()</tt> and <tt>synchronize_rcu_expedited()</tt>. |
2315 | However, expedited grace-period operations such as | ||
2316 | <tt>synchronize_rcu_expedited()</tt> are not supported, | ||
2317 | due to the fact that current implementations block CPU-hotplug | ||
2318 | operations, which could result in deadlock. | ||
2319 | 2380 | ||
2320 | <p> | 2381 | <p> |
2321 | In addition, all-callback-wait operations such as | 2382 | However, all-callback-wait operations such as |
2322 | <tt>rcu_barrier()</tt> are also not supported, due to the | 2383 | <tt>rcu_barrier()</tt> are also not supported, due to the |
2323 | fact that there are phases of CPU-hotplug operations where | 2384 | fact that there are phases of CPU-hotplug operations where |
2324 | the outgoing CPU's callbacks will not be invoked until after | 2385 | the outgoing CPU's callbacks will not be invoked until after |
2325 | the CPU-hotplug operation ends, which could also result in deadlock. | 2386 | the CPU-hotplug operation ends, which could also result in deadlock. |
2387 | Furthermore, <tt>rcu_barrier()</tt> blocks CPU-hotplug operations | ||
2388 | during its execution, which results in another type of deadlock | ||
2389 | when invoked from a CPU-hotplug notifier. | ||
2326 | 2390 | ||
2327 | <h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> | 2391 | <h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3> |
2328 | 2392 | ||
@@ -2864,6 +2928,27 @@ API, which, in combination with <tt>srcu_read_unlock()</tt>, | |||
2864 | guarantees a full memory barrier. | 2928 | guarantees a full memory barrier. |
2865 | 2929 | ||
2866 | <p> | 2930 | <p> |
2931 | Also unlike other RCU flavors, SRCU's callbacks-wait function | ||
2932 | <tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers, | ||
2933 | though this is not necessarily a good idea. | ||
2934 | The reason that this is possible is that SRCU is insensitive | ||
2935 | to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt> | ||
2936 | need not exclude CPU-hotplug operations. | ||
2937 | |||
2938 | <p> | ||
2939 | As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating | ||
2940 | a locking bottleneck present in prior kernel versions. | ||
2941 | Although this will allow users to put much heavier stress on | ||
2942 | <tt>call_srcu()</tt>, it is important to note that SRCU does not | ||
2943 | yet take any special steps to deal with callback flooding. | ||
2944 | So if you are posting (say) 10,000 SRCU callbacks per second per CPU, | ||
2945 | you are probably totally OK, but if you intend to post (say) 1,000,000 | ||
2946 | SRCU callbacks per second per CPU, please run some tests first. | ||
2947 | SRCU just might need a few adjustment to deal with that sort of load. | ||
2948 | Of course, your mileage may vary based on the speed of your CPUs and | ||
2949 | the size of your memory. | ||
2950 | |||
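[Editor's note] For reference, a minimal call_srcu() usage sketch follows; my_srcu, struct foo, and the foo_*() helpers are illustrative names, while DEFINE_SRCU(), call_srcu(), container_of(), and kfree() are real kernel APIs.

    DEFINE_SRCU(my_srcu);                   /* Statically allocated srcu_struct. */

    struct foo {
            struct rcu_head rh;
            int data;
    };

    static void foo_reclaim(struct rcu_head *rhp)
    {
            kfree(container_of(rhp, struct foo, rh));
    }

    static void foo_retire(struct foo *fp)
    {
            /* Free fp only after a full SRCU grace period for my_srcu. */
            call_srcu(&my_srcu, &fp->rh, foo_reclaim);
    }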
2951 | <p> | ||
2867 | The | 2952 | The |
2868 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> | 2953 | <a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a> |
2869 | includes | 2954 | includes |
@@ -3021,8 +3106,8 @@ to do some redesign to avoid this scalability problem. | |||
3021 | 3106 | ||
3022 | <p> | 3107 | <p> |
3023 | RCU disables CPU hotplug in a few places, perhaps most notably in the | 3108 | RCU disables CPU hotplug in a few places, perhaps most notably in the |
3024 | expedited grace-period and <tt>rcu_barrier()</tt> operations. | 3109 | <tt>rcu_barrier()</tt> operations. |
3025 | If there is a strong reason to use expedited grace periods in CPU-hotplug | 3110 | If there is a strong reason to use <tt>rcu_barrier()</tt> in CPU-hotplug |
3026 | notifiers, it will be necessary to avoid disabling CPU hotplug. | 3111 | notifiers, it will be necessary to avoid disabling CPU hotplug. |
3027 | This would introduce some complexity, so there had better be a <i>very</i> | 3112 | This would introduce some complexity, so there had better be a <i>very</i> |
3028 | good reason. | 3113 | good reason. |
@@ -3096,9 +3181,5 @@ Andy Lutomirski for their help in rendering | |||
3096 | this article human readable, and to Michelle Rankin for her support | 3181 | this article human readable, and to Michelle Rankin for her support |
3097 | of this effort. | 3182 | of this effort. |
3098 | Other contributions are acknowledged in the Linux kernel's git archive. | 3183 | Other contributions are acknowledged in the Linux kernel's git archive. |
3099 | The cartoon is copyright (c) 2013 by Melissa Broussard, | ||
3100 | and is provided | ||
3101 | under the terms of the Creative Commons Attribution-Share Alike 3.0 | ||
3102 | United States license. | ||
3103 | 3184 | ||
3104 | </body></html> | 3185 | </body></html> |
diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt
index c0bf2441a2ba..b2a613f16d74 100644
--- a/Documentation/RCU/rcu_dereference.txt
+++ b/Documentation/RCU/rcu_dereference.txt
@@ -138,6 +138,15 @@ o Be very careful about comparing pointers obtained from | |||
138 | This sort of comparison occurs frequently when scanning | 138 | This sort of comparison occurs frequently when scanning |
139 | RCU-protected circular linked lists. | 139 | RCU-protected circular linked lists. |
140 | 140 | ||
141 | Note that if checks for being within an RCU read-side | ||
142 | critical section are not required and the pointer is never | ||
143 | dereferenced, rcu_access_pointer() should be used in place | ||
144 | of rcu_dereference(). The rcu_access_pointer() primitive | ||
145 | does not require an enclosing read-side critical section, | ||
146 | and also omits the smp_read_barrier_depends() included in | ||
147 | rcu_dereference(), which in turn should provide a small | ||
148 | performance gain in some CPUs (e.g., the DEC Alpha). | ||
149 | |||
141 | o The comparison is against a pointer that references memory | 150 | o The comparison is against a pointer that references memory |
142 | that was initialized "a long time ago." The reason | 151 | that was initialized "a long time ago." The reason |
143 | this is safe is that even if misordering occurs, the | 152 | this is safe is that even if misordering occurs, the |
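[Editor's note] To illustrate the rcu_access_pointer() guidance added above, here is a brief sketch; gp and struct foo are illustrative, while rcu_access_pointer(), rcu_dereference(), rcu_read_lock(), and rcu_read_unlock() are the real primitives.

    struct foo {
            int data;
    };

    static struct foo __rcu *gp;            /* Illustrative RCU-protected pointer. */

    static bool gp_is_set(void)
    {
            /* Value-only check: no read-side critical section required. */
            return rcu_access_pointer(gp) != NULL;
    }

    static int gp_read_data(void)
    {
            struct foo *p;
            int val = -1;

            rcu_read_lock();
            p = rcu_dereference(gp);        /* Dereference requires the critical section. */
            if (p)
                    val = p->data;
            rcu_read_unlock();
            return val;
    }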
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
index 18f9651ff23d..8151f0195f76 100644
--- a/Documentation/RCU/rculist_nulls.txt
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -1,5 +1,5 @@ | |||
1 | Using hlist_nulls to protect read-mostly linked lists and | 1 | Using hlist_nulls to protect read-mostly linked lists and |
2 | objects using SLAB_DESTROY_BY_RCU allocations. | 2 | objects using SLAB_TYPESAFE_BY_RCU allocations. |
3 | 3 | ||
4 | Please read the basics in Documentation/RCU/listRCU.txt | 4 | Please read the basics in Documentation/RCU/listRCU.txt |
5 | 5 | ||
@@ -7,7 +7,7 @@ Using special makers (called 'nulls') is a convenient way | |||
7 | to solve following problem : | 7 | to solve following problem : |
8 | 8 | ||
9 | A typical RCU linked list managing objects which are | 9 | A typical RCU linked list managing objects which are |
10 | allocated with SLAB_DESTROY_BY_RCU kmem_cache can | 10 | allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can |
11 | use following algos : | 11 | use following algos : |
12 | 12 | ||
13 | 1) Lookup algo | 13 | 1) Lookup algo |
@@ -96,7 +96,7 @@ unlock_chain(); // typically a spin_unlock() | |||
96 | 3) Remove algo | 96 | 3) Remove algo |
97 | -------------- | 97 | -------------- |
98 | Nothing special here, we can use a standard RCU hlist deletion. | 98 | Nothing special here, we can use a standard RCU hlist deletion. |
99 | But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused | 99 | But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused |
100 | very very fast (before the end of RCU grace period) | 100 | very very fast (before the end of RCU grace period) |
101 | 101 | ||
102 | if (put_last_reference_on(obj) { | 102 | if (put_last_reference_on(obj) { |
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index e93d04133fe7..96a3d81837e1 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -1,9 +1,102 @@ | |||
1 | Using RCU's CPU Stall Detector | 1 | Using RCU's CPU Stall Detector |
2 | 2 | ||
3 | The rcu_cpu_stall_suppress module parameter enables RCU's CPU stall | 3 | This document first discusses what sorts of issues RCU's CPU stall |
4 | detector, which detects conditions that unduly delay RCU grace periods. | 4 | detector can locate, and then discusses kernel parameters and Kconfig |
5 | This module parameter enables CPU stall detection by default, but | 5 | options that can be used to fine-tune the detector's operation. Finally, |
6 | may be overridden via boot-time parameter or at runtime via sysfs. | 6 | this document explains the stall detector's "splat" format. |
7 | |||
8 | |||
9 | What Causes RCU CPU Stall Warnings? | ||
10 | |||
11 | So your kernel printed an RCU CPU stall warning. The next question is | ||
12 | "What caused it?" The following problems can result in RCU CPU stall | ||
13 | warnings: | ||
14 | |||
15 | o A CPU looping in an RCU read-side critical section. | ||
16 | |||
17 | o A CPU looping with interrupts disabled. | ||
18 | |||
19 | o A CPU looping with preemption disabled. This condition can | ||
20 | result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh | ||
21 | stalls. | ||
22 | |||
23 | o A CPU looping with bottom halves disabled. This condition can | ||
24 | result in RCU-sched and RCU-bh stalls. | ||
25 | |||
26 | o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the | ||
27 | kernel without invoking schedule(). Note that cond_resched() | ||
28 | does not necessarily prevent RCU CPU stall warnings. Therefore, | ||
29 | if the looping in the kernel is really expected and desirable | ||
30 | behavior, you might need to replace some of the cond_resched() | ||
31 | calls with calls to cond_resched_rcu_qs(). | ||
32 | |||
33 | o Booting Linux using a console connection that is too slow to | ||
34 | keep up with the boot-time console-message rate. For example, | ||
35 | a 115Kbaud serial console can be -way- too slow to keep up | ||
36 | with boot-time message rates, and will frequently result in | ||
37 | RCU CPU stall warning messages. Especially if you have added | ||
38 | debug printk()s. | ||
39 | |||
40 | o Anything that prevents RCU's grace-period kthreads from running. | ||
41 | This can result in the "All QSes seen" console-log message. | ||
42 | This message will include information on when the kthread last | ||
43 | ran and how often it should be expected to run. | ||
44 | |||
45 | o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might | ||
46 | happen to preempt a low-priority task in the middle of an RCU | ||
47 | read-side critical section. This is especially damaging if | ||
48 | that low-priority task is not permitted to run on any other CPU, | ||
49 | in which case the next RCU grace period can never complete, which | ||
50 | will eventually cause the system to run out of memory and hang. | ||
51 | While the system is in the process of running itself out of | ||
52 | memory, you might see stall-warning messages. | ||
53 | |||
54 | o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that | ||
55 | is running at a higher priority than the RCU softirq threads. | ||
56 | This will prevent RCU callbacks from ever being invoked, | ||
57 | and in a CONFIG_PREEMPT_RCU kernel will further prevent | ||
58 | RCU grace periods from ever completing. Either way, the | ||
59 | system will eventually run out of memory and hang. In the | ||
60 | CONFIG_PREEMPT_RCU case, you might see stall-warning | ||
61 | messages. | ||
62 | |||
63 | o A hardware or software issue shuts off the scheduler-clock | ||
64 | interrupt on a CPU that is not in dyntick-idle mode. This | ||
65 | problem really has happened, and seems to be most likely to | ||
66 | result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. | ||
67 | |||
68 | o A bug in the RCU implementation. | ||
69 | |||
70 | o A hardware failure. This is quite unlikely, but has occurred | ||
71 | at least once in real life. A CPU failed in a running system, | ||
72 | becoming unresponsive, but not causing an immediate crash. | ||
73 | This resulted in a series of RCU CPU stall warnings, eventually | ||
74 | leading to the realization that the CPU had failed. | ||
75 | |||
76 | The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall | ||
77 | warnings. Note that SRCU does -not- have CPU stall warnings. Please note | ||
78 | that RCU only detects CPU stalls when there is a grace period in progress. | ||
79 | No grace period, no CPU stall warnings. | ||
80 | |||
81 | To diagnose the cause of the stall, inspect the stack traces. | ||
82 | The offending function will usually be near the top of the stack. | ||
83 | If you have a series of stall warnings from a single extended stall, | ||
84 | comparing the stack traces can often help determine where the stall | ||
85 | is occurring, which will usually be in the function nearest the top of | ||
86 | that portion of the stack which remains the same from trace to trace. | ||
87 | If you can reliably trigger the stall, ftrace can be quite helpful. | ||
88 | |||
89 | RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE | ||
90 | and with RCU's event tracing. For information on RCU's event tracing, | ||
91 | see include/trace/events/rcu.h. | ||
92 | |||
93 | |||
94 | Fine-Tuning the RCU CPU Stall Detector | ||
95 | |||
96 | The rcupdate.rcu_cpu_stall_suppress module parameter disables RCU's | ||
97 | CPU stall detector, which detects conditions that unduly delay RCU grace | ||
98 | periods. This module parameter enables CPU stall detection by default, | ||
99 | but may be overridden via boot-time parameter or at runtime via sysfs. | ||
7 | The stall detector's idea of what constitutes "unduly delayed" is | 100 | The stall detector's idea of what constitutes "unduly delayed" is |
8 | controlled by a set of kernel configuration variables and cpp macros: | 101 | controlled by a set of kernel configuration variables and cpp macros: |
9 | 102 | ||
@@ -56,6 +149,9 @@ rcupdate.rcu_task_stall_timeout | |||
56 | And continues with the output of sched_show_task() for each | 149 | And continues with the output of sched_show_task() for each |
57 | task stalling the current RCU-tasks grace period. | 150 | task stalling the current RCU-tasks grace period. |
58 | 151 | ||
152 | |||
153 | Interpreting RCU's CPU Stall-Detector "Splats" | ||
154 | |||
59 | For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling, | 155 | For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling, |
60 | it will print a message similar to the following: | 156 | it will print a message similar to the following: |
61 | 157 | ||
@@ -178,89 +274,3 @@ grace period is in flight. | |||
178 | 274 | ||
179 | It is entirely possible to see stall warnings from normal and from | 275 | It is entirely possible to see stall warnings from normal and from |
180 | expedited grace periods at about the same time from the same run. | 276 | expedited grace periods at about the same time from the same run. |
181 | |||
182 | |||
183 | What Causes RCU CPU Stall Warnings? | ||
184 | |||
185 | So your kernel printed an RCU CPU stall warning. The next question is | ||
186 | "What caused it?" The following problems can result in RCU CPU stall | ||
187 | warnings: | ||
188 | |||
189 | o A CPU looping in an RCU read-side critical section. | ||
190 | |||
191 | o A CPU looping with interrupts disabled. This condition can | ||
192 | result in RCU-sched and RCU-bh stalls. | ||
193 | |||
194 | o A CPU looping with preemption disabled. This condition can | ||
195 | result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh | ||
196 | stalls. | ||
197 | |||
198 | o A CPU looping with bottom halves disabled. This condition can | ||
199 | result in RCU-sched and RCU-bh stalls. | ||
200 | |||
201 | o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the | ||
202 | kernel without invoking schedule(). Note that cond_resched() | ||
203 | does not necessarily prevent RCU CPU stall warnings. Therefore, | ||
204 | if the looping in the kernel is really expected and desirable | ||
205 | behavior, you might need to replace some of the cond_resched() | ||
206 | calls with calls to cond_resched_rcu_qs(). | ||
207 | |||
208 | o Booting Linux using a console connection that is too slow to | ||
209 | keep up with the boot-time console-message rate. For example, | ||
210 | a 115Kbaud serial console can be -way- too slow to keep up | ||
211 | with boot-time message rates, and will frequently result in | ||
212 | RCU CPU stall warning messages. Especially if you have added | ||
213 | debug printk()s. | ||
214 | |||
215 | o Anything that prevents RCU's grace-period kthreads from running. | ||
216 | This can result in the "All QSes seen" console-log message. | ||
217 | This message will include information on when the kthread last | ||
218 | ran and how often it should be expected to run. | ||
219 | |||
220 | o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might | ||
221 | happen to preempt a low-priority task in the middle of an RCU | ||
222 | read-side critical section. This is especially damaging if | ||
223 | that low-priority task is not permitted to run on any other CPU, | ||
224 | in which case the next RCU grace period can never complete, which | ||
225 | will eventually cause the system to run out of memory and hang. | ||
226 | While the system is in the process of running itself out of | ||
227 | memory, you might see stall-warning messages. | ||
228 | |||
229 | o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that | ||
230 | is running at a higher priority than the RCU softirq threads. | ||
231 | This will prevent RCU callbacks from ever being invoked, | ||
232 | and in a CONFIG_PREEMPT_RCU kernel will further prevent | ||
233 | RCU grace periods from ever completing. Either way, the | ||
234 | system will eventually run out of memory and hang. In the | ||
235 | CONFIG_PREEMPT_RCU case, you might see stall-warning | ||
236 | messages. | ||
237 | |||
238 | o A hardware or software issue shuts off the scheduler-clock | ||
239 | interrupt on a CPU that is not in dyntick-idle mode. This | ||
240 | problem really has happened, and seems to be most likely to | ||
241 | result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. | ||
242 | |||
243 | o A bug in the RCU implementation. | ||
244 | |||
245 | o A hardware failure. This is quite unlikely, but has occurred | ||
246 | at least once in real life. A CPU failed in a running system, | ||
247 | becoming unresponsive, but not causing an immediate crash. | ||
248 | This resulted in a series of RCU CPU stall warnings, eventually | ||
249 | leading the realization that the CPU had failed. | ||
250 | |||
251 | The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall | ||
252 | warning. Note that SRCU does -not- have CPU stall warnings. Please note | ||
253 | that RCU only detects CPU stalls when there is a grace period in progress. | ||
254 | No grace period, no CPU stall warnings. | ||
255 | |||
256 | To diagnose the cause of the stall, inspect the stack traces. | ||
257 | The offending function will usually be near the top of the stack. | ||
258 | If you have a series of stall warnings from a single extended stall, | ||
259 | comparing the stack traces can often help determine where the stall | ||
260 | is occurring, which will usually be in the function nearest the top of | ||
261 | that portion of the stack which remains the same from trace to trace. | ||
262 | If you can reliably trigger the stall, ftrace can be quite helpful. | ||
263 | |||
264 | RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE | ||
265 | and with RCU's event tracing. For information on RCU's event tracing, | ||
266 | see include/trace/events/rcu.h. | ||
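As a concrete illustration of the cond_resched_rcu_qs() advice above, here is a
minimal sketch (not taken from the patch) of a long-running loop in a
!CONFIG_PREEMPT kernel that reports a quiescent state on each pass; the
work_item structure and its per-item processing are hypothetical placeholders.

        #include <linux/list.h>
        #include <linux/rcupdate.h>

        struct work_item {                      /* Hypothetical example structure. */
                struct list_head node;
                int payload;
        };

        static void process_many_items(struct list_head *items)
        {
                struct work_item *wi;

                list_for_each_entry(wi, items, node) {
                        /* ... lengthy per-item processing would go here ... */
                        cond_resched_rcu_qs();  /* Report a quiescent state each pass. */
                }
        }
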
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 5cbd8b2395b8..8ed6c9f6133c 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt | |||
@@ -562,7 +562,9 @@ This section presents a "toy" RCU implementation that is based on | |||
562 | familiar locking primitives. Its overhead makes it a non-starter for | 562 | familiar locking primitives. Its overhead makes it a non-starter for |
563 | real-life use, as does its lack of scalability. It is also unsuitable | 563 | real-life use, as does its lack of scalability. It is also unsuitable |
564 | for realtime use, since it allows scheduling latency to "bleed" from | 564 | for realtime use, since it allows scheduling latency to "bleed" from |
565 | one read-side critical section to another. | 565 | one read-side critical section to another. It also assumes recursive |
566 | reader-writer locks: If you try this with non-recursive locks, and | ||
567 | you allow nested rcu_read_lock() calls, you can deadlock. | ||
566 | 568 | ||
567 | However, it is probably the easiest implementation to relate to, so is | 569 | However, it is probably the easiest implementation to relate to, so is |
568 | a good starting point. | 570 | a good starting point. |
@@ -587,20 +589,21 @@ It is extremely simple: | |||
587 | write_unlock(&rcu_gp_mutex); | 589 | write_unlock(&rcu_gp_mutex); |
588 | } | 590 | } |
589 | 591 | ||
590 | [You can ignore rcu_assign_pointer() and rcu_dereference() without | 592 | [You can ignore rcu_assign_pointer() and rcu_dereference() without missing |
591 | missing much. But here they are anyway. And whatever you do, don't | 593 | much. But here are simplified versions anyway. And whatever you do, |
592 | forget about them when submitting patches making use of RCU!] | 594 | don't forget about them when submitting patches making use of RCU!] |
593 | 595 | ||
594 | #define rcu_assign_pointer(p, v) ({ \ | 596 | #define rcu_assign_pointer(p, v) \ |
595 | smp_wmb(); \ | 597 | ({ \ |
596 | (p) = (v); \ | 598 | smp_store_release(&(p), (v)); \ |
597 | }) | 599 | }) |
598 | 600 | ||
599 | #define rcu_dereference(p) ({ \ | 601 | #define rcu_dereference(p) \ |
600 | typeof(p) _________p1 = p; \ | 602 | ({ \ |
601 | smp_read_barrier_depends(); \ | 603 | typeof(p) _________p1 = p; \ |
602 | (_________p1); \ | 604 | smp_read_barrier_depends(); \ |
603 | }) | 605 | (_________p1); \ |
606 | }) | ||
604 | 607 | ||
605 | 608 | ||
606 | The rcu_read_lock() and rcu_read_unlock() primitive read-acquire | 609 | The rcu_read_lock() and rcu_read_unlock() primitive read-acquire |
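To make the toy definitions above concrete, here is a minimal publish/subscribe
sketch in the spirit of the examples elsewhere in whatisRCU.txt (it is not part
of the patch, and struct foo, gp, and the function names are invented):

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct foo {
                int a;
        };
        static struct foo *gp;                  /* Pointer published to readers. */

        void publish_foo(int a)
        {
                struct foo *new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);

                if (!new_fp)
                        return;
                new_fp->a = a;
                rcu_assign_pointer(gp, new_fp); /* Initialization is ordered first. */
                /* Any old value must be freed only after a grace period. */
        }

        int read_foo(void)
        {
                struct foo *fp;
                int ret = -1;

                rcu_read_lock();
                fp = rcu_dereference(gp);       /* Subscribe to the published value. */
                if (fp)
                        ret = fp->a;
                rcu_read_unlock();
                return ret;
        }
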
@@ -925,7 +928,8 @@ d. Do you need RCU grace periods to complete even in the face | |||
925 | 928 | ||
926 | e. Is your workload too update-intensive for normal use of | 929 | e. Is your workload too update-intensive for normal use of |
927 | RCU, but inappropriate for other synchronization mechanisms? | 930 | RCU, but inappropriate for other synchronization mechanisms? |
928 | If so, consider SLAB_DESTROY_BY_RCU. But please be careful! | 931 | If so, consider SLAB_TYPESAFE_BY_RCU (which was originally |
932 | named SLAB_DESTROY_BY_RCU). But please be careful! | ||
929 | 933 | ||
930 | f. Do you need read-side critical sections that are respected | 934 | f. Do you need read-side critical sections that are respected |
931 | even though they are in the middle of the idle loop, during | 935 | even though they are in the middle of the idle loop, during |
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index d2b0a8d81258..08329cb857ed 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt | |||
@@ -768,7 +768,7 @@ equal to zero, in which case the compiler is within its rights to | |||
768 | transform the above code into the following: | 768 | transform the above code into the following: |
769 | 769 | ||
770 | q = READ_ONCE(a); | 770 | q = READ_ONCE(a); |
771 | WRITE_ONCE(b, 1); | 771 | WRITE_ONCE(b, 2); |
772 | do_something_else(); | 772 | do_something_else(); |
773 | 773 | ||
774 | Given this transformation, the CPU is not required to respect the ordering | 774 | Given this transformation, the CPU is not required to respect the ordering |
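For context, the example being corrected here (reconstructed from the
surrounding memory-barriers.txt text, which this hunk elides) is roughly the
following control-dependency pattern, with MAX a compile-time constant:

        q = READ_ONCE(a);
        if (q % MAX) {
                WRITE_ONCE(b, 1);
                do_something();
        } else {
                WRITE_ONCE(b, 2);
                do_something_else();
        }

When MAX is defined to 1, the compiler can prove that q % MAX is always zero,
so only the else branch can execute; the transformed code must therefore store
2 rather than 1, which is exactly the one-character fix shown above.
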
diff --git a/arch/Kconfig b/arch/Kconfig index cd211a14a88f..adefaf344239 100644 --- a/arch/Kconfig +++ b/arch/Kconfig | |||
@@ -320,6 +320,9 @@ config HAVE_CMPXCHG_LOCAL | |||
320 | config HAVE_CMPXCHG_DOUBLE | 320 | config HAVE_CMPXCHG_DOUBLE |
321 | bool | 321 | bool |
322 | 322 | ||
323 | config ARCH_WEAK_RELEASE_ACQUIRE | ||
324 | bool | ||
325 | |||
323 | config ARCH_WANT_IPC_PARSE_VERSION | 326 | config ARCH_WANT_IPC_PARSE_VERSION |
324 | bool | 327 | bool |
325 | 328 | ||
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 97a8bc8a095c..7a5c9b764cd2 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig | |||
@@ -99,6 +99,7 @@ config PPC | |||
99 | select ARCH_USE_BUILTIN_BSWAP | 99 | select ARCH_USE_BUILTIN_BSWAP |
100 | select ARCH_USE_CMPXCHG_LOCKREF if PPC64 | 100 | select ARCH_USE_CMPXCHG_LOCKREF if PPC64 |
101 | select ARCH_WANT_IPC_PARSE_VERSION | 101 | select ARCH_WANT_IPC_PARSE_VERSION |
102 | select ARCH_WEAK_RELEASE_ACQUIRE | ||
102 | select BINFMT_ELF | 103 | select BINFMT_ELF |
103 | select BUILDTIME_EXTABLE_SORT | 104 | select BUILDTIME_EXTABLE_SORT |
104 | select CLONE_BACKWARDS | 105 | select CLONE_BACKWARDS |
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index fe531f904062..e4dd92b0a71b 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c | |||
@@ -4665,7 +4665,7 @@ i915_gem_load_init(struct drm_i915_private *dev_priv) | |||
4665 | dev_priv->requests = KMEM_CACHE(drm_i915_gem_request, | 4665 | dev_priv->requests = KMEM_CACHE(drm_i915_gem_request, |
4666 | SLAB_HWCACHE_ALIGN | | 4666 | SLAB_HWCACHE_ALIGN | |
4667 | SLAB_RECLAIM_ACCOUNT | | 4667 | SLAB_RECLAIM_ACCOUNT | |
4668 | SLAB_DESTROY_BY_RCU); | 4668 | SLAB_TYPESAFE_BY_RCU); |
4669 | if (!dev_priv->requests) | 4669 | if (!dev_priv->requests) |
4670 | goto err_vmas; | 4670 | goto err_vmas; |
4671 | 4671 | ||
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index ea511f06efaf..9ee2750e1dde 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h | |||
@@ -493,7 +493,7 @@ static inline struct drm_i915_gem_request * | |||
493 | __i915_gem_active_get_rcu(const struct i915_gem_active *active) | 493 | __i915_gem_active_get_rcu(const struct i915_gem_active *active) |
494 | { | 494 | { |
495 | /* Performing a lockless retrieval of the active request is super | 495 | /* Performing a lockless retrieval of the active request is super |
496 | * tricky. SLAB_DESTROY_BY_RCU merely guarantees that the backing | 496 | * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing |
497 | * slab of request objects will not be freed whilst we hold the | 497 | * slab of request objects will not be freed whilst we hold the |
498 | * RCU read lock. It does not guarantee that the request itself | 498 | * RCU read lock. It does not guarantee that the request itself |
499 | * will not be freed and then *reused*. Viz, | 499 | * will not be freed and then *reused*. Viz, |
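The comment above describes the standard SLAB_TYPESAFE_BY_RCU revalidation
pattern. A generic sketch of that pattern follows (it is not the i915 code;
the obj structure, obj_cache, lookup_candidate() and obj_release() are all
invented for illustration):

        #include <linux/errno.h>
        #include <linux/init.h>
        #include <linux/kref.h>
        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct obj {
                struct kref ref;
                unsigned long key;
        };

        static struct kmem_cache *obj_cache;

        static struct obj *lookup_candidate(unsigned long key); /* Hypothetical lockless lookup. */
        static void obj_release(struct kref *ref);              /* Hypothetical release function. */

        static int __init obj_cache_init(void)
        {
                obj_cache = kmem_cache_create("obj", sizeof(struct obj), 0,
                                              SLAB_TYPESAFE_BY_RCU, NULL);
                return obj_cache ? 0 : -ENOMEM;
        }

        static struct obj *obj_find_get(unsigned long key)
        {
                struct obj *o;

                rcu_read_lock();
                o = lookup_candidate(key);
                if (o && kref_get_unless_zero(&o->ref)) {
                        /*
                         * The slab cannot be returned to the page allocator while
                         * rcu_read_lock() is held, but the object itself may have
                         * been freed and reused, so recheck its identity.
                         */
                        if (o->key != key) {
                                kref_put(&o->ref, obj_release);
                                o = NULL;
                        }
                } else {
                        o = NULL;
                }
                rcu_read_unlock();
                return o;
        }
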
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c index 12647af5a336..e7fb47e84a93 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c | |||
@@ -1071,7 +1071,7 @@ int ldlm_init(void) | |||
1071 | ldlm_lock_slab = kmem_cache_create("ldlm_locks", | 1071 | ldlm_lock_slab = kmem_cache_create("ldlm_locks", |
1072 | sizeof(struct ldlm_lock), 0, | 1072 | sizeof(struct ldlm_lock), 0, |
1073 | SLAB_HWCACHE_ALIGN | | 1073 | SLAB_HWCACHE_ALIGN | |
1074 | SLAB_DESTROY_BY_RCU, NULL); | 1074 | SLAB_TYPESAFE_BY_RCU, NULL); |
1075 | if (!ldlm_lock_slab) { | 1075 | if (!ldlm_lock_slab) { |
1076 | kmem_cache_destroy(ldlm_resource_slab); | 1076 | kmem_cache_destroy(ldlm_resource_slab); |
1077 | return -ENOMEM; | 1077 | return -ENOMEM; |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5adc2fb62b0f..92b255e1ba58 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -2340,7 +2340,7 @@ static int jbd2_journal_init_journal_head_cache(void) | |||
2340 | jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", | 2340 | jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", |
2341 | sizeof(struct journal_head), | 2341 | sizeof(struct journal_head), |
2342 | 0, /* offset */ | 2342 | 0, /* offset */ |
2343 | SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, | 2343 | SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, |
2344 | NULL); /* ctor */ | 2344 | NULL); /* ctor */ |
2345 | retval = 0; | 2345 | retval = 0; |
2346 | if (!jbd2_journal_head_cache) { | 2346 | if (!jbd2_journal_head_cache) { |
diff --git a/fs/signalfd.c b/fs/signalfd.c index 270221fcef42..7e3d71109f51 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c | |||
@@ -38,7 +38,7 @@ void signalfd_cleanup(struct sighand_struct *sighand) | |||
38 | /* | 38 | /* |
39 | * The lockless check can race with remove_wait_queue() in progress, | 39 | * The lockless check can race with remove_wait_queue() in progress, |
40 | * but in this case its caller should run under rcu_read_lock() and | 40 | * but in this case its caller should run under rcu_read_lock() and |
41 | * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. | 41 | * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return. |
42 | */ | 42 | */ |
43 | if (likely(!waitqueue_active(wqh))) | 43 | if (likely(!waitqueue_active(wqh))) |
44 | return; | 44 | return; |
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 6048fa404e57..a5195a7d6f77 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h | |||
@@ -229,7 +229,7 @@ static inline struct dma_fence *dma_fence_get_rcu(struct dma_fence *fence) | |||
229 | * | 229 | * |
230 | * Function returns NULL if no refcount could be obtained, or the fence. | 230 | * Function returns NULL if no refcount could be obtained, or the fence. |
231 | * This function handles acquiring a reference to a fence that may be | 231 | * This function handles acquiring a reference to a fence that may be |
232 | * reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU), | 232 | * reallocated within the RCU grace period (such as with SLAB_TYPESAFE_BY_RCU), |
233 | * so long as the caller is using RCU on the pointer to the fence. | 233 | * so long as the caller is using RCU on the pointer to the fence. |
234 | * | 234 | * |
235 | * An alternative mechanism is to employ a seqlock to protect a bunch of | 235 | * An alternative mechanism is to employ a seqlock to protect a bunch of |
@@ -257,7 +257,7 @@ dma_fence_get_rcu_safe(struct dma_fence * __rcu *fencep) | |||
257 | * have successfully acquire a reference to it. If it no | 257 | * have successfully acquire a reference to it. If it no |
258 | * longer matches, we are holding a reference to some other | 258 | * longer matches, we are holding a reference to some other |
259 | * reallocated pointer. This is possible if the allocator | 259 | * reallocated pointer. This is possible if the allocator |
260 | * is using a freelist like SLAB_DESTROY_BY_RCU where the | 260 | * is using a freelist like SLAB_TYPESAFE_BY_RCU where the |
261 | * fence remains valid for the RCU grace period, but it | 261 | * fence remains valid for the RCU grace period, but it |
262 | * may be reallocated. When using such allocators, we are | 262 | * may be reallocated. When using such allocators, we are |
263 | * responsible for ensuring the reference we get is to | 263 | * responsible for ensuring the reference we get is to |
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d0250744507a..d6cfa0992220 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
@@ -375,8 +375,6 @@ struct kvm { | |||
375 | struct mutex slots_lock; | 375 | struct mutex slots_lock; |
376 | struct mm_struct *mm; /* userspace tied to this vm */ | 376 | struct mm_struct *mm; /* userspace tied to this vm */ |
377 | struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; | 377 | struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; |
378 | struct srcu_struct srcu; | ||
379 | struct srcu_struct irq_srcu; | ||
380 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; | 378 | struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; |
381 | 379 | ||
382 | /* | 380 | /* |
@@ -429,6 +427,8 @@ struct kvm { | |||
429 | struct list_head devices; | 427 | struct list_head devices; |
430 | struct dentry *debugfs_dentry; | 428 | struct dentry *debugfs_dentry; |
431 | struct kvm_stat_data **debugfs_stat_data; | 429 | struct kvm_stat_data **debugfs_stat_data; |
430 | struct srcu_struct srcu; | ||
431 | struct srcu_struct irq_srcu; | ||
432 | }; | 432 | }; |
433 | 433 | ||
434 | #define kvm_err(fmt, ...) \ | 434 | #define kvm_err(fmt, ...) \ |
diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h new file mode 100644 index 000000000000..4b766b61e1a0 --- /dev/null +++ b/include/linux/rcu_node_tree.h | |||
@@ -0,0 +1,99 @@ | |||
1 | /* | ||
2 | * RCU node combining tree definitions. These are used to compute | ||
3 | * global attributes while avoiding common-case global contention. A key | ||
4 | * property that these computations rely on is a tournament-style approach | ||
5 | * where only one of the tasks contending a lower level in the tree need | ||
6 | * advance to the next higher level. If properly configured, this allows | ||
7 | * unlimited scalability while maintaining a constant level of contention | ||
8 | * on the root node. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License as published by | ||
12 | * the Free Software Foundation; either version 2 of the License, or | ||
13 | * (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
18 | * GNU General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program; if not, you can access it online at | ||
22 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
23 | * | ||
24 | * Copyright IBM Corporation, 2017 | ||
25 | * | ||
26 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
27 | */ | ||
28 | |||
29 | #ifndef __LINUX_RCU_NODE_TREE_H | ||
30 | #define __LINUX_RCU_NODE_TREE_H | ||
31 | |||
32 | /* | ||
33 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and | ||
34 | * CONFIG_RCU_FANOUT_LEAF. | ||
35 | * In theory, it should be possible to add more levels straightforwardly. | ||
36 | * In practice, this did work well going from three levels to four. | ||
37 | * Of course, your mileage may vary. | ||
38 | */ | ||
39 | |||
40 | #ifdef CONFIG_RCU_FANOUT | ||
41 | #define RCU_FANOUT CONFIG_RCU_FANOUT | ||
42 | #else /* #ifdef CONFIG_RCU_FANOUT */ | ||
43 | # ifdef CONFIG_64BIT | ||
44 | # define RCU_FANOUT 64 | ||
45 | # else | ||
46 | # define RCU_FANOUT 32 | ||
47 | # endif | ||
48 | #endif /* #else #ifdef CONFIG_RCU_FANOUT */ | ||
49 | |||
50 | #ifdef CONFIG_RCU_FANOUT_LEAF | ||
51 | #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF | ||
52 | #else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ | ||
53 | #define RCU_FANOUT_LEAF 16 | ||
54 | #endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ | ||
55 | |||
56 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) | ||
57 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) | ||
58 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) | ||
59 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) | ||
60 | |||
61 | #if NR_CPUS <= RCU_FANOUT_1 | ||
62 | # define RCU_NUM_LVLS 1 | ||
63 | # define NUM_RCU_LVL_0 1 | ||
64 | # define NUM_RCU_NODES NUM_RCU_LVL_0 | ||
65 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } | ||
66 | # define RCU_NODE_NAME_INIT { "rcu_node_0" } | ||
67 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } | ||
68 | #elif NR_CPUS <= RCU_FANOUT_2 | ||
69 | # define RCU_NUM_LVLS 2 | ||
70 | # define NUM_RCU_LVL_0 1 | ||
71 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
72 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) | ||
73 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } | ||
74 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } | ||
75 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } | ||
76 | #elif NR_CPUS <= RCU_FANOUT_3 | ||
77 | # define RCU_NUM_LVLS 3 | ||
78 | # define NUM_RCU_LVL_0 1 | ||
79 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | ||
80 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
81 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) | ||
82 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } | ||
83 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } | ||
84 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } | ||
85 | #elif NR_CPUS <= RCU_FANOUT_4 | ||
86 | # define RCU_NUM_LVLS 4 | ||
87 | # define NUM_RCU_LVL_0 1 | ||
88 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) | ||
89 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | ||
90 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
91 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) | ||
92 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } | ||
93 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } | ||
94 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } | ||
95 | #else | ||
96 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" | ||
97 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ | ||
98 | |||
99 | #endif /* __LINUX_RCU_NODE_TREE_H */ | ||
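As a worked example of how these macros evaluate, assume the defaults shown
above for a 64-bit build (RCU_FANOUT = 64, RCU_FANOUT_LEAF = 16) and a
hypothetical NR_CPUS = 4096:

        RCU_FANOUT_1 = 16
        RCU_FANOUT_2 = 16 * 64   = 1024
        RCU_FANOUT_3 = 1024 * 64 = 65536

        NR_CPUS = 4096 exceeds 1024 but not 65536, so RCU_NUM_LVLS = 3, and:

        NUM_RCU_LVL_0 = 1
        NUM_RCU_LVL_1 = DIV_ROUND_UP(4096, 1024) = 4
        NUM_RCU_LVL_2 = DIV_ROUND_UP(4096, 16)   = 256
        NUM_RCU_NODES = 1 + 4 + 256              = 261

That is, one root rcu_node fans out to 4 intermediate nodes, each covering up
to 64 leaf nodes, and each leaf covers at most 16 CPUs.
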
diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h new file mode 100644 index 000000000000..ced8f313fd05 --- /dev/null +++ b/include/linux/rcu_segcblist.h | |||
@@ -0,0 +1,712 @@ | |||
1 | /* | ||
2 | * RCU segmented callback lists | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, you can access it online at | ||
16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2017 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
21 | */ | ||
22 | |||
23 | #ifndef __KERNEL_RCU_SEGCBLIST_H | ||
24 | #define __KERNEL_RCU_SEGCBLIST_H | ||
25 | |||
26 | /* Simple unsegmented callback lists. */ | ||
27 | struct rcu_cblist { | ||
28 | struct rcu_head *head; | ||
29 | struct rcu_head **tail; | ||
30 | long len; | ||
31 | long len_lazy; | ||
32 | }; | ||
33 | |||
34 | #define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } | ||
35 | |||
36 | /* Initialize simple callback list. */ | ||
37 | static inline void rcu_cblist_init(struct rcu_cblist *rclp) | ||
38 | { | ||
39 | rclp->head = NULL; | ||
40 | rclp->tail = &rclp->head; | ||
41 | rclp->len = 0; | ||
42 | rclp->len_lazy = 0; | ||
43 | } | ||
44 | |||
45 | /* Is simple callback list empty? */ | ||
46 | static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) | ||
47 | { | ||
48 | return !rclp->head; | ||
49 | } | ||
50 | |||
51 | /* Return number of callbacks in simple callback list. */ | ||
52 | static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) | ||
53 | { | ||
54 | return rclp->len; | ||
55 | } | ||
56 | |||
57 | /* Return number of lazy callbacks in simple callback list. */ | ||
58 | static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) | ||
59 | { | ||
60 | return rclp->len_lazy; | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * Debug function to actually count the number of callbacks. | ||
65 | * If the number exceeds the limit specified, return -1. | ||
66 | */ | ||
67 | static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) | ||
68 | { | ||
69 | int cnt = 0; | ||
70 | struct rcu_head **rhpp = &rclp->head; | ||
71 | |||
72 | for (;;) { | ||
73 | if (!*rhpp) | ||
74 | return cnt; | ||
75 | if (++cnt > lim) | ||
76 | return -1; | ||
77 | rhpp = &(*rhpp)->next; | ||
78 | } | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * Dequeue the oldest rcu_head structure from the specified callback | ||
83 | * list. This function assumes that the callback is non-lazy, but | ||
84 | * the caller can later invoke rcu_cblist_dequeued_lazy() if it | ||
85 | * finds otherwise (and if it cares about laziness). This allows | ||
86 | * different users to have different ways of determining laziness. | ||
87 | */ | ||
88 | static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) | ||
89 | { | ||
90 | struct rcu_head *rhp; | ||
91 | |||
92 | rhp = rclp->head; | ||
93 | if (!rhp) | ||
94 | return NULL; | ||
95 | rclp->len--; | ||
96 | rclp->head = rhp->next; | ||
97 | if (!rclp->head) | ||
98 | rclp->tail = &rclp->head; | ||
99 | return rhp; | ||
100 | } | ||
101 | |||
102 | /* | ||
103 | * Account for the fact that a previously dequeued callback turned out | ||
104 | * to be marked as lazy. | ||
105 | */ | ||
106 | static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) | ||
107 | { | ||
108 | rclp->len_lazy--; | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * Interim function to return rcu_cblist head pointer. Longer term, the | ||
113 | * rcu_cblist will be used more pervasively, removing the need for this | ||
114 | * function. | ||
115 | */ | ||
116 | static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) | ||
117 | { | ||
118 | return rclp->head; | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * Interim function to return rcu_cblist tail pointer. Longer term, the | ||
123 | * rcu_cblist will be used more pervasively, removing the need for this | ||
124 | * function. | ||
125 | */ | ||
126 | static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) | ||
127 | { | ||
128 | WARN_ON_ONCE(rcu_cblist_empty(rclp)); | ||
129 | return rclp->tail; | ||
130 | } | ||
131 | |||
132 | /* Complicated segmented callback lists. ;-) */ | ||
133 | |||
134 | /* | ||
135 | * Index values for segments in rcu_segcblist structure. | ||
136 | * | ||
137 | * The segments are as follows: | ||
138 | * | ||
139 | * [head, *tails[RCU_DONE_TAIL]): | ||
140 | * Callbacks whose grace period has elapsed, and thus can be invoked. | ||
141 | * [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]): | ||
142 | * Callbacks waiting for the current GP from the current CPU's viewpoint. | ||
143 | * [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]): | ||
144 | * Callbacks that arrived before the next GP started, again from | ||
145 | * the current CPU's viewpoint. These can be handled by the next GP. | ||
146 | * [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]): | ||
147 | * Callbacks that might have arrived after the next GP started. | ||
148 | * There is some uncertainty as to when a given GP starts and | ||
149 | * ends, but a CPU knows the exact times if it is the one starting | ||
150 | * or ending the GP. Other CPUs know that the previous GP ends | ||
151 | * before the next one starts. | ||
152 | * | ||
153 | * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also | ||
154 | * empty. | ||
155 | * | ||
156 | * The ->gp_seq[] array contains the grace-period number at which the | ||
157 | * corresponding segment of callbacks will be ready to invoke. A given | ||
158 | * element of this array is meaningful only when the corresponding segment | ||
159 | * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks | ||
160 | * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have | ||
161 | * not yet been assigned a grace-period number). | ||
162 | */ | ||
163 | #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ | ||
164 | #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ | ||
165 | #define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ | ||
166 | #define RCU_NEXT_TAIL 3 | ||
167 | #define RCU_CBLIST_NSEGS 4 | ||
168 | |||
169 | struct rcu_segcblist { | ||
170 | struct rcu_head *head; | ||
171 | struct rcu_head **tails[RCU_CBLIST_NSEGS]; | ||
172 | unsigned long gp_seq[RCU_CBLIST_NSEGS]; | ||
173 | long len; | ||
174 | long len_lazy; | ||
175 | }; | ||
176 | |||
177 | #define RCU_SEGCBLIST_INITIALIZER(n) \ | ||
178 | { \ | ||
179 | .head = NULL, \ | ||
180 | .tails[RCU_DONE_TAIL] = &n.head, \ | ||
181 | .tails[RCU_WAIT_TAIL] = &n.head, \ | ||
182 | .tails[RCU_NEXT_READY_TAIL] = &n.head, \ | ||
183 | .tails[RCU_NEXT_TAIL] = &n.head, \ | ||
184 | } | ||
185 | |||
186 | /* | ||
187 | * Initialize an rcu_segcblist structure. | ||
188 | */ | ||
189 | static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) | ||
190 | { | ||
191 | int i; | ||
192 | |||
193 | BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); | ||
194 | BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); | ||
195 | rsclp->head = NULL; | ||
196 | for (i = 0; i < RCU_CBLIST_NSEGS; i++) | ||
197 | rsclp->tails[i] = &rsclp->head; | ||
198 | rsclp->len = 0; | ||
199 | rsclp->len_lazy = 0; | ||
200 | } | ||
201 | |||
202 | /* | ||
203 | * Is the specified rcu_segcblist structure empty? | ||
204 | * | ||
205 | * But careful! The fact that the ->head field is NULL does not | ||
206 | * necessarily imply that there are no callbacks associated with | ||
207 | * this structure. When callbacks are being invoked, they are | ||
208 | * removed as a group. If callback invocation must be preempted, | ||
209 | * the remaining callbacks will be added back to the list. Either | ||
210 | * way, the counts are updated later. | ||
211 | * | ||
212 | * So it is often the case that rcu_segcblist_n_cbs() should be used | ||
213 | * instead. | ||
214 | */ | ||
215 | static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) | ||
216 | { | ||
217 | return !rsclp->head; | ||
218 | } | ||
219 | |||
220 | /* Return number of callbacks in segmented callback list. */ | ||
221 | static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) | ||
222 | { | ||
223 | return READ_ONCE(rsclp->len); | ||
224 | } | ||
225 | |||
226 | /* Return number of lazy callbacks in segmented callback list. */ | ||
227 | static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) | ||
228 | { | ||
229 | return rsclp->len_lazy; | ||
230 | } | ||
231 | |||
232 | /* Return number of non-lazy callbacks in segmented callback list. */ | ||
233 | static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) | ||
234 | { | ||
235 | return rsclp->len - rsclp->len_lazy; | ||
236 | } | ||
237 | |||
238 | /* | ||
239 | * Is the specified rcu_segcblist enabled, for example, not corresponding | ||
240 | * to an offline or callback-offloaded CPU? | ||
241 | */ | ||
242 | static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) | ||
243 | { | ||
244 | return !!rsclp->tails[RCU_NEXT_TAIL]; | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * Disable the specified rcu_segcblist structure, so that callbacks can | ||
249 | * no longer be posted to it. This structure must be empty. | ||
250 | */ | ||
251 | static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) | ||
252 | { | ||
253 | WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); | ||
254 | WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); | ||
255 | WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); | ||
256 | rsclp->tails[RCU_NEXT_TAIL] = NULL; | ||
257 | } | ||
258 | |||
259 | /* | ||
260 | * Is the specified segment of the specified rcu_segcblist structure | ||
261 | * empty of callbacks? | ||
262 | */ | ||
263 | static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) | ||
264 | { | ||
265 | if (seg == RCU_DONE_TAIL) | ||
266 | return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; | ||
267 | return rsclp->tails[seg - 1] == rsclp->tails[seg]; | ||
268 | } | ||
269 | |||
270 | /* | ||
271 | * Are all segments following the specified segment of the specified | ||
272 | * rcu_segcblist structure empty of callbacks? (The specified | ||
273 | * segment might well contain callbacks.) | ||
274 | */ | ||
275 | static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) | ||
276 | { | ||
277 | return !*rsclp->tails[seg]; | ||
278 | } | ||
279 | |||
280 | /* | ||
281 | * Does the specified rcu_segcblist structure contain callbacks that | ||
282 | * are ready to be invoked? | ||
283 | */ | ||
284 | static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) | ||
285 | { | ||
286 | return rcu_segcblist_is_enabled(rsclp) && | ||
287 | &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; | ||
288 | } | ||
289 | |||
290 | /* | ||
291 | * Does the specified rcu_segcblist structure contain callbacks that | ||
292 | * are still pending, that is, not yet ready to be invoked? | ||
293 | */ | ||
294 | static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) | ||
295 | { | ||
296 | return rcu_segcblist_is_enabled(rsclp) && | ||
297 | !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); | ||
298 | } | ||
299 | |||
300 | /* | ||
301 | * Dequeue and return the first ready-to-invoke callback. If there | ||
302 | * are no ready-to-invoke callbacks, return NULL. Disables interrupts | ||
303 | * to avoid interference. Does not protect from interference from other | ||
304 | * CPUs or tasks. | ||
305 | */ | ||
306 | static inline struct rcu_head * | ||
307 | rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) | ||
308 | { | ||
309 | unsigned long flags; | ||
310 | int i; | ||
311 | struct rcu_head *rhp; | ||
312 | |||
313 | local_irq_save(flags); | ||
314 | if (!rcu_segcblist_ready_cbs(rsclp)) { | ||
315 | local_irq_restore(flags); | ||
316 | return NULL; | ||
317 | } | ||
318 | rhp = rsclp->head; | ||
319 | BUG_ON(!rhp); | ||
320 | rsclp->head = rhp->next; | ||
321 | for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { | ||
322 | if (rsclp->tails[i] != &rhp->next) | ||
323 | break; | ||
324 | rsclp->tails[i] = &rsclp->head; | ||
325 | } | ||
326 | smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ | ||
327 | WRITE_ONCE(rsclp->len, rsclp->len - 1); | ||
328 | local_irq_restore(flags); | ||
329 | return rhp; | ||
330 | } | ||
331 | |||
332 | /* | ||
333 | * Account for the fact that a previously dequeued callback turned out | ||
334 | * to be marked as lazy. | ||
335 | */ | ||
336 | static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) | ||
337 | { | ||
338 | unsigned long flags; | ||
339 | |||
340 | local_irq_save(flags); | ||
341 | rsclp->len_lazy--; | ||
342 | local_irq_restore(flags); | ||
343 | } | ||
344 | |||
345 | /* | ||
346 | * Return a pointer to the first callback in the specified rcu_segcblist | ||
347 | * structure. This is useful for diagnostics. | ||
348 | */ | ||
349 | static inline struct rcu_head * | ||
350 | rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) | ||
351 | { | ||
352 | if (rcu_segcblist_is_enabled(rsclp)) | ||
353 | return rsclp->head; | ||
354 | return NULL; | ||
355 | } | ||
356 | |||
357 | /* | ||
358 | * Return a pointer to the first pending callback in the specified | ||
359 | * rcu_segcblist structure. This is useful just after posting a given | ||
360 | * callback -- if that callback is the first pending callback, then | ||
361 | * you cannot rely on someone else having already started up the required | ||
362 | * grace period. | ||
363 | */ | ||
364 | static inline struct rcu_head * | ||
365 | rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) | ||
366 | { | ||
367 | if (rcu_segcblist_is_enabled(rsclp)) | ||
368 | return *rsclp->tails[RCU_DONE_TAIL]; | ||
369 | return NULL; | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * Does the specified rcu_segcblist structure contain callbacks that | ||
374 | * have not yet been processed beyond having been posted, that is, | ||
375 | * does it contain callbacks in its last segment? | ||
376 | */ | ||
377 | static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) | ||
378 | { | ||
379 | return rcu_segcblist_is_enabled(rsclp) && | ||
380 | !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); | ||
381 | } | ||
382 | |||
383 | /* | ||
384 | * Enqueue the specified callback onto the specified rcu_segcblist | ||
385 | * structure, updating accounting as needed. Note that the ->len | ||
386 | * field may be accessed locklessly, hence the WRITE_ONCE(). | ||
387 | * The ->len field is used by rcu_barrier() and friends to determine | ||
388 | * if it must post a callback on this structure, and it is OK | ||
389 | * for rcu_barrier() to sometimes post callbacks needlessly, but | ||
390 | * absolutely not OK for it to ever miss posting a callback. | ||
391 | */ | ||
392 | static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, | ||
393 | struct rcu_head *rhp, bool lazy) | ||
394 | { | ||
395 | WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ | ||
396 | if (lazy) | ||
397 | rsclp->len_lazy++; | ||
398 | smp_mb(); /* Ensure counts are updated before callback is enqueued. */ | ||
399 | rhp->next = NULL; | ||
400 | *rsclp->tails[RCU_NEXT_TAIL] = rhp; | ||
401 | rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; | ||
402 | } | ||
403 | |||
404 | /* | ||
405 | * Entrain the specified callback onto the specified rcu_segcblist at | ||
406 | * the end of the last non-empty segment. If the entire rcu_segcblist | ||
407 | * is empty, make no change, but return false. | ||
408 | * | ||
409 | * This is intended for use by rcu_barrier()-like primitives, -not- | ||
410 | * for normal grace-period use. IMPORTANT: The callback you enqueue | ||
411 | * will wait for all prior callbacks, NOT necessarily for a grace | ||
412 | * period. You have been warned. | ||
413 | */ | ||
414 | static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, | ||
415 | struct rcu_head *rhp, bool lazy) | ||
416 | { | ||
417 | int i; | ||
418 | |||
419 | if (rcu_segcblist_n_cbs(rsclp) == 0) | ||
420 | return false; | ||
421 | WRITE_ONCE(rsclp->len, rsclp->len + 1); | ||
422 | if (lazy) | ||
423 | rsclp->len_lazy++; | ||
424 | smp_mb(); /* Ensure counts are updated before callback is entrained. */ | ||
425 | rhp->next = NULL; | ||
426 | for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) | ||
427 | if (rsclp->tails[i] != rsclp->tails[i - 1]) | ||
428 | break; | ||
429 | *rsclp->tails[i] = rhp; | ||
430 | for (; i <= RCU_NEXT_TAIL; i++) | ||
431 | rsclp->tails[i] = &rhp->next; | ||
432 | return true; | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * Extract only the counts from the specified rcu_segcblist structure, | ||
437 | * and place them in the specified rcu_cblist structure. This function | ||
438 | * supports both callback orphaning and invocation, hence the separation | ||
439 | * of counts and callbacks. (Callbacks ready for invocation must be | ||
440 | * orphaned and adopted separately from pending callbacks, but counts | ||
441 | * apply to all callbacks. Locking must be used to make sure that | ||
442 | * both orphaned-callbacks lists are consistent.) | ||
443 | */ | ||
444 | static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, | ||
445 | struct rcu_cblist *rclp) | ||
446 | { | ||
447 | rclp->len_lazy += rsclp->len_lazy; | ||
448 | rclp->len += rsclp->len; | ||
449 | rsclp->len_lazy = 0; | ||
450 | WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ | ||
451 | } | ||
452 | |||
453 | /* | ||
454 | * Extract only those callbacks ready to be invoked from the specified | ||
455 | * rcu_segcblist structure and place them in the specified rcu_cblist | ||
456 | * structure. | ||
457 | */ | ||
458 | static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, | ||
459 | struct rcu_cblist *rclp) | ||
460 | { | ||
461 | int i; | ||
462 | |||
463 | if (!rcu_segcblist_ready_cbs(rsclp)) | ||
464 | return; /* Nothing to do. */ | ||
465 | *rclp->tail = rsclp->head; | ||
466 | rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; | ||
467 | *rsclp->tails[RCU_DONE_TAIL] = NULL; | ||
468 | rclp->tail = rsclp->tails[RCU_DONE_TAIL]; | ||
469 | for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) | ||
470 | if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) | ||
471 | rsclp->tails[i] = &rsclp->head; | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * Extract only those callbacks still pending (not yet ready to be | ||
476 | * invoked) from the specified rcu_segcblist structure and place them in | ||
477 | * the specified rcu_cblist structure. Note that this loses information | ||
478 | * about any callbacks that might have been partway done waiting for | ||
479 | * their grace period. Too bad! They will have to start over. | ||
480 | */ | ||
481 | static inline void | ||
482 | rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, | ||
483 | struct rcu_cblist *rclp) | ||
484 | { | ||
485 | int i; | ||
486 | |||
487 | if (!rcu_segcblist_pend_cbs(rsclp)) | ||
488 | return; /* Nothing to do. */ | ||
489 | *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; | ||
490 | rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; | ||
491 | *rsclp->tails[RCU_DONE_TAIL] = NULL; | ||
492 | for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) | ||
493 | rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; | ||
494 | } | ||
495 | |||
496 | /* | ||
497 | * Move the entire contents of the specified rcu_segcblist structure, | ||
498 | * counts, callbacks, and all, to the specified rcu_cblist structure. | ||
499 | * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? | ||
500 | * @@@ Memory barrier needed? (Not if only used at boot time...) | ||
501 | */ | ||
502 | static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, | ||
503 | struct rcu_cblist *rclp) | ||
504 | { | ||
505 | rcu_segcblist_extract_done_cbs(rsclp, rclp); | ||
506 | rcu_segcblist_extract_pend_cbs(rsclp, rclp); | ||
507 | rcu_segcblist_extract_count(rsclp, rclp); | ||
508 | } | ||
509 | |||
510 | /* | ||
511 | * Insert counts from the specified rcu_cblist structure in the | ||
512 | * specified rcu_segcblist structure. | ||
513 | */ | ||
514 | static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, | ||
515 | struct rcu_cblist *rclp) | ||
516 | { | ||
517 | rsclp->len_lazy += rclp->len_lazy; | ||
518 | /* ->len sampled locklessly. */ | ||
519 | WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); | ||
520 | rclp->len_lazy = 0; | ||
521 | rclp->len = 0; | ||
522 | } | ||
523 | |||
524 | /* | ||
525 | * Move callbacks from the specified rcu_cblist to the beginning of the | ||
526 | * done-callbacks segment of the specified rcu_segcblist. | ||
527 | */ | ||
528 | static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, | ||
529 | struct rcu_cblist *rclp) | ||
530 | { | ||
531 | int i; | ||
532 | |||
533 | if (!rclp->head) | ||
534 | return; /* No callbacks to move. */ | ||
535 | *rclp->tail = rsclp->head; | ||
536 | rsclp->head = rclp->head; | ||
537 | for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) | ||
538 | if (&rsclp->head == rsclp->tails[i]) | ||
539 | rsclp->tails[i] = rclp->tail; | ||
540 | else | ||
541 | break; | ||
542 | rclp->head = NULL; | ||
543 | rclp->tail = &rclp->head; | ||
544 | } | ||
545 | |||
546 | /* | ||
547 | * Move callbacks from the specified rcu_cblist to the end of the | ||
548 | * new-callbacks segment of the specified rcu_segcblist. | ||
549 | */ | ||
550 | static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, | ||
551 | struct rcu_cblist *rclp) | ||
552 | { | ||
553 | if (!rclp->head) | ||
554 | return; /* Nothing to do. */ | ||
555 | *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; | ||
556 | rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; | ||
557 | rclp->head = NULL; | ||
558 | rclp->tail = &rclp->head; | ||
559 | } | ||
560 | |||
561 | /* | ||
562 | * Advance the callbacks in the specified rcu_segcblist structure based | ||
563 | * on the current value passed in for the grace-period counter. | ||
564 | */ | ||
565 | static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, | ||
566 | unsigned long seq) | ||
567 | { | ||
568 | int i, j; | ||
569 | |||
570 | WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); | ||
571 | if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) | ||
572 | return; | ||
573 | |||
574 | /* | ||
575 | * Find all callbacks whose ->gp_seq numbers indicate that they | ||
576 | * are ready to invoke, and put them into the RCU_DONE_TAIL segment. | ||
577 | */ | ||
578 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { | ||
579 | if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) | ||
580 | break; | ||
581 | rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; | ||
582 | } | ||
583 | |||
584 | /* If no callbacks moved, nothing more need be done. */ | ||
585 | if (i == RCU_WAIT_TAIL) | ||
586 | return; | ||
587 | |||
588 | /* Clean up tail pointers that might have been misordered above. */ | ||
589 | for (j = RCU_WAIT_TAIL; j < i; j++) | ||
590 | rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; | ||
591 | |||
592 | /* | ||
593 | * Callbacks moved, so clean up the misordered ->tails[] pointers | ||
594 | * that now point into the middle of the list of ready-to-invoke | ||
595 | * callbacks. The overall effect is to copy down the later pointers | ||
596 | * into the gap that was created by the now-ready segments. | ||
597 | */ | ||
598 | for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { | ||
599 | if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) | ||
600 | break; /* No more callbacks. */ | ||
601 | rsclp->tails[j] = rsclp->tails[i]; | ||
602 | rsclp->gp_seq[j] = rsclp->gp_seq[i]; | ||
603 | } | ||
604 | } | ||
605 | |||
606 | /* | ||
607 | * "Accelerate" callbacks based on more-accurate grace-period information. | ||
608 | * The reason for this is that RCU does not synchronize the beginnings and | ||
609 | * ends of grace periods, and that callbacks are posted locally. This in | ||
610 | * turn means that the callbacks must be labelled conservatively early | ||
611 | * on, as getting exact information would degrade both performance and | ||
612 | * scalability. When more accurate grace-period information becomes | ||
613 | * available, previously posted callbacks can be "accelerated", marking | ||
614 | * them to complete at the end of the earlier grace period. | ||
615 | * | ||
616 | * This function operates on an rcu_segcblist structure, and also the | ||
617 | * grace-period sequence number seq at which new callbacks would become | ||
618 | * ready to invoke. Returns true if there are callbacks that won't be | ||
619 | * ready to invoke until seq, false otherwise. | ||
620 | */ | ||
621 | static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, | ||
622 | unsigned long seq) | ||
623 | { | ||
624 | int i; | ||
625 | |||
626 | WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); | ||
627 | if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) | ||
628 | return false; | ||
629 | |||
630 | /* | ||
631 | * Find the segment preceding the oldest segment of callbacks | ||
632 | * whose ->gp_seq[] completion is at or after that passed in via | ||
633 | * "seq", skipping any empty segments. This oldest segment, along | ||
634 | * with any later segments, can be merged in with any newly arrived | ||
635 | * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" | ||
636 | * as their ->gp_seq[] grace-period completion sequence number. | ||
637 | */ | ||
638 | for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) | ||
639 | if (rsclp->tails[i] != rsclp->tails[i - 1] && | ||
640 | ULONG_CMP_LT(rsclp->gp_seq[i], seq)) | ||
641 | break; | ||
642 | |||
643 | /* | ||
644 | * If all the segments contain callbacks that correspond to | ||
645 | * earlier grace-period sequence numbers than "seq", leave. | ||
646 | * Assuming that the rcu_segcblist structure has enough | ||
647 | * segments in its arrays, this can only happen if some of | ||
648 | * the non-done segments contain callbacks that really are | ||
649 | * ready to invoke. This situation will get straightened | ||
650 | * out by the next call to rcu_segcblist_advance(). | ||
651 | * | ||
652 | * Also advance to the oldest segment of callbacks whose | ||
653 | * ->gp_seq[] completion is at or after that passed in via "seq", | ||
654 | * skipping any empty segments. | ||
655 | */ | ||
656 | if (++i >= RCU_NEXT_TAIL) | ||
657 | return false; | ||
658 | |||
659 | /* | ||
660 | * Merge all later callbacks, including newly arrived callbacks, | ||
661 | * into the segment located by the for-loop above. Assign "seq" | ||
662 | * as the ->gp_seq[] value in order to correctly handle the case | ||
663 | * where there were no pending callbacks in the rcu_segcblist | ||
664 | * structure other than in the RCU_NEXT_TAIL segment. | ||
665 | */ | ||
666 | for (; i < RCU_NEXT_TAIL; i++) { | ||
667 | rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; | ||
668 | rsclp->gp_seq[i] = seq; | ||
669 | } | ||
670 | return true; | ||
671 | } | ||
672 | |||
673 | /* | ||
674 | * Scan the specified rcu_segcblist structure for callbacks that need | ||
675 | * a grace period later than the one specified by "seq". We don't look | ||
676 | * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't | ||
677 | * have a grace-period sequence number. | ||
678 | */ | ||
679 | static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, | ||
680 | unsigned long seq) | ||
681 | { | ||
682 | int i; | ||
683 | |||
684 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) | ||
685 | if (rsclp->tails[i - 1] != rsclp->tails[i] && | ||
686 | ULONG_CMP_LT(seq, rsclp->gp_seq[i])) | ||
687 | return true; | ||
688 | return false; | ||
689 | } | ||
690 | |||
691 | /* | ||
692 | * Interim function to return rcu_segcblist head pointer. Longer term, the | ||
693 | * rcu_segcblist will be used more pervasively, removing the need for this | ||
694 | * function. | ||
695 | */ | ||
696 | static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) | ||
697 | { | ||
698 | return rsclp->head; | ||
699 | } | ||
700 | |||
701 | /* | ||
702 | * Interim function to return rcu_segcblist tail pointer. Longer term, the | ||
703 | * rcu_segcblist will be used more pervasively, removing the need for this | ||
704 | * function. | ||
705 | */ | ||
706 | static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) | ||
707 | { | ||
708 | WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); | ||
709 | return rsclp->tails[RCU_NEXT_TAIL]; | ||
710 | } | ||
711 | |||
712 | #endif /* __KERNEL_RCU_SEGCBLIST_H */ | ||
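A rough usage sketch built only from the helpers defined in this new header
(this is not code from the kernel proper; the grace-period sequence number 8
is invented for illustration, and real callers obtain such numbers from the
rcu_seq machinery):

        static struct rcu_segcblist mycbs;

        /* Caller is assumed to have set rhp->func before invoking this. */
        static void segcblist_example(struct rcu_head *rhp)
        {
                struct rcu_head *ready;

                rcu_segcblist_init(&mycbs);
                rcu_segcblist_enqueue(&mycbs, rhp, false);      /* Lands in RCU_NEXT_TAIL. */

                /* A future grace period (say, 8) is assigned to the new callback. */
                rcu_segcblist_accelerate(&mycbs, 8);

                /* Once the grace-period counter reaches 8, the callback is ready. */
                rcu_segcblist_advance(&mycbs, 8);

                /* Ready callbacks can now be dequeued and invoked. */
                ready = rcu_segcblist_dequeue(&mycbs);
                if (ready)
                        ready->func(ready);
        }
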
diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4f7a9561b8c4..b1fd8bf85fdc 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h | |||
@@ -509,7 +509,8 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, | |||
509 | { | 509 | { |
510 | struct hlist_node *i, *last = NULL; | 510 | struct hlist_node *i, *last = NULL; |
511 | 511 | ||
512 | for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) | 512 | /* Note: write side code, so rcu accessors are not needed. */ |
513 | for (i = h->first; i; i = i->next) | ||
513 | last = i; | 514 | last = i; |
514 | 515 | ||
515 | if (last) { | 516 | if (last) { |
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de88b33c0974..f531b29207da 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h | |||
@@ -363,15 +363,20 @@ static inline void rcu_init_nohz(void) | |||
363 | #ifdef CONFIG_TASKS_RCU | 363 | #ifdef CONFIG_TASKS_RCU |
364 | #define TASKS_RCU(x) x | 364 | #define TASKS_RCU(x) x |
365 | extern struct srcu_struct tasks_rcu_exit_srcu; | 365 | extern struct srcu_struct tasks_rcu_exit_srcu; |
366 | #define rcu_note_voluntary_context_switch(t) \ | 366 | #define rcu_note_voluntary_context_switch_lite(t) \ |
367 | do { \ | 367 | do { \ |
368 | rcu_all_qs(); \ | ||
369 | if (READ_ONCE((t)->rcu_tasks_holdout)) \ | 368 | if (READ_ONCE((t)->rcu_tasks_holdout)) \ |
370 | WRITE_ONCE((t)->rcu_tasks_holdout, false); \ | 369 | WRITE_ONCE((t)->rcu_tasks_holdout, false); \ |
371 | } while (0) | 370 | } while (0) |
371 | #define rcu_note_voluntary_context_switch(t) \ | ||
372 | do { \ | ||
373 | rcu_all_qs(); \ | ||
374 | rcu_note_voluntary_context_switch_lite(t); \ | ||
375 | } while (0) | ||
372 | #else /* #ifdef CONFIG_TASKS_RCU */ | 376 | #else /* #ifdef CONFIG_TASKS_RCU */ |
373 | #define TASKS_RCU(x) do { } while (0) | 377 | #define TASKS_RCU(x) do { } while (0) |
374 | #define rcu_note_voluntary_context_switch(t) rcu_all_qs() | 378 | #define rcu_note_voluntary_context_switch_lite(t) do { } while (0) |
379 | #define rcu_note_voluntary_context_switch(t) rcu_all_qs() | ||
375 | #endif /* #else #ifdef CONFIG_TASKS_RCU */ | 380 | #endif /* #else #ifdef CONFIG_TASKS_RCU */ |
376 | 381 | ||
377 | /** | 382 | /** |
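The hunk above splits out a _lite variant that only clears the Tasks-RCU holdout state, so callers that must not invoke rcu_all_qs() can still use it, while the full macro layers rcu_all_qs() on top. A sketch of that layering with stub names (not the kernel's); the do { } while (0) wrappers keep each macro a single statement:

#include <stdio.h>

struct task { int holdout; };

static int qs_reports;

static void all_qs(void) { qs_reports++; }

#define note_vcs_lite(t)			\
	do {					\
		if ((t)->holdout)		\
			(t)->holdout = 0;	\
	} while (0)

#define note_vcs(t)				\
	do {					\
		all_qs();			\
		note_vcs_lite(t);		\
	} while (0)

int main(void)
{
	struct task t = { .holdout = 1 };

	if (t.holdout)
		note_vcs(&t);	/* expands to one statement: safe in if/else */
	printf("holdout=%d qs_reports=%d\n", t.holdout, qs_reports);
	return 0;
}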
@@ -1127,11 +1132,11 @@ do { \ | |||
1127 | * if the UNLOCK and LOCK are executed by the same CPU or if the | 1132 | * if the UNLOCK and LOCK are executed by the same CPU or if the |
1128 | * UNLOCK and LOCK operate on the same lock variable. | 1133 | * UNLOCK and LOCK operate on the same lock variable. |
1129 | */ | 1134 | */ |
1130 | #ifdef CONFIG_PPC | 1135 | #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE |
1131 | #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ | 1136 | #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ |
1132 | #else /* #ifdef CONFIG_PPC */ | 1137 | #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ |
1133 | #define smp_mb__after_unlock_lock() do { } while (0) | 1138 | #define smp_mb__after_unlock_lock() do { } while (0) |
1134 | #endif /* #else #ifdef CONFIG_PPC */ | 1139 | #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ |
1135 | 1140 | ||
1136 | 1141 | ||
1137 | #endif /* __LINUX_RCUPDATE_H */ | 1142 | #endif /* __LINUX_RCUPDATE_H */ |
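The change above keys the definition off a generic CONFIG_ARCH_WEAK_RELEASE_ACQUIRE symbol instead of CONFIG_PPC: architectures whose UNLOCK+LOCK pairs provide only release/acquire ordering promote the pair to full ordering with an explicit barrier, and everyone else compiles the macro away. A portable C11 sketch of the same pattern, where ARCH_WEAK_RELEASE_ACQUIRE and the fence are stand-ins for the kernel symbols:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#ifdef ARCH_WEAK_RELEASE_ACQUIRE	/* hypothetical stand-in for the Kconfig symbol */
#define smp_mb_after_unlock_lock() atomic_thread_fence(memory_order_seq_cst)
#else
#define smp_mb_after_unlock_lock() do { } while (0)
#endif

static pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
static int x, y;

int main(void)
{
	pthread_mutex_lock(&a);
	x = 1;
	pthread_mutex_unlock(&a);

	pthread_mutex_lock(&b);		/* UNLOCK of a followed by LOCK of b ... */
	smp_mb_after_unlock_lock();	/* ... promoted to full ordering if needed */
	y = x;
	pthread_mutex_unlock(&b);

	printf("x=%d y=%d\n", x, y);
	return 0;
}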
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b452953e21c8..74d9c3a1feee 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h | |||
@@ -33,6 +33,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) | |||
33 | return 0; | 33 | return 0; |
34 | } | 34 | } |
35 | 35 | ||
36 | static inline bool rcu_eqs_special_set(int cpu) | ||
37 | { | ||
38 | return false; /* Never flag non-existent other CPUs! */ | ||
39 | } | ||
40 | |||
36 | static inline unsigned long get_state_synchronize_rcu(void) | 41 | static inline unsigned long get_state_synchronize_rcu(void) |
37 | { | 42 | { |
38 | return 0; | 43 | return 0; |
@@ -87,10 +92,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, | |||
87 | call_rcu(head, func); | 92 | call_rcu(head, func); |
88 | } | 93 | } |
89 | 94 | ||
90 | static inline void rcu_note_context_switch(void) | 95 | #define rcu_note_context_switch(preempt) \ |
91 | { | 96 | do { \ |
92 | rcu_sched_qs(); | 97 | rcu_sched_qs(); \ |
93 | } | 98 | rcu_note_voluntary_context_switch_lite(current); \ |
99 | } while (0) | ||
94 | 100 | ||
95 | /* | 101 | /* |
96 | * Take advantage of the fact that there is only one CPU, which | 102 | * Take advantage of the fact that there is only one CPU, which |
@@ -212,14 +218,14 @@ static inline void exit_rcu(void) | |||
212 | { | 218 | { |
213 | } | 219 | } |
214 | 220 | ||
215 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 221 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) |
216 | extern int rcu_scheduler_active __read_mostly; | 222 | extern int rcu_scheduler_active __read_mostly; |
217 | void rcu_scheduler_starting(void); | 223 | void rcu_scheduler_starting(void); |
218 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 224 | #else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ |
219 | static inline void rcu_scheduler_starting(void) | 225 | static inline void rcu_scheduler_starting(void) |
220 | { | 226 | { |
221 | } | 227 | } |
222 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 228 | #endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ |
223 | 229 | ||
224 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) | 230 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) |
225 | 231 | ||
@@ -237,6 +243,10 @@ static inline bool rcu_is_watching(void) | |||
237 | 243 | ||
238 | #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ | 244 | #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ |
239 | 245 | ||
246 | static inline void rcu_request_urgent_qs_task(struct task_struct *t) | ||
247 | { | ||
248 | } | ||
249 | |||
240 | static inline void rcu_all_qs(void) | 250 | static inline void rcu_all_qs(void) |
241 | { | 251 | { |
242 | barrier(); /* Avoid RCU read-side critical sections leaking across. */ | 252 | barrier(); /* Avoid RCU read-side critical sections leaking across. */ |
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 63a4e4cf40a5..0bacb6b2af69 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h | |||
@@ -30,7 +30,7 @@ | |||
30 | #ifndef __LINUX_RCUTREE_H | 30 | #ifndef __LINUX_RCUTREE_H |
31 | #define __LINUX_RCUTREE_H | 31 | #define __LINUX_RCUTREE_H |
32 | 32 | ||
33 | void rcu_note_context_switch(void); | 33 | void rcu_note_context_switch(bool preempt); |
34 | int rcu_needs_cpu(u64 basem, u64 *nextevt); | 34 | int rcu_needs_cpu(u64 basem, u64 *nextevt); |
35 | void rcu_cpu_stall_reset(void); | 35 | void rcu_cpu_stall_reset(void); |
36 | 36 | ||
@@ -41,7 +41,7 @@ void rcu_cpu_stall_reset(void); | |||
41 | */ | 41 | */ |
42 | static inline void rcu_virt_note_context_switch(int cpu) | 42 | static inline void rcu_virt_note_context_switch(int cpu) |
43 | { | 43 | { |
44 | rcu_note_context_switch(); | 44 | rcu_note_context_switch(false); |
45 | } | 45 | } |
46 | 46 | ||
47 | void synchronize_rcu_bh(void); | 47 | void synchronize_rcu_bh(void); |
@@ -108,6 +108,7 @@ void rcu_scheduler_starting(void); | |||
108 | extern int rcu_scheduler_active __read_mostly; | 108 | extern int rcu_scheduler_active __read_mostly; |
109 | 109 | ||
110 | bool rcu_is_watching(void); | 110 | bool rcu_is_watching(void); |
111 | void rcu_request_urgent_qs_task(struct task_struct *t); | ||
111 | 112 | ||
112 | void rcu_all_qs(void); | 113 | void rcu_all_qs(void); |
113 | 114 | ||
diff --git a/include/linux/slab.h b/include/linux/slab.h index 3c37a8c51921..04a7f7993e67 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -28,7 +28,7 @@ | |||
28 | #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ | 28 | #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ |
29 | #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ | 29 | #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ |
30 | /* | 30 | /* |
31 | * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS! | 31 | * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! |
32 | * | 32 | * |
33 | * This delays freeing the SLAB page by a grace period, it does _NOT_ | 33 | * This delays freeing the SLAB page by a grace period, it does _NOT_ |
34 | * delay object freeing. This means that if you do kmem_cache_free() | 34 | * delay object freeing. This means that if you do kmem_cache_free() |
@@ -61,8 +61,10 @@ | |||
61 | * | 61 | * |
62 | * rcu_read_lock before reading the address, then rcu_read_unlock after | 62 | * rcu_read_lock before reading the address, then rcu_read_unlock after |
63 | * taking the spinlock within the structure expected at that address. | 63 | * taking the spinlock within the structure expected at that address. |
64 | * | ||
65 | * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. | ||
64 | */ | 66 | */ |
65 | #define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ | 67 | #define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ |
66 | #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ | 68 | #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ |
67 | #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ | 69 | #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ |
68 | 70 | ||
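The renamed flag's comment above describes the required lookup discipline: because only the slab page, not the object, is RCU-deferred, a pointer obtained under rcu_read_lock() always refers to memory of the right type but possibly to a recycled object, so the reader must lock the object and then re-check its identity. A user-space sketch of that discipline with hypothetical types (not the slab API), where "freeing" recycles the slot in place:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	pthread_mutex_t lock;
	int key;		/* identity; changes when the object is reused */
	bool in_use;
};

/* One slab-like slot that is recycled rather than returned to malloc. */
static struct obj slot = { .lock = PTHREAD_MUTEX_INITIALIZER };

static struct obj *alloc_obj(int key)
{
	pthread_mutex_lock(&slot.lock);
	slot.key = key;
	slot.in_use = true;
	pthread_mutex_unlock(&slot.lock);
	return &slot;
}

static void free_obj(struct obj *o)
{
	pthread_mutex_lock(&o->lock);
	o->in_use = false;	/* memory stays valid and correctly typed */
	pthread_mutex_unlock(&o->lock);
}

/* Reader side: would run under rcu_read_lock() in the kernel. */
static bool lookup(struct obj *o, int key)
{
	bool ok;

	pthread_mutex_lock(&o->lock);
	ok = o->in_use && o->key == key;	/* revalidate identity */
	pthread_mutex_unlock(&o->lock);
	return ok;
}

int main(void)
{
	struct obj *o = alloc_obj(42);

	printf("lookup 42: %d\n", lookup(o, 42));	/* 1: still ours */
	free_obj(o);
	alloc_obj(99);					/* slot recycled */
	printf("lookup 42: %d\n", lookup(o, 42));	/* 0: reused, bail out */
	return 0;
}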
diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a598cf3ac70c..167ad8831aaf 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h | |||
@@ -22,7 +22,7 @@ | |||
22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | 22 | * Lai Jiangshan <laijs@cn.fujitsu.com> |
23 | * | 23 | * |
24 | * For detailed explanation of Read-Copy Update mechanism see - | 24 | * For detailed explanation of Read-Copy Update mechanism see - |
25 | * Documentation/RCU/ *.txt | 25 | * Documentation/RCU/ *.txt |
26 | * | 26 | * |
27 | */ | 27 | */ |
28 | 28 | ||
@@ -32,35 +32,9 @@ | |||
32 | #include <linux/mutex.h> | 32 | #include <linux/mutex.h> |
33 | #include <linux/rcupdate.h> | 33 | #include <linux/rcupdate.h> |
34 | #include <linux/workqueue.h> | 34 | #include <linux/workqueue.h> |
35 | #include <linux/rcu_segcblist.h> | ||
35 | 36 | ||
36 | struct srcu_array { | 37 | struct srcu_struct; |
37 | unsigned long lock_count[2]; | ||
38 | unsigned long unlock_count[2]; | ||
39 | }; | ||
40 | |||
41 | struct rcu_batch { | ||
42 | struct rcu_head *head, **tail; | ||
43 | }; | ||
44 | |||
45 | #define RCU_BATCH_INIT(name) { NULL, &(name.head) } | ||
46 | |||
47 | struct srcu_struct { | ||
48 | unsigned long completed; | ||
49 | struct srcu_array __percpu *per_cpu_ref; | ||
50 | spinlock_t queue_lock; /* protect ->batch_queue, ->running */ | ||
51 | bool running; | ||
52 | /* callbacks just queued */ | ||
53 | struct rcu_batch batch_queue; | ||
54 | /* callbacks try to do the first check_zero */ | ||
55 | struct rcu_batch batch_check0; | ||
56 | /* callbacks done with the first check_zero and the flip */ | ||
57 | struct rcu_batch batch_check1; | ||
58 | struct rcu_batch batch_done; | ||
59 | struct delayed_work work; | ||
60 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
61 | struct lockdep_map dep_map; | ||
62 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
63 | }; | ||
64 | 38 | ||
65 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 39 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
66 | 40 | ||
@@ -82,46 +56,15 @@ int init_srcu_struct(struct srcu_struct *sp); | |||
82 | #define __SRCU_DEP_MAP_INIT(srcu_name) | 56 | #define __SRCU_DEP_MAP_INIT(srcu_name) |
83 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 57 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
84 | 58 | ||
85 | void process_srcu(struct work_struct *work); | 59 | #ifdef CONFIG_TINY_SRCU |
86 | 60 | #include <linux/srcutiny.h> | |
87 | #define __SRCU_STRUCT_INIT(name) \ | 61 | #elif defined(CONFIG_TREE_SRCU) |
88 | { \ | 62 | #include <linux/srcutree.h> |
89 | .completed = -300, \ | 63 | #elif defined(CONFIG_CLASSIC_SRCU) |
90 | .per_cpu_ref = &name##_srcu_array, \ | 64 | #include <linux/srcuclassic.h> |
91 | .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ | 65 | #else |
92 | .running = false, \ | 66 | #error "Unknown SRCU implementation specified to kernel configuration" |
93 | .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ | 67 | #endif |
94 | .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ | ||
95 | .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ | ||
96 | .batch_done = RCU_BATCH_INIT(name.batch_done), \ | ||
97 | .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ | ||
98 | __SRCU_DEP_MAP_INIT(name) \ | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * Define and initialize a srcu struct at build time. | ||
103 | * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. | ||
104 | * | ||
105 | * Note that although DEFINE_STATIC_SRCU() hides the name from other | ||
106 | * files, the per-CPU variable rules nevertheless require that the | ||
107 | * chosen name be globally unique. These rules also prohibit use of | ||
108 | * DEFINE_STATIC_SRCU() within a function. If these rules are too | ||
109 | * restrictive, declare the srcu_struct manually. For example, in | ||
110 | * each file: | ||
111 | * | ||
112 | * static struct srcu_struct my_srcu; | ||
113 | * | ||
114 | * Then, before the first use of each my_srcu, manually initialize it: | ||
115 | * | ||
116 | * init_srcu_struct(&my_srcu); | ||
117 | * | ||
118 | * See include/linux/percpu-defs.h for the rules on per-CPU variables. | ||
119 | */ | ||
120 | #define __DEFINE_SRCU(name, is_static) \ | ||
121 | static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ | ||
122 | is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) | ||
123 | #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) | ||
124 | #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) | ||
125 | 68 | ||
126 | /** | 69 | /** |
127 | * call_srcu() - Queue a callback for invocation after an SRCU grace period | 70 | * call_srcu() - Queue a callback for invocation after an SRCU grace period |
@@ -147,9 +90,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp); | |||
147 | int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); | 90 | int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); |
148 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); | 91 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); |
149 | void synchronize_srcu(struct srcu_struct *sp); | 92 | void synchronize_srcu(struct srcu_struct *sp); |
150 | void synchronize_srcu_expedited(struct srcu_struct *sp); | ||
151 | unsigned long srcu_batches_completed(struct srcu_struct *sp); | ||
152 | void srcu_barrier(struct srcu_struct *sp); | ||
153 | 93 | ||
154 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 94 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
155 | 95 | ||
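The include block added above selects exactly one SRCU implementation at build time and refuses to build if none is configured. A self-contained sketch of that selection pattern, with placeholder config names and the "headers" inlined so it compiles as a single file:

#include <stdio.h>

#define CONFIG_IMPL_TREE 1	/* pretend the build system selected this one */

#if defined(CONFIG_IMPL_TINY)
static const char *impl(void) { return "tiny"; }	/* single-CPU, minimal */
#elif defined(CONFIG_IMPL_TREE)
static const char *impl(void) { return "tree"; }	/* scalable combining tree */
#elif defined(CONFIG_IMPL_CLASSIC)
static const char *impl(void) { return "classic"; }	/* previous, well-tested */
#else
#error "Unknown implementation specified to the build configuration"
#endif

int main(void)
{
	printf("selected implementation: %s\n", impl());
	return 0;
}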
diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h new file mode 100644 index 000000000000..41cf99930f34 --- /dev/null +++ b/include/linux/srcuclassic.h | |||
@@ -0,0 +1,101 @@ | |||
1 | /* | ||
2 | * Sleepable Read-Copy Update mechanism for mutual exclusion, | ||
3 | * classic v4.11 variant. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, you can access it online at | ||
17 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2017 | ||
20 | * | ||
21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
22 | */ | ||
23 | |||
24 | #ifndef _LINUX_SRCU_CLASSIC_H | ||
25 | #define _LINUX_SRCU_CLASSIC_H | ||
26 | |||
27 | struct srcu_array { | ||
28 | unsigned long lock_count[2]; | ||
29 | unsigned long unlock_count[2]; | ||
30 | }; | ||
31 | |||
32 | struct rcu_batch { | ||
33 | struct rcu_head *head, **tail; | ||
34 | }; | ||
35 | |||
36 | #define RCU_BATCH_INIT(name) { NULL, &(name.head) } | ||
37 | |||
38 | struct srcu_struct { | ||
39 | unsigned long completed; | ||
40 | struct srcu_array __percpu *per_cpu_ref; | ||
41 | spinlock_t queue_lock; /* protect ->batch_queue, ->running */ | ||
42 | bool running; | ||
43 | /* callbacks just queued */ | ||
44 | struct rcu_batch batch_queue; | ||
45 | /* callbacks try to do the first check_zero */ | ||
46 | struct rcu_batch batch_check0; | ||
47 | /* callbacks done with the first check_zero and the flip */ | ||
48 | struct rcu_batch batch_check1; | ||
49 | struct rcu_batch batch_done; | ||
50 | struct delayed_work work; | ||
51 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
52 | struct lockdep_map dep_map; | ||
53 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
54 | }; | ||
55 | |||
56 | void process_srcu(struct work_struct *work); | ||
57 | |||
58 | #define __SRCU_STRUCT_INIT(name) \ | ||
59 | { \ | ||
60 | .completed = -300, \ | ||
61 | .per_cpu_ref = &name##_srcu_array, \ | ||
62 | .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ | ||
63 | .running = false, \ | ||
64 | .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ | ||
65 | .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ | ||
66 | .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ | ||
67 | .batch_done = RCU_BATCH_INIT(name.batch_done), \ | ||
68 | .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ | ||
69 | __SRCU_DEP_MAP_INIT(name) \ | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * Define and initialize a srcu struct at build time. | ||
74 | * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. | ||
75 | * | ||
76 | * Note that although DEFINE_STATIC_SRCU() hides the name from other | ||
77 | * files, the per-CPU variable rules nevertheless require that the | ||
78 | * chosen name be globally unique. These rules also prohibit use of | ||
79 | * DEFINE_STATIC_SRCU() within a function. If these rules are too | ||
80 | * restrictive, declare the srcu_struct manually. For example, in | ||
81 | * each file: | ||
82 | * | ||
83 | * static struct srcu_struct my_srcu; | ||
84 | * | ||
85 | * Then, before the first use of each my_srcu, manually initialize it: | ||
86 | * | ||
87 | * init_srcu_struct(&my_srcu); | ||
88 | * | ||
89 | * See include/linux/percpu-defs.h for the rules on per-CPU variables. | ||
90 | */ | ||
91 | #define __DEFINE_SRCU(name, is_static) \ | ||
92 | static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ | ||
93 | is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) | ||
94 | #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) | ||
95 | #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) | ||
96 | |||
97 | void synchronize_srcu_expedited(struct srcu_struct *sp); | ||
98 | void srcu_barrier(struct srcu_struct *sp); | ||
99 | unsigned long srcu_batches_completed(struct srcu_struct *sp); | ||
100 | |||
101 | #endif | ||
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h new file mode 100644 index 000000000000..4f284e4f4d8c --- /dev/null +++ b/include/linux/srcutiny.h | |||
@@ -0,0 +1,81 @@ | |||
1 | /* | ||
2 | * Sleepable Read-Copy Update mechanism for mutual exclusion, | ||
3 | * tiny variant. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, you can access it online at | ||
17 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2017 | ||
20 | * | ||
21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
22 | */ | ||
23 | |||
24 | #ifndef _LINUX_SRCU_TINY_H | ||
25 | #define _LINUX_SRCU_TINY_H | ||
26 | |||
27 | #include <linux/swait.h> | ||
28 | |||
29 | struct srcu_struct { | ||
30 | int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ | ||
31 | struct swait_queue_head srcu_wq; | ||
32 | /* Last srcu_read_unlock() wakes GP. */ | ||
33 | unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */ | ||
34 | struct rcu_segcblist srcu_cblist; | ||
35 | /* Pending SRCU callbacks. */ | ||
36 | int srcu_idx; /* Current reader array element. */ | ||
37 | bool srcu_gp_running; /* GP workqueue running? */ | ||
38 | bool srcu_gp_waiting; /* GP waiting for readers? */ | ||
39 | struct work_struct srcu_work; /* For driving grace periods. */ | ||
40 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
41 | struct lockdep_map dep_map; | ||
42 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
43 | }; | ||
44 | |||
45 | void srcu_drive_gp(struct work_struct *wp); | ||
46 | |||
47 | #define __SRCU_STRUCT_INIT(name) \ | ||
48 | { \ | ||
49 | .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ | ||
50 | .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \ | ||
51 | .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \ | ||
52 | __SRCU_DEP_MAP_INIT(name) \ | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * This odd _STATIC_ arrangement is needed for API compatibility with | ||
57 | * Tree SRCU, which needs some per-CPU data. | ||
58 | */ | ||
59 | #define DEFINE_SRCU(name) \ | ||
60 | struct srcu_struct name = __SRCU_STRUCT_INIT(name) | ||
61 | #define DEFINE_STATIC_SRCU(name) \ | ||
62 | static struct srcu_struct name = __SRCU_STRUCT_INIT(name) | ||
63 | |||
64 | void synchronize_srcu(struct srcu_struct *sp); | ||
65 | |||
66 | static inline void synchronize_srcu_expedited(struct srcu_struct *sp) | ||
67 | { | ||
68 | synchronize_srcu(sp); | ||
69 | } | ||
70 | |||
71 | static inline void srcu_barrier(struct srcu_struct *sp) | ||
72 | { | ||
73 | synchronize_srcu(sp); | ||
74 | } | ||
75 | |||
76 | static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) | ||
77 | { | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | #endif | ||
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h new file mode 100644 index 000000000000..0400e211aa44 --- /dev/null +++ b/include/linux/srcutree.h | |||
@@ -0,0 +1,139 @@ | |||
1 | /* | ||
2 | * Sleepable Read-Copy Update mechanism for mutual exclusion, | ||
3 | * tree variant. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, you can access it online at | ||
17 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2017 | ||
20 | * | ||
21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
22 | */ | ||
23 | |||
24 | #ifndef _LINUX_SRCU_TREE_H | ||
25 | #define _LINUX_SRCU_TREE_H | ||
26 | |||
27 | #include <linux/rcu_node_tree.h> | ||
28 | #include <linux/completion.h> | ||
29 | |||
30 | struct srcu_node; | ||
31 | struct srcu_struct; | ||
32 | |||
33 | /* | ||
34 | * Per-CPU structure feeding into leaf srcu_node, similar in function | ||
35 | * to rcu_node. | ||
36 | */ | ||
37 | struct srcu_data { | ||
38 | /* Read-side state. */ | ||
39 | unsigned long srcu_lock_count[2]; /* Locks per CPU. */ | ||
40 | unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ | ||
41 | |||
42 | /* Update-side state. */ | ||
43 | spinlock_t lock ____cacheline_internodealigned_in_smp; | ||
44 | struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ | ||
45 | unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ | ||
46 | bool srcu_cblist_invoking; /* Invoking these CBs? */ | ||
47 | struct delayed_work work; /* Context for CB invoking. */ | ||
48 | struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ | ||
49 | struct srcu_node *mynode; /* Leaf srcu_node. */ | ||
50 | int cpu; | ||
51 | struct srcu_struct *sp; | ||
52 | }; | ||
53 | |||
54 | /* | ||
55 | * Node in SRCU combining tree, similar in function to rcu_data. | ||
56 | */ | ||
57 | struct srcu_node { | ||
58 | spinlock_t lock; | ||
59 | unsigned long srcu_have_cbs[4]; /* GP seq for children */ | ||
60 | /* having CBs, but only */ | ||
61 | /* is > ->srcu_gp_seq. */ | ||
62 | struct srcu_node *srcu_parent; /* Next up in tree. */ | ||
63 | int grplo; /* Least CPU for node. */ | ||
64 | int grphi; /* Biggest CPU for node. */ | ||
65 | }; | ||
66 | |||
67 | /* | ||
68 | * Per-SRCU-domain structure, similar in function to rcu_state. | ||
69 | */ | ||
70 | struct srcu_struct { | ||
71 | struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ | ||
72 | struct srcu_node *level[RCU_NUM_LVLS + 1]; | ||
73 | /* First node at each level. */ | ||
74 | struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ | ||
75 | spinlock_t gp_lock; /* protect ->srcu_cblist */ | ||
76 | struct mutex srcu_gp_mutex; /* Serialize GP work. */ | ||
77 | unsigned int srcu_idx; /* Current rdr array element. */ | ||
78 | unsigned long srcu_gp_seq; /* Grace-period seq #. */ | ||
79 | unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ | ||
80 | atomic_t srcu_exp_cnt; /* # ongoing expedited GPs. */ | ||
81 | struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ | ||
82 | unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ | ||
83 | struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ | ||
84 | struct completion srcu_barrier_completion; | ||
85 | /* Awaken barrier rq at end. */ | ||
86 | atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ | ||
87 | /* callback for the barrier */ | ||
88 | /* operation. */ | ||
89 | struct delayed_work work; | ||
90 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
91 | struct lockdep_map dep_map; | ||
92 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
93 | }; | ||
94 | |||
95 | /* Values for state variable (bottom bits of ->srcu_gp_seq). */ | ||
96 | #define SRCU_STATE_IDLE 0 | ||
97 | #define SRCU_STATE_SCAN1 1 | ||
98 | #define SRCU_STATE_SCAN2 2 | ||
99 | |||
100 | void process_srcu(struct work_struct *work); | ||
101 | |||
102 | #define __SRCU_STRUCT_INIT(name) \ | ||
103 | { \ | ||
104 | .sda = &name##_srcu_data, \ | ||
105 | .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \ | ||
106 | .srcu_gp_seq_needed = 0 - 1, \ | ||
107 | __SRCU_DEP_MAP_INIT(name) \ | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * Define and initialize a srcu struct at build time. | ||
112 | * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. | ||
113 | * | ||
114 | * Note that although DEFINE_STATIC_SRCU() hides the name from other | ||
115 | * files, the per-CPU variable rules nevertheless require that the | ||
116 | * chosen name be globally unique. These rules also prohibit use of | ||
117 | * DEFINE_STATIC_SRCU() within a function. If these rules are too | ||
118 | * restrictive, declare the srcu_struct manually. For example, in | ||
119 | * each file: | ||
120 | * | ||
121 | * static struct srcu_struct my_srcu; | ||
122 | * | ||
123 | * Then, before the first use of each my_srcu, manually initialize it: | ||
124 | * | ||
125 | * init_srcu_struct(&my_srcu); | ||
126 | * | ||
127 | * See include/linux/percpu-defs.h for the rules on per-CPU variables. | ||
128 | */ | ||
129 | #define __DEFINE_SRCU(name, is_static) \ | ||
130 | static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\ | ||
131 | is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) | ||
132 | #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) | ||
133 | #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) | ||
134 | |||
135 | void synchronize_srcu_expedited(struct srcu_struct *sp); | ||
136 | void srcu_barrier(struct srcu_struct *sp); | ||
137 | unsigned long srcu_batches_completed(struct srcu_struct *sp); | ||
138 | |||
139 | #endif | ||
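The per-CPU srcu_lock_count[]/srcu_unlock_count[] pairs above are what the grace-period machinery sums to decide when readers on a given index slot have drained. A simplified single-threaded model of that check (the kernel additionally needs memory barriers and overflow reasoning not shown here):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

struct srcu_data_model {
	unsigned long lock_count[2];
	unsigned long unlock_count[2];
};

static struct srcu_data_model sda[NR_CPUS];

/* Readers on slot "idx" are done once unlocks have caught up with locks. */
static bool readers_done(int idx)
{
	unsigned long locks = 0, unlocks = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		unlocks += sda[cpu].unlock_count[idx];
	/* kernel: smp_mb() here so unlock reads precede lock reads */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		locks += sda[cpu].lock_count[idx];
	return unlocks == locks;
}

int main(void)
{
	sda[1].lock_count[0] = 2;	/* a reader entered twice on CPU 1 */
	sda[3].unlock_count[0] = 1;	/* ... but has exited only once, on CPU 3 */
	printf("idx 0 done? %d\n", readers_done(0));	/* 0 */
	sda[3].unlock_count[0] = 2;
	printf("idx 0 done? %d\n", readers_done(0));	/* 1 */
	return 0;
}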
diff --git a/include/linux/types.h b/include/linux/types.h index 1e7bd24848fc..258099a4ed82 100644 --- a/include/linux/types.h +++ b/include/linux/types.h | |||
@@ -209,7 +209,7 @@ struct ustat { | |||
209 | * naturally due ABI requirements, but some architectures (like CRIS) have | 209 | * naturally due ABI requirements, but some architectures (like CRIS) have |
210 | * weird ABI and we need to ask it explicitly. | 210 | * weird ABI and we need to ask it explicitly. |
211 | * | 211 | * |
212 | * The alignment is required to guarantee that bits 0 and 1 of @next will be | 212 | * The alignment is required to guarantee that bit 0 of @next will be |
213 | * clear under normal conditions -- as long as we use call_rcu(), | 213 | * clear under normal conditions -- as long as we use call_rcu(), |
214 | * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. | 214 | * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. |
215 | * | 215 | * |
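The comment fixed above is about alignment guaranteeing that bit 0 of a queued callback's ->next pointer is clear, which lets that bit be borrowed as a flag. A stand-alone sketch of the idea (not the kernel's callback_head handling):

#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

struct cb {
	struct cb *next;
	void (*func)(struct cb *);
};

static inline struct cb *tag(struct cb *p)	/* set bit 0 */
{
	return (struct cb *)((uintptr_t)p | 1UL);
}

static inline struct cb *untag(struct cb *p)	/* clear bit 0 */
{
	return (struct cb *)((uintptr_t)p & ~1UL);
}

int main(void)
{
	static struct cb head;	/* alignof(struct cb) >= alignof(void *), so bit 0 is free */
	struct cb *p = tag(&head);

	printf("alignment: %zu, tagged bit: %lu, same object: %d\n",
	       alignof(struct cb),
	       (unsigned long)((uintptr_t)p & 1),
	       untag(p) == &head);
	return 0;
}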
diff --git a/include/net/sock.h b/include/net/sock.h index 03252d53975d..c092f2437546 100644 --- a/include/net/sock.h +++ b/include/net/sock.h | |||
@@ -995,7 +995,7 @@ struct smc_hashinfo; | |||
995 | struct module; | 995 | struct module; |
996 | 996 | ||
997 | /* | 997 | /* |
998 | * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes | 998 | * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes |
999 | * un-modified. Special care is taken when initializing object to zero. | 999 | * un-modified. Special care is taken when initializing object to zero. |
1000 | */ | 1000 | */ |
1001 | static inline void sk_prot_clear_nulls(struct sock *sk, int size) | 1001 | static inline void sk_prot_clear_nulls(struct sock *sk, int size) |
diff --git a/init/Kconfig b/init/Kconfig index a92f27da4a27..4119a44e4157 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -526,6 +526,35 @@ config SRCU | |||
526 | permits arbitrary sleeping or blocking within RCU read-side critical | 526 | permits arbitrary sleeping or blocking within RCU read-side critical |
527 | sections. | 527 | sections. |
528 | 528 | ||
529 | config CLASSIC_SRCU | ||
530 | bool "Use v4.11 classic SRCU implementation" | ||
531 | default n | ||
532 | depends on RCU_EXPERT && SRCU | ||
533 | help | ||
534 | This option selects the traditional well-tested classic SRCU | ||
535 | implementation from v4.11, as might be desired for enterprise | ||
536 | Linux distributions. Without this option, the shiny new | ||
537 | Tiny SRCU and Tree SRCU implementations are used instead. | ||
538 | At some point, it is hoped that Tiny SRCU and Tree SRCU | ||
539 | will accumulate enough test time and confidence to allow | ||
540 | Classic SRCU to be dropped entirely. | ||
541 | |||
542 | Say Y if you need a rock-solid SRCU. | ||
543 | |||
544 | Say N if you would like to help test Tree SRCU. | ||
545 | |||
546 | config TINY_SRCU | ||
547 | bool | ||
548 | default y if TINY_RCU && !CLASSIC_SRCU | ||
549 | help | ||
550 | This option selects the single-CPU non-preemptible version of SRCU. | ||
551 | |||
552 | config TREE_SRCU | ||
553 | bool | ||
554 | default y if !TINY_RCU && !CLASSIC_SRCU | ||
555 | help | ||
556 | This option selects the full-fledged version of SRCU. | ||
557 | |||
529 | config TASKS_RCU | 558 | config TASKS_RCU |
530 | bool | 559 | bool |
531 | default n | 560 | default n |
@@ -612,11 +641,17 @@ config RCU_FANOUT_LEAF | |||
612 | initialization. These systems tend to run CPU-bound, and thus | 641 | initialization. These systems tend to run CPU-bound, and thus |
613 | are not helped by synchronized interrupts, and thus tend to | 642 | are not helped by synchronized interrupts, and thus tend to |
614 | skew them, which reduces lock contention enough that large | 643 | skew them, which reduces lock contention enough that large |
615 | leaf-level fanouts work well. | 644 | leaf-level fanouts work well. That said, setting leaf-level |
645 | fanout to a large number will likely cause problematic | ||
646 | lock contention on the leaf-level rcu_node structures unless | ||
647 | you boot with the skew_tick kernel parameter. | ||
616 | 648 | ||
617 | Select a specific number if testing RCU itself. | 649 | Select a specific number if testing RCU itself. |
618 | 650 | ||
619 | Select the maximum permissible value for large systems. | 651 | Select the maximum permissible value for large systems, but |
652 | please understand that you may also need to set the skew_tick | ||
653 | kernel boot parameter to avoid contention on the rcu_node | ||
654 | structure's locks. | ||
620 | 655 | ||
621 | Take the default if unsure. | 656 | Take the default if unsure. |
622 | 657 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93d..9330ce24f1bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1313,7 +1313,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) | |||
1313 | if (atomic_dec_and_test(&sighand->count)) { | 1313 | if (atomic_dec_and_test(&sighand->count)) { |
1314 | signalfd_cleanup(sighand); | 1314 | signalfd_cleanup(sighand); |
1315 | /* | 1315 | /* |
1316 | * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it | 1316 | * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it |
1317 | * without an RCU grace period, see __lock_task_sighand(). | 1317 | * without an RCU grace period, see __lock_task_sighand(). |
1318 | */ | 1318 | */ |
1319 | kmem_cache_free(sighand_cachep, sighand); | 1319 | kmem_cache_free(sighand_cachep, sighand); |
@@ -2144,7 +2144,7 @@ void __init proc_caches_init(void) | |||
2144 | { | 2144 | { |
2145 | sighand_cachep = kmem_cache_create("sighand_cache", | 2145 | sighand_cachep = kmem_cache_create("sighand_cache", |
2146 | sizeof(struct sighand_struct), 0, | 2146 | sizeof(struct sighand_struct), 0, |
2147 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| | 2147 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| |
2148 | SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); | 2148 | SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); |
2149 | signal_cachep = kmem_cache_create("signal_cache", | 2149 | signal_cachep = kmem_cache_create("signal_cache", |
2150 | sizeof(struct signal_struct), 0, | 2150 | sizeof(struct signal_struct), 0, |
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a95e5d1f4a9c..e9d4f85b290c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -1144,10 +1144,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, | |||
1144 | return 0; | 1144 | return 0; |
1145 | 1145 | ||
1146 | printk("\n"); | 1146 | printk("\n"); |
1147 | printk("======================================================\n"); | 1147 | pr_warn("======================================================\n"); |
1148 | printk("[ INFO: possible circular locking dependency detected ]\n"); | 1148 | pr_warn("WARNING: possible circular locking dependency detected\n"); |
1149 | print_kernel_ident(); | 1149 | print_kernel_ident(); |
1150 | printk("-------------------------------------------------------\n"); | 1150 | pr_warn("------------------------------------------------------\n"); |
1151 | printk("%s/%d is trying to acquire lock:\n", | 1151 | printk("%s/%d is trying to acquire lock:\n", |
1152 | curr->comm, task_pid_nr(curr)); | 1152 | curr->comm, task_pid_nr(curr)); |
1153 | print_lock(check_src); | 1153 | print_lock(check_src); |
@@ -1482,11 +1482,11 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
1482 | return 0; | 1482 | return 0; |
1483 | 1483 | ||
1484 | printk("\n"); | 1484 | printk("\n"); |
1485 | printk("======================================================\n"); | 1485 | pr_warn("=====================================================\n"); |
1486 | printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", | 1486 | pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", |
1487 | irqclass, irqclass); | 1487 | irqclass, irqclass); |
1488 | print_kernel_ident(); | 1488 | print_kernel_ident(); |
1489 | printk("------------------------------------------------------\n"); | 1489 | pr_warn("-----------------------------------------------------\n"); |
1490 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", | 1490 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", |
1491 | curr->comm, task_pid_nr(curr), | 1491 | curr->comm, task_pid_nr(curr), |
1492 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, | 1492 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, |
@@ -1711,10 +1711,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
1711 | return 0; | 1711 | return 0; |
1712 | 1712 | ||
1713 | printk("\n"); | 1713 | printk("\n"); |
1714 | printk("=============================================\n"); | 1714 | pr_warn("============================================\n"); |
1715 | printk("[ INFO: possible recursive locking detected ]\n"); | 1715 | pr_warn("WARNING: possible recursive locking detected\n"); |
1716 | print_kernel_ident(); | 1716 | print_kernel_ident(); |
1717 | printk("---------------------------------------------\n"); | 1717 | pr_warn("--------------------------------------------\n"); |
1718 | printk("%s/%d is trying to acquire lock:\n", | 1718 | printk("%s/%d is trying to acquire lock:\n", |
1719 | curr->comm, task_pid_nr(curr)); | 1719 | curr->comm, task_pid_nr(curr)); |
1720 | print_lock(next); | 1720 | print_lock(next); |
@@ -2061,10 +2061,10 @@ static void print_collision(struct task_struct *curr, | |||
2061 | struct lock_chain *chain) | 2061 | struct lock_chain *chain) |
2062 | { | 2062 | { |
2063 | printk("\n"); | 2063 | printk("\n"); |
2064 | printk("======================\n"); | 2064 | pr_warn("============================\n"); |
2065 | printk("[chain_key collision ]\n"); | 2065 | pr_warn("WARNING: chain_key collision\n"); |
2066 | print_kernel_ident(); | 2066 | print_kernel_ident(); |
2067 | printk("----------------------\n"); | 2067 | pr_warn("----------------------------\n"); |
2068 | printk("%s/%d: ", current->comm, task_pid_nr(current)); | 2068 | printk("%s/%d: ", current->comm, task_pid_nr(current)); |
2069 | printk("Hash chain already cached but the contents don't match!\n"); | 2069 | printk("Hash chain already cached but the contents don't match!\n"); |
2070 | 2070 | ||
@@ -2360,10 +2360,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
2360 | return 0; | 2360 | return 0; |
2361 | 2361 | ||
2362 | printk("\n"); | 2362 | printk("\n"); |
2363 | printk("=================================\n"); | 2363 | pr_warn("================================\n"); |
2364 | printk("[ INFO: inconsistent lock state ]\n"); | 2364 | pr_warn("WARNING: inconsistent lock state\n"); |
2365 | print_kernel_ident(); | 2365 | print_kernel_ident(); |
2366 | printk("---------------------------------\n"); | 2366 | pr_warn("--------------------------------\n"); |
2367 | 2367 | ||
2368 | printk("inconsistent {%s} -> {%s} usage.\n", | 2368 | printk("inconsistent {%s} -> {%s} usage.\n", |
2369 | usage_str[prev_bit], usage_str[new_bit]); | 2369 | usage_str[prev_bit], usage_str[new_bit]); |
@@ -2425,10 +2425,10 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2425 | return 0; | 2425 | return 0; |
2426 | 2426 | ||
2427 | printk("\n"); | 2427 | printk("\n"); |
2428 | printk("=========================================================\n"); | 2428 | pr_warn("========================================================\n"); |
2429 | printk("[ INFO: possible irq lock inversion dependency detected ]\n"); | 2429 | pr_warn("WARNING: possible irq lock inversion dependency detected\n"); |
2430 | print_kernel_ident(); | 2430 | print_kernel_ident(); |
2431 | printk("---------------------------------------------------------\n"); | 2431 | pr_warn("--------------------------------------------------------\n"); |
2432 | printk("%s/%d just changed the state of lock:\n", | 2432 | printk("%s/%d just changed the state of lock:\n", |
2433 | curr->comm, task_pid_nr(curr)); | 2433 | curr->comm, task_pid_nr(curr)); |
2434 | print_lock(this); | 2434 | print_lock(this); |
@@ -3170,10 +3170,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr, | |||
3170 | return 0; | 3170 | return 0; |
3171 | 3171 | ||
3172 | printk("\n"); | 3172 | printk("\n"); |
3173 | printk("==================================\n"); | 3173 | pr_warn("==================================\n"); |
3174 | printk("[ BUG: Nested lock was not taken ]\n"); | 3174 | pr_warn("WARNING: Nested lock was not taken\n"); |
3175 | print_kernel_ident(); | 3175 | print_kernel_ident(); |
3176 | printk("----------------------------------\n"); | 3176 | pr_warn("----------------------------------\n"); |
3177 | 3177 | ||
3178 | printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); | 3178 | printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); |
3179 | print_lock(hlock); | 3179 | print_lock(hlock); |
@@ -3383,10 +3383,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3383 | return 0; | 3383 | return 0; |
3384 | 3384 | ||
3385 | printk("\n"); | 3385 | printk("\n"); |
3386 | printk("=====================================\n"); | 3386 | pr_warn("=====================================\n"); |
3387 | printk("[ BUG: bad unlock balance detected! ]\n"); | 3387 | pr_warn("WARNING: bad unlock balance detected!\n"); |
3388 | print_kernel_ident(); | 3388 | print_kernel_ident(); |
3389 | printk("-------------------------------------\n"); | 3389 | pr_warn("-------------------------------------\n"); |
3390 | printk("%s/%d is trying to release lock (", | 3390 | printk("%s/%d is trying to release lock (", |
3391 | curr->comm, task_pid_nr(curr)); | 3391 | curr->comm, task_pid_nr(curr)); |
3392 | print_lockdep_cache(lock); | 3392 | print_lockdep_cache(lock); |
@@ -3880,10 +3880,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3880 | return 0; | 3880 | return 0; |
3881 | 3881 | ||
3882 | printk("\n"); | 3882 | printk("\n"); |
3883 | printk("=================================\n"); | 3883 | pr_warn("=================================\n"); |
3884 | printk("[ BUG: bad contention detected! ]\n"); | 3884 | pr_warn("WARNING: bad contention detected!\n"); |
3885 | print_kernel_ident(); | 3885 | print_kernel_ident(); |
3886 | printk("---------------------------------\n"); | 3886 | pr_warn("---------------------------------\n"); |
3887 | printk("%s/%d is trying to contend lock (", | 3887 | printk("%s/%d is trying to contend lock (", |
3888 | curr->comm, task_pid_nr(curr)); | 3888 | curr->comm, task_pid_nr(curr)); |
3889 | print_lockdep_cache(lock); | 3889 | print_lockdep_cache(lock); |
@@ -4244,10 +4244,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | |||
4244 | return; | 4244 | return; |
4245 | 4245 | ||
4246 | printk("\n"); | 4246 | printk("\n"); |
4247 | printk("=========================\n"); | 4247 | pr_warn("=========================\n"); |
4248 | printk("[ BUG: held lock freed! ]\n"); | 4248 | pr_warn("WARNING: held lock freed!\n"); |
4249 | print_kernel_ident(); | 4249 | print_kernel_ident(); |
4250 | printk("-------------------------\n"); | 4250 | pr_warn("-------------------------\n"); |
4251 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", | 4251 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", |
4252 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); | 4252 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); |
4253 | print_lock(hlock); | 4253 | print_lock(hlock); |
@@ -4302,11 +4302,11 @@ static void print_held_locks_bug(void) | |||
4302 | return; | 4302 | return; |
4303 | 4303 | ||
4304 | printk("\n"); | 4304 | printk("\n"); |
4305 | printk("=====================================\n"); | 4305 | pr_warn("====================================\n"); |
4306 | printk("[ BUG: %s/%d still has locks held! ]\n", | 4306 | pr_warn("WARNING: %s/%d still has locks held!\n", |
4307 | current->comm, task_pid_nr(current)); | 4307 | current->comm, task_pid_nr(current)); |
4308 | print_kernel_ident(); | 4308 | print_kernel_ident(); |
4309 | printk("-------------------------------------\n"); | 4309 | pr_warn("------------------------------------\n"); |
4310 | lockdep_print_held_locks(current); | 4310 | lockdep_print_held_locks(current); |
4311 | printk("\nstack backtrace:\n"); | 4311 | printk("\nstack backtrace:\n"); |
4312 | dump_stack(); | 4312 | dump_stack(); |
@@ -4371,7 +4371,7 @@ retry: | |||
4371 | } while_each_thread(g, p); | 4371 | } while_each_thread(g, p); |
4372 | 4372 | ||
4373 | printk("\n"); | 4373 | printk("\n"); |
4374 | printk("=============================================\n\n"); | 4374 | pr_warn("=============================================\n\n"); |
4375 | 4375 | ||
4376 | if (unlock) | 4376 | if (unlock) |
4377 | read_unlock(&tasklist_lock); | 4377 | read_unlock(&tasklist_lock); |
@@ -4401,10 +4401,10 @@ asmlinkage __visible void lockdep_sys_exit(void) | |||
4401 | if (!debug_locks_off()) | 4401 | if (!debug_locks_off()) |
4402 | return; | 4402 | return; |
4403 | printk("\n"); | 4403 | printk("\n"); |
4404 | printk("================================================\n"); | 4404 | pr_warn("================================================\n"); |
4405 | printk("[ BUG: lock held when returning to user space! ]\n"); | 4405 | pr_warn("WARNING: lock held when returning to user space!\n"); |
4406 | print_kernel_ident(); | 4406 | print_kernel_ident(); |
4407 | printk("------------------------------------------------\n"); | 4407 | pr_warn("------------------------------------------------\n"); |
4408 | printk("%s/%d is leaving the kernel with locks still held!\n", | 4408 | printk("%s/%d is leaving the kernel with locks still held!\n", |
4409 | curr->comm, curr->pid); | 4409 | curr->comm, curr->pid); |
4410 | lockdep_print_held_locks(curr); | 4410 | lockdep_print_held_locks(curr); |
@@ -4421,13 +4421,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
4421 | #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ | 4421 | #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ |
4422 | /* Note: the following can be executed concurrently, so be careful. */ | 4422 | /* Note: the following can be executed concurrently, so be careful. */ |
4423 | printk("\n"); | 4423 | printk("\n"); |
4424 | pr_err("===============================\n"); | 4424 | pr_warn("=============================\n"); |
4425 | pr_err("[ ERR: suspicious RCU usage. ]\n"); | 4425 | pr_warn("WARNING: suspicious RCU usage\n"); |
4426 | print_kernel_ident(); | 4426 | print_kernel_ident(); |
4427 | pr_err("-------------------------------\n"); | 4427 | pr_warn("-----------------------------\n"); |
4428 | pr_err("%s:%d %s!\n", file, line, s); | 4428 | printk("%s:%d %s!\n", file, line, s); |
4429 | pr_err("\nother info that might help us debug this:\n\n"); | 4429 | printk("\nother info that might help us debug this:\n\n"); |
4430 | pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n", | 4430 | printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", |
4431 | !rcu_lockdep_current_cpu_online() | 4431 | !rcu_lockdep_current_cpu_online() |
4432 | ? "RCU used illegally from offline CPU!\n" | 4432 | ? "RCU used illegally from offline CPU!\n" |
4433 | : !rcu_is_watching() | 4433 | : !rcu_is_watching() |
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 97ee9df32e0f..db4f55211b04 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c | |||
@@ -102,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | |||
102 | return; | 102 | return; |
103 | } | 103 | } |
104 | 104 | ||
105 | printk("\n============================================\n"); | 105 | pr_warn("\n"); |
106 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | 106 | pr_warn("============================================\n"); |
107 | printk("%s\n", print_tainted()); | 107 | pr_warn("WARNING: circular locking deadlock detected!\n"); |
108 | printk( "--------------------------------------------\n"); | 108 | pr_warn("%s\n", print_tainted()); |
109 | pr_warn("--------------------------------------------\n"); | ||
109 | printk("%s/%d is deadlocking current task %s/%d\n\n", | 110 | printk("%s/%d is deadlocking current task %s/%d\n\n", |
110 | task->comm, task_pid_nr(task), | 111 | task->comm, task_pid_nr(task), |
111 | current->comm, task_pid_nr(current)); | 112 | current->comm, task_pid_nr(current)); |
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 18dfc485225c..158e6593d58c 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile | |||
@@ -3,7 +3,9 @@ | |||
3 | KCOV_INSTRUMENT := n | 3 | KCOV_INSTRUMENT := n |
4 | 4 | ||
5 | obj-y += update.o sync.o | 5 | obj-y += update.o sync.o |
6 | obj-$(CONFIG_SRCU) += srcu.o | 6 | obj-$(CONFIG_CLASSIC_SRCU) += srcu.o |
7 | obj-$(CONFIG_TREE_SRCU) += srcutree.o | ||
8 | obj-$(CONFIG_TINY_SRCU) += srcutiny.o | ||
7 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 9 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
8 | obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o | 10 | obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o |
9 | obj-$(CONFIG_TREE_RCU) += tree.o | 11 | obj-$(CONFIG_TREE_RCU) += tree.o |
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0d6ff3e471be..73e16ec4054b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
@@ -56,6 +56,83 @@ | |||
56 | #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ | 56 | #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ |
57 | DYNTICK_TASK_FLAG) | 57 | DYNTICK_TASK_FLAG) |
58 | 58 | ||
59 | |||
60 | /* | ||
61 | * Grace-period counter management. | ||
62 | */ | ||
63 | |||
64 | #define RCU_SEQ_CTR_SHIFT 2 | ||
65 | #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) | ||
66 | |||
67 | /* | ||
68 | * Return the counter portion of a sequence number previously returned | ||
69 | * by rcu_seq_snap() or rcu_seq_current(). | ||
70 | */ | ||
71 | static inline unsigned long rcu_seq_ctr(unsigned long s) | ||
72 | { | ||
73 | return s >> RCU_SEQ_CTR_SHIFT; | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * Return the state portion of a sequence number previously returned | ||
78 | * by rcu_seq_snap() or rcu_seq_current(). | ||
79 | */ | ||
80 | static inline int rcu_seq_state(unsigned long s) | ||
81 | { | ||
82 | return s & RCU_SEQ_STATE_MASK; | ||
83 | } | ||
84 | |||
85 | /* | ||
86 | * Set the state portion of the pointed-to sequence number. | ||
87 | * The caller is responsible for preventing conflicting updates. | ||
88 | */ | ||
89 | static inline void rcu_seq_set_state(unsigned long *sp, int newstate) | ||
90 | { | ||
91 | WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK); | ||
92 | WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate); | ||
93 | } | ||
94 | |||
95 | /* Adjust sequence number for start of update-side operation. */ | ||
96 | static inline void rcu_seq_start(unsigned long *sp) | ||
97 | { | ||
98 | WRITE_ONCE(*sp, *sp + 1); | ||
99 | smp_mb(); /* Ensure update-side operation after counter increment. */ | ||
100 | WARN_ON_ONCE(rcu_seq_state(*sp) != 1); | ||
101 | } | ||
102 | |||
103 | /* Adjust sequence number for end of update-side operation. */ | ||
104 | static inline void rcu_seq_end(unsigned long *sp) | ||
105 | { | ||
106 | smp_mb(); /* Ensure update-side operation before counter increment. */ | ||
107 | WARN_ON_ONCE(!rcu_seq_state(*sp)); | ||
108 | WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); | ||
109 | } | ||
110 | |||
111 | /* Take a snapshot of the update side's sequence number. */ | ||
112 | static inline unsigned long rcu_seq_snap(unsigned long *sp) | ||
113 | { | ||
114 | unsigned long s; | ||
115 | |||
116 | s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK; | ||
117 | smp_mb(); /* Above access must not bleed into critical section. */ | ||
118 | return s; | ||
119 | } | ||
120 | |||
121 | /* Return the current value of the update side's sequence number, no ordering. */ | ||
122 | static inline unsigned long rcu_seq_current(unsigned long *sp) | ||
123 | { | ||
124 | return READ_ONCE(*sp); | ||
125 | } | ||
126 | |||
127 | /* | ||
128 | * Given a snapshot from rcu_seq_snap(), determine whether or not a | ||
129 | * full update-side operation has occurred. | ||
130 | */ | ||
131 | static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) | ||
132 | { | ||
133 | return ULONG_CMP_GE(READ_ONCE(*sp), s); | ||
134 | } | ||
135 | |||
59 | /* | 136 | /* |
60 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally | 137 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally |
61 | * by call_rcu() and rcu callback execution, and are therefore not part of the | 138 | * by call_rcu() and rcu callback execution, and are therefore not part of the |
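A quick user-space exercise of the rcu_seq_*() helpers introduced above: the bottom RCU_SEQ_CTR_SHIFT bits carry the grace-period phase, the remaining bits count completed grace periods, and rcu_seq_done() compares wrap-safely against a snapshot. This is a simplified model; the kernel versions' memory barriers and WARN_ON_ONCE() checks are omitted.

#include <stdbool.h>
#include <stdio.h>

#define RCU_SEQ_CTR_SHIFT  2
#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
#define ULONG_CMP_GE(a, b) ((long)((a) - (b)) >= 0)

static void rcu_seq_start(unsigned long *sp) { *sp += 1; }

static void rcu_seq_end(unsigned long *sp)
{
	*sp = (*sp | RCU_SEQ_STATE_MASK) + 1;
}

static unsigned long rcu_seq_snap(unsigned long *sp)
{
	return (*sp + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
}

static bool rcu_seq_done(unsigned long *sp, unsigned long s)
{
	return ULONG_CMP_GE(*sp, s);
}

int main(void)
{
	unsigned long gp_seq = 0;
	unsigned long s;

	rcu_seq_start(&gp_seq);		/* a grace period is now in progress */
	s = rcu_seq_snap(&gp_seq);	/* cookie: "wake me after a full GP" */
	printf("done before any end? %d\n", rcu_seq_done(&gp_seq, s));	/* 0 */
	rcu_seq_end(&gp_seq);		/* the in-progress GP doesn't count */
	printf("done after 1st end? %d\n", rcu_seq_done(&gp_seq, s));	/* 0 */
	rcu_seq_start(&gp_seq);
	rcu_seq_end(&gp_seq);		/* a full GP has now elapsed */
	printf("done after 2nd end? %d\n", rcu_seq_done(&gp_seq, s));	/* 1 */
	return 0;
}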
@@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) | |||
109 | 186 | ||
110 | rcu_lock_acquire(&rcu_callback_map); | 187 | rcu_lock_acquire(&rcu_callback_map); |
111 | if (__is_kfree_rcu_offset(offset)) { | 188 | if (__is_kfree_rcu_offset(offset)) { |
112 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); | 189 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) |
113 | kfree((void *)head - offset); | 190 | kfree((void *)head - offset); |
114 | rcu_lock_release(&rcu_callback_map); | 191 | rcu_lock_release(&rcu_callback_map); |
115 | return true; | 192 | return true; |
116 | } else { | 193 | } else { |
117 | RCU_TRACE(trace_rcu_invoke_callback(rn, head)); | 194 | RCU_TRACE(trace_rcu_invoke_callback(rn, head);) |
118 | head->func(head); | 195 | head->func(head); |
119 | rcu_lock_release(&rcu_callback_map); | 196 | rcu_lock_release(&rcu_callback_map); |
120 | return false; | 197 | return false; |
@@ -144,4 +221,76 @@ void rcu_test_sync_prims(void); | |||
144 | */ | 221 | */ |
145 | extern void resched_cpu(int cpu); | 222 | extern void resched_cpu(int cpu); |
146 | 223 | ||
224 | #if defined(SRCU) || !defined(TINY_RCU) | ||
225 | |||
226 | #include <linux/rcu_node_tree.h> | ||
227 | |||
228 | extern int rcu_num_lvls; | ||
229 | extern int num_rcu_lvl[]; | ||
230 | extern int rcu_num_nodes; | ||
231 | static bool rcu_fanout_exact; | ||
232 | static int rcu_fanout_leaf; | ||
233 | |||
234 | /* | ||
235 | * Compute the per-level fanout, either using the exact fanout specified | ||
236 | * or balancing the tree, depending on the rcu_fanout_exact boot parameter. | ||
237 | */ | ||
238 | static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) | ||
239 | { | ||
240 | int i; | ||
241 | |||
242 | if (rcu_fanout_exact) { | ||
243 | levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; | ||
244 | for (i = rcu_num_lvls - 2; i >= 0; i--) | ||
245 | levelspread[i] = RCU_FANOUT; | ||
246 | } else { | ||
247 | int ccur; | ||
248 | int cprv; | ||
249 | |||
250 | cprv = nr_cpu_ids; | ||
251 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | ||
252 | ccur = levelcnt[i]; | ||
253 | levelspread[i] = (cprv + ccur - 1) / ccur; | ||
254 | cprv = ccur; | ||
255 | } | ||
256 | } | ||
257 | } | ||
258 | |||
259 | /* | ||
260 | * Do a full breadth-first scan of the rcu_node structures for the | ||
261 | * specified rcu_state structure. | ||
262 | */ | ||
263 | #define rcu_for_each_node_breadth_first(rsp, rnp) \ | ||
264 | for ((rnp) = &(rsp)->node[0]; \ | ||
265 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
266 | |||
267 | /* | ||
268 | * Do a breadth-first scan of the non-leaf rcu_node structures for the | ||
269 | * specified rcu_state structure. Note that if there is a singleton | ||
270 | * rcu_node tree with but one rcu_node structure, this loop is a no-op. | ||
271 | */ | ||
272 | #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ | ||
273 | for ((rnp) = &(rsp)->node[0]; \ | ||
274 | (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) | ||
275 | |||
276 | /* | ||
277 | * Scan the leaves of the rcu_node hierarchy for the specified rcu_state | ||
278 | * structure. Note that if there is a singleton rcu_node tree with but | ||
279 | * one rcu_node structure, this loop -will- visit the rcu_node structure. | ||
280 | * It is still a leaf node, even if it is also the root node. | ||
281 | */ | ||
282 | #define rcu_for_each_leaf_node(rsp, rnp) \ | ||
283 | for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ | ||
284 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
285 | |||
286 | /* | ||
287 | * Iterate over all possible CPUs in a leaf RCU node. | ||
288 | */ | ||
289 | #define for_each_leaf_node_possible_cpu(rnp, cpu) \ | ||
290 | for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ | ||
291 | cpu <= rnp->grphi; \ | ||
292 | cpu = cpumask_next((cpu), cpu_possible_mask)) | ||
293 | |||
294 | #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ | ||
295 | |||
147 | #endif /* __LINUX_RCU_H */ | 296 | #endif /* __LINUX_RCU_H */ |
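For readers following the new rcu_init_levelspread() helper added to rcu.h above, the following standalone sketch (not part of the patch; the CPU count and per-level node counts are invented) walks the balanced branch of the computation from the leaf level toward the root:
<pre>
/*
 * Sketch of the balanced branch of rcu_init_levelspread(), using
 * made-up values: 96 possible CPUs and a two-level tree with one
 * root and six leaf rcu_node structures.
 */
#include &lt;stdio.h&gt;

int main(void)
{
	int levelcnt[] = { 1, 6 };	/* nodes per level, root first */
	int levelspread[2];
	int nr_cpu_ids = 96;		/* hypothetical CPU count */
	int num_lvls = 2;
	int cprv = nr_cpu_ids;
	int i;

	for (i = num_lvls - 1; i >= 0; i--) {
		int ccur = levelcnt[i];

		levelspread[i] = (cprv + ccur - 1) / ccur; /* ceil(cprv/ccur) */
		cprv = ccur;
	}
	/* Prints: level 0 spread 6, level 1 spread 16. */
	for (i = 0; i < num_lvls; i++)
		printf("level %d spread %d\n", i, levelspread[i]);
	return 0;
}
</pre>
With 96 possible CPUs spread over six leaf nodes under one root, each leaf covers ceil(96/6) = 16 CPUs and the root fans out to six children.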
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index cccc417a8135..e9d4527cdd43 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -559,19 +559,34 @@ static void srcu_torture_barrier(void) | |||
559 | 559 | ||
560 | static void srcu_torture_stats(void) | 560 | static void srcu_torture_stats(void) |
561 | { | 561 | { |
562 | int cpu; | 562 | int __maybe_unused cpu; |
563 | int idx = srcu_ctlp->completed & 0x1; | 563 | int idx; |
564 | 564 | ||
565 | pr_alert("%s%s per-CPU(idx=%d):", | 565 | #if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) |
566 | #ifdef CONFIG_TREE_SRCU | ||
567 | idx = srcu_ctlp->srcu_idx & 0x1; | ||
568 | #else /* #ifdef CONFIG_TREE_SRCU */ | ||
569 | idx = srcu_ctlp->completed & 0x1; | ||
570 | #endif /* #else #ifdef CONFIG_TREE_SRCU */ | ||
571 | pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", | ||
566 | torture_type, TORTURE_FLAG, idx); | 572 | torture_type, TORTURE_FLAG, idx); |
567 | for_each_possible_cpu(cpu) { | 573 | for_each_possible_cpu(cpu) { |
568 | unsigned long l0, l1; | 574 | unsigned long l0, l1; |
569 | unsigned long u0, u1; | 575 | unsigned long u0, u1; |
570 | long c0, c1; | 576 | long c0, c1; |
571 | struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); | 577 | #ifdef CONFIG_TREE_SRCU |
578 | struct srcu_data *counts; | ||
572 | 579 | ||
580 | counts = per_cpu_ptr(srcu_ctlp->sda, cpu); | ||
581 | u0 = counts->srcu_unlock_count[!idx]; | ||
582 | u1 = counts->srcu_unlock_count[idx]; | ||
583 | #else /* #ifdef CONFIG_TREE_SRCU */ | ||
584 | struct srcu_array *counts; | ||
585 | |||
586 | counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); | ||
573 | u0 = counts->unlock_count[!idx]; | 587 | u0 = counts->unlock_count[!idx]; |
574 | u1 = counts->unlock_count[idx]; | 588 | u1 = counts->unlock_count[idx]; |
589 | #endif /* #else #ifdef CONFIG_TREE_SRCU */ | ||
575 | 590 | ||
576 | /* | 591 | /* |
577 | * Make sure that a lock is always counted if the corresponding | 592 | * Make sure that a lock is always counted if the corresponding |
@@ -579,14 +594,26 @@ static void srcu_torture_stats(void) | |||
579 | */ | 594 | */ |
580 | smp_rmb(); | 595 | smp_rmb(); |
581 | 596 | ||
597 | #ifdef CONFIG_TREE_SRCU | ||
598 | l0 = counts->srcu_lock_count[!idx]; | ||
599 | l1 = counts->srcu_lock_count[idx]; | ||
600 | #else /* #ifdef CONFIG_TREE_SRCU */ | ||
582 | l0 = counts->lock_count[!idx]; | 601 | l0 = counts->lock_count[!idx]; |
583 | l1 = counts->lock_count[idx]; | 602 | l1 = counts->lock_count[idx]; |
603 | #endif /* #else #ifdef CONFIG_TREE_SRCU */ | ||
584 | 604 | ||
585 | c0 = l0 - u0; | 605 | c0 = l0 - u0; |
586 | c1 = l1 - u1; | 606 | c1 = l1 - u1; |
587 | pr_cont(" %d(%ld,%ld)", cpu, c0, c1); | 607 | pr_cont(" %d(%ld,%ld)", cpu, c0, c1); |
588 | } | 608 | } |
589 | pr_cont("\n"); | 609 | pr_cont("\n"); |
610 | #elif defined(CONFIG_TINY_SRCU) | ||
611 | idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; | ||
612 | pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", | ||
613 | torture_type, TORTURE_FLAG, idx, | ||
614 | READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), | ||
615 | READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); | ||
616 | #endif | ||
590 | } | 617 | } |
591 | 618 | ||
592 | static void srcu_torture_synchronize_expedited(void) | 619 | static void srcu_torture_synchronize_expedited(void) |
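As a side note on the statistics hunk above: for each index, the printed pair is the lock count minus the unlock count, that is, the number of readers believed to still be inside an SRCU read-side critical section on that index. A toy model with invented totals (not part of the patch):
<pre>
/* Toy model of the (c0,c1) pair printed by srcu_torture_stats(). */
#include &lt;stdio.h&gt;

int main(void)
{
	unsigned long lock_count[2] = { 105, 42 };	/* hypothetical totals */
	unsigned long unlock_count[2] = { 103, 42 };
	int idx = 1;					/* current index */
	long c0 = (long)(lock_count[!idx] - unlock_count[!idx]);
	long c1 = (long)(lock_count[idx] - unlock_count[idx]);

	printf("(%ld,%ld)\n", c0, c1);	/* prints (2,0): two readers on idx=0 */
	return 0;
}
</pre>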
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index ef3bcfb15b39..584d8a983883 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
@@ -22,7 +22,7 @@ | |||
22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | 22 | * Lai Jiangshan <laijs@cn.fujitsu.com> |
23 | * | 23 | * |
24 | * For detailed explanation of Read-Copy Update mechanism see - | 24 | * For detailed explanation of Read-Copy Update mechanism see - |
25 | * Documentation/RCU/ *.txt | 25 | * Documentation/RCU/ *.txt |
26 | * | 26 | * |
27 | */ | 27 | */ |
28 | 28 | ||
@@ -243,8 +243,14 @@ static bool srcu_readers_active(struct srcu_struct *sp) | |||
243 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure | 243 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure |
244 | * @sp: structure to clean up. | 244 | * @sp: structure to clean up. |
245 | * | 245 | * |
246 | * Must invoke this after you are finished using a given srcu_struct that | 246 | * Must invoke this only after you are finished using a given srcu_struct |
247 | * was initialized via init_srcu_struct(), else you leak memory. | 247 | * that was initialized via init_srcu_struct(). This code does some |
248 | * probabilistic checking, spotting late uses of srcu_read_lock(), | ||
249 | * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu(). | ||
250 | * If any such late uses are detected, the per-CPU memory associated with | ||
251 | * the srcu_struct is simply leaked and WARN_ON() is invoked. If the | ||
252 | * caller frees the srcu_struct itself, a use-after-free crash will likely | ||
253 | * ensue, but at least there will be a warning printed. | ||
248 | */ | 254 | */ |
249 | void cleanup_srcu_struct(struct srcu_struct *sp) | 255 | void cleanup_srcu_struct(struct srcu_struct *sp) |
250 | { | 256 | { |
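The expanded comment above implies a particular teardown ordering on the caller's side. Here is a minimal kernel-context sketch of that ordering; the "my_"-prefixed names are hypothetical and this is an illustration, not part of the patch:
<pre>
#include &lt;linux/slab.h&gt;
#include &lt;linux/srcu.h&gt;

/* my_stop_updaters() is a hypothetical helper that quiesces all updaters. */
extern void my_stop_updaters(void);

static void my_srcu_teardown(struct srcu_struct *my_srcu)
{
	my_stop_updaters();		/* no new call_srcu()/synchronize_srcu() */
	srcu_barrier(my_srcu);		/* wait for already-posted callbacks */
	cleanup_srcu_struct(my_srcu);	/* safe: no late uses left to detect */
	kfree(my_srcu);			/* assumes my_srcu was kmalloc()ed */
}
</pre>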
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c new file mode 100644 index 000000000000..b8293527ee18 --- /dev/null +++ b/kernel/rcu/srcutiny.c | |||
@@ -0,0 +1,215 @@ | |||
1 | /* | ||
2 | * Sleepable Read-Copy Update mechanism for mutual exclusion, | ||
3 | * tiny version for non-preemptible single-CPU use. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, you can access it online at | ||
17 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
18 | * | ||
19 | * Copyright (C) IBM Corporation, 2017 | ||
20 | * | ||
21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
22 | */ | ||
23 | |||
24 | #include <linux/export.h> | ||
25 | #include <linux/mutex.h> | ||
26 | #include <linux/preempt.h> | ||
27 | #include <linux/rcupdate_wait.h> | ||
28 | #include <linux/sched.h> | ||
29 | #include <linux/delay.h> | ||
30 | #include <linux/srcu.h> | ||
31 | |||
32 | #include <linux/rcu_node_tree.h> | ||
33 | #include "rcu.h" | ||
34 | |||
35 | static int init_srcu_struct_fields(struct srcu_struct *sp) | ||
36 | { | ||
37 | sp->srcu_lock_nesting[0] = 0; | ||
38 | sp->srcu_lock_nesting[1] = 0; | ||
39 | init_swait_queue_head(&sp->srcu_wq); | ||
40 | sp->srcu_gp_seq = 0; | ||
41 | rcu_segcblist_init(&sp->srcu_cblist); | ||
42 | sp->srcu_gp_running = false; | ||
43 | sp->srcu_gp_waiting = false; | ||
44 | sp->srcu_idx = 0; | ||
45 | INIT_WORK(&sp->srcu_work, srcu_drive_gp); | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
50 | |||
51 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | ||
52 | struct lock_class_key *key) | ||
53 | { | ||
54 | /* Don't re-initialize a lock while it is held. */ | ||
55 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | ||
56 | lockdep_init_map(&sp->dep_map, name, key, 0); | ||
57 | return init_srcu_struct_fields(sp); | ||
58 | } | ||
59 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | ||
60 | |||
61 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
62 | |||
63 | /* | ||
64 | * init_srcu_struct - initialize a sleep-RCU structure | ||
65 | * @sp: structure to initialize. | ||
66 | * | ||
67 | * Must invoke this on a given srcu_struct before passing that srcu_struct | ||
68 | * to any other function. Each srcu_struct represents a separate domain | ||
69 | * of SRCU protection. | ||
70 | */ | ||
71 | int init_srcu_struct(struct srcu_struct *sp) | ||
72 | { | ||
73 | return init_srcu_struct_fields(sp); | ||
74 | } | ||
75 | EXPORT_SYMBOL_GPL(init_srcu_struct); | ||
76 | |||
77 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
78 | |||
79 | /* | ||
80 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure | ||
81 | * @sp: structure to clean up. | ||
82 | * | ||
83 | * Must invoke this after you are finished using a given srcu_struct that | ||
84 | * was initialized via init_srcu_struct(), else you leak memory. | ||
85 | */ | ||
86 | void cleanup_srcu_struct(struct srcu_struct *sp) | ||
87 | { | ||
88 | WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); | ||
89 | flush_work(&sp->srcu_work); | ||
90 | WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); | ||
91 | WARN_ON(sp->srcu_gp_running); | ||
92 | WARN_ON(sp->srcu_gp_waiting); | ||
93 | WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); | ||
94 | } | ||
95 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | ||
96 | |||
97 | /* | ||
98 | * Counts the new reader in the appropriate per-CPU element of the | ||
99 | * srcu_struct. Must be called from process context. | ||
100 | * Returns an index that must be passed to the matching srcu_read_unlock(). | ||
101 | */ | ||
102 | int __srcu_read_lock(struct srcu_struct *sp) | ||
103 | { | ||
104 | int idx; | ||
105 | |||
106 | idx = READ_ONCE(sp->srcu_idx); | ||
107 | WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); | ||
108 | return idx; | ||
109 | } | ||
110 | EXPORT_SYMBOL_GPL(__srcu_read_lock); | ||
111 | |||
112 | /* | ||
113 | * Removes the count for the old reader from the appropriate element of | ||
114 | * the srcu_struct. Must be called from process context. | ||
115 | */ | ||
116 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | ||
117 | { | ||
118 | int newval = sp->srcu_lock_nesting[idx] - 1; | ||
119 | |||
120 | WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); | ||
121 | if (!newval && READ_ONCE(sp->srcu_gp_waiting)) | ||
122 | swake_up(&sp->srcu_wq); | ||
123 | } | ||
124 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | ||
125 | |||
126 | /* | ||
127 | * Workqueue handler to drive one grace period and invoke any callbacks | ||
128 | * that become ready as a result. Single-CPU and !PREEMPT operation | ||
129 | * means that we get away with murder on synchronization. ;-) | ||
130 | */ | ||
131 | void srcu_drive_gp(struct work_struct *wp) | ||
132 | { | ||
133 | int idx; | ||
134 | struct rcu_cblist ready_cbs; | ||
135 | struct srcu_struct *sp; | ||
136 | struct rcu_head *rhp; | ||
137 | |||
138 | sp = container_of(wp, struct srcu_struct, srcu_work); | ||
139 | if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) | ||
140 | return; /* Already running or nothing to do. */ | ||
141 | |||
142 | /* Tag recently arrived callbacks and wait for readers. */ | ||
143 | WRITE_ONCE(sp->srcu_gp_running, true); | ||
144 | rcu_segcblist_accelerate(&sp->srcu_cblist, | ||
145 | rcu_seq_snap(&sp->srcu_gp_seq)); | ||
146 | rcu_seq_start(&sp->srcu_gp_seq); | ||
147 | idx = sp->srcu_idx; | ||
148 | WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); | ||
149 | WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ | ||
150 | swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); | ||
151 | WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ | ||
152 | rcu_seq_end(&sp->srcu_gp_seq); | ||
153 | |||
154 | /* Update callback list based on GP, and invoke ready callbacks. */ | ||
155 | rcu_segcblist_advance(&sp->srcu_cblist, | ||
156 | rcu_seq_current(&sp->srcu_gp_seq)); | ||
157 | if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { | ||
158 | rcu_cblist_init(&ready_cbs); | ||
159 | local_irq_disable(); | ||
160 | rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); | ||
161 | local_irq_enable(); | ||
162 | rhp = rcu_cblist_dequeue(&ready_cbs); | ||
163 | for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { | ||
164 | local_bh_disable(); | ||
165 | rhp->func(rhp); | ||
166 | local_bh_enable(); | ||
167 | } | ||
168 | local_irq_disable(); | ||
169 | rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); | ||
170 | local_irq_enable(); | ||
171 | } | ||
172 | WRITE_ONCE(sp->srcu_gp_running, false); | ||
173 | |||
174 | /* | ||
175 | * If more callbacks, reschedule ourselves. This can race with | ||
176 | * a call_srcu() at interrupt level, but the ->srcu_gp_running | ||
177 | * checks will straighten that out. | ||
178 | */ | ||
179 | if (!rcu_segcblist_empty(&sp->srcu_cblist)) | ||
180 | schedule_work(&sp->srcu_work); | ||
181 | } | ||
182 | EXPORT_SYMBOL_GPL(srcu_drive_gp); | ||
183 | |||
184 | /* | ||
185 | * Enqueue an SRCU callback on the specified srcu_struct structure, | ||
186 | * initiating grace-period processing if it is not already running. | ||
187 | */ | ||
188 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | ||
189 | rcu_callback_t func) | ||
190 | { | ||
191 | unsigned long flags; | ||
192 | |||
193 | head->func = func; | ||
194 | local_irq_save(flags); | ||
195 | rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); | ||
196 | local_irq_restore(flags); | ||
197 | if (!READ_ONCE(sp->srcu_gp_running)) | ||
198 | schedule_work(&sp->srcu_work); | ||
199 | } | ||
200 | EXPORT_SYMBOL_GPL(call_srcu); | ||
201 | |||
202 | /* | ||
203 | * synchronize_srcu - wait for prior SRCU read-side critical-section completion | ||
204 | */ | ||
205 | void synchronize_srcu(struct srcu_struct *sp) | ||
206 | { | ||
207 | struct rcu_synchronize rs; | ||
208 | |||
209 | init_rcu_head_on_stack(&rs.head); | ||
210 | init_completion(&rs.completion); | ||
211 | call_srcu(sp, &rs.head, wakeme_after_rcu); | ||
212 | wait_for_completion(&rs.completion); | ||
213 | destroy_rcu_head_on_stack(&rs.head); | ||
214 | } | ||
215 | EXPORT_SYMBOL_GPL(synchronize_srcu); | ||
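The functions exported by srcutiny.c above present the same client-visible API as Tree SRCU. A minimal usage sketch follows; every "my_"-prefixed name is hypothetical and the code assumes a kernel context:
<pre>
#include &lt;linux/init.h&gt;
#include &lt;linux/slab.h&gt;
#include &lt;linux/srcu.h&gt;

struct my_obj {
	struct rcu_head rh;
	int data;
};

static struct srcu_struct my_srcu;
static struct my_obj __rcu *my_ptr;

static int __init my_setup(void)
{
	return init_srcu_struct(&my_srcu);	/* one SRCU domain */
}

/* Reader: may sleep between srcu_read_lock() and srcu_read_unlock(). */
static int my_read(void)
{
	struct my_obj *p;
	int idx, val = -1;

	idx = srcu_read_lock(&my_srcu);
	p = srcu_dereference(my_ptr, &my_srcu);
	if (p)
		val = p->data;
	srcu_read_unlock(&my_srcu, idx);
	return val;
}

static void my_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct my_obj, rh));
}

/* Updater: publish a new version, defer freeing the old one via call_srcu(). */
static void my_update(struct my_obj *newp)
{
	struct my_obj *oldp;

	oldp = rcu_dereference_protected(my_ptr, 1);	/* updates serialized by caller */
	rcu_assign_pointer(my_ptr, newp);
	if (oldp)
		call_srcu(&my_srcu, &oldp->rh, my_free_cb);
}
</pre>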
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c new file mode 100644 index 000000000000..9ecf0acc18eb --- /dev/null +++ b/kernel/rcu/srcutree.c | |||
@@ -0,0 +1,996 @@ | |||
1 | /* | ||
2 | * Sleepable Read-Copy Update mechanism for mutual exclusion. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, you can access it online at | ||
16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2006 | ||
19 | * Copyright (C) Fujitsu, 2012 | ||
20 | * | ||
21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | ||
23 | * | ||
24 | * For detailed explanation of Read-Copy Update mechanism see - | ||
25 | * Documentation/RCU/ *.txt | ||
26 | * | ||
27 | */ | ||
28 | |||
29 | #include <linux/export.h> | ||
30 | #include <linux/mutex.h> | ||
31 | #include <linux/percpu.h> | ||
32 | #include <linux/preempt.h> | ||
33 | #include <linux/rcupdate_wait.h> | ||
34 | #include <linux/sched.h> | ||
35 | #include <linux/smp.h> | ||
36 | #include <linux/delay.h> | ||
37 | #include <linux/srcu.h> | ||
38 | |||
39 | #include "rcu.h" | ||
40 | |||
41 | static void srcu_invoke_callbacks(struct work_struct *work); | ||
42 | static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); | ||
43 | |||
44 | /* | ||
45 | * Initialize SRCU combining tree. Note that statically allocated | ||
46 | * srcu_struct structures might already have srcu_read_lock() and | ||
47 | * srcu_read_unlock() running against them. So if the is_static parameter | ||
48 | * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. | ||
49 | */ | ||
50 | static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) | ||
51 | { | ||
52 | int cpu; | ||
53 | int i; | ||
54 | int level = 0; | ||
55 | int levelspread[RCU_NUM_LVLS]; | ||
56 | struct srcu_data *sdp; | ||
57 | struct srcu_node *snp; | ||
58 | struct srcu_node *snp_first; | ||
59 | |||
60 | /* Work out the overall tree geometry. */ | ||
61 | sp->level[0] = &sp->node[0]; | ||
62 | for (i = 1; i < rcu_num_lvls; i++) | ||
63 | sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1]; | ||
64 | rcu_init_levelspread(levelspread, num_rcu_lvl); | ||
65 | |||
66 | /* Each pass through this loop initializes one srcu_node structure. */ | ||
67 | rcu_for_each_node_breadth_first(sp, snp) { | ||
68 | spin_lock_init(&snp->lock); | ||
69 | for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) | ||
70 | snp->srcu_have_cbs[i] = 0; | ||
71 | snp->grplo = -1; | ||
72 | snp->grphi = -1; | ||
73 | if (snp == &sp->node[0]) { | ||
74 | /* Root node, special case. */ | ||
75 | snp->srcu_parent = NULL; | ||
76 | continue; | ||
77 | } | ||
78 | |||
79 | /* Non-root node. */ | ||
80 | if (snp == sp->level[level + 1]) | ||
81 | level++; | ||
82 | snp->srcu_parent = sp->level[level - 1] + | ||
83 | (snp - sp->level[level]) / | ||
84 | levelspread[level - 1]; | ||
85 | } | ||
86 | |||
87 | /* | ||
88 | * Initialize the per-CPU srcu_data array, which feeds into the | ||
89 | * leaves of the srcu_node tree. | ||
90 | */ | ||
91 | WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != | ||
92 | ARRAY_SIZE(sdp->srcu_unlock_count)); | ||
93 | level = rcu_num_lvls - 1; | ||
94 | snp_first = sp->level[level]; | ||
95 | for_each_possible_cpu(cpu) { | ||
96 | sdp = per_cpu_ptr(sp->sda, cpu); | ||
97 | spin_lock_init(&sdp->lock); | ||
98 | rcu_segcblist_init(&sdp->srcu_cblist); | ||
99 | sdp->srcu_cblist_invoking = false; | ||
100 | sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; | ||
101 | sdp->mynode = &snp_first[cpu / levelspread[level]]; | ||
102 | for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { | ||
103 | if (snp->grplo < 0) | ||
104 | snp->grplo = cpu; | ||
105 | snp->grphi = cpu; | ||
106 | } | ||
107 | sdp->cpu = cpu; | ||
108 | INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); | ||
109 | sdp->sp = sp; | ||
110 | if (is_static) | ||
111 | continue; | ||
112 | |||
113 | /* Dynamically allocated, better be no srcu_read_locks()! */ | ||
114 | for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) { | ||
115 | sdp->srcu_lock_count[i] = 0; | ||
116 | sdp->srcu_unlock_count[i] = 0; | ||
117 | } | ||
118 | } | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * Initialize non-compile-time initialized fields, including the | ||
123 | * associated srcu_node and srcu_data structures. The is_static | ||
124 | * parameter is passed through to init_srcu_struct_nodes(), and | ||
125 | * also tells us that ->sda has already been wired up to srcu_data. | ||
126 | */ | ||
127 | static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) | ||
128 | { | ||
129 | mutex_init(&sp->srcu_cb_mutex); | ||
130 | mutex_init(&sp->srcu_gp_mutex); | ||
131 | sp->srcu_idx = 0; | ||
132 | sp->srcu_gp_seq = 0; | ||
133 | atomic_set(&sp->srcu_exp_cnt, 0); | ||
134 | sp->srcu_barrier_seq = 0; | ||
135 | mutex_init(&sp->srcu_barrier_mutex); | ||
136 | atomic_set(&sp->srcu_barrier_cpu_cnt, 0); | ||
137 | INIT_DELAYED_WORK(&sp->work, process_srcu); | ||
138 | if (!is_static) | ||
139 | sp->sda = alloc_percpu(struct srcu_data); | ||
140 | init_srcu_struct_nodes(sp, is_static); | ||
141 | smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ | ||
142 | return sp->sda ? 0 : -ENOMEM; | ||
143 | } | ||
144 | |||
145 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
146 | |||
147 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | ||
148 | struct lock_class_key *key) | ||
149 | { | ||
150 | /* Don't re-initialize a lock while it is held. */ | ||
151 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | ||
152 | lockdep_init_map(&sp->dep_map, name, key, 0); | ||
153 | spin_lock_init(&sp->gp_lock); | ||
154 | return init_srcu_struct_fields(sp, false); | ||
155 | } | ||
156 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | ||
157 | |||
158 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
159 | |||
160 | /** | ||
161 | * init_srcu_struct - initialize a sleep-RCU structure | ||
162 | * @sp: structure to initialize. | ||
163 | * | ||
164 | * Must invoke this on a given srcu_struct before passing that srcu_struct | ||
165 | * to any other function. Each srcu_struct represents a separate domain | ||
166 | * of SRCU protection. | ||
167 | */ | ||
168 | int init_srcu_struct(struct srcu_struct *sp) | ||
169 | { | ||
170 | spin_lock_init(&sp->gp_lock); | ||
171 | return init_srcu_struct_fields(sp, false); | ||
172 | } | ||
173 | EXPORT_SYMBOL_GPL(init_srcu_struct); | ||
174 | |||
175 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
176 | |||
177 | /* | ||
178 | * First-use initialization of statically allocated srcu_struct | ||
179 | * structure. Wiring up the combining tree is more than can be | ||
180 | * done with compile-time initialization, so this check is added | ||
181 | * to each update-side SRCU primitive. Use ->gp_lock, which -is- | ||
182 | * compile-time initialized, to resolve races involving multiple | ||
183 | * CPUs trying to garner first-use privileges. | ||
184 | */ | ||
185 | static void check_init_srcu_struct(struct srcu_struct *sp) | ||
186 | { | ||
187 | unsigned long flags; | ||
188 | |||
189 | WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT); | ||
190 | /* The smp_load_acquire() pairs with the smp_store_release(). */ | ||
191 | if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ | ||
192 | return; /* Already initialized. */ | ||
193 | spin_lock_irqsave(&sp->gp_lock, flags); | ||
194 | if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { | ||
195 | spin_unlock_irqrestore(&sp->gp_lock, flags); | ||
196 | return; | ||
197 | } | ||
198 | init_srcu_struct_fields(sp, true); | ||
199 | spin_unlock_irqrestore(&sp->gp_lock, flags); | ||
200 | } | ||
201 | |||
202 | /* | ||
203 | * Returns approximate total of the readers' ->srcu_lock_count[] values | ||
204 | * for the rank of per-CPU counters specified by idx. | ||
205 | */ | ||
206 | static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) | ||
207 | { | ||
208 | int cpu; | ||
209 | unsigned long sum = 0; | ||
210 | |||
211 | for_each_possible_cpu(cpu) { | ||
212 | struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); | ||
213 | |||
214 | sum += READ_ONCE(cpuc->srcu_lock_count[idx]); | ||
215 | } | ||
216 | return sum; | ||
217 | } | ||
218 | |||
219 | /* | ||
220 | * Returns approximate total of the readers' ->srcu_unlock_count[] values | ||
221 | * for the rank of per-CPU counters specified by idx. | ||
222 | */ | ||
223 | static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) | ||
224 | { | ||
225 | int cpu; | ||
226 | unsigned long sum = 0; | ||
227 | |||
228 | for_each_possible_cpu(cpu) { | ||
229 | struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); | ||
230 | |||
231 | sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); | ||
232 | } | ||
233 | return sum; | ||
234 | } | ||
235 | |||
236 | /* | ||
237 | * Return true if the number of pre-existing readers is determined to | ||
238 | * be zero. | ||
239 | */ | ||
240 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | ||
241 | { | ||
242 | unsigned long unlocks; | ||
243 | |||
244 | unlocks = srcu_readers_unlock_idx(sp, idx); | ||
245 | |||
246 | /* | ||
247 | * Make sure that a lock is always counted if the corresponding | ||
248 | * unlock is counted. Needs to be a smp_mb() as the read side may | ||
249 | * contain a read from a variable that is written to before the | ||
250 | * synchronize_srcu() in the write side. In this case smp_mb()s | ||
251 | * A and B act like the store buffering pattern. | ||
252 | * | ||
253 | * This smp_mb() also pairs with smp_mb() C to prevent accesses | ||
254 | * after the synchronize_srcu() from being executed before the | ||
255 | * grace period ends. | ||
256 | */ | ||
257 | smp_mb(); /* A */ | ||
258 | |||
259 | /* | ||
260 | * If the locks are the same as the unlocks, then there must have | ||
261 | * been no readers on this index at some time in between. This does | ||
262 | * not mean that there are no more readers, as one could have read | ||
263 | * the current index but not have incremented the lock counter yet. | ||
264 | * | ||
265 | * Possible bug: There is no guarantee that there haven't been | ||
266 | * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were | ||
267 | * counted, meaning that this could return true even if there are | ||
268 | * still active readers. Since there are no memory barriers around | ||
269 | * srcu_flip(), the CPU is not required to increment ->srcu_idx | ||
270 | * before running srcu_readers_unlock_idx(), which means that there | ||
271 | * could be an arbitrarily large number of critical sections that | ||
272 | * execute after srcu_readers_unlock_idx() but use the old value | ||
273 | * of ->srcu_idx. | ||
274 | */ | ||
275 | return srcu_readers_lock_idx(sp, idx) == unlocks; | ||
276 | } | ||
277 | |||
278 | /** | ||
279 | * srcu_readers_active - returns true if there are readers, and false | ||
280 | * otherwise | ||
281 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | ||
282 | * | ||
283 | * Note that this is not an atomic primitive, and can therefore suffer | ||
284 | * severe errors when invoked on an active srcu_struct. That said, it | ||
285 | * can be useful as an error check at cleanup time. | ||
286 | */ | ||
287 | static bool srcu_readers_active(struct srcu_struct *sp) | ||
288 | { | ||
289 | int cpu; | ||
290 | unsigned long sum = 0; | ||
291 | |||
292 | for_each_possible_cpu(cpu) { | ||
293 | struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); | ||
294 | |||
295 | sum += READ_ONCE(cpuc->srcu_lock_count[0]); | ||
296 | sum += READ_ONCE(cpuc->srcu_lock_count[1]); | ||
297 | sum -= READ_ONCE(cpuc->srcu_unlock_count[0]); | ||
298 | sum -= READ_ONCE(cpuc->srcu_unlock_count[1]); | ||
299 | } | ||
300 | return sum; | ||
301 | } | ||
302 | |||
303 | #define SRCU_INTERVAL 1 | ||
304 | |||
305 | /** | ||
306 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure | ||
307 | * @sp: structure to clean up. | ||
308 | * | ||
309 | * Must invoke this after you are finished using a given srcu_struct that | ||
310 | * was initialized via init_srcu_struct(), else you leak memory. | ||
311 | */ | ||
312 | void cleanup_srcu_struct(struct srcu_struct *sp) | ||
313 | { | ||
314 | int cpu; | ||
315 | |||
316 | WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt)); | ||
317 | if (WARN_ON(srcu_readers_active(sp))) | ||
318 | return; /* Leakage unless caller handles error. */ | ||
319 | flush_delayed_work(&sp->work); | ||
320 | for_each_possible_cpu(cpu) | ||
321 | flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); | ||
322 | if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || | ||
323 | WARN_ON(srcu_readers_active(sp))) { | ||
324 | pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); | ||
325 | return; /* Caller forgot to stop doing call_srcu()? */ | ||
326 | } | ||
327 | free_percpu(sp->sda); | ||
328 | sp->sda = NULL; | ||
329 | } | ||
330 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | ||
331 | |||
332 | /* | ||
333 | * Counts the new reader in the appropriate per-CPU element of the | ||
334 | * srcu_struct. Must be called from process context. | ||
335 | * Returns an index that must be passed to the matching srcu_read_unlock(). | ||
336 | */ | ||
337 | int __srcu_read_lock(struct srcu_struct *sp) | ||
338 | { | ||
339 | int idx; | ||
340 | |||
341 | idx = READ_ONCE(sp->srcu_idx) & 0x1; | ||
342 | __this_cpu_inc(sp->sda->srcu_lock_count[idx]); | ||
343 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ | ||
344 | return idx; | ||
345 | } | ||
346 | EXPORT_SYMBOL_GPL(__srcu_read_lock); | ||
347 | |||
348 | /* | ||
349 | * Removes the count for the old reader from the appropriate per-CPU | ||
350 | * element of the srcu_struct. Note that this may well be a different | ||
351 | * CPU than that which was incremented by the corresponding srcu_read_lock(). | ||
352 | * Must be called from process context. | ||
353 | */ | ||
354 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | ||
355 | { | ||
356 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ | ||
357 | this_cpu_inc(sp->sda->srcu_unlock_count[idx]); | ||
358 | } | ||
359 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | ||
360 | |||
361 | /* | ||
362 | * We use an adaptive strategy for synchronize_srcu() and especially for | ||
363 | * synchronize_srcu_expedited(). We spin for a fixed time period | ||
364 | * (defined below) to allow SRCU readers to exit their read-side critical | ||
365 | * sections. If there are still some readers after a few microseconds, | ||
366 | * we repeatedly block for 1-millisecond time periods. | ||
367 | */ | ||
368 | #define SRCU_RETRY_CHECK_DELAY 5 | ||
369 | |||
370 | /* | ||
371 | * Start an SRCU grace period. | ||
372 | */ | ||
373 | static void srcu_gp_start(struct srcu_struct *sp) | ||
374 | { | ||
375 | struct srcu_data *sdp = this_cpu_ptr(sp->sda); | ||
376 | int state; | ||
377 | |||
378 | RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), | ||
379 | "Invoked srcu_gp_start() without ->gp_lock!"); | ||
380 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); | ||
381 | rcu_segcblist_advance(&sdp->srcu_cblist, | ||
382 | rcu_seq_current(&sp->srcu_gp_seq)); | ||
383 | (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, | ||
384 | rcu_seq_snap(&sp->srcu_gp_seq)); | ||
385 | rcu_seq_start(&sp->srcu_gp_seq); | ||
386 | state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); | ||
387 | WARN_ON_ONCE(state != SRCU_STATE_SCAN1); | ||
388 | } | ||
389 | |||
390 | /* | ||
391 | * Track online CPUs to guide callback workqueue placement. | ||
392 | */ | ||
393 | DEFINE_PER_CPU(bool, srcu_online); | ||
394 | |||
395 | void srcu_online_cpu(unsigned int cpu) | ||
396 | { | ||
397 | WRITE_ONCE(per_cpu(srcu_online, cpu), true); | ||
398 | } | ||
399 | |||
400 | void srcu_offline_cpu(unsigned int cpu) | ||
401 | { | ||
402 | WRITE_ONCE(per_cpu(srcu_online, cpu), false); | ||
403 | } | ||
404 | |||
405 | /* | ||
406 | * Place the workqueue handler on the specified CPU if online, otherwise | ||
407 | * just run it wherever. This is useful for placing workqueue handlers | ||
408 | * that are to invoke the specified CPU's callbacks. | ||
409 | */ | ||
410 | static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | ||
411 | struct delayed_work *dwork, | ||
412 | unsigned long delay) | ||
413 | { | ||
414 | bool ret; | ||
415 | |||
416 | preempt_disable(); | ||
417 | if (READ_ONCE(per_cpu(srcu_online, cpu))) | ||
418 | ret = queue_delayed_work_on(cpu, wq, dwork, delay); | ||
419 | else | ||
420 | ret = queue_delayed_work(wq, dwork, delay); | ||
421 | preempt_enable(); | ||
422 | return ret; | ||
423 | } | ||
424 | |||
425 | /* | ||
426 | * Schedule callback invocation for the specified srcu_data structure, | ||
427 | * if possible, on the corresponding CPU. | ||
428 | */ | ||
429 | static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) | ||
430 | { | ||
431 | srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, | ||
432 | &sdp->work, delay); | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * Schedule callback invocation for all srcu_data structures associated | ||
437 | * with the specified srcu_node structure, if possible, on the corresponding | ||
438 | * CPUs. | ||
439 | */ | ||
440 | static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp) | ||
441 | { | ||
442 | int cpu; | ||
443 | |||
444 | for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) | ||
445 | srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), | ||
446 | atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); | ||
447 | } | ||
448 | |||
449 | /* | ||
450 | * Note the end of an SRCU grace period. Initiates callback invocation | ||
451 | * and starts a new grace period if needed. | ||
452 | * | ||
453 | * The ->srcu_cb_mutex acquisition does not protect any data, but | ||
454 | * instead prevents more than one grace period from starting while we | ||
455 | * are initiating callback invocation. This allows the ->srcu_have_cbs[] | ||
456 | * array to have a finite number of elements. | ||
457 | */ | ||
458 | static void srcu_gp_end(struct srcu_struct *sp) | ||
459 | { | ||
460 | bool cbs; | ||
461 | unsigned long gpseq; | ||
462 | int idx; | ||
463 | int idxnext; | ||
464 | struct srcu_node *snp; | ||
465 | |||
466 | /* Prevent more than one additional grace period. */ | ||
467 | mutex_lock(&sp->srcu_cb_mutex); | ||
468 | |||
469 | /* End the current grace period. */ | ||
470 | spin_lock_irq(&sp->gp_lock); | ||
471 | idx = rcu_seq_state(sp->srcu_gp_seq); | ||
472 | WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); | ||
473 | rcu_seq_end(&sp->srcu_gp_seq); | ||
474 | gpseq = rcu_seq_current(&sp->srcu_gp_seq); | ||
475 | spin_unlock_irq(&sp->gp_lock); | ||
476 | mutex_unlock(&sp->srcu_gp_mutex); | ||
477 | /* A new grace period can start at this point. But only one. */ | ||
478 | |||
479 | /* Initiate callback invocation as needed. */ | ||
480 | idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); | ||
481 | idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); | ||
482 | rcu_for_each_node_breadth_first(sp, snp) { | ||
483 | spin_lock_irq(&snp->lock); | ||
484 | cbs = false; | ||
485 | if (snp >= sp->level[rcu_num_lvls - 1]) | ||
486 | cbs = snp->srcu_have_cbs[idx] == gpseq; | ||
487 | snp->srcu_have_cbs[idx] = gpseq; | ||
488 | rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); | ||
489 | spin_unlock_irq(&snp->lock); | ||
490 | if (cbs) { | ||
491 | smp_mb(); /* GP end before CB invocation. */ | ||
492 | srcu_schedule_cbs_snp(sp, snp); | ||
493 | } | ||
494 | } | ||
495 | |||
496 | /* Callback initiation done, allow grace periods after next. */ | ||
497 | mutex_unlock(&sp->srcu_cb_mutex); | ||
498 | |||
499 | /* Start a new grace period if needed. */ | ||
500 | spin_lock_irq(&sp->gp_lock); | ||
501 | gpseq = rcu_seq_current(&sp->srcu_gp_seq); | ||
502 | if (!rcu_seq_state(gpseq) && | ||
503 | ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { | ||
504 | srcu_gp_start(sp); | ||
505 | spin_unlock_irq(&sp->gp_lock); | ||
506 | /* Throttle expedited grace periods: Should be rare! */ | ||
507 | srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) && | ||
508 | rcu_seq_ctr(gpseq) & 0xf | ||
509 | ? 0 | ||
510 | : SRCU_INTERVAL); | ||
511 | } else { | ||
512 | spin_unlock_irq(&sp->gp_lock); | ||
513 | } | ||
514 | } | ||
515 | |||
516 | /* | ||
517 | * Funnel-locking scheme to scalably mediate many concurrent grace-period | ||
518 | * requests. The winner has to do the work of actually starting grace | ||
519 | * period s. Losers must either ensure that their desired grace-period | ||
520 | * number is recorded on at least their leaf srcu_node structure, or they | ||
521 | * must take steps to invoke their own callbacks. | ||
522 | */ | ||
523 | static void srcu_funnel_gp_start(struct srcu_struct *sp, | ||
524 | struct srcu_data *sdp, | ||
525 | unsigned long s) | ||
526 | { | ||
527 | unsigned long flags; | ||
528 | int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); | ||
529 | struct srcu_node *snp = sdp->mynode; | ||
530 | unsigned long snp_seq; | ||
531 | |||
532 | /* Each pass through the loop does one level of the srcu_node tree. */ | ||
533 | for (; snp != NULL; snp = snp->srcu_parent) { | ||
534 | if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) | ||
535 | return; /* GP already done and CBs recorded. */ | ||
536 | spin_lock_irqsave(&snp->lock, flags); | ||
537 | if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { | ||
538 | snp_seq = snp->srcu_have_cbs[idx]; | ||
539 | spin_unlock_irqrestore(&snp->lock, flags); | ||
540 | if (snp == sdp->mynode && snp_seq != s) { | ||
541 | smp_mb(); /* CBs after GP! */ | ||
542 | srcu_schedule_cbs_sdp(sdp, 0); | ||
543 | } | ||
544 | return; | ||
545 | } | ||
546 | snp->srcu_have_cbs[idx] = s; | ||
547 | spin_unlock_irqrestore(&snp->lock, flags); | ||
548 | } | ||
549 | |||
550 | /* Top of tree, must ensure the grace period will be started. */ | ||
551 | spin_lock_irqsave(&sp->gp_lock, flags); | ||
552 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { | ||
553 | /* | ||
554 | * Record need for grace period s. Pair with load | ||
555 | * acquire setting up for initialization. | ||
556 | */ | ||
557 | smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ | ||
558 | } | ||
559 | |||
560 | /* If grace period not already done and none in progress, start it. */ | ||
561 | if (!rcu_seq_done(&sp->srcu_gp_seq, s) && | ||
562 | rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { | ||
563 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); | ||
564 | srcu_gp_start(sp); | ||
565 | queue_delayed_work(system_power_efficient_wq, &sp->work, | ||
566 | atomic_read(&sp->srcu_exp_cnt) | ||
567 | ? 0 | ||
568 | : SRCU_INTERVAL); | ||
569 | } | ||
570 | spin_unlock_irqrestore(&sp->gp_lock, flags); | ||
571 | } | ||
572 | |||
573 | /* | ||
574 | * Wait until all readers counted by array index idx complete, but | ||
575 | * loop an additional time if there is an expedited grace period pending. | ||
576 | * The caller must ensure that ->srcu_idx is not changed while checking. | ||
577 | */ | ||
578 | static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) | ||
579 | { | ||
580 | for (;;) { | ||
581 | if (srcu_readers_active_idx_check(sp, idx)) | ||
582 | return true; | ||
583 | if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0) | ||
584 | return false; | ||
585 | udelay(SRCU_RETRY_CHECK_DELAY); | ||
586 | } | ||
587 | } | ||
588 | |||
589 | /* | ||
590 | * Increment the ->srcu_idx counter so that future SRCU readers will | ||
591 | * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows | ||
592 | * us to wait for pre-existing readers in a starvation-free manner. | ||
593 | */ | ||
594 | static void srcu_flip(struct srcu_struct *sp) | ||
595 | { | ||
596 | WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); | ||
597 | |||
598 | /* | ||
599 | * Ensure that if the updater misses an __srcu_read_unlock() | ||
600 | * increment, that task's next __srcu_read_lock() will see the | ||
601 | * above counter update. Note that both this memory barrier | ||
602 | * and the one in srcu_readers_active_idx_check() provide the | ||
603 | * guarantee for __srcu_read_lock(). | ||
604 | */ | ||
605 | smp_mb(); /* D */ /* Pairs with C. */ | ||
606 | } | ||
607 | |||
608 | /* | ||
609 | * Enqueue an SRCU callback on the srcu_data structure associated with | ||
610 | * the current CPU and the specified srcu_struct structure, initiating | ||
611 | * grace-period processing if it is not already running. | ||
612 | * | ||
613 | * Note that all CPUs must agree that the grace period extended beyond | ||
614 | * all pre-existing SRCU read-side critical sections. On systems with | ||
615 | * more than one CPU, this means that when "func()" is invoked, each CPU | ||
616 | * is guaranteed to have executed a full memory barrier since the end of | ||
617 | * its last corresponding SRCU read-side critical section whose beginning | ||
618 | * preceded the call to call_srcu(). It also means that each CPU executing | ||
619 | * an SRCU read-side critical section that continues beyond the start of | ||
620 | * "func()" must have executed a memory barrier after the call_srcu() | ||
621 | * but before the beginning of that SRCU read-side critical section. | ||
622 | * Note that these guarantees include CPUs that are offline, idle, or | ||
623 | * executing in user mode, as well as CPUs that are executing in the kernel. | ||
624 | * | ||
625 | * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the | ||
626 | * resulting SRCU callback function "func()", then both CPU A and CPU | ||
627 | * B are guaranteed to execute a full memory barrier during the time | ||
628 | * interval between the call to call_srcu() and the invocation of "func()". | ||
629 | * This guarantee applies even if CPU A and CPU B are the same CPU (but | ||
630 | * again only if the system has more than one CPU). | ||
631 | * | ||
632 | * Of course, these guarantees apply only for invocations of call_srcu(), | ||
633 | * srcu_read_lock(), and srcu_read_unlock() that are all passed the same | ||
634 | * srcu_struct structure. | ||
635 | */ | ||
636 | void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, | ||
637 | rcu_callback_t func) | ||
638 | { | ||
639 | unsigned long flags; | ||
640 | bool needgp = false; | ||
641 | unsigned long s; | ||
642 | struct srcu_data *sdp; | ||
643 | |||
644 | check_init_srcu_struct(sp); | ||
645 | rhp->func = func; | ||
646 | local_irq_save(flags); | ||
647 | sdp = this_cpu_ptr(sp->sda); | ||
648 | spin_lock(&sdp->lock); | ||
649 | rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); | ||
650 | rcu_segcblist_advance(&sdp->srcu_cblist, | ||
651 | rcu_seq_current(&sp->srcu_gp_seq)); | ||
652 | s = rcu_seq_snap(&sp->srcu_gp_seq); | ||
653 | (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); | ||
654 | if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { | ||
655 | sdp->srcu_gp_seq_needed = s; | ||
656 | needgp = true; | ||
657 | } | ||
658 | spin_unlock_irqrestore(&sdp->lock, flags); | ||
659 | if (needgp) | ||
660 | srcu_funnel_gp_start(sp, sdp, s); | ||
661 | } | ||
662 | EXPORT_SYMBOL_GPL(call_srcu); | ||
663 | |||
664 | /* | ||
665 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | ||
666 | */ | ||
667 | static void __synchronize_srcu(struct srcu_struct *sp) | ||
668 | { | ||
669 | struct rcu_synchronize rcu; | ||
670 | |||
671 | RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || | ||
672 | lock_is_held(&rcu_bh_lock_map) || | ||
673 | lock_is_held(&rcu_lock_map) || | ||
674 | lock_is_held(&rcu_sched_lock_map), | ||
675 | "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); | ||
676 | |||
677 | if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) | ||
678 | return; | ||
679 | might_sleep(); | ||
680 | check_init_srcu_struct(sp); | ||
681 | init_completion(&rcu.completion); | ||
682 | init_rcu_head_on_stack(&rcu.head); | ||
683 | call_srcu(sp, &rcu.head, wakeme_after_rcu); | ||
684 | wait_for_completion(&rcu.completion); | ||
685 | destroy_rcu_head_on_stack(&rcu.head); | ||
686 | } | ||
687 | |||
688 | /** | ||
689 | * synchronize_srcu_expedited - Brute-force SRCU grace period | ||
690 | * @sp: srcu_struct with which to synchronize. | ||
691 | * | ||
692 | * Wait for an SRCU grace period to elapse, but be more aggressive about | ||
693 | * spinning rather than blocking when waiting. | ||
694 | * | ||
695 | * Note that synchronize_srcu_expedited() has the same deadlock and | ||
696 | * memory-ordering properties as does synchronize_srcu(). | ||
697 | */ | ||
698 | void synchronize_srcu_expedited(struct srcu_struct *sp) | ||
699 | { | ||
700 | bool do_norm = rcu_gp_is_normal(); | ||
701 | |||
702 | check_init_srcu_struct(sp); | ||
703 | if (!do_norm) { | ||
704 | atomic_inc(&sp->srcu_exp_cnt); | ||
705 | smp_mb__after_atomic(); /* increment before GP. */ | ||
706 | } | ||
707 | __synchronize_srcu(sp); | ||
708 | if (!do_norm) { | ||
709 | smp_mb__before_atomic(); /* GP before decrement. */ | ||
710 | WARN_ON_ONCE(atomic_dec_return(&sp->srcu_exp_cnt) < 0); | ||
711 | } | ||
712 | } | ||
713 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | ||
714 | |||
715 | /** | ||
716 | * synchronize_srcu - wait for prior SRCU read-side critical-section completion | ||
717 | * @sp: srcu_struct with which to synchronize. | ||
718 | * | ||
719 | * Wait for the count to drain to zero of both indexes. To avoid the | ||
720 | * possible starvation of synchronize_srcu(), it waits for the count of | ||
721 | * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, | ||
722 | * and then flip the srcu_idx and wait for the count of the other index. | ||
723 | * | ||
724 | * Can block; must be called from process context. | ||
725 | * | ||
726 | * Note that it is illegal to call synchronize_srcu() from the corresponding | ||
727 | * SRCU read-side critical section; doing so will result in deadlock. | ||
728 | * However, it is perfectly legal to call synchronize_srcu() on one | ||
729 | * srcu_struct from some other srcu_struct's read-side critical section, | ||
730 | * as long as the resulting graph of srcu_structs is acyclic. | ||
731 | * | ||
732 | * There are memory-ordering constraints implied by synchronize_srcu(). | ||
733 | * On systems with more than one CPU, when synchronize_srcu() returns, | ||
734 | * each CPU is guaranteed to have executed a full memory barrier since | ||
735 | * the end of its last corresponding SRCU-sched read-side critical section | ||
736 | * whose beginning preceded the call to synchronize_srcu(). In addition, | ||
737 | * each CPU having an SRCU read-side critical section that extends beyond | ||
738 | * the return from synchronize_srcu() is guaranteed to have executed a | ||
739 | * full memory barrier after the beginning of synchronize_srcu() and before | ||
740 | * the beginning of that SRCU read-side critical section. Note that these | ||
741 | * guarantees include CPUs that are offline, idle, or executing in user mode, | ||
742 | * as well as CPUs that are executing in the kernel. | ||
743 | * | ||
744 | * Furthermore, if CPU A invoked synchronize_srcu(), which returned | ||
745 | * to its caller on CPU B, then both CPU A and CPU B are guaranteed | ||
746 | * to have executed a full memory barrier during the execution of | ||
747 | * synchronize_srcu(). This guarantee applies even if CPU A and CPU B | ||
748 | * are the same CPU, but again only if the system has more than one CPU. | ||
749 | * | ||
750 | * Of course, these memory-ordering guarantees apply only when | ||
751 | * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are | ||
752 | * passed the same srcu_struct structure. | ||
753 | */ | ||
754 | void synchronize_srcu(struct srcu_struct *sp) | ||
755 | { | ||
756 | if (rcu_gp_is_expedited()) | ||
757 | synchronize_srcu_expedited(sp); | ||
758 | else | ||
759 | __synchronize_srcu(sp); | ||
760 | } | ||
761 | EXPORT_SYMBOL_GPL(synchronize_srcu); | ||
762 | |||
763 | /* | ||
764 | * Callback function for srcu_barrier() use. | ||
765 | */ | ||
766 | static void srcu_barrier_cb(struct rcu_head *rhp) | ||
767 | { | ||
768 | struct srcu_data *sdp; | ||
769 | struct srcu_struct *sp; | ||
770 | |||
771 | sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); | ||
772 | sp = sdp->sp; | ||
773 | if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) | ||
774 | complete(&sp->srcu_barrier_completion); | ||
775 | } | ||
776 | |||
777 | /** | ||
778 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | ||
779 | * @sp: srcu_struct on which to wait for in-flight callbacks. | ||
780 | */ | ||
781 | void srcu_barrier(struct srcu_struct *sp) | ||
782 | { | ||
783 | int cpu; | ||
784 | struct srcu_data *sdp; | ||
785 | unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq); | ||
786 | |||
787 | check_init_srcu_struct(sp); | ||
788 | mutex_lock(&sp->srcu_barrier_mutex); | ||
789 | if (rcu_seq_done(&sp->srcu_barrier_seq, s)) { | ||
790 | smp_mb(); /* Force ordering following return. */ | ||
791 | mutex_unlock(&sp->srcu_barrier_mutex); | ||
792 | return; /* Someone else did our work for us. */ | ||
793 | } | ||
794 | rcu_seq_start(&sp->srcu_barrier_seq); | ||
795 | init_completion(&sp->srcu_barrier_completion); | ||
796 | |||
797 | /* Initial count prevents reaching zero until all CBs are posted. */ | ||
798 | atomic_set(&sp->srcu_barrier_cpu_cnt, 1); | ||
799 | |||
800 | /* | ||
801 | * Each pass through this loop enqueues a callback, but only | ||
802 | * on CPUs already having callbacks enqueued. Note that if | ||
803 | * a CPU already has callbacks enqueued, it must have already | ||
804 | * registered the need for a future grace period, so all we | ||
805 | * need do is enqueue a callback that will use the same | ||
806 | * grace period as the last callback already in the queue. | ||
807 | */ | ||
808 | for_each_possible_cpu(cpu) { | ||
809 | sdp = per_cpu_ptr(sp->sda, cpu); | ||
810 | spin_lock_irq(&sdp->lock); | ||
811 | atomic_inc(&sp->srcu_barrier_cpu_cnt); | ||
812 | sdp->srcu_barrier_head.func = srcu_barrier_cb; | ||
813 | if (!rcu_segcblist_entrain(&sdp->srcu_cblist, | ||
814 | &sdp->srcu_barrier_head, 0)) | ||
815 | atomic_dec(&sp->srcu_barrier_cpu_cnt); | ||
816 | spin_unlock_irq(&sdp->lock); | ||
817 | } | ||
818 | |||
819 | /* Remove the initial count, at which point reaching zero can happen. */ | ||
820 | if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) | ||
821 | complete(&sp->srcu_barrier_completion); | ||
822 | wait_for_completion(&sp->srcu_barrier_completion); | ||
823 | |||
824 | rcu_seq_end(&sp->srcu_barrier_seq); | ||
825 | mutex_unlock(&sp->srcu_barrier_mutex); | ||
826 | } | ||
827 | EXPORT_SYMBOL_GPL(srcu_barrier); | ||
828 | |||
829 | /** | ||
830 | * srcu_batches_completed - return batches completed. | ||
831 | * @sp: srcu_struct on which to report batch completion. | ||
832 | * | ||
833 | * Report the number of batches, correlated with, but not necessarily | ||
834 | * precisely the same as, the number of grace periods that have elapsed. | ||
835 | */ | ||
836 | unsigned long srcu_batches_completed(struct srcu_struct *sp) | ||
837 | { | ||
838 | return sp->srcu_idx; | ||
839 | } | ||
840 | EXPORT_SYMBOL_GPL(srcu_batches_completed); | ||
841 | |||
842 | /* | ||
843 | * Core SRCU state machine. Push state bits of ->srcu_gp_seq | ||
844 | * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has | ||
845 | * completed in that state. | ||
846 | */ | ||
847 | static void srcu_advance_state(struct srcu_struct *sp) | ||
848 | { | ||
849 | int idx; | ||
850 | |||
851 | mutex_lock(&sp->srcu_gp_mutex); | ||
852 | |||
853 | /* | ||
854 | * Because readers might be delayed for an extended period after | ||
855 | * fetching ->srcu_idx for their index, at any point in time there | ||
856 | * might well be readers using both idx=0 and idx=1. We therefore | ||
857 | * need to wait for readers to clear from both index values before | ||
858 | * invoking a callback. | ||
859 | * | ||
860 | * The load-acquire ensures that we see the accesses performed | ||
861 | * by the prior grace period. | ||
862 | */ | ||
863 | idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ | ||
864 | if (idx == SRCU_STATE_IDLE) { | ||
865 | spin_lock_irq(&sp->gp_lock); | ||
866 | if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { | ||
867 | WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); | ||
868 | spin_unlock_irq(&sp->gp_lock); | ||
869 | mutex_unlock(&sp->srcu_gp_mutex); | ||
870 | return; | ||
871 | } | ||
872 | idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); | ||
873 | if (idx == SRCU_STATE_IDLE) | ||
874 | srcu_gp_start(sp); | ||
875 | spin_unlock_irq(&sp->gp_lock); | ||
876 | if (idx != SRCU_STATE_IDLE) { | ||
877 | mutex_unlock(&sp->srcu_gp_mutex); | ||
878 | return; /* Someone else started the grace period. */ | ||
879 | } | ||
880 | } | ||
881 | |||
882 | if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { | ||
883 | idx = 1 ^ (sp->srcu_idx & 1); | ||
884 | if (!try_check_zero(sp, idx, 1)) { | ||
885 | mutex_unlock(&sp->srcu_gp_mutex); | ||
886 | return; /* readers present, retry later. */ | ||
887 | } | ||
888 | srcu_flip(sp); | ||
889 | rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); | ||
890 | } | ||
891 | |||
892 | if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { | ||
893 | |||
894 | /* | ||
895 | * SRCU read-side critical sections are normally short, | ||
896 | * so check at least twice in quick succession after a flip. | ||
897 | */ | ||
898 | idx = 1 ^ (sp->srcu_idx & 1); | ||
899 | if (!try_check_zero(sp, idx, 2)) { | ||
900 | mutex_unlock(&sp->srcu_gp_mutex); | ||
901 | return; /* readers present, retry later. */ | ||
902 | } | ||
903 | srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */ | ||
904 | } | ||
905 | } | ||
906 | |||
907 | /* | ||
908 | * Invoke a limited number of SRCU callbacks that have passed through | ||
909 | * their grace period. If there are more to do, SRCU will reschedule | ||
910 | * the workqueue. Note that needed memory barriers have been executed | ||
911 | * in this task's context by srcu_readers_active_idx_check(). | ||
912 | */ | ||
913 | static void srcu_invoke_callbacks(struct work_struct *work) | ||
914 | { | ||
915 | bool more; | ||
916 | struct rcu_cblist ready_cbs; | ||
917 | struct rcu_head *rhp; | ||
918 | struct srcu_data *sdp; | ||
919 | struct srcu_struct *sp; | ||
920 | |||
921 | sdp = container_of(work, struct srcu_data, work.work); | ||
922 | sp = sdp->sp; | ||
923 | rcu_cblist_init(&ready_cbs); | ||
924 | spin_lock_irq(&sdp->lock); | ||
925 | smp_mb(); /* Old grace periods before callback invocation! */ | ||
926 | rcu_segcblist_advance(&sdp->srcu_cblist, | ||
927 | rcu_seq_current(&sp->srcu_gp_seq)); | ||
928 | if (sdp->srcu_cblist_invoking || | ||
929 | !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { | ||
930 | spin_unlock_irq(&sdp->lock); | ||
931 | return; /* Someone else on the job or nothing to do. */ | ||
932 | } | ||
933 | |||
934 | /* We are on the job! Extract and invoke ready callbacks. */ | ||
935 | sdp->srcu_cblist_invoking = true; | ||
936 | rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); | ||
937 | spin_unlock_irq(&sdp->lock); | ||
938 | rhp = rcu_cblist_dequeue(&ready_cbs); | ||
939 | for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { | ||
940 | local_bh_disable(); | ||
941 | rhp->func(rhp); | ||
942 | local_bh_enable(); | ||
943 | } | ||
944 | |||
945 | /* | ||
946 | * Update counts, accelerate new callbacks, and if needed, | ||
947 | * schedule another round of callback invocation. | ||
948 | */ | ||
949 | spin_lock_irq(&sdp->lock); | ||
950 | rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); | ||
951 | (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, | ||
952 | rcu_seq_snap(&sp->srcu_gp_seq)); | ||
953 | sdp->srcu_cblist_invoking = false; | ||
954 | more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); | ||
955 | spin_unlock_irq(&sdp->lock); | ||
956 | if (more) | ||
957 | srcu_schedule_cbs_sdp(sdp, 0); | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Finished one round of SRCU grace period. Start another if there are | ||
962 | * more SRCU callbacks queued, otherwise put SRCU into not-running state. | ||
963 | */ | ||
964 | static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) | ||
965 | { | ||
966 | bool pushgp = true; | ||
967 | |||
968 | spin_lock_irq(&sp->gp_lock); | ||
969 | if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { | ||
970 | if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { | ||
971 | /* All requests fulfilled, time to go idle. */ | ||
972 | pushgp = false; | ||
973 | } | ||
974 | } else if (!rcu_seq_state(sp->srcu_gp_seq)) { | ||
975 | /* Outstanding request and no GP. Start one. */ | ||
976 | srcu_gp_start(sp); | ||
977 | } | ||
978 | spin_unlock_irq(&sp->gp_lock); | ||
979 | |||
980 | if (pushgp) | ||
981 | queue_delayed_work(system_power_efficient_wq, &sp->work, delay); | ||
982 | } | ||
983 | |||
984 | /* | ||
985 | * This is the work-queue function that handles SRCU grace periods. | ||
986 | */ | ||
987 | void process_srcu(struct work_struct *work) | ||
988 | { | ||
989 | struct srcu_struct *sp; | ||
990 | |||
991 | sp = container_of(work, struct srcu_struct, work.work); | ||
992 | |||
993 | srcu_advance_state(sp); | ||
994 | srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); | ||
995 | } | ||
996 | EXPORT_SYMBOL_GPL(process_srcu); | ||
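To complement the call_srcu()-based sketch after srcutiny.c, here is the blocking variant built on synchronize_srcu() as documented above; again, the "my_"-prefixed names are hypothetical and the snippet assumes a kernel context:
<pre>
#include &lt;linux/slab.h&gt;
#include &lt;linux/srcu.h&gt;

struct my_obj;		/* defined in the earlier sketch */

static void my_update_sync(struct srcu_struct *sp,
			   struct my_obj __rcu **slot,
			   struct my_obj *newp)
{
	struct my_obj *oldp;

	oldp = rcu_dereference_protected(*slot, 1);	/* updates serialized by caller */
	rcu_assign_pointer(*slot, newp);		/* publish the new version */
	synchronize_srcu(sp);		/* wait out pre-existing readers; may sleep */
	kfree(oldp);			/* kfree(NULL) is a no-op */
}
</pre>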
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 6ad330dbbae2..e5385731e391 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
@@ -79,7 +79,7 @@ EXPORT_SYMBOL(__rcu_is_watching); | |||
79 | */ | 79 | */ |
80 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 80 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) |
81 | { | 81 | { |
82 | RCU_TRACE(reset_cpu_stall_ticks(rcp)); | 82 | RCU_TRACE(reset_cpu_stall_ticks(rcp);) |
83 | if (rcp->donetail != rcp->curtail) { | 83 | if (rcp->donetail != rcp->curtail) { |
84 | rcp->donetail = rcp->curtail; | 84 | rcp->donetail = rcp->curtail; |
85 | return 1; | 85 | return 1; |
@@ -125,7 +125,7 @@ void rcu_bh_qs(void) | |||
125 | */ | 125 | */ |
126 | void rcu_check_callbacks(int user) | 126 | void rcu_check_callbacks(int user) |
127 | { | 127 | { |
128 | RCU_TRACE(check_cpu_stalls()); | 128 | RCU_TRACE(check_cpu_stalls();) |
129 | if (user) | 129 | if (user) |
130 | rcu_sched_qs(); | 130 | rcu_sched_qs(); |
131 | else if (!in_softirq()) | 131 | else if (!in_softirq()) |
@@ -143,7 +143,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
143 | const char *rn = NULL; | 143 | const char *rn = NULL; |
144 | struct rcu_head *next, *list; | 144 | struct rcu_head *next, *list; |
145 | unsigned long flags; | 145 | unsigned long flags; |
146 | RCU_TRACE(int cb_count = 0); | 146 | RCU_TRACE(int cb_count = 0;) |
147 | 147 | ||
148 | /* Move the ready-to-invoke callbacks to a local list. */ | 148 | /* Move the ready-to-invoke callbacks to a local list. */ |
149 | local_irq_save(flags); | 149 | local_irq_save(flags); |
@@ -152,7 +152,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
152 | local_irq_restore(flags); | 152 | local_irq_restore(flags); |
153 | return; | 153 | return; |
154 | } | 154 | } |
155 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); | 155 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);) |
156 | list = rcp->rcucblist; | 156 | list = rcp->rcucblist; |
157 | rcp->rcucblist = *rcp->donetail; | 157 | rcp->rcucblist = *rcp->donetail; |
158 | *rcp->donetail = NULL; | 158 | *rcp->donetail = NULL; |
@@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
162 | local_irq_restore(flags); | 162 | local_irq_restore(flags); |
163 | 163 | ||
164 | /* Invoke the callbacks on the local list. */ | 164 | /* Invoke the callbacks on the local list. */ |
165 | RCU_TRACE(rn = rcp->name); | 165 | RCU_TRACE(rn = rcp->name;) |
166 | while (list) { | 166 | while (list) { |
167 | next = list->next; | 167 | next = list->next; |
168 | prefetch(next); | 168 | prefetch(next); |
@@ -171,9 +171,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
171 | __rcu_reclaim(rn, list); | 171 | __rcu_reclaim(rn, list); |
172 | local_bh_enable(); | 172 | local_bh_enable(); |
173 | list = next; | 173 | list = next; |
174 | RCU_TRACE(cb_count++); | 174 | RCU_TRACE(cb_count++;) |
175 | } | 175 | } |
176 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | 176 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) |
177 | RCU_TRACE(trace_rcu_batch_end(rcp->name, | 177 | RCU_TRACE(trace_rcu_batch_end(rcp->name, |
178 | cb_count, 0, need_resched(), | 178 | cb_count, 0, need_resched(), |
179 | is_idle_task(current), | 179 | is_idle_task(current), |
@@ -221,7 +221,7 @@ static void __call_rcu(struct rcu_head *head, | |||
221 | local_irq_save(flags); | 221 | local_irq_save(flags); |
222 | *rcp->curtail = head; | 222 | *rcp->curtail = head; |
223 | rcp->curtail = &head->next; | 223 | rcp->curtail = &head->next; |
224 | RCU_TRACE(rcp->qlen++); | 224 | RCU_TRACE(rcp->qlen++;) |
225 | local_irq_restore(flags); | 225 | local_irq_restore(flags); |
226 | 226 | ||
227 | if (unlikely(is_idle_task(current))) { | 227 | if (unlikely(is_idle_task(current))) { |
@@ -254,8 +254,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); | |||
254 | void __init rcu_init(void) | 254 | void __init rcu_init(void) |
255 | { | 255 | { |
256 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 256 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
257 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); | 257 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);) |
258 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); | 258 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);) |
259 | 259 | ||
260 | rcu_early_boot_tests(); | 260 | rcu_early_boot_tests(); |
261 | } | 261 | } |
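Annotation: the RCU_TRACE() hunks above move the trailing semicolon inside the macro argument. The payoff is easiest to see from the (assumed, simplified) two-branch shape of the macro: when tracing is compiled out the whole argument vanishes, so RCU_TRACE(stmt;) leaves no stray empty statement behind, which matters most when the argument is a declaration such as "int cb_count = 0". The sketch below is illustrative, not the exact kernel header.

    /* Simplified sketch of the RCU_TRACE() idiom (assumed shape). */
    #include <stdio.h>

    #define CONFIG_RCU_TRACE 1          /* comment out: every RCU_TRACE() line vanishes */

    #ifdef CONFIG_RCU_TRACE
    #define RCU_TRACE(stmt) stmt
    #else
    #define RCU_TRACE(stmt)
    #endif

    static void invoke_batch(int ncbs)
    {
            RCU_TRACE(int cb_count = 0;)          /* a declaration, not just a call */

            for (int i = 0; i < ncbs; i++) {
                    /* ...invoke one callback here... */
                    RCU_TRACE(cb_count++;)
            }
            RCU_TRACE(printf("invoked %d callbacks\n", cb_count);)
    }

    int main(void)
    {
            invoke_batch(3);
            return 0;
    }

With the old spelling RCU_TRACE(stmt); the disabled configuration still expands to a bare ";", which is exactly the kind of leftover the conversion above removes.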
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index c64b827ecbca..371034e77f87 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h | |||
@@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { | |||
52 | RCU_TRACE(.name = "rcu_bh") | 52 | RCU_TRACE(.name = "rcu_bh") |
53 | }; | 53 | }; |
54 | 54 | ||
55 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 55 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) |
56 | #include <linux/kernel_stat.h> | 56 | #include <linux/kernel_stat.h> |
57 | 57 | ||
58 | int rcu_scheduler_active __read_mostly; | 58 | int rcu_scheduler_active __read_mostly; |
@@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); | |||
65 | * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. | 65 | * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. |
66 | * The reason for this is that Tiny RCU does not need kthreads, so does | 66 | * The reason for this is that Tiny RCU does not need kthreads, so does |
67 | * not have to care about the fact that the scheduler is half-initialized | 67 | * not have to care about the fact that the scheduler is half-initialized |
68 | * at a certain phase of the boot process. | 68 | * at a certain phase of the boot process. Unless SRCU is in the mix. |
69 | */ | 69 | */ |
70 | void __init rcu_scheduler_starting(void) | 70 | void __init rcu_scheduler_starting(void) |
71 | { | 71 | { |
72 | WARN_ON(nr_context_switches() > 0); | 72 | WARN_ON(nr_context_switches() > 0); |
73 | rcu_scheduler_active = RCU_SCHEDULER_RUNNING; | 73 | rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) |
74 | ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; | ||
74 | } | 75 | } |
75 | 76 | ||
76 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 77 | #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ |
77 | 78 | ||
78 | #ifdef CONFIG_RCU_TRACE | 79 | #ifdef CONFIG_RCU_TRACE |
79 | 80 | ||
@@ -162,8 +163,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) | |||
162 | 163 | ||
163 | static void check_cpu_stalls(void) | 164 | static void check_cpu_stalls(void) |
164 | { | 165 | { |
165 | RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); | 166 | RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);) |
166 | RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); | 167 | RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);) |
167 | } | 168 | } |
168 | 169 | ||
169 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 170 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 50fee7689e71..23aa02587d0f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -97,8 +97,8 @@ struct rcu_state sname##_state = { \ | |||
97 | .gpnum = 0UL - 300UL, \ | 97 | .gpnum = 0UL - 300UL, \ |
98 | .completed = 0UL - 300UL, \ | 98 | .completed = 0UL - 300UL, \ |
99 | .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ | 99 | .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ |
100 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 100 | .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ |
101 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 101 | .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ |
102 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 102 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
103 | .name = RCU_STATE_NAME(sname), \ | 103 | .name = RCU_STATE_NAME(sname), \ |
104 | .abbr = sabbr, \ | 104 | .abbr = sabbr, \ |
@@ -123,7 +123,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF; | |||
123 | module_param(rcu_fanout_leaf, int, 0444); | 123 | module_param(rcu_fanout_leaf, int, 0444); |
124 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; | 124 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; |
125 | /* Number of rcu_nodes at specified level. */ | 125 | /* Number of rcu_nodes at specified level. */ |
126 | static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; | 126 | int num_rcu_lvl[] = NUM_RCU_LVL_INIT; |
127 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ | 127 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ |
128 | /* panic() on RCU Stall sysctl. */ | 128 | /* panic() on RCU Stall sysctl. */ |
129 | int sysctl_panic_on_rcu_stall __read_mostly; | 129 | int sysctl_panic_on_rcu_stall __read_mostly; |
@@ -199,7 +199,7 @@ static const int gp_cleanup_delay; | |||
199 | 199 | ||
200 | /* | 200 | /* |
201 | * Number of grace periods between delays, normalized by the duration of | 201 | * Number of grace periods between delays, normalized by the duration of |
202 | * the delay. The longer the the delay, the more the grace periods between | 202 | * the delay. The longer the delay, the more the grace periods between |
203 | * each delay. The reason for this normalization is that it means that, | 203 | * each delay. The reason for this normalization is that it means that, |
204 | * for non-zero delays, the overall slowdown of grace periods is constant | 204 | * for non-zero delays, the overall slowdown of grace periods is constant |
205 | * regardless of the duration of the delay. This arrangement balances | 205 | * regardless of the duration of the delay. This arrangement balances |
@@ -272,11 +272,19 @@ void rcu_bh_qs(void) | |||
272 | } | 272 | } |
273 | } | 273 | } |
274 | 274 | ||
275 | static DEFINE_PER_CPU(int, rcu_sched_qs_mask); | 275 | /* |
276 | * Steal a bit from the bottom of ->dynticks for idle entry/exit | ||
277 | * control. Initially this is for TLB flushing. | ||
278 | */ | ||
279 | #define RCU_DYNTICK_CTRL_MASK 0x1 | ||
280 | #define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) | ||
281 | #ifndef rcu_eqs_special_exit | ||
282 | #define rcu_eqs_special_exit() do { } while (0) | ||
283 | #endif | ||
276 | 284 | ||
277 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 285 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
278 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 286 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
279 | .dynticks = ATOMIC_INIT(1), | 287 | .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), |
280 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 288 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE |
281 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | 289 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, |
282 | .dynticks_idle = ATOMIC_INIT(1), | 290 | .dynticks_idle = ATOMIC_INIT(1), |
@@ -290,15 +298,20 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | |||
290 | static void rcu_dynticks_eqs_enter(void) | 298 | static void rcu_dynticks_eqs_enter(void) |
291 | { | 299 | { |
292 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 300 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
293 | int special; | 301 | int seq; |
294 | 302 | ||
295 | /* | 303 | /* |
296 | * CPUs seeing atomic_inc_return() must see prior RCU read-side | 304 | * CPUs seeing atomic_add_return() must see prior RCU read-side |
297 | * critical sections, and we also must force ordering with the | 305 | * critical sections, and we also must force ordering with the |
298 | * next idle sojourn. | 306 | * next idle sojourn. |
299 | */ | 307 | */ |
300 | special = atomic_inc_return(&rdtp->dynticks); | 308 | seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); |
301 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); | 309 | /* Better be in an extended quiescent state! */ |
310 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && | ||
311 | (seq & RCU_DYNTICK_CTRL_CTR)); | ||
312 | /* Better not have special action (TLB flush) pending! */ | ||
313 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && | ||
314 | (seq & RCU_DYNTICK_CTRL_MASK)); | ||
302 | } | 315 | } |
303 | 316 | ||
304 | /* | 317 | /* |
@@ -308,15 +321,22 @@ static void rcu_dynticks_eqs_enter(void) | |||
308 | static void rcu_dynticks_eqs_exit(void) | 321 | static void rcu_dynticks_eqs_exit(void) |
309 | { | 322 | { |
310 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 323 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
311 | int special; | 324 | int seq; |
312 | 325 | ||
313 | /* | 326 | /* |
314 | * CPUs seeing atomic_inc_return() must see prior idle sojourns, | 327 | * CPUs seeing atomic_add_return() must see prior idle sojourns, |
315 | * and we also must force ordering with the next RCU read-side | 328 | * and we also must force ordering with the next RCU read-side |
316 | * critical section. | 329 | * critical section. |
317 | */ | 330 | */ |
318 | special = atomic_inc_return(&rdtp->dynticks); | 331 | seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); |
319 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); | 332 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && |
333 | !(seq & RCU_DYNTICK_CTRL_CTR)); | ||
334 | if (seq & RCU_DYNTICK_CTRL_MASK) { | ||
335 | atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); | ||
336 | smp_mb__after_atomic(); /* _exit after clearing mask. */ | ||
337 | /* Prefer duplicate flushes to losing a flush. */ | ||
338 | rcu_eqs_special_exit(); | ||
339 | } | ||
320 | } | 340 | } |
321 | 341 | ||
322 | /* | 342 | /* |
@@ -333,9 +353,9 @@ static void rcu_dynticks_eqs_online(void) | |||
333 | { | 353 | { |
334 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 354 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
335 | 355 | ||
336 | if (atomic_read(&rdtp->dynticks) & 0x1) | 356 | if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) |
337 | return; | 357 | return; |
338 | atomic_add(0x1, &rdtp->dynticks); | 358 | atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); |
339 | } | 359 | } |
340 | 360 | ||
341 | /* | 361 | /* |
@@ -347,7 +367,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) | |||
347 | { | 367 | { |
348 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 368 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
349 | 369 | ||
350 | return !(atomic_read(&rdtp->dynticks) & 0x1); | 370 | return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); |
351 | } | 371 | } |
352 | 372 | ||
353 | /* | 373 | /* |
@@ -358,7 +378,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) | |||
358 | { | 378 | { |
359 | int snap = atomic_add_return(0, &rdtp->dynticks); | 379 | int snap = atomic_add_return(0, &rdtp->dynticks); |
360 | 380 | ||
361 | return snap; | 381 | return snap & ~RCU_DYNTICK_CTRL_MASK; |
362 | } | 382 | } |
363 | 383 | ||
364 | /* | 384 | /* |
@@ -367,7 +387,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) | |||
367 | */ | 387 | */ |
368 | static bool rcu_dynticks_in_eqs(int snap) | 388 | static bool rcu_dynticks_in_eqs(int snap) |
369 | { | 389 | { |
370 | return !(snap & 0x1); | 390 | return !(snap & RCU_DYNTICK_CTRL_CTR); |
371 | } | 391 | } |
372 | 392 | ||
373 | /* | 393 | /* |
@@ -387,14 +407,34 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) | |||
387 | static void rcu_dynticks_momentary_idle(void) | 407 | static void rcu_dynticks_momentary_idle(void) |
388 | { | 408 | { |
389 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 409 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
390 | int special = atomic_add_return(2, &rdtp->dynticks); | 410 | int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, |
411 | &rdtp->dynticks); | ||
391 | 412 | ||
392 | /* It is illegal to call this from idle state. */ | 413 | /* It is illegal to call this from idle state. */ |
393 | WARN_ON_ONCE(!(special & 0x1)); | 414 | WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); |
394 | } | 415 | } |
395 | 416 | ||
396 | DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); | 417 | /* |
397 | EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); | 418 | * Set the special (bottom) bit of the specified CPU so that it |
419 | * will take special action (such as flushing its TLB) on the | ||
420 | * next exit from an extended quiescent state. Returns true if | ||
421 | * the bit was successfully set, or false if the CPU was not in | ||
422 | * an extended quiescent state. | ||
423 | */ | ||
424 | bool rcu_eqs_special_set(int cpu) | ||
425 | { | ||
426 | int old; | ||
427 | int new; | ||
428 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
429 | |||
430 | do { | ||
431 | old = atomic_read(&rdtp->dynticks); | ||
432 | if (old & RCU_DYNTICK_CTRL_CTR) | ||
433 | return false; | ||
434 | new = old | RCU_DYNTICK_CTRL_MASK; | ||
435 | } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); | ||
436 | return true; | ||
437 | } | ||
398 | 438 | ||
399 | /* | 439 | /* |
400 | * Let the RCU core know that this CPU has gone through the scheduler, | 440 | * Let the RCU core know that this CPU has gone through the scheduler, |
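Annotation: the hunks above steal the bottom bit of ->dynticks (RCU_DYNTICK_CTRL_MASK) as a "do something special on the next idle exit" request, while the idle-entry/exit counter advances in units of RCU_DYNTICK_CTRL_CTR so the two never collide; rcu_eqs_special_set() only sets the bit when the target CPU is idle, and the exit path clears it and runs the deferred action. The user-space sketch below mirrors that scheme with C11 atomics; all names and the printed "special action" are illustrative assumptions, not kernel code.

    /* Hypothetical sketch of the bit-stealing scheme: the counter moves in
     * steps of CTRL_CTR, leaving the CTRL_MASK bit free as a request flag. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define CTRL_MASK 0x1                 /* special-action request bit */
    #define CTRL_CTR  (CTRL_MASK + 1)     /* counter increment: leaves bit 0 alone */

    static atomic_int dynticks = CTRL_CTR;        /* start "not idle", no request */

    static bool in_eqs(int snap) { return !(snap & CTRL_CTR); }

    /* Ask an idle CPU to run special work when it next leaves idle. */
    static bool request_special(void)
    {
            int old = atomic_load(&dynticks);

            do {
                    if (old & CTRL_CTR)
                            return false;         /* not idle: caller must act directly */
            } while (!atomic_compare_exchange_weak(&dynticks, &old, old | CTRL_MASK));
            return true;
    }

    /* Idle exit: bump the counter and honor any pending request. */
    static void eqs_exit(void)
    {
            int seq = atomic_fetch_add(&dynticks, CTRL_CTR) + CTRL_CTR;

            if (seq & CTRL_MASK) {
                    atomic_fetch_and(&dynticks, ~CTRL_MASK);   /* clear the request */
                    puts("running deferred special action");
            }
    }

    static void eqs_enter(void) { atomic_fetch_add(&dynticks, CTRL_CTR); }

    int main(void)
    {
            eqs_enter();                           /* CPU goes idle */
            printf("idle now: %d\n", in_eqs(atomic_load(&dynticks)));
            printf("request accepted: %d\n", request_special());
            eqs_exit();                            /* request honored here */
            return 0;
    }

Because the compare-and-swap refuses to set the bit once the counter shows the CPU is no longer idle, a request is either queued for the next idle exit or rejected so the caller can do the work itself; duplicate actions are preferred to lost ones.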
@@ -403,44 +443,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); | |||
403 | * memory barriers to let the RCU core know about it, regardless of what | 443 | * memory barriers to let the RCU core know about it, regardless of what |
404 | * this CPU might (or might not) do in the near future. | 444 | * this CPU might (or might not) do in the near future. |
405 | * | 445 | * |
406 | * We inform the RCU core by emulating a zero-duration dyntick-idle | 446 | * We inform the RCU core by emulating a zero-duration dyntick-idle period. |
407 | * period, which we in turn do by incrementing the ->dynticks counter | ||
408 | * by two. | ||
409 | * | 447 | * |
410 | * The caller must have disabled interrupts. | 448 | * The caller must have disabled interrupts. |
411 | */ | 449 | */ |
412 | static void rcu_momentary_dyntick_idle(void) | 450 | static void rcu_momentary_dyntick_idle(void) |
413 | { | 451 | { |
414 | struct rcu_data *rdp; | 452 | raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); |
415 | int resched_mask; | 453 | rcu_dynticks_momentary_idle(); |
416 | struct rcu_state *rsp; | ||
417 | |||
418 | /* | ||
419 | * Yes, we can lose flag-setting operations. This is OK, because | ||
420 | * the flag will be set again after some delay. | ||
421 | */ | ||
422 | resched_mask = raw_cpu_read(rcu_sched_qs_mask); | ||
423 | raw_cpu_write(rcu_sched_qs_mask, 0); | ||
424 | |||
425 | /* Find the flavor that needs a quiescent state. */ | ||
426 | for_each_rcu_flavor(rsp) { | ||
427 | rdp = raw_cpu_ptr(rsp->rda); | ||
428 | if (!(resched_mask & rsp->flavor_mask)) | ||
429 | continue; | ||
430 | smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ | ||
431 | if (READ_ONCE(rdp->mynode->completed) != | ||
432 | READ_ONCE(rdp->cond_resched_completed)) | ||
433 | continue; | ||
434 | |||
435 | /* | ||
436 | * Pretend to be momentarily idle for the quiescent state. | ||
437 | * This allows the grace-period kthread to record the | ||
438 | * quiescent state, with no need for this CPU to do anything | ||
439 | * further. | ||
440 | */ | ||
441 | rcu_dynticks_momentary_idle(); | ||
442 | break; | ||
443 | } | ||
444 | } | 454 | } |
445 | 455 | ||
446 | /* | 456 | /* |
@@ -448,14 +458,22 @@ static void rcu_momentary_dyntick_idle(void) | |||
448 | * and requires special handling for preemptible RCU. | 458 | * and requires special handling for preemptible RCU. |
449 | * The caller must have disabled interrupts. | 459 | * The caller must have disabled interrupts. |
450 | */ | 460 | */ |
451 | void rcu_note_context_switch(void) | 461 | void rcu_note_context_switch(bool preempt) |
452 | { | 462 | { |
453 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | 463 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ |
454 | trace_rcu_utilization(TPS("Start context switch")); | 464 | trace_rcu_utilization(TPS("Start context switch")); |
455 | rcu_sched_qs(); | 465 | rcu_sched_qs(); |
456 | rcu_preempt_note_context_switch(); | 466 | rcu_preempt_note_context_switch(); |
457 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | 467 | /* Load rcu_urgent_qs before other flags. */ |
468 | if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) | ||
469 | goto out; | ||
470 | this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); | ||
471 | if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) | ||
458 | rcu_momentary_dyntick_idle(); | 472 | rcu_momentary_dyntick_idle(); |
473 | this_cpu_inc(rcu_dynticks.rcu_qs_ctr); | ||
474 | if (!preempt) | ||
475 | rcu_note_voluntary_context_switch_lite(current); | ||
476 | out: | ||
459 | trace_rcu_utilization(TPS("End context switch")); | 477 | trace_rcu_utilization(TPS("End context switch")); |
460 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | 478 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ |
461 | } | 479 | } |
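Annotation: the reworked rcu_note_context_switch() above gates all of its bookkeeping behind a single rcu_urgent_qs flag. The grace-period side sets rcu_need_heavy_qs first and then store-releases rcu_urgent_qs, while the context-switch side load-acquires rcu_urgent_qs before looking at the heavier flag, so seeing the gate open guarantees the heavy-QS request is also visible. A user-space sketch of that release/acquire pairing follows; the names are illustrative, and C11 atomics stand in for smp_store_release()/smp_load_acquire().

    /* Illustrative release/acquire pairing mirroring rcu_urgent_qs /
     * rcu_need_heavy_qs; names and the emulated action are assumptions. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool urgent_qs;          /* cheap gate checked on every switch */
    static atomic_bool need_heavy_qs;      /* rarely-set request for heavier work */

    /* Grace-period side: publish the heavy request, then open the gate. */
    static void request_urgent_heavy_qs(void)
    {
            atomic_store_explicit(&need_heavy_qs, true, memory_order_relaxed);
            /* Release: the heavy flag is visible before the gate opens. */
            atomic_store_explicit(&urgent_qs, true, memory_order_release);
    }

    /* Context-switch side: one acquire load decides whether to look further. */
    static void note_context_switch(void)
    {
            if (!atomic_load_explicit(&urgent_qs, memory_order_acquire))
                    return;                                /* common case: nothing to do */
            atomic_store_explicit(&urgent_qs, false, memory_order_relaxed);
            if (atomic_load_explicit(&need_heavy_qs, memory_order_relaxed)) {
                    atomic_store_explicit(&need_heavy_qs, false, memory_order_relaxed);
                    puts("emulating a momentary dyntick-idle period");
            }
    }

    int main(void)
    {
            note_context_switch();                 /* gate closed: early return */
            request_urgent_heavy_qs();
            note_context_switch();                 /* gate open: heavy work runs */
            return 0;
    }

The design point is that the hot path pays for exactly one acquire load in the common no-work case; everything heavier hides behind the gate.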
@@ -478,29 +496,26 @@ void rcu_all_qs(void) | |||
478 | { | 496 | { |
479 | unsigned long flags; | 497 | unsigned long flags; |
480 | 498 | ||
499 | if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs)) | ||
500 | return; | ||
501 | preempt_disable(); | ||
502 | /* Load rcu_urgent_qs before other flags. */ | ||
503 | if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) { | ||
504 | preempt_enable(); | ||
505 | return; | ||
506 | } | ||
507 | this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); | ||
481 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | 508 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ |
482 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { | 509 | if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { |
483 | local_irq_save(flags); | 510 | local_irq_save(flags); |
484 | rcu_momentary_dyntick_idle(); | 511 | rcu_momentary_dyntick_idle(); |
485 | local_irq_restore(flags); | 512 | local_irq_restore(flags); |
486 | } | 513 | } |
487 | if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { | 514 | if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) |
488 | /* | ||
489 | * Yes, we just checked a per-CPU variable with preemption | ||
490 | * enabled, so we might be migrated to some other CPU at | ||
491 | * this point. That is OK because in that case, the | ||
492 | * migration will supply the needed quiescent state. | ||
493 | * We might end up needlessly disabling preemption and | ||
494 | * invoking rcu_sched_qs() on the destination CPU, but | ||
495 | * the probability and cost are both quite low, so this | ||
496 | * should not be a problem in practice. | ||
497 | */ | ||
498 | preempt_disable(); | ||
499 | rcu_sched_qs(); | 515 | rcu_sched_qs(); |
500 | preempt_enable(); | 516 | this_cpu_inc(rcu_dynticks.rcu_qs_ctr); |
501 | } | ||
502 | this_cpu_inc(rcu_qs_ctr); | ||
503 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | 517 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ |
518 | preempt_enable(); | ||
504 | } | 519 | } |
505 | EXPORT_SYMBOL_GPL(rcu_all_qs); | 520 | EXPORT_SYMBOL_GPL(rcu_all_qs); |
506 | 521 | ||
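Annotation: rcu_all_qs() above now double-checks the same gate: a racy, preemption-enabled read of rcu_urgent_qs filters out the common case cheaply, and the flag is re-read with acquire semantics only after preemption is disabled. A rough user-space analog of that check-cheaply-then-recheck shape is sketched below; the names are hypothetical and a mutex merely stands in for preempt_disable(), so this shows the shape of the fast path rather than the kernel's semantics.

    /* Rough analog of the rcu_all_qs() fast path: an unlocked read filters
     * the common case, then the flag is re-checked under exclusion. */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool work_requested;
    static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;

    static void all_qs(void)
    {
            /* Racy fast path: may miss a just-set flag, which is fine because
             * the requester will simply set it again later. */
            if (!atomic_load_explicit(&work_requested, memory_order_relaxed))
                    return;

            pthread_mutex_lock(&ctx_lock);
            /* Re-check now that we hold the exclusion standing in for
             * preempt_disable(). */
            if (atomic_load_explicit(&work_requested, memory_order_acquire)) {
                    atomic_store_explicit(&work_requested, false, memory_order_relaxed);
                    puts("reporting a quiescent state");
            }
            pthread_mutex_unlock(&ctx_lock);
    }

    int main(void)
    {
            all_qs();                              /* fast path: nothing to do */
            atomic_store(&work_requested, true);
            all_qs();                              /* slow path: work reported */
            return 0;
    }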
@@ -713,16 +728,6 @@ void rcutorture_record_progress(unsigned long vernum) | |||
713 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | 728 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); |
714 | 729 | ||
715 | /* | 730 | /* |
716 | * Does the CPU have callbacks ready to be invoked? | ||
717 | */ | ||
718 | static int | ||
719 | cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | ||
720 | { | ||
721 | return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && | ||
722 | rdp->nxttail[RCU_NEXT_TAIL] != NULL; | ||
723 | } | ||
724 | |||
725 | /* | ||
726 | * Return the root node of the specified rcu_state structure. | 731 | * Return the root node of the specified rcu_state structure. |
727 | */ | 732 | */ |
728 | static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | 733 | static struct rcu_node *rcu_get_root(struct rcu_state *rsp) |
@@ -752,21 +757,17 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) | |||
752 | static bool | 757 | static bool |
753 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | 758 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) |
754 | { | 759 | { |
755 | int i; | ||
756 | |||
757 | if (rcu_gp_in_progress(rsp)) | 760 | if (rcu_gp_in_progress(rsp)) |
758 | return false; /* No, a grace period is already in progress. */ | 761 | return false; /* No, a grace period is already in progress. */ |
759 | if (rcu_future_needs_gp(rsp)) | 762 | if (rcu_future_needs_gp(rsp)) |
760 | return true; /* Yes, a no-CBs CPU needs one. */ | 763 | return true; /* Yes, a no-CBs CPU needs one. */ |
761 | if (!rdp->nxttail[RCU_NEXT_TAIL]) | 764 | if (!rcu_segcblist_is_enabled(&rdp->cblist)) |
762 | return false; /* No, this is a no-CBs (or offline) CPU. */ | 765 | return false; /* No, this is a no-CBs (or offline) CPU. */ |
763 | if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) | 766 | if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) |
764 | return true; /* Yes, CPU has newly registered callbacks. */ | 767 | return true; /* Yes, CPU has newly registered callbacks. */ |
765 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) | 768 | if (rcu_segcblist_future_gp_needed(&rdp->cblist, |
766 | if (rdp->nxttail[i - 1] != rdp->nxttail[i] && | 769 | READ_ONCE(rsp->completed))) |
767 | ULONG_CMP_LT(READ_ONCE(rsp->completed), | 770 | return true; /* Yes, CBs for future grace period. */ |
768 | rdp->nxtcompleted[i])) | ||
769 | return true; /* Yes, CBs for future grace period. */ | ||
770 | return false; /* No grace period needed. */ | 771 | return false; /* No grace period needed. */ |
771 | } | 772 | } |
772 | 773 | ||
@@ -1150,6 +1151,24 @@ bool notrace rcu_is_watching(void) | |||
1150 | } | 1151 | } |
1151 | EXPORT_SYMBOL_GPL(rcu_is_watching); | 1152 | EXPORT_SYMBOL_GPL(rcu_is_watching); |
1152 | 1153 | ||
1154 | /* | ||
1155 | * If a holdout task is actually running, request an urgent quiescent | ||
1156 | * state from its CPU. This is unsynchronized, so migrations can cause | ||
1157 | * the request to go to the wrong CPU. Which is OK, all that will happen | ||
1158 | * is that the CPU's next context switch will be a bit slower and next | ||
1159 | * time around this task will generate another request. | ||
1160 | */ | ||
1161 | void rcu_request_urgent_qs_task(struct task_struct *t) | ||
1162 | { | ||
1163 | int cpu; | ||
1164 | |||
1165 | barrier(); | ||
1166 | cpu = task_cpu(t); | ||
1167 | if (!task_curr(t)) | ||
1168 | return; /* This task is not running on that CPU. */ | ||
1169 | smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true); | ||
1170 | } | ||
1171 | |||
1153 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) | 1172 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) |
1154 | 1173 | ||
1155 | /* | 1174 | /* |
@@ -1235,7 +1254,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
1235 | bool *isidle, unsigned long *maxj) | 1254 | bool *isidle, unsigned long *maxj) |
1236 | { | 1255 | { |
1237 | unsigned long jtsq; | 1256 | unsigned long jtsq; |
1238 | int *rcrmp; | 1257 | bool *rnhqp; |
1258 | bool *ruqp; | ||
1239 | unsigned long rjtsc; | 1259 | unsigned long rjtsc; |
1240 | struct rcu_node *rnp; | 1260 | struct rcu_node *rnp; |
1241 | 1261 | ||
@@ -1271,11 +1291,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
1271 | * might not be the case for nohz_full CPUs looping in the kernel. | 1291 | * might not be the case for nohz_full CPUs looping in the kernel. |
1272 | */ | 1292 | */ |
1273 | rnp = rdp->mynode; | 1293 | rnp = rdp->mynode; |
1294 | ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); | ||
1274 | if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && | 1295 | if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && |
1275 | READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && | 1296 | READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && |
1276 | READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { | 1297 | READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { |
1277 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); | 1298 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); |
1278 | return 1; | 1299 | return 1; |
1300 | } else { | ||
1301 | /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ | ||
1302 | smp_store_release(ruqp, true); | ||
1279 | } | 1303 | } |
1280 | 1304 | ||
1281 | /* Check for the CPU being offline. */ | 1305 | /* Check for the CPU being offline. */ |
@@ -1292,7 +1316,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
1292 | * in-kernel CPU-bound tasks cannot advance grace periods. | 1316 | * in-kernel CPU-bound tasks cannot advance grace periods. |
1293 | * So if the grace period is old enough, make the CPU pay attention. | 1317 | * So if the grace period is old enough, make the CPU pay attention. |
1294 | * Note that the unsynchronized assignments to the per-CPU | 1318 | * Note that the unsynchronized assignments to the per-CPU |
1295 | * rcu_sched_qs_mask variable are safe. Yes, setting of | 1319 | * rcu_need_heavy_qs variable are safe. Yes, setting of |
1296 | * bits can be lost, but they will be set again on the next | 1320 | * bits can be lost, but they will be set again on the next |
1297 | * force-quiescent-state pass. So lost bit sets do not result | 1321 | * force-quiescent-state pass. So lost bit sets do not result |
1298 | * in incorrect behavior, merely in a grace period lasting | 1322 | * in incorrect behavior, merely in a grace period lasting |
@@ -1306,16 +1330,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
1306 | * is set too high, we override with half of the RCU CPU stall | 1330 | * is set too high, we override with half of the RCU CPU stall |
1307 | * warning delay. | 1331 | * warning delay. |
1308 | */ | 1332 | */ |
1309 | rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); | 1333 | rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); |
1310 | if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || | 1334 | if (!READ_ONCE(*rnhqp) && |
1311 | time_after(jiffies, rdp->rsp->jiffies_resched)) { | 1335 | (time_after(jiffies, rdp->rsp->gp_start + jtsq) || |
1312 | if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { | 1336 | time_after(jiffies, rdp->rsp->jiffies_resched))) { |
1313 | WRITE_ONCE(rdp->cond_resched_completed, | 1337 | WRITE_ONCE(*rnhqp, true); |
1314 | READ_ONCE(rdp->mynode->completed)); | 1338 | /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ |
1315 | smp_mb(); /* ->cond_resched_completed before *rcrmp. */ | 1339 | smp_store_release(ruqp, true); |
1316 | WRITE_ONCE(*rcrmp, | ||
1317 | READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); | ||
1318 | } | ||
1319 | rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ | 1340 | rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ |
1320 | } | 1341 | } |
1321 | 1342 | ||
@@ -1475,7 +1496,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | |||
1475 | 1496 | ||
1476 | print_cpu_stall_info_end(); | 1497 | print_cpu_stall_info_end(); |
1477 | for_each_possible_cpu(cpu) | 1498 | for_each_possible_cpu(cpu) |
1478 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | 1499 | totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, |
1500 | cpu)->cblist); | ||
1479 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", | 1501 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", |
1480 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | 1502 | smp_processor_id(), (long)(jiffies - rsp->gp_start), |
1481 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1503 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
@@ -1529,7 +1551,8 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
1529 | print_cpu_stall_info(rsp, smp_processor_id()); | 1551 | print_cpu_stall_info(rsp, smp_processor_id()); |
1530 | print_cpu_stall_info_end(); | 1552 | print_cpu_stall_info_end(); |
1531 | for_each_possible_cpu(cpu) | 1553 | for_each_possible_cpu(cpu) |
1532 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | 1554 | totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, |
1555 | cpu)->cblist); | ||
1533 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", | 1556 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", |
1534 | jiffies - rsp->gp_start, | 1557 | jiffies - rsp->gp_start, |
1535 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1558 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
@@ -1632,30 +1655,6 @@ void rcu_cpu_stall_reset(void) | |||
1632 | } | 1655 | } |
1633 | 1656 | ||
1634 | /* | 1657 | /* |
1635 | * Initialize the specified rcu_data structure's default callback list | ||
1636 | * to empty. The default callback list is the one that is not used by | ||
1637 | * no-callbacks CPUs. | ||
1638 | */ | ||
1639 | static void init_default_callback_list(struct rcu_data *rdp) | ||
1640 | { | ||
1641 | int i; | ||
1642 | |||
1643 | rdp->nxtlist = NULL; | ||
1644 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
1645 | rdp->nxttail[i] = &rdp->nxtlist; | ||
1646 | } | ||
1647 | |||
1648 | /* | ||
1649 | * Initialize the specified rcu_data structure's callback list to empty. | ||
1650 | */ | ||
1651 | static void init_callback_list(struct rcu_data *rdp) | ||
1652 | { | ||
1653 | if (init_nocb_callback_list(rdp)) | ||
1654 | return; | ||
1655 | init_default_callback_list(rdp); | ||
1656 | } | ||
1657 | |||
1658 | /* | ||
1659 | * Determine the value that ->completed will have at the end of the | 1658 | * Determine the value that ->completed will have at the end of the |
1660 | * next subsequent grace period. This is used to tag callbacks so that | 1659 | * next subsequent grace period. This is used to tag callbacks so that |
1661 | * a CPU can invoke callbacks in a timely fashion even if that CPU has | 1660 | * a CPU can invoke callbacks in a timely fashion even if that CPU has |
@@ -1709,7 +1708,6 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
1709 | unsigned long *c_out) | 1708 | unsigned long *c_out) |
1710 | { | 1709 | { |
1711 | unsigned long c; | 1710 | unsigned long c; |
1712 | int i; | ||
1713 | bool ret = false; | 1711 | bool ret = false; |
1714 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); | 1712 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); |
1715 | 1713 | ||
@@ -1755,13 +1753,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
1755 | /* | 1753 | /* |
1756 | * Get a new grace-period number. If there really is no grace | 1754 | * Get a new grace-period number. If there really is no grace |
1757 | * period in progress, it will be smaller than the one we obtained | 1755 | * period in progress, it will be smaller than the one we obtained |
1758 | * earlier. Adjust callbacks as needed. Note that even no-CBs | 1756 | * earlier. Adjust callbacks as needed. |
1759 | * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. | ||
1760 | */ | 1757 | */ |
1761 | c = rcu_cbs_completed(rdp->rsp, rnp_root); | 1758 | c = rcu_cbs_completed(rdp->rsp, rnp_root); |
1762 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) | 1759 | if (!rcu_is_nocb_cpu(rdp->cpu)) |
1763 | if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) | 1760 | (void)rcu_segcblist_accelerate(&rdp->cblist, c); |
1764 | rdp->nxtcompleted[i] = c; | ||
1765 | 1761 | ||
1766 | /* | 1762 | /* |
1767 | * If the needed for the required grace period is already | 1763 | * If the needed for the required grace period is already |
@@ -1793,9 +1789,7 @@ out: | |||
1793 | 1789 | ||
1794 | /* | 1790 | /* |
1795 | * Clean up any old requests for the just-ended grace period. Also return | 1791 | * Clean up any old requests for the just-ended grace period. Also return |
1796 | * whether any additional grace periods have been requested. Also invoke | 1792 | * whether any additional grace periods have been requested. |
1797 | * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads | ||
1798 | * waiting for this grace period to complete. | ||
1799 | */ | 1793 | */ |
1800 | static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | 1794 | static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) |
1801 | { | 1795 | { |
@@ -1841,57 +1835,27 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) | |||
1841 | static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1835 | static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
1842 | struct rcu_data *rdp) | 1836 | struct rcu_data *rdp) |
1843 | { | 1837 | { |
1844 | unsigned long c; | 1838 | bool ret = false; |
1845 | int i; | ||
1846 | bool ret; | ||
1847 | |||
1848 | /* If the CPU has no callbacks, nothing to do. */ | ||
1849 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | ||
1850 | return false; | ||
1851 | |||
1852 | /* | ||
1853 | * Starting from the sublist containing the callbacks most | ||
1854 | * recently assigned a ->completed number and working down, find the | ||
1855 | * first sublist that is not assignable to an upcoming grace period. | ||
1856 | * Such a sublist has something in it (first two tests) and has | ||
1857 | * a ->completed number assigned that will complete sooner than | ||
1858 | * the ->completed number for newly arrived callbacks (last test). | ||
1859 | * | ||
1860 | * The key point is that any later sublist can be assigned the | ||
1861 | * same ->completed number as the newly arrived callbacks, which | ||
1862 | * means that the callbacks in any of these later sublist can be | ||
1863 | * grouped into a single sublist, whether or not they have already | ||
1864 | * been assigned a ->completed number. | ||
1865 | */ | ||
1866 | c = rcu_cbs_completed(rsp, rnp); | ||
1867 | for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) | ||
1868 | if (rdp->nxttail[i] != rdp->nxttail[i - 1] && | ||
1869 | !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) | ||
1870 | break; | ||
1871 | 1839 | ||
1872 | /* | 1840 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
1873 | * If there are no sublist for unassigned callbacks, leave. | 1841 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) |
1874 | * At the same time, advance "i" one sublist, so that "i" will | ||
1875 | * index into the sublist where all the remaining callbacks should | ||
1876 | * be grouped into. | ||
1877 | */ | ||
1878 | if (++i >= RCU_NEXT_TAIL) | ||
1879 | return false; | 1842 | return false; |
1880 | 1843 | ||
1881 | /* | 1844 | /* |
1882 | * Assign all subsequent callbacks' ->completed number to the next | 1845 | * Callbacks are often registered with incomplete grace-period |
1883 | * full grace period and group them all in the sublist initially | 1846 | * information. Something about the fact that getting exact |
1884 | * indexed by "i". | 1847 | * information requires acquiring a global lock... RCU therefore |
1848 | * makes a conservative estimate of the grace period number at which | ||
1849 | * a given callback will become ready to invoke. The following | ||
1850 | * code checks this estimate and improves it when possible, thus | ||
1851 | * accelerating callback invocation to an earlier grace-period | ||
1852 | * number. | ||
1885 | */ | 1853 | */ |
1886 | for (; i <= RCU_NEXT_TAIL; i++) { | 1854 | if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) |
1887 | rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; | 1855 | ret = rcu_start_future_gp(rnp, rdp, NULL); |
1888 | rdp->nxtcompleted[i] = c; | ||
1889 | } | ||
1890 | /* Record any needed additional grace periods. */ | ||
1891 | ret = rcu_start_future_gp(rnp, rdp, NULL); | ||
1892 | 1856 | ||
1893 | /* Trace depending on how much we were able to accelerate. */ | 1857 | /* Trace depending on how much we were able to accelerate. */ |
1894 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | 1858 | if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) |
1895 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); | 1859 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); |
1896 | else | 1860 | else |
1897 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); | 1861 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); |
@@ -1911,32 +1875,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1911 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1875 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
1912 | struct rcu_data *rdp) | 1876 | struct rcu_data *rdp) |
1913 | { | 1877 | { |
1914 | int i, j; | 1878 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
1915 | 1879 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) | |
1916 | /* If the CPU has no callbacks, nothing to do. */ | ||
1917 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | ||
1918 | return false; | 1880 | return false; |
1919 | 1881 | ||
1920 | /* | 1882 | /* |
1921 | * Find all callbacks whose ->completed numbers indicate that they | 1883 | * Find all callbacks whose ->completed numbers indicate that they |
1922 | * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. | 1884 | * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. |
1923 | */ | 1885 | */ |
1924 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { | 1886 | rcu_segcblist_advance(&rdp->cblist, rnp->completed); |
1925 | if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) | ||
1926 | break; | ||
1927 | rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; | ||
1928 | } | ||
1929 | /* Clean up any sublist tail pointers that were misordered above. */ | ||
1930 | for (j = RCU_WAIT_TAIL; j < i; j++) | ||
1931 | rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; | ||
1932 | |||
1933 | /* Copy down callbacks to fill in empty sublists. */ | ||
1934 | for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { | ||
1935 | if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) | ||
1936 | break; | ||
1937 | rdp->nxttail[j] = rdp->nxttail[i]; | ||
1938 | rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; | ||
1939 | } | ||
1940 | 1887 | ||
1941 | /* Classify any remaining callbacks. */ | 1888 | /* Classify any remaining callbacks. */ |
1942 | return rcu_accelerate_cbs(rsp, rnp, rdp); | 1889 | return rcu_accelerate_cbs(rsp, rnp, rdp); |
@@ -1981,7 +1928,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1981 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); | 1928 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
1982 | need_gp = !!(rnp->qsmask & rdp->grpmask); | 1929 | need_gp = !!(rnp->qsmask & rdp->grpmask); |
1983 | rdp->cpu_no_qs.b.norm = need_gp; | 1930 | rdp->cpu_no_qs.b.norm = need_gp; |
1984 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 1931 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); |
1985 | rdp->core_needs_qs = need_gp; | 1932 | rdp->core_needs_qs = need_gp; |
1986 | zero_cpu_stall_ticks(rdp); | 1933 | zero_cpu_stall_ticks(rdp); |
1987 | WRITE_ONCE(rdp->gpwrap, false); | 1934 | WRITE_ONCE(rdp->gpwrap, false); |
@@ -2579,7 +2526,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
2579 | * within the current grace period. | 2526 | * within the current grace period. |
2580 | */ | 2527 | */ |
2581 | rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ | 2528 | rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ |
2582 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 2529 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); |
2583 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 2530 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
2584 | return; | 2531 | return; |
2585 | } | 2532 | } |
@@ -2653,13 +2600,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
2653 | * because _rcu_barrier() excludes CPU-hotplug operations, so it | 2600 | * because _rcu_barrier() excludes CPU-hotplug operations, so it |
2654 | * cannot be running now. Thus no memory barrier is required. | 2601 | * cannot be running now. Thus no memory barrier is required. |
2655 | */ | 2602 | */ |
2656 | if (rdp->nxtlist != NULL) { | 2603 | rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); |
2657 | rsp->qlen_lazy += rdp->qlen_lazy; | 2604 | rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); |
2658 | rsp->qlen += rdp->qlen; | ||
2659 | rdp->n_cbs_orphaned += rdp->qlen; | ||
2660 | rdp->qlen_lazy = 0; | ||
2661 | WRITE_ONCE(rdp->qlen, 0); | ||
2662 | } | ||
2663 | 2605 | ||
2664 | /* | 2606 | /* |
2665 | * Next, move those callbacks still needing a grace period to | 2607 | * Next, move those callbacks still needing a grace period to |
@@ -2667,31 +2609,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
2667 | * Some of the callbacks might have gone partway through a grace | 2609 | * Some of the callbacks might have gone partway through a grace |
2668 | * period, but that is too bad. They get to start over because we | 2610 | * period, but that is too bad. They get to start over because we |
2669 | * cannot assume that grace periods are synchronized across CPUs. | 2611 | * cannot assume that grace periods are synchronized across CPUs. |
2670 | * We don't bother updating the ->nxttail[] array yet, instead | ||
2671 | * we just reset the whole thing later on. | ||
2672 | */ | 2612 | */ |
2673 | if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { | 2613 | rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); |
2674 | *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; | ||
2675 | rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; | ||
2676 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | ||
2677 | } | ||
2678 | 2614 | ||
2679 | /* | 2615 | /* |
2680 | * Then move the ready-to-invoke callbacks to the orphanage, | 2616 | * Then move the ready-to-invoke callbacks to the orphanage, |
2681 | * where some other CPU will pick them up. These will not be | 2617 | * where some other CPU will pick them up. These will not be |
2682 | * required to pass though another grace period: They are done. | 2618 | * required to pass though another grace period: They are done. |
2683 | */ | 2619 | */ |
2684 | if (rdp->nxtlist != NULL) { | 2620 | rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); |
2685 | *rsp->orphan_donetail = rdp->nxtlist; | ||
2686 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; | ||
2687 | } | ||
2688 | 2621 | ||
2689 | /* | 2622 | /* Finally, disallow further callbacks on this CPU. */ |
2690 | * Finally, initialize the rcu_data structure's list to empty and | 2623 | rcu_segcblist_disable(&rdp->cblist); |
2691 | * disallow further callbacks on this CPU. | ||
2692 | */ | ||
2693 | init_callback_list(rdp); | ||
2694 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
2695 | } | 2624 | } |
2696 | 2625 | ||
2697 | /* | 2626 | /* |
@@ -2700,7 +2629,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
2700 | */ | 2629 | */ |
2701 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | 2630 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) |
2702 | { | 2631 | { |
2703 | int i; | ||
2704 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); | 2632 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
2705 | 2633 | ||
2706 | /* No-CBs CPUs are handled specially. */ | 2634 | /* No-CBs CPUs are handled specially. */ |
@@ -2709,13 +2637,11 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | |||
2709 | return; | 2637 | return; |
2710 | 2638 | ||
2711 | /* Do the accounting first. */ | 2639 | /* Do the accounting first. */ |
2712 | rdp->qlen_lazy += rsp->qlen_lazy; | 2640 | rdp->n_cbs_adopted += rcu_cblist_n_cbs(&rsp->orphan_done); |
2713 | rdp->qlen += rsp->qlen; | 2641 | if (rcu_cblist_n_lazy_cbs(&rsp->orphan_done) != |
2714 | rdp->n_cbs_adopted += rsp->qlen; | 2642 | rcu_cblist_n_cbs(&rsp->orphan_done)) |
2715 | if (rsp->qlen_lazy != rsp->qlen) | ||
2716 | rcu_idle_count_callbacks_posted(); | 2643 | rcu_idle_count_callbacks_posted(); |
2717 | rsp->qlen_lazy = 0; | 2644 | rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); |
2718 | rsp->qlen = 0; | ||
2719 | 2645 | ||
2720 | /* | 2646 | /* |
2721 | * We do not need a memory barrier here because the only way we | 2647 | * We do not need a memory barrier here because the only way we |
@@ -2723,24 +2649,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | |||
2723 | * we are the task doing the rcu_barrier(). | 2649 | * we are the task doing the rcu_barrier(). |
2724 | */ | 2650 | */ |
2725 | 2651 | ||
2726 | /* First adopt the ready-to-invoke callbacks. */ | 2652 | /* First adopt the ready-to-invoke callbacks, then the done ones. */ |
2727 | if (rsp->orphan_donelist != NULL) { | 2653 | rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); |
2728 | *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; | 2654 | WARN_ON_ONCE(!rcu_cblist_empty(&rsp->orphan_done)); |
2729 | *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; | 2655 | rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); |
2730 | for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) | 2656 | WARN_ON_ONCE(!rcu_cblist_empty(&rsp->orphan_pend)); |
2731 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | 2657 | WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != |
2732 | rdp->nxttail[i] = rsp->orphan_donetail; | 2658 | !rcu_segcblist_n_cbs(&rdp->cblist)); |
2733 | rsp->orphan_donelist = NULL; | ||
2734 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
2735 | } | ||
2736 | |||
2737 | /* And then adopt the callbacks that still need a grace period. */ | ||
2738 | if (rsp->orphan_nxtlist != NULL) { | ||
2739 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; | ||
2740 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; | ||
2741 | rsp->orphan_nxtlist = NULL; | ||
2742 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
2743 | } | ||
2744 | } | 2659 | } |
2745 | 2660 | ||
2746 | /* | 2661 | /* |
@@ -2748,14 +2663,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | |||
2748 | */ | 2663 | */ |
2749 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 2664 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
2750 | { | 2665 | { |
2751 | RCU_TRACE(unsigned long mask); | 2666 | RCU_TRACE(unsigned long mask;) |
2752 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); | 2667 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) |
2753 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); | 2668 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) |
2754 | 2669 | ||
2755 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) | 2670 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) |
2756 | return; | 2671 | return; |
2757 | 2672 | ||
2758 | RCU_TRACE(mask = rdp->grpmask); | 2673 | RCU_TRACE(mask = rdp->grpmask;) |
2759 | trace_rcu_grace_period(rsp->name, | 2674 | trace_rcu_grace_period(rsp->name, |
2760 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 2675 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
2761 | TPS("cpuofl")); | 2676 | TPS("cpuofl")); |
@@ -2828,9 +2743,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
2828 | rcu_adopt_orphan_cbs(rsp, flags); | 2743 | rcu_adopt_orphan_cbs(rsp, flags); |
2829 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); | 2744 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); |
2830 | 2745 | ||
2831 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | 2746 | WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || |
2832 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | 2747 | !rcu_segcblist_empty(&rdp->cblist), |
2833 | cpu, rdp->qlen, rdp->nxtlist); | 2748 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", |
2749 | cpu, rcu_segcblist_n_cbs(&rdp->cblist), | ||
2750 | rcu_segcblist_first_cb(&rdp->cblist)); | ||
2834 | } | 2751 | } |
2835 | 2752 | ||
2836 | /* | 2753 | /* |
@@ -2840,14 +2757,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
2840 | static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | 2757 | static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) |
2841 | { | 2758 | { |
2842 | unsigned long flags; | 2759 | unsigned long flags; |
2843 | struct rcu_head *next, *list, **tail; | 2760 | struct rcu_head *rhp; |
2844 | long bl, count, count_lazy; | 2761 | struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); |
2845 | int i; | 2762 | long bl, count; |
2846 | 2763 | ||
2847 | /* If no callbacks are ready, just return. */ | 2764 | /* If no callbacks are ready, just return. */ |
2848 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | 2765 | if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { |
2849 | trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); | 2766 | trace_rcu_batch_start(rsp->name, |
2850 | trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), | 2767 | rcu_segcblist_n_lazy_cbs(&rdp->cblist), |
2768 | rcu_segcblist_n_cbs(&rdp->cblist), 0); | ||
2769 | trace_rcu_batch_end(rsp->name, 0, | ||
2770 | !rcu_segcblist_empty(&rdp->cblist), | ||
2851 | need_resched(), is_idle_task(current), | 2771 | need_resched(), is_idle_task(current), |
2852 | rcu_is_callbacks_kthread()); | 2772 | rcu_is_callbacks_kthread()); |
2853 | return; | 2773 | return; |
@@ -2855,73 +2775,62 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2855 | 2775 | ||
2856 | /* | 2776 | /* |
2857 | * Extract the list of ready callbacks, disabling to prevent | 2777 | * Extract the list of ready callbacks, disabling to prevent |
2858 | * races with call_rcu() from interrupt handlers. | 2778 | * races with call_rcu() from interrupt handlers. Leave the |
2779 | * callback counts, as rcu_barrier() needs to be conservative. | ||
2859 | */ | 2780 | */ |
2860 | local_irq_save(flags); | 2781 | local_irq_save(flags); |
2861 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | 2782 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); |
2862 | bl = rdp->blimit; | 2783 | bl = rdp->blimit; |
2863 | trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); | 2784 | trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), |
2864 | list = rdp->nxtlist; | 2785 | rcu_segcblist_n_cbs(&rdp->cblist), bl); |
2865 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | 2786 | rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); |
2866 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | ||
2867 | tail = rdp->nxttail[RCU_DONE_TAIL]; | ||
2868 | for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) | ||
2869 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | ||
2870 | rdp->nxttail[i] = &rdp->nxtlist; | ||
2871 | local_irq_restore(flags); | 2787 | local_irq_restore(flags); |
2872 | 2788 | ||
2873 | /* Invoke callbacks. */ | 2789 | /* Invoke callbacks. */ |
2874 | count = count_lazy = 0; | 2790 | rhp = rcu_cblist_dequeue(&rcl); |
2875 | while (list) { | 2791 | for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { |
2876 | next = list->next; | 2792 | debug_rcu_head_unqueue(rhp); |
2877 | prefetch(next); | 2793 | if (__rcu_reclaim(rsp->name, rhp)) |
2878 | debug_rcu_head_unqueue(list); | 2794 | rcu_cblist_dequeued_lazy(&rcl); |
2879 | if (__rcu_reclaim(rsp->name, list)) | 2795 | /* |
2880 | count_lazy++; | 2796 | * Stop only if limit reached and CPU has something to do. |
2881 | list = next; | 2797 | * Note: The rcl structure counts down from zero. |
2882 | /* Stop only if limit reached and CPU has something to do. */ | 2798 | */ |
2883 | if (++count >= bl && | 2799 | if (-rcu_cblist_n_cbs(&rcl) >= bl && |
2884 | (need_resched() || | 2800 | (need_resched() || |
2885 | (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) | 2801 | (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) |
2886 | break; | 2802 | break; |
2887 | } | 2803 | } |
2888 | 2804 | ||
2889 | local_irq_save(flags); | 2805 | local_irq_save(flags); |
2890 | trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), | 2806 | count = -rcu_cblist_n_cbs(&rcl); |
2891 | is_idle_task(current), | 2807 | trace_rcu_batch_end(rsp->name, count, !rcu_cblist_empty(&rcl), |
2808 | need_resched(), is_idle_task(current), | ||
2892 | rcu_is_callbacks_kthread()); | 2809 | rcu_is_callbacks_kthread()); |
2893 | 2810 | ||
2894 | /* Update count, and requeue any remaining callbacks. */ | 2811 | /* Update counts and requeue any remaining callbacks. */ |
2895 | if (list != NULL) { | 2812 | rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); |
2896 | *tail = rdp->nxtlist; | ||
2897 | rdp->nxtlist = list; | ||
2898 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
2899 | if (&rdp->nxtlist == rdp->nxttail[i]) | ||
2900 | rdp->nxttail[i] = tail; | ||
2901 | else | ||
2902 | break; | ||
2903 | } | ||
2904 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | 2813 | smp_mb(); /* List handling before counting for rcu_barrier(). */ |
2905 | rdp->qlen_lazy -= count_lazy; | ||
2906 | WRITE_ONCE(rdp->qlen, rdp->qlen - count); | ||
2907 | rdp->n_cbs_invoked += count; | 2814 | rdp->n_cbs_invoked += count; |
2815 | rcu_segcblist_insert_count(&rdp->cblist, &rcl); | ||
2908 | 2816 | ||
2909 | /* Reinstate batch limit if we have worked down the excess. */ | 2817 | /* Reinstate batch limit if we have worked down the excess. */ |
2910 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) | 2818 | count = rcu_segcblist_n_cbs(&rdp->cblist); |
2819 | if (rdp->blimit == LONG_MAX && count <= qlowmark) | ||
2911 | rdp->blimit = blimit; | 2820 | rdp->blimit = blimit; |
2912 | 2821 | ||
2913 | /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ | 2822 | /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ |
2914 | if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { | 2823 | if (count == 0 && rdp->qlen_last_fqs_check != 0) { |
2915 | rdp->qlen_last_fqs_check = 0; | 2824 | rdp->qlen_last_fqs_check = 0; |
2916 | rdp->n_force_qs_snap = rsp->n_force_qs; | 2825 | rdp->n_force_qs_snap = rsp->n_force_qs; |
2917 | } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) | 2826 | } else if (count < rdp->qlen_last_fqs_check - qhimark) |
2918 | rdp->qlen_last_fqs_check = rdp->qlen; | 2827 | rdp->qlen_last_fqs_check = count; |
2919 | WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); | 2828 | WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); |
2920 | 2829 | ||
2921 | local_irq_restore(flags); | 2830 | local_irq_restore(flags); |
2922 | 2831 | ||
2923 | /* Re-invoke RCU core processing if there are callbacks remaining. */ | 2832 | /* Re-invoke RCU core processing if there are callbacks remaining. */ |
2924 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 2833 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) |
2925 | invoke_rcu_core(); | 2834 | invoke_rcu_core(); |
2926 | } | 2835 | } |
2927 | 2836 | ||
@@ -3087,7 +2996,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
3087 | bool needwake; | 2996 | bool needwake; |
3088 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); | 2997 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
3089 | 2998 | ||
3090 | WARN_ON_ONCE(rdp->beenonline == 0); | 2999 | WARN_ON_ONCE(!rdp->beenonline); |
3091 | 3000 | ||
3092 | /* Update RCU state based on any recent quiescent states. */ | 3001 | /* Update RCU state based on any recent quiescent states. */ |
3093 | rcu_check_quiescent_state(rsp, rdp); | 3002 | rcu_check_quiescent_state(rsp, rdp); |
@@ -3105,7 +3014,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
3105 | } | 3014 | } |
3106 | 3015 | ||
3107 | /* If there are callbacks ready, invoke them. */ | 3016 | /* If there are callbacks ready, invoke them. */ |
3108 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 3017 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) |
3109 | invoke_rcu_callbacks(rsp, rdp); | 3018 | invoke_rcu_callbacks(rsp, rdp); |
3110 | 3019 | ||
3111 | /* Do any needed deferred wakeups of rcuo kthreads. */ | 3020 | /* Do any needed deferred wakeups of rcuo kthreads. */ |
@@ -3177,7 +3086,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
3177 | * invoking force_quiescent_state() if the newly enqueued callback | 3086 | * invoking force_quiescent_state() if the newly enqueued callback |
3178 | * is the only one waiting for a grace period to complete. | 3087 | * is the only one waiting for a grace period to complete. |
3179 | */ | 3088 | */ |
3180 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | 3089 | if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > |
3090 | rdp->qlen_last_fqs_check + qhimark)) { | ||
3181 | 3091 | ||
3182 | /* Are we ignoring a completed grace period? */ | 3092 | /* Are we ignoring a completed grace period? */ |
3183 | note_gp_changes(rsp, rdp); | 3093 | note_gp_changes(rsp, rdp); |
@@ -3195,10 +3105,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
3195 | /* Give the grace period a kick. */ | 3105 | /* Give the grace period a kick. */ |
3196 | rdp->blimit = LONG_MAX; | 3106 | rdp->blimit = LONG_MAX; |
3197 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | 3107 | if (rsp->n_force_qs == rdp->n_force_qs_snap && |
3198 | *rdp->nxttail[RCU_DONE_TAIL] != head) | 3108 | rcu_segcblist_first_pend_cb(&rdp->cblist) != head) |
3199 | force_quiescent_state(rsp); | 3109 | force_quiescent_state(rsp); |
3200 | rdp->n_force_qs_snap = rsp->n_force_qs; | 3110 | rdp->n_force_qs_snap = rsp->n_force_qs; |
3201 | rdp->qlen_last_fqs_check = rdp->qlen; | 3111 | rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); |
3202 | } | 3112 | } |
3203 | } | 3113 | } |
3204 | } | 3114 | } |
@@ -3238,7 +3148,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, | |||
3238 | rdp = this_cpu_ptr(rsp->rda); | 3148 | rdp = this_cpu_ptr(rsp->rda); |
3239 | 3149 | ||
3240 | /* Add the callback to our list. */ | 3150 | /* Add the callback to our list. */ |
3241 | if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { | 3151 | if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { |
3242 | int offline; | 3152 | int offline; |
3243 | 3153 | ||
3244 | if (cpu != -1) | 3154 | if (cpu != -1) |
@@ -3257,23 +3167,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, | |||
3257 | */ | 3167 | */ |
3258 | BUG_ON(cpu != -1); | 3168 | BUG_ON(cpu != -1); |
3259 | WARN_ON_ONCE(!rcu_is_watching()); | 3169 | WARN_ON_ONCE(!rcu_is_watching()); |
3260 | if (!likely(rdp->nxtlist)) | 3170 | if (rcu_segcblist_empty(&rdp->cblist)) |
3261 | init_default_callback_list(rdp); | 3171 | rcu_segcblist_init(&rdp->cblist); |
3262 | } | 3172 | } |
3263 | WRITE_ONCE(rdp->qlen, rdp->qlen + 1); | 3173 | rcu_segcblist_enqueue(&rdp->cblist, head, lazy); |
3264 | if (lazy) | 3174 | if (!lazy) |
3265 | rdp->qlen_lazy++; | ||
3266 | else | ||
3267 | rcu_idle_count_callbacks_posted(); | 3175 | rcu_idle_count_callbacks_posted(); |
3268 | smp_mb(); /* Count before adding callback for rcu_barrier(). */ | ||
3269 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
3270 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
3271 | 3176 | ||
3272 | if (__is_kfree_rcu_offset((unsigned long)func)) | 3177 | if (__is_kfree_rcu_offset((unsigned long)func)) |
3273 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | 3178 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, |
3274 | rdp->qlen_lazy, rdp->qlen); | 3179 | rcu_segcblist_n_lazy_cbs(&rdp->cblist), |
3180 | rcu_segcblist_n_cbs(&rdp->cblist)); | ||
3275 | else | 3181 | else |
3276 | trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); | 3182 | trace_rcu_callback(rsp->name, head, |
3183 | rcu_segcblist_n_lazy_cbs(&rdp->cblist), | ||
3184 | rcu_segcblist_n_cbs(&rdp->cblist)); | ||
3277 | 3185 | ||
3278 | /* Go handle any RCU core processing required. */ | 3186 | /* Go handle any RCU core processing required. */ |
3279 | __call_rcu_core(rsp, rdp, head, flags); | 3187 | __call_rcu_core(rsp, rdp, head, flags); |
@@ -3519,41 +3427,6 @@ void cond_synchronize_sched(unsigned long oldstate) | |||
3519 | } | 3427 | } |
3520 | EXPORT_SYMBOL_GPL(cond_synchronize_sched); | 3428 | EXPORT_SYMBOL_GPL(cond_synchronize_sched); |
3521 | 3429 | ||
3522 | /* Adjust sequence number for start of update-side operation. */ | ||
3523 | static void rcu_seq_start(unsigned long *sp) | ||
3524 | { | ||
3525 | WRITE_ONCE(*sp, *sp + 1); | ||
3526 | smp_mb(); /* Ensure update-side operation after counter increment. */ | ||
3527 | WARN_ON_ONCE(!(*sp & 0x1)); | ||
3528 | } | ||
3529 | |||
3530 | /* Adjust sequence number for end of update-side operation. */ | ||
3531 | static void rcu_seq_end(unsigned long *sp) | ||
3532 | { | ||
3533 | smp_mb(); /* Ensure update-side operation before counter increment. */ | ||
3534 | WRITE_ONCE(*sp, *sp + 1); | ||
3535 | WARN_ON_ONCE(*sp & 0x1); | ||
3536 | } | ||
3537 | |||
3538 | /* Take a snapshot of the update side's sequence number. */ | ||
3539 | static unsigned long rcu_seq_snap(unsigned long *sp) | ||
3540 | { | ||
3541 | unsigned long s; | ||
3542 | |||
3543 | s = (READ_ONCE(*sp) + 3) & ~0x1; | ||
3544 | smp_mb(); /* Above access must not bleed into critical section. */ | ||
3545 | return s; | ||
3546 | } | ||
3547 | |||
3548 | /* | ||
3549 | * Given a snapshot from rcu_seq_snap(), determine whether or not a | ||
3550 | * full update-side operation has occurred. | ||
3551 | */ | ||
3552 | static bool rcu_seq_done(unsigned long *sp, unsigned long s) | ||
3553 | { | ||
3554 | return ULONG_CMP_GE(READ_ONCE(*sp), s); | ||
3555 | } | ||
3556 | |||
3557 | /* | 3430 | /* |
3558 | * Check to see if there is any immediate RCU-related work to be done | 3431 | * Check to see if there is any immediate RCU-related work to be done |
3559 | * by the current CPU, for the specified type of RCU, returning 1 if so. | 3432 | * by the current CPU, for the specified type of RCU, returning 1 if so. |
@@ -3577,7 +3450,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
3577 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 3450 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
3578 | if (rcu_scheduler_fully_active && | 3451 | if (rcu_scheduler_fully_active && |
3579 | rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && | 3452 | rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && |
3580 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { | 3453 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { |
3581 | rdp->n_rp_core_needs_qs++; | 3454 | rdp->n_rp_core_needs_qs++; |
3582 | } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { | 3455 | } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { |
3583 | rdp->n_rp_report_qs++; | 3456 | rdp->n_rp_report_qs++; |
@@ -3585,7 +3458,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
3585 | } | 3458 | } |
3586 | 3459 | ||
3587 | /* Does this CPU have callbacks ready to invoke? */ | 3460 | /* Does this CPU have callbacks ready to invoke? */ |
3588 | if (cpu_has_callbacks_ready_to_invoke(rdp)) { | 3461 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) { |
3589 | rdp->n_rp_cb_ready++; | 3462 | rdp->n_rp_cb_ready++; |
3590 | return 1; | 3463 | return 1; |
3591 | } | 3464 | } |
@@ -3649,10 +3522,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) | |||
3649 | 3522 | ||
3650 | for_each_rcu_flavor(rsp) { | 3523 | for_each_rcu_flavor(rsp) { |
3651 | rdp = this_cpu_ptr(rsp->rda); | 3524 | rdp = this_cpu_ptr(rsp->rda); |
3652 | if (!rdp->nxtlist) | 3525 | if (rcu_segcblist_empty(&rdp->cblist)) |
3653 | continue; | 3526 | continue; |
3654 | hc = true; | 3527 | hc = true; |
3655 | if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { | 3528 | if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) { |
3656 | al = false; | 3529 | al = false; |
3657 | break; | 3530 | break; |
3658 | } | 3531 | } |
@@ -3761,7 +3634,7 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
3761 | __call_rcu(&rdp->barrier_head, | 3634 | __call_rcu(&rdp->barrier_head, |
3762 | rcu_barrier_callback, rsp, cpu, 0); | 3635 | rcu_barrier_callback, rsp, cpu, 0); |
3763 | } | 3636 | } |
3764 | } else if (READ_ONCE(rdp->qlen)) { | 3637 | } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { |
3765 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, | 3638 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, |
3766 | rsp->barrier_sequence); | 3639 | rsp->barrier_sequence); |
3767 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); | 3640 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); |
@@ -3870,8 +3743,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3870 | rdp->qlen_last_fqs_check = 0; | 3743 | rdp->qlen_last_fqs_check = 0; |
3871 | rdp->n_force_qs_snap = rsp->n_force_qs; | 3744 | rdp->n_force_qs_snap = rsp->n_force_qs; |
3872 | rdp->blimit = blimit; | 3745 | rdp->blimit = blimit; |
3873 | if (!rdp->nxtlist) | 3746 | if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ |
3874 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | 3747 | !init_nocb_callback_list(rdp)) |
3748 | rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ | ||
3875 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 3749 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
3876 | rcu_sysidle_init_percpu_data(rdp->dynticks); | 3750 | rcu_sysidle_init_percpu_data(rdp->dynticks); |
3877 | rcu_dynticks_eqs_online(); | 3751 | rcu_dynticks_eqs_online(); |
@@ -3890,12 +3764,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3890 | rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ | 3764 | rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ |
3891 | rdp->completed = rnp->completed; | 3765 | rdp->completed = rnp->completed; |
3892 | rdp->cpu_no_qs.b.norm = true; | 3766 | rdp->cpu_no_qs.b.norm = true; |
3893 | rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); | 3767 | rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); |
3894 | rdp->core_needs_qs = false; | 3768 | rdp->core_needs_qs = false; |
3895 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); | 3769 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); |
3896 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 3770 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
3897 | } | 3771 | } |
3898 | 3772 | ||
3773 | /* | ||
3774 | * Invoked early in the CPU-online process, when pretty much all | ||
3775 | * services are available. The incoming CPU is not present. | ||
3776 | */ | ||
3899 | int rcutree_prepare_cpu(unsigned int cpu) | 3777 | int rcutree_prepare_cpu(unsigned int cpu) |
3900 | { | 3778 | { |
3901 | struct rcu_state *rsp; | 3779 | struct rcu_state *rsp; |
@@ -3909,6 +3787,9 @@ int rcutree_prepare_cpu(unsigned int cpu) | |||
3909 | return 0; | 3787 | return 0; |
3910 | } | 3788 | } |
3911 | 3789 | ||
3790 | /* | ||
3791 | * Update RCU priority boot kthread affinity for CPU-hotplug changes. | ||
3792 | */ | ||
3912 | static void rcutree_affinity_setting(unsigned int cpu, int outgoing) | 3793 | static void rcutree_affinity_setting(unsigned int cpu, int outgoing) |
3913 | { | 3794 | { |
3914 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | 3795 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); |
@@ -3916,20 +3797,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) | |||
3916 | rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); | 3797 | rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); |
3917 | } | 3798 | } |
3918 | 3799 | ||
3800 | /* | ||
3801 | * Near the end of the CPU-online process. Pretty much all services | ||
3802 | * enabled, and the CPU is now very much alive. | ||
3803 | */ | ||
3919 | int rcutree_online_cpu(unsigned int cpu) | 3804 | int rcutree_online_cpu(unsigned int cpu) |
3920 | { | 3805 | { |
3921 | sync_sched_exp_online_cleanup(cpu); | 3806 | sync_sched_exp_online_cleanup(cpu); |
3922 | rcutree_affinity_setting(cpu, -1); | 3807 | rcutree_affinity_setting(cpu, -1); |
3808 | if (IS_ENABLED(CONFIG_TREE_SRCU)) | ||
3809 | srcu_online_cpu(cpu); | ||
3923 | return 0; | 3810 | return 0; |
3924 | } | 3811 | } |
3925 | 3812 | ||
3813 | /* | ||
3814 | * Near the beginning of the process. The CPU is still very much alive | ||
3815 | * with pretty much all services enabled. | ||
3816 | */ | ||
3926 | int rcutree_offline_cpu(unsigned int cpu) | 3817 | int rcutree_offline_cpu(unsigned int cpu) |
3927 | { | 3818 | { |
3928 | rcutree_affinity_setting(cpu, cpu); | 3819 | rcutree_affinity_setting(cpu, cpu); |
3820 | if (IS_ENABLED(CONFIG_TREE_SRCU)) | ||
3821 | srcu_offline_cpu(cpu); | ||
3929 | return 0; | 3822 | return 0; |
3930 | } | 3823 | } |
3931 | 3824 | ||
3932 | 3825 | /* | |
3826 | * Near the end of the offline process. We do only tracing here. | ||
3827 | */ | ||
3933 | int rcutree_dying_cpu(unsigned int cpu) | 3828 | int rcutree_dying_cpu(unsigned int cpu) |
3934 | { | 3829 | { |
3935 | struct rcu_state *rsp; | 3830 | struct rcu_state *rsp; |
@@ -3939,6 +3834,9 @@ int rcutree_dying_cpu(unsigned int cpu) | |||
3939 | return 0; | 3834 | return 0; |
3940 | } | 3835 | } |
3941 | 3836 | ||
3837 | /* | ||
3838 | * The outgoing CPU is gone and we are running elsewhere. | ||
3839 | */ | ||
3942 | int rcutree_dead_cpu(unsigned int cpu) | 3840 | int rcutree_dead_cpu(unsigned int cpu) |
3943 | { | 3841 | { |
3944 | struct rcu_state *rsp; | 3842 | struct rcu_state *rsp; |
@@ -3956,6 +3854,10 @@ int rcutree_dead_cpu(unsigned int cpu) | |||
3956 | * incoming CPUs are not allowed to use RCU read-side critical sections | 3854 | * incoming CPUs are not allowed to use RCU read-side critical sections |
3957 | * until this function is called. Failing to observe this restriction | 3855 | * until this function is called. Failing to observe this restriction |
3958 | * will result in lockdep splats. | 3856 | * will result in lockdep splats. |
3857 | * | ||
3858 | * Note that this function is special in that it is invoked directly | ||
3859 | * from the incoming CPU rather than from the cpuhp_step mechanism. | ||
3860 | * This is because this function must be invoked at a precise location. | ||
3959 | */ | 3861 | */ |
3960 | void rcu_cpu_starting(unsigned int cpu) | 3862 | void rcu_cpu_starting(unsigned int cpu) |
3961 | { | 3863 | { |
@@ -3981,9 +3883,6 @@ void rcu_cpu_starting(unsigned int cpu) | |||
3981 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() | 3883 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() |
3982 | * function. We now remove it from the rcu_node tree's ->qsmaskinit | 3884 | * function. We now remove it from the rcu_node tree's ->qsmaskinit |
3983 | * bit masks. | 3885 | * bit masks. |
3984 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() | ||
3985 | * function. We now remove it from the rcu_node tree's ->qsmaskinit | ||
3986 | * bit masks. | ||
3987 | */ | 3886 | */ |
3988 | static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | 3887 | static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) |
3989 | { | 3888 | { |
@@ -3999,6 +3898,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | |||
3999 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 3898 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
4000 | } | 3899 | } |
4001 | 3900 | ||
3901 | /* | ||
3902 | * The outgoing function has no further need of RCU, so remove it from | ||
3903 | * the list of CPUs that RCU must track. | ||
3904 | * | ||
3905 | * Note that this function is special in that it is invoked directly | ||
3906 | * from the outgoing CPU rather than from the cpuhp_step mechanism. | ||
3907 | * This is because this function must be invoked at a precise location. | ||
3908 | */ | ||
4002 | void rcu_report_dead(unsigned int cpu) | 3909 | void rcu_report_dead(unsigned int cpu) |
4003 | { | 3910 | { |
4004 | struct rcu_state *rsp; | 3911 | struct rcu_state *rsp; |
@@ -4013,6 +3920,10 @@ void rcu_report_dead(unsigned int cpu) | |||
4013 | } | 3920 | } |
4014 | #endif | 3921 | #endif |
4015 | 3922 | ||
3923 | /* | ||
3924 | * On non-huge systems, use expedited RCU grace periods to make suspend | ||
3925 | * and hibernation run faster. | ||
3926 | */ | ||
4016 | static int rcu_pm_notify(struct notifier_block *self, | 3927 | static int rcu_pm_notify(struct notifier_block *self, |
4017 | unsigned long action, void *hcpu) | 3928 | unsigned long action, void *hcpu) |
4018 | { | 3929 | { |
@@ -4083,7 +3994,7 @@ early_initcall(rcu_spawn_gp_kthread); | |||
4083 | * task is booting the system, and such primitives are no-ops). After this | 3994 | * task is booting the system, and such primitives are no-ops). After this |
4084 | * function is called, any synchronous grace-period primitives are run as | 3995 | * function is called, any synchronous grace-period primitives are run as |
4085 | * expedited, with the requesting task driving the grace period forward. | 3996 | * expedited, with the requesting task driving the grace period forward. |
4086 | * A later core_initcall() rcu_exp_runtime_mode() will switch to full | 3997 | * A later core_initcall() rcu_set_runtime_mode() will switch to full |
4087 | * runtime RCU functionality. | 3998 | * runtime RCU functionality. |
4088 | */ | 3999 | */ |
4089 | void rcu_scheduler_starting(void) | 4000 | void rcu_scheduler_starting(void) |
@@ -4096,31 +4007,6 @@ void rcu_scheduler_starting(void) | |||
4096 | } | 4007 | } |
4097 | 4008 | ||
4098 | /* | 4009 | /* |
4099 | * Compute the per-level fanout, either using the exact fanout specified | ||
4100 | * or balancing the tree, depending on the rcu_fanout_exact boot parameter. | ||
4101 | */ | ||
4102 | static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) | ||
4103 | { | ||
4104 | int i; | ||
4105 | |||
4106 | if (rcu_fanout_exact) { | ||
4107 | levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; | ||
4108 | for (i = rcu_num_lvls - 2; i >= 0; i--) | ||
4109 | levelspread[i] = RCU_FANOUT; | ||
4110 | } else { | ||
4111 | int ccur; | ||
4112 | int cprv; | ||
4113 | |||
4114 | cprv = nr_cpu_ids; | ||
4115 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | ||
4116 | ccur = levelcnt[i]; | ||
4117 | levelspread[i] = (cprv + ccur - 1) / ccur; | ||
4118 | cprv = ccur; | ||
4119 | } | ||
4120 | } | ||
4121 | } | ||
4122 | |||
4123 | /* | ||
4124 | * Helper function for rcu_init() that initializes one rcu_state structure. | 4010 | * Helper function for rcu_init() that initializes one rcu_state structure. |
4125 | */ | 4011 | */ |
4126 | static void __init rcu_init_one(struct rcu_state *rsp) | 4012 | static void __init rcu_init_one(struct rcu_state *rsp) |
@@ -4129,9 +4015,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
4129 | static const char * const fqs[] = RCU_FQS_NAME_INIT; | 4015 | static const char * const fqs[] = RCU_FQS_NAME_INIT; |
4130 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 4016 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
4131 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | 4017 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; |
4132 | static u8 fl_mask = 0x1; | ||
4133 | 4018 | ||
4134 | int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ | ||
4135 | int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ | 4019 | int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ |
4136 | int cpustride = 1; | 4020 | int cpustride = 1; |
4137 | int i; | 4021 | int i; |
@@ -4146,20 +4030,16 @@ static void __init rcu_init_one(struct rcu_state *rsp) | |||
4146 | 4030 | ||
4147 | /* Initialize the level-tracking arrays. */ | 4031 | /* Initialize the level-tracking arrays. */ |
4148 | 4032 | ||
4149 | for (i = 0; i < rcu_num_lvls; i++) | ||
4150 | levelcnt[i] = num_rcu_lvl[i]; | ||
4151 | for (i = 1; i < rcu_num_lvls; i++) | 4033 | for (i = 1; i < rcu_num_lvls; i++) |
4152 | rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; | 4034 | rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1]; |
4153 | rcu_init_levelspread(levelspread, levelcnt); | 4035 | rcu_init_levelspread(levelspread, num_rcu_lvl); |
4154 | rsp->flavor_mask = fl_mask; | ||
4155 | fl_mask <<= 1; | ||
4156 | 4036 | ||
4157 | /* Initialize the elements themselves, starting from the leaves. */ | 4037 | /* Initialize the elements themselves, starting from the leaves. */ |
4158 | 4038 | ||
4159 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | 4039 | for (i = rcu_num_lvls - 1; i >= 0; i--) { |
4160 | cpustride *= levelspread[i]; | 4040 | cpustride *= levelspread[i]; |
4161 | rnp = rsp->level[i]; | 4041 | rnp = rsp->level[i]; |
4162 | for (j = 0; j < levelcnt[i]; j++, rnp++) { | 4042 | for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) { |
4163 | raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); | 4043 | raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); |
4164 | lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), | 4044 | lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), |
4165 | &rcu_node_class[i], buf[i]); | 4045 | &rcu_node_class[i], buf[i]); |
@@ -4332,6 +4212,8 @@ void __init rcu_init(void) | |||
4332 | for_each_online_cpu(cpu) { | 4212 | for_each_online_cpu(cpu) { |
4333 | rcutree_prepare_cpu(cpu); | 4213 | rcutree_prepare_cpu(cpu); |
4334 | rcu_cpu_starting(cpu); | 4214 | rcu_cpu_starting(cpu); |
4215 | if (IS_ENABLED(CONFIG_TREE_SRCU)) | ||
4216 | srcu_online_cpu(cpu); | ||
4335 | } | 4217 | } |
4336 | } | 4218 | } |
4337 | 4219 | ||
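[Illustration, not part of the patch] The hunk above that extracts ready callbacks with rcu_segcblist_extract_done_cbs() relies on a counting convention that is easy to miss: the extracted rcu_cblist starts with ->len at zero (the counts stay behind so rcu_barrier() remains conservative), every rcu_cblist_dequeue() decrements ->len, so the number of invoked callbacks is -rcu_cblist_n_cbs(&rcl), and rcu_segcblist_insert_count() later adds that negative value back to subtract the invoked callbacks from the owning list. The stand-alone C sketch below models only that convention; struct cb, struct cblist and the helpers are toy stand-ins, not the kernel's rcu_cblist API.

#include <stdio.h>

struct cb {
	struct cb *next;
};

/* Toy stand-in for the kernel's rcu_cblist: head, tail pointer, count. */
struct cblist {
	struct cb *head;
	struct cb **tail;
	long len;			/* counts down from zero while draining */
};

static void cblist_init(struct cblist *l)
{
	l->head = NULL;
	l->tail = &l->head;
	l->len = 0;
}

/* Dequeue one callback, decrementing ->len below zero as we go. */
static struct cb *cblist_dequeue(struct cblist *l)
{
	struct cb *c = l->head;

	if (!c)
		return NULL;
	l->len--;
	l->head = c->next;
	if (!l->head)
		l->tail = &l->head;
	return c;
}

int main(void)
{
	struct cblist rcl;
	struct cb cbs[3];
	long owner_len = 3;		/* the count left behind on the owner list */
	struct cb *c;
	int i;

	cblist_init(&rcl);
	for (i = 0; i < 3; i++) {	/* "extract": move callbacks, not counts */
		cbs[i].next = NULL;
		*rcl.tail = &cbs[i];
		rcl.tail = &cbs[i].next;
	}

	while ((c = cblist_dequeue(&rcl)) != NULL)
		;			/* "invoke" each callback */

	printf("invoked %ld callbacks\n", -rcl.len);		/* prints 3 */
	owner_len += rcl.len;		/* insert_count: subtract what was invoked */
	printf("owner list now holds %ld callbacks\n", owner_len);	/* prints 0 */
	return 0;
}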
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ec62a05bfdb3..0e598ab08fea 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -30,80 +30,8 @@ | |||
30 | #include <linux/seqlock.h> | 30 | #include <linux/seqlock.h> |
31 | #include <linux/swait.h> | 31 | #include <linux/swait.h> |
32 | #include <linux/stop_machine.h> | 32 | #include <linux/stop_machine.h> |
33 | 33 | #include <linux/rcu_segcblist.h> | |
34 | /* | 34 | #include <linux/rcu_node_tree.h> |
35 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and | ||
36 | * CONFIG_RCU_FANOUT_LEAF. | ||
37 | * In theory, it should be possible to add more levels straightforwardly. | ||
38 | * In practice, this did work well going from three levels to four. | ||
39 | * Of course, your mileage may vary. | ||
40 | */ | ||
41 | |||
42 | #ifdef CONFIG_RCU_FANOUT | ||
43 | #define RCU_FANOUT CONFIG_RCU_FANOUT | ||
44 | #else /* #ifdef CONFIG_RCU_FANOUT */ | ||
45 | # ifdef CONFIG_64BIT | ||
46 | # define RCU_FANOUT 64 | ||
47 | # else | ||
48 | # define RCU_FANOUT 32 | ||
49 | # endif | ||
50 | #endif /* #else #ifdef CONFIG_RCU_FANOUT */ | ||
51 | |||
52 | #ifdef CONFIG_RCU_FANOUT_LEAF | ||
53 | #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF | ||
54 | #else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ | ||
55 | # ifdef CONFIG_64BIT | ||
56 | # define RCU_FANOUT_LEAF 64 | ||
57 | # else | ||
58 | # define RCU_FANOUT_LEAF 32 | ||
59 | # endif | ||
60 | #endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ | ||
61 | |||
62 | #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) | ||
63 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) | ||
64 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) | ||
65 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) | ||
66 | |||
67 | #if NR_CPUS <= RCU_FANOUT_1 | ||
68 | # define RCU_NUM_LVLS 1 | ||
69 | # define NUM_RCU_LVL_0 1 | ||
70 | # define NUM_RCU_NODES NUM_RCU_LVL_0 | ||
71 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } | ||
72 | # define RCU_NODE_NAME_INIT { "rcu_node_0" } | ||
73 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } | ||
74 | #elif NR_CPUS <= RCU_FANOUT_2 | ||
75 | # define RCU_NUM_LVLS 2 | ||
76 | # define NUM_RCU_LVL_0 1 | ||
77 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
78 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) | ||
79 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } | ||
80 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } | ||
81 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } | ||
82 | #elif NR_CPUS <= RCU_FANOUT_3 | ||
83 | # define RCU_NUM_LVLS 3 | ||
84 | # define NUM_RCU_LVL_0 1 | ||
85 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | ||
86 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
87 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) | ||
88 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } | ||
89 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } | ||
90 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } | ||
91 | #elif NR_CPUS <= RCU_FANOUT_4 | ||
92 | # define RCU_NUM_LVLS 4 | ||
93 | # define NUM_RCU_LVL_0 1 | ||
94 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) | ||
95 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | ||
96 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
97 | # define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) | ||
98 | # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } | ||
99 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } | ||
100 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } | ||
101 | #else | ||
102 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" | ||
103 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ | ||
104 | |||
105 | extern int rcu_num_lvls; | ||
106 | extern int rcu_num_nodes; | ||
107 | 35 | ||
108 | /* | 36 | /* |
109 | * Dynticks per-CPU state. | 37 | * Dynticks per-CPU state. |
@@ -113,6 +41,9 @@ struct rcu_dynticks { | |||
113 | /* Process level is worth LLONG_MAX/2. */ | 41 | /* Process level is worth LLONG_MAX/2. */ |
114 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 42 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
115 | atomic_t dynticks; /* Even value for idle, else odd. */ | 43 | atomic_t dynticks; /* Even value for idle, else odd. */ |
44 | bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ | ||
45 | unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ | ||
46 | bool rcu_urgent_qs; /* GP old need light quiescent state. */ | ||
116 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 47 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE |
117 | long long dynticks_idle_nesting; | 48 | long long dynticks_idle_nesting; |
118 | /* irq/process nesting level from idle. */ | 49 | /* irq/process nesting level from idle. */ |
@@ -262,41 +193,6 @@ struct rcu_node { | |||
262 | #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) | 193 | #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) |
263 | 194 | ||
264 | /* | 195 | /* |
265 | * Do a full breadth-first scan of the rcu_node structures for the | ||
266 | * specified rcu_state structure. | ||
267 | */ | ||
268 | #define rcu_for_each_node_breadth_first(rsp, rnp) \ | ||
269 | for ((rnp) = &(rsp)->node[0]; \ | ||
270 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
271 | |||
272 | /* | ||
273 | * Do a breadth-first scan of the non-leaf rcu_node structures for the | ||
274 | * specified rcu_state structure. Note that if there is a singleton | ||
275 | * rcu_node tree with but one rcu_node structure, this loop is a no-op. | ||
276 | */ | ||
277 | #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ | ||
278 | for ((rnp) = &(rsp)->node[0]; \ | ||
279 | (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) | ||
280 | |||
281 | /* | ||
282 | * Scan the leaves of the rcu_node hierarchy for the specified rcu_state | ||
283 | * structure. Note that if there is a singleton rcu_node tree with but | ||
284 | * one rcu_node structure, this loop -will- visit the rcu_node structure. | ||
285 | * It is still a leaf node, even if it is also the root node. | ||
286 | */ | ||
287 | #define rcu_for_each_leaf_node(rsp, rnp) \ | ||
288 | for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ | ||
289 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
290 | |||
291 | /* | ||
292 | * Iterate over all possible CPUs in a leaf RCU node. | ||
293 | */ | ||
294 | #define for_each_leaf_node_possible_cpu(rnp, cpu) \ | ||
295 | for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ | ||
296 | cpu <= rnp->grphi; \ | ||
297 | cpu = cpumask_next((cpu), cpu_possible_mask)) | ||
298 | |||
299 | /* | ||
300 | * Union to allow "aggregate OR" operation on the need for a quiescent | 196 | * Union to allow "aggregate OR" operation on the need for a quiescent |
301 | * state by the normal and expedited grace periods. | 197 | * state by the normal and expedited grace periods. |
302 | */ | 198 | */ |
@@ -336,34 +232,9 @@ struct rcu_data { | |||
336 | /* period it is aware of. */ | 232 | /* period it is aware of. */ |
337 | 233 | ||
338 | /* 2) batch handling */ | 234 | /* 2) batch handling */ |
339 | /* | 235 | struct rcu_segcblist cblist; /* Segmented callback list, with */ |
340 | * If nxtlist is not NULL, it is partitioned as follows. | 236 | /* different callbacks waiting for */ |
341 | * Any of the partitions might be empty, in which case the | 237 | /* different grace periods. */ |
342 | * pointer to that partition will be equal to the pointer for | ||
343 | * the following partition. When the list is empty, all of | ||
344 | * the nxttail elements point to the ->nxtlist pointer itself, | ||
345 | * which in that case is NULL. | ||
346 | * | ||
347 | * [nxtlist, *nxttail[RCU_DONE_TAIL]): | ||
348 | * Entries that batch # <= ->completed | ||
349 | * The grace period for these entries has completed, and | ||
350 | * the other grace-period-completed entries may be moved | ||
351 | * here temporarily in rcu_process_callbacks(). | ||
352 | * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): | ||
353 | * Entries that batch # <= ->completed - 1: waiting for current GP | ||
354 | * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): | ||
355 | * Entries known to have arrived before current GP ended | ||
356 | * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): | ||
357 | * Entries that might have arrived after current GP ended | ||
358 | * Note that the value of *nxttail[RCU_NEXT_TAIL] will | ||
359 | * always be NULL, as this is the end of the list. | ||
360 | */ | ||
361 | struct rcu_head *nxtlist; | ||
362 | struct rcu_head **nxttail[RCU_NEXT_SIZE]; | ||
363 | unsigned long nxtcompleted[RCU_NEXT_SIZE]; | ||
364 | /* grace periods for sublists. */ | ||
365 | long qlen_lazy; /* # of lazy queued callbacks */ | ||
366 | long qlen; /* # of queued callbacks, incl lazy */ | ||
367 | long qlen_last_fqs_check; | 238 | long qlen_last_fqs_check; |
368 | /* qlen at last check for QS forcing */ | 239 | /* qlen at last check for QS forcing */ |
369 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | 240 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ |
@@ -482,7 +353,6 @@ struct rcu_state { | |||
482 | struct rcu_node *level[RCU_NUM_LVLS + 1]; | 353 | struct rcu_node *level[RCU_NUM_LVLS + 1]; |
483 | /* Hierarchy levels (+1 to */ | 354 | /* Hierarchy levels (+1 to */ |
484 | /* shut bogus gcc warning) */ | 355 | /* shut bogus gcc warning) */ |
485 | u8 flavor_mask; /* bit in flavor mask. */ | ||
486 | struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ | 356 | struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ |
487 | call_rcu_func_t call; /* call_rcu() flavor. */ | 357 | call_rcu_func_t call; /* call_rcu() flavor. */ |
488 | int ncpus; /* # CPUs seen so far. */ | 358 | int ncpus; /* # CPUs seen so far. */ |
@@ -502,14 +372,11 @@ struct rcu_state { | |||
502 | 372 | ||
503 | raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; | 373 | raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; |
504 | /* Protect following fields. */ | 374 | /* Protect following fields. */ |
505 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | 375 | struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ |
506 | /* need a grace period. */ | 376 | /* need a grace period. */ |
507 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | 377 | struct rcu_cblist orphan_done; /* Orphaned callbacks that */ |
508 | struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ | ||
509 | /* are ready to invoke. */ | 378 | /* are ready to invoke. */ |
510 | struct rcu_head **orphan_donetail; /* Tail of above. */ | 379 | /* (Contains counts.) */ |
511 | long qlen_lazy; /* Number of lazy callbacks. */ | ||
512 | long qlen; /* Total number of callbacks. */ | ||
513 | /* End of fields guarded by orphan_lock. */ | 380 | /* End of fields guarded by orphan_lock. */ |
514 | 381 | ||
515 | struct mutex barrier_mutex; /* Guards barrier fields. */ | 382 | struct mutex barrier_mutex; /* Guards barrier fields. */ |
@@ -596,6 +463,7 @@ extern struct rcu_state rcu_preempt_state; | |||
596 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | 463 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ |
597 | 464 | ||
598 | int rcu_dynticks_snap(struct rcu_dynticks *rdtp); | 465 | int rcu_dynticks_snap(struct rcu_dynticks *rdtp); |
466 | bool rcu_eqs_special_set(int cpu); | ||
599 | 467 | ||
600 | #ifdef CONFIG_RCU_BOOST | 468 | #ifdef CONFIG_RCU_BOOST |
601 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | 469 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); |
@@ -673,6 +541,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp); | |||
673 | static void rcu_dynticks_task_enter(void); | 541 | static void rcu_dynticks_task_enter(void); |
674 | static void rcu_dynticks_task_exit(void); | 542 | static void rcu_dynticks_task_exit(void); |
675 | 543 | ||
544 | #ifdef CONFIG_SRCU | ||
545 | void srcu_online_cpu(unsigned int cpu); | ||
546 | void srcu_offline_cpu(unsigned int cpu); | ||
547 | #else /* #ifdef CONFIG_SRCU */ | ||
548 | void srcu_online_cpu(unsigned int cpu) { } | ||
549 | void srcu_offline_cpu(unsigned int cpu) { } | ||
550 | #endif /* #else #ifdef CONFIG_SRCU */ | ||
551 | |||
676 | #endif /* #ifndef RCU_TREE_NONCORE */ | 552 | #endif /* #ifndef RCU_TREE_NONCORE */ |
677 | 553 | ||
678 | #ifdef CONFIG_RCU_TRACE | 554 | #ifdef CONFIG_RCU_TRACE |
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index a7b639ccd46e..e513b4ab1197 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h | |||
@@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | |||
292 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, | 292 | trace_rcu_exp_funnel_lock(rsp->name, rnp->level, |
293 | rnp->grplo, rnp->grphi, | 293 | rnp->grplo, rnp->grphi, |
294 | TPS("wait")); | 294 | TPS("wait")); |
295 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | 295 | wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], |
296 | sync_exp_work_done(rsp, | 296 | sync_exp_work_done(rsp, |
297 | &rdp->exp_workdone2, s)); | 297 | &rdp->exp_workdone2, s)); |
298 | return true; | 298 | return true; |
@@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data) | |||
331 | return; | 331 | return; |
332 | } | 332 | } |
333 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); | 333 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); |
334 | /* Store .exp before .rcu_urgent_qs. */ | ||
335 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); | ||
334 | resched_cpu(smp_processor_id()); | 336 | resched_cpu(smp_processor_id()); |
335 | } | 337 | } |
336 | 338 | ||
@@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) | |||
531 | rnp->exp_seq_rq = s; | 533 | rnp->exp_seq_rq = s; |
532 | spin_unlock(&rnp->exp_lock); | 534 | spin_unlock(&rnp->exp_lock); |
533 | } | 535 | } |
534 | wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); | 536 | smp_mb(); /* All above changes before wakeup. */ |
537 | wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]); | ||
535 | } | 538 | } |
536 | trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); | 539 | trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); |
537 | mutex_unlock(&rsp->exp_wake_mutex); | 540 | mutex_unlock(&rsp->exp_wake_mutex); |
@@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, | |||
609 | /* Wait for expedited grace period to complete. */ | 612 | /* Wait for expedited grace period to complete. */ |
610 | rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); | 613 | rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); |
611 | rnp = rcu_get_root(rsp); | 614 | rnp = rcu_get_root(rsp); |
612 | wait_event(rnp->exp_wq[(s >> 1) & 0x3], | 615 | wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], |
613 | sync_exp_work_done(rsp, | 616 | sync_exp_work_done(rsp, &rdp->exp_workdone0, s)); |
614 | &rdp->exp_workdone0, s)); | 617 | smp_mb(); /* Workqueue actions happen before return. */ |
615 | 618 | ||
616 | /* Let the next expedited grace period start. */ | 619 | /* Let the next expedited grace period start. */ |
617 | mutex_unlock(&rsp->exp_mutex); | 620 | mutex_unlock(&rsp->exp_mutex); |
@@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void) | |||
735 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 738 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
736 | 739 | ||
737 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | 740 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ |
738 | |||
739 | /* | ||
740 | * Switch to run-time mode once Tree RCU has fully initialized. | ||
741 | */ | ||
742 | static int __init rcu_exp_runtime_mode(void) | ||
743 | { | ||
744 | rcu_test_sync_prims(); | ||
745 | rcu_scheduler_active = RCU_SCHEDULER_RUNNING; | ||
746 | rcu_test_sync_prims(); | ||
747 | return 0; | ||
748 | } | ||
749 | core_initcall(rcu_exp_runtime_mode); | ||
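[Illustration, not part of the patch] The expedited-grace-period hunks above replace the open-coded "(s >> 1) & 0x3" with "rcu_seq_ctr(s) & 0x3" when picking one of the four per-node wait queues, and the rcu_seq_start()/rcu_seq_end()/rcu_seq_snap()/rcu_seq_done() helpers removed from tree.c earlier in this diff are consolidated for shared use. The scheme is the usual sequence counter: the low-order bit means an update is in progress and the remaining bits count completed operations. The stand-alone sketch below re-implements those helpers as shown in the removed code, minus the memory barriers, the WRITE_ONCE()/READ_ONCE() wrappers, and the wrap-safe comparison; rcu_seq_ctr() is assumed here to be a plain right shift, matching the old open-coded form.

#include <stdio.h>

static unsigned long seq;		/* stand-in for ->expedited_sequence */

/* Assumed to match the old open-coded "(s >> 1)": strip the in-progress bit. */
static unsigned long rcu_seq_ctr(unsigned long s)
{
	return s >> 1;
}

static void rcu_seq_start(unsigned long *sp)
{
	*sp += 1;			/* odd value: update side is running */
}

static void rcu_seq_end(unsigned long *sp)
{
	*sp += 1;			/* even value: update side is idle */
}

static unsigned long rcu_seq_snap(unsigned long *sp)
{
	return (*sp + 3) & ~0x1UL;	/* first even value meaning "done" */
}

static int rcu_seq_done(unsigned long *sp, unsigned long s)
{
	return *sp >= s;		/* kernel uses ULONG_CMP_GE() for wrap */
}

int main(void)
{
	unsigned long snap = rcu_seq_snap(&seq);

	printf("snap=%lu sleeps on exp_wq[%lu]\n", snap, rcu_seq_ctr(snap) & 0x3);
	rcu_seq_start(&seq);		/* expedited grace period begins */
	printf("done after start? %d\n", rcu_seq_done(&seq, snap));	/* 0 */
	rcu_seq_end(&seq);		/* expedited grace period ends */
	printf("done after end?   %d\n", rcu_seq_done(&seq, snap));	/* 1 */
	return 0;
}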
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a62a8f1caac..7f1d677a2a25 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -1350,10 +1350,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) | |||
1350 | */ | 1350 | */ |
1351 | if ((rdp->completed != rnp->completed || | 1351 | if ((rdp->completed != rnp->completed || |
1352 | unlikely(READ_ONCE(rdp->gpwrap))) && | 1352 | unlikely(READ_ONCE(rdp->gpwrap))) && |
1353 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) | 1353 | rcu_segcblist_pend_cbs(&rdp->cblist)) |
1354 | note_gp_changes(rsp, rdp); | 1354 | note_gp_changes(rsp, rdp); |
1355 | 1355 | ||
1356 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 1356 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) |
1357 | cbs_ready = true; | 1357 | cbs_ready = true; |
1358 | } | 1358 | } |
1359 | return cbs_ready; | 1359 | return cbs_ready; |
@@ -1461,7 +1461,7 @@ static void rcu_prepare_for_idle(void) | |||
1461 | rdtp->last_accelerate = jiffies; | 1461 | rdtp->last_accelerate = jiffies; |
1462 | for_each_rcu_flavor(rsp) { | 1462 | for_each_rcu_flavor(rsp) { |
1463 | rdp = this_cpu_ptr(rsp->rda); | 1463 | rdp = this_cpu_ptr(rsp->rda); |
1464 | if (!*rdp->nxttail[RCU_DONE_TAIL]) | 1464 | if (rcu_segcblist_pend_cbs(&rdp->cblist)) |
1465 | continue; | 1465 | continue; |
1466 | rnp = rdp->mynode; | 1466 | rnp = rdp->mynode; |
1467 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ | 1467 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ |
@@ -1529,7 +1529,7 @@ static void rcu_oom_notify_cpu(void *unused) | |||
1529 | 1529 | ||
1530 | for_each_rcu_flavor(rsp) { | 1530 | for_each_rcu_flavor(rsp) { |
1531 | rdp = raw_cpu_ptr(rsp->rda); | 1531 | rdp = raw_cpu_ptr(rsp->rda); |
1532 | if (rdp->qlen_lazy != 0) { | 1532 | if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) { |
1533 | atomic_inc(&oom_callback_count); | 1533 | atomic_inc(&oom_callback_count); |
1534 | rsp->call(&rdp->oom_head, rcu_oom_callback); | 1534 | rsp->call(&rdp->oom_head, rcu_oom_callback); |
1535 | } | 1535 | } |
@@ -1709,7 +1709,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup); | |||
1709 | 1709 | ||
1710 | static int __init parse_rcu_nocb_poll(char *arg) | 1710 | static int __init parse_rcu_nocb_poll(char *arg) |
1711 | { | 1711 | { |
1712 | rcu_nocb_poll = 1; | 1712 | rcu_nocb_poll = true; |
1713 | return 0; | 1713 | return 0; |
1714 | } | 1714 | } |
1715 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | 1715 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); |
@@ -1860,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
1860 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1860 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
1861 | TPS("WakeEmpty")); | 1861 | TPS("WakeEmpty")); |
1862 | } else { | 1862 | } else { |
1863 | rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; | 1863 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); |
1864 | /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ | ||
1865 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); | ||
1864 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1866 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
1865 | TPS("WakeEmptyIsDeferred")); | 1867 | TPS("WakeEmptyIsDeferred")); |
1866 | } | 1868 | } |
@@ -1872,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
1872 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1874 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
1873 | TPS("WakeOvf")); | 1875 | TPS("WakeOvf")); |
1874 | } else { | 1876 | } else { |
1875 | rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; | 1877 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); |
1878 | /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ | ||
1879 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); | ||
1876 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1880 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
1877 | TPS("WakeOvfIsDeferred")); | 1881 | TPS("WakeOvfIsDeferred")); |
1878 | } | 1882 | } |
@@ -1930,30 +1934,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
1930 | struct rcu_data *rdp, | 1934 | struct rcu_data *rdp, |
1931 | unsigned long flags) | 1935 | unsigned long flags) |
1932 | { | 1936 | { |
1933 | long ql = rsp->qlen; | 1937 | long ql = rcu_cblist_n_cbs(&rsp->orphan_done); |
1934 | long qll = rsp->qlen_lazy; | 1938 | long qll = rcu_cblist_n_lazy_cbs(&rsp->orphan_done); |
1935 | 1939 | ||
1936 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ | 1940 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ |
1937 | if (!rcu_is_nocb_cpu(smp_processor_id())) | 1941 | if (!rcu_is_nocb_cpu(smp_processor_id())) |
1938 | return false; | 1942 | return false; |
1939 | rsp->qlen = 0; | ||
1940 | rsp->qlen_lazy = 0; | ||
1941 | 1943 | ||
1942 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ | 1944 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ |
1943 | if (rsp->orphan_donelist != NULL) { | 1945 | if (!rcu_cblist_empty(&rsp->orphan_done)) { |
1944 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, | 1946 | __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), |
1945 | rsp->orphan_donetail, ql, qll, flags); | 1947 | rcu_cblist_tail(&rsp->orphan_done), |
1946 | ql = qll = 0; | 1948 | ql, qll, flags); |
1947 | rsp->orphan_donelist = NULL; | ||
1948 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
1949 | } | 1949 | } |
1950 | if (rsp->orphan_nxtlist != NULL) { | 1950 | if (!rcu_cblist_empty(&rsp->orphan_pend)) { |
1951 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, | 1951 | __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), |
1952 | rsp->orphan_nxttail, ql, qll, flags); | 1952 | rcu_cblist_tail(&rsp->orphan_pend), |
1953 | ql = qll = 0; | 1953 | ql, qll, flags); |
1954 | rsp->orphan_nxtlist = NULL; | ||
1955 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
1956 | } | 1954 | } |
1955 | rcu_cblist_init(&rsp->orphan_done); | ||
1956 | rcu_cblist_init(&rsp->orphan_pend); | ||
1957 | return true; | 1957 | return true; |
1958 | } | 1958 | } |
1959 | 1959 | ||
@@ -2395,16 +2395,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) | |||
2395 | return false; | 2395 | return false; |
2396 | 2396 | ||
2397 | /* If there are early-boot callbacks, move them to nocb lists. */ | 2397 | /* If there are early-boot callbacks, move them to nocb lists. */ |
2398 | if (rdp->nxtlist) { | 2398 | if (!rcu_segcblist_empty(&rdp->cblist)) { |
2399 | rdp->nocb_head = rdp->nxtlist; | 2399 | rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); |
2400 | rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; | 2400 | rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); |
2401 | atomic_long_set(&rdp->nocb_q_count, rdp->qlen); | 2401 | atomic_long_set(&rdp->nocb_q_count, |
2402 | atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); | 2402 | rcu_segcblist_n_cbs(&rdp->cblist)); |
2403 | rdp->nxtlist = NULL; | 2403 | atomic_long_set(&rdp->nocb_q_count_lazy, |
2404 | rdp->qlen = 0; | 2404 | rcu_segcblist_n_lazy_cbs(&rdp->cblist)); |
2405 | rdp->qlen_lazy = 0; | 2405 | rcu_segcblist_init(&rdp->cblist); |
2406 | } | 2406 | } |
2407 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 2407 | rcu_segcblist_disable(&rdp->cblist); |
2408 | return true; | 2408 | return true; |
2409 | } | 2409 | } |
2410 | 2410 | ||
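[Illustration, not part of the patch] Several hunks above pair a plain WRITE_ONCE() of ->nocb_defer_wakeup (or the setting of .exp in tree_exp.h) with an smp_store_release() of ->rcu_urgent_qs, annotated "Store ->nocb_defer_wakeup before ->rcu_urgent_qs." The point is ordering: a reader that observes the urgent-QS flag must also see why it was set. The matching reader sits elsewhere in the kernel, on the quiescent-state path; the toy C11 program below only demonstrates the release/acquire pairing itself, with made-up variable names and both sides squeezed into one single-threaded program.

#include <stdatomic.h>
#include <stdio.h>

static int nocb_defer_wakeup;		/* the "reason", written first */
static atomic_bool rcu_urgent_qs;	/* the flag, published with release */

static void writer(void)
{
	nocb_defer_wakeup = 1;				/* WRITE_ONCE() in the kernel */
	atomic_store_explicit(&rcu_urgent_qs, 1,
			      memory_order_release);	/* smp_store_release() */
}

static void reader(void)
{
	/* An acquire load that sees the flag also sees the earlier store. */
	if (atomic_load_explicit(&rcu_urgent_qs, memory_order_acquire))
		printf("urgent QS requested, deferred-wakeup state %d\n",
		       nocb_defer_wakeup);
}

int main(void)
{
	writer();
	reader();
	return 0;
}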
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 8751a748499a..30c5bf89ee58 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
@@ -41,11 +41,11 @@ | |||
41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
42 | #include <linux/debugfs.h> | 42 | #include <linux/debugfs.h> |
43 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
44 | #include <linux/prefetch.h> | ||
44 | 45 | ||
45 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
46 | #include "tree.h" | 47 | #include "tree.h" |
47 | 48 | #include "rcu.h" | |
48 | DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); | ||
49 | 49 | ||
50 | static int r_open(struct inode *inode, struct file *file, | 50 | static int r_open(struct inode *inode, struct file *file, |
51 | const struct seq_operations *op) | 51 | const struct seq_operations *op) |
@@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
121 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 121 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
122 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), | 122 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), |
123 | rdp->cpu_no_qs.b.norm, | 123 | rdp->cpu_no_qs.b.norm, |
124 | rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), | 124 | rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu), |
125 | rdp->core_needs_qs); | 125 | rdp->core_needs_qs); |
126 | seq_printf(m, " dt=%d/%llx/%d df=%lu", | 126 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
127 | rcu_dynticks_snap(rdp->dynticks), | 127 | rcu_dynticks_snap(rdp->dynticks), |
@@ -130,17 +130,15 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
130 | rdp->dynticks_fqs); | 130 | rdp->dynticks_fqs); |
131 | seq_printf(m, " of=%lu", rdp->offline_fqs); | 131 | seq_printf(m, " of=%lu", rdp->offline_fqs); |
132 | rcu_nocb_q_lengths(rdp, &ql, &qll); | 132 | rcu_nocb_q_lengths(rdp, &ql, &qll); |
133 | qll += rdp->qlen_lazy; | 133 | qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist); |
134 | ql += rdp->qlen; | 134 | ql += rcu_segcblist_n_cbs(&rdp->cblist); |
135 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", | 135 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", |
136 | qll, ql, | 136 | qll, ql, |
137 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | 137 | ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)], |
138 | rdp->nxttail[RCU_NEXT_TAIL]], | 138 | ".R"[!rcu_segcblist_segempty(&rdp->cblist, |
139 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | 139 | RCU_NEXT_READY_TAIL)], |
140 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | 140 | ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)], |
141 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | 141 | ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]); |
142 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
143 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
144 | #ifdef CONFIG_RCU_BOOST | 142 | #ifdef CONFIG_RCU_BOOST |
145 | seq_printf(m, " kt=%d/%c ktl=%x", | 143 | seq_printf(m, " kt=%d/%c ktl=%x", |
146 | per_cpu(rcu_cpu_has_work, rdp->cpu), | 144 | per_cpu(rcu_cpu_has_work, rdp->cpu), |
@@ -278,7 +276,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
278 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", | 276 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
279 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 277 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
280 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
281 | READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); | 279 | READ_ONCE(rsp->n_force_qs_lh), |
280 | rcu_cblist_n_lazy_cbs(&rsp->orphan_done), | ||
281 | rcu_cblist_n_cbs(&rsp->orphan_done)); | ||
282 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { | 282 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { |
283 | if (rnp->level != level) { | 283 | if (rnp->level != level) { |
284 | seq_puts(m, "\n"); | 284 | seq_puts(m, "\n"); |
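[Illustration, not part of the patch] The tree_trace.c hunk above keeps the terse ".N"[...] idiom while switching the emptiness tests to rcu_segcblist_segempty(). Indexing a two-character string literal with 0 or 1 selects '.' for an empty segment and the letter for a non-empty one; the trivial program below shows just that idiom with hypothetical flag values.

#include <stdio.h>

int main(void)
{
	int have_next = 1;		/* pretend RCU_NEXT_TAIL is non-empty */
	int have_done = 0;		/* pretend RCU_DONE_TAIL is empty */

	/* ".N"[1] is 'N', ".D"[0] is '.', so this prints "qs=N." */
	printf("qs=%c%c\n", ".N"[!!have_next], ".D"[!!have_done]);
	return 0;
}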
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 55c8530316c7..273e869ca21d 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); | |||
124 | * non-expedited counterparts? Intended for use within RCU. Note | 124 | * non-expedited counterparts? Intended for use within RCU. Note |
125 | * that if the user specifies both rcu_expedited and rcu_normal, then | 125 | * that if the user specifies both rcu_expedited and rcu_normal, then |
126 | * rcu_normal wins. (Except during the time period during boot from | 126 | * rcu_normal wins. (Except during the time period during boot from |
127 | * when the first task is spawned until the rcu_exp_runtime_mode() | 127 | * when the first task is spawned until the rcu_set_runtime_mode() |
128 | * core_initcall() is invoked, at which point everything is expedited.) | 128 | * core_initcall() is invoked, at which point everything is expedited.) |
129 | */ | 129 | */ |
130 | bool rcu_gp_is_normal(void) | 130 | bool rcu_gp_is_normal(void) |
@@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void) | |||
190 | 190 | ||
191 | #endif /* #ifndef CONFIG_TINY_RCU */ | 191 | #endif /* #ifndef CONFIG_TINY_RCU */ |
192 | 192 | ||
193 | /* | ||
194 | * Test each non-SRCU synchronous grace-period wait API. This is | ||
195 | * useful just after a change in mode for these primitives, and | ||
196 | * during early boot. | ||
197 | */ | ||
198 | void rcu_test_sync_prims(void) | ||
199 | { | ||
200 | if (!IS_ENABLED(CONFIG_PROVE_RCU)) | ||
201 | return; | ||
202 | synchronize_rcu(); | ||
203 | synchronize_rcu_bh(); | ||
204 | synchronize_sched(); | ||
205 | synchronize_rcu_expedited(); | ||
206 | synchronize_rcu_bh_expedited(); | ||
207 | synchronize_sched_expedited(); | ||
208 | } | ||
209 | |||
210 | #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) | ||
211 | |||
212 | /* | ||
213 | * Switch to run-time mode once RCU has fully initialized. | ||
214 | */ | ||
215 | static int __init rcu_set_runtime_mode(void) | ||
216 | { | ||
217 | rcu_test_sync_prims(); | ||
218 | rcu_scheduler_active = RCU_SCHEDULER_RUNNING; | ||
219 | rcu_test_sync_prims(); | ||
220 | return 0; | ||
221 | } | ||
222 | core_initcall(rcu_set_runtime_mode); | ||
223 | |||
224 | #endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ | ||
225 | |||
193 | #ifdef CONFIG_PREEMPT_RCU | 226 | #ifdef CONFIG_PREEMPT_RCU |
194 | 227 | ||
195 | /* | 228 | /* |
@@ -632,6 +665,7 @@ static void check_holdout_task(struct task_struct *t, | |||
632 | put_task_struct(t); | 665 | put_task_struct(t); |
633 | return; | 666 | return; |
634 | } | 667 | } |
668 | rcu_request_urgent_qs_task(t); | ||
635 | if (!needreport) | 669 | if (!needreport) |
636 | return; | 670 | return; |
637 | if (*firstreport) { | 671 | if (*firstreport) { |
@@ -817,23 +851,6 @@ static void rcu_spawn_tasks_kthread(void) | |||
817 | 851 | ||
818 | #endif /* #ifdef CONFIG_TASKS_RCU */ | 852 | #endif /* #ifdef CONFIG_TASKS_RCU */ |
819 | 853 | ||
820 | /* | ||
821 | * Test each non-SRCU synchronous grace-period wait API. This is | ||
822 | * useful just after a change in mode for these primitives, and | ||
823 | * during early boot. | ||
824 | */ | ||
825 | void rcu_test_sync_prims(void) | ||
826 | { | ||
827 | if (!IS_ENABLED(CONFIG_PROVE_RCU)) | ||
828 | return; | ||
829 | synchronize_rcu(); | ||
830 | synchronize_rcu_bh(); | ||
831 | synchronize_sched(); | ||
832 | synchronize_rcu_expedited(); | ||
833 | synchronize_rcu_bh_expedited(); | ||
834 | synchronize_sched_expedited(); | ||
835 | } | ||
836 | |||
837 | #ifdef CONFIG_PROVE_RCU | 854 | #ifdef CONFIG_PROVE_RCU |
838 | 855 | ||
839 | /* | 856 | /* |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3b31fc05a0f1..2adf7b6c04e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -3378,7 +3378,7 @@ static void __sched notrace __schedule(bool preempt) | |||
3378 | hrtick_clear(rq); | 3378 | hrtick_clear(rq); |
3379 | 3379 | ||
3380 | local_irq_disable(); | 3380 | local_irq_disable(); |
3381 | rcu_note_context_switch(); | 3381 | rcu_note_context_switch(preempt); |
3382 | 3382 | ||
3383 | /* | 3383 | /* |
3384 | * Make sure that signal_pending_state()->signal_pending() below | 3384 | * Make sure that signal_pending_state()->signal_pending() below |
diff --git a/kernel/signal.c b/kernel/signal.c index 7e59ebc2c25e..6df5f72158e4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, | |||
1237 | } | 1237 | } |
1238 | /* | 1238 | /* |
1239 | * This sighand can be already freed and even reused, but | 1239 | * This sighand can be already freed and even reused, but |
1240 | * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which | 1240 | * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which |
1241 | * initializes ->siglock: this slab can't go away, it has | 1241 | * initializes ->siglock: this slab can't go away, it has |
1242 | * the same object type, ->siglock can't be reinitialized. | 1242 | * the same object type, ->siglock can't be reinitialized. |
1243 | * | 1243 | * |
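The sighand comment describes the constructor half of the type-safe scheme: with SLAB_TYPESAFE_BY_RCU the memory can only be recycled as another object of the same type, and a ->ctor() initializes long-lived fields such as locks once, so they stay valid even if the object is freed and reallocated under a concurrent reader. A hedged sketch of that idiom, with invented names:

    #include <linux/init.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    /* Hypothetical cache whose objects keep a once-initialized lock across
     * free/realloc cycles, mirroring sighand_ctor() + SLAB_TYPESAFE_BY_RCU.
     */
    struct example_locked {
            spinlock_t lock;
            void *owner;
    };

    static void example_ctor(void *data)
    {
            struct example_locked *p = data;

            spin_lock_init(&p->lock);   /* not re-run when the object is reallocated */
    }

    static struct kmem_cache *example_locked_cache;

    static int __init example_locked_cache_init(void)
    {
            example_locked_cache = kmem_cache_create("example_locked",
                                                     sizeof(struct example_locked), 0,
                                                     SLAB_TYPESAFE_BY_RCU, example_ctor);
            return example_locked_cache ? 0 : -ENOMEM;
    }
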
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 98b27195e38b..4b20061102f6 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c | |||
@@ -413,7 +413,7 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, | |||
413 | *size += sizeof(struct kasan_alloc_meta); | 413 | *size += sizeof(struct kasan_alloc_meta); |
414 | 414 | ||
415 | /* Add free meta. */ | 415 | /* Add free meta. */ |
416 | if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || | 416 | if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || |
417 | cache->object_size < sizeof(struct kasan_free_meta)) { | 417 | cache->object_size < sizeof(struct kasan_free_meta)) { |
418 | cache->kasan_info.free_meta_offset = *size; | 418 | cache->kasan_info.free_meta_offset = *size; |
419 | *size += sizeof(struct kasan_free_meta); | 419 | *size += sizeof(struct kasan_free_meta); |
@@ -561,7 +561,7 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) | |||
561 | unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); | 561 | unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); |
562 | 562 | ||
563 | /* RCU slabs could be legally used after free within the RCU period */ | 563 | /* RCU slabs could be legally used after free within the RCU period */ |
564 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) | 564 | if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) |
565 | return; | 565 | return; |
566 | 566 | ||
567 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); | 567 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); |
@@ -572,7 +572,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) | |||
572 | s8 shadow_byte; | 572 | s8 shadow_byte; |
573 | 573 | ||
574 | /* RCU slabs could be legally used after free within the RCU period */ | 574 | /* RCU slabs could be legally used after free within the RCU period */ |
575 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) | 575 | if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) |
576 | return false; | 576 | return false; |
577 | 577 | ||
578 | shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); | 578 | shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); |
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 5bf191756a4a..2d5959c5f7c5 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c | |||
@@ -95,7 +95,7 @@ void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, | |||
95 | void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) | 95 | void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) |
96 | { | 96 | { |
97 | /* TODO: RCU freeing is unsupported for now; hide false positives. */ | 97 | /* TODO: RCU freeing is unsupported for now; hide false positives. */ |
98 | if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) | 98 | if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU)) |
99 | kmemcheck_mark_freed(object, size); | 99 | kmemcheck_mark_freed(object, size); |
100 | } | 100 | } |
101 | 101 | ||
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index a7652acd2ab9..54ca54562928 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -21,7 +21,7 @@ | |||
21 | #include <linux/slab.h> | 21 | #include <linux/slab.h> |
22 | 22 | ||
23 | /* global SRCU for all MMs */ | 23 | /* global SRCU for all MMs */ |
24 | static struct srcu_struct srcu; | 24 | DEFINE_STATIC_SRCU(srcu); |
25 | 25 | ||
26 | /* | 26 | /* |
27 | * This function allows mmu_notifier::release callback to delay a call to | 27 | * This function allows mmu_notifier::release callback to delay a call to |
@@ -252,12 +252,6 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
252 | 252 | ||
253 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | 253 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
254 | 254 | ||
255 | /* | ||
256 | * Verify that mmu_notifier_init() already run and the global srcu is | ||
257 | * initialized. | ||
258 | */ | ||
259 | BUG_ON(!srcu.per_cpu_ref); | ||
260 | |||
261 | ret = -ENOMEM; | 255 | ret = -ENOMEM; |
262 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | 256 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); |
263 | if (unlikely(!mmu_notifier_mm)) | 257 | if (unlikely(!mmu_notifier_mm)) |
@@ -406,9 +400,3 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, | |||
406 | mmdrop(mm); | 400 | mmdrop(mm); |
407 | } | 401 | } |
408 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); | 402 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); |
409 | |||
410 | static int __init mmu_notifier_init(void) | ||
411 | { | ||
412 | return init_srcu_struct(&srcu); | ||
413 | } | ||
414 | subsys_initcall(mmu_notifier_init); | ||
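DEFINE_STATIC_SRCU() initializes the srcu_struct at compile time, which is why both the runtime init_srcu_struct() initcall and the per_cpu_ref sanity check can be dropped. A rough sketch of the statically defined SRCU pattern, using an illustrative name rather than the mmu_notifier code itself:

    #include <linux/srcu.h>

    DEFINE_STATIC_SRCU(example_srcu);   /* usable immediately, no initcall required */

    static int example_read_side(void)
    {
            int idx;

            idx = srcu_read_lock(&example_srcu);
            /* ... dereference state protected by example_srcu ... */
            srcu_read_unlock(&example_srcu, idx);
            return 0;
    }

    static void example_update_side(void)
    {
            /* ... unpublish the state ... */
            synchronize_srcu(&example_srcu);    /* wait out all current readers */
            /* ... now safe to free what was unpublished ... */
    }
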
@@ -430,7 +430,7 @@ static void anon_vma_ctor(void *data) | |||
430 | void __init anon_vma_init(void) | 430 | void __init anon_vma_init(void) |
431 | { | 431 | { |
432 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | 432 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), |
433 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, | 433 | 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, |
434 | anon_vma_ctor); | 434 | anon_vma_ctor); |
435 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, | 435 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, |
436 | SLAB_PANIC|SLAB_ACCOUNT); | 436 | SLAB_PANIC|SLAB_ACCOUNT); |
@@ -481,7 +481,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) | |||
481 | * If this page is still mapped, then its anon_vma cannot have been | 481 | * If this page is still mapped, then its anon_vma cannot have been |
482 | * freed. But if it has been unmapped, we have no security against the | 482 | * freed. But if it has been unmapped, we have no security against the |
483 | * anon_vma structure being freed and reused (for another anon_vma: | 483 | * anon_vma structure being freed and reused (for another anon_vma: |
484 | * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() | 484 | * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() |
485 | * above cannot corrupt). | 485 | * above cannot corrupt). |
486 | */ | 486 | */ |
487 | if (!page_mapped(page)) { | 487 | if (!page_mapped(page)) { |
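This hunk shows the canonical SLAB_TYPESAFE_BY_RCU read side: under rcu_read_lock() the object can be freed and recycled as another object of the same type, so the lookup takes a reference with atomic_inc_not_zero() and then re-checks identity. A generic sketch of that pattern follows; the structure and helpers are invented for illustration and mirror the LLC lookups later in this patch.

    #include <linux/atomic.h>
    #include <linux/rcupdate.h>

    struct example_obj {
            atomic_t refcnt;            /* 0 means freed: the "golden rule" */
            unsigned long key;
    };

    static struct example_obj *example_find(unsigned long key);   /* hypothetical hash walk */
    static void example_put(struct example_obj *obj);             /* hypothetical ref drop */

    static struct example_obj *example_lookup(unsigned long key)
    {
            struct example_obj *obj;

            rcu_read_lock();
            obj = example_find(key);                    /* may return a recycled object */
            if (obj && !atomic_inc_not_zero(&obj->refcnt))
                    obj = NULL;                         /* it was freed under us */
            if (obj && obj->key != key) {               /* got a reused object */
                    example_put(obj);
                    obj = NULL;
            }
            rcu_read_unlock();
            return obj;
    }
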
@@ -1728,7 +1728,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) | |||
1728 | 1728 | ||
1729 | freelist = page->freelist; | 1729 | freelist = page->freelist; |
1730 | slab_destroy_debugcheck(cachep, page); | 1730 | slab_destroy_debugcheck(cachep, page); |
1731 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) | 1731 | if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU)) |
1732 | call_rcu(&page->rcu_head, kmem_rcu_free); | 1732 | call_rcu(&page->rcu_head, kmem_rcu_free); |
1733 | else | 1733 | else |
1734 | kmem_freepages(cachep, page); | 1734 | kmem_freepages(cachep, page); |
@@ -1924,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, | |||
1924 | 1924 | ||
1925 | cachep->num = 0; | 1925 | cachep->num = 0; |
1926 | 1926 | ||
1927 | if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) | 1927 | if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) |
1928 | return false; | 1928 | return false; |
1929 | 1929 | ||
1930 | left = calculate_slab_order(cachep, size, | 1930 | left = calculate_slab_order(cachep, size, |
@@ -2030,7 +2030,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2030 | if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + | 2030 | if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + |
2031 | 2 * sizeof(unsigned long long))) | 2031 | 2 * sizeof(unsigned long long))) |
2032 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; | 2032 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; |
2033 | if (!(flags & SLAB_DESTROY_BY_RCU)) | 2033 | if (!(flags & SLAB_TYPESAFE_BY_RCU)) |
2034 | flags |= SLAB_POISON; | 2034 | flags |= SLAB_POISON; |
2035 | #endif | 2035 | #endif |
2036 | #endif | 2036 | #endif |
@@ -126,7 +126,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, | |||
126 | 126 | ||
127 | /* Legal flag mask for kmem_cache_create(), for various configurations */ | 127 | /* Legal flag mask for kmem_cache_create(), for various configurations */ |
128 | #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ | 128 | #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ |
129 | SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) | 129 | SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) |
130 | 130 | ||
131 | #if defined(CONFIG_DEBUG_SLAB) | 131 | #if defined(CONFIG_DEBUG_SLAB) |
132 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) | 132 | #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) |
@@ -415,7 +415,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) | |||
415 | * back there or track user information then we can | 415 | * back there or track user information then we can |
416 | * only use the space before that information. | 416 | * only use the space before that information. |
417 | */ | 417 | */ |
418 | if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) | 418 | if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) |
419 | return s->inuse; | 419 | return s->inuse; |
420 | /* | 420 | /* |
421 | * Else we can use all the padding etc for the allocation | 421 | * Else we can use all the padding etc for the allocation |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 09d0e849b07f..01a0fe2eb332 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -39,7 +39,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, | |||
39 | * Set of flags that will prevent slab merging | 39 | * Set of flags that will prevent slab merging |
40 | */ | 40 | */ |
41 | #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 41 | #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
42 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ | 42 | SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ |
43 | SLAB_FAILSLAB | SLAB_KASAN) | 43 | SLAB_FAILSLAB | SLAB_KASAN) |
44 | 44 | ||
45 | #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ | 45 | #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ |
@@ -500,7 +500,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) | |||
500 | struct kmem_cache *s, *s2; | 500 | struct kmem_cache *s, *s2; |
501 | 501 | ||
502 | /* | 502 | /* |
503 | * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the | 503 | * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the |
504 | * @slab_caches_to_rcu_destroy list. The slab pages are freed | 504 | * @slab_caches_to_rcu_destroy list. The slab pages are freed |
505 | * through RCU and and the associated kmem_cache are dereferenced | 505 | * through RCU and and the associated kmem_cache are dereferenced |
506 | * while freeing the pages, so the kmem_caches should be freed only | 506 | * while freeing the pages, so the kmem_caches should be freed only |
@@ -537,7 +537,7 @@ static int shutdown_cache(struct kmem_cache *s) | |||
537 | memcg_unlink_cache(s); | 537 | memcg_unlink_cache(s); |
538 | list_del(&s->list); | 538 | list_del(&s->list); |
539 | 539 | ||
540 | if (s->flags & SLAB_DESTROY_BY_RCU) { | 540 | if (s->flags & SLAB_TYPESAFE_BY_RCU) { |
541 | list_add_tail(&s->list, &slab_caches_to_rcu_destroy); | 541 | list_add_tail(&s->list, &slab_caches_to_rcu_destroy); |
542 | schedule_work(&slab_caches_to_rcu_destroy_work); | 542 | schedule_work(&slab_caches_to_rcu_destroy_work); |
543 | } else { | 543 | } else { |
@@ -126,7 +126,7 @@ static inline void clear_slob_page_free(struct page *sp) | |||
126 | 126 | ||
127 | /* | 127 | /* |
128 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which | 128 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which |
129 | * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free | 129 | * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free |
130 | * the block using call_rcu. | 130 | * the block using call_rcu. |
131 | */ | 131 | */ |
132 | struct slob_rcu { | 132 | struct slob_rcu { |
@@ -524,7 +524,7 @@ EXPORT_SYMBOL(ksize); | |||
524 | 524 | ||
525 | int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) | 525 | int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) |
526 | { | 526 | { |
527 | if (flags & SLAB_DESTROY_BY_RCU) { | 527 | if (flags & SLAB_TYPESAFE_BY_RCU) { |
528 | /* leave room for rcu footer at the end of object */ | 528 | /* leave room for rcu footer at the end of object */ |
529 | c->size += sizeof(struct slob_rcu); | 529 | c->size += sizeof(struct slob_rcu); |
530 | } | 530 | } |
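The room reserved here at the end of each object holds a per-object RCU callback, which the free path (next hunk) hands to call_rcu() so the memory is returned only after a grace period. In its generic form the deferred-free idiom looks roughly like the sketch below; all names are hypothetical.

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct example_node {
            struct rcu_head rcu;
            int payload;
    };

    static void example_node_free_rcu(struct rcu_head *head)
    {
            kfree(container_of(head, struct example_node, rcu));
    }

    static void example_node_release(struct example_node *p)
    {
            /* Readers that found p before it was unlinked may still be using
             * it, so defer the kfree() until after a grace period.
             */
            call_rcu(&p->rcu, example_node_free_rcu);
    }
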
@@ -598,7 +598,7 @@ static void kmem_rcu_free(struct rcu_head *head) | |||
598 | void kmem_cache_free(struct kmem_cache *c, void *b) | 598 | void kmem_cache_free(struct kmem_cache *c, void *b) |
599 | { | 599 | { |
600 | kmemleak_free_recursive(b, c->flags); | 600 | kmemleak_free_recursive(b, c->flags); |
601 | if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { | 601 | if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { |
602 | struct slob_rcu *slob_rcu; | 602 | struct slob_rcu *slob_rcu; |
603 | slob_rcu = b + (c->size - sizeof(struct slob_rcu)); | 603 | slob_rcu = b + (c->size - sizeof(struct slob_rcu)); |
604 | slob_rcu->size = c->size; | 604 | slob_rcu->size = c->size; |
@@ -1687,7 +1687,7 @@ static void rcu_free_slab(struct rcu_head *h) | |||
1687 | 1687 | ||
1688 | static void free_slab(struct kmem_cache *s, struct page *page) | 1688 | static void free_slab(struct kmem_cache *s, struct page *page) |
1689 | { | 1689 | { |
1690 | if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { | 1690 | if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { |
1691 | struct rcu_head *head; | 1691 | struct rcu_head *head; |
1692 | 1692 | ||
1693 | if (need_reserve_slab_rcu) { | 1693 | if (need_reserve_slab_rcu) { |
@@ -2963,7 +2963,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, | |||
2963 | * slab_free_freelist_hook() could have put the items into quarantine. | 2963 | * slab_free_freelist_hook() could have put the items into quarantine. |
2964 | * If so, no need to free them. | 2964 | * If so, no need to free them. |
2965 | */ | 2965 | */ |
2966 | if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) | 2966 | if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) |
2967 | return; | 2967 | return; |
2968 | do_slab_free(s, page, head, tail, cnt, addr); | 2968 | do_slab_free(s, page, head, tail, cnt, addr); |
2969 | } | 2969 | } |
@@ -3433,7 +3433,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3433 | * the slab may touch the object after free or before allocation | 3433 | * the slab may touch the object after free or before allocation |
3434 | * then we should never poison the object itself. | 3434 | * then we should never poison the object itself. |
3435 | */ | 3435 | */ |
3436 | if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && | 3436 | if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) && |
3437 | !s->ctor) | 3437 | !s->ctor) |
3438 | s->flags |= __OBJECT_POISON; | 3438 | s->flags |= __OBJECT_POISON; |
3439 | else | 3439 | else |
@@ -3455,7 +3455,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3455 | */ | 3455 | */ |
3456 | s->inuse = size; | 3456 | s->inuse = size; |
3457 | 3457 | ||
3458 | if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || | 3458 | if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || |
3459 | s->ctor)) { | 3459 | s->ctor)) { |
3460 | /* | 3460 | /* |
3461 | * Relocate free pointer after the object if it is not | 3461 | * Relocate free pointer after the object if it is not |
@@ -3537,7 +3537,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) | |||
3537 | s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); | 3537 | s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); |
3538 | s->reserved = 0; | 3538 | s->reserved = 0; |
3539 | 3539 | ||
3540 | if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) | 3540 | if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) |
3541 | s->reserved = sizeof(struct rcu_head); | 3541 | s->reserved = sizeof(struct rcu_head); |
3542 | 3542 | ||
3543 | if (!calculate_sizes(s, -1)) | 3543 | if (!calculate_sizes(s, -1)) |
@@ -5042,7 +5042,7 @@ SLAB_ATTR_RO(cache_dma); | |||
5042 | 5042 | ||
5043 | static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) | 5043 | static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) |
5044 | { | 5044 | { |
5045 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); | 5045 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); |
5046 | } | 5046 | } |
5047 | SLAB_ATTR_RO(destroy_by_rcu); | 5047 | SLAB_ATTR_RO(destroy_by_rcu); |
5048 | 5048 | ||
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b99168b0fabf..f75482bdee9a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c | |||
@@ -951,7 +951,7 @@ static struct proto dccp_v4_prot = { | |||
951 | .orphan_count = &dccp_orphan_count, | 951 | .orphan_count = &dccp_orphan_count, |
952 | .max_header = MAX_DCCP_HEADER, | 952 | .max_header = MAX_DCCP_HEADER, |
953 | .obj_size = sizeof(struct dccp_sock), | 953 | .obj_size = sizeof(struct dccp_sock), |
954 | .slab_flags = SLAB_DESTROY_BY_RCU, | 954 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
955 | .rsk_prot = &dccp_request_sock_ops, | 955 | .rsk_prot = &dccp_request_sock_ops, |
956 | .twsk_prot = &dccp_timewait_sock_ops, | 956 | .twsk_prot = &dccp_timewait_sock_ops, |
957 | .h.hashinfo = &dccp_hashinfo, | 957 | .h.hashinfo = &dccp_hashinfo, |
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d9b6a4e403e7..840f14aaa016 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c | |||
@@ -1014,7 +1014,7 @@ static struct proto dccp_v6_prot = { | |||
1014 | .orphan_count = &dccp_orphan_count, | 1014 | .orphan_count = &dccp_orphan_count, |
1015 | .max_header = MAX_DCCP_HEADER, | 1015 | .max_header = MAX_DCCP_HEADER, |
1016 | .obj_size = sizeof(struct dccp6_sock), | 1016 | .obj_size = sizeof(struct dccp6_sock), |
1017 | .slab_flags = SLAB_DESTROY_BY_RCU, | 1017 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
1018 | .rsk_prot = &dccp6_request_sock_ops, | 1018 | .rsk_prot = &dccp6_request_sock_ops, |
1019 | .twsk_prot = &dccp6_timewait_sock_ops, | 1019 | .twsk_prot = &dccp6_timewait_sock_ops, |
1020 | .h.hashinfo = &dccp_hashinfo, | 1020 | .h.hashinfo = &dccp_hashinfo, |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 575e19dcc017..265352e1298b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -2402,7 +2402,7 @@ struct proto tcp_prot = { | |||
2402 | .sysctl_rmem = sysctl_tcp_rmem, | 2402 | .sysctl_rmem = sysctl_tcp_rmem, |
2403 | .max_header = MAX_TCP_HEADER, | 2403 | .max_header = MAX_TCP_HEADER, |
2404 | .obj_size = sizeof(struct tcp_sock), | 2404 | .obj_size = sizeof(struct tcp_sock), |
2405 | .slab_flags = SLAB_DESTROY_BY_RCU, | 2405 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
2406 | .twsk_prot = &tcp_timewait_sock_ops, | 2406 | .twsk_prot = &tcp_timewait_sock_ops, |
2407 | .rsk_prot = &tcp_request_sock_ops, | 2407 | .rsk_prot = &tcp_request_sock_ops, |
2408 | .h.hashinfo = &tcp_hashinfo, | 2408 | .h.hashinfo = &tcp_hashinfo, |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 49fa2e8c3fa9..cc01d5fd2e86 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -1921,7 +1921,7 @@ struct proto tcpv6_prot = { | |||
1921 | .sysctl_rmem = sysctl_tcp_rmem, | 1921 | .sysctl_rmem = sysctl_tcp_rmem, |
1922 | .max_header = MAX_TCP_HEADER, | 1922 | .max_header = MAX_TCP_HEADER, |
1923 | .obj_size = sizeof(struct tcp6_sock), | 1923 | .obj_size = sizeof(struct tcp6_sock), |
1924 | .slab_flags = SLAB_DESTROY_BY_RCU, | 1924 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
1925 | .twsk_prot = &tcp6_timewait_sock_ops, | 1925 | .twsk_prot = &tcp6_timewait_sock_ops, |
1926 | .rsk_prot = &tcp6_request_sock_ops, | 1926 | .rsk_prot = &tcp6_request_sock_ops, |
1927 | .h.hashinfo = &tcp_hashinfo, | 1927 | .h.hashinfo = &tcp_hashinfo, |
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index cb4fff785cbf..8364fe5b59e4 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c | |||
@@ -142,7 +142,7 @@ static struct proto llc_proto = { | |||
142 | .name = "LLC", | 142 | .name = "LLC", |
143 | .owner = THIS_MODULE, | 143 | .owner = THIS_MODULE, |
144 | .obj_size = sizeof(struct llc_sock), | 144 | .obj_size = sizeof(struct llc_sock), |
145 | .slab_flags = SLAB_DESTROY_BY_RCU, | 145 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
146 | }; | 146 | }; |
147 | 147 | ||
148 | /** | 148 | /** |
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 8bc5a1bd2d45..9b02c13d258b 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c | |||
@@ -506,7 +506,7 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap, | |||
506 | again: | 506 | again: |
507 | sk_nulls_for_each_rcu(rc, node, laddr_hb) { | 507 | sk_nulls_for_each_rcu(rc, node, laddr_hb) { |
508 | if (llc_estab_match(sap, daddr, laddr, rc)) { | 508 | if (llc_estab_match(sap, daddr, laddr, rc)) { |
509 | /* Extra checks required by SLAB_DESTROY_BY_RCU */ | 509 | /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ |
510 | if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) | 510 | if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) |
511 | goto again; | 511 | goto again; |
512 | if (unlikely(llc_sk(rc)->sap != sap || | 512 | if (unlikely(llc_sk(rc)->sap != sap || |
@@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap, | |||
565 | again: | 565 | again: |
566 | sk_nulls_for_each_rcu(rc, node, laddr_hb) { | 566 | sk_nulls_for_each_rcu(rc, node, laddr_hb) { |
567 | if (llc_listener_match(sap, laddr, rc)) { | 567 | if (llc_listener_match(sap, laddr, rc)) { |
568 | /* Extra checks required by SLAB_DESTROY_BY_RCU */ | 568 | /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ |
569 | if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) | 569 | if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) |
570 | goto again; | 570 | goto again; |
571 | if (unlikely(llc_sk(rc)->sap != sap || | 571 | if (unlikely(llc_sk(rc)->sap != sap || |
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 5404d0d195cc..63b6ab056370 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c | |||
@@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap, | |||
328 | again: | 328 | again: |
329 | sk_nulls_for_each_rcu(rc, node, laddr_hb) { | 329 | sk_nulls_for_each_rcu(rc, node, laddr_hb) { |
330 | if (llc_dgram_match(sap, laddr, rc)) { | 330 | if (llc_dgram_match(sap, laddr, rc)) { |
331 | /* Extra checks required by SLAB_DESTROY_BY_RCU */ | 331 | /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ |
332 | if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) | 332 | if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) |
333 | goto again; | 333 | goto again; |
334 | if (unlikely(llc_sk(rc)->sap != sap || | 334 | if (unlikely(llc_sk(rc)->sap != sap || |
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ffb78e5f7b70..4cf769f9b32a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c | |||
@@ -918,7 +918,7 @@ static unsigned int early_drop_list(struct net *net, | |||
918 | continue; | 918 | continue; |
919 | 919 | ||
920 | /* kill only if still in same netns -- might have moved due to | 920 | /* kill only if still in same netns -- might have moved due to |
921 | * SLAB_DESTROY_BY_RCU rules. | 921 | * SLAB_TYPESAFE_BY_RCU rules. |
922 | * | 922 | * |
923 | * We steal the timer reference. If that fails timer has | 923 | * We steal the timer reference. If that fails timer has |
924 | * already fired or someone else deleted it. Just drop ref | 924 | * already fired or someone else deleted it. Just drop ref |
@@ -1073,7 +1073,7 @@ __nf_conntrack_alloc(struct net *net, | |||
1073 | 1073 | ||
1074 | /* | 1074 | /* |
1075 | * Do not use kmem_cache_zalloc(), as this cache uses | 1075 | * Do not use kmem_cache_zalloc(), as this cache uses |
1076 | * SLAB_DESTROY_BY_RCU. | 1076 | * SLAB_TYPESAFE_BY_RCU. |
1077 | */ | 1077 | */ |
1078 | ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); | 1078 | ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); |
1079 | if (ct == NULL) | 1079 | if (ct == NULL) |
@@ -1118,7 +1118,7 @@ void nf_conntrack_free(struct nf_conn *ct) | |||
1118 | struct net *net = nf_ct_net(ct); | 1118 | struct net *net = nf_ct_net(ct); |
1119 | 1119 | ||
1120 | /* A freed object has refcnt == 0, that's | 1120 | /* A freed object has refcnt == 0, that's |
1121 | * the golden rule for SLAB_DESTROY_BY_RCU | 1121 | * the golden rule for SLAB_TYPESAFE_BY_RCU |
1122 | */ | 1122 | */ |
1123 | NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); | 1123 | NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); |
1124 | 1124 | ||
@@ -1882,7 +1882,7 @@ int nf_conntrack_init_start(void) | |||
1882 | nf_conntrack_cachep = kmem_cache_create("nf_conntrack", | 1882 | nf_conntrack_cachep = kmem_cache_create("nf_conntrack", |
1883 | sizeof(struct nf_conn), | 1883 | sizeof(struct nf_conn), |
1884 | NFCT_INFOMASK + 1, | 1884 | NFCT_INFOMASK + 1, |
1885 | SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); | 1885 | SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); |
1886 | if (!nf_conntrack_cachep) | 1886 | if (!nf_conntrack_cachep) |
1887 | goto err_cachep; | 1887 | goto err_cachep; |
1888 | 1888 | ||
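nf_conntrack shows the allocation half of the contract spelled out by the comments above: the cache is created with SLAB_TYPESAFE_BY_RCU, kmem_cache_zalloc() is avoided because a concurrent reader may still be inspecting a recycled object, and an object only counts as live once its reference count becomes non-zero. A condensed, hypothetical sketch (simplifying the real ordering requirements) using the same example_obj as the lookup sketch earlier:

    #include <linux/atomic.h>
    #include <linux/init.h>
    #include <linux/slab.h>

    struct example_obj {                        /* same hypothetical object as before */
            atomic_t refcnt;
            unsigned long key;
    };

    static struct kmem_cache *example_cache;

    static int __init example_cache_init(void)
    {
            example_cache = kmem_cache_create("example_objs", sizeof(struct example_obj),
                                              0, SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN,
                                              NULL);
            return example_cache ? 0 : -ENOMEM;
    }

    static struct example_obj *example_alloc(unsigned long key, gfp_t gfp)
    {
            /* Not kmem_cache_zalloc(): fields are written individually and the
             * refcount is set last, so a reader holding a stale pointer never
             * sees a half-initialized but "live" object.
             */
            struct example_obj *obj = kmem_cache_alloc(example_cache, gfp);

            if (!obj)
                    return NULL;
            obj->key = key;
            smp_wmb();                          /* initialize before marking live */
            atomic_set(&obj->refcnt, 1);        /* lookups use atomic_inc_not_zero() */
            return obj;
    }
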
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 093803786eac..9659adfe534f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c | |||
@@ -101,7 +101,7 @@ struct proto smc_proto = { | |||
101 | .unhash = smc_unhash_sk, | 101 | .unhash = smc_unhash_sk, |
102 | .obj_size = sizeof(struct smc_sock), | 102 | .obj_size = sizeof(struct smc_sock), |
103 | .h.smc_hash = &smc_v4_hashinfo, | 103 | .h.smc_hash = &smc_v4_hashinfo, |
104 | .slab_flags = SLAB_DESTROY_BY_RCU, | 104 | .slab_flags = SLAB_TYPESAFE_BY_RCU, |
105 | }; | 105 | }; |
106 | EXPORT_SYMBOL_GPL(smc_proto); | 106 | EXPORT_SYMBOL_GPL(smc_proto); |
107 | 107 | ||
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index ea6e373edc27..93eede4e8fbe 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | |||
@@ -170,7 +170,7 @@ qemu_append="`identify_qemu_append "$QEMU"`" | |||
170 | # Pull in Kconfig-fragment boot parameters | 170 | # Pull in Kconfig-fragment boot parameters |
171 | boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" | 171 | boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" |
172 | # Generate kernel-version-specific boot parameters | 172 | # Generate kernel-version-specific boot parameters |
173 | boot_args="`per_version_boot_params "$boot_args" $builddir/.config $seconds`" | 173 | boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`" |
174 | 174 | ||
175 | if test -n "$TORTURE_BUILDONLY" | 175 | if test -n "$TORTURE_BUILDONLY" |
176 | then | 176 | then |