diff options
author | Li Zefan <lizefan@huawei.com> | 2013-06-09 05:16:29 -0400 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2013-06-13 13:48:32 -0400 |
commit | 5c5cc62321d9df7a9a608346fc649c4528380c8f (patch) | |
tree | e7e04867979f4f6920d6d1244057112ceb235390 /kernel/cpuset.c | |
parent | 070b57fcacc9dfc23a180290079078373fb697e1 (diff) |
cpuset: allow to keep tasks in empty cpusets
To achieve this:
- We call update_tasks_cpumask/nodemask() for empty cpusets when
hotplug happens, instead of moving tasks out of them.
- When a cpuset's masks are changed by writing cpuset.cpus/mems,
we also update tasks in child cpusets which are empty.
v3:
- do propagation work in one place for both hotplug and unplug
v2:
- drop rcu_read_lock before calling update_task_nodemask() and
update_task_cpumask(), instead of using workqueue.
- add documentation in include/linux/cgroup.h
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r-- | kernel/cpuset.c | 141 |
1 files changed, 110 insertions, 31 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 82ac1f862cbc..3473dd2580d1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -874,6 +874,45 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) | |||
874 | cgroup_scan_tasks(&scan); | 874 | cgroup_scan_tasks(&scan); |
875 | } | 875 | } |
876 | 876 | ||
877 | /* | ||
878 | * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. | ||
879 | * @root_cs: the root cpuset of the hierarchy | ||
880 | * @update_root: update root cpuset or not? | ||
881 | * @heap: the heap used by cgroup_scan_tasks() | ||
882 | * | ||
883 | * This will update cpumasks of tasks in @root_cs and all other empty cpusets | ||
884 | * which take on cpumask of @root_cs. | ||
885 | * | ||
886 | * Called with cpuset_mutex held | ||
887 | */ | ||
888 | static void update_tasks_cpumask_hier(struct cpuset *root_cs, | ||
889 | bool update_root, struct ptr_heap *heap) | ||
890 | { | ||
891 | struct cpuset *cp; | ||
892 | struct cgroup *pos_cgrp; | ||
893 | |||
894 | if (update_root) | ||
895 | update_tasks_cpumask(root_cs, heap); | ||
896 | |||
897 | rcu_read_lock(); | ||
898 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | ||
899 | /* skip the whole subtree if @cp has some CPUs */ | ||
900 | if (!cpumask_empty(cp->cpus_allowed)) { | ||
901 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | ||
902 | continue; | ||
903 | } | ||
904 | if (!css_tryget(&cp->css)) | ||
905 | continue; | ||
906 | rcu_read_unlock(); | ||
907 | |||
908 | update_tasks_cpumask(cp, heap); | ||
909 | |||
910 | rcu_read_lock(); | ||
911 | css_put(&cp->css); | ||
912 | } | ||
913 | rcu_read_unlock(); | ||
914 | } | ||
915 | |||
877 | /** | 916 | /** |
878 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it | 917 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it |
879 | * @cs: the cpuset to consider | 918 | * @cs: the cpuset to consider |
@@ -925,11 +964,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
925 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); | 964 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); |
926 | mutex_unlock(&callback_mutex); | 965 | mutex_unlock(&callback_mutex); |
927 | 966 | ||
928 | /* | 967 | update_tasks_cpumask_hier(cs, true, &heap); |
929 | * Scan tasks in the cpuset, and update the cpumasks of any | ||
930 | * that need an update. | ||
931 | */ | ||
932 | update_tasks_cpumask(cs, &heap); | ||
933 | 968 | ||
934 | heap_free(&heap); | 969 | heap_free(&heap); |
935 | 970 | ||
@@ -1097,6 +1132,45 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | |||
1097 | } | 1132 | } |
1098 | 1133 | ||
1099 | /* | 1134 | /* |
1135 | * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. | ||
1136 | * @root_cs: the root cpuset of the hierarchy | ||
1137 | * @update_root: update the root cpuset or not? | ||
1138 | * @heap: the heap used by cgroup_scan_tasks() | ||
1139 | * | ||
1140 | * This will update nodemasks of tasks in @root_cs and all other empty cpusets | ||
1141 | * which take on nodemask of @root_cs. | ||
1142 | * | ||
1143 | * Called with cpuset_mutex held | ||
1144 | */ | ||
1145 | static void update_tasks_nodemask_hier(struct cpuset *root_cs, | ||
1146 | bool update_root, struct ptr_heap *heap) | ||
1147 | { | ||
1148 | struct cpuset *cp; | ||
1149 | struct cgroup *pos_cgrp; | ||
1150 | |||
1151 | if (update_root) | ||
1152 | update_tasks_nodemask(root_cs, heap); | ||
1153 | |||
1154 | rcu_read_lock(); | ||
1155 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | ||
1156 | /* skip the whole subtree if @cp has some memory */ | ||
1157 | if (!nodes_empty(cp->mems_allowed)) { | ||
1158 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | ||
1159 | continue; | ||
1160 | } | ||
1161 | if (!css_tryget(&cp->css)) | ||
1162 | continue; | ||
1163 | rcu_read_unlock(); | ||
1164 | |||
1165 | update_tasks_nodemask(cp, heap); | ||
1166 | |||
1167 | rcu_read_lock(); | ||
1168 | css_put(&cp->css); | ||
1169 | } | ||
1170 | rcu_read_unlock(); | ||
1171 | } | ||
1172 | |||
1173 | /* | ||
1100 | * Handle user request to change the 'mems' memory placement | 1174 | * Handle user request to change the 'mems' memory placement |
1101 | * of a cpuset. Needs to validate the request, update the | 1175 | * of a cpuset. Needs to validate the request, update the |
1102 | * cpusets mems_allowed, and for each task in the cpuset, | 1176 | * cpusets mems_allowed, and for each task in the cpuset, |
@@ -1160,7 +1234,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1160 | cs->mems_allowed = trialcs->mems_allowed; | 1234 | cs->mems_allowed = trialcs->mems_allowed; |
1161 | mutex_unlock(&callback_mutex); | 1235 | mutex_unlock(&callback_mutex); |
1162 | 1236 | ||
1163 | update_tasks_nodemask(cs, &heap); | 1237 | update_tasks_nodemask_hier(cs, true, &heap); |
1164 | 1238 | ||
1165 | heap_free(&heap); | 1239 | heap_free(&heap); |
1166 | done: | 1240 | done: |
@@ -2048,6 +2122,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs) | |||
2048 | static cpumask_t off_cpus; | 2122 | static cpumask_t off_cpus; |
2049 | static nodemask_t off_mems; | 2123 | static nodemask_t off_mems; |
2050 | bool is_empty; | 2124 | bool is_empty; |
2125 | bool sane = cgroup_sane_behavior(cs->css.cgroup); | ||
2051 | 2126 | ||
2052 | retry: | 2127 | retry: |
2053 | wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); | 2128 | wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); |
@@ -2066,21 +2141,29 @@ retry: | |||
2066 | cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); | 2141 | cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); |
2067 | nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); | 2142 | nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); |
2068 | 2143 | ||
2069 | /* remove offline cpus from @cs */ | 2144 | mutex_lock(&callback_mutex); |
2070 | if (!cpumask_empty(&off_cpus)) { | 2145 | cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); |
2071 | mutex_lock(&callback_mutex); | 2146 | mutex_unlock(&callback_mutex); |
2072 | cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); | 2147 | |
2073 | mutex_unlock(&callback_mutex); | 2148 | /* |
2149 | * If sane_behavior flag is set, we need to update tasks' cpumask | ||
2150 | * for empty cpuset to take on ancestor's cpumask. | ||
2151 | */ | ||
2152 | if ((sane && cpumask_empty(cs->cpus_allowed)) || | ||
2153 | !cpumask_empty(&off_cpus)) | ||
2074 | update_tasks_cpumask(cs, NULL); | 2154 | update_tasks_cpumask(cs, NULL); |
2075 | } | ||
2076 | 2155 | ||
2077 | /* remove offline mems from @cs */ | 2156 | mutex_lock(&callback_mutex); |
2078 | if (!nodes_empty(off_mems)) { | 2157 | nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); |
2079 | mutex_lock(&callback_mutex); | 2158 | mutex_unlock(&callback_mutex); |
2080 | nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); | 2159 | |
2081 | mutex_unlock(&callback_mutex); | 2160 | /* |
2161 | * If sane_behavior flag is set, we need to update tasks' nodemask | ||
2162 | * for empty cpuset to take on ancestor's nodemask. | ||
2163 | */ | ||
2164 | if ((sane && nodes_empty(cs->mems_allowed)) || | ||
2165 | !nodes_empty(off_mems)) | ||
2082 | update_tasks_nodemask(cs, NULL); | 2166 | update_tasks_nodemask(cs, NULL); |
2083 | } | ||
2084 | 2167 | ||
2085 | is_empty = cpumask_empty(cs->cpus_allowed) || | 2168 | is_empty = cpumask_empty(cs->cpus_allowed) || |
2086 | nodes_empty(cs->mems_allowed); | 2169 | nodes_empty(cs->mems_allowed); |
@@ -2088,11 +2171,13 @@ retry: | |||
2088 | mutex_unlock(&cpuset_mutex); | 2171 | mutex_unlock(&cpuset_mutex); |
2089 | 2172 | ||
2090 | /* | 2173 | /* |
2091 | * If @cs became empty, move tasks to the nearest ancestor with | 2174 | * If sane_behavior flag is set, we'll keep tasks in empty cpusets. |
2092 | * execution resources. This is full cgroup operation which will | 2175 | * |
2176 | * Otherwise move tasks to the nearest ancestor with execution | ||
2177 | * resources. This is full cgroup operation which will | ||
2093 | * also call back into cpuset. Should be done outside any lock. | 2178 | * also call back into cpuset. Should be done outside any lock. |
2094 | */ | 2179 | */ |
2095 | if (is_empty) | 2180 | if (!sane && is_empty) |
2096 | remove_tasks_in_empty_cpuset(cs); | 2181 | remove_tasks_in_empty_cpuset(cs); |
2097 | } | 2182 | } |
2098 | 2183 | ||
@@ -2114,10 +2199,9 @@ retry: | |||
2114 | */ | 2199 | */ |
2115 | static void cpuset_hotplug_workfn(struct work_struct *work) | 2200 | static void cpuset_hotplug_workfn(struct work_struct *work) |
2116 | { | 2201 | { |
2117 | static cpumask_t new_cpus, tmp_cpus; | 2202 | static cpumask_t new_cpus; |
2118 | static nodemask_t new_mems, tmp_mems; | 2203 | static nodemask_t new_mems; |
2119 | bool cpus_updated, mems_updated; | 2204 | bool cpus_updated, mems_updated; |
2120 | bool cpus_offlined, mems_offlined; | ||
2121 | 2205 | ||
2122 | mutex_lock(&cpuset_mutex); | 2206 | mutex_lock(&cpuset_mutex); |
2123 | 2207 | ||
@@ -2126,12 +2210,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2126 | new_mems = node_states[N_MEMORY]; | 2210 | new_mems = node_states[N_MEMORY]; |
2127 | 2211 | ||
2128 | cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); | 2212 | cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); |
2129 | cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, | ||
2130 | &new_cpus); | ||
2131 | |||
2132 | mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); | 2213 | mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); |
2133 | nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); | ||
2134 | mems_offlined = !nodes_empty(tmp_mems); | ||
2135 | 2214 | ||
2136 | /* synchronize cpus_allowed to cpu_active_mask */ | 2215 | /* synchronize cpus_allowed to cpu_active_mask */ |
2137 | if (cpus_updated) { | 2216 | if (cpus_updated) { |
@@ -2151,8 +2230,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2151 | 2230 | ||
2152 | mutex_unlock(&cpuset_mutex); | 2231 | mutex_unlock(&cpuset_mutex); |
2153 | 2232 | ||
2154 | /* if cpus or mems went down, we need to propagate to descendants */ | 2233 | /* if cpus or mems changed, we need to propagate to descendants */ |
2155 | if (cpus_offlined || mems_offlined) { | 2234 | if (cpus_updated || mems_updated) { |
2156 | struct cpuset *cs; | 2235 | struct cpuset *cs; |
2157 | struct cgroup *pos_cgrp; | 2236 | struct cgroup *pos_cgrp; |
2158 | 2237 | ||