summaryrefslogtreecommitdiffstats
path: root/kernel/cpuset.c
diff options
context:
space:
mode:
authorLi Zefan <lizefan@huawei.com>2013-06-09 05:16:29 -0400
committerTejun Heo <tj@kernel.org>2013-06-13 13:48:32 -0400
commit5c5cc62321d9df7a9a608346fc649c4528380c8f (patch)
treee7e04867979f4f6920d6d1244057112ceb235390 /kernel/cpuset.c
parent070b57fcacc9dfc23a180290079078373fb697e1 (diff)
cpuset: allow to keep tasks in empty cpusets
To achieve this: - We call update_tasks_cpumask/nodemask() for empty cpusets when hotplug happens, instead of moving tasks out of them. - When a cpuset's masks are changed by writing cpuset.cpus/mems, we also update tasks in child cpusets which are empty. v3: - do propagation work in one place for both hotplug and unplug v2: - drop rcu_read_lock before calling update_task_nodemask() and update_task_cpumask(), instead of using workqueue. - add documentation in include/linux/cgroup.h Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--kernel/cpuset.c141
1 files changed, 110 insertions, 31 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 82ac1f862cbc..3473dd2580d1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -874,6 +874,45 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
874 cgroup_scan_tasks(&scan); 874 cgroup_scan_tasks(&scan);
875} 875}
876 876
877/*
878 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
879 * @root_cs: the root cpuset of the hierarchy
880 * @update_root: update root cpuset or not?
881 * @heap: the heap used by cgroup_scan_tasks()
882 *
883 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
884 * which take on cpumask of @root_cs.
885 *
886 * Called with cpuset_mutex held
887 */
888static void update_tasks_cpumask_hier(struct cpuset *root_cs,
889 bool update_root, struct ptr_heap *heap)
890{
891 struct cpuset *cp;
892 struct cgroup *pos_cgrp;
893
894 if (update_root)
895 update_tasks_cpumask(root_cs, heap);
896
897 rcu_read_lock();
898 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
899		/* skip the whole subtree if @cp has some CPUs */
900 if (!cpumask_empty(cp->cpus_allowed)) {
901 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
902 continue;
903 }
904 if (!css_tryget(&cp->css))
905 continue;
906 rcu_read_unlock();
907
908 update_tasks_cpumask(cp, heap);
909
910 rcu_read_lock();
911 css_put(&cp->css);
912 }
913 rcu_read_unlock();
914}
915
877/** 916/**
878 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 917 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
879 * @cs: the cpuset to consider 918 * @cs: the cpuset to consider
@@ -925,11 +964,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
925 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 964 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
926 mutex_unlock(&callback_mutex); 965 mutex_unlock(&callback_mutex);
927 966
928 /* 967 update_tasks_cpumask_hier(cs, true, &heap);
929 * Scan tasks in the cpuset, and update the cpumasks of any
930 * that need an update.
931 */
932 update_tasks_cpumask(cs, &heap);
933 968
934 heap_free(&heap); 969 heap_free(&heap);
935 970
@@ -1097,6 +1132,45 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1097} 1132}
1098 1133
1099/* 1134/*
1135 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1136 * @root_cs: the root cpuset of the hierarchy
1137 * @update_root: update the root cpuset or not?
1138 * @heap: the heap used by cgroup_scan_tasks()
1139 *
1140 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1141 * which take on nodemask of @root_cs.
1142 *
1143 * Called with cpuset_mutex held
1144 */
1145static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1146 bool update_root, struct ptr_heap *heap)
1147{
1148 struct cpuset *cp;
1149 struct cgroup *pos_cgrp;
1150
1151 if (update_root)
1152 update_tasks_nodemask(root_cs, heap);
1153
1154 rcu_read_lock();
1155 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
1157		/* skip the whole subtree if @cp has some mems */
1157 if (!nodes_empty(cp->mems_allowed)) {
1158 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
1159 continue;
1160 }
1161 if (!css_tryget(&cp->css))
1162 continue;
1163 rcu_read_unlock();
1164
1165 update_tasks_nodemask(cp, heap);
1166
1167 rcu_read_lock();
1168 css_put(&cp->css);
1169 }
1170 rcu_read_unlock();
1171}
1172
1173/*
1100 * Handle user request to change the 'mems' memory placement 1174 * Handle user request to change the 'mems' memory placement
1101 * of a cpuset. Needs to validate the request, update the 1175 * of a cpuset. Needs to validate the request, update the
1102 * cpusets mems_allowed, and for each task in the cpuset, 1176 * cpusets mems_allowed, and for each task in the cpuset,
@@ -1160,7 +1234,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1160 cs->mems_allowed = trialcs->mems_allowed; 1234 cs->mems_allowed = trialcs->mems_allowed;
1161 mutex_unlock(&callback_mutex); 1235 mutex_unlock(&callback_mutex);
1162 1236
1163 update_tasks_nodemask(cs, &heap); 1237 update_tasks_nodemask_hier(cs, true, &heap);
1164 1238
1165 heap_free(&heap); 1239 heap_free(&heap);
1166done: 1240done:
@@ -2048,6 +2122,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2048 static cpumask_t off_cpus; 2122 static cpumask_t off_cpus;
2049 static nodemask_t off_mems; 2123 static nodemask_t off_mems;
2050 bool is_empty; 2124 bool is_empty;
2125 bool sane = cgroup_sane_behavior(cs->css.cgroup);
2051 2126
2052retry: 2127retry:
2053 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); 2128 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -2066,21 +2141,29 @@ retry:
2066 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); 2141 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2067 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); 2142 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
2068 2143
2069 /* remove offline cpus from @cs */ 2144 mutex_lock(&callback_mutex);
2070 if (!cpumask_empty(&off_cpus)) { 2145 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2071 mutex_lock(&callback_mutex); 2146 mutex_unlock(&callback_mutex);
2072 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); 2147
2073 mutex_unlock(&callback_mutex); 2148 /*
2149 * If sane_behavior flag is set, we need to update tasks' cpumask
2150 * for empty cpuset to take on ancestor's cpumask.
2151 */
2152 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2153 !cpumask_empty(&off_cpus))
2074 update_tasks_cpumask(cs, NULL); 2154 update_tasks_cpumask(cs, NULL);
2075 }
2076 2155
2077 /* remove offline mems from @cs */ 2156 mutex_lock(&callback_mutex);
2078 if (!nodes_empty(off_mems)) { 2157 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
2079 mutex_lock(&callback_mutex); 2158 mutex_unlock(&callback_mutex);
2080 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2159
2081 mutex_unlock(&callback_mutex); 2160 /*
2161 * If sane_behavior flag is set, we need to update tasks' nodemask
2162 * for empty cpuset to take on ancestor's nodemask.
2163 */
2164 if ((sane && nodes_empty(cs->mems_allowed)) ||
2165 !nodes_empty(off_mems))
2082 update_tasks_nodemask(cs, NULL); 2166 update_tasks_nodemask(cs, NULL);
2083 }
2084 2167
2085 is_empty = cpumask_empty(cs->cpus_allowed) || 2168 is_empty = cpumask_empty(cs->cpus_allowed) ||
2086 nodes_empty(cs->mems_allowed); 2169 nodes_empty(cs->mems_allowed);
@@ -2088,11 +2171,13 @@ retry:
2088 mutex_unlock(&cpuset_mutex); 2171 mutex_unlock(&cpuset_mutex);
2089 2172
2090 /* 2173 /*
2091 * If @cs became empty, move tasks to the nearest ancestor with 2174 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
2092 * execution resources. This is full cgroup operation which will 2175 *
2176 * Otherwise move tasks to the nearest ancestor with execution
2177 * resources. This is full cgroup operation which will
2093 * also call back into cpuset. Should be done outside any lock. 2178 * also call back into cpuset. Should be done outside any lock.
2094 */ 2179 */
2095 if (is_empty) 2180 if (!sane && is_empty)
2096 remove_tasks_in_empty_cpuset(cs); 2181 remove_tasks_in_empty_cpuset(cs);
2097} 2182}
2098 2183
@@ -2114,10 +2199,9 @@ retry:
2114 */ 2199 */
2115static void cpuset_hotplug_workfn(struct work_struct *work) 2200static void cpuset_hotplug_workfn(struct work_struct *work)
2116{ 2201{
2117 static cpumask_t new_cpus, tmp_cpus; 2202 static cpumask_t new_cpus;
2118 static nodemask_t new_mems, tmp_mems; 2203 static nodemask_t new_mems;
2119 bool cpus_updated, mems_updated; 2204 bool cpus_updated, mems_updated;
2120 bool cpus_offlined, mems_offlined;
2121 2205
2122 mutex_lock(&cpuset_mutex); 2206 mutex_lock(&cpuset_mutex);
2123 2207
@@ -2126,12 +2210,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2126 new_mems = node_states[N_MEMORY]; 2210 new_mems = node_states[N_MEMORY];
2127 2211
2128 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); 2212 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
2129 cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
2130 &new_cpus);
2131
2132 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); 2213 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
2133 nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
2134 mems_offlined = !nodes_empty(tmp_mems);
2135 2214
2136 /* synchronize cpus_allowed to cpu_active_mask */ 2215 /* synchronize cpus_allowed to cpu_active_mask */
2137 if (cpus_updated) { 2216 if (cpus_updated) {
@@ -2151,8 +2230,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2151 2230
2152 mutex_unlock(&cpuset_mutex); 2231 mutex_unlock(&cpuset_mutex);
2153 2232
2154 /* if cpus or mems went down, we need to propagate to descendants */ 2233 /* if cpus or mems changed, we need to propagate to descendants */
2155 if (cpus_offlined || mems_offlined) { 2234 if (cpus_updated || mems_updated) {
2156 struct cpuset *cs; 2235 struct cpuset *cs;
2157 struct cgroup *pos_cgrp; 2236 struct cgroup *pos_cgrp;
2158 2237