path: root/kernel/cpuset.c
author		Miao Xie <miaox@cn.fujitsu.com>			2009-06-16 18:31:49 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-06-16 22:47:31 -0400
commit		58568d2a8215cb6f55caf2332017d7bdff954e1c (patch)
tree		ffcdee457494ac78d6550b0aeac86536ca152e7b /kernel/cpuset.c
parent		950592f7b991f267d707d372b90f508bbe72acbc (diff)
cpuset,mm: update tasks' mems_allowed in time
Fix allocation of page cache/slab objects on disallowed nodes when memory
spread is enabled, by updating tasks' mems_allowed as soon as their cpuset's
mems is changed.

In order to update tasks' mems_allowed in time, we must also modify the
memory policy code, because a memory policy was originally only applied from
within the owning process's context.

After this patch, one task directly manipulates another's mems_allowed, and
we use alloc_lock in the task_struct to protect the task's mems_allowed and
memory policy.

In the fast path, however, we do not take a lock to protect them, because
adding a lock there could cause a performance regression.  Without a lock,
a task might momentarily see an empty nodemask when its cpuset's
mems_allowed is changed to a non-overlapping set.  To avoid that, updates
are structured as: first set all newly allowed nodes, then clear the newly
disallowed ones.

[lee.schermerhorn@hp.com:
  The rework of mpol_new() to extract the adjusting of the node mask to
  apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
  with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation.
  Fix this by adding the check for MPOL_PREFERRED and an empty node mask to
  mpol_new_mempolicy().

  Remove the now unneeded 'nodes = NULL' from mpol_new().

  Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes'
  parameter now that it has been removed from mpol_new().  Therefore, we
  don't need to test nodes for NULL before testing it for 'empty'.  However,
  just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.]

[lee.schermerhorn@hp.com:
  I don't think the function name 'mpol_new_mempolicy' is descriptive enough
  to differentiate it from mpol_new().

  This function applies the cpuset context, usually constraining nodes to
  those allowed by the cpuset.  However, when the 'RELATIVE_NODES' flag is
  set, it also translates the nodes.  So I settled on 'mpol_set_nodemask()',
  because the comment block for mpol_new() mentions that we need to call
  this function to "set nodes".

  Some additional minor line length, whitespace and typo cleanup.]

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
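For reference, the "set all newly allowed nodes, then clear the newly
disallowed ones" ordering boils down to the sketch below.  It mirrors the
new cpuset_change_task_nodemask() helper added in the diff that follows;
the explanatory comments are added here for illustration and are not part
of the patch.

	/* Called with the task's alloc_lock held. */
	static void cpuset_change_task_nodemask(struct task_struct *tsk,
						nodemask_t *newmems)
	{
		/*
		 * Grow first: a lockless reader in the allocation path sees
		 * old | new, never an empty nodemask, even when the old and
		 * new masks are disjoint.
		 */
		nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
		mpol_rebind_task(tsk, &tsk->mems_allowed);

		/*
		 * Then shrink to exactly the new mask, dropping the newly
		 * disallowed nodes and rebinding the task's mempolicy to it.
		 */
		mpol_rebind_task(tsk, newmems);
		tsk->mems_allowed = *newmems;
	}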
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--	kernel/cpuset.c	184
1 file changed, 46 insertions(+), 138 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index af5a83d52187..7e75a41bd508 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
 
 	struct cpuset *parent;		/* my parent */
 
-	/*
-	 * Copy of global cpuset_mems_generation as of the most
-	 * recent time this cpuset changed its mems_allowed.
-	 */
-	int mems_generation;
-
 	struct fmeter fmeter;		/* memory_pressure filter */
 
 	/* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
 	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
-/*
- * Increment this integer everytime any cpuset changes its
- * mems_allowed value.  Users of cpusets can track this generation
- * number, and avoid having to lock and reload mems_allowed unless
- * the cpuset they're using changes generation.
- *
- * A single, global generation is needed because cpuset_attach_task() could
- * reattach a task to a different cpuset, which must not have its
- * generation numbers aliased with those of that tasks previous cpuset.
- *
- * Generations are needed for mems_allowed because one task cannot
- * modify another's memory placement.  So we must enable every task,
- * on every visit to __alloc_pages(), to efficiently check whether
- * its current->cpuset->mems_allowed has changed, requiring an update
- * of its current->mems_allowed.
- *
- * Since writes to cpuset_mems_generation are guarded by the cgroup lock
- * there is no need to mark it atomic.
- */
-static int cpuset_mems_generation;
-
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
 };
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
  * If a task is only holding callback_mutex, then it has read-only
  * access to cpusets.
  *
- * The task_struct fields mems_allowed and mems_generation may only
- * be accessed in the context of that task, so require no locks.
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
+ * by other task, we use alloc_lock in the task_struct fields to protect
+ * them.
  *
  * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
@@ -349,69 +323,6 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
 		tsk->flags &= ~PF_SPREAD_SLAB;
 }
 
-/**
- * cpuset_update_task_memory_state - update task memory placement
- *
- * If the current tasks cpusets mems_allowed changed behind our
- * backs, update current->mems_allowed, mems_generation and task NUMA
- * mempolicy to the new value.
- *
- * Task mempolicy is updated by rebinding it relative to the
- * current->cpuset if a task has its memory placement changed.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_mutex or task_lock() held.  May be
- * called with or without cgroup_mutex held.  Thanks in part to
- * 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL.  This routine also might acquire callback_mutex during
- * call.
- *
- * Reading current->cpuset->mems_generation doesn't need task_lock
- * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset using RCU.
- *
- * The rcu_dereference() is technically probably not needed,
- * as I don't actually mind if I see a new cpuset pointer but
- * an old value of mems_generation.  However this really only
- * matters on alpha systems using cpusets heavily.  If I dropped
- * that rcu_dereference(), it would save them a memory barrier.
- * For all other arch's, rcu_dereference is a no-op anyway, and for
- * alpha systems not using cpusets, another planned optimization,
- * avoiding the rcu critical section for tasks in the root cpuset
- * which is statically allocated, so can't vanish, will make this
- * irrelevant.  Better to use RCU as intended, than to engage in
- * some cute trick to save a memory barrier that is impossible to
- * test, for alpha systems using cpusets heavily, which might not
- * even exist.
- *
- * This routine is needed to update the per-task mems_allowed data,
- * within the tasks context, when it is trying to allocate memory
- * (in various mm/mempolicy.c routines) and notices that some other
- * task has been modifying its cpuset.
- */
-
-void cpuset_update_task_memory_state(void)
-{
-	int my_cpusets_mem_gen;
-	struct task_struct *tsk = current;
-	struct cpuset *cs;
-
-	rcu_read_lock();
-	my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
-	rcu_read_unlock();
-
-	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
-		mutex_lock(&callback_mutex);
-		task_lock(tsk);
-		cs = task_cs(tsk);	/* Maybe changed when task not locked */
-		guarantee_online_mems(cs, &tsk->mems_allowed);
-		tsk->cpuset_mems_generation = cs->mems_generation;
-		task_unlock(tsk);
-		mutex_unlock(&callback_mutex);
-		mpol_rebind_task(tsk, &tsk->mems_allowed);
-	}
-}
-
 /*
  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
  *
@@ -1017,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  * other task, the task_struct mems_allowed that we are hacking
  * is for our current task, which must allocate new pages for that
  * migrating memory region.
- *
- * We call cpuset_update_task_memory_state() before hacking
- * our tasks mems_allowed, so that we are assured of being in
- * sync with our tasks cpuset, and in particular, callbacks to
- * cpuset_update_task_memory_state() from nested page allocations
- * won't see any mismatch of our cpuset and task mems_generation
- * values, so won't overwrite our hacked tasks mems_allowed
- * nodemask.
  */
 
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1032,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 {
 	struct task_struct *tsk = current;
 
-	cpuset_update_task_memory_state();
-
-	mutex_lock(&callback_mutex);
 	tsk->mems_allowed = *to;
-	mutex_unlock(&callback_mutex);
 
 	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 
-	mutex_lock(&callback_mutex);
 	guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
-	mutex_unlock(&callback_mutex);
 }
 
 /*
- * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
- * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
+ * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
+ * @tsk: the task to change
+ * @newmems: new nodes that the task will be set
+ *
+ * In order to avoid seeing no nodes if the old and new nodes are disjoint,
+ * we structure updates as setting all new allowed nodes, then clearing newly
+ * disallowed ones.
+ *
+ * Called with task's alloc_lock held
+ */
+static void cpuset_change_task_nodemask(struct task_struct *tsk,
+					nodemask_t *newmems)
+{
+	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+	mpol_rebind_task(tsk, &tsk->mems_allowed);
+	mpol_rebind_task(tsk, newmems);
+	tsk->mems_allowed = *newmems;
+}
+
+/*
+ * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
+ * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
+ * memory_migrate flag is set. Called with cgroup_mutex held.
  */
 static void cpuset_change_nodemask(struct task_struct *p,
 					struct cgroup_scanner *scan)
@@ -1056,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
+	nodemask_t newmems;
+
+	cs = cgroup_cs(scan->cg);
+	guarantee_online_mems(cs, &newmems);
+
+	task_lock(p);
+	cpuset_change_task_nodemask(p, &newmems);
+	task_unlock(p);
 
 	mm = get_task_mm(p);
 	if (!mm)
 		return;
 
-	cs = cgroup_cs(scan->cg);
 	migrate = is_memory_migrate(cs);
 
 	mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1114,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 /*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset.  Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
+ * cpusets mems_allowed, and for each task in the cpuset,
+ * update mems_allowed and rebind task's mempolicy and any vma
+ * mempolicies and if the cpuset is marked 'memory_migrate',
+ * migrate the tasks pages to the new memory.
  *
  * Call with cgroup_mutex held.  May take callback_mutex during call.
  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1170,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs->mems_allowed;
-	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
 	update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1434,15 +1358,18 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
+		to = node_possible_map;
 	} else {
-		mutex_lock(&callback_mutex);
 		guarantee_online_cpus(cs, cpus_attach);
-		mutex_unlock(&callback_mutex);
+		guarantee_online_mems(cs, &to);
 	}
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	if (err)
 		return;
 
+	task_lock(tsk);
+	cpuset_change_task_nodemask(tsk, &to);
+	task_unlock(tsk);
 	cpuset_update_task_spread_flag(cs, tsk);
 
 	from = oldcs->mems_allowed;
@@ -1848,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
 	struct cpuset *parent;
 
 	if (!cont->parent) {
-		/* This is early initialization for the top cgroup */
-		top_cpuset.mems_generation = cpuset_mems_generation++;
 		return &top_cpuset.css;
 	}
 	parent = cgroup_cs(cont->parent);
@@ -1861,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
 		return ERR_PTR(-ENOMEM);
 	}
 
-	cpuset_update_task_memory_state();
 	cs->flags = 0;
 	if (is_spread_page(parent))
 		set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1870,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
 		set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	cpumask_clear(cs->cpus_allowed);
 	nodes_clear(cs->mems_allowed);
-	cs->mems_generation = cpuset_mems_generation++;
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 
@@ -1889,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct cpuset *cs = cgroup_cs(cont);
 
-	cpuset_update_task_memory_state();
-
 	if (is_sched_load_balance(cs))
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
@@ -1911,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
 	.early_init = 1,
 };
 
-/*
- * cpuset_init_early - just enough so that the calls to
- * cpuset_update_task_memory_state() in early init code
- * are harmless.
- */
-
-int __init cpuset_init_early(void)
-{
-	alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
-
-	top_cpuset.mems_generation = cpuset_mems_generation++;
-	return 0;
-}
-
-
 /**
  * cpuset_init - initialize cpusets at system boot
  *
@@ -1936,11 +1842,13 @@ int __init cpuset_init(void)
 {
 	int err = 0;
 
+	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+		BUG();
+
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 
 	fmeter_init(&top_cpuset.fmeter);
-	top_cpuset.mems_generation = cpuset_mems_generation++;
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
 	top_cpuset.relax_domain_level = -1;
 