aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Miao Xie <miaox@cn.fujitsu.com> 2009-06-16 18:31:49 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-06-16 22:47:31 -0400
commit: 58568d2a8215cb6f55caf2332017d7bdff954e1c (patch)
tree: ffcdee457494ac78d6550b0aeac86536ca152e7b
parent: 950592f7b991f267d707d372b90f508bbe72acbc (diff)
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab objects on an unallowed node when memory spread is set, by updating each task's mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the memory policy code, because the memory policy was originally applied in the process's own context. After applying this patch, one task directly manipulates another's mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we do not use the lock to protect them, because adding a lock may lead to a performance regression. But if we don't add a lock, the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mempolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that the 'nodes = NULL' assignment has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES' flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". 
Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/cpuset.h 13
-rw-r--r-- include/linux/sched.h 8
-rw-r--r-- init/main.c 6
-rw-r--r-- kernel/cpuset.c 184
-rw-r--r-- kernel/kthread.c 2
-rw-r--r-- mm/mempolicy.c 143
-rw-r--r-- mm/page_alloc.c 5
7 files changed, 170 insertions, 191 deletions
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 05ea1dd7d681..a5740fc4d04b 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -18,7 +18,6 @@
18 18
19extern int number_of_cpusets; /* How many cpusets are defined in system? */ 19extern int number_of_cpusets; /* How many cpusets are defined in system? */
20 20
21extern int cpuset_init_early(void);
22extern int cpuset_init(void); 21extern int cpuset_init(void);
23extern void cpuset_init_smp(void); 22extern void cpuset_init_smp(void);
24extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); 23extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
@@ -27,7 +26,6 @@ extern void cpuset_cpus_allowed_locked(struct task_struct *p,
27extern nodemask_t cpuset_mems_allowed(struct task_struct *p); 26extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
28#define cpuset_current_mems_allowed (current->mems_allowed) 27#define cpuset_current_mems_allowed (current->mems_allowed)
29void cpuset_init_current_mems_allowed(void); 28void cpuset_init_current_mems_allowed(void);
30void cpuset_update_task_memory_state(void);
31int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); 29int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
32 30
33extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask); 31extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask);
@@ -92,9 +90,13 @@ extern void rebuild_sched_domains(void);
92 90
93extern void cpuset_print_task_mems_allowed(struct task_struct *p); 91extern void cpuset_print_task_mems_allowed(struct task_struct *p);
94 92
93static inline void set_mems_allowed(nodemask_t nodemask)
94{
95 current->mems_allowed = nodemask;
96}
97
95#else /* !CONFIG_CPUSETS */ 98#else /* !CONFIG_CPUSETS */
96 99
97static inline int cpuset_init_early(void) { return 0; }
98static inline int cpuset_init(void) { return 0; } 100static inline int cpuset_init(void) { return 0; }
99static inline void cpuset_init_smp(void) {} 101static inline void cpuset_init_smp(void) {}
100 102
@@ -116,7 +118,6 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
116 118
117#define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY]) 119#define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY])
118static inline void cpuset_init_current_mems_allowed(void) {} 120static inline void cpuset_init_current_mems_allowed(void) {}
119static inline void cpuset_update_task_memory_state(void) {}
120 121
121static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) 122static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
122{ 123{
@@ -188,6 +189,10 @@ static inline void cpuset_print_task_mems_allowed(struct task_struct *p)
188{ 189{
189} 190}
190 191
192static inline void set_mems_allowed(nodemask_t nodemask)
193{
194}
195
191#endif /* !CONFIG_CPUSETS */ 196#endif /* !CONFIG_CPUSETS */
192 197
193#endif /* _LINUX_CPUSET_H */ 198#endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c900aa530070..1048bf50540a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1318,7 +1318,8 @@ struct task_struct {
1318/* Thread group tracking */ 1318/* Thread group tracking */
1319 u32 parent_exec_id; 1319 u32 parent_exec_id;
1320 u32 self_exec_id; 1320 u32 self_exec_id;
1321/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ 1321/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
1322 * mempolicy */
1322 spinlock_t alloc_lock; 1323 spinlock_t alloc_lock;
1323 1324
1324#ifdef CONFIG_GENERIC_HARDIRQS 1325#ifdef CONFIG_GENERIC_HARDIRQS
@@ -1386,8 +1387,7 @@ struct task_struct {
1386 cputime_t acct_timexpd; /* stime + utime since last update */ 1387 cputime_t acct_timexpd; /* stime + utime since last update */
1387#endif 1388#endif
1388#ifdef CONFIG_CPUSETS 1389#ifdef CONFIG_CPUSETS
1389 nodemask_t mems_allowed; 1390 nodemask_t mems_allowed; /* Protected by alloc_lock */
1390 int cpuset_mems_generation;
1391 int cpuset_mem_spread_rotor; 1391 int cpuset_mem_spread_rotor;
1392#endif 1392#endif
1393#ifdef CONFIG_CGROUPS 1393#ifdef CONFIG_CGROUPS
@@ -1410,7 +1410,7 @@ struct task_struct {
1410 struct list_head perf_counter_list; 1410 struct list_head perf_counter_list;
1411#endif 1411#endif
1412#ifdef CONFIG_NUMA 1412#ifdef CONFIG_NUMA
1413 struct mempolicy *mempolicy; 1413 struct mempolicy *mempolicy; /* Protected by alloc_lock */
1414 short il_next; 1414 short il_next;
1415#endif 1415#endif
1416 atomic_t fs_excl; /* holding fs exclusive resources */ 1416 atomic_t fs_excl; /* holding fs exclusive resources */
diff --git a/init/main.c b/init/main.c
index f6204f712e7c..5e0d3f047eaf 100644
--- a/init/main.c
+++ b/init/main.c
@@ -670,7 +670,6 @@ asmlinkage void __init start_kernel(void)
670 initrd_start = 0; 670 initrd_start = 0;
671 } 671 }
672#endif 672#endif
673 cpuset_init_early();
674 page_cgroup_init(); 673 page_cgroup_init();
675 enable_debug_pagealloc(); 674 enable_debug_pagealloc();
676 cpu_hotplug_init(); 675 cpu_hotplug_init();
@@ -867,6 +866,11 @@ static noinline int init_post(void)
867static int __init kernel_init(void * unused) 866static int __init kernel_init(void * unused)
868{ 867{
869 lock_kernel(); 868 lock_kernel();
869
870 /*
871 * init can allocate pages on any node
872 */
873 set_mems_allowed(node_possible_map);
870 /* 874 /*
871 * init can run on any cpu. 875 * init can run on any cpu.
872 */ 876 */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index af5a83d52187..7e75a41bd508 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
97 97
98 struct cpuset *parent; /* my parent */ 98 struct cpuset *parent; /* my parent */
99 99
100 /*
101 * Copy of global cpuset_mems_generation as of the most
102 * recent time this cpuset changed its mems_allowed.
103 */
104 int mems_generation;
105
106 struct fmeter fmeter; /* memory_pressure filter */ 100 struct fmeter fmeter; /* memory_pressure filter */
107 101
108 /* partition number for rebuild_sched_domains() */ 102 /* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
176 return test_bit(CS_SPREAD_SLAB, &cs->flags); 170 return test_bit(CS_SPREAD_SLAB, &cs->flags);
177} 171}
178 172
179/*
180 * Increment this integer everytime any cpuset changes its
181 * mems_allowed value. Users of cpusets can track this generation
182 * number, and avoid having to lock and reload mems_allowed unless
183 * the cpuset they're using changes generation.
184 *
185 * A single, global generation is needed because cpuset_attach_task() could
186 * reattach a task to a different cpuset, which must not have its
187 * generation numbers aliased with those of that tasks previous cpuset.
188 *
189 * Generations are needed for mems_allowed because one task cannot
190 * modify another's memory placement. So we must enable every task,
191 * on every visit to __alloc_pages(), to efficiently check whether
192 * its current->cpuset->mems_allowed has changed, requiring an update
193 * of its current->mems_allowed.
194 *
195 * Since writes to cpuset_mems_generation are guarded by the cgroup lock
196 * there is no need to mark it atomic.
197 */
198static int cpuset_mems_generation;
199
200static struct cpuset top_cpuset = { 173static struct cpuset top_cpuset = {
201 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 174 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
202}; 175};
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
228 * If a task is only holding callback_mutex, then it has read-only 201 * If a task is only holding callback_mutex, then it has read-only
229 * access to cpusets. 202 * access to cpusets.
230 * 203 *
231 * The task_struct fields mems_allowed and mems_generation may only 204 * Now, the task_struct fields mems_allowed and mempolicy may be changed
232 * be accessed in the context of that task, so require no locks. 205 * by other task, we use alloc_lock in the task_struct fields to protect
206 * them.
233 * 207 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 208 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 209 * small pieces of code, such as when reading out possibly multi-word
@@ -349,69 +323,6 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
349 tsk->flags &= ~PF_SPREAD_SLAB; 323 tsk->flags &= ~PF_SPREAD_SLAB;
350} 324}
351 325
352/**
353 * cpuset_update_task_memory_state - update task memory placement
354 *
355 * If the current tasks cpusets mems_allowed changed behind our
356 * backs, update current->mems_allowed, mems_generation and task NUMA
357 * mempolicy to the new value.
358 *
359 * Task mempolicy is updated by rebinding it relative to the
360 * current->cpuset if a task has its memory placement changed.
361 * Do not call this routine if in_interrupt().
362 *
363 * Call without callback_mutex or task_lock() held. May be
364 * called with or without cgroup_mutex held. Thanks in part to
365 * 'the_top_cpuset_hack', the task's cpuset pointer will never
366 * be NULL. This routine also might acquire callback_mutex during
367 * call.
368 *
369 * Reading current->cpuset->mems_generation doesn't need task_lock
370 * to guard the current->cpuset derefence, because it is guarded
371 * from concurrent freeing of current->cpuset using RCU.
372 *
373 * The rcu_dereference() is technically probably not needed,
374 * as I don't actually mind if I see a new cpuset pointer but
375 * an old value of mems_generation. However this really only
376 * matters on alpha systems using cpusets heavily. If I dropped
377 * that rcu_dereference(), it would save them a memory barrier.
378 * For all other arch's, rcu_dereference is a no-op anyway, and for
379 * alpha systems not using cpusets, another planned optimization,
380 * avoiding the rcu critical section for tasks in the root cpuset
381 * which is statically allocated, so can't vanish, will make this
382 * irrelevant. Better to use RCU as intended, than to engage in
383 * some cute trick to save a memory barrier that is impossible to
384 * test, for alpha systems using cpusets heavily, which might not
385 * even exist.
386 *
387 * This routine is needed to update the per-task mems_allowed data,
388 * within the tasks context, when it is trying to allocate memory
389 * (in various mm/mempolicy.c routines) and notices that some other
390 * task has been modifying its cpuset.
391 */
392
393void cpuset_update_task_memory_state(void)
394{
395 int my_cpusets_mem_gen;
396 struct task_struct *tsk = current;
397 struct cpuset *cs;
398
399 rcu_read_lock();
400 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
401 rcu_read_unlock();
402
403 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
404 mutex_lock(&callback_mutex);
405 task_lock(tsk);
406 cs = task_cs(tsk); /* Maybe changed when task not locked */
407 guarantee_online_mems(cs, &tsk->mems_allowed);
408 tsk->cpuset_mems_generation = cs->mems_generation;
409 task_unlock(tsk);
410 mutex_unlock(&callback_mutex);
411 mpol_rebind_task(tsk, &tsk->mems_allowed);
412 }
413}
414
415/* 326/*
416 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? 327 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
417 * 328 *
@@ -1017,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1017 * other task, the task_struct mems_allowed that we are hacking 928 * other task, the task_struct mems_allowed that we are hacking
1018 * is for our current task, which must allocate new pages for that 929 * is for our current task, which must allocate new pages for that
1019 * migrating memory region. 930 * migrating memory region.
1020 *
1021 * We call cpuset_update_task_memory_state() before hacking
1022 * our tasks mems_allowed, so that we are assured of being in
1023 * sync with our tasks cpuset, and in particular, callbacks to
1024 * cpuset_update_task_memory_state() from nested page allocations
1025 * won't see any mismatch of our cpuset and task mems_generation
1026 * values, so won't overwrite our hacked tasks mems_allowed
1027 * nodemask.
1028 */ 931 */
1029 932
1030static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 933static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1032,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1032{ 935{
1033 struct task_struct *tsk = current; 936 struct task_struct *tsk = current;
1034 937
1035 cpuset_update_task_memory_state();
1036
1037 mutex_lock(&callback_mutex);
1038 tsk->mems_allowed = *to; 938 tsk->mems_allowed = *to;
1039 mutex_unlock(&callback_mutex);
1040 939
1041 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 940 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
1042 941
1043 mutex_lock(&callback_mutex);
1044 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 942 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
1045 mutex_unlock(&callback_mutex);
1046} 943}
1047 944
1048/* 945/*
1049 * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new 946 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
1050 * nodes if memory_migrate flag is set. Called with cgroup_mutex held. 947 * @tsk: the task to change
948 * @newmems: new nodes that the task will be set
949 *
950 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
951 * we structure updates as setting all new allowed nodes, then clearing newly
952 * disallowed ones.
953 *
954 * Called with task's alloc_lock held
955 */
956static void cpuset_change_task_nodemask(struct task_struct *tsk,
957 nodemask_t *newmems)
958{
959 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
960 mpol_rebind_task(tsk, &tsk->mems_allowed);
961 mpol_rebind_task(tsk, newmems);
962 tsk->mems_allowed = *newmems;
963}
964
965/*
966 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
967 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
968 * memory_migrate flag is set. Called with cgroup_mutex held.
1051 */ 969 */
1052static void cpuset_change_nodemask(struct task_struct *p, 970static void cpuset_change_nodemask(struct task_struct *p,
1053 struct cgroup_scanner *scan) 971 struct cgroup_scanner *scan)
@@ -1056,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
1056 struct cpuset *cs; 974 struct cpuset *cs;
1057 int migrate; 975 int migrate;
1058 const nodemask_t *oldmem = scan->data; 976 const nodemask_t *oldmem = scan->data;
977 nodemask_t newmems;
978
979 cs = cgroup_cs(scan->cg);
980 guarantee_online_mems(cs, &newmems);
981
982 task_lock(p);
983 cpuset_change_task_nodemask(p, &newmems);
984 task_unlock(p);
1059 985
1060 mm = get_task_mm(p); 986 mm = get_task_mm(p);
1061 if (!mm) 987 if (!mm)
1062 return; 988 return;
1063 989
1064 cs = cgroup_cs(scan->cg);
1065 migrate = is_memory_migrate(cs); 990 migrate = is_memory_migrate(cs);
1066 991
1067 mpol_rebind_mm(mm, &cs->mems_allowed); 992 mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1114,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1114/* 1039/*
1115 * Handle user request to change the 'mems' memory placement 1040 * Handle user request to change the 'mems' memory placement
1116 * of a cpuset. Needs to validate the request, update the 1041 * of a cpuset. Needs to validate the request, update the
1117 * cpusets mems_allowed and mems_generation, and for each 1042 * cpusets mems_allowed, and for each task in the cpuset,
1118 * task in the cpuset, rebind any vma mempolicies and if 1043 * update mems_allowed and rebind task's mempolicy and any vma
1119 * the cpuset is marked 'memory_migrate', migrate the tasks 1044 * mempolicies and if the cpuset is marked 'memory_migrate',
1120 * pages to the new memory. 1045 * migrate the tasks pages to the new memory.
1121 * 1046 *
1122 * Call with cgroup_mutex held. May take callback_mutex during call. 1047 * Call with cgroup_mutex held. May take callback_mutex during call.
1123 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1048 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1170,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1170 1095
1171 mutex_lock(&callback_mutex); 1096 mutex_lock(&callback_mutex);
1172 cs->mems_allowed = trialcs->mems_allowed; 1097 cs->mems_allowed = trialcs->mems_allowed;
1173 cs->mems_generation = cpuset_mems_generation++;
1174 mutex_unlock(&callback_mutex); 1098 mutex_unlock(&callback_mutex);
1175 1099
1176 update_tasks_nodemask(cs, &oldmem, &heap); 1100 update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1434,15 +1358,18 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1434 1358
1435 if (cs == &top_cpuset) { 1359 if (cs == &top_cpuset) {
1436 cpumask_copy(cpus_attach, cpu_possible_mask); 1360 cpumask_copy(cpus_attach, cpu_possible_mask);
1361 to = node_possible_map;
1437 } else { 1362 } else {
1438 mutex_lock(&callback_mutex);
1439 guarantee_online_cpus(cs, cpus_attach); 1363 guarantee_online_cpus(cs, cpus_attach);
1440 mutex_unlock(&callback_mutex); 1364 guarantee_online_mems(cs, &to);
1441 } 1365 }
1442 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1443 if (err) 1367 if (err)
1444 return; 1368 return;
1445 1369
1370 task_lock(tsk);
1371 cpuset_change_task_nodemask(tsk, &to);
1372 task_unlock(tsk);
1446 cpuset_update_task_spread_flag(cs, tsk); 1373 cpuset_update_task_spread_flag(cs, tsk);
1447 1374
1448 from = oldcs->mems_allowed; 1375 from = oldcs->mems_allowed;
@@ -1848,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
1848 struct cpuset *parent; 1775 struct cpuset *parent;
1849 1776
1850 if (!cont->parent) { 1777 if (!cont->parent) {
1851 /* This is early initialization for the top cgroup */
1852 top_cpuset.mems_generation = cpuset_mems_generation++;
1853 return &top_cpuset.css; 1778 return &top_cpuset.css;
1854 } 1779 }
1855 parent = cgroup_cs(cont->parent); 1780 parent = cgroup_cs(cont->parent);
@@ -1861,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
1861 return ERR_PTR(-ENOMEM); 1786 return ERR_PTR(-ENOMEM);
1862 } 1787 }
1863 1788
1864 cpuset_update_task_memory_state();
1865 cs->flags = 0; 1789 cs->flags = 0;
1866 if (is_spread_page(parent)) 1790 if (is_spread_page(parent))
1867 set_bit(CS_SPREAD_PAGE, &cs->flags); 1791 set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1870,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
1870 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1794 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1871 cpumask_clear(cs->cpus_allowed); 1795 cpumask_clear(cs->cpus_allowed);
1872 nodes_clear(cs->mems_allowed); 1796 nodes_clear(cs->mems_allowed);
1873 cs->mems_generation = cpuset_mems_generation++;
1874 fmeter_init(&cs->fmeter); 1797 fmeter_init(&cs->fmeter);
1875 cs->relax_domain_level = -1; 1798 cs->relax_domain_level = -1;
1876 1799
@@ -1889,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1889{ 1812{
1890 struct cpuset *cs = cgroup_cs(cont); 1813 struct cpuset *cs = cgroup_cs(cont);
1891 1814
1892 cpuset_update_task_memory_state();
1893
1894 if (is_sched_load_balance(cs)) 1815 if (is_sched_load_balance(cs))
1895 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1816 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1896 1817
@@ -1911,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
1911 .early_init = 1, 1832 .early_init = 1,
1912}; 1833};
1913 1834
1914/*
1915 * cpuset_init_early - just enough so that the calls to
1916 * cpuset_update_task_memory_state() in early init code
1917 * are harmless.
1918 */
1919
1920int __init cpuset_init_early(void)
1921{
1922 alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
1923
1924 top_cpuset.mems_generation = cpuset_mems_generation++;
1925 return 0;
1926}
1927
1928
1929/** 1835/**
1930 * cpuset_init - initialize cpusets at system boot 1836 * cpuset_init - initialize cpusets at system boot
1931 * 1837 *
@@ -1936,11 +1842,13 @@ int __init cpuset_init(void)
1936{ 1842{
1937 int err = 0; 1843 int err = 0;
1938 1844
1845 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1846 BUG();
1847
1939 cpumask_setall(top_cpuset.cpus_allowed); 1848 cpumask_setall(top_cpuset.cpus_allowed);
1940 nodes_setall(top_cpuset.mems_allowed); 1849 nodes_setall(top_cpuset.mems_allowed);
1941 1850
1942 fmeter_init(&top_cpuset.fmeter); 1851 fmeter_init(&top_cpuset.fmeter);
1943 top_cpuset.mems_generation = cpuset_mems_generation++;
1944 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 1852 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1945 top_cpuset.relax_domain_level = -1; 1853 top_cpuset.relax_domain_level = -1;
1946 1854
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 41c88fe40500..7fa441333529 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,6 +9,7 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/completion.h> 10#include <linux/completion.h>
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/cpuset.h>
12#include <linux/unistd.h> 13#include <linux/unistd.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/module.h> 15#include <linux/module.h>
@@ -236,6 +237,7 @@ int kthreadd(void *unused)
236 ignore_signals(tsk); 237 ignore_signals(tsk);
237 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 238 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
238 set_cpus_allowed_ptr(tsk, cpu_all_mask); 239 set_cpus_allowed_ptr(tsk, cpu_all_mask);
240 set_mems_allowed(node_possible_map);
239 241
240 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 242 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
241 243
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3eb4a6fdc043..46bdf9ddf2ba 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
182 return 0; 182 return 0;
183} 183}
184 184
185/* Create a new policy */ 185/*
186 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
187 * any, for the new policy. mpol_new() has already validated the nodes
188 * parameter with respect to the policy mode and flags. But, we need to
189 * handle an empty nodemask with MPOL_PREFERRED here.
190 *
191 * Must be called holding task's alloc_lock to protect task's mems_allowed
192 * and mempolicy. May also be called holding the mmap_semaphore for write.
193 */
194static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
195{
196 nodemask_t cpuset_context_nmask;
197 int ret;
198
199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200 if (pol == NULL)
201 return 0;
202
203 VM_BUG_ON(!nodes);
204 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
205 nodes = NULL; /* explicit local allocation */
206 else {
207 if (pol->flags & MPOL_F_RELATIVE_NODES)
208 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
209 &cpuset_current_mems_allowed);
210 else
211 nodes_and(cpuset_context_nmask, *nodes,
212 cpuset_current_mems_allowed);
213 if (mpol_store_user_nodemask(pol))
214 pol->w.user_nodemask = *nodes;
215 else
216 pol->w.cpuset_mems_allowed =
217 cpuset_current_mems_allowed;
218 }
219
220 ret = mpol_ops[pol->mode].create(pol,
221 nodes ? &cpuset_context_nmask : NULL);
222 return ret;
223}
224
225/*
226 * This function just creates a new policy, does some check and simple
227 * initialization. You must invoke mpol_set_nodemask() to set nodes.
228 */
186static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 229static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
187 nodemask_t *nodes) 230 nodemask_t *nodes)
188{ 231{
189 struct mempolicy *policy; 232 struct mempolicy *policy;
190 nodemask_t cpuset_context_nmask;
191 int ret;
192 233
193 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 234 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
194 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); 235 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
210 if (((flags & MPOL_F_STATIC_NODES) || 251 if (((flags & MPOL_F_STATIC_NODES) ||
211 (flags & MPOL_F_RELATIVE_NODES))) 252 (flags & MPOL_F_RELATIVE_NODES)))
212 return ERR_PTR(-EINVAL); 253 return ERR_PTR(-EINVAL);
213 nodes = NULL; /* flag local alloc */
214 } 254 }
215 } else if (nodes_empty(*nodes)) 255 } else if (nodes_empty(*nodes))
216 return ERR_PTR(-EINVAL); 256 return ERR_PTR(-EINVAL);
@@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
221 policy->mode = mode; 261 policy->mode = mode;
222 policy->flags = flags; 262 policy->flags = flags;
223 263
224 if (nodes) {
225 /*
226 * cpuset related setup doesn't apply to local allocation
227 */
228 cpuset_update_task_memory_state();
229 if (flags & MPOL_F_RELATIVE_NODES)
230 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
231 &cpuset_current_mems_allowed);
232 else
233 nodes_and(cpuset_context_nmask, *nodes,
234 cpuset_current_mems_allowed);
235 if (mpol_store_user_nodemask(policy))
236 policy->w.user_nodemask = *nodes;
237 else
238 policy->w.cpuset_mems_allowed =
239 cpuset_mems_allowed(current);
240 }
241
242 ret = mpol_ops[mode].create(policy,
243 nodes ? &cpuset_context_nmask : NULL);
244 if (ret < 0) {
245 kmem_cache_free(policy_cache, policy);
246 return ERR_PTR(ret);
247 }
248 return policy; 264 return policy;
249} 265}
250 266
@@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
324/* 340/*
325 * Wrapper for mpol_rebind_policy() that just requires task 341 * Wrapper for mpol_rebind_policy() that just requires task
326 * pointer, and updates task mempolicy. 342 * pointer, and updates task mempolicy.
343 *
344 * Called with task's alloc_lock held.
327 */ 345 */
328 346
329void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 347void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
@@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void)
600static long do_set_mempolicy(unsigned short mode, unsigned short flags, 618static long do_set_mempolicy(unsigned short mode, unsigned short flags,
601 nodemask_t *nodes) 619 nodemask_t *nodes)
602{ 620{
603 struct mempolicy *new; 621 struct mempolicy *new, *old;
604 struct mm_struct *mm = current->mm; 622 struct mm_struct *mm = current->mm;
623 int ret;
605 624
606 new = mpol_new(mode, flags, nodes); 625 new = mpol_new(mode, flags, nodes);
607 if (IS_ERR(new)) 626 if (IS_ERR(new))
@@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
615 */ 634 */
616 if (mm) 635 if (mm)
617 down_write(&mm->mmap_sem); 636 down_write(&mm->mmap_sem);
618 mpol_put(current->mempolicy); 637 task_lock(current);
638 ret = mpol_set_nodemask(new, nodes);
639 if (ret) {
640 task_unlock(current);
641 if (mm)
642 up_write(&mm->mmap_sem);
643 mpol_put(new);
644 return ret;
645 }
646 old = current->mempolicy;
619 current->mempolicy = new; 647 current->mempolicy = new;
620 mpol_set_task_struct_flag(); 648 mpol_set_task_struct_flag();
621 if (new && new->mode == MPOL_INTERLEAVE && 649 if (new && new->mode == MPOL_INTERLEAVE &&
622 nodes_weight(new->v.nodes)) 650 nodes_weight(new->v.nodes))
623 current->il_next = first_node(new->v.nodes); 651 current->il_next = first_node(new->v.nodes);
652 task_unlock(current);
624 if (mm) 653 if (mm)
625 up_write(&mm->mmap_sem); 654 up_write(&mm->mmap_sem);
626 655
656 mpol_put(old);
627 return 0; 657 return 0;
628} 658}
629 659
630/* 660/*
631 * Return nodemask for policy for get_mempolicy() query 661 * Return nodemask for policy for get_mempolicy() query
662 *
663 * Called with task's alloc_lock held
632 */ 664 */
633static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) 665static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
634{ 666{
@@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
674 struct vm_area_struct *vma = NULL; 706 struct vm_area_struct *vma = NULL;
675 struct mempolicy *pol = current->mempolicy; 707 struct mempolicy *pol = current->mempolicy;
676 708
677 cpuset_update_task_memory_state();
678 if (flags & 709 if (flags &
679 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 710 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
680 return -EINVAL; 711 return -EINVAL;
@@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
683 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 714 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
684 return -EINVAL; 715 return -EINVAL;
685 *policy = 0; /* just so it's initialized */ 716 *policy = 0; /* just so it's initialized */
717 task_lock(current);
686 *nmask = cpuset_current_mems_allowed; 718 *nmask = cpuset_current_mems_allowed;
719 task_unlock(current);
687 return 0; 720 return 0;
688 } 721 }
689 722
@@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
738 } 771 }
739 772
740 err = 0; 773 err = 0;
741 if (nmask) 774 if (nmask) {
775 task_lock(current);
742 get_policy_nodemask(pol, nmask); 776 get_policy_nodemask(pol, nmask);
777 task_unlock(current);
778 }
743 779
744 out: 780 out:
745 mpol_cond_put(pol); 781 mpol_cond_put(pol);
@@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len,
979 return err; 1015 return err;
980 } 1016 }
981 down_write(&mm->mmap_sem); 1017 down_write(&mm->mmap_sem);
1018 task_lock(current);
1019 err = mpol_set_nodemask(new, nmask);
1020 task_unlock(current);
1021 if (err) {
1022 up_write(&mm->mmap_sem);
1023 mpol_put(new);
1024 return err;
1025 }
982 vma = check_range(mm, start, end, nmask, 1026 vma = check_range(mm, start, end, nmask,
983 flags | MPOL_MF_INVERT, &pagelist); 1027 flags | MPOL_MF_INVERT, &pagelist);
984 1028
@@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1545 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1589 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1546 struct zonelist *zl; 1590 struct zonelist *zl;
1547 1591
1548 cpuset_update_task_memory_state();
1549
1550 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1592 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1551 unsigned nid; 1593 unsigned nid;
1552 1594
@@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1593{ 1635{
1594 struct mempolicy *pol = current->mempolicy; 1636 struct mempolicy *pol = current->mempolicy;
1595 1637
1596 if ((gfp & __GFP_WAIT) && !in_interrupt())
1597 cpuset_update_task_memory_state();
1598 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1638 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1599 pol = &default_policy; 1639 pol = &default_policy;
1600 1640
@@ -1854,6 +1894,8 @@ restart:
1854 */ 1894 */
1855void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 1895void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1856{ 1896{
1897 int ret;
1898
1857 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 1899 sp->root = RB_ROOT; /* empty tree == default mempolicy */
1858 spin_lock_init(&sp->lock); 1900 spin_lock_init(&sp->lock);
1859 1901
@@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1863 1905
1864 /* contextualize the tmpfs mount point mempolicy */ 1906 /* contextualize the tmpfs mount point mempolicy */
1865 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 1907 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1866 mpol_put(mpol); /* drop our ref on sb mpol */ 1908 if (IS_ERR(new)) {
1867 if (IS_ERR(new)) 1909 mpol_put(mpol); /* drop our ref on sb mpol */
1868 return; /* no valid nodemask intersection */ 1910 return; /* no valid nodemask intersection */
1911 }
1912
1913 task_lock(current);
1914 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
1915 task_unlock(current);
1916 mpol_put(mpol); /* drop our ref on sb mpol */
1917 if (ret) {
1918 mpol_put(new);
1919 return;
1920 }
1869 1921
1870 /* Create pseudo-vma that contains just the policy */ 1922 /* Create pseudo-vma that contains just the policy */
1871 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1923 memset(&pvma, 0, sizeof(struct vm_area_struct));
@@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2086 new = mpol_new(mode, mode_flags, &nodes); 2138 new = mpol_new(mode, mode_flags, &nodes);
2087 if (IS_ERR(new)) 2139 if (IS_ERR(new))
2088 err = 1; 2140 err = 1;
2089 else if (no_context) 2141 else {
2090 new->w.user_nodemask = nodes; /* save for contextualization */ 2142 int ret;
2143
2144 task_lock(current);
2145 ret = mpol_set_nodemask(new, &nodes);
2146 task_unlock(current);
2147 if (ret)
2148 err = 1;
2149 else if (no_context) {
2150 /* save for contextualization */
2151 new->w.user_nodemask = nodes;
2152 }
2153 }
2091 2154
2092out: 2155out:
2093 /* Restore string for error message */ 2156 /* Restore string for error message */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17d5f539a9aa..7cc3179e3591 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1569,10 +1569,7 @@ nofail_alloc:
1569 1569
1570 /* We now go into synchronous reclaim */ 1570 /* We now go into synchronous reclaim */
1571 cpuset_memory_pressure_bump(); 1571 cpuset_memory_pressure_bump();
1572 /* 1572
1573 * The task's cpuset might have expanded its set of allowable nodes
1574 */
1575 cpuset_update_task_memory_state();
1576 p->flags |= PF_MEMALLOC; 1573 p->flags |= PF_MEMALLOC;
1577 1574
1578 lockdep_set_current_reclaim_state(gfp_mask); 1575 lockdep_set_current_reclaim_state(gfp_mask);