-rw-r--r--	include/linux/cpuset.h	 13
-rw-r--r--	include/linux/sched.h	  8
-rw-r--r--	init/main.c	  6
-rw-r--r--	kernel/cpuset.c	184
-rw-r--r--	kernel/kthread.c	  2
-rw-r--r--	mm/mempolicy.c	143
-rw-r--r--	mm/page_alloc.c	  5
7 files changed, 170 insertions(+), 191 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 05ea1dd7d681..a5740fc4d04b 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -18,7 +18,6 @@
 
 extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 
-extern int cpuset_init_early(void);
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
@@ -27,7 +26,6 @@ extern void cpuset_cpus_allowed_locked(struct task_struct *p,
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
-void cpuset_update_task_memory_state(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
 extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask);
@@ -92,9 +90,13 @@ extern void rebuild_sched_domains(void);
 
 extern void cpuset_print_task_mems_allowed(struct task_struct *p);
 
+static inline void set_mems_allowed(nodemask_t nodemask)
+{
+	current->mems_allowed = nodemask;
+}
+
 #else /* !CONFIG_CPUSETS */
 
-static inline int cpuset_init_early(void) { return 0; }
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
@@ -116,7 +118,6 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 
 #define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY])
 static inline void cpuset_init_current_mems_allowed(void) {}
-static inline void cpuset_update_task_memory_state(void) {}
 
 static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 {
@@ -188,6 +189,10 @@ static inline void cpuset_print_task_mems_allowed(struct task_struct *p)
 {
 }
 
+static inline void set_mems_allowed(nodemask_t nodemask)
+{
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
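
With CONFIG_CPUSETS=y, the new set_mems_allowed() helper assigns a nodemask
directly to current->mems_allowed; with CONFIG_CPUSETS=n it compiles to an
empty stub, so callers need no #ifdef. A minimal usage sketch (illustrative
only, not part of the patch; example_allow_all_nodes() is a made-up name):

	#include <linux/cpuset.h>
	#include <linux/nodemask.h>

	static void example_allow_all_nodes(void)
	{
		/* Let the current task allocate on every possible node,
		 * as kthreadd and init do later in this patch. */
		set_mems_allowed(node_possible_map);
	}
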
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c900aa530070..1048bf50540a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1318,7 +1318,8 @@ struct task_struct {
 /* Thread group tracking */
 	u32 parent_exec_id;
 	u32 self_exec_id;
-/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
+/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
+ * mempolicy */
 	spinlock_t alloc_lock;
 
 #ifdef CONFIG_GENERIC_HARDIRQS
@@ -1386,8 +1387,7 @@ struct task_struct {
 	cputime_t acct_timexpd;	/* stime + utime since last update */
 #endif
 #ifdef CONFIG_CPUSETS
-	nodemask_t mems_allowed;
-	int cpuset_mems_generation;
+	nodemask_t mems_allowed;	/* Protected by alloc_lock */
 	int cpuset_mem_spread_rotor;
 #endif
 #ifdef CONFIG_CGROUPS
@@ -1410,7 +1410,7 @@ struct task_struct {
 	struct list_head perf_counter_list;
 #endif
 #ifdef CONFIG_NUMA
-	struct mempolicy *mempolicy;
+	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
 	short il_next;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
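
Both annotations matter: a task's mems_allowed and mempolicy can now be
rewritten by another task (via cpuset_change_task_nodemask() below), so any
cross-task reader must hold alloc_lock to see a consistent multi-word
nodemask. A sketch of the implied access pattern (snapshot_mems_allowed() is
a hypothetical helper, not in the patch):

	#include <linux/sched.h>
	#include <linux/nodemask.h>

	static nodemask_t snapshot_mems_allowed(struct task_struct *p)
	{
		nodemask_t mask;

		task_lock(p);		/* takes p->alloc_lock */
		mask = p->mems_allowed;	/* copy is racy without the lock */
		task_unlock(p);

		return mask;
	}
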
diff --git a/init/main.c b/init/main.c
index f6204f712e7c..5e0d3f047eaf 100644
--- a/init/main.c
+++ b/init/main.c
@@ -670,7 +670,6 @@ asmlinkage void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
-	cpuset_init_early();
 	page_cgroup_init();
 	enable_debug_pagealloc();
 	cpu_hotplug_init();
@@ -867,6 +866,11 @@ static noinline int init_post(void)
 static int __init kernel_init(void * unused)
 {
 	lock_kernel();
+
+	/*
+	 * init can allocate pages on any node
+	 */
+	set_mems_allowed(node_possible_map);
 	/*
 	 * init can run on any cpu.
 	 */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index af5a83d52187..7e75a41bd508 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
 
 	struct cpuset *parent;		/* my parent */
 
-	/*
-	 * Copy of global cpuset_mems_generation as of the most
-	 * recent time this cpuset changed its mems_allowed.
-	 */
-	int mems_generation;
-
 	struct fmeter fmeter;		/* memory_pressure filter */
 
 	/* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
 	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
-/*
- * Increment this integer everytime any cpuset changes its
- * mems_allowed value. Users of cpusets can track this generation
- * number, and avoid having to lock and reload mems_allowed unless
- * the cpuset they're using changes generation.
- *
- * A single, global generation is needed because cpuset_attach_task() could
- * reattach a task to a different cpuset, which must not have its
- * generation numbers aliased with those of that tasks previous cpuset.
- *
- * Generations are needed for mems_allowed because one task cannot
- * modify another's memory placement. So we must enable every task,
- * on every visit to __alloc_pages(), to efficiently check whether
- * its current->cpuset->mems_allowed has changed, requiring an update
- * of its current->mems_allowed.
- *
- * Since writes to cpuset_mems_generation are guarded by the cgroup lock
- * there is no need to mark it atomic.
- */
-static int cpuset_mems_generation;
-
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
 };
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
  * If a task is only holding callback_mutex, then it has read-only
  * access to cpusets.
  *
- * The task_struct fields mems_allowed and mems_generation may only
- * be accessed in the context of that task, so require no locks.
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
+ * by other tasks, so we use alloc_lock in the task_struct to protect
+ * them.
  *
  * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
@@ -349,69 +323,6 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
 	tsk->flags &= ~PF_SPREAD_SLAB;
 }
 
-/**
- * cpuset_update_task_memory_state - update task memory placement
- *
- * If the current tasks cpusets mems_allowed changed behind our
- * backs, update current->mems_allowed, mems_generation and task NUMA
- * mempolicy to the new value.
- *
- * Task mempolicy is updated by rebinding it relative to the
- * current->cpuset if a task has its memory placement changed.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_mutex or task_lock() held. May be
- * called with or without cgroup_mutex held. Thanks in part to
- * 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL. This routine also might acquire callback_mutex during
- * call.
- *
- * Reading current->cpuset->mems_generation doesn't need task_lock
- * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset using RCU.
- *
- * The rcu_dereference() is technically probably not needed,
- * as I don't actually mind if I see a new cpuset pointer but
- * an old value of mems_generation. However this really only
- * matters on alpha systems using cpusets heavily. If I dropped
- * that rcu_dereference(), it would save them a memory barrier.
- * For all other arch's, rcu_dereference is a no-op anyway, and for
- * alpha systems not using cpusets, another planned optimization,
- * avoiding the rcu critical section for tasks in the root cpuset
- * which is statically allocated, so can't vanish, will make this
- * irrelevant. Better to use RCU as intended, than to engage in
- * some cute trick to save a memory barrier that is impossible to
- * test, for alpha systems using cpusets heavily, which might not
- * even exist.
- *
- * This routine is needed to update the per-task mems_allowed data,
- * within the tasks context, when it is trying to allocate memory
- * (in various mm/mempolicy.c routines) and notices that some other
- * task has been modifying its cpuset.
- */
-
-void cpuset_update_task_memory_state(void)
-{
-	int my_cpusets_mem_gen;
-	struct task_struct *tsk = current;
-	struct cpuset *cs;
-
-	rcu_read_lock();
-	my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
-	rcu_read_unlock();
-
-	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
-		mutex_lock(&callback_mutex);
-		task_lock(tsk);
-		cs = task_cs(tsk);	/* Maybe changed when task not locked */
-		guarantee_online_mems(cs, &tsk->mems_allowed);
-		tsk->cpuset_mems_generation = cs->mems_generation;
-		task_unlock(tsk);
-		mutex_unlock(&callback_mutex);
-		mpol_rebind_task(tsk, &tsk->mems_allowed);
-	}
-}
-
 /*
  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
  *
@@ -1017,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  * other task, the task_struct mems_allowed that we are hacking
  * is for our current task, which must allocate new pages for that
  * migrating memory region.
- *
- * We call cpuset_update_task_memory_state() before hacking
- * our tasks mems_allowed, so that we are assured of being in
- * sync with our tasks cpuset, and in particular, callbacks to
- * cpuset_update_task_memory_state() from nested page allocations
- * won't see any mismatch of our cpuset and task mems_generation
- * values, so won't overwrite our hacked tasks mems_allowed
- * nodemask.
  */
 
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1032,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 {
 	struct task_struct *tsk = current;
 
-	cpuset_update_task_memory_state();
-
-	mutex_lock(&callback_mutex);
 	tsk->mems_allowed = *to;
-	mutex_unlock(&callback_mutex);
 
 	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 
-	mutex_lock(&callback_mutex);
 	guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
-	mutex_unlock(&callback_mutex);
 }
 
 /*
- * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
- * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
+ * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
+ * @tsk: the task to change
+ * @newmems: the new nodes the task will be bound to
+ *
+ * In order to avoid seeing no nodes if the old and new nodes are disjoint,
+ * we structure updates as setting all new allowed nodes, then clearing newly
+ * disallowed ones.
+ *
+ * Called with task's alloc_lock held
+ */
+static void cpuset_change_task_nodemask(struct task_struct *tsk,
+					nodemask_t *newmems)
+{
+	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+	mpol_rebind_task(tsk, &tsk->mems_allowed);
+	mpol_rebind_task(tsk, newmems);
+	tsk->mems_allowed = *newmems;
+}
+
+/*
+ * Update the task's mems_allowed, rebind its mempolicy and its vmas'
+ * mempolicies to the cpuset's new mems_allowed, and migrate pages to the
+ * new nodes if the memory_migrate flag is set. Called with cgroup_mutex held.
  */
 static void cpuset_change_nodemask(struct task_struct *p,
 				struct cgroup_scanner *scan)
@@ -1056,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
+	nodemask_t newmems;
+
+	cs = cgroup_cs(scan->cg);
+	guarantee_online_mems(cs, &newmems);
+
+	task_lock(p);
+	cpuset_change_task_nodemask(p, &newmems);
+	task_unlock(p);
 
 	mm = get_task_mm(p);
 	if (!mm)
 		return;
 
-	cs = cgroup_cs(scan->cg);
 	migrate = is_memory_migrate(cs);
 
 	mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1114,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 /*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset. Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
+ * cpusets mems_allowed, and for each task in the cpuset,
+ * update mems_allowed and rebind task's mempolicy and any vma
+ * mempolicies and if the cpuset is marked 'memory_migrate',
+ * migrate the tasks pages to the new memory.
  *
  * Call with cgroup_mutex held. May take callback_mutex during call.
  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1170,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs->mems_allowed;
-	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
 	update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1434,15 +1358,18 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
+		to = node_possible_map;
 	} else {
-		mutex_lock(&callback_mutex);
 		guarantee_online_cpus(cs, cpus_attach);
-		mutex_unlock(&callback_mutex);
+		guarantee_online_mems(cs, &to);
 	}
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	if (err)
 		return;
 
+	task_lock(tsk);
+	cpuset_change_task_nodemask(tsk, &to);
+	task_unlock(tsk);
 	cpuset_update_task_spread_flag(cs, tsk);
 
 	from = oldcs->mems_allowed;
@@ -1848,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
 	struct cpuset *parent;
 
 	if (!cont->parent) {
-		/* This is early initialization for the top cgroup */
-		top_cpuset.mems_generation = cpuset_mems_generation++;
 		return &top_cpuset.css;
 	}
 	parent = cgroup_cs(cont->parent);
@@ -1861,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
 		return ERR_PTR(-ENOMEM);
 	}
 
-	cpuset_update_task_memory_state();
 	cs->flags = 0;
 	if (is_spread_page(parent))
 		set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1870,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
 		set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	cpumask_clear(cs->cpus_allowed);
 	nodes_clear(cs->mems_allowed);
-	cs->mems_generation = cpuset_mems_generation++;
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 
@@ -1889,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct cpuset *cs = cgroup_cs(cont);
 
-	cpuset_update_task_memory_state();
-
 	if (is_sched_load_balance(cs))
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
@@ -1911,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
 	.early_init = 1,
 };
 
-/*
- * cpuset_init_early - just enough so that the calls to
- * cpuset_update_task_memory_state() in early init code
- * are harmless.
- */
-
-int __init cpuset_init_early(void)
-{
-	alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
-
-	top_cpuset.mems_generation = cpuset_mems_generation++;
-	return 0;
-}
-
-
 /**
  * cpuset_init - initialize cpusets at system boot
  *
@@ -1936,11 +1842,13 @@ int __init cpuset_init(void)
 {
 	int err = 0;
 
+	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+		BUG();
+
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 
 	fmeter_init(&top_cpuset.fmeter);
-	top_cpuset.mems_generation = cpuset_mems_generation++;
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
 	top_cpuset.relax_domain_level = -1;
 
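
The update ordering inside cpuset_change_task_nodemask() is the heart of the
fix: nodemask_t may span several words, so a plain store could be observed
half-written, and with disjoint old and new masks a reader could even see an
empty mask. A worked example with hypothetical masks, annotating the patch's
own four statements:

	/* old tsk->mems_allowed = { 0, 1 },  *newmems = { 2, 3 } */
	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
					/* readers see { 0, 1, 2, 3 }, never empty */
	mpol_rebind_task(tsk, &tsk->mems_allowed);
					/* rebind mempolicy against the union */
	mpol_rebind_task(tsk, newmems);	/* then against the final mask */
	tsk->mems_allowed = *newmems;	/* readers see { 2, 3 } */
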
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 41c88fe40500..7fa441333529 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,6 +9,7 @@
 #include <linux/kthread.h>
 #include <linux/completion.h>
 #include <linux/err.h>
+#include <linux/cpuset.h>
 #include <linux/unistd.h>
 #include <linux/file.h>
 #include <linux/module.h>
@@ -236,6 +237,7 @@ int kthreadd(void *unused)
 	ignore_signals(tsk);
 	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
 	set_cpus_allowed_ptr(tsk, cpu_all_mask);
+	set_mems_allowed(node_possible_map);
 
 	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3eb4a6fdc043..46bdf9ddf2ba 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 	return 0;
 }
 
-/* Create a new policy */
+/*
+ * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
+ * any, for the new policy. mpol_new() has already validated the nodes
+ * parameter with respect to the policy mode and flags. But, we need to
+ * handle an empty nodemask with MPOL_PREFERRED here.
+ *
+ * Must be called holding task's alloc_lock to protect task's mems_allowed
+ * and mempolicy. May also be called holding the mmap_semaphore for write.
+ */
+static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
+{
+	nodemask_t cpuset_context_nmask;
+	int ret;
+
+	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
+	if (pol == NULL)
+		return 0;
+
+	VM_BUG_ON(!nodes);
+	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
+		nodes = NULL;	/* explicit local allocation */
+	else {
+		if (pol->flags & MPOL_F_RELATIVE_NODES)
+			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
+					       &cpuset_current_mems_allowed);
+		else
+			nodes_and(cpuset_context_nmask, *nodes,
+				  cpuset_current_mems_allowed);
+		if (mpol_store_user_nodemask(pol))
+			pol->w.user_nodemask = *nodes;
+		else
+			pol->w.cpuset_mems_allowed =
+						cpuset_current_mems_allowed;
+	}
+
+	ret = mpol_ops[pol->mode].create(pol,
+				nodes ? &cpuset_context_nmask : NULL);
+	return ret;
+}
+
+/*
+ * This function just creates a new policy, does some check and simple
+ * initialization. You must invoke mpol_set_nodemask() to set nodes.
+ */
 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 				  nodemask_t *nodes)
 {
 	struct mempolicy *policy;
-	nodemask_t cpuset_context_nmask;
-	int ret;
 
 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 			if (((flags & MPOL_F_STATIC_NODES) ||
 			     (flags & MPOL_F_RELATIVE_NODES)))
 				return ERR_PTR(-EINVAL);
-			nodes = NULL;	/* flag local alloc */
 		}
 	} else if (nodes_empty(*nodes))
 		return ERR_PTR(-EINVAL);
@@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	policy->mode = mode;
 	policy->flags = flags;
 
-	if (nodes) {
-		/*
-		 * cpuset related setup doesn't apply to local allocation
-		 */
-		cpuset_update_task_memory_state();
-		if (flags & MPOL_F_RELATIVE_NODES)
-			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
-					       &cpuset_current_mems_allowed);
-		else
-			nodes_and(cpuset_context_nmask, *nodes,
-				  cpuset_current_mems_allowed);
-		if (mpol_store_user_nodemask(policy))
-			policy->w.user_nodemask = *nodes;
-		else
-			policy->w.cpuset_mems_allowed =
-						cpuset_mems_allowed(current);
-	}
-
-	ret = mpol_ops[mode].create(policy,
-				nodes ? &cpuset_context_nmask : NULL);
-	if (ret < 0) {
-		kmem_cache_free(policy_cache, policy);
-		return ERR_PTR(ret);
-	}
 	return policy;
 }
 
@@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 /*
  * Wrapper for mpol_rebind_policy() that just requires task
  * pointer, and updates task mempolicy.
+ *
+ * Called with task's alloc_lock held.
  */
 
 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
@@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void)
 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 			     nodemask_t *nodes)
 {
-	struct mempolicy *new;
+	struct mempolicy *new, *old;
 	struct mm_struct *mm = current->mm;
+	int ret;
 
 	new = mpol_new(mode, flags, nodes);
 	if (IS_ERR(new))
@@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	 */
 	if (mm)
 		down_write(&mm->mmap_sem);
-	mpol_put(current->mempolicy);
+	task_lock(current);
+	ret = mpol_set_nodemask(new, nodes);
+	if (ret) {
+		task_unlock(current);
+		if (mm)
+			up_write(&mm->mmap_sem);
+		mpol_put(new);
+		return ret;
+	}
+	old = current->mempolicy;
 	current->mempolicy = new;
 	mpol_set_task_struct_flag();
 	if (new && new->mode == MPOL_INTERLEAVE &&
 	    nodes_weight(new->v.nodes))
 		current->il_next = first_node(new->v.nodes);
+	task_unlock(current);
 	if (mm)
 		up_write(&mm->mmap_sem);
 
+	mpol_put(old);
 	return 0;
 }
 
 /*
  * Return nodemask for policy for get_mempolicy() query
+ *
+ * Called with task's alloc_lock held
  */
 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 {
@@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
-	cpuset_update_task_memory_state();
 	if (flags &
 	    ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 		return -EINVAL;
@@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 			return -EINVAL;
 		*policy = 0;	/* just so it's initialized */
+		task_lock(current);
 		*nmask = cpuset_current_mems_allowed;
+		task_unlock(current);
 		return 0;
 	}
 
@@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	}
 
 	err = 0;
-	if (nmask)
+	if (nmask) {
+		task_lock(current);
 		get_policy_nodemask(pol, nmask);
+		task_unlock(current);
+	}
 
  out:
 	mpol_cond_put(pol);
@@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len,
 		return err;
 	}
 	down_write(&mm->mmap_sem);
+	task_lock(current);
+	err = mpol_set_nodemask(new, nmask);
+	task_unlock(current);
+	if (err) {
+		up_write(&mm->mmap_sem);
+		mpol_put(new);
+		return err;
+	}
 	vma = check_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
 
@@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
 
-	cpuset_update_task_memory_state();
-
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
 
@@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
 	struct mempolicy *pol = current->mempolicy;
 
-	if ((gfp & __GFP_WAIT) && !in_interrupt())
-		cpuset_update_task_memory_state();
 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
 		pol = &default_policy;
 
@@ -1854,6 +1894,8 @@ restart:
  */
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 {
+	int ret;
+
 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
 	spin_lock_init(&sp->lock);
 
@@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 
 	/* contextualize the tmpfs mount point mempolicy */
 	new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
-	mpol_put(mpol);	/* drop our ref on sb mpol */
-	if (IS_ERR(new))
+	if (IS_ERR(new)) {
+		mpol_put(mpol);	/* drop our ref on sb mpol */
 		return;		/* no valid nodemask intersection */
+	}
+
+	task_lock(current);
+	ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
+	task_unlock(current);
+	mpol_put(mpol);	/* drop our ref on sb mpol */
+	if (ret) {
+		mpol_put(new);
+		return;
+	}
 
 	/* Create pseudo-vma that contains just the policy */
 	memset(&pvma, 0, sizeof(struct vm_area_struct));
@@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 	new = mpol_new(mode, mode_flags, &nodes);
 	if (IS_ERR(new))
 		err = 1;
-	else if (no_context)
-		new->w.user_nodemask = nodes;	/* save for contextualization */
+	else {
+		int ret;
+
+		task_lock(current);
+		ret = mpol_set_nodemask(new, &nodes);
+		task_unlock(current);
+		if (ret)
+			err = 1;
+		else if (no_context) {
+			/* save for contextualization */
+			new->w.user_nodemask = nodes;
+		}
+	}
 
 out:
 	/* Restore string for error message */
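
Every mpol_new() caller above now follows the same two-phase shape: allocate
and validate the policy first, then contextualize its nodemask against
cpuset_current_mems_allowed under task_lock(). A condensed sketch of that
calling convention (simplified from do_mbind(); surrounding locking and
cleanup abbreviated):

	struct mempolicy *new;
	int err;

	new = mpol_new(mode, flags, nodes);	/* phase 1: check mode/flags, allocate */
	if (IS_ERR(new))
		return PTR_ERR(new);

	task_lock(current);			/* mems_allowed/mempolicy held stable */
	err = mpol_set_nodemask(new, nodes);	/* phase 2: fold in cpuset context */
	task_unlock(current);
	if (err) {
		mpol_put(new);
		return err;
	}
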
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17d5f539a9aa..7cc3179e3591 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1569,10 +1569,7 @@ nofail_alloc:
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
-	/*
-	 * The task's cpuset might have expanded its set of allowable nodes
-	 */
-	cpuset_update_task_memory_state();
+
 	p->flags |= PF_MEMALLOC;
 
 	lockdep_set_current_reclaim_state(gfp_mask);