Diffstat
-rw-r--r--  include/linux/cpuset.h |  13
-rw-r--r--  include/linux/sched.h  |   8
-rw-r--r--  init/main.c            |   6
-rw-r--r--  kernel/cpuset.c        | 184
-rw-r--r--  kernel/kthread.c       |   2
-rw-r--r--  mm/mempolicy.c         | 143
-rw-r--r--  mm/page_alloc.c        |   5
7 files changed, 170 insertions, 191 deletions
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 05ea1dd7d681..a5740fc4d04b 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -18,7 +18,6 @@
 
 extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 
-extern int cpuset_init_early(void);
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
@@ -27,7 +26,6 @@ extern void cpuset_cpus_allowed_locked(struct task_struct *p,
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
-void cpuset_update_task_memory_state(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
 extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask);
@@ -92,9 +90,13 @@ extern void rebuild_sched_domains(void);
 
 extern void cpuset_print_task_mems_allowed(struct task_struct *p);
 
+static inline void set_mems_allowed(nodemask_t nodemask)
+{
+	current->mems_allowed = nodemask;
+}
+
 #else /* !CONFIG_CPUSETS */
 
-static inline int cpuset_init_early(void) { return 0; }
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
@@ -116,7 +118,6 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 
 #define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY])
 static inline void cpuset_init_current_mems_allowed(void) {}
-static inline void cpuset_update_task_memory_state(void) {}
 
 static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 {
@@ -188,6 +189,10 @@ static inline void cpuset_print_task_mems_allowed(struct task_struct *p)
 {
 }
 
+static inline void set_mems_allowed(nodemask_t nodemask)
+{
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
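
Editor's note on the hunk above: set_mems_allowed() comes in two flavours so that callers introduced later in this patch (kernel_init() and kthreadd()) need no #ifdef CONFIG_CPUSETS of their own; with cpusets built in it is a plain store to current->mems_allowed, otherwise it is an empty stub. Below is a minimal stand-alone C sketch of that config-guarded helper/stub pattern, not kernel code: the task structure, mask type, and "current" pointer here are made up for illustration.

/* sketch of the CONFIG-guarded helper/stub pattern; build with and
 * without -DCONFIG_CPUSETS to exercise both branches */
#include <stdio.h>

typedef unsigned long nodemask_t;		/* stand-in for the kernel type */

struct task { nodemask_t mems_allowed; };
static struct task init_task;
static struct task *current_task = &init_task;	/* stand-in for "current" */

#ifdef CONFIG_CPUSETS
static inline void set_mems_allowed(nodemask_t nodemask)
{
	current_task->mems_allowed = nodemask;	/* real version: remember the mask */
}
#else
static inline void set_mems_allowed(nodemask_t nodemask)
{
	(void)nodemask;				/* cpusets disabled: nothing to track */
}
#endif

int main(void)
{
	set_mems_allowed(~0UL);			/* "allow every node", like node_possible_map */
	printf("mems_allowed = %#lx\n", current_task->mems_allowed);
	return 0;
}
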
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c900aa530070..1048bf50540a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1318,7 +1318,8 @@ struct task_struct {
 /* Thread group tracking */
 	u32 parent_exec_id;
 	u32 self_exec_id;
-/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
+/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
+ * mempolicy */
 	spinlock_t alloc_lock;
 
 #ifdef CONFIG_GENERIC_HARDIRQS
@@ -1386,8 +1387,7 @@ struct task_struct {
 	cputime_t acct_timexpd;	/* stime + utime since last update */
 #endif
 #ifdef CONFIG_CPUSETS
-	nodemask_t mems_allowed;
-	int cpuset_mems_generation;
+	nodemask_t mems_allowed;	/* Protected by alloc_lock */
 	int cpuset_mem_spread_rotor;
 #endif
 #ifdef CONFIG_CGROUPS
@@ -1410,7 +1410,7 @@ struct task_struct {
 	struct list_head perf_counter_list;
 #endif
 #ifdef CONFIG_NUMA
-	struct mempolicy *mempolicy;
+	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
 	short il_next;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
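
Editor's note: the comment changes above state the new locking rule of this patch. Another task may now write this task's mems_allowed and mempolicy, so writers must hold the per-task alloc_lock (taken with task_lock()/task_unlock() in the kernel), whereas before these fields were only ever touched from the owning task's own context. A rough userspace sketch of that rule follows; it is not kernel code, a pthread spinlock stands in for spinlock_t and the struct is invented. Build with -lpthread.

/* sketch: writers of another task's placement take that task's alloc_lock */
#include <pthread.h>
#include <stdio.h>

struct task {
	unsigned long mems_allowed;		/* protected by alloc_lock */
	pthread_spinlock_t alloc_lock;		/* stand-in for spinlock_t */
};

static void task_lock(struct task *t)   { pthread_spin_lock(&t->alloc_lock); }
static void task_unlock(struct task *t) { pthread_spin_unlock(&t->alloc_lock); }

/* a different task updating t's allowed nodes must hold the lock */
static void change_task_nodemask(struct task *t, unsigned long newmems)
{
	task_lock(t);
	t->mems_allowed = newmems;
	task_unlock(t);
}

int main(void)
{
	struct task t = { .mems_allowed = 0x1 };

	pthread_spin_init(&t.alloc_lock, PTHREAD_PROCESS_PRIVATE);
	change_task_nodemask(&t, 0x6);
	printf("mems_allowed = %#lx\n", t.mems_allowed);
	pthread_spin_destroy(&t.alloc_lock);
	return 0;
}
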
diff --git a/init/main.c b/init/main.c
index f6204f712e7c..5e0d3f047eaf 100644
--- a/init/main.c
+++ b/init/main.c
@@ -670,7 +670,6 @@ asmlinkage void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
-	cpuset_init_early();
 	page_cgroup_init();
 	enable_debug_pagealloc();
 	cpu_hotplug_init();
@@ -867,6 +866,11 @@ static noinline int init_post(void)
 static int __init kernel_init(void * unused)
 {
 	lock_kernel();
+
+	/*
+	 * init can allocate pages on any node
+	 */
+	set_mems_allowed(node_possible_map);
 	/*
 	 * init can run on any cpu.
 	 */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index af5a83d52187..7e75a41bd508 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
 
 	struct cpuset *parent;		/* my parent */
 
-	/*
-	 * Copy of global cpuset_mems_generation as of the most
-	 * recent time this cpuset changed its mems_allowed.
-	 */
-	int mems_generation;
-
 	struct fmeter fmeter;		/* memory_pressure filter */
 
 	/* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
 	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
-/*
- * Increment this integer everytime any cpuset changes its
- * mems_allowed value.  Users of cpusets can track this generation
- * number, and avoid having to lock and reload mems_allowed unless
- * the cpuset they're using changes generation.
- *
- * A single, global generation is needed because cpuset_attach_task() could
- * reattach a task to a different cpuset, which must not have its
- * generation numbers aliased with those of that tasks previous cpuset.
- *
- * Generations are needed for mems_allowed because one task cannot
- * modify another's memory placement.  So we must enable every task,
- * on every visit to __alloc_pages(), to efficiently check whether
- * its current->cpuset->mems_allowed has changed, requiring an update
- * of its current->mems_allowed.
- *
- * Since writes to cpuset_mems_generation are guarded by the cgroup lock
- * there is no need to mark it atomic.
- */
-static int cpuset_mems_generation;
-
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
 };
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
  * If a task is only holding callback_mutex, then it has read-only
  * access to cpusets.
  *
- * The task_struct fields mems_allowed and mems_generation may only
- * be accessed in the context of that task, so require no locks.
+ * Now, the task_struct fields mems_allowed and mempolicy may be changed
+ * by other task, we use alloc_lock in the task_struct fields to protect
+ * them.
  *
  * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
@@ -349,69 +323,6 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
 		tsk->flags &= ~PF_SPREAD_SLAB;
 }
 
-/**
- * cpuset_update_task_memory_state - update task memory placement
- *
- * If the current tasks cpusets mems_allowed changed behind our
- * backs, update current->mems_allowed, mems_generation and task NUMA
- * mempolicy to the new value.
- *
- * Task mempolicy is updated by rebinding it relative to the
- * current->cpuset if a task has its memory placement changed.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_mutex or task_lock() held.  May be
- * called with or without cgroup_mutex held.  Thanks in part to
- * 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL.  This routine also might acquire callback_mutex during
- * call.
- *
- * Reading current->cpuset->mems_generation doesn't need task_lock
- * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset using RCU.
- *
- * The rcu_dereference() is technically probably not needed,
- * as I don't actually mind if I see a new cpuset pointer but
- * an old value of mems_generation.  However this really only
- * matters on alpha systems using cpusets heavily.  If I dropped
- * that rcu_dereference(), it would save them a memory barrier.
- * For all other arch's, rcu_dereference is a no-op anyway, and for
- * alpha systems not using cpusets, another planned optimization,
- * avoiding the rcu critical section for tasks in the root cpuset
- * which is statically allocated, so can't vanish, will make this
- * irrelevant.  Better to use RCU as intended, than to engage in
- * some cute trick to save a memory barrier that is impossible to
- * test, for alpha systems using cpusets heavily, which might not
- * even exist.
- *
- * This routine is needed to update the per-task mems_allowed data,
- * within the tasks context, when it is trying to allocate memory
- * (in various mm/mempolicy.c routines) and notices that some other
- * task has been modifying its cpuset.
- */
-
-void cpuset_update_task_memory_state(void)
-{
-	int my_cpusets_mem_gen;
-	struct task_struct *tsk = current;
-	struct cpuset *cs;
-
-	rcu_read_lock();
-	my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
-	rcu_read_unlock();
-
-	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
-		mutex_lock(&callback_mutex);
-		task_lock(tsk);
-		cs = task_cs(tsk);	/* Maybe changed when task not locked */
-		guarantee_online_mems(cs, &tsk->mems_allowed);
-		tsk->cpuset_mems_generation = cs->mems_generation;
-		task_unlock(tsk);
-		mutex_unlock(&callback_mutex);
-		mpol_rebind_task(tsk, &tsk->mems_allowed);
-	}
-}
-
 /*
  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
  *
@@ -1017,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  * other task, the task_struct mems_allowed that we are hacking
  * is for our current task, which must allocate new pages for that
  * migrating memory region.
- *
- * We call cpuset_update_task_memory_state() before hacking
- * our tasks mems_allowed, so that we are assured of being in
- * sync with our tasks cpuset, and in particular, callbacks to
- * cpuset_update_task_memory_state() from nested page allocations
- * won't see any mismatch of our cpuset and task mems_generation
- * values, so won't overwrite our hacked tasks mems_allowed
- * nodemask.
  */
 
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1032,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 {
 	struct task_struct *tsk = current;
 
-	cpuset_update_task_memory_state();
-
-	mutex_lock(&callback_mutex);
 	tsk->mems_allowed = *to;
-	mutex_unlock(&callback_mutex);
 
 	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 
-	mutex_lock(&callback_mutex);
 	guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
-	mutex_unlock(&callback_mutex);
 }
 
 /*
- * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
- * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
+ * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
+ * @tsk: the task to change
+ * @newmems: new nodes that the task will be set
+ *
+ * In order to avoid seeing no nodes if the old and new nodes are disjoint,
+ * we structure updates as setting all new allowed nodes, then clearing newly
+ * disallowed ones.
+ *
+ * Called with task's alloc_lock held
+ */
+static void cpuset_change_task_nodemask(struct task_struct *tsk,
+					nodemask_t *newmems)
+{
+	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+	mpol_rebind_task(tsk, &tsk->mems_allowed);
+	mpol_rebind_task(tsk, newmems);
+	tsk->mems_allowed = *newmems;
+}
+
+/*
+ * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
+ * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
+ * memory_migrate flag is set. Called with cgroup_mutex held.
  */
 static void cpuset_change_nodemask(struct task_struct *p,
 				   struct cgroup_scanner *scan)
@@ -1056,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
+	nodemask_t newmems;
+
+	cs = cgroup_cs(scan->cg);
+	guarantee_online_mems(cs, &newmems);
+
+	task_lock(p);
+	cpuset_change_task_nodemask(p, &newmems);
+	task_unlock(p);
 
 	mm = get_task_mm(p);
 	if (!mm)
 		return;
 
-	cs = cgroup_cs(scan->cg);
 	migrate = is_memory_migrate(cs);
 
 	mpol_rebind_mm(mm, &cs->mems_allowed);
@@ -1114,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 /*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset.  Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
+ * cpusets mems_allowed, and for each task in the cpuset,
+ * update mems_allowed and rebind task's mempolicy and any vma
+ * mempolicies and if the cpuset is marked 'memory_migrate',
+ * migrate the tasks pages to the new memory.
  *
  * Call with cgroup_mutex held.  May take callback_mutex during call.
  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1170,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs->mems_allowed;
-	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
 	update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1434,15 +1358,18 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
+		to = node_possible_map;
 	} else {
-		mutex_lock(&callback_mutex);
 		guarantee_online_cpus(cs, cpus_attach);
-		mutex_unlock(&callback_mutex);
+		guarantee_online_mems(cs, &to);
 	}
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	if (err)
 		return;
 
+	task_lock(tsk);
+	cpuset_change_task_nodemask(tsk, &to);
+	task_unlock(tsk);
 	cpuset_update_task_spread_flag(cs, tsk);
 
 	from = oldcs->mems_allowed;
@@ -1848,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
 	struct cpuset *parent;
 
 	if (!cont->parent) {
-		/* This is early initialization for the top cgroup */
-		top_cpuset.mems_generation = cpuset_mems_generation++;
 		return &top_cpuset.css;
 	}
 	parent = cgroup_cs(cont->parent);
@@ -1861,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
 		return ERR_PTR(-ENOMEM);
 	}
 
-	cpuset_update_task_memory_state();
 	cs->flags = 0;
 	if (is_spread_page(parent))
 		set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1870,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
 		set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	cpumask_clear(cs->cpus_allowed);
 	nodes_clear(cs->mems_allowed);
-	cs->mems_generation = cpuset_mems_generation++;
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 
@@ -1889,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct cpuset *cs = cgroup_cs(cont);
 
-	cpuset_update_task_memory_state();
-
 	if (is_sched_load_balance(cs))
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
@@ -1911,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
 	.early_init = 1,
 };
 
-/*
- * cpuset_init_early - just enough so that the calls to
- * cpuset_update_task_memory_state() in early init code
- * are harmless.
- */
-
-int __init cpuset_init_early(void)
-{
-	alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
-
-	top_cpuset.mems_generation = cpuset_mems_generation++;
-	return 0;
-}
-
-
 /**
  * cpuset_init - initialize cpusets at system boot
  *
@@ -1936,11 +1842,13 @@ int __init cpuset_init(void)
 {
 	int err = 0;
 
+	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+		BUG();
+
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
 
 	fmeter_init(&top_cpuset.fmeter);
-	top_cpuset.mems_generation = cpuset_mems_generation++;
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
 	top_cpuset.relax_domain_level = -1;
 
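
Editor's note: the heart of this file's change is cpuset_change_task_nodemask(). A nodemask_t may span several machine words, so it cannot be replaced atomically, and a page allocator reading current->mems_allowed without a lock must never observe an empty mask. Setting the union of old and new nodes first and only then assigning the new mask keeps every intermediate word-level state non-empty even when the old and new node sets are disjoint; the task mempolicy is rebound twice for the same reason. The stand-alone program below is a single-threaded simulation of what a word-by-word reader could observe, not kernel code; two 64-bit words stand in for nodemask_t.

/* demo: or-then-assign never exposes an empty multi-word mask */
#include <assert.h>
#include <stdio.h>

#define WORDS 2
typedef struct { unsigned long w[WORDS]; } mask_t;

static int mask_empty(const mask_t *m)
{
	for (int i = 0; i < WORDS; i++)
		if (m->w[i])
			return 0;
	return 1;
}

/* copy src into dst one word at a time, reporting whether any
 * intermediate state (as a lockless reader might see it) was empty */
static int assign_wordwise(mask_t *dst, const mask_t *src)
{
	int saw_empty = 0;

	for (int i = 0; i < WORDS; i++) {
		dst->w[i] = src->w[i];
		saw_empty |= mask_empty(dst);
	}
	return saw_empty;
}

static int or_wordwise(mask_t *dst, const mask_t *src)
{
	int saw_empty = 0;

	for (int i = 0; i < WORDS; i++) {
		dst->w[i] |= src->w[i];
		saw_empty |= mask_empty(dst);
	}
	return saw_empty;
}

int main(void)
{
	/* old and new allowed nodes are disjoint and live in different words */
	const mask_t oldmask = { { 0x1, 0x0 } };	/* node 0  */
	const mask_t newmask = { { 0x0, 0x1 } };	/* node 64 */
	mask_t m;
	int saw_empty;

	/* naive single step: an intermediate state is completely empty */
	m = oldmask;
	saw_empty = assign_wordwise(&m, &newmask);
	printf("single-step assign: empty intermediate state seen: %s\n",
	       saw_empty ? "yes" : "no");

	/* the patch's order: grow to old|new first, then shrink to new */
	m = oldmask;
	saw_empty  = or_wordwise(&m, &newmask);
	saw_empty |= assign_wordwise(&m, &newmask);
	printf("or-then-assign:     empty intermediate state seen: %s\n",
	       saw_empty ? "yes" : "no");
	assert(!saw_empty);
	return 0;
}

The single-step assignment passes through an all-zero state when the old and new nodes sit in different words; the or-then-assign order never does, which is exactly the property the new helper relies on.
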
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 41c88fe40500..7fa441333529 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,6 +9,7 @@
 #include <linux/kthread.h>
 #include <linux/completion.h>
 #include <linux/err.h>
+#include <linux/cpuset.h>
 #include <linux/unistd.h>
 #include <linux/file.h>
 #include <linux/module.h>
@@ -236,6 +237,7 @@ int kthreadd(void *unused)
 	ignore_signals(tsk);
 	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
 	set_cpus_allowed_ptr(tsk, cpu_all_mask);
+	set_mems_allowed(node_possible_map);
 
 	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3eb4a6fdc043..46bdf9ddf2ba 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -182,13 +182,54 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 	return 0;
 }
 
-/* Create a new policy */
+/*
+ * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
+ * any, for the new policy.  mpol_new() has already validated the nodes
+ * parameter with respect to the policy mode and flags.  But, we need to
+ * handle an empty nodemask with MPOL_PREFERRED here.
+ *
+ * Must be called holding task's alloc_lock to protect task's mems_allowed
+ * and mempolicy.  May also be called holding the mmap_semaphore for write.
+ */
+static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
+{
+	nodemask_t cpuset_context_nmask;
+	int ret;
+
+	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
+	if (pol == NULL)
+		return 0;
+
+	VM_BUG_ON(!nodes);
+	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
+		nodes = NULL;	/* explicit local allocation */
+	else {
+		if (pol->flags & MPOL_F_RELATIVE_NODES)
+			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
+					       &cpuset_current_mems_allowed);
+		else
+			nodes_and(cpuset_context_nmask, *nodes,
+				  cpuset_current_mems_allowed);
+		if (mpol_store_user_nodemask(pol))
+			pol->w.user_nodemask = *nodes;
+		else
+			pol->w.cpuset_mems_allowed =
+						cpuset_current_mems_allowed;
+	}
+
+	ret = mpol_ops[pol->mode].create(pol,
+				nodes ? &cpuset_context_nmask : NULL);
+	return ret;
+}
+
+/*
+ * This function just creates a new policy, does some check and simple
+ * initialization. You must invoke mpol_set_nodemask() to set nodes.
+ */
 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 				  nodemask_t *nodes)
 {
 	struct mempolicy *policy;
-	nodemask_t cpuset_context_nmask;
-	int ret;
 
 	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 		 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -210,7 +251,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 			if (((flags & MPOL_F_STATIC_NODES) ||
 			     (flags & MPOL_F_RELATIVE_NODES)))
 				return ERR_PTR(-EINVAL);
-			nodes = NULL;	/* flag local alloc */
 		}
 	} else if (nodes_empty(*nodes))
 		return ERR_PTR(-EINVAL);
@@ -221,30 +261,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	policy->mode = mode;
 	policy->flags = flags;
 
-	if (nodes) {
-		/*
-		 * cpuset related setup doesn't apply to local allocation
-		 */
-		cpuset_update_task_memory_state();
-		if (flags & MPOL_F_RELATIVE_NODES)
-			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
-					       &cpuset_current_mems_allowed);
-		else
-			nodes_and(cpuset_context_nmask, *nodes,
-				  cpuset_current_mems_allowed);
-		if (mpol_store_user_nodemask(policy))
-			policy->w.user_nodemask = *nodes;
-		else
-			policy->w.cpuset_mems_allowed =
-						cpuset_mems_allowed(current);
-	}
-
-	ret = mpol_ops[mode].create(policy,
-				    nodes ? &cpuset_context_nmask : NULL);
-	if (ret < 0) {
-		kmem_cache_free(policy_cache, policy);
-		return ERR_PTR(ret);
-	}
 	return policy;
 }
 
@@ -324,6 +340,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 /*
  * Wrapper for mpol_rebind_policy() that just requires task
  * pointer, and updates task mempolicy.
+ *
+ * Called with task's alloc_lock held.
  */
 
 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
@@ -600,8 +618,9 @@ static void mpol_set_task_struct_flag(void)
 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 			     nodemask_t *nodes)
 {
-	struct mempolicy *new;
+	struct mempolicy *new, *old;
 	struct mm_struct *mm = current->mm;
+	int ret;
 
 	new = mpol_new(mode, flags, nodes);
 	if (IS_ERR(new))
@@ -615,20 +634,33 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	 */
 	if (mm)
 		down_write(&mm->mmap_sem);
-	mpol_put(current->mempolicy);
+	task_lock(current);
+	ret = mpol_set_nodemask(new, nodes);
+	if (ret) {
+		task_unlock(current);
+		if (mm)
+			up_write(&mm->mmap_sem);
+		mpol_put(new);
+		return ret;
+	}
+	old = current->mempolicy;
 	current->mempolicy = new;
 	mpol_set_task_struct_flag();
 	if (new && new->mode == MPOL_INTERLEAVE &&
 	    nodes_weight(new->v.nodes))
 		current->il_next = first_node(new->v.nodes);
+	task_unlock(current);
 	if (mm)
 		up_write(&mm->mmap_sem);
 
+	mpol_put(old);
 	return 0;
 }
 
 /*
  * Return nodemask for policy for get_mempolicy() query
+ *
+ * Called with task's alloc_lock held
  */
 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 {
@@ -674,7 +706,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
-	cpuset_update_task_memory_state();
 	if (flags &
 	    ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 		return -EINVAL;
@@ -683,7 +714,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 			return -EINVAL;
 		*policy = 0;	/* just so it's initialized */
+		task_lock(current);
 		*nmask = cpuset_current_mems_allowed;
+		task_unlock(current);
 		return 0;
 	}
 
@@ -738,8 +771,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	}
 
 	err = 0;
-	if (nmask)
+	if (nmask) {
+		task_lock(current);
 		get_policy_nodemask(pol, nmask);
+		task_unlock(current);
+	}
 
  out:
 	mpol_cond_put(pol);
@@ -979,6 +1015,14 @@ static long do_mbind(unsigned long start, unsigned long len,
 		return err;
 	}
 	down_write(&mm->mmap_sem);
+	task_lock(current);
+	err = mpol_set_nodemask(new, nmask);
+	task_unlock(current);
+	if (err) {
+		up_write(&mm->mmap_sem);
+		mpol_put(new);
+		return err;
+	}
 	vma = check_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
 
@@ -1545,8 +1589,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
 
-	cpuset_update_task_memory_state();
-
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
 
@@ -1593,8 +1635,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
 	struct mempolicy *pol = current->mempolicy;
 
-	if ((gfp & __GFP_WAIT) && !in_interrupt())
-		cpuset_update_task_memory_state();
 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
 		pol = &default_policy;
 
@@ -1854,6 +1894,8 @@ restart:
  */
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 {
+	int ret;
+
 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
 	spin_lock_init(&sp->lock);
 
@@ -1863,9 +1905,19 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 
 		/* contextualize the tmpfs mount point mempolicy */
 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
-		mpol_put(mpol);	/* drop our ref on sb mpol */
-		if (IS_ERR(new))
+		if (IS_ERR(new)) {
+			mpol_put(mpol);	/* drop our ref on sb mpol */
 			return;		/* no valid nodemask intersection */
+		}
+
+		task_lock(current);
+		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
+		task_unlock(current);
+		mpol_put(mpol);	/* drop our ref on sb mpol */
+		if (ret) {
+			mpol_put(new);
+			return;
+		}
 
 		/* Create pseudo-vma that contains just the policy */
 		memset(&pvma, 0, sizeof(struct vm_area_struct));
@@ -2086,8 +2138,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 	new = mpol_new(mode, mode_flags, &nodes);
 	if (IS_ERR(new))
 		err = 1;
-	else if (no_context)
-		new->w.user_nodemask = nodes;	/* save for contextualization */
+	else {
+		int ret;
+
+		task_lock(current);
+		ret = mpol_set_nodemask(new, &nodes);
+		task_unlock(current);
+		if (ret)
+			err = 1;
+		else if (no_context) {
+			/* save for contextualization */
+			new->w.user_nodemask = nodes;
+		}
+	}
 
 out:
 	/* Restore string for error message */
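
Editor's note: mpol_set_nodemask() splits the "contextualize the nodemask" step out of mpol_new() so that it can run under task_lock(), where cpuset_current_mems_allowed is stable. In the common case the requested nodes are intersected with the task's mems_allowed (MPOL_F_RELATIVE_NODES instead remaps them relative to that set), and MPOL_PREFERRED with an empty mask still means local allocation. The sketch below is a much-simplified userspace illustration of that contextualization, not the kernel implementation: single-word masks stand in for nodemask_t, relative-nodes remapping is left out, and the error value is a plain -1 rather than what mpol_ops[].create() would return.

/* simplified contextualization of a user nodemask against mems_allowed */
#include <stdio.h>

#define MPOL_PREFERRED 1
#define MPOL_BIND      2

/* returns 0 and stores the effective mask, or -1 on an empty intersection */
static int set_nodemask(int mode, unsigned long user_nodes,
			unsigned long mems_allowed, unsigned long *effective)
{
	if (mode == MPOL_PREFERRED && user_nodes == 0) {
		*effective = 0;			/* explicit local allocation */
		return 0;
	}
	*effective = user_nodes & mems_allowed;	/* cpuset-relative mask */
	return *effective ? 0 : -1;		/* disjoint from the cpuset */
}

int main(void)
{
	unsigned long eff;

	/* task confined to nodes {0,1}; user asks to bind to nodes {1,2} */
	if (set_nodemask(MPOL_BIND, 0x6, 0x3, &eff) == 0)
		printf("effective nodemask: %#lx\n", eff);	/* 0x2 */

	/* user asks only for node 2, which the cpuset does not allow */
	if (set_nodemask(MPOL_BIND, 0x4, 0x3, &eff) != 0)
		printf("rejected: requested nodes outside mems_allowed\n");
	return 0;
}
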
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17d5f539a9aa..7cc3179e3591 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1569,10 +1569,7 @@ nofail_alloc:
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
-	/*
-	 * The task's cpuset might have expanded its set of allowable nodes
-	 */
-	cpuset_update_task_memory_state();
+
 	p->flags |= PF_MEMALLOC;
 
 	lockdep_set_current_reclaim_state(gfp_mask);