diff options
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r-- | kernel/cpuset.c | 184 |
1 files changed, 46 insertions, 138 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index af5a83d52187..7e75a41bd508 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -97,12 +97,6 @@ struct cpuset { | |||
97 | 97 | ||
98 | struct cpuset *parent; /* my parent */ | 98 | struct cpuset *parent; /* my parent */ |
99 | 99 | ||
100 | /* | ||
101 | * Copy of global cpuset_mems_generation as of the most | ||
102 | * recent time this cpuset changed its mems_allowed. | ||
103 | */ | ||
104 | int mems_generation; | ||
105 | |||
106 | struct fmeter fmeter; /* memory_pressure filter */ | 100 | struct fmeter fmeter; /* memory_pressure filter */ |
107 | 101 | ||
108 | /* partition number for rebuild_sched_domains() */ | 102 | /* partition number for rebuild_sched_domains() */ |
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs) | |||
176 | return test_bit(CS_SPREAD_SLAB, &cs->flags); | 170 | return test_bit(CS_SPREAD_SLAB, &cs->flags); |
177 | } | 171 | } |
178 | 172 | ||
179 | /* | ||
180 | * Increment this integer every time any cpuset changes its | ||
181 | * mems_allowed value. Users of cpusets can track this generation | ||
182 | * number, and avoid having to lock and reload mems_allowed unless | ||
183 | * the cpuset they're using changes generation. | ||
184 | * | ||
185 | * A single, global generation is needed because cpuset_attach_task() could | ||
186 | * reattach a task to a different cpuset, which must not have its | ||
187 | * generation numbers aliased with those of that tasks previous cpuset. | ||
188 | * | ||
189 | * Generations are needed for mems_allowed because one task cannot | ||
190 | * modify another's memory placement. So we must enable every task, | ||
191 | * on every visit to __alloc_pages(), to efficiently check whether | ||
192 | * its current->cpuset->mems_allowed has changed, requiring an update | ||
193 | * of its current->mems_allowed. | ||
194 | * | ||
195 | * Since writes to cpuset_mems_generation are guarded by the cgroup lock | ||
196 | * there is no need to mark it atomic. | ||
197 | */ | ||
198 | static int cpuset_mems_generation; | ||
199 | |||
200 | static struct cpuset top_cpuset = { | 173 | static struct cpuset top_cpuset = { |
201 | .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), | 174 | .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), |
202 | }; | 175 | }; |
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = { | |||
228 | * If a task is only holding callback_mutex, then it has read-only | 201 | * If a task is only holding callback_mutex, then it has read-only |
229 | * access to cpusets. | 202 | * access to cpusets. |
230 | * | 203 | * |
231 | * The task_struct fields mems_allowed and mems_generation may only | 204 | * Now, the task_struct fields mems_allowed and mempolicy may be changed |
232 | * be accessed in the context of that task, so require no locks. | 205 | * by another task, so we use alloc_lock in the task_struct to protect |
206 | * them. | ||
233 | * | 207 | * |
234 | * The cpuset_common_file_read() handlers only hold callback_mutex across | 208 | * The cpuset_common_file_read() handlers only hold callback_mutex across |
235 | * small pieces of code, such as when reading out possibly multi-word | 209 | * small pieces of code, such as when reading out possibly multi-word |
@@ -349,69 +323,6 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, | |||
349 | tsk->flags &= ~PF_SPREAD_SLAB; | 323 | tsk->flags &= ~PF_SPREAD_SLAB; |
350 | } | 324 | } |
351 | 325 | ||
352 | /** | ||
353 | * cpuset_update_task_memory_state - update task memory placement | ||
354 | * | ||
355 | * If the current tasks cpusets mems_allowed changed behind our | ||
356 | * backs, update current->mems_allowed, mems_generation and task NUMA | ||
357 | * mempolicy to the new value. | ||
358 | * | ||
359 | * Task mempolicy is updated by rebinding it relative to the | ||
360 | * current->cpuset if a task has its memory placement changed. | ||
361 | * Do not call this routine if in_interrupt(). | ||
362 | * | ||
363 | * Call without callback_mutex or task_lock() held. May be | ||
364 | * called with or without cgroup_mutex held. Thanks in part to | ||
365 | * 'the_top_cpuset_hack', the task's cpuset pointer will never | ||
366 | * be NULL. This routine also might acquire callback_mutex during | ||
367 | * call. | ||
368 | * | ||
369 | * Reading current->cpuset->mems_generation doesn't need task_lock | ||
370 | * to guard the current->cpuset dereference, because it is guarded | ||
371 | * from concurrent freeing of current->cpuset using RCU. | ||
372 | * | ||
373 | * The rcu_dereference() is technically probably not needed, | ||
374 | * as I don't actually mind if I see a new cpuset pointer but | ||
375 | * an old value of mems_generation. However this really only | ||
376 | * matters on alpha systems using cpusets heavily. If I dropped | ||
377 | * that rcu_dereference(), it would save them a memory barrier. | ||
378 | * For all other arch's, rcu_dereference is a no-op anyway, and for | ||
379 | * alpha systems not using cpusets, another planned optimization, | ||
380 | * avoiding the rcu critical section for tasks in the root cpuset | ||
381 | * which is statically allocated, so can't vanish, will make this | ||
382 | * irrelevant. Better to use RCU as intended, than to engage in | ||
383 | * some cute trick to save a memory barrier that is impossible to | ||
384 | * test, for alpha systems using cpusets heavily, which might not | ||
385 | * even exist. | ||
386 | * | ||
387 | * This routine is needed to update the per-task mems_allowed data, | ||
388 | * within the tasks context, when it is trying to allocate memory | ||
389 | * (in various mm/mempolicy.c routines) and notices that some other | ||
390 | * task has been modifying its cpuset. | ||
391 | */ | ||
392 | |||
393 | void cpuset_update_task_memory_state(void) | ||
394 | { | ||
395 | int my_cpusets_mem_gen; | ||
396 | struct task_struct *tsk = current; | ||
397 | struct cpuset *cs; | ||
398 | |||
399 | rcu_read_lock(); | ||
400 | my_cpusets_mem_gen = task_cs(tsk)->mems_generation; | ||
401 | rcu_read_unlock(); | ||
402 | |||
403 | if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { | ||
404 | mutex_lock(&callback_mutex); | ||
405 | task_lock(tsk); | ||
406 | cs = task_cs(tsk); /* Maybe changed when task not locked */ | ||
407 | guarantee_online_mems(cs, &tsk->mems_allowed); | ||
408 | tsk->cpuset_mems_generation = cs->mems_generation; | ||
409 | task_unlock(tsk); | ||
410 | mutex_unlock(&callback_mutex); | ||
411 | mpol_rebind_task(tsk, &tsk->mems_allowed); | ||
412 | } | ||
413 | } | ||
414 | |||
415 | /* | 326 | /* |
416 | * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? | 327 | * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? |
417 | * | 328 | * |
@@ -1017,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
1017 | * other task, the task_struct mems_allowed that we are hacking | 928 | * other task, the task_struct mems_allowed that we are hacking |
1018 | * is for our current task, which must allocate new pages for that | 929 | * is for our current task, which must allocate new pages for that |
1019 | * migrating memory region. | 930 | * migrating memory region. |
1020 | * | ||
1021 | * We call cpuset_update_task_memory_state() before hacking | ||
1022 | * our tasks mems_allowed, so that we are assured of being in | ||
1023 | * sync with our tasks cpuset, and in particular, callbacks to | ||
1024 | * cpuset_update_task_memory_state() from nested page allocations | ||
1025 | * won't see any mismatch of our cpuset and task mems_generation | ||
1026 | * values, so won't overwrite our hacked tasks mems_allowed | ||
1027 | * nodemask. | ||
1028 | */ | 931 | */ |
1029 | 932 | ||
1030 | static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | 933 | static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, |
@@ -1032,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | |||
1032 | { | 935 | { |
1033 | struct task_struct *tsk = current; | 936 | struct task_struct *tsk = current; |
1034 | 937 | ||
1035 | cpuset_update_task_memory_state(); | ||
1036 | |||
1037 | mutex_lock(&callback_mutex); | ||
1038 | tsk->mems_allowed = *to; | 938 | tsk->mems_allowed = *to; |
1039 | mutex_unlock(&callback_mutex); | ||
1040 | 939 | ||
1041 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); | 940 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); |
1042 | 941 | ||
1043 | mutex_lock(&callback_mutex); | ||
1044 | guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); | 942 | guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); |
1045 | mutex_unlock(&callback_mutex); | ||
1046 | } | 943 | } |
1047 | 944 | ||
1048 | /* | 945 | /* |
1049 | * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new | 946 | * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy |
1050 | * nodes if memory_migrate flag is set. Called with cgroup_mutex held. | 947 | * @tsk: the task to change |
948 | * @newmems: the new set of nodes the task is allowed to use | ||
949 | * | ||
950 | * In order to avoid seeing no nodes if the old and new nodes are disjoint, | ||
951 | * we structure updates as setting all new allowed nodes, then clearing newly | ||
952 | * disallowed ones. | ||
953 | * | ||
954 | * Called with task's alloc_lock held | ||
955 | */ | ||
956 | static void cpuset_change_task_nodemask(struct task_struct *tsk, | ||
957 | nodemask_t *newmems) | ||
958 | { | ||
959 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); | ||
960 | mpol_rebind_task(tsk, &tsk->mems_allowed); | ||
961 | mpol_rebind_task(tsk, newmems); | ||
962 | tsk->mems_allowed = *newmems; | ||
963 | } | ||
964 | |||
965 | /* | ||
966 | * Update task's mems_allowed, rebind its mempolicy and its vmas' mempolicies | ||
967 | * to the cpuset's new mems_allowed, and migrate pages to the new nodes if the | ||
968 | * memory_migrate flag is set. Called with cgroup_mutex held. | ||
1051 | */ | 969 | */ |
1052 | static void cpuset_change_nodemask(struct task_struct *p, | 970 | static void cpuset_change_nodemask(struct task_struct *p, |
1053 | struct cgroup_scanner *scan) | 971 | struct cgroup_scanner *scan) |
@@ -1056,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
1056 | struct cpuset *cs; | 974 | struct cpuset *cs; |
1057 | int migrate; | 975 | int migrate; |
1058 | const nodemask_t *oldmem = scan->data; | 976 | const nodemask_t *oldmem = scan->data; |
977 | nodemask_t newmems; | ||
978 | |||
979 | cs = cgroup_cs(scan->cg); | ||
980 | guarantee_online_mems(cs, &newmems); | ||
981 | |||
982 | task_lock(p); | ||
983 | cpuset_change_task_nodemask(p, &newmems); | ||
984 | task_unlock(p); | ||
1059 | 985 | ||
1060 | mm = get_task_mm(p); | 986 | mm = get_task_mm(p); |
1061 | if (!mm) | 987 | if (!mm) |
1062 | return; | 988 | return; |
1063 | 989 | ||
1064 | cs = cgroup_cs(scan->cg); | ||
1065 | migrate = is_memory_migrate(cs); | 990 | migrate = is_memory_migrate(cs); |
1066 | 991 | ||
1067 | mpol_rebind_mm(mm, &cs->mems_allowed); | 992 | mpol_rebind_mm(mm, &cs->mems_allowed); |
@@ -1114,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | |||
1114 | /* | 1039 | /* |
1115 | * Handle user request to change the 'mems' memory placement | 1040 | * Handle user request to change the 'mems' memory placement |
1116 | * of a cpuset. Needs to validate the request, update the | 1041 | * of a cpuset. Needs to validate the request, update the |
1117 | * cpusets mems_allowed and mems_generation, and for each | 1042 | * cpusets mems_allowed, and for each task in the cpuset, |
1118 | * task in the cpuset, rebind any vma mempolicies and if | 1043 | * update mems_allowed and rebind task's mempolicy and any vma |
1119 | * the cpuset is marked 'memory_migrate', migrate the tasks | 1044 | * mempolicies and if the cpuset is marked 'memory_migrate', |
1120 | * pages to the new memory. | 1045 | * migrate the tasks pages to the new memory. |
1121 | * | 1046 | * |
1122 | * Call with cgroup_mutex held. May take callback_mutex during call. | 1047 | * Call with cgroup_mutex held. May take callback_mutex during call. |
1123 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | 1048 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
@@ -1170,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, | |||
1170 | 1095 | ||
1171 | mutex_lock(&callback_mutex); | 1096 | mutex_lock(&callback_mutex); |
1172 | cs->mems_allowed = trialcs->mems_allowed; | 1097 | cs->mems_allowed = trialcs->mems_allowed; |
1173 | cs->mems_generation = cpuset_mems_generation++; | ||
1174 | mutex_unlock(&callback_mutex); | 1098 | mutex_unlock(&callback_mutex); |
1175 | 1099 | ||
1176 | update_tasks_nodemask(cs, &oldmem, &heap); | 1100 | update_tasks_nodemask(cs, &oldmem, &heap); |
@@ -1434,15 +1358,18 @@ static void cpuset_attach(struct cgroup_subsys *ss, | |||
1434 | 1358 | ||
1435 | if (cs == &top_cpuset) { | 1359 | if (cs == &top_cpuset) { |
1436 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1360 | cpumask_copy(cpus_attach, cpu_possible_mask); |
1361 | to = node_possible_map; | ||
1437 | } else { | 1362 | } else { |
1438 | mutex_lock(&callback_mutex); | ||
1439 | guarantee_online_cpus(cs, cpus_attach); | 1363 | guarantee_online_cpus(cs, cpus_attach); |
1440 | mutex_unlock(&callback_mutex); | 1364 | guarantee_online_mems(cs, &to); |
1441 | } | 1365 | } |
1442 | err = set_cpus_allowed_ptr(tsk, cpus_attach); | 1366 | err = set_cpus_allowed_ptr(tsk, cpus_attach); |
1443 | if (err) | 1367 | if (err) |
1444 | return; | 1368 | return; |
1445 | 1369 | ||
1370 | task_lock(tsk); | ||
1371 | cpuset_change_task_nodemask(tsk, &to); | ||
1372 | task_unlock(tsk); | ||
1446 | cpuset_update_task_spread_flag(cs, tsk); | 1373 | cpuset_update_task_spread_flag(cs, tsk); |
1447 | 1374 | ||
1448 | from = oldcs->mems_allowed; | 1375 | from = oldcs->mems_allowed; |
@@ -1848,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1848 | struct cpuset *parent; | 1775 | struct cpuset *parent; |
1849 | 1776 | ||
1850 | if (!cont->parent) { | 1777 | if (!cont->parent) { |
1851 | /* This is early initialization for the top cgroup */ | ||
1852 | top_cpuset.mems_generation = cpuset_mems_generation++; | ||
1853 | return &top_cpuset.css; | 1778 | return &top_cpuset.css; |
1854 | } | 1779 | } |
1855 | parent = cgroup_cs(cont->parent); | 1780 | parent = cgroup_cs(cont->parent); |
@@ -1861,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1861 | return ERR_PTR(-ENOMEM); | 1786 | return ERR_PTR(-ENOMEM); |
1862 | } | 1787 | } |
1863 | 1788 | ||
1864 | cpuset_update_task_memory_state(); | ||
1865 | cs->flags = 0; | 1789 | cs->flags = 0; |
1866 | if (is_spread_page(parent)) | 1790 | if (is_spread_page(parent)) |
1867 | set_bit(CS_SPREAD_PAGE, &cs->flags); | 1791 | set_bit(CS_SPREAD_PAGE, &cs->flags); |
@@ -1870,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1870 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 1794 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
1871 | cpumask_clear(cs->cpus_allowed); | 1795 | cpumask_clear(cs->cpus_allowed); |
1872 | nodes_clear(cs->mems_allowed); | 1796 | nodes_clear(cs->mems_allowed); |
1873 | cs->mems_generation = cpuset_mems_generation++; | ||
1874 | fmeter_init(&cs->fmeter); | 1797 | fmeter_init(&cs->fmeter); |
1875 | cs->relax_domain_level = -1; | 1798 | cs->relax_domain_level = -1; |
1876 | 1799 | ||
@@ -1889,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1889 | { | 1812 | { |
1890 | struct cpuset *cs = cgroup_cs(cont); | 1813 | struct cpuset *cs = cgroup_cs(cont); |
1891 | 1814 | ||
1892 | cpuset_update_task_memory_state(); | ||
1893 | |||
1894 | if (is_sched_load_balance(cs)) | 1815 | if (is_sched_load_balance(cs)) |
1895 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | 1816 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); |
1896 | 1817 | ||
@@ -1911,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = { | |||
1911 | .early_init = 1, | 1832 | .early_init = 1, |
1912 | }; | 1833 | }; |
1913 | 1834 | ||
1914 | /* | ||
1915 | * cpuset_init_early - just enough so that the calls to | ||
1916 | * cpuset_update_task_memory_state() in early init code | ||
1917 | * are harmless. | ||
1918 | */ | ||
1919 | |||
1920 | int __init cpuset_init_early(void) | ||
1921 | { | ||
1922 | alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT); | ||
1923 | |||
1924 | top_cpuset.mems_generation = cpuset_mems_generation++; | ||
1925 | return 0; | ||
1926 | } | ||
1927 | |||
1928 | |||
1929 | /** | 1835 | /** |
1930 | * cpuset_init - initialize cpusets at system boot | 1836 | * cpuset_init - initialize cpusets at system boot |
1931 | * | 1837 | * |
@@ -1936,11 +1842,13 @@ int __init cpuset_init(void) | |||
1936 | { | 1842 | { |
1937 | int err = 0; | 1843 | int err = 0; |
1938 | 1844 | ||
1845 | if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) | ||
1846 | BUG(); | ||
1847 | |||
1939 | cpumask_setall(top_cpuset.cpus_allowed); | 1848 | cpumask_setall(top_cpuset.cpus_allowed); |
1940 | nodes_setall(top_cpuset.mems_allowed); | 1849 | nodes_setall(top_cpuset.mems_allowed); |
1941 | 1850 | ||
1942 | fmeter_init(&top_cpuset.fmeter); | 1851 | fmeter_init(&top_cpuset.fmeter); |
1943 | top_cpuset.mems_generation = cpuset_mems_generation++; | ||
1944 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); | 1852 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); |
1945 | top_cpuset.relax_domain_level = -1; | 1853 | top_cpuset.relax_domain_level = -1; |
1946 | 1854 | ||