diff options
-rw-r--r-- | kernel/cpuset.c | 128 |
1 files changed, 43 insertions, 85 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 20cb3916c66c..918bee9dc7a2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -65,7 +65,7 @@ | |||
65 | */ | 65 | */ |
66 | int number_of_cpusets __read_mostly; | 66 | int number_of_cpusets __read_mostly; |
67 | 67 | ||
68 | /* Retrieve the cpuset from a cgroup */ | 68 | /* Forward declare cgroup structures */ |
69 | struct cgroup_subsys cpuset_subsys; | 69 | struct cgroup_subsys cpuset_subsys; |
70 | struct cpuset; | 70 | struct cpuset; |
71 | 71 | ||
@@ -167,17 +167,17 @@ static inline int is_spread_slab(const struct cpuset *cs) | |||
167 | * number, and avoid having to lock and reload mems_allowed unless | 167 | * number, and avoid having to lock and reload mems_allowed unless |
168 | * the cpuset they're using changes generation. | 168 | * the cpuset they're using changes generation. |
169 | * | 169 | * |
170 | * A single, global generation is needed because attach_task() could | 170 | * A single, global generation is needed because cpuset_attach_task() could |
171 | * reattach a task to a different cpuset, which must not have its | 171 | * reattach a task to a different cpuset, which must not have its |
172 | * generation numbers aliased with those of that tasks previous cpuset. | 172 | * generation numbers aliased with those of that tasks previous cpuset. |
173 | * | 173 | * |
174 | * Generations are needed for mems_allowed because one task cannot | 174 | * Generations are needed for mems_allowed because one task cannot |
175 | * modify anothers memory placement. So we must enable every task, | 175 | * modify another's memory placement. So we must enable every task, |
176 | * on every visit to __alloc_pages(), to efficiently check whether | 176 | * on every visit to __alloc_pages(), to efficiently check whether |
177 | * its current->cpuset->mems_allowed has changed, requiring an update | 177 | * its current->cpuset->mems_allowed has changed, requiring an update |
178 | * of its current->mems_allowed. | 178 | * of its current->mems_allowed. |
179 | * | 179 | * |
180 | * Since cpuset_mems_generation is guarded by manage_mutex, | 180 | * Since writes to cpuset_mems_generation are guarded by the cgroup lock |
181 | * there is no need to mark it atomic. | 181 | * there is no need to mark it atomic. |
182 | */ | 182 | */ |
183 | static int cpuset_mems_generation; | 183 | static int cpuset_mems_generation; |
@@ -189,17 +189,20 @@ static struct cpuset top_cpuset = { | |||
189 | }; | 189 | }; |
190 | 190 | ||
191 | /* | 191 | /* |
192 | * We have two global cpuset mutexes below. They can nest. | 192 | * There are two global mutexes guarding cpuset structures. The first |
193 | * It is ok to first take manage_mutex, then nest callback_mutex. We also | 193 | * is the main control groups cgroup_mutex, accessed via |
194 | * require taking task_lock() when dereferencing a tasks cpuset pointer. | 194 | * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific |
195 | * See "The task_lock() exception", at the end of this comment. | 195 | * callback_mutex, below. They can nest. It is ok to first take |
196 | * cgroup_mutex, then nest callback_mutex. We also require taking | ||
197 | * task_lock() when dereferencing a task's cpuset pointer. See "The | ||
198 | * task_lock() exception", at the end of this comment. | ||
196 | * | 199 | * |
197 | * A task must hold both mutexes to modify cpusets. If a task | 200 | * A task must hold both mutexes to modify cpusets. If a task |
198 | * holds manage_mutex, then it blocks others wanting that mutex, | 201 | * holds cgroup_mutex, then it blocks others wanting that mutex, |
199 | * ensuring that it is the only task able to also acquire callback_mutex | 202 | * ensuring that it is the only task able to also acquire callback_mutex |
200 | * and be able to modify cpusets. It can perform various checks on | 203 | * and be able to modify cpusets. It can perform various checks on |
201 | * the cpuset structure first, knowing nothing will change. It can | 204 | * the cpuset structure first, knowing nothing will change. It can |
202 | * also allocate memory while just holding manage_mutex. While it is | 205 | * also allocate memory while just holding cgroup_mutex. While it is |
203 | * performing these checks, various callback routines can briefly | 206 | * performing these checks, various callback routines can briefly |
204 | * acquire callback_mutex to query cpusets. Once it is ready to make | 207 | * acquire callback_mutex to query cpusets. Once it is ready to make |
205 | * the changes, it takes callback_mutex, blocking everyone else. | 208 | * the changes, it takes callback_mutex, blocking everyone else. |
@@ -215,60 +218,16 @@ static struct cpuset top_cpuset = { | |||
215 | * The task_struct fields mems_allowed and mems_generation may only | 218 | * The task_struct fields mems_allowed and mems_generation may only |
216 | * be accessed in the context of that task, so require no locks. | 219 | * be accessed in the context of that task, so require no locks. |
217 | * | 220 | * |
218 | * Any task can increment and decrement the count field without lock. | ||
219 | * So in general, code holding manage_mutex or callback_mutex can't rely | ||
220 | * on the count field not changing. However, if the count goes to | ||
221 | * zero, then only attach_task(), which holds both mutexes, can | ||
222 | * increment it again. Because a count of zero means that no tasks | ||
223 | * are currently attached, therefore there is no way a task attached | ||
224 | * to that cpuset can fork (the other way to increment the count). | ||
225 | * So code holding manage_mutex or callback_mutex can safely assume that | ||
226 | * if the count is zero, it will stay zero. Similarly, if a task | ||
227 | * holds manage_mutex or callback_mutex on a cpuset with zero count, it | ||
228 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs | ||
229 | * both of those mutexes. | ||
230 | * | ||
231 | * The cpuset_common_file_write handler for operations that modify | 221 | * The cpuset_common_file_write handler for operations that modify |
232 | * the cpuset hierarchy holds manage_mutex across the entire operation, | 222 | * the cpuset hierarchy holds cgroup_mutex across the entire operation, |
233 | * single threading all such cpuset modifications across the system. | 223 | * single threading all such cpuset modifications across the system. |
234 | * | 224 | * |
235 | * The cpuset_common_file_read() handlers only hold callback_mutex across | 225 | * The cpuset_common_file_read() handlers only hold callback_mutex across |
236 | * small pieces of code, such as when reading out possibly multi-word | 226 | * small pieces of code, such as when reading out possibly multi-word |
237 | * cpumasks and nodemasks. | 227 | * cpumasks and nodemasks. |
238 | * | 228 | * |
239 | * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't | 229 | * Accessing a task's cpuset should be done in accordance with the |
240 | * (usually) take either mutex. These are the two most performance | 230 | * guidelines for accessing subsystem state in kernel/cgroup.c |
241 | * critical pieces of code here. The exception occurs on cpuset_exit(), | ||
242 | * when a task in a notify_on_release cpuset exits. Then manage_mutex | ||
243 | * is taken, and if the cpuset count is zero, a usermode call made | ||
244 | * to /sbin/cpuset_release_agent with the name of the cpuset (path | ||
245 | * relative to the root of cpuset file system) as the argument. | ||
246 | * | ||
247 | * A cpuset can only be deleted if both its 'count' of using tasks | ||
248 | * is zero, and its list of 'children' cpusets is empty. Since all | ||
249 | * tasks in the system use _some_ cpuset, and since there is always at | ||
250 | * least one task in the system (init), therefore, top_cpuset | ||
251 | * always has either children cpusets and/or using tasks. So we don't | ||
252 | * need a special hack to ensure that top_cpuset cannot be deleted. | ||
253 | * | ||
254 | * The above "Tale of Two Semaphores" would be complete, but for: | ||
255 | * | ||
256 | * The task_lock() exception | ||
257 | * | ||
258 | * The need for this exception arises from the action of attach_task(), | ||
259 | * which overwrites one tasks cpuset pointer with another. It does | ||
260 | * so using both mutexes, however there are several performance | ||
261 | * critical places that need to reference task->cpuset without the | ||
262 | * expense of grabbing a system global mutex. Therefore except as | ||
263 | * noted below, when dereferencing or, as in attach_task(), modifying | ||
264 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock | ||
265 | * (task->alloc_lock) already in the task_struct routinely used for | ||
266 | * such matters. | ||
267 | * | ||
268 | * P.S. One more locking exception. RCU is used to guard the | ||
269 | * update of a tasks cpuset pointer by attach_task() and the | ||
270 | * access of task->cpuset->mems_generation via that pointer in | ||
271 | * the routine cpuset_update_task_memory_state(). | ||
272 | */ | 231 | */ |
273 | 232 | ||
274 | static DEFINE_MUTEX(callback_mutex); | 233 | static DEFINE_MUTEX(callback_mutex); |
@@ -361,15 +320,14 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
361 | * Do not call this routine if in_interrupt(). | 320 | * Do not call this routine if in_interrupt(). |
362 | * | 321 | * |
363 | * Call without callback_mutex or task_lock() held. May be | 322 | * Call without callback_mutex or task_lock() held. May be |
364 | * called with or without manage_mutex held. Thanks in part to | 323 | * called with or without cgroup_mutex held. Thanks in part to |
365 | * 'the_top_cpuset_hack', the tasks cpuset pointer will never | 324 | * 'the_top_cpuset_hack', the task's cpuset pointer will never |
366 | * be NULL. This routine also might acquire callback_mutex and | 325 | * be NULL. This routine also might acquire callback_mutex and |
367 | * current->mm->mmap_sem during call. | 326 | * current->mm->mmap_sem during call. |
368 | * | 327 | * |
369 | * Reading current->cpuset->mems_generation doesn't need task_lock | 328 | * Reading current->cpuset->mems_generation doesn't need task_lock |
370 | * to guard the current->cpuset dereference, because it is guarded | 329 | * to guard the current->cpuset dereference, because it is guarded |
371 | * from concurrent freeing of current->cpuset by attach_task(), | 330 | * from concurrent freeing of current->cpuset using RCU. |
372 | * using RCU. | ||
373 | * | 331 | * |
374 | * The rcu_dereference() is technically probably not needed, | 332 | * The rcu_dereference() is technically probably not needed, |
375 | * as I don't actually mind if I see a new cpuset pointer but | 333 | * as I don't actually mind if I see a new cpuset pointer but |
@@ -431,7 +389,7 @@ void cpuset_update_task_memory_state(void) | |||
431 | * | 389 | * |
432 | * One cpuset is a subset of another if all its allowed CPUs and | 390 | * One cpuset is a subset of another if all its allowed CPUs and |
433 | * Memory Nodes are a subset of the other, and its exclusive flags | 391 | * Memory Nodes are a subset of the other, and its exclusive flags |
434 | * are only set if the other's are set. Call holding manage_mutex. | 392 | * are only set if the other's are set. Call holding cgroup_mutex. |
435 | */ | 393 | */ |
436 | 394 | ||
437 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 395 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
@@ -449,7 +407,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
449 | * If we replaced the flag and mask values of the current cpuset | 407 | * If we replaced the flag and mask values of the current cpuset |
450 | * (cur) with those values in the trial cpuset (trial), would | 408 | * (cur) with those values in the trial cpuset (trial), would |
451 | * our various subset and exclusive rules still be valid? Presumes | 409 | * our various subset and exclusive rules still be valid? Presumes |
452 | * manage_mutex held. | 410 | * cgroup_mutex held. |
453 | * | 411 | * |
454 | * 'cur' is the address of an actual, in-use cpuset. Operations | 412 | * 'cur' is the address of an actual, in-use cpuset. Operations |
455 | * such as list traversal that depend on the actual address of the | 413 | * such as list traversal that depend on the actual address of the |
@@ -483,7 +441,10 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
483 | if (!is_cpuset_subset(trial, par)) | 441 | if (!is_cpuset_subset(trial, par)) |
484 | return -EACCES; | 442 | return -EACCES; |
485 | 443 | ||
486 | /* If either I or some sibling (!= me) is exclusive, we can't overlap */ | 444 | /* |
445 | * If either I or some sibling (!= me) is exclusive, we can't | ||
446 | * overlap | ||
447 | */ | ||
487 | list_for_each_entry(cont, &par->css.cgroup->children, sibling) { | 448 | list_for_each_entry(cont, &par->css.cgroup->children, sibling) { |
488 | c = cgroup_cs(cont); | 449 | c = cgroup_cs(cont); |
489 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | 450 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && |
@@ -744,7 +705,7 @@ static inline int started_after(void *p1, void *p2) | |||
744 | * @tsk: task to test | 705 | * @tsk: task to test |
745 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner | 706 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner |
746 | * | 707 | * |
747 | * Call with manage_mutex held. May take callback_mutex during call. | 708 | * Call with cgroup_mutex held. May take callback_mutex during call. |
748 | * Called for each task in a cgroup by cgroup_scan_tasks(). | 709 | * Called for each task in a cgroup by cgroup_scan_tasks(). |
749 | * Return nonzero if this task's cpus_allowed mask should be changed (in other | 710 | * Return nonzero if this task's cpus_allowed mask should be changed (in other |
750 | * words, if its mask is not equal to its cpuset's mask). | 711 | * words, if its mask is not equal to its cpuset's mask). |
@@ -847,11 +808,11 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
847 | * Temporarily set task's mems_allowed to target nodes of migration, | 808 | * Temporarily set task's mems_allowed to target nodes of migration, |
848 | * so that the migration code can allocate pages on these nodes. | 809 | * so that the migration code can allocate pages on these nodes. |
849 | * | 810 | * |
850 | * Call holding manage_mutex, so our current->cpuset won't change | 811 | * Call holding cgroup_mutex, so current's cpuset won't change |
851 | * during this call, as manage_mutex holds off any attach_task() | 812 | * during this call, as cgroup_mutex holds off any attach_task() |
852 | * calls. Therefore we don't need to take task_lock around the | 813 | * calls. Therefore we don't need to take task_lock around the |
853 | * call to guarantee_online_mems(), as we know no one is changing | 814 | * call to guarantee_online_mems(), as we know no one is changing |
854 | * our tasks cpuset. | 815 | * our task's cpuset. |
855 | * | 816 | * |
856 | * Hold callback_mutex around the two modifications of our tasks | 817 | * Hold callback_mutex around the two modifications of our tasks |
857 | * mems_allowed to synchronize with cpuset_mems_allowed(). | 818 | * mems_allowed to synchronize with cpuset_mems_allowed(). |
@@ -896,7 +857,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | |||
896 | * the cpuset is marked 'memory_migrate', migrate the tasks | 857 | * the cpuset is marked 'memory_migrate', migrate the tasks |
897 | * pages to the new memory. | 858 | * pages to the new memory. |
898 | * | 859 | * |
899 | * Call with manage_mutex held. May take callback_mutex during call. | 860 | * Call with cgroup_mutex held. May take callback_mutex during call. |
900 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | 861 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
901 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | 862 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind |
902 | * their mempolicies to the cpusets new mems_allowed. | 863 | * their mempolicies to the cpusets new mems_allowed. |
@@ -1009,7 +970,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
1009 | * tasklist_lock. Forks can happen again now - the mpol_copy() | 970 | * tasklist_lock. Forks can happen again now - the mpol_copy() |
1010 | * cpuset_being_rebound check will catch such forks, and rebind | 971 | * cpuset_being_rebound check will catch such forks, and rebind |
1011 | * their vma mempolicies too. Because we still hold the global | 972 | * their vma mempolicies too. Because we still hold the global |
1012 | * cpuset manage_mutex, we know that no other rebind effort will | 973 | * cgroup_mutex, we know that no other rebind effort will |
1013 | * be contending for the global variable cpuset_being_rebound. | 974 | * be contending for the global variable cpuset_being_rebound. |
1014 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 975 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
1015 | * is idempotent. Also migrate pages in each mm to new nodes. | 976 | * is idempotent. Also migrate pages in each mm to new nodes. |
@@ -1024,7 +985,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
1024 | mmput(mm); | 985 | mmput(mm); |
1025 | } | 986 | } |
1026 | 987 | ||
1027 | /* We're done rebinding vma's to this cpusets new mems_allowed. */ | 988 | /* We're done rebinding vmas to this cpuset's new mems_allowed. */ |
1028 | kfree(mmarray); | 989 | kfree(mmarray); |
1029 | cpuset_being_rebound = NULL; | 990 | cpuset_being_rebound = NULL; |
1030 | retval = 0; | 991 | retval = 0; |
@@ -1038,7 +999,7 @@ int current_cpuset_is_being_rebound(void) | |||
1038 | } | 999 | } |
1039 | 1000 | ||
1040 | /* | 1001 | /* |
1041 | * Call with manage_mutex held. | 1002 | * Call with cgroup_mutex held. |
1042 | */ | 1003 | */ |
1043 | 1004 | ||
1044 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | 1005 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) |
@@ -1059,7 +1020,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | |||
1059 | * cs: the cpuset to update | 1020 | * cs: the cpuset to update |
1060 | * buf: the buffer where we read the 0 or 1 | 1021 | * buf: the buffer where we read the 0 or 1 |
1061 | * | 1022 | * |
1062 | * Call with manage_mutex held. | 1023 | * Call with cgroup_mutex held. |
1063 | */ | 1024 | */ |
1064 | 1025 | ||
1065 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | 1026 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) |
@@ -1193,6 +1154,7 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
1193 | return val; | 1154 | return val; |
1194 | } | 1155 | } |
1195 | 1156 | ||
1157 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ | ||
1196 | static int cpuset_can_attach(struct cgroup_subsys *ss, | 1158 | static int cpuset_can_attach(struct cgroup_subsys *ss, |
1197 | struct cgroup *cont, struct task_struct *tsk) | 1159 | struct cgroup *cont, struct task_struct *tsk) |
1198 | { | 1160 | { |
@@ -1540,7 +1502,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1540 | * If this becomes a problem for some users who wish to | 1502 | * If this becomes a problem for some users who wish to |
1541 | * allow that scenario, then cpuset_post_clone() could be | 1503 | * allow that scenario, then cpuset_post_clone() could be |
1542 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | 1504 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive |
1543 | * (and likewise for mems) to the new cgroup. | 1505 | * (and likewise for mems) to the new cgroup. Called with cgroup_mutex |
1506 | * held. | ||
1544 | */ | 1507 | */ |
1545 | static void cpuset_post_clone(struct cgroup_subsys *ss, | 1508 | static void cpuset_post_clone(struct cgroup_subsys *ss, |
1546 | struct cgroup *cgroup) | 1509 | struct cgroup *cgroup) |
@@ -1564,11 +1527,8 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, | |||
1564 | 1527 | ||
1565 | /* | 1528 | /* |
1566 | * cpuset_create - create a cpuset | 1529 | * cpuset_create - create a cpuset |
1567 | * parent: cpuset that will be parent of the new cpuset. | 1530 | * ss: cpuset cgroup subsystem |
1568 | * name: name of the new cpuset. Will be strcpy'ed. | 1531 | * cont: control group that the new cpuset will be part of |
1569 | * mode: mode to set on new inode | ||
1570 | * | ||
1571 | * Must be called with the mutex on the parent inode held | ||
1572 | */ | 1532 | */ |
1573 | 1533 | ||
1574 | static struct cgroup_subsys_state *cpuset_create( | 1534 | static struct cgroup_subsys_state *cpuset_create( |
@@ -1769,7 +1729,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
1769 | * member tasks or cpuset descendants and cpus and memory, before it can | 1729 | * member tasks or cpuset descendants and cpus and memory, before it can |
1770 | * be a candidate for release. | 1730 | * be a candidate for release. |
1771 | * | 1731 | * |
1772 | * Called with manage_mutex held. We take callback_mutex to modify | 1732 | * Called with cgroup_mutex held. We take callback_mutex to modify |
1773 | * cpus_allowed and mems_allowed. | 1733 | * cpus_allowed and mems_allowed. |
1774 | * | 1734 | * |
1775 | * This walk processes the tree from top to bottom, completing one layer | 1735 | * This walk processes the tree from top to bottom, completing one layer |
@@ -1910,7 +1870,7 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) | |||
1910 | 1870 | ||
1911 | /** | 1871 | /** |
1912 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. | 1872 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. |
1913 | * Must be called with callback_mutex held. | 1873 | * Must be called with callback_mutex held. |
1914 | **/ | 1874 | **/ |
1915 | cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) | 1875 | cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) |
1916 | { | 1876 | { |
@@ -2247,10 +2207,8 @@ void __cpuset_memory_pressure_bump(void) | |||
2247 | * - Used for /proc/<pid>/cpuset. | 2207 | * - Used for /proc/<pid>/cpuset. |
2248 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | 2208 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it |
2249 | * doesn't really matter if tsk->cpuset changes after we read it, | 2209 | * doesn't really matter if tsk->cpuset changes after we read it, |
2250 | * and we take manage_mutex, keeping attach_task() from changing it | 2210 | * and we take cgroup_mutex, keeping attach_task() from changing it |
2251 | * anyway. No need to check that tsk->cpuset != NULL, thanks to | 2211 | * anyway. |
2252 | * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks | ||
2253 | * cpuset to top_cpuset. | ||
2254 | */ | 2212 | */ |
2255 | static int proc_cpuset_show(struct seq_file *m, void *unused_v) | 2213 | static int proc_cpuset_show(struct seq_file *m, void *unused_v) |
2256 | { | 2214 | { |