Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r-- | kernel/cpuset.c | 466 |
1 file changed, 302 insertions, 164 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 28176d083f7b..5a737ed9dac7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/kernel.h> | 32 | #include <linux/kernel.h> |
33 | #include <linux/kmod.h> | 33 | #include <linux/kmod.h> |
34 | #include <linux/list.h> | 34 | #include <linux/list.h> |
35 | #include <linux/mempolicy.h> | ||
35 | #include <linux/mm.h> | 36 | #include <linux/mm.h> |
36 | #include <linux/module.h> | 37 | #include <linux/module.h> |
37 | #include <linux/mount.h> | 38 | #include <linux/mount.h> |
@@ -60,6 +61,9 @@ struct cpuset { | |||
60 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 61 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
61 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 62 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
62 | 63 | ||
64 | /* | ||
65 | * Count is atomic so can incr (fork) or decr (exit) without a lock. | ||
66 | */ | ||
63 | atomic_t count; /* count tasks using this cpuset */ | 67 | atomic_t count; /* count tasks using this cpuset */ |
64 | 68 | ||
65 | /* | 69 | /* |
@@ -142,80 +146,91 @@ static struct vfsmount *cpuset_mount; | |||
142 | static struct super_block *cpuset_sb = NULL; | 146 | static struct super_block *cpuset_sb = NULL; |
143 | 147 | ||
144 | /* | 148 | /* |
145 | * cpuset_sem should be held by anyone who is depending on the children | 149 | * We have two global cpuset semaphores below. They can nest. |
146 | * or sibling lists of any cpuset, or performing non-atomic operations | 150 | * It is ok to first take manage_sem, then nest callback_sem. We also |
147 | * on the flags or *_allowed values of a cpuset, such as raising the | 151 | * require taking task_lock() when dereferencing a task's cpuset pointer. |
148 | * CS_REMOVED flag bit iff it is not already raised, or reading and | 152 | * See "The task_lock() exception", at the end of this comment. |
149 | * conditionally modifying the *_allowed values. One kernel global | 153 | * |
150 | * cpuset semaphore should be sufficient - these things don't change | 154 | * A task must hold both semaphores to modify cpusets. If a task |
151 | * that much. | 155 | * holds manage_sem, then it blocks others wanting that semaphore, |
152 | * | 156 | * ensuring that it is the only task able to also acquire callback_sem |
153 | * The code that modifies cpusets holds cpuset_sem across the entire | 157 | * and be able to modify cpusets. It can perform various checks on |
154 | * operation, from cpuset_common_file_write() down, single threading | 158 | * the cpuset structure first, knowing nothing will change. It can |
155 | * all cpuset modifications (except for counter manipulations from | 159 | * also allocate memory while just holding manage_sem. While it is |
156 | * fork and exit) across the system. This presumes that cpuset | 160 | * performing these checks, various callback routines can briefly |
157 | * modifications are rare - better kept simple and safe, even if slow. | 161 | * acquire callback_sem to query cpusets. Once it is ready to make |
158 | * | 162 | * the changes, it takes callback_sem, blocking everyone else. |
159 | * The code that reads cpusets, such as in cpuset_common_file_read() | 163 | * |
160 | * and below, only holds cpuset_sem across small pieces of code, such | 164 | * Calls to the kernel memory allocator can not be made while holding |
161 | * as when reading out possibly multi-word cpumasks and nodemasks, as | 165 | * callback_sem, as that would risk double tripping on callback_sem |
162 | * the risks are less, and the desire for performance a little greater. | 166 | * from one of the callbacks into the cpuset code from within |
163 | * The proc_cpuset_show() routine needs to hold cpuset_sem to insure | 167 | * __alloc_pages(). |
164 | * that no cs->dentry is NULL, as it walks up the cpuset tree to root. | 168 | * |
165 | * | 169 | * If a task is only holding callback_sem, then it has read-only |
166 | * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't | 170 | * access to cpusets. |
167 | * (usually) grab cpuset_sem. These are the two most performance | 171 | * |
168 | * critical pieces of code here. The exception occurs on exit(), | 172 | * The task_struct fields mems_allowed and mems_generation may only |
169 | * when a task in a notify_on_release cpuset exits. Then cpuset_sem | 173 | * be accessed in the context of that task, so require no locks. |
174 | * | ||
175 | * Any task can increment and decrement the count field without lock. | ||
176 | * So in general, code holding manage_sem or callback_sem can't rely | ||
177 | * on the count field not changing. However, if the count goes to | ||
178 | * zero, then only attach_task(), which holds both semaphores, can | ||
179 | * increment it again. A count of zero means that no tasks | ||
180 | * are currently attached, so there is no way a task attached | ||
181 | * to that cpuset can fork (the other way to increment the count). | ||
182 | * So code holding manage_sem or callback_sem can safely assume that | ||
183 | * if the count is zero, it will stay zero. Similarly, if a task | ||
184 | * holds manage_sem or callback_sem on a cpuset with zero count, it | ||
185 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs | ||
186 | * both of those semaphores. | ||
187 | * | ||
188 | * A possible optimization to improve parallelism would be to make | ||
189 | * callback_sem an R/W semaphore (rwsem), allowing the callback routines | ||
190 | * to proceed in parallel, with read access, until the holder of | ||
191 | * manage_sem needed to take this rwsem for exclusive write access | ||
192 | * and modify some cpusets. | ||
193 | * | ||
194 | * The cpuset_common_file_write handler for operations that modify | ||
195 | * the cpuset hierarchy holds manage_sem across the entire operation, | ||
196 | * single threading all such cpuset modifications across the system. | ||
197 | * | ||
198 | * The cpuset_common_file_read() handlers only hold callback_sem across | ||
199 | * small pieces of code, such as when reading out possibly multi-word | ||
200 | * cpumasks and nodemasks. | ||
201 | * | ||
202 | * The fork and exit callbacks, cpuset_fork() and cpuset_exit(), don't | ||
203 | * (usually) take either semaphore. These are the two most performance | ||
204 | * critical pieces of code here. The exception occurs on cpuset_exit(), | ||
205 | * when a task in a notify_on_release cpuset exits. Then manage_sem | ||
170 | * is taken, and if the cpuset count is zero, a usermode call made | 206 | * is taken, and if the cpuset count is zero, a usermode call made |
171 | * to /sbin/cpuset_release_agent with the name of the cpuset (path | 207 | * to /sbin/cpuset_release_agent with the name of the cpuset (path |
172 | * relative to the root of cpuset file system) as the argument. | 208 | * relative to the root of cpuset file system) as the argument. |
173 | * | 209 | * |
174 | * A cpuset can only be deleted if both its 'count' of using tasks is | 210 | * A cpuset can only be deleted if both its 'count' of using tasks |
175 | * zero, and its list of 'children' cpusets is empty. Since all tasks | 211 | * is zero, and its list of 'children' cpusets is empty. Since all |
176 | * in the system use _some_ cpuset, and since there is always at least | 212 | * tasks in the system use _some_ cpuset, and since there is always at |
177 | * one task in the system (init, pid == 1), therefore, top_cpuset | 213 | * least one task in the system (init, pid == 1), therefore, top_cpuset |
178 | * always has either children cpusets and/or using tasks. So no need | 214 | * always has either children cpusets and/or using tasks. So we don't |
179 | * for any special hack to ensure that top_cpuset cannot be deleted. | 215 | * need a special hack to ensure that top_cpuset cannot be deleted. |
216 | * | ||
217 | * The above "Tale of Two Semaphores" would be complete, but for: | ||
218 | * | ||
219 | * The task_lock() exception | ||
220 | * | ||
221 | * The need for this exception arises from the action of attach_task(), | ||
222 | * which overwrites one task's cpuset pointer with another. It does | ||
223 | * so using both semaphores, however there are several performance | ||
224 | * critical places that need to reference task->cpuset without the | ||
225 | * expense of grabbing a system global semaphore. Therefore except as | ||
226 | * noted below, when dereferencing or, as in attach_task(), modifying | ||
227 | * a task's cpuset pointer, we use task_lock(), which acts on a spinlock | ||
228 | * (task->alloc_lock) already in the task_struct routinely used for | ||
229 | * such matters. | ||
180 | */ | 230 | */ |
181 | 231 | ||
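To make "The task_lock() exception" above concrete, the safe dereference pattern in minimal form (an editorial sketch, not part of the patch; refresh_mems() below uses exactly this shape):

	struct cpuset *cs;

	task_lock(current);	/* attach_task() cannot swap the pointer now */
	cs = current->cpuset;
	/* ... read *cs; while we stay attached, count > 0, so
	 * cpuset_rmdir() cannot remove and free it ... */
	task_unlock(current);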
182 | static DECLARE_MUTEX(cpuset_sem); | 232 | static DECLARE_MUTEX(manage_sem); |
183 | static struct task_struct *cpuset_sem_owner; | 233 | static DECLARE_MUTEX(callback_sem); |
184 | static int cpuset_sem_depth; | ||
185 | |||
186 | /* | ||
187 | * The global cpuset semaphore cpuset_sem can be needed by the | ||
188 | * memory allocator to update a tasks mems_allowed (see the calls | ||
189 | * to cpuset_update_current_mems_allowed()) or to walk up the | ||
190 | * cpuset hierarchy to find a mem_exclusive cpuset see the calls | ||
191 | * to cpuset_excl_nodes_overlap()). | ||
192 | * | ||
193 | * But if the memory allocation is being done by cpuset.c code, it | ||
194 | * usually already holds cpuset_sem. Double tripping on a kernel | ||
195 | * semaphore deadlocks the current task, and any other task that | ||
196 | * subsequently tries to obtain the lock. | ||
197 | * | ||
198 | * Run all up's and down's on cpuset_sem through the following | ||
199 | * wrappers, which will detect this nested locking, and avoid | ||
200 | * deadlocking. | ||
201 | */ | ||
202 | |||
203 | static inline void cpuset_down(struct semaphore *psem) | ||
204 | { | ||
205 | if (cpuset_sem_owner != current) { | ||
206 | down(psem); | ||
207 | cpuset_sem_owner = current; | ||
208 | } | ||
209 | cpuset_sem_depth++; | ||
210 | } | ||
211 | |||
212 | static inline void cpuset_up(struct semaphore *psem) | ||
213 | { | ||
214 | if (--cpuset_sem_depth == 0) { | ||
215 | cpuset_sem_owner = NULL; | ||
216 | up(psem); | ||
217 | } | ||
218 | } | ||
219 | 234 | ||
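A minimal sketch of the modify-side ordering prescribed by the comment above (editorial illustration; the helper name is hypothetical, not from the patch):

	static void example_modify_cpuset(struct cpuset *cs)
	{
		down(&manage_sem);	/* single-threads all modifiers */
		/* validate the change; kmalloc() is still legal here */
		down(&callback_sem);	/* now exclude the callback readers too */
		/* write cs->cpus_allowed, cs->mems_allowed, cs->flags */
		up(&callback_sem);
		up(&manage_sem);
	}

The rwsem optimization floated in the comment would amount to swapping callback_sem for an rwsem: down_read() in the callbacks, down_write() at the point marked above.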
220 | /* | 235 | /* |
221 | * A couple of forward declarations required, due to cyclic reference loop: | 236 | * A couple of forward declarations required, due to cyclic reference loop: |
@@ -390,7 +405,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) | |||
390 | } | 405 | } |
391 | 406 | ||
392 | /* | 407 | /* |
393 | * Call with cpuset_sem held. Writes path of cpuset into buf. | 408 | * Call with manage_sem held. Writes path of cpuset into buf. |
394 | * Returns 0 on success, -errno on error. | 409 | * Returns 0 on success, -errno on error. |
395 | */ | 410 | */ |
396 | 411 | ||
@@ -442,10 +457,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) | |||
442 | * status of the /sbin/cpuset_release_agent task, so no sense holding | 457 | * status of the /sbin/cpuset_release_agent task, so no sense holding |
443 | * our caller up for that. | 458 | * our caller up for that. |
444 | * | 459 | * |
445 | * The simple act of forking that task might require more memory, | 460 | * When we had only one cpuset semaphore, we had to call this |
446 | * which might need cpuset_sem. So this routine must be called while | 461 | * without holding it, to avoid deadlock when call_usermodehelper() |
447 | * cpuset_sem is not held, to avoid a possible deadlock. See also | 462 | * allocated memory. With two locks, we could now call this while |
448 | * comments for check_for_release(), below. | 463 | * holding manage_sem, but we still don't, so as to minimize |
464 | * the time manage_sem is held. | ||
449 | */ | 465 | */ |
450 | 466 | ||
451 | static void cpuset_release_agent(const char *pathbuf) | 467 | static void cpuset_release_agent(const char *pathbuf) |
@@ -477,15 +493,15 @@ static void cpuset_release_agent(const char *pathbuf) | |||
477 | * cs is notify_on_release() and now both the user count is zero and | 493 | * cs is notify_on_release() and now both the user count is zero and |
478 | * the list of children is empty, prepare cpuset path in a kmalloc'd | 494 | * the list of children is empty, prepare cpuset path in a kmalloc'd |
479 | * buffer, to be returned via ppathbuf, so that the caller can invoke | 495 | * buffer, to be returned via ppathbuf, so that the caller can invoke |
480 | * cpuset_release_agent() with it later on, once cpuset_sem is dropped. | 496 | * cpuset_release_agent() with it later on, once manage_sem is dropped. |
481 | * Call here with cpuset_sem held. | 497 | * Call here with manage_sem held. |
482 | * | 498 | * |
483 | * This check_for_release() routine is responsible for kmalloc'ing | 499 | * This check_for_release() routine is responsible for kmalloc'ing |
484 | * pathbuf. The above cpuset_release_agent() is responsible for | 500 | * pathbuf. The above cpuset_release_agent() is responsible for |
485 | * kfree'ing pathbuf. The caller of these routines is responsible | 501 | * kfree'ing pathbuf. The caller of these routines is responsible |
486 | * for providing a pathbuf pointer, initialized to NULL, then | 502 | * for providing a pathbuf pointer, initialized to NULL, then |
487 | * calling check_for_release() with cpuset_sem held and the address | 503 | * calling check_for_release() with manage_sem held and the address |
488 | * of the pathbuf pointer, then dropping cpuset_sem, then calling | 504 | * of the pathbuf pointer, then dropping manage_sem, then calling |
489 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). | 505 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). |
490 | */ | 506 | */ |
491 | 507 | ||
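Assembled in one place, the calling protocol just described looks like this (editorial sketch; cpuset_exit() further down follows the same shape):

	char *pathbuf = NULL;

	down(&manage_sem);
	if (atomic_dec_and_test(&cs->count))
		check_for_release(cs, &pathbuf);	/* may kmalloc() the path */
	up(&manage_sem);
	cpuset_release_agent(pathbuf);	/* runs the helper, kfree()s pathbuf */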
@@ -516,7 +532,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) | |||
516 | * One way or another, we guarantee to return some non-empty subset | 532 | * One way or another, we guarantee to return some non-empty subset |
517 | * of cpu_online_map. | 533 | * of cpu_online_map. |
518 | * | 534 | * |
519 | * Call with cpuset_sem held. | 535 | * Call with callback_sem held. |
520 | */ | 536 | */ |
521 | 537 | ||
522 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | 538 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) |
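The body of guarantee_online_cpus() is elided by the diff context; for orientation, it amounts to roughly the following (editorial sketch reconstructed from the description above, not shown in the hunk):

	while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
		cs = cs->parent;	/* climb until some allowed CPU is online */
	if (cs)
		cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
	else
		*pmask = cpu_online_map;	/* fell off the root: use them all */

guarantee_online_mems() mirrors this with nodemask_t and node_online_map.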
@@ -540,7 +556,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | |||
540 | * One way or another, we guarantee to return some non-empty subset | 556 | * One way or another, we guarantee to return some non-empty subset |
541 | * of node_online_map. | 557 | * of node_online_map. |
542 | * | 558 | * |
543 | * Call with cpuset_sem held. | 559 | * Call with callback_sem held. |
544 | */ | 560 | */ |
545 | 561 | ||
546 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 562 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
@@ -555,22 +571,47 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
555 | } | 571 | } |
556 | 572 | ||
557 | /* | 573 | /* |
557 | * Refresh current tasks mems_allowed and mems_generation from | 574 | * Refresh current task's mems_allowed and mems_generation from current |
558 | * current tasks cpuset. Call with cpuset_sem held. | 575 | * task's cpuset. |
560 | * | 576 | * |
561 | * This routine is needed to update the per-task mems_allowed | 577 | * Call without callback_sem or task_lock() held. May be called with |
562 | * data, within the tasks context, when it is trying to allocate | 578 | * or without manage_sem held. Will acquire task_lock() and might |
563 | * memory (in various mm/mempolicy.c routines) and notices | 579 | * acquire callback_sem during call. |
564 | * that some other task has been modifying its cpuset. | 580 | * |
581 | * The task_lock() is required to dereference current->cpuset safely. | ||
582 | * Without it, we could pick up the pointer value of current->cpuset | ||
583 | * in one instruction, and then attach_task could give us a different | ||
584 | * cpuset, and then the cpuset we had could be removed and freed, | ||
585 | * and then on our next instruction, we could dereference a no longer | ||
586 | * valid cpuset pointer to get its mems_generation field. | ||
587 | * | ||
588 | * This routine is needed to update the per-task mems_allowed data, | ||
589 | * within the task's context, when it is trying to allocate memory | ||
590 | * (in various mm/mempolicy.c routines) and notices that some other | ||
591 | * task has been modifying its cpuset. | ||
565 | */ | 592 | */ |
566 | 593 | ||
567 | static void refresh_mems(void) | 594 | static void refresh_mems(void) |
568 | { | 595 | { |
569 | struct cpuset *cs = current->cpuset; | 596 | int my_cpusets_mem_gen; |
597 | |||
598 | task_lock(current); | ||
599 | my_cpusets_mem_gen = current->cpuset->mems_generation; | ||
600 | task_unlock(current); | ||
570 | 601 | ||
571 | if (current->cpuset_mems_generation != cs->mems_generation) { | 602 | if (current->cpuset_mems_generation != my_cpusets_mem_gen) { |
603 | struct cpuset *cs; | ||
604 | nodemask_t oldmem = current->mems_allowed; | ||
605 | |||
606 | down(&callback_sem); | ||
607 | task_lock(current); | ||
608 | cs = current->cpuset; | ||
572 | guarantee_online_mems(cs, ¤t->mems_allowed); | 609 | guarantee_online_mems(cs, ¤t->mems_allowed); |
573 | current->cpuset_mems_generation = cs->mems_generation; | 610 | current->cpuset_mems_generation = cs->mems_generation; |
611 | task_unlock(current); | ||
612 | up(&callback_sem); | ||
613 | if (!nodes_equal(oldmem, current->mems_allowed)) | ||
614 | numa_policy_rebind(&oldmem, ¤t->mems_allowed); | ||
574 | } | 615 | } |
575 | } | 616 | } |
576 | 617 | ||
@@ -579,7 +620,7 @@ static void refresh_mems(void) | |||
579 | * | 620 | * |
580 | * One cpuset is a subset of another if all its allowed CPUs and | 621 | * One cpuset is a subset of another if all its allowed CPUs and |
581 | * Memory Nodes are a subset of the other, and its exclusive flags | 622 | * Memory Nodes are a subset of the other, and its exclusive flags |
582 | * are only set if the other's are set. | 623 | * are only set if the other's are set. Call holding manage_sem. |
583 | */ | 624 | */ |
584 | 625 | ||
585 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 626 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
@@ -597,7 +638,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
597 | * If we replaced the flag and mask values of the current cpuset | 638 | * If we replaced the flag and mask values of the current cpuset |
598 | * (cur) with those values in the trial cpuset (trial), would | 639 | * (cur) with those values in the trial cpuset (trial), would |
599 | * our various subset and exclusive rules still be valid? Presumes | 640 | * our various subset and exclusive rules still be valid? Presumes |
600 | * cpuset_sem held. | 641 | * manage_sem held. |
601 | * | 642 | * |
602 | * 'cur' is the address of an actual, in-use cpuset. Operations | 643 | * 'cur' is the address of an actual, in-use cpuset. Operations |
603 | * such as list traversal that depend on the actual address of the | 644 | * such as list traversal that depend on the actual address of the |
@@ -651,7 +692,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
651 | * exclusive child cpusets | 692 | * exclusive child cpusets |
652 | * Build these two partitions by calling partition_sched_domains | 693 | * Build these two partitions by calling partition_sched_domains |
653 | * | 694 | * |
654 | * Call with cpuset_sem held. May nest a call to the | 695 | * Call with manage_sem held. May nest a call to the |
655 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 696 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. |
656 | */ | 697 | */ |
657 | 698 | ||
@@ -696,6 +737,10 @@ static void update_cpu_domains(struct cpuset *cur) | |||
696 | unlock_cpu_hotplug(); | 737 | unlock_cpu_hotplug(); |
697 | } | 738 | } |
698 | 739 | ||
740 | /* | ||
741 | * Call with manage_sem held. May take callback_sem during call. | ||
742 | */ | ||
743 | |||
699 | static int update_cpumask(struct cpuset *cs, char *buf) | 744 | static int update_cpumask(struct cpuset *cs, char *buf) |
700 | { | 745 | { |
701 | struct cpuset trialcs; | 746 | struct cpuset trialcs; |
@@ -712,12 +757,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
712 | if (retval < 0) | 757 | if (retval < 0) |
713 | return retval; | 758 | return retval; |
714 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | 759 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); |
760 | down(&callback_sem); | ||
715 | cs->cpus_allowed = trialcs.cpus_allowed; | 761 | cs->cpus_allowed = trialcs.cpus_allowed; |
762 | up(&callback_sem); | ||
716 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | 763 | if (is_cpu_exclusive(cs) && !cpus_unchanged) |
717 | update_cpu_domains(cs); | 764 | update_cpu_domains(cs); |
718 | return 0; | 765 | return 0; |
719 | } | 766 | } |
720 | 767 | ||
768 | /* | ||
769 | * Call with manage_sem held. May take callback_sem during call. | ||
770 | */ | ||
771 | |||
721 | static int update_nodemask(struct cpuset *cs, char *buf) | 772 | static int update_nodemask(struct cpuset *cs, char *buf) |
722 | { | 773 | { |
723 | struct cpuset trialcs; | 774 | struct cpuset trialcs; |
@@ -732,9 +783,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
732 | return -ENOSPC; | 783 | return -ENOSPC; |
733 | retval = validate_change(cs, &trialcs); | 784 | retval = validate_change(cs, &trialcs); |
734 | if (retval == 0) { | 785 | if (retval == 0) { |
786 | down(&callback_sem); | ||
735 | cs->mems_allowed = trialcs.mems_allowed; | 787 | cs->mems_allowed = trialcs.mems_allowed; |
736 | atomic_inc(&cpuset_mems_generation); | 788 | atomic_inc(&cpuset_mems_generation); |
737 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 789 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
790 | up(&callback_sem); | ||
738 | } | 791 | } |
739 | return retval; | 792 | return retval; |
740 | } | 793 | } |
@@ -745,6 +798,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
745 | * CS_NOTIFY_ON_RELEASE) | 798 | * CS_NOTIFY_ON_RELEASE) |
746 | * cs: the cpuset to update | 799 | * cs: the cpuset to update |
747 | * buf: the buffer where we read the 0 or 1 | 800 | * buf: the buffer where we read the 0 or 1 |
801 | * | ||
802 | * Call with manage_sem held. | ||
748 | */ | 803 | */ |
749 | 804 | ||
750 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | 805 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) |
@@ -766,16 +821,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
766 | return err; | 821 | return err; |
767 | cpu_exclusive_changed = | 822 | cpu_exclusive_changed = |
768 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); | 823 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
824 | down(&callback_sem); | ||
769 | if (turning_on) | 825 | if (turning_on) |
770 | set_bit(bit, &cs->flags); | 826 | set_bit(bit, &cs->flags); |
771 | else | 827 | else |
772 | clear_bit(bit, &cs->flags); | 828 | clear_bit(bit, &cs->flags); |
829 | up(&callback_sem); | ||
773 | 830 | ||
774 | if (cpu_exclusive_changed) | 831 | if (cpu_exclusive_changed) |
775 | update_cpu_domains(cs); | 832 | update_cpu_domains(cs); |
776 | return 0; | 833 | return 0; |
777 | } | 834 | } |
778 | 835 | ||
836 | /* | ||
837 | * Attach the task specified by pid in 'pidbuf' to cpuset 'cs', possibly | ||
838 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | ||
839 | * notified on release. | ||
840 | * | ||
841 | * Call holding manage_sem. May take callback_sem and task_lock of | ||
842 | * the task 'pid' during call. | ||
843 | */ | ||
844 | |||
779 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | 845 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) |
780 | { | 846 | { |
781 | pid_t pid; | 847 | pid_t pid; |
@@ -792,7 +858,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
792 | read_lock(&tasklist_lock); | 858 | read_lock(&tasklist_lock); |
793 | 859 | ||
794 | tsk = find_task_by_pid(pid); | 860 | tsk = find_task_by_pid(pid); |
795 | if (!tsk) { | 861 | if (!tsk || tsk->flags & PF_EXITING) { |
796 | read_unlock(&tasklist_lock); | 862 | read_unlock(&tasklist_lock); |
797 | return -ESRCH; | 863 | return -ESRCH; |
798 | } | 864 | } |
@@ -810,10 +876,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
810 | get_task_struct(tsk); | 876 | get_task_struct(tsk); |
811 | } | 877 | } |
812 | 878 | ||
879 | down(&callback_sem); | ||
880 | |||
813 | task_lock(tsk); | 881 | task_lock(tsk); |
814 | oldcs = tsk->cpuset; | 882 | oldcs = tsk->cpuset; |
815 | if (!oldcs) { | 883 | if (!oldcs) { |
816 | task_unlock(tsk); | 884 | task_unlock(tsk); |
885 | up(&callback_sem); | ||
817 | put_task_struct(tsk); | 886 | put_task_struct(tsk); |
818 | return -ESRCH; | 887 | return -ESRCH; |
819 | } | 888 | } |
@@ -824,6 +893,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
824 | guarantee_online_cpus(cs, &cpus); | 893 | guarantee_online_cpus(cs, &cpus); |
825 | set_cpus_allowed(tsk, cpus); | 894 | set_cpus_allowed(tsk, cpus); |
826 | 895 | ||
896 | up(&callback_sem); | ||
827 | put_task_struct(tsk); | 897 | put_task_struct(tsk); |
828 | if (atomic_dec_and_test(&oldcs->count)) | 898 | if (atomic_dec_and_test(&oldcs->count)) |
829 | check_for_release(oldcs, ppathbuf); | 899 | check_for_release(oldcs, ppathbuf); |
@@ -867,7 +937,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
867 | } | 937 | } |
868 | buffer[nbytes] = 0; /* nul-terminate */ | 938 | buffer[nbytes] = 0; /* nul-terminate */ |
869 | 939 | ||
870 | cpuset_down(&cpuset_sem); | 940 | down(&manage_sem); |
871 | 941 | ||
872 | if (is_removed(cs)) { | 942 | if (is_removed(cs)) { |
873 | retval = -ENODEV; | 943 | retval = -ENODEV; |
@@ -901,7 +971,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
901 | if (retval == 0) | 971 | if (retval == 0) |
902 | retval = nbytes; | 972 | retval = nbytes; |
903 | out2: | 973 | out2: |
904 | cpuset_up(&cpuset_sem); | 974 | up(&manage_sem); |
905 | cpuset_release_agent(pathbuf); | 975 | cpuset_release_agent(pathbuf); |
906 | out1: | 976 | out1: |
907 | kfree(buffer); | 977 | kfree(buffer); |
@@ -941,9 +1011,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
941 | { | 1011 | { |
942 | cpumask_t mask; | 1012 | cpumask_t mask; |
943 | 1013 | ||
944 | cpuset_down(&cpuset_sem); | 1014 | down(&callback_sem); |
945 | mask = cs->cpus_allowed; | 1015 | mask = cs->cpus_allowed; |
946 | cpuset_up(&cpuset_sem); | 1016 | up(&callback_sem); |
947 | 1017 | ||
948 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | 1018 | return cpulist_scnprintf(page, PAGE_SIZE, mask); |
949 | } | 1019 | } |
@@ -952,9 +1022,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
952 | { | 1022 | { |
953 | nodemask_t mask; | 1023 | nodemask_t mask; |
954 | 1024 | ||
955 | cpuset_down(&cpuset_sem); | 1025 | down(&callback_sem); |
956 | mask = cs->mems_allowed; | 1026 | mask = cs->mems_allowed; |
957 | cpuset_up(&cpuset_sem); | 1027 | up(&callback_sem); |
958 | 1028 | ||
959 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 1029 | return nodelist_scnprintf(page, PAGE_SIZE, mask); |
960 | } | 1030 | } |
@@ -995,7 +1065,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
995 | goto out; | 1065 | goto out; |
996 | } | 1066 | } |
997 | *s++ = '\n'; | 1067 | *s++ = '\n'; |
998 | *s = '\0'; | ||
999 | 1068 | ||
1000 | retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); | 1069 | retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); |
1001 | out: | 1070 | out: |
@@ -1048,6 +1117,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file) | |||
1048 | return 0; | 1117 | return 0; |
1049 | } | 1118 | } |
1050 | 1119 | ||
1120 | /* | ||
1121 | * cpuset_rename - Only allow simple rename of directories in place. | ||
1122 | */ | ||
1123 | static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, | ||
1124 | struct inode *new_dir, struct dentry *new_dentry) | ||
1125 | { | ||
1126 | if (!S_ISDIR(old_dentry->d_inode->i_mode)) | ||
1127 | return -ENOTDIR; | ||
1128 | if (new_dentry->d_inode) | ||
1129 | return -EEXIST; | ||
1130 | if (old_dir != new_dir) | ||
1131 | return -EIO; | ||
1132 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | ||
1133 | } | ||
1134 | |||
1051 | static struct file_operations cpuset_file_operations = { | 1135 | static struct file_operations cpuset_file_operations = { |
1052 | .read = cpuset_file_read, | 1136 | .read = cpuset_file_read, |
1053 | .write = cpuset_file_write, | 1137 | .write = cpuset_file_write, |
@@ -1060,6 +1144,7 @@ static struct inode_operations cpuset_dir_inode_operations = { | |||
1060 | .lookup = simple_lookup, | 1144 | .lookup = simple_lookup, |
1061 | .mkdir = cpuset_mkdir, | 1145 | .mkdir = cpuset_mkdir, |
1062 | .rmdir = cpuset_rmdir, | 1146 | .rmdir = cpuset_rmdir, |
1147 | .rename = cpuset_rename, | ||
1063 | }; | 1148 | }; |
1064 | 1149 | ||
1065 | static int cpuset_create_file(struct dentry *dentry, int mode) | 1150 | static int cpuset_create_file(struct dentry *dentry, int mode) |
@@ -1163,7 +1248,9 @@ struct ctr_struct { | |||
1163 | 1248 | ||
1164 | /* | 1249 | /* |
1165 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. | 1250 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. |
1166 | * Return actual number of pids loaded. | 1251 | * Return actual number of pids loaded. No need to task_lock(p) |
1252 | * when reading out p->cpuset, as we don't really care if it changes | ||
1253 | * on the next cycle, and we are not going to try to dereference it. | ||
1167 | */ | 1254 | */ |
1168 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) | 1255 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) |
1169 | { | 1256 | { |
@@ -1205,6 +1292,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |||
1205 | return cnt; | 1292 | return cnt; |
1206 | } | 1293 | } |
1207 | 1294 | ||
1295 | /* | ||
1296 | * Handle an open on 'tasks' file. Prepare a buffer listing the | ||
1297 | * process id's of tasks currently attached to the cpuset being opened. | ||
1298 | * | ||
1299 | * Does not require any specific cpuset semaphores, and does not take any. | ||
1300 | */ | ||
1208 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | 1301 | static int cpuset_tasks_open(struct inode *unused, struct file *file) |
1209 | { | 1302 | { |
1210 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1303 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); |
@@ -1352,7 +1445,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1352 | if (!cs) | 1445 | if (!cs) |
1353 | return -ENOMEM; | 1446 | return -ENOMEM; |
1354 | 1447 | ||
1355 | cpuset_down(&cpuset_sem); | 1448 | down(&manage_sem); |
1449 | refresh_mems(); | ||
1356 | cs->flags = 0; | 1450 | cs->flags = 0; |
1357 | if (notify_on_release(parent)) | 1451 | if (notify_on_release(parent)) |
1358 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1452 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
@@ -1366,25 +1460,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1366 | 1460 | ||
1367 | cs->parent = parent; | 1461 | cs->parent = parent; |
1368 | 1462 | ||
1463 | down(&callback_sem); | ||
1369 | list_add(&cs->sibling, &cs->parent->children); | 1464 | list_add(&cs->sibling, &cs->parent->children); |
1465 | up(&callback_sem); | ||
1370 | 1466 | ||
1371 | err = cpuset_create_dir(cs, name, mode); | 1467 | err = cpuset_create_dir(cs, name, mode); |
1372 | if (err < 0) | 1468 | if (err < 0) |
1373 | goto err; | 1469 | goto err; |
1374 | 1470 | ||
1375 | /* | 1471 | /* |
1376 | * Release cpuset_sem before cpuset_populate_dir() because it | 1472 | * Release manage_sem before cpuset_populate_dir() because it |
1377 | * will down() this new directory's i_sem and if we race with | 1473 | * will down() this new directory's i_sem and if we race with |
1378 | * another mkdir, we might deadlock. | 1474 | * another mkdir, we might deadlock. |
1379 | */ | 1475 | */ |
1380 | cpuset_up(&cpuset_sem); | 1476 | up(&manage_sem); |
1381 | 1477 | ||
1382 | err = cpuset_populate_dir(cs->dentry); | 1478 | err = cpuset_populate_dir(cs->dentry); |
1383 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 1479 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
1384 | return 0; | 1480 | return 0; |
1385 | err: | 1481 | err: |
1386 | list_del(&cs->sibling); | 1482 | list_del(&cs->sibling); |
1387 | cpuset_up(&cpuset_sem); | 1483 | up(&manage_sem); |
1388 | kfree(cs); | 1484 | kfree(cs); |
1389 | return err; | 1485 | return err; |
1390 | } | 1486 | } |
@@ -1406,29 +1502,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1406 | 1502 | ||
1407 | /* the vfs holds both inode->i_sem already */ | 1503 | /* the vfs holds both inode->i_sem already */ |
1408 | 1504 | ||
1409 | cpuset_down(&cpuset_sem); | 1505 | down(&manage_sem); |
1506 | refresh_mems(); | ||
1410 | if (atomic_read(&cs->count) > 0) { | 1507 | if (atomic_read(&cs->count) > 0) { |
1411 | cpuset_up(&cpuset_sem); | 1508 | up(&manage_sem); |
1412 | return -EBUSY; | 1509 | return -EBUSY; |
1413 | } | 1510 | } |
1414 | if (!list_empty(&cs->children)) { | 1511 | if (!list_empty(&cs->children)) { |
1415 | cpuset_up(&cpuset_sem); | 1512 | up(&manage_sem); |
1416 | return -EBUSY; | 1513 | return -EBUSY; |
1417 | } | 1514 | } |
1418 | parent = cs->parent; | 1515 | parent = cs->parent; |
1516 | down(&callback_sem); | ||
1419 | set_bit(CS_REMOVED, &cs->flags); | 1517 | set_bit(CS_REMOVED, &cs->flags); |
1420 | if (is_cpu_exclusive(cs)) | 1518 | if (is_cpu_exclusive(cs)) |
1421 | update_cpu_domains(cs); | 1519 | update_cpu_domains(cs); |
1422 | list_del(&cs->sibling); /* delete my sibling from parent->children */ | 1520 | list_del(&cs->sibling); /* delete my sibling from parent->children */ |
1423 | if (list_empty(&parent->children)) | ||
1424 | check_for_release(parent, &pathbuf); | ||
1425 | spin_lock(&cs->dentry->d_lock); | 1521 | spin_lock(&cs->dentry->d_lock); |
1426 | d = dget(cs->dentry); | 1522 | d = dget(cs->dentry); |
1427 | cs->dentry = NULL; | 1523 | cs->dentry = NULL; |
1428 | spin_unlock(&d->d_lock); | 1524 | spin_unlock(&d->d_lock); |
1429 | cpuset_d_remove_dir(d); | 1525 | cpuset_d_remove_dir(d); |
1430 | dput(d); | 1526 | dput(d); |
1431 | cpuset_up(&cpuset_sem); | 1527 | up(&callback_sem); |
1528 | if (list_empty(&parent->children)) | ||
1529 | check_for_release(parent, &pathbuf); | ||
1530 | up(&manage_sem); | ||
1432 | cpuset_release_agent(pathbuf); | 1531 | cpuset_release_agent(pathbuf); |
1433 | return 0; | 1532 | return 0; |
1434 | } | 1533 | } |
@@ -1488,16 +1587,26 @@ void __init cpuset_init_smp(void) | |||
1488 | * cpuset_fork - attach newly forked task to its parents cpuset. | 1587 | * cpuset_fork - attach newly forked task to its parents cpuset. |
1489 | * @tsk: pointer to task_struct of forking parent process. | 1588 | * @tsk: pointer to task_struct of forking parent process. |
1490 | * | 1589 | * |
1491 | * Description: By default, on fork, a task inherits its | 1590 | * Description: A task inherits its parent's cpuset at fork(). |
1492 | * parent's cpuset. The pointer to the shared cpuset is | 1591 | * |
1493 | * automatically copied in fork.c by dup_task_struct(). | 1592 | * A pointer to the shared cpuset was automatically copied in fork.c |
1494 | * This cpuset_fork() routine need only increment the usage | 1593 | * by dup_task_struct(). However, we ignore that copy, since it was |
1495 | * counter in that cpuset. | 1594 | * not made under the protection of task_lock(), so might no longer be |
1595 | * a valid cpuset pointer. attach_task() might have already changed | ||
1596 | * current->cpuset, allowing the previously referenced cpuset to | ||
1597 | * be removed and freed. Instead, we task_lock(current) and copy | ||
1598 | * its present value of current->cpuset for our freshly forked child. | ||
1599 | * | ||
1600 | * At the point that cpuset_fork() is called, 'current' is the parent | ||
1601 | * task, and the passed argument 'child' points to the child task. | ||
1496 | **/ | 1602 | **/ |
1497 | 1603 | ||
1498 | void cpuset_fork(struct task_struct *tsk) | 1604 | void cpuset_fork(struct task_struct *child) |
1499 | { | 1605 | { |
1500 | atomic_inc(&tsk->cpuset->count); | 1606 | task_lock(current); |
1607 | child->cpuset = current->cpuset; | ||
1608 | atomic_inc(&child->cpuset->count); | ||
1609 | task_unlock(current); | ||
1501 | } | 1610 | } |
1502 | 1611 | ||
1503 | /** | 1612 | /** |
@@ -1506,35 +1615,42 @@ void cpuset_fork(struct task_struct *tsk) | |||
1506 | * | 1615 | * |
1507 | * Description: Detach cpuset from @tsk and release it. | 1616 | * Description: Detach cpuset from @tsk and release it. |
1508 | * | 1617 | * |
1509 | * Note that cpusets marked notify_on_release force every task | 1618 | * Note that cpusets marked notify_on_release force every task in |
1510 | * in them to take the global cpuset_sem semaphore when exiting. | 1619 | * them to take the global manage_sem semaphore when exiting. |
1511 | * This could impact scaling on very large systems. Be reluctant | 1620 | * This could impact scaling on very large systems. Be reluctant to |
1512 | * to use notify_on_release cpusets where very high task exit | 1621 | * use notify_on_release cpusets where very high task exit scaling |
1513 | * scaling is required on large systems. | 1622 | * is required on large systems. |
1514 | * | 1623 | * |
1515 | * Don't even think about dereferencing 'cs' after the cpuset use | 1624 | * Don't even think about dereferencing 'cs' after the cpuset use count |
1516 | * count goes to zero, except inside a critical section guarded | 1625 | * goes to zero, except inside a critical section guarded by manage_sem |
1517 | * by the cpuset_sem semaphore. If you don't hold cpuset_sem, | 1626 | * or callback_sem. Otherwise a zero cpuset use count is a license to |
1518 | * then a zero cpuset use count is a license to any other task to | 1627 | * any other task to nuke the cpuset immediately, via cpuset_rmdir(). |
1519 | * nuke the cpuset immediately. | 1628 | * |
1629 | * This routine has to take manage_sem, not callback_sem, because | ||
1630 | * it is holding that semaphore while calling check_for_release(), | ||
1631 | * which calls kmalloc(), so it can't be called holding callback_sem. | ||
1632 | * | ||
1633 | * We don't need to task_lock() this reference to tsk->cpuset, | ||
1634 | * because tsk is already marked PF_EXITING, so attach_task() won't | ||
1635 | * mess with it. | ||
1520 | **/ | 1636 | **/ |
1521 | 1637 | ||
1522 | void cpuset_exit(struct task_struct *tsk) | 1638 | void cpuset_exit(struct task_struct *tsk) |
1523 | { | 1639 | { |
1524 | struct cpuset *cs; | 1640 | struct cpuset *cs; |
1525 | 1641 | ||
1526 | task_lock(tsk); | 1642 | BUG_ON(!(tsk->flags & PF_EXITING)); |
1643 | |||
1527 | cs = tsk->cpuset; | 1644 | cs = tsk->cpuset; |
1528 | tsk->cpuset = NULL; | 1645 | tsk->cpuset = NULL; |
1529 | task_unlock(tsk); | ||
1530 | 1646 | ||
1531 | if (notify_on_release(cs)) { | 1647 | if (notify_on_release(cs)) { |
1532 | char *pathbuf = NULL; | 1648 | char *pathbuf = NULL; |
1533 | 1649 | ||
1534 | cpuset_down(&cpuset_sem); | 1650 | down(&manage_sem); |
1535 | if (atomic_dec_and_test(&cs->count)) | 1651 | if (atomic_dec_and_test(&cs->count)) |
1536 | check_for_release(cs, &pathbuf); | 1652 | check_for_release(cs, &pathbuf); |
1537 | cpuset_up(&cpuset_sem); | 1653 | up(&manage_sem); |
1538 | cpuset_release_agent(pathbuf); | 1654 | cpuset_release_agent(pathbuf); |
1539 | } else { | 1655 | } else { |
1540 | atomic_dec(&cs->count); | 1656 | atomic_dec(&cs->count); |
@@ -1555,11 +1671,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | |||
1555 | { | 1671 | { |
1556 | cpumask_t mask; | 1672 | cpumask_t mask; |
1557 | 1673 | ||
1558 | cpuset_down(&cpuset_sem); | 1674 | down(&callback_sem); |
1559 | task_lock((struct task_struct *)tsk); | 1675 | task_lock((struct task_struct *)tsk); |
1560 | guarantee_online_cpus(tsk->cpuset, &mask); | 1676 | guarantee_online_cpus(tsk->cpuset, &mask); |
1561 | task_unlock((struct task_struct *)tsk); | 1677 | task_unlock((struct task_struct *)tsk); |
1562 | cpuset_up(&cpuset_sem); | 1678 | up(&callback_sem); |
1563 | 1679 | ||
1564 | return mask; | 1680 | return mask; |
1565 | } | 1681 | } |
@@ -1575,19 +1691,28 @@ void cpuset_init_current_mems_allowed(void) | |||
1575 | * If the current tasks cpusets mems_allowed changed behind our backs, | 1691 | * If the current tasks cpusets mems_allowed changed behind our backs, |
1576 | * update current->mems_allowed and mems_generation to the new value. | 1692 | * update current->mems_allowed and mems_generation to the new value. |
1577 | * Do not call this routine if in_interrupt(). | 1693 | * Do not call this routine if in_interrupt(). |
1694 | * | ||
1695 | * Call without callback_sem or task_lock() held. May be called | ||
1696 | * with or without manage_sem held. Will acquire task_lock() and | ||
1697 | * might acquire callback_sem during call to | ||
1698 | * refresh_mems(). | ||
1578 | */ | 1699 | */ |
1579 | 1700 | ||
1580 | void cpuset_update_current_mems_allowed(void) | 1701 | void cpuset_update_current_mems_allowed(void) |
1581 | { | 1702 | { |
1582 | struct cpuset *cs = current->cpuset; | 1703 | struct cpuset *cs; |
1704 | int need_to_refresh = 0; | ||
1583 | 1705 | ||
1706 | task_lock(current); | ||
1707 | cs = current->cpuset; | ||
1584 | if (!cs) | 1708 | if (!cs) |
1585 | return; /* task is exiting */ | 1709 | goto done; |
1586 | if (current->cpuset_mems_generation != cs->mems_generation) { | 1710 | if (current->cpuset_mems_generation != cs->mems_generation) |
1587 | cpuset_down(&cpuset_sem); | 1711 | need_to_refresh = 1; |
1712 | done: | ||
1713 | task_unlock(current); | ||
1714 | if (need_to_refresh) | ||
1588 | refresh_mems(); | 1715 | refresh_mems(); |
1589 | cpuset_up(&cpuset_sem); | ||
1590 | } | ||
1591 | } | 1716 | } |
1592 | 1717 | ||
1593 | /** | 1718 | /** |
@@ -1621,7 +1746,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
1621 | 1746 | ||
1622 | /* | 1747 | /* |
1623 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive | 1748 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive |
1624 | * ancestor to the specified cpuset. Call while holding cpuset_sem. | 1749 | * ancestor to the specified cpuset. Call holding callback_sem. |
1625 | * If no ancestor is mem_exclusive (an unusual configuration), then | 1750 | * If no ancestor is mem_exclusive (an unusual configuration), then |
1626 | * returns the root cpuset. | 1751 | * returns the root cpuset. |
1627 | */ | 1752 | */ |
@@ -1648,12 +1773,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
1648 | * GFP_KERNEL allocations are not so marked, so can escape to the | 1773 | * GFP_KERNEL allocations are not so marked, so can escape to the |
1649 | * nearest mem_exclusive ancestor cpuset. | 1774 | * nearest mem_exclusive ancestor cpuset. |
1650 | * | 1775 | * |
1651 | * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() | 1776 | * Scanning up parent cpusets requires callback_sem. The __alloc_pages() |
1652 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | 1777 | * routine only calls here with __GFP_HARDWALL bit _not_ set if |
1653 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks | 1778 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks |
1654 | * mems_allowed came up empty on the first pass over the zonelist. | 1779 | * mems_allowed came up empty on the first pass over the zonelist. |
1655 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | 1780 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are |
1656 | * short of memory, might require taking the cpuset_sem semaphore. | 1781 | * short of memory, might require taking the callback_sem semaphore. |
1657 | * | 1782 | * |
1658 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() | 1783 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() |
1659 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing | 1784 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing |
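Condensing the two-pass rule described above (editorial sketch of the early checks in cpuset_zone_allowed(); the full function follows in the next hunk):

	if (node_isset(node, current->mems_allowed))
		return 1;		/* node is in the task's own cpuset */
	if (gfp_mask & __GFP_HARDWALL)
		return 0;		/* first allocator pass: no escape allowed */
	/* GFP_KERNEL retry: take callback_sem and test the node against
	 * the nearest mem_exclusive ancestor's mems_allowed */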
@@ -1685,14 +1810,16 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
1685 | return 0; | 1810 | return 0; |
1686 | 1811 | ||
1687 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 1812 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
1688 | cpuset_down(&cpuset_sem); | 1813 | if (current->flags & PF_EXITING) /* Let dying task have memory */ |
1689 | cs = current->cpuset; | 1814 | return 1; |
1690 | if (!cs) | 1815 | down(&callback_sem); |
1691 | goto done; /* current task exiting */ | 1816 | |
1692 | cs = nearest_exclusive_ancestor(cs); | 1817 | task_lock(current); |
1818 | cs = nearest_exclusive_ancestor(current->cpuset); | ||
1819 | task_unlock(current); | ||
1820 | |||
1693 | allowed = node_isset(node, cs->mems_allowed); | 1821 | allowed = node_isset(node, cs->mems_allowed); |
1694 | done: | 1822 | up(&callback_sem); |
1695 | cpuset_up(&cpuset_sem); | ||
1696 | return allowed; | 1823 | return allowed; |
1697 | } | 1824 | } |
1698 | 1825 | ||
@@ -1705,7 +1832,7 @@ done: | |||
1705 | * determine if task @p's memory usage might impact the memory | 1832 | * determine if task @p's memory usage might impact the memory |
1706 | * available to the current task. | 1833 | * available to the current task. |
1707 | * | 1834 | * |
1708 | * Acquires cpuset_sem - not suitable for calling from a fast path. | 1835 | * Acquires callback_sem - not suitable for calling from a fast path. |
1709 | **/ | 1836 | **/ |
1710 | 1837 | ||
1711 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | 1838 | int cpuset_excl_nodes_overlap(const struct task_struct *p) |
@@ -1713,18 +1840,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) | |||
1713 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ | 1840 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ |
1714 | int overlap = 0; /* do cpusets overlap? */ | 1841 | int overlap = 0; /* do cpusets overlap? */ |
1715 | 1842 | ||
1716 | cpuset_down(&cpuset_sem); | 1843 | down(&callback_sem); |
1717 | cs1 = current->cpuset; | 1844 | |
1718 | if (!cs1) | 1845 | task_lock(current); |
1719 | goto done; /* current task exiting */ | 1846 | if (current->flags & PF_EXITING) { |
1720 | cs2 = p->cpuset; | 1847 | task_unlock(current); |
1721 | if (!cs2) | 1848 | goto done; |
1722 | goto done; /* task p is exiting */ | 1849 | } |
1723 | cs1 = nearest_exclusive_ancestor(cs1); | 1850 | cs1 = nearest_exclusive_ancestor(current->cpuset); |
1724 | cs2 = nearest_exclusive_ancestor(cs2); | 1851 | task_unlock(current); |
1852 | |||
1853 | task_lock((struct task_struct *)p); | ||
1854 | if (p->flags & PF_EXITING) { | ||
1855 | task_unlock((struct task_struct *)p); | ||
1856 | goto done; | ||
1857 | } | ||
1858 | cs2 = nearest_exclusive_ancestor(p->cpuset); | ||
1859 | task_unlock((struct task_struct *)p); | ||
1860 | |||
1725 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); | 1861 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); |
1726 | done: | 1862 | done: |
1727 | cpuset_up(&cpuset_sem); | 1863 | up(&callback_sem); |
1728 | 1864 | ||
1729 | return overlap; | 1865 | return overlap; |
1730 | } | 1866 | } |
@@ -1733,6 +1869,10 @@ done: | |||
1733 | * proc_cpuset_show() | 1869 | * proc_cpuset_show() |
1734 | * - Print tasks cpuset path into seq_file. | 1870 | * - Print tasks cpuset path into seq_file. |
1735 | * - Used for /proc/<pid>/cpuset. | 1871 | * - Used for /proc/<pid>/cpuset. |
1872 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | ||
1873 | * doesn't really matter if tsk->cpuset changes after we read it, | ||
1874 | * and we take manage_sem, keeping attach_task() from changing it | ||
1875 | * anyway. | ||
1736 | */ | 1876 | */ |
1737 | 1877 | ||
1738 | static int proc_cpuset_show(struct seq_file *m, void *v) | 1878 | static int proc_cpuset_show(struct seq_file *m, void *v) |
@@ -1747,10 +1887,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1747 | return -ENOMEM; | 1887 | return -ENOMEM; |
1748 | 1888 | ||
1749 | tsk = m->private; | 1889 | tsk = m->private; |
1750 | cpuset_down(&cpuset_sem); | 1890 | down(&manage_sem); |
1751 | task_lock(tsk); | ||
1752 | cs = tsk->cpuset; | 1891 | cs = tsk->cpuset; |
1753 | task_unlock(tsk); | ||
1754 | if (!cs) { | 1892 | if (!cs) { |
1755 | retval = -EINVAL; | 1893 | retval = -EINVAL; |
1756 | goto out; | 1894 | goto out; |
@@ -1762,7 +1900,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1762 | seq_puts(m, buf); | 1900 | seq_puts(m, buf); |
1763 | seq_putc(m, '\n'); | 1901 | seq_putc(m, '\n'); |
1764 | out: | 1902 | out: |
1765 | cpuset_up(&cpuset_sem); | 1903 | up(&manage_sem); |
1766 | kfree(buf); | 1904 | kfree(buf); |
1767 | return retval; | 1905 | return retval; |
1768 | } | 1906 | } |