diff options
-rw-r--r-- | include/linux/sched.h | 2 | ||||
-rw-r--r-- | kernel/cpuset.c | 418 |
2 files changed, 282 insertions, 138 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c30bc308ef1..b2d2dc14f0b9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1211,7 +1211,7 @@ extern void unhash_process(struct task_struct *p); | |||
1211 | /* | 1211 | /* |
1212 | * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring | 1212 | * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring |
1213 | * subscriptions and synchronises with wait4(). Also used in procfs. Also | 1213 | * subscriptions and synchronises with wait4(). Also used in procfs. Also |
1214 | * pins the final release of task.io_context. | 1214 | * pins the final release of task.io_context. Also protects ->cpuset. |
1215 | * | 1215 | * |
1216 | * Nests both inside and outside of read_lock(&tasklist_lock). | 1216 | * Nests both inside and outside of read_lock(&tasklist_lock). |
1217 | * It must not be nested with write_lock_irq(&tasklist_lock), | 1217 | * It must not be nested with write_lock_irq(&tasklist_lock), |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cd54dba2be18..7491352276b2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -60,6 +60,9 @@ struct cpuset { | |||
60 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 60 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
61 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 61 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
62 | 62 | ||
63 | /* | ||
64 | * Count is atomic so can incr (fork) or decr (exit) without a lock. | ||
65 | */ | ||
63 | atomic_t count; /* count tasks using this cpuset */ | 66 | atomic_t count; /* count tasks using this cpuset */ |
64 | 67 | ||
65 | /* | 68 | /* |
@@ -142,44 +145,91 @@ static struct vfsmount *cpuset_mount; | |||
142 | static struct super_block *cpuset_sb = NULL; | 145 | static struct super_block *cpuset_sb = NULL; |
143 | 146 | ||
144 | /* | 147 | /* |
145 | * cpuset_sem should be held by anyone who is depending on the children | 148 | * We have two global cpuset semaphores below. They can nest. |
146 | * or sibling lists of any cpuset, or performing non-atomic operations | 149 | * It is ok to first take manage_sem, then nest callback_sem. We also |
147 | * on the flags or *_allowed values of a cpuset, such as raising the | 150 | * require taking task_lock() when dereferencing a task's cpuset pointer. |
148 | * CS_REMOVED flag bit iff it is not already raised, or reading and | 151 | * See "The task_lock() exception", at the end of this comment. |
149 | * conditionally modifying the *_allowed values. One kernel global | 152 | * |
150 | * cpuset semaphore should be sufficient - these things don't change | 153 | * A task must hold both semaphores to modify cpusets. If a task |
151 | * that much. | 154 | * holds manage_sem, then it blocks others wanting that semaphore, |
152 | * | 155 | * ensuring that it is the only task able to also acquire callback_sem |
153 | * The code that modifies cpusets holds cpuset_sem across the entire | 156 | * and be able to modify cpusets. It can perform various checks on |
154 | * operation, from cpuset_common_file_write() down, single threading | 157 | * the cpuset structure first, knowing nothing will change. It can |
155 | * all cpuset modifications (except for counter manipulations from | 158 | * also allocate memory while just holding manage_sem. While it is |
156 | * fork and exit) across the system. This presumes that cpuset | 159 | * performing these checks, various callback routines can briefly |
157 | * modifications are rare - better kept simple and safe, even if slow. | 160 | * acquire callback_sem to query cpusets. Once it is ready to make |
158 | * | 161 | * the changes, it takes callback_sem, blocking everyone else. |
159 | * The code that reads cpusets, such as in cpuset_common_file_read() | 162 | * |
160 | * and below, only holds cpuset_sem across small pieces of code, such | 163 | * Calls to the kernel memory allocator can not be made while holding |
161 | * as when reading out possibly multi-word cpumasks and nodemasks, as | 164 | * callback_sem, as that would risk double tripping on callback_sem |
162 | * the risks are less, and the desire for performance a little greater. | 165 | * from one of the callbacks into the cpuset code from within |
163 | * The proc_cpuset_show() routine needs to hold cpuset_sem to insure | 166 | * __alloc_pages(). |
164 | * that no cs->dentry is NULL, as it walks up the cpuset tree to root. | 167 | * |
165 | * | 168 | * If a task is only holding callback_sem, then it has read-only |
166 | * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't | 169 | * access to cpusets. |
167 | * (usually) grab cpuset_sem. These are the two most performance | 170 | * |
168 | * critical pieces of code here. The exception occurs on exit(), | 171 | * The task_struct fields mems_allowed and mems_generation may only |
169 | * when a task in a notify_on_release cpuset exits. Then cpuset_sem | 172 | * be accessed in the context of that task, so require no locks. |
173 | * | ||
174 | * Any task can increment and decrement the count field without lock. | ||
175 | * So in general, code holding manage_sem or callback_sem can't rely | ||
176 | * on the count field not changing. However, if the count goes to | ||
177 | * zero, then only attach_task(), which holds both semaphores, can | ||
178 | * increment it again. A count of zero means that no tasks | ||
179 | * are currently attached, so there is no way a task attached | ||
180 | * to that cpuset can fork (the other way to increment the count). | ||
181 | * So code holding manage_sem or callback_sem can safely assume that | ||
182 | * if the count is zero, it will stay zero. Similarly, if a task | ||
183 | * holds manage_sem or callback_sem on a cpuset with zero count, it | ||
184 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs | ||
185 | * both of those semaphores. | ||
186 | * | ||
187 | * A possible optimization to improve parallelism would be to make | ||
188 | * callback_sem a R/W semaphore (rwsem), allowing the callback routines | ||
189 | * to proceed in parallel, with read access, until the holder of | ||
190 | * manage_sem needed to take this rwsem for exclusive write access | ||
191 | * and modify some cpusets. | ||
192 | * | ||
193 | * The cpuset_common_file_write handler for operations that modify | ||
194 | * the cpuset hierarchy holds manage_sem across the entire operation, | ||
195 | * single threading all such cpuset modifications across the system. | ||
196 | * | ||
197 | * The cpuset_common_file_read() handlers only hold callback_sem across | ||
198 | * small pieces of code, such as when reading out possibly multi-word | ||
199 | * cpumasks and nodemasks. | ||
200 | * | ||
201 | * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't | ||
202 | * (usually) take either semaphore. These are the two most performance | ||
203 | * critical pieces of code here. The exception occurs on cpuset_exit(), | ||
204 | * when a task in a notify_on_release cpuset exits. Then manage_sem | ||
170 | * is taken, and if the cpuset count is zero, a usermode call made | 205 | * is taken, and if the cpuset count is zero, a usermode call made |
171 | * to /sbin/cpuset_release_agent with the name of the cpuset (path | 206 | * to /sbin/cpuset_release_agent with the name of the cpuset (path |
172 | * relative to the root of cpuset file system) as the argument. | 207 | * relative to the root of cpuset file system) as the argument. |
173 | * | 208 | * |
174 | * A cpuset can only be deleted if both its 'count' of using tasks is | 209 | * A cpuset can only be deleted if both its 'count' of using tasks |
175 | * zero, and its list of 'children' cpusets is empty. Since all tasks | 210 | * is zero, and its list of 'children' cpusets is empty. Since all |
176 | * in the system use _some_ cpuset, and since there is always at least | 211 | * tasks in the system use _some_ cpuset, and since there is always at |
177 | * one task in the system (init, pid == 1), therefore, top_cpuset | 212 | * least one task in the system (init, pid == 1), therefore, top_cpuset |
178 | * always has either children cpusets and/or using tasks. So no need | 213 | * always has either children cpusets and/or using tasks. So we don't |
179 | * for any special hack to ensure that top_cpuset cannot be deleted. | 214 | * need a special hack to ensure that top_cpuset cannot be deleted. |
215 | * | ||
216 | * The above "Tale of Two Semaphores" would be complete, but for: | ||
217 | * | ||
218 | * The task_lock() exception | ||
219 | * | ||
220 | * The need for this exception arises from the action of attach_task(), | ||
221 | * which overwrites one task's cpuset pointer with another. It does | ||
222 | * so using both semaphores; however, there are several performance | ||
223 | * critical places that need to reference task->cpuset without the | ||
224 | * expense of grabbing a system global semaphore. Therefore, except as | ||
225 | * noted below, when dereferencing or, as in attach_task(), modifying | ||
226 | * a task's cpuset pointer we use task_lock(), which acts on a spinlock | ||
227 | * (task->alloc_lock) already in the task_struct routinely used for | ||
228 | * such matters. | ||
180 | */ | 229 | */ |
181 | 230 | ||
182 | static DECLARE_MUTEX(cpuset_sem); | 231 | static DECLARE_MUTEX(manage_sem); |
232 | static DECLARE_MUTEX(callback_sem); | ||
183 | 233 | ||
184 | /* | 234 | /* |
185 | * A couple of forward declarations required, due to cyclic reference loop: | 235 | * A couple of forward declarations required, due to cyclic reference loop: |
@@ -354,7 +404,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) | |||
354 | } | 404 | } |
355 | 405 | ||
356 | /* | 406 | /* |
357 | * Call with cpuset_sem held. Writes path of cpuset into buf. | 407 | * Call with manage_sem held. Writes path of cpuset into buf. |
358 | * Returns 0 on success, -errno on error. | 408 | * Returns 0 on success, -errno on error. |
359 | */ | 409 | */ |
360 | 410 | ||
@@ -406,10 +456,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) | |||
406 | * status of the /sbin/cpuset_release_agent task, so no sense holding | 456 | * status of the /sbin/cpuset_release_agent task, so no sense holding |
407 | * our caller up for that. | 457 | * our caller up for that. |
408 | * | 458 | * |
409 | * The simple act of forking that task might require more memory, | 459 | * When we had only one cpuset semaphore, we had to call this |
410 | * which might need cpuset_sem. So this routine must be called while | 460 | * without holding it, to avoid deadlock when call_usermodehelper() |
411 | * cpuset_sem is not held, to avoid a possible deadlock. See also | 461 | * allocated memory. With two locks, we could now call this while |
412 | * comments for check_for_release(), below. | 462 | * holding manage_sem, but we still don't, so as to minimize |
463 | * the time manage_sem is held. | ||
413 | */ | 464 | */ |
414 | 465 | ||
415 | static void cpuset_release_agent(const char *pathbuf) | 466 | static void cpuset_release_agent(const char *pathbuf) |
@@ -441,15 +492,15 @@ static void cpuset_release_agent(const char *pathbuf) | |||
441 | * cs is notify_on_release() and now both the user count is zero and | 492 | * cs is notify_on_release() and now both the user count is zero and |
442 | * the list of children is empty, prepare cpuset path in a kmalloc'd | 493 | * the list of children is empty, prepare cpuset path in a kmalloc'd |
443 | * buffer, to be returned via ppathbuf, so that the caller can invoke | 494 | * buffer, to be returned via ppathbuf, so that the caller can invoke |
444 | * cpuset_release_agent() with it later on, once cpuset_sem is dropped. | 495 | * cpuset_release_agent() with it later on, once manage_sem is dropped. |
445 | * Call here with cpuset_sem held. | 496 | * Call here with manage_sem held. |
446 | * | 497 | * |
447 | * This check_for_release() routine is responsible for kmalloc'ing | 498 | * This check_for_release() routine is responsible for kmalloc'ing |
448 | * pathbuf. The above cpuset_release_agent() is responsible for | 499 | * pathbuf. The above cpuset_release_agent() is responsible for |
449 | * kfree'ing pathbuf. The caller of these routines is responsible | 500 | * kfree'ing pathbuf. The caller of these routines is responsible |
450 | * for providing a pathbuf pointer, initialized to NULL, then | 501 | * for providing a pathbuf pointer, initialized to NULL, then |
451 | * calling check_for_release() with cpuset_sem held and the address | 502 | * calling check_for_release() with manage_sem held and the address |
452 | * of the pathbuf pointer, then dropping cpuset_sem, then calling | 503 | * of the pathbuf pointer, then dropping manage_sem, then calling |
453 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). | 504 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). |
454 | */ | 505 | */ |
455 | 506 | ||
@@ -480,7 +531,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) | |||
480 | * One way or another, we guarantee to return some non-empty subset | 531 | * One way or another, we guarantee to return some non-empty subset |
481 | * of cpu_online_map. | 532 | * of cpu_online_map. |
482 | * | 533 | * |
483 | * Call with cpuset_sem held. | 534 | * Call with callback_sem held. |
484 | */ | 535 | */ |
485 | 536 | ||
486 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | 537 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) |
@@ -504,7 +555,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | |||
504 | * One way or another, we guarantee to return some non-empty subset | 555 | * One way or another, we guarantee to return some non-empty subset |
505 | * of node_online_map. | 556 | * of node_online_map. |
506 | * | 557 | * |
507 | * Call with cpuset_sem held. | 558 | * Call with callback_sem held. |
508 | */ | 559 | */ |
509 | 560 | ||
510 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 561 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
@@ -519,31 +570,44 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
519 | } | 570 | } |
520 | 571 | ||
521 | /* | 572 | /* |
522 | * Refresh current task's mems_allowed and mems_generation from | 573 | * Refresh current task's mems_allowed and mems_generation from current |
523 | * current task's cpuset. Call with cpuset_sem held. | 574 | * task's cpuset. |
524 | * | 575 | * |
525 | * Be sure to call refresh_mems() on any cpuset operation which | 576 | * Call without callback_sem or task_lock() held. May be called with |
526 | * (1) holds cpuset_sem, and (2) might possibly alloc memory. | 577 | * or without manage_sem held. Will acquire task_lock() and might |
527 | * Call after obtaining cpuset_sem lock, before any possible | 578 | * acquire callback_sem during call. |
528 | * allocation. Otherwise one risks trying to allocate memory | 579 | * |
529 | * while the task cpuset_mems_generation is not the same as | 580 | * The task_lock() is required to dereference current->cpuset safely. |
530 | * the mems_generation in its cpuset, which would deadlock on | 581 | * Without it, we could pick up the pointer value of current->cpuset |
531 | * cpuset_sem in cpuset_update_current_mems_allowed(). | 582 | * in one instruction, and then attach_task could give us a different |
532 | * | 583 | * cpuset, and then the cpuset we had could be removed and freed, |
533 | * Since we hold cpuset_sem, once refresh_mems() is called, the | 584 | * and then on our next instruction, we could dereference a no longer |
534 | * test (current->cpuset_mems_generation != cs->mems_generation) | 585 | * valid cpuset pointer to get its mems_generation field. |
535 | * in cpuset_update_current_mems_allowed() will remain false, | 586 | * |
536 | * until we drop cpuset_sem. Anyone else who would change our | 587 | * This routine is needed to update the per-task mems_allowed data, |
537 | * cpuset's mems_generation needs to lock cpuset_sem first. | 588 | * within the task's context, when it is trying to allocate memory |
589 | * (in various mm/mempolicy.c routines) and notices that some other | ||
590 | * task has been modifying its cpuset. | ||
538 | */ | 591 | */ |
539 | 592 | ||
540 | static void refresh_mems(void) | 593 | static void refresh_mems(void) |
541 | { | 594 | { |
542 | struct cpuset *cs = current->cpuset; | 595 | int my_cpusets_mem_gen; |
596 | |||
597 | task_lock(current); | ||
598 | my_cpusets_mem_gen = current->cpuset->mems_generation; | ||
599 | task_unlock(current); | ||
543 | 600 | ||
544 | if (current->cpuset_mems_generation != cs->mems_generation) { | 601 | if (current->cpuset_mems_generation != my_cpusets_mem_gen) { |
602 | struct cpuset *cs; | ||
603 | |||
604 | down(&callback_sem); | ||
605 | task_lock(current); | ||
606 | cs = current->cpuset; | ||
545 | guarantee_online_mems(cs, &current->mems_allowed); | 607 | guarantee_online_mems(cs, &current->mems_allowed); |
546 | current->cpuset_mems_generation = cs->mems_generation; | 608 | current->cpuset_mems_generation = cs->mems_generation; |
609 | task_unlock(current); | ||
610 | up(&callback_sem); | ||
547 | } | 611 | } |
548 | } | 612 | } |
549 | 613 | ||
@@ -552,7 +616,7 @@ static void refresh_mems(void) | |||
552 | * | 616 | * |
553 | * One cpuset is a subset of another if all its allowed CPUs and | 617 | * One cpuset is a subset of another if all its allowed CPUs and |
554 | * Memory Nodes are a subset of the other, and its exclusive flags | 618 | * Memory Nodes are a subset of the other, and its exclusive flags |
555 | * are only set if the other's are set. | 619 | * are only set if the other's are set. Call holding manage_sem. |
556 | */ | 620 | */ |
557 | 621 | ||
558 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 622 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
@@ -570,7 +634,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
570 | * If we replaced the flag and mask values of the current cpuset | 634 | * If we replaced the flag and mask values of the current cpuset |
571 | * (cur) with those values in the trial cpuset (trial), would | 635 | * (cur) with those values in the trial cpuset (trial), would |
572 | * our various subset and exclusive rules still be valid? Presumes | 636 | * our various subset and exclusive rules still be valid? Presumes |
573 | * cpuset_sem held. | 637 | * manage_sem held. |
574 | * | 638 | * |
575 | * 'cur' is the address of an actual, in-use cpuset. Operations | 639 | * 'cur' is the address of an actual, in-use cpuset. Operations |
576 | * such as list traversal that depend on the actual address of the | 640 | * such as list traversal that depend on the actual address of the |
@@ -624,7 +688,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
624 | * exclusive child cpusets | 688 | * exclusive child cpusets |
625 | * Build these two partitions by calling partition_sched_domains | 689 | * Build these two partitions by calling partition_sched_domains |
626 | * | 690 | * |
627 | * Call with cpuset_sem held. May nest a call to the | 691 | * Call with manage_sem held. May nest a call to the |
628 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 692 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. |
629 | */ | 693 | */ |
630 | 694 | ||
@@ -669,6 +733,10 @@ static void update_cpu_domains(struct cpuset *cur) | |||
669 | unlock_cpu_hotplug(); | 733 | unlock_cpu_hotplug(); |
670 | } | 734 | } |
671 | 735 | ||
736 | /* | ||
737 | * Call with manage_sem held. May take callback_sem during call. | ||
738 | */ | ||
739 | |||
672 | static int update_cpumask(struct cpuset *cs, char *buf) | 740 | static int update_cpumask(struct cpuset *cs, char *buf) |
673 | { | 741 | { |
674 | struct cpuset trialcs; | 742 | struct cpuset trialcs; |
@@ -685,12 +753,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
685 | if (retval < 0) | 753 | if (retval < 0) |
686 | return retval; | 754 | return retval; |
687 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | 755 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); |
756 | down(&callback_sem); | ||
688 | cs->cpus_allowed = trialcs.cpus_allowed; | 757 | cs->cpus_allowed = trialcs.cpus_allowed; |
758 | up(&callback_sem); | ||
689 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | 759 | if (is_cpu_exclusive(cs) && !cpus_unchanged) |
690 | update_cpu_domains(cs); | 760 | update_cpu_domains(cs); |
691 | return 0; | 761 | return 0; |
692 | } | 762 | } |
693 | 763 | ||
764 | /* | ||
765 | * Call with manage_sem held. May take callback_sem during call. | ||
766 | */ | ||
767 | |||
694 | static int update_nodemask(struct cpuset *cs, char *buf) | 768 | static int update_nodemask(struct cpuset *cs, char *buf) |
695 | { | 769 | { |
696 | struct cpuset trialcs; | 770 | struct cpuset trialcs; |
@@ -705,9 +779,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
705 | return -ENOSPC; | 779 | return -ENOSPC; |
706 | retval = validate_change(cs, &trialcs); | 780 | retval = validate_change(cs, &trialcs); |
707 | if (retval == 0) { | 781 | if (retval == 0) { |
782 | down(&callback_sem); | ||
708 | cs->mems_allowed = trialcs.mems_allowed; | 783 | cs->mems_allowed = trialcs.mems_allowed; |
709 | atomic_inc(&cpuset_mems_generation); | 784 | atomic_inc(&cpuset_mems_generation); |
710 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 785 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
786 | up(&callback_sem); | ||
711 | } | 787 | } |
712 | return retval; | 788 | return retval; |
713 | } | 789 | } |
@@ -718,6 +794,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
718 | * CS_NOTIFY_ON_RELEASE) | 794 | * CS_NOTIFY_ON_RELEASE) |
719 | * cs: the cpuset to update | 795 | * cs: the cpuset to update |
720 | * buf: the buffer where we read the 0 or 1 | 796 | * buf: the buffer where we read the 0 or 1 |
797 | * | ||
798 | * Call with manage_sem held. | ||
721 | */ | 799 | */ |
722 | 800 | ||
723 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | 801 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) |
@@ -739,16 +817,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
739 | return err; | 817 | return err; |
740 | cpu_exclusive_changed = | 818 | cpu_exclusive_changed = |
741 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); | 819 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
820 | down(&callback_sem); | ||
742 | if (turning_on) | 821 | if (turning_on) |
743 | set_bit(bit, &cs->flags); | 822 | set_bit(bit, &cs->flags); |
744 | else | 823 | else |
745 | clear_bit(bit, &cs->flags); | 824 | clear_bit(bit, &cs->flags); |
825 | up(&callback_sem); | ||
746 | 826 | ||
747 | if (cpu_exclusive_changed) | 827 | if (cpu_exclusive_changed) |
748 | update_cpu_domains(cs); | 828 | update_cpu_domains(cs); |
749 | return 0; | 829 | return 0; |
750 | } | 830 | } |
751 | 831 | ||
832 | /* | ||
833 | * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly | ||
834 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | ||
835 | * notified on release. | ||
836 | * | ||
837 | * Call holding manage_sem. May take callback_sem and task_lock of | ||
838 | * the task 'pid' during call. | ||
839 | */ | ||
840 | |||
752 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | 841 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) |
753 | { | 842 | { |
754 | pid_t pid; | 843 | pid_t pid; |
@@ -765,7 +854,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
765 | read_lock(&tasklist_lock); | 854 | read_lock(&tasklist_lock); |
766 | 855 | ||
767 | tsk = find_task_by_pid(pid); | 856 | tsk = find_task_by_pid(pid); |
768 | if (!tsk) { | 857 | if (!tsk || tsk->flags & PF_EXITING) { |
769 | read_unlock(&tasklist_lock); | 858 | read_unlock(&tasklist_lock); |
770 | return -ESRCH; | 859 | return -ESRCH; |
771 | } | 860 | } |
@@ -783,10 +872,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
783 | get_task_struct(tsk); | 872 | get_task_struct(tsk); |
784 | } | 873 | } |
785 | 874 | ||
875 | down(&callback_sem); | ||
876 | |||
786 | task_lock(tsk); | 877 | task_lock(tsk); |
787 | oldcs = tsk->cpuset; | 878 | oldcs = tsk->cpuset; |
788 | if (!oldcs) { | 879 | if (!oldcs) { |
789 | task_unlock(tsk); | 880 | task_unlock(tsk); |
881 | up(&callback_sem); | ||
790 | put_task_struct(tsk); | 882 | put_task_struct(tsk); |
791 | return -ESRCH; | 883 | return -ESRCH; |
792 | } | 884 | } |
@@ -797,6 +889,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
797 | guarantee_online_cpus(cs, &cpus); | 889 | guarantee_online_cpus(cs, &cpus); |
798 | set_cpus_allowed(tsk, cpus); | 890 | set_cpus_allowed(tsk, cpus); |
799 | 891 | ||
892 | up(&callback_sem); | ||
800 | put_task_struct(tsk); | 893 | put_task_struct(tsk); |
801 | if (atomic_dec_and_test(&oldcs->count)) | 894 | if (atomic_dec_and_test(&oldcs->count)) |
802 | check_for_release(oldcs, ppathbuf); | 895 | check_for_release(oldcs, ppathbuf); |
@@ -840,7 +933,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
840 | } | 933 | } |
841 | buffer[nbytes] = 0; /* nul-terminate */ | 934 | buffer[nbytes] = 0; /* nul-terminate */ |
842 | 935 | ||
843 | down(&cpuset_sem); | 936 | down(&manage_sem); |
844 | 937 | ||
845 | if (is_removed(cs)) { | 938 | if (is_removed(cs)) { |
846 | retval = -ENODEV; | 939 | retval = -ENODEV; |
@@ -874,7 +967,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
874 | if (retval == 0) | 967 | if (retval == 0) |
875 | retval = nbytes; | 968 | retval = nbytes; |
876 | out2: | 969 | out2: |
877 | up(&cpuset_sem); | 970 | up(&manage_sem); |
878 | cpuset_release_agent(pathbuf); | 971 | cpuset_release_agent(pathbuf); |
879 | out1: | 972 | out1: |
880 | kfree(buffer); | 973 | kfree(buffer); |
@@ -914,9 +1007,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
914 | { | 1007 | { |
915 | cpumask_t mask; | 1008 | cpumask_t mask; |
916 | 1009 | ||
917 | down(&cpuset_sem); | 1010 | down(&callback_sem); |
918 | mask = cs->cpus_allowed; | 1011 | mask = cs->cpus_allowed; |
919 | up(&cpuset_sem); | 1012 | up(&callback_sem); |
920 | 1013 | ||
921 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | 1014 | return cpulist_scnprintf(page, PAGE_SIZE, mask); |
922 | } | 1015 | } |
@@ -925,9 +1018,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
925 | { | 1018 | { |
926 | nodemask_t mask; | 1019 | nodemask_t mask; |
927 | 1020 | ||
928 | down(&cpuset_sem); | 1021 | down(&callback_sem); |
929 | mask = cs->mems_allowed; | 1022 | mask = cs->mems_allowed; |
930 | up(&cpuset_sem); | 1023 | up(&callback_sem); |
931 | 1024 | ||
932 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 1025 | return nodelist_scnprintf(page, PAGE_SIZE, mask); |
933 | } | 1026 | } |
@@ -1135,7 +1228,9 @@ struct ctr_struct { | |||
1135 | 1228 | ||
1136 | /* | 1229 | /* |
1137 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. | 1230 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. |
1138 | * Return actual number of pids loaded. | 1231 | * Return actual number of pids loaded. No need to task_lock(p) |
1232 | * when reading out p->cpuset, as we don't really care if it changes | ||
1233 | * on the next cycle, and we are not going to try to dereference it. | ||
1139 | */ | 1234 | */ |
1140 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) | 1235 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) |
1141 | { | 1236 | { |
@@ -1177,6 +1272,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |||
1177 | return cnt; | 1272 | return cnt; |
1178 | } | 1273 | } |
1179 | 1274 | ||
1275 | /* | ||
1276 | * Handle an open on 'tasks' file. Prepare a buffer listing the | ||
1277 | * process id's of tasks currently attached to the cpuset being opened. | ||
1278 | * | ||
1279 | * Does not require any specific cpuset semaphores, and does not take any. | ||
1280 | */ | ||
1180 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | 1281 | static int cpuset_tasks_open(struct inode *unused, struct file *file) |
1181 | { | 1282 | { |
1182 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1283 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); |
@@ -1324,7 +1425,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1324 | if (!cs) | 1425 | if (!cs) |
1325 | return -ENOMEM; | 1426 | return -ENOMEM; |
1326 | 1427 | ||
1327 | down(&cpuset_sem); | 1428 | down(&manage_sem); |
1328 | refresh_mems(); | 1429 | refresh_mems(); |
1329 | cs->flags = 0; | 1430 | cs->flags = 0; |
1330 | if (notify_on_release(parent)) | 1431 | if (notify_on_release(parent)) |
@@ -1339,25 +1440,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1339 | 1440 | ||
1340 | cs->parent = parent; | 1441 | cs->parent = parent; |
1341 | 1442 | ||
1443 | down(&callback_sem); | ||
1342 | list_add(&cs->sibling, &cs->parent->children); | 1444 | list_add(&cs->sibling, &cs->parent->children); |
1445 | up(&callback_sem); | ||
1343 | 1446 | ||
1344 | err = cpuset_create_dir(cs, name, mode); | 1447 | err = cpuset_create_dir(cs, name, mode); |
1345 | if (err < 0) | 1448 | if (err < 0) |
1346 | goto err; | 1449 | goto err; |
1347 | 1450 | ||
1348 | /* | 1451 | /* |
1349 | * Release cpuset_sem before cpuset_populate_dir() because it | 1452 | * Release manage_sem before cpuset_populate_dir() because it |
1350 | * will down() this new directory's i_sem and if we race with | 1453 | * will down() this new directory's i_sem and if we race with |
1351 | * another mkdir, we might deadlock. | 1454 | * another mkdir, we might deadlock. |
1352 | */ | 1455 | */ |
1353 | up(&cpuset_sem); | 1456 | up(&manage_sem); |
1354 | 1457 | ||
1355 | err = cpuset_populate_dir(cs->dentry); | 1458 | err = cpuset_populate_dir(cs->dentry); |
1356 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 1459 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
1357 | return 0; | 1460 | return 0; |
1358 | err: | 1461 | err: |
1359 | list_del(&cs->sibling); | 1462 | list_del(&cs->sibling); |
1360 | up(&cpuset_sem); | 1463 | up(&manage_sem); |
1361 | kfree(cs); | 1464 | kfree(cs); |
1362 | return err; | 1465 | return err; |
1363 | } | 1466 | } |
@@ -1379,30 +1482,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1379 | 1482 | ||
1380 | /* the vfs holds both inode->i_sem already */ | 1483 | /* the vfs holds both inode->i_sem already */ |
1381 | 1484 | ||
1382 | down(&cpuset_sem); | 1485 | down(&manage_sem); |
1383 | refresh_mems(); | 1486 | refresh_mems(); |
1384 | if (atomic_read(&cs->count) > 0) { | 1487 | if (atomic_read(&cs->count) > 0) { |
1385 | up(&cpuset_sem); | 1488 | up(&manage_sem); |
1386 | return -EBUSY; | 1489 | return -EBUSY; |
1387 | } | 1490 | } |
1388 | if (!list_empty(&cs->children)) { | 1491 | if (!list_empty(&cs->children)) { |
1389 | up(&cpuset_sem); | 1492 | up(&manage_sem); |
1390 | return -EBUSY; | 1493 | return -EBUSY; |
1391 | } | 1494 | } |
1392 | parent = cs->parent; | 1495 | parent = cs->parent; |
1496 | down(&callback_sem); | ||
1393 | set_bit(CS_REMOVED, &cs->flags); | 1497 | set_bit(CS_REMOVED, &cs->flags); |
1394 | if (is_cpu_exclusive(cs)) | 1498 | if (is_cpu_exclusive(cs)) |
1395 | update_cpu_domains(cs); | 1499 | update_cpu_domains(cs); |
1396 | list_del(&cs->sibling); /* delete my sibling from parent->children */ | 1500 | list_del(&cs->sibling); /* delete my sibling from parent->children */ |
1397 | if (list_empty(&parent->children)) | ||
1398 | check_for_release(parent, &pathbuf); | ||
1399 | spin_lock(&cs->dentry->d_lock); | 1501 | spin_lock(&cs->dentry->d_lock); |
1400 | d = dget(cs->dentry); | 1502 | d = dget(cs->dentry); |
1401 | cs->dentry = NULL; | 1503 | cs->dentry = NULL; |
1402 | spin_unlock(&d->d_lock); | 1504 | spin_unlock(&d->d_lock); |
1403 | cpuset_d_remove_dir(d); | 1505 | cpuset_d_remove_dir(d); |
1404 | dput(d); | 1506 | dput(d); |
1405 | up(&cpuset_sem); | 1507 | up(&callback_sem); |
1508 | if (list_empty(&parent->children)) | ||
1509 | check_for_release(parent, &pathbuf); | ||
1510 | up(&manage_sem); | ||
1406 | cpuset_release_agent(pathbuf); | 1511 | cpuset_release_agent(pathbuf); |
1407 | return 0; | 1512 | return 0; |
1408 | } | 1513 | } |
@@ -1462,16 +1567,26 @@ void __init cpuset_init_smp(void) | |||
1462 | * cpuset_fork - attach newly forked task to its parents cpuset. | 1567 | * cpuset_fork - attach newly forked task to its parent's cpuset. |
1463 | * @tsk: pointer to task_struct of forking parent process. | 1568 | * @child: pointer to task_struct of the newly forked child task. |
1464 | * | 1569 | * |
1465 | * Description: By default, on fork, a task inherits its | 1570 | * Description: A task inherits its parent's cpuset at fork(). |
1466 | * parent's cpuset. The pointer to the shared cpuset is | 1571 | * |
1467 | * automatically copied in fork.c by dup_task_struct(). | 1572 | * A pointer to the shared cpuset was automatically copied in fork.c |
1468 | * This cpuset_fork() routine need only increment the usage | 1573 | * by dup_task_struct(). However, we ignore that copy, since it was |
1469 | * counter in that cpuset. | 1574 | * not made under the protection of task_lock(), so might no longer be |
1575 | * a valid cpuset pointer. attach_task() might have already changed | ||
1576 | * current->cpuset, allowing the previously referenced cpuset to | ||
1577 | * be removed and freed. Instead, we task_lock(current) and copy | ||
1578 | * its present value of current->cpuset for our freshly forked child. | ||
1579 | * | ||
1580 | * At the point that cpuset_fork() is called, 'current' is the parent | ||
1581 | * task, and the passed argument 'child' points to the child task. | ||
1470 | **/ | 1582 | **/ |
1471 | 1583 | ||
1472 | void cpuset_fork(struct task_struct *tsk) | 1584 | void cpuset_fork(struct task_struct *child) |
1473 | { | 1585 | { |
1474 | atomic_inc(&tsk->cpuset->count); | 1586 | task_lock(current); |
1587 | child->cpuset = current->cpuset; | ||
1588 | atomic_inc(&child->cpuset->count); | ||
1589 | task_unlock(current); | ||
1475 | } | 1590 | } |
1476 | 1591 | ||
1477 | /** | 1592 | /** |
@@ -1480,35 +1595,42 @@ void cpuset_fork(struct task_struct *tsk) | |||
1480 | * | 1595 | * |
1481 | * Description: Detach cpuset from @tsk and release it. | 1596 | * Description: Detach cpuset from @tsk and release it. |
1482 | * | 1597 | * |
1483 | * Note that cpusets marked notify_on_release force every task | 1598 | * Note that cpusets marked notify_on_release force every task in |
1484 | * in them to take the global cpuset_sem semaphore when exiting. | 1599 | * them to take the global manage_sem semaphore when exiting. |
1485 | * This could impact scaling on very large systems. Be reluctant | 1600 | * This could impact scaling on very large systems. Be reluctant to |
1486 | * to use notify_on_release cpusets where very high task exit | 1601 | * use notify_on_release cpusets where very high task exit scaling |
1487 | * scaling is required on large systems. | 1602 | * is required on large systems. |
1488 | * | 1603 | * |
1489 | * Don't even think about derefencing 'cs' after the cpuset use | 1604 | * Don't even think about dereferencing 'cs' after the cpuset use count |
1490 | * count goes to zero, except inside a critical section guarded | 1605 | * goes to zero, except inside a critical section guarded by manage_sem |
1491 | * by the cpuset_sem semaphore. If you don't hold cpuset_sem, | 1606 | * or callback_sem. Otherwise a zero cpuset use count is a license to |
1492 | * then a zero cpuset use count is a license to any other task to | 1607 | * any other task to nuke the cpuset immediately, via cpuset_rmdir(). |
1493 | * nuke the cpuset immediately. | 1608 | * |
1609 | * This routine has to take manage_sem, not callback_sem, because | ||
1610 | * it is holding that semaphore while calling check_for_release(), | ||
1611 | * which calls kmalloc(), so can't be called holding callback_sem. | ||
1612 | * | ||
1613 | * We don't need to task_lock() this reference to tsk->cpuset, | ||
1614 | * because tsk is already marked PF_EXITING, so attach_task() won't | ||
1615 | * mess with it. | ||
1494 | **/ | 1616 | **/ |
1495 | 1617 | ||
1496 | void cpuset_exit(struct task_struct *tsk) | 1618 | void cpuset_exit(struct task_struct *tsk) |
1497 | { | 1619 | { |
1498 | struct cpuset *cs; | 1620 | struct cpuset *cs; |
1499 | 1621 | ||
1500 | task_lock(tsk); | 1622 | BUG_ON(!(tsk->flags & PF_EXITING)); |
1623 | |||
1501 | cs = tsk->cpuset; | 1624 | cs = tsk->cpuset; |
1502 | tsk->cpuset = NULL; | 1625 | tsk->cpuset = NULL; |
1503 | task_unlock(tsk); | ||
1504 | 1626 | ||
1505 | if (notify_on_release(cs)) { | 1627 | if (notify_on_release(cs)) { |
1506 | char *pathbuf = NULL; | 1628 | char *pathbuf = NULL; |
1507 | 1629 | ||
1508 | down(&cpuset_sem); | 1630 | down(&manage_sem); |
1509 | if (atomic_dec_and_test(&cs->count)) | 1631 | if (atomic_dec_and_test(&cs->count)) |
1510 | check_for_release(cs, &pathbuf); | 1632 | check_for_release(cs, &pathbuf); |
1511 | up(&cpuset_sem); | 1633 | up(&manage_sem); |
1512 | cpuset_release_agent(pathbuf); | 1634 | cpuset_release_agent(pathbuf); |
1513 | } else { | 1635 | } else { |
1514 | atomic_dec(&cs->count); | 1636 | atomic_dec(&cs->count); |
@@ -1529,11 +1651,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | |||
1529 | { | 1651 | { |
1530 | cpumask_t mask; | 1652 | cpumask_t mask; |
1531 | 1653 | ||
1532 | down(&cpuset_sem); | 1654 | down(&callback_sem); |
1533 | task_lock((struct task_struct *)tsk); | 1655 | task_lock((struct task_struct *)tsk); |
1534 | guarantee_online_cpus(tsk->cpuset, &mask); | 1656 | guarantee_online_cpus(tsk->cpuset, &mask); |
1535 | task_unlock((struct task_struct *)tsk); | 1657 | task_unlock((struct task_struct *)tsk); |
1536 | up(&cpuset_sem); | 1658 | up(&callback_sem); |
1537 | 1659 | ||
1538 | return mask; | 1660 | return mask; |
1539 | } | 1661 | } |
@@ -1549,19 +1671,28 @@ void cpuset_init_current_mems_allowed(void) | |||
1549 | * If the current tasks cpusets mems_allowed changed behind our backs, | 1671 | * If the current tasks cpusets mems_allowed changed behind our backs, |
1550 | * update current->mems_allowed and mems_generation to the new value. | 1672 | * update current->mems_allowed and mems_generation to the new value. |
1551 | * Do not call this routine if in_interrupt(). | 1673 | * Do not call this routine if in_interrupt(). |
1674 | * | ||
1675 | * Call without callback_sem or task_lock() held. May be called | ||
1676 | * with or without manage_sem held. Unless exiting, it will acquire | ||
1677 | * task_lock(). Also might acquire callback_sem during call to | ||
1678 | * refresh_mems(). | ||
1552 | */ | 1679 | */ |
1553 | 1680 | ||
1554 | void cpuset_update_current_mems_allowed(void) | 1681 | void cpuset_update_current_mems_allowed(void) |
1555 | { | 1682 | { |
1556 | struct cpuset *cs = current->cpuset; | 1683 | struct cpuset *cs; |
1684 | int need_to_refresh = 0; | ||
1557 | 1685 | ||
1686 | task_lock(current); | ||
1687 | cs = current->cpuset; | ||
1558 | if (!cs) | 1688 | if (!cs) |
1559 | return; /* task is exiting */ | 1689 | goto done; |
1560 | if (current->cpuset_mems_generation != cs->mems_generation) { | 1690 | if (current->cpuset_mems_generation != cs->mems_generation) |
1561 | down(&cpuset_sem); | 1691 | need_to_refresh = 1; |
1692 | done: | ||
1693 | task_unlock(current); | ||
1694 | if (need_to_refresh) | ||
1562 | refresh_mems(); | 1695 | refresh_mems(); |
1563 | up(&cpuset_sem); | ||
1564 | } | ||
1565 | } | 1696 | } |
1566 | 1697 | ||
1567 | /** | 1698 | /** |
@@ -1595,7 +1726,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
1595 | 1726 | ||
1596 | /* | 1727 | /* |
1597 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive | 1728 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive |
1598 | * ancestor to the specified cpuset. Call while holding cpuset_sem. | 1729 | * ancestor to the specified cpuset. Call holding callback_sem. |
1599 | * If no ancestor is mem_exclusive (an unusual configuration), then | 1730 | * If no ancestor is mem_exclusive (an unusual configuration), then |
1600 | * returns the root cpuset. | 1731 | * returns the root cpuset. |
1601 | */ | 1732 | */ |
@@ -1622,12 +1753,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
1622 | * GFP_KERNEL allocations are not so marked, so can escape to the | 1753 | * GFP_KERNEL allocations are not so marked, so can escape to the |
1623 | * nearest mem_exclusive ancestor cpuset. | 1754 | * nearest mem_exclusive ancestor cpuset. |
1624 | * | 1755 | * |
1625 | * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() | 1756 | * Scanning up parent cpusets requires callback_sem. The __alloc_pages() |
1626 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | 1757 | * routine only calls here with __GFP_HARDWALL bit _not_ set if |
1627 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks | 1758 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks |
1628 | * mems_allowed came up empty on the first pass over the zonelist. | 1759 | * mems_allowed came up empty on the first pass over the zonelist. |
1629 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | 1760 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are |
1630 | * short of memory, might require taking the cpuset_sem semaphore. | 1761 | * short of memory, might require taking the callback_sem semaphore. |
1631 | * | 1762 | * |
1632 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() | 1763 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() |
1633 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing | 1764 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing |
@@ -1659,14 +1790,16 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
1659 | return 0; | 1790 | return 0; |
1660 | 1791 | ||
1661 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 1792 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
1662 | down(&cpuset_sem); | 1793 | down(&callback_sem); |
1663 | cs = current->cpuset; | 1794 | |
1664 | if (!cs) | 1795 | if (current->flags & PF_EXITING) /* Let dying task have memory */ |
1665 | goto done; /* current task exiting */ | 1796 | return 1; |
1666 | cs = nearest_exclusive_ancestor(cs); | 1797 | task_lock(current); |
1798 | cs = nearest_exclusive_ancestor(current->cpuset); | ||
1799 | task_unlock(current); | ||
1800 | |||
1667 | allowed = node_isset(node, cs->mems_allowed); | 1801 | allowed = node_isset(node, cs->mems_allowed); |
1668 | done: | 1802 | up(&callback_sem); |
1669 | up(&cpuset_sem); | ||
1670 | return allowed; | 1803 | return allowed; |
1671 | } | 1804 | } |
1672 | 1805 | ||
@@ -1679,7 +1812,7 @@ done: | |||
1679 | * determine if task @p's memory usage might impact the memory | 1812 | * determine if task @p's memory usage might impact the memory |
1680 | * available to the current task. | 1813 | * available to the current task. |
1681 | * | 1814 | * |
1682 | * Acquires cpuset_sem - not suitable for calling from a fast path. | 1815 | * Acquires callback_sem - not suitable for calling from a fast path. |
1683 | **/ | 1816 | **/ |
1684 | 1817 | ||
1685 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | 1818 | int cpuset_excl_nodes_overlap(const struct task_struct *p) |
@@ -1687,18 +1820,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) | |||
1687 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ | 1820 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ |
1688 | int overlap = 0; /* do cpusets overlap? */ | 1821 | int overlap = 0; /* do cpusets overlap? */ |
1689 | 1822 | ||
1690 | down(&cpuset_sem); | 1823 | down(&callback_sem); |
1691 | cs1 = current->cpuset; | 1824 | |
1692 | if (!cs1) | 1825 | task_lock(current); |
1693 | goto done; /* current task exiting */ | 1826 | if (current->flags & PF_EXITING) { |
1694 | cs2 = p->cpuset; | 1827 | task_unlock(current); |
1695 | if (!cs2) | 1828 | goto done; |
1696 | goto done; /* task p is exiting */ | 1829 | } |
1697 | cs1 = nearest_exclusive_ancestor(cs1); | 1830 | cs1 = nearest_exclusive_ancestor(current->cpuset); |
1698 | cs2 = nearest_exclusive_ancestor(cs2); | 1831 | task_unlock(current); |
1832 | |||
1833 | task_lock((struct task_struct *)p); | ||
1834 | if (p->flags & PF_EXITING) { | ||
1835 | task_unlock((struct task_struct *)p); | ||
1836 | goto done; | ||
1837 | } | ||
1838 | cs2 = nearest_exclusive_ancestor(p->cpuset); | ||
1839 | task_unlock((struct task_struct *)p); | ||
1840 | |||
1699 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); | 1841 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); |
1700 | done: | 1842 | done: |
1701 | up(&cpuset_sem); | 1843 | up(&callback_sem); |
1702 | 1844 | ||
1703 | return overlap; | 1845 | return overlap; |
1704 | } | 1846 | } |
@@ -1707,6 +1849,10 @@ done: | |||
1707 | * proc_cpuset_show() | 1849 | * proc_cpuset_show() |
1708 | * - Print tasks cpuset path into seq_file. | 1850 | * - Print tasks cpuset path into seq_file. |
1709 | * - Used for /proc/<pid>/cpuset. | 1851 | * - Used for /proc/<pid>/cpuset. |
1852 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | ||
1853 | * doesn't really matter if tsk->cpuset changes after we read it, | ||
1854 | * and we take manage_sem, keeping attach_task() from changing it | ||
1855 | * anyway. | ||
1710 | */ | 1856 | */ |
1711 | 1857 | ||
1712 | static int proc_cpuset_show(struct seq_file *m, void *v) | 1858 | static int proc_cpuset_show(struct seq_file *m, void *v) |
@@ -1721,10 +1867,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1721 | return -ENOMEM; | 1867 | return -ENOMEM; |
1722 | 1868 | ||
1723 | tsk = m->private; | 1869 | tsk = m->private; |
1724 | down(&cpuset_sem); | 1870 | down(&manage_sem); |
1725 | task_lock(tsk); | ||
1726 | cs = tsk->cpuset; | 1871 | cs = tsk->cpuset; |
1727 | task_unlock(tsk); | ||
1728 | if (!cs) { | 1872 | if (!cs) { |
1729 | retval = -EINVAL; | 1873 | retval = -EINVAL; |
1730 | goto out; | 1874 | goto out; |
@@ -1736,7 +1880,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1736 | seq_puts(m, buf); | 1880 | seq_puts(m, buf); |
1737 | seq_putc(m, '\n'); | 1881 | seq_putc(m, '\n'); |
1738 | out: | 1882 | out: |
1739 | up(&cpuset_sem); | 1883 | up(&manage_sem); |
1740 | kfree(buf); | 1884 | kfree(buf); |
1741 | return retval; | 1885 | return retval; |
1742 | } | 1886 | } |